Ares64 Performance Core (#3149)

* prep for performance ares64 core, needs work unmanaged side

* get this going

* rebuild this

* apparently build didnt get cp'd? need to investigate

* fix build, other shit

* suppress these warnings

* tweaks and builds

* apparently bizinvoker doesnt like having LibAres64 class shared between non-waterbox and waterboxed, so split it.
also states for performance core

* builds

* fix this option, describe supersampling properly

* penguin64
This commit is contained in:
CasualPokePlayer 2022-02-16 02:15:27 -08:00 committed by GitHub
parent 05f11be191
commit 655ed7949e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
209 changed files with 172313 additions and 78 deletions

Binary file not shown.

BIN
Assets/dll/libares64.dll Normal file

Binary file not shown.

BIN
Assets/dll/libares64.so Normal file

Binary file not shown.

View File

@ -27,7 +27,7 @@ namespace BizHawk.Client.Common
(new[] { VSystemID.Raw.SNES },
new[] { CoreNames.Faust, CoreNames.Snes9X, CoreNames.Bsnes, CoreNames.Bsnes115 }),
(new[] { VSystemID.Raw.N64 },
new[] { CoreNames.Mupen64Plus, CoreNames.Ares64, }),
new[] { CoreNames.Mupen64Plus, CoreNames.Ares64Performance, CoreNames.Ares64Accuracy }),
(new[] { VSystemID.Raw.SGB },
new[] { CoreNames.Gambatte, CoreNames.Bsnes, CoreNames.Bsnes115}),
(new[] { VSystemID.Raw.GB, VSystemID.Raw.GBC },

View File

@ -3,7 +3,7 @@ using System.ComponentModel;
using BizHawk.Common;
using BizHawk.Emulation.Common;
namespace BizHawk.Emulation.Cores.Consoles.Nintendo.Ares64
namespace BizHawk.Emulation.Cores.Consoles.Nintendo.Ares64.Accuracy
{
public partial class Ares64 : ISettable<object, Ares64.Ares64SyncSettings>
{

View File

@ -6,13 +6,13 @@ using BizHawk.Emulation.Common;
using BizHawk.Emulation.Cores.Properties;
using BizHawk.Emulation.Cores.Waterbox;
namespace BizHawk.Emulation.Cores.Consoles.Nintendo.Ares64
namespace BizHawk.Emulation.Cores.Consoles.Nintendo.Ares64.Accuracy
{
[PortedCore(CoreNames.Ares64, "ares team, Near", "v126", "https://ares-emulator.github.io/", isReleased: false)]
[PortedCore(CoreNames.Ares64Accuracy, "ares team, Near", "v126", "https://ares-emulator.github.io/", isReleased: false)]
[ServiceNotApplicable(new[] { typeof(IDriveLight), })]
public partial class Ares64 : WaterboxCore, IRegionable
{
private readonly LibAres64 _core;
private readonly LibAres64Accuracy _core;
[CoreConstructor(VSystemID.Raw.N64)]
public Ares64(CoreLoadParameters<object, Ares64SyncSettings> lp)
@ -40,7 +40,7 @@ namespace BizHawk.Emulation.Cores.Consoles.Nintendo.Ares64
N64Controller = CreateControllerDefinition(ControllerSettings);
_core = PreInit<LibAres64>(new WaterboxOptions
_core = PreInit<LibAres64Accuracy>(new WaterboxOptions
{
Filename = "ares64.wbx",
SbrkHeapSizeKB = 2 * 1024,
@ -68,19 +68,32 @@ namespace BizHawk.Emulation.Cores.Consoles.Nintendo.Ares64
VsyncDenominator = 1;
}
LibAres64.LoadFlags loadFlags = 0;
if (_syncSettings.RestrictAnalogRange)
loadFlags |= LibAres64.LoadFlags.RestrictAnalogRange;
if (pal)
loadFlags |= LibAres64.LoadFlags.Pal;
var pif = Util.DecompressGzipFile(new MemoryStream(pal ? Resources.PIF_PAL_ROM.Value : Resources.PIF_NTSC_ROM.Value));
_exe.AddReadonlyFile(pif, pal ? "pif.pal.rom" : "pif.ntsc.rom");
_exe.AddReadonlyFile(rom, "program.rom");
if (!_core.Init(ControllerSettings, _syncSettings.RestrictAnalogRange, pal))
unsafe
{
throw new InvalidOperationException("Init returned false!");
fixed (byte* pifPtr = pif, romPtr = rom)
{
var loadData = new LibAres64.LoadData()
{
PifData = (IntPtr)pifPtr,
PifLen = pif.Length,
RomData = (IntPtr)romPtr,
RomLen = rom.Length,
};
if (!_core.Init(loadData, ControllerSettings, loadFlags))
{
throw new InvalidOperationException("Init returned false!");
}
}
}
_exe.RemoveReadonlyFile(pal ? "pif.pal.rom" : "pif.ntsc.rom");
_exe.RemoveReadonlyFile("program.rom");
PostInit();
DeterministicEmulation = true;
}

View File

@ -0,0 +1,77 @@
using System;
using System.ComponentModel;
using Newtonsoft.Json;
using BizHawk.Common;
using BizHawk.Emulation.Common;
namespace BizHawk.Emulation.Cores.Consoles.Nintendo.Ares64.Performance
{
/// <summary>
/// Settings implementation for the Ares64 performance core.
/// There are no non-sync settings (GetSettings returns null); all options live
/// in <see cref="Ares64SyncSettings"/> and any change forces a core reboot.
/// </summary>
public partial class Ares64 : ISettable<object, Ares64.Ares64SyncSettings>
{
private Ares64SyncSettings _syncSettings;
// This core exposes no non-sync settings.
public object GetSettings() => null;
// Hand out a clone so callers cannot mutate the live settings object.
public Ares64SyncSettings GetSyncSettings() => _syncSettings.Clone();
public PutSettingsDirtyBits PutSettings(object o) => PutSettingsDirtyBits.None;
// Every sync-settings field participates in NeedsReboot (deep equality), so
// any difference at all requests a core reboot.
public PutSettingsDirtyBits PutSyncSettings(Ares64SyncSettings o)
{
var ret = Ares64SyncSettings.NeedsReboot(_syncSettings, o);
_syncSettings = o;
return ret ? PutSettingsDirtyBits.RebootCore : PutSettingsDirtyBits.None;
}
/// <summary>
/// Sync settings for the Ares64 performance core. Controller port types,
/// analog-range restriction, and the Vulkan RDP options.
/// </summary>
public class Ares64SyncSettings
{
// Port 1 defaults to a controller with a Controller Pak attached.
[DisplayName("Player 1 Controller")]
[Description("")]
[DefaultValue(LibAres64.ControllerType.Mempak)]
public LibAres64.ControllerType P1Controller { get; set; }
[DisplayName("Player 2 Controller")]
[Description("")]
[DefaultValue(LibAres64.ControllerType.Unplugged)]
public LibAres64.ControllerType P2Controller { get; set; }
[DisplayName("Player 3 Controller")]
[Description("")]
[DefaultValue(LibAres64.ControllerType.Unplugged)]
public LibAres64.ControllerType P3Controller { get; set; }
[DisplayName("Player 4 Controller")]
[Description("")]
[DefaultValue(LibAres64.ControllerType.Unplugged)]
public LibAres64.ControllerType P4Controller { get; set; }
[DisplayName("Restrict Analog Range")]
[Description("Restricts analog range to account for physical limitations.")]
[DefaultValue(false)]
public bool RestrictAnalogRange { get; set; }
// When false, the Vulkan-only options below (SuperSample, VulkanUpscale)
// have no effect — the core constructor only forwards them when Vulkan is on.
[DisplayName("Enable Vulkan")]
[Description("Enables Vulkan RDP. May fallback to software RDP if your GPU does not support Vulkan.")]
[DefaultValue(true)]
public bool EnableVulkan { get; set; }
[DisplayName("Supersampling")]
[Description("Scales HD and UHD resolutions back down to SD")]
[DefaultValue(false)]
public bool SuperSample { get; set; }
[DisplayName("Vulkan Upscale")]
[Description("")]
[DefaultValue(LibAres64.VulkanUpscaleOpts.SD)]
public LibAres64.VulkanUpscaleOpts VulkanUpscale { get; set; }
// Populate every property from its [DefaultValue] attribute.
public Ares64SyncSettings() => SettingsUtil.SetDefaultValues(this);
// Shallow copy is sufficient: all members are value types/enums.
public Ares64SyncSettings Clone() => MemberwiseClone() as Ares64SyncSettings;
// Reboot whenever anything differs; there are no hot-swappable settings.
public static bool NeedsReboot(Ares64SyncSettings x, Ares64SyncSettings y) => !DeepEquality.DeepEquals(x, y);
}
}
}

View File

@ -0,0 +1,449 @@
using System;
using System.IO;
using System.Linq;
using BizHawk.BizInvoke;
using BizHawk.Common;
using BizHawk.Emulation.Common;
using BizHawk.Emulation.Cores.Properties;
using BizHawk.Emulation.Cores.Waterbox;
namespace BizHawk.Emulation.Cores.Consoles.Nintendo.Ares64.Performance
{
/// <summary>
/// Ares64 "performance" core. Unlike the waterboxed accuracy core, this talks
/// to the native ares64 library (libares64.dll/.so) directly through BizInvoker,
/// which is why it is registered singleInstance: true and is never deterministic.
/// </summary>
[PortedCore(CoreNames.Ares64Performance, "ares team, Near", "v126", "https://ares-emulator.github.io/", singleInstance: true, isReleased: false)]
[ServiceNotApplicable(new[] { typeof(IDriveLight), })]
public partial class Ares64 : IEmulator, IVideoProvider, ISoundProvider, IStatable, IInputPollable, ISaveRam, IRegionable
{
// One static invoker for the whole process — matches singleInstance above.
private static readonly LibAres64Performance _core;
static Ares64()
{
// hasLimitedLifetime: false — the native library stays loaded for the
// lifetime of the process.
var resolver = new DynamicLibraryImportResolver(
OSTailoredCode.IsUnixHost ? "libares64.so" : "libares64.dll", hasLimitedLifetime: false);
_core = BizInvoker.GetInvoker<LibAres64Performance>(resolver, CallingConventionAdapters.Native);
}
private readonly BasicServiceProvider _serviceProvider;
public IEmulatorServiceProvider ServiceProvider => _serviceProvider;
public int Frame { get; private set; }
public int LagCount { get; set; }
public bool IsLagFrame { get; set; }
[FeatureNotImplemented]
public IInputCallbackSystem InputCallbacks => throw new NotImplementedException();
public string SystemId => VSystemID.Raw.N64;
// Never deterministic; the constructor rejects deterministic requests outright.
public bool DeterministicEmulation => false;
public void ResetCounters()
{
Frame = 0;
LagCount = 0;
IsLagFrame = false;
}
// Disposing tears down the native core (frees its buffers); the invoker itself
// is static and stays alive.
public void Dispose() => _core.Deinit();
[CoreConstructor(VSystemID.Raw.N64)]
public Ares64(CoreLoadParameters<object, Ares64SyncSettings> lp)
{
if (lp.DeterministicEmulationRequested)
{
throw new InvalidOperationException("This core is not deterministic!");
}
_serviceProvider = new(this);
_syncSettings = lp.SyncSettings ?? new();
// Video buffer is sized for the largest possible frame (PAL 640x576)
// scaled by the Vulkan upscale factor (1 when Vulkan is disabled).
int upscale = _syncSettings.EnableVulkan ? (int)_syncSettings.VulkanUpscale : 1;
_videoBuffer = new int[640 * upscale * 576 * upscale];
ControllerSettings = new[]
{
_syncSettings.P1Controller,
_syncSettings.P2Controller,
_syncSettings.P3Controller,
_syncSettings.P4Controller,
};
N64Controller = CreateControllerDefinition(ControllerSettings);
var rom = lp.Roms[0].RomData;
// Region is derived from the country-code byte of the N64 ROM header;
// the listed codes are the PAL territories, everything else is NTSC.
Region = rom[0x3E] switch
{
0x44 or 0x46 or 0x49 or 0x50 or 0x53 or 0x55 or 0x58 or 0x59 => DisplayType.PAL,
_ => DisplayType.NTSC,
};
var pal = Region == DisplayType.PAL;
// PAL: exactly 50 Hz. NTSC: 60000/1001 (~59.94 Hz).
VsyncNumerator = pal ? 50 : 60000;
VsyncDenominator = pal ? 1 : 1001;
LibAres64.LoadFlags loadFlags = 0;
if (_syncSettings.RestrictAnalogRange)
loadFlags |= LibAres64.LoadFlags.RestrictAnalogRange;
if (pal)
loadFlags |= LibAres64.LoadFlags.Pal;
if (_syncSettings.EnableVulkan)
loadFlags |= LibAres64.LoadFlags.UseVulkan;
if (_syncSettings.SuperSample)
loadFlags |= LibAres64.LoadFlags.SuperSample;
// PIF (boot) ROM is shipped gzipped in embedded resources.
var pif = Util.DecompressGzipFile(new MemoryStream(pal ? Resources.PIF_PAL_ROM.Value : Resources.PIF_NTSC_ROM.Value));
// Pin the PIF and ROM buffers only for the duration of Init; the native
// side is expected to copy the data it needs — TODO confirm against
// BizInterface.cpp before relying on the pointers ever being cached.
unsafe
{
fixed (byte* pifPtr = pif, romPtr = rom)
{
var loadData = new LibAres64.LoadData()
{
PifData = (IntPtr)pifPtr,
PifLen = pif.Length,
RomData = (IntPtr)romPtr,
RomLen = rom.Length,
VulkanUpscale = upscale,
};
if (!_core.Init(loadData, ControllerSettings, loadFlags))
{
throw new InvalidOperationException("Init returned false!");
}
}
}
ResetCounters();
// Query the native core's memory map; drop empty entries and function
// hooks (which are not real memory).
var areas = new LibWaterboxCore.MemoryArea[256];
_core.GetMemoryAreas(areas);
_memoryAreas = areas.Where(a => a.Data != IntPtr.Zero && a.Size != 0 && !a.Flags.HasFlag(LibWaterboxCore.MemoryDomainFlags.FunctionHook))
.ToArray();
var memoryDomains = _memoryAreas.Select(a => new WaterboxMemoryDomainPointer(a, _monitor)).ToList();
// Exactly one domain must be flagged Primary — Single() throws otherwise.
var primaryDomain = memoryDomains
.Where(md => md.Definition.Flags.HasFlag(LibWaterboxCore.MemoryDomainFlags.Primary))
.Single();
var mdl = new MemoryDomainList(
memoryDomains.Cast<MemoryDomain>().ToList()
)
{
MainMemory = primaryDomain
};
_serviceProvider.Register<IMemoryDomains>(mdl);
// Saveram-capable domains back the ISaveRam implementation below.
_saveramAreas = memoryDomains
.Where(md => md.Definition.Flags.HasFlag(LibWaterboxCore.MemoryDomainFlags.Saverammable))
.ToArray();
_saveramSize = (int)_saveramAreas.Sum(a => a.Size);
}
public DisplayType Region { get; }
public ControllerDefinition ControllerDefinition => N64Controller;
private ControllerDefinition N64Controller { get; }
public LibAres64.ControllerType[] ControllerSettings { get; }
// Build the controller definition from the per-port settings: buttons and an
// analog pair for each plugged port, a rumble channel for Rumble Pak ports,
// and console-level Reset/Power buttons.
private static ControllerDefinition CreateControllerDefinition(LibAres64.ControllerType[] controllerSettings)
{
var ret = new ControllerDefinition("Nintendo 64 Controller");
for (int i = 0; i < 4; i++)
{
if (controllerSettings[i] != LibAres64.ControllerType.Unplugged)
{
ret.BoolButtons.Add($"P{i + 1} DPad U");
ret.BoolButtons.Add($"P{i + 1} DPad D");
ret.BoolButtons.Add($"P{i + 1} DPad L");
ret.BoolButtons.Add($"P{i + 1} DPad R");
ret.BoolButtons.Add($"P{i + 1} Start");
ret.BoolButtons.Add($"P{i + 1} Z");
ret.BoolButtons.Add($"P{i + 1} B");
ret.BoolButtons.Add($"P{i + 1} A");
ret.BoolButtons.Add($"P{i + 1} C Up");
ret.BoolButtons.Add($"P{i + 1} C Down");
ret.BoolButtons.Add($"P{i + 1} C Left");
ret.BoolButtons.Add($"P{i + 1} C Right");
ret.BoolButtons.Add($"P{i + 1} L");
ret.BoolButtons.Add($"P{i + 1} R");
// Stick: signed 8-bit range on both axes, neutral at 0.
ret.AddXYPair($"P{i + 1} {{0}} Axis", AxisPairOrientation.RightAndUp, (-128).RangeTo(127), 0);
if (controllerSettings[i] == LibAres64.ControllerType.Rumblepak)
{
ret.HapticsChannels.Add($"P{i + 1} Rumble Pak");
}
}
}
ret.BoolButtons.Add("Reset");
ret.BoolButtons.Add("Power");
return ret.MakeImmutable();
}
// Translate the frontend's named buttons for port `num` into the native
// core's button bitmask.
private static LibAres64.Buttons GetButtons(IController controller, int num)
{
LibAres64.Buttons ret = 0;
if (controller.IsPressed($"P{num} DPad U"))
ret |= LibAres64.Buttons.UP;
if (controller.IsPressed($"P{num} DPad D"))
ret |= LibAres64.Buttons.DOWN;
if (controller.IsPressed($"P{num} DPad L"))
ret |= LibAres64.Buttons.LEFT;
if (controller.IsPressed($"P{num} DPad R"))
ret |= LibAres64.Buttons.RIGHT;
if (controller.IsPressed($"P{num} B"))
ret |= LibAres64.Buttons.B;
if (controller.IsPressed($"P{num} A"))
ret |= LibAres64.Buttons.A;
if (controller.IsPressed($"P{num} C Up"))
ret |= LibAres64.Buttons.C_UP;
if (controller.IsPressed($"P{num} C Down"))
ret |= LibAres64.Buttons.C_DOWN;
if (controller.IsPressed($"P{num} C Left"))
ret |= LibAres64.Buttons.C_LEFT;
if (controller.IsPressed($"P{num} C Right"))
ret |= LibAres64.Buttons.C_RIGHT;
if (controller.IsPressed($"P{num} L"))
ret |= LibAres64.Buttons.L;
if (controller.IsPressed($"P{num} R"))
ret |= LibAres64.Buttons.R;
if (controller.IsPressed($"P{num} Z"))
ret |= LibAres64.Buttons.Z;
if (controller.IsPressed($"P{num} Start"))
ret |= LibAres64.Buttons.START;
return ret;
}
// Gather this frame's inputs (and push rumble state back to the frontend)
// into the FrameInfo struct consumed by the native FrameAdvance.
private LibWaterboxCore.FrameInfo FrameAdvancePrep(IController controller, bool render, bool rendersound)
{
for (int i = 0; i < 4; i++)
{
if (ControllerSettings[i] == LibAres64.ControllerType.Rumblepak)
{
// Rumble is on/off on the native side; map to full/zero strength.
controller.SetHapticChannelStrength($"P{i + 1} Rumble Pak", _core.GetRumbleStatus(i) ? int.MaxValue : 0);
}
}
return new LibAres64.FrameInfo
{
P1Buttons = GetButtons(controller, 1),
P1XAxis = (short)controller.AxisValue("P1 X Axis"),
P1YAxis = (short)controller.AxisValue("P1 Y Axis"),
P2Buttons = GetButtons(controller, 2),
P2XAxis = (short)controller.AxisValue("P2 X Axis"),
P2YAxis = (short)controller.AxisValue("P2 Y Axis"),
P3Buttons = GetButtons(controller, 3),
P3XAxis = (short)controller.AxisValue("P3 X Axis"),
P3YAxis = (short)controller.AxisValue("P3 Y Axis"),
P4Buttons = GetButtons(controller, 4),
P4XAxis = (short)controller.AxisValue("P4 X Axis"),
P4YAxis = (short)controller.AxisValue("P4 Y Axis"),
Reset = controller.IsPressed("Reset"),
Power = controller.IsPressed("Power"),
};
}
// Run one frame on the native core; the video/sound buffers are pinned for
// the duration of the call so the native side can write into them directly.
public unsafe bool FrameAdvance(IController controller, bool render, bool rendersound = true)
{
// Input callbacks are not implemented for this core; clear any native one.
_core.SetInputCallback(null);
fixed (int* vp = _videoBuffer)
fixed (short* sp = _soundBuffer)
{
var frame = FrameAdvancePrep(controller, render, rendersound);
frame.VideoBuffer = (IntPtr)vp;
frame.SoundBuffer = (IntPtr)sp;
_core.FrameAdvance(frame);
Frame++;
// Assignment intentional: record lag state and bump the counter.
if (IsLagFrame = frame.Lagged != 0)
LagCount++;
if (render)
{
BufferWidth = frame.Width;
BufferHeight = frame.Height;
}
if (rendersound)
{
_numSamples = frame.Samples;
}
else
{
_numSamples = 0;
}
FrameAdvancePost();
}
return true;
}
// If the core reported a zero width (presumably no new video this frame —
// TODO confirm), fall back to a plausible width for the reported height.
private void FrameAdvancePost()
{
if (BufferWidth == 0)
{
BufferWidth = BufferHeight == 239 ? 320 : 640;
}
}
public int[] GetVideoBuffer() => _videoBuffer;
private readonly int[] _videoBuffer;
public int VirtualWidth => 640;
public int VirtualHeight => 480;
public int BufferWidth { get; private set; }
public int BufferHeight { get; private set; }
public int VsyncNumerator { get; }
public int VsyncDenominator { get; }
// Opaque black.
public int BackgroundColor => unchecked((int)0xff000000);
public void SetSyncMode(SyncSoundMode mode)
{
if (mode == SyncSoundMode.Async)
{
throw new NotSupportedException("Async mode is not supported.");
}
}
public void GetSamplesSync(out short[] samples, out int nsamp)
{
samples = _soundBuffer;
nsamp = _numSamples;
}
public void GetSamplesAsync(short[] samples) => throw new InvalidOperationException("Async mode is not supported.");
public void DiscardSamples() {}
// Stereo sample buffer (2048 sample pairs), filled by the native core.
private readonly short[] _soundBuffer = new short[2048 * 2];
private int _numSamples;
public bool CanProvideAsync => false;
public SyncSoundMode SyncMode => SyncSoundMode.Sync;
// Scratch buffer for savestates; regrown whenever the native size changes.
private byte[] _stateBuffer = new byte[0];
public void SaveStateBinary(BinaryWriter writer)
{
var len = _core.SerializeSize();
if (len != _stateBuffer.Length)
{
_stateBuffer = new byte[len];
}
_core.Serialize(_stateBuffer);
// Length prefix first, so LoadStateBinary can validate before reading.
writer.Write(_stateBuffer.Length);
writer.Write(_stateBuffer);
}
public void LoadStateBinary(BinaryReader reader)
{
var len = reader.ReadInt32();
if (len != _core.SerializeSize())
{
throw new InvalidOperationException("Savestate size mismatch!");
}
if (len != _stateBuffer.Length)
{
_stateBuffer = new byte[len];
}
// NOTE(review): Read() may return fewer than len bytes; the return value
// is not checked here — confirm the underlying stream always satisfies it.
reader.Read(_stateBuffer, 0, len);
if (!_core.Unserialize(_stateBuffer, len))
{
throw new Exception($"{nameof(_core.Unserialize)}() returned false!");
}
}
private readonly LibWaterboxCore.MemoryArea[] _memoryAreas;
private readonly WaterboxMemoryDomain[] _saveramAreas;
private readonly int _saveramSize;
// Saveram is "modified" if any byte differs from the area's fill value
// (0xFF for OneFilled-flagged areas, 0x00 otherwise). Scans in 4KiB chunks,
// comparing 4 bytes at a time.
public unsafe bool SaveRamModified
{
get
{
if (_saveramSize == 0)
return false;
var buff = new byte[4096];
fixed (byte* bp = buff)
{
foreach (var area in _saveramAreas)
{
var stream = new MemoryDomainStream(area);
// -1 is 0xFFFFFFFF, i.e. four 0xFF fill bytes at once.
int cmp = (area.Definition.Flags & LibWaterboxCore.MemoryDomainFlags.OneFilled) != 0 ? -1 : 0;
while (true)
{
int nread = stream.Read(buff, 0, 4096);
if (nread == 0)
break;
int* p = (int*)bp;
int* pend = p + nread / sizeof(int);
while (p < pend)
{
if (*p++ != cmp)
return true;
}
}
}
}
return false;
}
}
// Concatenate all saveram areas into a single buffer (null if there are none).
public byte[] CloneSaveRam()
{
if (_saveramSize == 0)
return null;
var ret = new byte[_saveramSize];
var dest = new MemoryStream(ret, true);
foreach (var area in _saveramAreas)
{
new MemoryDomainStream(area).CopyTo(dest);
}
return ret;
}
// Inverse of CloneSaveRam: split the buffer back across the areas, in order.
public void StoreSaveRam(byte[] data)
{
if (data.Length != _saveramSize)
throw new InvalidOperationException("Saveram size mismatch");
var source = new MemoryStream(data, false);
foreach (var area in _saveramAreas)
{
WaterboxUtils.CopySome(source, new MemoryDomainStream(area), area.Size);
}
}
// No locking needed here (unlike the waterboxed core), so the memory-domain
// monitor is a no-op.
private readonly DummyMonitor _monitor = new();
private class DummyMonitor : IMonitor
{
public void Enter() { }
public void Exit() { }
}
}
}

View File

@ -59,10 +59,57 @@ namespace BizHawk.Emulation.Cores.Consoles.Nintendo.Ares64
public bool Power;
}
[Flags]
public enum LoadFlags : uint
{
RestrictAnalogRange = 1 << 0,
Pal = 1 << 1,
// performance only flags
UseVulkan = 1 << 2,
SuperSample = 1 << 3,
}
public enum VulkanUpscaleOpts : uint
{
SD = 1,
HD = 2,
UHD = 4,
}
[StructLayout(LayoutKind.Sequential)]
public class LoadData
{
public IntPtr PifData;
public int PifLen;
public IntPtr RomData;
public int RomLen;
// performance only data
public int VulkanUpscale;
}
[BizImport(CC)]
public abstract bool Init(ControllerType[] controllerSettings, bool restrictAnalogRange, bool pal);
public abstract bool Init(LoadData loadData, ControllerType[] controllerSettings, LoadFlags loadFlags);
[BizImport(CC)]
public abstract bool GetRumbleStatus(int num);
}
/// <summary>
/// Invoker type for the waterboxed accuracy build of ares64. A distinct type
/// (rather than LibAres64 itself) so BizInvoker can keep the waterboxed and
/// non-waterboxed bindings separate; it adds no entry points of its own.
/// </summary>
public abstract class LibAres64Accuracy : LibAres64
{
}
/// <summary>
/// Invoker type for the natively-loaded performance build of ares64. Adds the
/// entry points the waterbox host would otherwise provide: explicit teardown
/// and savestate serialization.
/// </summary>
public abstract class LibAres64Performance : LibAres64
{
// Tear down the native core and free its buffers.
[BizImport(CC)]
public abstract void Deinit();
// Size in bytes required for a savestate at the current emulation point.
[BizImport(CC)]
public abstract int SerializeSize();
// Write a savestate into buf; buf must be at least SerializeSize() bytes.
[BizImport(CC)]
public abstract void Serialize(byte[] buf);
// Restore a savestate of sz bytes; returns false on failure.
[BizImport(CC)]
public abstract bool Unserialize(byte[] buf, int sz);
}
}

View File

@ -10,7 +10,8 @@ namespace BizHawk.Emulation.Cores
public static class CoreNames
{
public const string A7800Hawk = "A7800Hawk";
public const string Ares64 = "Ares64";
public const string Ares64Accuracy = "Ares64 (Accuracy)";
public const string Ares64Performance = "Ares64 (Performance)";
public const string Atari2600Hawk = "Atari2600Hawk";
public const string Bsnes = "BSNES";
public const string Bsnes115 = "BSNESv115+";

View File

@ -26,11 +26,21 @@ namespace BizHawk.Emulation.Cores
}
}
}
else if (core is Ares64 ares64)
else if (core is Consoles.Nintendo.Ares64.Accuracy.Ares64 ares64Acc)
{
for (var i = 0; i < 4; i++)
{
if (ares64.ControllerSettings[i] != LibAres64.ControllerType.Unplugged)
if (ares64Acc.ControllerSettings[i] != LibAres64.ControllerType.Unplugged)
{
yield return StandardController(i + 1);
}
}
}
else if (core is Consoles.Nintendo.Ares64.Performance.Ares64 ares64Perf)
{
for (var i = 0; i < 4; i++)
{
if (ares64Perf.ControllerSettings[i] != LibAres64.ControllerType.Unplugged)
{
yield return StandardController(i + 1);
}

View File

@ -5,13 +5,14 @@ MAME_PATH = $(ROOT_DIR)/ares/thirdparty/mame
CXXFLAGS := -std=c++17 -msse4.2 \
-I../libco -I.$(ROOT_DIR)/ares/ -I.$(ROOT_DIR)/ares/thirdparty/ -I.$(ARES_PATH) \
-Werror=int-to-pointer-cast -Wno-unused-but-set-variable \
-Werror=int-to-pointer-cast -Wno-unused-but-set-variable -Wno-delete-non-virtual-dtor \
-Wno-parentheses -Wno-reorder -Wno-unused-variable \
-Wno-sign-compare -Wno-switch -Wno-unused-local-typedefs \
-fno-strict-aliasing -fwrapv -fno-operator-names \
-I.$(MAME_PATH)/devices -I.$(MAME_PATH)/emu \
-I.$(MAME_PATH)/lib/util -I.$(MAME_PATH)/mame \
-I.$(MAME_PATH)/osd -DMAME_RDP -DLSB_FIRST -DPTR64 -DSDLMAME_EMSCRIPTEN
-I.$(MAME_PATH)/osd -DMAME_RDP -DLSB_FIRST -DPTR64 -DSDLMAME_EMSCRIPTEN \
-DWATERBOXED
TARGET = ares64.wbx

View File

@ -1,7 +1,16 @@
#include <n64/n64.hpp>
#if WATERBOXED
#include <emulibc.h>
#include <waterboxcore.h>
#endif
#include <vector>
#ifndef WATERBOXED
#define ECL_EXPORT __attribute__((visibility("default")))
#include "../emulibc/waterboxcore.h"
#endif
#define EXPORT extern "C" ECL_EXPORT
@ -38,7 +47,7 @@ struct BizPlatform : ares::Platform
auto video(ares::Node::Video::Screen, const u32*, u32, u32, u32) -> void override;
auto input(ares::Node::Input::Input) -> void override;
ares::VFS::Pak bizpak = new vfs::directory;
ares::VFS::Pak bizpak = nullptr;
ares::Node::Audio::Stream stream = nullptr;
u32* videobuf = nullptr;
u32 pitch = 0;
@ -84,16 +93,19 @@ auto BizPlatform::input(ares::Node::Input::Input node) -> void
}
};
static ares::Node::System root;
static BizPlatform platform;
static ares::Node::System root = nullptr;
static BizPlatform* platform = nullptr;
static array_view<u8>* pifData = nullptr;
static array_view<u8>* romData = nullptr;
static array_view<u8>* saveData = nullptr;
static inline void HackeryDoo()
{
root->run();
root->run();
platform.newframe = false;
platform->newframe = false;
f64 buf[2];
while (platform.stream->pending()) platform.stream->read(buf);
while (platform->stream->pending()) platform->stream->read(buf);
}
typedef enum
@ -311,46 +323,71 @@ static inline SaveType DetectSaveType(u8* rom)
namespace ares::Nintendo64 { extern bool RestrictAnalogRange; }
EXPORT bool Init(ControllerType* controllers, bool restrictAnalogRange, bool pal)
bool Inited = false;
typedef struct
{
FILE* f;
array_view<u8>* data;
u8* PifData;
u32 PifLen;
u8* RomData;
u32 RomLen;
#ifndef WATERBOXED
u32 VulkanUpscale;
#endif
} LoadData;
typedef enum
{
RESTRICT_ANALOG_RANGE = 1 << 0,
IS_PAL = 1 << 1,
#ifndef WATERBOXED
USE_VULKAN = 1 << 2,
SUPER_SAMPLE = 1 << 3,
#endif
} LoadFlags;
EXPORT void Deinit();
EXPORT bool Init(LoadData* loadData, ControllerType* controllers, LoadFlags loadFlags)
{
if (Inited) Deinit();
platform = new BizPlatform;
platform->bizpak = new vfs::directory;
u8* data;
u32 len;
string name;
bool pal = loadFlags & IS_PAL;
name = pal ? "pif.pal.rom" : "pif.ntsc.rom";
f = fopen(name, "rb");
fseek(f, 0, SEEK_END);
len = ftell(f);
data = new array_view<u8>(new u8[len], len);
fseek(f, 0, SEEK_SET);
fread((void*)data->data(), 1, len, f);
fclose(f);
platform.bizpak->append(name, *data);
len = loadData->PifLen;
data = new u8[len];
memcpy(data, loadData->PifData, len);
pifData = new array_view<u8>(data, len);
platform->bizpak->append(name, *pifData);
name = "program.rom";
f = fopen(name, "rb");
fseek(f, 0, SEEK_END);
len = ftell(f);
data = new array_view<u8>(new u8[len], len);
fseek(f, 0, SEEK_SET);
fread((void*)data->data(), 1, len, f);
fclose(f);
platform.bizpak->append(name, *data);
len = loadData->RomLen;
data = new u8[len];
memcpy(data, loadData->RomData, len);
romData = new array_view<u8>(data, len);
platform->bizpak->append(name, *romData);
string region = pal ? "PAL" : "NTSC";
platform.bizpak->setAttribute("region", region);
platform->bizpak->setAttribute("region", region);
string cic = pal ? "CIC-NUS-7101" : "CIC-NUS-6102";
u32 crc32 = Hash::CRC32({&((u8*)data->data())[0x40], 0x9C0}).value();
u32 crc32 = Hash::CRC32({&data[0x40], 0x9C0}).value();
if (crc32 == 0x1DEB51A9) cic = pal ? "CIC-NUS-7102" : "CIC-NUS-6101";
if (crc32 == 0xC08E5BD6) cic = pal ? "CIC-NUS-7101" : "CIC-NUS-6102";
if (crc32 == 0x03B8376A) cic = pal ? "CIC-NUS-7103" : "CIC-NUS-6103";
if (crc32 == 0xCF7F41DC) cic = pal ? "CIC-NUS-7105" : "CIC-NUS-6105";
if (crc32 == 0xD1059C6A) cic = pal ? "CIC-NUS-7106" : "CIC-NUS-6106";
platform.bizpak->setAttribute("cic", cic);
platform->bizpak->setAttribute("cic", cic);
SaveType save = DetectSaveType((u8*)data->data());
SaveType save = DetectSaveType(data);
if (save != NONE)
{
switch (save)
@ -360,17 +397,25 @@ EXPORT bool Init(ControllerType* controllers, bool restrictAnalogRange, bool pal
case SRAM32KB: len = 32 * 1024; name = "save.ram"; break;
case SRAM96KB: len = 96 * 1024; name = "save.ram"; break;
case FLASH128KB: len = 128 * 1024; name = "save.flash"; break;
default: return false;
default: Deinit(); return false;
}
data = new array_view<u8>(new u8[len], len);
memset((void*)data->data(), 0xFF, len);
platform.bizpak->append(name, *data);
data = new u8[len];
memset(data, 0xFF, len);
saveData = new array_view<u8>(data, len);
platform->bizpak->append(name, *saveData);
}
ares::platform = &platform;
ares::platform = platform;
#ifndef WATERBOXED
ares::Nintendo64::option("Enable Vulkan", !!(loadFlags & USE_VULKAN));
ares::Nintendo64::option("Quality", loadData->VulkanUpscale == 1 ? "SD" : (loadData->VulkanUpscale == 2 ? "HD" : "UHD"));
ares::Nintendo64::option("Supersampling", !!(loadFlags & SUPER_SAMPLE));
#endif
if (!ares::Nintendo64::load(root, {"[Nintendo] Nintendo 64 (", region, ")"}))
{
Deinit();
return false;
}
@ -381,6 +426,7 @@ EXPORT bool Init(ControllerType* controllers, bool restrictAnalogRange, bool pal
}
else
{
Deinit();
return false;
}
@ -393,7 +439,6 @@ EXPORT bool Init(ControllerType* controllers, bool restrictAnalogRange, bool pal
auto peripheral = port->allocate("Gamepad");
port->connect();
string name;
switch (controllers[i])
{
case Mempak: name = "Controller Pak"; break;
@ -408,22 +453,51 @@ EXPORT bool Init(ControllerType* controllers, bool restrictAnalogRange, bool pal
}
else
{
Deinit();
return false;
}
}
else
{
Deinit();
return false;
}
}
ares::Nintendo64::RestrictAnalogRange = restrictAnalogRange;
ares::Nintendo64::RestrictAnalogRange = loadFlags & RESTRICT_ANALOG_RANGE;
root->power(false);
HackeryDoo();
Inited = true;
return true;
}
// Tear down the running core and free every buffer that was handed to the
// VFS pak. Each pointer is reset to null after being freed so Deinit is safe
// to call more than once (Init calls it up front when re-initializing, and
// the frontend may also call it on shutdown); the previous version left the
// freed pointers dangling, which would double-free on a second call.
EXPORT void Deinit()
{
	if (root) root->unload();
	root = nullptr;
	if (platform)
	{
		if (platform->bizpak) platform->bizpak.reset();
		delete platform;
		platform = nullptr;
	}
	if (pifData)
	{
		// The array_view does not own its backing storage; free both.
		delete[] (u8*)pifData->data();
		delete pifData;
		pifData = nullptr;
	}
	if (romData)
	{
		delete[] (u8*)romData->data();
		delete romData;
		romData = nullptr;
	}
	if (saveData)
	{
		delete[] (u8*)saveData->data();
		delete saveData;
		saveData = nullptr;
	}
	Inited = false;
}
EXPORT bool GetRumbleStatus(u32 num)
{
ares::Nintendo64::Gamepad* c = nullptr;
@ -437,6 +511,23 @@ EXPORT bool GetRumbleStatus(u32 num)
return c ? c->motor->enable() : false;
}
// Number of bytes needed for a savestate, computed by performing a full
// serialization and discarding the result. NOTE(review): if emulation advances
// between this call and a later Serialize(), the sizes could differ — the
// managed caller is expected to keep the two calls adjacent.
EXPORT u32 SerializeSize()
{
return root->serialize(false).size();
}
// Write a savestate into buf. The caller must supply a buffer of at least
// SerializeSize() bytes; no length is passed, so an undersized buffer would
// be overrun.
EXPORT void Serialize(u8* buf)
{
auto s = root->serialize(false);
memcpy(buf, s.data(), s.size());
}
// Restore a savestate of sz bytes from buf; returns false if the core
// rejects it.
EXPORT bool Unserialize(u8* buf, u32 sz)
{
serializer s(buf, sz);
return root->unserialize(s);
}
#define MAYBE_ADD_MEMORY_DOMAIN(mem, name, flags) do { \
if (ares::Nintendo64::mem.data) \
{ \
@ -544,39 +635,39 @@ EXPORT void FrameAdvance(MyFrameInfo* f)
UPDATE_CONTROLLER(3);
UPDATE_CONTROLLER(4);
platform.lagged = true;
platform->lagged = true;
root->run();
f->Width = platform.width;
f->Height = platform.height;
if (platform.newframe)
f->Width = platform->width;
f->Height = platform->height;
if (platform->newframe)
{
u32* src = platform.videobuf;
u32* src = platform->videobuf;
u32* dst = f->VideoBuffer;
for (int i = 0; i < f->Height; i++)
{
memcpy(dst, src, f->Width * 4);
dst += f->Width;
src += platform.pitch;
src += platform->pitch;
}
platform.newframe = false;
platform->newframe = false;
}
s16* soundbuf = f->SoundBuffer;
while (platform.stream->pending())
while (platform->stream->pending())
{
f64 buf[2];
platform.stream->read(buf);
platform->stream->read(buf);
*soundbuf++ = (s16)std::clamp(buf[0] * 32768, -32768.0, 32767.0);
*soundbuf++ = (s16)std::clamp(buf[1] * 32768, -32768.0, 32767.0);
f->Samples++;
}
f->Lagged = platform.lagged;
f->Lagged = platform->lagged;
}
EXPORT void SetInputCallback(void (*callback)())
{
platform.inputcb = callback;
}
platform->inputcb = callback;
}

View File

@ -0,0 +1,160 @@
ARES_PATH = $(ROOT_DIR)/ares/ares
MAME_PATH = $(ROOT_DIR)/ares/thirdparty/mame
SLJIT_PATH = $(ROOT_DIR)/ares/thirdparty/sljit
CCFLAGS := -std=c99 -Wall -Wno-format -Wno-parentheses
CXXFLAGS := -std=c++17 -msse4.2 -O3 -flto -fvisibility=internal \
-I../libco -I.$(ROOT_DIR)/ares/ -I.$(ROOT_DIR)/ares/thirdparty/ -I.$(ARES_PATH) \
-Werror=int-to-pointer-cast -Wno-unused-but-set-variable \
-Wno-parentheses -Wno-reorder -Wno-unused-variable \
-Wno-sign-compare -Wno-switch -Wno-unused-local-typedefs \
-fno-strict-aliasing -fwrapv -fno-operator-names \
-I.$(MAME_PATH)/devices -I.$(MAME_PATH)/emu \
-I.$(MAME_PATH)/lib/util -I.$(MAME_PATH)/mame \
-I.$(MAME_PATH)/osd -DMAME_RDP -DLSB_FIRST -DPTR64 -DSLJIT_HAVE_CONFIG_PRE=1 -DSLJIT_HAVE_CONFIG_POST=1 -fPIC
LDFLAGS := -shared
ifeq ($(OS),Windows_NT)
CCFLAGS += -DVK_USE_PLATFORM_WIN32_KHR
CXXFLAGS += -DVK_USE_PLATFORM_WIN32_KHR -DOSD_WINDOWS=1
TARGET = libares64.dll
else
CXXFLAGS += -DSDLMAME_LINUX
TARGET = libares64.so
endif
SRCS_LIBCO = \
$(ROOT_DIR)/ares/libco/libco.c
SRCS_PROCESSORS = \
$(ARES_PATH)/component/processor/sm5k/sm5k.cpp
SRCS_ARES = \
$(ARES_PATH)/ares/ares.cpp \
$(ARES_PATH)/ares/memory/fixed-allocator.cpp
SRCS_N64 = \
$(ARES_PATH)/n64/memory/memory.cpp \
$(ARES_PATH)/n64/system/system.cpp \
$(ARES_PATH)/n64/cartridge/cartridge.cpp \
$(ARES_PATH)/n64/controller/controller.cpp \
$(ARES_PATH)/n64/dd/dd.cpp \
$(ARES_PATH)/n64/sp/sp.cpp \
$(ARES_PATH)/n64/dp/dp.cpp \
$(ARES_PATH)/n64/mi/mi.cpp \
$(ARES_PATH)/n64/vi/vi.cpp \
$(ARES_PATH)/n64/ai/ai.cpp \
$(ARES_PATH)/n64/pi/pi.cpp \
$(ARES_PATH)/n64/ri/ri.cpp \
$(ARES_PATH)/n64/si/si.cpp \
$(ARES_PATH)/n64/rdram/rdram.cpp \
$(ARES_PATH)/n64/cpu/cpu.cpp \
$(ARES_PATH)/n64/rdp/rdp.cpp \
$(ARES_PATH)/n64/rsp/rsp.cpp \
$(ARES_PATH)/n64/vulkan/vulkan.cpp
PARALLEL_RDP_IMPLEMENTATION = $(ARES_PATH)/n64/vulkan/parallel-rdp
SRCS_PARALLEL_RDP = \
$(wildcard $(PARALLEL_RDP_IMPLEMENTATION)/parallel-rdp/*.cpp) \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/buffer.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/buffer_pool.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/command_buffer.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/command_pool.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/context.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/cookie.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/descriptor_set.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/device.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/event_manager.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/fence.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/fence_manager.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/image.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/memory_allocator.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/pipeline_event.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/query_pool.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/render_pass.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/sampler.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/semaphore.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/semaphore_manager.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/shader.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/texture_format.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/logging.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/thread_id.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/aligned_alloc.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/timer.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/timeline_trace_file.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/thread_name.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/volk/volk.c
PARALLEL_RDP_INCLUDE_DIRS = \
-I.$(PARALLEL_RDP_IMPLEMENTATION)/parallel-rdp \
-I.$(PARALLEL_RDP_IMPLEMENTATION)/volk \
-I.$(PARALLEL_RDP_IMPLEMENTATION)/vulkan \
-I.$(PARALLEL_RDP_IMPLEMENTATION)/vulkan-headers/include \
-I.$(PARALLEL_RDP_IMPLEMENTATION)/util
CXXFLAGS += $(PARALLEL_RDP_INCLUDE_DIRS) -DVULKAN -DGRANITE_VULKAN_MT
CCFLAGS += $(PARALLEL_RDP_INCLUDE_DIRS)
SRCS_MAME = \
$(MAME_PATH)/emu/emucore.cpp \
$(MAME_PATH)/lib/util/delegate.cpp \
$(MAME_PATH)/lib/util/strformat.cpp \
$(MAME_PATH)/mame/video/n64.cpp \
$(MAME_PATH)/mame/video/pin64.cpp \
$(MAME_PATH)/mame/video/rdpblend.cpp \
$(MAME_PATH)/mame/video/rdptpipe.cpp \
$(MAME_PATH)/osd/osdcore.cpp \
$(MAME_PATH)/osd/osdsync.cpp
SRCS_SLJIT = \
$(SLJIT_PATH)/../sljitAllocator.cpp \
$(SLJIT_PATH)/sljit_src/sljitLir.c
SRCS = $(SRCS_LIBCO) $(SRCS_PROCESSORS) $(SRCS_ARES) $(SRCS_N64) $(SRCS_PARALLEL_RDP) $(SRCS_MAME) $(SRCS_SLJIT) BizInterface.cpp
ROOT_DIR := $(shell dirname $(realpath Performance.mak))
OUTPUTDLL_DIR := $(realpath $(ROOT_DIR)/../../Assets/dll)
OUTPUTDLLCOPY_DIR := $(realpath $(ROOT_DIR)/../../output/dll)
OUT_DIR := $(ROOT_DIR)/obj
OBJ_DIR := $(OUT_DIR)/release_performance
CC := gcc
CXX := g++
_OBJS := $(addsuffix .o,$(realpath $(SRCS)))
OBJS := $(patsubst $(ROOT_DIR)%,$(OBJ_DIR)%,$(_OBJS))
# Pattern rule: compile a C source into its mirrored object path.
# $(PER_FILE_FLAGS_$<) lets callers add extra flags for individual source files.
$(OBJ_DIR)/%.c.o: %.c
@echo cc $<
@mkdir -p $(@D)
@$(CC) -c -o $@ $< $(CCFLAGS) $(PER_FILE_FLAGS_$<)
# Pattern rule: compile a C++ source into its mirrored object path.
$(OBJ_DIR)/%.cpp.o: %.cpp
@echo cxx $<
@mkdir -p $(@D)
@$(CXX) -c -o $@ $< $(CXXFLAGS) $(PER_FILE_FLAGS_$<)
# Running plain `make` builds and installs the library.
.DEFAULT_GOAL := install
.PHONY: release install
TARGET_RELEASE := $(OBJ_DIR)/$(TARGET)
release: $(TARGET_RELEASE)
# Link step: $(CXX) drives the link so the C++ runtime is pulled in.
$(TARGET_RELEASE): $(OBJS)
@echo ld $@
@$(CXX) -o $@ $(LDFLAGS) $(CCFLAGS) $(CXXFLAGS) $(OBJS)
# Copy the built library into both install locations.
install: $(TARGET_RELEASE)
@cp -f $(TARGET_RELEASE) $(OUTPUTDLL_DIR)/$(TARGET)
@cp -f $(TARGET_RELEASE) $(OUTPUTDLLCOPY_DIR)/$(TARGET)
@echo Release build of $(TARGET) installed.
.PHONY: clean
clean:
rm -rf $(OUT_DIR)
# Pull in compiler-generated dependency files (foo.cpp.o -> foo.cpp.d).
# NOTE(review): assumes -MMD (or similar) is present in CCFLAGS/CXXFLAGS,
# which are defined elsewhere -- confirm.
-include $(OBJS:%o=%d)

View File

@ -1,8 +1,12 @@
#pragma once
#ifdef WATERBOXED
#include <emulibc.h>
#include <libco.h>
#else
#include <libco/libco.h>
#endif
#include <sljit.h>
#include <nall/platform.hpp>
@ -57,7 +61,11 @@ namespace ares {
}
namespace Video {
#ifdef WATERBOXED
static constexpr bool Threaded = false;
#else
static constexpr bool Threaded = true;
#endif
}
namespace Constants {

View File

@ -3,10 +3,17 @@ Screen::Screen(string name, u32 width, u32 height) : Video(name) {
_canvasHeight = height;
if(width && height) {
#ifdef WATERBOXED
_inputA = alloc_invisible<u32>(width * height);
_inputB = alloc_invisible<u32>(width * height);
_output = alloc_invisible<u32>(width * height);
_rotate = alloc_invisible<u32>(width * height);
#else
_inputA = new u32[width * height]();
_inputB = new u32[width * height]();
_output = new u32[width * height]();
_rotate = new u32[width * height]();
#endif
if constexpr(ares::Video::Threaded) {
_thread = nall::thread::create({&Screen::main, this});

View File

@ -16,6 +16,7 @@ struct Platform {
virtual auto pak(Node::Object) -> shared_pointer<vfs::directory> { return {}; }
virtual auto event(Event) -> void {}
virtual auto log(string_view message) -> void {}
virtual auto status(string_view message) -> void {}
virtual auto video(Node::Video::Screen, const u32* data, u32 pitch, u32 width, u32 height) -> void {}
virtual auto audio(Node::Audio::Stream) -> void {}
virtual auto input(Node::Input::Input) -> void {}

View File

@ -1,6 +1,10 @@
struct Accuracy {
//enable all accuracy flags
#ifdef WATERBOXED
static constexpr bool Reference = 1;
#else
static constexpr bool Reference = 0;
#endif
struct CPU {
static constexpr bool Interpreter = 0 | Reference;

View File

@ -33,10 +33,12 @@ Gamepad::~Gamepad() {
}
// Controller Pak persistence is intentionally stubbed out; the original body is
// kept below, commented, for reference. NOTE(review): presumably the host
// frontend manages Controller Pak save data instead -- confirm.
auto Gamepad::save() -> void {
/*
if(!slot) return;
if(slot->name() == "Controller Pak") {
ram.save(pak->write("save.pak"));
}
*/
}
auto Gamepad::allocate(string name) -> Node::Peripheral {

View File

@ -99,6 +99,7 @@ auto System::unload() -> void {
vulkan.unload();
#endif
cartridgeSlot.unload();
puts("unloading port 1");
controllerPort1.unload();
controllerPort2.unload();
controllerPort3.unload();
@ -119,12 +120,14 @@ auto System::unload() -> void {
}
// System-wide save is intentionally stubbed out; the original body (cartridge
// plus all four controller ports) is kept below, commented, for reference.
// NOTE(review): presumably save-data persistence is handled by the host
// frontend in this integration -- confirm.
auto System::save() -> void {
/*
if(!node) return;
cartridge.save();
controllerPort1.save();
controllerPort2.save();
controllerPort3.save();
controllerPort4.save();
*/
}
auto System::power(bool reset) -> void {

View File

@ -100,7 +100,7 @@ auto VI::writeWord(u32 address, u32 data_) -> void {
n32 data = data_;
#if defined(VULKAN)
vulkan.writeWord(address, data);
if (vulkan.enable) vulkan.writeWord(address, data);
#endif
if(address == 0) {

View File

@ -10,11 +10,16 @@ VI vi;
auto VI::load(Node::Object parent) -> void {
node = parent->append<Node::Object>("VI");
u32 width = 640;
u32 height = 576;
#if defined(VULKAN)
screen = node->append<Node::Video::Screen>("Screen", vulkan.outputUpscale * 640, vulkan.outputUpscale * 576);
#else
screen = node->append<Node::Video::Screen>("Screen", 640, 576);
if (vulkan.enable) {
width *= vulkan.outputUpscale;
height *= vulkan.outputUpscale;
}
#endif
screen = node->append<Node::Video::Screen>("Screen", width, height);
screen->setRefresh({&VI::refresh, this});
screen->colors((1 << 24) + (1 << 15), [&](n32 color) -> n64 {
if(color < (1 << 24)) {
@ -31,10 +36,15 @@ auto VI::load(Node::Object parent) -> void {
return a << 48 | r << 32 | g << 16 | b << 0;
}
});
#if defined(VULKAN)
screen->setSize(vulkan.outputUpscale * 640, vulkan.outputUpscale * 480);
if(!vulkan.supersampleScanout) {
screen->setScale(1.0 / vulkan.outputUpscale, 1.0 / vulkan.outputUpscale);
if(vulkan.enable) {
screen->setSize(vulkan.outputUpscale * 640, vulkan.outputUpscale * 480);
if(!vulkan.supersampleScanout) {
screen->setScale(1.0 / vulkan.outputUpscale, 1.0 / vulkan.outputUpscale);
}
} else {
screen->setSize(640, 480);
}
#else
screen->setSize(640, 480);
@ -62,8 +72,10 @@ auto VI::main() -> void {
io.field = io.field + 1 & io.serrate;
if(!io.field) {
#if defined(VULKAN)
gpuOutputValid = vulkan.scanoutAsync(io.field);
vulkan.frame();
if (vulkan.enable) {
gpuOutputValid = vulkan.scanoutAsync(io.field);
vulkan.frame();
}
#endif
refreshed = true;
@ -81,7 +93,7 @@ auto VI::step(u32 clocks) -> void {
auto VI::refresh() -> void {
#if defined(VULKAN)
if(gpuOutputValid) {
if(vulkan.enable && gpuOutputValid) {
const u8* rgba = nullptr;
u32 width = 0, height = 0;
vulkan.mapScanoutRead(rgba, width, height);

View File

@ -0,0 +1 @@
31ea5eb2d6fcb2d8f1df5f0951364322d09ac01a

View File

@ -0,0 +1,20 @@
Copyright (c) 2020 Themaister
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@ -0,0 +1,265 @@
# paraLLEl-RDP
This project is a revival and complete rewrite of the old, defunct paraLLEl-RDP project.
The goal is to implement the Nintendo 64 RDP graphics chip as accurately as possible using Vulkan compute.
The implementation aims to be bitexact with the
[Angrylion-Plus](https://github.com/ata4/angrylion-rdp-plus) reference renderer where possible.
## Disclaimer
While paraLLEl-RDP uses [Angrylion-Plus](https://github.com/ata4/angrylion-rdp-plus)
as an implementation reference, it is not a port, and not a derived codebase of said project.
It is written from scratch by studying [Angrylion-Plus](https://github.com/ata4/angrylion-rdp-plus)
and trying to understand what is going on.
The test suite uses [Angrylion-Plus](https://github.com/ata4/angrylion-rdp-plus) as a reference
to validate implementation and cross-checking behavior.
## Use cases
- **Much** faster LLE RDP emulation of N64 compared to a CPU implementation
as parallel graphics workloads are offloaded to the GPU.
Emulation performance is now completely bound by CPU and LLE RSP performance.
Early benchmarking results suggest 2000 - 5000 VI/s being achieved on mid-range desktop GPUs based on timestamp data.
There is no way the CPU emulation can keep up with that, but that means this should
scale down to fairly gimped GPUs as well, assuming the driver requirements are met.
- A backend renderer for standalone engines which aim to efficiently reproduce faithful N64 graphics.
- Hopefully, an easier to understand implementation than the reference renderer.
- An esoteric use case of advanced Vulkan compute programming.
## Missing features
The implementation is quite complete, and compatibility is very high in the limited amount of content I've tested.
However, not every single feature is supported at this moment.
Ticking the last boxes depends mostly on real content making use of said features.
- Color combiner chroma keying
- Various "bugs" / questionable behavior that seems meaningless to emulate
- Certain extreme edge cases in TMEM upload. The implementation has tests for many "crazy" edge cases though.
- ... possibly other obscure features
The VI is essentially complete. A fancy deinterlacer might be useful to add since we have plenty of GPU cycles to spare in the graphics queue.
The VI filtering is always turned on if the game requests it, but individual features can be selectively turned off for pixel purists.
## Environment variables for development / testing
### `RDP_DEBUG` / `RDP_DEBUG_X` / `RDP_DEBUG_Y`
Supports printf in shaders, which is extremely useful to drill down difficult bugs.
Only printfs from certain pixels can be filtered through to avoid spam.
### `VI_DEBUG` / `VI_DEBUG_X` / `VI_DEBUG_Y`
Same as `RDP_DEBUG` but for the VI.
### `PARALLEL_RDP_MEASURE_SYNC_TIME`
Measures time stalled in `CommandProcessor::wait_for_timeline`. Useful to measure
CPU overhead in hard-synced emulator integrations.
### `PARALLEL_RDP_SMALL_TYPES=0`
Force-disables 8/16-bit arithmetic support. Useful when suspecting driver bugs.
### `PARALLEL_RDP_UBERSHADER=1`
Forces the use of ubershaders. Can be extremely slow depending on the shader compiler.
### `PARALLEL_RDP_FORCE_SYNC_SHADER=1`
Disables the async pipeline optimization, and blocks on every shader compilation.
Only use if the ubershader crashes, since this adds the dreaded shader compilation stalls.
### `PARALLEL_RDP_BENCH=1`
Measures RDP rendering time spent on GPU using Vulkan timestamps.
At end of a run, reports average time spent per render pass,
and how many render passes are flushed per frame.
### `PARALLEL_RDP_SUBGROUP=0`
Force-disables use of Vulkan subgroup operations,
which are used to optimize the tile binning algorithm.
### `PARALLEL_RDP_ALLOW_EXTERNAL_HOST=0`
Disables use of `VK_EXT_external_memory_host`. For testing.
## Vulkan driver requirements
paraLLEl-RDP requires up-to-date Vulkan implementations. A lot of the great improvements over the previous implementation
comes from the idea that we can implement N64's UMA by simply importing RDRAM directly as an SSBO and perform 8 and 16-bit
data access over the bus. With the tile based architecture in paraLLEl-RDP, this works very well and actual
PCI-e traffic is massively reduced. The bandwidth for doing this is also trivial. On iGPU systems, this also works really well, since
it's all the same memory anyways.
Thus, the requirements are as follows. All of these features are widely supported, or will soon be in drivers.
paraLLEl-RDP does not aim for compatibility with ancient hardware and drivers.
Just use the reference renderer for that. This is enthusiast software for a niche audience.
- Vulkan 1.1
- VK_KHR_8bit_storage / VK_KHR_16bit_storage
- Optionally VK_KHR_shader_float16_int8 which enables small integer arithmetic
- Optionally subgroup support with VK_EXT_subgroup_size_control
- For integration in emulators, VK_EXT_external_memory_host is currently required (may be relaxed later at some performance cost)
### Tested drivers
paraLLEl-RDP has been tested on Linux and Windows on all desktop vendors.
- Intel Mesa (20.0.6) - Passes conformance
- Intel Windows - Passes conformance (**CAVEAT**. Intel Windows requires 64 KiB alignment for host memory import, make sure to add some padding around RDRAM in an emulator to make this work well.)
- AMD RADV LLVM (20.0.6) - Passes conformance
- AMD RADV ACO - Passes conformance with bleeding edge drivers and `PARALLEL_RDP_SMALL_TYPES=0`.
- Linux AMDGPU-PRO - Passes conformance, with caveat that 8/16-bit arithmetic does not work correctly for some tests.
paraLLEl-RDP automatically disables small integer arithmetic for proprietary AMD driver.
- AMD Windows - Passes conformance with same caveat and workaround as AMDGPU-PRO.
- NVIDIA Linux - Passes conformance (**MAJOR CAVEAT**, NVIDIA Linux does not support VK_EXT_external_memory_host as of 2020-05-12.)
- NVIDIA Windows - Passes conformance
## Implementation strategy
This project uses Vulkan compute shaders to implement a fully programmable rasterization pipeline.
The overall rendering architecture is reused from [RetroWarp](https://github.com/Themaister/RetroWarp)
with some further refinements.
The lower level Vulkan backend comes from [Granite](https://github.com/Themaister/Granite).
### Asynchronous pipeline optimization
Toggleable paths in RDP state is expressed as specialization constants. The rendering thread will
detect new state combinations and kick off building pipelines which only specify exact state needed to render.
This is a massive performance optimization.
The same shaders are used for an "ubershader" fallback when pipelines are not ready.
In this case, specialization constants are simply not used.
The same SPIR-V modules are reused to great effect using this Vulkan feature.
### Tile-based rendering
See [RetroWarp](https://github.com/Themaister/RetroWarp) for more details.
### GPU-driven TMEM management
TMEM management is fully GPU-driven, but this is a very complicated implementation.
Certain combinations of formats are not supported, but such cases would produce
meaningless results, and it is unclear that applications can make meaningful use of these "weird" uploads.
### Synchronization
Synchronizing the GPU and CPU emulation is one of the hot button issues of N64 emulation.
The integration code is designed around a timeline of synchronization points which can be waited on by the CPU
when appropriate. For accurate emulation, an OpSyncFull is generally followed by a full wait,
but most games can be more relaxed and only synchronize with the CPU N frames later.
Implementation of this behavior is outside the scope of paraLLEl-RDP, and is left up to the integration code.
### Asynchronous compute
GPUs with a dedicated compute queue are recommended for optimal performance, since
RDP shading work can happen on the compute queue, and won't be blocked by graphics workloads happening
in the graphics queue, which will typically be VI scanout and frontend applying shaders on top.
## Project structure
This project implements several submodules which are quite useful.
### rdp-replayer
This app replays RDP dump files, which are produced by running content through an RDP dumper.
An implementation can be found in e.g. parallel-N64. The file format is very simple and essentially
contains a record of RDRAM changes and RDP command streams.
This dump is replayed and a live comparison between the reference renderer can be compared to paraLLEl-RDP
with visual output. The UI is extremely crude, and is not user-friendly, but good enough for my use.
### rdp-conformance
I made a somewhat comprehensive test suite for the RDP, with a custom higher level RDP command stream generator.
There are roughly ~150 fuzz tests which exercise many aspects of the RDP.
In order to pass the test, paraLLEl-RDP must produce bit-exact results compared to Angrylion,
so the test condition is as stringent as possible.
#### A note on bitexactness
There are a few cases where bit-exactness is a meaningless term, such as the noise feature of the RDP.
It is not particularly meaningful to exactly reproduce noise, since it is by its very nature unpredictable.
For that reason, this repo references a fork of the reference renderer which implements deterministic "undefined behavior"
where appropriate. The exact formulation of the noise generator is not very interesting as long as
correct entropy and output range is reproduced.
##### Intentional differences from reference renderer
Certain effects invoke "undefined behavior" in the RDP and requires cycle accuracy to resolve bit-accurately with real RDP.
Reference renderer attempts to emulate these effects, but to reproduce this behavior breaks any form of multi-threading.
To be able to validate dumps in a sensible way with buggy content, I modified the reference slightly to make certain
"undefined behavior" deterministic. This doesn't meaningfully change the rendered output in the cases I've seen in the wild.
Some of these effects would be possible to emulate,
but at the cost of lots of added complexity and it wouldn't be quite correct anyways given the cycle accuracy issue.
- CombinedColor/Alpha in first cycle is cleared to zero. Some games read this in first cycle,
and reference renderer will read whatever was generated last pixel.
This causes issues in some cases, where cycle accuracy would have caused the feedback to converge to zero over time.
- Reading LODFrac in 1 cycle mode. This is currently ignored. The results generated seem non-sensical. Never seen this in the wild.
- Using TexLOD in copy mode. This is currently ignored. The results generated seem non-sensical. Never seen this in the wild.
- Reading MemoryColor in first blender cycle in 2-cycle mode. Reference seems to wait until the second cycle before updating this value,
despite memory coverage being updated right away. The sensible thing to do is to allow reading memory color in first cycle.
- Alpha testing in 2-cycle mode reads combined alpha from next pixel in reference.
Just doing alpha testing in first cycle on current pixel is good enough.
If this is correct hardware behavior, I consider this a hardware bug.
- Reading Texel1 in cycle 1 of 2-cycle mode reads the Texel0 from next pixel.
In the few cases I've seen this, the rendered output is slightly buggy, but it's hardly visible in motion.
The workaround is just to read Texel0 from current pixel which still renders fine.
### vi-conformance
This is a conformance suite, except for the video interface (VI) unit.
### rdp-validate-dump
This tool replays an RDP dump headless and compares outputs between reference renderer and paraLLEl-RDP.
To pass, bitexact output must be generated.
## Build
Checkout submodules. This pulls in Angrylion-Plus as well as Granite.
```
git submodule update --init --recursive
```
Standard CMake build.
```
mkdir build
cd build
cmake ..
cmake --build . --parallel (--config Release on MSVC)
```
### Run test suite
You can run rdp-conformance and vi-conformance with ctest to verify if your driver is behaving correctly.
```
ctest (-C Release on MSVC)
```
### Embedding shaders in a C++ header
If embedding paraLLEl-RDP in an emulator project, it is helpful to pre-compile and bake SPIR-V shaders in a C++ header.
Build slangmosh from Granite, and then run:
```
slangmosh parallel-rdp/shaders/slangmosh.json --output slangmosh.hpp --vk11 --strip -O --namespace RDP
```
### Generating a standalone code base for emulator integration
Run the `generate_standalone_codebase.sh $OUTDIR` script with an output directory `$OUTDIR/` as argument to generate a standalone code base which can be built without any special build system support.
Include `$OUTDIR/config.mk` if building with Make to make your life easier.
Note that `slangmosh` must be in your path for this script to run. It executes the command above to build `slangmosh.hpp`.
## License
paraLLEl-RDP is licensed under the permissive license MIT. See included LICENSE file.
This implementation builds heavily on the knowledge (but not code) gained from studying the reference implementation,
thus it felt fair to release it under a permissive license, so my work could be reused more easily.

View File

@ -0,0 +1,53 @@
# For use in standalone implementations.
# Consumers must set $(PARALLEL_RDP_IMPLEMENTATION) to the standalone code base
# root before including this file, then append the variables below to their own
# flags/sources.
PARALLEL_RDP_CFLAGS :=
PARALLEL_RDP_CXXFLAGS := -DGRANITE_VULKAN_MT
# All C++ sources: the parallel-rdp renderer plus the subset of Granite's
# vulkan/ and util/ layers it depends on.
PARALLEL_RDP_SOURCES_CXX := \
$(wildcard $(PARALLEL_RDP_IMPLEMENTATION)/parallel-rdp/*.cpp) \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/buffer.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/buffer_pool.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/command_buffer.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/command_pool.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/context.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/cookie.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/descriptor_set.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/device.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/event_manager.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/fence.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/fence_manager.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/image.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/memory_allocator.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/pipeline_event.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/query_pool.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/render_pass.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/sampler.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/semaphore.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/semaphore_manager.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/shader.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/texture_format.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/logging.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/thread_id.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/aligned_alloc.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/timer.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/timeline_trace_file.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/thread_name.cpp
# C sources: only the volk Vulkan loader.
PARALLEL_RDP_SOURCES_C := \
$(PARALLEL_RDP_IMPLEMENTATION)/volk/volk.c
PARALLEL_RDP_INCLUDE_DIRS := \
-I$(PARALLEL_RDP_IMPLEMENTATION)/parallel-rdp \
-I$(PARALLEL_RDP_IMPLEMENTATION)/volk \
-I$(PARALLEL_RDP_IMPLEMENTATION)/vulkan \
-I$(PARALLEL_RDP_IMPLEMENTATION)/vulkan-headers/include \
-I$(PARALLEL_RDP_IMPLEMENTATION)/util
PARALLEL_RDP_LDFLAGS := -pthread
# Non-Windows platforms need libdl for dynamic Vulkan loading; Windows builds
# get the Win32 WSI define and winmm instead.
ifeq (,$(findstring win,$(platform)))
PARALLEL_RDP_LDFLAGS += -ldl
else
PARALLEL_RDP_CFLAGS += -DVK_USE_PLATFORM_WIN32_KHR
PARALLEL_RDP_LDFLAGS += -lwinmm
endif

View File

@ -0,0 +1,135 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <chrono>
#include "command_ring.hpp"
#include "rdp_device.hpp"
#include "thread_id.hpp"
#include <assert.h>
namespace RDP
{
// (Re)initializes the ring and spawns the consumer thread.
// count is the ring capacity in words; it must be a power of two because ring
// indices are wrapped by masking with (count - 1).
void CommandRing::init(
#ifdef PARALLEL_RDP_SHADER_DIR
Granite::Global::GlobalManagersHandle global_handles_,
#endif
CommandProcessor *processor_, unsigned count)
{
assert((count & (count - 1)) == 0);
// Stop any previous consumer thread before touching shared state; after this
// call no other thread reads the counters, so plain assignment is safe.
teardown_thread();
processor = processor_;
ring.resize(count);
write_count = 0;
read_count = 0;
#ifdef PARALLEL_RDP_SHADER_DIR
global_handles = std::move(global_handles_);
#endif
// Start the consumer last; std::thread construction synchronizes-with the
// start of thread_loop, so the initialization above is visible to it.
thr = std::thread(&CommandRing::thread_loop, this);
}
// Shuts down the consumer thread if it is running. A zero-length command
// (num_words == 0, words == nullptr) acts as the shutdown sentinel:
// thread_loop() breaks out of its loop when it dequeues an empty payload.
void CommandRing::teardown_thread()
{
if (thr.joinable())
{
enqueue_command(0, nullptr);
thr.join();
}
}
// Ensures the consumer thread is joined before members (mutex, ring) are destroyed.
CommandRing::~CommandRing()
{
teardown_thread();
}
// Blocks the caller until the consumer thread has handed every word enqueued
// so far to the CommandProcessor (i.e. completed_count has caught up with
// write_count). Both counters are protected by `lock`.
void CommandRing::drain()
{
	std::unique_lock<std::mutex> guard{lock};
	// Explicit re-check loop: equivalent to the predicate overload of
	// condition_variable::wait, and robust against spurious wakeups.
	while (completed_count != write_count)
		cond.wait(guard);
}
// Publishes one packet: a length word followed by num_words payload words.
// Blocks while the ring lacks room for (num_words + 1) entries.
void CommandRing::enqueue_command(unsigned num_words, const uint32_t *words)
{
std::unique_lock<std::mutex> holder{lock};
// Wait until the packet fits inside one ring's worth of unconsumed space.
// Counters are monotonic 64-bit values, so the difference is the fill level.
cond.wait(holder, [this, num_words]() {
return write_count + num_words + 1 <= read_count + ring.size();
});
// ring.size() is a power of two (asserted in init), so masking wraps indices.
size_t mask = ring.size() - 1;
ring[write_count++ & mask] = num_words;
for (unsigned i = 0; i < num_words; i++)
ring[write_count++ & mask] = words[i];
cond.notify_one();
}
// Consumer loop: dequeues packets and forwards them to the CommandProcessor.
// If no command arrives within 500us, a MetaIdle command is synthesized so the
// renderer can kick pending work. A zero-length packet (enqueued by
// teardown_thread) terminates the loop.
void CommandRing::thread_loop()
{
Util::register_thread_index(0);
#ifdef PARALLEL_RDP_SHADER_DIR
// Here to let the RDP play nice with full Granite.
// When we move to standalone Granite, we won't need to interact with global subsystems like this.
Granite::Global::set_thread_context(*global_handles);
global_handles.reset();
#endif
std::vector<uint32_t> tmp_buffer;
tmp_buffer.reserve(64);
size_t mask = ring.size() - 1;
for (;;)
{
bool is_idle = false;
{
std::unique_lock<std::mutex> holder{lock};
// Timed predicate wait: true => a packet is available, copy it out under
// the lock; false (timeout) => fall through and fabricate an idle tick.
if (cond.wait_for(holder, std::chrono::microseconds(500), [this]() { return write_count > read_count; }))
{
uint32_t num_words = ring[read_count++ & mask];
tmp_buffer.resize(num_words);
for (uint32_t i = 0; i < num_words; i++)
tmp_buffer[i] = ring[read_count++ & mask];
}
else
{
// If we don't receive commands at a steady pace,
// notify rendering thread that we should probably kick some work.
tmp_buffer.resize(1);
tmp_buffer[0] = uint32_t(Op::MetaIdle) << 24;
is_idle = true;
}
}
// A zero-length packet is the shutdown sentinel from teardown_thread().
if (tmp_buffer.empty())
break;
// Forward outside the lock so producers are not stalled by processing.
processor->enqueue_command_direct(tmp_buffer.size(), tmp_buffer.data());
if (!is_idle)
{
// Only real packets advance completed_count and wake drain();
// synthesized idle ticks consumed nothing from the ring.
std::lock_guard<std::mutex> holder{lock};
completed_count = read_count;
cond.notify_one();
}
}
}
}

View File

@ -0,0 +1,67 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <thread>
#include <mutex>
#include <condition_variable>
#include <vector>
#ifdef PARALLEL_RDP_SHADER_DIR
#include "global_managers.hpp"
#endif
namespace RDP
{
class CommandProcessor;
// Single-producer ring of raw RDP command words. A producer thread calls
// enqueue_command(); an internal worker thread (started by init()) dequeues
// packets and forwards them to the CommandProcessor.
class CommandRing
{
public:
// Spawns the worker thread. count is the ring capacity in 32-bit words and
// must be a power of two (asserted in the implementation).
void init(
#ifdef PARALLEL_RDP_SHADER_DIR
Granite::Global::GlobalManagersHandle global_handles,
#endif
CommandProcessor *processor, unsigned count);
// Joins the worker thread before members are destroyed.
~CommandRing();
// Blocks until every enqueued command has been handed to the processor.
void drain();
// Copies num_words words into the ring; blocks while the ring is full.
void enqueue_command(unsigned num_words, const uint32_t *words);
private:
CommandProcessor *processor = nullptr;
std::thread thr;
std::mutex lock;
std::condition_variable cond;
std::vector<uint32_t> ring;
// Monotonic word counters; ring indices are these values masked by size - 1.
uint64_t write_count = 0;
uint64_t read_count = 0;
uint64_t completed_count = 0;
void thread_loop();
void teardown_thread();
#ifdef PARALLEL_RDP_SHADER_DIR
Granite::Global::GlobalManagersHandle global_handles;
#endif
};
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,402 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
// Forward declarations so this common header does not pull in the Vulkan backend.
namespace Vulkan
{
class Program;
class Shader;
}
namespace RDP
{
// Shaders<> is defined elsewhere; ShaderBank is its concrete instantiation
// over raw Vulkan program/shader pointers.
template <typename Program, typename Shader> struct Shaders;
using ShaderBank = Shaders<Vulkan::Program *, Vulkan::Shader *>;
// list of command IDs
// NOTE(review): Nop/Meta* (values 0..4) look like paraLLEl-RDP-internal
// control commands (MetaIdle is synthesized by CommandRing); values 0x08 and
// up appear to mirror hardware RDP display-list opcodes -- confirm against
// RDP documentation.
enum class Op
{
Nop = 0,
MetaSignalTimeline = 1,
MetaFlush = 2,
MetaIdle = 3,
MetaSetQuirks = 4,
FillTriangle = 0x08,
FillZBufferTriangle = 0x09,
TextureTriangle = 0x0a,
TextureZBufferTriangle = 0x0b,
ShadeTriangle = 0x0c,
ShadeZBufferTriangle = 0x0d,
ShadeTextureTriangle = 0x0e,
ShadeTextureZBufferTriangle = 0x0f,
TextureRectangle = 0x24,
TextureRectangleFlip = 0x25,
SyncLoad = 0x26,
SyncPipe = 0x27,
SyncTile = 0x28,
SyncFull = 0x29,
SetKeyGB = 0x2a,
SetKeyR = 0x2b,
SetConvert = 0x2c,
SetScissor = 0x2d,
SetPrimDepth = 0x2e,
SetOtherModes = 0x2f,
LoadTLut = 0x30,
SetTileSize = 0x32,
LoadBlock = 0x33,
LoadTile = 0x34,
SetTile = 0x35,
FillRectangle = 0x36,
SetFillColor = 0x37,
SetFogColor = 0x38,
SetBlendColor = 0x39,
SetPrimColor = 0x3a,
SetEnvColor = 0x3b,
SetCombine = 0x3c,
SetTextureImage = 0x3d,
SetMaskImage = 0x3e,
SetColorImage = 0x3f
};
// Input selectors for the color-combiner stage (programmed via SetCombine).
// RGB multiply-slot input.
enum class RGBMul : uint8_t
{
Combined = 0,
Texel0 = 1,
Texel1 = 2,
Primitive = 3,
Shade = 4,
Env = 5,
KeyScale = 6,
CombinedAlpha = 7,
Texel0Alpha = 8,
Texel1Alpha = 9,
PrimitiveAlpha = 10,
ShadeAlpha = 11,
EnvAlpha = 12,
LODFrac = 13,
PrimLODFrac = 14,
ConvertK5 = 15,
Zero = 16
};
// RGB "add" operand of the multiply side.
enum class RGBMulAdd : uint8_t
{
Combined = 0,
Texel0 = 1,
Texel1 = 2,
Primitive = 3,
Shade = 4,
Env = 5,
One = 6,
Noise = 7,
Zero = 8
};
// RGB "subtract" operand of the multiply side.
enum class RGBMulSub : uint8_t
{
Combined = 0,
Texel0 = 1,
Texel1 = 2,
Primitive = 3,
Shade = 4,
Env = 5,
KeyCenter = 6,
ConvertK4 = 7,
Zero = 8
};
// RGB final "add" input.
enum class RGBAdd : uint8_t
{
Combined = 0,
Texel0 = 1,
Texel1 = 2,
Primitive = 3,
Shade = 4,
Env = 5,
One = 6,
Zero = 7
};
// Alpha-channel add/subtract input (shared by both slots).
enum class AlphaAddSub : uint8_t
{
CombinedAlpha = 0,
Texel0Alpha = 1,
Texel1Alpha = 2,
PrimitiveAlpha = 3,
ShadeAlpha = 4,
EnvAlpha = 5,
One = 6,
Zero = 7
};
// Alpha-channel multiply input for the combiner.
enum class AlphaMul : uint8_t
{
LODFrac = 0,
Texel0Alpha = 1,
Texel1Alpha = 2,
PrimitiveAlpha = 3,
ShadeAlpha = 4,
EnvAlpha = 5,
PrimLODFrac = 6,
Zero = 7
};
// Texel size in bits per pixel (4/8/16/32).
enum class TextureSize : uint8_t
{
Bpp4 = 0,
Bpp8 = 1,
Bpp16 = 2,
Bpp32 = 3
};
// Texel storage format (CI = color-indexed, IA = intensity+alpha, I = intensity).
enum class TextureFormat : uint8_t
{
RGBA = 0,
YUV = 1,
CI = 2,
IA = 3,
I = 4
};
// Dither-matrix selection for the RGB channels.
enum class RGBDitherMode : uint8_t
{
Magic = 0,
Bayer = 1,
Noise = 2,
Off = 3
};
// Dither selection for the alpha channel.
enum class AlphaDitherMode : uint8_t
{
Pattern = 0,
InvPattern = 1,
Noise = 2,
Off = 3
};
// Pipeline cycle type (1-cycle, 2-cycle, copy, fill) from SetOtherModes.
enum class CycleType : uint8_t
{
Cycle1 = 0,
Cycle2 = 1,
Copy = 2,
Fill = 3
};
// Blender input selectors (cycle 1 / cycle 2, operands A and B).
enum class BlendMode1A : uint8_t
{
PixelColor = 0,
MemoryColor = 1,
BlendColor = 2,
FogColor = 3
};
enum class BlendMode1B : uint8_t
{
PixelAlpha = 0,
FogAlpha = 1,
ShadeAlpha = 2,
Zero = 3
};
enum class BlendMode2A : uint8_t
{
PixelColor = 0,
MemoryColor = 1,
BlendColor = 2,
FogColor = 3
};
enum class BlendMode2B : uint8_t
{
InvPixelAlpha = 0,
MemoryAlpha = 1,
One = 2,
Zero = 3
};
// Coverage-accumulation mode for the framebuffer.
enum class CoverageMode : uint8_t
{
Clamp = 0,
Wrap = 1,
Zap = 2,
Save = 3
};
// Depth-compare mode.
enum class ZMode : uint8_t
{
Opaque = 0,
Interpenetrating = 1,
Transparent = 2,
Decal = 3
};
// Per-axis clamp/mirror flags for a tile; packed into TileInfoFlags.
enum TileInfoFlagBits
{
TILE_INFO_CLAMP_S_BIT = 1 << 0,
TILE_INFO_MIRROR_S_BIT = 1 << 1,
TILE_INFO_CLAMP_T_BIT = 1 << 2,
TILE_INFO_MIRROR_T_BIT = 1 << 3
};
using TileInfoFlags = uint8_t;
// S/T low/high coordinate bounds of a tile (SET_TILE_SIZE state).
struct TileSize
{
uint32_t slo = 0;
uint32_t shi = 0;
uint32_t tlo = 0;
uint32_t thi = 0;
};
// TMEM placement and sampling parameters of a tile (SET_TILE state).
struct TileMeta
{
uint32_t offset = 0;
uint32_t stride = 0;
TextureFormat fmt = TextureFormat::RGBA;
TextureSize size = TextureSize::Bpp16;
uint8_t palette = 0;
uint8_t mask_s = 0;
uint8_t shift_s = 0;
uint8_t mask_t = 0;
uint8_t shift_t = 0;
TileInfoFlags flags = 0;
};
// Full tile descriptor; sizeof is static_asserted to 32 bytes below, so
// do not reorder or repack the fields.
struct TileInfo
{
TileSize size;
TileMeta meta;
};
// The RGB combiner equation selectors: (muladd - mulsub) * mul + add.
struct CombinerInputsRGB
{
RGBMulAdd muladd;
RGBMulSub mulsub;
RGBMul mul;
RGBAdd add;
};
// The alpha combiner equation selectors, same (a - b) * c + d structure.
struct CombinerInputsAlpha
{
AlphaAddSub muladd;
AlphaAddSub mulsub;
AlphaMul mul;
AlphaAddSub add;
};
// One combiner cycle's worth of RGB + alpha selectors.
struct CombinerInputs
{
CombinerInputsRGB rgb;
CombinerInputsAlpha alpha;
};
// Blender mux selection for one blend cycle.
struct BlendModes
{
BlendMode1A blend_1a;
BlendMode1B blend_1b;
BlendMode2A blend_2a;
BlendMode2B blend_2b;
};
// TileInfo layout is fixed at 32 bytes (presumably consumed by shaders as
// raw data -- keep field order and packing intact).
static_assert(sizeof(TileInfo) == 32, "TileInfo must be 32 bytes.");
// Video Interface register indices, in hardware register order.
// Count is a sentinel for sizing arrays, not a real register.
enum class VIRegister
{
Control = 0,
Origin,
Width,
Intr,
VCurrentLine,
Timing,
VSync,
HSync,
Leap,
HStart,
VStart,
VBurst,
XScale,
YScale,
Count
};
// Bit layout of the VI Control register. TYPE (bits 1:0) and AA_MODE
// (bits 9:8) are multi-bit fields with _MASK values; the META_* bits sit
// above bit 16 and look implementation-internal rather than hardware bits
// (NOTE(review): confirm against VI handling code).
enum VIControlFlagBits
{
VI_CONTROL_TYPE_BLANK_BIT = 0 << 0,
VI_CONTROL_TYPE_RESERVED_BIT = 1 << 0,
VI_CONTROL_TYPE_RGBA5551_BIT = 2 << 0,
VI_CONTROL_TYPE_RGBA8888_BIT = 3 << 0,
VI_CONTROL_TYPE_MASK = 3 << 0,
VI_CONTROL_GAMMA_DITHER_ENABLE_BIT = 1 << 2,
VI_CONTROL_GAMMA_ENABLE_BIT = 1 << 3,
VI_CONTROL_DIVOT_ENABLE_BIT = 1 << 4,
VI_CONTROL_SERRATE_BIT = 1 << 6,
VI_CONTROL_AA_MODE_RESAMP_EXTRA_ALWAYS_BIT = 0 << 8,
VI_CONTROL_AA_MODE_RESAMP_EXTRA_BIT = 1 << 8,
VI_CONTROL_AA_MODE_RESAMP_ONLY_BIT = 2 << 8,
VI_CONTROL_AA_MODE_RESAMP_REPLICATE_BIT = 3 << 8,
VI_CONTROL_AA_MODE_MASK = 3 << 8,
VI_CONTROL_DITHER_FILTER_ENABLE_BIT = 1 << 16,
VI_CONTROL_META_AA_BIT = 1 << 17,
VI_CONTROL_META_SCALE_BIT = 1 << 18
};
// Full 32-bit VI Control register value built from VIControlFlagBits.
using VIControlFlags = uint32_t;
// Packs a start/end pair into a VI *_START-style register word:
// bits [25:16] hold the 10-bit start value, bits [9:0] the 10-bit end value.
static inline uint32_t make_vi_start_register(uint32_t start_value, uint32_t end_value)
{
	uint32_t hi = start_value & 0x3ffu;
	uint32_t lo = end_value & 0x3ffu;
	return (hi << 16) | lo;
}
// Packs a VI X/Y scale register word: bits [27:16] hold the 12-bit
// subpixel bias, bits [11:0] the 12-bit scale factor.
static inline uint32_t make_vi_scale_register(uint32_t scale_factor, uint32_t bias)
{
	uint32_t packed = bias & 0xfffu;
	packed <<= 16;
	packed |= scale_factor & 0xfffu;
	return packed;
}
// Fixed video timing parameters: total scanlines per frame, the first
// visible horizontal/vertical offsets, and visible vertical resolution,
// for NTSC and PAL respectively.
constexpr uint32_t VI_V_SYNC_NTSC = 525;
constexpr uint32_t VI_V_SYNC_PAL = 625;
constexpr uint32_t VI_H_OFFSET_NTSC = 108;
constexpr uint32_t VI_H_OFFSET_PAL = 128;
constexpr uint32_t VI_V_OFFSET_NTSC = 34;
constexpr uint32_t VI_V_OFFSET_PAL = 44;
constexpr uint32_t VI_V_RES_NTSC = 480;
constexpr uint32_t VI_V_RES_PAL = 576;
// Horizontal scanout width in VI pixels, shared by both standards.
constexpr int VI_SCANOUT_WIDTH = 640;
// Default NTSC V_START register: vertical offset 34, spanning 224 visible
// lines per field (doubled for the interlaced line count).
static inline uint32_t make_default_v_start()
{
	constexpr uint32_t visible_lines = 224 * 2;
	return make_vi_start_register(VI_V_OFFSET_NTSC, VI_V_OFFSET_NTSC + visible_lines);
}
// Default NTSC H_START register: horizontal offset 108 spanning the full
// 640-pixel scanout width.
static inline uint32_t make_default_h_start()
{
	constexpr uint32_t h_begin = VI_H_OFFSET_NTSC;
	return make_vi_start_register(h_begin, h_begin + VI_SCANOUT_WIDTH);
}
// Sign-extends the low 'bits' bits of v to a full int32_t.
// Assigning through a signed bit-field performs the truncation and sign
// extension in one step (implementation-defined, but consistent on the
// compilers this project targets).
template <int bits>
static int32_t sext(int32_t v)
{
	struct Narrow { int32_t field : bits; } tmp;
	tmp.field = v;
	return tmp.field;
}
}

View File

@ -0,0 +1,389 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "rdp_common.hpp"
namespace RDP
{
// Per-triangle flags carried in TriangleSetup::flags.
enum TriangleSetupFlagBits
{
TRIANGLE_SETUP_FLIP_BIT = 1 << 0,
TRIANGLE_SETUP_DO_OFFSET_BIT = 1 << 1,
TRIANGLE_SETUP_SKIP_XFRAC_BIT = 1 << 2,
TRIANGLE_SETUP_INTERLACE_FIELD_BIT = 1 << 3,
TRIANGLE_SETUP_INTERLACE_KEEP_ODD_BIT = 1 << 4,
TRIANGLE_SETUP_DISABLE_UPSCALING_BIT = 1 << 5,
TRIANGLE_SETUP_NATIVE_LOD_BIT = 1 << 6
};
// Bitmask of TriangleSetupFlagBits, packed into one byte.
using TriangleSetupFlags = uint8_t;
// Rasterization state bits that select shader behavior. Note that
// RASTERIZATION_UPSCALING_LOG2_BIT_OFFSET (26) is a bit *offset* for a
// multi-bit field, not a flag value like its neighbors.
enum StaticRasterizationFlagBits
{
RASTERIZATION_INTERLACE_FIELD_BIT = 1 << 0,
RASTERIZATION_INTERLACE_KEEP_ODD_BIT = 1 << 1,
RASTERIZATION_AA_BIT = 1 << 2,
RASTERIZATION_PERSPECTIVE_CORRECT_BIT = 1 << 3,
RASTERIZATION_TLUT_BIT = 1 << 4,
RASTERIZATION_TLUT_TYPE_BIT = 1 << 5,
RASTERIZATION_CVG_TIMES_ALPHA_BIT = 1 << 6,
RASTERIZATION_ALPHA_CVG_SELECT_BIT = 1 << 7,
RASTERIZATION_MULTI_CYCLE_BIT = 1 << 8,
RASTERIZATION_TEX_LOD_ENABLE_BIT = 1 << 9,
RASTERIZATION_SHARPEN_LOD_ENABLE_BIT = 1 << 10,
RASTERIZATION_DETAIL_LOD_ENABLE_BIT = 1 << 11,
RASTERIZATION_FILL_BIT = 1 << 12,
RASTERIZATION_COPY_BIT = 1 << 13,
RASTERIZATION_SAMPLE_MODE_BIT = 1 << 14,
RASTERIZATION_ALPHA_TEST_BIT = 1 << 15,
RASTERIZATION_ALPHA_TEST_DITHER_BIT = 1 << 16,
RASTERIZATION_SAMPLE_MID_TEXEL_BIT = 1 << 17,
RASTERIZATION_USES_TEXEL0_BIT = 1 << 18,
RASTERIZATION_USES_TEXEL1_BIT = 1 << 19,
RASTERIZATION_USES_LOD_BIT = 1 << 20,
RASTERIZATION_USES_PIPELINED_TEXEL1_BIT = 1 << 21,
RASTERIZATION_CONVERT_ONE_BIT = 1 << 22,
RASTERIZATION_BILERP_0_BIT = 1 << 23,
RASTERIZATION_BILERP_1_BIT = 1 << 24,
RASTERIZATION_UPSCALING_LOG2_BIT_OFFSET = 26,
RASTERIZATION_NEED_NOISE_BIT = 1 << 28,
RASTERIZATION_USE_STATIC_TEXTURE_SIZE_FORMAT_BIT = 1 << 29,
RASTERIZATION_USE_SPECIALIZATION_CONSTANT_BIT = 1 << 30
};
// Bitmask of StaticRasterizationFlagBits.
using StaticRasterizationFlags = uint32_t;
// Depth-test and blender state bits. Bit 2 is intentionally unassigned
// here (no flag uses 1 << 2).
enum DepthBlendFlagBits
{
DEPTH_BLEND_DEPTH_TEST_BIT = 1 << 0,
DEPTH_BLEND_DEPTH_UPDATE_BIT = 1 << 1,
DEPTH_BLEND_FORCE_BLEND_BIT = 1 << 3,
DEPTH_BLEND_IMAGE_READ_ENABLE_BIT = 1 << 4,
DEPTH_BLEND_COLOR_ON_COVERAGE_BIT = 1 << 5,
DEPTH_BLEND_MULTI_CYCLE_BIT = 1 << 6,
DEPTH_BLEND_AA_BIT = 1 << 7,
DEPTH_BLEND_DITHER_ENABLE_BIT = 1 << 8
};
// Bitmask of DepthBlendFlagBits.
using DepthBlendFlags = uint32_t;
// Edge-walker setup for one triangle: x intercepts and slopes per edge
// plus y extents. Size is static_asserted to a 16-byte multiple below,
// so field order and types must not change.
struct TriangleSetup
{
int32_t xh, xm, xl;
int16_t yh, ym;
int32_t dxhdy, dxmdy, dxldy;
int16_t yl;
TriangleSetupFlags flags;
uint8_t tile;
};
// Per-triangle attribute plane equations: base value plus d/dx, d/de and
// d/dy derivatives for color (r,g,b,a) and texture/depth (s,t,z,w).
struct AttributeSetup
{
int32_t r, g, b, a;
int32_t drdx, dgdx, dbdx, dadx;
int32_t drde, dgde, dbde, dade;
int32_t drdy, dgdy, dbdy, dady;
int32_t s, t, z, w;
int32_t dsdx, dtdx, dzdx, dwdx;
int32_t dsde, dtde, dzde, dwde;
int32_t dsdy, dtdy, dzdy, dwdy;
};
// RGBA constants feeding the four combiner input slots for one cycle.
struct ConstantCombinerInputs
{
uint8_t muladd[4];
uint8_t mulsub[4];
uint8_t mul[4];
uint8_t add[4];
};
// Per-primitive state which is very dynamic in nature and does not change anything about the shader itself.
struct DerivedSetup
{
ConstantCombinerInputs constants[2];
uint8_t fog_color[4];
uint8_t blend_color[4];
uint32_t fill_color;
uint16_t dz;
uint8_t dz_compressed;
uint8_t min_lod;
int16_t convert_factors[4];
};
// These layouts are consumed as raw data (size/alignment asserted), so
// do not reorder or repack fields.
static_assert((sizeof(TriangleSetup) & 15) == 0, "TriangleSetup must be aligned to 16 bytes.");
static_assert((sizeof(AttributeSetup) & 15) == 0, "AttributeSetup must be aligned to 16 bytes.");
static_assert(sizeof(DerivedSetup) == 56, "DerivedSetup is not 56 bytes.");
// Scissor rectangle in lo/hi coordinate pairs.
struct ScissorState
{
uint32_t xlo;
uint32_t ylo;
uint32_t xhi;
uint32_t yhi;
};
// Shader-selecting rasterization state: both combiner cycles plus mode
// flags. Fixed 32-byte layout (asserted below).
struct StaticRasterizationState
{
CombinerInputs combiner[2];
StaticRasterizationFlags flags;
uint32_t dither;
uint32_t texture_size;
uint32_t texture_fmt;
};
static_assert(sizeof(StaticRasterizationState) == 32, "StaticRasterizationState must be 32 bytes.");
// Depth-test and blender configuration for both blend cycles.
// Fixed 16-byte layout (asserted below).
struct DepthBlendState
{
BlendModes blend_cycles[2];
DepthBlendFlags flags;
CoverageMode coverage_mode;
ZMode z_mode;
uint8_t padding[2];
};
static_assert(sizeof(DepthBlendState) == 16, "DepthBlendState must be 16 bytes.");
// Per-primitive indices into the deduplicated state caches (static raster
// state, depth/blend state, TMEM instance and the eight tile descriptors).
struct InstanceIndices
{
uint8_t static_index;
uint8_t depth_blend_index;
uint8_t tile_instance_index;
uint8_t padding[5];
uint8_t tile_indices[8];
};
static_assert((sizeof(InstanceIndices) & 15) == 0, "InstanceIndices must be aligned to 16 bytes.");
// Parameters for one texture upload from RDRAM (vram_*) into TMEM
// (tmem_*); mode presumably maps to UploadMode (tile/TLUT/block) --
// NOTE(review): confirm against the TMEM upload shader.
struct UploadInfo
{
int32_t width, height;
float min_t_mod, max_t_mod;
int32_t vram_addr;
int32_t vram_width;
int32_t vram_size;
int32_t vram_effective_width;
int32_t tmem_offset;
int32_t tmem_stride_words;
int32_t tmem_size;
int32_t tmem_fmt;
int32_t mode;
float inv_tmem_stride_words;
int32_t dxt;
int32_t padding;
};
static_assert((sizeof(UploadInfo) & 15) == 0, "UploadInfo must be aligned to 16 bytes.");
// Interpolated attribute values plus x extents for one scanline span.
struct SpanSetup
{
int32_t r, g, b, a;
int32_t s, t, w, z;
int16_t xlo[4];
int16_t xhi[4];
int32_t interpolation_base_x;
int32_t start_x;
int32_t end_x;
int16_t lodlength;
uint16_t valid_line;
};
static_assert((sizeof(SpanSetup) & 15) == 0, "SpanSetup is not aligned to 16 bytes.");
// Where a primitive's spans live in the span buffer and which y range
// they cover.
struct SpanInfoOffsets
{
int32_t offset, ylo, yhi, padding;
};
static_assert((sizeof(SpanInfoOffsets) == 16), "SpanInfoOffsets is not 16 bytes.");
// One unit of span-interpolation work: a primitive and its y sub-range.
struct SpanInterpolationJob
{
uint16_t primitive_index, base_y, max_y, padding;
};
static_assert((sizeof(SpanInterpolationJob) == 8), "SpanInterpolationJob is not 8 bytes.");
// Per-render-pass globals: framebuffer address indices, dimensions and
// the active work-group mask.
struct GlobalState
{
uint32_t addr_index;
uint32_t depth_addr_index;
uint32_t fb_width, fb_height;
uint32_t group_mask;
};
// One tile of shading work: screen tile coordinates, the tile instance
// slot and the primitive to shade.
struct TileRasterWork
{
uint32_t tile_x, tile_y;
uint32_t tile_instance;
uint32_t primitive;
};
static_assert((sizeof(TileRasterWork) == 16), "TileRasterWork is not 16 bytes.");
// Framebuffer addressing parameters shared by the whole pass.
struct GlobalFBInfo
{
uint32_t dx_shift;
uint32_t dx_mask;
uint32_t fb_size;
uint32_t base_primitive_index;
};
// Small fixed-capacity deduplicating cache of POD state blocks.
// add() returns the index of an existing identical element when present,
// checking the most-recently-referenced slot first, and appends otherwise.
// Equality is raw memcmp, so T must be trivially copyable with
// deterministic padding.
template <typename T, unsigned N>
class StateCache
{
public:
	// Returns the slot index for t, appending it if not yet cached.
	// Asserts when the cache is full and t is not already present.
	unsigned add(const T &t)
	{
		if (cached_index >= 0 && memcmp(&elements[cached_index], &t, sizeof(T)) == 0)
			return unsigned(cached_index);

		// Search newest-to-oldest; recently added states are the most likely match.
		for (int i = int(count) - 1; i >= 0; i--)
		{
			if (memcmp(&elements[i], &t, sizeof(T)) == 0)
			{
				cached_index = i;
				return unsigned(i);
			}
		}

		assert(count < N);
		memcpy(elements + count, &t, sizeof(T));
		cached_index = int(count);
		return count++;
	}

	// True when no further unique states can be added.
	bool full() const
	{
		return count == N;
	}

	// Number of distinct cached states.
	unsigned size() const
	{
		return count;
	}

	// Size of the cached states in bytes (for GPU uploads).
	unsigned byte_size() const
	{
		return count * sizeof(T);
	}

	// Contiguous storage of the cached states.
	const T *data() const
	{
		return elements;
	}

	// Empties the cache and forgets the last-referenced slot.
	void reset()
	{
		count = 0;
		cached_index = -1;
	}

	bool empty() const
	{
		return count == 0;
	}

private:
	unsigned count = 0;
	int cached_index = -1;
	T elements[N];
};
// Fixed-capacity append-only buffer of POD elements, used to batch
// per-primitive data before uploading it to the GPU. No deduplication;
// add() asserts when capacity is exceeded.
template <typename T, unsigned N>
class StreamCache
{
public:
	// Appends a copy of t. Caller must check full() beforehand.
	void add(const T &t)
	{
		assert(count < N);
		memcpy(&elements[count], &t, sizeof(T));
		count++;
	}

	// True when capacity has been reached.
	bool full() const
	{
		return count == N;
	}

	// Number of elements appended since the last reset().
	unsigned size() const
	{
		return count;
	}

	// Size of the appended elements in bytes (for GPU uploads).
	unsigned byte_size() const
	{
		return count * sizeof(T);
	}

	// Contiguous storage of the appended elements.
	const T *data() const
	{
		return elements;
	}

	// Discards all appended elements.
	void reset()
	{
		count = 0;
	}

	bool empty() const
	{
		return count == 0;
	}

private:
	unsigned count = 0;
	T elements[N];
};
// Hard capacity limits for the renderer's fixed-size buffers and caches.
namespace Limits
{
constexpr unsigned MaxPrimitives = 256;
constexpr unsigned MaxStaticRasterizationStates = 64;
constexpr unsigned MaxDepthBlendStates = 64;
constexpr unsigned MaxTileInfoStates = 256;
constexpr unsigned NumSyncStates = 32;
constexpr unsigned MaxNumTiles = 8;
constexpr unsigned MaxTMEMInstances = 256;
constexpr unsigned MaxSpanSetups = 32 * 1024;
constexpr unsigned MaxWidth = 1024;
constexpr unsigned MaxHeight = 1024;
constexpr unsigned MaxTileInstances = 0x8000;
}
// Tuning constants for the compute implementation (workgroup/tile sizing
// and flush heuristics).
namespace ImplementationConstants
{
constexpr unsigned DefaultWorkgroupSize = 64;
constexpr unsigned TileWidth = 8;
constexpr unsigned TileHeight = 8;
constexpr unsigned MaxTilesX = Limits::MaxWidth / TileWidth;
constexpr unsigned MaxTilesY = Limits::MaxHeight / TileHeight;
constexpr unsigned IncoherentPageSize = 1024;
constexpr unsigned MaxPendingRenderPassesBeforeFlush = 8;
constexpr unsigned MinimumPrimitivesForIdleFlush = 32;
constexpr unsigned MinimumRenderPassesForIdleFlush = 2;
}
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,243 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <memory>
#include <thread>
#include <queue>
#include "device.hpp"
#include "video_interface.hpp"
#include "rdp_renderer.hpp"
#include "rdp_common.hpp"
#include "command_ring.hpp"
#include "worker_thread.hpp"
#include "rdp_dump_write.hpp"
#ifndef GRANITE_VULKAN_MT
#error "Granite Vulkan backend must be built with multithreading support."
#endif
namespace RDP
{
// One scanned-out pixel, 8 bits per channel.
struct RGBA
{
uint8_t r, g, b, a;
};
// Construction-time options for CommandProcessor: host-visible memory
// mapping, upscale factor (2x/4x/8x are mutually exclusive) and
// supersampled readback behavior.
enum CommandProcessorFlagBits
{
COMMAND_PROCESSOR_FLAG_HOST_VISIBLE_HIDDEN_RDRAM_BIT = 1 << 0,
COMMAND_PROCESSOR_FLAG_HOST_VISIBLE_TMEM_BIT = 1 << 1,
COMMAND_PROCESSOR_FLAG_UPSCALING_2X_BIT = 1 << 2,
COMMAND_PROCESSOR_FLAG_UPSCALING_4X_BIT = 1 << 3,
COMMAND_PROCESSOR_FLAG_UPSCALING_8X_BIT = 1 << 4,
COMMAND_PROCESSOR_FLAG_SUPER_SAMPLED_READ_BACK_BIT = 1 << 5,
COMMAND_PROCESSOR_FLAG_SUPER_SAMPLED_DITHER_BIT = 1 << 6
};
// Bitmask of CommandProcessorFlagBits.
using CommandProcessorFlags = uint32_t;
// One masked copy region from a GPU readback buffer into host RDRAM,
// with completion counters for the pages it touches.
struct CoherencyCopy
{
size_t src_offset = 0;
size_t mask_offset = 0;
size_t dst_offset = 0;
size_t size = 0;
std::atomic_uint32_t *counter_base = nullptr;
unsigned counters = 0;
};
// A batch of coherency copies gated on a fence / timeline value,
// processed by the timeline worker thread.
struct CoherencyOperation
{
Vulkan::Fence fence;
uint64_t timeline_value = 0;
uint8_t *dst = nullptr;
const Vulkan::Buffer *src = nullptr;
std::vector<CoherencyCopy> copies;
std::atomic_uint32_t *unlock_cookie = nullptr;
};
// These options control various behavior when upscaling to workaround glitches which arise naturally as part of upscaling.
struct Quirks
{
inline Quirks()
{
u.options.native_resolution_tex_rect = true;
u.options.native_texture_lod = false;
}
inline void set_native_resolution_tex_rect(bool enable)
{
u.options.native_resolution_tex_rect = enable;
}
inline void set_native_texture_lod(bool enable)
{
u.options.native_texture_lod = enable;
}
// Union lets the option bools be read/compared as a raw 32-bit word via
// u.words[0].
union
{
struct Opts
{
// If true, force TEX_RECT and TEX_RECT_FLIP to render without upscaling.
// Works around bilinear filtering bugs in Cycle1/Cycle2 mode where game assumed 1:1 pixel transfer.
bool native_resolution_tex_rect;
// Forces LOD to be computed as 1x upscale.
// Fixes content which relies on LOD computation to select textures in clever ways.
bool native_texture_lod;
} options;
uint32_t words[1];
} u;
};
// Front end of the paraLLEl-RDP implementation: accepts raw RDP command
// words and VI register writes, decodes them into renderer state, and
// manages RDRAM/hidden-RDRAM/TMEM buffers, scanout and CPU/GPU
// synchronization via a timeline.
class CommandProcessor
{
public:
// rdram_ptr + rdram_offset point at guest RDRAM of rdram_size bytes;
// flags selects upscaling/host-visibility options (CommandProcessorFlagBits).
CommandProcessor(Vulkan::Device &device,
void *rdram_ptr,
size_t rdram_offset,
size_t rdram_size,
size_t hidden_rdram_size,
CommandProcessorFlags flags);
~CommandProcessor();
bool device_is_supported() const;
// Synchronization.
void flush();
uint64_t signal_timeline();
void wait_for_timeline(uint64_t index);
void idle();
void begin_frame_context();
// Queues up state and drawing commands.
void enqueue_command(unsigned num_words, const uint32_t *words);
void enqueue_command_direct(unsigned num_words, const uint32_t *words);
void set_quirks(const Quirks &quirks);
// Interact with memory.
void *begin_read_rdram();
void end_write_rdram();
void *begin_read_hidden_rdram();
void end_write_hidden_rdram();
size_t get_rdram_size() const;
size_t get_hidden_rdram_size() const;
void *get_tmem();
// Sets VI register
void set_vi_register(VIRegister reg, uint32_t value);
// Scanout: render the current VI state to an image, a CPU pixel buffer,
// or an asynchronous readback buffer.
Vulkan::ImageHandle scanout(const ScanoutOptions &opts = {});
void scanout_sync(std::vector<RGBA> &colors, unsigned &width, unsigned &height);
void scanout_async_buffer(VIScanoutBuffer &buffer, const ScanoutOptions &opts = {});
private:
Vulkan::Device &device;
Vulkan::BufferHandle rdram;
Vulkan::BufferHandle hidden_rdram;
Vulkan::BufferHandle tmem;
size_t rdram_offset;
size_t rdram_size;
CommandProcessorFlags flags;
#ifndef PARALLEL_RDP_SHADER_DIR
std::unique_ptr<ShaderBank> shader_bank;
#endif
CommandRing ring;
VideoInterface vi;
Renderer renderer;
void clear_hidden_rdram();
void clear_tmem();
void clear_buffer(Vulkan::Buffer &buffer, uint32_t value);
void init_renderer();
void enqueue_command_inner(unsigned num_words, const uint32_t *words);
Vulkan::ImageHandle scanout(const ScanoutOptions &opts, VkImageLayout target_layout);
// One handler per RDP opcode; the macro keeps the declaration list compact.
#define OP(x) void op_##x(const uint32_t *words)
OP(fill_triangle); OP(fill_z_buffer_triangle); OP(texture_triangle); OP(texture_z_buffer_triangle);
OP(shade_triangle); OP(shade_z_buffer_triangle); OP(shade_texture_triangle); OP(shade_texture_z_buffer_triangle);
OP(texture_rectangle); OP(texture_rectangle_flip); OP(sync_load); OP(sync_pipe);
OP(sync_tile); OP(sync_full); OP(set_key_gb); OP(set_key_r);
OP(set_convert); OP(set_scissor); OP(set_prim_depth); OP(set_other_modes);
OP(load_tlut); OP(set_tile_size); OP(load_block);
OP(load_tile); OP(set_tile); OP(fill_rectangle); OP(set_fill_color);
OP(set_fog_color); OP(set_blend_color); OP(set_prim_color); OP(set_env_color);
OP(set_combine); OP(set_texture_image); OP(set_mask_image); OP(set_color_image);
#undef OP
// Shadow copies of the decoded RDP state, forwarded to the renderer.
ScissorState scissor_state = {};
StaticRasterizationState static_state = {};
DepthBlendState depth_blend = {};
struct
{
uint32_t addr;
uint32_t width;
TextureFormat fmt;
TextureSize size;
} texture_image = {};
uint64_t timeline_value = 0;
uint64_t thread_timeline_value = 0;
// Executes fenced coherency work on the timeline worker thread and
// publishes the completed timeline value through `value`.
struct FenceExecutor
{
explicit inline FenceExecutor(Vulkan::Device *device_, uint64_t *ptr)
: device(device_), value(ptr)
{
}
Vulkan::Device *device;
uint64_t *value;
bool is_sentinel(const CoherencyOperation &work) const;
void perform_work(CoherencyOperation &work);
void notify_work_locked(const CoherencyOperation &work);
};
WorkerThread<CoherencyOperation, FenceExecutor> timeline_worker;
uint8_t *host_rdram = nullptr;
bool measure_stall_time = false;
bool single_threaded_processing = false;
bool is_supported = false;
bool is_host_coherent = true;
bool timestamp = false;
friend class Renderer;
void enqueue_coherency_operation(CoherencyOperation &&op);
void drain_command_ring();
void decode_triangle_setup(TriangleSetup &setup, const uint32_t *words) const;
Quirks quirks;
std::unique_ptr<RDPDumpWriter> dump_writer;
bool dump_in_command_list = false;
};
}

View File

@ -0,0 +1,151 @@
/* Copyright (c) 2021 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "rdp_dump_write.hpp"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
namespace RDP
{
// Finalizes the dump via end() (writes the EOF marker, closes and nulls
// `file`). The trailing fclose is defensive only: after end() runs,
// `file` is always nullptr.
RDPDumpWriter::~RDPDumpWriter()
{
end();
if (file)
fclose(file);
}
// Starts a new dump at `path`. Returns false if a dump is already open or
// the file cannot be created. Both shadow caches are reset to zero-filled
// buffers (clear + resize), so the first flush records every non-zero
// 4 KiB block. Header: magic "RDPDUMP2" followed by dram_size and
// hidden_dram_size written in host byte order (raw fwrite).
bool RDPDumpWriter::init(const char *path, uint32_t dram_size, uint32_t hidden_dram_size)
{
if (file)
return false;
rdp_dram_cache.clear();
rdp_dram_cache.resize(dram_size);
rdp_hidden_dram_cache.clear();
rdp_hidden_dram_cache.resize(hidden_dram_size);
file = fopen(path, "wb");
if (!file)
return false;
fwrite("RDPDUMP2", 8, 1, file);
fwrite(&dram_size, sizeof(dram_size), 1, file);
fwrite(&hidden_dram_size, sizeof(hidden_dram_size), 1, file);
return true;
}
// Writes an END_FRAME marker; no-op when no dump is active.
void RDPDumpWriter::end_frame()
{
if (!file)
return;
uint32_t cmd = RDP_DUMP_CMD_END_FRAME;
fwrite(&cmd, sizeof(cmd), 1, file);
}
// Terminates the dump: writes the EOF marker, closes the file, clears
// `file` so further calls become no-ops, and frees the shadow caches.
void RDPDumpWriter::end()
{
if (!file)
return;
uint32_t cmd = RDP_DUMP_CMD_EOF;
fwrite(&cmd, sizeof(cmd), 1, file);
fclose(file);
file = nullptr;
rdp_dram_cache.clear();
rdp_hidden_dram_cache.clear();
}
// Writes every 4 KiB block of dram_ that differs from the shadow cache as
// a block_cmd record (offset, length, payload), updates the cache to
// match, and terminates the sequence with a single flush_cmd record.
// No-op when no dump is active.
void RDPDumpWriter::flush(const void *dram_, uint32_t size,
RDPDumpCmd block_cmd, RDPDumpCmd flush_cmd,
uint8_t *cache)
{
if (!file)
return;
const auto *dram = static_cast<const uint8_t *>(dram_);
const uint32_t block_size = 4 * 1024;
for (uint32_t i = 0; i < size; i += block_size)
{
// Clamp the final block so we never read (or record) past the end of
// buffers whose size is not a multiple of block_size.
uint32_t chunk = (size - i < block_size) ? (size - i) : block_size;
if (memcmp(dram + i, cache + i, chunk) != 0)
{
uint32_t cmd = block_cmd;
fwrite(&cmd, sizeof(cmd), 1, file);
fwrite(&i, sizeof(i), 1, file);
fwrite(&chunk, sizeof(chunk), 1, file);
fwrite(dram + i, 1, chunk, file);
memcpy(cache + i, dram + i, chunk);
}
}
uint32_t cmd = flush_cmd;
fwrite(&cmd, sizeof(cmd), 1, file);
}
// Records changed RDRAM blocks against the RDRAM shadow cache.
void RDPDumpWriter::flush_dram(const void *dram_, uint32_t size)
{
flush(dram_, size, RDP_DUMP_CMD_UPDATE_DRAM, RDP_DUMP_CMD_UPDATE_DRAM_FLUSH, rdp_dram_cache.data());
}
// Records changed hidden-RDRAM blocks against its shadow cache.
void RDPDumpWriter::flush_hidden_dram(const void *dram_, uint32_t size)
{
flush(dram_, size, RDP_DUMP_CMD_UPDATE_HIDDEN_DRAM, RDP_DUMP_CMD_UPDATE_HIDDEN_DRAM_FLUSH, rdp_hidden_dram_cache.data());
}
// Writes a SIGNAL_COMPLETE marker; no-op when no dump is active.
void RDPDumpWriter::signal_complete()
{
if (!file)
return;
uint32_t cmd = RDP_DUMP_CMD_SIGNAL_COMPLETE;
fwrite(&cmd, sizeof(cmd), 1, file);
}
// Records one raw RDP command: RDP_COMMAND marker, opcode, word count,
// then the payload words. No-op when no dump is active.
void RDPDumpWriter::emit_command(uint32_t command, const uint32_t *cmd_data, uint32_t cmd_words)
{
if (!file)
return;
uint32_t cmd = RDP_DUMP_CMD_RDP_COMMAND;
fwrite(&cmd, sizeof(cmd), 1, file);
fwrite(&command, sizeof(command), 1, file);
fwrite(&cmd_words, sizeof(cmd_words), 1, file);
fwrite(cmd_data, sizeof(*cmd_data), cmd_words, file);
}
// Records a VI register write (register index + value).
void RDPDumpWriter::set_vi_register(uint32_t vi_register, uint32_t value)
{
if (!file)
return;
uint32_t cmd = RDP_DUMP_CMD_SET_VI_REGISTER;
fwrite(&cmd, sizeof(cmd), 1, file);
fwrite(&vi_register, sizeof(vi_register), 1, file);
fwrite(&value, sizeof(value), 1, file);
}
}

View File

@ -0,0 +1,65 @@
/* Copyright (c) 2021 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <stdint.h>
#include <stdio.h>
#include <vector>
namespace RDP
{
// Streams an "RDPDUMP2" capture to disk: RDRAM / hidden-RDRAM deltas,
// raw RDP commands and VI register writes, framed by per-frame markers.
// All record writers are no-ops until init() succeeds.
class RDPDumpWriter
{
public:
RDPDumpWriter() = default;
// Owns a raw FILE handle; copying would double-fclose it on destruction.
RDPDumpWriter(const RDPDumpWriter &) = delete;
RDPDumpWriter &operator=(const RDPDumpWriter &) = delete;
// Writes the EOF marker (if a dump is active) and closes the file.
~RDPDumpWriter();
// Opens path, writes the header, and allocates zero-filled shadow caches
// used to detect memory deltas. Returns false if a dump is already open
// or the file cannot be created.
bool init(const char *path, uint32_t dram_size, uint32_t hidden_dram_size);
// Record RDRAM blocks that changed since the previous flush.
void flush_dram(const void *dram, uint32_t size);
// Same, for the hidden RDRAM.
void flush_hidden_dram(const void *dram, uint32_t size);
void signal_complete();
// Record one raw RDP command: opcode plus cmd_words payload words.
void emit_command(uint32_t command, const uint32_t *cmd_data, uint32_t cmd_words);
void set_vi_register(uint32_t vi_register, uint32_t value);
void end_frame();
private:
// On-disk record tags; values are part of the dump format, do not renumber.
enum RDPDumpCmd : uint32_t
{
RDP_DUMP_CMD_INVALID = 0,
RDP_DUMP_CMD_UPDATE_DRAM = 1,
RDP_DUMP_CMD_RDP_COMMAND = 2,
RDP_DUMP_CMD_SET_VI_REGISTER = 3,
RDP_DUMP_CMD_END_FRAME = 4,
RDP_DUMP_CMD_SIGNAL_COMPLETE = 5,
RDP_DUMP_CMD_EOF = 6,
RDP_DUMP_CMD_UPDATE_DRAM_FLUSH = 7,
RDP_DUMP_CMD_UPDATE_HIDDEN_DRAM = 8,
RDP_DUMP_CMD_UPDATE_HIDDEN_DRAM_FLUSH = 9,
RDP_DUMP_CMD_INT_MAX = 0x7fffffff
};
FILE *file = nullptr;
// Shadow copies of guest memory as of the last flush; only differing
// blocks are written to the dump.
std::vector<uint8_t> rdp_dram_cache;
std::vector<uint8_t> rdp_hidden_dram_cache;
void flush(const void *dram_, uint32_t size, RDPDumpCmd block_cmd, RDPDumpCmd flush_cmd, uint8_t *cache);
void end();
};
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,393 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "rdp_data_structures.hpp"
#include "device.hpp"
#include "rdp_common.hpp"
#include "worker_thread.hpp"
#include <unordered_set>
namespace RDP
{
struct CoherencyOperation;
// Wrapper for a fence used to track render-pass completion.
struct SyncObject
{
Vulkan::Fence fence;
};
// Framebuffer pixel format as decoded from SET_COLOR_IMAGE state.
enum class FBFormat : uint32_t
{
I4 = 0,
I8 = 1,
RGBA5551 = 2,
IA88 = 3,
RGBA8888 = 4
};
// Kind of TMEM upload: LOAD_TILE, LOAD_TLUT or LOAD_BLOCK.
enum class UploadMode : uint32_t
{
Tile = 0,
TLUT = 1,
Block = 2
};
// Source image parameters plus S/T bounds for one TMEM upload.
struct LoadTileInfo
{
uint32_t tex_addr;
uint32_t tex_width;
uint16_t slo, tlo, shi, thi;
TextureFormat fmt;
TextureSize size;
UploadMode mode;
};
class CommandProcessor;
// Renderer construction options: integer upscale factor and whether
// readbacks resolve from supersampled storage (optionally dithered).
struct RendererOptions
{
unsigned upscaling_factor = 1;
bool super_sampled_readback = false;
bool super_sampled_readback_dither = false;
};
class Renderer : public Vulkan::DebugChannelInterface
{
public:
explicit Renderer(CommandProcessor &processor);
~Renderer();
void set_device(Vulkan::Device *device);
// If coherent is false, RDRAM is a buffer split into data in lower half and writemask state in upper half, each part being size large.
// offset must be 0 in this case.
void set_rdram(Vulkan::Buffer *buffer, uint8_t *host_rdram, size_t offset, size_t size, bool coherent);
void set_hidden_rdram(Vulkan::Buffer *buffer);
void set_tmem(Vulkan::Buffer *buffer);
void set_shader_bank(const ShaderBank *bank);
bool init_renderer(const RendererOptions &options);
// setup may be mutated to apply various fixups to triangle setup.
void draw_flat_primitive(TriangleSetup &setup);
void draw_shaded_primitive(TriangleSetup &setup, const AttributeSetup &attr);
void set_color_framebuffer(uint32_t addr, uint32_t width, FBFormat fmt);
void set_depth_framebuffer(uint32_t addr);
void set_scissor_state(const ScissorState &state);
void set_static_rasterization_state(const StaticRasterizationState &state);
void set_depth_blend_state(const DepthBlendState &state);
void set_tile(uint32_t tile, const TileMeta &info);
void set_tile_size(uint32_t tile, uint32_t slo, uint32_t shi, uint32_t tlo, uint32_t thi);
void load_tile(uint32_t tile, const LoadTileInfo &info);
void load_tile_iteration(uint32_t tile, const LoadTileInfo &info, uint32_t tmem_offset);
void set_blend_color(uint32_t color);
void set_fog_color(uint32_t color);
void set_env_color(uint32_t color);
void set_primitive_color(uint8_t min_level, uint8_t prim_lod_frac, uint32_t color);
void set_fill_color(uint32_t color);
void set_primitive_depth(uint16_t prim_depth, uint16_t prim_dz);
void set_enable_primitive_depth(bool enable);
void set_convert(uint16_t k0, uint16_t k1, uint16_t k2, uint16_t k3, uint16_t k4, uint16_t k5);
void set_color_key(unsigned component, uint32_t width, uint32_t center, uint32_t scale);
// Called when the command thread has not seen any activity in a given period of time.
// This is useful so we don't needlessly queue up work when we might as well kick it to the GPU.
void notify_idle_command_thread();
void flush_and_signal();
int resolve_shader_define(const char *name, const char *define) const;
void resolve_coherency_external(unsigned offset, unsigned length);
void submit_update_upscaled_domain_external(Vulkan::CommandBuffer &cmd,
unsigned addr, unsigned pixels, unsigned pixel_size_log2);
unsigned get_scaling_factor() const;
const Vulkan::Buffer *get_upscaled_rdram_buffer() const;
const Vulkan::Buffer *get_upscaled_hidden_rdram_buffer() const;
void lock_command_processing();
void unlock_command_processing();
private:
CommandProcessor &processor;
Vulkan::Device *device = nullptr;
Vulkan::Buffer *rdram = nullptr;
Vulkan::BufferHandle upscaling_reference_rdram;
Vulkan::BufferHandle upscaling_multisampled_rdram;
Vulkan::BufferHandle upscaling_multisampled_hidden_rdram;
struct
{
uint8_t *host_rdram = nullptr;
Vulkan::BufferHandle staging_rdram;
Vulkan::BufferHandle staging_readback;
std::unique_ptr<std::atomic_uint32_t[]> pending_writes_for_page;
std::vector<uint32_t> page_to_direct_copy;
std::vector<uint32_t> page_to_masked_copy;
std::vector<uint32_t> page_to_pending_readback;
unsigned num_pages = 0;
unsigned staging_readback_pages = 0;
unsigned staging_readback_index = 0; // Ringbuffer the readbacks.
} incoherent;
size_t rdram_offset = 0;
size_t rdram_size = 0;
bool is_host_coherent = false;
Vulkan::Buffer *hidden_rdram = nullptr;
Vulkan::Buffer *tmem = nullptr;
const ShaderBank *shader_bank = nullptr;
bool init_caps();
void init_blender_lut();
void init_buffers(const RendererOptions &options);
bool init_internal_upscaling_factor(const RendererOptions &options);
struct
{
uint32_t addr = 0;
uint32_t depth_addr = 0;
uint32_t width = 0;
uint32_t deduced_height = 0;
FBFormat fmt = FBFormat::I8;
bool depth_write_pending = false;
bool color_write_pending = false;
} fb;
struct StreamCaches
{
ScissorState scissor_state = {};
StaticRasterizationState static_raster_state = {};
DepthBlendState depth_blend_state = {};
StateCache<StaticRasterizationState, Limits::MaxStaticRasterizationStates> static_raster_state_cache;
StateCache<DepthBlendState, Limits::MaxDepthBlendStates> depth_blend_state_cache;
StateCache<TileInfo, Limits::MaxTileInfoStates> tile_info_state_cache;
StreamCache<TriangleSetup, Limits::MaxPrimitives> triangle_setup;
StreamCache<ScissorState, Limits::MaxPrimitives> scissor_setup;
StreamCache<AttributeSetup, Limits::MaxPrimitives> attribute_setup;
StreamCache<DerivedSetup, Limits::MaxPrimitives> derived_setup;
StreamCache<InstanceIndices, Limits::MaxPrimitives> state_indices;
StreamCache<SpanInfoOffsets, Limits::MaxPrimitives> span_info_offsets;
StreamCache<SpanInterpolationJob, Limits::MaxSpanSetups> span_info_jobs;
std::vector<UploadInfo> tmem_upload_infos;
unsigned max_shaded_tiles = 0;
Vulkan::CommandBufferHandle cmd;
} stream;
void ensure_command_buffer();
TileInfo tiles[Limits::MaxNumTiles];
Vulkan::BufferHandle tmem_instances;
Vulkan::BufferHandle span_setups;
Vulkan::BufferHandle blender_divider_lut_buffer;
Vulkan::BufferViewHandle blender_divider_buffer;
Vulkan::BufferHandle tile_binning_buffer;
Vulkan::BufferHandle tile_binning_buffer_coarse;
Vulkan::BufferHandle indirect_dispatch_buffer;
Vulkan::BufferHandle tile_work_list;
Vulkan::BufferHandle per_tile_offsets;
Vulkan::BufferHandle per_tile_shaded_color;
Vulkan::BufferHandle per_tile_shaded_depth;
Vulkan::BufferHandle per_tile_shaded_shaded_alpha;
Vulkan::BufferHandle per_tile_shaded_coverage;
struct MappedBuffer
{
Vulkan::BufferHandle buffer;
bool is_host = false;
};
// The full set of per-frame GPU buffers mirroring the StreamCaches streams.
// Two instances exist per updater: a CPU (host-visible staging) copy and a
// GPU (device-local) copy; see RenderBuffersUpdater.
struct RenderBuffers
{
// Allocates all member buffers in the given memory domain. When `borrow`
// is non-null, buffers are shared with the other instance instead of
// allocated separately (presumably for UMA devices — confirm in .cpp).
void init(Vulkan::Device &device, Vulkan::BufferDomain domain, RenderBuffers *borrow);
static MappedBuffer create_buffer(Vulkan::Device &device, Vulkan::BufferDomain domain, VkDeviceSize size, MappedBuffer *borrow);
// One buffer per stream/state cache in StreamCaches.
MappedBuffer triangle_setup;
MappedBuffer attribute_setup;
MappedBuffer derived_setup;
MappedBuffer scissor_setup;
MappedBuffer static_raster_state;
MappedBuffer depth_blend_state;
MappedBuffer tile_info_state;
MappedBuffer state_indices;
MappedBuffer span_info_offsets;
MappedBuffer span_info_jobs;
// Texel-buffer view over span_info_jobs for shader access.
Vulkan::BufferViewHandle span_info_jobs_view;
};
// Pairs a host-visible staging RenderBuffers with a device-local one and
// records the copies needed to move stream data from CPU to GPU.
struct RenderBuffersUpdater
{
void init(Vulkan::Device &device);
// Uploads all dirty stream caches for this frame.
void upload(Vulkan::Device &device, const StreamCaches &caches, Vulkan::CommandBuffer &cmd);
// Uploads a single cache; sets did_upload when a copy was actually recorded.
template <typename Cache>
void upload(Vulkan::CommandBuffer &cmd, Vulkan::Device &device,
const MappedBuffer &gpu, const MappedBuffer &cpu, const Cache &cache, bool &did_upload);
RenderBuffers cpu, gpu;
};
// Per-sync-state fence used to know when a previous submission's buffers
// can be reused (indexed by Limits::NumSyncStates below).
struct InternalSynchronization
{
Vulkan::Fence fence;
};
// Latched RDP global state registers (set by the corresponding RDP
// commands; consumed when building DerivedSetup — see build_combiner_constants).
struct Constants
{
uint32_t blend_color = 0;
uint32_t fog_color = 0;
uint32_t env_color = 0;
uint32_t primitive_color = 0;
uint32_t fill_color = 0;
uint8_t min_level = 0;
uint8_t prim_lod_frac = 0;
// Primitive depth registers; used instead of interpolated Z when
// use_prim_depth is set.
int32_t prim_depth = 0;
uint16_t prim_dz = 0;
// Color-convert (YUV) coefficients K0-K5.
uint16_t convert[6] = {};
// Chroma-key width/center/scale per RGB channel.
uint16_t key_width[3] = {};
uint8_t key_center[3] = {};
uint8_t key_scale[3] = {};
bool use_prim_depth = false;
} constants;
RenderBuffersUpdater buffer_instances[Limits::NumSyncStates];
InternalSynchronization internal_sync[Limits::NumSyncStates];
uint32_t sync_indices_needs_flush = 0;
unsigned buffer_instance = 0;
uint32_t base_primitive_index = 0;
unsigned pending_render_passes = 0;
unsigned pending_render_passes_upscaled = 0;
unsigned pending_primitives = 0;
unsigned pending_primitives_upscaled = 0;
bool tmem_upload_needs_flush(uint32_t addr) const;
bool render_pass_is_upscaled() const;
bool should_render_upscaled() const;
void flush_queues();
void submit_render_pass(Vulkan::CommandBuffer &cmd);
void submit_render_pass_upscaled(Vulkan::CommandBuffer &cmd);
void submit_render_pass_end(Vulkan::CommandBuffer &cmd);
void submit_to_queue();
void begin_new_context();
void reset_context();
bool need_flush() const;
void maintain_queues();
void maintain_queues_idle();
void update_tmem_instances(Vulkan::CommandBuffer &cmd);
void submit_span_setup_jobs(Vulkan::CommandBuffer &cmd, bool upscaled);
void update_deduced_height(const TriangleSetup &setup);
void submit_tile_binning_combined(Vulkan::CommandBuffer &cmd, bool upscaled);
void clear_indirect_buffer(Vulkan::CommandBuffer &cmd);
void submit_rasterization(Vulkan::CommandBuffer &cmd, Vulkan::Buffer &tmem, bool upscaled);
void submit_depth_blend(Vulkan::CommandBuffer &cmd, Vulkan::Buffer &tmem, bool upscaled, bool force_write_mask);
enum class ResolveStage { Pre, Post, SSAAResolve };
void submit_update_upscaled_domain(Vulkan::CommandBuffer &cmd, ResolveStage stage);
void submit_update_upscaled_domain(Vulkan::CommandBuffer &cmd, ResolveStage stage,
unsigned addr, unsigned depth_addr,
unsigned width, unsigned height,
unsigned pixel_size_log2);
void submit_clear_super_sample_write_mask(Vulkan::CommandBuffer &cmd, unsigned width, unsigned height);
SpanInfoOffsets allocate_span_jobs(const TriangleSetup &setup);
DerivedSetup build_derived_attributes(const AttributeSetup &attr) const;
void build_combiner_constants(DerivedSetup &setup, unsigned cycle) const;
int filter_debug_channel_x = -1;
int filter_debug_channel_y = -1;
bool debug_channel = false;
void message(const std::string &tag, uint32_t code,
uint32_t x, uint32_t y, uint32_t z,
uint32_t num_words, const Vulkan::DebugChannelInterface::Word *words) override;
bool can_support_minimum_subgroup_size(unsigned size) const;
bool supports_subgroup_size_control(uint32_t minimum_size, uint32_t maximum_size) const;
std::unordered_set<Util::Hash> pending_async_pipelines;
unsigned compute_conservative_max_num_tiles(const TriangleSetup &setup) const;
void deduce_static_texture_state(unsigned tile, unsigned max_lod_level);
void deduce_noise_state();
static StaticRasterizationState normalize_static_state(StaticRasterizationState state);
void fixup_triangle_setup(TriangleSetup &setup) const;
// Device/runtime capabilities and configured limits, resolved once at init
// and consulted throughout rendering.
struct Caps
{
int timestamp = 0;
// Force synchronous operation (no async render-pass overlap).
bool force_sync = false;
// Use the single "ubershader" path instead of specialized pipelines.
bool ubershader = false;
// 8/16-bit integer arithmetic available in shaders.
bool supports_small_integer_arithmetic = false;
// Subgroup-accelerated variants of the binning / depth-blend passes.
bool subgroup_tile_binning = false;
bool subgroup_depth_blend = false;
// Super-sampled (SSAA) readback support; dither variant applies a
// dithered resolve.
bool super_sample_readback = false;
bool super_sample_readback_dither = false;
// Upscale factor (1 = native resolution).
unsigned upscaling = 1;
unsigned max_num_tile_instances = Limits::MaxTileInstances;
unsigned max_tiles_x = ImplementationConstants::MaxTilesX;
unsigned max_tiles_y = ImplementationConstants::MaxTilesY;
unsigned max_width = Limits::MaxWidth;
unsigned max_height = Limits::MaxHeight;
} caps;
// Work-item handler for the async pipeline-compilation worker thread
// (see pipeline_worker below): compiles deferred pipelines off the
// critical path.
struct PipelineExecutor
{
Vulkan::Device *device;
// Sentinel item used to wake/terminate the worker loop.
bool is_sentinel(const Vulkan::DeferredPipelineCompile &compile) const;
void perform_work(const Vulkan::DeferredPipelineCompile &compile) const;
void notify_work_locked(const Vulkan::DeferredPipelineCompile &compile) const;
};
std::unique_ptr<WorkerThread<Vulkan::DeferredPipelineCompile, PipelineExecutor>> pipeline_worker;
void resolve_coherency_host_to_gpu(Vulkan::CommandBuffer &cmd);
void resolve_coherency_gpu_to_host(CoherencyOperation &op, Vulkan::CommandBuffer &cmd);
uint32_t get_byte_size_for_bound_color_framebuffer() const;
uint32_t get_byte_size_for_bound_depth_framebuffer() const;
void mark_pages_for_gpu_read(uint32_t base_addr, uint32_t byte_count);
void lock_pages_for_gpu_write(uint32_t base_addr, uint32_t byte_count);
std::atomic_uint32_t active_submissions;
void enqueue_fence_wait(Vulkan::Fence fence);
uint64_t last_submit_ns = 0;
std::mutex idle_lock;
};
}

View File

@ -0,0 +1,130 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef BINNING_H_
#define BINNING_H_
// There are 4 critical Y coordinates to test when binning. Top, bottom, mid, and mid - 1.
const int SUBPIXELS_Y = 4;
// Drops the 15 fractional bits of a fixed-point X coordinate, yielding an
// integer pixel coordinate (arithmetic shift preserves sign).
ivec4 quantize_x(ivec4 x)
{
return x >> 15;
}
// Horizontal minimum: smallest of the four components of v.
int minimum4(ivec4 v)
{
return min(min(v.x, v.y), min(v.z, v.w));
}
// Horizontal maximum: largest of the four components of v.
int maximum4(ivec4 v)
{
return max(max(v.x, v.y), max(v.z, v.w));
}
// Computes a * b + c per component with a 64-bit intermediate, returning the
// low 32 bits and writing the high 32 bits to hi_bits so callers can detect
// overflow (see interpolate_xs).
ivec4 madd_32_64(ivec4 a, int b, int c, out ivec4 hi_bits)
{
ivec4 lo, hi;
// 32x32 -> 64-bit signed multiply split into hi/lo words.
imulExtended(a, ivec4(b), hi, lo);
uvec4 carry;
// Add c to the low word; propagate the carry into the high word.
lo = ivec4(uaddCarry(lo, uvec4(c), carry));
hi += ivec4(carry);
hi_bits = hi;
return lo;
}
// Interpolates the triangle's left/right edge X coordinates at the four
// sampled Y values (ys) and returns a conservative [min, max] X range for
// binning. `flip` selects which edge is left vs. right; `scaling` is the
// upscale factor.
ivec2 interpolate_xs(TriangleSetup setup, ivec4 ys, bool flip, int scaling)
{
// XH interpolation starts at YH rounded down to a whole scanline.
int yh_interpolation_base = setup.yh & ~(SUBPIXELS_Y - 1);
int ym_interpolation_base = setup.ym;
yh_interpolation_base *= scaling;
ym_interpolation_base *= scaling;
// Interpolate in 64-bit so we can detect quirky overflow scenarios.
ivec4 xh_hi, xm_hi, xl_hi;
ivec4 xh = madd_32_64(ys - yh_interpolation_base, setup.dxhdy, scaling * setup.xh, xh_hi);
ivec4 xm = madd_32_64(ys - yh_interpolation_base, setup.dxmdy, scaling * setup.xm, xm_hi);
ivec4 xl = madd_32_64(ys - ym_interpolation_base, setup.dxldy, scaling * setup.xl, xl_hi);
// Above YM the XM edge is active instead of XL; select per sample.
xl = mix(xl, xm, lessThan(ys, ivec4(scaling * setup.ym)));
xl_hi = mix(xl_hi, xm_hi, lessThan(ys, ivec4(scaling * setup.ym)));
// Handle overflow scenarios. Saturate 64-bit signed to 32-bit signed without 64-bit math.
xh = mix(xh, ivec4(0x7fffffff), greaterThan(xh_hi, ivec4(0)));
xh = mix(xh, ivec4(-0x80000000), lessThan(xh_hi, ivec4(-1)));
xl = mix(xl, ivec4(0x7fffffff), greaterThan(xl_hi, ivec4(0)));
xl = mix(xl, ivec4(-0x80000000), lessThan(xl_hi, ivec4(-1)));
ivec4 xh_shifted = quantize_x(xh);
ivec4 xl_shifted = quantize_x(xl);
ivec4 xleft, xright;
if (flip)
{
xleft = xh_shifted;
xright = xl_shifted;
}
else
{
xleft = xl_shifted;
xright = xh_shifted;
}
// If one of the results are out of range, we have overflow, and we need to be conservative when binning.
int max_range = maximum4(max(abs(xleft), abs(xright)));
ivec2 range;
if (max_range <= 2047 * scaling)
range = ivec2(minimum4(xleft), maximum4(xright));
else
range = ivec2(0, 0x7fffffff);
return range;
}
// Returns true if the triangle potentially covers any pixel in the tile
// spanning [lo, hi] (inclusive, in pixels), at the given upscale factor.
bool bin_primitive(TriangleSetup setup, ivec2 lo, ivec2 hi, int scaling)
{
// Convert the tile's pixel rows to subpixel Y coordinates.
int start_y = lo.y * SUBPIXELS_Y;
int end_y = (hi.y * SUBPIXELS_Y) + (SUBPIXELS_Y - 1);
// First, we clip start/end against y_lo, y_hi.
start_y = max(start_y, scaling * int(setup.yh));
end_y = min(end_y, scaling * int(setup.yl) - 1);
// Y is clipped out, exit early.
if (end_y < start_y)
return false;
bool flip = (setup.flags & TRIANGLE_SETUP_FLIP_BIT) != 0;
// Sample the X ranges for min and max Y, and potentially the mid-point as well.
ivec4 ys = ivec4(start_y, end_y, clamp(setup.ym * scaling + ivec2(-1, 0), ivec2(start_y), ivec2(end_y)));
ivec2 x_range = interpolate_xs(setup, ys, flip, scaling);
// Intersect with the tile's X extent.
x_range.x = max(x_range.x, lo.x);
x_range.y = min(x_range.y, hi.x);
return x_range.x <= x_range.y;
}
#endif

View File

@ -0,0 +1,145 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef BLENDER_H_
#define BLENDER_H_
// Inputs to the blender stage: the combiner output (pixel_color), the
// framebuffer color read back from memory, and the latched fog/blend
// color registers plus shade alpha.
struct BlendInputs
{
u8x4 pixel_color;
u8x4 memory_color;
u8x4 fog_color;
u8x4 blend_color;
u8 shade_alpha;
};
// Blender mux selector encodings (matching the RDP blend-mode bitfields).
// 1A: color source for the first operand.
const int BLEND_MODE_1A_PIXEL_COLOR = 0;
const int BLEND_MODE_1A_MEMORY_COLOR = 1;
const int BLEND_MODE_1A_BLEND_COLOR = 2;
const int BLEND_MODE_1A_FOG_COLOR = 3;
// 1B: alpha weight applied to operand 1A.
const int BLEND_MODE_1B_PIXEL_ALPHA = 0;
const int BLEND_MODE_1B_FOG_ALPHA = 1;
const int BLEND_MODE_1B_SHADE_ALPHA = 2;
const int BLEND_MODE_1B_ZERO = 3;
// 2A: color source for the second operand.
const int BLEND_MODE_2A_PIXEL_COLOR = 0;
const int BLEND_MODE_2A_MEMORY_COLOR = 1;
const int BLEND_MODE_2A_BLEND_COLOR = 2;
const int BLEND_MODE_2A_FOG_COLOR = 3;
// 2B: alpha weight applied to operand 2A.
const int BLEND_MODE_2B_INV_PIXEL_ALPHA = 0;
const int BLEND_MODE_2B_MEMORY_ALPHA = 1;
const int BLEND_MODE_2B_ONE = 2;
const int BLEND_MODE_2B_ZERO = 3;
// RDP blender: computes (rgb0 * a0 + rgb1 * a1) with the sources selected
// by blend_modes (x/y/z/w = 1A/1B/2A/2B muxes), replicating the hardware's
// shift-down and divider-LUT behavior. Returns the blended 8-bit RGB.
u8x3 blender(BlendInputs inputs, u8x4 blend_modes,
bool force_blend, bool blend_en, bool color_on_coverage, bool coverage_wrap, u8x2 blend_shift,
bool final_cycle)
{
// Select second operand color (2A mux).
u8x3 rgb1;
switch (int(blend_modes.z))
{
case BLEND_MODE_2A_PIXEL_COLOR: rgb1 = inputs.pixel_color.rgb; break;
case BLEND_MODE_2A_MEMORY_COLOR: rgb1 = inputs.memory_color.rgb; break;
case BLEND_MODE_2A_BLEND_COLOR: rgb1 = inputs.blend_color.rgb; break;
case BLEND_MODE_2A_FOG_COLOR: rgb1 = inputs.fog_color.rgb; break;
}
// Color-on-coverage: on edge pixels without coverage wrap, pass through
// the second operand unblended.
if (final_cycle)
{
if (color_on_coverage && !coverage_wrap)
return rgb1;
}
// Select first operand color (1A mux).
u8x3 rgb0;
switch (int(blend_modes.x))
{
case BLEND_MODE_1A_PIXEL_COLOR: rgb0 = inputs.pixel_color.rgb; break;
case BLEND_MODE_1A_MEMORY_COLOR: rgb0 = inputs.memory_color.rgb; break;
case BLEND_MODE_1A_BLEND_COLOR: rgb0 = inputs.blend_color.rgb; break;
case BLEND_MODE_1A_FOG_COLOR: rgb0 = inputs.fog_color.rgb; break;
}
// Skip blending entirely when disabled, or when the mode reduces to
// (pixel * a + pixel * (1 - a)) with a == 1.0, which is identity.
if (final_cycle)
{
if (!blend_en || (blend_modes.y == BLEND_MODE_1B_PIXEL_ALPHA &&
blend_modes.w == BLEND_MODE_2B_INV_PIXEL_ALPHA &&
inputs.pixel_color.a == U8_C(0xff)))
{
return rgb0;
}
}
// Select alpha weights (1B / 2B muxes).
u8 a0;
u8 a1;
switch (int(blend_modes.y))
{
case BLEND_MODE_1B_PIXEL_ALPHA: a0 = inputs.pixel_color.a; break;
case BLEND_MODE_1B_FOG_ALPHA: a0 = inputs.fog_color.a; break;
case BLEND_MODE_1B_SHADE_ALPHA: a0 = inputs.shade_alpha; break;
case BLEND_MODE_1B_ZERO: a0 = U8_C(0); break;
}
switch (int(blend_modes.w))
{
case BLEND_MODE_2B_INV_PIXEL_ALPHA: a1 = ~a0 & U8_C(0xff); break;
case BLEND_MODE_2B_MEMORY_ALPHA: a1 = inputs.memory_color.a; break;
case BLEND_MODE_2B_ONE: a1 = U8_C(0xff); break;
case BLEND_MODE_2B_ZERO: a1 = U8_C(0); break;
}
// Weights are truncated to 5 bits, as on hardware.
a0 >>= U8_C(3);
a1 >>= U8_C(3);
// Memory-alpha mode applies the per-pixel blend shifters (interpenetrating
// Z) and masking quirks.
if (blend_modes.w == BLEND_MODE_2B_MEMORY_ALPHA)
{
a0 = (a0 >> blend_shift.x) & U8_C(0x3c);
a1 = (a1 >> blend_shift.y) | U8_C(3);
}
i16x3 blended = i16x3(rgb0) * i16(a0) + i16x3(rgb1) * (i16(a1) + I16_C(1));
if (!final_cycle || force_blend)
{
// Forced blend: plain fixed shift instead of the divider.
rgb0 = u8x3(blended >> I16_C(5));
}
else
{
// Serious funk here. Somehow the RDP implemented a divider to deal with weighted average.
// Typically relevant when using blender shifters from interpenetrating Z mode.
// Under normal condition, this is implemented as a straight integer divider, but
// for edge cases, we need a look-up table. The results make no sense.
int blend_sum = (int(a0) >> 2) + (int(a1) >> 2) + 1;
blended >>= I16_C(2);
blended &= I16_C(0x7ff);
rgb0.r = u8(texelFetch(uBlenderDividerLUT, (blend_sum << 11) | blended.x).x);
rgb0.g = u8(texelFetch(uBlenderDividerLUT, (blend_sum << 11) | blended.y).x);
rgb0.b = u8(texelFetch(uBlenderDividerLUT, (blend_sum << 11) | blended.z).x);
}
return rgb0 & U8_C(0xff);
}
#endif

View File

@ -0,0 +1,78 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef CLAMPING_H_
#define CLAMPING_H_
// Clamps a 9-bit signed component to [0, 0xff] using RDP's quirky wrap
// semantics: values in [-256, -129] wrap around to 0xff, [-128, -1] clamp
// to 0. Two implementations — 16-bit and 32-bit — selected at compile time.
#if SMALL_TYPES && 0
// This path is buggy on RADV LLVM, disable for time being.
i16x4 clamp_9bit_notrunc(i16x4 color)
{
// [-129, -256] should clamp to 0xff, subtracting by 0x80 will underflow back to positive numbers.
// [-128, -1] should clamp to 0.
color -= I16_C(0x80);
// Sign-extend to 9-bit.
color <<= I16_C(7);
color >>= I16_C(7);
color += I16_C(0x80);
return clamp(color, i16x4(0), i16x4(0xff));
}
#else
i16x4 clamp_9bit_notrunc(ivec4 color)
{
// [-129, -256] should clamp to 0xff, subtracting by 0x80 will underflow back to positive numbers.
// [-128, -1] should clamp to 0.
color -= 0x80;
// Sign-extend to 9-bit.
color = bitfieldExtract(color, 0, 9);
color += 0x80;
return i16x4(clamp(color, ivec4(0), ivec4(0xff)));
}
#endif
// Vector clamp_9bit, truncated to 8-bit unsigned components.
u8x4 clamp_9bit(i16x4 color)
{
return u8x4(clamp_9bit_notrunc(color));
}
// Scalar variant of clamp_9bit: same 9-bit sign-extend-and-clamp trick
// applied to a single component.
int clamp_9bit(int color)
{
return clamp(bitfieldExtract(color - 0x80, 0, 9) + 0x80, 0, 0xff);
}
// Returns 18-bit UNORM depth, with wrap semantics analogous to clamp_9bit.
int clamp_z(int z)
{
// Similar to RGBA, we reserve an extra bit to deal with overflow and underflow.
z -= (1 << 17);
// Sign-extend from 19 bits via shift up/down.
z <<= (31 - 18);
z >>= (31 - 18);
z += (1 << 17);
// [0x00000, 0x3ffff] maps to self.
// [0x40000, 0x5ffff] maps to 0x3ffff.
// [0x60000, 0x7ffff] maps to 0.
return clamp(z, 0, 0x3ffff);
}
#endif

View File

@ -0,0 +1,33 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
layout(local_size_x_id = 0) in;
layout(set = 0, binding = 0, std430) writeonly buffer ClearIndirectBuffer
{
uvec4 indirects[];
};
// Resets each indirect-dispatch record to (0, 1, 1) work groups
// (x count zero, y/z one) with a trailing zero word.
void main()
{
indirects[gl_GlobalInvocationID.x] = uvec4(0, 1, 1, 0);
}

View File

@ -0,0 +1,34 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
layout(local_size_x_id = 0) in;
layout(set = 0, binding = 0, std430) writeonly buffer ToClear
{
uint elems[];
} mask_ram;
// Zeroes one word of the write-mask RAM per invocation.
void main()
{
mask_ram.elems[gl_GlobalInvocationID.x] = 0u;
}

View File

@ -0,0 +1,42 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
layout(local_size_x_id = 0) in;
layout(constant_id = 1) const int PAGE_STRIDE = 256;
layout(set = 0, binding = 0, std430) writeonly buffer SSBO
{
uint write_mask[];
};
layout(set = 1, binding = 0, std140) uniform UBO
{
uvec4 offsets[1024];
};
// Clears the super-sample write mask for one page per work group: the UBO
// holds page indices packed four-per-uvec4; each invocation zeroes one
// word within the selected page.
void main()
{
// Unpack the page index for this work group (>> 2 selects the uvec4,
// & 3 selects the component).
uint offset = offsets[gl_WorkGroupID.x >> 2u][gl_WorkGroupID.x & 3u];
offset *= PAGE_STRIDE;
write_mask[offset + gl_LocalInvocationIndex] = 0u;
}

View File

@ -0,0 +1,284 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef COMBINER_H_
#define COMBINER_H_
#include "clamping.h"
// Sign-extends the 9-bit combiner representation around the 0x80 bias
// without clamping (the "special" 9-bit sign behavior of the RDP).
ivec4 special_expand(ivec4 value)
{
// Special sign-extend without explicit clamp.
return bitfieldExtract(value - 0x80, 0, 9) + 0x80;
}
// Core combiner equation: (a - b) * c + d, with 9-bit sign handling and
// rounding, producing a 16-bit result per component.
i16x4 combiner_equation(ivec4 a, ivec4 b, ivec4 c, ivec4 d)
{
// Sign-extend multiplier to 9 bits.
c = bitfieldExtract(c, 0, 9);
// Need this to deal with very specific 9-bit sign bits ...
a = special_expand(a);
b = special_expand(b);
d = special_expand(d);
ivec4 color = (a - b) * c;
// Round before dropping the 8 fractional bits of the multiply.
color += 0x80;
return i16x4(color >> 8) + i16x4(d);
}
// All combiner input sources for one pixel: the four per-cycle constants
// (muladd/mulsub/mul/add slots), the interpolated shade color, the previous
// cycle's combined output, both texels, LOD fraction and noise.
struct CombinerInputs
{
u8x4 constant_muladd;
u8x4 constant_mulsub;
u8x4 constant_mul;
u8x4 constant_add;
u8x4 shade;
i16x4 combined;
i16x4 texel0;
i16x4 texel1;
i16 lod_frac;
i16 noise;
};
// Combiner mux selector encodings, one group per equation slot
// ((a - b) * c + d for RGB, and the alpha counterparts).
// RGB "a" (muladd) slot sources.
const int RGB_MULADD_COMBINED = 0;
const int RGB_MULADD_TEXEL0 = 1;
const int RGB_MULADD_TEXEL1 = 2;
const int RGB_MULADD_SHADE = 4;
const int RGB_MULADD_ONE = 6;
const int RGB_MULADD_NOISE = 7;
// RGB "b" (mulsub) slot sources.
const int RGB_MULSUB_COMBINED = 0;
const int RGB_MULSUB_TEXEL0 = 1;
const int RGB_MULSUB_TEXEL1 = 2;
const int RGB_MULSUB_SHADE = 4;
const int RGB_MULSUB_K4 = 7;
// RGB "c" (multiplier) slot sources.
const int RGB_MUL_COMBINED = 0;
const int RGB_MUL_TEXEL0 = 1;
const int RGB_MUL_TEXEL1 = 2;
const int RGB_MUL_SHADE = 4;
const int RGB_MUL_COMBINED_ALPHA = 7;
const int RGB_MUL_TEXEL0_ALPHA = 8;
const int RGB_MUL_TEXEL1_ALPHA = 9;
const int RGB_MUL_SHADE_ALPHA = 11;
const int RGB_MUL_LOD_FRAC = 13;
const int RGB_MUL_K5 = 15;
// RGB "d" (add) slot sources.
const int RGB_ADD_COMBINED = 0;
const int RGB_ADD_TEXEL0 = 1;
const int RGB_ADD_TEXEL1 = 2;
const int RGB_ADD_SHADE = 4;
const int RGB_ADD_ONE = 6;
// Alpha "a"/"b"/"d" slot sources (shared encoding).
const int ALPHA_ADDSUB_COMBINED = 0;
const int ALPHA_ADDSUB_TEXEL0_ALPHA = 1;
const int ALPHA_ADDSUB_TEXEL1_ALPHA = 2;
const int ALPHA_ADDSUB_SHADE_ALPHA = 4;
const int ALPHA_ADDSUB_ONE = 6;
// Alpha "c" (multiplier) slot sources.
const int ALPHA_MUL_LOD_FRAC = 0;
const int ALPHA_MUL_TEXEL0_ALPHA = 1;
const int ALPHA_MUL_TEXEL1_ALPHA = 2;
const int ALPHA_MUL_SHADE_ALPHA = 4;
// Selects the "a" operand of the combiner equation from the input muxes.
// 0x100 encodes 1.0 so no division by 255 is needed later.
ivec4 select_muladd(CombinerInputs inputs, int selector_rgb, int selector_alpha)
{
ivec3 res;
switch (selector_rgb)
{
case RGB_MULADD_COMBINED: res = inputs.combined.rgb; break;
case RGB_MULADD_TEXEL0: res = inputs.texel0.rgb; break;
case RGB_MULADD_TEXEL1: res = inputs.texel1.rgb; break;
case RGB_MULADD_SHADE: res = inputs.shade.rgb; break;
case RGB_MULADD_NOISE: res = ivec3(inputs.noise); break;
case RGB_MULADD_ONE: res = ivec3(0x100); break;
// Remaining encodings select the latched constant (env/prim color).
default: res = inputs.constant_muladd.rgb; break;
}
int alpha;
switch (selector_alpha)
{
case ALPHA_ADDSUB_COMBINED: alpha = inputs.combined.a; break;
case ALPHA_ADDSUB_TEXEL0_ALPHA: alpha = inputs.texel0.a; break;
case ALPHA_ADDSUB_TEXEL1_ALPHA: alpha = inputs.texel1.a; break;
case ALPHA_ADDSUB_SHADE_ALPHA: alpha = inputs.shade.a; break;
case ALPHA_ADDSUB_ONE: alpha = 0x100; break;
default: alpha = inputs.constant_muladd.a; break;
}
return ivec4(res, alpha);
}
// Selects the "b" (subtract) operand of the combiner equation.
// K4 is a 9-bit YUV-convert coefficient packed into the constant's g/b bytes.
ivec4 select_mulsub(CombinerInputs inputs, int selector_rgb, int selector_alpha)
{
ivec3 res;
switch (selector_rgb)
{
case RGB_MULSUB_COMBINED: res = inputs.combined.rgb; break;
case RGB_MULSUB_TEXEL0: res = inputs.texel0.rgb; break;
case RGB_MULSUB_TEXEL1: res = inputs.texel1.rgb; break;
case RGB_MULSUB_SHADE: res = inputs.shade.rgb; break;
case RGB_MULSUB_K4: res = ivec3((int(inputs.constant_mulsub.g) << 8) | inputs.constant_mulsub.b); break;
default: res = inputs.constant_mulsub.rgb; break;
}
int alpha;
switch (selector_alpha)
{
case ALPHA_ADDSUB_COMBINED: alpha = inputs.combined.a; break;
case ALPHA_ADDSUB_TEXEL0_ALPHA: alpha = inputs.texel0.a; break;
case ALPHA_ADDSUB_TEXEL1_ALPHA: alpha = inputs.texel1.a; break;
case ALPHA_ADDSUB_SHADE_ALPHA: alpha = inputs.shade.a; break;
case ALPHA_ADDSUB_ONE: alpha = 0x100; break;
default: alpha = inputs.constant_mulsub.a; break;
}
return ivec4(res, alpha);
}
// Selects the "c" (multiplier) operand of the combiner equation.
// K5, like K4, is a 9-bit coefficient packed into the constant's g/b bytes.
ivec4 select_mul(CombinerInputs inputs, int selector_rgb, int selector_alpha)
{
ivec3 res;
switch (selector_rgb)
{
case RGB_MUL_COMBINED: res = inputs.combined.rgb; break;
case RGB_MUL_COMBINED_ALPHA: res = inputs.combined.aaa; break;
case RGB_MUL_TEXEL0: res = inputs.texel0.rgb; break;
case RGB_MUL_TEXEL1: res = inputs.texel1.rgb; break;
case RGB_MUL_SHADE: res = inputs.shade.rgb; break;
case RGB_MUL_TEXEL0_ALPHA: res = inputs.texel0.aaa; break;
case RGB_MUL_TEXEL1_ALPHA: res = inputs.texel1.aaa; break;
case RGB_MUL_SHADE_ALPHA: res = inputs.shade.aaa; break;
case RGB_MUL_LOD_FRAC: res = ivec3(inputs.lod_frac); break;
case RGB_MUL_K5: res = ivec3((int(inputs.constant_mul.g) << 8) | inputs.constant_mul.b); break;
default: res = inputs.constant_mul.rgb; break;
}
int alpha;
switch (selector_alpha)
{
case ALPHA_MUL_LOD_FRAC: alpha = inputs.lod_frac; break;
case ALPHA_MUL_TEXEL0_ALPHA: alpha = inputs.texel0.a; break;
case ALPHA_MUL_TEXEL1_ALPHA: alpha = inputs.texel1.a; break;
case ALPHA_MUL_SHADE_ALPHA: alpha = inputs.shade.a; break;
default: alpha = inputs.constant_mul.a; break;
}
return ivec4(res, alpha);
}
// Selects the "d" (add) operand of the combiner equation.
ivec4 select_add(CombinerInputs inputs, int selector_rgb, int selector_alpha)
{
ivec3 res;
switch (selector_rgb)
{
case RGB_ADD_COMBINED: res = inputs.combined.rgb; break;
case RGB_ADD_TEXEL0: res = inputs.texel0.rgb; break;
case RGB_ADD_TEXEL1: res = inputs.texel1.rgb; break;
case RGB_ADD_SHADE: res = inputs.shade.rgb; break;
case RGB_ADD_ONE: res = ivec3(0x100); break;
default: res = inputs.constant_add.rgb; break;
}
int alpha;
switch (selector_alpha)
{
case ALPHA_ADDSUB_COMBINED: alpha = inputs.combined.a; break;
case ALPHA_ADDSUB_TEXEL0_ALPHA: alpha = inputs.texel0.a; break;
case ALPHA_ADDSUB_TEXEL1_ALPHA: alpha = inputs.texel1.a; break;
case ALPHA_ADDSUB_SHADE_ALPHA: alpha = inputs.shade.a; break;
case ALPHA_ADDSUB_ONE: alpha = 0x100; break;
default: alpha = inputs.constant_add.a; break;
}
return ivec4(res, alpha);
}
// First combiner cycle: evaluates the equation with the cycle-0 muxes and,
// when alpha testing is enabled, also derives the alpha-test reference
// value (with coverage modulation / dithering applied as configured).
i16x4 combiner_cycle0(CombinerInputs inputs, u8x4 combiner_inputs_rgb, u8x4 combiner_inputs_alpha, int alpha_dith,
int coverage, bool cvg_times_alpha, bool alpha_cvg_select, bool alpha_test, out u8 alpha_test_reference)
{
ivec4 muladd = select_muladd(inputs, combiner_inputs_rgb.x, combiner_inputs_alpha.x);
ivec4 mulsub = select_mulsub(inputs, combiner_inputs_rgb.y, combiner_inputs_alpha.y);
ivec4 mul = select_mul(inputs, combiner_inputs_rgb.z, combiner_inputs_alpha.z);
ivec4 add = select_add(inputs, combiner_inputs_rgb.w, combiner_inputs_alpha.w);
i16x4 combined = combiner_equation(muladd, mulsub, mul, add);
if (alpha_test)
{
int clamped_alpha = clamp_9bit(combined.a);
// Expands 0xff to 0x100 to avoid having to divide by 2**n - 1.
int expanded_alpha = clamped_alpha + ((clamped_alpha + 1) >> 8);
if (alpha_cvg_select)
{
// Reference alpha comes from coverage, optionally scaled by alpha.
int modulated_alpha;
if (cvg_times_alpha)
modulated_alpha = (expanded_alpha * coverage + 4) >> 3;
else
modulated_alpha = coverage << 5;
expanded_alpha = modulated_alpha;
}
else
expanded_alpha += alpha_dith;
alpha_test_reference = u8(clamp(expanded_alpha, 0, 0xff));
}
else
alpha_test_reference = U8_C(0);
return combined;
}
// Second combiner cycle: evaluates the equation with the cycle-1 muxes,
// clamps the result, and applies coverage/alpha modulation. `coverage` is
// inout because cvg_times_alpha feeds the modulated alpha back into it.
i16x4 combiner_cycle1(CombinerInputs inputs, u8x4 combiner_inputs_rgb, u8x4 combiner_inputs_alpha, int alpha_dith,
inout int coverage, bool cvg_times_alpha, bool alpha_cvg_select)
{
ivec4 muladd = select_muladd(inputs, combiner_inputs_rgb.x, combiner_inputs_alpha.x);
ivec4 mulsub = select_mulsub(inputs, combiner_inputs_rgb.y, combiner_inputs_alpha.y);
ivec4 mul = select_mul(inputs, combiner_inputs_rgb.z, combiner_inputs_alpha.z);
ivec4 add = select_add(inputs, combiner_inputs_rgb.w, combiner_inputs_alpha.w);
i16x4 combined = combiner_equation(muladd, mulsub, mul, add);
combined = clamp_9bit_notrunc(combined);
// Expands 0xff to 0x100 to avoid having to divide by 2**n - 1.
int expanded_alpha = combined.a + ((combined.a + 1) >> 8);
int modulated_alpha;
if (cvg_times_alpha)
{
modulated_alpha = (expanded_alpha * coverage + 4) >> 3;
// Feed modulated alpha back into pixel coverage.
coverage = modulated_alpha >> 5;
}
else
modulated_alpha = coverage << 5;
if (alpha_cvg_select)
expanded_alpha = modulated_alpha;
else
expanded_alpha += alpha_dith;
combined.a = i16(clamp(expanded_alpha, 0, 0xff));
return combined;
}
#endif

View File

@ -0,0 +1,81 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef COVERAGE_H_
#define COVERAGE_H_
#include "data_structures.h"
const int SUBPIXELS_LOG2 = 2;
const int SUBPIXELS = 1 << SUBPIXELS_LOG2;
// Computes the 8-bit coverage mask for pixel column x: one bit per
// subsample, set when the subsample lies inside [xleft, xright) for its
// scanline. xleft/xright hold the four scanlines' edge positions in
// subpixel units.
u8 compute_coverage(u16x4 xleft, u16x4 xright, int x)
{
// Subsample X offsets within the pixel (the 0/4/2/6 ordering matches the
// RDP's coverage bit layout); x << 3 converts the pixel to subpixels.
u16x4 xshift = u16x4(0, 4, 2, 6) + (u16(x) << U16_C(3));
// A subsample is clipped if it is left of xleft or at/right of xright.
bvec4 clip_lo_x01 = lessThan(xshift, xleft.xxyy);
bvec4 clip_lo_x23 = lessThan(xshift, xleft.zzww);
bvec4 clip_hi_x01 = greaterThanEqual(xshift, xright.xxyy);
bvec4 clip_hi_x23 = greaterThanEqual(xshift, xright.zzww);
u8x4 clip_x0 = u8x4(clip_lo_x01) | u8x4(clip_hi_x01);
u8x4 clip_x1 = u8x4(clip_lo_x23) | u8x4(clip_hi_x23);
// Pack the eight clip flags into one byte, then invert: set bits = covered.
u8x4 clip_x = clip_x0 * u8x4(1, 2, 4, 8) + clip_x1 * u8x4(16, 32, 64, 128);
u8 clip_coverage = (clip_x.x | clip_x.y) | (clip_x.z | clip_x.w);
return ~clip_coverage & U8_C(0xff);
}
// Coverage destination (CVG_DST) modes.
const int COVERAGE_CLAMP = 0;
const int COVERAGE_WRAP = 1;
const int COVERAGE_ZAP = 2;
const int COVERAGE_SAVE = 3;
// Combines the incoming pixel coverage with the coverage already stored in
// memory, according to the CVG_DST mode. Coverage is a 3-bit value (0-7).
int blend_coverage(int coverage, int memory_coverage, bool blend_en, int mode)
{
int res = 0;
switch (mode)
{
case COVERAGE_CLAMP:
{
if (blend_en)
res = min(7, memory_coverage + coverage); // image_read_en to read memory coverage, otherwise, it's 7.
else
res = (coverage - 1) & 7;
break;
}
case COVERAGE_WRAP:
// Sum wraps modulo 8 instead of saturating.
res = (coverage + memory_coverage) & 7;
break;
case COVERAGE_ZAP:
// Force full coverage.
res = 7;
break;
case COVERAGE_SAVE:
// Preserve whatever was in memory.
res = memory_coverage;
break;
}
return res;
}
#endif

View File

@ -0,0 +1,345 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef DATA_STRUCTURES_H_
#define DATA_STRUCTURES_H_
// Data structures which are supposed to match up with rdp_data_structures.hpp.
// A little dirty to duplicate like this, but it's non-trivial to share headers with C++,
// especially when we need to deal with small integer types.
// Flag bits for TriangleSetup::flags.
const int TRIANGLE_SETUP_FLIP_BIT = 1 << 0;
const int TRIANGLE_SETUP_DO_OFFSET_BIT = 1 << 1;
const int TRIANGLE_SETUP_SKIP_XFRAC_BIT = 1 << 2;
const int TRIANGLE_SETUP_INTERLACE_FIELD_BIT = 1 << 3;
const int TRIANGLE_SETUP_INTERLACE_KEEP_ODD_BIT = 1 << 4;
const int TRIANGLE_SETUP_DISABLE_UPSCALING_BIT = 1 << 5;
const int TRIANGLE_SETUP_NATIVE_LOD_BIT = 1 << 6;
// Flag bits for StaticRasterizationState::flags.
const int RASTERIZATION_INTERLACE_FIELD_BIT = 1 << 0;
const int RASTERIZATION_INTERLACE_KEEP_ODD_BIT = 1 << 1;
const int RASTERIZATION_AA_BIT = 1 << 2;
const int RASTERIZATION_PERSPECTIVE_CORRECT_BIT = 1 << 3;
const int RASTERIZATION_TLUT_BIT = 1 << 4;
const int RASTERIZATION_TLUT_TYPE_BIT = 1 << 5;
const int RASTERIZATION_CVG_TIMES_ALPHA_BIT = 1 << 6;
const int RASTERIZATION_ALPHA_CVG_SELECT_BIT = 1 << 7;
const int RASTERIZATION_MULTI_CYCLE_BIT = 1 << 8;
const int RASTERIZATION_TEX_LOD_ENABLE_BIT = 1 << 9;
const int RASTERIZATION_SHARPEN_LOD_ENABLE_BIT = 1 << 10;
const int RASTERIZATION_DETAIL_LOD_ENABLE_BIT = 1 << 11;
const int RASTERIZATION_FILL_BIT = 1 << 12;
const int RASTERIZATION_COPY_BIT = 1 << 13;
const int RASTERIZATION_SAMPLE_MODE_BIT = 1 << 14;
const int RASTERIZATION_ALPHA_TEST_BIT = 1 << 15;
const int RASTERIZATION_ALPHA_TEST_DITHER_BIT = 1 << 16;
const int RASTERIZATION_SAMPLE_MID_TEXEL_BIT = 1 << 17;
const int RASTERIZATION_USES_TEXEL0_BIT = 1 << 18;
const int RASTERIZATION_USES_TEXEL1_BIT = 1 << 19;
const int RASTERIZATION_USES_LOD_BIT = 1 << 20;
const int RASTERIZATION_USES_PIPELINED_TEXEL1_BIT = 1 << 21;
const int RASTERIZATION_CONVERT_ONE_BIT = 1 << 22;
const int RASTERIZATION_BILERP_0_BIT = 1 << 23;
const int RASTERIZATION_BILERP_1_BIT = 1 << 24;
// NOTE(review): this is a multi-bit field offset (bits 26..27), not a single
// flag bit; bit 25 appears unused — confirm against the CPU-side encoder.
const int RASTERIZATION_UPSCALING_LOG2_BIT_OFFSET = 26;
const int RASTERIZATION_NEED_NOISE_BIT = 1 << 28;
const int RASTERIZATION_USE_STATIC_TEXTURE_SIZE_FORMAT_BIT = 1 << 29;
const int RASTERIZATION_USE_SPECIALIZATION_CONSTANT_BIT = 1 << 30;
// Flag bits for DepthBlendState::flags. (Bit 2 is unassigned here.)
const int DEPTH_BLEND_DEPTH_TEST_BIT = 1 << 0;
const int DEPTH_BLEND_DEPTH_UPDATE_BIT = 1 << 1;
const int DEPTH_BLEND_FORCE_BLEND_BIT = 1 << 3;
const int DEPTH_BLEND_IMAGE_READ_ENABLE_BIT = 1 << 4;
const int DEPTH_BLEND_COLOR_ON_COVERAGE_BIT = 1 << 5;
const int DEPTH_BLEND_MULTI_CYCLE_BIT = 1 << 6;
const int DEPTH_BLEND_AA_BIT = 1 << 7;
const int DEPTH_BLEND_DITHER_ENABLE_BIT = 1 << 8;
// Edge-walker setup for one triangle (xh/xm/xl edge X positions and their
// per-scanline slopes dx*dy, plus y extents), mirroring the RDP triangle
// command. The *Mem variant uses packed small types for SSBO storage; when
// SMALL_TYPES is unavailable, the unpacked variant below is used instead.
struct TriangleSetupMem
{
int xh, xm, xl;
mem_i16 yh, ym;
int dxhdy, dxmdy, dxldy;
mem_i16 yl; mem_u8 flags; mem_u8 tile;
};
#if SMALL_TYPES
#define TriangleSetup TriangleSetupMem
#else
struct TriangleSetup
{
int xh, xm, xl;
i16 yh, ym;
int dxhdy, dxmdy, dxldy;
i16 yl; u8 flags; u8 tile;
};
#endif
// Per-triangle attribute setup: shade color (rgba) and texture/depth (stzw)
// start values plus their derivatives along X (dx), the edge (de) and Y (dy).
// All members are full ivec4, so no separate small-type variant is needed.
struct AttributeSetupMem
{
ivec4 rgba;
ivec4 drgba_dx;
ivec4 drgba_de;
ivec4 drgba_dy;
ivec4 stzw;
ivec4 dstzw_dx;
ivec4 dstzw_de;
ivec4 dstzw_dy;
};
#define AttributeSetup AttributeSetupMem
// Pre-interpolated state for one scanline span: attribute values at the span
// start, sub-scanline left/right edges (consumed by compute_coverage), and the
// X range to rasterize.
struct SpanSetupMem
{
ivec4 rgba;
ivec4 stzw;
mem_u16x4 xleft;
mem_u16x4 xright;
int interpolation_base_x;
int start_x;
int end_x;
mem_i16 lodlength;
mem_u16 valid_line;
};
#if SMALL_TYPES
#define SpanSetup SpanSetupMem
#else
struct SpanSetup
{
ivec4 rgba;
ivec4 stzw;
u16x4 xleft;
u16x4 xright;
int interpolation_base_x;
int start_x;
int end_x;
i16 lodlength;
u16 valid_line;
};
#endif
// Per-primitive window into the span setup array: base offset plus the
// [ylo, yhi] scanline range it covers. Padded to 16 bytes for std430.
struct SpanInfoOffsetsMem
{
int offset;
int ylo;
int yhi;
int padding;
};
#define SpanInfoOffsets SpanInfoOffsetsMem
// Constants derived once per primitive on the CPU side: pre-resolved combiner
// inputs for both cycles, fog/blend/fill colors, depth-delta terms and LOD
// factors.
struct DerivedSetupMem
{
mem_u8x4 constant_muladd0;
mem_u8x4 constant_mulsub0;
mem_u8x4 constant_mul0;
mem_u8x4 constant_add0;
mem_u8x4 constant_muladd1;
mem_u8x4 constant_mulsub1;
mem_u8x4 constant_mul1;
mem_u8x4 constant_add1;
mem_u8x4 fog_color;
mem_u8x4 blend_color;
uint fill_color;
mem_u16 dz;
mem_u8 dz_compressed;
mem_u8 min_lod;
mem_i16x4 factors;
};
#if SMALL_TYPES
#define DerivedSetup DerivedSetupMem
#else
struct DerivedSetup
{
u8x4 constant_muladd0;
u8x4 constant_mulsub0;
u8x4 constant_mul0;
u8x4 constant_add0;
u8x4 constant_muladd1;
u8x4 constant_mulsub1;
u8x4 constant_mul1;
u8x4 constant_add1;
u8x4 fog_color;
u8x4 blend_color;
uint fill_color;
u16 dz;
u8 dz_compressed;
u8 min_lod;
i16x4 factors;
};
#endif
// In memory the scissor box is packed as a plain ivec4 (xlo, ylo, xhi, yhi).
#define ScissorStateMem ivec4
struct ScissorState
{
int xlo, ylo, xhi, yhi;
};
// Flag bits for TileInfo::flags (per-axis clamp/mirror addressing).
const int TILE_INFO_CLAMP_S_BIT = 1 << 0;
const int TILE_INFO_MIRROR_S_BIT = 1 << 1;
const int TILE_INFO_CLAMP_T_BIT = 1 << 2;
const int TILE_INFO_MIRROR_T_BIT = 1 << 3;
// One RDP tile descriptor: S/T bounds, TMEM offset/stride, texel format/size,
// palette and per-axis mask/shift addressing parameters.
struct TileInfoMem
{
uint slo;
uint shi;
uint tlo;
uint thi;
uint offset;
uint stride;
mem_u8 fmt;
mem_u8 size;
mem_u8 palette;
mem_u8 mask_s;
mem_u8 shift_s;
mem_u8 mask_t;
mem_u8 shift_t;
mem_u8 flags;
};
#if SMALL_TYPES
#define TileInfo TileInfoMem
#else
struct TileInfo
{
uint slo;
uint shi;
uint tlo;
uint thi;
uint offset;
uint stride;
u8 fmt;
u8 size;
u8 palette;
u8 mask_s;
u8 shift_s;
u8 mask_t;
u8 shift_t;
u8 flags;
};
#endif
// Rasterizer state that is constant for a primitive: combiner input selectors
// for both cycles, RASTERIZATION_* flags, dither mode and static texture
// size/format (used when the corresponding flag bits are set).
struct StaticRasterizationStateMem
{
mem_u8x4 combiner_inputs_rgb0;
mem_u8x4 combiner_inputs_alpha0;
mem_u8x4 combiner_inputs_rgb1;
mem_u8x4 combiner_inputs_alpha1;
uint flags;
int dither;
int texture_size;
int texture_fmt;
};
#if SMALL_TYPES
#define StaticRasterizationState StaticRasterizationStateMem
#else
struct StaticRasterizationState
{
u8x4 combiner_inputs_rgb0;
u8x4 combiner_inputs_alpha0;
u8x4 combiner_inputs_rgb1;
u8x4 combiner_inputs_alpha1;
uint flags;
int dither;
int texture_size;
int texture_fmt;
};
#endif
// Depth/blend stage state: blender mux selectors for both cycles,
// DEPTH_BLEND_* flags, coverage destination mode and Z compare mode.
struct DepthBlendStateMem
{
mem_u8x4 blend_modes0;
mem_u8x4 blend_modes1;
uint flags;
mem_u8 coverage_mode;
mem_u8 z_mode;
mem_u8 padding0;
mem_u8 padding1;
};
#if SMALL_TYPES
#define DepthBlendState DepthBlendStateMem
#else
struct DepthBlendState
{
u8x4 blend_modes0;
u8x4 blend_modes1;
uint flags;
u8 coverage_mode;
u8 z_mode;
u8 padding0;
u8 padding1;
};
#endif
// Per-primitive indirection: indices into the various state/TMEM instance
// arrays, plus one tile-info index per RDP tile slot.
struct InstanceIndicesMem
{
mem_u8x4 static_depth_tmem;
mem_u8x4 other;
mem_u8 tile_infos[8];
};
// One TMEM snapshot viewed as 16-bit elements (4 KiB total).
struct TMEMInstance16Mem
{
mem_u16 elems[2048];
};
// One TMEM snapshot viewed as 8-bit elements (4 KiB total).
struct TMEMInstance8Mem
{
mem_u8 elems[4096];
};
// Output of the shading pass for one pixel, consumed by depth/blend.
struct ShadedData
{
u8x4 combined;
int z_dith;
u8 coverage_count;
u8 shade_alpha;
};
// Special markers folded into the per-pixel coverage byte; they route the
// pixel through the fill or copy pipeline instead of the full blend path.
const int COVERAGE_FILL_BIT = 0x40;
const int COVERAGE_COPY_BIT = 0x20;
// Global framebuffer parameters shared by all passes in a flush.
struct GlobalFBInfo
{
int dx_shift;
int dx_mask;
int fb_size;
uint base_primitive_index;
};
#endif

View File

@ -0,0 +1,134 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef DATA_STRUCTURES_BUFFERS_H_
#define DATA_STRUCTURES_BUFFERS_H_
#include "data_structures.h"
// RDRAM exposed as three aliased views over the same binding (set 0, binding 0),
// so shaders can pick the element width matching the framebuffer format.
layout(set = 0, binding = 0, std430) buffer VRAM32
{
uint data[];
} vram32;
layout(set = 0, binding = 0, std430) buffer VRAM16
{
mem_u16 data[];
} vram16;
layout(set = 0, binding = 0, std430) buffer VRAM8
{
mem_u8 data[];
} vram8;
// The "hidden" extra RDRAM bits (per 16-bit word) used for coverage/dz storage.
layout(set = 0, binding = 1, std430) buffer HiddenVRAM
{
mem_u8 data[];
} hidden_vram;
// TMEM snapshots, aliased as 16-bit and 8-bit views (set 0, binding 2).
layout(set = 0, binding = 2, std430) readonly buffer TMEM16
{
TMEMInstance16Mem instances[];
} tmem16;
layout(set = 0, binding = 2, std430) readonly buffer TMEM8
{
TMEMInstance8Mem instances[];
} tmem8;
// Set 1: per-flush state arrays, indexed by primitive. Each array is paired
// with a load_*.h include that unpacks the *Mem storage struct for shader use.
layout(set = 1, binding = 0, std430) readonly buffer TriangleSetupBuffer
{
TriangleSetupMem elems[];
} triangle_setup;
#include "load_triangle_setup.h"
layout(set = 1, binding = 1, std430) readonly buffer AttributeSetupBuffer
{
AttributeSetupMem elems[];
} attribute_setup;
#include "load_attribute_setup.h"
layout(set = 1, binding = 2, std430) readonly buffer DerivedSetupBuffer
{
DerivedSetupMem elems[];
} derived_setup;
#include "load_derived_setup.h"
layout(set = 1, binding = 3, std430) readonly buffer ScissorStateBuffer
{
ScissorStateMem elems[];
} scissor_state;
#include "load_scissor_state.h"
layout(set = 1, binding = 4, std430) readonly buffer StaticRasterStateBuffer
{
StaticRasterizationStateMem elems[];
} static_raster_state;
#include "load_static_raster_state.h"
layout(set = 1, binding = 5, std430) readonly buffer DepthBlendStateBuffer
{
DepthBlendStateMem elems[];
} depth_blend_state;
#include "load_depth_blend_state.h"
layout(set = 1, binding = 6, std430) readonly buffer StateIndicesBuffer
{
InstanceIndicesMem elems[];
} state_indices;
layout(set = 1, binding = 7, std430) readonly buffer TileInfoBuffer
{
TileInfoMem elems[];
} tile_infos;
#include "load_tile_info.h"
layout(set = 1, binding = 8, std430) readonly buffer SpanSetups
{
SpanSetupMem elems[];
} span_setups;
#include "load_span_setup.h"
layout(set = 1, binding = 9, std430) readonly buffer SpanInfoOffsetBuffer
{
SpanInfoOffsetsMem elems[];
} span_offsets;
#include "load_span_offsets.h"
// Lookup table for the blender's coverage divide.
layout(set = 1, binding = 10) uniform utextureBuffer uBlenderDividerLUT;
// Per-tile primitive bitmasks: fine (one bit per primitive) and coarse
// (one bit per 32-primitive group).
layout(set = 1, binding = 11, std430) readonly buffer TileBinning
{
uint elems[];
} tile_binning;
layout(set = 1, binding = 12, std430) readonly buffer TileBinningCoarse
{
uint elems[];
} tile_binning_coarse;
layout(set = 2, binding = 0, std140) uniform GlobalConstants
{
GlobalFBInfo fb_info;
} global_constants;
#endif

View File

@ -0,0 +1,151 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef DEBUG_H_
#define DEBUG_H_
#if defined(DEBUG_ENABLE) && DEBUG_ENABLE
#include "debug_channel.h"
// Message codes; the host side decodes these from the debug message channel.
const uint CODE_ASSERT_EQUAL = 0;
const uint CODE_ASSERT_NOT_EQUAL = 1;
const uint CODE_ASSERT_LESS_THAN = 2;
const uint CODE_ASSERT_LESS_THAN_EQUAL = 3;
const uint CODE_GENERIC = 4;
const uint CODE_HEX = 5;
// Assert helpers: report (line, a, b) through add_debug_message() when the
// asserted condition is violated. int and uint overloads are provided.
void ASSERT_EQUAL_(int line, int a, int b)
{
if (a != b)
add_debug_message(CODE_ASSERT_EQUAL, gl_GlobalInvocationID, ivec3(line, a, b));
}
void ASSERT_NOT_EQUAL_(int line, int a, int b)
{
if (a == b)
add_debug_message(CODE_ASSERT_NOT_EQUAL, gl_GlobalInvocationID, ivec3(line, a, b));
}
void ASSERT_LESS_THAN_(int line, int a, int b)
{
if (a >= b)
add_debug_message(CODE_ASSERT_LESS_THAN, gl_GlobalInvocationID, ivec3(line, a, b));
}
void ASSERT_LESS_THAN_EQUAL_(int line, int a, int b)
{
if (a > b)
add_debug_message(CODE_ASSERT_LESS_THAN_EQUAL, gl_GlobalInvocationID, ivec3(line, a, b));
}
// uint overloads; payload values are converted to ivec3 for the message.
void ASSERT_EQUAL_(int line, uint a, uint b)
{
if (a != b)
add_debug_message(CODE_ASSERT_EQUAL, gl_GlobalInvocationID, ivec3(line, a, b));
}
void ASSERT_NOT_EQUAL_(int line, uint a, uint b)
{
if (a == b)
add_debug_message(CODE_ASSERT_NOT_EQUAL, gl_GlobalInvocationID, ivec3(line, a, b));
}
void ASSERT_LESS_THAN_(int line, uint a, uint b)
{
if (a >= b)
add_debug_message(CODE_ASSERT_LESS_THAN, gl_GlobalInvocationID, ivec3(line, a, b));
}
void ASSERT_LESS_THAN_EQUAL_(int line, uint a, uint b)
{
if (a > b)
add_debug_message(CODE_ASSERT_LESS_THAN_EQUAL, gl_GlobalInvocationID, ivec3(line, a, b));
}
// Free-form log messages: source line plus 0-3 payload words.
void GENERIC_MESSAGE_(int line)
{
add_debug_message(CODE_GENERIC, gl_GlobalInvocationID, line);
}
void GENERIC_MESSAGE_(int line, uint v)
{
add_debug_message(CODE_GENERIC, gl_GlobalInvocationID, uvec2(line, v));
}
void GENERIC_MESSAGE_(int line, uvec2 v)
{
add_debug_message(CODE_GENERIC, gl_GlobalInvocationID, uvec3(line, v));
}
void GENERIC_MESSAGE_(int line, uvec3 v)
{
add_debug_message(CODE_GENERIC, gl_GlobalInvocationID, uvec4(line, v));
}
// Same as GENERIC_MESSAGE_, but tagged so the host prints payloads in hex.
void HEX_MESSAGE_(int line)
{
add_debug_message(CODE_HEX, gl_GlobalInvocationID, line);
}
void HEX_MESSAGE_(int line, uint v)
{
add_debug_message(CODE_HEX, gl_GlobalInvocationID, uvec2(line, v));
}
void HEX_MESSAGE_(int line, uvec2 v)
{
add_debug_message(CODE_HEX, gl_GlobalInvocationID, uvec3(line, v));
}
void HEX_MESSAGE_(int line, uvec3 v)
{
add_debug_message(CODE_HEX, gl_GlobalInvocationID, uvec4(line, v));
}
// User-facing macros capture __LINE__ automatically.
// NOTE(review): macro names are spelled "ASERT" (single S); kept as-is because
// call sites elsewhere in the project presumably use this spelling — verify
// before renaming.
#define ASERT_EQUAL(a, b) ASSERT_EQUAL_(__LINE__, a, b)
#define ASERT_NOT_EQUAL(a, b) ASSERT_NOT_EQUAL_(__LINE__, a, b)
#define ASERT_LESS_THAN(a, b) ASSERT_LESS_THAN_(__LINE__, a, b)
#define ASERT_LESS_THAN_EQUAL(a, b) ASSERT_LESS_THAN_EQUAL_(__LINE__, a, b)
#define GENERIC_MESSAGE0() GENERIC_MESSAGE_(__LINE__)
#define GENERIC_MESSAGE1(a) GENERIC_MESSAGE_(__LINE__, a)
#define GENERIC_MESSAGE2(a, b) GENERIC_MESSAGE_(__LINE__, uvec2(a, b))
#define GENERIC_MESSAGE3(a, b, c) GENERIC_MESSAGE_(__LINE__, uvec3(a, b, c))
#define HEX_MESSAGE0() HEX_MESSAGE_(__LINE__)
#define HEX_MESSAGE1(a) HEX_MESSAGE_(__LINE__, a)
#define HEX_MESSAGE2(a, b) HEX_MESSAGE_(__LINE__, uvec2(a, b))
#define HEX_MESSAGE3(a, b, c) HEX_MESSAGE_(__LINE__, uvec3(a, b, c))
#else
// Debug disabled: all hooks compile to nothing.
#define ASERT_EQUAL(a, b)
#define ASERT_NOT_EQUAL(a, b)
#define ASERT_LESS_THAN(a, b)
#define ASERT_LESS_THAN_EQUAL(a, b)
#define GENERIC_MESSAGE0()
#define GENERIC_MESSAGE1(a)
#define GENERIC_MESSAGE2(a, b)
#define GENERIC_MESSAGE3(a, b, c)
#define HEX_MESSAGE0()
#define HEX_MESSAGE1(a)
#define HEX_MESSAGE2(a, b)
#define HEX_MESSAGE3(a, b, c)
#endif
#endif

View File

@ -0,0 +1,149 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#if SUBGROUP
#extension GL_KHR_shader_subgroup_basic : require
#extension GL_KHR_shader_subgroup_vote : require
#extension GL_KHR_shader_subgroup_ballot : require
#extension GL_KHR_shader_subgroup_arithmetic : require
#endif
#include "small_types.h"
layout(local_size_x_id = 3, local_size_y_id = 4) in;
#include "noise.h"
#include "debug.h"
#include "data_structures_buffers.h"
#include "memory_interfacing.h"
// Per-pixel intermediate results produced by the rasterizer/shading pass.
// ColorBuffer and ColorRawBuffer alias the same binding: the former as packed
// RGBA bytes, the latter as raw 32-bit words (used by the copy pipeline).
layout(set = 0, binding = 3, std430) readonly buffer ColorBuffer
{
mem_u8x4 elems[];
} color;
layout(set = 0, binding = 3, std430) readonly buffer ColorRawBuffer
{
uint elems[];
} raw_color;
layout(set = 0, binding = 4, std430) readonly buffer DepthBuffer
{
int elems[];
} depth;
layout(set = 0, binding = 5, std430) readonly buffer ShadeAlpha
{
mem_u8 elems[];
} shade_alpha;
// Signed per-pixel coverage; negative means the pixel was not touched by the
// primitive instance.
layout(set = 0, binding = 6, std430) readonly buffer Coverage
{
mem_i8 elems[];
} coverage;
// Base offset into the per-pixel arrays for each (tile, primitive-group) pair.
layout(std430, set = 0, binding = 7) readonly buffer TileInstanceOffset
{
uint elems[];
} tile_instance_offsets;
layout(push_constant, std430) uniform Registers
{
uint fb_addr_index;
uint fb_depth_addr_index;
uint fb_width;
uint fb_height;
uint group_mask;
} registers;
layout(constant_id = 5) const int MAX_PRIMITIVES = 256;
layout(constant_id = 6) const int MAX_WIDTH = 1024;
// One binning word covers 32 primitives.
const int TILE_BINNING_STRIDE = MAX_PRIMITIVES / 32;
const int MAX_TILES_X = MAX_WIDTH / int(gl_WorkGroupSize.x);
// Overall architecture of the tiling is from RetroWarp.
// Depth/blend resolve: each workgroup is one tile; each invocation walks the
// primitives binned to that tile in order and blends them into the tile-local
// framebuffer working set (init_tile/finish_tile handle RDRAM in/out).
void main()
{
int x = int(gl_GlobalInvocationID.x);
int y = int(gl_GlobalInvocationID.y);
ivec2 tile = ivec2(gl_WorkGroupID.xy);
int linear_tile = tile.x + tile.y * MAX_TILES_X;
int linear_tile_base = linear_tile * TILE_BINNING_STRIDE;
// group_mask limits which 32-primitive groups this dispatch should process.
uint coarse_binned = tile_binning_coarse.elems[linear_tile] & registers.group_mask;
if (coarse_binned == 0u)
return;
init_tile(gl_GlobalInvocationID.xy,
registers.fb_width, registers.fb_height,
registers.fb_addr_index, registers.fb_depth_addr_index);
// Outer loop: 32-primitive groups with at least one binned primitive.
while (coarse_binned != 0u)
{
int mask_index = findLSB(coarse_binned);
coarse_binned &= ~uint(1 << mask_index);
uint tile_instance = tile_instance_offsets.elems[linear_tile_base + mask_index];
uint binned = tile_binning.elems[linear_tile_base + mask_index];
// Inner loop: individual primitives within the group, in submission order.
while (binned != 0u)
{
int i = findLSB(binned);
binned &= ~uint(1 << i);
uint primitive_index = uint(i + 32 * mask_index);
uint index = tile_instance * (gl_WorkGroupSize.x * gl_WorkGroupSize.y) + gl_LocalInvocationIndex;
// NOTE: local shadows the Coverage SSBO from here on; the initializer
// still reads the buffer. Negative coverage = pixel not covered.
int coverage = int(coverage.elems[index]);
if (coverage >= 0)
{
if ((coverage & COVERAGE_FILL_BIT) != 0)
{
// Fill pipeline: write the primitive's fill color directly.
fill_color(derived_setup.elems[primitive_index].fill_color);
}
else if ((coverage & COVERAGE_COPY_BIT) != 0)
{
// Copy pipeline: raw word from the color buffer, no blending.
uint word = raw_color.elems[index];
copy_pipeline(word, primitive_index);
}
else
{
// Full path: repack the shaded pixel and run depth test + blender.
ShadedData shaded;
shaded.combined = u8x4(color.elems[index]);
shaded.z_dith = depth.elems[index];
shaded.shade_alpha = u8(shade_alpha.elems[index]);
shaded.coverage_count = u8(coverage);
depth_blend(x, y, primitive_index, shaded);
}
}
// Per-pixel arrays are laid out one instance per binned primitive.
tile_instance++;
}
}
finish_tile(gl_GlobalInvocationID.xy,
registers.fb_width, registers.fb_height,
registers.fb_addr_index, registers.fb_depth_addr_index);
}

View File

@ -0,0 +1,146 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef DEPTH_TEST_H_
#define DEPTH_TEST_H_
#include "z_encode.h"
const int Z_MODE_OPAQUE = 0;
const int Z_MODE_INTERPENETRATING = 1;
const int Z_MODE_TRANSPARENT = 2;
const int Z_MODE_DECAL = 3;
// Rounds dz down to the largest power-of-two that is <= dz; 0 stays 0.
int combine_dz(int dz)
{
	return dz == 0 ? 0 : (1 << findMSB(dz));
}
// RDP depth test for one pixel. Returns true if the pixel passes.
// Inputs: incoming z/dz (and its compressed form), the depth-buffer contents
// (current_depth/current_dz), incoming and stored coverage counts, and the
// depth/blend mode flags. Outputs: blend_en (accumulate into existing
// coverage), coverage_wrap (coverage overflowed), and blend_shift (relative
// dz shifts fed to the blender). coverage_count may be modified in
// INTERPENETRATING mode. All arithmetic is bit-exact RDP behavior — do not
// simplify.
bool depth_test(int z, int dz, int dz_compressed,
u16 current_depth, u8 current_dz,
inout int coverage_count, int current_coverage_count,
bool z_compare, int z_mode,
bool force_blend, bool aa_enable,
out bool blend_en, out bool coverage_wrap, out u8x2 blend_shift)
{
bool depth_pass;
if (z_compare)
{
int memory_z = z_decompress(current_depth);
int memory_dz = dz_decompress(current_dz);
// Exponent field of the compressed depth value.
int precision_factor = (int(current_depth) >> 11) & 0xf;
bool coplanar = false;
// Relative dz shifts: how much smaller one dz is than the other (max 4).
blend_shift.x = u8(clamp(dz_compressed - current_dz, 0, 4));
blend_shift.y = u8(clamp(current_dz - dz_compressed, 0, 4));
if (precision_factor < 3)
{
// Low-precision depth: widen memory dz; the 0x8000 sentinel marks a
// coplanar surface and forces the maximum tolerance.
if (memory_dz != 0x8000)
memory_dz = max(memory_dz << 1, 16 >> precision_factor);
else
{
coplanar = true;
memory_dz = 0xffff;
}
}
int combined_dz = combine_dz(dz | memory_dz);
int combined_dz_interpenetrate = combined_dz;
combined_dz <<= 3;
// farther/nearer: z within combined_dz tolerance behind/in front of memory.
bool farther = coplanar || ((z + combined_dz) >= memory_z);
bool overflow = (coverage_count + current_coverage_count) >= 8;
blend_en = force_blend || (!overflow && aa_enable && farther);
coverage_wrap = overflow;
depth_pass = false;
bool max_z = memory_z == 0x3ffff;
bool front = z < memory_z;
int z_closest_possible = z - combined_dz;
bool nearer = coplanar || (z_closest_possible <= memory_z);
switch (z_mode)
{
case Z_MODE_OPAQUE:
{
// The OPAQUE mode is normal less-than.
// However, if z is sufficiently close enough to memory Z, we assume that we have the same surface
// and we should simply increment coverage (blend_en).
// If we overflow coverage, it is clear that we have a different surface, and here we should only
// consider pure in-front test and overwrite coverage.
depth_pass = max_z || (overflow ? front : nearer);
break;
}
case Z_MODE_INTERPENETRATING:
{
// This one is ... interesting as it affects coverage.
if (!front || !farther || !overflow)
{
// If there is no decal-like intersect, treat this as normal opaque mode.
depth_pass = max_z || (overflow ? front : nearer);
}
else
{
// Modify coverage based on how far away current surface we are somehow?
combined_dz_interpenetrate = dz_compress(combined_dz_interpenetrate & 0xffff);
int cvg_coeff = ((memory_z >> combined_dz_interpenetrate) - (z >> combined_dz_interpenetrate)) & 0xf;
coverage_count = min((cvg_coeff * coverage_count) >> 3, 8);
depth_pass = true;
}
break;
}
case Z_MODE_TRANSPARENT:
{
depth_pass = front || max_z;
break;
}
case Z_MODE_DECAL:
{
// Decals pass if |z - memory_z| <= max(dz, memory_dz).
depth_pass = farther && nearer && !max_z;
break;
}
}
}
else
{
// Depth compare disabled: always pass; only coverage overflow matters.
blend_shift.x = u8(0);
blend_shift.y = u8(min(0xf - dz_compressed, 4));
bool overflow = (coverage_count + current_coverage_count) >= 8;
blend_en = force_blend || (!overflow && aa_enable);
coverage_wrap = overflow;
depth_pass = true;
}
return depth_pass;
}
#endif

View File

@ -0,0 +1,70 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef DITHER_H_
#define DITHER_H_
// 4x4 dither threshold matrices: [0] = magic square, [1] = Bayer ordering.
const u8 dither_matrices[2][16] = u8[][](
u8[](U8_C(0), U8_C(6), U8_C(1), U8_C(7), U8_C(4), U8_C(2), U8_C(5), U8_C(3), U8_C(3), U8_C(5), U8_C(2), U8_C(4), U8_C(7), U8_C(1), U8_C(6), U8_C(0)),
u8[](U8_C(0), U8_C(4), U8_C(1), U8_C(5), U8_C(4), U8_C(0), U8_C(5), U8_C(1), U8_C(3), U8_C(7), U8_C(2), U8_C(6), U8_C(7), U8_C(3), U8_C(6), U8_C(2)));
// Applies RGB dithering: rounds each channel up to the next multiple of 8 when
// its low 3 bits exceed the per-channel threshold packed in dith (3 bits per
// channel at bit offsets 0/3/6), saturating at 255.
u8x3 rgb_dither(ivec3 orig_rgb, int dith)
{
ivec3 rgb_dith = (ivec3(dith) >> ivec3(0, 3, 6)) & 7;
// Candidate rounded-up value; channels above 247 clamp straight to 255.
ivec3 rgb = mix((orig_rgb & 0xf8) + 8, ivec3(255), greaterThan(orig_rgb, ivec3(247)));
// Sign mask is all-ones when the channel's low bits exceed the threshold.
ivec3 replace_sign = (rgb_dith - (orig_rgb & 7)) >> 31;
ivec3 dither_diff = rgb - orig_rgb;
rgb = orig_rgb + (dither_diff & replace_sign);
return u8x3(rgb & 0xff);
}
// Resolves the per-pixel dither values for RGB and alpha at pixel (x, y).
// Modes: 0/1 = matrix, 2 = noise, 3 = disabled. rgb_dither gets the matrix
// threshold replicated into all three 3-bit channel fields.
void dither_coefficients(int x, int y, int dither_mode_rgb, int dither_mode_alpha, out int rgb_dither, out int alpha_dither)
{
const int DITHER_SPLAT = (1 << 0) | (1 << 3) | (1 << 6);
if (dither_mode_rgb < 2)
rgb_dither = int(dither_matrices[dither_mode_rgb][(y & 3) * 4 + (x & 3)]) * DITHER_SPLAT;
else if (dither_mode_rgb == 2)
rgb_dither = noise_get_dither_color();
else
rgb_dither = 0;
if (dither_mode_alpha == 3)
alpha_dither = 0;
else
{
if (dither_mode_alpha == 2)
{
alpha_dither = noise_get_dither_alpha();
}
else
{
// Alpha reuses the RGB matrix selection; when RGB uses noise/none, the
// matrix is picked from the low bit of the RGB mode — presumably
// matching hardware behavior, verify against reference.
alpha_dither = dither_mode_rgb >= 2 ?
int(dither_matrices[dither_mode_rgb & 1][(y & 3) * 4 + (x & 3)]) : (rgb_dither & 7);
if (dither_mode_alpha == 1)
alpha_dither = ~alpha_dither & 7;
}
}
}
#endif

View File

@ -0,0 +1,107 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "small_types.h"
layout(local_size_x = 16, local_size_y = 8) in;
// Copies VRAM into a texture which is then consumed by VI scanout.
layout(set = 0, binding = 0, rgba8ui) uniform writeonly uimage2D uAAInput;
// RDRAM aliased as 16-bit and 32-bit views over the same binding.
layout(set = 0, binding = 1, std430) readonly buffer RDRAM16
{
mem_u16 elems[];
} vram16;
layout(set = 0, binding = 1, std430) readonly buffer RDRAM32
{
uint elems[];
} vram32;
// Extra "hidden" RDRAM bits holding low coverage bits per 16-bit word.
layout(set = 0, binding = 2, std430) readonly buffer HiddenRDRAM
{
mem_u8 elems[];
} hidden_vram;
layout(push_constant, std430) uniform Registers
{
int fb_offset;
int fb_width;
ivec2 offset;
ivec2 resolution;
} registers;
// Specialized per device/configuration; RDRAM_SIZE is a power of two.
layout(constant_id = 0) const int RDRAM_SIZE = 0;
const int RDRAM_MASK_8 = RDRAM_SIZE - 1;
const int RDRAM_MASK_16 = RDRAM_MASK_8 >> 1;
const int RDRAM_MASK_32 = RDRAM_MASK_16 >> 1;
// Upscaling factor (1x, 2x, 4x, ...); upscaled copies live in higher "slices".
layout(constant_id = 2) const int SCALING_LOG2 = 0;
const int SCALING_FACTOR = 1 << SCALING_LOG2;
#include "vi_status.h"
// Fetches one framebuffer pixel (RGB plus 3-bit coverage in .a) at an
// upscaled coordinate. Sub-pixel position selects the upscaling slice;
// FMT_* and FETCH_AA come from vi_status.h.
uvec4 fetch_color(ivec2 coord)
{
ivec2 slice2d = coord & (SCALING_FACTOR - 1);
coord >>= SCALING_LOG2;
int slice = slice2d.y * SCALING_FACTOR + slice2d.x;
uvec4 color;
if (FMT_RGBA8888)
{
int linear_coord = coord.y * registers.fb_width + coord.x + registers.fb_offset;
linear_coord &= RDRAM_MASK_32;
// Each upscaling slice is a full RDRAM-sized copy.
linear_coord += slice * (RDRAM_SIZE >> 2);
uint word = uint(vram32.elems[linear_coord]);
// Alpha carries 3-bit coverage taken from bits 5..7 of the word.
color = (uvec4(word) >> uvec4(24, 16, 8, 5)) & uvec4(0xff, 0xff, 0xff, 7);
}
else if (FMT_RGBA5551)
{
int linear_coord = coord.y * registers.fb_width + coord.x + registers.fb_offset;
linear_coord &= RDRAM_MASK_16;
linear_coord += slice * (RDRAM_SIZE >> 1);
// ^ 1 swaps 16-bit halves within a 32-bit word — presumably matching
// big-endian RDRAM addressing; verify against the writer side.
uint word = uint(vram16.elems[linear_coord ^ 1]);
uint hidden_word = uint(hidden_vram.elems[linear_coord]);
// Expand 5-5-5-1 to 8-bit channels; coverage = (alpha bit << 2) | hidden bits.
uint r = (word >> 8u) & 0xf8u;
uint g = (word >> 3u) & 0xf8u;
uint b = (word << 2u) & 0xf8u;
uint a = ((word & 1u) << 2u) | hidden_word;
color = uvec4(r, g, b, a);
}
else
color = uvec4(0);
// Without AA fetch, report full coverage.
if (!FETCH_AA)
color.a = 7u;
return color;
}
// Emits one texel of the scanout input image per invocation.
void main()
{
	uvec2 pixel = gl_GlobalInvocationID.xy;
	// Ignore invocations outside the requested resolution.
	if (all(lessThan(pixel, uvec2(registers.resolution))))
	{
		uvec4 fetched = fetch_color(ivec2(pixel) + registers.offset);
		imageStore(uAAInput, ivec2(pixel), fetched);
	}
}

View File

@ -0,0 +1,10 @@
#ifndef FB_FORMATS_H_
#define FB_FORMATS_H_
// Internal framebuffer format enumeration shared between passes.
const int FB_FMT_I4 = 0;
const int FB_FMT_I8 = 1;
const int FB_FMT_RGBA5551 = 2;
const int FB_FMT_IA88 = 3;
const int FB_FMT_RGBA8888 = 4;
#endif

View File

@ -0,0 +1,32 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
// Generates a fullscreen triangle from gl_VertexIndex alone (no vertex inputs):
// vertex 0 -> (-1, -1), vertex 1 -> (-1, +3), any other -> (+3, -1).
void main()
{
	vec2 pos = vec2(-1.0);
	if (gl_VertexIndex == 1)
		pos.y = 3.0;
	else if (gl_VertexIndex != 0)
		pos.x = 3.0;
	gl_Position = vec4(pos, 0.0, 1.0);
}

View File

@ -0,0 +1,255 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef INTERPOLATION_H_
#define INTERPOLATION_H_
#include "data_structures.h"
#include "clamping.h"
#include "perspective.h"
// Interpolates the per-pixel RGBA attribute from its span-start value and
// fixed-point X/Y derivatives, applies coverage-based centroid snapping,
// and clamps the 9-bit intermediate down to 8 bits per channel.
//   rgba     - attribute value at the interpolation base (fixed point)
//   drgba_dx - per-pixel X derivative (fixed point)
//   drgba_dy - per-pixel Y derivative (fixed point)
//   dx       - pixel distance from the interpolation base
//   coverage - subsample coverage mask (bit layout documented below)
u8x4 interpolate_rgba(ivec4 rgba, ivec4 drgba_dx, ivec4 drgba_dy, int dx, int coverage)
{
    // Derivative is quantized (& ~0x1f) and scaled down for upscaled rendering.
    rgba += ((drgba_dx & ~0x1f) >> SCALING_LOG2) * dx;
    // RGBA is interpolated to 9-bit. The last bit is used to deal with clamping.
    // Slight underflow below 0 is clamped to 0 and slight overflow above 0xff is clamped to 0xff.
    // Keep 2 sign bits of precision before we complete the centroid interpolation.
    i16x4 snapped_rgba = i16x4(rgba >> 14);
    // Centroid clipping is based on the first coverage bit, and we interpolate at the first subpixel in scanline order.
    // With this layout we can just use findLSB to get correct result.
    // 0x01 0x02
    // 0x04 0x08
    // 0x10 0x20
    // 0x40 0x80
    int first_coverage = findLSB(coverage);
    i16 yoff = i16(first_coverage >> 1);
    i16 xoff = i16((first_coverage & 1) << 1) + (yoff & I16_C(1));
    snapped_rgba <<= I16_C(2 + SCALING_LOG2);
    snapped_rgba += xoff * i16x4(drgba_dx >> 14) + yoff * i16x4(drgba_dy >> 14);
    snapped_rgba >>= I16_C(4 + SCALING_LOG2);
    return clamp_9bit(snapped_rgba);
}
// Computes texel coordinates (ST) for the copy pipe.
// Interpolation is snapped to once per N output pixels (fb_info.dx_mask/dx_shift);
// s_offset returns the residual pixel offset within that snap interval.
// Note: the overflow flag from perspective_divide is computed but discarded here.
void interpolate_st_copy(SpanSetup span, ivec4 dstzw_dx, int x, bool perspective, bool flip,
                         out ivec2 st, out int s_offset)
{
    // Distance from the span edge; direction depends on the flip flag.
    int dx = flip ? (x - span.start_x) : (span.end_x - x);
    // For copy pipe, we should duplicate pixels when scaling, there is no filtering we can (or should!) do.
    dx >>= SCALING_LOG2;
    // Snap DX to where we perform interpolation (once per N output pixels).
    int snapped_dx = dx & global_constants.fb_info.dx_mask;
    s_offset = dx - snapped_dx;
    int lerp_dx = (dx >> global_constants.fb_info.dx_shift) * (flip ? 1 : -1);
    ivec3 stw = span.stzw.xyw + (dstzw_dx.xyw & ~0x1f) * lerp_dx;
    if (perspective)
    {
        bool st_overflow;
        st = perspective_divide(stw >> 16, st_overflow);
    }
    else
        st = no_perspective_divide(stw >> 16);
}
// Interpolates a single ST coordinate pair at pixel offset dx from the
// interpolation base, with optional perspective correction.
// The perspective overflow flag is computed but not reported to the caller.
ivec2 interpolate_st_single(ivec4 stzw, ivec4 dstzw_dx, int dx, bool perspective)
{
    // Quantized derivative step, scaled down for upscaled rendering,
    // then truncated from fixed point to integer ST/W.
    ivec3 attr = (stzw.xyw + ((dstzw_dx.xyw & ~0x1f) >> SCALING_LOG2) * dx) >> 16;

    if (!perspective)
        return no_perspective_divide(attr);

    bool overflow_unused;
    return perspective_divide(attr, overflow_unused);
}
// Interpolates ST (with optional perspective divide), the neighboring
// ST derivatives used for LOD computation, and the snapped/clamped Z value
// for one pixel at offset dx from the interpolation base.
//   coverage       - subsample mask; its first set bit selects the centroid subpixel
//   flip_direction - sign/step used for the LOD neighbor sample in X
//   st_overflow    - accumulates perspective-divide overflow across calls (inout)
// NOTE: st_dx/st_dy are only written when uses_lod is true; callers must not
// read them otherwise.
void interpolate_stz(ivec4 stzw, ivec4 dstzw_dx, ivec4 dstzw_dy, int dx, int coverage, bool perspective, bool uses_lod,
                     int flip_direction, out ivec2 st, out ivec2 st_dx, out ivec2 st_dy, out int z, inout bool st_overflow)
{
    ivec3 stw = stzw.xyw + ((dstzw_dx.xyw & ~0x1f) >> SCALING_LOG2) * dx;
    ivec3 stw_dx, stw_dy;
    if (uses_lod)
    {
        // Neighbor samples one step away in X and Y, used for LOD deltas.
        stw_dx = stw + flip_direction * ((dstzw_dx.xyw & ~0x1f) >> SCALING_LOG2);
        if (SCALING_FACTOR > 1)
            stw_dy = stw + abs(flip_direction) * ((dstzw_dy.xyw & ~0x7fff) >> SCALING_LOG2);
        else
            stw_dy = stw + ((dstzw_dy.xyw & ~0x7fff) >> SCALING_LOG2);
    }
    if (perspective)
    {
        st = perspective_divide(stw >> 16, st_overflow);
        if (uses_lod)
        {
            st_dx = perspective_divide(stw_dx >> 16, st_overflow);
            st_dy = perspective_divide(stw_dy >> 16, st_overflow);
        }
    }
    else
    {
        st = no_perspective_divide(stw >> 16);
        if (uses_lod)
        {
            st_dx = no_perspective_divide(stw_dx >> 16);
            st_dy = no_perspective_divide(stw_dy >> 16);
        }
    }
    // Ensure that interpolation snaps as we expect on every "main" pixel,
    // for subpixels, interpolate with quantized step factor.
    z = stzw.z + dstzw_dx.z * (dx >> SCALING_LOG2) + (dstzw_dx.z >> SCALING_LOG2) * (dx & (SCALING_FACTOR - 1));
    int snapped_z = z >> 10;
    // Centroid snapping on Z, mirroring the scheme used in interpolate_rgba.
    int first_coverage = findLSB(coverage);
    int yoff = first_coverage >> 1;
    int xoff = ((first_coverage & 1) << 1) + (yoff & I16_C(1));
    snapped_z <<= 2 + SCALING_LOG2;
    snapped_z += xoff * (dstzw_dx.z >> 10) + yoff * (dstzw_dy.z >> 10);
    snapped_z >>= 5 + SCALING_LOG2;
    z = clamp_z(snapped_z);
}
// Disabled (compiled-out) reference implementations: these interpolate
// attributes directly from TriangleSetup/AttributeSetup per pixel, rather than
// from precomputed span data as the live paths above do. Kept for reference.
#if 0
// Reference RGBA interpolation straight from the edge-walker setup.
u8x4 interpolate_rgba(TriangleSetup setup, AttributeSetup attr, int x, int y, int coverage)
{
    bool do_offset = (setup.flags & TRIANGLE_SETUP_DO_OFFSET_BIT) != 0;
    int y_interpolation_base = int(setup.yh) >> 2;
    int xh = setup.xh + (y - y_interpolation_base) * (setup.dxhdy << 2);
    ivec4 drgba_diff = ivec4(0);
    // In do_offset mode, varyings are latched at last subpixel line instead of first (for some reason).
    if (do_offset)
    {
        xh += 3 * setup.dxhdy;
        ivec4 drgba_deh = attr.drgba_de & ~0x1ff;
        ivec4 drgba_dyh = attr.drgba_dy & ~0x1ff;
        drgba_diff = drgba_deh - (drgba_deh >> 2) - drgba_dyh + (drgba_dyh >> 2);
    }
    int base_x = xh >> 16;
    int xfrac = (xh >> 8) & 0xff;
    ivec4 rgba = attr.rgba;
    rgba += attr.drgba_de * (y - y_interpolation_base);
    rgba = ((rgba & ~0x1ff) + drgba_diff - xfrac * ((attr.drgba_dx >> 8) & ~1)) & ~0x3ff;
    rgba += (attr.drgba_dx & ~0x1f) * (x - base_x);
    // RGBA is interpolated to 9-bit. The last bit is used to deal with clamping.
    // Slight underflow below 0 is clamped to 0 and slight overflow above 0xff is clamped to 0xff.
    // Keep 2 sign bits of precision before we complete the centroid interpolation.
    i16x4 snapped_rgba = i16x4(rgba >> 14);
    // Centroid clipping is based on the first coverage bit, and we interpolate at the first subpixel in scanline order.
    // FWIW, Angrylion has a very different coverage bit assignment, but we need this layout to avoid an awkward LUT.
    // With this layout we can just use findLSB instead.
    // 0x01 0x02
    // 0x04 0x08
    // 0x10 0x20
    // 0x40 0x80
    int first_coverage = findLSB(coverage);
    i16 yoff = i16(first_coverage >> 1);
    i16 xoff = i16((first_coverage & 1) << 1) + (yoff & I16_C(1));
    snapped_rgba <<= I16_C(2);
    snapped_rgba += xoff * i16x4(attr.drgba_dx >> 14) + yoff * i16x4(attr.drgba_dy >> 14);
    snapped_rgba >>= I16_C(4);
    return clamp_9bit(snapped_rgba);
}
// Reference ST/W interpolation straight from the edge-walker setup.
ivec3 interpolate_stw(TriangleSetup setup, AttributeSetup attr, int x, int y)
{
    bool do_offset = (setup.flags & TRIANGLE_SETUP_DO_OFFSET_BIT) != 0;
    int y_interpolation_base = int(setup.yh) >> 2;
    int xh = setup.xh + (y - y_interpolation_base) * (setup.dxhdy << 2);
    ivec3 dstw_diff = ivec3(0);
    // In do_offset mode, varyings are latched at last subpixel line instead of first (for some reason).
    if (do_offset)
    {
        xh += 3 * setup.dxhdy;
        ivec3 dstw_deh = attr.dstzw_de.xyw & ~0x1ff;
        ivec3 dstw_dyh = attr.dstzw_dy.xyw & ~0x1ff;
        dstw_diff = dstw_deh - (dstw_deh >> 2) - dstw_dyh + (dstw_dyh >> 2);
    }
    int base_x = xh >> 16;
    int xfrac = (xh >> 8) & 0xff;
    ivec3 stw = attr.stzw.xyw;
    stw += attr.dstzw_de.xyw * (y - y_interpolation_base);
    stw = ((stw & ~0x1ff) + dstw_diff - xfrac * ((attr.dstzw_dx.xyw >> 8) & ~1)) & ~0x3ff;
    stw += (attr.dstzw_dx.xyw & ~0x1f) * (x - base_x);
    ivec3 snapped_stw = stw >> 16;
    return snapped_stw;
}
// Reference Z interpolation with centroid snapping.
int interpolate_z(TriangleSetup setup, AttributeSetup attr, int x, int y, int coverage)
{
    bool do_offset = (setup.flags & TRIANGLE_SETUP_DO_OFFSET_BIT) != 0;
    int y_interpolation_base = int(setup.yh) >> 2;
    int xh = setup.xh + (y - y_interpolation_base) * (setup.dxhdy << 2);
    int dzdiff = 0;
    // In do_offset mode, varyings are latched at last subpixel line instead of first (for some reason).
    if (do_offset)
    {
        xh += 3 * setup.dxhdy;
        int dzdeh = attr.dstzw_de.z & ~0x1ff;
        int dzdyh = attr.dstzw_dy.z & ~0x1ff;
        dzdiff = dzdeh - (dzdeh >> 2) - dzdyh + (dzdyh >> 2);
    }
    int base_x = xh >> 16;
    int xfrac = (xh >> 8) & 0xff;
    int z = attr.stzw.z;
    z += attr.dstzw_de.z * (y - y_interpolation_base);
    z = ((z & ~0x1ff) + dzdiff - xfrac * ((attr.dstzw_dx.z >> 8) & ~1)) & ~0x3ff;
    z += attr.dstzw_dx.z * (x - base_x);
    int snapped_z = z >> 10;
    int first_coverage = findLSB(coverage);
    int yoff = first_coverage >> 1;
    // NOTE(review): the '1s' literal below differs from the I16_C(1) used in the
    // live interpolate_stz path; harmless while this block stays compiled out,
    // but verify the literal suffix if this code is ever re-enabled.
    int xoff = ((first_coverage & 1) << 1) + (yoff & 1s);
    snapped_z <<= 2;
    snapped_z += xoff * (attr.dstzw_dx.z >> 10) + yoff * (attr.dstzw_dy.z >> 10);
    snapped_z >>= 5;
    return clamp_z(snapped_z);
}
#endif
#endif

View File

@ -0,0 +1,31 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LOAD_ATTRIBUTE_SETUP_H_
#define LOAD_ATTRIBUTE_SETUP_H_
// Fetches the AttributeSetup for one primitive from the attribute_setup buffer.
AttributeSetup load_attribute_setup(uint index)
{
    return attribute_setup.elems[index];
}
#endif

View File

@ -0,0 +1,41 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LOAD_DEPTH_BLEND_STATE_H_
#define LOAD_DEPTH_BLEND_STATE_H_
// Fetches the DepthBlendState for one primitive. Without SMALL_TYPES support,
// the buffer stores widened integer fields, so each member is narrowed back to
// its 8-bit representation here (the last two u8(0) fields are padding).
DepthBlendState load_depth_blend_state(uint index)
{
#if SMALL_TYPES
    return depth_blend_state.elems[index];
#else
    return DepthBlendState(
        u8x4(depth_blend_state.elems[index].blend_modes0),
        u8x4(depth_blend_state.elems[index].blend_modes1),
        depth_blend_state.elems[index].flags,
        u8(depth_blend_state.elems[index].coverage_mode),
        u8(depth_blend_state.elems[index].z_mode),
        u8(0), u8(0));
#endif
}
#endif

View File

@ -0,0 +1,50 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LOAD_DERIVED_SETUP_H_
#define LOAD_DERIVED_SETUP_H_
// Fetches the DerivedSetup (combiner constants, fog/blend colors, fill color,
// dz, min LOD, factors) for one primitive. Without SMALL_TYPES support, the
// widened buffer fields are narrowed back to their small-type representation.
DerivedSetup load_derived_setup(uint index)
{
#if SMALL_TYPES
    return derived_setup.elems[index];
#else
    return DerivedSetup(
        u8x4(derived_setup.elems[index].constant_muladd0),
        u8x4(derived_setup.elems[index].constant_mulsub0),
        u8x4(derived_setup.elems[index].constant_mul0),
        u8x4(derived_setup.elems[index].constant_add0),
        u8x4(derived_setup.elems[index].constant_muladd1),
        u8x4(derived_setup.elems[index].constant_mulsub1),
        u8x4(derived_setup.elems[index].constant_mul1),
        u8x4(derived_setup.elems[index].constant_add1),
        u8x4(derived_setup.elems[index].fog_color),
        u8x4(derived_setup.elems[index].blend_color),
        uint(derived_setup.elems[index].fill_color),
        u16(derived_setup.elems[index].dz),
        u8(derived_setup.elems[index].dz_compressed),
        u8(derived_setup.elems[index].min_lod),
        i16x4(derived_setup.elems[index].factors));
#endif
}
#endif

View File

@ -0,0 +1,32 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LOAD_SCISSOR_STATE_H_
#define LOAD_SCISSOR_STATE_H_
// Fetches the scissor rectangle for one primitive; the four packed ivec4
// components map onto the ScissorState constructor arguments in order.
ScissorState load_scissor_state(uint index)
{
    ivec4 packed_scissor = scissor_state.elems[index];
    return ScissorState(packed_scissor.x, packed_scissor.y,
                        packed_scissor.z, packed_scissor.w);
}
#endif

View File

@ -0,0 +1,31 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LOAD_SPAN_OFFSETS_H_
#define LOAD_SPAN_OFFSETS_H_
// Fetches the SpanInfoOffsets entry for one primitive from the span_offsets buffer.
SpanInfoOffsets load_span_offsets(uint index)
{
    return span_offsets.elems[index];
}
#endif

View File

@ -0,0 +1,44 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LOAD_SPAN_SETUP_H_
#define LOAD_SPAN_SETUP_H_
// Fetches the SpanSetup (per-scanline interpolation state) for one span.
// Without SMALL_TYPES support, the widened buffer fields are narrowed back
// to their 16-bit representations.
SpanSetup load_span_setup(uint index)
{
#if SMALL_TYPES
    return span_setups.elems[index];
#else
    return SpanSetup(
        span_setups.elems[index].rgba,
        span_setups.elems[index].stzw,
        u16x4(uvec4(span_setups.elems[index].xleft)),
        u16x4(uvec4(span_setups.elems[index].xright)),
        span_setups.elems[index].interpolation_base_x,
        span_setups.elems[index].start_x,
        span_setups.elems[index].end_x,
        i16(span_setups.elems[index].lodlength),
        u16(span_setups.elems[index].valid_line));
#endif
}
#endif

View File

@ -0,0 +1,42 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LOAD_STATIC_RASTER_STATE_H_
#define LOAD_STATIC_RASTER_STATE_H_
// Fetches the StaticRasterizationState (combiner inputs, flags, dither mode)
// for one primitive. Without SMALL_TYPES support, the widened combiner input
// fields are narrowed back to u8x4 (the trailing 0, 0 fields are padding).
StaticRasterizationState load_static_rasterization_state(uint index)
{
#if SMALL_TYPES
    return static_raster_state.elems[index];
#else
    return StaticRasterizationState(
        u8x4(static_raster_state.elems[index].combiner_inputs_rgb0),
        u8x4(static_raster_state.elems[index].combiner_inputs_alpha0),
        u8x4(static_raster_state.elems[index].combiner_inputs_rgb1),
        u8x4(static_raster_state.elems[index].combiner_inputs_alpha1),
        static_raster_state.elems[index].flags,
        static_raster_state.elems[index].dither,
        0, 0);
#endif
}
#endif

View File

@ -0,0 +1,49 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LOAD_TILE_INFO_H_
#define LOAD_TILE_INFO_H_
// Fetches the TileInfo (TMEM tile descriptor: S/T bounds, offset, stride,
// format/size, palette, mask/shift, flags) for one tile. Without SMALL_TYPES
// support, the widened buffer fields are narrowed back to u8.
TileInfo load_tile_info(uint index)
{
#if SMALL_TYPES
    return tile_infos.elems[index];
#else
    return TileInfo(
        tile_infos.elems[index].slo,
        tile_infos.elems[index].shi,
        tile_infos.elems[index].tlo,
        tile_infos.elems[index].thi,
        tile_infos.elems[index].offset,
        tile_infos.elems[index].stride,
        u8(tile_infos.elems[index].fmt),
        u8(tile_infos.elems[index].size),
        u8(tile_infos.elems[index].palette),
        u8(tile_infos.elems[index].mask_s),
        u8(tile_infos.elems[index].shift_s),
        u8(tile_infos.elems[index].mask_t),
        u8(tile_infos.elems[index].shift_t),
        u8(tile_infos.elems[index].flags));
#endif
}
#endif

View File

@ -0,0 +1,46 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LOAD_TRIANGLE_SETUP_H_
#define LOAD_TRIANGLE_SETUP_H_
// Fetches the TriangleSetup (edge-walker state: X positions, Y bounds, edge
// slopes, flags, tile) for one primitive. Without SMALL_TYPES support, the
// widened buffer fields are narrowed back to i16/u8.
TriangleSetup load_triangle_setup(uint index)
{
#if SMALL_TYPES
    return triangle_setup.elems[index];
#else
    return TriangleSetup(
        triangle_setup.elems[index].xh,
        triangle_setup.elems[index].xm,
        triangle_setup.elems[index].xl,
        i16(triangle_setup.elems[index].yh),
        i16(triangle_setup.elems[index].ym),
        triangle_setup.elems[index].dxhdy,
        triangle_setup.elems[index].dxmdy,
        triangle_setup.elems[index].dxldy,
        i16(triangle_setup.elems[index].yl),
        u8(triangle_setup.elems[index].flags),
        u8(triangle_setup.elems[index].tile));
#endif
}
#endif

View File

@ -0,0 +1,70 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
// Workgroup size is a specialization constant; each workgroup processes one page.
layout(local_size_x_id = 0) in;
// Number of 32-bit words per page.
layout(constant_id = 1) const int PAGE_STRIDE = 256;

// Destination RDRAM words, merged in place.
layout(set = 0, binding = 0, std430) buffer RDRAM
{
    uint rdram[];
};

// Source words merged into rdram where the write mask permits.
layout(set = 0, binding = 1, std430) readonly buffer StagingRDRAM
{
    uint staging_rdram[];
};

// Per-word bit mask: set bits keep the existing rdram bits,
// clear bits take the staging bits (see main below).
layout(set = 0, binding = 2, std430) readonly buffer WriteMaskRDRAM
{
    uint writemask[];
};

// Page offsets to process; indexed by workgroup ID (4 offsets per uvec4).
layout(set = 1, binding = 0, std140) uniform UBO
{
    uvec4 offsets[1024];
};
void main()
{
    // Each workgroup handles one page from the offset table;
    // each invocation merges one 32-bit word of that page.
    uint page = offsets[gl_WorkGroupID.x >> 2u][gl_WorkGroupID.x & 3u];
    uint index = page * PAGE_STRIDE + gl_LocalInvocationIndex;

    uint mask = writemask[index];
    if (mask == ~0u)
        return; // Fully masked: the existing rdram word is kept untouched.

    uint staging = staging_rdram[index];
    if (mask == 0u)
    {
        // Unmasked: take the staging word wholesale.
        rdram[index] = staging;
    }
    else
    {
        // Partially masked: keep mask bits from rdram, take the rest from staging.
        rdram[index] = (rdram[index] & mask) | (staging & ~mask);
    }
}

View File

@ -0,0 +1,582 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef MEMORY_INTERFACING_H_
#define MEMORY_INTERFACING_H_
#include "dither.h"
#include "z_encode.h"
#include "blender.h"
#include "depth_test.h"
#include "coverage.h"
#include "fb_formats.h"
layout(constant_id = 0) const uint RDRAM_SIZE = 0;
layout(constant_id = 7) const int RDRAM_INCOHERENT_SCALING = 0;

// Bit 0 of RDRAM_INCOHERENT_SCALING flags incoherent RDRAM;
// the remaining bits hold log2 of the upscaling factor.
const bool RDRAM_INCOHERENT = (RDRAM_INCOHERENT_SCALING & 1) != 0;
const int SCALING_LOG2 = RDRAM_INCOHERENT_SCALING >> 1;
const int SCALING_FACTOR = 1 << SCALING_LOG2;

// Write-mask bookkeeping is only needed when RDRAM is incoherent;
// its layout differs between unscaled and upscaled rendering.
const bool RDRAM_UNSCALED_WRITE_MASK = RDRAM_INCOHERENT && SCALING_LOG2 == 0;
const bool RDRAM_SCALED_WRITE_MASK = RDRAM_INCOHERENT && SCALING_LOG2 != 0;

// Index masks for byte / halfword / word addressing within RDRAM.
const uint RDRAM_MASK_8 = RDRAM_SIZE - 1u;
const uint RDRAM_MASK_16 = RDRAM_MASK_8 >> 1u;
const uint RDRAM_MASK_32 = RDRAM_MASK_8 >> 2u;

layout(constant_id = 1) const int FB_FMT = 0;
layout(constant_id = 2) const bool FB_COLOR_DEPTH_ALIAS = false;

// Working state for the framebuffer texel currently being processed.
u8x4 current_color;
bool current_color_dirty;   // set when current_color must be written back
u16 current_depth;
u8 current_dz;
bool current_depth_dirty;   // set when current_depth/current_dz must be written back
// Loads the current framebuffer color from VRAM into current_color,
// decoding according to the compile-time FB_FMT.
//   index - pixel index within the (unscaled) framebuffer
//   slice - subsample slice for upscaled rendering (0 when unscaled)
// The ^3u / ^1u index swizzles swap byte/halfword order within a 32-bit word.
void load_vram_color(uint index, uint slice)
{
    switch (FB_FMT)
    {
    case FB_FMT_I4:
    case FB_FMT_I8:
    {
        // Intensity byte splatted to RGB; alpha comes from hidden RDRAM bits.
        index &= RDRAM_MASK_8;
        index += slice * RDRAM_SIZE;
        u8 word = u8(vram8.data[index ^ 3u]);
        current_color = u8x4(word, word, word, u8(hidden_vram.data[index >> 1]));
        break;
    }
    case FB_FMT_RGBA5551:
    {
        // Expand 5-bit channels into the top bits of each 8-bit channel;
        // alpha packs the hidden bits and the halfword's low bit.
        index &= RDRAM_MASK_16;
        index += slice * (RDRAM_SIZE >> 1);
        uint word = uint(vram16.data[index ^ 1u]);
        uvec3 rgb = uvec3(word >> 8u, word >> 3u, word << 2u) & 0xf8u;
        current_color = u8x4(rgb, (u8(hidden_vram.data[index]) << U8_C(5)) | u8((word & 1) << 7));
        break;
    }
    case FB_FMT_IA88:
    {
        // High byte is intensity (splatted to RGB), low byte is alpha.
        index &= RDRAM_MASK_16;
        index += slice * (RDRAM_SIZE >> 1);
        uint word = uint(vram16.data[index ^ 1u]);
        current_color = u8x4(u8x3(word >> 8u), word & 0xff);
        break;
    }
    case FB_FMT_RGBA8888:
    {
        // Straight 8-bit-per-channel unpack from a 32-bit word.
        index &= RDRAM_MASK_32;
        index += slice * (RDRAM_SIZE >> 2);
        uint word = vram32.data[index];
        current_color = u8x4((uvec4(word) >> uvec4(24, 16, 8, 0)) & uvec4(0xff));
        break;
    }
    }
}
// Reinterprets the loaded color representation as depth/dz, used when the
// color and depth images alias the same memory. Only the 16-bit formats have
// an aliased depth interpretation here.
void alias_color_to_depth()
{
    /* Inherit memory depth from color. */
    switch (FB_FMT)
    {
    case FB_FMT_RGBA5551:
    {
        // dz from alpha's high bits plus one blue bit; depth from the packed
        // 5-5-5 channel bits reassembled into a 14-bit value.
        current_dz = (current_color.a >> U8_C(3)) | (current_color.b & U8_C(8));
        uint word = (current_color.r & 0xf8u) << 6u;
        word |= (current_color.g & 0xf8u) << 1u;
        word |= (current_color.b & 0xf8u) >> 4u;
        current_depth = u16(word);
        break;
    }
    case FB_FMT_IA88:
    {
        // Rebuild the 16-bit halfword from intensity/alpha, then split it:
        // top 14 bits = depth, bits 1:0 = dz high bits, bit 0 also expands
        // into the hidden-bit portion of dz.
        uvec2 col = current_color.ra;
        uint word = (col.x << 8u) | col.y;
        uint hidden_word = (word & 1u) * 3u;
        current_depth = u16(word >> 2u);
        current_dz = u8(((word & 3u) << 2u) | hidden_word);
        break;
    }
    }
}
// Inverse of alias_color_to_depth: packs current_depth/current_dz back into
// the color representation and marks the color dirty so it will be stored.
void alias_depth_to_color()
{
    // 18-bit packed value: depth in the top bits, dz in the low 4 bits.
    uint word = (uint(current_depth) << 4u) | current_dz;
    switch (FB_FMT)
    {
    case FB_FMT_RGBA5551:
    {
        // Scatter the packed bits across the 5-5-5 channels; low 3 bits of
        // the packed word land in alpha's high bits.
        current_color.r = u8((word >> 10u) & 0xf8u);
        current_color.g = u8((word >> 5u) & 0xf8u);
        current_color.b = u8((word >> 0u) & 0xf8u);
        current_color.a = u8((word & 7u) << 5u);
        break;
    }
    case FB_FMT_IA88:
    {
        // Intensity takes the high byte, alpha the next byte of the packed word.
        current_color.r = u8((word >> 10u) & 0xffu);
        current_color.a = u8((word >> 2u) & 0xffu);
        break;
    }
    }
    current_color_dirty = true;
}
// Loads the current depth value from VRAM: the top 14 bits of the halfword
// hold depth; dz gets bits 3:2 from the halfword's low bits and bits 1:0
// from hidden RDRAM.
void load_vram_depth(uint index, uint slice)
{
    index &= RDRAM_MASK_16;
    index += slice * (RDRAM_SIZE >> 1);
    u16 word = u16(vram16.data[index ^ 1u]);
    current_depth = word >> U16_C(2);
    current_dz = u8(hidden_vram.data[index]) | u8((word & U16_C(3)) << U16_C(2));
}
// Writes current_color back to VRAM in the compile-time FB_FMT, but only when
// it was modified (current_color_dirty). For unscaled incoherent RDRAM, also
// marks the written word in the write-mask region placed after RDRAM so the
// masked resolve pass knows the word was touched.
void store_vram_color(uint index, uint slice)
{
    //GENERIC_MESSAGE1(index);
    if (current_color_dirty)
    {
        switch (FB_FMT)
        {
        case FB_FMT_I4:
        {
            // Color byte is written as zero for I4; only the hidden bits
            // (on odd byte indices) carry alpha.
            index &= RDRAM_MASK_8;
            index += slice * RDRAM_SIZE;
            vram8.data[index ^ 3u] = mem_u8(0);
            if ((index & 1u) != 0u)
                hidden_vram.data[index >> 1u] = mem_u8(current_color.a);
            if (RDRAM_UNSCALED_WRITE_MASK)
            {
                // Need this memory barrier to ensure the mask readback does not read
                // an invalid value from RDRAM. If the mask is seen, the valid RDRAM value is
                // also coherent.
                memoryBarrierBuffer();
                vram8.data[(index ^ 3u) + RDRAM_SIZE] = mem_u8(0xff);
            }
            break;
        }
        case FB_FMT_I8:
        {
            // Intensity byte from red; hidden bits replicate its LSB.
            index &= RDRAM_MASK_8;
            index += slice * RDRAM_SIZE;
            vram8.data[index ^ 3u] = mem_u8(current_color.r);
            if ((index & 1u) != 0u)
                hidden_vram.data[index >> 1u] = mem_u8((current_color.r & 1) * 3);
            if (RDRAM_UNSCALED_WRITE_MASK)
            {
                // Need this memory barrier to ensure the mask readback does not read
                // an invalid value from RDRAM. If the mask is seen, the valid RDRAM value is
                // also coherent.
                memoryBarrierBuffer();
                vram8.data[(index ^ 3u) + RDRAM_SIZE] = mem_u8(0xff);
            }
            break;
        }
        case FB_FMT_RGBA5551:
        {
            // Pack 5-bit channels plus one coverage bit into the halfword;
            // remaining coverage bits go to hidden RDRAM.
            index &= RDRAM_MASK_16;
            index += slice * (RDRAM_SIZE >> 1);
            uvec4 c = uvec4(current_color);
            c.rgb &= 0xf8u;
            uint cov = c.w >> 5u;
            uint word = (c.x << 8u) | (c.y << 3u) | (c.z >> 2u) | (cov >> 2u);
            vram16.data[index ^ 1u] = mem_u16(word);
            hidden_vram.data[index] = mem_u8(cov & U8_C(3));
            if (RDRAM_UNSCALED_WRITE_MASK)
            {
                // Need this memory barrier to ensure the mask readback does not read
                // an invalid value from RDRAM. If the mask is seen, the valid RDRAM value is
                // also coherent.
                memoryBarrierBuffer();
                vram16.data[(index ^ 1u) + (RDRAM_SIZE >> 1u)] = mem_u16(0xffff);
            }
            break;
        }
        case FB_FMT_IA88:
        {
            // Intensity in the high byte, alpha in the low byte;
            // hidden bits replicate alpha's LSB.
            index &= RDRAM_MASK_16;
            index += slice * (RDRAM_SIZE >> 1);
            uvec2 col = current_color.ra;
            uint word = (col.x << 8u) | col.y;
            vram16.data[index ^ 1u] = mem_u16(word);
            hidden_vram.data[index] = mem_u8((col.y & 1) * 3);
            if (RDRAM_UNSCALED_WRITE_MASK)
            {
                // Need this memory barrier to ensure the mask readback does not read
                // an invalid value from RDRAM. If the mask is seen, the valid RDRAM value is
                // also coherent.
                memoryBarrierBuffer();
                vram16.data[(index ^ 1u) + (RDRAM_SIZE >> 1u)] = mem_u16(0xffff);
            }
            break;
        }
        case FB_FMT_RGBA8888:
        {
            // Full 32-bit pack; a 32-bit pixel spans two hidden-bit entries
            // (one per halfword), each replicating the respective byte's LSB.
            index &= RDRAM_MASK_32;
            index += slice * (RDRAM_SIZE >> 2);
            uvec4 col = current_color;
            uint word = (col.r << 24u) | (col.g << 16u) | (col.b << 8u) | (col.a << 0u);
            vram32.data[index] = word;
            hidden_vram.data[2u * index] = mem_u8((current_color.g & 1) * 3);
            hidden_vram.data[2u * index + 1u] = mem_u8((current_color.a & 1) * 3);
            if (RDRAM_UNSCALED_WRITE_MASK)
            {
                // Need this memory barrier to ensure the mask readback does not read
                // an invalid value from RDRAM. If the mask is seen, the valid RDRAM value is
                // also coherent.
                memoryBarrierBuffer();
                vram32.data[index + (RDRAM_SIZE >> 2u)] = ~0u;
            }
            break;
        }
        }
    }
}
// Flushes the cached per-pixel depth value back to emulated RDRAM.
// index: unscaled 16-bit word index into the depth buffer region.
// slice: upscaled sub-sample slice this invocation owns (0 when not upscaling).
// No-op when color and depth alias the same address (the color path writes instead)
// or when the depth value was never modified this tile.
void store_vram_depth(uint index, uint slice)
{
	if (!FB_COLOR_DEPTH_ALIAS)
	{
		//GENERIC_MESSAGE1(index);
		if (current_depth_dirty)
		{
			index &= RDRAM_MASK_16;
			index += slice * (RDRAM_SIZE >> 1);
			// 18-bit depth is split: 16 bits (14-bit compressed Z + top 2 bits of dz)
			// go to RDRAM, the low 2 dz bits go to the hidden 9th-bit RAM.
			// index ^ 1u compensates for byte-swapped RDRAM layout.
			vram16.data[index ^ 1u] = mem_u16((current_depth << U16_C(2)) | (current_dz >> U16_C(2)));
			hidden_vram.data[index] = mem_u8(current_dz & U16_C(3));
			if (RDRAM_UNSCALED_WRITE_MASK)
			{
				// Need this memory barrier to ensure the mask readback does not read
				// an invalid value from RDRAM. If the mask is seen, the valid RDRAM value is
				// also coherent.
				memoryBarrierBuffer();
				vram16.data[(index ^ 1) + (RDRAM_SIZE >> 1u)] = mem_u16(0xffff);
			}
		}
	}
}
// Unscaled color framebuffer index of the pixel this invocation shades.
// Latched by init_tile and later consumed by fill_color to pick the correct
// sub-word of the 32-bit fill pattern.
uint color_fb_index;
// Loads the current color and depth values for one (possibly upscaled) pixel
// into the per-invocation cache and clears the dirty flags.
// coord is in scaled pixel units; fb_width/fb_height are the scaled dimensions.
// fb_addr_index / fb_depth_addr_index are the base word indices of the two buffers.
void init_tile(uvec2 coord, uint fb_width, uint fb_height, uint fb_addr_index, uint fb_depth_addr_index)
{
	current_color_dirty = false;
	current_depth_dirty = false;
	if (all(lessThan(coord, uvec2(fb_width, fb_height))))
	{
		// Decompose the scaled coordinate into an unscaled pixel plus a
		// sub-sample "slice" index selecting one upscaled RDRAM copy.
		uvec2 slice2d = coord & (SCALING_FACTOR - 1);
		coord >>= SCALING_LOG2;
		uint slice = slice2d.y * SCALING_FACTOR + slice2d.x;
		uint index = fb_addr_index + (fb_width >> SCALING_LOG2) * coord.y + coord.x;
		color_fb_index = index;
		load_vram_color(index, slice);
		index = fb_depth_addr_index + (fb_width >> SCALING_LOG2) * coord.y + coord.x;
		load_vram_depth(index, slice);
	}
}
// Records which unscaled pixels had their color and/or depth written this tile,
// so the later resolve pass only touches pixels that actually changed.
// unscaled_coord is the 1x pixel coordinate; unscaled_fb_width the 1x width.
void emit_scaled_write_masks(uvec2 unscaled_coord, uint unscaled_fb_width)
{
	// Merge write masks across pixels.
	// We reserved a chunk of memory after scaled RDRAM to store 2 bits per pixel holding
	// a write mask for color and depth. The resolve stage will only resolve a pixel
	// and trigger a write if any sub-sample was marked as written.
	// Write masks are organized in 4x4 blocks of unscaled pixels for locality purposes.
	// This guarantees a minimum number of loop iterations to resolve the write masks.
	uint unscaled_block = (unscaled_coord.y >> 2u) * ((unscaled_fb_width + 3u) >> 2u) + (unscaled_coord.x >> 2u);
	uvec2 unscaled_sub = unscaled_coord & 3u;
	// Bit 0 = color written, bit 1 = depth written; shifted into this pixel's
	// 2-bit field inside the 32-bit word covering its 4x4 block.
	uint word = uint(current_color_dirty) + 2u * uint(current_depth_dirty);
	word <<= 2u * (unscaled_sub.x + unscaled_sub.y * 4u);
#if SUBGROUP
	// Reduce within the subgroup first so only one atomic is issued per block.
	// This loop peels off one unique block value per iteration; in practice all
	// active lanes usually share the same block, so it should only need one iteration.
	bool is_active = true;
	do
	{
		if (subgroupBroadcastFirst(unscaled_block) == unscaled_block)
		{
			uint merged = subgroupOr(word);
			if (subgroupElect())
				atomicOr(vram32.data[SCALING_FACTOR * SCALING_FACTOR * (RDRAM_SIZE >> 2) + unscaled_block], merged);
			is_active = false;
		}
	} while (is_active);
#else
	// Just use atomics directly. With subgroup support, we can be a bit smarter about it.
	if (word != 0u)
		atomicOr(vram32.data[SCALING_FACTOR * SCALING_FACTOR * (RDRAM_SIZE >> 2) + unscaled_block], word);
#endif
}
// Counterpart to init_tile: flushes the cached color/depth values of one
// (possibly upscaled) pixel back to emulated RDRAM and, when scaled write
// masks are enabled, records which unscaled pixels were dirtied.
void finish_tile(uvec2 coord, uint fb_width, uint fb_height, uint fb_addr_index, uint fb_depth_addr_index)
{
	// Nothing to flush for invocations outside the scaled framebuffer.
	if (any(greaterThanEqual(coord, uvec2(fb_width, fb_height))))
		return;

	uint width_1x = fb_width >> SCALING_LOG2;
	// Split the scaled coordinate into an unscaled pixel and a sub-sample slice.
	uvec2 subpixel = coord & (SCALING_FACTOR - 1);
	coord >>= SCALING_LOG2;
	uint sample_slice = subpixel.y * SCALING_FACTOR + subpixel.x;

	uint pixel = fb_addr_index + width_1x * coord.y + coord.x;
	store_vram_color(pixel, sample_slice);
	pixel = fb_depth_addr_index + width_1x * coord.y + coord.x;
	store_vram_depth(pixel, sample_slice);

	if (RDRAM_SCALED_WRITE_MASK)
		emit_scaled_write_masks(coord, width_1x);
}
// Reconstructs the "memory color" the blender sees from the cached framebuffer
// pixel, per framebuffer format. The alpha channel carries memory coverage in
// its top 3 bits (bits [7:5]).
// image_read_en: when false, memory coverage is forced to full (7) as if the
// framebuffer were never read back.
u8x4 decode_memory_color(bool image_read_en)
{
	u8 memory_coverage = image_read_en ? (current_color.a & U8_C(0xe0)) : U8_C(0xe0);
	u8x3 color;
	switch (FB_FMT)
	{
	case FB_FMT_I4:
		// 4-bit intensity has no stored color or coverage; treat as black, full coverage.
		color = u8x3(0);
		memory_coverage = U8_C(0xe0);
		break;
	case FB_FMT_I8:
		// Intensity replicated to RGB; no coverage stored in 8-bit formats.
		color = current_color.rrr;
		memory_coverage = U8_C(0xe0);
		break;
	case FB_FMT_RGBA5551:
		// Only the top 5 bits of each channel are meaningful in 16-bit RGBA.
		color = current_color.rgb & U8_C(0xf8);
		break;
	case FB_FMT_IA88:
		color = current_color.rrr;
		break;
	case FB_FMT_RGBA8888:
		color = current_color.rgb;
		break;
	}
	return u8x4(color, memory_coverage);
}
// Stores a shaded color into the per-invocation framebuffer cache and marks it
// dirty so finish_tile will flush it. For I4 framebuffers there is no stored
// alpha/coverage byte, so only the RGB part is replaced.
void write_color(u8x4 col)
{
	if (FB_FMT != FB_FMT_I4)
		current_color = col;
	else
		current_color.rgb = col.rgb;
	current_color_dirty = true;
}
// Copy-pipe writeback: stores a raw texel word fetched by the copy pipeline
// directly into the framebuffer cache, decoding it per framebuffer format.
// word: raw texel bits produced by sample_texture_copy.
// primitive_index: unused here; kept so the call signature matches the other
// writeback paths.
void copy_pipeline(uint word, uint primitive_index)
{
	switch (FB_FMT)
	{
	case FB_FMT_I4:
	{
		// NOTE(review): I4 copies appear to always write zero here — presumably
		// real content for 4-bit targets is not representable in this cache.
		current_color = u8x4(0);
		current_color_dirty = true;
		break;
	}
	case FB_FMT_I8:
	{
		// Alpha testing needs to only look at the low dword for some bizarre reason.
		// I don't think alpha testing is supposed to be used at all with 8-bit FB ...
		word &= 0xffu;
		write_color(u8x4(word));
		break;
	}
	case FB_FMT_RGBA5551:
	{
		// Expand packed 5/5/5/1 into 8-bit channels; the single alpha bit
		// becomes full (0xe0) or zero coverage.
		uint r = (word >> 8) & 0xf8u;
		uint g = (word >> 3) & 0xf8u;
		uint b = (word << 2) & 0xf8u;
		uint a = (word & 1) * 0xe0u;
		write_color(u8x4(r, g, b, a));
		break;
	}
	}
	if (FB_COLOR_DEPTH_ALIAS)
		alias_color_to_depth();
}
// Fill-pipe writeback: writes the relevant slice of the 32-bit fill pattern
// into the framebuffer cache. The RDP fill word packs two 16-bit or four 8-bit
// pixels, so the pixel's position (color_fb_index, latched by init_tile)
// selects which sub-word applies.
void fill_color(uint col)
{
	switch (FB_FMT)
	{
	case FB_FMT_RGBA8888:
	{
		// 32-bit targets consume the whole fill word.
		uint r = (col >> 24u) & 0xffu;
		uint g = (col >> 16u) & 0xffu;
		uint b = (col >> 8u) & 0xffu;
		uint a = (col >> 0u) & 0xffu;
		write_color(u8x4(r, g, b, a));
		break;
	}
	case FB_FMT_RGBA5551:
	{
		// Even pixels take the high halfword, odd pixels the low halfword.
		col >>= ((color_fb_index & 1u) ^ 1u) * 16u;
		uint r = (col >> 8u) & 0xf8u;
		uint g = (col >> 3u) & 0xf8u;
		uint b = (col << 2u) & 0xf8u;
		uint a = (col & 1u) * 0xe0u;
		write_color(u8x4(r, g, b, a));
		break;
	}
	case FB_FMT_IA88:
	{
		col >>= ((color_fb_index & 1u) ^ 1u) * 16u;
		col &= 0xffffu;
		uint r = (col >> 8u) & 0xffu;
		uint a = (col >> 0u) & 0xffu;
		write_color(u8x4(r, r, r, a));
		break;
	}
	case FB_FMT_I8:
	{
		// 8-bit targets: pixel index modulo 4 selects one byte of the word.
		col >>= ((color_fb_index & 3u) ^ 3u) * 8u;
		col &= 0xffu;
		write_color(u8x4(col));
		break;
	}
	}
	if (FB_COLOR_DEPTH_ALIAS)
		alias_color_to_depth();
}
// Depth-test + blender stage for one shaded pixel.
// Consumes the combiner output packed in ShadedData (z and dither share
// z_dith: z in the high bits, 9 bits of RGB dither in the low bits), runs the
// RDP depth test against the cached framebuffer depth, and on pass runs the
// one- or two-cycle blender, dithering, coverage blending and writeback into
// the color/depth caches. x/y are unused here (kept for signature symmetry
// with the other per-pixel stages).
void depth_blend(int x, int y, uint primitive_index, ShadedData shaded)
{
	int z = shaded.z_dith >> 9;
	int dith = shaded.z_dith & 0x1ff;
	int coverage_count = shaded.coverage_count;
	u8x4 combined = shaded.combined;
	u8 shade_alpha = shaded.shade_alpha;
	// .y of static_depth_tmem indexes the depth/blend state for this primitive.
	uint blend_state_index = uint(state_indices.elems[primitive_index].static_depth_tmem.y);
	DerivedSetup derived = load_derived_setup(primitive_index);
	DepthBlendState depth_blend = load_depth_blend_state(blend_state_index);
	// Unpack the per-primitive depth/blend mode flags.
	bool force_blend = (depth_blend.flags & DEPTH_BLEND_FORCE_BLEND_BIT) != 0;
	bool z_compare = (depth_blend.flags & DEPTH_BLEND_DEPTH_TEST_BIT) != 0;
	bool z_update = (depth_blend.flags & DEPTH_BLEND_DEPTH_UPDATE_BIT) != 0;
	bool image_read_enable = (depth_blend.flags & DEPTH_BLEND_IMAGE_READ_ENABLE_BIT) != 0;
	bool color_on_coverage = (depth_blend.flags & DEPTH_BLEND_COLOR_ON_COVERAGE_BIT) != 0;
	bool blend_multicycle = (depth_blend.flags & DEPTH_BLEND_MULTI_CYCLE_BIT) != 0;
	bool aa_enable = (depth_blend.flags & DEPTH_BLEND_AA_BIT) != 0;
	bool dither_en = (depth_blend.flags & DEPTH_BLEND_DITHER_ENABLE_BIT) != 0;
	// Outputs of depth_test that feed the blender.
	bool blend_en;
	bool coverage_wrap;
	u8x2 blend_shift;
	u8x4 memory_color = decode_memory_color(image_read_enable);
	u8 memory_coverage = memory_color.a >> U8_C(5);
	bool z_pass = depth_test(z, derived.dz, derived.dz_compressed,
	                         current_depth, current_dz,
	                         coverage_count, memory_coverage,
	                         z_compare, depth_blend.z_mode,
	                         force_blend, aa_enable,
	                         blend_en, coverage_wrap, blend_shift);
	GENERIC_MESSAGE3(combined.x, combined.y, combined.z);
	// Pixel tests.
	// With AA enabled, zero coverage rejects the pixel even if Z passes.
	if (z_pass && (!aa_enable || coverage_count != 0))
	{
		// Blending
		BlendInputs blender_inputs =
			BlendInputs(combined, memory_color,
			            derived.fog_color, derived.blend_color, shade_alpha);
		u8x4 blend_modes = depth_blend.blend_modes0;
		// In two-cycle mode, cycle 0's blender output feeds cycle 1 as the pixel color.
		if (blend_multicycle)
		{
			blender_inputs.pixel_color.rgb =
				blender(blender_inputs,
				        blend_modes,
				        force_blend, blend_en, color_on_coverage, coverage_wrap, blend_shift, false);
			blend_modes = depth_blend.blend_modes1;
		}
		u8x3 rgb = blender(blender_inputs,
		                   blend_modes,
		                   force_blend, blend_en, color_on_coverage, coverage_wrap, blend_shift, true);
		// Dither
		if (dither_en)
			rgb = rgb_dither(rgb, dith);
		// Coverage blending
		int new_coverage = blend_coverage(coverage_count, memory_coverage, blend_en, depth_blend.coverage_mode);
		GENERIC_MESSAGE3(rgb.x, rgb.y, rgb.z);
		// Writeback: coverage goes back into alpha bits [7:5].
		write_color(u8x4(rgb, new_coverage << 5));
		// Z-writeback.
		if (z_update)
		{
			current_depth = z_compress(z);
			current_dz = u8(derived.dz_compressed);
			current_depth_dirty = true;
			if (FB_COLOR_DEPTH_ALIAS)
				alias_depth_to_color();
		}
		else if (FB_COLOR_DEPTH_ALIAS)
			alias_color_to_depth();
	}
}
#endif

View File

@ -0,0 +1,71 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef NOISE_H_
#define NOISE_H_
// Per-invocation 16-bit noise seed; all noise_get_* accessors slice bits off it.
u16 seeded_noise = U16_C(0);
// Integer hash from: https://www.shadertoy.com/view/XlXcW4 with slight modifications.
// Reseeds the noise state from pixel position and primitive number so noise is
// deterministic per (pixel, primitive).
void reseed_noise(uint x, uint y, uint primitive_offset)
{
	const uint NOISE_PRIME = 1103515245u;
	uvec3 h = uvec3(x, y, primitive_offset);
	// Three rounds of the shift-xor-rotate-multiply mix decorrelate the inputs.
	for (int round = 0; round < 3; round++)
		h = ((h >> 8u) ^ h.yzx) * NOISE_PRIME;
	seeded_noise = u16(h.x >> 16u);
}
// Noise input for the color combiner: three random bits placed at [8:6],
// with the constant 0x20 filling the low half.
i16 noise_get_combiner()
{
	u16 random_bits = (seeded_noise & U16_C(7u)) << U16_C(6u);
	return i16(random_bits | U16_C(0x20u));
}
// 3 bits of noise for alpha dithering.
int noise_get_dither_alpha()
{
	return int(seeded_noise) & 7;
}
// 9 bits of noise: 3 independent bits for each of R, G and B dithering.
int noise_get_dither_color()
{
	return int(seeded_noise) & 0x1ff;
}
// Random 8-bit threshold used when alpha-test dithering is enabled.
u8 noise_get_blend_threshold()
{
	u16 low_byte = seeded_noise & U16_C(0xffu);
	return u8(low_byte);
}
// 6 bits of gamma-dither noise per channel, assembled from the 16-bit seed.
uvec3 noise_get_full_gamma_dither()
{
	uint s = seeded_noise;
	uint r = s & 0x3f;
	uint g = (s >> 6u) & 0x3f;
	// Blue reuses low seed bits for its bottom 3 bits.
	uint b = ((s >> 9u) & 0x38) | (s & 7u);
	return uvec3(r, g, b);
}
// One gamma-dither bit per channel, taken from seed bits 0, 1 and 2.
uvec3 noise_get_partial_gamma_dither()
{
	uint s = seeded_noise;
	return uvec3(s & 1u, (s >> 1u) & 1u, (s >> 2u) & 1u);
}
#endif

View File

@ -0,0 +1,114 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef PERSPECTIVE_H_
#define PERSPECTIVE_H_
// Reciprocal lookup table for the perspective divide.
// Indexed by the top 6 bits of the normalized W fraction; each entry is
// (base reciprocal in 1.14 fixed point, slope for linear interpolation of the
// remaining 8 fraction bits). Values mirror the RDP's hardware divider LUT —
// do not alter them.
const i16x2 perspective_table[64] = i16x2[](
	i16x2(0x4000, -252 * 4), i16x2(0x3f04, -244 * 4), i16x2(0x3e10, -238 * 4), i16x2(0x3d22, -230 * 4),
	i16x2(0x3c3c, -223 * 4), i16x2(0x3b5d, -218 * 4), i16x2(0x3a83, -210 * 4), i16x2(0x39b1, -205 * 4),
	i16x2(0x38e4, -200 * 4), i16x2(0x381c, -194 * 4), i16x2(0x375a, -189 * 4), i16x2(0x369d, -184 * 4),
	i16x2(0x35e5, -179 * 4), i16x2(0x3532, -175 * 4), i16x2(0x3483, -170 * 4), i16x2(0x33d9, -166 * 4),
	i16x2(0x3333, -162 * 4), i16x2(0x3291, -157 * 4), i16x2(0x31f4, -155 * 4), i16x2(0x3159, -150 * 4),
	i16x2(0x30c3, -147 * 4), i16x2(0x3030, -143 * 4), i16x2(0x2fa1, -140 * 4), i16x2(0x2f15, -137 * 4),
	i16x2(0x2e8c, -134 * 4), i16x2(0x2e06, -131 * 4), i16x2(0x2d83, -128 * 4), i16x2(0x2d03, -125 * 4),
	i16x2(0x2c86, -123 * 4), i16x2(0x2c0b, -120 * 4), i16x2(0x2b93, -117 * 4), i16x2(0x2b1e, -115 * 4),
	i16x2(0x2aab, -113 * 4), i16x2(0x2a3a, -110 * 4), i16x2(0x29cc, -108 * 4), i16x2(0x2960, -106 * 4),
	i16x2(0x28f6, -104 * 4), i16x2(0x288e, -102 * 4), i16x2(0x2828, -100 * 4), i16x2(0x27c4, -98 * 4),
	i16x2(0x2762, -96 * 4), i16x2(0x2702, -94 * 4), i16x2(0x26a4, -92 * 4), i16x2(0x2648, -91 * 4),
	i16x2(0x25ed, -89 * 4), i16x2(0x2594, -87 * 4), i16x2(0x253d, -86 * 4), i16x2(0x24e7, -85 * 4),
	i16x2(0x2492, -83 * 4), i16x2(0x243f, -81 * 4), i16x2(0x23ee, -80 * 4), i16x2(0x239e, -79 * 4),
	i16x2(0x234f, -77 * 4), i16x2(0x2302, -76 * 4), i16x2(0x22b6, -74 * 4), i16x2(0x226c, -74 * 4),
	i16x2(0x2222, -72 * 4), i16x2(0x21da, -71 * 4), i16x2(0x2193, -70 * 4), i16x2(0x214d, -69 * 4),
	i16x2(0x2108, -67 * 4), i16x2(0x20c5, -67 * 4), i16x2(0x2082, -65 * 4), i16x2(0x2041, -65 * 4)
);
// Looks up an approximate reciprocal of w.
// Returns (interpolated reciprocal from perspective_table, normalization shift)
// for use as (x * rcp) >> shift by the caller.
ivec2 perspective_get_lut(int w)
{
	// Normalize w so its MSB lands at bit 14, then keep the 14-bit fraction.
	int norm_shift = min(14 - findMSB(w), 14);
	int frac14 = (w << norm_shift) & 0x3fff;
	// Top 6 fraction bits index the table; the low 8 bits linearly interpolate.
	ivec2 entry = ivec2(perspective_table[frac14 >> 8]);
	int recip = entry.x + ((entry.y * (frac14 & 0xff)) >> 10);
	return ivec2(recip, norm_shift);
}
// Non-perspective path: S and T pass through untouched, W is discarded.
ivec2 no_perspective_divide(ivec3 stw)
{
	return ivec2(stw.x, stw.y);
}
// s16 divided by s1.15.
// Classic approximation of a (x * rcp) >> shift with a LUT to find rcp.
// Divides stw.xy by stw.z, reproducing the RDP's hardware divider including
// its saturation behavior. Sets overflow (never clears it) when the result
// saturates or w is non-positive.
ivec2 perspective_divide(ivec3 stw, inout bool overflow)
{
	int w = stw.z;
	// Non-positive w is a special case: result is forced to max later.
	bool w_carry = w <= 0;
	w &= 0x7fff;
	ivec2 table = perspective_get_lut(w);
	int shift = table.y;
	ivec2 prod = stw.xy * table.x;
	// Mask of high product bits that must be all-0 or all-1 for the result
	// to fit; anything else means the divide overflowed.
	int temp_mask = ((1 << 30) - 1) & -((1 << 29) >> shift);
	ivec2 out_of_bounds = prod & temp_mask;
	ivec2 temp;
	if (shift != 14)
		temp = prod = prod >> (13 - shift);
	else
		temp = prod << 1;
	if (any(notEqual(out_of_bounds, ivec2(0))))
	{
		// Saturate each component independently toward +/-32k based on the
		// sign bit of the unshifted product.
		if (out_of_bounds.x != temp_mask && out_of_bounds.x != 0)
		{
			if ((prod.x & (1 << 29)) == 0)
				temp.x = 0x7fff;
			else
				temp.x = -0x8000;
			overflow = true;
		}
		if (out_of_bounds.y != temp_mask && out_of_bounds.y != 0)
		{
			if ((prod.y & (1 << 29)) == 0)
				temp.y = 0x7fff;
			else
				temp.y = -0x8000;
			overflow = true;
		}
	}
	if (w_carry)
	{
		temp = ivec2(0x7fff);
		overflow = true;
	}
	// Perspective divide produces a 17-bit signed coordinate, which is later clamped to 16-bit signed.
	// However, the LOD computation happens in 17 bits ...
	return clamp(temp, ivec2(-0x10000), ivec2(0xffff));
}
#endif

View File

@ -0,0 +1,191 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "small_types.h"
layout(local_size_x_id = 0, local_size_y_id = 1) in;
#include "debug.h"
#include "data_structures.h"
layout(set = 0, binding = 0, std430) readonly buffer TriangleSetupBuffer
{
TriangleSetupMem elems[];
} triangle_setup;
#include "load_triangle_setup.h"
layout(set = 0, binding = 1, std430) readonly buffer AttributeSetupBuffer
{
AttributeSetupMem elems[];
} attribute_setup;
#include "load_attribute_setup.h"
layout(set = 0, binding = 2, std430) readonly buffer DerivedSetupBuffer
{
DerivedSetupMem elems[];
} derived_setup;
#include "load_derived_setup.h"
layout(set = 0, binding = 3, std430) readonly buffer StaticRasterStateBuffer
{
StaticRasterizationStateMem elems[];
} static_raster_state;
#include "load_static_raster_state.h"
layout(set = 0, binding = 4, std430) readonly buffer StateIndicesBuffer
{
InstanceIndicesMem elems[];
} state_indices;
layout(set = 0, binding = 5, std430) readonly buffer SpanInfoOffsetBuffer
{
SpanInfoOffsetsMem elems[];
} span_offsets;
#include "load_span_offsets.h"
layout(set = 0, binding = 6, std430) readonly buffer SpanSetups
{
SpanSetupMem elems[];
} span_setups;
#include "load_span_setup.h"
layout(set = 0, binding = 7, std430) readonly buffer TMEM16
{
TMEMInstance16Mem instances[];
} tmem16;
layout(set = 0, binding = 7, std430) readonly buffer TMEM8
{
TMEMInstance8Mem instances[];
} tmem8;
layout(set = 0, binding = 8, std430) readonly buffer TileInfoBuffer
{
TileInfoMem elems[];
} tile_infos;
#include "load_tile_info.h"
layout(set = 2, binding = 0, std140) uniform GlobalConstants
{
GlobalFBInfo fb_info;
} global_constants;
layout(constant_id = 2) const int STATIC_STATE_FLAGS = 0;
layout(constant_id = 3) const int COMBINER_INPUTS_RGB0 = 0;
layout(constant_id = 4) const int COMBINER_INPUTS_ALPHA0 = 0;
layout(constant_id = 5) const int COMBINER_INPUTS_RGB1 = 0;
layout(constant_id = 6) const int COMBINER_INPUTS_ALPHA1 = 0;
layout(constant_id = 7) const int DITHER_TEX_SIZE_TEX_FMT = 0;
// Unpack the packed 32-bit specialization constants into individual byte-sized
// selector fields. Each combiner cycle takes four input selectors
// (mul-add, mul-sub, mul, add) for RGB and for alpha, packed one per byte.
const int COMBINER_INPUT_RGB0_MULADD = (COMBINER_INPUTS_RGB0 >> 0) & 0xff;
const int COMBINER_INPUT_RGB0_MULSUB = (COMBINER_INPUTS_RGB0 >> 8) & 0xff;
const int COMBINER_INPUT_RGB0_MUL = (COMBINER_INPUTS_RGB0 >> 16) & 0xff;
const int COMBINER_INPUT_RGB0_ADD = (COMBINER_INPUTS_RGB0 >> 24) & 0xff;
const int COMBINER_INPUT_ALPHA0_MULADD = (COMBINER_INPUTS_ALPHA0 >> 0) & 0xff;
const int COMBINER_INPUT_ALPHA0_MULSUB = (COMBINER_INPUTS_ALPHA0 >> 8) & 0xff;
const int COMBINER_INPUT_ALPHA0_MUL = (COMBINER_INPUTS_ALPHA0 >> 16) & 0xff;
const int COMBINER_INPUT_ALPHA0_ADD = (COMBINER_INPUTS_ALPHA0 >> 24) & 0xff;
const int COMBINER_INPUT_RGB1_MULADD = (COMBINER_INPUTS_RGB1 >> 0) & 0xff;
const int COMBINER_INPUT_RGB1_MULSUB = (COMBINER_INPUTS_RGB1 >> 8) & 0xff;
const int COMBINER_INPUT_RGB1_MUL = (COMBINER_INPUTS_RGB1 >> 16) & 0xff;
const int COMBINER_INPUT_RGB1_ADD = (COMBINER_INPUTS_RGB1 >> 24) & 0xff;
const int COMBINER_INPUT_ALPHA1_MULADD = (COMBINER_INPUTS_ALPHA1 >> 0) & 0xff;
const int COMBINER_INPUT_ALPHA1_MULSUB = (COMBINER_INPUTS_ALPHA1 >> 8) & 0xff;
const int COMBINER_INPUT_ALPHA1_MUL = (COMBINER_INPUTS_ALPHA1 >> 16) & 0xff;
const int COMBINER_INPUT_ALPHA1_ADD = (COMBINER_INPUTS_ALPHA1 >> 24) & 0xff;
// Dither mode, texture size and texture format, also packed one per byte.
const int DITHER = (DITHER_TEX_SIZE_TEX_FMT >> 0) & 0xff;
const int TEX_SIZE = (DITHER_TEX_SIZE_TEX_FMT >> 8) & 0xff;
const int TEX_FMT = (DITHER_TEX_SIZE_TEX_FMT >> 16) & 0xff;
#define RASTERIZER_SPEC_CONSTANT
#include "noise.h"
#include "shading.h"
layout(set = 0, binding = 9, std430) writeonly buffer ColorBuffer
{
mem_u8x4 elems[];
} color;
layout(set = 0, binding = 9, std430) writeonly buffer ColorBufferRaw
{
uint elems[];
} raw_color;
layout(set = 0, binding = 10, std430) writeonly buffer DepthBuffer
{
int elems[];
} depth;
layout(set = 0, binding = 11, std430) writeonly buffer ShadeAlpha
{
mem_u8 elems[];
} shade_alpha;
layout(set = 0, binding = 12, std430) writeonly buffer Coverage
{
mem_i8 elems[];
} coverage;
layout(set = 1, binding = 0, std430) readonly buffer TileWorkList
{
uvec4 elems[];
} tile_work_list;
// Rasterizer entry point. Each workgroup shades one tile taken from the tile
// work list (x/y tile coords, output tile instance, primitive index); each
// invocation shades one pixel and writes its results to the per-tile output
// buffers for the later depth/blend pass.
void main()
{
	uvec4 work = tile_work_list.elems[gl_WorkGroupID.x];
	int x = int(work.x * gl_WorkGroupSize.x + gl_LocalInvocationID.x);
	int y = int(work.y * gl_WorkGroupSize.y + gl_LocalInvocationID.y);
	uint tile_instance = work.z;
	uint primitive_index = work.w;
	ShadedData shaded;
	i8 coverage_value;
	// Linear slot for this pixel within its tile instance.
	uint index = tile_instance * (gl_WorkGroupSize.x * gl_WorkGroupSize.y) + gl_LocalInvocationIndex;
	if (shade_pixel(x, y, primitive_index, shaded))
	{
		coverage_value = i8(shaded.coverage_count);
		// Coverage counts <= 8 mean a normally-shaded pixel; higher values
		// carry flag bits (copy/fill) instead of a count.
		if (coverage_value <= I8_C(8))
		{
			// Workaround curious bug with glslang, need to cast manually to uvec4 first.
			color.elems[index] = mem_u8x4(uvec4(shaded.combined));
			shade_alpha.elems[index] = mem_u8(shaded.shade_alpha);
			depth.elems[index] = shaded.z_dith;
		}
		else if ((coverage_value & COVERAGE_COPY_BIT) != 0)
		{
			// For copy pipe, we use a raw 32-bit word to represent the loaded texel.
			raw_color.elems[index] = shaded.z_dith;
		}
	}
	else
		coverage_value = I8_C(-1);
	// -1 marks "not covered"; the resolve pass skips such pixels.
	coverage.elems[index] = mem_i8(coverage_value);
}

View File

@ -0,0 +1,357 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef SHADING_H_
#define SHADING_H_
#ifdef RASTERIZER_SPEC_CONSTANT
const int SCALING_LOG2 = (STATIC_STATE_FLAGS >> RASTERIZATION_UPSCALING_LOG2_BIT_OFFSET) & 3;
const int SCALING_FACTOR = 1 << SCALING_LOG2;
#endif
#include "coverage.h"
#include "interpolation.h"
#include "perspective.h"
#include "texture.h"
#include "dither.h"
#include "combiner.h"
// Shades one pixel of one primitive: span/coverage test, attribute
// interpolation, texture sampling, LOD, combiner, and alpha test.
// Returns false if the pixel is rejected (outside span, zero coverage,
// failed alpha test); on success fills `shaded` with the combiner output,
// packed z+dither, coverage count and shade alpha for the depth/blend stage.
bool shade_pixel(int x, int y, uint primitive_index, out ShadedData shaded)
{
	SpanInfoOffsets span_offsets = load_span_offsets(primitive_index);
	// Reject pixels outside the primitive's scaled Y range.
	if ((y < (SCALING_FACTOR * span_offsets.ylo)) || (y > (span_offsets.yhi * SCALING_FACTOR + (SCALING_FACTOR - 1))))
		return false;
	uint setup_flags = uint(triangle_setup.elems[primitive_index].flags);
	if (SCALING_FACTOR > 1)
	{
		// Some primitives opt out of upscaling; snap to the unscaled pixel grid.
		if ((setup_flags & TRIANGLE_SETUP_DISABLE_UPSCALING_BIT) != 0u)
		{
			x &= ~(SCALING_FACTOR - 1);
			y &= ~(SCALING_FACTOR - 1);
		}
	}
	SpanSetup span_setup = load_span_setup(SCALING_FACTOR * span_offsets.offset + (y - SCALING_FACTOR * span_offsets.ylo));
	if (span_setup.valid_line == U16_C(0))
		return false;
	uint setup_tile = uint(triangle_setup.elems[primitive_index].tile);
	AttributeSetup attr = load_attribute_setup(primitive_index);
	uvec4 states = uvec4(state_indices.elems[primitive_index].static_depth_tmem);
	uint static_state_index = states.x;
	uint tmem_instance_index = states.z;
	StaticRasterizationState static_state = load_static_rasterization_state(static_state_index);
	uint static_state_flags = static_state.flags;
	int static_state_dither = static_state.dither;
	u8x4 combiner_inputs_rgb0 = static_state.combiner_inputs_rgb0;
	u8x4 combiner_inputs_alpha0 = static_state.combiner_inputs_alpha0;
	u8x4 combiner_inputs_rgb1 = static_state.combiner_inputs_rgb1;
	u8x4 combiner_inputs_alpha1 = static_state.combiner_inputs_alpha1;
#ifdef RASTERIZER_SPEC_CONSTANT
	// When this shader variant was specialized for a fixed pipeline state,
	// override the buffer-loaded state with compile-time constants so the
	// compiler can fold away most of the branches below.
	if ((STATIC_STATE_FLAGS & RASTERIZATION_USE_SPECIALIZATION_CONSTANT_BIT) != 0)
	{
		static_state_flags = STATIC_STATE_FLAGS;
		static_state_dither = DITHER;
		combiner_inputs_rgb0.x = u8(COMBINER_INPUT_RGB0_MULADD);
		combiner_inputs_rgb0.y = u8(COMBINER_INPUT_RGB0_MULSUB);
		combiner_inputs_rgb0.z = u8(COMBINER_INPUT_RGB0_MUL);
		combiner_inputs_rgb0.w = u8(COMBINER_INPUT_RGB0_ADD);
		combiner_inputs_alpha0.x = u8(COMBINER_INPUT_ALPHA0_MULADD);
		combiner_inputs_alpha0.y = u8(COMBINER_INPUT_ALPHA0_MULSUB);
		combiner_inputs_alpha0.z = u8(COMBINER_INPUT_ALPHA0_MUL);
		combiner_inputs_alpha0.w = u8(COMBINER_INPUT_ALPHA0_ADD);
		combiner_inputs_rgb1.x = u8(COMBINER_INPUT_RGB1_MULADD);
		combiner_inputs_rgb1.y = u8(COMBINER_INPUT_RGB1_MULSUB);
		combiner_inputs_rgb1.z = u8(COMBINER_INPUT_RGB1_MUL);
		combiner_inputs_rgb1.w = u8(COMBINER_INPUT_RGB1_ADD);
		combiner_inputs_alpha1.x = u8(COMBINER_INPUT_ALPHA1_MULADD);
		combiner_inputs_alpha1.y = u8(COMBINER_INPUT_ALPHA1_MULSUB);
		combiner_inputs_alpha1.z = u8(COMBINER_INPUT_ALPHA1_MUL);
		combiner_inputs_alpha1.w = u8(COMBINER_INPUT_ALPHA1_ADD);
	}
#endif
	// This is a great case for specialization constants.
	bool tlut = (static_state_flags & RASTERIZATION_TLUT_BIT) != 0;
	bool tlut_type = (static_state_flags & RASTERIZATION_TLUT_TYPE_BIT) != 0;
	bool sample_quad = (static_state_flags & RASTERIZATION_SAMPLE_MODE_BIT) != 0;
	bool cvg_times_alpha = (static_state_flags & RASTERIZATION_CVG_TIMES_ALPHA_BIT) != 0;
	bool alpha_cvg_select = (static_state_flags & RASTERIZATION_ALPHA_CVG_SELECT_BIT) != 0;
	bool perspective = (static_state_flags & RASTERIZATION_PERSPECTIVE_CORRECT_BIT) != 0;
	bool tex_lod_en = (static_state_flags & RASTERIZATION_TEX_LOD_ENABLE_BIT) != 0;
	bool sharpen_lod_en = (static_state_flags & RASTERIZATION_SHARPEN_LOD_ENABLE_BIT) != 0;
	bool detail_lod_en = (static_state_flags & RASTERIZATION_DETAIL_LOD_ENABLE_BIT) != 0;
	bool aa_enable = (static_state_flags & RASTERIZATION_AA_BIT) != 0;
	bool multi_cycle = (static_state_flags & RASTERIZATION_MULTI_CYCLE_BIT) != 0;
	bool interlace_en = (static_state_flags & RASTERIZATION_INTERLACE_FIELD_BIT) != 0;
	bool fill_en = (static_state_flags & RASTERIZATION_FILL_BIT) != 0;
	bool copy_en = (static_state_flags & RASTERIZATION_COPY_BIT) != 0;
	bool alpha_test = (static_state_flags & RASTERIZATION_ALPHA_TEST_BIT) != 0;
	bool alpha_test_dither = (static_state_flags & RASTERIZATION_ALPHA_TEST_DITHER_BIT) != 0;
	bool mid_texel = (static_state_flags & RASTERIZATION_SAMPLE_MID_TEXEL_BIT) != 0;
	bool uses_texel0 = (static_state_flags & RASTERIZATION_USES_TEXEL0_BIT) != 0;
	bool uses_texel1 = (static_state_flags & RASTERIZATION_USES_TEXEL1_BIT) != 0;
	bool uses_pipelined_texel1 = (static_state_flags & RASTERIZATION_USES_PIPELINED_TEXEL1_BIT) != 0;
	bool uses_lod = (static_state_flags & RASTERIZATION_USES_LOD_BIT) != 0;
	bool convert_one = (static_state_flags & RASTERIZATION_CONVERT_ONE_BIT) != 0;
	bool bilerp0 = (static_state_flags & RASTERIZATION_BILERP_0_BIT) != 0;
	bool bilerp1 = (static_state_flags & RASTERIZATION_BILERP_1_BIT) != 0;
	// Only reseed noise when something downstream actually consumes it.
	if ((static_state_flags & RASTERIZATION_NEED_NOISE_BIT) != 0)
		reseed_noise(x, y, primitive_index + global_constants.fb_info.base_primitive_index);
	bool flip = (setup_flags & TRIANGLE_SETUP_FLIP_BIT) != 0;
	// Copy pipe: fetch a raw texel and return it without running the combiner.
	if (copy_en)
	{
		bool valid = x >= span_setup.start_x && x <= span_setup.end_x;
		if (!valid)
			return false;
		ivec2 st;
		int s_offset;
		interpolate_st_copy(span_setup, attr.dstzw_dx, x, perspective, flip, st, s_offset);
		uint tile0 = uint(setup_tile) & 7u;
		uint tile_info_index0 = uint(state_indices.elems[primitive_index].tile_infos[tile0]);
		TileInfo tile_info0 = load_tile_info(tile_info_index0);
#ifdef RASTERIZER_SPEC_CONSTANT
		if ((STATIC_STATE_FLAGS & RASTERIZATION_USE_STATIC_TEXTURE_SIZE_FORMAT_BIT) != 0)
		{
			tile_info0.fmt = u8(TEX_FMT);
			tile_info0.size = u8(TEX_SIZE);
		}
#endif
		int texel0 = sample_texture_copy(tile_info0, tmem_instance_index, st, s_offset, tlut, tlut_type);
		// Copy pipe reuses z_dith to carry the raw texel word.
		shaded.z_dith = texel0;
		shaded.coverage_count = U8_C(COVERAGE_COPY_BIT);
		// Copy-pipe alpha test: 16-bit framebuffers reject texels with alpha bit clear.
		if (alpha_test && global_constants.fb_info.fb_size == 2 && (texel0 & 1) == 0)
			return false;
		return true;
	}
	// Fill pipe: no shading at all, only span membership matters.
	else if (fill_en)
	{
		shaded.coverage_count = U8_C(COVERAGE_FILL_BIT);
		return x >= span_setup.start_x && x <= span_setup.end_x;
	}
	int coverage = compute_coverage(span_setup.xleft, span_setup.xright, x);
	// There is no way we can gain coverage here.
	// Reject work as fast as possible.
	if (coverage == 0)
		return false;
	int coverage_count = bitCount(coverage);
	// If we're not using AA, only the first coverage bit is relevant.
	if (!aa_enable && (coverage & 1) == 0)
		return false;
	DerivedSetup derived = load_derived_setup(primitive_index);
	int dx = x - span_setup.interpolation_base_x;
	int interpolation_direction = flip ? 1 : -1;
	// Interpolate attributes.
	u8x4 shade = interpolate_rgba(span_setup.rgba, attr.drgba_dx, attr.drgba_dy,
	                              dx, coverage);
	ivec2 st, st_dx, st_dy;
	int z;
	bool perspective_overflow = false;
	int tex_interpolation_direction = interpolation_direction;
	if (SCALING_FACTOR > 1 && uses_lod)
		// NATIVE_LOD keeps LOD deltas in unscaled units even when upscaling.
		if ((setup_flags & TRIANGLE_SETUP_NATIVE_LOD_BIT) != 0)
			tex_interpolation_direction *= SCALING_FACTOR;
	interpolate_stz(span_setup.stzw, attr.dstzw_dx, attr.dstzw_dy, dx, coverage, perspective, uses_lod,
	                tex_interpolation_direction, st, st_dx, st_dy, z, perspective_overflow);
	// Sample textures.
	uint tile0 = uint(setup_tile) & 7u;
	uint tile1 = (tile0 + 1) & 7u;
	uint max_level = uint(setup_tile) >> 3u;
	int min_lod = derived.min_lod;
	i16 lod_frac;
	if (uses_lod)
	{
		compute_lod_2cycle(tile0, tile1, lod_frac, max_level, min_lod, st, st_dx, st_dy, perspective_overflow,
		                   tex_lod_en, sharpen_lod_en, detail_lod_en);
	}
	i16x4 texel0, texel1;
	if (uses_texel0)
	{
		uint tile_info_index0 = uint(state_indices.elems[primitive_index].tile_infos[tile0]);
		TileInfo tile_info0 = load_tile_info(tile_info_index0);
#ifdef RASTERIZER_SPEC_CONSTANT
		if ((STATIC_STATE_FLAGS & RASTERIZATION_USE_STATIC_TEXTURE_SIZE_FORMAT_BIT) != 0)
		{
			tile_info0.fmt = u8(TEX_FMT);
			tile_info0.size = u8(TEX_SIZE);
		}
#endif
		texel0 = sample_texture(tile_info0, tmem_instance_index, st, tlut, tlut_type, sample_quad, mid_texel, false, i16x4(0));
		if (!sample_quad && !bilerp0)
			texel0 = texture_convert_factors(texel0, derived.factors);
	}
	// A very awkward mechanism where we peek into the next pixel, or in some cases, the next scanline's first pixel.
	if (uses_pipelined_texel1)
	{
		bool valid_line = uint(span_setups.elems[SCALING_FACTOR * span_offsets.offset + (y - SCALING_FACTOR * span_offsets.ylo + 1)].valid_line) != 0u;
		bool long_span = span_setup.lodlength >= 8;
		bool end_span = x == (flip ? span_setup.end_x : span_setup.start_x);
		if (end_span && long_span && valid_line)
		{
			// At the end of a long span, texel1 comes from the next line's start.
			ivec3 stw = span_setups.elems[SCALING_FACTOR * span_offsets.offset + (y - SCALING_FACTOR * span_offsets.ylo + 1)].stzw.xyw >> 16;
			if (perspective)
			{
				bool st_overflow;
				st = perspective_divide(stw, st_overflow);
			}
			else
				st = no_perspective_divide(stw);
		}
		else
			st = interpolate_st_single(span_setup.stzw, attr.dstzw_dx, dx + interpolation_direction * SCALING_FACTOR, perspective);
		tile1 = tile0;
		uses_texel1 = true;
	}
	if (uses_texel1)
	{
		if (convert_one && !bilerp1)
		{
			// YUV convert path: texel1 is derived from texel0 via the convert factors.
			texel1 = texture_convert_factors(texel0, derived.factors);
		}
		else
		{
			uint tile_info_index1 = uint(state_indices.elems[primitive_index].tile_infos[tile1]);
			TileInfo tile_info1 = load_tile_info(tile_info_index1);
#ifdef RASTERIZER_SPEC_CONSTANT
			if ((STATIC_STATE_FLAGS & RASTERIZATION_USE_STATIC_TEXTURE_SIZE_FORMAT_BIT) != 0)
			{
				tile_info1.fmt = u8(TEX_FMT);
				tile_info1.size = u8(TEX_SIZE);
			}
#endif
			texel1 = sample_texture(tile_info1, tmem_instance_index, st, tlut, tlut_type, sample_quad, mid_texel,
			                        convert_one, texel0);
			if (!sample_quad && !tlut && !bilerp1)
				texel1 = texture_convert_factors(texel1, derived.factors);
		}
	}
	int rgb_dith, alpha_dith;
	// Interlaced output halves the effective Y for dither pattern purposes.
	dither_coefficients(x, y >> int(interlace_en), static_state_dither >> 2, static_state_dither & 3, rgb_dith, alpha_dith);
	// Run combiner.
	u8x4 combined;
	u8 alpha_reference;
	if (multi_cycle)
	{
		CombinerInputs combined_inputs =
			CombinerInputs(derived.constant_muladd0, derived.constant_mulsub0, derived.constant_mul0, derived.constant_add0,
			               shade, u8x4(0), texel0, texel1, lod_frac, noise_get_combiner());
		combined_inputs.combined = combiner_cycle0(combined_inputs,
		                                           combiner_inputs_rgb0,
		                                           combiner_inputs_alpha0,
		                                           alpha_dith, coverage_count, cvg_times_alpha, alpha_cvg_select,
		                                           alpha_test, alpha_reference);
		combined_inputs.constant_muladd = derived.constant_muladd1;
		combined_inputs.constant_mulsub = derived.constant_mulsub1;
		combined_inputs.constant_mul = derived.constant_mul1;
		combined_inputs.constant_add = derived.constant_add1;
		// Pipelining, texel1 is promoted to texel0 in cycle1.
		// I don't think hardware ever intended for you to access texels in second cycle due to this nature.
		i16x4 tmp_texel = combined_inputs.texel0;
		combined_inputs.texel0 = combined_inputs.texel1;
		// Following the pipelining, texel1 should become texel0 of next pixel,
		// but let's not go there ...
		combined_inputs.texel1 = tmp_texel;
		combined = u8x4(combiner_cycle1(combined_inputs,
		                                combiner_inputs_rgb1,
		                                combiner_inputs_alpha1,
		                                alpha_dith, coverage_count, cvg_times_alpha, alpha_cvg_select));
	}
	else
	{
		// One-cycle mode runs only the cycle-1 combiner with the cycle-1 constants.
		CombinerInputs combined_inputs =
			CombinerInputs(derived.constant_muladd1, derived.constant_mulsub1, derived.constant_mul1, derived.constant_add1,
			               shade, u8x4(0), texel0, texel1, lod_frac, noise_get_combiner());
		combined = u8x4(combiner_cycle1(combined_inputs,
		                                combiner_inputs_rgb1,
		                                combiner_inputs_alpha1,
		                                alpha_dith, coverage_count, cvg_times_alpha, alpha_cvg_select));
		alpha_reference = combined.a;
	}
	// After combiner, color can be modified to 0 through alpha-to-cvg, so check for potential write_enable here.
	// If we're not using AA, the first coverage bit is used instead, coverage count is ignored.
	if (aa_enable && coverage_count == 0)
		return false;
	if (alpha_test)
	{
		u8 alpha_threshold;
		if (alpha_test_dither)
			alpha_threshold = noise_get_blend_threshold();
		else
			alpha_threshold = derived.blend_color.a;
		if (alpha_reference < alpha_threshold)
			return false;
	}
	shaded.combined = combined;
	// Pack z (high bits) and 9 bits of RGB dither (low bits) into one word.
	shaded.z_dith = (z << 9) | rgb_dith;
	shaded.coverage_count = u8(coverage_count);
	// Shade alpha needs to be passed separately since it might affect the blending stage.
	shaded.shade_alpha = u8(min(shade.a + alpha_dith, 0xff));
	return true;
}
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,126 @@
{
"include": [ "../../Granite/assets/shaders/inc" ],
"shaders": [
{
"name": "tmem_update",
"compute": true,
"path": "tmem_update.comp"
},
{
"name": "span_setup",
"compute": true,
"path": "span_setup.comp"
},
{
"name": "clear_indirect_buffer",
"compute": true,
"path": "clear_indirect_buffer.comp"
},
{
"name": "tile_binning_combined",
"compute": true,
"path": "tile_binning_combined.comp",
"variants": [
{ "define": "SUBGROUP", "count": 2, "resolve": true },
{ "define": "UBERSHADER", "count": 2, "resolve": true },
{ "define": "SMALL_TYPES", "count": 2, "resolve": true }
]
},
{
"name": "ubershader",
"path": "ubershader.comp",
"compute": true,
"variants": [
{ "define": "SUBGROUP", "count": 2, "resolve": true },
{ "define": "SMALL_TYPES", "count": 2, "resolve": true }
]
},
{
"name": "depth_blend",
"path": "depth_blend.comp",
"compute": true,
"variants": [
{ "define": "SUBGROUP", "count": 2, "resolve": true },
{ "define": "SMALL_TYPES", "count": 2, "resolve": true }
]
},
{
"name": "rasterizer",
"path": "rasterizer.comp",
"compute": true,
"variants": [
{ "define": "SMALL_TYPES", "count": 2, "resolve": true }
]
},
{
"name": "fullscreen",
"path": "fullscreen.vert"
},
{
"name": "vi_scale",
"path": "vi_scale.frag"
},
{
"name": "vi_divot",
"path": "vi_divot.frag",
"variants": [
{ "define": "FETCH_BUG", "count": 2 }
]
},
{
"name": "vi_fetch",
"path": "vi_fetch.frag",
"variants": [
{ "define": "FETCH_BUG", "count": 2 }
]
},
{
"name": "vi_blend_fields",
"path": "vi_blend_fields.frag"
},
{
"name": "extract_vram",
"path": "extract_vram.comp",
"compute": true
},
{
"name": "masked_rdram_resolve",
"path": "masked_rdram_resolve.comp",
"compute": true
},
{
"name": "clear_write_mask",
"path": "clear_write_mask.comp",
"compute": true
},
{
"name": "update_upscaled_domain_post",
"path": "update_upscaled_domain_post.comp",
"compute": true
},
{
"name": "update_upscaled_domain_pre",
"path": "update_upscaled_domain_pre.comp",
"compute": true
},
{
"name": "update_upscaled_domain_resolve",
"path": "update_upscaled_domain_resolve.comp",
"compute": true
},
{
"name": "clear_super_sampled_write_mask",
"path": "clear_super_sampled_write_mask.comp",
"compute": true
},
{
"name": "vi_deinterlace_vert",
"path": "vi_deinterlace.vert"
},
{
"name": "vi_deinterlace_frag",
"path": "vi_deinterlace.frag"
}
]
}

View File

@ -0,0 +1,121 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
// Utility header to smooth over the difference between
// 8/16-bit integer arithmetic vs. just 8/16-bit storage.
#ifndef SMALL_INTEGERS_H_
#define SMALL_INTEGERS_H_

// 8/16-bit *storage* in SSBOs is always required, regardless of whether the
// device supports native small-integer arithmetic.
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require

#if SMALL_TYPES
// Native path: the device supports 8/16-bit arithmetic, so both the memory
// (mem_*) aliases and the arithmetic (u8/i16/...) aliases map to the real
// small types, and the literal helpers construct true small-typed constants.
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#define mem_u8 uint8_t
#define mem_u16 uint16_t
#define mem_u8x2 u8vec2
#define mem_u16x2 u16vec2
#define mem_u8x3 u8vec3
#define mem_u16x3 u16vec3
#define mem_u8x4 u8vec4
#define mem_u16x4 u16vec4
#define mem_i8 int8_t
#define mem_i16 int16_t
#define mem_i8x2 i8vec2
#define mem_i16x2 i16vec2
#define mem_i8x3 i8vec3
#define mem_i16x3 i16vec3
#define mem_i8x4 i8vec4
#define mem_i16x4 i16vec4
#define u8 uint8_t
#define u16 uint16_t
#define u8x2 u8vec2
#define u16x2 u16vec2
#define u8x3 u8vec3
#define u16x3 u16vec3
#define u8x4 u8vec4
#define u16x4 u16vec4
#define i8 int8_t
#define i16 int16_t
#define i8x2 i8vec2
#define i16x2 i16vec2
#define i8x3 i8vec3
#define i16x3 i16vec3
#define i8x4 i8vec4
#define i16x4 i16vec4
#define U8_C(x) uint8_t(x)
#define I8_C(x) int8_t(x)
#define U16_C(x) uint16_t(x)
#define I16_C(x) int16_t(x)
#else
// Fallback path: memory aliases still use the small storage types (the buffer
// layouts must not change), but all *arithmetic* is widened to plain 32-bit
// int/ivecN. Note the fallback makes unsigned and signed aliases identical;
// code using these types must not rely on unsigned wrap semantics.
#define mem_u8 uint8_t
#define mem_u16 uint16_t
#define mem_u8x2 u8vec2
#define mem_u16x2 u16vec2
#define mem_u8x3 u8vec3
#define mem_u16x3 u16vec3
#define mem_u8x4 u8vec4
#define mem_u16x4 u16vec4
#define mem_i8 int8_t
#define mem_i16 int16_t
#define mem_i8x2 i8vec2
#define mem_i16x2 i16vec2
#define mem_i8x3 i8vec3
#define mem_i16x3 i16vec3
#define mem_i8x4 i8vec4
#define mem_i16x4 i16vec4
#define u8 int
#define u16 int
#define u8x2 ivec2
#define u16x2 ivec2
#define u8x3 ivec3
#define u16x3 ivec3
#define u8x4 ivec4
#define u16x4 ivec4
#define i8 int
#define i16 int
#define i8x2 ivec2
#define i16x2 ivec2
#define i8x3 ivec3
#define i16x3 ivec3
#define i8x4 ivec4
#define i16x4 ivec4
#define U8_C(x) int(x)
#define I8_C(x) int(x)
#define U16_C(x) int(x)
#define I16_C(x) int(x)
#endif
#endif

View File

@ -0,0 +1,227 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "small_types.h"
#include "debug.h"
layout(local_size_x_id = 0) in;
layout(constant_id = 1) const int SCALING_LOG2 = 0;
const int SCALING_FACTOR = 1 << SCALING_LOG2;
#include "data_structures.h"
layout(std430, set = 0, binding = 0) readonly buffer TriangleSetupBuffer
{
TriangleSetupMem elems[];
} triangle_setup;
#include "load_triangle_setup.h"
layout(std430, set = 0, binding = 1) readonly buffer AttributeSetupBuffer
{
AttributeSetupMem elems[];
} attribute_setup;
#include "load_attribute_setup.h"
layout(set = 0, binding = 2, std430) readonly buffer ScissorStateBuffer
{
ScissorStateMem elems[];
} scissor_state;
#include "load_scissor_state.h"
layout(std430, set = 0, binding = 3) writeonly buffer SpanSetups
{
SpanSetupMem elems[];
} span_setups;
#include "store_span_setup.h"
layout(set = 1, binding = 0) uniform utextureBuffer uInterpolationJobs;
const int SUBPIXELS = 4;
const int SUBPIXELS_LOG2 = 2;
// Convert a 16.16 signed value to 16.3. We have 8 subpixels in X direction after snapping.
ivec4 quantize_x(ivec4 x)
{
    // OR a sticky bit into the result whenever any of the 12 dropped fraction
    // bits are non-zero, so sub-precision content is not silently lost.
    ivec4 has_residue = ivec4(notEqual(x & 0xfff, ivec4(0)));
    return (x >> 12) | has_residue;
}
// Horizontal minimum of a 4-component integer vector.
int min4(ivec4 v)
{
    return min(min(v.x, v.y), min(v.z, v.w));
}
// Horizontal maximum of a 4-component integer vector.
int max4(ivec4 v)
{
    return max(max(v.x, v.y), max(v.z, v.w));
}
// Step a per-scanline derivative by dy upscaled lines: whole unscaled lines
// advance by the full derivative, the sub-line remainder by the derivative
// scaled down to the upscaled grid.
ivec4 interpolate_snapped(ivec4 dvalue, int dy)
{
    int whole_lines = dy >> SCALING_LOG2;
    int sub_lines = dy & (SCALING_FACTOR - 1);
    return whole_lines * dvalue + sub_lines * (dvalue >> SCALING_LOG2);
}
// Per-scanline span setup: one invocation handles one (upscaled) scanline of
// one primitive, interpolating attributes to the line and computing the
// scissored X extents for all 4 Y-subpixels.
void main()
{
    // Job = (primitive index, first unscaled line, last unscaled line).
    ivec3 job_indices = ivec3(texelFetch(uInterpolationJobs, int(gl_WorkGroupID.x)).xyz);
    int primitive_index = job_indices.x;
    int base_y = job_indices.y * SCALING_FACTOR;
    int max_y = job_indices.z * SCALING_FACTOR + (SCALING_FACTOR - 1);
    int y = base_y + int(gl_LocalInvocationIndex);
    // Workgroup size may exceed the job's line count; excess threads bail.
    if (y > max_y)
        return;
    TriangleSetup setup = load_triangle_setup(primitive_index);
    AttributeSetup attr = load_attribute_setup(primitive_index);
    ScissorState scissor = load_scissor_state(primitive_index);
    bool flip = (setup.flags & TRIANGLE_SETUP_FLIP_BIT) != 0;
    bool interlace_en = (setup.flags & TRIANGLE_SETUP_INTERLACE_FIELD_BIT) != 0;
    bool keep_odd_field = (setup.flags & TRIANGLE_SETUP_INTERLACE_KEEP_ODD_BIT) != 0;
    SpanSetup span_setup;
    // Interpolate RGBA, STZW to their scanline.
    {
        bool do_offset = (setup.flags & TRIANGLE_SETUP_DO_OFFSET_BIT) != 0;
        bool skip_xfrac = (setup.flags & TRIANGLE_SETUP_SKIP_XFRAC_BIT) != 0;
        // yh is in 1/4-pixel units; whole-pixel base, then upscale.
        int y_interpolation_base = int(setup.yh) >> 2;
        y_interpolation_base *= SCALING_FACTOR;
        // For high-resolution interpolation, make sure we snap interpolation correctly at whole pixels,
        // and quantize derivatives in-between pixels.
        int dy = y - y_interpolation_base;
        int xh = setup.xh * SCALING_FACTOR + dy * (setup.dxhdy << 2);
        ivec4 drgba_diff = ivec4(0);
        ivec4 dstzw_diff = ivec4(0);
        // In do_offset mode, varyings are latched at last subpixel line instead of first (for some reason).
        if (do_offset)
        {
            xh += (SCALING_FACTOR * 3) * setup.dxhdy;
            // NOTE(review): ~0x1ff truncation mirrors the RDP's fixed-point
            // precision on the de/dy derivatives — confirm against hardware docs.
            ivec4 drgba_deh = attr.drgba_de & ~0x1ff;
            ivec4 drgba_dyh = attr.drgba_dy & ~0x1ff;
            drgba_diff = drgba_deh - (drgba_deh >> 2) - drgba_dyh + (drgba_dyh >> 2);
            ivec4 dstzw_deh = attr.dstzw_de & ~0x1ff;
            ivec4 dstzw_dyh = attr.dstzw_dy & ~0x1ff;
            dstzw_diff = dstzw_deh - (dstzw_deh >> 2) - dstzw_dyh + (dstzw_dyh >> 2);
        }
        int base_x = xh >> 15;
        int xfrac = skip_xfrac ? 0 : ((xh >> 7) & 0xff);
        // Advance attributes down to this line (de), then back along the edge
        // by the X fraction (dx), with RDP-style precision truncation masks.
        ivec4 rgba = attr.rgba + interpolate_snapped(attr.drgba_de, dy);
        rgba = ((rgba & ~0x1ff) + drgba_diff - interpolate_snapped((attr.drgba_dx >> 8) & ~1, xfrac)) & ~0x3ff;
        ivec4 stzw = attr.stzw + interpolate_snapped(attr.dstzw_de, dy);
        stzw = ((stzw & ~0x1ff) + dstzw_diff - interpolate_snapped((attr.dstzw_dx >> 8) & ~1, xfrac)) & ~0x3ff;
        span_setup.rgba = rgba;
        span_setup.stzw = stzw;
        span_setup.interpolation_base_x = base_x;
    }
    // Check Y dimension.
    int yh_interpolation_base = int(setup.yh) & ~(SUBPIXELS - 1);
    int ym_interpolation_base = int(setup.ym);
    yh_interpolation_base *= SCALING_FACTOR;
    ym_interpolation_base *= SCALING_FACTOR;
    // Work in 1/4-pixel (subpixel) Y units: 4 sub-lines per scanline.
    int y_sub = int(y * SUBPIXELS);
    ivec4 y_subs = y_sub + ivec4(0, 1, 2, 3);
    int ylo = max(setup.yh, scissor.ylo) * SCALING_FACTOR;
    int yhi = min(setup.yl, scissor.yhi) * SCALING_FACTOR;
    bvec4 clip_lo_y = lessThan(y_subs, ivec4(ylo));
    bvec4 clip_hi_y = greaterThanEqual(y_subs, ivec4(yhi));
    uvec4 clip_y = uvec4(clip_lo_y) | uvec4(clip_hi_y);
    // Interpolate X at all 4 Y-subpixels.
    ivec4 xh = setup.xh * SCALING_FACTOR + (y_subs - yh_interpolation_base) * setup.dxhdy;
    ivec4 xm = setup.xm * SCALING_FACTOR + (y_subs - yh_interpolation_base) * setup.dxmdy;
    ivec4 xl = setup.xl * SCALING_FACTOR + (y_subs - ym_interpolation_base) * setup.dxldy;
    // Above the mid vertex the minor edge follows XM, below it XL.
    xl = mix(xl, xm, lessThan(y_subs, ivec4(SCALING_FACTOR * setup.ym)));
    // If we have overflows, we can become sensitive to this in invalid_line check, where
    // checks that should pass fail, and vice versa.
    // Note that we shaved off one bit in triangle setup for upscaling purposes,
    // so this should be 28 bits normally.
    xl = bitfieldExtract(xl, 0, 27 + SCALING_LOG2);
    xh = bitfieldExtract(xh, 0, 27 + SCALING_LOG2);
    ivec4 xh_shifted = quantize_x(xh);
    ivec4 xl_shifted = quantize_x(xl);
    // Major edge is on the left for flipped (right-major) triangles.
    ivec4 xleft, xright;
    if (flip)
    {
        xleft = xh_shifted;
        xright = xl_shifted;
    }
    else
    {
        xleft = xl_shifted;
        xright = xh_shifted;
    }
    bvec4 invalid_line = greaterThan(xleft >> 1, xright >> 1);
    ivec4 lo_scissor = ivec4(SCALING_FACTOR * (scissor.xlo << 1));
    ivec4 hi_scissor = ivec4(SCALING_FACTOR * (scissor.xhi << 1));
    // Whole span entirely outside the X scissor on either side.
    bool all_over = all(greaterThanEqual(min(xleft, xright), hi_scissor));
    bool all_under = all(lessThan(max(xleft, xright), lo_scissor));
    xleft = max(xleft, lo_scissor);
    xleft = min(xleft, hi_scissor);
    xright = max(xright, lo_scissor);
    xright = min(xright, hi_scissor);
    invalid_line = bvec4(uvec4(invalid_line) | clip_y);
    // Invalid sub-lines get degenerate extents so they never win min/max below.
    xleft = mix(xleft, ivec4(0xffff), invalid_line);
    xright = mix(xright, ivec4(0), invalid_line);
    int start_x = min4(xleft) >> 3;
    int end_x = max4(xright) >> 3;
    span_setup.xleft = xleft;
    span_setup.xright = xright;
    span_setup.start_x = start_x;
    span_setup.end_x = end_x;
    span_setup.valid_line = int(!all(invalid_line) && !all_over && !all_under);
    // Interlacing drops every other (unscaled) scanline depending on field.
    if (interlace_en)
        if (((y >> SCALING_LOG2) & 1) != int(keep_odd_field))
            span_setup.valid_line = U16_C(0);
    // LOD length is measured from the interpolation base toward the far edge.
    span_setup.lodlength = int(flip ? (end_x - span_setup.interpolation_base_x) : (span_setup.interpolation_base_x - start_x));
    store_span_setup(gl_GlobalInvocationID.x, span_setup);
}

View File

@ -0,0 +1,43 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef STORE_SPAN_SETUP_H_
#define STORE_SPAN_SETUP_H_
// Write one SpanSetup to the output SSBO.
void store_span_setup(uint index, SpanSetup setup)
{
#if SMALL_TYPES
    // With native small types the working struct and storage struct share a
    // layout, so a single aggregate copy is enough.
    span_setups.elems[index] = setup;
#else
    // Fallback: the storage struct keeps 16-bit members while the working
    // struct uses plain ints, so each field is narrowed explicitly.
    span_setups.elems[index].rgba = setup.rgba;
    span_setups.elems[index].stzw = setup.stzw;
    span_setups.elems[index].xleft = mem_u16x4(uvec4(setup.xleft));
    span_setups.elems[index].xright = mem_u16x4(uvec4(setup.xright));
    span_setups.elems[index].interpolation_base_x = setup.interpolation_base_x;
    span_setups.elems[index].start_x = setup.start_x;
    span_setups.elems[index].end_x = setup.end_x;
    span_setups.elems[index].lodlength = mem_i16(setup.lodlength);
    span_setups.elems[index].valid_line = mem_u16(setup.valid_line);
#endif
}
#endif

View File

@ -0,0 +1,905 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TEXTURE_H_
#define TEXTURE_H_
#include "data_structures.h"
const int TEXTURE_FORMAT_RGBA = 0;
const int TEXTURE_FORMAT_YUV = 1;
const int TEXTURE_FORMAT_CI = 2;
const int TEXTURE_FORMAT_IA = 3;
const int TEXTURE_FORMAT_I = 4;
// Apply the tile's S-axis wrap/mirror addressing. A mask of 0 disables
// masking entirely and the coordinate passes through unchanged.
int texel_mask_s(TileInfo tile, int s)
{
    if (tile.mask_s == 0)
        return s;
    int wrap = 1 << tile.mask_s;
    // Mirrored repeat: reflect the coordinate inside every other period.
    if ((tile.flags & TILE_INFO_MIRROR_S_BIT) != 0)
        s ^= max((s & wrap) - 1, 0);
    return s & (wrap - 1);
}
// Copy-pipe variant of texel_mask_s: masks/mirrors two adjacent S
// coordinates (s, s + 1) at once with component-wise operations.
ivec2 texel_mask_s_copy(TileInfo tile, int s)
{
    ivec2 multi_s = s + ivec2(0, 1);
    if (tile.mask_s != 0)
    {
        int mask = 1 << tile.mask_s;
        // Mirrored repeat: reflect inside every other period.
        if ((tile.flags & TILE_INFO_MIRROR_S_BIT) != 0)
            multi_s ^= max((multi_s & mask) - 1, 0);
        multi_s &= mask - 1;
    }
    return multi_s;
}
// Apply the tile's T-axis wrap/mirror addressing; mask 0 is a passthrough.
int texel_mask_t(TileInfo tile, int t)
{
    if (tile.mask_t == 0)
        return t;
    int wrap = 1 << tile.mask_t;
    // Mirrored repeat: reflect the coordinate inside every other period.
    if ((tile.flags & TILE_INFO_MIRROR_T_BIT) != 0)
        t ^= max((t & wrap) - 1, 0);
    return t & (wrap - 1);
}
// Expand a packed RGBA5551 word to 8 bits per channel.
i16x4 convert_rgba16(uint word)
{
    uvec3 rgb5 = (uvec3(word) >> uvec3(11, 6, 1)) & 31u;
    // Replicate the top bits into the low bits (5 -> 8 bit expansion).
    uvec3 rgb8 = (rgb5 << 3u) | (rgb5 >> 2u);
    // The single alpha bit maps to fully opaque or fully transparent.
    return i16x4(rgb8, (word & 1u) * 0xffu);
}
// Unpack an IA88 word: high byte is intensity (broadcast to RGB), low is alpha.
i16x4 convert_ia16(uint word)
{
    uint i = word >> 8;
    return i16x4(i, i, i, word & 0xff);
}
// Fetch one 4bpp RGBA texel from TMEM; result broadcast to all channels.
i16x4 sample_texel_rgba4(TileInfo tile, uint tmem_instance, uvec2 st)
{
    // Byte address: tile base + row stride; two 4-bit texels per byte.
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x >> 1;
    byte_offset &= 0xfff;
    // Even S reads the high nibble, odd S the low nibble.
    uint shift = (~st.x & 1) * 4;
    uint index = byte_offset;
    // NOTE(review): XORs implement TMEM odd-line word interleave and
    // byte-order swizzle of the backing buffer — confirm against TMEM docs.
    index ^= (st.y & 1) << 2;
    index ^= 3;
    uint word = uint(tmem8.instances[tmem_instance].elems[index]);
    word = (word >> shift) & 0xf;
    // Replicate the nibble to 8 bits.
    word |= word << 4;
    return i16x4(word);
}
// Fetch one 4bpp IA texel from TMEM (3 intensity bits + 1 alpha bit).
i16x4 sample_texel_ia4(TileInfo tile, uint tmem_instance, uvec2 st)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x >> 1;
    byte_offset &= 0xfff;
    // Even S reads the high nibble, odd S the low nibble.
    uint shift = (~st.x & 1) * 4;
    uint index = byte_offset;
    // NOTE(review): odd-line interleave + byte-order swizzle — verify vs TMEM docs.
    index ^= (st.y & 1) << 2;
    index ^= 3;
    uint word = uint(tmem8.instances[tmem_instance].elems[index]);
    word = (word >> shift) & 0xf;
    // Top 3 bits are intensity; replicate to fill 8 bits.
    uint intensity = word & 0xe;
    intensity = (intensity << 4) | (intensity << 1) | (intensity >> 2);
    // Bottom bit selects fully opaque/transparent alpha.
    return i16x4(intensity, intensity, intensity, (word & 1) * 0xff);
}
// Fetch one 4bpp color-index texel (no TLUT): the palette-extended raw index
// is broadcast to all channels.
i16x4 sample_texel_ci4(TileInfo tile, uint tmem_instance, uvec2 st, uint pal)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x >> 1;
    byte_offset &= 0xfff;
    uint shift = (~st.x & 1) * 4;
    uint index = byte_offset;
    // NOTE(review): odd-line interleave + byte-order swizzle — verify vs TMEM docs.
    index ^= (st.y & 1) << 2;
    index ^= 3;
    uint word = uint(tmem8.instances[tmem_instance].elems[index]);
    word = (word >> shift) & 0xf;
    // Palette selects the upper nibble of the 8-bit index.
    word |= pal << 4;
    return i16x4(word);
}
// Fetch a 4bpp color-index texel and resolve it through the TLUT in upper TMEM.
// lut_offset/addr_xor select one of the 4 replicated TLUT banks so the four
// bilinear taps hit distinct banks like real hardware.
i16x4 sample_texel_ci4_tlut(TileInfo tile, uint tmem_instance, uvec2 st, uint pal, uint lut_offset, uint addr_xor, bool tlut_type)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x >> 1;
    // TLUT mode: indices live in the lower half of TMEM only.
    byte_offset &= 0x7ff;
    uint shift = (~st.x & 1) * 4;
    uint index = byte_offset;
    index ^= (st.y & 1) << 2;
    index ^= 3;
    uint word = uint(tmem8.instances[tmem_instance].elems[index]);
    word = (word >> shift) & 0xf;
    word |= pal << 4;
    // Each TLUT entry is replicated 4x; scale by 4 and pick a bank.
    uint lut_entry = (word << 2) + lut_offset;
    lut_entry ^= addr_xor;
    // TLUT occupies the upper half (0x400+) of 16-bit TMEM.
    word = uint(tmem16.instances[tmem_instance].elems[0x400 | lut_entry]);
    // tlut_type selects IA16 vs RGBA16 entry decoding.
    return tlut_type ? convert_ia16(word) : convert_rgba16(word);
}
// Fetch an 8bpp color-index texel and resolve it through the TLUT.
// lut_offset/addr_xor pick one of the 4 replicated TLUT banks (see ci4 variant).
i16x4 sample_texel_ci8_tlut(TileInfo tile, uint tmem_instance, uvec2 st, uint lut_offset, uint addr_xor, bool tlut_type)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x;
    // TLUT mode: indices live in the lower half of TMEM only.
    byte_offset &= 0x7ff;
    uint index = byte_offset;
    index ^= (st.y & 1) << 2;
    index ^= 3;
    uint word = uint(tmem8.instances[tmem_instance].elems[index]);
    // Each TLUT entry is replicated 4x; scale by 4 and pick a bank.
    uint lut_entry = (word << 2) + lut_offset;
    lut_entry ^= addr_xor;
    word = uint(tmem16.instances[tmem_instance].elems[0x400 | lut_entry]);
    return tlut_type ? convert_ia16(word) : convert_rgba16(word);
}
// Fetch a "32bpp" color-index texel (no TLUT): reads one 16-bit word and
// duplicates its two bytes across the four channels (.xyxy).
i16x4 sample_texel_ci32(TileInfo tile, uint tmem_instance, uvec2 st)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x * 2;
    byte_offset &= 0xfff;
    uint index = byte_offset >> 1;
    // NOTE(review): odd-line interleave + word-order swizzle — verify vs TMEM docs.
    index ^= (st.y & 1) << 1;
    index ^= 1;
    uint word = uint(tmem16.instances[tmem_instance].elems[index]);
    return i16x2(word >> 8, word & 0xff).xyxy;
}
// Fetch a "32bpp" color-index texel and resolve through the TLUT; the index
// comes from the top byte of the 16-bit word.
i16x4 sample_texel_ci32_tlut(TileInfo tile, uint tmem_instance, uvec2 st, uint lut_offset, uint addr_xor, bool tlut_type)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x * 2;
    // TLUT mode: indices live in the lower half of TMEM only.
    byte_offset &= 0x7ff;
    uint index = byte_offset >> 1;
    index ^= (st.y & 1) << 1;
    index ^= 1;
    uint word = uint(tmem16.instances[tmem_instance].elems[index]);
    // (word >> 6) & ~3 == top byte * 4: index into the 4x-replicated TLUT.
    uint lut_entry = ((word >> 6) & ~3) + lut_offset;
    lut_entry ^= addr_xor;
    word = uint(tmem16.instances[tmem_instance].elems[0x400 | lut_entry]);
    return tlut_type ? convert_ia16(word) : convert_rgba16(word);
}
// Fetch one 8bpp texel from TMEM; raw byte broadcast to all channels
// (also used for 8bpp CI without TLUT).
i16x4 sample_texel_rgba8(TileInfo tile, uint tmem_instance, uvec2 st)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x;
    byte_offset &= 0xfff;
    uint index = byte_offset;
    // NOTE(review): odd-line interleave + byte-order swizzle — verify vs TMEM docs.
    index ^= (st.y & 1) << 2;
    index ^= 3;
    uint word = uint(tmem8.instances[tmem_instance].elems[index]);
    return i16x4(word);
}
// Fetch one 8bpp IA texel (4-bit intensity + 4-bit alpha), expanding both
// nibbles to 8 bits by replication.
i16x4 sample_texel_ia8(TileInfo tile, uint tmem_instance, uvec2 st)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x;
    byte_offset &= 0xfff;
    uint index = byte_offset;
    index ^= (st.y & 1) << 2;
    index ^= 3;
    uint word = uint(tmem8.instances[tmem_instance].elems[index]);
    uint intensity = word >> 4;
    uint alpha = word & 0xf;
    // 4 -> 8 bit expansion by nibble replication.
    alpha |= alpha << 4;
    intensity |= intensity << 4;
    return i16x4(intensity, intensity, intensity, alpha);
}
// Fetch one 16bpp YUV texel: per-pixel luma from the upper TMEM bank,
// shared (subsampled) UV chroma from the lower bank. Returns
// (U - 128, V - 128, Y, Y) for the combiner's YUV path.
i16x4 sample_texel_yuv16(TileInfo tile, uint tmem_instance, uvec2 st, uint chroma_x)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    uint byte_offset_luma = byte_offset + st.x;
    byte_offset_luma &= 0x7ff;
    uint byte_offset_chroma = byte_offset + chroma_x * 2;
    byte_offset_chroma &= 0x7ff;
    uint index_luma = byte_offset_luma;
    // NOTE(review): odd-line interleave + endian swizzle — verify vs TMEM docs.
    index_luma ^= (st.y & 1) << 2;
    index_luma ^= 3;
    uint index_chroma = byte_offset_chroma >> 1;
    index_chroma ^= (st.y & 1) << 1;
    index_chroma ^= 1;
    // Luma lives in the upper half (0x800+) of 8-bit TMEM.
    u8 luma = u8(tmem8.instances[tmem_instance].elems[index_luma | 0x800]);
    u16 chroma = u16(tmem16.instances[tmem_instance].elems[index_chroma]);
    u8 u = u8((chroma >> U16_C(8)) & U16_C(0xff));
    u8 v = u8((chroma >> U16_C(0)) & U16_C(0xff));
    // Recenter chroma around zero.
    return i16x4(i16(u) - I16_C(0x80), i16(v) - I16_C(0x80), luma, luma);
}
// Fetch one 16bpp RGBA5551 texel and expand it to 8 bits per channel.
i16x4 sample_texel_rgba16(TileInfo tile, uint tmem_instance, uvec2 st)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x * 2;
    byte_offset &= 0xfff;
    uint index = byte_offset >> 1;
    // NOTE(review): odd-line interleave + word-order swizzle — verify vs TMEM docs.
    index ^= (st.y & 1) << 1;
    index ^= 1;
    uint word = uint(tmem16.instances[tmem_instance].elems[index]);
    return convert_rgba16(word);
}
// Fetch one 16bpp IA88 texel (intensity broadcast to RGB, separate alpha).
i16x4 sample_texel_ia16(TileInfo tile, uint tmem_instance, uvec2 st)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x * 2;
    byte_offset &= 0xfff;
    uint index = byte_offset >> 1;
    index ^= (st.y & 1) << 1;
    index ^= 1;
    uint word = uint(tmem16.instances[tmem_instance].elems[index]);
    return convert_ia16(word);
}
// Fetch one 32bpp RGBA texel: RG comes from the lower TMEM bank, BA from the
// matching word in the upper bank (0x400+).
i16x4 sample_texel_rgba32(TileInfo tile, uint tmem_instance, uvec2 st)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x * 2;
    // 32bpp addresses only the lower half per bank.
    byte_offset &= 0x7ff;
    uint index = byte_offset >> 1;
    index ^= (st.y & 1) << 1;
    index ^= 1;
    uint lower_word = uint(tmem16.instances[tmem_instance].elems[index]);
    uint upper_word = uint(tmem16.instances[tmem_instance].elems[index | 0x400]);
    return i16x4(lower_word >> 8, lower_word & 0xff, upper_word >> 8, upper_word & 0xff);
}
// Apply the tile's per-axis shift, then optional clamping against the tile
// extents [lo, hi]; result is relative to the tile origin in 1/8-texel units.
int clamp_and_shift_coord(bool clamp_bit, int coord, int lo, int hi, int shift)
{
    // Clamp 17-bit coordinate to 16-bit coordinate here.
    coord = clamp(coord, -0x8000, 0x7fff);
    // shift values 0..10 are right shifts; 11..15 encode a left shift of
    // (16 - shift), implemented as shift-left then arithmetic shift-right.
    if (shift < 11)
        coord >>= shift;
    else
    {
        coord <<= (32 - shift);
        coord >>= 16;
    }
    if (clamp_bit)
    {
        // coord is in 1/8-texel units; compare whole texels against hi.
        bool clamp_hi = (coord >> 3) >= hi;
        if (clamp_hi)
            // Pin to the tile's last texel (tile extents are in 1/4 units).
            coord = (((hi >> 2) - (lo >> 2)) & 0x3ff) << 5;
        else
            // Clamp below the tile origin to zero.
            coord = max(coord - (lo << 3), 0);
    }
    else
        coord -= lo << 3;
    return coord;
}
// Unclamped variant of clamp_and_shift_coord: apply the tile shift and
// rebase onto the tile origin, with no clamping against the tile extents.
int shift_coord(int coord, int lo, int shift)
{
    // Clamp the 17-bit input into signed 16-bit range first.
    coord = clamp(coord, -0x8000, 0x7fff);
    // 0..10 encodes a right shift; 11..15 a left shift of (16 - shift),
    // done as shift-left then arithmetic shift-right to keep the sign.
    coord = (shift < 11) ? (coord >> shift) : ((coord << (32 - shift)) >> 16);
    return coord - (lo << 3);
}
// The copy pipe reads 4x16 words.
// The copy pipe reads 4x16 words.
// Fetch one 16-bit word of the copy pipe's 64-bit read. s_offset selects
// which of the four words (0..3); tlut routes the fetched index through the
// palette in upper TMEM.
int sample_texture_copy_word(TileInfo tile, uint tmem_instance, ivec2 st, int s_offset, bool tlut, bool tlut_type)
{
    // For non-16bpp TMEM, the lower 32-bits are sampled based on direct 16-bit fetches. There are no shifts applied.
    bool high_word = s_offset < 2;
    bool replicate_8bpp = high_word && tile.size != 2 && !tlut;
    int samp;
    // Bytes per texel, log2, capped at 2 (32bpp shares 16bpp addressing here).
    int s_shamt = min(int(tile.size), 2);
    bool large_texel = int(tile.size) == 3;
    // 32bpp and TLUT fetches address half the 16-bit word space.
    int idx_mask = (large_texel || tlut) ? 0x3ff : 0x7ff;
    if (replicate_8bpp)
    {
        // The high word of 8-bpp replication is special in the sense that we sample 8-bpp correctly.
        // Sample the two possible words.
        st.x += 2 * s_offset;
        ivec2 s = texel_mask_s_copy(tile, st.x);
        int t = texel_mask_t(tile, st.y);
        uint tbase = tile.offset + tile.stride * t;
        // Address in nibbles; odd lines flip by 8 nibbles (word interleave).
        uvec2 nibble_offset = (tbase * 2 + (s << s_shamt)) & 0x1fffu;
        nibble_offset ^= (t & 1u) * 8u;
        uvec2 index = nibble_offset >> 2u;
        index &= idx_mask;
        // ^1 swizzles word order within the 32-bit pair.
        int samp0 = int(tmem16.instances[tmem_instance].elems[index.x ^ 1]);
        int samp1 = int(tmem16.instances[tmem_instance].elems[index.y ^ 1]);
        if (tile.size == 1)
        {
            // 8bpp: select high/low byte of each word.
            samp0 >>= 8 - 4 * int(nibble_offset.x & 2);
            samp1 >>= 8 - 4 * int(nibble_offset.y & 2);
            samp0 &= 0xff;
            samp1 &= 0xff;
        }
        else if (tile.size == 0)
        {
            // 4bpp: select the nibble, then replicate it to a byte.
            samp0 >>= 12 - 4 * int(nibble_offset.x & 3u);
            samp1 >>= 12 - 4 * int(nibble_offset.y & 3u);
            samp0 = (samp0 & 0xf) * 0x11;
            samp1 = (samp1 & 0xf) * 0x11;
        }
        else
        {
            samp0 >>= 8;
            samp1 >>= 8;
        }
        // Pack the two bytes back into one 16-bit result word.
        samp = (samp0 << 8) | samp1;
    }
    else
    {
        st.x += s_offset;
        int s = texel_mask_s(tile, st.x);
        int t = texel_mask_t(tile, st.y);
        uint tbase = tile.offset + tile.stride * t;
        uint nibble_offset = (tbase * 2 + (s << s_shamt)) & 0x1fffu;
        nibble_offset ^= (t & 1u) * 8u;
        uint index = nibble_offset >> 2u;
        index &= idx_mask;
        samp = int(tmem16.instances[tmem_instance].elems[index ^ 1]);
        if (tlut)
        {
            if (tile.size == 0)
            {
                // 4bpp index: nibble + palette select, scaled by 4 into the
                // replicated TLUT, with s_offset picking the bank.
                samp >>= 12 - 4 * (nibble_offset & 3);
                samp &= 0xf;
                samp |= tile.palette << 4;
                samp <<= 2;
                samp += s_offset;
            }
            else
            {
                // 8bpp (or wider) index: byte select, same TLUT scaling.
                samp >>= 8 - 4 * (nibble_offset & 2);
                samp &= 0xff;
                samp <<= 2;
                samp += s_offset;
            }
            // Resolve through the TLUT in upper TMEM (^1 word swizzle).
            samp = int(tmem16.instances[tmem_instance].elems[(samp | 0x400) ^ 1]);
        }
    }
    return samp;
}
// Copy-pipe texture fetch: shift/rebase the coordinate, then fetch the
// s_offset'th unit of the 64-bit copy read at the framebuffer's texel size.
int sample_texture_copy(TileInfo tile, uint tmem_instance, ivec2 st, int s_offset, bool tlut, bool tlut_type)
{
    st.x = shift_coord(st.x, int(tile.slo), int(tile.shift_s));
    st.y = shift_coord(st.y, int(tile.tlo), int(tile.shift_t));
    // Drop the 5 fractional (1/32 texel) bits.
    st >>= 5;
    int samp;
    if (global_constants.fb_info.fb_size == 0)
    {
        // 4bpp framebuffer copy yields nothing meaningful; return 0.
        samp = 0;
    }
    else if (global_constants.fb_info.fb_size == 1)
    {
        // 8bpp: fetch the containing 16-bit word, then pick high/low byte.
        samp = sample_texture_copy_word(tile, tmem_instance, st, s_offset >> 1, tlut, tlut_type);
        samp >>= 8 - 8 * (s_offset & 1);
        samp &= 0xff;
    }
    else
    {
        // 16bpp and wider: the word itself is the result.
        samp = sample_texture_copy_word(tile, tmem_instance, st, s_offset, tlut, tlut_type);
    }
    return samp;
}
// N64-style 3-tap bilinear filter over a pair of channels: only one triangle
// of the 2x2 footprint is used, anchored at t00 (lower triangle) or t11
// (upper triangle) depending on which side of the diagonal the fraction lies.
i16x2 bilinear_3tap(i16x2 t00, i16x2 t10, i16x2 t01, i16x2 t11, ivec2 frac)
{
    int sum_frac = frac.x + frac.y;
    i16x2 t_base = sum_frac >= 32 ? t11 : t00;
    // In the upper triangle the fractions are mirrored around the diagonal.
    i16x2 flip_frac = i16x2(sum_frac >= 32 ? (32 - frac.yx) : frac);
    i16x2 accum = (t10 - t_base) * flip_frac.x;
    accum += (t01 - t_base) * flip_frac.y;
    // Round (fractions carry 5 bits: 0..32).
    accum += I16_C(0x10);
    accum >>= I16_C(5);
    accum += t_base;
    return accum;
}
i16x4 sample_texture(TileInfo tile, uint tmem_instance, ivec2 st, bool tlut, bool tlut_type, bool sample_quad, bool mid_texel, bool convert_one,
i16x4 prev_cycle)
{
st.x = clamp_and_shift_coord((tile.flags & TILE_INFO_CLAMP_S_BIT) != 0, st.x, int(tile.slo), int(tile.shi), int(tile.shift_s));
st.y = clamp_and_shift_coord((tile.flags & TILE_INFO_CLAMP_T_BIT) != 0, st.y, int(tile.tlo), int(tile.thi), int(tile.shift_t));
ivec2 frac;
if (sample_quad)
frac = st & 31;
else
frac = ivec2(0);
int sum_frac = frac.x + frac.y;
st >>= 5;
int s0 = texel_mask_s(tile, st.x);
int t0 = texel_mask_t(tile, st.y);
int s1 = texel_mask_s(tile, st.x + 1);
int t1 = texel_mask_t(tile, st.y + 1);
// Very specific weird logic going on with t0 and t1.
int tdiff = max(t1 - t0, -255);
t1 = (t0 & 0xff) + tdiff;
t0 &= 0xff;
i16x4 t_base, t10, t01, t11;
mid_texel = all(bvec3(mid_texel, equal(frac, ivec2(0x10))));
if (mid_texel)
sum_frac = 0;
bool yuv = tile.fmt == TEXTURE_FORMAT_YUV;
ivec2 base_st = sum_frac >= 0x20 ? ivec2(s1, t1) : ivec2(s0, t0);
if (tlut)
{
switch (int(tile.fmt))
{
case TEXTURE_FORMAT_RGBA:
case TEXTURE_FORMAT_CI:
case TEXTURE_FORMAT_IA:
case TEXTURE_FORMAT_I:
{
// For TLUT, entries in the LUT are duplicated and we must make sure that we sample 3 different banks
// when we look up the TLUT entry. In normal situations, this is irrelevant, but we're trying to be accurate here.
bool upper = sum_frac >= 0x20;
uint addr_xor = upper ? 2 : 1;
switch (int(tile.size))
{
case 0:
t_base = sample_texel_ci4_tlut(tile, tmem_instance, base_st, tile.palette, upper ? 3 : 0, addr_xor, tlut_type);
if (sample_quad)
{
t10 = sample_texel_ci4_tlut(tile, tmem_instance, ivec2(s1, t0), tile.palette, 1, addr_xor,
tlut_type);
t01 = sample_texel_ci4_tlut(tile, tmem_instance, ivec2(s0, t1), tile.palette, 2, addr_xor,
tlut_type);
}
if (mid_texel)
{
t11 = sample_texel_ci4_tlut(tile, tmem_instance, ivec2(s1, t1), tile.palette, 3, addr_xor,
tlut_type);
}
break;
case 1:
t_base = sample_texel_ci8_tlut(tile, tmem_instance, base_st, upper ? 3 : 0, addr_xor, tlut_type);
if (sample_quad)
{
t10 = sample_texel_ci8_tlut(tile, tmem_instance, ivec2(s1, t0), 1, addr_xor, tlut_type);
t01 = sample_texel_ci8_tlut(tile, tmem_instance, ivec2(s0, t1), 2, addr_xor, tlut_type);
}
if (mid_texel)
t11 = sample_texel_ci8_tlut(tile, tmem_instance, ivec2(s1, t1), 3, addr_xor, tlut_type);
break;
default:
t_base = sample_texel_ci32_tlut(tile, tmem_instance, base_st, upper ? 3 : 0, addr_xor, tlut_type);
if (sample_quad)
{
t10 = sample_texel_ci32_tlut(tile, tmem_instance, ivec2(s1, t0), 1, addr_xor, tlut_type);
t01 = sample_texel_ci32_tlut(tile, tmem_instance, ivec2(s0, t1), 2, addr_xor, tlut_type);
}
if (mid_texel)
t11 = sample_texel_ci32_tlut(tile, tmem_instance, ivec2(s1, t1), 3, addr_xor, tlut_type);
break;
}
break;
}
}
}
else
{
switch (int(tile.fmt))
{
case TEXTURE_FORMAT_RGBA:
switch (int(tile.size))
{
case 0:
t_base = sample_texel_rgba4(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_rgba4(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_rgba4(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_rgba4(tile, tmem_instance, ivec2(s1, t1));
break;
case 1:
t_base = sample_texel_rgba8(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_rgba8(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_rgba8(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_rgba8(tile, tmem_instance, ivec2(s1, t1));
break;
case 2:
t_base = sample_texel_rgba16(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_rgba16(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_rgba16(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_rgba16(tile, tmem_instance, ivec2(s1, t1));
break;
case 3:
t_base = sample_texel_rgba32(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_rgba32(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_rgba32(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_rgba32(tile, tmem_instance, ivec2(s1, t1));
break;
}
break;
case TEXTURE_FORMAT_YUV:
{
uint chroma_x0 = s0 >> 1;
uint chroma_x1 = (s1 + (s1 - s0)) >> 1;
// Only implement 16bpp for now. It's the only one that gives meaningful results.
t_base = sample_texel_yuv16(tile, tmem_instance, ivec2(s0, t0), chroma_x0);
if (sample_quad)
{
t10 = sample_texel_yuv16(tile, tmem_instance, ivec2(s1, t0), chroma_x1);
t01 = sample_texel_yuv16(tile, tmem_instance, ivec2(s0, t1), chroma_x0);
t11 = sample_texel_yuv16(tile, tmem_instance, ivec2(s1, t1), chroma_x1);
}
break;
}
case TEXTURE_FORMAT_CI:
switch (int(tile.size))
{
case 0:
t_base = sample_texel_ci4(tile, tmem_instance, base_st, tile.palette);
if (sample_quad)
{
t10 = sample_texel_ci4(tile, tmem_instance, ivec2(s1, t0), tile.palette);
t01 = sample_texel_ci4(tile, tmem_instance, ivec2(s0, t1), tile.palette);
}
if (mid_texel)
t11 = sample_texel_ci4(tile, tmem_instance, ivec2(s1, t1), tile.palette);
break;
case 1:
t_base = sample_texel_rgba8(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_rgba8(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_rgba8(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_rgba8(tile, tmem_instance, ivec2(s1, t1));
break;
default:
t_base = sample_texel_ci32(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_ci32(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_ci32(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_ci32(tile, tmem_instance, ivec2(s1, t1));
break;
}
break;
case TEXTURE_FORMAT_IA:
switch (int(tile.size))
{
case 0:
t_base = sample_texel_ia4(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_ia4(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_ia4(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_ia4(tile, tmem_instance, ivec2(s1, t1));
break;
case 1:
t_base = sample_texel_ia8(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_ia8(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_ia8(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_ia8(tile, tmem_instance, ivec2(s1, t1));
break;
case 2:
t_base = sample_texel_ia16(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_ia16(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_ia16(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_ia16(tile, tmem_instance, ivec2(s1, t1));
break;
case 3:
t_base = sample_texel_ci32(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_ci32(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_ci32(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_ci32(tile, tmem_instance, ivec2(s1, t1));
break;
}
break;
case TEXTURE_FORMAT_I:
switch (int(tile.size))
{
case 0:
t_base = sample_texel_rgba4(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_rgba4(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_rgba4(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_rgba4(tile, tmem_instance, ivec2(s1, t1));
break;
case 1:
t_base = sample_texel_rgba8(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_rgba8(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_rgba8(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_rgba8(tile, tmem_instance, ivec2(s1, t1));
break;
default:
t_base = sample_texel_ci32(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_ci32(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_ci32(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_ci32(tile, tmem_instance, ivec2(s1, t1));
break;
}
break;
}
}
i16x4 accum;
if (convert_one)
{
ivec4 prev_sext = bitfieldExtract(ivec4(prev_cycle), 0, 9);
ivec2 factors = sum_frac >= 32 ? prev_sext.gr : prev_sext.rg;
ivec4 converted = factors.r * (t10 - t_base) + factors.g * (t01 - t_base) + 0x80;
converted >>= 8;
converted += prev_sext.b;
accum = i16x4(converted);
}
else if (yuv)
{
if (sample_quad)
{
int chroma_frac = ((s0 & 1) << 4) | (frac.x >> 1);
i16x2 accum_chroma = bilinear_3tap(t_base.xy, t10.xy, t01.xy, t11.xy, ivec2(chroma_frac, frac.y));
i16x2 accum_luma = bilinear_3tap(t_base.zw, t10.zw, t01.zw, t11.zw, frac);
accum = i16x4(accum_chroma, accum_luma);
}
else
accum = t_base;
}
else if (mid_texel)
{
accum = (t_base + t01 + t10 + t11 + I16_C(2)) >> I16_C(2);
}
else
{
i16x2 flip_frac = i16x2(sum_frac >= 32 ? (32 - frac.yx) : frac);
accum = (t10 - t_base) * flip_frac.x;
accum += (t01 - t_base) * flip_frac.y;
accum += I16_C(0x10);
accum >>= I16_C(5);
accum += t_base;
}
return accum;
}
// Computes the LOD fraction and rewrites tile0/tile1 to the two tile indices used
// for 2-cycle texturing, emulating the RDP's texture LOD pipeline.
// - tile0/tile1: in/out tile indices; offset by the computed mip level when tex_lod_en.
// - lod_frac: out 0.8 fixed-point interpolation factor between the two levels.
// - max_level: highest available mip level; min_lod clamps the fraction when magnifying.
// - st: texel coordinate at this pixel; st_dx/st_dy: coordinates one pixel over in X/Y,
//   used to estimate the screen-space derivative.
// - perspective_overflow forces the "fully distant" result (lod_frac = 0xff).
// - sharpen_tex_en/detail_tex_en select the RDP sharpen/detail LOD modes.
//   NOTE(review): mode semantics inferred from structure; confirm against RDP LOD docs.
void compute_lod_2cycle(inout uint tile0, inout uint tile1, out i16 lod_frac, uint max_level, int min_lod,
ivec2 st, ivec2 st_dx, ivec2 st_dy,
bool perspective_overflow, bool tex_lod_en, bool sharpen_tex_en, bool detail_tex_en)
{
bool magnify = false;
bool distant = false;
uint tile_offset = 0;
if (perspective_overflow)
{
// W overflow: treat as maximally distant.
distant = true;
lod_frac = i16(0xff);
}
else
{
ivec2 dx = st_dx - st;
// Kinda abs, except it's 1 less than expected if negative.
dx ^= dx >> 31;
ivec2 dy = st_dy - st;
// Kinda abs, except it's 1 less than expected if negative.
dy ^= dy >> 31;
// The largest per-axis ST delta drives the LOD estimate.
ivec2 max_d2 = max(dx, dy);
int max_d = max(max_d2.x, max_d2.y);
if (max_d >= 0x4000)
{
// Derivative too large to represent: clamp to the most distant mip.
distant = true;
lod_frac = i16(0xff);
tile_offset = max_level;
}
else if (max_d < 32) // LOD < 0
{
distant = max_level == 0u;
magnify = true;
if (!sharpen_tex_en && !detail_tex_en)
lod_frac = i16(distant ? 0xff : 0);
else
// Sharpen mode biases the fraction negative (-0x100) while magnifying.
lod_frac = i16((max(min_lod, max_d) << 3) + (sharpen_tex_en ? -0x100 : 0));
}
else
{
// floor(log2(max_d / 32)) selects the mip level; 32 texel units == LOD 0.
int mip_base = max(findMSB(max_d >> 5), 0);
distant = mip_base >= max_level;
if (distant && !sharpen_tex_en && !detail_tex_en)
{
lod_frac = i16(0xff);
}
else
{
// Fractional position within the selected mip, 0.8 fixed point.
lod_frac = i16(((max_d << 3) >> mip_base) & 0xff);
tile_offset = mip_base;
}
}
}
if (tex_lod_en)
{
if (distant)
tile_offset = max_level;
if (!detail_tex_en)
{
// Normal/sharpen mode: tile1 is the next level up unless clamped.
tile0 = (tile0 + tile_offset) & 7u;
if (distant || (!sharpen_tex_en && magnify))
tile1 = tile0;
else
tile1 = (tile0 + 1) & 7;
}
else
{
// Detail mode: tile0 holds the detail texture, so both indices shift by one.
tile1 = (tile0 + tile_offset + ((distant || magnify) ? 1 : 2)) & 7u;
tile0 = (tile0 + tile_offset + (magnify ? 0 : 1)) & 7u;
}
}
}
// Applies the texture convert factors to a texel, as used by the RDP convert stage.
// Each input channel is first sign-extended to 9 bits; the blue channel acts as the
// base term added to every output channel, and the factors weight red/green with
// 0.8 fixed-point rounding (+0x80 before the >> 8).
i16x4 texture_convert_factors(i16x4 texel_in, i16x4 factors)
{
    // Sign-extend all four channels to 9 bits.
    ivec4 c = bitfieldExtract(ivec4(texel_in), 0, 9);
    int base = c.b;
    int out_r = base + ((factors.x * c.g + 0x80) >> 8);
    int out_g = base + ((factors.y * c.r + factors.z * c.g + 0x80) >> 8);
    int out_b = base + ((factors.w * c.r + 0x80) >> 8);
    // Alpha passes through the base term unmodified.
    return i16x4(out_r, out_g, out_b, base);
}
#endif

View File

@ -0,0 +1,274 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
// Consumes result from tile_binning_prepass.comp, bins at a finer resolution (8x8 or 16x16 blocks).
#include "small_types.h"
#if SUBGROUP
#extension GL_KHR_shader_subgroup_basic : require
#extension GL_KHR_shader_subgroup_vote : require
#extension GL_KHR_shader_subgroup_ballot : require
#extension GL_KHR_shader_subgroup_arithmetic : require
layout(local_size_x_id = 0) in;
#else
// Reasonable default. For AMD (64 threads), subgroups are definitely supported, so this won't be hit.
layout(local_size_x = 32) in;
#endif
#include "debug.h"
#include "data_structures.h"
#include "binning.h"
layout(constant_id = 1) const int TILE_WIDTH = 8;
layout(constant_id = 2) const int TILE_HEIGHT = 8;
layout(constant_id = 3) const int MAX_PRIMITIVES = 256;
layout(constant_id = 4) const int MAX_WIDTH = 1024;
layout(constant_id = 5) const int TILE_INSTANCE_STRIDE = 0x8000;
layout(constant_id = 6) const int SCALE_FACTOR = 1;
const int TILE_BINNING_STRIDE = MAX_PRIMITIVES / 32;
const int MAX_TILES_X = MAX_WIDTH / TILE_WIDTH;
layout(set = 0, binding = 0, std430) readonly buffer TriangleSetupBuffer
{
TriangleSetupMem elems[];
} triangle_setup;
#include "load_triangle_setup.h"
layout(set = 0, binding = 1, std430) readonly buffer ScissorStateBuffer
{
ScissorStateMem elems[];
} scissor_state;
#include "load_scissor_state.h"
layout(set = 0, binding = 2, std430) readonly buffer StateIndicesBuffer
{
InstanceIndicesMem elems[];
} state_indices;
layout(std430, set = 0, binding = 3) writeonly buffer TileBitmask
{
uint binned_bitmask[];
};
layout(std430, set = 0, binding = 4) writeonly buffer TileBitmaskCoarse
{
uint binned_bitmask_coarse[];
};
#if !UBERSHADER
layout(std430, set = 0, binding = 5) writeonly buffer TileInstanceOffset
{
uint elems[];
} tile_instance_offsets;
layout(std430, set = 0, binding = 6) buffer IndirectBuffer
{
uvec4 elems[];
} indirect_counts;
// This can actually be uint16_t, but AMD doesn't seem to support loading uint16_t in SMEM unit,
// the memory traffic for this data structure is not relevant anyways.
struct TileRasterWork
{
uint tile_x, tile_y;
uint tile_instance;
uint primitive;
};
layout(std430, set = 0, binding = 7) writeonly buffer WorkList
{
uvec4 elems[];
} tile_raster_work;
#endif
#if !UBERSHADER
// Reserves one slot in the work list for the given shader variant and returns the
// slot index. variant_index must be uniform across the active lanes for the merged
// subgroup path to be correct (each lane then receives base + its rank among the
// active lanes).
uint allocate_work_offset(uint variant_index)
{
#if !SUBGROUP
return atomicAdd(indirect_counts.elems[variant_index].x, 1u);
#else
// Merge atomic operations. Compiler would normally do this,
// but it might not have figured out that variant_index is uniform.
uvec4 active_mask = subgroupBallot(true);
uint count = subgroupBallotBitCount(active_mask);
uint work_offset = 0u;
// One lane performs a single atomicAdd for the whole subgroup.
if (subgroupElect())
work_offset = atomicAdd(indirect_counts.elems[variant_index].x, count);
work_offset = subgroupBroadcastFirst(work_offset);
// Distribute contiguous slots: base + number of active lanes before this one.
work_offset += subgroupBallotExclusiveBitCount(active_mask);
return work_offset;
#endif
}
#endif
layout(push_constant, std430) uniform Registers
{
uvec2 resolution;
int primitive_count;
} fb_info;
#if !SUBGROUP
shared uint merged_mask_shared;
#endif
// Fine-grained binning pass. Each workgroup covers an 8 x TILES_Y block of tiles
// within a "meta tile"; each thread owns one tile. The first 32 lanes conservatively
// bin one primitive each against the whole meta-tile region, then every thread
// re-tests the survivors against its own tile and writes the per-tile primitive
// bitmask. In the non-ubershader path, per-tile work items are also appended to
// per-variant work lists for the specialized rasterizer shaders.
void main()
{
int group_index = int(gl_WorkGroupID.x);
ivec2 meta_tile = ivec2(gl_WorkGroupID.yz);
const int TILES_X = 8;
const int TILES_Y = int(gl_WorkGroupSize.x) >> 3;
#if SUBGROUP
// Spec is unclear how gl_LocalInvocationIndex is mapped to gl_SubgroupInvocationID, so synthesize our own.
// We know the subgroups are fully occupied with VK_EXT_subgroup_size_control already.
int local_index = int(gl_SubgroupInvocationID);
int SUBGROUP_TILES_Y = int(gl_SubgroupSize) >> 3;
#else
int local_index = int(gl_LocalInvocationIndex);
#endif
// 8 tiles per row: low 3 bits select X, the rest select Y.
int inner_tile_x = local_index & 7;
int inner_tile_y = local_index >> 3;
#if SUBGROUP
inner_tile_y += SUBGROUP_TILES_Y * int(gl_SubgroupID);
#endif
ivec2 tile = meta_tile * ivec2(TILES_X, TILES_Y) + ivec2(inner_tile_x, inner_tile_y);
int linear_tile = tile.y * MAX_TILES_X + tile.x;
// Pixel bounds of the whole meta-tile (used for the coarse 32-primitive test).
ivec2 base_coord_meta = meta_tile * ivec2(TILE_WIDTH * TILES_X, TILE_HEIGHT * TILES_Y);
#if SUBGROUP
base_coord_meta.y += SUBGROUP_TILES_Y * TILE_HEIGHT * int(gl_SubgroupID);
ivec2 end_coord_meta = min(base_coord_meta + ivec2(TILE_WIDTH * TILES_X, TILE_HEIGHT * SUBGROUP_TILES_Y), ivec2(fb_info.resolution)) - 1;
#else
ivec2 end_coord_meta = min(base_coord_meta + ivec2(TILE_WIDTH * TILES_X, TILE_HEIGHT * TILES_Y), ivec2(fb_info.resolution)) - 1;
#endif
// Pixel bounds of this thread's own tile (used for the fine test).
ivec2 base_coord = tile * ivec2(TILE_WIDTH, TILE_HEIGHT);
ivec2 end_coord = min(base_coord + ivec2(TILE_WIDTH, TILE_HEIGHT), ivec2(fb_info.resolution)) - 1;
int primitive_count = fb_info.primitive_count;
#if !SUBGROUP
if (local_index == 0)
merged_mask_shared = 0u;
barrier();
#endif
// Phase 1: lanes 0..31 each test one primitive of this 32-primitive group
// against the meta-tile to build a coarse survivor mask.
bool binned = false;
if (local_index < 32)
{
uint primitive_index = group_index * 32 + local_index;
if (primitive_index < primitive_count)
{
ScissorState scissor = load_scissor_state(primitive_index);
ivec2 clipped_base_coord = max(base_coord_meta, SCALE_FACTOR * (ivec2(scissor.xlo, scissor.ylo) >> 2) - 1);
ivec2 clipped_end_coord = min(end_coord_meta, SCALE_FACTOR * (ivec2(scissor.xhi + 3, scissor.yhi + 3) >> 2) - 1);
TriangleSetup setup = load_triangle_setup(primitive_index);
binned = bin_primitive(setup, clipped_base_coord, clipped_end_coord, SCALE_FACTOR);
}
}
#if SUBGROUP
uint merged_mask = subgroupBallot(binned).x;
#else
if (binned)
atomicOr(merged_mask_shared, 1u << local_index);
barrier();
uint merged_mask = merged_mask_shared;
#endif
// Phase 2: every thread re-tests the coarse survivors against its own tile.
uint binned_mask = 0u;
while (merged_mask != 0u)
{
int bit = findLSB(merged_mask);
merged_mask &= ~(1u << bit);
uint primitive_index = group_index * 32 + bit;
ScissorState scissor = load_scissor_state(primitive_index);
ivec2 clipped_base_coord = max(base_coord, SCALE_FACTOR * (ivec2(scissor.xlo, scissor.ylo) >> 2));
ivec2 clipped_end_coord = min(end_coord, SCALE_FACTOR * (ivec2(scissor.xhi + 3, scissor.yhi + 3) >> 2) - 1);
TriangleSetup setup = load_triangle_setup(primitive_index);
if (bin_primitive(setup, clipped_base_coord, clipped_end_coord, SCALE_FACTOR))
binned_mask |= 1u << bit;
}
// Publish the fine bitmask and keep the coarse per-tile summary bit in sync.
binned_bitmask[linear_tile * TILE_BINNING_STRIDE + group_index] = binned_mask;
if (binned_mask != 0u)
atomicOr(binned_bitmask_coarse[linear_tile], 1u << group_index);
else
atomicAnd(binned_bitmask_coarse[linear_tile], ~(1u << group_index));
#if SUBGROUP
#if !UBERSHADER
// Allocate contiguous tile-instance slots for all lanes of the subgroup at once.
uint bit_count = uint(bitCount(binned_mask));
uint instance_offset = 0u;
if (subgroupAny(bit_count != 0u))
{
// Allocate tile instance space for all threads in subgroup in one go.
uint total_bit_count = subgroupAdd(bit_count);
if (subgroupElect())
if (total_bit_count != 0u)
instance_offset = atomicAdd(indirect_counts.elems[0].w, total_bit_count);
instance_offset = subgroupBroadcastFirst(instance_offset);
// Exclusive prefix sum gives each lane its own base slot.
instance_offset = subgroupInclusiveAdd(bit_count) - bit_count;
}
#endif
#else
#if !UBERSHADER
uint bit_count = uint(bitCount(binned_mask));
uint instance_offset = 0u;
if (bit_count != 0u)
instance_offset = atomicAdd(indirect_counts.elems[0].w, bit_count);
#endif
#endif
#if !UBERSHADER
if (bit_count != 0u)
tile_instance_offsets.elems[linear_tile * TILE_BINNING_STRIDE + group_index] = instance_offset;
#if SUBGROUP
uint variant_mask = subgroupOr(binned_mask);
#else
uint variant_mask = binned_mask;
#endif
// Emit one work item per binned primitive into that primitive's variant list.
while (variant_mask != 0u)
{
int bit = findLSB(variant_mask);
variant_mask &= ~(1u << bit);
int primitive_index = group_index * 32 + bit;
if ((binned_mask & (1u << bit)) != 0u)
{
uint variant_index = uint(state_indices.elems[primitive_index].static_depth_tmem.x);
uint work_offset = allocate_work_offset(variant_index);
tile_raster_work.elems[work_offset + uint(TILE_INSTANCE_STRIDE) * variant_index] =
uvec4(tile.x, tile.y, instance_offset, primitive_index);
instance_offset++;
}
}
#endif
}

View File

@ -0,0 +1,577 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "debug.h"
#include "small_types.h"
layout(local_size_x_id = 0) in;
layout(set = 0, binding = 0, std430) readonly buffer VRAM8Buffer
{
mem_u8 data[];
} vram8;
layout(set = 0, binding = 0, std430) readonly buffer VRAM16Buffer
{
mem_u16 data[];
} vram16;
layout(set = 0, binding = 0, std430) readonly buffer VRAM32Buffer
{
uint data[];
} vram32;
layout(set = 0, binding = 1, std430) buffer TMEM16Buffer
{
mem_u16 data[2048];
} tmem16;
struct TileInstance
{
mem_u16 data[2048];
};
layout(set = 0, binding = 2, std430) writeonly buffer TMEMInstances
{
TileInstance instances[];
} tile_instances;
layout(push_constant, std430) uniform Registers
{
int num_uploads;
} registers;
const int TEXTURE_FMT_RGBA = 0;
const int TEXTURE_FMT_YUV = 1;
const int TEXTURE_FMT_CI = 2;
const int TEXTURE_FMT_IA = 3;
const int TEXTURE_FMT_I = 4;
const int UPLOAD_MODE_TILE = 0;
const int UPLOAD_MODE_TLUT = 1;
const int UPLOAD_MODE_BLOCK = 2;
struct UploadInfo
{
int width, height;
float min_t_mod, max_t_mod;
int vram_addr;
int vram_width;
int vram_size;
int vram_effective_width;
int tmem_offset;
int tmem_stride_words;
int tmem_size;
int tmem_fmt;
int mode;
float inv_tmem_stride_words;
int dxt;
int padding;
};
layout(set = 1, binding = 0, std140) uniform UploadInfos
{
UploadInfo upload_info[256];
};
bool tmem_dirty;
uint current_tmem_value;
// Computes floor(offset / stride) given the precomputed reciprocal of the stride.
// The +0.5 bias keeps the truncated result exact for all relevant inputs while
// being much faster than an integer divide.
int compute_upload_t(int offset, float inv_stride)
{
    float biased = float(offset) + 0.5;
    return int(biased * inv_stride);
}
// In 32bpp upload mode we read 64 bits and split the result over the lower and upper TMEM.
void update_tmem_32(UploadInfo info, int tmem16_index, bool upper_tmem, bool yuv)
{
int tmem16_offset = (info.tmem_offset & 0x7ff) >> 1;
int tmem16_stride = info.tmem_stride_words;
int pixel_offset = (tmem16_index - tmem16_offset) & 0x3ff;
int upload_x, upload_y;
int upload_x_xor = 0;
if (info.mode == UPLOAD_MODE_BLOCK)
{
int word_offset = pixel_offset >> 1;
if (info.tmem_stride_words == 0)
{
// Trivial case, we can just compute T factor directly and set upload_x_xor.
// Other than that, it works like a simple 1D upload.
// However, if DxT is weird, we might end up in a situation where this word is written multiple times,
// or zero times.
int iteration_candidate_first = word_offset & ~1;
int iteration_candidate_second = iteration_candidate_first + 1;
int first_t = (iteration_candidate_first * info.dxt) >> 16;
int second_t = (iteration_candidate_second * info.dxt) >> 16;
if (first_t != second_t)
{
int iteration_candidate_first_write_index = iteration_candidate_first ^ (first_t & 1);
int iteration_candidate_second_write_index = iteration_candidate_second ^ (second_t & 1);
if (iteration_candidate_second_write_index == word_offset)
upload_x_xor = (second_t & 1) << 1;
else if (iteration_candidate_first_write_index == word_offset)
upload_x_xor = (first_t & 1) << 1;
else
return;
}
else
upload_x_xor ^= (first_t & 1) << 1;
}
else
{
// Welp ... This is pure insanity, but if we want to be completely correct ...
int min_t = compute_upload_t(word_offset & ~1, info.min_t_mod);
int max_t = compute_upload_t(word_offset | 1, info.max_t_mod);
// If t has a range, then the solution to Y = (t = floor(X * dt / 2048)) * stride + X has a range space of:
// Y - t_max * stride <= X <= Y - t_min * stride.
int max_word_candidate = (word_offset | 1) - tmem16_stride * min_t;
int min_word_candidate = (word_offset & ~1) - tmem16_stride * max_t;
// If we have constraints for X, we constraint T further.
min_t = max(min_t, (min_word_candidate * info.dxt) >> 16);
max_t = min(max_t, (max_word_candidate * info.dxt) >> 16);
bool found_candidate = false;
for (int t = max_t; t >= min_t; t--)
{
// Check to see if t is a solution to the equation.
// Potentially two targets could write here.
int candidate_solution_first = (word_offset & ~1) - tmem16_stride * t;
int candidate_solution_second = (word_offset | 1) - tmem16_stride * t;
int candidate_t_first = (candidate_solution_first * info.dxt) >> 16;
int candidate_t_second = (candidate_solution_second * info.dxt) >> 16;
if (((candidate_solution_second + candidate_t_second * tmem16_stride) ^ (candidate_t_second & 1)) == word_offset)
{
found_candidate = true;
pixel_offset = (candidate_solution_second << 1) + (pixel_offset & 1);
break;
}
else if (((candidate_solution_first + candidate_t_first * tmem16_stride) ^ (candidate_t_first & 1)) == word_offset)
{
found_candidate = true;
pixel_offset = (candidate_solution_first << 1) + (pixel_offset & 1);
break;
}
}
// We strided over this 64bpp word.
if (!found_candidate)
return;
}
upload_x = pixel_offset;
upload_y = 0;
}
else if (tmem16_stride == 0)
{
// For TMEM stride of 0 we're essentially replaying the same line over and over and the final visible result
// is what happened in Y == height - 1.
upload_x = pixel_offset;
upload_y = info.height - 1;
}
else
{
upload_y = compute_upload_t(pixel_offset, info.inv_tmem_stride_words);
upload_x = pixel_offset - upload_y * tmem16_stride;
// If stride is smaller than width, we'll need to unroll the last line.
if (upload_y >= info.height)
{
upload_x += tmem16_stride * (upload_y - info.height + 1);
upload_y = info.height - 1;
}
}
int last_line_upload_x = upload_x ^ ((upload_y & 1) << 1);
if (last_line_upload_x >= info.width && upload_y > 0)
{
// If the last line won't trigger a write, the previous line probably did.
upload_y--;
upload_x += tmem16_stride;
}
int iteration_offset;
upload_x ^= ((upload_y & 1) << 1) | upload_x_xor;
if (info.vram_size == 3 || yuv)
{
iteration_offset = 4 * (upload_x & ~1);
}
else if (info.vram_size == 2)
{
// In 16bpp VRAM mode, we are supposed to step 4 pixels at a time (8 bytes), which will form 2 complete pixels.
// However, in 32bpp tile mode we're not shifting the X value appropriately.
// So, we're writing texels [0, 1, ..., 4, 5, ...], etc.
if ((upload_x & 2) != 0)
{
// We're not writing in this line, but the previous line might have!
// Interleaving patterns will form ...
if (upload_y > 0)
{
upload_y--;
upload_x += tmem16_stride;
upload_x ^= 2;
}
else
{
// These 2 words will never be written to.
return;
}
}
iteration_offset = 2 * (upload_x & ~1);
}
else if (info.vram_size == 1)
{
// 4 potential mirrors.
for (int i = 0; i < 4 && upload_y > 0 && (upload_x & 6) != 0; i++)
{
upload_y--;
upload_x += tmem16_stride;
upload_x ^= 2;
}
if ((upload_x & 6) != 0)
{
// These 6 words will never be written to.
return;
}
iteration_offset = upload_x & ~1;
}
if (upload_x >= info.width)
return;
int line_rdram_addr = info.vram_addr + ((upload_y * info.vram_width) << (info.vram_size - 1));
// The loading pipeline reads 64 bits per iteration.
int rdram_addr = line_rdram_addr + iteration_offset + 4 * (upload_x & 1);
uint word;
if ((rdram_addr & 3) == 0)
{
word = uint(vram32.data[rdram_addr >> 2]);
}
else
{
word = (uint(vram8.data[rdram_addr ^ 3]) << 24) |
(uint(vram8.data[(rdram_addr + 1) ^ 3]) << 16) |
(uint(vram8.data[(rdram_addr + 2) ^ 3]) << 8) |
uint(vram8.data[(rdram_addr + 3) ^ 3]);
}
if (yuv)
{
// Lower TMEM receives interleaved UV samples, while upper receives Y.
if (upper_tmem)
{
uint y0 = (word >> 16u) & 0xffu;
uint y1 = (word >> 0u) & 0xffu;
word = (y0 << 8u) | y1;
}
else
{
uint u = (word >> 24u) & 0xffu;
uint v = (word >> 8u) & 0xffu;
word = (u << 8u) | v;
}
}
else
{
word >>= 16u - 16u * uint(upper_tmem);
word &= 0xffffu;
}
current_tmem_value = word;
tmem_dirty = true;
}
// Scatter-as-gather replay of a <=16bpp TMEM upload for the single 16-bit word
// owned by this invocation: work backwards from our word index to the upload
// iteration that last wrote it (if any), then fetch the matching 16 bits from
// RDRAM. On a write, sets current_tmem_value and tmem_dirty rather than touching
// memory directly.
void update_tmem_16(UploadInfo info, int tmem16_index)
{
int tmem16_offset = (info.tmem_offset & 0xfff) >> 1;
int tmem16_stride = info.tmem_stride_words;
// Word index relative to the upload's TMEM base, wrapped to the 4 KiB TMEM.
int pixel_offset = (tmem16_index - tmem16_offset) & 0x7ff;
int upload_x, upload_y;
int upload_x_xor = 0;
if (info.mode == UPLOAD_MODE_BLOCK)
{
// LoadBlock: one long line where T advances by DxT per 64-bit word.
int word_offset = pixel_offset >> 2;
if (info.tmem_stride_words == 0)
{
// Trivial case, we can just compute T factor directly and set upload_x_xor.
// Other than that, it works like a simple 1D upload.
upload_x_xor = (((word_offset * info.dxt) >> 16) & 1) << 1;
}
else
{
// Welp ... This is pure insanity, but if we want to be completely correct ...
int min_t = compute_upload_t(word_offset, info.min_t_mod);
int max_t = compute_upload_t(word_offset, info.max_t_mod);
// If t has a range, then the solution to Y = (t = floor(X * dt / 2048)) * stride + X has a range space of:
// Y - t_max * stride <= X <= Y - t_min * stride.
int max_word_candidate = word_offset - tmem16_stride * min_t;
int min_word_candidate = word_offset - tmem16_stride * max_t;
// If we have constraints for X, we constraint T further.
min_t = max(min_t, (min_word_candidate * info.dxt) >> 16);
max_t = min(max_t, (max_word_candidate * info.dxt) >> 16);
bool found_candidate = false;
// Search from the highest T downwards; the last matching solution wins.
for (int t = max_t; t >= min_t; t--)
{
// Check to see if t is a solution to the equation.
int candidate_solution = word_offset - tmem16_stride * t;
int computed_t = (candidate_solution * info.dxt) >> 16;
if (candidate_solution + computed_t * tmem16_stride == word_offset)
{
found_candidate = true;
upload_x_xor = (computed_t & 1) << 1;
pixel_offset = (candidate_solution << 2) + (pixel_offset & 3);
}
}
// We strided over this 64bpp word.
if (!found_candidate)
return;
}
upload_x = pixel_offset;
upload_y = 0;
}
else if (tmem16_stride == 0)
{
// For TMEM stride of 0 we're essentially replaying the same line over and over and the final visible result
// is what happened in Y == height - 1.
upload_x = pixel_offset;
upload_y = info.height - 1;
}
else
{
// Recover (x, y) from the linear TMEM offset.
upload_y = compute_upload_t(pixel_offset, info.inv_tmem_stride_words);
upload_x = pixel_offset - upload_y * tmem16_stride;
// If stride is smaller than width, we'll need to unroll the last line.
if (upload_y >= info.height)
{
upload_x += tmem16_stride * (upload_y - info.height + 1);
upload_y = info.height - 1;
}
}
// This is pure bullshit magic which arises as an edge case when
// tile pixel size does not match texture image size.
// Should not happen in normal applications.
// This is basically doing scatter-as-gather, so we need to figure out
// if there is no write to our texel after all (striding), or if there are multiple writes
// to our texel, in which case we need to figure out the last writer.
// This code is black magic, and it's made with blood, sweat and tears from testing with lots of trial and error.
// NOTE(review): iteration_offset stays uninitialized when tmem_size != vram_size
// and neither mismatch branch below matches (e.g. vram_size - tmem_size >= 2) --
// presumably unreachable for 16bpp-path uploads; confirm.
int iteration_offset;
if (info.tmem_size != info.vram_size)
{
if (info.vram_size - info.tmem_size == 1)
{
// If TMEM is N bpp but VRAM is 2N bpp, we will get mirrored writes here.
// Select which half of the 2N bpp load we observe in TMEM.
iteration_offset = (upload_x & ~3) * 4;
if ((upload_x & ~3) + 2 < (info.vram_effective_width >> (3 - info.vram_size)))
iteration_offset += 8;
}
else if (info.tmem_size == 2 && info.vram_size == 1)
{
// In 8bpp VRAM mode, we are supposed to step 8 pixels at a time (8 bytes), which will form 4 complete pixels.
// However, in 16bpp tile mode we're not shifting the X value appropriately.
// So, we're writing texels [0, 1, 2, 3, ..., 8, 9, 10, 11], etc.
if ((upload_x & 4) != 0)
{
// We're not writing in this line, but the previous line might have!
// Interleaving patterns will form ...
if ((tmem16_stride & 4) != 0 && upload_y > 0)
{
upload_y--;
upload_x += tmem16_stride;
}
else
{
// These 4 words will never be written to.
return;
}
}
iteration_offset = upload_x & ~3;
}
}
else
{
// Normal case TMEM size aligns with VRAM size.
iteration_offset = (upload_x & ~3) * 2;
}
if (upload_x >= info.width)
return;
// Byte address of the source line in RDRAM.
int line_rdram_addr = info.vram_addr + ((upload_y * info.vram_width) << (info.vram_size - 1));
// Apply the odd-line word swizzle plus any block-mode T swizzle.
upload_x ^= ((upload_y & 1) << 1) | upload_x_xor;
// The loading pipeline reads 64 bits per iteration.
int rdram_addr = line_rdram_addr + iteration_offset + 2 * (upload_x & 3);
uint word;
if ((rdram_addr & 1) == 0)
word = uint(vram16.data[(rdram_addr >> 1) ^ 1]);
else
// Unaligned: assemble the 16-bit word from byte-swapped RDRAM bytes.
word = (uint(vram8.data[rdram_addr ^ 3]) << 8) | uint(vram8.data[(rdram_addr + 1) ^ 3]);
current_tmem_value = word;
tmem_dirty = true;
}
// Scatter-as-gather replay of a TLUT (LoadTlut) upload for this invocation's
// 16-bit TMEM word. TLUT loads splat each palette entry across 4 consecutive
// TMEM words; depending on the tile-size / VRAM-size combination, only certain
// word offsets are ever written, and the rest return early. On a write, sets
// current_tmem_value and tmem_dirty rather than touching memory directly.
void update_tmem_lut(UploadInfo info, int tmem16_index)
{
int tmem16_offset = (info.tmem_offset & 0xfff) >> 1;
// Word index relative to the upload's TMEM base, wrapped to the 4 KiB TMEM.
int pixel_offset = (tmem16_index - tmem16_offset) & 0x7ff;
int pixel_offset_splat;
if (info.vram_size - info.tmem_size == 2)
{
// VRAM entries are 4x the tile size: every group of 4 words maps to one entry.
pixel_offset_splat = pixel_offset >> 2;
pixel_offset_splat <<= info.vram_size - 2;
if (pixel_offset_splat >= info.vram_effective_width)
return;
}
else if (info.vram_size - info.tmem_size == 1)
{
// VRAM entries are 2x the tile size: only the lower half of each 8-word group is written.
if ((pixel_offset & 4) == 0)
{
int shamt = info.tmem_size + (info.vram_size == 2 ? 2 : 0);
pixel_offset_splat = (pixel_offset & ~7) >> shamt;
if (pixel_offset_splat >= info.vram_effective_width)
return;
}
else
{
return;
}
}
else if (info.vram_size == info.tmem_size)
{
// Matching sizes: only the first word of each 4-word splat group is written.
if ((pixel_offset & 0xc) == 0)
{
int shamt = info.tmem_size + (info.vram_size == 2 ? 2 : 0);
pixel_offset_splat = (pixel_offset & ~3) >> shamt;
if (pixel_offset_splat >= info.vram_effective_width)
return;
}
else
{
return;
}
}
else if (info.vram_size - info.tmem_size == -1)
{
// Tile is wider than VRAM entries: only the first word of each 8-word group is written.
if ((pixel_offset & 0x1c) == 0)
{
int shamt = info.tmem_size;
pixel_offset_splat = (pixel_offset >> shamt) & ~7;
if (pixel_offset_splat >= info.vram_effective_width)
return;
}
else
{
return;
}
}
else
{
// 4bpp tile, 32bpp VRAM. Mirrored writes.
int span_iteration = pixel_offset >> 2;
span_iteration = span_iteration * 2;
int span_pixel = span_iteration * 2;
if (span_pixel + 2 < info.vram_effective_width)
span_pixel += 2;
if (span_pixel >= info.vram_effective_width)
return;
pixel_offset_splat = span_pixel;
}
// Byte address of the palette entry in RDRAM.
int rdram_addr = info.vram_addr + (pixel_offset_splat << (info.vram_size - 1));
// Odd behavior when we have unaligned TLUT uploads.
rdram_addr += 2 * (rdram_addr & 1) * (pixel_offset & 3);
uint word;
if ((rdram_addr & 1) == 0)
word = uint(vram16.data[(rdram_addr >> 1) ^ 1]);
else
// Unaligned: assemble the 16-bit word from byte-swapped RDRAM bytes.
word = (uint(vram8.data[rdram_addr ^ 3]) << 8) | uint(vram8.data[(rdram_addr + 1) ^ 3]);
current_tmem_value = word;
tmem_dirty = true;
}
// Replays the batch of queued TMEM uploads on top of the persistent TMEM state.
// Each invocation owns one 16-bit TMEM word. After each upload i, a snapshot of
// the full TMEM is written to tile_instances.instances[i + 1] (instance 0 is the
// state before any upload in this batch). The live TMEM buffer is only written
// back if this word actually changed.
void main()
{
    tmem_dirty = false;
    current_tmem_value = uint(tmem16.data[gl_GlobalInvocationID.x]);
    // The ^ 1 swaps adjacent 16-bit words -- presumably matching how logical TMEM
    // word indices map onto the storage order of tmem16; confirm against writers.
    int tmem16_index = int(gl_GlobalInvocationID.x) ^ 1;
    // TMEM is 2048 16-bit words; the upper bank starts at word 0x400.
    bool upper_tmem = tmem16_index >= 0x400;
    tile_instances.instances[0].data[gl_GlobalInvocationID.x] = mem_u16(current_tmem_value);
    int num_uploads = registers.num_uploads;
    for (int i = 0; i < num_uploads; i++)
    {
        UploadInfo info = upload_info[i];
        if (info.mode == UPLOAD_MODE_TLUT)
        {
            update_tmem_lut(info, tmem16_index);
        }
        else
        {
            bool yuv = info.tmem_fmt == TEXTURE_FMT_YUV;
            if (info.tmem_size == 3 || yuv)
                update_tmem_32(info, tmem16_index & 0x3ff, upper_tmem, yuv);
            else
                // Previously guarded by (tmem_fmt != TEXTURE_FMT_YUV), which is a
                // tautology on this branch since !yuv already holds here.
                update_tmem_16(info, tmem16_index);
        }
        // Snapshot TMEM contents after upload i.
        tile_instances.instances[i + 1].data[gl_GlobalInvocationID.x] = mem_u16(current_tmem_value);
    }
    if (tmem_dirty)
        tmem16.data[gl_GlobalInvocationID.x] = mem_u16(current_tmem_value);
}

View File

@ -0,0 +1,103 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
// RIP to any GPU which attempts to execute this monstrosity :)
#if SUBGROUP
#extension GL_KHR_shader_subgroup_basic : require
#extension GL_KHR_shader_subgroup_vote : require
#extension GL_KHR_shader_subgroup_ballot : require
#extension GL_KHR_shader_subgroup_arithmetic : require
#endif
#include "small_types.h"
layout(local_size_x_id = 3, local_size_y_id = 4) in;
#include "debug.h"
#include "data_structures_buffers.h"
#include "noise.h"
#include "memory_interfacing.h"
#include "shading.h"
layout(push_constant, std430) uniform Registers
{
uint fb_addr_index;
uint fb_depth_addr_index;
uint fb_width;
uint fb_height;
uint group_mask;
} registers;
layout(constant_id = 5) const int MAX_PRIMITIVES = 256;
layout(constant_id = 6) const int MAX_WIDTH = 1024;
const int TILE_BINNING_STRIDE = MAX_PRIMITIVES / 32;
const int MAX_TILES_X = MAX_WIDTH / int(gl_WorkGroupSize.x);
// Per-pixel shading entry point for one workgroup-sized tile.
// Walks the two-level binning bitmasks for this tile and shades every
// primitive binned here, in primitive submission order.
void main()
{
    int x = int(gl_GlobalInvocationID.x);
    int y = int(gl_GlobalInvocationID.y);
    ivec2 tile = ivec2(gl_WorkGroupID.xy);
    int linear_tile = tile.x + tile.y * MAX_TILES_X;
    int linear_tile_base = linear_tile * TILE_BINNING_STRIDE;
    // Coarse mask: one bit per group of 32 primitives. group_mask restricts
    // this pass to the primitive groups belonging to the current dispatch.
    uint coarse_binned = tile_binning_coarse.elems[linear_tile] & registers.group_mask;
    if (coarse_binned == 0u)
        return;
    init_tile(gl_GlobalInvocationID.xy,
              registers.fb_width, registers.fb_height,
              registers.fb_addr_index, registers.fb_depth_addr_index);
    while (coarse_binned != 0u)
    {
        int mask_index = findLSB(coarse_binned);
        coarse_binned &= ~uint(1 << mask_index);
        // Fine mask: one bit per primitive within the 32-primitive group.
        uint binned = tile_binning.elems[linear_tile_base + mask_index];
        while (binned != 0u)
        {
            int i = findLSB(binned);
            binned &= ~uint(1 << i);
            uint primitive_index = uint(i + 32 * mask_index);
            ShadedData shaded;
            if (shade_pixel(x, y, primitive_index, shaded))
            {
                // FILL and COPY pipeline modes bypass the normal
                // depth-blend path entirely.
                if ((shaded.coverage_count & COVERAGE_FILL_BIT) != 0)
                    fill_color(derived_setup.elems[primitive_index].fill_color);
                else if ((shaded.coverage_count & COVERAGE_COPY_BIT) != 0)
                    copy_pipeline(shaded.z_dith, primitive_index);
                else
                    depth_blend(x, y, primitive_index, shaded);
            }
        }
    }
    finish_tile(gl_GlobalInvocationID.xy,
                registers.fb_width, registers.fb_height,
                registers.fb_addr_index, registers.fb_depth_addr_index);
}

View File

@ -0,0 +1,119 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "small_types.h"
#include "fb_formats.h"
layout(local_size_x_id = 3) in;
layout(constant_id = 0) const int RDRAM_SIZE = 8 * 1024 * 1024;
const int RDRAM_MASK_8 = RDRAM_SIZE - 1;
const int RDRAM_MASK_16 = RDRAM_MASK_8 >> 1;
const int RDRAM_MASK_32 = RDRAM_MASK_8 >> 2;
layout(constant_id = 1) const int FB_SIZE_LOG2 = 0;
layout(constant_id = 2) const bool COLOR_DEPTH_ALIAS = false;
layout(constant_id = 4) const int NUM_SAMPLES = 1;
layout(push_constant) uniform Registers
{
uint num_pixels, fb_addr, fb_depth_addr;
} registers;
layout(set = 0, binding = 0) readonly buffer RDRAMSingleSampled8
{
uint8_t elems[];
} vram8;
layout(set = 0, binding = 0) readonly buffer RDRAMSingleSampled16
{
uint16_t elems[];
} vram16;
layout(set = 0, binding = 0) readonly buffer RDRAMSingleSampled32
{
uint elems[];
} vram32;
layout(set = 0, binding = 2) buffer RDRAMUpscalingReference8
{
uint8_t elems[];
} vram_reference8;
layout(set = 0, binding = 2) buffer RDRAMUpscalingReference16
{
uint16_t elems[];
} vram_reference16;
layout(set = 0, binding = 2) buffer RDRAMUpscalingReference32
{
uint elems[];
} vram_reference32;
// Seed the upscaling reference copy with one byte of emulated RDRAM.
void copy_rdram_8(uint index)
{
    uint byte_index = index & RDRAM_MASK_8;
    vram_reference8.elems[byte_index] = uint8_t(uint(vram8.elems[byte_index]));
}

// Seed the upscaling reference copy with one 16-bit word of emulated RDRAM.
void copy_rdram_16(uint index)
{
    uint word_index = index & RDRAM_MASK_16;
    vram_reference16.elems[word_index] = uint16_t(uint(vram16.elems[word_index]));
}

// Seed the upscaling reference copy with one 32-bit word of emulated RDRAM.
void copy_rdram_32(uint index)
{
    uint word_index = index & RDRAM_MASK_32;
    vram_reference32.elems[word_index] = vram32.elems[word_index];
}
// One invocation per framebuffer pixel: copy the color pixel (and, unless
// color and depth alias the same memory, the depth pixel) from RDRAM into
// the reference buffer.
void main()
{
    uint pixel = gl_GlobalInvocationID.x;
    if (pixel >= registers.num_pixels)
        return;
    uint color_index = registers.fb_addr + pixel;
    uint depth_index = registers.fb_depth_addr + pixel;
    // FB_SIZE_LOG2 is a specialization constant selecting the pixel size.
    if (FB_SIZE_LOG2 == 0)
        copy_rdram_8(color_index);
    else if (FB_SIZE_LOG2 == 1)
        copy_rdram_16(color_index);
    else if (FB_SIZE_LOG2 == 2)
        copy_rdram_32(color_index);
    if (!COLOR_DEPTH_ALIAS)
        copy_rdram_16(depth_index);
}

View File

@ -0,0 +1,185 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "small_types.h"
layout(local_size_x_id = 3) in;
layout(constant_id = 0) const int RDRAM_SIZE = 8 * 1024 * 1024;
const int RDRAM_MASK_8 = RDRAM_SIZE - 1;
const int RDRAM_MASK_16 = RDRAM_MASK_8 >> 1;
const int RDRAM_MASK_32 = RDRAM_MASK_8 >> 2;
layout(constant_id = 1) const int FB_SIZE_LOG2 = 0;
layout(constant_id = 2) const bool COLOR_DEPTH_ALIAS = false;
layout(constant_id = 4) const int NUM_SAMPLES = 1;
layout(push_constant) uniform Registers
{
uint num_pixels, fb_addr, fb_depth_addr;
} registers;
layout(set = 0, binding = 0) readonly buffer RDRAMSingleSampled8
{
uint8_t elems[];
} vram8;
layout(set = 0, binding = 0) readonly buffer RDRAMSingleSampled16
{
uint16_t elems[];
} vram16;
layout(set = 0, binding = 0) readonly buffer RDRAMSingleSampled32
{
uint elems[];
} vram32;
layout(set = 0, binding = 1) readonly buffer RDRAMHiddenSingleSampled
{
uint8_t elems[];
} hidden_vram;
layout(set = 0, binding = 2) buffer RDRAMUpscalingReference8
{
uint8_t elems[];
} vram_reference8;
layout(set = 0, binding = 2) buffer RDRAMUpscalingReference16
{
uint16_t elems[];
} vram_reference16;
layout(set = 0, binding = 2) buffer RDRAMUpscalingReference32
{
uint elems[];
} vram_reference32;
layout(set = 0, binding = 3) buffer RDRAMUpscaling8
{
uint8_t elems[];
} vram_upscaled8;
layout(set = 0, binding = 3) buffer RDRAMUpscaling16
{
uint16_t elems[];
} vram_upscaled16;
layout(set = 0, binding = 3) buffer RDRAMUpscaling32
{
uint elems[];
} vram_upscaled32;
layout(set = 0, binding = 4) buffer RDRAMHiddenUpscaling
{
uint8_t elems[];
} hidden_vram_upscaled;
// Propagate a host-visible RDRAM byte into every sample plane of the
// upscaled copy, but only if it differs from the reference copy (i.e. the
// CPU wrote it since the last sync). Sample plane i lives at an offset of
// i * RDRAM_SIZE bytes.
void update_rdram_8(uint index)
{
    index &= RDRAM_MASK_8;
    uint real_word = uint(vram8.elems[index]);
    uint reference_word = uint(vram_reference8.elems[index]);
    if (real_word != reference_word)
    {
        // index ^ 3: byte-swapped addressing into hidden RDRAM.
        // NOTE(review): presumably this matches the RDRAM byte swizzle used
        // elsewhere in the renderer -- confirm against memory_interfacing.h.
        uint mirrored_index = index ^ 3u;
        uint real_hidden_word = uint(hidden_vram.elems[mirrored_index >> 1u]);
        for (int i = 0; i < NUM_SAMPLES; i++)
        {
            vram_upscaled8.elems[index + i * RDRAM_SIZE] = uint8_t(real_word);
            // Hidden RDRAM stores one byte per 16-bit word; only the odd
            // byte of a pair carries the hidden value.
            if ((mirrored_index & 1u) != 0u)
                hidden_vram_upscaled.elems[(mirrored_index >> 1u) + i * (RDRAM_SIZE >> 1)] = uint8_t(real_hidden_word);
        }
        // Refresh the reference so this host write is only applied once.
        vram_reference8.elems[index] = uint8_t(real_word);
    }
}

// 16-bit variant of update_rdram_8; index is in 16-bit words, sample planes
// are (RDRAM_SIZE >> 1) words apart.
void update_rdram_16(uint index)
{
    index &= RDRAM_MASK_16;
    uint real_word = uint(vram16.elems[index]);
    uint reference_word = uint(vram_reference16.elems[index]);
    if (real_word != reference_word)
    {
        // index ^ 1: word-swapped addressing into hidden RDRAM (see above).
        uint mirrored_index = index ^ 1u;
        uint real_hidden_word = uint(hidden_vram.elems[mirrored_index]);
        for (int i = 0; i < NUM_SAMPLES; i++)
        {
            vram_upscaled16.elems[index + i * (RDRAM_SIZE >> 1)] = uint16_t(real_word);
            hidden_vram_upscaled.elems[mirrored_index + i * (RDRAM_SIZE >> 1)] = uint8_t(real_hidden_word);
        }
        vram_reference16.elems[index] = uint16_t(real_word);
    }
}

// 32-bit variant; index is in 32-bit words, sample planes are
// (RDRAM_SIZE >> 2) words apart. Each 32-bit word owns two hidden bytes.
void update_rdram_32(uint index)
{
    index &= RDRAM_MASK_32;
    uint real_word = vram32.elems[index];
    uint reference_word = vram_reference32.elems[index];
    if (real_word != reference_word)
    {
        uint real_hidden_word0 = uint(hidden_vram.elems[2u * index]);
        uint real_hidden_word1 = uint(hidden_vram.elems[2u * index + 1u]);
        for (int i = 0; i < NUM_SAMPLES; i++)
        {
            vram_upscaled32.elems[index + i * (RDRAM_SIZE >> 2)] = real_word;
            hidden_vram_upscaled.elems[2u * index + i * (RDRAM_SIZE >> 1)] = uint8_t(real_hidden_word0);
            hidden_vram_upscaled.elems[2u * index + 1u + i * (RDRAM_SIZE >> 1)] = uint8_t(real_hidden_word1);
        }
        vram_reference32.elems[index] = real_word;
    }
}
// One invocation per framebuffer pixel: push any host RDRAM writes for the
// color pixel (and, unless color and depth alias, the depth pixel) out to
// the upscaled sample planes.
void main()
{
    uint pixel = gl_GlobalInvocationID.x;
    if (pixel >= registers.num_pixels)
        return;
    uint color_index = registers.fb_addr + pixel;
    uint depth_index = registers.fb_depth_addr + pixel;
    // FB_SIZE_LOG2 is a specialization constant selecting the pixel size.
    if (FB_SIZE_LOG2 == 0)
        update_rdram_8(color_index);
    else if (FB_SIZE_LOG2 == 1)
        update_rdram_16(color_index);
    else if (FB_SIZE_LOG2 == 2)
        update_rdram_32(color_index);
    if (!COLOR_DEPTH_ALIAS)
        update_rdram_16(depth_index);
}

View File

@ -0,0 +1,279 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "small_types.h"
#include "fb_formats.h"
layout(local_size_x_id = 3) in;
layout(constant_id = 0) const int RDRAM_SIZE = 8 * 1024 * 1024;
const int RDRAM_MASK_8 = RDRAM_SIZE - 1;
const int RDRAM_MASK_16 = RDRAM_MASK_8 >> 1;
const int RDRAM_MASK_32 = RDRAM_MASK_8 >> 2;
layout(constant_id = 1) const int FB_SIZE_LOG2 = 0;
layout(constant_id = 2) const bool COLOR_DEPTH_ALIAS = false;
layout(constant_id = 4) const int NUM_SAMPLES = 1;
layout(constant_id = 5) const bool DITHER = false;
layout(constant_id = 6) const bool RDRAM_UNSCALED_WRITE_MASK = false;
layout(push_constant) uniform Registers
{
uint num_pixels, fb_addr, fb_depth_addr, width, height;
} registers;
layout(set = 0, binding = 0) writeonly buffer RDRAMSingleSampled8
{
uint8_t elems[];
} vram8;
layout(set = 0, binding = 0) writeonly buffer RDRAMSingleSampled16
{
uint16_t elems[];
} vram16;
layout(set = 0, binding = 0) writeonly buffer RDRAMSingleSampled32
{
uint elems[];
} vram32;
layout(set = 0, binding = 2) writeonly buffer RDRAMUpscalingReference8
{
uint8_t elems[];
} vram_reference8;
layout(set = 0, binding = 2) writeonly buffer RDRAMUpscalingReference16
{
uint16_t elems[];
} vram_reference16;
layout(set = 0, binding = 2) writeonly buffer RDRAMUpscalingReference32
{
uint elems[];
} vram_reference32;
layout(set = 0, binding = 3) readonly buffer RDRAMUpscaling8
{
uint8_t elems[];
} vram_upscaled8;
layout(set = 0, binding = 3) readonly buffer RDRAMUpscaling16
{
uint16_t elems[];
} vram_upscaled16;
layout(set = 0, binding = 3) readonly buffer RDRAMUpscaling32
{
uint elems[];
} vram_upscaled32;
layout(set = 0, binding = 4) readonly buffer RDRAMHiddenUpscaling
{
uint8_t elems[];
} hidden_vram_upscaled;
// Resolve one byte of the upscaled color buffer back to unscaled RDRAM by
// averaging all samples (round-to-nearest).
void copy_rdram_8(uint index)
{
    index &= RDRAM_MASK_8;
    index ^= 3u;
    uint r = 0u;
    for (int i = 0; i < NUM_SAMPLES; i++)
    {
        // BUGFIX: index into sample plane i. The writer (update_rdram_8)
        // stores sample i at index + i * RDRAM_SIZE; the old code read
        // sample 0 NUM_SAMPLES times, making the average a no-op.
        uint real_word = uint(vram_upscaled8.elems[index + i * RDRAM_SIZE]);
        r += real_word;
    }
    // Round-to-nearest average across samples.
    r = (r + (NUM_SAMPLES >> 1)) / NUM_SAMPLES;
    vram_reference8.elems[index] = uint8_t(r);
    vram8.elems[index] = uint8_t(r);
    if (RDRAM_UNSCALED_WRITE_MASK)
    {
        // Need this memory barrier to ensure the mask readback does not read
        // an invalid value from RDRAM. If the mask is seen, the valid RDRAM value is
        // also coherent.
        memoryBarrierBuffer();
        vram8.elems[index + RDRAM_SIZE] = mem_u8(0xff);
    }
    // Don't bother writing back hidden VRAM. It is not visible to host anyways, and coverage is meaningless when it's filtered.
    // If host later decides to modify the CPU memory, then the hidden VRAM values become complete bogus either way.
}
// Unpack a 16-bit RGBA5551 word into (r, g, b, a) components.
uvec4 decode_rgba5551(uint word)
{
    uint r = (word >> 11u) & 0x1fu;
    uint g = (word >> 6u) & 0x1fu;
    uint b = (word >> 1u) & 0x1fu;
    uint a = word & 1u;
    return uvec4(r, g, b, a);
}

// Pack (r, g, b, a) components back into a 16-bit RGBA5551 word.
uint encode_rgba5551(uvec4 color)
{
    uint word = color.a;
    word |= color.b << 1u;
    word |= color.g << 6u;
    word |= color.r << 11u;
    return word;
}

// 4x4 ordered-dither matrix applied when resolving multi-sampled 5-bit channels.
const uint bayer_dither_lut[16] = uint[](
    0, 4, 1, 5,
    4, 0, 5, 1,
    3, 7, 2, 6,
    7, 3, 6, 2);
// Resolve one 16-bit RGBA5551 color word from the upscaled sample planes
// back to unscaled RDRAM, averaging each channel across all samples.
// (x, y) are the unscaled pixel coordinates, used for ordered dithering.
void copy_rdram_16(uint index, uint x, uint y)
{
    index &= RDRAM_MASK_16;
    // index ^ 1: word-swapped RDRAM addressing.
    // NOTE(review): presumably matches the RDRAM word swizzle used by the
    // rest of the renderer -- confirm against memory_interfacing.h.
    index ^= 1u;
    uvec4 rgba = uvec4(0u);
    for (int i = 0; i < NUM_SAMPLES; i++)
    {
        // Sample plane i is (RDRAM_SIZE >> 1) 16-bit words further in.
        uint real_word = uint(vram_upscaled16.elems[index + i * (RDRAM_SIZE >> 1)]);
        rgba += decode_rgba5551(real_word);
    }
    if (DITHER)
    {
        // Ordered dither: bias the channel sums by a 4x4 Bayer threshold
        // (in 1/8th-of-a-sample units) before dividing.
        uint dither_value = bayer_dither_lut[(y & 3u) * 4u + (x & 3u)] * NUM_SAMPLES;
        rgba = (8u * rgba + dither_value) / (8 * NUM_SAMPLES);
    }
    else
    {
        // Plain round-to-nearest average.
        rgba = (rgba + (NUM_SAMPLES >> 1)) / NUM_SAMPLES;
    }
    uint encoded = encode_rgba5551(rgba);
    vram16.elems[index] = uint16_t(encoded);
    vram_reference16.elems[index] = uint16_t(encoded);
    if (RDRAM_UNSCALED_WRITE_MASK)
    {
        // Need this memory barrier to ensure the mask readback does not read
        // an invalid value from RDRAM. If the mask is seen, the valid RDRAM value is
        // also coherent.
        memoryBarrierBuffer();
        vram16.elems[index + (RDRAM_SIZE >> 1u)] = mem_u16(0xffff);
    }
    // Don't bother writing back hidden VRAM. It is not visible to host anyways, and coverage is meaningless when it's filtered.
    // If host later decides to modify the CPU memory, then the hidden VRAM values become complete bogus either way.
}
// Resolve a 16-bit (depth) word by copying sample 0 verbatim; depth values
// cannot be meaningfully averaged. Sample 0 overlaps exactly with the
// single-sampled version, so coverage clipping aside the result matches.
void copy_rdram_16_single_sample(uint index)
{
    uint word_index = (index & RDRAM_MASK_16) ^ 1u;
    uint upscaled_value = uint(vram_upscaled16.elems[word_index]);
    vram16.elems[word_index] = uint16_t(upscaled_value);
    vram_reference16.elems[word_index] = uint16_t(upscaled_value);
    if (RDRAM_UNSCALED_WRITE_MASK)
    {
        // Publish the data before the mask so a reader that observes the
        // mask also observes a coherent RDRAM value.
        memoryBarrierBuffer();
        vram16.elems[word_index + (RDRAM_SIZE >> 1u)] = mem_u16(0xffff);
    }
    // Hidden VRAM is deliberately not written back: it is invisible to the
    // host, and filtered coverage is meaningless anyway.
}
// Unpack a 32-bit word into 8-bit (r, g, b, a) components.
uvec4 decode_rgba8(uint word)
{
    uint r = (word >> 24u) & 0xffu;
    uint g = (word >> 16u) & 0xffu;
    uint b = (word >> 8u) & 0xffu;
    uint a = word & 0xffu;
    return uvec4(r, g, b, a);
}

// Pack 8-bit (r, g, b, a) components back into a 32-bit word.
uint encode_rgba8(uvec4 color)
{
    return color.a | (color.b << 8u) | (color.g << 16u) | (color.r << 24u);
}
// Resolve one 32-bit RGBA8 color word from the upscaled sample planes back
// to unscaled RDRAM, averaging each channel across all samples
// (round-to-nearest, no dithering at 8 bits per channel).
void copy_rdram_32(uint index)
{
    index &= RDRAM_MASK_32;
    uvec4 rgba = uvec4(0u);
    for (int i = 0; i < NUM_SAMPLES; i++)
    {
        // Sample plane i is (RDRAM_SIZE >> 2) 32-bit words further in.
        uint real_word = vram_upscaled32.elems[index + i * (RDRAM_SIZE >> 2)];
        rgba += decode_rgba8(real_word);
    }
    rgba = (rgba + (NUM_SAMPLES >> 1)) / NUM_SAMPLES;
    uint encoded = encode_rgba8(rgba);
    vram32.elems[index] = encoded;
    vram_reference32.elems[index] = encoded;
    if (RDRAM_UNSCALED_WRITE_MASK)
    {
        // Need this memory barrier to ensure the mask readback does not read
        // an invalid value from RDRAM. If the mask is seen, the valid RDRAM value is
        // also coherent.
        memoryBarrierBuffer();
        vram32.elems[index + (RDRAM_SIZE >> 2u)] = ~0u;
    }
    // Don't bother writing back hidden VRAM. It is not visible to host anyways, and coverage is meaningless when it's filtered.
    // If host later decides to modify the CPU memory, then the hidden VRAM values become complete bogus either way.
}
// One invocation per unscaled pixel: consult the packed write mask and
// resolve color and/or depth only where the renderer actually wrote.
// The mask is stored after the NUM_SAMPLES sample planes in binding 3,
// packed 2 bits per pixel in 4x4 pixel blocks (one 32-bit word per block).
void main()
{
    uvec2 coord = gl_GlobalInvocationID.xy;
    if (coord.x >= registers.width)
        return;
    uint index = coord.y * registers.width + coord.x;
    uint depth_index = index + registers.fb_depth_addr;
    uint color_index = index + registers.fb_addr;
    // 4x4 block coordinate and the block's index in the mask array.
    uvec2 mask_coord = coord >> 2u;
    uint mask_index = mask_coord.x + mask_coord.y * ((registers.width + 3) >> 2u);
    uint write_mask = vram_upscaled32.elems[NUM_SAMPLES * (RDRAM_SIZE >> 2) + mask_index];
    // 2 bits per pixel within the block, row-major.
    uint shamt = 2u * ((coord.x & 3u) + 4u * (coord.y & 3u));
    write_mask = write_mask >> shamt;
    // Bit 0: color written; bit 1: depth written.
    bool color_write_mask = (write_mask & 1u) != 0u;
    bool depth_write_mask = (write_mask & 2u) != 0u;
    if (color_write_mask)
    {
        switch (FB_SIZE_LOG2)
        {
        case 0:
            copy_rdram_8(color_index);
            break;
        case 1:
            copy_rdram_16(color_index, coord.x, coord.y);
            break;
        case 2:
            copy_rdram_32(color_index);
            break;
        }
    }
    if (!COLOR_DEPTH_ALIAS && depth_write_mask)
        copy_rdram_16_single_sample(depth_index);
}

View File

@ -0,0 +1,33 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#extension GL_EXT_samplerless_texture_functions : require
layout(location = 0) out vec4 FragColor;
layout(set = 0, binding = 0) uniform texture2D uImage;
// Fade pass: weight the previous frame's RGB by its stored alpha and clear
// alpha, so a persistent pixel does not propagate more than one frame.
void main()
{
    vec4 prev = texelFetch(uImage, ivec2(gl_FragCoord.xy), 0);
    vec3 weighted_rgb = prev.rgb * prev.a;
    FragColor = vec4(weighted_rgb, 0.0);
}

View File

@ -0,0 +1,60 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef VI_DEBUG_H_
#define VI_DEBUG_H_

// Fragment-shader debug logging helpers. When DEBUG_ENABLE is set, the
// GENERIC_MESSAGE* macros forward the current __LINE__ (plus up to three
// values) to the debug channel, tagged with the fragment coordinate;
// otherwise they compile to nothing.
#if defined(DEBUG_ENABLE) && DEBUG_ENABLE
#include "debug_channel.h"
// Overload: line number only.
void GENERIC_MESSAGE_(int line)
{
    add_debug_message(0, uvec3(gl_FragCoord.xy, 0), line);
}
// Overload: line number plus one value.
void GENERIC_MESSAGE_(int line, uint v)
{
    add_debug_message(0, uvec3(gl_FragCoord.xy, 0), uvec2(line, v));
}
// Overload: line number plus two values.
void GENERIC_MESSAGE_(int line, uvec2 v)
{
    add_debug_message(0, uvec3(gl_FragCoord.xy, 0), uvec3(line, v));
}
// Overload: line number plus three values.
void GENERIC_MESSAGE_(int line, uvec3 v)
{
    add_debug_message(0, uvec3(gl_FragCoord.xy, 0), uvec4(line, v));
}
#define GENERIC_MESSAGE0() GENERIC_MESSAGE_(__LINE__)
#define GENERIC_MESSAGE1(a) GENERIC_MESSAGE_(__LINE__, a)
#define GENERIC_MESSAGE2(a, b) GENERIC_MESSAGE_(__LINE__, uvec2(a, b))
#define GENERIC_MESSAGE3(a, b, c) GENERIC_MESSAGE_(__LINE__, uvec3(a, b, c))
#else
// Debugging disabled: all message macros are no-ops.
#define GENERIC_MESSAGE0()
#define GENERIC_MESSAGE1(a)
#define GENERIC_MESSAGE2(a, b)
#define GENERIC_MESSAGE3(a, b, c)
#endif
#endif

View File

@ -0,0 +1,31 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
layout(location = 0) in vec2 vUV;
layout(set = 0, binding = 0) uniform sampler2D uSampler;
layout(location = 0) out vec4 FragColor;
// Trivial blit: sample the input image at the interpolated UV, base mip only.
void main()
{
    FragColor = textureLod(uSampler, vUV, 0.0);
}

View File

@ -0,0 +1,41 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
layout(location = 0) out vec2 vUV;
layout(push_constant) uniform UBO
{
float y_offset;
} registers;
// Fullscreen-triangle vertex shader: three vertices whose oversized triangle
// covers all of clip space. UV is derived from clip position, shifted
// vertically by the push-constant y_offset.
void main()
{
    vec2 pos;
    if (gl_VertexIndex == 0)
        pos = vec2(-1.0, -1.0);
    else if (gl_VertexIndex == 1)
        pos = vec2(-1.0, 3.0);
    else
        pos = vec2(3.0, -1.0);
    gl_Position = vec4(pos, 0.0, 1.0);
    vUV = vec2(pos.x * 0.5 + 0.5, pos.y * 0.5 + 0.5 + registers.y_offset);
}

View File

@ -0,0 +1,92 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#extension GL_EXT_samplerless_texture_functions : require
#include "vi_debug.h"
layout(location = 0) out uvec4 FragColor;
#if defined(FETCH_BUG) && FETCH_BUG
layout(location = 1) out uvec4 FragColorFetchBug;
#endif
layout(set = 0, binding = 0) uniform mediump utexture2DArray uFetchCache;
// Exchange two values in place.
void swap(inout uint a, inout uint b)
{
    uint tmp = a;
    a = b;
    b = tmp;
}

// Median of three values, branchless: the middle element equals
// max(min(a, b), min(max(a, b), c)).
uint median3(uint left, uint center, uint right)
{
    uint lo = min(left, center);
    uint hi = max(left, center);
    return max(lo, min(hi, right));
}
// VI fetch filter: reconstruct a pixel from a 3-wide horizontal window of
// the fetch cache. If all three taps have full coverage (alpha == 7) the
// center pixel passes through; otherwise each channel is median-filtered
// across the window.
void main()
{
    ivec2 pix = ivec2(gl_FragCoord.xy);
    // Array layer 0 holds the normal fetch results.
    uvec4 left = texelFetch(uFetchCache, ivec3(pix, 0), 0);
    uvec4 mid = texelFetchOffset(uFetchCache, ivec3(pix, 0), 0, ivec2(1, 0));
    uvec4 right = texelFetchOffset(uFetchCache, ivec3(pix, 0), 0, ivec2(2, 0));
    if ((left.a & mid.a & right.a) == 7u)
    {
        FragColor = mid;
    }
    else
    {
        // Median filter. TODO: Optimize with mid3?
        uint r = median3(left.r, mid.r, right.r);
        uint g = median3(left.g, mid.g, right.g);
        uint b = median3(left.b, mid.b, right.b);
        FragColor = uvec4(r, g, b, mid.a);
    }
#if defined(FETCH_BUG) && FETCH_BUG
    // Array layer 1 holds the results computed with the hardware fetch bug
    // emulated; apply the same filter to a second output.
    left = texelFetch(uFetchCache, ivec3(pix, 1), 0);
    mid = texelFetchOffset(uFetchCache, ivec3(pix, 1), 0, ivec2(1, 0));
    right = texelFetchOffset(uFetchCache, ivec3(pix, 1), 0, ivec2(2, 0));
    if ((left.a & mid.a & right.a) == 7u)
    {
        FragColorFetchBug = mid;
    }
    else
    {
        // Median filter. TODO: Optimize with mid3?
        uint r = median3(left.r, mid.r, right.r);
        uint g = median3(left.g, mid.g, right.g);
        uint b = median3(left.b, mid.b, right.b);
        FragColorFetchBug = uvec4(r, g, b, mid.a);
    }
#endif
}

View File

@ -0,0 +1,164 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#extension GL_EXT_samplerless_texture_functions : require
#include "small_types.h"
#include "vi_status.h"
#include "vi_debug.h"
layout(set = 0, binding = 0) uniform mediump utexture2D uAAInput;
layout(location = 0) out uvec4 FragColor;
#if defined(FETCH_BUG) && FETCH_BUG
layout(location = 1) out uvec4 FragColorFetchBug;
#endif
layout(push_constant) uniform Registers
{
ivec2 offset;
} registers;
// Current pixel coordinate, shared between main() and fetch_color_offset().
ivec2 pix;

// Fetch the AA input pixel at a fixed offset from the current pixel.
uvec4 fetch_color_offset(ivec2 offset)
{
    return texelFetch(uAAInput, pix + offset, 0);
}

// Fold one neighbor into running per-channel (min, max, second-min,
// second-max) statistics. Only fully covered neighbors (alpha == 7)
// participate.
void check_neighbor(uvec4 candidate,
                    inout uvec3 lo, inout uvec3 hi,
                    inout uvec3 second_lo, inout uvec3 second_hi)
{
    if (candidate.a == 7u)
    {
        second_lo = min(second_lo, max(candidate.rgb, lo));
        second_hi = max(second_hi, min(candidate.rgb, hi));
        lo = min(candidate.rgb, lo);
        hi = max(candidate.rgb, hi);
    }
}
// VI anti-aliasing / dither filter. Three mutually exclusive paths:
//   1. Partial coverage (alpha != 7): blend the pixel toward the second-min /
//      second-max of its fully covered neighbors, weighted by (7 - coverage).
//   2. Full coverage with DITHER_ENABLE: de-dither by nudging each 5-bit
//      channel toward the 3x3 neighborhood.
//   3. Otherwise: pass through.
// When FETCH_BUG is set, a second output replays each path with the buggy
// neighbor set (lower row replaced by the horizontal neighbors).
void main()
{
    pix = ivec2(gl_FragCoord.xy) + registers.offset;
    uvec4 mid_pixel = fetch_color_offset(ivec2(0));
    // AA-filter. If coverage is not full, we blend current pixel against background.
    uvec3 color;
#if defined(FETCH_BUG) && FETCH_BUG
    uvec3 color_bug;
#endif
    if (mid_pixel.a != 7u)
    {
        uvec3 lo = mid_pixel.rgb;
        uvec3 hi = lo;
        uvec3 second_lo = lo;
        uvec3 second_hi = lo;
        // Somehow, we're supposed to find the second lowest and second highest neighbor.
        uvec4 left_up = fetch_color_offset(ivec2(-1, -1));
        uvec4 right_up = fetch_color_offset(ivec2(+1, -1));
        uvec4 to_left = fetch_color_offset(ivec2(-2, 0));
        uvec4 to_right = fetch_color_offset(ivec2(+2, 0));
        uvec4 left_down = fetch_color_offset(ivec2(-1, +1));
        uvec4 right_down = fetch_color_offset(ivec2(+1, +1));
        check_neighbor(left_up, lo, hi, second_lo, second_hi);
        check_neighbor(right_up, lo, hi, second_lo, second_hi);
        check_neighbor(to_left, lo, hi, second_lo, second_hi);
        check_neighbor(to_right, lo, hi, second_lo, second_hi);
#if defined(FETCH_BUG) && FETCH_BUG
        // In the fetch-bug state, we apparently do not read the lower values.
        // Instead, the lower values are treated as left and right.
        uvec3 lo_bug = lo;
        uvec3 hi_bug = hi;
        uvec3 second_lo_bug = second_lo;
        uvec3 second_hi_bug = second_hi;
#endif
        check_neighbor(left_down, lo, hi, second_lo, second_hi);
        check_neighbor(right_down, lo, hi, second_lo, second_hi);
#if defined(FETCH_BUG) && FETCH_BUG
        check_neighbor(to_left, lo_bug, hi_bug, second_lo_bug, second_hi_bug);
        check_neighbor(to_right, lo_bug, hi_bug, second_lo_bug, second_hi_bug);
        // Where the center pixel itself is the extreme, fall back to the
        // true min/max so the blend target stays in range.
        second_lo = mix(second_lo, lo, equal(mid_pixel.rgb, lo));
        second_hi = mix(second_hi, hi, equal(mid_pixel.rgb, hi));
        second_lo_bug = mix(second_lo_bug, lo_bug, equal(mid_pixel.rgb, lo_bug));
        second_hi_bug = mix(second_hi_bug, hi_bug, equal(mid_pixel.rgb, hi_bug));
#endif
        // Blend toward (second_lo + second_hi) / 2, weighted by inverse
        // coverage in 1/8th steps, with rounding.
        uvec3 offset = second_lo + second_hi - (mid_pixel.rgb << 1u);
        uint coeff = 7u - mid_pixel.a;
        color = mid_pixel.rgb + (((offset * coeff) + 4u) >> 3u);
        color &= 0xffu;
#if defined(FETCH_BUG) && FETCH_BUG
        uvec3 offset_bug = second_lo_bug + second_hi_bug - (mid_pixel.rgb << 1u);
        color_bug = mid_pixel.rgb + (((offset_bug * coeff) + 4u) >> 3u);
        color_bug &= 0xffu;
#endif
    }
    else if (DITHER_ENABLE)
    {
        // Dither filter: compare the 5-bit quantized center against its
        // neighbors and accumulate clamped per-channel deltas.
        ivec3 tmp_color = ivec3(mid_pixel.rgb >> 3u);
        ivec3 tmp_accum = ivec3(0);
        for (int y = -1; y <= 0; y++)
        {
            for (int x = -1; x <= 1; x++)
            {
                ivec3 col = ivec3(fetch_color_offset(ivec2(x, y)).rgb >> 3u);
                tmp_accum += clamp(col - tmp_color, ivec3(-1), ivec3(1));
            }
        }
#if defined(FETCH_BUG) && FETCH_BUG
        ivec3 tmp_accum_bug = tmp_accum;
#endif
        tmp_accum += clamp(ivec3(fetch_color_offset(ivec2(-1, 1)).rgb >> 3u) - tmp_color, ivec3(-1), ivec3(1));
        tmp_accum += clamp(ivec3(fetch_color_offset(ivec2(+1, 1)).rgb >> 3u) - tmp_color, ivec3(-1), ivec3(1));
        tmp_accum += clamp(ivec3(fetch_color_offset(ivec2(0, 1)).rgb >> 3u) - tmp_color, ivec3(-1), ivec3(1));
        color = (mid_pixel.rgb & 0xf8u) + tmp_accum;
#if defined(FETCH_BUG) && FETCH_BUG
        // Fetch-bug variant: the bottom row is replaced by re-reads of the
        // horizontal neighbors.
        tmp_accum_bug += clamp(ivec3(fetch_color_offset(ivec2(-1, 0)).rgb >> 3u) - tmp_color, ivec3(-1), ivec3(1));
        tmp_accum_bug += clamp(ivec3(fetch_color_offset(ivec2(+1, 0)).rgb >> 3u) - tmp_color, ivec3(-1), ivec3(1));
        color_bug = (mid_pixel.rgb & 0xf8u) + tmp_accum_bug;
#endif
    }
    else
    {
        color = mid_pixel.rgb;
#if defined(FETCH_BUG) && FETCH_BUG
        color_bug = mid_pixel.rgb;
#endif
    }
    FragColor = uvec4(color, mid_pixel.a);
#if defined(FETCH_BUG) && FETCH_BUG
    FragColorFetchBug = uvec4(color_bug, mid_pixel.a);
#endif
}

View File

@ -0,0 +1,127 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#extension GL_EXT_samplerless_texture_functions : require
#include "small_types.h"
#include "vi_status.h"
#include "vi_debug.h"
#include "noise.h"
// Output of the divot stage; array layer 1 is sampled instead of layer 0 for
// the "next line" taps when the fetch-bug path triggers (see bug_offset in main()).
layout(set = 0, binding = 0) uniform mediump utexture2DArray uDivotOutput;
// 1D gamma lookup table, sampled per channel by integer_gamma().
layout(set = 1, binding = 0) uniform mediump utextureBuffer uGammaTable;
layout(location = 0) out vec4 FragColor;
// Per-scanout parameters pushed from the CPU side.
layout(push_constant, std430) uniform Registers
{
// Starting source X/Y position (10.10 fixed point, see main()).
int x_base;
int y_base;
// Offset applied to gl_FragCoord before stepping.
int h_offset;
int v_offset;
// Per-output-pixel X/Y increments (10.10 fixed point).
int x_add;
int y_add;
// Frame counter used to reseed the gamma-dither noise.
int frame_count;
// Interlace (serrate) field selection/compaction controls.
int serrate_shift;
int serrate_mask;
int serrate_select;
} registers;
// Fixed-point linear interpolation from a towards b.
// l is a 5-bit fraction; the +16 rounds to nearest before the >> 5, and the
// result wraps to 8 bits, exactly as in the original expression (unsigned
// wraparound in b - a is intentional and preserved).
uvec3 vi_lerp(uvec3 a, uvec3 b, uint l)
{
    uvec3 delta = b - a;
    uvec3 inc = (delta * l + 16u) >> 5u;
    return (a + inc) & 0xffu;
}
// Look up each channel in the 8-bit gamma table.
// With GAMMA_DITHER, the input is first expanded into the table's dithered
// domain: (color << 6) + full-range dither + 256u — presumably matching the
// layout produced by init_gamma_table() on the CPU side (confirm there).
// Fix: both branches of the original duplicated the identical triple
// texelFetch; only the dither preamble differs, so hoist it.
uvec3 integer_gamma(uvec3 color)
{
	if (GAMMA_DITHER)
		color = (color << 6) + noise_get_full_gamma_dither() + 256u;
	return uvec3(
			texelFetch(uGammaTable, int(color.r)).r,
			texelFetch(uGammaTable, int(color.g)).r,
			texelFetch(uGammaTable, int(color.b)).r);
}
// Spec constant: when true, emulate the VI fetch bug by sampling the
// "next line" taps from array layer 1 of uDivotOutput (see bug_offset below).
layout(constant_id = 2) const bool FETCH_BUG = false;
// Scale pass: maps each output fragment back into the divot-stage image using
// the 10.10 fixed-point stepping from the push constants, optionally bilinear
// filters (SCALE_AA), then applies gamma table lookup and/or gamma dither.
void main()
{
// Move into VI coordinate space.
ivec2 coord = ivec2(gl_FragCoord.xy) + ivec2(registers.h_offset, registers.v_offset);
// Interlacing: discard fragments that belong to the inactive field.
if ((coord.y & registers.serrate_mask) != registers.serrate_select)
discard;
// Compact the surviving lines of the field.
coord.y >>= registers.serrate_shift;
if (GAMMA_DITHER)
reseed_noise(coord.x, coord.y, registers.frame_count);
// Source sample position in 10.10 fixed point; integer texel is the top bits.
int x = coord.x * registers.x_add + registers.x_base;
int y = coord.y * registers.y_add + registers.y_base;
ivec2 base_coord = ivec2(x, y) >> 10;
uvec3 c00 = texelFetch(uDivotOutput, ivec3(base_coord, 0), 0).rgb;
// Array layer used for the (0,1)/(1,1) taps; 1 selects the buggy slice.
int bug_offset = 0;
if (FETCH_BUG)
{
// This is super awkward.
// Basically there seems to be some kind of issue where if we interpolate in Y,
// we're going to get buggy output.
// If we hit this case, the next line we filter against will come from the "buggy" array slice.
// Why this makes sense, I have no idea.
int prev_y = (y - registers.y_add) >> 10;
int next_y = (y + registers.y_add) >> 10;
if (coord.y != 0 && base_coord.y == prev_y && base_coord.y != next_y)
bug_offset = 1;
}
if (SCALE_AA)
{
// Bilinear filter with 5-bit fractional weights (bits [9:5] of the 10.10 position).
int x_frac = (x >> 5) & 31;
int y_frac = (y >> 5) & 31;
uvec3 c10 = texelFetchOffset(uDivotOutput, ivec3(base_coord, 0), 0, ivec2(1, 0)).rgb;
uvec3 c01 = texelFetchOffset(uDivotOutput, ivec3(base_coord, bug_offset), 0, ivec2(0, 1)).rgb;
uvec3 c11 = texelFetchOffset(uDivotOutput, ivec3(base_coord, bug_offset), 0, ivec2(1)).rgb;
// Filter in Y first, then in X.
c00 = vi_lerp(c00, c01, y_frac);
c10 = vi_lerp(c10, c11, y_frac);
c00 = vi_lerp(c00, c10, x_frac);
}
if (GAMMA_ENABLE)
c00 = integer_gamma(c00);
else if (GAMMA_DITHER)
// Without the gamma table, dither is added directly and clamped to 8 bits.
c00 = min(c00 + noise_get_partial_gamma_dither(), uvec3(0xff));
FragColor = vec4(vec3(c00) / 255.0, 1.0);
}

View File

@ -0,0 +1,48 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef VI_STATUS_H_
#define VI_STATUS_H_
// Raw VI control register value, baked in as a specialization constant so
// that all the derived booleans below fold to compile-time constants.
layout(constant_id = 1) const int VI_STATUS = 0;
// Pixel type field in bits [1:0] — these are field values, not flag bits.
const int VI_CONTROL_TYPE_BLANK_BIT = 0 << 0;
const int VI_CONTROL_TYPE_RESERVED_BIT = 1 << 0;
const int VI_CONTROL_TYPE_RGBA5551_BIT = 2 << 0;
const int VI_CONTROL_TYPE_RGBA8888_BIT = 3 << 0;
const int VI_CONTROL_TYPE_MASK = 3 << 0;
// Individual control flag bits.
const int VI_CONTROL_GAMMA_DITHER_ENABLE_BIT = 1 << 2;
const int VI_CONTROL_GAMMA_ENABLE_BIT = 1 << 3;
const int VI_CONTROL_DIVOT_ENABLE_BIT = 1 << 4;
const int VI_CONTROL_SERRATE_BIT = 1 << 6;
const int VI_CONTROL_DITHER_FILTER_ENABLE_BIT = 1 << 16;
// "META" bits — presumably injected by the implementation (not part of the
// hardware register) to select shader paths; confirm against the CPU side.
const int VI_CONTROL_META_AA_BIT = 1 << 17;
const int VI_CONTROL_META_SCALE_BIT = 1 << 18;
// Compile-time booleans derived from VI_STATUS.
const bool FMT_RGBA5551 = (VI_STATUS & VI_CONTROL_TYPE_MASK) == VI_CONTROL_TYPE_RGBA5551_BIT;
const bool FMT_RGBA8888 = (VI_STATUS & VI_CONTROL_TYPE_MASK) == VI_CONTROL_TYPE_RGBA8888_BIT;
const bool DITHER_ENABLE = (VI_STATUS & VI_CONTROL_DITHER_FILTER_ENABLE_BIT) != 0;
const bool FETCH_AA = (VI_STATUS & VI_CONTROL_META_AA_BIT) != 0;
const bool SCALE_AA = (VI_STATUS & VI_CONTROL_META_SCALE_BIT) != 0;
const bool GAMMA_ENABLE = (VI_STATUS & VI_CONTROL_GAMMA_ENABLE_BIT) != 0;
const bool GAMMA_DITHER = (VI_STATUS & VI_CONTROL_GAMMA_DITHER_ENABLE_BIT) != 0;
#endif

View File

@ -0,0 +1,58 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef Z_ENCODE_H_
#define Z_ENCODE_H_
// The Z compression is kind of clever, and uses inverted FP, with more precision close to 1.
// The compressed Z result is 14 bits, and decompresses to 18-bit UNORM.
// Expand a 14-bit compressed depth (3-bit exponent in bits [13:11],
// 11-bit mantissa) to an 18-bit fixed-point value in [0, 0x40000).
int z_decompress(u16 z_)
{
int z = int(z_);
int exponent = z >> 11;
int mantissa = z & 0x7ff;
// Smaller exponents cover coarser ranges: shift the mantissa up and add the
// start offset of this exponent's bucket.
int shift = max(6 - exponent, 0);
int base = 0x40000 - (0x40000 >> exponent);
return (mantissa << shift) + base;
}
// Inverse of z_decompress: pack an 18-bit depth into 14 bits.
u16 z_compress(int z)
{
// The exponent comes from the magnitude of the inverted depth, giving more
// precision as z approaches 0x3ffff (i.e. close to 1.0).
int inv_z = max(0x3ffff - z, 1);
int exponent = 17 - findMSB(inv_z);
exponent = clamp(exponent, 0, 7);
int shift = max(6 - exponent, 0);
int mantissa = (z >> shift) & 0x7ff;
return u16((exponent << 11) + mantissa);
}
// Delta-Z is stored as its log2; restore the power of two.
int dz_decompress(int dz)
{
return 1 << dz;
}
// Keep only the most-significant-bit position (log2) of delta-Z; 0 maps to 0.
int dz_compress(int dz)
{
return max(findMSB(dz), 0);
}
#endif

View File

@ -0,0 +1,158 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <stdint.h>
#include "device.hpp"
#include "rdp_common.hpp"
namespace RDP
{
// Options controlling how a VI frame is scanned out to a Vulkan image.
struct ScanoutOptions
{
unsigned crop_overscan_pixels = 0;
// Number of downscale passes applied after scanout — presumably successive
// halvings performed by downscale_stage(); confirm against implementation.
unsigned downscale_steps = 0;
// Works around certain game bugs. Considered a hack if enabled.
bool persist_frame_on_invalid_input = false;
// To be equivalent to reference behavior where
// pixels persist for an extra frame.
// Not hardware accurate, but needed for weave interlace mode.
bool blend_previous_frame = false;
// Upscale deinterlacing deinterlaces by upscaling in Y, with an Y coordinate offset matching the field.
// If disabled, weave interlacing is used.
// Weave deinterlacing should *not* be used, except to run test suite!
bool upscale_deinterlacing = true;
// Individual VI filter stages that can be toggled off.
struct
{
bool aa = true;
bool scale = true;
bool serrate = true;
bool dither_filter = true;
bool divot_filter = true;
bool gamma_dither = true;
} vi;
};
// A scanout destination buffer together with the fence and dimensions
// associated with the scanout work.
struct VIScanoutBuffer
{
Vulkan::BufferHandle buffer;
// Fence associated with the GPU work writing `buffer`.
Vulkan::Fence fence;
unsigned width = 0;
unsigned height = 0;
};
class Renderer;
// Emulated Video Interface: reads frame data from (hidden) RDRAM and runs the
// fetch / AA / divot / scale stages to produce a presentable Vulkan image.
class VideoInterface : public Vulkan::DebugChannelInterface
{
public:
void set_device(Vulkan::Device *device);
void set_renderer(Renderer *renderer);
void set_vi_register(VIRegister reg, uint32_t value);
// Register the RDRAM backing buffer (plus sub-range) frames are read from.
void set_rdram(const Vulkan::Buffer *rdram, size_t offset, size_t size);
void set_hidden_rdram(const Vulkan::Buffer *hidden_rdram);
int resolve_shader_define(const char *name, const char *define) const;
// Produce the current frame in target_layout; scale_factor upscales output.
Vulkan::ImageHandle scanout(VkImageLayout target_layout, const ScanoutOptions &options = {}, unsigned scale_factor = 1);
// Query the RDRAM range the scanout will read.
void scanout_memory_range(unsigned &offset, unsigned &length) const;
void set_shader_bank(const ShaderBank *bank);
private:
Vulkan::Device *device = nullptr;
Renderer *renderer = nullptr;
uint32_t vi_registers[unsigned(VIRegister::Count)] = {};
const Vulkan::Buffer *rdram = nullptr;
const Vulkan::Buffer *hidden_rdram = nullptr;
// Gamma lookup table (see init_gamma_table()) sampled by the scale shader.
Vulkan::BufferHandle gamma_lut;
Vulkan::BufferViewHandle gamma_lut_view;
const ShaderBank *shader_bank = nullptr;
void init_gamma_table();
bool previous_frame_blank = false;
// Debug-channel filtering state (DebugChannelInterface).
bool debug_channel = false;
int filter_debug_channel_x = -1;
int filter_debug_channel_y = -1;
void message(const std::string &tag, uint32_t code,
uint32_t x, uint32_t y, uint32_t z,
uint32_t num_words, const Vulkan::DebugChannelInterface::Word *words) override;
// Frame state.
uint32_t frame_count = 0;
uint32_t last_valid_frame_count = 0;
Vulkan::ImageHandle prev_scanout_image;
VkImageLayout prev_image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
size_t rdram_offset = 0;
size_t rdram_size = 0;
bool timestamp = false;
// Decoded, derived view of vi_registers (see decode_vi_registers()).
struct Registers
{
int x_start, y_start;
int h_start, v_start;
int h_end, v_end;
int h_res, v_res;
int x_add, y_add;
int v_sync;
int vi_width;
int vi_offset;
int max_x, max_y;
int v_current_line;
bool left_clamp, right_clamp;
bool is_pal;
uint32_t status;
};
Registers decode_vi_registers() const;
// Scanout pipeline stages; parameter names show the chaining:
// vram_fetch -> aa_fetch -> divot -> scale -> downscale / upscale_deinterlace.
Vulkan::ImageHandle vram_fetch_stage(const Registers &registers,
unsigned scaling_factor) const;
Vulkan::ImageHandle aa_fetch_stage(Vulkan::CommandBuffer &cmd,
Vulkan::Image &vram_image,
const Registers &registers,
unsigned scaling_factor) const;
Vulkan::ImageHandle divot_stage(Vulkan::CommandBuffer &cmd,
Vulkan::Image &aa_image,
const Registers &registers,
unsigned scaling_factor) const;
Vulkan::ImageHandle scale_stage(Vulkan::CommandBuffer &cmd,
Vulkan::Image &divot_image,
Registers registers,
unsigned scaling_factor,
bool degenerate,
const ScanoutOptions &options) const;
Vulkan::ImageHandle downscale_stage(Vulkan::CommandBuffer &cmd,
Vulkan::Image &scale_image,
unsigned scaling_factor,
unsigned downscale_factor) const;
Vulkan::ImageHandle upscale_deinterlace(Vulkan::CommandBuffer &cmd,
Vulkan::Image &scale_image,
unsigned scaling_factor, bool field_select) const;
static bool need_fetch_bug_emulation(const Registers &reg, unsigned scaling_factor);
};
}

View File

@ -0,0 +1,122 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <queue>
#include <mutex>
#include <thread>
#include <condition_variable>
#include <utility>
#ifdef PARALLEL_RDP_SHADER_DIR
#include "global_managers.hpp"
#endif
namespace RDP
{
// Single background thread draining a queue of work items of type T.
// The Executor policy supplies:
//   is_sentinel(value)        -> true for the shutdown marker,
//   perform_work(value)       -> run one item (called outside any lock),
//   notify_work_locked(value) -> completion bookkeeping, called with the
//                                "to main" mutex held before waking wait()ers.
template <typename T, typename Executor>
class WorkerThread
{
public:
explicit WorkerThread(
#ifdef PARALLEL_RDP_SHADER_DIR
Granite::Global::GlobalManagersHandle globals,
#endif
Executor exec)
: executor(std::move(exec))
#ifdef PARALLEL_RDP_SHADER_DIR
, handles(std::move(globals))
#endif
{
thr = std::thread(&WorkerThread::main_loop, this);
}
// Shutdown: pushes a default-constructed T, which the Executor must report
// as a sentinel, then joins the worker thread.
~WorkerThread()
{
if (thr.joinable())
{
{
std::lock_guard<std::mutex> holder{to_thread_mutex};
work_queue.push({});
to_thread_cond.notify_one();
}
thr.join();
}
}
// Block the calling thread until cond() is true. cond is evaluated under
// the "to main" mutex, pairing it with Executor::notify_work_locked updates.
template <typename Cond>
void wait(Cond &&cond)
{
std::unique_lock<std::mutex> holder{to_main_mutex};
to_main_cond.wait(holder, std::forward<Cond>(cond));
}
// Enqueue one work item for the worker thread.
void push(T &&t)
{
std::lock_guard<std::mutex> holder{to_thread_mutex};
work_queue.push(std::move(t));
to_thread_cond.notify_one();
}
private:
std::thread thr;
// Queue plus condition variable feeding work to the worker thread.
std::mutex to_thread_mutex;
std::condition_variable to_thread_cond;
// Condition variable signalling completions back to the main thread.
std::mutex to_main_mutex;
std::condition_variable to_main_cond;
std::queue<T> work_queue;
Executor executor;
#ifdef PARALLEL_RDP_SHADER_DIR
// Granite global managers handed over to the worker thread's context.
Granite::Global::GlobalManagersHandle handles;
#endif
void main_loop()
{
#ifdef PARALLEL_RDP_SHADER_DIR
Granite::Global::set_thread_context(*handles);
handles.reset();
#endif
for (;;)
{
T value;
{
std::unique_lock<std::mutex> holder{to_thread_mutex};
to_thread_cond.wait(holder, [this]() { return !work_queue.empty(); });
value = std::move(work_queue.front());
work_queue.pop();
}
if (executor.is_sentinel(value))
break;
// Execute the item without holding any lock.
executor.perform_work(value);
std::lock_guard<std::mutex> holder{to_main_mutex};
executor.notify_work_locked(value);
to_main_cond.notify_one();
}
}
};
}

View File

@ -0,0 +1,82 @@
/* Copyright (c) 2017-2020 Hans-Kristian Arntzen
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "aligned_alloc.hpp"
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
#include <malloc.h>
#endif
namespace Util
{
// Allocate `size` bytes aligned to `boundary` (must be a power of two).
// Returns nullptr on failure. Release with memalign_free().
void *memalign_alloc(size_t boundary, size_t size)
{
#if defined(_WIN32)
	return _aligned_malloc(size, boundary);
#elif defined(_ISOC11_SOURCE)
	// C11 aligned_alloc requires size to be a multiple of the alignment;
	// round up to stay conforming (callers only rely on at least `size` bytes).
	return aligned_alloc(boundary, (size + boundary - 1) & ~(boundary - 1));
#elif (_POSIX_C_SOURCE >= 200112L) || (_XOPEN_SOURCE >= 600)
	void *ptr = nullptr;
	// Fix: posix_memalign returns 0 on success or a *positive* errno value on
	// failure — it never returns a negative value, so the old `< 0` check
	// could not detect errors.
	if (posix_memalign(&ptr, boundary, size) != 0)
		return nullptr;
	return ptr;
#else
	// Align stuff ourselves. Kinda ugly, but will work anywhere.
	// Over-allocate, align the returned address up, and stash the original
	// malloc pointer just below it so memalign_free() can recover it.
	void **place;
	uintptr_t addr = 0;
	void *ptr = malloc(boundary + size + sizeof(uintptr_t));
	if (ptr == nullptr)
		return nullptr;
	addr = ((uintptr_t)ptr + sizeof(uintptr_t) + boundary) & ~(boundary - 1);
	place = (void **) addr;
	place[-1] = ptr;
	return (void *) addr;
#endif
}

// Aligned allocation with the memory zero-initialized.
void *memalign_calloc(size_t boundary, size_t size)
{
	void *ret = memalign_alloc(boundary, size);
	if (ret)
		memset(ret, 0, size);
	return ret;
}

// Release memory obtained from memalign_alloc()/memalign_calloc().
// Safe to call with nullptr.
void memalign_free(void *ptr)
{
#if defined(_WIN32)
	_aligned_free(ptr);
#elif !defined(_ISOC11_SOURCE) && !((_POSIX_C_SOURCE >= 200112L) || (_XOPEN_SOURCE >= 600))
	// Manual-alignment path: the real malloc pointer lives just below `ptr`.
	if (ptr != nullptr)
	{
		void **p = (void **) ptr;
		free(p[-1]);
	}
#else
	free(ptr);
#endif
}
}

View File

@ -0,0 +1,62 @@
/* Copyright (c) 2017-2020 Hans-Kristian Arntzen
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <stddef.h>
#include <stdexcept>
#include <new>
namespace Util
{
void *memalign_alloc(size_t boundary, size_t size);
void *memalign_calloc(size_t boundary, size_t size);
void memalign_free(void *ptr);

// Mixin: deriving from AlignedAllocation<T> routes operator new/delete for T
// through the aligned allocator, so heap instances honor alignof(T).
template <typename T>
struct AlignedAllocation
{
	static void *operator new(size_t size)
	{
		void *mem = ::Util::memalign_alloc(alignof(T), size);
		if (mem == nullptr)
			throw std::bad_alloc();
		return mem;
	}

	static void *operator new[](size_t size)
	{
		void *mem = ::Util::memalign_alloc(alignof(T), size);
		if (mem == nullptr)
			throw std::bad_alloc();
		return mem;
	}

	static void operator delete(void *ptr)
	{
		return ::Util::memalign_free(ptr);
	}

	static void operator delete[](void *ptr)
	{
		return ::Util::memalign_free(ptr);
	}
};
}

View File

@ -0,0 +1,106 @@
/* Copyright (c) 2017-2020 Hans-Kristian Arntzen
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#ifdef _MSC_VER
#include <intrin.h>
#endif
namespace Util
{
// NOTE(review): these helpers use uint32_t but this header does not include
// <stdint.h> itself — presumably consumers include it beforehand; confirm.
#ifdef __GNUC__
// Leading/trailing zero counts; defined to return 32 for x == 0
// (the raw builtins are undefined for 0).
#define leading_zeroes(x) ((x) == 0 ? 32 : __builtin_clz(x))
#define trailing_zeroes(x) ((x) == 0 ? 32 : __builtin_ctz(x))
// Length of the run of set bits starting at bit 0.
#define trailing_ones(x) __builtin_ctz(~uint32_t(x))
#elif defined(_MSC_VER)
namespace Internal
{
static inline uint32_t clz(uint32_t x)
{
unsigned long result;
if (_BitScanReverse(&result, x))
return 31 - result;
else
return 32;
}
static inline uint32_t ctz(uint32_t x)
{
unsigned long result;
if (_BitScanForward(&result, x))
return result;
else
return 32;
}
}
#define leading_zeroes(x) ::Util::Internal::clz(x)
#define trailing_zeroes(x) ::Util::Internal::ctz(x)
#define trailing_ones(x) ::Util::Internal::ctz(~uint32_t(x))
#else
#error "Implement me."
#endif
// Invoke func(bit_index) for every set bit in value, lowest bit first.
template <typename T>
inline void for_each_bit(uint32_t value, const T &func)
{
while (value)
{
uint32_t bit = trailing_zeroes(value);
func(bit);
value &= ~(1u << bit);
}
}
// Invoke func(start_bit, length) for every contiguous run of set bits,
// lowest run first.
template <typename T>
inline void for_each_bit_range(uint32_t value, const T &func)
{
// All-ones is special-cased: the general path below would otherwise need a
// 32-bit shift (1u << 32), which is undefined behavior.
if (value == ~0u)
{
func(0, 32);
return;
}
uint32_t bit_offset = 0;
while (value)
{
// Skip zeros up to the next run; the shift keeps positions relative so
// the next trailing_zeroes also accounts for the run cleared below.
uint32_t bit = trailing_zeroes(value);
bit_offset += bit;
value >>= bit;
uint32_t range = trailing_ones(value);
func(bit_offset, range);
value &= ~((1u << range) - 1);
}
}
// Round v up to the next power of two (v itself if already a power of two).
// Note: next_pow2(0) wraps around and yields 0.
inline uint32_t next_pow2(uint32_t v)
{
v--;
v |= v >> 16;
v |= v >> 8;
v |= v >> 4;
v |= v >> 2;
v |= v >> 1;
return v + 1;
}
}

View File

@ -0,0 +1,34 @@
/* Copyright (c) 2017-2020 Hans-Kristian Arntzen
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <type_traits>
namespace Util
{
// Cast a scoped (or unscoped) enum value to its underlying integer type.
template <typename T>
constexpr auto ecast(T x) -> typename std::underlying_type<T>::type
{
	return static_cast<typename std::underlying_type<T>::type>(x);
}
}

View File

@ -0,0 +1,105 @@
/* Copyright (c) 2017-2020 Hans-Kristian Arntzen
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <stdint.h>
#include <string>
namespace Util
{
using Hash = uint64_t;

// Streaming 64-bit hasher in the FNV-1a family (multiply by 0x100000001b3,
// then XOR in the next word). Feed values via the typed methods and read the
// running hash with get().
class Hasher
{
public:
	Hasher() = default;

	// Resume hashing from a previously obtained hash value.
	explicit Hasher(Hash h_)
		: h(h_)
	{
	}

	inline void u32(uint32_t value)
	{
		h = (h * 0x100000001b3ull) ^ value;
	}

	inline void s32(int32_t value)
	{
		u32(uint32_t(value));
	}

	// Hashes the bit pattern of the float (so -0.0f and +0.0f differ).
	inline void f32(float value)
	{
		union
		{
			float f32;
			uint32_t u32;
		} u;
		u.f32 = value;
		u32(u.u32);
	}

	inline void u64(uint64_t value)
	{
		u32(uint32_t(value & 0xffffffffu));
		u32(uint32_t(value >> 32));
	}

	// Folds `size` bytes in element-sized steps; each element is XORed in
	// directly at its own width.
	template <typename T>
	inline void data(const T *data_, size_t size)
	{
		size_t count = size / sizeof(*data_);
		while (count--)
			h = (h * 0x100000001b3ull) ^ *data_++;
	}

	template <typename T>
	inline void pointer(T *ptr)
	{
		u64(reinterpret_cast<uintptr_t>(ptr));
	}

	// Strings are prefixed with a 0xff marker word; both overloads hash
	// identically for the same character sequence.
	inline void string(const char *str)
	{
		u32(0xff);
		for (const char *p = str; *p != '\0'; p++)
			u32(uint8_t(*p));
	}

	inline void string(const std::string &str)
	{
		u32(0xff);
		for (auto &c : str)
			u32(uint8_t(c));
	}

	inline Hash get() const
	{
		return h;
	}

private:
	// FNV-1a 64-bit offset basis.
	Hash h = 0xcbf29ce484222325ull;
};
}

View File

@ -0,0 +1,296 @@
/* Copyright (c) 2017-2020 Hans-Kristian Arntzen
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <stddef.h>
#include <utility>
#include <memory>
#include <atomic>
#include <type_traits>
namespace Util
{
// Non-atomic reference counter for handles confined to a single thread.
// The count starts at 1: constructing the object is the first reference.
class SingleThreadCounter
{
public:
inline void add_ref()
{
count++;
}
// Returns true when the last reference was dropped.
inline bool release()
{
return --count == 0;
}
private:
size_t count = 1;
};
// Atomic reference counter for handles shared across threads.
class MultiThreadCounter
{
public:
MultiThreadCounter()
{
count.store(1, std::memory_order_relaxed);
}
inline void add_ref()
{
count.fetch_add(1, std::memory_order_relaxed);
}
// Returns true when the last reference was dropped. acq_rel ordering makes
// earlier writes visible to the thread that ends up destroying the object.
inline bool release()
{
auto result = count.fetch_sub(1, std::memory_order_acq_rel);
return result == 1;
}
private:
std::atomic_size_t count;
};
template <typename T>
class IntrusivePtr;
// CRTP base embedding the reference count inside the object itself.
// T is the most-derived type handed to Deleter when the count reaches zero.
template <typename T, typename Deleter = std::default_delete<T>, typename ReferenceOps = SingleThreadCounter>
class IntrusivePtrEnabled
{
public:
using IntrusivePtrType = IntrusivePtr<T>;
using EnabledBase = T;
using EnabledDeleter = Deleter;
using EnabledReferenceOp = ReferenceOps;
// Drop one reference; destroys the object via Deleter on the last one.
void release_reference()
{
if (reference_count.release())
Deleter()(static_cast<T *>(this));
}
void add_reference()
{
reference_count.add_ref();
}
IntrusivePtrEnabled() = default;
// Copying would duplicate the embedded count, so it is forbidden.
IntrusivePtrEnabled(const IntrusivePtrEnabled &) = delete;
void operator=(const IntrusivePtrEnabled &) = delete;
protected:
// Create a new owning handle to this object (bumps the count).
Util::IntrusivePtr<T> reference_from_this();
private:
ReferenceOps reference_count;
};
// Smart pointer over types deriving from IntrusivePtrEnabled.
template <typename T>
class IntrusivePtr
{
public:
template <typename U>
friend class IntrusivePtr;
IntrusivePtr() = default;
// Adopts an existing reference; does NOT bump the count.
explicit IntrusivePtr(T *handle)
: data(handle)
{
}
T &operator*()
{
return *data;
}
const T &operator*() const
{
return *data;
}
T *operator->()
{
return data;
}
const T *operator->() const
{
return data;
}
explicit operator bool() const
{
return data != nullptr;
}
bool operator==(const IntrusivePtr &other) const
{
return data == other.data;
}
bool operator!=(const IntrusivePtr &other) const
{
return data != other.data;
}
T *get()
{
return data;
}
const T *get() const
{
return data;
}
// Drop our reference (possibly destroying the object) and become empty.
void reset()
{
using ReferenceBase = IntrusivePtrEnabled<
typename T::EnabledBase,
typename T::EnabledDeleter,
typename T::EnabledReferenceOp>;
// Static up-cast here to avoid potential issues with multiple intrusive inheritance.
// Also makes sure that the pointer type actually inherits from this type.
if (data)
static_cast<ReferenceBase *>(data)->release_reference();
data = nullptr;
}
// Copy-assign from a handle to a derived type (upcast only, enforced below).
template <typename U>
IntrusivePtr &operator=(const IntrusivePtr<U> &other)
{
static_assert(std::is_base_of<T, U>::value,
"Cannot safely assign downcasted intrusive pointers.");
using ReferenceBase = IntrusivePtrEnabled<
typename T::EnabledBase,
typename T::EnabledDeleter,
typename T::EnabledReferenceOp>;
reset();
data = static_cast<T *>(other.data);
// Static up-cast here to avoid potential issues with multiple intrusive inheritance.
// Also makes sure that the pointer type actually inherits from this type.
if (data)
static_cast<ReferenceBase *>(data)->add_reference();
return *this;
}
IntrusivePtr &operator=(const IntrusivePtr &other)
{
using ReferenceBase = IntrusivePtrEnabled<
typename T::EnabledBase,
typename T::EnabledDeleter,
typename T::EnabledReferenceOp>;
if (this != &other)
{
reset();
data = other.data;
if (data)
static_cast<ReferenceBase *>(data)->add_reference();
}
return *this;
}
template <typename U>
IntrusivePtr(const IntrusivePtr<U> &other)
{
*this = other;
}
IntrusivePtr(const IntrusivePtr &other)
{
*this = other;
}
~IntrusivePtr()
{
reset();
}
// Move-assignment steals the reference; no count adjustment needed.
template <typename U>
IntrusivePtr &operator=(IntrusivePtr<U> &&other) noexcept
{
reset();
data = other.data;
other.data = nullptr;
return *this;
}
IntrusivePtr &operator=(IntrusivePtr &&other) noexcept
{
if (this != &other)
{
reset();
data = other.data;
other.data = nullptr;
}
return *this;
}
template <typename U>
IntrusivePtr(IntrusivePtr<U> &&other) noexcept
{
*this = std::move(other);
}
// NOTE(review): U is not deducible from this parameter, so this overload can
// never be selected by overload resolution; same-type moves go through the
// IntrusivePtr<U>&& constructor above. Presumably a leftover — confirm.
template <typename U>
IntrusivePtr(IntrusivePtr &&other) noexcept
{
*this = std::move(other);
}
private:
T *data = nullptr;
};
template <typename T, typename Deleter, typename ReferenceOps>
IntrusivePtr<T> IntrusivePtrEnabled<T, Deleter, ReferenceOps>::reference_from_this()
{
add_reference();
return IntrusivePtr<T>(static_cast<T *>(this));
}
template <typename Derived>
using DerivedIntrusivePtrType = IntrusivePtr<Derived>;
// make_shared-style helper: allocate T and wrap it in an owning handle.
template <typename T, typename... P>
DerivedIntrusivePtrType<T> make_handle(P &&... p)
{
return DerivedIntrusivePtrType<T>(new T(std::forward<P>(p)...));
}
// Allocate Derived but return it typed as Base's handle type.
template <typename Base, typename Derived, typename... P>
typename Base::IntrusivePtrType make_derived_handle(P &&... p)
{
return typename Base::IntrusivePtrType(new Derived(std::forward<P>(p)...));
}
// Convenience alias: intrusive base with an atomic reference count.
template <typename T>
using ThreadSafeIntrusivePtrEnabled = IntrusivePtrEnabled<T, std::default_delete<T>, MultiThreadCounter>;
}

Some files were not shown because too many files have changed in this diff Show More