Ares64 Performance Core (#3149)

* prep for performance ares64 core, needs work unmanaged side

* get this going

* rebuild this

* apparently build didnt get cp'd? need to investigate

* fix build, other shit

* suppress these warnings

* tweaks and builds

* apparently bizinvoker doesnt like having LibAres64 class shared between non-waterbox and waterboxed, so split it.
also states for performance core

* builds

* fix this option, describe supersampling properly

* penguin64
This commit is contained in:
CasualPokePlayer 2022-02-16 02:15:27 -08:00 committed by GitHub
parent 05f11be191
commit 655ed7949e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
209 changed files with 172313 additions and 78 deletions

Binary file not shown.

BIN
Assets/dll/libares64.dll Normal file

Binary file not shown.

BIN
Assets/dll/libares64.so Normal file

Binary file not shown.

View File

@ -27,7 +27,7 @@ namespace BizHawk.Client.Common
(new[] { VSystemID.Raw.SNES },
new[] { CoreNames.Faust, CoreNames.Snes9X, CoreNames.Bsnes, CoreNames.Bsnes115 }),
(new[] { VSystemID.Raw.N64 },
new[] { CoreNames.Mupen64Plus, CoreNames.Ares64, }),
new[] { CoreNames.Mupen64Plus, CoreNames.Ares64Performance, CoreNames.Ares64Accuracy }),
(new[] { VSystemID.Raw.SGB },
new[] { CoreNames.Gambatte, CoreNames.Bsnes, CoreNames.Bsnes115}),
(new[] { VSystemID.Raw.GB, VSystemID.Raw.GBC },

View File

@ -3,7 +3,7 @@ using System.ComponentModel;
using BizHawk.Common;
using BizHawk.Emulation.Common;
namespace BizHawk.Emulation.Cores.Consoles.Nintendo.Ares64
namespace BizHawk.Emulation.Cores.Consoles.Nintendo.Ares64.Accuracy
{
public partial class Ares64 : ISettable<object, Ares64.Ares64SyncSettings>
{

View File

@ -6,13 +6,13 @@ using BizHawk.Emulation.Common;
using BizHawk.Emulation.Cores.Properties;
using BizHawk.Emulation.Cores.Waterbox;
namespace BizHawk.Emulation.Cores.Consoles.Nintendo.Ares64
namespace BizHawk.Emulation.Cores.Consoles.Nintendo.Ares64.Accuracy
{
[PortedCore(CoreNames.Ares64, "ares team, Near", "v126", "https://ares-emulator.github.io/", isReleased: false)]
[PortedCore(CoreNames.Ares64Accuracy, "ares team, Near", "v126", "https://ares-emulator.github.io/", isReleased: false)]
[ServiceNotApplicable(new[] { typeof(IDriveLight), })]
public partial class Ares64 : WaterboxCore, IRegionable
{
private readonly LibAres64 _core;
private readonly LibAres64Accuracy _core;
[CoreConstructor(VSystemID.Raw.N64)]
public Ares64(CoreLoadParameters<object, Ares64SyncSettings> lp)
@ -40,7 +40,7 @@ namespace BizHawk.Emulation.Cores.Consoles.Nintendo.Ares64
N64Controller = CreateControllerDefinition(ControllerSettings);
_core = PreInit<LibAres64>(new WaterboxOptions
_core = PreInit<LibAres64Accuracy>(new WaterboxOptions
{
Filename = "ares64.wbx",
SbrkHeapSizeKB = 2 * 1024,
@ -68,19 +68,32 @@ namespace BizHawk.Emulation.Cores.Consoles.Nintendo.Ares64
VsyncDenominator = 1;
}
LibAres64.LoadFlags loadFlags = 0;
if (_syncSettings.RestrictAnalogRange)
loadFlags |= LibAres64.LoadFlags.RestrictAnalogRange;
if (pal)
loadFlags |= LibAres64.LoadFlags.Pal;
var pif = Util.DecompressGzipFile(new MemoryStream(pal ? Resources.PIF_PAL_ROM.Value : Resources.PIF_NTSC_ROM.Value));
_exe.AddReadonlyFile(pif, pal ? "pif.pal.rom" : "pif.ntsc.rom");
_exe.AddReadonlyFile(rom, "program.rom");
if (!_core.Init(ControllerSettings, _syncSettings.RestrictAnalogRange, pal))
unsafe
{
throw new InvalidOperationException("Init returned false!");
fixed (byte* pifPtr = pif, romPtr = rom)
{
var loadData = new LibAres64.LoadData()
{
PifData = (IntPtr)pifPtr,
PifLen = pif.Length,
RomData = (IntPtr)romPtr,
RomLen = rom.Length,
};
if (!_core.Init(loadData, ControllerSettings, loadFlags))
{
throw new InvalidOperationException("Init returned false!");
}
}
}
_exe.RemoveReadonlyFile(pal ? "pif.pal.rom" : "pif.ntsc.rom");
_exe.RemoveReadonlyFile("program.rom");
PostInit();
DeterministicEmulation = true;
}

View File

@ -0,0 +1,77 @@
using System;
using System.ComponentModel;
using Newtonsoft.Json;
using BizHawk.Common;
using BizHawk.Emulation.Common;
namespace BizHawk.Emulation.Cores.Consoles.Nintendo.Ares64.Performance
{
/// <summary>
/// Settings implementation for the Ares64 performance core.
/// There are no non-sync settings (GetSettings returns null); all options live
/// in <see cref="Ares64SyncSettings"/> and any change forces a core reboot.
/// </summary>
public partial class Ares64 : ISettable<object, Ares64.Ares64SyncSettings>
{
private Ares64SyncSettings _syncSettings;
// This core exposes no non-sync settings.
public object GetSettings() => null;
// Hand out a clone so callers cannot mutate the live settings object.
public Ares64SyncSettings GetSyncSettings() => _syncSettings.Clone();
public PutSettingsDirtyBits PutSettings(object o) => PutSettingsDirtyBits.None;
// Every sync-settings field participates in NeedsReboot (deep equality), so
// any difference at all requests a core reboot.
public PutSettingsDirtyBits PutSyncSettings(Ares64SyncSettings o)
{
var ret = Ares64SyncSettings.NeedsReboot(_syncSettings, o);
_syncSettings = o;
return ret ? PutSettingsDirtyBits.RebootCore : PutSettingsDirtyBits.None;
}
/// <summary>
/// Sync settings for the Ares64 performance core. Controller port types,
/// analog-range restriction, and the Vulkan RDP options.
/// </summary>
public class Ares64SyncSettings
{
// Port 1 defaults to a controller with a Controller Pak attached.
[DisplayName("Player 1 Controller")]
[Description("")]
[DefaultValue(LibAres64.ControllerType.Mempak)]
public LibAres64.ControllerType P1Controller { get; set; }
[DisplayName("Player 2 Controller")]
[Description("")]
[DefaultValue(LibAres64.ControllerType.Unplugged)]
public LibAres64.ControllerType P2Controller { get; set; }
[DisplayName("Player 3 Controller")]
[Description("")]
[DefaultValue(LibAres64.ControllerType.Unplugged)]
public LibAres64.ControllerType P3Controller { get; set; }
[DisplayName("Player 4 Controller")]
[Description("")]
[DefaultValue(LibAres64.ControllerType.Unplugged)]
public LibAres64.ControllerType P4Controller { get; set; }
[DisplayName("Restrict Analog Range")]
[Description("Restricts analog range to account for physical limitations.")]
[DefaultValue(false)]
public bool RestrictAnalogRange { get; set; }
// When false, the Vulkan-only options below (SuperSample, VulkanUpscale)
// have no effect — the core constructor only forwards them when Vulkan is on.
[DisplayName("Enable Vulkan")]
[Description("Enables Vulkan RDP. May fallback to software RDP if your GPU does not support Vulkan.")]
[DefaultValue(true)]
public bool EnableVulkan { get; set; }
[DisplayName("Supersampling")]
[Description("Scales HD and UHD resolutions back down to SD")]
[DefaultValue(false)]
public bool SuperSample { get; set; }
[DisplayName("Vulkan Upscale")]
[Description("")]
[DefaultValue(LibAres64.VulkanUpscaleOpts.SD)]
public LibAres64.VulkanUpscaleOpts VulkanUpscale { get; set; }
// Populate every property from its [DefaultValue] attribute.
public Ares64SyncSettings() => SettingsUtil.SetDefaultValues(this);
// Shallow copy is sufficient: all members are value types/enums.
public Ares64SyncSettings Clone() => MemberwiseClone() as Ares64SyncSettings;
// Reboot whenever anything differs; there are no hot-swappable settings.
public static bool NeedsReboot(Ares64SyncSettings x, Ares64SyncSettings y) => !DeepEquality.DeepEquals(x, y);
}
}
}

View File

@ -0,0 +1,449 @@
using System;
using System.IO;
using System.Linq;
using BizHawk.BizInvoke;
using BizHawk.Common;
using BizHawk.Emulation.Common;
using BizHawk.Emulation.Cores.Properties;
using BizHawk.Emulation.Cores.Waterbox;
namespace BizHawk.Emulation.Cores.Consoles.Nintendo.Ares64.Performance
{
/// <summary>
/// Ares64 "performance" core. Unlike the waterboxed accuracy core, this talks
/// to the native ares64 library (libares64.dll/.so) directly through BizInvoker,
/// which is why it is registered singleInstance: true and is never deterministic.
/// </summary>
[PortedCore(CoreNames.Ares64Performance, "ares team, Near", "v126", "https://ares-emulator.github.io/", singleInstance: true, isReleased: false)]
[ServiceNotApplicable(new[] { typeof(IDriveLight), })]
public partial class Ares64 : IEmulator, IVideoProvider, ISoundProvider, IStatable, IInputPollable, ISaveRam, IRegionable
{
// One static invoker for the whole process — matches singleInstance above.
private static readonly LibAres64Performance _core;
static Ares64()
{
// hasLimitedLifetime: false — the native library stays loaded for the
// lifetime of the process.
var resolver = new DynamicLibraryImportResolver(
OSTailoredCode.IsUnixHost ? "libares64.so" : "libares64.dll", hasLimitedLifetime: false);
_core = BizInvoker.GetInvoker<LibAres64Performance>(resolver, CallingConventionAdapters.Native);
}
private readonly BasicServiceProvider _serviceProvider;
public IEmulatorServiceProvider ServiceProvider => _serviceProvider;
public int Frame { get; private set; }
public int LagCount { get; set; }
public bool IsLagFrame { get; set; }
[FeatureNotImplemented]
public IInputCallbackSystem InputCallbacks => throw new NotImplementedException();
public string SystemId => VSystemID.Raw.N64;
// Never deterministic; the constructor rejects deterministic requests outright.
public bool DeterministicEmulation => false;
public void ResetCounters()
{
Frame = 0;
LagCount = 0;
IsLagFrame = false;
}
// Disposing tears down the native core (frees its buffers); the invoker itself
// is static and stays alive.
public void Dispose() => _core.Deinit();
[CoreConstructor(VSystemID.Raw.N64)]
public Ares64(CoreLoadParameters<object, Ares64SyncSettings> lp)
{
if (lp.DeterministicEmulationRequested)
{
throw new InvalidOperationException("This core is not deterministic!");
}
_serviceProvider = new(this);
_syncSettings = lp.SyncSettings ?? new();
// Video buffer is sized for the largest possible frame (PAL 640x576)
// scaled by the Vulkan upscale factor (1 when Vulkan is disabled).
int upscale = _syncSettings.EnableVulkan ? (int)_syncSettings.VulkanUpscale : 1;
_videoBuffer = new int[640 * upscale * 576 * upscale];
ControllerSettings = new[]
{
_syncSettings.P1Controller,
_syncSettings.P2Controller,
_syncSettings.P3Controller,
_syncSettings.P4Controller,
};
N64Controller = CreateControllerDefinition(ControllerSettings);
var rom = lp.Roms[0].RomData;
// Region is derived from the country-code byte of the N64 ROM header;
// the listed codes are the PAL territories, everything else is NTSC.
Region = rom[0x3E] switch
{
0x44 or 0x46 or 0x49 or 0x50 or 0x53 or 0x55 or 0x58 or 0x59 => DisplayType.PAL,
_ => DisplayType.NTSC,
};
var pal = Region == DisplayType.PAL;
// PAL: exactly 50 Hz. NTSC: 60000/1001 (~59.94 Hz).
VsyncNumerator = pal ? 50 : 60000;
VsyncDenominator = pal ? 1 : 1001;
LibAres64.LoadFlags loadFlags = 0;
if (_syncSettings.RestrictAnalogRange)
loadFlags |= LibAres64.LoadFlags.RestrictAnalogRange;
if (pal)
loadFlags |= LibAres64.LoadFlags.Pal;
if (_syncSettings.EnableVulkan)
loadFlags |= LibAres64.LoadFlags.UseVulkan;
if (_syncSettings.SuperSample)
loadFlags |= LibAres64.LoadFlags.SuperSample;
// PIF (boot) ROM is shipped gzipped in embedded resources.
var pif = Util.DecompressGzipFile(new MemoryStream(pal ? Resources.PIF_PAL_ROM.Value : Resources.PIF_NTSC_ROM.Value));
// Pin the PIF and ROM buffers only for the duration of Init; the native
// side is expected to copy the data it needs — TODO confirm against
// BizInterface.cpp before relying on the pointers ever being cached.
unsafe
{
fixed (byte* pifPtr = pif, romPtr = rom)
{
var loadData = new LibAres64.LoadData()
{
PifData = (IntPtr)pifPtr,
PifLen = pif.Length,
RomData = (IntPtr)romPtr,
RomLen = rom.Length,
VulkanUpscale = upscale,
};
if (!_core.Init(loadData, ControllerSettings, loadFlags))
{
throw new InvalidOperationException("Init returned false!");
}
}
}
ResetCounters();
// Query the native core's memory map; drop empty entries and function
// hooks (which are not real memory).
var areas = new LibWaterboxCore.MemoryArea[256];
_core.GetMemoryAreas(areas);
_memoryAreas = areas.Where(a => a.Data != IntPtr.Zero && a.Size != 0 && !a.Flags.HasFlag(LibWaterboxCore.MemoryDomainFlags.FunctionHook))
.ToArray();
var memoryDomains = _memoryAreas.Select(a => new WaterboxMemoryDomainPointer(a, _monitor)).ToList();
// Exactly one domain must be flagged Primary — Single() throws otherwise.
var primaryDomain = memoryDomains
.Where(md => md.Definition.Flags.HasFlag(LibWaterboxCore.MemoryDomainFlags.Primary))
.Single();
var mdl = new MemoryDomainList(
memoryDomains.Cast<MemoryDomain>().ToList()
)
{
MainMemory = primaryDomain
};
_serviceProvider.Register<IMemoryDomains>(mdl);
// Saveram-capable domains back the ISaveRam implementation below.
_saveramAreas = memoryDomains
.Where(md => md.Definition.Flags.HasFlag(LibWaterboxCore.MemoryDomainFlags.Saverammable))
.ToArray();
_saveramSize = (int)_saveramAreas.Sum(a => a.Size);
}
public DisplayType Region { get; }
public ControllerDefinition ControllerDefinition => N64Controller;
private ControllerDefinition N64Controller { get; }
public LibAres64.ControllerType[] ControllerSettings { get; }
// Build the controller definition from the per-port settings: buttons and an
// analog pair for each plugged port, a rumble channel for Rumble Pak ports,
// and console-level Reset/Power buttons.
private static ControllerDefinition CreateControllerDefinition(LibAres64.ControllerType[] controllerSettings)
{
var ret = new ControllerDefinition("Nintendo 64 Controller");
for (int i = 0; i < 4; i++)
{
if (controllerSettings[i] != LibAres64.ControllerType.Unplugged)
{
ret.BoolButtons.Add($"P{i + 1} DPad U");
ret.BoolButtons.Add($"P{i + 1} DPad D");
ret.BoolButtons.Add($"P{i + 1} DPad L");
ret.BoolButtons.Add($"P{i + 1} DPad R");
ret.BoolButtons.Add($"P{i + 1} Start");
ret.BoolButtons.Add($"P{i + 1} Z");
ret.BoolButtons.Add($"P{i + 1} B");
ret.BoolButtons.Add($"P{i + 1} A");
ret.BoolButtons.Add($"P{i + 1} C Up");
ret.BoolButtons.Add($"P{i + 1} C Down");
ret.BoolButtons.Add($"P{i + 1} C Left");
ret.BoolButtons.Add($"P{i + 1} C Right");
ret.BoolButtons.Add($"P{i + 1} L");
ret.BoolButtons.Add($"P{i + 1} R");
// Stick: signed 8-bit range on both axes, neutral at 0.
ret.AddXYPair($"P{i + 1} {{0}} Axis", AxisPairOrientation.RightAndUp, (-128).RangeTo(127), 0);
if (controllerSettings[i] == LibAres64.ControllerType.Rumblepak)
{
ret.HapticsChannels.Add($"P{i + 1} Rumble Pak");
}
}
}
ret.BoolButtons.Add("Reset");
ret.BoolButtons.Add("Power");
return ret.MakeImmutable();
}
// Translate the frontend's named buttons for port `num` into the native
// core's button bitmask.
private static LibAres64.Buttons GetButtons(IController controller, int num)
{
LibAres64.Buttons ret = 0;
if (controller.IsPressed($"P{num} DPad U"))
ret |= LibAres64.Buttons.UP;
if (controller.IsPressed($"P{num} DPad D"))
ret |= LibAres64.Buttons.DOWN;
if (controller.IsPressed($"P{num} DPad L"))
ret |= LibAres64.Buttons.LEFT;
if (controller.IsPressed($"P{num} DPad R"))
ret |= LibAres64.Buttons.RIGHT;
if (controller.IsPressed($"P{num} B"))
ret |= LibAres64.Buttons.B;
if (controller.IsPressed($"P{num} A"))
ret |= LibAres64.Buttons.A;
if (controller.IsPressed($"P{num} C Up"))
ret |= LibAres64.Buttons.C_UP;
if (controller.IsPressed($"P{num} C Down"))
ret |= LibAres64.Buttons.C_DOWN;
if (controller.IsPressed($"P{num} C Left"))
ret |= LibAres64.Buttons.C_LEFT;
if (controller.IsPressed($"P{num} C Right"))
ret |= LibAres64.Buttons.C_RIGHT;
if (controller.IsPressed($"P{num} L"))
ret |= LibAres64.Buttons.L;
if (controller.IsPressed($"P{num} R"))
ret |= LibAres64.Buttons.R;
if (controller.IsPressed($"P{num} Z"))
ret |= LibAres64.Buttons.Z;
if (controller.IsPressed($"P{num} Start"))
ret |= LibAres64.Buttons.START;
return ret;
}
// Gather this frame's inputs (and push rumble state back to the frontend)
// into the FrameInfo struct consumed by the native FrameAdvance.
private LibWaterboxCore.FrameInfo FrameAdvancePrep(IController controller, bool render, bool rendersound)
{
for (int i = 0; i < 4; i++)
{
if (ControllerSettings[i] == LibAres64.ControllerType.Rumblepak)
{
// Rumble is on/off on the native side; map to full/zero strength.
controller.SetHapticChannelStrength($"P{i + 1} Rumble Pak", _core.GetRumbleStatus(i) ? int.MaxValue : 0);
}
}
return new LibAres64.FrameInfo
{
P1Buttons = GetButtons(controller, 1),
P1XAxis = (short)controller.AxisValue("P1 X Axis"),
P1YAxis = (short)controller.AxisValue("P1 Y Axis"),
P2Buttons = GetButtons(controller, 2),
P2XAxis = (short)controller.AxisValue("P2 X Axis"),
P2YAxis = (short)controller.AxisValue("P2 Y Axis"),
P3Buttons = GetButtons(controller, 3),
P3XAxis = (short)controller.AxisValue("P3 X Axis"),
P3YAxis = (short)controller.AxisValue("P3 Y Axis"),
P4Buttons = GetButtons(controller, 4),
P4XAxis = (short)controller.AxisValue("P4 X Axis"),
P4YAxis = (short)controller.AxisValue("P4 Y Axis"),
Reset = controller.IsPressed("Reset"),
Power = controller.IsPressed("Power"),
};
}
// Run one frame on the native core; the video/sound buffers are pinned for
// the duration of the call so the native side can write into them directly.
public unsafe bool FrameAdvance(IController controller, bool render, bool rendersound = true)
{
// Input callbacks are not implemented for this core; clear any native one.
_core.SetInputCallback(null);
fixed (int* vp = _videoBuffer)
fixed (short* sp = _soundBuffer)
{
var frame = FrameAdvancePrep(controller, render, rendersound);
frame.VideoBuffer = (IntPtr)vp;
frame.SoundBuffer = (IntPtr)sp;
_core.FrameAdvance(frame);
Frame++;
// Assignment intentional: record lag state and bump the counter.
if (IsLagFrame = frame.Lagged != 0)
LagCount++;
if (render)
{
BufferWidth = frame.Width;
BufferHeight = frame.Height;
}
if (rendersound)
{
_numSamples = frame.Samples;
}
else
{
_numSamples = 0;
}
FrameAdvancePost();
}
return true;
}
// If the core reported a zero width (presumably no new video this frame —
// TODO confirm), fall back to a plausible width for the reported height.
private void FrameAdvancePost()
{
if (BufferWidth == 0)
{
BufferWidth = BufferHeight == 239 ? 320 : 640;
}
}
public int[] GetVideoBuffer() => _videoBuffer;
private readonly int[] _videoBuffer;
public int VirtualWidth => 640;
public int VirtualHeight => 480;
public int BufferWidth { get; private set; }
public int BufferHeight { get; private set; }
public int VsyncNumerator { get; }
public int VsyncDenominator { get; }
// Opaque black.
public int BackgroundColor => unchecked((int)0xff000000);
public void SetSyncMode(SyncSoundMode mode)
{
if (mode == SyncSoundMode.Async)
{
throw new NotSupportedException("Async mode is not supported.");
}
}
public void GetSamplesSync(out short[] samples, out int nsamp)
{
samples = _soundBuffer;
nsamp = _numSamples;
}
public void GetSamplesAsync(short[] samples) => throw new InvalidOperationException("Async mode is not supported.");
public void DiscardSamples() {}
// Stereo sample buffer (2048 sample pairs), filled by the native core.
private readonly short[] _soundBuffer = new short[2048 * 2];
private int _numSamples;
public bool CanProvideAsync => false;
public SyncSoundMode SyncMode => SyncSoundMode.Sync;
// Scratch buffer for savestates; regrown whenever the native size changes.
private byte[] _stateBuffer = new byte[0];
public void SaveStateBinary(BinaryWriter writer)
{
var len = _core.SerializeSize();
if (len != _stateBuffer.Length)
{
_stateBuffer = new byte[len];
}
_core.Serialize(_stateBuffer);
// Length prefix first, so LoadStateBinary can validate before reading.
writer.Write(_stateBuffer.Length);
writer.Write(_stateBuffer);
}
public void LoadStateBinary(BinaryReader reader)
{
var len = reader.ReadInt32();
if (len != _core.SerializeSize())
{
throw new InvalidOperationException("Savestate size mismatch!");
}
if (len != _stateBuffer.Length)
{
_stateBuffer = new byte[len];
}
// NOTE(review): Read() may return fewer than len bytes; the return value
// is not checked here — confirm the underlying stream always satisfies it.
reader.Read(_stateBuffer, 0, len);
if (!_core.Unserialize(_stateBuffer, len))
{
throw new Exception($"{nameof(_core.Unserialize)}() returned false!");
}
}
private readonly LibWaterboxCore.MemoryArea[] _memoryAreas;
private readonly WaterboxMemoryDomain[] _saveramAreas;
private readonly int _saveramSize;
// Saveram is "modified" if any byte differs from the area's fill value
// (0xFF for OneFilled-flagged areas, 0x00 otherwise). Scans in 4KiB chunks,
// comparing 4 bytes at a time.
public unsafe bool SaveRamModified
{
get
{
if (_saveramSize == 0)
return false;
var buff = new byte[4096];
fixed (byte* bp = buff)
{
foreach (var area in _saveramAreas)
{
var stream = new MemoryDomainStream(area);
// -1 is 0xFFFFFFFF, i.e. four 0xFF fill bytes at once.
int cmp = (area.Definition.Flags & LibWaterboxCore.MemoryDomainFlags.OneFilled) != 0 ? -1 : 0;
while (true)
{
int nread = stream.Read(buff, 0, 4096);
if (nread == 0)
break;
int* p = (int*)bp;
int* pend = p + nread / sizeof(int);
while (p < pend)
{
if (*p++ != cmp)
return true;
}
}
}
}
return false;
}
}
// Concatenate all saveram areas into a single buffer (null if there are none).
public byte[] CloneSaveRam()
{
if (_saveramSize == 0)
return null;
var ret = new byte[_saveramSize];
var dest = new MemoryStream(ret, true);
foreach (var area in _saveramAreas)
{
new MemoryDomainStream(area).CopyTo(dest);
}
return ret;
}
// Inverse of CloneSaveRam: split the buffer back across the areas, in order.
public void StoreSaveRam(byte[] data)
{
if (data.Length != _saveramSize)
throw new InvalidOperationException("Saveram size mismatch");
var source = new MemoryStream(data, false);
foreach (var area in _saveramAreas)
{
WaterboxUtils.CopySome(source, new MemoryDomainStream(area), area.Size);
}
}
// No locking needed here (unlike the waterboxed core), so the memory-domain
// monitor is a no-op.
private readonly DummyMonitor _monitor = new();
private class DummyMonitor : IMonitor
{
public void Enter() { }
public void Exit() { }
}
}
}

View File

@ -59,10 +59,57 @@ namespace BizHawk.Emulation.Cores.Consoles.Nintendo.Ares64
public bool Power;
}
[Flags]
public enum LoadFlags : uint
{
RestrictAnalogRange = 1 << 0,
Pal = 1 << 1,
// performance only flags
UseVulkan = 1 << 2,
SuperSample = 1 << 3,
}
public enum VulkanUpscaleOpts : uint
{
SD = 1,
HD = 2,
UHD = 4,
}
[StructLayout(LayoutKind.Sequential)]
public class LoadData
{
public IntPtr PifData;
public int PifLen;
public IntPtr RomData;
public int RomLen;
// performance only data
public int VulkanUpscale;
}
[BizImport(CC)]
public abstract bool Init(ControllerType[] controllerSettings, bool restrictAnalogRange, bool pal);
public abstract bool Init(LoadData loadData, ControllerType[] controllerSettings, LoadFlags loadFlags);
[BizImport(CC)]
public abstract bool GetRumbleStatus(int num);
}
/// <summary>
/// Invoker type for the waterboxed accuracy build of ares64. A distinct type
/// (rather than LibAres64 itself) so BizInvoker can keep the waterboxed and
/// non-waterboxed bindings separate; it adds no entry points of its own.
/// </summary>
public abstract class LibAres64Accuracy : LibAres64
{
}
/// <summary>
/// Invoker type for the natively-loaded performance build of ares64. Adds the
/// entry points the waterbox host would otherwise provide: explicit teardown
/// and savestate serialization.
/// </summary>
public abstract class LibAres64Performance : LibAres64
{
// Tear down the native core and free its buffers.
[BizImport(CC)]
public abstract void Deinit();
// Size in bytes required for a savestate at the current emulation point.
[BizImport(CC)]
public abstract int SerializeSize();
// Write a savestate into buf; buf must be at least SerializeSize() bytes.
[BizImport(CC)]
public abstract void Serialize(byte[] buf);
// Restore a savestate of sz bytes; returns false on failure.
[BizImport(CC)]
public abstract bool Unserialize(byte[] buf, int sz);
}
}

View File

@ -10,7 +10,8 @@ namespace BizHawk.Emulation.Cores
public static class CoreNames
{
public const string A7800Hawk = "A7800Hawk";
public const string Ares64 = "Ares64";
public const string Ares64Accuracy = "Ares64 (Accuracy)";
public const string Ares64Performance = "Ares64 (Performance)";
public const string Atari2600Hawk = "Atari2600Hawk";
public const string Bsnes = "BSNES";
public const string Bsnes115 = "BSNESv115+";

View File

@ -26,11 +26,21 @@ namespace BizHawk.Emulation.Cores
}
}
}
else if (core is Ares64 ares64)
else if (core is Consoles.Nintendo.Ares64.Accuracy.Ares64 ares64Acc)
{
for (var i = 0; i < 4; i++)
{
if (ares64.ControllerSettings[i] != LibAres64.ControllerType.Unplugged)
if (ares64Acc.ControllerSettings[i] != LibAres64.ControllerType.Unplugged)
{
yield return StandardController(i + 1);
}
}
}
else if (core is Consoles.Nintendo.Ares64.Performance.Ares64 ares64Perf)
{
for (var i = 0; i < 4; i++)
{
if (ares64Perf.ControllerSettings[i] != LibAres64.ControllerType.Unplugged)
{
yield return StandardController(i + 1);
}

View File

@ -5,13 +5,14 @@ MAME_PATH = $(ROOT_DIR)/ares/thirdparty/mame
CXXFLAGS := -std=c++17 -msse4.2 \
-I../libco -I.$(ROOT_DIR)/ares/ -I.$(ROOT_DIR)/ares/thirdparty/ -I.$(ARES_PATH) \
-Werror=int-to-pointer-cast -Wno-unused-but-set-variable \
-Werror=int-to-pointer-cast -Wno-unused-but-set-variable -Wno-delete-non-virtual-dtor \
-Wno-parentheses -Wno-reorder -Wno-unused-variable \
-Wno-sign-compare -Wno-switch -Wno-unused-local-typedefs \
-fno-strict-aliasing -fwrapv -fno-operator-names \
-I.$(MAME_PATH)/devices -I.$(MAME_PATH)/emu \
-I.$(MAME_PATH)/lib/util -I.$(MAME_PATH)/mame \
-I.$(MAME_PATH)/osd -DMAME_RDP -DLSB_FIRST -DPTR64 -DSDLMAME_EMSCRIPTEN
-I.$(MAME_PATH)/osd -DMAME_RDP -DLSB_FIRST -DPTR64 -DSDLMAME_EMSCRIPTEN \
-DWATERBOXED
TARGET = ares64.wbx

View File

@ -1,7 +1,16 @@
#include <n64/n64.hpp>
#if WATERBOXED
#include <emulibc.h>
#include <waterboxcore.h>
#endif
#include <vector>
#ifndef WATERBOXED
#define ECL_EXPORT __attribute__((visibility("default")))
#include "../emulibc/waterboxcore.h"
#endif
#define EXPORT extern "C" ECL_EXPORT
@ -38,7 +47,7 @@ struct BizPlatform : ares::Platform
auto video(ares::Node::Video::Screen, const u32*, u32, u32, u32) -> void override;
auto input(ares::Node::Input::Input) -> void override;
ares::VFS::Pak bizpak = new vfs::directory;
ares::VFS::Pak bizpak = nullptr;
ares::Node::Audio::Stream stream = nullptr;
u32* videobuf = nullptr;
u32 pitch = 0;
@ -84,16 +93,19 @@ auto BizPlatform::input(ares::Node::Input::Input node) -> void
}
};
static ares::Node::System root;
static BizPlatform platform;
static ares::Node::System root = nullptr;
static BizPlatform* platform = nullptr;
static array_view<u8>* pifData = nullptr;
static array_view<u8>* romData = nullptr;
static array_view<u8>* saveData = nullptr;
static inline void HackeryDoo()
{
root->run();
root->run();
platform.newframe = false;
platform->newframe = false;
f64 buf[2];
while (platform.stream->pending()) platform.stream->read(buf);
while (platform->stream->pending()) platform->stream->read(buf);
}
typedef enum
@ -311,46 +323,71 @@ static inline SaveType DetectSaveType(u8* rom)
namespace ares::Nintendo64 { extern bool RestrictAnalogRange; }
EXPORT bool Init(ControllerType* controllers, bool restrictAnalogRange, bool pal)
bool Inited = false;
typedef struct
{
FILE* f;
array_view<u8>* data;
u8* PifData;
u32 PifLen;
u8* RomData;
u32 RomLen;
#ifndef WATERBOXED
u32 VulkanUpscale;
#endif
} LoadData;
typedef enum
{
RESTRICT_ANALOG_RANGE = 1 << 0,
IS_PAL = 1 << 1,
#ifndef WATERBOXED
USE_VULKAN = 1 << 2,
SUPER_SAMPLE = 1 << 3,
#endif
} LoadFlags;
EXPORT void Deinit();
EXPORT bool Init(LoadData* loadData, ControllerType* controllers, LoadFlags loadFlags)
{
if (Inited) Deinit();
platform = new BizPlatform;
platform->bizpak = new vfs::directory;
u8* data;
u32 len;
string name;
bool pal = loadFlags & IS_PAL;
name = pal ? "pif.pal.rom" : "pif.ntsc.rom";
f = fopen(name, "rb");
fseek(f, 0, SEEK_END);
len = ftell(f);
data = new array_view<u8>(new u8[len], len);
fseek(f, 0, SEEK_SET);
fread((void*)data->data(), 1, len, f);
fclose(f);
platform.bizpak->append(name, *data);
len = loadData->PifLen;
data = new u8[len];
memcpy(data, loadData->PifData, len);
pifData = new array_view<u8>(data, len);
platform->bizpak->append(name, *pifData);
name = "program.rom";
f = fopen(name, "rb");
fseek(f, 0, SEEK_END);
len = ftell(f);
data = new array_view<u8>(new u8[len], len);
fseek(f, 0, SEEK_SET);
fread((void*)data->data(), 1, len, f);
fclose(f);
platform.bizpak->append(name, *data);
len = loadData->RomLen;
data = new u8[len];
memcpy(data, loadData->RomData, len);
romData = new array_view<u8>(data, len);
platform->bizpak->append(name, *romData);
string region = pal ? "PAL" : "NTSC";
platform.bizpak->setAttribute("region", region);
platform->bizpak->setAttribute("region", region);
string cic = pal ? "CIC-NUS-7101" : "CIC-NUS-6102";
u32 crc32 = Hash::CRC32({&((u8*)data->data())[0x40], 0x9C0}).value();
u32 crc32 = Hash::CRC32({&data[0x40], 0x9C0}).value();
if (crc32 == 0x1DEB51A9) cic = pal ? "CIC-NUS-7102" : "CIC-NUS-6101";
if (crc32 == 0xC08E5BD6) cic = pal ? "CIC-NUS-7101" : "CIC-NUS-6102";
if (crc32 == 0x03B8376A) cic = pal ? "CIC-NUS-7103" : "CIC-NUS-6103";
if (crc32 == 0xCF7F41DC) cic = pal ? "CIC-NUS-7105" : "CIC-NUS-6105";
if (crc32 == 0xD1059C6A) cic = pal ? "CIC-NUS-7106" : "CIC-NUS-6106";
platform.bizpak->setAttribute("cic", cic);
platform->bizpak->setAttribute("cic", cic);
SaveType save = DetectSaveType((u8*)data->data());
SaveType save = DetectSaveType(data);
if (save != NONE)
{
switch (save)
@ -360,17 +397,25 @@ EXPORT bool Init(ControllerType* controllers, bool restrictAnalogRange, bool pal
case SRAM32KB: len = 32 * 1024; name = "save.ram"; break;
case SRAM96KB: len = 96 * 1024; name = "save.ram"; break;
case FLASH128KB: len = 128 * 1024; name = "save.flash"; break;
default: return false;
default: Deinit(); return false;
}
data = new array_view<u8>(new u8[len], len);
memset((void*)data->data(), 0xFF, len);
platform.bizpak->append(name, *data);
data = new u8[len];
memset(data, 0xFF, len);
saveData = new array_view<u8>(data, len);
platform->bizpak->append(name, *saveData);
}
ares::platform = &platform;
ares::platform = platform;
#ifndef WATERBOXED
ares::Nintendo64::option("Enable Vulkan", !!(loadFlags & USE_VULKAN));
ares::Nintendo64::option("Quality", loadData->VulkanUpscale == 1 ? "SD" : (loadData->VulkanUpscale == 2 ? "HD" : "UHD"));
ares::Nintendo64::option("Supersampling", !!(loadFlags & SUPER_SAMPLE));
#endif
if (!ares::Nintendo64::load(root, {"[Nintendo] Nintendo 64 (", region, ")"}))
{
Deinit();
return false;
}
@ -381,6 +426,7 @@ EXPORT bool Init(ControllerType* controllers, bool restrictAnalogRange, bool pal
}
else
{
Deinit();
return false;
}
@ -393,7 +439,6 @@ EXPORT bool Init(ControllerType* controllers, bool restrictAnalogRange, bool pal
auto peripheral = port->allocate("Gamepad");
port->connect();
string name;
switch (controllers[i])
{
case Mempak: name = "Controller Pak"; break;
@ -408,22 +453,51 @@ EXPORT bool Init(ControllerType* controllers, bool restrictAnalogRange, bool pal
}
else
{
Deinit();
return false;
}
}
else
{
Deinit();
return false;
}
}
ares::Nintendo64::RestrictAnalogRange = restrictAnalogRange;
ares::Nintendo64::RestrictAnalogRange = loadFlags & RESTRICT_ANALOG_RANGE;
root->power(false);
HackeryDoo();
Inited = true;
return true;
}
// Tear down the running core and free every buffer that was handed to the
// VFS pak. Each pointer is reset to null after being freed so Deinit is safe
// to call more than once (Init calls it up front when re-initializing, and
// the frontend may also call it on shutdown); the previous version left the
// freed pointers dangling, which would double-free on a second call.
EXPORT void Deinit()
{
	if (root) root->unload();
	root = nullptr;
	if (platform)
	{
		if (platform->bizpak) platform->bizpak.reset();
		delete platform;
		platform = nullptr;
	}
	if (pifData)
	{
		// The array_view does not own its backing storage; free both.
		delete[] (u8*)pifData->data();
		delete pifData;
		pifData = nullptr;
	}
	if (romData)
	{
		delete[] (u8*)romData->data();
		delete romData;
		romData = nullptr;
	}
	if (saveData)
	{
		delete[] (u8*)saveData->data();
		delete saveData;
		saveData = nullptr;
	}
	Inited = false;
}
EXPORT bool GetRumbleStatus(u32 num)
{
ares::Nintendo64::Gamepad* c = nullptr;
@ -437,6 +511,23 @@ EXPORT bool GetRumbleStatus(u32 num)
return c ? c->motor->enable() : false;
}
// Number of bytes needed for a savestate, computed by performing a full
// serialization and discarding the result. NOTE(review): if emulation advances
// between this call and a later Serialize(), the sizes could differ — the
// managed caller is expected to keep the two calls adjacent.
EXPORT u32 SerializeSize()
{
return root->serialize(false).size();
}
// Write a savestate into buf. The caller must supply a buffer of at least
// SerializeSize() bytes; no length is passed, so an undersized buffer would
// be overrun.
EXPORT void Serialize(u8* buf)
{
auto s = root->serialize(false);
memcpy(buf, s.data(), s.size());
}
// Restore a savestate of sz bytes from buf; returns false if the core
// rejects it.
EXPORT bool Unserialize(u8* buf, u32 sz)
{
serializer s(buf, sz);
return root->unserialize(s);
}
#define MAYBE_ADD_MEMORY_DOMAIN(mem, name, flags) do { \
if (ares::Nintendo64::mem.data) \
{ \
@ -544,39 +635,39 @@ EXPORT void FrameAdvance(MyFrameInfo* f)
UPDATE_CONTROLLER(3);
UPDATE_CONTROLLER(4);
platform.lagged = true;
platform->lagged = true;
root->run();
f->Width = platform.width;
f->Height = platform.height;
if (platform.newframe)
f->Width = platform->width;
f->Height = platform->height;
if (platform->newframe)
{
u32* src = platform.videobuf;
u32* src = platform->videobuf;
u32* dst = f->VideoBuffer;
for (int i = 0; i < f->Height; i++)
{
memcpy(dst, src, f->Width * 4);
dst += f->Width;
src += platform.pitch;
src += platform->pitch;
}
platform.newframe = false;
platform->newframe = false;
}
s16* soundbuf = f->SoundBuffer;
while (platform.stream->pending())
while (platform->stream->pending())
{
f64 buf[2];
platform.stream->read(buf);
platform->stream->read(buf);
*soundbuf++ = (s16)std::clamp(buf[0] * 32768, -32768.0, 32767.0);
*soundbuf++ = (s16)std::clamp(buf[1] * 32768, -32768.0, 32767.0);
f->Samples++;
}
f->Lagged = platform.lagged;
f->Lagged = platform->lagged;
}
EXPORT void SetInputCallback(void (*callback)())
{
platform.inputcb = callback;
}
platform->inputcb = callback;
}

View File

@ -0,0 +1,160 @@
ARES_PATH = $(ROOT_DIR)/ares/ares
MAME_PATH = $(ROOT_DIR)/ares/thirdparty/mame
SLJIT_PATH = $(ROOT_DIR)/ares/thirdparty/sljit
CCFLAGS := -std=c99 -Wall -Wno-format -Wno-parentheses
CXXFLAGS := -std=c++17 -msse4.2 -O3 -flto -fvisibility=internal \
-I../libco -I.$(ROOT_DIR)/ares/ -I.$(ROOT_DIR)/ares/thirdparty/ -I.$(ARES_PATH) \
-Werror=int-to-pointer-cast -Wno-unused-but-set-variable \
-Wno-parentheses -Wno-reorder -Wno-unused-variable \
-Wno-sign-compare -Wno-switch -Wno-unused-local-typedefs \
-fno-strict-aliasing -fwrapv -fno-operator-names \
-I.$(MAME_PATH)/devices -I.$(MAME_PATH)/emu \
-I.$(MAME_PATH)/lib/util -I.$(MAME_PATH)/mame \
-I.$(MAME_PATH)/osd -DMAME_RDP -DLSB_FIRST -DPTR64 -DSLJIT_HAVE_CONFIG_PRE=1 -DSLJIT_HAVE_CONFIG_POST=1 -fPIC
LDFLAGS := -shared
ifeq ($(OS),Windows_NT)
CCFLAGS += -DVK_USE_PLATFORM_WIN32_KHR
CXXFLAGS += -DVK_USE_PLATFORM_WIN32_KHR -DOSD_WINDOWS=1
TARGET = libares64.dll
else
CXXFLAGS += -DSDLMAME_LINUX
TARGET = libares64.so
endif
SRCS_LIBCO = \
$(ROOT_DIR)/ares/libco/libco.c
SRCS_PROCESSORS = \
$(ARES_PATH)/component/processor/sm5k/sm5k.cpp
SRCS_ARES = \
$(ARES_PATH)/ares/ares.cpp \
$(ARES_PATH)/ares/memory/fixed-allocator.cpp
SRCS_N64 = \
$(ARES_PATH)/n64/memory/memory.cpp \
$(ARES_PATH)/n64/system/system.cpp \
$(ARES_PATH)/n64/cartridge/cartridge.cpp \
$(ARES_PATH)/n64/controller/controller.cpp \
$(ARES_PATH)/n64/dd/dd.cpp \
$(ARES_PATH)/n64/sp/sp.cpp \
$(ARES_PATH)/n64/dp/dp.cpp \
$(ARES_PATH)/n64/mi/mi.cpp \
$(ARES_PATH)/n64/vi/vi.cpp \
$(ARES_PATH)/n64/ai/ai.cpp \
$(ARES_PATH)/n64/pi/pi.cpp \
$(ARES_PATH)/n64/ri/ri.cpp \
$(ARES_PATH)/n64/si/si.cpp \
$(ARES_PATH)/n64/rdram/rdram.cpp \
$(ARES_PATH)/n64/cpu/cpu.cpp \
$(ARES_PATH)/n64/rdp/rdp.cpp \
$(ARES_PATH)/n64/rsp/rsp.cpp \
$(ARES_PATH)/n64/vulkan/vulkan.cpp
PARALLEL_RDP_IMPLEMENTATION = $(ARES_PATH)/n64/vulkan/parallel-rdp
SRCS_PARALLEL_RDP = \
$(wildcard $(PARALLEL_RDP_IMPLEMENTATION)/parallel-rdp/*.cpp) \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/buffer.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/buffer_pool.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/command_buffer.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/command_pool.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/context.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/cookie.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/descriptor_set.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/device.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/event_manager.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/fence.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/fence_manager.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/image.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/memory_allocator.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/pipeline_event.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/query_pool.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/render_pass.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/sampler.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/semaphore.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/semaphore_manager.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/shader.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/texture_format.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/logging.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/thread_id.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/aligned_alloc.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/timer.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/timeline_trace_file.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/thread_name.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/volk/volk.c
PARALLEL_RDP_INCLUDE_DIRS = \
-I.$(PARALLEL_RDP_IMPLEMENTATION)/parallel-rdp \
-I.$(PARALLEL_RDP_IMPLEMENTATION)/volk \
-I.$(PARALLEL_RDP_IMPLEMENTATION)/vulkan \
-I.$(PARALLEL_RDP_IMPLEMENTATION)/vulkan-headers/include \
-I.$(PARALLEL_RDP_IMPLEMENTATION)/util
CXXFLAGS += $(PARALLEL_RDP_INCLUDE_DIRS) -DVULKAN -DGRANITE_VULKAN_MT
CCFLAGS += $(PARALLEL_RDP_INCLUDE_DIRS)
SRCS_MAME = \
$(MAME_PATH)/emu/emucore.cpp \
$(MAME_PATH)/lib/util/delegate.cpp \
$(MAME_PATH)/lib/util/strformat.cpp \
$(MAME_PATH)/mame/video/n64.cpp \
$(MAME_PATH)/mame/video/pin64.cpp \
$(MAME_PATH)/mame/video/rdpblend.cpp \
$(MAME_PATH)/mame/video/rdptpipe.cpp \
$(MAME_PATH)/osd/osdcore.cpp \
$(MAME_PATH)/osd/osdsync.cpp
SRCS_SLJIT = \
$(SLJIT_PATH)/../sljitAllocator.cpp \
$(SLJIT_PATH)/sljit_src/sljitLir.c
SRCS = $(SRCS_LIBCO) $(SRCS_PROCESSORS) $(SRCS_ARES) $(SRCS_N64) $(SRCS_PARALLEL_RDP) $(SRCS_MAME) $(SRCS_SLJIT) BizInterface.cpp
ROOT_DIR := $(shell dirname $(realpath Performance.mak))
OUTPUTDLL_DIR := $(realpath $(ROOT_DIR)/../../Assets/dll)
OUTPUTDLLCOPY_DIR := $(realpath $(ROOT_DIR)/../../output/dll)
OUT_DIR := $(ROOT_DIR)/obj
OBJ_DIR := $(OUT_DIR)/release_performance
CC := gcc
CXX := g++
_OBJS := $(addsuffix .o,$(realpath $(SRCS)))
OBJS := $(patsubst $(ROOT_DIR)%,$(OBJ_DIR)%,$(_OBJS))
# Pattern rule: compile a C source into its mirrored object path.
# $(PER_FILE_FLAGS_$<) lets callers add extra flags for individual source files.
$(OBJ_DIR)/%.c.o: %.c
@echo cc $<
@mkdir -p $(@D)
@$(CC) -c -o $@ $< $(CCFLAGS) $(PER_FILE_FLAGS_$<)
# Pattern rule: compile a C++ source into its mirrored object path.
$(OBJ_DIR)/%.cpp.o: %.cpp
@echo cxx $<
@mkdir -p $(@D)
@$(CXX) -c -o $@ $< $(CXXFLAGS) $(PER_FILE_FLAGS_$<)
# Running plain `make` builds and installs the library.
.DEFAULT_GOAL := install
.PHONY: release install
TARGET_RELEASE := $(OBJ_DIR)/$(TARGET)
release: $(TARGET_RELEASE)
# Link step: $(CXX) drives the link so the C++ runtime is pulled in.
$(TARGET_RELEASE): $(OBJS)
@echo ld $@
@$(CXX) -o $@ $(LDFLAGS) $(CCFLAGS) $(CXXFLAGS) $(OBJS)
# Copy the built library into both install locations.
install: $(TARGET_RELEASE)
@cp -f $(TARGET_RELEASE) $(OUTPUTDLL_DIR)/$(TARGET)
@cp -f $(TARGET_RELEASE) $(OUTPUTDLLCOPY_DIR)/$(TARGET)
@echo Release build of $(TARGET) installed.
.PHONY: clean
clean:
rm -rf $(OUT_DIR)
# Pull in compiler-generated dependency files (foo.cpp.o -> foo.cpp.d).
# NOTE(review): assumes -MMD (or similar) is present in CCFLAGS/CXXFLAGS,
# which are defined elsewhere -- confirm.
-include $(OBJS:%o=%d)

View File

@ -1,8 +1,12 @@
#pragma once
#ifdef WATERBOXED
#include <emulibc.h>
#include <libco.h>
#else
#include <libco/libco.h>
#endif
#include <sljit.h>
#include <nall/platform.hpp>
@ -57,7 +61,11 @@ namespace ares {
}
namespace Video {
#ifdef WATERBOXED
static constexpr bool Threaded = false;
#else
static constexpr bool Threaded = true;
#endif
}
namespace Constants {

View File

@ -3,10 +3,17 @@ Screen::Screen(string name, u32 width, u32 height) : Video(name) {
_canvasHeight = height;
if(width && height) {
#ifdef WATERBOXED
_inputA = alloc_invisible<u32>(width * height);
_inputB = alloc_invisible<u32>(width * height);
_output = alloc_invisible<u32>(width * height);
_rotate = alloc_invisible<u32>(width * height);
#else
_inputA = new u32[width * height]();
_inputB = new u32[width * height]();
_output = new u32[width * height]();
_rotate = new u32[width * height]();
#endif
if constexpr(ares::Video::Threaded) {
_thread = nall::thread::create({&Screen::main, this});

View File

@ -16,6 +16,7 @@ struct Platform {
virtual auto pak(Node::Object) -> shared_pointer<vfs::directory> { return {}; }
virtual auto event(Event) -> void {}
virtual auto log(string_view message) -> void {}
virtual auto status(string_view message) -> void {}
virtual auto video(Node::Video::Screen, const u32* data, u32 pitch, u32 width, u32 height) -> void {}
virtual auto audio(Node::Audio::Stream) -> void {}
virtual auto input(Node::Input::Input) -> void {}

View File

@ -1,6 +1,10 @@
struct Accuracy {
//enable all accuracy flags
#ifdef WATERBOXED
static constexpr bool Reference = 1;
#else
static constexpr bool Reference = 0;
#endif
struct CPU {
static constexpr bool Interpreter = 0 | Reference;

View File

@ -33,10 +33,12 @@ Gamepad::~Gamepad() {
}
// Controller Pak persistence is intentionally stubbed out; the original body is
// kept below, commented, for reference. NOTE(review): presumably the host
// frontend manages Controller Pak save data instead -- confirm.
auto Gamepad::save() -> void {
/*
if(!slot) return;
if(slot->name() == "Controller Pak") {
ram.save(pak->write("save.pak"));
}
*/
}
auto Gamepad::allocate(string name) -> Node::Peripheral {

View File

@ -99,6 +99,7 @@ auto System::unload() -> void {
vulkan.unload();
#endif
cartridgeSlot.unload();
puts("unloading port 1");
controllerPort1.unload();
controllerPort2.unload();
controllerPort3.unload();
@ -119,12 +120,14 @@ auto System::unload() -> void {
}
// System-wide save is intentionally stubbed out; the original body (cartridge
// plus all four controller ports) is kept below, commented, for reference.
// NOTE(review): presumably save-data persistence is handled by the host
// frontend in this integration -- confirm.
auto System::save() -> void {
/*
if(!node) return;
cartridge.save();
controllerPort1.save();
controllerPort2.save();
controllerPort3.save();
controllerPort4.save();
*/
}
auto System::power(bool reset) -> void {

View File

@ -100,7 +100,7 @@ auto VI::writeWord(u32 address, u32 data_) -> void {
n32 data = data_;
#if defined(VULKAN)
vulkan.writeWord(address, data);
if (vulkan.enable) vulkan.writeWord(address, data);
#endif
if(address == 0) {

View File

@ -10,11 +10,16 @@ VI vi;
auto VI::load(Node::Object parent) -> void {
node = parent->append<Node::Object>("VI");
u32 width = 640;
u32 height = 576;
#if defined(VULKAN)
screen = node->append<Node::Video::Screen>("Screen", vulkan.outputUpscale * 640, vulkan.outputUpscale * 576);
#else
screen = node->append<Node::Video::Screen>("Screen", 640, 576);
if (vulkan.enable) {
width *= vulkan.outputUpscale;
height *= vulkan.outputUpscale;
}
#endif
screen = node->append<Node::Video::Screen>("Screen", width, height);
screen->setRefresh({&VI::refresh, this});
screen->colors((1 << 24) + (1 << 15), [&](n32 color) -> n64 {
if(color < (1 << 24)) {
@ -31,10 +36,15 @@ auto VI::load(Node::Object parent) -> void {
return a << 48 | r << 32 | g << 16 | b << 0;
}
});
#if defined(VULKAN)
screen->setSize(vulkan.outputUpscale * 640, vulkan.outputUpscale * 480);
if(!vulkan.supersampleScanout) {
screen->setScale(1.0 / vulkan.outputUpscale, 1.0 / vulkan.outputUpscale);
if(vulkan.enable) {
screen->setSize(vulkan.outputUpscale * 640, vulkan.outputUpscale * 480);
if(!vulkan.supersampleScanout) {
screen->setScale(1.0 / vulkan.outputUpscale, 1.0 / vulkan.outputUpscale);
}
} else {
screen->setSize(640, 480);
}
#else
screen->setSize(640, 480);
@ -62,8 +72,10 @@ auto VI::main() -> void {
io.field = io.field + 1 & io.serrate;
if(!io.field) {
#if defined(VULKAN)
gpuOutputValid = vulkan.scanoutAsync(io.field);
vulkan.frame();
if (vulkan.enable) {
gpuOutputValid = vulkan.scanoutAsync(io.field);
vulkan.frame();
}
#endif
refreshed = true;
@ -81,7 +93,7 @@ auto VI::step(u32 clocks) -> void {
auto VI::refresh() -> void {
#if defined(VULKAN)
if(gpuOutputValid) {
if(vulkan.enable && gpuOutputValid) {
const u8* rgba = nullptr;
u32 width = 0, height = 0;
vulkan.mapScanoutRead(rgba, width, height);

View File

@ -0,0 +1 @@
31ea5eb2d6fcb2d8f1df5f0951364322d09ac01a

View File

@ -0,0 +1,20 @@
Copyright (c) 2020 Themaister
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@ -0,0 +1,265 @@
# paraLLEl-RDP
This project is a revival and complete rewrite of the old, defunct paraLLEl-RDP project.
The goal is to implement the Nintendo 64 RDP graphics chip as accurately as possible using Vulkan compute.
The implementation aims to be bitexact with the
[Angrylion-Plus](https://github.com/ata4/angrylion-rdp-plus) reference renderer where possible.
## Disclaimer
While paraLLEl-RDP uses [Angrylion-Plus](https://github.com/ata4/angrylion-rdp-plus)
as an implementation reference, it is not a port, and not a derived codebase of said project.
It is written from scratch by studying [Angrylion-Plus](https://github.com/ata4/angrylion-rdp-plus)
and trying to understand what is going on.
The test suite uses [Angrylion-Plus](https://github.com/ata4/angrylion-rdp-plus) as a reference
to validate implementation and cross-checking behavior.
## Use cases
- **Much** faster LLE RDP emulation of N64 compared to a CPU implementation
as parallel graphics workloads are offloaded to the GPU.
Emulation performance is now completely bound by CPU and LLE RSP performance.
Early benchmarking results suggest 2000 - 5000 VI/s being achieved on mid-range desktop GPUs based on timestamp data.
There is no way the CPU emulation can keep up with that, but that means this should
scale down to fairly gimped GPUs as well, assuming the driver requirements are met.
- A backend renderer for standalone engines which aim to efficiently reproduce faithful N64 graphics.
- Hopefully, an easier to understand implementation than the reference renderer.
- An esoteric use case of advanced Vulkan compute programming.
## Missing features
The implementation is quite complete, and compatibility is very high in the limited amount of content I've tested.
However, not every single feature is supported at this moment.
Ticking the last boxes depends mostly on real content making use of said features.
- Color combiner chroma keying
- Various "bugs" / questionable behavior that seems meaningless to emulate
- Certain extreme edge cases in TMEM upload. The implementation has tests for many "crazy" edge cases though.
- ... possibly other obscure features
The VI is essentially complete. A fancy deinterlacer might be useful to add since we have plenty of GPU cycles to spare in the graphics queue.
The VI filtering is always turned on if the game requests it, but individual features can be selectively turned off for pixel purists.
## Environment variables for development / testing
### `RDP_DEBUG` / `RDP_DEBUG_X` / `RDP_DEBUG_Y`
Supports printf in shaders, which is extremely useful to drill down difficult bugs.
Only printfs from certain pixels can be filtered through to avoid spam.
### `VI_DEBUG` / `VI_DEBUG_X` / `VI_DEBUG_Y`
Same as `RDP_DEBUG` but for the VI.
### `PARALLEL_RDP_MEASURE_SYNC_TIME`
Measures time stalled in `CommandProcessor::wait_for_timeline`. Useful to measure
CPU overhead in hard-synced emulator integrations.
### `PARALLEL_RDP_SMALL_TYPES=0`
Force-disables 8/16-bit arithmetic support. Useful when suspecting driver bugs.
### `PARALLEL_RDP_UBERSHADER=1`
Forces the use of ubershaders. Can be extremely slow depending on the shader compiler.
### `PARALLEL_RDP_FORCE_SYNC_SHADER=1`
Disables the async pipeline optimization, and blocks on every shader compilation.
Only use if the ubershader crashes, since this adds the dreaded shader compilation stalls.
### `PARALLEL_RDP_BENCH=1`
Measures RDP rendering time spent on GPU using Vulkan timestamps.
At end of a run, reports average time spent per render pass,
and how many render passes are flushed per frame.
### `PARALLEL_RDP_SUBGROUP=0`
Force-disables use of Vulkan subgroup operations,
which are used to optimize the tile binning algorithm.
### `PARALLEL_RDP_ALLOW_EXTERNAL_HOST=0`
Disables use of `VK_EXT_external_memory_host`. For testing.
## Vulkan driver requirements
paraLLEl-RDP requires up-to-date Vulkan implementations. A lot of the great improvements over the previous implementation
comes from the idea that we can implement N64's UMA by simply importing RDRAM directly as an SSBO and perform 8 and 16-bit
data access over the bus. With the tile based architecture in paraLLEl-RDP, this works very well and actual
PCI-e traffic is massively reduced. The bandwidth for doing this is also trivial. On iGPU systems, this also works really well, since
it's all the same memory anyways.
Thus, the requirements are as follows. All of these features are widely supported, or will soon be in drivers.
paraLLEl-RDP does not aim for compatibility with ancient hardware and drivers.
Just use the reference renderer for that. This is enthusiast software for a niche audience.
- Vulkan 1.1
- VK_KHR_8bit_storage / VK_KHR_16bit_storage
- Optionally VK_KHR_shader_float16_int8 which enables small integer arithmetic
- Optionally subgroup support with VK_EXT_subgroup_size_control
- For integration in emulators, VK_EXT_external_memory_host is currently required (may be relaxed later at some performance cost)
### Tested drivers
paraLLEl-RDP has been tested on Linux and Windows on all desktop vendors.
- Intel Mesa (20.0.6) - Passes conformance
- Intel Windows - Passes conformance (**CAVEAT**. Intel Windows requires 64 KiB alignment for host memory import, make sure to add some padding around RDRAM in an emulator to make this work well.)
- AMD RADV LLVM (20.0.6) - Passes conformance
- AMD RADV ACO - Passes conformance with bleeding edge drivers and `PARALLEL_RDP_SMALL_TYPES=0`.
- Linux AMDGPU-PRO - Passes conformance, with caveat that 8/16-bit arithmetic does not work correctly for some tests.
paraLLEl-RDP automatically disables small integer arithmetic for proprietary AMD driver.
- AMD Windows - Passes conformance with same caveat and workaround as AMDGPU-PRO.
- NVIDIA Linux - Passes conformance (**MAJOR CAVEAT**, NVIDIA Linux does not support VK_EXT_external_memory_host as of 2020-05-12.)
- NVIDIA Windows - Passes conformance
## Implementation strategy
This project uses Vulkan compute shaders to implement a fully programmable rasterization pipeline.
The overall rendering architecture is reused from [RetroWarp](https://github.com/Themaister/RetroWarp)
with some further refinements.
The lower level Vulkan backend comes from [Granite](https://github.com/Themaister/Granite).
### Asynchronous pipeline optimization
Toggleable paths in RDP state is expressed as specialization constants. The rendering thread will
detect new state combinations and kick off building pipelines which only specify exact state needed to render.
This is a massive performance optimization.
The same shaders are used for an "ubershader" fallback when pipelines are not ready.
In this case, specialization constants are simply not used.
The same SPIR-V modules are reused to great effect using this Vulkan feature.
### Tile-based rendering
See [RetroWarp](https://github.com/Themaister/RetroWarp) for more details.
### GPU-driven TMEM management
TMEM management is fully GPU-driven, but this is a very complicated implementation.
Certain combinations of formats are not supported, but such cases would produce
meaningless results, and it is unclear that applications can make meaningful use of these "weird" uploads.
### Synchronization
Synchronizing the GPU and CPU emulation is one of the hot button issues of N64 emulation.
The integration code is designed around a timeline of synchronization points which can be waited on by the CPU
when appropriate. For accurate emulation, an OpSyncFull is generally followed by a full wait,
but most games can be more relaxed and only synchronize with the CPU N frames later.
Implementation of this behavior is outside the scope of paraLLEl-RDP, and is left up to the integration code.
### Asynchronous compute
GPUs with a dedicated compute queue are recommended for optimal performance, since
RDP shading work can happen on the compute queue, and won't be blocked by graphics workloads happening
in the graphics queue, which will typically be VI scanout and frontend applying shaders on top.
## Project structure
This project implements several submodules which are quite useful.
### rdp-replayer
This app replays RDP dump files, which are produced by running content through an RDP dumper.
An implementation can be found in e.g. parallel-N64. The file format is very simple and essentially
contains a record of RDRAM changes and RDP command streams.
This dump is replayed and a live comparison between the reference renderer can be compared to paraLLEl-RDP
with visual output. The UI is extremely crude, and is not user-friendly, but good enough for my use.
### rdp-conformance
I made a somewhat comprehensive test suite for the RDP, with a custom higher level RDP command stream generator.
There are roughly ~150 fuzz tests which exercise many aspects of the RDP.
In order to pass the test, paraLLEl-RDP must produce bit-exact results compared to Angrylion,
so the test condition is as stringent as possible.
#### A note on bitexactness
There are a few cases where bit-exactness is a meaningless term, such as the noise feature of the RDP.
It is not particularly meaningful to exactly reproduce noise, since it is by its very nature unpredictable.
For that reason, this repo references a fork of the reference renderer which implements deterministic "undefined behavior"
where appropriate. The exact formulation of the noise generator is not very interesting as long as
correct entropy and output range is reproduced.
##### Intentional differences from reference renderer
Certain effects invoke "undefined behavior" in the RDP and requires cycle accuracy to resolve bit-accurately with real RDP.
Reference renderer attempts to emulate these effects, but to reproduce this behavior breaks any form of multi-threading.
To be able to validate dumps in a sensible way with buggy content, I modified the reference slightly to make certain
"undefined behavior" deterministic. This doesn't meaningfully change the rendered output in the cases I've seen in the wild.
Some of these effects would be possible to emulate,
but at the cost of lots of added complexity and it wouldn't be quite correct anyways given the cycle accuracy issue.
- CombinedColor/Alpha in first cycle is cleared to zero. Some games read this in first cycle,
and reference renderer will read whatever was generated last pixel.
This causes issues in some cases, where cycle accuracy would have caused the feedback to converge to zero over time.
- Reading LODFrac in 1 cycle mode. This is currently ignored. The results generated seem non-sensical. Never seen this in the wild.
- Using TexLOD in copy mode. This is currently ignored. The results generated seem non-sensical. Never seen this in the wild.
- Reading MemoryColor in first blender cycle in 2-cycle mode. Reference seems to wait until the second cycle before updating this value,
despite memory coverage being updated right away. The sensible thing to do is to allow reading memory color in first cycle.
- Alpha testing in 2-cycle mode reads combined alpha from next pixel in reference.
Just doing alpha testing in first cycle on current pixel is good enough.
If this is correct hardware behavior, I consider this a hardware bug.
- Reading Texel1 in cycle 1 of 2-cycle mode reads the Texel0 from next pixel.
In the few cases I've seen this, the rendered output is slightly buggy, but it's hardly visible in motion.
The workaround is just to read Texel0 from current pixel which still renders fine.
### vi-conformance
This is a conformance suite, except for the video interface (VI) unit.
### rdp-validate-dump
This tool replays an RDP dump headless and compares outputs between reference renderer and paraLLEl-RDP.
To pass, bitexact output must be generated.
## Build
Checkout submodules. This pulls in Angrylion-Plus as well as Granite.
```
git submodule update --init --recursive
```
Standard CMake build.
```
mkdir build
cd build
cmake ..
cmake --build . --parallel (--config Release on MSVC)
```
### Run test suite
You can run rdp-conformance and vi-conformance with ctest to verify if your driver is behaving correctly.
```
ctest (-C Release on MSVC)
```
### Embedding shaders in a C++ header
If embedding paraLLEl-RDP in an emulator project, it is helpful to pre-compile and bake SPIR-V shaders in a C++ header.
Build slangmosh from Granite, and then run:
```
slangmosh parallel-rdp/shaders/slangmosh.json --output slangmosh.hpp --vk11 --strip -O --namespace RDP
```
### Generating a standalone code base for emulator integration
Run the `generate_standalone_codebase.sh $OUTDIR` script with an output directory `$OUTDIR/` as argument to generate a standalone code base which can be built without any special build system support.
Include `$OUTDIR/config.mk` if building with Make to make your life easier.
Note that `slangmosh` must be in your path for this script to run. It executes the command above to build `slangmosh.hpp`.
## License
paraLLEl-RDP is licensed under the permissive license MIT. See included LICENSE file.
This implementation builds heavily on the knowledge (but not code) gained from studying the reference implementation,
thus it felt fair to release it under a permissive license, so my work could be reused more easily.

View File

@ -0,0 +1,53 @@
# For use in standalone implementations.
# Consumers must set $(PARALLEL_RDP_IMPLEMENTATION) to the standalone code base
# root before including this file, then append the variables below to their own
# flags/sources.
PARALLEL_RDP_CFLAGS :=
PARALLEL_RDP_CXXFLAGS := -DGRANITE_VULKAN_MT
# All C++ sources: the parallel-rdp renderer plus the subset of Granite's
# vulkan/ and util/ layers it depends on.
PARALLEL_RDP_SOURCES_CXX := \
$(wildcard $(PARALLEL_RDP_IMPLEMENTATION)/parallel-rdp/*.cpp) \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/buffer.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/buffer_pool.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/command_buffer.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/command_pool.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/context.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/cookie.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/descriptor_set.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/device.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/event_manager.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/fence.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/fence_manager.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/image.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/memory_allocator.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/pipeline_event.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/query_pool.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/render_pass.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/sampler.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/semaphore.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/semaphore_manager.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/shader.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/vulkan/texture_format.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/logging.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/thread_id.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/aligned_alloc.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/timer.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/timeline_trace_file.cpp \
$(PARALLEL_RDP_IMPLEMENTATION)/util/thread_name.cpp
# C sources: only the volk Vulkan loader.
PARALLEL_RDP_SOURCES_C := \
$(PARALLEL_RDP_IMPLEMENTATION)/volk/volk.c
PARALLEL_RDP_INCLUDE_DIRS := \
-I$(PARALLEL_RDP_IMPLEMENTATION)/parallel-rdp \
-I$(PARALLEL_RDP_IMPLEMENTATION)/volk \
-I$(PARALLEL_RDP_IMPLEMENTATION)/vulkan \
-I$(PARALLEL_RDP_IMPLEMENTATION)/vulkan-headers/include \
-I$(PARALLEL_RDP_IMPLEMENTATION)/util
PARALLEL_RDP_LDFLAGS := -pthread
# Non-Windows platforms need libdl for dynamic Vulkan loading; Windows builds
# get the Win32 WSI define and winmm instead.
ifeq (,$(findstring win,$(platform)))
PARALLEL_RDP_LDFLAGS += -ldl
else
PARALLEL_RDP_CFLAGS += -DVK_USE_PLATFORM_WIN32_KHR
PARALLEL_RDP_LDFLAGS += -lwinmm
endif

View File

@ -0,0 +1,135 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <chrono>
#include "command_ring.hpp"
#include "rdp_device.hpp"
#include "thread_id.hpp"
#include <assert.h>
namespace RDP
{
// (Re)initializes the ring and spawns the consumer thread.
// count is the ring capacity in words; it must be a power of two because ring
// indices are wrapped by masking with (count - 1).
void CommandRing::init(
#ifdef PARALLEL_RDP_SHADER_DIR
Granite::Global::GlobalManagersHandle global_handles_,
#endif
CommandProcessor *processor_, unsigned count)
{
assert((count & (count - 1)) == 0);
// Stop any previous consumer thread before touching shared state; after this
// call no other thread reads the counters, so plain assignment is safe.
teardown_thread();
processor = processor_;
ring.resize(count);
write_count = 0;
read_count = 0;
#ifdef PARALLEL_RDP_SHADER_DIR
global_handles = std::move(global_handles_);
#endif
// Start the consumer last; std::thread construction synchronizes-with the
// start of thread_loop, so the initialization above is visible to it.
thr = std::thread(&CommandRing::thread_loop, this);
}
// Shuts down the consumer thread if it is running. A zero-length command
// (num_words == 0, words == nullptr) acts as the shutdown sentinel:
// thread_loop() breaks out of its loop when it dequeues an empty payload.
void CommandRing::teardown_thread()
{
if (thr.joinable())
{
enqueue_command(0, nullptr);
thr.join();
}
}
// Ensures the consumer thread is joined before members (mutex, ring) are destroyed.
CommandRing::~CommandRing()
{
teardown_thread();
}
// Blocks the caller until the consumer thread has handed every word enqueued
// so far to the CommandProcessor (i.e. completed_count has caught up with
// write_count). Both counters are protected by `lock`.
void CommandRing::drain()
{
	std::unique_lock<std::mutex> guard{lock};
	// Explicit re-check loop: equivalent to the predicate overload of
	// condition_variable::wait, and robust against spurious wakeups.
	while (completed_count != write_count)
		cond.wait(guard);
}
// Publishes one packet: a length word followed by num_words payload words.
// Blocks while the ring lacks room for (num_words + 1) entries.
void CommandRing::enqueue_command(unsigned num_words, const uint32_t *words)
{
std::unique_lock<std::mutex> holder{lock};
// Wait until the packet fits inside one ring's worth of unconsumed space.
// Counters are monotonic 64-bit values, so the difference is the fill level.
cond.wait(holder, [this, num_words]() {
return write_count + num_words + 1 <= read_count + ring.size();
});
// ring.size() is a power of two (asserted in init), so masking wraps indices.
size_t mask = ring.size() - 1;
ring[write_count++ & mask] = num_words;
for (unsigned i = 0; i < num_words; i++)
ring[write_count++ & mask] = words[i];
cond.notify_one();
}
// Consumer loop: dequeues packets and forwards them to the CommandProcessor.
// If no command arrives within 500us, a MetaIdle command is synthesized so the
// renderer can kick pending work. A zero-length packet (enqueued by
// teardown_thread) terminates the loop.
void CommandRing::thread_loop()
{
Util::register_thread_index(0);
#ifdef PARALLEL_RDP_SHADER_DIR
// Here to let the RDP play nice with full Granite.
// When we move to standalone Granite, we won't need to interact with global subsystems like this.
Granite::Global::set_thread_context(*global_handles);
global_handles.reset();
#endif
std::vector<uint32_t> tmp_buffer;
tmp_buffer.reserve(64);
size_t mask = ring.size() - 1;
for (;;)
{
bool is_idle = false;
{
std::unique_lock<std::mutex> holder{lock};
// Timed predicate wait: true => a packet is available, copy it out under
// the lock; false (timeout) => fall through and fabricate an idle tick.
if (cond.wait_for(holder, std::chrono::microseconds(500), [this]() { return write_count > read_count; }))
{
uint32_t num_words = ring[read_count++ & mask];
tmp_buffer.resize(num_words);
for (uint32_t i = 0; i < num_words; i++)
tmp_buffer[i] = ring[read_count++ & mask];
}
else
{
// If we don't receive commands at a steady pace,
// notify rendering thread that we should probably kick some work.
tmp_buffer.resize(1);
tmp_buffer[0] = uint32_t(Op::MetaIdle) << 24;
is_idle = true;
}
}
// A zero-length packet is the shutdown sentinel from teardown_thread().
if (tmp_buffer.empty())
break;
// Forward outside the lock so producers are not stalled by processing.
processor->enqueue_command_direct(tmp_buffer.size(), tmp_buffer.data());
if (!is_idle)
{
// Only real packets advance completed_count and wake drain();
// synthesized idle ticks consumed nothing from the ring.
std::lock_guard<std::mutex> holder{lock};
completed_count = read_count;
cond.notify_one();
}
}
}
}

View File

@ -0,0 +1,67 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <thread>
#include <mutex>
#include <condition_variable>
#include <vector>
#ifdef PARALLEL_RDP_SHADER_DIR
#include "global_managers.hpp"
#endif
namespace RDP
{
class CommandProcessor;
// Single-producer ring of raw RDP command words. A producer thread calls
// enqueue_command(); an internal worker thread (started by init()) dequeues
// packets and forwards them to the CommandProcessor.
class CommandRing
{
public:
// Spawns the worker thread. count is the ring capacity in 32-bit words and
// must be a power of two (asserted in the implementation).
void init(
#ifdef PARALLEL_RDP_SHADER_DIR
Granite::Global::GlobalManagersHandle global_handles,
#endif
CommandProcessor *processor, unsigned count);
// Joins the worker thread before members are destroyed.
~CommandRing();
// Blocks until every enqueued command has been handed to the processor.
void drain();
// Copies num_words words into the ring; blocks while the ring is full.
void enqueue_command(unsigned num_words, const uint32_t *words);
private:
CommandProcessor *processor = nullptr;
std::thread thr;
std::mutex lock;
std::condition_variable cond;
std::vector<uint32_t> ring;
// Monotonic word counters; ring indices are these values masked by size - 1.
uint64_t write_count = 0;
uint64_t read_count = 0;
uint64_t completed_count = 0;
void thread_loop();
void teardown_thread();
#ifdef PARALLEL_RDP_SHADER_DIR
Granite::Global::GlobalManagersHandle global_handles;
#endif
};
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,402 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
// Forward declarations so this common header does not pull in the Vulkan backend.
namespace Vulkan
{
class Program;
class Shader;
}
namespace RDP
{
// Shaders<> is defined elsewhere; ShaderBank is its concrete instantiation
// over raw Vulkan program/shader pointers.
template <typename Program, typename Shader> struct Shaders;
using ShaderBank = Shaders<Vulkan::Program *, Vulkan::Shader *>;
// list of command IDs
// NOTE(review): Nop/Meta* (values 0..4) look like paraLLEl-RDP-internal
// control commands (MetaIdle is synthesized by CommandRing); values 0x08 and
// up appear to mirror hardware RDP display-list opcodes -- confirm against
// RDP documentation.
enum class Op
{
Nop = 0,
MetaSignalTimeline = 1,
MetaFlush = 2,
MetaIdle = 3,
MetaSetQuirks = 4,
FillTriangle = 0x08,
FillZBufferTriangle = 0x09,
TextureTriangle = 0x0a,
TextureZBufferTriangle = 0x0b,
ShadeTriangle = 0x0c,
ShadeZBufferTriangle = 0x0d,
ShadeTextureTriangle = 0x0e,
ShadeTextureZBufferTriangle = 0x0f,
TextureRectangle = 0x24,
TextureRectangleFlip = 0x25,
SyncLoad = 0x26,
SyncPipe = 0x27,
SyncTile = 0x28,
SyncFull = 0x29,
SetKeyGB = 0x2a,
SetKeyR = 0x2b,
SetConvert = 0x2c,
SetScissor = 0x2d,
SetPrimDepth = 0x2e,
SetOtherModes = 0x2f,
LoadTLut = 0x30,
SetTileSize = 0x32,
LoadBlock = 0x33,
LoadTile = 0x34,
SetTile = 0x35,
FillRectangle = 0x36,
SetFillColor = 0x37,
SetFogColor = 0x38,
SetBlendColor = 0x39,
SetPrimColor = 0x3a,
SetEnvColor = 0x3b,
SetCombine = 0x3c,
SetTextureImage = 0x3d,
SetMaskImage = 0x3e,
SetColorImage = 0x3f
};
// Input selectors for the color-combiner stage (programmed via SetCombine).
// RGB multiply-slot input.
enum class RGBMul : uint8_t
{
Combined = 0,
Texel0 = 1,
Texel1 = 2,
Primitive = 3,
Shade = 4,
Env = 5,
KeyScale = 6,
CombinedAlpha = 7,
Texel0Alpha = 8,
Texel1Alpha = 9,
PrimitiveAlpha = 10,
ShadeAlpha = 11,
EnvAlpha = 12,
LODFrac = 13,
PrimLODFrac = 14,
ConvertK5 = 15,
Zero = 16
};
// RGB "add" operand of the multiply side.
enum class RGBMulAdd : uint8_t
{
Combined = 0,
Texel0 = 1,
Texel1 = 2,
Primitive = 3,
Shade = 4,
Env = 5,
One = 6,
Noise = 7,
Zero = 8
};
// RGB "subtract" operand of the multiply side.
enum class RGBMulSub : uint8_t
{
Combined = 0,
Texel0 = 1,
Texel1 = 2,
Primitive = 3,
Shade = 4,
Env = 5,
KeyCenter = 6,
ConvertK4 = 7,
Zero = 8
};
// RGB final "add" input.
enum class RGBAdd : uint8_t
{
Combined = 0,
Texel0 = 1,
Texel1 = 2,
Primitive = 3,
Shade = 4,
Env = 5,
One = 6,
Zero = 7
};
// Alpha-channel add/subtract input (shared by both slots).
enum class AlphaAddSub : uint8_t
{
CombinedAlpha = 0,
Texel0Alpha = 1,
Texel1Alpha = 2,
PrimitiveAlpha = 3,
ShadeAlpha = 4,
EnvAlpha = 5,
One = 6,
Zero = 7
};
// Alpha-channel multiply input for the combiner.
enum class AlphaMul : uint8_t
{
LODFrac = 0,
Texel0Alpha = 1,
Texel1Alpha = 2,
PrimitiveAlpha = 3,
ShadeAlpha = 4,
EnvAlpha = 5,
PrimLODFrac = 6,
Zero = 7
};
// Texel size in bits per pixel (4/8/16/32).
enum class TextureSize : uint8_t
{
Bpp4 = 0,
Bpp8 = 1,
Bpp16 = 2,
Bpp32 = 3
};
// Texel storage format (CI = color-indexed, IA = intensity+alpha, I = intensity).
enum class TextureFormat : uint8_t
{
RGBA = 0,
YUV = 1,
CI = 2,
IA = 3,
I = 4
};
// Dither-matrix selection for the RGB channels.
enum class RGBDitherMode : uint8_t
{
Magic = 0,
Bayer = 1,
Noise = 2,
Off = 3
};
// Dither selection for the alpha channel.
enum class AlphaDitherMode : uint8_t
{
Pattern = 0,
InvPattern = 1,
Noise = 2,
Off = 3
};
// Pipeline cycle type (1-cycle, 2-cycle, copy, fill) from SetOtherModes.
enum class CycleType : uint8_t
{
Cycle1 = 0,
Cycle2 = 1,
Copy = 2,
Fill = 3
};
// Blender input selectors (cycle 1 / cycle 2, operands A and B).
enum class BlendMode1A : uint8_t
{
PixelColor = 0,
MemoryColor = 1,
BlendColor = 2,
FogColor = 3
};
enum class BlendMode1B : uint8_t
{
PixelAlpha = 0,
FogAlpha = 1,
ShadeAlpha = 2,
Zero = 3
};
enum class BlendMode2A : uint8_t
{
PixelColor = 0,
MemoryColor = 1,
BlendColor = 2,
FogColor = 3
};
enum class BlendMode2B : uint8_t
{
InvPixelAlpha = 0,
MemoryAlpha = 1,
One = 2,
Zero = 3
};
// Coverage-accumulation mode for the framebuffer.
enum class CoverageMode : uint8_t
{
Clamp = 0,
Wrap = 1,
Zap = 2,
Save = 3
};
// Depth-compare mode.
enum class ZMode : uint8_t
{
Opaque = 0,
Interpenetrating = 1,
Transparent = 2,
Decal = 3
};
// Per-axis clamp/mirror flags for a tile; packed into TileInfoFlags.
enum TileInfoFlagBits
{
TILE_INFO_CLAMP_S_BIT = 1 << 0,
TILE_INFO_MIRROR_S_BIT = 1 << 1,
TILE_INFO_CLAMP_T_BIT = 1 << 2,
TILE_INFO_MIRROR_T_BIT = 1 << 3
};
using TileInfoFlags = uint8_t;
// S/T low/high coordinate bounds of a tile (SET_TILE_SIZE state).
struct TileSize
{
uint32_t slo = 0;
uint32_t shi = 0;
uint32_t tlo = 0;
uint32_t thi = 0;
};
// TMEM placement and sampling parameters of a tile (SET_TILE state).
struct TileMeta
{
uint32_t offset = 0;
uint32_t stride = 0;
TextureFormat fmt = TextureFormat::RGBA;
TextureSize size = TextureSize::Bpp16;
uint8_t palette = 0;
uint8_t mask_s = 0;
uint8_t shift_s = 0;
uint8_t mask_t = 0;
uint8_t shift_t = 0;
TileInfoFlags flags = 0;
};
// Full tile descriptor; sizeof is static_asserted to 32 bytes below, so
// do not reorder or repack the fields.
struct TileInfo
{
TileSize size;
TileMeta meta;
};
// The RGB combiner equation selectors: (muladd - mulsub) * mul + add.
struct CombinerInputsRGB
{
RGBMulAdd muladd;
RGBMulSub mulsub;
RGBMul mul;
RGBAdd add;
};
// The alpha combiner equation selectors, same (a - b) * c + d structure.
struct CombinerInputsAlpha
{
AlphaAddSub muladd;
AlphaAddSub mulsub;
AlphaMul mul;
AlphaAddSub add;
};
// One combiner cycle's worth of RGB + alpha selectors.
struct CombinerInputs
{
CombinerInputsRGB rgb;
CombinerInputsAlpha alpha;
};
// Blender mux selection for one blend cycle.
struct BlendModes
{
BlendMode1A blend_1a;
BlendMode1B blend_1b;
BlendMode2A blend_2a;
BlendMode2B blend_2b;
};
// TileInfo layout is fixed at 32 bytes (presumably consumed by shaders as
// raw data -- keep field order and packing intact).
static_assert(sizeof(TileInfo) == 32, "TileInfo must be 32 bytes.");
// Video Interface register indices, in hardware register order.
// Count is a sentinel for sizing arrays, not a real register.
enum class VIRegister
{
Control = 0,
Origin,
Width,
Intr,
VCurrentLine,
Timing,
VSync,
HSync,
Leap,
HStart,
VStart,
VBurst,
XScale,
YScale,
Count
};
// Bit layout of the VI Control register. TYPE (bits 1:0) and AA_MODE
// (bits 9:8) are multi-bit fields with _MASK values; the META_* bits sit
// above bit 16 and look implementation-internal rather than hardware bits
// (NOTE(review): confirm against VI handling code).
enum VIControlFlagBits
{
VI_CONTROL_TYPE_BLANK_BIT = 0 << 0,
VI_CONTROL_TYPE_RESERVED_BIT = 1 << 0,
VI_CONTROL_TYPE_RGBA5551_BIT = 2 << 0,
VI_CONTROL_TYPE_RGBA8888_BIT = 3 << 0,
VI_CONTROL_TYPE_MASK = 3 << 0,
VI_CONTROL_GAMMA_DITHER_ENABLE_BIT = 1 << 2,
VI_CONTROL_GAMMA_ENABLE_BIT = 1 << 3,
VI_CONTROL_DIVOT_ENABLE_BIT = 1 << 4,
VI_CONTROL_SERRATE_BIT = 1 << 6,
VI_CONTROL_AA_MODE_RESAMP_EXTRA_ALWAYS_BIT = 0 << 8,
VI_CONTROL_AA_MODE_RESAMP_EXTRA_BIT = 1 << 8,
VI_CONTROL_AA_MODE_RESAMP_ONLY_BIT = 2 << 8,
VI_CONTROL_AA_MODE_RESAMP_REPLICATE_BIT = 3 << 8,
VI_CONTROL_AA_MODE_MASK = 3 << 8,
VI_CONTROL_DITHER_FILTER_ENABLE_BIT = 1 << 16,
VI_CONTROL_META_AA_BIT = 1 << 17,
VI_CONTROL_META_SCALE_BIT = 1 << 18
};
// Full 32-bit VI Control register value built from VIControlFlagBits.
using VIControlFlags = uint32_t;
// Packs a start/end pair into a VI *_START-style register word:
// bits [25:16] hold the 10-bit start value, bits [9:0] the 10-bit end value.
static inline uint32_t make_vi_start_register(uint32_t start_value, uint32_t end_value)
{
	uint32_t hi = start_value & 0x3ffu;
	uint32_t lo = end_value & 0x3ffu;
	return (hi << 16) | lo;
}
// Packs a VI X/Y scale register word: bits [27:16] hold the 12-bit
// subpixel bias, bits [11:0] the 12-bit scale factor.
static inline uint32_t make_vi_scale_register(uint32_t scale_factor, uint32_t bias)
{
	uint32_t packed = bias & 0xfffu;
	packed <<= 16;
	packed |= scale_factor & 0xfffu;
	return packed;
}
// Fixed video timing parameters: total scanlines per frame, the first
// visible horizontal/vertical offsets, and visible vertical resolution,
// for NTSC and PAL respectively.
constexpr uint32_t VI_V_SYNC_NTSC = 525;
constexpr uint32_t VI_V_SYNC_PAL = 625;
constexpr uint32_t VI_H_OFFSET_NTSC = 108;
constexpr uint32_t VI_H_OFFSET_PAL = 128;
constexpr uint32_t VI_V_OFFSET_NTSC = 34;
constexpr uint32_t VI_V_OFFSET_PAL = 44;
constexpr uint32_t VI_V_RES_NTSC = 480;
constexpr uint32_t VI_V_RES_PAL = 576;
// Horizontal scanout width in VI pixels, shared by both standards.
constexpr int VI_SCANOUT_WIDTH = 640;
// Default NTSC V_START register: vertical offset 34, spanning 224 visible
// lines per field (doubled for the interlaced line count).
static inline uint32_t make_default_v_start()
{
	constexpr uint32_t visible_lines = 224 * 2;
	return make_vi_start_register(VI_V_OFFSET_NTSC, VI_V_OFFSET_NTSC + visible_lines);
}
// Default NTSC H_START register: horizontal offset 108 spanning the full
// 640-pixel scanout width.
static inline uint32_t make_default_h_start()
{
	constexpr uint32_t h_begin = VI_H_OFFSET_NTSC;
	return make_vi_start_register(h_begin, h_begin + VI_SCANOUT_WIDTH);
}
// Sign-extends the low 'bits' bits of v to a full int32_t.
// Assigning through a signed bit-field performs the truncation and sign
// extension in one step (implementation-defined, but consistent on the
// compilers this project targets).
template <int bits>
static int32_t sext(int32_t v)
{
	struct Narrow { int32_t field : bits; } tmp;
	tmp.field = v;
	return tmp.field;
}
}

View File

@ -0,0 +1,389 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "rdp_common.hpp"
namespace RDP
{
// Per-triangle flags carried in TriangleSetup::flags.
enum TriangleSetupFlagBits
{
TRIANGLE_SETUP_FLIP_BIT = 1 << 0,
TRIANGLE_SETUP_DO_OFFSET_BIT = 1 << 1,
TRIANGLE_SETUP_SKIP_XFRAC_BIT = 1 << 2,
TRIANGLE_SETUP_INTERLACE_FIELD_BIT = 1 << 3,
TRIANGLE_SETUP_INTERLACE_KEEP_ODD_BIT = 1 << 4,
TRIANGLE_SETUP_DISABLE_UPSCALING_BIT = 1 << 5,
TRIANGLE_SETUP_NATIVE_LOD_BIT = 1 << 6
};
// Bitmask of TriangleSetupFlagBits, packed into one byte.
using TriangleSetupFlags = uint8_t;
// Rasterization state bits that select shader behavior. Note that
// RASTERIZATION_UPSCALING_LOG2_BIT_OFFSET (26) is a bit *offset* for a
// multi-bit field, not a flag value like its neighbors.
enum StaticRasterizationFlagBits
{
RASTERIZATION_INTERLACE_FIELD_BIT = 1 << 0,
RASTERIZATION_INTERLACE_KEEP_ODD_BIT = 1 << 1,
RASTERIZATION_AA_BIT = 1 << 2,
RASTERIZATION_PERSPECTIVE_CORRECT_BIT = 1 << 3,
RASTERIZATION_TLUT_BIT = 1 << 4,
RASTERIZATION_TLUT_TYPE_BIT = 1 << 5,
RASTERIZATION_CVG_TIMES_ALPHA_BIT = 1 << 6,
RASTERIZATION_ALPHA_CVG_SELECT_BIT = 1 << 7,
RASTERIZATION_MULTI_CYCLE_BIT = 1 << 8,
RASTERIZATION_TEX_LOD_ENABLE_BIT = 1 << 9,
RASTERIZATION_SHARPEN_LOD_ENABLE_BIT = 1 << 10,
RASTERIZATION_DETAIL_LOD_ENABLE_BIT = 1 << 11,
RASTERIZATION_FILL_BIT = 1 << 12,
RASTERIZATION_COPY_BIT = 1 << 13,
RASTERIZATION_SAMPLE_MODE_BIT = 1 << 14,
RASTERIZATION_ALPHA_TEST_BIT = 1 << 15,
RASTERIZATION_ALPHA_TEST_DITHER_BIT = 1 << 16,
RASTERIZATION_SAMPLE_MID_TEXEL_BIT = 1 << 17,
RASTERIZATION_USES_TEXEL0_BIT = 1 << 18,
RASTERIZATION_USES_TEXEL1_BIT = 1 << 19,
RASTERIZATION_USES_LOD_BIT = 1 << 20,
RASTERIZATION_USES_PIPELINED_TEXEL1_BIT = 1 << 21,
RASTERIZATION_CONVERT_ONE_BIT = 1 << 22,
RASTERIZATION_BILERP_0_BIT = 1 << 23,
RASTERIZATION_BILERP_1_BIT = 1 << 24,
RASTERIZATION_UPSCALING_LOG2_BIT_OFFSET = 26,
RASTERIZATION_NEED_NOISE_BIT = 1 << 28,
RASTERIZATION_USE_STATIC_TEXTURE_SIZE_FORMAT_BIT = 1 << 29,
RASTERIZATION_USE_SPECIALIZATION_CONSTANT_BIT = 1 << 30
};
// Bitmask of StaticRasterizationFlagBits.
using StaticRasterizationFlags = uint32_t;
// Depth-test and blender state bits. Bit 2 is intentionally unassigned
// here (no flag uses 1 << 2).
enum DepthBlendFlagBits
{
DEPTH_BLEND_DEPTH_TEST_BIT = 1 << 0,
DEPTH_BLEND_DEPTH_UPDATE_BIT = 1 << 1,
DEPTH_BLEND_FORCE_BLEND_BIT = 1 << 3,
DEPTH_BLEND_IMAGE_READ_ENABLE_BIT = 1 << 4,
DEPTH_BLEND_COLOR_ON_COVERAGE_BIT = 1 << 5,
DEPTH_BLEND_MULTI_CYCLE_BIT = 1 << 6,
DEPTH_BLEND_AA_BIT = 1 << 7,
DEPTH_BLEND_DITHER_ENABLE_BIT = 1 << 8
};
// Bitmask of DepthBlendFlagBits.
using DepthBlendFlags = uint32_t;
// Edge-walker setup for one triangle: x intercepts and slopes per edge
// plus y extents. Size is static_asserted to a 16-byte multiple below,
// so field order and types must not change.
struct TriangleSetup
{
int32_t xh, xm, xl;
int16_t yh, ym;
int32_t dxhdy, dxmdy, dxldy;
int16_t yl;
TriangleSetupFlags flags;
uint8_t tile;
};
// Per-triangle attribute plane equations: base value plus d/dx, d/de and
// d/dy derivatives for color (r,g,b,a) and texture/depth (s,t,z,w).
struct AttributeSetup
{
int32_t r, g, b, a;
int32_t drdx, dgdx, dbdx, dadx;
int32_t drde, dgde, dbde, dade;
int32_t drdy, dgdy, dbdy, dady;
int32_t s, t, z, w;
int32_t dsdx, dtdx, dzdx, dwdx;
int32_t dsde, dtde, dzde, dwde;
int32_t dsdy, dtdy, dzdy, dwdy;
};
// RGBA constants feeding the four combiner input slots for one cycle.
struct ConstantCombinerInputs
{
uint8_t muladd[4];
uint8_t mulsub[4];
uint8_t mul[4];
uint8_t add[4];
};
// Per-primitive state which is very dynamic in nature and does not change anything about the shader itself.
struct DerivedSetup
{
ConstantCombinerInputs constants[2];
uint8_t fog_color[4];
uint8_t blend_color[4];
uint32_t fill_color;
uint16_t dz;
uint8_t dz_compressed;
uint8_t min_lod;
int16_t convert_factors[4];
};
// These layouts are consumed as raw data (size/alignment asserted), so
// do not reorder or repack fields.
static_assert((sizeof(TriangleSetup) & 15) == 0, "TriangleSetup must be aligned to 16 bytes.");
static_assert((sizeof(AttributeSetup) & 15) == 0, "AttributeSetup must be aligned to 16 bytes.");
static_assert(sizeof(DerivedSetup) == 56, "DerivedSetup is not 56 bytes.");
// Scissor rectangle in lo/hi coordinate pairs.
struct ScissorState
{
uint32_t xlo;
uint32_t ylo;
uint32_t xhi;
uint32_t yhi;
};
// Shader-selecting rasterization state: both combiner cycles plus mode
// flags. Fixed 32-byte layout (asserted below).
struct StaticRasterizationState
{
CombinerInputs combiner[2];
StaticRasterizationFlags flags;
uint32_t dither;
uint32_t texture_size;
uint32_t texture_fmt;
};
static_assert(sizeof(StaticRasterizationState) == 32, "StaticRasterizationState must be 32 bytes.");
// Depth-test and blender configuration for both blend cycles.
// Fixed 16-byte layout (asserted below).
struct DepthBlendState
{
BlendModes blend_cycles[2];
DepthBlendFlags flags;
CoverageMode coverage_mode;
ZMode z_mode;
uint8_t padding[2];
};
static_assert(sizeof(DepthBlendState) == 16, "DepthBlendState must be 16 bytes.");
// Per-primitive indices into the deduplicated state caches (static raster
// state, depth/blend state, TMEM instance and the eight tile descriptors).
struct InstanceIndices
{
uint8_t static_index;
uint8_t depth_blend_index;
uint8_t tile_instance_index;
uint8_t padding[5];
uint8_t tile_indices[8];
};
static_assert((sizeof(InstanceIndices) & 15) == 0, "InstanceIndices must be aligned to 16 bytes.");
// Parameters for one texture upload from RDRAM (vram_*) into TMEM
// (tmem_*); mode presumably maps to UploadMode (tile/TLUT/block) --
// NOTE(review): confirm against the TMEM upload shader.
struct UploadInfo
{
int32_t width, height;
float min_t_mod, max_t_mod;
int32_t vram_addr;
int32_t vram_width;
int32_t vram_size;
int32_t vram_effective_width;
int32_t tmem_offset;
int32_t tmem_stride_words;
int32_t tmem_size;
int32_t tmem_fmt;
int32_t mode;
float inv_tmem_stride_words;
int32_t dxt;
int32_t padding;
};
static_assert((sizeof(UploadInfo) & 15) == 0, "UploadInfo must be aligned to 16 bytes.");
// Interpolated attribute values plus x extents for one scanline span.
struct SpanSetup
{
int32_t r, g, b, a;
int32_t s, t, w, z;
int16_t xlo[4];
int16_t xhi[4];
int32_t interpolation_base_x;
int32_t start_x;
int32_t end_x;
int16_t lodlength;
uint16_t valid_line;
};
static_assert((sizeof(SpanSetup) & 15) == 0, "SpanSetup is not aligned to 16 bytes.");
// Where a primitive's spans live in the span buffer and which y range
// they cover.
struct SpanInfoOffsets
{
int32_t offset, ylo, yhi, padding;
};
static_assert((sizeof(SpanInfoOffsets) == 16), "SpanInfoOffsets is not 16 bytes.");
// One unit of span-interpolation work: a primitive and its y sub-range.
struct SpanInterpolationJob
{
uint16_t primitive_index, base_y, max_y, padding;
};
static_assert((sizeof(SpanInterpolationJob) == 8), "SpanInterpolationJob is not 8 bytes.");
// Per-render-pass globals: framebuffer address indices, dimensions and
// the active work-group mask.
struct GlobalState
{
uint32_t addr_index;
uint32_t depth_addr_index;
uint32_t fb_width, fb_height;
uint32_t group_mask;
};
// One tile of shading work: screen tile coordinates, the tile instance
// slot and the primitive to shade.
struct TileRasterWork
{
uint32_t tile_x, tile_y;
uint32_t tile_instance;
uint32_t primitive;
};
static_assert((sizeof(TileRasterWork) == 16), "TileRasterWork is not 16 bytes.");
// Framebuffer addressing parameters shared by the whole pass.
struct GlobalFBInfo
{
uint32_t dx_shift;
uint32_t dx_mask;
uint32_t fb_size;
uint32_t base_primitive_index;
};
// Small fixed-capacity deduplicating cache of POD state blocks.
// add() returns the index of an existing identical element when present,
// checking the most-recently-referenced slot first, and appends otherwise.
// Equality is raw memcmp, so T must be trivially copyable with
// deterministic padding.
template <typename T, unsigned N>
class StateCache
{
public:
	// Returns the slot index for t, appending it if not yet cached.
	// Asserts when the cache is full and t is not already present.
	unsigned add(const T &t)
	{
		if (cached_index >= 0 && memcmp(&elements[cached_index], &t, sizeof(T)) == 0)
			return unsigned(cached_index);

		// Search newest-to-oldest; recently added states are the most likely match.
		for (int i = int(count) - 1; i >= 0; i--)
		{
			if (memcmp(&elements[i], &t, sizeof(T)) == 0)
			{
				cached_index = i;
				return unsigned(i);
			}
		}

		assert(count < N);
		memcpy(elements + count, &t, sizeof(T));
		cached_index = int(count);
		return count++;
	}

	// True when no further unique states can be added.
	bool full() const
	{
		return count == N;
	}

	// Number of distinct cached states.
	unsigned size() const
	{
		return count;
	}

	// Size of the cached states in bytes (for GPU uploads).
	unsigned byte_size() const
	{
		return count * sizeof(T);
	}

	// Contiguous storage of the cached states.
	const T *data() const
	{
		return elements;
	}

	// Empties the cache and forgets the last-referenced slot.
	void reset()
	{
		count = 0;
		cached_index = -1;
	}

	bool empty() const
	{
		return count == 0;
	}

private:
	unsigned count = 0;
	int cached_index = -1;
	T elements[N];
};
// Fixed-capacity append-only buffer of POD elements, used to batch
// per-primitive data before uploading it to the GPU. No deduplication;
// add() asserts when capacity is exceeded.
template <typename T, unsigned N>
class StreamCache
{
public:
	// Appends a copy of t. Caller must check full() beforehand.
	void add(const T &t)
	{
		assert(count < N);
		memcpy(&elements[count], &t, sizeof(T));
		count++;
	}

	// True when capacity has been reached.
	bool full() const
	{
		return count == N;
	}

	// Number of elements appended since the last reset().
	unsigned size() const
	{
		return count;
	}

	// Size of the appended elements in bytes (for GPU uploads).
	unsigned byte_size() const
	{
		return count * sizeof(T);
	}

	// Contiguous storage of the appended elements.
	const T *data() const
	{
		return elements;
	}

	// Discards all appended elements.
	void reset()
	{
		count = 0;
	}

	bool empty() const
	{
		return count == 0;
	}

private:
	unsigned count = 0;
	T elements[N];
};
// Hard capacity limits for the renderer's fixed-size buffers and caches.
namespace Limits
{
constexpr unsigned MaxPrimitives = 256;
constexpr unsigned MaxStaticRasterizationStates = 64;
constexpr unsigned MaxDepthBlendStates = 64;
constexpr unsigned MaxTileInfoStates = 256;
constexpr unsigned NumSyncStates = 32;
constexpr unsigned MaxNumTiles = 8;
constexpr unsigned MaxTMEMInstances = 256;
constexpr unsigned MaxSpanSetups = 32 * 1024;
constexpr unsigned MaxWidth = 1024;
constexpr unsigned MaxHeight = 1024;
constexpr unsigned MaxTileInstances = 0x8000;
}
// Tuning constants for the compute implementation (workgroup/tile sizing
// and flush heuristics).
namespace ImplementationConstants
{
constexpr unsigned DefaultWorkgroupSize = 64;
constexpr unsigned TileWidth = 8;
constexpr unsigned TileHeight = 8;
constexpr unsigned MaxTilesX = Limits::MaxWidth / TileWidth;
constexpr unsigned MaxTilesY = Limits::MaxHeight / TileHeight;
constexpr unsigned IncoherentPageSize = 1024;
constexpr unsigned MaxPendingRenderPassesBeforeFlush = 8;
constexpr unsigned MinimumPrimitivesForIdleFlush = 32;
constexpr unsigned MinimumRenderPassesForIdleFlush = 2;
}
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,243 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <memory>
#include <thread>
#include <queue>
#include "device.hpp"
#include "video_interface.hpp"
#include "rdp_renderer.hpp"
#include "rdp_common.hpp"
#include "command_ring.hpp"
#include "worker_thread.hpp"
#include "rdp_dump_write.hpp"
#ifndef GRANITE_VULKAN_MT
#error "Granite Vulkan backend must be built with multithreading support."
#endif
namespace RDP
{
// One scanned-out pixel, 8 bits per channel.
struct RGBA
{
uint8_t r, g, b, a;
};
// Construction-time options for CommandProcessor: host-visible memory
// mapping, upscale factor (2x/4x/8x are mutually exclusive) and
// supersampled readback behavior.
enum CommandProcessorFlagBits
{
COMMAND_PROCESSOR_FLAG_HOST_VISIBLE_HIDDEN_RDRAM_BIT = 1 << 0,
COMMAND_PROCESSOR_FLAG_HOST_VISIBLE_TMEM_BIT = 1 << 1,
COMMAND_PROCESSOR_FLAG_UPSCALING_2X_BIT = 1 << 2,
COMMAND_PROCESSOR_FLAG_UPSCALING_4X_BIT = 1 << 3,
COMMAND_PROCESSOR_FLAG_UPSCALING_8X_BIT = 1 << 4,
COMMAND_PROCESSOR_FLAG_SUPER_SAMPLED_READ_BACK_BIT = 1 << 5,
COMMAND_PROCESSOR_FLAG_SUPER_SAMPLED_DITHER_BIT = 1 << 6
};
// Bitmask of CommandProcessorFlagBits.
using CommandProcessorFlags = uint32_t;
// One masked copy region from a GPU readback buffer into host RDRAM,
// with completion counters for the pages it touches.
struct CoherencyCopy
{
size_t src_offset = 0;
size_t mask_offset = 0;
size_t dst_offset = 0;
size_t size = 0;
std::atomic_uint32_t *counter_base = nullptr;
unsigned counters = 0;
};
// A batch of coherency copies gated on a fence / timeline value,
// processed by the timeline worker thread.
struct CoherencyOperation
{
Vulkan::Fence fence;
uint64_t timeline_value = 0;
uint8_t *dst = nullptr;
const Vulkan::Buffer *src = nullptr;
std::vector<CoherencyCopy> copies;
std::atomic_uint32_t *unlock_cookie = nullptr;
};
// These options control various behavior when upscaling to workaround glitches which arise naturally as part of upscaling.
struct Quirks
{
inline Quirks()
{
u.options.native_resolution_tex_rect = true;
u.options.native_texture_lod = false;
}
inline void set_native_resolution_tex_rect(bool enable)
{
u.options.native_resolution_tex_rect = enable;
}
inline void set_native_texture_lod(bool enable)
{
u.options.native_texture_lod = enable;
}
// Union lets the option bools be read/compared as a raw 32-bit word via
// u.words[0].
union
{
struct Opts
{
// If true, force TEX_RECT and TEX_RECT_FLIP to render without upscaling.
// Works around bilinear filtering bugs in Cycle1/Cycle2 mode where game assumed 1:1 pixel transfer.
bool native_resolution_tex_rect;
// Forces LOD to be computed as 1x upscale.
// Fixes content which relies on LOD computation to select textures in clever ways.
bool native_texture_lod;
} options;
uint32_t words[1];
} u;
};
// Front end of the paraLLEl-RDP implementation: accepts raw RDP command
// words and VI register writes, decodes them into renderer state, and
// manages RDRAM/hidden-RDRAM/TMEM buffers, scanout and CPU/GPU
// synchronization via a timeline.
class CommandProcessor
{
public:
// rdram_ptr + rdram_offset point at guest RDRAM of rdram_size bytes;
// flags selects upscaling/host-visibility options (CommandProcessorFlagBits).
CommandProcessor(Vulkan::Device &device,
void *rdram_ptr,
size_t rdram_offset,
size_t rdram_size,
size_t hidden_rdram_size,
CommandProcessorFlags flags);
~CommandProcessor();
bool device_is_supported() const;
// Synchronization.
void flush();
uint64_t signal_timeline();
void wait_for_timeline(uint64_t index);
void idle();
void begin_frame_context();
// Queues up state and drawing commands.
void enqueue_command(unsigned num_words, const uint32_t *words);
void enqueue_command_direct(unsigned num_words, const uint32_t *words);
void set_quirks(const Quirks &quirks);
// Interact with memory.
void *begin_read_rdram();
void end_write_rdram();
void *begin_read_hidden_rdram();
void end_write_hidden_rdram();
size_t get_rdram_size() const;
size_t get_hidden_rdram_size() const;
void *get_tmem();
// Sets VI register
void set_vi_register(VIRegister reg, uint32_t value);
// Scanout: render the current VI state to an image, a CPU pixel buffer,
// or an asynchronous readback buffer.
Vulkan::ImageHandle scanout(const ScanoutOptions &opts = {});
void scanout_sync(std::vector<RGBA> &colors, unsigned &width, unsigned &height);
void scanout_async_buffer(VIScanoutBuffer &buffer, const ScanoutOptions &opts = {});
private:
Vulkan::Device &device;
Vulkan::BufferHandle rdram;
Vulkan::BufferHandle hidden_rdram;
Vulkan::BufferHandle tmem;
size_t rdram_offset;
size_t rdram_size;
CommandProcessorFlags flags;
#ifndef PARALLEL_RDP_SHADER_DIR
std::unique_ptr<ShaderBank> shader_bank;
#endif
CommandRing ring;
VideoInterface vi;
Renderer renderer;
void clear_hidden_rdram();
void clear_tmem();
void clear_buffer(Vulkan::Buffer &buffer, uint32_t value);
void init_renderer();
void enqueue_command_inner(unsigned num_words, const uint32_t *words);
Vulkan::ImageHandle scanout(const ScanoutOptions &opts, VkImageLayout target_layout);
// One handler per RDP opcode; the macro keeps the declaration list compact.
#define OP(x) void op_##x(const uint32_t *words)
OP(fill_triangle); OP(fill_z_buffer_triangle); OP(texture_triangle); OP(texture_z_buffer_triangle);
OP(shade_triangle); OP(shade_z_buffer_triangle); OP(shade_texture_triangle); OP(shade_texture_z_buffer_triangle);
OP(texture_rectangle); OP(texture_rectangle_flip); OP(sync_load); OP(sync_pipe);
OP(sync_tile); OP(sync_full); OP(set_key_gb); OP(set_key_r);
OP(set_convert); OP(set_scissor); OP(set_prim_depth); OP(set_other_modes);
OP(load_tlut); OP(set_tile_size); OP(load_block);
OP(load_tile); OP(set_tile); OP(fill_rectangle); OP(set_fill_color);
OP(set_fog_color); OP(set_blend_color); OP(set_prim_color); OP(set_env_color);
OP(set_combine); OP(set_texture_image); OP(set_mask_image); OP(set_color_image);
#undef OP
// Shadow copies of the decoded RDP state, forwarded to the renderer.
ScissorState scissor_state = {};
StaticRasterizationState static_state = {};
DepthBlendState depth_blend = {};
struct
{
uint32_t addr;
uint32_t width;
TextureFormat fmt;
TextureSize size;
} texture_image = {};
uint64_t timeline_value = 0;
uint64_t thread_timeline_value = 0;
// Executes fenced coherency work on the timeline worker thread and
// publishes the completed timeline value through `value`.
struct FenceExecutor
{
explicit inline FenceExecutor(Vulkan::Device *device_, uint64_t *ptr)
: device(device_), value(ptr)
{
}
Vulkan::Device *device;
uint64_t *value;
bool is_sentinel(const CoherencyOperation &work) const;
void perform_work(CoherencyOperation &work);
void notify_work_locked(const CoherencyOperation &work);
};
WorkerThread<CoherencyOperation, FenceExecutor> timeline_worker;
uint8_t *host_rdram = nullptr;
bool measure_stall_time = false;
bool single_threaded_processing = false;
bool is_supported = false;
bool is_host_coherent = true;
bool timestamp = false;
friend class Renderer;
void enqueue_coherency_operation(CoherencyOperation &&op);
void drain_command_ring();
void decode_triangle_setup(TriangleSetup &setup, const uint32_t *words) const;
Quirks quirks;
std::unique_ptr<RDPDumpWriter> dump_writer;
bool dump_in_command_list = false;
};
}

View File

@ -0,0 +1,151 @@
/* Copyright (c) 2021 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "rdp_dump_write.hpp"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
namespace RDP
{
// Finalizes the dump via end() (writes the EOF marker, closes and nulls
// `file`). The trailing fclose is defensive only: after end() runs,
// `file` is always nullptr.
RDPDumpWriter::~RDPDumpWriter()
{
end();
if (file)
fclose(file);
}
// Starts a new dump at `path`. Returns false if a dump is already open or
// the file cannot be created. Both shadow caches are reset to zero-filled
// buffers (clear + resize), so the first flush records every non-zero
// 4 KiB block. Header: magic "RDPDUMP2" followed by dram_size and
// hidden_dram_size written in host byte order (raw fwrite).
bool RDPDumpWriter::init(const char *path, uint32_t dram_size, uint32_t hidden_dram_size)
{
if (file)
return false;
rdp_dram_cache.clear();
rdp_dram_cache.resize(dram_size);
rdp_hidden_dram_cache.clear();
rdp_hidden_dram_cache.resize(hidden_dram_size);
file = fopen(path, "wb");
if (!file)
return false;
fwrite("RDPDUMP2", 8, 1, file);
fwrite(&dram_size, sizeof(dram_size), 1, file);
fwrite(&hidden_dram_size, sizeof(hidden_dram_size), 1, file);
return true;
}
// Writes an END_FRAME marker; no-op when no dump is active.
void RDPDumpWriter::end_frame()
{
if (!file)
return;
uint32_t cmd = RDP_DUMP_CMD_END_FRAME;
fwrite(&cmd, sizeof(cmd), 1, file);
}
// Terminates the dump: writes the EOF marker, closes the file, clears
// `file` so further calls become no-ops, and frees the shadow caches.
void RDPDumpWriter::end()
{
if (!file)
return;
uint32_t cmd = RDP_DUMP_CMD_EOF;
fwrite(&cmd, sizeof(cmd), 1, file);
fclose(file);
file = nullptr;
rdp_dram_cache.clear();
rdp_hidden_dram_cache.clear();
}
// Writes every 4 KiB block of dram_ that differs from the shadow cache as
// a block_cmd record (offset, length, payload), updates the cache to
// match, and terminates the sequence with a single flush_cmd record.
// No-op when no dump is active.
void RDPDumpWriter::flush(const void *dram_, uint32_t size,
RDPDumpCmd block_cmd, RDPDumpCmd flush_cmd,
uint8_t *cache)
{
if (!file)
return;
const auto *dram = static_cast<const uint8_t *>(dram_);
const uint32_t block_size = 4 * 1024;
for (uint32_t i = 0; i < size; i += block_size)
{
// Clamp the final block so we never read (or record) past the end of
// buffers whose size is not a multiple of block_size.
uint32_t chunk = (size - i < block_size) ? (size - i) : block_size;
if (memcmp(dram + i, cache + i, chunk) != 0)
{
uint32_t cmd = block_cmd;
fwrite(&cmd, sizeof(cmd), 1, file);
fwrite(&i, sizeof(i), 1, file);
fwrite(&chunk, sizeof(chunk), 1, file);
fwrite(dram + i, 1, chunk, file);
memcpy(cache + i, dram + i, chunk);
}
}
uint32_t cmd = flush_cmd;
fwrite(&cmd, sizeof(cmd), 1, file);
}
// Records changed RDRAM blocks against the RDRAM shadow cache.
void RDPDumpWriter::flush_dram(const void *dram_, uint32_t size)
{
flush(dram_, size, RDP_DUMP_CMD_UPDATE_DRAM, RDP_DUMP_CMD_UPDATE_DRAM_FLUSH, rdp_dram_cache.data());
}
// Records changed hidden-RDRAM blocks against its shadow cache.
void RDPDumpWriter::flush_hidden_dram(const void *dram_, uint32_t size)
{
flush(dram_, size, RDP_DUMP_CMD_UPDATE_HIDDEN_DRAM, RDP_DUMP_CMD_UPDATE_HIDDEN_DRAM_FLUSH, rdp_hidden_dram_cache.data());
}
// Writes a SIGNAL_COMPLETE marker; no-op when no dump is active.
void RDPDumpWriter::signal_complete()
{
if (!file)
return;
uint32_t cmd = RDP_DUMP_CMD_SIGNAL_COMPLETE;
fwrite(&cmd, sizeof(cmd), 1, file);
}
// Records one raw RDP command: RDP_COMMAND marker, opcode, word count,
// then the payload words. No-op when no dump is active.
void RDPDumpWriter::emit_command(uint32_t command, const uint32_t *cmd_data, uint32_t cmd_words)
{
if (!file)
return;
uint32_t cmd = RDP_DUMP_CMD_RDP_COMMAND;
fwrite(&cmd, sizeof(cmd), 1, file);
fwrite(&command, sizeof(command), 1, file);
fwrite(&cmd_words, sizeof(cmd_words), 1, file);
fwrite(cmd_data, sizeof(*cmd_data), cmd_words, file);
}
// Records a VI register write (register index + value).
void RDPDumpWriter::set_vi_register(uint32_t vi_register, uint32_t value)
{
if (!file)
return;
uint32_t cmd = RDP_DUMP_CMD_SET_VI_REGISTER;
fwrite(&cmd, sizeof(cmd), 1, file);
fwrite(&vi_register, sizeof(vi_register), 1, file);
fwrite(&value, sizeof(value), 1, file);
}
}

View File

@ -0,0 +1,65 @@
/* Copyright (c) 2021 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <stdint.h>
#include <stdio.h>
#include <vector>
namespace RDP
{
// Streams an "RDPDUMP2" capture to disk: RDRAM / hidden-RDRAM deltas,
// raw RDP commands and VI register writes, framed by per-frame markers.
// All record writers are no-ops until init() succeeds.
class RDPDumpWriter
{
public:
RDPDumpWriter() = default;
// Owns a raw FILE handle; copying would double-fclose it on destruction.
RDPDumpWriter(const RDPDumpWriter &) = delete;
RDPDumpWriter &operator=(const RDPDumpWriter &) = delete;
// Writes the EOF marker (if a dump is active) and closes the file.
~RDPDumpWriter();
// Opens path, writes the header, and allocates zero-filled shadow caches
// used to detect memory deltas. Returns false if a dump is already open
// or the file cannot be created.
bool init(const char *path, uint32_t dram_size, uint32_t hidden_dram_size);
// Record RDRAM blocks that changed since the previous flush.
void flush_dram(const void *dram, uint32_t size);
// Same, for the hidden RDRAM.
void flush_hidden_dram(const void *dram, uint32_t size);
void signal_complete();
// Record one raw RDP command: opcode plus cmd_words payload words.
void emit_command(uint32_t command, const uint32_t *cmd_data, uint32_t cmd_words);
void set_vi_register(uint32_t vi_register, uint32_t value);
void end_frame();
private:
// On-disk record tags; values are part of the dump format, do not renumber.
enum RDPDumpCmd : uint32_t
{
RDP_DUMP_CMD_INVALID = 0,
RDP_DUMP_CMD_UPDATE_DRAM = 1,
RDP_DUMP_CMD_RDP_COMMAND = 2,
RDP_DUMP_CMD_SET_VI_REGISTER = 3,
RDP_DUMP_CMD_END_FRAME = 4,
RDP_DUMP_CMD_SIGNAL_COMPLETE = 5,
RDP_DUMP_CMD_EOF = 6,
RDP_DUMP_CMD_UPDATE_DRAM_FLUSH = 7,
RDP_DUMP_CMD_UPDATE_HIDDEN_DRAM = 8,
RDP_DUMP_CMD_UPDATE_HIDDEN_DRAM_FLUSH = 9,
RDP_DUMP_CMD_INT_MAX = 0x7fffffff
};
FILE *file = nullptr;
// Shadow copies of guest memory as of the last flush; only differing
// blocks are written to the dump.
std::vector<uint8_t> rdp_dram_cache;
std::vector<uint8_t> rdp_hidden_dram_cache;
void flush(const void *dram_, uint32_t size, RDPDumpCmd block_cmd, RDPDumpCmd flush_cmd, uint8_t *cache);
void end();
};
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,393 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "rdp_data_structures.hpp"
#include "device.hpp"
#include "rdp_common.hpp"
#include "worker_thread.hpp"
#include <unordered_set>
namespace RDP
{
struct CoherencyOperation;
// Wrapper for a fence used to track render-pass completion.
struct SyncObject
{
Vulkan::Fence fence;
};
// Framebuffer pixel format as decoded from SET_COLOR_IMAGE state.
enum class FBFormat : uint32_t
{
I4 = 0,
I8 = 1,
RGBA5551 = 2,
IA88 = 3,
RGBA8888 = 4
};
// Kind of TMEM upload: LOAD_TILE, LOAD_TLUT or LOAD_BLOCK.
enum class UploadMode : uint32_t
{
Tile = 0,
TLUT = 1,
Block = 2
};
// Source image parameters plus S/T bounds for one TMEM upload.
struct LoadTileInfo
{
uint32_t tex_addr;
uint32_t tex_width;
uint16_t slo, tlo, shi, thi;
TextureFormat fmt;
TextureSize size;
UploadMode mode;
};
class CommandProcessor;
// Renderer construction options: integer upscale factor and whether
// readbacks resolve from supersampled storage (optionally dithered).
struct RendererOptions
{
unsigned upscaling_factor = 1;
bool super_sampled_readback = false;
bool super_sampled_readback_dither = false;
};
class Renderer : public Vulkan::DebugChannelInterface
{
public:
explicit Renderer(CommandProcessor &processor);
~Renderer();
void set_device(Vulkan::Device *device);
// If coherent is false, RDRAM is a buffer split into data in lower half and writemask state in upper half, each part being size large.
// offset must be 0 in this case.
void set_rdram(Vulkan::Buffer *buffer, uint8_t *host_rdram, size_t offset, size_t size, bool coherent);
void set_hidden_rdram(Vulkan::Buffer *buffer);
void set_tmem(Vulkan::Buffer *buffer);
void set_shader_bank(const ShaderBank *bank);
bool init_renderer(const RendererOptions &options);
// setup may be mutated to apply various fixups to triangle setup.
void draw_flat_primitive(TriangleSetup &setup);
void draw_shaded_primitive(TriangleSetup &setup, const AttributeSetup &attr);
void set_color_framebuffer(uint32_t addr, uint32_t width, FBFormat fmt);
void set_depth_framebuffer(uint32_t addr);
void set_scissor_state(const ScissorState &state);
void set_static_rasterization_state(const StaticRasterizationState &state);
void set_depth_blend_state(const DepthBlendState &state);
void set_tile(uint32_t tile, const TileMeta &info);
void set_tile_size(uint32_t tile, uint32_t slo, uint32_t shi, uint32_t tlo, uint32_t thi);
void load_tile(uint32_t tile, const LoadTileInfo &info);
void load_tile_iteration(uint32_t tile, const LoadTileInfo &info, uint32_t tmem_offset);
void set_blend_color(uint32_t color);
void set_fog_color(uint32_t color);
void set_env_color(uint32_t color);
void set_primitive_color(uint8_t min_level, uint8_t prim_lod_frac, uint32_t color);
void set_fill_color(uint32_t color);
void set_primitive_depth(uint16_t prim_depth, uint16_t prim_dz);
void set_enable_primitive_depth(bool enable);
void set_convert(uint16_t k0, uint16_t k1, uint16_t k2, uint16_t k3, uint16_t k4, uint16_t k5);
void set_color_key(unsigned component, uint32_t width, uint32_t center, uint32_t scale);
// Called when the command thread has not seen any activity in a given period of time.
// This is useful so we don't needlessly queue up work when we might as well kick it to the GPU.
void notify_idle_command_thread();
void flush_and_signal();
int resolve_shader_define(const char *name, const char *define) const;
void resolve_coherency_external(unsigned offset, unsigned length);
void submit_update_upscaled_domain_external(Vulkan::CommandBuffer &cmd,
unsigned addr, unsigned pixels, unsigned pixel_size_log2);
unsigned get_scaling_factor() const;
const Vulkan::Buffer *get_upscaled_rdram_buffer() const;
const Vulkan::Buffer *get_upscaled_hidden_rdram_buffer() const;
void lock_command_processing();
void unlock_command_processing();
private:
CommandProcessor &processor;
Vulkan::Device *device = nullptr;
Vulkan::Buffer *rdram = nullptr;
Vulkan::BufferHandle upscaling_reference_rdram;
Vulkan::BufferHandle upscaling_multisampled_rdram;
Vulkan::BufferHandle upscaling_multisampled_hidden_rdram;
struct
{
uint8_t *host_rdram = nullptr;
Vulkan::BufferHandle staging_rdram;
Vulkan::BufferHandle staging_readback;
std::unique_ptr<std::atomic_uint32_t[]> pending_writes_for_page;
std::vector<uint32_t> page_to_direct_copy;
std::vector<uint32_t> page_to_masked_copy;
std::vector<uint32_t> page_to_pending_readback;
unsigned num_pages = 0;
unsigned staging_readback_pages = 0;
unsigned staging_readback_index = 0; // Ringbuffer the readbacks.
} incoherent;
size_t rdram_offset = 0;
size_t rdram_size = 0;
bool is_host_coherent = false;
Vulkan::Buffer *hidden_rdram = nullptr;
Vulkan::Buffer *tmem = nullptr;
const ShaderBank *shader_bank = nullptr;
bool init_caps();
void init_blender_lut();
void init_buffers(const RendererOptions &options);
bool init_internal_upscaling_factor(const RendererOptions &options);
struct
{
uint32_t addr = 0;
uint32_t depth_addr = 0;
uint32_t width = 0;
uint32_t deduced_height = 0;
FBFormat fmt = FBFormat::I8;
bool depth_write_pending = false;
bool color_write_pending = false;
} fb;
struct StreamCaches
{
ScissorState scissor_state = {};
StaticRasterizationState static_raster_state = {};
DepthBlendState depth_blend_state = {};
StateCache<StaticRasterizationState, Limits::MaxStaticRasterizationStates> static_raster_state_cache;
StateCache<DepthBlendState, Limits::MaxDepthBlendStates> depth_blend_state_cache;
StateCache<TileInfo, Limits::MaxTileInfoStates> tile_info_state_cache;
StreamCache<TriangleSetup, Limits::MaxPrimitives> triangle_setup;
StreamCache<ScissorState, Limits::MaxPrimitives> scissor_setup;
StreamCache<AttributeSetup, Limits::MaxPrimitives> attribute_setup;
StreamCache<DerivedSetup, Limits::MaxPrimitives> derived_setup;
StreamCache<InstanceIndices, Limits::MaxPrimitives> state_indices;
StreamCache<SpanInfoOffsets, Limits::MaxPrimitives> span_info_offsets;
StreamCache<SpanInterpolationJob, Limits::MaxSpanSetups> span_info_jobs;
std::vector<UploadInfo> tmem_upload_infos;
unsigned max_shaded_tiles = 0;
Vulkan::CommandBufferHandle cmd;
} stream;
void ensure_command_buffer();
TileInfo tiles[Limits::MaxNumTiles];
Vulkan::BufferHandle tmem_instances;
Vulkan::BufferHandle span_setups;
Vulkan::BufferHandle blender_divider_lut_buffer;
Vulkan::BufferViewHandle blender_divider_buffer;
Vulkan::BufferHandle tile_binning_buffer;
Vulkan::BufferHandle tile_binning_buffer_coarse;
Vulkan::BufferHandle indirect_dispatch_buffer;
Vulkan::BufferHandle tile_work_list;
Vulkan::BufferHandle per_tile_offsets;
Vulkan::BufferHandle per_tile_shaded_color;
Vulkan::BufferHandle per_tile_shaded_depth;
Vulkan::BufferHandle per_tile_shaded_shaded_alpha;
Vulkan::BufferHandle per_tile_shaded_coverage;
struct MappedBuffer
{
Vulkan::BufferHandle buffer;
bool is_host = false;
};
// The full set of per-frame GPU buffers mirroring the StreamCaches streams.
// Two instances exist per updater: a CPU (host-visible staging) copy and a
// GPU (device-local) copy; see RenderBuffersUpdater.
struct RenderBuffers
{
// Allocates all member buffers in the given memory domain. When `borrow`
// is non-null, buffers are shared with the other instance instead of
// allocated separately (presumably for UMA devices — confirm in .cpp).
void init(Vulkan::Device &device, Vulkan::BufferDomain domain, RenderBuffers *borrow);
static MappedBuffer create_buffer(Vulkan::Device &device, Vulkan::BufferDomain domain, VkDeviceSize size, MappedBuffer *borrow);
// One buffer per stream/state cache in StreamCaches.
MappedBuffer triangle_setup;
MappedBuffer attribute_setup;
MappedBuffer derived_setup;
MappedBuffer scissor_setup;
MappedBuffer static_raster_state;
MappedBuffer depth_blend_state;
MappedBuffer tile_info_state;
MappedBuffer state_indices;
MappedBuffer span_info_offsets;
MappedBuffer span_info_jobs;
// Texel-buffer view over span_info_jobs for shader access.
Vulkan::BufferViewHandle span_info_jobs_view;
};
// Pairs a host-visible staging RenderBuffers with a device-local one and
// records the copies needed to move stream data from CPU to GPU.
struct RenderBuffersUpdater
{
void init(Vulkan::Device &device);
// Uploads all dirty stream caches for this frame.
void upload(Vulkan::Device &device, const StreamCaches &caches, Vulkan::CommandBuffer &cmd);
// Uploads a single cache; sets did_upload when a copy was actually recorded.
template <typename Cache>
void upload(Vulkan::CommandBuffer &cmd, Vulkan::Device &device,
const MappedBuffer &gpu, const MappedBuffer &cpu, const Cache &cache, bool &did_upload);
RenderBuffers cpu, gpu;
};
// Per-sync-state fence used to know when a previous submission's buffers
// can be reused (indexed by Limits::NumSyncStates below).
struct InternalSynchronization
{
Vulkan::Fence fence;
};
// Latched RDP global state registers (set by the corresponding RDP
// commands; consumed when building DerivedSetup — see build_combiner_constants).
struct Constants
{
uint32_t blend_color = 0;
uint32_t fog_color = 0;
uint32_t env_color = 0;
uint32_t primitive_color = 0;
uint32_t fill_color = 0;
uint8_t min_level = 0;
uint8_t prim_lod_frac = 0;
// Primitive depth registers; used instead of interpolated Z when
// use_prim_depth is set.
int32_t prim_depth = 0;
uint16_t prim_dz = 0;
// Color-convert (YUV) coefficients K0-K5.
uint16_t convert[6] = {};
// Chroma-key width/center/scale per RGB channel.
uint16_t key_width[3] = {};
uint8_t key_center[3] = {};
uint8_t key_scale[3] = {};
bool use_prim_depth = false;
} constants;
RenderBuffersUpdater buffer_instances[Limits::NumSyncStates];
InternalSynchronization internal_sync[Limits::NumSyncStates];
uint32_t sync_indices_needs_flush = 0;
unsigned buffer_instance = 0;
uint32_t base_primitive_index = 0;
unsigned pending_render_passes = 0;
unsigned pending_render_passes_upscaled = 0;
unsigned pending_primitives = 0;
unsigned pending_primitives_upscaled = 0;
bool tmem_upload_needs_flush(uint32_t addr) const;
bool render_pass_is_upscaled() const;
bool should_render_upscaled() const;
void flush_queues();
void submit_render_pass(Vulkan::CommandBuffer &cmd);
void submit_render_pass_upscaled(Vulkan::CommandBuffer &cmd);
void submit_render_pass_end(Vulkan::CommandBuffer &cmd);
void submit_to_queue();
void begin_new_context();
void reset_context();
bool need_flush() const;
void maintain_queues();
void maintain_queues_idle();
void update_tmem_instances(Vulkan::CommandBuffer &cmd);
void submit_span_setup_jobs(Vulkan::CommandBuffer &cmd, bool upscaled);
void update_deduced_height(const TriangleSetup &setup);
void submit_tile_binning_combined(Vulkan::CommandBuffer &cmd, bool upscaled);
void clear_indirect_buffer(Vulkan::CommandBuffer &cmd);
void submit_rasterization(Vulkan::CommandBuffer &cmd, Vulkan::Buffer &tmem, bool upscaled);
void submit_depth_blend(Vulkan::CommandBuffer &cmd, Vulkan::Buffer &tmem, bool upscaled, bool force_write_mask);
enum class ResolveStage { Pre, Post, SSAAResolve };
void submit_update_upscaled_domain(Vulkan::CommandBuffer &cmd, ResolveStage stage);
void submit_update_upscaled_domain(Vulkan::CommandBuffer &cmd, ResolveStage stage,
unsigned addr, unsigned depth_addr,
unsigned width, unsigned height,
unsigned pixel_size_log2);
void submit_clear_super_sample_write_mask(Vulkan::CommandBuffer &cmd, unsigned width, unsigned height);
SpanInfoOffsets allocate_span_jobs(const TriangleSetup &setup);
DerivedSetup build_derived_attributes(const AttributeSetup &attr) const;
void build_combiner_constants(DerivedSetup &setup, unsigned cycle) const;
int filter_debug_channel_x = -1;
int filter_debug_channel_y = -1;
bool debug_channel = false;
void message(const std::string &tag, uint32_t code,
uint32_t x, uint32_t y, uint32_t z,
uint32_t num_words, const Vulkan::DebugChannelInterface::Word *words) override;
bool can_support_minimum_subgroup_size(unsigned size) const;
bool supports_subgroup_size_control(uint32_t minimum_size, uint32_t maximum_size) const;
std::unordered_set<Util::Hash> pending_async_pipelines;
unsigned compute_conservative_max_num_tiles(const TriangleSetup &setup) const;
void deduce_static_texture_state(unsigned tile, unsigned max_lod_level);
void deduce_noise_state();
static StaticRasterizationState normalize_static_state(StaticRasterizationState state);
void fixup_triangle_setup(TriangleSetup &setup) const;
// Device/runtime capabilities and configured limits, resolved once at init
// and consulted throughout rendering.
struct Caps
{
int timestamp = 0;
// Force synchronous operation (no async render-pass overlap).
bool force_sync = false;
// Use the single "ubershader" path instead of specialized pipelines.
bool ubershader = false;
// 8/16-bit integer arithmetic available in shaders.
bool supports_small_integer_arithmetic = false;
// Subgroup-accelerated variants of the binning / depth-blend passes.
bool subgroup_tile_binning = false;
bool subgroup_depth_blend = false;
// Super-sampled (SSAA) readback support; dither variant applies a
// dithered resolve.
bool super_sample_readback = false;
bool super_sample_readback_dither = false;
// Upscale factor (1 = native resolution).
unsigned upscaling = 1;
unsigned max_num_tile_instances = Limits::MaxTileInstances;
unsigned max_tiles_x = ImplementationConstants::MaxTilesX;
unsigned max_tiles_y = ImplementationConstants::MaxTilesY;
unsigned max_width = Limits::MaxWidth;
unsigned max_height = Limits::MaxHeight;
} caps;
// Work-item handler for the async pipeline-compilation worker thread
// (see pipeline_worker below): compiles deferred pipelines off the
// critical path.
struct PipelineExecutor
{
Vulkan::Device *device;
// Sentinel item used to wake/terminate the worker loop.
bool is_sentinel(const Vulkan::DeferredPipelineCompile &compile) const;
void perform_work(const Vulkan::DeferredPipelineCompile &compile) const;
void notify_work_locked(const Vulkan::DeferredPipelineCompile &compile) const;
};
std::unique_ptr<WorkerThread<Vulkan::DeferredPipelineCompile, PipelineExecutor>> pipeline_worker;
void resolve_coherency_host_to_gpu(Vulkan::CommandBuffer &cmd);
void resolve_coherency_gpu_to_host(CoherencyOperation &op, Vulkan::CommandBuffer &cmd);
uint32_t get_byte_size_for_bound_color_framebuffer() const;
uint32_t get_byte_size_for_bound_depth_framebuffer() const;
void mark_pages_for_gpu_read(uint32_t base_addr, uint32_t byte_count);
void lock_pages_for_gpu_write(uint32_t base_addr, uint32_t byte_count);
std::atomic_uint32_t active_submissions;
void enqueue_fence_wait(Vulkan::Fence fence);
uint64_t last_submit_ns = 0;
std::mutex idle_lock;
};
}

View File

@ -0,0 +1,130 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef BINNING_H_
#define BINNING_H_
// There are 4 critical Y coordinates to test when binning. Top, bottom, mid, and mid - 1.
const int SUBPIXELS_Y = 4;
// Drops the 15 fractional bits of a fixed-point X coordinate, yielding an
// integer pixel coordinate (arithmetic shift preserves sign).
ivec4 quantize_x(ivec4 x)
{
return x >> 15;
}
// Horizontal minimum: smallest of the four components of v.
int minimum4(ivec4 v)
{
return min(min(v.x, v.y), min(v.z, v.w));
}
// Horizontal maximum: largest of the four components of v.
int maximum4(ivec4 v)
{
return max(max(v.x, v.y), max(v.z, v.w));
}
// Computes a * b + c per component with a 64-bit intermediate, returning the
// low 32 bits and writing the high 32 bits to hi_bits so callers can detect
// overflow (see interpolate_xs).
ivec4 madd_32_64(ivec4 a, int b, int c, out ivec4 hi_bits)
{
ivec4 lo, hi;
// 32x32 -> 64-bit signed multiply split into hi/lo words.
imulExtended(a, ivec4(b), hi, lo);
uvec4 carry;
// Add c to the low word; propagate the carry into the high word.
lo = ivec4(uaddCarry(lo, uvec4(c), carry));
hi += ivec4(carry);
hi_bits = hi;
return lo;
}
// Interpolates the triangle's left/right edge X coordinates at the four
// sampled Y values (ys) and returns a conservative [min, max] X range for
// binning. `flip` selects which edge is left vs. right; `scaling` is the
// upscale factor.
ivec2 interpolate_xs(TriangleSetup setup, ivec4 ys, bool flip, int scaling)
{
// XH interpolation starts at YH rounded down to a whole scanline.
int yh_interpolation_base = setup.yh & ~(SUBPIXELS_Y - 1);
int ym_interpolation_base = setup.ym;
yh_interpolation_base *= scaling;
ym_interpolation_base *= scaling;
// Interpolate in 64-bit so we can detect quirky overflow scenarios.
ivec4 xh_hi, xm_hi, xl_hi;
ivec4 xh = madd_32_64(ys - yh_interpolation_base, setup.dxhdy, scaling * setup.xh, xh_hi);
ivec4 xm = madd_32_64(ys - yh_interpolation_base, setup.dxmdy, scaling * setup.xm, xm_hi);
ivec4 xl = madd_32_64(ys - ym_interpolation_base, setup.dxldy, scaling * setup.xl, xl_hi);
// Above YM the XM edge is active instead of XL; select per sample.
xl = mix(xl, xm, lessThan(ys, ivec4(scaling * setup.ym)));
xl_hi = mix(xl_hi, xm_hi, lessThan(ys, ivec4(scaling * setup.ym)));
// Handle overflow scenarios. Saturate 64-bit signed to 32-bit signed without 64-bit math.
xh = mix(xh, ivec4(0x7fffffff), greaterThan(xh_hi, ivec4(0)));
xh = mix(xh, ivec4(-0x80000000), lessThan(xh_hi, ivec4(-1)));
xl = mix(xl, ivec4(0x7fffffff), greaterThan(xl_hi, ivec4(0)));
xl = mix(xl, ivec4(-0x80000000), lessThan(xl_hi, ivec4(-1)));
ivec4 xh_shifted = quantize_x(xh);
ivec4 xl_shifted = quantize_x(xl);
ivec4 xleft, xright;
if (flip)
{
xleft = xh_shifted;
xright = xl_shifted;
}
else
{
xleft = xl_shifted;
xright = xh_shifted;
}
// If one of the results are out of range, we have overflow, and we need to be conservative when binning.
int max_range = maximum4(max(abs(xleft), abs(xright)));
ivec2 range;
if (max_range <= 2047 * scaling)
range = ivec2(minimum4(xleft), maximum4(xright));
else
range = ivec2(0, 0x7fffffff);
return range;
}
// Returns true if the triangle potentially covers any pixel in the tile
// spanning [lo, hi] (inclusive, in pixels), at the given upscale factor.
bool bin_primitive(TriangleSetup setup, ivec2 lo, ivec2 hi, int scaling)
{
// Convert the tile's pixel rows to subpixel Y coordinates.
int start_y = lo.y * SUBPIXELS_Y;
int end_y = (hi.y * SUBPIXELS_Y) + (SUBPIXELS_Y - 1);
// First, we clip start/end against y_lo, y_hi.
start_y = max(start_y, scaling * int(setup.yh));
end_y = min(end_y, scaling * int(setup.yl) - 1);
// Y is clipped out, exit early.
if (end_y < start_y)
return false;
bool flip = (setup.flags & TRIANGLE_SETUP_FLIP_BIT) != 0;
// Sample the X ranges for min and max Y, and potentially the mid-point as well.
ivec4 ys = ivec4(start_y, end_y, clamp(setup.ym * scaling + ivec2(-1, 0), ivec2(start_y), ivec2(end_y)));
ivec2 x_range = interpolate_xs(setup, ys, flip, scaling);
// Intersect with the tile's X extent.
x_range.x = max(x_range.x, lo.x);
x_range.y = min(x_range.y, hi.x);
return x_range.x <= x_range.y;
}
#endif

View File

@ -0,0 +1,145 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef BLENDER_H_
#define BLENDER_H_
// Inputs to the blender stage: the combiner output (pixel_color), the
// framebuffer color read back from memory, and the latched fog/blend
// color registers plus shade alpha.
struct BlendInputs
{
u8x4 pixel_color;
u8x4 memory_color;
u8x4 fog_color;
u8x4 blend_color;
u8 shade_alpha;
};
// Blender mux selector encodings (matching the RDP blend-mode bitfields).
// 1A: color source for the first operand.
const int BLEND_MODE_1A_PIXEL_COLOR = 0;
const int BLEND_MODE_1A_MEMORY_COLOR = 1;
const int BLEND_MODE_1A_BLEND_COLOR = 2;
const int BLEND_MODE_1A_FOG_COLOR = 3;
// 1B: alpha weight applied to operand 1A.
const int BLEND_MODE_1B_PIXEL_ALPHA = 0;
const int BLEND_MODE_1B_FOG_ALPHA = 1;
const int BLEND_MODE_1B_SHADE_ALPHA = 2;
const int BLEND_MODE_1B_ZERO = 3;
// 2A: color source for the second operand.
const int BLEND_MODE_2A_PIXEL_COLOR = 0;
const int BLEND_MODE_2A_MEMORY_COLOR = 1;
const int BLEND_MODE_2A_BLEND_COLOR = 2;
const int BLEND_MODE_2A_FOG_COLOR = 3;
// 2B: alpha weight applied to operand 2A.
const int BLEND_MODE_2B_INV_PIXEL_ALPHA = 0;
const int BLEND_MODE_2B_MEMORY_ALPHA = 1;
const int BLEND_MODE_2B_ONE = 2;
const int BLEND_MODE_2B_ZERO = 3;
// RDP blender: computes (rgb0 * a0 + rgb1 * a1) with the sources selected
// by blend_modes (x/y/z/w = 1A/1B/2A/2B muxes), replicating the hardware's
// shift-down and divider-LUT behavior. Returns the blended 8-bit RGB.
u8x3 blender(BlendInputs inputs, u8x4 blend_modes,
bool force_blend, bool blend_en, bool color_on_coverage, bool coverage_wrap, u8x2 blend_shift,
bool final_cycle)
{
// Select second operand color (2A mux).
u8x3 rgb1;
switch (int(blend_modes.z))
{
case BLEND_MODE_2A_PIXEL_COLOR: rgb1 = inputs.pixel_color.rgb; break;
case BLEND_MODE_2A_MEMORY_COLOR: rgb1 = inputs.memory_color.rgb; break;
case BLEND_MODE_2A_BLEND_COLOR: rgb1 = inputs.blend_color.rgb; break;
case BLEND_MODE_2A_FOG_COLOR: rgb1 = inputs.fog_color.rgb; break;
}
// Color-on-coverage: on edge pixels without coverage wrap, pass through
// the second operand unblended.
if (final_cycle)
{
if (color_on_coverage && !coverage_wrap)
return rgb1;
}
// Select first operand color (1A mux).
u8x3 rgb0;
switch (int(blend_modes.x))
{
case BLEND_MODE_1A_PIXEL_COLOR: rgb0 = inputs.pixel_color.rgb; break;
case BLEND_MODE_1A_MEMORY_COLOR: rgb0 = inputs.memory_color.rgb; break;
case BLEND_MODE_1A_BLEND_COLOR: rgb0 = inputs.blend_color.rgb; break;
case BLEND_MODE_1A_FOG_COLOR: rgb0 = inputs.fog_color.rgb; break;
}
// Skip blending entirely when disabled, or when the mode reduces to
// (pixel * a + pixel * (1 - a)) with a == 1.0, which is identity.
if (final_cycle)
{
if (!blend_en || (blend_modes.y == BLEND_MODE_1B_PIXEL_ALPHA &&
blend_modes.w == BLEND_MODE_2B_INV_PIXEL_ALPHA &&
inputs.pixel_color.a == U8_C(0xff)))
{
return rgb0;
}
}
// Select alpha weights (1B / 2B muxes).
u8 a0;
u8 a1;
switch (int(blend_modes.y))
{
case BLEND_MODE_1B_PIXEL_ALPHA: a0 = inputs.pixel_color.a; break;
case BLEND_MODE_1B_FOG_ALPHA: a0 = inputs.fog_color.a; break;
case BLEND_MODE_1B_SHADE_ALPHA: a0 = inputs.shade_alpha; break;
case BLEND_MODE_1B_ZERO: a0 = U8_C(0); break;
}
switch (int(blend_modes.w))
{
case BLEND_MODE_2B_INV_PIXEL_ALPHA: a1 = ~a0 & U8_C(0xff); break;
case BLEND_MODE_2B_MEMORY_ALPHA: a1 = inputs.memory_color.a; break;
case BLEND_MODE_2B_ONE: a1 = U8_C(0xff); break;
case BLEND_MODE_2B_ZERO: a1 = U8_C(0); break;
}
// Weights are truncated to 5 bits, as on hardware.
a0 >>= U8_C(3);
a1 >>= U8_C(3);
// Memory-alpha mode applies the per-pixel blend shifters (interpenetrating
// Z) and masking quirks.
if (blend_modes.w == BLEND_MODE_2B_MEMORY_ALPHA)
{
a0 = (a0 >> blend_shift.x) & U8_C(0x3c);
a1 = (a1 >> blend_shift.y) | U8_C(3);
}
i16x3 blended = i16x3(rgb0) * i16(a0) + i16x3(rgb1) * (i16(a1) + I16_C(1));
if (!final_cycle || force_blend)
{
// Forced blend: plain fixed shift instead of the divider.
rgb0 = u8x3(blended >> I16_C(5));
}
else
{
// Serious funk here. Somehow the RDP implemented a divider to deal with weighted average.
// Typically relevant when using blender shifters from interpenetrating Z mode.
// Under normal condition, this is implemented as a straight integer divider, but
// for edge cases, we need a look-up table. The results make no sense.
int blend_sum = (int(a0) >> 2) + (int(a1) >> 2) + 1;
blended >>= I16_C(2);
blended &= I16_C(0x7ff);
rgb0.r = u8(texelFetch(uBlenderDividerLUT, (blend_sum << 11) | blended.x).x);
rgb0.g = u8(texelFetch(uBlenderDividerLUT, (blend_sum << 11) | blended.y).x);
rgb0.b = u8(texelFetch(uBlenderDividerLUT, (blend_sum << 11) | blended.z).x);
}
return rgb0 & U8_C(0xff);
}
#endif

View File

@ -0,0 +1,78 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef CLAMPING_H_
#define CLAMPING_H_
// Clamps a 9-bit signed component to [0, 0xff] using RDP's quirky wrap
// semantics: values in [-256, -129] wrap around to 0xff, [-128, -1] clamp
// to 0. Two implementations — 16-bit and 32-bit — selected at compile time.
#if SMALL_TYPES && 0
// This path is buggy on RADV LLVM, disable for time being.
i16x4 clamp_9bit_notrunc(i16x4 color)
{
// [-129, -256] should clamp to 0xff, subtracting by 0x80 will underflow back to positive numbers.
// [-128, -1] should clamp to 0.
color -= I16_C(0x80);
// Sign-extend to 9-bit.
color <<= I16_C(7);
color >>= I16_C(7);
color += I16_C(0x80);
return clamp(color, i16x4(0), i16x4(0xff));
}
#else
i16x4 clamp_9bit_notrunc(ivec4 color)
{
// [-129, -256] should clamp to 0xff, subtracting by 0x80 will underflow back to positive numbers.
// [-128, -1] should clamp to 0.
color -= 0x80;
// Sign-extend to 9-bit.
color = bitfieldExtract(color, 0, 9);
color += 0x80;
return i16x4(clamp(color, ivec4(0), ivec4(0xff)));
}
#endif
// Vector clamp_9bit, truncated to 8-bit unsigned components.
u8x4 clamp_9bit(i16x4 color)
{
return u8x4(clamp_9bit_notrunc(color));
}
// Scalar variant of clamp_9bit: same 9-bit sign-extend-and-clamp trick
// applied to a single component.
int clamp_9bit(int color)
{
return clamp(bitfieldExtract(color - 0x80, 0, 9) + 0x80, 0, 0xff);
}
// Returns 18-bit UNORM depth, with wrap semantics analogous to clamp_9bit.
int clamp_z(int z)
{
// Similar to RGBA, we reserve an extra bit to deal with overflow and underflow.
z -= (1 << 17);
// Sign-extend from 19 bits via shift up/down.
z <<= (31 - 18);
z >>= (31 - 18);
z += (1 << 17);
// [0x00000, 0x3ffff] maps to self.
// [0x40000, 0x5ffff] maps to 0x3ffff.
// [0x60000, 0x7ffff] maps to 0.
return clamp(z, 0, 0x3ffff);
}
#endif

View File

@ -0,0 +1,33 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
layout(local_size_x_id = 0) in;
layout(set = 0, binding = 0, std430) writeonly buffer ClearIndirectBuffer
{
uvec4 indirects[];
};
// Resets each indirect-dispatch record to (0, 1, 1) work groups
// (x count zero, y/z one) with a trailing zero word.
void main()
{
indirects[gl_GlobalInvocationID.x] = uvec4(0, 1, 1, 0);
}

View File

@ -0,0 +1,34 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
layout(local_size_x_id = 0) in;
layout(set = 0, binding = 0, std430) writeonly buffer ToClear
{
uint elems[];
} mask_ram;
// Zeroes one word of the write-mask RAM per invocation.
void main()
{
mask_ram.elems[gl_GlobalInvocationID.x] = 0u;
}

View File

@ -0,0 +1,42 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
layout(local_size_x_id = 0) in;
layout(constant_id = 1) const int PAGE_STRIDE = 256;
layout(set = 0, binding = 0, std430) writeonly buffer SSBO
{
uint write_mask[];
};
layout(set = 1, binding = 0, std140) uniform UBO
{
uvec4 offsets[1024];
};
// Clears the super-sample write mask for one page per work group: the UBO
// holds page indices packed four-per-uvec4; each invocation zeroes one
// word within the selected page.
void main()
{
// Unpack the page index for this work group (>> 2 selects the uvec4,
// & 3 selects the component).
uint offset = offsets[gl_WorkGroupID.x >> 2u][gl_WorkGroupID.x & 3u];
offset *= PAGE_STRIDE;
write_mask[offset + gl_LocalInvocationIndex] = 0u;
}

View File

@ -0,0 +1,284 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef COMBINER_H_
#define COMBINER_H_
#include "clamping.h"
// Sign-extends the 9-bit combiner representation around the 0x80 bias
// without clamping (the "special" 9-bit sign behavior of the RDP).
ivec4 special_expand(ivec4 value)
{
// Special sign-extend without explicit clamp.
return bitfieldExtract(value - 0x80, 0, 9) + 0x80;
}
// Core combiner equation: (a - b) * c + d, with 9-bit sign handling and
// rounding, producing a 16-bit result per component.
i16x4 combiner_equation(ivec4 a, ivec4 b, ivec4 c, ivec4 d)
{
// Sign-extend multiplier to 9 bits.
c = bitfieldExtract(c, 0, 9);
// Need this to deal with very specific 9-bit sign bits ...
a = special_expand(a);
b = special_expand(b);
d = special_expand(d);
ivec4 color = (a - b) * c;
// Round before dropping the 8 fractional bits of the multiply.
color += 0x80;
return i16x4(color >> 8) + i16x4(d);
}
// All combiner input sources for one pixel: the four per-cycle constants
// (muladd/mulsub/mul/add slots), the interpolated shade color, the previous
// cycle's combined output, both texels, LOD fraction and noise.
struct CombinerInputs
{
u8x4 constant_muladd;
u8x4 constant_mulsub;
u8x4 constant_mul;
u8x4 constant_add;
u8x4 shade;
i16x4 combined;
i16x4 texel0;
i16x4 texel1;
i16 lod_frac;
i16 noise;
};
// Combiner mux selector encodings, one group per equation slot
// ((a - b) * c + d for RGB, and the alpha counterparts).
// RGB "a" (muladd) slot sources.
const int RGB_MULADD_COMBINED = 0;
const int RGB_MULADD_TEXEL0 = 1;
const int RGB_MULADD_TEXEL1 = 2;
const int RGB_MULADD_SHADE = 4;
const int RGB_MULADD_ONE = 6;
const int RGB_MULADD_NOISE = 7;
// RGB "b" (mulsub) slot sources.
const int RGB_MULSUB_COMBINED = 0;
const int RGB_MULSUB_TEXEL0 = 1;
const int RGB_MULSUB_TEXEL1 = 2;
const int RGB_MULSUB_SHADE = 4;
const int RGB_MULSUB_K4 = 7;
// RGB "c" (multiplier) slot sources.
const int RGB_MUL_COMBINED = 0;
const int RGB_MUL_TEXEL0 = 1;
const int RGB_MUL_TEXEL1 = 2;
const int RGB_MUL_SHADE = 4;
const int RGB_MUL_COMBINED_ALPHA = 7;
const int RGB_MUL_TEXEL0_ALPHA = 8;
const int RGB_MUL_TEXEL1_ALPHA = 9;
const int RGB_MUL_SHADE_ALPHA = 11;
const int RGB_MUL_LOD_FRAC = 13;
const int RGB_MUL_K5 = 15;
// RGB "d" (add) slot sources.
const int RGB_ADD_COMBINED = 0;
const int RGB_ADD_TEXEL0 = 1;
const int RGB_ADD_TEXEL1 = 2;
const int RGB_ADD_SHADE = 4;
const int RGB_ADD_ONE = 6;
// Alpha "a"/"b"/"d" slot sources (shared encoding).
const int ALPHA_ADDSUB_COMBINED = 0;
const int ALPHA_ADDSUB_TEXEL0_ALPHA = 1;
const int ALPHA_ADDSUB_TEXEL1_ALPHA = 2;
const int ALPHA_ADDSUB_SHADE_ALPHA = 4;
const int ALPHA_ADDSUB_ONE = 6;
// Alpha "c" (multiplier) slot sources.
const int ALPHA_MUL_LOD_FRAC = 0;
const int ALPHA_MUL_TEXEL0_ALPHA = 1;
const int ALPHA_MUL_TEXEL1_ALPHA = 2;
const int ALPHA_MUL_SHADE_ALPHA = 4;
// Selects the "a" operand of the combiner equation from the input muxes.
// 0x100 encodes 1.0 so no division by 255 is needed later.
ivec4 select_muladd(CombinerInputs inputs, int selector_rgb, int selector_alpha)
{
ivec3 res;
switch (selector_rgb)
{
case RGB_MULADD_COMBINED: res = inputs.combined.rgb; break;
case RGB_MULADD_TEXEL0: res = inputs.texel0.rgb; break;
case RGB_MULADD_TEXEL1: res = inputs.texel1.rgb; break;
case RGB_MULADD_SHADE: res = inputs.shade.rgb; break;
case RGB_MULADD_NOISE: res = ivec3(inputs.noise); break;
case RGB_MULADD_ONE: res = ivec3(0x100); break;
// Remaining encodings select the latched constant (env/prim color).
default: res = inputs.constant_muladd.rgb; break;
}
int alpha;
switch (selector_alpha)
{
case ALPHA_ADDSUB_COMBINED: alpha = inputs.combined.a; break;
case ALPHA_ADDSUB_TEXEL0_ALPHA: alpha = inputs.texel0.a; break;
case ALPHA_ADDSUB_TEXEL1_ALPHA: alpha = inputs.texel1.a; break;
case ALPHA_ADDSUB_SHADE_ALPHA: alpha = inputs.shade.a; break;
case ALPHA_ADDSUB_ONE: alpha = 0x100; break;
default: alpha = inputs.constant_muladd.a; break;
}
return ivec4(res, alpha);
}
// Selects the "b" (subtract) operand of the combiner equation.
// K4 is a 9-bit YUV-convert coefficient packed into the constant's g/b bytes.
ivec4 select_mulsub(CombinerInputs inputs, int selector_rgb, int selector_alpha)
{
ivec3 res;
switch (selector_rgb)
{
case RGB_MULSUB_COMBINED: res = inputs.combined.rgb; break;
case RGB_MULSUB_TEXEL0: res = inputs.texel0.rgb; break;
case RGB_MULSUB_TEXEL1: res = inputs.texel1.rgb; break;
case RGB_MULSUB_SHADE: res = inputs.shade.rgb; break;
case RGB_MULSUB_K4: res = ivec3((int(inputs.constant_mulsub.g) << 8) | inputs.constant_mulsub.b); break;
default: res = inputs.constant_mulsub.rgb; break;
}
int alpha;
switch (selector_alpha)
{
case ALPHA_ADDSUB_COMBINED: alpha = inputs.combined.a; break;
case ALPHA_ADDSUB_TEXEL0_ALPHA: alpha = inputs.texel0.a; break;
case ALPHA_ADDSUB_TEXEL1_ALPHA: alpha = inputs.texel1.a; break;
case ALPHA_ADDSUB_SHADE_ALPHA: alpha = inputs.shade.a; break;
case ALPHA_ADDSUB_ONE: alpha = 0x100; break;
default: alpha = inputs.constant_mulsub.a; break;
}
return ivec4(res, alpha);
}
// Selects the "c" (multiplier) operand of the combiner equation.
// K5, like K4, is a 9-bit coefficient packed into the constant's g/b bytes.
ivec4 select_mul(CombinerInputs inputs, int selector_rgb, int selector_alpha)
{
ivec3 res;
switch (selector_rgb)
{
case RGB_MUL_COMBINED: res = inputs.combined.rgb; break;
case RGB_MUL_COMBINED_ALPHA: res = inputs.combined.aaa; break;
case RGB_MUL_TEXEL0: res = inputs.texel0.rgb; break;
case RGB_MUL_TEXEL1: res = inputs.texel1.rgb; break;
case RGB_MUL_SHADE: res = inputs.shade.rgb; break;
case RGB_MUL_TEXEL0_ALPHA: res = inputs.texel0.aaa; break;
case RGB_MUL_TEXEL1_ALPHA: res = inputs.texel1.aaa; break;
case RGB_MUL_SHADE_ALPHA: res = inputs.shade.aaa; break;
case RGB_MUL_LOD_FRAC: res = ivec3(inputs.lod_frac); break;
case RGB_MUL_K5: res = ivec3((int(inputs.constant_mul.g) << 8) | inputs.constant_mul.b); break;
default: res = inputs.constant_mul.rgb; break;
}
int alpha;
switch (selector_alpha)
{
case ALPHA_MUL_LOD_FRAC: alpha = inputs.lod_frac; break;
case ALPHA_MUL_TEXEL0_ALPHA: alpha = inputs.texel0.a; break;
case ALPHA_MUL_TEXEL1_ALPHA: alpha = inputs.texel1.a; break;
case ALPHA_MUL_SHADE_ALPHA: alpha = inputs.shade.a; break;
default: alpha = inputs.constant_mul.a; break;
}
return ivec4(res, alpha);
}
// Selects the "d" (add) operand of the combiner equation.
ivec4 select_add(CombinerInputs inputs, int selector_rgb, int selector_alpha)
{
ivec3 res;
switch (selector_rgb)
{
case RGB_ADD_COMBINED: res = inputs.combined.rgb; break;
case RGB_ADD_TEXEL0: res = inputs.texel0.rgb; break;
case RGB_ADD_TEXEL1: res = inputs.texel1.rgb; break;
case RGB_ADD_SHADE: res = inputs.shade.rgb; break;
case RGB_ADD_ONE: res = ivec3(0x100); break;
default: res = inputs.constant_add.rgb; break;
}
int alpha;
switch (selector_alpha)
{
case ALPHA_ADDSUB_COMBINED: alpha = inputs.combined.a; break;
case ALPHA_ADDSUB_TEXEL0_ALPHA: alpha = inputs.texel0.a; break;
case ALPHA_ADDSUB_TEXEL1_ALPHA: alpha = inputs.texel1.a; break;
case ALPHA_ADDSUB_SHADE_ALPHA: alpha = inputs.shade.a; break;
case ALPHA_ADDSUB_ONE: alpha = 0x100; break;
default: alpha = inputs.constant_add.a; break;
}
return ivec4(res, alpha);
}
// First combiner cycle: evaluates the equation with the cycle-0 muxes and,
// when alpha testing is enabled, also derives the alpha-test reference
// value (with coverage modulation / dithering applied as configured).
i16x4 combiner_cycle0(CombinerInputs inputs, u8x4 combiner_inputs_rgb, u8x4 combiner_inputs_alpha, int alpha_dith,
int coverage, bool cvg_times_alpha, bool alpha_cvg_select, bool alpha_test, out u8 alpha_test_reference)
{
ivec4 muladd = select_muladd(inputs, combiner_inputs_rgb.x, combiner_inputs_alpha.x);
ivec4 mulsub = select_mulsub(inputs, combiner_inputs_rgb.y, combiner_inputs_alpha.y);
ivec4 mul = select_mul(inputs, combiner_inputs_rgb.z, combiner_inputs_alpha.z);
ivec4 add = select_add(inputs, combiner_inputs_rgb.w, combiner_inputs_alpha.w);
i16x4 combined = combiner_equation(muladd, mulsub, mul, add);
if (alpha_test)
{
int clamped_alpha = clamp_9bit(combined.a);
// Expands 0xff to 0x100 to avoid having to divide by 2**n - 1.
int expanded_alpha = clamped_alpha + ((clamped_alpha + 1) >> 8);
if (alpha_cvg_select)
{
// Reference alpha comes from coverage, optionally scaled by alpha.
int modulated_alpha;
if (cvg_times_alpha)
modulated_alpha = (expanded_alpha * coverage + 4) >> 3;
else
modulated_alpha = coverage << 5;
expanded_alpha = modulated_alpha;
}
else
expanded_alpha += alpha_dith;
alpha_test_reference = u8(clamp(expanded_alpha, 0, 0xff));
}
else
alpha_test_reference = U8_C(0);
return combined;
}
// Second combiner cycle: evaluates the equation with the cycle-1 muxes,
// clamps the result, and applies coverage/alpha modulation. `coverage` is
// inout because cvg_times_alpha feeds the modulated alpha back into it.
i16x4 combiner_cycle1(CombinerInputs inputs, u8x4 combiner_inputs_rgb, u8x4 combiner_inputs_alpha, int alpha_dith,
inout int coverage, bool cvg_times_alpha, bool alpha_cvg_select)
{
ivec4 muladd = select_muladd(inputs, combiner_inputs_rgb.x, combiner_inputs_alpha.x);
ivec4 mulsub = select_mulsub(inputs, combiner_inputs_rgb.y, combiner_inputs_alpha.y);
ivec4 mul = select_mul(inputs, combiner_inputs_rgb.z, combiner_inputs_alpha.z);
ivec4 add = select_add(inputs, combiner_inputs_rgb.w, combiner_inputs_alpha.w);
i16x4 combined = combiner_equation(muladd, mulsub, mul, add);
combined = clamp_9bit_notrunc(combined);
// Expands 0xff to 0x100 to avoid having to divide by 2**n - 1.
int expanded_alpha = combined.a + ((combined.a + 1) >> 8);
int modulated_alpha;
if (cvg_times_alpha)
{
modulated_alpha = (expanded_alpha * coverage + 4) >> 3;
// Feed modulated alpha back into pixel coverage.
coverage = modulated_alpha >> 5;
}
else
modulated_alpha = coverage << 5;
if (alpha_cvg_select)
expanded_alpha = modulated_alpha;
else
expanded_alpha += alpha_dith;
combined.a = i16(clamp(expanded_alpha, 0, 0xff));
return combined;
}
#endif

View File

@ -0,0 +1,81 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef COVERAGE_H_
#define COVERAGE_H_
#include "data_structures.h"
const int SUBPIXELS_LOG2 = 2;
const int SUBPIXELS = 1 << SUBPIXELS_LOG2;
// Computes the 8-bit coverage mask for pixel column x: one bit per
// subsample, set when the subsample lies inside [xleft, xright) for its
// scanline. xleft/xright hold the four scanlines' edge positions in
// subpixel units.
u8 compute_coverage(u16x4 xleft, u16x4 xright, int x)
{
// Subsample X offsets within the pixel (the 0/4/2/6 ordering matches the
// RDP's coverage bit layout); x << 3 converts the pixel to subpixels.
u16x4 xshift = u16x4(0, 4, 2, 6) + (u16(x) << U16_C(3));
// A subsample is clipped if it is left of xleft or at/right of xright.
bvec4 clip_lo_x01 = lessThan(xshift, xleft.xxyy);
bvec4 clip_lo_x23 = lessThan(xshift, xleft.zzww);
bvec4 clip_hi_x01 = greaterThanEqual(xshift, xright.xxyy);
bvec4 clip_hi_x23 = greaterThanEqual(xshift, xright.zzww);
u8x4 clip_x0 = u8x4(clip_lo_x01) | u8x4(clip_hi_x01);
u8x4 clip_x1 = u8x4(clip_lo_x23) | u8x4(clip_hi_x23);
// Pack the eight clip flags into one byte, then invert: set bits = covered.
u8x4 clip_x = clip_x0 * u8x4(1, 2, 4, 8) + clip_x1 * u8x4(16, 32, 64, 128);
u8 clip_coverage = (clip_x.x | clip_x.y) | (clip_x.z | clip_x.w);
return ~clip_coverage & U8_C(0xff);
}
// Coverage destination (CVG_DST) modes.
const int COVERAGE_CLAMP = 0;
const int COVERAGE_WRAP = 1;
const int COVERAGE_ZAP = 2;
const int COVERAGE_SAVE = 3;
// Combines the incoming pixel coverage with the coverage already stored in
// memory, according to the CVG_DST mode. Coverage is a 3-bit value (0-7).
int blend_coverage(int coverage, int memory_coverage, bool blend_en, int mode)
{
int res = 0;
switch (mode)
{
case COVERAGE_CLAMP:
{
if (blend_en)
res = min(7, memory_coverage + coverage); // image_read_en to read memory coverage, otherwise, it's 7.
else
res = (coverage - 1) & 7;
break;
}
case COVERAGE_WRAP:
// Sum wraps modulo 8 instead of saturating.
res = (coverage + memory_coverage) & 7;
break;
case COVERAGE_ZAP:
// Force full coverage.
res = 7;
break;
case COVERAGE_SAVE:
// Preserve whatever was in memory.
res = memory_coverage;
break;
}
return res;
}
#endif

View File

@ -0,0 +1,345 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef DATA_STRUCTURES_H_
#define DATA_STRUCTURES_H_
// Data structures which are supposed to match up with rdp_data_structures.hpp.
// A little dirty to duplicate like this, but it's non-trivial to share headers with C++,
// especially when we need to deal with small integer types.
// Flag bits for TriangleSetup::flags.
const int TRIANGLE_SETUP_FLIP_BIT = 1 << 0;
const int TRIANGLE_SETUP_DO_OFFSET_BIT = 1 << 1;
const int TRIANGLE_SETUP_SKIP_XFRAC_BIT = 1 << 2;
const int TRIANGLE_SETUP_INTERLACE_FIELD_BIT = 1 << 3;
const int TRIANGLE_SETUP_INTERLACE_KEEP_ODD_BIT = 1 << 4;
const int TRIANGLE_SETUP_DISABLE_UPSCALING_BIT = 1 << 5;
const int TRIANGLE_SETUP_NATIVE_LOD_BIT = 1 << 6;
// Flag bits for StaticRasterizationState::flags.
const int RASTERIZATION_INTERLACE_FIELD_BIT = 1 << 0;
const int RASTERIZATION_INTERLACE_KEEP_ODD_BIT = 1 << 1;
const int RASTERIZATION_AA_BIT = 1 << 2;
const int RASTERIZATION_PERSPECTIVE_CORRECT_BIT = 1 << 3;
const int RASTERIZATION_TLUT_BIT = 1 << 4;
const int RASTERIZATION_TLUT_TYPE_BIT = 1 << 5;
const int RASTERIZATION_CVG_TIMES_ALPHA_BIT = 1 << 6;
const int RASTERIZATION_ALPHA_CVG_SELECT_BIT = 1 << 7;
const int RASTERIZATION_MULTI_CYCLE_BIT = 1 << 8;
const int RASTERIZATION_TEX_LOD_ENABLE_BIT = 1 << 9;
const int RASTERIZATION_SHARPEN_LOD_ENABLE_BIT = 1 << 10;
const int RASTERIZATION_DETAIL_LOD_ENABLE_BIT = 1 << 11;
const int RASTERIZATION_FILL_BIT = 1 << 12;
const int RASTERIZATION_COPY_BIT = 1 << 13;
const int RASTERIZATION_SAMPLE_MODE_BIT = 1 << 14;
const int RASTERIZATION_ALPHA_TEST_BIT = 1 << 15;
const int RASTERIZATION_ALPHA_TEST_DITHER_BIT = 1 << 16;
const int RASTERIZATION_SAMPLE_MID_TEXEL_BIT = 1 << 17;
const int RASTERIZATION_USES_TEXEL0_BIT = 1 << 18;
const int RASTERIZATION_USES_TEXEL1_BIT = 1 << 19;
const int RASTERIZATION_USES_LOD_BIT = 1 << 20;
const int RASTERIZATION_USES_PIPELINED_TEXEL1_BIT = 1 << 21;
const int RASTERIZATION_CONVERT_ONE_BIT = 1 << 22;
const int RASTERIZATION_BILERP_0_BIT = 1 << 23;
const int RASTERIZATION_BILERP_1_BIT = 1 << 24;
// NOTE(review): this is a multi-bit field offset (bits 26..27), not a single
// flag bit; bit 25 appears unused — confirm against the CPU-side encoder.
const int RASTERIZATION_UPSCALING_LOG2_BIT_OFFSET = 26;
const int RASTERIZATION_NEED_NOISE_BIT = 1 << 28;
const int RASTERIZATION_USE_STATIC_TEXTURE_SIZE_FORMAT_BIT = 1 << 29;
const int RASTERIZATION_USE_SPECIALIZATION_CONSTANT_BIT = 1 << 30;
// Flag bits for DepthBlendState::flags. (Bit 2 is unassigned here.)
const int DEPTH_BLEND_DEPTH_TEST_BIT = 1 << 0;
const int DEPTH_BLEND_DEPTH_UPDATE_BIT = 1 << 1;
const int DEPTH_BLEND_FORCE_BLEND_BIT = 1 << 3;
const int DEPTH_BLEND_IMAGE_READ_ENABLE_BIT = 1 << 4;
const int DEPTH_BLEND_COLOR_ON_COVERAGE_BIT = 1 << 5;
const int DEPTH_BLEND_MULTI_CYCLE_BIT = 1 << 6;
const int DEPTH_BLEND_AA_BIT = 1 << 7;
const int DEPTH_BLEND_DITHER_ENABLE_BIT = 1 << 8;
// Edge-walker setup for one triangle (xh/xm/xl edge X positions and their
// per-scanline slopes dx*dy, plus y extents), mirroring the RDP triangle
// command. The *Mem variant uses packed small types for SSBO storage; when
// SMALL_TYPES is unavailable, the unpacked variant below is used instead.
struct TriangleSetupMem
{
int xh, xm, xl;
mem_i16 yh, ym;
int dxhdy, dxmdy, dxldy;
mem_i16 yl; mem_u8 flags; mem_u8 tile;
};
#if SMALL_TYPES
#define TriangleSetup TriangleSetupMem
#else
struct TriangleSetup
{
int xh, xm, xl;
i16 yh, ym;
int dxhdy, dxmdy, dxldy;
i16 yl; u8 flags; u8 tile;
};
#endif
// Per-triangle attribute setup: shade color (rgba) and texture/depth (stzw)
// start values plus their derivatives along X (dx), the edge (de) and Y (dy).
// All members are full ivec4, so no separate small-type variant is needed.
struct AttributeSetupMem
{
ivec4 rgba;
ivec4 drgba_dx;
ivec4 drgba_de;
ivec4 drgba_dy;
ivec4 stzw;
ivec4 dstzw_dx;
ivec4 dstzw_de;
ivec4 dstzw_dy;
};
#define AttributeSetup AttributeSetupMem
// Pre-interpolated state for one scanline span: attribute values at the span
// start, sub-scanline left/right edges (consumed by compute_coverage), and the
// X range to rasterize.
struct SpanSetupMem
{
ivec4 rgba;
ivec4 stzw;
mem_u16x4 xleft;
mem_u16x4 xright;
int interpolation_base_x;
int start_x;
int end_x;
mem_i16 lodlength;
mem_u16 valid_line;
};
#if SMALL_TYPES
#define SpanSetup SpanSetupMem
#else
struct SpanSetup
{
ivec4 rgba;
ivec4 stzw;
u16x4 xleft;
u16x4 xright;
int interpolation_base_x;
int start_x;
int end_x;
i16 lodlength;
u16 valid_line;
};
#endif
// Per-primitive window into the span setup array: base offset plus the
// [ylo, yhi] scanline range it covers. Padded to 16 bytes for std430.
struct SpanInfoOffsetsMem
{
int offset;
int ylo;
int yhi;
int padding;
};
#define SpanInfoOffsets SpanInfoOffsetsMem
// Constants derived once per primitive on the CPU side: pre-resolved combiner
// inputs for both cycles, fog/blend/fill colors, depth-delta terms and LOD
// factors.
struct DerivedSetupMem
{
mem_u8x4 constant_muladd0;
mem_u8x4 constant_mulsub0;
mem_u8x4 constant_mul0;
mem_u8x4 constant_add0;
mem_u8x4 constant_muladd1;
mem_u8x4 constant_mulsub1;
mem_u8x4 constant_mul1;
mem_u8x4 constant_add1;
mem_u8x4 fog_color;
mem_u8x4 blend_color;
uint fill_color;
mem_u16 dz;
mem_u8 dz_compressed;
mem_u8 min_lod;
mem_i16x4 factors;
};
#if SMALL_TYPES
#define DerivedSetup DerivedSetupMem
#else
struct DerivedSetup
{
u8x4 constant_muladd0;
u8x4 constant_mulsub0;
u8x4 constant_mul0;
u8x4 constant_add0;
u8x4 constant_muladd1;
u8x4 constant_mulsub1;
u8x4 constant_mul1;
u8x4 constant_add1;
u8x4 fog_color;
u8x4 blend_color;
uint fill_color;
u16 dz;
u8 dz_compressed;
u8 min_lod;
i16x4 factors;
};
#endif
// In memory the scissor box is packed as a plain ivec4 (xlo, ylo, xhi, yhi).
#define ScissorStateMem ivec4
struct ScissorState
{
int xlo, ylo, xhi, yhi;
};
// Flag bits for TileInfo::flags (per-axis clamp/mirror addressing).
const int TILE_INFO_CLAMP_S_BIT = 1 << 0;
const int TILE_INFO_MIRROR_S_BIT = 1 << 1;
const int TILE_INFO_CLAMP_T_BIT = 1 << 2;
const int TILE_INFO_MIRROR_T_BIT = 1 << 3;
// One RDP tile descriptor: S/T bounds, TMEM offset/stride, texel format/size,
// palette and per-axis mask/shift addressing parameters.
struct TileInfoMem
{
uint slo;
uint shi;
uint tlo;
uint thi;
uint offset;
uint stride;
mem_u8 fmt;
mem_u8 size;
mem_u8 palette;
mem_u8 mask_s;
mem_u8 shift_s;
mem_u8 mask_t;
mem_u8 shift_t;
mem_u8 flags;
};
#if SMALL_TYPES
#define TileInfo TileInfoMem
#else
struct TileInfo
{
uint slo;
uint shi;
uint tlo;
uint thi;
uint offset;
uint stride;
u8 fmt;
u8 size;
u8 palette;
u8 mask_s;
u8 shift_s;
u8 mask_t;
u8 shift_t;
u8 flags;
};
#endif
// Rasterizer state that is constant for a primitive: combiner input selectors
// for both cycles, RASTERIZATION_* flags, dither mode and static texture
// size/format (used when the corresponding flag bits are set).
struct StaticRasterizationStateMem
{
mem_u8x4 combiner_inputs_rgb0;
mem_u8x4 combiner_inputs_alpha0;
mem_u8x4 combiner_inputs_rgb1;
mem_u8x4 combiner_inputs_alpha1;
uint flags;
int dither;
int texture_size;
int texture_fmt;
};
#if SMALL_TYPES
#define StaticRasterizationState StaticRasterizationStateMem
#else
struct StaticRasterizationState
{
u8x4 combiner_inputs_rgb0;
u8x4 combiner_inputs_alpha0;
u8x4 combiner_inputs_rgb1;
u8x4 combiner_inputs_alpha1;
uint flags;
int dither;
int texture_size;
int texture_fmt;
};
#endif
// Depth/blend stage state: blender mux selectors for both cycles,
// DEPTH_BLEND_* flags, coverage destination mode and Z compare mode.
struct DepthBlendStateMem
{
mem_u8x4 blend_modes0;
mem_u8x4 blend_modes1;
uint flags;
mem_u8 coverage_mode;
mem_u8 z_mode;
mem_u8 padding0;
mem_u8 padding1;
};
#if SMALL_TYPES
#define DepthBlendState DepthBlendStateMem
#else
struct DepthBlendState
{
u8x4 blend_modes0;
u8x4 blend_modes1;
uint flags;
u8 coverage_mode;
u8 z_mode;
u8 padding0;
u8 padding1;
};
#endif
// Per-primitive indirection: indices into the various state/TMEM instance
// arrays, plus one tile-info index per RDP tile slot.
struct InstanceIndicesMem
{
mem_u8x4 static_depth_tmem;
mem_u8x4 other;
mem_u8 tile_infos[8];
};
// One TMEM snapshot viewed as 16-bit elements (4 KiB total).
struct TMEMInstance16Mem
{
mem_u16 elems[2048];
};
// One TMEM snapshot viewed as 8-bit elements (4 KiB total).
struct TMEMInstance8Mem
{
mem_u8 elems[4096];
};
// Output of the shading pass for one pixel, consumed by depth/blend.
struct ShadedData
{
u8x4 combined;
int z_dith;
u8 coverage_count;
u8 shade_alpha;
};
// Special markers folded into the per-pixel coverage byte; they route the
// pixel through the fill or copy pipeline instead of the full blend path.
const int COVERAGE_FILL_BIT = 0x40;
const int COVERAGE_COPY_BIT = 0x20;
// Global framebuffer parameters shared by all passes in a flush.
struct GlobalFBInfo
{
int dx_shift;
int dx_mask;
int fb_size;
uint base_primitive_index;
};
#endif

View File

@ -0,0 +1,134 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef DATA_STRUCTURES_BUFFERS_H_
#define DATA_STRUCTURES_BUFFERS_H_
#include "data_structures.h"
// RDRAM exposed as three aliased views over the same binding (set 0, binding 0),
// so shaders can pick the element width matching the framebuffer format.
layout(set = 0, binding = 0, std430) buffer VRAM32
{
uint data[];
} vram32;
layout(set = 0, binding = 0, std430) buffer VRAM16
{
mem_u16 data[];
} vram16;
layout(set = 0, binding = 0, std430) buffer VRAM8
{
mem_u8 data[];
} vram8;
// The "hidden" extra RDRAM bits (per 16-bit word) used for coverage/dz storage.
layout(set = 0, binding = 1, std430) buffer HiddenVRAM
{
mem_u8 data[];
} hidden_vram;
// TMEM snapshots, aliased as 16-bit and 8-bit views (set 0, binding 2).
layout(set = 0, binding = 2, std430) readonly buffer TMEM16
{
TMEMInstance16Mem instances[];
} tmem16;
layout(set = 0, binding = 2, std430) readonly buffer TMEM8
{
TMEMInstance8Mem instances[];
} tmem8;
// Set 1: per-flush state arrays, indexed by primitive. Each array is paired
// with a load_*.h include that unpacks the *Mem storage struct for shader use.
layout(set = 1, binding = 0, std430) readonly buffer TriangleSetupBuffer
{
TriangleSetupMem elems[];
} triangle_setup;
#include "load_triangle_setup.h"
layout(set = 1, binding = 1, std430) readonly buffer AttributeSetupBuffer
{
AttributeSetupMem elems[];
} attribute_setup;
#include "load_attribute_setup.h"
layout(set = 1, binding = 2, std430) readonly buffer DerivedSetupBuffer
{
DerivedSetupMem elems[];
} derived_setup;
#include "load_derived_setup.h"
layout(set = 1, binding = 3, std430) readonly buffer ScissorStateBuffer
{
ScissorStateMem elems[];
} scissor_state;
#include "load_scissor_state.h"
layout(set = 1, binding = 4, std430) readonly buffer StaticRasterStateBuffer
{
StaticRasterizationStateMem elems[];
} static_raster_state;
#include "load_static_raster_state.h"
layout(set = 1, binding = 5, std430) readonly buffer DepthBlendStateBuffer
{
DepthBlendStateMem elems[];
} depth_blend_state;
#include "load_depth_blend_state.h"
layout(set = 1, binding = 6, std430) readonly buffer StateIndicesBuffer
{
InstanceIndicesMem elems[];
} state_indices;
layout(set = 1, binding = 7, std430) readonly buffer TileInfoBuffer
{
TileInfoMem elems[];
} tile_infos;
#include "load_tile_info.h"
layout(set = 1, binding = 8, std430) readonly buffer SpanSetups
{
SpanSetupMem elems[];
} span_setups;
#include "load_span_setup.h"
layout(set = 1, binding = 9, std430) readonly buffer SpanInfoOffsetBuffer
{
SpanInfoOffsetsMem elems[];
} span_offsets;
#include "load_span_offsets.h"
// Lookup table for the blender's coverage divide.
layout(set = 1, binding = 10) uniform utextureBuffer uBlenderDividerLUT;
// Per-tile primitive bitmasks: fine (one bit per primitive) and coarse
// (one bit per 32-primitive group).
layout(set = 1, binding = 11, std430) readonly buffer TileBinning
{
uint elems[];
} tile_binning;
layout(set = 1, binding = 12, std430) readonly buffer TileBinningCoarse
{
uint elems[];
} tile_binning_coarse;
layout(set = 2, binding = 0, std140) uniform GlobalConstants
{
GlobalFBInfo fb_info;
} global_constants;
#endif

View File

@ -0,0 +1,151 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef DEBUG_H_
#define DEBUG_H_
#if defined(DEBUG_ENABLE) && DEBUG_ENABLE
#include "debug_channel.h"
// Message codes; the host side decodes these from the debug message channel.
const uint CODE_ASSERT_EQUAL = 0;
const uint CODE_ASSERT_NOT_EQUAL = 1;
const uint CODE_ASSERT_LESS_THAN = 2;
const uint CODE_ASSERT_LESS_THAN_EQUAL = 3;
const uint CODE_GENERIC = 4;
const uint CODE_HEX = 5;
// Assert helpers: report (line, a, b) through add_debug_message() when the
// asserted condition is violated. int and uint overloads are provided.
void ASSERT_EQUAL_(int line, int a, int b)
{
if (a != b)
add_debug_message(CODE_ASSERT_EQUAL, gl_GlobalInvocationID, ivec3(line, a, b));
}
void ASSERT_NOT_EQUAL_(int line, int a, int b)
{
if (a == b)
add_debug_message(CODE_ASSERT_NOT_EQUAL, gl_GlobalInvocationID, ivec3(line, a, b));
}
void ASSERT_LESS_THAN_(int line, int a, int b)
{
if (a >= b)
add_debug_message(CODE_ASSERT_LESS_THAN, gl_GlobalInvocationID, ivec3(line, a, b));
}
void ASSERT_LESS_THAN_EQUAL_(int line, int a, int b)
{
if (a > b)
add_debug_message(CODE_ASSERT_LESS_THAN_EQUAL, gl_GlobalInvocationID, ivec3(line, a, b));
}
// uint overloads; payload values are converted to ivec3 for the message.
void ASSERT_EQUAL_(int line, uint a, uint b)
{
if (a != b)
add_debug_message(CODE_ASSERT_EQUAL, gl_GlobalInvocationID, ivec3(line, a, b));
}
void ASSERT_NOT_EQUAL_(int line, uint a, uint b)
{
if (a == b)
add_debug_message(CODE_ASSERT_NOT_EQUAL, gl_GlobalInvocationID, ivec3(line, a, b));
}
void ASSERT_LESS_THAN_(int line, uint a, uint b)
{
if (a >= b)
add_debug_message(CODE_ASSERT_LESS_THAN, gl_GlobalInvocationID, ivec3(line, a, b));
}
void ASSERT_LESS_THAN_EQUAL_(int line, uint a, uint b)
{
if (a > b)
add_debug_message(CODE_ASSERT_LESS_THAN_EQUAL, gl_GlobalInvocationID, ivec3(line, a, b));
}
// Free-form log messages: source line plus 0-3 payload words.
void GENERIC_MESSAGE_(int line)
{
add_debug_message(CODE_GENERIC, gl_GlobalInvocationID, line);
}
void GENERIC_MESSAGE_(int line, uint v)
{
add_debug_message(CODE_GENERIC, gl_GlobalInvocationID, uvec2(line, v));
}
void GENERIC_MESSAGE_(int line, uvec2 v)
{
add_debug_message(CODE_GENERIC, gl_GlobalInvocationID, uvec3(line, v));
}
void GENERIC_MESSAGE_(int line, uvec3 v)
{
add_debug_message(CODE_GENERIC, gl_GlobalInvocationID, uvec4(line, v));
}
// Same as GENERIC_MESSAGE_, but tagged so the host prints payloads in hex.
void HEX_MESSAGE_(int line)
{
add_debug_message(CODE_HEX, gl_GlobalInvocationID, line);
}
void HEX_MESSAGE_(int line, uint v)
{
add_debug_message(CODE_HEX, gl_GlobalInvocationID, uvec2(line, v));
}
void HEX_MESSAGE_(int line, uvec2 v)
{
add_debug_message(CODE_HEX, gl_GlobalInvocationID, uvec3(line, v));
}
void HEX_MESSAGE_(int line, uvec3 v)
{
add_debug_message(CODE_HEX, gl_GlobalInvocationID, uvec4(line, v));
}
// User-facing macros capture __LINE__ automatically.
// NOTE(review): macro names are spelled "ASERT" (single S); kept as-is because
// call sites elsewhere in the project presumably use this spelling — verify
// before renaming.
#define ASERT_EQUAL(a, b) ASSERT_EQUAL_(__LINE__, a, b)
#define ASERT_NOT_EQUAL(a, b) ASSERT_NOT_EQUAL_(__LINE__, a, b)
#define ASERT_LESS_THAN(a, b) ASSERT_LESS_THAN_(__LINE__, a, b)
#define ASERT_LESS_THAN_EQUAL(a, b) ASSERT_LESS_THAN_EQUAL_(__LINE__, a, b)
#define GENERIC_MESSAGE0() GENERIC_MESSAGE_(__LINE__)
#define GENERIC_MESSAGE1(a) GENERIC_MESSAGE_(__LINE__, a)
#define GENERIC_MESSAGE2(a, b) GENERIC_MESSAGE_(__LINE__, uvec2(a, b))
#define GENERIC_MESSAGE3(a, b, c) GENERIC_MESSAGE_(__LINE__, uvec3(a, b, c))
#define HEX_MESSAGE0() HEX_MESSAGE_(__LINE__)
#define HEX_MESSAGE1(a) HEX_MESSAGE_(__LINE__, a)
#define HEX_MESSAGE2(a, b) HEX_MESSAGE_(__LINE__, uvec2(a, b))
#define HEX_MESSAGE3(a, b, c) HEX_MESSAGE_(__LINE__, uvec3(a, b, c))
#else
// Debug disabled: all hooks compile to nothing.
#define ASERT_EQUAL(a, b)
#define ASERT_NOT_EQUAL(a, b)
#define ASERT_LESS_THAN(a, b)
#define ASERT_LESS_THAN_EQUAL(a, b)
#define GENERIC_MESSAGE0()
#define GENERIC_MESSAGE1(a)
#define GENERIC_MESSAGE2(a, b)
#define GENERIC_MESSAGE3(a, b, c)
#define HEX_MESSAGE0()
#define HEX_MESSAGE1(a)
#define HEX_MESSAGE2(a, b)
#define HEX_MESSAGE3(a, b, c)
#endif
#endif

View File

@ -0,0 +1,149 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#if SUBGROUP
#extension GL_KHR_shader_subgroup_basic : require
#extension GL_KHR_shader_subgroup_vote : require
#extension GL_KHR_shader_subgroup_ballot : require
#extension GL_KHR_shader_subgroup_arithmetic : require
#endif
#include "small_types.h"
layout(local_size_x_id = 3, local_size_y_id = 4) in;
#include "noise.h"
#include "debug.h"
#include "data_structures_buffers.h"
#include "memory_interfacing.h"
// Per-pixel intermediate results produced by the rasterizer/shading pass.
// ColorBuffer and ColorRawBuffer alias the same binding: the former as packed
// RGBA bytes, the latter as raw 32-bit words (used by the copy pipeline).
layout(set = 0, binding = 3, std430) readonly buffer ColorBuffer
{
mem_u8x4 elems[];
} color;
layout(set = 0, binding = 3, std430) readonly buffer ColorRawBuffer
{
uint elems[];
} raw_color;
layout(set = 0, binding = 4, std430) readonly buffer DepthBuffer
{
int elems[];
} depth;
layout(set = 0, binding = 5, std430) readonly buffer ShadeAlpha
{
mem_u8 elems[];
} shade_alpha;
// Signed per-pixel coverage; negative means the pixel was not touched by the
// primitive instance.
layout(set = 0, binding = 6, std430) readonly buffer Coverage
{
mem_i8 elems[];
} coverage;
// Base offset into the per-pixel arrays for each (tile, primitive-group) pair.
layout(std430, set = 0, binding = 7) readonly buffer TileInstanceOffset
{
uint elems[];
} tile_instance_offsets;
layout(push_constant, std430) uniform Registers
{
uint fb_addr_index;
uint fb_depth_addr_index;
uint fb_width;
uint fb_height;
uint group_mask;
} registers;
layout(constant_id = 5) const int MAX_PRIMITIVES = 256;
layout(constant_id = 6) const int MAX_WIDTH = 1024;
// One binning word covers 32 primitives.
const int TILE_BINNING_STRIDE = MAX_PRIMITIVES / 32;
const int MAX_TILES_X = MAX_WIDTH / int(gl_WorkGroupSize.x);
// Overall architecture of the tiling is from RetroWarp.
// Depth/blend resolve: each workgroup is one tile; each invocation walks the
// primitives binned to that tile in order and blends them into the tile-local
// framebuffer working set (init_tile/finish_tile handle RDRAM in/out).
void main()
{
int x = int(gl_GlobalInvocationID.x);
int y = int(gl_GlobalInvocationID.y);
ivec2 tile = ivec2(gl_WorkGroupID.xy);
int linear_tile = tile.x + tile.y * MAX_TILES_X;
int linear_tile_base = linear_tile * TILE_BINNING_STRIDE;
// group_mask limits which 32-primitive groups this dispatch should process.
uint coarse_binned = tile_binning_coarse.elems[linear_tile] & registers.group_mask;
if (coarse_binned == 0u)
return;
init_tile(gl_GlobalInvocationID.xy,
registers.fb_width, registers.fb_height,
registers.fb_addr_index, registers.fb_depth_addr_index);
// Outer loop: 32-primitive groups with at least one binned primitive.
while (coarse_binned != 0u)
{
int mask_index = findLSB(coarse_binned);
coarse_binned &= ~uint(1 << mask_index);
uint tile_instance = tile_instance_offsets.elems[linear_tile_base + mask_index];
uint binned = tile_binning.elems[linear_tile_base + mask_index];
// Inner loop: individual primitives within the group, in submission order.
while (binned != 0u)
{
int i = findLSB(binned);
binned &= ~uint(1 << i);
uint primitive_index = uint(i + 32 * mask_index);
uint index = tile_instance * (gl_WorkGroupSize.x * gl_WorkGroupSize.y) + gl_LocalInvocationIndex;
// NOTE: local shadows the Coverage SSBO from here on; the initializer
// still reads the buffer. Negative coverage = pixel not covered.
int coverage = int(coverage.elems[index]);
if (coverage >= 0)
{
if ((coverage & COVERAGE_FILL_BIT) != 0)
{
// Fill pipeline: write the primitive's fill color directly.
fill_color(derived_setup.elems[primitive_index].fill_color);
}
else if ((coverage & COVERAGE_COPY_BIT) != 0)
{
// Copy pipeline: raw word from the color buffer, no blending.
uint word = raw_color.elems[index];
copy_pipeline(word, primitive_index);
}
else
{
// Full path: repack the shaded pixel and run depth test + blender.
ShadedData shaded;
shaded.combined = u8x4(color.elems[index]);
shaded.z_dith = depth.elems[index];
shaded.shade_alpha = u8(shade_alpha.elems[index]);
shaded.coverage_count = u8(coverage);
depth_blend(x, y, primitive_index, shaded);
}
}
// Per-pixel arrays are laid out one instance per binned primitive.
tile_instance++;
}
}
finish_tile(gl_GlobalInvocationID.xy,
registers.fb_width, registers.fb_height,
registers.fb_addr_index, registers.fb_depth_addr_index);
}

View File

@ -0,0 +1,146 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef DEPTH_TEST_H_
#define DEPTH_TEST_H_
#include "z_encode.h"
const int Z_MODE_OPAQUE = 0;
const int Z_MODE_INTERPENETRATING = 1;
const int Z_MODE_TRANSPARENT = 2;
const int Z_MODE_DECAL = 3;
// Rounds dz down to the largest power-of-two that is <= dz; 0 stays 0.
int combine_dz(int dz)
{
	return dz == 0 ? 0 : (1 << findMSB(dz));
}
// RDP depth test for one pixel. Returns true if the pixel passes.
// Inputs: incoming z/dz (and its compressed form), the depth-buffer contents
// (current_depth/current_dz), incoming and stored coverage counts, and the
// depth/blend mode flags. Outputs: blend_en (accumulate into existing
// coverage), coverage_wrap (coverage overflowed), and blend_shift (relative
// dz shifts fed to the blender). coverage_count may be modified in
// INTERPENETRATING mode. All arithmetic is bit-exact RDP behavior — do not
// simplify.
bool depth_test(int z, int dz, int dz_compressed,
u16 current_depth, u8 current_dz,
inout int coverage_count, int current_coverage_count,
bool z_compare, int z_mode,
bool force_blend, bool aa_enable,
out bool blend_en, out bool coverage_wrap, out u8x2 blend_shift)
{
bool depth_pass;
if (z_compare)
{
int memory_z = z_decompress(current_depth);
int memory_dz = dz_decompress(current_dz);
// Exponent field of the compressed depth value.
int precision_factor = (int(current_depth) >> 11) & 0xf;
bool coplanar = false;
// Relative dz shifts: how much smaller one dz is than the other (max 4).
blend_shift.x = u8(clamp(dz_compressed - current_dz, 0, 4));
blend_shift.y = u8(clamp(current_dz - dz_compressed, 0, 4));
if (precision_factor < 3)
{
// Low-precision depth: widen memory dz; the 0x8000 sentinel marks a
// coplanar surface and forces the maximum tolerance.
if (memory_dz != 0x8000)
memory_dz = max(memory_dz << 1, 16 >> precision_factor);
else
{
coplanar = true;
memory_dz = 0xffff;
}
}
int combined_dz = combine_dz(dz | memory_dz);
int combined_dz_interpenetrate = combined_dz;
combined_dz <<= 3;
// farther/nearer: z within combined_dz tolerance behind/in front of memory.
bool farther = coplanar || ((z + combined_dz) >= memory_z);
bool overflow = (coverage_count + current_coverage_count) >= 8;
blend_en = force_blend || (!overflow && aa_enable && farther);
coverage_wrap = overflow;
depth_pass = false;
bool max_z = memory_z == 0x3ffff;
bool front = z < memory_z;
int z_closest_possible = z - combined_dz;
bool nearer = coplanar || (z_closest_possible <= memory_z);
switch (z_mode)
{
case Z_MODE_OPAQUE:
{
// The OPAQUE mode is normal less-than.
// However, if z is sufficiently close enough to memory Z, we assume that we have the same surface
// and we should simply increment coverage (blend_en).
// If we overflow coverage, it is clear that we have a different surface, and here we should only
// consider pure in-front test and overwrite coverage.
depth_pass = max_z || (overflow ? front : nearer);
break;
}
case Z_MODE_INTERPENETRATING:
{
// This one is ... interesting as it affects coverage.
if (!front || !farther || !overflow)
{
// If there is no decal-like intersect, treat this as normal opaque mode.
depth_pass = max_z || (overflow ? front : nearer);
}
else
{
// Modify coverage based on how far away current surface we are somehow?
combined_dz_interpenetrate = dz_compress(combined_dz_interpenetrate & 0xffff);
int cvg_coeff = ((memory_z >> combined_dz_interpenetrate) - (z >> combined_dz_interpenetrate)) & 0xf;
coverage_count = min((cvg_coeff * coverage_count) >> 3, 8);
depth_pass = true;
}
break;
}
case Z_MODE_TRANSPARENT:
{
depth_pass = front || max_z;
break;
}
case Z_MODE_DECAL:
{
// Decals pass if |z - memory_z| <= max(dz, memory_dz).
depth_pass = farther && nearer && !max_z;
break;
}
}
}
else
{
// Depth compare disabled: always pass; only coverage overflow matters.
blend_shift.x = u8(0);
blend_shift.y = u8(min(0xf - dz_compressed, 4));
bool overflow = (coverage_count + current_coverage_count) >= 8;
blend_en = force_blend || (!overflow && aa_enable);
coverage_wrap = overflow;
depth_pass = true;
}
return depth_pass;
}
#endif

View File

@ -0,0 +1,70 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef DITHER_H_
#define DITHER_H_
// 4x4 dither threshold matrices: [0] = magic square, [1] = Bayer ordering.
const u8 dither_matrices[2][16] = u8[][](
u8[](U8_C(0), U8_C(6), U8_C(1), U8_C(7), U8_C(4), U8_C(2), U8_C(5), U8_C(3), U8_C(3), U8_C(5), U8_C(2), U8_C(4), U8_C(7), U8_C(1), U8_C(6), U8_C(0)),
u8[](U8_C(0), U8_C(4), U8_C(1), U8_C(5), U8_C(4), U8_C(0), U8_C(5), U8_C(1), U8_C(3), U8_C(7), U8_C(2), U8_C(6), U8_C(7), U8_C(3), U8_C(6), U8_C(2)));
// Applies RGB dithering: rounds each channel up to the next multiple of 8 when
// its low 3 bits exceed the per-channel threshold packed in dith (3 bits per
// channel at bit offsets 0/3/6), saturating at 255.
u8x3 rgb_dither(ivec3 orig_rgb, int dith)
{
ivec3 rgb_dith = (ivec3(dith) >> ivec3(0, 3, 6)) & 7;
// Candidate rounded-up value; channels above 247 clamp straight to 255.
ivec3 rgb = mix((orig_rgb & 0xf8) + 8, ivec3(255), greaterThan(orig_rgb, ivec3(247)));
// Sign mask is all-ones when the channel's low bits exceed the threshold.
ivec3 replace_sign = (rgb_dith - (orig_rgb & 7)) >> 31;
ivec3 dither_diff = rgb - orig_rgb;
rgb = orig_rgb + (dither_diff & replace_sign);
return u8x3(rgb & 0xff);
}
// Resolves the per-pixel dither values for RGB and alpha at pixel (x, y).
// Modes: 0/1 = matrix, 2 = noise, 3 = disabled. rgb_dither gets the matrix
// threshold replicated into all three 3-bit channel fields.
void dither_coefficients(int x, int y, int dither_mode_rgb, int dither_mode_alpha, out int rgb_dither, out int alpha_dither)
{
const int DITHER_SPLAT = (1 << 0) | (1 << 3) | (1 << 6);
if (dither_mode_rgb < 2)
rgb_dither = int(dither_matrices[dither_mode_rgb][(y & 3) * 4 + (x & 3)]) * DITHER_SPLAT;
else if (dither_mode_rgb == 2)
rgb_dither = noise_get_dither_color();
else
rgb_dither = 0;
if (dither_mode_alpha == 3)
alpha_dither = 0;
else
{
if (dither_mode_alpha == 2)
{
alpha_dither = noise_get_dither_alpha();
}
else
{
// Alpha reuses the RGB matrix selection; when RGB uses noise/none, the
// matrix is picked from the low bit of the RGB mode — presumably
// matching hardware behavior, verify against reference.
alpha_dither = dither_mode_rgb >= 2 ?
int(dither_matrices[dither_mode_rgb & 1][(y & 3) * 4 + (x & 3)]) : (rgb_dither & 7);
if (dither_mode_alpha == 1)
alpha_dither = ~alpha_dither & 7;
}
}
}
#endif

View File

@ -0,0 +1,107 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "small_types.h"
layout(local_size_x = 16, local_size_y = 8) in;
// Copies VRAM into a texture which is then consumed by VI scanout.
layout(set = 0, binding = 0, rgba8ui) uniform writeonly uimage2D uAAInput;
// RDRAM aliased as 16-bit and 32-bit views over the same binding.
layout(set = 0, binding = 1, std430) readonly buffer RDRAM16
{
mem_u16 elems[];
} vram16;
layout(set = 0, binding = 1, std430) readonly buffer RDRAM32
{
uint elems[];
} vram32;
// Extra "hidden" RDRAM bits holding low coverage bits per 16-bit word.
layout(set = 0, binding = 2, std430) readonly buffer HiddenRDRAM
{
mem_u8 elems[];
} hidden_vram;
layout(push_constant, std430) uniform Registers
{
int fb_offset;
int fb_width;
ivec2 offset;
ivec2 resolution;
} registers;
// Specialized per device/configuration; RDRAM_SIZE is a power of two.
layout(constant_id = 0) const int RDRAM_SIZE = 0;
const int RDRAM_MASK_8 = RDRAM_SIZE - 1;
const int RDRAM_MASK_16 = RDRAM_MASK_8 >> 1;
const int RDRAM_MASK_32 = RDRAM_MASK_16 >> 1;
// Upscaling factor (1x, 2x, 4x, ...); upscaled copies live in higher "slices".
layout(constant_id = 2) const int SCALING_LOG2 = 0;
const int SCALING_FACTOR = 1 << SCALING_LOG2;
#include "vi_status.h"
// Fetches one framebuffer pixel (RGB plus 3-bit coverage in .a) at an
// upscaled coordinate. Sub-pixel position selects the upscaling slice;
// FMT_* and FETCH_AA come from vi_status.h.
uvec4 fetch_color(ivec2 coord)
{
ivec2 slice2d = coord & (SCALING_FACTOR - 1);
coord >>= SCALING_LOG2;
int slice = slice2d.y * SCALING_FACTOR + slice2d.x;
uvec4 color;
if (FMT_RGBA8888)
{
int linear_coord = coord.y * registers.fb_width + coord.x + registers.fb_offset;
linear_coord &= RDRAM_MASK_32;
// Each upscaling slice is a full RDRAM-sized copy.
linear_coord += slice * (RDRAM_SIZE >> 2);
uint word = uint(vram32.elems[linear_coord]);
// Alpha carries 3-bit coverage taken from bits 5..7 of the word.
color = (uvec4(word) >> uvec4(24, 16, 8, 5)) & uvec4(0xff, 0xff, 0xff, 7);
}
else if (FMT_RGBA5551)
{
int linear_coord = coord.y * registers.fb_width + coord.x + registers.fb_offset;
linear_coord &= RDRAM_MASK_16;
linear_coord += slice * (RDRAM_SIZE >> 1);
// ^ 1 swaps 16-bit halves within a 32-bit word — presumably matching
// big-endian RDRAM addressing; verify against the writer side.
uint word = uint(vram16.elems[linear_coord ^ 1]);
uint hidden_word = uint(hidden_vram.elems[linear_coord]);
// Expand 5-5-5-1 to 8-bit channels; coverage = (alpha bit << 2) | hidden bits.
uint r = (word >> 8u) & 0xf8u;
uint g = (word >> 3u) & 0xf8u;
uint b = (word << 2u) & 0xf8u;
uint a = ((word & 1u) << 2u) | hidden_word;
color = uvec4(r, g, b, a);
}
else
color = uvec4(0);
// Without AA fetch, report full coverage.
if (!FETCH_AA)
color.a = 7u;
return color;
}
// Emits one texel of the scanout input image per invocation.
void main()
{
	uvec2 pixel = gl_GlobalInvocationID.xy;
	// Ignore invocations outside the requested resolution.
	if (all(lessThan(pixel, uvec2(registers.resolution))))
	{
		uvec4 fetched = fetch_color(ivec2(pixel) + registers.offset);
		imageStore(uAAInput, ivec2(pixel), fetched);
	}
}

View File

@ -0,0 +1,10 @@
#ifndef FB_FORMATS_H_
#define FB_FORMATS_H_
// Internal framebuffer format enumeration shared between passes.
const int FB_FMT_I4 = 0;
const int FB_FMT_I8 = 1;
const int FB_FMT_RGBA5551 = 2;
const int FB_FMT_IA88 = 3;
const int FB_FMT_RGBA8888 = 4;
#endif

View File

@ -0,0 +1,32 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
// Generates a fullscreen triangle from gl_VertexIndex alone (no vertex inputs):
// vertex 0 -> (-1, -1), vertex 1 -> (-1, +3), any other -> (+3, -1).
void main()
{
	vec2 pos = vec2(-1.0);
	if (gl_VertexIndex == 1)
		pos.y = 3.0;
	else if (gl_VertexIndex != 0)
		pos.x = 3.0;
	gl_Position = vec4(pos, 0.0, 1.0);
}

View File

@ -0,0 +1,255 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef INTERPOLATION_H_
#define INTERPOLATION_H_
#include "data_structures.h"
#include "clamping.h"
#include "perspective.h"
// Interpolates the per-pixel RGBA attribute from its span-start value and
// fixed-point X/Y derivatives, applies coverage-based centroid snapping,
// and clamps the 9-bit intermediate down to 8 bits per channel.
//   rgba     - attribute value at the interpolation base (fixed point)
//   drgba_dx - per-pixel X derivative (fixed point)
//   drgba_dy - per-pixel Y derivative (fixed point)
//   dx       - pixel distance from the interpolation base
//   coverage - subsample coverage mask (bit layout documented below)
u8x4 interpolate_rgba(ivec4 rgba, ivec4 drgba_dx, ivec4 drgba_dy, int dx, int coverage)
{
    // Derivative is quantized (& ~0x1f) and scaled down for upscaled rendering.
    rgba += ((drgba_dx & ~0x1f) >> SCALING_LOG2) * dx;
    // RGBA is interpolated to 9-bit. The last bit is used to deal with clamping.
    // Slight underflow below 0 is clamped to 0 and slight overflow above 0xff is clamped to 0xff.
    // Keep 2 sign bits of precision before we complete the centroid interpolation.
    i16x4 snapped_rgba = i16x4(rgba >> 14);
    // Centroid clipping is based on the first coverage bit, and we interpolate at the first subpixel in scanline order.
    // With this layout we can just use findLSB to get correct result.
    // 0x01 0x02
    // 0x04 0x08
    // 0x10 0x20
    // 0x40 0x80
    int first_coverage = findLSB(coverage);
    i16 yoff = i16(first_coverage >> 1);
    i16 xoff = i16((first_coverage & 1) << 1) + (yoff & I16_C(1));
    snapped_rgba <<= I16_C(2 + SCALING_LOG2);
    snapped_rgba += xoff * i16x4(drgba_dx >> 14) + yoff * i16x4(drgba_dy >> 14);
    snapped_rgba >>= I16_C(4 + SCALING_LOG2);
    return clamp_9bit(snapped_rgba);
}
// Computes texel coordinates (ST) for the copy pipe.
// Interpolation is snapped to once per N output pixels (fb_info.dx_mask/dx_shift);
// s_offset returns the residual pixel offset within that snap interval.
// Note: the overflow flag from perspective_divide is computed but discarded here.
void interpolate_st_copy(SpanSetup span, ivec4 dstzw_dx, int x, bool perspective, bool flip,
                         out ivec2 st, out int s_offset)
{
    // Distance from the span edge; direction depends on the flip flag.
    int dx = flip ? (x - span.start_x) : (span.end_x - x);
    // For copy pipe, we should duplicate pixels when scaling, there is no filtering we can (or should!) do.
    dx >>= SCALING_LOG2;
    // Snap DX to where we perform interpolation (once per N output pixels).
    int snapped_dx = dx & global_constants.fb_info.dx_mask;
    s_offset = dx - snapped_dx;
    int lerp_dx = (dx >> global_constants.fb_info.dx_shift) * (flip ? 1 : -1);
    ivec3 stw = span.stzw.xyw + (dstzw_dx.xyw & ~0x1f) * lerp_dx;
    if (perspective)
    {
        bool st_overflow;
        st = perspective_divide(stw >> 16, st_overflow);
    }
    else
        st = no_perspective_divide(stw >> 16);
}
// Interpolates a single ST coordinate pair at pixel offset dx from the
// interpolation base, with optional perspective correction.
// The perspective overflow flag is computed but not reported to the caller.
ivec2 interpolate_st_single(ivec4 stzw, ivec4 dstzw_dx, int dx, bool perspective)
{
    // Quantized derivative step, scaled down for upscaled rendering,
    // then truncated from fixed point to integer ST/W.
    ivec3 attr = (stzw.xyw + ((dstzw_dx.xyw & ~0x1f) >> SCALING_LOG2) * dx) >> 16;

    if (!perspective)
        return no_perspective_divide(attr);

    bool overflow_unused;
    return perspective_divide(attr, overflow_unused);
}
// Interpolates ST (with optional perspective divide), the neighboring
// ST derivatives used for LOD computation, and the snapped/clamped Z value
// for one pixel at offset dx from the interpolation base.
//   coverage       - subsample mask; its first set bit selects the centroid subpixel
//   flip_direction - sign/step used for the LOD neighbor sample in X
//   st_overflow    - accumulates perspective-divide overflow across calls (inout)
// NOTE: st_dx/st_dy are only written when uses_lod is true; callers must not
// read them otherwise.
void interpolate_stz(ivec4 stzw, ivec4 dstzw_dx, ivec4 dstzw_dy, int dx, int coverage, bool perspective, bool uses_lod,
                     int flip_direction, out ivec2 st, out ivec2 st_dx, out ivec2 st_dy, out int z, inout bool st_overflow)
{
    ivec3 stw = stzw.xyw + ((dstzw_dx.xyw & ~0x1f) >> SCALING_LOG2) * dx;
    ivec3 stw_dx, stw_dy;
    if (uses_lod)
    {
        // Neighbor samples one step away in X and Y, used for LOD deltas.
        stw_dx = stw + flip_direction * ((dstzw_dx.xyw & ~0x1f) >> SCALING_LOG2);
        if (SCALING_FACTOR > 1)
            stw_dy = stw + abs(flip_direction) * ((dstzw_dy.xyw & ~0x7fff) >> SCALING_LOG2);
        else
            stw_dy = stw + ((dstzw_dy.xyw & ~0x7fff) >> SCALING_LOG2);
    }
    if (perspective)
    {
        st = perspective_divide(stw >> 16, st_overflow);
        if (uses_lod)
        {
            st_dx = perspective_divide(stw_dx >> 16, st_overflow);
            st_dy = perspective_divide(stw_dy >> 16, st_overflow);
        }
    }
    else
    {
        st = no_perspective_divide(stw >> 16);
        if (uses_lod)
        {
            st_dx = no_perspective_divide(stw_dx >> 16);
            st_dy = no_perspective_divide(stw_dy >> 16);
        }
    }
    // Ensure that interpolation snaps as we expect on every "main" pixel,
    // for subpixels, interpolate with quantized step factor.
    z = stzw.z + dstzw_dx.z * (dx >> SCALING_LOG2) + (dstzw_dx.z >> SCALING_LOG2) * (dx & (SCALING_FACTOR - 1));
    int snapped_z = z >> 10;
    // Centroid snapping on Z, mirroring the scheme used in interpolate_rgba.
    int first_coverage = findLSB(coverage);
    int yoff = first_coverage >> 1;
    int xoff = ((first_coverage & 1) << 1) + (yoff & I16_C(1));
    snapped_z <<= 2 + SCALING_LOG2;
    snapped_z += xoff * (dstzw_dx.z >> 10) + yoff * (dstzw_dy.z >> 10);
    snapped_z >>= 5 + SCALING_LOG2;
    z = clamp_z(snapped_z);
}
// Disabled (compiled-out) reference implementations: these interpolate
// attributes directly from TriangleSetup/AttributeSetup per pixel, rather than
// from precomputed span data as the live paths above do. Kept for reference.
#if 0
// Reference RGBA interpolation straight from the edge-walker setup.
u8x4 interpolate_rgba(TriangleSetup setup, AttributeSetup attr, int x, int y, int coverage)
{
    bool do_offset = (setup.flags & TRIANGLE_SETUP_DO_OFFSET_BIT) != 0;
    int y_interpolation_base = int(setup.yh) >> 2;
    int xh = setup.xh + (y - y_interpolation_base) * (setup.dxhdy << 2);
    ivec4 drgba_diff = ivec4(0);
    // In do_offset mode, varyings are latched at last subpixel line instead of first (for some reason).
    if (do_offset)
    {
        xh += 3 * setup.dxhdy;
        ivec4 drgba_deh = attr.drgba_de & ~0x1ff;
        ivec4 drgba_dyh = attr.drgba_dy & ~0x1ff;
        drgba_diff = drgba_deh - (drgba_deh >> 2) - drgba_dyh + (drgba_dyh >> 2);
    }
    int base_x = xh >> 16;
    int xfrac = (xh >> 8) & 0xff;
    ivec4 rgba = attr.rgba;
    rgba += attr.drgba_de * (y - y_interpolation_base);
    rgba = ((rgba & ~0x1ff) + drgba_diff - xfrac * ((attr.drgba_dx >> 8) & ~1)) & ~0x3ff;
    rgba += (attr.drgba_dx & ~0x1f) * (x - base_x);
    // RGBA is interpolated to 9-bit. The last bit is used to deal with clamping.
    // Slight underflow below 0 is clamped to 0 and slight overflow above 0xff is clamped to 0xff.
    // Keep 2 sign bits of precision before we complete the centroid interpolation.
    i16x4 snapped_rgba = i16x4(rgba >> 14);
    // Centroid clipping is based on the first coverage bit, and we interpolate at the first subpixel in scanline order.
    // FWIW, Angrylion has a very different coverage bit assignment, but we need this layout to avoid an awkward LUT.
    // With this layout we can just use findLSB instead.
    // 0x01 0x02
    // 0x04 0x08
    // 0x10 0x20
    // 0x40 0x80
    int first_coverage = findLSB(coverage);
    i16 yoff = i16(first_coverage >> 1);
    i16 xoff = i16((first_coverage & 1) << 1) + (yoff & I16_C(1));
    snapped_rgba <<= I16_C(2);
    snapped_rgba += xoff * i16x4(attr.drgba_dx >> 14) + yoff * i16x4(attr.drgba_dy >> 14);
    snapped_rgba >>= I16_C(4);
    return clamp_9bit(snapped_rgba);
}
// Reference ST/W interpolation straight from the edge-walker setup.
ivec3 interpolate_stw(TriangleSetup setup, AttributeSetup attr, int x, int y)
{
    bool do_offset = (setup.flags & TRIANGLE_SETUP_DO_OFFSET_BIT) != 0;
    int y_interpolation_base = int(setup.yh) >> 2;
    int xh = setup.xh + (y - y_interpolation_base) * (setup.dxhdy << 2);
    ivec3 dstw_diff = ivec3(0);
    // In do_offset mode, varyings are latched at last subpixel line instead of first (for some reason).
    if (do_offset)
    {
        xh += 3 * setup.dxhdy;
        ivec3 dstw_deh = attr.dstzw_de.xyw & ~0x1ff;
        ivec3 dstw_dyh = attr.dstzw_dy.xyw & ~0x1ff;
        dstw_diff = dstw_deh - (dstw_deh >> 2) - dstw_dyh + (dstw_dyh >> 2);
    }
    int base_x = xh >> 16;
    int xfrac = (xh >> 8) & 0xff;
    ivec3 stw = attr.stzw.xyw;
    stw += attr.dstzw_de.xyw * (y - y_interpolation_base);
    stw = ((stw & ~0x1ff) + dstw_diff - xfrac * ((attr.dstzw_dx.xyw >> 8) & ~1)) & ~0x3ff;
    stw += (attr.dstzw_dx.xyw & ~0x1f) * (x - base_x);
    ivec3 snapped_stw = stw >> 16;
    return snapped_stw;
}
// Reference Z interpolation with centroid snapping.
int interpolate_z(TriangleSetup setup, AttributeSetup attr, int x, int y, int coverage)
{
    bool do_offset = (setup.flags & TRIANGLE_SETUP_DO_OFFSET_BIT) != 0;
    int y_interpolation_base = int(setup.yh) >> 2;
    int xh = setup.xh + (y - y_interpolation_base) * (setup.dxhdy << 2);
    int dzdiff = 0;
    // In do_offset mode, varyings are latched at last subpixel line instead of first (for some reason).
    if (do_offset)
    {
        xh += 3 * setup.dxhdy;
        int dzdeh = attr.dstzw_de.z & ~0x1ff;
        int dzdyh = attr.dstzw_dy.z & ~0x1ff;
        dzdiff = dzdeh - (dzdeh >> 2) - dzdyh + (dzdyh >> 2);
    }
    int base_x = xh >> 16;
    int xfrac = (xh >> 8) & 0xff;
    int z = attr.stzw.z;
    z += attr.dstzw_de.z * (y - y_interpolation_base);
    z = ((z & ~0x1ff) + dzdiff - xfrac * ((attr.dstzw_dx.z >> 8) & ~1)) & ~0x3ff;
    z += attr.dstzw_dx.z * (x - base_x);
    int snapped_z = z >> 10;
    int first_coverage = findLSB(coverage);
    int yoff = first_coverage >> 1;
    // NOTE(review): the '1s' literal below differs from the I16_C(1) used in the
    // live interpolate_stz path; harmless while this block stays compiled out,
    // but verify the literal suffix if this code is ever re-enabled.
    int xoff = ((first_coverage & 1) << 1) + (yoff & 1s);
    snapped_z <<= 2;
    snapped_z += xoff * (attr.dstzw_dx.z >> 10) + yoff * (attr.dstzw_dy.z >> 10);
    snapped_z >>= 5;
    return clamp_z(snapped_z);
}
#endif
#endif

View File

@ -0,0 +1,31 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LOAD_ATTRIBUTE_SETUP_H_
#define LOAD_ATTRIBUTE_SETUP_H_
// Fetches the AttributeSetup for one primitive from the attribute_setup buffer.
AttributeSetup load_attribute_setup(uint index)
{
    return attribute_setup.elems[index];
}
#endif

View File

@ -0,0 +1,41 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LOAD_DEPTH_BLEND_STATE_H_
#define LOAD_DEPTH_BLEND_STATE_H_
// Fetches the DepthBlendState for one primitive. Without SMALL_TYPES support,
// the buffer stores widened integer fields, so each member is narrowed back to
// its 8-bit representation here (the last two u8(0) fields are padding).
DepthBlendState load_depth_blend_state(uint index)
{
#if SMALL_TYPES
    return depth_blend_state.elems[index];
#else
    return DepthBlendState(
        u8x4(depth_blend_state.elems[index].blend_modes0),
        u8x4(depth_blend_state.elems[index].blend_modes1),
        depth_blend_state.elems[index].flags,
        u8(depth_blend_state.elems[index].coverage_mode),
        u8(depth_blend_state.elems[index].z_mode),
        u8(0), u8(0));
#endif
}
#endif

View File

@ -0,0 +1,50 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LOAD_DERIVED_SETUP_H_
#define LOAD_DERIVED_SETUP_H_
// Fetches the DerivedSetup (combiner constants, fog/blend colors, fill color,
// dz, min LOD, factors) for one primitive. Without SMALL_TYPES support, the
// widened buffer fields are narrowed back to their small-type representation.
DerivedSetup load_derived_setup(uint index)
{
#if SMALL_TYPES
    return derived_setup.elems[index];
#else
    return DerivedSetup(
        u8x4(derived_setup.elems[index].constant_muladd0),
        u8x4(derived_setup.elems[index].constant_mulsub0),
        u8x4(derived_setup.elems[index].constant_mul0),
        u8x4(derived_setup.elems[index].constant_add0),
        u8x4(derived_setup.elems[index].constant_muladd1),
        u8x4(derived_setup.elems[index].constant_mulsub1),
        u8x4(derived_setup.elems[index].constant_mul1),
        u8x4(derived_setup.elems[index].constant_add1),
        u8x4(derived_setup.elems[index].fog_color),
        u8x4(derived_setup.elems[index].blend_color),
        uint(derived_setup.elems[index].fill_color),
        u16(derived_setup.elems[index].dz),
        u8(derived_setup.elems[index].dz_compressed),
        u8(derived_setup.elems[index].min_lod),
        i16x4(derived_setup.elems[index].factors));
#endif
}
#endif

View File

@ -0,0 +1,32 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LOAD_SCISSOR_STATE_H_
#define LOAD_SCISSOR_STATE_H_
// Fetches the scissor rectangle for one primitive; the four packed ivec4
// components map onto the ScissorState constructor arguments in order.
ScissorState load_scissor_state(uint index)
{
    ivec4 packed_scissor = scissor_state.elems[index];
    return ScissorState(packed_scissor.x, packed_scissor.y,
                        packed_scissor.z, packed_scissor.w);
}
#endif

View File

@ -0,0 +1,31 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LOAD_SPAN_OFFSETS_H_
#define LOAD_SPAN_OFFSETS_H_
// Fetches the SpanInfoOffsets entry for one primitive from the span_offsets buffer.
SpanInfoOffsets load_span_offsets(uint index)
{
    return span_offsets.elems[index];
}
#endif

View File

@ -0,0 +1,44 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LOAD_SPAN_SETUP_H_
#define LOAD_SPAN_SETUP_H_
// Fetches the SpanSetup (per-scanline interpolation state) for one span.
// Without SMALL_TYPES support, the widened buffer fields are narrowed back
// to their 16-bit representations.
SpanSetup load_span_setup(uint index)
{
#if SMALL_TYPES
    return span_setups.elems[index];
#else
    return SpanSetup(
        span_setups.elems[index].rgba,
        span_setups.elems[index].stzw,
        u16x4(uvec4(span_setups.elems[index].xleft)),
        u16x4(uvec4(span_setups.elems[index].xright)),
        span_setups.elems[index].interpolation_base_x,
        span_setups.elems[index].start_x,
        span_setups.elems[index].end_x,
        i16(span_setups.elems[index].lodlength),
        u16(span_setups.elems[index].valid_line));
#endif
}
#endif

View File

@ -0,0 +1,42 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LOAD_STATIC_RASTER_STATE_H_
#define LOAD_STATIC_RASTER_STATE_H_
// Fetches the StaticRasterizationState (combiner inputs, flags, dither mode)
// for one primitive. Without SMALL_TYPES support, the widened combiner input
// fields are narrowed back to u8x4 (the trailing 0, 0 fields are padding).
StaticRasterizationState load_static_rasterization_state(uint index)
{
#if SMALL_TYPES
    return static_raster_state.elems[index];
#else
    return StaticRasterizationState(
        u8x4(static_raster_state.elems[index].combiner_inputs_rgb0),
        u8x4(static_raster_state.elems[index].combiner_inputs_alpha0),
        u8x4(static_raster_state.elems[index].combiner_inputs_rgb1),
        u8x4(static_raster_state.elems[index].combiner_inputs_alpha1),
        static_raster_state.elems[index].flags,
        static_raster_state.elems[index].dither,
        0, 0);
#endif
}
#endif

View File

@ -0,0 +1,49 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LOAD_TILE_INFO_H_
#define LOAD_TILE_INFO_H_
// Fetches the TileInfo (TMEM tile descriptor: S/T bounds, offset, stride,
// format/size, palette, mask/shift, flags) for one tile. Without SMALL_TYPES
// support, the widened buffer fields are narrowed back to u8.
TileInfo load_tile_info(uint index)
{
#if SMALL_TYPES
    return tile_infos.elems[index];
#else
    return TileInfo(
        tile_infos.elems[index].slo,
        tile_infos.elems[index].shi,
        tile_infos.elems[index].tlo,
        tile_infos.elems[index].thi,
        tile_infos.elems[index].offset,
        tile_infos.elems[index].stride,
        u8(tile_infos.elems[index].fmt),
        u8(tile_infos.elems[index].size),
        u8(tile_infos.elems[index].palette),
        u8(tile_infos.elems[index].mask_s),
        u8(tile_infos.elems[index].shift_s),
        u8(tile_infos.elems[index].mask_t),
        u8(tile_infos.elems[index].shift_t),
        u8(tile_infos.elems[index].flags));
#endif
}
#endif

View File

@ -0,0 +1,46 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LOAD_TRIANGLE_SETUP_H_
#define LOAD_TRIANGLE_SETUP_H_
// Fetches the TriangleSetup (edge-walker state: X positions, Y bounds, edge
// slopes, flags, tile) for one primitive. Without SMALL_TYPES support, the
// widened buffer fields are narrowed back to i16/u8.
TriangleSetup load_triangle_setup(uint index)
{
#if SMALL_TYPES
    return triangle_setup.elems[index];
#else
    return TriangleSetup(
        triangle_setup.elems[index].xh,
        triangle_setup.elems[index].xm,
        triangle_setup.elems[index].xl,
        i16(triangle_setup.elems[index].yh),
        i16(triangle_setup.elems[index].ym),
        triangle_setup.elems[index].dxhdy,
        triangle_setup.elems[index].dxmdy,
        triangle_setup.elems[index].dxldy,
        i16(triangle_setup.elems[index].yl),
        u8(triangle_setup.elems[index].flags),
        u8(triangle_setup.elems[index].tile));
#endif
}
#endif

View File

@ -0,0 +1,70 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
// Workgroup size is a specialization constant; each workgroup processes one page.
layout(local_size_x_id = 0) in;
// Number of 32-bit words per page.
layout(constant_id = 1) const int PAGE_STRIDE = 256;

// Destination RDRAM words, merged in place.
layout(set = 0, binding = 0, std430) buffer RDRAM
{
    uint rdram[];
};

// Source words merged into rdram where the write mask permits.
layout(set = 0, binding = 1, std430) readonly buffer StagingRDRAM
{
    uint staging_rdram[];
};

// Per-word bit mask: set bits keep the existing rdram bits,
// clear bits take the staging bits (see main below).
layout(set = 0, binding = 2, std430) readonly buffer WriteMaskRDRAM
{
    uint writemask[];
};

// Page offsets to process; indexed by workgroup ID (4 offsets per uvec4).
layout(set = 1, binding = 0, std140) uniform UBO
{
    uvec4 offsets[1024];
};
void main()
{
    // Each workgroup handles one page from the offset table;
    // each invocation merges one 32-bit word of that page.
    uint page = offsets[gl_WorkGroupID.x >> 2u][gl_WorkGroupID.x & 3u];
    uint index = page * PAGE_STRIDE + gl_LocalInvocationIndex;

    uint mask = writemask[index];
    if (mask == ~0u)
        return; // Fully masked: the existing rdram word is kept untouched.

    uint staging = staging_rdram[index];
    if (mask == 0u)
    {
        // Unmasked: take the staging word wholesale.
        rdram[index] = staging;
    }
    else
    {
        // Partially masked: keep mask bits from rdram, take the rest from staging.
        rdram[index] = (rdram[index] & mask) | (staging & ~mask);
    }
}

View File

@ -0,0 +1,582 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef MEMORY_INTERFACING_H_
#define MEMORY_INTERFACING_H_
#include "dither.h"
#include "z_encode.h"
#include "blender.h"
#include "depth_test.h"
#include "coverage.h"
#include "fb_formats.h"
layout(constant_id = 0) const uint RDRAM_SIZE = 0;
layout(constant_id = 7) const int RDRAM_INCOHERENT_SCALING = 0;

// Bit 0 of RDRAM_INCOHERENT_SCALING flags incoherent RDRAM;
// the remaining bits hold log2 of the upscaling factor.
const bool RDRAM_INCOHERENT = (RDRAM_INCOHERENT_SCALING & 1) != 0;
const int SCALING_LOG2 = RDRAM_INCOHERENT_SCALING >> 1;
const int SCALING_FACTOR = 1 << SCALING_LOG2;

// Write-mask bookkeeping is only needed when RDRAM is incoherent;
// its layout differs between unscaled and upscaled rendering.
const bool RDRAM_UNSCALED_WRITE_MASK = RDRAM_INCOHERENT && SCALING_LOG2 == 0;
const bool RDRAM_SCALED_WRITE_MASK = RDRAM_INCOHERENT && SCALING_LOG2 != 0;

// Index masks for byte / halfword / word addressing within RDRAM.
const uint RDRAM_MASK_8 = RDRAM_SIZE - 1u;
const uint RDRAM_MASK_16 = RDRAM_MASK_8 >> 1u;
const uint RDRAM_MASK_32 = RDRAM_MASK_8 >> 2u;

layout(constant_id = 1) const int FB_FMT = 0;
layout(constant_id = 2) const bool FB_COLOR_DEPTH_ALIAS = false;

// Working state for the framebuffer texel currently being processed.
u8x4 current_color;
bool current_color_dirty;   // set when current_color must be written back
u16 current_depth;
u8 current_dz;
bool current_depth_dirty;   // set when current_depth/current_dz must be written back
// Loads the current framebuffer color from VRAM into current_color,
// decoding according to the compile-time FB_FMT.
//   index - pixel index within the (unscaled) framebuffer
//   slice - subsample slice for upscaled rendering (0 when unscaled)
// The ^3u / ^1u index swizzles swap byte/halfword order within a 32-bit word.
void load_vram_color(uint index, uint slice)
{
    switch (FB_FMT)
    {
    case FB_FMT_I4:
    case FB_FMT_I8:
    {
        // Intensity byte splatted to RGB; alpha comes from hidden RDRAM bits.
        index &= RDRAM_MASK_8;
        index += slice * RDRAM_SIZE;
        u8 word = u8(vram8.data[index ^ 3u]);
        current_color = u8x4(word, word, word, u8(hidden_vram.data[index >> 1]));
        break;
    }
    case FB_FMT_RGBA5551:
    {
        // Expand 5-bit channels into the top bits of each 8-bit channel;
        // alpha packs the hidden bits and the halfword's low bit.
        index &= RDRAM_MASK_16;
        index += slice * (RDRAM_SIZE >> 1);
        uint word = uint(vram16.data[index ^ 1u]);
        uvec3 rgb = uvec3(word >> 8u, word >> 3u, word << 2u) & 0xf8u;
        current_color = u8x4(rgb, (u8(hidden_vram.data[index]) << U8_C(5)) | u8((word & 1) << 7));
        break;
    }
    case FB_FMT_IA88:
    {
        // High byte is intensity (splatted to RGB), low byte is alpha.
        index &= RDRAM_MASK_16;
        index += slice * (RDRAM_SIZE >> 1);
        uint word = uint(vram16.data[index ^ 1u]);
        current_color = u8x4(u8x3(word >> 8u), word & 0xff);
        break;
    }
    case FB_FMT_RGBA8888:
    {
        // Straight 8-bit-per-channel unpack from a 32-bit word.
        index &= RDRAM_MASK_32;
        index += slice * (RDRAM_SIZE >> 2);
        uint word = vram32.data[index];
        current_color = u8x4((uvec4(word) >> uvec4(24, 16, 8, 0)) & uvec4(0xff));
        break;
    }
    }
}
// Reinterprets the loaded color representation as depth/dz, used when the
// color and depth images alias the same memory. Only the 16-bit formats have
// an aliased depth interpretation here.
void alias_color_to_depth()
{
    /* Inherit memory depth from color. */
    switch (FB_FMT)
    {
    case FB_FMT_RGBA5551:
    {
        // dz from alpha's high bits plus one blue bit; depth from the packed
        // 5-5-5 channel bits reassembled into a 14-bit value.
        current_dz = (current_color.a >> U8_C(3)) | (current_color.b & U8_C(8));
        uint word = (current_color.r & 0xf8u) << 6u;
        word |= (current_color.g & 0xf8u) << 1u;
        word |= (current_color.b & 0xf8u) >> 4u;
        current_depth = u16(word);
        break;
    }
    case FB_FMT_IA88:
    {
        // Rebuild the 16-bit halfword from intensity/alpha, then split it:
        // top 14 bits = depth, bits 1:0 = dz high bits, bit 0 also expands
        // into the hidden-bit portion of dz.
        uvec2 col = current_color.ra;
        uint word = (col.x << 8u) | col.y;
        uint hidden_word = (word & 1u) * 3u;
        current_depth = u16(word >> 2u);
        current_dz = u8(((word & 3u) << 2u) | hidden_word);
        break;
    }
    }
}
// Inverse of alias_color_to_depth: packs current_depth/current_dz back into
// the color representation and marks the color dirty so it will be stored.
void alias_depth_to_color()
{
    // 18-bit packed value: depth in the top bits, dz in the low 4 bits.
    uint word = (uint(current_depth) << 4u) | current_dz;
    switch (FB_FMT)
    {
    case FB_FMT_RGBA5551:
    {
        // Scatter the packed bits across the 5-5-5 channels; low 3 bits of
        // the packed word land in alpha's high bits.
        current_color.r = u8((word >> 10u) & 0xf8u);
        current_color.g = u8((word >> 5u) & 0xf8u);
        current_color.b = u8((word >> 0u) & 0xf8u);
        current_color.a = u8((word & 7u) << 5u);
        break;
    }
    case FB_FMT_IA88:
    {
        // Intensity takes the high byte, alpha the next byte of the packed word.
        current_color.r = u8((word >> 10u) & 0xffu);
        current_color.a = u8((word >> 2u) & 0xffu);
        break;
    }
    }
    current_color_dirty = true;
}
// Loads the current depth value from VRAM: the top 14 bits of the halfword
// hold depth; dz gets bits 3:2 from the halfword's low bits and bits 1:0
// from hidden RDRAM.
void load_vram_depth(uint index, uint slice)
{
    index &= RDRAM_MASK_16;
    index += slice * (RDRAM_SIZE >> 1);
    u16 word = u16(vram16.data[index ^ 1u]);
    current_depth = word >> U16_C(2);
    current_dz = u8(hidden_vram.data[index]) | u8((word & U16_C(3)) << U16_C(2));
}
// Writes current_color back to VRAM in the compile-time FB_FMT, but only when
// it was modified (current_color_dirty). For unscaled incoherent RDRAM, also
// marks the written word in the write-mask region placed after RDRAM so the
// masked resolve pass knows the word was touched.
void store_vram_color(uint index, uint slice)
{
    //GENERIC_MESSAGE1(index);
    if (current_color_dirty)
    {
        switch (FB_FMT)
        {
        case FB_FMT_I4:
        {
            // Color byte is written as zero for I4; only the hidden bits
            // (on odd byte indices) carry alpha.
            index &= RDRAM_MASK_8;
            index += slice * RDRAM_SIZE;
            vram8.data[index ^ 3u] = mem_u8(0);
            if ((index & 1u) != 0u)
                hidden_vram.data[index >> 1u] = mem_u8(current_color.a);
            if (RDRAM_UNSCALED_WRITE_MASK)
            {
                // Need this memory barrier to ensure the mask readback does not read
                // an invalid value from RDRAM. If the mask is seen, the valid RDRAM value is
                // also coherent.
                memoryBarrierBuffer();
                vram8.data[(index ^ 3u) + RDRAM_SIZE] = mem_u8(0xff);
            }
            break;
        }
        case FB_FMT_I8:
        {
            // Intensity byte from red; hidden bits replicate its LSB.
            index &= RDRAM_MASK_8;
            index += slice * RDRAM_SIZE;
            vram8.data[index ^ 3u] = mem_u8(current_color.r);
            if ((index & 1u) != 0u)
                hidden_vram.data[index >> 1u] = mem_u8((current_color.r & 1) * 3);
            if (RDRAM_UNSCALED_WRITE_MASK)
            {
                // Need this memory barrier to ensure the mask readback does not read
                // an invalid value from RDRAM. If the mask is seen, the valid RDRAM value is
                // also coherent.
                memoryBarrierBuffer();
                vram8.data[(index ^ 3u) + RDRAM_SIZE] = mem_u8(0xff);
            }
            break;
        }
        case FB_FMT_RGBA5551:
        {
            // Pack 5-bit channels plus one coverage bit into the halfword;
            // remaining coverage bits go to hidden RDRAM.
            index &= RDRAM_MASK_16;
            index += slice * (RDRAM_SIZE >> 1);
            uvec4 c = uvec4(current_color);
            c.rgb &= 0xf8u;
            uint cov = c.w >> 5u;
            uint word = (c.x << 8u) | (c.y << 3u) | (c.z >> 2u) | (cov >> 2u);
            vram16.data[index ^ 1u] = mem_u16(word);
            hidden_vram.data[index] = mem_u8(cov & U8_C(3));
            if (RDRAM_UNSCALED_WRITE_MASK)
            {
                // Need this memory barrier to ensure the mask readback does not read
                // an invalid value from RDRAM. If the mask is seen, the valid RDRAM value is
                // also coherent.
                memoryBarrierBuffer();
                vram16.data[(index ^ 1u) + (RDRAM_SIZE >> 1u)] = mem_u16(0xffff);
            }
            break;
        }
        case FB_FMT_IA88:
        {
            // Intensity in the high byte, alpha in the low byte;
            // hidden bits replicate alpha's LSB.
            index &= RDRAM_MASK_16;
            index += slice * (RDRAM_SIZE >> 1);
            uvec2 col = current_color.ra;
            uint word = (col.x << 8u) | col.y;
            vram16.data[index ^ 1u] = mem_u16(word);
            hidden_vram.data[index] = mem_u8((col.y & 1) * 3);
            if (RDRAM_UNSCALED_WRITE_MASK)
            {
                // Need this memory barrier to ensure the mask readback does not read
                // an invalid value from RDRAM. If the mask is seen, the valid RDRAM value is
                // also coherent.
                memoryBarrierBuffer();
                vram16.data[(index ^ 1u) + (RDRAM_SIZE >> 1u)] = mem_u16(0xffff);
            }
            break;
        }
        case FB_FMT_RGBA8888:
        {
            // Full 32-bit pack; a 32-bit pixel spans two hidden-bit entries
            // (one per halfword), each replicating the respective byte's LSB.
            index &= RDRAM_MASK_32;
            index += slice * (RDRAM_SIZE >> 2);
            uvec4 col = current_color;
            uint word = (col.r << 24u) | (col.g << 16u) | (col.b << 8u) | (col.a << 0u);
            vram32.data[index] = word;
            hidden_vram.data[2u * index] = mem_u8((current_color.g & 1) * 3);
            hidden_vram.data[2u * index + 1u] = mem_u8((current_color.a & 1) * 3);
            if (RDRAM_UNSCALED_WRITE_MASK)
            {
                // Need this memory barrier to ensure the mask readback does not read
                // an invalid value from RDRAM. If the mask is seen, the valid RDRAM value is
                // also coherent.
                memoryBarrierBuffer();
                vram32.data[index + (RDRAM_SIZE >> 2u)] = ~0u;
            }
            break;
        }
        }
    }
}
// Flushes the cached per-pixel depth value back to emulated RDRAM.
// index: unscaled 16-bit word index into the depth buffer region.
// slice: upscaled sub-sample slice this invocation owns (0 when not upscaling).
// No-op when color and depth alias the same address (the color path writes instead)
// or when the depth value was never modified this tile.
void store_vram_depth(uint index, uint slice)
{
	if (!FB_COLOR_DEPTH_ALIAS)
	{
		//GENERIC_MESSAGE1(index);
		if (current_depth_dirty)
		{
			index &= RDRAM_MASK_16;
			index += slice * (RDRAM_SIZE >> 1);
			// 18-bit depth is split: 16 bits (14-bit compressed Z + top 2 bits of dz)
			// go to RDRAM, the low 2 dz bits go to the hidden 9th-bit RAM.
			// index ^ 1u compensates for byte-swapped RDRAM layout.
			vram16.data[index ^ 1u] = mem_u16((current_depth << U16_C(2)) | (current_dz >> U16_C(2)));
			hidden_vram.data[index] = mem_u8(current_dz & U16_C(3));
			if (RDRAM_UNSCALED_WRITE_MASK)
			{
				// Need this memory barrier to ensure the mask readback does not read
				// an invalid value from RDRAM. If the mask is seen, the valid RDRAM value is
				// also coherent.
				memoryBarrierBuffer();
				vram16.data[(index ^ 1) + (RDRAM_SIZE >> 1u)] = mem_u16(0xffff);
			}
		}
	}
}
// Unscaled color framebuffer index of the pixel this invocation shades.
// Latched by init_tile and later consumed by fill_color to pick the correct
// sub-word of the 32-bit fill pattern.
uint color_fb_index;
// Loads the current color and depth values for one (possibly upscaled) pixel
// into the per-invocation cache and clears the dirty flags.
// coord is in scaled pixel units; fb_width/fb_height are the scaled dimensions.
// fb_addr_index / fb_depth_addr_index are the base word indices of the two buffers.
void init_tile(uvec2 coord, uint fb_width, uint fb_height, uint fb_addr_index, uint fb_depth_addr_index)
{
	current_color_dirty = false;
	current_depth_dirty = false;
	if (all(lessThan(coord, uvec2(fb_width, fb_height))))
	{
		// Decompose the scaled coordinate into an unscaled pixel plus a
		// sub-sample "slice" index selecting one upscaled RDRAM copy.
		uvec2 slice2d = coord & (SCALING_FACTOR - 1);
		coord >>= SCALING_LOG2;
		uint slice = slice2d.y * SCALING_FACTOR + slice2d.x;
		uint index = fb_addr_index + (fb_width >> SCALING_LOG2) * coord.y + coord.x;
		color_fb_index = index;
		load_vram_color(index, slice);
		index = fb_depth_addr_index + (fb_width >> SCALING_LOG2) * coord.y + coord.x;
		load_vram_depth(index, slice);
	}
}
// Records which unscaled pixels had their color and/or depth written this tile,
// so the later resolve pass only touches pixels that actually changed.
// unscaled_coord is the 1x pixel coordinate; unscaled_fb_width the 1x width.
void emit_scaled_write_masks(uvec2 unscaled_coord, uint unscaled_fb_width)
{
	// Merge write masks across pixels.
	// We reserved a chunk of memory after scaled RDRAM to store 2 bits per pixel holding
	// a write mask for color and depth. The resolve stage will only resolve a pixel
	// and trigger a write if any sub-sample was marked as written.
	// Write masks are organized in 4x4 blocks of unscaled pixels for locality purposes.
	// This guarantees a minimum number of loop iterations to resolve the write masks.
	uint unscaled_block = (unscaled_coord.y >> 2u) * ((unscaled_fb_width + 3u) >> 2u) + (unscaled_coord.x >> 2u);
	uvec2 unscaled_sub = unscaled_coord & 3u;
	// Bit 0 = color written, bit 1 = depth written; shifted into this pixel's
	// 2-bit field inside the 32-bit word covering its 4x4 block.
	uint word = uint(current_color_dirty) + 2u * uint(current_depth_dirty);
	word <<= 2u * (unscaled_sub.x + unscaled_sub.y * 4u);
#if SUBGROUP
	// Reduce within the subgroup first so only one atomic is issued per block.
	// This loop peels off one unique block value per iteration; in practice all
	// active lanes usually share the same block, so it should only need one iteration.
	bool is_active = true;
	do
	{
		if (subgroupBroadcastFirst(unscaled_block) == unscaled_block)
		{
			uint merged = subgroupOr(word);
			if (subgroupElect())
				atomicOr(vram32.data[SCALING_FACTOR * SCALING_FACTOR * (RDRAM_SIZE >> 2) + unscaled_block], merged);
			is_active = false;
		}
	} while (is_active);
#else
	// Just use atomics directly. With subgroup support, we can be a bit smarter about it.
	if (word != 0u)
		atomicOr(vram32.data[SCALING_FACTOR * SCALING_FACTOR * (RDRAM_SIZE >> 2) + unscaled_block], word);
#endif
}
// Counterpart to init_tile: flushes the cached color/depth values of one
// (possibly upscaled) pixel back to emulated RDRAM and, when scaled write
// masks are enabled, records which unscaled pixels were dirtied.
void finish_tile(uvec2 coord, uint fb_width, uint fb_height, uint fb_addr_index, uint fb_depth_addr_index)
{
	// Nothing to flush for invocations outside the scaled framebuffer.
	if (any(greaterThanEqual(coord, uvec2(fb_width, fb_height))))
		return;

	uint width_1x = fb_width >> SCALING_LOG2;
	// Split the scaled coordinate into an unscaled pixel and a sub-sample slice.
	uvec2 subpixel = coord & (SCALING_FACTOR - 1);
	coord >>= SCALING_LOG2;
	uint sample_slice = subpixel.y * SCALING_FACTOR + subpixel.x;

	uint pixel = fb_addr_index + width_1x * coord.y + coord.x;
	store_vram_color(pixel, sample_slice);
	pixel = fb_depth_addr_index + width_1x * coord.y + coord.x;
	store_vram_depth(pixel, sample_slice);

	if (RDRAM_SCALED_WRITE_MASK)
		emit_scaled_write_masks(coord, width_1x);
}
// Reconstructs the "memory color" the blender sees from the cached framebuffer
// pixel, per framebuffer format. The alpha channel carries memory coverage in
// its top 3 bits (bits [7:5]).
// image_read_en: when false, memory coverage is forced to full (7) as if the
// framebuffer were never read back.
u8x4 decode_memory_color(bool image_read_en)
{
	u8 memory_coverage = image_read_en ? (current_color.a & U8_C(0xe0)) : U8_C(0xe0);
	u8x3 color;
	switch (FB_FMT)
	{
	case FB_FMT_I4:
		// 4-bit intensity has no stored color or coverage; treat as black, full coverage.
		color = u8x3(0);
		memory_coverage = U8_C(0xe0);
		break;
	case FB_FMT_I8:
		// Intensity replicated to RGB; no coverage stored in 8-bit formats.
		color = current_color.rrr;
		memory_coverage = U8_C(0xe0);
		break;
	case FB_FMT_RGBA5551:
		// Only the top 5 bits of each channel are meaningful in 16-bit RGBA.
		color = current_color.rgb & U8_C(0xf8);
		break;
	case FB_FMT_IA88:
		color = current_color.rrr;
		break;
	case FB_FMT_RGBA8888:
		color = current_color.rgb;
		break;
	}
	return u8x4(color, memory_coverage);
}
// Stores a shaded color into the per-invocation framebuffer cache and marks it
// dirty so finish_tile will flush it. For I4 framebuffers there is no stored
// alpha/coverage byte, so only the RGB part is replaced.
void write_color(u8x4 col)
{
	if (FB_FMT != FB_FMT_I4)
		current_color = col;
	else
		current_color.rgb = col.rgb;
	current_color_dirty = true;
}
// Copy-pipe writeback: stores a raw texel word fetched by the copy pipeline
// directly into the framebuffer cache, decoding it per framebuffer format.
// word: raw texel bits produced by sample_texture_copy.
// primitive_index: unused here; kept so the call signature matches the other
// writeback paths.
void copy_pipeline(uint word, uint primitive_index)
{
	switch (FB_FMT)
	{
	case FB_FMT_I4:
	{
		// NOTE(review): I4 copies appear to always write zero here — presumably
		// real content for 4-bit targets is not representable in this cache.
		current_color = u8x4(0);
		current_color_dirty = true;
		break;
	}
	case FB_FMT_I8:
	{
		// Alpha testing needs to only look at the low dword for some bizarre reason.
		// I don't think alpha testing is supposed to be used at all with 8-bit FB ...
		word &= 0xffu;
		write_color(u8x4(word));
		break;
	}
	case FB_FMT_RGBA5551:
	{
		// Expand packed 5/5/5/1 into 8-bit channels; the single alpha bit
		// becomes full (0xe0) or zero coverage.
		uint r = (word >> 8) & 0xf8u;
		uint g = (word >> 3) & 0xf8u;
		uint b = (word << 2) & 0xf8u;
		uint a = (word & 1) * 0xe0u;
		write_color(u8x4(r, g, b, a));
		break;
	}
	}
	if (FB_COLOR_DEPTH_ALIAS)
		alias_color_to_depth();
}
// Fill-pipe writeback: writes the relevant slice of the 32-bit fill pattern
// into the framebuffer cache. The RDP fill word packs two 16-bit or four 8-bit
// pixels, so the pixel's position (color_fb_index, latched by init_tile)
// selects which sub-word applies.
void fill_color(uint col)
{
	switch (FB_FMT)
	{
	case FB_FMT_RGBA8888:
	{
		// 32-bit targets consume the whole fill word.
		uint r = (col >> 24u) & 0xffu;
		uint g = (col >> 16u) & 0xffu;
		uint b = (col >> 8u) & 0xffu;
		uint a = (col >> 0u) & 0xffu;
		write_color(u8x4(r, g, b, a));
		break;
	}
	case FB_FMT_RGBA5551:
	{
		// Even pixels take the high halfword, odd pixels the low halfword.
		col >>= ((color_fb_index & 1u) ^ 1u) * 16u;
		uint r = (col >> 8u) & 0xf8u;
		uint g = (col >> 3u) & 0xf8u;
		uint b = (col << 2u) & 0xf8u;
		uint a = (col & 1u) * 0xe0u;
		write_color(u8x4(r, g, b, a));
		break;
	}
	case FB_FMT_IA88:
	{
		col >>= ((color_fb_index & 1u) ^ 1u) * 16u;
		col &= 0xffffu;
		uint r = (col >> 8u) & 0xffu;
		uint a = (col >> 0u) & 0xffu;
		write_color(u8x4(r, r, r, a));
		break;
	}
	case FB_FMT_I8:
	{
		// 8-bit targets: pixel index modulo 4 selects one byte of the word.
		col >>= ((color_fb_index & 3u) ^ 3u) * 8u;
		col &= 0xffu;
		write_color(u8x4(col));
		break;
	}
	}
	if (FB_COLOR_DEPTH_ALIAS)
		alias_color_to_depth();
}
// Depth-test + blender stage for one shaded pixel.
// Consumes the combiner output packed in ShadedData (z and dither share
// z_dith: z in the high bits, 9 bits of RGB dither in the low bits), runs the
// RDP depth test against the cached framebuffer depth, and on pass runs the
// one- or two-cycle blender, dithering, coverage blending and writeback into
// the color/depth caches. x/y are unused here (kept for signature symmetry
// with the other per-pixel stages).
void depth_blend(int x, int y, uint primitive_index, ShadedData shaded)
{
	int z = shaded.z_dith >> 9;
	int dith = shaded.z_dith & 0x1ff;
	int coverage_count = shaded.coverage_count;
	u8x4 combined = shaded.combined;
	u8 shade_alpha = shaded.shade_alpha;
	// .y of static_depth_tmem indexes the depth/blend state for this primitive.
	uint blend_state_index = uint(state_indices.elems[primitive_index].static_depth_tmem.y);
	DerivedSetup derived = load_derived_setup(primitive_index);
	DepthBlendState depth_blend = load_depth_blend_state(blend_state_index);
	// Unpack the per-primitive depth/blend mode flags.
	bool force_blend = (depth_blend.flags & DEPTH_BLEND_FORCE_BLEND_BIT) != 0;
	bool z_compare = (depth_blend.flags & DEPTH_BLEND_DEPTH_TEST_BIT) != 0;
	bool z_update = (depth_blend.flags & DEPTH_BLEND_DEPTH_UPDATE_BIT) != 0;
	bool image_read_enable = (depth_blend.flags & DEPTH_BLEND_IMAGE_READ_ENABLE_BIT) != 0;
	bool color_on_coverage = (depth_blend.flags & DEPTH_BLEND_COLOR_ON_COVERAGE_BIT) != 0;
	bool blend_multicycle = (depth_blend.flags & DEPTH_BLEND_MULTI_CYCLE_BIT) != 0;
	bool aa_enable = (depth_blend.flags & DEPTH_BLEND_AA_BIT) != 0;
	bool dither_en = (depth_blend.flags & DEPTH_BLEND_DITHER_ENABLE_BIT) != 0;
	// Outputs of depth_test that feed the blender.
	bool blend_en;
	bool coverage_wrap;
	u8x2 blend_shift;
	u8x4 memory_color = decode_memory_color(image_read_enable);
	u8 memory_coverage = memory_color.a >> U8_C(5);
	bool z_pass = depth_test(z, derived.dz, derived.dz_compressed,
	                         current_depth, current_dz,
	                         coverage_count, memory_coverage,
	                         z_compare, depth_blend.z_mode,
	                         force_blend, aa_enable,
	                         blend_en, coverage_wrap, blend_shift);
	GENERIC_MESSAGE3(combined.x, combined.y, combined.z);
	// Pixel tests.
	// With AA enabled, zero coverage rejects the pixel even if Z passes.
	if (z_pass && (!aa_enable || coverage_count != 0))
	{
		// Blending
		BlendInputs blender_inputs =
			BlendInputs(combined, memory_color,
			            derived.fog_color, derived.blend_color, shade_alpha);
		u8x4 blend_modes = depth_blend.blend_modes0;
		// In two-cycle mode, cycle 0's blender output feeds cycle 1 as the pixel color.
		if (blend_multicycle)
		{
			blender_inputs.pixel_color.rgb =
				blender(blender_inputs,
				        blend_modes,
				        force_blend, blend_en, color_on_coverage, coverage_wrap, blend_shift, false);
			blend_modes = depth_blend.blend_modes1;
		}
		u8x3 rgb = blender(blender_inputs,
		                   blend_modes,
		                   force_blend, blend_en, color_on_coverage, coverage_wrap, blend_shift, true);
		// Dither
		if (dither_en)
			rgb = rgb_dither(rgb, dith);
		// Coverage blending
		int new_coverage = blend_coverage(coverage_count, memory_coverage, blend_en, depth_blend.coverage_mode);
		GENERIC_MESSAGE3(rgb.x, rgb.y, rgb.z);
		// Writeback: coverage goes back into alpha bits [7:5].
		write_color(u8x4(rgb, new_coverage << 5));
		// Z-writeback.
		if (z_update)
		{
			current_depth = z_compress(z);
			current_dz = u8(derived.dz_compressed);
			current_depth_dirty = true;
			if (FB_COLOR_DEPTH_ALIAS)
				alias_depth_to_color();
		}
		else if (FB_COLOR_DEPTH_ALIAS)
			alias_color_to_depth();
	}
}
#endif

View File

@ -0,0 +1,71 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef NOISE_H_
#define NOISE_H_
// Per-invocation 16-bit noise seed; all noise_get_* accessors slice bits off it.
u16 seeded_noise = U16_C(0);
// Integer hash from: https://www.shadertoy.com/view/XlXcW4 with slight modifications.
// Reseeds the noise state from pixel position and primitive number so noise is
// deterministic per (pixel, primitive).
void reseed_noise(uint x, uint y, uint primitive_offset)
{
	const uint NOISE_PRIME = 1103515245u;
	uvec3 h = uvec3(x, y, primitive_offset);
	// Three rounds of the shift-xor-rotate-multiply mix decorrelate the inputs.
	for (int round = 0; round < 3; round++)
		h = ((h >> 8u) ^ h.yzx) * NOISE_PRIME;
	seeded_noise = u16(h.x >> 16u);
}
// Noise input for the color combiner: three random bits placed at [8:6],
// with the constant 0x20 filling the low half.
i16 noise_get_combiner()
{
	u16 random_bits = (seeded_noise & U16_C(7u)) << U16_C(6u);
	return i16(random_bits | U16_C(0x20u));
}
// 3 bits of noise for alpha dithering.
int noise_get_dither_alpha()
{
	return int(seeded_noise) & 7;
}
// 9 bits of noise: 3 independent bits for each of R, G and B dithering.
int noise_get_dither_color()
{
	return int(seeded_noise) & 0x1ff;
}
// Random 8-bit threshold used when alpha-test dithering is enabled.
u8 noise_get_blend_threshold()
{
	u16 low_byte = seeded_noise & U16_C(0xffu);
	return u8(low_byte);
}
// 6 bits of gamma-dither noise per channel, assembled from the 16-bit seed.
uvec3 noise_get_full_gamma_dither()
{
	uint s = seeded_noise;
	uint r = s & 0x3f;
	uint g = (s >> 6u) & 0x3f;
	// Blue reuses low seed bits for its bottom 3 bits.
	uint b = ((s >> 9u) & 0x38) | (s & 7u);
	return uvec3(r, g, b);
}
// One gamma-dither bit per channel, taken from seed bits 0, 1 and 2.
uvec3 noise_get_partial_gamma_dither()
{
	uint s = seeded_noise;
	return uvec3(s & 1u, (s >> 1u) & 1u, (s >> 2u) & 1u);
}
#endif

View File

@ -0,0 +1,114 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef PERSPECTIVE_H_
#define PERSPECTIVE_H_
// Reciprocal lookup table for the perspective divide.
// Indexed by the top 6 bits of the normalized W fraction; each entry is
// (base reciprocal in 1.14 fixed point, slope for linear interpolation of the
// remaining 8 fraction bits). Values mirror the RDP's hardware divider LUT —
// do not alter them.
const i16x2 perspective_table[64] = i16x2[](
	i16x2(0x4000, -252 * 4), i16x2(0x3f04, -244 * 4), i16x2(0x3e10, -238 * 4), i16x2(0x3d22, -230 * 4),
	i16x2(0x3c3c, -223 * 4), i16x2(0x3b5d, -218 * 4), i16x2(0x3a83, -210 * 4), i16x2(0x39b1, -205 * 4),
	i16x2(0x38e4, -200 * 4), i16x2(0x381c, -194 * 4), i16x2(0x375a, -189 * 4), i16x2(0x369d, -184 * 4),
	i16x2(0x35e5, -179 * 4), i16x2(0x3532, -175 * 4), i16x2(0x3483, -170 * 4), i16x2(0x33d9, -166 * 4),
	i16x2(0x3333, -162 * 4), i16x2(0x3291, -157 * 4), i16x2(0x31f4, -155 * 4), i16x2(0x3159, -150 * 4),
	i16x2(0x30c3, -147 * 4), i16x2(0x3030, -143 * 4), i16x2(0x2fa1, -140 * 4), i16x2(0x2f15, -137 * 4),
	i16x2(0x2e8c, -134 * 4), i16x2(0x2e06, -131 * 4), i16x2(0x2d83, -128 * 4), i16x2(0x2d03, -125 * 4),
	i16x2(0x2c86, -123 * 4), i16x2(0x2c0b, -120 * 4), i16x2(0x2b93, -117 * 4), i16x2(0x2b1e, -115 * 4),
	i16x2(0x2aab, -113 * 4), i16x2(0x2a3a, -110 * 4), i16x2(0x29cc, -108 * 4), i16x2(0x2960, -106 * 4),
	i16x2(0x28f6, -104 * 4), i16x2(0x288e, -102 * 4), i16x2(0x2828, -100 * 4), i16x2(0x27c4, -98 * 4),
	i16x2(0x2762, -96 * 4), i16x2(0x2702, -94 * 4), i16x2(0x26a4, -92 * 4), i16x2(0x2648, -91 * 4),
	i16x2(0x25ed, -89 * 4), i16x2(0x2594, -87 * 4), i16x2(0x253d, -86 * 4), i16x2(0x24e7, -85 * 4),
	i16x2(0x2492, -83 * 4), i16x2(0x243f, -81 * 4), i16x2(0x23ee, -80 * 4), i16x2(0x239e, -79 * 4),
	i16x2(0x234f, -77 * 4), i16x2(0x2302, -76 * 4), i16x2(0x22b6, -74 * 4), i16x2(0x226c, -74 * 4),
	i16x2(0x2222, -72 * 4), i16x2(0x21da, -71 * 4), i16x2(0x2193, -70 * 4), i16x2(0x214d, -69 * 4),
	i16x2(0x2108, -67 * 4), i16x2(0x20c5, -67 * 4), i16x2(0x2082, -65 * 4), i16x2(0x2041, -65 * 4)
);
// Looks up an approximate reciprocal of w.
// Returns (interpolated reciprocal from perspective_table, normalization shift)
// for use as (x * rcp) >> shift by the caller.
ivec2 perspective_get_lut(int w)
{
	// Normalize w so its MSB lands at bit 14, then keep the 14-bit fraction.
	int norm_shift = min(14 - findMSB(w), 14);
	int frac14 = (w << norm_shift) & 0x3fff;
	// Top 6 fraction bits index the table; the low 8 bits linearly interpolate.
	ivec2 entry = ivec2(perspective_table[frac14 >> 8]);
	int recip = entry.x + ((entry.y * (frac14 & 0xff)) >> 10);
	return ivec2(recip, norm_shift);
}
// Non-perspective path: S and T pass through untouched, W is discarded.
ivec2 no_perspective_divide(ivec3 stw)
{
	return ivec2(stw.x, stw.y);
}
// s16 divided by s1.15.
// Classic approximation of a (x * rcp) >> shift with a LUT to find rcp.
// Divides stw.xy by stw.z, reproducing the RDP's hardware divider including
// its saturation behavior. Sets overflow (never clears it) when the result
// saturates or w is non-positive.
ivec2 perspective_divide(ivec3 stw, inout bool overflow)
{
	int w = stw.z;
	// Non-positive w is a special case: result is forced to max later.
	bool w_carry = w <= 0;
	w &= 0x7fff;
	ivec2 table = perspective_get_lut(w);
	int shift = table.y;
	ivec2 prod = stw.xy * table.x;
	// Mask of high product bits that must be all-0 or all-1 for the result
	// to fit; anything else means the divide overflowed.
	int temp_mask = ((1 << 30) - 1) & -((1 << 29) >> shift);
	ivec2 out_of_bounds = prod & temp_mask;
	ivec2 temp;
	if (shift != 14)
		temp = prod = prod >> (13 - shift);
	else
		temp = prod << 1;
	if (any(notEqual(out_of_bounds, ivec2(0))))
	{
		// Saturate each component independently toward +/-32k based on the
		// sign bit of the unshifted product.
		if (out_of_bounds.x != temp_mask && out_of_bounds.x != 0)
		{
			if ((prod.x & (1 << 29)) == 0)
				temp.x = 0x7fff;
			else
				temp.x = -0x8000;
			overflow = true;
		}
		if (out_of_bounds.y != temp_mask && out_of_bounds.y != 0)
		{
			if ((prod.y & (1 << 29)) == 0)
				temp.y = 0x7fff;
			else
				temp.y = -0x8000;
			overflow = true;
		}
	}
	if (w_carry)
	{
		temp = ivec2(0x7fff);
		overflow = true;
	}
	// Perspective divide produces a 17-bit signed coordinate, which is later clamped to 16-bit signed.
	// However, the LOD computation happens in 17 bits ...
	return clamp(temp, ivec2(-0x10000), ivec2(0xffff));
}
#endif

View File

@ -0,0 +1,191 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "small_types.h"
layout(local_size_x_id = 0, local_size_y_id = 1) in;
#include "debug.h"
#include "data_structures.h"
layout(set = 0, binding = 0, std430) readonly buffer TriangleSetupBuffer
{
TriangleSetupMem elems[];
} triangle_setup;
#include "load_triangle_setup.h"
layout(set = 0, binding = 1, std430) readonly buffer AttributeSetupBuffer
{
AttributeSetupMem elems[];
} attribute_setup;
#include "load_attribute_setup.h"
layout(set = 0, binding = 2, std430) readonly buffer DerivedSetupBuffer
{
DerivedSetupMem elems[];
} derived_setup;
#include "load_derived_setup.h"
layout(set = 0, binding = 3, std430) readonly buffer StaticRasterStateBuffer
{
StaticRasterizationStateMem elems[];
} static_raster_state;
#include "load_static_raster_state.h"
layout(set = 0, binding = 4, std430) readonly buffer StateIndicesBuffer
{
InstanceIndicesMem elems[];
} state_indices;
layout(set = 0, binding = 5, std430) readonly buffer SpanInfoOffsetBuffer
{
SpanInfoOffsetsMem elems[];
} span_offsets;
#include "load_span_offsets.h"
layout(set = 0, binding = 6, std430) readonly buffer SpanSetups
{
SpanSetupMem elems[];
} span_setups;
#include "load_span_setup.h"
layout(set = 0, binding = 7, std430) readonly buffer TMEM16
{
TMEMInstance16Mem instances[];
} tmem16;
layout(set = 0, binding = 7, std430) readonly buffer TMEM8
{
TMEMInstance8Mem instances[];
} tmem8;
layout(set = 0, binding = 8, std430) readonly buffer TileInfoBuffer
{
TileInfoMem elems[];
} tile_infos;
#include "load_tile_info.h"
layout(set = 2, binding = 0, std140) uniform GlobalConstants
{
GlobalFBInfo fb_info;
} global_constants;
layout(constant_id = 2) const int STATIC_STATE_FLAGS = 0;
layout(constant_id = 3) const int COMBINER_INPUTS_RGB0 = 0;
layout(constant_id = 4) const int COMBINER_INPUTS_ALPHA0 = 0;
layout(constant_id = 5) const int COMBINER_INPUTS_RGB1 = 0;
layout(constant_id = 6) const int COMBINER_INPUTS_ALPHA1 = 0;
layout(constant_id = 7) const int DITHER_TEX_SIZE_TEX_FMT = 0;
// Unpack the packed 32-bit specialization constants into individual byte-sized
// selector fields. Each combiner cycle takes four input selectors
// (mul-add, mul-sub, mul, add) for RGB and for alpha, packed one per byte.
const int COMBINER_INPUT_RGB0_MULADD = (COMBINER_INPUTS_RGB0 >> 0) & 0xff;
const int COMBINER_INPUT_RGB0_MULSUB = (COMBINER_INPUTS_RGB0 >> 8) & 0xff;
const int COMBINER_INPUT_RGB0_MUL = (COMBINER_INPUTS_RGB0 >> 16) & 0xff;
const int COMBINER_INPUT_RGB0_ADD = (COMBINER_INPUTS_RGB0 >> 24) & 0xff;
const int COMBINER_INPUT_ALPHA0_MULADD = (COMBINER_INPUTS_ALPHA0 >> 0) & 0xff;
const int COMBINER_INPUT_ALPHA0_MULSUB = (COMBINER_INPUTS_ALPHA0 >> 8) & 0xff;
const int COMBINER_INPUT_ALPHA0_MUL = (COMBINER_INPUTS_ALPHA0 >> 16) & 0xff;
const int COMBINER_INPUT_ALPHA0_ADD = (COMBINER_INPUTS_ALPHA0 >> 24) & 0xff;
const int COMBINER_INPUT_RGB1_MULADD = (COMBINER_INPUTS_RGB1 >> 0) & 0xff;
const int COMBINER_INPUT_RGB1_MULSUB = (COMBINER_INPUTS_RGB1 >> 8) & 0xff;
const int COMBINER_INPUT_RGB1_MUL = (COMBINER_INPUTS_RGB1 >> 16) & 0xff;
const int COMBINER_INPUT_RGB1_ADD = (COMBINER_INPUTS_RGB1 >> 24) & 0xff;
const int COMBINER_INPUT_ALPHA1_MULADD = (COMBINER_INPUTS_ALPHA1 >> 0) & 0xff;
const int COMBINER_INPUT_ALPHA1_MULSUB = (COMBINER_INPUTS_ALPHA1 >> 8) & 0xff;
const int COMBINER_INPUT_ALPHA1_MUL = (COMBINER_INPUTS_ALPHA1 >> 16) & 0xff;
const int COMBINER_INPUT_ALPHA1_ADD = (COMBINER_INPUTS_ALPHA1 >> 24) & 0xff;
// Dither mode, texture size and texture format, also packed one per byte.
const int DITHER = (DITHER_TEX_SIZE_TEX_FMT >> 0) & 0xff;
const int TEX_SIZE = (DITHER_TEX_SIZE_TEX_FMT >> 8) & 0xff;
const int TEX_FMT = (DITHER_TEX_SIZE_TEX_FMT >> 16) & 0xff;
#define RASTERIZER_SPEC_CONSTANT
#include "noise.h"
#include "shading.h"
layout(set = 0, binding = 9, std430) writeonly buffer ColorBuffer
{
mem_u8x4 elems[];
} color;
layout(set = 0, binding = 9, std430) writeonly buffer ColorBufferRaw
{
uint elems[];
} raw_color;
layout(set = 0, binding = 10, std430) writeonly buffer DepthBuffer
{
int elems[];
} depth;
layout(set = 0, binding = 11, std430) writeonly buffer ShadeAlpha
{
mem_u8 elems[];
} shade_alpha;
layout(set = 0, binding = 12, std430) writeonly buffer Coverage
{
mem_i8 elems[];
} coverage;
layout(set = 1, binding = 0, std430) readonly buffer TileWorkList
{
uvec4 elems[];
} tile_work_list;
// Rasterizer entry point. Each workgroup shades one tile taken from the tile
// work list (x/y tile coords, output tile instance, primitive index); each
// invocation shades one pixel and writes its results to the per-tile output
// buffers for the later depth/blend pass.
void main()
{
	uvec4 work = tile_work_list.elems[gl_WorkGroupID.x];
	int x = int(work.x * gl_WorkGroupSize.x + gl_LocalInvocationID.x);
	int y = int(work.y * gl_WorkGroupSize.y + gl_LocalInvocationID.y);
	uint tile_instance = work.z;
	uint primitive_index = work.w;
	ShadedData shaded;
	i8 coverage_value;
	// Linear slot for this pixel within its tile instance.
	uint index = tile_instance * (gl_WorkGroupSize.x * gl_WorkGroupSize.y) + gl_LocalInvocationIndex;
	if (shade_pixel(x, y, primitive_index, shaded))
	{
		coverage_value = i8(shaded.coverage_count);
		// Coverage counts <= 8 mean a normally-shaded pixel; higher values
		// carry flag bits (copy/fill) instead of a count.
		if (coverage_value <= I8_C(8))
		{
			// Workaround curious bug with glslang, need to cast manually to uvec4 first.
			color.elems[index] = mem_u8x4(uvec4(shaded.combined));
			shade_alpha.elems[index] = mem_u8(shaded.shade_alpha);
			depth.elems[index] = shaded.z_dith;
		}
		else if ((coverage_value & COVERAGE_COPY_BIT) != 0)
		{
			// For copy pipe, we use a raw 32-bit word to represent the loaded texel.
			raw_color.elems[index] = shaded.z_dith;
		}
	}
	else
		coverage_value = I8_C(-1);
	// -1 marks "not covered"; the resolve pass skips such pixels.
	coverage.elems[index] = mem_i8(coverage_value);
}

View File

@ -0,0 +1,357 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef SHADING_H_
#define SHADING_H_
#ifdef RASTERIZER_SPEC_CONSTANT
const int SCALING_LOG2 = (STATIC_STATE_FLAGS >> RASTERIZATION_UPSCALING_LOG2_BIT_OFFSET) & 3;
const int SCALING_FACTOR = 1 << SCALING_LOG2;
#endif
#include "coverage.h"
#include "interpolation.h"
#include "perspective.h"
#include "texture.h"
#include "dither.h"
#include "combiner.h"
// Shades one pixel of one primitive: span/coverage test, attribute
// interpolation, texture sampling, LOD, combiner, and alpha test.
// Returns false if the pixel is rejected (outside span, zero coverage,
// failed alpha test); on success fills `shaded` with the combiner output,
// packed z+dither, coverage count and shade alpha for the depth/blend stage.
bool shade_pixel(int x, int y, uint primitive_index, out ShadedData shaded)
{
	SpanInfoOffsets span_offsets = load_span_offsets(primitive_index);
	// Reject pixels outside the primitive's scaled Y range.
	if ((y < (SCALING_FACTOR * span_offsets.ylo)) || (y > (span_offsets.yhi * SCALING_FACTOR + (SCALING_FACTOR - 1))))
		return false;
	uint setup_flags = uint(triangle_setup.elems[primitive_index].flags);
	if (SCALING_FACTOR > 1)
	{
		// Some primitives opt out of upscaling; snap to the unscaled pixel grid.
		if ((setup_flags & TRIANGLE_SETUP_DISABLE_UPSCALING_BIT) != 0u)
		{
			x &= ~(SCALING_FACTOR - 1);
			y &= ~(SCALING_FACTOR - 1);
		}
	}
	SpanSetup span_setup = load_span_setup(SCALING_FACTOR * span_offsets.offset + (y - SCALING_FACTOR * span_offsets.ylo));
	if (span_setup.valid_line == U16_C(0))
		return false;
	uint setup_tile = uint(triangle_setup.elems[primitive_index].tile);
	AttributeSetup attr = load_attribute_setup(primitive_index);
	uvec4 states = uvec4(state_indices.elems[primitive_index].static_depth_tmem);
	uint static_state_index = states.x;
	uint tmem_instance_index = states.z;
	StaticRasterizationState static_state = load_static_rasterization_state(static_state_index);
	uint static_state_flags = static_state.flags;
	int static_state_dither = static_state.dither;
	u8x4 combiner_inputs_rgb0 = static_state.combiner_inputs_rgb0;
	u8x4 combiner_inputs_alpha0 = static_state.combiner_inputs_alpha0;
	u8x4 combiner_inputs_rgb1 = static_state.combiner_inputs_rgb1;
	u8x4 combiner_inputs_alpha1 = static_state.combiner_inputs_alpha1;
#ifdef RASTERIZER_SPEC_CONSTANT
	// When this shader variant was specialized for a fixed pipeline state,
	// override the buffer-loaded state with compile-time constants so the
	// compiler can fold away most of the branches below.
	if ((STATIC_STATE_FLAGS & RASTERIZATION_USE_SPECIALIZATION_CONSTANT_BIT) != 0)
	{
		static_state_flags = STATIC_STATE_FLAGS;
		static_state_dither = DITHER;
		combiner_inputs_rgb0.x = u8(COMBINER_INPUT_RGB0_MULADD);
		combiner_inputs_rgb0.y = u8(COMBINER_INPUT_RGB0_MULSUB);
		combiner_inputs_rgb0.z = u8(COMBINER_INPUT_RGB0_MUL);
		combiner_inputs_rgb0.w = u8(COMBINER_INPUT_RGB0_ADD);
		combiner_inputs_alpha0.x = u8(COMBINER_INPUT_ALPHA0_MULADD);
		combiner_inputs_alpha0.y = u8(COMBINER_INPUT_ALPHA0_MULSUB);
		combiner_inputs_alpha0.z = u8(COMBINER_INPUT_ALPHA0_MUL);
		combiner_inputs_alpha0.w = u8(COMBINER_INPUT_ALPHA0_ADD);
		combiner_inputs_rgb1.x = u8(COMBINER_INPUT_RGB1_MULADD);
		combiner_inputs_rgb1.y = u8(COMBINER_INPUT_RGB1_MULSUB);
		combiner_inputs_rgb1.z = u8(COMBINER_INPUT_RGB1_MUL);
		combiner_inputs_rgb1.w = u8(COMBINER_INPUT_RGB1_ADD);
		combiner_inputs_alpha1.x = u8(COMBINER_INPUT_ALPHA1_MULADD);
		combiner_inputs_alpha1.y = u8(COMBINER_INPUT_ALPHA1_MULSUB);
		combiner_inputs_alpha1.z = u8(COMBINER_INPUT_ALPHA1_MUL);
		combiner_inputs_alpha1.w = u8(COMBINER_INPUT_ALPHA1_ADD);
	}
#endif
	// This is a great case for specialization constants.
	bool tlut = (static_state_flags & RASTERIZATION_TLUT_BIT) != 0;
	bool tlut_type = (static_state_flags & RASTERIZATION_TLUT_TYPE_BIT) != 0;
	bool sample_quad = (static_state_flags & RASTERIZATION_SAMPLE_MODE_BIT) != 0;
	bool cvg_times_alpha = (static_state_flags & RASTERIZATION_CVG_TIMES_ALPHA_BIT) != 0;
	bool alpha_cvg_select = (static_state_flags & RASTERIZATION_ALPHA_CVG_SELECT_BIT) != 0;
	bool perspective = (static_state_flags & RASTERIZATION_PERSPECTIVE_CORRECT_BIT) != 0;
	bool tex_lod_en = (static_state_flags & RASTERIZATION_TEX_LOD_ENABLE_BIT) != 0;
	bool sharpen_lod_en = (static_state_flags & RASTERIZATION_SHARPEN_LOD_ENABLE_BIT) != 0;
	bool detail_lod_en = (static_state_flags & RASTERIZATION_DETAIL_LOD_ENABLE_BIT) != 0;
	bool aa_enable = (static_state_flags & RASTERIZATION_AA_BIT) != 0;
	bool multi_cycle = (static_state_flags & RASTERIZATION_MULTI_CYCLE_BIT) != 0;
	bool interlace_en = (static_state_flags & RASTERIZATION_INTERLACE_FIELD_BIT) != 0;
	bool fill_en = (static_state_flags & RASTERIZATION_FILL_BIT) != 0;
	bool copy_en = (static_state_flags & RASTERIZATION_COPY_BIT) != 0;
	bool alpha_test = (static_state_flags & RASTERIZATION_ALPHA_TEST_BIT) != 0;
	bool alpha_test_dither = (static_state_flags & RASTERIZATION_ALPHA_TEST_DITHER_BIT) != 0;
	bool mid_texel = (static_state_flags & RASTERIZATION_SAMPLE_MID_TEXEL_BIT) != 0;
	bool uses_texel0 = (static_state_flags & RASTERIZATION_USES_TEXEL0_BIT) != 0;
	bool uses_texel1 = (static_state_flags & RASTERIZATION_USES_TEXEL1_BIT) != 0;
	bool uses_pipelined_texel1 = (static_state_flags & RASTERIZATION_USES_PIPELINED_TEXEL1_BIT) != 0;
	bool uses_lod = (static_state_flags & RASTERIZATION_USES_LOD_BIT) != 0;
	bool convert_one = (static_state_flags & RASTERIZATION_CONVERT_ONE_BIT) != 0;
	bool bilerp0 = (static_state_flags & RASTERIZATION_BILERP_0_BIT) != 0;
	bool bilerp1 = (static_state_flags & RASTERIZATION_BILERP_1_BIT) != 0;
	// Only reseed noise when something downstream actually consumes it.
	if ((static_state_flags & RASTERIZATION_NEED_NOISE_BIT) != 0)
		reseed_noise(x, y, primitive_index + global_constants.fb_info.base_primitive_index);
	bool flip = (setup_flags & TRIANGLE_SETUP_FLIP_BIT) != 0;
	// Copy pipe: fetch a raw texel and return it without running the combiner.
	if (copy_en)
	{
		bool valid = x >= span_setup.start_x && x <= span_setup.end_x;
		if (!valid)
			return false;
		ivec2 st;
		int s_offset;
		interpolate_st_copy(span_setup, attr.dstzw_dx, x, perspective, flip, st, s_offset);
		uint tile0 = uint(setup_tile) & 7u;
		uint tile_info_index0 = uint(state_indices.elems[primitive_index].tile_infos[tile0]);
		TileInfo tile_info0 = load_tile_info(tile_info_index0);
#ifdef RASTERIZER_SPEC_CONSTANT
		if ((STATIC_STATE_FLAGS & RASTERIZATION_USE_STATIC_TEXTURE_SIZE_FORMAT_BIT) != 0)
		{
			tile_info0.fmt = u8(TEX_FMT);
			tile_info0.size = u8(TEX_SIZE);
		}
#endif
		int texel0 = sample_texture_copy(tile_info0, tmem_instance_index, st, s_offset, tlut, tlut_type);
		// Copy pipe reuses z_dith to carry the raw texel word.
		shaded.z_dith = texel0;
		shaded.coverage_count = U8_C(COVERAGE_COPY_BIT);
		// Copy-pipe alpha test: 16-bit framebuffers reject texels with alpha bit clear.
		if (alpha_test && global_constants.fb_info.fb_size == 2 && (texel0 & 1) == 0)
			return false;
		return true;
	}
	// Fill pipe: no shading at all, only span membership matters.
	else if (fill_en)
	{
		shaded.coverage_count = U8_C(COVERAGE_FILL_BIT);
		return x >= span_setup.start_x && x <= span_setup.end_x;
	}
	int coverage = compute_coverage(span_setup.xleft, span_setup.xright, x);
	// There is no way we can gain coverage here.
	// Reject work as fast as possible.
	if (coverage == 0)
		return false;
	int coverage_count = bitCount(coverage);
	// If we're not using AA, only the first coverage bit is relevant.
	if (!aa_enable && (coverage & 1) == 0)
		return false;
	DerivedSetup derived = load_derived_setup(primitive_index);
	int dx = x - span_setup.interpolation_base_x;
	int interpolation_direction = flip ? 1 : -1;
	// Interpolate attributes.
	u8x4 shade = interpolate_rgba(span_setup.rgba, attr.drgba_dx, attr.drgba_dy,
	                              dx, coverage);
	ivec2 st, st_dx, st_dy;
	int z;
	bool perspective_overflow = false;
	int tex_interpolation_direction = interpolation_direction;
	if (SCALING_FACTOR > 1 && uses_lod)
		// NATIVE_LOD keeps LOD deltas in unscaled units even when upscaling.
		if ((setup_flags & TRIANGLE_SETUP_NATIVE_LOD_BIT) != 0)
			tex_interpolation_direction *= SCALING_FACTOR;
	interpolate_stz(span_setup.stzw, attr.dstzw_dx, attr.dstzw_dy, dx, coverage, perspective, uses_lod,
	                tex_interpolation_direction, st, st_dx, st_dy, z, perspective_overflow);
	// Sample textures.
	uint tile0 = uint(setup_tile) & 7u;
	uint tile1 = (tile0 + 1) & 7u;
	uint max_level = uint(setup_tile) >> 3u;
	int min_lod = derived.min_lod;
	i16 lod_frac;
	if (uses_lod)
	{
		compute_lod_2cycle(tile0, tile1, lod_frac, max_level, min_lod, st, st_dx, st_dy, perspective_overflow,
		                   tex_lod_en, sharpen_lod_en, detail_lod_en);
	}
	i16x4 texel0, texel1;
	if (uses_texel0)
	{
		uint tile_info_index0 = uint(state_indices.elems[primitive_index].tile_infos[tile0]);
		TileInfo tile_info0 = load_tile_info(tile_info_index0);
#ifdef RASTERIZER_SPEC_CONSTANT
		if ((STATIC_STATE_FLAGS & RASTERIZATION_USE_STATIC_TEXTURE_SIZE_FORMAT_BIT) != 0)
		{
			tile_info0.fmt = u8(TEX_FMT);
			tile_info0.size = u8(TEX_SIZE);
		}
#endif
		texel0 = sample_texture(tile_info0, tmem_instance_index, st, tlut, tlut_type, sample_quad, mid_texel, false, i16x4(0));
		if (!sample_quad && !bilerp0)
			texel0 = texture_convert_factors(texel0, derived.factors);
	}
	// A very awkward mechanism where we peek into the next pixel, or in some cases, the next scanline's first pixel.
	if (uses_pipelined_texel1)
	{
		bool valid_line = uint(span_setups.elems[SCALING_FACTOR * span_offsets.offset + (y - SCALING_FACTOR * span_offsets.ylo + 1)].valid_line) != 0u;
		bool long_span = span_setup.lodlength >= 8;
		bool end_span = x == (flip ? span_setup.end_x : span_setup.start_x);
		if (end_span && long_span && valid_line)
		{
			// At the end of a long span, texel1 comes from the next line's start.
			ivec3 stw = span_setups.elems[SCALING_FACTOR * span_offsets.offset + (y - SCALING_FACTOR * span_offsets.ylo + 1)].stzw.xyw >> 16;
			if (perspective)
			{
				bool st_overflow;
				st = perspective_divide(stw, st_overflow);
			}
			else
				st = no_perspective_divide(stw);
		}
		else
			st = interpolate_st_single(span_setup.stzw, attr.dstzw_dx, dx + interpolation_direction * SCALING_FACTOR, perspective);
		tile1 = tile0;
		uses_texel1 = true;
	}
	if (uses_texel1)
	{
		if (convert_one && !bilerp1)
		{
			// YUV convert path: texel1 is derived from texel0 via the convert factors.
			texel1 = texture_convert_factors(texel0, derived.factors);
		}
		else
		{
			uint tile_info_index1 = uint(state_indices.elems[primitive_index].tile_infos[tile1]);
			TileInfo tile_info1 = load_tile_info(tile_info_index1);
#ifdef RASTERIZER_SPEC_CONSTANT
			if ((STATIC_STATE_FLAGS & RASTERIZATION_USE_STATIC_TEXTURE_SIZE_FORMAT_BIT) != 0)
			{
				tile_info1.fmt = u8(TEX_FMT);
				tile_info1.size = u8(TEX_SIZE);
			}
#endif
			texel1 = sample_texture(tile_info1, tmem_instance_index, st, tlut, tlut_type, sample_quad, mid_texel,
			                        convert_one, texel0);
			if (!sample_quad && !tlut && !bilerp1)
				texel1 = texture_convert_factors(texel1, derived.factors);
		}
	}
	int rgb_dith, alpha_dith;
	// Interlaced output halves the effective Y for dither pattern purposes.
	dither_coefficients(x, y >> int(interlace_en), static_state_dither >> 2, static_state_dither & 3, rgb_dith, alpha_dith);
	// Run combiner.
	u8x4 combined;
	u8 alpha_reference;
	if (multi_cycle)
	{
		CombinerInputs combined_inputs =
			CombinerInputs(derived.constant_muladd0, derived.constant_mulsub0, derived.constant_mul0, derived.constant_add0,
			               shade, u8x4(0), texel0, texel1, lod_frac, noise_get_combiner());
		combined_inputs.combined = combiner_cycle0(combined_inputs,
		                                           combiner_inputs_rgb0,
		                                           combiner_inputs_alpha0,
		                                           alpha_dith, coverage_count, cvg_times_alpha, alpha_cvg_select,
		                                           alpha_test, alpha_reference);
		combined_inputs.constant_muladd = derived.constant_muladd1;
		combined_inputs.constant_mulsub = derived.constant_mulsub1;
		combined_inputs.constant_mul = derived.constant_mul1;
		combined_inputs.constant_add = derived.constant_add1;
		// Pipelining, texel1 is promoted to texel0 in cycle1.
		// I don't think hardware ever intended for you to access texels in second cycle due to this nature.
		i16x4 tmp_texel = combined_inputs.texel0;
		combined_inputs.texel0 = combined_inputs.texel1;
		// Following the pipelining, texel1 should become texel0 of next pixel,
		// but let's not go there ...
		combined_inputs.texel1 = tmp_texel;
		combined = u8x4(combiner_cycle1(combined_inputs,
		                                combiner_inputs_rgb1,
		                                combiner_inputs_alpha1,
		                                alpha_dith, coverage_count, cvg_times_alpha, alpha_cvg_select));
	}
	else
	{
		// One-cycle mode runs only the cycle-1 combiner with the cycle-1 constants.
		CombinerInputs combined_inputs =
			CombinerInputs(derived.constant_muladd1, derived.constant_mulsub1, derived.constant_mul1, derived.constant_add1,
			               shade, u8x4(0), texel0, texel1, lod_frac, noise_get_combiner());
		combined = u8x4(combiner_cycle1(combined_inputs,
		                                combiner_inputs_rgb1,
		                                combiner_inputs_alpha1,
		                                alpha_dith, coverage_count, cvg_times_alpha, alpha_cvg_select));
		alpha_reference = combined.a;
	}
	// After combiner, color can be modified to 0 through alpha-to-cvg, so check for potential write_enable here.
	// If we're not using AA, the first coverage bit is used instead, coverage count is ignored.
	if (aa_enable && coverage_count == 0)
		return false;
	if (alpha_test)
	{
		u8 alpha_threshold;
		if (alpha_test_dither)
			alpha_threshold = noise_get_blend_threshold();
		else
			alpha_threshold = derived.blend_color.a;
		if (alpha_reference < alpha_threshold)
			return false;
	}
	shaded.combined = combined;
	// Pack z (high bits) and 9 bits of RGB dither (low bits) into one word.
	shaded.z_dith = (z << 9) | rgb_dith;
	shaded.coverage_count = u8(coverage_count);
	// Shade alpha needs to be passed separately since it might affect the blending stage.
	shaded.shade_alpha = u8(min(shade.a + alpha_dith, 0xff));
	return true;
}
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,126 @@
{
"include": [ "../../Granite/assets/shaders/inc" ],
"shaders": [
{
"name": "tmem_update",
"compute": true,
"path": "tmem_update.comp"
},
{
"name": "span_setup",
"compute": true,
"path": "span_setup.comp"
},
{
"name": "clear_indirect_buffer",
"compute": true,
"path": "clear_indirect_buffer.comp"
},
{
"name": "tile_binning_combined",
"compute": true,
"path": "tile_binning_combined.comp",
"variants": [
{ "define": "SUBGROUP", "count": 2, "resolve": true },
{ "define": "UBERSHADER", "count": 2, "resolve": true },
{ "define": "SMALL_TYPES", "count": 2, "resolve": true }
]
},
{
"name": "ubershader",
"path": "ubershader.comp",
"compute": true,
"variants": [
{ "define": "SUBGROUP", "count": 2, "resolve": true },
{ "define": "SMALL_TYPES", "count": 2, "resolve": true }
]
},
{
"name": "depth_blend",
"path": "depth_blend.comp",
"compute": true,
"variants": [
{ "define": "SUBGROUP", "count": 2, "resolve": true },
{ "define": "SMALL_TYPES", "count": 2, "resolve": true }
]
},
{
"name": "rasterizer",
"path": "rasterizer.comp",
"compute": true,
"variants": [
{ "define": "SMALL_TYPES", "count": 2, "resolve": true }
]
},
{
"name": "fullscreen",
"path": "fullscreen.vert"
},
{
"name": "vi_scale",
"path": "vi_scale.frag"
},
{
"name": "vi_divot",
"path": "vi_divot.frag",
"variants": [
{ "define": "FETCH_BUG", "count": 2 }
]
},
{
"name": "vi_fetch",
"path": "vi_fetch.frag",
"variants": [
{ "define": "FETCH_BUG", "count": 2 }
]
},
{
"name": "vi_blend_fields",
"path": "vi_blend_fields.frag"
},
{
"name": "extract_vram",
"path": "extract_vram.comp",
"compute": true
},
{
"name": "masked_rdram_resolve",
"path": "masked_rdram_resolve.comp",
"compute": true
},
{
"name": "clear_write_mask",
"path": "clear_write_mask.comp",
"compute": true
},
{
"name": "update_upscaled_domain_post",
"path": "update_upscaled_domain_post.comp",
"compute": true
},
{
"name": "update_upscaled_domain_pre",
"path": "update_upscaled_domain_pre.comp",
"compute": true
},
{
"name": "update_upscaled_domain_resolve",
"path": "update_upscaled_domain_resolve.comp",
"compute": true
},
{
"name": "clear_super_sampled_write_mask",
"path": "clear_super_sampled_write_mask.comp",
"compute": true
},
{
"name": "vi_deinterlace_vert",
"path": "vi_deinterlace.vert"
},
{
"name": "vi_deinterlace_frag",
"path": "vi_deinterlace.frag"
}
]
}

View File

@ -0,0 +1,121 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
// Utility header to smooth over the difference between
// 8/16-bit integer arithmetic vs. just 8/16-bit storage.
#ifndef SMALL_INTEGERS_H_
#define SMALL_INTEGERS_H_

// 8/16-bit *storage* in SSBOs is always required, regardless of whether the
// device supports native small-integer arithmetic.
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require

#if SMALL_TYPES
// Native path: the device supports 8/16-bit arithmetic, so both the memory
// (mem_*) aliases and the arithmetic (u8/i16/...) aliases map to the real
// small types, and the literal helpers construct true small-typed constants.
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#define mem_u8 uint8_t
#define mem_u16 uint16_t
#define mem_u8x2 u8vec2
#define mem_u16x2 u16vec2
#define mem_u8x3 u8vec3
#define mem_u16x3 u16vec3
#define mem_u8x4 u8vec4
#define mem_u16x4 u16vec4
#define mem_i8 int8_t
#define mem_i16 int16_t
#define mem_i8x2 i8vec2
#define mem_i16x2 i16vec2
#define mem_i8x3 i8vec3
#define mem_i16x3 i16vec3
#define mem_i8x4 i8vec4
#define mem_i16x4 i16vec4
#define u8 uint8_t
#define u16 uint16_t
#define u8x2 u8vec2
#define u16x2 u16vec2
#define u8x3 u8vec3
#define u16x3 u16vec3
#define u8x4 u8vec4
#define u16x4 u16vec4
#define i8 int8_t
#define i16 int16_t
#define i8x2 i8vec2
#define i16x2 i16vec2
#define i8x3 i8vec3
#define i16x3 i16vec3
#define i8x4 i8vec4
#define i16x4 i16vec4
#define U8_C(x) uint8_t(x)
#define I8_C(x) int8_t(x)
#define U16_C(x) uint16_t(x)
#define I16_C(x) int16_t(x)
#else
// Fallback path: memory aliases still use the small storage types (the buffer
// layouts must not change), but all *arithmetic* is widened to plain 32-bit
// int/ivecN. Note the fallback makes unsigned and signed aliases identical;
// code using these types must not rely on unsigned wrap semantics.
#define mem_u8 uint8_t
#define mem_u16 uint16_t
#define mem_u8x2 u8vec2
#define mem_u16x2 u16vec2
#define mem_u8x3 u8vec3
#define mem_u16x3 u16vec3
#define mem_u8x4 u8vec4
#define mem_u16x4 u16vec4
#define mem_i8 int8_t
#define mem_i16 int16_t
#define mem_i8x2 i8vec2
#define mem_i16x2 i16vec2
#define mem_i8x3 i8vec3
#define mem_i16x3 i16vec3
#define mem_i8x4 i8vec4
#define mem_i16x4 i16vec4
#define u8 int
#define u16 int
#define u8x2 ivec2
#define u16x2 ivec2
#define u8x3 ivec3
#define u16x3 ivec3
#define u8x4 ivec4
#define u16x4 ivec4
#define i8 int
#define i16 int
#define i8x2 ivec2
#define i16x2 ivec2
#define i8x3 ivec3
#define i16x3 ivec3
#define i8x4 ivec4
#define i16x4 ivec4
#define U8_C(x) int(x)
#define I8_C(x) int(x)
#define U16_C(x) int(x)
#define I16_C(x) int(x)
#endif
#endif

View File

@ -0,0 +1,227 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "small_types.h"
#include "debug.h"
layout(local_size_x_id = 0) in;
layout(constant_id = 1) const int SCALING_LOG2 = 0;
const int SCALING_FACTOR = 1 << SCALING_LOG2;
#include "data_structures.h"
layout(std430, set = 0, binding = 0) readonly buffer TriangleSetupBuffer
{
TriangleSetupMem elems[];
} triangle_setup;
#include "load_triangle_setup.h"
layout(std430, set = 0, binding = 1) readonly buffer AttributeSetupBuffer
{
AttributeSetupMem elems[];
} attribute_setup;
#include "load_attribute_setup.h"
layout(set = 0, binding = 2, std430) readonly buffer ScissorStateBuffer
{
ScissorStateMem elems[];
} scissor_state;
#include "load_scissor_state.h"
layout(std430, set = 0, binding = 3) writeonly buffer SpanSetups
{
SpanSetupMem elems[];
} span_setups;
#include "store_span_setup.h"
layout(set = 1, binding = 0) uniform utextureBuffer uInterpolationJobs;
const int SUBPIXELS = 4;
const int SUBPIXELS_LOG2 = 2;
// Convert a 16.16 signed value to 16.3. We have 8 subpixels in X direction after snapping.
ivec4 quantize_x(ivec4 x)
{
    // OR a sticky bit into the result whenever any of the 12 dropped fraction
    // bits are non-zero, so sub-precision content is not silently lost.
    ivec4 has_residue = ivec4(notEqual(x & 0xfff, ivec4(0)));
    return (x >> 12) | has_residue;
}
// Horizontal minimum of a 4-component integer vector.
int min4(ivec4 v)
{
    return min(min(v.x, v.y), min(v.z, v.w));
}
// Horizontal maximum of a 4-component integer vector.
int max4(ivec4 v)
{
    return max(max(v.x, v.y), max(v.z, v.w));
}
// Step a per-scanline derivative by dy upscaled lines: whole unscaled lines
// advance by the full derivative, the sub-line remainder by the derivative
// scaled down to the upscaled grid.
ivec4 interpolate_snapped(ivec4 dvalue, int dy)
{
    int whole_lines = dy >> SCALING_LOG2;
    int sub_lines = dy & (SCALING_FACTOR - 1);
    return whole_lines * dvalue + sub_lines * (dvalue >> SCALING_LOG2);
}
// Per-scanline span setup: one invocation handles one (upscaled) scanline of
// one primitive, interpolating attributes to the line and computing the
// scissored X extents for all 4 Y-subpixels.
void main()
{
    // Job = (primitive index, first unscaled line, last unscaled line).
    ivec3 job_indices = ivec3(texelFetch(uInterpolationJobs, int(gl_WorkGroupID.x)).xyz);
    int primitive_index = job_indices.x;
    int base_y = job_indices.y * SCALING_FACTOR;
    int max_y = job_indices.z * SCALING_FACTOR + (SCALING_FACTOR - 1);
    int y = base_y + int(gl_LocalInvocationIndex);
    // Workgroup size may exceed the job's line count; excess threads bail.
    if (y > max_y)
        return;
    TriangleSetup setup = load_triangle_setup(primitive_index);
    AttributeSetup attr = load_attribute_setup(primitive_index);
    ScissorState scissor = load_scissor_state(primitive_index);
    bool flip = (setup.flags & TRIANGLE_SETUP_FLIP_BIT) != 0;
    bool interlace_en = (setup.flags & TRIANGLE_SETUP_INTERLACE_FIELD_BIT) != 0;
    bool keep_odd_field = (setup.flags & TRIANGLE_SETUP_INTERLACE_KEEP_ODD_BIT) != 0;
    SpanSetup span_setup;
    // Interpolate RGBA, STZW to their scanline.
    {
        bool do_offset = (setup.flags & TRIANGLE_SETUP_DO_OFFSET_BIT) != 0;
        bool skip_xfrac = (setup.flags & TRIANGLE_SETUP_SKIP_XFRAC_BIT) != 0;
        // yh is in 1/4-pixel units; whole-pixel base, then upscale.
        int y_interpolation_base = int(setup.yh) >> 2;
        y_interpolation_base *= SCALING_FACTOR;
        // For high-resolution interpolation, make sure we snap interpolation correctly at whole pixels,
        // and quantize derivatives in-between pixels.
        int dy = y - y_interpolation_base;
        int xh = setup.xh * SCALING_FACTOR + dy * (setup.dxhdy << 2);
        ivec4 drgba_diff = ivec4(0);
        ivec4 dstzw_diff = ivec4(0);
        // In do_offset mode, varyings are latched at last subpixel line instead of first (for some reason).
        if (do_offset)
        {
            xh += (SCALING_FACTOR * 3) * setup.dxhdy;
            // NOTE(review): ~0x1ff truncation mirrors the RDP's fixed-point
            // precision on the de/dy derivatives — confirm against hardware docs.
            ivec4 drgba_deh = attr.drgba_de & ~0x1ff;
            ivec4 drgba_dyh = attr.drgba_dy & ~0x1ff;
            drgba_diff = drgba_deh - (drgba_deh >> 2) - drgba_dyh + (drgba_dyh >> 2);
            ivec4 dstzw_deh = attr.dstzw_de & ~0x1ff;
            ivec4 dstzw_dyh = attr.dstzw_dy & ~0x1ff;
            dstzw_diff = dstzw_deh - (dstzw_deh >> 2) - dstzw_dyh + (dstzw_dyh >> 2);
        }
        int base_x = xh >> 15;
        int xfrac = skip_xfrac ? 0 : ((xh >> 7) & 0xff);
        // Advance attributes down to this line (de), then back along the edge
        // by the X fraction (dx), with RDP-style precision truncation masks.
        ivec4 rgba = attr.rgba + interpolate_snapped(attr.drgba_de, dy);
        rgba = ((rgba & ~0x1ff) + drgba_diff - interpolate_snapped((attr.drgba_dx >> 8) & ~1, xfrac)) & ~0x3ff;
        ivec4 stzw = attr.stzw + interpolate_snapped(attr.dstzw_de, dy);
        stzw = ((stzw & ~0x1ff) + dstzw_diff - interpolate_snapped((attr.dstzw_dx >> 8) & ~1, xfrac)) & ~0x3ff;
        span_setup.rgba = rgba;
        span_setup.stzw = stzw;
        span_setup.interpolation_base_x = base_x;
    }
    // Check Y dimension.
    int yh_interpolation_base = int(setup.yh) & ~(SUBPIXELS - 1);
    int ym_interpolation_base = int(setup.ym);
    yh_interpolation_base *= SCALING_FACTOR;
    ym_interpolation_base *= SCALING_FACTOR;
    // Work in 1/4-pixel (subpixel) Y units: 4 sub-lines per scanline.
    int y_sub = int(y * SUBPIXELS);
    ivec4 y_subs = y_sub + ivec4(0, 1, 2, 3);
    int ylo = max(setup.yh, scissor.ylo) * SCALING_FACTOR;
    int yhi = min(setup.yl, scissor.yhi) * SCALING_FACTOR;
    bvec4 clip_lo_y = lessThan(y_subs, ivec4(ylo));
    bvec4 clip_hi_y = greaterThanEqual(y_subs, ivec4(yhi));
    uvec4 clip_y = uvec4(clip_lo_y) | uvec4(clip_hi_y);
    // Interpolate X at all 4 Y-subpixels.
    ivec4 xh = setup.xh * SCALING_FACTOR + (y_subs - yh_interpolation_base) * setup.dxhdy;
    ivec4 xm = setup.xm * SCALING_FACTOR + (y_subs - yh_interpolation_base) * setup.dxmdy;
    ivec4 xl = setup.xl * SCALING_FACTOR + (y_subs - ym_interpolation_base) * setup.dxldy;
    // Above the mid vertex the minor edge follows XM, below it XL.
    xl = mix(xl, xm, lessThan(y_subs, ivec4(SCALING_FACTOR * setup.ym)));
    // If we have overflows, we can become sensitive to this in invalid_line check, where
    // checks that should pass fail, and vice versa.
    // Note that we shaved off one bit in triangle setup for upscaling purposes,
    // so this should be 28 bits normally.
    xl = bitfieldExtract(xl, 0, 27 + SCALING_LOG2);
    xh = bitfieldExtract(xh, 0, 27 + SCALING_LOG2);
    ivec4 xh_shifted = quantize_x(xh);
    ivec4 xl_shifted = quantize_x(xl);
    // Major edge is on the left for flipped (right-major) triangles.
    ivec4 xleft, xright;
    if (flip)
    {
        xleft = xh_shifted;
        xright = xl_shifted;
    }
    else
    {
        xleft = xl_shifted;
        xright = xh_shifted;
    }
    bvec4 invalid_line = greaterThan(xleft >> 1, xright >> 1);
    ivec4 lo_scissor = ivec4(SCALING_FACTOR * (scissor.xlo << 1));
    ivec4 hi_scissor = ivec4(SCALING_FACTOR * (scissor.xhi << 1));
    // Whole span entirely outside the X scissor on either side.
    bool all_over = all(greaterThanEqual(min(xleft, xright), hi_scissor));
    bool all_under = all(lessThan(max(xleft, xright), lo_scissor));
    xleft = max(xleft, lo_scissor);
    xleft = min(xleft, hi_scissor);
    xright = max(xright, lo_scissor);
    xright = min(xright, hi_scissor);
    invalid_line = bvec4(uvec4(invalid_line) | clip_y);
    // Invalid sub-lines get degenerate extents so they never win min/max below.
    xleft = mix(xleft, ivec4(0xffff), invalid_line);
    xright = mix(xright, ivec4(0), invalid_line);
    int start_x = min4(xleft) >> 3;
    int end_x = max4(xright) >> 3;
    span_setup.xleft = xleft;
    span_setup.xright = xright;
    span_setup.start_x = start_x;
    span_setup.end_x = end_x;
    span_setup.valid_line = int(!all(invalid_line) && !all_over && !all_under);
    // Interlacing drops every other (unscaled) scanline depending on field.
    if (interlace_en)
        if (((y >> SCALING_LOG2) & 1) != int(keep_odd_field))
            span_setup.valid_line = U16_C(0);
    // LOD length is measured from the interpolation base toward the far edge.
    span_setup.lodlength = int(flip ? (end_x - span_setup.interpolation_base_x) : (span_setup.interpolation_base_x - start_x));
    store_span_setup(gl_GlobalInvocationID.x, span_setup);
}

View File

@ -0,0 +1,43 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef STORE_SPAN_SETUP_H_
#define STORE_SPAN_SETUP_H_
// Write one SpanSetup to the output SSBO.
void store_span_setup(uint index, SpanSetup setup)
{
#if SMALL_TYPES
    // With native small types the working struct and storage struct share a
    // layout, so a single aggregate copy is enough.
    span_setups.elems[index] = setup;
#else
    // Fallback: the storage struct keeps 16-bit members while the working
    // struct uses plain ints, so each field is narrowed explicitly.
    span_setups.elems[index].rgba = setup.rgba;
    span_setups.elems[index].stzw = setup.stzw;
    span_setups.elems[index].xleft = mem_u16x4(uvec4(setup.xleft));
    span_setups.elems[index].xright = mem_u16x4(uvec4(setup.xright));
    span_setups.elems[index].interpolation_base_x = setup.interpolation_base_x;
    span_setups.elems[index].start_x = setup.start_x;
    span_setups.elems[index].end_x = setup.end_x;
    span_setups.elems[index].lodlength = mem_i16(setup.lodlength);
    span_setups.elems[index].valid_line = mem_u16(setup.valid_line);
#endif
}
#endif

View File

@ -0,0 +1,905 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TEXTURE_H_
#define TEXTURE_H_
#include "data_structures.h"
const int TEXTURE_FORMAT_RGBA = 0;
const int TEXTURE_FORMAT_YUV = 1;
const int TEXTURE_FORMAT_CI = 2;
const int TEXTURE_FORMAT_IA = 3;
const int TEXTURE_FORMAT_I = 4;
// Apply the tile's S-axis wrap/mirror addressing. A mask of 0 disables
// masking entirely and the coordinate passes through unchanged.
int texel_mask_s(TileInfo tile, int s)
{
    if (tile.mask_s == 0)
        return s;
    int wrap = 1 << tile.mask_s;
    // Mirrored repeat: reflect the coordinate inside every other period.
    if ((tile.flags & TILE_INFO_MIRROR_S_BIT) != 0)
        s ^= max((s & wrap) - 1, 0);
    return s & (wrap - 1);
}
// Copy-pipe variant of texel_mask_s: masks/mirrors two adjacent S
// coordinates (s, s + 1) at once with component-wise operations.
ivec2 texel_mask_s_copy(TileInfo tile, int s)
{
    ivec2 multi_s = s + ivec2(0, 1);
    if (tile.mask_s != 0)
    {
        int mask = 1 << tile.mask_s;
        // Mirrored repeat: reflect inside every other period.
        if ((tile.flags & TILE_INFO_MIRROR_S_BIT) != 0)
            multi_s ^= max((multi_s & mask) - 1, 0);
        multi_s &= mask - 1;
    }
    return multi_s;
}
// Apply the tile's T-axis wrap/mirror addressing; mask 0 is a passthrough.
int texel_mask_t(TileInfo tile, int t)
{
    if (tile.mask_t == 0)
        return t;
    int wrap = 1 << tile.mask_t;
    // Mirrored repeat: reflect the coordinate inside every other period.
    if ((tile.flags & TILE_INFO_MIRROR_T_BIT) != 0)
        t ^= max((t & wrap) - 1, 0);
    return t & (wrap - 1);
}
// Expand a packed RGBA5551 word to 8 bits per channel.
i16x4 convert_rgba16(uint word)
{
    uvec3 rgb5 = (uvec3(word) >> uvec3(11, 6, 1)) & 31u;
    // Replicate the top bits into the low bits (5 -> 8 bit expansion).
    uvec3 rgb8 = (rgb5 << 3u) | (rgb5 >> 2u);
    // The single alpha bit maps to fully opaque or fully transparent.
    return i16x4(rgb8, (word & 1u) * 0xffu);
}
// Unpack an IA88 word: high byte is intensity (broadcast to RGB), low is alpha.
i16x4 convert_ia16(uint word)
{
    uint i = word >> 8;
    return i16x4(i, i, i, word & 0xff);
}
// Fetch one 4bpp RGBA texel from TMEM; result broadcast to all channels.
i16x4 sample_texel_rgba4(TileInfo tile, uint tmem_instance, uvec2 st)
{
    // Byte address: tile base + row stride; two 4-bit texels per byte.
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x >> 1;
    byte_offset &= 0xfff;
    // Even S reads the high nibble, odd S the low nibble.
    uint shift = (~st.x & 1) * 4;
    uint index = byte_offset;
    // NOTE(review): XORs implement TMEM odd-line word interleave and
    // byte-order swizzle of the backing buffer — confirm against TMEM docs.
    index ^= (st.y & 1) << 2;
    index ^= 3;
    uint word = uint(tmem8.instances[tmem_instance].elems[index]);
    word = (word >> shift) & 0xf;
    // Replicate the nibble to 8 bits.
    word |= word << 4;
    return i16x4(word);
}
// Fetch one 4bpp IA texel from TMEM (3 intensity bits + 1 alpha bit).
i16x4 sample_texel_ia4(TileInfo tile, uint tmem_instance, uvec2 st)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x >> 1;
    byte_offset &= 0xfff;
    // Even S reads the high nibble, odd S the low nibble.
    uint shift = (~st.x & 1) * 4;
    uint index = byte_offset;
    // NOTE(review): odd-line interleave + byte-order swizzle — verify vs TMEM docs.
    index ^= (st.y & 1) << 2;
    index ^= 3;
    uint word = uint(tmem8.instances[tmem_instance].elems[index]);
    word = (word >> shift) & 0xf;
    // Top 3 bits are intensity; replicate to fill 8 bits.
    uint intensity = word & 0xe;
    intensity = (intensity << 4) | (intensity << 1) | (intensity >> 2);
    // Bottom bit selects fully opaque/transparent alpha.
    return i16x4(intensity, intensity, intensity, (word & 1) * 0xff);
}
// Fetch one 4bpp color-index texel (no TLUT): the palette-extended raw index
// is broadcast to all channels.
i16x4 sample_texel_ci4(TileInfo tile, uint tmem_instance, uvec2 st, uint pal)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x >> 1;
    byte_offset &= 0xfff;
    uint shift = (~st.x & 1) * 4;
    uint index = byte_offset;
    // NOTE(review): odd-line interleave + byte-order swizzle — verify vs TMEM docs.
    index ^= (st.y & 1) << 2;
    index ^= 3;
    uint word = uint(tmem8.instances[tmem_instance].elems[index]);
    word = (word >> shift) & 0xf;
    // Palette selects the upper nibble of the 8-bit index.
    word |= pal << 4;
    return i16x4(word);
}
// Fetch a 4bpp color-index texel and resolve it through the TLUT in upper TMEM.
// lut_offset/addr_xor select one of the 4 replicated TLUT banks so the four
// bilinear taps hit distinct banks like real hardware.
i16x4 sample_texel_ci4_tlut(TileInfo tile, uint tmem_instance, uvec2 st, uint pal, uint lut_offset, uint addr_xor, bool tlut_type)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x >> 1;
    // TLUT mode: indices live in the lower half of TMEM only.
    byte_offset &= 0x7ff;
    uint shift = (~st.x & 1) * 4;
    uint index = byte_offset;
    index ^= (st.y & 1) << 2;
    index ^= 3;
    uint word = uint(tmem8.instances[tmem_instance].elems[index]);
    word = (word >> shift) & 0xf;
    word |= pal << 4;
    // Each TLUT entry is replicated 4x; scale by 4 and pick a bank.
    uint lut_entry = (word << 2) + lut_offset;
    lut_entry ^= addr_xor;
    // TLUT occupies the upper half (0x400+) of 16-bit TMEM.
    word = uint(tmem16.instances[tmem_instance].elems[0x400 | lut_entry]);
    // tlut_type selects IA16 vs RGBA16 entry decoding.
    return tlut_type ? convert_ia16(word) : convert_rgba16(word);
}
// Fetch an 8bpp color-index texel and resolve it through the TLUT.
// lut_offset/addr_xor pick one of the 4 replicated TLUT banks (see ci4 variant).
i16x4 sample_texel_ci8_tlut(TileInfo tile, uint tmem_instance, uvec2 st, uint lut_offset, uint addr_xor, bool tlut_type)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x;
    // TLUT mode: indices live in the lower half of TMEM only.
    byte_offset &= 0x7ff;
    uint index = byte_offset;
    index ^= (st.y & 1) << 2;
    index ^= 3;
    uint word = uint(tmem8.instances[tmem_instance].elems[index]);
    // Each TLUT entry is replicated 4x; scale by 4 and pick a bank.
    uint lut_entry = (word << 2) + lut_offset;
    lut_entry ^= addr_xor;
    word = uint(tmem16.instances[tmem_instance].elems[0x400 | lut_entry]);
    return tlut_type ? convert_ia16(word) : convert_rgba16(word);
}
// Fetch a "32bpp" color-index texel (no TLUT): reads one 16-bit word and
// duplicates its two bytes across the four channels (.xyxy).
i16x4 sample_texel_ci32(TileInfo tile, uint tmem_instance, uvec2 st)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x * 2;
    byte_offset &= 0xfff;
    uint index = byte_offset >> 1;
    // NOTE(review): odd-line interleave + word-order swizzle — verify vs TMEM docs.
    index ^= (st.y & 1) << 1;
    index ^= 1;
    uint word = uint(tmem16.instances[tmem_instance].elems[index]);
    return i16x2(word >> 8, word & 0xff).xyxy;
}
// Fetch a "32bpp" color-index texel and resolve through the TLUT; the index
// comes from the top byte of the 16-bit word.
i16x4 sample_texel_ci32_tlut(TileInfo tile, uint tmem_instance, uvec2 st, uint lut_offset, uint addr_xor, bool tlut_type)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x * 2;
    // TLUT mode: indices live in the lower half of TMEM only.
    byte_offset &= 0x7ff;
    uint index = byte_offset >> 1;
    index ^= (st.y & 1) << 1;
    index ^= 1;
    uint word = uint(tmem16.instances[tmem_instance].elems[index]);
    // (word >> 6) & ~3 == top byte * 4: index into the 4x-replicated TLUT.
    uint lut_entry = ((word >> 6) & ~3) + lut_offset;
    lut_entry ^= addr_xor;
    word = uint(tmem16.instances[tmem_instance].elems[0x400 | lut_entry]);
    return tlut_type ? convert_ia16(word) : convert_rgba16(word);
}
// Fetch one 8bpp texel from TMEM; raw byte broadcast to all channels
// (also used for 8bpp CI without TLUT).
i16x4 sample_texel_rgba8(TileInfo tile, uint tmem_instance, uvec2 st)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x;
    byte_offset &= 0xfff;
    uint index = byte_offset;
    // NOTE(review): odd-line interleave + byte-order swizzle — verify vs TMEM docs.
    index ^= (st.y & 1) << 2;
    index ^= 3;
    uint word = uint(tmem8.instances[tmem_instance].elems[index]);
    return i16x4(word);
}
// Fetch one 8bpp IA texel (4-bit intensity + 4-bit alpha), expanding both
// nibbles to 8 bits by replication.
i16x4 sample_texel_ia8(TileInfo tile, uint tmem_instance, uvec2 st)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x;
    byte_offset &= 0xfff;
    uint index = byte_offset;
    index ^= (st.y & 1) << 2;
    index ^= 3;
    uint word = uint(tmem8.instances[tmem_instance].elems[index]);
    uint intensity = word >> 4;
    uint alpha = word & 0xf;
    // 4 -> 8 bit expansion by nibble replication.
    alpha |= alpha << 4;
    intensity |= intensity << 4;
    return i16x4(intensity, intensity, intensity, alpha);
}
// Fetch one 16bpp YUV texel: per-pixel luma from the upper TMEM bank,
// shared (subsampled) UV chroma from the lower bank. Returns
// (U - 128, V - 128, Y, Y) for the combiner's YUV path.
i16x4 sample_texel_yuv16(TileInfo tile, uint tmem_instance, uvec2 st, uint chroma_x)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    uint byte_offset_luma = byte_offset + st.x;
    byte_offset_luma &= 0x7ff;
    uint byte_offset_chroma = byte_offset + chroma_x * 2;
    byte_offset_chroma &= 0x7ff;
    uint index_luma = byte_offset_luma;
    // NOTE(review): odd-line interleave + endian swizzle — verify vs TMEM docs.
    index_luma ^= (st.y & 1) << 2;
    index_luma ^= 3;
    uint index_chroma = byte_offset_chroma >> 1;
    index_chroma ^= (st.y & 1) << 1;
    index_chroma ^= 1;
    // Luma lives in the upper half (0x800+) of 8-bit TMEM.
    u8 luma = u8(tmem8.instances[tmem_instance].elems[index_luma | 0x800]);
    u16 chroma = u16(tmem16.instances[tmem_instance].elems[index_chroma]);
    u8 u = u8((chroma >> U16_C(8)) & U16_C(0xff));
    u8 v = u8((chroma >> U16_C(0)) & U16_C(0xff));
    // Recenter chroma around zero.
    return i16x4(i16(u) - I16_C(0x80), i16(v) - I16_C(0x80), luma, luma);
}
// Fetch one 16bpp RGBA5551 texel and expand it to 8 bits per channel.
i16x4 sample_texel_rgba16(TileInfo tile, uint tmem_instance, uvec2 st)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x * 2;
    byte_offset &= 0xfff;
    uint index = byte_offset >> 1;
    // NOTE(review): odd-line interleave + word-order swizzle — verify vs TMEM docs.
    index ^= (st.y & 1) << 1;
    index ^= 1;
    uint word = uint(tmem16.instances[tmem_instance].elems[index]);
    return convert_rgba16(word);
}
// Fetch one 16bpp IA88 texel (intensity broadcast to RGB, separate alpha).
i16x4 sample_texel_ia16(TileInfo tile, uint tmem_instance, uvec2 st)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x * 2;
    byte_offset &= 0xfff;
    uint index = byte_offset >> 1;
    index ^= (st.y & 1) << 1;
    index ^= 1;
    uint word = uint(tmem16.instances[tmem_instance].elems[index]);
    return convert_ia16(word);
}
// Fetch one 32bpp RGBA texel: RG comes from the lower TMEM bank, BA from the
// matching word in the upper bank (0x400+).
i16x4 sample_texel_rgba32(TileInfo tile, uint tmem_instance, uvec2 st)
{
    uint byte_offset = tile.offset + tile.stride * st.y;
    byte_offset += st.x * 2;
    // 32bpp addresses only the lower half per bank.
    byte_offset &= 0x7ff;
    uint index = byte_offset >> 1;
    index ^= (st.y & 1) << 1;
    index ^= 1;
    uint lower_word = uint(tmem16.instances[tmem_instance].elems[index]);
    uint upper_word = uint(tmem16.instances[tmem_instance].elems[index | 0x400]);
    return i16x4(lower_word >> 8, lower_word & 0xff, upper_word >> 8, upper_word & 0xff);
}
// Apply the tile's per-axis shift, then optional clamping against the tile
// extents [lo, hi]; result is relative to the tile origin in 1/8-texel units.
int clamp_and_shift_coord(bool clamp_bit, int coord, int lo, int hi, int shift)
{
    // Clamp 17-bit coordinate to 16-bit coordinate here.
    coord = clamp(coord, -0x8000, 0x7fff);
    // shift values 0..10 are right shifts; 11..15 encode a left shift of
    // (16 - shift), implemented as shift-left then arithmetic shift-right.
    if (shift < 11)
        coord >>= shift;
    else
    {
        coord <<= (32 - shift);
        coord >>= 16;
    }
    if (clamp_bit)
    {
        // coord is in 1/8-texel units; compare whole texels against hi.
        bool clamp_hi = (coord >> 3) >= hi;
        if (clamp_hi)
            // Pin to the tile's last texel (tile extents are in 1/4 units).
            coord = (((hi >> 2) - (lo >> 2)) & 0x3ff) << 5;
        else
            // Clamp below the tile origin to zero.
            coord = max(coord - (lo << 3), 0);
    }
    else
        coord -= lo << 3;
    return coord;
}
// Unclamped variant of clamp_and_shift_coord: apply the tile shift and
// rebase onto the tile origin, with no clamping against the tile extents.
int shift_coord(int coord, int lo, int shift)
{
    // Clamp the 17-bit input into signed 16-bit range first.
    coord = clamp(coord, -0x8000, 0x7fff);
    // 0..10 encodes a right shift; 11..15 a left shift of (16 - shift),
    // done as shift-left then arithmetic shift-right to keep the sign.
    coord = (shift < 11) ? (coord >> shift) : ((coord << (32 - shift)) >> 16);
    return coord - (lo << 3);
}
// The copy pipe reads 4x16 words.
// The copy pipe reads 4x16 words.
// Fetch one 16-bit word of the copy pipe's 64-bit read. s_offset selects
// which of the four words (0..3); tlut routes the fetched index through the
// palette in upper TMEM.
int sample_texture_copy_word(TileInfo tile, uint tmem_instance, ivec2 st, int s_offset, bool tlut, bool tlut_type)
{
    // For non-16bpp TMEM, the lower 32-bits are sampled based on direct 16-bit fetches. There are no shifts applied.
    bool high_word = s_offset < 2;
    bool replicate_8bpp = high_word && tile.size != 2 && !tlut;
    int samp;
    // Bytes per texel, log2, capped at 2 (32bpp shares 16bpp addressing here).
    int s_shamt = min(int(tile.size), 2);
    bool large_texel = int(tile.size) == 3;
    // 32bpp and TLUT fetches address half the 16-bit word space.
    int idx_mask = (large_texel || tlut) ? 0x3ff : 0x7ff;
    if (replicate_8bpp)
    {
        // The high word of 8-bpp replication is special in the sense that we sample 8-bpp correctly.
        // Sample the two possible words.
        st.x += 2 * s_offset;
        ivec2 s = texel_mask_s_copy(tile, st.x);
        int t = texel_mask_t(tile, st.y);
        uint tbase = tile.offset + tile.stride * t;
        // Address in nibbles; odd lines flip by 8 nibbles (word interleave).
        uvec2 nibble_offset = (tbase * 2 + (s << s_shamt)) & 0x1fffu;
        nibble_offset ^= (t & 1u) * 8u;
        uvec2 index = nibble_offset >> 2u;
        index &= idx_mask;
        // ^1 swizzles word order within the 32-bit pair.
        int samp0 = int(tmem16.instances[tmem_instance].elems[index.x ^ 1]);
        int samp1 = int(tmem16.instances[tmem_instance].elems[index.y ^ 1]);
        if (tile.size == 1)
        {
            // 8bpp: select high/low byte of each word.
            samp0 >>= 8 - 4 * int(nibble_offset.x & 2);
            samp1 >>= 8 - 4 * int(nibble_offset.y & 2);
            samp0 &= 0xff;
            samp1 &= 0xff;
        }
        else if (tile.size == 0)
        {
            // 4bpp: select the nibble, then replicate it to a byte.
            samp0 >>= 12 - 4 * int(nibble_offset.x & 3u);
            samp1 >>= 12 - 4 * int(nibble_offset.y & 3u);
            samp0 = (samp0 & 0xf) * 0x11;
            samp1 = (samp1 & 0xf) * 0x11;
        }
        else
        {
            samp0 >>= 8;
            samp1 >>= 8;
        }
        // Pack the two bytes back into one 16-bit result word.
        samp = (samp0 << 8) | samp1;
    }
    else
    {
        st.x += s_offset;
        int s = texel_mask_s(tile, st.x);
        int t = texel_mask_t(tile, st.y);
        uint tbase = tile.offset + tile.stride * t;
        uint nibble_offset = (tbase * 2 + (s << s_shamt)) & 0x1fffu;
        nibble_offset ^= (t & 1u) * 8u;
        uint index = nibble_offset >> 2u;
        index &= idx_mask;
        samp = int(tmem16.instances[tmem_instance].elems[index ^ 1]);
        if (tlut)
        {
            if (tile.size == 0)
            {
                // 4bpp index: nibble + palette select, scaled by 4 into the
                // replicated TLUT, with s_offset picking the bank.
                samp >>= 12 - 4 * (nibble_offset & 3);
                samp &= 0xf;
                samp |= tile.palette << 4;
                samp <<= 2;
                samp += s_offset;
            }
            else
            {
                // 8bpp (or wider) index: byte select, same TLUT scaling.
                samp >>= 8 - 4 * (nibble_offset & 2);
                samp &= 0xff;
                samp <<= 2;
                samp += s_offset;
            }
            // Resolve through the TLUT in upper TMEM (^1 word swizzle).
            samp = int(tmem16.instances[tmem_instance].elems[(samp | 0x400) ^ 1]);
        }
    }
    return samp;
}
// Copy-pipe texture fetch: shift/rebase the coordinate, then fetch the
// s_offset'th unit of the 64-bit copy read at the framebuffer's texel size.
int sample_texture_copy(TileInfo tile, uint tmem_instance, ivec2 st, int s_offset, bool tlut, bool tlut_type)
{
    st.x = shift_coord(st.x, int(tile.slo), int(tile.shift_s));
    st.y = shift_coord(st.y, int(tile.tlo), int(tile.shift_t));
    // Drop the 5 fractional (1/32 texel) bits.
    st >>= 5;
    int samp;
    if (global_constants.fb_info.fb_size == 0)
    {
        // 4bpp framebuffer copy yields nothing meaningful; return 0.
        samp = 0;
    }
    else if (global_constants.fb_info.fb_size == 1)
    {
        // 8bpp: fetch the containing 16-bit word, then pick high/low byte.
        samp = sample_texture_copy_word(tile, tmem_instance, st, s_offset >> 1, tlut, tlut_type);
        samp >>= 8 - 8 * (s_offset & 1);
        samp &= 0xff;
    }
    else
    {
        // 16bpp and wider: the word itself is the result.
        samp = sample_texture_copy_word(tile, tmem_instance, st, s_offset, tlut, tlut_type);
    }
    return samp;
}
// N64-style 3-tap bilinear filter over a pair of channels: only one triangle
// of the 2x2 footprint is used, anchored at t00 (lower triangle) or t11
// (upper triangle) depending on which side of the diagonal the fraction lies.
i16x2 bilinear_3tap(i16x2 t00, i16x2 t10, i16x2 t01, i16x2 t11, ivec2 frac)
{
    int sum_frac = frac.x + frac.y;
    i16x2 t_base = sum_frac >= 32 ? t11 : t00;
    // In the upper triangle the fractions are mirrored around the diagonal.
    i16x2 flip_frac = i16x2(sum_frac >= 32 ? (32 - frac.yx) : frac);
    i16x2 accum = (t10 - t_base) * flip_frac.x;
    accum += (t01 - t_base) * flip_frac.y;
    // Round (fractions carry 5 bits: 0..32).
    accum += I16_C(0x10);
    accum >>= I16_C(5);
    accum += t_base;
    return accum;
}
i16x4 sample_texture(TileInfo tile, uint tmem_instance, ivec2 st, bool tlut, bool tlut_type, bool sample_quad, bool mid_texel, bool convert_one,
i16x4 prev_cycle)
{
st.x = clamp_and_shift_coord((tile.flags & TILE_INFO_CLAMP_S_BIT) != 0, st.x, int(tile.slo), int(tile.shi), int(tile.shift_s));
st.y = clamp_and_shift_coord((tile.flags & TILE_INFO_CLAMP_T_BIT) != 0, st.y, int(tile.tlo), int(tile.thi), int(tile.shift_t));
ivec2 frac;
if (sample_quad)
frac = st & 31;
else
frac = ivec2(0);
int sum_frac = frac.x + frac.y;
st >>= 5;
int s0 = texel_mask_s(tile, st.x);
int t0 = texel_mask_t(tile, st.y);
int s1 = texel_mask_s(tile, st.x + 1);
int t1 = texel_mask_t(tile, st.y + 1);
// Very specific weird logic going on with t0 and t1.
int tdiff = max(t1 - t0, -255);
t1 = (t0 & 0xff) + tdiff;
t0 &= 0xff;
i16x4 t_base, t10, t01, t11;
mid_texel = all(bvec3(mid_texel, equal(frac, ivec2(0x10))));
if (mid_texel)
sum_frac = 0;
bool yuv = tile.fmt == TEXTURE_FORMAT_YUV;
ivec2 base_st = sum_frac >= 0x20 ? ivec2(s1, t1) : ivec2(s0, t0);
if (tlut)
{
switch (int(tile.fmt))
{
case TEXTURE_FORMAT_RGBA:
case TEXTURE_FORMAT_CI:
case TEXTURE_FORMAT_IA:
case TEXTURE_FORMAT_I:
{
// For TLUT, entries in the LUT are duplicated and we must make sure that we sample 3 different banks
// when we look up the TLUT entry. In normal situations, this is irrelevant, but we're trying to be accurate here.
bool upper = sum_frac >= 0x20;
uint addr_xor = upper ? 2 : 1;
switch (int(tile.size))
{
case 0:
t_base = sample_texel_ci4_tlut(tile, tmem_instance, base_st, tile.palette, upper ? 3 : 0, addr_xor, tlut_type);
if (sample_quad)
{
t10 = sample_texel_ci4_tlut(tile, tmem_instance, ivec2(s1, t0), tile.palette, 1, addr_xor,
tlut_type);
t01 = sample_texel_ci4_tlut(tile, tmem_instance, ivec2(s0, t1), tile.palette, 2, addr_xor,
tlut_type);
}
if (mid_texel)
{
t11 = sample_texel_ci4_tlut(tile, tmem_instance, ivec2(s1, t1), tile.palette, 3, addr_xor,
tlut_type);
}
break;
case 1:
t_base = sample_texel_ci8_tlut(tile, tmem_instance, base_st, upper ? 3 : 0, addr_xor, tlut_type);
if (sample_quad)
{
t10 = sample_texel_ci8_tlut(tile, tmem_instance, ivec2(s1, t0), 1, addr_xor, tlut_type);
t01 = sample_texel_ci8_tlut(tile, tmem_instance, ivec2(s0, t1), 2, addr_xor, tlut_type);
}
if (mid_texel)
t11 = sample_texel_ci8_tlut(tile, tmem_instance, ivec2(s1, t1), 3, addr_xor, tlut_type);
break;
default:
t_base = sample_texel_ci32_tlut(tile, tmem_instance, base_st, upper ? 3 : 0, addr_xor, tlut_type);
if (sample_quad)
{
t10 = sample_texel_ci32_tlut(tile, tmem_instance, ivec2(s1, t0), 1, addr_xor, tlut_type);
t01 = sample_texel_ci32_tlut(tile, tmem_instance, ivec2(s0, t1), 2, addr_xor, tlut_type);
}
if (mid_texel)
t11 = sample_texel_ci32_tlut(tile, tmem_instance, ivec2(s1, t1), 3, addr_xor, tlut_type);
break;
}
break;
}
}
}
else
{
switch (int(tile.fmt))
{
case TEXTURE_FORMAT_RGBA:
switch (int(tile.size))
{
case 0:
t_base = sample_texel_rgba4(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_rgba4(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_rgba4(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_rgba4(tile, tmem_instance, ivec2(s1, t1));
break;
case 1:
t_base = sample_texel_rgba8(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_rgba8(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_rgba8(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_rgba8(tile, tmem_instance, ivec2(s1, t1));
break;
case 2:
t_base = sample_texel_rgba16(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_rgba16(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_rgba16(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_rgba16(tile, tmem_instance, ivec2(s1, t1));
break;
case 3:
t_base = sample_texel_rgba32(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_rgba32(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_rgba32(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_rgba32(tile, tmem_instance, ivec2(s1, t1));
break;
}
break;
case TEXTURE_FORMAT_YUV:
{
uint chroma_x0 = s0 >> 1;
uint chroma_x1 = (s1 + (s1 - s0)) >> 1;
// Only implement 16bpp for now. It's the only one that gives meaningful results.
t_base = sample_texel_yuv16(tile, tmem_instance, ivec2(s0, t0), chroma_x0);
if (sample_quad)
{
t10 = sample_texel_yuv16(tile, tmem_instance, ivec2(s1, t0), chroma_x1);
t01 = sample_texel_yuv16(tile, tmem_instance, ivec2(s0, t1), chroma_x0);
t11 = sample_texel_yuv16(tile, tmem_instance, ivec2(s1, t1), chroma_x1);
}
break;
}
case TEXTURE_FORMAT_CI:
switch (int(tile.size))
{
case 0:
t_base = sample_texel_ci4(tile, tmem_instance, base_st, tile.palette);
if (sample_quad)
{
t10 = sample_texel_ci4(tile, tmem_instance, ivec2(s1, t0), tile.palette);
t01 = sample_texel_ci4(tile, tmem_instance, ivec2(s0, t1), tile.palette);
}
if (mid_texel)
t11 = sample_texel_ci4(tile, tmem_instance, ivec2(s1, t1), tile.palette);
break;
case 1:
t_base = sample_texel_rgba8(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_rgba8(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_rgba8(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_rgba8(tile, tmem_instance, ivec2(s1, t1));
break;
default:
t_base = sample_texel_ci32(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_ci32(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_ci32(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_ci32(tile, tmem_instance, ivec2(s1, t1));
break;
}
break;
case TEXTURE_FORMAT_IA:
switch (int(tile.size))
{
case 0:
t_base = sample_texel_ia4(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_ia4(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_ia4(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_ia4(tile, tmem_instance, ivec2(s1, t1));
break;
case 1:
t_base = sample_texel_ia8(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_ia8(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_ia8(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_ia8(tile, tmem_instance, ivec2(s1, t1));
break;
case 2:
t_base = sample_texel_ia16(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_ia16(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_ia16(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_ia16(tile, tmem_instance, ivec2(s1, t1));
break;
case 3:
t_base = sample_texel_ci32(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_ci32(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_ci32(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_ci32(tile, tmem_instance, ivec2(s1, t1));
break;
}
break;
case TEXTURE_FORMAT_I:
switch (int(tile.size))
{
case 0:
t_base = sample_texel_rgba4(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_rgba4(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_rgba4(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_rgba4(tile, tmem_instance, ivec2(s1, t1));
break;
case 1:
t_base = sample_texel_rgba8(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_rgba8(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_rgba8(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_rgba8(tile, tmem_instance, ivec2(s1, t1));
break;
default:
t_base = sample_texel_ci32(tile, tmem_instance, base_st);
if (sample_quad)
{
t10 = sample_texel_ci32(tile, tmem_instance, ivec2(s1, t0));
t01 = sample_texel_ci32(tile, tmem_instance, ivec2(s0, t1));
}
if (mid_texel)
t11 = sample_texel_ci32(tile, tmem_instance, ivec2(s1, t1));
break;
}
break;
}
}
i16x4 accum;
if (convert_one)
{
ivec4 prev_sext = bitfieldExtract(ivec4(prev_cycle), 0, 9);
ivec2 factors = sum_frac >= 32 ? prev_sext.gr : prev_sext.rg;
ivec4 converted = factors.r * (t10 - t_base) + factors.g * (t01 - t_base) + 0x80;
converted >>= 8;
converted += prev_sext.b;
accum = i16x4(converted);
}
else if (yuv)
{
if (sample_quad)
{
int chroma_frac = ((s0 & 1) << 4) | (frac.x >> 1);
i16x2 accum_chroma = bilinear_3tap(t_base.xy, t10.xy, t01.xy, t11.xy, ivec2(chroma_frac, frac.y));
i16x2 accum_luma = bilinear_3tap(t_base.zw, t10.zw, t01.zw, t11.zw, frac);
accum = i16x4(accum_chroma, accum_luma);
}
else
accum = t_base;
}
else if (mid_texel)
{
accum = (t_base + t01 + t10 + t11 + I16_C(2)) >> I16_C(2);
}
else
{
i16x2 flip_frac = i16x2(sum_frac >= 32 ? (32 - frac.yx) : frac);
accum = (t10 - t_base) * flip_frac.x;
accum += (t01 - t_base) * flip_frac.y;
accum += I16_C(0x10);
accum >>= I16_C(5);
accum += t_base;
}
return accum;
}
// Computes the LOD fraction and rewrites tile0/tile1 to the two tile indices used
// for 2-cycle texturing, emulating the RDP's texture LOD pipeline.
// - tile0/tile1: in/out tile indices; offset by the computed mip level when tex_lod_en.
// - lod_frac: out 0.8 fixed-point interpolation factor between the two levels.
// - max_level: highest available mip level; min_lod clamps the fraction when magnifying.
// - st: texel coordinate at this pixel; st_dx/st_dy: coordinates one pixel over in X/Y,
//   used to estimate the screen-space derivative.
// - perspective_overflow forces the "fully distant" result (lod_frac = 0xff).
// - sharpen_tex_en/detail_tex_en select the RDP sharpen/detail LOD modes.
//   NOTE(review): mode semantics inferred from structure; confirm against RDP LOD docs.
void compute_lod_2cycle(inout uint tile0, inout uint tile1, out i16 lod_frac, uint max_level, int min_lod,
ivec2 st, ivec2 st_dx, ivec2 st_dy,
bool perspective_overflow, bool tex_lod_en, bool sharpen_tex_en, bool detail_tex_en)
{
bool magnify = false;
bool distant = false;
uint tile_offset = 0;
if (perspective_overflow)
{
// W overflow: treat as maximally distant.
distant = true;
lod_frac = i16(0xff);
}
else
{
ivec2 dx = st_dx - st;
// Kinda abs, except it's 1 less than expected if negative.
dx ^= dx >> 31;
ivec2 dy = st_dy - st;
// Kinda abs, except it's 1 less than expected if negative.
dy ^= dy >> 31;
// The largest per-axis ST delta drives the LOD estimate.
ivec2 max_d2 = max(dx, dy);
int max_d = max(max_d2.x, max_d2.y);
if (max_d >= 0x4000)
{
// Derivative too large to represent: clamp to the most distant mip.
distant = true;
lod_frac = i16(0xff);
tile_offset = max_level;
}
else if (max_d < 32) // LOD < 0
{
distant = max_level == 0u;
magnify = true;
if (!sharpen_tex_en && !detail_tex_en)
lod_frac = i16(distant ? 0xff : 0);
else
// Sharpen mode biases the fraction negative (-0x100) while magnifying.
lod_frac = i16((max(min_lod, max_d) << 3) + (sharpen_tex_en ? -0x100 : 0));
}
else
{
// floor(log2(max_d / 32)) selects the mip level; 32 texel units == LOD 0.
int mip_base = max(findMSB(max_d >> 5), 0);
distant = mip_base >= max_level;
if (distant && !sharpen_tex_en && !detail_tex_en)
{
lod_frac = i16(0xff);
}
else
{
// Fractional position within the selected mip, 0.8 fixed point.
lod_frac = i16(((max_d << 3) >> mip_base) & 0xff);
tile_offset = mip_base;
}
}
}
if (tex_lod_en)
{
if (distant)
tile_offset = max_level;
if (!detail_tex_en)
{
// Normal/sharpen mode: tile1 is the next level up unless clamped.
tile0 = (tile0 + tile_offset) & 7u;
if (distant || (!sharpen_tex_en && magnify))
tile1 = tile0;
else
tile1 = (tile0 + 1) & 7;
}
else
{
// Detail mode: tile0 holds the detail texture, so both indices shift by one.
tile1 = (tile0 + tile_offset + ((distant || magnify) ? 1 : 2)) & 7u;
tile0 = (tile0 + tile_offset + (magnify ? 0 : 1)) & 7u;
}
}
}
// Applies the texture convert factors to a texel, as used by the RDP convert stage.
// Each input channel is first sign-extended to 9 bits; the blue channel acts as the
// base term added to every output channel, and the factors weight red/green with
// 0.8 fixed-point rounding (+0x80 before the >> 8).
i16x4 texture_convert_factors(i16x4 texel_in, i16x4 factors)
{
    // Sign-extend all four channels to 9 bits.
    ivec4 c = bitfieldExtract(ivec4(texel_in), 0, 9);
    int base = c.b;
    int out_r = base + ((factors.x * c.g + 0x80) >> 8);
    int out_g = base + ((factors.y * c.r + factors.z * c.g + 0x80) >> 8);
    int out_b = base + ((factors.w * c.r + 0x80) >> 8);
    // Alpha passes through the base term unmodified.
    return i16x4(out_r, out_g, out_b, base);
}
#endif

View File

@ -0,0 +1,274 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
// Consumes result from tile_binning_prepass.comp, bins at a finer resolution (8x8 or 16x16 blocks).
#include "small_types.h"
#if SUBGROUP
#extension GL_KHR_shader_subgroup_basic : require
#extension GL_KHR_shader_subgroup_vote : require
#extension GL_KHR_shader_subgroup_ballot : require
#extension GL_KHR_shader_subgroup_arithmetic : require
layout(local_size_x_id = 0) in;
#else
// Reasonable default. For AMD (64 threads), subgroups are definitely supported, so this won't be hit.
layout(local_size_x = 32) in;
#endif
#include "debug.h"
#include "data_structures.h"
#include "binning.h"
layout(constant_id = 1) const int TILE_WIDTH = 8;
layout(constant_id = 2) const int TILE_HEIGHT = 8;
layout(constant_id = 3) const int MAX_PRIMITIVES = 256;
layout(constant_id = 4) const int MAX_WIDTH = 1024;
layout(constant_id = 5) const int TILE_INSTANCE_STRIDE = 0x8000;
layout(constant_id = 6) const int SCALE_FACTOR = 1;
const int TILE_BINNING_STRIDE = MAX_PRIMITIVES / 32;
const int MAX_TILES_X = MAX_WIDTH / TILE_WIDTH;
layout(set = 0, binding = 0, std430) readonly buffer TriangleSetupBuffer
{
TriangleSetupMem elems[];
} triangle_setup;
#include "load_triangle_setup.h"
layout(set = 0, binding = 1, std430) readonly buffer ScissorStateBuffer
{
ScissorStateMem elems[];
} scissor_state;
#include "load_scissor_state.h"
layout(set = 0, binding = 2, std430) readonly buffer StateIndicesBuffer
{
InstanceIndicesMem elems[];
} state_indices;
layout(std430, set = 0, binding = 3) writeonly buffer TileBitmask
{
uint binned_bitmask[];
};
layout(std430, set = 0, binding = 4) writeonly buffer TileBitmaskCoarse
{
uint binned_bitmask_coarse[];
};
#if !UBERSHADER
layout(std430, set = 0, binding = 5) writeonly buffer TileInstanceOffset
{
uint elems[];
} tile_instance_offsets;
layout(std430, set = 0, binding = 6) buffer IndirectBuffer
{
uvec4 elems[];
} indirect_counts;
// This can actually be uint16_t, but AMD doesn't seem to support loading uint16_t in SMEM unit,
// the memory traffic for this data structure is not relevant anyways.
struct TileRasterWork
{
uint tile_x, tile_y;
uint tile_instance;
uint primitive;
};
layout(std430, set = 0, binding = 7) writeonly buffer WorkList
{
uvec4 elems[];
} tile_raster_work;
#endif
#if !UBERSHADER
// Reserves one slot in the work list for the given shader variant and returns the
// slot index. variant_index must be uniform across the active lanes for the merged
// subgroup path to be correct (each lane then receives base + its rank among the
// active lanes).
uint allocate_work_offset(uint variant_index)
{
#if !SUBGROUP
return atomicAdd(indirect_counts.elems[variant_index].x, 1u);
#else
// Merge atomic operations. Compiler would normally do this,
// but it might not have figured out that variant_index is uniform.
uvec4 active_mask = subgroupBallot(true);
uint count = subgroupBallotBitCount(active_mask);
uint work_offset = 0u;
// One lane performs a single atomicAdd for the whole subgroup.
if (subgroupElect())
work_offset = atomicAdd(indirect_counts.elems[variant_index].x, count);
work_offset = subgroupBroadcastFirst(work_offset);
// Distribute contiguous slots: base + number of active lanes before this one.
work_offset += subgroupBallotExclusiveBitCount(active_mask);
return work_offset;
#endif
}
#endif
layout(push_constant, std430) uniform Registers
{
uvec2 resolution;
int primitive_count;
} fb_info;
#if !SUBGROUP
shared uint merged_mask_shared;
#endif
// Fine-grained binning pass. Each workgroup covers an 8 x TILES_Y block of tiles
// within a "meta tile"; each thread owns one tile. The first 32 lanes conservatively
// bin one primitive each against the whole meta-tile region, then every thread
// re-tests the survivors against its own tile and writes the per-tile primitive
// bitmask. In the non-ubershader path, per-tile work items are also appended to
// per-variant work lists for the specialized rasterizer shaders.
void main()
{
int group_index = int(gl_WorkGroupID.x);
ivec2 meta_tile = ivec2(gl_WorkGroupID.yz);
const int TILES_X = 8;
const int TILES_Y = int(gl_WorkGroupSize.x) >> 3;
#if SUBGROUP
// Spec is unclear how gl_LocalInvocationIndex is mapped to gl_SubgroupInvocationID, so synthesize our own.
// We know the subgroups are fully occupied with VK_EXT_subgroup_size_control already.
int local_index = int(gl_SubgroupInvocationID);
int SUBGROUP_TILES_Y = int(gl_SubgroupSize) >> 3;
#else
int local_index = int(gl_LocalInvocationIndex);
#endif
// 8 tiles per row: low 3 bits select X, the rest select Y.
int inner_tile_x = local_index & 7;
int inner_tile_y = local_index >> 3;
#if SUBGROUP
inner_tile_y += SUBGROUP_TILES_Y * int(gl_SubgroupID);
#endif
ivec2 tile = meta_tile * ivec2(TILES_X, TILES_Y) + ivec2(inner_tile_x, inner_tile_y);
int linear_tile = tile.y * MAX_TILES_X + tile.x;
// Pixel bounds of the whole meta-tile (used for the coarse 32-primitive test).
ivec2 base_coord_meta = meta_tile * ivec2(TILE_WIDTH * TILES_X, TILE_HEIGHT * TILES_Y);
#if SUBGROUP
base_coord_meta.y += SUBGROUP_TILES_Y * TILE_HEIGHT * int(gl_SubgroupID);
ivec2 end_coord_meta = min(base_coord_meta + ivec2(TILE_WIDTH * TILES_X, TILE_HEIGHT * SUBGROUP_TILES_Y), ivec2(fb_info.resolution)) - 1;
#else
ivec2 end_coord_meta = min(base_coord_meta + ivec2(TILE_WIDTH * TILES_X, TILE_HEIGHT * TILES_Y), ivec2(fb_info.resolution)) - 1;
#endif
// Pixel bounds of this thread's own tile (used for the fine test).
ivec2 base_coord = tile * ivec2(TILE_WIDTH, TILE_HEIGHT);
ivec2 end_coord = min(base_coord + ivec2(TILE_WIDTH, TILE_HEIGHT), ivec2(fb_info.resolution)) - 1;
int primitive_count = fb_info.primitive_count;
#if !SUBGROUP
if (local_index == 0)
merged_mask_shared = 0u;
barrier();
#endif
// Phase 1: lanes 0..31 each test one primitive of this 32-primitive group
// against the meta-tile to build a coarse survivor mask.
bool binned = false;
if (local_index < 32)
{
uint primitive_index = group_index * 32 + local_index;
if (primitive_index < primitive_count)
{
ScissorState scissor = load_scissor_state(primitive_index);
ivec2 clipped_base_coord = max(base_coord_meta, SCALE_FACTOR * (ivec2(scissor.xlo, scissor.ylo) >> 2) - 1);
ivec2 clipped_end_coord = min(end_coord_meta, SCALE_FACTOR * (ivec2(scissor.xhi + 3, scissor.yhi + 3) >> 2) - 1);
TriangleSetup setup = load_triangle_setup(primitive_index);
binned = bin_primitive(setup, clipped_base_coord, clipped_end_coord, SCALE_FACTOR);
}
}
#if SUBGROUP
uint merged_mask = subgroupBallot(binned).x;
#else
if (binned)
atomicOr(merged_mask_shared, 1u << local_index);
barrier();
uint merged_mask = merged_mask_shared;
#endif
// Phase 2: every thread re-tests the coarse survivors against its own tile.
uint binned_mask = 0u;
while (merged_mask != 0u)
{
int bit = findLSB(merged_mask);
merged_mask &= ~(1u << bit);
uint primitive_index = group_index * 32 + bit;
ScissorState scissor = load_scissor_state(primitive_index);
ivec2 clipped_base_coord = max(base_coord, SCALE_FACTOR * (ivec2(scissor.xlo, scissor.ylo) >> 2));
ivec2 clipped_end_coord = min(end_coord, SCALE_FACTOR * (ivec2(scissor.xhi + 3, scissor.yhi + 3) >> 2) - 1);
TriangleSetup setup = load_triangle_setup(primitive_index);
if (bin_primitive(setup, clipped_base_coord, clipped_end_coord, SCALE_FACTOR))
binned_mask |= 1u << bit;
}
// Publish the fine bitmask and keep the coarse per-tile summary bit in sync.
binned_bitmask[linear_tile * TILE_BINNING_STRIDE + group_index] = binned_mask;
if (binned_mask != 0u)
atomicOr(binned_bitmask_coarse[linear_tile], 1u << group_index);
else
atomicAnd(binned_bitmask_coarse[linear_tile], ~(1u << group_index));
#if SUBGROUP
#if !UBERSHADER
// Allocate contiguous tile-instance slots for all lanes of the subgroup at once.
uint bit_count = uint(bitCount(binned_mask));
uint instance_offset = 0u;
if (subgroupAny(bit_count != 0u))
{
// Allocate tile instance space for all threads in subgroup in one go.
uint total_bit_count = subgroupAdd(bit_count);
if (subgroupElect())
if (total_bit_count != 0u)
instance_offset = atomicAdd(indirect_counts.elems[0].w, total_bit_count);
instance_offset = subgroupBroadcastFirst(instance_offset);
// Exclusive prefix sum gives each lane its own base slot.
instance_offset = subgroupInclusiveAdd(bit_count) - bit_count;
}
#endif
#else
#if !UBERSHADER
uint bit_count = uint(bitCount(binned_mask));
uint instance_offset = 0u;
if (bit_count != 0u)
instance_offset = atomicAdd(indirect_counts.elems[0].w, bit_count);
#endif
#endif
#if !UBERSHADER
if (bit_count != 0u)
tile_instance_offsets.elems[linear_tile * TILE_BINNING_STRIDE + group_index] = instance_offset;
#if SUBGROUP
uint variant_mask = subgroupOr(binned_mask);
#else
uint variant_mask = binned_mask;
#endif
// Emit one work item per binned primitive into that primitive's variant list.
while (variant_mask != 0u)
{
int bit = findLSB(variant_mask);
variant_mask &= ~(1u << bit);
int primitive_index = group_index * 32 + bit;
if ((binned_mask & (1u << bit)) != 0u)
{
uint variant_index = uint(state_indices.elems[primitive_index].static_depth_tmem.x);
uint work_offset = allocate_work_offset(variant_index);
tile_raster_work.elems[work_offset + uint(TILE_INSTANCE_STRIDE) * variant_index] =
uvec4(tile.x, tile.y, instance_offset, primitive_index);
instance_offset++;
}
}
#endif
}

View File

@ -0,0 +1,577 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "debug.h"
#include "small_types.h"
layout(local_size_x_id = 0) in;
layout(set = 0, binding = 0, std430) readonly buffer VRAM8Buffer
{
mem_u8 data[];
} vram8;
layout(set = 0, binding = 0, std430) readonly buffer VRAM16Buffer
{
mem_u16 data[];
} vram16;
layout(set = 0, binding = 0, std430) readonly buffer VRAM32Buffer
{
uint data[];
} vram32;
layout(set = 0, binding = 1, std430) buffer TMEM16Buffer
{
mem_u16 data[2048];
} tmem16;
struct TileInstance
{
mem_u16 data[2048];
};
layout(set = 0, binding = 2, std430) writeonly buffer TMEMInstances
{
TileInstance instances[];
} tile_instances;
layout(push_constant, std430) uniform Registers
{
int num_uploads;
} registers;
const int TEXTURE_FMT_RGBA = 0;
const int TEXTURE_FMT_YUV = 1;
const int TEXTURE_FMT_CI = 2;
const int TEXTURE_FMT_IA = 3;
const int TEXTURE_FMT_I = 4;
const int UPLOAD_MODE_TILE = 0;
const int UPLOAD_MODE_TLUT = 1;
const int UPLOAD_MODE_BLOCK = 2;
struct UploadInfo
{
int width, height;
float min_t_mod, max_t_mod;
int vram_addr;
int vram_width;
int vram_size;
int vram_effective_width;
int tmem_offset;
int tmem_stride_words;
int tmem_size;
int tmem_fmt;
int mode;
float inv_tmem_stride_words;
int dxt;
int padding;
};
layout(set = 1, binding = 0, std140) uniform UploadInfos
{
UploadInfo upload_info[256];
};
bool tmem_dirty;
uint current_tmem_value;
// Computes floor(offset / stride) given the precomputed reciprocal of the stride.
// The +0.5 bias keeps the truncated result exact for all relevant inputs while
// being much faster than an integer divide.
int compute_upload_t(int offset, float inv_stride)
{
    float biased = float(offset) + 0.5;
    return int(biased * inv_stride);
}
// In 32bpp upload mode we read 64 bits and split the result over the lower and upper TMEM.
void update_tmem_32(UploadInfo info, int tmem16_index, bool upper_tmem, bool yuv)
{
int tmem16_offset = (info.tmem_offset & 0x7ff) >> 1;
int tmem16_stride = info.tmem_stride_words;
int pixel_offset = (tmem16_index - tmem16_offset) & 0x3ff;
int upload_x, upload_y;
int upload_x_xor = 0;
if (info.mode == UPLOAD_MODE_BLOCK)
{
int word_offset = pixel_offset >> 1;
if (info.tmem_stride_words == 0)
{
// Trivial case, we can just compute T factor directly and set upload_x_xor.
// Other than that, it works like a simple 1D upload.
// However, if DxT is weird, we might end up in a situation where this word is written multiple times,
// or zero times.
int iteration_candidate_first = word_offset & ~1;
int iteration_candidate_second = iteration_candidate_first + 1;
int first_t = (iteration_candidate_first * info.dxt) >> 16;
int second_t = (iteration_candidate_second * info.dxt) >> 16;
if (first_t != second_t)
{
int iteration_candidate_first_write_index = iteration_candidate_first ^ (first_t & 1);
int iteration_candidate_second_write_index = iteration_candidate_second ^ (second_t & 1);
if (iteration_candidate_second_write_index == word_offset)
upload_x_xor = (second_t & 1) << 1;
else if (iteration_candidate_first_write_index == word_offset)
upload_x_xor = (first_t & 1) << 1;
else
return;
}
else
upload_x_xor ^= (first_t & 1) << 1;
}
else
{
// Welp ... This is pure insanity, but if we want to be completely correct ...
int min_t = compute_upload_t(word_offset & ~1, info.min_t_mod);
int max_t = compute_upload_t(word_offset | 1, info.max_t_mod);
// If t has a range, then the solution to Y = (t = floor(X * dt / 2048)) * stride + X has a range space of:
// Y - t_max * stride <= X <= Y - t_min * stride.
int max_word_candidate = (word_offset | 1) - tmem16_stride * min_t;
int min_word_candidate = (word_offset & ~1) - tmem16_stride * max_t;
// If we have constraints for X, we constraint T further.
min_t = max(min_t, (min_word_candidate * info.dxt) >> 16);
max_t = min(max_t, (max_word_candidate * info.dxt) >> 16);
bool found_candidate = false;
for (int t = max_t; t >= min_t; t--)
{
// Check to see if t is a solution to the equation.
// Potentially two targets could write here.
int candidate_solution_first = (word_offset & ~1) - tmem16_stride * t;
int candidate_solution_second = (word_offset | 1) - tmem16_stride * t;
int candidate_t_first = (candidate_solution_first * info.dxt) >> 16;
int candidate_t_second = (candidate_solution_second * info.dxt) >> 16;
if (((candidate_solution_second + candidate_t_second * tmem16_stride) ^ (candidate_t_second & 1)) == word_offset)
{
found_candidate = true;
pixel_offset = (candidate_solution_second << 1) + (pixel_offset & 1);
break;
}
else if (((candidate_solution_first + candidate_t_first * tmem16_stride) ^ (candidate_t_first & 1)) == word_offset)
{
found_candidate = true;
pixel_offset = (candidate_solution_first << 1) + (pixel_offset & 1);
break;
}
}
// We strided over this 64bpp word.
if (!found_candidate)
return;
}
upload_x = pixel_offset;
upload_y = 0;
}
else if (tmem16_stride == 0)
{
// For TMEM stride of 0 we're essentially replaying the same line over and over and the final visible result
// is what happened in Y == height - 1.
upload_x = pixel_offset;
upload_y = info.height - 1;
}
else
{
upload_y = compute_upload_t(pixel_offset, info.inv_tmem_stride_words);
upload_x = pixel_offset - upload_y * tmem16_stride;
// If stride is smaller than width, we'll need to unroll the last line.
if (upload_y >= info.height)
{
upload_x += tmem16_stride * (upload_y - info.height + 1);
upload_y = info.height - 1;
}
}
int last_line_upload_x = upload_x ^ ((upload_y & 1) << 1);
if (last_line_upload_x >= info.width && upload_y > 0)
{
// If the last line won't trigger a write, the previous line probably did.
upload_y--;
upload_x += tmem16_stride;
}
int iteration_offset;
upload_x ^= ((upload_y & 1) << 1) | upload_x_xor;
if (info.vram_size == 3 || yuv)
{
iteration_offset = 4 * (upload_x & ~1);
}
else if (info.vram_size == 2)
{
// In 16bpp VRAM mode, we are supposed to step 4 pixels at a time (8 bytes), which will form 2 complete pixels.
// However, in 32bpp tile mode we're not shifting the X value appropriately.
// So, we're writing texels [0, 1, ..., 4, 5, ...], etc.
if ((upload_x & 2) != 0)
{
// We're not writing in this line, but the previous line might have!
// Interleaving patterns will form ...
if (upload_y > 0)
{
upload_y--;
upload_x += tmem16_stride;
upload_x ^= 2;
}
else
{
// These 2 words will never be written to.
return;
}
}
iteration_offset = 2 * (upload_x & ~1);
}
else if (info.vram_size == 1)
{
// 4 potential mirrors.
for (int i = 0; i < 4 && upload_y > 0 && (upload_x & 6) != 0; i++)
{
upload_y--;
upload_x += tmem16_stride;
upload_x ^= 2;
}
if ((upload_x & 6) != 0)
{
// These 6 words will never be written to.
return;
}
iteration_offset = upload_x & ~1;
}
if (upload_x >= info.width)
return;
int line_rdram_addr = info.vram_addr + ((upload_y * info.vram_width) << (info.vram_size - 1));
// The loading pipeline reads 64 bits per iteration.
int rdram_addr = line_rdram_addr + iteration_offset + 4 * (upload_x & 1);
uint word;
if ((rdram_addr & 3) == 0)
{
word = uint(vram32.data[rdram_addr >> 2]);
}
else
{
word = (uint(vram8.data[rdram_addr ^ 3]) << 24) |
(uint(vram8.data[(rdram_addr + 1) ^ 3]) << 16) |
(uint(vram8.data[(rdram_addr + 2) ^ 3]) << 8) |
uint(vram8.data[(rdram_addr + 3) ^ 3]);
}
if (yuv)
{
// Lower TMEM receives interleaved UV samples, while upper receives Y.
if (upper_tmem)
{
uint y0 = (word >> 16u) & 0xffu;
uint y1 = (word >> 0u) & 0xffu;
word = (y0 << 8u) | y1;
}
else
{
uint u = (word >> 24u) & 0xffu;
uint v = (word >> 8u) & 0xffu;
word = (u << 8u) | v;
}
}
else
{
word >>= 16u - 16u * uint(upper_tmem);
word &= 0xffffu;
}
current_tmem_value = word;
tmem_dirty = true;
}
// Scatter-as-gather replay of a <=16bpp TMEM upload for the single 16-bit word
// owned by this invocation: work backwards from our word index to the upload
// iteration that last wrote it (if any), then fetch the matching 16 bits from
// RDRAM. On a write, sets current_tmem_value and tmem_dirty rather than touching
// memory directly.
void update_tmem_16(UploadInfo info, int tmem16_index)
{
int tmem16_offset = (info.tmem_offset & 0xfff) >> 1;
int tmem16_stride = info.tmem_stride_words;
// Word index relative to the upload's TMEM base, wrapped to the 4 KiB TMEM.
int pixel_offset = (tmem16_index - tmem16_offset) & 0x7ff;
int upload_x, upload_y;
int upload_x_xor = 0;
if (info.mode == UPLOAD_MODE_BLOCK)
{
// LoadBlock: one long line where T advances by DxT per 64-bit word.
int word_offset = pixel_offset >> 2;
if (info.tmem_stride_words == 0)
{
// Trivial case, we can just compute T factor directly and set upload_x_xor.
// Other than that, it works like a simple 1D upload.
upload_x_xor = (((word_offset * info.dxt) >> 16) & 1) << 1;
}
else
{
// Welp ... This is pure insanity, but if we want to be completely correct ...
int min_t = compute_upload_t(word_offset, info.min_t_mod);
int max_t = compute_upload_t(word_offset, info.max_t_mod);
// If t has a range, then the solution to Y = (t = floor(X * dt / 2048)) * stride + X has a range space of:
// Y - t_max * stride <= X <= Y - t_min * stride.
int max_word_candidate = word_offset - tmem16_stride * min_t;
int min_word_candidate = word_offset - tmem16_stride * max_t;
// If we have constraints for X, we constraint T further.
min_t = max(min_t, (min_word_candidate * info.dxt) >> 16);
max_t = min(max_t, (max_word_candidate * info.dxt) >> 16);
bool found_candidate = false;
// Search from the highest T downwards; the last matching solution wins.
for (int t = max_t; t >= min_t; t--)
{
// Check to see if t is a solution to the equation.
int candidate_solution = word_offset - tmem16_stride * t;
int computed_t = (candidate_solution * info.dxt) >> 16;
if (candidate_solution + computed_t * tmem16_stride == word_offset)
{
found_candidate = true;
upload_x_xor = (computed_t & 1) << 1;
pixel_offset = (candidate_solution << 2) + (pixel_offset & 3);
}
}
// We strided over this 64bpp word.
if (!found_candidate)
return;
}
upload_x = pixel_offset;
upload_y = 0;
}
else if (tmem16_stride == 0)
{
// For TMEM stride of 0 we're essentially replaying the same line over and over and the final visible result
// is what happened in Y == height - 1.
upload_x = pixel_offset;
upload_y = info.height - 1;
}
else
{
// Recover (x, y) from the linear TMEM offset.
upload_y = compute_upload_t(pixel_offset, info.inv_tmem_stride_words);
upload_x = pixel_offset - upload_y * tmem16_stride;
// If stride is smaller than width, we'll need to unroll the last line.
if (upload_y >= info.height)
{
upload_x += tmem16_stride * (upload_y - info.height + 1);
upload_y = info.height - 1;
}
}
// This is pure bullshit magic which arises as an edge case when
// tile pixel size does not match texture image size.
// Should not happen in normal applications.
// This is basically doing scatter-as-gather, so we need to figure out
// if there is no write to our texel after all (striding), or if there are multiple writes
// to our texel, in which case we need to figure out the last writer.
// This code is black magic, and it's made with blood, sweat and tears from testing with lots of trial and error.
// NOTE(review): iteration_offset stays uninitialized when tmem_size != vram_size
// and neither mismatch branch below matches (e.g. vram_size - tmem_size >= 2) --
// presumably unreachable for 16bpp-path uploads; confirm.
int iteration_offset;
if (info.tmem_size != info.vram_size)
{
if (info.vram_size - info.tmem_size == 1)
{
// If TMEM is N bpp but VRAM is 2N bpp, we will get mirrored writes here.
// Select which half of the 2N bpp load we observe in TMEM.
iteration_offset = (upload_x & ~3) * 4;
if ((upload_x & ~3) + 2 < (info.vram_effective_width >> (3 - info.vram_size)))
iteration_offset += 8;
}
else if (info.tmem_size == 2 && info.vram_size == 1)
{
// In 8bpp VRAM mode, we are supposed to step 8 pixels at a time (8 bytes), which will form 4 complete pixels.
// However, in 16bpp tile mode we're not shifting the X value appropriately.
// So, we're writing texels [0, 1, 2, 3, ..., 8, 9, 10, 11], etc.
if ((upload_x & 4) != 0)
{
// We're not writing in this line, but the previous line might have!
// Interleaving patterns will form ...
if ((tmem16_stride & 4) != 0 && upload_y > 0)
{
upload_y--;
upload_x += tmem16_stride;
}
else
{
// These 4 words will never be written to.
return;
}
}
iteration_offset = upload_x & ~3;
}
}
else
{
// Normal case TMEM size aligns with VRAM size.
iteration_offset = (upload_x & ~3) * 2;
}
if (upload_x >= info.width)
return;
// Byte address of the source line in RDRAM.
int line_rdram_addr = info.vram_addr + ((upload_y * info.vram_width) << (info.vram_size - 1));
// Apply the odd-line word swizzle plus any block-mode T swizzle.
upload_x ^= ((upload_y & 1) << 1) | upload_x_xor;
// The loading pipeline reads 64 bits per iteration.
int rdram_addr = line_rdram_addr + iteration_offset + 2 * (upload_x & 3);
uint word;
if ((rdram_addr & 1) == 0)
word = uint(vram16.data[(rdram_addr >> 1) ^ 1]);
else
// Unaligned: assemble the 16-bit word from byte-swapped RDRAM bytes.
word = (uint(vram8.data[rdram_addr ^ 3]) << 8) | uint(vram8.data[(rdram_addr + 1) ^ 3]);
current_tmem_value = word;
tmem_dirty = true;
}
// Scatter-as-gather replay of a TLUT (LoadTlut) upload for this invocation's
// 16-bit TMEM word. TLUT loads splat each palette entry across 4 consecutive
// TMEM words; depending on the tile-size / VRAM-size combination, only certain
// word offsets are ever written, and the rest return early. On a write, sets
// current_tmem_value and tmem_dirty rather than touching memory directly.
void update_tmem_lut(UploadInfo info, int tmem16_index)
{
int tmem16_offset = (info.tmem_offset & 0xfff) >> 1;
// Word index relative to the upload's TMEM base, wrapped to the 4 KiB TMEM.
int pixel_offset = (tmem16_index - tmem16_offset) & 0x7ff;
int pixel_offset_splat;
if (info.vram_size - info.tmem_size == 2)
{
// VRAM entries are 4x the tile size: every group of 4 words maps to one entry.
pixel_offset_splat = pixel_offset >> 2;
pixel_offset_splat <<= info.vram_size - 2;
if (pixel_offset_splat >= info.vram_effective_width)
return;
}
else if (info.vram_size - info.tmem_size == 1)
{
// VRAM entries are 2x the tile size: only the lower half of each 8-word group is written.
if ((pixel_offset & 4) == 0)
{
int shamt = info.tmem_size + (info.vram_size == 2 ? 2 : 0);
pixel_offset_splat = (pixel_offset & ~7) >> shamt;
if (pixel_offset_splat >= info.vram_effective_width)
return;
}
else
{
return;
}
}
else if (info.vram_size == info.tmem_size)
{
// Matching sizes: only the first word of each 4-word splat group is written.
if ((pixel_offset & 0xc) == 0)
{
int shamt = info.tmem_size + (info.vram_size == 2 ? 2 : 0);
pixel_offset_splat = (pixel_offset & ~3) >> shamt;
if (pixel_offset_splat >= info.vram_effective_width)
return;
}
else
{
return;
}
}
else if (info.vram_size - info.tmem_size == -1)
{
// Tile is wider than VRAM entries: only the first word of each 8-word group is written.
if ((pixel_offset & 0x1c) == 0)
{
int shamt = info.tmem_size;
pixel_offset_splat = (pixel_offset >> shamt) & ~7;
if (pixel_offset_splat >= info.vram_effective_width)
return;
}
else
{
return;
}
}
else
{
// 4bpp tile, 32bpp VRAM. Mirrored writes.
int span_iteration = pixel_offset >> 2;
span_iteration = span_iteration * 2;
int span_pixel = span_iteration * 2;
if (span_pixel + 2 < info.vram_effective_width)
span_pixel += 2;
if (span_pixel >= info.vram_effective_width)
return;
pixel_offset_splat = span_pixel;
}
// Byte address of the palette entry in RDRAM.
int rdram_addr = info.vram_addr + (pixel_offset_splat << (info.vram_size - 1));
// Odd behavior when we have unaligned TLUT uploads.
rdram_addr += 2 * (rdram_addr & 1) * (pixel_offset & 3);
uint word;
if ((rdram_addr & 1) == 0)
word = uint(vram16.data[(rdram_addr >> 1) ^ 1]);
else
// Unaligned: assemble the 16-bit word from byte-swapped RDRAM bytes.
word = (uint(vram8.data[rdram_addr ^ 3]) << 8) | uint(vram8.data[(rdram_addr + 1) ^ 3]);
current_tmem_value = word;
tmem_dirty = true;
}
// Replays the batch of queued TMEM uploads on top of the persistent TMEM state.
// Each invocation owns one 16-bit TMEM word. After each upload i, a snapshot of
// the full TMEM is written to tile_instances.instances[i + 1] (instance 0 is the
// state before any upload in this batch). The live TMEM buffer is only written
// back if this word actually changed.
void main()
{
    tmem_dirty = false;
    current_tmem_value = uint(tmem16.data[gl_GlobalInvocationID.x]);
    // The ^ 1 swaps adjacent 16-bit words -- presumably matching how logical TMEM
    // word indices map onto the storage order of tmem16; confirm against writers.
    int tmem16_index = int(gl_GlobalInvocationID.x) ^ 1;
    // TMEM is 2048 16-bit words; the upper bank starts at word 0x400.
    bool upper_tmem = tmem16_index >= 0x400;
    tile_instances.instances[0].data[gl_GlobalInvocationID.x] = mem_u16(current_tmem_value);
    int num_uploads = registers.num_uploads;
    for (int i = 0; i < num_uploads; i++)
    {
        UploadInfo info = upload_info[i];
        if (info.mode == UPLOAD_MODE_TLUT)
        {
            update_tmem_lut(info, tmem16_index);
        }
        else
        {
            bool yuv = info.tmem_fmt == TEXTURE_FMT_YUV;
            if (info.tmem_size == 3 || yuv)
                update_tmem_32(info, tmem16_index & 0x3ff, upper_tmem, yuv);
            else
                // Previously guarded by (tmem_fmt != TEXTURE_FMT_YUV), which is a
                // tautology on this branch since !yuv already holds here.
                update_tmem_16(info, tmem16_index);
        }
        // Snapshot TMEM contents after upload i.
        tile_instances.instances[i + 1].data[gl_GlobalInvocationID.x] = mem_u16(current_tmem_value);
    }
    if (tmem_dirty)
        tmem16.data[gl_GlobalInvocationID.x] = mem_u16(current_tmem_value);
}

View File

@ -0,0 +1,103 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
// RIP to any GPU which attempts to execute this monstrosity :)
#if SUBGROUP
#extension GL_KHR_shader_subgroup_basic : require
#extension GL_KHR_shader_subgroup_vote : require
#extension GL_KHR_shader_subgroup_ballot : require
#extension GL_KHR_shader_subgroup_arithmetic : require
#endif
#include "small_types.h"
layout(local_size_x_id = 3, local_size_y_id = 4) in;
#include "debug.h"
#include "data_structures_buffers.h"
#include "noise.h"
#include "memory_interfacing.h"
#include "shading.h"
layout(push_constant, std430) uniform Registers
{
uint fb_addr_index;
uint fb_depth_addr_index;
uint fb_width;
uint fb_height;
uint group_mask;
} registers;
layout(constant_id = 5) const int MAX_PRIMITIVES = 256;
layout(constant_id = 6) const int MAX_WIDTH = 1024;
const int TILE_BINNING_STRIDE = MAX_PRIMITIVES / 32;
const int MAX_TILES_X = MAX_WIDTH / int(gl_WorkGroupSize.x);
// Per-pixel shading entry point for one workgroup-sized tile.
// Walks the two-level binning bitmasks for this tile and shades every
// primitive binned here, in primitive submission order.
void main()
{
    int x = int(gl_GlobalInvocationID.x);
    int y = int(gl_GlobalInvocationID.y);
    ivec2 tile = ivec2(gl_WorkGroupID.xy);
    int linear_tile = tile.x + tile.y * MAX_TILES_X;
    int linear_tile_base = linear_tile * TILE_BINNING_STRIDE;
    // Coarse mask: one bit per group of 32 primitives. group_mask restricts
    // this pass to the primitive groups belonging to the current dispatch.
    uint coarse_binned = tile_binning_coarse.elems[linear_tile] & registers.group_mask;
    if (coarse_binned == 0u)
        return;
    init_tile(gl_GlobalInvocationID.xy,
              registers.fb_width, registers.fb_height,
              registers.fb_addr_index, registers.fb_depth_addr_index);
    while (coarse_binned != 0u)
    {
        int mask_index = findLSB(coarse_binned);
        coarse_binned &= ~uint(1 << mask_index);
        // Fine mask: one bit per primitive within the 32-primitive group.
        uint binned = tile_binning.elems[linear_tile_base + mask_index];
        while (binned != 0u)
        {
            int i = findLSB(binned);
            binned &= ~uint(1 << i);
            uint primitive_index = uint(i + 32 * mask_index);
            ShadedData shaded;
            if (shade_pixel(x, y, primitive_index, shaded))
            {
                // FILL and COPY pipeline modes bypass the normal
                // depth-blend path entirely.
                if ((shaded.coverage_count & COVERAGE_FILL_BIT) != 0)
                    fill_color(derived_setup.elems[primitive_index].fill_color);
                else if ((shaded.coverage_count & COVERAGE_COPY_BIT) != 0)
                    copy_pipeline(shaded.z_dith, primitive_index);
                else
                    depth_blend(x, y, primitive_index, shaded);
            }
        }
    }
    finish_tile(gl_GlobalInvocationID.xy,
                registers.fb_width, registers.fb_height,
                registers.fb_addr_index, registers.fb_depth_addr_index);
}

View File

@ -0,0 +1,119 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "small_types.h"
#include "fb_formats.h"
layout(local_size_x_id = 3) in;
layout(constant_id = 0) const int RDRAM_SIZE = 8 * 1024 * 1024;
const int RDRAM_MASK_8 = RDRAM_SIZE - 1;
const int RDRAM_MASK_16 = RDRAM_MASK_8 >> 1;
const int RDRAM_MASK_32 = RDRAM_MASK_8 >> 2;
layout(constant_id = 1) const int FB_SIZE_LOG2 = 0;
layout(constant_id = 2) const bool COLOR_DEPTH_ALIAS = false;
layout(constant_id = 4) const int NUM_SAMPLES = 1;
layout(push_constant) uniform Registers
{
uint num_pixels, fb_addr, fb_depth_addr;
} registers;
layout(set = 0, binding = 0) readonly buffer RDRAMSingleSampled8
{
uint8_t elems[];
} vram8;
layout(set = 0, binding = 0) readonly buffer RDRAMSingleSampled16
{
uint16_t elems[];
} vram16;
layout(set = 0, binding = 0) readonly buffer RDRAMSingleSampled32
{
uint elems[];
} vram32;
layout(set = 0, binding = 2) buffer RDRAMUpscalingReference8
{
uint8_t elems[];
} vram_reference8;
layout(set = 0, binding = 2) buffer RDRAMUpscalingReference16
{
uint16_t elems[];
} vram_reference16;
layout(set = 0, binding = 2) buffer RDRAMUpscalingReference32
{
uint elems[];
} vram_reference32;
// Seed the upscaling reference copy with one byte of emulated RDRAM.
void copy_rdram_8(uint index)
{
    uint byte_index = index & RDRAM_MASK_8;
    vram_reference8.elems[byte_index] = uint8_t(uint(vram8.elems[byte_index]));
}

// Seed the upscaling reference copy with one 16-bit word of emulated RDRAM.
void copy_rdram_16(uint index)
{
    uint word_index = index & RDRAM_MASK_16;
    vram_reference16.elems[word_index] = uint16_t(uint(vram16.elems[word_index]));
}

// Seed the upscaling reference copy with one 32-bit word of emulated RDRAM.
void copy_rdram_32(uint index)
{
    uint word_index = index & RDRAM_MASK_32;
    vram_reference32.elems[word_index] = vram32.elems[word_index];
}
// One invocation per framebuffer pixel: copy the color pixel (and, unless
// color and depth alias the same memory, the depth pixel) from RDRAM into
// the reference buffer.
void main()
{
    uint pixel = gl_GlobalInvocationID.x;
    if (pixel >= registers.num_pixels)
        return;
    uint color_index = registers.fb_addr + pixel;
    uint depth_index = registers.fb_depth_addr + pixel;
    // FB_SIZE_LOG2 is a specialization constant selecting the pixel size.
    if (FB_SIZE_LOG2 == 0)
        copy_rdram_8(color_index);
    else if (FB_SIZE_LOG2 == 1)
        copy_rdram_16(color_index);
    else if (FB_SIZE_LOG2 == 2)
        copy_rdram_32(color_index);
    if (!COLOR_DEPTH_ALIAS)
        copy_rdram_16(depth_index);
}

View File

@ -0,0 +1,185 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "small_types.h"
layout(local_size_x_id = 3) in;
layout(constant_id = 0) const int RDRAM_SIZE = 8 * 1024 * 1024;
const int RDRAM_MASK_8 = RDRAM_SIZE - 1;
const int RDRAM_MASK_16 = RDRAM_MASK_8 >> 1;
const int RDRAM_MASK_32 = RDRAM_MASK_8 >> 2;
layout(constant_id = 1) const int FB_SIZE_LOG2 = 0;
layout(constant_id = 2) const bool COLOR_DEPTH_ALIAS = false;
layout(constant_id = 4) const int NUM_SAMPLES = 1;
layout(push_constant) uniform Registers
{
uint num_pixels, fb_addr, fb_depth_addr;
} registers;
layout(set = 0, binding = 0) readonly buffer RDRAMSingleSampled8
{
uint8_t elems[];
} vram8;
layout(set = 0, binding = 0) readonly buffer RDRAMSingleSampled16
{
uint16_t elems[];
} vram16;
layout(set = 0, binding = 0) readonly buffer RDRAMSingleSampled32
{
uint elems[];
} vram32;
layout(set = 0, binding = 1) readonly buffer RDRAMHiddenSingleSampled
{
uint8_t elems[];
} hidden_vram;
layout(set = 0, binding = 2) buffer RDRAMUpscalingReference8
{
uint8_t elems[];
} vram_reference8;
layout(set = 0, binding = 2) buffer RDRAMUpscalingReference16
{
uint16_t elems[];
} vram_reference16;
layout(set = 0, binding = 2) buffer RDRAMUpscalingReference32
{
uint elems[];
} vram_reference32;
layout(set = 0, binding = 3) buffer RDRAMUpscaling8
{
uint8_t elems[];
} vram_upscaled8;
layout(set = 0, binding = 3) buffer RDRAMUpscaling16
{
uint16_t elems[];
} vram_upscaled16;
layout(set = 0, binding = 3) buffer RDRAMUpscaling32
{
uint elems[];
} vram_upscaled32;
layout(set = 0, binding = 4) buffer RDRAMHiddenUpscaling
{
uint8_t elems[];
} hidden_vram_upscaled;
// Propagate a host-visible RDRAM byte into every sample plane of the
// upscaled copy, but only if it differs from the reference copy (i.e. the
// CPU wrote it since the last sync). Sample plane i lives at an offset of
// i * RDRAM_SIZE bytes.
void update_rdram_8(uint index)
{
    index &= RDRAM_MASK_8;
    uint real_word = uint(vram8.elems[index]);
    uint reference_word = uint(vram_reference8.elems[index]);
    if (real_word != reference_word)
    {
        // index ^ 3: byte-swapped addressing into hidden RDRAM.
        // NOTE(review): presumably this matches the RDRAM byte swizzle used
        // elsewhere in the renderer -- confirm against memory_interfacing.h.
        uint mirrored_index = index ^ 3u;
        uint real_hidden_word = uint(hidden_vram.elems[mirrored_index >> 1u]);
        for (int i = 0; i < NUM_SAMPLES; i++)
        {
            vram_upscaled8.elems[index + i * RDRAM_SIZE] = uint8_t(real_word);
            // Hidden RDRAM stores one byte per 16-bit word; only the odd
            // byte of a pair carries the hidden value.
            if ((mirrored_index & 1u) != 0u)
                hidden_vram_upscaled.elems[(mirrored_index >> 1u) + i * (RDRAM_SIZE >> 1)] = uint8_t(real_hidden_word);
        }
        // Refresh the reference so this host write is only applied once.
        vram_reference8.elems[index] = uint8_t(real_word);
    }
}

// 16-bit variant of update_rdram_8; index is in 16-bit words, sample planes
// are (RDRAM_SIZE >> 1) words apart.
void update_rdram_16(uint index)
{
    index &= RDRAM_MASK_16;
    uint real_word = uint(vram16.elems[index]);
    uint reference_word = uint(vram_reference16.elems[index]);
    if (real_word != reference_word)
    {
        // index ^ 1: word-swapped addressing into hidden RDRAM (see above).
        uint mirrored_index = index ^ 1u;
        uint real_hidden_word = uint(hidden_vram.elems[mirrored_index]);
        for (int i = 0; i < NUM_SAMPLES; i++)
        {
            vram_upscaled16.elems[index + i * (RDRAM_SIZE >> 1)] = uint16_t(real_word);
            hidden_vram_upscaled.elems[mirrored_index + i * (RDRAM_SIZE >> 1)] = uint8_t(real_hidden_word);
        }
        vram_reference16.elems[index] = uint16_t(real_word);
    }
}

// 32-bit variant; index is in 32-bit words, sample planes are
// (RDRAM_SIZE >> 2) words apart. Each 32-bit word owns two hidden bytes.
void update_rdram_32(uint index)
{
    index &= RDRAM_MASK_32;
    uint real_word = vram32.elems[index];
    uint reference_word = vram_reference32.elems[index];
    if (real_word != reference_word)
    {
        uint real_hidden_word0 = uint(hidden_vram.elems[2u * index]);
        uint real_hidden_word1 = uint(hidden_vram.elems[2u * index + 1u]);
        for (int i = 0; i < NUM_SAMPLES; i++)
        {
            vram_upscaled32.elems[index + i * (RDRAM_SIZE >> 2)] = real_word;
            hidden_vram_upscaled.elems[2u * index + i * (RDRAM_SIZE >> 1)] = uint8_t(real_hidden_word0);
            hidden_vram_upscaled.elems[2u * index + 1u + i * (RDRAM_SIZE >> 1)] = uint8_t(real_hidden_word1);
        }
        vram_reference32.elems[index] = real_word;
    }
}
// One invocation per framebuffer pixel: push any host RDRAM writes for the
// color pixel (and, unless color and depth alias, the depth pixel) out to
// the upscaled sample planes.
void main()
{
    uint pixel = gl_GlobalInvocationID.x;
    if (pixel >= registers.num_pixels)
        return;
    uint color_index = registers.fb_addr + pixel;
    uint depth_index = registers.fb_depth_addr + pixel;
    // FB_SIZE_LOG2 is a specialization constant selecting the pixel size.
    if (FB_SIZE_LOG2 == 0)
        update_rdram_8(color_index);
    else if (FB_SIZE_LOG2 == 1)
        update_rdram_16(color_index);
    else if (FB_SIZE_LOG2 == 2)
        update_rdram_32(color_index);
    if (!COLOR_DEPTH_ALIAS)
        update_rdram_16(depth_index);
}

View File

@ -0,0 +1,279 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "small_types.h"
#include "fb_formats.h"
layout(local_size_x_id = 3) in;
layout(constant_id = 0) const int RDRAM_SIZE = 8 * 1024 * 1024;
const int RDRAM_MASK_8 = RDRAM_SIZE - 1;
const int RDRAM_MASK_16 = RDRAM_MASK_8 >> 1;
const int RDRAM_MASK_32 = RDRAM_MASK_8 >> 2;
layout(constant_id = 1) const int FB_SIZE_LOG2 = 0;
layout(constant_id = 2) const bool COLOR_DEPTH_ALIAS = false;
layout(constant_id = 4) const int NUM_SAMPLES = 1;
layout(constant_id = 5) const bool DITHER = false;
layout(constant_id = 6) const bool RDRAM_UNSCALED_WRITE_MASK = false;
layout(push_constant) uniform Registers
{
uint num_pixels, fb_addr, fb_depth_addr, width, height;
} registers;
layout(set = 0, binding = 0) writeonly buffer RDRAMSingleSampled8
{
uint8_t elems[];
} vram8;
layout(set = 0, binding = 0) writeonly buffer RDRAMSingleSampled16
{
uint16_t elems[];
} vram16;
layout(set = 0, binding = 0) writeonly buffer RDRAMSingleSampled32
{
uint elems[];
} vram32;
layout(set = 0, binding = 2) writeonly buffer RDRAMUpscalingReference8
{
uint8_t elems[];
} vram_reference8;
layout(set = 0, binding = 2) writeonly buffer RDRAMUpscalingReference16
{
uint16_t elems[];
} vram_reference16;
layout(set = 0, binding = 2) writeonly buffer RDRAMUpscalingReference32
{
uint elems[];
} vram_reference32;
layout(set = 0, binding = 3) readonly buffer RDRAMUpscaling8
{
uint8_t elems[];
} vram_upscaled8;
layout(set = 0, binding = 3) readonly buffer RDRAMUpscaling16
{
uint16_t elems[];
} vram_upscaled16;
layout(set = 0, binding = 3) readonly buffer RDRAMUpscaling32
{
uint elems[];
} vram_upscaled32;
layout(set = 0, binding = 4) readonly buffer RDRAMHiddenUpscaling
{
uint8_t elems[];
} hidden_vram_upscaled;
// Resolve one byte of the upscaled color buffer back to unscaled RDRAM by
// averaging all samples (round-to-nearest).
void copy_rdram_8(uint index)
{
    index &= RDRAM_MASK_8;
    index ^= 3u;
    uint r = 0u;
    for (int i = 0; i < NUM_SAMPLES; i++)
    {
        // BUGFIX: index into sample plane i. The writer (update_rdram_8)
        // stores sample i at index + i * RDRAM_SIZE; the old code read
        // sample 0 NUM_SAMPLES times, making the average a no-op.
        uint real_word = uint(vram_upscaled8.elems[index + i * RDRAM_SIZE]);
        r += real_word;
    }
    // Round-to-nearest average across samples.
    r = (r + (NUM_SAMPLES >> 1)) / NUM_SAMPLES;
    vram_reference8.elems[index] = uint8_t(r);
    vram8.elems[index] = uint8_t(r);
    if (RDRAM_UNSCALED_WRITE_MASK)
    {
        // Need this memory barrier to ensure the mask readback does not read
        // an invalid value from RDRAM. If the mask is seen, the valid RDRAM value is
        // also coherent.
        memoryBarrierBuffer();
        vram8.elems[index + RDRAM_SIZE] = mem_u8(0xff);
    }
    // Don't bother writing back hidden VRAM. It is not visible to host anyways, and coverage is meaningless when it's filtered.
    // If host later decides to modify the CPU memory, then the hidden VRAM values become complete bogus either way.
}
// Unpack a 16-bit RGBA5551 word into (r, g, b, a) components.
uvec4 decode_rgba5551(uint word)
{
    uint r = (word >> 11u) & 0x1fu;
    uint g = (word >> 6u) & 0x1fu;
    uint b = (word >> 1u) & 0x1fu;
    uint a = word & 1u;
    return uvec4(r, g, b, a);
}

// Pack (r, g, b, a) components back into a 16-bit RGBA5551 word.
uint encode_rgba5551(uvec4 color)
{
    uint word = color.a;
    word |= color.b << 1u;
    word |= color.g << 6u;
    word |= color.r << 11u;
    return word;
}

// 4x4 ordered-dither matrix applied when resolving multi-sampled 5-bit channels.
const uint bayer_dither_lut[16] = uint[](
    0, 4, 1, 5,
    4, 0, 5, 1,
    3, 7, 2, 6,
    7, 3, 6, 2);
// Resolve one 16-bit RGBA5551 color word from the upscaled sample planes
// back to unscaled RDRAM, averaging each channel across all samples.
// (x, y) are the unscaled pixel coordinates, used for ordered dithering.
void copy_rdram_16(uint index, uint x, uint y)
{
    index &= RDRAM_MASK_16;
    // index ^ 1: word-swapped RDRAM addressing.
    // NOTE(review): presumably matches the RDRAM word swizzle used by the
    // rest of the renderer -- confirm against memory_interfacing.h.
    index ^= 1u;
    uvec4 rgba = uvec4(0u);
    for (int i = 0; i < NUM_SAMPLES; i++)
    {
        // Sample plane i is (RDRAM_SIZE >> 1) 16-bit words further in.
        uint real_word = uint(vram_upscaled16.elems[index + i * (RDRAM_SIZE >> 1)]);
        rgba += decode_rgba5551(real_word);
    }
    if (DITHER)
    {
        // Ordered dither: bias the channel sums by a 4x4 Bayer threshold
        // (in 1/8th-of-a-sample units) before dividing.
        uint dither_value = bayer_dither_lut[(y & 3u) * 4u + (x & 3u)] * NUM_SAMPLES;
        rgba = (8u * rgba + dither_value) / (8 * NUM_SAMPLES);
    }
    else
    {
        // Plain round-to-nearest average.
        rgba = (rgba + (NUM_SAMPLES >> 1)) / NUM_SAMPLES;
    }
    uint encoded = encode_rgba5551(rgba);
    vram16.elems[index] = uint16_t(encoded);
    vram_reference16.elems[index] = uint16_t(encoded);
    if (RDRAM_UNSCALED_WRITE_MASK)
    {
        // Need this memory barrier to ensure the mask readback does not read
        // an invalid value from RDRAM. If the mask is seen, the valid RDRAM value is
        // also coherent.
        memoryBarrierBuffer();
        vram16.elems[index + (RDRAM_SIZE >> 1u)] = mem_u16(0xffff);
    }
    // Don't bother writing back hidden VRAM. It is not visible to host anyways, and coverage is meaningless when it's filtered.
    // If host later decides to modify the CPU memory, then the hidden VRAM values become complete bogus either way.
}
// Resolve a 16-bit (depth) word by copying sample 0 verbatim; depth values
// cannot be meaningfully averaged. Sample 0 overlaps exactly with the
// single-sampled version, so coverage clipping aside the result matches.
void copy_rdram_16_single_sample(uint index)
{
    uint word_index = (index & RDRAM_MASK_16) ^ 1u;
    uint upscaled_value = uint(vram_upscaled16.elems[word_index]);
    vram16.elems[word_index] = uint16_t(upscaled_value);
    vram_reference16.elems[word_index] = uint16_t(upscaled_value);
    if (RDRAM_UNSCALED_WRITE_MASK)
    {
        // Publish the data before the mask so a reader that observes the
        // mask also observes a coherent RDRAM value.
        memoryBarrierBuffer();
        vram16.elems[word_index + (RDRAM_SIZE >> 1u)] = mem_u16(0xffff);
    }
    // Hidden VRAM is deliberately not written back: it is invisible to the
    // host, and filtered coverage is meaningless anyway.
}
// Unpack a 32-bit word into 8-bit (r, g, b, a) components.
uvec4 decode_rgba8(uint word)
{
    uint r = (word >> 24u) & 0xffu;
    uint g = (word >> 16u) & 0xffu;
    uint b = (word >> 8u) & 0xffu;
    uint a = word & 0xffu;
    return uvec4(r, g, b, a);
}

// Pack 8-bit (r, g, b, a) components back into a 32-bit word.
uint encode_rgba8(uvec4 color)
{
    return color.a | (color.b << 8u) | (color.g << 16u) | (color.r << 24u);
}
// Resolve one 32-bit RGBA8 color word from the upscaled sample planes back
// to unscaled RDRAM, averaging each channel across all samples
// (round-to-nearest, no dithering at 8 bits per channel).
void copy_rdram_32(uint index)
{
    index &= RDRAM_MASK_32;
    uvec4 rgba = uvec4(0u);
    for (int i = 0; i < NUM_SAMPLES; i++)
    {
        // Sample plane i is (RDRAM_SIZE >> 2) 32-bit words further in.
        uint real_word = vram_upscaled32.elems[index + i * (RDRAM_SIZE >> 2)];
        rgba += decode_rgba8(real_word);
    }
    rgba = (rgba + (NUM_SAMPLES >> 1)) / NUM_SAMPLES;
    uint encoded = encode_rgba8(rgba);
    vram32.elems[index] = encoded;
    vram_reference32.elems[index] = encoded;
    if (RDRAM_UNSCALED_WRITE_MASK)
    {
        // Need this memory barrier to ensure the mask readback does not read
        // an invalid value from RDRAM. If the mask is seen, the valid RDRAM value is
        // also coherent.
        memoryBarrierBuffer();
        vram32.elems[index + (RDRAM_SIZE >> 2u)] = ~0u;
    }
    // Don't bother writing back hidden VRAM. It is not visible to host anyways, and coverage is meaningless when it's filtered.
    // If host later decides to modify the CPU memory, then the hidden VRAM values become complete bogus either way.
}
// One invocation per unscaled pixel: consult the packed write mask and
// resolve color and/or depth only where the renderer actually wrote.
// The mask is stored after the NUM_SAMPLES sample planes in binding 3,
// packed 2 bits per pixel in 4x4 pixel blocks (one 32-bit word per block).
void main()
{
    uvec2 coord = gl_GlobalInvocationID.xy;
    if (coord.x >= registers.width)
        return;
    uint index = coord.y * registers.width + coord.x;
    uint depth_index = index + registers.fb_depth_addr;
    uint color_index = index + registers.fb_addr;
    // 4x4 block coordinate and the block's index in the mask array.
    uvec2 mask_coord = coord >> 2u;
    uint mask_index = mask_coord.x + mask_coord.y * ((registers.width + 3) >> 2u);
    uint write_mask = vram_upscaled32.elems[NUM_SAMPLES * (RDRAM_SIZE >> 2) + mask_index];
    // 2 bits per pixel within the block, row-major.
    uint shamt = 2u * ((coord.x & 3u) + 4u * (coord.y & 3u));
    write_mask = write_mask >> shamt;
    // Bit 0: color written; bit 1: depth written.
    bool color_write_mask = (write_mask & 1u) != 0u;
    bool depth_write_mask = (write_mask & 2u) != 0u;
    if (color_write_mask)
    {
        switch (FB_SIZE_LOG2)
        {
        case 0:
            copy_rdram_8(color_index);
            break;
        case 1:
            copy_rdram_16(color_index, coord.x, coord.y);
            break;
        case 2:
            copy_rdram_32(color_index);
            break;
        }
    }
    if (!COLOR_DEPTH_ALIAS && depth_write_mask)
        copy_rdram_16_single_sample(depth_index);
}

View File

@ -0,0 +1,33 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#extension GL_EXT_samplerless_texture_functions : require
layout(location = 0) out vec4 FragColor;
layout(set = 0, binding = 0) uniform texture2D uImage;
// Fade pass: weight the previous frame's RGB by its stored alpha and clear
// alpha, so a persistent pixel does not propagate more than one frame.
void main()
{
    vec4 prev = texelFetch(uImage, ivec2(gl_FragCoord.xy), 0);
    vec3 weighted_rgb = prev.rgb * prev.a;
    FragColor = vec4(weighted_rgb, 0.0);
}

View File

@ -0,0 +1,60 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef VI_DEBUG_H_
#define VI_DEBUG_H_

// Fragment-shader debug logging helpers. When DEBUG_ENABLE is set, the
// GENERIC_MESSAGE* macros forward the current __LINE__ (plus up to three
// values) to the debug channel, tagged with the fragment coordinate;
// otherwise they compile to nothing.
#if defined(DEBUG_ENABLE) && DEBUG_ENABLE
#include "debug_channel.h"
// Overload: line number only.
void GENERIC_MESSAGE_(int line)
{
    add_debug_message(0, uvec3(gl_FragCoord.xy, 0), line);
}
// Overload: line number plus one value.
void GENERIC_MESSAGE_(int line, uint v)
{
    add_debug_message(0, uvec3(gl_FragCoord.xy, 0), uvec2(line, v));
}
// Overload: line number plus two values.
void GENERIC_MESSAGE_(int line, uvec2 v)
{
    add_debug_message(0, uvec3(gl_FragCoord.xy, 0), uvec3(line, v));
}
// Overload: line number plus three values.
void GENERIC_MESSAGE_(int line, uvec3 v)
{
    add_debug_message(0, uvec3(gl_FragCoord.xy, 0), uvec4(line, v));
}
#define GENERIC_MESSAGE0() GENERIC_MESSAGE_(__LINE__)
#define GENERIC_MESSAGE1(a) GENERIC_MESSAGE_(__LINE__, a)
#define GENERIC_MESSAGE2(a, b) GENERIC_MESSAGE_(__LINE__, uvec2(a, b))
#define GENERIC_MESSAGE3(a, b, c) GENERIC_MESSAGE_(__LINE__, uvec3(a, b, c))
#else
// Debugging disabled: all message macros are no-ops.
#define GENERIC_MESSAGE0()
#define GENERIC_MESSAGE1(a)
#define GENERIC_MESSAGE2(a, b)
#define GENERIC_MESSAGE3(a, b, c)
#endif
#endif

View File

@ -0,0 +1,31 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
layout(location = 0) in vec2 vUV;
layout(set = 0, binding = 0) uniform sampler2D uSampler;
layout(location = 0) out vec4 FragColor;
// Trivial blit: sample the input image at the interpolated UV, base mip only.
void main()
{
    FragColor = textureLod(uSampler, vUV, 0.0);
}

View File

@ -0,0 +1,41 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
layout(location = 0) out vec2 vUV;
layout(push_constant) uniform UBO
{
float y_offset;
} registers;
// Fullscreen-triangle vertex shader: three vertices whose oversized triangle
// covers all of clip space. UV is derived from clip position, shifted
// vertically by the push-constant y_offset.
void main()
{
    vec2 pos;
    if (gl_VertexIndex == 0)
        pos = vec2(-1.0, -1.0);
    else if (gl_VertexIndex == 1)
        pos = vec2(-1.0, 3.0);
    else
        pos = vec2(3.0, -1.0);
    gl_Position = vec4(pos, 0.0, 1.0);
    vUV = vec2(pos.x * 0.5 + 0.5, pos.y * 0.5 + 0.5 + registers.y_offset);
}

View File

@ -0,0 +1,92 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#extension GL_EXT_samplerless_texture_functions : require
#include "vi_debug.h"
layout(location = 0) out uvec4 FragColor;
#if defined(FETCH_BUG) && FETCH_BUG
layout(location = 1) out uvec4 FragColorFetchBug;
#endif
layout(set = 0, binding = 0) uniform mediump utexture2DArray uFetchCache;
// Exchange two values in place.
void swap(inout uint a, inout uint b)
{
    uint tmp = a;
    a = b;
    b = tmp;
}

// Median of three values, branchless: the middle element equals
// max(min(a, b), min(max(a, b), c)).
uint median3(uint left, uint center, uint right)
{
    uint lo = min(left, center);
    uint hi = max(left, center);
    return max(lo, min(hi, right));
}
// VI fetch filter: reconstruct a pixel from a 3-wide horizontal window of
// the fetch cache. If all three taps have full coverage (alpha == 7) the
// center pixel passes through; otherwise each channel is median-filtered
// across the window.
void main()
{
    ivec2 pix = ivec2(gl_FragCoord.xy);
    // Array layer 0 holds the normal fetch results.
    uvec4 left = texelFetch(uFetchCache, ivec3(pix, 0), 0);
    uvec4 mid = texelFetchOffset(uFetchCache, ivec3(pix, 0), 0, ivec2(1, 0));
    uvec4 right = texelFetchOffset(uFetchCache, ivec3(pix, 0), 0, ivec2(2, 0));
    if ((left.a & mid.a & right.a) == 7u)
    {
        FragColor = mid;
    }
    else
    {
        // Median filter. TODO: Optimize with mid3?
        uint r = median3(left.r, mid.r, right.r);
        uint g = median3(left.g, mid.g, right.g);
        uint b = median3(left.b, mid.b, right.b);
        FragColor = uvec4(r, g, b, mid.a);
    }
#if defined(FETCH_BUG) && FETCH_BUG
    // Array layer 1 holds the results computed with the hardware fetch bug
    // emulated; apply the same filter to a second output.
    left = texelFetch(uFetchCache, ivec3(pix, 1), 0);
    mid = texelFetchOffset(uFetchCache, ivec3(pix, 1), 0, ivec2(1, 0));
    right = texelFetchOffset(uFetchCache, ivec3(pix, 1), 0, ivec2(2, 0));
    if ((left.a & mid.a & right.a) == 7u)
    {
        FragColorFetchBug = mid;
    }
    else
    {
        // Median filter. TODO: Optimize with mid3?
        uint r = median3(left.r, mid.r, right.r);
        uint g = median3(left.g, mid.g, right.g);
        uint b = median3(left.b, mid.b, right.b);
        FragColorFetchBug = uvec4(r, g, b, mid.a);
    }
#endif
}

View File

@ -0,0 +1,164 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#extension GL_EXT_samplerless_texture_functions : require
#include "small_types.h"
#include "vi_status.h"
#include "vi_debug.h"
layout(set = 0, binding = 0) uniform mediump utexture2D uAAInput;
layout(location = 0) out uvec4 FragColor;
#if defined(FETCH_BUG) && FETCH_BUG
layout(location = 1) out uvec4 FragColorFetchBug;
#endif
layout(push_constant) uniform Registers
{
ivec2 offset;
} registers;
// Current pixel coordinate, shared between main() and fetch_color_offset().
ivec2 pix;

// Fetch the AA input pixel at a fixed offset from the current pixel.
uvec4 fetch_color_offset(ivec2 offset)
{
    return texelFetch(uAAInput, pix + offset, 0);
}

// Fold one neighbor into running per-channel (min, max, second-min,
// second-max) statistics. Only fully covered neighbors (alpha == 7)
// participate.
void check_neighbor(uvec4 candidate,
                    inout uvec3 lo, inout uvec3 hi,
                    inout uvec3 second_lo, inout uvec3 second_hi)
{
    if (candidate.a == 7u)
    {
        second_lo = min(second_lo, max(candidate.rgb, lo));
        second_hi = max(second_hi, min(candidate.rgb, hi));
        lo = min(candidate.rgb, lo);
        hi = max(candidate.rgb, hi);
    }
}
// VI anti-aliasing / dither filter. Three mutually exclusive paths:
//   1. Partial coverage (alpha != 7): blend the pixel toward the second-min /
//      second-max of its fully covered neighbors, weighted by (7 - coverage).
//   2. Full coverage with DITHER_ENABLE: de-dither by nudging each 5-bit
//      channel toward the 3x3 neighborhood.
//   3. Otherwise: pass through.
// When FETCH_BUG is set, a second output replays each path with the buggy
// neighbor set (lower row replaced by the horizontal neighbors).
void main()
{
    pix = ivec2(gl_FragCoord.xy) + registers.offset;
    uvec4 mid_pixel = fetch_color_offset(ivec2(0));
    // AA-filter. If coverage is not full, we blend current pixel against background.
    uvec3 color;
#if defined(FETCH_BUG) && FETCH_BUG
    uvec3 color_bug;
#endif
    if (mid_pixel.a != 7u)
    {
        uvec3 lo = mid_pixel.rgb;
        uvec3 hi = lo;
        uvec3 second_lo = lo;
        uvec3 second_hi = lo;
        // Somehow, we're supposed to find the second lowest and second highest neighbor.
        uvec4 left_up = fetch_color_offset(ivec2(-1, -1));
        uvec4 right_up = fetch_color_offset(ivec2(+1, -1));
        uvec4 to_left = fetch_color_offset(ivec2(-2, 0));
        uvec4 to_right = fetch_color_offset(ivec2(+2, 0));
        uvec4 left_down = fetch_color_offset(ivec2(-1, +1));
        uvec4 right_down = fetch_color_offset(ivec2(+1, +1));
        check_neighbor(left_up, lo, hi, second_lo, second_hi);
        check_neighbor(right_up, lo, hi, second_lo, second_hi);
        check_neighbor(to_left, lo, hi, second_lo, second_hi);
        check_neighbor(to_right, lo, hi, second_lo, second_hi);
#if defined(FETCH_BUG) && FETCH_BUG
        // In the fetch-bug state, we apparently do not read the lower values.
        // Instead, the lower values are treated as left and right.
        uvec3 lo_bug = lo;
        uvec3 hi_bug = hi;
        uvec3 second_lo_bug = second_lo;
        uvec3 second_hi_bug = second_hi;
#endif
        check_neighbor(left_down, lo, hi, second_lo, second_hi);
        check_neighbor(right_down, lo, hi, second_lo, second_hi);
#if defined(FETCH_BUG) && FETCH_BUG
        check_neighbor(to_left, lo_bug, hi_bug, second_lo_bug, second_hi_bug);
        check_neighbor(to_right, lo_bug, hi_bug, second_lo_bug, second_hi_bug);
        // Where the center pixel itself is the extreme, fall back to the
        // true min/max so the blend target stays in range.
        second_lo = mix(second_lo, lo, equal(mid_pixel.rgb, lo));
        second_hi = mix(second_hi, hi, equal(mid_pixel.rgb, hi));
        second_lo_bug = mix(second_lo_bug, lo_bug, equal(mid_pixel.rgb, lo_bug));
        second_hi_bug = mix(second_hi_bug, hi_bug, equal(mid_pixel.rgb, hi_bug));
#endif
        // Blend toward (second_lo + second_hi) / 2, weighted by inverse
        // coverage in 1/8th steps, with rounding.
        uvec3 offset = second_lo + second_hi - (mid_pixel.rgb << 1u);
        uint coeff = 7u - mid_pixel.a;
        color = mid_pixel.rgb + (((offset * coeff) + 4u) >> 3u);
        color &= 0xffu;
#if defined(FETCH_BUG) && FETCH_BUG
        uvec3 offset_bug = second_lo_bug + second_hi_bug - (mid_pixel.rgb << 1u);
        color_bug = mid_pixel.rgb + (((offset_bug * coeff) + 4u) >> 3u);
        color_bug &= 0xffu;
#endif
    }
    else if (DITHER_ENABLE)
    {
        // Dither filter: compare the 5-bit quantized center against its
        // neighbors and accumulate clamped per-channel deltas.
        ivec3 tmp_color = ivec3(mid_pixel.rgb >> 3u);
        ivec3 tmp_accum = ivec3(0);
        for (int y = -1; y <= 0; y++)
        {
            for (int x = -1; x <= 1; x++)
            {
                ivec3 col = ivec3(fetch_color_offset(ivec2(x, y)).rgb >> 3u);
                tmp_accum += clamp(col - tmp_color, ivec3(-1), ivec3(1));
            }
        }
#if defined(FETCH_BUG) && FETCH_BUG
        ivec3 tmp_accum_bug = tmp_accum;
#endif
        tmp_accum += clamp(ivec3(fetch_color_offset(ivec2(-1, 1)).rgb >> 3u) - tmp_color, ivec3(-1), ivec3(1));
        tmp_accum += clamp(ivec3(fetch_color_offset(ivec2(+1, 1)).rgb >> 3u) - tmp_color, ivec3(-1), ivec3(1));
        tmp_accum += clamp(ivec3(fetch_color_offset(ivec2(0, 1)).rgb >> 3u) - tmp_color, ivec3(-1), ivec3(1));
        color = (mid_pixel.rgb & 0xf8u) + tmp_accum;
#if defined(FETCH_BUG) && FETCH_BUG
        // Fetch-bug variant: the bottom row is replaced by re-reads of the
        // horizontal neighbors.
        tmp_accum_bug += clamp(ivec3(fetch_color_offset(ivec2(-1, 0)).rgb >> 3u) - tmp_color, ivec3(-1), ivec3(1));
        tmp_accum_bug += clamp(ivec3(fetch_color_offset(ivec2(+1, 0)).rgb >> 3u) - tmp_color, ivec3(-1), ivec3(1));
        color_bug = (mid_pixel.rgb & 0xf8u) + tmp_accum_bug;
#endif
    }
    else
    {
        color = mid_pixel.rgb;
#if defined(FETCH_BUG) && FETCH_BUG
        color_bug = mid_pixel.rgb;
#endif
    }
    FragColor = uvec4(color, mid_pixel.a);
#if defined(FETCH_BUG) && FETCH_BUG
    FragColorFetchBug = uvec4(color_bug, mid_pixel.a);
#endif
}

View File

@ -0,0 +1,127 @@
#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#extension GL_EXT_samplerless_texture_functions : require
#include "small_types.h"
#include "vi_status.h"
#include "vi_debug.h"
#include "noise.h"
// Output of the divot stage; array layer 1 is sampled instead of layer 0 for
// the "next line" taps when the fetch-bug path triggers (see bug_offset in main()).
layout(set = 0, binding = 0) uniform mediump utexture2DArray uDivotOutput;
// 1D gamma lookup table, sampled per channel by integer_gamma().
layout(set = 1, binding = 0) uniform mediump utextureBuffer uGammaTable;
layout(location = 0) out vec4 FragColor;
// Per-scanout parameters pushed from the CPU side.
layout(push_constant, std430) uniform Registers
{
// Starting source X/Y position (10.10 fixed point, see main()).
int x_base;
int y_base;
// Offset applied to gl_FragCoord before stepping.
int h_offset;
int v_offset;
// Per-output-pixel X/Y increments (10.10 fixed point).
int x_add;
int y_add;
// Frame counter used to reseed the gamma-dither noise.
int frame_count;
// Interlace (serrate) field selection/compaction controls.
int serrate_shift;
int serrate_mask;
int serrate_select;
} registers;
// Fixed-point linear interpolation from a towards b.
// l is a 5-bit fraction; the +16 rounds to nearest before the >> 5, and the
// result wraps to 8 bits, exactly as in the original expression (unsigned
// wraparound in b - a is intentional and preserved).
uvec3 vi_lerp(uvec3 a, uvec3 b, uint l)
{
    uvec3 delta = b - a;
    uvec3 inc = (delta * l + 16u) >> 5u;
    return (a + inc) & 0xffu;
}
// Look up each channel in the 8-bit gamma table.
// With GAMMA_DITHER, the input is first expanded into the table's dithered
// domain: (color << 6) + full-range dither + 256u — presumably matching the
// layout produced by init_gamma_table() on the CPU side (confirm there).
// Fix: both branches of the original duplicated the identical triple
// texelFetch; only the dither preamble differs, so hoist it.
uvec3 integer_gamma(uvec3 color)
{
	if (GAMMA_DITHER)
		color = (color << 6) + noise_get_full_gamma_dither() + 256u;
	return uvec3(
			texelFetch(uGammaTable, int(color.r)).r,
			texelFetch(uGammaTable, int(color.g)).r,
			texelFetch(uGammaTable, int(color.b)).r);
}
// Spec constant: when true, emulate the VI fetch bug by sampling the
// "next line" taps from array layer 1 of uDivotOutput (see bug_offset below).
layout(constant_id = 2) const bool FETCH_BUG = false;
// Scale pass: maps each output fragment back into the divot-stage image using
// the 10.10 fixed-point stepping from the push constants, optionally bilinear
// filters (SCALE_AA), then applies gamma table lookup and/or gamma dither.
void main()
{
// Move into VI coordinate space.
ivec2 coord = ivec2(gl_FragCoord.xy) + ivec2(registers.h_offset, registers.v_offset);
// Interlacing: discard fragments that belong to the inactive field.
if ((coord.y & registers.serrate_mask) != registers.serrate_select)
discard;
// Compact the surviving lines of the field.
coord.y >>= registers.serrate_shift;
if (GAMMA_DITHER)
reseed_noise(coord.x, coord.y, registers.frame_count);
// Source sample position in 10.10 fixed point; integer texel is the top bits.
int x = coord.x * registers.x_add + registers.x_base;
int y = coord.y * registers.y_add + registers.y_base;
ivec2 base_coord = ivec2(x, y) >> 10;
uvec3 c00 = texelFetch(uDivotOutput, ivec3(base_coord, 0), 0).rgb;
// Array layer used for the (0,1)/(1,1) taps; 1 selects the buggy slice.
int bug_offset = 0;
if (FETCH_BUG)
{
// This is super awkward.
// Basically there seems to be some kind of issue where if we interpolate in Y,
// we're going to get buggy output.
// If we hit this case, the next line we filter against will come from the "buggy" array slice.
// Why this makes sense, I have no idea.
int prev_y = (y - registers.y_add) >> 10;
int next_y = (y + registers.y_add) >> 10;
if (coord.y != 0 && base_coord.y == prev_y && base_coord.y != next_y)
bug_offset = 1;
}
if (SCALE_AA)
{
// Bilinear filter with 5-bit fractional weights (bits [9:5] of the 10.10 position).
int x_frac = (x >> 5) & 31;
int y_frac = (y >> 5) & 31;
uvec3 c10 = texelFetchOffset(uDivotOutput, ivec3(base_coord, 0), 0, ivec2(1, 0)).rgb;
uvec3 c01 = texelFetchOffset(uDivotOutput, ivec3(base_coord, bug_offset), 0, ivec2(0, 1)).rgb;
uvec3 c11 = texelFetchOffset(uDivotOutput, ivec3(base_coord, bug_offset), 0, ivec2(1)).rgb;
// Filter in Y first, then in X.
c00 = vi_lerp(c00, c01, y_frac);
c10 = vi_lerp(c10, c11, y_frac);
c00 = vi_lerp(c00, c10, x_frac);
}
if (GAMMA_ENABLE)
c00 = integer_gamma(c00);
else if (GAMMA_DITHER)
// Without the gamma table, dither is added directly and clamped to 8 bits.
c00 = min(c00 + noise_get_partial_gamma_dither(), uvec3(0xff));
FragColor = vec4(vec3(c00) / 255.0, 1.0);
}

View File

@ -0,0 +1,48 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef VI_STATUS_H_
#define VI_STATUS_H_
// Raw VI control register value, baked in as a specialization constant so
// that all the derived booleans below fold to compile-time constants.
layout(constant_id = 1) const int VI_STATUS = 0;
// Pixel type field in bits [1:0] — these are field values, not flag bits.
const int VI_CONTROL_TYPE_BLANK_BIT = 0 << 0;
const int VI_CONTROL_TYPE_RESERVED_BIT = 1 << 0;
const int VI_CONTROL_TYPE_RGBA5551_BIT = 2 << 0;
const int VI_CONTROL_TYPE_RGBA8888_BIT = 3 << 0;
const int VI_CONTROL_TYPE_MASK = 3 << 0;
// Individual control flag bits.
const int VI_CONTROL_GAMMA_DITHER_ENABLE_BIT = 1 << 2;
const int VI_CONTROL_GAMMA_ENABLE_BIT = 1 << 3;
const int VI_CONTROL_DIVOT_ENABLE_BIT = 1 << 4;
const int VI_CONTROL_SERRATE_BIT = 1 << 6;
const int VI_CONTROL_DITHER_FILTER_ENABLE_BIT = 1 << 16;
// "META" bits — presumably injected by the implementation (not part of the
// hardware register) to select shader paths; confirm against the CPU side.
const int VI_CONTROL_META_AA_BIT = 1 << 17;
const int VI_CONTROL_META_SCALE_BIT = 1 << 18;
// Compile-time booleans derived from VI_STATUS.
const bool FMT_RGBA5551 = (VI_STATUS & VI_CONTROL_TYPE_MASK) == VI_CONTROL_TYPE_RGBA5551_BIT;
const bool FMT_RGBA8888 = (VI_STATUS & VI_CONTROL_TYPE_MASK) == VI_CONTROL_TYPE_RGBA8888_BIT;
const bool DITHER_ENABLE = (VI_STATUS & VI_CONTROL_DITHER_FILTER_ENABLE_BIT) != 0;
const bool FETCH_AA = (VI_STATUS & VI_CONTROL_META_AA_BIT) != 0;
const bool SCALE_AA = (VI_STATUS & VI_CONTROL_META_SCALE_BIT) != 0;
const bool GAMMA_ENABLE = (VI_STATUS & VI_CONTROL_GAMMA_ENABLE_BIT) != 0;
const bool GAMMA_DITHER = (VI_STATUS & VI_CONTROL_GAMMA_DITHER_ENABLE_BIT) != 0;
#endif

View File

@ -0,0 +1,58 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef Z_ENCODE_H_
#define Z_ENCODE_H_
// The Z compression is kind of clever, and uses inverted FP, with more precision close to 1.
// The compressed Z result is 14 bits, and decompresses to 18-bit UNORM.
// Expand a 14-bit compressed depth (3-bit exponent in bits [13:11],
// 11-bit mantissa) to an 18-bit fixed-point value in [0, 0x40000).
int z_decompress(u16 z_)
{
int z = int(z_);
int exponent = z >> 11;
int mantissa = z & 0x7ff;
// Smaller exponents cover coarser ranges: shift the mantissa up and add the
// start offset of this exponent's bucket.
int shift = max(6 - exponent, 0);
int base = 0x40000 - (0x40000 >> exponent);
return (mantissa << shift) + base;
}
// Inverse of z_decompress: pack an 18-bit depth into 14 bits.
u16 z_compress(int z)
{
// The exponent comes from the magnitude of the inverted depth, giving more
// precision as z approaches 0x3ffff (i.e. close to 1.0).
int inv_z = max(0x3ffff - z, 1);
int exponent = 17 - findMSB(inv_z);
exponent = clamp(exponent, 0, 7);
int shift = max(6 - exponent, 0);
int mantissa = (z >> shift) & 0x7ff;
return u16((exponent << 11) + mantissa);
}
// Delta-Z is stored as its log2; restore the power of two.
int dz_decompress(int dz)
{
return 1 << dz;
}
// Keep only the most-significant-bit position (log2) of delta-Z; 0 maps to 0.
int dz_compress(int dz)
{
return max(findMSB(dz), 0);
}
#endif

View File

@ -0,0 +1,158 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <stdint.h>
#include "device.hpp"
#include "rdp_common.hpp"
namespace RDP
{
// Options controlling how a VI frame is scanned out to a Vulkan image.
struct ScanoutOptions
{
unsigned crop_overscan_pixels = 0;
// Number of downscale passes applied after scanout — presumably successive
// halvings performed by downscale_stage(); confirm against implementation.
unsigned downscale_steps = 0;
// Works around certain game bugs. Considered a hack if enabled.
bool persist_frame_on_invalid_input = false;
// To be equivalent to reference behavior where
// pixels persist for an extra frame.
// Not hardware accurate, but needed for weave interlace mode.
bool blend_previous_frame = false;
// Upscale deinterlacing deinterlaces by upscaling in Y, with an Y coordinate offset matching the field.
// If disabled, weave interlacing is used.
// Weave deinterlacing should *not* be used, except to run test suite!
bool upscale_deinterlacing = true;
// Individual VI filter stages that can be toggled off.
struct
{
bool aa = true;
bool scale = true;
bool serrate = true;
bool dither_filter = true;
bool divot_filter = true;
bool gamma_dither = true;
} vi;
};
// A scanout destination buffer together with the fence and dimensions
// associated with the scanout work.
struct VIScanoutBuffer
{
Vulkan::BufferHandle buffer;
// Fence associated with the GPU work writing `buffer`.
Vulkan::Fence fence;
unsigned width = 0;
unsigned height = 0;
};
class Renderer;
// Emulated Video Interface: reads frame data from (hidden) RDRAM and runs the
// fetch / AA / divot / scale stages to produce a presentable Vulkan image.
class VideoInterface : public Vulkan::DebugChannelInterface
{
public:
void set_device(Vulkan::Device *device);
void set_renderer(Renderer *renderer);
void set_vi_register(VIRegister reg, uint32_t value);
// Register the RDRAM backing buffer (plus sub-range) frames are read from.
void set_rdram(const Vulkan::Buffer *rdram, size_t offset, size_t size);
void set_hidden_rdram(const Vulkan::Buffer *hidden_rdram);
int resolve_shader_define(const char *name, const char *define) const;
// Produce the current frame in target_layout; scale_factor upscales output.
Vulkan::ImageHandle scanout(VkImageLayout target_layout, const ScanoutOptions &options = {}, unsigned scale_factor = 1);
// Query the RDRAM range the scanout will read.
void scanout_memory_range(unsigned &offset, unsigned &length) const;
void set_shader_bank(const ShaderBank *bank);
private:
Vulkan::Device *device = nullptr;
Renderer *renderer = nullptr;
uint32_t vi_registers[unsigned(VIRegister::Count)] = {};
const Vulkan::Buffer *rdram = nullptr;
const Vulkan::Buffer *hidden_rdram = nullptr;
// Gamma lookup table (see init_gamma_table()) sampled by the scale shader.
Vulkan::BufferHandle gamma_lut;
Vulkan::BufferViewHandle gamma_lut_view;
const ShaderBank *shader_bank = nullptr;
void init_gamma_table();
bool previous_frame_blank = false;
// Debug-channel filtering state (DebugChannelInterface).
bool debug_channel = false;
int filter_debug_channel_x = -1;
int filter_debug_channel_y = -1;
void message(const std::string &tag, uint32_t code,
uint32_t x, uint32_t y, uint32_t z,
uint32_t num_words, const Vulkan::DebugChannelInterface::Word *words) override;
// Frame state.
uint32_t frame_count = 0;
uint32_t last_valid_frame_count = 0;
Vulkan::ImageHandle prev_scanout_image;
VkImageLayout prev_image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
size_t rdram_offset = 0;
size_t rdram_size = 0;
bool timestamp = false;
// Decoded, derived view of vi_registers (see decode_vi_registers()).
struct Registers
{
int x_start, y_start;
int h_start, v_start;
int h_end, v_end;
int h_res, v_res;
int x_add, y_add;
int v_sync;
int vi_width;
int vi_offset;
int max_x, max_y;
int v_current_line;
bool left_clamp, right_clamp;
bool is_pal;
uint32_t status;
};
Registers decode_vi_registers() const;
// Scanout pipeline stages; parameter names show the chaining:
// vram_fetch -> aa_fetch -> divot -> scale -> downscale / upscale_deinterlace.
Vulkan::ImageHandle vram_fetch_stage(const Registers &registers,
unsigned scaling_factor) const;
Vulkan::ImageHandle aa_fetch_stage(Vulkan::CommandBuffer &cmd,
Vulkan::Image &vram_image,
const Registers &registers,
unsigned scaling_factor) const;
Vulkan::ImageHandle divot_stage(Vulkan::CommandBuffer &cmd,
Vulkan::Image &aa_image,
const Registers &registers,
unsigned scaling_factor) const;
Vulkan::ImageHandle scale_stage(Vulkan::CommandBuffer &cmd,
Vulkan::Image &divot_image,
Registers registers,
unsigned scaling_factor,
bool degenerate,
const ScanoutOptions &options) const;
Vulkan::ImageHandle downscale_stage(Vulkan::CommandBuffer &cmd,
Vulkan::Image &scale_image,
unsigned scaling_factor,
unsigned downscale_factor) const;
Vulkan::ImageHandle upscale_deinterlace(Vulkan::CommandBuffer &cmd,
Vulkan::Image &scale_image,
unsigned scaling_factor, bool field_select) const;
static bool need_fetch_bug_emulation(const Registers &reg, unsigned scaling_factor);
};
}

View File

@ -0,0 +1,122 @@
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <queue>
#include <mutex>
#include <thread>
#include <condition_variable>
#include <utility>
#ifdef PARALLEL_RDP_SHADER_DIR
#include "global_managers.hpp"
#endif
namespace RDP
{
// Single background thread draining a queue of work items of type T.
// The Executor policy supplies:
//   is_sentinel(value)        -> true for the shutdown marker,
//   perform_work(value)       -> run one item (called outside any lock),
//   notify_work_locked(value) -> completion bookkeeping, called with the
//                                "to main" mutex held before waking wait()ers.
template <typename T, typename Executor>
class WorkerThread
{
public:
explicit WorkerThread(
#ifdef PARALLEL_RDP_SHADER_DIR
Granite::Global::GlobalManagersHandle globals,
#endif
Executor exec)
: executor(std::move(exec))
#ifdef PARALLEL_RDP_SHADER_DIR
, handles(std::move(globals))
#endif
{
thr = std::thread(&WorkerThread::main_loop, this);
}
// Shutdown: pushes a default-constructed T, which the Executor must report
// as a sentinel, then joins the worker thread.
~WorkerThread()
{
if (thr.joinable())
{
{
std::lock_guard<std::mutex> holder{to_thread_mutex};
work_queue.push({});
to_thread_cond.notify_one();
}
thr.join();
}
}
// Block the calling thread until cond() is true. cond is evaluated under
// the "to main" mutex, pairing it with Executor::notify_work_locked updates.
template <typename Cond>
void wait(Cond &&cond)
{
std::unique_lock<std::mutex> holder{to_main_mutex};
to_main_cond.wait(holder, std::forward<Cond>(cond));
}
// Enqueue one work item for the worker thread.
void push(T &&t)
{
std::lock_guard<std::mutex> holder{to_thread_mutex};
work_queue.push(std::move(t));
to_thread_cond.notify_one();
}
private:
std::thread thr;
// Queue plus condition variable feeding work to the worker thread.
std::mutex to_thread_mutex;
std::condition_variable to_thread_cond;
// Condition variable signalling completions back to the main thread.
std::mutex to_main_mutex;
std::condition_variable to_main_cond;
std::queue<T> work_queue;
Executor executor;
#ifdef PARALLEL_RDP_SHADER_DIR
// Granite global managers handed over to the worker thread's context.
Granite::Global::GlobalManagersHandle handles;
#endif
void main_loop()
{
#ifdef PARALLEL_RDP_SHADER_DIR
Granite::Global::set_thread_context(*handles);
handles.reset();
#endif
for (;;)
{
T value;
{
std::unique_lock<std::mutex> holder{to_thread_mutex};
to_thread_cond.wait(holder, [this]() { return !work_queue.empty(); });
value = std::move(work_queue.front());
work_queue.pop();
}
if (executor.is_sentinel(value))
break;
// Execute the item without holding any lock.
executor.perform_work(value);
std::lock_guard<std::mutex> holder{to_main_mutex};
executor.notify_work_locked(value);
to_main_cond.notify_one();
}
}
};
}

View File

@ -0,0 +1,82 @@
/* Copyright (c) 2017-2020 Hans-Kristian Arntzen
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "aligned_alloc.hpp"
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
#include <malloc.h>
#endif
namespace Util
{
// Allocate `size` bytes aligned to `boundary` (must be a power of two).
// Returns nullptr on failure. Release with memalign_free().
void *memalign_alloc(size_t boundary, size_t size)
{
#if defined(_WIN32)
	return _aligned_malloc(size, boundary);
#elif defined(_ISOC11_SOURCE)
	// C11 aligned_alloc requires size to be a multiple of the alignment;
	// round up to stay conforming (callers only rely on at least `size` bytes).
	return aligned_alloc(boundary, (size + boundary - 1) & ~(boundary - 1));
#elif (_POSIX_C_SOURCE >= 200112L) || (_XOPEN_SOURCE >= 600)
	void *ptr = nullptr;
	// Fix: posix_memalign returns 0 on success or a *positive* errno value on
	// failure — it never returns a negative value, so the old `< 0` check
	// could not detect errors.
	if (posix_memalign(&ptr, boundary, size) != 0)
		return nullptr;
	return ptr;
#else
	// Align stuff ourselves. Kinda ugly, but will work anywhere.
	// Over-allocate, align the returned address up, and stash the original
	// malloc pointer just below it so memalign_free() can recover it.
	void **place;
	uintptr_t addr = 0;
	void *ptr = malloc(boundary + size + sizeof(uintptr_t));
	if (ptr == nullptr)
		return nullptr;
	addr = ((uintptr_t)ptr + sizeof(uintptr_t) + boundary) & ~(boundary - 1);
	place = (void **) addr;
	place[-1] = ptr;
	return (void *) addr;
#endif
}

// Aligned allocation with the memory zero-initialized.
void *memalign_calloc(size_t boundary, size_t size)
{
	void *ret = memalign_alloc(boundary, size);
	if (ret)
		memset(ret, 0, size);
	return ret;
}

// Release memory obtained from memalign_alloc()/memalign_calloc().
// Safe to call with nullptr.
void memalign_free(void *ptr)
{
#if defined(_WIN32)
	_aligned_free(ptr);
#elif !defined(_ISOC11_SOURCE) && !((_POSIX_C_SOURCE >= 200112L) || (_XOPEN_SOURCE >= 600))
	// Manual-alignment path: the real malloc pointer lives just below `ptr`.
	if (ptr != nullptr)
	{
		void **p = (void **) ptr;
		free(p[-1]);
	}
#else
	free(ptr);
#endif
}
}

View File

@ -0,0 +1,62 @@
/* Copyright (c) 2017-2020 Hans-Kristian Arntzen
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <stddef.h>
#include <stdexcept>
#include <new>
namespace Util
{
void *memalign_alloc(size_t boundary, size_t size);
void *memalign_calloc(size_t boundary, size_t size);
void memalign_free(void *ptr);

// Mixin: deriving from AlignedAllocation<T> routes operator new/delete for T
// through the aligned allocator, so heap instances honor alignof(T).
template <typename T>
struct AlignedAllocation
{
	static void *operator new(size_t size)
	{
		void *mem = ::Util::memalign_alloc(alignof(T), size);
		if (mem == nullptr)
			throw std::bad_alloc();
		return mem;
	}

	static void *operator new[](size_t size)
	{
		void *mem = ::Util::memalign_alloc(alignof(T), size);
		if (mem == nullptr)
			throw std::bad_alloc();
		return mem;
	}

	static void operator delete(void *ptr)
	{
		return ::Util::memalign_free(ptr);
	}

	static void operator delete[](void *ptr)
	{
		return ::Util::memalign_free(ptr);
	}
};
}

View File

@ -0,0 +1,106 @@
/* Copyright (c) 2017-2020 Hans-Kristian Arntzen
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#ifdef _MSC_VER
#include <intrin.h>
#endif
namespace Util
{
// NOTE(review): these helpers use uint32_t but this header does not include
// <stdint.h> itself — presumably consumers include it beforehand; confirm.
#ifdef __GNUC__
// Leading/trailing zero counts; defined to return 32 for x == 0
// (the raw builtins are undefined for 0).
#define leading_zeroes(x) ((x) == 0 ? 32 : __builtin_clz(x))
#define trailing_zeroes(x) ((x) == 0 ? 32 : __builtin_ctz(x))
// Length of the run of set bits starting at bit 0.
#define trailing_ones(x) __builtin_ctz(~uint32_t(x))
#elif defined(_MSC_VER)
namespace Internal
{
static inline uint32_t clz(uint32_t x)
{
unsigned long result;
if (_BitScanReverse(&result, x))
return 31 - result;
else
return 32;
}
static inline uint32_t ctz(uint32_t x)
{
unsigned long result;
if (_BitScanForward(&result, x))
return result;
else
return 32;
}
}
#define leading_zeroes(x) ::Util::Internal::clz(x)
#define trailing_zeroes(x) ::Util::Internal::ctz(x)
#define trailing_ones(x) ::Util::Internal::ctz(~uint32_t(x))
#else
#error "Implement me."
#endif
// Invoke func(bit_index) for every set bit in value, lowest bit first.
template <typename T>
inline void for_each_bit(uint32_t value, const T &func)
{
while (value)
{
uint32_t bit = trailing_zeroes(value);
func(bit);
value &= ~(1u << bit);
}
}
// Invoke func(start_bit, length) for every contiguous run of set bits,
// lowest run first.
template <typename T>
inline void for_each_bit_range(uint32_t value, const T &func)
{
// All-ones is special-cased: the general path below would otherwise need a
// 32-bit shift (1u << 32), which is undefined behavior.
if (value == ~0u)
{
func(0, 32);
return;
}
uint32_t bit_offset = 0;
while (value)
{
// Skip zeros up to the next run; the shift keeps positions relative so
// the next trailing_zeroes also accounts for the run cleared below.
uint32_t bit = trailing_zeroes(value);
bit_offset += bit;
value >>= bit;
uint32_t range = trailing_ones(value);
func(bit_offset, range);
value &= ~((1u << range) - 1);
}
}
// Round v up to the next power of two (v itself if already a power of two).
// Note: next_pow2(0) wraps around and yields 0.
inline uint32_t next_pow2(uint32_t v)
{
v--;
v |= v >> 16;
v |= v >> 8;
v |= v >> 4;
v |= v >> 2;
v |= v >> 1;
return v + 1;
}
}

View File

@ -0,0 +1,34 @@
/* Copyright (c) 2017-2020 Hans-Kristian Arntzen
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <type_traits>
namespace Util
{
// Cast a scoped (or unscoped) enum value to its underlying integer type.
template <typename T>
constexpr auto ecast(T x) -> typename std::underlying_type<T>::type
{
	return static_cast<typename std::underlying_type<T>::type>(x);
}
}

View File

@ -0,0 +1,105 @@
/* Copyright (c) 2017-2020 Hans-Kristian Arntzen
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <stdint.h>
#include <string>
namespace Util
{
using Hash = uint64_t;

// Streaming 64-bit hasher in the FNV-1a family (multiply by 0x100000001b3,
// then XOR in the next word). Feed values via the typed methods and read the
// running hash with get().
class Hasher
{
public:
	Hasher() = default;

	// Resume hashing from a previously obtained hash value.
	explicit Hasher(Hash h_)
		: h(h_)
	{
	}

	inline void u32(uint32_t value)
	{
		h = (h * 0x100000001b3ull) ^ value;
	}

	inline void s32(int32_t value)
	{
		u32(uint32_t(value));
	}

	// Hashes the bit pattern of the float (so -0.0f and +0.0f differ).
	inline void f32(float value)
	{
		union
		{
			float f32;
			uint32_t u32;
		} u;
		u.f32 = value;
		u32(u.u32);
	}

	inline void u64(uint64_t value)
	{
		u32(uint32_t(value & 0xffffffffu));
		u32(uint32_t(value >> 32));
	}

	// Folds `size` bytes in element-sized steps; each element is XORed in
	// directly at its own width.
	template <typename T>
	inline void data(const T *data_, size_t size)
	{
		size_t count = size / sizeof(*data_);
		while (count--)
			h = (h * 0x100000001b3ull) ^ *data_++;
	}

	template <typename T>
	inline void pointer(T *ptr)
	{
		u64(reinterpret_cast<uintptr_t>(ptr));
	}

	// Strings are prefixed with a 0xff marker word; both overloads hash
	// identically for the same character sequence.
	inline void string(const char *str)
	{
		u32(0xff);
		for (const char *p = str; *p != '\0'; p++)
			u32(uint8_t(*p));
	}

	inline void string(const std::string &str)
	{
		u32(0xff);
		for (auto &c : str)
			u32(uint8_t(c));
	}

	inline Hash get() const
	{
		return h;
	}

private:
	// FNV-1a 64-bit offset basis.
	Hash h = 0xcbf29ce484222325ull;
};
}

View File

@ -0,0 +1,296 @@
/* Copyright (c) 2017-2020 Hans-Kristian Arntzen
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <stddef.h>
#include <utility>
#include <memory>
#include <atomic>
#include <type_traits>
namespace Util
{
// Non-atomic reference counter for handles confined to a single thread.
// The count starts at 1: constructing the object is the first reference.
class SingleThreadCounter
{
public:
inline void add_ref()
{
count++;
}
// Returns true when the last reference was dropped.
inline bool release()
{
return --count == 0;
}
private:
size_t count = 1;
};
// Atomic reference counter for handles shared across threads.
class MultiThreadCounter
{
public:
MultiThreadCounter()
{
count.store(1, std::memory_order_relaxed);
}
inline void add_ref()
{
count.fetch_add(1, std::memory_order_relaxed);
}
// Returns true when the last reference was dropped. acq_rel ordering makes
// earlier writes visible to the thread that ends up destroying the object.
inline bool release()
{
auto result = count.fetch_sub(1, std::memory_order_acq_rel);
return result == 1;
}
private:
std::atomic_size_t count;
};
template <typename T>
class IntrusivePtr;
// CRTP base embedding the reference count inside the object itself.
// T is the most-derived type handed to Deleter when the count reaches zero.
template <typename T, typename Deleter = std::default_delete<T>, typename ReferenceOps = SingleThreadCounter>
class IntrusivePtrEnabled
{
public:
using IntrusivePtrType = IntrusivePtr<T>;
using EnabledBase = T;
using EnabledDeleter = Deleter;
using EnabledReferenceOp = ReferenceOps;
// Drop one reference; destroys the object via Deleter on the last one.
void release_reference()
{
if (reference_count.release())
Deleter()(static_cast<T *>(this));
}
void add_reference()
{
reference_count.add_ref();
}
IntrusivePtrEnabled() = default;
// Copying would duplicate the embedded count, so it is forbidden.
IntrusivePtrEnabled(const IntrusivePtrEnabled &) = delete;
void operator=(const IntrusivePtrEnabled &) = delete;
protected:
// Create a new owning handle to this object (bumps the count).
Util::IntrusivePtr<T> reference_from_this();
private:
ReferenceOps reference_count;
};
// Smart pointer over types deriving from IntrusivePtrEnabled.
template <typename T>
class IntrusivePtr
{
public:
template <typename U>
friend class IntrusivePtr;
IntrusivePtr() = default;
// Adopts an existing reference; does NOT bump the count.
explicit IntrusivePtr(T *handle)
: data(handle)
{
}
T &operator*()
{
return *data;
}
const T &operator*() const
{
return *data;
}
T *operator->()
{
return data;
}
const T *operator->() const
{
return data;
}
explicit operator bool() const
{
return data != nullptr;
}
bool operator==(const IntrusivePtr &other) const
{
return data == other.data;
}
bool operator!=(const IntrusivePtr &other) const
{
return data != other.data;
}
T *get()
{
return data;
}
const T *get() const
{
return data;
}
// Drop our reference (possibly destroying the object) and become empty.
void reset()
{
using ReferenceBase = IntrusivePtrEnabled<
typename T::EnabledBase,
typename T::EnabledDeleter,
typename T::EnabledReferenceOp>;
// Static up-cast here to avoid potential issues with multiple intrusive inheritance.
// Also makes sure that the pointer type actually inherits from this type.
if (data)
static_cast<ReferenceBase *>(data)->release_reference();
data = nullptr;
}
// Copy-assign from a handle to a derived type (upcast only, enforced below).
template <typename U>
IntrusivePtr &operator=(const IntrusivePtr<U> &other)
{
static_assert(std::is_base_of<T, U>::value,
"Cannot safely assign downcasted intrusive pointers.");
using ReferenceBase = IntrusivePtrEnabled<
typename T::EnabledBase,
typename T::EnabledDeleter,
typename T::EnabledReferenceOp>;
reset();
data = static_cast<T *>(other.data);
// Static up-cast here to avoid potential issues with multiple intrusive inheritance.
// Also makes sure that the pointer type actually inherits from this type.
if (data)
static_cast<ReferenceBase *>(data)->add_reference();
return *this;
}
IntrusivePtr &operator=(const IntrusivePtr &other)
{
using ReferenceBase = IntrusivePtrEnabled<
typename T::EnabledBase,
typename T::EnabledDeleter,
typename T::EnabledReferenceOp>;
if (this != &other)
{
reset();
data = other.data;
if (data)
static_cast<ReferenceBase *>(data)->add_reference();
}
return *this;
}
template <typename U>
IntrusivePtr(const IntrusivePtr<U> &other)
{
*this = other;
}
IntrusivePtr(const IntrusivePtr &other)
{
*this = other;
}
~IntrusivePtr()
{
reset();
}
// Move-assignment steals the reference; no count adjustment needed.
template <typename U>
IntrusivePtr &operator=(IntrusivePtr<U> &&other) noexcept
{
reset();
data = other.data;
other.data = nullptr;
return *this;
}
IntrusivePtr &operator=(IntrusivePtr &&other) noexcept
{
if (this != &other)
{
reset();
data = other.data;
other.data = nullptr;
}
return *this;
}
template <typename U>
IntrusivePtr(IntrusivePtr<U> &&other) noexcept
{
*this = std::move(other);
}
// NOTE(review): U is not deducible from this parameter, so this overload can
// never be selected by overload resolution; same-type moves go through the
// IntrusivePtr<U>&& constructor above. Presumably a leftover — confirm.
template <typename U>
IntrusivePtr(IntrusivePtr &&other) noexcept
{
*this = std::move(other);
}
private:
T *data = nullptr;
};
template <typename T, typename Deleter, typename ReferenceOps>
IntrusivePtr<T> IntrusivePtrEnabled<T, Deleter, ReferenceOps>::reference_from_this()
{
add_reference();
return IntrusivePtr<T>(static_cast<T *>(this));
}
template <typename Derived>
using DerivedIntrusivePtrType = IntrusivePtr<Derived>;
// make_shared-style helper: allocate T and wrap it in an owning handle.
template <typename T, typename... P>
DerivedIntrusivePtrType<T> make_handle(P &&... p)
{
return DerivedIntrusivePtrType<T>(new T(std::forward<P>(p)...));
}
// Allocate Derived but return it typed as Base's handle type.
template <typename Base, typename Derived, typename... P>
typename Base::IntrusivePtrType make_derived_handle(P &&... p)
{
return typename Base::IntrusivePtrType(new Derived(std::forward<P>(p)...));
}
// Convenience alias: intrusive base with an atomic reference count.
template <typename T>
using ThreadSafeIntrusivePtrEnabled = IntrusivePtrEnabled<T, std::default_delete<T>, MultiThreadCounter>;
}

Some files were not shown because too many files have changed in this diff Show More