nv2a: Add Vulkan renderer

Author: Matt Borgerson, 2024-07-26 17:21:01 -07:00 (committed by mborgerson)
parent e639e0cdb7
commit a5385803db
114 changed files with 23349 additions and 10302 deletions


@ -71,8 +71,8 @@ IndentWidth: 4
AccessModifierOffset: -4
IndentWrappedFunctionNames: false
KeepEmptyLinesAtTheStartOfBlocks: false
-MacroBlockBegin: '.*_BEGIN$' # only PREC_BEGIN ?
-MacroBlockEnd: '.*_END$'
+#MacroBlockBegin: '.*_BEGIN$' # only PREC_BEGIN ?
+#MacroBlockEnd: '.*_END$'
MaxEmptyLinesToKeep: 2
#PenaltyBreakBeforeFirstCallParameter: 19
#PenaltyBreakComment: 300

.gitmodules

@ -82,9 +82,18 @@
[submodule "tomlplusplus"]
path = tomlplusplus
url = https://github.com/marzer/tomlplusplus
[submodule "hw/xbox/nv2a/thirdparty/nv2a_vsh_cpu"]
path = hw/xbox/nv2a/thirdparty/nv2a_vsh_cpu
[submodule "hw/xbox/nv2a/pgraph/thirdparty/nv2a_vsh_cpu"]
path = hw/xbox/nv2a/pgraph/thirdparty/nv2a_vsh_cpu
url = https://github.com/abaire/nv2a_vsh_cpu.git
[submodule "ui/thirdparty/httplib"]
path = ui/thirdparty/httplib
url = https://github.com/yhirose/cpp-httplib
[submodule "hw/xbox/nv2a/pgraph/vk/thirdparty/VulkanMemoryAllocator"]
path = thirdparty/VulkanMemoryAllocator
url = https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator
[submodule "thirdparty/volk"]
path = thirdparty/volk
url = https://github.com/zeux/volk
[submodule "thirdparty/SPIRV-Reflect"]
path = thirdparty/SPIRV-Reflect
url = https://github.com/KhronosGroup/SPIRV-Reflect


@ -130,6 +130,12 @@ input:
default: 18 # w
display:
renderer:
type: enum
values: ["NULL", OPENGL, VULKAN]
default: OPENGL
vulkan:
validation_layers: bool
quality:
surface_scale:
type: integer

configure

@ -237,7 +237,7 @@ else
git_submodules_action="ignore"
fi
git_submodules="ui/keycodemapdb ui/thirdparty/imgui ui/thirdparty/implot ui/thirdparty/httplib util/xxHash tomlplusplus genconfig hw/xbox/nv2a/thirdparty/nv2a_vsh_cpu"
git_submodules="ui/keycodemapdb ui/thirdparty/imgui ui/thirdparty/implot ui/thirdparty/httplib util/xxHash tomlplusplus genconfig hw/xbox/nv2a/pgraph/thirdparty/nv2a_vsh_cpu thirdparty/volk thirdparty/VulkanMemoryAllocator thirdparty/SPIRV-Reflect"
git="git"
# Don't accept a target_list environment variable.

debian/control

@ -16,6 +16,9 @@ Build-Depends: debhelper (>= 11),
libssl-dev,
libpcap-dev,
libslirp-dev,
glslang-dev,
libvulkan-dev,
Standards-Version: 3.9.8
Homepage: https://xemu.app
XS-Debian-Vcs-Browser: https://github.com/mborgerson/xemu


@ -1,8 +1,9 @@
/*
-* QEMU Geforce NV2A debug helpers
+* QEMU Geforce NV2A profiling and debug helpers
*
-* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2012 espes
+* Copyright (c) 2015 Jannik Vogel
+* Copyright (c) 2018-2023 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
@ -18,8 +19,8 @@
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
-#ifndef HW_NV2A_DEBUG_H
-#define HW_NV2A_DEBUG_H
+#ifndef HW_XBOX_NV2A_DEBUG_H
+#define HW_XBOX_NV2A_DEBUG_H
#include <stdint.h>
@ -36,54 +37,6 @@
# define NV2A_DPRINTF(format, ...) do { } while (0)
#endif
-// #define DEBUG_NV2A_GL
-#ifdef DEBUG_NV2A_GL
-#include <stdbool.h>
-#include "gl/gloffscreen.h"
-#include "config-host.h"
-void gl_debug_initialize(void);
-void gl_debug_message(bool cc, const char *fmt, ...);
-void gl_debug_group_begin(const char *fmt, ...);
-void gl_debug_group_end(void);
-void gl_debug_label(GLenum target, GLuint name, const char *fmt, ...);
-void gl_debug_frame_terminator(void);
-# define NV2A_GL_DPRINTF(cc, format, ...) \
-gl_debug_message(cc, "nv2a: " format, ## __VA_ARGS__)
-# define NV2A_GL_DGROUP_BEGIN(format, ...) \
-gl_debug_group_begin("nv2a: " format, ## __VA_ARGS__)
-# define NV2A_GL_DGROUP_END() \
-gl_debug_group_end()
-# define NV2A_GL_DLABEL(target, name, format, ...) \
-gl_debug_label(target, name, "nv2a: { " format " }", ## __VA_ARGS__)
-#define NV2A_GL_DFRAME_TERMINATOR() \
-gl_debug_frame_terminator()
-#ifdef __cplusplus
-extern "C" {
-#endif
-#ifdef CONFIG_RENDERDOC
-bool nv2a_dbg_renderdoc_available(void);
-void nv2a_dbg_renderdoc_capture_frames(uint32_t num_frames);
-#endif
-#ifdef __cplusplus
-}
-#endif
-#else
-# define NV2A_GL_DPRINTF(cc, format, ...) do { \
-if (cc) NV2A_DPRINTF(format "\n", ##__VA_ARGS__ ); \
-} while (0)
-# define NV2A_GL_DGROUP_BEGIN(format, ...) do { } while (0)
-# define NV2A_GL_DGROUP_END() do { } while (0)
-# define NV2A_GL_DLABEL(target, name, format, ...) do { } while (0)
-# define NV2A_GL_DFRAME_TERMINATOR() do { } while (0)
-#endif
/* Debug prints to identify when unimplemented or unconfirmed features
* are being exercised. These cases likely result in graphical problems of
* varying degree, but should otherwise not crash the system. Enable this
@ -111,6 +64,22 @@ void nv2a_dbg_renderdoc_capture_frames(uint32_t num_frames);
#endif
#define NV2A_PROF_COUNTERS_XMAC \
_X(NV2A_PROF_FINISH_VERTEX_BUFFER_DIRTY) \
_X(NV2A_PROF_FINISH_SURFACE_CREATE) \
_X(NV2A_PROF_FINISH_SURFACE_DOWN) \
_X(NV2A_PROF_FINISH_NEED_BUFFER_SPACE) \
_X(NV2A_PROF_FINISH_FRAMEBUFFER_DIRTY) \
_X(NV2A_PROF_FINISH_PRESENTING) \
_X(NV2A_PROF_FINISH_FLIP_STALL) \
_X(NV2A_PROF_FINISH_FLUSH) \
_X(NV2A_PROF_CLEAR) \
_X(NV2A_PROF_QUEUE_SUBMIT) \
_X(NV2A_PROF_QUEUE_SUBMIT_AUX) \
_X(NV2A_PROF_PIPELINE_NOTDIRTY) \
_X(NV2A_PROF_PIPELINE_GEN) \
_X(NV2A_PROF_PIPELINE_BIND) \
_X(NV2A_PROF_PIPELINE_MERGE) \
_X(NV2A_PROF_PIPELINE_RENDERPASSES) \
_X(NV2A_PROF_BEGIN_ENDS) \
_X(NV2A_PROF_DRAW_ARRAYS) \
_X(NV2A_PROF_INLINE_BUFFERS) \
@ -120,18 +89,26 @@ void nv2a_dbg_renderdoc_capture_frames(uint32_t num_frames);
_X(NV2A_PROF_SHADER_GEN) \
_X(NV2A_PROF_SHADER_BIND) \
_X(NV2A_PROF_SHADER_BIND_NOTDIRTY) \
_X(NV2A_PROF_SHADER_UBO_DIRTY) \
_X(NV2A_PROF_SHADER_UBO_NOTDIRTY) \
_X(NV2A_PROF_ATTR_BIND) \
_X(NV2A_PROF_TEX_UPLOAD) \
_X(NV2A_PROF_TEX_BIND) \
_X(NV2A_PROF_GEOM_BUFFER_UPDATE_1) \
_X(NV2A_PROF_GEOM_BUFFER_UPDATE_2) \
_X(NV2A_PROF_GEOM_BUFFER_UPDATE_3) \
_X(NV2A_PROF_GEOM_BUFFER_UPDATE_4) \
_X(NV2A_PROF_GEOM_BUFFER_UPDATE_4_NOTDIRTY) \
_X(NV2A_PROF_SURF_SWIZZLE) \
_X(NV2A_PROF_SURF_CREATE) \
_X(NV2A_PROF_SURF_DOWNLOAD) \
_X(NV2A_PROF_SURF_UPLOAD) \
_X(NV2A_PROF_SURF_TO_TEX) \
_X(NV2A_PROF_SURF_TO_TEX_FALLBACK) \
_X(NV2A_PROF_QUEUE_SUBMIT_1) \
_X(NV2A_PROF_QUEUE_SUBMIT_2) \
_X(NV2A_PROF_QUEUE_SUBMIT_3) \
_X(NV2A_PROF_QUEUE_SUBMIT_4) \
_X(NV2A_PROF_QUEUE_SUBMIT_5) \
enum NV2A_PROF_COUNTERS_ENUM {
#define _X(x) x,
@ -161,6 +138,21 @@ extern NV2AStats g_nv2a_stats;
const char *nv2a_profile_get_counter_name(unsigned int cnt);
int nv2a_profile_get_counter_value(unsigned int cnt);
void nv2a_profile_increment(void);
void nv2a_profile_flip_stall(void);
static inline void nv2a_profile_inc_counter(enum NV2A_PROF_COUNTERS_ENUM cnt)
{
g_nv2a_stats.frame_working.counters[cnt] += 1;
}
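/* Usage sketch (illustrative, not part of this commit): backends record an
 * event by bumping the matching counter at the call site, e.g.
 *
 *     nv2a_profile_inc_counter(NV2A_PROF_QUEUE_SUBMIT);
 *
 * immediately before handing the work to the driver; the _X list above
 * generates both the enum and the printable counter names. */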
#ifdef CONFIG_RENDERDOC
void nv2a_dbg_renderdoc_init(void);
void *nv2a_dbg_renderdoc_get_api(void);
bool nv2a_dbg_renderdoc_available(void);
void nv2a_dbg_renderdoc_capture_frames(int num_frames);
extern int renderdoc_capture_frames;
#endif
#ifdef __cplusplus
}


@ -1,6 +0,0 @@
softmmu_ss.add([sdl, files(
'gloffscreen_common.c',
'gloffscreen_sdl.c',
)])
# gloffscreen_sdl.o-cflags := $(SDL_CFLAGS)


@ -1,27 +1,17 @@
specific_ss.add(files(
'nv2a.c',
'debug.c',
'pbus.c',
'pcrtc.c',
'pfb.c',
'pfifo.c',
'pgraph.c',
'pmc.c',
'pramdac.c',
'prmcio.c',
'prmdio.c',
'prmvio.c',
'psh.c',
'ptimer.c',
'pvideo.c',
'shaders.c',
'stubs.c',
'user.c',
'vsh.c',
'swizzle.c',
's3tc.c',
))
subdir('gl')
subdir('thirdparty')
specific_ss.add(nv2a_vsh_cpu)
subdir('pgraph')


@ -172,6 +172,16 @@ static void nv2a_get_offsets(VGACommonState *s,
*pline_compare = line_compare;
}
const uint8_t *nv2a_get_dac_palette(void)
{
return g_nv2a->puserdac.palette;
}
int nv2a_get_screen_off(void)
{
return g_nv2a->vga.sr[VGA_SEQ_CLOCK_MODE] & VGA_SR01_SCREEN_OFF;
}
static void nv2a_vga_gfx_update(void *opaque)
{
VGACommonState *vga = opaque;
@ -277,7 +287,7 @@ static void nv2a_reset(NV2AState *d)
}
memset(d->pfifo.regs, 0, sizeof(d->pfifo.regs));
-memset(d->pgraph.regs, 0, sizeof(d->pgraph.regs));
+memset(d->pgraph.regs_, 0, sizeof(d->pgraph.regs_));
memset(d->pvideo.regs, 0, sizeof(d->pvideo.regs));
d->pcrtc.start = 0;
@ -365,11 +375,10 @@ static void nv2a_vm_state_change(void *opaque, bool running, RunState state)
if (state == RUN_STATE_SAVE_VM) {
nv2a_lock_fifo(d);
qatomic_set(&d->pfifo.halt, true);
-qatomic_set(&d->pgraph.download_dirty_surfaces_pending, true);
-qemu_event_reset(&d->pgraph.dirty_surfaces_download_complete);
+d->pgraph.renderer->ops.pre_savevm_trigger(d);
nv2a_unlock_fifo(d);
qemu_mutex_unlock_iothread();
-qemu_event_wait(&d->pgraph.dirty_surfaces_download_complete);
+d->pgraph.renderer->ops.pre_savevm_wait(d);
qemu_mutex_lock_iothread();
nv2a_lock_fifo(d);
} else if (state == RUN_STATE_RESTORE_VM) {
@ -382,11 +391,10 @@ static void nv2a_vm_state_change(void *opaque, bool running, RunState state)
nv2a_unlock_fifo(d);
} else if (state == RUN_STATE_SHUTDOWN) {
nv2a_lock_fifo(d);
-qatomic_set(&d->pgraph.shader_cache_writeback_pending, true);
-qemu_event_reset(&d->pgraph.shader_cache_writeback_complete);
+d->pgraph.renderer->ops.pre_shutdown_trigger(d);
nv2a_unlock_fifo(d);
qemu_mutex_unlock_iothread();
-qemu_event_wait(&d->pgraph.shader_cache_writeback_complete);
+d->pgraph.renderer->ops.pre_shutdown_wait(d);
qemu_mutex_lock_iothread();
}
}
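/* Editor's sketch (assumed shape; the real definition lives in the new
 * pgraph headers, which this diff excerpt does not show): the ops table
 * that keeps nv2a.c backend-agnostic across the OpenGL and Vulkan renderers:
 *
 *     typedef struct PGRAPHRendererOps {
 *         void (*pre_savevm_trigger)(NV2AState *d);
 *         void (*pre_savevm_wait)(NV2AState *d);
 *         void (*pre_shutdown_trigger)(NV2AState *d);
 *         void (*pre_shutdown_wait)(NV2AState *d);
 *         void (*process_pending)(NV2AState *d);
 *         void (*process_pending_reports)(NV2AState *d);
 *         ...
 *     } PGRAPHRendererOps;
 *
 * The trigger/wait split preserves the old behavior: trigger posts the
 * request while the FIFO is locked, wait blocks outside the iothread lock. */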
@ -515,9 +523,9 @@ static const VMStateDescription vmstate_nv2a = {
VMSTATE_UINT32(pgraph.inline_buffer_length, NV2AState), // fixme
VMSTATE_UINT32(pgraph.draw_arrays_length, NV2AState),
VMSTATE_UINT32(pgraph.draw_arrays_max_count, NV2AState),
-VMSTATE_INT32_ARRAY(pgraph.gl_draw_arrays_start, NV2AState, 1250),
-VMSTATE_INT32_ARRAY(pgraph.gl_draw_arrays_count, NV2AState, 1250),
-VMSTATE_UINT32_ARRAY(pgraph.regs, NV2AState, 0x2000),
+VMSTATE_INT32_ARRAY(pgraph.draw_arrays_start, NV2AState, 1250),
+VMSTATE_INT32_ARRAY(pgraph.draw_arrays_count, NV2AState, 1250),
+VMSTATE_UINT32_ARRAY(pgraph.regs_, NV2AState, 0x2000),
VMSTATE_UINT32(pmc.pending_interrupts, NV2AState),
VMSTATE_UINT32(pmc.enabled_interrupts, NV2AState),
VMSTATE_UINT32(pfifo.pending_interrupts, NV2AState),


@ -22,7 +22,7 @@
#define HW_NV2A_H
void nv2a_init(PCIBus *bus, int devfn, MemoryRegion *ram);
-void nv2a_gl_context_init(void);
+void nv2a_context_init(void);
int nv2a_get_framebuffer_surface(void);
void nv2a_set_surface_scale_factor(unsigned int scale);
unsigned int nv2a_get_surface_scale_factor(void);


@ -44,25 +44,12 @@
#include "cpu.h"
#include "trace.h"
#include "swizzle.h"
#include "lru.h"
#include "gl/gloffscreen.h"
#include "nv2a.h"
#include "pgraph/pgraph.h"
#include "debug.h"
#include "shaders.h"
#include "nv2a_regs.h"
#define GET_MASK(v, mask) (((v) & (mask)) >> ctz32(mask))
#define SET_MASK(v, mask, val) \
({ \
const unsigned int __val = (val); \
const unsigned int __mask = (mask); \
(v) &= ~(__mask); \
(v) |= ((__val) << ctz32(__mask)) & (__mask); \
})
#define NV2A_DEVICE(obj) OBJECT_CHECK(NV2AState, (obj), "nv2a")
enum FIFOEngine {
@ -78,347 +65,6 @@ typedef struct DMAObject {
hwaddr limit;
} DMAObject;
typedef struct VertexAttribute {
bool dma_select;
hwaddr offset;
/* inline arrays are packed in order?
* Need to pass the offset to converted attributes */
unsigned int inline_array_offset;
float inline_value[4];
unsigned int format;
unsigned int size; /* size of the data type */
unsigned int count; /* number of components */
uint32_t stride;
bool needs_conversion;
float *inline_buffer;
bool inline_buffer_populated;
GLint gl_count;
GLenum gl_type;
GLboolean gl_normalize;
GLuint gl_inline_buffer;
} VertexAttribute;
typedef struct SurfaceFormatInfo {
unsigned int bytes_per_pixel;
GLint gl_internal_format;
GLenum gl_format;
GLenum gl_type;
GLenum gl_attachment;
} SurfaceFormatInfo;
typedef struct Surface {
bool draw_dirty;
bool buffer_dirty;
bool write_enabled_cache;
unsigned int pitch;
hwaddr offset;
} Surface;
typedef struct SurfaceShape {
unsigned int z_format;
unsigned int color_format;
unsigned int zeta_format;
unsigned int log_width, log_height;
unsigned int clip_x, clip_y;
unsigned int clip_width, clip_height;
unsigned int anti_aliasing;
} SurfaceShape;
typedef struct SurfaceBinding {
QTAILQ_ENTRY(SurfaceBinding) entry;
MemAccessCallback *access_cb;
hwaddr vram_addr;
SurfaceFormatInfo fmt;
SurfaceShape shape;
uintptr_t dma_addr;
uintptr_t dma_len;
bool color;
bool swizzle;
unsigned int width;
unsigned int height;
unsigned int pitch;
size_t size;
GLuint gl_buffer;
bool cleared;
int frame_time;
int draw_time;
bool draw_dirty;
bool download_pending;
bool upload_pending;
} SurfaceBinding;
typedef struct TextureShape {
bool cubemap;
unsigned int dimensionality;
unsigned int color_format;
unsigned int levels;
unsigned int width, height, depth;
bool border;
unsigned int min_mipmap_level, max_mipmap_level;
unsigned int pitch;
} TextureShape;
typedef struct TextureBinding {
GLenum gl_target;
GLuint gl_texture;
unsigned int refcnt;
int draw_time;
uint64_t data_hash;
unsigned int scale;
unsigned int min_filter;
unsigned int mag_filter;
unsigned int addru;
unsigned int addrv;
unsigned int addrp;
uint32_t border_color;
bool border_color_set;
} TextureBinding;
typedef struct TextureKey {
TextureShape state;
hwaddr texture_vram_offset;
hwaddr texture_length;
hwaddr palette_vram_offset;
hwaddr palette_length;
} TextureKey;
typedef struct TextureLruNode {
LruNode node;
TextureKey key;
TextureBinding *binding;
bool possibly_dirty;
} TextureLruNode;
typedef struct VertexKey {
size_t count;
GLuint gl_type;
GLboolean gl_normalize;
size_t stride;
hwaddr addr;
} VertexKey;
typedef struct VertexLruNode {
LruNode node;
VertexKey key;
GLuint gl_buffer;
bool initialized;
} VertexLruNode;
typedef struct KelvinState {
hwaddr object_instance;
} KelvinState;
typedef struct ContextSurfaces2DState {
hwaddr object_instance;
hwaddr dma_image_source;
hwaddr dma_image_dest;
unsigned int color_format;
unsigned int source_pitch, dest_pitch;
hwaddr source_offset, dest_offset;
} ContextSurfaces2DState;
typedef struct ImageBlitState {
hwaddr object_instance;
hwaddr context_surfaces;
unsigned int operation;
unsigned int in_x, in_y;
unsigned int out_x, out_y;
unsigned int width, height;
} ImageBlitState;
typedef struct BetaState {
hwaddr object_instance;
uint32_t beta;
} BetaState;
typedef struct QueryReport {
QSIMPLEQ_ENTRY(QueryReport) entry;
bool clear;
uint32_t parameter;
unsigned int query_count;
GLuint *queries;
} QueryReport;
typedef struct PGRAPHState {
QemuMutex lock;
uint32_t pending_interrupts;
uint32_t enabled_interrupts;
int frame_time;
int draw_time;
struct s2t_rndr {
GLuint fbo, vao, vbo, prog;
GLuint tex_loc, surface_size_loc;
} s2t_rndr;
struct disp_rndr {
GLuint fbo, vao, vbo, prog;
GLuint display_size_loc;
GLuint line_offset_loc;
GLuint tex_loc;
GLuint pvideo_tex;
GLint pvideo_enable_loc;
GLint pvideo_tex_loc;
GLint pvideo_in_pos_loc;
GLint pvideo_pos_loc;
GLint pvideo_scale_loc;
GLint pvideo_color_key_enable_loc;
GLint pvideo_color_key_loc;
GLint palette_loc[256];
} disp_rndr;
/* subchannels state we're not sure the location of... */
ContextSurfaces2DState context_surfaces_2d;
ImageBlitState image_blit;
KelvinState kelvin;
BetaState beta;
hwaddr dma_color, dma_zeta;
Surface surface_color, surface_zeta;
unsigned int surface_type;
SurfaceShape surface_shape;
SurfaceShape last_surface_shape;
QTAILQ_HEAD(, SurfaceBinding) surfaces;
SurfaceBinding *color_binding, *zeta_binding;
struct {
int clip_x;
int clip_width;
int clip_y;
int clip_height;
int width;
int height;
} surface_binding_dim; // FIXME: Refactor
hwaddr dma_a, dma_b;
Lru texture_cache;
TextureLruNode *texture_cache_entries;
bool texture_dirty[NV2A_MAX_TEXTURES];
TextureBinding *texture_binding[NV2A_MAX_TEXTURES];
Lru shader_cache;
ShaderLruNode *shader_cache_entries;
ShaderBinding *shader_binding;
QemuMutex shader_cache_lock;
QemuThread shader_disk_thread;
bool texture_matrix_enable[NV2A_MAX_TEXTURES];
GLuint gl_framebuffer;
GLuint gl_display_buffer;
GLint gl_display_buffer_internal_format;
GLsizei gl_display_buffer_width;
GLsizei gl_display_buffer_height;
GLenum gl_display_buffer_format;
GLenum gl_display_buffer_type;
hwaddr dma_state;
hwaddr dma_notifies;
hwaddr dma_semaphore;
hwaddr dma_report;
hwaddr report_offset;
bool zpass_pixel_count_enable;
unsigned int zpass_pixel_count_result;
unsigned int gl_zpass_pixel_count_query_count;
GLuint *gl_zpass_pixel_count_queries;
QSIMPLEQ_HEAD(, QueryReport) report_queue;
hwaddr dma_vertex_a, dma_vertex_b;
uint32_t primitive_mode;
bool enable_vertex_program_write;
uint32_t vertex_state_shader_v0[4];
uint32_t program_data[NV2A_MAX_TRANSFORM_PROGRAM_LENGTH][VSH_TOKEN_SIZE];
bool program_data_dirty;
uint32_t vsh_constants[NV2A_VERTEXSHADER_CONSTANTS][4];
bool vsh_constants_dirty[NV2A_VERTEXSHADER_CONSTANTS];
/* lighting constant arrays */
uint32_t ltctxa[NV2A_LTCTXA_COUNT][4];
bool ltctxa_dirty[NV2A_LTCTXA_COUNT];
uint32_t ltctxb[NV2A_LTCTXB_COUNT][4];
bool ltctxb_dirty[NV2A_LTCTXB_COUNT];
uint32_t ltc1[NV2A_LTC1_COUNT][4];
bool ltc1_dirty[NV2A_LTC1_COUNT];
float material_alpha;
// should figure out where these are in lighting context
float light_infinite_half_vector[NV2A_MAX_LIGHTS][3];
float light_infinite_direction[NV2A_MAX_LIGHTS][3];
float light_local_position[NV2A_MAX_LIGHTS][3];
float light_local_attenuation[NV2A_MAX_LIGHTS][3];
float point_params[8];
VertexAttribute vertex_attributes[NV2A_VERTEXSHADER_ATTRIBUTES];
uint16_t compressed_attrs;
Lru element_cache;
VertexLruNode *element_cache_entries;
unsigned int inline_array_length;
uint32_t inline_array[NV2A_MAX_BATCH_LENGTH];
GLuint gl_inline_array_buffer;
unsigned int inline_elements_length;
uint32_t inline_elements[NV2A_MAX_BATCH_LENGTH];
unsigned int inline_buffer_length;
unsigned int draw_arrays_length;
unsigned int draw_arrays_min_start;
unsigned int draw_arrays_max_count;
/* FIXME: Unknown size, possibly endless, 1250 will do for now */
/* Keep in sync with size used in nv2a.c */
GLint gl_draw_arrays_start[1250];
GLsizei gl_draw_arrays_count[1250];
bool draw_arrays_prevent_connect;
GLuint gl_memory_buffer;
GLuint gl_vertex_array;
uint32_t regs[0x2000];
bool clearing;
bool waiting_for_nop;
bool waiting_for_flip;
bool waiting_for_context_switch;
bool downloads_pending;
bool download_dirty_surfaces_pending;
bool flush_pending;
bool gl_sync_pending;
bool shader_cache_writeback_pending;
QemuEvent downloads_complete;
QemuEvent dirty_surfaces_download_complete;
QemuEvent flush_complete;
QemuEvent gl_sync_complete;
QemuEvent shader_cache_writeback_complete;
unsigned int surface_scale_factor;
uint8_t *scale_buf;
} PGRAPHState;
typedef struct NV2AState {
/*< private >*/
PCIDevice parent_obj;
@ -512,9 +158,6 @@ typedef struct NV2ABlockInfo {
} NV2ABlockInfo;
extern const NV2ABlockInfo blocktable[NV_NUM_BLOCKS];
extern GloContext *g_nv2a_context_render;
extern GloContext *g_nv2a_context_display;
void nv2a_update_irq(NV2AState *d);
static inline
@ -566,20 +209,5 @@ DEFINE_PROTO(user)
DMAObject nv_dma_load(NV2AState *d, hwaddr dma_obj_address);
void *nv_dma_map(NV2AState *d, hwaddr dma_obj_address, hwaddr *len);
void pgraph_init(NV2AState *d);
void pgraph_destroy(PGRAPHState *pg);
void pgraph_context_switch(NV2AState *d, unsigned int channel_id);
int pgraph_method(NV2AState *d, unsigned int subchannel, unsigned int method,
uint32_t parameter, uint32_t *parameters,
size_t num_words_available, size_t max_lookahead_words,
bool inc);
void pgraph_gl_sync(NV2AState *d);
void pgraph_process_pending_reports(NV2AState *d);
void pgraph_process_pending_downloads(NV2AState *d);
void pgraph_download_dirty_surfaces(NV2AState *d);
void pgraph_flush(NV2AState *d);
void *pfifo_thread(void *arg);
void pfifo_kick(NV2AState *d);
#endif


@ -21,6 +21,17 @@
#ifndef HW_NV2A_REGS_H
#define HW_NV2A_REGS_H
#define GET_MASK(v, mask) (((v) & (mask)) >> ctz32(mask))
#define SET_MASK(v, mask, val) \
({ \
const unsigned int __val = (val); \
const unsigned int __mask = (mask); \
(v) &= ~(__mask); \
(v) |= ((__val) << ctz32(__mask)) & (__mask); \
})
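/* Worked example (illustrative): for a 4-bit field at bits 8..11,
 *
 *     uint32_t reg = 0x00000A00;
 *     GET_MASK(reg, 0x00000F00);       // yields 0xA
 *     SET_MASK(reg, 0x00000F00, 0x3);  // reg becomes 0x00000300
 *
 * ctz32(mask) supplies the shift amount, so field values never need to be
 * pre-shifted by callers. */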
#define NV_NUM_BLOCKS 21
#define NV_PMC 0 /* card master control */
#define NV_PBUS 1 /* bus control */


@ -95,23 +95,25 @@ void pfifo_kick(NV2AState *d)
qemu_cond_broadcast(&d->pfifo.fifo_cond);
}
-static bool pgraph_can_fifo_access(NV2AState *d) {
-return qatomic_read(&d->pgraph.regs[NV_PGRAPH_FIFO]) & NV_PGRAPH_FIFO_ACCESS;
+static bool can_fifo_access(NV2AState *d) {
+return qatomic_read(&d->pgraph.regs_[NV_PGRAPH_FIFO]) &
+NV_PGRAPH_FIFO_ACCESS;
}
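/* Editor's sketch (assumed, consistent with the call sites in this diff;
 * the real accessor lives in the pgraph headers, which are not shown):
 *
 *     static inline uint32_t pgraph_reg_r(PGRAPHState *pg, unsigned int r)
 *     {
 *         return pg->regs_[r];
 *     }
 *
 * Renaming regs to regs_ makes any leftover direct access fail to compile,
 * forcing every read and write through the accessors. */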
/* If NV097_FLIP_STALL was executed, check if the flip has completed.
* This will usually happen in the VSYNC interrupt handler.
*/
-static bool pgraph_is_flip_stall_complete(NV2AState *d)
+static bool is_flip_stall_complete(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
NV2A_DPRINTF("flip stall read: %d, write: %d, modulo: %d\n",
GET_MASK(pg->regs[NV_PGRAPH_SURFACE], NV_PGRAPH_SURFACE_READ_3D),
GET_MASK(pg->regs[NV_PGRAPH_SURFACE], NV_PGRAPH_SURFACE_WRITE_3D),
GET_MASK(pg->regs[NV_PGRAPH_SURFACE], NV_PGRAPH_SURFACE_MODULO_3D));
uint32_t s = pgraph_reg_r(pg, NV_PGRAPH_SURFACE);
NV2A_DPRINTF("flip stall read: %d, write: %d, modulo: %d\n",
GET_MASK(s, NV_PGRAPH_SURFACE_READ_3D),
GET_MASK(s, NV_PGRAPH_SURFACE_WRITE_3D),
GET_MASK(s, NV_PGRAPH_SURFACE_MODULO_3D));
uint32_t s = pg->regs[NV_PGRAPH_SURFACE];
if (GET_MASK(s, NV_PGRAPH_SURFACE_READ_3D)
!= GET_MASK(s, NV_PGRAPH_SURFACE_WRITE_3D)) {
return true;
@ -126,7 +128,7 @@ static bool pfifo_stall_for_flip(NV2AState *d)
if (qatomic_read(&d->pgraph.waiting_for_flip)) {
qemu_mutex_lock(&d->pgraph.lock);
-if (!pgraph_is_flip_stall_complete(d)) {
+if (!is_flip_stall_complete(d)) {
should_stall = true;
} else {
d->pgraph.waiting_for_flip = false;
@ -141,7 +143,7 @@ static bool pfifo_puller_should_stall(NV2AState *d)
{
return pfifo_stall_for_flip(d) || qatomic_read(&d->pgraph.waiting_for_nop) ||
qatomic_read(&d->pgraph.waiting_for_context_switch) ||
-!pgraph_can_fifo_access(d);
+!can_fifo_access(d);
}
static ssize_t pfifo_run_puller(NV2AState *d, uint32_t method_entry,
@ -187,7 +189,7 @@ static ssize_t pfifo_run_puller(NV2AState *d, uint32_t method_entry,
qemu_mutex_lock(&d->pgraph.lock);
// Switch contexts if necessary
-if (pgraph_can_fifo_access(d)) {
+if (can_fifo_access(d)) {
pgraph_context_switch(d, entry.channel_id);
if (!d->pgraph.waiting_for_context_switch) {
num_proc =
@ -221,7 +223,7 @@ static ssize_t pfifo_run_puller(NV2AState *d, uint32_t method_entry,
qemu_mutex_unlock(&d->pfifo.lock);
qemu_mutex_lock(&d->pgraph.lock);
-if (pgraph_can_fifo_access(d)) {
+if (can_fifo_access(d)) {
num_proc =
pgraph_method(d, subchannel, method, parameter, parameters,
num_words_available, max_lookahead_words, inc);
@ -242,7 +244,7 @@ static ssize_t pfifo_run_puller(NV2AState *d, uint32_t method_entry,
static bool pfifo_pusher_should_stall(NV2AState *d)
{
-return !pgraph_can_fifo_access(d) ||
+return !can_fifo_access(d) ||
qatomic_read(&d->pgraph.waiting_for_nop);
}
@ -447,39 +449,11 @@ static void pfifo_run_pusher(NV2AState *d)
}
}
static void process_requests(NV2AState *d)
{
if (qatomic_read(&d->pgraph.downloads_pending) ||
qatomic_read(&d->pgraph.download_dirty_surfaces_pending) ||
qatomic_read(&d->pgraph.gl_sync_pending) ||
qatomic_read(&d->pgraph.flush_pending) ||
qatomic_read(&d->pgraph.shader_cache_writeback_pending)) {
qemu_mutex_unlock(&d->pfifo.lock);
qemu_mutex_lock(&d->pgraph.lock);
if (qatomic_read(&d->pgraph.downloads_pending)) {
pgraph_process_pending_downloads(d);
}
if (qatomic_read(&d->pgraph.download_dirty_surfaces_pending)) {
pgraph_download_dirty_surfaces(d);
}
if (qatomic_read(&d->pgraph.gl_sync_pending)) {
pgraph_gl_sync(d);
}
if (qatomic_read(&d->pgraph.flush_pending)) {
pgraph_flush(d);
}
if (qatomic_read(&d->pgraph.shader_cache_writeback_pending)) {
shader_write_cache_reload_list(&d->pgraph);
}
qemu_mutex_unlock(&d->pgraph.lock);
qemu_mutex_lock(&d->pfifo.lock);
}
}
void *pfifo_thread(void *arg)
{
NV2AState *d = (NV2AState *)arg;
-glo_set_current(g_nv2a_context_render);
+pgraph_init_thread(d);
rcu_register_thread();
@ -487,13 +461,13 @@ void *pfifo_thread(void *arg)
while (true) {
d->pfifo.fifo_kick = false;
-process_requests(d);
+d->pgraph.renderer->ops.process_pending(d);
if (!d->pfifo.halt) {
pfifo_run_pusher(d);
}
-pgraph_process_pending_reports(d);
+d->pgraph.renderer->ops.process_pending_reports(d);
if (!d->pfifo.fifo_kick) {
qemu_cond_broadcast(&d->pfifo.fifo_idle_cond);

File diff suppressed because it is too large.


@ -0,0 +1,84 @@
/*
* Geforce NV2A PGRAPH Renderdoc Helpers
*
* Copyright (c) 2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "qemu/osdep.h"
#include <stdint.h>
#include <stdbool.h>
#pragma GCC diagnostic ignored "-Wstrict-prototypes"
#include "thirdparty/renderdoc_app.h"
#include "hw/xbox/nv2a/debug.h"
#ifdef _WIN32
#include <libloaderapi.h>
#else
#include <dlfcn.h>
#endif
static RENDERDOC_API_1_6_0 *rdoc_api = NULL;
int renderdoc_capture_frames = 0;
void nv2a_dbg_renderdoc_init(void)
{
if (rdoc_api) {
return;
}
#ifdef _WIN32
HMODULE renderdoc = GetModuleHandleA("renderdoc.dll");
if (renderdoc) {
pRENDERDOC_GetAPI RENDERDOC_GetAPI =
(pRENDERDOC_GetAPI)GetProcAddress(renderdoc, "RENDERDOC_GetAPI");
#else
void *renderdoc = dlopen(
#ifdef __APPLE__
"librenderdoc.dylib",
#else
"librenderdoc.so",
#endif
RTLD_LAZY);
if (renderdoc) {
pRENDERDOC_GetAPI RENDERDOC_GetAPI =
(pRENDERDOC_GetAPI)dlsym(renderdoc, "RENDERDOC_GetAPI");
#endif // _WIN32
int ret =
RENDERDOC_GetAPI(eRENDERDOC_API_Version_1_6_0, (void **)&rdoc_api);
assert(ret == 1 && "Failed to retrieve RenderDoc API.");
} else {
/* Note: dlerror() is POSIX-only; guard it so the Win32 branch
 * (GetModuleHandleA above) still compiles. */
#ifdef _WIN32
fprintf(stderr, "Error: Failed to open renderdoc library\n");
#else
fprintf(stderr, "Error: Failed to open renderdoc library: %s\n", dlerror());
#endif
}
}
void *nv2a_dbg_renderdoc_get_api(void)
{
return (void*)rdoc_api;
}
bool nv2a_dbg_renderdoc_available(void)
{
return rdoc_api != NULL;
}
void nv2a_dbg_renderdoc_capture_frames(int num_frames)
{
renderdoc_capture_frames += num_frames;
}
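/* Usage sketch (illustrative): a debugger front end can request a capture
 * with
 *
 *     if (nv2a_dbg_renderdoc_available()) {
 *         nv2a_dbg_renderdoc_capture_frames(1);
 *     }
 *
 * and the renderer starts/ends the capture at its frame terminator. */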


@ -0,0 +1,174 @@
/*
* Geforce NV2A PGRAPH OpenGL Renderer
*
* Copyright (c) 2012 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2018-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "hw/xbox/nv2a/nv2a_int.h"
#include "renderer.h"
// TODO: Optimize. Ideally this should all be done via OpenGL.
void pgraph_gl_image_blit(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
ContextSurfaces2DState *context_surfaces = &pg->context_surfaces_2d;
ImageBlitState *image_blit = &pg->image_blit;
BetaState *beta = &pg->beta;
pgraph_gl_surface_update(d, false, true, true);
assert(context_surfaces->object_instance == image_blit->context_surfaces);
unsigned int bytes_per_pixel;
switch (context_surfaces->color_format) {
case NV062_SET_COLOR_FORMAT_LE_Y8:
bytes_per_pixel = 1;
break;
case NV062_SET_COLOR_FORMAT_LE_R5G6B5:
bytes_per_pixel = 2;
break;
case NV062_SET_COLOR_FORMAT_LE_A8R8G8B8:
case NV062_SET_COLOR_FORMAT_LE_X8R8G8B8:
case NV062_SET_COLOR_FORMAT_LE_X8R8G8B8_Z8R8G8B8:
case NV062_SET_COLOR_FORMAT_LE_Y32:
bytes_per_pixel = 4;
break;
default:
fprintf(stderr, "Unknown blit surface format: 0x%x\n",
context_surfaces->color_format);
assert(false);
break;
}
hwaddr source_dma_len, dest_dma_len;
uint8_t *source = (uint8_t *)nv_dma_map(
d, context_surfaces->dma_image_source, &source_dma_len);
assert(context_surfaces->source_offset < source_dma_len);
source += context_surfaces->source_offset;
uint8_t *dest = (uint8_t *)nv_dma_map(d, context_surfaces->dma_image_dest,
&dest_dma_len);
assert(context_surfaces->dest_offset < dest_dma_len);
dest += context_surfaces->dest_offset;
hwaddr source_addr = source - d->vram_ptr;
hwaddr dest_addr = dest - d->vram_ptr;
SurfaceBinding *surf_src = pgraph_gl_surface_get(d, source_addr);
if (surf_src) {
pgraph_gl_surface_download_if_dirty(d, surf_src);
}
SurfaceBinding *surf_dest = pgraph_gl_surface_get(d, dest_addr);
if (surf_dest) {
if (image_blit->height < surf_dest->height ||
image_blit->width < surf_dest->width) {
pgraph_gl_surface_download_if_dirty(d, surf_dest);
} else {
// The blit will completely replace the surface so any pending
// download should be discarded.
surf_dest->download_pending = false;
surf_dest->draw_dirty = false;
}
surf_dest->upload_pending = true;
pg->draw_time++;
}
hwaddr source_offset = image_blit->in_y * context_surfaces->source_pitch +
image_blit->in_x * bytes_per_pixel;
hwaddr dest_offset = image_blit->out_y * context_surfaces->dest_pitch +
image_blit->out_x * bytes_per_pixel;
hwaddr source_size =
(image_blit->height - 1) * context_surfaces->source_pitch +
image_blit->width * bytes_per_pixel;
hwaddr dest_size = (image_blit->height - 1) * context_surfaces->dest_pitch +
image_blit->width * bytes_per_pixel;
/* FIXME: What does hardware do in this case? */
assert(source_addr + source_offset + source_size <=
memory_region_size(d->vram));
assert(dest_addr + dest_offset + dest_size <= memory_region_size(d->vram));
uint8_t *source_row = source + source_offset;
uint8_t *dest_row = dest + dest_offset;
if (image_blit->operation == NV09F_SET_OPERATION_SRCCOPY) {
// NV2A_GL_DPRINTF(false, "NV09F_SET_OPERATION_SRCCOPY");
for (unsigned int y = 0; y < image_blit->height; y++) {
memmove(dest_row, source_row, image_blit->width * bytes_per_pixel);
source_row += context_surfaces->source_pitch;
dest_row += context_surfaces->dest_pitch;
}
} else if (image_blit->operation == NV09F_SET_OPERATION_BLEND_AND) {
// NV2A_GL_DPRINTF(false, "NV09F_SET_OPERATION_BLEND_AND");
uint32_t max_beta_mult = 0x7f80;
uint32_t beta_mult = beta->beta >> 16;
uint32_t inv_beta_mult = max_beta_mult - beta_mult;
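/* Fixed-point lerp: each channel becomes
 * (src * beta_mult + dest * inv_beta_mult) / max_beta_mult,
 * with 0x7f80 playing the role of beta == 1.0. */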
for (unsigned int y = 0; y < image_blit->height; y++) {
for (unsigned int x = 0; x < image_blit->width; x++) {
for (unsigned int ch = 0; ch < 3; ch++) {
uint32_t a = source_row[x * 4 + ch] * beta_mult;
uint32_t b = dest_row[x * 4 + ch] * inv_beta_mult;
dest_row[x * 4 + ch] = (a + b) / max_beta_mult;
}
}
source_row += context_surfaces->source_pitch;
dest_row += context_surfaces->dest_pitch;
}
} else {
fprintf(stderr, "Unknown blit operation: 0x%x\n",
image_blit->operation);
assert(false && "Unknown blit operation");
}
NV2A_DPRINTF(" - 0x%tx -> 0x%tx\n", source_addr, dest_addr);
bool needs_alpha_patching;
uint8_t alpha_override;
switch (context_surfaces->color_format) {
case NV062_SET_COLOR_FORMAT_LE_X8R8G8B8:
needs_alpha_patching = true;
alpha_override = 0xff;
break;
case NV062_SET_COLOR_FORMAT_LE_X8R8G8B8_Z8R8G8B8:
needs_alpha_patching = true;
alpha_override = 0;
break;
default:
needs_alpha_patching = false;
alpha_override = 0;
}
if (needs_alpha_patching) {
dest_row = dest + dest_offset;
for (unsigned int y = 0; y < image_blit->height; y++) {
for (unsigned int x = 0; x < image_blit->width; x++) {
dest_row[x * 4 + 3] = alpha_override;
}
dest_row += context_surfaces->dest_pitch;
}
}
dest_addr += dest_offset;
memory_region_set_client_dirty(d->vram, dest_addr, dest_size,
DIRTY_MEMORY_VGA);
memory_region_set_client_dirty(d->vram, dest_addr, dest_size,
DIRTY_MEMORY_NV2A_TEX);
}


@ -0,0 +1,322 @@
/*
* Geforce NV2A PGRAPH OpenGL Renderer
*
* Copyright (c) 2012 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2018-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HW_XBOX_NV2A_PGRAPH_GL_CONSTANTS_H
#define HW_XBOX_NV2A_PGRAPH_GL_CONSTANTS_H
#include "qemu/osdep.h"
#include "hw/xbox/nv2a/nv2a_regs.h"
#include "gloffscreen.h"
static const GLenum pgraph_texture_min_filter_gl_map[] = {
0,
GL_NEAREST,
GL_LINEAR,
GL_NEAREST_MIPMAP_NEAREST,
GL_LINEAR_MIPMAP_NEAREST,
GL_NEAREST_MIPMAP_LINEAR,
GL_LINEAR_MIPMAP_LINEAR,
GL_LINEAR,
};
static const GLenum pgraph_texture_mag_filter_gl_map[] = {
0,
GL_NEAREST,
GL_LINEAR,
0,
GL_LINEAR /* TODO: Convolution filter... */
};
static const GLenum pgraph_texture_addr_gl_map[] = {
0,
GL_REPEAT,
GL_MIRRORED_REPEAT,
GL_CLAMP_TO_EDGE,
GL_CLAMP_TO_BORDER,
GL_CLAMP_TO_EDGE, /* Approximate GL_CLAMP */
};
static const GLenum pgraph_blend_factor_gl_map[] = {
GL_ZERO,
GL_ONE,
GL_SRC_COLOR,
GL_ONE_MINUS_SRC_COLOR,
GL_SRC_ALPHA,
GL_ONE_MINUS_SRC_ALPHA,
GL_DST_ALPHA,
GL_ONE_MINUS_DST_ALPHA,
GL_DST_COLOR,
GL_ONE_MINUS_DST_COLOR,
GL_SRC_ALPHA_SATURATE,
0,
GL_CONSTANT_COLOR,
GL_ONE_MINUS_CONSTANT_COLOR,
GL_CONSTANT_ALPHA,
GL_ONE_MINUS_CONSTANT_ALPHA,
};
static const GLenum pgraph_blend_equation_gl_map[] = {
GL_FUNC_SUBTRACT,
GL_FUNC_REVERSE_SUBTRACT,
GL_FUNC_ADD,
GL_MIN,
GL_MAX,
GL_FUNC_REVERSE_SUBTRACT,
GL_FUNC_ADD,
};
/* FIXME
static const GLenum pgraph_blend_logicop_map[] = {
GL_CLEAR,
GL_AND,
GL_AND_REVERSE,
GL_COPY,
GL_AND_INVERTED,
GL_NOOP,
GL_XOR,
GL_OR,
GL_NOR,
GL_EQUIV,
GL_INVERT,
GL_OR_REVERSE,
GL_COPY_INVERTED,
GL_OR_INVERTED,
GL_NAND,
GL_SET,
};
*/
static const GLenum pgraph_cull_face_gl_map[] = {
0,
GL_FRONT,
GL_BACK,
GL_FRONT_AND_BACK
};
static const GLenum pgraph_depth_func_gl_map[] = {
GL_NEVER,
GL_LESS,
GL_EQUAL,
GL_LEQUAL,
GL_GREATER,
GL_NOTEQUAL,
GL_GEQUAL,
GL_ALWAYS,
};
static const GLenum pgraph_stencil_func_gl_map[] = {
GL_NEVER,
GL_LESS,
GL_EQUAL,
GL_LEQUAL,
GL_GREATER,
GL_NOTEQUAL,
GL_GEQUAL,
GL_ALWAYS,
};
static const GLenum pgraph_stencil_op_gl_map[] = {
0,
GL_KEEP,
GL_ZERO,
GL_REPLACE,
GL_INCR,
GL_DECR,
GL_INVERT,
GL_INCR_WRAP,
GL_DECR_WRAP,
};
typedef struct ColorFormatInfo {
unsigned int bytes_per_pixel;
bool linear;
GLint gl_internal_format;
GLenum gl_format;
GLenum gl_type;
GLenum gl_swizzle_mask[4];
bool depth;
} ColorFormatInfo;
static const ColorFormatInfo kelvin_color_format_gl_map[66] = {
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_Y8] =
{1, false, GL_R8, GL_RED, GL_UNSIGNED_BYTE,
{GL_RED, GL_RED, GL_RED, GL_ONE}},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_AY8] =
{1, false, GL_R8, GL_RED, GL_UNSIGNED_BYTE,
{GL_RED, GL_RED, GL_RED, GL_RED}},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_A1R5G5B5] =
{2, false, GL_RGB5_A1, GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5_REV},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_X1R5G5B5] =
{2, false, GL_RGB5, GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5_REV},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_A4R4G4B4] =
{2, false, GL_RGBA4, GL_BGRA, GL_UNSIGNED_SHORT_4_4_4_4_REV},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_R5G6B5] =
{2, false, GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_A8R8G8B8] =
{4, false, GL_RGBA8, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_X8R8G8B8] =
{4, false, GL_RGB8, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV},
/* paletted texture */
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_I8_A8R8G8B8] =
{1, false, GL_RGBA8, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV},
[NV097_SET_TEXTURE_FORMAT_COLOR_L_DXT1_A1R5G5B5] =
{4, false, GL_COMPRESSED_RGBA_S3TC_DXT1_EXT, 0, GL_RGBA},
[NV097_SET_TEXTURE_FORMAT_COLOR_L_DXT23_A8R8G8B8] =
{4, false, GL_COMPRESSED_RGBA_S3TC_DXT3_EXT, 0, GL_RGBA},
[NV097_SET_TEXTURE_FORMAT_COLOR_L_DXT45_A8R8G8B8] =
{4, false, GL_COMPRESSED_RGBA_S3TC_DXT5_EXT, 0, GL_RGBA},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_A1R5G5B5] =
{2, true, GL_RGB5_A1, GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5_REV},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_R5G6B5] =
{2, true, GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_A8R8G8B8] =
{4, true, GL_RGBA8, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_Y8] =
{1, true, GL_R8, GL_RED, GL_UNSIGNED_BYTE,
{GL_RED, GL_RED, GL_RED, GL_ONE}},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_G8B8] =
{2, true, GL_RG8, GL_RG, GL_UNSIGNED_BYTE,
{GL_RED, GL_GREEN, GL_RED, GL_GREEN}},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_A8] =
{1, false, GL_R8, GL_RED, GL_UNSIGNED_BYTE,
{GL_ONE, GL_ONE, GL_ONE, GL_RED}},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_A8Y8] =
{2, false, GL_RG8, GL_RG, GL_UNSIGNED_BYTE,
{GL_RED, GL_RED, GL_RED, GL_GREEN}},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_AY8] =
{1, true, GL_R8, GL_RED, GL_UNSIGNED_BYTE,
{GL_RED, GL_RED, GL_RED, GL_RED}},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_X1R5G5B5] =
{2, true, GL_RGB5, GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5_REV},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_A4R4G4B4] =
{2, true, GL_RGBA4, GL_BGRA, GL_UNSIGNED_SHORT_4_4_4_4_REV},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_X8R8G8B8] =
{4, true, GL_RGB8, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_A8] =
{1, true, GL_R8, GL_RED, GL_UNSIGNED_BYTE,
{GL_ONE, GL_ONE, GL_ONE, GL_RED}},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_A8Y8] =
{2, true, GL_RG8, GL_RG, GL_UNSIGNED_BYTE,
{GL_RED, GL_RED, GL_RED, GL_GREEN}},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_R6G5B5] =
{2, false, GL_RGB8_SNORM, GL_RGB, GL_BYTE}, /* FIXME: This might be signed */
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_G8B8] =
{2, false, GL_RG8, GL_RG, GL_UNSIGNED_BYTE,
{GL_RED, GL_GREEN, GL_RED, GL_GREEN}},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_R8B8] =
{2, false, GL_RG8, GL_RG, GL_UNSIGNED_BYTE,
{GL_GREEN, GL_RED, GL_RED, GL_GREEN}},
[NV097_SET_TEXTURE_FORMAT_COLOR_LC_IMAGE_CR8YB8CB8YA8] =
{2, true, GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV},
[NV097_SET_TEXTURE_FORMAT_COLOR_LC_IMAGE_YB8CR8YA8CB8] =
{2, true, GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV},
/* Additional information is passed to the pixel shader via the swizzle:
* RED: The depth value.
* GREEN: 0 for 16-bit, 1 for 24-bit
* BLUE: 0 for fixed, 1 for float
*/
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_DEPTH_Y16_FIXED] =
{2, false, GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT,
{GL_RED, GL_ZERO, GL_ZERO, GL_ZERO}, true},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_DEPTH_X8_Y24_FIXED] =
{4, true, GL_DEPTH_COMPONENT, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8,
{GL_RED, GL_ONE, GL_ZERO, GL_ZERO}, true},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_DEPTH_X8_Y24_FLOAT] =
/* FIXME: Uses fixed-point format to match surface format hack below. */
{4, true, GL_DEPTH_COMPONENT, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8,
{GL_RED, GL_ONE, GL_ZERO, GL_ZERO}, true},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_DEPTH_Y16_FIXED] =
{2, true, GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT,
{GL_RED, GL_ZERO, GL_ZERO, GL_ZERO}, true},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_DEPTH_Y16_FLOAT] =
{2, true, GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_HALF_FLOAT,
{GL_RED, GL_ZERO, GL_ONE, GL_ZERO}, true},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_Y16] =
{2, true, GL_R16, GL_RED, GL_UNSIGNED_SHORT,
{GL_RED, GL_RED, GL_RED, GL_ONE}},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_A8B8G8R8] =
{4, false, GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_B8G8R8A8] =
{4, false, GL_RGBA8, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_R8G8B8A8] =
{4, false, GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_A8B8G8R8] =
{4, true, GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_B8G8R8A8] =
{4, true, GL_RGBA8, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_R8G8B8A8] =
{4, true, GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8}
};
typedef struct SurfaceFormatInfo {
unsigned int bytes_per_pixel;
GLint gl_internal_format;
GLenum gl_format;
GLenum gl_type;
GLenum gl_attachment;
} SurfaceFormatInfo;
static const SurfaceFormatInfo kelvin_surface_color_format_gl_map[] = {
[NV097_SET_SURFACE_FORMAT_COLOR_LE_X1R5G5B5_Z1R5G5B5] =
{2, GL_RGB5_A1, GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5_REV, GL_COLOR_ATTACHMENT0},
[NV097_SET_SURFACE_FORMAT_COLOR_LE_R5G6B5] =
{2, GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5, GL_COLOR_ATTACHMENT0},
[NV097_SET_SURFACE_FORMAT_COLOR_LE_X8R8G8B8_Z8R8G8B8] =
{4, GL_RGBA8, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, GL_COLOR_ATTACHMENT0},
[NV097_SET_SURFACE_FORMAT_COLOR_LE_A8R8G8B8] =
{4, GL_RGBA8, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, GL_COLOR_ATTACHMENT0},
// FIXME: Map channel color
[NV097_SET_SURFACE_FORMAT_COLOR_LE_B8] =
{1, GL_R8, GL_RED, GL_UNSIGNED_BYTE, GL_COLOR_ATTACHMENT0},
[NV097_SET_SURFACE_FORMAT_COLOR_LE_G8B8] =
{2, GL_RG8, GL_RG, GL_UNSIGNED_SHORT, GL_COLOR_ATTACHMENT0},
};
static const SurfaceFormatInfo kelvin_surface_zeta_float_format_gl_map[] = {
[NV097_SET_SURFACE_FORMAT_ZETA_Z16] =
{2, GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_HALF_FLOAT, GL_DEPTH_ATTACHMENT},
[NV097_SET_SURFACE_FORMAT_ZETA_Z24S8] =
/* FIXME: GL does not support packing floating-point Z24S8 OOTB, so for
* now just emulate this with fixed-point Z24S8. Possible compat
* improvement with custom conversion.
*/
{4, GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, GL_DEPTH_STENCIL_ATTACHMENT},
};
static const SurfaceFormatInfo kelvin_surface_zeta_fixed_format_gl_map[] = {
[NV097_SET_SURFACE_FORMAT_ZETA_Z16] =
{2, GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT, GL_DEPTH_ATTACHMENT},
[NV097_SET_SURFACE_FORMAT_ZETA_Z24S8] =
{4, GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, GL_DEPTH_STENCIL_ATTACHMENT},
};
#endif


@ -1,5 +1,5 @@
/*
-* QEMU Geforce NV2A debug helpers
+* Geforce NV2A PGRAPH OpenGL Renderer
*
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2012 espes
@ -18,6 +18,7 @@
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "renderer.h"
#include "debug.h"
#ifdef DEBUG_NV2A_GL
@ -28,15 +29,8 @@
#include <assert.h>
#ifdef CONFIG_RENDERDOC
#pragma GCC diagnostic ignored "-Wstrict-prototypes"
#include "thirdparty/renderdoc_app.h"
-#ifdef _WIN32
-#include <libloaderapi.h>
-#else
-#include <dlfcn.h>
-#endif
-static RENDERDOC_API_1_1_2 *rdoc_api = NULL;
-static int32_t renderdoc_capture_frames = 0;
#endif
#define CHECK_GL_ERROR() do { \
@ -74,31 +68,7 @@ void gl_debug_initialize(void)
}
#ifdef CONFIG_RENDERDOC
-const char *renderdoc_lib;
-void* renderdoc;
-#ifdef __APPLE__
-renderdoc_lib = "librenderdoc.dylib";
-#elif _WIN32
-renderdoc_lib = "renderdoc.dll";
-#else
-renderdoc_lib = "librenderdoc.so";
-#endif
-#ifdef _WIN32
-renderdoc = GetModuleHandleA(renderdoc_lib);
-if (renderdoc) {
-pRENDERDOC_GetAPI RENDERDOC_GetAPI = (pRENDERDOC_GetAPI)GetProcAddress(
-renderdoc, "RENDERDOC_GetAPI");
-#else
-renderdoc = dlopen(renderdoc_lib, RTLD_NOW | RTLD_NOLOAD);
-if (renderdoc) {
-pRENDERDOC_GetAPI RENDERDOC_GetAPI = (pRENDERDOC_GetAPI)dlsym(
-renderdoc, "RENDERDOC_GetAPI");
-#endif
-int ret = RENDERDOC_GetAPI(eRENDERDOC_API_Version_1_1_2,
-(void **)&rdoc_api);
-assert(ret == 1 && "Failed to retrieve RenderDoc API.");
-}
+nv2a_dbg_renderdoc_init();
#endif
}
@ -179,7 +149,10 @@ void gl_debug_frame_terminator(void)
CHECK_GL_ERROR();
#ifdef CONFIG_RENDERDOC
-if (rdoc_api) {
+if (nv2a_dbg_renderdoc_available()) {
+RENDERDOC_API_1_6_0 *rdoc_api = nv2a_dbg_renderdoc_get_api();
if (rdoc_api->IsTargetControlConnected()) {
if (rdoc_api->IsFrameCapturing()) {
rdoc_api->EndFrameCapture(NULL, NULL);
@ -190,7 +163,7 @@ void gl_debug_frame_terminator(void)
error);
}
}
-if (renderdoc_capture_frames) {
+if (renderdoc_capture_frames > 0) {
rdoc_api->StartFrameCapture(NULL, NULL);
GLenum error = glGetError();
if (error != GL_NO_ERROR) {
@ -203,22 +176,10 @@ void gl_debug_frame_terminator(void)
}
}
#endif
-if (!has_GL_GREMEDY_frame_terminator) {
-return;
+if (has_GL_GREMEDY_frame_terminator) {
+glFrameTerminatorGREMEDY();
+CHECK_GL_ERROR();
}
-glFrameTerminatorGREMEDY();
-CHECK_GL_ERROR();
}
-#ifdef CONFIG_RENDERDOC
-bool nv2a_dbg_renderdoc_available(void) {
-return rdoc_api != NULL;
-}
-void nv2a_dbg_renderdoc_capture_frames(uint32_t num_frames) {
-renderdoc_capture_frames = num_frames;
-}
-#endif
#endif // DEBUG_NV2A_GL


@ -0,0 +1,60 @@
/*
* Geforce NV2A PGRAPH OpenGL Renderer
*
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2012 espes
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HW_XBOX_NV2A_PGRAPH_GL_DEBUG_H
#define HW_XBOX_NV2A_PGRAPH_GL_DEBUG_H
// #define DEBUG_NV2A_GL
#ifdef DEBUG_NV2A_GL
#include <stdbool.h>
#include "gloffscreen.h"
#include "config-host.h"
void gl_debug_initialize(void);
void gl_debug_message(bool cc, const char *fmt, ...);
void gl_debug_group_begin(const char *fmt, ...);
void gl_debug_group_end(void);
void gl_debug_label(GLenum target, GLuint name, const char *fmt, ...);
void gl_debug_frame_terminator(void);
# define NV2A_GL_DPRINTF(cc, format, ...) \
gl_debug_message(cc, "nv2a: " format, ## __VA_ARGS__)
# define NV2A_GL_DGROUP_BEGIN(format, ...) \
gl_debug_group_begin("nv2a: " format, ## __VA_ARGS__)
# define NV2A_GL_DGROUP_END() \
gl_debug_group_end()
# define NV2A_GL_DLABEL(target, name, format, ...) \
gl_debug_label(target, name, "nv2a: { " format " }", ## __VA_ARGS__)
#define NV2A_GL_DFRAME_TERMINATOR() \
gl_debug_frame_terminator()
#else
# define NV2A_GL_DPRINTF(cc, format, ...) do { \
if (cc) NV2A_DPRINTF(format "\n", ##__VA_ARGS__ ); \
} while (0)
# define NV2A_GL_DGROUP_BEGIN(format, ...) do { } while (0)
# define NV2A_GL_DGROUP_END() do { } while (0)
# define NV2A_GL_DLABEL(target, name, format, ...) do { } while (0)
# define NV2A_GL_DFRAME_TERMINATOR() do { } while (0)
#endif
#endif


@ -0,0 +1,407 @@
/*
* Geforce NV2A PGRAPH OpenGL Renderer
*
* Copyright (c) 2012 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2018-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "hw/xbox/nv2a/nv2a_int.h"
#include "hw/xbox/nv2a/pgraph/util.h"
#include "renderer.h"
#include <math.h>
void pgraph_gl_init_display_renderer(NV2AState *d)
{
struct PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
glGenTextures(1, &r->gl_display_buffer);
r->gl_display_buffer_internal_format = 0;
r->gl_display_buffer_width = 0;
r->gl_display_buffer_height = 0;
r->gl_display_buffer_format = 0;
r->gl_display_buffer_type = 0;
const char *vs =
"#version 330\n"
"void main()\n"
"{\n"
" float x = -1.0 + float((gl_VertexID & 1) << 2);\n"
" float y = -1.0 + float((gl_VertexID & 2) << 1);\n"
" gl_Position = vec4(x, y, 0, 1);\n"
"}\n";
/* FIXME: improve interlace handling, pvideo */
const char *fs =
"#version 330\n"
"uniform sampler2D tex;\n"
"uniform bool pvideo_enable;\n"
"uniform sampler2D pvideo_tex;\n"
"uniform vec2 pvideo_in_pos;\n"
"uniform vec4 pvideo_pos;\n"
"uniform vec3 pvideo_scale;\n"
"uniform bool pvideo_color_key_enable;\n"
"uniform vec4 pvideo_color_key;\n"
"uniform vec2 display_size;\n"
"uniform float line_offset;\n"
"layout(location = 0) out vec4 out_Color;\n"
"void main()\n"
"{\n"
" vec2 texCoord = gl_FragCoord.xy/display_size;\n"
" float rel = display_size.y/textureSize(tex, 0).y/line_offset;\n"
" texCoord.y = 1 + rel*(texCoord.y - 1);"
" out_Color.rgba = texture(tex, texCoord);\n"
" if (pvideo_enable) {\n"
" vec2 screenCoord = gl_FragCoord.xy - 0.5;\n"
" vec4 output_region = vec4(pvideo_pos.xy, pvideo_pos.xy + pvideo_pos.zw);\n"
" bvec4 clip = bvec4(lessThan(screenCoord, output_region.xy),\n"
" greaterThan(screenCoord, output_region.zw));\n"
" if (!any(clip) && (!pvideo_color_key_enable || out_Color.rgba == pvideo_color_key)) {\n"
" vec2 out_xy = (screenCoord - pvideo_pos.xy) * pvideo_scale.z;\n"
" vec2 in_st = (pvideo_in_pos + out_xy * pvideo_scale.xy) / textureSize(pvideo_tex, 0);\n"
" in_st.y *= -1.0;\n"
" out_Color.rgba = texture(pvideo_tex, in_st);\n"
" }\n"
" }\n"
"}\n";
r->disp_rndr.prog = pgraph_gl_compile_shader(vs, fs);
r->disp_rndr.tex_loc = glGetUniformLocation(r->disp_rndr.prog, "tex");
r->disp_rndr.pvideo_enable_loc = glGetUniformLocation(r->disp_rndr.prog, "pvideo_enable");
r->disp_rndr.pvideo_tex_loc = glGetUniformLocation(r->disp_rndr.prog, "pvideo_tex");
r->disp_rndr.pvideo_in_pos_loc = glGetUniformLocation(r->disp_rndr.prog, "pvideo_in_pos");
r->disp_rndr.pvideo_pos_loc = glGetUniformLocation(r->disp_rndr.prog, "pvideo_pos");
r->disp_rndr.pvideo_scale_loc = glGetUniformLocation(r->disp_rndr.prog, "pvideo_scale");
r->disp_rndr.pvideo_color_key_enable_loc = glGetUniformLocation(r->disp_rndr.prog, "pvideo_color_key_enable");
r->disp_rndr.pvideo_color_key_loc = glGetUniformLocation(r->disp_rndr.prog, "pvideo_color_key");
r->disp_rndr.display_size_loc = glGetUniformLocation(r->disp_rndr.prog, "display_size");
r->disp_rndr.line_offset_loc = glGetUniformLocation(r->disp_rndr.prog, "line_offset");
glGenVertexArrays(1, &r->disp_rndr.vao);
glBindVertexArray(r->disp_rndr.vao);
glGenBuffers(1, &r->disp_rndr.vbo);
glBindBuffer(GL_ARRAY_BUFFER, r->disp_rndr.vbo);
glBufferData(GL_ARRAY_BUFFER, 0, NULL, GL_STATIC_DRAW);
glGenFramebuffers(1, &r->disp_rndr.fbo);
glGenTextures(1, &r->disp_rndr.pvideo_tex);
assert(glGetError() == GL_NO_ERROR);
}
static uint8_t *convert_texture_data__CR8YB8CB8YA8(const uint8_t *data,
unsigned int width,
unsigned int height,
unsigned int pitch)
{
uint8_t *converted_data = (uint8_t *)g_malloc(width * height * 4);
int x, y;
for (y = 0; y < height; y++) {
const uint8_t *line = &data[y * pitch];
const uint32_t row_offset = y * width;
for (x = 0; x < width; x++) {
uint8_t *pixel = &converted_data[(row_offset + x) * 4];
convert_yuy2_to_rgb(line, x, &pixel[0], &pixel[1], &pixel[2]);
pixel[3] = 255;
}
}
return converted_data;
}
static float pvideo_calculate_scale(unsigned int din_dout,
unsigned int output_size)
{
float calculated_in = din_dout * (output_size - 1);
calculated_in = floorf(calculated_in / (1 << 20) + 0.5f);
return (calculated_in + 1.0f) / output_size;
}
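/* Worked example (illustrative): with out_width = 640 and ds_dx = 0x200000
 * (2.0 in the hardware's DIN/DOUT fixed-point ratio, unity being 1 << 20),
 * calculated_in = round(2.0 * 639) = 1278 and the function returns
 * 1279 / 640 ~= 2.0, i.e. the input advances two texels per output pixel. */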
static void render_display_pvideo_overlay(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
// FIXME: This check against PVIDEO_SIZE_IN does not match HW behavior.
// Many games seem to pass this value when initializing or tearing down
// PVIDEO. On its own, this generally does not result in the overlay being
// hidden; however, there are certain games (e.g., Ultimate Beach Soccer)
// that use an unknown mechanism to hide the overlay without explicitly
// stopping it.
// Since the value seems to be set to 0xFFFFFFFF only in cases where the
// content is not valid, it is probably good enough to treat it as an
// implicit stop.
bool enabled = (d->pvideo.regs[NV_PVIDEO_BUFFER] & NV_PVIDEO_BUFFER_0_USE)
&& d->pvideo.regs[NV_PVIDEO_SIZE_IN] != 0xFFFFFFFF;
glUniform1ui(r->disp_rndr.pvideo_enable_loc, enabled);
if (!enabled) {
return;
}
hwaddr base = d->pvideo.regs[NV_PVIDEO_BASE];
hwaddr limit = d->pvideo.regs[NV_PVIDEO_LIMIT];
hwaddr offset = d->pvideo.regs[NV_PVIDEO_OFFSET];
int in_width =
GET_MASK(d->pvideo.regs[NV_PVIDEO_SIZE_IN], NV_PVIDEO_SIZE_IN_WIDTH);
int in_height =
GET_MASK(d->pvideo.regs[NV_PVIDEO_SIZE_IN], NV_PVIDEO_SIZE_IN_HEIGHT);
int in_s = GET_MASK(d->pvideo.regs[NV_PVIDEO_POINT_IN],
NV_PVIDEO_POINT_IN_S);
int in_t = GET_MASK(d->pvideo.regs[NV_PVIDEO_POINT_IN],
NV_PVIDEO_POINT_IN_T);
int in_pitch =
GET_MASK(d->pvideo.regs[NV_PVIDEO_FORMAT], NV_PVIDEO_FORMAT_PITCH);
int in_color =
GET_MASK(d->pvideo.regs[NV_PVIDEO_FORMAT], NV_PVIDEO_FORMAT_COLOR);
unsigned int out_width =
GET_MASK(d->pvideo.regs[NV_PVIDEO_SIZE_OUT], NV_PVIDEO_SIZE_OUT_WIDTH);
unsigned int out_height =
GET_MASK(d->pvideo.regs[NV_PVIDEO_SIZE_OUT], NV_PVIDEO_SIZE_OUT_HEIGHT);
float scale_x = 1.0f;
float scale_y = 1.0f;
unsigned int ds_dx = d->pvideo.regs[NV_PVIDEO_DS_DX];
unsigned int dt_dy = d->pvideo.regs[NV_PVIDEO_DT_DY];
if (ds_dx != NV_PVIDEO_DIN_DOUT_UNITY) {
scale_x = pvideo_calculate_scale(ds_dx, out_width);
}
if (dt_dy != NV_PVIDEO_DIN_DOUT_UNITY) {
scale_y = pvideo_calculate_scale(dt_dy, out_height);
}
// On HW, setting NV_PVIDEO_SIZE_IN larger than NV_PVIDEO_SIZE_OUT results
// in them being capped to the output size, content is not scaled. This is
// particularly important as NV_PVIDEO_SIZE_IN may be set to 0xFFFFFFFF
// during initialization or teardown.
if (in_width > out_width) {
in_width = floorf((float)out_width * scale_x + 0.5f);
}
if (in_height > out_height) {
in_height = floorf((float)out_height * scale_y + 0.5f);
}
/* TODO: support other color formats */
assert(in_color == NV_PVIDEO_FORMAT_COLOR_LE_CR8YB8CB8YA8);
unsigned int out_x =
GET_MASK(d->pvideo.regs[NV_PVIDEO_POINT_OUT], NV_PVIDEO_POINT_OUT_X);
unsigned int out_y =
GET_MASK(d->pvideo.regs[NV_PVIDEO_POINT_OUT], NV_PVIDEO_POINT_OUT_Y);
unsigned int color_key_enabled =
GET_MASK(d->pvideo.regs[NV_PVIDEO_FORMAT], NV_PVIDEO_FORMAT_DISPLAY);
glUniform1ui(r->disp_rndr.pvideo_color_key_enable_loc,
color_key_enabled);
// TODO: Verify that masking off the top byte is correct.
// SeaBlade sets a color key of 0x80000000 but the texture passed into the
// shader is cleared to 0 alpha.
unsigned int color_key = d->pvideo.regs[NV_PVIDEO_COLOR_KEY] & 0xFFFFFF;
glUniform4f(r->disp_rndr.pvideo_color_key_loc,
GET_MASK(color_key, NV_PVIDEO_COLOR_KEY_RED) / 255.0,
GET_MASK(color_key, NV_PVIDEO_COLOR_KEY_GREEN) / 255.0,
GET_MASK(color_key, NV_PVIDEO_COLOR_KEY_BLUE) / 255.0,
GET_MASK(color_key, NV_PVIDEO_COLOR_KEY_ALPHA) / 255.0);
assert(offset + in_pitch * in_height <= limit);
hwaddr end = base + offset + in_pitch * in_height;
assert(end <= memory_region_size(d->vram));
pgraph_apply_scaling_factor(pg, &out_x, &out_y);
pgraph_apply_scaling_factor(pg, &out_width, &out_height);
// Translate for the GL viewport origin.
out_y = MAX(r->gl_display_buffer_height - 1 - (int)(out_y + out_height), 0);
glActiveTexture(GL_TEXTURE0 + 1);
glBindTexture(GL_TEXTURE_2D, r->disp_rndr.pvideo_tex);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
uint8_t *tex_rgba = convert_texture_data__CR8YB8CB8YA8(
d->vram_ptr + base + offset, in_width, in_height, in_pitch);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, in_width, in_height, 0, GL_RGBA,
GL_UNSIGNED_BYTE, tex_rgba);
g_free(tex_rgba);
glUniform1i(r->disp_rndr.pvideo_tex_loc, 1);
glUniform2f(r->disp_rndr.pvideo_in_pos_loc, in_s, in_t);
glUniform4f(r->disp_rndr.pvideo_pos_loc,
out_x, out_y, out_width, out_height);
glUniform3f(r->disp_rndr.pvideo_scale_loc,
scale_x, scale_y, 1.0f / pg->surface_scale_factor);
}
static void render_display(NV2AState *d, SurfaceBinding *surface)
{
struct PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
unsigned int width, height;
uint32_t pline_offset, pstart_addr, pline_compare;
d->vga.get_resolution(&d->vga, (int*)&width, (int*)&height);
d->vga.get_offsets(&d->vga, &pline_offset, &pstart_addr, &pline_compare);
int line_offset = surface->pitch / pline_offset;
/* Adjust viewport height for interlaced mode, used only in 1080i */
if (d->vga.cr[NV_PRMCIO_INTERLACE_MODE] != NV_PRMCIO_INTERLACE_MODE_DISABLED) {
height *= 2;
}
pgraph_apply_scaling_factor(pg, &width, &height);
glBindFramebuffer(GL_FRAMEBUFFER, r->disp_rndr.fbo);
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, r->gl_display_buffer);
bool recreate = (
surface->fmt.gl_internal_format != r->gl_display_buffer_internal_format
|| width != r->gl_display_buffer_width
|| height != r->gl_display_buffer_height
|| surface->fmt.gl_format != r->gl_display_buffer_format
|| surface->fmt.gl_type != r->gl_display_buffer_type
);
if (recreate) {
/* XXX: There's apparently a bug in some Intel OpenGL drivers for
 * Windows that will leak this texture when it's orphaned after use in
 * another context, regardless of which thread it's created or released
 * on.
*
* Driver: 27.20.100.8729 9/11/2020 W10 x64
* Track: https://community.intel.com/t5/Graphics/OpenGL-Windows-drivers-for-Intel-HD-630-leaking-GPU-memory-when/td-p/1274423
*/
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
r->gl_display_buffer_internal_format = surface->fmt.gl_internal_format;
r->gl_display_buffer_width = width;
r->gl_display_buffer_height = height;
r->gl_display_buffer_format = surface->fmt.gl_format;
r->gl_display_buffer_type = surface->fmt.gl_type;
glTexImage2D(GL_TEXTURE_2D, 0,
r->gl_display_buffer_internal_format,
r->gl_display_buffer_width,
r->gl_display_buffer_height,
0,
r->gl_display_buffer_format,
r->gl_display_buffer_type,
NULL);
}
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0,
GL_TEXTURE_2D, r->gl_display_buffer, 0);
GLenum DrawBuffers[1] = {GL_COLOR_ATTACHMENT0};
glDrawBuffers(1, DrawBuffers);
assert(glCheckFramebufferStatus(GL_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE);
glBindTexture(GL_TEXTURE_2D, surface->gl_buffer);
glBindVertexArray(r->disp_rndr.vao);
glBindBuffer(GL_ARRAY_BUFFER, r->disp_rndr.vbo);
glUseProgram(r->disp_rndr.prog);
glProgramUniform1i(r->disp_rndr.prog, r->disp_rndr.tex_loc, 0);
glUniform2f(r->disp_rndr.display_size_loc, width, height);
glUniform1f(r->disp_rndr.line_offset_loc, line_offset);
render_display_pvideo_overlay(d);
glViewport(0, 0, width, height);
glColorMask(true, true, true, true);
glDisable(GL_SCISSOR_TEST);
glDisable(GL_BLEND);
glDisable(GL_STENCIL_TEST);
glDisable(GL_CULL_FACE);
glDisable(GL_DEPTH_TEST);
glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
glClearColor(0.0f, 0.0f, 0.0f, 1.0f);
glClear(GL_COLOR_BUFFER_BIT);
glDrawArrays(GL_TRIANGLES, 0, 3);
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0,
GL_TEXTURE_2D, 0, 0);
}
static void gl_fence(void)
{
GLsync fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
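/* Block (up to 5 seconds) until all previously submitted GL commands
 * have completed. */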
int result = glClientWaitSync(fence, GL_SYNC_FLUSH_COMMANDS_BIT,
(GLuint64)(5000000000));
assert(result == GL_CONDITION_SATISFIED || result == GL_ALREADY_SIGNALED);
glDeleteSync(fence);
}
void pgraph_gl_sync(NV2AState *d)
{
uint32_t pline_offset, pstart_addr, pline_compare;
d->vga.get_offsets(&d->vga, &pline_offset, &pstart_addr, &pline_compare);
SurfaceBinding *surface = pgraph_gl_surface_get_within(d, d->pcrtc.start + pline_offset);
if (surface == NULL) {
qemu_event_set(&d->pgraph.sync_complete);
return;
}
/* FIXME: Sanity check surface dimensions */
/* Wait for queued commands to complete */
pgraph_gl_upload_surface_data(d, surface, !tcg_enabled());
gl_fence();
assert(glGetError() == GL_NO_ERROR);
/* Render framebuffer in display context */
glo_set_current(g_nv2a_context_display);
render_display(d, surface);
gl_fence();
assert(glGetError() == GL_NO_ERROR);
/* Switch back to original context */
glo_set_current(g_nv2a_context_render);
qatomic_set(&d->pgraph.sync_pending, false);
qemu_event_set(&d->pgraph.sync_complete);
}
int pgraph_gl_get_framebuffer_surface(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
qemu_mutex_lock(&d->pfifo.lock);
// FIXME: Possible race condition with pgraph, consider lock
uint32_t pline_offset, pstart_addr, pline_compare;
d->vga.get_offsets(&d->vga, &pline_offset, &pstart_addr, &pline_compare);
SurfaceBinding *surface = pgraph_gl_surface_get_within(d, d->pcrtc.start + pline_offset);
if (surface == NULL || !surface->color) {
qemu_mutex_unlock(&d->pfifo.lock);
return 0;
}
assert(surface->color);
assert(surface->fmt.gl_attachment == GL_COLOR_ATTACHMENT0);
assert(surface->fmt.gl_format == GL_RGBA
|| surface->fmt.gl_format == GL_RGB
|| surface->fmt.gl_format == GL_BGR
|| surface->fmt.gl_format == GL_BGRA
);
surface->frame_time = pg->frame_time;
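/* Kick the PFIFO/PGRAPH worker and wait for it to render the surface into
 * the display buffer via pgraph_gl_sync(). */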
qemu_event_reset(&d->pgraph.sync_complete);
qatomic_set(&pg->sync_pending, true);
pfifo_kick(d);
qemu_mutex_unlock(&d->pfifo.lock);
qemu_event_wait(&d->pgraph.sync_complete);
return r->gl_display_buffer;
}

View File

@@ -0,0 +1,528 @@
/*
* Geforce NV2A PGRAPH OpenGL Renderer
*
* Copyright (c) 2012 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2018-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "qemu/fast-hash.h"
#include "hw/xbox/nv2a/nv2a_int.h"
#include "debug.h"
#include "renderer.h"
void pgraph_gl_clear_surface(NV2AState *d, uint32_t parameter)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
NV2A_DPRINTF("---------PRE CLEAR ------\n");
pg->clearing = true;
GLbitfield gl_mask = 0;
bool write_color = (parameter & NV097_CLEAR_SURFACE_COLOR);
bool write_zeta =
(parameter & (NV097_CLEAR_SURFACE_Z | NV097_CLEAR_SURFACE_STENCIL));
if (write_zeta) {
GLint gl_clear_stencil;
GLfloat gl_clear_depth;
pgraph_get_clear_depth_stencil_value(pg, &gl_clear_depth,
&gl_clear_stencil);
if (parameter & NV097_CLEAR_SURFACE_Z) {
gl_mask |= GL_DEPTH_BUFFER_BIT;
glDepthMask(GL_TRUE);
glClearDepth(gl_clear_depth);
}
if (parameter & NV097_CLEAR_SURFACE_STENCIL) {
gl_mask |= GL_STENCIL_BUFFER_BIT;
glStencilMask(0xff);
glClearStencil(gl_clear_stencil);
}
}
if (write_color) {
gl_mask |= GL_COLOR_BUFFER_BIT;
glColorMask((parameter & NV097_CLEAR_SURFACE_R)
? GL_TRUE : GL_FALSE,
(parameter & NV097_CLEAR_SURFACE_G)
? GL_TRUE : GL_FALSE,
(parameter & NV097_CLEAR_SURFACE_B)
? GL_TRUE : GL_FALSE,
(parameter & NV097_CLEAR_SURFACE_A)
? GL_TRUE : GL_FALSE);
GLfloat rgba[4];
pgraph_get_clear_color(pg, rgba);
glClearColor(rgba[0], rgba[1], rgba[2], rgba[3]);
}
pgraph_gl_surface_update(d, true, write_color, write_zeta);
/* FIXME: Needs confirmation */
unsigned int xmin =
GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CLEARRECTX), NV_PGRAPH_CLEARRECTX_XMIN);
unsigned int xmax =
GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CLEARRECTX), NV_PGRAPH_CLEARRECTX_XMAX);
unsigned int ymin =
GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CLEARRECTY), NV_PGRAPH_CLEARRECTY_YMIN);
unsigned int ymax =
GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CLEARRECTY), NV_PGRAPH_CLEARRECTY_YMAX);
NV2A_DPRINTF(
"------------------CLEAR 0x%x %d,%d - %d,%d %x---------------\n",
parameter, xmin, ymin, xmax, ymax,
d->pgraph.regs_[NV_PGRAPH_COLORCLEARVALUE]);
unsigned int scissor_width = xmax - xmin + 1,
scissor_height = ymax - ymin + 1;
pgraph_apply_anti_aliasing_factor(pg, &xmin, &ymin);
pgraph_apply_anti_aliasing_factor(pg, &scissor_width, &scissor_height);
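/* Flip Y: PGRAPH clear rectangles are top-down, GL scissor coordinates are
 * bottom-up. */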
ymin = pg->surface_binding_dim.height - (ymin + scissor_height);
NV2A_DPRINTF("Translated clear rect to %d,%d - %d,%d\n", xmin, ymin,
xmin + scissor_width - 1, ymin + scissor_height - 1);
bool full_clear = !xmin && !ymin &&
scissor_width >= pg->surface_binding_dim.width &&
scissor_height >= pg->surface_binding_dim.height;
pgraph_apply_scaling_factor(pg, &xmin, &ymin);
pgraph_apply_scaling_factor(pg, &scissor_width, &scissor_height);
/* FIXME: Respect window clip?!?! */
glEnable(GL_SCISSOR_TEST);
glScissor(xmin, ymin, scissor_width, scissor_height);
/* Dither */
/* FIXME: Maybe also disable it here? Also GL implementation dependent. */
if (pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0) & NV_PGRAPH_CONTROL_0_DITHERENABLE) {
glEnable(GL_DITHER);
} else {
glDisable(GL_DITHER);
}
glClear(gl_mask);
glDisable(GL_SCISSOR_TEST);
pgraph_gl_set_surface_dirty(pg, write_color, write_zeta);
if (r->color_binding) {
r->color_binding->cleared = full_clear && write_color;
}
if (r->zeta_binding) {
r->zeta_binding->cleared = full_clear && write_zeta;
}
pg->clearing = false;
}
void pgraph_gl_draw_begin(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
NV2A_GL_DGROUP_BEGIN("NV097_SET_BEGIN_END: 0x%x", pg->primitive_mode);
uint32_t control_0 = pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0);
bool mask_alpha = control_0 & NV_PGRAPH_CONTROL_0_ALPHA_WRITE_ENABLE;
bool mask_red = control_0 & NV_PGRAPH_CONTROL_0_RED_WRITE_ENABLE;
bool mask_green = control_0 & NV_PGRAPH_CONTROL_0_GREEN_WRITE_ENABLE;
bool mask_blue = control_0 & NV_PGRAPH_CONTROL_0_BLUE_WRITE_ENABLE;
bool color_write = mask_alpha || mask_red || mask_green || mask_blue;
bool depth_test = control_0 & NV_PGRAPH_CONTROL_0_ZENABLE;
bool stencil_test =
pgraph_reg_r(pg, NV_PGRAPH_CONTROL_1) & NV_PGRAPH_CONTROL_1_STENCIL_TEST_ENABLE;
bool is_nop_draw = !(color_write || depth_test || stencil_test);
pgraph_gl_surface_update(d, true, true, depth_test || stencil_test);
if (is_nop_draw) {
return;
}
assert(r->color_binding || r->zeta_binding);
pgraph_gl_bind_textures(d);
pgraph_gl_bind_shaders(pg);
glColorMask(mask_red, mask_green, mask_blue, mask_alpha);
glDepthMask(!!(control_0 & NV_PGRAPH_CONTROL_0_ZWRITEENABLE));
glStencilMask(GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CONTROL_1),
NV_PGRAPH_CONTROL_1_STENCIL_MASK_WRITE));
if (pgraph_reg_r(pg, NV_PGRAPH_BLEND) & NV_PGRAPH_BLEND_EN) {
glEnable(GL_BLEND);
uint32_t sfactor = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_BLEND),
NV_PGRAPH_BLEND_SFACTOR);
uint32_t dfactor = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_BLEND),
NV_PGRAPH_BLEND_DFACTOR);
assert(sfactor < ARRAY_SIZE(pgraph_blend_factor_gl_map));
assert(dfactor < ARRAY_SIZE(pgraph_blend_factor_gl_map));
glBlendFunc(pgraph_blend_factor_gl_map[sfactor],
pgraph_blend_factor_gl_map[dfactor]);
uint32_t equation = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_BLEND),
NV_PGRAPH_BLEND_EQN);
assert(equation < ARRAY_SIZE(pgraph_blend_equation_gl_map));
glBlendEquation(pgraph_blend_equation_gl_map[equation]);
uint32_t blend_color = pgraph_reg_r(pg, NV_PGRAPH_BLENDCOLOR);
float gl_blend_color[4];
pgraph_argb_pack32_to_rgba_float(blend_color, gl_blend_color);
glBlendColor(gl_blend_color[0], gl_blend_color[1], gl_blend_color[2],
gl_blend_color[3]);
} else {
glDisable(GL_BLEND);
}
/* Face culling */
if (pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER)
& NV_PGRAPH_SETUPRASTER_CULLENABLE) {
uint32_t cull_face = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER),
NV_PGRAPH_SETUPRASTER_CULLCTRL);
assert(cull_face < ARRAY_SIZE(pgraph_cull_face_gl_map));
glCullFace(pgraph_cull_face_gl_map[cull_face]);
glEnable(GL_CULL_FACE);
} else {
glDisable(GL_CULL_FACE);
}
/* Clipping */
glEnable(GL_CLIP_DISTANCE0);
glEnable(GL_CLIP_DISTANCE1);
/* Front-face select */
glFrontFace(pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER)
& NV_PGRAPH_SETUPRASTER_FRONTFACE
? GL_CCW : GL_CW);
/* Polygon offset */
/* FIXME: GL implementation-specific, maybe do this in VS? */
if (pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER) &
NV_PGRAPH_SETUPRASTER_POFFSETFILLENABLE) {
glEnable(GL_POLYGON_OFFSET_FILL);
} else {
glDisable(GL_POLYGON_OFFSET_FILL);
}
if (pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER) &
NV_PGRAPH_SETUPRASTER_POFFSETLINEENABLE) {
glEnable(GL_POLYGON_OFFSET_LINE);
} else {
glDisable(GL_POLYGON_OFFSET_LINE);
}
if (pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER) &
NV_PGRAPH_SETUPRASTER_POFFSETPOINTENABLE) {
glEnable(GL_POLYGON_OFFSET_POINT);
} else {
glDisable(GL_POLYGON_OFFSET_POINT);
}
if (pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER) &
(NV_PGRAPH_SETUPRASTER_POFFSETFILLENABLE |
NV_PGRAPH_SETUPRASTER_POFFSETLINEENABLE |
NV_PGRAPH_SETUPRASTER_POFFSETPOINTENABLE)) {
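/* The zoffset registers hold raw IEEE-754 float bits; reinterpret them
 * here (relying on -fno-strict-aliasing, as is standard in QEMU builds). */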
uint32_t zfactor_u32 = pgraph_reg_r(pg, NV_PGRAPH_ZOFFSETFACTOR);
GLfloat zfactor = *(float*)&zfactor_u32;
uint32_t zbias_u32 = pgraph_reg_r(pg, NV_PGRAPH_ZOFFSETBIAS);
GLfloat zbias = *(float*)&zbias_u32;
glPolygonOffset(zfactor, zbias);
}
/* Depth testing */
if (depth_test) {
glEnable(GL_DEPTH_TEST);
uint32_t depth_func = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0),
NV_PGRAPH_CONTROL_0_ZFUNC);
assert(depth_func < ARRAY_SIZE(pgraph_depth_func_gl_map));
glDepthFunc(pgraph_depth_func_gl_map[depth_func]);
} else {
glDisable(GL_DEPTH_TEST);
}
if (GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_ZCOMPRESSOCCLUDE),
NV_PGRAPH_ZCOMPRESSOCCLUDE_ZCLAMP_EN) ==
NV_PGRAPH_ZCOMPRESSOCCLUDE_ZCLAMP_EN_CLAMP) {
glEnable(GL_DEPTH_CLAMP);
} else {
glDisable(GL_DEPTH_CLAMP);
}
if (GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CONTROL_3),
NV_PGRAPH_CONTROL_3_SHADEMODE) ==
NV_PGRAPH_CONTROL_3_SHADEMODE_FLAT) {
glProvokingVertex(GL_FIRST_VERTEX_CONVENTION);
}
if (stencil_test) {
glEnable(GL_STENCIL_TEST);
uint32_t stencil_func = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CONTROL_1),
NV_PGRAPH_CONTROL_1_STENCIL_FUNC);
uint32_t stencil_ref = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CONTROL_1),
NV_PGRAPH_CONTROL_1_STENCIL_REF);
uint32_t func_mask = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CONTROL_1),
NV_PGRAPH_CONTROL_1_STENCIL_MASK_READ);
uint32_t op_fail = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CONTROL_2),
NV_PGRAPH_CONTROL_2_STENCIL_OP_FAIL);
uint32_t op_zfail = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CONTROL_2),
NV_PGRAPH_CONTROL_2_STENCIL_OP_ZFAIL);
uint32_t op_zpass = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CONTROL_2),
NV_PGRAPH_CONTROL_2_STENCIL_OP_ZPASS);
assert(stencil_func < ARRAY_SIZE(pgraph_stencil_func_gl_map));
assert(op_fail < ARRAY_SIZE(pgraph_stencil_op_gl_map));
assert(op_zfail < ARRAY_SIZE(pgraph_stencil_op_gl_map));
assert(op_zpass < ARRAY_SIZE(pgraph_stencil_op_gl_map));
glStencilFunc(
pgraph_stencil_func_gl_map[stencil_func],
stencil_ref,
func_mask);
glStencilOp(
pgraph_stencil_op_gl_map[op_fail],
pgraph_stencil_op_gl_map[op_zfail],
pgraph_stencil_op_gl_map[op_zpass]);
} else {
glDisable(GL_STENCIL_TEST);
}
/* Dither */
/* FIXME: GL implementation dependent */
if (pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0) &
NV_PGRAPH_CONTROL_0_DITHERENABLE) {
glEnable(GL_DITHER);
} else {
glDisable(GL_DITHER);
}
glEnable(GL_PROGRAM_POINT_SIZE);
bool anti_aliasing = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_ANTIALIASING), NV_PGRAPH_ANTIALIASING_ENABLE);
/* Edge Antialiasing */
if (!anti_aliasing && pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER) &
NV_PGRAPH_SETUPRASTER_LINESMOOTHENABLE) {
glEnable(GL_LINE_SMOOTH);
} else {
glDisable(GL_LINE_SMOOTH);
}
if (!anti_aliasing && pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER) &
NV_PGRAPH_SETUPRASTER_POLYSMOOTHENABLE) {
glEnable(GL_POLYGON_SMOOTH);
} else {
glDisable(GL_POLYGON_SMOOTH);
}
unsigned int vp_width = pg->surface_binding_dim.width,
vp_height = pg->surface_binding_dim.height;
pgraph_apply_scaling_factor(pg, &vp_width, &vp_height);
glViewport(0, 0, vp_width, vp_height);
/* Surface clip */
/* FIXME: Consider moving to PSH w/ window clip */
unsigned int xmin = pg->surface_shape.clip_x - pg->surface_binding_dim.clip_x,
ymin = pg->surface_shape.clip_y - pg->surface_binding_dim.clip_y;
unsigned int xmax = xmin + pg->surface_shape.clip_width - 1,
ymax = ymin + pg->surface_shape.clip_height - 1;
unsigned int scissor_width = xmax - xmin + 1,
scissor_height = ymax - ymin + 1;
pgraph_apply_anti_aliasing_factor(pg, &xmin, &ymin);
pgraph_apply_anti_aliasing_factor(pg, &scissor_width, &scissor_height);
ymin = pg->surface_binding_dim.height - (ymin + scissor_height);
pgraph_apply_scaling_factor(pg, &xmin, &ymin);
pgraph_apply_scaling_factor(pg, &scissor_width, &scissor_height);
glEnable(GL_SCISSOR_TEST);
glScissor(xmin, ymin, scissor_width, scissor_height);
/* Visibility testing */
if (pg->zpass_pixel_count_enable) {
r->gl_zpass_pixel_count_query_count++;
r->gl_zpass_pixel_count_queries = (GLuint*)g_realloc(
r->gl_zpass_pixel_count_queries,
sizeof(GLuint) * r->gl_zpass_pixel_count_query_count);
GLuint gl_query;
glGenQueries(1, &gl_query);
r->gl_zpass_pixel_count_queries[
r->gl_zpass_pixel_count_query_count - 1] = gl_query;
glBeginQuery(GL_SAMPLES_PASSED, gl_query);
}
}
void pgraph_gl_draw_end(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
uint32_t control_0 = pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0);
bool mask_alpha = control_0 & NV_PGRAPH_CONTROL_0_ALPHA_WRITE_ENABLE;
bool mask_red = control_0 & NV_PGRAPH_CONTROL_0_RED_WRITE_ENABLE;
bool mask_green = control_0 & NV_PGRAPH_CONTROL_0_GREEN_WRITE_ENABLE;
bool mask_blue = control_0 & NV_PGRAPH_CONTROL_0_BLUE_WRITE_ENABLE;
bool color_write = mask_alpha || mask_red || mask_green || mask_blue;
bool depth_test = control_0 & NV_PGRAPH_CONTROL_0_ZENABLE;
bool stencil_test =
pgraph_reg_r(pg, NV_PGRAPH_CONTROL_1) & NV_PGRAPH_CONTROL_1_STENCIL_TEST_ENABLE;
bool is_nop_draw = !(color_write || depth_test || stencil_test);
if (is_nop_draw) {
// FIXME: Check PGRAPH register 0x880.
// HW uses bit 11 in 0x880 to enable or disable a color/zeta limit
// check that will raise an exception in the case that a draw should
// modify the color and/or zeta buffer but the target(s) are masked
// off. This check only seems to trigger during fragment
// processing; it is legal to attempt a draw that is entirely
// clipped regardless of 0x880. See xemu#635 for context.
return;
}
pgraph_gl_flush_draw(d);
/* End of visibility testing */
if (pg->zpass_pixel_count_enable) {
nv2a_profile_inc_counter(NV2A_PROF_QUERY);
glEndQuery(GL_SAMPLES_PASSED);
}
pg->draw_time++;
if (r->color_binding && pgraph_color_write_enabled(pg)) {
r->color_binding->draw_time = pg->draw_time;
}
if (r->zeta_binding && pgraph_zeta_write_enabled(pg)) {
r->zeta_binding->draw_time = pg->draw_time;
}
pgraph_gl_set_surface_dirty(pg, color_write, depth_test || stencil_test);
NV2A_GL_DGROUP_END();
}
void pgraph_gl_flush_draw(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
if (!(r->color_binding || r->zeta_binding)) {
return;
}
assert(r->shader_binding);
if (pg->draw_arrays_length) {
NV2A_GL_DPRINTF(false, "Draw Arrays");
nv2a_profile_inc_counter(NV2A_PROF_DRAW_ARRAYS);
assert(pg->inline_elements_length == 0);
assert(pg->inline_buffer_length == 0);
assert(pg->inline_array_length == 0);
pgraph_gl_bind_vertex_attributes(d, pg->draw_arrays_min_start,
pg->draw_arrays_max_count - 1,
false, 0,
pg->draw_arrays_max_count - 1);
glMultiDrawArrays(r->shader_binding->gl_primitive_mode,
pg->draw_arrays_start,
pg->draw_arrays_count,
pg->draw_arrays_length);
} else if (pg->inline_elements_length) {
NV2A_GL_DPRINTF(false, "Inline Elements");
nv2a_profile_inc_counter(NV2A_PROF_INLINE_ELEMENTS);
assert(pg->inline_buffer_length == 0);
assert(pg->inline_array_length == 0);
uint32_t min_element = (uint32_t)-1;
uint32_t max_element = 0;
for (int i=0; i < pg->inline_elements_length; i++) {
max_element = MAX(pg->inline_elements[i], max_element);
min_element = MIN(pg->inline_elements[i], min_element);
}
pgraph_gl_bind_vertex_attributes(
d, min_element, max_element, false, 0,
pg->inline_elements[pg->inline_elements_length - 1]);
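/* Cache index buffers by content hash so identical inline element lists
 * reuse the same GL buffer instead of being re-uploaded. */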
VertexKey k;
memset(&k, 0, sizeof(VertexKey));
k.count = pg->inline_elements_length;
k.gl_type = GL_UNSIGNED_INT;
k.gl_normalize = GL_FALSE;
k.stride = sizeof(uint32_t);
uint64_t h = fast_hash((uint8_t*)pg->inline_elements,
pg->inline_elements_length * 4);
LruNode *node = lru_lookup(&r->element_cache, h, &k);
VertexLruNode *found = container_of(node, VertexLruNode, node);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, found->gl_buffer);
if (!found->initialized) {
nv2a_profile_inc_counter(NV2A_PROF_GEOM_BUFFER_UPDATE_4);
glBufferData(GL_ELEMENT_ARRAY_BUFFER,
pg->inline_elements_length * 4,
pg->inline_elements, GL_STATIC_DRAW);
found->initialized = true;
} else {
nv2a_profile_inc_counter(NV2A_PROF_GEOM_BUFFER_UPDATE_4_NOTDIRTY);
}
glDrawElements(r->shader_binding->gl_primitive_mode,
pg->inline_elements_length, GL_UNSIGNED_INT,
(void *)0);
} else if (pg->inline_buffer_length) {
NV2A_GL_DPRINTF(false, "Inline Buffer");
nv2a_profile_inc_counter(NV2A_PROF_INLINE_BUFFERS);
assert(pg->inline_array_length == 0);
if (pg->compressed_attrs) {
pg->compressed_attrs = 0;
pgraph_gl_bind_shaders(pg);
}
for (int i = 0; i < NV2A_VERTEXSHADER_ATTRIBUTES; i++) {
VertexAttribute *attr = &pg->vertex_attributes[i];
if (attr->inline_buffer_populated) {
nv2a_profile_inc_counter(NV2A_PROF_GEOM_BUFFER_UPDATE_3);
glBindBuffer(GL_ARRAY_BUFFER, r->gl_inline_buffer[i]);
glBufferData(GL_ARRAY_BUFFER,
pg->inline_buffer_length * sizeof(float) * 4,
attr->inline_buffer, GL_STREAM_DRAW);
glVertexAttribPointer(i, 4, GL_FLOAT, GL_FALSE, 0, 0);
glEnableVertexAttribArray(i);
attr->inline_buffer_populated = false;
memcpy(attr->inline_value,
attr->inline_buffer + (pg->inline_buffer_length - 1) * 4,
sizeof(attr->inline_value));
} else {
glDisableVertexAttribArray(i);
glVertexAttrib4fv(i, attr->inline_value);
}
}
glDrawArrays(r->shader_binding->gl_primitive_mode,
0, pg->inline_buffer_length);
} else if (pg->inline_array_length) {
NV2A_GL_DPRINTF(false, "Inline Array");
nv2a_profile_inc_counter(NV2A_PROF_INLINE_ARRAYS);
unsigned int index_count = pgraph_gl_bind_inline_array(d);
glDrawArrays(r->shader_binding->gl_primitive_mode,
0, index_count);
} else {
NV2A_GL_DPRINTF(true, "EMPTY NV097_SET_BEGIN_END");
NV2A_UNCONFIRMED("EMPTY NV097_SET_BEGIN_END");
}
}

View File

@@ -0,0 +1,12 @@
specific_ss.add([sdl, gloffscreen, files(
'blit.c',
'debug.c',
'display.c',
'draw.c',
'renderer.c',
'reports.c',
'shaders.c',
'surface.c',
'texture.c',
'vertex.c',
)])

View File

@@ -0,0 +1,201 @@
/*
* Geforce NV2A PGRAPH OpenGL Renderer
*
* Copyright (c) 2012 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2018-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "hw/xbox/nv2a/nv2a_int.h"
#include "hw/xbox/nv2a/pgraph/pgraph.h"
#include "debug.h"
#include "renderer.h"
GloContext *g_nv2a_context_render;
GloContext *g_nv2a_context_display;
static void nv2a_gl_context_init(void)
{
g_nv2a_context_render = glo_context_create();
g_nv2a_context_display = glo_context_create();
}
static void pgraph_gl_init_thread(NV2AState *d)
{
glo_set_current(g_nv2a_context_render);
}
static void pgraph_gl_deinit(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
glo_set_current(g_nv2a_context_render);
pgraph_gl_deinit_surfaces(pg);
pgraph_gl_deinit_shader_cache(pg);
pgraph_gl_deinit_texture_cache(pg);
glo_set_current(NULL);
glo_context_destroy(g_nv2a_context_render);
glo_context_destroy(g_nv2a_context_display);
}
static void pgraph_gl_flip_stall(NV2AState *d)
{
NV2A_GL_DFRAME_TERMINATOR();
glFinish();
}
static void pgraph_gl_flush(NV2AState *d)
{
pgraph_gl_surface_flush(d);
pgraph_gl_mark_textures_possibly_dirty(d, 0, memory_region_size(d->vram));
pgraph_gl_update_entire_memory_buffer(d);
/* FIXME: Flush more? */
qatomic_set(&d->pgraph.flush_pending, false);
qemu_event_set(&d->pgraph.flush_complete);
}
static void pgraph_gl_process_pending(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
if (qatomic_read(&r->downloads_pending) ||
qatomic_read(&r->download_dirty_surfaces_pending) ||
qatomic_read(&d->pgraph.sync_pending) ||
qatomic_read(&d->pgraph.flush_pending) ||
qatomic_read(&r->shader_cache_writeback_pending)) {
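/* Drop pfifo.lock before taking pgraph.lock to keep the lock order
 * consistent and avoid deadlocking against the PGRAPH thread. */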
qemu_mutex_unlock(&d->pfifo.lock);
qemu_mutex_lock(&d->pgraph.lock);
if (qatomic_read(&r->downloads_pending)) {
pgraph_gl_process_pending_downloads(d);
}
if (qatomic_read(&r->download_dirty_surfaces_pending)) {
pgraph_gl_download_dirty_surfaces(d);
}
if (qatomic_read(&d->pgraph.sync_pending)) {
pgraph_gl_sync(d);
}
if (qatomic_read(&d->pgraph.flush_pending)) {
pgraph_gl_flush(d);
}
if (qatomic_read(&r->shader_cache_writeback_pending)) {
pgraph_gl_shader_write_cache_reload_list(&d->pgraph);
}
qemu_mutex_unlock(&d->pgraph.lock);
qemu_mutex_lock(&d->pfifo.lock);
}
}
static void pgraph_gl_pre_savevm_trigger(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
qatomic_set(&r->download_dirty_surfaces_pending, true);
qemu_event_reset(&r->dirty_surfaces_download_complete);
}
static void pgraph_gl_pre_savevm_wait(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
qemu_event_wait(&r->dirty_surfaces_download_complete);
}
static void pgraph_gl_pre_shutdown_trigger(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
qatomic_set(&r->shader_cache_writeback_pending, true);
qemu_event_reset(&r->shader_cache_writeback_complete);
}
static void pgraph_gl_pre_shutdown_wait(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
qemu_event_wait(&r->shader_cache_writeback_complete);
}
static void pgraph_gl_init(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
pg->gl_renderer_state = g_malloc(sizeof(PGRAPHGLState));
/* fire up opengl */
glo_set_current(g_nv2a_context_render);
#ifdef DEBUG_NV2A_GL
gl_debug_initialize();
#endif
/* DXT textures */
assert(glo_check_extension("GL_EXT_texture_compression_s3tc"));
/* Internal RGB565 texture format */
assert(glo_check_extension("GL_ARB_ES2_compatibility"));
pgraph_gl_init_surfaces(pg);
pgraph_gl_init_reports(d);
pgraph_gl_init_texture_cache(d);
pgraph_gl_init_vertex_cache(d);
pgraph_gl_init_shader_cache(pg);
glo_set_current(g_nv2a_context_display);
pgraph_gl_init_display_renderer(d);
glo_set_current(NULL);
}
static PGRAPHRenderer pgraph_gl_renderer = {
.type = CONFIG_DISPLAY_RENDERER_OPENGL,
.name = "OpenGL",
.ops = {
.init = pgraph_gl_init,
.early_context_init = nv2a_gl_context_init,
.init_thread = pgraph_gl_init_thread,
.finalize = pgraph_gl_deinit,
.clear_report_value = pgraph_gl_clear_report_value,
.clear_surface = pgraph_gl_clear_surface,
.draw_begin = pgraph_gl_draw_begin,
.draw_end = pgraph_gl_draw_end,
.flip_stall = pgraph_gl_flip_stall,
.flush_draw = pgraph_gl_flush_draw,
.get_report = pgraph_gl_get_report,
.image_blit = pgraph_gl_image_blit,
.pre_savevm_trigger = pgraph_gl_pre_savevm_trigger,
.pre_savevm_wait = pgraph_gl_pre_savevm_wait,
.pre_shutdown_trigger = pgraph_gl_pre_shutdown_trigger,
.pre_shutdown_wait = pgraph_gl_pre_shutdown_wait,
.process_pending = pgraph_gl_process_pending,
.process_pending_reports = pgraph_gl_process_pending_reports,
.surface_update = pgraph_gl_surface_update,
.set_surface_scale_factor = pgraph_gl_set_surface_scale_factor,
.get_surface_scale_factor = pgraph_gl_get_surface_scale_factor,
.get_framebuffer_surface = pgraph_gl_get_framebuffer_surface,
}
};
static void __attribute__((constructor)) register_renderer(void)
{
pgraph_renderer_register(&pgraph_gl_renderer);
}

View File

@@ -0,0 +1,283 @@
/*
* Geforce NV2A PGRAPH OpenGL Renderer
*
* Copyright (c) 2012 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2018-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HW_XBOX_NV2A_PGRAPH_GL_RENDERER_H
#define HW_XBOX_NV2A_PGRAPH_GL_RENDERER_H
#include "qemu/osdep.h"
#include "qemu/thread.h"
#include "qemu/queue.h"
#include "qemu/lru.h"
#include "hw/hw.h"
#include "hw/xbox/nv2a/nv2a_int.h"
#include "hw/xbox/nv2a/nv2a_regs.h"
#include "hw/xbox/nv2a/pgraph/surface.h"
#include "hw/xbox/nv2a/pgraph/texture.h"
#include "hw/xbox/nv2a/pgraph/shaders.h"
#include "gloffscreen.h"
#include "constants.h"
typedef struct SurfaceBinding {
QTAILQ_ENTRY(SurfaceBinding) entry;
MemAccessCallback *access_cb;
hwaddr vram_addr;
SurfaceShape shape;
uintptr_t dma_addr;
uintptr_t dma_len;
bool color;
bool swizzle;
unsigned int width;
unsigned int height;
unsigned int pitch;
size_t size;
bool cleared;
int frame_time;
int draw_time;
bool draw_dirty;
bool download_pending;
bool upload_pending;
GLuint gl_buffer;
SurfaceFormatInfo fmt;
} SurfaceBinding;
typedef struct TextureBinding {
unsigned int refcnt;
int draw_time;
uint64_t data_hash;
unsigned int scale;
unsigned int min_filter;
unsigned int mag_filter;
unsigned int addru;
unsigned int addrv;
unsigned int addrp;
uint32_t border_color;
bool border_color_set;
GLenum gl_target;
GLuint gl_texture;
} TextureBinding;
typedef struct ShaderBinding {
GLuint gl_program;
GLenum gl_primitive_mode;
GLint psh_constant_loc[9][2];
GLint alpha_ref_loc;
GLint bump_mat_loc[NV2A_MAX_TEXTURES];
GLint bump_scale_loc[NV2A_MAX_TEXTURES];
GLint bump_offset_loc[NV2A_MAX_TEXTURES];
GLint tex_scale_loc[NV2A_MAX_TEXTURES];
GLint surface_size_loc;
GLint clip_range_loc;
GLint vsh_constant_loc[NV2A_VERTEXSHADER_CONSTANTS];
uint32_t vsh_constants[NV2A_VERTEXSHADER_CONSTANTS][4];
GLint inv_viewport_loc;
GLint ltctxa_loc[NV2A_LTCTXA_COUNT];
GLint ltctxb_loc[NV2A_LTCTXB_COUNT];
GLint ltc1_loc[NV2A_LTC1_COUNT];
GLint fog_color_loc;
GLint fog_param_loc;
GLint light_infinite_half_vector_loc[NV2A_MAX_LIGHTS];
GLint light_infinite_direction_loc[NV2A_MAX_LIGHTS];
GLint light_local_position_loc[NV2A_MAX_LIGHTS];
GLint light_local_attenuation_loc[NV2A_MAX_LIGHTS];
GLint clip_region_loc[8];
GLint material_alpha_loc;
} ShaderBinding;
typedef struct ShaderLruNode {
LruNode node;
bool cached;
void *program;
size_t program_size;
GLenum program_format;
ShaderState state;
ShaderBinding *binding;
QemuThread *save_thread;
} ShaderLruNode;
typedef struct VertexKey {
size_t count;
size_t stride;
hwaddr addr;
GLboolean gl_normalize;
GLuint gl_type;
} VertexKey;
typedef struct VertexLruNode {
LruNode node;
VertexKey key;
bool initialized;
GLuint gl_buffer;
} VertexLruNode;
typedef struct TextureKey {
TextureShape state;
hwaddr texture_vram_offset;
hwaddr texture_length;
hwaddr palette_vram_offset;
hwaddr palette_length;
} TextureKey;
typedef struct TextureLruNode {
LruNode node;
TextureKey key;
TextureBinding *binding;
bool possibly_dirty;
} TextureLruNode;
typedef struct QueryReport {
QSIMPLEQ_ENTRY(QueryReport) entry;
bool clear;
uint32_t parameter;
unsigned int query_count;
GLuint *queries;
} QueryReport;
typedef struct PGRAPHGLState {
GLuint gl_framebuffer;
GLuint gl_display_buffer;
GLint gl_display_buffer_internal_format;
GLsizei gl_display_buffer_width;
GLsizei gl_display_buffer_height;
GLenum gl_display_buffer_format;
GLenum gl_display_buffer_type;
Lru element_cache;
VertexLruNode *element_cache_entries;
GLuint gl_inline_array_buffer;
GLuint gl_memory_buffer;
GLuint gl_vertex_array;
GLuint gl_inline_buffer[NV2A_VERTEXSHADER_ATTRIBUTES];
QTAILQ_HEAD(, SurfaceBinding) surfaces;
SurfaceBinding *color_binding, *zeta_binding;
bool downloads_pending;
QemuEvent downloads_complete;
bool download_dirty_surfaces_pending;
QemuEvent dirty_surfaces_download_complete; // common
TextureBinding *texture_binding[NV2A_MAX_TEXTURES];
Lru texture_cache;
TextureLruNode *texture_cache_entries;
Lru shader_cache;
ShaderLruNode *shader_cache_entries;
ShaderBinding *shader_binding;
QemuMutex shader_cache_lock;
QemuThread shader_disk_thread;
unsigned int zpass_pixel_count_result;
unsigned int gl_zpass_pixel_count_query_count;
GLuint *gl_zpass_pixel_count_queries;
QSIMPLEQ_HEAD(, QueryReport) report_queue;
bool shader_cache_writeback_pending;
QemuEvent shader_cache_writeback_complete;
struct s2t_rndr {
GLuint fbo, vao, vbo, prog;
GLuint tex_loc, surface_size_loc;
} s2t_rndr;
struct disp_rndr {
GLuint fbo, vao, vbo, prog;
GLuint display_size_loc;
GLuint line_offset_loc;
GLuint tex_loc;
GLuint pvideo_tex;
GLint pvideo_enable_loc;
GLint pvideo_tex_loc;
GLint pvideo_in_pos_loc;
GLint pvideo_pos_loc;
GLint pvideo_scale_loc;
GLint pvideo_color_key_enable_loc;
GLint pvideo_color_key_loc;
GLint palette_loc[256];
} disp_rndr;
} PGRAPHGLState;
extern GloContext *g_nv2a_context_render;
extern GloContext *g_nv2a_context_display;
unsigned int pgraph_gl_bind_inline_array(NV2AState *d);
void pgraph_gl_bind_shaders(PGRAPHState *pg);
void pgraph_gl_bind_textures(NV2AState *d);
void pgraph_gl_bind_vertex_attributes(NV2AState *d, unsigned int min_element, unsigned int max_element, bool inline_data, unsigned int inline_stride, unsigned int provoking_element);
bool pgraph_gl_check_surface_to_texture_compatibility(const SurfaceBinding *surface, const TextureShape *shape);
GLuint pgraph_gl_compile_shader(const char *vs_src, const char *fs_src);
void pgraph_gl_deinit_shader_cache(PGRAPHState *pg);
void pgraph_gl_deinit_surfaces(PGRAPHState *pg);
void pgraph_gl_deinit_texture_cache(PGRAPHState *pg);
void pgraph_gl_download_dirty_surfaces(NV2AState *d);
void pgraph_gl_clear_report_value(NV2AState *d);
void pgraph_gl_clear_surface(NV2AState *d, uint32_t parameter);
void pgraph_gl_draw_begin(NV2AState *d);
void pgraph_gl_draw_end(NV2AState *d);
void pgraph_gl_flush_draw(NV2AState *d);
void pgraph_gl_get_report(NV2AState *d, uint32_t parameter);
void pgraph_gl_image_blit(NV2AState *d);
void pgraph_gl_mark_textures_possibly_dirty(NV2AState *d, hwaddr addr, hwaddr size);
void pgraph_gl_process_pending_reports(NV2AState *d);
void pgraph_gl_surface_flush(NV2AState *d);
void pgraph_gl_surface_update(NV2AState *d, bool upload, bool color_write, bool zeta_write);
void pgraph_gl_sync(NV2AState *d);
void pgraph_gl_update_entire_memory_buffer(NV2AState *d);
void pgraph_gl_init_display_renderer(NV2AState *d);
void pgraph_gl_init_reports(NV2AState *d);
void pgraph_gl_init_shader_cache(PGRAPHState *pg);
void pgraph_gl_init_surfaces(PGRAPHState *pg);
void pgraph_gl_init_texture_cache(NV2AState *d);
void pgraph_gl_init_vertex_cache(NV2AState *d);
void pgraph_gl_process_pending_downloads(NV2AState *d);
void pgraph_gl_reload_surface_scale_factor(PGRAPHState *pg);
void pgraph_gl_render_surface_to_texture(NV2AState *d, SurfaceBinding *surface, TextureBinding *texture, TextureShape *texture_shape, int texture_unit);
void pgraph_gl_set_surface_dirty(PGRAPHState *pg, bool color, bool zeta);
void pgraph_gl_surface_download_if_dirty(NV2AState *d, SurfaceBinding *surface);
SurfaceBinding *pgraph_gl_surface_get(NV2AState *d, hwaddr addr);
SurfaceBinding *pgraph_gl_surface_get_within(NV2AState *d, hwaddr addr);
void pgraph_gl_surface_invalidate(NV2AState *d, SurfaceBinding *e);
void pgraph_gl_unbind_surface(NV2AState *d, bool color);
void pgraph_gl_upload_surface_data(NV2AState *d, SurfaceBinding *surface, bool force);
void pgraph_gl_shader_cache_to_disk(ShaderLruNode *snode);
bool pgraph_gl_shader_load_from_memory(ShaderLruNode *snode);
void pgraph_gl_shader_write_cache_reload_list(PGRAPHState *pg);
void pgraph_gl_set_surface_scale_factor(NV2AState *d, unsigned int scale);
unsigned int pgraph_gl_get_surface_scale_factor(NV2AState *d);
int pgraph_gl_get_framebuffer_surface(NV2AState *d);
#endif

View File

@@ -0,0 +1,111 @@
/*
* Geforce NV2A PGRAPH OpenGL Renderer
*
* Copyright (c) 2012 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2018-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include <hw/xbox/nv2a/nv2a_int.h>
#include "renderer.h"
static void process_pending_report(NV2AState *d, QueryReport *report)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
if (report->clear) {
r->zpass_pixel_count_result = 0;
return;
}
uint8_t type = GET_MASK(report->parameter, NV097_GET_REPORT_TYPE);
assert(type == NV097_GET_REPORT_TYPE_ZPASS_PIXEL_CNT);
/* FIXME: Multisampling affects this (on both OpenGL and the Xbox GPU);
 * not sure if CLEARs also count.
 */
/* FIXME: What about clipping regions etc? */
for (int i = 0; i < report->query_count; i++) {
GLuint gl_query_result = 0;
glGetQueryObjectuiv(report->queries[i], GL_QUERY_RESULT, &gl_query_result);
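/* Undo supersampling: rendering at N x native scale multiplies the number
 * of passed samples by N^2. */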
gl_query_result /= pg->surface_scale_factor * pg->surface_scale_factor;
r->zpass_pixel_count_result += gl_query_result;
}
if (report->query_count) {
glDeleteQueries(report->query_count, report->queries);
g_free(report->queries);
}
pgraph_write_zpass_pixel_cnt_report(d, report->parameter, r->zpass_pixel_count_result);
}
void pgraph_gl_process_pending_reports(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
QueryReport *report, *next;
QSIMPLEQ_FOREACH_SAFE(report, &r->report_queue, entry, next) {
process_pending_report(d, report);
QSIMPLEQ_REMOVE_HEAD(&r->report_queue, entry);
g_free(report);
}
}
void pgraph_gl_clear_report_value(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
/* FIXME: Does this have a value in parameter? And does this also modify
 * the report memory block?
 */
if (r->gl_zpass_pixel_count_query_count) {
glDeleteQueries(r->gl_zpass_pixel_count_query_count,
r->gl_zpass_pixel_count_queries);
r->gl_zpass_pixel_count_query_count = 0;
}
QueryReport *report = g_malloc(sizeof(QueryReport));
report->clear = true;
QSIMPLEQ_INSERT_TAIL(&r->report_queue, report, entry);
}
void pgraph_gl_init_reports(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
QSIMPLEQ_INIT(&r->report_queue);
}
void pgraph_gl_get_report(NV2AState *d, uint32_t parameter)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
QueryReport *report = g_malloc(sizeof(QueryReport));
report->clear = false;
report->parameter = parameter;
report->query_count = r->gl_zpass_pixel_count_query_count;
report->queries = r->gl_zpass_pixel_count_queries;
QSIMPLEQ_INSERT_TAIL(&r->report_queue, report, entry);
r->gl_zpass_pixel_count_query_count = 0;
r->gl_zpass_pixel_count_queries = NULL;
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,819 @@
/*
* Geforce NV2A PGRAPH OpenGL Renderer
*
* Copyright (c) 2012 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2018-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "qemu/fast-hash.h"
#include "hw/xbox/nv2a/nv2a_int.h"
#include "hw/xbox/nv2a/pgraph/swizzle.h"
#include "hw/xbox/nv2a/pgraph/s3tc.h"
#include "hw/xbox/nv2a/pgraph/texture.h"
#include "debug.h"
#include "renderer.h"
static TextureBinding* generate_texture(const TextureShape s, const uint8_t *texture_data, const uint8_t *palette_data);
static void texture_binding_destroy(gpointer data);
struct pgraph_texture_possibly_dirty_struct {
hwaddr addr, end;
};
static void mark_textures_possibly_dirty_visitor(Lru *lru, LruNode *node, void *opaque)
{
struct pgraph_texture_possibly_dirty_struct *test =
(struct pgraph_texture_possibly_dirty_struct *)opaque;
struct TextureLruNode *tnode = container_of(node, TextureLruNode, node);
if (tnode->binding == NULL || tnode->possibly_dirty) {
return;
}
uintptr_t k_tex_addr = tnode->key.texture_vram_offset;
uintptr_t k_tex_end = k_tex_addr + tnode->key.texture_length - 1;
bool overlapping = !(test->addr > k_tex_end || k_tex_addr > test->end);
if (tnode->key.palette_length > 0) {
uintptr_t k_pal_addr = tnode->key.palette_vram_offset;
uintptr_t k_pal_end = k_pal_addr + tnode->key.palette_length - 1;
overlapping |= !(test->addr > k_pal_end || k_pal_addr > test->end);
}
tnode->possibly_dirty |= overlapping;
}
void pgraph_gl_mark_textures_possibly_dirty(NV2AState *d,
hwaddr addr, hwaddr size)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
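/* Dirty tracking is page-granular, so expand [addr, addr + size) to whole
 * target pages. */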
hwaddr end = TARGET_PAGE_ALIGN(addr + size) - 1;
addr &= TARGET_PAGE_MASK;
assert(end <= memory_region_size(d->vram));
struct pgraph_texture_possibly_dirty_struct test = {
.addr = addr,
.end = end,
};
lru_visit_active(&r->texture_cache,
mark_textures_possibly_dirty_visitor,
&test);
}
static bool check_texture_dirty(NV2AState *d, hwaddr addr, hwaddr size)
{
hwaddr end = TARGET_PAGE_ALIGN(addr + size);
addr &= TARGET_PAGE_MASK;
assert(end < memory_region_size(d->vram));
return memory_region_test_and_clear_dirty(d->vram, addr, end - addr,
DIRTY_MEMORY_NV2A_TEX);
}
// Check if any of the pages spanned by a texture are dirty.
static bool check_texture_possibly_dirty(NV2AState *d,
hwaddr texture_vram_offset,
unsigned int length,
hwaddr palette_vram_offset,
unsigned int palette_length)
{
bool possibly_dirty = false;
if (check_texture_dirty(d, texture_vram_offset, length)) {
possibly_dirty = true;
pgraph_gl_mark_textures_possibly_dirty(d, texture_vram_offset, length);
}
if (palette_length && check_texture_dirty(d, palette_vram_offset,
palette_length)) {
possibly_dirty = true;
pgraph_gl_mark_textures_possibly_dirty(d, palette_vram_offset,
palette_length);
}
return possibly_dirty;
}
static void apply_texture_parameters(TextureBinding *binding,
const BasicColorFormatInfo *f,
unsigned int dimensionality,
unsigned int filter,
unsigned int address,
bool is_bordered,
uint32_t border_color)
{
unsigned int min_filter = GET_MASK(filter, NV_PGRAPH_TEXFILTER0_MIN);
unsigned int mag_filter = GET_MASK(filter, NV_PGRAPH_TEXFILTER0_MAG);
unsigned int addru = GET_MASK(address, NV_PGRAPH_TEXADDRESS0_ADDRU);
unsigned int addrv = GET_MASK(address, NV_PGRAPH_TEXADDRESS0_ADDRV);
unsigned int addrp = GET_MASK(address, NV_PGRAPH_TEXADDRESS0_ADDRP);
if (f->linear) {
/* Sometimes games try to set mipmap min filters on linear textures;
 * this could indicate a bug... */
switch (min_filter) {
case NV_PGRAPH_TEXFILTER0_MIN_BOX_NEARESTLOD:
case NV_PGRAPH_TEXFILTER0_MIN_BOX_TENT_LOD:
min_filter = NV_PGRAPH_TEXFILTER0_MIN_BOX_LOD0;
break;
case NV_PGRAPH_TEXFILTER0_MIN_TENT_NEARESTLOD:
case NV_PGRAPH_TEXFILTER0_MIN_TENT_TENT_LOD:
min_filter = NV_PGRAPH_TEXFILTER0_MIN_TENT_LOD0;
break;
}
}
if (min_filter != binding->min_filter) {
glTexParameteri(binding->gl_target, GL_TEXTURE_MIN_FILTER,
pgraph_texture_min_filter_gl_map[min_filter]);
binding->min_filter = min_filter;
}
if (mag_filter != binding->mag_filter) {
glTexParameteri(binding->gl_target, GL_TEXTURE_MAG_FILTER,
pgraph_texture_mag_filter_gl_map[mag_filter]);
binding->mag_filter = mag_filter;
}
/* Texture wrapping */
assert(addru < ARRAY_SIZE(pgraph_texture_addr_gl_map));
if (addru != binding->addru) {
glTexParameteri(binding->gl_target, GL_TEXTURE_WRAP_S,
pgraph_texture_addr_gl_map[addru]);
binding->addru = addru;
}
bool needs_border_color = binding->addru == NV_PGRAPH_TEXADDRESS0_ADDRU_BORDER;
if (dimensionality > 1) {
if (addrv != binding->addrv) {
assert(addrv < ARRAY_SIZE(pgraph_texture_addr_gl_map));
glTexParameteri(binding->gl_target, GL_TEXTURE_WRAP_T,
pgraph_texture_addr_gl_map[addrv]);
binding->addrv = addrv;
}
needs_border_color = needs_border_color || binding->addrv == NV_PGRAPH_TEXADDRESS0_ADDRU_BORDER;
}
if (dimensionality > 2) {
if (addrp != binding->addrp) {
assert(addrp < ARRAY_SIZE(pgraph_texture_addr_gl_map));
glTexParameteri(binding->gl_target, GL_TEXTURE_WRAP_R,
pgraph_texture_addr_gl_map[addrp]);
binding->addrp = addrp;
}
needs_border_color = needs_border_color || binding->addrp == NV_PGRAPH_TEXADDRESS0_ADDRU_BORDER;
}
if (!is_bordered && needs_border_color) {
if (!binding->border_color_set || binding->border_color != border_color) {
/* FIXME: Color channels might be wrong order */
GLfloat gl_border_color[4];
pgraph_argb_pack32_to_rgba_float(border_color, gl_border_color);
glTexParameterfv(binding->gl_target, GL_TEXTURE_BORDER_COLOR,
gl_border_color);
binding->border_color_set = true;
binding->border_color = border_color;
}
}
}
void pgraph_gl_bind_textures(NV2AState *d)
{
int i;
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
NV2A_GL_DGROUP_BEGIN("%s", __func__);
for (i=0; i<NV2A_MAX_TEXTURES; i++) {
bool enabled = pgraph_is_texture_enabled(pg, i);
/* FIXME: What happens if texture is disabled but stage is active? */
glActiveTexture(GL_TEXTURE0 + i);
if (!enabled) {
glBindTexture(GL_TEXTURE_CUBE_MAP, 0);
glBindTexture(GL_TEXTURE_RECTANGLE, 0);
glBindTexture(GL_TEXTURE_1D, 0);
glBindTexture(GL_TEXTURE_2D, 0);
glBindTexture(GL_TEXTURE_3D, 0);
continue;
}
uint32_t filter = pgraph_reg_r(pg, NV_PGRAPH_TEXFILTER0 + i*4);
uint32_t address = pgraph_reg_r(pg, NV_PGRAPH_TEXADDRESS0 + i*4);
uint32_t border_color = pgraph_reg_r(pg, NV_PGRAPH_BORDERCOLOR0 + i*4);
/* Check for unsupported features */
if (filter & NV_PGRAPH_TEXFILTER0_ASIGNED) NV2A_UNIMPLEMENTED("NV_PGRAPH_TEXFILTER0_ASIGNED");
if (filter & NV_PGRAPH_TEXFILTER0_RSIGNED) NV2A_UNIMPLEMENTED("NV_PGRAPH_TEXFILTER0_RSIGNED");
if (filter & NV_PGRAPH_TEXFILTER0_GSIGNED) NV2A_UNIMPLEMENTED("NV_PGRAPH_TEXFILTER0_GSIGNED");
if (filter & NV_PGRAPH_TEXFILTER0_BSIGNED) NV2A_UNIMPLEMENTED("NV_PGRAPH_TEXFILTER0_BSIGNED");
TextureShape state = pgraph_get_texture_shape(pg, i);
hwaddr texture_vram_offset, palette_vram_offset;
size_t length, palette_length;
length = pgraph_get_texture_length(pg, &state);
texture_vram_offset = pgraph_get_texture_phys_addr(pg, i);
palette_vram_offset = pgraph_get_texture_palette_phys_addr_length(pg, i, &palette_length);
assert((texture_vram_offset + length) < memory_region_size(d->vram));
assert((palette_vram_offset + palette_length)
< memory_region_size(d->vram));
bool is_indexed = (state.color_format ==
NV097_SET_TEXTURE_FORMAT_COLOR_SZ_I8_A8R8G8B8);
bool possibly_dirty = false;
bool possibly_dirty_checked = false;
SurfaceBinding *surface = pgraph_gl_surface_get(d, texture_vram_offset);
TextureBinding *tbind = r->texture_binding[i];
if (!pg->texture_dirty[i] && tbind) {
bool reusable = false;
if (surface && tbind->draw_time == surface->draw_time) {
reusable = true;
} else if (!surface) {
possibly_dirty = check_texture_possibly_dirty(
d,
texture_vram_offset,
length,
palette_vram_offset,
is_indexed ? palette_length : 0);
possibly_dirty_checked = true;
reusable = !possibly_dirty;
}
if (reusable) {
glBindTexture(r->texture_binding[i]->gl_target,
r->texture_binding[i]->gl_texture);
apply_texture_parameters(r->texture_binding[i],
&kelvin_color_format_info_map[state.color_format],
state.dimensionality,
filter,
address,
state.border,
border_color);
continue;
}
}
/*
* Check active surfaces to see if this texture was a render target
*/
bool surf_to_tex = false;
if (surface != NULL) {
surf_to_tex = pgraph_gl_check_surface_to_texture_compatibility(
surface, &state);
if (surf_to_tex && surface->upload_pending) {
pgraph_gl_upload_surface_data(d, surface, false);
}
}
if (!surf_to_tex) {
// FIXME: Restructure to support rendering surfaces to cubemap faces
// Writeback any surfaces which this texture may index
hwaddr tex_vram_end = texture_vram_offset + length - 1;
QTAILQ_FOREACH(surface, &r->surfaces, entry) {
hwaddr surf_vram_end = surface->vram_addr + surface->size - 1;
bool overlapping = !(surface->vram_addr >= tex_vram_end
|| texture_vram_offset >= surf_vram_end);
if (overlapping) {
pgraph_gl_surface_download_if_dirty(d, surface);
}
}
}
TextureKey key;
memset(&key, 0, sizeof(TextureKey));
key.state = state;
key.texture_vram_offset = texture_vram_offset;
key.texture_length = length;
if (is_indexed) {
key.palette_vram_offset = palette_vram_offset;
key.palette_length = palette_length;
}
// Search for existing texture binding in cache
uint64_t tex_binding_hash = fast_hash((uint8_t*)&key, sizeof(key));
LruNode *found = lru_lookup(&r->texture_cache,
tex_binding_hash, &key);
TextureLruNode *key_out = container_of(found, TextureLruNode, node);
possibly_dirty |= (key_out->binding == NULL) || key_out->possibly_dirty;
if (!surf_to_tex && !possibly_dirty_checked) {
possibly_dirty |= check_texture_possibly_dirty(
d,
texture_vram_offset,
length,
palette_vram_offset,
is_indexed ? palette_length : 0);
}
// Calculate hash of texture data, if necessary
void *texture_data = (char*)d->vram_ptr + texture_vram_offset;
void *palette_data = (char*)d->vram_ptr + palette_vram_offset;
uint64_t tex_data_hash = 0;
if (!surf_to_tex && possibly_dirty) {
tex_data_hash = fast_hash(texture_data, length);
if (is_indexed) {
tex_data_hash ^= fast_hash(palette_data, palette_length);
}
}
// Free existing binding, if texture data has changed
bool must_destroy = (key_out->binding != NULL)
&& possibly_dirty
&& (key_out->binding->data_hash != tex_data_hash);
if (must_destroy) {
texture_binding_destroy(key_out->binding);
key_out->binding = NULL;
}
if (key_out->binding == NULL) {
// Must create the texture
key_out->binding = generate_texture(state, texture_data, palette_data);
key_out->binding->data_hash = tex_data_hash;
key_out->binding->scale = 1;
} else {
// Saved an upload! Reuse existing texture in graphics memory.
glBindTexture(key_out->binding->gl_target,
key_out->binding->gl_texture);
}
key_out->possibly_dirty = false;
TextureBinding *binding = key_out->binding;
binding->refcnt++;
if (surf_to_tex && binding->draw_time < surface->draw_time) {
trace_nv2a_pgraph_surface_render_to_texture(
surface->vram_addr, surface->width, surface->height);
pgraph_gl_render_surface_to_texture(d, surface, binding, &state, i);
binding->draw_time = surface->draw_time;
if (binding->gl_target == GL_TEXTURE_RECTANGLE) {
binding->scale = pg->surface_scale_factor;
} else {
binding->scale = 1;
}
}
apply_texture_parameters(binding,
&kelvin_color_format_info_map[state.color_format],
state.dimensionality,
filter,
address,
state.border,
border_color);
if (r->texture_binding[i]) {
if (r->texture_binding[i]->gl_target != binding->gl_target) {
glBindTexture(r->texture_binding[i]->gl_target, 0);
}
texture_binding_destroy(r->texture_binding[i]);
}
r->texture_binding[i] = binding;
pg->texture_dirty[i] = false;
}
NV2A_GL_DGROUP_END();
}
static enum S3TC_DECOMPRESS_FORMAT
gl_internal_format_to_s3tc_enum(GLint gl_internal_format)
{
switch (gl_internal_format) {
case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
return S3TC_DECOMPRESS_FORMAT_DXT1;
case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
return S3TC_DECOMPRESS_FORMAT_DXT3;
case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
return S3TC_DECOMPRESS_FORMAT_DXT5;
default:
assert(!"Invalid format");
}
}
static void upload_gl_texture(GLenum gl_target,
const TextureShape s,
const uint8_t *texture_data,
const uint8_t *palette_data)
{
ColorFormatInfo f = kelvin_color_format_gl_map[s.color_format];
nv2a_profile_inc_counter(NV2A_PROF_TEX_UPLOAD);
unsigned int adjusted_width = s.width;
unsigned int adjusted_height = s.height;
unsigned int adjusted_pitch = s.pitch;
unsigned int adjusted_depth = s.depth;
if (!f.linear && s.border) {
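/* Swizzled textures with borders store 4 border texels on each edge, so
 * the stored dimensions are double the logical size (minimum 16). */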
adjusted_width = MAX(16, adjusted_width * 2);
adjusted_height = MAX(16, adjusted_height * 2);
adjusted_pitch = adjusted_width * (s.pitch / s.width);
adjusted_depth = MAX(16, s.depth * 2);
}
switch(gl_target) {
case GL_TEXTURE_1D:
assert(false);
break;
case GL_TEXTURE_RECTANGLE: {
/* Can't handle strides unaligned to pixels */
assert(s.pitch % f.bytes_per_pixel == 0);
uint8_t *converted = pgraph_convert_texture_data(
s, texture_data, palette_data, adjusted_width, adjusted_height, 1,
adjusted_pitch, 0, NULL);
glPixelStorei(GL_UNPACK_ROW_LENGTH,
converted ? 0 : adjusted_pitch / f.bytes_per_pixel);
glTexImage2D(gl_target, 0, f.gl_internal_format,
adjusted_width, adjusted_height, 0,
f.gl_format, f.gl_type,
converted ? converted : texture_data);
if (converted) {
g_free(converted);
}
glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
break;
}
case GL_TEXTURE_2D:
case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z: {
unsigned int width = adjusted_width, height = adjusted_height;
int level;
for (level = 0; level < s.levels; level++) {
width = MAX(width, 1);
height = MAX(height, 1);
if (f.gl_format == 0) { /* compressed */
// https://docs.microsoft.com/en-us/windows/win32/direct3d10/d3d10-graphics-programming-guide-resources-block-compression#virtual-size-versus-physical-size
unsigned int block_size =
f.gl_internal_format == GL_COMPRESSED_RGBA_S3TC_DXT1_EXT ?
8 : 16;
unsigned int physical_width = (width + 3) & ~3,
physical_height = (height + 3) & ~3;
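/* e.g. a 2x2 DXT1 mip still occupies one full 4x4 block (8 bytes). */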
if (physical_width != width) {
glPixelStorei(GL_UNPACK_ROW_LENGTH, physical_width);
}
uint8_t *converted = s3tc_decompress_2d(
gl_internal_format_to_s3tc_enum(f.gl_internal_format),
texture_data, physical_width, physical_height);
unsigned int tex_width = width;
unsigned int tex_height = height;
if (s.cubemap && adjusted_width != s.width) {
// FIXME: Consider preserving the border.
// There does not seem to be a way to reference the border
// texels in a cubemap, so they are discarded.
glPixelStorei(GL_UNPACK_SKIP_PIXELS, 4);
glPixelStorei(GL_UNPACK_SKIP_ROWS, 4);
tex_width = s.width;
tex_height = s.height;
if (physical_width == width) {
glPixelStorei(GL_UNPACK_ROW_LENGTH, adjusted_width);
}
}
glTexImage2D(gl_target, level, GL_RGBA, tex_width, tex_height, 0,
GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, converted);
g_free(converted);
if (physical_width != width) {
glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
}
if (s.cubemap && adjusted_width != s.width) {
glPixelStorei(GL_UNPACK_SKIP_PIXELS, 0);
glPixelStorei(GL_UNPACK_SKIP_ROWS, 0);
if (physical_width == width) {
glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
}
}
texture_data +=
physical_width / 4 * physical_height / 4 * block_size;
} else {
unsigned int pitch = width * f.bytes_per_pixel;
uint8_t *unswizzled = (uint8_t*)g_malloc(height * pitch);
unswizzle_rect(texture_data, width, height,
unswizzled, pitch, f.bytes_per_pixel);
uint8_t *converted = pgraph_convert_texture_data(
s, unswizzled, palette_data, width, height, 1, pitch, 0,
NULL);
uint8_t *pixel_data = converted ? converted : unswizzled;
unsigned int tex_width = width;
unsigned int tex_height = height;
if (s.cubemap && adjusted_width != s.width) {
// FIXME: Consider preserving the border.
// There does not seem to be a way to reference the border
// texels in a cubemap, so they are discarded.
glPixelStorei(GL_UNPACK_ROW_LENGTH, adjusted_width);
tex_width = s.width;
tex_height = s.height;
pixel_data += 4 * f.bytes_per_pixel + 4 * pitch;
}
glTexImage2D(gl_target, level, f.gl_internal_format, tex_width,
tex_height, 0, f.gl_format, f.gl_type,
pixel_data);
if (s.cubemap && s.border) {
glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
}
if (converted) {
g_free(converted);
}
g_free(unswizzled);
texture_data += width * height * f.bytes_per_pixel;
}
width /= 2;
height /= 2;
}
break;
}
case GL_TEXTURE_3D: {
unsigned int width = adjusted_width;
unsigned int height = adjusted_height;
unsigned int depth = adjusted_depth;
assert(f.linear == false);
int level;
for (level = 0; level < s.levels; level++) {
if (f.gl_format == 0) { /* compressed */
assert(width % 4 == 0 && height % 4 == 0 &&
"Compressed 3D texture virtual size");
width = MAX(width, 4);
height = MAX(height, 4);
depth = MAX(depth, 1);
unsigned int block_size;
if (f.gl_internal_format == GL_COMPRESSED_RGBA_S3TC_DXT1_EXT) {
block_size = 8;
} else {
block_size = 16;
}
size_t texture_size = width/4 * height/4 * depth * block_size;
uint8_t *converted = s3tc_decompress_3d(
gl_internal_format_to_s3tc_enum(f.gl_internal_format),
texture_data, width, height, depth);
glTexImage3D(gl_target, level, GL_RGBA8,
width, height, depth, 0,
GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV,
converted);
g_free(converted);
texture_data += texture_size;
} else {
width = MAX(width, 1);
height = MAX(height, 1);
depth = MAX(depth, 1);
unsigned int row_pitch = width * f.bytes_per_pixel;
unsigned int slice_pitch = row_pitch * height;
uint8_t *unswizzled = (uint8_t*)g_malloc(slice_pitch * depth);
unswizzle_box(texture_data, width, height, depth, unswizzled,
row_pitch, slice_pitch, f.bytes_per_pixel);
uint8_t *converted = pgraph_convert_texture_data(
s, unswizzled, palette_data, width, height, depth,
row_pitch, slice_pitch, NULL);
glTexImage3D(gl_target, level, f.gl_internal_format,
width, height, depth, 0,
f.gl_format, f.gl_type,
converted ? converted : unswizzled);
if (converted) {
g_free(converted);
}
g_free(unswizzled);
texture_data += width * height * depth * f.bytes_per_pixel;
}
width /= 2;
height /= 2;
depth /= 2;
}
break;
}
default:
assert(false);
break;
}
}
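The compressed branch above relies on the block-compression rule referenced in the Microsoft link: each mip level is stored at its physical (block-aligned) size, not its virtual size. A minimal sketch of that computation, assuming 8-byte DXT1 blocks and 16-byte DXT3/5 blocks (the helper name is hypothetical, not part of this code):
static size_t compressed_level_bytes(unsigned int width, unsigned int height,
                                     bool is_dxt1)
{
    unsigned int block_size = is_dxt1 ? 8 : 16;   /* bytes per 4x4 block */
    unsigned int physical_width = (width + 3) & ~3;
    unsigned int physical_height = (height + 3) & ~3;
    /* e.g. a 2x2 DXT1 level still occupies one full 8-byte block */
    return (size_t)(physical_width / 4) * (physical_height / 4) * block_size;
}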
static TextureBinding* generate_texture(const TextureShape s,
const uint8_t *texture_data,
const uint8_t *palette_data)
{
ColorFormatInfo f = kelvin_color_format_gl_map[s.color_format];
/* Create a new opengl texture */
GLuint gl_texture;
glGenTextures(1, &gl_texture);
GLenum gl_target;
if (s.cubemap) {
assert(f.linear == false);
assert(s.dimensionality == 2);
gl_target = GL_TEXTURE_CUBE_MAP;
} else {
if (f.linear) {
/* linear textures use unnormalised texcoords.
* GL_TEXTURE_RECTANGLE_ARB conveniently also does, but
* does not allow repeat and mirror wrap modes.
* (or mipmapping, but xbox d3d says 'Non swizzled and non
* compressed textures cannot be mip mapped.')
* Not sure if that'll be an issue. */
/* FIXME: GLSL 330 provides us with textureSize()! Use that? */
gl_target = GL_TEXTURE_RECTANGLE;
assert(s.dimensionality == 2);
} else {
switch(s.dimensionality) {
case 1: gl_target = GL_TEXTURE_1D; break;
case 2: gl_target = GL_TEXTURE_2D; break;
case 3: gl_target = GL_TEXTURE_3D; break;
default:
assert(false);
break;
}
}
}
glBindTexture(gl_target, gl_texture);
NV2A_GL_DLABEL(GL_TEXTURE, gl_texture,
"offset: 0x%08lx, format: 0x%02X%s, %d dimensions%s, "
"width: %d, height: %d, depth: %d",
texture_data - g_nv2a->vram_ptr,
s.color_format, f.linear ? "" : " (SZ)",
s.dimensionality, s.cubemap ? " (Cubemap)" : "",
s.width, s.height, s.depth);
if (gl_target == GL_TEXTURE_CUBE_MAP) {
ColorFormatInfo f = kelvin_color_format_gl_map[s.color_format];
unsigned int block_size;
if (f.gl_internal_format == GL_COMPRESSED_RGBA_S3TC_DXT1_EXT) {
block_size = 8;
} else {
block_size = 16;
}
size_t length = 0;
unsigned int w = s.width;
unsigned int h = s.height;
if (!f.linear && s.border) {
w = MAX(16, w * 2);
h = MAX(16, h * 2);
}
int level;
for (level = 0; level < s.levels; level++) {
if (f.gl_format == 0) {
length += w/4 * h/4 * block_size;
} else {
length += w * h * f.bytes_per_pixel;
}
w /= 2;
h /= 2;
}
length = (length + NV2A_CUBEMAP_FACE_ALIGNMENT - 1) & ~(NV2A_CUBEMAP_FACE_ALIGNMENT - 1);
upload_gl_texture(GL_TEXTURE_CUBE_MAP_POSITIVE_X,
s, texture_data + 0 * length, palette_data);
upload_gl_texture(GL_TEXTURE_CUBE_MAP_NEGATIVE_X,
s, texture_data + 1 * length, palette_data);
upload_gl_texture(GL_TEXTURE_CUBE_MAP_POSITIVE_Y,
s, texture_data + 2 * length, palette_data);
upload_gl_texture(GL_TEXTURE_CUBE_MAP_NEGATIVE_Y,
s, texture_data + 3 * length, palette_data);
upload_gl_texture(GL_TEXTURE_CUBE_MAP_POSITIVE_Z,
s, texture_data + 4 * length, palette_data);
upload_gl_texture(GL_TEXTURE_CUBE_MAP_NEGATIVE_Z,
s, texture_data + 5 * length, palette_data);
} else {
upload_gl_texture(gl_target, s, texture_data, palette_data);
}
/* Linear textures don't support mipmapping */
if (!f.linear) {
glTexParameteri(gl_target, GL_TEXTURE_BASE_LEVEL,
s.min_mipmap_level);
glTexParameteri(gl_target, GL_TEXTURE_MAX_LEVEL,
s.levels - 1);
}
if (f.gl_swizzle_mask[0] != 0 || f.gl_swizzle_mask[1] != 0
|| f.gl_swizzle_mask[2] != 0 || f.gl_swizzle_mask[3] != 0) {
glTexParameteriv(gl_target, GL_TEXTURE_SWIZZLE_RGBA,
(const GLint *)f.gl_swizzle_mask);
}
TextureBinding* ret = (TextureBinding *)g_malloc(sizeof(TextureBinding));
ret->gl_target = gl_target;
ret->gl_texture = gl_texture;
ret->refcnt = 1;
ret->draw_time = 0;
ret->data_hash = 0;
ret->min_filter = 0xFFFFFFFF;
ret->mag_filter = 0xFFFFFFFF;
ret->addru = 0xFFFFFFFF;
ret->addrv = 0xFFFFFFFF;
ret->addrp = 0xFFFFFFFF;
ret->border_color_set = false;
return ret;
}
static void texture_binding_destroy(gpointer data)
{
TextureBinding *binding = (TextureBinding *)data;
assert(binding->refcnt > 0);
binding->refcnt--;
if (binding->refcnt == 0) {
glDeleteTextures(1, &binding->gl_texture);
g_free(binding);
}
}
/* functions for texture LRU cache */
static void texture_cache_entry_init(Lru *lru, LruNode *node, void *key)
{
TextureLruNode *tnode = container_of(node, TextureLruNode, node);
memcpy(&tnode->key, key, sizeof(TextureKey));
tnode->binding = NULL;
tnode->possibly_dirty = false;
}
static void texture_cache_entry_post_evict(Lru *lru, LruNode *node)
{
TextureLruNode *tnode = container_of(node, TextureLruNode, node);
if (tnode->binding) {
texture_binding_destroy(tnode->binding);
tnode->binding = NULL;
tnode->possibly_dirty = false;
}
}
static bool texture_cache_entry_compare(Lru *lru, LruNode *node, void *key)
{
TextureLruNode *tnode = container_of(node, TextureLruNode, node);
return memcmp(&tnode->key, key, sizeof(TextureKey));
}
void pgraph_gl_init_texture_cache(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
const size_t texture_cache_size = 512;
lru_init(&r->texture_cache);
r->texture_cache_entries = malloc(texture_cache_size * sizeof(TextureLruNode));
assert(r->texture_cache_entries != NULL);
for (int i = 0; i < texture_cache_size; i++) {
lru_add_free(&r->texture_cache, &r->texture_cache_entries[i].node);
}
r->texture_cache.init_node = texture_cache_entry_init;
r->texture_cache.compare_nodes = texture_cache_entry_compare;
r->texture_cache.post_node_evict = texture_cache_entry_post_evict;
}
void pgraph_gl_deinit_texture_cache(PGRAPHState *pg)
{
PGRAPHGLState *r = pg->gl_renderer_state;
// Clear out texture cache
lru_flush(&r->texture_cache);
free(r->texture_cache_entries);
}
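For context, a fetch against this cache might look like the sketch below. It assumes util/lru.h exposes an lru_lookup(lru, hash, key) that calls compare_nodes and, on a miss, recycles the least-recently-used entry (running post_node_evict and init_node); the helper name is hypothetical:
static TextureBinding *texture_cache_get(PGRAPHGLState *r, TextureKey *key,
                                         uint64_t hash)
{
    LruNode *node = lru_lookup(&r->texture_cache, hash, key);
    TextureLruNode *tnode = container_of(node, TextureLruNode, node);
    /* binding == NULL means init_node just ran: caller must upload */
    return tnode->binding;
}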

View File

@ -0,0 +1,283 @@
/*
* Geforce NV2A PGRAPH OpenGL Renderer
*
* Copyright (c) 2012 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2018-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "hw/xbox/nv2a/nv2a_regs.h"
#include <hw/xbox/nv2a/nv2a_int.h>
#include "debug.h"
#include "renderer.h"
static void update_memory_buffer(NV2AState *d, hwaddr addr, hwaddr size,
bool quick)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
glBindBuffer(GL_ARRAY_BUFFER, r->gl_memory_buffer);
hwaddr end = TARGET_PAGE_ALIGN(addr + size);
addr &= TARGET_PAGE_MASK;
assert(end < memory_region_size(d->vram));
static hwaddr last_addr, last_end;
if (quick && (addr >= last_addr) && (end <= last_end)) {
return;
}
last_addr = addr;
last_end = end;
size = end - addr;
if (memory_region_test_and_clear_dirty(d->vram, addr, size,
DIRTY_MEMORY_NV2A)) {
glBufferSubData(GL_ARRAY_BUFFER, addr, size,
d->vram_ptr + addr);
nv2a_profile_inc_counter(NV2A_PROF_GEOM_BUFFER_UPDATE_1);
}
}
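To make the alignment above concrete, a worked example assuming a 4 KiB target page size (TARGET_PAGE_SIZE is target-dependent):
/*
 * addr = 0x12345, size = 0x10:
 *   end   = TARGET_PAGE_ALIGN(0x12355) = 0x13000
 *   addr &= TARGET_PAGE_MASK           = 0x12000
 *   size  = end - addr                 = 0x1000
 * so the dirty test and glBufferSubData cover the whole page
 * [0x12000, 0x13000).
 */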
void pgraph_gl_update_entire_memory_buffer(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
glBindBuffer(GL_ARRAY_BUFFER, r->gl_memory_buffer);
glBufferSubData(GL_ARRAY_BUFFER, 0, memory_region_size(d->vram), d->vram_ptr);
}
void pgraph_gl_bind_vertex_attributes(NV2AState *d, unsigned int min_element,
unsigned int max_element, bool inline_data,
unsigned int inline_stride,
unsigned int provoking_element)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
bool updated_memory_buffer = false;
unsigned int num_elements = max_element - min_element + 1;
if (inline_data) {
NV2A_GL_DGROUP_BEGIN("%s (num_elements: %d inline stride: %d)",
__func__, num_elements, inline_stride);
} else {
NV2A_GL_DGROUP_BEGIN("%s (num_elements: %d)", __func__, num_elements);
}
pg->compressed_attrs = 0;
for (int i = 0; i < NV2A_VERTEXSHADER_ATTRIBUTES; i++) {
VertexAttribute *attr = &pg->vertex_attributes[i];
if (!attr->count) {
glDisableVertexAttribArray(i);
glVertexAttrib4fv(i, attr->inline_value);
continue;
}
NV2A_DPRINTF("vertex data array format=%d, count=%d, stride=%d\n",
attr->format, attr->count, attr->stride);
GLint gl_count = attr->count;
GLenum gl_type;
GLboolean gl_normalize;
bool needs_conversion = false;
switch (attr->format) {
case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_UB_D3D:
gl_type = GL_UNSIGNED_BYTE;
gl_normalize = GL_TRUE;
// http://www.opengl.org/registry/specs/ARB/vertex_array_bgra.txt
gl_count = GL_BGRA;
break;
case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_UB_OGL:
gl_type = GL_UNSIGNED_BYTE;
gl_normalize = GL_TRUE;
break;
case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_S1:
gl_type = GL_SHORT;
gl_normalize = GL_TRUE;
break;
case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_F:
gl_type = GL_FLOAT;
gl_normalize = GL_FALSE;
break;
case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_S32K:
gl_type = GL_SHORT;
gl_normalize = GL_FALSE;
break;
case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_CMP:
/* 3 signed, normalized components packed in 32-bits. (11,11,10) */
gl_type = GL_INT;
assert(attr->count == 1);
needs_conversion = true;
break;
default:
fprintf(stderr, "Unknown vertex type: 0x%x\n", attr->format);
assert(false);
break;
}
nv2a_profile_inc_counter(NV2A_PROF_ATTR_BIND);
hwaddr attrib_data_addr;
size_t stride;
if (needs_conversion) {
pg->compressed_attrs |= (1 << i);
}
hwaddr start = 0;
if (inline_data) {
glBindBuffer(GL_ARRAY_BUFFER, r->gl_inline_array_buffer);
attrib_data_addr = attr->inline_array_offset;
stride = inline_stride;
} else {
hwaddr dma_len;
uint8_t *attr_data = (uint8_t *)nv_dma_map(
d, attr->dma_select ? pg->dma_vertex_b : pg->dma_vertex_a,
&dma_len);
assert(attr->offset < dma_len);
attrib_data_addr = attr_data + attr->offset - d->vram_ptr;
stride = attr->stride;
start = attrib_data_addr + min_element * stride;
update_memory_buffer(d, start, num_elements * stride,
updated_memory_buffer);
updated_memory_buffer = true;
}
uint32_t provoking_element_index = provoking_element - min_element;
size_t element_size = attr->size * attr->count;
assert(element_size <= sizeof(attr->inline_value));
const uint8_t *last_entry;
if (inline_data) {
last_entry = (uint8_t*)pg->inline_array + attr->inline_array_offset;
} else {
last_entry = d->vram_ptr + start;
}
if (!stride) {
// Stride of 0 indicates that only the first element should be
// used.
pgraph_update_inline_value(attr, last_entry);
glDisableVertexAttribArray(i);
glVertexAttrib4fv(i, attr->inline_value);
continue;
}
if (needs_conversion) {
glVertexAttribIPointer(i, gl_count, gl_type, stride,
(void *)attrib_data_addr);
} else {
glVertexAttribPointer(i, gl_count, gl_type, gl_normalize, stride,
(void *)attrib_data_addr);
}
glEnableVertexAttribArray(i);
last_entry += stride * provoking_element_index;
pgraph_update_inline_value(attr, last_entry);
}
NV2A_GL_DGROUP_END();
}
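The CMP case above leaves the packed value untouched and lets the vertex shader unpack it (see decompress_11_11_10 in the GLSL further below). An equivalent CPU-side decode, shown only to document the (11, 11, 10) layout (hypothetical helper, unused by this code):
static void decode_cmp_11_11_10(uint32_t cmp, float out[4])
{
    /* sign-extend each field by shifting it up to bit 31, then back down */
    int32_t x = ((int32_t)(cmp << 21)) >> 21; /* bits  0-10 */
    int32_t y = ((int32_t)(cmp << 10)) >> 21; /* bits 11-21 */
    int32_t z = ((int32_t)cmp) >> 22;         /* bits 22-31 */
    out[0] = x / 1023.0f;
    out[1] = y / 1023.0f;
    out[2] = z / 511.0f;
    out[3] = 1.0f;
}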
unsigned int pgraph_gl_bind_inline_array(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
unsigned int offset = 0;
for (int i = 0; i < NV2A_VERTEXSHADER_ATTRIBUTES; i++) {
VertexAttribute *attr = &pg->vertex_attributes[i];
if (attr->count == 0) {
continue;
}
/* FIXME: Double check */
offset = ROUND_UP(offset, attr->size);
attr->inline_array_offset = offset;
NV2A_DPRINTF("bind inline attribute %d size=%d, count=%d\n",
i, attr->size, attr->count);
offset += attr->size * attr->count;
offset = ROUND_UP(offset, attr->size);
}
unsigned int vertex_size = offset;
unsigned int index_count = pg->inline_array_length*4 / vertex_size;
NV2A_DPRINTF("draw inline array %d, %d\n", vertex_size, index_count);
nv2a_profile_inc_counter(NV2A_PROF_GEOM_BUFFER_UPDATE_2);
glBindBuffer(GL_ARRAY_BUFFER, r->gl_inline_array_buffer);
glBufferData(GL_ARRAY_BUFFER, NV2A_MAX_BATCH_LENGTH * sizeof(uint32_t),
NULL, GL_STREAM_DRAW);
glBufferSubData(GL_ARRAY_BUFFER, 0, index_count * vertex_size, pg->inline_array);
pgraph_gl_bind_vertex_attributes(d, 0, index_count-1, true, vertex_size,
index_count-1);
return index_count;
}
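As a worked example of the packing above, take a hypothetical draw with two enabled attributes, a 3-float position (size 4, count 3) and a 4-ubyte color (size 1, count 4):
/*
 * position: offset = ROUND_UP(0, 4)  = 0;  next = 0 + 4*3 = 12
 * color:    offset = ROUND_UP(12, 1) = 12; next = 12 + 1*4 = 16
 * vertex_size = 16, so an inline_array_length of 24 words (96 bytes)
 * yields index_count = 96 / 16 = 6 vertices.
 */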
static void vertex_cache_entry_init(Lru *lru, LruNode *node, void *key)
{
VertexLruNode *vnode = container_of(node, VertexLruNode, node);
memcpy(&vnode->key, key, sizeof(struct VertexKey));
vnode->initialized = false;
}
static bool vertex_cache_entry_compare(Lru *lru, LruNode *node, void *key)
{
VertexLruNode *vnode = container_of(node, VertexLruNode, node);
return memcmp(&vnode->key, key, sizeof(VertexKey));
}
void pgraph_gl_init_vertex_cache(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHGLState *r = pg->gl_renderer_state;
const size_t element_cache_size = 50*1024;
lru_init(&r->element_cache);
r->element_cache_entries = malloc(element_cache_size * sizeof(VertexLruNode));
assert(r->element_cache_entries != NULL);
GLuint element_cache_buffers[element_cache_size];
glGenBuffers(element_cache_size, element_cache_buffers);
for (int i = 0; i < element_cache_size; i++) {
r->element_cache_entries[i].gl_buffer = element_cache_buffers[i];
lru_add_free(&r->element_cache, &r->element_cache_entries[i].node);
}
r->element_cache.init_node = vertex_cache_entry_init;
r->element_cache.compare_nodes = vertex_cache_entry_compare;
GLint max_vertex_attributes;
glGetIntegerv(GL_MAX_VERTEX_ATTRIBS, &max_vertex_attributes);
assert(max_vertex_attributes >= NV2A_VERTEXSHADER_ATTRIBUTES);
glGenBuffers(NV2A_VERTEXSHADER_ATTRIBUTES, r->gl_inline_buffer);
glGenBuffers(1, &r->gl_inline_array_buffer);
glGenBuffers(1, &r->gl_memory_buffer);
glBindBuffer(GL_ARRAY_BUFFER, r->gl_memory_buffer);
glBufferData(GL_ARRAY_BUFFER, memory_region_size(d->vram),
NULL, GL_DYNAMIC_DRAW);
glGenVertexArrays(1, &r->gl_vertex_array);
glBindVertexArray(r->gl_vertex_array);
assert(glGetError() == GL_NO_ERROR);
}

View File

@ -0,0 +1,58 @@
/*
* Geforce NV2A PGRAPH GLSL Shader Generator
*
* Copyright (c) 2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "common.h"
MString *pgraph_get_glsl_vtx_header(MString *out, bool location, bool smooth, bool in, bool prefix, bool array)
{
const char *flat_s = "flat";
const char *noperspective_s = "noperspective";
const char *qualifier_s = smooth ? noperspective_s : flat_s;
const char *qualifiers[11] = {
noperspective_s, flat_s, qualifier_s, qualifier_s,
qualifier_s, qualifier_s, noperspective_s, noperspective_s,
noperspective_s, noperspective_s, noperspective_s
};
const char *in_out_s = in ? "in" : "out";
const char *float_s = "float";
const char *vec4_s = "vec4";
const char *types[11] = { float_s, float_s, vec4_s, vec4_s, vec4_s, vec4_s,
float_s, vec4_s, vec4_s, vec4_s, vec4_s };
const char *prefix_s = prefix ? "v_" : "";
const char *names[11] = {
"vtx_inv_w", "vtx_inv_w_flat", "vtxD0", "vtxD1", "vtxB0", "vtxB1",
"vtxFog", "vtxT0", "vtxT1", "vtxT2", "vtxT3",
};
const char *suffix_s = array ? "[]" : "";
for (int i = 0; i < 11; i++) {
if (location) {
mstring_append_fmt(out, "layout(location = %d) ", i);
}
mstring_append_fmt(out, "%s %s %s %s%s%s;\n",
qualifiers[i], in_out_s, types[i], prefix_s, names[i], suffix_s);
}
return out;
}
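For reference, with location=true, smooth=false, in=true, prefix=true, array=true (the configuration used for a geometry shader's inputs), the loop above emits:
/*
 * layout(location = 0) noperspective in float v_vtx_inv_w[];
 * layout(location = 1) flat in float v_vtx_inv_w_flat[];
 * layout(location = 2) flat in vec4 v_vtxD0[];
 * ...
 */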

View File

@ -0,0 +1,38 @@
/*
* Geforce NV2A PGRAPH GLSL Shader Generator
*
* Copyright (c) 2015 espes
* Copyright (c) 2015 Jannik Vogel
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HW_XBOX_NV2A_PGRAPH_GLSL_COMMON_H
#define HW_XBOX_NV2A_PGRAPH_GLSL_COMMON_H
#include "qemu/mstring.h"
#include <stdbool.h>
#define GLSL_C(idx) "c[" stringify(idx) "]"
#define GLSL_LTCTXA(idx) "ltctxa[" stringify(idx) "]"
#define GLSL_C_MAT4(idx) \
"mat4(" GLSL_C(idx) ", " GLSL_C(idx+1) ", " \
GLSL_C(idx+2) ", " GLSL_C(idx+3) ")"
#define GLSL_DEFINE(a, b) "#define " stringify(a) " " b "\n"
MString *pgraph_get_glsl_vtx_header(MString *out, bool location, bool smooth, bool in, bool prefix, bool array);
#endif
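Because QEMU's stringify() expands its argument before stringizing it, these macros bake the numeric register indices straight into the generated source. For example, assuming NV_IGRAPH_XF_XFCTX_CMAT0 were 4 (hypothetical value):
/*
 * GLSL_DEFINE(compositeMat, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_CMAT0))
 * would emit the GLSL preamble line:
 *   #define compositeMat mat4(c[4], c[4+1], c[4+2], c[4+3])
 */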

View File

@ -0,0 +1,228 @@
/*
* Geforce NV2A PGRAPH GLSL Shader Generator
*
* Copyright (c) 2015 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2020-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "hw/xbox/nv2a/pgraph/shaders.h"
#include "common.h"
#include "geom.h"
MString *pgraph_gen_geom_glsl(enum ShaderPolygonMode polygon_front_mode,
enum ShaderPolygonMode polygon_back_mode,
enum ShaderPrimitiveMode primitive_mode,
bool smooth_shading,
bool vulkan)
{
/* FIXME: Missing support for 2-sided-poly mode */
assert(polygon_front_mode == polygon_back_mode);
enum ShaderPolygonMode polygon_mode = polygon_front_mode;
/* POINT mode shouldn't require any special work */
if (polygon_mode == POLY_MODE_POINT) {
return NULL;
}
/* Handle LINE and FILL mode */
const char *layout_in = NULL;
const char *layout_out = NULL;
const char *body = NULL;
switch (primitive_mode) {
case PRIM_TYPE_POINTS: return NULL;
case PRIM_TYPE_LINES: return NULL;
case PRIM_TYPE_LINE_LOOP: return NULL;
case PRIM_TYPE_LINE_STRIP: return NULL;
case PRIM_TYPE_TRIANGLES:
if (polygon_mode == POLY_MODE_FILL) { return NULL; }
assert(polygon_mode == POLY_MODE_LINE);
layout_in = "layout(triangles) in;\n";
layout_out = "layout(line_strip, max_vertices = 4) out;\n";
body = " emit_vertex(0, 0);\n"
" emit_vertex(1, 0);\n"
" emit_vertex(2, 0);\n"
" emit_vertex(0, 0);\n"
" EndPrimitive();\n";
break;
case PRIM_TYPE_TRIANGLE_STRIP:
if (polygon_mode == POLY_MODE_FILL) { return NULL; }
assert(polygon_mode == POLY_MODE_LINE);
layout_in = "layout(triangles) in;\n";
layout_out = "layout(line_strip, max_vertices = 4) out;\n";
/* Imagine a quad built from a triangle strip; the comments indicate
* which corner of the quad each emitted vertex corresponds to. */
body = " if ((gl_PrimitiveIDIn & 1) == 0) {\n"
" if (gl_PrimitiveIDIn == 0) {\n"
" emit_vertex(0, 0);\n" /* bottom right */
" }\n"
" emit_vertex(1, 0);\n" /* top right */
" emit_vertex(2, 0);\n" /* bottom left */
" emit_vertex(0, 0);\n" /* bottom right */
" } else {\n"
" emit_vertex(2, 0);\n" /* bottom left */
" emit_vertex(1, 0);\n" /* top left */
" emit_vertex(0, 0);\n" /* top right */
" }\n"
" EndPrimitive();\n";
break;
case PRIM_TYPE_TRIANGLE_FAN:
if (polygon_mode == POLY_MODE_FILL) { return NULL; }
assert(polygon_mode == POLY_MODE_LINE);
layout_in = "layout(triangles) in;\n";
layout_out = "layout(line_strip, max_vertices = 4) out;\n";
body = " if (gl_PrimitiveIDIn == 0) {\n"
" emit_vertex(0, 0);\n"
" }\n"
" emit_vertex(1, 0);\n"
" emit_vertex(2, 0);\n"
" emit_vertex(0, 0);\n"
" EndPrimitive();\n";
break;
case PRIM_TYPE_QUADS:
layout_in = "layout(lines_adjacency) in;\n";
if (polygon_mode == POLY_MODE_LINE) {
layout_out = "layout(line_strip, max_vertices = 5) out;\n";
body = " emit_vertex(0, 3);\n"
" emit_vertex(1, 3);\n"
" emit_vertex(2, 3);\n"
" emit_vertex(3, 3);\n"
" emit_vertex(0, 3);\n"
" EndPrimitive();\n";
} else if (polygon_mode == POLY_MODE_FILL) {
layout_out = "layout(triangle_strip, max_vertices = 4) out;\n";
body = " emit_vertex(3, 3);\n"
" emit_vertex(0, 3);\n"
" emit_vertex(2, 3);\n"
" emit_vertex(1, 3);\n"
" EndPrimitive();\n";
} else {
assert(false);
return NULL;
}
break;
case PRIM_TYPE_QUAD_STRIP:
layout_in = "layout(lines_adjacency) in;\n";
if (polygon_mode == POLY_MODE_LINE) {
layout_out = "layout(line_strip, max_vertices = 5) out;\n";
body = " if ((gl_PrimitiveIDIn & 1) != 0) { return; }\n"
" if (gl_PrimitiveIDIn == 0) {\n"
" emit_vertex(0, 3);\n"
" }\n"
" emit_vertex(1, 3);\n"
" emit_vertex(3, 3);\n"
" emit_vertex(2, 3);\n"
" emit_vertex(0, 3);\n"
" EndPrimitive();\n";
} else if (polygon_mode == POLY_MODE_FILL) {
layout_out = "layout(triangle_strip, max_vertices = 4) out;\n";
body = " if ((gl_PrimitiveIDIn & 1) != 0) { return; }\n"
" emit_vertex(0, 3);\n"
" emit_vertex(1, 3);\n"
" emit_vertex(2, 3);\n"
" emit_vertex(3, 3);\n"
" EndPrimitive();\n";
} else {
assert(false);
return NULL;
}
break;
case PRIM_TYPE_POLYGON:
if (polygon_mode == POLY_MODE_LINE) {
return NULL;
}
if (polygon_mode == POLY_MODE_FILL) {
if (smooth_shading) {
return NULL;
}
layout_in = "layout(triangles) in;\n";
layout_out = "layout(triangle_strip, max_vertices = 3) out;\n";
body = " emit_vertex(0, 2);\n"
" emit_vertex(1, 2);\n"
" emit_vertex(2, 2);\n"
" EndPrimitive();\n";
} else {
assert(false);
return NULL;
}
break;
default:
assert(false);
return NULL;
}
/* generate a geometry shader to support deprecated primitive types */
assert(layout_in);
assert(layout_out);
assert(body);
MString *s = mstring_new();
mstring_append_fmt(s, "#version %d\n\n", vulkan ? 450 : 400);
mstring_append(s, layout_in);
mstring_append(s, layout_out);
mstring_append(s, "\n");
pgraph_get_glsl_vtx_header(s, vulkan, smooth_shading, true, true, true);
pgraph_get_glsl_vtx_header(s, vulkan, smooth_shading, false, false, false);
if (smooth_shading) {
mstring_append(s,
"void emit_vertex(int index, int _unused) {\n"
" gl_Position = gl_in[index].gl_Position;\n"
" gl_PointSize = gl_in[index].gl_PointSize;\n"
// " gl_ClipDistance[0] = gl_in[index].gl_ClipDistance[0];\n"
// " gl_ClipDistance[1] = gl_in[index].gl_ClipDistance[1];\n"
" vtx_inv_w = v_vtx_inv_w[index];\n"
" vtx_inv_w_flat = v_vtx_inv_w[index];\n"
" vtxD0 = v_vtxD0[index];\n"
" vtxD1 = v_vtxD1[index];\n"
" vtxB0 = v_vtxB0[index];\n"
" vtxB1 = v_vtxB1[index];\n"
" vtxFog = v_vtxFog[index];\n"
" vtxT0 = v_vtxT0[index];\n"
" vtxT1 = v_vtxT1[index];\n"
" vtxT2 = v_vtxT2[index];\n"
" vtxT3 = v_vtxT3[index];\n"
" EmitVertex();\n"
"}\n");
} else {
mstring_append(s,
"void emit_vertex(int index, int provoking_index) {\n"
" gl_Position = gl_in[index].gl_Position;\n"
" gl_PointSize = gl_in[index].gl_PointSize;\n"
// " gl_ClipDistance[0] = gl_in[index].gl_ClipDistance[0];\n"
// " gl_ClipDistance[1] = gl_in[index].gl_ClipDistance[1];\n"
" vtx_inv_w = v_vtx_inv_w[index];\n"
" vtx_inv_w_flat = v_vtx_inv_w[provoking_index];\n"
" vtxD0 = v_vtxD0[provoking_index];\n"
" vtxD1 = v_vtxD1[provoking_index];\n"
" vtxB0 = v_vtxB0[provoking_index];\n"
" vtxB1 = v_vtxB1[provoking_index];\n"
" vtxFog = v_vtxFog[index];\n"
" vtxT0 = v_vtxT0[index];\n"
" vtxT1 = v_vtxT1[index];\n"
" vtxT2 = v_vtxT2[index];\n"
" vtxT3 = v_vtxT3[index];\n"
" EmitVertex();\n"
"}\n");
}
mstring_append(s, "\n"
"void main() {\n");
mstring_append(s, body);
mstring_append(s, "}\n");
return s;
}
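A minimal usage sketch for the OpenGL backend; the GL compile boilerplate is generic, and the mstring_unref() release at the end is assumed from the MString API rather than shown in this commit:
MString *geom = pgraph_gen_geom_glsl(POLY_MODE_FILL, POLY_MODE_FILL,
                                     PRIM_TYPE_QUADS, false,
                                     false /* vulkan */);
if (geom) { /* NULL means no geometry stage is needed */
    GLuint shader = glCreateShader(GL_GEOMETRY_SHADER);
    const char *src = mstring_get_str(geom);
    glShaderSource(shader, 1, &src, NULL);
    glCompileShader(shader);
    mstring_unref(geom);
}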

View File

@ -0,0 +1,34 @@
/*
* Geforce NV2A PGRAPH GLSL Shader Generator
*
* Copyright (c) 2015 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2020-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HW_XBOX_NV2A_PGRAPH_GLSL_GEOM_H
#define HW_XBOX_NV2A_PGRAPH_GLSL_GEOM_H
#include "qemu/mstring.h"
#include "hw/xbox/nv2a/pgraph/shaders.h"
MString *pgraph_gen_geom_glsl(enum ShaderPolygonMode polygon_front_mode,
enum ShaderPolygonMode polygon_back_mode,
enum ShaderPrimitiveMode primitive_mode,
bool smooth_shading,
bool vulkan);
#endif

View File

@ -0,0 +1,8 @@
specific_ss.add([files(
'common.c',
'geom.c',
'psh.c',
'vsh.c',
'vsh-ff.c',
'vsh-prog.c',
)])

View File

@ -3,7 +3,7 @@
*
* Copyright (c) 2013 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2020-2021 Matt Borgerson
* Copyright (c) 2020-2024 Matt Borgerson
*
* Based on:
* Cxbx, PixelShader.cpp
@ -34,9 +34,9 @@
#include <stdbool.h>
#include <stdint.h>
#include "qapi/qmp/qstring.h"
#include "shaders_common.h"
#include "common.h"
#include "hw/xbox/nv2a/debug.h"
#include "hw/xbox/nv2a/pgraph/psh.h"
#include "psh.h"
/*
@ -575,7 +575,7 @@ static const char* get_sampler_type(enum PS_TEXTUREMODES mode, const PshState *s
return NULL;
case PS_TEXTUREMODES_PROJECT2D:
return state->rect_tex[i] ? sampler2DRect : sampler2D;
return (state->rect_tex[i] && !state->vulkan) ? sampler2DRect : sampler2D;
case PS_TEXTUREMODES_BUMPENVMAP:
case PS_TEXTUREMODES_BUMPENVMAP_LUM:
@ -584,12 +584,15 @@ static const char* get_sampler_type(enum PS_TEXTUREMODES mode, const PshState *s
fprintf(stderr, "Shadow map support not implemented for mode %d\n", mode);
assert(!"Shadow map support not implemented for this mode");
}
return state->rect_tex[i] ? sampler2DRect : sampler2D;
return (state->rect_tex[i] && !state->vulkan) ? sampler2DRect : sampler2D;
case PS_TEXTUREMODES_PROJECT3D:
case PS_TEXTUREMODES_DOT_STR_3D:
if (state->tex_x8y24[i] && state->vulkan) {
return "usampler2D";
}
if (state->shadow_map[i]) {
return state->rect_tex[i] ? sampler2DRect : sampler2D;
return (state->rect_tex[i] && !state->vulkan) ? sampler2DRect : sampler2D;
}
return sampler3D;
@ -634,12 +637,28 @@ static void psh_append_shadowmap(const struct PixelShader *ps, int i, bool compa
return;
}
mstring_append_fmt(vars,
"pT%d.xy *= texScale%d;\n"
"vec4 t%d_depth = textureProj(texSamp%d, pT%d.xyw);\n",
i, i, i, i, i);
mstring_append_fmt(vars, "pT%d.xy *= texScale%d;\n", i, i);
const char *comparison = shadow_comparison_map[ps->state.shadow_depth_func];
if (ps->state.rect_tex[i] && ps->state.vulkan) {
if (ps->state.tex_x8y24[i]) {
mstring_append_fmt(
vars,
"uvec4 t%d_depth_raw = texture(texSamp%d, pT%d.xy/pT%d.w);\n", i, i, i, i);
mstring_append_fmt(
vars,
"vec4 t%d_depth = vec4(float(t%d_depth_raw.x & 0xFFFFFF), 1.0, 0.0, 0.0);",
i, i);
} else {
mstring_append_fmt(
vars,
"vec4 t%d_depth = textureLod(texSamp%d, pT%d.xy/pT%d.w, 0);\n", i,
i, i, i);
}
} else {
mstring_append_fmt(
vars, "vec4 t%d_depth = textureProj(texSamp%d, pT%d.xyw);\n", i, i,
i);
}
// Depth.y != 0 indicates 24 bit; depth.z != 0 indicates float.
if (compare_z) {
@ -685,18 +704,69 @@ static void apply_border_adjustment(const struct PixelShader *ps, MString *vars,
var_name, var_name, i, ps->state.border_inv_real_size[i][0], ps->state.border_inv_real_size[i][1], ps->state.border_inv_real_size[i][2]);
}
static void apply_convolution_filter(const struct PixelShader *ps, MString *vars, int tex)
{
// FIXME: Convolution for 2D textures
// FIXME: Quincunx
assert(ps->state.rect_tex[tex]);
if (ps->state.vulkan) {
mstring_append_fmt(vars,
"vec4 t%d = vec4(0.0);\n"
"for (int i = 0; i < 9; i++) {\n"
" vec2 texCoord = pT%d.xy/pT%d.w + convolution3x3[i];\n"
" t%d += textureLod(texSamp%d, texCoord, 0) * gaussian3x3[i];\n"
"}\n", tex, tex, tex, tex, tex);
} else {
mstring_append_fmt(vars,
"vec4 t%d = vec4(0.0);\n"
"for (int i = 0; i < 9; i++) {\n"
" vec3 texCoord = pT%d.xyw + vec3(convolution3x3[i], 0);\n"
" t%d += textureProj(texSamp%d, texCoord) * gaussian3x3[i];\n"
"}\n", tex, tex, tex, tex, tex);
}
}
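The two branches perform equivalent lookups: by GLSL semantics textureProj(s, p.xyw) is texture(s, p.xy / p.w), and the Vulkan path simply spells out the divide and pins LOD 0, since rect samplers (and their projective lookup) are unavailable there:
/*
 * textureProj(texSamp, pT.xyw)        == texture(texSamp, pT.xy / pT.w)
 * textureLod(texSamp, pT.xy/pT.w, 0)  -- same lookup with the projection
 *                                        made explicit and LOD fixed at 0
 */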
static MString* psh_convert(struct PixelShader *ps)
{
int i;
const char *u = ps->state.vulkan ? "" : "uniform "; // FIXME: Remove
MString *preflight = mstring_new();
mstring_append(preflight, ps->state.smooth_shading ?
STRUCT_VERTEX_DATA_IN_SMOOTH :
STRUCT_VERTEX_DATA_IN_FLAT);
mstring_append(preflight, "\n");
mstring_append(preflight, "out vec4 fragColor;\n");
mstring_append(preflight, "\n");
mstring_append(preflight, "uniform vec4 fogColor;\n");
pgraph_get_glsl_vtx_header(preflight, ps->state.vulkan,
ps->state.smooth_shading, true, false, false);
if (ps->state.vulkan) {
mstring_append_fmt(preflight,
"layout(location = 0) out vec4 fragColor;\n"
"layout(binding = %d, std140) uniform PshUniforms {\n", PSH_UBO_BINDING);
} else {
mstring_append_fmt(preflight,
"layout(location = 0) out vec4 fragColor;\n");
}
mstring_append_fmt(preflight, "%sfloat alphaRef;\n"
"%svec4 fogColor;\n"
"%sivec4 clipRegion[8];\n",
u, u, u);
for (int i = 0; i < 4; i++) {
mstring_append_fmt(preflight, "%smat2 bumpMat%d;\n"
"%sfloat bumpScale%d;\n"
"%sfloat bumpOffset%d;\n"
"%sfloat texScale%d;\n",
u, i, u, i, u, i, u, i);
}
for (int i = 0; i < 9; i++) {
for (int j = 0; j < 2; j++) {
mstring_append_fmt(preflight, "%svec4 c%d_%d;\n", u, j, i);
}
}
if (ps->state.vulkan) {
mstring_append(preflight, "};\n");
}
const char *dotmap_funcs[] = {
"dotmap_zero_to_one",
@ -766,22 +836,12 @@ static MString* psh_convert(struct PixelShader *ps)
" vec2(-1.0,-1.0),vec2(0.0,-1.0),vec2(1.0,-1.0),\n"
" vec2(-1.0, 0.0),vec2(0.0, 0.0),vec2(1.0, 0.0),\n"
" vec2(-1.0, 1.0),vec2(0.0, 1.0),vec2(1.0, 1.0));\n"
"vec4 gaussianFilter2DRectProj(sampler2DRect sampler, vec3 texCoord) {\n"
" vec4 sum = vec4(0.0);\n"
" for (int i = 0; i < 9; i++) {\n"
" sum += gaussian3x3[i]*textureProj(sampler,\n"
" texCoord + vec3(convolution3x3[i], 0.0));\n"
" }\n"
" return sum;\n"
"}\n"
);
/* Window Clipping */
MString *clip = mstring_new();
mstring_append(preflight, "uniform ivec4 clipRegion[8];\n");
mstring_append_fmt(clip, "/* Window-clip (%s) */\n",
ps->state.window_clip_exclusive ?
"Exclusive" : "Inclusive");
mstring_append_fmt(clip, "/* Window-clip (%slusive) */\n",
ps->state.window_clip_exclusive ? "Exc" : "Inc");
if (!ps->state.window_clip_exclusive) {
mstring_append(clip, "bool clipContained = false;\n");
}
@ -856,23 +916,27 @@ static MString* psh_convert(struct PixelShader *ps)
if (ps->state.shadow_map[i]) {
psh_append_shadowmap(ps, i, false, vars);
} else {
const char *lookup = "textureProj";
if ((ps->state.conv_tex[i] == CONVOLUTION_FILTER_GAUSSIAN)
|| (ps->state.conv_tex[i] == CONVOLUTION_FILTER_QUINCUNX)) {
/* FIXME: Quincunx looks better than Linear and costs less than
* Gaussian, but Gaussian should be plenty fast so use it for
* now.
*/
if (ps->state.rect_tex[i]) {
lookup = "gaussianFilter2DRectProj";
} else {
NV2A_UNIMPLEMENTED("Convolution for 2D textures");
}
}
apply_border_adjustment(ps, vars, i, "pT%d");
mstring_append_fmt(vars, "pT%d.xy = texScale%d * pT%d.xy;\n", i, i, i);
mstring_append_fmt(vars, "vec4 t%d = %s(texSamp%d, pT%d.xyw);\n",
i, lookup, i, i);
if (ps->state.rect_tex[i]) {
if ((ps->state.conv_tex[i] ==
CONVOLUTION_FILTER_GAUSSIAN) ||
(ps->state.conv_tex[i] ==
CONVOLUTION_FILTER_QUINCUNX)) {
apply_convolution_filter(ps, vars, i);
} else {
if (ps->state.vulkan) {
mstring_append_fmt(vars, "vec4 t%d = textureLod(texSamp%d, pT%d.xy/pT%d.w, 0);\n",
i, i, i, i);
} else {
mstring_append_fmt(vars, "vec4 t%d = textureProj(texSamp%d, pT%d.xyw);\n",
i, i, i);
}
}
} else {
mstring_append_fmt(vars, "vec4 t%d = textureProj(texSamp%d, pT%d.xyw);\n",
i, i, i);
}
}
break;
}
@ -880,6 +944,7 @@ static MString* psh_convert(struct PixelShader *ps)
if (ps->state.shadow_map[i]) {
psh_append_shadowmap(ps, i, true, vars);
} else {
assert(!ps->state.rect_tex[i]);
apply_border_adjustment(ps, vars, i, "pT%d");
mstring_append_fmt(vars, "vec4 t%d = textureProj(texSamp%d, pT%d.xyzw);\n",
i, i, i);
@ -906,7 +971,6 @@ static MString* psh_convert(struct PixelShader *ps)
}
case PS_TEXTUREMODES_BUMPENVMAP:
assert(i >= 1);
mstring_append_fmt(preflight, "uniform mat2 bumpMat%d;\n", i);
if (ps->state.snorm_tex[ps->input_tex[i]]) {
/* Input color channels already signed (FIXME: May not always want signed textures in this case) */
@ -925,9 +989,6 @@ static MString* psh_convert(struct PixelShader *ps)
break;
case PS_TEXTUREMODES_BUMPENVMAP_LUM:
assert(i >= 1);
mstring_append_fmt(preflight, "uniform float bumpScale%d;\n", i);
mstring_append_fmt(preflight, "uniform float bumpOffset%d;\n", i);
mstring_append_fmt(preflight, "uniform mat2 bumpMat%d;\n", i);
if (ps->state.snorm_tex[ps->input_tex[i]]) {
/* Input color channels already signed (FIXME: May not always want signed textures in this case) */
@ -1060,8 +1121,10 @@ static MString* psh_convert(struct PixelShader *ps)
break;
}
mstring_append_fmt(preflight, "uniform float texScale%d;\n", i);
if (sampler_type != NULL) {
if (ps->state.vulkan) {
mstring_append_fmt(preflight, "layout(binding = %d) ", PSH_TEX_BINDING + i);
}
mstring_append_fmt(preflight, "uniform %s texSamp%d;\n", sampler_type, i);
/* As this means a texture fetch does happen, do alphakill */
@ -1091,7 +1154,6 @@ static MString* psh_convert(struct PixelShader *ps)
}
if (ps->state.alpha_test && ps->state.alpha_func != ALPHA_FUNC_ALWAYS) {
mstring_append_fmt(preflight, "uniform float alphaRef;\n");
if (ps->state.alpha_func == ALPHA_FUNC_NEVER) {
mstring_append(ps->code, "discard;\n");
} else {
@ -1112,10 +1174,6 @@ static MString* psh_convert(struct PixelShader *ps)
}
}
for (i = 0; i < ps->num_const_refs; i++) {
mstring_append_fmt(preflight, "uniform vec4 %s;\n", ps->const_refs[i]);
}
for (i = 0; i < ps->num_var_refs; i++) {
mstring_append_fmt(vars, "vec4 %s;\n", ps->var_refs[i]);
if (strcmp(ps->var_refs[i], "r0") == 0) {
@ -1128,7 +1186,7 @@ static MString* psh_convert(struct PixelShader *ps)
}
MString *final = mstring_new();
mstring_append(final, "#version 330\n\n");
mstring_append_fmt(final, "#version %d\n\n", ps->state.vulkan ? 450 : 400);
mstring_append(final, mstring_get_str(preflight));
mstring_append(final, "void main() {\n");
mstring_append(final, mstring_get_str(clip));
@ -1175,7 +1233,7 @@ static void parse_combiner_output(uint32_t value, struct OutputInfo *out)
out->cd_alphablue = flags & 0x40;
}
MString *psh_translate(const PshState state)
MString *pgraph_gen_psh_glsl(const PshState state)
{
int i;
struct PixelShader ps;

View File

@ -0,0 +1,41 @@
/*
* Geforce NV2A PGRAPH GLSL Shader Generator
*
* Copyright (c) 2013 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2020-2024 Matt Borgerson
*
* Based on:
* Cxbx, PixelShader.cpp
* Copyright (c) 2004 Aaron Robinson <caustik@caustik.com>
* Kingofc <kingofc@freenet.de>
* Xeon, XBD3DPixelShader.cpp
* Copyright (c) 2003 _SF_
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 or
* (at your option) version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HW_XBOX_NV2A_PGRAPH_GLSL_PSH_H
#define HW_XBOX_NV2A_PGRAPH_GLSL_PSH_H
#include "qemu/mstring.h"
#include "hw/xbox/nv2a/pgraph/shaders.h"
// FIXME: Move to struct
#define PSH_UBO_BINDING 1
#define PSH_TEX_BINDING 2
MString *pgraph_gen_psh_glsl(const PshState state);
#endif
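These binding numbers have to agree with the descriptor set layout the Vulkan renderer creates; a sketch of matching layout entries (an assumed shape for illustration, not this commit's actual setup code):
VkDescriptorSetLayoutBinding bindings[1 + 4];
bindings[0] = (VkDescriptorSetLayoutBinding){
    .binding = PSH_UBO_BINDING,
    .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
    .descriptorCount = 1,
    .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
};
for (int i = 0; i < 4; i++) {
    bindings[1 + i] = (VkDescriptorSetLayoutBinding){
        .binding = PSH_TEX_BINDING + i,
        .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
        .descriptorCount = 1,
        .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
    };
}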

View File

@ -0,0 +1,497 @@
/*
* Geforce NV2A PGRAPH GLSL Shader Generator
*
* Copyright (c) 2015 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2020-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "qemu/osdep.h"
#include "hw/xbox/nv2a/pgraph/shaders.h"
#include "common.h"
#include "vsh-ff.h"
static void append_skinning_code(MString* str, bool mix,
unsigned int count, const char* type,
const char* output, const char* input,
const char* matrix, const char* swizzle);
void pgraph_gen_vsh_ff_glsl(const ShaderState *state, MString *header,
MString *body, MString *uniforms)
{
int i, j;
const char *u = state->vulkan ? "" : "uniform "; // FIXME: Remove
/* generate vertex shader mimicking fixed function */
mstring_append(header,
"#define position v0\n"
"#define weight v1\n"
"#define normal v2.xyz\n"
"#define diffuse v3\n"
"#define specular v4\n"
"#define fogCoord v5.x\n"
"#define pointSize v6\n"
"#define backDiffuse v7\n"
"#define backSpecular v8\n"
"#define texture0 v9\n"
"#define texture1 v10\n"
"#define texture2 v11\n"
"#define texture3 v12\n"
"#define reserved1 v13\n"
"#define reserved2 v14\n"
"#define reserved3 v15\n"
"\n");
mstring_append_fmt(uniforms,
"%svec4 ltctxa[" stringify(NV2A_LTCTXA_COUNT) "];\n"
"%svec4 ltctxb[" stringify(NV2A_LTCTXB_COUNT) "];\n"
"%svec4 ltc1[" stringify(NV2A_LTC1_COUNT) "];\n", u, u, u
);
mstring_append(header,
"\n"
GLSL_DEFINE(projectionMat, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_PMAT0))
GLSL_DEFINE(compositeMat, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_CMAT0))
"\n"
GLSL_DEFINE(texPlaneS0, GLSL_C(NV_IGRAPH_XF_XFCTX_TG0MAT + 0))
GLSL_DEFINE(texPlaneT0, GLSL_C(NV_IGRAPH_XF_XFCTX_TG0MAT + 1))
GLSL_DEFINE(texPlaneR0, GLSL_C(NV_IGRAPH_XF_XFCTX_TG0MAT + 2))
GLSL_DEFINE(texPlaneQ0, GLSL_C(NV_IGRAPH_XF_XFCTX_TG0MAT + 3))
"\n"
GLSL_DEFINE(texPlaneS1, GLSL_C(NV_IGRAPH_XF_XFCTX_TG1MAT + 0))
GLSL_DEFINE(texPlaneT1, GLSL_C(NV_IGRAPH_XF_XFCTX_TG1MAT + 1))
GLSL_DEFINE(texPlaneR1, GLSL_C(NV_IGRAPH_XF_XFCTX_TG1MAT + 2))
GLSL_DEFINE(texPlaneQ1, GLSL_C(NV_IGRAPH_XF_XFCTX_TG1MAT + 3))
"\n"
GLSL_DEFINE(texPlaneS2, GLSL_C(NV_IGRAPH_XF_XFCTX_TG2MAT + 0))
GLSL_DEFINE(texPlaneT2, GLSL_C(NV_IGRAPH_XF_XFCTX_TG2MAT + 1))
GLSL_DEFINE(texPlaneR2, GLSL_C(NV_IGRAPH_XF_XFCTX_TG2MAT + 2))
GLSL_DEFINE(texPlaneQ2, GLSL_C(NV_IGRAPH_XF_XFCTX_TG2MAT + 3))
"\n"
GLSL_DEFINE(texPlaneS3, GLSL_C(NV_IGRAPH_XF_XFCTX_TG3MAT + 0))
GLSL_DEFINE(texPlaneT3, GLSL_C(NV_IGRAPH_XF_XFCTX_TG3MAT + 1))
GLSL_DEFINE(texPlaneR3, GLSL_C(NV_IGRAPH_XF_XFCTX_TG3MAT + 2))
GLSL_DEFINE(texPlaneQ3, GLSL_C(NV_IGRAPH_XF_XFCTX_TG3MAT + 3))
"\n"
GLSL_DEFINE(modelViewMat0, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_MMAT0))
GLSL_DEFINE(modelViewMat1, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_MMAT1))
GLSL_DEFINE(modelViewMat2, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_MMAT2))
GLSL_DEFINE(modelViewMat3, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_MMAT3))
"\n"
GLSL_DEFINE(invModelViewMat0, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_IMMAT0))
GLSL_DEFINE(invModelViewMat1, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_IMMAT1))
GLSL_DEFINE(invModelViewMat2, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_IMMAT2))
GLSL_DEFINE(invModelViewMat3, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_IMMAT3))
"\n"
GLSL_DEFINE(eyePosition, GLSL_C(NV_IGRAPH_XF_XFCTX_EYEP))
"\n"
"#define lightAmbientColor(i) "
"ltctxb[" stringify(NV_IGRAPH_XF_LTCTXB_L0_AMB) " + (i)*6].xyz\n"
"#define lightDiffuseColor(i) "
"ltctxb[" stringify(NV_IGRAPH_XF_LTCTXB_L0_DIF) " + (i)*6].xyz\n"
"#define lightSpecularColor(i) "
"ltctxb[" stringify(NV_IGRAPH_XF_LTCTXB_L0_SPC) " + (i)*6].xyz\n"
"\n"
"#define lightSpotFalloff(i) "
"ltctxa[" stringify(NV_IGRAPH_XF_LTCTXA_L0_K) " + (i)*2].xyz\n"
"#define lightSpotDirection(i) "
"ltctxa[" stringify(NV_IGRAPH_XF_LTCTXA_L0_SPT) " + (i)*2]\n"
"\n"
"#define lightLocalRange(i) "
"ltc1[" stringify(NV_IGRAPH_XF_LTC1_r0) " + (i)].x\n"
"\n"
GLSL_DEFINE(sceneAmbientColor, GLSL_LTCTXA(NV_IGRAPH_XF_LTCTXA_FR_AMB) ".xyz")
GLSL_DEFINE(materialEmissionColor, GLSL_LTCTXA(NV_IGRAPH_XF_LTCTXA_CM_COL) ".xyz")
"\n"
);
mstring_append_fmt(uniforms,
"%smat4 invViewport;\n", u);
/* Skinning */
unsigned int count;
bool mix;
switch (state->skinning) {
case SKINNING_OFF:
mix = false; count = 0; break;
case SKINNING_1WEIGHTS:
mix = true; count = 2; break;
case SKINNING_2WEIGHTS2MATRICES:
mix = false; count = 2; break;
case SKINNING_2WEIGHTS:
mix = true; count = 3; break;
case SKINNING_3WEIGHTS3MATRICES:
mix = false; count = 3; break;
case SKINNING_3WEIGHTS:
mix = true; count = 4; break;
case SKINNING_4WEIGHTS4MATRICES:
mix = false; count = 4; break;
default:
assert(false);
break;
}
mstring_append_fmt(body, "/* Skinning mode %d */\n",
state->skinning);
append_skinning_code(body, mix, count, "vec4",
"tPosition", "position",
"modelViewMat", "xyzw");
append_skinning_code(body, mix, count, "vec3",
"tNormal", "vec4(normal, 0.0)",
"invModelViewMat", "xyz");
/* Normalization */
if (state->normalization) {
mstring_append(body, "tNormal = normalize(tNormal);\n");
}
/* Texgen */
for (i = 0; i < NV2A_MAX_TEXTURES; i++) {
mstring_append_fmt(body, "/* Texgen for stage %d */\n",
i);
/* Set each component individually */
/* FIXME: could be nicer if some channels share the same texgen */
for (j = 0; j < 4; j++) {
/* TODO: TexGen View Model missing! */
char c = "xyzw"[j];
char cSuffix = "STRQ"[j];
switch (state->texgen[i][j]) {
case TEXGEN_DISABLE:
mstring_append_fmt(body, "oT%d.%c = texture%d.%c;\n",
i, c, i, c);
break;
case TEXGEN_EYE_LINEAR:
mstring_append_fmt(body, "oT%d.%c = dot(texPlane%c%d, tPosition);\n",
i, c, cSuffix, i);
break;
case TEXGEN_OBJECT_LINEAR:
mstring_append_fmt(body, "oT%d.%c = dot(texPlane%c%d, position);\n",
i, c, cSuffix, i);
break;
case TEXGEN_SPHERE_MAP:
assert(j < 2); /* Channels S,T only! */
mstring_append(body, "{\n");
/* FIXME: u, r and m only have to be calculated once */
mstring_append(body, " vec3 u = normalize(tPosition.xyz);\n");
//FIXME: tNormal before or after normalization? Always normalize?
mstring_append(body, " vec3 r = reflect(u, tNormal);\n");
/* FIXME: This would consume 1 division fewer and *might* be
* faster than length:
* // [z=1/(2*x) => z=1/x*0.5]
* vec3 ro = r + vec3(0.0, 0.0, 1.0);
* float m = inversesqrt(dot(ro,ro))*0.5;
*/
mstring_append(body, " float invM = 1.0 / (2.0 * length(r + vec3(0.0, 0.0, 1.0)));\n");
mstring_append_fmt(body, " oT%d.%c = r.%c * invM + 0.5;\n",
i, c, c);
mstring_append(body, "}\n");
break;
case TEXGEN_REFLECTION_MAP:
assert(j < 3); /* Channels S,T,R only! */
mstring_append(body, "{\n");
/* FIXME: u and r only have to be calculated once, can share the one from SPHERE_MAP */
mstring_append(body, " vec3 u = normalize(tPosition.xyz);\n");
mstring_append(body, " vec3 r = reflect(u, tNormal);\n");
mstring_append_fmt(body, " oT%d.%c = r.%c;\n",
i, c, c);
mstring_append(body, "}\n");
break;
case TEXGEN_NORMAL_MAP:
assert(j < 3); /* Channels S,T,R only! */
mstring_append_fmt(body, "oT%d.%c = tNormal.%c;\n",
i, c, c);
break;
default:
assert(false);
break;
}
}
}
/* Apply texture matrices */
for (i = 0; i < NV2A_MAX_TEXTURES; i++) {
if (state->texture_matrix_enable[i]) {
mstring_append_fmt(body,
"oT%d = oT%d * texMat%d;\n",
i, i, i);
}
}
/* Lighting */
if (state->lighting) {
//FIXME: Do 2 passes if we want 2 sided-lighting?
static char alpha_source_diffuse[] = "diffuse.a";
static char alpha_source_specular[] = "specular.a";
static char alpha_source_material[] = "material_alpha";
const char *alpha_source = alpha_source_diffuse;
if (state->diffuse_src == MATERIAL_COLOR_SRC_MATERIAL) {
mstring_append_fmt(uniforms, "%sfloat material_alpha;\n", u);
alpha_source = alpha_source_material;
} else if (state->diffuse_src == MATERIAL_COLOR_SRC_SPECULAR) {
alpha_source = alpha_source_specular;
}
if (state->ambient_src == MATERIAL_COLOR_SRC_MATERIAL) {
mstring_append_fmt(body, "oD0 = vec4(sceneAmbientColor, %s);\n", alpha_source);
} else if (state->ambient_src == MATERIAL_COLOR_SRC_DIFFUSE) {
mstring_append_fmt(body, "oD0 = vec4(diffuse.rgb, %s);\n", alpha_source);
} else if (state->ambient_src == MATERIAL_COLOR_SRC_SPECULAR) {
mstring_append_fmt(body, "oD0 = vec4(specular.rgb, %s);\n", alpha_source);
}
mstring_append(body, "oD0.rgb *= materialEmissionColor.rgb;\n");
if (state->emission_src == MATERIAL_COLOR_SRC_MATERIAL) {
mstring_append(body, "oD0.rgb += sceneAmbientColor;\n");
} else if (state->emission_src == MATERIAL_COLOR_SRC_DIFFUSE) {
mstring_append(body, "oD0.rgb += diffuse.rgb;\n");
} else if (state->emission_src == MATERIAL_COLOR_SRC_SPECULAR) {
mstring_append(body, "oD0.rgb += specular.rgb;\n");
}
mstring_append(body, "oD1 = vec4(0.0, 0.0, 0.0, specular.a);\n");
for (i = 0; i < NV2A_MAX_LIGHTS; i++) {
if (state->light[i] == LIGHT_OFF) {
continue;
}
/* FIXME: It seems we only need to handle the surface colors here when
* they are not part of the material [i.e. vertex colors]. If they come
* from the material, the CPU will have premultiplied the light
* colors.
*/
mstring_append_fmt(body, "/* Light %d */ {\n", i);
if (state->light[i] == LIGHT_LOCAL
|| state->light[i] == LIGHT_SPOT) {
mstring_append_fmt(uniforms,
"%svec3 lightLocalPosition%d;\n"
"%svec3 lightLocalAttenuation%d;\n",
u, i, u, i);
mstring_append_fmt(body,
" vec3 VP = lightLocalPosition%d - tPosition.xyz/tPosition.w;\n"
" float d = length(VP);\n"
//FIXME: if (d > lightLocalRange) { .. don't process this light .. } /* inclusive?! */ - what about directional lights?
" VP = normalize(VP);\n"
" float attenuation = 1.0 / (lightLocalAttenuation%d.x\n"
" + lightLocalAttenuation%d.y * d\n"
" + lightLocalAttenuation%d.z * d * d);\n"
" vec3 halfVector = normalize(VP + eyePosition.xyz / eyePosition.w);\n" /* FIXME: Not sure if eyePosition is correct */
" float nDotVP = max(0.0, dot(tNormal, VP));\n"
" float nDotHV = max(0.0, dot(tNormal, halfVector));\n",
i, i, i, i);
}
switch(state->light[i]) {
case LIGHT_INFINITE:
/* lightLocalRange will be 1e+30 here */
mstring_append_fmt(uniforms,
"%svec3 lightInfiniteHalfVector%d;\n"
"%svec3 lightInfiniteDirection%d;\n",
u, i, u, i);
mstring_append_fmt(body,
" float attenuation = 1.0;\n"
" float nDotVP = max(0.0, dot(tNormal, normalize(vec3(lightInfiniteDirection%d))));\n"
" float nDotHV = max(0.0, dot(tNormal, vec3(lightInfiniteHalfVector%d)));\n",
i, i);
/* FIXME: Do specular */
/* FIXME: tBackDiffuse */
break;
case LIGHT_LOCAL:
/* Everything done already */
break;
case LIGHT_SPOT:
/* https://docs.microsoft.com/en-us/windows/win32/direct3d9/attenuation-and-spotlight-factor#spotlight-factor */
mstring_append_fmt(body,
" vec4 spotDir = lightSpotDirection(%d);\n"
" float invScale = 1/length(spotDir.xyz);\n"
" float cosHalfPhi = -invScale*spotDir.w;\n"
" float cosHalfTheta = invScale + cosHalfPhi;\n"
" float spotDirDotVP = dot(spotDir.xyz, VP);\n"
" float rho = invScale*spotDirDotVP;\n"
" if (rho > cosHalfTheta) {\n"
" } else if (rho <= cosHalfPhi) {\n"
" attenuation = 0.0;\n"
" } else {\n"
" attenuation *= spotDirDotVP + spotDir.w;\n" /* FIXME: lightSpotFalloff */
" }\n",
i);
break;
default:
assert(false);
break;
}
mstring_append_fmt(body,
" float pf;\n"
" if (nDotVP == 0.0) {\n"
" pf = 0.0;\n"
" } else {\n"
" pf = pow(nDotHV, /* specular(l, m, n, l1, m1, n1) */ 0.001);\n"
" }\n"
" vec3 lightAmbient = lightAmbientColor(%d) * attenuation;\n"
" vec3 lightDiffuse = lightDiffuseColor(%d) * attenuation * nDotVP;\n"
" vec3 lightSpecular = lightSpecularColor(%d) * pf;\n",
i, i, i);
mstring_append(body,
" oD0.xyz += lightAmbient;\n");
switch (state->diffuse_src) {
case MATERIAL_COLOR_SRC_MATERIAL:
mstring_append(body,
" oD0.xyz += lightDiffuse;\n");
break;
case MATERIAL_COLOR_SRC_DIFFUSE:
mstring_append(body,
" oD0.xyz += diffuse.xyz * lightDiffuse;\n");
break;
case MATERIAL_COLOR_SRC_SPECULAR:
mstring_append(body,
" oD0.xyz += specular.xyz * lightDiffuse;\n");
break;
}
mstring_append(body,
" oD1.xyz += specular.xyz * lightSpecular;\n");
mstring_append(body, "}\n");
}
} else {
mstring_append(body, " oD0 = diffuse;\n");
mstring_append(body, " oD1 = specular;\n");
}
mstring_append(body, " oB0 = backDiffuse;\n");
mstring_append(body, " oB1 = backSpecular;\n");
/* Fog */
if (state->fog_enable) {
/* From: https://www.opengl.org/registry/specs/NV/fog_distance.txt */
switch(state->foggen) {
case FOGGEN_SPEC_ALPHA:
/* FIXME: Do we have to clamp here? */
mstring_append(body, " float fogDistance = clamp(specular.a, 0.0, 1.0);\n");
break;
case FOGGEN_RADIAL:
mstring_append(body, " float fogDistance = length(tPosition.xyz);\n");
break;
case FOGGEN_PLANAR:
case FOGGEN_ABS_PLANAR:
mstring_append(body, " float fogDistance = dot(fogPlane.xyz, tPosition.xyz) + fogPlane.w;\n");
if (state->foggen == FOGGEN_ABS_PLANAR) {
mstring_append(body, " fogDistance = abs(fogDistance);\n");
}
break;
case FOGGEN_FOG_X:
mstring_append(body, " float fogDistance = fogCoord;\n");
break;
default:
assert(false);
break;
}
}
/* If skinning is off the composite matrix already includes the MV matrix */
if (state->skinning == SKINNING_OFF) {
mstring_append(body, " tPosition = position;\n");
}
mstring_append(body,
" oPos = invViewport * (tPosition * compositeMat);\n"
);
if (state->vulkan) {
mstring_append(body, " oPos.y *= -1;\n");
} else {
mstring_append(body, " oPos.z = oPos.z * 2.0 - oPos.w;\n");
}
/* FIXME: Testing */
if (state->point_params_enable) {
mstring_append_fmt(
body,
" float d_e = length(position * modelViewMat0);\n"
" oPts.x = 1/sqrt(%f + %f*d_e + %f*d_e*d_e) + %f;\n",
state->point_params[0], state->point_params[1], state->point_params[2],
state->point_params[6]);
mstring_append_fmt(body, " oPts.x = min(oPts.x*%f + %f, 64.0) * %d;\n",
state->point_params[3], state->point_params[7],
state->surface_scale_factor);
} else {
mstring_append_fmt(body, " oPts.x = %f * %d;\n", state->point_size,
state->surface_scale_factor);
}
mstring_append(body,
" if (oPos.w == 0.0 || isinf(oPos.w)) {\n"
" vtx_inv_w = 1.0;\n"
" } else {\n"
" vtx_inv_w = 1.0 / oPos.w;\n"
" }\n"
" vtx_inv_w_flat = vtx_inv_w;\n");
}
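The point-parameters branch above mirrors the D3D distance-attenuation model; evaluated on the CPU it reads (p[] = state->point_params, indexed as in the format strings):
/*
 * d_e  = length(position * modelViewMat0)               eye-space distance
 * size = 1/sqrt(p[0] + p[1]*d_e + p[2]*d_e*d_e) + p[6]
 * size = min(size*p[3] + p[7], 64.0) * surface_scale_factor
 */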
static void append_skinning_code(MString* str, bool mix,
unsigned int count, const char* type,
const char* output, const char* input,
const char* matrix, const char* swizzle)
{
if (count == 0) {
mstring_append_fmt(str, "%s %s = (%s * %s0).%s;\n",
type, output, input, matrix, swizzle);
} else {
mstring_append_fmt(str, "%s %s = %s(0.0);\n", type, output, type);
if (mix) {
/* Generated final weight (like GL_WEIGHT_SUM_UNITY_ARB) */
mstring_append(str, "{\n"
" float weight_i;\n"
" float weight_n = 1.0;\n");
int i;
for (i = 0; i < count; i++) {
if (i < (count - 1)) {
char c = "xyzw"[i];
mstring_append_fmt(str, " weight_i = weight.%c;\n"
" weight_n -= weight_i;\n",
c);
} else {
mstring_append(str, " weight_i = weight_n;\n");
}
mstring_append_fmt(str, " %s += (%s * %s%d).%s * weight_i;\n",
output, input, matrix, i, swizzle);
}
mstring_append(str, "}\n");
} else {
/* Individual weights */
int i;
for (i = 0; i < count; i++) {
char c = "xyzw"[i];
mstring_append_fmt(str, "%s += (%s * %s%d).%s * weight.%c;\n",
output, input, matrix, i, swizzle, c);
}
}
}
}
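As an example of the mixed-weight path, SKINNING_2WEIGHTS (mix=true, count=3) generates the GL_WEIGHT_SUM_UNITY_ARB-style expansion below, where the final weight is derived so all weights sum to one:
/*
 * vec4 tPosition = vec4(0.0);
 * {
 *  float weight_i;
 *  float weight_n = 1.0;
 *  weight_i = weight.x;
 *  weight_n -= weight_i;
 *  tPosition += (position * modelViewMat0).xyzw * weight_i;
 *  weight_i = weight.y;
 *  weight_n -= weight_i;
 *  tPosition += (position * modelViewMat1).xyzw * weight_i;
 *  weight_i = weight_n;
 *  tPosition += (position * modelViewMat2).xyzw * weight_i;
 * }
 */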

View File

@ -0,0 +1,31 @@
/*
* Geforce NV2A PGRAPH GLSL Shader Generator
*
* Copyright (c) 2015 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2020-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HW_XBOX_NV2A_PGRAPH_GLSL_VSH_FF_H
#define HW_XBOX_NV2A_PGRAPH_GLSL_VSH_FF_H
#include "qemu/mstring.h"
#include "hw/xbox/nv2a/pgraph/shaders.h"
void pgraph_gen_vsh_ff_glsl(const ShaderState *state, MString *header,
MString *body, MString *uniforms);
#endif

View File

@ -1,5 +1,5 @@
/*
* QEMU Geforce NV2A vertex shader translation
* Geforce NV2A PGRAPH GLSL Shader Generator
*
* Copyright (c) 2014 Jannik Vogel
* Copyright (c) 2012 espes
@ -32,8 +32,9 @@
#include <stdbool.h>
#include <assert.h>
#include "shaders_common.h"
#include "vsh.h"
#include "hw/xbox/nv2a/pgraph/vsh.h"
#include "common.h"
#include "vsh-prog.h"
#define VSH_D3DSCM_CORRECTION 96
@ -794,10 +795,11 @@ static const char* vsh_header =
" return t;\n"
"}\n";
void vsh_translate(uint16_t version,
void pgraph_gen_vsh_prog_glsl(uint16_t version,
const uint32_t *tokens,
unsigned int length,
bool z_perspective,
bool vulkan,
MString *header, MString *body)
{
@ -843,14 +845,30 @@ void vsh_translate(uint16_t version,
* TODO: the pixel-center co-ordinate differences should handled
*/
" oPos.x = 2.0 * (oPos.x - surfaceSize.x * 0.5) / surfaceSize.x;\n"
" oPos.y = -2.0 * (oPos.y - surfaceSize.y * 0.5) / surfaceSize.y;\n"
);
);
if (vulkan) {
mstring_append(body,
" oPos.y = 2.0 * oPos.y / surfaceSize.y - 1.0;\n");
} else {
mstring_append(body, " oPos.y = -2.0 * (oPos.y - surfaceSize.y * 0.5) "
"/ surfaceSize.y;\n");
}
if (z_perspective) {
mstring_append(body, " oPos.z = oPos.w;\n");
}
mstring_append(body,
" if (clipRange.y != clipRange.x) {\n");
if (vulkan) {
mstring_append(body, " oPos.z /= clipRange.y;\n");
} else {
mstring_append(body,
" oPos.z = (oPos.z - clipRange.x)/(0.5*(clipRange.y "
"- clipRange.x)) - 1;\n");
}
mstring_append(body,
" if (clipRange.y != clipRange.x) {\n"
" oPos.z = (oPos.z - clipRange.x)/(0.5*(clipRange.y - clipRange.x)) - 1;\n"
" }\n"
/* Correct for the perspective divide */

View File

@ -0,0 +1,35 @@
/*
* Geforce NV2A PGRAPH GLSL Shader Generator
*
* Copyright (c) 2014 Jannik Vogel
* Copyright (c) 2012 espes
*
* Based on:
* Cxbx, VertexShader.cpp
* Copyright (c) 2004 Aaron Robinson <caustik@caustik.com>
* Kingofc <kingofc@freenet.de>
* Dxbx, uPushBuffer.pas
* Copyright (c) 2007 Shadow_tj, PatrickvL
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 or
* (at your option) version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HW_XBOX_NV2A_PGRAPH_GLSL_VSH_PROG_H
#define HW_XBOX_NV2A_PGRAPH_GLSL_VSH_PROG_H

#include <stdbool.h>
#include <stdint.h>
#include "qemu/mstring.h"
void pgraph_gen_vsh_prog_glsl(uint16_t version, const uint32_t *tokens,
unsigned int length, bool z_perspective,
bool vulkan, MString *header, MString *body);
#endif

View File

@ -0,0 +1,274 @@
/*
* Geforce NV2A PGRAPH GLSL Shader Generator
*
* Copyright (c) 2015 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2020-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "qemu/osdep.h"
#include "hw/xbox/nv2a/pgraph/shaders.h"
#include "common.h"
#include "vsh.h"
#include "vsh-ff.h"
#include "vsh-prog.h"
#include <stdbool.h>
MString *pgraph_gen_vsh_glsl(const ShaderState *state, bool prefix_outputs)
{
int i;
MString *output = mstring_new();
mstring_append_fmt(output, "#version %d\n\n", state->vulkan ? 450 : 400);
MString *header = mstring_from_str("");
MString *uniforms = mstring_from_str("");
const char *u = state->vulkan ? "" : "uniform "; // FIXME: Remove
mstring_append_fmt(uniforms,
"%svec4 clipRange;\n"
"%svec2 surfaceSize;\n"
"%svec4 c[" stringify(NV2A_VERTEXSHADER_CONSTANTS) "];\n"
"%svec2 fogParam;\n",
u, u, u, u
);
mstring_append(header,
GLSL_DEFINE(fogPlane, GLSL_C(NV_IGRAPH_XF_XFCTX_FOG))
GLSL_DEFINE(texMat0, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_T0MAT))
GLSL_DEFINE(texMat1, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_T1MAT))
GLSL_DEFINE(texMat2, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_T2MAT))
GLSL_DEFINE(texMat3, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_T3MAT))
"\n"
"vec4 oPos = vec4(0.0,0.0,0.0,1.0);\n"
"vec4 oD0 = vec4(0.0,0.0,0.0,1.0);\n"
"vec4 oD1 = vec4(0.0,0.0,0.0,1.0);\n"
"vec4 oB0 = vec4(0.0,0.0,0.0,1.0);\n"
"vec4 oB1 = vec4(0.0,0.0,0.0,1.0);\n"
"vec4 oPts = vec4(0.0,0.0,0.0,1.0);\n"
"vec4 oFog = vec4(0.0,0.0,0.0,1.0);\n"
"vec4 oT0 = vec4(0.0,0.0,0.0,1.0);\n"
"vec4 oT1 = vec4(0.0,0.0,0.0,1.0);\n"
"vec4 oT2 = vec4(0.0,0.0,0.0,1.0);\n"
"vec4 oT3 = vec4(0.0,0.0,0.0,1.0);\n"
"\n"
"vec4 decompress_11_11_10(int cmp) {\n"
" float x = float(bitfieldExtract(cmp, 0, 11)) / 1023.0;\n"
" float y = float(bitfieldExtract(cmp, 11, 11)) / 1023.0;\n"
" float z = float(bitfieldExtract(cmp, 22, 10)) / 511.0;\n"
" return vec4(x, y, z, 1);\n"
"}\n");
pgraph_get_glsl_vtx_header(header, state->vulkan, state->smooth_shading,
false, prefix_outputs, false);
if (prefix_outputs) {
mstring_append(header,
"#define vtx_inv_w v_vtx_inv_w\n"
"#define vtx_inv_w_flat v_vtx_inv_w_flat\n"
"#define vtxD0 v_vtxD0\n"
"#define vtxD1 v_vtxD1\n"
"#define vtxB0 v_vtxB0\n"
"#define vtxB1 v_vtxB1\n"
"#define vtxFog v_vtxFog\n"
"#define vtxT0 v_vtxT0\n"
"#define vtxT1 v_vtxT1\n"
"#define vtxT2 v_vtxT2\n"
"#define vtxT3 v_vtxT3\n"
);
}
mstring_append(header, "\n");
for (i = 0; i < NV2A_VERTEXSHADER_ATTRIBUTES; i++) {
bool is_uniform = state->uniform_attrs & (1 << i);
bool is_compressed = state->compressed_attrs & (1 << i);
assert(!(is_uniform && is_compressed));
if (is_uniform) {
mstring_append_fmt(header, "vec4 v%d = inlineValue[%d];\n", i, i);
} else {
if (is_compressed) {
mstring_append_fmt(header,
"layout(location = %d) in int v%d_cmp;\n", i, i);
} else if (state->swizzle_attrs & (1 << i)) {
mstring_append_fmt(header, "layout(location = %d) in vec4 v%d_sw;\n",
i, i);
} else {
mstring_append_fmt(header, "layout(location = %d) in vec4 v%d;\n",
i, i);
}
}
}
mstring_append(header, "\n");
MString *body = mstring_from_str("void main() {\n");
for (i = 0; i < NV2A_VERTEXSHADER_ATTRIBUTES; i++) {
if (state->compressed_attrs & (1 << i)) {
mstring_append_fmt(
body, "vec4 v%d = decompress_11_11_10(v%d_cmp);\n", i, i);
}
if (state->swizzle_attrs & (1 << i)) {
mstring_append_fmt(body, "vec4 v%d = v%d_sw.bgra;\n", i, i);
}
}
if (state->fixed_function) {
pgraph_gen_vsh_ff_glsl(state, header, body, uniforms);
} else if (state->vertex_program) {
pgraph_gen_vsh_prog_glsl(VSH_VERSION_XVS,
(uint32_t *)state->program_data,
state->program_length, state->z_perspective,
state->vulkan, header, body);
} else {
assert(false);
}
/* Fog */
if (state->fog_enable) {
if (state->vertex_program) {
/* FIXME: Does foggen do something here? Let's do some tracking..
*
* "RollerCoaster Tycoon" has
* state->vertex_program = true; state->foggen == FOGGEN_PLANAR
* but expects oFog.x as fogdistance?! Writes oFog.xyzw = v0.z
*/
mstring_append(body, " float fogDistance = oFog.x;\n");
}
/* FIXME: Do this per pixel? */
switch (state->fog_mode) {
case FOG_MODE_LINEAR:
case FOG_MODE_LINEAR_ABS:
/* f = (end - d) / (end - start)
* fogParam.y = -1 / (end - start)
* fogParam.x = 1 - end * fogParam.y;
*/
mstring_append(body,
" if (isinf(fogDistance)) {\n"
" fogDistance = 0.0;\n"
" }\n"
);
mstring_append(body, " float fogFactor = fogParam.x + fogDistance * fogParam.y;\n");
mstring_append(body, " fogFactor -= 1.0;\n");
break;
case FOG_MODE_EXP:
mstring_append(body,
" if (isinf(fogDistance)) {\n"
" fogDistance = 0.0;\n"
" }\n"
);
/* fallthru */
case FOG_MODE_EXP_ABS:
/* f = 1 / (e^(d * density))
* fogParam.y = -density / (2 * ln(256))
* fogParam.x = 1.5
*/
mstring_append(body, " float fogFactor = fogParam.x + exp2(fogDistance * fogParam.y * 16.0);\n");
mstring_append(body, " fogFactor -= 1.5;\n");
break;
case FOG_MODE_EXP2:
case FOG_MODE_EXP2_ABS:
/* f = 1 / (e^((d * density)^2))
* fogParam.y = -density / (2 * sqrt(ln(256)))
* fogParam.x = 1.5
*/
mstring_append(body, " float fogFactor = fogParam.x + exp2(-fogDistance * fogDistance * fogParam.y * fogParam.y * 32.0);\n");
mstring_append(body, " fogFactor -= 1.5;\n");
break;
default:
assert(false);
break;
}
/* Calculate absolute for the modes which need it */
switch (state->fog_mode) {
case FOG_MODE_LINEAR_ABS:
case FOG_MODE_EXP_ABS:
case FOG_MODE_EXP2_ABS:
mstring_append(body, " fogFactor = abs(fogFactor);\n");
break;
default:
break;
}
mstring_append(body, " oFog.xyzw = vec4(fogFactor);\n");
} else {
/* FIXME: Is the fog still calculated / passed somehow?!
*/
mstring_append(body, " oFog.xyzw = vec4(1.0);\n");
}
/* Set outputs */
const char *shade_model_mult = state->smooth_shading ? "vtx_inv_w" : "vtx_inv_w_flat";
mstring_append_fmt(body, "\n"
" vtxD0 = clamp(oD0, 0.0, 1.0) * %s;\n"
" vtxD1 = clamp(oD1, 0.0, 1.0) * %s;\n"
" vtxB0 = clamp(oB0, 0.0, 1.0) * %s;\n"
" vtxB1 = clamp(oB1, 0.0, 1.0) * %s;\n"
" vtxFog = oFog.x * vtx_inv_w;\n"
" vtxT0 = oT0 * vtx_inv_w;\n"
" vtxT1 = oT1 * vtx_inv_w;\n"
" vtxT2 = oT2 * vtx_inv_w;\n"
" vtxT3 = oT3 * vtx_inv_w;\n"
" gl_Position = oPos;\n"
" gl_PointSize = oPts.x;\n"
// " gl_ClipDistance[0] = oPos.z - oPos.w*clipRange.z;\n" // Near
// " gl_ClipDistance[1] = oPos.w*clipRange.w - oPos.z;\n" // Far
"\n"
"}\n",
shade_model_mult,
shade_model_mult,
shade_model_mult,
shade_model_mult);
/* Return combined header + source */
if (state->vulkan) {
mstring_append_fmt(
output, "layout(binding = %d, std140) uniform VshUniforms {\n%s};\n\n",
VSH_UBO_BINDING, mstring_get_str(uniforms));
// FIXME: Only needed for vk, for gl we use glVertexAttrib
mstring_append_fmt(output,
"layout(push_constant) uniform PushConstants {\n"
"vec4 inlineValue[" stringify(NV2A_VERTEXSHADER_ATTRIBUTES) "];\n"
"};\n\n");
} else {
mstring_append(
output, mstring_get_str(uniforms));
}
mstring_append(output, mstring_get_str(header));
mstring_unref(uniforms);
mstring_unref(header);
mstring_append(output, mstring_get_str(body));
mstring_unref(body);
return output;
}
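The only structural difference between the two backends' generated code is the uniform interface: for Vulkan the uniforms are wrapped in a std140 UBO bound at VSH_UBO_BINDING and the inline vertex-attribute values travel as push constants, while for GL plain uniform declarations are emitted (the u prefix above). Roughly, the emitted preambles look like this (illustrative, assuming NV2A_VERTEXSHADER_CONSTANTS expands to 192 and NV2A_VERTEXSHADER_ATTRIBUTES to 16):

/* Vulkan (VSH_UBO_BINDING == 0):
 *   layout(binding = 0, std140) uniform VshUniforms {
 *       vec4 clipRange;
 *       vec2 surfaceSize;
 *       vec4 c[192];
 *       vec2 fogParam;
 *   };
 *   layout(push_constant) uniform PushConstants {
 *       vec4 inlineValue[16];
 *   };
 *
 * OpenGL:
 *   uniform vec4 clipRange;
 *   uniform vec2 surfaceSize;
 *   uniform vec4 c[192];
 *   uniform vec2 fogParam;
 */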

View File

@ -0,0 +1,33 @@
/*
* Geforce NV2A PGRAPH GLSL Shader Generator
*
* Copyright (c) 2015 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2020-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HW_XBOX_NV2A_PGRAPH_GLSL_VSH_H
#define HW_XBOX_NV2A_PGRAPH_GLSL_VSH_H
#include "qemu/mstring.h"
#include "hw/xbox/nv2a/pgraph/shaders.h"
// FIXME: Move to struct
#define VSH_UBO_BINDING 0
MString *pgraph_gen_vsh_glsl(const ShaderState *state, bool prefix_outputs);
#endif

View File

@ -0,0 +1,19 @@
specific_ss.add(files(
'pgraph.c',
'profile.c',
'rdi.c',
's3tc.c',
'shaders.c',
'swizzle.c',
'texture.c',
'vertex.c',
))
if have_renderdoc
specific_ss.add(files('debug_renderdoc.c'))
endif
subdir('thirdparty')
subdir('null')
subdir('gl')
subdir('glsl')
subdir('vk')
specific_ss.add(nv2a_vsh_cpu)

View File

@ -0,0 +1,3 @@
specific_ss.add([sdl, files(
'renderer.c',
)])

View File

@ -0,0 +1,146 @@
/*
* Geforce NV2A PGRAPH Null Renderer
*
* Copyright (c) 2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "qemu/osdep.h"
#include "qemu/thread.h"
#include "hw/hw.h"
#include "hw/xbox/nv2a/nv2a_int.h"
static void pgraph_null_sync(NV2AState *d)
{
qatomic_set(&d->pgraph.sync_pending, false);
qemu_event_set(&d->pgraph.sync_complete);
}
static void pgraph_null_flush(NV2AState *d)
{
qatomic_set(&d->pgraph.flush_pending, false);
qemu_event_set(&d->pgraph.flush_complete);
}
static void pgraph_null_process_pending(NV2AState *d)
{
if (qatomic_read(&d->pgraph.sync_pending) ||
    qatomic_read(&d->pgraph.flush_pending)) {
qemu_mutex_unlock(&d->pfifo.lock);
qemu_mutex_lock(&d->pgraph.lock);
if (qatomic_read(&d->pgraph.sync_pending)) {
pgraph_null_sync(d);
}
if (qatomic_read(&d->pgraph.flush_pending)) {
pgraph_null_flush(d);
}
qemu_mutex_unlock(&d->pgraph.lock);
qemu_mutex_lock(&d->pfifo.lock);
}
}
static void pgraph_null_clear_report_value(NV2AState *d)
{
}
static void pgraph_null_clear_surface(NV2AState *d, uint32_t parameter)
{
}
static void pgraph_null_draw_begin(NV2AState *d)
{
}
static void pgraph_null_draw_end(NV2AState *d)
{
}
static void pgraph_null_flip_stall(NV2AState *d)
{
}
static void pgraph_null_flush_draw(NV2AState *d)
{
}
static void pgraph_null_get_report(NV2AState *d, uint32_t parameter)
{
pgraph_write_zpass_pixel_cnt_report(d, parameter, 0);
}
static void pgraph_null_image_blit(NV2AState *d)
{
}
static void pgraph_null_pre_savevm_trigger(NV2AState *d)
{
}
static void pgraph_null_pre_savevm_wait(NV2AState *d)
{
}
static void pgraph_null_pre_shutdown_trigger(NV2AState *d)
{
}
static void pgraph_null_pre_shutdown_wait(NV2AState *d)
{
}
static void pgraph_null_process_pending_reports(NV2AState *d)
{
}
static void pgraph_null_surface_update(NV2AState *d, bool upload,
bool color_write, bool zeta_write)
{
}
static void pgraph_null_init(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
pg->null_renderer_state = NULL;
}
static PGRAPHRenderer pgraph_null_renderer = {
.type = CONFIG_DISPLAY_RENDERER_NULL,
.name = "Null",
.ops = {
.init = pgraph_null_init,
.clear_report_value = pgraph_null_clear_report_value,
.clear_surface = pgraph_null_clear_surface,
.draw_begin = pgraph_null_draw_begin,
.draw_end = pgraph_null_draw_end,
.flip_stall = pgraph_null_flip_stall,
.flush_draw = pgraph_null_flush_draw,
.get_report = pgraph_null_get_report,
.image_blit = pgraph_null_image_blit,
.pre_savevm_trigger = pgraph_null_pre_savevm_trigger,
.pre_savevm_wait = pgraph_null_pre_savevm_wait,
.pre_shutdown_trigger = pgraph_null_pre_shutdown_trigger,
.pre_shutdown_wait = pgraph_null_pre_shutdown_wait,
.process_pending = pgraph_null_process_pending,
.process_pending_reports = pgraph_null_process_pending_reports,
.surface_update = pgraph_null_surface_update,
}
};
static void __attribute__((constructor)) register_renderer(void)
{
pgraph_renderer_register(&pgraph_null_renderer);
}
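Backends self-register from constructors before main() runs, so the core never links against a specific renderer; the active one is selected at init time by its CONFIG_DISPLAY_RENDERER type. Note also the lock order in pgraph_null_process_pending() above: the pfifo lock is dropped before the pgraph lock is taken, an ordering every backend has to mirror to avoid deadlocking the pfifo thread. A hypothetical skeleton for a further backend (all names illustrative):

/* Hypothetical additional backend -- sketch only. */
static void pgraph_foo_init(NV2AState *d)
{
    /* allocate backend state and hang it off d->pgraph */
}

static PGRAPHRenderer pgraph_foo_renderer = {
    .type = CONFIG_DISPLAY_RENDERER_NULL, /* a real backend adds an enum value */
    .name = "Foo",
    .ops = {
        .init = pgraph_foo_init,
        /* sync/flush handshake as in pgraph_null_process_pending() */
    },
};

static void __attribute__((constructor)) register_foo_renderer(void)
{
    pgraph_renderer_register(&pgraph_foo_renderer);
}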

2874 hw/xbox/nv2a/pgraph/pgraph.c Normal file

File diff suppressed because it is too large

View File

@ -0,0 +1,383 @@
/*
* QEMU Geforce NV2A PGRAPH internal definitions
*
* Copyright (c) 2012 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2018-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HW_XBOX_NV2A_PGRAPH_H
#define HW_XBOX_NV2A_PGRAPH_H
#include "xemu-config.h"
#include "qemu/osdep.h"
#include "qemu/bitmap.h"
#include "qemu/units.h"
#include "qemu/thread.h"
#include "cpu.h"
#include "shaders.h"
#include "surface.h"
#include "util.h"
typedef struct NV2AState NV2AState;
typedef struct PGRAPHNullState PGRAPHNullState;
typedef struct PGRAPHGLState PGRAPHGLState;
typedef struct PGRAPHVkState PGRAPHVkState;
typedef struct VertexAttribute {
bool dma_select;
hwaddr offset;
/* inline arrays are packed in order?
* Need to pass the offset to converted attributes */
unsigned int inline_array_offset;
float inline_value[4];
unsigned int format;
unsigned int size; /* size of the data type */
unsigned int count; /* number of components */
uint32_t stride;
bool needs_conversion;
float *inline_buffer;
bool inline_buffer_populated;
} VertexAttribute;
typedef struct Surface {
bool draw_dirty;
bool buffer_dirty;
bool write_enabled_cache;
unsigned int pitch;
hwaddr offset;
} Surface;
typedef struct KelvinState {
hwaddr object_instance;
} KelvinState;
typedef struct ContextSurfaces2DState {
hwaddr object_instance;
hwaddr dma_image_source;
hwaddr dma_image_dest;
unsigned int color_format;
unsigned int source_pitch, dest_pitch;
hwaddr source_offset, dest_offset;
} ContextSurfaces2DState;
typedef struct ImageBlitState {
hwaddr object_instance;
hwaddr context_surfaces;
unsigned int operation;
unsigned int in_x, in_y;
unsigned int out_x, out_y;
unsigned int width, height;
} ImageBlitState;
typedef struct BetaState {
hwaddr object_instance;
uint32_t beta;
} BetaState;
typedef struct PGRAPHRenderer {
CONFIG_DISPLAY_RENDERER type;
const char *name;
struct {
void (*early_context_init)(void);
void (*init)(NV2AState *d);
void (*init_thread)(NV2AState *d);
void (*finalize)(NV2AState *d);
void (*clear_report_value)(NV2AState *d);
void (*clear_surface)(NV2AState *d, uint32_t parameter);
void (*draw_begin)(NV2AState *d);
void (*draw_end)(NV2AState *d);
void (*flip_stall)(NV2AState *d);
void (*flush_draw)(NV2AState *d);
void (*get_report)(NV2AState *d, uint32_t parameter);
void (*image_blit)(NV2AState *d);
void (*pre_savevm_trigger)(NV2AState *d);
void (*pre_savevm_wait)(NV2AState *d);
void (*pre_shutdown_trigger)(NV2AState *d);
void (*pre_shutdown_wait)(NV2AState *d);
void (*process_pending)(NV2AState *d);
void (*process_pending_reports)(NV2AState *d);
void (*surface_flush)(NV2AState *d);
void (*surface_update)(NV2AState *d, bool upload, bool color_write, bool zeta_write);
void (*set_surface_scale_factor)(NV2AState *d, unsigned int scale);
unsigned int (*get_surface_scale_factor)(NV2AState *d);
int (*get_framebuffer_surface)(NV2AState *d);
} ops;
} PGRAPHRenderer;
typedef struct PGRAPHState {
QemuMutex lock;
uint32_t pending_interrupts;
uint32_t enabled_interrupts;
int frame_time;
int draw_time;
/* subchannels state we're not sure the location of... */
ContextSurfaces2DState context_surfaces_2d;
ImageBlitState image_blit;
KelvinState kelvin;
BetaState beta;
hwaddr dma_color, dma_zeta;
Surface surface_color, surface_zeta;
unsigned int surface_type;
SurfaceShape surface_shape;
SurfaceShape last_surface_shape;
struct {
int clip_x;
int clip_width;
int clip_y;
int clip_height;
int width;
int height;
} surface_binding_dim; // FIXME: Refactor
hwaddr dma_a, dma_b;
bool texture_dirty[NV2A_MAX_TEXTURES];
bool texture_matrix_enable[NV2A_MAX_TEXTURES];
hwaddr dma_state;
hwaddr dma_notifies;
hwaddr dma_semaphore;
hwaddr dma_report;
hwaddr report_offset;
bool zpass_pixel_count_enable;
hwaddr dma_vertex_a, dma_vertex_b;
uint32_t primitive_mode;
bool enable_vertex_program_write; // FIXME: Not used anywhere???
uint32_t vertex_state_shader_v0[4];
uint32_t program_data[NV2A_MAX_TRANSFORM_PROGRAM_LENGTH][VSH_TOKEN_SIZE];
bool program_data_dirty;
uint32_t vsh_constants[NV2A_VERTEXSHADER_CONSTANTS][4];
bool vsh_constants_dirty[NV2A_VERTEXSHADER_CONSTANTS];
/* lighting constant arrays */
uint32_t ltctxa[NV2A_LTCTXA_COUNT][4];
bool ltctxa_dirty[NV2A_LTCTXA_COUNT];
uint32_t ltctxb[NV2A_LTCTXB_COUNT][4];
bool ltctxb_dirty[NV2A_LTCTXB_COUNT];
uint32_t ltc1[NV2A_LTC1_COUNT][4];
bool ltc1_dirty[NV2A_LTC1_COUNT];
float material_alpha;
// should figure out where these are in lighting context
float light_infinite_half_vector[NV2A_MAX_LIGHTS][3];
float light_infinite_direction[NV2A_MAX_LIGHTS][3];
float light_local_position[NV2A_MAX_LIGHTS][3];
float light_local_attenuation[NV2A_MAX_LIGHTS][3];
float point_params[8];
VertexAttribute vertex_attributes[NV2A_VERTEXSHADER_ATTRIBUTES];
uint16_t compressed_attrs;
uint16_t uniform_attrs;
uint16_t swizzle_attrs;
unsigned int inline_array_length;
uint32_t inline_array[NV2A_MAX_BATCH_LENGTH];
unsigned int inline_elements_length;
uint32_t inline_elements[NV2A_MAX_BATCH_LENGTH];
unsigned int inline_buffer_length;
unsigned int draw_arrays_length;
unsigned int draw_arrays_min_start;
unsigned int draw_arrays_max_count;
/* FIXME: Unknown size, possibly endless, 1250 will do for now */
/* Keep in sync with size used in nv2a.c */
int32_t draw_arrays_start[1250];
int32_t draw_arrays_count[1250];
bool draw_arrays_prevent_connect;
uint32_t regs_[0x2000];
DECLARE_BITMAP(regs_dirty, 0x2000 / sizeof(uint32_t));
bool clearing;
bool waiting_for_nop;
bool waiting_for_flip;
bool waiting_for_context_switch;
bool flush_pending;
QemuEvent flush_complete;
bool sync_pending;
QemuEvent sync_complete;
unsigned int surface_scale_factor;
uint8_t *scale_buf;
const PGRAPHRenderer *renderer;
union {
PGRAPHNullState *null_renderer_state;
PGRAPHGLState *gl_renderer_state;
PGRAPHVkState *vk_renderer_state;
};
} PGRAPHState;
void pgraph_init(NV2AState *d);
void pgraph_init_thread(NV2AState *d);
void pgraph_destroy(PGRAPHState *pg);
void pgraph_context_switch(NV2AState *d, unsigned int channel_id);
int pgraph_method(NV2AState *d, unsigned int subchannel, unsigned int method,
uint32_t parameter, uint32_t *parameters,
size_t num_words_available, size_t max_lookahead_words,
bool inc);
void pgraph_check_within_begin_end_block(PGRAPHState *pg);
void *pfifo_thread(void *arg);
void pfifo_kick(NV2AState *d);
void pgraph_renderer_register(const PGRAPHRenderer *renderer);
// FIXME: Move from here
extern NV2AState *g_nv2a;
// FIXME: Add new function pgraph_is_texture_sampler_active()
static inline uint32_t pgraph_reg_r(PGRAPHState *pg, unsigned int r)
{
assert(r % 4 == 0);
return pg->regs_[r];
}
static inline void pgraph_reg_w(PGRAPHState *pg, unsigned int r, uint32_t v)
{
assert(r % 4 == 0);
if (pg->regs_[r] != v) {
bitmap_set(pg->regs_dirty, r / sizeof(uint32_t), 1);
}
pg->regs_[r] = v;
}
void pgraph_clear_dirty_reg_map(PGRAPHState *pg);
static inline bool pgraph_is_reg_dirty(PGRAPHState *pg, unsigned int reg)
{
return test_bit(reg / sizeof(uint32_t), pg->regs_dirty);
}
static inline bool pgraph_is_texture_stage_active(PGRAPHState *pg, unsigned int stage)
{
assert(stage < NV2A_MAX_TEXTURES);
uint32_t mode = (pgraph_reg_r(pg, NV_PGRAPH_SHADERPROG) >> (stage * 5)) & 0x1F;
return mode != 0 && mode != 4;// && mode != 0x11 && mode != 0x0a && mode != 0x09 && mode != 5;
}
static inline bool pgraph_is_texture_enabled(PGRAPHState *pg, int texture_idx)
{
uint32_t ctl_0 = pgraph_reg_r(pg, NV_PGRAPH_TEXCTL0_0 + texture_idx*4);
return // pgraph_is_texture_stage_active(pg, texture_idx) &&
GET_MASK(ctl_0, NV_PGRAPH_TEXCTL0_0_ENABLE);
}
static inline bool pgraph_is_texture_format_compressed(PGRAPHState *pg, int color_format)
{
return color_format == NV097_SET_TEXTURE_FORMAT_COLOR_L_DXT1_A1R5G5B5 ||
color_format == NV097_SET_TEXTURE_FORMAT_COLOR_L_DXT23_A8R8G8B8 ||
color_format == NV097_SET_TEXTURE_FORMAT_COLOR_L_DXT45_A8R8G8B8;
}
static inline bool pgraph_color_write_enabled(PGRAPHState *pg)
{
return pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0) & (
NV_PGRAPH_CONTROL_0_ALPHA_WRITE_ENABLE
| NV_PGRAPH_CONTROL_0_RED_WRITE_ENABLE
| NV_PGRAPH_CONTROL_0_GREEN_WRITE_ENABLE
| NV_PGRAPH_CONTROL_0_BLUE_WRITE_ENABLE);
}
static inline bool pgraph_zeta_write_enabled(PGRAPHState *pg)
{
return pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0) & (
NV_PGRAPH_CONTROL_0_ZWRITEENABLE
| NV_PGRAPH_CONTROL_0_STENCIL_WRITE_ENABLE);
}
static inline void pgraph_apply_anti_aliasing_factor(PGRAPHState *pg,
unsigned int *width,
unsigned int *height)
{
switch (pg->surface_shape.anti_aliasing) {
case NV097_SET_SURFACE_FORMAT_ANTI_ALIASING_CENTER_1:
break;
case NV097_SET_SURFACE_FORMAT_ANTI_ALIASING_CENTER_CORNER_2:
if (width) { *width *= 2; }
break;
case NV097_SET_SURFACE_FORMAT_ANTI_ALIASING_SQUARE_OFFSET_4:
if (width) { *width *= 2; }
if (height) { *height *= 2; }
break;
default:
assert(false);
break;
}
}
static inline void pgraph_apply_scaling_factor(PGRAPHState *pg,
unsigned int *width,
unsigned int *height)
{
*width *= pg->surface_scale_factor;
*height *= pg->surface_scale_factor;
}
void pgraph_get_clear_color(PGRAPHState *pg, float rgba[4]);
void pgraph_get_clear_depth_stencil_value(PGRAPHState *pg, float *depth, int *stencil);
/* Vertex */
void pgraph_allocate_inline_buffer_vertices(PGRAPHState *pg, unsigned int attr);
void pgraph_finish_inline_buffer_vertex(PGRAPHState *pg);
void pgraph_reset_inline_buffers(PGRAPHState *pg);
void pgraph_reset_draw_arrays(PGRAPHState *pg);
void pgraph_update_inline_value(VertexAttribute *attr, const uint8_t *data);
/* RDI */
uint32_t pgraph_rdi_read(PGRAPHState *pg, unsigned int select,
unsigned int address);
void pgraph_rdi_write(PGRAPHState *pg, unsigned int select,
unsigned int address, uint32_t val);
static inline void pgraph_argb_pack32_to_rgba_float(uint32_t argb, float *rgba)
{
rgba[0] = ((argb >> 16) & 0xFF) / 255.0f; /* red */
rgba[1] = ((argb >> 8) & 0xFF) / 255.0f; /* green */
rgba[2] = (argb & 0xFF) / 255.0f; /* blue */
rgba[3] = ((argb >> 24) & 0xFF) / 255.0f; /* alpha */
}
void pgraph_write_zpass_pixel_cnt_report(NV2AState *d, uint32_t parameter, uint32_t result);
#endif
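pgraph_reg_w() flips a bit in regs_dirty only when a write actually changes a register's value, so render backends can poll for state changes cheaply and reset the map once it has been consumed. An illustrative consumer (not from the commit):

/* Illustrative only -- assumes the declarations above are in scope. */
static void example_rebuild_combiners_if_needed(PGRAPHState *pg)
{
    if (pgraph_is_reg_dirty(pg, NV_PGRAPH_COMBINECTL)) {
        uint32_t ctl = pgraph_reg_r(pg, NV_PGRAPH_COMBINECTL);
        /* ... re-derive combiner state from ctl ... */
        (void)ctl;
    }
    /* typically once per draw, after all dirty registers are serviced: */
    pgraph_clear_dirty_reg_map(pg);
}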

View File

@ -0,0 +1,74 @@
/*
* QEMU Geforce NV2A profiling helpers
*
* Copyright (c) 2020-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "../nv2a_int.h"
NV2AStats g_nv2a_stats;
void nv2a_profile_increment(void)
{
int64_t now = qemu_clock_get_us(QEMU_CLOCK_REALTIME);
const int64_t fps_update_interval = 250000;
g_nv2a_stats.last_flip_time = now;
static int64_t frame_count = 0;
frame_count++;
static int64_t ts = 0;
int64_t delta = now - ts;
if (delta >= fps_update_interval) {
g_nv2a_stats.increment_fps = frame_count * 1000000 / delta;
ts = now;
frame_count = 0;
}
}
void nv2a_profile_flip_stall(void)
{
int64_t now = qemu_clock_get_us(QEMU_CLOCK_REALTIME);
int64_t render_time = (now-g_nv2a_stats.last_flip_time)/1000;
g_nv2a_stats.frame_working.mspf = render_time;
g_nv2a_stats.frame_history[g_nv2a_stats.frame_ptr] =
g_nv2a_stats.frame_working;
g_nv2a_stats.frame_ptr =
(g_nv2a_stats.frame_ptr + 1) % NV2A_PROF_NUM_FRAMES;
g_nv2a_stats.frame_count++;
memset(&g_nv2a_stats.frame_working, 0, sizeof(g_nv2a_stats.frame_working));
}
const char *nv2a_profile_get_counter_name(unsigned int cnt)
{
const char *default_names[NV2A_PROF__COUNT] = {
#define _X(x) stringify(x),
NV2A_PROF_COUNTERS_XMAC
#undef _X
};
assert(cnt < NV2A_PROF__COUNT);
return default_names[cnt] + 10; /* 'NV2A_PROF_' */
}
int nv2a_profile_get_counter_value(unsigned int cnt)
{
assert(cnt < NV2A_PROF__COUNT);
unsigned int idx = (g_nv2a_stats.frame_ptr + NV2A_PROF_NUM_FRAMES - 1) %
NV2A_PROF_NUM_FRAMES;
return g_nv2a_stats.frame_history[idx].counters[cnt];
}
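Counters accumulate in frame_working and are committed into a NV2A_PROF_NUM_FRAMES-deep ring on each flip stall, so nv2a_profile_get_counter_value() always reads the most recently completed frame (frame_ptr - 1, modulo the ring size). A sketch of the producing side, assuming an increment helper of roughly this shape lives alongside these functions:

/* Assumed shape of the per-frame counter increment used by renderers. */
static inline void nv2a_profile_inc_counter(unsigned int cnt)
{
    assert(cnt < NV2A_PROF__COUNT);
    g_nv2a_stats.frame_working.counters[cnt]++;
}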

View File

@ -20,7 +20,8 @@
#ifndef HW_NV2A_PSH_H
#define HW_NV2A_PSH_H
#include "shaders_common.h"
#include <stdint.h>
#include <stdbool.h>
enum PshAlphaFunc {
ALPHA_FUNC_NEVER,
@ -51,6 +52,8 @@ enum ConvolutionFilter {
};
typedef struct PshState {
bool vulkan;
/* fragment shader - register combiner stuff */
uint32_t combiner_control;
uint32_t shader_stage_program;
@ -67,6 +70,7 @@ typedef struct PshState {
bool compare_mode[4][4];
bool alphakill[4];
enum ConvolutionFilter conv_tex[4];
bool tex_x8y24[4];
float border_logical_size[4][3];
float border_inv_real_size[4][3];
@ -82,6 +86,4 @@ typedef struct PshState {
bool smooth_shading;
} PshState;
MString *psh_translate(const PshState state);
#endif

60 hw/xbox/nv2a/pgraph/rdi.c Normal file
View File

@ -0,0 +1,60 @@
/*
* QEMU Geforce NV2A implementation
*
* Copyright (c) 2012 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2018-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "../nv2a_int.h"
uint32_t pgraph_rdi_read(PGRAPHState *pg, unsigned int select,
unsigned int address)
{
uint32_t r = 0;
switch (select) {
case RDI_INDEX_VTX_CONSTANTS0:
case RDI_INDEX_VTX_CONSTANTS1:
assert((address / 4) < NV2A_VERTEXSHADER_CONSTANTS);
r = pg->vsh_constants[address / 4][3 - address % 4];
break;
default:
fprintf(stderr, "nv2a: unknown rdi read select 0x%x address 0x%x\n",
select, address);
assert(false);
break;
}
return r;
}
void pgraph_rdi_write(PGRAPHState *pg, unsigned int select,
unsigned int address, uint32_t val)
{
switch (select) {
case RDI_INDEX_VTX_CONSTANTS0:
case RDI_INDEX_VTX_CONSTANTS1:
assert(false); /* Untested */
assert((address / 4) < NV2A_VERTEXSHADER_CONSTANTS);
pg->vsh_constants_dirty[address / 4] |=
(val != pg->vsh_constants[address / 4][3 - address % 4]);
pg->vsh_constants[address / 4][3 - address % 4] = val;
break;
default:
NV2A_DPRINTF("unknown rdi write select 0x%x, address 0x%x, val 0x%08x\n",
select, address, val);
break;
}
}
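The 3 - address % 4 term reverses component order: the RDI window exposes each 128-bit constant with its four 32-bit words swapped end to end. A worked decode of the indexing above:

/*   address 0 -> vsh_constants[0][3]
 *   address 1 -> vsh_constants[0][2]
 *   address 2 -> vsh_constants[0][1]
 *   address 3 -> vsh_constants[0][0]
 *   address 4 -> vsh_constants[1][3]
 */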

View File

@ -1,5 +1,5 @@
/*
* QEMU texture decompression routines
* S3TC Texture Decompression
*
* Copyright (c) 2020 Wilhelm Kovatch
*
@ -25,13 +25,9 @@
#include "qemu/osdep.h"
#include "s3tc.h"
static inline void decode_bc1_colors(uint16_t c0,
uint16_t c1,
uint8_t r[4],
uint8_t g[4],
uint8_t b[4],
uint8_t a[16],
bool transparent)
static void decode_bc1_colors(uint16_t c0, uint16_t c1, uint8_t r[4],
uint8_t g[4], uint8_t b[4], uint8_t a[16],
bool transparent)
{
r[0] = ((c0 & 0xF800) >> 8) * 0xFF / 0xF8,
g[0] = ((c0 & 0x07E0) >> 3) * 0xFF / 0xFC,
@ -66,15 +62,10 @@ static inline void decode_bc1_colors(uint16_t c0,
}
}
static inline void write_block_to_texture(uint8_t *converted_data,
uint32_t indices,
int i, int j, int width,
int z_pos_factor,
uint8_t r[4],
uint8_t g[4],
uint8_t b[4],
uint8_t a[16],
bool separate_alpha)
static void write_block_to_texture(uint8_t *converted_data, uint32_t indices,
int i, int j, int width, int z_pos_factor,
uint8_t r[4], uint8_t g[4], uint8_t b[4],
uint8_t a[16], bool separate_alpha)
{
int x0 = i * 4,
y0 = j * 4;
@ -89,16 +80,18 @@ static inline void write_block_to_texture(uint8_t *converted_data,
int xy_index = y_index + x - x0;
uint8_t index = (indices >> 2 * xy_index) & 0x03;
uint8_t alpha_index = separate_alpha ? xy_index : index;
uint32_t color = (r[index] << 24) | (g[index] << 16) | (b[index] << 8) | a[alpha_index];
*(uint32_t*)(converted_data + (z_plus_y_pos_factor + x) * 4) = color;
uint8_t *p = converted_data + (z_plus_y_pos_factor + x) * 4;
*p++ = r[index];
*p++ = g[index];
*p++ = b[index];
*p++ = a[alpha_index];
}
}
}
static inline void decompress_dxt1_block(const uint8_t block_data[8],
uint8_t *converted_data,
int i, int j, int width,
int z_pos_factor)
static void decompress_dxt1_block(const uint8_t block_data[8],
uint8_t *converted_data, int i, int j,
int width, int z_pos_factor)
{
uint16_t c0 = ((uint16_t*)block_data)[0],
c1 = ((uint16_t*)block_data)[1];
@ -111,10 +104,9 @@ static inline void decompress_dxt1_block(const uint8_t block_data[8],
r, g, b, a, false);
}
static inline void decompress_dxt3_block(const uint8_t block_data[16],
uint8_t *converted_data,
int i, int j, int width,
int z_pos_factor)
static void decompress_dxt3_block(const uint8_t block_data[16],
uint8_t *converted_data, int i, int j,
int width, int z_pos_factor)
{
uint16_t c0 = ((uint16_t*)block_data)[4],
c1 = ((uint16_t*)block_data)[5];
@ -132,10 +124,9 @@ static inline void decompress_dxt3_block(const uint8_t block_data[16],
r, g, b, a, true);
}
static inline void decompress_dxt5_block(const uint8_t block_data[16],
uint8_t *converted_data,
int i, int j, int width,
int z_pos_factor)
static void decompress_dxt5_block(const uint8_t block_data[16],
uint8_t *converted_data, int i, int j,
int width, int z_pos_factor)
{
uint16_t c0 = ((uint16_t*)block_data)[4],
c1 = ((uint16_t*)block_data)[5];
@ -173,11 +164,9 @@ static inline void decompress_dxt5_block(const uint8_t block_data[16],
r, g, b, a, true);
}
uint8_t *decompress_3d_texture_data(GLint color_format,
const uint8_t *data,
unsigned int width,
unsigned int height,
unsigned int depth)
uint8_t *s3tc_decompress_3d(enum S3TC_DECOMPRESS_FORMAT color_format,
const uint8_t *data, unsigned int width,
unsigned int height, unsigned int depth)
{
assert((width > 0) && (width % 4 == 0));
assert((height > 0) && (height % 4 == 0));
@ -196,13 +185,13 @@ uint8_t *decompress_3d_texture_data(GLint color_format,
int sub_block_index = block_index * block_depth + slice;
int z_pos_factor = (k * block_depth + slice) * width * height;
if (color_format == GL_COMPRESSED_RGBA_S3TC_DXT1_EXT) {
if (color_format == S3TC_DECOMPRESS_FORMAT_DXT1) {
decompress_dxt1_block(data + 8 * sub_block_index, converted_data,
i, j, width, z_pos_factor);
} else if (color_format == GL_COMPRESSED_RGBA_S3TC_DXT3_EXT) {
} else if (color_format == S3TC_DECOMPRESS_FORMAT_DXT3) {
decompress_dxt3_block(data + 16 * sub_block_index, converted_data,
i, j, width, z_pos_factor);
} else if (color_format == GL_COMPRESSED_RGBA_S3TC_DXT5_EXT) {
} else if (color_format == S3TC_DECOMPRESS_FORMAT_DXT5) {
decompress_dxt5_block(data + 16 * sub_block_index, converted_data,
i, j, width, z_pos_factor);
} else {
@ -216,8 +205,9 @@ uint8_t *decompress_3d_texture_data(GLint color_format,
return converted_data;
}
uint8_t *decompress_2d_texture_data(GLint color_format, const uint8_t *data,
unsigned int width, unsigned int height)
uint8_t *s3tc_decompress_2d(enum S3TC_DECOMPRESS_FORMAT color_format,
const uint8_t *data, unsigned int width,
unsigned int height)
{
assert((width > 0) && (width % 4 == 0));
assert((height > 0) && (height % 4 == 0));
@ -226,13 +216,13 @@ uint8_t *decompress_2d_texture_data(GLint color_format, const uint8_t *data,
for (int j = 0; j < num_blocks_y; j++) {
for (int i = 0; i < num_blocks_x; i++) {
int block_index = j * num_blocks_x + i;
if (color_format == GL_COMPRESSED_RGBA_S3TC_DXT1_EXT) {
if (color_format == S3TC_DECOMPRESS_FORMAT_DXT1) {
decompress_dxt1_block(data + 8 * block_index,
converted_data, i, j, width, 0);
} else if (color_format == GL_COMPRESSED_RGBA_S3TC_DXT3_EXT) {
} else if (color_format == S3TC_DECOMPRESS_FORMAT_DXT3) {
decompress_dxt3_block(data + 16 * block_index,
converted_data, i, j, width, 0);
} else if (color_format == GL_COMPRESSED_RGBA_S3TC_DXT5_EXT) {
} else if (color_format == S3TC_DECOMPRESS_FORMAT_DXT5) {
decompress_dxt5_block(data + 16 * block_index,
converted_data, i, j, width, 0);
} else {

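Use of the now backend-neutral entry points is straightforward; a sketch (the g_free() pairing assumes the decoder allocates with g_malloc(), as the other conversion paths in this commit do):

/* Sketch only: decode one 2D DXT1 image to 4-byte-per-pixel RGBA. */
static uint8_t *example_decode_dxt1(const uint8_t *blocks,
                                    unsigned int w, unsigned int h)
{
    /* w and h must be positive multiples of 4 (asserted by the decoder);
     * the caller releases the returned buffer with g_free(). */
    return s3tc_decompress_2d(S3TC_DECOMPRESS_FORMAT_DXT1, blocks, w, h);
}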
View File

@ -1,5 +1,5 @@
/*
* QEMU texture decompression routines
* S3TC Texture Decompression
*
* Copyright (c) 2020 Wilhelm Kovatch
*
@ -22,18 +22,23 @@
* THE SOFTWARE.
*/
#ifndef S3TC_H
#define S3TC_H
#ifndef HW_XBOX_NV2A_PGRAPH_S3TC_H
#define HW_XBOX_NV2A_PGRAPH_S3TC_H
#include "gl/gloffscreen.h"
#include <stdint.h>
uint8_t *decompress_3d_texture_data(GLint color_format,
const uint8_t *data,
unsigned int width,
unsigned int height,
unsigned int depth);
enum S3TC_DECOMPRESS_FORMAT {
S3TC_DECOMPRESS_FORMAT_DXT1,
S3TC_DECOMPRESS_FORMAT_DXT3,
S3TC_DECOMPRESS_FORMAT_DXT5,
};
uint8_t *decompress_2d_texture_data(GLint color_format, const uint8_t *data,
unsigned int width, unsigned int height);
uint8_t *s3tc_decompress_3d(enum S3TC_DECOMPRESS_FORMAT color_format,
const uint8_t *data, unsigned int width,
unsigned int height, unsigned int depth);
uint8_t *s3tc_decompress_2d(enum S3TC_DECOMPRESS_FORMAT color_format,
const uint8_t *data, unsigned int width,
unsigned int height);
#endif

View File

@ -0,0 +1,295 @@
/*
* Geforce NV2A PGRAPH OpenGL Renderer
*
* Copyright (c) 2015 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2020-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "hw/xbox/nv2a/debug.h"
#include "texture.h"
#include "pgraph.h"
#include "shaders.h"
ShaderState pgraph_get_shader_state(PGRAPHState *pg)
{
bool vertex_program = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_D),
NV_PGRAPH_CSV0_D_MODE) == 2;
bool fixed_function = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_D),
NV_PGRAPH_CSV0_D_MODE) == 0;
int program_start = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_C),
NV_PGRAPH_CSV0_C_CHEOPS_PROGRAM_START);
pg->program_data_dirty = false;
ShaderState state;
// We will hash it, so make sure any padding is zeroed
memset(&state, 0, sizeof(ShaderState));
state.vulkan = pg->renderer->type == CONFIG_DISPLAY_RENDERER_VULKAN;
state.surface_scale_factor = pg->surface_scale_factor;
state.compressed_attrs = pg->compressed_attrs;
state.uniform_attrs = pg->uniform_attrs;
state.swizzle_attrs = pg->swizzle_attrs;
/* register combiner stuff */
state.psh.vulkan = state.vulkan;
state.psh.window_clip_exclusive =
pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER) & NV_PGRAPH_SETUPRASTER_WINDOWCLIPTYPE;
state.psh.combiner_control = pgraph_reg_r(pg, NV_PGRAPH_COMBINECTL);
state.psh.shader_stage_program = pgraph_reg_r(pg, NV_PGRAPH_SHADERPROG);
state.psh.other_stage_input = pgraph_reg_r(pg, NV_PGRAPH_SHADERCTL);
state.psh.final_inputs_0 = pgraph_reg_r(pg, NV_PGRAPH_COMBINESPECFOG0);
state.psh.final_inputs_1 = pgraph_reg_r(pg, NV_PGRAPH_COMBINESPECFOG1);
state.psh.alpha_test =
pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0) & NV_PGRAPH_CONTROL_0_ALPHATESTENABLE;
state.psh.alpha_func = (enum PshAlphaFunc)GET_MASK(
pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0), NV_PGRAPH_CONTROL_0_ALPHAFUNC);
state.psh.point_sprite = pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER) &
NV_PGRAPH_SETUPRASTER_POINTSMOOTHENABLE;
state.psh.shadow_depth_func = (enum PshShadowDepthFunc)GET_MASK(
pgraph_reg_r(pg, NV_PGRAPH_SHADOWCTL), NV_PGRAPH_SHADOWCTL_SHADOW_ZFUNC);
state.fixed_function = fixed_function;
/* fixed function stuff */
if (fixed_function) {
state.skinning = (enum VshSkinning)GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_D),
NV_PGRAPH_CSV0_D_SKIN);
state.lighting =
GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_C), NV_PGRAPH_CSV0_C_LIGHTING);
state.normalization =
pgraph_reg_r(pg, NV_PGRAPH_CSV0_C) & NV_PGRAPH_CSV0_C_NORMALIZATION_ENABLE;
/* color material */
state.emission_src = (enum MaterialColorSource)GET_MASK(
pgraph_reg_r(pg, NV_PGRAPH_CSV0_C), NV_PGRAPH_CSV0_C_EMISSION);
state.ambient_src = (enum MaterialColorSource)GET_MASK(
pgraph_reg_r(pg, NV_PGRAPH_CSV0_C), NV_PGRAPH_CSV0_C_AMBIENT);
state.diffuse_src = (enum MaterialColorSource)GET_MASK(
pgraph_reg_r(pg, NV_PGRAPH_CSV0_C), NV_PGRAPH_CSV0_C_DIFFUSE);
state.specular_src = (enum MaterialColorSource)GET_MASK(
pgraph_reg_r(pg, NV_PGRAPH_CSV0_C), NV_PGRAPH_CSV0_C_SPECULAR);
}
/* vertex program stuff */
state.vertex_program = vertex_program;
state.z_perspective = pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0) &
NV_PGRAPH_CONTROL_0_Z_PERSPECTIVE_ENABLE;
state.point_params_enable = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_D),
NV_PGRAPH_CSV0_D_POINTPARAMSENABLE);
state.point_size =
GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_POINTSIZE), NV097_SET_POINT_SIZE_V) / 8.0f;
if (state.point_params_enable) {
for (int i = 0; i < 8; i++) {
state.point_params[i] = pg->point_params[i];
}
}
/* geometry shader stuff */
state.primitive_mode = (enum ShaderPrimitiveMode)pg->primitive_mode;
state.polygon_front_mode = (enum ShaderPolygonMode)GET_MASK(
pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER), NV_PGRAPH_SETUPRASTER_FRONTFACEMODE);
state.polygon_back_mode = (enum ShaderPolygonMode)GET_MASK(
pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER), NV_PGRAPH_SETUPRASTER_BACKFACEMODE);
state.smooth_shading = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CONTROL_3),
NV_PGRAPH_CONTROL_3_SHADEMODE) ==
NV_PGRAPH_CONTROL_3_SHADEMODE_SMOOTH;
state.psh.smooth_shading = state.smooth_shading;
state.program_length = 0;
if (vertex_program) {
// copy in vertex program tokens
for (int i = program_start; i < NV2A_MAX_TRANSFORM_PROGRAM_LENGTH;
i++) {
uint32_t *cur_token = (uint32_t *)&pg->program_data[i];
memcpy(&state.program_data[state.program_length], cur_token,
VSH_TOKEN_SIZE * sizeof(uint32_t));
state.program_length++;
if (vsh_get_field(cur_token, FLD_FINAL)) {
break;
}
}
}
/* Texgen */
for (int i = 0; i < 4; i++) {
unsigned int reg = (i < 2) ? NV_PGRAPH_CSV1_A : NV_PGRAPH_CSV1_B;
for (int j = 0; j < 4; j++) {
unsigned int masks[] = {
(i % 2) ? NV_PGRAPH_CSV1_A_T1_S : NV_PGRAPH_CSV1_A_T0_S,
(i % 2) ? NV_PGRAPH_CSV1_A_T1_T : NV_PGRAPH_CSV1_A_T0_T,
(i % 2) ? NV_PGRAPH_CSV1_A_T1_R : NV_PGRAPH_CSV1_A_T0_R,
(i % 2) ? NV_PGRAPH_CSV1_A_T1_Q : NV_PGRAPH_CSV1_A_T0_Q
};
state.texgen[i][j] =
(enum VshTexgen)GET_MASK(pgraph_reg_r(pg, reg), masks[j]);
}
}
/* Fog */
state.fog_enable =
pgraph_reg_r(pg, NV_PGRAPH_CONTROL_3) & NV_PGRAPH_CONTROL_3_FOGENABLE;
if (state.fog_enable) {
/*FIXME: Use CSV0_D? */
state.fog_mode = (enum VshFogMode)GET_MASK(
pgraph_reg_r(pg, NV_PGRAPH_CONTROL_3), NV_PGRAPH_CONTROL_3_FOG_MODE);
state.foggen = (enum VshFoggen)GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_D),
NV_PGRAPH_CSV0_D_FOGGENMODE);
} else {
/* FIXME: Do we still pass the fogmode? */
state.fog_mode = (enum VshFogMode)0;
state.foggen = (enum VshFoggen)0;
}
/* Texture matrices */
for (int i = 0; i < 4; i++) {
state.texture_matrix_enable[i] = pg->texture_matrix_enable[i];
}
/* Lighting */
if (state.lighting) {
for (int i = 0; i < NV2A_MAX_LIGHTS; i++) {
state.light[i] = (enum VshLight)GET_MASK(
pgraph_reg_r(pg, NV_PGRAPH_CSV0_D), NV_PGRAPH_CSV0_D_LIGHT0 << (i * 2));
}
}
/* Copy content of enabled combiner stages */
int num_stages = pgraph_reg_r(pg, NV_PGRAPH_COMBINECTL) & 0xFF;
for (int i = 0; i < num_stages; i++) {
state.psh.rgb_inputs[i] = pgraph_reg_r(pg, NV_PGRAPH_COMBINECOLORI0 + i * 4);
state.psh.rgb_outputs[i] = pgraph_reg_r(pg, NV_PGRAPH_COMBINECOLORO0 + i * 4);
state.psh.alpha_inputs[i] = pgraph_reg_r(pg, NV_PGRAPH_COMBINEALPHAI0 + i * 4);
state.psh.alpha_outputs[i] = pgraph_reg_r(pg, NV_PGRAPH_COMBINEALPHAO0 + i * 4);
// constant_0[i] = pgraph_reg_r(pg, NV_PGRAPH_COMBINEFACTOR0 + i * 4);
// constant_1[i] = pgraph_reg_r(pg, NV_PGRAPH_COMBINEFACTOR1 + i * 4);
}
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
state.psh.compare_mode[i][j] =
(pgraph_reg_r(pg, NV_PGRAPH_SHADERCLIPMODE) >> (4 * i + j)) & 1;
}
uint32_t ctl_0 = pgraph_reg_r(pg, NV_PGRAPH_TEXCTL0_0 + i * 4);
bool enabled = pgraph_is_texture_stage_active(pg, i) &&
(ctl_0 & NV_PGRAPH_TEXCTL0_0_ENABLE);
if (!enabled) {
continue;
}
state.psh.alphakill[i] = ctl_0 & NV_PGRAPH_TEXCTL0_0_ALPHAKILLEN;
uint32_t tex_fmt = pgraph_reg_r(pg, NV_PGRAPH_TEXFMT0 + i * 4);
unsigned int color_format = GET_MASK(tex_fmt, NV_PGRAPH_TEXFMT0_COLOR);
BasicColorFormatInfo f = kelvin_color_format_info_map[color_format];
state.psh.rect_tex[i] = f.linear;
state.psh.tex_x8y24[i] = color_format == NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_DEPTH_X8_Y24_FIXED ||
color_format == NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_DEPTH_X8_Y24_FLOAT;
uint32_t border_source =
GET_MASK(tex_fmt, NV_PGRAPH_TEXFMT0_BORDER_SOURCE);
bool cubemap = GET_MASK(tex_fmt, NV_PGRAPH_TEXFMT0_CUBEMAPENABLE);
state.psh.border_logical_size[i][0] = 0.0f;
state.psh.border_logical_size[i][1] = 0.0f;
state.psh.border_logical_size[i][2] = 0.0f;
if (border_source != NV_PGRAPH_TEXFMT0_BORDER_SOURCE_COLOR) {
if (!f.linear && !cubemap) {
// The actual texture will be (at least) double the reported
// size and shifted by a 4 texel border but texture coordinates
// will still be relative to the reported size.
unsigned int reported_width =
1 << GET_MASK(tex_fmt, NV_PGRAPH_TEXFMT0_BASE_SIZE_U);
unsigned int reported_height =
1 << GET_MASK(tex_fmt, NV_PGRAPH_TEXFMT0_BASE_SIZE_V);
unsigned int reported_depth =
1 << GET_MASK(tex_fmt, NV_PGRAPH_TEXFMT0_BASE_SIZE_P);
state.psh.border_logical_size[i][0] = reported_width;
state.psh.border_logical_size[i][1] = reported_height;
state.psh.border_logical_size[i][2] = reported_depth;
if (reported_width < 8) {
state.psh.border_inv_real_size[i][0] = 0.0625f;
} else {
state.psh.border_inv_real_size[i][0] =
1.0f / (reported_width * 2.0f);
}
if (reported_height < 8) {
state.psh.border_inv_real_size[i][1] = 0.0625f;
} else {
state.psh.border_inv_real_size[i][1] =
1.0f / (reported_height * 2.0f);
}
if (reported_depth < 8) {
state.psh.border_inv_real_size[i][2] = 0.0625f;
} else {
state.psh.border_inv_real_size[i][2] =
1.0f / (reported_depth * 2.0f);
}
} else {
NV2A_UNIMPLEMENTED(
"Border source texture with linear %d cubemap %d", f.linear,
cubemap);
}
}
/* Keep track of whether texture data has been loaded as signed
* normalized integers or not. This dictates whether or not we will need
* to re-map in fragment shader for certain texture modes (e.g.
* bumpenvmap).
*
* FIXME: When signed texture data is loaded as unsigned and remapped in
* fragment shader, there may be interpolation artifacts. Fix this to
* support signed textures more appropriately.
*/
#if 0 // FIXME
state.psh.snorm_tex[i] = (f.gl_internal_format == GL_RGB8_SNORM)
|| (f.gl_internal_format == GL_RG8_SNORM);
#endif
state.psh.shadow_map[i] = f.depth;
uint32_t filter = pgraph_reg_r(pg, NV_PGRAPH_TEXFILTER0 + i * 4);
unsigned int min_filter = GET_MASK(filter, NV_PGRAPH_TEXFILTER0_MIN);
enum ConvolutionFilter kernel = CONVOLUTION_FILTER_DISABLED;
/* FIXME: We do not distinguish between min and mag when
* performing convolution. Just use it if specified for min (common AA
* case).
*/
if (min_filter == NV_PGRAPH_TEXFILTER0_MIN_CONVOLUTION_2D_LOD0) {
int k = GET_MASK(filter, NV_PGRAPH_TEXFILTER0_CONVOLUTION_KERNEL);
assert(k == NV_PGRAPH_TEXFILTER0_CONVOLUTION_KERNEL_QUINCUNX ||
k == NV_PGRAPH_TEXFILTER0_CONVOLUTION_KERNEL_GAUSSIAN_3);
kernel = (enum ConvolutionFilter)k;
}
state.psh.conv_tex[i] = kernel;
}
return state;
}
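Because the struct is memset-zeroed before being filled, padding bytes are deterministic and the whole ShaderState can serve as a byte-wise cache key; the GL backend's shader LRU is keyed on exactly this. A hypothetical probe (the real lookup lives in the backends):

/* Hypothetical -- example_hash() and example_cache_lookup() do not exist. */
static void *example_probe_shader_cache(PGRAPHState *pg)
{
    ShaderState state = pgraph_get_shader_state(pg);
    uint64_t key = example_hash(&state, sizeof(state));
    return example_cache_lookup(key, &state);
}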

View File

@ -18,17 +18,14 @@
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HW_NV2A_SHADERS_H
#define HW_NV2A_SHADERS_H
#ifndef HW_XBOX_NV2A_PGRAPH_SHADERS_H
#define HW_XBOX_NV2A_PGRAPH_SHADERS_H
#include "qemu/thread.h"
#include "qapi/qmp/qstring.h"
#include "gl/gloffscreen.h"
#include <stdint.h>
#include "hw/xbox/nv2a/nv2a_regs.h"
#include "nv2a_regs.h"
#include "vsh.h"
#include "psh.h"
#include "lru.h"
enum ShaderPrimitiveMode {
PRIM_TYPE_INVALID,
@ -57,10 +54,13 @@ enum MaterialColorSource {
};
typedef struct ShaderState {
bool vulkan;
unsigned int surface_scale_factor;
PshState psh;
uint16_t compressed_attrs;
uint16_t uniform_attrs;
uint16_t swizzle_attrs;
bool texture_matrix_enable[4];
enum VshTexgen texgen[4][4];
@ -101,61 +101,8 @@ typedef struct ShaderState {
bool smooth_shading;
} ShaderState;
typedef struct ShaderBinding {
GLuint gl_program;
GLenum gl_primitive_mode;
GLint psh_constant_loc[9][2];
GLint alpha_ref_loc;
GLint bump_mat_loc[NV2A_MAX_TEXTURES];
GLint bump_scale_loc[NV2A_MAX_TEXTURES];
GLint bump_offset_loc[NV2A_MAX_TEXTURES];
GLint tex_scale_loc[NV2A_MAX_TEXTURES];
GLint surface_size_loc;
GLint clip_range_loc;
GLint vsh_constant_loc[NV2A_VERTEXSHADER_CONSTANTS];
uint32_t vsh_constants[NV2A_VERTEXSHADER_CONSTANTS][4];
GLint inv_viewport_loc;
GLint ltctxa_loc[NV2A_LTCTXA_COUNT];
GLint ltctxb_loc[NV2A_LTCTXB_COUNT];
GLint ltc1_loc[NV2A_LTC1_COUNT];
GLint fog_color_loc;
GLint fog_param_loc[2];
GLint light_infinite_half_vector_loc[NV2A_MAX_LIGHTS];
GLint light_infinite_direction_loc[NV2A_MAX_LIGHTS];
GLint light_local_position_loc[NV2A_MAX_LIGHTS];
GLint light_local_attenuation_loc[NV2A_MAX_LIGHTS];
GLint clip_region_loc[8];
GLint material_alpha_loc;
} ShaderBinding;
typedef struct ShaderLruNode {
LruNode node;
bool cached;
void *program;
size_t program_size;
GLenum program_format;
ShaderState state;
ShaderBinding *binding;
QemuThread *save_thread;
} ShaderLruNode;
typedef struct PGRAPHState PGRAPHState;
GLenum get_gl_primitive_mode(enum ShaderPolygonMode polygon_mode, enum ShaderPrimitiveMode primitive_mode);
void update_shader_constant_locations(ShaderBinding *binding, const ShaderState *state);
ShaderBinding *generate_shaders(const ShaderState *state);
void shader_cache_init(PGRAPHState *pg);
void shader_write_cache_reload_list(PGRAPHState *pg);
bool shader_load_from_memory(ShaderLruNode *snode);
void shader_cache_to_disk(ShaderLruNode *snode);
ShaderState pgraph_get_shader_state(PGRAPHState *pg);
#endif

View File

@ -0,0 +1,35 @@
/*
* QEMU Geforce NV2A implementation
*
* Copyright (c) 2012 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2018-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HW_XBOX_NV2A_PGRAPH_SURFACE_H
#define HW_XBOX_NV2A_PGRAPH_SURFACE_H
typedef struct SurfaceShape {
unsigned int z_format;
unsigned int color_format;
unsigned int zeta_format;
unsigned int log_width, log_height;
unsigned int clip_x, clip_y;
unsigned int clip_width, clip_height;
unsigned int anti_aliasing;
} SurfaceShape;
#endif

View File

@ -18,8 +18,10 @@
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HW_XBOX_SWIZZLE_H
#define HW_XBOX_SWIZZLE_H
#ifndef HW_XBOX_NV2A_PGRAPH_SWIZZLE_H
#define HW_XBOX_NV2A_PGRAPH_SWIZZLE_H
#include <stdint.h>
void swizzle_box(
const uint8_t *src_buf,

View File

@ -0,0 +1,405 @@
/*
* QEMU Geforce NV2A implementation
*
* Copyright (c) 2012 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2018-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "hw/xbox/nv2a/nv2a_int.h"
#include "texture.h"
#include "util.h"
const BasicColorFormatInfo kelvin_color_format_info_map[66] = {
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_Y8] = { 1, false },
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_AY8] = { 1, false },
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_A1R5G5B5] = { 2, false },
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_X1R5G5B5] = { 2, false },
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_A4R4G4B4] = { 2, false },
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_R5G6B5] = { 2, false },
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_A8R8G8B8] = { 4, false },
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_X8R8G8B8] = { 4, false },
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_I8_A8R8G8B8] = { 1, false },
[NV097_SET_TEXTURE_FORMAT_COLOR_L_DXT1_A1R5G5B5] = { 4, false },
[NV097_SET_TEXTURE_FORMAT_COLOR_L_DXT23_A8R8G8B8] = { 4, false },
[NV097_SET_TEXTURE_FORMAT_COLOR_L_DXT45_A8R8G8B8] = { 4, false },
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_A1R5G5B5] = { 2, true },
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_R5G6B5] = { 2, true },
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_A8R8G8B8] = { 4, true },
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_Y8] = { 1, true },
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_G8B8] = { 2, true },
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_A8] = { 1, false },
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_A8Y8] = { 2, false },
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_AY8] = { 1, true },
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_X1R5G5B5] = { 2, true },
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_A4R4G4B4] = { 2, true },
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_X8R8G8B8] = { 4, true },
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_A8] = { 1, true },
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_A8Y8] = { 2, true },
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_R6G5B5] = { 2, false },
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_G8B8] = { 2, false },
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_R8B8] = { 2, false },
[NV097_SET_TEXTURE_FORMAT_COLOR_LC_IMAGE_CR8YB8CB8YA8] = { 2, true },
[NV097_SET_TEXTURE_FORMAT_COLOR_LC_IMAGE_YB8CR8YA8CB8] = { 2, true },
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_DEPTH_Y16_FIXED] = { 2, false, true },
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_DEPTH_X8_Y24_FIXED] = { 4, true,
true },
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_DEPTH_X8_Y24_FLOAT] = { 4, true,
true },
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_DEPTH_Y16_FIXED] = { 2, true,
true },
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_DEPTH_Y16_FLOAT] = { 2, true,
true },
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_Y16] = { 2, true },
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_A8B8G8R8] = { 4, false },
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_B8G8R8A8] = { 4, false },
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_R8G8B8A8] = { 4, false },
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_A8B8G8R8] = { 4, true },
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_B8G8R8A8] = { 4, true },
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_R8G8B8A8] = { 4, true },
};
hwaddr pgraph_get_texture_phys_addr(PGRAPHState *pg, int texture_idx)
{
NV2AState *d = container_of(pg, NV2AState, pgraph);
int i = texture_idx;
uint32_t fmt = pgraph_reg_r(pg, NV_PGRAPH_TEXFMT0 + i*4);
unsigned int dma_select =
GET_MASK(fmt, NV_PGRAPH_TEXFMT0_CONTEXT_DMA);
hwaddr offset = pgraph_reg_r(pg, NV_PGRAPH_TEXOFFSET0 + i*4);
hwaddr dma_len;
uint8_t *texture_data;
if (dma_select) {
texture_data = (uint8_t*)nv_dma_map(d, pg->dma_b, &dma_len);
} else {
texture_data = (uint8_t*)nv_dma_map(d, pg->dma_a, &dma_len);
}
assert(offset < dma_len);
texture_data += offset;
return texture_data - d->vram_ptr;
}
hwaddr pgraph_get_texture_palette_phys_addr_length(PGRAPHState *pg, int texture_idx, size_t *length)
{
NV2AState *d = container_of(pg, NV2AState, pgraph);
int i = texture_idx;
uint32_t palette = pgraph_reg_r(pg, NV_PGRAPH_TEXPALETTE0 + i*4);
bool palette_dma_select =
GET_MASK(palette, NV_PGRAPH_TEXPALETTE0_CONTEXT_DMA);
unsigned int palette_length_index =
GET_MASK(palette, NV_PGRAPH_TEXPALETTE0_LENGTH);
unsigned int palette_offset =
palette & NV_PGRAPH_TEXPALETTE0_OFFSET;
unsigned int palette_length = 0;
switch (palette_length_index) {
case NV_PGRAPH_TEXPALETTE0_LENGTH_256: palette_length = 256; break;
case NV_PGRAPH_TEXPALETTE0_LENGTH_128: palette_length = 128; break;
case NV_PGRAPH_TEXPALETTE0_LENGTH_64: palette_length = 64; break;
case NV_PGRAPH_TEXPALETTE0_LENGTH_32: palette_length = 32; break;
default: assert(false); break;
}
if (length) {
*length = palette_length;
}
hwaddr palette_dma_len;
uint8_t *palette_data;
if (palette_dma_select) {
palette_data = (uint8_t*)nv_dma_map(d, pg->dma_b, &palette_dma_len);
} else {
palette_data = (uint8_t*)nv_dma_map(d, pg->dma_a, &palette_dma_len);
}
assert(palette_offset < palette_dma_len);
palette_data += palette_offset;
return palette_data - d->vram_ptr;
}
size_t pgraph_get_texture_length(PGRAPHState *pg, TextureShape *shape)
{
BasicColorFormatInfo f = kelvin_color_format_info_map[shape->color_format];
size_t length = 0;
if (f.linear) {
assert(shape->cubemap == false);
assert(shape->dimensionality == 2);
length = shape->height * shape->pitch;
} else {
if (shape->dimensionality >= 2) {
unsigned int w = shape->width, h = shape->height;
int level;
if (!pgraph_is_texture_format_compressed(pg, shape->color_format)) {
for (level = 0; level < shape->levels; level++) {
w = MAX(w, 1);
h = MAX(h, 1);
length += w * h * f.bytes_per_pixel;
w /= 2;
h /= 2;
}
} else {
/* Compressed textures are a bit different */
unsigned int block_size =
shape->color_format ==
NV097_SET_TEXTURE_FORMAT_COLOR_L_DXT1_A1R5G5B5 ?
8 : 16;
for (level = 0; level < shape->levels; level++) {
w = MAX(w, 1);
h = MAX(h, 1);
unsigned int phys_w = (w + 3) & ~3,
phys_h = (h + 3) & ~3;
length += phys_w/4 * phys_h/4 * block_size;
w /= 2;
h /= 2;
}
}
if (shape->cubemap) {
assert(shape->dimensionality == 2);
length = (length + NV2A_CUBEMAP_FACE_ALIGNMENT - 1) & ~(NV2A_CUBEMAP_FACE_ALIGNMENT - 1);
length *= 6;
}
if (shape->dimensionality >= 3) {
length *= shape->depth;
}
}
}
return length;
}
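A worked instance of the compressed branch, for a 2D DXT1 texture (8-byte blocks) with a 32x32 base and levels = 3:

/* level 0: 32x32 -> (32/4)*(32/4) = 64 blocks -> 512 bytes
 * level 1: 16x16 -> 16 blocks                 -> 128 bytes
 * level 2:  8x8  ->  4 blocks                 ->  32 bytes
 * total  : 672 bytes; a cubemap would round this up to
 * NV2A_CUBEMAP_FACE_ALIGNMENT and multiply by 6.
 */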
TextureShape pgraph_get_texture_shape(PGRAPHState *pg, int texture_idx)
{
int i = texture_idx;
uint32_t ctl_0 = pgraph_reg_r(pg, NV_PGRAPH_TEXCTL0_0 + i*4);
uint32_t ctl_1 = pgraph_reg_r(pg, NV_PGRAPH_TEXCTL1_0 + i*4);
uint32_t fmt = pgraph_reg_r(pg, NV_PGRAPH_TEXFMT0 + i*4);
#if DEBUG_NV2A
uint32_t filter = pgraph_reg_r(pg, NV_PGRAPH_TEXFILTER0 + i*4);
uint32_t address = pgraph_reg_r(pg, NV_PGRAPH_TEXADDRESS0 + i*4);
#endif
unsigned int min_mipmap_level =
GET_MASK(ctl_0, NV_PGRAPH_TEXCTL0_0_MIN_LOD_CLAMP);
unsigned int max_mipmap_level =
GET_MASK(ctl_0, NV_PGRAPH_TEXCTL0_0_MAX_LOD_CLAMP);
unsigned int pitch =
GET_MASK(ctl_1, NV_PGRAPH_TEXCTL1_0_IMAGE_PITCH);
bool cubemap =
GET_MASK(fmt, NV_PGRAPH_TEXFMT0_CUBEMAPENABLE);
unsigned int dimensionality =
GET_MASK(fmt, NV_PGRAPH_TEXFMT0_DIMENSIONALITY);
int tex_mode = (pgraph_reg_r(pg, NV_PGRAPH_SHADERPROG) >> (texture_idx * 5)) & 0x1F;
if (tex_mode == 0x02) {
assert(pgraph_is_texture_enabled(pg, texture_idx));
// assert(state.dimensionality == 3);
// OVERRIDE
// dimensionality = 3;
}
unsigned int color_format = GET_MASK(fmt, NV_PGRAPH_TEXFMT0_COLOR);
unsigned int levels = GET_MASK(fmt, NV_PGRAPH_TEXFMT0_MIPMAP_LEVELS);
unsigned int log_width = GET_MASK(fmt, NV_PGRAPH_TEXFMT0_BASE_SIZE_U);
unsigned int log_height = GET_MASK(fmt, NV_PGRAPH_TEXFMT0_BASE_SIZE_V);
unsigned int log_depth = GET_MASK(fmt, NV_PGRAPH_TEXFMT0_BASE_SIZE_P);
unsigned int rect_width =
GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_TEXIMAGERECT0 + i*4),
NV_PGRAPH_TEXIMAGERECT0_WIDTH);
unsigned int rect_height =
GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_TEXIMAGERECT0 + i*4),
NV_PGRAPH_TEXIMAGERECT0_HEIGHT);
#ifdef DEBUG_NV2A
unsigned int lod_bias =
GET_MASK(filter, NV_PGRAPH_TEXFILTER0_MIPMAP_LOD_BIAS);
#endif
unsigned int border_source = GET_MASK(fmt,
NV_PGRAPH_TEXFMT0_BORDER_SOURCE);
NV2A_DPRINTF(" texture %d is format 0x%x, "
"off 0x%" HWADDR_PRIx " (r %d, %d or %d, %d, %d; %d%s),"
" filter %x %x, levels %d-%d %d bias %d\n",
i, color_format, address,
rect_width, rect_height,
1 << log_width, 1 << log_height, 1 << log_depth,
pitch,
cubemap ? "; cubemap" : "",
GET_MASK(filter, NV_PGRAPH_TEXFILTER0_MIN),
GET_MASK(filter, NV_PGRAPH_TEXFILTER0_MAG),
min_mipmap_level, max_mipmap_level, levels,
lod_bias);
assert(color_format < ARRAY_SIZE(kelvin_color_format_info_map));
BasicColorFormatInfo f = kelvin_color_format_info_map[color_format];
if (f.bytes_per_pixel == 0) {
fprintf(stderr, "nv2a: unimplemented texture color format 0x%x\n",
color_format);
abort();
}
unsigned int width, height, depth;
if (f.linear) {
assert(dimensionality == 2);
width = rect_width;
height = rect_height;
depth = 1;
} else {
width = 1 << log_width;
height = 1 << log_height;
depth = 1 << log_depth;
pitch = 0;
levels = MIN(levels, max_mipmap_level + 1);
/* Discard mipmap levels that would be smaller than 1x1.
* FIXME: Is this actually needed?
*
* >> Level 0: 32 x 4
* Level 1: 16 x 2
* Level 2: 8 x 1
* Level 3: 4 x 1
* Level 4: 2 x 1
* Level 5: 1 x 1
*/
levels = MIN(levels, MAX(log_width, log_height) + 1);
assert(levels > 0);
if (dimensionality == 3) {
/* FIXME: What about 3D mipmaps? */
if (log_width < 2 || log_height < 2) {
/* Base level is smaller than 4x4... */
levels = 1;
} else {
levels = MIN(levels, MIN(log_width, log_height) - 1);
}
}
min_mipmap_level = MIN(levels-1, min_mipmap_level);
max_mipmap_level = MIN(levels-1, max_mipmap_level);
}
TextureShape shape;
// We will hash it, so make sure any padding is zero
memset(&shape, 0, sizeof(shape));
shape.cubemap = cubemap;
shape.dimensionality = dimensionality;
shape.color_format = color_format;
shape.levels = levels;
shape.width = width;
shape.height = height;
shape.depth = depth;
shape.min_mipmap_level = min_mipmap_level;
shape.max_mipmap_level = max_mipmap_level;
shape.pitch = pitch;
shape.border = border_source != NV_PGRAPH_TEXFMT0_BORDER_SOURCE_COLOR;
return shape;
}
uint8_t *pgraph_convert_texture_data(const TextureShape s, const uint8_t *data,
const uint8_t *palette_data,
unsigned int width, unsigned int height,
unsigned int depth, unsigned int row_pitch,
unsigned int slice_pitch,
size_t *converted_size)
{
size_t size = 0;
uint8_t *converted_data;
if (s.color_format == NV097_SET_TEXTURE_FORMAT_COLOR_SZ_I8_A8R8G8B8) {
size = width * height * depth * 4;
converted_data = g_malloc(size);
const uint8_t *src = data;
uint32_t *dst = (uint32_t *)converted_data;
for (int z = 0; z < depth; z++) {
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
uint8_t index = src[y * row_pitch + x];
uint32_t color = *(uint32_t *)(palette_data + index * 4);
*dst++ = color;
}
}
src += slice_pitch;
}
} else if (s.color_format ==
NV097_SET_TEXTURE_FORMAT_COLOR_LC_IMAGE_CR8YB8CB8YA8 ||
s.color_format ==
NV097_SET_TEXTURE_FORMAT_COLOR_LC_IMAGE_YB8CR8YA8CB8) {
// TODO: Investigate whether a non-1 depth is possible.
// Generally the hardware asserts when attempting to use volumetric
// textures in linear formats.
assert(depth == 1); /* FIXME */
// FIXME: only valid if control0 register allows for colorspace
// conversion
size = width * height * 4;
converted_data = g_malloc(size);
uint8_t *pixel = converted_data;
for (int y = 0; y < height; y++) {
const uint8_t *line = &data[y * row_pitch * depth];
for (int x = 0; x < width; x++, pixel += 4) {
if (s.color_format ==
NV097_SET_TEXTURE_FORMAT_COLOR_LC_IMAGE_CR8YB8CB8YA8) {
convert_yuy2_to_rgb(line, x, &pixel[0], &pixel[1],
&pixel[2]);
} else {
convert_uyvy_to_rgb(line, x, &pixel[0], &pixel[1],
&pixel[2]);
}
pixel[3] = 255;
}
}
} else if (s.color_format == NV097_SET_TEXTURE_FORMAT_COLOR_SZ_R6G5B5) {
assert(depth == 1); /* FIXME */
size = width * height * 3;
converted_data = g_malloc(size);
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
uint16_t rgb655 = *(uint16_t *)(data + y * row_pitch + x * 2);
int8_t *pixel = (int8_t *)&converted_data[(y * width + x) * 3];
/* Maps 5 bit G and B signed value range to 8 bit
* signed values. R is probably unsigned.
*/
rgb655 ^= (1 << 9) | (1 << 4);
pixel[0] = ((rgb655 & 0xFC00) >> 10) * 0x7F / 0x3F;
pixel[1] = ((rgb655 & 0x03E0) >> 5) * 0xFF / 0x1F - 0x80;
pixel[2] = (rgb655 & 0x001F) * 0xFF / 0x1F - 0x80;
}
}
} else {
return NULL;
}
if (converted_size) {
*converted_size = size;
}
return converted_data;
}
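/* Usage sketch (caller-side names are assumptions, not from this change):
 * the helper returns a g_malloc'ed buffer for formats that need CPU-side
 * conversion, or NULL when the data can be uploaded as-is.
 */
#if 0 /* illustrative only */
size_t converted_size = 0;
uint8_t *converted =
    pgraph_convert_texture_data(shape, texture_data, palette_data, width,
                                height, 1, row_pitch, 0, &converted_size);
if (converted) {
    upload_texture(converted, converted_size); /* hypothetical upload step */
    g_free(converted);
} else {
    upload_texture(texture_data, texture_length);
}
#endif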

View File

@ -0,0 +1,67 @@
/*
* QEMU Geforce NV2A implementation
*
* Copyright (c) 2012 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2018-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HW_XBOX_NV2A_PGRAPH_TEXTURE_H
#define HW_XBOX_NV2A_PGRAPH_TEXTURE_H
#include "qemu/osdep.h"
#include "cpu.h"
#include <stdbool.h>
#include <stdint.h>
#include "hw/xbox/nv2a/nv2a_regs.h"
typedef struct PGRAPHState PGRAPHState;
typedef struct TextureShape {
bool cubemap;
unsigned int dimensionality;
unsigned int color_format;
unsigned int levels;
unsigned int width, height, depth;
bool border;
unsigned int min_mipmap_level, max_mipmap_level;
unsigned int pitch;
} TextureShape;
typedef struct BasicColorFormatInfo {
unsigned int bytes_per_pixel;
bool linear;
bool depth;
} BasicColorFormatInfo;
extern const BasicColorFormatInfo kelvin_color_format_info_map[66];
uint8_t *pgraph_convert_texture_data(const TextureShape s, const uint8_t *data,
const uint8_t *palette_data,
unsigned int width, unsigned int height,
unsigned int depth, unsigned int row_pitch,
unsigned int slice_pitch,
size_t *converted_size);
hwaddr pgraph_get_texture_phys_addr(PGRAPHState *pg, int texture_idx);
hwaddr pgraph_get_texture_palette_phys_addr_length(PGRAPHState *pg, int texture_idx, size_t *length);
TextureShape pgraph_get_texture_shape(PGRAPHState *pg, int texture_idx);
size_t pgraph_get_texture_length(PGRAPHState *pg, TextureShape *shape);
#endif

View File

@ -1,7 +1,7 @@
/*
* Offscreen OpenGL abstraction layer -- SDL based
*
* Copyright (c) 2018-2021 Matt Borgerson
* Copyright (c) 2018-2024 Matt Borgerson
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal

View File

@ -10,3 +10,9 @@ libnv2a_vsh_cpu = static_library('nv2a_vsh_cpu',
include_directories: ['.', 'nv2a_vsh_cpu/src'])
nv2a_vsh_cpu = declare_dependency(link_with: libnv2a_vsh_cpu,
include_directories: ['nv2a_vsh_cpu/src'])
libgloffscreen = static_library('libgloffscreen',
sources: files('gloffscreen/common.c', 'gloffscreen/sdl.c'),
dependencies: sdl)
gloffscreen = declare_dependency(link_with: libgloffscreen,
include_directories: ['gloffscreen'])

View File

@ -0,0 +1,86 @@
/*
* QEMU Geforce NV2A implementation
*
* Copyright (c) 2012 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2018-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HW_XBOX_NV2A_PGRAPH_UTIL_H
#define HW_XBOX_NV2A_PGRAPH_UTIL_H
static const float f16_max = 511.9375f;
static const float f24_max = 1.0E30;
/* 16 bit to [0.0, F16_MAX = 511.9375] */
static inline
float convert_f16_to_float(uint16_t f16) {
if (f16 == 0x0000) { return 0.0; }
uint32_t i = (f16 << 11) + 0x3C000000;
return *(float*)&i;
}
/* 24 bit to [0.0, F24_MAX] */
static inline
float convert_f24_to_float(uint32_t f24) {
assert(!(f24 >> 24));
f24 &= 0xFFFFFF;
if (f24 == 0x000000) { return 0.0; }
uint32_t i = f24 << 7;
return *(float*)&i;
}
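/* Worked examples (illustrative): both formats are positive-only floats laid
 * out so that a shift-and-bias produces an IEEE-754 single. 1.0f (bit
 * pattern 0x3F800000) corresponds to convert_f16_to_float(0x7000), since
 * (0x7000 << 11) + 0x3C000000 == 0x3F800000, and to
 * convert_f24_to_float(0x7F0000), since 0x7F0000 << 7 == 0x3F800000.
 */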
static inline
uint8_t cliptobyte(int x)
{
return (uint8_t)((x < 0) ? 0 : ((x > 255) ? 255 : x));
}
static inline
void convert_yuy2_to_rgb(const uint8_t *line, unsigned int ix,
uint8_t *r, uint8_t *g, uint8_t* b) {
int c, d, e;
c = (int)line[ix * 2] - 16;
if (ix % 2) {
d = (int)line[ix * 2 - 1] - 128;
e = (int)line[ix * 2 + 1] - 128;
} else {
d = (int)line[ix * 2 + 1] - 128;
e = (int)line[ix * 2 + 3] - 128;
}
*r = cliptobyte((298 * c + 409 * e + 128) >> 8);
*g = cliptobyte((298 * c - 100 * d - 208 * e + 128) >> 8);
*b = cliptobyte((298 * c + 516 * d + 128) >> 8);
}
static inline
void convert_uyvy_to_rgb(const uint8_t *line, unsigned int ix,
uint8_t *r, uint8_t *g, uint8_t* b) {
int c, d, e;
c = (int)line[ix * 2 + 1] - 16;
if (ix % 2) {
d = (int)line[ix * 2 - 2] - 128;
e = (int)line[ix * 2 + 0] - 128;
} else {
d = (int)line[ix * 2 + 0] - 128;
e = (int)line[ix * 2 + 2] - 128;
}
*r = cliptobyte((298 * c + 409 * e + 128) >> 8);
*g = cliptobyte((298 * c - 100 * d - 208 * e + 128) >> 8);
*b = cliptobyte((298 * c + 516 * d + 128) >> 8);
}
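/* Both helpers apply the widely used 8-bit fixed-point BT.601 expansion
 * (coefficients 298/409/208/100/516 with +128 rounding). Sanity check:
 * nominal white, Y=235 and Cb=Cr=128, gives c=219 and d=e=0, so every
 * channel evaluates to (298 * 219 + 128) >> 8 = 255.
 */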
#endif

View File

@ -0,0 +1,131 @@
/*
* QEMU Geforce NV2A implementation
*
* Copyright (c) 2012 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2018-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "hw/xbox/nv2a/nv2a_int.h"
void pgraph_update_inline_value(VertexAttribute *attr, const uint8_t *data)
{
assert(attr->count <= 4);
attr->inline_value[0] = 0.0f;
attr->inline_value[1] = 0.0f;
attr->inline_value[2] = 0.0f;
attr->inline_value[3] = 1.0f;
switch (attr->format) {
case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_UB_D3D:
case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_UB_OGL:
for (uint32_t i = 0; i < attr->count; ++i) {
attr->inline_value[i] = (float)data[i] / 255.0f;
}
break;
case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_S1: {
const int16_t *val = (const int16_t *) data;
for (uint32_t i = 0; i < attr->count; ++i, ++val) {
attr->inline_value[i] = MAX(-1.0f, (float) *val / 32767.0f);
}
break;
}
case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_F:
memcpy(attr->inline_value, data, attr->size * attr->count);
break;
case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_S32K: {
const int16_t *val = (const int16_t *) data;
for (uint32_t i = 0; i < attr->count; ++i, ++val) {
attr->inline_value[i] = (float)*val;
}
break;
}
case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_CMP: {
/* 3 signed, normalized components packed in 32-bits. (11,11,10) */
const int32_t val = *(const int32_t *)data;
int32_t x = val & 0x7FF;
if (x & 0x400) {
x |= 0xFFFFF800;
}
int32_t y = (val >> 11) & 0x7FF;
if (y & 0x400) {
y |= 0xFFFFF800;
}
int32_t z = (val >> 22) & 0x7FF;
if (z & 0x200) {
z |= 0xFFFFFC00;
}
attr->inline_value[0] = MAX(-1.0f, (float)x / 1023.0f);
attr->inline_value[1] = MAX(-1.0f, (float)y / 1023.0f);
attr->inline_value[2] = MAX(-1.0f, (float)z / 511.0f);
break;
}
default:
fprintf(stderr, "Unknown vertex attribute type: for format 0x%x\n",
attr->format);
assert(!"Unsupported attribute type");
break;
}
}
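/* Worked example for the CMP case (illustrative): the packed dword 0x00000401
 * decodes to x = 0x401, sign-extended to -1023, with y = z = 0, yielding an
 * inline value of (-1.0f, 0.0f, 0.0f, 1.0f) after normalization.
 */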
void pgraph_allocate_inline_buffer_vertices(PGRAPHState *pg, unsigned int attr)
{
VertexAttribute *attribute = &pg->vertex_attributes[attr];
if (attribute->inline_buffer_populated || pg->inline_buffer_length == 0) {
return;
}
/* Now upload the previous attribute value */
attribute->inline_buffer_populated = true;
for (int i = 0; i < pg->inline_buffer_length; i++) {
memcpy(&attribute->inline_buffer[i * 4], attribute->inline_value,
sizeof(float) * 4);
}
}
void pgraph_finish_inline_buffer_vertex(PGRAPHState *pg)
{
pgraph_check_within_begin_end_block(pg);
assert(pg->inline_buffer_length < NV2A_MAX_BATCH_LENGTH);
for (int i = 0; i < NV2A_VERTEXSHADER_ATTRIBUTES; i++) {
VertexAttribute *attribute = &pg->vertex_attributes[i];
if (attribute->inline_buffer_populated) {
memcpy(&attribute->inline_buffer[pg->inline_buffer_length * 4],
attribute->inline_value, sizeof(float) * 4);
}
}
pg->inline_buffer_length++;
}
void pgraph_reset_inline_buffers(PGRAPHState *pg)
{
pg->inline_elements_length = 0;
pg->inline_array_length = 0;
pg->inline_buffer_length = 0;
pgraph_reset_draw_arrays(pg);
}
void pgraph_reset_draw_arrays(PGRAPHState *pg)
{
pg->draw_arrays_length = 0;
pg->draw_arrays_min_start = -1;
pg->draw_arrays_max_count = 0;
pg->draw_arrays_prevent_connect = false;
}

View File

@ -0,0 +1,177 @@
/*
* Geforce NV2A PGRAPH Vulkan Renderer
*
* Copyright (c) 2024 Matt Borgerson
*
* Based on GL implementation:
*
* Copyright (c) 2012 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2018-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "hw/xbox/nv2a/nv2a_int.h"
#include "renderer.h"
void pgraph_vk_image_blit(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
ContextSurfaces2DState *context_surfaces = &pg->context_surfaces_2d;
ImageBlitState *image_blit = &pg->image_blit;
BetaState *beta = &pg->beta;
pgraph_vk_surface_update(d, false, true, true);
assert(context_surfaces->object_instance == image_blit->context_surfaces);
unsigned int bytes_per_pixel;
switch (context_surfaces->color_format) {
case NV062_SET_COLOR_FORMAT_LE_Y8:
bytes_per_pixel = 1;
break;
case NV062_SET_COLOR_FORMAT_LE_R5G6B5:
bytes_per_pixel = 2;
break;
case NV062_SET_COLOR_FORMAT_LE_A8R8G8B8:
case NV062_SET_COLOR_FORMAT_LE_X8R8G8B8:
case NV062_SET_COLOR_FORMAT_LE_X8R8G8B8_Z8R8G8B8:
case NV062_SET_COLOR_FORMAT_LE_Y32:
bytes_per_pixel = 4;
break;
default:
fprintf(stderr, "Unknown blit surface format: 0x%x\n",
context_surfaces->color_format);
assert(false);
break;
}
hwaddr source_dma_len, dest_dma_len;
uint8_t *source = (uint8_t *)nv_dma_map(
d, context_surfaces->dma_image_source, &source_dma_len);
assert(context_surfaces->source_offset < source_dma_len);
source += context_surfaces->source_offset;
uint8_t *dest = (uint8_t *)nv_dma_map(d, context_surfaces->dma_image_dest,
&dest_dma_len);
assert(context_surfaces->dest_offset < dest_dma_len);
dest += context_surfaces->dest_offset;
hwaddr source_addr = source - d->vram_ptr;
hwaddr dest_addr = dest - d->vram_ptr;
SurfaceBinding *surf_src = pgraph_vk_surface_get(d, source_addr);
if (surf_src) {
pgraph_vk_surface_download_if_dirty(d, surf_src);
}
SurfaceBinding *surf_dest = pgraph_vk_surface_get(d, dest_addr);
if (surf_dest) {
if (image_blit->height < surf_dest->height ||
image_blit->width < surf_dest->width) {
pgraph_vk_surface_download_if_dirty(d, surf_dest);
} else {
// The blit will completely replace the surface so any pending
// download should be discarded.
surf_dest->download_pending = false;
surf_dest->draw_dirty = false;
}
surf_dest->upload_pending = true;
pg->draw_time++;
}
hwaddr source_offset = image_blit->in_y * context_surfaces->source_pitch +
image_blit->in_x * bytes_per_pixel;
hwaddr dest_offset = image_blit->out_y * context_surfaces->dest_pitch +
image_blit->out_x * bytes_per_pixel;
hwaddr source_size =
(image_blit->height - 1) * context_surfaces->source_pitch +
image_blit->width * bytes_per_pixel;
hwaddr dest_size = (image_blit->height - 1) * context_surfaces->dest_pitch +
image_blit->width * bytes_per_pixel;
/* FIXME: What does hardware do in this case? */
assert(source_addr + source_offset + source_size <=
memory_region_size(d->vram));
assert(dest_addr + dest_offset + dest_size <= memory_region_size(d->vram));
uint8_t *source_row = source + source_offset;
uint8_t *dest_row = dest + dest_offset;
if (image_blit->operation == NV09F_SET_OPERATION_SRCCOPY) {
// NV2A_GL_DPRINTF(false, "NV09F_SET_OPERATION_SRCCOPY");
for (unsigned int y = 0; y < image_blit->height; y++) {
memmove(dest_row, source_row, image_blit->width * bytes_per_pixel);
source_row += context_surfaces->source_pitch;
dest_row += context_surfaces->dest_pitch;
}
} else if (image_blit->operation == NV09F_SET_OPERATION_BLEND_AND) {
// NV2A_GL_DPRINTF(false, "NV09F_SET_OPERATION_BLEND_AND");
uint32_t max_beta_mult = 0x7f80;
uint32_t beta_mult = beta->beta >> 16;
uint32_t inv_beta_mult = max_beta_mult - beta_mult;
for (unsigned int y = 0; y < image_blit->height; y++) {
for (unsigned int x = 0; x < image_blit->width; x++) {
for (unsigned int ch = 0; ch < 3; ch++) {
uint32_t a = source_row[x * 4 + ch] * beta_mult;
uint32_t b = dest_row[x * 4 + ch] * inv_beta_mult;
dest_row[x * 4 + ch] = (a + b) / max_beta_mult;
}
}
source_row += context_surfaces->source_pitch;
dest_row += context_surfaces->dest_pitch;
}
} else {
fprintf(stderr, "Unknown blit operation: 0x%x\n",
image_blit->operation);
assert(false && "Unknown blit operation");
}
NV2A_DPRINTF(" - 0x%tx -> 0x%tx\n", source_addr, dest_addr);
bool needs_alpha_patching;
uint8_t alpha_override;
switch (context_surfaces->color_format) {
case NV062_SET_COLOR_FORMAT_LE_X8R8G8B8:
needs_alpha_patching = true;
alpha_override = 0xff;
break;
case NV062_SET_COLOR_FORMAT_LE_X8R8G8B8_Z8R8G8B8:
needs_alpha_patching = true;
alpha_override = 0;
break;
default:
needs_alpha_patching = false;
alpha_override = 0;
}
if (needs_alpha_patching) {
dest_row = dest + dest_offset;
for (unsigned int y = 0; y < image_blit->height; y++) {
for (unsigned int x = 0; x < image_blit->width; x++) {
dest_row[x * 4 + 3] = alpha_override;
}
dest_row += context_surfaces->dest_pitch;
}
}
dest_addr += dest_offset;
memory_region_set_client_dirty(d->vram, dest_addr, dest_size,
DIRTY_MEMORY_VGA);
memory_region_set_client_dirty(d->vram, dest_addr, dest_size,
DIRTY_MEMORY_NV2A_TEX);
}

View File

@ -0,0 +1,206 @@
/*
* Geforce NV2A PGRAPH Vulkan Renderer
*
* Copyright (c) 2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "renderer.h"
#include <vulkan/vulkan_core.h>
static void create_buffer(PGRAPHState *pg, StorageBuffer *buffer)
{
PGRAPHVkState *r = pg->vk_renderer_state;
VkBufferCreateInfo buffer_create_info = {
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.size = buffer->buffer_size,
.usage = buffer->usage,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
};
VK_CHECK(vmaCreateBuffer(r->allocator, &buffer_create_info,
&buffer->alloc_info, &buffer->buffer,
&buffer->allocation, NULL));
}
static void destroy_buffer(PGRAPHState *pg, StorageBuffer *buffer)
{
PGRAPHVkState *r = pg->vk_renderer_state;
vmaDestroyBuffer(r->allocator, buffer->buffer, buffer->allocation);
buffer->buffer = VK_NULL_HANDLE;
buffer->allocation = VK_NULL_HANDLE;
}
void pgraph_vk_init_buffers(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHVkState *r = pg->vk_renderer_state;
// FIXME: Profile buffer sizes
VmaAllocationCreateInfo host_alloc_create_info = {
.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST,
.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT
};
VmaAllocationCreateInfo device_alloc_create_info = {
.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE,
};
r->storage_buffers[BUFFER_STAGING_DST] = (StorageBuffer){
.alloc_info = host_alloc_create_info,
.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT,
.buffer_size = 4096 * 4096 * 4,
};
r->storage_buffers[BUFFER_STAGING_SRC] = (StorageBuffer){
.alloc_info = host_alloc_create_info,
.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
.buffer_size = r->storage_buffers[BUFFER_STAGING_DST].buffer_size,
};
r->storage_buffers[BUFFER_COMPUTE_DST] = (StorageBuffer){
.alloc_info = device_alloc_create_info,
.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT |
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
.buffer_size = (1024 * 10) * (1024 * 10) * 8,
};
r->storage_buffers[BUFFER_COMPUTE_SRC] = (StorageBuffer){
.alloc_info = device_alloc_create_info,
.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
.buffer_size = r->storage_buffers[BUFFER_COMPUTE_DST].buffer_size,
};
r->storage_buffers[BUFFER_INDEX] = (StorageBuffer){
.alloc_info = device_alloc_create_info,
.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT |
VK_BUFFER_USAGE_INDEX_BUFFER_BIT,
.buffer_size = sizeof(pg->inline_elements) * 100,
};
r->storage_buffers[BUFFER_INDEX_STAGING] = (StorageBuffer){
.alloc_info = host_alloc_create_info,
.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
.buffer_size = r->storage_buffers[BUFFER_INDEX].buffer_size,
};
// FIXME: Don't assume that we can render with host mapped buffer
r->storage_buffers[BUFFER_VERTEX_RAM] = (StorageBuffer){
.alloc_info = host_alloc_create_info,
.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
.buffer_size = memory_region_size(d->vram),
};
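    // One tracking bit per 4 KiB page of guest VRAM (granularity inferred
    // from the divisor below).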
r->bitmap_size = memory_region_size(d->vram) / 4096;
r->uploaded_bitmap = bitmap_new(r->bitmap_size);
bitmap_clear(r->uploaded_bitmap, 0, r->bitmap_size);
r->storage_buffers[BUFFER_VERTEX_INLINE] = (StorageBuffer){
.alloc_info = device_alloc_create_info,
.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT |
VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
.buffer_size = NV2A_VERTEXSHADER_ATTRIBUTES * NV2A_MAX_BATCH_LENGTH *
4 * sizeof(float) * 10,
};
r->storage_buffers[BUFFER_VERTEX_INLINE_STAGING] = (StorageBuffer){
.alloc_info = host_alloc_create_info,
.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
.buffer_size = r->storage_buffers[BUFFER_VERTEX_INLINE].buffer_size,
};
r->storage_buffers[BUFFER_UNIFORM] = (StorageBuffer){
.alloc_info = device_alloc_create_info,
.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT |
VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
.buffer_size = 8 * 1024 * 1024,
};
r->storage_buffers[BUFFER_UNIFORM_STAGING] = (StorageBuffer){
.alloc_info = host_alloc_create_info,
.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
.buffer_size = r->storage_buffers[BUFFER_UNIFORM].buffer_size,
};
for (int i = 0; i < BUFFER_COUNT; i++) {
create_buffer(pg, &r->storage_buffers[i]);
}
// FIXME: Add fallback path for device using host mapped memory
int buffers_to_map[] = { BUFFER_VERTEX_RAM,
BUFFER_INDEX_STAGING,
BUFFER_VERTEX_INLINE_STAGING,
BUFFER_UNIFORM_STAGING };
for (int i = 0; i < ARRAY_SIZE(buffers_to_map); i++) {
VK_CHECK(vmaMapMemory(
r->allocator, r->storage_buffers[buffers_to_map[i]].allocation,
(void **)&r->storage_buffers[buffers_to_map[i]].mapped));
}
}
void pgraph_vk_finalize_buffers(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHVkState *r = pg->vk_renderer_state;
for (int i = 0; i < BUFFER_COUNT; i++) {
if (r->storage_buffers[i].mapped) {
vmaUnmapMemory(r->allocator, r->storage_buffers[i].allocation);
}
destroy_buffer(pg, &r->storage_buffers[i]);
}
g_free(r->uploaded_bitmap);
r->uploaded_bitmap = NULL;
}
bool pgraph_vk_buffer_has_space_for(PGRAPHState *pg, int index,
VkDeviceSize size,
VkDeviceAddress alignment)
{
PGRAPHVkState *r = pg->vk_renderer_state;
StorageBuffer *b = &r->storage_buffers[index];
return (ROUND_UP(b->buffer_offset, alignment) + size) <= b->buffer_size;
}
VkDeviceSize pgraph_vk_append_to_buffer(PGRAPHState *pg, int index, void **data,
VkDeviceSize *sizes, size_t count,
VkDeviceAddress alignment)
{
PGRAPHVkState *r = pg->vk_renderer_state;
VkDeviceSize total_size = 0;
for (int i = 0; i < count; i++) {
total_size += sizes[i];
}
assert(pgraph_vk_buffer_has_space_for(pg, index, total_size, alignment));
StorageBuffer *b = &r->storage_buffers[index];
VkDeviceSize starting_offset = ROUND_UP(b->buffer_offset, alignment);
assert(b->mapped);
for (int i = 0; i < count; i++) {
b->buffer_offset = ROUND_UP(b->buffer_offset, alignment);
memcpy(b->mapped + b->buffer_offset, data[i], sizes[i]);
b->buffer_offset += sizes[i];
}
return starting_offset;
}
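/* Usage sketch (variable names illustrative): stage two runs of vertex data
 * with 16-byte alignment and receive the base offset of the first run.
 */
#if 0
void *chunks[] = { positions, colors };
VkDeviceSize sizes[] = { positions_size, colors_size };
VkDeviceSize base = pgraph_vk_append_to_buffer(
    pg, BUFFER_VERTEX_INLINE_STAGING, chunks, sizes, 2, 16);
#endif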

View File

@ -0,0 +1,119 @@
/*
* Geforce NV2A PGRAPH Vulkan Renderer
*
* Copyright (c) 2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "renderer.h"
static void create_command_pool(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
QueueFamilyIndices indices =
pgraph_vk_find_queue_families(r->physical_device);
VkCommandPoolCreateInfo create_info = {
.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
.queueFamilyIndex = indices.queue_family,
};
VK_CHECK(
vkCreateCommandPool(r->device, &create_info, NULL, &r->command_pool));
}
static void destroy_command_pool(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
vkDestroyCommandPool(r->device, r->command_pool, NULL);
}
static void create_command_buffers(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
VkCommandBufferAllocateInfo alloc_info = {
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
.commandPool = r->command_pool,
.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
.commandBufferCount = ARRAY_SIZE(r->command_buffers),
};
VK_CHECK(
vkAllocateCommandBuffers(r->device, &alloc_info, r->command_buffers));
r->command_buffer = r->command_buffers[0];
r->aux_command_buffer = r->command_buffers[1];
}
static void destroy_command_buffers(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
vkFreeCommandBuffers(r->device, r->command_pool,
ARRAY_SIZE(r->command_buffers), r->command_buffers);
r->command_buffer = VK_NULL_HANDLE;
r->aux_command_buffer = VK_NULL_HANDLE;
}
VkCommandBuffer pgraph_vk_begin_single_time_commands(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
assert(!r->in_aux_command_buffer);
r->in_aux_command_buffer = true;
VkCommandBufferBeginInfo begin_info = {
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
};
VK_CHECK(vkBeginCommandBuffer(r->aux_command_buffer, &begin_info));
return r->aux_command_buffer;
}
void pgraph_vk_end_single_time_commands(PGRAPHState *pg, VkCommandBuffer cmd)
{
PGRAPHVkState *r = pg->vk_renderer_state;
assert(r->in_aux_command_buffer);
VK_CHECK(vkEndCommandBuffer(cmd));
VkSubmitInfo submit_info = {
.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
.commandBufferCount = 1,
.pCommandBuffers = &cmd,
};
VK_CHECK(vkQueueSubmit(r->queue, 1, &submit_info, VK_NULL_HANDLE));
nv2a_profile_inc_counter(NV2A_PROF_QUEUE_SUBMIT_AUX);
VK_CHECK(vkQueueWaitIdle(r->queue));
r->in_aux_command_buffer = false;
}
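/* Typical one-shot pattern (sketch; the buffer copy is illustrative):
 */
#if 0
VkCommandBuffer cmd = pgraph_vk_begin_single_time_commands(pg);
vkCmdCopyBuffer(cmd, staging_buffer, device_buffer, 1, &copy_region);
pgraph_vk_end_single_time_commands(pg, cmd); /* submits and waits idle */
#endif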
void pgraph_vk_init_command_buffers(PGRAPHState *pg)
{
create_command_pool(pg);
create_command_buffers(pg);
}
void pgraph_vk_finalize_command_buffers(PGRAPHState *pg)
{
destroy_command_buffers(pg);
destroy_command_pool(pg);
}

View File

@ -0,0 +1,418 @@
/*
* Geforce NV2A PGRAPH Vulkan Renderer
*
* Copyright (c) 2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HW_XBOX_NV2A_PGRAPH_VK_CONSTANTS_H
#define HW_XBOX_NV2A_PGRAPH_VK_CONSTANTS_H
#include "hw/xbox/nv2a/nv2a_regs.h"
#include "hw/xbox/nv2a/pgraph/shaders.h"
#include <vulkan/vulkan.h>
static const VkFilter pgraph_texture_min_filter_vk_map[] = {
0,
VK_FILTER_NEAREST,
VK_FILTER_LINEAR,
VK_FILTER_NEAREST,
VK_FILTER_LINEAR,
VK_FILTER_NEAREST,
VK_FILTER_LINEAR,
VK_FILTER_LINEAR,
};
static const VkFilter pgraph_texture_mag_filter_vk_map[] = {
0,
VK_FILTER_NEAREST,
VK_FILTER_LINEAR,
0,
VK_FILTER_LINEAR /* TODO: Convolution filter... */
};
static const VkSamplerAddressMode pgraph_texture_addr_vk_map[] = {
0,
VK_SAMPLER_ADDRESS_MODE_REPEAT,
VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT,
VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER,
VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, /* Approximate GL_CLAMP */
};
static const VkBlendFactor pgraph_blend_factor_vk_map[] = {
VK_BLEND_FACTOR_ZERO,
VK_BLEND_FACTOR_ONE,
VK_BLEND_FACTOR_SRC_COLOR,
VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR,
VK_BLEND_FACTOR_SRC_ALPHA,
VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA,
VK_BLEND_FACTOR_DST_ALPHA,
VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA,
VK_BLEND_FACTOR_DST_COLOR,
VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR,
VK_BLEND_FACTOR_SRC_ALPHA_SATURATE,
0,
VK_BLEND_FACTOR_CONSTANT_COLOR,
VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR,
VK_BLEND_FACTOR_CONSTANT_ALPHA,
VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA,
};
static const VkBlendOp pgraph_blend_equation_vk_map[] = {
VK_BLEND_OP_SUBTRACT,
VK_BLEND_OP_REVERSE_SUBTRACT,
VK_BLEND_OP_ADD,
VK_BLEND_OP_MIN,
VK_BLEND_OP_MAX,
VK_BLEND_OP_REVERSE_SUBTRACT,
VK_BLEND_OP_ADD,
};
/* FIXME
static const GLenum pgraph_blend_logicop_map[] = {
GL_CLEAR,
GL_AND,
GL_AND_REVERSE,
GL_COPY,
GL_AND_INVERTED,
GL_NOOP,
GL_XOR,
GL_OR,
GL_NOR,
GL_EQUIV,
GL_INVERT,
GL_OR_REVERSE,
GL_COPY_INVERTED,
GL_OR_INVERTED,
GL_NAND,
GL_SET,
};
*/
static const VkCullModeFlags pgraph_cull_face_vk_map[] = {
0,
VK_CULL_MODE_FRONT_BIT,
VK_CULL_MODE_BACK_BIT,
VK_CULL_MODE_FRONT_AND_BACK,
};
static const VkCompareOp pgraph_depth_func_vk_map[] = {
VK_COMPARE_OP_NEVER,
VK_COMPARE_OP_LESS,
VK_COMPARE_OP_EQUAL,
VK_COMPARE_OP_LESS_OR_EQUAL,
VK_COMPARE_OP_GREATER,
VK_COMPARE_OP_NOT_EQUAL,
VK_COMPARE_OP_GREATER_OR_EQUAL,
VK_COMPARE_OP_ALWAYS,
};
static const VkCompareOp pgraph_stencil_func_vk_map[] = {
VK_COMPARE_OP_NEVER,
VK_COMPARE_OP_LESS,
VK_COMPARE_OP_EQUAL,
VK_COMPARE_OP_LESS_OR_EQUAL,
VK_COMPARE_OP_GREATER,
VK_COMPARE_OP_NOT_EQUAL,
VK_COMPARE_OP_GREATER_OR_EQUAL,
VK_COMPARE_OP_ALWAYS,
};
static const VkStencilOp pgraph_stencil_op_vk_map[] = {
0,
VK_STENCIL_OP_KEEP,
VK_STENCIL_OP_ZERO,
VK_STENCIL_OP_REPLACE,
VK_STENCIL_OP_INCREMENT_AND_CLAMP,
VK_STENCIL_OP_DECREMENT_AND_CLAMP,
VK_STENCIL_OP_INVERT,
VK_STENCIL_OP_INCREMENT_AND_WRAP,
VK_STENCIL_OP_DECREMENT_AND_WRAP,
};
static const VkPolygonMode pgraph_polygon_mode_vk_map[] = {
[POLY_MODE_FILL] = VK_POLYGON_MODE_FILL,
[POLY_MODE_POINT] = VK_POLYGON_MODE_POINT,
[POLY_MODE_LINE] = VK_POLYGON_MODE_LINE,
};
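/* These tables are indexed directly with the decoded register field, e.g.
 * (register/mask names assumed from the GL renderer):
 *
 *   VkCompareOp op = pgraph_depth_func_vk_map[
 *       GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0),
 *                NV_PGRAPH_CONTROL_0_ZFUNC)];
 */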
typedef struct VkColorFormatInfo {
VkFormat vk_format;
VkComponentMapping component_map;
} VkColorFormatInfo;
static const VkColorFormatInfo kelvin_color_format_vk_map[66] = {
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_Y8] = {
VK_FORMAT_R8_UNORM,
{ VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_ONE },
},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_AY8] = {
VK_FORMAT_R8_UNORM,
{ VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R }
},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_A1R5G5B5] = {
VK_FORMAT_A1R5G5B5_UNORM_PACK16,
},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_X1R5G5B5] = {
VK_FORMAT_A1R5G5B5_UNORM_PACK16,
{ VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_ONE },
},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_A4R4G4B4] = {
VK_FORMAT_A4R4G4B4_UNORM_PACK16,
},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_R5G6B5] = {
VK_FORMAT_R5G6B5_UNORM_PACK16,
},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_A8R8G8B8] = {
VK_FORMAT_B8G8R8A8_UNORM,
},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_X8R8G8B8] = {
VK_FORMAT_B8G8R8A8_UNORM,
{ VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_ONE },
},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_I8_A8R8G8B8] = {
VK_FORMAT_B8G8R8A8_UNORM, // Converted
},
[NV097_SET_TEXTURE_FORMAT_COLOR_L_DXT1_A1R5G5B5] = {
VK_FORMAT_R8G8B8A8_UNORM, // Converted
},
[NV097_SET_TEXTURE_FORMAT_COLOR_L_DXT23_A8R8G8B8] = {
VK_FORMAT_R8G8B8A8_UNORM, // Converted
},
[NV097_SET_TEXTURE_FORMAT_COLOR_L_DXT45_A8R8G8B8] = {
VK_FORMAT_R8G8B8A8_UNORM, // Converted
},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_A1R5G5B5] = {
VK_FORMAT_A1R5G5B5_UNORM_PACK16,
},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_R5G6B5] = {
VK_FORMAT_R5G6B5_UNORM_PACK16,
},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_A8R8G8B8] = {
VK_FORMAT_B8G8R8A8_UNORM,
},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_Y8] = {
VK_FORMAT_R8_UNORM,
{ VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_ONE, }
},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_G8B8] = {
VK_FORMAT_R8G8_UNORM,
{ VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, }
},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_A8] = {
VK_FORMAT_R8_UNORM,
{ VK_COMPONENT_SWIZZLE_ONE, VK_COMPONENT_SWIZZLE_ONE, VK_COMPONENT_SWIZZLE_ONE, VK_COMPONENT_SWIZZLE_R },
},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_A8Y8] = {
VK_FORMAT_R8G8_UNORM,
{ VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G }
},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_AY8] = {
VK_FORMAT_R8_UNORM,
{ VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R }
},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_X1R5G5B5] = {
VK_FORMAT_A1R5G5B5_UNORM_PACK16,
{ VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_ONE },
},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_A4R4G4B4] = {
VK_FORMAT_A4R4G4B4_UNORM_PACK16,
},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_X8R8G8B8] = {
VK_FORMAT_B8G8R8A8_UNORM,
{ VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_ONE },
},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_A8] = {
VK_FORMAT_R8_UNORM,
{ VK_COMPONENT_SWIZZLE_ONE, VK_COMPONENT_SWIZZLE_ONE, VK_COMPONENT_SWIZZLE_ONE, VK_COMPONENT_SWIZZLE_R }
},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_A8Y8] = {
VK_FORMAT_R8G8_UNORM,
{ VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G }
},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_R6G5B5] = {
VK_FORMAT_R8G8B8_SNORM, // Converted
},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_G8B8] = {
VK_FORMAT_R8G8_UNORM,
{ VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G }
},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_R8B8] = {
VK_FORMAT_R8G8_UNORM,
{ VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G }
},
[NV097_SET_TEXTURE_FORMAT_COLOR_LC_IMAGE_CR8YB8CB8YA8] = {
VK_FORMAT_R8G8B8A8_UNORM, // Converted
},
[NV097_SET_TEXTURE_FORMAT_COLOR_LC_IMAGE_YB8CR8YA8CB8] = {
VK_FORMAT_R8G8B8A8_UNORM, // Converted
},
/* Additional information is passed to the pixel shader via the swizzle:
* RED: The depth value.
* GREEN: 0 for 16-bit, 1 for 24 bit
* BLUE: 0 for fixed, 1 for float
*/
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_DEPTH_Y16_FIXED] = {
VK_FORMAT_R16_UNORM, // FIXME
{ VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_ZERO, VK_COMPONENT_SWIZZLE_ZERO, VK_COMPONENT_SWIZZLE_ZERO },
},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_DEPTH_X8_Y24_FIXED] = {
// FIXME
// {GL_DEPTH_COMPONENT, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, {GL_RED, GL_ONE, GL_ZERO, GL_ZERO}},
VK_FORMAT_R32_UINT,
{ VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_ONE, VK_COMPONENT_SWIZZLE_ZERO, VK_COMPONENT_SWIZZLE_ZERO },
},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_DEPTH_X8_Y24_FLOAT] = {
// FIXME
// {GL_DEPTH_COMPONENT, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, {GL_RED, GL_ONE, GL_ZERO, GL_ZERO}},
VK_FORMAT_R32_UINT,
{ VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_ONE, VK_COMPONENT_SWIZZLE_ZERO, VK_COMPONENT_SWIZZLE_ZERO },
},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_DEPTH_Y16_FIXED] = {
VK_FORMAT_R16_UNORM, // FIXME
{ VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_ZERO, VK_COMPONENT_SWIZZLE_ZERO, VK_COMPONENT_SWIZZLE_ZERO },
},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_DEPTH_Y16_FLOAT] = {
VK_FORMAT_R16_SFLOAT,
{ VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_ZERO, VK_COMPONENT_SWIZZLE_ONE, VK_COMPONENT_SWIZZLE_ZERO },
},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_Y16] = {
VK_FORMAT_R16_UNORM,
{ VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_ONE }
},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_A8B8G8R8] = {
VK_FORMAT_R8G8B8A8_UNORM,
},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_B8G8R8A8] = {
VK_FORMAT_R8G8B8A8_UNORM,
{ VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A, VK_COMPONENT_SWIZZLE_R }
},
[NV097_SET_TEXTURE_FORMAT_COLOR_SZ_R8G8B8A8] = {
VK_FORMAT_R8G8B8A8_UNORM,
{ VK_COMPONENT_SWIZZLE_A, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_R }
},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_A8B8G8R8] = {
VK_FORMAT_R8G8B8A8_UNORM,
},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_B8G8R8A8] = {
VK_FORMAT_R8G8B8A8_UNORM,
{ VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A, VK_COMPONENT_SWIZZLE_R }
},
[NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_R8G8B8A8] = {
VK_FORMAT_R8G8B8A8_UNORM,
{ VK_COMPONENT_SWIZZLE_A, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_R }
},
};
typedef struct BasicSurfaceFormatInfo {
unsigned int bytes_per_pixel;
} BasicSurfaceFormatInfo;
typedef struct SurfaceFormatInfo {
unsigned int host_bytes_per_pixel;
VkFormat vk_format;
VkImageUsageFlags usage;
VkImageAspectFlags aspect;
} SurfaceFormatInfo;
static const BasicSurfaceFormatInfo kelvin_surface_color_format_map[] = {
[NV097_SET_SURFACE_FORMAT_COLOR_LE_X1R5G5B5_Z1R5G5B5] = { 2 },
[NV097_SET_SURFACE_FORMAT_COLOR_LE_R5G6B5] = { 2 },
[NV097_SET_SURFACE_FORMAT_COLOR_LE_X8R8G8B8_Z8R8G8B8] = { 4 },
[NV097_SET_SURFACE_FORMAT_COLOR_LE_A8R8G8B8] = { 4 },
[NV097_SET_SURFACE_FORMAT_COLOR_LE_B8] = { 1 },
[NV097_SET_SURFACE_FORMAT_COLOR_LE_G8B8] = { 2 },
};
static const SurfaceFormatInfo kelvin_surface_color_format_vk_map[] = {
[NV097_SET_SURFACE_FORMAT_COLOR_LE_X1R5G5B5_Z1R5G5B5] =
{
// FIXME: Force alpha to zero
2,
VK_FORMAT_A1R5G5B5_UNORM_PACK16,
VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
VK_IMAGE_ASPECT_COLOR_BIT,
},
[NV097_SET_SURFACE_FORMAT_COLOR_LE_R5G6B5] =
{
2,
VK_FORMAT_R5G6B5_UNORM_PACK16,
VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
VK_IMAGE_ASPECT_COLOR_BIT,
},
[NV097_SET_SURFACE_FORMAT_COLOR_LE_X8R8G8B8_Z8R8G8B8] =
{
// FIXME: Force alpha to zero
4,
VK_FORMAT_B8G8R8A8_UNORM,
VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
VK_IMAGE_ASPECT_COLOR_BIT,
},
[NV097_SET_SURFACE_FORMAT_COLOR_LE_A8R8G8B8] =
{
4,
VK_FORMAT_B8G8R8A8_UNORM,
VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
VK_IMAGE_ASPECT_COLOR_BIT,
},
[NV097_SET_SURFACE_FORMAT_COLOR_LE_B8] =
{
// FIXME: Map channel color
1,
VK_FORMAT_R8_UNORM,
VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
VK_IMAGE_ASPECT_COLOR_BIT,
},
[NV097_SET_SURFACE_FORMAT_COLOR_LE_G8B8] =
{
// FIXME: Map channel color
2,
VK_FORMAT_R8G8_UNORM,
VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
VK_IMAGE_ASPECT_COLOR_BIT,
},
};
static const BasicSurfaceFormatInfo kelvin_surface_zeta_format_map[] = {
[NV097_SET_SURFACE_FORMAT_ZETA_Z16] = { 2 },
[NV097_SET_SURFACE_FORMAT_ZETA_Z24S8] = { 4 },
};
// FIXME: Actually support stored float format
static const SurfaceFormatInfo zeta_d16 = {
2,
VK_FORMAT_D16_UNORM,
VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
VK_IMAGE_ASPECT_DEPTH_BIT,
};
static const SurfaceFormatInfo zeta_d32_sfloat_s8_uint = {
8,
VK_FORMAT_D32_SFLOAT_S8_UINT,
VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT,
};
static const SurfaceFormatInfo zeta_d24_unorm_s8_uint = {
4,
VK_FORMAT_D24_UNORM_S8_UINT,
VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT,
};
#endif

View File

@ -0,0 +1,59 @@
/*
* Geforce NV2A PGRAPH Vulkan Renderer
*
* Copyright (c) 2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "renderer.h"
#include "debug.h"
#ifndef _WIN32
#include <dlfcn.h>
#endif
#ifdef CONFIG_RENDERDOC
#pragma GCC diagnostic ignored "-Wstrict-prototypes"
#include "thirdparty/renderdoc_app.h"
#endif
int nv2a_vk_dgroup_indent = 0;
void pgraph_vk_debug_init(void)
{
#ifdef CONFIG_RENDERDOC
nv2a_dbg_renderdoc_init();
#endif
}
void pgraph_vk_debug_frame_terminator(void)
{
#ifdef CONFIG_RENDERDOC
if (nv2a_dbg_renderdoc_available()) {
RENDERDOC_API_1_6_0 *rdoc_api = nv2a_dbg_renderdoc_get_api();
PGRAPHVkState *r = g_nv2a->pgraph.vk_renderer_state;
if (rdoc_api->IsTargetControlConnected()) {
if (rdoc_api->IsFrameCapturing()) {
rdoc_api->EndFrameCapture(RENDERDOC_DEVICEPOINTER_FROM_VKINSTANCE(r->instance), 0);
}
if (renderdoc_capture_frames > 0) {
rdoc_api->StartFrameCapture(RENDERDOC_DEVICEPOINTER_FROM_VKINSTANCE(r->instance), 0);
--renderdoc_capture_frames;
}
}
}
#endif
}

View File

@ -0,0 +1,61 @@
/*
* Geforce NV2A PGRAPH Vulkan Renderer
*
* Copyright (c) 2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HW_XBOX_NV2A_PGRAPH_VK_DEBUG_H
#define HW_XBOX_NV2A_PGRAPH_VK_DEBUG_H
#define DEBUG_VK 0
extern int nv2a_vk_dgroup_indent;
#define NV2A_VK_XDPRINTF(x, fmt, ...) \
do { \
if (x) { \
for (int i = 0; i < nv2a_vk_dgroup_indent; i++) \
fprintf(stderr, " "); \
fprintf(stderr, fmt "\n", ##__VA_ARGS__); \
} \
} while (0)
#define NV2A_VK_DPRINTF(fmt, ...) NV2A_VK_XDPRINTF(DEBUG_VK, fmt, ##__VA_ARGS__)
#define NV2A_VK_DGROUP_BEGIN(fmt, ...) \
do { \
NV2A_VK_XDPRINTF(DEBUG_VK, fmt, ##__VA_ARGS__); \
nv2a_vk_dgroup_indent++; \
} while (0)
#define NV2A_VK_DGROUP_END(...) \
do { \
nv2a_vk_dgroup_indent--; \
assert(nv2a_vk_dgroup_indent >= 0); \
} while (0)
#define VK_CHECK(x) \
do { \
VkResult vk_result = (x); \
if (vk_result != VK_SUCCESS) { \
fprintf(stderr, "vk_result = %d\n", vk_result); \
} \
assert(vk_result == VK_SUCCESS && "vk check failed"); \
} while (0)
void pgraph_vk_debug_frame_terminator(void);
#endif

View File

@ -0,0 +1,896 @@
/*
* Geforce NV2A PGRAPH Vulkan Renderer
*
* Copyright (c) 2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "renderer.h"
static const char *display_frag_glsl =
"#version 450\n"
"layout(binding = 0) uniform sampler2D tex;\n"
"layout(binding = 1) uniform sampler2D pvideo_tex;\n"
"layout(push_constant, std430) uniform PushConstants {\n"
" bool pvideo_enable;\n"
" vec2 pvideo_in_pos;\n"
" vec4 pvideo_pos;\n"
" vec3 pvideo_scale;\n"
" bool pvideo_color_key_enable;\n"
" vec2 display_size;\n"
" float line_offset;\n"
" vec4 pvideo_color_key;\n"
"};\n"
"layout(location = 0) out vec4 out_Color;\n"
"void main()\n"
"{\n"
" vec2 texCoord = gl_FragCoord.xy/display_size;\n"
" texCoord.y = 1 - texCoord.y;\n" // GL compat
" float rel = display_size.y/textureSize(tex, 0).y/line_offset;\n"
" texCoord.y = 1 + rel*(texCoord.y - 1);"
" out_Color.rgba = texture(tex, texCoord);\n"
// " if (pvideo_enable) {\n"
// " vec2 screenCoord = gl_FragCoord.xy - 0.5;\n"
// " vec4 output_region = vec4(pvideo_pos.xy, pvideo_pos.xy + pvideo_pos.zw);\n"
// " bvec4 clip = bvec4(lessThan(screenCoord, output_region.xy),\n"
// " greaterThan(screenCoord, output_region.zw));\n"
// " if (!any(clip) && (!pvideo_color_key_enable || out_Color.rgba == pvideo_color_key)) {\n"
// " vec2 out_xy = (screenCoord - pvideo_pos.xy) * pvideo_scale.z;\n"
// " vec2 in_st = (pvideo_in_pos + out_xy * pvideo_scale.xy) / textureSize(pvideo_tex, 0);\n"
// " in_st.y *= -1.0;\n"
// " out_Color.rgba = texture(pvideo_tex, in_st);\n"
// " }\n"
// " }\n"
"}\n";
static void create_descriptor_pool(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
VkDescriptorPoolSize pool_sizes = {
.type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
.descriptorCount = 2,
};
VkDescriptorPoolCreateInfo pool_info = {
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
.poolSizeCount = 1,
.pPoolSizes = &pool_sizes,
.maxSets = 1,
.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
};
VK_CHECK(vkCreateDescriptorPool(r->device, &pool_info, NULL,
&r->display.descriptor_pool));
}
static void destroy_descriptor_pool(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
vkDestroyDescriptorPool(r->device, r->display.descriptor_pool, NULL);
r->display.descriptor_pool = VK_NULL_HANDLE;
}
static void create_descriptor_set_layout(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
VkDescriptorSetLayoutBinding bindings[2];
for (int i = 0; i < ARRAY_SIZE(bindings); i++) {
bindings[i] = (VkDescriptorSetLayoutBinding){
.binding = i,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
.stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
};
}
VkDescriptorSetLayoutCreateInfo layout_info = {
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
.bindingCount = ARRAY_SIZE(bindings),
.pBindings = bindings,
};
VK_CHECK(vkCreateDescriptorSetLayout(r->device, &layout_info, NULL,
&r->display.descriptor_set_layout));
}
static void destroy_descriptor_set_layout(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
vkDestroyDescriptorSetLayout(r->device, r->display.descriptor_set_layout,
NULL);
r->display.descriptor_set_layout = VK_NULL_HANDLE;
}
static void create_descriptor_sets(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
VkDescriptorSetLayout layout = r->display.descriptor_set_layout;
VkDescriptorSetAllocateInfo alloc_info = {
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
.descriptorPool = r->display.descriptor_pool,
.descriptorSetCount = 1,
.pSetLayouts = &layout,
};
VK_CHECK(vkAllocateDescriptorSets(r->device, &alloc_info,
&r->display.descriptor_set));
}
static void create_render_pass(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
VkAttachmentDescription attachment;
VkAttachmentReference color_reference;
attachment = (VkAttachmentDescription){
.format = VK_FORMAT_R8G8B8A8_UNORM,
.samples = VK_SAMPLE_COUNT_1_BIT,
.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD,
.storeOp = VK_ATTACHMENT_STORE_OP_STORE,
.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE,
.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE,
.initialLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
.finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
};
color_reference = (VkAttachmentReference){
0, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL
};
VkSubpassDependency dependency = {
.srcSubpass = VK_SUBPASS_EXTERNAL,
};
dependency.srcStageMask |=
VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
dependency.dstStageMask |=
VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
dependency.dstAccessMask |= VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
VkSubpassDescription subpass = {
.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
.colorAttachmentCount = 1,
.pColorAttachments = &color_reference,
};
VkRenderPassCreateInfo renderpass_create_info = {
.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
.attachmentCount = 1,
.pAttachments = &attachment,
.subpassCount = 1,
.pSubpasses = &subpass,
.dependencyCount = 1,
.pDependencies = &dependency,
};
VK_CHECK(vkCreateRenderPass(r->device, &renderpass_create_info, NULL,
&r->display.render_pass));
}
static void destroy_render_pass(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
vkDestroyRenderPass(r->device, r->display.render_pass, NULL);
r->display.render_pass = VK_NULL_HANDLE;
}
static void create_display_pipeline(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
r->display.display_frag =
pgraph_vk_create_shader_module_from_glsl(
r, VK_SHADER_STAGE_FRAGMENT_BIT, display_frag_glsl);
VkPipelineShaderStageCreateInfo shader_stages[] = {
(VkPipelineShaderStageCreateInfo){
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
.stage = VK_SHADER_STAGE_VERTEX_BIT,
.module = r->quad_vert_module->module,
.pName = "main",
},
(VkPipelineShaderStageCreateInfo){
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
.stage = VK_SHADER_STAGE_FRAGMENT_BIT,
.module = r->display.display_frag->module,
.pName = "main",
},
};
VkPipelineVertexInputStateCreateInfo vertex_input = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
};
VkPipelineInputAssemblyStateCreateInfo input_assembly = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
.primitiveRestartEnable = VK_FALSE,
};
VkPipelineViewportStateCreateInfo viewport_state = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
.viewportCount = 1,
.scissorCount = 1,
};
VkPipelineRasterizationStateCreateInfo rasterizer = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
.depthClampEnable = VK_FALSE,
.rasterizerDiscardEnable = VK_FALSE,
.polygonMode = VK_POLYGON_MODE_FILL,
.lineWidth = 1.0f,
.cullMode = VK_CULL_MODE_BACK_BIT,
.frontFace = VK_FRONT_FACE_CLOCKWISE,
.depthBiasEnable = VK_FALSE,
};
VkPipelineMultisampleStateCreateInfo multisampling = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
.sampleShadingEnable = VK_FALSE,
.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
};
VkPipelineDepthStencilStateCreateInfo depth_stencil = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
.depthTestEnable = VK_FALSE,
.depthCompareOp = VK_COMPARE_OP_ALWAYS,
.depthBoundsTestEnable = VK_FALSE,
};
VkPipelineColorBlendAttachmentState color_blend_attachment = {
.colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT |
VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT,
.blendEnable = VK_FALSE,
};
VkPipelineColorBlendStateCreateInfo color_blending = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
.logicOpEnable = VK_FALSE,
.logicOp = VK_LOGIC_OP_COPY,
.attachmentCount = 1,
.pAttachments = &color_blend_attachment,
};
VkDynamicState dynamic_states[] = { VK_DYNAMIC_STATE_VIEWPORT,
VK_DYNAMIC_STATE_SCISSOR };
VkPipelineDynamicStateCreateInfo dynamic_state = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO,
.dynamicStateCount = 2,
.pDynamicStates = dynamic_states,
};
VkPushConstantRange push_constant_range = {
.stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
.offset = 0,
.size = r->display.display_frag->push_constants.total_size,
};
VkPipelineLayoutCreateInfo pipeline_layout_info = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
.setLayoutCount = 1,
.pSetLayouts = &r->display.descriptor_set_layout,
.pushConstantRangeCount = 1,
.pPushConstantRanges = &push_constant_range,
};
VK_CHECK(vkCreatePipelineLayout(r->device, &pipeline_layout_info, NULL,
&r->display.pipeline_layout));
VkGraphicsPipelineCreateInfo pipeline_info = {
.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
.stageCount = ARRAY_SIZE(shader_stages),
.pStages = shader_stages,
.pVertexInputState = &vertex_input,
.pInputAssemblyState = &input_assembly,
.pViewportState = &viewport_state,
.pRasterizationState = &rasterizer,
.pMultisampleState = &multisampling,
.pDepthStencilState = r->zeta_binding ? &depth_stencil : NULL,
.pColorBlendState = &color_blending,
.pDynamicState = &dynamic_state,
.layout = r->display.pipeline_layout,
.renderPass = r->display.render_pass,
.subpass = 0,
.basePipelineHandle = VK_NULL_HANDLE,
};
VK_CHECK(vkCreateGraphicsPipelines(r->device, r->vk_pipeline_cache, 1,
&pipeline_info, NULL,
&r->display.pipeline));
}
static void destroy_display_pipeline(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
vkDestroyPipeline(r->device, r->display.pipeline, NULL);
r->display.pipeline = VK_NULL_HANDLE;
}
static void create_frame_buffer(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
VkFramebufferCreateInfo create_info = {
.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
.renderPass = r->display.render_pass,
.attachmentCount = 1,
.pAttachments = &r->display.image_view,
.width = r->display.width,
.height = r->display.height,
.layers = 1,
};
VK_CHECK(vkCreateFramebuffer(r->device, &create_info, NULL,
&r->display.framebuffer));
}
static void destroy_frame_buffer(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
vkDestroyFramebuffer(r->device, r->display.framebuffer, NULL);
    r->display.framebuffer = VK_NULL_HANDLE;
}
static void destroy_current_display_image(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
PGRAPHVkDisplayState *d = &r->display;
if (d->image == VK_NULL_HANDLE) {
return;
}
destroy_frame_buffer(pg);
#if HAVE_EXTERNAL_MEMORY
glDeleteTextures(1, &d->gl_texture_id);
d->gl_texture_id = 0;
glDeleteMemoryObjectsEXT(1, &d->gl_memory_obj);
d->gl_memory_obj = 0;
#ifdef WIN32
CloseHandle(d->handle);
d->handle = 0;
#endif
#endif
vkDestroyImageView(r->device, d->image_view, NULL);
d->image_view = VK_NULL_HANDLE;
vkDestroyImage(r->device, d->image, NULL);
d->image = VK_NULL_HANDLE;
vkFreeMemory(r->device, d->memory, NULL);
d->memory = VK_NULL_HANDLE;
d->draw_time = 0;
}
// FIXME: We may need to use two images. One for actually rendering display,
// and another for GL in the correct tiling mode
static void create_display_image_from_surface(PGRAPHState *pg,
SurfaceBinding *surface)
{
PGRAPHVkState *r = pg->vk_renderer_state;
PGRAPHVkDisplayState *d = &r->display;
if (r->display.image != VK_NULL_HANDLE) {
destroy_current_display_image(pg);
}
const GLint gl_internal_format = GL_RGBA8;
bool use_optimal_tiling = true;
#if HAVE_EXTERNAL_MEMORY
GLint num_tiling_types;
glGetInternalformativ(GL_TEXTURE_2D, gl_internal_format,
GL_NUM_TILING_TYPES_EXT, 1, &num_tiling_types);
// XXX: Apparently on AMD GL_OPTIMAL_TILING_EXT is reported to be
// supported, but doesn't work? On nVidia, GL_LINEAR_TILING_EXT may not
// be supported so we must use optimal. Default to optimal unless
// linear is explicitly specified...
GLint tiling_types[num_tiling_types];
glGetInternalformativ(GL_TEXTURE_2D, gl_internal_format,
GL_TILING_TYPES_EXT, num_tiling_types, tiling_types);
for (int i = 0; i < num_tiling_types; i++) {
if (tiling_types[i] == GL_LINEAR_TILING_EXT) {
use_optimal_tiling = false;
break;
}
}
#endif
// Create image
VkImageCreateInfo image_create_info = {
.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
.imageType = VK_IMAGE_TYPE_2D,
.extent.width = surface->width,
.extent.height = surface->height,
.extent.depth = 1,
.mipLevels = 1,
.arrayLayers = 1,
.format = VK_FORMAT_R8G8B8A8_UNORM,
.tiling = use_optimal_tiling ? VK_IMAGE_TILING_OPTIMAL : VK_IMAGE_TILING_LINEAR,
.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
.usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
.samples = VK_SAMPLE_COUNT_1_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
};
pgraph_apply_scaling_factor(pg, &image_create_info.extent.width,
&image_create_info.extent.height);
VkExternalMemoryImageCreateInfo external_memory_image_create_info = {
.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO,
.handleTypes =
#ifdef WIN32
VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR,
#else
VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR,
#endif
};
image_create_info.pNext = &external_memory_image_create_info;
VK_CHECK(vkCreateImage(r->device, &image_create_info, NULL, &d->image));
// Allocate and bind image memory
VkMemoryRequirements memory_requirements;
vkGetImageMemoryRequirements(r->device, d->image, &memory_requirements);
VkMemoryAllocateInfo alloc_info = {
.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
.allocationSize = memory_requirements.size,
.memoryTypeIndex =
pgraph_vk_get_memory_type(pg, memory_requirements.memoryTypeBits,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT),
};
VkExportMemoryAllocateInfo export_memory_alloc_info = {
.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO,
.handleTypes =
#ifdef WIN32
VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR
#else
VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT
#endif
,
};
alloc_info.pNext = &export_memory_alloc_info;
VK_CHECK(vkAllocateMemory(r->device, &alloc_info, NULL, &d->memory));
vkBindImageMemory(r->device, d->image, d->memory, 0);
// Create Image View
VkImageViewCreateInfo image_view_create_info = {
.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
.image = d->image,
.viewType = VK_IMAGE_VIEW_TYPE_2D,
.format = image_create_info.format,
.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
.subresourceRange.levelCount = 1,
.subresourceRange.layerCount = 1,
};
VK_CHECK(vkCreateImageView(r->device, &image_view_create_info, NULL,
&d->image_view));
#if HAVE_EXTERNAL_MEMORY
#ifdef WIN32
VkMemoryGetWin32HandleInfoKHR handle_info = {
.sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR,
.memory = d->memory,
.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR
};
VK_CHECK(vkGetMemoryWin32HandleKHR(r->device, &handle_info, &d->handle));
glCreateMemoryObjectsEXT(1, &d->gl_memory_obj);
glImportMemoryWin32HandleEXT(d->gl_memory_obj, memory_requirements.size, GL_HANDLE_TYPE_OPAQUE_WIN32_EXT, d->handle);
assert(glGetError() == GL_NO_ERROR);
#else
VkMemoryGetFdInfoKHR fd_info = {
.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR,
.memory = d->memory,
.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
};
VK_CHECK(vkGetMemoryFdKHR(r->device, &fd_info, &d->fd));
glCreateMemoryObjectsEXT(1, &d->gl_memory_obj);
glImportMemoryFdEXT(d->gl_memory_obj, memory_requirements.size,
GL_HANDLE_TYPE_OPAQUE_FD_EXT, d->fd);
assert(glIsMemoryObjectEXT(d->gl_memory_obj));
assert(glGetError() == GL_NO_ERROR);
#endif // WIN32
glGenTextures(1, &d->gl_texture_id);
glBindTexture(GL_TEXTURE_2D, d->gl_texture_id);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_TILING_EXT,
use_optimal_tiling ? GL_OPTIMAL_TILING_EXT :
GL_LINEAR_TILING_EXT);
glTexStorageMem2DEXT(GL_TEXTURE_2D, 1, gl_internal_format,
image_create_info.extent.width,
image_create_info.extent.height, d->gl_memory_obj, 0);
assert(glGetError() == GL_NO_ERROR);
#endif // HAVE_EXTERNAL_MEMORY
d->width = image_create_info.extent.width;
d->height = image_create_info.extent.height;
create_frame_buffer(pg);
}
static void update_descriptor_set(PGRAPHState *pg, SurfaceBinding *surface)
{
PGRAPHVkState *r = pg->vk_renderer_state;
VkDescriptorImageInfo image_infos[2];
VkWriteDescriptorSet descriptor_writes[2];
// Display surface
image_infos[0] = (VkDescriptorImageInfo){
.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
.imageView = surface->image_view,
.sampler = r->display.sampler,
};
descriptor_writes[0] = (VkWriteDescriptorSet){
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstSet = r->display.descriptor_set,
.dstBinding = 0,
.dstArrayElement = 0,
.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
.descriptorCount = 1,
.pImageInfo = &image_infos[0],
};
// FIXME: PVIDEO Overlay
image_infos[1] = (VkDescriptorImageInfo){
.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
.imageView = r->dummy_texture.image_view,
.sampler = r->dummy_texture.sampler,
};
descriptor_writes[1] = (VkWriteDescriptorSet){
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstSet = r->display.descriptor_set,
.dstBinding = 1,
.dstArrayElement = 0,
.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
.descriptorCount = 1,
.pImageInfo = &image_infos[1],
};
vkUpdateDescriptorSets(r->device, ARRAY_SIZE(descriptor_writes),
descriptor_writes, 0, NULL);
}
static void update_uniforms(PGRAPHState *pg, SurfaceBinding *surface)
{
NV2AState *d = container_of(pg, NV2AState, pgraph);
PGRAPHVkState *r = pg->vk_renderer_state;
unsigned int width, height;
uint32_t pline_offset, pstart_addr, pline_compare;
d->vga.get_resolution(&d->vga, (int*)&width, (int*)&height);
d->vga.get_offsets(&d->vga, &pline_offset, &pstart_addr, &pline_compare);
int line_offset = surface->pitch / pline_offset;
/* Adjust viewport height for interlaced mode, used only in 1080i */
if (d->vga.cr[NV_PRMCIO_INTERLACE_MODE] != NV_PRMCIO_INTERLACE_MODE_DISABLED) {
height *= 2;
}
pgraph_apply_scaling_factor(pg, &width, &height);
ShaderUniformLayout *l = &r->display.display_frag->push_constants;
int display_size_loc = uniform_index(l, "display_size"); // FIXME: Cache
int line_offset_loc = uniform_index(l, "line_offset");
uniform2f(l, display_size_loc, width, height);
uniform1f(l, line_offset_loc, line_offset);
#if 0 // FIXME: PVIDEO overlay
// FIXME: This check against PVIDEO_SIZE_IN does not match HW behavior.
// Many games seem to pass this value when initializing or tearing down
// PVIDEO. On its own, this generally does not result in the overlay being
// hidden, however there are certain games (e.g., Ultimate Beach Soccer)
// that use an unknown mechanism to hide the overlay without explicitly
// stopping it.
// Since the value seems to be set to 0xFFFFFFFF only in cases where the
// content is not valid, it is probably good enough to treat it as an
// implicit stop.
bool enabled = (d->pvideo.regs[NV_PVIDEO_BUFFER] & NV_PVIDEO_BUFFER_0_USE)
&& d->pvideo.regs[NV_PVIDEO_SIZE_IN] != 0xFFFFFFFF;
glUniform1ui(d->pgraph.renderer_state->disp_rndr.pvideo_enable_loc, enabled);
if (!enabled) {
return;
}
hwaddr base = d->pvideo.regs[NV_PVIDEO_BASE];
hwaddr limit = d->pvideo.regs[NV_PVIDEO_LIMIT];
hwaddr offset = d->pvideo.regs[NV_PVIDEO_OFFSET];
int in_width =
GET_MASK(d->pvideo.regs[NV_PVIDEO_SIZE_IN], NV_PVIDEO_SIZE_IN_WIDTH);
int in_height =
GET_MASK(d->pvideo.regs[NV_PVIDEO_SIZE_IN], NV_PVIDEO_SIZE_IN_HEIGHT);
int in_s = GET_MASK(d->pvideo.regs[NV_PVIDEO_POINT_IN],
NV_PVIDEO_POINT_IN_S);
int in_t = GET_MASK(d->pvideo.regs[NV_PVIDEO_POINT_IN],
NV_PVIDEO_POINT_IN_T);
int in_pitch =
GET_MASK(d->pvideo.regs[NV_PVIDEO_FORMAT], NV_PVIDEO_FORMAT_PITCH);
int in_color =
GET_MASK(d->pvideo.regs[NV_PVIDEO_FORMAT], NV_PVIDEO_FORMAT_COLOR);
unsigned int out_width =
GET_MASK(d->pvideo.regs[NV_PVIDEO_SIZE_OUT], NV_PVIDEO_SIZE_OUT_WIDTH);
unsigned int out_height =
GET_MASK(d->pvideo.regs[NV_PVIDEO_SIZE_OUT], NV_PVIDEO_SIZE_OUT_HEIGHT);
float scale_x = 1.0f;
float scale_y = 1.0f;
unsigned int ds_dx = d->pvideo.regs[NV_PVIDEO_DS_DX];
unsigned int dt_dy = d->pvideo.regs[NV_PVIDEO_DT_DY];
if (ds_dx != NV_PVIDEO_DIN_DOUT_UNITY) {
scale_x = pvideo_calculate_scale(ds_dx, out_width);
}
if (dt_dy != NV_PVIDEO_DIN_DOUT_UNITY) {
scale_y = pvideo_calculate_scale(dt_dy, out_height);
}
// On HW, setting NV_PVIDEO_SIZE_IN larger than NV_PVIDEO_SIZE_OUT results
// in them being capped to the output size, content is not scaled. This is
// particularly important as NV_PVIDEO_SIZE_IN may be set to 0xFFFFFFFF
// during initialization or teardown.
if (in_width > out_width) {
in_width = floorf((float)out_width * scale_x + 0.5f);
}
if (in_height > out_height) {
in_height = floorf((float)out_height * scale_y + 0.5f);
}
/* TODO: support other color formats */
assert(in_color == NV_PVIDEO_FORMAT_COLOR_LE_CR8YB8CB8YA8);
unsigned int out_x =
GET_MASK(d->pvideo.regs[NV_PVIDEO_POINT_OUT], NV_PVIDEO_POINT_OUT_X);
unsigned int out_y =
GET_MASK(d->pvideo.regs[NV_PVIDEO_POINT_OUT], NV_PVIDEO_POINT_OUT_Y);
unsigned int color_key_enabled =
GET_MASK(d->pvideo.regs[NV_PVIDEO_FORMAT], NV_PVIDEO_FORMAT_DISPLAY);
glUniform1ui(d->pgraph.renderer_state->disp_rndr.pvideo_color_key_enable_loc,
color_key_enabled);
// TODO: Verify that masking off the top byte is correct.
// SeaBlade sets a color key of 0x80000000 but the texture passed into the
// shader is cleared to 0 alpha.
unsigned int color_key = d->pvideo.regs[NV_PVIDEO_COLOR_KEY] & 0xFFFFFF;
glUniform4f(d->pgraph.renderer_state->disp_rndr.pvideo_color_key_loc,
GET_MASK(color_key, NV_PVIDEO_COLOR_KEY_RED) / 255.0,
GET_MASK(color_key, NV_PVIDEO_COLOR_KEY_GREEN) / 255.0,
GET_MASK(color_key, NV_PVIDEO_COLOR_KEY_BLUE) / 255.0,
GET_MASK(color_key, NV_PVIDEO_COLOR_KEY_ALPHA) / 255.0);
assert(offset + in_pitch * in_height <= limit);
hwaddr end = base + offset + in_pitch * in_height;
assert(end <= memory_region_size(d->vram));
pgraph_apply_scaling_factor(pg, &out_x, &out_y);
pgraph_apply_scaling_factor(pg, &out_width, &out_height);
// Translate for the GL viewport origin.
out_y = MAX(pg->renderer_state->gl_display_buffer_height - 1 - (int)(out_y + out_height), 0);
glActiveTexture(GL_TEXTURE0 + 1);
glBindTexture(GL_TEXTURE_2D, d->pgraph.renderer_state->disp_rndr.pvideo_tex);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
uint8_t *tex_rgba = convert_texture_data__CR8YB8CB8YA8(
d->vram_ptr + base + offset, in_width, in_height, in_pitch);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, in_width, in_height, 0, GL_RGBA,
GL_UNSIGNED_BYTE, tex_rgba);
g_free(tex_rgba);
glUniform1i(d->pgraph.renderer_state->disp_rndr.pvideo_tex_loc, 1);
glUniform2f(d->pgraph.renderer_state->disp_rndr.pvideo_in_pos_loc, in_s, in_t);
glUniform4f(d->pgraph.renderer_state->disp_rndr.pvideo_pos_loc,
out_x, out_y, out_width, out_height);
glUniform3f(d->pgraph.renderer_state->disp_rndr.pvideo_scale_loc,
scale_x, scale_y, 1.0f / pg->surface_scale_factor);
#endif
}
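/* Draw the given surface into the display image if it has been rendered to
 * since the last present, first flushing any in-flight command buffer that
 * the surface was drawn in. */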
static void render_display(PGRAPHState *pg, SurfaceBinding *surface)
{
PGRAPHVkState *r = pg->vk_renderer_state;
PGRAPHVkDisplayState *disp = &r->display;
if (disp->draw_time >= surface->draw_time) {
return;
}
if (r->in_command_buffer &&
surface->draw_time >= r->command_buffer_start_time) {
pgraph_vk_finish(pg, VK_FINISH_REASON_PRESENTING);
}
update_uniforms(pg, surface);
update_descriptor_set(pg, surface);
VkCommandBuffer cmd = pgraph_vk_begin_single_time_commands(pg);
pgraph_vk_transition_image_layout(pg, cmd, surface->image,
surface->host_fmt.vk_format,
VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
pgraph_vk_transition_image_layout(
pg, cmd, disp->image, VK_FORMAT_R8G8B8A8_UNORM,
VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL);
VkRenderPassBeginInfo render_pass_begin_info = {
.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
.renderPass = disp->render_pass,
.framebuffer = disp->framebuffer,
.renderArea.extent.width = disp->width,
.renderArea.extent.height = disp->height,
};
vkCmdBeginRenderPass(cmd, &render_pass_begin_info,
VK_SUBPASS_CONTENTS_INLINE);
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS,
disp->pipeline);
vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS,
disp->pipeline_layout, 0, 1, &disp->descriptor_set,
0, NULL);
VkViewport viewport = {
.width = disp->width,
.height = disp->height,
.minDepth = 0.0,
.maxDepth = 1.0,
};
vkCmdSetViewport(cmd, 0, 1, &viewport);
VkRect2D scissor = {
.extent.width = disp->width,
.extent.height = disp->height,
};
vkCmdSetScissor(cmd, 0, 1, &scissor);
vkCmdPushConstants(cmd, disp->pipeline_layout, VK_SHADER_STAGE_FRAGMENT_BIT,
0, disp->display_frag->push_constants.total_size,
disp->display_frag->push_constants.allocation);
vkCmdDraw(cmd, 3, 1, 0, 0);
vkCmdEndRenderPass(cmd);
#if 0
VkImageCopy region = {
.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
.srcSubresource.layerCount = 1,
.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
.dstSubresource.layerCount = 1,
.extent.width = surface->width,
.extent.height = surface->height,
.extent.depth = 1,
};
pgraph_apply_scaling_factor(pg, &region.extent.width,
&region.extent.height);
vkCmdCopyImage(cmd, surface->image,
VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, disp->image,
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &region);
#endif
pgraph_vk_transition_image_layout(pg, cmd, surface->image,
surface->host_fmt.vk_format,
VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL);
pgraph_vk_transition_image_layout(pg, cmd, disp->image,
VK_FORMAT_R8G8B8A8_UNORM,
VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
pgraph_vk_end_single_time_commands(pg, cmd);
nv2a_profile_inc_counter(NV2A_PROF_QUEUE_SUBMIT_5);
disp->draw_time = surface->draw_time;
}
static void create_surface_sampler(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
VkSamplerCreateInfo sampler_create_info = {
.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,
.magFilter = VK_FILTER_NEAREST,
.minFilter = VK_FILTER_NEAREST,
.addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT,
.addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT,
.addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT,
.anisotropyEnable = VK_FALSE,
.borderColor = VK_BORDER_COLOR_INT_OPAQUE_WHITE,
.unnormalizedCoordinates = VK_FALSE,
.compareEnable = VK_FALSE,
.compareOp = VK_COMPARE_OP_ALWAYS,
.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST,
};
VK_CHECK(vkCreateSampler(r->device, &sampler_create_info, NULL,
&r->display.sampler));
}
static void destroy_surface_sampler(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
vkDestroySampler(r->device, r->display.sampler, NULL);
r->display.sampler = VK_NULL_HANDLE;
}
void pgraph_vk_init_display(PGRAPHState *pg)
{
create_descriptor_pool(pg);
create_descriptor_set_layout(pg);
create_descriptor_sets(pg);
create_render_pass(pg);
create_display_pipeline(pg);
create_surface_sampler(pg);
}
void pgraph_vk_finalize_display(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
if (r->display.image != VK_NULL_HANDLE) {
destroy_current_display_image(pg);
}
destroy_surface_sampler(pg);
destroy_display_pipeline(pg);
destroy_render_pass(pg);
destroy_descriptor_set_layout(pg);
destroy_descriptor_pool(pg);
}
void pgraph_vk_render_display(PGRAPHState *pg)
{
NV2AState *d = container_of(pg, NV2AState, pgraph);
PGRAPHVkState *r = pg->vk_renderer_state;
uint32_t pline_offset, pstart_addr, pline_compare;
d->vga.get_offsets(&d->vga, &pline_offset, &pstart_addr, &pline_compare);
SurfaceBinding *surface =
pgraph_vk_surface_get_within(d, d->pcrtc.start + pline_offset);
if (surface == NULL || !surface->color) {
return;
}
unsigned int width = surface->width, height = surface->height;
pgraph_apply_scaling_factor(pg, &width, &height);
PGRAPHVkDisplayState *disp = &r->display;
if (!disp->image || disp->width != width || disp->height != height) {
create_display_image_from_surface(pg, surface);
}
render_display(pg, surface);
}

File diff suppressed because it is too large

View File

@ -0,0 +1,380 @@
/*
* Geforce NV2A PGRAPH Vulkan Renderer
*
* Copyright (c) 2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "renderer.h"
#include <assert.h>
#include <glslang/Include/glslang_c_interface.h>
#include <stdio.h>
static const glslang_resource_t
resource_limits = { .max_lights = 32,
.max_clip_planes = 6,
.max_texture_units = 32,
.max_texture_coords = 32,
.max_vertex_attribs = 64,
.max_vertex_uniform_components = 4096,
.max_varying_floats = 64,
.max_vertex_texture_image_units = 32,
.max_combined_texture_image_units = 80,
.max_texture_image_units = 32,
.max_fragment_uniform_components = 4096,
.max_draw_buffers = 32,
.max_vertex_uniform_vectors = 128,
.max_varying_vectors = 8,
.max_fragment_uniform_vectors = 16,
.max_vertex_output_vectors = 16,
.max_fragment_input_vectors = 15,
.min_program_texel_offset = -8,
.max_program_texel_offset = 7,
.max_clip_distances = 8,
.max_compute_work_group_count_x = 65535,
.max_compute_work_group_count_y = 65535,
.max_compute_work_group_count_z = 65535,
.max_compute_work_group_size_x = 1024,
.max_compute_work_group_size_y = 1024,
.max_compute_work_group_size_z = 64,
.max_compute_uniform_components = 1024,
.max_compute_texture_image_units = 16,
.max_compute_image_uniforms = 8,
.max_compute_atomic_counters = 8,
.max_compute_atomic_counter_buffers = 1,
.max_varying_components = 60,
.max_vertex_output_components = 64,
.max_geometry_input_components = 64,
.max_geometry_output_components = 128,
.max_fragment_input_components = 128,
.max_image_units = 8,
.max_combined_image_units_and_fragment_outputs = 8,
.max_combined_shader_output_resources = 8,
.max_image_samples = 0,
.max_vertex_image_uniforms = 0,
.max_tess_control_image_uniforms = 0,
.max_tess_evaluation_image_uniforms = 0,
.max_geometry_image_uniforms = 0,
.max_fragment_image_uniforms = 8,
.max_combined_image_uniforms = 8,
.max_geometry_texture_image_units = 16,
.max_geometry_output_vertices = 256,
.max_geometry_total_output_components = 1024,
.max_geometry_uniform_components = 1024,
.max_geometry_varying_components = 64,
.max_tess_control_input_components = 128,
.max_tess_control_output_components = 128,
.max_tess_control_texture_image_units = 16,
.max_tess_control_uniform_components = 1024,
.max_tess_control_total_output_components = 4096,
.max_tess_evaluation_input_components = 128,
.max_tess_evaluation_output_components = 128,
.max_tess_evaluation_texture_image_units = 16,
.max_tess_evaluation_uniform_components = 1024,
.max_tess_patch_components = 120,
.max_patch_vertices = 32,
.max_tess_gen_level = 64,
.max_viewports = 16,
.max_vertex_atomic_counters = 0,
.max_tess_control_atomic_counters = 0,
.max_tess_evaluation_atomic_counters = 0,
.max_geometry_atomic_counters = 0,
.max_fragment_atomic_counters = 8,
.max_combined_atomic_counters = 8,
.max_atomic_counter_bindings = 1,
.max_vertex_atomic_counter_buffers = 0,
.max_tess_control_atomic_counter_buffers = 0,
.max_tess_evaluation_atomic_counter_buffers = 0,
.max_geometry_atomic_counter_buffers = 0,
.max_fragment_atomic_counter_buffers = 1,
.max_combined_atomic_counter_buffers = 1,
.max_atomic_counter_buffer_size = 16384,
.max_transform_feedback_buffers = 4,
.max_transform_feedback_interleaved_components = 64,
.max_cull_distances = 8,
.max_combined_clip_and_cull_distances = 8,
.max_samples = 4,
.max_mesh_output_vertices_nv = 256,
.max_mesh_output_primitives_nv = 512,
.max_mesh_work_group_size_x_nv = 32,
.max_mesh_work_group_size_y_nv = 1,
.max_mesh_work_group_size_z_nv = 1,
.max_task_work_group_size_x_nv = 32,
.max_task_work_group_size_y_nv = 1,
.max_task_work_group_size_z_nv = 1,
.max_mesh_view_count_nv = 4,
.maxDualSourceDrawBuffersEXT = 1,
.limits = {
.non_inductive_for_loops = 1,
.while_loops = 1,
.do_while_loops = 1,
.general_uniform_indexing = 1,
.general_attribute_matrix_vector_indexing = 1,
.general_varying_indexing = 1,
.general_sampler_indexing = 1,
.general_variable_indexing = 1,
.general_constant_matrix_vector_indexing = 1,
} };
void pgraph_vk_init_glsl_compiler(void)
{
glslang_initialize_process();
}
void pgraph_vk_finalize_glsl_compiler(void)
{
glslang_finalize_process();
}
GByteArray *pgraph_vk_compile_glsl_to_spv(glslang_stage_t stage,
const char *glsl_source)
{
const glslang_input_t input = {
.language = GLSLANG_SOURCE_GLSL,
.stage = stage,
.client = GLSLANG_CLIENT_VULKAN,
.client_version = GLSLANG_TARGET_VULKAN_1_3,
.target_language = GLSLANG_TARGET_SPV,
.target_language_version = GLSLANG_TARGET_SPV_1_5,
.code = glsl_source,
.default_version = 460,
.default_profile = GLSLANG_NO_PROFILE,
.force_default_version_and_profile = false,
.forward_compatible = false,
.messages = GLSLANG_MSG_DEFAULT_BIT,
.resource = &resource_limits,
};
glslang_shader_t *shader = glslang_shader_create(&input);
if (!glslang_shader_preprocess(shader, &input)) {
fprintf(stderr,
"GLSL preprocessing failed\n"
"[INFO]: %s\n"
"[DEBUG]: %s\n"
"%s\n",
glslang_shader_get_info_log(shader),
glslang_shader_get_info_debug_log(shader), input.code);
assert(!"glslang preprocess failed");
glslang_shader_delete(shader);
return NULL;
}
if (!glslang_shader_parse(shader, &input)) {
fprintf(stderr,
"GLSL parsing failed\n"
"[INFO]: %s\n"
"[DEBUG]: %s\n"
"%s\n",
glslang_shader_get_info_log(shader),
glslang_shader_get_info_debug_log(shader),
glslang_shader_get_preprocessed_code(shader));
assert(!"glslang parse failed");
glslang_shader_delete(shader);
return NULL;
}
glslang_program_t *program = glslang_program_create();
glslang_program_add_shader(program, shader);
if (!glslang_program_link(program, GLSLANG_MSG_SPV_RULES_BIT |
GLSLANG_MSG_VULKAN_RULES_BIT)) {
fprintf(stderr,
"GLSL linking failed\n"
"[INFO]: %s\n"
"[DEBUG]: %s\n",
glslang_program_get_info_log(program),
glslang_program_get_info_debug_log(program));
assert(!"glslang link failed");
glslang_program_delete(program);
glslang_shader_delete(shader);
return NULL;
}
glslang_spv_options_t spv_options = {
.validate = true,
#if defined(CONFIG_RENDERDOC)
.disable_optimizer = true,
.generate_debug_info = true,
.emit_nonsemantic_shader_debug_info = true,
.emit_nonsemantic_shader_debug_source = true,
#endif
};
glslang_program_SPIRV_generate_with_options(program, stage, &spv_options);
const char *spirv_messages = glslang_program_SPIRV_get_messages(program);
if (spirv_messages) {
printf("%s\b", spirv_messages);
}
size_t num_program_bytes =
glslang_program_SPIRV_get_size(program) * sizeof(uint32_t);
guint8 *data = g_malloc(num_program_bytes);
glslang_program_SPIRV_get(program, (unsigned int *)data);
glslang_program_delete(program);
glslang_shader_delete(shader);
return g_byte_array_new_take(data, num_program_bytes);
}
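/* Illustration only (not part of the renderer): compiling an inline fragment
 * shader to SPIR-V and wrapping it in a VkShaderModule with the helpers in
 * this file, assuming a valid PGRAPHVkState *r. */
#if 0
GByteArray *spv = pgraph_vk_compile_glsl_to_spv(
    GLSLANG_STAGE_FRAGMENT,
    "#version 450\n"
    "layout(location = 0) out vec4 out_color;\n"
    "void main() { out_color = vec4(1.0); }\n");
VkShaderModule module = pgraph_vk_create_shader_module_from_spv(r, spv);
g_byte_array_unref(spv); /* The module keeps its own copy of the code. */
#endif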
VkShaderModule pgraph_vk_create_shader_module_from_spv(PGRAPHVkState *r, GByteArray *spv)
{
VkShaderModuleCreateInfo create_info = {
.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
.codeSize = spv->len,
.pCode = (uint32_t *)spv->data,
};
VkShaderModule module;
VK_CHECK(
vkCreateShaderModule(r->device, &create_info, NULL, &module));
return module;
}
static void block_to_uniforms(const SpvReflectBlockVariable *block, ShaderUniformLayout *layout)
{
assert(!layout->uniforms);
layout->num_uniforms = block->member_count;
layout->uniforms = g_malloc0_n(block->member_count, sizeof(ShaderUniform));
layout->total_size = block->size;
layout->allocation = g_malloc0(block->size);
for (uint32_t k = 0; k < block->member_count; ++k) {
const SpvReflectBlockVariable *member = &block->members[k];
assert(member->array.dims_count < 2);
layout->uniforms[k] = (ShaderUniform){
.name = strdup(member->name),
.offset = member->offset,
.dim_v = MAX(1, member->numeric.vector.component_count),
.dim_a = MAX(member->array.dims_count ? member->array.dims[0] : 1, member->numeric.matrix.column_count),
.stride = MAX(member->array.stride, member->numeric.matrix.stride),
};
// fprintf(stderr, "<%s offset=%zd dim_v=%zd dim_a=%zd stride=%zd>\n",
// layout->uniforms[k].name,
// layout->uniforms[k].offset,
// layout->uniforms[k].dim_v,
// layout->uniforms[k].dim_a,
// layout->uniforms[k].stride
// );
}
// fprintf(stderr, "--\n");
}
static void init_layout_from_spv(ShaderModuleInfo *info)
{
SpvReflectResult result = spvReflectCreateShaderModule(
info->spirv->len, info->spirv->data, &info->reflect_module);
assert(result == SPV_REFLECT_RESULT_SUCCESS &&
"Failed to create SPIR-V shader module");
uint32_t descriptor_set_count = 0;
result = spvReflectEnumerateDescriptorSets(&info->reflect_module,
&descriptor_set_count, NULL);
assert(result == SPV_REFLECT_RESULT_SUCCESS &&
"Failed to enumerate descriptor sets");
info->descriptor_sets =
g_malloc_n(descriptor_set_count, sizeof(SpvReflectDescriptorSet *));
result = spvReflectEnumerateDescriptorSets(
&info->reflect_module, &descriptor_set_count, info->descriptor_sets);
assert(result == SPV_REFLECT_RESULT_SUCCESS &&
"Failed to enumerate descriptor sets");
info->uniforms.num_uniforms = 0;
info->uniforms.uniforms = NULL;
for (uint32_t i = 0; i < descriptor_set_count; ++i) {
const SpvReflectDescriptorSet *descriptor_set =
info->descriptor_sets[i];
for (uint32_t j = 0; j < descriptor_set->binding_count; ++j) {
const SpvReflectDescriptorBinding *binding =
descriptor_set->bindings[j];
if (binding->descriptor_type !=
SPV_REFLECT_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
continue;
}
const SpvReflectBlockVariable *block = &binding->block;
block_to_uniforms(block, &info->uniforms);
}
}
info->push_constants.num_uniforms = 0;
info->push_constants.uniforms = NULL;
assert(info->reflect_module.push_constant_block_count < 2);
if (info->reflect_module.push_constant_block_count) {
block_to_uniforms(&info->reflect_module.push_constant_blocks[0],
&info->push_constants);
}
}
static glslang_stage_t vk_shader_stage_to_glslang_stage(VkShaderStageFlagBits stage)
{
switch (stage) {
case VK_SHADER_STAGE_GEOMETRY_BIT:
return GLSLANG_STAGE_GEOMETRY;
case VK_SHADER_STAGE_VERTEX_BIT:
return GLSLANG_STAGE_VERTEX;
case VK_SHADER_STAGE_FRAGMENT_BIT:
return GLSLANG_STAGE_FRAGMENT;
case VK_SHADER_STAGE_COMPUTE_BIT:
return GLSLANG_STAGE_COMPUTE;
default:
assert(0);
}
}
ShaderModuleInfo *pgraph_vk_create_shader_module_from_glsl(
PGRAPHVkState *r, VkShaderStageFlagBits stage, const char *glsl)
{
ShaderModuleInfo *info = g_malloc0(sizeof(*info));
info->glsl = strdup(glsl);
info->spirv = pgraph_vk_compile_glsl_to_spv(
vk_shader_stage_to_glslang_stage(stage), glsl);
info->module = pgraph_vk_create_shader_module_from_spv(r, info->spirv);
init_layout_from_spv(info);
return info;
}
static void finalize_uniform_layout(ShaderUniformLayout *layout)
{
for (int i = 0; i < layout->num_uniforms; i++) {
free((void *)layout->uniforms[i].name);
}
if (layout->uniforms) {
g_free(layout->uniforms);
}
g_free(layout->allocation); // Also release the CPU-side staging buffer
}
void pgraph_vk_destroy_shader_module(PGRAPHVkState *r, ShaderModuleInfo *info)
{
if (info->glsl) {
free(info->glsl);
}
finalize_uniform_layout(&info->uniforms);
finalize_uniform_layout(&info->push_constants);
g_free(info->descriptor_sets);
spvReflectDestroyShaderModule(&info->reflect_module);
vkDestroyShaderModule(r->device, info->module, NULL);
g_byte_array_unref(info->spirv);
g_free(info);
}

View File

@ -0,0 +1,205 @@
/*
* Geforce NV2A PGRAPH Vulkan Renderer
*
* Copyright (c) 2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HW_XBOX_NV2A_PGRAPH_VK_GLSL_H
#define HW_XBOX_NV2A_PGRAPH_VK_GLSL_H
#include "qemu/osdep.h"
#include <stdint.h>
#include <assert.h>
#include <string.h>
typedef struct ShaderUniform {
const char *name;
size_t dim_v;
size_t dim_a;
size_t align;
size_t stride;
size_t offset;
} ShaderUniform;
typedef struct ShaderUniformLayout {
ShaderUniform *uniforms;
size_t num_uniforms;
size_t total_size;
void *allocation;
} ShaderUniformLayout;
static inline void uniform_std140(ShaderUniformLayout *layout)
{
size_t offset = 0;
for (int i = 0; i < layout->num_uniforms; i++) {
ShaderUniform *u = &layout->uniforms[i];
size_t size = sizeof(float); // float or int
size_t align = size;
size_t stride = 0;
size *= u->dim_v;
align *= u->dim_v == 3 ? 4 : u->dim_v;
// If an array, each element is padded to vec4.
if (u->dim_a > 1) {
align = 4 * sizeof(float);
stride = align;
size = u->dim_a * align;
} else {
align = size;
stride = 0;
}
offset = ROUND_UP(offset, align);
u->align = align;
u->offset = offset;
u->stride = stride;
offset += size;
}
layout->total_size = offset;
assert(layout->total_size);
}
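/* Worked example (hypothetical block, not part of the renderer): for
 * `vec2 a; float b[4];`, uniform_std140() above yields a at offset 0
 * (align 8), b at offset 16 with stride 16 (array elements padded to vec4),
 * and total_size 80. */
#if 0
ShaderUniform example_uniforms[] = {
    { .name = "a", .dim_v = 2, .dim_a = 1 },
    { .name = "b", .dim_v = 1, .dim_a = 4 },
};
ShaderUniformLayout example_layout = {
    .uniforms = example_uniforms,
    .num_uniforms = 2,
};
uniform_std140(&example_layout);
#endif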
static inline void uniform_std430(ShaderUniformLayout *layout)
{
size_t offset = 0;
for (int i = 0; i < layout->num_uniforms; i++) {
ShaderUniform *u = &layout->uniforms[i];
size_t size = sizeof(float); // float or int
size *= u->dim_v;
size_t align = size;
size *= u->dim_a;
offset = ROUND_UP(offset, align);
u->align = align;
u->offset = offset;
u->stride = u->dim_a > 1 ? (size * u->dim_v) : 0;
offset += size;
}
layout->total_size = offset;
assert(layout->total_size);
}
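/* Returns a 1-based handle suitable for uniform_ptr()/uniform_copy(), or -1
 * if the name is not present in the layout. */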
static inline int uniform_index(ShaderUniformLayout *layout, const char *name)
{
for (int i = 0; i < layout->num_uniforms; i++) {
if (!strcmp(layout->uniforms[i].name, name)) {
return i + 1;
}
}
return -1;
}
static inline
void *uniform_ptr(ShaderUniformLayout *layout, int idx)
{
assert(idx > 0 && "invalid uniform index");
return (char *)layout->allocation + layout->uniforms[idx - 1].offset;
}
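/* Copy `count` scalars of `value_size` bytes into the uniform at `idx`, one
 * dim_v-sized element per iteration, advancing the destination by the
 * layout-computed stride so padded array elements land at the right offsets.
 */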
static inline
void uniform_copy(ShaderUniformLayout *layout, int idx, void *values, size_t value_size, size_t count)
{
assert(idx > 0 && "invalid uniform index");
ShaderUniform *u = &layout->uniforms[idx - 1];
const size_t element_size = value_size * u->dim_v;
size_t bytes_remaining = value_size * count;
char *p_out = uniform_ptr(layout, idx);
char *p_max = (char *)layout->allocation + layout->total_size;
char *p_in = (char *)values;
int index = 0;
while (bytes_remaining) {
assert(p_out < p_max);
assert(index < u->dim_a);
memcpy(p_out, p_in, element_size);
bytes_remaining -= element_size;
p_out += u->stride;
p_in += element_size;
index += 1;
}
}
static inline
void uniform1fv(ShaderUniformLayout *layout, int idx, size_t count, float *values)
{
uniform_copy(layout, idx, values, sizeof(float), count);
}
static inline
void uniform1f(ShaderUniformLayout *layout, int idx, float value)
{
uniform1fv(layout, idx, 1, &value);
}
static inline
void uniform2f(ShaderUniformLayout *layout, int idx, float v0, float v1)
{
float values[] = { v0, v1 };
uniform1fv(layout, idx, 2, values);
}
static inline
void uniform4f(ShaderUniformLayout *layout, int idx, float v0, float v1, float v2, float v3)
{
float values[] = { v0, v1, v2, v3 };
uniform1fv(layout, idx, 4, values);
}
static inline
void uniformMatrix2fv(ShaderUniformLayout *layout, int idx, float *values)
{
uniform1fv(layout, idx, 4, values);
}
static inline
void uniformMatrix4fv(ShaderUniformLayout *layout, int idx, float *values)
{
uniform1fv(layout, idx, 4 * 4, values);
}
static inline
void uniform1iv(ShaderUniformLayout *layout, int idx, size_t count, int32_t *values)
{
uniform_copy(layout, idx, values, sizeof(int32_t), count);
}
static inline
void uniform1i(ShaderUniformLayout *layout, int idx, int32_t value)
{
uniform1iv(layout, idx, 1, &value);
}
static inline
void uniform4i(ShaderUniformLayout *layout, int idx, int v0, int v1, int v2, int v3)
{
int values[] = { v0, v1, v2, v3 };
uniform1iv(layout, idx, 4, values);
}
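/* Typical use (sketch; `layout`, `width`, and `height` are assumed context):
 * resolve the 1-based index once, then write values into the CPU-side
 * allocation before pushing it to the GPU. */
#if 0
int loc = uniform_index(layout, "display_size");
uniform2f(layout, loc, width, height);
#endif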
#endif

View File

@ -0,0 +1,209 @@
/*
* Geforce NV2A PGRAPH Vulkan Renderer
*
* Copyright (c) 2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "renderer.h"
static bool check_format_has_depth_component(VkFormat format)
{
return format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
format == VK_FORMAT_D24_UNORM_S8_UINT ||
format == VK_FORMAT_D16_UNORM;
}
static bool check_format_has_stencil_component(VkFormat format)
{
return format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
format == VK_FORMAT_D24_UNORM_S8_UINT;
}
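/* Emit a pipeline barrier transitioning `image` between the given layouts,
 * choosing source/destination access masks and pipeline stages appropriate
 * for each supported (oldLayout, newLayout) pair; unsupported pairs hit the
 * assert at the end. */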
void pgraph_vk_transition_image_layout(PGRAPHState *pg, VkCommandBuffer cmd,
VkImage image, VkFormat format,
VkImageLayout oldLayout,
VkImageLayout newLayout)
{
VkImageMemoryBarrier barrier = {
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.oldLayout = oldLayout,
.newLayout = newLayout,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = image,
.subresourceRange.baseMipLevel = 0,
.subresourceRange.levelCount = VK_REMAINING_MIP_LEVELS,
.subresourceRange.baseArrayLayer = 0,
.subresourceRange.layerCount = VK_REMAINING_ARRAY_LAYERS,
};
if (check_format_has_depth_component(format)) {
barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
if (check_format_has_stencil_component(format)) {
barrier.subresourceRange.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
}
} else {
barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
}
VkPipelineStageFlags sourceStage;
VkPipelineStageFlags destinationStage;
// Undefined -> Dst
if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED &&
newLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) {
barrier.srcAccessMask = 0;
barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
destinationStage = VK_PIPELINE_STAGE_TRANSFER_BIT;
// Undefined -> Color
} else if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED &&
newLayout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) {
barrier.srcAccessMask = 0;
barrier.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
destinationStage = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
// Undefined -> Depth
} else if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED &&
newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) {
barrier.srcAccessMask = 0;
barrier.dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
sourceStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
destinationStage = VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;
// Dst -> Shader Read
} else if (oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL &&
newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) {
barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
sourceStage = VK_PIPELINE_STAGE_TRANSFER_BIT;
destinationStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
// Dst -> Color
} else if (oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL &&
newLayout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) {
barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
sourceStage = VK_PIPELINE_STAGE_TRANSFER_BIT;
destinationStage = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
// Dst -> Depth
} else if (oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL &&
newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) {
barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
sourceStage = VK_PIPELINE_STAGE_TRANSFER_BIT;
destinationStage = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT;
// Dst -> Src
} else if (oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL &&
newLayout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL) {
barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
sourceStage = VK_PIPELINE_STAGE_TRANSFER_BIT;
destinationStage = VK_PIPELINE_STAGE_TRANSFER_BIT;
// Shader Read -> Dst
} else if (oldLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL &&
newLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) {
barrier.srcAccessMask = VK_ACCESS_SHADER_READ_BIT;
barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
sourceStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
destinationStage = VK_PIPELINE_STAGE_TRANSFER_BIT;
// Shader Read -> Color
} else if (oldLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL &&
newLayout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) {
barrier.srcAccessMask = VK_ACCESS_SHADER_READ_BIT;
barrier.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
sourceStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
destinationStage = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
// Color -> Src
} else if (oldLayout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL &&
newLayout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL) {
barrier.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
sourceStage = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
destinationStage = VK_PIPELINE_STAGE_TRANSFER_BIT;
// Color -> Dst
} else if (oldLayout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL &&
newLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) {
barrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
sourceStage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
destinationStage = VK_PIPELINE_STAGE_TRANSFER_BIT;
// Color -> Shader Read
} else if (oldLayout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL &&
newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) {
barrier.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
sourceStage = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
destinationStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
// Depth -> Src
} else if (oldLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL &&
newLayout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL) {
barrier.srcAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
sourceStage = VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;
destinationStage = VK_PIPELINE_STAGE_TRANSFER_BIT;
// Depth -> Dst
} else if (oldLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL &&
newLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) {
barrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
sourceStage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
destinationStage = VK_PIPELINE_STAGE_TRANSFER_BIT;
// Src -> Color
} else if (oldLayout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL &&
newLayout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) {
barrier.srcAccessMask = 0;
barrier.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
sourceStage = VK_PIPELINE_STAGE_TRANSFER_BIT;
destinationStage = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
// Src -> Depth
} else if (oldLayout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL &&
newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) {
barrier.srcAccessMask = 0;
barrier.dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
sourceStage = VK_PIPELINE_STAGE_TRANSFER_BIT;
destinationStage = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT;
// Src -> Dst
} else if (oldLayout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL &&
newLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) {
barrier.srcAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
sourceStage = VK_PIPELINE_STAGE_TRANSFER_BIT;
destinationStage = VK_PIPELINE_STAGE_TRANSFER_BIT;
} else {
assert(!"unsupported layout transition!");
}
vkCmdPipelineBarrier(cmd, sourceStage, destinationStage, 0, 0,
NULL, 0, NULL, 1, &barrier);
}

View File

@ -0,0 +1,662 @@
/*
* Geforce NV2A PGRAPH Vulkan Renderer
*
* Copyright (c) 2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "qemu/osdep.h"
#include "ui/xemu-settings.h"
#include "renderer.h"
#include "xemu-version.h"
#include <SDL.h>
#include <SDL_syswm.h>
#include <SDL_vulkan.h>
#include <volk.h>
typedef GArray VkExtensionPropertiesArray;
typedef GArray StringArray;
static bool enable_validation = false;
static char const *const validation_layers[] = {
"VK_LAYER_KHRONOS_validation",
};
static char const *const required_instance_extensions[] = {
VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME,
VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME,
VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME,
};
static char const *const required_device_extensions[] = {
VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME,
VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME,
#ifdef WIN32
VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME,
VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME,
#else
VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME,
VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME,
#endif
};
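/* Debug messenger callback: log every message, and treat validation warnings
 * and errors as fatal so problems surface immediately when validation layers
 * are enabled. */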
static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback(
VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity,
VkDebugUtilsMessageTypeFlagsEXT messageType,
const VkDebugUtilsMessengerCallbackDataEXT *pCallbackData, void *pUserData)
{
NV2A_VK_DPRINTF("[vk] %s", pCallbackData->pMessage);
fprintf(stderr, "[vk] %s\n", pCallbackData->pMessage);
if ((messageType & VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT) &&
(messageSeverity & (VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT |
VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT))) {
exit(1);
}
return VK_FALSE;
}
static bool check_validation_layer_support(void)
{
uint32_t num_available_layers;
vkEnumerateInstanceLayerProperties(&num_available_layers, NULL);
g_autofree VkLayerProperties *available_layers =
g_malloc_n(num_available_layers, sizeof(VkLayerProperties));
vkEnumerateInstanceLayerProperties(&num_available_layers, available_layers);
for (int i = 0; i < ARRAY_SIZE(validation_layers); i++) {
bool found = false;
for (int j = 0; j < num_available_layers; j++) {
if (!strcmp(validation_layers[i], available_layers[j].layerName)) {
found = true;
break;
}
}
if (!found) {
fprintf(stderr, "desired validation layer not found: %s\n",
validation_layers[i]);
return false;
}
}
return true;
}
static SDL_Window *create_window(void)
{
SDL_Window *window = SDL_CreateWindow(
"SDL Offscreen Window", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED,
640, 480, SDL_WINDOW_VULKAN | SDL_WINDOW_HIDDEN);
if (window == NULL) {
fprintf(stderr, "%s: Failed to create window\n", __func__);
SDL_Quit();
exit(1);
}
return window;
}
static VkExtensionPropertiesArray *
get_available_instance_extensions(PGRAPHState *pg)
{
uint32_t num_extensions = 0;
VK_CHECK(
vkEnumerateInstanceExtensionProperties(NULL, &num_extensions, NULL));
VkExtensionPropertiesArray *extensions = g_array_sized_new(
FALSE, FALSE, sizeof(VkExtensionProperties), num_extensions);
g_array_set_size(extensions, num_extensions);
VK_CHECK(vkEnumerateInstanceExtensionProperties(
NULL, &num_extensions, (VkExtensionProperties *)extensions->data));
return extensions;
}
static bool
is_extension_available(VkExtensionPropertiesArray *available_extensions,
const char *extension_name)
{
for (int i = 0; i < available_extensions->len; i++) {
VkExtensionProperties *e =
&g_array_index(available_extensions, VkExtensionProperties, i);
if (!strcmp(e->extensionName, extension_name)) {
return true;
}
}
return false;
}
static StringArray *get_required_instance_extension_names(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
// Add instance extensions SDL lists as required
unsigned int sdl_count = 0;
SDL_Vulkan_GetInstanceExtensions((SDL_Window *)r->window, &sdl_count, NULL);
StringArray *extensions =
g_array_sized_new(FALSE, FALSE, sizeof(char *),
sdl_count + ARRAY_SIZE(required_instance_extensions));
if (sdl_count) {
g_array_set_size(extensions, sdl_count);
SDL_Vulkan_GetInstanceExtensions((SDL_Window *)r->window, &sdl_count,
(const char **)extensions->data);
}
// Add additional required extensions
g_array_append_vals(extensions, required_instance_extensions,
ARRAY_SIZE(required_instance_extensions));
return extensions;
}
static bool
add_extension_if_available(VkExtensionPropertiesArray *available_extensions,
StringArray *enabled_extension_names,
const char *desired_extension_name)
{
if (is_extension_available(available_extensions, desired_extension_name)) {
g_array_append_val(enabled_extension_names, desired_extension_name);
return true;
}
fprintf(stderr, "Warning: extension not available: %s\n",
desired_extension_name);
return false;
}
static void
add_optional_instance_extension_names(PGRAPHState *pg,
VkExtensionPropertiesArray *available_extensions,
StringArray *enabled_extension_names)
{
PGRAPHVkState *r = pg->vk_renderer_state;
r->debug_utils_extension_enabled =
g_config.display.vulkan.validation_layers &&
add_extension_if_available(available_extensions, enabled_extension_names,
VK_EXT_DEBUG_UTILS_EXTENSION_NAME);
}
static void create_instance(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
r->window = create_window();
VK_CHECK(volkInitialize());
VkApplicationInfo app_info = {
.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
.pApplicationName = "xemu",
.applicationVersion = VK_MAKE_VERSION(
xemu_version_major, xemu_version_minor, xemu_version_patch),
.pEngineName = "No Engine",
.engineVersion = VK_MAKE_VERSION(1, 0, 0),
.apiVersion = VK_API_VERSION_1_3,
};
g_autofree VkExtensionPropertiesArray *available_extensions =
get_available_instance_extensions(pg);
g_autofree StringArray *enabled_extension_names =
get_required_instance_extension_names(pg);
bool all_required_extensions_available = true;
for (int i = 0; i < enabled_extension_names->len; i++) {
const char *required_extension =
g_array_index(enabled_extension_names, const char *, i);
if (!is_extension_available(available_extensions, required_extension)) {
fprintf(stderr,
"Error: Required instance extension not available: %s\n",
required_extension);
all_required_extensions_available = false;
}
}
assert(all_required_extensions_available);
add_optional_instance_extension_names(pg, available_extensions,
enabled_extension_names);
fprintf(stderr, "Enabled instance extensions:\n");
for (int i = 0; i < enabled_extension_names->len; i++) {
fprintf(stderr, "- %s\n", g_array_index(enabled_extension_names, char *, i));
}
VkInstanceCreateInfo create_info = {
.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
.pApplicationInfo = &app_info,
.enabledExtensionCount = enabled_extension_names->len,
.ppEnabledExtensionNames =
&g_array_index(enabled_extension_names, const char *, 0),
};
VkDebugUtilsMessengerCreateInfoEXT dbg_create_info;
if (r->debug_utils_extension_enabled) {
dbg_create_info = (VkDebugUtilsMessengerCreateInfoEXT){
.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT,
.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT |
VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT |
VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT,
.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT |
VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT |
VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT,
.pfnUserCallback = debugCallback,
};
}
enable_validation = g_config.display.vulkan.validation_layers;
if (enable_validation) {
if (check_validation_layer_support()) {
fprintf(stderr, "Warning: Validation layers enabled. Expect performance impact.\n");
create_info.enabledLayerCount = ARRAY_SIZE(validation_layers);
create_info.ppEnabledLayerNames = validation_layers;
if (r->debug_utils_extension_enabled) {
create_info.pNext =
(VkDebugUtilsMessengerCreateInfoEXT *)&dbg_create_info;
}
} else {
fprintf(stderr, "Warning: validation layers not available\n");
enable_validation = false;
}
}
VK_CHECK(vkCreateInstance(&create_info, NULL, &r->instance));
volkLoadInstance(r->instance);
}
static bool is_queue_family_indices_complete(QueueFamilyIndices indices)
{
return indices.queue_family >= 0;
}
QueueFamilyIndices pgraph_vk_find_queue_families(VkPhysicalDevice device)
{
QueueFamilyIndices indices = {
.queue_family = -1,
};
uint32_t num_queue_families = 0;
vkGetPhysicalDeviceQueueFamilyProperties(device, &num_queue_families, NULL);
g_autofree VkQueueFamilyProperties *queue_families =
g_malloc_n(num_queue_families, sizeof(VkQueueFamilyProperties));
vkGetPhysicalDeviceQueueFamilyProperties(device, &num_queue_families,
queue_families);
for (int i = 0; i < num_queue_families; i++) {
VkQueueFamilyProperties queueFamily = queue_families[i];
// FIXME: Support independent graphics, compute queues
int required_flags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT;
if ((queueFamily.queueFlags & required_flags) == required_flags) {
indices.queue_family = i;
}
if (is_queue_family_indices_complete(indices)) {
break;
}
}
return indices;
}
static VkExtensionPropertiesArray *
get_available_device_extensions(VkPhysicalDevice device)
{
uint32_t num_extensions = 0;
VK_CHECK(vkEnumerateDeviceExtensionProperties(device, NULL, &num_extensions,
NULL));
VkExtensionPropertiesArray *extensions = g_array_sized_new(
FALSE, FALSE, sizeof(VkExtensionProperties), num_extensions);
g_array_set_size(extensions, num_extensions);
VK_CHECK(vkEnumerateDeviceExtensionProperties(
device, NULL, &num_extensions,
(VkExtensionProperties *)extensions->data));
return extensions;
}
static StringArray *get_required_device_extension_names(void)
{
StringArray *extensions =
g_array_sized_new(FALSE, FALSE, sizeof(char *),
ARRAY_SIZE(required_device_extensions));
g_array_append_vals(extensions, required_device_extensions,
ARRAY_SIZE(required_device_extensions));
return extensions;
}
static void add_optional_device_extension_names(
PGRAPHState *pg, VkExtensionPropertiesArray *available_extensions,
StringArray *enabled_extension_names)
{
PGRAPHVkState *r = pg->vk_renderer_state;
r->custom_border_color_extension_enabled =
add_extension_if_available(available_extensions, enabled_extension_names,
VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME);
r->provoking_vertex_extension_enabled =
add_extension_if_available(available_extensions, enabled_extension_names,
VK_EXT_PROVOKING_VERTEX_EXTENSION_NAME);
r->memory_budget_extension_enabled = add_extension_if_available(
available_extensions, enabled_extension_names,
VK_EXT_MEMORY_BUDGET_EXTENSION_NAME);
}
static bool check_device_support_required_extensions(VkPhysicalDevice device)
{
g_autofree VkExtensionPropertiesArray *available_extensions =
get_available_device_extensions(device);
for (int i = 0; i < ARRAY_SIZE(required_device_extensions); i++) {
if (!is_extension_available(available_extensions,
required_device_extensions[i])) {
fprintf(stderr, "required device extension not found: %s\n",
required_device_extensions[i]);
return false;
}
}
return true;
}
static bool is_device_compatible(VkPhysicalDevice device)
{
QueueFamilyIndices indices = pgraph_vk_find_queue_families(device);
return is_queue_family_indices_complete(indices) &&
check_device_support_required_extensions(device);
// FIXME: Check formats
// FIXME: Check vram
}
static void select_physical_device(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
uint32_t num_physical_devices = 0;
vkEnumeratePhysicalDevices(r->instance, &num_physical_devices, NULL);
if (num_physical_devices == 0) {
assert(!"failed to find GPUs with Vulkan support");
}
g_autofree VkPhysicalDevice *devices =
g_malloc_n(num_physical_devices, sizeof(VkPhysicalDevice));
vkEnumeratePhysicalDevices(r->instance, &num_physical_devices, devices);
fprintf(stderr, "Available physical devices:\n");
for (int i = 0; i < num_physical_devices; i++) {
vkGetPhysicalDeviceProperties(devices[i], &r->device_props);
fprintf(stderr, "- %s\n", r->device_props.deviceName);
}
// FIXME: Store preferred device
r->physical_device = VK_NULL_HANDLE;
for (int i = 0; i < num_physical_devices; i++) {
if (is_device_compatible(devices[i])) {
r->physical_device = devices[i];
break;
}
}
if (r->physical_device == VK_NULL_HANDLE) {
assert(!"failed to find a suitable GPU");
}
vkGetPhysicalDeviceProperties(r->physical_device, &r->device_props);
fprintf(stderr,
"Selected physical device: %s\n"
"- Vendor: %x, Device: %x\n"
"- Driver Version: %d.%d.%d\n",
r->device_props.deviceName,
r->device_props.vendorID,
r->device_props.deviceID,
VK_VERSION_MAJOR(r->device_props.driverVersion),
VK_VERSION_MINOR(r->device_props.driverVersion),
VK_VERSION_PATCH(r->device_props.driverVersion));
size_t vsh_attr_values_size =
NV2A_VERTEXSHADER_ATTRIBUTES * 4 * sizeof(float);
assert(r->device_props.limits.maxPushConstantsSize >= vsh_attr_values_size);
}
static void create_logical_device(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
QueueFamilyIndices indices =
pgraph_vk_find_queue_families(r->physical_device);
g_autofree VkExtensionPropertiesArray *available_extensions =
get_available_device_extensions(r->physical_device);
g_autofree StringArray *enabled_extension_names =
get_required_device_extension_names();
add_optional_device_extension_names(pg, available_extensions,
enabled_extension_names);
fprintf(stderr, "Enabled device extensions:\n");
for (int i = 0; i < enabled_extension_names->len; i++) {
fprintf(stderr, "- %s\n", g_array_index(enabled_extension_names, char *, i));
}
float queuePriority = 1.0f;
VkDeviceQueueCreateInfo queue_create_info = {
.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
.queueFamilyIndex = indices.queue_family,
.queueCount = 1,
.pQueuePriorities = &queuePriority,
};
// Ensure device supports required features
VkPhysicalDeviceFeatures available_features, enabled_features;
vkGetPhysicalDeviceFeatures(r->physical_device, &available_features);
memset(&enabled_features, 0, sizeof(enabled_features));
struct {
const char *name;
VkBool32 available, *enabled;
} required_features[] = {
#define F(n) { #n, available_features.n, &enabled_features.n }
F(shaderClipDistance),
F(geometryShader),
F(shaderTessellationAndGeometryPointSize),
F(depthClamp),
F(occlusionQueryPrecise),
#undef F
};
bool all_features_available = true;
for (int i = 0; i < ARRAY_SIZE(required_features); i++) {
if (required_features[i].available != VK_TRUE) {
fprintf(stderr, "Error: Device does not support required feature %s\n", required_features[i].name);
all_features_available = false;
}
*required_features[i].enabled = VK_TRUE;
}
assert(all_features_available);
void *next_struct = NULL;
VkPhysicalDeviceProvokingVertexFeaturesEXT provoking_vertex_features;
if (r->provoking_vertex_extension_enabled) {
provoking_vertex_features = (VkPhysicalDeviceProvokingVertexFeaturesEXT){
.sType =
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_FEATURES_EXT,
.provokingVertexLast = VK_TRUE,
.pNext = next_struct,
};
next_struct = &provoking_vertex_features;
}
VkPhysicalDeviceCustomBorderColorFeaturesEXT custom_border_features;
if (r->custom_border_color_extension_enabled) {
custom_border_features = (VkPhysicalDeviceCustomBorderColorFeaturesEXT){
.sType =
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_FEATURES_EXT,
.customBorderColors = VK_TRUE,
.pNext = next_struct,
};
next_struct = &custom_border_features;
}
VkDeviceCreateInfo device_create_info = {
.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
.queueCreateInfoCount = 1,
.pQueueCreateInfos = &queue_create_info,
.pEnabledFeatures = &enabled_features,
.enabledExtensionCount = enabled_extension_names->len,
.ppEnabledExtensionNames =
&g_array_index(enabled_extension_names, const char *, 0),
.pNext = next_struct,
};
if (enable_validation) {
device_create_info.enabledLayerCount = ARRAY_SIZE(validation_layers);
device_create_info.ppEnabledLayerNames = validation_layers;
}
VK_CHECK(vkCreateDevice(r->physical_device, &device_create_info, NULL,
&r->device));
vkGetDeviceQueue(r->device, indices.queue_family, 0, &r->queue);
}
uint32_t pgraph_vk_get_memory_type(PGRAPHState *pg, uint32_t type_bits,
VkMemoryPropertyFlags properties)
{
PGRAPHVkState *r = pg->vk_renderer_state;
VkPhysicalDeviceMemoryProperties prop;
vkGetPhysicalDeviceMemoryProperties(r->physical_device, &prop);
for (uint32_t i = 0; i < prop.memoryTypeCount; i++) {
if ((prop.memoryTypes[i].propertyFlags & properties) == properties &&
type_bits & (1 << i)) {
return i;
}
}
return 0xFFFFFFFF; // Unable to find memoryType
}
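/*
 * Usage sketch (illustrative, not part of this file): callers pass the
 * memoryTypeBits reported for a specific resource, e.g.:
 *
 *   VkMemoryRequirements reqs;
 *   vkGetBufferMemoryRequirements(r->device, buffer, &reqs);
 *   uint32_t type = pgraph_vk_get_memory_type(
 *       pg, reqs.memoryTypeBits,
 *       VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
 *           VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
 *   assert(type != 0xFFFFFFFF);
 */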
static void init_allocator(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
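/* Vulkan entry points are loaded at runtime by volk (renderer.h defines
 * VK_NO_PROTOTYPES), so the resolved function pointers are handed to VMA
 * explicitly here. */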
VmaVulkanFunctions vulkanFunctions = {
/// Required when using VMA_DYNAMIC_VULKAN_FUNCTIONS.
.vkGetInstanceProcAddr = vkGetInstanceProcAddr,
/// Required when using VMA_DYNAMIC_VULKAN_FUNCTIONS.
.vkGetDeviceProcAddr = vkGetDeviceProcAddr,
.vkGetPhysicalDeviceProperties = vkGetPhysicalDeviceProperties,
.vkGetPhysicalDeviceMemoryProperties = vkGetPhysicalDeviceMemoryProperties,
.vkAllocateMemory = vkAllocateMemory,
.vkFreeMemory = vkFreeMemory,
.vkMapMemory = vkMapMemory,
.vkUnmapMemory = vkUnmapMemory,
.vkFlushMappedMemoryRanges = vkFlushMappedMemoryRanges,
.vkInvalidateMappedMemoryRanges = vkInvalidateMappedMemoryRanges,
.vkBindBufferMemory = vkBindBufferMemory,
.vkBindImageMemory = vkBindImageMemory,
.vkGetBufferMemoryRequirements = vkGetBufferMemoryRequirements,
.vkGetImageMemoryRequirements = vkGetImageMemoryRequirements,
.vkCreateBuffer = vkCreateBuffer,
.vkDestroyBuffer = vkDestroyBuffer,
.vkCreateImage = vkCreateImage,
.vkDestroyImage = vkDestroyImage,
.vkCmdCopyBuffer = vkCmdCopyBuffer,
#if VMA_DEDICATED_ALLOCATION || VMA_VULKAN_VERSION >= 1001000
/// Fetch "vkGetBufferMemoryRequirements2" on Vulkan >= 1.1, fetch "vkGetBufferMemoryRequirements2KHR" when using VK_KHR_dedicated_allocation extension.
.vkGetBufferMemoryRequirements2KHR = vkGetBufferMemoryRequirements2,
/// Fetch "vkGetImageMemoryRequirements2" on Vulkan >= 1.1, fetch "vkGetImageMemoryRequirements2KHR" when using VK_KHR_dedicated_allocation extension.
.vkGetImageMemoryRequirements2KHR = vkGetImageMemoryRequirements2,
#endif
#if VMA_BIND_MEMORY2 || VMA_VULKAN_VERSION >= 1001000
/// Fetch "vkBindBufferMemory2" on Vulkan >= 1.1, fetch "vkBindBufferMemory2KHR" when using VK_KHR_bind_memory2 extension.
.vkBindBufferMemory2KHR = vkBindBufferMemory2,
/// Fetch "vkBindImageMemory2" on Vulkan >= 1.1, fetch "vkBindImageMemory2KHR" when using VK_KHR_bind_memory2 extension.
.vkBindImageMemory2KHR = vkBindImageMemory2,
#endif
#if VMA_MEMORY_BUDGET || VMA_VULKAN_VERSION >= 1001000
/// Fetch from "vkGetPhysicalDeviceMemoryProperties2" on Vulkan >= 1.1, but you can also fetch it from "vkGetPhysicalDeviceMemoryProperties2KHR" if you enabled extension VK_KHR_get_physical_device_properties2.
.vkGetPhysicalDeviceMemoryProperties2KHR = vkGetPhysicalDeviceMemoryProperties2KHR,
#endif
#if VMA_KHR_MAINTENANCE4 || VMA_VULKAN_VERSION >= 1003000
/// Fetch from "vkGetDeviceBufferMemoryRequirements" on Vulkan >= 1.3, but you can also fetch it from "vkGetDeviceBufferMemoryRequirementsKHR" if you enabled extension VK_KHR_maintenance4.
.vkGetDeviceBufferMemoryRequirements = vkGetDeviceBufferMemoryRequirements,
/// Fetch from "vkGetDeviceImageMemoryRequirements" on Vulkan >= 1.3, but you can also fetch it from "vkGetDeviceImageMemoryRequirementsKHR" if you enabled extension VK_KHR_maintenance4.
.vkGetDeviceImageMemoryRequirements = vkGetDeviceImageMemoryRequirements,
#endif
};
VmaAllocatorCreateInfo create_info = {
.flags = (r->memory_budget_extension_enabled ?
VMA_ALLOCATOR_CREATE_EXT_MEMORY_BUDGET_BIT :
0),
.vulkanApiVersion = VK_API_VERSION_1_3,
.instance = r->instance,
.physicalDevice = r->physical_device,
.device = r->device,
.pVulkanFunctions = &vulkanFunctions,
};
VK_CHECK(vmaCreateAllocator(&create_info, &r->allocator));
}
static void finalize_allocator(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
vmaDestroyAllocator(r->allocator);
}
void pgraph_vk_init_instance(PGRAPHState *pg)
{
create_instance(pg);
select_physical_device(pg);
create_logical_device(pg);
init_allocator(pg);
}
void pgraph_vk_finalize_instance(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
finalize_allocator(pg);
vkDestroyDevice(r->device, NULL);
r->device = VK_NULL_HANDLE;
vkDestroyInstance(r->instance, NULL);
r->instance = VK_NULL_HANDLE;
}

View File

@ -0,0 +1,24 @@
if vulkan.found()
specific_ss.add([sdl, volk, libglslang, vma, vulkan, spirv_reflect, gloffscreen,
files(
'blit.c',
'buffer.c',
'command.c',
'debug.c',
'display.c',
'draw.c',
'glsl.c',
'image.c',
'instance.c',
'renderer.c',
'reports.c',
'shaders.c',
'surface-compute.c',
'surface.c',
'texture.c',
'vertex.c',
)
])
endif

View File

@ -0,0 +1,266 @@
/*
* Geforce NV2A PGRAPH Vulkan Renderer
*
* Copyright (c) 2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "hw/xbox/nv2a/nv2a_int.h"
#include "renderer.h"
#include "gloffscreen.h"
#if HAVE_EXTERNAL_MEMORY
static GloContext *g_gl_context;
static void gl_context_init(void)
{
g_gl_context = glo_context_create();
}
#endif
static void pgraph_vk_init_thread(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
#if HAVE_EXTERNAL_MEMORY
glo_set_current(g_gl_context);
#endif
pgraph_vk_init_instance(pg);
pgraph_vk_init_command_buffers(pg);
pgraph_vk_init_buffers(d);
pgraph_vk_init_surfaces(pg);
pgraph_vk_init_shaders(pg);
pgraph_vk_init_pipelines(pg);
pgraph_vk_init_textures(pg);
pgraph_vk_init_reports(pg);
pgraph_vk_init_compute(pg);
pgraph_vk_init_display(pg);
}
static void pgraph_vk_finalize(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
pgraph_vk_finalize_display(pg);
pgraph_vk_finalize_compute(pg);
pgraph_vk_finalize_reports(pg);
pgraph_vk_finalize_textures(pg);
pgraph_vk_finalize_pipelines(pg);
pgraph_vk_finalize_shaders(pg);
pgraph_vk_finalize_surfaces(pg);
pgraph_vk_finalize_buffers(d);
pgraph_vk_finalize_command_buffers(pg);
pgraph_vk_finalize_instance(pg);
}
static void pgraph_vk_flush(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
pgraph_vk_finish(pg, VK_FINISH_REASON_FLUSH);
pgraph_vk_surface_flush(d);
pgraph_vk_mark_textures_possibly_dirty(d, 0, memory_region_size(d->vram));
pgraph_vk_update_vertex_ram_buffer(&d->pgraph, 0, d->vram_ptr,
memory_region_size(d->vram));
for (int i = 0; i < 4; i++) {
pg->texture_dirty[i] = true;
}
/* FIXME: Flush more? */
qatomic_set(&d->pgraph.flush_pending, false);
qemu_event_set(&d->pgraph.flush_complete);
}
static void pgraph_vk_sync(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
pgraph_vk_render_display(pg);
qatomic_set(&d->pgraph.sync_pending, false);
qemu_event_set(&d->pgraph.sync_complete);
}
static void pgraph_vk_process_pending(NV2AState *d)
{
PGRAPHVkState *r = d->pgraph.vk_renderer_state;
if (qatomic_read(&r->downloads_pending) ||
qatomic_read(&r->download_dirty_surfaces_pending) ||
qatomic_read(&d->pgraph.sync_pending) ||
qatomic_read(&d->pgraph.flush_pending)
) {
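/* Drop pfifo.lock before taking pgraph.lock so the two are never held in
 * the opposite order of other code paths; pfifo.lock is re-acquired below
 * once the pending work has been handled. */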
qemu_mutex_unlock(&d->pfifo.lock);
qemu_mutex_lock(&d->pgraph.lock);
if (qatomic_read(&r->downloads_pending)) {
pgraph_vk_process_pending_downloads(d);
}
if (qatomic_read(&r->download_dirty_surfaces_pending)) {
pgraph_vk_download_dirty_surfaces(d);
}
if (qatomic_read(&d->pgraph.sync_pending)) {
pgraph_vk_sync(d);
}
if (qatomic_read(&d->pgraph.flush_pending)) {
pgraph_vk_flush(d);
}
qemu_mutex_unlock(&d->pgraph.lock);
qemu_mutex_lock(&d->pfifo.lock);
}
}
static void pgraph_vk_flip_stall(NV2AState *d)
{
pgraph_vk_finish(&d->pgraph, VK_FINISH_REASON_FLIP_STALL);
pgraph_vk_debug_frame_terminator();
}
static void pgraph_vk_pre_savevm_trigger(NV2AState *d)
{
qatomic_set(&d->pgraph.vk_renderer_state->download_dirty_surfaces_pending, true);
qemu_event_reset(&d->pgraph.vk_renderer_state->dirty_surfaces_download_complete);
}
static void pgraph_vk_pre_savevm_wait(NV2AState *d)
{
qemu_event_wait(&d->pgraph.vk_renderer_state->dirty_surfaces_download_complete);
}
static void pgraph_vk_pre_shutdown_trigger(NV2AState *d)
{
// qatomic_set(&d->pgraph.vk_renderer_state->shader_cache_writeback_pending, true);
// qemu_event_reset(&d->pgraph.vk_renderer_state->shader_cache_writeback_complete);
}
static void pgraph_vk_pre_shutdown_wait(NV2AState *d)
{
// qemu_event_wait(&d->pgraph.vk_renderer_state->shader_cache_writeback_complete);
}
static int pgraph_vk_get_framebuffer_surface(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHVkState *r = pg->vk_renderer_state;
qemu_mutex_lock(&d->pfifo.lock);
// FIXME: Possible race condition with pgraph, consider lock
uint32_t pline_offset, pstart_addr, pline_compare;
d->vga.get_offsets(&d->vga, &pline_offset, &pstart_addr, &pline_compare);
SurfaceBinding *surface = pgraph_vk_surface_get_within(d, d->pcrtc.start + pline_offset);
if (surface == NULL || !surface->color) {
qemu_mutex_unlock(&d->pfifo.lock);
return 0;
}
assert(surface->color);
surface->frame_time = pg->frame_time;
#if HAVE_EXTERNAL_MEMORY
qemu_event_reset(&d->pgraph.sync_complete);
qatomic_set(&pg->sync_pending, true);
pfifo_kick(d);
qemu_mutex_unlock(&d->pfifo.lock);
qemu_event_wait(&d->pgraph.sync_complete);
return r->display.gl_texture_id;
#else
qemu_mutex_unlock(&d->pfifo.lock);
pgraph_vk_wait_for_surface_download(surface);
return 0;
#endif
}
static void pgraph_vk_init(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
pg->vk_renderer_state = g_malloc0(sizeof(PGRAPHVkState));
pgraph_vk_debug_init();
}
static PGRAPHRenderer pgraph_vk_renderer = {
.type = CONFIG_DISPLAY_RENDERER_VULKAN,
.name = "Vulkan",
.ops = {
.init = pgraph_vk_init,
#if HAVE_EXTERNAL_MEMORY
.early_context_init = gl_context_init,
#endif
.init_thread = pgraph_vk_init_thread,
.finalize = pgraph_vk_finalize,
.clear_report_value = pgraph_vk_clear_report_value,
.clear_surface = pgraph_vk_clear_surface,
.draw_begin = pgraph_vk_draw_begin,
.draw_end = pgraph_vk_draw_end,
.flip_stall = pgraph_vk_flip_stall,
.flush_draw = pgraph_vk_flush_draw,
.get_report = pgraph_vk_get_report,
.image_blit = pgraph_vk_image_blit,
.pre_savevm_trigger = pgraph_vk_pre_savevm_trigger,
.pre_savevm_wait = pgraph_vk_pre_savevm_wait,
.pre_shutdown_trigger = pgraph_vk_pre_shutdown_trigger,
.pre_shutdown_wait = pgraph_vk_pre_shutdown_wait,
.process_pending = pgraph_vk_process_pending,
.process_pending_reports = pgraph_vk_process_pending_reports,
.surface_update = pgraph_vk_surface_update,
.set_surface_scale_factor = pgraph_vk_set_surface_scale_factor,
.get_surface_scale_factor = pgraph_vk_get_surface_scale_factor,
.get_framebuffer_surface = pgraph_vk_get_framebuffer_surface,
}
};
static void __attribute__((constructor)) register_renderer(void)
{
pgraph_renderer_register(&pgraph_vk_renderer);
}
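/* The constructor attribute makes registration run at load time, before
 * nv2a initialization picks a renderer based on the configured
 * display.renderer value. */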
void pgraph_vk_check_memory_budget(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
VkPhysicalDeviceMemoryProperties const *props;
vmaGetMemoryProperties(r->allocator, &props);
g_autofree VmaBudget *budgets = g_malloc_n(props->memoryHeapCount, sizeof(VmaBudget));
vmaGetHeapBudgets(r->allocator, budgets);
const float budget_threshold = 0.8;
bool near_budget = false;
for (uint32_t i = 0; i < props->memoryHeapCount; i++) {
VmaBudget *b = &budgets[i];
float use_to_budget_ratio =
(double)b->statistics.allocationBytes / (double)b->budget;
NV2A_VK_DPRINTF("Heap %u: used %" PRIu64 "/%" PRIu64 " MiB (%.2f%%)", i,
b->statistics.allocationBytes / (1024 * 1024),
b->budget / (1024 * 1024), use_to_budget_ratio * 100);
near_budget |= use_to_budget_ratio > budget_threshold;
}
// If any heaps are near budget, free up some resources
if (near_budget) {
pgraph_vk_trim_texture_cache(pg);
}
#if 0
char *s;
vmaBuildStatsString(r->allocator, &s, VK_TRUE);
puts(s);
vmaFreeStatsString(r->allocator, s);
#endif
}

View File

@ -0,0 +1,526 @@
/*
* Geforce NV2A PGRAPH Vulkan Renderer
*
* Copyright (c) 2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HW_XBOX_NV2A_PGRAPH_VK_RENDERER_H
#define HW_XBOX_NV2A_PGRAPH_VK_RENDERER_H
#define VK_NO_PROTOTYPES 1
#include "qemu/osdep.h"
#include "qemu/thread.h"
#include "qemu/queue.h"
#include "qemu/lru.h"
#include "hw/hw.h"
#include "hw/xbox/nv2a/nv2a_int.h"
#include "hw/xbox/nv2a/nv2a_regs.h"
#include "hw/xbox/nv2a/pgraph/surface.h"
#include "hw/xbox/nv2a/pgraph/texture.h"
#include "hw/xbox/nv2a/pgraph/shaders.h"
#include <vulkan/vulkan.h>
#include <glslang/Include/glslang_c_interface.h>
#include <volk.h>
#include <spirv_reflect.h>
#define VMA_STATIC_VULKAN_FUNCTIONS 1
#define VMA_DYNAMIC_VULKAN_FUNCTIONS 0
#include <vk_mem_alloc.h>
#include "debug.h"
#include "constants.h"
#include "glsl.h"
#define HAVE_EXTERNAL_MEMORY 1
typedef struct QueueFamilyIndices {
int queue_family;
} QueueFamilyIndices;
typedef struct MemorySyncRequirement {
hwaddr addr, size;
} MemorySyncRequirement;
typedef struct RenderPassState {
VkFormat color_format;
VkFormat zeta_format;
} RenderPassState;
typedef struct RenderPass {
RenderPassState state;
VkRenderPass render_pass;
} RenderPass;
typedef struct PipelineKey {
bool clear;
RenderPassState render_pass_state;
ShaderState shader_state;
uint32_t regs[10];
VkVertexInputBindingDescription binding_descriptions[NV2A_VERTEXSHADER_ATTRIBUTES];
VkVertexInputAttributeDescription attribute_descriptions[NV2A_VERTEXSHADER_ATTRIBUTES];
} PipelineKey;
typedef struct PipelineBinding {
LruNode node;
PipelineKey key;
VkPipelineLayout layout;
VkPipeline pipeline;
VkRenderPass render_pass;
unsigned int draw_time;
} PipelineBinding;
enum Buffer {
BUFFER_STAGING_DST,
BUFFER_STAGING_SRC,
BUFFER_COMPUTE_DST,
BUFFER_COMPUTE_SRC,
BUFFER_INDEX,
BUFFER_INDEX_STAGING,
BUFFER_VERTEX_RAM,
BUFFER_VERTEX_INLINE,
BUFFER_VERTEX_INLINE_STAGING,
BUFFER_UNIFORM,
BUFFER_UNIFORM_STAGING,
BUFFER_COUNT
};
typedef struct StorageBuffer {
VkBuffer buffer;
VkBufferUsageFlags usage;
VmaAllocationCreateInfo alloc_info;
VmaAllocation allocation;
VkMemoryPropertyFlags properties;
size_t buffer_offset;
size_t buffer_size;
uint8_t *mapped;
} StorageBuffer;
typedef struct SurfaceBinding {
QTAILQ_ENTRY(SurfaceBinding) entry;
MemAccessCallback *access_cb;
hwaddr vram_addr;
SurfaceShape shape;
uintptr_t dma_addr;
uintptr_t dma_len;
bool color;
bool swizzle;
unsigned int width;
unsigned int height;
unsigned int pitch;
size_t size;
bool cleared;
int frame_time;
int draw_time;
bool draw_dirty;
bool download_pending;
bool upload_pending;
BasicSurfaceFormatInfo fmt;
SurfaceFormatInfo host_fmt;
VkImage image;
VkImageView image_view;
VmaAllocation allocation;
// Used for scaling
VkImage image_scratch;
VkImageLayout image_scratch_current_layout;
VmaAllocation allocation_scratch;
bool initialized;
} SurfaceBinding;
typedef struct ShaderModuleInfo {
char *glsl;
GByteArray *spirv;
VkShaderModule module;
SpvReflectShaderModule reflect_module;
SpvReflectDescriptorSet **descriptor_sets;
ShaderUniformLayout uniforms;
ShaderUniformLayout push_constants;
} ShaderModuleInfo;
typedef struct ShaderBinding {
LruNode node;
ShaderState state;
ShaderModuleInfo *geometry;
ShaderModuleInfo *vertex;
ShaderModuleInfo *fragment;
int psh_constant_loc[9][2];
int alpha_ref_loc;
int bump_mat_loc[NV2A_MAX_TEXTURES];
int bump_scale_loc[NV2A_MAX_TEXTURES];
int bump_offset_loc[NV2A_MAX_TEXTURES];
int tex_scale_loc[NV2A_MAX_TEXTURES];
int surface_size_loc;
int clip_range_loc;
int vsh_constant_loc;
uint32_t vsh_constants[NV2A_VERTEXSHADER_CONSTANTS][4];
int inv_viewport_loc;
int ltctxa_loc;
int ltctxb_loc;
int ltc1_loc;
int fog_color_loc;
int fog_param_loc;
int light_infinite_half_vector_loc[NV2A_MAX_LIGHTS];
int light_infinite_direction_loc[NV2A_MAX_LIGHTS];
int light_local_position_loc[NV2A_MAX_LIGHTS];
int light_local_attenuation_loc[NV2A_MAX_LIGHTS];
int clip_region_loc;
int material_alpha_loc;
} ShaderBinding;
typedef struct TextureKey {
TextureShape state;
hwaddr texture_vram_offset;
hwaddr texture_length;
hwaddr palette_vram_offset;
hwaddr palette_length;
float scale;
} TextureKey;
typedef struct TextureBinding {
LruNode node;
TextureKey key;
VkImage image;
VkImageLayout current_layout;
VkImageView image_view;
VmaAllocation allocation;
VkSampler sampler;
bool possibly_dirty;
uint64_t hash;
unsigned int draw_time;
uint32_t submit_time;
} TextureBinding;
typedef struct QueryReport {
QSIMPLEQ_ENTRY(QueryReport) entry;
bool clear;
uint32_t parameter;
unsigned int query_count;
} QueryReport;
typedef struct PGRAPHVkDisplayState {
ShaderModuleInfo *display_frag;
VkDescriptorPool descriptor_pool;
VkDescriptorSetLayout descriptor_set_layout;
VkDescriptorSet descriptor_set;
VkPipelineLayout pipeline_layout;
VkPipeline pipeline;
VkRenderPass render_pass;
VkFramebuffer framebuffer;
VkImage image;
VkImageView image_view;
VkDeviceMemory memory;
VkSampler sampler;
int width, height;
int draw_time;
// OpenGL Interop
#ifdef WIN32
HANDLE handle;
#else
int fd;
#endif
GLuint gl_memory_obj;
GLuint gl_texture_id;
} PGRAPHVkDisplayState;
typedef struct PGRAPHVkComputeState {
VkDescriptorPool descriptor_pool;
VkDescriptorSetLayout descriptor_set_layout;
VkDescriptorSet descriptor_sets[1];
VkPipelineLayout pipeline_layout;
VkPipeline pipeline_pack_d24s8;
VkPipeline pipeline_unpack_d24s8;
VkPipeline pipeline_pack_f32s8;
VkPipeline pipeline_unpack_f32s8;
} PGRAPHVkComputeState;
typedef struct PGRAPHVkState {
void *window;
VkInstance instance;
bool debug_utils_extension_enabled;
bool custom_border_color_extension_enabled;
bool provoking_vertex_extension_enabled;
bool memory_budget_extension_enabled;
VkPhysicalDevice physical_device;
VkPhysicalDeviceProperties device_props;
VkDevice device;
VmaAllocator allocator;
uint32_t allocator_last_submit_index;
VkQueue queue;
VkCommandPool command_pool;
VkCommandBuffer command_buffers[2];
VkCommandBuffer command_buffer;
VkSemaphore command_buffer_semaphore;
VkFence command_buffer_fence;
unsigned int command_buffer_start_time;
bool in_command_buffer;
uint32_t submit_count;
VkCommandBuffer aux_command_buffer;
bool in_aux_command_buffer;
VkFramebuffer framebuffers[50];
int framebuffer_index;
bool framebuffer_dirty;
VkRenderPass render_pass;
RenderPass *render_passes;
int render_passes_index;
int render_passes_capacity;
bool in_render_pass;
bool in_draw;
Lru pipeline_cache;
VkPipelineCache vk_pipeline_cache;
PipelineBinding *pipeline_cache_entries;
PipelineBinding *pipeline_binding;
bool pipeline_binding_changed;
VkDescriptorPool descriptor_pool;
VkDescriptorSetLayout descriptor_set_layout;
VkDescriptorSet descriptor_sets[1024];
int descriptor_set_index;
StorageBuffer storage_buffers[BUFFER_COUNT];
MemorySyncRequirement vertex_ram_buffer_syncs[NV2A_VERTEXSHADER_ATTRIBUTES];
size_t num_vertex_ram_buffer_syncs;
unsigned long *uploaded_bitmap;
size_t bitmap_size;
VkVertexInputAttributeDescription vertex_attribute_descriptions[NV2A_VERTEXSHADER_ATTRIBUTES];
int vertex_attribute_to_description_location[NV2A_VERTEXSHADER_ATTRIBUTES];
int num_active_vertex_attribute_descriptions;
VkVertexInputBindingDescription vertex_binding_descriptions[NV2A_VERTEXSHADER_ATTRIBUTES];
int num_active_vertex_binding_descriptions;
hwaddr vertex_attribute_offsets[NV2A_VERTEXSHADER_ATTRIBUTES];
QTAILQ_HEAD(, SurfaceBinding) surfaces;
QTAILQ_HEAD(, SurfaceBinding) invalid_surfaces;
SurfaceBinding *color_binding, *zeta_binding;
bool downloads_pending;
QemuEvent downloads_complete;
bool download_dirty_surfaces_pending;
QemuEvent dirty_surfaces_download_complete; // common
Lru texture_cache;
TextureBinding *texture_cache_entries;
TextureBinding *texture_bindings[NV2A_MAX_TEXTURES];
TextureBinding dummy_texture;
bool texture_bindings_changed;
Lru shader_cache;
ShaderBinding *shader_cache_entries;
ShaderBinding *shader_binding;
ShaderModuleInfo *quad_vert_module, *solid_frag_module;
bool shader_bindings_changed;
// FIXME: Merge these into a structure
uint64_t uniform_buffer_hashes[2];
size_t uniform_buffer_offsets[2];
bool uniforms_changed;
VkQueryPool query_pool;
int max_queries_in_flight; // FIXME: Move out to constant
int num_queries_in_flight;
bool new_query_needed;
bool query_in_flight;
uint32_t zpass_pixel_count_result;
QSIMPLEQ_HEAD(, QueryReport) report_queue; // FIXME: Statically allocate
SurfaceFormatInfo kelvin_surface_zeta_vk_map[3];
uint32_t clear_parameter;
PGRAPHVkDisplayState display;
PGRAPHVkComputeState compute;
} PGRAPHVkState;
// renderer.c
void pgraph_vk_check_memory_budget(PGRAPHState *pg);
// debug.c
void pgraph_vk_debug_init(void);
// instance.c
void pgraph_vk_init_instance(PGRAPHState *pg);
void pgraph_vk_finalize_instance(PGRAPHState *pg);
QueueFamilyIndices pgraph_vk_find_queue_families(VkPhysicalDevice device);
uint32_t pgraph_vk_get_memory_type(PGRAPHState *pg, uint32_t type_bits,
VkMemoryPropertyFlags properties);
// glsl.c
void pgraph_vk_init_glsl_compiler(void);
void pgraph_vk_finalize_glsl_compiler(void);
GByteArray *pgraph_vk_compile_glsl_to_spv(glslang_stage_t stage,
const char *glsl_source);
VkShaderModule pgraph_vk_create_shader_module_from_spv(PGRAPHVkState *r,
GByteArray *spv);
ShaderModuleInfo *pgraph_vk_create_shader_module_from_glsl(
PGRAPHVkState *r, VkShaderStageFlagBits stage, const char *glsl);
void pgraph_vk_destroy_shader_module(PGRAPHVkState *r, ShaderModuleInfo *info);
// buffer.c
void pgraph_vk_init_buffers(NV2AState *d);
void pgraph_vk_finalize_buffers(NV2AState *d);
bool pgraph_vk_buffer_has_space_for(PGRAPHState *pg, int index,
VkDeviceSize size,
VkDeviceAddress alignment);
VkDeviceSize pgraph_vk_append_to_buffer(PGRAPHState *pg, int index, void **data,
VkDeviceSize *sizes, size_t count,
VkDeviceAddress alignment);
// command.c
void pgraph_vk_init_command_buffers(PGRAPHState *pg);
void pgraph_vk_finalize_command_buffers(PGRAPHState *pg);
VkCommandBuffer pgraph_vk_begin_single_time_commands(PGRAPHState *pg);
void pgraph_vk_end_single_time_commands(PGRAPHState *pg, VkCommandBuffer cmd);
// image.c
void pgraph_vk_transition_image_layout(PGRAPHState *pg, VkCommandBuffer cmd,
VkImage image, VkFormat format,
VkImageLayout oldLayout,
VkImageLayout newLayout);
// vertex.c
void pgraph_vk_bind_vertex_attributes(NV2AState *d, unsigned int min_element,
unsigned int max_element,
bool inline_data,
unsigned int inline_stride,
unsigned int provoking_element);
void pgraph_vk_bind_vertex_attributes_inline(NV2AState *d);
void pgraph_vk_update_vertex_ram_buffer(PGRAPHState *pg, hwaddr offset, void *data,
VkDeviceSize size);
VkDeviceSize pgraph_vk_update_index_buffer(PGRAPHState *pg, void *data,
VkDeviceSize size);
VkDeviceSize pgraph_vk_update_vertex_inline_buffer(PGRAPHState *pg, void **data,
VkDeviceSize *sizes,
size_t count);
// surface.c
void pgraph_vk_init_surfaces(PGRAPHState *pg);
void pgraph_vk_finalize_surfaces(PGRAPHState *pg);
void pgraph_vk_surface_flush(NV2AState *d);
void pgraph_vk_process_pending_downloads(NV2AState *d);
void pgraph_vk_surface_download_if_dirty(NV2AState *d, SurfaceBinding *surface);
SurfaceBinding *pgraph_vk_surface_get_within(NV2AState *d, hwaddr addr);
void pgraph_vk_wait_for_surface_download(SurfaceBinding *e);
void pgraph_vk_download_dirty_surfaces(NV2AState *d);
void pgraph_vk_upload_surface_data(NV2AState *d, SurfaceBinding *surface,
bool force);
void pgraph_vk_surface_update(NV2AState *d, bool upload, bool color_write,
bool zeta_write);
SurfaceBinding *pgraph_vk_surface_get(NV2AState *d, hwaddr addr);
void pgraph_vk_set_surface_dirty(PGRAPHState *pg, bool color, bool zeta);
void pgraph_vk_set_surface_scale_factor(NV2AState *d, unsigned int scale);
unsigned int pgraph_vk_get_surface_scale_factor(NV2AState *d);
void pgraph_vk_reload_surface_scale_factor(PGRAPHState *pg);
// surface-compute.c
void pgraph_vk_init_compute(PGRAPHState *pg);
void pgraph_vk_finalize_compute(PGRAPHState *pg);
void pgraph_vk_pack_depth_stencil(PGRAPHState *pg, SurfaceBinding *surface,
VkCommandBuffer cmd, VkBuffer src,
VkBuffer dst, bool downscale);
void pgraph_vk_unpack_depth_stencil(PGRAPHState *pg, SurfaceBinding *surface,
VkCommandBuffer cmd, VkBuffer src,
VkBuffer dst);
// display.c
void pgraph_vk_init_display(PGRAPHState *pg);
void pgraph_vk_finalize_display(PGRAPHState *pg);
void pgraph_vk_render_display(PGRAPHState *pg);
// texture.c
void pgraph_vk_init_textures(PGRAPHState *pg);
void pgraph_vk_finalize_textures(PGRAPHState *pg);
void pgraph_vk_bind_textures(NV2AState *d);
void pgraph_vk_mark_textures_possibly_dirty(NV2AState *d, hwaddr addr,
hwaddr size);
void pgraph_vk_trim_texture_cache(PGRAPHState *pg);
// shaders.c
void pgraph_vk_init_shaders(PGRAPHState *pg);
void pgraph_vk_finalize_shaders(PGRAPHState *pg);
void pgraph_vk_update_descriptor_sets(PGRAPHState *pg);
void pgraph_vk_bind_shaders(PGRAPHState *pg);
void pgraph_vk_update_shader_uniforms(PGRAPHState *pg);
// reports.c
void pgraph_vk_init_reports(PGRAPHState *pg);
void pgraph_vk_finalize_reports(PGRAPHState *pg);
void pgraph_vk_clear_report_value(NV2AState *d);
void pgraph_vk_get_report(NV2AState *d, uint32_t parameter);
void pgraph_vk_process_pending_reports(NV2AState *d);
void pgraph_vk_process_pending_reports_internal(NV2AState *d);
typedef enum FinishReason {
VK_FINISH_REASON_VERTEX_BUFFER_DIRTY,
VK_FINISH_REASON_SURFACE_CREATE,
VK_FINISH_REASON_SURFACE_DOWN,
VK_FINISH_REASON_NEED_BUFFER_SPACE,
VK_FINISH_REASON_FRAMEBUFFER_DIRTY,
VK_FINISH_REASON_PRESENTING,
VK_FINISH_REASON_FLIP_STALL,
VK_FINISH_REASON_FLUSH,
} FinishReason;
// draw.c
void pgraph_vk_init_pipelines(PGRAPHState *pg);
void pgraph_vk_finalize_pipelines(PGRAPHState *pg);
void pgraph_vk_clear_surface(NV2AState *d, uint32_t parameter);
void pgraph_vk_draw_begin(NV2AState *d);
void pgraph_vk_draw_end(NV2AState *d);
void pgraph_vk_finish(PGRAPHState *pg, FinishReason why);
void pgraph_vk_flush_draw(NV2AState *d);
void pgraph_vk_begin_command_buffer(PGRAPHState *pg);
void pgraph_vk_ensure_command_buffer(PGRAPHState *pg);
void pgraph_vk_ensure_not_in_render_pass(PGRAPHState *pg);
VkCommandBuffer pgraph_vk_begin_nondraw_commands(PGRAPHState *pg);
void pgraph_vk_end_nondraw_commands(PGRAPHState *pg, VkCommandBuffer cmd);
// blit.c
void pgraph_vk_image_blit(NV2AState *d);
#endif

View File

@ -0,0 +1,134 @@
/*
* Geforce NV2A PGRAPH Vulkan Renderer
*
* Copyright (c) 2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "renderer.h"
void pgraph_vk_init_reports(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
QSIMPLEQ_INIT(&r->report_queue);
r->num_queries_in_flight = 0;
r->max_queries_in_flight = 1024;
r->new_query_needed = true;
r->query_in_flight = false;
r->zpass_pixel_count_result = 0;
VkQueryPoolCreateInfo pool_create_info = (VkQueryPoolCreateInfo){
.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
.queryType = VK_QUERY_TYPE_OCCLUSION,
.queryCount = r->max_queries_in_flight,
};
VK_CHECK(
vkCreateQueryPool(r->device, &pool_create_info, NULL, &r->query_pool));
}
void pgraph_vk_finalize_reports(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
vkDestroyQueryPool(r->device, r->query_pool, NULL);
}
void pgraph_vk_clear_report_value(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHVkState *r = pg->vk_renderer_state;
QueryReport *q = g_malloc(sizeof(QueryReport)); // FIXME: Pre-allocate
q->clear = true;
QSIMPLEQ_INSERT_TAIL(&r->report_queue, q, entry);
}
void pgraph_vk_get_report(NV2AState *d, uint32_t parameter)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHVkState *r = pg->vk_renderer_state;
uint8_t type = GET_MASK(parameter, NV097_GET_REPORT_TYPE);
assert(type == NV097_GET_REPORT_TYPE_ZPASS_PIXEL_CNT);
QueryReport *q = g_malloc(sizeof(QueryReport)); // FIXME: Pre-allocate
q->clear = false;
q->parameter = parameter;
q->query_count = r->num_queries_in_flight;
QSIMPLEQ_INSERT_TAIL(&r->report_queue, q, entry);
r->new_query_needed = true;
}
void pgraph_vk_process_pending_reports_internal(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHVkState *r = pg->vk_renderer_state;
NV2A_VK_DGROUP_BEGIN("Processing queries");
assert(!r->in_command_buffer);
// Fetch all query results
g_autofree uint64_t *query_results = NULL;
if (r->num_queries_in_flight > 0) {
size_t size_of_results = r->num_queries_in_flight * sizeof(uint64_t);
query_results = g_malloc_n(r->num_queries_in_flight,
sizeof(uint64_t)); // FIXME: Pre-allocate
VkResult result;
do {
result = vkGetQueryPoolResults(
r->device, r->query_pool, 0, r->num_queries_in_flight,
size_of_results, query_results, sizeof(uint64_t),
VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
} while (result == VK_NOT_READY);
}
// Write out queries
QueryReport *q, *next;
int num_results_counted = 0;
int result_divisor = pg->surface_scale_factor * pg->surface_scale_factor;
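/* Surfaces are rendered at surface_scale_factor times the guest
 * resolution in each dimension, so zpass counts come back scaled by the
 * factor squared; divide to report native-resolution pixel counts. */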
QSIMPLEQ_FOREACH_SAFE (q, &r->report_queue, entry, next) {
if (q->clear) {
NV2A_VK_DPRINTF("Cleared");
r->zpass_pixel_count_result = 0;
} else {
assert(q->query_count >= num_results_counted);
assert(q->query_count <= r->num_queries_in_flight);
while (num_results_counted < q->query_count) {
r->zpass_pixel_count_result +=
query_results[num_results_counted++];
}
pgraph_write_zpass_pixel_cnt_report(
d, q->parameter,
r->zpass_pixel_count_result / result_divisor);
}
QSIMPLEQ_REMOVE_HEAD(&r->report_queue, entry);
g_free(q);
}
r->num_queries_in_flight = 0;
NV2A_VK_DGROUP_END();
}
void pgraph_vk_process_pending_reports(NV2AState *d)
{
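/* Intentionally empty for this renderer: results are instead drained by
 * pgraph_vk_process_pending_reports_internal() once the command buffer
 * that issued the queries has completed. */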
}

View File

@ -0,0 +1,797 @@
/*
* Geforce NV2A PGRAPH Vulkan Renderer
*
* Copyright (c) 2024 Matt Borgerson
*
* Based on GL implementation:
*
* Copyright (c) 2015 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2018-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "qemu/osdep.h"
#include "hw/xbox/nv2a/pgraph/shaders.h"
#include "hw/xbox/nv2a/pgraph/util.h"
#include "hw/xbox/nv2a/pgraph/glsl/geom.h"
#include "hw/xbox/nv2a/pgraph/glsl/vsh.h"
#include "hw/xbox/nv2a/pgraph/glsl/psh.h"
#include "qemu/fast-hash.h"
#include "qemu/mstring.h"
#include "renderer.h"
#include <locale.h>
static void create_descriptor_pool(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
size_t num_sets = ARRAY_SIZE(r->descriptor_sets);
VkDescriptorPoolSize pool_sizes[] = {
{
.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.descriptorCount = 2 * num_sets,
},
{
.type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
.descriptorCount = NV2A_MAX_TEXTURES * num_sets,
}
};
VkDescriptorPoolCreateInfo pool_info = {
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
.poolSizeCount = ARRAY_SIZE(pool_sizes),
.pPoolSizes = pool_sizes,
.maxSets = ARRAY_SIZE(r->descriptor_sets),
.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
};
VK_CHECK(vkCreateDescriptorPool(r->device, &pool_info, NULL,
&r->descriptor_pool));
}
static void destroy_descriptor_pool(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
vkDestroyDescriptorPool(r->device, r->descriptor_pool, NULL);
r->descriptor_pool = VK_NULL_HANDLE;
}
static void create_descriptor_set_layout(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
VkDescriptorSetLayoutBinding bindings[2 + NV2A_MAX_TEXTURES];
bindings[0] = (VkDescriptorSetLayoutBinding){
.binding = VSH_UBO_BINDING,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.stageFlags = VK_SHADER_STAGE_VERTEX_BIT,
};
bindings[1] = (VkDescriptorSetLayoutBinding){
.binding = PSH_UBO_BINDING,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
};
for (int i = 0; i < NV2A_MAX_TEXTURES; i++) {
bindings[2 + i] = (VkDescriptorSetLayoutBinding){
.binding = PSH_TEX_BINDING + i,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
.stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
};
}
VkDescriptorSetLayoutCreateInfo layout_info = {
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
.bindingCount = ARRAY_SIZE(bindings),
.pBindings = bindings,
};
VK_CHECK(vkCreateDescriptorSetLayout(r->device, &layout_info, NULL,
&r->descriptor_set_layout));
}
static void destroy_descriptor_set_layout(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
vkDestroyDescriptorSetLayout(r->device, r->descriptor_set_layout, NULL);
r->descriptor_set_layout = VK_NULL_HANDLE;
}
static void create_descriptor_sets(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
VkDescriptorSetLayout layouts[ARRAY_SIZE(r->descriptor_sets)];
for (int i = 0; i < ARRAY_SIZE(layouts); i++) {
layouts[i] = r->descriptor_set_layout;
}
VkDescriptorSetAllocateInfo alloc_info = {
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
.descriptorPool = r->descriptor_pool,
.descriptorSetCount = ARRAY_SIZE(r->descriptor_sets),
.pSetLayouts = layouts,
};
VK_CHECK(
vkAllocateDescriptorSets(r->device, &alloc_info, r->descriptor_sets));
}
static void destroy_descriptor_sets(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
vkFreeDescriptorSets(r->device, r->descriptor_pool,
ARRAY_SIZE(r->descriptor_sets), r->descriptor_sets);
for (int i = 0; i < ARRAY_SIZE(r->descriptor_sets); i++) {
r->descriptor_sets[i] = VK_NULL_HANDLE;
}
}
void pgraph_vk_update_descriptor_sets(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
bool need_uniform_write =
r->uniforms_changed ||
!r->storage_buffers[BUFFER_UNIFORM_STAGING].buffer_offset;
if (!(r->shader_bindings_changed || r->texture_bindings_changed ||
(r->descriptor_set_index == 0) || need_uniform_write)) {
return; // Nothing changed
}
ShaderBinding *binding = r->shader_binding;
ShaderUniformLayout *layouts[] = { &binding->vertex->uniforms,
&binding->fragment->uniforms };
VkDeviceSize ubo_buffer_total_size = 0;
for (int i = 0; i < ARRAY_SIZE(layouts); i++) {
ubo_buffer_total_size += layouts[i]->total_size;
}
bool need_ubo_staging_buffer_reset =
r->uniforms_changed &&
!pgraph_vk_buffer_has_space_for(pg, BUFFER_UNIFORM_STAGING,
ubo_buffer_total_size,
r->device_props.limits.minUniformBufferOffsetAlignment);
bool need_descriptor_write_reset =
(r->descriptor_set_index >= ARRAY_SIZE(r->descriptor_sets));
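/* Out of descriptor sets or uniform staging space: finish all in-flight
 * work so both can be recycled, then force a fresh uniform write. */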
if (need_descriptor_write_reset || need_ubo_staging_buffer_reset) {
pgraph_vk_finish(pg, VK_FINISH_REASON_NEED_BUFFER_SPACE);
need_uniform_write = true;
}
VkWriteDescriptorSet descriptor_writes[2 + NV2A_MAX_TEXTURES];
assert(r->descriptor_set_index < ARRAY_SIZE(r->descriptor_sets));
if (need_uniform_write) {
for (int i = 0; i < ARRAY_SIZE(layouts); i++) {
void *data = layouts[i]->allocation;
VkDeviceSize size = layouts[i]->total_size;
r->uniform_buffer_offsets[i] = pgraph_vk_append_to_buffer(
pg, BUFFER_UNIFORM_STAGING, &data, &size, 1,
r->device_props.limits.minUniformBufferOffsetAlignment);
}
r->uniforms_changed = false;
}
VkDescriptorBufferInfo ubo_buffer_infos[2];
for (int i = 0; i < ARRAY_SIZE(layouts); i++) {
ubo_buffer_infos[i] = (VkDescriptorBufferInfo){
.buffer = r->storage_buffers[BUFFER_UNIFORM].buffer,
.offset = r->uniform_buffer_offsets[i],
.range = layouts[i]->total_size,
};
descriptor_writes[i] = (VkWriteDescriptorSet){
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstSet = r->descriptor_sets[r->descriptor_set_index],
.dstBinding = i == 0 ? VSH_UBO_BINDING : PSH_UBO_BINDING,
.dstArrayElement = 0,
.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.descriptorCount = 1,
.pBufferInfo = &ubo_buffer_infos[i],
};
}
VkDescriptorImageInfo image_infos[NV2A_MAX_TEXTURES];
for (int i = 0; i < NV2A_MAX_TEXTURES; i++) {
image_infos[i] = (VkDescriptorImageInfo){
.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
.imageView = r->texture_bindings[i]->image_view,
.sampler = r->texture_bindings[i]->sampler,
};
descriptor_writes[2 + i] = (VkWriteDescriptorSet){
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstSet = r->descriptor_sets[r->descriptor_set_index],
.dstBinding = PSH_TEX_BINDING + i,
.dstArrayElement = 0,
.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
.descriptorCount = 1,
.pImageInfo = &image_infos[i],
};
}
vkUpdateDescriptorSets(r->device, ARRAY_SIZE(descriptor_writes),
descriptor_writes, 0, NULL);
r->descriptor_set_index++;
}
static void update_shader_constant_locations(ShaderBinding *binding)
{
int i, j;
char tmp[64];
/* lookup fragment shader uniforms */
for (i = 0; i < 9; i++) {
for (j = 0; j < 2; j++) {
snprintf(tmp, sizeof(tmp), "c%d_%d", j, i);
binding->psh_constant_loc[i][j] =
uniform_index(&binding->fragment->uniforms, tmp);
}
}
binding->alpha_ref_loc =
uniform_index(&binding->fragment->uniforms, "alphaRef");
binding->fog_color_loc =
uniform_index(&binding->fragment->uniforms, "fogColor");
for (i = 1; i < NV2A_MAX_TEXTURES; i++) {
snprintf(tmp, sizeof(tmp), "bumpMat%d", i);
binding->bump_mat_loc[i] =
uniform_index(&binding->fragment->uniforms, tmp);
snprintf(tmp, sizeof(tmp), "bumpScale%d", i);
binding->bump_scale_loc[i] =
uniform_index(&binding->fragment->uniforms, tmp);
snprintf(tmp, sizeof(tmp), "bumpOffset%d", i);
binding->bump_offset_loc[i] =
uniform_index(&binding->fragment->uniforms, tmp);
}
for (int i = 0; i < NV2A_MAX_TEXTURES; i++) {
snprintf(tmp, sizeof(tmp), "texScale%d", i);
binding->tex_scale_loc[i] =
uniform_index(&binding->fragment->uniforms, tmp);
}
/* lookup vertex shader uniforms */
binding->vsh_constant_loc = uniform_index(&binding->vertex->uniforms, "c");
binding->surface_size_loc =
uniform_index(&binding->vertex->uniforms, "surfaceSize");
binding->clip_range_loc =
uniform_index(&binding->vertex->uniforms, "clipRange");
binding->fog_param_loc =
uniform_index(&binding->vertex->uniforms, "fogParam");
binding->inv_viewport_loc =
uniform_index(&binding->vertex->uniforms, "invViewport");
binding->ltctxa_loc = uniform_index(&binding->vertex->uniforms, "ltctxa");
binding->ltctxb_loc = uniform_index(&binding->vertex->uniforms, "ltctxb");
binding->ltc1_loc = uniform_index(&binding->vertex->uniforms, "ltc1");
for (i = 0; i < NV2A_MAX_LIGHTS; i++) {
snprintf(tmp, sizeof(tmp), "lightInfiniteHalfVector%d", i);
binding->light_infinite_half_vector_loc[i] =
uniform_index(&binding->vertex->uniforms, tmp);
snprintf(tmp, sizeof(tmp), "lightInfiniteDirection%d", i);
binding->light_infinite_direction_loc[i] =
uniform_index(&binding->vertex->uniforms, tmp);
snprintf(tmp, sizeof(tmp), "lightLocalPosition%d", i);
binding->light_local_position_loc[i] =
uniform_index(&binding->vertex->uniforms, tmp);
snprintf(tmp, sizeof(tmp), "lightLocalAttenuation%d", i);
binding->light_local_attenuation_loc[i] =
uniform_index(&binding->vertex->uniforms, tmp);
}
binding->clip_region_loc =
uniform_index(&binding->fragment->uniforms, "clipRegion");
binding->material_alpha_loc =
uniform_index(&binding->vertex->uniforms, "material_alpha");
}
static void shader_cache_entry_init(Lru *lru, LruNode *node, void *state)
{
ShaderBinding *snode = container_of(node, ShaderBinding, node);
memcpy(&snode->state, state, sizeof(ShaderState));
}
static void shader_cache_entry_post_evict(Lru *lru, LruNode *node)
{
PGRAPHVkState *r = container_of(lru, PGRAPHVkState, shader_cache);
ShaderBinding *snode = container_of(node, ShaderBinding, node);
ShaderModuleInfo *modules[] = {
snode->geometry,
snode->vertex,
snode->fragment,
};
for (int i = 0; i < ARRAY_SIZE(modules); i++) {
if (modules[i]) {
pgraph_vk_destroy_shader_module(r, modules[i]);
}
}
memset(&snode->state, 0, sizeof(ShaderState));
}
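/* LRU comparison callback: memcmp semantics, i.e. returns nonzero when the
 * cached state differs from the lookup key. */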
static bool shader_cache_entry_compare(Lru *lru, LruNode *node, void *key)
{
ShaderBinding *snode = container_of(node, ShaderBinding, node);
return memcmp(&snode->state, key, sizeof(ShaderState));
}
static void shader_cache_init(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
const size_t shader_cache_size = 1024;
lru_init(&r->shader_cache);
r->shader_cache_entries = g_malloc_n(shader_cache_size, sizeof(ShaderBinding));
assert(r->shader_cache_entries != NULL);
for (int i = 0; i < shader_cache_size; i++) {
lru_add_free(&r->shader_cache, &r->shader_cache_entries[i].node);
}
r->shader_cache.init_node = shader_cache_entry_init;
r->shader_cache.compare_nodes = shader_cache_entry_compare;
r->shader_cache.post_node_evict = shader_cache_entry_post_evict;
}
static void shader_cache_finalize(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
lru_flush(&r->shader_cache);
g_free(r->shader_cache_entries);
r->shader_cache_entries = NULL;
}
static ShaderBinding *gen_shaders(PGRAPHState *pg, ShaderState *state)
{
PGRAPHVkState *r = pg->vk_renderer_state;
uint64_t hash = fast_hash((void *)state, sizeof(*state));
LruNode *node = lru_lookup(&r->shader_cache, hash, state);
ShaderBinding *snode = container_of(node, ShaderBinding, node);
NV2A_VK_DPRINTF("shader state hash: %016lx, %p", hash, snode);
if (!snode->fragment) {
NV2A_VK_DPRINTF("cache miss");
nv2a_profile_inc_counter(NV2A_PROF_SHADER_GEN);
char *previous_numeric_locale = setlocale(LC_NUMERIC, NULL);
if (previous_numeric_locale) {
previous_numeric_locale = g_strdup(previous_numeric_locale);
}
/* Ensure numeric values are printed with '.' radix, no grouping */
setlocale(LC_NUMERIC, "C");
MString *geometry_shader_code = pgraph_gen_geom_glsl(
state->polygon_front_mode, state->polygon_back_mode,
state->primitive_mode, state->smooth_shading, true);
if (geometry_shader_code) {
NV2A_VK_DPRINTF("geometry shader: \n%s",
mstring_get_str(geometry_shader_code));
snode->geometry = pgraph_vk_create_shader_module_from_glsl(
r, VK_SHADER_STAGE_GEOMETRY_BIT,
mstring_get_str(geometry_shader_code));
mstring_unref(geometry_shader_code);
} else {
snode->geometry = NULL;
}
MString *vertex_shader_code =
pgraph_gen_vsh_glsl(state, geometry_shader_code != NULL);
NV2A_VK_DPRINTF("vertex shader: \n%s",
mstring_get_str(vertex_shader_code));
snode->vertex = pgraph_vk_create_shader_module_from_glsl(
r, VK_SHADER_STAGE_VERTEX_BIT,
mstring_get_str(vertex_shader_code));
mstring_unref(vertex_shader_code);
MString *fragment_shader_code = pgraph_gen_psh_glsl(state->psh);
NV2A_VK_DPRINTF("fragment shader: \n%s",
mstring_get_str(fragment_shader_code));
snode->fragment = pgraph_vk_create_shader_module_from_glsl(
r, VK_SHADER_STAGE_FRAGMENT_BIT,
mstring_get_str(fragment_shader_code));
mstring_unref(fragment_shader_code);
if (previous_numeric_locale) {
setlocale(LC_NUMERIC, previous_numeric_locale);
g_free(previous_numeric_locale);
}
update_shader_constant_locations(snode);
}
return snode;
}
// FIXME: Move to common
static void shader_update_constants(PGRAPHState *pg, ShaderBinding *binding,
bool binding_changed, bool vertex_program,
bool fixed_function)
{
int i, j;
/* update combiner constants */
for (i = 0; i < 9; i++) {
uint32_t constant[2];
if (i == 8) {
/* final combiner */
constant[0] = pgraph_reg_r(pg, NV_PGRAPH_SPECFOGFACTOR0);
constant[1] = pgraph_reg_r(pg, NV_PGRAPH_SPECFOGFACTOR1);
} else {
constant[0] = pgraph_reg_r(pg, NV_PGRAPH_COMBINEFACTOR0 + i * 4);
constant[1] = pgraph_reg_r(pg, NV_PGRAPH_COMBINEFACTOR1 + i * 4);
}
for (j = 0; j < 2; j++) {
GLint loc = binding->psh_constant_loc[i][j];
if (loc != -1) {
float value[4];
pgraph_argb_pack32_to_rgba_float(constant[j], value);
uniform1fv(&binding->fragment->uniforms, loc, 4, value);
}
}
}
if (binding->alpha_ref_loc != -1) {
float alpha_ref = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0),
NV_PGRAPH_CONTROL_0_ALPHAREF) /
255.0;
uniform1f(&binding->fragment->uniforms, binding->alpha_ref_loc,
alpha_ref);
}
/* For each texture stage */
for (i = 0; i < NV2A_MAX_TEXTURES; i++) {
int loc;
/* Bump luminance only during stages 1 - 3 */
if (i > 0) {
loc = binding->bump_mat_loc[i];
if (loc != -1) {
uint32_t m_u32[4];
m_u32[0] = pgraph_reg_r(pg, NV_PGRAPH_BUMPMAT00 + 4 * (i - 1));
m_u32[1] = pgraph_reg_r(pg, NV_PGRAPH_BUMPMAT01 + 4 * (i - 1));
m_u32[2] = pgraph_reg_r(pg, NV_PGRAPH_BUMPMAT10 + 4 * (i - 1));
m_u32[3] = pgraph_reg_r(pg, NV_PGRAPH_BUMPMAT11 + 4 * (i - 1));
float m[4];
m[0] = *(float*)&m_u32[0];
m[1] = *(float*)&m_u32[1];
m[2] = *(float*)&m_u32[2];
m[3] = *(float*)&m_u32[3];
uniformMatrix2fv(&binding->fragment->uniforms, loc, m);
}
loc = binding->bump_scale_loc[i];
if (loc != -1) {
uint32_t v =
pgraph_reg_r(pg, NV_PGRAPH_BUMPSCALE1 + (i - 1) * 4);
uniform1f(&binding->fragment->uniforms, loc,
*(float *)&v);
}
loc = binding->bump_offset_loc[i];
if (loc != -1) {
uint32_t v =
pgraph_reg_r(pg, NV_PGRAPH_BUMPOFFSET1 + (i - 1) * 4);
uniform1f(&binding->fragment->uniforms, loc,
*(float *)&v);
}
}
loc = binding->tex_scale_loc[i];
if (loc != -1) {
TextureBinding *tex = pg->vk_renderer_state->texture_bindings[i];
assert(tex != NULL);
float scale = tex->key.scale;
BasicColorFormatInfo f_basic =
kelvin_color_format_info_map[tex->key.state.color_format];
if (!f_basic.linear) {
scale = 1.0;
}
uniform1f(&binding->fragment->uniforms, loc, scale);
}
}
if (binding->fog_color_loc != -1) {
uint32_t fog_color = pgraph_reg_r(pg, NV_PGRAPH_FOGCOLOR);
uniform4f(&binding->fragment->uniforms, binding->fog_color_loc,
GET_MASK(fog_color, NV_PGRAPH_FOGCOLOR_RED) / 255.0,
GET_MASK(fog_color, NV_PGRAPH_FOGCOLOR_GREEN) / 255.0,
GET_MASK(fog_color, NV_PGRAPH_FOGCOLOR_BLUE) / 255.0,
GET_MASK(fog_color, NV_PGRAPH_FOGCOLOR_ALPHA) / 255.0);
}
if (binding->fog_param_loc != -1) {
uint32_t v[2];
v[0] = pgraph_reg_r(pg, NV_PGRAPH_FOGPARAM0);
v[1] = pgraph_reg_r(pg, NV_PGRAPH_FOGPARAM1);
uniform2f(&binding->vertex->uniforms,
binding->fog_param_loc, *(float *)&v[0],
*(float *)&v[1]);
}
float zmax;
switch (pg->surface_shape.zeta_format) {
case NV097_SET_SURFACE_FORMAT_ZETA_Z16:
zmax = pg->surface_shape.z_format ? f16_max : (float)0xFFFF;
break;
case NV097_SET_SURFACE_FORMAT_ZETA_Z24S8:
zmax = pg->surface_shape.z_format ? f24_max : (float)0xFFFFFF;
break;
default:
assert(0);
}
if (fixed_function) {
/* update lighting constants */
struct {
uint32_t *v;
int locs;
size_t len;
} lighting_arrays[] = {
{ &pg->ltctxa[0][0], binding->ltctxa_loc, NV2A_LTCTXA_COUNT },
{ &pg->ltctxb[0][0], binding->ltctxb_loc, NV2A_LTCTXB_COUNT },
{ &pg->ltc1[0][0], binding->ltc1_loc, NV2A_LTC1_COUNT },
};
for (i = 0; i < ARRAY_SIZE(lighting_arrays); i++) {
uniform1iv(
&binding->vertex->uniforms, lighting_arrays[i].locs,
lighting_arrays[i].len * 4, (void *)lighting_arrays[i].v);
}
for (i = 0; i < NV2A_MAX_LIGHTS; i++) {
int loc = binding->light_infinite_half_vector_loc[i];
if (loc != -1) {
uniform1fv(&binding->vertex->uniforms, loc, 3,
pg->light_infinite_half_vector[i]);
}
loc = binding->light_infinite_direction_loc[i];
if (loc != -1) {
uniform1fv(&binding->vertex->uniforms, loc, 3,
pg->light_infinite_direction[i]);
}
loc = binding->light_local_position_loc[i];
if (loc != -1) {
uniform1fv(&binding->vertex->uniforms, loc, 3,
pg->light_local_position[i]);
}
loc = binding->light_local_attenuation_loc[i];
if (loc != -1) {
uniform1fv(&binding->vertex->uniforms, loc, 3,
pg->light_local_attenuation[i]);
}
}
/* estimate the viewport by assuming it matches the surface ... */
unsigned int aa_width = 1, aa_height = 1;
pgraph_apply_anti_aliasing_factor(pg, &aa_width, &aa_height);
float m11 = 0.5 * (pg->surface_binding_dim.width / aa_width);
float m22 = -0.5 * (pg->surface_binding_dim.height / aa_height);
float m33 = zmax;
float m41 = *(float *)&pg->vsh_constants[NV_IGRAPH_XF_XFCTX_VPOFF][0];
float m42 = *(float *)&pg->vsh_constants[NV_IGRAPH_XF_XFCTX_VPOFF][1];
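/* Inverse of the estimated viewport transform: undoes the x/y scale
 * (m11, m22), depth scale (m33) and translation (m41, m42); stored
 * column-major with the translation in the fourth column. */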
float invViewport[16] = {
1.0 / m11, 0, 0, 0,
0, 1.0 / m22, 0, 0,
0, 0, 1.0 / m33, 0,
-1.0 + m41 / m11, 1.0 + m42 / m22, 0, 1.0,
};
if (binding->inv_viewport_loc != -1) {
uniformMatrix4fv(&binding->vertex->uniforms,
binding->inv_viewport_loc, &invViewport[0]);
}
}
/* update vertex program constants */
uniform1iv(&binding->vertex->uniforms, binding->vsh_constant_loc,
NV2A_VERTEXSHADER_CONSTANTS * 4, (void *)pg->vsh_constants);
if (binding->surface_size_loc != -1) {
unsigned int aa_width = 1, aa_height = 1;
pgraph_apply_anti_aliasing_factor(pg, &aa_width, &aa_height);
uniform2f(&binding->vertex->uniforms, binding->surface_size_loc,
pg->surface_binding_dim.width / aa_width,
pg->surface_binding_dim.height / aa_height);
}
if (binding->clip_range_loc != -1) {
uint32_t v[2];
v[0] = pgraph_reg_r(pg, NV_PGRAPH_ZCLIPMIN);
v[1] = pgraph_reg_r(pg, NV_PGRAPH_ZCLIPMAX);
float zclip_min = *(float *)&v[0] / zmax * 2.0 - 1.0;
float zclip_max = *(float *)&v[1] / zmax * 2.0 - 1.0;
uniform4f(&binding->vertex->uniforms, binding->clip_range_loc, 0,
zmax, zclip_min, zclip_max);
}
/* Clipping regions */
unsigned int max_gl_width = pg->surface_binding_dim.width;
unsigned int max_gl_height = pg->surface_binding_dim.height;
pgraph_apply_scaling_factor(pg, &max_gl_width, &max_gl_height);
uint32_t clip_regions[8][4];
for (i = 0; i < 8; i++) {
uint32_t x = pgraph_reg_r(pg, NV_PGRAPH_WINDOWCLIPX0 + i * 4);
unsigned int x_min = GET_MASK(x, NV_PGRAPH_WINDOWCLIPX0_XMIN);
unsigned int x_max = GET_MASK(x, NV_PGRAPH_WINDOWCLIPX0_XMAX) + 1;
uint32_t y = pgraph_reg_r(pg, NV_PGRAPH_WINDOWCLIPY0 + i * 4);
unsigned int y_min = GET_MASK(y, NV_PGRAPH_WINDOWCLIPY0_YMIN);
unsigned int y_max = GET_MASK(y, NV_PGRAPH_WINDOWCLIPY0_YMAX) + 1;
pgraph_apply_anti_aliasing_factor(pg, &x_min, &y_min);
pgraph_apply_anti_aliasing_factor(pg, &x_max, &y_max);
pgraph_apply_scaling_factor(pg, &x_min, &y_min);
pgraph_apply_scaling_factor(pg, &x_max, &y_max);
clip_regions[i][0] = x_min;
clip_regions[i][1] = y_min;
clip_regions[i][2] = x_max;
clip_regions[i][3] = y_max;
}
uniform1iv(&binding->fragment->uniforms, binding->clip_region_loc,
8 * 4, (void *)clip_regions);
if (binding->material_alpha_loc != -1) {
uniform1f(&binding->vertex->uniforms, binding->material_alpha_loc,
pg->material_alpha);
}
}
// Quickly check PGRAPH state to see if any registers have changed that
// necessitate a full shader state inspection.
static bool check_shaders_dirty(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
if (!r->shader_binding) {
return true;
}
if (pg->program_data_dirty) {
return true;
}
int num_stages = pgraph_reg_r(pg, NV_PGRAPH_COMBINECTL) & 0xFF;
for (int i = 0; i < num_stages; i++) {
if (pgraph_is_reg_dirty(pg, NV_PGRAPH_COMBINEALPHAI0 + i * 4) ||
pgraph_is_reg_dirty(pg, NV_PGRAPH_COMBINEALPHAO0 + i * 4) ||
pgraph_is_reg_dirty(pg, NV_PGRAPH_COMBINECOLORI0 + i * 4) ||
pgraph_is_reg_dirty(pg, NV_PGRAPH_COMBINECOLORO0 + i * 4)) {
return true;
}
}
unsigned int regs[] = {
NV_PGRAPH_COMBINECTL,
NV_PGRAPH_COMBINESPECFOG0,
NV_PGRAPH_COMBINESPECFOG1,
NV_PGRAPH_CSV0_C,
NV_PGRAPH_CSV0_D,
NV_PGRAPH_CSV1_A,
NV_PGRAPH_CSV1_B,
NV_PGRAPH_POINTSIZE,
NV_PGRAPH_SHADERCLIPMODE,
NV_PGRAPH_SHADERCTL,
NV_PGRAPH_SHADERPROG,
NV_PGRAPH_SHADOWCTL,
};
for (int i = 0; i < ARRAY_SIZE(regs); i++) {
if (pgraph_is_reg_dirty(pg, regs[i])) {
return true;
}
}
ShaderState *state = &r->shader_binding->state;
if (pg->uniform_attrs != state->uniform_attrs ||
pg->swizzle_attrs != state->swizzle_attrs ||
pg->compressed_attrs != state->compressed_attrs ||
pg->primitive_mode != state->primitive_mode ||
pg->surface_scale_factor != state->surface_scale_factor) {
return true;
}
// Textures
for (int i = 0; i < 4; i++) {
if (pg->texture_matrix_enable[i] != state->texture_matrix_enable[i] ||
pgraph_is_reg_dirty(pg, NV_PGRAPH_TEXCTL0_0 + i * 4) ||
pgraph_is_reg_dirty(pg, NV_PGRAPH_TEXFILTER0 + i * 4) ||
pgraph_is_reg_dirty(pg, NV_PGRAPH_TEXFMT0 + i * 4)) {
return true;
}
}
nv2a_profile_inc_counter(NV2A_PROF_SHADER_BIND_NOTDIRTY);
return false;
}
void pgraph_vk_bind_shaders(PGRAPHState *pg)
{
NV2A_VK_DGROUP_BEGIN("%s", __func__);
PGRAPHVkState *r = pg->vk_renderer_state;
r->shader_bindings_changed = false;
if (check_shaders_dirty(pg)) {
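/* Zero the whole struct first so padding bytes are deterministic for the
 * memcmp below. */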
ShaderState new_state;
memset(&new_state, 0, sizeof(ShaderState));
new_state = pgraph_get_shader_state(pg);
if (!r->shader_binding || memcmp(&r->shader_binding->state, &new_state, sizeof(ShaderState))) {
r->shader_binding = gen_shaders(pg, &new_state);
r->shader_bindings_changed = true;
}
}
// FIXME: Use dirty bits
pgraph_vk_update_shader_uniforms(pg);
NV2A_VK_DGROUP_END();
}
void pgraph_vk_update_shader_uniforms(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
NV2A_VK_DGROUP_BEGIN("%s", __func__);
nv2a_profile_inc_counter(NV2A_PROF_SHADER_BIND);
assert(r->shader_binding);
ShaderBinding *binding = r->shader_binding;
ShaderUniformLayout *layouts[] = { &binding->vertex->uniforms,
&binding->fragment->uniforms };
shader_update_constants(pg, r->shader_binding, true,
r->shader_binding->state.vertex_program,
r->shader_binding->state.fixed_function);
for (int i = 0; i < ARRAY_SIZE(layouts); i++) {
uint64_t hash = fast_hash(layouts[i]->allocation, layouts[i]->total_size);
r->uniforms_changed |= (hash != r->uniform_buffer_hashes[i]);
r->uniform_buffer_hashes[i] = hash;
}
nv2a_profile_inc_counter(r->uniforms_changed ?
NV2A_PROF_SHADER_UBO_DIRTY :
NV2A_PROF_SHADER_UBO_NOTDIRTY);
NV2A_VK_DGROUP_END();
}
void pgraph_vk_init_shaders(PGRAPHState *pg)
{
pgraph_vk_init_glsl_compiler();
create_descriptor_pool(pg);
create_descriptor_set_layout(pg);
create_descriptor_sets(pg);
shader_cache_init(pg);
}
void pgraph_vk_finalize_shaders(PGRAPHState *pg)
{
shader_cache_finalize(pg);
destroy_descriptor_sets(pg);
destroy_descriptor_set_layout(pg);
destroy_descriptor_pool(pg);
pgraph_vk_finalize_glsl_compiler();
}

View File

@ -0,0 +1,473 @@
/*
* Geforce NV2A PGRAPH Vulkan Renderer
*
* Copyright (c) 2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "hw/xbox/nv2a/pgraph/pgraph.h"
#include "renderer.h"
#include <vulkan/vulkan_core.h>
// TODO: Swizzle/Unswizzle
// TODO: Float depth format (low priority, but would be better for accuracy)
// FIXME: The pipeline creation below assumes an identical three-buffer
// setup. The swizzle shader will need more flexibility.
const char *pack_d24_unorm_s8_uint_to_z24s8_glsl =
"#version 450\n"
"layout(local_size_x = 256) in;\n"
"layout(push_constant) uniform PushConstants { uint width_in, width_out; };\n"
"layout(binding = 0) buffer DepthIn { uint depth_in[]; };\n"
"layout(binding = 1) buffer StencilIn { uint stencil_in[]; };\n"
"layout(binding = 2) buffer DepthStencilOut { uint depth_stencil_out[]; };\n"
"uint get_input_idx(uint idx_out) {\n"
" uint scale = width_in / width_out;"
" uint y = (idx_out / width_out) * scale;\n"
" uint x = (idx_out % width_out) * scale;\n"
" return y * width_in + x;\n"
"}\n"
"void main() {\n"
" uint idx_out = gl_GlobalInvocationID.x;\n"
" uint idx_in = get_input_idx(idx_out);\n"
" uint depth_value = depth_in[idx_in];\n"
" uint stencil_value = (stencil_in[idx_in / 4] >> ((idx_in % 4) * 8)) & 0xff;\n"
" depth_stencil_out[idx_out] = depth_value << 8 | stencil_value;\n"
"}\n";
const char *unpack_z24s8_to_d24_unorm_s8_uint_glsl =
"#version 450\n"
"layout(local_size_x = 256) in;\n"
"layout(push_constant) uniform PushConstants { uint width_in, width_out; };\n"
"layout(binding = 0) buffer DepthOut { uint depth_out[]; };\n"
"layout(binding = 1) buffer StencilOut { uint stencil_out[]; };\n"
"layout(binding = 2) buffer DepthStencilIn { uint depth_stencil_in[]; };\n"
"uint get_input_idx(uint idx_out) {\n"
" uint scale = width_out / width_in;"
" uint y = (idx_out / width_out) / scale;\n"
" uint x = (idx_out % width_out) / scale;\n"
" return y * width_in + x;\n"
"}\n"
"void main() {\n"
" uint idx_out = gl_GlobalInvocationID.x;\n"
" uint idx_in = get_input_idx(idx_out);\n"
" depth_out[idx_out] = depth_stencil_in[idx_in] >> 8;\n"
" if (idx_out % 4 == 0) {\n"
" uint stencil_value = 0;\n"
" for (int i = 0; i < 4; i++) {\n" // Include next 3 pixels
" uint v = depth_stencil_in[get_input_idx(idx_out + i)] & 0xff;\n"
" stencil_value |= v << (i * 8);\n"
" }\n"
" stencil_out[idx_out / 4] = stencil_value;\n"
" }\n"
"}\n";
const char *pack_d32_sfloat_s8_uint_to_z24s8_glsl =
"#version 450\n"
"layout(local_size_x = 256) in;\n"
"layout(push_constant) uniform PushConstants { uint width_in, width_out; };\n"
"layout(binding = 0) buffer DepthIn { float depth_in[]; };\n"
"layout(binding = 1) buffer StencilIn { uint stencil_in[]; };\n"
"layout(binding = 2) buffer DepthStencilOut { uint depth_stencil_out[]; };\n"
"uint get_input_idx(uint idx_out) {\n"
" uint y = idx_out / width_out;\n"
" uint x = idx_out % width_out;\n"
" return (y * width_in + x) * (width_in / width_out);\n"
"}\n"
"void main() {\n"
" uint idx_out = gl_GlobalInvocationID.x;\n"
" uint idx_in = get_input_idx(idx_out);\n"
" uint depth_value = int(depth_in[idx_in] * float(0xffffff));\n"
" uint stencil_value = (stencil_in[idx_in / 4] >> ((idx_in % 4) * 8)) & 0xff;\n"
" depth_stencil_out[idx_out] = depth_value << 8 | stencil_value;\n"
"}\n";
const char *unpack_z24s8_to_d32_sfloat_s8_uint_glsl =
"#version 450\n"
"layout(local_size_x = 256) in;\n"
"layout(push_constant) uniform PushConstants { uint width_in, width_out; };\n"
"layout(binding = 0) buffer DepthOut { float depth_out[]; };\n"
"layout(binding = 1) buffer StencilOut { uint stencil_out[]; };\n"
"layout(binding = 2) buffer DepthStencilIn { uint depth_stencil_in[]; };\n"
"uint get_input_idx(uint idx_out) {\n"
" uint scale = width_out / width_in;"
" uint y = (idx_out / width_out) / scale;\n"
" uint x = (idx_out % width_out) / scale;\n"
" return y * width_in + x;\n"
"}\n"
"void main() {\n"
" uint idx_out = gl_GlobalInvocationID.x;\n"
" uint idx_in = get_input_idx(idx_out);\n"
" depth_out[idx_out] = float(depth_stencil_in[idx_in] >> 8) / float(0xffffff);\n"
" if (idx_out % 4 == 0) {\n"
" uint stencil_value = 0;\n"
" for (int i = 0; i < 4; i++) {\n" // Include next 3 pixels
" uint v = depth_stencil_in[get_input_idx(idx_out + i)] & 0xff;\n"
" stencil_value |= v << (i * 8);\n"
" }\n"
" stencil_out[idx_out / 4] = stencil_value;\n"
" }\n"
"}\n";
static void create_descriptor_pool(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
VkDescriptorPoolSize pool_sizes[] = {
{
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 3,
},
};
VkDescriptorPoolCreateInfo pool_info = {
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
.poolSizeCount = ARRAY_SIZE(pool_sizes),
.pPoolSizes = pool_sizes,
.maxSets = ARRAY_SIZE(r->compute.descriptor_sets),
.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
};
VK_CHECK(vkCreateDescriptorPool(r->device, &pool_info, NULL,
&r->compute.descriptor_pool));
}
static void destroy_descriptor_pool(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
vkDestroyDescriptorPool(r->device, r->compute.descriptor_pool, NULL);
r->compute.descriptor_pool = VK_NULL_HANDLE;
}
static void create_descriptor_set_layout(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
const int num_buffers = 3;
VkDescriptorSetLayoutBinding bindings[num_buffers];
for (int i = 0; i < num_buffers; i++) {
bindings[i] = (VkDescriptorSetLayoutBinding){
.binding = i,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
};
}
VkDescriptorSetLayoutCreateInfo layout_info = {
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
.bindingCount = ARRAY_SIZE(bindings),
.pBindings = bindings,
};
VK_CHECK(vkCreateDescriptorSetLayout(r->device, &layout_info, NULL,
&r->compute.descriptor_set_layout));
}
static void destroy_descriptor_set_layout(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
vkDestroyDescriptorSetLayout(r->device, r->compute.descriptor_set_layout,
NULL);
r->compute.descriptor_set_layout = VK_NULL_HANDLE;
}
static void create_descriptor_sets(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
VkDescriptorSetLayout layouts[ARRAY_SIZE(r->compute.descriptor_sets)];
for (int i = 0; i < ARRAY_SIZE(layouts); i++) {
layouts[i] = r->compute.descriptor_set_layout;
}
VkDescriptorSetAllocateInfo alloc_info = {
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
.descriptorPool = r->compute.descriptor_pool,
.descriptorSetCount = ARRAY_SIZE(r->compute.descriptor_sets),
.pSetLayouts = layouts,
};
VK_CHECK(vkAllocateDescriptorSets(r->device, &alloc_info,
r->compute.descriptor_sets));
}
static void destroy_descriptor_sets(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
vkFreeDescriptorSets(r->device, r->compute.descriptor_pool,
ARRAY_SIZE(r->compute.descriptor_sets),
r->compute.descriptor_sets);
for (int i = 0; i < ARRAY_SIZE(r->compute.descriptor_sets); i++) {
r->compute.descriptor_sets[i] = VK_NULL_HANDLE;
}
}
static void create_compute_pipeline_layout(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
VkPushConstantRange push_constant_range = {
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.size = 2 * sizeof(uint32_t),
};
VkPipelineLayoutCreateInfo pipeline_layout_info = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
.setLayoutCount = 1,
.pSetLayouts = &r->compute.descriptor_set_layout,
.pushConstantRangeCount = 1,
.pPushConstantRanges = &push_constant_range,
};
VK_CHECK(vkCreatePipelineLayout(r->device, &pipeline_layout_info, NULL,
&r->compute.pipeline_layout));
}
static VkPipeline create_compute_pipeline(PGRAPHState *pg, const char *glsl)
{
PGRAPHVkState *r = pg->vk_renderer_state;
ShaderModuleInfo *module = pgraph_vk_create_shader_module_from_glsl(
r, VK_SHADER_STAGE_COMPUTE_BIT, glsl);
VkComputePipelineCreateInfo pipeline_info = {
.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
.layout = r->compute.pipeline_layout,
.stage =
(VkPipelineShaderStageCreateInfo){
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
.stage = VK_SHADER_STAGE_COMPUTE_BIT,
.pName = "main",
.module = module->module,
},
};
VkPipeline pipeline;
VK_CHECK(vkCreateComputePipelines(r->device, r->vk_pipeline_cache, 1,
&pipeline_info, NULL,
&pipeline));
pgraph_vk_destroy_shader_module(r, module);
return pipeline;
}
static void update_descriptor_sets(PGRAPHState *pg,
VkDescriptorBufferInfo *buffers, int count)
{
PGRAPHVkState *r = pg->vk_renderer_state;
assert(count == 3);
VkWriteDescriptorSet descriptor_writes[3];
const int descriptor_set_index = 0;
for (int i = 0; i < count; i++) {
descriptor_writes[i] = (VkWriteDescriptorSet){
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstSet = r->compute.descriptor_sets[descriptor_set_index],
.dstBinding = i,
.dstArrayElement = 0,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 1,
.pBufferInfo = &buffers[i],
};
}
vkUpdateDescriptorSets(r->device, count, descriptor_writes, 0, NULL);
}
//
// Pack depth+stencil into NV097_SET_SURFACE_FORMAT_ZETA_Z24S8
// formatted buffer with depth in bits 31-8 and stencil in bits 7-0.
//
void pgraph_vk_pack_depth_stencil(PGRAPHState *pg, SurfaceBinding *surface,
VkCommandBuffer cmd, VkBuffer src,
VkBuffer dst, bool downscale)
{
PGRAPHVkState *r = pg->vk_renderer_state;
unsigned int input_width = surface->width, input_height = surface->height;
pgraph_apply_scaling_factor(pg, &input_width, &input_height);
unsigned int output_width = surface->width, output_height = surface->height;
if (!downscale) {
pgraph_apply_scaling_factor(pg, &output_width, &output_height);
}
size_t depth_bytes_per_pixel = 4;
size_t depth_size = input_width * input_height * depth_bytes_per_pixel;
size_t stencil_bytes_per_pixel = 1;
size_t stencil_size = input_width * input_height * stencil_bytes_per_pixel;
size_t output_bytes_per_pixel = 4;
size_t output_size = output_width * output_height * output_bytes_per_pixel;
VkDescriptorBufferInfo buffers[] = {
{
.buffer = src,
.offset = 0,
.range = depth_size,
},
{
.buffer = src,
.offset = depth_size,
.range = stencil_size,
},
{
.buffer = dst,
.offset = 0,
.range = output_size,
},
};
update_descriptor_sets(pg, buffers, ARRAY_SIZE(buffers));
if (surface->host_fmt.vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE,
r->compute.pipeline_pack_d24s8);
} else if (surface->host_fmt.vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE,
r->compute.pipeline_pack_f32s8);
} else {
assert(!"Unsupported pack format");
}
vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE,
r->compute.pipeline_layout, 0, 1,
&r->compute.descriptor_sets[0], 0, NULL);
uint32_t push_constants[2] = { input_width, output_width };
assert(sizeof(push_constants) == 8);
vkCmdPushConstants(cmd, r->compute.pipeline_layout,
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants),
push_constants);
size_t workgroup_size_in_units = 256;
size_t output_size_in_units = output_width * output_height;
assert(output_size_in_units % workgroup_size_in_units == 0);
size_t group_count = output_size_in_units / workgroup_size_in_units;
// FIXME: Check max group count
vkCmdDispatch(cmd, group_count, 1, 1);
}
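The dispatch geometry follows directly from the 256-lane workgroup declared in the shaders: one invocation per output pixel, so the group count is the pixel count divided by 256. A worked example with a hypothetical 640x480 surface at 2x surface scale:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

int main(void)
{
    size_t output_width = 640 * 2, output_height = 480 * 2;
    size_t workgroup_size = 256; /* local_size_x in the compute shaders */
    size_t invocations = output_width * output_height;
    assert(invocations % workgroup_size == 0); /* same assumption as above */
    /* 1228800 / 256 = 4800 workgroups */
    printf("group_count = %zu\n", invocations / workgroup_size);
    return 0;
}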
void pgraph_vk_unpack_depth_stencil(PGRAPHState *pg, SurfaceBinding *surface,
VkCommandBuffer cmd, VkBuffer src,
VkBuffer dst)
{
PGRAPHVkState *r = pg->vk_renderer_state;
unsigned int input_width = surface->width, input_height = surface->height;
unsigned int output_width = surface->width, output_height = surface->height;
pgraph_apply_scaling_factor(pg, &output_width, &output_height);
size_t depth_bytes_per_pixel = 4;
size_t depth_size = output_width * output_height * depth_bytes_per_pixel;
size_t stencil_bytes_per_pixel = 1;
size_t stencil_size = output_width * output_height * stencil_bytes_per_pixel;
size_t input_bytes_per_pixel = 4;
size_t input_size = input_width * input_height * input_bytes_per_pixel;
VkDescriptorBufferInfo buffers[] = {
{
.buffer = dst,
.offset = 0,
.range = depth_size,
},
{
.buffer = dst,
.offset = depth_size,
.range = stencil_size,
},
{
.buffer = src,
.offset = 0,
.range = input_size,
},
};
update_descriptor_sets(pg, buffers, ARRAY_SIZE(buffers));
if (surface->host_fmt.vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE,
r->compute.pipeline_unpack_d24s8);
} else if (surface->host_fmt.vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE,
r->compute.pipeline_unpack_f32s8);
} else {
assert(!"Unsupported pack format");
}
vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE,
r->compute.pipeline_layout, 0, 1,
&r->compute.descriptor_sets[0], 0, NULL);
assert(output_width >= input_width);
uint32_t push_constants[2] = { input_width, output_width };
assert(sizeof(push_constants) == 8);
vkCmdPushConstants(cmd, r->compute.pipeline_layout,
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants),
push_constants);
size_t workgroup_size_in_units = 256;
size_t output_size_in_units = output_width * output_height;
assert(output_size_in_units % workgroup_size_in_units == 0);
size_t group_count = output_size_in_units / workgroup_size_in_units;
// FIXME: Check max group count
vkCmdDispatch(cmd, group_count, 1, 1);
}
void pgraph_vk_init_compute(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
create_descriptor_pool(pg);
create_descriptor_set_layout(pg);
create_descriptor_sets(pg);
create_compute_pipeline_layout(pg);
r->compute.pipeline_pack_d24s8 =
create_compute_pipeline(pg, pack_d24_unorm_s8_uint_to_z24s8_glsl);
r->compute.pipeline_unpack_d24s8 =
create_compute_pipeline(pg, unpack_z24s8_to_d24_unorm_s8_uint_glsl);
r->compute.pipeline_pack_f32s8 =
create_compute_pipeline(pg, pack_d32_sfloat_s8_uint_to_z24s8_glsl);
r->compute.pipeline_unpack_f32s8 =
create_compute_pipeline(pg, unpack_z24s8_to_d32_sfloat_s8_uint_glsl);
}
void pgraph_vk_finalize_compute(PGRAPHState *pg)
{
PGRAPHVkState *r = pg->vk_renderer_state;
VkPipeline *pipelines[] = {
&r->compute.pipeline_pack_d24s8,
&r->compute.pipeline_unpack_d24s8,
&r->compute.pipeline_pack_f32s8,
&r->compute.pipeline_unpack_f32s8,
};
for (int i = 0; i < ARRAY_SIZE(pipelines); i++) {
vkDestroyPipeline(r->device, *pipelines[i], NULL);
*pipelines[i] = VK_NULL_HANDLE;
}
vkDestroyPipelineLayout(r->device, r->compute.pipeline_layout, NULL);
r->compute.pipeline_layout = VK_NULL_HANDLE;
destroy_descriptor_sets(pg);
destroy_descriptor_set_layout(pg);
destroy_descriptor_pool(pg);
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,312 @@
/*
* Geforce NV2A PGRAPH Vulkan Renderer
*
* Copyright (c) 2024 Matt Borgerson
*
* Based on GL implementation:
*
* Copyright (c) 2012 espes
* Copyright (c) 2015 Jannik Vogel
* Copyright (c) 2018-2024 Matt Borgerson
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "renderer.h"
VkDeviceSize pgraph_vk_update_index_buffer(PGRAPHState *pg, void *data,
VkDeviceSize size)
{
nv2a_profile_inc_counter(NV2A_PROF_GEOM_BUFFER_UPDATE_2);
return pgraph_vk_append_to_buffer(pg, BUFFER_INDEX_STAGING, &data, &size, 1,
1);
}
VkDeviceSize pgraph_vk_update_vertex_inline_buffer(PGRAPHState *pg, void **data,
VkDeviceSize *sizes,
size_t count)
{
nv2a_profile_inc_counter(NV2A_PROF_GEOM_BUFFER_UPDATE_3);
return pgraph_vk_append_to_buffer(pg, BUFFER_VERTEX_INLINE_STAGING, data,
sizes, count, 1);
}
void pgraph_vk_update_vertex_ram_buffer(PGRAPHState *pg, hwaddr offset,
void *data, VkDeviceSize size)
{
PGRAPHVkState *r = pg->vk_renderer_state;
size_t offset_bit = offset / 4096;
size_t nbits = DIV_ROUND_UP(size, 4096);
if (find_next_bit(r->uploaded_bitmap, offset_bit + nbits, offset_bit) <
    offset_bit + nbits) {
// Vertex data changed while building the draw list. Finish drawing
// before updating RAM buffer.
pgraph_vk_finish(pg, VK_FINISH_REASON_VERTEX_BUFFER_DIRTY);
}
nv2a_profile_inc_counter(NV2A_PROF_GEOM_BUFFER_UPDATE_1);
memcpy(r->storage_buffers[BUFFER_VERTEX_RAM].mapped + offset, data, size);
bitmap_set(r->uploaded_bitmap, offset_bit, nbits);
}
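The bitmap above tracks uploads at 4 KiB page granularity: pages written since the last finish are marked, and touching an already-marked page forces a finish so in-flight draws never observe the overwrite. A self-contained sketch of that bookkeeping, using a plain bit array instead of QEMU's bitmap helpers:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define NUM_PAGES 1024 /* hypothetical size of the tracked region */

static uint8_t uploaded[NUM_PAGES / 8]; /* one bit per 4 KiB page */

static bool page_is_marked(size_t page)
{
    return uploaded[page / 8] & (1u << (page % 8));
}

/* True if any page overlapping [offset, offset + size) was already
 * uploaded, i.e. the draw list must be finished before overwriting. */
static bool range_needs_finish(size_t offset, size_t size)
{
    size_t last = (offset + size - 1) / PAGE_SIZE;
    for (size_t p = offset / PAGE_SIZE; p <= last; p++) {
        if (page_is_marked(p)) {
            return true;
        }
    }
    return false;
}

static void mark_range_uploaded(size_t offset, size_t size)
{
    size_t last = (offset + size - 1) / PAGE_SIZE;
    for (size_t p = offset / PAGE_SIZE; p <= last; p++) {
        uploaded[p / 8] |= 1u << (p % 8);
    }
}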
static void update_memory_buffer(NV2AState *d, hwaddr addr, hwaddr size)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHVkState *r = pg->vk_renderer_state;
assert(r->num_vertex_ram_buffer_syncs <
ARRAY_SIZE(r->vertex_ram_buffer_syncs));
r->vertex_ram_buffer_syncs[r->num_vertex_ram_buffer_syncs++] =
(MemorySyncRequirement){ .addr = addr, .size = size };
}
static const VkFormat float_to_count[] = {
VK_FORMAT_R32_SFLOAT,
VK_FORMAT_R32G32_SFLOAT,
VK_FORMAT_R32G32B32_SFLOAT,
VK_FORMAT_R32G32B32A32_SFLOAT,
};
static const VkFormat ub_to_count[] = {
VK_FORMAT_R8_UNORM,
VK_FORMAT_R8G8_UNORM,
VK_FORMAT_R8G8B8_UNORM,
VK_FORMAT_R8G8B8A8_UNORM,
};
static const VkFormat s1_to_count[] = {
VK_FORMAT_R16_SNORM,
VK_FORMAT_R16G16_SNORM,
VK_FORMAT_R16G16B16_SNORM,
VK_FORMAT_R16G16B16A16_SNORM,
};
static const VkFormat s32k_to_count[] = {
VK_FORMAT_R16_SSCALED,
VK_FORMAT_R16G16_SSCALED,
VK_FORMAT_R16G16B16_SSCALED,
VK_FORMAT_R16G16B16A16_SSCALED,
};
static char const * const vertex_data_array_format_to_str[] = {
[NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_UB_D3D] = "UB_D3D",
[NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_UB_OGL] = "UB_OGL",
[NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_S1] = "S1",
[NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_F] = "F",
[NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_S32K] = "S32K",
[NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_CMP] = "CMP",
};
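All four format tables are indexed by attr->count - 1, which is why each case below asserts the count before the lookup. A hypothetical standalone helper (not part of the renderer) showing the same indexing for the float case:

#include <assert.h>
#include <vulkan/vulkan_core.h>

/* Hypothetical helper mirroring the float_to_count table: map a float
 * attribute's 1-based component count to its VkFormat. */
static VkFormat float_attr_format(unsigned int count)
{
    static const VkFormat formats[] = {
        VK_FORMAT_R32_SFLOAT,
        VK_FORMAT_R32G32_SFLOAT,
        VK_FORMAT_R32G32B32_SFLOAT,
        VK_FORMAT_R32G32B32A32_SFLOAT,
    };
    assert(count >= 1 && count <= 4);
    return formats[count - 1];
}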
void pgraph_vk_bind_vertex_attributes(NV2AState *d, unsigned int min_element,
unsigned int max_element,
bool inline_data,
unsigned int inline_stride,
unsigned int provoking_element)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHVkState *r = pg->vk_renderer_state;
unsigned int num_elements = max_element - min_element + 1;
if (inline_data) {
NV2A_VK_DGROUP_BEGIN("%s (num_elements: %d inline stride: %d)",
__func__, num_elements, inline_stride);
} else {
NV2A_VK_DGROUP_BEGIN("%s (num_elements: %d)", __func__, num_elements);
}
pg->compressed_attrs = 0;
pg->uniform_attrs = 0;
pg->swizzle_attrs = 0;
r->num_active_vertex_attribute_descriptions = 0;
r->num_active_vertex_binding_descriptions = 0;
for (int i = 0; i < NV2A_VERTEXSHADER_ATTRIBUTES; i++) {
VertexAttribute *attr = &pg->vertex_attributes[i];
NV2A_VK_DGROUP_BEGIN("[attr %02d] format=%s, count=%d, stride=%d", i,
vertex_data_array_format_to_str[attr->format],
attr->count, attr->stride);
r->vertex_attribute_to_description_location[i] = -1;
if (!attr->count) {
pg->uniform_attrs |= 1 << i;
NV2A_VK_DPRINTF("inline_value = {%f, %f, %f, %f}",
attr->inline_value[0], attr->inline_value[1],
attr->inline_value[2], attr->inline_value[3]);
NV2A_VK_DGROUP_END();
continue;
}
VkFormat vk_format;
bool needs_conversion = false;
bool d3d_swizzle = false;
switch (attr->format) {
case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_UB_D3D:
assert(attr->count == 4);
d3d_swizzle = true;
/* fallthru */
case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_UB_OGL:
assert(attr->count <= ARRAY_SIZE(ub_to_count));
vk_format = ub_to_count[attr->count - 1];
break;
case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_S1:
assert(attr->count <= ARRAY_SIZE(s1_to_count));
vk_format = s1_to_count[attr->count - 1];
break;
case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_F:
assert(attr->count <= ARRAY_SIZE(float_to_count));
vk_format = float_to_count[attr->count - 1];
break;
case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_S32K:
assert(attr->count <= ARRAY_SIZE(s32k_to_count));
vk_format = s32k_to_count[attr->count - 1];
break;
case NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE_CMP:
vk_format =
VK_FORMAT_R32_SINT; // VK_FORMAT_B10G11R11_UFLOAT_PACK32 ??
/* 3 signed, normalized components packed in 32-bits. (11,11,10) */
assert(attr->count == 1);
needs_conversion = true;
break;
default:
fprintf(stderr, "Unknown vertex type: 0x%x\n", attr->format);
assert(false);
break;
}
nv2a_profile_inc_counter(NV2A_PROF_ATTR_BIND);
hwaddr attrib_data_addr;
size_t stride;
if (needs_conversion) {
pg->compressed_attrs |= (1 << i);
}
if (d3d_swizzle) {
pg->swizzle_attrs |= (1 << i);
}
hwaddr start = 0;
if (inline_data) {
attrib_data_addr = attr->inline_array_offset;
stride = inline_stride;
} else {
hwaddr dma_len;
uint8_t *attr_data = (uint8_t *)nv_dma_map(
d, attr->dma_select ? pg->dma_vertex_b : pg->dma_vertex_a,
&dma_len);
assert(attr->offset < dma_len);
attrib_data_addr = attr_data + attr->offset - d->vram_ptr;
stride = attr->stride;
start = attrib_data_addr + min_element * stride;
update_memory_buffer(d, start, num_elements * stride);
}
uint32_t provoking_element_index = provoking_element - min_element;
size_t element_size = attr->size * attr->count;
assert(element_size <= sizeof(attr->inline_value));
const uint8_t *last_entry;
if (inline_data) {
last_entry =
(uint8_t *)pg->inline_array + attr->inline_array_offset;
} else {
last_entry = d->vram_ptr + start;
}
if (!stride) {
// Stride of 0 indicates that only the first element should be
// used.
pg->uniform_attrs |= 1 << i;
pgraph_update_inline_value(attr, last_entry);
NV2A_VK_DPRINTF("inline_value = {%f, %f, %f, %f}",
attr->inline_value[0], attr->inline_value[1],
attr->inline_value[2], attr->inline_value[3]);
NV2A_VK_DGROUP_END();
continue;
}
NV2A_VK_DPRINTF("offset = %08" HWADDR_PRIx, attrib_data_addr);
last_entry += stride * provoking_element_index;
pgraph_update_inline_value(attr, last_entry);
r->vertex_attribute_to_description_location[i] =
r->num_active_vertex_binding_descriptions;
r->vertex_binding_descriptions
[r->num_active_vertex_binding_descriptions++] =
(VkVertexInputBindingDescription){
.binding = r->vertex_attribute_to_description_location[i],
.stride = stride,
.inputRate = VK_VERTEX_INPUT_RATE_VERTEX,
};
r->vertex_attribute_descriptions
[r->num_active_vertex_attribute_descriptions++] =
(VkVertexInputAttributeDescription){
.binding = r->vertex_attribute_to_description_location[i],
.location = i,
.format = vk_format,
};
r->vertex_attribute_offsets[i] = attrib_data_addr;
NV2A_VK_DGROUP_END();
}
NV2A_VK_DGROUP_END();
}
void pgraph_vk_bind_vertex_attributes_inline(NV2AState *d)
{
PGRAPHState *pg = &d->pgraph;
PGRAPHVkState *r = pg->vk_renderer_state;
pg->compressed_attrs = 0;
pg->uniform_attrs = 0;
pg->swizzle_attrs = 0;
r->num_active_vertex_attribute_descriptions = 0;
r->num_active_vertex_binding_descriptions = 0;
for (int i = 0; i < NV2A_VERTEXSHADER_ATTRIBUTES; i++) {
VertexAttribute *attr = &pg->vertex_attributes[i];
if (attr->inline_buffer_populated) {
r->vertex_attribute_to_description_location[i] =
r->num_active_vertex_binding_descriptions;
r->vertex_binding_descriptions
[r->num_active_vertex_binding_descriptions++] =
(VkVertexInputBindingDescription){
.binding =
r->vertex_attribute_to_description_location[i],
.stride = 4 * sizeof(float),
.inputRate = VK_VERTEX_INPUT_RATE_VERTEX,
};
r->vertex_attribute_descriptions
[r->num_active_vertex_attribute_descriptions++] =
(VkVertexInputAttributeDescription){
.binding =
r->vertex_attribute_to_description_location[i],
.location = i,
.format = VK_FORMAT_R32G32B32A32_SFLOAT,
};
memcpy(attr->inline_value,
attr->inline_buffer + (pg->inline_buffer_length - 1) * 4,
sizeof(attr->inline_value));
} else {
r->vertex_attribute_to_description_location[i] = -1;
pg->uniform_attrs |= 1 << i;
}
}
}

View File

@ -21,7 +21,7 @@
#define HW_NV2A_VSH_H
#include <stdbool.h>
#include "shaders_common.h"
#include "qemu/mstring.h"
enum VshLight {
LIGHT_OFF,
@ -130,11 +130,4 @@ typedef enum {
uint8_t vsh_get_field(const uint32_t *shader_token, VshFieldName field_name);
void vsh_translate(uint16_t version,
const uint32_t *tokens,
unsigned int length,
bool z_perspective,
MString *header, MString *body);
#endif

File diff suppressed because it is too large

View File

@ -1,125 +0,0 @@
/*
* QEMU Geforce NV2A shader common definitions
*
* Copyright (c) 2015 espes
* Copyright (c) 2015 Jannik Vogel
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef HW_NV2A_SHADERS_COMMON_H
#define HW_NV2A_SHADERS_COMMON_H
#include "debug.h"
#define DEF_VERTEX_DATA(qualifier, in_out, prefix, suffix) \
"noperspective " in_out " float " prefix "vtx_inv_w" suffix ";\n" \
"flat " in_out " float " prefix "vtx_inv_w_flat" suffix ";\n" \
qualifier " " in_out " vec4 " prefix "vtxD0" suffix ";\n" \
qualifier " " in_out " vec4 " prefix "vtxD1" suffix ";\n" \
qualifier " " in_out " vec4 " prefix "vtxB0" suffix ";\n" \
qualifier " " in_out " vec4 " prefix "vtxB1" suffix ";\n" \
"noperspective " in_out " float " prefix "vtxFog" suffix ";\n" \
"noperspective " in_out " vec4 " prefix "vtxT0" suffix ";\n" \
"noperspective " in_out " vec4 " prefix "vtxT1" suffix ";\n" \
"noperspective " in_out " vec4 " prefix "vtxT2" suffix ";\n" \
"noperspective " in_out " vec4 " prefix "vtxT3" suffix ";\n"
#define STRUCT_VERTEX_DATA_OUT_SMOOTH DEF_VERTEX_DATA("noperspective", "out", "", "")
#define STRUCT_VERTEX_DATA_IN_SMOOTH DEF_VERTEX_DATA("noperspective", "in", "", "")
#define STRUCT_V_VERTEX_DATA_OUT_SMOOTH DEF_VERTEX_DATA("noperspective", "out", "v_", "")
#define STRUCT_V_VERTEX_DATA_IN_ARRAY_SMOOTH DEF_VERTEX_DATA("noperspective", "in", "v_", "[]")
#define STRUCT_VERTEX_DATA_OUT_FLAT DEF_VERTEX_DATA("flat", "out", "", "")
#define STRUCT_VERTEX_DATA_IN_FLAT DEF_VERTEX_DATA("flat", "in", "", "")
#define STRUCT_V_VERTEX_DATA_OUT_FLAT DEF_VERTEX_DATA("flat", "out", "v_", "")
#define STRUCT_V_VERTEX_DATA_IN_ARRAY_FLAT DEF_VERTEX_DATA("flat", "in", "v_", "[]")
typedef struct {
int ref;
gchar *string;
} MString;
void mstring_append_fmt(MString *mstring, const char *fmt, ...);
MString *mstring_from_fmt(const char *fmt, ...);
void mstring_append_va(MString *mstring, const char *fmt, va_list va);
static inline
void mstring_ref(MString *mstr)
{
mstr->ref++;
}
static inline
void mstring_unref(MString *mstr)
{
mstr->ref--;
if (!mstr->ref) {
g_free(mstr->string);
g_free(mstr);
}
}
static inline
void mstring_append(MString *mstr, const char *str)
{
gchar *n = g_strconcat(mstr->string, str, NULL);
g_free(mstr->string);
mstr->string = n;
}
static inline
void mstring_append_chr(MString *mstr, char chr)
{
mstring_append_fmt(mstr, "%c", chr);
}
static inline
void mstring_append_int(MString *mstr, int val)
{
mstring_append_fmt(mstr, "%" PRId64, val);
}
static inline
MString *mstring_new(void)
{
MString *mstr = g_malloc(sizeof(MString));
mstr->ref = 1;
mstr->string = g_strdup("");
return mstr;
}
static inline
MString *mstring_from_str(const char *str)
{
MString *mstr = g_malloc(sizeof(MString));
mstr->ref = 1;
mstr->string = g_strdup(str);
return mstr;
}
static inline
const gchar *mstring_get_str(MString *mstr)
{
return mstr->string;
}
static inline
size_t mstring_get_length(MString *mstr)
{
return strlen(mstr->string);
}
#endif

View File

@ -1,7 +1,7 @@
/*
* LRU object list
*
* Copyright (c) 2021 Matt Borgerson
* Copyright (c) 2021-2024 Matt Borgerson
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@ -42,6 +42,8 @@ typedef struct Lru Lru;
struct Lru {
QTAILQ_HEAD(, LruNode) global;
QTAILQ_HEAD(, LruNode) bins[LRU_NUM_BINS];
int num_used;
int num_free;
/* Initialize a node. */
void (*init_node)(Lru *lru, LruNode *node, void *key);
@ -67,6 +69,8 @@ void lru_init(Lru *lru)
lru->compare_nodes = NULL;
lru->pre_node_evict = NULL;
lru->post_node_evict = NULL;
lru->num_free = 0;
lru->num_used = 0;
}
static inline
@ -74,6 +78,7 @@ void lru_add_free(Lru *lru, LruNode *node)
{
node->next_bin.tqe_circ.tql_prev = NULL;
QTAILQ_INSERT_TAIL(&lru->global, node, next_global);
lru->num_free += 1;
}
static inline
@ -106,29 +111,51 @@ void lru_evict_node(Lru *lru, LruNode *node)
if (lru->post_node_evict) {
lru->post_node_evict(lru, node);
}
lru->num_used -= 1;
lru->num_free += 1;
}
static inline
LruNode *lru_try_evict_one(Lru *lru)
{
LruNode *found;
QTAILQ_FOREACH_REVERSE(found, &lru->global, next_global) {
if (lru_is_node_in_use(lru, found)
&& (!lru->pre_node_evict || lru->pre_node_evict(lru, found))) {
lru_evict_node(lru, found);
return found;
}
}
return NULL;
}
static inline
LruNode *lru_evict_one(Lru *lru)
{
LruNode *found;
QTAILQ_FOREACH_REVERSE(found, &lru->global, next_global) {
bool can_evict = true;
if (lru_is_node_in_use(lru, found) && lru->pre_node_evict) {
can_evict = lru->pre_node_evict(lru, found);
}
if (can_evict) {
break;
}
}
LruNode *found = lru_try_evict_one(lru);
assert(found != NULL); /* No evictable node! */
lru_evict_node(lru, found);
return found;
}
static inline
LruNode *lru_get_one_free(Lru *lru)
{
LruNode *found;
QTAILQ_FOREACH_REVERSE(found, &lru->global, next_global) {
if (!lru_is_node_in_use(lru, found)) {
return found;
}
}
return lru_evict_one(lru);
}
static inline
bool lru_contains_hash(Lru *lru, uint64_t hash)
{
@ -160,12 +187,15 @@ LruNode *lru_lookup(Lru *lru, uint64_t hash, void *key)
if (found) {
QTAILQ_REMOVE(&lru->bins[bin], found, next_bin);
} else {
found = lru_evict_one(lru);
found = lru_get_one_free(lru);
found->hash = hash;
if (lru->init_node) {
lru->init_node(lru, found, key);
}
assert(found->hash == hash);
lru->num_used += 1;
lru->num_free -= 1;
}
QTAILQ_REMOVE(&lru->global, found, next_global);
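For context on how the new counters and lru_get_one_free() fit in: callers embed LruNode in their cache entry, seed the free list with lru_add_free(), and lru_lookup() either finds an entry by hash or recycles one. A hedged sketch under those assumptions (the entry layout, header path, and pool size are invented):

#include "qemu/osdep.h" /* for container_of */
#include "qemu/lru.h"   /* assumed include path for the header above */

typedef struct {
    LruNode node; /* embedded so the intrusive lists can link it */
    uint64_t key;
    /* ... cached payload ... */
} CacheEntry;

static void cache_init_node(Lru *lru, LruNode *node, void *key)
{
    CacheEntry *e = container_of(node, CacheEntry, node);
    e->key = *(uint64_t *)key;
}

static void cache_setup(Lru *lru, CacheEntry *pool, size_t n)
{
    lru_init(lru);
    lru->init_node = cache_init_node;
    for (size_t i = 0; i < n; i++) {
        lru_add_free(lru, &pool[i].node); /* num_free grows to n */
    }
}

/* Returns the cached entry for hash, recycling the least recently
 * used node via lru_get_one_free() on a miss. */
static CacheEntry *cache_get(Lru *lru, uint64_t hash)
{
    return container_of(lru_lookup(lru, hash, &hash), CacheEntry, node);
}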

82
include/qemu/mstring.h Normal file
View File

@ -0,0 +1,82 @@
#ifndef MSTRING_H
#define MSTRING_H
#include "qemu/osdep.h"
#include <string.h>
typedef struct {
int ref;
gchar *string;
} MString;
void mstring_append_fmt(MString *mstring, const char *fmt, ...);
MString *mstring_from_fmt(const char *fmt, ...);
void mstring_append_va(MString *mstring, const char *fmt, va_list va);
static inline
void mstring_ref(MString *mstr)
{
mstr->ref++;
}
static inline
void mstring_unref(MString *mstr)
{
mstr->ref--;
if (!mstr->ref) {
g_free(mstr->string);
g_free(mstr);
}
}
static inline
void mstring_append(MString *mstr, const char *str)
{
gchar *n = g_strconcat(mstr->string, str, NULL);
g_free(mstr->string);
mstr->string = n;
}
static inline
void mstring_append_chr(MString *mstr, char chr)
{
mstring_append_fmt(mstr, "%c", chr);
}
static inline
void mstring_append_int(MString *mstr, int val)
{
mstring_append_fmt(mstr, "%" PRId64, val);
}
static inline
MString *mstring_new(void)
{
MString *mstr = g_malloc(sizeof(MString));
mstr->ref = 1;
mstr->string = g_strdup("");
return mstr;
}
static inline
MString *mstring_from_str(const char *str)
{
MString *mstr = g_malloc(sizeof(MString));
mstr->ref = 1;
mstr->string = g_strdup(str);
return mstr;
}
static inline
const gchar *mstring_get_str(MString *mstr)
{
return mstr->string;
}
static inline
size_t mstring_get_length(MString *mstr)
{
return strlen(mstr->string);
}
#endif
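Usage of this refcounted string builder follows the usual create/append/read/unref shape; a minimal sketch:

#include <stdio.h>
#include "qemu/mstring.h"

static void emit_prologue(void)
{
    MString *s = mstring_new();
    mstring_append(s, "#version 450\n");
    mstring_append_fmt(s, "layout(local_size_x = %d) in;\n", 256);
    /* mstring_get_str() borrows the buffer; it stays valid until the
     * final mstring_unref() drops the refcount to zero. */
    puts(mstring_get_str(s));
    mstring_unref(s);
}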

View File

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -0,0 +1,19 @@
Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

19
licenses/volk.license.txt Normal file
View File

@ -0,0 +1,19 @@
Copyright (c) 2018-2024 Arseny Kapoulkine
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -1180,6 +1180,34 @@ if not get_option('opengl').auto() or have_system or have_vhost_user_gpu
link_args: config_host['EPOXY_LIBS'].split() + opengl_libs)
endif
vulkan = not_found
if targetos == 'windows'
vulkan = declare_dependency(
compile_args: ['-DVK_USE_PLATFORM_WIN32_KHR', '-DVK_NO_PROTOTYPES'],
)
libglslang = declare_dependency(link_args: [
'-lglslang',
'-lMachineIndependent',
'-lGenericCodeGen',
'-lSPIRV',
'-lSPIRV-Tools',
'-lSPIRV-Tools-opt'
])
elif targetos == 'linux'
vulkan = dependency('vulkan')
libglslang = declare_dependency(link_args: [
'-lglslang',
'-lMachineIndependent',
'-lGenericCodeGen',
'-lSPIRV',
'-lSPIRV-Tools',
'-lSPIRV-Tools-opt'
])
endif
subdir('thirdparty')
gbm = not_found
if (have_system or have_tools) and (virgl.found() or opengl.found())
gbm = dependency('gbm', method: 'pkg-config', required: false,
@ -1931,6 +1959,7 @@ config_host_data.set('CONFIG_LINUX_IO_URING', linux_io_uring.found())
config_host_data.set('CONFIG_LIBPMEM', libpmem.found())
config_host_data.set('CONFIG_NUMA', numa.found())
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_VULKAN', vulkan.found())
config_host_data.set('CONFIG_PROFILER', get_option('profiler'))
config_host_data.set('CONFIG_RBD', rbd.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
@ -4054,6 +4083,7 @@ summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
summary_info += {'usb net redir': usbredir}
summary_info += {'OpenGL support (epoxy)': opengl}
summary_info += {'Vulkan support': vulkan}
summary_info += {'GBM': gbm}
summary_info += {'libiscsi support': libiscsi}
summary_info += {'libnfs support': libnfs}
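Since meson records the probe result via config_host_data.set('CONFIG_VULKAN', vulkan.found()), C code can gate the Vulkan path at compile time. A small sketch of the pattern (the helper is invented for illustration):

#include "qemu/osdep.h" /* pulls in the generated config-host defines */

/* Invented helper: pick a renderer name based on build-time support. */
const char *default_renderer_name(void)
{
#ifdef CONFIG_VULKAN
    return "VULKAN";
#else
    return "OPENGL";
#endif
}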

View File

@ -28,8 +28,12 @@ sub_file="${sub_tdir}/submodule.tar"
# different to the host OS.
submodules="dtc meson ui/keycodemapdb"
submodules="$submodules tests/fp/berkeley-softfloat-3 tests/fp/berkeley-testfloat-3"
submodules="$submodules ui/thirdparty/imgui ui/thirdparty/implot ui/thirdparty/httplib util/xxHash tomlplusplus genconfig" # xemu extras
# xemu extras
submodules="$submodules ui/thirdparty/imgui ui/thirdparty/implot ui/thirdparty/httplib util/xxHash tomlplusplus genconfig"
submodules="$submodules hw/xbox/nv2a/thirdparty/nv2a_vsh_cpu"
submodules="$submodules thirdparty/volk thirdparty/VulkanMemoryAllocator thirdparty/SPIRV-Reflect"
sub_deinit=""
function cleanup() {

View File

@ -228,7 +228,25 @@ Lib('fpng', 'https://github.com/richgel999/fpng',
Lib('nv2a_vsh_cpu', 'https://github.com/abaire/nv2a_vsh_cpu',
unlicense, 'https://raw.githubusercontent.com/abaire/nv2a_vsh_cpu/main/LICENSE',
ships_static=all_platforms,
submodule=Submodule('hw/xbox/nv2a/thirdparty/nv2a_vsh_cpu')
submodule=Submodule('hw/xbox/nv2a/pgraph/thirdparty/nv2a_vsh_cpu')
),
Lib('volk', 'https://github.com/zeux/volk',
mit, 'https://raw.githubusercontent.com/zeux/volk/master/LICENSE.md',
ships_static=all_platforms,
submodule=Submodule('thirdparty/volk')
),
Lib('VulkanMemoryAllocator', 'https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator',
mit, 'https://raw.githubusercontent.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator/master/LICENSE.txt',
ships_static=all_platforms,
submodule=Submodule('thirdparty/VulkanMemoryAllocator')
),
Lib('SPIRV-Reflect', 'https://github.com/KhronosGroup/SPIRV-Reflect',
apache2, 'https://raw.githubusercontent.com/KhronosGroup/SPIRV-Reflect/main/LICENSE',
ships_static=all_platforms,
submodule=Submodule('thirdparty/SPIRV-Reflect')
),
#
@ -344,6 +362,17 @@ Lib('miniz', 'https://github.com/richgel999/miniz',
ships_static={windows}, platform={windows},
version='2.1.0'
),
Lib('glslang', 'https://github.com/KhronosGroup/glslang',
bsd_3clause, 'https://raw.githubusercontent.com/KhronosGroup/glslang/main/LICENSE.txt',
ships_static={windows}, platform={windows},
),
Lib('SPIRV-Tools', 'https://github.com/KhronosGroup/SPIRV-Tools',
apache2, 'https://raw.githubusercontent.com/KhronosGroup/SPIRV-Tools/main/LICENSE',
ships_static={windows}, platform={windows},
),
]
def gen_license():

1
thirdparty/SPIRV-Reflect vendored Submodule

@ -0,0 +1 @@
Subproject commit 1d674a82d7e102ed0c02e64e036827db9e8b1a71

Some files were not shown because too many files have changed in this diff