137 changed files with 6128 additions and 8427 deletions
--- a/.github/workflows/build-xemu-win64-toolchain.yml
+++ b/.github/workflows/build-xemu-win64-toolchain.yml
@ -24,7 +24,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
      - name: Extract image metadata (tags, labels)
        id: meta
-        uses: docker/metadata-action@c1e51972afc2121e065aed6d45c65596fe445f3f # v5
+        uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804 # v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          labels: |
@ -35,7 +35,7 @@ jobs:
            type=ref,event=branch
            type=sha
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3
+        uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3
      - name: Login to GitHub Container Registry
        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3
        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
@ -44,7 +44,7 @@ jobs:
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Build and push image
-        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v5
+        uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 # v5
        with:
          context: ubuntu-win64-cross
          push: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -83,7 +83,7 @@ jobs:

    steps:
    - name: Download source package
-      uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
+      uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4
      with:
        name: src.tar.gz
    - name: Extract source package
@ -140,7 +140,7 @@ jobs:
          arch: aarch64
    steps:
    - name: Download artifacts
-      uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
+      uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4
      with:
        name: ${{ matrix.artifact_name }}
        path: ${{ matrix.artifact_name }}
@ -174,25 +174,25 @@ jobs:
          build_param: --debug
          artifact_name: xemu-ubuntu-x86_64-debug
          artifact_filename: xemu-ubuntu-x86_64-debug.tgz
-          runs-on: ubuntu-22.04
+          runs-on: ubuntu-24.04
        - arch: x86_64
          configuration: Release
          build_param:
          artifact_name: xemu-ubuntu-x86_64-release
          artifact_filename: xemu-ubuntu-x86_64-release.tgz
-          runs-on: ubuntu-22.04
+          runs-on: ubuntu-24.04
        - arch: aarch64
          configuration: Debug
          build_param: --debug
          artifact_name: xemu-ubuntu-aarch64-debug
          artifact_filename: xemu-ubuntu-aarch64-debug.tgz
-          runs-on: ubuntu-22.04-arm
+          runs-on: ubuntu-24.04-arm
        - arch: aarch64
          configuration: Release
          build_param:
          artifact_name: xemu-ubuntu-aarch64-release
          artifact_filename: xemu-ubuntu-aarch64-release.tgz
-          runs-on: ubuntu-22.04-arm
+          runs-on: ubuntu-24.04-arm
    steps:
    - name: Initialize compiler cache
      id: cache
@ -202,7 +202,7 @@ jobs:
        key: cache-${{ runner.os }}-${{ matrix.arch }}-${{ matrix.configuration }}-${{ github.sha }}
        restore-keys: cache-${{ runner.os }}-${{ matrix.arch }}-${{ matrix.configuration }}-
    - name: Download source package
-      uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
+      uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4
      with:
        name: src.tar.gz
    - name: Extract source package
@ -305,12 +305,12 @@ jobs:
          artifact_filename: xemu-macos-arm64-release.zip
    steps:
    - name: Download source package
-      uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
+      uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4
      with:
        name: src.tar.gz
    - name: Extract source package
      run: tar xf src.tar.gz
-    - uses: actions/setup-python@v5.6.0
+    - uses: actions/setup-python@v5.5.0
      with:
        python-version: '3.12'
    - name: Install dependencies
@ -358,12 +358,12 @@ jobs:
        configuration: ["debug", "release"]
    steps:
    - name: Download x86_64 build
-      uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
+      uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4
      with:
        name: xemu-macos-x86_64-${{ matrix.configuration }}
        path: xemu-macos-x86_64-${{ matrix.configuration }}
    - name: Download arm64 build
-      uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
+      uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4
      with:
        name: xemu-macos-arm64-${{ matrix.configuration }}
        path: xemu-macos-arm64-${{ matrix.configuration }}
@ -398,7 +398,7 @@ jobs:
    needs: [Ubuntu, macOSUniversal, Windows, WindowsPdb]
    steps:
    - name: Download artifacts
-      uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
+      uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4
      with:
        path: dist
    - name: Extract source package
@ -420,7 +420,7 @@ jobs:
      run: |
        cp dist/xemu-win-x86_64-release-pdb/xemu-win-x86_64-release.zip dist/xemu-win-x86_64-release-pdb/xemu-win-release.zip
    - name: Publish release
-      uses: softprops/action-gh-release@72f2c25fcb47643c292f7107632f7a47c1df5cd8 # v2.3.2
+      uses: softprops/action-gh-release@c95fe1489396fe8a9eb87c0abf8aa5b2ef267fda # v2.2.1
      with:
        tag_name: v${{ env.XEMU_VERSION }}
        name: v${{ env.XEMU_VERSION }}
@ -462,7 +462,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
    - name: Download source package
-      uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
+      uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4
      with:
        name: src.tar.gz
    - name: Extract source package
@ -471,7 +471,7 @@ jobs:
        tar -C src -xf src.tar.gz

        # Ensure subprojects are uploaded
-        find src/subprojects -name "*.gitignore" -exec rm {} \;
+        rm -f src/subprojects/.gitignore
    - name: Integrate Debian packaging
      run: |
        pushd src
--- a/.github/workflows/bump-subproject-wraps.yml
+++ b/.github/workflows/bump-subproject-wraps.yml
@ -1,74 +0,0 @@
-name: Bump Meson subprojects
-
-on:
-  workflow_dispatch:
-  schedule:
-    - cron: '0 6 * * 1'
-
-permissions:
-  contents: write
-  pull-requests: write
-
-jobs:
-  bump_wraps:
-    name: "Bump Meson subprojects"
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
-
-      - name: Install the latest version of uv
-        uses: astral-sh/setup-uv@e92bafb6253dcd438e0484186d7669ea7a8ca1cc # v6
-        with:
-          enable-cache: false
-
-      - name: Check for updates
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          uv run -s scripts/bump-subproject-wraps.py -m \
-              subprojects/curl.wrap \
-              subprojects/genconfig.wrap \
-              subprojects/glslang.wrap \
-              subprojects/imgui.wrap \
-              subprojects/implot.wrap \
-              subprojects/json.wrap \
-              subprojects/nv2a_vsh_cpu.wrap \
-              subprojects/SPIRV-Reflect.wrap \
-              subprojects/tomlplusplus.wrap \
-              subprojects/volk.wrap \
-              subprojects/VulkanMemoryAllocator.wrap \
-              subprojects/xxhash.wrap \
-            > updated.json
-
-      - name: Create PRs for updates
-        env:
-          GH_TOKEN: ${{ secrets.XEMU_ROBOT_TOKEN }}
-        run: |
-          set -euo pipefail
-          git config user.name  "xemu-robot"
-          git config user.email "robot@xemu.app"
-
-          jq -c '.[]' updated.json | while read -r item; do
-            path=$(echo "$item" | jq -r '.path')
-            file_basename=$(basename "$path")
-            name="${file_basename%%.*}"
-
-            owner=$(echo "$item" | jq -r '.owner')
-            repo=$(echo "$item" | jq -r '.repo')
-            old_rev=$(echo "$item" | jq -r '.old_rev')
-            new_rev=$(echo "$item" | jq -r '.new_rev')
-            new_tag=$(echo "$item" | jq -r '.new_tag')
-
-            echo "➤ Processing $name"
-            branch="sync/bump-${name//\//-}-${GITHUB_RUN_ID}"
-
-            git switch --quiet -c "$branch" origin/master
-            git add "$path"
-            git commit -m "meson: Bump ${name} to ${new_tag}"
-            git push -u origin "$branch"
-
-            gh pr create \
-              --title "meson: Bump ${name} to ${new_tag}" \
-              --body  "Automatic bump of \`${name}\` to [${new_tag}](https://github.com/${owner}/${repo}/compare/${old_rev}..${new_rev})." \
-              --base  master
-          done
--- a/build.sh
+++ b/build.sh
@ -66,14 +66,11 @@ package_macos() {

    cp Info.plist dist/xemu.app/Contents/

-    if [[ -e "${project_source_dir}/XEMU_VERSION" ]]; then
-      xemu_version="$(cat ${project_source_dir}/XEMU_VERSION | cut -f1 -d-)"
-    else
-      xemu_version="0.0.0"
-    fi
-
-    plutil -replace CFBundleShortVersionString -string "${xemu_version}" dist/xemu.app/Contents/Info.plist
-    plutil -replace CFBundleVersion            -string "${xemu_version}" dist/xemu.app/Contents/Info.plist
+    # Bundle version is the part of XEMU_VERSION before the first '-'. Fall back
+    # to 0.0.0 when the file is missing so plutil never gets an empty argument.
+    xemu_version="$(cut -f1 -d- "${project_source_dir}/XEMU_VERSION" 2>/dev/null || echo "0.0.0")"
+    plutil -replace CFBundleShortVersionString -string "${xemu_version}" dist/xemu.app/Contents/Info.plist
+    plutil -replace CFBundleVersion            -string "${xemu_version}" dist/xemu.app/Contents/Info.plist

    codesign --force --deep --preserve-metadata=entitlements,requirements,flags,runtime --sign - "${exe_path}"
    python3 ./scripts/gen-license.py --version-file=macos-libs/$target_arch/INSTALLED > dist/LICENSE.txt
--- a/config_spec.yml
+++ b/config_spec.yml
@ -54,9 +54,6 @@ input:
  auto_bind:
    type: bool
    default: true
-  allow_vibration:
-    type: bool
-    default: true
  background_input_capture: bool
  keyboard_controller_scancode_map:
    # Scancode reference : https://github.com/libsdl-org/SDL/blob/main/include/SDL_scancode.h
@ -212,19 +209,9 @@ display:
      advanced_tree_state:
        type: bool
        default: false
-  setup_nvidia_profile:
-    type: bool
-    default: true

 audio:
-  vp:
-    num_workers:
-      type: integer
-      default: 0  # 0 = auto
  use_dsp: bool
-  hrtf:
-    type: bool
-    default: true
  volume_limit:
    type: number
    default: 1
--- a/debian/control
+++ b/debian/control
@ -6,8 +6,6 @@ Build-Depends: debhelper (>= 11),
 cmake,
 git,
 python3:any,
- python3-pip,
- python3-tomli,
 python3-yaml,
 python3-venv,
 ninja-build,
--- a/hw/xbox/mcpx/apu/vp/adpcm.h
+++ b/hw/xbox/mcpx/apu/vp/adpcm.h
--- a/hw/xbox/mcpx/apu/vp/vp.c
+++ b/hw/xbox/mcpx/apu/vp/vp.c
--- a/hw/xbox/mcpx/apu/apu.h
+++ b/hw/xbox/mcpx/apu/apu.h
--- a/hw/xbox/mcpx/apu/apu.c
+++ b/hw/xbox/mcpx/apu/apu.c
@ -1,564 +0,0 @@
-/*
- * QEMU MCPX Audio Processing Unit implementation
- *
- * Copyright (c) 2012 espes
- * Copyright (c) 2018-2019 Jannik Vogel
- * Copyright (c) 2019-2025 Matt Borgerson
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "apu_int.h"
-
-MCPXAPUState *g_state; // Used via debug handlers
-
-static void update_irq(MCPXAPUState *d)
-{
-    if (d->regs[NV_PAPU_FECTL] & NV_PAPU_FECTL_FEMETHMODE_TRAPPED) {
-        qatomic_or(&d->regs[NV_PAPU_ISTS], NV_PAPU_ISTS_FETINTSTS);
-    }
-    if ((d->regs[NV_PAPU_IEN] & NV_PAPU_ISTS_GINTSTS) &&
-        ((d->regs[NV_PAPU_ISTS] & ~NV_PAPU_ISTS_GINTSTS) &
-         d->regs[NV_PAPU_IEN])) {
-        qatomic_or(&d->regs[NV_PAPU_ISTS], NV_PAPU_ISTS_GINTSTS);
-        // fprintf(stderr, "mcpx irq raise ien=%08x ists=%08x\n",
-        //         d->regs[NV_PAPU_IEN], d->regs[NV_PAPU_ISTS]);
-        pci_irq_assert(PCI_DEVICE(d));
-    } else {
-        qatomic_and(&d->regs[NV_PAPU_ISTS], ~NV_PAPU_ISTS_GINTSTS);
-        // fprintf(stderr, "mcpx irq lower ien=%08x ists=%08x\n",
-        //         d->regs[NV_PAPU_IEN], d->regs[NV_PAPU_ISTS]);
-        pci_irq_deassert(PCI_DEVICE(d));
-    }
-}
-
-static uint64_t mcpx_apu_read(void *opaque, hwaddr addr, unsigned int size)
-{
-    MCPXAPUState *d = opaque;
-
-    uint64_t r = 0;
-    switch (addr) {
-    case NV_PAPU_XGSCNT:
-        r = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) / 100; //???
-        break;
-    default:
-        if (addr < 0x20000) {
-            r = qatomic_read(&d->regs[addr]);
-        }
-        break;
-    }
-
-    trace_mcpx_apu_reg_read(addr, size, r);
-    return r;
-}
-
-static void mcpx_apu_write(void *opaque, hwaddr addr, uint64_t val,
-                           unsigned int size)
-{
-    MCPXAPUState *d = opaque;
-
-    trace_mcpx_apu_reg_write(addr, size, val);
-
-    switch (addr) {
-    case NV_PAPU_ISTS:
-        /* the bits of the interrupts to clear are written */
-        qatomic_and(&d->regs[NV_PAPU_ISTS], ~val);
-        update_irq(d);
-        qemu_cond_broadcast(&d->cond);
-        break;
-    case NV_PAPU_FECTL:
-    case NV_PAPU_SECTL:
-        qatomic_set(&d->regs[addr], val);
-        qemu_cond_broadcast(&d->cond);
-        break;
-    case NV_PAPU_FEMEMDATA:
-        /* 'magic write'
-         * This value is expected to be written to FEMEMADDR on completion of
-         * something to do with notifies. Just do it now :/ */
-        stl_le_phys(&address_space_memory, d->regs[NV_PAPU_FEMEMADDR], val);
-        // fprintf(stderr, "MAGIC WRITE\n");
-        qatomic_set(&d->regs[addr], val);
-        break;
-    default:
-        if (addr < 0x20000) {
-            qatomic_set(&d->regs[addr], val);
-        }
-        break;
-    }
-}
-
-static const MemoryRegionOps mcpx_apu_mmio_ops = {
-    .read = mcpx_apu_read,
-    .write = mcpx_apu_write,
-};
-
-static void se_frame(MCPXAPUState *d)
-{
-    mcpx_apu_update_dsp_preference(d);
-    mcpx_debug_begin_frame();
-    g_dbg.gp_realtime = d->gp.realtime;
-    g_dbg.ep_realtime = d->ep.realtime;
-
-    qemu_spin_lock(&d->monitor.fifo_lock);
-    int num_bytes_free = fifo8_num_free(&d->monitor.fifo);
-    qemu_spin_unlock(&d->monitor.fifo_lock);
-
-    /* A rudimentary calculation to determine approximately how taxed the APU
-     * thread is, by measuring how much time we spend waiting for FIFO to drain
-     * versus working on building frames.
-     * =1: thread is not sleeping and likely falling behind realtime
-     * <1: thread is able to complete work on time
-     */
-    if (num_bytes_free < sizeof(d->monitor.frame_buf)) {
-        int64_t sleep_start = qemu_clock_get_us(QEMU_CLOCK_REALTIME);
-        qemu_cond_wait(&d->cond, &d->lock);
-        int64_t sleep_end = qemu_clock_get_us(QEMU_CLOCK_REALTIME);
-        d->sleep_acc += (sleep_end - sleep_start);
-        return;
-    }
-    int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
-    if (now - d->frame_count_time >= 1000) {
-        g_dbg.frames_processed = d->frame_count;
-        float t = 1.0f - ((double)d->sleep_acc /
-                          (double)((now - d->frame_count_time) * 1000));
-        g_dbg.utilization = t;
-
-        d->frame_count_time = now;
-        d->frame_count = 0;
-        d->sleep_acc = 0;
-    }
-    d->frame_count++;
-
-    /* Buffer for all mixbins for this frame */
-    float mixbins[NUM_MIXBINS][NUM_SAMPLES_PER_FRAME] = { 0 };
-
-    mcpx_apu_vp_frame(d, mixbins);
-    mcpx_apu_dsp_frame(d, mixbins);
-
-    if ((d->ep_frame_div + 1) % 8 == 0) {
-#if 0
-        FILE *fd = fopen("ep.pcm", "a+");
-        assert(fd != NULL);
-        fwrite(d->apu_fifo_output, sizeof(d->apu_fifo_output), 1, fd);
-        fclose(fd);
-#endif
-
-        if (0 <= g_config.audio.volume_limit && g_config.audio.volume_limit < 1) {
-            float f = pow(g_config.audio.volume_limit, M_E);
-            for (int i = 0; i < 256; i++) {
-                d->monitor.frame_buf[i][0] *= f;
-                d->monitor.frame_buf[i][1] *= f;
-            }
-        }
-
-        qemu_spin_lock(&d->monitor.fifo_lock);
-        num_bytes_free = fifo8_num_free(&d->monitor.fifo);
-        assert(num_bytes_free >= sizeof(d->monitor.frame_buf));
-        fifo8_push_all(&d->monitor.fifo, (uint8_t *)d->monitor.frame_buf,
-                       sizeof(d->monitor.frame_buf));
-        qemu_spin_unlock(&d->monitor.fifo_lock);
-        memset(d->monitor.frame_buf, 0, sizeof(d->monitor.frame_buf));
-    }
-
-    d->ep_frame_div++;
-
-    mcpx_debug_end_frame();
-}
-
-/* Note: only supports millisecond resolution on Windows */
-static void sleep_ns(int64_t ns)
-{
-#ifndef _WIN32
-        struct timespec sleep_delay, rem_delay;
-        sleep_delay.tv_sec = ns / 1000000000LL;
-        sleep_delay.tv_nsec = ns % 1000000000LL;
-        nanosleep(&sleep_delay, &rem_delay);
-#else
-        Sleep(ns / SCALE_MS);
-#endif
-}
-
-static void monitor_sink_cb(void *opaque, uint8_t *stream, int free_b)
-{
-    MCPXAPUState *s = MCPX_APU_DEVICE(opaque);
-
-    if (!runstate_is_running()) {
-        memset(stream, 0, free_b);
-        return;
-    }
-
-    int avail = 0;
-    while (avail < free_b) {
-        qemu_spin_lock(&s->monitor.fifo_lock);
-        avail = fifo8_num_used(&s->monitor.fifo);
-        qemu_spin_unlock(&s->monitor.fifo_lock);
-        if (avail < free_b) {
-            sleep_ns(1000000);
-            qemu_cond_broadcast(&s->cond);
-        }
-        if (!runstate_is_running()) {
-            memset(stream, 0, free_b);
-            return;
-        }
-    }
-
-    int to_copy = MIN(free_b, avail);
-    while (to_copy > 0) {
-        uint32_t chunk_len = 0;
-        qemu_spin_lock(&s->monitor.fifo_lock);
-        chunk_len = fifo8_pop_buf(&s->monitor.fifo, stream, to_copy);
-        assert(chunk_len <= to_copy);
-        qemu_spin_unlock(&s->monitor.fifo_lock);
-        stream += chunk_len;
-        to_copy -= chunk_len;
-    }
-
-    qemu_cond_broadcast(&s->cond);
-}
-
-static void monitor_init(MCPXAPUState *d)
-{
-    qemu_spin_init(&d->monitor.fifo_lock);
-    fifo8_create(&d->monitor.fifo, 3 * (256 * 2 * 2));
-
-    struct SDL_AudioSpec sdl_audio_spec = {
-        .freq = 48000,
-        .format = AUDIO_S16LSB,
-        .channels = 2,
-        .samples = 512,
-        .callback = monitor_sink_cb,
-        .userdata = d,
-    };
-
-    if (SDL_Init(SDL_INIT_AUDIO) < 0)  {
-        fprintf(stderr, "Failed to initialize SDL audio subsystem: %s\n", SDL_GetError());
-        exit(1);
-    }
-
-    SDL_AudioDeviceID sdl_audio_dev;
-    sdl_audio_dev = SDL_OpenAudioDevice(NULL, 0, &sdl_audio_spec, NULL, 0);
-    if (sdl_audio_dev == 0) {
-        fprintf(stderr, "SDL_OpenAudioDevice failed: %s\n", SDL_GetError());
-        assert(!"SDL_OpenAudioDevice failed");
-        exit(1);
-    }
-    SDL_PauseAudioDevice(sdl_audio_dev, 0);
-}
-
-static void mcpx_apu_realize(PCIDevice *dev, Error **errp)
-{
-    MCPXAPUState *d = MCPX_APU_DEVICE(dev);
-
-    dev->config[PCI_INTERRUPT_PIN] = 0x01;
-
-    memory_region_init_io(&d->mmio, OBJECT(dev), &mcpx_apu_mmio_ops, d,
-                          "mcpx-apu-mmio", 0x80000);
-
-    memory_region_init_io(&d->vp.mmio, OBJECT(dev), &vp_ops, d,
-                          "mcpx-apu-vp", 0x10000);
-    memory_region_add_subregion(&d->mmio, 0x20000, &d->vp.mmio);
-
-    memory_region_init_io(&d->gp.mmio, OBJECT(dev), &gp_ops, d,
-                          "mcpx-apu-gp", 0x10000);
-    memory_region_add_subregion(&d->mmio, 0x30000, &d->gp.mmio);
-
-    memory_region_init_io(&d->ep.mmio, OBJECT(dev), &ep_ops, d,
-                          "mcpx-apu-ep", 0x10000);
-    memory_region_add_subregion(&d->mmio, 0x50000, &d->ep.mmio);
-
-    pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY, &d->mmio);
-}
-
-static void mcpx_apu_exitfn(PCIDevice *dev)
-{
-    MCPXAPUState *d = MCPX_APU_DEVICE(dev);
-    d->exiting = true;
-    qemu_cond_broadcast(&d->cond);
-    qemu_thread_join(&d->apu_thread);
-    mcpx_apu_vp_finalize(d);
-}
-
-static void mcpx_apu_reset(MCPXAPUState *d)
-{
-    qemu_mutex_lock(&d->lock); // FIXME: Can fail if thread is pegged, add flag
-    memset(d->regs, 0, sizeof(d->regs));
-
-    mcpx_apu_vp_reset(d);
-
-    // FIXME: Reset DSP state
-    memset(d->gp.dsp->core.pram_opcache, 0,
-           sizeof(d->gp.dsp->core.pram_opcache));
-    memset(d->ep.dsp->core.pram_opcache, 0,
-           sizeof(d->ep.dsp->core.pram_opcache));
-    d->set_irq = false;
-    qemu_cond_signal(&d->cond);
-    qemu_mutex_unlock(&d->lock);
-}
-
-// Note: This is handled as a VM state change and not as a `pre_save` callback
-// because we want to halt the FIFO before any VM state is saved/restored to
-// avoid corruption.
-static void mcpx_apu_vm_state_change(void *opaque, bool running, RunState state)
-{
-    MCPXAPUState *d = opaque;
-
-    if (state == RUN_STATE_SAVE_VM) {
-        qemu_mutex_lock(&d->lock);
-    }
-}
-
-static int mcpx_apu_post_save(void *opaque)
-{
-    MCPXAPUState *d = opaque;
-    qemu_cond_signal(&d->cond);
-    qemu_mutex_unlock(&d->lock);
-    return 0;
-}
-
-static int mcpx_apu_pre_load(void *opaque)
-{
-    MCPXAPUState *d = opaque;
-    mcpx_apu_reset(d);
-    qemu_mutex_lock(&d->lock);
-    return 0;
-}
-
-static int mcpx_apu_post_load(void *opaque, int version_id)
-{
-    MCPXAPUState *d = opaque;
-    qemu_cond_signal(&d->cond);
-    qemu_mutex_unlock(&d->lock);
-    return 0;
-}
-
-static void mcpx_apu_reset_hold(Object *obj, ResetType type)
-{
-    MCPXAPUState *d = MCPX_APU_DEVICE(obj);
-    mcpx_apu_reset(d);
-}
-
-const VMStateDescription vmstate_vp_dsp_dma_state = {
-    .name = "mcpx-apu/dsp-state/dma",
-    .version_id = 1,
-    .minimum_version_id = 1,
-    .fields      = (VMStateField[]) {
-        VMSTATE_UINT32(configuration, DSPDMAState),
-        VMSTATE_UINT32(control, DSPDMAState),
-        VMSTATE_UINT32(start_block, DSPDMAState),
-        VMSTATE_UINT32(next_block, DSPDMAState),
-        VMSTATE_BOOL(error, DSPDMAState),
-        VMSTATE_BOOL(eol, DSPDMAState),
-        VMSTATE_END_OF_LIST()
-    }
-};
-
-const VMStateDescription vmstate_vp_dsp_core_state = {
-    .name = "mcpx-apu/dsp-state/core",
-    .version_id = 1,
-    .minimum_version_id = 1,
-    .fields      = (VMStateField[]) {
-        // FIXME: Remove unnecessary fields
-        VMSTATE_UINT16(instr_cycle, dsp_core_t),
-        VMSTATE_UINT32(pc, dsp_core_t),
-        VMSTATE_UINT32_ARRAY(registers, dsp_core_t, DSP_REG_MAX),
-        VMSTATE_UINT32_2DARRAY(stack, dsp_core_t, 2, 16),
-        VMSTATE_UINT32_ARRAY(xram, dsp_core_t, DSP_XRAM_SIZE),
-        VMSTATE_UINT32_ARRAY(yram, dsp_core_t, DSP_YRAM_SIZE),
-        VMSTATE_UINT32_ARRAY(pram, dsp_core_t, DSP_PRAM_SIZE),
-        VMSTATE_UINT32_ARRAY(mixbuffer, dsp_core_t, DSP_MIXBUFFER_SIZE),
-        VMSTATE_UINT32_ARRAY(periph, dsp_core_t, DSP_PERIPH_SIZE),
-        VMSTATE_UINT32(loop_rep, dsp_core_t),
-        VMSTATE_UINT32(pc_on_rep, dsp_core_t),
-        VMSTATE_UINT16(interrupt_state, dsp_core_t),
-        VMSTATE_UINT16(interrupt_instr_fetch, dsp_core_t),
-        VMSTATE_UINT16(interrupt_save_pc, dsp_core_t),
-        VMSTATE_UINT16(interrupt_counter, dsp_core_t),
-        VMSTATE_UINT16(interrupt_ipl_to_raise, dsp_core_t),
-        VMSTATE_UINT16(interrupt_pipeline_count, dsp_core_t),
-        VMSTATE_INT16_ARRAY(interrupt_ipl, dsp_core_t, 12),
-        VMSTATE_UINT16_ARRAY(interrupt_is_pending, dsp_core_t, 12),
-        VMSTATE_UINT32(num_inst, dsp_core_t),
-        VMSTATE_UINT32(cur_inst_len, dsp_core_t),
-        VMSTATE_UINT32(cur_inst, dsp_core_t),
-        VMSTATE_UNUSED(1),
-        VMSTATE_UINT32(disasm_memory_ptr, dsp_core_t),
-        VMSTATE_BOOL(exception_debugging, dsp_core_t),
-        VMSTATE_UINT32(disasm_prev_inst_pc, dsp_core_t),
-        VMSTATE_BOOL(disasm_is_looping, dsp_core_t),
-        VMSTATE_UINT32(disasm_cur_inst, dsp_core_t),
-        VMSTATE_UINT16(disasm_cur_inst_len, dsp_core_t),
-        VMSTATE_UINT32_ARRAY(disasm_registers_save, dsp_core_t, 64),
-// #ifdef DSP_DISASM_REG_PC
-//         VMSTATE_UINT32(pc_save, dsp_core_t),
-// #endif
-        VMSTATE_END_OF_LIST()
-    }
-};
-
-const VMStateDescription vmstate_vp_dsp_state = {
-    .name = "mcpx-apu/dsp-state",
-    .version_id = 1,
-    .minimum_version_id = 1,
-    .fields = (VMStateField[]) {
-        VMSTATE_STRUCT(core, DSPState, 1, vmstate_vp_dsp_core_state, dsp_core_t),
-        VMSTATE_STRUCT(dma, DSPState, 1, vmstate_vp_dsp_dma_state, DSPDMAState),
-        VMSTATE_INT32(save_cycles, DSPState),
-        VMSTATE_UINT32(interrupts, DSPState),
-        VMSTATE_END_OF_LIST()
-    }
-};
-
-
-const VMStateDescription vmstate_vp_ssl_data = {
-    .name = "mcpx_apu_voice_data",
-    .version_id = 1,
-    .minimum_version_id = 1,
-    .fields = (VMStateField[]) {
-        VMSTATE_UINT32_ARRAY(base, MCPXAPUVPSSLData, MCPX_HW_SSLS_PER_VOICE),
-        VMSTATE_UINT8_ARRAY(count, MCPXAPUVPSSLData, MCPX_HW_SSLS_PER_VOICE),
-        VMSTATE_INT32(ssl_index, MCPXAPUVPSSLData),
-        VMSTATE_INT32(ssl_seg, MCPXAPUVPSSLData),
-        VMSTATE_END_OF_LIST()
-    }
-};
-
-static const VMStateDescription vmstate_mcpx_apu = {
-    .name = "mcpx-apu",
-    .version_id = 1,
-    .minimum_version_id = 1,
-    .post_save = mcpx_apu_post_save,
-    .pre_load = mcpx_apu_pre_load,
-    .post_load = mcpx_apu_post_load,
-    .fields = (VMStateField[]) {
-        VMSTATE_PCI_DEVICE(parent_obj, MCPXAPUState),
-        VMSTATE_STRUCT_POINTER(gp.dsp, MCPXAPUState, vmstate_vp_dsp_state,
-                               DSPState),
-        VMSTATE_UINT32_ARRAY(gp.regs, MCPXAPUState, 0x10000),
-        VMSTATE_STRUCT_POINTER(ep.dsp, MCPXAPUState, vmstate_vp_dsp_state,
-                               DSPState),
-        VMSTATE_UINT32_ARRAY(ep.regs, MCPXAPUState, 0x10000),
-        VMSTATE_UINT32_ARRAY(regs, MCPXAPUState, 0x20000),
-        VMSTATE_UINT32(vp.inbuf_sge_handle, MCPXAPUState),
-        VMSTATE_UINT32(vp.outbuf_sge_handle, MCPXAPUState),
-        VMSTATE_STRUCT_ARRAY(vp.ssl, MCPXAPUState, MCPX_HW_MAX_VOICES, 1,
-                             vmstate_vp_ssl_data, MCPXAPUVPSSLData),
-        VMSTATE_INT32(vp.ssl_base_page, MCPXAPUState),
-        VMSTATE_UINT8_ARRAY(vp.hrtf_submix, MCPXAPUState, 4),
-        VMSTATE_UINT8(vp.hrtf_headroom, MCPXAPUState),
-        VMSTATE_UINT8_ARRAY(vp.submix_headroom, MCPXAPUState, NUM_MIXBINS),
-        VMSTATE_UINT64_ARRAY(vp.voice_locked, MCPXAPUState, 4),
-        VMSTATE_END_OF_LIST()
-    },
-};
-
-static void mcpx_apu_class_init(ObjectClass *klass, void *data)
-{
-    DeviceClass *dc = DEVICE_CLASS(klass);
-    ResettableClass *rc = RESETTABLE_CLASS(klass);
-    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
-
-    k->vendor_id = PCI_VENDOR_ID_NVIDIA;
-    k->device_id = PCI_DEVICE_ID_NVIDIA_MCPX_APU;
-    k->revision = 177;
-    k->class_id = PCI_CLASS_MULTIMEDIA_AUDIO;
-    k->realize = mcpx_apu_realize;
-    k->exit = mcpx_apu_exitfn;
-
-    rc->phases.hold = mcpx_apu_reset_hold;
-
-    dc->desc = "MCPX Audio Processing Unit";
-    dc->vmsd = &vmstate_mcpx_apu;
-}
-
-static const TypeInfo mcpx_apu_info = {
-    .name = "mcpx-apu",
-    .parent = TYPE_PCI_DEVICE,
-    .instance_size = sizeof(MCPXAPUState),
-    .class_init = mcpx_apu_class_init,
-    .interfaces =
-        (InterfaceInfo[]){
-            { INTERFACE_CONVENTIONAL_PCI_DEVICE },
-            {},
-        },
-};
-
-static void mcpx_apu_register(void)
-{
-    type_register_static(&mcpx_apu_info);
-}
-type_init(mcpx_apu_register);
-
-static void *mcpx_apu_frame_thread(void *arg)
-{
-    MCPXAPUState *d = MCPX_APU_DEVICE(arg);
-    qemu_mutex_lock(&d->lock);
-    while (!qatomic_read(&d->exiting)) {
-        int xcntmode = GET_MASK(qatomic_read(&d->regs[NV_PAPU_SECTL]),
-                                NV_PAPU_SECTL_XCNTMODE);
-        uint32_t fectl = qatomic_read(&d->regs[NV_PAPU_FECTL]);
-        if (xcntmode == NV_PAPU_SECTL_XCNTMODE_OFF ||
-            (fectl & NV_PAPU_FECTL_FEMETHMODE_TRAPPED) ||
-            (fectl & NV_PAPU_FECTL_FEMETHMODE_HALTED)) {
-            d->set_irq = true;
-        }
-
-        if (d->set_irq) {
-            qemu_mutex_unlock(&d->lock);
-            bql_lock();
-            update_irq(d);
-            bql_unlock();
-            qemu_mutex_lock(&d->lock);
-            d->set_irq = false;
-        }
-
-        xcntmode = GET_MASK(qatomic_read(&d->regs[NV_PAPU_SECTL]),
-                            NV_PAPU_SECTL_XCNTMODE);
-        fectl = qatomic_read(&d->regs[NV_PAPU_FECTL]);
-        if (xcntmode == NV_PAPU_SECTL_XCNTMODE_OFF ||
-            (fectl & NV_PAPU_FECTL_FEMETHMODE_TRAPPED) ||
-            (fectl & NV_PAPU_FECTL_FEMETHMODE_HALTED)) {
-            qemu_cond_wait(&d->cond, &d->lock);
-            continue;
-        }
-        se_frame((void *)d);
-    }
-    qemu_mutex_unlock(&d->lock);
-    return NULL;
-}
-
-void mcpx_apu_init(PCIBus *bus, int devfn, MemoryRegion *ram)
-{
-    PCIDevice *dev = pci_create_simple(bus, devfn, "mcpx-apu");
-    MCPXAPUState *d = MCPX_APU_DEVICE(dev);
-
-    g_state = d;
-
-    d->ram = ram;
-    d->ram_ptr = memory_region_get_ram_ptr(d->ram);
-
-    mcpx_apu_dsp_init(d);
-
-    d->set_irq = false;
-    d->exiting = false;
-
-    qemu_mutex_init(&d->lock);
-    qemu_cond_init(&d->cond);
-    qemu_add_vm_change_state_handler(mcpx_apu_vm_state_change, d);
-
-    mcpx_apu_vp_init(d);
-    qemu_thread_create(&d->apu_thread, "mcpx.apu_thread", mcpx_apu_frame_thread,
-                       d, QEMU_THREAD_JOINABLE);
-
-    monitor_init(d);
-}
--- a/hw/xbox/mcpx/apu/apu_int.h
+++ b/hw/xbox/mcpx/apu/apu_int.h
@ -1,117 +0,0 @@
-/*
- * QEMU MCPX Audio Processing Unit implementation
- *
- * Copyright (c) 2012 espes
- * Copyright (c) 2018-2019 Jannik Vogel
- * Copyright (c) 2019-2025 Matt Borgerson
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef HW_XBOX_MCPX_APU_INT_H
-#define HW_XBOX_MCPX_APU_INT_H
-
-#include "qemu/osdep.h"
-#include <math.h>
-#include <SDL.h>
-#include "hw/hw.h"
-#include "hw/pci/pci.h"
-#include "hw/pci/pci_device.h"
-#include "cpu.h"
-#include "migration/vmstate.h"
-#include "qemu/main-loop.h"
-#include "qemu/thread.h"
-#include "sysemu/runstate.h"
-#include "audio/audio.h"
-#include "qemu/fifo8.h"
-#include "ui/xemu-settings.h"
-
-#include "trace.h"
-#include "apu.h"
-#include "apu_regs.h"
-#include "apu_debug.h"
-#include "fpconv.h"
-#include "vp/vp.h"
-#include "dsp/gp_ep.h"
-
-/* Extract the field selected by `mask` from `v`, shifted down to bit 0. */
-#define GET_MASK(v, mask) (((v) & (mask)) >> ctz32(mask))
-
-/*
- * Replace the field selected by `mask` in the lvalue `v` with `val`.
- * Bits of `val` that do not fit inside the field are discarded.
- * NOTE(review): both macros shift by ctz32(mask); a zero mask is presumably
- * never passed - confirm.
- */
-#define SET_MASK(v, mask, val)                                       \
-    do {                                                             \
-        (v) &= ~(mask);                                              \
-        (v) |= ((val) << ctz32(mask)) & (mask);                      \
-    } while (0)
-
-/*
- * Expands to four case labels: v, v+step, v+2*step and v+3*step.
- * The final label has no colon; the user writes it after the macro.
- */
-#define CASE_4(v, step)                                              \
-    case (v):                                                        \
-    case (v)+(step):                                                 \
-    case (v)+(step)*2:                                               \
-    case (v)+(step)*3
-
-/* Uncomment to make DPRINTF print to stderr. */
-// #define DEBUG_MCPX
-
-/*
- * Debug trace helper: prints to stderr when DEBUG_MCPX is defined, otherwise
- * expands to an empty statement (its arguments are then not evaluated).
- */
-#ifdef DEBUG_MCPX
-#define DPRINTF(fmt, ...) \
-    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
-#else
-#define DPRINTF(fmt, ...) \
-    do { } while (0)
-#endif
-
-/* Cast `obj` to MCPXAPUState through OBJECT_CHECK for type "mcpx-apu". */
-#define MCPX_APU_DEVICE(obj) \
-    OBJECT_CHECK(MCPXAPUState, (obj), "mcpx-apu")
-
-/* Complete state of one emulated MCPX APU PCI device. */
-typedef struct MCPXAPUState {
-    /*< private >*/
-    PCIDevice parent_obj;
-    /*< public >*/
-
-    bool exiting;  /* polled with qatomic_read() by the frame thread loop */
-    bool set_irq;  /* asks the frame thread to call update_irq() */
-
-    QemuThread apu_thread;  /* created in mcpx_apu_init() */
-    QemuMutex lock;         /* taken by the MMIO write handlers */
-    QemuCond cond;          /* waited on together with `lock` */
-
-    MemoryRegion *ram;  /* guest RAM region handed to mcpx_apu_init() */
-    uint8_t *ram_ptr;   /* host pointer from memory_region_get_ram_ptr(ram) */
-    MemoryRegion mmio;
-
-    MCPXAPUVPState vp;
-    MCPXAPUGPState gp;
-    MCPXAPUEPState ep;
-
-    uint32_t regs[0x20000];
-
-    int ep_frame_div;  /* the EP DSP runs when this is a multiple of 8 */
-    int sleep_acc;
-    int frame_count;
-    int64_t frame_count_time;
-
-    struct {
-        McpxApuDebugMonitorPoint point;  /* which stage feeds frame_buf */
-        int16_t frame_buf[256][2]; // 1 EP frame (0x400 bytes), 8 buffered
-        QemuSpin fifo_lock;
-        Fifo8 fifo;
-    } monitor;
-} MCPXAPUState;
-
-/* Globals shared between the APU core and its debug interface. */
-extern MCPXAPUState *g_state; // Used via debug handlers
-/* Working debug record and the snapshot copied from it at end of frame. */
-extern struct McpxApuDebug g_dbg, g_dbg_cache;
-/* Isolated voice index, or -1 when no voice is isolated. */
-extern int g_dbg_voice_monitor;
-/* Mute bitmap, one bit per voice, 64 voices per word. */
-extern uint64_t g_dbg_muted_voices[4];
-
-/* Bracket one frame of debug bookkeeping. */
-void mcpx_debug_begin_frame(void);
-void mcpx_debug_end_frame(void);
-
-#endif
--- a/hw/xbox/mcpx/apu/debug.c
+++ b/hw/xbox/mcpx/apu/debug.c
@ -1,86 +0,0 @@
-/*
- * QEMU MCPX Audio Processing Unit implementation
- *
- * Copyright (c) 2012 espes
- * Copyright (c) 2018-2019 Jannik Vogel
- * Copyright (c) 2019-2025 Matt Borgerson
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "apu_int.h"
-
-/* g_dbg is filled during a frame; g_dbg_cache is the copy handed to readers. */
-struct McpxApuDebug g_dbg, g_dbg_cache;
-/* Voice selected by mcpx_apu_debug_isolate_voice(); -1 means none. */
-int g_dbg_voice_monitor = -1;
-/* Mute bitmap: bit (v % 64) of word (v / 64) is set while voice v is muted. */
-uint64_t g_dbg_muted_voices[4];
-
-/* Return a pointer to the debug snapshot taken at the end of the last frame. */
-const struct McpxApuDebug *mcpx_apu_get_debug_info(void)
-{
-    const struct McpxApuDebug *snapshot = &g_dbg_cache;
-
-    return snapshot;
-}
-
-/*
- * Reset the per-voice debug record before a frame is processed: every voice
- * is marked inactive and its multipass destination is set to 0xFFFF.
- */
-void mcpx_debug_begin_frame(void)
-{
-    int voice = 0;
-
-    while (voice < MCPX_HW_MAX_VOICES) {
-        g_dbg.vp.v[voice].active = false;
-        g_dbg.vp.v[voice].multipass_dst_voice = 0xFFFF;
-        voice++;
-    }
-}
-
-/* Publish the working debug record by copying it into the snapshot. */
-void mcpx_debug_end_frame(void)
-{
-    g_dbg_cache = g_dbg;
-}
-
-/* Set the GP `realtime` flag on the global APU instance. */
-void mcpx_apu_debug_set_gp_realtime_enabled(bool run)
-{
-    MCPXAPUState *apu = g_state;
-
-    apu->gp.realtime = run;
-}
-
-/* Set the EP `realtime` flag on the global APU instance. */
-void mcpx_apu_debug_set_ep_realtime_enabled(bool run)
-{
-    MCPXAPUState *apu = g_state;
-
-    apu->ep.realtime = run;
-}
-
-/* Return the monitor point currently selected on the global APU instance. */
-McpxApuDebugMonitorPoint mcpx_apu_debug_get_monitor(void)
-{
-    const MCPXAPUState *apu = g_state;
-
-    return apu->monitor.point;
-}
-
-/* Select the monitor point on the global APU instance. */
-void mcpx_apu_debug_set_monitor(McpxApuDebugMonitorPoint monitor)
-{
-    MCPXAPUState *apu = g_state;
-
-    apu->monitor.point = monitor;
-}
-
-/* Record voice `v` as the isolated voice in g_dbg_voice_monitor. */
-void mcpx_apu_debug_isolate_voice(uint16_t v)
-{
-    g_dbg_voice_monitor = v;
-}
-
-/* Clear any voice isolation by resetting g_dbg_voice_monitor to -1. */
-void mcpx_apu_debug_clear_isolations(void)
-{
-    g_dbg_voice_monitor = -1;
-}
-
-/*
- * Report whether voice `v` is muted through the debug interface.
- *
- * Mute state is a bitmap packed 64 voices per word. The mask is built from
- * an unsigned 64-bit one so that bit 63 can be tested without shifting into
- * the sign bit of a signed type, which is undefined behaviour.
- *
- * Asserts that `v` is below MCPX_HW_MAX_VOICES.
- */
-bool mcpx_apu_debug_is_muted(uint16_t v)
-{
-    assert(v < MCPX_HW_MAX_VOICES);
-    return (g_dbg_muted_voices[v / 64] & (1ULL << (v % 64))) != 0;
-}
-
-/*
- * Flip the mute bit of voice `v` in the debug mute bitmap.
- *
- * The mask uses an unsigned 64-bit one so that toggling bit 63 does not
- * shift into the sign bit of a signed type, which is undefined behaviour.
- *
- * Asserts that `v` is below MCPX_HW_MAX_VOICES.
- */
-void mcpx_apu_debug_toggle_mute(uint16_t v)
-{
-    assert(v < MCPX_HW_MAX_VOICES);
-    g_dbg_muted_voices[v / 64] ^= (1ULL << (v % 64));
-}
--- a/hw/xbox/mcpx/apu/dsp/gp_ep.c
+++ b/hw/xbox/mcpx/apu/dsp/gp_ep.c
@ -1,526 +0,0 @@
-/*
- * QEMU MCPX Audio Processing Unit implementation
- *
- * Copyright (c) 2012 espes
- * Copyright (c) 2018-2019 Jannik Vogel
- * Copyright (c) 2019-2025 Matt Borgerson
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "hw/xbox/mcpx/apu/apu_int.h"
-
-/* All-zero stereo buffer written to EP output FIFO 0 when samples are sunk. */
-static const int16_t ep_silence[256][2] = { 0 };
-
-/*
- * Apply the `audio.use_dsp` setting to the device if it changed since the
- * previous call.
- *
- * With the DSP enabled the monitor taps GP/EP output and both DSPs get the
- * realtime flag; otherwise the monitor taps the VP and the flags are cleared.
- * The last applied value is kept in a function-local static, so the first
- * call always applies the setting.
- */
-void mcpx_apu_update_dsp_preference(MCPXAPUState *d)
-{
-    static int last_known_preference = -1;
-
-    if ((int)g_config.audio.use_dsp == last_known_preference) {
-        return;
-    }
-
-    const bool full_pipeline = g_config.audio.use_dsp;
-
-    d->monitor.point = full_pipeline ? MCPX_APU_DEBUG_MON_GP_OR_EP
-                                     : MCPX_APU_DEBUG_MON_VP;
-    d->gp.realtime = full_pipeline;
-    d->ep.realtime = full_pipeline;
-
-    last_known_preference = g_config.audio.use_dsp;
-}
-
-/*
- * Copy `len` bytes between `ptr` and guest RAM through a scatter-gather
- * table.
- *
- * d:        device whose RAM is accessed
- * sge_base: guest physical address of the table; entries are 8 bytes apart
- *           and the first 32-bit word of each entry is the page address
- * max_sge:  highest table index that may be used
- * ptr:      host buffer
- * addr:     byte offset into the region described by the table
- * len:      number of bytes to transfer
- * dir:      true writes `ptr` to guest RAM (and marks it dirty), false
- *           reads guest RAM into `ptr`
- *
- * Asserts if the transfer needs an entry beyond `max_sge` or would run past
- * the end of RAM.
- */
-static void scatter_gather_rw(MCPXAPUState *d, hwaddr sge_base,
-                              unsigned int max_sge, uint8_t *ptr, uint32_t addr,
-                              size_t len, bool dir)
-{
-    unsigned int page_entry = addr / TARGET_PAGE_SIZE;
-    unsigned int offset_in_page = addr % TARGET_PAGE_SIZE;
-    unsigned int bytes_to_copy = TARGET_PAGE_SIZE - offset_in_page;
-
-    while (len > 0) {
-        assert(page_entry <= max_sge);
-
-        uint32_t prd_address = ldl_le_phys(&address_space_memory,
-                                           sge_base + page_entry * 8 + 0);
-        // uint32_t prd_control = ldl_le_phys(&address_space_memory,
-        //                                     sge_base + page_entry * 8 + 4);
-
-        hwaddr paddr = prd_address + offset_in_page;
-
-        if (bytes_to_copy > len) {
-            bytes_to_copy = len;
-        }
-
-        /*
-         * The last byte touched is paddr + bytes_to_copy - 1, so a copy that
-         * ends exactly at the end of RAM is still in bounds.
-         */
-        assert(paddr + bytes_to_copy <= memory_region_size(d->ram));
-
-        if (dir) {
-            memcpy(&d->ram_ptr[paddr], ptr, bytes_to_copy);
-            memory_region_set_dirty(d->ram, paddr, bytes_to_copy);
-        } else {
-            memcpy(ptr, &d->ram_ptr[paddr], bytes_to_copy);
-        }
-
-        ptr += bytes_to_copy;
-        len -= bytes_to_copy;
-
-        /* After the first iteration, we are page aligned */
-        page_entry += 1;
-        bytes_to_copy = TARGET_PAGE_SIZE;
-        offset_in_page = 0;
-    }
-}
-
-/*
- * GP scratch-memory callback: transfer `len` bytes at scratch offset `addr`
- * through the table described by NV_PAPU_GPSADDR / NV_PAPU_GPSMAXSGE.
- * `dir` true writes `ptr` to guest memory, false reads into `ptr`.
- */
-static void gp_scratch_rw(void *opaque, uint8_t *ptr, uint32_t addr, size_t len,
-                          bool dir)
-{
-    MCPXAPUState *apu = opaque;
-    const hwaddr table_base = apu->regs[NV_PAPU_GPSADDR];
-    const unsigned int last_entry = apu->regs[NV_PAPU_GPSMAXSGE];
-
-    scatter_gather_rw(apu, table_base, last_entry, ptr, addr, len, dir);
-}
-
-/*
- * EP scratch-memory callback: transfer `len` bytes at scratch offset `addr`
- * through the table described by NV_PAPU_EPSADDR / NV_PAPU_EPSMAXSGE.
- * `dir` true writes `ptr` to guest memory, false reads into `ptr`.
- */
-static void ep_scratch_rw(void *opaque, uint8_t *ptr, uint32_t addr, size_t len,
-                          bool dir)
-{
-    MCPXAPUState *apu = opaque;
-    const hwaddr table_base = apu->regs[NV_PAPU_EPSADDR];
-    const unsigned int last_entry = apu->regs[NV_PAPU_EPSMAXSGE];
-
-    scatter_gather_rw(apu, table_base, last_entry, ptr, addr, len, dir);
-}
-
-/*
- * Transfer `len` bytes through a scatter-gather table, treating the byte
- * range [base, end) as a ring: when the cursor reaches `end` it wraps back
- * to `base`.
- *
- * cur: starting cursor, must lie within [base, end]
- * dir: true writes `ptr` to guest memory, false reads into `ptr`
- *
- * Returns the cursor position after the transfer. Asserts if the cursor is
- * outside the ring, or if the ring is empty while data remains (which could
- * otherwise never make progress).
- */
-static uint32_t circular_scatter_gather_rw(MCPXAPUState *d, hwaddr sge_base,
-                                           unsigned int max_sge, uint8_t *ptr,
-                                           uint32_t base, uint32_t end,
-                                           uint32_t cur, size_t len, bool dir)
-{
-    while (len > 0) {
-        unsigned int bytes_to_copy = end - cur;
-
-        /* With base == end every pass would copy nothing and loop forever. */
-        assert(base != end);
-
-        if (bytes_to_copy > len) {
-            bytes_to_copy = len;
-        }
-
-        DPRINTF("circular scatter gather %s in range 0x%x - 0x%x at 0x%x of "
-                "length 0x%x / 0x%zx bytes\n",
-                dir ? "write" : "read", base, end, cur, bytes_to_copy, len);
-
-        assert((cur >= base) && ((cur + bytes_to_copy) <= end));
-        scatter_gather_rw(d, sge_base, max_sge, ptr, cur, bytes_to_copy, dir);
-
-        ptr += bytes_to_copy;
-        len -= bytes_to_copy;
-
-        /* After the first iteration we might have to wrap */
-        cur += bytes_to_copy;
-        if (cur >= end) {
-            assert(cur == end);
-            cur = base;
-        }
-    }
-
-    return cur;
-}
-
-/*
- * GP FIFO callback: move `len` bytes between `ptr` and GP FIFO `index`.
- *
- * `dir` true selects output FIFO `index` (write to guest memory), false
- * selects input FIFO `index` (read from guest memory). The FIFO's current
- * pointer register is updated with the new cursor afterwards.
- *
- * Asserts that `index` is valid for the direction and that the cursor is
- * below the FIFO end.
- */
-static void gp_fifo_rw(void *opaque, uint8_t *ptr, unsigned int index,
-                       size_t len, bool dir)
-{
-    MCPXAPUState *d = opaque;
-    hwaddr base_reg;
-    hwaddr end_reg;
-    hwaddr cur_reg;
-
-    /* Each FIFO's register set is 0x10 bytes after the previous one. */
-    if (dir) {
-        assert(index < GP_OUTPUT_FIFO_COUNT);
-        base_reg = NV_PAPU_GPOFBASE0 + 0x10 * index;
-        end_reg = NV_PAPU_GPOFEND0 + 0x10 * index;
-        cur_reg = NV_PAPU_GPOFCUR0 + 0x10 * index;
-    } else {
-        assert(index < GP_INPUT_FIFO_COUNT);
-        base_reg = NV_PAPU_GPIFBASE0 + 0x10 * index;
-        end_reg = NV_PAPU_GPIFEND0 + 0x10 * index;
-        cur_reg = NV_PAPU_GPIFCUR0 + 0x10 * index;
-    }
-
-    /* The output-FIFO field masks are applied to both directions. */
-    uint32_t base = GET_MASK(d->regs[base_reg], NV_PAPU_GPOFBASE0_VALUE);
-    uint32_t end = GET_MASK(d->regs[end_reg], NV_PAPU_GPOFEND0_VALUE);
-    uint32_t cur = GET_MASK(d->regs[cur_reg], NV_PAPU_GPOFCUR0_VALUE);
-
-    /* DSP hangs if current >= end; but forces current >= base */
-    assert(cur < end);
-    cur = (cur < base) ? base : cur;
-
-    cur = circular_scatter_gather_rw(d, d->regs[NV_PAPU_GPFADDR],
-                                     d->regs[NV_PAPU_GPFMAXSGE], ptr, base,
-                                     end, cur, len, dir);
-
-    SET_MASK(d->regs[cur_reg], NV_PAPU_GPOFCUR0_VALUE, cur);
-}
-
-/*
- * Decide whether EP output is consumed by the monitor instead of being
- * passed on.
- *
- * Returns false when the monitor point is AC97. For every other monitor
- * point it returns true; when the point is EP or GP_OR_EP the samples are
- * first copied into the monitor frame buffer (`len` must equal its size).
- */
-static bool ep_sink_samples(MCPXAPUState *d, uint8_t *ptr, size_t len)
-{
-    switch (d->monitor.point) {
-    case MCPX_APU_DEBUG_MON_AC97:
-        return false;
-    case MCPX_APU_DEBUG_MON_EP:
-    case MCPX_APU_DEBUG_MON_GP_OR_EP:
-        assert(len == sizeof(d->monitor.frame_buf));
-        memcpy(d->monitor.frame_buf, ptr, len);
-        return true;
-    default:
-        return true;
-    }
-}
-
-/*
- * EP FIFO callback: move `len` bytes between `ptr` and EP FIFO `index`.
- *
- * `dir` true selects output FIFO `index` (write to guest memory), false
- * selects input FIFO `index` (read from guest memory). Writes to output
- * FIFO 0 are first offered to ep_sink_samples(); if it accepts them,
- * silence is written to guest memory instead. The FIFO's current pointer
- * register is updated with the new cursor afterwards.
- */
-static void ep_fifo_rw(void *opaque, uint8_t *ptr, unsigned int index,
-                       size_t len, bool dir)
-{
-    MCPXAPUState *d = opaque;
-    uint32_t base;
-    uint32_t end;
-    hwaddr cur_reg;
-    /* Each FIFO's register set is 0x10 bytes after the previous one. */
-    if (dir) {
-        assert(index < EP_OUTPUT_FIFO_COUNT);
-        base = GET_MASK(d->regs[NV_PAPU_EPOFBASE0 + 0x10 * index],
-                        NV_PAPU_GPOFBASE0_VALUE);
-        end = GET_MASK(d->regs[NV_PAPU_EPOFEND0 + 0x10 * index],
-                       NV_PAPU_GPOFEND0_VALUE);
-        cur_reg = NV_PAPU_EPOFCUR0 + 0x10 * index;
-    } else {
-        assert(index < EP_INPUT_FIFO_COUNT);
-        base = GET_MASK(d->regs[NV_PAPU_EPIFBASE0 + 0x10 * index],
-                        NV_PAPU_GPOFBASE0_VALUE);
-        end = GET_MASK(d->regs[NV_PAPU_EPIFEND0 + 0x10 * index],
-                       NV_PAPU_GPOFEND0_VALUE);
-        cur_reg = NV_PAPU_EPIFCUR0 + 0x10 * index;
-    }
-
-    /* The GP output-FIFO field masks are reused for the EP registers. */
-    uint32_t cur = GET_MASK(d->regs[cur_reg], NV_PAPU_GPOFCUR0_VALUE);
-
-    // fprintf(stderr, "EP %s fifo #%d, base = %x, end = %x, cur = %x, len = %x\n",
-    //     dir ? "writing to" : "reading from", index,
-    //     base, end, cur, len);
-
-    if (dir && index == 0) {
-        bool did_sink = ep_sink_samples(d, ptr, len);
-        if (did_sink) {
-            /* Since we are sinking, push silence out */
-            assert(len <= sizeof(ep_silence));
-            ptr = (uint8_t*)ep_silence;
-        }
-    }
-
-    /* DSP hangs if current >= end; but forces current >= base */
-    /*
-     * NOTE(review): unlike the GP path, an out-of-range cursor is wrapped
-     * instead of asserted. The modulo divides by zero when end == base, and
-     * its result is an offset rather than an address, so the clamp below
-     * usually resets the cursor to base - confirm the intended behaviour.
-     */
-    if (cur >= end) {
-        cur = cur % (end - base);
-    }
-    if (cur < base) {
-        cur = base;
-    }
-
-    cur = circular_scatter_gather_rw(d,
-        d->regs[NV_PAPU_EPFADDR], d->regs[NV_PAPU_EPFMAXSGE],
-        ptr, base, end, cur, len, dir);
-
-    SET_MASK(d->regs[cur_reg], NV_PAPU_GPOFCUR0_VALUE, cur);
-}
-
-/*
- * React to a write to a DSP reset register.
- *
- * The DSP counts as released only when both the GPRST and GPDSPRST bits are
- * set. If the new value does not release it, the DSP is reset; if the new
- * value releases it and the old value did not, the DSP is bootstrapped.
- */
-static void proc_rst_write(DSPState *dsp, uint32_t oldval, uint32_t val)
-{
-    const bool released_now =
-        (val & NV_PAPU_GPRST_GPRST) && (val & NV_PAPU_GPRST_GPDSPRST);
-    const bool released_before =
-        (oldval & NV_PAPU_GPRST_GPRST) && (oldval & NV_PAPU_GPRST_GPDSPRST);
-
-    if (!released_now) {
-        dsp_reset(dsp);
-        return;
-    }
-
-    if (!released_before) {
-        dsp_bootstrap(dsp);
-    }
-}
-
-/* Global Processor - programmable DSP */
-
-/*
- * MMIO read handler for the GP register window.
- *
- * Accesses must be 4 bytes wide and 4-byte aligned (asserted). Offsets
- * inside the XMEM, MIXBUF, YMEM and PMEM windows read the matching DSP
- * memory word; any other offset returns the stored register value, indexed
- * by the byte offset itself.
- */
-static uint64_t gp_read(void *opaque, hwaddr addr, unsigned int size)
-{
-    MCPXAPUState *d = opaque;
-
-    assert(size == 4);
-    assert(addr % 4 == 0);
-
-    uint64_t r = 0;
-    switch (addr) {
-    case NV_PAPU_GPXMEM ... NV_PAPU_GPXMEM + 0x1000 * 4 - 1: {
-        uint32_t xaddr = (addr - NV_PAPU_GPXMEM) / 4;
-        r = dsp_read_memory(d->gp.dsp, 'X', xaddr);
-        break;
-    }
-    case NV_PAPU_GPMIXBUF ... NV_PAPU_GPMIXBUF + 0x400 * 4 - 1: {
-        /* The mix buffer lives in X memory at GP_DSP_MIXBUF_BASE. */
-        uint32_t xaddr = (addr - NV_PAPU_GPMIXBUF) / 4;
-        r = dsp_read_memory(d->gp.dsp, 'X', GP_DSP_MIXBUF_BASE + xaddr);
-        break;
-    }
-    case NV_PAPU_GPYMEM ... NV_PAPU_GPYMEM + 0x800 * 4 - 1: {
-        uint32_t yaddr = (addr - NV_PAPU_GPYMEM) / 4;
-        r = dsp_read_memory(d->gp.dsp, 'Y', yaddr);
-        break;
-    }
-    case NV_PAPU_GPPMEM ... NV_PAPU_GPPMEM + 0x1000 * 4 - 1: {
-        uint32_t paddr = (addr - NV_PAPU_GPPMEM) / 4;
-        r = dsp_read_memory(d->gp.dsp, 'P', paddr);
-        break;
-    }
-    default:
-        r = d->gp.regs[addr];
-        break;
-    }
-    /* PRIx64 matches uint64_t even where long is only 32 bits wide. */
-    DPRINTF("mcpx apu GP: read [0x%" HWADDR_PRIx "] -> 0x%" PRIx64 "\n",
-            addr, r);
-
-    return r;
-}
-
-/*
- * MMIO write handler for the GP register window; runs with d->lock held.
- *
- * Accesses must be 4 bytes wide and 4-byte aligned (asserted). Offsets
- * inside the XMEM, MIXBUF, YMEM and PMEM windows write the matching DSP
- * memory word. A write to NV_PAPU_GPRST is passed to proc_rst_write() with
- * the previous value before being stored. Any other offset is stored as a
- * register value, indexed by the byte offset itself.
- */
-static void gp_write(void *opaque, hwaddr addr, uint64_t val, unsigned int size)
-{
-    MCPXAPUState *d = opaque;
-
-    qemu_mutex_lock(&d->lock);
-
-    assert(size == 4);
-    assert(addr % 4 == 0);
-
-    /* PRIx64 matches uint64_t even where long is only 32 bits wide. */
-    DPRINTF("mcpx apu GP: [0x%" HWADDR_PRIx "] = 0x%" PRIx64 "\n", addr, val);
-
-    switch (addr) {
-    case NV_PAPU_GPXMEM ... NV_PAPU_GPXMEM + 0x1000 * 4 - 1: {
-        uint32_t xaddr = (addr - NV_PAPU_GPXMEM) / 4;
-        dsp_write_memory(d->gp.dsp, 'X', xaddr, val);
-        break;
-    }
-    case NV_PAPU_GPMIXBUF ... NV_PAPU_GPMIXBUF + 0x400 * 4 - 1: {
-        /* The mix buffer lives in X memory at GP_DSP_MIXBUF_BASE. */
-        uint32_t xaddr = (addr - NV_PAPU_GPMIXBUF) / 4;
-        dsp_write_memory(d->gp.dsp, 'X', GP_DSP_MIXBUF_BASE + xaddr, val);
-        break;
-    }
-    case NV_PAPU_GPYMEM ... NV_PAPU_GPYMEM + 0x800 * 4 - 1: {
-        uint32_t yaddr = (addr - NV_PAPU_GPYMEM) / 4;
-        dsp_write_memory(d->gp.dsp, 'Y', yaddr, val);
-        break;
-    }
-    case NV_PAPU_GPPMEM ... NV_PAPU_GPPMEM + 0x1000 * 4 - 1: {
-        uint32_t paddr = (addr - NV_PAPU_GPPMEM) / 4;
-        dsp_write_memory(d->gp.dsp, 'P', paddr, val);
-        break;
-    }
-    case NV_PAPU_GPRST:
-        /* The old value is needed to detect the release-from-reset edge. */
-        proc_rst_write(d->gp.dsp, d->gp.regs[NV_PAPU_GPRST], val);
-        d->gp.regs[NV_PAPU_GPRST] = val;
-        break;
-    default:
-        d->gp.regs[addr] = val;
-        break;
-    }
-
-    qemu_mutex_unlock(&d->lock);
-}
-
-/* MMIO callbacks for the GP register window. */
-const MemoryRegionOps gp_ops = {
-    .read = gp_read,
-    .write = gp_write,
-};
-
-/* Encode Processor - encoding DSP */
-
-/*
- * MMIO read handler for the EP register window.
- *
- * Accesses must be 4 bytes wide and 4-byte aligned (asserted). Offsets
- * inside the XMEM, YMEM and PMEM windows read the matching DSP memory word;
- * any other offset returns the stored register value, indexed by the byte
- * offset itself.
- */
-static uint64_t ep_read(void *opaque, hwaddr addr, unsigned int size)
-{
-    MCPXAPUState *d = opaque;
-
-    assert(size == 4);
-    assert(addr % 4 == 0);
-
-    uint64_t r = 0;
-    switch (addr) {
-    case NV_PAPU_EPXMEM ... NV_PAPU_EPXMEM + 0xC00 * 4 - 1: {
-        uint32_t xaddr = (addr - NV_PAPU_EPXMEM) / 4;
-        r = dsp_read_memory(d->ep.dsp, 'X', xaddr);
-        break;
-    }
-    case NV_PAPU_EPYMEM ... NV_PAPU_EPYMEM + 0x100 * 4 - 1: {
-        uint32_t yaddr = (addr - NV_PAPU_EPYMEM) / 4;
-        r = dsp_read_memory(d->ep.dsp, 'Y', yaddr);
-        break;
-    }
-    case NV_PAPU_EPPMEM ... NV_PAPU_EPPMEM + 0x1000 * 4 - 1: {
-        uint32_t paddr = (addr - NV_PAPU_EPPMEM) / 4;
-        r = dsp_read_memory(d->ep.dsp, 'P', paddr);
-        break;
-    }
-    default:
-        r = d->ep.regs[addr];
-        break;
-    }
-    /* PRIx64 matches uint64_t even where long is only 32 bits wide. */
-    DPRINTF("mcpx apu EP: read [0x%" HWADDR_PRIx "] -> 0x%" PRIx64 "\n",
-            addr, r);
-
-    return r;
-}
-
-/*
- * MMIO write handler for the EP register window; runs with d->lock held.
- *
- * Accesses must be 4 bytes wide and 4-byte aligned (asserted). Offsets
- * inside the XMEM, YMEM and PMEM windows write the matching DSP memory
- * word. A write to NV_PAPU_EPRST is passed to proc_rst_write() with the
- * previous value, stored, and also resets the EP frame divider. Any other
- * offset is stored as a register value, indexed by the byte offset itself.
- */
-static void ep_write(void *opaque, hwaddr addr, uint64_t val, unsigned int size)
-{
-    MCPXAPUState *d = opaque;
-
-    qemu_mutex_lock(&d->lock);
-
-    assert(size == 4);
-    assert(addr % 4 == 0);
-
-    /* PRIx64 matches uint64_t even where long is only 32 bits wide. */
-    DPRINTF("mcpx apu EP: [0x%" HWADDR_PRIx "] = 0x%" PRIx64 "\n", addr, val);
-
-    switch (addr) {
-    case NV_PAPU_EPXMEM ... NV_PAPU_EPXMEM + 0xC00 * 4 - 1: {
-        uint32_t xaddr = (addr - NV_PAPU_EPXMEM) / 4;
-        dsp_write_memory(d->ep.dsp, 'X', xaddr, val);
-        break;
-    }
-    case NV_PAPU_EPYMEM ... NV_PAPU_EPYMEM + 0x100 * 4 - 1: {
-        uint32_t yaddr = (addr - NV_PAPU_EPYMEM) / 4;
-        dsp_write_memory(d->ep.dsp, 'Y', yaddr, val);
-        break;
-    }
-    case NV_PAPU_EPPMEM ... NV_PAPU_EPPMEM + 0x1000 * 4 - 1: {
-        uint32_t paddr = (addr - NV_PAPU_EPPMEM) / 4;
-        dsp_write_memory(d->ep.dsp, 'P', paddr, val);
-        break;
-    }
-    case NV_PAPU_EPRST:
-        /* The old value is needed to detect the release-from-reset edge. */
-        proc_rst_write(d->ep.dsp, d->ep.regs[NV_PAPU_EPRST], val);
-        d->ep.regs[NV_PAPU_EPRST] = val;
-        d->ep_frame_div = 0; /* FIXME: Still unsure about frame sync */
-        break;
-    default:
-        d->ep.regs[addr] = val;
-        break;
-    }
-
-    qemu_mutex_unlock(&d->lock);
-}
-
-/* MMIO callbacks for the EP register window. */
-const MemoryRegionOps ep_ops = {
-    .read = ep_read,
-    .write = ep_write,
-};
-
-/*
- * Run one frame of DSP processing.
- *
- * The VP mixbins are converted to 24-bit values and written into the GP mix
- * buffer, then the GP is run if it is out of reset. The EP is run if it is
- * out of reset and ep_frame_div is a multiple of 8. Depending on the monitor
- * point, GP output is also copied into the monitor frame buffer.
- */
-void mcpx_apu_dsp_frame(MCPXAPUState *d, float mixbins[NUM_MIXBINS][NUM_SAMPLES_PER_FRAME])
-{
-    /* Write VP results to the GP DSP MIXBUF */
-    for (int mixbin = 0; mixbin < NUM_MIXBINS; mixbin++) {
-        uint32_t base = GP_DSP_MIXBUF_BASE + mixbin * NUM_SAMPLES_PER_FRAME;
-        for (int sample = 0; sample < NUM_SAMPLES_PER_FRAME; sample++) {
-            dsp_write_memory(d->gp.dsp, 'X', base + sample,
-                             float_to_24b(mixbins[mixbin][sample]));
-        }
-    }
-
-    /* Out of reset means both reset bits are set. */
-    bool ep_enabled = (d->ep.regs[NV_PAPU_EPRST] & NV_PAPU_GPRST_GPRST) &&
-                      (d->ep.regs[NV_PAPU_EPRST] & NV_PAPU_GPRST_GPDSPRST);
-
-    /* Run GP */
-    if ((d->gp.regs[NV_PAPU_GPRST] & NV_PAPU_GPRST_GPRST) &&
-        (d->gp.regs[NV_PAPU_GPRST] & NV_PAPU_GPRST_GPDSPRST)) {
-        dsp_start_frame(d->gp.dsp);
-        d->gp.dsp->core.is_idle = false;
-        d->gp.dsp->core.cycle_count = 0;
-        /* Without the realtime flag only a single dsp_run() slice executes. */
-        do {
-            dsp_run(d->gp.dsp, 1000);
-        } while (!d->gp.dsp->core.is_idle && d->gp.realtime);
-        g_dbg.gp.cycles = d->gp.dsp->core.cycle_count;
-
-        /* GP output is monitored directly, or as a fallback when EP is off. */
-        if ((d->monitor.point == MCPX_APU_DEBUG_MON_GP) ||
-            (d->monitor.point == MCPX_APU_DEBUG_MON_GP_OR_EP && !ep_enabled)) {
-            /* Each of the 8 GP frames per EP frame fills its own slice. */
-            int off = (d->ep_frame_div % 8) * NUM_SAMPLES_PER_FRAME;
-            for (int i = 0; i < NUM_SAMPLES_PER_FRAME; i++) {
-                /* NOTE(review): 0x1400 is presumably the GP mix buffer base
-                 * in X memory, with channels 0x20 words apart - confirm. */
-                uint32_t l = dsp_read_memory(d->gp.dsp, 'X', 0x1400 + i);
-                d->monitor.frame_buf[off + i][0] = l >> 8;
-                uint32_t r =
-                    dsp_read_memory(d->gp.dsp, 'X', 0x1400 + 1 * 0x20 + i);
-                d->monitor.frame_buf[off + i][1] = r >> 8;
-            }
-        }
-    }
-
-    /* Run EP */
-    if ((d->ep.regs[NV_PAPU_EPRST] & NV_PAPU_GPRST_GPRST) &&
-        (d->ep.regs[NV_PAPU_EPRST] & NV_PAPU_GPRST_GPDSPRST)) {
-        if (d->ep_frame_div % 8 == 0) {
-            dsp_start_frame(d->ep.dsp);
-            d->ep.dsp->core.is_idle = false;
-            d->ep.dsp->core.cycle_count = 0;
-            /* Without the realtime flag only a single dsp_run() slice executes. */
-            do {
-                dsp_run(d->ep.dsp, 1000);
-            } while (!d->ep.dsp->core.is_idle && d->ep.realtime);
-            g_dbg.ep.cycles = d->ep.dsp->core.cycle_count;
-        }
-    }
-}
-
-/*
- * Common setup for a freshly created DSP: fill program RAM with the
- * 0xCACACACA pattern, zero pram_opcache, record whether this is the GP and
- * reset the idle flag and cycle counter.
- */
-static void apu_dsp_prepare(DSPState *dsp, bool is_gp)
-{
-    for (int word = 0; word < DSP_PRAM_SIZE; word++) {
-        dsp->core.pram[word] = 0xCACACACA;
-    }
-    memset(dsp->core.pram_opcache, 0, sizeof(dsp->core.pram_opcache));
-
-    dsp->is_gp = is_gp;
-    dsp->core.is_gp = is_gp;
-    dsp->core.is_idle = false;
-    dsp->core.cycle_count = 0;
-}
-
-/*
- * Create and initialise the GP and EP DSP instances of device `d`, then
- * apply the user's DSP preference. The EP additionally has its X and Y
- * memories filled with the 0xCACACACA pattern.
- */
-void mcpx_apu_dsp_init(MCPXAPUState *d)
-{
-    d->gp.dsp = dsp_init(d, gp_scratch_rw, gp_fifo_rw);
-    apu_dsp_prepare(d->gp.dsp, true);
-
-    d->ep.dsp = dsp_init(d, ep_scratch_rw, ep_fifo_rw);
-    apu_dsp_prepare(d->ep.dsp, false);
-    for (int word = 0; word < DSP_XRAM_SIZE; word++) {
-        d->ep.dsp->core.xram[word] = 0xCACACACA;
-    }
-    for (int word = 0; word < DSP_YRAM_SIZE; word++) {
-        d->ep.dsp->core.yram[word] = 0xCACACACA;
-    }
-
-    /* Until DSP is more performant, a switch to decide whether or not we should
-     * use the full audio pipeline or not.
-     */
-    mcpx_apu_update_dsp_preference(d);
-}
--- a/hw/xbox/mcpx/apu/dsp/gp_ep.h
+++ b/hw/xbox/mcpx/apu/dsp/gp_ep.h
@ -1,57 +0,0 @@
-/*
- * QEMU MCPX Audio Processing Unit implementation
- *
- * Copyright (c) 2012 espes
- * Copyright (c) 2018-2019 Jannik Vogel
- * Copyright (c) 2019-2025 Matt Borgerson
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef HW_XBOX_MCPX_APU_GP_EP_H
-#define HW_XBOX_MCPX_APU_GP_EP_H
-
-#include "qemu/osdep.h"
-#include "hw/hw.h"
-#include "hw/pci/pci.h"
-#include "hw/xbox/mcpx/apu/apu_regs.h"
-
-#include "dsp.h"
-#include "dsp_dma.h"
-#include "dsp_cpu.h"
-#include "dsp_state.h"
-
-typedef struct MCPXAPUState MCPXAPUState;
-
-/* State of the Global Processor (GP) DSP block. */
-typedef struct MCPXAPUGPState {
-    bool realtime;          /* when set, the frame loop runs the DSP until idle */
-    MemoryRegion mmio;
-    DSPState *dsp;          /* created by mcpx_apu_dsp_init() */
-    uint32_t regs[0x10000]; /* register store, indexed by MMIO byte offset */
-} MCPXAPUGPState;
-
-/* State of the Encode Processor (EP) DSP block. */
-typedef struct MCPXAPUEPState {
-    bool realtime;          /* when set, the frame loop runs the DSP until idle */
-    MemoryRegion mmio;
-    DSPState *dsp;          /* created by mcpx_apu_dsp_init() */
-    uint32_t regs[0x10000]; /* register store, indexed by MMIO byte offset */
-} MCPXAPUEPState;
-
-/* MMIO callbacks for the GP and EP register windows. */
-extern const MemoryRegionOps gp_ops;
-extern const MemoryRegionOps ep_ops;
-
-/* Create both DSPs and apply the current DSP preference. */
-void mcpx_apu_dsp_init(MCPXAPUState *d);
-/* Re-apply the audio.use_dsp setting if it changed since the last call. */
-void mcpx_apu_update_dsp_preference(MCPXAPUState *d);
-/* Feed one frame of VP mixbins to the GP and run the GP/EP as enabled. */
-void mcpx_apu_dsp_frame(MCPXAPUState *d, float mixbins[NUM_MIXBINS][NUM_SAMPLES_PER_FRAME]);
-
-#endif
--- a/hw/xbox/mcpx/apu/dsp/trace.h
+++ b/hw/xbox/mcpx/apu/dsp/trace.h
@ -1 +0,0 @@
-#include "trace/trace-hw_xbox_mcpx_apu_dsp.h"
--- a/hw/xbox/mcpx/apu/meson.build
+++ b/hw/xbox/mcpx/apu/meson.build
@ -1,7 +0,0 @@
-# APU core sources, added to mcpx_ss together with the sdl dependency.
-mcpx_ss.add(sdl, files(
-	'apu.c',
-	'debug.c',
-	))
-
-# Voice processor and DSP live in their own subdirectories.
-subdir('vp')
-subdir('dsp')
--- a/hw/xbox/mcpx/apu/trace.h
+++ b/hw/xbox/mcpx/apu/trace.h
@ -1 +0,0 @@
-#include "trace/trace-hw_xbox_mcpx_apu.h"
--- a/hw/xbox/mcpx/apu/vp/hrtf.h
+++ b/hw/xbox/mcpx/apu/vp/hrtf.h
@ -1,137 +0,0 @@
-/*
- * HRTF Filter
- *
- * Copyright (c) 2025 Matt Borgerson
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef HW_XBOX_MCPX_HRTF_H
-#define HW_XBOX_MCPX_HRTF_H
-
-#include <string.h>
-#include <stddef.h>
-#include <math.h>
-
-#include "hw/xbox/mcpx/apu/apu_regs.h"
-
-/* Samples processed per hrtf_filter_process() call. */
-#define HRTF_SAMPLES_PER_FRAME  NUM_SAMPLES_PER_FRAME
-/* Length of each channel's impulse response. */
-#define HRTF_NUM_TAPS           31
-/* Largest interaural delay, in samples; also the clamp applied to the ITD. */
-#define HRTF_MAX_DELAY_SAMPLES  42
-/* History length: enough for the longest delay plus a full set of taps. */
-#define HRTF_BUFLEN             (HRTF_NUM_TAPS + HRTF_MAX_DELAY_SAMPLES)
-/* Per-sample step fraction used when gliding parameters to their targets. */
-#define HRTF_PARAM_SMOOTH_ALPHA 0.01f
-
-/* State of one stereo HRTF filter instance. */
-typedef struct {
-    int buf_pos;  /* write index into each channel's history ring */
-    struct {
-        float buf[HRTF_BUFLEN];              /* input history ring */
-        float hrir_coeff_cur[HRTF_NUM_TAPS]; /* taps currently applied */
-        float hrir_coeff_tar[HRTF_NUM_TAPS]; /* taps being approached */
-    } ch[2];
-    float itd_cur;  /* interaural delay currently applied, in samples */
-    float itd_tar;  /* interaural delay being approached, in samples */
-} HrtfFilter;
-
-/* Reset the filter: all history, taps, delays and the write index become 0. */
-static inline void hrtf_filter_init(HrtfFilter *f)
-{
-    memset(f, 0, sizeof(HrtfFilter));
-}
-
-/*
- * Set the parameters the filter will glide towards.
- *
- * hrir_coeff: per-channel impulse-response taps, copied into the filter
- * itd:        interaural time difference in samples, clamped to
- *             [-HRTF_MAX_DELAY_SAMPLES, +HRTF_MAX_DELAY_SAMPLES]
- *
- * Each channel's taps are scaled so that their absolute values sum to 1.
- * A channel whose taps sum to 0 (nothing to scale) or already sum to 1 is
- * left as copied, and the remaining channel is still processed.
- */
-static inline void
-hrtf_filter_set_target_params(HrtfFilter *f, float hrir_coeff[2][HRTF_NUM_TAPS],
-                              float itd)
-{
-    f->itd_tar =
-        fmaxf(-HRTF_MAX_DELAY_SAMPLES, fminf(itd, HRTF_MAX_DELAY_SAMPLES));
-
-    for (int ch = 0; ch < 2; ch++) {
-        float *coeff = f->ch[ch].hrir_coeff_tar;
-        memcpy(coeff, hrir_coeff[ch], sizeof(f->ch[ch].hrir_coeff_tar));
-
-        // Normalize coefficients for unity filter gain
-        float s = 0.0f;
-        for (int k = 0; k < HRTF_NUM_TAPS; k++) {
-            s += fabsf(coeff[k]);
-        }
-        if (s == 0.0f || s == 1.0f) {
-            // Skip only this channel; the other one must still be updated
-            continue;
-        }
-        for (int k = 0; k < HRTF_NUM_TAPS; k++) {
-            coeff[k] /= s;
-        }
-    }
-}
-
-/* Move `cur` a fraction HRTF_PARAM_SMOOTH_ALPHA of the way towards `tar`. */
-static inline float hrtf_filter_smooth_param(float cur, float tar)
-{
-    // FIXME: Match hardware parameter transition
-    const float remaining = tar - cur;
-
-    return cur + HRTF_PARAM_SMOOTH_ALPHA * remaining;
-}
-
-/*
- * Advance every smoothed parameter by one step: all taps of both channels,
- * then the interaural delay.
- */
-static inline void hrtf_filter_step_parameters(HrtfFilter *f)
-{
-    for (int ch = 0; ch < 2; ch++) {
-        for (int tap = 0; tap < HRTF_NUM_TAPS; tap++) {
-            float *cur = &f->ch[ch].hrir_coeff_cur[tap];
-            const float target = f->ch[ch].hrir_coeff_tar[tap];
-
-            *cur = hrtf_filter_smooth_param(*cur, target);
-        }
-    }
-
-    f->itd_cur = hrtf_filter_smooth_param(f->itd_cur, f->itd_tar);
-}
-
-/*
- * Filter one frame of stereo samples.
- *
- * For every sample the parameters are first stepped towards their targets.
- * Each channel then stores its input in the history ring, applies its share
- * of the interaural delay and convolves the delayed history with its
- * current taps. `in` and `out` hold HRTF_SAMPLES_PER_FRAME stereo samples.
- */
-static inline void hrtf_filter_process(HrtfFilter *f,
-                                       float in[HRTF_SAMPLES_PER_FRAME][2],
-                                       float out[HRTF_SAMPLES_PER_FRAME][2])
-{
-    for (int n = 0; n < HRTF_SAMPLES_PER_FRAME; n++) {
-        hrtf_filter_step_parameters(f);
-
-        for (int ch = 0; ch < 2; ch++) {
-            float *buf = f->ch[ch].buf;
-            float *coeff = f->ch[ch].hrir_coeff_cur;
-
-            // Push new sample
-            buf[f->buf_pos] = in[n][ch];
-
-            // Interaural time difference (channel delay)
-            // A positive ITD delays channel 0, a negative one channel 1;
-            // the other channel gets no delay.
-            float d = f->itd_cur * (ch == 0 ? +1.0f : -1.0f);
-            if (d < 0.0f) {
-                d = 0.0f;
-            }
-            int di = d;
-            float dfrac = d - di;
-
-            // HRIR Convolution
-            float acc = 0.0f;
-            for (int k = 0; k < HRTF_NUM_TAPS; k++) {
-                // Adding HRTF_BUFLEN keeps the index non-negative before %
-                int idx1 = (f->buf_pos - di - k + HRTF_BUFLEN) % HRTF_BUFLEN;
-                float s = buf[idx1];
-
-                // Linear interpolation for fractional part
-                if (dfrac > 0.0f) {
-                    int idx2 = (idx1 - 1 + HRTF_BUFLEN) % HRTF_BUFLEN;
-                    s = s * (1 - dfrac) + buf[idx2] * dfrac;
-                }
-                acc += coeff[k] * s;
-            }
-
-            out[n][ch] = acc;
-        }
-
-        // Both channels share one write index, advanced once per sample
-        f->buf_pos = (f->buf_pos + 1) % HRTF_BUFLEN;
-    }
-}
-
-#endif
--- a/hw/xbox/mcpx/apu/vp/meson.build
+++ b/hw/xbox/mcpx/apu/vp/meson.build
@ -1,3 +0,0 @@
-# Voice processor source, added to mcpx_ss with the libsamplerate dependency.
-mcpx_ss.add(libsamplerate, files(
-	'vp.c'
-	))
--- a/hw/xbox/mcpx/apu/vp/vp.h
+++ b/hw/xbox/mcpx/apu/vp/vp.h
@ -1,112 +0,0 @@
-/*
- * QEMU MCPX Audio Processing Unit implementation
- *
- * Copyright (c) 2012 espes
- * Copyright (c) 2018-2019 Jannik Vogel
- * Copyright (c) 2019-2025 Matt Borgerson
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef HW_XBOX_MCPX_VP_H
-#define HW_XBOX_MCPX_VP_H
-
-#include <samplerate.h>
-
-#include "qemu/osdep.h"
-#include "qemu/thread.h"
-#include "hw/hw.h"
-#include "hw/pci/pci.h"
-#include "hw/xbox/mcpx/apu/apu_regs.h"
-#include "svf.h"
-#include "hrtf.h"
-
-typedef struct MCPXAPUState MCPXAPUState;
-
-typedef struct MCPXAPUVPSSLData {
-    uint32_t base[MCPX_HW_SSLS_PER_VOICE];
-    uint8_t count[MCPX_HW_SSLS_PER_VOICE];
-    int ssl_index;
-    int ssl_seg;
-} MCPXAPUVPSSLData;
-
-typedef struct MCPXAPUVoiceFilter {
-    uint16_t voice;
-    float resample_buf[NUM_SAMPLES_PER_FRAME * 2];
-    SRC_STATE *resampler;
-    sv_filter svf[2];
-    HrtfFilter hrtf;
-} MCPXAPUVoiceFilter;
-
-typedef struct VoiceWorkItem {
-    int voice;
-    int list;
-} VoiceWorkItem;
-
-typedef struct VoiceWorker {
-    QemuThread thread;
-    float mixbins[NUM_MIXBINS][NUM_SAMPLES_PER_FRAME];
-    float sample_buf[NUM_SAMPLES_PER_FRAME][2];
-    VoiceWorkItem queue[MCPX_HW_MAX_VOICES];
-    int queue_len;
-} VoiceWorker;
-
-typedef struct VoiceWorkDispatch {
-    QemuMutex lock;
-    int num_workers;
-    VoiceWorker *workers;
-    bool workers_should_exit;
-    QemuCond work_pending;
-    uint64_t workers_pending;
-    QemuCond work_finished;
-    float mixbins[NUM_MIXBINS][NUM_SAMPLES_PER_FRAME];
-    VoiceWorkItem queue[MCPX_HW_MAX_VOICES];
-    int queue_len;
-} VoiceWorkDispatch;
-
-typedef struct {
-    MemoryRegion mmio;
-    VoiceWorkDispatch voice_work_dispatch;
-    MCPXAPUVoiceFilter filters[MCPX_HW_MAX_VOICES];
-
-    // FIXME: Where are these stored?
-    int ssl_base_page;
-    MCPXAPUVPSSLData ssl[MCPX_HW_MAX_VOICES];
-    uint8_t hrtf_headroom;
-    uint8_t hrtf_submix[4];
-    uint8_t submix_headroom[NUM_MIXBINS];
-    float sample_buf[NUM_SAMPLES_PER_FRAME][2];
-    uint64_t voice_locked[4];
-    QemuSpin voice_spinlocks[MCPX_HW_MAX_VOICES];
-
-    struct {
-        int current_entry;
-        // FIXME: Stored in RAM
-        struct {
-            float hrir[2][HRTF_NUM_TAPS];
-            float itd;
-        } entries[HRTF_ENTRY_COUNT];
-    } hrtf;
-
-    uint32_t inbuf_sge_handle; //FIXME: Where is this stored?
-    uint32_t outbuf_sge_handle; //FIXME: Where is this stored?
-} MCPXAPUVPState;
-
-extern const MemoryRegionOps vp_ops;
-
-void mcpx_apu_vp_init(MCPXAPUState *d);
-void mcpx_apu_vp_finalize(MCPXAPUState *d);
-void mcpx_apu_vp_frame(MCPXAPUState *d, float mixbins[NUM_MIXBINS][NUM_SAMPLES_PER_FRAME]);
-void mcpx_apu_vp_reset(MCPXAPUState *d);
-
-#endif
--- a/hw/xbox/mcpx/apu/apu_debug.h
+++ b/hw/xbox/mcpx/apu/apu_debug.h
@ -23,15 +23,13 @@
 #include <stdbool.h>
 #include <stdint.h>

-#define MAX_VOICE_WORKERS 16
-
-typedef enum McpxApuDebugMonitorPoint {
+enum McpxApuDebugMon {
    MCPX_APU_DEBUG_MON_AC97,
    MCPX_APU_DEBUG_MON_VP,
    MCPX_APU_DEBUG_MON_GP,
    MCPX_APU_DEBUG_MON_EP,
    MCPX_APU_DEBUG_MON_GP_OR_EP
-} McpxApuDebugMonitorPoint;
+};

 struct McpxApuDebugVoice
 {
@ -57,12 +55,6 @@ struct McpxApuDebugVoice
 struct McpxApuDebugVp
 {
    struct McpxApuDebugVoice v[256];
-    int num_workers;
-    struct {
-        int num_voices;
-        int time_us;
-    } workers[MAX_VOICE_WORKERS];
-    int total_worker_time_us;
 };

 struct McpxApuDebugDsp
@ -84,8 +76,8 @@ extern "C" {
 #endif

 const struct McpxApuDebug *mcpx_apu_get_debug_info(void);
-McpxApuDebugMonitorPoint mcpx_apu_debug_get_monitor(void);
-void mcpx_apu_debug_set_monitor(McpxApuDebugMonitorPoint monitor);
+int mcpx_apu_debug_get_monitor(void);
+void mcpx_apu_debug_set_monitor(int mon);
 void mcpx_apu_debug_isolate_voice(uint16_t v);
 void mcpx_apu_debug_clear_isolations(void);
 void mcpx_apu_debug_toggle_mute(uint16_t v);
--- a/hw/xbox/mcpx/apu/apu_regs.h
+++ b/hw/xbox/mcpx/apu/apu_regs.h
@ -145,8 +145,6 @@
 #define NV1BA0_PIO_VOICE_PAUSE                           0x00000140
 #   define NV1BA0_PIO_VOICE_PAUSE_HANDLE                    0x0000FFFF
 #   define NV1BA0_PIO_VOICE_PAUSE_ACTION                    (1 << 18)
-#define NV1BA0_PIO_SET_CURRENT_HRTF_ENTRY                0x00000160
-#   define NV1BA0_PIO_SET_CURRENT_HRTF_ENTRY_HANDLE         0x0000FFFF
 #define NV1BA0_PIO_SET_CONTEXT_DMA_NOTIFY                0x00000180
 #define NV1BA0_PIO_SET_CURRENT_SSL_CONTEXT_DMA           0x0000018C
 #define NV1BA0_PIO_SET_CURRENT_SSL                       0x00000190
@ -167,8 +165,6 @@
 #define NV1BA0_PIO_SET_VOICE_CFG_ENV1                    0x00000310
 #define NV1BA0_PIO_SET_VOICE_CFG_ENVF                    0x00000314
 #define NV1BA0_PIO_SET_VOICE_CFG_MISC                    0x00000318
-#define NV1BA0_PIO_SET_VOICE_TAR_HRTF                    0x0000031C
-#   define NV1BA0_PIO_SET_VOICE_TAR_HRTF_HANDLE             0x0000FFFF
 #define NV1BA0_PIO_SET_VOICE_SSL_A                       0x00000320
 #   define NV1BA0_PIO_SET_VOICE_SSL_A_COUNT                 0x000000FF
 #   define NV1BA0_PIO_SET_VOICE_SSL_A_BASE                  0xFFFFFF00
@ -189,15 +185,6 @@
 #   define NV1BA0_PIO_SET_VOICE_BUF_CBO_OFFSET              0x00FFFFFF
 #define NV1BA0_PIO_SET_VOICE_CFG_BUF_EBO                 0x000003DC
 #   define NV1BA0_PIO_SET_VOICE_CFG_BUF_EBO_OFFSET          0x00FFFFFF
-#define NV1BA0_PIO_SET_HRIR                              0x00000400
-#   define NV1BA0_PIO_SET_HRIR_LEFT0                        0x000000FF
-#   define NV1BA0_PIO_SET_HRIR_RIGHT0                       0x0000FF00
-#   define NV1BA0_PIO_SET_HRIR_LEFT1                        0x00FF0000
-#   define NV1BA0_PIO_SET_HRIR_RIGHT1                       0xFF000000
-#define NV1BA0_PIO_SET_HRIR_X                            0x0000043C
-#    define NV1BA0_PIO_SET_HRIR_X_LEFT30                     0x000000FF
-#    define NV1BA0_PIO_SET_HRIR_X_RIGHT30                    0x0000FF00
-#    define NV1BA0_PIO_SET_HRIR_X_ITD                        0xFFFF0000
 #define NV1BA0_PIO_SET_SSL_SEGMENT_OFFSET                0x00000600
 #define NV1BA0_PIO_SET_SSL_SEGMENT_LENGTH                0x00000604
 #define NV1BA0_PIO_SET_CURRENT_INBUF_SGE                 0x00000804
@ -262,8 +249,6 @@
 #define NV_PAVS_VOICE_CFG_MISC                           0x00000018
 #   define NV_PAVS_VOICE_CFG_MISC_EF_RELEASERATE            (0xFFF << 0)
 #   define NV_PAVS_VOICE_CFG_MISC_FMODE                     (3 << 16)
-#define NV_PAVS_VOICE_CFG_HRTF_TARGET                    0x0000001C
-#   define NV_PAVS_VOICE_CFG_HRTF_TARGET_HANDLE             0x0000FFFF
 #define NV_PAVS_VOICE_CUR_PSL_START                      0x00000020
 #   define NV_PAVS_VOICE_CUR_PSL_START_BA                   0x00FFFFFF
 #define NV_PAVS_VOICE_CUR_PSH_SAMPLE                     0x00000024
@ -328,7 +313,6 @@
 #define EP_INPUT_FIFO_COUNT   2

 #define MCPX_HW_MAX_VOICES 256
-#define MCPX_HW_MAX_3D_VOICES 64

 #define NUM_SAMPLES_PER_FRAME 32
 #define NUM_MIXBINS 32
@ -352,12 +336,6 @@ enum MCPX_HW_NOTIFIER {
 #define NV1BA0_NOTIFICATION_STATUS_DONE_SUCCESS       0x01
 #define NV1BA0_NOTIFICATION_STATUS_IN_PROGRESS        0x80

-#define HRTF_NULL_HANDLE 0xFFFF
-#define HRTF_ENTRY_COUNT 128
-
-#define MULTIPASS_BIN      31
-#define MULTIPASS_BIN_MASK (1 << MULTIPASS_BIN)
-
 // clang-format on

 #endif
--- a/hw/xbox/mcpx/apu/dsp/debug.c
+++ b/hw/xbox/mcpx/apu/dsp/debug.c
--- a/hw/xbox/mcpx/apu/dsp/debug.h
+++ b/hw/xbox/mcpx/apu/dsp/debug.h
--- a/hw/xbox/mcpx/apu/dsp/dsp.c
+++ b/hw/xbox/mcpx/apu/dsp/dsp.c
--- a/hw/xbox/mcpx/apu/dsp/dsp.h
+++ b/hw/xbox/mcpx/apu/dsp/dsp.h
--- a/hw/xbox/mcpx/apu/dsp/dsp_cpu.c
+++ b/hw/xbox/mcpx/apu/dsp/dsp_cpu.c
--- a/hw/xbox/mcpx/apu/dsp/dsp_cpu.h
+++ b/hw/xbox/mcpx/apu/dsp/dsp_cpu.h
--- a/hw/xbox/mcpx/apu/dsp/dsp_cpu_regs.h
+++ b/hw/xbox/mcpx/apu/dsp/dsp_cpu_regs.h
--- a/hw/xbox/mcpx/apu/dsp/dsp_dis.c.inc
+++ b/hw/xbox/mcpx/apu/dsp/dsp_dis.c.inc
--- a/hw/xbox/mcpx/apu/dsp/dsp_dma.c
+++ b/hw/xbox/mcpx/apu/dsp/dsp_dma.c
--- a/hw/xbox/mcpx/apu/dsp/dsp_dma.h
+++ b/hw/xbox/mcpx/apu/dsp/dsp_dma.h
--- a/hw/xbox/mcpx/apu/dsp/dsp_dma_regs.h
+++ b/hw/xbox/mcpx/apu/dsp/dsp_dma_regs.h
--- a/hw/xbox/mcpx/apu/dsp/dsp_emu.c.inc
+++ b/hw/xbox/mcpx/apu/dsp/dsp_emu.c.inc
--- a/hw/xbox/mcpx/apu/dsp/dsp_state.h
+++ b/hw/xbox/mcpx/apu/dsp/dsp_state.h
--- a/hw/xbox/mcpx/apu/dsp/meson.build
+++ b/hw/xbox/mcpx/apu/dsp/meson.build
@ -1,4 +1,2 @@
 libdsp = static_library('dsp', files(['debug.c', 'dsp.c', 'dsp_cpu.c', 'dsp_dma.c']) + genh)
 dsp = declare_dependency(objects: libdsp.extract_all_objects(recursive: false))
-
-mcpx_ss.add(dsp, files('gp_ep.c'))
--- a/hw/xbox/mcpx/apu/dsp/trace-events
+++ b/hw/xbox/mcpx/apu/dsp/trace-events
--- a/hw/xbox/mcpx/dsp/trace.h
+++ b/hw/xbox/mcpx/dsp/trace.h
@ -0,0 +1 @@
+#include "trace/trace-hw_xbox_mcpx_dsp.h"
--- a/hw/xbox/mcpx/apu/fpconv.h
+++ b/hw/xbox/mcpx/apu/fpconv.h
@ -1,7 +1,7 @@
 /*
 * Helper FP conversions
 *
- * Copyright (c) 2020-2025 Matt Borgerson
+ * Copyright (c) 2020-2021 Matt Borgerson
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
@ -21,39 +21,27 @@
 #ifndef FLOATCONV_H
 #define FLOATCONV_H

-#include <stdint.h>
-
-static inline float int8_to_float(int8_t x)
-{
-    return x / 128.0f;
-}
-
-static inline float uint8_to_float(uint8_t value)
+static float uint8_to_float(uint8_t value)
 {
    return ((int)value - 0x80) / (1.0 * 0x80);
 }

-static inline float int16_to_float(int16_t value)
+static float int16_to_float(int16_t value)
 {
    return value / (1.0 * 0x8000);
 }

-static inline float s6p9_to_float(int16_t value)
-{
-    return value / 512.0f;
-}
-
-static inline float int32_to_float(int32_t value)
+static float int32_to_float(int32_t value)
 {
    return value / (1.0 * 0x80000000);
 }

-static inline float int24_to_float(int32_t value)
+static float int24_to_float(int32_t value)
 {
    return int32_to_float((uint32_t)value << 8);
 }

-static inline uint32_t float_to_24b(float value)
+static uint32_t float_to_24b(float value)
 {
    double scaled_value = value * (8.0 * 0x100000);
    int int24;
--- a/hw/xbox/mcpx/meson.build
+++ b/hw/xbox/mcpx/meson.build
@ -1,8 +1,9 @@
+subdir('dsp')

 mcpx_ss = ss.source_set()
-mcpx_ss.add(files('aci.c'))
-
-subdir('apu')
-subdir('nvnet')
+mcpx_ss.add(sdl, libsamplerate, dsp, files(
+	'apu.c',
+	'aci.c',
+	))

 specific_ss.add_all(mcpx_ss)
--- a/hw/xbox/mcpx/nvnet/meson.build
+++ b/hw/xbox/mcpx/nvnet/meson.build
@ -1 +0,0 @@
-mcpx_ss.add(files('nvnet.c'))
--- a/hw/xbox/mcpx/nvnet/nvnet.c
+++ b/hw/xbox/mcpx/nvnet/nvnet.c
--- a/hw/xbox/mcpx/nvnet/nvnet_regs.h
+++ b/hw/xbox/mcpx/nvnet/nvnet_regs.h
@ -1,267 +0,0 @@
-/*
- * QEMU nForce Ethernet Controller register definitions
- *
- * Copyright (c) 2013 espes
- * Copyright (c) 2015-2025 Matt Borgerson
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- *
- * --
- *
- * Most definitions are based on forcedeth.c, taken from cromwell project.
- * Original forcedeth.c license follows:
- *
- * --
- *    forcedeth.c -- Etherboot device driver for the NVIDIA nForce
- *           media access controllers.
- *
- * Note: This driver is based on the Linux driver that was based on
- *      a cleanroom reimplementation which was based on reverse
- *      engineered documentation written by Carl-Daniel Hailfinger
- *      and Andrew de Quincey. It's neither supported nor endorsed
- *      by NVIDIA Corp. Use at your own risk.
- *
- *    Written 2004 by Timothy Legge <tlegge@rogers.com>
- *
- *    This program is free software; you can redistribute it and/or modify
- *    it under the terms of the GNU General Public License as published by
- *    the Free Software Foundation; either version 2 of the License, or
- *    (at your option) any later version.
- *
- *    This program is distributed in the hope that it will be useful,
- *    but WITHOUT ANY WARRANTY; without even the implied warranty of
- *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *    GNU General Public License for more details.
- *
- *    You should have received a copy of the GNU General Public License
- *    along with this program; if not, write to the Free Software
- *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- *    Portions of this code based on:
- *       forcedeth: Ethernet driver for NVIDIA nForce media access controllers:
- *
- *   (C) 2003 Manfred Spraul
- *       See Linux Driver for full information
- *
- *   Linux Driver Version 0.22, 19 Jan 2004
- *
- *
- *    REVISION HISTORY:
- *    ================
- *    v1.0   01-31-2004  timlegge    Initial port of Linux driver
- *    v1.1   02-03-2004  timlegge    Large Clean up, first release
- *
- *    Indent Options: indent -kr -i8
- ***************************************************************************/
-
-#ifndef HW_NVNET_REGS_H
-#define HW_NVNET_REGS_H
-
-// clang-format off
-
-#define NVNET_IRQ_STATUS                         0x000
-#  define NVNET_IRQ_STATUS_RX                      0x00000002
-#  define NVNET_IRQ_STATUS_RX_NOBUF                0x00000004
-#  define NVNET_IRQ_STATUS_TX_ERR                  0x00000008
-#  define NVNET_IRQ_STATUS_TX                      0x00000010
-#  define NVNET_IRQ_STATUS_TIMER                   0x00000020
-#  define NVNET_IRQ_STATUS_MIIEVENT                0x00000040
-#define NVNET_IRQ_MASK                           0x004
-#define NVNET_UNKNOWN_SETUP_REG6                 0x008
-#  define NVNET_UNKNOWN_SETUP_REG6_VAL             3
-/*
- * NVNET_POLLING_INTERVAL_DEFAULT is the interval length of the timer source on the nic
- * NVNET_POLLING_INTERVAL_DEFAULT=97 would result in an interval length of 1 ms
- */
-#define NVNET_POLLING_INTERVAL                   0x00C
-#  define NVNET_POLLING_INTERVAL_DEFAULT           970
-#define NVNET_MISC1                              0x080
-#  define NVNET_MISC1_HD                           0x00000002
-#  define NVNET_MISC1_FORCE                        0x003B0F3C
-#define NVNET_TRANSMITTER_CONTROL                0x084
-#  define NVNET_TRANSMITTER_CONTROL_START          0x00000001
-#define NVNET_TRANSMITTER_STATUS                 0x088
-#  define NVNET_TRANSMITTER_STATUS_BUSY            0x00000001
-#define NVNET_PACKET_FILTER                      0x08C
-#  define NVNET_PACKET_FILTER_ALWAYS               0x007F0008
-#  define NVNET_PACKET_FILTER_PROMISC              0x00000080
-#  define NVNET_PACKET_FILTER_MYADDR               0x00000020
-#define NVNET_OFFLOAD                            0x090
-#  define NVNET_OFFLOAD_HOMEPHY                    0x00000601
-#  define NVNET_OFFLOAD_NORMAL                     0x000005EE
-#define NVNET_RECEIVER_CONTROL                   0x094
-#  define NVNET_RECEIVER_CONTROL_START             0x00000001
-#define NVNET_RECEIVER_STATUS                    0x098
-#  define NVNET_RECEIVER_STATUS_BUSY               0x00000001
-#define NVNET_RANDOM_SEED                        0x09C
-#  define NVNET_RANDOM_SEED_MASK                   0x000000FF
-#  define NVNET_RANDOM_SEED_FORCE                  0x00007F00
-#define NVNET_UNKNOWN_SETUP_REG1                 0x0A0
-#  define NVNET_UNKNOWN_SETUP_REG1_VAL             0x0016070F
-#define NVNET_UNKNOWN_SETUP_REG2                 0x0A4
-#  define NVNET_UNKNOWN_SETUP_REG2_VAL             0x00000016
-#define NVNET_MAC_ADDR_A                         0x0A8
-#define NVNET_MAC_ADDR_B                         0x0AC
-#define NVNET_MULTICAST_ADDR_A                   0x0B0
-#  define NVNET_MULTICAST_ADDR_A_FORCE             0x00000001
-#define NVNET_MULTICAST_ADDR_B                   0x0B4
-#define NVNET_MULTICAST_MASK_A                   0x0B8
-#define NVNET_MULTICAST_MASK_B                   0x0BC
-#define NVNET_TX_RING_PHYS_ADDR                  0x100
-#define NVNET_RX_RING_PHYS_ADDR                  0x104
-#define NVNET_RING_SIZE                          0x108
-#  define NVNET_RING_SIZE_TX                       0x0000FFFF
-#  define NVNET_RING_SIZE_RX                       0xFFFF0000
-#define NVNET_UNKNOWN_TRANSMITTER_REG            0x10C
-#define NVNET_LINKSPEED                          0x110
-#  define NVNET_LINKSPEED_FORCE                    0x00010000
-#  define NVNET_LINKSPEED_10                       10
-#  define NVNET_LINKSPEED_100                      100
-#  define NVNET_LINKSPEED_1000                     1000
-#define NVNET_TX_RING_CURRENT_DESC_PHYS_ADDR     0x11C
-#define NVNET_RX_RING_CURRENT_DESC_PHYS_ADDR     0x120
-#define NVNET_TX_CURRENT_BUFFER_PHYS_ADDR        0x124
-#define NVNET_RX_CURRENT_BUFFER_PHYS_ADDR        0x12C
-#define NVNET_UNKNOWN_SETUP_REG5                 0x130
-#  define NVNET_UNKNOWN_SETUP_REG5_BIT31           (1 << 31)
-#define NVNET_TX_RING_NEXT_DESC_PHYS_ADDR        0x134
-#define NVNET_RX_RING_NEXT_DESC_PHYS_ADDR        0x138
-#define NVNET_UNKNOWN_SETUP_REG8                 0x13C
-#  define NVNET_UNKNOWN_SETUP_REG8_VAL1            0x00300010
-#define NVNET_UNKNOWN_SETUP_REG7                 0x140
-#  define NVNET_UNKNOWN_SETUP_REG7_VAL             0x00300010
-#define NVNET_TX_RX_CONTROL                      0x144
-#  define NVNET_TX_RX_CONTROL_KICK                 0x00000001
-#  define NVNET_TX_RX_CONTROL_BIT1                 0x00000002
-#  define NVNET_TX_RX_CONTROL_BIT2                 0x00000004
-#  define NVNET_TX_RX_CONTROL_IDLE                 0x00000008
-#  define NVNET_TX_RX_CONTROL_RESET                0x00000010
-#define NVNET_MII_STATUS                         0x180
-#  define NVNET_MII_STATUS_ERROR                   0x00000001
-#  define NVNET_MII_STATUS_LINKCHANGE              0x00000008
-#define NVNET_UNKNOWN_SETUP_REG4                 0x184
-#  define NVNET_UNKNOWN_SETUP_REG4_VAL             8
-#define NVNET_ADAPTER_CONTROL                    0x188
-#  define NVNET_ADAPTER_CONTROL_START              0x00000002
-#  define NVNET_ADAPTER_CONTROL_LINKUP             0x00000004
-#  define NVNET_ADAPTER_CONTROL_PHYVALID           0x00004000
-#  define NVNET_ADAPTER_CONTROL_RUNNING            0x00100000
-#  define NVNET_ADAPTER_CONTROL_PHYSHIFT           24
-#define NVNET_MII_SPEED                          0x18C
-#  define NVNET_MII_SPEED_BIT8                     (1 << 8)
-#  define NVNET_MII_SPEED_DELAY                    5
-#define NVNET_MDIO_ADDR                          0x190
-#  define NVNET_MDIO_ADDR_INUSE                    0x00008000
-#  define NVNET_MDIO_ADDR_WRITE                    0x00000400
-#  define NVNET_MDIO_ADDR_PHYADDR                  0x000003E0
-#  define NVNET_MDIO_ADDR_PHYREG                   0x0000001F
-#define NVNET_MDIO_DATA                          0x194
-#define NVNET_WAKEUPFLAGS                        0x200
-#  define NVNET_WAKEUPFLAGS_VAL                    0x00007770
-#  define NVNET_WAKEUPFLAGS_BUSYSHIFT              24
-#  define NVNET_WAKEUPFLAGS_ENABLESHIFT            16
-#  define NVNET_WAKEUPFLAGS_D3SHIFT                12
-#  define NVNET_WAKEUPFLAGS_D2SHIFT                8
-#  define NVNET_WAKEUPFLAGS_D1SHIFT                4
-#  define NVNET_WAKEUPFLAGS_D0SHIFT                0
-#  define NVNET_WAKEUPFLAGS_ACCEPT_MAGPAT          0x00000001
-#  define NVNET_WAKEUPFLAGS_ACCEPT_WAKEUPPAT       0x00000002
-#  define NVNET_WAKEUPFLAGS_ACCEPT_LINKCHANGE      0x00000004
-#define NVNET_PATTERN_CRC                        0x204
-#define NVNET_PATTERN_MASK                       0x208
-#define NVNET_POWERCAP                           0x268
-#  define NVNET_POWERCAP_D3SUPP                    (1 << 30)
-#  define NVNET_POWERCAP_D2SUPP                    (1 << 26)
-#  define NVNET_POWERCAP_D1SUPP                    (1 << 25)
-#define NVNET_POWERSTATE                         0x26C
-#  define NVNET_POWERSTATE_POWEREDUP               0x00008000
-#  define NVNET_POWERSTATE_VALID                   0x00000100
-#  define NVNET_POWERSTATE_MASK                    0x00000003
-#  define NVNET_POWERSTATE_D0                      0x00000000
-#  define NVNET_POWERSTATE_D1                      0x00000001
-#  define NVNET_POWERSTATE_D2                      0x00000002
-#  define NVNET_POWERSTATE_D3                      0x00000003
-
-#define NV_TX_LASTPACKET      (1 << 0)
-#define NV_TX_RETRYERROR      (1 << 3)
-#define NV_TX_LASTPACKET1     (1 << 8)
-#define NV_TX_DEFERRED        (1 << 10)
-#define NV_TX_CARRIERLOST     (1 << 11)
-#define NV_TX_LATECOLLISION   (1 << 12)
-#define NV_TX_UNDERFLOW       (1 << 13)
-#define NV_TX_ERROR           (1 << 14)
-#define NV_TX_VALID           (1 << 15)
-#define NV_RX_DESCRIPTORVALID (1 << 0)
-#define NV_RX_MISSEDFRAME     (1 << 1)
-#define NV_RX_SUBSTRACT1      (1 << 3)
-#define NV_RX_BIT4            (1 << 4)
-#define NV_RX_ERROR1          (1 << 7)
-#define NV_RX_ERROR2          (1 << 8)
-#define NV_RX_ERROR3          (1 << 9)
-#define NV_RX_ERROR4          (1 << 10)
-#define NV_RX_CRCERR          (1 << 11)
-#define NV_RX_OVERFLOW        (1 << 12)
-#define NV_RX_FRAMINGERR      (1 << 13)
-#define NV_RX_ERROR           (1 << 14)
-#define NV_RX_AVAIL           (1 << 15)
-
-/* Miscelaneous hardware related defines: */
-#define NV_PCI_REGSZ          0x270
-
-/* various timeout delays: all in usec */
-#define NV_TXRX_RESET_DELAY   4
-#define NV_TXSTOP_DELAY1      10
-#define NV_TXSTOP_DELAY1MAX   500000
-#define NV_TXSTOP_DELAY2      100
-#define NV_RXSTOP_DELAY1      10
-#define NV_RXSTOP_DELAY1MAX   500000
-#define NV_RXSTOP_DELAY2      100
-#define NV_SETUP5_DELAY       5
-#define NV_SETUP5_DELAYMAX    50000
-#define NV_POWERUP_DELAY      5
-#define NV_POWERUP_DELAYMAX   5000
-#define NV_MIIBUSY_DELAY      50
-#define NV_MIIPHY_DELAY       10
-#define NV_MIIPHY_DELAYMAX    10000
-#define NV_WAKEUPPATTERNS     5
-#define NV_WAKEUPMASKENTRIES  4
-
-/* General driver defaults */
-#define NV_WATCHDOG_TIMEO     (2 * HZ)
-#define DEFAULT_MTU           1500
-
-#define RX_RING               4
-#define TX_RING               2
-/* limited to 1 packet until we understand NV_TX_LASTPACKET */
-#define TX_LIMIT_STOP         10
-#define TX_LIMIT_START        5
-
-/* rx / tx mac addr + type + vlan + align + slack*/
-#define RX_NIC_BUFSIZE        (DEFAULT_MTU + 64)
-/* even more slack */
-#define RX_ALLOC_BUFSIZE      (DEFAULT_MTU + 128)
-#define TX_ALLOC_BUFSIZE      (DEFAULT_MTU + 128)
-
-#define OOM_REFILL            (1 + HZ / 20)
-#define POLL_WAIT             (1 + HZ / 100)
-
-/* Link partner ability register. */
-#define LPA_SLCT     0x001F  /* Same as advertise selector  */
-#define LPA_RESV     0x1C00  /* Unused...                   */
-#define LPA_RFAULT   0x2000  /* Link partner faulted        */
-#define LPA_NPAGE    0x8000  /* Next page bit               */
-
-// clang-format on
-
-#endif /* HW_NVNET_REGS_H */
--- a/hw/xbox/mcpx/nvnet/trace.h
+++ b/hw/xbox/mcpx/nvnet/trace.h
@ -1 +0,0 @@
-#include "trace/trace-hw_xbox_mcpx_nvnet.h"
--- a/hw/xbox/mcpx/apu/vp/svf.h
+++ b/hw/xbox/mcpx/apu/vp/svf.h
--- a/hw/xbox/mcpx/apu/trace-events
+++ b/hw/xbox/mcpx/apu/trace-events
--- a/hw/xbox/mcpx/trace.h
+++ b/hw/xbox/mcpx/trace.h
@ -0,0 +1 @@
+#include "trace/trace-hw_xbox_mcpx.h"
--- a/hw/xbox/meson.build
+++ b/hw/xbox/meson.build
@ -5,6 +5,7 @@ specific_ss.add(files(
 	# 'chihiro.c',
 	'eeprom_generation.c',
 	'lpc47m157.c',
+	'nvnet.c',
 	'smbus_adm1032.c',
 	'smbus_cx25871.c',
 	'smbus_fs454.c',
--- a/hw/xbox/nv2a/debug.h
+++ b/hw/xbox/nv2a/debug.h
@ -155,9 +155,8 @@ static inline void nv2a_profile_inc_counter(enum NV2A_PROF_COUNTERS_ENUM cnt)
 void nv2a_dbg_renderdoc_init(void);
 void *nv2a_dbg_renderdoc_get_api(void);
 bool nv2a_dbg_renderdoc_available(void);
-void nv2a_dbg_renderdoc_capture_frames(int num_frames, bool trace);
+void nv2a_dbg_renderdoc_capture_frames(int num_frames);
 extern int renderdoc_capture_frames;
-extern bool renderdoc_trace_frames;
 #endif

 #ifdef __cplusplus
--- a/hw/xbox/nv2a/nv2a.c
+++ b/hw/xbox/nv2a/nv2a.c
@ -423,7 +423,7 @@ const VMStateDescription vmstate_nv2a_pgraph_vertex_attributes = {

 static const VMStateDescription vmstate_nv2a = {
    .name = "nv2a",
-    .version_id = 3,
+    .version_id = 2,
    .minimum_version_id = 1,
    .post_save = nv2a_post_save,
    .post_load = nv2a_post_load,
@ -507,11 +507,9 @@ static const VMStateDescription vmstate_nv2a = {
        VMSTATE_BOOL_ARRAY(pgraph.ltc1_dirty, NV2AState, NV2A_LTC1_COUNT),
        VMSTATE_STRUCT_ARRAY(pgraph.vertex_attributes, NV2AState, NV2A_VERTEXSHADER_ATTRIBUTES, 1, vmstate_nv2a_pgraph_vertex_attributes, VertexAttribute),
        VMSTATE_UINT32(pgraph.inline_array_length, NV2AState),
-        VMSTATE_UINT32_SUB_ARRAY(pgraph.inline_array, NV2AState, 0, NV2A_MAX_BATCH_LENGTH_V2),
-        VMSTATE_UINT32_SUB_ARRAY_V(pgraph.inline_array, NV2AState, NV2A_MAX_BATCH_LENGTH_V2, NV2A_MAX_BATCH_LENGTH - NV2A_MAX_BATCH_LENGTH_V2, 3),
+        VMSTATE_UINT32_ARRAY(pgraph.inline_array, NV2AState, NV2A_MAX_BATCH_LENGTH),
        VMSTATE_UINT32(pgraph.inline_elements_length, NV2AState), // fixme
-        VMSTATE_UINT32_SUB_ARRAY(pgraph.inline_elements, NV2AState, 0, NV2A_MAX_BATCH_LENGTH_V2),
-        VMSTATE_UINT32_SUB_ARRAY_V(pgraph.inline_elements, NV2AState, NV2A_MAX_BATCH_LENGTH_V2, NV2A_MAX_BATCH_LENGTH - NV2A_MAX_BATCH_LENGTH_V2, 3),
+        VMSTATE_UINT32_ARRAY(pgraph.inline_elements, NV2AState, NV2A_MAX_BATCH_LENGTH),
        VMSTATE_UINT32(pgraph.inline_buffer_length, NV2AState), // fixme
        VMSTATE_UINT32(pgraph.draw_arrays_length, NV2AState),
        VMSTATE_UINT32(pgraph.draw_arrays_max_count, NV2AState),
--- a/hw/xbox/nv2a/nv2a_regs.h
+++ b/hw/xbox/nv2a/nv2a_regs.h
@ -315,14 +315,11 @@
 #define NV_PGRAPH_CSV0_C                                 0x00000FB8
 #   define NV_PGRAPH_CSV0_C_CHEOPS_PROGRAM_START                0x0000FF00
 #   define NV_PGRAPH_CSV0_C_SPECULAR_ENABLE                     (1 << 16)
-#   define NV_PGRAPH_CSV0_C_ALPHA_FROM_MATERIAL_SPECULAR        (1 << 17)
-#   define NV_PGRAPH_CSV0_C_SEPARATE_SPECULAR                   (1 << 18)
 #   define NV_PGRAPH_CSV0_C_SPECULAR                            (3 << 19)
 #   define NV_PGRAPH_CSV0_C_DIFFUSE                             (3 << 21)
 #   define NV_PGRAPH_CSV0_C_AMBIENT                             (3 << 23)
 #   define NV_PGRAPH_CSV0_C_EMISSION                            (3 << 25)
 #   define NV_PGRAPH_CSV0_C_NORMALIZATION_ENABLE                (1 << 27)
-#   define NV_PGRAPH_CSV0_C_LOCALEYE                            (1 << 30)
 #   define NV_PGRAPH_CSV0_C_LIGHTING                            (1 << 31)
 #define NV_PGRAPH_CSV1_B                                 0x00000FBC
 #define NV_PGRAPH_CSV1_A                                 0x00000FC0
@ -409,10 +406,6 @@
 #       define NV_PGRAPH_CLEARRECTY_YMIN                          0x00000FFF
 #       define NV_PGRAPH_CLEARRECTY_YMAX                          0x0FFF0000
 #define NV_PGRAPH_COLORCLEARVALUE                        0x0000186C
-#define NV_PGRAPH_COLORKEYCOLOR0                         0x00001870
-#define NV_PGRAPH_COLORKEYCOLOR1                         0x00001874
-#define NV_PGRAPH_COLORKEYCOLOR2                         0x00001878
-#define NV_PGRAPH_COLORKEYCOLOR3                         0x0000187C
 #define NV_PGRAPH_COMBINEFACTOR0                         0x00001880
 #define NV_PGRAPH_COMBINEFACTOR1                         0x000018A0
 #define NV_PGRAPH_COMBINEALPHAI0                         0x000018C0
@ -537,7 +530,6 @@
 #define NV_PGRAPH_TEXADDRESS2                            0x000019C4
 #define NV_PGRAPH_TEXADDRESS3                            0x000019C8
 #define NV_PGRAPH_TEXCTL0_0                              0x000019CC
-#   define NV_PGRAPH_TEXCTL0_0_COLORKEYMODE                     0x03
 #   define NV_PGRAPH_TEXCTL0_0_ALPHAKILLEN                      (1 << 2)
 #   define NV_PGRAPH_TEXCTL0_0_MAX_LOD_CLAMP                    0x0003FFC0
 #   define NV_PGRAPH_TEXCTL0_0_MIN_LOD_CLAMP                    0x3FFC0000
@ -890,10 +882,6 @@
 #       define NV097_SET_CONTROL0_STENCIL_WRITE_ENABLE            (1 << 0)
 #       define NV097_SET_CONTROL0_Z_FORMAT                        (1 << 12)
 #       define NV097_SET_CONTROL0_Z_PERSPECTIVE_ENABLE            (1 << 16)
-#   define NV097_SET_LIGHT_CONTROL                            0x00000294
-#       define NV097_SET_LIGHT_CONTROL_SEPARATE_SPECULAR          1
-#       define NV097_SET_LIGHT_CONTROL_LOCALEYE                   (1 << 16)
-#       define NV097_SET_LIGHT_CONTROL_ALPHA_FROM_MATERIAL_SPECULAR (1 << 17)
 #   define NV097_SET_COLOR_MATERIAL                           0x00000298
 #   define NV097_SET_FOG_MODE                                 0x0000029C
 #       define NV097_SET_FOG_MODE_V_LINEAR                        0x2601
@ -1061,7 +1049,6 @@
 #       define NV097_SET_TEXGEN_VIEW_MODEL_LOCAL_VIEWER           0
 #       define NV097_SET_TEXGEN_VIEW_MODEL_INFINITE_VIEWER        1
 #   define NV097_SET_FOG_PLANE                                0x000009D0
-#   define NV097_SET_SPECULAR_PARAMS                          0x000009E0
 #   define NV097_SET_SCENE_AMBIENT_COLOR                      0x00000A10
 #   define NV097_SET_VIEWPORT_OFFSET                          0x00000A20
 #   define NV097_SET_POINT_PARAMS                             0x00000A30
@ -1070,7 +1057,6 @@
 #   define NV097_SET_COMBINER_FACTOR1                         0x00000A80
 #   define NV097_SET_COMBINER_ALPHA_OCW                       0x00000AA0
 #   define NV097_SET_COMBINER_COLOR_ICW                       0x00000AC0
-#   define NV097_SET_COLOR_KEY_COLOR                          0x00000AE0
 #   define NV097_SET_VIEWPORT_SCALE                           0x00000AF0
 #   define NV097_SET_TRANSFORM_PROGRAM                        0x00000B00
 #   define NV097_SET_TRANSFORM_CONSTANT                       0x00000B80
@ -1113,11 +1099,6 @@
 #   define NV097_SET_TEXCOORD3_4F                             0x00001620
 #   define NV097_SET_TEXCOORD3_2S                             0x00001610
 #   define NV097_SET_TEXCOORD3_4S                             0x00001630
-#   define NV097_SET_FOG_COORD                                0x00001698
-#   define NV097_SET_WEIGHT1F                                 0x0000169C
-#   define NV097_SET_WEIGHT2F                                 0x000016A0
-#   define NV097_SET_WEIGHT3F                                 0x000016B0
-#   define NV097_SET_WEIGHT4F                                 0x000016C0
 #   define NV097_SET_VERTEX_DATA_ARRAY_OFFSET                 0x00001720
 #   define NV097_SET_VERTEX_DATA_ARRAY_FORMAT                 0x00001760
 #       define NV097_SET_VERTEX_DATA_ARRAY_FORMAT_TYPE            0x0000000F
@ -1271,7 +1252,6 @@
 #   define NV097_SET_CLEAR_RECT_HORIZONTAL                    0x00001D98
 #   define NV097_SET_CLEAR_RECT_VERTICAL                      0x00001D9C
 #   define NV097_SET_SPECULAR_FOG_FACTOR                      0x00001E20
-#   define NV097_SET_SPECULAR_PARAMS_BACK                     0x00001E28
 #   define NV097_SET_COMBINER_COLOR_OCW                       0x00001E40
 #   define NV097_SET_COMBINER_CONTROL                         0x00001E60
 #   define NV097_SET_SHADOW_ZSLOPE_THRESHOLD                  0x00001E68
@ -1473,22 +1453,7 @@
 #define NV2A_NUM_SUBCHANNELS 8
 #define NV2A_CACHE1_SIZE 128

-/* This is a multi-use limit. Testing on an Xbox 1.0, it is possible to send
- * arrays of at least 0x0FFFFF elements without issue, however sending
- * NV097_DRAW_ARRAYS with a start value > 0xFFFF raises an exception implying
- * that there may be a vertex limit. Since xemu uses batch length for vertex
- * elements in NV097_INLINE_ARRAY the size should ideally be high enough to
- * accommodate 0xFFFF vertices with maximum attributes specified.
- *
- * Retail games are known to send at least 0x410FA elements in a single draw, so
- * a somewhat larger value is selected to balance memory use with real-world
- * limits.
- *
- * NV2A_MAX_BATCH_LENGTH_V2 is the previous limit, for migration.
- * FIXME: Remove NV2A_MAX_BATCH_LENGTH_V2 at some point in the future.
- */
-#define NV2A_MAX_BATCH_LENGTH 0x07FFFF
-#define NV2A_MAX_BATCH_LENGTH_V2 0x1FFFF
+#define NV2A_MAX_BATCH_LENGTH 0x1FFFF
 #define NV2A_VERTEXSHADER_ATTRIBUTES 16
 #define NV2A_MAX_TEXTURES 4

--- a/hw/xbox/nv2a/pgraph/debug_renderdoc.c
+++ b/hw/xbox/nv2a/pgraph/debug_renderdoc.c
@ -36,7 +36,6 @@
 static RENDERDOC_API_1_6_0 *rdoc_api = NULL;

 int renderdoc_capture_frames = 0;
-bool renderdoc_trace_frames = false;

 void nv2a_dbg_renderdoc_init(void)
 {
@ -90,8 +89,7 @@ bool nv2a_dbg_renderdoc_available(void)
    return rdoc_api != NULL;
 }

-void nv2a_dbg_renderdoc_capture_frames(int num_frames, bool trace)
+void nv2a_dbg_renderdoc_capture_frames(int num_frames)
 {
    renderdoc_capture_frames += num_frames;
-    renderdoc_trace_frames = trace;
 }
--- a/hw/xbox/nv2a/pgraph/gl/constants.h
+++ b/hw/xbox/nv2a/pgraph/gl/constants.h
@ -298,7 +298,7 @@ static const SurfaceFormatInfo kelvin_surface_color_format_gl_map[] = {
    [NV097_SET_SURFACE_FORMAT_COLOR_LE_B8] =
        {1, GL_R8, GL_RED, GL_UNSIGNED_BYTE, GL_COLOR_ATTACHMENT0},
    [NV097_SET_SURFACE_FORMAT_COLOR_LE_G8B8] =
-        {2, GL_RG8, GL_RG, GL_UNSIGNED_BYTE, GL_COLOR_ATTACHMENT0},
+        {2, GL_RG8, GL_RG, GL_UNSIGNED_SHORT, GL_COLOR_ATTACHMENT0},
 };

 static const SurfaceFormatInfo kelvin_surface_zeta_float_format_gl_map[] = {
--- a/hw/xbox/nv2a/pgraph/gl/debug.c
+++ b/hw/xbox/nv2a/pgraph/gl/debug.c
@ -29,8 +29,6 @@
 #include <assert.h>

 #ifdef CONFIG_RENDERDOC
-#include "trace/control.h"
-
 #pragma GCC diagnostic ignored "-Wstrict-prototypes"
 #include "thirdparty/renderdoc_app.h"
 #endif
@ -156,8 +154,7 @@ void gl_debug_frame_terminator(void)
        RENDERDOC_API_1_6_0 *rdoc_api = nv2a_dbg_renderdoc_get_api();

        if (rdoc_api->IsTargetControlConnected()) {
-            bool capturing = rdoc_api->IsFrameCapturing();
-            if (capturing && renderdoc_capture_frames == 0) {
+            if (rdoc_api->IsFrameCapturing()) {
                rdoc_api->EndFrameCapture(NULL, NULL);
                GLenum error = glGetError();
                if (error != GL_NO_ERROR) {
@ -165,16 +162,8 @@ void gl_debug_frame_terminator(void)
                            "Renderdoc EndFrameCapture triggered GL error 0x%X - ignoring\n",
                            error);
                }
-                if (renderdoc_trace_frames) {
-                    trace_enable_events("-nv2a_pgraph_*");
-                    renderdoc_trace_frames = false;
-                }
            }
            if (renderdoc_capture_frames > 0) {
-                if (!capturing) {
-                    if (renderdoc_trace_frames) {
-                        trace_enable_events("nv2a_pgraph_*");
-                    }
                rdoc_api->StartFrameCapture(NULL, NULL);
                GLenum error = glGetError();
                if (error != GL_NO_ERROR) {
@ -182,7 +171,6 @@ void gl_debug_frame_terminator(void)
                            "Renderdoc StartFrameCapture triggered GL error 0x%X - ignoring\n",
                            error);
                }
-                }
                --renderdoc_capture_frames;
            }
        }
--- a/hw/xbox/nv2a/pgraph/gl/debug.h
+++ b/hw/xbox/nv2a/pgraph/gl/debug.h
@ -32,10 +32,10 @@
 #include "config-host.h"

 void gl_debug_initialize(void);
-void gl_debug_message(bool cc, const char *fmt, ...) __attribute__ ((format (printf, 2, 3)));
-void gl_debug_group_begin(const char *fmt, ...) __attribute__ ((format (printf, 1, 2)));
+void gl_debug_message(bool cc, const char *fmt, ...);
+void gl_debug_group_begin(const char *fmt, ...);
 void gl_debug_group_end(void);
-void gl_debug_label(GLenum target, GLuint name, const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
+void gl_debug_label(GLenum target, GLuint name, const char *fmt, ...);
 void gl_debug_frame_terminator(void);

 # define NV2A_GL_DPRINTF(cc, format, ...) \
--- a/hw/xbox/nv2a/pgraph/gl/display.c
+++ b/hw/xbox/nv2a/pgraph/gl/display.c
@ -68,7 +68,7 @@ void pgraph_gl_init_display(NV2AState *d)
        "{\n"
        "    vec2 texCoord = gl_FragCoord.xy/display_size;\n"
        "    float rel = display_size.y/textureSize(tex, 0).y/line_offset;\n"
-        "    texCoord.y = rel*(1.0f - texCoord.y);"
+        "    texCoord.y = 1 + rel*(texCoord.y - 1);"
        "    out_Color.rgba = texture(tex, texCoord);\n"
        "    if (pvideo_enable) {\n"
        "        vec2 screenCoord = gl_FragCoord.xy - 0.5;\n"
--- a/hw/xbox/nv2a/pgraph/gl/draw.c
+++ b/hw/xbox/nv2a/pgraph/gl/draw.c
@ -92,6 +92,7 @@ void pgraph_gl_clear_surface(NV2AState *d, uint32_t parameter)
                 scissor_height = ymax - ymin + 1;
    pgraph_apply_anti_aliasing_factor(pg, &xmin, &ymin);
    pgraph_apply_anti_aliasing_factor(pg, &scissor_width, &scissor_height);
+    ymin = pg->surface_binding_dim.height - (ymin + scissor_height);

    NV2A_DPRINTF("Translated clear rect to %d,%d - %d,%d\n", xmin, ymin,
                 xmin + scissor_width - 1, ymin + scissor_height - 1);
@ -203,10 +204,9 @@ void pgraph_gl_draw_begin(NV2AState *d)
    }

    /* Front-face select */
-    /* Winding is reverse here because clip-space y-coordinates are inverted */
    glFrontFace(pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER)
                    & NV_PGRAPH_SETUPRASTER_FRONTFACE
-                        ? GL_CW : GL_CCW);
+                        ? GL_CCW : GL_CW);

    /* Polygon offset */
    /* FIXME: GL implementation-specific, maybe do this in VS? */
@ -340,6 +340,7 @@ void pgraph_gl_draw_begin(NV2AState *d)

    pgraph_apply_anti_aliasing_factor(pg, &xmin, &ymin);
    pgraph_apply_anti_aliasing_factor(pg, &scissor_width, &scissor_height);
+    ymin = pg->surface_binding_dim.height - (ymin + scissor_height);
    pgraph_apply_scaling_factor(pg, &xmin, &ymin);
    pgraph_apply_scaling_factor(pg, &scissor_width, &scissor_height);

--- a/hw/xbox/nv2a/pgraph/gl/renderer.h
+++ b/hw/xbox/nv2a/pgraph/gl/renderer.h
@ -33,7 +33,7 @@
 #include "hw/xbox/nv2a/nv2a_regs.h"
 #include "hw/xbox/nv2a/pgraph/surface.h"
 #include "hw/xbox/nv2a/pgraph/texture.h"
-#include "hw/xbox/nv2a/pgraph/glsl/shaders.h"
+#include "hw/xbox/nv2a/pgraph/shaders.h"

 #include "gloffscreen.h"
 #include "constants.h"
@ -82,30 +82,6 @@ typedef struct TextureBinding {
    GLuint gl_texture;
 } TextureBinding;

-typedef struct ShaderModuleCacheKey {
-    GLenum kind;
-    union {
-        struct {
-            VshState state;
-            GenVshGlslOptions glsl_opts;
-        } vsh;
-        struct {
-            GeomState state;
-            GenGeomGlslOptions glsl_opts;
-        } geom;
-        struct {
-            PshState state;
-            GenPshGlslOptions glsl_opts;
-        } psh;
-    };
-} ShaderModuleCacheKey;
-
-typedef struct ShaderModuleCacheEntry {
-    LruNode node;
-    ShaderModuleCacheKey key;
-    GLuint gl_shader;
-} ShaderModuleCacheEntry;
-
 typedef struct ShaderBinding {
    LruNode node;
    bool initialized;
@ -120,10 +96,36 @@ typedef struct ShaderBinding {
    GLuint gl_program;
    GLenum gl_primitive_mode;

-    struct {
-        PshUniformLocs psh;
-        VshUniformLocs vsh;
-    } uniform_locs;
+    GLint psh_constant_loc[9][2];
+    GLint alpha_ref_loc;
+
+    GLint bump_mat_loc[NV2A_MAX_TEXTURES];
+    GLint bump_scale_loc[NV2A_MAX_TEXTURES];
+    GLint bump_offset_loc[NV2A_MAX_TEXTURES];
+    GLint tex_scale_loc[NV2A_MAX_TEXTURES];
+
+    GLint surface_size_loc;
+    GLint clip_range_loc;
+    GLint depth_offset_loc;
+
+    GLint vsh_constant_loc[NV2A_VERTEXSHADER_CONSTANTS];
+    uint32_t vsh_constants[NV2A_VERTEXSHADER_CONSTANTS][4];
+
+    GLint inv_viewport_loc;
+    GLint ltctxa_loc[NV2A_LTCTXA_COUNT];
+    GLint ltctxb_loc[NV2A_LTCTXB_COUNT];
+    GLint ltc1_loc[NV2A_LTC1_COUNT];
+
+    GLint fog_color_loc;
+    GLint fog_param_loc;
+    GLint light_infinite_half_vector_loc[NV2A_MAX_LIGHTS];
+    GLint light_infinite_direction_loc[NV2A_MAX_LIGHTS];
+    GLint light_local_position_loc[NV2A_MAX_LIGHTS];
+    GLint light_local_attenuation_loc[NV2A_MAX_LIGHTS];
+
+    GLint clip_region_loc[8];
+
+    GLint material_alpha_loc;
 } ShaderBinding;

 typedef struct VertexKey {
@ -199,9 +201,6 @@ typedef struct PGRAPHGLState {
    QemuMutex shader_cache_lock;
    QemuThread shader_disk_thread;

-    Lru shader_module_cache;
-    ShaderModuleCacheEntry *shader_module_cache_entries;
-
    unsigned int zpass_pixel_count_result;
    unsigned int gl_zpass_pixel_count_query_count;
    GLuint *gl_zpass_pixel_count_queries;
--- a/hw/xbox/nv2a/pgraph/gl/shaders.c
+++ b/hw/xbox/nv2a/pgraph/gl/shaders.c
@ -22,9 +22,14 @@
 #include "qemu/osdep.h"
 #include "qemu/fast-hash.h"
 #include "qemu/mstring.h"
+#include <locale.h>

 #include "xemu-version.h"
 #include "ui/xemu-settings.h"
+#include "hw/xbox/nv2a/pgraph/glsl/geom.h"
+#include "hw/xbox/nv2a/pgraph/glsl/vsh.h"
+#include "hw/xbox/nv2a/pgraph/glsl/psh.h"
+#include "hw/xbox/nv2a/pgraph/shaders.h"
 #include "hw/xbox/nv2a/pgraph/util.h"
 #include "debug.h"
 #include "renderer.h"
@ -95,131 +100,154 @@ static GLuint create_gl_shader(GLenum gl_shader_type,
    return shader;
 }

-static void set_texture_sampler_uniforms(ShaderBinding *binding)
+static void update_shader_constant_locations(ShaderBinding *binding)
 {
+    char tmp[64];
+
+    /* set texture samplers */
    for (int i = 0; i < NV2A_MAX_TEXTURES; i++) {
        char samplerName[16];
        snprintf(samplerName, sizeof(samplerName), "texSamp%d", i);
-        GLint texSampLoc =
-            glGetUniformLocation(binding->gl_program, samplerName);
+        GLint texSampLoc = glGetUniformLocation(binding->gl_program, samplerName);
        if (texSampLoc >= 0) {
            glUniform1i(texSampLoc, i);
        }
    }
-}

-static void update_shader_uniform_locs(ShaderBinding *binding)
-{
-    char tmp[64];
-
-    for (int i = 0; i < ARRAY_SIZE(binding->uniform_locs.vsh); i++) {
-        const char *name = VshUniformInfo[i].name;
-        if (VshUniformInfo[i].count > 1) {
-            snprintf(tmp, sizeof(tmp), "%s[0]", name);
-            name = tmp;
-        }
-        binding->uniform_locs.vsh[i] = glGetUniformLocation(binding->gl_program, name);
+    /* validate the program */
+    glValidateProgram(binding->gl_program);
+    GLint valid = 0;
+    glGetProgramiv(binding->gl_program, GL_VALIDATE_STATUS, &valid);
+    if (!valid) {
+        GLchar log[1024];
+        glGetProgramInfoLog(binding->gl_program, 1024, NULL, log);
+        fprintf(stderr, "nv2a: shader validation failed: %s\n", log);
+        abort();
    }

-    for (int i = 0; i < ARRAY_SIZE(binding->uniform_locs.psh); i++) {
-        const char *name = PshUniformInfo[i].name;
-        if (PshUniformInfo[i].count > 1) {
-            snprintf(tmp, sizeof(tmp), "%s[0]", name);
-            name = tmp;
+    /* lookup fragment shader uniforms */
+    for (int i = 0; i < 9; i++) {
+        for (int j = 0; j < 2; j++) {
+            snprintf(tmp, sizeof(tmp), "c%d_%d", j, i);
+            binding->psh_constant_loc[i][j] = glGetUniformLocation(binding->gl_program, tmp);
        }
-        binding->uniform_locs.psh[i] = glGetUniformLocation(binding->gl_program, name);
+    }
+    binding->alpha_ref_loc = glGetUniformLocation(binding->gl_program, "alphaRef");
+    for (int i = 1; i < NV2A_MAX_TEXTURES; i++) {
+        snprintf(tmp, sizeof(tmp), "bumpMat%d", i);
+        binding->bump_mat_loc[i] = glGetUniformLocation(binding->gl_program, tmp);
+        snprintf(tmp, sizeof(tmp), "bumpScale%d", i);
+        binding->bump_scale_loc[i] = glGetUniformLocation(binding->gl_program, tmp);
+        snprintf(tmp, sizeof(tmp), "bumpOffset%d", i);
+        binding->bump_offset_loc[i] = glGetUniformLocation(binding->gl_program, tmp);
+    }
+
+    for (int i = 0; i < NV2A_MAX_TEXTURES; i++) {
+        snprintf(tmp, sizeof(tmp), "texScale%d", i);
+        binding->tex_scale_loc[i] = glGetUniformLocation(binding->gl_program, tmp);
+    }
+
+    /* lookup vertex shader uniforms */
+    for (int i = 0; i < NV2A_VERTEXSHADER_CONSTANTS; i++) {
+        snprintf(tmp, sizeof(tmp), "c[%d]", i);
+        binding->vsh_constant_loc[i] = glGetUniformLocation(binding->gl_program, tmp);
+    }
+    binding->surface_size_loc = glGetUniformLocation(binding->gl_program, "surfaceSize");
+    binding->clip_range_loc = glGetUniformLocation(binding->gl_program, "clipRange");
+    binding->depth_offset_loc = glGetUniformLocation(binding->gl_program, "depthOffset");
+    binding->fog_color_loc = glGetUniformLocation(binding->gl_program, "fogColor");
+    binding->fog_param_loc = glGetUniformLocation(binding->gl_program, "fogParam");
+
+    binding->inv_viewport_loc = glGetUniformLocation(binding->gl_program, "invViewport");
+    for (int i = 0; i < NV2A_LTCTXA_COUNT; i++) {
+        snprintf(tmp, sizeof(tmp), "ltctxa[%d]", i);
+        binding->ltctxa_loc[i] = glGetUniformLocation(binding->gl_program, tmp);
+    }
+    for (int i = 0; i < NV2A_LTCTXB_COUNT; i++) {
+        snprintf(tmp, sizeof(tmp), "ltctxb[%d]", i);
+        binding->ltctxb_loc[i] = glGetUniformLocation(binding->gl_program, tmp);
+    }
+    for (int i = 0; i < NV2A_LTC1_COUNT; i++) {
+        snprintf(tmp, sizeof(tmp), "ltc1[%d]", i);
+        binding->ltc1_loc[i] = glGetUniformLocation(binding->gl_program, tmp);
+    }
+    for (int i = 0; i < NV2A_MAX_LIGHTS; i++) {
+        snprintf(tmp, sizeof(tmp), "lightInfiniteHalfVector%d", i);
+        binding->light_infinite_half_vector_loc[i] =
+            glGetUniformLocation(binding->gl_program, tmp);
+        snprintf(tmp, sizeof(tmp), "lightInfiniteDirection%d", i);
+        binding->light_infinite_direction_loc[i] =
+            glGetUniformLocation(binding->gl_program, tmp);
+
+        snprintf(tmp, sizeof(tmp), "lightLocalPosition%d", i);
+        binding->light_local_position_loc[i] = glGetUniformLocation(binding->gl_program, tmp);
+        snprintf(tmp, sizeof(tmp), "lightLocalAttenuation%d", i);
+        binding->light_local_attenuation_loc[i] =
+            glGetUniformLocation(binding->gl_program, tmp);
+    }
+    for (int i = 0; i < 8; i++) {
+        snprintf(tmp, sizeof(tmp), "clipRegion[%d]", i);
+        binding->clip_region_loc[i] = glGetUniformLocation(binding->gl_program, tmp);
+    }
+
+    if (binding->state.fixed_function) {
+        binding->material_alpha_loc =
+            glGetUniformLocation(binding->gl_program, "material_alpha");
+    } else {
+        binding->material_alpha_loc = -1;
    }
 }

-static void shader_module_cache_entry_init(Lru *lru, LruNode *node,
-                                           const void *key)
+static void generate_shaders(ShaderBinding *binding)
 {
-    ShaderModuleCacheEntry *module =
-        container_of(node, ShaderModuleCacheEntry, node);
-    memcpy(&module->key, key, sizeof(ShaderModuleCacheKey));
-
-    const char *kind_str;
-    MString *code;
-
-    switch (module->key.kind) {
-    case GL_VERTEX_SHADER:
-        kind_str = "vertex shader";
-        code = pgraph_glsl_gen_vsh(&module->key.vsh.state,
-                                   module->key.vsh.glsl_opts);
-        break;
-    case GL_GEOMETRY_SHADER:
-        kind_str = "geometry shader";
-        code = pgraph_glsl_gen_geom(&module->key.geom.state,
-                                    module->key.geom.glsl_opts);
-        break;
-    case GL_FRAGMENT_SHADER:
-        kind_str = "fragment shader";
-        code = pgraph_glsl_gen_psh(&module->key.psh.state,
-                                   module->key.psh.glsl_opts);
-        break;
-    default:
-        assert(!"Invalid shader module kind");
-        kind_str = "unknown";
-        code = NULL;
+    char *previous_numeric_locale = setlocale(LC_NUMERIC, NULL);
+    if (previous_numeric_locale) {
+        previous_numeric_locale = g_strdup(previous_numeric_locale);
    }

-    module->gl_shader =
-        create_gl_shader(module->key.kind, mstring_get_str(code), kind_str);
-    mstring_unref(code);
-}
-
-static void shader_module_cache_entry_post_evict(Lru *lru, LruNode *node)
-{
-    ShaderModuleCacheEntry *module =
-        container_of(node, ShaderModuleCacheEntry, node);
-    glDeleteShader(module->gl_shader);
-}
-
-static bool shader_module_cache_entry_compare(Lru *lru, LruNode *node,
-                                              const void *key)
-{
-    ShaderModuleCacheEntry *module =
-        container_of(node, ShaderModuleCacheEntry, node);
-    return memcmp(&module->key, key, sizeof(ShaderModuleCacheKey));
-}
-
-static GLuint get_shader_module_for_key(PGRAPHGLState *r,
-                                        const ShaderModuleCacheKey *key)
-{
-    uint64_t hash = fast_hash((void *)key, sizeof(ShaderModuleCacheKey));
-    LruNode *node = lru_lookup(&r->shader_module_cache, hash, key);
-    ShaderModuleCacheEntry *module =
-        container_of(node, ShaderModuleCacheEntry, node);
-    return module->gl_shader;
-}
-
-static void generate_shaders(PGRAPHGLState *r, ShaderBinding *binding)
-{
+    /* Ensure numeric values are printed with '.' radix, no grouping */
+    setlocale(LC_NUMERIC, "C");
    GLuint program = glCreateProgram();

    ShaderState *state = &binding->state;
-    ShaderModuleCacheKey key;

-    bool need_geometry_shader = pgraph_glsl_need_geom(&state->geom);
-    if (need_geometry_shader) {
-        memset(&key, 0, sizeof(key));
-        key.kind = GL_GEOMETRY_SHADER;
-        key.geom.state = state->geom;
-        glAttachShader(program, get_shader_module_for_key(r, &key));
+    /* Create an optional geometry shader and find primitive type */
+    GLenum gl_primitive_mode =
+        get_gl_primitive_mode(state->polygon_front_mode, state->primitive_mode);
+    MString* geometry_shader_code =
+        pgraph_gen_geom_glsl(state->polygon_front_mode,
+                                 state->polygon_back_mode,
+                                 state->primitive_mode,
+                                 state->smooth_shading,
+                                 false);
+    if (geometry_shader_code) {
+        const char* geometry_shader_code_str =
+             mstring_get_str(geometry_shader_code);
+        GLuint geometry_shader = create_gl_shader(GL_GEOMETRY_SHADER,
+                                                  geometry_shader_code_str,
+                                                  "geometry shader");
+        glAttachShader(program, geometry_shader);
+        mstring_unref(geometry_shader_code);
    }

    /* create the vertex shader */
-    memset(&key, 0, sizeof(key));
-    key.kind = GL_VERTEX_SHADER;
-    key.vsh.state = state->vsh;
-    key.vsh.glsl_opts.prefix_outputs = need_geometry_shader;
-    glAttachShader(program, get_shader_module_for_key(r, &key));
+    MString *vertex_shader_code =
+        pgraph_gen_vsh_glsl(state, geometry_shader_code != NULL);
+    GLuint vertex_shader = create_gl_shader(GL_VERTEX_SHADER,
+                                            mstring_get_str(vertex_shader_code),
+                                            "vertex shader");
+    glAttachShader(program, vertex_shader);
+    mstring_unref(vertex_shader_code);

    /* generate a fragment shader from register combiners */
-    memset(&key, 0, sizeof(key));
-    key.kind = GL_FRAGMENT_SHADER;
-    key.psh.state = state->psh;
-    glAttachShader(program, get_shader_module_for_key(r, &key));
+    MString *fragment_shader_code = pgraph_gen_psh_glsl(state->psh);
+    const char *fragment_shader_code_str =
+        mstring_get_str(fragment_shader_code);
+    GLuint fragment_shader = create_gl_shader(GL_FRAGMENT_SHADER,
+                                              fragment_shader_code_str,
+                                              "fragment shader");
+    glAttachShader(program, fragment_shader);
+    mstring_unref(fragment_shader_code);

    /* link the program */
    glLinkProgram(program);
@ -234,25 +262,15 @@ static void generate_shaders(PGRAPHGLState *r, ShaderBinding *binding)

    glUseProgram(program);

-    binding->gl_program = program;
-    binding->gl_primitive_mode = get_gl_primitive_mode(
-        state->geom.polygon_front_mode, state->geom.primitive_mode);
    binding->initialized = true;
+    binding->gl_program = program;
+    binding->gl_primitive_mode = gl_primitive_mode;
+    update_shader_constant_locations(binding);

-    set_texture_sampler_uniforms(binding);
-
-    /* validate the program */
-    GLint valid = 0;
-    glValidateProgram(program);
-    glGetProgramiv(program, GL_VALIDATE_STATUS, &valid);
-    if (!valid) {
-        GLchar log[1024];
-        glGetProgramInfoLog(program, 1024, NULL, log);
-        fprintf(stderr, "nv2a: shader validation failed: %s\n", log);
-        abort();
+    if (previous_numeric_locale) {
+        setlocale(LC_NUMERIC, previous_numeric_locale);
+        g_free(previous_numeric_locale);
    }
-
-    update_shader_uniform_locs(binding);
 }

 static const char *shader_gl_vendor = NULL;
@ -328,19 +346,6 @@ bool pgraph_gl_shader_load_from_memory(ShaderBinding *binding)
        return false;
    }

-    glUseProgram(gl_program);
-
-    g_free(binding->program);
-
-    binding->program = NULL;
-    binding->gl_program = gl_program;
-    binding->gl_primitive_mode =
-        get_gl_primitive_mode(binding->state.geom.polygon_front_mode,
-                              binding->state.geom.primitive_mode);
-    binding->initialized = true;
-
-    set_texture_sampler_uniforms(binding);
-
    glValidateProgram(gl_program);
    GLint valid = 0;
    glGetProgramiv(gl_program, GL_VALIDATE_STATUS, &valid);
@ -352,7 +357,17 @@ bool pgraph_gl_shader_load_from_memory(ShaderBinding *binding)
        return false;
    }

-    update_shader_uniform_locs(binding);
+    glUseProgram(gl_program);
+
+    binding->gl_program = gl_program;
+    binding->gl_primitive_mode = get_gl_primitive_mode(
+        binding->state.polygon_front_mode, binding->state.primitive_mode);
+    binding->initialized = true;
+
+    g_free(binding->program);
+    binding->program = NULL;
+
+    update_shader_constant_locations(binding);

    return true;
 }
@ -492,7 +507,7 @@ static void *shader_reload_lru_from_disk(void *arg)
    return NULL;
 }

-static void shader_cache_entry_init(Lru *lru, LruNode *node, const void *state)
+static void shader_cache_entry_init(Lru *lru, LruNode *node, void *state)
 {
    ShaderBinding *binding = container_of(node, ShaderBinding, node);
    memcpy(&binding->state, state, sizeof(ShaderState));
@ -522,7 +537,7 @@ static void shader_cache_entry_post_evict(Lru *lru, LruNode *node)
    memset(&binding->state, 0, sizeof(ShaderState));
 }

-static bool shader_cache_entry_compare(Lru *lru, LruNode *node, const void *key)
+static bool shader_cache_entry_compare(Lru *lru, LruNode *node, void *key)
 {
    ShaderBinding *binding = container_of(node, ShaderBinding, node);
    return memcmp(&binding->state, key, sizeof(ShaderState));
@ -556,20 +571,6 @@ void pgraph_gl_init_shaders(PGRAPHState *pg)

    qemu_thread_create(&r->shader_disk_thread, "pgraph.renderer_state->shader_cache",
                       shader_reload_lru_from_disk, pg, QEMU_THREAD_JOINABLE);
-
-    /* FIXME: Make this configurable */
-    const size_t shader_module_cache_size = 50*1024;
-    lru_init(&r->shader_module_cache);
-    r->shader_module_cache_entries =
-        g_malloc_n(shader_module_cache_size, sizeof(ShaderModuleCacheEntry));
-    assert(r->shader_module_cache_entries != NULL);
-    for (int i = 0; i < shader_module_cache_size; i++) {
-        lru_add_free(&r->shader_module_cache, &r->shader_module_cache_entries[i].node);
-    }
-
-    r->shader_module_cache.init_node = shader_module_cache_entry_init;
-    r->shader_module_cache.compare_nodes = shader_module_cache_entry_compare;
-    r->shader_module_cache.post_node_evict = shader_module_cache_entry_post_evict;
 }

 void pgraph_gl_finalize_shaders(PGRAPHState *pg)
@ -581,10 +582,6 @@ void pgraph_gl_finalize_shaders(PGRAPHState *pg)
    free(r->shader_cache_entries);
    r->shader_cache_entries = NULL;

-    lru_flush(&r->shader_module_cache);
-    g_free(r->shader_module_cache_entries);
-    r->shader_module_cache_entries = NULL;
-
    qemu_mutex_destroy(&r->shader_cache_lock);
 }

@ -688,72 +685,341 @@ void pgraph_gl_shader_cache_to_disk(ShaderBinding *binding)
    qemu_thread_create(binding->save_thread, name, shader_write_to_disk, binding, QEMU_THREAD_JOINABLE);
 }

-static void apply_uniform_updates(const UniformInfo *info, int *locs,
-                                  void *values, size_t count)
-{
-    for (int i = 0; i < count; i++) {
-        if (locs[i] == -1) {
-            continue;
-        }
-
-        void *value = (char*)values + info[i].val_offs;
-
-        switch (info[i].type) {
-        case UniformElementType_uint:
-            glUniform1uiv(locs[i], info[i].count, value);
-            break;
-        case UniformElementType_int:
-            glUniform1iv(locs[i], info[i].count, value);
-            break;
-        case UniformElementType_ivec4:
-            glUniform4iv(locs[i], info[i].count, value);
-            break;
-        case UniformElementType_float:
-            glUniform1fv(locs[i], info[i].count, value);
-            break;
-        case UniformElementType_vec2:
-            glUniform2fv(locs[i], info[i].count, value);
-            break;
-        case UniformElementType_vec3:
-            glUniform3fv(locs[i], info[i].count, value);
-            break;
-        case UniformElementType_vec4:
-            glUniform4fv(locs[i], info[i].count, value);
-            break;
-        case UniformElementType_mat2:
-            glUniformMatrix2fv(locs[i], info[i].count, GL_FALSE, value);
-            break;
-        default:
-            g_assert_not_reached();
-        }
-    }
-
-    assert(glGetError() == GL_NO_ERROR);
-}
-
-// FIXME: Dirty tracking
-// FIXME: Consider UBO to align with VK renderer
-static void update_shader_uniforms(PGRAPHState *pg, ShaderBinding *binding)
+static void shader_update_constants(PGRAPHState *pg, ShaderBinding *binding,
+                                    bool binding_changed)
 {
    PGRAPHGLState *r = pg->gl_renderer_state;
+    int i, j;

-    VshUniformValues vsh_values;
-    pgraph_glsl_set_vsh_uniform_values(pg, &binding->state.vsh,
-                                  binding->uniform_locs.vsh, &vsh_values);
-    apply_uniform_updates(VshUniformInfo, binding->uniform_locs.vsh,
-                          &vsh_values, VshUniform__COUNT);
+    /* update combiner constants */
+    for (i = 0; i < 9; i++) {
+        uint32_t constant[2];
+        if (i == 8) {
+            /* final combiner */
+            constant[0] = pgraph_reg_r(pg, NV_PGRAPH_SPECFOGFACTOR0);
+            constant[1] = pgraph_reg_r(pg, NV_PGRAPH_SPECFOGFACTOR1);
+        } else {
+            constant[0] = pgraph_reg_r(pg, NV_PGRAPH_COMBINEFACTOR0 + i * 4);
+            constant[1] = pgraph_reg_r(pg, NV_PGRAPH_COMBINEFACTOR1 + i * 4);
+        }

-    PshUniformValues psh_values;
-    pgraph_glsl_set_psh_uniform_values(pg, binding->uniform_locs.psh, &psh_values);
-
-    for (int i = 0; i < 4; i++) {
-        if (r->texture_binding[i] != NULL) {
-            float scale = r->texture_binding[i]->scale;
-            psh_values.texScale[i] = scale;
+        for (j = 0; j < 2; j++) {
+            GLint loc = binding->psh_constant_loc[i][j];
+            if (loc != -1) {
+                float value[4];
+                pgraph_argb_pack32_to_rgba_float(constant[j], value);
+                glUniform4fv(loc, 1, value);
            }
        }
-    apply_uniform_updates(PshUniformInfo, binding->uniform_locs.psh,
-                          &psh_values, PshUniform__COUNT);
+    }
+    if (binding->alpha_ref_loc != -1) {
+        int alpha_ref = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0),
+                                   NV_PGRAPH_CONTROL_0_ALPHAREF);
+        glUniform1i(binding->alpha_ref_loc, alpha_ref);
+    }
+
+
+    /* For each texture stage */
+    for (i = 0; i < NV2A_MAX_TEXTURES; i++) {
+        GLint loc;
+
+        /* Bump luminance only during stages 1 - 3 */
+        if (i > 0) {
+            loc = binding->bump_mat_loc[i];
+            if (loc != -1) {
+                uint32_t m_u32[4];
+                m_u32[0] = pgraph_reg_r(pg, NV_PGRAPH_BUMPMAT00 + 4 * (i - 1));
+                m_u32[1] = pgraph_reg_r(pg, NV_PGRAPH_BUMPMAT01 + 4 * (i - 1));
+                m_u32[2] = pgraph_reg_r(pg, NV_PGRAPH_BUMPMAT10 + 4 * (i - 1));
+                m_u32[3] = pgraph_reg_r(pg, NV_PGRAPH_BUMPMAT11 + 4 * (i - 1));
+                float m[4];
+                m[0] = *(float*)&m_u32[0];
+                m[1] = *(float*)&m_u32[1];
+                m[2] = *(float*)&m_u32[2];
+                m[3] = *(float*)&m_u32[3];
+                glUniformMatrix2fv(loc, 1, GL_FALSE, m);
+            }
+            loc = binding->bump_scale_loc[i];
+            if (loc != -1) {
+                uint32_t v =
+                    pgraph_reg_r(pg, NV_PGRAPH_BUMPSCALE1 + (i - 1) * 4);
+                glUniform1f(loc, *(float*)&v);
+            }
+            loc = binding->bump_offset_loc[i];
+            if (loc != -1) {
+                uint32_t v =
+                    pgraph_reg_r(pg, NV_PGRAPH_BUMPOFFSET1 + (i - 1) * 4);
+                glUniform1f(loc, *(float*)&v);
+            }
+        }
+
+        loc = r->shader_binding->tex_scale_loc[i];
+        if (loc != -1) {
+            assert(r->texture_binding[i] != NULL);
+            glUniform1f(loc, (float)r->texture_binding[i]->scale);
+        }
+    }
+
+    if (binding->fog_color_loc != -1) {
+        uint32_t fog_color = pgraph_reg_r(pg, NV_PGRAPH_FOGCOLOR);
+        glUniform4f(binding->fog_color_loc,
+                    GET_MASK(fog_color, NV_PGRAPH_FOGCOLOR_RED) / 255.0,
+                    GET_MASK(fog_color, NV_PGRAPH_FOGCOLOR_GREEN) / 255.0,
+                    GET_MASK(fog_color, NV_PGRAPH_FOGCOLOR_BLUE) / 255.0,
+                    GET_MASK(fog_color, NV_PGRAPH_FOGCOLOR_ALPHA) / 255.0);
+    }
+    if (binding->fog_param_loc != -1) {
+        uint32_t v[2];
+        v[0] = pgraph_reg_r(pg, NV_PGRAPH_FOGPARAM0);
+        v[1] = pgraph_reg_r(pg, NV_PGRAPH_FOGPARAM1);
+        glUniform2f(binding->fog_param_loc, *(float *)&v[0], *(float *)&v[1]);
+    }
+
+    float zmax;
+    switch (pg->surface_shape.zeta_format) {
+    case NV097_SET_SURFACE_FORMAT_ZETA_Z16:
+        zmax = pg->surface_shape.z_format ? f16_max : (float)0xFFFF;
+        break;
+    case NV097_SET_SURFACE_FORMAT_ZETA_Z24S8:
+        zmax = pg->surface_shape.z_format ? f24_max : (float)0xFFFFFF;
+        break;
+    default:
+        assert(0);
+    }
+
+    if (binding->state.fixed_function) {
+        /* update lighting constants */
+        struct {
+            uint32_t* v;
+            bool* dirty;
+            GLint* locs;
+            size_t len;
+        } lighting_arrays[] = {
+            {&pg->ltctxa[0][0], &pg->ltctxa_dirty[0], binding->ltctxa_loc, NV2A_LTCTXA_COUNT},
+            {&pg->ltctxb[0][0], &pg->ltctxb_dirty[0], binding->ltctxb_loc, NV2A_LTCTXB_COUNT},
+            {&pg->ltc1[0][0], &pg->ltc1_dirty[0], binding->ltc1_loc, NV2A_LTC1_COUNT},
+        };
+
+        for (i=0; i<ARRAY_SIZE(lighting_arrays); i++) {
+            uint32_t *lighting_v = lighting_arrays[i].v;
+            bool *lighting_dirty = lighting_arrays[i].dirty;
+            GLint *lighting_locs = lighting_arrays[i].locs;
+            size_t lighting_len = lighting_arrays[i].len;
+            for (j=0; j<lighting_len; j++) {
+                if (!lighting_dirty[j] && !binding_changed) continue;
+                GLint loc = lighting_locs[j];
+                if (loc != -1) {
+                    glUniform4fv(loc, 1, (const GLfloat*)&lighting_v[j*4]);
+                }
+                lighting_dirty[j] = false;
+            }
+        }
+
+        for (i = 0; i < NV2A_MAX_LIGHTS; i++) {
+            GLint loc;
+            loc = binding->light_infinite_half_vector_loc[i];
+            if (loc != -1) {
+                glUniform3fv(loc, 1, pg->light_infinite_half_vector[i]);
+            }
+            loc = binding->light_infinite_direction_loc[i];
+            if (loc != -1) {
+                glUniform3fv(loc, 1, pg->light_infinite_direction[i]);
+            }
+
+            loc = binding->light_local_position_loc[i];
+            if (loc != -1) {
+                glUniform3fv(loc, 1, pg->light_local_position[i]);
+            }
+            loc = binding->light_local_attenuation_loc[i];
+            if (loc != -1) {
+                glUniform3fv(loc, 1, pg->light_local_attenuation[i]);
+            }
+        }
+
+        /* estimate the viewport by assuming it matches the surface ... */
+        unsigned int aa_width = 1, aa_height = 1;
+        pgraph_apply_anti_aliasing_factor(pg, &aa_width, &aa_height);
+
+        float m11 = 0.5 * (pg->surface_binding_dim.width/aa_width);
+        float m22 = -0.5 * (pg->surface_binding_dim.height/aa_height);
+        float m33 = zmax;
+        float m41 = *(float*)&pg->vsh_constants[NV_IGRAPH_XF_XFCTX_VPOFF][0];
+        float m42 = *(float*)&pg->vsh_constants[NV_IGRAPH_XF_XFCTX_VPOFF][1];
+
+        float invViewport[16] = {
+            1.0/m11, 0, 0, 0,
+            0, 1.0/m22, 0, 0,
+            0, 0, 1.0/m33, 0,
+            -1.0+m41/m11, 1.0+m42/m22, 0, 1.0
+        };
+
+        if (binding->inv_viewport_loc != -1) {
+            glUniformMatrix4fv(binding->inv_viewport_loc,
+                               1, GL_FALSE, &invViewport[0]);
+        }
+    }
+
+    /* update vertex program constants */
+    for (i=0; i<NV2A_VERTEXSHADER_CONSTANTS; i++) {
+        if (!pg->vsh_constants_dirty[i] && !binding_changed) continue;
+
+        GLint loc = binding->vsh_constant_loc[i];
+        if ((loc != -1) &&
+            memcmp(binding->vsh_constants[i], pg->vsh_constants[i],
+                   sizeof(pg->vsh_constants[1]))) {
+            glUniform4fv(loc, 1, (const GLfloat *)pg->vsh_constants[i]);
+            memcpy(binding->vsh_constants[i], pg->vsh_constants[i],
+                   sizeof(pg->vsh_constants[i]));
+        }
+
+        pg->vsh_constants_dirty[i] = false;
+    }
+
+    if (binding->surface_size_loc != -1) {
+        unsigned int aa_width = 1, aa_height = 1;
+        pgraph_apply_anti_aliasing_factor(pg, &aa_width, &aa_height);
+        glUniform2f(binding->surface_size_loc,
+                    pg->surface_binding_dim.width / aa_width,
+                    pg->surface_binding_dim.height / aa_height);
+    }
+
+    if (binding->clip_range_loc != -1) {
+        uint32_t v[2];
+        v[0] = pgraph_reg_r(pg, NV_PGRAPH_ZCLIPMIN);
+        v[1] = pgraph_reg_r(pg, NV_PGRAPH_ZCLIPMAX);
+        float zclip_min = *(float *)&v[0];
+        float zclip_max = *(float *)&v[1];
+        glUniform4f(binding->clip_range_loc, 0, zmax, zclip_min, zclip_max);
+    }
+
+    if (binding->depth_offset_loc != -1) {
+        float zbias = 0.0f;
+
+        if (pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER) &
+            (NV_PGRAPH_SETUPRASTER_POFFSETFILLENABLE |
+             NV_PGRAPH_SETUPRASTER_POFFSETLINEENABLE |
+             NV_PGRAPH_SETUPRASTER_POFFSETPOINTENABLE)) {
+            uint32_t zbias_u32 = pgraph_reg_r(pg, NV_PGRAPH_ZOFFSETBIAS);
+            zbias = *(float *)&zbias_u32;
+
+            if (pgraph_reg_r(pg, NV_PGRAPH_ZOFFSETFACTOR) != 0 &&
+                (pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0) &
+                 NV_PGRAPH_CONTROL_0_Z_PERSPECTIVE_ENABLE)) {
+                /* TODO: emulate zfactor when z_perspective true, i.e.
+                 * w-buffering. Perhaps calculate an additional offset based on
+                 * triangle orientation in geometry shader and pass the result
+                 * to fragment shader and add it to gl_FragDepth as well.
+                 */
+                NV2A_UNIMPLEMENTED("NV_PGRAPH_ZOFFSETFACTOR for w-buffering");
+            }
+        }
+
+        glUniform1f(binding->depth_offset_loc, zbias);
+    }
+
+    /* Clipping regions */
+    unsigned int max_gl_width = pg->surface_binding_dim.width;
+    unsigned int max_gl_height = pg->surface_binding_dim.height;
+    pgraph_apply_scaling_factor(pg, &max_gl_width, &max_gl_height);
+
+    for (i = 0; i < 8; i++) {
+        uint32_t x = pgraph_reg_r(pg, NV_PGRAPH_WINDOWCLIPX0 + i * 4);
+        unsigned int x_min = GET_MASK(x, NV_PGRAPH_WINDOWCLIPX0_XMIN);
+        unsigned int x_max = GET_MASK(x, NV_PGRAPH_WINDOWCLIPX0_XMAX) + 1;
+        uint32_t y = pgraph_reg_r(pg, NV_PGRAPH_WINDOWCLIPY0 + i * 4);
+        unsigned int y_min = GET_MASK(y, NV_PGRAPH_WINDOWCLIPY0_YMIN);
+        unsigned int y_max = GET_MASK(y, NV_PGRAPH_WINDOWCLIPY0_YMAX) + 1;
+        pgraph_apply_anti_aliasing_factor(pg, &x_min, &y_min);
+        pgraph_apply_anti_aliasing_factor(pg, &x_max, &y_max);
+
+        pgraph_apply_scaling_factor(pg, &x_min, &y_min);
+        pgraph_apply_scaling_factor(pg, &x_max, &y_max);
+
+        /* Translate for the GL viewport origin */
+        int y_min_xlat = MAX((int)max_gl_height - (int)y_max, 0);
+        int y_max_xlat = MIN((int)max_gl_height - (int)y_min, max_gl_height);
+
+        glUniform4i(r->shader_binding->clip_region_loc[i],
+                    x_min, y_min_xlat, x_max, y_max_xlat);
+    }
+
+    if (binding->material_alpha_loc != -1) {
+        glUniform1f(binding->material_alpha_loc, pg->material_alpha);
+    }
+}
+
+/*
+ * Report whether any register or PGRAPH field listed in DIRTY_REGS differs
+ * from the snapshot taken the last time this function reported a change.
+ *
+ * The snapshot lives in function-local static storage (zero-initialized
+ * before the first call). Each call compares the live values against it:
+ *  - no difference: returns false and leaves the snapshot untouched;
+ *  - any difference: refreshes the whole snapshot and returns true.
+ *
+ * Because the snapshot is static, it is shared by every caller.
+ */
+static bool test_shaders_dirty(PGRAPHState *pg)
+{
+    /*
+     * DIRTY_REGS is an X-macro list. It is expanded three times below, each
+     * time with a different definition of CR_x/CF_x: once to declare the
+     * snapshot storage, once to compare, and once to refresh the snapshot.
+     *
+     *   CR_n(reg)       n consecutive 32-bit registers, reg + 4 * i
+     *   CF(src, name)   a single field, snapshotted under `name`
+     *   CFA(src, name)  every element of an array field
+     */
+    #define CR_1(reg) CR_x(reg, 1)
+    #define CR_4(reg) CR_x(reg, 4)
+    #define CR_8(reg) CR_x(reg, 8)
+    #define CF(src, name)  CF_x(typeof(src), (&src), name, 1)
+    #define CFA(src, name) CF_x(typeof(src[0]), src, name, ARRAY_SIZE(src))
+    #define CNAME(name) reg_check__ ## name
+    #define CX_x__define(type, name, x) static type CNAME(name)[x];
+    #define CR_x__define(reg, x) CX_x__define(uint32_t, reg, x)
+    #define CF_x__define(type, src, name, x) CX_x__define(type, name, x)
+    #define CR_x__check(reg, x) \
+        for (int i = 0; i < x; i++) { if (pgraph_reg_r(pg, reg+i*4) != CNAME(reg)[i]) goto dirty; }
+    #define CF_x__check(type, src, name, x) \
+        for (int i = 0; i < x; i++) { if (src[i] != CNAME(name)[i]) goto dirty; }
+    #define CR_x__update(reg, x) \
+        for (int i = 0; i < x; i++) { CNAME(reg)[i] = pgraph_reg_r(pg, reg+i*4); }
+    #define CF_x__update(type, src, name, x) \
+        for (int i = 0; i < x; i++) { CNAME(name)[i] = src[i]; }
+
+    #define DIRTY_REGS \
+        CR_1(NV_PGRAPH_COMBINECTL) \
+        CR_1(NV_PGRAPH_SHADERCTL) \
+        CR_1(NV_PGRAPH_SHADOWCTL) \
+        CR_1(NV_PGRAPH_COMBINESPECFOG0) \
+        CR_1(NV_PGRAPH_COMBINESPECFOG1) \
+        CR_1(NV_PGRAPH_CONTROL_0) \
+        CR_1(NV_PGRAPH_CONTROL_3) \
+        CR_1(NV_PGRAPH_CSV0_C) \
+        CR_1(NV_PGRAPH_CSV0_D) \
+        CR_1(NV_PGRAPH_CSV1_A) \
+        CR_1(NV_PGRAPH_CSV1_B) \
+        CR_1(NV_PGRAPH_SETUPRASTER) \
+        CR_1(NV_PGRAPH_SHADERPROG) \
+        CR_1(NV_PGRAPH_ZCOMPRESSOCCLUDE) \
+        CR_8(NV_PGRAPH_COMBINECOLORI0) \
+        CR_8(NV_PGRAPH_COMBINECOLORO0) \
+        CR_8(NV_PGRAPH_COMBINEALPHAI0) \
+        CR_8(NV_PGRAPH_COMBINEALPHAO0) \
+        CR_8(NV_PGRAPH_COMBINEFACTOR0) \
+        CR_8(NV_PGRAPH_COMBINEFACTOR1) \
+        CR_1(NV_PGRAPH_SHADERCLIPMODE) \
+        CR_4(NV_PGRAPH_TEXCTL0_0) \
+        CR_4(NV_PGRAPH_TEXFMT0) \
+        CR_4(NV_PGRAPH_TEXFILTER0) \
+        CR_8(NV_PGRAPH_WINDOWCLIPX0) \
+        CR_8(NV_PGRAPH_WINDOWCLIPY0) \
+        CF(pg->primitive_mode, primitive_mode) \
+        CF(pg->surface_scale_factor, surface_scale_factor) \
+        CF(pg->compressed_attrs, compressed_attrs) \
+        CFA(pg->texture_matrix_enable, texture_matrix_enable)
+
+    /* Pass 1: declare one static snapshot array per DIRTY_REGS entry. */
+    #define CR_x(reg, x) CR_x__define(reg, x)
+    #define CF_x(type, src, name, x) CF_x__define(type, src, name, x)
+    DIRTY_REGS
+    #undef CR_x
+    #undef CF_x
+
+    /* Pass 2: compare live state to the snapshot; first mismatch -> dirty. */
+    #define CR_x(reg, x) CR_x__check(reg, x)
+    #define CF_x(type, src, name, x) CF_x__check(type, src, name, x)
+    DIRTY_REGS
+    #undef CR_x
+    #undef CF_x
+    return false;
+
+dirty:
+    /* Pass 3: something changed, so refresh every snapshot entry. */
+    #define CR_x(reg, x) CR_x__update(reg, x)
+    #define CF_x(type, src, name, x) CF_x__update(type, src, name, x)
+    DIRTY_REGS
+    #undef CR_x
+    #undef CF_x
+
+    /*
+     * All expansions are done at this point in the text, so drop the helper
+     * macros here; otherwise their short names would stay defined for the
+     * rest of the translation unit.
+     */
+    #undef DIRTY_REGS
+    #undef CF_x__update
+    #undef CR_x__update
+    #undef CF_x__check
+    #undef CR_x__check
+    #undef CF_x__define
+    #undef CR_x__define
+    #undef CX_x__define
+    #undef CNAME
+    #undef CFA
+    #undef CF
+    #undef CR_8
+    #undef CR_4
+    #undef CR_1
+    return true;
 }

 void pgraph_gl_bind_shaders(PGRAPHState *pg)
@ -761,17 +1027,18 @@ void pgraph_gl_bind_shaders(PGRAPHState *pg)
    PGRAPHGLState *r = pg->gl_renderer_state;

    bool binding_changed = false;
-    if (r->shader_binding &&
-        !pgraph_glsl_check_shader_state_dirty(pg, &r->shader_binding->state)) {
+    if (r->shader_binding && !test_shaders_dirty(pg) && !pg->program_data_dirty) {
        nv2a_profile_inc_counter(NV2A_PROF_SHADER_BIND_NOTDIRTY);
-        goto update_uniforms;
+        goto update_constants;
    }

    ShaderBinding *old_binding = r->shader_binding;
-    ShaderState state = pgraph_glsl_get_shader_state(pg);
+    ShaderState state = pgraph_get_shader_state(pg);
+    assert(!state.vulkan);

-    NV2A_GL_DGROUP_BEGIN("%s (%s)", __func__,
-                         state.vsh.is_fixed_function ? "FF" : "PROG");
+    NV2A_GL_DGROUP_BEGIN("%s (VP: %s FFP: %s)", __func__,
+                         state.vertex_program ? "yes" : "no",
+                         state.fixed_function ? "yes" : "no");

    qemu_mutex_lock(&r->shader_cache_lock);

@ -783,7 +1050,7 @@ void pgraph_gl_bind_shaders(PGRAPHState *pg)

    if (!binding->initialized && !pgraph_gl_shader_load_from_memory(binding)) {
        nv2a_profile_inc_counter(NV2A_PROF_SHADER_GEN);
-        generate_shaders(r, binding);
+        generate_shaders(binding);
        if (g_config.perf.cache_shaders) {
            pgraph_gl_shader_cache_to_disk(binding);
        }
@ -802,10 +1069,10 @@ void pgraph_gl_bind_shaders(PGRAPHState *pg)

    NV2A_GL_DGROUP_END();

-update_uniforms:
+update_constants:
    assert(r->shader_binding);
    assert(r->shader_binding->initialized);
-    update_shader_uniforms(pg, r->shader_binding);
+    shader_update_constants(pg, r->shader_binding, binding_changed);
 }

 GLuint pgraph_gl_compile_shader(const char *vs_src, const char *fs_src)
--- a/hw/xbox/nv2a/pgraph/gl/surface.c
+++ b/hw/xbox/nv2a/pgraph/gl/surface.c
@ -137,7 +137,11 @@ static void init_render_to_texture(PGRAPHState *pg)
        "layout(location = 0) out vec4 out_Color;\n"
        "void main()\n"
        "{\n"
-        "    vec2 texCoord = gl_FragCoord.xy / textureSize(tex, 0).xy;\n"
+        "    vec2 texCoord;\n"
+        "    texCoord.x = gl_FragCoord.x;\n"
+        "    texCoord.y = (surface_size.y - gl_FragCoord.y)\n"
+        "                 + (textureSize(tex,0).y - surface_size.y);\n"
+        "    texCoord /= textureSize(tex,0).xy;\n"
        "    out_Color.rgba = texture(tex, texCoord);\n"
        "}\n";

@ -294,7 +298,7 @@ static void render_surface_to_texture_slow(NV2AState *d,
    size_t bufsize = width * height * surface->fmt.bytes_per_pixel;

    uint8_t *buf = g_malloc(bufsize);
-    surface_download_to_buffer(d, surface, false, false, false, buf);
+    surface_download_to_buffer(d, surface, false, true, false, buf);

    width = texture_shape->width;
    height = texture_shape->height;
@ -413,52 +417,16 @@ bool pgraph_gl_check_surface_to_texture_compatibility(
    return false;
 }

-static bool check_surface_overlaps_range(const SurfaceBinding *surface,
-                                         hwaddr range_start, hwaddr range_len)
+static void wait_for_surface_download(SurfaceBinding *e)
 {
-    hwaddr surface_end = surface->vram_addr + surface->size;
-    hwaddr range_end = range_start + range_len;
-    return !(surface->vram_addr >= range_end || range_start >= surface_end);
-}
+    NV2AState *d = g_nv2a;
+    PGRAPHState *pg = &d->pgraph;
+    PGRAPHGLState *r = pg->gl_renderer_state;

-static void surface_access_callback(void *opaque, MemoryRegion *mr, hwaddr addr,
-                                    hwaddr len, bool write)
-{
-    NV2AState *d = (NV2AState *)opaque;
-    qemu_mutex_lock(&d->pgraph.lock);
-
-    PGRAPHGLState *r = d->pgraph.gl_renderer_state;
-    bool wait_for_downloads = false;
-
-    SurfaceBinding *surface;
-    QTAILQ_FOREACH(surface, &r->surfaces, entry) {
-        if (!check_surface_overlaps_range(surface, addr, len)) {
-            continue;
-        }
-
-        hwaddr offset = addr - surface->vram_addr;
-
-        if (write) {
-            trace_nv2a_pgraph_surface_cpu_write(surface->vram_addr, offset);
-        } else {
-            trace_nv2a_pgraph_surface_cpu_read(surface->vram_addr, offset);
-        }
-
-        if (surface->draw_dirty) {
-            surface->download_pending = true;
-            wait_for_downloads = true;
-        }
-
-        if (write) {
-            surface->upload_pending = true;
-        }
-    }
-
-    qemu_mutex_unlock(&d->pgraph.lock);
-
-    if (wait_for_downloads) {
+    if (qatomic_read(&e->draw_dirty)) {
        qemu_mutex_lock(&d->pfifo.lock);
        qemu_event_reset(&r->downloads_complete);
+        qatomic_set(&e->download_pending, true);
        qatomic_set(&r->downloads_pending, true);
        pfifo_kick(d);
        qemu_mutex_unlock(&d->pfifo.lock);
@ -466,44 +434,22 @@ static void surface_access_callback(void *opaque, MemoryRegion *mr, hwaddr addr,
    }
 }

-static void register_cpu_access_callback(NV2AState *d, SurfaceBinding *surface)
+static void surface_access_callback(void *opaque, MemoryRegion *mr, hwaddr addr,
+                                    hwaddr len, bool write)
 {
-    if (tcg_enabled()) {
-        surface->access_cb = mem_access_callback_insert(
-            qemu_get_cpu(0), d->vram, surface->vram_addr, surface->size,
-            &surface_access_callback, d);
+    SurfaceBinding *e = opaque;
+    assert(addr >= e->vram_addr);
+    hwaddr offset = addr - e->vram_addr;
+    assert(offset < e->size);
+
+    if (qatomic_read(&e->draw_dirty)) {
+        trace_nv2a_pgraph_surface_cpu_access(e->vram_addr, offset);
+        wait_for_surface_download(e);
    }
-}

-static void unregister_cpu_access_callback(NV2AState *d,
-                                           SurfaceBinding const *surface)
-{
-    if (tcg_enabled()) {
-        mem_access_callback_remove_by_ref(qemu_get_cpu(0), surface->access_cb);
-    }
-}
-
-static bool check_surfaces_overlap(const SurfaceBinding *surface,
-                                   const SurfaceBinding *other_surface)
-{
-    return check_surface_overlaps_range(surface, other_surface->vram_addr,
-                                        other_surface->size);
-}
-
-static void invalidate_overlapping_surfaces(NV2AState *d, SurfaceBinding *surface)
-{
-    PGRAPHState *pg = &d->pgraph;
-    PGRAPHGLState *r = pg->gl_renderer_state;
-
-    SurfaceBinding *other_surface, *next_surface;
-    QTAILQ_FOREACH_SAFE(other_surface, &r->surfaces, entry, next_surface) {
-        if (check_surfaces_overlap(surface, other_surface)) {
-            trace_nv2a_pgraph_surface_evict_overlapping(
-                other_surface->vram_addr, other_surface->width, other_surface->height,
-                other_surface->pitch);
-            pgraph_gl_surface_download_if_dirty(d, other_surface);
-            pgraph_gl_surface_invalidate(d, other_surface);
-        }
+    if (write && !qatomic_read(&e->upload_pending)) {
+        trace_nv2a_pgraph_surface_cpu_access(e->vram_addr, offset);
+        qatomic_set(&e->upload_pending, true);
    }
 }

@ -515,13 +461,35 @@ static SurfaceBinding *surface_put(NV2AState *d, hwaddr addr,

    assert(pgraph_gl_surface_get(d, addr) == NULL);

-    invalidate_overlapping_surfaces(d, surface_in);
+    SurfaceBinding *surface, *next;
+    uintptr_t e_end = surface_in->vram_addr + surface_in->size - 1;
+    QTAILQ_FOREACH_SAFE(surface, &r->surfaces, entry, next) {
+        uintptr_t s_end = surface->vram_addr + surface->size - 1;
+        bool overlapping = !(surface->vram_addr > e_end
+                             || surface_in->vram_addr > s_end);
+        if (overlapping) {
+            trace_nv2a_pgraph_surface_evict_overlapping(
+                surface->vram_addr, surface->width, surface->height,
+                surface->pitch);
+            pgraph_gl_surface_download_if_dirty(d, surface);
+            pgraph_gl_surface_invalidate(d, surface);
+        }
+    }

    SurfaceBinding *surface_out = g_malloc(sizeof(SurfaceBinding));
    assert(surface_out != NULL);
    *surface_out = *surface_in;

-    register_cpu_access_callback(d, surface_out);
+    if (tcg_enabled()) {
+        qemu_mutex_unlock(&d->pgraph.lock);
+        bql_lock();
+        mem_access_callback_insert(qemu_get_cpu(0),
+            d->vram, surface_out->vram_addr, surface_out->size,
+            &surface_out->access_cb, &surface_access_callback,
+            surface_out);
+        bql_unlock();
+        qemu_mutex_lock(&d->pgraph.lock);
+    }

    QTAILQ_INSERT_TAIL(&r->surfaces, surface_out, entry);

@ -575,7 +543,13 @@ void pgraph_gl_surface_invalidate(NV2AState *d, SurfaceBinding *surface)
        pgraph_gl_unbind_surface(d, false);
    }

-    unregister_cpu_access_callback(d, surface);
+    if (tcg_enabled()) {
+        qemu_mutex_unlock(&d->pgraph.lock);
+        bql_lock();
+        mem_access_callback_remove_by_ref(qemu_get_cpu(0), surface->access_cb);
+        bql_unlock();
+        qemu_mutex_lock(&d->pgraph.lock);
+    }

    glDeleteTextures(1, &surface->gl_buffer);

@ -764,7 +738,7 @@ static void surface_download(NV2AState *d, SurfaceBinding *surface, bool force)

    nv2a_profile_inc_counter(NV2A_PROF_SURF_DOWNLOAD);

-    surface_download_to_buffer(d, surface, true, false, true,
+    surface_download_to_buffer(d, surface, true, true, true,
                               d->vram_ptr + surface->vram_addr);

    memory_region_set_client_dirty(d->vram, surface->vram_addr,
@ -901,26 +875,20 @@ void pgraph_gl_upload_surface_data(NV2AState *d, SurfaceBinding *surface,
                       surface->fmt.bytes_per_pixel);
    }

-    /* FIXME: Replace this scaling */
+    /* FIXME: Replace this flip/scaling */

    // This is VRAM so we can't do this inplace!
-    uint8_t *optimal_buf = buf;
-    unsigned int optimal_pitch = surface->width * surface->fmt.bytes_per_pixel;
-
-    if (surface->pitch != optimal_pitch) {
-        optimal_buf = (uint8_t *)g_malloc(surface->height * optimal_pitch);
-
-        uint8_t *src = buf;
-        uint8_t *dst = optimal_buf;
+    uint8_t *flipped_buf = (uint8_t *)g_malloc(
+        surface->height * surface->width * surface->fmt.bytes_per_pixel);
    unsigned int irow;
    for (irow = 0; irow < surface->height; irow++) {
-            memcpy(dst, src, optimal_pitch);
-            src += surface->pitch;
-            dst += optimal_pitch;
-        }
+        memcpy(&flipped_buf[surface->width * (surface->height - irow - 1)
+                                 * surface->fmt.bytes_per_pixel],
+               &buf[surface->pitch * irow],
+               surface->width * surface->fmt.bytes_per_pixel);
    }

-    uint8_t *gl_read_buf = optimal_buf;
+    uint8_t *gl_read_buf = flipped_buf;
    unsigned int width = surface->width, height = surface->height;

    if (pg->surface_scale_factor > 1) {
@ -928,7 +896,7 @@ void pgraph_gl_upload_surface_data(NV2AState *d, SurfaceBinding *surface,
        pg->scale_buf = (uint8_t *)g_realloc(
            pg->scale_buf, width * height * surface->fmt.bytes_per_pixel);
        gl_read_buf = pg->scale_buf;
-        uint8_t *out = gl_read_buf, *in = optimal_buf;
+        uint8_t *out = gl_read_buf, *in = flipped_buf;
        surface_copy_expand(out, in, surface->width, surface->height,
                            surface->fmt.bytes_per_pixel,
                            d->pgraph.surface_scale_factor);
@ -947,9 +915,7 @@ void pgraph_gl_upload_surface_data(NV2AState *d, SurfaceBinding *surface,
                 height, 0, surface->fmt.gl_format, surface->fmt.gl_type,
                 gl_read_buf);
    glPixelStorei(GL_UNPACK_ALIGNMENT, prev_unpack_alignment);
-    if (optimal_buf != buf) {
-        g_free(optimal_buf);
-    }
+    g_free(flipped_buf);
    if (surface->swizzle) {
        g_free(buf);
    }
--- a/hw/xbox/nv2a/pgraph/gl/texture.c
+++ b/hw/xbox/nv2a/pgraph/gl/texture.c
@ -746,7 +746,7 @@ static void texture_binding_destroy(gpointer data)
 }

 /* functions for texture LRU cache */
-static void texture_cache_entry_init(Lru *lru, LruNode *node, const void *key)
+static void texture_cache_entry_init(Lru *lru, LruNode *node, void *key)
 {
    TextureLruNode *tnode = container_of(node, TextureLruNode, node);
    memcpy(&tnode->key, key, sizeof(TextureKey));
@ -765,8 +765,7 @@ static void texture_cache_entry_post_evict(Lru *lru, LruNode *node)
    }
 }

-static bool texture_cache_entry_compare(Lru *lru, LruNode *node,
-                                        const void *key)
+static bool texture_cache_entry_compare(Lru *lru, LruNode *node, void *key)
 {
    TextureLruNode *tnode = container_of(node, TextureLruNode, node);
    return memcmp(&tnode->key, key, sizeof(TextureKey));
--- a/hw/xbox/nv2a/pgraph/gl/vertex.c
+++ b/hw/xbox/nv2a/pgraph/gl/vertex.c
@ -223,23 +223,23 @@ unsigned int pgraph_gl_bind_inline_array(NV2AState *d)

    nv2a_profile_inc_counter(NV2A_PROF_GEOM_BUFFER_UPDATE_2);
    glBindBuffer(GL_ARRAY_BUFFER, r->gl_inline_array_buffer);
-    GLsizeiptr buffer_size = index_count * vertex_size;
-    glBufferData(GL_ARRAY_BUFFER, buffer_size, NULL, GL_STREAM_DRAW);
-    glBufferSubData(GL_ARRAY_BUFFER, 0, buffer_size, pg->inline_array);
+    glBufferData(GL_ARRAY_BUFFER, NV2A_MAX_BATCH_LENGTH * sizeof(uint32_t),
+                 NULL, GL_STREAM_DRAW);
+    glBufferSubData(GL_ARRAY_BUFFER, 0, index_count * vertex_size, pg->inline_array);
    pgraph_gl_bind_vertex_attributes(d, 0, index_count-1, true, vertex_size,
                                  index_count-1);

    return index_count;
 }

-static void vertex_cache_entry_init(Lru *lru, LruNode *node, const void *key)
+static void vertex_cache_entry_init(Lru *lru, LruNode *node, void *key)
 {
    VertexLruNode *vnode = container_of(node, VertexLruNode, node);
    memcpy(&vnode->key, key, sizeof(struct VertexKey));
    vnode->initialized = false;
 }

-static bool vertex_cache_entry_compare(Lru *lru, LruNode *node, const void *key)
+static bool vertex_cache_entry_compare(Lru *lru, LruNode *node, void *key)
 {
    VertexLruNode *vnode = container_of(node, VertexLruNode, node);
    return memcmp(&vnode->key, key, sizeof(VertexKey));
--- a/hw/xbox/nv2a/pgraph/glsl/common.c
+++ b/hw/xbox/nv2a/pgraph/glsl/common.c
@ -1,7 +1,7 @@
 /*
 * Geforce NV2A PGRAPH GLSL Shader Generator
 *
- * Copyright (c) 2024-2025 Matt Borgerson
+ * Copyright (c) 2024 Matt Borgerson
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
@ -18,15 +18,8 @@
 */

 #include "common.h"
-#include "hw/xbox/nv2a/pgraph/pgraph.h"

-#define DECL_UNIFORM_ELEMENT_NAME(type) #type,
-const char *uniform_element_type_to_str[] = {
-    UNIFORM_ELEMENT_TYPE_X(DECL_UNIFORM_ELEMENT_NAME)
-};
-
-MString *pgraph_glsl_get_vtx_header(MString *out, bool location, bool smooth,
-                                    bool in, bool prefix, bool array)
+MString *pgraph_get_glsl_vtx_header(MString *out, bool location, bool smooth, bool in, bool prefix, bool array)
 {
    const char *smooth_s = "";
    const char *flat_s = "flat ";
@ -61,26 +54,3 @@ MString *pgraph_glsl_get_vtx_header(MString *out, bool location, bool smooth,

    return out;
 }
-
-void pgraph_glsl_set_clip_range_uniform_value(PGRAPHState *pg, float clipRange[4])
-{
-    float zmax;
-    switch (pg->surface_shape.zeta_format) {
-    case NV097_SET_SURFACE_FORMAT_ZETA_Z16:
-        zmax = pg->surface_shape.z_format ? f16_max : (float)0xFFFF;
-        break;
-    case NV097_SET_SURFACE_FORMAT_ZETA_Z24S8:
-        zmax = pg->surface_shape.z_format ? f24_max : (float)0xFFFFFF;
-        break;
-    default:
-        assert(0);
-    }
-
-    uint32_t zclip_min = pgraph_reg_r(pg, NV_PGRAPH_ZCLIPMIN);
-    uint32_t zclip_max = pgraph_reg_r(pg, NV_PGRAPH_ZCLIPMAX);
-
-    clipRange[0] = 0;
-    clipRange[1] = zmax;
-    clipRange[2] = *(float *)&zclip_min;
-    clipRange[3] = *(float *)&zclip_max;
-}
--- a/hw/xbox/nv2a/pgraph/glsl/common.h
+++ b/hw/xbox/nv2a/pgraph/glsl/common.h
@ -3,7 +3,6 @@
 *
 * Copyright (c) 2015 espes
 * Copyright (c) 2015 Jannik Vogel
- * Copyright (c) 2025 Matt Borgerson
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
@ -22,69 +21,8 @@
 #ifndef HW_NV2A_SHADERS_COMMON_H
 #define HW_NV2A_SHADERS_COMMON_H

-#include "qemu/osdep.h"
 #include "qemu/mstring.h"
-
-typedef int ivec4[4];
-typedef float mat2[2 * 2];
-typedef unsigned int uint;
-typedef float vec2[2];
-typedef float vec3[3];
-typedef float vec4[4];
-
-#define UNIFORM_ELEMENT_TYPE_X(DECL) \
-    DECL(float)                      \
-    DECL(int)                        \
-    DECL(ivec4)                      \
-    DECL(mat2)                       \
-    DECL(uint)                       \
-    DECL(vec2)                       \
-    DECL(vec3)                       \
-    DECL(vec4)
-
-enum UniformElementType {
-#define DECL_UNIFORM_ELEMENT_TYPE(type) UniformElementType_##type,
-    UNIFORM_ELEMENT_TYPE_X(DECL_UNIFORM_ELEMENT_TYPE)
-};
-
-extern const char *uniform_element_type_to_str[];
-
-#define DECL_UNIFORM_ENUM_VALUE(s, name, type, count) s##_##name,
-#define DECL_UNIFORM_ENUM_TYPE(name, decls)                 \
-    enum name##Indices{                                     \
-        decls(name, DECL_UNIFORM_ENUM_VALUE) name##__COUNT, \
-    };
-
-#define DECL_UNIFORM_LOC_STRUCT_TYPE(name, decls) \
-    typedef int name##Locs[name##__COUNT];
-
-#define DECL_UNIFORM_VAL_STRUCT_FIELD(s, name, type, count) type name[count];
-#define DECL_UNIFORM_VAL_STRUCT_TYPE(name, decls)  \
-    typedef struct name##Values {                  \
-        decls(name, DECL_UNIFORM_VAL_STRUCT_FIELD) \
-    } name##Values;
-
-typedef struct UniformInfo {
-    const char *name;
-    enum UniformElementType type;
-    size_t size;
-    size_t count;
-    size_t val_offs;
-} UniformInfo;
-
-#define DECL_UNIFORM_INFO_ITEM(s, name, type, count)         \
-    { #name, UniformElementType_##type, sizeof(type), count, \
-      offsetof(s##Values, name) },
-#define DECL_UNIFORM_INFO_ARR(name, decls) \
-    extern const UniformInfo name##Info[];
-#define DEF_UNIFORM_INFO_ARR(name, decls) \
-    const UniformInfo name##Info[] = { decls(name, DECL_UNIFORM_INFO_ITEM) };
-
-#define DECL_UNIFORM_TYPES(name, decls)       \
-    DECL_UNIFORM_ENUM_TYPE(name, decls)       \
-    DECL_UNIFORM_LOC_STRUCT_TYPE(name, decls) \
-    DECL_UNIFORM_VAL_STRUCT_TYPE(name, decls) \
-    DECL_UNIFORM_INFO_ARR(name, decls)
+#include <stdbool.h>

 #define GLSL_C(idx) "c[" stringify(idx) "]"
 #define GLSL_LTCTXA(idx) "ltctxa[" stringify(idx) "]"
@ -95,12 +33,6 @@ typedef struct UniformInfo {

 #define GLSL_DEFINE(a, b) "#define " stringify(a) " " b "\n"

-MString *pgraph_glsl_get_vtx_header(MString *out, bool location, bool smooth,
-                                    bool in, bool prefix, bool array);
-
-typedef struct PGRAPHState PGRAPHState;
-
-void pgraph_glsl_set_clip_range_uniform_value(PGRAPHState *pg,
-                                              float clipRange[4]);
+MString *pgraph_get_glsl_vtx_header(MString *out, bool location, bool smooth, bool in, bool prefix, bool array);

 #endif
--- a/hw/xbox/nv2a/pgraph/glsl/geom.c
+++ b/hw/xbox/nv2a/pgraph/glsl/geom.c
@ -3,7 +3,7 @@
 *
 * Copyright (c) 2015 espes
 * Copyright (c) 2015 Jannik Vogel
- * Copyright (c) 2020-2025 Matt Borgerson
+ * Copyright (c) 2020-2024 Matt Borgerson
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
@ -19,99 +19,19 @@
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

-#include "qemu/osdep.h"
-#include "hw/xbox/nv2a/pgraph/pgraph.h"
+#include "hw/xbox/nv2a/pgraph/shaders.h"
+#include "common.h"
 #include "geom.h"

-void pgraph_glsl_set_geom_state(PGRAPHState *pg, GeomState *state)
-{
-    state->primitive_mode = (enum ShaderPrimitiveMode)pg->primitive_mode;
-
-    state->polygon_front_mode = (enum ShaderPolygonMode)GET_MASK(
-        pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER),
-        NV_PGRAPH_SETUPRASTER_FRONTFACEMODE);
-    state->polygon_back_mode = (enum ShaderPolygonMode)GET_MASK(
-        pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER),
-        NV_PGRAPH_SETUPRASTER_BACKFACEMODE);
-
-    state->smooth_shading = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CONTROL_3),
-                                     NV_PGRAPH_CONTROL_3_SHADEMODE) ==
-                            NV_PGRAPH_CONTROL_3_SHADEMODE_SMOOTH;
-}
-
-bool pgraph_glsl_need_geom(const GeomState *state)
+MString *pgraph_gen_geom_glsl(enum ShaderPolygonMode polygon_front_mode,
+                              enum ShaderPolygonMode polygon_back_mode,
+                              enum ShaderPrimitiveMode primitive_mode,
+                              bool smooth_shading,
+                              bool vulkan)
 {
    /* FIXME: Missing support for 2-sided-poly mode */
-    assert(state->polygon_front_mode == state->polygon_back_mode);
-    enum ShaderPolygonMode polygon_mode = state->polygon_front_mode;
-
-    /* POINT mode shouldn't require any special work */
-    if (polygon_mode == POLY_MODE_POINT) {
-        return false;
-    }
-
-    switch (state->primitive_mode) {
-    case PRIM_TYPE_TRIANGLES:
-        if (polygon_mode == POLY_MODE_FILL) {
-            return false;
-        }
-        return true;
-    case PRIM_TYPE_TRIANGLE_STRIP:
-        if (polygon_mode == POLY_MODE_FILL) {
-            return false;
-        }
-        assert(polygon_mode == POLY_MODE_LINE);
-        return true;
-    case PRIM_TYPE_TRIANGLE_FAN:
-        if (polygon_mode == POLY_MODE_FILL) {
-            return false;
-        }
-        assert(polygon_mode == POLY_MODE_LINE);
-        return true;
-    case PRIM_TYPE_QUADS:
-        if (polygon_mode == POLY_MODE_LINE) {
-            return true;
-        } else if (polygon_mode == POLY_MODE_FILL) {
-            return true;
-        } else {
-            assert(false);
-            return false;
-        }
-        break;
-    case PRIM_TYPE_QUAD_STRIP:
-        if (polygon_mode == POLY_MODE_LINE) {
-            return true;
-        } else if (polygon_mode == POLY_MODE_FILL) {
-            return true;
-        } else {
-            assert(false);
-            return false;
-        }
-        break;
-    case PRIM_TYPE_POLYGON:
-        if (polygon_mode == POLY_MODE_LINE) {
-            return false;
-        }
-        if (polygon_mode == POLY_MODE_FILL) {
-            if (state->smooth_shading) {
-                return false;
-            }
-            return true;
-        } else {
-            assert(false);
-            return false;
-        }
-        break;
-    default:
-        return false;
-    }
-}
-
-MString *pgraph_glsl_gen_geom(const GeomState *state, GenGeomGlslOptions opts)
-{
-    /* FIXME: Missing support for 2-sided-poly mode */
-    assert(state->polygon_front_mode == state->polygon_back_mode);
-    enum ShaderPolygonMode polygon_mode = state->polygon_front_mode;
+    assert(polygon_front_mode == polygon_back_mode);
+    enum ShaderPolygonMode polygon_mode = polygon_front_mode;

    /* POINT mode shouldn't require any special work */
    if (polygon_mode == POLY_MODE_POINT) {
@ -122,7 +42,7 @@ MString *pgraph_glsl_gen_geom(const GeomState *state, GenGeomGlslOptions opts)
    const char *layout_in = NULL;
    const char *layout_out = NULL;
    const char *body = NULL;
-    switch (state->primitive_mode) {
+    switch (primitive_mode) {
    case PRIM_TYPE_POINTS: return NULL;
    case PRIM_TYPE_LINES: return NULL;
    case PRIM_TYPE_LINE_LOOP: return NULL;
@ -225,7 +145,7 @@ MString *pgraph_glsl_gen_geom(const GeomState *state, GenGeomGlslOptions opts)
            return NULL;
        }
        if (polygon_mode == POLY_MODE_FILL) {
-            if (state->smooth_shading) {
+            if (smooth_shading) {
                return NULL;
            }
            layout_in = "layout(triangles) in;\n";
@ -249,19 +169,16 @@ MString *pgraph_glsl_gen_geom(const GeomState *state, GenGeomGlslOptions opts)
    assert(layout_in);
    assert(layout_out);
    assert(body);
-    MString *output =
-        mstring_from_fmt("#version %d\n\n"
-                         "%s"
-                         "%s"
-                         "\n",
-                         opts.vulkan ? 450 : 400, layout_in, layout_out);
-    pgraph_glsl_get_vtx_header(output, opts.vulkan, state->smooth_shading, true,
-                               true, true);
-    pgraph_glsl_get_vtx_header(output, opts.vulkan, state->smooth_shading,
-                               false, false, false);
+    MString *s = mstring_new();
+    mstring_append_fmt(s, "#version %d\n\n", vulkan ? 450 : 400);
+    mstring_append(s, layout_in);
+    mstring_append(s, layout_out);
+    mstring_append(s, "\n");
+    pgraph_get_glsl_vtx_header(s, vulkan, smooth_shading, true, true, true);
+    pgraph_get_glsl_vtx_header(s, vulkan, smooth_shading, false, false, false);

-    if (state->smooth_shading) {
-        mstring_append(output,
+    if (smooth_shading) {
+        mstring_append(s,
                       "void emit_vertex(int index, int _unused) {\n"
                       "  gl_Position = gl_in[index].gl_Position;\n"
                       "  gl_PointSize = gl_in[index].gl_PointSize;\n"
@ -277,7 +194,7 @@ MString *pgraph_glsl_gen_geom(const GeomState *state, GenGeomGlslOptions opts)
                       "  EmitVertex();\n"
                       "}\n");
    } else {
-        mstring_append(output,
+        mstring_append(s,
                       "void emit_vertex(int index, int provoking_index) {\n"
                       "  gl_Position = gl_in[index].gl_Position;\n"
                       "  gl_PointSize = gl_in[index].gl_PointSize;\n"
@ -294,12 +211,10 @@ MString *pgraph_glsl_gen_geom(const GeomState *state, GenGeomGlslOptions opts)
                       "}\n");
    }

-    mstring_append_fmt(output,
-                       "\n"
-                       "void main() {\n"
-                       "%s"
-                       "}\n",
-                       body);
+    mstring_append(s, "\n"
+                      "void main() {\n");
+    mstring_append(s, body);
+    mstring_append(s, "}\n");

-    return output;
+    return s;
 }
--- a/hw/xbox/nv2a/pgraph/glsl/geom.h
+++ b/hw/xbox/nv2a/pgraph/glsl/geom.h
@ -3,7 +3,7 @@
 *
 * Copyright (c) 2015 espes
 * Copyright (c) 2015 Jannik Vogel
- * Copyright (c) 2020-2025 Matt Borgerson
+ * Copyright (c) 2020-2024 Matt Borgerson
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
@ -22,24 +22,13 @@
 #ifndef HW_XBOX_NV2A_PGRAPH_GLSL_GEOM_H
 #define HW_XBOX_NV2A_PGRAPH_GLSL_GEOM_H

-#include "common.h"
-#include "hw/xbox/nv2a/pgraph/vsh_regs.h"
+#include "qemu/mstring.h"
+#include "hw/xbox/nv2a/pgraph/shaders.h"

-typedef struct {
-    enum ShaderPrimitiveMode primitive_mode;
-    enum ShaderPolygonMode polygon_front_mode;
-    enum ShaderPolygonMode polygon_back_mode;
-    bool smooth_shading;
-} GeomState;
-
-typedef struct GenGeomGlslOptions {
-    bool vulkan;
-} GenGeomGlslOptions;
-
-void pgraph_glsl_set_geom_state(PGRAPHState *pg, GeomState *geom);
-
-bool pgraph_glsl_need_geom(const GeomState *state);
-
-MString *pgraph_glsl_gen_geom(const GeomState *state, GenGeomGlslOptions opts);
+MString *pgraph_gen_geom_glsl(enum ShaderPolygonMode polygon_front_mode,
+                              enum ShaderPolygonMode polygon_back_mode,
+                              enum ShaderPrimitiveMode primitive_mode,
+                              bool smooth_shading,
+                              bool vulkan);

 #endif
--- a/hw/xbox/nv2a/pgraph/glsl/meson.build
+++ b/hw/xbox/nv2a/pgraph/glsl/meson.build
@ -2,7 +2,6 @@ specific_ss.add([files(
 	'common.c',
 	'geom.c',
 	'psh.c',
-	'shaders.c',
 	'vsh.c',
 	'vsh-ff.c',
 	'vsh-prog.c',
--- a/hw/xbox/nv2a/pgraph/glsl/psh.c
+++ b/hw/xbox/nv2a/pgraph/glsl/psh.c
--- a/hw/xbox/nv2a/pgraph/glsl/psh.h
+++ b/hw/xbox/nv2a/pgraph/glsl/psh.h
@ -3,94 +3,39 @@
 *
 * Copyright (c) 2013 espes
 * Copyright (c) 2015 Jannik Vogel
- * Copyright (c) 2020-2025 Matt Borgerson
+ * Copyright (c) 2020-2024 Matt Borgerson
 *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * Based on:
+ * Cxbx, PixelShader.cpp
+ * Copyright (c) 2004 Aaron Robinson <caustik@caustik.com>
+ *                    Kingofc <kingofc@freenet.de>
+ * Xeon, XBD3DPixelShader.cpp
+ * Copyright (c) 2003 _SF_
 *
- * This library is distributed in the hope that it will be useful,
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
 *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

 #ifndef HW_XBOX_NV2A_PGRAPH_GLSL_PSH_H
 #define HW_XBOX_NV2A_PGRAPH_GLSL_PSH_H

-#include "common.h"
-#include "hw/xbox/nv2a/pgraph/psh_regs.h"
+#include "qemu/mstring.h"
+#include "hw/xbox/nv2a/pgraph/shaders.h"

-typedef struct PGRAPHState PGRAPHState;
+// FIXME: Move to struct
+#define PSH_UBO_BINDING 1
+#define PSH_TEX_BINDING 2

-typedef struct PshState {
-    uint32_t combiner_control;
-    uint32_t shader_stage_program;
-    uint32_t other_stage_input;
-    uint32_t final_inputs_0;
-    uint32_t final_inputs_1;
-
-    uint32_t rgb_inputs[8], rgb_outputs[8];
-    uint32_t alpha_inputs[8], alpha_outputs[8];
-
-    bool point_sprite;
-    bool rect_tex[4];
-    bool snorm_tex[4];
-    bool compare_mode[4][4];
-    bool alphakill[4];
-    int colorkey_mode[4];
-    enum ConvolutionFilter conv_tex[4];
-    bool tex_x8y24[4];
-    int dim_tex[4];
-
-    float border_logical_size[4][3];
-    float border_inv_real_size[4][3];
-
-    bool shadow_map[4];
-    enum PshShadowDepthFunc shadow_depth_func;
-
-    bool alpha_test;
-    enum PshAlphaFunc alpha_func;
-
-    bool window_clip_exclusive;
-
-    bool smooth_shading;
-    bool depth_clipping;
-    bool z_perspective;
-} PshState;
-
-void pgraph_glsl_set_psh_state(PGRAPHState *pg, PshState *state);
-
-#define PSH_UNIFORM_DECL_X(S, DECL) \
-    DECL(S, alphaRef, int, 1)       \
-    DECL(S, bumpMat, mat2, 4)       \
-    DECL(S, bumpOffset, float, 4)   \
-    DECL(S, bumpScale, float, 4)    \
-    DECL(S, clipRange, vec4, 1)     \
-    DECL(S, clipRegion, ivec4, 8)   \
-    DECL(S, colorKey, uint, 4)      \
-    DECL(S, colorKeyMask, uint, 4)  \
-    DECL(S, consts, vec4, 18)       \
-    DECL(S, depthOffset, float, 1)  \
-    DECL(S, fogColor, vec4, 1)      \
-    DECL(S, texScale, float, 4)
-
-DECL_UNIFORM_TYPES(PshUniform, PSH_UNIFORM_DECL_X)
-
-typedef struct GenPshGlslOptions {
-    bool vulkan;
-    int ubo_binding;
-    int tex_binding;
-} GenPshGlslOptions;
-
-MString *pgraph_glsl_gen_psh(const PshState *state, GenPshGlslOptions opts);
-
-void pgraph_glsl_set_psh_uniform_values(PGRAPHState *pg,
-                                        const PshUniformLocs locs,
-                                        PshUniformValues *values);
+MString *pgraph_gen_psh_glsl(const PshState state);

 #endif
--- a/hw/xbox/nv2a/pgraph/glsl/shaders.c
+++ b/hw/xbox/nv2a/pgraph/glsl/shaders.c
@ -1,94 +0,0 @@
-/*
- * Geforce NV2A PGRAPH GLSL Shader Generator
- *
- * Copyright (c) 2025 Matt Borgerson
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "hw/xbox/nv2a/pgraph/pgraph.h"
-#include "shaders.h"
-
-ShaderState pgraph_glsl_get_shader_state(PGRAPHState *pg)
-{
-    pg->program_data_dirty = false; /* fixme */
-
-    ShaderState state;
-
-    // We will hash it, so make sure any padding is zeroed
-    memset(&state, 0, sizeof(ShaderState));
-
-    pgraph_glsl_set_vsh_state(pg, &state.vsh);
-    pgraph_glsl_set_geom_state(pg, &state.geom);
-    pgraph_glsl_set_psh_state(pg, &state.psh);
-
-    return state;
-}
-
-bool pgraph_glsl_check_shader_state_dirty(PGRAPHState *pg,
-                                          const ShaderState *state)
-{
-    if (pg->program_data_dirty) {
-        return true;
-    }
-
-    unsigned int regs[] = {
-        NV_PGRAPH_COMBINECTL,      NV_PGRAPH_COMBINESPECFOG0,
-        NV_PGRAPH_COMBINESPECFOG1, NV_PGRAPH_CONTROL_0,
-        NV_PGRAPH_CONTROL_3,       NV_PGRAPH_CSV0_C,
-        NV_PGRAPH_CSV0_D,          NV_PGRAPH_CSV1_A,
-        NV_PGRAPH_CSV1_B,          NV_PGRAPH_POINTSIZE,
-        NV_PGRAPH_SETUPRASTER,     NV_PGRAPH_SHADERCLIPMODE,
-        NV_PGRAPH_SHADERCTL,       NV_PGRAPH_SHADERPROG,
-        NV_PGRAPH_SHADOWCTL,       NV_PGRAPH_ZCOMPRESSOCCLUDE,
-    };
-    for (int i = 0; i < ARRAY_SIZE(regs); i++) {
-        if (pgraph_is_reg_dirty(pg, regs[i])) {
-            return true;
-        }
-    }
-
-    int num_stages = pgraph_reg_r(pg, NV_PGRAPH_COMBINECTL) & 0xFF;
-    for (int i = 0; i < num_stages; i++) {
-        if (pgraph_is_reg_dirty(pg, NV_PGRAPH_COMBINEALPHAI0 + i * 4) ||
-            pgraph_is_reg_dirty(pg, NV_PGRAPH_COMBINEALPHAO0 + i * 4) ||
-            pgraph_is_reg_dirty(pg, NV_PGRAPH_COMBINECOLORI0 + i * 4) ||
-            pgraph_is_reg_dirty(pg, NV_PGRAPH_COMBINECOLORO0 + i * 4)) {
-            return true;
-        }
-    }
-
-    if (pg->uniform_attrs != state->vsh.uniform_attrs ||
-        pg->swizzle_attrs != state->vsh.swizzle_attrs ||
-        pg->compressed_attrs != state->vsh.compressed_attrs ||
-        pg->primitive_mode != state->geom.primitive_mode ||
-        pg->surface_scale_factor != state->vsh.surface_scale_factor) {
-        return true;
-    }
-
-    for (int i = 0; i < 4; i++) {
-        if (pgraph_is_reg_dirty(pg, NV_PGRAPH_TEXCTL0_0 + i * 4) ||
-            pgraph_is_reg_dirty(pg, NV_PGRAPH_TEXFILTER0 + i * 4) ||
-            pgraph_is_reg_dirty(pg, NV_PGRAPH_TEXFMT0 + i * 4)) {
-            return true;
-        }
-
-        if (pg->texture_matrix_enable[i] !=
-            state->vsh.fixed_function.texture_matrix_enable[i]) {
-            return true;
-        }
-    }
-
-    return false;
-}
--- a/hw/xbox/nv2a/pgraph/glsl/shaders.h
+++ b/hw/xbox/nv2a/pgraph/glsl/shaders.h
@ -1,40 +0,0 @@
-/*
- * Geforce NV2A PGRAPH GLSL Shader Generator
- *
- * Copyright (c) 2025 Matt Borgerson
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef HW_XBOX_NV2A_PGRAPH_GLSL_SHADERS_H
-#define HW_XBOX_NV2A_PGRAPH_GLSL_SHADERS_H
-
-#include "vsh.h"
-#include "geom.h"
-#include "psh.h"
-
-typedef struct ShaderState {
-    VshState vsh;
-    GeomState geom;
-    PshState psh;
-} ShaderState;
-
-typedef struct PGRAPHState PGRAPHState;
-
-ShaderState pgraph_glsl_get_shader_state(PGRAPHState *pg);
-
-bool pgraph_glsl_check_shader_state_dirty(PGRAPHState *pg,
-                                          const ShaderState *state);
-
-#endif
--- a/hw/xbox/nv2a/pgraph/glsl/vsh-ff.c
+++ b/hw/xbox/nv2a/pgraph/glsl/vsh-ff.c
@ -3,7 +3,7 @@
 *
 * Copyright (c) 2015 espes
 * Copyright (c) 2015 Jannik Vogel
- * Copyright (c) 2020-2025 Matt Borgerson
+ * Copyright (c) 2020-2024 Matt Borgerson
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
@ -20,55 +20,22 @@
 */

 #include "qemu/osdep.h"
+#include "hw/xbox/nv2a/pgraph/shaders.h"
 #include "common.h"
 #include "vsh-ff.h"

-static void append_skinning_code(MString *str, bool mix, unsigned int count,
-                                 const char *type, const char *output,
-                                 const char *input, const char *matrix,
-                                 const char *swizzle)
-{
-    if (count == 0) {
-        mstring_append_fmt(str, "%s %s = (%s * %s0).%s;\n",
-                           type, output, input, matrix, swizzle);
-    } else {
-        mstring_append_fmt(str, "%s %s = %s(0.0);\n", type, output, type);
-        if (mix) {
-            /* Generated final weight (like GL_WEIGHT_SUM_UNITY_ARB) */
-            mstring_append(str, "{\n"
-                                "  float weight_i;\n"
-                                "  float weight_n = 1.0;\n");
-            int i;
-            for (i = 0; i < count; i++) {
-                if (i < (count - 1)) {
-                    char c = "xyzw"[i];
-                    mstring_append_fmt(str, "  weight_i = weight.%c;\n"
-                                            "  weight_n -= weight_i;\n",
-                                       c);
-                } else {
-                    mstring_append(str, "  weight_i = weight_n;\n");
-                }
-                mstring_append_fmt(str, "  %s += (%s * %s%d).%s * weight_i;\n",
-                                   output, input, matrix, i, swizzle);
-            }
-            mstring_append(str, "}\n");
-        } else {
-            /* Individual weights */
-            int i;
-            for (i = 0; i < count; i++) {
-                char c = "xyzw"[i];
-                mstring_append_fmt(str, "%s += (%s * %s%d).%s * weight.%c;\n",
-                                   output, input, matrix, i, swizzle, c);
-            }
-        }
-    }
-}
+static void append_skinning_code(MString* str, bool mix,
+                                 unsigned int count, const char* type,
+                                 const char* output, const char* input,
+                                 const char* matrix, const char* swizzle);

-void pgraph_glsl_gen_vsh_ff(const VshState *state, MString *header,
-                            MString *body)
+void pgraph_gen_vsh_ff_glsl(const ShaderState *state, MString *header,
+                             MString *body, MString *uniforms)
 {
    int i, j;
+    const char *u = state->vulkan ? "" : "uniform "; // FIXME: Remove

+    /* generate vertex shader mimicking fixed function */
    mstring_append(header,
 "#define position      v0\n"
 "#define weight        v1\n"
@ -87,6 +54,11 @@ void pgraph_glsl_gen_vsh_ff(const VshState *state, MString *header,
 "#define reserved2     v14\n"
 "#define reserved3     v15\n"
 "\n");
+    mstring_append_fmt(uniforms,
+"%svec4 ltctxa[" stringify(NV2A_LTCTXA_COUNT) "];\n"
+"%svec4 ltctxb[" stringify(NV2A_LTCTXB_COUNT) "];\n"
+"%svec4 ltc1[" stringify(NV2A_LTC1_COUNT) "];\n", u, u, u
+);
    mstring_append(header,
 "\n"
 GLSL_DEFINE(projectionMat, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_PMAT0))
@ -143,10 +115,13 @@ GLSL_DEFINE(sceneAmbientColor, GLSL_LTCTXA(NV_IGRAPH_XF_LTCTXA_FR_AMB) ".xyz")
 GLSL_DEFINE(materialEmissionColor, GLSL_LTCTXA(NV_IGRAPH_XF_LTCTXA_CM_COL) ".xyz")
 "\n"
 );
+    mstring_append_fmt(uniforms,
+"%smat4 invViewport;\n", u);

+    /* Skinning */
    unsigned int count;
    bool mix;
-    switch (state->fixed_function.skinning) {
+    switch (state->skinning) {
    case SKINNING_OFF:
        mix = false; count = 0; break;
    case SKINNING_1WEIGHTS:
@ -166,7 +141,7 @@ GLSL_DEFINE(materialEmissionColor, GLSL_LTCTXA(NV_IGRAPH_XF_LTCTXA_CM_COL) ".xyz
        break;
    }
    mstring_append_fmt(body, "/* Skinning mode %d */\n",
-                       state->fixed_function.skinning);
+                       state->skinning);

    append_skinning_code(body, mix, count, "vec4",
                         "tPosition", "position",
@ -175,10 +150,12 @@ GLSL_DEFINE(materialEmissionColor, GLSL_LTCTXA(NV_IGRAPH_XF_LTCTXA_CM_COL) ".xyz
                         "tNormal", "vec4(normal, 0.0)",
                         "invModelViewMat", "xyz");

-    if (state->fixed_function.normalization) {
+    /* Normalization */
+    if (state->normalization) {
        mstring_append(body, "tNormal = normalize(tNormal);\n");
    }

+    /* Texgen */
    for (i = 0; i < NV2A_MAX_TEXTURES; i++) {
        mstring_append_fmt(body, "/* Texgen for stage %d */\n",
                           i);
@ -188,7 +165,7 @@ GLSL_DEFINE(materialEmissionColor, GLSL_LTCTXA(NV_IGRAPH_XF_LTCTXA_CM_COL) ".xyz
            /* TODO: TexGen View Model missing! */
            char c = "xyzw"[j];
            char cSuffix = "STRQ"[j];
-            switch (state->fixed_function.texgen[i][j]) {
+            switch (state->texgen[i][j]) {
            case TEXGEN_DISABLE:
                mstring_append_fmt(body, "oT%d.%c = texture%d.%c;\n",
                                   i, c, i, c);
@ -243,104 +220,104 @@ GLSL_DEFINE(materialEmissionColor, GLSL_LTCTXA(NV_IGRAPH_XF_LTCTXA_CM_COL) ".xyz
        }
    }

+    /* Apply texture matrices */
    for (i = 0; i < NV2A_MAX_TEXTURES; i++) {
-        if (state->fixed_function.texture_matrix_enable[i]) {
+        if (state->texture_matrix_enable[i]) {
            mstring_append_fmt(body,
                               "oT%d = oT%d * texMat%d;\n",
                               i, i, i);
        }
    }

-    if (!state->fixed_function.lighting) {
-        mstring_append(body, "  oD0 = diffuse;\n");
-        mstring_append(body, "  oD1 = specular;\n");
-        mstring_append(body, "  oB0 = backDiffuse;\n");
-        mstring_append(body, "  oB1 = backSpecular;\n");
-    } else {
+    /* Lighting */
+    if (state->lighting) {
+
        //FIXME: Do 2 passes if we want 2 sided-lighting?
+
        static char alpha_source_diffuse[] = "diffuse.a";
        static char alpha_source_specular[] = "specular.a";
        static char alpha_source_material[] = "material_alpha";
        const char *alpha_source = alpha_source_diffuse;
-        if (state->fixed_function.diffuse_src == MATERIAL_COLOR_SRC_MATERIAL) {
+        if (state->diffuse_src == MATERIAL_COLOR_SRC_MATERIAL) {
+            mstring_append_fmt(uniforms, "%sfloat material_alpha;\n", u);
            alpha_source = alpha_source_material;
-        } else if (state->fixed_function.diffuse_src == MATERIAL_COLOR_SRC_SPECULAR) {
+        } else if (state->diffuse_src == MATERIAL_COLOR_SRC_SPECULAR) {
            alpha_source = alpha_source_specular;
        }

-        if (state->fixed_function.ambient_src == MATERIAL_COLOR_SRC_MATERIAL) {
+        if (state->ambient_src == MATERIAL_COLOR_SRC_MATERIAL) {
            mstring_append_fmt(body, "oD0 = vec4(sceneAmbientColor, %s);\n", alpha_source);
-        } else if (state->fixed_function.ambient_src == MATERIAL_COLOR_SRC_DIFFUSE) {
+        } else if (state->ambient_src == MATERIAL_COLOR_SRC_DIFFUSE) {
            mstring_append_fmt(body, "oD0 = vec4(diffuse.rgb, %s);\n", alpha_source);
-        } else if (state->fixed_function.ambient_src == MATERIAL_COLOR_SRC_SPECULAR) {
+        } else if (state->ambient_src == MATERIAL_COLOR_SRC_SPECULAR) {
            mstring_append_fmt(body, "oD0 = vec4(specular.rgb, %s);\n", alpha_source);
        }

        mstring_append(body, "oD0.rgb *= materialEmissionColor.rgb;\n");
-        if (state->fixed_function.emission_src == MATERIAL_COLOR_SRC_MATERIAL) {
+        if (state->emission_src == MATERIAL_COLOR_SRC_MATERIAL) {
            mstring_append(body, "oD0.rgb += sceneAmbientColor;\n");
-        } else if (state->fixed_function.emission_src == MATERIAL_COLOR_SRC_DIFFUSE) {
+        } else if (state->emission_src == MATERIAL_COLOR_SRC_DIFFUSE) {
            mstring_append(body, "oD0.rgb += diffuse.rgb;\n");
-        } else if (state->fixed_function.emission_src == MATERIAL_COLOR_SRC_SPECULAR) {
+        } else if (state->emission_src == MATERIAL_COLOR_SRC_SPECULAR) {
            mstring_append(body, "oD0.rgb += specular.rgb;\n");
        }

        mstring_append(body, "oD1 = vec4(0.0, 0.0, 0.0, specular.a);\n");

-        if (state->fixed_function.local_eye) {
-            mstring_append(body,
-                "vec3 VPeye = normalize(eyePosition.xyz / eyePosition.w - tPosition.xyz / tPosition.w);\n"
-            );
-        }
-
        for (i = 0; i < NV2A_MAX_LIGHTS; i++) {
-            if (state->fixed_function.light[i] == LIGHT_OFF) {
+            if (state->light[i] == LIGHT_OFF) {
                continue;
            }

+            /* FIXME: It seems that we only have to handle the surface colors if
+             *        they are not part of the material [= vertex colors].
+             *        If they are material the cpu will premultiply light
+             *        colors
+             */
+
            mstring_append_fmt(body, "/* Light %d */ {\n", i);

-            if (state->fixed_function.light[i] == LIGHT_LOCAL
-                    || state->fixed_function.light[i] == LIGHT_SPOT) {
+            if (state->light[i] == LIGHT_LOCAL
+                    || state->light[i] == LIGHT_SPOT) {

+                mstring_append_fmt(uniforms,
+                    "%svec3 lightLocalPosition%d;\n"
+                    "%svec3 lightLocalAttenuation%d;\n",
+                    u, i, u, i);
                mstring_append_fmt(body,
-                    "  vec3 tPos = tPosition.xyz/tPosition.w;\n"
-                    "  vec3 VP = lightLocalPosition[%d] - tPos;\n"
+                    "  vec3 VP = lightLocalPosition%d - tPosition.xyz/tPosition.w;\n"
                    "  float d = length(VP);\n"
-                    "  if (d <= lightLocalRange(%d)) {\n"  /* FIXME: Double check that range is inclusive */
+//FIXME: if (d > lightLocalRange) { .. don't process this light .. } /* inclusive?! */ - what about directional lights?
                    "  VP = normalize(VP);\n"
-                    "    float attenuation = 1.0 / (lightLocalAttenuation[%d].x\n"
-                    "                                 + lightLocalAttenuation[%d].y * d\n"
-                    "                                 + lightLocalAttenuation[%d].z * d * d);\n"
-                    "    vec3 halfVector = normalize(VP + %s);\n"
+                    "  float attenuation = 1.0 / (lightLocalAttenuation%d.x\n"
+                    "                               + lightLocalAttenuation%d.y * d\n"
+                    "                               + lightLocalAttenuation%d.z * d * d);\n"
+                    "  vec3 halfVector = normalize(VP + eyePosition.xyz / eyePosition.w);\n" /* FIXME: Not sure if eyePosition is correct */
                    "  float nDotVP = max(0.0, dot(tNormal, VP));\n"
                    "  float nDotHV = max(0.0, dot(tNormal, halfVector));\n",
-                    i, i, i, i, i,
-                    state->fixed_function.local_eye ? "VPeye" : "vec3(0.0, 0.0, 0.0)"
-                );
+                    i, i, i, i);
+
            }

-            switch(state->fixed_function.light[i]) {
+            switch(state->light[i]) {
            case LIGHT_INFINITE:

                /* lightLocalRange will be 1e+30 here */

+                mstring_append_fmt(uniforms,
+                    "%svec3 lightInfiniteHalfVector%d;\n"
+                    "%svec3 lightInfiniteDirection%d;\n",
+                    u, i, u, i);
                mstring_append_fmt(body,
-                    "  {\n"
                    "  float attenuation = 1.0;\n"
-                    "    vec3 lightDirection = normalize(lightInfiniteDirection[%d]);\n"
-                    "    float nDotVP = max(0.0, dot(tNormal, lightDirection));\n",
-                    i);
-                if (state->fixed_function.local_eye) {
-                    mstring_append(body,
-                        "    float nDotHV = max(0.0, dot(tNormal, normalize(lightDirection + VPeye)));\n"
-                    );
-                } else {
-                    mstring_append_fmt(body,
-                        "    float nDotHV = max(0.0, dot(tNormal, lightInfiniteHalfVector[%d]));\n",
-                        i
-                    );
-                }
+                    "  float nDotVP = max(0.0, dot(tNormal, normalize(vec3(lightInfiniteDirection%d))));\n"
+                    "  float nDotHV = max(0.0, dot(tNormal, vec3(lightInfiniteHalfVector%d)));\n",
+                    i, i);
+
+                /* FIXME: Do specular */
+
+                /* FIXME: tBackDiffuse */
+
                break;
            case LIGHT_LOCAL:
                /* Everything done already */
@ -369,20 +346,20 @@ GLSL_DEFINE(materialEmissionColor, GLSL_LTCTXA(NV_IGRAPH_XF_LTCTXA_CM_COL) ".xyz

            mstring_append_fmt(body,
                "  float pf;\n"
-                "    if (nDotVP == 0.0 || nDotHV == 0.0) {\n"
+                "  if (nDotVP == 0.0) {\n"
                "    pf = 0.0;\n"
                "  } else {\n"
-                "      pf = pow(nDotHV, specularPower);\n"
+                "    pf = pow(nDotHV, /* specular(l, m, n, l1, m1, n1) */ 0.001);\n"
                "  }\n"
                "  vec3 lightAmbient = lightAmbientColor(%d) * attenuation;\n"
                "  vec3 lightDiffuse = lightDiffuseColor(%d) * attenuation * nDotVP;\n"
-                "    vec3 lightSpecular = lightSpecularColor(%d) * attenuation * pf;\n",
+                "  vec3 lightSpecular = lightSpecularColor(%d) * pf;\n",
                i, i, i);

            mstring_append(body,
                "  oD0.xyz += lightAmbient;\n");

-            switch (state->fixed_function.diffuse_src) {
+            switch (state->diffuse_src) {
            case MATERIAL_COLOR_SRC_MATERIAL:
                mstring_append(body,
                               "  oD0.xyz += lightDiffuse;\n");
@ -397,57 +374,28 @@ GLSL_DEFINE(materialEmissionColor, GLSL_LTCTXA(NV_IGRAPH_XF_LTCTXA_CM_COL) ".xyz
                break;
            }

-            switch (state->fixed_function.specular_src) {
-            case MATERIAL_COLOR_SRC_MATERIAL:
-                mstring_append(body,
-                               "    oD1.xyz += lightSpecular;\n");
-                break;
-            case MATERIAL_COLOR_SRC_DIFFUSE:
-                mstring_append(body,
-                               "    oD1.xyz += diffuse.xyz * lightSpecular;\n");
-                break;
-            case MATERIAL_COLOR_SRC_SPECULAR:
            mstring_append(body,
                "  oD1.xyz += specular.xyz * lightSpecular;\n");
-                break;
-            }

-            mstring_append(body, "  }\n"
-                                 "}\n");
+            mstring_append(body, "}\n");
        }
-
-        /* TODO: Implement two-sided lighting */
-        mstring_append(body, "  oB0 = backDiffuse;\n");
-        mstring_append(body, "  oB1 = backSpecular;\n");
+    } else {
+        mstring_append(body, "  oD0 = diffuse;\n");
+        mstring_append(body, "  oD1 = specular;\n");
    }

    if (!state->specular_enable) {
        mstring_append(body, "  oD1 = vec4(0.0, 0.0, 0.0, 1.0);\n");
-        mstring_append(body, "  oB1 = vec4(0.0, 0.0, 0.0, 1.0);\n");
-    } else {
-        if (!state->separate_specular) {
-            if (state->fixed_function.lighting) {
-				mstring_append(body,
-				               "  oD0.xyz += oD1.xyz;\n"
-				               "  oB0.xyz += oB1.xyz;\n"
-				);
-            }
-			mstring_append(body,
-				           "  oD1 = specular;\n"
-				           "  oB1 = backSpecular;\n"
-			);
-        }
-        if (state->ignore_specular_alpha) {
-            mstring_append(body,
-                           "  oD1.a = 1.0;\n"
-                           "  oB1.a = 1.0;\n"
-            );
-        }
    }

+    mstring_append(body, "  oB0 = backDiffuse;\n");
+    mstring_append(body, "  oB1 = backSpecular;\n");
+
+    /* Fog */
    if (state->fog_enable) {
+
        /* From: https://www.opengl.org/registry/specs/NV/fog_distance.txt */
-        switch(state->fixed_function.foggen) {
+        switch(state->foggen) {
        case FOGGEN_SPEC_ALPHA:
            /* FIXME: Do we have to clamp here? */
            mstring_append(body, "  float fogDistance = clamp(specular.a, 0.0, 1.0);\n");
@ -458,7 +406,7 @@ GLSL_DEFINE(materialEmissionColor, GLSL_LTCTXA(NV_IGRAPH_XF_LTCTXA_CM_COL) ".xyz
        case FOGGEN_PLANAR:
        case FOGGEN_ABS_PLANAR:
            mstring_append(body, "  float fogDistance = dot(fogPlane.xyz, tPosition.xyz) + fogPlane.w;\n");
-            if (state->fixed_function.foggen == FOGGEN_ABS_PLANAR) {
+            if (state->foggen == FOGGEN_ABS_PLANAR) {
                mstring_append(body, "  fogDistance = abs(fogDistance);\n");
            }
            break;
@ -466,38 +414,81 @@ GLSL_DEFINE(materialEmissionColor, GLSL_LTCTXA(NV_IGRAPH_XF_LTCTXA_CM_COL) ".xyz
            mstring_append(body, "  float fogDistance = fogCoord;\n");
            break;
        default:
-            assert(!"Invalid foggen mode");
+            assert(false);
            break;
        }

    }

    /* If skinning is off the composite matrix already includes the MV matrix */
-    if (state->fixed_function.skinning == SKINNING_OFF) {
+    if (state->skinning == SKINNING_OFF) {
        mstring_append(body, "  tPosition = position;\n");
    }

    mstring_append(body,
    "   oPos = tPosition * compositeMat;\n"
-    "  oPos.z = oPos.z / clipRange.y;\n"
    "   oPos.w = clampAwayZeroInf(oPos.w);\n"
-    "  oPos.xy /= oPos.w;\n"
-    "  oPos.xy += c[" stringify(NV_IGRAPH_XF_XFCTX_VPOFF) "].xy;\n"
-    "  oPos.xy = roundScreenCoords(oPos.xy);\n"
-    "  oPos.xy = (2.0f * oPos.xy - surfaceSize) / surfaceSize;\n"
-    "  oPos.xy *= oPos.w;\n"
+    "   oPos = invViewport * oPos;\n"
    );

+    if (state->vulkan) {
+        mstring_append(body, "   oPos.y *= -1;\n");
+    }
+
    /* FIXME: Testing */
    if (state->point_params_enable) {
-        mstring_append(
+        mstring_append_fmt(
            body,
            "  float d_e = length(position * modelViewMat0);\n"
-            "  oPts.x = 1/sqrt(pointParams[0] + pointParams[1] * d_e + pointParams[2] * d_e * d_e) + pointParams[6];\n");
-        mstring_append_fmt(body, "  oPts.x = min(oPts.x * pointParams[3] + pointParams[7], 64.0) * %d;\n",
+            "  oPts.x = 1/sqrt(%f + %f*d_e + %f*d_e*d_e) + %f;\n",
+            state->point_params[0], state->point_params[1], state->point_params[2],
+            state->point_params[6]);
+        mstring_append_fmt(body, "  oPts.x = min(oPts.x*%f + %f, 64.0) * %d;\n",
+                           state->point_params[3], state->point_params[7],
                           state->surface_scale_factor);
    } else {
        mstring_append_fmt(body, "  oPts.x = %f * %d;\n", state->point_size,
                           state->surface_scale_factor);
    }
 }
+
+/* Append GLSL declaring `output` as (input * matrix0).swizzle when count is 0,
+ * else as the sum of (input * matrix<i>).swizzle * weight over i < count; with
+ * `mix`, the final weight is generated as 1.0 minus all the preceding ones. */
+static void append_skinning_code(MString* str, bool mix,
+                                 unsigned int count, const char* type,
+                                 const char* output, const char* input,
+                                 const char* matrix, const char* swizzle)
+{
+    assert(count <= (mix ? 5u : 4u)); /* keeps "xyzw"[i] in range */
+    if (count == 0) {
+        mstring_append_fmt(str, "%s %s = (%s * %s0).%s;\n",
+                           type, output, input, matrix, swizzle);
+        return;
+    }
+    mstring_append_fmt(str, "%s %s = %s(0.0);\n", type, output, type);
+    if (mix) {
+        /* Generated final weight (like GL_WEIGHT_SUM_UNITY_ARB) */
+        mstring_append(str, "{\n"
+                            "  float weight_i;\n"
+                            "  float weight_n = 1.0;\n");
+        for (unsigned int i = 0; i < count; i++) {
+            if (i + 1 < count) {
+                mstring_append_fmt(str, "  weight_i = weight.%c;\n"
+                                        "  weight_n -= weight_i;\n",
+                                   "xyzw"[i]);
+            } else {
+                mstring_append(str, "  weight_i = weight_n;\n");
+            }
+            mstring_append_fmt(str, "  %s += (%s * %s%u).%s * weight_i;\n",
+                               output, input, matrix, i, swizzle);
+        }
+        mstring_append(str, "}\n");
+    } else {
+        /* Individual weights, one component of `weight` per matrix */
+        for (unsigned int i = 0; i < count; i++) {
+            mstring_append_fmt(str, "%s += (%s * %s%u).%s * weight.%c;\n",
+                               output, input, matrix, i, swizzle, "xyzw"[i]);
+        }
+    }
+}
--- a/hw/xbox/nv2a/pgraph/glsl/vsh-ff.h
+++ b/hw/xbox/nv2a/pgraph/glsl/vsh-ff.h
@ -3,7 +3,7 @@
 *
 * Copyright (c) 2015 espes
 * Copyright (c) 2015 Jannik Vogel
- * Copyright (c) 2020-2025 Matt Borgerson
+ * Copyright (c) 2020-2024 Matt Borgerson
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
@ -23,9 +23,9 @@
 #define HW_XBOX_NV2A_PGRAPH_GLSL_VSH_FF_H

 #include "qemu/mstring.h"
-#include "vsh.h"
+#include "hw/xbox/nv2a/pgraph/shaders.h"

-void pgraph_glsl_gen_vsh_ff(const VshState *state, MString *header,
-                            MString *body);
+void pgraph_gen_vsh_ff_glsl(const ShaderState *state, MString *header,
+                            MString *body, MString *uniforms);

 #endif
--- a/hw/xbox/nv2a/pgraph/glsl/vsh-prog.c
+++ b/hw/xbox/nv2a/pgraph/glsl/vsh-prog.c
@ -3,7 +3,6 @@
 *
 * Copyright (c) 2014 Jannik Vogel
 * Copyright (c) 2012 espes
- * Copyright (c) 2025 Matt Borgerson
 *
 * Based on:
 * Cxbx, VertexShader.cpp
@ -33,10 +32,66 @@
 #include <stdbool.h>
 #include <assert.h>

+#include "hw/xbox/nv2a/pgraph/vsh.h"
 #include "common.h"
-#include "vsh.h"
 #include "vsh-prog.h"

+#define VSH_D3DSCM_CORRECTION 96
+
+
+typedef enum {
+    PARAM_UNKNOWN = 0,
+    PARAM_R,
+    PARAM_V,
+    PARAM_C
+} VshParameterType;
+
+typedef enum {
+    OUTPUT_C = 0,
+    OUTPUT_O
+} VshOutputType;
+
+typedef enum {
+    OMUX_MAC = 0,
+    OMUX_ILU
+} VshOutputMux;
+
+typedef enum {
+    ILU_NOP = 0,
+    ILU_MOV,
+    ILU_RCP,
+    ILU_RCC,
+    ILU_RSQ,
+    ILU_EXP,
+    ILU_LOG,
+    ILU_LIT
+} VshILU;
+
+typedef enum {
+    MAC_NOP,
+    MAC_MOV,
+    MAC_MUL,
+    MAC_ADD,
+    MAC_MAD,
+    MAC_DP3,
+    MAC_DPH,
+    MAC_DP4,
+    MAC_DST,
+    MAC_MIN,
+    MAC_MAX,
+    MAC_SLT,
+    MAC_SGE,
+    MAC_ARL
+} VshMAC;
+
+typedef enum {
+    SWIZZLE_X = 0,
+    SWIZZLE_Y,
+    SWIZZLE_Z,
+    SWIZZLE_W
+} VshSwizzle;
+
+
 typedef struct VshFieldMapping {
    VshFieldName field_name;
    uint8_t subtoken;
@ -88,6 +143,7 @@ static const VshFieldMapping field_mapping[] = {
    {  FLD_FINAL,            3,    0,     1 }
 };

+
 typedef struct VshOpcodeParams {
    bool A;
    bool B;
@ -230,6 +286,8 @@ static const char* out_reg_name[] = {
    "A0.x",
 };

+
+
 // Retrieves a number of bits in the instruction token
 static int vsh_get_from_token(const uint32_t *shader_token,
                              uint8_t subtoken,
@ -248,6 +306,7 @@ uint8_t vsh_get_field(const uint32_t *shader_token, VshFieldName field_name)
                                        field_mapping[field_name].bit_length));
 }

+
 // Converts the C register address to disassembly format
 static int16_t convert_c_register(const int16_t c_reg)
 {
@ -256,7 +315,9 @@ static int16_t convert_c_register(const int16_t c_reg)
    return r; //FIXME: = c_reg?!
 }

-static MString *decode_swizzle(const uint32_t *shader_token,
+
+
+static MString* decode_swizzle(const uint32_t *shader_token,
                               VshFieldName swizzle_field)
 {
    const char* swizzle_str = "xyzw";
@ -294,9 +355,10 @@ static MString *decode_swizzle(const uint32_t *shader_token,
    }
 }

-static MString *decode_opcode_input(const uint32_t *shader_token,
+static MString* decode_opcode_input(const uint32_t *shader_token,
                                    VshParameterType param,
-                                    VshFieldName neg_field, int reg_num)
+                                    VshFieldName neg_field,
+                                    int reg_num)
 {
    /* This function decodes a vertex shader opcode parameter into a string.
     * Input A, B or C is controlled via the Param and NEG fieldnames,
@ -346,10 +408,13 @@ static MString *decode_opcode_input(const uint32_t *shader_token,
    return ret_str;
 }

-static MString *decode_opcode(const uint32_t *shader_token,
-                              VshOutputMux out_mux, uint32_t mask,
-                              const char *opcode, const char *inputs,
-                              MString **suffix)
+
+static MString* decode_opcode(const uint32_t *shader_token,
+                              VshOutputMux out_mux,
+                              uint32_t mask,
+                              const char *opcode,
+                              const char *inputs,
+                              MString** suffix)
 {
    MString *ret = mstring_new();
    int reg_num = vsh_get_field(shader_token, FLD_OUT_R);
@ -431,7 +496,8 @@ static MString *decode_opcode(const uint32_t *shader_token,
    return ret;
 }

-static MString *decode_token(const uint32_t *shader_token)
+
+static MString* decode_token(const uint32_t *shader_token)
 {
    MString *ret;

@ -573,7 +639,7 @@ static const char* vsh_header =
    // Unfortunately mix() falls victim to the same handling of exceptional
    // (inf/NaN) handling as a multiply, so per-component comparisons are used
    // to guarantee HW behavior (anything * 0 must == 0).
-    "  vec4 zero_components = sign(NaNToOne(src0)) * sign(NaNToOne(src1));\n"
+    "  vec4 zero_components = sign(src0) * sign(src1);\n"
    "  vec4 ret = src0 * src1;\n"
    "  if (zero_components.x == 0.0) { ret.x = 0.0; }\n"
    "  if (zero_components.y == 0.0) { ret.y = 0.0; }\n"
@ -723,9 +789,11 @@ static const char* vsh_header =
    "  return t;\n"
    "}\n";

-void pgraph_glsl_gen_vsh_prog(uint16_t version, const uint32_t *tokens,
-                              unsigned int length, MString *header,
-                              MString *body)
+void pgraph_gen_vsh_prog_glsl(uint16_t version,
+                   const uint32_t *tokens,
+                   unsigned int length,
+                   bool vulkan,
+                   MString *header, MString *body)
 {

    mstring_append(header, vsh_header);
@ -737,10 +805,12 @@ void pgraph_glsl_gen_vsh_prog(uint16_t version, const uint32_t *tokens,
        const uint32_t* cur_token = &tokens[slot * VSH_TOKEN_SIZE];
        MString *token_str = decode_token(cur_token);
        mstring_append_fmt(body,
-                           "  /* Slot %d: 0x%08X 0x%08X 0x%08X 0x%08X */\n"
-                           "  %s\n",
-                           slot, cur_token[0], cur_token[1], cur_token[2],
-                           cur_token[3], mstring_get_str(token_str));
+                           "  /* Slot %d: 0x%08X 0x%08X 0x%08X 0x%08X */",
+                           slot,
+                           cur_token[0],cur_token[1],cur_token[2],cur_token[3]);
+        mstring_append(body, "\n");
+        mstring_append(body, mstring_get_str(token_str));
+        mstring_append(body, "\n");
        mstring_unref(token_str);

        if (vsh_get_field(cur_token, FLD_FINAL)) {
@ -751,12 +821,22 @@ void pgraph_glsl_gen_vsh_prog(uint16_t version, const uint32_t *tokens,
    assert(has_final);

    mstring_append(body,
-        /* The shaders leave the result in screen space, while OpenGL expects it
-         * in clip space.
+        /* the shaders leave the result in screen space, while
+         * opengl expects it in clip space.
+         * TODO: the pixel-center co-ordinate differences should be handled
         */
-        "  oPos.xy = roundScreenCoords(oPos.xy);\n"
-        "  oPos.xy = (2.0f * oPos.xy - surfaceSize) / surfaceSize;\n"
+        "  oPos.x = 2.0 * (oPos.x - surfaceSize.x * 0.5) / surfaceSize.x;\n"
+        );

+    if (vulkan) {
+        mstring_append(body,
+                       "  oPos.y = 2.0 * oPos.y / surfaceSize.y - 1.0;\n");
+    } else {
+        mstring_append(body, "  oPos.y = -2.0 * (oPos.y - surfaceSize.y * 0.5) "
+                             "/ surfaceSize.y;\n");
+    }
+
+    mstring_append(body,
        "  oPos.z = oPos.z / clipRange.y;\n"
        "  oPos.w = clampAwayZeroInf(oPos.w);\n"

--- a/hw/xbox/nv2a/pgraph/glsl/vsh-prog.h
+++ b/hw/xbox/nv2a/pgraph/glsl/vsh-prog.h
@ -3,7 +3,13 @@
 *
 * Copyright (c) 2014 Jannik Vogel
 * Copyright (c) 2012 espes
- * Copyright (c) 2025 Matt Borgerson
+ *
+ * Based on:
+ * Cxbx, VertexShader.cpp
+ * Copyright (c) 2004 Aaron Robinson <caustik@caustik.com>
+ *                    Kingofc <kingofc@freenet.de>
+ * Dxbx, uPushBuffer.pas
+ * Copyright (c) 2007 Shadow_tj, PatrickvL
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
@ -22,10 +28,8 @@
 #ifndef HW_XBOX_NV2A_PGRAPH_GLSL_VSH_PROG_H
 #define HW_XBOX_NV2A_PGRAPH_GLSL_VSH_PROG_H

-#include "qemu/mstring.h"
-
-void pgraph_glsl_gen_vsh_prog(uint16_t version, const uint32_t *tokens,
-                              unsigned int length, MString *header,
-                              MString *body);
+void pgraph_gen_vsh_prog_glsl(uint16_t version, const uint32_t *tokens,
+                              unsigned int length,
+                              bool vulkan, MString *header, MString *body);

 #endif
--- a/hw/xbox/nv2a/pgraph/glsl/vsh.c
+++ b/hw/xbox/nv2a/pgraph/glsl/vsh.c
@ -3,7 +3,7 @@
 *
 * Copyright (c) 2015 espes
 * Copyright (c) 2015 Jannik Vogel
- * Copyright (c) 2020-2025 Matt Borgerson
+ * Copyright (c) 2020-2024 Matt Borgerson
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
@ -20,176 +20,39 @@
 */

 #include "qemu/osdep.h"
-#include "hw/xbox/nv2a/pgraph/pgraph.h"
+#include "hw/xbox/nv2a/pgraph/shaders.h"
+#include "common.h"
 #include "vsh.h"
 #include "vsh-ff.h"
 #include "vsh-prog.h"
+#include <stdbool.h>

-DEF_UNIFORM_INFO_ARR(VshUniform, VSH_UNIFORM_DECL_X)
-
-static void set_fixed_function_vsh_state(PGRAPHState *pg,
-                                         FixedFunctionVshState *state)
+MString *pgraph_gen_vsh_glsl(const ShaderState *state, bool prefix_outputs)
 {
-    state->skinning = (enum VshSkinning)GET_MASK(
-        pgraph_reg_r(pg, NV_PGRAPH_CSV0_D), NV_PGRAPH_CSV0_D_SKIN);
-    state->normalization = pgraph_reg_r(pg, NV_PGRAPH_CSV0_C) &
-                           NV_PGRAPH_CSV0_C_NORMALIZATION_ENABLE;
-    state->local_eye =
-        GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_C), NV_PGRAPH_CSV0_C_LOCALEYE);
+    int i;
+    MString *output = mstring_new();
+    mstring_append_fmt(output, "#version %d\n\n", state->vulkan ? 450 : 400);

-    state->emission_src = (enum MaterialColorSource)GET_MASK(
-        pgraph_reg_r(pg, NV_PGRAPH_CSV0_C), NV_PGRAPH_CSV0_C_EMISSION);
-    state->ambient_src = (enum MaterialColorSource)GET_MASK(
-        pgraph_reg_r(pg, NV_PGRAPH_CSV0_C), NV_PGRAPH_CSV0_C_AMBIENT);
-    state->diffuse_src = (enum MaterialColorSource)GET_MASK(
-        pgraph_reg_r(pg, NV_PGRAPH_CSV0_C), NV_PGRAPH_CSV0_C_DIFFUSE);
-    state->specular_src = (enum MaterialColorSource)GET_MASK(
-        pgraph_reg_r(pg, NV_PGRAPH_CSV0_C), NV_PGRAPH_CSV0_C_SPECULAR);
+    MString *header = mstring_from_str("");
+    MString *uniforms = mstring_from_str("");

-    for (int i = 0; i < 4; i++) {
-        state->texture_matrix_enable[i] = pg->texture_matrix_enable[i];
-    }
+    const char *u = state->vulkan ? "" : "uniform "; // FIXME: Remove

-    for (int i = 0; i < 4; i++) {
-        unsigned int reg = (i < 2) ? NV_PGRAPH_CSV1_A : NV_PGRAPH_CSV1_B;
-        for (int j = 0; j < 4; j++) {
-            unsigned int masks[] = {
-                (i % 2) ? NV_PGRAPH_CSV1_A_T1_S : NV_PGRAPH_CSV1_A_T0_S,
-                (i % 2) ? NV_PGRAPH_CSV1_A_T1_T : NV_PGRAPH_CSV1_A_T0_T,
-                (i % 2) ? NV_PGRAPH_CSV1_A_T1_R : NV_PGRAPH_CSV1_A_T0_R,
-                (i % 2) ? NV_PGRAPH_CSV1_A_T1_Q : NV_PGRAPH_CSV1_A_T0_Q
-            };
-            state->texgen[i][j] =
-                (enum VshTexgen)GET_MASK(pgraph_reg_r(pg, reg), masks[j]);
-        }
-    }
+    mstring_append_fmt(uniforms,
+        "%svec4 clipRange;\n"
+        "%svec2 surfaceSize;\n"
+        "%svec4 c[" stringify(NV2A_VERTEXSHADER_CONSTANTS) "];\n"
+        "%svec2 fogParam;\n",
+        u, u, u, u
+        );

-    state->lighting =
-        GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_C), NV_PGRAPH_CSV0_C_LIGHTING);
-    if (state->lighting) {
-        for (int i = 0; i < NV2A_MAX_LIGHTS; i++) {
-            state->light[i] =
-                (enum VshLight)GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_D),
-                                        NV_PGRAPH_CSV0_D_LIGHT0 << (i * 2));
-        }
-    }
-
-    if (pgraph_reg_r(pg, NV_PGRAPH_CONTROL_3) & NV_PGRAPH_CONTROL_3_FOGENABLE) {
-        state->foggen = (enum VshFoggen)GET_MASK(
-            pgraph_reg_r(pg, NV_PGRAPH_CSV0_D), NV_PGRAPH_CSV0_D_FOGGENMODE);
-    }
-}
-
-static void set_programmable_vsh_state(PGRAPHState *pg,
-                                       ProgrammableVshState *prog)
-{
-    int program_start = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_C),
-                                 NV_PGRAPH_CSV0_C_CHEOPS_PROGRAM_START);
-
-    prog->program_length = 0;
-    for (int i = program_start; i < NV2A_MAX_TRANSFORM_PROGRAM_LENGTH; i++) {
-        uint32_t *cur_token = (uint32_t *)&pg->program_data[i];
-        memcpy(&prog->program_data[prog->program_length], cur_token,
-               VSH_TOKEN_SIZE * sizeof(uint32_t));
-        prog->program_length++;
-
-        if (vsh_get_field(cur_token, FLD_FINAL)) {
-            break;
-        }
-    }
-}
-
-void pgraph_glsl_set_vsh_state(PGRAPHState *pg, VshState *vsh)
-{
-    bool vertex_program = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_D),
-                                   NV_PGRAPH_CSV0_D_MODE) == 2;
-
-    bool fixed_function = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_D),
-                                   NV_PGRAPH_CSV0_D_MODE) == 0;
-
-    assert(vertex_program || fixed_function);
-
-    vsh->surface_scale_factor = pg->surface_scale_factor; // FIXME
-
-    vsh->compressed_attrs = pg->compressed_attrs;
-    vsh->uniform_attrs = pg->uniform_attrs;
-    vsh->swizzle_attrs = pg->swizzle_attrs;
-
-    vsh->specular_enable = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_C),
-                                    NV_PGRAPH_CSV0_C_SPECULAR_ENABLE);
-    vsh->separate_specular = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_C),
-                                      NV_PGRAPH_CSV0_C_SEPARATE_SPECULAR);
-    vsh->ignore_specular_alpha =
-        !GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_C),
-                  NV_PGRAPH_CSV0_C_ALPHA_FROM_MATERIAL_SPECULAR);
-    vsh->specular_power = pg->specular_power;
-    vsh->specular_power_back = pg->specular_power_back;
-
-    vsh->z_perspective = pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0) &
-                         NV_PGRAPH_CONTROL_0_Z_PERSPECTIVE_ENABLE;
-
-    vsh->point_params_enable = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_D),
-                                        NV_PGRAPH_CSV0_D_POINTPARAMSENABLE);
-    vsh->point_size = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_POINTSIZE),
-                               NV097_SET_POINT_SIZE_V) /
-                      8.0f;
-    if (vsh->point_params_enable) {
-        for (int i = 0; i < 8; i++) {
-            vsh->point_params[i] = pg->point_params[i];
-        }
-    }
-
-    vsh->smooth_shading = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CONTROL_3),
-                                   NV_PGRAPH_CONTROL_3_SHADEMODE) ==
-                          NV_PGRAPH_CONTROL_3_SHADEMODE_SMOOTH;
-
-    vsh->fog_enable =
-        pgraph_reg_r(pg, NV_PGRAPH_CONTROL_3) & NV_PGRAPH_CONTROL_3_FOGENABLE;
-    if (vsh->fog_enable) {
-        /*FIXME: Use CSV0_D? */
-        vsh->fog_mode =
-            (enum VshFogMode)GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CONTROL_3),
-                                      NV_PGRAPH_CONTROL_3_FOG_MODE);
-    }
-
-    vsh->is_fixed_function = fixed_function;
-    if (fixed_function) {
-        set_fixed_function_vsh_state(pg, &vsh->fixed_function);
-    } else {
-        set_programmable_vsh_state(pg, &vsh->programmable);
-    }
-}
-
-MString *pgraph_glsl_gen_vsh(const VshState *state, GenVshGlslOptions opts)
-{
-    MString *uniforms = mstring_new();
-    const char *u = opts.vulkan ? "" : "uniform ";
-    for (int i = 0; i < ARRAY_SIZE(VshUniformInfo); i++) {
-        const UniformInfo *info = &VshUniformInfo[i];
-        const char *type_str = uniform_element_type_to_str[info->type];
-        if (i == VshUniform_inlineValue &&
-            (!state->uniform_attrs ||
-             opts.use_push_constants_for_uniform_attrs)) {
-            continue;
-        }
-        if (info->count == 1) {
-            mstring_append_fmt(uniforms, "%s%s %s;\n", u, type_str,
-                               info->name);
-        } else {
-            mstring_append_fmt(uniforms, "%s%s %s[%zd];\n", u, type_str,
-                               info->name, info->count);
-        }
-    }
-
-    MString *header = mstring_from_str(
+    mstring_append(header,
        GLSL_DEFINE(fogPlane, GLSL_C(NV_IGRAPH_XF_XFCTX_FOG))
        GLSL_DEFINE(texMat0, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_T0MAT))
        GLSL_DEFINE(texMat1, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_T1MAT))
        GLSL_DEFINE(texMat2, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_T2MAT))
        GLSL_DEFINE(texMat3, GLSL_C_MAT4(NV_IGRAPH_XF_XFCTX_T3MAT))

-        "\n"
-        "#define FLOAT_MAX uintBitsToFloat(0x7F7FFFFFu)\n"
        "\n"
        "vec4 oPos = vec4(0.0,0.0,0.0,1.0);\n"
        "vec4 oD0 = vec4(0.0,0.0,0.0,1.0);\n"
@ -218,23 +81,12 @@ MString *pgraph_glsl_gen_vsh(const VshState *state, GenVshGlslOptions opts)
        "    t = clamp(t, uintBitsToFloat(0xDF800000), uintBitsToFloat(0x9F800000));\n"
        "  }\n"
        "  return t;\n"
-        "}\n"
-        "\n"
-        "vec4 NaNToOne(vec4 src) {\n"
-        "  return mix(src, vec4(1.0), isnan(src));\n"
-        "}\n"
-        "\n"
-        // Xbox NV2A rasterizer appears to have 4 bit precision fixed-point
-        // fractional part and to convert floating-point coordinates by
-        // by truncating (not flooring).
-        "vec2 roundScreenCoords(vec2 pos) {\n"
-        "  return trunc(pos * 16.0f) / 16.0f;\n"
        "}\n");

-    pgraph_glsl_get_vtx_header(header, opts.vulkan, state->smooth_shading,
-                               false, opts.prefix_outputs, false);
+    pgraph_get_glsl_vtx_header(header, state->vulkan, state->smooth_shading,
+                             false, prefix_outputs, false);

-    if (opts.prefix_outputs) {
+    if (prefix_outputs) {
        mstring_append(header,
                       "#define vtxD0 v_vtxD0\n"
                       "#define vtxD1 v_vtxD1\n"
@ -251,7 +103,7 @@ MString *pgraph_glsl_gen_vsh(const VshState *state, GenVshGlslOptions opts)

    int num_uniform_attrs = 0;

-    for (int i = 0; i < NV2A_VERTEXSHADER_ATTRIBUTES; i++) {
+    for (i = 0; i < NV2A_VERTEXSHADER_ATTRIBUTES; i++) {
        bool is_uniform = state->uniform_attrs & (1 << i);
        bool is_swizzled = state->swizzle_attrs & (1 << i);
        bool is_compressed = state->compressed_attrs & (1 << i);
@ -276,12 +128,11 @@ MString *pgraph_glsl_gen_vsh(const VshState *state, GenVshGlslOptions opts)
            }
        }
    }
-
    mstring_append(header, "\n");

    MString *body = mstring_from_str("void main() {\n");

-    for (int i = 0; i < NV2A_VERTEXSHADER_ATTRIBUTES; i++) {
+    for (i = 0; i < NV2A_VERTEXSHADER_ATTRIBUTES; i++) {
        if (state->compressed_attrs & (1 << i)) {
            mstring_append_fmt(
                body, "vec4 v%d = decompress_11_11_10(v%d_cmp);\n", i, i);
@ -293,19 +144,23 @@ MString *pgraph_glsl_gen_vsh(const VshState *state, GenVshGlslOptions opts)

    }

-    if (state->is_fixed_function) {
-        pgraph_glsl_gen_vsh_ff(state, header, body);
+    if (state->fixed_function) {
+        pgraph_gen_vsh_ff_glsl(state, header, body, uniforms);
+    } else if (state->vertex_program) {
+        pgraph_gen_vsh_prog_glsl(VSH_VERSION_XVS,
+                                 (uint32_t *)state->program_data,
+                                 state->program_length,
+                                 state->vulkan, header, body);
    } else {
-        pgraph_glsl_gen_vsh_prog(
-            VSH_VERSION_XVS, (uint32_t *)state->programmable.program_data,
-            state->programmable.program_length, header, body);
+        assert(false);
    }

-    if (!state->fog_enable) {
-        /* FIXME: Is the fog still calculated / passed somehow?! */
-        mstring_append(body, "  oFog = vec4(1.0);\n");
-    } else {
-        if (!state->is_fixed_function) {
+
+    /* Fog */
+
+    if (state->fog_enable) {
+
+        if (state->vertex_program) {
            /* FIXME: Does foggen do something here? Let's do some tracking..
             *
             *   "RollerCoaster Tycoon" has
@ -366,6 +221,7 @@ MString *pgraph_glsl_gen_vsh(const VshState *state, GenVshGlslOptions opts)
            assert(false);
            break;
        }
+        /* Calculate the absolute value for the modes which need it */
        switch (state->fog_mode) {
        case FOG_MODE_LINEAR_ABS:
        case FOG_MODE_EXP_ABS:
@ -376,18 +232,17 @@ MString *pgraph_glsl_gen_vsh(const VshState *state, GenVshGlslOptions opts)
            break;
        }

-        /* Fog is clamped to min/max normal float values here to match HW
-         * interpolation. It is then clamped to [0,1] in the pixel shader.
+        mstring_append(body, "  oFog.xyzw = vec4(fogFactor);\n");
+    } else {
+        /* FIXME: Is the fog still calculated / passed somehow?!
         */
-        // clang-format off
-        mstring_append(body,
-                       "  oFog = clamp(NaNToOne(vec4(fogFactor)), -FLOAT_MAX, FLOAT_MAX);\n");
-        // clang-format on
+        mstring_append(body, "  oFog.xyzw = vec4(1.0);\n");
    }

+    /* Set outputs */
    mstring_append(body, "\n"
-                   "  vtxD0 = clamp(NaNToOne(oD0), 0.0, 1.0);\n"
-                   "  vtxB0 = clamp(NaNToOne(oB0), 0.0, 1.0);\n"
+                   "  vtxD0 = clamp(oD0, 0.0, 1.0);\n"
+                   "  vtxB0 = clamp(oB0, 0.0, 1.0);\n"
                   "  vtxFog = oFog.x;\n"
                   "  vtxT0 = oT0;\n"
                   "  vtxT1 = oT1;\n"
@ -398,16 +253,9 @@ MString *pgraph_glsl_gen_vsh(const VshState *state, GenVshGlslOptions opts)

    if (state->specular_enable) {
        mstring_append(body,
-                       "  vtxD1 = clamp(NaNToOne(oD1), 0.0, 1.0);\n"
-                       "  vtxB1 = clamp(NaNToOne(oB1), 0.0, 1.0);\n"
+                       "  vtxD1 = clamp(oD1, 0.0, 1.0);\n"
+                       "  vtxB1 = clamp(oB1, 0.0, 1.0);\n"
        );
-
-        if (state->ignore_specular_alpha) {
-            mstring_append(body,
-                           "  vtxD1.w = 1.0;\n"
-                           "  vtxB1.w = 1.0;\n"
-            );
-        }
    } else {
        mstring_append(body,
                       "  vtxD1 = vec4(0.0, 0.0, 0.0, 1.0);\n"
@ -415,7 +263,7 @@ MString *pgraph_glsl_gen_vsh(const VshState *state, GenVshGlslOptions opts)
        );
    }

-    if (opts.vulkan) {
+    if (state->vulkan) {
        mstring_append(body,
                   "  gl_Position = oPos;\n"
        );
@ -428,25 +276,25 @@ MString *pgraph_glsl_gen_vsh(const VshState *state, GenVshGlslOptions opts)
    mstring_append(body, "}\n");

    /* Return combined header + source */
-    MString *output =
-        mstring_from_fmt("#version %d\n\n", opts.vulkan ? 450 : 400);
-
-    if (opts.vulkan) {
+    if (state->vulkan) {
        // FIXME: Optimize uniforms
-        if (num_uniform_attrs > 0 &&
-            opts.use_push_constants_for_uniform_attrs) {
+        if (num_uniform_attrs > 0) {
+            if (state->use_push_constants_for_uniform_attrs) {
                mstring_append_fmt(output,
                    "layout(push_constant) uniform PushConstants {\n"
                    "    vec4 inlineValue[%d];\n"
-                               "};\n\n",
+                    "};\n\n", num_uniform_attrs);
+            } else {
+                mstring_append_fmt(uniforms, "    vec4 inlineValue[%d];\n",
                                   num_uniform_attrs);
            }
+        }
        mstring_append_fmt(
            output,
            "layout(binding = %d, std140) uniform VshUniforms {\n"
            "%s"
            "};\n\n",
-            opts.ubo_binding, mstring_get_str(uniforms));
+            VSH_UBO_BINDING, mstring_get_str(uniforms));
    } else {
        mstring_append(
            output, mstring_get_str(uniforms));
@ -457,110 +305,5 @@ MString *pgraph_glsl_gen_vsh(const VshState *state, GenVshGlslOptions opts)

    mstring_append(output, mstring_get_str(body));
    mstring_unref(body);
-
    return output;
 }
-
-void pgraph_glsl_set_vsh_uniform_values(PGRAPHState *pg, const VshState *state,
-                                        const VshUniformLocs locs,
-                                        VshUniformValues *values)
-{
-    if (locs[VshUniform_c] != -1) {
-        QEMU_BUILD_BUG_MSG(sizeof(values->c) != sizeof(pg->vsh_constants),
-                           "Uniform value size inconsistency");
-        memcpy(values->c, pg->vsh_constants, sizeof(pg->vsh_constants));
-    }
-
-    if (locs[VshUniform_clipRange] != -1) {
-        pgraph_glsl_set_clip_range_uniform_value(pg, values->clipRange[0]);
-    }
-
-    if (locs[VshUniform_fogParam] != -1) {
-        uint32_t param_0 = pgraph_reg_r(pg, NV_PGRAPH_FOGPARAM0);
-        uint32_t param_1 = pgraph_reg_r(pg, NV_PGRAPH_FOGPARAM1);
-        values->fogParam[0][0] = *(float *)&param_0;
-        values->fogParam[0][1] = *(float *)&param_1;
-    }
-
-    if (locs[VshUniform_pointParams] != -1) {
-        QEMU_BUILD_BUG_MSG(sizeof(values->pointParams) !=
-                               sizeof(pg->point_params),
-                           "Uniform value size inconsistency");
-        memcpy(values->pointParams, pg->point_params, sizeof(pg->point_params));
-    }
-
-    if (locs[VshUniform_material_alpha] != -1) {
-        values->material_alpha[0] = pg->material_alpha;
-    }
-
-    if (locs[VshUniform_inlineValue] != -1) {
-        pgraph_get_inline_values(pg, state->uniform_attrs, values->inlineValue,
-                                 NULL);
-    }
-
-    if (locs[VshUniform_surfaceSize] != -1) {
-        unsigned int aa_width = 1, aa_height = 1;
-        pgraph_apply_anti_aliasing_factor(pg, &aa_width, &aa_height);
-        float width = (float)pg->surface_binding_dim.width / aa_width;
-        float height = (float)pg->surface_binding_dim.height / aa_height;
-        values->surfaceSize[0][0] = width;
-        values->surfaceSize[0][1] = height;
-    }
-
-    if (state->is_fixed_function) {
-        if (locs[VshUniform_ltctxa] != -1) {
-            QEMU_BUILD_BUG_MSG(sizeof(values->ltctxa) != sizeof(pg->ltctxa),
-                               "Uniform value size inconsistency");
-            memcpy(values->ltctxa, pg->ltctxa, sizeof(pg->ltctxa));
-        }
-
-        if (locs[VshUniform_ltctxb] != -1) {
-            QEMU_BUILD_BUG_MSG(sizeof(values->ltctxb) != sizeof(pg->ltctxb),
-                               "Uniform value size inconsistency");
-            memcpy(values->ltctxb, pg->ltctxb, sizeof(pg->ltctxb));
-        }
-
-        if (locs[VshUniform_ltc1] != -1) {
-            QEMU_BUILD_BUG_MSG(sizeof(values->ltc1) != sizeof(pg->ltc1),
-                               "Uniform value size inconsistency");
-            memcpy(values->ltc1, pg->ltc1, sizeof(pg->ltc1));
-        }
-
-        if (locs[VshUniform_lightInfiniteHalfVector] != -1) {
-            QEMU_BUILD_BUG_MSG(sizeof(values->lightInfiniteHalfVector) !=
-                                   sizeof(pg->light_infinite_half_vector),
-                               "Uniform value size inconsistency");
-            memcpy(values->lightInfiniteHalfVector,
-                   pg->light_infinite_half_vector,
-                   sizeof(pg->light_infinite_half_vector));
-        }
-
-        if (locs[VshUniform_lightInfiniteDirection] != -1) {
-            QEMU_BUILD_BUG_MSG(sizeof(values->lightInfiniteDirection) !=
-                                   sizeof(pg->light_infinite_direction),
-                               "Uniform value size inconsistency");
-            memcpy(values->lightInfiniteDirection, pg->light_infinite_direction,
-                   sizeof(pg->light_infinite_direction));
-        }
-
-        if (locs[VshUniform_lightLocalPosition] != -1) {
-            QEMU_BUILD_BUG_MSG(sizeof(values->lightLocalPosition) !=
-                                   sizeof(pg->light_local_position),
-                               "Uniform value size inconsistency");
-            memcpy(values->lightLocalPosition, pg->light_local_position,
-                   sizeof(pg->light_local_position));
-        }
-
-        if (locs[VshUniform_lightLocalAttenuation] != -1) {
-            QEMU_BUILD_BUG_MSG(sizeof(values->lightLocalAttenuation) !=
-                                   sizeof(pg->light_local_attenuation),
-                               "Uniform value size inconsistency");
-            memcpy(values->lightLocalAttenuation, pg->light_local_attenuation,
-                   sizeof(pg->light_local_attenuation));
-        }
-
-        if (locs[VshUniform_specularPower] != -1) {
-            values->specularPower[0] = pg->specular_power;
-        }
-    }
-}
--- a/hw/xbox/nv2a/pgraph/glsl/vsh.h
+++ b/hw/xbox/nv2a/pgraph/glsl/vsh.h
@ -3,7 +3,7 @@
 *
 * Copyright (c) 2015 espes
 * Copyright (c) 2015 Jannik Vogel
- * Copyright (c) 2020-2025 Matt Borgerson
+ * Copyright (c) 2020-2024 Matt Borgerson
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
@ -22,92 +22,12 @@
 #ifndef HW_XBOX_NV2A_PGRAPH_GLSL_VSH_H
 #define HW_XBOX_NV2A_PGRAPH_GLSL_VSH_H

-#include "common.h"
-#include "hw/xbox/nv2a/pgraph/vsh_regs.h"
+#include "qemu/mstring.h"
+#include "hw/xbox/nv2a/pgraph/shaders.h"

-typedef struct PGRAPHState PGRAPHState;
+// FIXME: Move to struct
+#define VSH_UBO_BINDING 0

-typedef struct FixedFunctionVshState {
-    bool normalization;
-    bool texture_matrix_enable[4];
-    enum VshTexgen texgen[4][4];
-    enum VshFoggen foggen;
-    enum VshSkinning skinning;
-    bool lighting;
-    enum VshLight light[NV2A_MAX_LIGHTS];
-    enum MaterialColorSource emission_src;
-    enum MaterialColorSource ambient_src;
-    enum MaterialColorSource diffuse_src;
-    enum MaterialColorSource specular_src;
-    bool local_eye;
-} FixedFunctionVshState;
-
-typedef struct ProgrammableVshState {
-    uint32_t program_data[NV2A_MAX_TRANSFORM_PROGRAM_LENGTH][VSH_TOKEN_SIZE];
-    int program_length;
-} ProgrammableVshState;
-
-typedef struct {
-    unsigned int surface_scale_factor;  // FIXME: Remove
-
-    uint16_t compressed_attrs;
-    uint16_t uniform_attrs;
-    uint16_t swizzle_attrs;
-
-    bool fog_enable;
-    enum VshFogMode fog_mode;
-
-    bool specular_enable;
-    bool separate_specular;
-    bool ignore_specular_alpha;
-    float specular_power;
-    float specular_power_back;
-
-    bool point_params_enable;
-    float point_size;
-    float point_params[8];
-
-    bool smooth_shading;
-    bool z_perspective;
-
-    bool is_fixed_function;
-    FixedFunctionVshState fixed_function;
-    ProgrammableVshState programmable;
-} VshState;
-
-void pgraph_glsl_set_vsh_state(PGRAPHState *pg, VshState *state);
-
-#define VSH_UNIFORM_DECL_X(S, DECL)                          \
-    DECL(S, c, vec4, NV2A_VERTEXSHADER_CONSTANTS)            \
-    DECL(S, clipRange, vec4, 1)                              \
-    DECL(S, fogParam, vec2, 1)                               \
-    DECL(S, inlineValue, vec4, NV2A_VERTEXSHADER_ATTRIBUTES) \
-    DECL(S, lightInfiniteDirection, vec3, NV2A_MAX_LIGHTS)   \
-    DECL(S, lightInfiniteHalfVector, vec3, NV2A_MAX_LIGHTS)  \
-    DECL(S, lightLocalAttenuation, vec3, NV2A_MAX_LIGHTS)    \
-    DECL(S, lightLocalPosition, vec3, NV2A_MAX_LIGHTS)       \
-    DECL(S, ltc1, vec4, NV2A_LTC1_COUNT)                     \
-    DECL(S, ltctxa, vec4, NV2A_LTCTXA_COUNT)                 \
-    DECL(S, ltctxb, vec4, NV2A_LTCTXB_COUNT)                 \
-    DECL(S, material_alpha, float, 1)                        \
-    DECL(S, pointParams, float, 8)                           \
-    DECL(S, specularPower, float, 1)                         \
-    DECL(S, surfaceSize, vec2, 1)
-
-DECL_UNIFORM_TYPES(VshUniform, VSH_UNIFORM_DECL_X)
-
-typedef struct GenVshGlslOptions {
-    bool vulkan;
-    bool prefix_outputs;
-    bool use_push_constants_for_uniform_attrs;
-    int ubo_binding;
-} GenVshGlslOptions;
-
-MString *pgraph_glsl_gen_vsh(const VshState *state,
-                             GenVshGlslOptions glsl_opts);
-
-void pgraph_glsl_set_vsh_uniform_values(PGRAPHState *pg, const VshState *state,
-                                        const VshUniformLocs locs,
-                                        VshUniformValues *values);
+MString *pgraph_gen_vsh_glsl(const ShaderState *state, bool prefix_outputs);

 #endif
--- a/hw/xbox/nv2a/pgraph/meson.build
+++ b/hw/xbox/nv2a/pgraph/meson.build
@ -3,6 +3,7 @@ specific_ss.add(files(
 	'profile.c',
 	'rdi.c',
 	's3tc.c',
+	'shaders.c',
 	'swizzle.c',
 	'texture.c',
 	'vertex.c',
--- a/hw/xbox/nv2a/pgraph/methods.h.inc
+++ b/hw/xbox/nv2a/pgraph/methods.h.inc
@ -27,7 +27,6 @@ DEF_METHOD(NV097, SET_COMBINER_SPECULAR_FOG_CW0)
 DEF_METHOD(NV097, SET_COMBINER_SPECULAR_FOG_CW1)
 DEF_METHOD_CASE_4(NV097, SET_TEXTURE_ADDRESS, 64)
 DEF_METHOD(NV097, SET_CONTROL0)
-DEF_METHOD(NV097, SET_LIGHT_CONTROL)
 DEF_METHOD(NV097, SET_COLOR_MATERIAL)
 DEF_METHOD(NV097, SET_FOG_MODE)
 DEF_METHOD(NV097, SET_FOG_GEN_MODE)
@ -96,7 +95,6 @@ DEF_METHOD_RANGE(NV097, SET_FOG_PARAMS, 3)
 DEF_METHOD_RANGE(NV097, SET_TEXGEN_PLANE_S, 4*4*4)
 DEF_METHOD(NV097, SET_TEXGEN_VIEW_MODEL)
 DEF_METHOD_RANGE(NV097, SET_FOG_PLANE, 4)
-DEF_METHOD_RANGE(NV097, SET_SPECULAR_PARAMS, 6)
 DEF_METHOD_RANGE(NV097, SET_SCENE_AMBIENT_COLOR, 3)
 DEF_METHOD_RANGE(NV097, SET_VIEWPORT_OFFSET, 4)
 DEF_METHOD_RANGE(NV097, SET_POINT_PARAMS, 8)
@ -105,7 +103,6 @@ DEF_METHOD_RANGE(NV097, SET_COMBINER_FACTOR0, 8)
 DEF_METHOD_RANGE(NV097, SET_COMBINER_FACTOR1, 8)
 DEF_METHOD_RANGE(NV097, SET_COMBINER_ALPHA_OCW, 8)
 DEF_METHOD_RANGE(NV097, SET_COMBINER_COLOR_ICW, 8)
-DEF_METHOD_RANGE(NV097, SET_COLOR_KEY_COLOR, 4)
 DEF_METHOD_RANGE(NV097, SET_VIEWPORT_SCALE, 4)
 DEF_METHOD_RANGE(NV097, SET_TRANSFORM_PROGRAM, 32)
 DEF_METHOD_RANGE(NV097, SET_TRANSFORM_CONSTANT, 32)
@ -137,11 +134,6 @@ DEF_METHOD_RANGE(NV097, SET_TEXCOORD3_2F, 2)
 DEF_METHOD_RANGE(NV097, SET_TEXCOORD3_4F, 4)
 DEF_METHOD_RANGE(NV097, SET_TEXCOORD3_2S, 1)
 DEF_METHOD_RANGE(NV097, SET_TEXCOORD3_4S, 2)
-DEF_METHOD(NV097, SET_FOG_COORD)
-DEF_METHOD(NV097, SET_WEIGHT1F)
-DEF_METHOD_RANGE(NV097, SET_WEIGHT2F, 2)
-DEF_METHOD_RANGE(NV097, SET_WEIGHT3F, 3)
-DEF_METHOD_RANGE(NV097, SET_WEIGHT4F, 4)
 DEF_METHOD_RANGE(NV097, SET_VERTEX_DATA_ARRAY_FORMAT, 16)
 DEF_METHOD_RANGE(NV097, SET_VERTEX_DATA_ARRAY_OFFSET, 16)
 DEF_METHOD(NV097, SET_LOGIC_OP_ENABLE)
@ -185,7 +177,6 @@ DEF_METHOD(NV097, CLEAR_SURFACE)
 DEF_METHOD(NV097, SET_CLEAR_RECT_HORIZONTAL)
 DEF_METHOD(NV097, SET_CLEAR_RECT_VERTICAL)
 DEF_METHOD_RANGE(NV097, SET_SPECULAR_FOG_FACTOR, 2)
-DEF_METHOD_RANGE(NV097, SET_SPECULAR_PARAMS_BACK, 6)
 DEF_METHOD(NV097, SET_SHADER_CLIP_PLANE_MODE)
 DEF_METHOD_RANGE(NV097, SET_COMBINER_COLOR_OCW, 8)
 DEF_METHOD(NV097, SET_COMBINER_CONTROL)
--- a/hw/xbox/nv2a/pgraph/pgraph.c
+++ b/hw/xbox/nv2a/pgraph/pgraph.c
@ -19,8 +19,6 @@
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

-#include <math.h>
-
 #include "hw/xbox/nv2a/nv2a_int.h"
 #include "ui/xemu-notifications.h"
 #include "ui/xemu-settings.h"
@ -225,8 +223,6 @@ void pgraph_init(NV2AState *d)
    qemu_event_init(&pg->sync_complete, false);
    qemu_event_init(&pg->flush_complete, false);
    qemu_cond_init(&pg->framebuffer_released);
-    qemu_event_init(&pg->renderer_switch_complete, false);
-    pg->renderer_switch_phase = PGRAPH_RENDERER_SWITCH_PHASE_IDLE;

    pg->frame_time = 0;
    pg->draw_time = 0;
@ -1079,18 +1075,6 @@ DEF_METHOD(NV097, SET_CONTROL0)
             z_perspective);
 }

-DEF_METHOD(NV097, SET_LIGHT_CONTROL)
-{
-    PG_SET_MASK(NV_PGRAPH_CSV0_C, NV_PGRAPH_CSV0_C_SEPARATE_SPECULAR,
-             (parameter & NV097_SET_LIGHT_CONTROL_SEPARATE_SPECULAR) != 0);
-
-    PG_SET_MASK(NV_PGRAPH_CSV0_C, NV_PGRAPH_CSV0_C_LOCALEYE,
-             (parameter & NV097_SET_LIGHT_CONTROL_LOCALEYE) != 0);
-
-    PG_SET_MASK(NV_PGRAPH_CSV0_C, NV_PGRAPH_CSV0_C_ALPHA_FROM_MATERIAL_SPECULAR,
-             (parameter & NV097_SET_LIGHT_CONTROL_ALPHA_FROM_MATERIAL_SPECULAR) != 0);
-}
-
 DEF_METHOD(NV097, SET_COLOR_MATERIAL)
 {
    PG_SET_MASK(NV_PGRAPH_CSV0_C, NV_PGRAPH_CSV0_C_EMISSION,
@ -1413,10 +1397,8 @@ DEF_METHOD(NV097, SET_BLEND_EQUATION)

 DEF_METHOD(NV097, SET_DEPTH_FUNC)
 {
-    if (parameter >= 0x200 && parameter <= 0x207) {
    PG_SET_MASK(NV_PGRAPH_CONTROL_0, NV_PGRAPH_CONTROL_0_ZFUNC,
             parameter & 0xF);
-    }
 }

 DEF_METHOD(NV097, SET_COLOR_MASK)
@ -1809,113 +1791,6 @@ DEF_METHOD_INC(NV097, SET_FOG_PLANE)
    pg->vsh_constants_dirty[NV_IGRAPH_XF_XFCTX_FOG] = true;
 }

-struct CurveCoefficients {
-  float a;
-  float b;
-  float c;
-};
-
-static const struct CurveCoefficients curve_coefficients[] = {
-  {1.000108475163, -9.838607076280, 54.829089549713},
-  {1.199164441703, -3.292603784852, 7.799987995214},
-  {8.653441252033, 29.189473787191, 43.586027561823},
-  {-531.307758450301, 117.398468683934, 113.155490738338},
-  {-4.662713151292, 1.221108944572, 1.217360986939},
-  {-124.435242105211, 35.401219563514, 35.408114377045},
-  {10672560.259502287954, 21565843.555823743343, 10894794.336297152564},
-  {-51973801.463933646679, -104199997.554352939129, -52225454.356278456748},
-  {972270.324080004124, 2025882.096547174733, 1054898.052467488218},
-};
-
-static const float kCoefficient0StepPoints[] = {
-  -0.022553957999, // power = 1.25
-  -0.421539008617, // power = 4.00
-  -0.678715527058, // power = 9.00
-  -0.838916420937, // power = 20.00
-  -0.961754500866, // power = 90.00
-  -0.990773200989, // power = 375.00
-  -0.994858562946, // power = 650.00
-  -0.996561050415, // power = 1000.00
-  -0.999547004700, // power = 1250.00
-};
-
-static float reconstruct_quadratic(float c0, const struct CurveCoefficients *coefficients) {
-  return coefficients->a + coefficients->b * c0 + coefficients->c * c0 * c0;
-}
-
-static float reconstruct_saturation_growth_rate(float c0, const struct CurveCoefficients *coefficients) {
-  return (coefficients->a * c0) / (coefficients->b + coefficients->c * c0);
-}
-
-static float (* const reconstruct_func_map[])(float, const struct CurveCoefficients *) = {
-  reconstruct_quadratic, // 1.0..1.25 max error 0.01 %
-  reconstruct_quadratic, // 1.25..4.0 max error 2.2 %
-  reconstruct_quadratic, // 4.0..9.0 max error 2.3 %
-  reconstruct_saturation_growth_rate, // 9.0..20.0 max error 1.4 %
-  reconstruct_saturation_growth_rate, // 20.0..90.0 max error 2.1 %
-  reconstruct_saturation_growth_rate, // 90.0..375.0 max error 2.8%
-  reconstruct_quadratic, // 375..650 max error 1.0 %
-  reconstruct_quadratic, // 650..1000 max error 1.7%
-  reconstruct_quadratic, // 1000..1250 max error 1.0%
-};
-
-static float reconstruct_specular_power(const float *params) {
-  // See https://github.com/dracc/xgu/blob/db3172d8c983629f0dc971092981846da22438ae/xgux.h#L279
-
-  // Values < 1.0 will result in a positive c1 and (c2 - c0 * 2) will be very
-  // close to the original value.
-  if (params[1] > 0.0f && params[2] < 1.0f) {
-    return params[2] - (params[0] * 2.0f);
-  }
-
-  float c0 = params[0];
-  float c3 = params[3];
-  // FIXME: This handling is not correct, but is distinct without crashing.
-  // It does not appear possible for a DirectX-generated value to be positive,
-  // so while this differs from hardware behavior, it may be irrelevant in
-  // practice.
-  if (c0 > 0.0f || c3 > 0.0f) {
-    return 0.0001f;
-  }
-
-  float reconstructed_power = 0.f;
-  for (uint32_t i = 0; i < sizeof(kCoefficient0StepPoints) / sizeof(kCoefficient0StepPoints[0]); ++i) {
-    if (c0 > kCoefficient0StepPoints[i]) {
-      reconstructed_power = reconstruct_func_map[i](c0, &curve_coefficients[i]);
-      break;
-    }
-  }
-
-  float reconstructed_half_power = 0.f;
-  for (uint32_t i = 0; i < sizeof(kCoefficient0StepPoints) / sizeof(kCoefficient0StepPoints[0]); ++i) {
-    if (c3 > kCoefficient0StepPoints[i]) {
-      reconstructed_half_power = reconstruct_func_map[i](c3, &curve_coefficients[i]);
-      break;
-    }
-  }
-
-  // The range can be extended beyond 1250 by using the half power params. This
-  // will only work for DirectX generated values, arbitrary params could
-  // erroneously trigger this.
-  //
-  // There are some very low power (~1) values that have inverted powers, but
-  // they are easily identified by comparatively high c0 parameters.
-  if (reconstructed_power == 0.f || (reconstructed_half_power > reconstructed_power && c0 < -0.1f)) {
-    return reconstructed_half_power * 2.f;
-  }
-
-  return reconstructed_power;
-}
-
-DEF_METHOD_INC(NV097, SET_SPECULAR_PARAMS)
-{
-    int slot = (method - NV097_SET_SPECULAR_PARAMS) / 4;
-    pg->specular_params[slot] = *(float *)&parameter;
-    if (slot == 5) {
-        pg->specular_power = reconstruct_specular_power(pg->specular_params);
-    }
-}
-
 DEF_METHOD_INC(NV097, SET_SCENE_AMBIENT_COLOR)
 {
    int slot = (method - NV097_SET_SCENE_AMBIENT_COLOR) / 4;
@ -1968,12 +1843,6 @@ DEF_METHOD_INC(NV097, SET_COMBINER_COLOR_ICW)
    pgraph_reg_w(pg, NV_PGRAPH_COMBINECOLORI0 + slot*4, parameter);
 }

-DEF_METHOD_INC(NV097, SET_COLOR_KEY_COLOR)
-{
-    int slot = (method - NV097_SET_COLOR_KEY_COLOR) / 4;
-    pgraph_reg_w(pg, NV_PGRAPH_COLORKEYCOLOR0 + slot * 4, parameter);
-}
-
 DEF_METHOD_INC(NV097, SET_VIEWPORT_SCALE)
 {
    int slot = (method - NV097_SET_VIEWPORT_SCALE) / 4;
@ -2141,26 +2010,6 @@ DEF_METHOD_INC(NV097, SET_VERTEX4F)
    }
 }

-DEF_METHOD(NV097, SET_FOG_COORD)
-{
-    VertexAttribute *attribute = &pg->vertex_attributes[NV2A_VERTEX_ATTR_FOG];
-    pgraph_allocate_inline_buffer_vertices(pg, NV2A_VERTEX_ATTR_FOG);
-    attribute->inline_value[0] = *(float*)&parameter;
-    attribute->inline_value[1] = attribute->inline_value[0];
-    attribute->inline_value[2] = attribute->inline_value[0];
-    attribute->inline_value[3] = attribute->inline_value[0];
-}
-
-DEF_METHOD(NV097, SET_WEIGHT1F)
-{
-    VertexAttribute *attribute = &pg->vertex_attributes[NV2A_VERTEX_ATTR_WEIGHT];
-    pgraph_allocate_inline_buffer_vertices(pg, NV2A_VERTEX_ATTR_WEIGHT);
-    attribute->inline_value[0] = *(float*)&parameter;
-    attribute->inline_value[1] = 0.f;
-    attribute->inline_value[2] = 0.f;
-    attribute->inline_value[3] = 1.f;
-}
-
 DEF_METHOD_INC(NV097, SET_NORMAL3S)
 {
    int slot = (method - NV097_SET_NORMAL3S) / 4;
@ -2295,6 +2144,7 @@ DEF_METHOD_INC(NV097, SET_TEXCOORD1_4F)
    SET_VERTEX_ATTRIBUTE_F(NV097_SET_TEXCOORD1_4F, NV2A_VERTEX_ATTR_TEXTURE1);
 }

+
 DEF_METHOD_INC(NV097, SET_TEXCOORD2_4F)
 {
    SET_VERTEX_ATTRIBUTE_F(NV097_SET_TEXCOORD2_4F, NV2A_VERTEX_ATTR_TEXTURE2);
@ -2305,34 +2155,8 @@ DEF_METHOD_INC(NV097, SET_TEXCOORD3_4F)
    SET_VERTEX_ATTRIBUTE_F(NV097_SET_TEXCOORD3_4F, NV2A_VERTEX_ATTR_TEXTURE3);
 }

-DEF_METHOD_INC(NV097, SET_WEIGHT4F)
-{
-    SET_VERTEX_ATTRIBUTE_F(NV097_SET_WEIGHT4F, NV2A_VERTEX_ATTR_WEIGHT);
-}
-
 #undef SET_VERTEX_ATTRIBUTE_F

-DEF_METHOD_INC(NV097, SET_WEIGHT2F)
-{
-    int slot = (method - NV097_SET_WEIGHT2F) / 4;
-    VertexAttribute *attribute =
-        &pg->vertex_attributes[NV2A_VERTEX_ATTR_WEIGHT];
-    pgraph_allocate_inline_buffer_vertices(pg, NV2A_VERTEX_ATTR_WEIGHT);
-    attribute->inline_value[slot] = *(float*)&parameter;
-    attribute->inline_value[2] = 0.0f;
-    attribute->inline_value[3] = 1.0f;
-}
-
-DEF_METHOD_INC(NV097, SET_WEIGHT3F)
-{
-    int slot = (method - NV097_SET_WEIGHT3F) / 4;
-    VertexAttribute *attribute =
-        &pg->vertex_attributes[NV2A_VERTEX_ATTR_WEIGHT];
-    pgraph_allocate_inline_buffer_vertices(pg, NV2A_VERTEX_ATTR_WEIGHT);
-    attribute->inline_value[slot] = *(float*)&parameter;
-    attribute->inline_value[3] = 1.0f;
-}
-
 #define SET_VERTEX_ATRIBUTE_TEX_2F(command, attr_index)                    \
    do {                                                                   \
        int slot = (method - (command)) / 4;                               \
@ -2702,11 +2526,7 @@ DEF_METHOD(NV097, DRAW_ARRAYS)
    int32_t count = GET_MASK(parameter, NV097_DRAW_ARRAYS_COUNT) + 1;

    if (pg->inline_elements_length) {
-        /* FIXME: HW throws an exception if the start index is > 0xFFFF. This
-         * would prevent this assert from firing for any reasonable choice of
-         * NV2A_MAX_BATCH_LENGTH (which must be larger to accommodate
-         * NV097_INLINE_ARRAY anyway)
-         */
+        /* FIXME: Determine HW behavior for overflow case. */
        assert((pg->inline_elements_length + count) < NV2A_MAX_BATCH_LENGTH);
        assert(!pg->draw_arrays_prevent_connect);

@ -2908,15 +2728,6 @@ DEF_METHOD_INC(NV097, SET_SPECULAR_FOG_FACTOR)
    pgraph_reg_w(pg, NV_PGRAPH_SPECFOGFACTOR0 + slot*4, parameter);
 }

-DEF_METHOD_INC(NV097, SET_SPECULAR_PARAMS_BACK)
-{
-    int slot = (method - NV097_SET_SPECULAR_PARAMS_BACK) / 4;
-    pg->specular_params_back[slot] = *(float *)&parameter;
-    if (slot == 5) {
-        pg->specular_power_back = reconstruct_specular_power(pg->specular_params_back);
-    }
-}
-
 DEF_METHOD(NV097, SET_SHADER_CLIP_PLANE_MODE)
 {
    pgraph_reg_w(pg, NV_PGRAPH_SHADERCLIPMODE, parameter);
@ -3147,31 +2958,12 @@ void pgraph_write_zpass_pixel_cnt_report(NV2AState *d, uint32_t parameter,
    NV2A_DPRINTF("Report result %d @%" HWADDR_PRIx, result, offset);
 }

-static void do_wait_for_renderer_switch(CPUState *cpu, run_on_cpu_data data)
-{
-    NV2AState *d = (NV2AState *)data.host_ptr;
-
-    qemu_mutex_lock(&d->pfifo.lock);
-    d->pgraph.renderer_switch_phase = PGRAPH_RENDERER_SWITCH_PHASE_CPU_WAITING;
-    pfifo_kick(d);
-    qemu_mutex_unlock(&d->pfifo.lock);
-    qemu_event_wait(&d->pgraph.renderer_switch_complete);
-}
-
 void pgraph_process_pending(NV2AState *d)
 {
    PGRAPHState *pg = &d->pgraph;
    pg->renderer->ops.process_pending(d);

-    if (g_config.display.renderer != pg->renderer->type &&
-        pg->renderer_switch_phase == PGRAPH_RENDERER_SWITCH_PHASE_IDLE) {
-        pg->renderer_switch_phase = PGRAPH_RENDERER_SWITCH_PHASE_STARTED;
-        qemu_event_reset(&pg->renderer_switch_complete);
-        async_safe_run_on_cpu(qemu_get_cpu(0), do_wait_for_renderer_switch,
-                              RUN_ON_CPU_HOST_PTR(d));
-    }
-
-    if (pg->renderer_switch_phase == PGRAPH_RENDERER_SWITCH_PHASE_CPU_WAITING) {
+    if (g_config.display.renderer != pg->renderer->type) {
        qemu_mutex_lock(&d->pgraph.renderer_lock);
        qemu_mutex_unlock(&d->pfifo.lock);
        qemu_mutex_lock(&d->pgraph.lock);
@ -3183,13 +2975,14 @@ void pgraph_process_pending(NV2AState *d)
            qemu_mutex_lock(&d->pfifo.lock);
            qemu_mutex_unlock(&d->pgraph.lock);

+            if (pg->renderer->ops.process_pending) {
                pg->renderer->ops.process_pending(d);
+            }

            qemu_mutex_unlock(&d->pfifo.lock);
            qemu_mutex_lock(&d->pgraph.lock);
            while (pg->framebuffer_in_use) {
-                qemu_cond_wait(&d->pgraph.framebuffer_released,
-                               &d->pgraph.renderer_lock);
+                qemu_cond_wait(&d->pgraph.framebuffer_released, &d->pgraph.renderer_lock);
            }

            if (pg->renderer->ops.finalize) {
@ -3202,9 +2995,6 @@ void pgraph_process_pending(NV2AState *d)
        qemu_mutex_unlock(&d->pgraph.renderer_lock);
        qemu_mutex_unlock(&d->pgraph.lock);
        qemu_mutex_lock(&d->pfifo.lock);
-
-        pg->renderer_switch_phase = PGRAPH_RENDERER_SWITCH_PHASE_IDLE;
-        qemu_event_set(&pg->renderer_switch_complete);
    }
 }

--- a/hw/xbox/nv2a/pgraph/pgraph.h
+++ b/hw/xbox/nv2a/pgraph/pgraph.h
@ -29,10 +29,9 @@
 #include "qemu/thread.h"
 #include "cpu.h"

+#include "shaders.h"
 #include "surface.h"
-#include "texture.h"
 #include "util.h"
-#include "vsh_regs.h"

 typedef struct NV2AState NV2AState;
 typedef struct PGRAPHNullState PGRAPHNullState;
@ -198,11 +197,6 @@ typedef struct PGRAPHState {
    float light_local_position[NV2A_MAX_LIGHTS][3];
    float light_local_attenuation[NV2A_MAX_LIGHTS][3];

-    float specular_params[6];
-    float specular_power;
-    float specular_params_back[6];
-    float specular_power_back;
-
    float point_params[8];

    VertexAttribute vertex_attributes[NV2A_VERTEXSHADER_ATTRIBUTES];
@ -244,13 +238,6 @@ typedef struct PGRAPHState {
    bool framebuffer_in_use;
    QemuCond framebuffer_released;

-    enum {
-        PGRAPH_RENDERER_SWITCH_PHASE_IDLE,
-        PGRAPH_RENDERER_SWITCH_PHASE_STARTED,
-        PGRAPH_RENDERER_SWITCH_PHASE_CPU_WAITING,
-    } renderer_switch_phase;
-    QemuEvent renderer_switch_complete;
-
    unsigned int surface_scale_factor;
    uint8_t *scale_buf;

--- a/hw/xbox/nv2a/pgraph/psh.h
+++ b/hw/xbox/nv2a/pgraph/psh.h
@ -0,0 +1,92 @@
+/*
+ * QEMU Geforce NV2A pixel shader translation
+ *
+ * Copyright (c) 2013 espes
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_NV2A_PSH_H
+#define HW_NV2A_PSH_H
+
+#include <stdint.h>
+#include <stdbool.h>
+
+enum PshAlphaFunc {
+    ALPHA_FUNC_NEVER,
+    ALPHA_FUNC_LESS,
+    ALPHA_FUNC_EQUAL,
+    ALPHA_FUNC_LEQUAL,
+    ALPHA_FUNC_GREATER,
+    ALPHA_FUNC_NOTEQUAL,
+    ALPHA_FUNC_GEQUAL,
+    ALPHA_FUNC_ALWAYS,
+};
+
+enum PshShadowDepthFunc {
+    SHADOW_DEPTH_FUNC_NEVER,
+    SHADOW_DEPTH_FUNC_LESS,
+    SHADOW_DEPTH_FUNC_EQUAL,
+    SHADOW_DEPTH_FUNC_LEQUAL,
+    SHADOW_DEPTH_FUNC_GREATER,
+    SHADOW_DEPTH_FUNC_NOTEQUAL,
+    SHADOW_DEPTH_FUNC_GEQUAL,
+    SHADOW_DEPTH_FUNC_ALWAYS,
+};
+
+enum ConvolutionFilter {
+    CONVOLUTION_FILTER_DISABLED,
+    CONVOLUTION_FILTER_QUINCUNX,
+    CONVOLUTION_FILTER_GAUSSIAN,
+};
+
+typedef struct PshState {
+    bool vulkan;
+
+    /* fragment shader - register combiner stuff */
+    uint32_t combiner_control;
+    uint32_t shader_stage_program;
+    uint32_t other_stage_input;
+    uint32_t final_inputs_0;
+    uint32_t final_inputs_1;
+
+    uint32_t rgb_inputs[8], rgb_outputs[8];
+    uint32_t alpha_inputs[8], alpha_outputs[8];
+
+    bool point_sprite;
+    bool rect_tex[4];
+    bool snorm_tex[4];
+    bool compare_mode[4][4];
+    bool alphakill[4];
+    enum ConvolutionFilter conv_tex[4];
+    bool tex_x8y24[4];
+    int dim_tex[4];
+
+    float border_logical_size[4][3];
+    float border_inv_real_size[4][3];
+
+    bool shadow_map[4];
+    enum PshShadowDepthFunc shadow_depth_func;
+
+    bool alpha_test;
+    enum PshAlphaFunc alpha_func;
+
+    bool window_clip_exclusive;
+
+    bool smooth_shading;
+    bool depth_clipping;
+    bool z_perspective;
+} PshState;
+
+#endif
--- a/hw/xbox/nv2a/pgraph/psh_regs.h
+++ b/hw/xbox/nv2a/pgraph/psh_regs.h
@ -1,190 +0,0 @@
-/*
- * QEMU Geforce NV2A pixel shader translation
- *
- * Copyright (c) 2013 espes
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef HW_NV2A_PSH_H
-#define HW_NV2A_PSH_H
-
-#include <stdint.h>
-#include <stdbool.h>
-
-/*
- * For some background, see the OpenGL extension:
- * https://www.opengl.org/registry/specs/NV/register_combiners.txt
- */
-
-enum PS_TEXTUREMODES
-{                                 // valid in stage 0 1 2 3
-    PS_TEXTUREMODES_NONE=                 0x00L, // * * * *
-    PS_TEXTUREMODES_PROJECT2D=            0x01L, // * * * *
-    PS_TEXTUREMODES_PROJECT3D=            0x02L, // * * * *
-    PS_TEXTUREMODES_CUBEMAP=              0x03L, // * * * *
-    PS_TEXTUREMODES_PASSTHRU=             0x04L, // * * * *
-    PS_TEXTUREMODES_CLIPPLANE=            0x05L, // * * * *
-    PS_TEXTUREMODES_BUMPENVMAP=           0x06L, // - * * *
-    PS_TEXTUREMODES_BUMPENVMAP_LUM=       0x07L, // - * * *
-    PS_TEXTUREMODES_BRDF=                 0x08L, // - - * *
-    PS_TEXTUREMODES_DOT_ST=               0x09L, // - - * *
-    PS_TEXTUREMODES_DOT_ZW=               0x0aL, // - - * *
-    PS_TEXTUREMODES_DOT_RFLCT_DIFF=       0x0bL, // - - * -
-    PS_TEXTUREMODES_DOT_RFLCT_SPEC=       0x0cL, // - - - *
-    PS_TEXTUREMODES_DOT_STR_3D=           0x0dL, // - - - *
-    PS_TEXTUREMODES_DOT_STR_CUBE=         0x0eL, // - - - *
-    PS_TEXTUREMODES_DPNDNT_AR=            0x0fL, // - * * *
-    PS_TEXTUREMODES_DPNDNT_GB=            0x10L, // - * * *
-    PS_TEXTUREMODES_DOTPRODUCT=           0x11L, // - * * -
-    PS_TEXTUREMODES_DOT_RFLCT_SPEC_CONST= 0x12L, // - - - *
-    // 0x13-0x1f reserved
-};
-
-enum PS_INPUTMAPPING
-{
-    PS_INPUTMAPPING_UNSIGNED_IDENTITY= 0x00L, // max(0,x)         OK for final combiner
-    PS_INPUTMAPPING_UNSIGNED_INVERT=   0x20L, // 1 - max(0,x)     OK for final combiner
-    PS_INPUTMAPPING_EXPAND_NORMAL=     0x40L, // 2*max(0,x) - 1   invalid for final combiner
-    PS_INPUTMAPPING_EXPAND_NEGATE=     0x60L, // 1 - 2*max(0,x)   invalid for final combiner
-    PS_INPUTMAPPING_HALFBIAS_NORMAL=   0x80L, // max(0,x) - 1/2   invalid for final combiner
-    PS_INPUTMAPPING_HALFBIAS_NEGATE=   0xa0L, // 1/2 - max(0,x)   invalid for final combiner
-    PS_INPUTMAPPING_SIGNED_IDENTITY=   0xc0L, // x                invalid for final combiner
-    PS_INPUTMAPPING_SIGNED_NEGATE=     0xe0L, // -x               invalid for final combiner
-};
-
-enum PS_REGISTER
-{
-    PS_REGISTER_ZERO=              0x00L, // r
-    PS_REGISTER_DISCARD=           0x00L, // w
-    PS_REGISTER_C0=                0x01L, // r
-    PS_REGISTER_C1=                0x02L, // r
-    PS_REGISTER_FOG=               0x03L, // r
-    PS_REGISTER_V0=                0x04L, // r/w
-    PS_REGISTER_V1=                0x05L, // r/w
-    PS_REGISTER_T0=                0x08L, // r/w
-    PS_REGISTER_T1=                0x09L, // r/w
-    PS_REGISTER_T2=                0x0aL, // r/w
-    PS_REGISTER_T3=                0x0bL, // r/w
-    PS_REGISTER_R0=                0x0cL, // r/w
-    PS_REGISTER_R1=                0x0dL, // r/w
-    PS_REGISTER_V1R0_SUM=          0x0eL, // r
-    PS_REGISTER_EF_PROD=           0x0fL, // r
-
-    PS_REGISTER_ONE=               PS_REGISTER_ZERO | PS_INPUTMAPPING_UNSIGNED_INVERT, // OK for final combiner
-    PS_REGISTER_NEGATIVE_ONE=      PS_REGISTER_ZERO | PS_INPUTMAPPING_EXPAND_NORMAL,   // invalid for final combiner
-    PS_REGISTER_ONE_HALF=          PS_REGISTER_ZERO | PS_INPUTMAPPING_HALFBIAS_NEGATE, // invalid for final combiner
-    PS_REGISTER_NEGATIVE_ONE_HALF= PS_REGISTER_ZERO | PS_INPUTMAPPING_HALFBIAS_NORMAL, // invalid for final combiner
-};
-
-enum PS_COMBINERCOUNTFLAGS
-{
-    PS_COMBINERCOUNT_MUX_LSB=     0x0000L, // mux on r0.a lsb
-    PS_COMBINERCOUNT_MUX_MSB=     0x0001L, // mux on r0.a msb
-
-    PS_COMBINERCOUNT_SAME_C0=     0x0000L, // c0 same in each stage
-    PS_COMBINERCOUNT_UNIQUE_C0=   0x0010L, // c0 unique in each stage
-
-    PS_COMBINERCOUNT_SAME_C1=     0x0000L, // c1 same in each stage
-    PS_COMBINERCOUNT_UNIQUE_C1=   0x0100L  // c1 unique in each stage
-};
-
-enum PS_COMBINEROUTPUT
-{
-    PS_COMBINEROUTPUT_IDENTITY=            0x00L, // y = x
-    PS_COMBINEROUTPUT_BIAS=                0x08L, // y = x - 0.5
-    PS_COMBINEROUTPUT_SHIFTLEFT_1=         0x10L, // y = x*2
-    PS_COMBINEROUTPUT_SHIFTLEFT_1_BIAS=    0x18L, // y = (x - 0.5)*2
-    PS_COMBINEROUTPUT_SHIFTLEFT_2=         0x20L, // y = x*4
-    PS_COMBINEROUTPUT_SHIFTRIGHT_1=        0x30L, // y = x/2
-
-    PS_COMBINEROUTPUT_AB_BLUE_TO_ALPHA=    0x80L, // RGB only
-
-    PS_COMBINEROUTPUT_CD_BLUE_TO_ALPHA=    0x40L, // RGB only
-
-    PS_COMBINEROUTPUT_AB_MULTIPLY=         0x00L,
-    PS_COMBINEROUTPUT_AB_DOT_PRODUCT=      0x02L, // RGB only
-
-    PS_COMBINEROUTPUT_CD_MULTIPLY=         0x00L,
-    PS_COMBINEROUTPUT_CD_DOT_PRODUCT=      0x01L, // RGB only
-
-    PS_COMBINEROUTPUT_AB_CD_SUM=           0x00L, // 3rd output is AB+CD
-    PS_COMBINEROUTPUT_AB_CD_MUX=           0x04L, // 3rd output is MUX(AB,CD) based on R0.a
-};
-
-enum PS_CHANNEL
-{
-    PS_CHANNEL_RGB=   0x00, // used as RGB source
-    PS_CHANNEL_BLUE=  0x00, // used as ALPHA source
-    PS_CHANNEL_ALPHA= 0x10, // used as RGB or ALPHA source
-};
-
-
-enum PS_FINALCOMBINERSETTING
-{
-    PS_FINALCOMBINERSETTING_CLAMP_SUM=     0x80, // V1+R0 sum clamped to [0,1]
-
-    PS_FINALCOMBINERSETTING_COMPLEMENT_V1= 0x40, // unsigned invert mapping
-
-    PS_FINALCOMBINERSETTING_COMPLEMENT_R0= 0x20, // unsigned invert mapping
-};
-
-enum PS_DOTMAPPING
-{                              // valid in stage 0 1 2 3
-    PS_DOTMAPPING_ZERO_TO_ONE=         0x00L, // - * * *
-    PS_DOTMAPPING_MINUS1_TO_1_D3D=     0x01L, // - * * *
-    PS_DOTMAPPING_MINUS1_TO_1_GL=      0x02L, // - * * *
-    PS_DOTMAPPING_MINUS1_TO_1=         0x03L, // - * * *
-    PS_DOTMAPPING_HILO_1=              0x04L, // - * * *
-    PS_DOTMAPPING_HILO_HEMISPHERE_D3D= 0x05L, // - * * *
-    PS_DOTMAPPING_HILO_HEMISPHERE_GL=  0x06L, // - * * *
-    PS_DOTMAPPING_HILO_HEMISPHERE=     0x07L, // - * * *
-};
-
-enum PS_COLORKEYMODE {
-    COLOR_KEY_NONE = 0,
-    COLOR_KEY_KILL_ALPHA = 1,
-    COLOR_KEY_KILL_COLOR_AND_ALPHA = 2,
-    COLOR_KEY_DISCARD = 3,
-};
-
-enum PshAlphaFunc {
-    ALPHA_FUNC_NEVER,
-    ALPHA_FUNC_LESS,
-    ALPHA_FUNC_EQUAL,
-    ALPHA_FUNC_LEQUAL,
-    ALPHA_FUNC_GREATER,
-    ALPHA_FUNC_NOTEQUAL,
-    ALPHA_FUNC_GEQUAL,
-    ALPHA_FUNC_ALWAYS,
-};
-
-enum PshShadowDepthFunc {
-    SHADOW_DEPTH_FUNC_NEVER,
-    SHADOW_DEPTH_FUNC_LESS,
-    SHADOW_DEPTH_FUNC_EQUAL,
-    SHADOW_DEPTH_FUNC_LEQUAL,
-    SHADOW_DEPTH_FUNC_GREATER,
-    SHADOW_DEPTH_FUNC_NOTEQUAL,
-    SHADOW_DEPTH_FUNC_GEQUAL,
-    SHADOW_DEPTH_FUNC_ALWAYS,
-};
-
-enum ConvolutionFilter {
-    CONVOLUTION_FILTER_DISABLED,
-    CONVOLUTION_FILTER_QUINCUNX,
-    CONVOLUTION_FILTER_GAUSSIAN,
-};
-
-#endif
--- a/hw/xbox/nv2a/pgraph/shaders.c
+++ b/hw/xbox/nv2a/pgraph/shaders.c
@ -0,0 +1,302 @@
+/*
+ * Geforce NV2A PGRAPH OpenGL Renderer
+ *
+ * Copyright (c) 2015 espes
+ * Copyright (c) 2015 Jannik Vogel
+ * Copyright (c) 2020-2024 Matt Borgerson
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "hw/xbox/nv2a/debug.h"
+#include "texture.h"
+#include "pgraph.h"
+#include "shaders.h"
+
+/*
+ * Build a ShaderState snapshot from the current PGRAPH registers and state.
+ *
+ * The returned struct is zero-filled before any field is written because it
+ * is hashed afterwards. Fields that are only assigned under a condition (the
+ * fixed-function block, the vertex program tokens, point params, lights, the
+ * per-texture-stage data of disabled stages) therefore stay zero when that
+ * condition does not hold.
+ *
+ * Side effect: clears pg->program_data_dirty.
+ */
+ShaderState pgraph_get_shader_state(PGRAPHState *pg)
+{
+    bool vertex_program = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_D),
+                                   NV_PGRAPH_CSV0_D_MODE) == 2;
+
+    bool fixed_function = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_D),
+                                   NV_PGRAPH_CSV0_D_MODE) == 0;
+
+    int program_start = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_C),
+                                 NV_PGRAPH_CSV0_C_CHEOPS_PROGRAM_START);
+
+    pg->program_data_dirty = false;
+
+    ShaderState state;
+
+    // We will hash it, so make sure any padding is zeroed
+    memset(&state, 0, sizeof(ShaderState));
+
+    state.surface_scale_factor = pg->surface_scale_factor;
+
+    state.compressed_attrs = pg->compressed_attrs;
+    state.uniform_attrs = pg->uniform_attrs;
+    state.swizzle_attrs = pg->swizzle_attrs;
+
+    /* register combiner stuff */
+    state.psh.window_clip_exclusive =
+        pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER) & NV_PGRAPH_SETUPRASTER_WINDOWCLIPTYPE;
+    state.psh.combiner_control = pgraph_reg_r(pg, NV_PGRAPH_COMBINECTL);
+    state.psh.shader_stage_program = pgraph_reg_r(pg, NV_PGRAPH_SHADERPROG);
+    state.psh.other_stage_input = pgraph_reg_r(pg, NV_PGRAPH_SHADERCTL);
+    state.psh.final_inputs_0 = pgraph_reg_r(pg, NV_PGRAPH_COMBINESPECFOG0);
+    state.psh.final_inputs_1 = pgraph_reg_r(pg, NV_PGRAPH_COMBINESPECFOG1);
+
+    state.psh.alpha_test =
+        pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0) & NV_PGRAPH_CONTROL_0_ALPHATESTENABLE;
+    state.psh.alpha_func = (enum PshAlphaFunc)GET_MASK(
+        pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0), NV_PGRAPH_CONTROL_0_ALPHAFUNC);
+
+    state.psh.point_sprite = pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER) &
+                             NV_PGRAPH_SETUPRASTER_POINTSMOOTHENABLE;
+
+    state.psh.shadow_depth_func = (enum PshShadowDepthFunc)GET_MASK(
+        pgraph_reg_r(pg, NV_PGRAPH_SHADOWCTL), NV_PGRAPH_SHADOWCTL_SHADOW_ZFUNC);
+
+    state.fixed_function = fixed_function;
+    state.specular_enable = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_C),
+                                     NV_PGRAPH_CSV0_C_SPECULAR_ENABLE);
+
+    /* fixed function stuff */
+    if (fixed_function) {
+        state.skinning = (enum VshSkinning)GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_D),
+                                                    NV_PGRAPH_CSV0_D_SKIN);
+        state.lighting =
+            GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_C), NV_PGRAPH_CSV0_C_LIGHTING);
+        state.normalization =
+            pgraph_reg_r(pg, NV_PGRAPH_CSV0_C) & NV_PGRAPH_CSV0_C_NORMALIZATION_ENABLE;
+
+        /* color material */
+        state.emission_src = (enum MaterialColorSource)GET_MASK(
+            pgraph_reg_r(pg, NV_PGRAPH_CSV0_C), NV_PGRAPH_CSV0_C_EMISSION);
+        state.ambient_src = (enum MaterialColorSource)GET_MASK(
+            pgraph_reg_r(pg, NV_PGRAPH_CSV0_C), NV_PGRAPH_CSV0_C_AMBIENT);
+        state.diffuse_src = (enum MaterialColorSource)GET_MASK(
+            pgraph_reg_r(pg, NV_PGRAPH_CSV0_C), NV_PGRAPH_CSV0_C_DIFFUSE);
+        state.specular_src = (enum MaterialColorSource)GET_MASK(
+            pgraph_reg_r(pg, NV_PGRAPH_CSV0_C), NV_PGRAPH_CSV0_C_SPECULAR);
+    }
+
+    /* vertex program stuff */
+    state.vertex_program = vertex_program;
+    state.z_perspective = pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0) &
+                          NV_PGRAPH_CONTROL_0_Z_PERSPECTIVE_ENABLE;
+    state.psh.z_perspective = state.z_perspective;
+
+    state.point_params_enable = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_D),
+                                         NV_PGRAPH_CSV0_D_POINTPARAMSENABLE);
+    /* The register field is divided by 8 to obtain the size stored here. */
+    state.point_size =
+        GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_POINTSIZE), NV097_SET_POINT_SIZE_V) / 8.0f;
+    if (state.point_params_enable) {
+        for (int i = 0; i < 8; i++) {
+            state.point_params[i] = pg->point_params[i];
+        }
+    }
+
+    /* geometry shader stuff */
+    state.primitive_mode = (enum ShaderPrimitiveMode)pg->primitive_mode;
+    state.polygon_front_mode = (enum ShaderPolygonMode)GET_MASK(
+        pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER), NV_PGRAPH_SETUPRASTER_FRONTFACEMODE);
+    state.polygon_back_mode = (enum ShaderPolygonMode)GET_MASK(
+        pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER), NV_PGRAPH_SETUPRASTER_BACKFACEMODE);
+
+    state.smooth_shading = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CONTROL_3),
+                                    NV_PGRAPH_CONTROL_3_SHADEMODE) ==
+                           NV_PGRAPH_CONTROL_3_SHADEMODE_SMOOTH;
+    state.psh.smooth_shading = state.smooth_shading;
+
+    state.psh.depth_clipping = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_ZCOMPRESSOCCLUDE),
+                                        NV_PGRAPH_ZCOMPRESSOCCLUDE_ZCLAMP_EN) ==
+                               NV_PGRAPH_ZCOMPRESSOCCLUDE_ZCLAMP_EN_CULL;
+
+    state.program_length = 0;
+
+    if (vertex_program) {
+        // copy in vertex program tokens, starting at program_start and
+        // stopping after the first token flagged FINAL (or at the end of
+        // program memory if no such token is found)
+        for (int i = program_start; i < NV2A_MAX_TRANSFORM_PROGRAM_LENGTH;
+             i++) {
+            uint32_t *cur_token = (uint32_t *)&pg->program_data[i];
+            memcpy(&state.program_data[state.program_length], cur_token,
+                   VSH_TOKEN_SIZE * sizeof(uint32_t));
+            state.program_length++;
+
+            if (vsh_get_field(cur_token, FLD_FINAL)) {
+                break;
+            }
+        }
+    }
+
+    /* Texgen: stages 0-1 are read from CSV1_A, stages 2-3 from CSV1_B. The
+     * T0 masks are used for even stages and the T1 masks for odd stages.
+     */
+    for (int i = 0; i < 4; i++) {
+        unsigned int reg = (i < 2) ? NV_PGRAPH_CSV1_A : NV_PGRAPH_CSV1_B;
+        /* The mask table only depends on the stage, so build it once here. */
+        const unsigned int masks[] = {
+            (i % 2) ? NV_PGRAPH_CSV1_A_T1_S : NV_PGRAPH_CSV1_A_T0_S,
+            (i % 2) ? NV_PGRAPH_CSV1_A_T1_T : NV_PGRAPH_CSV1_A_T0_T,
+            (i % 2) ? NV_PGRAPH_CSV1_A_T1_R : NV_PGRAPH_CSV1_A_T0_R,
+            (i % 2) ? NV_PGRAPH_CSV1_A_T1_Q : NV_PGRAPH_CSV1_A_T0_Q
+        };
+        for (int j = 0; j < 4; j++) {
+            state.texgen[i][j] =
+                (enum VshTexgen)GET_MASK(pgraph_reg_r(pg, reg), masks[j]);
+        }
+    }
+
+    /* Fog */
+    state.fog_enable =
+        pgraph_reg_r(pg, NV_PGRAPH_CONTROL_3) & NV_PGRAPH_CONTROL_3_FOGENABLE;
+    if (state.fog_enable) {
+        /* FIXME: Use CSV0_D? */
+        state.fog_mode = (enum VshFogMode)GET_MASK(
+            pgraph_reg_r(pg, NV_PGRAPH_CONTROL_3), NV_PGRAPH_CONTROL_3_FOG_MODE);
+        state.foggen = (enum VshFoggen)GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_D),
+                                                NV_PGRAPH_CSV0_D_FOGGENMODE);
+    } else {
+        /* FIXME: Do we still pass the fogmode? */
+        state.fog_mode = (enum VshFogMode)0;
+        state.foggen = (enum VshFoggen)0;
+    }
+
+    /* Texture matrices */
+    for (int i = 0; i < 4; i++) {
+        state.texture_matrix_enable[i] = pg->texture_matrix_enable[i];
+    }
+
+    /* Lighting: each light occupies a two-bit field in CSV0_D, starting at
+     * the LIGHT0 mask.
+     */
+    if (state.lighting) {
+        for (int i = 0; i < NV2A_MAX_LIGHTS; i++) {
+            state.light[i] = (enum VshLight)GET_MASK(
+                pgraph_reg_r(pg, NV_PGRAPH_CSV0_D), NV_PGRAPH_CSV0_D_LIGHT0 << (i * 2));
+        }
+    }
+
+    /* Copy content of enabled combiner stages.
+     *
+     * The stage count is an eight-bit register field and can therefore be as
+     * large as 255, so bound it by the capacity of the destination array
+     * before it is used as an index. state.psh.combiner_control above still
+     * carries the unmodified register value.
+     *
+     * NOTE(review): assumes rgb_outputs, alpha_inputs and alpha_outputs are
+     * at least as long as rgb_inputs - confirm against PshState.
+     */
+    int num_stages = pgraph_reg_r(pg, NV_PGRAPH_COMBINECTL) & 0xFF;
+    const int max_stages = (int)(sizeof(state.psh.rgb_inputs) /
+                                 sizeof(state.psh.rgb_inputs[0]));
+    if (num_stages > max_stages) {
+        num_stages = max_stages;
+    }
+    for (int i = 0; i < num_stages; i++) {
+        state.psh.rgb_inputs[i] = pgraph_reg_r(pg, NV_PGRAPH_COMBINECOLORI0 + i * 4);
+        state.psh.rgb_outputs[i] = pgraph_reg_r(pg, NV_PGRAPH_COMBINECOLORO0 + i * 4);
+        state.psh.alpha_inputs[i] = pgraph_reg_r(pg, NV_PGRAPH_COMBINEALPHAI0 + i * 4);
+        state.psh.alpha_outputs[i] = pgraph_reg_r(pg, NV_PGRAPH_COMBINEALPHAO0 + i * 4);
+        // constant_0[i] = pgraph_reg_r(pg, NV_PGRAPH_COMBINEFACTOR0 + i * 4);
+        // constant_1[i] = pgraph_reg_r(pg, NV_PGRAPH_COMBINEFACTOR1 + i * 4);
+    }
+
+    /* Per texture stage state */
+    for (int i = 0; i < 4; i++) {
+        /* Compare mode bits are recorded even for disabled stages. */
+        for (int j = 0; j < 4; j++) {
+            state.psh.compare_mode[i][j] =
+                (pgraph_reg_r(pg, NV_PGRAPH_SHADERCLIPMODE) >> (4 * i + j)) & 1;
+        }
+
+        uint32_t ctl_0 = pgraph_reg_r(pg, NV_PGRAPH_TEXCTL0_0 + i * 4);
+        bool enabled = pgraph_is_texture_stage_active(pg, i) &&
+                       (ctl_0 & NV_PGRAPH_TEXCTL0_0_ENABLE);
+        if (!enabled) {
+            /* Everything below stays zero for this stage. */
+            continue;
+        }
+
+        state.psh.alphakill[i] = ctl_0 & NV_PGRAPH_TEXCTL0_0_ALPHAKILLEN;
+
+        uint32_t tex_fmt = pgraph_reg_r(pg, NV_PGRAPH_TEXFMT0 + i * 4);
+        state.psh.dim_tex[i] = GET_MASK(tex_fmt, NV_PGRAPH_TEXFMT0_DIMENSIONALITY);
+
+        unsigned int color_format = GET_MASK(tex_fmt, NV_PGRAPH_TEXFMT0_COLOR);
+        BasicColorFormatInfo f = kelvin_color_format_info_map[color_format];
+        state.psh.rect_tex[i] = f.linear;
+        state.psh.tex_x8y24[i] = color_format == NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_DEPTH_X8_Y24_FIXED ||
+                                color_format == NV097_SET_TEXTURE_FORMAT_COLOR_LU_IMAGE_DEPTH_X8_Y24_FLOAT;
+
+        uint32_t border_source =
+            GET_MASK(tex_fmt, NV_PGRAPH_TEXFMT0_BORDER_SOURCE);
+        bool cubemap = GET_MASK(tex_fmt, NV_PGRAPH_TEXFMT0_CUBEMAPENABLE);
+        state.psh.border_logical_size[i][0] = 0.0f;
+        state.psh.border_logical_size[i][1] = 0.0f;
+        state.psh.border_logical_size[i][2] = 0.0f;
+        if (border_source != NV_PGRAPH_TEXFMT0_BORDER_SOURCE_COLOR) {
+            if (!f.linear && !cubemap) {
+                // The actual texture will be (at least) double the reported
+                // size and shifted by a 4 texel border but texture coordinates
+                // will still be relative to the reported size.
+                unsigned int reported_width =
+                    1 << GET_MASK(tex_fmt, NV_PGRAPH_TEXFMT0_BASE_SIZE_U);
+                unsigned int reported_height =
+                    1 << GET_MASK(tex_fmt, NV_PGRAPH_TEXFMT0_BASE_SIZE_V);
+                unsigned int reported_depth =
+                    1 << GET_MASK(tex_fmt, NV_PGRAPH_TEXFMT0_BASE_SIZE_P);
+
+                state.psh.border_logical_size[i][0] = reported_width;
+                state.psh.border_logical_size[i][1] = reported_height;
+                state.psh.border_logical_size[i][2] = reported_depth;
+
+                // Dimensions below 8 use a fixed 1/16 factor; larger ones
+                // use the reciprocal of twice the reported size.
+                if (reported_width < 8) {
+                    state.psh.border_inv_real_size[i][0] = 0.0625f;
+                } else {
+                    state.psh.border_inv_real_size[i][0] =
+                        1.0f / (reported_width * 2.0f);
+                }
+                if (reported_height < 8) {
+                    state.psh.border_inv_real_size[i][1] = 0.0625f;
+                } else {
+                    state.psh.border_inv_real_size[i][1] =
+                        1.0f / (reported_height * 2.0f);
+                }
+                if (reported_depth < 8) {
+                    state.psh.border_inv_real_size[i][2] = 0.0625f;
+                } else {
+                    state.psh.border_inv_real_size[i][2] =
+                        1.0f / (reported_depth * 2.0f);
+                }
+            } else {
+                NV2A_UNIMPLEMENTED(
+                    "Border source texture with linear %d cubemap %d", f.linear,
+                    cubemap);
+            }
+        }
+
+        /* Keep track of whether texture data has been loaded as signed
+         * normalized integers or not. This dictates whether or not we will need
+         * to re-map in fragment shader for certain texture modes (e.g.
+         * bumpenvmap).
+         *
+         * FIXME: When signed texture data is loaded as unsigned and remapped in
+         * fragment shader, there may be interpolation artifacts. Fix this to
+         * support signed textures more appropriately.
+         */
+#if 0 // FIXME
+        state.psh.snorm_tex[i] = (f.gl_internal_format == GL_RGB8_SNORM)
+                                 || (f.gl_internal_format == GL_RG8_SNORM);
+#endif
+        state.psh.shadow_map[i] = f.depth;
+
+        uint32_t filter = pgraph_reg_r(pg, NV_PGRAPH_TEXFILTER0 + i * 4);
+        unsigned int min_filter = GET_MASK(filter, NV_PGRAPH_TEXFILTER0_MIN);
+        enum ConvolutionFilter kernel = CONVOLUTION_FILTER_DISABLED;
+        /* FIXME: We do not distinguish between min and mag when
+         * performing convolution. Just use it if specified for min (common AA
+         * case).
+         */
+        if (min_filter == NV_PGRAPH_TEXFILTER0_MIN_CONVOLUTION_2D_LOD0) {
+            int k = GET_MASK(filter, NV_PGRAPH_TEXFILTER0_CONVOLUTION_KERNEL);
+            assert(k == NV_PGRAPH_TEXFILTER0_CONVOLUTION_KERNEL_QUINCUNX ||
+                   k == NV_PGRAPH_TEXFILTER0_CONVOLUTION_KERNEL_GAUSSIAN_3);
+            /* The register value is stored directly as the enum value. */
+            kernel = (enum ConvolutionFilter)k;
+        }
+
+        state.psh.conv_tex[i] = kernel;
+    }
+
+    return state;
+}
--- a/hw/xbox/nv2a/pgraph/shaders.h
+++ b/hw/xbox/nv2a/pgraph/shaders.h
@ -0,0 +1,110 @@
+/*
+ * QEMU Geforce NV2A shader generator
+ *
+ * Copyright (c) 2015 espes
+ * Copyright (c) 2015 Jannik Vogel
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_XBOX_NV2A_PGRAPH_SHADERS_H
+#define HW_XBOX_NV2A_PGRAPH_SHADERS_H
+
+#include <stdint.h>
+#include "hw/xbox/nv2a/nv2a_regs.h"
+
+#include "vsh.h"
+#include "psh.h"
+
+/*
+ * Primitive type of a draw.
+ *
+ * pgraph_get_shader_state() stores pg->primitive_mode in this type with a
+ * plain cast, so the numeric values of the enumerators must keep matching
+ * the values held in that field. Do not reorder.
+ */
+enum ShaderPrimitiveMode {
+    PRIM_TYPE_INVALID,
+    PRIM_TYPE_POINTS,
+    PRIM_TYPE_LINES,
+    PRIM_TYPE_LINE_LOOP,
+    PRIM_TYPE_LINE_STRIP,
+    PRIM_TYPE_TRIANGLES,
+    PRIM_TYPE_TRIANGLE_STRIP,
+    PRIM_TYPE_TRIANGLE_FAN,
+    PRIM_TYPE_QUADS,
+    PRIM_TYPE_QUAD_STRIP,
+    PRIM_TYPE_POLYGON,
+};
+
+/*
+ * Polygon rasterization mode for front or back faces.
+ *
+ * pgraph_get_shader_state() produces these values by casting the
+ * FRONTFACEMODE / BACKFACEMODE fields of NV_PGRAPH_SETUPRASTER, so the
+ * enumerator values must match the register encoding. Do not reorder.
+ */
+enum ShaderPolygonMode {
+    POLY_MODE_FILL,
+    POLY_MODE_POINT,
+    POLY_MODE_LINE,
+};
+
+/*
+ * Source selected for one of the material colour terms (emission, ambient,
+ * diffuse, specular).
+ *
+ * pgraph_get_shader_state() produces these values by casting the
+ * EMISSION / AMBIENT / DIFFUSE / SPECULAR fields of NV_PGRAPH_CSV0_C, so the
+ * enumerator values must match the register encoding. Do not reorder.
+ */
+enum MaterialColorSource {
+    MATERIAL_COLOR_SRC_MATERIAL,
+    MATERIAL_COLOR_SRC_DIFFUSE,
+    MATERIAL_COLOR_SRC_SPECULAR,
+};
+
+/*
+ * Snapshot of PGRAPH state produced by pgraph_get_shader_state().
+ *
+ * The producer zero-fills the whole struct (padding included) before writing
+ * fields because the struct is hashed. Any field that is only assigned under
+ * a condition is therefore zero when the condition does not hold.
+ */
+typedef struct ShaderState {
+    /* Not written by pgraph_get_shader_state(), which leaves both zeroed.
+     * NOTE(review): presumably filled in by the renderer afterwards - confirm.
+     */
+    bool vulkan;
+    bool use_push_constants_for_uniform_attrs;
+    /* Copied from pg->surface_scale_factor. */
+    unsigned int surface_scale_factor;
+
+    /* Pixel shader / register combiner state. */
+    PshState psh;
+    /* Copied from the same-named fields of PGRAPHState.
+     * NOTE(review): these look like per-vertex-attribute bitmasks - confirm.
+     */
+    uint16_t compressed_attrs;
+    uint16_t uniform_attrs;
+    uint16_t swizzle_attrs;
+
+    /* Indexed by texture stage. */
+    bool texture_matrix_enable[4];
+    /* Indexed [texture stage][coordinate], coordinates in S, T, R, Q order. */
+    enum VshTexgen texgen[4][4];
+
+    /* foggen and fog_mode are forced to 0 when fog_enable is false. */
+    bool fog_enable;
+    enum VshFoggen foggen;
+    enum VshFogMode fog_mode;
+
+    /* Only populated when fixed_function is true. */
+    enum VshSkinning skinning;
+
+    /* Only populated when fixed_function is true. */
+    bool normalization;
+
+    /* Colour material sources; only populated when fixed_function is true. */
+    enum MaterialColorSource emission_src;
+    enum MaterialColorSource ambient_src;
+    enum MaterialColorSource diffuse_src;
+    enum MaterialColorSource specular_src;
+
+    /* lighting is only populated when fixed_function is true; light[] is
+     * only populated when lighting is non-zero.
+     */
+    bool lighting;
+    enum VshLight light[NV2A_MAX_LIGHTS];
+
+    bool fixed_function;
+    bool specular_enable;
+
+    /* vertex program */
+    bool vertex_program;
+    /* Tokens are copied from the program start slot up to and including the
+     * first token flagged FINAL; program_length counts the copied tokens.
+     * Both stay zero when vertex_program is false.
+     */
+    uint32_t program_data[NV2A_MAX_TRANSFORM_PROGRAM_LENGTH][VSH_TOKEN_SIZE];
+    int program_length;
+    bool z_perspective;
+
+    /* primitive format for geometry shader */
+    enum ShaderPolygonMode polygon_front_mode;
+    enum ShaderPolygonMode polygon_back_mode;
+    enum ShaderPrimitiveMode primitive_mode;
+
+    bool point_params_enable;
+    /* Point size register field divided by 8. */
+    float point_size;
+    /* Only copied from PGRAPHState when point_params_enable is set. */
+    float point_params[8];
+
+    bool smooth_shading;
+} ShaderState;
+
+/* Forward declaration so this header does not need the PGRAPH definition. */
+typedef struct PGRAPHState PGRAPHState;
+
+/*
+ * Build and return, by value, a ShaderState snapshot of the current PGRAPH
+ * registers and state. Clears pg->program_data_dirty as a side effect.
+ */
+ShaderState pgraph_get_shader_state(PGRAPHState *pg);
+
+#endif
--- a/hw/xbox/nv2a/pgraph/vk/buffer.c
+++ b/hw/xbox/nv2a/pgraph/vk/buffer.c
@ -18,6 +18,7 @@
 */

 #include "renderer.h"
+#include <vulkan/vulkan_core.h>

 static void create_buffer(PGRAPHState *pg, StorageBuffer *buffer)
 {
--- a/hw/xbox/nv2a/pgraph/vk/constants.h
+++ b/hw/xbox/nv2a/pgraph/vk/constants.h
@ -21,7 +21,7 @@
 #define HW_XBOX_NV2A_PGRAPH_VK_CONSTANTS_H

 #include "hw/xbox/nv2a/nv2a_regs.h"
-#include "hw/xbox/nv2a/pgraph/vsh_regs.h"
+#include "hw/xbox/nv2a/pgraph/shaders.h"
 #include <vulkan/vulkan.h>

 static const VkFilter pgraph_texture_min_filter_vk_map[] = {
--- a/hw/xbox/nv2a/pgraph/vk/debug.c
+++ b/hw/xbox/nv2a/pgraph/vk/debug.c
@ -25,8 +25,6 @@
 #endif

 #ifdef CONFIG_RENDERDOC
-#include "trace/control.h"
-
 #pragma GCC diagnostic ignored "-Wstrict-prototypes"
 #include "thirdparty/renderdoc_app.h"
 #endif
@ -48,21 +46,11 @@ void pgraph_vk_debug_frame_terminator(void)

        PGRAPHVkState *r = g_nv2a->pgraph.vk_renderer_state;
        if (rdoc_api->IsTargetControlConnected()) {
-            bool capturing = rdoc_api->IsFrameCapturing();
-            if (capturing && renderdoc_capture_frames == 0) {
+            if (rdoc_api->IsFrameCapturing()) {
                rdoc_api->EndFrameCapture(RENDERDOC_DEVICEPOINTER_FROM_VKINSTANCE(r->instance), 0);
-                if (renderdoc_trace_frames) {
-                    trace_enable_events("-nv2a_pgraph_*");
-                    renderdoc_trace_frames = false;
-                }
            }
            if (renderdoc_capture_frames > 0) {
-                if (!capturing) {
-                    if (renderdoc_trace_frames) {
-                        trace_enable_events("nv2a_pgraph_*");
-                    }
                rdoc_api->StartFrameCapture(RENDERDOC_DEVICEPOINTER_FROM_VKINSTANCE(r->instance), 0);
-                }
                --renderdoc_capture_frames;
            }
        }
--- a/hw/xbox/nv2a/pgraph/vk/draw.c
+++ b/hw/xbox/nv2a/pgraph/vk/draw.c
@ -1,7 +1,7 @@
 /*
 * Geforce NV2A PGRAPH Vulkan Renderer
 *
- * Copyright (c) 2024-2025 Matt Borgerson
+ * Copyright (c) 2024 Matt Borgerson
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
@ -20,7 +20,6 @@
 #include "qemu/osdep.h"
 #include "qemu/fast-hash.h"
 #include "renderer.h"
-#include <math.h>

 void pgraph_vk_draw_begin(NV2AState *d)
 {
@ -51,8 +50,8 @@ static VkPrimitiveTopology get_primitive_topology(PGRAPHState *pg)
 {
    PGRAPHVkState *r = pg->vk_renderer_state;

-    int polygon_mode = r->shader_binding->state.geom.polygon_front_mode;
-    int primitive_mode = r->shader_binding->state.geom.primitive_mode;
+    int polygon_mode = r->shader_binding->state.polygon_front_mode;
+    int primitive_mode = r->shader_binding->state.primitive_mode;

    if (polygon_mode == POLY_MODE_POINT) {
        return VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
@ -93,8 +92,7 @@ static VkPrimitiveTopology get_primitive_topology(PGRAPHState *pg)
    }
 }

-static void pipeline_cache_entry_init(Lru *lru, LruNode *node,
-                                      const void *state)
+static void pipeline_cache_entry_init(Lru *lru, LruNode *node, void *state)
 {
    PipelineBinding *snode = container_of(node, PipelineBinding, node);
    snode->layout = VK_NULL_HANDLE;
@ -118,8 +116,7 @@ static void pipeline_cache_entry_post_evict(Lru *lru, LruNode *node)
    snode->layout = VK_NULL_HANDLE;
 }

-static bool pipeline_cache_entry_compare(Lru *lru, LruNode *node,
-                                         const void *key)
+static bool pipeline_cache_entry_compare(Lru *lru, LruNode *node, void *key)
 {
    PipelineBinding *snode = container_of(node, PipelineBinding, node);
    return memcmp(&snode->key, key, sizeof(PipelineKey));
@ -748,15 +745,15 @@ static void create_pipeline(PGRAPHState *pg)
        (VkPipelineShaderStageCreateInfo){
            .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
            .stage = VK_SHADER_STAGE_VERTEX_BIT,
-            .module = r->shader_binding->vsh.module_info->module,
+            .module = r->shader_binding->vertex->module,
            .pName = "main",
        };
-    if (r->shader_binding->geom.module_info) {
+    if (r->shader_binding->geometry) {
        shader_stages[num_active_shader_stages++] =
            (VkPipelineShaderStageCreateInfo){
                .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
                .stage = VK_SHADER_STAGE_GEOMETRY_BIT,
-                .module = r->shader_binding->geom.module_info->module,
+                .module = r->shader_binding->geometry->module,
                .pName = "main",
            };
    }
@ -764,7 +761,7 @@ static void create_pipeline(PGRAPHState *pg)
        (VkPipelineShaderStageCreateInfo){
            .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
            .stage = VK_SHADER_STAGE_FRAGMENT_BIT,
-            .module = r->shader_binding->psh.module_info->module,
+            .module = r->shader_binding->fragment->module,
            .pName = "main",
        };

@ -818,7 +815,7 @@ static void create_pipeline(PGRAPHState *pg)
        .depthClampEnable = VK_TRUE,
        .rasterizerDiscardEnable = VK_FALSE,
        .polygonMode = pgraph_polygon_mode_vk_map[r->shader_binding->state
-                                                      .geom.polygon_front_mode],
+                                                      .polygon_front_mode],
        .lineWidth = 1.0f,
        .frontFace = (pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER) &
                      NV_PGRAPH_SETUPRASTER_FRONTFACE) ?
@ -948,23 +945,12 @@ static void create_pipeline(PGRAPHState *pg)
        .blendConstants[3] = blend_constant[3],
    };

-    VkDynamicState dynamic_states[3] = { VK_DYNAMIC_STATE_VIEWPORT,
+    VkDynamicState dynamic_states[2] = { VK_DYNAMIC_STATE_VIEWPORT,
                                         VK_DYNAMIC_STATE_SCISSOR };
-    int num_dynamic_states = 2;
-
-    snode->has_dynamic_line_width =
-        (r->enabled_physical_device_features.wideLines == VK_TRUE) &&
-        (r->shader_binding->state.geom.polygon_front_mode == POLY_MODE_LINE ||
-         r->shader_binding->state.geom.primitive_mode == PRIM_TYPE_LINES ||
-         r->shader_binding->state.geom.primitive_mode == PRIM_TYPE_LINE_LOOP ||
-         r->shader_binding->state.geom.primitive_mode == PRIM_TYPE_LINE_STRIP);
-    if (snode->has_dynamic_line_width) {
-        dynamic_states[num_dynamic_states++] = VK_DYNAMIC_STATE_LINE_WIDTH;
-    }

    VkPipelineDynamicStateCreateInfo dynamic_state = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO,
-        .dynamicStateCount = num_dynamic_states,
+        .dynamicStateCount = ARRAY_SIZE(dynamic_states),
        .pDynamicStates = dynamic_states,
    };

@ -1014,9 +1000,9 @@ static void create_pipeline(PGRAPHState *pg)
    };

    VkPushConstantRange push_constant_range;
-    if (r->use_push_constants_for_uniform_attrs) {
+    if (r->shader_binding->state.use_push_constants_for_uniform_attrs) {
        int num_uniform_attributes =
-            __builtin_popcount(r->shader_binding->state.vsh.uniform_attrs);
+            __builtin_popcount(r->shader_binding->state.uniform_attrs);
        if (num_uniform_attributes) {
            push_constant_range = (VkPushConstantRange){
                .stageFlags = VK_SHADER_STAGE_VERTEX_BIT,
@ -1069,7 +1055,7 @@ static void push_vertex_attr_values(PGRAPHState *pg)
 {
    PGRAPHVkState *r = pg->vk_renderer_state;

-    if (!r->use_push_constants_for_uniform_attrs) {
+    if (!r->shader_binding->state.use_push_constants_for_uniform_attrs) {
        return;
    }

@ -1078,8 +1064,8 @@ static void push_vertex_attr_values(PGRAPHState *pg)
    float values[NV2A_VERTEXSHADER_ATTRIBUTES][4];
    int num_uniform_attrs = 0;

-    pgraph_get_inline_values(pg, r->shader_binding->state.vsh.uniform_attrs,
-                             values, &num_uniform_attrs);
+    pgraph_get_inline_values(pg, r->shader_binding->state.uniform_attrs, values,
+                             &num_uniform_attrs);

    if (num_uniform_attrs > 0) {
        vkCmdPushConstants(r->command_buffer, r->pipeline_binding->layout,
@ -1434,21 +1420,6 @@ static void begin_pre_draw(PGRAPHState *pg)
    pgraph_vk_ensure_command_buffer(pg);
 }

-static float clamp_line_width_to_device_limits(PGRAPHState *pg, float width)
-{
-    PGRAPHVkState *r = pg->vk_renderer_state;
-
-    float min_width = r->device_props.limits.lineWidthRange[0];
-    float max_width = r->device_props.limits.lineWidthRange[1];
-    float granularity = r->device_props.limits.lineWidthGranularity;
-
-    if (granularity != 0.0f) {
-        float steps = roundf((width - min_width) / granularity);
-        width = min_width + steps * granularity;
-    }
-    return fminf(fmaxf(min_width, width), max_width);
-}
-
 static void begin_draw(PGRAPHState *pg)
 {
    PGRAPHVkState *r = pg->vk_renderer_state;
@ -1520,12 +1491,6 @@ static void begin_draw(PGRAPHState *pg)
            .extent.height = scissor_height,
        };
        vkCmdSetScissor(r->command_buffer, 0, 1, &scissor);
-
-        if (r->pipeline_binding->has_dynamic_line_width) {
-            float line_width =
-                clamp_line_width_to_device_limits(pg, pg->surface_scale_factor);
-            vkCmdSetLineWidth(r->command_buffer, line_width);
-        }
    }

    if (!pg->clearing) {
--- a/hw/xbox/nv2a/pgraph/vk/glsl.c
+++ b/hw/xbox/nv2a/pgraph/vk/glsl.c
@ -1,7 +1,7 @@
 /*
 * Geforce NV2A PGRAPH Vulkan Renderer
 *
- * Copyright (c) 2024-2025 Matt Borgerson
+ * Copyright (c) 2024 Matt Borgerson
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
@ -269,24 +269,12 @@ static void block_to_uniforms(const SpvReflectBlockVariable *block, ShaderUnifor

        assert(member->array.dims_count < 2);

-        int dim = 1;
-        for (int i = 0; i < member->array.dims_count; i++) {
-            dim *= member->array.dims[i];
-        }
-        int stride = MAX(member->array.stride, member->numeric.matrix.stride);
-        if (member->numeric.matrix.column_count) {
-            dim *= member->numeric.matrix.column_count;
-            if (member->array.stride) {
-                stride =
-                    member->array.stride / member->numeric.matrix.column_count;
-            }
-        }
        layout->uniforms[k] = (ShaderUniform){
            .name = strdup(member->name),
            .offset = member->offset,
            .dim_v = MAX(1, member->numeric.vector.component_count),
-            .dim_a = dim,
-            .stride = stride,
+            .dim_a = MAX(member->array.dims_count ? member->array.dims[0] : 1, member->numeric.matrix.column_count),
+            .stride = MAX(member->array.stride, member->numeric.matrix.stride),
        };

        // fprintf(stderr, "<%s offset=%zd dim_v=%zd dim_a=%zd stride=%zd>\n",
@ -368,7 +356,6 @@ ShaderModuleInfo *pgraph_vk_create_shader_module_from_glsl(
    PGRAPHVkState *r, VkShaderStageFlagBits stage, const char *glsl)
 {
    ShaderModuleInfo *info = g_malloc0(sizeof(*info));
-    info->refcnt = 0;
    info->glsl = strdup(glsl);
    info->spirv = pgraph_vk_compile_glsl_to_spv(
        vk_shader_stage_to_glslang_stage(stage), glsl);
@ -387,24 +374,8 @@ static void finalize_uniform_layout(ShaderUniformLayout *layout)
    }
 }

-void pgraph_vk_ref_shader_module(ShaderModuleInfo *info)
-{
-    info->refcnt++;
-}
-
-void pgraph_vk_unref_shader_module(PGRAPHVkState *r, ShaderModuleInfo *info)
-{
-    assert(info->refcnt >= 1);
-
-    info->refcnt--;
-    if (info->refcnt == 0) {
-        pgraph_vk_destroy_shader_module(r, info);
-    }
-}
-
 void pgraph_vk_destroy_shader_module(PGRAPHVkState *r, ShaderModuleInfo *info)
 {
-    assert(info->refcnt == 0);
    if (info->glsl) {
        free(info->glsl);
    }
--- a/hw/xbox/nv2a/pgraph/vk/glsl.h
+++ b/hw/xbox/nv2a/pgraph/vk/glsl.h
@ -1,7 +1,7 @@
 /*
 * Geforce NV2A PGRAPH Vulkan Renderer
 *
- * Copyright (c) 2024-2025 Matt Borgerson
+ * Copyright (c) 2024 Matt Borgerson
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
@ -120,8 +120,8 @@ void *uniform_ptr(ShaderUniformLayout *layout, int idx)
    return (char *)layout->allocation + layout->uniforms[idx - 1].offset;
 }

-static inline void uniform_copy(ShaderUniformLayout *layout, int idx,
-                                void *values, size_t value_size, size_t count)
+static inline
+void uniform_copy(ShaderUniformLayout *layout, int idx, void *values, size_t value_size, size_t count)
 {
 	assert(idx > 0 && "invalid uniform index");

@ -135,7 +135,7 @@ static inline void uniform_copy(ShaderUniformLayout *layout, int idx,

    int index = 0;
    while (bytes_remaining) {
-        assert((p_out + element_size) <= p_max);
+    	assert(p_out < p_max);
    	assert(index < u->dim_a);
        memcpy(p_out, p_in, element_size);
        bytes_remaining -= element_size;
@ -202,10 +202,4 @@ void uniform4i(ShaderUniformLayout *layout, int idx, int v0, int v1, int v2, int
 	uniform1iv(layout, idx, 4, values);
 }

-static inline void uniform1uiv(ShaderUniformLayout *layout, int idx,
-                               size_t count, uint32_t *values)
-{
-    uniform_copy(layout, idx, values, sizeof(uint32_t), count);
-}
-
 #endif
--- a/hw/xbox/nv2a/pgraph/vk/instance.c
+++ b/hw/xbox/nv2a/pgraph/vk/instance.c
@ -1,7 +1,7 @@
 /*
 * Geforce NV2A PGRAPH Vulkan Renderer
 *
- * Copyright (c) 2024-2025 Matt Borgerson
+ * Copyright (c) 2024 Matt Borgerson
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
@ -522,48 +522,36 @@ static bool create_logical_device(PGRAPHState *pg, Error **errp)
        .pQueuePriorities = &queuePriority,
    };

-    // Check device features
-    VkPhysicalDeviceFeatures physical_device_features;
-    vkGetPhysicalDeviceFeatures(r->physical_device, &physical_device_features);
-    memset(&r->enabled_physical_device_features, 0,
-           sizeof(r->enabled_physical_device_features));
+    // Ensure device supports required features
+    VkPhysicalDeviceFeatures available_features, enabled_features;
+    vkGetPhysicalDeviceFeatures(r->physical_device, &available_features);
+    memset(&enabled_features, 0, sizeof(enabled_features));

    struct {
        const char *name;
        VkBool32 available, *enabled;
-        bool required;
-    } desired_features[] = {
-        // clang-format off
-        #define F(n, req) { \
-            .name = #n, \
-            .available = physical_device_features.n, \
-            .enabled = &r->enabled_physical_device_features.n, \
-            .required = req, \
-        }
-        F(shaderClipDistance, true),
-        F(geometryShader, true),
-        F(shaderTessellationAndGeometryPointSize, true),
-        F(depthClamp, true),
-        F(occlusionQueryPrecise, true),
-        F(fillModeNonSolid, true),
-        F(wideLines, false),
+    } required_features[] = {
+        #define F(n) { #n, available_features.n, &enabled_features.n }
+        F(shaderClipDistance),
+        F(geometryShader),
+        F(shaderTessellationAndGeometryPointSize),
+        F(depthClamp),
+        F(occlusionQueryPrecise),
        #undef F
-        // clang-format on
    };

-    bool all_required_features_available = true;
-    for (int i = 0; i < ARRAY_SIZE(desired_features); i++) {
-        if (desired_features[i].required &&
-            desired_features[i].available != VK_TRUE) {
+    bool all_features_available = true;
+    for (int i = 0; i < ARRAY_SIZE(required_features); i++) {
+        if (required_features[i].available != VK_TRUE) {
            fprintf(stderr,
                    "Error: Device does not support required feature %s\n",
-                    desired_features[i].name);
-            all_required_features_available = false;
+                    required_features[i].name);
+            all_features_available = false;
        }
-        *desired_features[i].enabled = desired_features[i].available;
+        *required_features[i].enabled = VK_TRUE;
    }

-    if (!all_required_features_available) {
+    if (!all_features_available) {
        error_setg(errp, "Device does not support required features");
        return false;
    }
@ -596,7 +584,7 @@ static bool create_logical_device(PGRAPHState *pg, Error **errp)
        .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
        .queueCreateInfoCount = 1,
        .pQueueCreateInfos = &queue_create_info,
-        .pEnabledFeatures = &r->enabled_physical_device_features,
+        .pEnabledFeatures = &enabled_features,
        .enabledExtensionCount = enabled_extension_names->len,
        .ppEnabledExtensionNames =
            &g_array_index(enabled_extension_names, const char *, 0),
--- a/hw/xbox/nv2a/pgraph/vk/renderer.h
+++ b/hw/xbox/nv2a/pgraph/vk/renderer.h
@ -1,7 +1,7 @@
 /*
 * Geforce NV2A PGRAPH Vulkan Renderer
 *
- * Copyright (c) 2024-2025 Matt Borgerson
+ * Copyright (c) 2024 Matt Borgerson
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
@ -29,7 +29,7 @@
 #include "hw/xbox/nv2a/nv2a_regs.h"
 #include "hw/xbox/nv2a/pgraph/surface.h"
 #include "hw/xbox/nv2a/pgraph/texture.h"
-#include "hw/xbox/nv2a/pgraph/glsl/shaders.h"
+#include "hw/xbox/nv2a/pgraph/shaders.h"

 #include <vulkan/vulkan.h>
 #include <glslang/Include/glslang_c_interface.h>
@ -77,7 +77,6 @@ typedef struct PipelineBinding {
    VkPipeline pipeline;
    VkRenderPass render_pass;
    unsigned int draw_time;
-    bool has_dynamic_line_width;
 } PipelineBinding;

 enum Buffer {
@ -146,7 +145,6 @@ typedef struct SurfaceBinding {
 } SurfaceBinding;

 typedef struct ShaderModuleInfo {
-    int refcnt;
    char *glsl;
    GByteArray *spirv;
    VkShaderModule module;
@ -156,44 +154,48 @@ typedef struct ShaderModuleInfo {
    ShaderUniformLayout push_constants;
 } ShaderModuleInfo;

-typedef struct ShaderModuleCacheKey {
-    VkShaderStageFlagBits kind;
-    union {
-        struct {
-            VshState state;
-            GenVshGlslOptions glsl_opts;
-        } vsh;
-        struct {
-            GeomState state;
-            GenGeomGlslOptions glsl_opts;
-        } geom;
-        struct {
-            PshState state;
-            GenPshGlslOptions glsl_opts;
-        } psh;
-    };
-} ShaderModuleCacheKey;
-
-typedef struct ShaderModuleCacheEntry {
-    LruNode node;
-    ShaderModuleCacheKey key;
-    ShaderModuleInfo *module_info;
-} ShaderModuleCacheEntry;
-
 typedef struct ShaderBinding {
    LruNode node;
+    /* False until gen_shaders() has built the modules and resolved the
+     * *_loc fields below; reset when the cache entry is (re)initialized or
+     * evicted. */
+    bool initialized;
+
    ShaderState state;
-    struct {
-        ShaderModuleInfo *module_info;
-        VshUniformLocs uniform_locs;
-    } vsh;
-    struct {
-        ShaderModuleInfo *module_info;
-    } geom;
-    struct {
-        ShaderModuleInfo *module_info;
-        PshUniformLocs uniform_locs;
-    } psh;
+    /* Per-stage shader modules; geometry is NULL when no geometry shader was
+     * generated for this state. */
+    ShaderModuleInfo *geometry;
+    ShaderModuleInfo *vertex;
+    ShaderModuleInfo *fragment;
+
+    /* Uniform indices resolved by name through uniform_index(). Most
+     * consumers compare an index against -1 before writing the uniform. */
+    int psh_constant_loc[9][2];
+    int alpha_ref_loc;
+
+    int bump_mat_loc[NV2A_MAX_TEXTURES];
+    int bump_scale_loc[NV2A_MAX_TEXTURES];
+    int bump_offset_loc[NV2A_MAX_TEXTURES];
+    int tex_scale_loc[NV2A_MAX_TEXTURES];
+
+    int surface_size_loc;
+    int clip_range_loc;
+    /* "clipRange" looked up in the fragment layout (clip_range_loc is the
+     * vertex-layout lookup of the same name). */
+    int clip_range_floc;
+    int depth_offset_loc;
+
+    int vsh_constant_loc;
+    uint32_t vsh_constants[NV2A_VERTEXSHADER_CONSTANTS][4];
+
+    int inv_viewport_loc;
+    int ltctxa_loc;
+    int ltctxb_loc;
+    int ltc1_loc;
+
+    int fog_color_loc;
+    int fog_param_loc;
+    int light_infinite_half_vector_loc[NV2A_MAX_LIGHTS];
+    int light_infinite_direction_loc[NV2A_MAX_LIGHTS];
+    int light_local_position_loc[NV2A_MAX_LIGHTS];
+    int light_local_attenuation_loc[NV2A_MAX_LIGHTS];
+
+    int clip_region_loc;
+
+    int material_alpha_loc;
+
+    int uniform_attrs_loc;
 } ShaderBinding;

 typedef struct TextureKey {
@ -329,7 +331,6 @@ typedef struct PGRAPHVkState {
    bool memory_budget_extension_enabled;

    VkPhysicalDevice physical_device;
-    VkPhysicalDeviceFeatures enabled_physical_device_features;
    VkPhysicalDeviceProperties device_props;
    VkDevice device;
    VmaAllocator allocator;
@ -404,10 +405,6 @@ typedef struct PGRAPHVkState {
    ShaderBinding *shader_binding;
    ShaderModuleInfo *quad_vert_module, *solid_frag_module;
    bool shader_bindings_changed;
-    bool use_push_constants_for_uniform_attrs;
-
-    Lru shader_module_cache;
-    ShaderModuleCacheEntry *shader_module_cache_entries;

    // FIXME: Merge these into a structure
    uint64_t uniform_buffer_hashes[2];
@ -464,8 +461,6 @@ VkShaderModule pgraph_vk_create_shader_module_from_spv(PGRAPHVkState *r,
                                                       GByteArray *spv);
 ShaderModuleInfo *pgraph_vk_create_shader_module_from_glsl(
    PGRAPHVkState *r, VkShaderStageFlagBits stage, const char *glsl);
-void pgraph_vk_ref_shader_module(ShaderModuleInfo *info);
-void pgraph_vk_unref_shader_module(PGRAPHVkState *r, ShaderModuleInfo *info);
 void pgraph_vk_destroy_shader_module(PGRAPHVkState *r, ShaderModuleInfo *info);

 // buffer.c
@ -555,6 +550,7 @@ void pgraph_vk_init_shaders(PGRAPHState *pg);
 void pgraph_vk_finalize_shaders(PGRAPHState *pg);
 void pgraph_vk_update_descriptor_sets(PGRAPHState *pg);
 void pgraph_vk_bind_shaders(PGRAPHState *pg);
+void pgraph_vk_update_shader_uniforms(PGRAPHState *pg);

 // reports.c
 void pgraph_vk_init_reports(PGRAPHState *pg);
--- a/hw/xbox/nv2a/pgraph/vk/shaders.c
+++ b/hw/xbox/nv2a/pgraph/vk/shaders.c
@ -1,7 +1,13 @@
 /*
 * Geforce NV2A PGRAPH Vulkan Renderer
 *
- * Copyright (c) 2024-2025 Matt Borgerson
+ * Copyright (c) 2024 Matt Borgerson
+ *
+ * Based on GL implementation:
+ *
+ * Copyright (c) 2015 espes
+ * Copyright (c) 2015 Jannik Vogel
+ * Copyright (c) 2018-2024 Matt Borgerson
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
@ -18,13 +24,15 @@
 */

 #include "qemu/osdep.h"
+#include "hw/xbox/nv2a/pgraph/shaders.h"
+#include "hw/xbox/nv2a/pgraph/util.h"
+#include "hw/xbox/nv2a/pgraph/glsl/geom.h"
+#include "hw/xbox/nv2a/pgraph/glsl/vsh.h"
+#include "hw/xbox/nv2a/pgraph/glsl/psh.h"
 #include "qemu/fast-hash.h"
 #include "qemu/mstring.h"
 #include "renderer.h"
-
-#define VSH_UBO_BINDING 0
-#define PSH_UBO_BINDING 1
-#define PSH_TEX_BINDING 2
+#include <locale.h>

 const size_t MAX_UNIFORM_ATTR_VALUES_SIZE = NV2A_VERTEXSHADER_ATTRIBUTES * 4 * sizeof(float);

@ -151,8 +159,8 @@ void pgraph_vk_update_descriptor_sets(PGRAPHState *pg)
    }

    ShaderBinding *binding = r->shader_binding;
-    ShaderUniformLayout *layouts[] = { &binding->vsh.module_info->uniforms,
-                                       &binding->psh.module_info->uniforms };
+    ShaderUniformLayout *layouts[] = { &binding->vertex->uniforms,
+                                       &binding->fragment->uniforms };
    VkDeviceSize ubo_buffer_total_size = 0;
    for (int i = 0; i < ARRAY_SIZE(layouts); i++) {
        ubo_buffer_total_size += layouts[i]->total_size;
@ -228,72 +236,90 @@ void pgraph_vk_update_descriptor_sets(PGRAPHState *pg)
    r->descriptor_set_index++;
 }

-static void update_shader_uniform_locs(ShaderBinding *binding)
+/*
+ * Resolve, by name, the index of every uniform the renderer writes and store
+ * the results in the *_loc fields of `binding`. Each lookup goes through
+ * uniform_index() against the vertex or fragment module's uniform layout, so
+ * binding->vertex and binding->fragment must already point at valid modules.
+ */
+static void update_shader_constant_locations(ShaderBinding *binding)
 {
-    for (int i = 0; i < ARRAY_SIZE(binding->vsh.uniform_locs); i++) {
-        binding->vsh.uniform_locs[i] = uniform_index(
-            &binding->vsh.module_info->uniforms, VshUniformInfo[i].name);
+    char tmp[64];
+
+    /* lookup fragment shader uniforms */
+    /* Combiner constants are named "c<j>_<i>" for i in 0..8 and j in 0..1. */
+    for (int i = 0; i < 9; i++) {
+        for (int j = 0; j < 2; j++) {
+            snprintf(tmp, sizeof(tmp), "c%d_%d", j, i);
+            binding->psh_constant_loc[i][j] =
+                uniform_index(&binding->fragment->uniforms, tmp);
+        }
+    }
+    binding->alpha_ref_loc =
+        uniform_index(&binding->fragment->uniforms, "alphaRef");
+    binding->fog_color_loc =
+        uniform_index(&binding->fragment->uniforms, "fogColor");
+    /* Starts at 1: element 0 of the bump_* arrays is not written here, and
+     * shader_update_constants() only reads them for stages i > 0. */
+    for (int i = 1; i < NV2A_MAX_TEXTURES; i++) {
+        snprintf(tmp, sizeof(tmp), "bumpMat%d", i);
+        binding->bump_mat_loc[i] =
+            uniform_index(&binding->fragment->uniforms, tmp);
+        snprintf(tmp, sizeof(tmp), "bumpScale%d", i);
+        binding->bump_scale_loc[i] =
+            uniform_index(&binding->fragment->uniforms, tmp);
+        snprintf(tmp, sizeof(tmp), "bumpOffset%d", i);
+        binding->bump_offset_loc[i] =
+            uniform_index(&binding->fragment->uniforms, tmp);
    }

-    for (int i = 0; i < ARRAY_SIZE(binding->psh.uniform_locs); i++) {
-        binding->psh.uniform_locs[i] = uniform_index(
-            &binding->psh.module_info->uniforms, PshUniformInfo[i].name);
+    for (int i = 0; i < NV2A_MAX_TEXTURES; i++) {
+        snprintf(tmp, sizeof(tmp), "texScale%d", i);
+        binding->tex_scale_loc[i] =
+            uniform_index(&binding->fragment->uniforms, tmp);
    }
+
+    /* lookup vertex shader uniforms */
+    /* Note: clip_range_floc and depth_offset_loc below are resolved against
+     * the fragment layout even though they sit in this section. */
+    binding->vsh_constant_loc = uniform_index(&binding->vertex->uniforms, "c");
+    binding->surface_size_loc =
+        uniform_index(&binding->vertex->uniforms, "surfaceSize");
+    binding->clip_range_loc =
+        uniform_index(&binding->vertex->uniforms, "clipRange");
+    binding->clip_range_floc =
+        uniform_index(&binding->fragment->uniforms, "clipRange");
+    binding->depth_offset_loc =
+        uniform_index(&binding->fragment->uniforms, "depthOffset");
+    binding->fog_param_loc =
+        uniform_index(&binding->vertex->uniforms, "fogParam");
+
+    binding->inv_viewport_loc =
+        uniform_index(&binding->vertex->uniforms, "invViewport");
+    binding->ltctxa_loc = uniform_index(&binding->vertex->uniforms, "ltctxa");
+    binding->ltctxb_loc = uniform_index(&binding->vertex->uniforms, "ltctxb");
+    binding->ltc1_loc = uniform_index(&binding->vertex->uniforms, "ltc1");
+
+    /* Per-light uniforms carry the light index as a numeric name suffix. */
+    for (int i = 0; i < NV2A_MAX_LIGHTS; i++) {
+        snprintf(tmp, sizeof(tmp), "lightInfiniteHalfVector%d", i);
+        binding->light_infinite_half_vector_loc[i] =
+            uniform_index(&binding->vertex->uniforms, tmp);
+        snprintf(tmp, sizeof(tmp), "lightInfiniteDirection%d", i);
+        binding->light_infinite_direction_loc[i] =
+            uniform_index(&binding->vertex->uniforms, tmp);
+
+        snprintf(tmp, sizeof(tmp), "lightLocalPosition%d", i);
+        binding->light_local_position_loc[i] =
+            uniform_index(&binding->vertex->uniforms, tmp);
+        snprintf(tmp, sizeof(tmp), "lightLocalAttenuation%d", i);
+        binding->light_local_attenuation_loc[i] =
+            uniform_index(&binding->vertex->uniforms, tmp);
+    }
+
+    binding->clip_region_loc =
+        uniform_index(&binding->fragment->uniforms, "clipRegion");
+
+    binding->material_alpha_loc =
+        uniform_index(&binding->vertex->uniforms, "material_alpha");
+
+    binding->uniform_attrs_loc =
+        uniform_index(&binding->vertex->uniforms, "inlineValue");
 }

-static ShaderModuleInfo *
-get_and_ref_shader_module_for_key(PGRAPHVkState *r,
-                                  const ShaderModuleCacheKey *key)
+/*
+ * init_node callback of the shader cache LRU: copy the lookup key (a
+ * ShaderState) into the node and mark it uninitialized. gen_shaders() builds
+ * the shader modules the first time it sees a node in that state.
+ */
+static void shader_cache_entry_init(Lru *lru, LruNode *node, void *state)
 {
-    uint64_t hash = fast_hash((void *)key, sizeof(ShaderModuleCacheKey));
-    LruNode *node = lru_lookup(&r->shader_module_cache, hash, key);
-    ShaderModuleCacheEntry *module =
-        container_of(node, ShaderModuleCacheEntry, node);
-    pgraph_vk_ref_shader_module(module->module_info);
-    return module->module_info;
-}
-
-static void shader_cache_entry_init(Lru *lru, LruNode *node, const void *state)
-{
-    PGRAPHVkState *r = container_of(lru, PGRAPHVkState, shader_cache);
-    ShaderBinding *binding = container_of(node, ShaderBinding, node);
-    memcpy(&binding->state, state, sizeof(ShaderState));
-
-    NV2A_VK_DPRINTF("cache miss");
-    nv2a_profile_inc_counter(NV2A_PROF_SHADER_GEN);
-
-    ShaderModuleCacheKey key;
-
-    bool need_geometry_shader = pgraph_glsl_need_geom(&binding->state.geom);
-    if (need_geometry_shader) {
-        memset(&key, 0, sizeof(key));
-        key.kind = VK_SHADER_STAGE_GEOMETRY_BIT;
-        key.geom.state = binding->state.geom;
-        key.geom.glsl_opts.vulkan = true;
-        binding->geom.module_info = get_and_ref_shader_module_for_key(r, &key);
-    } else {
-        binding->geom.module_info = NULL;
-    }
-
-    memset(&key, 0, sizeof(key));
-    key.kind = VK_SHADER_STAGE_VERTEX_BIT;
-    key.vsh.state = binding->state.vsh;
-    key.vsh.glsl_opts.vulkan = true;
-    key.vsh.glsl_opts.prefix_outputs = need_geometry_shader;
-    key.vsh.glsl_opts.use_push_constants_for_uniform_attrs =
-        r->use_push_constants_for_uniform_attrs;
-    key.vsh.glsl_opts.ubo_binding = VSH_UBO_BINDING;
-    binding->vsh.module_info = get_and_ref_shader_module_for_key(r, &key);
-
-    memset(&key, 0, sizeof(key));
-    key.kind = VK_SHADER_STAGE_FRAGMENT_BIT;
-    key.psh.state = binding->state.psh;
-    key.psh.glsl_opts.vulkan = true;
-    key.psh.glsl_opts.ubo_binding = PSH_UBO_BINDING;
-    key.psh.glsl_opts.tex_binding = PSH_TEX_BINDING;
-    binding->psh.module_info = get_and_ref_shader_module_for_key(r, &key);
-
-    update_shader_uniform_locs(binding);
+    ShaderBinding *snode = container_of(node, ShaderBinding, node);
+    memcpy(&snode->state, state, sizeof(ShaderState));
+    snode->initialized = false;
 }

 static void shader_cache_entry_post_evict(Lru *lru, LruNode *node)
@ -302,74 +328,25 @@ static void shader_cache_entry_post_evict(Lru *lru, LruNode *node)
    ShaderBinding *snode = container_of(node, ShaderBinding, node);

    ShaderModuleInfo *modules[] = {
-        snode->vsh.module_info,
-        snode->geom.module_info,
-        snode->psh.module_info,
+        snode->geometry,
+        snode->vertex,
+        snode->fragment,
    };
    for (int i = 0; i < ARRAY_SIZE(modules); i++) {
        if (modules[i]) {
-            pgraph_vk_unref_shader_module(r, modules[i]);
+            pgraph_vk_destroy_shader_module(r, modules[i]);
        }
    }
+
+    snode->initialized = false;
 }

-static bool shader_cache_entry_compare(Lru *lru, LruNode *node, const void *key)
+/*
+ * compare_nodes callback of the shader cache LRU. Returns the memcmp() result
+ * converted to bool: true when the node's ShaderState differs from `key`,
+ * false when the two are byte-identical.
+ */
+static bool shader_cache_entry_compare(Lru *lru, LruNode *node, void *key)
 {
    ShaderBinding *snode = container_of(node, ShaderBinding, node);
    return memcmp(&snode->state, key, sizeof(ShaderState));
 }

-static void shader_module_cache_entry_init(Lru *lru, LruNode *node,
-                                           const void *key)
-{
-    PGRAPHVkState *r = container_of(lru, PGRAPHVkState, shader_module_cache);
-    ShaderModuleCacheEntry *module =
-        container_of(node, ShaderModuleCacheEntry, node);
-    memcpy(&module->key, key, sizeof(ShaderModuleCacheKey));
-
-    MString *code;
-
-    switch (module->key.kind) {
-    case VK_SHADER_STAGE_VERTEX_BIT:
-        code = pgraph_glsl_gen_vsh(&module->key.vsh.state,
-                                   module->key.vsh.glsl_opts);
-        break;
-    case VK_SHADER_STAGE_GEOMETRY_BIT:
-        code = pgraph_glsl_gen_geom(&module->key.geom.state,
-                                    module->key.geom.glsl_opts);
-        break;
-    case VK_SHADER_STAGE_FRAGMENT_BIT:
-        code = pgraph_glsl_gen_psh(&module->key.psh.state,
-                                   module->key.psh.glsl_opts);
-        break;
-    default:
-        assert(!"Invalid shader module kind");
-        code = NULL;
-    }
-
-    module->module_info = pgraph_vk_create_shader_module_from_glsl(
-        r, module->key.kind, mstring_get_str(code));
-    pgraph_vk_ref_shader_module(module->module_info);
-    mstring_unref(code);
-}
-
-static void shader_module_cache_entry_post_evict(Lru *lru, LruNode *node)
-{
-    PGRAPHVkState *r = container_of(lru, PGRAPHVkState, shader_module_cache);
-    ShaderModuleCacheEntry *module =
-        container_of(node, ShaderModuleCacheEntry, node);
-    pgraph_vk_unref_shader_module(r, module->module_info);
-    module->module_info = NULL;
-}
-
-static bool shader_module_cache_entry_compare(Lru *lru, LruNode *node,
-                                              const void *key)
-{
-    ShaderModuleCacheEntry *module =
-        container_of(node, ShaderModuleCacheEntry, node);
-    return memcmp(&module->key, key, sizeof(ShaderModuleCacheKey));
-}
-
 static void shader_cache_init(PGRAPHState *pg)
 {
    PGRAPHVkState *r = pg->vk_renderer_state;
@ -384,22 +361,6 @@ static void shader_cache_init(PGRAPHState *pg)
    r->shader_cache.init_node = shader_cache_entry_init;
    r->shader_cache.compare_nodes = shader_cache_entry_compare;
    r->shader_cache.post_node_evict = shader_cache_entry_post_evict;
-
-    /* FIXME: Make this configurable */
-    const size_t shader_module_cache_size = 50 * 1024;
-    lru_init(&r->shader_module_cache);
-    r->shader_module_cache_entries =
-        g_malloc_n(shader_module_cache_size, sizeof(ShaderModuleCacheEntry));
-    assert(r->shader_module_cache_entries != NULL);
-    for (int i = 0; i < shader_module_cache_size; i++) {
-        lru_add_free(&r->shader_module_cache,
-                     &r->shader_module_cache_entries[i].node);
-    }
-
-    r->shader_module_cache.init_node = shader_module_cache_entry_init;
-    r->shader_module_cache.compare_nodes = shader_module_cache_entry_compare;
-    r->shader_module_cache.post_node_evict =
-        shader_module_cache_entry_post_evict;
 }

 static void shader_cache_finalize(PGRAPHState *pg)
@ -409,78 +370,475 @@ static void shader_cache_finalize(PGRAPHState *pg)
    lru_flush(&r->shader_cache);
    g_free(r->shader_cache_entries);
    r->shader_cache_entries = NULL;
-
-    lru_flush(&r->shader_module_cache);
-    g_free(r->shader_module_cache_entries);
-    r->shader_module_cache_entries = NULL;
 }

-static ShaderBinding *get_shader_binding_for_state(PGRAPHVkState *r,
-                                                   const ShaderState *state)
+/*
+ * Return the cached ShaderBinding for `state`, generating and compiling its
+ * GLSL on first use.
+ *
+ * The binding is looked up in r->shader_cache by a hash of the raw
+ * ShaderState bytes. If the node is not yet initialized, the geometry
+ * (optional), vertex and fragment shaders are generated, turned into shader
+ * modules, and the uniform indices are resolved before the node is marked
+ * initialized.
+ */
+static ShaderBinding *gen_shaders(PGRAPHState *pg, ShaderState *state)
 {
+    PGRAPHVkState *r = pg->vk_renderer_state;
+
    uint64_t hash = fast_hash((void *)state, sizeof(*state));
    LruNode *node = lru_lookup(&r->shader_cache, hash, state);
-    ShaderBinding *binding = container_of(node, ShaderBinding, node);
-    NV2A_VK_DPRINTF("shader state hash: %016" PRIx64 " %p", hash, binding);
-    return binding;
+    ShaderBinding *snode = container_of(node, ShaderBinding, node);
+
+    NV2A_VK_DPRINTF("shader state hash: %016" PRIx64 " %p", hash, snode);
+
+    if (!snode->initialized) {
+        NV2A_VK_DPRINTF("cache miss");
+        nv2a_profile_inc_counter(NV2A_PROF_SHADER_GEN);
+
+        /* setlocale() may return a pointer to storage that a later
+         * setlocale() call overwrites, so keep a private copy to restore. */
+        char *previous_numeric_locale = setlocale(LC_NUMERIC, NULL);
+        if (previous_numeric_locale) {
+            previous_numeric_locale = g_strdup(previous_numeric_locale);
+        }
+
+        /* Ensure numeric values are printed with '.' radix, no grouping */
+        setlocale(LC_NUMERIC, "C");
+
+        MString *geometry_shader_code = pgraph_gen_geom_glsl(
+            state->polygon_front_mode, state->polygon_back_mode,
+            state->primitive_mode, state->smooth_shading, true);
+        /* Capture this now: the string is released below, and the pointer
+         * must not be inspected after that. */
+        const bool has_geometry_shader = geometry_shader_code != NULL;
+        if (has_geometry_shader) {
+            NV2A_VK_DPRINTF("geometry shader: \n%s",
+                            mstring_get_str(geometry_shader_code));
+            snode->geometry = pgraph_vk_create_shader_module_from_glsl(
+                r, VK_SHADER_STAGE_GEOMETRY_BIT,
+                mstring_get_str(geometry_shader_code));
+            mstring_unref(geometry_shader_code);
+        } else {
+            snode->geometry = NULL;
+        }
+
+        MString *vertex_shader_code =
+            pgraph_gen_vsh_glsl(state, has_geometry_shader);
+        NV2A_VK_DPRINTF("vertex shader: \n%s",
+                        mstring_get_str(vertex_shader_code));
+        snode->vertex = pgraph_vk_create_shader_module_from_glsl(
+            r, VK_SHADER_STAGE_VERTEX_BIT,
+            mstring_get_str(vertex_shader_code));
+        mstring_unref(vertex_shader_code);
+
+        MString *fragment_shader_code = pgraph_gen_psh_glsl(state->psh);
+        NV2A_VK_DPRINTF("fragment shader: \n%s",
+                        mstring_get_str(fragment_shader_code));
+        snode->fragment = pgraph_vk_create_shader_module_from_glsl(
+            r, VK_SHADER_STAGE_FRAGMENT_BIT,
+            mstring_get_str(fragment_shader_code));
+        mstring_unref(fragment_shader_code);
+
+        if (previous_numeric_locale) {
+            setlocale(LC_NUMERIC, previous_numeric_locale);
+            g_free(previous_numeric_locale);
+        }
+
+        /* Needs snode->vertex and snode->fragment, so it runs last. */
+        update_shader_constant_locations(snode);
+
+        snode->initialized = true;
+    }
+
+    return snode;
 }

-static void apply_uniform_updates(ShaderUniformLayout *layout,
-                                  const UniformInfo *info, int *locs,
-                                  void *values, size_t count)
+/*
+ * Fetch the inline vertex attribute values selected by
+ * binding->state.uniform_attrs via pgraph_get_inline_values() and, when at
+ * least one was returned, write them (four floats per attribute) to the
+ * vertex uniform at binding->uniform_attrs_loc.
+ */
+static void update_uniform_attr_values(PGRAPHState *pg, ShaderBinding *binding)
 {
-    for (int i = 0; i < count; i++) {
-        if (locs[i] != -1) {
-            uniform_copy(layout, locs[i], (char*)values + info[i].val_offs,
-                         4, (info[i].size * info[i].count) / 4);
-        }
+    float values[NV2A_VERTEXSHADER_ATTRIBUTES][4];
+    int num_uniform_attrs = 0;
+
+    pgraph_get_inline_values(pg, binding->state.uniform_attrs, values,
+                             &num_uniform_attrs);
+
+    if (num_uniform_attrs > 0) {
+        uniform1fv(&binding->vertex->uniforms, binding->uniform_attrs_loc,
+                   num_uniform_attrs * 4, &values[0][0]);
    }
 }

-// FIXME: Dirty tracking
-static void update_shader_uniforms(PGRAPHState *pg)
+// FIXME: Move to common
+/*
+ * Write the current PGRAPH state into the uniform layouts of `binding`.
+ *
+ * Covers combiner constants, alpha reference, per-stage bump/texture scale
+ * values, fog, fixed-function lighting and inverse viewport (only when
+ * `fixed_function` is set), vertex program constants, surface size, clip
+ * range, depth offset, window clip regions, material alpha and, unless push
+ * constants are used for them, the inline attribute values.
+ *
+ * `binding_changed` and `vertex_program` are accepted but not read by this
+ * function.
+ */
+static void shader_update_constants(PGRAPHState *pg, ShaderBinding *binding,
+                                    bool binding_changed, bool vertex_program,
+                                    bool fixed_function)
+{
+    ShaderState *state = &binding->state;
+
+    /* update combiner constants */
+    for (int i = 0; i < 9; i++) {
+        uint32_t constant[2];
+        if (i == 8) {
+            /* final combiner */
+            constant[0] = pgraph_reg_r(pg, NV_PGRAPH_SPECFOGFACTOR0);
+            constant[1] = pgraph_reg_r(pg, NV_PGRAPH_SPECFOGFACTOR1);
+        } else {
+            constant[0] = pgraph_reg_r(pg, NV_PGRAPH_COMBINEFACTOR0 + i * 4);
+            constant[1] = pgraph_reg_r(pg, NV_PGRAPH_COMBINEFACTOR1 + i * 4);
+        }
+
+        for (int j = 0; j < 2; j++) {
+            /* Plain int, matching the psh_constant_loc field type. */
+            int loc = binding->psh_constant_loc[i][j];
+            if (loc != -1) {
+                float value[4];
+                pgraph_argb_pack32_to_rgba_float(constant[j], value);
+                uniform1fv(&binding->fragment->uniforms, loc, 4, value);
+            }
+        }
+    }
+    if (binding->alpha_ref_loc != -1) {
+        int alpha_ref = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0),
+                                   NV_PGRAPH_CONTROL_0_ALPHAREF);
+        uniform1i(&binding->fragment->uniforms, binding->alpha_ref_loc,
+                         alpha_ref);
+    }
+
+
+    /* For each texture stage */
+    for (int i = 0; i < NV2A_MAX_TEXTURES; i++) {
+        int loc;
+
+        /* Bump luminance only during stages 1 - 3 */
+        if (i > 0) {
+            loc = binding->bump_mat_loc[i];
+            if (loc != -1) {
+                uint32_t m_u32[4];
+                m_u32[0] = pgraph_reg_r(pg, NV_PGRAPH_BUMPMAT00 + 4 * (i - 1));
+                m_u32[1] = pgraph_reg_r(pg, NV_PGRAPH_BUMPMAT01 + 4 * (i - 1));
+                m_u32[2] = pgraph_reg_r(pg, NV_PGRAPH_BUMPMAT10 + 4 * (i - 1));
+                m_u32[3] = pgraph_reg_r(pg, NV_PGRAPH_BUMPMAT11 + 4 * (i - 1));
+                /* Register words are reinterpreted as floats, not converted. */
+                float m[4];
+                m[0] = *(float*)&m_u32[0];
+                m[1] = *(float*)&m_u32[1];
+                m[2] = *(float*)&m_u32[2];
+                m[3] = *(float*)&m_u32[3];
+                uniformMatrix2fv(&binding->fragment->uniforms, loc, m);
+            }
+            loc = binding->bump_scale_loc[i];
+            if (loc != -1) {
+                uint32_t v =
+                    pgraph_reg_r(pg, NV_PGRAPH_BUMPSCALE1 + (i - 1) * 4);
+                uniform1f(&binding->fragment->uniforms, loc,
+                                 *(float *)&v);
+            }
+            loc = binding->bump_offset_loc[i];
+            if (loc != -1) {
+                uint32_t v =
+                    pgraph_reg_r(pg, NV_PGRAPH_BUMPOFFSET1 + (i - 1) * 4);
+                uniform1f(&binding->fragment->uniforms, loc,
+                                 *(float *)&v);
+            }
+        }
+
+        loc = binding->tex_scale_loc[i];
+        if (loc != -1) {
+            assert(pg->vk_renderer_state->texture_bindings[i] != NULL);
+            float scale = pg->vk_renderer_state->texture_bindings[i]->key.scale;
+            BasicColorFormatInfo f_basic = kelvin_color_format_info_map[pg->vk_renderer_state->texture_bindings[i]->key.state.color_format];
+            /* The binding's scale is only applied to linear formats. */
+            if (!f_basic.linear) {
+                scale = 1.0;
+            }
+            uniform1f(&binding->fragment->uniforms, loc, scale);
+        }
+    }
+
+    if (binding->fog_color_loc != -1) {
+        uint32_t fog_color = pgraph_reg_r(pg, NV_PGRAPH_FOGCOLOR);
+        uniform4f(&binding->fragment->uniforms, binding->fog_color_loc,
+                         GET_MASK(fog_color, NV_PGRAPH_FOGCOLOR_RED) / 255.0,
+                         GET_MASK(fog_color, NV_PGRAPH_FOGCOLOR_GREEN) / 255.0,
+                         GET_MASK(fog_color, NV_PGRAPH_FOGCOLOR_BLUE) / 255.0,
+                         GET_MASK(fog_color, NV_PGRAPH_FOGCOLOR_ALPHA) / 255.0);
+    }
+    if (binding->fog_param_loc != -1) {
+        uint32_t v[2];
+        v[0] = pgraph_reg_r(pg, NV_PGRAPH_FOGPARAM0);
+        v[1] = pgraph_reg_r(pg, NV_PGRAPH_FOGPARAM1);
+        uniform2f(&binding->vertex->uniforms,
+                         binding->fog_param_loc, *(float *)&v[0],
+                         *(float *)&v[1]);
+    }
+
+    /* Maximum depth value for the bound zeta format; only Z16 and Z24S8 are
+     * handled, anything else trips the assert. */
+    float zmax;
+    switch (pg->surface_shape.zeta_format) {
+    case NV097_SET_SURFACE_FORMAT_ZETA_Z16:
+        zmax = pg->surface_shape.z_format ? f16_max : (float)0xFFFF;
+        break;
+    case NV097_SET_SURFACE_FORMAT_ZETA_Z24S8:
+        zmax = pg->surface_shape.z_format ? f24_max : (float)0xFFFFFF;
+        break;
+    default:
+        assert(0);
+    }
+
+    if (fixed_function) {
+        /* update lighting constants */
+        struct {
+            uint32_t *v;
+            int locs;
+            size_t len;
+        } lighting_arrays[] = {
+            { &pg->ltctxa[0][0], binding->ltctxa_loc, NV2A_LTCTXA_COUNT },
+            { &pg->ltctxb[0][0], binding->ltctxb_loc, NV2A_LTCTXB_COUNT },
+            { &pg->ltc1[0][0], binding->ltc1_loc, NV2A_LTC1_COUNT },
+        };
+
+        for (int i = 0; i < ARRAY_SIZE(lighting_arrays); i++) {
+            uniform1iv(
+                &binding->vertex->uniforms, lighting_arrays[i].locs,
+                lighting_arrays[i].len * 4, (void *)lighting_arrays[i].v);
+        }
+
+        for (int i = 0; i < NV2A_MAX_LIGHTS; i++) {
+            int loc = binding->light_infinite_half_vector_loc[i];
+            if (loc != -1) {
+                uniform1fv(&binding->vertex->uniforms, loc, 3,
+                                 pg->light_infinite_half_vector[i]);
+            }
+            loc = binding->light_infinite_direction_loc[i];
+            if (loc != -1) {
+                uniform1fv(&binding->vertex->uniforms, loc, 3,
+                                 pg->light_infinite_direction[i]);
+            }
+
+            loc = binding->light_local_position_loc[i];
+            if (loc != -1) {
+                uniform1fv(&binding->vertex->uniforms, loc, 3,
+                                 pg->light_local_position[i]);
+            }
+            loc = binding->light_local_attenuation_loc[i];
+            if (loc != -1) {
+                uniform1fv(&binding->vertex->uniforms, loc, 3,
+                                 pg->light_local_attenuation[i]);
+            }
+        }
+
+        /* estimate the viewport by assuming it matches the surface ... */
+        unsigned int aa_width = 1, aa_height = 1;
+        pgraph_apply_anti_aliasing_factor(pg, &aa_width, &aa_height);
+
+        float m11 = 0.5 * (pg->surface_binding_dim.width / aa_width);
+        float m22 = -0.5 * (pg->surface_binding_dim.height / aa_height);
+        float m33 = zmax;
+        float m41 = *(float *)&pg->vsh_constants[NV_IGRAPH_XF_XFCTX_VPOFF][0];
+        float m42 = *(float *)&pg->vsh_constants[NV_IGRAPH_XF_XFCTX_VPOFF][1];
+
+        float invViewport[16] = {
+            1.0 / m11, 0,  0, 0,         0, 1.0 / m22,        0,
+            0,         0,  0, 1.0 / m33, 0, -1.0 + m41 / m11, 1.0 + m42 / m22,
+            0,         1.0
+        };
+
+        if (binding->inv_viewport_loc != -1) {
+            uniformMatrix4fv(&binding->vertex->uniforms,
+                                    binding->inv_viewport_loc, &invViewport[0]);
+        }
+    }
+
+    /* update vertex program constants */
+    uniform1iv(&binding->vertex->uniforms, binding->vsh_constant_loc,
+               NV2A_VERTEXSHADER_CONSTANTS * 4, (void *)pg->vsh_constants);
+
+    if (binding->surface_size_loc != -1) {
+        unsigned int aa_width = 1, aa_height = 1;
+        pgraph_apply_anti_aliasing_factor(pg, &aa_width, &aa_height);
+        uniform2f(&binding->vertex->uniforms, binding->surface_size_loc,
+                         pg->surface_binding_dim.width / aa_width,
+                         pg->surface_binding_dim.height / aa_height);
+    }
+
+    /* The same clip range is written to whichever of the vertex and
+     * fragment layouts declares it. */
+    if (binding->clip_range_loc != -1 || binding->clip_range_floc != -1) {
+        uint32_t v[2];
+        v[0] = pgraph_reg_r(pg, NV_PGRAPH_ZCLIPMIN);
+        v[1] = pgraph_reg_r(pg, NV_PGRAPH_ZCLIPMAX);
+        float zclip_min = *(float *)&v[0];
+        float zclip_max = *(float *)&v[1];
+
+        if (binding->clip_range_loc != -1) {
+            uniform4f(&binding->vertex->uniforms, binding->clip_range_loc, 0,
+                      zmax, zclip_min, zclip_max);
+        }
+        if (binding->clip_range_floc != -1) {
+            uniform4f(&binding->fragment->uniforms, binding->clip_range_floc, 0,
+                      zmax, zclip_min, zclip_max);
+        }
+    }
+
+    if (binding->depth_offset_loc != -1) {
+        float zbias = 0.0f;
+
+        if (pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER) &
+            (NV_PGRAPH_SETUPRASTER_POFFSETFILLENABLE |
+             NV_PGRAPH_SETUPRASTER_POFFSETLINEENABLE |
+             NV_PGRAPH_SETUPRASTER_POFFSETPOINTENABLE)) {
+            uint32_t zbias_u32 = pgraph_reg_r(pg, NV_PGRAPH_ZOFFSETBIAS);
+            zbias = *(float *)&zbias_u32;
+
+            if (pgraph_reg_r(pg, NV_PGRAPH_ZOFFSETFACTOR) != 0 &&
+                (pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0) &
+                 NV_PGRAPH_CONTROL_0_Z_PERSPECTIVE_ENABLE)) {
+                /* TODO: emulate zfactor when z_perspective true, i.e.
+                 * w-buffering. Perhaps calculate an additional offset based on
+                 * triangle orientation in geometry shader and pass the result
+                 * to fragment shader and add it to gl_FragDepth as well.
+                 */
+                NV2A_UNIMPLEMENTED("NV_PGRAPH_ZOFFSETFACTOR for w-buffering");
+            }
+        }
+
+        uniform1f(&binding->fragment->uniforms, binding->depth_offset_loc,
+                  zbias);
+    }
+
+    /* Clipping regions */
+    unsigned int max_gl_width = pg->surface_binding_dim.width;
+    unsigned int max_gl_height = pg->surface_binding_dim.height;
+    pgraph_apply_scaling_factor(pg, &max_gl_width, &max_gl_height);
+
+    uint32_t clip_regions[8][4];
+
+    for (int i = 0; i < 8; i++) {
+        uint32_t x = pgraph_reg_r(pg, NV_PGRAPH_WINDOWCLIPX0 + i * 4);
+        unsigned int x_min = GET_MASK(x, NV_PGRAPH_WINDOWCLIPX0_XMIN);
+        unsigned int x_max = GET_MASK(x, NV_PGRAPH_WINDOWCLIPX0_XMAX) + 1;
+        uint32_t y = pgraph_reg_r(pg, NV_PGRAPH_WINDOWCLIPY0 + i * 4);
+        unsigned int y_min = GET_MASK(y, NV_PGRAPH_WINDOWCLIPY0_YMIN);
+        unsigned int y_max = GET_MASK(y, NV_PGRAPH_WINDOWCLIPY0_YMAX) + 1;
+        pgraph_apply_anti_aliasing_factor(pg, &x_min, &y_min);
+        pgraph_apply_anti_aliasing_factor(pg, &x_max, &y_max);
+
+        pgraph_apply_scaling_factor(pg, &x_min, &y_min);
+        pgraph_apply_scaling_factor(pg, &x_max, &y_max);
+
+        /* Stored as (x_min, y_min, x_max, y_max) per region. */
+        clip_regions[i][0] = x_min;
+        clip_regions[i][1] = y_min;
+        clip_regions[i][2] = x_max;
+        clip_regions[i][3] = y_max;
+    }
+    uniform1iv(&binding->fragment->uniforms, binding->clip_region_loc,
+                     8 * 4, (void *)clip_regions);
+
+    if (binding->material_alpha_loc != -1) {
+        uniform1f(&binding->vertex->uniforms, binding->material_alpha_loc,
+                         pg->material_alpha);
+    }
+
+    /* Inline attribute values go through the uniform buffer only when they
+     * are not delivered as push constants. */
+    if (!state->use_push_constants_for_uniform_attrs && state->uniform_attrs) {
+        update_uniform_attr_values(pg, binding);
+    }
+}
+
+// Cheap pre-check of PGRAPH state: returns true when any input consulted
+// below has changed (or no shader is bound yet), meaning the caller has to
+// perform a full shader state inspection; returns false otherwise.
+static bool check_shaders_dirty(PGRAPHState *pg)
+{
+    PGRAPHVkState *r = pg->vk_renderer_state;
+
+    // Nothing bound yet, or program data flagged dirty.
+    if (!r->shader_binding || pg->program_data_dirty) {
+        return true;
+    }
+
+    // Per-stage combiner registers; the stage count is the low byte of
+    // NV_PGRAPH_COMBINECTL.
+    const unsigned int combiner_regs[] = {
+        NV_PGRAPH_COMBINEALPHAI0,
+        NV_PGRAPH_COMBINEALPHAO0,
+        NV_PGRAPH_COMBINECOLORI0,
+        NV_PGRAPH_COMBINECOLORO0,
+    };
+    const int stage_count = pgraph_reg_r(pg, NV_PGRAPH_COMBINECTL) & 0xFF;
+    for (int stage = 0; stage < stage_count; stage++) {
+        const int stage_offset = stage * 4;
+        for (int k = 0; k < ARRAY_SIZE(combiner_regs); k++) {
+            if (pgraph_is_reg_dirty(pg, combiner_regs[k] + stage_offset)) {
+                return true;
+            }
+        }
+    }
+
+    // Stage-independent registers.
+    const unsigned int global_regs[] = {
+        NV_PGRAPH_COMBINECTL,
+        NV_PGRAPH_COMBINESPECFOG0,
+        NV_PGRAPH_COMBINESPECFOG1,
+        NV_PGRAPH_CONTROL_0,
+        NV_PGRAPH_CONTROL_3,
+        NV_PGRAPH_CSV0_C,
+        NV_PGRAPH_CSV0_D,
+        NV_PGRAPH_CSV1_A,
+        NV_PGRAPH_CSV1_B,
+        NV_PGRAPH_POINTSIZE,
+        NV_PGRAPH_SETUPRASTER,
+        NV_PGRAPH_SHADERCLIPMODE,
+        NV_PGRAPH_SHADERCTL,
+        NV_PGRAPH_SHADERPROG,
+        NV_PGRAPH_SHADOWCTL,
+        NV_PGRAPH_ZCOMPRESSOCCLUDE,
+    };
+    for (int k = 0; k < ARRAY_SIZE(global_regs); k++) {
+        if (pgraph_is_reg_dirty(pg, global_regs[k])) {
+            return true;
+        }
+    }
+
+    // Values tracked on PGRAPHState versus the ones stored in the bound
+    // shader state.
+    const ShaderState *bound = &r->shader_binding->state;
+    const bool tracked_fields_match =
+        pg->uniform_attrs == bound->uniform_attrs &&
+        pg->swizzle_attrs == bound->swizzle_attrs &&
+        pg->compressed_attrs == bound->compressed_attrs &&
+        pg->primitive_mode == bound->primitive_mode &&
+        pg->surface_scale_factor == bound->surface_scale_factor;
+    if (!tracked_fields_match) {
+        return true;
+    }
+
+    // Textures
+    for (int tex = 0; tex < 4; tex++) {
+        const int tex_offset = tex * 4;
+
+        if (pg->texture_matrix_enable[tex] !=
+            bound->texture_matrix_enable[tex]) {
+            return true;
+        }
+        if (pgraph_is_reg_dirty(pg, NV_PGRAPH_TEXCTL0_0 + tex_offset) ||
+            pgraph_is_reg_dirty(pg, NV_PGRAPH_TEXFILTER0 + tex_offset) ||
+            pgraph_is_reg_dirty(pg, NV_PGRAPH_TEXFMT0 + tex_offset)) {
+            return true;
+        }
+    }
+
+    // Nothing relevant changed; record the fast-path hit.
+    nv2a_profile_inc_counter(NV2A_PROF_SHADER_BIND_NOTDIRTY);
+
+    return false;
+}
+
+// Select the shader binding matching the current PGRAPH state and refresh its
+// uniforms. r->shader_bindings_changed is cleared on entry and set only when
+// a binding is (re)generated by gen_shaders().
+void pgraph_vk_bind_shaders(PGRAPHState *pg)
 {
    NV2A_VK_DGROUP_BEGIN("%s", __func__);

    PGRAPHVkState *r = pg->vk_renderer_state;
+
+    r->shader_bindings_changed = false;
+
+    // Only rebuild the full shader state when the quick check reports a
+    // possible change.
+    if (check_shaders_dirty(pg)) {
+        ShaderState new_state;
+        // NOTE(review): this memset is immediately overwritten by the struct
+        // assignment below, so it may not guarantee zeroed padding for the
+        // memcmp further down - confirm pgraph_get_shader_state() returns a
+        // fully zero-initialised struct.
+        memset(&new_state, 0, sizeof(ShaderState));
+        new_state = pgraph_get_shader_state(pg);
+        new_state.vulkan = true;
+        new_state.psh.vulkan = true;
+        // Push constants are used for uniform attributes only when the
+        // device limit is at least MAX_UNIFORM_ATTR_VALUES_SIZE.
+        new_state.use_push_constants_for_uniform_attrs =
+            (r->device_props.limits.maxPushConstantsSize >=
+             MAX_UNIFORM_ATTR_VALUES_SIZE);
+
+        // Byte-wise comparison: switch bindings only if there is none yet or
+        // the freshly computed state differs from the bound one.
+        if (!r->shader_binding || memcmp(&r->shader_binding->state, &new_state, sizeof(ShaderState))) {
+            r->shader_binding = gen_shaders(pg, &new_state);
+            r->shader_bindings_changed = true;
+        }
+    }
+
+    // FIXME: Use dirty bits
+    pgraph_vk_update_shader_uniforms(pg);
+
+    NV2A_VK_DGROUP_END();
+}
+
+void pgraph_vk_update_shader_uniforms(PGRAPHState *pg)
+{
+    PGRAPHVkState *r = pg->vk_renderer_state;
+    NV2A_VK_DGROUP_BEGIN("%s", __func__);
    nv2a_profile_inc_counter(NV2A_PROF_SHADER_BIND);

    assert(r->shader_binding);
    ShaderBinding *binding = r->shader_binding;
-    ShaderUniformLayout *layouts[] = { &binding->vsh.module_info->uniforms,
-                                       &binding->psh.module_info->uniforms };
-
-    VshUniformValues vsh_values;
-    pgraph_glsl_set_vsh_uniform_values(pg, &binding->state.vsh,
-                                  binding->vsh.uniform_locs, &vsh_values);
-    apply_uniform_updates(&binding->vsh.module_info->uniforms, VshUniformInfo,
-                          binding->vsh.uniform_locs, &vsh_values,
-                          VshUniform__COUNT);
-
-    PshUniformValues psh_values;
-    pgraph_glsl_set_psh_uniform_values(pg, binding->psh.uniform_locs,
-                                       &psh_values);
-    for (int i = 0; i < 4; i++) {
-        assert(r->texture_bindings[i] != NULL);
-        float scale = r->texture_bindings[i]->key.scale;
-
-        BasicColorFormatInfo f_basic =
-            kelvin_color_format_info_map[pg->vk_renderer_state
-                                             ->texture_bindings[i]
-                                             ->key.state.color_format];
-        if (!f_basic.linear) {
-            scale = 1.0;
-        }
-
-        psh_values.texScale[i] = scale;
-    }
-    apply_uniform_updates(&binding->psh.module_info->uniforms, PshUniformInfo,
-                          binding->psh.uniform_locs, &psh_values,
-                          PshUniform__COUNT);
+    ShaderUniformLayout *layouts[] = { &binding->vertex->uniforms,
+                                        &binding->fragment->uniforms };
+    shader_update_constants(pg, r->shader_binding, true,
+                            r->shader_binding->state.vertex_program,
+                            r->shader_binding->state.fixed_function);

    for (int i = 0; i < ARRAY_SIZE(layouts); i++) {
-        uint64_t hash =
-            fast_hash(layouts[i]->allocation, layouts[i]->total_size);
+        uint64_t hash = fast_hash(layouts[i]->allocation, layouts[i]->total_size);
        r->uniforms_changed |= (hash != r->uniform_buffer_hashes[i]);
        r->uniform_buffer_hashes[i] = hash;
    }
@ -492,44 +850,13 @@ static void update_shader_uniforms(PGRAPHState *pg)
    NV2A_VK_DGROUP_END();
 }

-void pgraph_vk_bind_shaders(PGRAPHState *pg)
-{
-    NV2A_VK_DGROUP_BEGIN("%s", __func__);
-
-    PGRAPHVkState *r = pg->vk_renderer_state;
-
-    r->shader_bindings_changed = false;
-
-    if (!r->shader_binding ||
-        pgraph_glsl_check_shader_state_dirty(pg, &r->shader_binding->state)) {
-        ShaderState new_state = pgraph_glsl_get_shader_state(pg);
-        if (!r->shader_binding || memcmp(&r->shader_binding->state, &new_state,
-                                         sizeof(ShaderState))) {
-            r->shader_binding = get_shader_binding_for_state(r, &new_state);
-            r->shader_bindings_changed = true;
-        }
-    } else {
-        nv2a_profile_inc_counter(NV2A_PROF_SHADER_BIND_NOTDIRTY);
-    }
-
-    update_shader_uniforms(pg);
-
-    NV2A_VK_DGROUP_END();
-}
-
 void pgraph_vk_init_shaders(PGRAPHState *pg)
 {
-    PGRAPHVkState *r = pg->vk_renderer_state;
-
    pgraph_vk_init_glsl_compiler();
    create_descriptor_pool(pg);
    create_descriptor_set_layout(pg);
    create_descriptor_sets(pg);
    shader_cache_init(pg);
-
-    r->use_push_constants_for_uniform_attrs =
-        (r->device_props.limits.maxPushConstantsSize >=
-         MAX_UNIFORM_ATTR_VALUES_SIZE);
 }

 void pgraph_vk_finalize_shaders(PGRAPHState *pg)
--- a/hw/xbox/nv2a/pgraph/vk/surface-compute.c
+++ b/hw/xbox/nv2a/pgraph/vk/surface-compute.c
@ -524,8 +524,7 @@ void pgraph_vk_unpack_depth_stencil(PGRAPHState *pg, SurfaceBinding *surface,
    pgraph_vk_end_debug_marker(r, cmd);
 }

-static void pipeline_cache_entry_init(Lru *lru, LruNode *node,
-                                      const void *state)
+static void pipeline_cache_entry_init(Lru *lru, LruNode *node, void *state)
 {
    PGRAPHVkState *r = container_of(lru, PGRAPHVkState, compute.pipeline_cache);
    ComputePipeline *snode = container_of(node, ComputePipeline, node);
@ -557,8 +556,7 @@ static void pipeline_cache_entry_post_evict(Lru *lru, LruNode *node)
    pipeline_cache_release_node_resources(r, snode);
 }

-static bool pipeline_cache_entry_compare(Lru *lru, LruNode *node,
-                                         const void *key)
+static bool pipeline_cache_entry_compare(Lru *lru, LruNode *node, void *key)
 {
    ComputePipeline *snode = container_of(node, ComputePipeline, node);
    return memcmp(&snode->key, key, sizeof(ComputePipelineKey));
--- a/hw/xbox/nv2a/pgraph/vk/surface.c
+++ b/hw/xbox/nv2a/pgraph/vk/surface.c
@ -122,22 +122,17 @@ static void memcpy_image(void *dst, void const *src, int dst_stride,
    }
 }

-static bool check_surface_overlaps_range(const SurfaceBinding *surface,
-                                         hwaddr range_start, hwaddr range_len)
-{
-    hwaddr surface_end = surface->vram_addr + surface->size;
-    hwaddr range_end = range_start + range_len;
-    return !(surface->vram_addr >= range_end || range_start >= surface_end);
-}
-
-void pgraph_vk_download_surfaces_in_range_if_dirty(PGRAPHState *pg,
-                                                   hwaddr start, hwaddr size)
+/*
+ * Hand every tracked surface whose VRAM extent intersects the byte range
+ * [start, start + size) to pgraph_vk_surface_download_if_dirty().
+ *
+ * pg:    PGRAPH state owning the Vulkan renderer state
+ * start: first byte of the range
+ * size:  length of the range in bytes; an empty range matches nothing
+ */
+void pgraph_vk_download_surfaces_in_range_if_dirty(PGRAPHState *pg, hwaddr start, hwaddr size)
 {
    PGRAPHVkState *r = pg->vk_renderer_state;
    SurfaceBinding *surface;

+    /* Exclusive end: first byte past the requested range. */
+    hwaddr end = start + size;
+
    QTAILQ_FOREACH(surface, &r->surfaces, entry) {
-        if (check_surface_overlaps_range(surface, start, size)) {
+        /*
+         * Half-open intervals do not overlap exactly when one starts at or
+         * after the other's exclusive end. Using exclusive ends keeps the
+         * `>=` comparisons correct (inclusive ends would miss a surface that
+         * touches the last byte of the range) and avoids `- 1` underflow for
+         * zero-length ranges.
+         */
+        hwaddr surf_end = surface->vram_addr + surface->size;
+        bool overlapping = !(surface->vram_addr >= end || start >= surf_end);
+        if (overlapping) {
            pgraph_vk_surface_download_if_dirty(
                container_of(pg, NV2AState, pgraph), surface);
        }
@ -532,54 +527,33 @@ void pgraph_vk_download_dirty_surfaces(NV2AState *d)
+/*
+ * CPU memory access hook for a single surface. `opaque` is the SurfaceBinding
+ * the callback was registered with; the accessed address is asserted to lie
+ * inside that surface. If the surface is draw-dirty, waits for its download
+ * via pgraph_vk_wait_for_surface_download(). On a write, flags the surface
+ * as upload-pending. `mr` and `len` are not used.
+ */
 static void surface_access_callback(void *opaque, MemoryRegion *mr, hwaddr addr,
                                    hwaddr len, bool write)
 {
-    NV2AState *d = (NV2AState *)opaque;
-    qemu_mutex_lock(&d->pgraph.lock);
+    SurfaceBinding *e = opaque;
+    assert(addr >= e->vram_addr);
+    hwaddr offset = addr - e->vram_addr;
+    assert(offset < e->size);

-    PGRAPHVkState *r = d->pgraph.vk_renderer_state;
-    bool wait_for_downloads = false;
-
-    SurfaceBinding *surface;
-    QTAILQ_FOREACH(surface, &r->surfaces, entry) {
-        if (!check_surface_overlaps_range(surface, addr, len)) {
-            continue;
+    /* Flags are read atomically; no lock is taken in this function. */
+    if (qatomic_read(&e->draw_dirty)) {
+        trace_nv2a_pgraph_surface_cpu_access(e->vram_addr, offset);
+        pgraph_vk_wait_for_surface_download(e);
    }

-        hwaddr offset = addr - surface->vram_addr;
-
-        if (write) {
-            trace_nv2a_pgraph_surface_cpu_write(surface->vram_addr, offset);
-        } else {
-            trace_nv2a_pgraph_surface_cpu_read(surface->vram_addr, offset);
-        }
-
-        if (surface->draw_dirty) {
-            surface->download_pending = true;
-            wait_for_downloads = true;
-        }
-
-        if (write) {
-            surface->upload_pending = true;
-        }
-    }
-
-    qemu_mutex_unlock(&d->pgraph.lock);
-
-    if (wait_for_downloads) {
-        qemu_mutex_lock(&d->pfifo.lock);
-        qemu_event_reset(&r->downloads_complete);
-        qatomic_set(&r->downloads_pending, true);
-        pfifo_kick(d);
-        qemu_mutex_unlock(&d->pfifo.lock);
-        qemu_event_wait(&r->downloads_complete);
+    /* Only the first write after the flag was cleared traces and sets it. */
+    if (write && !qatomic_read(&e->upload_pending)) {
+        trace_nv2a_pgraph_surface_cpu_access(e->vram_addr, offset);
+        qatomic_set(&e->upload_pending, true);
    }
 }

+/*
+ * When TCG is enabled, install surface_access_callback() over the surface's
+ * VRAM range on CPU 0, passing the surface itself as the opaque argument and
+ * &surface->access_cb to mem_access_callback_insert(). Does nothing
+ * otherwise.
+ */
 static void register_cpu_access_callback(NV2AState *d, SurfaceBinding *surface)
 {
    if (tcg_enabled()) {
-        surface->access_cb = mem_access_callback_insert(
-            qemu_get_cpu(0), d->vram, surface->vram_addr, surface->size,
-            &surface_access_callback, d);
+        /*
+         * pgraph.lock is released before the BQL is taken and re-acquired
+         * after it is dropped.
+         * NOTE(review): presumably this avoids a lock-order inversion with
+         * the BQL; PGRAPH state may change while pgraph.lock is released -
+         * confirm callers tolerate that window.
+         */
+        qemu_mutex_unlock(&d->pgraph.lock);
+        bql_lock();
+        mem_access_callback_insert(qemu_get_cpu(0),
+            d->vram, surface->vram_addr, surface->size,
+            &surface->access_cb, &surface_access_callback,
+            surface);
+        bql_unlock();
+        qemu_mutex_lock(&d->pgraph.lock);
    }
 }

@ -587,7 +561,11 @@ static void unregister_cpu_access_callback(NV2AState *d,
                                           SurfaceBinding const *surface)
 {
    if (tcg_enabled()) {
+        qemu_mutex_unlock(&d->pgraph.lock);
+        bql_lock();
        mem_access_callback_remove_by_ref(qemu_get_cpu(0), surface->access_cb);
+        bql_unlock();
+        qemu_mutex_lock(&d->pgraph.lock);
    }
 }

@ -649,26 +627,24 @@ static void invalidate_surface(NV2AState *d, SurfaceBinding *surface)
    QTAILQ_INSERT_HEAD(&r->invalid_surfaces, surface, entry);
 }

-static bool check_surfaces_overlap(const SurfaceBinding *surface,
-                                   const SurfaceBinding *other_surface)
-{
-    return check_surface_overlaps_range(surface, other_surface->vram_addr,
-                                        other_surface->size);
-}
-
 static void invalidate_overlapping_surfaces(NV2AState *d,
                                            SurfaceBinding const *surface)
 {
    PGRAPHVkState *r = d->pgraph.vk_renderer_state;

-    SurfaceBinding *other_surface, *next_surface;
-    QTAILQ_FOREACH_SAFE (other_surface, &r->surfaces, entry, next_surface) {
-        if (check_surfaces_overlap(surface, other_surface)) {
+    uintptr_t e_end = surface->vram_addr + surface->size - 1;
+
+    SurfaceBinding *s, *next;
+    QTAILQ_FOREACH_SAFE(s, &r->surfaces, entry, next) {
+        uintptr_t s_end = s->vram_addr + s->size - 1;
+        bool overlapping =
+            !(s->vram_addr > e_end || surface->vram_addr > s_end);
+        if (overlapping) {
            trace_nv2a_pgraph_surface_evict_overlapping(
-                other_surface->vram_addr, other_surface->width,
-                other_surface->height, other_surface->pitch);
-            pgraph_vk_surface_download_if_dirty(d, other_surface);
-            invalidate_surface(d, other_surface);
+                s->vram_addr, s->width, s->height,
+                s->pitch);
+            pgraph_vk_surface_download_if_dirty(d, s);
+            invalidate_surface(d, s);
        }
    }
 }
--- a/hw/xbox/nv2a/pgraph/vk/texture.c
+++ b/hw/xbox/nv2a/pgraph/vk/texture.c
@ -1089,9 +1089,12 @@ static void create_texture(PGRAPHState *pg, int texture_idx)
    BasicColorFormatInfo f_basic = kelvin_color_format_info_map[state.color_format];

    const hwaddr texture_vram_offset = pgraph_get_texture_phys_addr(pg, texture_idx);
+    size_t texture_palette_data_size;
+    const hwaddr texture_palette_vram_offset =
+        pgraph_get_texture_palette_phys_addr_length(pg, texture_idx,
+                                                    &texture_palette_data_size);
+
    size_t texture_length = pgraph_get_texture_length(pg, &state);
-    hwaddr texture_palette_vram_offset = 0;
-    size_t texture_palette_data_size = 0;

    uint32_t filter =
        pgraph_reg_r(pg, NV_PGRAPH_TEXFILTER0 + texture_idx * 4);
@ -1099,21 +1102,14 @@ static void create_texture(PGRAPHState *pg, int texture_idx)
        pgraph_reg_r(pg, NV_PGRAPH_TEXADDRESS0 + texture_idx * 4);
    uint32_t border_color_pack32 =
        pgraph_reg_r(pg, NV_PGRAPH_BORDERCOLOR0 + texture_idx * 4);
-    bool is_indexed = (state.color_format ==
-            NV097_SET_TEXTURE_FORMAT_COLOR_SZ_I8_A8R8G8B8);

    TextureKey key;
    memset(&key, 0, sizeof(key));
    key.state = state;
    key.texture_vram_offset = texture_vram_offset;
    key.texture_length = texture_length;
-    if (is_indexed) {
-        texture_palette_vram_offset =
-            pgraph_get_texture_palette_phys_addr_length(
-                pg, texture_idx, &texture_palette_data_size);
    key.palette_vram_offset = texture_palette_vram_offset;
    key.palette_length = texture_palette_data_size;
-    }
    key.scale = 1;

    // FIXME: Separate sampler from texture
@ -1121,6 +1117,9 @@ static void create_texture(PGRAPHState *pg, int texture_idx)
    key.address = address;
    key.border_color = border_color_pack32;

+    bool is_indexed = (state.color_format ==
+            NV097_SET_TEXTURE_FORMAT_COLOR_SZ_I8_A8R8G8B8);
+
    bool possibly_dirty = false;
    bool possibly_dirty_checked = false;
    bool surface_to_texture = false;
@ -1433,7 +1432,7 @@ void pgraph_vk_bind_textures(NV2AState *d)
    NV2A_VK_DGROUP_END();
 }

-static void texture_cache_entry_init(Lru *lru, LruNode *node, const void *state)
+static void texture_cache_entry_init(Lru *lru, LruNode *node, void *state)
 {
    TextureBinding *snode = container_of(node, TextureBinding, node);

@ -1486,8 +1485,7 @@ static void texture_cache_entry_post_evict(Lru *lru, LruNode *node)
    texture_cache_release_node_resources(r, snode);
 }

-static bool texture_cache_entry_compare(Lru *lru, LruNode *node,
-                                        const void *key)
+static bool texture_cache_entry_compare(Lru *lru, LruNode *node, void *key)
 {
    TextureBinding *snode = container_of(node, TextureBinding, node);
    return memcmp(&snode->key, key, sizeof(TextureKey));
--- a/hw/xbox/nv2a/pgraph/vsh_regs.h
+++ b/hw/xbox/nv2a/pgraph/vsh_regs.h
@ -20,8 +20,8 @@
 #ifndef HW_NV2A_VSH_H
 #define HW_NV2A_VSH_H

-#include "qemu/osdep.h"
-#include "hw/xbox/nv2a/nv2a_regs.h"
+#include <stdbool.h>
+#include "qemu/mstring.h"

 enum VshLight {
    LIGHT_OFF,
@ -55,6 +55,8 @@ enum VshFoggen {
    FOGGEN_RADIAL,
    FOGGEN_PLANAR,
    FOGGEN_ABS_PLANAR,
+    FOGGEN_ERROR4,
+    FOGGEN_ERROR5,
    FOGGEN_FOG_X
 };

@ -82,60 +84,6 @@ enum VshSkinning {

 #define VSH_TOKEN_SIZE 4

-#define VSH_D3DSCM_CORRECTION 96
-
-typedef enum {
-    PARAM_UNKNOWN = 0,
-    PARAM_R,
-    PARAM_V,
-    PARAM_C
-} VshParameterType;
-
-typedef enum {
-    OUTPUT_C = 0,
-    OUTPUT_O
-} VshOutputType;
-
-typedef enum {
-    OMUX_MAC = 0,
-    OMUX_ILU
-} VshOutputMux;
-
-typedef enum {
-    ILU_NOP = 0,
-    ILU_MOV,
-    ILU_RCP,
-    ILU_RCC,
-    ILU_RSQ,
-    ILU_EXP,
-    ILU_LOG,
-    ILU_LIT
-} VshILU;
-
-typedef enum {
-    MAC_NOP,
-    MAC_MOV,
-    MAC_MUL,
-    MAC_ADD,
-    MAC_MAD,
-    MAC_DP3,
-    MAC_DPH,
-    MAC_DP4,
-    MAC_DST,
-    MAC_MIN,
-    MAC_MAX,
-    MAC_SLT,
-    MAC_SGE,
-    MAC_ARL
-} VshMAC;
-
-typedef enum {
-    SWIZZLE_X = 0,
-    SWIZZLE_Y,
-    SWIZZLE_Z,
-    SWIZZLE_W
-} VshSwizzle;
-
 typedef enum {
    FLD_ILU = 0,
    FLD_MAC,
@ -182,30 +130,4 @@ typedef enum {

 uint8_t vsh_get_field(const uint32_t *shader_token, VshFieldName field_name);

-enum ShaderPrimitiveMode {
-    PRIM_TYPE_INVALID,
-    PRIM_TYPE_POINTS,
-    PRIM_TYPE_LINES,
-    PRIM_TYPE_LINE_LOOP,
-    PRIM_TYPE_LINE_STRIP,
-    PRIM_TYPE_TRIANGLES,
-    PRIM_TYPE_TRIANGLE_STRIP,
-    PRIM_TYPE_TRIANGLE_FAN,
-    PRIM_TYPE_QUADS,
-    PRIM_TYPE_QUAD_STRIP,
-    PRIM_TYPE_POLYGON,
-};
-
-enum ShaderPolygonMode {
-    POLY_MODE_FILL,
-    POLY_MODE_POINT,
-    POLY_MODE_LINE,
-};
-
-enum MaterialColorSource {
-    MATERIAL_COLOR_SRC_MATERIAL,
-    MATERIAL_COLOR_SRC_DIFFUSE,
-    MATERIAL_COLOR_SRC_SPECULAR,
-};
-
 #endif
--- a/hw/xbox/nv2a/trace-events
+++ b/hw/xbox/nv2a/trace-events
@ -11,8 +11,7 @@ nv2a_pgraph_method(uint32_t subchannel, uint32_t graphics_class, uint32_t method
 nv2a_pgraph_method_abbrev(uint32_t subchannel, uint32_t graphics_class, uint32_t method, const char *name, unsigned int count) "%d: 0x%"PRIx32" -> 0x%04"PRIx32" %s * %d"
 nv2a_pgraph_method_unhandled(uint32_t subchannel, uint32_t graphics_class, uint32_t method, uint32_t parameter) "%d: 0x%"PRIx32" -> 0x%04"PRIx32" 0x%"PRIx32
 nv2a_pgraph_surface_compare_mismatch(const char *field, long int a, long int b) "%20s -- %8ld vs %8ld"
-nv2a_pgraph_surface_cpu_read(uint32_t addr, uint32_t offset) "0x%08"PRIx32"+0x%"PRIx32
-nv2a_pgraph_surface_cpu_write(uint32_t addr, uint32_t offset) "0x%08"PRIx32"+0x%"PRIx32
+nv2a_pgraph_surface_cpu_access(uint32_t addr, uint32_t offset) "0x%08"PRIx32"+0x%"PRIx32
 nv2a_pgraph_surface_create_color(uint32_t addr, uint32_t width, uint32_t height, const char *layout, uint32_t anti_aliasing, uint32_t clip_x, uint32_t clip_width, uint32_t clip_y, uint32_t clip_height, uint32_t pitch) "Create: [COLOR @ 0x%08" PRIx32 " (%dx%d)] (%s) aa:%d, clip:x=%d,w=%d,y=%d,h=%d,p=%d"
 nv2a_pgraph_surface_create_zeta(uint32_t addr, uint32_t width, uint32_t height, const char *layout, uint32_t anti_aliasing, uint32_t clip_x, uint32_t clip_width, uint32_t clip_y, uint32_t clip_height, uint32_t pitch) " Create: [ZETA  @ 0x%08" PRIx32 " (%dx%d)] (%s) aa:%d, clip:x=%d,w=%d,y=%d,h=%d,p=%d"
 nv2a_pgraph_surface_download(const char *binding, const char *layout, uint32_t addr, uint32_t width, uint32_t height, uint32_t pitch, uint32_t bytes_per_pixel) "[GPU->RAM] %s (%s) surface @ 0x%08" PRIx32 " (w=%d,h=%d,p=%d,bpp=%d)"
--- a/hw/xbox/nvnet.c
+++ b/hw/xbox/nvnet.c
--- a/Show More
+++ b/Show More
				`@ -1 +0,0 @@`
				`#include "trace/trace-hw_xbox_mcpx_apu_dsp.h"`