diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 00000000..8b206540
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,18 @@
+# Vendored Dependencies
+src/frontend/glad/** linguist-vendored
+src/frontend/qt_sdl/gif-h/** linguist-vendored
+src/frontend/qt_sdl/toml/** linguist-vendored
+src/net/libslirp/** linguist-vendored
+src/net/pcap/** linguist-vendored
+src/sha1/** linguist-vendored
+src/teakra/** linguist-vendored
+src/tiny-AES-c/** linguist-vendored
+src/xxhash/** linguist-vendored
+
+# A handful of custom files embedded in the vendored dependencies
+
+## Ad-hoc CMakeLists.txt for melonDS
+src/net/libslirp/src/CMakeLists.txt -linguist-vendored
+
+## glib stub
+src/net/libslirp/src/glib/** -linguist-vendored
diff --git a/.github/workflows/build-appimage.yml b/.github/workflows/build-appimage.yml
deleted file mode 100644
index be4494e8..00000000
--- a/.github/workflows/build-appimage.yml
+++ /dev/null
@@ -1,55 +0,0 @@
-name: AppImage
-
-on:
- push:
- branches:
- - master
- pull_request:
- branches:
- - master
-
-jobs:
- build:
-
- runs-on: ubuntu-20.04
-
- steps:
- - uses: actions/checkout@v1
- - name: Install dependencies
- run: |
- sudo rm -f /etc/apt/sources.list.d/dotnetdev.list /etc/apt/sources.list.d/microsoft-prod.list
- sudo apt update
- sudo apt install cmake extra-cmake-modules libcurl4-gnutls-dev libpcap0.8-dev libsdl2-dev libqt5multimedia5-plugins qt5-default qtbase5-private-dev qtmultimedia5-dev libslirp0 libslirp-dev libarchive-dev zstd libzstd-dev --allow-downgrades
- - name: Create build environment
- run: mkdir ${{runner.workspace}}/build
- - name: Configure
- working-directory: ${{runner.workspace}}/build
- run: cmake $GITHUB_WORKSPACE
- - name: Make
- working-directory: ${{runner.workspace}}/build
- run: |
- make -j$(nproc --all)
- - name: Prepare AppDir for AppImage
- working-directory: ${{runner.workspace}}/build
- run: |
- make install DESTDIR=AppDir
- mv ./AppDir/usr/local/bin ./AppDir/usr/bin
- mv ./AppDir/usr/local/share ./AppDir/usr/share
- rm -rf ./AppDir/usr/local
- - name: Prepare necessary Tools for building the AppImage
- working-directory: ${{runner.workspace}}/build
- run: |
- wget https://github.com/linuxdeploy/linuxdeploy/releases/download/continuous/linuxdeploy-x86_64.AppImage
- wget https://github.com/linuxdeploy/linuxdeploy-plugin-qt/releases/download/continuous/linuxdeploy-plugin-qt-x86_64.AppImage
- chmod a+x linuxdeploy-x86_64.AppImage
- chmod a+x linuxdeploy-plugin-qt-x86_64.AppImage
- - name: Build the AppImage
- working-directory: ${{runner.workspace}}/build
- run: |
- ./linuxdeploy-x86_64.AppImage --appdir AppDir --plugin qt --output appimage
- mkdir dist
- cp ./melonDS*.AppImage ./dist
- - uses: actions/upload-artifact@v1
- with:
- name: melonDS-appimage-x86_64
- path: ${{runner.workspace}}/build/dist
diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml
index 6d5693a1..f47b3a4a 100644
--- a/.github/workflows/build-macos.yml
+++ b/.github/workflows/build-macos.yml
@@ -4,10 +4,16 @@ on:
push:
branches:
- master
+ - ci/vcpkg-update
pull_request:
branches:
- master
+env:
+ MELONDS_GIT_BRANCH: ${{ github.ref }}
+ MELONDS_GIT_HASH: ${{ github.sha }}
+ MELONDS_BUILD_PROVIDER: GitHub Actions
+
jobs:
build-macos:
strategy:
@@ -15,24 +21,25 @@ jobs:
arch: [x86_64, arm64]
name: ${{ matrix.arch }}
- runs-on: macos-13
+ runs-on: macos-14
steps:
- name: Check out sources
uses: actions/checkout@v3
- name: Install dependencies for package building
run: |
- brew install autoconf automake autoconf-archive libtool && pip3 install setuptools
+ brew install autoconf automake autoconf-archive libtool python-setuptools
- name: Set up CMake
uses: lukka/get-cmake@latest
- name: Set up vcpkg
uses: lukka/run-vcpkg@v11
with:
- vcpkgGitCommitId: c8696863d371ab7f46e213d8f5ca923c4aef2a00
+ vcpkgGitCommitId: 10b7a178346f3f0abef60cecd5130e295afd8da4
- name: Build
uses: lukka/run-cmake@v10
with:
configurePreset: release-mac-${{ matrix.arch }}
buildPreset: release-mac-${{ matrix.arch }}
+ configurePresetAdditionalArgs: "['-DMELONDS_EMBED_BUILD_INFO=ON']"
- name: Compress app bundle
shell: bash
run: |
@@ -43,11 +50,13 @@ jobs:
with:
name: macOS-${{ matrix.arch }}
path: macOS-${{ matrix.arch }}.zip
+ retention-days: 1
universal-binary:
name: Universal binary
needs: [build-macos]
runs-on: macos-13
+ continue-on-error: true
steps:
- name: Download x86_64
uses: actions/download-artifact@v4
@@ -74,11 +83,10 @@ jobs:
with:
name: macOS-universal
path: macOS-universal.zip
- - name: Clean up architecture-specific artifacts
- uses: geekyeggo/delete-artifact@v4
- with:
- token: ${{ secrets.GITHUB_TOKEN }}
- failOnError: false
- name: |
- macOS-x86_64
- macOS-arm64
+# - name: Clean up architecture-specific artifacts
+# uses: geekyeggo/delete-artifact@v4
+# with:
+# failOnError: false
+# name: |
+# macOS-x86_64
+# macOS-arm64
diff --git a/.github/workflows/build-ubuntu-aarch64.yml b/.github/workflows/build-ubuntu-aarch64.yml
deleted file mode 100644
index 43f4d8b6..00000000
--- a/.github/workflows/build-ubuntu-aarch64.yml
+++ /dev/null
@@ -1,51 +0,0 @@
-name: Ubuntu
-
-on:
- push:
- branches:
- - master
- pull_request:
- branches:
- - master
-
-env:
- BUILD_TYPE: Release
-
-jobs:
- build:
- name: aarch64
- runs-on: ubuntu-20.04
- container: ubuntu:20.04
-
- steps:
- - name: Prepare system
- shell: bash
- run: |
- apt update
- apt -y full-upgrade
- apt -y install git
- - name: Check out source
- uses: actions/checkout@v1
- - name: Install dependencies
- shell: bash
- run: |
- dpkg --add-architecture arm64
- sh -c "sed \"s|^deb \([a-z\.:/]*\) \([a-z\-]*\) \(.*\)$|deb [arch=amd64] \1 \2 \3\ndeb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports \2 \3|\" /etc/apt/sources.list > /etc/apt/sources.list.new"
- rm /etc/apt/sources.list
- mv /etc/apt/sources.list{.new,}
- apt update
- DEBIAN_FRONTEND=noninteractive apt install -y {gcc-10,g++-10,pkg-config}-aarch64-linux-gnu {libsdl2,qtbase5,qtbase5-private,qtmultimedia5,libslirp,libarchive,libzstd}-dev:arm64 zstd:arm64 cmake extra-cmake-modules dpkg-dev
- - name: Configure
- shell: bash
- run: |
- CC=aarch64-linux-gnu-gcc-10 CXX=aarch64-linux-gnu-g++-10 cmake -DPKG_CONFIG_EXECUTABLE=/usr/bin/aarch64-linux-gnu-pkg-config $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -B build
- - name: Make
- shell: bash
- run: |
- cmake --build build -j$(nproc --all)
- mkdir dist
- cp build/melonDS dist
- - uses: actions/upload-artifact@v1
- with:
- name: melonDS-ubuntu-aarch64
- path: dist
diff --git a/.github/workflows/build-ubuntu.yml b/.github/workflows/build-ubuntu.yml
index 438fddd0..044d01ee 100644
--- a/.github/workflows/build-ubuntu.yml
+++ b/.github/workflows/build-ubuntu.yml
@@ -8,31 +8,84 @@ on:
branches:
- master
-jobs:
- build:
- name: x86_64
+env:
+ MELONDS_GIT_BRANCH: ${{ github.ref }}
+ MELONDS_GIT_HASH: ${{ github.sha }}
+ MELONDS_BUILD_PROVIDER: GitHub Actions
- runs-on: ubuntu-20.04
+jobs:
+ build-x86_64:
+ name: x86_64
+ runs-on: ubuntu-22.04
steps:
- - uses: actions/checkout@v1
+ - uses: actions/checkout@v4
+ name: Check out sources
- name: Install dependencies
run: |
sudo rm -f /etc/apt/sources.list.d/dotnetdev.list /etc/apt/sources.list.d/microsoft-prod.list
sudo apt update
- sudo apt install cmake extra-cmake-modules libcurl4-gnutls-dev libpcap0.8-dev libsdl2-dev qt5-default qtbase5-private-dev qtmultimedia5-dev libslirp0 libslirp-dev libarchive-dev zstd libzstd-dev --allow-downgrades
- - name: Create build environment
- run: mkdir ${{runner.workspace}}/build
+ sudo apt install --allow-downgrades cmake ninja-build extra-cmake-modules libpcap0.8-dev libsdl2-dev libenet-dev \
+ qt6-{base,base-private,multimedia}-dev libqt6svg6-dev libarchive-dev libzstd-dev libfuse2
- name: Configure
- working-directory: ${{runner.workspace}}/build
- run: cmake $GITHUB_WORKSPACE
- - name: Make
- working-directory: ${{runner.workspace}}/build
+ run: cmake -B build -G Ninja -DUSE_QT6=ON -DCMAKE_INSTALL_PREFIX=/usr -DMELONDS_EMBED_BUILD_INFO=ON
+ - name: Build
run: |
- make -j$(nproc --all)
- mkdir dist
- cp melonDS dist
- - uses: actions/upload-artifact@v1
+ cmake --build build
+ DESTDIR=AppDir cmake --install build
+ - uses: actions/upload-artifact@v4
with:
name: melonDS-ubuntu-x86_64
- path: ${{runner.workspace}}/build/dist
+ path: AppDir/usr/bin/melonDS
+ - name: Fetch AppImage tools
+ run: |
+ wget https://github.com/linuxdeploy/linuxdeploy/releases/download/continuous/linuxdeploy-x86_64.AppImage
+ wget https://github.com/linuxdeploy/linuxdeploy-plugin-qt/releases/download/continuous/linuxdeploy-plugin-qt-x86_64.AppImage
+ chmod a+x linuxdeploy-*.AppImage
+ - name: Build the AppImage
+ env:
+ QMAKE: /usr/lib/qt6/bin/qmake
+ run: |
+ ./linuxdeploy-x86_64.AppImage --appdir AppDir --plugin qt --output appimage
+ - uses: actions/upload-artifact@v4
+ with:
+ name: melonDS-appimage-x86_64
+ path: melonDS*.AppImage
+
+ build-aarch64:
+ name: aarch64
+ runs-on: ubuntu-latest
+ container: ubuntu:22.04
+
+ steps:
+ - name: Prepare system
+ shell: bash
+ run: |
+ dpkg --add-architecture arm64
+ sh -c "sed \"s|^deb \([a-z\.:/]*\) \([a-z\-]*\) \(.*\)$|deb [arch=amd64] \1 \2 \3\ndeb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports \2 \3|\" /etc/apt/sources.list > /etc/apt/sources.list.new"
+ rm /etc/apt/sources.list
+ mv /etc/apt/sources.list{.new,}
+ apt update
+ apt -y full-upgrade
+ apt -y install git {gcc-12,g++-12}-aarch64-linux-gnu cmake ninja-build extra-cmake-modules \
+ {libsdl2,qt6-{base,base-private,multimedia},libqt6svg6,libarchive,libzstd,libenet}-dev:arm64 \
+ pkg-config dpkg-dev
+ - name: Check out source
+ uses: actions/checkout@v4
+ - name: Configure
+ shell: bash
+ run: |
+ cmake -B build -G Ninja \
+ -DPKG_CONFIG_EXECUTABLE=/usr/bin/aarch64-linux-gnu-pkg-config \
+ -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc-12 \
+ -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++-12 \
+ -DUSE_QT6=ON \
+ -DMELONDS_EMBED_BUILD_INFO=ON
+ - name: Build
+ shell: bash
+ run: |
+ cmake --build build
+ - uses: actions/upload-artifact@v4
+ with:
+ name: melonDS-ubuntu-aarch64
+ path: build/melonDS
diff --git a/.github/workflows/build-windows.yml b/.github/workflows/build-windows.yml
index e6846da7..c3350b4d 100644
--- a/.github/workflows/build-windows.yml
+++ b/.github/workflows/build-windows.yml
@@ -4,40 +4,40 @@ on:
push:
branches:
- master
+ - ci/*
pull_request:
branches:
- master
env:
- BUILD_TYPE: Release
+ MELONDS_GIT_BRANCH: ${{ github.ref }}
+ MELONDS_GIT_HASH: ${{ github.sha }}
+ MELONDS_BUILD_PROVIDER: GitHub Actions
jobs:
build:
-
runs-on: windows-latest
-
defaults:
run:
shell: msys2 {0}
steps:
- - uses: actions/checkout@v1
- - uses: msys2/setup-msys2@v2
+ - name: Check out sources
+ uses: actions/checkout@v3
+ - name: Set up MSYS2
+ uses: msys2/setup-msys2@v2
with:
- msystem: MINGW64
- update: true
-
- - name: Install dependencies
- run: pacman -Sq --noconfirm git pkgconf mingw-w64-x86_64-{cmake,SDL2,qt5-static,libslirp,libarchive,toolchain}
-
+ msystem: ucrt64
+ update: true
+ pacboy: gcc:p cmake:p ninja:p make:p
+ - name: Set up vcpkg
+ uses: lukka/run-vcpkg@v11
+ with:
+ vcpkgGitCommitId: 10b7a178346f3f0abef60cecd5130e295afd8da4
- name: Configure
- working-directory: ${{runner.workspace}}
- run: cmake -B build $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DBUILD_STATIC=ON -DCMAKE_PREFIX_PATH=C:/tools/msys64/mingw64/qt5-static
-
- - name: Make
- working-directory: ${{runner.workspace}}/build
- run: cmake --build .
-
- - uses: actions/upload-artifact@v1
+ run: cmake --preset=release-mingw-x86_64 -DMELONDS_EMBED_BUILD_INFO=ON
+ - name: Build
+ run: cmake --build --preset=release-mingw-x86_64
+ - uses: actions/upload-artifact@v4
with:
name: melonDS-windows-x86_64
- path: ${{runner.workspace}}\build\melonDS.exe
+ path: .\build\release-mingw-x86_64\melonDS.exe
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 97dfc5bd..55bf825f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,8 @@ endif()
set(CMAKE_POLICY_DEFAULT_CMP0069 NEW)
-set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
+set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
+set(CMAKE_USER_MAKE_RULES_OVERRIDE "${CMAKE_CURRENT_SOURCE_DIR}/cmake/DefaultBuildFlags.cmake")
option(USE_VCPKG "Use vcpkg for dependency packages" OFF)
if (USE_VCPKG)
@@ -25,6 +26,9 @@ include(CheckLibraryExists)
include(CMakeDependentOption)
include(CheckIPOSupported)
+include(SetupCCache)
+include(Sanitizers)
+
set(CMAKE_OSX_DEPLOYMENT_TARGET "10.15" CACHE STRING "Minimum OS X deployment version")
set(CMAKE_C_STANDARD 11)
@@ -33,8 +37,6 @@ set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
-add_compile_definitions(MELONDS_VERSION="${melonDS_VERSION}")
-
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
@@ -78,14 +80,6 @@ if (ENABLE_LTO)
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
endif()
-if (CMAKE_CXX_COMPILER_ID STREQUAL GNU)
- set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Og")
- set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Og")
-endif()
-
-string(REPLACE "-O2" "-O3" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
-string(REPLACE "-O2" "-O3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
-
if (NOT APPLE)
set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} -s")
endif()
@@ -100,13 +94,6 @@ endif()
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-find_program(CCACHE "ccache")
-if (CCACHE)
- message(STATUS "Using CCache to speed up compilation")
- set(CMAKE_C_COMPILER_LAUNCHER ${CCACHE})
- set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE})
-endif()
-
option(ENABLE_GDBSTUB "Enable GDB stub" ON)
if (ENABLE_GDBSTUB)
add_definitions(-DGDBSTUB_ENABLED)
diff --git a/CMakePresets.json b/CMakePresets.json
index e14eda24..2144417b 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -20,6 +20,23 @@
}
}
},
+ {
+ "name": "release-mingw-x86_64",
+ "inherits": "release-vcpkg",
+ "displayName": "Windows MinGW release (x86_64)",
+ "binaryDir": "${sourceDir}/build/release-mingw-x86_64",
+ "generator": "Ninja",
+ "cacheVariables": {
+ "USE_QT6": {
+ "type": "BOOL",
+ "value": "ON"
+ },
+ "BUILD_STATIC": {
+ "type": "BOOL",
+ "value": "ON"
+ }
+ }
+ },
{
"name": "release-mac-x86_64",
"inherits": "release-vcpkg",
@@ -44,6 +61,10 @@
"name": "release-vcpkg",
"configurePreset": "release-vcpkg"
},
+ {
+ "name": "release-mingw-x86_64",
+ "configurePreset": "release-mingw-x86_64"
+ },
{
"name": "release-mac-x86_64",
"configurePreset": "release-mac-x86_64"
@@ -85,4 +106,4 @@
]
}
]
-}
\ No newline at end of file
+}
diff --git a/README.md b/README.md
index de494305..eb8b1358 100644
--- a/README.md
+++ b/README.md
@@ -5,11 +5,11 @@
+
-
-
-
-
+
+
+
DS emulator, sorta
@@ -35,9 +35,9 @@ As for the rest, the interface should be pretty straightforward. If you have a q
### Linux
1. Install dependencies:
- * Ubuntu 22.04: `sudo apt install cmake extra-cmake-modules libcurl4-gnutls-dev libpcap0.8-dev libsdl2-dev qtbase5-dev qtbase5-private-dev qtmultimedia5-dev libslirp-dev libarchive-dev libzstd-dev`
- * Older Ubuntu: `sudo apt install cmake extra-cmake-modules libcurl4-gnutls-dev libpcap0.8-dev libsdl2-dev qt5-default qtbase5-private-dev qtmultimedia5-dev libslirp-dev libarchive-dev libzstd-dev`
- * Arch Linux: `sudo pacman -S base-devel cmake extra-cmake-modules git libpcap sdl2 qt5-base qt5-multimedia libslirp libarchive zstd`
+ * Ubuntu 22.04: `sudo apt install cmake extra-cmake-modules libcurl4-gnutls-dev libpcap0.8-dev libsdl2-dev qtbase5-dev qtbase5-private-dev qtmultimedia5-dev libqt5svg5-dev libarchive-dev libenet-dev libzstd-dev`
+ * Older Ubuntu: `sudo apt install cmake extra-cmake-modules libcurl4-gnutls-dev libpcap0.8-dev libsdl2-dev qt5-default qtbase5-private-dev qtmultimedia5-dev libqt5svg5-dev libarchive-dev libenet-dev libzstd-dev`
+ * Arch Linux: `sudo pacman -S base-devel cmake extra-cmake-modules git libpcap sdl2 qt5-base qt5-multimedia qt5-svg libarchive enet zstd`
3. Download the melonDS repository and prepare:
```bash
git clone https://github.com/melonDS-emu/melonDS
@@ -64,7 +64,7 @@ As for the rest, the interface should be pretty straightforward. If you have a q
cd melonDS
```
#### Dynamic builds (with DLLs)
-5. Install dependencies: `pacman -S mingw-w64-x86_64-{cmake,SDL2,toolchain,qt5-base,qt5-svg,qt5-multimedia,qt5-tools,libslirp,libarchive,zstd}`
+5. Install dependencies: `pacman -S mingw-w64-x86_64-{cmake,SDL2,toolchain,qt5-base,qt5-svg,qt5-multimedia,qt5-tools,libarchive,enet,zstd}`
6. Compile:
```bash
cmake -B build
@@ -75,7 +75,7 @@ As for the rest, the interface should be pretty straightforward. If you have a q
If everything went well, melonDS and the libraries it needs should now be in the `dist` folder.
#### Static builds (without DLLs, standalone executable)
-5. Install dependencies: `pacman -S mingw-w64-x86_64-{cmake,SDL2,toolchain,qt5-static,libslirp,libarchive,zstd}`
+5. Install dependencies: `pacman -S mingw-w64-x86_64-{cmake,SDL2,toolchain,qt5-static,libarchive,enet,zstd}`
6. Compile:
```bash
cmake -B build -DBUILD_STATIC=ON -DCMAKE_PREFIX_PATH=/mingw64/qt5-static
@@ -85,7 +85,7 @@ If everything went well, melonDS should now be in the `build` folder.
### macOS
1. Install the [Homebrew Package Manager](https://brew.sh)
-2. Install dependencies: `brew install git pkg-config cmake sdl2 qt@6 libslirp libarchive zstd`
+2. Install dependencies: `brew install git pkg-config cmake sdl2 qt@6 libarchive enet zstd`
3. Download the melonDS repository and prepare:
```zsh
git clone https://github.com/melonDS-emu/melonDS
@@ -93,14 +93,14 @@ If everything went well, melonDS should now be in the `build` folder.
```
4. Compile:
```zsh
- cmake -B build -DCMAKE_PREFIX_PATH="$(brew --prefix qt@6);$(brew --prefix libarchive)" -DUSE_QT6=ON
+ cmake -B build -DCMAKE_PREFIX_PATH="$(brew --prefix qt@6);$(brew --prefix libarchive)"
cmake --build build -j$(sysctl -n hw.logicalcpu)
```
If everything went well, melonDS.app should now be in the `build` directory.
#### Self-contained app bundle
If you want an app bundle that can be distributed to other computers without needing to install dependencies through Homebrew, you can additionally run `
-../tools/mac-bundle.rb melonDS.app` after the build is completed, or add `-DMACOS_BUNDLE_LIBS=ON` to the first CMake command.
+../tools/mac-libs.rb .` after the build is completed, or add `-DMACOS_BUNDLE_LIBS=ON` to the first CMake command.
## TODO LIST
diff --git a/cmake/ConfigureVcpkg.cmake b/cmake/ConfigureVcpkg.cmake
index be8f0590..c1eb522d 100644
--- a/cmake/ConfigureVcpkg.cmake
+++ b/cmake/ConfigureVcpkg.cmake
@@ -4,10 +4,12 @@ set(_DEFAULT_VCPKG_ROOT "${CMAKE_SOURCE_DIR}/vcpkg")
set(VCPKG_ROOT "${_DEFAULT_VCPKG_ROOT}" CACHE STRING "The path to the vcpkg repository")
if (VCPKG_ROOT STREQUAL "${_DEFAULT_VCPKG_ROOT}")
- file(LOCK "${_DEFAULT_VCPKG_ROOT}" DIRECTORY GUARD FILE)
+ if (APPLE) # this doesn't work on non-macOS
+ file(LOCK "${_DEFAULT_VCPKG_ROOT}" DIRECTORY GUARD FILE)
+ endif()
FetchContent_Declare(vcpkg
GIT_REPOSITORY "https://github.com/Microsoft/vcpkg.git"
- GIT_TAG 2023.12.12
+ GIT_TAG 2024.10.21
SOURCE_DIR "${CMAKE_SOURCE_DIR}/vcpkg")
FetchContent_MakeAvailable(vcpkg)
endif()
@@ -16,6 +18,23 @@ set(VCPKG_OVERLAY_TRIPLETS "${CMAKE_SOURCE_DIR}/cmake/overlay-triplets")
option(USE_RECOMMENDED_TRIPLETS "Use the recommended triplets that are used for official builds" ON)
+# Duplicated here because it needs to be set before project()
+if (NOT WIN32)
+ option(USE_QT6 "Build using Qt 6 instead of 5" ON)
+else()
+ option(USE_QT6 "Build using Qt 6 instead of 5" OFF)
+endif()
+
+# Since the Linux build pulls in glib anyway, we can just use upstream libslirp
+if (UNIX AND NOT APPLE)
+ option(USE_SYSTEM_LIBSLIRP "Use system libslirp instead of the bundled version" ON)
+endif()
+
+if (NOT USE_QT6)
+ list(APPEND VCPKG_MANIFEST_FEATURES qt5)
+ set(VCPKG_MANIFEST_NO_DEFAULT_FEATURES ON)
+endif()
+
if (CMAKE_OSX_ARCHITECTURES MATCHES ";")
message(FATAL_ERROR "macOS universal builds are not supported. Build them individually and combine afterwards instead.")
endif()
@@ -47,7 +66,15 @@ if (USE_RECOMMENDED_TRIPLETS)
elseif(WIN32)
# TODO Windows arm64 if possible
set(_CAN_TARGET_AS_HOST ON)
- set(_WANTED_TRIPLET x64-mingw-static)
+ set(_WANTED_TRIPLET x64-mingw-static-release)
+ elseif(CMAKE_HOST_SYSTEM_NAME STREQUAL Linux)
+ # Can't really detect cross compiling here.
+ set(_CAN_TARGET_AS_HOST ON)
+ if (_HOST_PROCESSOR STREQUAL x86_64)
+ set(_WANTED_TRIPLET x64-linux-release)
+ elseif(_HOST_PROCESSOR STREQUAL "aarch64")
+ set(_WANTED_TRIPLET arm64-linux-release)
+ endif()
endif()
# Don't override it if the user set something else
diff --git a/cmake/DefaultBuildFlags.cmake b/cmake/DefaultBuildFlags.cmake
new file mode 100644
index 00000000..683767b3
--- /dev/null
+++ b/cmake/DefaultBuildFlags.cmake
@@ -0,0 +1,9 @@
+if (CMAKE_C_COMPILER_ID STREQUAL GNU) # GCC only: seed the default C debug flags with -g -Og
+ set(CMAKE_C_FLAGS_DEBUG_INIT "-g -Og")
+endif()
+if (CMAKE_CXX_COMPILER_ID STREQUAL GNU) # same seed for the default C++ debug flags
+ set(CMAKE_CXX_FLAGS_DEBUG_INIT "-g -Og")
+endif()
+# Swap -O2 for -O3 in the initial release flags; a no-op when they contain no -O2.
+string(REPLACE "-O2" "-O3" CMAKE_C_FLAGS_RELEASE_INIT "${CMAKE_C_FLAGS_RELEASE_INIT}")
+string(REPLACE "-O2" "-O3" CMAKE_CXX_FLAGS_RELEASE_INIT "${CMAKE_CXX_FLAGS_RELEASE_INIT}")
diff --git a/cmake/FindENet.cmake b/cmake/FindENet.cmake
new file mode 100644
index 00000000..f9044c30
--- /dev/null
+++ b/cmake/FindENet.cmake
@@ -0,0 +1,48 @@
+# - Try to find enet
+# Once done this will define
+#
+# ENET_FOUND - system has enet
+# ENET_INCLUDE_DIRS - the enet include directory
+# ENET_LIBRARIES - the libraries needed to use enet
+#
+# $ENETDIR is an environment variable used for finding enet.
+#
+# Borrowed from The Mana World
+# http://themanaworld.org/
+#
+# Several changes and additions by Fabian 'x3n' Landau
+# Lots of simplifications by Adrian Friedli
+# > www.orxonox.net <
+
+# Locate the header. Extra search roots: $ENETDIR, /usr/local and /usr,
+# each combined with an include/ suffix.
+find_path(ENET_INCLUDE_DIRS enet/enet.h
+    PATHS $ENV{ENETDIR} /usr/local /usr
+    PATH_SUFFIXES include
+)
+
+# Locate the library under the same roots, this time with a lib/ suffix.
+find_library(ENET_LIBRARY
+    NAMES enet
+    PATHS $ENV{ENETDIR} /usr/local /usr
+    PATH_SUFFIXES lib
+)
+
+# Let CMake's standard helper process QUIET/REQUIRED and report the result;
+# the package counts as found only if both variables above hold valid values.
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(ENet DEFAULT_MSG ENET_LIBRARY ENET_INCLUDE_DIRS)
+
+# Publish the link line for consumers.
+if(ENET_FOUND)
+    set(ENET_LIBRARIES ${ENET_LIBRARY})
+
+    # On Windows, ws2_32 and winmm are appended to the link line as well.
+    if(WIN32)
+        set(WINDOWS_ENET_DEPENDENCIES "ws2_32;winmm")
+        list(APPEND ENET_LIBRARIES ${WINDOWS_ENET_DEPENDENCIES})
+    endif()
+endif()
+
+# Mark the result variables as advanced so GUIs hide them by default.
+mark_as_advanced(ENET_LIBRARY ENET_LIBRARIES ENET_INCLUDE_DIRS)
diff --git a/cmake/FixInterfaceIncludes.cmake b/cmake/FixInterfaceIncludes.cmake
index 513c1117..5c285d7a 100644
--- a/cmake/FixInterfaceIncludes.cmake
+++ b/cmake/FixInterfaceIncludes.cmake
@@ -19,6 +19,13 @@ function(fix_interface_includes)
if (PARENT_DIR MATCHES "include$")
list(APPEND NEW_DIRS "${PARENT_DIR}")
endif()
+
+ # HACK
+ # The libarchive pkg-config file in MSYS2 seems to include a UNIX-style path for its
+ # include directory and CMake doesn't like that.
+ if (WIN32 AND MINGW AND target STREQUAL PkgConfig::LibArchive)
+ list(FILTER DIRS EXCLUDE REGEX "^/[^.]+64/.*")
+ endif()
endforeach()
list(APPEND DIRS ${NEW_DIRS})
diff --git a/cmake/Sanitizers.cmake b/cmake/Sanitizers.cmake
new file mode 100644
index 00000000..9c09da28
--- /dev/null
+++ b/cmake/Sanitizers.cmake
@@ -0,0 +1,8 @@
+set(SANITIZE "" CACHE STRING "Sanitizers to enable.")
+# SANITIZE is a comma-separated list; split it into a CMake list (empty value -> no entries).
+string(REGEX MATCHALL "[^,]+" ENABLED_SANITIZERS "${SANITIZE}")
+# Each entry becomes its own -fsanitize=<entry> flag, passed to both the compiler and the linker.
+foreach(SANITIZER ${ENABLED_SANITIZERS})
+ add_compile_options("-fsanitize=${SANITIZER}")
+ add_link_options("-fsanitize=${SANITIZER}")
+endforeach()
\ No newline at end of file
diff --git a/cmake/SetupCCache.cmake b/cmake/SetupCCache.cmake
new file mode 100644
index 00000000..72388bf8
--- /dev/null
+++ b/cmake/SetupCCache.cmake
@@ -0,0 +1,19 @@
+include(FindPackageMessage)
+# Look for a ccache executable; CCACHE holds its path, or a false *-NOTFOUND value when absent.
+find_program(CCACHE "ccache")
+# USE_CCACHE is offered (default ON) only when CCACHE is truthy; otherwise it is forced OFF.
+cmake_dependent_option(USE_CCACHE "Use CCache to speed up repeated builds." ON CCACHE OFF)
+# Nothing to set up when ccache is missing or the option is switched off.
+if (NOT CCACHE OR NOT USE_CCACHE)
+ return()
+endif()
+
+# Fedora, and probably also Red Hat-based distros in general, use CCache by default if it's installed on the system.
+# We'll try to detect this here, and exit if that's the case.
+# Trying to launch ccache with ccache as we'd otherwise do seems to cause build issues.
+if (CMAKE_C_COMPILER MATCHES "ccache" OR CMAKE_CXX_COMPILER MATCHES "ccache")
+ return()
+endif()
+# Report the choice, then register ccache as the global launcher for compile rules.
+find_package_message(CCache "Using CCache to speed up compilation" "${USE_CCACHE}")
+set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE}")
\ No newline at end of file
diff --git a/cmake/overlay-triplets/x64-mingw-static-release.cmake b/cmake/overlay-triplets/x64-mingw-static-release.cmake
new file mode 100644
index 00000000..19c2aeb0
--- /dev/null
+++ b/cmake/overlay-triplets/x64-mingw-static-release.cmake
@@ -0,0 +1,7 @@
+set(VCPKG_TARGET_ARCHITECTURE x64)
+set(VCPKG_CRT_LINKAGE dynamic)
+set(VCPKG_LIBRARY_LINKAGE static)
+set(VCPKG_ENV_PASSTHROUGH PATH)
+set(VCPKG_BUILD_TYPE release)
+
+set(VCPKG_CMAKE_SYSTEM_NAME MinGW)
diff --git a/flake.lock b/flake.lock
new file mode 100644
index 00000000..be75f57f
--- /dev/null
+++ b/flake.lock
@@ -0,0 +1,61 @@
+{
+ "nodes": {
+ "flake-utils": {
+ "inputs": {
+ "systems": "systems"
+ },
+ "locked": {
+ "lastModified": 1726560853,
+ "narHash": "sha256-X6rJYSESBVr3hBoH0WbKE5KvhPU5bloyZ2L4K60/fPQ=",
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "rev": "c1dfcf08411b08f6b8615f7d8971a2bfa81d5e8a",
+ "type": "github"
+ },
+ "original": {
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "type": "github"
+ }
+ },
+ "nixpkgs": {
+ "locked": {
+ "lastModified": 1729665710,
+ "narHash": "sha256-AlcmCXJZPIlO5dmFzV3V2XF6x/OpNWUV8Y/FMPGd8Z4=",
+ "owner": "NixOS",
+ "repo": "nixpkgs",
+ "rev": "2768c7d042a37de65bb1b5b3268fc987e534c49d",
+ "type": "github"
+ },
+ "original": {
+ "owner": "NixOS",
+ "ref": "nixos-unstable",
+ "repo": "nixpkgs",
+ "type": "github"
+ }
+ },
+ "root": {
+ "inputs": {
+ "flake-utils": "flake-utils",
+ "nixpkgs": "nixpkgs"
+ }
+ },
+ "systems": {
+ "locked": {
+ "lastModified": 1681028828,
+ "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+ "owner": "nix-systems",
+ "repo": "default",
+ "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+ "type": "github"
+ },
+ "original": {
+ "owner": "nix-systems",
+ "repo": "default",
+ "type": "github"
+ }
+ }
+ },
+ "root": "root",
+ "version": 7
+}
diff --git a/flake.nix b/flake.nix
new file mode 100644
index 00000000..8d500c03
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,100 @@
+{
+ description = "Nintendo DS emulator";
+
+ inputs = {
+ nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
+ flake-utils.url = "github:numtide/flake-utils";
+ };
+
+ outputs = { self, nixpkgs, flake-utils }: flake-utils.lib.eachDefaultSystem (system:
+ let
+ pkgs = import nixpkgs { inherit system; };
+ inherit (pkgs.lib) cmakeBool optionals makeLibraryPath;
+ inherit (pkgs.stdenv) isLinux isDarwin;
+
+ revision = with self; if sourceInfo?dirtyRev
+ then sourceInfo.dirtyRev
+ else sourceInfo.rev;
+ shortRevision = with self; if sourceInfo?dirtyShortRev
+ then sourceInfo.dirtyShortRev
+ else sourceInfo.shortRev;
+
+ melonDS = pkgs.qt6.qtbase.stdenv.mkDerivation {
+ pname = "melonDS";
+ version = "0.9.5-${shortRevision}";
+ src = ./.;
+
+ nativeBuildInputs = with pkgs; [
+ cmake
+ ninja
+ pkg-config
+ qt6.wrapQtAppsHook
+ ];
+
+ buildInputs = (with pkgs; [
+ qt6.qtbase
+ qt6.qtmultimedia
+ SDL2
+ zstd
+ libarchive
+ libGL
+ libslirp
+ enet
+ ]) ++ optionals (!isDarwin) (with pkgs; [
+ kdePackages.extra-cmake-modules
+ qt6.qtwayland
+ wayland
+ ]);
+
+ cmakeFlags = [
+ (cmakeBool "USE_QT6" true)
+ (cmakeBool "USE_SYSTEM_LIBSLIRP" true)
+ (cmakeBool "MELONDS_EMBED_BUILD_INFO" true)
+ ];
+
+ env.MELONDS_GIT_HASH = revision;
+ env.MELONDS_GIT_BRANCH = "(unknown)";
+ env.MELONDS_BUILD_PROVIDER = "Nix";
+
+ qtWrapperArgs = optionals isLinux [
+ "--prefix LD_LIBRARY_PATH : ${makeLibraryPath [ pkgs.libpcap pkgs.wayland ]}"
+ ] ++ optionals isDarwin [
+ "--prefix DYLD_LIBRARY_PATH : ${makeLibraryPath [ pkgs.libpcap ]}"
+ ];
+
+ passthru = {
+ exePath = if isDarwin then
+ "/Applications/melonDS.app/Contents/MacOS/melonDS"
+ else "/bin/melonDS";
+ };
+ };
+ in {
+ packages.default = melonDS;
+ apps.default = flake-utils.lib.mkApp {
+ drv = self.packages.${system}.default;
+ };
+ devShells = {
+ default = pkgs.mkShell.override { stdenv = pkgs.qt6.qtbase.stdenv; } {
+ inputsFrom = [ self.packages.${system}.default ];
+ };
+
+ # Shell for building static melonDS release builds with vcpkg
+ # Use mkShellNoCC to ensure Nix's gcc/clang and stdlib isn't used
+ vcpkg = pkgs.mkShellNoCC {
+ packages = with pkgs; [
+ autoconf
+ autoconf-archive
+ automake
+ cmake
+ cups.dev # Needed by qtbase despite not enabling print support
+ git
+ iconv.dev
+ libtool
+ ninja
+ pkg-config
+ ];
+ };
+ };
+ }
+ );
+}
diff --git a/res/melon.qrc b/res/melon.qrc
index 38915bbf..3c5824d6 100644
--- a/res/melon.qrc
+++ b/res/melon.qrc
@@ -2,5 +2,6 @@
icon/melon_256x256.png
+ melon.svg
diff --git a/src/ARCodeFile.cpp b/src/ARCodeFile.cpp
index 602a2e7b..a98f5e50 100644
--- a/src/ARCodeFile.cpp
+++ b/src/ARCodeFile.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -33,17 +33,26 @@ ARCodeFile::ARCodeFile(const std::string& filename)
{
Filename = filename;
- Error = false;
-
- Categories.clear();
-
if (!Load())
Error = true;
}
-ARCodeFile::~ARCodeFile()
+std::vector<ARCode> ARCodeFile::GetCodes() const noexcept
{
- Categories.clear();
+ if (Error)
+ return {}; // Error is set by the constructor when Load() fails
+ // Otherwise flatten every category's codes into a single list (each ARCode is copied).
+ std::vector<ARCode> codes;
+
+ for (const ARCodeCat& cat : Categories)
+ {
+ for (const ARCode& code : cat.Codes)
+ {
+ codes.push_back(code);
+ }
+ }
+
+ return codes;
}
bool ARCodeFile::Load()
diff --git a/src/ARCodeFile.h b/src/ARCodeFile.h
index 11e71efe..04f9e4f4 100644
--- a/src/ARCodeFile.h
+++ b/src/ARCodeFile.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -48,14 +48,16 @@ class ARCodeFile
{
public:
ARCodeFile(const std::string& filename);
- ~ARCodeFile();
+ ~ARCodeFile() noexcept = default;
- bool Error;
+ [[nodiscard]] std::vector<ARCode> GetCodes() const noexcept; // flat copy of all codes; empty when Error is set
+
+ bool Error = false;
bool Load();
bool Save();
- ARCodeCatList Categories;
+ ARCodeCatList Categories {};
private:
std::string Filename;
diff --git a/src/AREngine.cpp b/src/AREngine.cpp
index c7d49fe6..bdda5863 100644
--- a/src/AREngine.cpp
+++ b/src/AREngine.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -31,7 +31,6 @@ using Platform::LogLevel;
AREngine::AREngine(melonDS::NDS& nds) : NDS(nds)
{
- CodeFile = nullptr;
}
#define case16(x) \
@@ -388,19 +387,12 @@ void AREngine::RunCheat(const ARCode& arcode)
void AREngine::RunCheats()
{
- if (!CodeFile) return;
+ if (Cheats.empty()) return;
- for (ARCodeCatList::iterator i = CodeFile->Categories.begin(); i != CodeFile->Categories.end(); i++)
+ for (const ARCode& code : Cheats)
{
- ARCodeCat& cat = *i;
-
- for (ARCodeList::iterator j = cat.Codes.begin(); j != cat.Codes.end(); j++)
- {
- ARCode& code = *j;
-
- if (code.Enabled)
- RunCheat(code);
- }
+ if (code.Enabled)
+ RunCheat(code);
}
}
}
diff --git a/src/AREngine.h b/src/AREngine.h
index 21044676..e73fc98e 100644
--- a/src/AREngine.h
+++ b/src/AREngine.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -19,6 +19,7 @@
#ifndef ARENGINE_H
#define ARENGINE_H
+#include <vector>
#include "ARCodeFile.h"
namespace melonDS
@@ -29,14 +30,13 @@ class AREngine
public:
AREngine(melonDS::NDS& nds);
- ARCodeFile* GetCodeFile() { return CodeFile; }
- void SetCodeFile(ARCodeFile* file) { CodeFile = file; }
-
+ std::vector<ARCode> Cheats {}; // codes walked by RunCheats(); only entries with Enabled set are run
+private:
+ friend class ARM;
void RunCheats();
void RunCheat(const ARCode& arcode);
-private:
+
melonDS::NDS& NDS;
- ARCodeFile* CodeFile; // AR code file - frontend is responsible for managing this
};
}
diff --git a/src/ARM.cpp b/src/ARM.cpp
index c2f6a6c2..b7b703da 100644
--- a/src/ARM.cpp
+++ b/src/ARM.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -110,6 +110,7 @@ const u32 ARM::ConditionTable[16] =
ARM::ARM(u32 num, bool jit, std::optional<GDBArgs> gdb, melonDS::NDS& nds) :
#ifdef GDBSTUB_ENABLED
GdbStub(this, gdb ? (num ? gdb->PortARM7 : gdb->PortARM9) : 0),
+ BreakOnStartup(gdb ? (num ? gdb->ARM7BreakOnStartup : gdb->ARM9BreakOnStartup) : false),
#endif
Num(num), // well uh
NDS(nds)
@@ -582,9 +583,11 @@ void ARM::CheckGdbIncoming()
GdbCheckA();
}
+template <CPUExecuteMode mode>
void ARMv5::Execute()
{
- GdbCheckB();
+ if constexpr (mode == CPUExecuteMode::InterpreterGDB)
+ GdbCheckB();
if (Halted)
{
@@ -607,231 +610,125 @@ void ARMv5::Execute()
while (NDS.ARM9Timestamp < NDS.ARM9Target)
{
- if (CPSR & 0x20) // THUMB
+#ifdef JIT_ENABLED
+ if constexpr (mode == CPUExecuteMode::JIT)
{
- GdbCheckC();
+ u32 instrAddr = R[15] - ((CPSR&0x20)?2:4);
- // prefetch
- R[15] += 2;
- CurInstr = NextInstr[0];
- NextInstr[0] = NextInstr[1];
- if (R[15] & 0x2) { NextInstr[1] >>= 16; CodeCycles = 0; }
- else NextInstr[1] = CodeRead32(R[15], false);
-
- // actually execute
- u32 icode = (CurInstr >> 6) & 0x3FF;
- ARMInterpreter::THUMBInstrTable[icode](this);
- }
- else
- {
- GdbCheckC();
-
- // prefetch
- R[15] += 4;
- CurInstr = NextInstr[0];
- NextInstr[0] = NextInstr[1];
- NextInstr[1] = CodeRead32(R[15], false);
-
- // actually execute
- if (CheckCondition(CurInstr >> 28))
- {
- u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0);
- ARMInterpreter::ARMInstrTable[icode](this);
- }
- else if ((CurInstr & 0xFE000000) == 0xFA000000)
- {
- ARMInterpreter::A_BLX_IMM(this);
- }
- else
- AddCycles_C();
- }
-
- // TODO optimize this shit!!!
- if (Halted)
- {
- if (Halted == 1 && NDS.ARM9Timestamp < NDS.ARM9Target)
+ if ((instrAddr < FastBlockLookupStart || instrAddr >= (FastBlockLookupStart + FastBlockLookupSize))
+ && !NDS.JIT.SetupExecutableRegion(0, instrAddr, FastBlockLookup, FastBlockLookupStart, FastBlockLookupSize))
{
NDS.ARM9Timestamp = NDS.ARM9Target;
+ Log(LogLevel::Error, "ARMv5 PC in non executable region %08X\n", R[15]);
+ return;
}
- break;
- }
- /*if (NDS::IF[0] & NDS::IE[0])
- {
- if (NDS::IME[0] & 0x1)
- TriggerIRQ();
- }*/
- if (IRQ) TriggerIRQ();
- NDS.ARM9Timestamp += Cycles;
- Cycles = 0;
- }
+ JitBlockEntry block = NDS.JIT.LookUpBlock(0, FastBlockLookup,
+ instrAddr - FastBlockLookupStart, instrAddr);
+ if (block)
+ ARM_Dispatch(this, block);
+ else
+ NDS.JIT.CompileBlock(this);
- if (Halted == 2)
- Halted = 0;
-}
-
-#ifdef JIT_ENABLED
-void ARMv5::ExecuteJIT()
-{
- if (Halted)
- {
- if (Halted == 2)
- {
- Halted = 0;
- }
- else if (NDS.HaltInterrupted(0))
- {
- Halted = 0;
- if (NDS.IME[0] & 0x1)
- TriggerIRQ();
- }
- else
- {
- NDS.ARM9Timestamp = NDS.ARM9Target;
- return;
- }
- }
-
- while (NDS.ARM9Timestamp < NDS.ARM9Target)
- {
- u32 instrAddr = R[15] - ((CPSR&0x20)?2:4);
-
- if ((instrAddr < FastBlockLookupStart || instrAddr >= (FastBlockLookupStart + FastBlockLookupSize))
- && !NDS.JIT.SetupExecutableRegion(0, instrAddr, FastBlockLookup, FastBlockLookupStart, FastBlockLookupSize))
- {
- NDS.ARM9Timestamp = NDS.ARM9Target;
- Log(LogLevel::Error, "ARMv5 PC in non executable region %08X\n", R[15]);
- return;
- }
-
- JitBlockEntry block = NDS.JIT.LookUpBlock(0, FastBlockLookup,
- instrAddr - FastBlockLookupStart, instrAddr);
- if (block)
- ARM_Dispatch(this, block);
- else
- NDS.JIT.CompileBlock(this);
-
- if (StopExecution)
- {
- // this order is crucial otherwise idle loops waiting for an IRQ won't function
- if (IRQ)
- TriggerIRQ();
-
- if (Halted || IdleLoop)
+ if (StopExecution)
{
- if ((Halted == 1 || IdleLoop) && NDS.ARM9Timestamp < NDS.ARM9Target)
+ // this order is crucial otherwise idle loops waiting for an IRQ won't function
+ if (IRQ)
+ TriggerIRQ();
+
+ if (Halted || IdleLoop)
{
- Cycles = 0;
- NDS.ARM9Timestamp = NDS.ARM9Target;
+ if ((Halted == 1 || IdleLoop) && NDS.ARM9Timestamp < NDS.ARM9Target)
+ {
+ Cycles = 0;
+ NDS.ARM9Timestamp = NDS.ARM9Target;
+ }
+ IdleLoop = 0;
+ break;
}
- IdleLoop = 0;
- break;
}
}
-
- NDS.ARM9Timestamp += Cycles;
- Cycles = 0;
- }
-
- if (Halted == 2)
- Halted = 0;
-}
+ else
#endif
-
-void ARMv4::Execute()
-{
- GdbCheckB();
-
- if (Halted)
- {
- if (Halted == 2)
{
- Halted = 0;
- }
- else if (NDS.HaltInterrupted(1))
- {
- Halted = 0;
- if (NDS.IME[1] & 0x1)
- TriggerIRQ();
- }
- else
- {
- NDS.ARM7Timestamp = NDS.ARM7Target;
- return;
- }
- }
-
- while (NDS.ARM7Timestamp < NDS.ARM7Target)
- {
- if (CPSR & 0x20) // THUMB
- {
- GdbCheckC();
-
- // prefetch
- R[15] += 2;
- CurInstr = NextInstr[0];
- NextInstr[0] = NextInstr[1];
- NextInstr[1] = CodeRead16(R[15]);
-
- // actually execute
- u32 icode = (CurInstr >> 6);
- ARMInterpreter::THUMBInstrTable[icode](this);
- }
- else
- {
- GdbCheckC();
-
- // prefetch
- R[15] += 4;
- CurInstr = NextInstr[0];
- NextInstr[0] = NextInstr[1];
- NextInstr[1] = CodeRead32(R[15]);
-
- // actually execute
- if (CheckCondition(CurInstr >> 28))
+ if (CPSR & 0x20) // THUMB
{
- u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0);
- ARMInterpreter::ARMInstrTable[icode](this);
+ if constexpr (mode == CPUExecuteMode::InterpreterGDB)
+ GdbCheckC();
+
+ // prefetch
+ R[15] += 2;
+ CurInstr = NextInstr[0];
+ NextInstr[0] = NextInstr[1];
+ if (R[15] & 0x2) { NextInstr[1] >>= 16; CodeCycles = 0; }
+ else NextInstr[1] = CodeRead32(R[15], false);
+
+ // actually execute
+ u32 icode = (CurInstr >> 6) & 0x3FF;
+ ARMInterpreter::THUMBInstrTable[icode](this);
}
else
- AddCycles_C();
- }
-
- // TODO optimize this shit!!!
- if (Halted)
- {
- if (Halted == 1 && NDS.ARM7Timestamp < NDS.ARM7Target)
{
- NDS.ARM7Timestamp = NDS.ARM7Target;
- }
- break;
- }
- /*if (NDS::IF[1] & NDS::IE[1])
- {
- if (NDS::IME[1] & 0x1)
- TriggerIRQ();
- }*/
- if (IRQ) TriggerIRQ();
+ if constexpr (mode == CPUExecuteMode::InterpreterGDB)
+ GdbCheckC();
- NDS.ARM7Timestamp += Cycles;
+ // prefetch
+ R[15] += 4;
+ CurInstr = NextInstr[0];
+ NextInstr[0] = NextInstr[1];
+ NextInstr[1] = CodeRead32(R[15], false);
+
+ // actually execute
+ if (CheckCondition(CurInstr >> 28))
+ {
+ u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0);
+ ARMInterpreter::ARMInstrTable[icode](this);
+ }
+ else if ((CurInstr & 0xFE000000) == 0xFA000000)
+ {
+ ARMInterpreter::A_BLX_IMM(this);
+ }
+ else
+ AddCycles_C();
+ }
+
+ // TODO optimize this shit!!!
+ if (Halted)
+ {
+ if (Halted == 1 && NDS.ARM9Timestamp < NDS.ARM9Target)
+ {
+ NDS.ARM9Timestamp = NDS.ARM9Target;
+ }
+ break;
+ }
+ /*if (NDS::IF[0] & NDS::IE[0])
+ {
+ if (NDS::IME[0] & 0x1)
+ TriggerIRQ();
+ }*/
+ if (IRQ) TriggerIRQ();
+
+ }
+
+ NDS.ARM9Timestamp += Cycles;
Cycles = 0;
}
if (Halted == 2)
Halted = 0;
-
- if (Halted == 4)
- {
- assert(NDS.ConsoleType == 1);
- auto& dsi = dynamic_cast<melonDS::DSi&>(NDS);
- dsi.SoftReset();
- Halted = 2;
- }
}
-
+template void ARMv5::Execute<CPUExecuteMode::Interpreter>();
+template void ARMv5::Execute<CPUExecuteMode::InterpreterGDB>();
#ifdef JIT_ENABLED
-void ARMv4::ExecuteJIT()
+template void ARMv5::Execute<CPUExecuteMode::JIT>();
+#endif
+
+template <CPUExecuteMode mode>
+void ARMv4::Execute()
{
+ if constexpr (mode == CPUExecuteMode::InterpreterGDB)
+ GdbCheckB();
+
if (Halted)
{
if (Halted == 2)
@@ -853,38 +750,97 @@ void ARMv4::ExecuteJIT()
while (NDS.ARM7Timestamp < NDS.ARM7Target)
{
- u32 instrAddr = R[15] - ((CPSR&0x20)?2:4);
-
- if ((instrAddr < FastBlockLookupStart || instrAddr >= (FastBlockLookupStart + FastBlockLookupSize))
- && !NDS.JIT.SetupExecutableRegion(1, instrAddr, FastBlockLookup, FastBlockLookupStart, FastBlockLookupSize))
+#ifdef JIT_ENABLED
+ if constexpr (mode == CPUExecuteMode::JIT)
{
- NDS.ARM7Timestamp = NDS.ARM7Target;
- Log(LogLevel::Error, "ARMv4 PC in non executable region %08X\n", R[15]);
- return;
- }
+ u32 instrAddr = R[15] - ((CPSR&0x20)?2:4);
- JitBlockEntry block = NDS.JIT.LookUpBlock(1, FastBlockLookup,
- instrAddr - FastBlockLookupStart, instrAddr);
- if (block)
- ARM_Dispatch(this, block);
- else
- NDS.JIT.CompileBlock(this);
-
- if (StopExecution)
- {
- if (IRQ)
- TriggerIRQ();
-
- if (Halted || IdleLoop)
+ if ((instrAddr < FastBlockLookupStart || instrAddr >= (FastBlockLookupStart + FastBlockLookupSize))
+ && !NDS.JIT.SetupExecutableRegion(1, instrAddr, FastBlockLookup, FastBlockLookupStart, FastBlockLookupSize))
{
- if ((Halted == 1 || IdleLoop) && NDS.ARM7Timestamp < NDS.ARM7Target)
+ NDS.ARM7Timestamp = NDS.ARM7Target;
+ Log(LogLevel::Error, "ARMv4 PC in non executable region %08X\n", R[15]);
+ return;
+ }
+
+ JitBlockEntry block = NDS.JIT.LookUpBlock(1, FastBlockLookup,
+ instrAddr - FastBlockLookupStart, instrAddr);
+ if (block)
+ ARM_Dispatch(this, block);
+ else
+ NDS.JIT.CompileBlock(this);
+
+ if (StopExecution)
+ {
+ if (IRQ)
+ TriggerIRQ();
+
+ if (Halted || IdleLoop)
+ {
+ if ((Halted == 1 || IdleLoop) && NDS.ARM7Timestamp < NDS.ARM7Target)
+ {
+ Cycles = 0;
+ NDS.ARM7Timestamp = NDS.ARM7Target;
+ }
+ IdleLoop = 0;
+ break;
+ }
+ }
+ }
+ else
+#endif
+ {
+ if (CPSR & 0x20) // THUMB
+ {
+ if constexpr (mode == CPUExecuteMode::InterpreterGDB)
+ GdbCheckC();
+
+ // prefetch
+ R[15] += 2;
+ CurInstr = NextInstr[0];
+ NextInstr[0] = NextInstr[1];
+ NextInstr[1] = CodeRead16(R[15]);
+
+ // actually execute
+ u32 icode = (CurInstr >> 6);
+ ARMInterpreter::THUMBInstrTable[icode](this);
+ }
+ else
+ {
+ if constexpr (mode == CPUExecuteMode::InterpreterGDB)
+ GdbCheckC();
+
+ // prefetch
+ R[15] += 4;
+ CurInstr = NextInstr[0];
+ NextInstr[0] = NextInstr[1];
+ NextInstr[1] = CodeRead32(R[15]);
+
+ // actually execute
+ if (CheckCondition(CurInstr >> 28))
+ {
+ u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0);
+ ARMInterpreter::ARMInstrTable[icode](this);
+ }
+ else
+ AddCycles_C();
+ }
+
+ // TODO optimize this shit!!!
+ if (Halted)
+ {
+ if (Halted == 1 && NDS.ARM7Timestamp < NDS.ARM7Target)
{
- Cycles = 0;
NDS.ARM7Timestamp = NDS.ARM7Target;
}
- IdleLoop = 0;
break;
}
+ /*if (NDS::IF[1] & NDS::IE[1])
+ {
+ if (NDS::IME[1] & 0x1)
+ TriggerIRQ();
+ }*/
+ if (IRQ) TriggerIRQ();
}
NDS.ARM7Timestamp += Cycles;
@@ -902,6 +858,11 @@ void ARMv4::ExecuteJIT()
Halted = 2;
}
}
+
+template void ARMv4::Execute<CPUExecuteMode::Interpreter>();
+template void ARMv4::Execute<CPUExecuteMode::InterpreterGDB>();
+#ifdef JIT_ENABLED
+template void ARMv4::Execute<CPUExecuteMode::JIT>();
#endif
void ARMv5::FillPipeline()
diff --git a/src/ARM.h b/src/ARM.h
index 1e0b71b8..b652e74d 100644
--- a/src/ARM.h
+++ b/src/ARM.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -43,6 +43,15 @@ enum
RWFlags_ForceUser = (1<<21),
};
+enum class CPUExecuteMode : u32
+{
+ Interpreter,
+ InterpreterGDB,
+#ifdef JIT_ENABLED
+ JIT
+#endif
+};
+
struct GDBArgs;
class ARMJIT;
class GPU;
@@ -75,10 +84,6 @@ public:
}
void NocashPrint(u32 addr) noexcept;
- virtual void Execute() = 0;
-#ifdef JIT_ENABLED
- virtual void ExecuteJIT() = 0;
-#endif
bool CheckCondition(u32 code) const
{
@@ -241,10 +246,8 @@ public:
void PrefetchAbort();
void DataAbort();
- void Execute() override;
-#ifdef JIT_ENABLED
- void ExecuteJIT() override;
-#endif
+ template <CPUExecuteMode mode>
+ void Execute();
// all code accesses are forced nonseq 32bit
u32 CodeRead32(u32 addr, bool branch);
@@ -383,10 +386,8 @@ public:
void JumpTo(u32 addr, bool restorecpsr = false) override;
- void Execute() override;
-#ifdef JIT_ENABLED
- void ExecuteJIT() override;
-#endif
+ template <CPUExecuteMode mode>
+ void Execute();
u16 CodeRead16(u32 addr)
{
diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp
index a1da1b86..e9973380 100644
--- a/src/ARMInterpreter.cpp
+++ b/src/ARMInterpreter.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -38,7 +38,7 @@ void A_UNK(ARM* cpu)
{
Log(LogLevel::Warn, "undefined ARM%d instruction %08X @ %08X\n", cpu->Num?7:9, cpu->CurInstr, cpu->R[15]-8);
#ifdef GDBSTUB_ENABLED
- cpu->GdbStub.Enter(true, Gdb::TgtStatus::FaultInsn, cpu->R[15]-8);
+ cpu->GdbStub.Enter(cpu->GdbStub.IsConnected(), Gdb::TgtStatus::FaultInsn, cpu->R[15]-8);
#endif
//for (int i = 0; i < 16; i++) printf("R%d: %08X\n", i, cpu->R[i]);
//NDS::Halt();
@@ -56,7 +56,7 @@ void T_UNK(ARM* cpu)
{
Log(LogLevel::Warn, "undefined THUMB%d instruction %04X @ %08X\n", cpu->Num?7:9, cpu->CurInstr, cpu->R[15]-4);
#ifdef GDBSTUB_ENABLED
- cpu->GdbStub.Enter(true, Gdb::TgtStatus::FaultInsn, cpu->R[15]-4);
+ cpu->GdbStub.Enter(cpu->GdbStub.IsConnected(), Gdb::TgtStatus::FaultInsn, cpu->R[15]-4);
#endif
//NDS::Halt();
u32 oldcpsr = cpu->CPSR;
diff --git a/src/ARMInterpreter.h b/src/ARMInterpreter.h
index cff4821a..1066ac69 100644
--- a/src/ARMInterpreter.h
+++ b/src/ARMInterpreter.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp
index 315d59d0..167e184e 100644
--- a/src/ARMInterpreter_ALU.cpp
+++ b/src/ARMInterpreter_ALU.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/ARMInterpreter_ALU.h b/src/ARMInterpreter_ALU.h
index 6998b637..58d8165c 100644
--- a/src/ARMInterpreter_ALU.h
+++ b/src/ARMInterpreter_ALU.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/ARMInterpreter_Branch.cpp b/src/ARMInterpreter_Branch.cpp
index 015f5682..623be41a 100644
--- a/src/ARMInterpreter_Branch.cpp
+++ b/src/ARMInterpreter_Branch.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/ARMInterpreter_Branch.h b/src/ARMInterpreter_Branch.h
index 51a561c1..e3d16776 100644
--- a/src/ARMInterpreter_Branch.h
+++ b/src/ARMInterpreter_Branch.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp
index 91acaacc..f7c24312 100644
--- a/src/ARMInterpreter_LoadStore.cpp
+++ b/src/ARMInterpreter_LoadStore.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -430,9 +430,9 @@ void A_LDM(ARM* cpu)
}
}
+ u32 pc = 0;
if (cpu->CurInstr & (1<<15))
{
- u32 pc;
if (preinc) base += 4;
if (first) cpu->DataRead32 (base, &pc);
else cpu->DataRead32S(base, &pc);
@@ -440,13 +440,8 @@ void A_LDM(ARM* cpu)
if (cpu->Num == 1)
pc &= ~0x1;
-
- cpu->JumpTo(pc, cpu->CurInstr & (1<<22));
}
- if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15)))
- cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true);
-
if (cpu->CurInstr & (1<<21))
{
// post writeback
@@ -466,6 +461,12 @@ void A_LDM(ARM* cpu)
cpu->R[baseid] = wbbase;
}
+ if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15)))
+ cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true);
+
+ if (cpu->CurInstr & (1<<15))
+ cpu->JumpTo(pc, cpu->CurInstr & (1<<22));
+
cpu->AddCycles_CDI();
}
diff --git a/src/ARMInterpreter_LoadStore.h b/src/ARMInterpreter_LoadStore.h
index 32d6e4d2..62828194 100644
--- a/src/ARMInterpreter_LoadStore.h
+++ b/src/ARMInterpreter_LoadStore.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp
index c3fcba26..1ebcce8e 100644
--- a/src/ARMJIT.cpp
+++ b/src/ARMJIT.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/ARMJIT.h b/src/ARMJIT.h
index 7619f234..a228a4dd 100644
--- a/src/ARMJIT.h
+++ b/src/ARMJIT.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/ARMJIT_A64/ARMJIT_ALU.cpp b/src/ARMJIT_A64/ARMJIT_ALU.cpp
index b25bcaa3..cb777500 100644
--- a/src/ARMJIT_A64/ARMJIT_ALU.cpp
+++ b/src/ARMJIT_A64/ARMJIT_ALU.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2022 melonDS team, RSDuck
+ Copyright 2016-2024 melonDS team, RSDuck
This file is part of melonDS.
diff --git a/src/ARMJIT_A64/ARMJIT_Branch.cpp b/src/ARMJIT_A64/ARMJIT_Branch.cpp
index 92717e91..f9c2e0c5 100644
--- a/src/ARMJIT_A64/ARMJIT_Branch.cpp
+++ b/src/ARMJIT_A64/ARMJIT_Branch.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2022 melonDS team, RSDuck
+ Copyright 2016-2024 melonDS team, RSDuck
This file is part of melonDS.
diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp
index 7981ed67..f05de448 100644
--- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp
+++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2022 melonDS team, RSDuck
+ Copyright 2016-2024 melonDS team, RSDuck
This file is part of melonDS.
diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.h b/src/ARMJIT_A64/ARMJIT_Compiler.h
index 2b0048a9..a7b567f6 100644
--- a/src/ARMJIT_A64/ARMJIT_Compiler.h
+++ b/src/ARMJIT_A64/ARMJIT_Compiler.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2022 melonDS team, RSDuck
+ Copyright 2016-2024 melonDS team, RSDuck
This file is part of melonDS.
diff --git a/src/ARMJIT_A64/ARMJIT_Linkage.S b/src/ARMJIT_A64/ARMJIT_Linkage.S
index b73905bd..9c360ec0 100644
--- a/src/ARMJIT_A64/ARMJIT_Linkage.S
+++ b/src/ARMJIT_A64/ARMJIT_Linkage.S
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2022 melonDS team, RSDuck
+ Copyright 2016-2024 melonDS team, RSDuck
This file is part of melonDS.
diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp
index e108b7b4..6d2c4276 100644
--- a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp
+++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2022 melonDS team, RSDuck
+ Copyright 2016-2024 melonDS team, RSDuck
This file is part of melonDS.
diff --git a/src/ARMJIT_Compiler.h b/src/ARMJIT_Compiler.h
index ff4f8ff7..46cce5b0 100644
--- a/src/ARMJIT_Compiler.h
+++ b/src/ARMJIT_Compiler.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2022 melonDS team, RSDuck
+ Copyright 2016-2024 melonDS team, RSDuck
This file is part of melonDS.
diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h
index 8429bade..5b393903 100644
--- a/src/ARMJIT_Internal.h
+++ b/src/ARMJIT_Internal.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2022 melonDS team, RSDuck
+ Copyright 2016-2024 melonDS team, RSDuck
This file is part of melonDS.
diff --git a/src/ARMJIT_Memory.cpp b/src/ARMJIT_Memory.cpp
index c8969aee..51e022d1 100644
--- a/src/ARMJIT_Memory.cpp
+++ b/src/ARMJIT_Memory.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/ARMJIT_Memory.h b/src/ARMJIT_Memory.h
index d36f6032..88e647d5 100644
--- a/src/ARMJIT_Memory.h
+++ b/src/ARMJIT_Memory.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/ARMJIT_RegisterCache.h b/src/ARMJIT_RegisterCache.h
index e5f28dd6..d2680731 100644
--- a/src/ARMJIT_RegisterCache.h
+++ b/src/ARMJIT_RegisterCache.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2022 melonDS team, RSDuck
+ Copyright 2016-2024 melonDS team, RSDuck
This file is part of melonDS.
diff --git a/src/ARMJIT_x64/ARMJIT_ALU.cpp b/src/ARMJIT_x64/ARMJIT_ALU.cpp
index 69449ff9..8f7a1b22 100644
--- a/src/ARMJIT_x64/ARMJIT_ALU.cpp
+++ b/src/ARMJIT_x64/ARMJIT_ALU.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp
index f7a01f48..c32e2b73 100644
--- a/src/ARMJIT_x64/ARMJIT_Branch.cpp
+++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp
index b18837f3..ba6c0fb4 100644
--- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp
+++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h
index 941d8924..3965e882 100644
--- a/src/ARMJIT_x64/ARMJIT_Compiler.h
+++ b/src/ARMJIT_x64/ARMJIT_Compiler.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp b/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp
index e2a74eee..e4812c0a 100644
--- a/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp
+++ b/src/ARMJIT_x64/ARMJIT_GenOffsets.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2022 melonDS team, RSDuck
+ Copyright 2016-2024 melonDS team, RSDuck
This file is part of melonDS.
diff --git a/src/ARMJIT_x64/ARMJIT_Linkage.S b/src/ARMJIT_x64/ARMJIT_Linkage.S
index 023f6e7b..18596003 100644
--- a/src/ARMJIT_x64/ARMJIT_Linkage.S
+++ b/src/ARMJIT_x64/ARMJIT_Linkage.S
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp
index 8520bebc..219c7271 100644
--- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp
+++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/ARMJIT_x64/ARMJIT_Offsets.h b/src/ARMJIT_x64/ARMJIT_Offsets.h
index 9d2a9522..738fc4ea 100644
--- a/src/ARMJIT_x64/ARMJIT_Offsets.h
+++ b/src/ARMJIT_x64/ARMJIT_Offsets.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2022 melonDS team, RSDuck
+ Copyright 2016-2024 melonDS team, RSDuck
This file is part of melonDS.
diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp
index d53c88f0..58838307 100644
--- a/src/ARM_InstrInfo.cpp
+++ b/src/ARM_InstrInfo.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/ARM_InstrInfo.h b/src/ARM_InstrInfo.h
index 13f66b09..fe4095b4 100644
--- a/src/ARM_InstrInfo.h
+++ b/src/ARM_InstrInfo.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/ARM_InstrTable.h b/src/ARM_InstrTable.h
index 7cdad66d..8213c2e0 100644
--- a/src/ARM_InstrTable.h
+++ b/src/ARM_InstrTable.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/Args.h b/src/Args.h
index d836b643..2c405e29 100644
--- a/src/Args.h
+++ b/src/Args.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -69,6 +69,10 @@ struct JITArgs
bool FastMemory = true;
};
+using ARM9BIOSImage = std::array<u8, ARM9BIOSSize>;
+using ARM7BIOSImage = std::array<u8, ARM7BIOSSize>;
+using DSiBIOSImage = std::array<u8, DSiBIOSSize>;
+
struct GDBArgs
{
u16 PortARM7 = 0;
@@ -95,11 +99,11 @@ struct NDSArgs
/// NDS ARM9 BIOS to install.
/// Defaults to FreeBIOS, which is not compatible with DSi mode.
- std::array<u8, ARM9BIOSSize> ARM9BIOS = bios_arm9_bin;
+ std::unique_ptr<ARM9BIOSImage> ARM9BIOS = std::make_unique<ARM9BIOSImage>(bios_arm9_bin);
/// NDS ARM7 BIOS to install.
/// Defaults to FreeBIOS, which is not compatible with DSi mode.
- std::array<u8, ARM7BIOSSize> ARM7BIOS = bios_arm7_bin;
+ std::unique_ptr<ARM7BIOSImage> ARM7BIOS = std::make_unique<ARM7BIOSImage>(bios_arm7_bin);
/// Firmware image to install.
/// Defaults to generated NDS firmware.
@@ -131,8 +135,8 @@ struct NDSArgs
/// Contains no virtual methods, so there's no vtable.
struct DSiArgs final : public NDSArgs
{
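- std::array<u8, DSiBIOSSize> ARM9iBIOS = BrokenBIOS<DSiBIOSSize>;
- std::array<u8, DSiBIOSSize> ARM7iBIOS = BrokenBIOS<DSiBIOSSize>;
+ std::unique_ptr<DSiBIOSImage> ARM9iBIOS = std::make_unique<DSiBIOSImage>(BrokenBIOS<DSiBIOSSize>);
+ std::unique_ptr<DSiBIOSImage> ARM7iBIOS = std::make_unique<DSiBIOSImage>(BrokenBIOS<DSiBIOSSize>);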
/// NAND image to install.
/// Required, there is no default value.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index afabc03f..1f947d11 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -35,6 +35,8 @@ add_library(core STATIC
GPU2D_Soft.cpp
GPU3D.cpp
GPU3D_Soft.cpp
+ GPU3D_Texcache.cpp
+ GPU3D_Texcache.h
melonDLDI.h
NDS.cpp
NDSCart.cpp
@@ -52,7 +54,6 @@ add_library(core STATIC
types.h
Utils.cpp
Utils.h
- version.h
Wifi.cpp
WifiAP.cpp
@@ -79,6 +80,9 @@ if (ENABLE_OGLRENDERER)
GPU_OpenGL.cpp
GPU_OpenGL_shaders.h
GPU3D_OpenGL.cpp
+ GPU3D_Compute.cpp
+ GPU3D_TexcacheOpenGL.cpp
+ GPU3D_TexcacheOpenGL.h
GPU3D_OpenGL_shaders.h
OpenGLSupport.cpp)
@@ -123,6 +127,24 @@ if (ENABLE_JIT)
endif()
endif()
+set(MELONDS_VERSION_SUFFIX "$ENV{MELONDS_VERSION_SUFFIX}" CACHE STRING "Suffix to add to displayed melonDS version")
+option(MELONDS_EMBED_BUILD_INFO "Embed detailed build info into the binary" OFF)
+set(MELONDS_GIT_BRANCH "$ENV{MELONDS_GIT_BRANCH}" CACHE STRING "The Git branch used for this build")
+set(MELONDS_GIT_HASH "$ENV{MELONDS_GIT_HASH}" CACHE STRING "The hash of the Git commit")
+set(MELONDS_BUILD_PROVIDER "$ENV{MELONDS_BUILD_PROVIDER}" CACHE STRING "The name of the provider of this build")
+
+if (MELONDS_EMBED_BUILD_INFO)
+ target_compile_definitions(core PUBLIC MELONDS_EMBED_BUILD_INFO)
+ if (NOT MELONDS_GIT_BRANCH OR NOT MELONDS_GIT_HASH OR NOT MELONDS_BUILD_PROVIDER)
+ message(FATAL_ERROR "When embedding build information, all fields must be filled out. See src/CMakeLists.txt.")
+ endif()
+endif()
+
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/version.h.in" "${CMAKE_CURRENT_BINARY_DIR}/version.h")
+target_sources(core PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/version.h")
+target_include_directories(core PUBLIC "${CMAKE_CURRENT_BINARY_DIR}")
+
+set(BUILD_SHARED_LIBS OFF)
add_subdirectory(teakra EXCLUDE_FROM_ALL)
# Workaround for building teakra with -O0 on Windows either failing or hanging forever
target_compile_options(teakra PRIVATE "$<$<CONFIG:DEBUG>:-Og>")
@@ -130,7 +152,9 @@ target_link_libraries(core PRIVATE teakra)
if (NOT MSVC)
# MSVC has its own compiler flag syntax; if we ever support it,
- # be sure to silence any equivalent warnings there.
+ # be sure to add equivalent flags here.
+
+ target_compile_options(core PUBLIC -fwrapv)
target_compile_options(core PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:-Wno-invalid-offsetof>")
# These warnings are excessive, and are only triggered in the ARMJIT code
@@ -154,11 +178,13 @@ endif()
if (WIN32)
target_link_libraries(core PRIVATE ole32 comctl32 wsock32 ws2_32)
-elseif(NOT APPLE)
+elseif(NOT APPLE AND NOT HAIKU)
check_library_exists(rt shm_open "" NEED_LIBRT)
if (NEED_LIBRT)
target_link_libraries(core PRIVATE rt)
endif()
+elseif(HAIKU)
+ target_link_libraries(core PRIVATE network)
endif()
if (ENABLE_JIT_PROFILING)
diff --git a/src/CP15.cpp b/src/CP15.cpp
index 58137fdd..c271e180 100644
--- a/src/CP15.cpp
+++ b/src/CP15.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -186,10 +186,14 @@ void ARMv5::UpdatePURegion(u32 n)
return;
}
- u32 start = rgn >> 12;
- u32 sz = 2 << ((rgn >> 1) & 0x1F);
- u32 end = start + (sz >> 12);
- // TODO: check alignment of start
+ // notes:
+ // * min size of a pu region is 4KiB (12 bits)
+ // * size is calculated as size + 1, but the 12 lsb of address space are ignored, therefore we need it as size + 1 - 12, or size - 11
+ // * pu regions are aligned based on their size
+ u32 size = std::max((int)((rgn>>1) & 0x1F) - 11, 0); // obtain the size, subtract 11 and clamp to a min of 0.
+ u32 start = ((rgn >> 12) >> size) << size; // determine the start offset, and use shifts to force alignment with a multiple of the size.
+ u32 end = start + (1<> 4) & 0xF,
val,
val & 1 ? "enabled" : "disabled",
val & 0xFFFFF000,
- (val & 0xFFFFF000) + (2 << ((val & 0x3E) >> 1))
+ (val & 0x3E) >> 1
);
Log(LogLevel::Debug, "%s", log_output);
// Some implementations of Log imply a newline, so we build up the line before printing it
diff --git a/src/CRC32.cpp b/src/CRC32.cpp
index 0756c034..82fe467f 100644
--- a/src/CRC32.cpp
+++ b/src/CRC32.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/CRC32.h b/src/CRC32.h
index 11879057..90fb8057 100644
--- a/src/CRC32.h
+++ b/src/CRC32.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/DMA.cpp b/src/DMA.cpp
index 717b38fa..80cd592c 100644
--- a/src/DMA.cpp
+++ b/src/DMA.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -21,6 +21,7 @@
#include "DSi.h"
#include "DMA.h"
#include "GPU.h"
+#include "GPU3D.h"
#include "DMA_Timings.h"
#include "Platform.h"
diff --git a/src/DMA.h b/src/DMA.h
index e0e3be15..354f4495 100644
--- a/src/DMA.h
+++ b/src/DMA.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/DMA_Timings.cpp b/src/DMA_Timings.cpp
index 912e4e2e..a51fedfb 100644
--- a/src/DMA_Timings.cpp
+++ b/src/DMA_Timings.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/DMA_Timings.h b/src/DMA_Timings.h
index 63dc4676..38206235 100644
--- a/src/DMA_Timings.h
+++ b/src/DMA_Timings.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/DSi.cpp b/src/DSi.cpp
index c929c6d2..00ed8da0 100644
--- a/src/DSi.cpp
+++ b/src/DSi.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -70,8 +70,28 @@ const u32 NDMAModes[] =
0xFF, // wifi / GBA cart slot (TODO)
};
-DSi::DSi(DSiArgs&& args) noexcept :
- NDS(std::move(args), 1),
+/*DSi::DSi() noexcept :
+ DSi(
+ DSiArgs {
+ NDSArgs {
+ nullptr,
+ nullptr,
+ bios_arm9_bin,
+ bios_arm7_bin,
+ Firmware(0),
+ },
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+ false
+ }
+ )
+{
+}*/
+
+DSi::DSi(DSiArgs&& args, void* userdata) noexcept :
+ NDS(std::move(args), 1, userdata),
NDMAs {
DSi_NDMA(0, 0, *this),
DSi_NDMA(0, 1, *this),
@@ -82,8 +102,8 @@ DSi::DSi(DSiArgs&& args) noexcept :
DSi_NDMA(1, 2, *this),
DSi_NDMA(1, 3, *this),
},
- ARM7iBIOS(args.ARM7iBIOS),
- ARM9iBIOS(args.ARM9iBIOS),
+ ARM7iBIOS(*args.ARM7iBIOS),
+ ARM9iBIOS(*args.ARM9iBIOS),
DSP(*this),
SDMMC(*this, std::move(args.NANDImage), std::move(args.DSiSDCard)),
SDIO(*this),
diff --git a/src/DSi.h b/src/DSi.h
index 1d010e0f..23a2460c 100644
--- a/src/DSi.h
+++ b/src/DSi.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -130,7 +130,8 @@ public:
void ARM7IOWrite32(u32 addr, u32 val) override;
public:
- DSi(DSiArgs&& args) noexcept;
+ DSi(DSiArgs&& args, void* userdata = nullptr) noexcept;
+ //DSi() noexcept;
~DSi() noexcept override;
DSi(const DSi&) = delete;
DSi& operator=(const DSi&) = delete;
diff --git a/src/DSi_AES.cpp b/src/DSi_AES.cpp
index 379dea13..36fe2892 100644
--- a/src/DSi_AES.cpp
+++ b/src/DSi_AES.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/DSi_AES.h b/src/DSi_AES.h
index d83c870e..f3b79868 100644
--- a/src/DSi_AES.h
+++ b/src/DSi_AES.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -28,7 +28,7 @@ namespace melonDS
{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wattributes"
-#if defined(__GNUC__) && (__GNUC__ >= 11) // gcc 11.*
+#if defined(__GNUC__) && (__GNUC__ >= 11) && defined(__SIZEOF_INT128__) // gcc 11.*
// NOTE: Yes, the compiler does *not* recognize this code pattern, so it is indeed an optimization.
__attribute((always_inline)) static void Bswap128(void* Dst, const void* Src)
{
diff --git a/src/DSi_Camera.cpp b/src/DSi_Camera.cpp
index a1cdbe0a..b1d60d04 100644
--- a/src/DSi_Camera.cpp
+++ b/src/DSi_Camera.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -410,7 +410,7 @@ void DSi_Camera::DoSavestate(Savestate* file)
void DSi_Camera::Reset()
{
- Platform::Camera_Stop(Num);
+ Platform::Camera_Stop(Num, DSi.UserData);
DataPos = 0;
RegAddr = 0;
@@ -435,7 +435,7 @@ void DSi_Camera::Reset()
void DSi_Camera::Stop()
{
- Platform::Camera_Stop(Num);
+ Platform::Camera_Stop(Num, DSi.UserData);
}
bool DSi_Camera::IsActivated() const
@@ -474,7 +474,7 @@ void DSi_Camera::StartTransfer()
FrameFormat = 0;
}
- Platform::Camera_CaptureFrame(Num, FrameBuffer, 640, 480, true);
+ Platform::Camera_CaptureFrame(Num, FrameBuffer, 640, 480, true, DSi.UserData);
}
bool DSi_Camera::TransferDone() const
@@ -655,8 +655,8 @@ void DSi_Camera::I2C_WriteReg(u16 addr, u16 val)
StandbyCnt = val;
//printf("CAM%d STBCNT=%04X (%04X)\n", Num, StandbyCnt, val);
bool isactive = IsActivated();
- if (isactive && !wasactive) Platform::Camera_Start(Num);
- else if (wasactive && !isactive) Platform::Camera_Stop(Num);
+ if (isactive && !wasactive) Platform::Camera_Start(Num, DSi.UserData);
+ else if (wasactive && !isactive) Platform::Camera_Stop(Num, DSi.UserData);
}
return;
case 0x001A:
@@ -665,8 +665,8 @@ void DSi_Camera::I2C_WriteReg(u16 addr, u16 val)
MiscCnt = val & 0x0B7B;
//printf("CAM%d MISCCNT=%04X (%04X)\n", Num, MiscCnt, val);
bool isactive = IsActivated();
- if (isactive && !wasactive) Platform::Camera_Start(Num);
- else if (wasactive && !isactive) Platform::Camera_Stop(Num);
+ if (isactive && !wasactive) Platform::Camera_Start(Num, DSi.UserData);
+ else if (wasactive && !isactive) Platform::Camera_Stop(Num, DSi.UserData);
}
return;
diff --git a/src/DSi_Camera.h b/src/DSi_Camera.h
index 363cea43..604e06ac 100644
--- a/src/DSi_Camera.h
+++ b/src/DSi_Camera.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/DSi_I2C.cpp b/src/DSi_I2C.cpp
index 28f98dc8..f28562e9 100644
--- a/src/DSi_I2C.cpp
+++ b/src/DSi_I2C.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/DSi_I2C.h b/src/DSi_I2C.h
index 5dfeebd0..3102ffeb 100644
--- a/src/DSi_I2C.h
+++ b/src/DSi_I2C.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -87,6 +87,7 @@ public:
void DoSavestate(Savestate* file) override;
u8 GetBootFlag() const;
+ void SetBootFlag(u8 boot) noexcept { Registers[0x70] = boot; }
bool GetBatteryCharging() const;
void SetBatteryCharging(bool charging);
diff --git a/src/DSi_NAND.cpp b/src/DSi_NAND.cpp
index 8da02540..a6b6c566 100644
--- a/src/DSi_NAND.cpp
+++ b/src/DSi_NAND.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/DSi_NAND.h b/src/DSi_NAND.h
index 104845d5..7af434d9 100644
--- a/src/DSi_NAND.h
+++ b/src/DSi_NAND.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/DSi_NDMA.cpp b/src/DSi_NDMA.cpp
index fe1f0ba7..452ac6e6 100644
--- a/src/DSi_NDMA.cpp
+++ b/src/DSi_NDMA.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -22,6 +22,7 @@
#include "DSi_NDMA.h"
#include "GPU.h"
#include "DSi_AES.h"
+#include "GPU3D.h"
namespace melonDS
{
diff --git a/src/DSi_NDMA.h b/src/DSi_NDMA.h
index fb34dbdf..9f8e6706 100644
--- a/src/DSi_NDMA.h
+++ b/src/DSi_NDMA.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/DSi_NWifi.cpp b/src/DSi_NWifi.cpp
index a6177dec..9827bdbe 100644
--- a/src/DSi_NWifi.cpp
+++ b/src/DSi_NWifi.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -1334,7 +1334,7 @@ void DSi_NWifi::WMI_SendPacket(u16 len)
}
printf("\n");*/
- Platform::LAN_SendPacket(LANBuffer, lan_len);
+ Platform::Net_SendPacket(LANBuffer, lan_len, DSi.UserData);
}
void DSi_NWifi::SendWMIEvent(u8 ep, u16 id, u8* data, u32 len)
@@ -1442,20 +1442,25 @@ void DSi_NWifi::CheckRX()
if (!Mailbox[8].CanFit(2048))
return;
- int rxlen = Platform::LAN_RecvPacket(LANBuffer);
- if (rxlen > 0)
+ int rxlen = Platform::Net_RecvPacket(LANBuffer, DSi.UserData);
+ while (rxlen > 0)
{
- //printf("WMI packet recv %04X %04X %04X\n", *(u16*)&LANBuffer[0], *(u16*)&LANBuffer[2], *(u16*)&LANBuffer[4]);
// check destination MAC
if (*(u32*)&LANBuffer[0] != 0xFFFFFFFF || *(u16*)&LANBuffer[4] != 0xFFFF)
{
if (memcmp(&LANBuffer[0], &EEPROM[0x00A], 6))
- return;
+ {
+ rxlen = Platform::Net_RecvPacket(LANBuffer, DSi.UserData);
+ continue;
+ }
}
// check source MAC, in case we get a packet we just sent out
if (!memcmp(&LANBuffer[6], &EEPROM[0x00A], 6))
- return;
+ {
+ rxlen = Platform::Net_RecvPacket(LANBuffer, DSi.UserData);
+ continue;
+ }
// packet is good
@@ -1502,6 +1507,7 @@ void DSi_NWifi::CheckRX()
Mailbox[8].Write(LANBuffer[14+i]);
DrainRXBuffer();
+ return;
}
}
diff --git a/src/DSi_NWifi.h b/src/DSi_NWifi.h
index 39e9459c..84ac8a49 100644
--- a/src/DSi_NWifi.h
+++ b/src/DSi_NWifi.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/DSi_SD.cpp b/src/DSi_SD.cpp
index 72fe3756..c600bc76 100644
--- a/src/DSi_SD.cpp
+++ b/src/DSi_SD.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/DSi_SD.h b/src/DSi_SD.h
index 29620dc5..5d376600 100644
--- a/src/DSi_SD.h
+++ b/src/DSi_SD.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/DSi_SPI_TSC.cpp b/src/DSi_SPI_TSC.cpp
index d515db9f..dbb60c10 100644
--- a/src/DSi_SPI_TSC.cpp
+++ b/src/DSi_SPI_TSC.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/DSi_SPI_TSC.h b/src/DSi_SPI_TSC.h
index d1a71063..f20f7ac1 100644
--- a/src/DSi_SPI_TSC.h
+++ b/src/DSi_SPI_TSC.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/DSi_TMD.h b/src/DSi_TMD.h
index f07b3d1c..5ea91a6f 100644
--- a/src/DSi_TMD.h
+++ b/src/DSi_TMD.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/FATIO.cpp b/src/FATIO.cpp
index 233014e4..aea33ee6 100644
--- a/src/FATIO.cpp
+++ b/src/FATIO.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/FATIO.h b/src/FATIO.h
index 6d8aa499..f8184885 100644
--- a/src/FATIO.h
+++ b/src/FATIO.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/FATStorage.cpp b/src/FATStorage.cpp
index 0f1bf235..200c99d5 100644
--- a/src/FATStorage.cpp
+++ b/src/FATStorage.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -110,6 +110,7 @@ bool FATStorage::InjectFile(const std::string& path, u8* data, u32 len)
res = f_mount(&fs, "0:", 1);
if (res != FR_OK)
{
+ f_unmount("0:");
ff_disk_close();
return false;
}
@@ -146,6 +147,7 @@ u32 FATStorage::ReadFile(const std::string& path, u32 start, u32 len, u8* data)
res = f_mount(&fs, "0:", 1);
if (res != FR_OK)
{
+ f_unmount("0:");
ff_disk_close();
return false;
}
@@ -1144,6 +1146,7 @@ bool FATStorage::Save()
res = f_mount(&fs, "0:", 1);
if (res != FR_OK)
{
+ f_unmount("0:");
ff_disk_close();
return false;
}
diff --git a/src/FATStorage.h b/src/FATStorage.h
index 00628461..030b765b 100644
--- a/src/FATStorage.h
+++ b/src/FATStorage.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/FIFO.h b/src/FIFO.h
index 026c2c7f..5fc04832 100644
--- a/src/FIFO.h
+++ b/src/FIFO.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -24,6 +24,7 @@
namespace melonDS
{
+
template
class FIFO
{
@@ -191,5 +192,121 @@ private:
u32 ReadPos, WritePos;
};
+// Fixed-capacity byte queue stored in a circular array of Size bytes.
+template
+class RingBuffer
+{
+public:
+    // Empties the queue: resets the fill level and both cursors, zeroes storage.
+    void Clear()
+    {
+        NumOccupied = 0;
+        ReadPos = 0;
+        WritePos = 0;
+        memset(Buffer, 0, Size);
+    }
+
+    // Passes the fill level, both cursors and the raw storage to the savestate.
+    void DoSavestate(Savestate* file)
+    {
+        file->Var32(&NumOccupied);
+        file->Var32(&ReadPos);
+        file->Var32(&WritePos);
+        file->VarArray(Buffer, Size);
+    }
+
+    // Appends len bytes from data, wrapping around the end of the storage.
+    // Returns false, and writes nothing, unless all len bytes fit.
+    bool Write(const void* data, u32 len)
+    {
+        if (!CanFit(len)) return false;
+
+        if ((WritePos + len) >= Size)
+        {
+            u32 part1 = Size - WritePos;
+            memcpy(&Buffer[WritePos], data, part1);
+            if (len > part1)
+                memcpy(Buffer, &((u8*)data)[part1], len - part1);
+            WritePos = len - part1;
+        }
+        else
+        {
+            memcpy(&Buffer[WritePos], data, len);
+            WritePos += len;
+        }
+        NumOccupied += len;
+        return true;
+    }
+
+    // Moves the oldest len bytes into data and consumes them.
+    // Returns false, and consumes nothing, if fewer than len bytes are held.
+    bool Read(void* data, u32 len)
+    {
+        if (NumOccupied < len) return false;
+
+        if ((ReadPos + len) >= Size)
+        {
+            u32 part1 = Size - ReadPos;
+            memcpy(data, &Buffer[ReadPos], part1);
+            if (len > part1)
+                memcpy(&((u8*)data)[part1], Buffer, len - part1);
+            ReadPos = len - part1;
+        }
+        else
+        {
+            memcpy(data, &Buffer[ReadPos], len);
+            ReadPos += len;
+        }
+        NumOccupied -= len;
+        return true;
+    }
+
+    // Copies len bytes starting offset bytes past the read position into data
+    // without consuming them. Returns false unless that whole range is buffered.
+    bool Peek(void* data, u32 offset, u32 len)
+    {
+        // offset counts too; phrased so that offset + len cannot overflow
+        if (offset > NumOccupied || len > (NumOccupied - offset)) return false;
+
+        u32 readpos = ReadPos + offset;
+        if (readpos >= Size) readpos -= Size;
+
+        if ((readpos + len) >= Size)
+        {
+            u32 part1 = Size - readpos;
+            memcpy(data, &Buffer[readpos], part1);
+            if (len > part1)
+                memcpy(&((u8*)data)[part1], Buffer, len - part1);
+        }
+        else
+            memcpy(data, &Buffer[readpos], len);
+        return true;
+    }
+
+    // Discards the oldest len bytes; returns false if fewer are buffered.
+    bool Skip(u32 len)
+    {
+        if (NumOccupied < len) return false;
+
+        ReadPos += len;
+        if (ReadPos >= Size)
+            ReadPos -= Size;
+        NumOccupied -= len;
+        return true;
+    }
+
+    u32 Level() const { return NumOccupied; }
+    bool IsEmpty() const { return NumOccupied == 0; }
+    bool IsFull() const { return NumOccupied >= Size; }
+    // phrased as a subtraction so a huge num cannot wrap NumOccupied + num
+    bool CanFit(u32 num) const { return (num <= Size) && (NumOccupied <= (Size - num)); }
+
+private:
+    u8 Buffer[Size] = {0};
+    u32 NumOccupied = 0; // bytes currently stored
+    u32 ReadPos = 0, WritePos = 0; // next byte to read / next free byte
+};
+
}
+
#endif
diff --git a/src/GBACart.cpp b/src/GBACart.cpp
index 1be50e75..a62aca6b 100644
--- a/src/GBACart.cpp
+++ b/src/GBACart.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -95,17 +95,18 @@ u32 CartCommon::GetSaveMemoryLength() const
return 0;
}
-CartGame::CartGame(const u8* rom, u32 len, const u8* sram, u32 sramlen, GBACart::CartType type) :
- CartGame(CopyToUnique(rom, len), len, CopyToUnique(sram, sramlen), sramlen, type)
+CartGame::CartGame(const u8* rom, u32 len, const u8* sram, u32 sramlen, void* userdata, GBACart::CartType type) :
+ CartGame(CopyToUnique(rom, len), len, CopyToUnique(sram, sramlen), sramlen, userdata, type)
{
}
-CartGame::CartGame(std::unique_ptr&& rom, u32 len, std::unique_ptr&& sram, u32 sramlen, GBACart::CartType type) :
+CartGame::CartGame(std::unique_ptr&& rom, u32 len, std::unique_ptr&& sram, u32 sramlen, void* userdata, GBACart::CartType type) :
CartCommon(type),
ROM(std::move(rom)),
ROMLength(len),
SRAM(std::move(sram)),
- SRAMLength(sramlen)
+ SRAMLength(sramlen),
+ UserData(userdata)
{
if (SRAM && SRAMLength)
{
@@ -170,7 +171,7 @@ void CartGame::DoSavestate(Savestate* file)
file->Var8((u8*)&SRAMType);
if ((!file->Saving) && SRAM)
- Platform::WriteGBASave(SRAM.get(), SRAMLength, 0, SRAMLength);
+ Platform::WriteGBASave(SRAM.get(), SRAMLength, 0, SRAMLength, UserData);
}
void CartGame::SetupSave(u32 type)
@@ -223,7 +224,7 @@ void CartGame::SetSaveMemory(const u8* savedata, u32 savelen)
u32 len = std::min(savelen, SRAMLength);
memcpy(SRAM.get(), savedata, len);
- Platform::WriteGBASave(savedata, len, 0, len);
+ Platform::WriteGBASave(savedata, len, 0, len, UserData);
}
u16 CartGame::ROMRead(u32 addr) const
@@ -464,7 +465,7 @@ void CartGame::SRAMWrite_FLASH(u32 addr, u8 val)
u32 start_addr = addr + 0x10000 * SRAMFlashState.bank;
memset((u8*)&SRAM[start_addr], 0xFF, 0x1000);
- Platform::WriteGBASave(SRAM.get(), SRAMLength, start_addr, 0x1000);
+ Platform::WriteGBASave(SRAM.get(), SRAMLength, start_addr, 0x1000, UserData);
}
SRAMFlashState.state = 0;
SRAMFlashState.cmd = 0;
@@ -523,18 +524,18 @@ void CartGame::SRAMWrite_SRAM(u32 addr, u8 val)
*(u8*)&SRAM[addr] = val;
// TODO: optimize this!!
- Platform::WriteGBASave(SRAM.get(), SRAMLength, addr, 1);
+ Platform::WriteGBASave(SRAM.get(), SRAMLength, addr, 1, UserData);
}
}
-CartGameSolarSensor::CartGameSolarSensor(const u8* rom, u32 len, const u8* sram, u32 sramlen) :
- CartGameSolarSensor(CopyToUnique(rom, len), len, CopyToUnique(sram, sramlen), sramlen)
+CartGameSolarSensor::CartGameSolarSensor(const u8* rom, u32 len, const u8* sram, u32 sramlen, void* userdata) :
+ CartGameSolarSensor(CopyToUnique(rom, len), len, CopyToUnique(sram, sramlen), sramlen, userdata)
{
}
-CartGameSolarSensor::CartGameSolarSensor(std::unique_ptr&& rom, u32 len, std::unique_ptr&& sram, u32 sramlen) :
- CartGame(std::move(rom), len, std::move(sram), sramlen, CartType::GameSolarSensor)
+CartGameSolarSensor::CartGameSolarSensor(std::unique_ptr&& rom, u32 len, std::unique_ptr&& sram, u32 sramlen, void* userdata) :
+ CartGame(std::move(rom), len, std::move(sram), sramlen, userdata, CartType::GameSolarSensor)
{
}
@@ -581,6 +582,11 @@ int CartGameSolarSensor::SetInput(int num, bool pressed)
return -1;
}
+void CartGameSolarSensor::SetLightLevel(u8 level) noexcept
+{
+    LightLevel = std::clamp<u8>(level, 0, 10); // limit to 0..10; <u8> is explicit because mixed u8/int arguments cannot be deduced
+}
+
void CartGameSolarSensor::ProcessGPIO()
{
if (GPIO.data & 4) return; // Boktai chip select
@@ -680,7 +686,45 @@ void CartRAMExpansion::ROMWrite(u32 addr, u16 val)
}
}
-GBACartSlot::GBACartSlot(std::unique_ptr&& cart) noexcept : Cart(std::move(cart))
+CartRumblePak::CartRumblePak(void* userdata) :
+ CartCommon(RumblePak),
+ UserData(userdata) // kept so ROMWrite can hand it to the Platform::Addon_Rumble* calls
+{
+}
+
+CartRumblePak::~CartRumblePak() = default;
+
+void CartRumblePak::Reset()
+{
+ RumbleState = 0; // forget the last value stored by ROMWrite
+}
+
+void CartRumblePak::DoSavestate(Savestate* file)
+{
+ CartCommon::DoSavestate(file); // base-class state is handled first
+ file->Var16(&RumbleState); // followed by the 16-bit rumble latch
+}
+
+u16 CartRumblePak::ROMRead(u32 addr) const
+{
+ // A1 is pulled low on a real Rumble Pak, so return the
+ // necessary detection value here,
+ // and let the existing open bus implementation take care of the rest
+ return 0xFFFD;
+}
+
+void CartRumblePak::ROMWrite(u32 addr, u16 val)
+{
+    addr &= 0x01FFFFFF; // masked, but not used any further below
+    // a write that repeats the latched value leaves the motor alone
+    if (RumbleState == val) return;
+
+    Platform::Addon_RumbleStop(UserData);
+    RumbleState = val;
+    Platform::Addon_RumbleStart(16, UserData);
+}
+
+GBACartSlot::GBACartSlot(melonDS::NDS& nds, std::unique_ptr&& cart) noexcept : NDS(nds), Cart(std::move(cart))
{
}
@@ -723,24 +767,24 @@ void GBACartSlot::DoSavestate(Savestate* file) noexcept
if (Cart) Cart->DoSavestate(file);
}
-std::unique_ptr ParseROM(std::unique_ptr&& romdata, u32 romlen)
+std::unique_ptr ParseROM(std::unique_ptr&& romdata, u32 romlen, void* userdata)
{
- return ParseROM(std::move(romdata), romlen, nullptr, 0);
+ return ParseROM(std::move(romdata), romlen, nullptr, 0, userdata);
}
-std::unique_ptr ParseROM(const u8* romdata, u32 romlen, const u8* sramdata, u32 sramlen)
+std::unique_ptr ParseROM(const u8* romdata, u32 romlen, const u8* sramdata, u32 sramlen, void* userdata)
{
auto [romcopy, romcopylen] = PadToPowerOf2(romdata, romlen);
- return ParseROM(std::move(romcopy), romcopylen, CopyToUnique(sramdata, sramlen), sramlen);
+ return ParseROM(std::move(romcopy), romcopylen, CopyToUnique(sramdata, sramlen), sramlen, userdata);
}
-std::unique_ptr ParseROM(const u8* romdata, u32 romlen)
+std::unique_ptr ParseROM(const u8* romdata, u32 romlen, void* userdata)
{
- return ParseROM(romdata, romlen, nullptr, 0);
+ return ParseROM(romdata, romlen, nullptr, 0, userdata);
}
-std::unique_ptr ParseROM(std::unique_ptr&& romdata, u32 romlen, std::unique_ptr&& sramdata, u32 sramlen)
+std::unique_ptr ParseROM(std::unique_ptr&& romdata, u32 romlen, std::unique_ptr&& sramdata, u32 sramlen, void* userdata)
{
if (romdata == nullptr)
{
@@ -773,9 +817,9 @@ std::unique_ptr ParseROM(std::unique_ptr&& romdata, u32 romlen
std::unique_ptr cart;
if (solarsensor)
- cart = std::make_unique(std::move(cartrom), cartromsize, std::move(sramdata), sramlen);
+ cart = std::make_unique(std::move(cartrom), cartromsize, std::move(sramdata), sramlen, userdata);
else
- cart = std::make_unique(std::move(cartrom), cartromsize, std::move(sramdata), sramlen);
+ cart = std::make_unique(std::move(cartrom), cartromsize, std::move(sramdata), sramlen, userdata);
cart->Reset();
@@ -794,7 +838,7 @@ void GBACartSlot::SetCart(std::unique_ptr&& cart) noexcept
if (!Cart)
{
- Log(LogLevel::Info, "Ejected GBA cart");
+ Log(LogLevel::Info, "Ejected GBA cart\n");
return;
}
@@ -820,13 +864,16 @@ void GBACartSlot::SetSaveMemory(const u8* savedata, u32 savelen) noexcept
}
}
-void GBACartSlot::LoadAddon(int type) noexcept
+void GBACartSlot::LoadAddon(void* userdata, int type) noexcept
{
switch (type)
{
case GBAAddon_RAMExpansion:
Cart = std::make_unique();
break;
+ case GBAAddon_RumblePak:
+ Cart = std::make_unique(userdata);
+ break;
default:
Log(LogLevel::Warn, "GBACart: !! invalid addon type %d\n", type);
@@ -875,4 +922,4 @@ void GBACartSlot::SRAMWrite(u32 addr, u8 val) noexcept
}
-}
\ No newline at end of file
+}
diff --git a/src/GBACart.h b/src/GBACart.h
index 493bf6b8..726a234d 100644
--- a/src/GBACart.h
+++ b/src/GBACart.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -32,6 +32,7 @@ enum CartType
Game = 0x101,
GameSolarSensor = 0x102,
RAMExpansion = 0x201,
+ RumblePak = 0x202,
};
// CartCommon -- base code shared by all cart types
@@ -72,8 +73,8 @@ private:
class CartGame : public CartCommon
{
public:
- CartGame(const u8* rom, u32 len, const u8* sram, u32 sramlen, GBACart::CartType type = GBACart::CartType::Game);
- CartGame(std::unique_ptr&& rom, u32 len, std::unique_ptr&& sram, u32 sramlen, GBACart::CartType type = GBACart::CartType::Game);
+ CartGame(const u8* rom, u32 len, const u8* sram, u32 sramlen, void* userdata, GBACart::CartType type = GBACart::CartType::Game);
+ CartGame(std::unique_ptr&& rom, u32 len, std::unique_ptr&& sram, u32 sramlen, void* userdata, GBACart::CartType type = GBACart::CartType::Game);
~CartGame() override;
u32 Checksum() const override;
@@ -104,6 +105,8 @@ protected:
u8 SRAMRead_SRAM(u32 addr);
void SRAMWrite_SRAM(u32 addr, u8 val);
+ void* UserData;
+
std::unique_ptr ROM;
u32 ROMLength;
@@ -147,14 +150,16 @@ private:
class CartGameSolarSensor : public CartGame
{
public:
- CartGameSolarSensor(const u8* rom, u32 len, const u8* sram, u32 sramlen);
- CartGameSolarSensor(std::unique_ptr&& rom, u32 len, std::unique_ptr&& sram, u32 sramlen);
+ CartGameSolarSensor(const u8* rom, u32 len, const u8* sram, u32 sramlen, void* userdata);
+ CartGameSolarSensor(std::unique_ptr&& rom, u32 len, std::unique_ptr&& sram, u32 sramlen, void* userdata);
void Reset() override;
void DoSavestate(Savestate* file) override;
int SetInput(int num, bool pressed) override;
+ void SetLightLevel(u8 level) noexcept;
+ [[nodiscard]] u8 GetLightLevel() const noexcept { return LightLevel; }
protected:
void ProcessGPIO() override;
@@ -187,6 +192,25 @@ private:
u16 RAMEnable = 0;
};
+// CartRumblePak -- DS Rumble Pak (used in various NDS games)
+class CartRumblePak : public CartCommon
+{
+public:
+ CartRumblePak(void* userdata); // userdata is stored in UserData
+ ~CartRumblePak() override;
+
+ void Reset() override; // clears RumbleState
+
+ void DoSavestate(Savestate* file) override;
+
+ u16 ROMRead(u32 addr) const override; // always returns the detection value 0xFFFD
+ void ROMWrite(u32 addr, u16 val) override; // restarts the rumble when val differs from RumbleState
+
+private:
+ void* UserData; // passed to Platform::Addon_RumbleStart/Addon_RumbleStop
+ u16 RumbleState = 0; // last value written through ROMWrite
+};
+
// possible inputs for GBA carts that might accept user input
enum
{
@@ -197,7 +221,7 @@ enum
class GBACartSlot
{
public:
- GBACartSlot(std::unique_ptr&& cart = nullptr) noexcept;
+ GBACartSlot(melonDS::NDS& nds, std::unique_ptr&& cart = nullptr) noexcept;
~GBACartSlot() noexcept = default;
void Reset() noexcept;
void DoSavestate(Savestate* file) noexcept;
@@ -217,7 +241,7 @@ public:
[[nodiscard]] CartCommon* GetCart() noexcept { return Cart.get(); }
[[nodiscard]] const CartCommon* GetCart() const noexcept { return Cart.get(); }
- void LoadAddon(int type) noexcept;
+ void LoadAddon(void* userdata, int type) noexcept;
/// @return The cart that was in the cart slot if any,
/// or \c nullptr if the cart slot was empty.
@@ -258,6 +282,7 @@ public:
/// if a cart is loaded and supports SRAM, otherwise zero.
[[nodiscard]] u32 GetSaveMemoryLength() const noexcept { return Cart ? Cart->GetSaveMemoryLength() : 0; }
private:
+ melonDS::NDS& NDS;
std::unique_ptr Cart = nullptr;
u16 OpenBusDecay = 0;
};
@@ -270,9 +295,9 @@ private:
/// @param romlen The length of the ROM data in bytes.
/// @returns A \c GBACart::CartCommon object representing the parsed ROM,
/// or \c nullptr if the ROM data couldn't be parsed.
-std::unique_ptr ParseROM(const u8* romdata, u32 romlen);
-std::unique_ptr ParseROM(std::unique_ptr&& romdata, u32 romlen);
-std::unique_ptr ParseROM(const u8* romdata, u32 romlen, const u8* sramdata, u32 sramlen);
+std::unique_ptr ParseROM(const u8* romdata, u32 romlen, void* userdata = nullptr);
+std::unique_ptr ParseROM(std::unique_ptr&& romdata, u32 romlen, void* userdata = nullptr);
+std::unique_ptr ParseROM(const u8* romdata, u32 romlen, const u8* sramdata, u32 sramlen, void* userdata = nullptr);
/// @param romdata The ROM data to parse. Will be moved-from.
/// @param romlen Length of romdata in bytes.
@@ -282,7 +307,7 @@ std::unique_ptr ParseROM(const u8* romdata, u32 romlen, const u8* sr
/// May be zero, in which case the cart will have no save data.
/// @return Unique pointer to the parsed GBA cart,
/// or \c nullptr if there was an error.
-std::unique_ptr ParseROM(std::unique_ptr&& romdata, u32 romlen, std::unique_ptr&& sramdata, u32 sramlen);
+std::unique_ptr ParseROM(std::unique_ptr&& romdata, u32 romlen, std::unique_ptr&& sramdata, u32 sramlen, void* userdata = nullptr);
}
diff --git a/src/GPU.cpp b/src/GPU.cpp
index f23e641e..f24d8ab5 100644
--- a/src/GPU.cpp
+++ b/src/GPU.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -23,7 +23,7 @@
#include "ARMJIT.h"
#include "GPU2D_Soft.h"
-#include "GPU3D_Soft.h"
+#include "GPU3D.h"
namespace melonDS
{
diff --git a/src/GPU.h b/src/GPU.h
index 780d5e01..5c373ca8 100644
--- a/src/GPU.h
+++ b/src/GPU.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -499,6 +499,17 @@ public:
OAMDirty |= 1 << (addr / 1024);
}
+ template
+ inline T ReadVRAMFlat_Texture(u32 addr) const
+ {
+ return *(T*)&VRAMFlat_Texture[addr & 0x7FFFF]; // addr wraps at 0x80000; the bytes there are reinterpreted as T
+ }
+ template
+ inline T ReadVRAMFlat_TexPal(u32 addr) const
+ {
+ return *(T*)&VRAMFlat_TexPal[addr & 0x1FFFF]; // addr wraps at 0x20000; the bytes there are reinterpreted as T
+ }
+
void SetPowerCnt(u32 val) noexcept;
void StartFrame() noexcept;
diff --git a/src/GPU2D.cpp b/src/GPU2D.cpp
index e0aa630d..e76e85c1 100644
--- a/src/GPU2D.cpp
+++ b/src/GPU2D.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -20,6 +20,7 @@
#include
#include "NDS.h"
#include "GPU.h"
+#include "GPU3D.h"
namespace melonDS
{
@@ -386,6 +387,14 @@ void Unit::Write16(u32 addr, u16 val)
if (!Num) GPU.GPU3D.SetRenderXPos(val);
break;
+ case 0x064:
+ CaptureCnt = (CaptureCnt & 0xFFFF0000) | (val & 0xEF3F1F1F);
+ return;
+
+ case 0x066:
+ CaptureCnt = (CaptureCnt & 0xFFFF) | ((val << 16) & 0xEF3F1F1F);
+ return;
+
case 0x068:
DispFIFO[DispFIFOWritePtr] = val;
return;
diff --git a/src/GPU2D.h b/src/GPU2D.h
index e87167cb..f56167a1 100644
--- a/src/GPU2D.h
+++ b/src/GPU2D.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/GPU2D_Soft.cpp b/src/GPU2D_Soft.cpp
index e01d3665..6ad2cd3e 100644
--- a/src/GPU2D_Soft.cpp
+++ b/src/GPU2D_Soft.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -18,7 +18,7 @@
#include "GPU2D_Soft.h"
#include "GPU.h"
-#include "GPU3D_OpenGL.h"
+#include "GPU3D.h"
namespace melonDS
{
@@ -254,7 +254,11 @@ void SoftRenderer::DrawScanline(u32 line, Unit* unit)
if (GPU.GPU3D.IsRendererAccelerated())
{
- dst[256*3] = masterBrightness | (CurUnit->DispCnt & 0x30000);
+ u32 xpos = GPU.GPU3D.GetRenderXPos();
+
+ dst[256*3] = masterBrightness |
+ (CurUnit->DispCnt & 0x30000) |
+ (xpos << 24) | ((xpos & 0x100) << 15);
return;
}
@@ -1503,7 +1507,7 @@ void SoftRenderer::ApplySpriteMosaicX()
u32* objLine = OBJLine[CurUnit->Num];
- u8* curOBJXMosaicTable = MosaicTable[CurUnit->OBJMosaicSize[1]].data();
+ u8* curOBJXMosaicTable = MosaicTable[CurUnit->OBJMosaicSize[0]].data();
u32 lastcolor = objLine[0];
diff --git a/src/GPU2D_Soft.h b/src/GPU2D_Soft.h
index befb67f6..d9942f61 100644
--- a/src/GPU2D_Soft.h
+++ b/src/GPU2D_Soft.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp
index 47abae2f..4a1426aa 100644
--- a/src/GPU3D.cpp
+++ b/src/GPU3D.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -24,6 +24,7 @@
#include "FIFO.h"
#include "GPU3D_Soft.h"
#include "Platform.h"
+#include "GPU3D.h"
namespace melonDS
{
@@ -191,7 +192,7 @@ void GPU3D::Reset() noexcept
CmdStallQueue.Clear();
- ZeroDotWLimit = 0; // CHECKME
+ ZeroDotWLimit = 0xFFFFFF;
GXStat = 0;
@@ -273,6 +274,8 @@ void GPU3D::Reset() noexcept
memset(MatEmission, 0, sizeof(MatSpecular));
UseShininessTable = false;
+ // Shininess table seems to be uninitialized garbage, at least on n3dsxl hw?
+ // Also doesn't seem to be cleared properly unless the system is fully powered off?
memset(ShininessTable, 0, sizeof(ShininessTable));
PolygonAttr = 0;
@@ -1278,7 +1281,7 @@ void GPU3D::SubmitPolygon() noexcept
{
Vertex* vtx = poly->Vertices[i];
- if (vtx->FinalPosition[1] < ytop || (vtx->FinalPosition[1] == ytop && vtx->FinalPosition[0] < xtop))
+ if (vtx->FinalPosition[1] < ytop)
{
xtop = vtx->FinalPosition[0];
ytop = vtx->FinalPosition[1];
@@ -1458,67 +1461,86 @@ void GPU3D::CalculateLighting() noexcept
TexCoords[1] = RawTexCoords[1] + (((s64)Normal[0]*TexMatrix[1] + (s64)Normal[1]*TexMatrix[5] + (s64)Normal[2]*TexMatrix[9]) >> 21);
}
- s32 normaltrans[3];
- normaltrans[0] = (Normal[0]*VecMatrix[0] + Normal[1]*VecMatrix[4] + Normal[2]*VecMatrix[8]) >> 12;
- normaltrans[1] = (Normal[0]*VecMatrix[1] + Normal[1]*VecMatrix[5] + Normal[2]*VecMatrix[9]) >> 12;
- normaltrans[2] = (Normal[0]*VecMatrix[2] + Normal[1]*VecMatrix[6] + Normal[2]*VecMatrix[10]) >> 12;
-
- VertexColor[0] = MatEmission[0];
- VertexColor[1] = MatEmission[1];
- VertexColor[2] = MatEmission[2];
+ s32 normaltrans[3]; // should be 1 bit sign 10 bits frac
+ normaltrans[0] = ((Normal[0]*VecMatrix[0] + Normal[1]*VecMatrix[4] + Normal[2]*VecMatrix[8]) << 9) >> 21;
+ normaltrans[1] = ((Normal[0]*VecMatrix[1] + Normal[1]*VecMatrix[5] + Normal[2]*VecMatrix[9]) << 9) >> 21;
+ normaltrans[2] = ((Normal[0]*VecMatrix[2] + Normal[1]*VecMatrix[6] + Normal[2]*VecMatrix[10]) << 9) >> 21;
s32 c = 0;
+ u32 vtxbuff[3] =
+ {
+ (u32)MatEmission[0] << 14,
+ (u32)MatEmission[1] << 14,
+ (u32)MatEmission[2] << 14
+ };
for (int i = 0; i < 4; i++)
{
if (!(CurPolygonAttr & (1<1)
- // according to some hardware tests
- // * diffuse level is saturated to 255
- // * shininess level mirrors back to 0 and is ANDed with 0xFF, that before being squared
- // TODO: check how it behaves when the computed shininess is >=0x200
+ // (credit to azusa for working out most of the details of the diff. algorithm, and essentially the entire spec. algorithm)
+
+ // calculate dot product
+ // bottom 9 bits are discarded after multiplying and before adding
+ s32 dot = ((LightDirection[i][0]*normaltrans[0]) >> 9) +
+ ((LightDirection[i][1]*normaltrans[1]) >> 9) +
+ ((LightDirection[i][2]*normaltrans[2]) >> 9);
- s32 difflevel = (-(LightDirection[i][0]*normaltrans[0] +
- LightDirection[i][1]*normaltrans[1] +
- LightDirection[i][2]*normaltrans[2])) >> 10;
- if (difflevel < 0) difflevel = 0;
- else if (difflevel > 255) difflevel = 255;
+ s32 shinelevel;
+ if (dot > 0)
+ {
+ // -- diffuse lighting --
+
+ // convert dot to signed 11 bit int
+ // then we truncate the result of the multiplications to an unsigned 20 bits before adding to the vtx color
+ s32 diffdot = (dot << 21) >> 21;
+ vtxbuff[0] += (MatDiffuse[0] * LightColor[i][0] * diffdot) & 0xFFFFF;
+ vtxbuff[1] += (MatDiffuse[1] * LightColor[i][1] * diffdot) & 0xFFFFF;
+ vtxbuff[2] += (MatDiffuse[2] * LightColor[i][2] * diffdot) & 0xFFFFF;
- s32 shinelevel = -(((LightDirection[i][0]>>1)*normaltrans[0] +
- (LightDirection[i][1]>>1)*normaltrans[1] +
- ((LightDirection[i][2]-0x200)>>1)*normaltrans[2]) >> 10);
- if (shinelevel < 0) shinelevel = 0;
- else if (shinelevel > 255) shinelevel = (0x100 - shinelevel) & 0xFF;
- shinelevel = ((shinelevel * shinelevel) >> 7) - 0x100; // really (2*shinelevel*shinelevel)-1
- if (shinelevel < 0) shinelevel = 0;
+ // -- specular lighting --
+
+ // reuse the dot product from diffuse lighting
+ dot += normaltrans[2];
+ // convert to s11, then square it, and truncate to 10 bits
+ dot = (dot << 21) >> 21;
+ dot = ((dot * dot) >> 10) & 0x3FF;
+
+ // multiply dot and reciprocal, then subtract '1'
+ shinelevel = ((dot * SpecRecip[i]) >> 8) - (1<<9);
+
+ if (shinelevel < 0) shinelevel = 0;
+ else
+ {
+ // sign extend to convert to signed 14 bit integer
+ shinelevel = (shinelevel << 18) >> 18;
+ if (shinelevel < 0) shinelevel = 0; // for some reason there seems to be a redundant check for <0?
+ else if (shinelevel > 0x1FF) shinelevel = 0x1FF;
+ }
+ }
+ else shinelevel = 0;
+
+ // convert shinelevel to use for lookup in the shininess table if enabled.
if (UseShininessTable)
{
- // checkme
- shinelevel >>= 1;
+ shinelevel >>= 2;
shinelevel = ShininessTable[shinelevel];
+ shinelevel <<= 1;
}
- VertexColor[0] += ((MatSpecular[0] * LightColor[i][0] * shinelevel) >> 13);
- VertexColor[0] += ((MatDiffuse[0] * LightColor[i][0] * difflevel) >> 13);
- VertexColor[0] += ((MatAmbient[0] * LightColor[i][0]) >> 5);
-
- VertexColor[1] += ((MatSpecular[1] * LightColor[i][1] * shinelevel) >> 13);
- VertexColor[1] += ((MatDiffuse[1] * LightColor[i][1] * difflevel) >> 13);
- VertexColor[1] += ((MatAmbient[1] * LightColor[i][1]) >> 5);
-
- VertexColor[2] += ((MatSpecular[2] * LightColor[i][2] * shinelevel) >> 13);
- VertexColor[2] += ((MatDiffuse[2] * LightColor[i][2] * difflevel) >> 13);
- VertexColor[2] += ((MatAmbient[2] * LightColor[i][2]) >> 5);
-
- if (VertexColor[0] > 31) VertexColor[0] = 31;
- if (VertexColor[1] > 31) VertexColor[1] = 31;
- if (VertexColor[2] > 31) VertexColor[2] = 31;
+ // Note: ambient seems to be a plain bitshift
+ vtxbuff[0] += ((MatSpecular[0] * shinelevel) + (MatAmbient[0] << 9)) * LightColor[i][0];
+ vtxbuff[1] += ((MatSpecular[1] * shinelevel) + (MatAmbient[1] << 9)) * LightColor[i][1];
+ vtxbuff[2] += ((MatSpecular[2] * shinelevel) + (MatAmbient[2] << 9)) * LightColor[i][2];
c++;
}
+ VertexColor[0] = (vtxbuff[0] >> 14 > 31) ? 31 : (vtxbuff[0] >> 14);
+ VertexColor[1] = (vtxbuff[1] >> 14 > 31) ? 31 : (vtxbuff[1] >> 14);
+ VertexColor[2] = (vtxbuff[2] >> 14 > 31) ? 31 : (vtxbuff[2] >> 14);
+
if (c < 1) c = 1;
NormalPipeline = 7;
AddCycles(c);
@@ -2011,9 +2033,15 @@ void GPU3D::ExecuteCommand() noexcept
dir[0] = (s16)((entry.Param & 0x000003FF) << 6) >> 6;
dir[1] = (s16)((entry.Param & 0x000FFC00) >> 4) >> 6;
dir[2] = (s16)((entry.Param & 0x3FF00000) >> 14) >> 6;
- LightDirection[l][0] = (dir[0]*VecMatrix[0] + dir[1]*VecMatrix[4] + dir[2]*VecMatrix[8]) >> 12;
- LightDirection[l][1] = (dir[0]*VecMatrix[1] + dir[1]*VecMatrix[5] + dir[2]*VecMatrix[9]) >> 12;
- LightDirection[l][2] = (dir[0]*VecMatrix[2] + dir[1]*VecMatrix[6] + dir[2]*VecMatrix[10]) >> 12;
+ // the order of operations here is very specific: discard bottom 12 bits -> negate -> then sign extend to convert to 11 bit signed int
+ // except for when used to calculate the specular reciprocal; then it's: sign extend -> discard lsb -> negate.
+ LightDirection[l][0] = (-((dir[0]*VecMatrix[0] + dir[1]*VecMatrix[4] + dir[2]*VecMatrix[8] ) >> 12) << 21) >> 21;
+ LightDirection[l][1] = (-((dir[0]*VecMatrix[1] + dir[1]*VecMatrix[5] + dir[2]*VecMatrix[9] ) >> 12) << 21) >> 21;
+ LightDirection[l][2] = (-((dir[0]*VecMatrix[2] + dir[1]*VecMatrix[6] + dir[2]*VecMatrix[10]) >> 12) << 21) >> 21;
+ s32 den = -(((dir[0]*VecMatrix[2] + dir[1]*VecMatrix[6] + dir[2]*VecMatrix[10]) << 9) >> 21) + (1<<9);
+
+ if (den == 0) SpecRecip[l] = 0;
+ else SpecRecip[l] = (1<<18) / den;
}
AddCycles(5);
break;
diff --git a/src/GPU3D.h b/src/GPU3D.h
index 4a5fe6e0..d10df55f 100644
--- a/src/GPU3D.h
+++ b/src/GPU3D.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -197,7 +197,7 @@ public:
FIFO CmdStallQueue {};
- u32 ZeroDotWLimit = 0;
+ u32 ZeroDotWLimit = 0xFFFFFF;
u32 GXStat = 0;
@@ -286,6 +286,7 @@ public:
s16 Normal[3] {};
s16 LightDirection[4][3] {};
+ s32 SpecRecip[4] {};
u8 LightColor[4][3] {};
u8 MatDiffuse[3] {};
u8 MatAmbient[3] {};
@@ -349,7 +350,14 @@ public:
virtual void RestartFrame(GPU& gpu) {};
virtual u32* GetLine(int line) = 0;
virtual void Blit(const GPU& gpu) {};
+
+ virtual void SetupAccelFrame() {}
virtual void PrepareCaptureFrame() {}
+ virtual void BindOutputTexture(int buffer) {}
+
+ virtual bool NeedsShaderCompile() { return false; }
+ virtual void ShaderCompileStep(int& current, int& count) {}
+
protected:
Renderer3D(bool Accelerated);
};
diff --git a/src/GPU3D_Compute.cpp b/src/GPU3D_Compute.cpp
new file mode 100644
index 00000000..346a6a53
--- /dev/null
+++ b/src/GPU3D_Compute.cpp
@@ -0,0 +1,1137 @@
+/*
+ Copyright 2016-2024 melonDS team
+
+ This file is part of melonDS.
+
+ melonDS is free software: you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free
+ Software Foundation, either version 3 of the License, or (at your option)
+ any later version.
+
+ melonDS is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with melonDS. If not, see http://www.gnu.org/licenses/.
+*/
+
+#include "GPU3D_Compute.h"
+
+#include
+
+#include "OpenGLSupport.h"
+
+#include "GPU3D_Compute_shaders.h"
+
+namespace melonDS
+{
+
+ComputeRenderer::ComputeRenderer(GLCompositor&& compositor) // takes over the compositor by move
+ : Renderer3D(true), Texcache(TexcacheOpenGLLoader()), CurGLCompositor(std::move(compositor)) // 'true' is Renderer3D's Accelerated flag
+{} // no GL objects are created here; ComputeRenderer::New() generates them after construction
+
+bool ComputeRenderer::CompileShader(GLuint& shader, const std::string& source, const std::initializer_list<const char*>& defines)
+{
+    std::string shaderName; // comma-separated list of the defines, handed to the compiler as the program's name
+    std::string shaderSource; // assembled as: #version, one #define per entry, size constants, Common, then 'source'
+    shaderSource += "#version 430 core\n";
+    for (const char* define : defines)
+    {
+        shaderSource += "#define ";
+        shaderSource += define;
+        shaderSource += '\n';
+        shaderName += define;
+        shaderName += ',';
+    }
+    shaderSource += "#define ScreenWidth "; // the current render size is baked into the shader text
+    shaderSource += std::to_string(ScreenWidth);
+    shaderSource += "\n#define ScreenHeight ";
+    shaderSource += std::to_string(ScreenHeight);
+    shaderSource += "\n#define MaxWorkTiles ";
+    shaderSource += std::to_string(MaxWorkTiles); // NOTE(review): no '\n' is appended here; presumably Common starts with one - confirm
+
+    shaderSource += ComputeRendererShaders::Common;
+    shaderSource += source;
+
+    return OpenGL::CompileComputeProgram(shader, shaderSource.c_str(), shaderName.c_str()); // result is forwarded unchanged to the caller
+}
+
+void ComputeRenderer::ShaderCompileStep(int& current, int& count) // compiles exactly one shader program per call
+{
+ current = ShaderStepIdx; // out: index of the step performed by this call
+ ShaderStepIdx++;
+ count = 33; // out: total number of steps; must stay in sync with the case labels 0..32 below
+ switch (current)
+ {
+ case 0:
+ CompileShader(ShaderInterpXSpans[0], ComputeRendererShaders::InterpSpans, {"InterpSpans", "ZBuffer"}); // note: the bool result of CompileShader is not checked in any step
+ return;
+ case 1:
+ CompileShader(ShaderInterpXSpans[1], ComputeRendererShaders::InterpSpans, {"InterpSpans", "WBuffer"}); // index [0] = ZBuffer variant, [1] = WBuffer variant throughout
+ return;
+ case 2:
+ CompileShader(ShaderBinCombined, ComputeRendererShaders::BinCombined, {"BinCombined"});
+ return;
+ case 3:
+ CompileShader(ShaderDepthBlend[0], ComputeRendererShaders::DepthBlend, {"DepthBlend", "ZBuffer"});
+ return;
+ case 4:
+ CompileShader(ShaderDepthBlend[1], ComputeRendererShaders::DepthBlend, {"DepthBlend", "WBuffer"});
+ return;
+ case 5:
+ CompileShader(ShaderRasteriseNoTexture[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture"});
+ return;
+ case 6:
+ CompileShader(ShaderRasteriseNoTexture[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture"});
+ return;
+ case 7:
+ CompileShader(ShaderRasteriseNoTextureToon[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture", "Toon"});
+ return;
+ case 8:
+ CompileShader(ShaderRasteriseNoTextureToon[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture", "Toon"});
+ return;
+ case 9:
+ CompileShader(ShaderRasteriseNoTextureHighlight[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture", "Highlight"});
+ return;
+ case 10:
+ CompileShader(ShaderRasteriseNoTextureHighlight[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture", "Highlight"});
+ return;
+ case 11:
+ CompileShader(ShaderRasteriseUseTextureDecal[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Decal"});
+ return;
+ case 12:
+ CompileShader(ShaderRasteriseUseTextureDecal[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Decal"});
+ return;
+ case 13:
+ CompileShader(ShaderRasteriseUseTextureModulate[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Modulate"});
+ return;
+ case 14:
+ CompileShader(ShaderRasteriseUseTextureModulate[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Modulate"});
+ return;
+ case 15:
+ CompileShader(ShaderRasteriseUseTextureToon[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Toon"});
+ return;
+ case 16:
+ CompileShader(ShaderRasteriseUseTextureToon[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Toon"});
+ return;
+ case 17:
+ CompileShader(ShaderRasteriseUseTextureHighlight[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Highlight"});
+ return;
+ case 18:
+ CompileShader(ShaderRasteriseUseTextureHighlight[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Highlight"});
+ return;
+ case 19:
+ CompileShader(ShaderRasteriseShadowMask[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "ShadowMask"});
+ return;
+ case 20:
+ CompileShader(ShaderRasteriseShadowMask[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "ShadowMask"});
+ return;
+ case 21:
+ CompileShader(ShaderClearCoarseBinMask, ComputeRendererShaders::ClearCoarseBinMask, {"ClearCoarseBinMask"});
+ return;
+ case 22:
+ CompileShader(ShaderClearIndirectWorkCount, ComputeRendererShaders::ClearIndirectWorkCount, {"ClearIndirectWorkCount"});
+ return;
+ case 23:
+ CompileShader(ShaderCalculateWorkListOffset, ComputeRendererShaders::CalcOffsets, {"CalculateWorkOffsets"});
+ return;
+ case 24:
+ CompileShader(ShaderSortWork, ComputeRendererShaders::SortWork, {"SortWork"});
+ return;
+ case 25:
+ CompileShader(ShaderFinalPass[0], ComputeRendererShaders::FinalPass, {"FinalPass"}); // FinalPass index bits: 1 = EdgeMarking, 2 = Fog, 4 = AntiAliasing
+ return;
+ case 26:
+ CompileShader(ShaderFinalPass[1], ComputeRendererShaders::FinalPass, {"FinalPass", "EdgeMarking"});
+ return;
+ case 27:
+ CompileShader(ShaderFinalPass[2], ComputeRendererShaders::FinalPass, {"FinalPass", "Fog"});
+ return;
+ case 28:
+ CompileShader(ShaderFinalPass[3], ComputeRendererShaders::FinalPass, {"FinalPass", "EdgeMarking", "Fog"});
+ return;
+ case 29:
+ CompileShader(ShaderFinalPass[4], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing"});
+ return;
+ case 30:
+ CompileShader(ShaderFinalPass[5], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "EdgeMarking"});
+ return;
+ case 31:
+ CompileShader(ShaderFinalPass[6], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "Fog"});
+ return;
+ case 32:
+ CompileShader(ShaderFinalPass[7], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "EdgeMarking", "Fog"});
+ return;
+ default: // reached only if this is called while ShaderStepIdx is outside 0..32
+ __builtin_unreachable(); // compiler builtin (GCC/Clang); callers must stop once all 33 steps have run
+ return;
+ }
+}
+
+void blah(GLenum source, GLenum type, GLuint id, GLenum severity, GLsizei length, const GLchar *message, const void *userParam)
+{ // GL debug-message callback; its only registration (in ComputeRenderer::New) is commented out
+    puts(message); // writes the message followed by a newline to stdout; all other arguments are ignored
+}
+
+std::unique_ptr<ComputeRenderer> ComputeRenderer::New()
+{
+    std::optional<GLCompositor> compositor = GLCompositor::New();
+    if (!compositor)
+        return nullptr; // without a compositor no renderer can be built
+
+    std::unique_ptr<ComputeRenderer> result = std::unique_ptr<ComputeRenderer>(new ComputeRenderer(std::move(*compositor)));
+
+    //glDebugMessageCallback(blah, NULL);
+    //glEnable(GL_DEBUG_OUTPUT);
+    glGenBuffers(1, &result->YSpanSetupMemory); // fixed-size buffers are allocated right here
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, result->YSpanSetupMemory);
+    glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupY)*MaxYSpanSetups, nullptr, GL_DYNAMIC_DRAW);
+
+    glGenBuffers(1, &result->RenderPolygonMemory);
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, result->RenderPolygonMemory);
+    glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(RenderPolygon)*2048, nullptr, GL_DYNAMIC_DRAW);
+
+    glGenBuffers(1, &result->XSpanSetupMemory); // the buffers below only get names here; SetRenderSettings() sizes them
+    glGenBuffers(1, &result->BinResultMemory);
+    glGenBuffers(1, &result->FinalTileMemory);
+    glGenBuffers(1, &result->YSpanIndicesTextureMemory);
+    glGenBuffers(tilememoryLayer_Num, result->TileMemory);
+    glGenBuffers(1, &result->WorkDescMemory);
+
+    glGenTextures(1, &result->YSpanIndicesTexture);
+    glGenTextures(1, &result->LowResFramebuffer);
+    glBindTexture(GL_TEXTURE_2D, result->LowResFramebuffer);
+    glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8UI, 256, 192); // always 256x192, independent of the scale factor
+
+    glGenBuffers(1, &result->MetaUniformMemory);
+    glBindBuffer(GL_UNIFORM_BUFFER, result->MetaUniformMemory);
+    glBufferData(GL_UNIFORM_BUFFER, sizeof(MetaUniform), nullptr, GL_DYNAMIC_DRAW);
+
+    glGenSamplers(9, result->Samplers); // one sampler per (S wrap, T wrap) pair, stored at index s + t*3
+    for (u32 j = 0; j < 3; j++)
+    {
+        for (u32 i = 0; i < 3; i++)
+        {
+            const GLenum translateWrapMode[3] = {GL_CLAMP_TO_EDGE, GL_REPEAT, GL_MIRRORED_REPEAT};
+            glSamplerParameteri(result->Samplers[i+j*3], GL_TEXTURE_WRAP_S, translateWrapMode[i]);
+            glSamplerParameteri(result->Samplers[i+j*3], GL_TEXTURE_WRAP_T, translateWrapMode[j]);
+            glSamplerParameteri(result->Samplers[i+j*3], GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+            glSamplerParameteri(result->Samplers[i+j*3], GL_TEXTURE_MAG_FILTER, GL_NEAREST); // enum-valued parameter: use the integer entry point like the three calls above
+        }
+    }
+
+    glGenBuffers(1, &result->PixelBuffer);
+    glBindBuffer(GL_PIXEL_PACK_BUFFER, result->PixelBuffer);
+    glBufferData(GL_PIXEL_PACK_BUFFER, 256*192*4, NULL, GL_DYNAMIC_READ); // 256x192 pixels, 4 bytes each
+
+    return result;
+}
+
+ComputeRenderer::~ComputeRenderer() // releases the GL objects created by New() and SetRenderSettings()
+{
+    Texcache.Reset();
+
+    glDeleteBuffers(1, &YSpanSetupMemory);
+    glDeleteBuffers(1, &RenderPolygonMemory);
+    glDeleteBuffers(1, &XSpanSetupMemory);
+    glDeleteBuffers(1, &BinResultMemory);
+    glDeleteBuffers(tilememoryLayer_Num, TileMemory);
+    glDeleteBuffers(1, &WorkDescMemory);
+    glDeleteBuffers(1, &FinalTileMemory);
+    glDeleteBuffers(1, &YSpanIndicesTextureMemory);
+    glDeleteTextures(1, &YSpanIndicesTexture);
+    glDeleteTextures(1, &Framebuffer);
+    glDeleteTextures(1, &LowResFramebuffer); // generated in New(); was previously never released
+    glDeleteBuffers(1, &MetaUniformMemory);
+    glDeleteSamplers(9, Samplers);
+    glDeleteBuffers(1, &PixelBuffer); // NOTE(review): shader programs are not deleted here (see DeleteShaders) - confirm that is intended
+}
+
+void ComputeRenderer::DeleteShaders() // deletes every program that ShaderCompileStep() can create
+{
+    std::initializer_list<GLuint> allPrograms = // keep in sync with the 33 steps of ShaderCompileStep()
+    {
+        ShaderInterpXSpans[0],
+        ShaderInterpXSpans[1],
+        ShaderBinCombined,
+        ShaderDepthBlend[0],
+        ShaderDepthBlend[1],
+        ShaderRasteriseNoTexture[0],
+        ShaderRasteriseNoTexture[1],
+        ShaderRasteriseNoTextureToon[0],
+        ShaderRasteriseNoTextureToon[1],
+        ShaderRasteriseNoTextureHighlight[0],
+        ShaderRasteriseNoTextureHighlight[1],
+        ShaderRasteriseUseTextureDecal[0],
+        ShaderRasteriseUseTextureDecal[1],
+        ShaderRasteriseUseTextureModulate[0],
+        ShaderRasteriseUseTextureModulate[1],
+        ShaderRasteriseUseTextureToon[0],
+        ShaderRasteriseUseTextureToon[1],
+        ShaderRasteriseUseTextureHighlight[0],
+        ShaderRasteriseUseTextureHighlight[1],
+        ShaderRasteriseShadowMask[0],
+        ShaderRasteriseShadowMask[1],
+        ShaderClearCoarseBinMask,
+        ShaderClearIndirectWorkCount,
+        ShaderCalculateWorkListOffset,
+        ShaderSortWork,
+        ShaderFinalPass[0],
+        ShaderFinalPass[1],
+        ShaderFinalPass[2],
+        ShaderFinalPass[3],
+        ShaderFinalPass[4],
+        ShaderFinalPass[5],
+        ShaderFinalPass[6],
+        ShaderFinalPass[7],
+    };
+    for (GLuint program : allPrograms)
+        glDeleteProgram(program); // the member handles keep their old values; SetRenderSettings() restarts compilation afterwards
+}
+
+void ComputeRenderer::Reset(GPU& gpu) // 'gpu' is not used by this implementation
+{
+ Texcache.Reset(); // the texture cache is the only state reset here
+}
+
+void ComputeRenderer::SetRenderSettings(int scale, bool highResolutionCoordinates) // (re)sizes every scale-dependent GL resource
+{
+ CurGLCompositor.SetScaleFactor(scale);
+
+ if (ScaleFactor != -1) // presumably -1 means "no settings applied yet" - confirm against the member's initial value
+ {
+ DeleteShaders();
+ }
+
+ ShaderStepIdx = 0; // restart ShaderCompileStep(): CompileShader() bakes ScreenWidth/ScreenHeight/MaxWorkTiles into the source
+
+ ScaleFactor = scale;
+ ScreenWidth = 256 * ScaleFactor;
+ ScreenHeight = 192 * ScaleFactor;
+
+ TilesPerLine = ScreenWidth/TileSize;
+ TileLines = ScreenHeight/TileSize;
+
+ HiresCoordinates = highResolutionCoordinates;
+
+ MaxWorkTiles = TilesPerLine*TileLines*16; // budget of 16 work tiles per screen tile
+
+ for (int i = 0; i < tilememoryLayer_Num; i++)
+ {
+ glBindBuffer(GL_SHADER_STORAGE_BUFFER, TileMemory[i]);
+ glBufferData(GL_SHADER_STORAGE_BUFFER, 4*TileSize*TileSize*MaxWorkTiles, nullptr, GL_DYNAMIC_DRAW); // 4 bytes per pixel of every work tile
+ }
+
+ glBindBuffer(GL_SHADER_STORAGE_BUFFER, FinalTileMemory);
+ glBufferData(GL_SHADER_STORAGE_BUFFER, 4*3*2*ScreenWidth*ScreenHeight, nullptr, GL_DYNAMIC_DRAW);
+
+ int binResultSize = sizeof(BinResultHeader)
+ + TilesPerLine*TileLines*CoarseBinStride*4 // BinnedMaskCoarse
+ + TilesPerLine*TileLines*BinStride*4 // BinnedMask
+ + TilesPerLine*TileLines*BinStride*4; // WorkOffsets
+ glBindBuffer(GL_SHADER_STORAGE_BUFFER, BinResultMemory);
+ glBufferData(GL_SHADER_STORAGE_BUFFER, binResultSize, nullptr, GL_DYNAMIC_DRAW);
+
+ glBindBuffer(GL_SHADER_STORAGE_BUFFER, WorkDescMemory);
+ glBufferData(GL_SHADER_STORAGE_BUFFER, MaxWorkTiles*2*4*2, nullptr, GL_DYNAMIC_DRAW);
+
+ if (Framebuffer != 0) // storage from glTexStorage2D is immutable, so the texture is recreated instead of resized
+ glDeleteTextures(1, &Framebuffer);
+ glGenTextures(1, &Framebuffer);
+ glBindTexture(GL_TEXTURE_2D, Framebuffer);
+ glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, ScreenWidth, ScreenHeight);
+
+ // eh those are pretty bad guesses
+ // though real hw shouldn't be able to render all 2048 polygons on every line either
+ int maxYSpanIndices = 64*2048 * ScaleFactor;
+ YSpanIndices.resize(maxYSpanIndices);
+
+ glBindBuffer(GL_TEXTURE_BUFFER, YSpanIndicesTextureMemory);
+ glBufferData(GL_TEXTURE_BUFFER, maxYSpanIndices*2*4, nullptr, GL_DYNAMIC_DRAW); // 8 bytes per entry, matching the GL_RGBA16UI texel format below
+
+ glBindBuffer(GL_SHADER_STORAGE_BUFFER, XSpanSetupMemory);
+ glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupX)*maxYSpanIndices, nullptr, GL_DYNAMIC_DRAW);
+
+ glBindTexture(GL_TEXTURE_BUFFER, YSpanIndicesTexture);
+ glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA16UI, YSpanIndicesTextureMemory); // expose the index buffer to shaders as a buffer texture
+}
+
+void ComputeRenderer::VCount144(GPU& gpu) // this renderer does no work in this hook; 'gpu' is unused
+{
+
+}
+
+void ComputeRenderer::SetupAttrs(SpanSetupY* span, Polygon* poly, int from, int to)
+{ // copies depth, colour and texture coordinates of both edge endpoints into the span
+    span->Z0 = poly->FinalZ[from]; // endpoint 0 <- vertex 'from'
+    span->W0 = poly->FinalW[from];
+    span->ColorR0 = poly->Vertices[from]->FinalColor[0];
+    span->ColorG0 = poly->Vertices[from]->FinalColor[1];
+    span->ColorB0 = poly->Vertices[from]->FinalColor[2];
+    span->TexcoordU0 = poly->Vertices[from]->TexCoords[0];
+    span->TexcoordV0 = poly->Vertices[from]->TexCoords[1];
+    span->Z1 = poly->FinalZ[to]; // endpoint 1 <- vertex 'to'
+    span->W1 = poly->FinalW[to];
+    span->ColorR1 = poly->Vertices[to]->FinalColor[0];
+    span->ColorG1 = poly->Vertices[to]->FinalColor[1];
+    span->ColorB1 = poly->Vertices[to]->FinalColor[2];
+    span->TexcoordU1 = poly->Vertices[to]->TexCoords[0];
+    span->TexcoordV1 = poly->Vertices[to]->TexCoords[1];
+}
+
+void ComputeRenderer::SetupYSpanDummy(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int vertex, int side, s32 positions[10][2])
+{
+    // Builds a degenerate edge whose two endpoints are the same vertex.
+    // For the right-hand edge (side != 0) the X position is moved one pixel
+    // to the left and the initial X offset is -0x40000 instead of 0.
+    const bool rightSide = (side != 0);
+    const s32 edgeX = rightSide ? positions[vertex][0] - 1 : positions[vertex][0];
+    const s32 edgeY = positions[vertex][1];
+
+    span->IsDummy = true;
+    span->DxInitial = rightSide ? -0x40000 : 0;
+
+    // both endpoints and both X bounds collapse onto the same position
+    span->X0 = span->X1 = edgeX;
+    span->XMin = edgeX;
+    span->XMax = edgeX;
+    span->Y0 = span->Y1 = edgeY;
+
+    // no slope, no interpolation range, no coverage increment
+    span->Increment = 0;
+    span->I0 = span->I1 = span->IRecip = 0;
+    span->Linear = true;
+    span->XCovIncr = 0;
+
+    // widen the polygon's horizontal bounds and remember the Y they occur at
+    if (span->XMin < rp->XMin)
+    {
+        rp->XMin = span->XMin;
+        rp->XMinY = span->Y0;
+    }
+
+    if (span->XMax > rp->XMax)
+    {
+        rp->XMax = span->XMax;
+        rp->XMaxY = span->Y0;
+    }
+
+    // only one vertex is involved, so it supplies the attributes of both endpoints
+    SetupAttrs(span, poly, vertex, vertex);
+}
+
+void ComputeRenderer::SetupYSpan(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int from, int to, int side, s32 positions[10][2]) // fills 'span' for the edge from -> to; side: 0 = left, nonzero = right
+{
+ span->X0 = positions[from][0];
+ span->X1 = positions[to][0];
+ span->Y0 = positions[from][1];
+ span->Y1 = positions[to][1];
+
+ SetupAttrs(span, poly, from, to);
+
+ s32 minXY, maxXY; // Y coordinates at which the span's minimum / maximum X occur
+ bool negative = false; // true when X decreases from vertex 'from' to vertex 'to'
+ if (span->X1 > span->X0)
+ {
+ span->XMin = span->X0;
+ span->XMax = span->X1-1;
+
+ minXY = span->Y0;
+ maxXY = span->Y1;
+ }
+ else if (span->X1 < span->X0)
+ {
+ span->XMin = span->X1;
+ span->XMax = span->X0-1;
+ negative = true;
+
+ minXY = span->Y1;
+ maxXY = span->Y0;
+ }
+ else
+ {
+ span->XMin = span->X0;
+ if (side) span->XMin--;
+ span->XMax = span->XMin;
+
+ // doesn't matter for completely vertical slope
+ minXY = span->Y0;
+ maxXY = span->Y0;
+ }
+
+ if (span->XMin < rp->XMin) // grow the polygon's horizontal bounds
+ {
+ rp->XMin = span->XMin;
+ rp->XMinY = minXY;
+ }
+ if (span->XMax > rp->XMax)
+ {
+ rp->XMax = span->XMax;
+ rp->XMaxY = maxXY;
+ }
+
+ span->IsDummy = false;
+
+ s32 xlen = span->XMax+1 - span->XMin; // inclusive width of the span, always >= 1
+ s32 ylen = span->Y1 - span->Y0;
+
+ // slope increment has a 18-bit fractional part
+ // note: for some reason, x/y isn't calculated directly,
+ // instead, 1/y is calculated and then multiplied by x
+ // TODO: this is still not perfect (see for example x=169 y=33)
+ if (ylen == 0)
+ {
+ span->Increment = 0;
+ }
+ else if (ylen == xlen)
+ {
+ span->Increment = 0x40000; // exactly 1.0 in 18-bit fixed point
+ }
+ else
+ {
+ s32 yrecip = (1<<18) / ylen;
+ span->Increment = (span->X1-span->X0) * yrecip;
+ if (span->Increment < 0) span->Increment = -span->Increment; // stored as a magnitude; the direction is carried by 'negative'
+ }
+
+ bool xMajor = (span->Increment > 0x40000); // more than one pixel of X per scanline
+
+ if (side)
+ {
+ // right
+
+ if (xMajor)
+ span->DxInitial = negative ? (0x20000 + 0x40000) : (span->Increment - 0x20000);
+ else if (span->Increment != 0)
+ span->DxInitial = negative ? 0x40000 : 0;
+ else
+ span->DxInitial = -0x40000;
+ }
+ else
+ {
+ // left
+
+ if (xMajor)
+ span->DxInitial = negative ? ((span->Increment - 0x20000) + 0x40000) : 0x20000;
+ else if (span->Increment != 0)
+ span->DxInitial = negative ? 0x40000 : 0;
+ else
+ span->DxInitial = 0;
+ }
+
+ if (xMajor) // x-major edges interpolate along X, all others along Y
+ {
+ if (side)
+ {
+ span->I0 = span->X0 - 1;
+ span->I1 = span->X1 - 1;
+ }
+ else
+ {
+ span->I0 = span->X0;
+ span->I1 = span->X1;
+ }
+
+ // used for calculating AA coverage
+ span->XCovIncr = (ylen << 10) / xlen;
+ }
+ else
+ {
+ span->I0 = span->Y0; // note: XCovIncr is left untouched on this path
+ span->I1 = span->Y1;
+ }
+
+ if (span->I0 != span->I1)
+ span->IRecip = (1<<30) / (span->I1 - span->I0); // reciprocal of the interpolation range, scaled by 2^30
+ else
+ span->IRecip = 0;
+
+ span->Linear = (span->W0 == span->W1) && !(span->W0 & 0x7E) && !(span->W1 & 0x7E); // the W1 mask test is already implied by W0 == W1
+
+ if ((span->W0 & 0x1) && !(span->W1 & 0x1)) // odd W0 with even W1: numerator is rounded down, denominator up
+ {
+ span->W0n = (span->W0 - 1) >> 1;
+ span->W0d = (span->W0 + 1) >> 1;
+ span->W1d = span->W1 >> 1;
+ }
+ else
+ {
+ span->W0n = span->W0 >> 1;
+ span->W0d = span->W0 >> 1;
+ span->W1d = span->W1 >> 1;
+ }
+}
+
+struct Variant // one (texture, sampler, blend mode) combination used to batch polygons
+{
+    GLuint Texture, Sampler; // GL object names; RenderFrame sets both to 0 for untextured polygons
+    u16 Width, Height; // filled in by RenderFrame only when a new variant is stored
+    u8 BlendMode;
+
+    bool operator==(const Variant& other) const // const-qualified so const Variants can be compared too
+    {
+        return Texture == other.Texture && Sampler == other.Sampler && BlendMode == other.BlendMode; // Width/Height are deliberately not compared
+    }
+};
+
+/*
+ Antialiasing
+ W-Buffer
+ With Texture
+ 0
+ 1, 3
+ 2
+ without Texture
+ 2
+ 0, 1, 3
+
+ => 20 Shader + 1x Shadow Mask
+*/
+
+void ComputeRenderer::RenderFrame(GPU& gpu)
+{
+ assert(!NeedsShaderCompile());
+ if (!Texcache.Update(gpu) && gpu.GPU3D.RenderFrameIdentical)
+ {
+ return;
+ }
+
+ int numYSpans = 0;
+ int numSetupIndices = 0;
+
+ /*
+ Some games really like to spam small textures, often
+ to store the data like PPU tiles. E.g. Shantae
+ or some Mega Man game. Fortunately they are usually kind
+ enough to not vary the texture size all too often (usually
+ they just use 8x8 or 16x for everything).
+
+ This is the reason we have this whole mess where textures of
+ the same size are put into array textures. This allows
+ to increase the batch size.
+ Less variance between each Variant hah!
+ */
+ u32 numVariants = 0, prevVariant, prevTexLayer;
+ Variant variants[MaxVariants];
+
+ bool enableTextureMaps = gpu.GPU3D.RenderDispCnt & (1<<0);
+
+ for (int i = 0; i < gpu.GPU3D.RenderNumPolygons; i++)
+ {
+ Polygon* polygon = gpu.GPU3D.RenderPolygonRAM[i];
+
+ u32 nverts = polygon->NumVertices;
+ u32 vtop = polygon->VTop, vbot = polygon->VBottom;
+
+ u32 curVL = vtop, curVR = vtop;
+ u32 nextVL, nextVR;
+
+ RenderPolygons[i].FirstXSpan = numSetupIndices;
+ RenderPolygons[i].Attr = polygon->Attr;
+
+ bool foundVariant = false;
+ if (i > 0)
+ {
+ // if the whole texture attribute matches
+ // the texture layer will also match
+ Polygon* prevPolygon = gpu.GPU3D.RenderPolygonRAM[i - 1];
+ foundVariant = prevPolygon->TexParam == polygon->TexParam
+ && prevPolygon->TexPalette == polygon->TexPalette
+ && (prevPolygon->Attr & 0x30) == (polygon->Attr & 0x30)
+ && prevPolygon->IsShadowMask == polygon->IsShadowMask;
+ }
+
+ if (!foundVariant)
+ {
+ Variant variant;
+ variant.BlendMode = polygon->IsShadowMask ? 4 : ((polygon->Attr >> 4) & 0x3);
+ variant.Texture = 0;
+ variant.Sampler = 0;
+ u32* textureLastVariant = nullptr;
+ // we always need to look up the texture to get the layer of the array texture
+ if (enableTextureMaps && (polygon->TexParam >> 26) & 0x7)
+ {
+ Texcache.GetTexture(gpu, polygon->TexParam, polygon->TexPalette, variant.Texture, prevTexLayer, textureLastVariant);
+ bool wrapS = (polygon->TexParam >> 16) & 1;
+ bool wrapT = (polygon->TexParam >> 17) & 1;
+ bool mirrorS = (polygon->TexParam >> 18) & 1;
+ bool mirrorT = (polygon->TexParam >> 19) & 1;
+ variant.Sampler = Samplers[(wrapS ? (mirrorS ? 2 : 1) : 0) + (wrapT ? (mirrorT ? 2 : 1) : 0) * 3];
+
+ if (*textureLastVariant < numVariants && variants[*textureLastVariant] == variant)
+ {
+ foundVariant = true;
+ prevVariant = *textureLastVariant;
+ }
+ }
+
+ if (!foundVariant)
+ {
+ for (int j = numVariants - 1; j >= 0; j--)
+ {
+ if (variants[j] == variant)
+ {
+ foundVariant = true;
+ prevVariant = j;
+ goto foundVariant;
+ }
+ }
+
+ prevVariant = numVariants;
+ variants[numVariants] = variant;
+ variants[numVariants].Width = TextureWidth(polygon->TexParam);
+ variants[numVariants].Height = TextureHeight(polygon->TexParam);
+ numVariants++;
+ assert(numVariants <= MaxVariants);
+ foundVariant:;
+
+ if (textureLastVariant)
+ *textureLastVariant = prevVariant;
+ }
+ }
+ RenderPolygons[i].Variant = prevVariant;
+ RenderPolygons[i].TextureLayer = (float)prevTexLayer;
+
+ if (polygon->FacingView)
+ {
+ nextVL = curVL + 1;
+ if (nextVL >= nverts) nextVL = 0;
+ nextVR = curVR - 1;
+ if ((s32)nextVR < 0) nextVR = nverts - 1;
+ }
+ else
+ {
+ nextVL = curVL - 1;
+ if ((s32)nextVL < 0) nextVL = nverts - 1;
+ nextVR = curVR + 1;
+ if (nextVR >= nverts) nextVR = 0;
+ }
+
+ s32 scaledPositions[10][2];
+ s32 ytop = ScreenHeight, ybot = 0;
+ for (int i = 0; i < polygon->NumVertices; i++)
+ {
+ if (HiresCoordinates)
+ {
+ scaledPositions[i][0] = (polygon->Vertices[i]->HiresPosition[0] * ScaleFactor) >> 4;
+ scaledPositions[i][1] = (polygon->Vertices[i]->HiresPosition[1] * ScaleFactor) >> 4;
+ }
+ else
+ {
+ scaledPositions[i][0] = polygon->Vertices[i]->FinalPosition[0] * ScaleFactor;
+ scaledPositions[i][1] = polygon->Vertices[i]->FinalPosition[1] * ScaleFactor;
+ }
+ ytop = std::min(scaledPositions[i][1], ytop);
+ ybot = std::max(scaledPositions[i][1], ybot);
+ }
+ RenderPolygons[i].YTop = ytop;
+ RenderPolygons[i].YBot = ybot;
+ RenderPolygons[i].XMin = ScreenWidth;
+ RenderPolygons[i].XMax = 0;
+
+ if (ybot == ytop)
+ {
+ vtop = 0; vbot = 0;
+
+ RenderPolygons[i].YBot++;
+
+ int j = 1;
+ if (scaledPositions[j][0] < scaledPositions[vtop][0]) vtop = j;
+ if (scaledPositions[j][0] > scaledPositions[vbot][0]) vbot = j;
+
+ j = nverts - 1;
+ if (scaledPositions[j][0] < scaledPositions[vtop][0]) vtop = j;
+ if (scaledPositions[j][0] > scaledPositions[vbot][0]) vbot = j;
+
+ assert(numYSpans < MaxYSpanSetups);
+ u32 curSpanL = numYSpans;
+ SetupYSpanDummy(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, vtop, 0, scaledPositions);
+ assert(numYSpans < MaxYSpanSetups);
+ u32 curSpanR = numYSpans;
+ SetupYSpanDummy(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, vbot, 1, scaledPositions);
+
+ YSpanIndices[numSetupIndices].PolyIdx = i;
+ YSpanIndices[numSetupIndices].SpanIdxL = curSpanL;
+ YSpanIndices[numSetupIndices].SpanIdxR = curSpanR;
+ YSpanIndices[numSetupIndices].Y = ytop;
+ numSetupIndices++;
+ }
+ else
+ {
+ u32 curSpanL = numYSpans;
+ assert(numYSpans < MaxYSpanSetups);
+ SetupYSpan(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, curVL, nextVL, 0, scaledPositions);
+ u32 curSpanR = numYSpans;
+ assert(numYSpans < MaxYSpanSetups);
+ SetupYSpan(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, curVR, nextVR, 1, scaledPositions);
+
+ for (u32 y = ytop; y < ybot; y++)
+ {
+ if (y >= scaledPositions[nextVL][1] && curVL != polygon->VBottom)
+ {
+ while (y >= scaledPositions[nextVL][1] && curVL != polygon->VBottom)
+ {
+ curVL = nextVL;
+ if (polygon->FacingView)
+ {
+ nextVL = curVL + 1;
+ if (nextVL >= nverts)
+ nextVL = 0;
+ }
+ else
+ {
+ nextVL = curVL - 1;
+ if ((s32)nextVL < 0)
+ nextVL = nverts - 1;
+ }
+ }
+
+
+ assert(numYSpans < MaxYSpanSetups);
+ curSpanL = numYSpans;
+ SetupYSpan(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, curVL, nextVL, 0, scaledPositions);
+ }
+ if (y >= scaledPositions[nextVR][1] && curVR != polygon->VBottom)
+ {
+ while (y >= scaledPositions[nextVR][1] && curVR != polygon->VBottom)
+ {
+ curVR = nextVR;
+ if (polygon->FacingView)
+ {
+ nextVR = curVR - 1;
+ if ((s32)nextVR < 0)
+ nextVR = nverts - 1;
+ }
+ else
+ {
+ nextVR = curVR + 1;
+ if (nextVR >= nverts)
+ nextVR = 0;
+ }
+ }
+
+ assert(numYSpans < MaxYSpanSetups);
+ curSpanR = numYSpans;
+ SetupYSpan(&RenderPolygons[i] ,&YSpanSetups[numYSpans++], polygon, curVR, nextVR, 1, scaledPositions);
+ }
+
+ YSpanIndices[numSetupIndices].PolyIdx = i;
+ YSpanIndices[numSetupIndices].SpanIdxL = curSpanL;
+ YSpanIndices[numSetupIndices].SpanIdxR = curSpanR;
+ YSpanIndices[numSetupIndices].Y = y;
+ numSetupIndices++;
+ }
+ }
+
+ //printf("polygon min max %d %d | %d %d\n", RenderPolygons[i].XMin, RenderPolygons[i].XMinY, RenderPolygons[i].XMax, RenderPolygons[i].XMaxY);
+ }
+
+ /*for (u32 i = 0; i < RenderNumPolygons; i++)
+ {
+ if (RenderPolygons[i].Variant >= numVariants)
+ {
+ printf("blarb2 %d %d %d\n", RenderPolygons[i].Variant, i, RenderNumPolygons);
+ }
+ //assert(RenderPolygons[i].Variant < numVariants);
+ }*/
+
+ if (numYSpans > 0)
+ {
+ glBindBuffer(GL_SHADER_STORAGE_BUFFER, YSpanSetupMemory);
+ glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, sizeof(SpanSetupY)*numYSpans, YSpanSetups);
+
+ glBindBuffer(GL_TEXTURE_BUFFER, YSpanIndicesTextureMemory);
+ glBufferSubData(GL_TEXTURE_BUFFER, 0, numSetupIndices*4*2, YSpanIndices.data());
+
+ glBindBuffer(GL_SHADER_STORAGE_BUFFER, RenderPolygonMemory);
+ glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, gpu.GPU3D.RenderNumPolygons*sizeof(RenderPolygon), RenderPolygons);
+ // we haven't accessed image data yet, so we don't need to invalidate anything
+ }
+
+ //printf("found via %d %d %d of %d\n", foundviatexcache, foundviaprev, numslow, RenderNumPolygons);
+
+ // bind everything
+ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, RenderPolygonMemory);
+
+ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, XSpanSetupMemory);
+ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, YSpanSetupMemory);
+
+ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, FinalTileMemory);
+ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, BinResultMemory);
+ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 7, WorkDescMemory);
+
+ MetaUniform meta;
+ meta.DispCnt = gpu.GPU3D.RenderDispCnt;
+ meta.NumPolygons = gpu.GPU3D.RenderNumPolygons;
+ meta.NumVariants = numVariants;
+ meta.AlphaRef = gpu.GPU3D.RenderAlphaRef;
+ {
+ u32 r = (gpu.GPU3D.RenderClearAttr1 << 1) & 0x3E; if (r) r++;
+ u32 g = (gpu.GPU3D.RenderClearAttr1 >> 4) & 0x3E; if (g) g++;
+ u32 b = (gpu.GPU3D.RenderClearAttr1 >> 9) & 0x3E; if (b) b++;
+ u32 a = (gpu.GPU3D.RenderClearAttr1 >> 16) & 0x1F;
+ meta.ClearColor = r | (g << 8) | (b << 16) | (a << 24);
+ meta.ClearDepth = ((gpu.GPU3D.RenderClearAttr2 & 0x7FFF) * 0x200) + 0x1FF;
+ meta.ClearAttr = gpu.GPU3D.RenderClearAttr1 & 0x3F008000;
+ }
+ for (u32 i = 0; i < 32; i++)
+ {
+ u32 color = gpu.GPU3D.RenderToonTable[i];
+ u32 r = (color << 1) & 0x3E;
+ u32 g = (color >> 4) & 0x3E;
+ u32 b = (color >> 9) & 0x3E;
+ if (r) r++;
+ if (g) g++;
+ if (b) b++;
+
+ meta.ToonTable[i*4+0] = r | (g << 8) | (b << 16);
+ }
+ for (u32 i = 0; i < 34; i++)
+ {
+ meta.ToonTable[i*4+1] = gpu.GPU3D.RenderFogDensityTable[i];
+ }
+ for (u32 i = 0; i < 8; i++)
+ {
+ u32 color = gpu.GPU3D.RenderEdgeTable[i];
+ u32 r = (color << 1) & 0x3E;
+ u32 g = (color >> 4) & 0x3E;
+ u32 b = (color >> 9) & 0x3E;
+ if (r) r++;
+ if (g) g++;
+ if (b) b++;
+
+ meta.ToonTable[i*4+2] = r | (g << 8) | (b << 16);
+ }
+ meta.FogOffset = gpu.GPU3D.RenderFogOffset;
+ meta.FogShift = gpu.GPU3D.RenderFogShift;
+ {
+ u32 fogR = (gpu.GPU3D.RenderFogColor << 1) & 0x3E; if (fogR) fogR++;
+ u32 fogG = (gpu.GPU3D.RenderFogColor >> 4) & 0x3E; if (fogG) fogG++;
+ u32 fogB = (gpu.GPU3D.RenderFogColor >> 9) & 0x3E; if (fogB) fogB++;
+ u32 fogA = (gpu.GPU3D.RenderFogColor >> 16) & 0x1F;
+ meta.FogColor = fogR | (fogG << 8) | (fogB << 16) | (fogA << 24);
+ }
+
+ glBindBuffer(GL_UNIFORM_BUFFER, MetaUniformMemory);
+ glBufferSubData(GL_UNIFORM_BUFFER, 0, sizeof(MetaUniform), &meta);
+ glBindBufferBase(GL_UNIFORM_BUFFER, 0, MetaUniformMemory);
+
+ glUseProgram(ShaderClearCoarseBinMask);
+ glDispatchCompute(TilesPerLine*TileLines/32, 1, 1);
+
+ bool wbuffer = false;
+ if (numYSpans > 0)
+ {
+ wbuffer = gpu.GPU3D.RenderPolygonRAM[0]->WBuffer;
+
+ glUseProgram(ShaderClearIndirectWorkCount);
+ glDispatchCompute((numVariants+31)/32, 1, 1);
+
+ // calculate x-spans
+ glBindImageTexture(0, YSpanIndicesTexture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RGBA16UI);
+ glUseProgram(ShaderInterpXSpans[wbuffer]);
+ glDispatchCompute((numSetupIndices + 31) / 32, 1, 1);
+ glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
+
+ // bin polygons
+ glUseProgram(ShaderBinCombined);
+ glDispatchCompute(((gpu.GPU3D.RenderNumPolygons + 31) / 32), ScreenWidth/CoarseTileW, ScreenHeight/CoarseTileH);
+ glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
+
+ // calculate list offsets
+ glUseProgram(ShaderCalculateWorkListOffset);
+ glDispatchCompute((numVariants + 31) / 32, 1, 1);
+ glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
+
+ // sort shader work
+ glUseProgram(ShaderSortWork);
+ glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory);
+ glDispatchComputeIndirect(offsetof(BinResultHeader, SortWorkWorkCount));
+ glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
+
+ glActiveTexture(GL_TEXTURE0);
+
+ for (int i = 0; i < tilememoryLayer_Num; i++)
+ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2+i, TileMemory[i]);
+
+ // rasterise
+ {
+ bool highLightMode = gpu.GPU3D.RenderDispCnt & (1<<1);
+
+ GLuint shadersNoTexture[] =
+ {
+ ShaderRasteriseNoTexture[wbuffer],
+ ShaderRasteriseNoTexture[wbuffer],
+ highLightMode
+ ? ShaderRasteriseNoTextureHighlight[wbuffer]
+ : ShaderRasteriseNoTextureToon[wbuffer],
+ ShaderRasteriseNoTexture[wbuffer],
+ ShaderRasteriseShadowMask[wbuffer]
+ };
+ GLuint shadersUseTexture[] =
+ {
+ ShaderRasteriseUseTextureModulate[wbuffer],
+ ShaderRasteriseUseTextureDecal[wbuffer],
+ highLightMode
+ ? ShaderRasteriseUseTextureHighlight[wbuffer]
+ : ShaderRasteriseUseTextureToon[wbuffer],
+ ShaderRasteriseUseTextureDecal[wbuffer],
+ ShaderRasteriseShadowMask[wbuffer]
+ };
+
+ GLuint prevShader = 0;
+ s32 prevTexture = 0, prevSampler = 0;
+ for (int i = 0; i < numVariants; i++)
+ {
+ GLuint shader = 0;
+ if (variants[i].Texture == 0)
+ {
+ shader = shadersNoTexture[variants[i].BlendMode];
+ }
+ else
+ {
+ shader = shadersUseTexture[variants[i].BlendMode];
+ if (variants[i].Texture != prevTexture)
+ {
+ glBindTexture(GL_TEXTURE_2D_ARRAY, variants[i].Texture);
+ prevTexture = variants[i].Texture;
+ }
+ if (variants[i].Sampler != prevSampler)
+ {
+ glBindSampler(0, variants[i].Sampler);
+ prevSampler = variants[i].Sampler;
+ }
+ }
+ assert(shader != 0);
+ if (shader != prevShader)
+ {
+ glUseProgram(shader);
+ prevShader = shader;
+ }
+
+ glUniform1ui(UniformIdxCurVariant, i);
+ glUniform2f(UniformIdxTextureSize, 1.f / variants[i].Width, 1.f / variants[i].Height);
+ glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory);
+ glDispatchComputeIndirect(offsetof(BinResultHeader, VariantWorkCount) + i*4*4);
+ }
+ }
+ }
+ glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+
+ // compose final image
+ glUseProgram(ShaderDepthBlend[wbuffer]);
+ glDispatchCompute(ScreenWidth/TileSize, ScreenHeight/TileSize, 1);
+ glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+
+ glBindImageTexture(0, Framebuffer, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8);
+ glBindImageTexture(1, LowResFramebuffer, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8UI);
+ u32 finalPassShader = 0;
+ if (gpu.GPU3D.RenderDispCnt & (1<<4))
+ finalPassShader |= 0x4;
+ if (gpu.GPU3D.RenderDispCnt & (1<<7))
+ finalPassShader |= 0x2;
+ if (gpu.GPU3D.RenderDispCnt & (1<<5))
+ finalPassShader |= 0x1;
+
+ glUseProgram(ShaderFinalPass[finalPassShader]);
+ glDispatchCompute(ScreenWidth/32, ScreenHeight, 1);
+ glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
+
+ glBindSampler(0, 0);
+
+ /*u64 starttime = armGetSystemTick();
+ EmuQueue.waitIdle();
+ printf("total time %f\n", armTicksToNs(armGetSystemTick()-starttime)*0.000001f);*/
+
+ /*for (u32 i = 0; i < RenderNumPolygons; i++)
+ {
+ if (RenderPolygons[i].Variant >= numVariants)
+ {
+ printf("blarb %d %d %d\n", RenderPolygons[i].Variant, i, RenderNumPolygons);
+ }
+ //assert(RenderPolygons[i].Variant < numVariants);
+ }*/
+
+ /*for (int i = 0; i < binresult->SortWorkWorkCount[0]*32; i++)
+ {
+ printf("sorted %x %x\n", binresult->SortedWork[i*2+0], binresult->SortedWork[i*2+1]);
+ }*/
+/* if (polygonvisible != -1)
+ {
+ SpanSetupX* xspans = Gfx::DataHeap->CpuAddr(XSpanSetupMemory);
+ printf("span result\n");
+ Polygon* poly = RenderPolygonRAM[polygonvisible];
+ u32 xspanoffset = RenderPolygons[polygonvisible].FirstXSpan;
+ for (u32 i = 0; i < (poly->YBottom - poly->YTop); i++)
+ {
+ printf("%d: %d - %d | %d %d | %d %d\n", i + poly->YTop, xspans[xspanoffset + i].X0, xspans[xspanoffset + i].X1, xspans[xspanoffset + i].__pad0, xspans[xspanoffset + i].__pad1, RenderPolygons[polygonvisible].YTop, RenderPolygons[polygonvisible].YBot);
+ }
+ }*/
+/*
+ printf("xspans: %d\n", numSetupIndices);
+ SpanSetupX* xspans = Gfx::DataHeap->CpuAddr(XSpanSetupMemory[curSlice]);
+ for (int i = 0; i < numSetupIndices; i++)
+ {
+ printf("poly %d %d %d | line %d | %d to %d\n", YSpanIndices[i].PolyIdx, YSpanIndices[i].SpanIdxL, YSpanIndices[i].SpanIdxR, YSpanIndices[i].Y, xspans[i].X0, xspans[i].X1);
+ }
+ printf("bin result\n");
+ BinResult* binresult = Gfx::DataHeap->CpuAddr(BinResultMemory);
+ for (u32 y = 0; y < 192/8; y++)
+ {
+ for (u32 x = 0; x < 256/8; x++)
+ {
+ printf("%08x ", binresult->BinnedMaskCoarse[(x + y * (256/8)) * 2]);
+ }
+ printf("\n");
+ }*/
+}
+
+void ComputeRenderer::RestartFrame(GPU& gpu) // Renderer3D override; the body is empty, so this renderer does nothing here and `gpu` is unused
+{
+
+}
+
+u32* ComputeRenderer::GetLine(int line)
+{
+ int stride = 256; // row width in u32 pixels; the function returns a pointer to row `line` of FramebufferCPU
+ if (line == 0)
+ {
+ // Row 0 is the refresh trigger: copy 192 rows of 4-byte pixels out of PixelBuffer.
+ glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelBuffer);
+ u8* data = (u8*)glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY);
+ if (data) memcpy(&FramebufferCPU[0], data, 4*stride*192);
+ // Unmap only a successful mapping; unmapping a buffer that is not mapped raises GL_INVALID_OPERATION.
+ if (data) glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
+ }
+ return &FramebufferCPU[stride * line];
+}
+
+void ComputeRenderer::SetupAccelFrame()
+{
+ glBindTexture(GL_TEXTURE_2D, Framebuffer); // binds Framebuffer (the rgba8 image RenderFrame's final pass writes) as the current 2D texture
+}
+
+void ComputeRenderer::PrepareCaptureFrame()
+{
+ glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelBuffer); // destination of the read-back; GetLine() later maps this same buffer
+ glBindTexture(GL_TEXTURE_2D, LowResFramebuffer); // source: the rgba8ui image written by RenderFrame's final pass
+ glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, nullptr); // with a pack buffer bound, nullptr is byte offset 0 into PixelBuffer
+}
+
+void ComputeRenderer::BindOutputTexture(int buffer)
+{
+ CurGLCompositor.BindOutputTexture(buffer); // pure forward to the owned GLCompositor; `buffer` is passed through unchanged
+}
+
+void ComputeRenderer::Blit(const GPU &gpu)
+{
+ CurGLCompositor.RenderFrame(gpu, *this); // pure forward: the compositor receives the GPU and this renderer
+}
+
+void ComputeRenderer::Stop(const GPU &gpu)
+{
+ CurGLCompositor.Stop(gpu); // pure forward to the owned GLCompositor
+}
+
+}
\ No newline at end of file
diff --git a/src/GPU3D_Compute.h b/src/GPU3D_Compute.h
new file mode 100644
index 00000000..751737b7
--- /dev/null
+++ b/src/GPU3D_Compute.h
@@ -0,0 +1,242 @@
+/*
+ Copyright 2016-2024 melonDS team
+
+ This file is part of melonDS.
+
+ melonDS is free software: you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free
+ Software Foundation, either version 3 of the License, or (at your option)
+ any later version.
+
+ melonDS is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with melonDS. If not, see http://www.gnu.org/licenses/.
+*/
+
+#ifndef GPU3D_COMPUTE
+#define GPU3D_COMPUTE
+
+#include <memory>
+
+#include "types.h"
+
+#include "GPU3D.h"
+
+#include "OpenGLSupport.h"
+#include "GPU_OpenGL.h"
+
+#include "GPU3D_TexcacheOpenGL.h"
+
+#include "NonStupidBitfield.h"
+
+namespace melonDS
+{
+
+class ComputeRenderer : public Renderer3D // Renderer3D implementation that draws the 3D scene with OpenGL compute-shader dispatches
+{
+public:
+ static std::unique_ptr<ComputeRenderer> New(); // factory; the constructor below is private
+ ~ComputeRenderer() override;
+
+ void Reset(GPU& gpu) override;
+
+ void SetRenderSettings(int scale, bool highResolutionCoordinates);
+
+ void VCount144(GPU& gpu) override;
+
+ void RenderFrame(GPU& gpu) override;
+ void RestartFrame(GPU& gpu) override;
+ u32* GetLine(int line) override;
+
+ void SetupAccelFrame() override;
+ void PrepareCaptureFrame() override;
+
+ void BindOutputTexture(int buffer) override;
+
+ void Blit(const GPU& gpu) override;
+ void Stop(const GPU& gpu) override;
+
+ bool NeedsShaderCompile() override { return ShaderStepIdx != 33; } // true until ShaderStepIdx reaches 33 (presumably the number of compile steps -- TODO confirm against ShaderCompileStep)
+ void ShaderCompileStep(int& current, int& count) override;
+private:
+ ComputeRenderer(GLCompositor&& compositor);
+
+ GLuint ShaderInterpXSpans[2]; // the [2] program arrays are indexed by the polygon WBuffer flag in RenderFrame
+ GLuint ShaderBinCombined;
+ GLuint ShaderDepthBlend[2];
+ GLuint ShaderRasteriseNoTexture[2];
+ GLuint ShaderRasteriseNoTextureToon[2];
+ GLuint ShaderRasteriseNoTextureHighlight[2];
+ GLuint ShaderRasteriseUseTextureDecal[2];
+ GLuint ShaderRasteriseUseTextureModulate[2];
+ GLuint ShaderRasteriseUseTextureToon[2];
+ GLuint ShaderRasteriseUseTextureHighlight[2];
+ GLuint ShaderRasteriseShadowMask[2];
+ GLuint ShaderClearCoarseBinMask;
+ GLuint ShaderClearIndirectWorkCount;
+ GLuint ShaderCalculateWorkListOffset;
+ GLuint ShaderSortWork;
+ GLuint ShaderFinalPass[8]; // indexed by a 3-bit mask built from RenderDispCnt bits 4, 7 and 5 in RenderFrame
+
+ GLuint YSpanIndicesTextureMemory;
+ GLuint YSpanIndicesTexture;
+ GLuint YSpanSetupMemory;
+ GLuint XSpanSetupMemory;
+ GLuint BinResultMemory;
+ GLuint RenderPolygonMemory;
+ GLuint WorkDescMemory;
+
+ enum
+ {
+ tilememoryLayer_Color,
+ tilememoryLayer_Depth,
+ tilememoryLayer_Attr,
+ tilememoryLayer_Num,
+ };
+
+ GLuint TileMemory[tilememoryLayer_Num]; // bound to shader storage bindings 2+layer in RenderFrame
+ GLuint FinalTileMemory;
+
+ u32 DummyLine[256] = {};
+
+ struct SpanSetupY // CPU-side layout of the GLSL YSpanSetup struct (same field order; Linear/IsDummy are bool there)
+ {
+ // Attributes
+ s32 Z0, Z1, W0, W1;
+ s32 ColorR0, ColorG0, ColorB0;
+ s32 ColorR1, ColorG1, ColorB1;
+ s32 TexcoordU0, TexcoordV0;
+ s32 TexcoordU1, TexcoordV1;
+
+ // Interpolator
+ s32 I0, I1;
+ s32 Linear;
+ s32 IRecip;
+ s32 W0n, W0d, W1d;
+
+ // Slope
+ s32 Increment;
+
+ s32 X0, X1, Y0, Y1;
+ s32 XMin, XMax;
+ s32 DxInitial;
+
+ s32 XCovIncr;
+ u32 IsDummy;
+ };
+ struct SpanSetupX // CPU-side layout of the GLSL XSpanSetup struct
+ {
+ s32 X0, X1;
+
+ s32 EdgeLenL, EdgeLenR, EdgeCovL, EdgeCovR; // the shader names the first two slots InsideStart/InsideEnd
+
+ s32 XRecip;
+
+ u32 Flags;
+
+ s32 Z0, Z1, W0, W1;
+ s32 ColorR0, ColorG0, ColorB0;
+ s32 ColorR1, ColorG1, ColorB1;
+ s32 TexcoordU0, TexcoordV0;
+ s32 TexcoordU1, TexcoordV1;
+
+ s32 CovLInitial, CovRInitial;
+ };
+ struct SetupIndices // four u16 values; the InterpSpans shader loads them as one rgba16ui texel
+ {
+ u16 PolyIdx, SpanIdxL, SpanIdxR, Y; // texel x, y, z, w respectively
+ };
+ struct RenderPolygon // CPU-side layout of the GLSL Polygon struct
+ {
+ u32 FirstXSpan;
+ s32 YTop, YBot;
+
+ s32 XMin, XMax;
+ s32 XMinY, XMaxY;
+
+ u32 Variant;
+ u32 Attr;
+
+ float TextureLayer;
+ };
+
+ static constexpr int TileSize = 8; // TileSize through MaxVariants are repeated in the GLSL Common string and must stay equal
+ static constexpr int CoarseTileCountX = 8;
+ static constexpr int CoarseTileCountY = 4;
+ static constexpr int CoarseTileW = CoarseTileCountX * TileSize;
+ static constexpr int CoarseTileH = CoarseTileCountY * TileSize;
+
+ static constexpr int BinStride = 2048/32;
+ static constexpr int CoarseBinStride = BinStride/32;
+
+ static constexpr int MaxVariants = 256;
+
+ static constexpr int UniformIdxCurVariant = 0; // uniform locations passed to glUniform1ui / glUniform2f in RenderFrame
+ static constexpr int UniformIdxTextureSize = 1;
+
+ static constexpr int MaxFullscreenLayers = 16;
+
+ struct BinResultHeader // CPU view of the leading members of the GLSL BinResultBuffer; used with offsetof() for indirect dispatches
+ {
+ u32 VariantWorkCount[MaxVariants*4]; // one uvec4 per variant on the GLSL side
+ u32 SortedWorkOffset[MaxVariants];
+
+ u32 SortWorkWorkCount[4];
+ };
+
+ static const int MaxYSpanSetups = 6144*2;
+ std::vector<SetupIndices> YSpanIndices;
+ SpanSetupY YSpanSetups[MaxYSpanSetups];
+ RenderPolygon RenderPolygons[2048];
+
+ TexcacheOpenGL Texcache;
+
+ struct MetaUniform // mirrors the std140 MetaUniform block declared in the GLSL Common string
+ {
+ u32 NumPolygons;
+ u32 NumVariants;
+
+ u32 AlphaRef;
+ u32 DispCnt;
+
+ u32 ToonTable[4*34]; // uvec4 ToonTable[34] on the GLSL side
+
+ u32 ClearColor, ClearDepth, ClearAttr;
+
+ u32 FogOffset, FogShift, FogColor;
+ };
+ GLuint MetaUniformMemory;
+
+ GLuint Samplers[9];
+
+ GLuint Framebuffer = 0;
+ GLuint LowResFramebuffer;
+ GLuint PixelBuffer;
+
+ u32 FramebufferCPU[256*192]; // CPU copy of the low-res frame, refreshed by GetLine(0)
+
+ int ScreenWidth, ScreenHeight;
+ int TilesPerLine, TileLines;
+ int ScaleFactor = -1;
+ int MaxWorkTiles;
+ bool HiresCoordinates;
+
+ GLCompositor CurGLCompositor;
+
+ int ShaderStepIdx = 0;
+
+ void DeleteShaders();
+
+ void SetupAttrs(SpanSetupY* span, Polygon* poly, int from, int to);
+ void SetupYSpan(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int from, int to, int side, s32 positions[10][2]);
+ void SetupYSpanDummy(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int vertex, int side, s32 positions[10][2]);
+
+ bool CompileShader(GLuint& shader, const std::string& source, const std::initializer_list<const char*>& defines);
+};
+
+}
+
+#endif
\ No newline at end of file
diff --git a/src/GPU3D_Compute_shaders.h b/src/GPU3D_Compute_shaders.h
new file mode 100644
index 00000000..26fb7bde
--- /dev/null
+++ b/src/GPU3D_Compute_shaders.h
@@ -0,0 +1,1665 @@
+/*
+ Copyright 2016-2024 melonDS team
+
+ This file is part of melonDS.
+
+ melonDS is free software: you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free
+ Software Foundation, either version 3 of the License, or (at your option)
+ any later version.
+
+ melonDS is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with melonDS. If not, see http://www.gnu.org/licenses/.
+*/
+
+#ifndef GPU3D_COMPUTE_SHADERS
+#define GPU3D_COMPUTE_SHADERS
+
+#include <string>
+
+namespace melonDS
+{
+
+namespace ComputeRendererShaders
+{
+
+// defines:
+// InterpSpans
+// BinCombined
+// Rasterise
+// DepthBlend
+// ClearCoarseBinMask
+// ClearIndirectWorkCount
+// CalculateWorkOffsets
+// SortWork
+// FinalPass
+
+// AntiAliasing
+// EdgeMarking
+// Fog
+
+// ZBuffer
+// WBuffer
+
+// for Rasterise
+// NoTexture
+// UseTexture
+// Decal
+// Modulate
+// Toon
+// Highlight
+// ShadowMask
+
+
+/*
+ Some notes on signed division:
+
+ we want to avoid it, so we can avoid higher precision numbers
+ in a few places.
+
+ Fortunately all divisions *should*, assuming I'm not mistaken,
+ have the same sign on the divisor and the dividend.
+
+ Thus we apply:
+
+ assuming n < 0 <=> d < 0
+ n/d = abs(n)/abs(d)
+
+*/
+
+const std::string XSpanSetupBuffer{R"(
+
+const uint XSpanSetup_Linear = 1U << 0;
+const uint XSpanSetup_FillInside = 1U << 1;
+const uint XSpanSetup_FillLeft = 1U << 2;
+const uint XSpanSetup_FillRight = 1U << 3;
+
+struct XSpanSetup
+{
+ int X0, X1;
+
+ int InsideStart, InsideEnd, EdgeCovL, EdgeCovR;
+
+ int XRecip;
+
+ uint Flags;
+
+ int Z0, Z1, W0, W1;
+ int ColorR0, ColorG0, ColorB0;
+ int ColorR1, ColorG1, ColorB1;
+ int TexcoordU0, TexcoordV0;
+ int TexcoordU1, TexcoordV1;
+
+ int CovLInitial, CovRInitial;
+};
+
+#if defined(Rasterise)
+int CalcYFactorX(XSpanSetup span, int x)
+{
+ x -= span.X0;
+
+ if (span.X0 != span.X1)
+ {
+ uint numLo = uint(x) * uint(span.W0);
+ uint numHi = 0U;
+ numHi |= numLo >> (32U-YFactorShift);
+ numLo <<= YFactorShift;
+
+ uint den = uint(x) * uint(span.W0) + uint(span.X1 - span.X0 - x) * uint(span.W1);
+
+ if (den == 0)
+ return 0;
+ else
+ return int(Div64_32_32(numHi, numLo, den));
+ }
+ else
+ {
+ return 0;
+ }
+}
+#endif
+
+layout (std430, binding = 1) buffer XSpanSetupsBuffer
+{
+ XSpanSetup XSpanSetups[];
+};
+
+)"}; // GLSL snippet: XSpanSetup_* flag bits, the XSpanSetup struct, CalcYFactorX (compiled only when Rasterise is defined; returns 0 for an empty span or zero denominator) and the std430 buffer XSpanSetups at binding 1
+
+const std::string YSpanSetupBuffer{R"(
+
+struct YSpanSetup
+{
+ // Attributes
+ int Z0, Z1, W0, W1;
+ int ColorR0, ColorG0, ColorB0;
+ int ColorR1, ColorG1, ColorB1;
+ int TexcoordU0, TexcoordV0;
+ int TexcoordU1, TexcoordV1;
+
+ // Interpolator
+ int I0, I1;
+ bool Linear;
+ int IRecip;
+ int W0n, W0d, W1d;
+
+ // Slope
+ int Increment;
+
+ int X0, X1, Y0, Y1;
+ int XMin, XMax;
+ int DxInitial;
+
+ int XCovIncr;
+
+ bool IsDummy;
+};
+
+#if defined(InterpSpans)
+int CalcYFactorY(YSpanSetup span, int i)
+{
+ /*
+ maybe it would be better to do use a 32x32=64 multiplication?
+ */
+ uint numLo = uint(abs(i)) * uint(span.W0n);
+ uint numHi = 0U;
+ numHi |= numLo >> (32U-YFactorShift);
+ numLo <<= YFactorShift;
+
+ uint den = uint(abs(i)) * uint(span.W0d) + uint(abs(span.I1 - span.I0 - i)) * span.W1d;
+
+ if (den == 0)
+ {
+ return 0;
+ }
+ else
+ {
+ return int(Div64_32_32(numHi, numLo, den));
+ }
+}
+
+int CalculateDx(int y, YSpanSetup span)
+{
+ return span.DxInitial + (y - span.Y0) * span.Increment;
+}
+
+int CalculateX(int dx, YSpanSetup span)
+{
+ int x = span.X0;
+ if (span.X1 < span.X0)
+ x -= dx >> 18;
+ else
+ x += dx >> 18;
+ return clamp(x, span.XMin, span.XMax);
+}
+
+void EdgeParams_XMajor(bool side, int dx, YSpanSetup span, out int edgelen, out int edgecov)
+{
+ bool negative = span.X1 < span.X0;
+ int len;
+ if (side != negative)
+ len = (dx >> 18) - ((dx-span.Increment) >> 18);
+ else
+ len = ((dx+span.Increment) >> 18) - (dx >> 18);
+ edgelen = len;
+
+ int xlen = span.XMax + 1 - span.XMin;
+ int startx = dx >> 18;
+ if (negative) startx = xlen - startx;
+ if (side) startx = startx - len + 1;
+
+ uint r;
+ int startcov = int(Div(uint(((startx << 10) + 0x1FF) * (span.Y1 - span.Y0)), uint(xlen), r));
+ edgecov = (1<<31) | ((startcov & 0x3FF) << 12) | (span.XCovIncr & 0x3FF);
+}
+
+void EdgeParams_YMajor(bool side, int dx, YSpanSetup span, out int edgelen, out int edgecov)
+{
+ bool negative = span.X1 < span.X0;
+ edgelen = 1;
+
+ if (span.Increment == 0)
+ {
+ edgecov = 31;
+ }
+ else
+ {
+ int cov = ((dx >> 9) + (span.Increment >> 10)) >> 4;
+ if ((cov >> 5) != (dx >> 18)) cov = 31;
+ cov &= 0x1F;
+ if (side == negative) cov = 0x1F - cov;
+
+ edgecov = cov;
+ }
+}
+#endif
+
+layout (std430, binding = 2) buffer YSpanSetupsBuffer
+{
+ YSpanSetup YSpanSetups[];
+};
+
+)"}; // GLSL snippet: the YSpanSetup struct, helpers compiled only when InterpSpans is defined (CalcYFactorY, CalculateDx, CalculateX, EdgeParams_XMajor, EdgeParams_YMajor; dx values carry 18 fractional bits, see the >> 18 shifts) and the std430 buffer YSpanSetups at binding 2
+
+const std::string PolygonBuffer{R"(
+struct Polygon
+{
+ int FirstXSpan;
+ int YTop, YBot;
+
+ int XMin, XMax;
+ int XMinY, XMaxY;
+
+ int Variant;
+
+ uint Attr;
+
+ float TextureLayer;
+};
+
+layout (std430, binding = 0) readonly buffer PolygonBuffer
+{
+ Polygon Polygons[];
+};
+)"}; // GLSL snippet: the Polygon struct (same field order as ComputeRenderer::RenderPolygon) and the read-only std430 buffer Polygons at binding 0
+
+const std::string BinningBuffer{R"(
+
+layout (std430, binding = 6) buffer BinResultBuffer
+{
+ uvec4 VariantWorkCount[MaxVariants];
+ uint SortedWorkOffset[MaxVariants];
+
+ uvec4 SortWorkWorkCount;
+
+ uint BinningMaskAndOffset[];
+ //uint BinnedMaskCoarse[TilesPerLine*TileLines*CoarseBinStride];
+ //uint BinnedMask[TilesPerLine*TileLines*BinStride];
+ //uint WorkOffsets[TilesPerLine*TileLines*BinStride];
+};
+
+const int BinningCoarseMaskStart = 0;
+const int BinningMaskStart = BinningCoarseMaskStart+TilesPerLine*TileLines*CoarseBinStride;
+const int BinningWorkOffsetsStart = BinningMaskStart+TilesPerLine*TileLines*BinStride;
+
+)"}; // GLSL snippet: std430 buffer at binding 6 (leading members match ComputeRenderer::BinResultHeader) plus the start offsets of the coarse-mask, mask and work-offset regions packed into BinningMaskAndOffset
+
+/*
+ structure of each WorkDesc item:
+ x:
+ bits 0-10: polygon idx
+ bits 11-31: tile idx (before sorting within variant, after sorting within all tiles)
+ y:
+ bits 0-15: X position on screen
+ bits 16-31: Y position on screen
+*/
+const std::string WorkDescBuffer{R"(
+layout (std430, binding = 7) buffer WorkDescBuffer
+{
+ //uvec2 UnsortedWorkDescs[MaxWorkTiles];
+ //uvec2 SortedWorkDescs[MaxWorkTiles];
+ uvec2 WorkDescs[];
+};
+
+const uint WorkDescsUnsortedStart = 0;
+const uint WorkDescsSortedStart = WorkDescsUnsortedStart+MaxWorkTiles;
+
+)"}; // GLSL snippet: std430 buffer WorkDescs at binding 7; the unsorted region starts at element 0 and the sorted region MaxWorkTiles elements later
+
+const std::string Tilebuffers{R"(
+layout (std430, binding = 2) buffer ColorTileBuffer
+{
+ uint ColorTiles[];
+};
+layout (std430, binding = 3) buffer DepthTileBuffer
+{
+ uint DepthTiles[];
+};
+layout (std430, binding = 4) buffer AttrTileBuffer
+{
+ uint AttrTiles[];
+};
+
+)"}; // GLSL snippet: colour/depth/attribute tile buffers at bindings 2, 3 and 4, the slots RenderFrame fills with glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2+i, TileMemory[i])
+
+const std::string ResultBuffer{R"(
+layout (std430, binding = 5) buffer ResultBuffer
+{
+ uint ResultValue[];
+};
+
+const uint ResultColorStart = 0;
+const uint ResultDepthStart = ResultColorStart+ScreenWidth*ScreenHeight*2;
+const uint ResultAttrStart = ResultDepthStart+ScreenWidth*ScreenHeight*2;
+)"}; // GLSL snippet: std430 buffer ResultValue at binding 5, split into colour, depth and attribute regions; the colour and depth regions each span ScreenWidth*ScreenHeight*2 uints
+
+const char* Common = R"(
+
+#define TileSize 8
+const int CoarseTileCountX = 8;
+const int CoarseTileCountY = 4;
+const int CoarseTileW = (CoarseTileCountX * TileSize);
+const int CoarseTileH = (CoarseTileCountY * TileSize);
+
+const int FramebufferStride = ScreenWidth*ScreenHeight;
+const int TilesPerLine = ScreenWidth/TileSize;
+const int TileLines = ScreenHeight/TileSize;
+
+const int BinStride = 2048/32;
+const int CoarseBinStride = BinStride/32;
+
+const int MaxVariants = 256;
+
+layout (std140, binding = 0) uniform MetaUniform
+{
+ uint NumPolygons;
+ uint NumVariants;
+
+ int AlphaRef;
+
+ uint DispCnt;
+
+ // r = Toon
+ // g = Fog Density
+ // b = Edge Color
+ uvec4 ToonTable[34];
+
+ uint ClearColor, ClearDepth, ClearAttr;
+
+ uint FogOffset, FogShift, FogColor;
+};
+
+#ifdef InterpSpans
+const int YFactorShift = 9;
+#else
+const int YFactorShift = 8;
+#endif
+
+#if defined(InterpSpans) || defined(Rasterise)
+uint Umulh(uint a, uint b) // high 32 bits of the 64-bit product a*b
+{
+ uint lo, hi;
+ umulExtended(a, b, hi, lo);
+ return hi;
+}
+
+const uint startTable[256] = uint[256](
+ 254, 252, 250, 248, 246, 244, 242, 240, 238, 236, 234, 233, 231, 229, 227, 225, 224, 222, 220, 218, 217, 215, 213, 212, 210, 208, 207, 205, 203, 202, 200, 199, 197, 195, 194, 192, 191, 189, 188, 186, 185, 183, 182, 180, 179, 178, 176, 175, 173, 172, 170, 169, 168, 166, 165, 164, 162, 161, 160, 158,
+157, 156, 154, 153, 152, 151, 149, 148, 147, 146, 144, 143, 142, 141, 139, 138, 137, 136, 135, 134, 132, 131, 130, 129, 128, 127, 126, 125, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109, 108, 107, 106, 105, 104, 103, 102, 101, 100, 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 88, 87, 86, 85, 84, 83, 82, 81, 80, 80, 79, 78, 77, 76, 75, 74, 74, 73, 72, 71, 70, 70, 69, 68, 67, 66, 66, 65, 64, 63, 62, 62, 61, 60, 59, 59, 58, 57, 56, 56, 55, 54, 53, 53, 52, 51, 50, 50, 49, 48, 48, 47, 46, 46, 45, 44, 43, 43, 42, 41, 41, 40, 39, 39, 38, 37, 37, 36, 35, 35, 34, 33, 33, 32, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0
+);
+
+uint Div(uint x, uint y, out uint r) // unsigned 32-bit division: returns the quotient x/y and writes the remainder to r
+{
+ // https://www.microsoft.com/en-us/research/publication/software-integer-division/
+ uint k = 31 - findMSB(y);
+ uint ty = (y << k) >> (32 - 9);
+ uint t = startTable[ty - 256] + 256;
+ uint z = (t << (32 - 9)) >> (32 - k - 1);
+ uint my = 0 - y;
+
+ z += Umulh(z, my * z);
+ z += Umulh(z, my * z);
+
+ uint q = Umulh(x, z);
+ r = x - y * q;
+ if(r >= y)
+ {
+ r = r - y;
+ q = q + 1;
+ if(r >= y)
+ {
+ r = r - y;
+ q = q + 1;
+ }
+ }
+
+ return q;
+}
+
+uint Div64_32_32(uint numHi, uint numLo, uint den) // divides the 64-bit value numHi:numLo by den, returning a 32-bit quotient
+{
+ // based on https://github.com/ridiculousfish/libdivide/blob/3bd34388573681ce563348cdf04fe15d24770d04/libdivide.h#L469
+ // modified to work with half the size 64/32=32 instead of 128/64=64
+ // for further details see https://ridiculousfish.com/blog/posts/labor-of-division-episode-iv.html
+
+ // We work in base 2**16.
+ // A uint32 holds a single digit (in the lower 16 bit). A uint32 holds two digits.
+ // Our numerator is conceptually [num3, num2, num1, num0].
+ // Our denominator is [den1, den0].
+ const uint b = (1U << 16);
+
+ // Determine the normalization factor. We multiply den by this, so that its leading digit is at
+ // least half b. In binary this means just shifting left by the number of leading zeros, so that
+ // there's a 1 in the MSB.
+ // We also shift numer by the same amount. This cannot overflow because numHi < den.
+ // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting
+ // by 64. (it's also UB in GLSL!!!!)
+ uint shift = 31 - findMSB(den);
+ den <<= shift;
+ numHi <<= shift;
+ numHi |= (numLo >> (-shift & 31U)) & uint(-int(shift) >> 31);
+ numLo <<= shift;
+
+ // Extract the low digits of the numerator and both digits of the denominator.
+ uint num1 = (numLo >> 16);
+ uint num0 = (numLo & 0xFFFFU);
+ uint den1 = (den >> 16);
+ uint den0 = (den & 0xFFFFU);
+
+ // We wish to compute q1 = [n3 n2 n1] / [d1 d0].
+ // Estimate q1 as [n3 n2] / [d1], and then correct it.
+ // Note while qhat may be 2 digits, q1 is always 1 digit.
+
+ uint rhat;
+ uint qhat = Div(numHi, den1, rhat);
+ uint c1 = qhat * den0;
+ uint c2 = rhat * b + num1;
+ if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
+ uint q1 = qhat & 0xFFFFU;
+
+ // Compute the true (partial) remainder.
+ uint rem = numHi * b + num1 - q1 * den;
+
+ // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0].
+ // Estimate q0 as [rem1 rem0] / [d1] and correct it.
+ qhat = Div(rem, den1, rhat);
+ c1 = qhat * den0;
+ c2 = rhat * b + num0;
+ if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
+
+ return bitfieldInsert(qhat, q1, 16, 16);
+}
+
+int InterpolateAttrPersp(int y0, int y1, int ifactor) // ifactor is a fixed-point weight with YFactorShift fractional bits
+{
+ if (y0 == y1)
+ return y0;
+
+ if (y0 < y1)
+ return y0 + (((y1-y0) * ifactor) >> YFactorShift);
+ else
+ return y1 + (((y0-y1) * ((1<<YFactorShift)-ifactor)) >> YFactorShift); // mirrored weight keeps the difference non-negative
+}
+
+int InterpolateAttrLinear(int y0, int y1, int i, int irecip, int idiff)
+{
+ if (y0 == y1)
+ return y0;
+
+#ifndef Rasterise
+ irecip = abs(irecip);
+#endif
+
+ uint mulLo, mulHi, carry;
+ if (y0 < y1)
+ {
+#ifndef Rasterise
+ uint offset = uint(abs(i));
+#else
+ uint offset = uint(i);
+#endif
+ umulExtended(uint(y1-y0)*offset, uint(irecip), mulHi, mulLo);
+ mulLo = uaddCarry(mulLo, 3U<<24, carry);
+ mulHi += carry;
+ return y0 + int((mulLo >> 30) | (mulHi << (32 - 30)));
+ //return y0 + int(((int64_t(y1-y0) * int64_t(offset) * int64_t(irecip)) + int64_t(3<<24)) >> 30);
+ }
+ else
+ {
+#ifndef Rasterise
+ uint offset = uint(abs(idiff-i));
+#else
+ uint offset = uint(idiff-i);
+#endif
+ umulExtended(uint(y0-y1)*offset, uint(irecip), mulHi, mulLo);
+ mulLo = uaddCarry(mulLo, 3U<<24, carry);
+ mulHi += carry;
+ return y1 + int((mulLo >> 30) | (mulHi << (32 - 30)));
+ //return y1 + int(((int64_t(y0-y1) * int64_t(offset) * int64_t(irecip)) + int64_t(3<<24)) >> 30);
+ }
+}
+
+uint InterpolateZZBuffer(int z0, int z1, int i, int irecip, int idiff)
+{
+ if (z0 == z1)
+ return z0;
+
+ uint base, disp, factor;
+ if (z0 < z1)
+ {
+ base = uint(z0);
+ disp = uint(z1 - z0);
+ factor = uint(abs(i));
+ }
+ else
+ {
+ base = uint(z1);
+ disp = uint(z0 - z1);
+ factor = uint(abs(idiff - i));
+ }
+
+#ifdef InterpSpans
+ int shiftl = 0;
+ const int shiftr = 22;
+ if (disp > 0x3FF)
+ {
+ shiftl = findMSB(disp) - 9;
+ disp >>= shiftl;
+ }
+#else
+ disp >>= 9;
+ const int shiftl = 0;
+ const int shiftr = 13;
+#endif
+ uint mulLo, mulHi;
+
+ umulExtended(disp * factor, abs(irecip) >> 8, mulHi, mulLo);
+
+ return base + (((mulLo >> shiftr) | (mulHi << (32 - shiftr))) << shiftl);
+/*
+ int base, disp, factor;
+ if (z0 < z1)
+ {
+ base = z0;
+ disp = z1 - z0;
+ factor = i;
+ }
+ else
+ {
+ base = z1;
+ disp = z0 - z1,
+ factor = idiff - i;
+ }
+
+#ifdef InterpSpans
+ {
+ int shift = 0;
+ while (disp > 0x3FF)
+ {
+ disp >>= 1;
+ shift++;
+ }
+
+ return base + int(((int64_t(disp) * int64_t(factor) * (int64_t(irecip) >> 8)) >> 22) << shift);
+ }
+#else
+ {
+ disp >>= 9;
+ return base + int((int64_t(disp) * int64_t(factor) * (int64_t(irecip) >> 8)) >> 13);
+ }
+#endif*/
+}
+
+uint InterpolateZWBuffer(int z0, int z1, int ifactor)
+{
+ if (z0 == z1)
+ return z0;
+
+#ifdef Rasterise
+ // since the precision along x spans is only 8 bit the result will always fit in 32-bit
+ if (z0 < z1)
+ {
+ return uint(z0) + (((z1-z0) * ifactor) >> YFactorShift);
+ }
+ else
+ {
+ return uint(z1) + (((z0-z1) * ((1<<YFactorShift)-ifactor)) >> YFactorShift);
+ }
+#else
+ uint mulLo, mulHi;
+ if (z0 < z1)
+ {
+ umulExtended(z1-z0, ifactor, mulHi, mulLo);
+ // 64-bit shift
+ return uint(z0) + ((mulLo >> YFactorShift) | (mulHi << (32-YFactorShift)));
+ }
+ else
+ {
+ umulExtended(z0-z1, (1<<YFactorShift)-ifactor, mulHi, mulLo);
+ return uint(z1) + ((mulLo >> YFactorShift) | (mulHi << (32-YFactorShift)));
+ }
+#endif
+ /*if (z0 < z1)
+ {
+ return uint(z0) + uint((int64_t(z1-z0) * int64_t(ifactor)) >> YFactorShift);
+ }
+ else
+ {
+ return uint(z1) + uint((int64_t(z0-z1) * int64_t((1<<YFactorShift)-ifactor)) >> YFactorShift);
+ }*/
+}
+#endif
+
+)"; // GLSL shared by the compute shaders: tile/bin constants (kept equal to the constexprs in ComputeRenderer), the std140 MetaUniform block, and, for InterpSpans/Rasterise only, integer division and attribute/depth interpolation helpers
+
+const std::string InterpSpans =
+ PolygonBuffer +
+ XSpanSetupBuffer +
+ YSpanSetupBuffer + R"(
+layout (local_size_x = 32) in;
+
+layout (binding = 0, rgba16ui) uniform readonly uimageBuffer SetupIndices;
+
+void main()
+{
+ uvec4 setup = imageLoad(SetupIndices, int(gl_GlobalInvocationID.x));
+
+ YSpanSetup spanL = YSpanSetups[setup.y];
+ YSpanSetup spanR = YSpanSetups[setup.z];
+
+ XSpanSetup xspan;
+ xspan.Flags = 0U;
+
+ int y = int(setup.w);
+
+ int dxl = CalculateDx(y, spanL);
+ int dxr = CalculateDx(y, spanR);
+
+ int xl = CalculateX(dxl, spanL);
+ int xr = CalculateX(dxr, spanR);
+
+ Polygon polygon = Polygons[setup.x];
+
+ int edgeLenL, edgeLenR;
+
+ if (xl > xr)
+ {
+ YSpanSetup tmpSpan = spanL;
+ spanL = spanR;
+ spanR = tmpSpan;
+
+ int tmp = xl;
+ xl = xr;
+ xr = tmp;
+
+ EdgeParams_YMajor(false, dxr, spanL, edgeLenL, xspan.EdgeCovL);
+ EdgeParams_YMajor(true, dxl, spanR, edgeLenR, xspan.EdgeCovR);
+ }
+ else
+ {
+ // edges are the right way
+ if (spanL.Increment > 0x40000)
+ EdgeParams_XMajor(false, dxl, spanL, edgeLenL, xspan.EdgeCovL);
+ else
+ EdgeParams_YMajor(false, dxl, spanL, edgeLenL, xspan.EdgeCovL);
+ if (spanR.Increment > 0x40000)
+ EdgeParams_XMajor(true, dxr, spanR, edgeLenR, xspan.EdgeCovR);
+ else
+ EdgeParams_YMajor(true, dxr, spanR, edgeLenR, xspan.EdgeCovR);
+ }
+
+ xspan.CovLInitial = (xspan.EdgeCovL >> 12) & 0x3FF;
+ if (xspan.CovLInitial == 0x3FF)
+ xspan.CovLInitial = 0;
+ xspan.CovRInitial = (xspan.EdgeCovR >> 12) & 0x3FF;
+ if (xspan.CovRInitial == 0x3FF)
+ xspan.CovRInitial = 0;
+
+ xspan.X0 = xl;
+ xspan.X1 = xr + 1;
+
+ uint polyalpha = ((polygon.Attr >> 16) & 0x1FU);
+ bool isWireframe = polyalpha == 0U;
+
+ if (!isWireframe || (y == polygon.YTop || y == polygon.YBot - 1))
+ xspan.Flags |= XSpanSetup_FillInside;
+
+ xspan.InsideStart = xspan.X0 + edgeLenL;
+ if (xspan.InsideStart > xspan.X1)
+ xspan.InsideStart = xspan.X1;
+ xspan.InsideEnd = xspan.X1 - edgeLenR;
+ if (xspan.InsideEnd > xspan.X1)
+ xspan.InsideEnd = xspan.X1;
+
+ bool isShadowMask = ((polygon.Attr & 0x3F000030U) == 0x00000030U);
+ bool fillAllEdges = polyalpha < 31 || (DispCnt & (3U<<4)) != 0U;
+
+ if (fillAllEdges || spanL.X1 < spanL.X0 || spanL.Increment <= 0x40000)
+ xspan.Flags |= XSpanSetup_FillLeft;
+ if (fillAllEdges || (spanR.X1 >= spanR.X0 && spanR.Increment > 0x40000) || spanR.Increment == 0)
+ xspan.Flags |= XSpanSetup_FillRight;
+
+ if (spanL.I0 == spanL.I1)
+ {
+ xspan.TexcoordU0 = spanL.TexcoordU0;
+ xspan.TexcoordV0 = spanL.TexcoordV0;
+ xspan.ColorR0 = spanL.ColorR0;
+ xspan.ColorG0 = spanL.ColorG0;
+ xspan.ColorB0 = spanL.ColorB0;
+ xspan.Z0 = spanL.Z0;
+ xspan.W0 = spanL.W0;
+ }
+ else
+ {
+ int i = (spanL.Increment > 0x40000 ? xl : y) - spanL.I0;
+ int ifactor = CalcYFactorY(spanL, i);
+ int idiff = spanL.I1 - spanL.I0;
+
+#ifdef ZBuffer
+ xspan.Z0 = int(InterpolateZZBuffer(spanL.Z0, spanL.Z1, i, spanL.IRecip, idiff));
+#endif
+#ifdef WBuffer
+ xspan.Z0 = int(InterpolateZWBuffer(spanL.Z0, spanL.Z1, ifactor));
+#endif
+
+ if (!spanL.Linear)
+ {
+ xspan.TexcoordU0 = InterpolateAttrPersp(spanL.TexcoordU0, spanL.TexcoordU1, ifactor);
+ xspan.TexcoordV0 = InterpolateAttrPersp(spanL.TexcoordV0, spanL.TexcoordV1, ifactor);
+
+ xspan.ColorR0 = InterpolateAttrPersp(spanL.ColorR0, spanL.ColorR1, ifactor);
+ xspan.ColorG0 = InterpolateAttrPersp(spanL.ColorG0, spanL.ColorG1, ifactor);
+ xspan.ColorB0 = InterpolateAttrPersp(spanL.ColorB0, spanL.ColorB1, ifactor);
+
+ xspan.W0 = InterpolateAttrPersp(spanL.W0, spanL.W1, ifactor);
+ }
+ else
+ {
+ xspan.TexcoordU0 = InterpolateAttrLinear(spanL.TexcoordU0, spanL.TexcoordU1, i, spanL.IRecip, idiff);
+ xspan.TexcoordV0 = InterpolateAttrLinear(spanL.TexcoordV0, spanL.TexcoordV1, i, spanL.IRecip, idiff);
+
+ xspan.ColorR0 = InterpolateAttrLinear(spanL.ColorR0, spanL.ColorR1, i, spanL.IRecip, idiff);
+ xspan.ColorG0 = InterpolateAttrLinear(spanL.ColorG0, spanL.ColorG1, i, spanL.IRecip, idiff);
+ xspan.ColorB0 = InterpolateAttrLinear(spanL.ColorB0, spanL.ColorB1, i, spanL.IRecip, idiff);
+
+ xspan.W0 = spanL.W0; // linear mode is only taken if W0 == W1
+ }
+ }
+
+ if (spanR.I0 == spanR.I1)
+ {
+ xspan.TexcoordU1 = spanR.TexcoordU0;
+ xspan.TexcoordV1 = spanR.TexcoordV0;
+ xspan.ColorR1 = spanR.ColorR0;
+ xspan.ColorG1 = spanR.ColorG0;
+ xspan.ColorB1 = spanR.ColorB0;
+ xspan.Z1 = spanR.Z0;
+ xspan.W1 = spanR.W0;
+ }
+ else
+ {
+ int i = (spanR.Increment > 0x40000 ? xr : y) - spanR.I0;
+ int ifactor = CalcYFactorY(spanR, i);
+ int idiff = spanR.I1 - spanR.I0;
+
+ #ifdef ZBuffer
+ xspan.Z1 = int(InterpolateZZBuffer(spanR.Z0, spanR.Z1, i, spanR.IRecip, idiff));
+ #endif
+ #ifdef WBuffer
+ xspan.Z1 = int(InterpolateZWBuffer(spanR.Z0, spanR.Z1, ifactor));
+ #endif
+
+ if (!spanR.Linear)
+ {
+ xspan.TexcoordU1 = InterpolateAttrPersp(spanR.TexcoordU0, spanR.TexcoordU1, ifactor);
+ xspan.TexcoordV1 = InterpolateAttrPersp(spanR.TexcoordV0, spanR.TexcoordV1, ifactor);
+
+ xspan.ColorR1 = InterpolateAttrPersp(spanR.ColorR0, spanR.ColorR1, ifactor);
+ xspan.ColorG1 = InterpolateAttrPersp(spanR.ColorG0, spanR.ColorG1, ifactor);
+ xspan.ColorB1 = InterpolateAttrPersp(spanR.ColorB0, spanR.ColorB1, ifactor);
+
+ xspan.W1 = int(InterpolateAttrPersp(spanR.W0, spanR.W1, ifactor));
+ }
+ else
+ {
+ xspan.TexcoordU1 = InterpolateAttrLinear(spanR.TexcoordU0, spanR.TexcoordU1, i, spanR.IRecip, idiff);
+ xspan.TexcoordV1 = InterpolateAttrLinear(spanR.TexcoordV0, spanR.TexcoordV1, i, spanR.IRecip, idiff);
+
+ xspan.ColorR1 = InterpolateAttrLinear(spanR.ColorR0, spanR.ColorR1, i, spanR.IRecip, idiff);
+ xspan.ColorG1 = InterpolateAttrLinear(spanR.ColorG0, spanR.ColorG1, i, spanR.IRecip, idiff);
+ xspan.ColorB1 = InterpolateAttrLinear(spanR.ColorB0, spanR.ColorB1, i, spanR.IRecip, idiff);
+
+ xspan.W1 = spanR.W0;
+ }
+ }
+
+ if (xspan.W0 == xspan.W1 && ((xspan.W0 | xspan.W1) & 0x7F) == 0)
+ {
+ xspan.Flags |= XSpanSetup_Linear;
+// a bit hacky, but when wbuffering we only need to calculate xrecip for linear spans
+#ifdef ZBuffer
+ }
+ {
+#endif
+ uint r;
+ xspan.XRecip = int(Div(1U<<30, uint(xspan.X1 - xspan.X0), r));
+ }
+
+ XSpanSetups[gl_GlobalInvocationID.x] = xspan;
+}
+
+)"; // compute shader source (local_size_x = 32): each invocation loads one SetupIndices texel (x = polygon, y/z = left/right Y-span, w = scanline), derives the span's X range, edge coverage, fill flags and end-point attributes, and stores the result in XSpanSetups[gl_GlobalInvocationID.x]
+
+const std::string ClearIndirectWorkCount = /* Compute shader, 32 invocations per work group.
+ * Each invocation overwrites the VariantWorkCount entry indexed by its
+ * gl_GlobalInvocationID.x with uvec4(1, 1, 0, 0). The other shaders in this file use
+ * the .z component as a per-variant work counter and the .w components of entries 0
+ * and 1 as running totals, so this resets those counters to zero.
+ * NOTE(review): the x/y value of 1 presumably serves as the fixed dimensions of an
+ * indirect dispatch - confirm against the host code. */
+ BinningBuffer + R"(
+
+layout (local_size_x = 32) in;
+
+void main()
+{
+ VariantWorkCount[gl_GlobalInvocationID.x] = uvec4(1, 1, 0, 0);
+}
+
+)";
+
+const std::string ClearCoarseBinMask = /* Compute shader, 32 invocations per work group.
+ * Each invocation zeroes two consecutive words of BinningMaskAndOffset, located at
+ * BinningCoarseMaskStart + gl_GlobalInvocationID.x * CoarseBinStride (+0 and +1).
+ * These are the same two coarse-mask words per tile that DepthBlend later reads as
+ * coarseMaskLo / coarseMaskHi. */
+ BinningBuffer + R"(
+layout (local_size_x = 32) in;
+
+void main()
+{
+ BinningMaskAndOffset[BinningCoarseMaskStart + gl_GlobalInvocationID.x*CoarseBinStride+0] = 0;
+ BinningMaskAndOffset[BinningCoarseMaskStart + gl_GlobalInvocationID.x*CoarseBinStride+1] = 0;
+}
+
+)";
+
+const std::string BinCombined = /* Compute shader, 32 invocations per work group, that bins
+ * polygons into tiles and emits the unsorted work descriptors.
+ *
+ * Work group layout: gl_WorkGroupID.x selects a group of 32 polygons, gl_WorkGroupID.yz
+ * selects a coarse tile.
+ *
+ * Stage 1 (coarse): invocation localIdx tests polygon groupIdx*32 + localIdx (if it is
+ * below NumPolygons) against the coarse tile with BinPolygon; the hits are merged into
+ * the shared word mergedMaskShared via atomicOr, bracketed by barrier() calls.
+ *
+ * Stage 2 (fine): the same invocation is then treated as fine tile
+ * (localIdx & 7, localIdx >> 3) inside the coarse tile and re-tests every polygon whose
+ * bit is set in the merged mask against that fine tile.
+ *
+ * Output per fine tile and polygon group:
+ *  - the fine mask at BinningMaskStart + linearTile*BinStride + groupIdx;
+ *  - if the mask is non-zero, bit (groupIdx & 0x1F) of the coarse mask word is set,
+ *    bitCount(mask) work slots are reserved with atomicAdd on VariantWorkCount[0].w,
+ *    and the first slot index is stored at BinningWorkOffsetsStart + ...;
+ *  - one WorkDescs entry per binned polygon: .x packs the tile's top-left position
+ *    (x in bits 0-15, y in bits 16-31), .y packs the polygon index in bits 0-10 and
+ *    the per-variant offset (from atomicAdd on VariantWorkCount[variant].z) in bits 11-31.
+ *
+ * BinPolygon is the overlap test: it rejects on the Y range first, then compares the
+ * tile's X range with the polygon's X extent, taken either from the XSpanSetups rows at
+ * the tile's top and bottom or from polygon.XMin / XMax when the row of that extreme
+ * (XMinY / XMaxY) lies inside the tile. */
+ PolygonBuffer +
+ BinningBuffer +
+ XSpanSetupBuffer +
+ WorkDescBuffer + R"(
+
+layout (local_size_x = 32) in;
+
+bool BinPolygon(Polygon polygon, ivec2 topLeft, ivec2 botRight)
+{
+ if (polygon.YTop > botRight.y || polygon.YBot <= topLeft.y)
+ return false;
+
+ int polygonHeight = polygon.YBot - polygon.YTop;
+
+ /*
+ All (good) polygons are convex. So the following holds true:
+
+ Starting from the top most point where both edges originate
+ the X coordinate of the left edge will stay the same or falls until
+ the minimum X-axis coordinate is reached. Then it stays the same or
+ rises until the point it meets with the right edge.
+
+ The same applies to the right edge, except that it first may rise or stay equal and
+ after the maximum point may only fall or stay the same.
+
+ This means that for every tile which doesn't contain the point where the direction changes
+ we can just get the maximum point by sampling the top most and bottom most coordinate
+ within the tile.
+
+ For a tile which is that the height of the direction change
+
+ As a sidenote another consequence of this design decision is
+ that malformed polygons aren't binned properly.
+
+ As a note bottom Y is exclusive!
+ */
+ int polyInnerTopY = clamp(topLeft.y - polygon.YTop, 0, max(polygonHeight-1, 0));
+ int polyInnerBotY = clamp(botRight.y - polygon.YTop, 0, max(polygonHeight-1, 0));
+
+ XSpanSetup xspanTop = XSpanSetups[polygon.FirstXSpan + polyInnerTopY];
+ XSpanSetup xspanBot = XSpanSetups[polygon.FirstXSpan + polyInnerBotY];
+
+ int minXL;
+ if (polygon.XMinY >= topLeft.y && polygon.XMinY <= botRight.y)
+ minXL = polygon.XMin;
+ else
+ minXL = min(xspanTop.X0, xspanBot.X0);
+
+ if (minXL > botRight.x)
+ return false;
+
+ int maxXR;
+ if (polygon.XMaxY >= topLeft.y && polygon.XMaxY <= botRight.y)
+ maxXR = polygon.XMax;
+ else
+ maxXR = max(xspanTop.X1, xspanBot.X1) - 1;
+
+ if (maxXR < topLeft.x)
+ return false;
+
+ return true;
+}
+
+shared uint mergedMaskShared;
+
+void main()
+{
+ int groupIdx = int(gl_WorkGroupID.x);
+ ivec2 coarseTile = ivec2(gl_WorkGroupID.yz);
+
+#if 0
+ int localIdx = int(gl_SubGroupInvocationARB);
+#else
+ int localIdx = int(gl_LocalInvocationIndex);
+
+ if (localIdx == 0)
+ mergedMaskShared = 0U;
+ barrier();
+#endif
+
+ int polygonIdx = groupIdx * 32 + localIdx;
+
+ ivec2 coarseTopLeft = coarseTile * ivec2(CoarseTileW, CoarseTileH);
+ ivec2 coarseBotRight = coarseTopLeft + ivec2(CoarseTileW-1, CoarseTileH-1);
+
+ bool binned = false;
+ if (polygonIdx < NumPolygons)
+ {
+ binned = BinPolygon(Polygons[polygonIdx], coarseTopLeft, coarseBotRight);
+ }
+
+#if 0
+ uint mergedMask = unpackUint2x32(ballotARB(binned)).x;
+#else
+ if (binned)
+ atomicOr(mergedMaskShared, 1U << localIdx);
+ barrier();
+ uint mergedMask = mergedMaskShared;
+#endif
+
+ ivec2 fineTile = ivec2(localIdx & 0x7, localIdx >> 3);
+
+ ivec2 fineTileTopLeft = coarseTopLeft + fineTile * ivec2(TileSize, TileSize);
+ ivec2 fineTileBotRight = fineTileTopLeft + ivec2(TileSize-1, TileSize-1);
+
+ uint binnedMask = 0U;
+ while (mergedMask != 0U)
+ {
+ int bit = findLSB(mergedMask);
+ mergedMask &= ~(1U << bit);
+
+ int polygonIdx = groupIdx * 32 + bit;
+
+ if (BinPolygon(Polygons[polygonIdx], fineTileTopLeft, fineTileBotRight))
+ binnedMask |= 1U << bit;
+ }
+
+ int linearTile = fineTile.x + fineTile.y * TilesPerLine + coarseTile.x * CoarseTileCountX + coarseTile.y * TilesPerLine * CoarseTileCountY;
+
+ BinningMaskAndOffset[BinningMaskStart + linearTile * BinStride + groupIdx] = binnedMask;
+ int coarseMaskIdx = linearTile * CoarseBinStride + (groupIdx >> 5);
+ if (binnedMask != 0U)
+ atomicOr(BinningMaskAndOffset[BinningCoarseMaskStart + coarseMaskIdx], 1U << (groupIdx & 0x1F));
+
+ if (binnedMask != 0U)
+ {
+ uint workOffset = atomicAdd(VariantWorkCount[0].w, uint(bitCount(binnedMask)));
+ BinningMaskAndOffset[BinningWorkOffsetsStart + linearTile * BinStride + groupIdx] = workOffset;
+
+ uint tilePositionCombined = bitfieldInsert(fineTileTopLeft.x, fineTileTopLeft.y, 16, 16);
+
+ int idx = 0;
+ while (binnedMask != 0U)
+ {
+ int bit = findLSB(binnedMask);
+ binnedMask &= ~(1U << bit);
+
+ int polygonIdx = groupIdx * 32 + bit;
+ int variantIdx = Polygons[polygonIdx].Variant;
+
+ int inVariantOffset = int(atomicAdd(VariantWorkCount[variantIdx].z, 1));
+ WorkDescs[WorkDescsUnsortedStart + workOffset + idx] = uvec2(tilePositionCombined, bitfieldInsert(polygonIdx, inVariantOffset, 11, 21));
+
+ idx++;
+ }
+ }
+}
+
+)";
+
+const std::string CalcOffsets = /* Compute shader, 32 invocations per work group, that turns
+ * the per-variant work counts into start offsets for the sorted work list.
+ * For every variant index below NumVariants, SortedWorkOffset[i] receives the value
+ * returned by atomicAdd(VariantWorkCount[1].w, VariantWorkCount[i].z), i.e. the running
+ * total before this variant's count is added. Because the offsets are claimed with an
+ * atomic add, the order of the variants' ranges depends on execution order.
+ * Invocation 0 additionally fills SortWorkWorkCount with
+ * ((VariantWorkCount[0].w + 31) / 32, 1, 1, 0): the total work count rounded up to
+ * groups of 32, matching SortWork's local size. */
+ BinningBuffer + R"(
+
+layout (local_size_x = 32) in;
+
+void main()
+{
+ if (gl_GlobalInvocationID.x < NumVariants)
+ {
+ if (gl_GlobalInvocationID.x == 0)
+ {
+ // a bit of a cheat putting this here, but this shader won't run that often
+ SortWorkWorkCount = uvec4((VariantWorkCount[0].w + 31) / 32, 1, 1, 0);
+ }
+ SortedWorkOffset[gl_GlobalInvocationID.x] = atomicAdd(VariantWorkCount[1].w, VariantWorkCount[gl_GlobalInvocationID.x].z);
+ }
+}
+
+
+)";
+
+const std::string SortWork = /* Compute shader, 32 invocations per work group, that scatters
+ * the unsorted work descriptors into per-variant ranges.
+ * Each invocation below VariantWorkCount[0].w reads one unsorted descriptor, extracts the
+ * polygon index (bits 0-10 of .y) and the per-variant offset (bits 11-31 of .y), looks up
+ * the polygon's Variant, and writes the descriptor to
+ * WorkDescsSortedStart + SortedWorkOffset[variant] + offset.
+ * In the written copy the upper 21 bits of .y are replaced with the descriptor's unsorted
+ * index (gl_GlobalInvocationID.x), which Rasterise later uses to compute its tile offset. */
+ PolygonBuffer +
+ BinningBuffer +
+ WorkDescBuffer + R"(
+
+layout (local_size_x = 32) in;
+
+void main()
+{
+ if (gl_GlobalInvocationID.x < VariantWorkCount[0].w)
+ {
+ uvec2 workDesc = WorkDescs[WorkDescsUnsortedStart + gl_GlobalInvocationID.x];
+ int inVariantOffset = int(bitfieldExtract(workDesc.y, 11, 21));
+ int polygonIdx = int(bitfieldExtract(workDesc.y, 0, 11));
+ int variantIdx = Polygons[polygonIdx].Variant;
+
+ int sortedIndex = int(SortedWorkOffset[variantIdx]) + inVariantOffset;
+ WorkDescs[WorkDescsSortedStart + sortedIndex] = uvec2(workDesc.x, bitfieldInsert(workDesc.y, gl_GlobalInvocationID.x, 11, 21));
+ }
+}
+
+)";
+
+const std::string Rasterise = /* Compute shader that shades one polygon inside one tile.
+ * Local size is TileSize x TileSize, so each invocation handles one pixel of the tile.
+ *
+ * The work group picks its sorted work descriptor at
+ * SortedWorkOffset[CurVariant] + gl_WorkGroupID.z and decodes from it the polygon index
+ * (bits 0-10 of .y), the tile's top-left position (.x, 16 bits each) and the tile
+ * slot (bits 11-31 of .y) that determines tileOffset into the tile buffers.
+ *
+ * Per pixel:
+ *  1. Skip unless the row is within [YTop, YBot) and the column within [X0, X1) of the
+ *     row's XSpanSetup, and the matching fill flag (left edge / right edge / inside) is set.
+ *  2. Build attr: bits 0x1/0x2 for left/right edge, 0x4/0x8 for top/bottom row, and the
+ *     edge coverage value shifted left by 8.
+ *  3. Interpolate z, texture coordinates and vertex colour along the span, using the
+ *     linear path when XSpanSetup_Linear is set and the perspective path otherwise.
+ *  4. Combine colour according to the preprocessor variant (Toon, Highlight, NoTexture,
+ *     UseTexture with Decal or Modulate).
+ *  5. If alpha exceeds AlphaRef, pack r | g<<8 | b<<16 | a<<24 and store depth and attr.
+ *
+ * With ShadowMask defined, steps 4-5 are replaced by writing colour 0xFFFFFFFF and depth.
+ * ColorTiles[tileOffset] is always written; it stays 0 for pixels that were not drawn. */
+ PolygonBuffer +
+ WorkDescBuffer +
+ XSpanSetupBuffer +
+ BinningBuffer +
+ Tilebuffers + R"(
+
+layout (local_size_x = TileSize, local_size_y = TileSize) in;
+
+layout (binding = 0) uniform usampler2DArray CurrentTexture;
+
+layout (location = 0) uniform uint CurVariant;
+layout (location = 1) uniform vec2 InvTextureSize;
+
+void main()
+{
+ uvec2 workDesc = WorkDescs[WorkDescsSortedStart + SortedWorkOffset[CurVariant] + gl_WorkGroupID.z];
+ Polygon polygon = Polygons[bitfieldExtract(workDesc.y, 0, 11)];
+ ivec2 position = ivec2(bitfieldExtract(workDesc.x, 0, 16), bitfieldExtract(workDesc.x, 16, 16)) + ivec2(gl_LocalInvocationID.xy);
+ int tileOffset = int(bitfieldExtract(workDesc.y, 11, 21)) * TileSize * TileSize + TileSize * int(gl_LocalInvocationID.y) + int(gl_LocalInvocationID.x);
+
+ uint color = 0U;
+ if (position.y >= polygon.YTop && position.y < polygon.YBot)
+ {
+ XSpanSetup xspan = XSpanSetups[polygon.FirstXSpan + (position.y - polygon.YTop)];
+
+ bool insideLeftEdge = position.x < xspan.InsideStart;
+ bool insideRightEdge = position.x >= xspan.InsideEnd;
+ bool insidePolygonInside = !insideLeftEdge && !insideRightEdge;
+
+ if (position.x >= xspan.X0 && position.x < xspan.X1
+ && ((insideLeftEdge && (xspan.Flags & XSpanSetup_FillLeft) != 0U)
+ || (insideRightEdge && (xspan.Flags & XSpanSetup_FillRight) != 0U)
+ || (insidePolygonInside && (xspan.Flags & XSpanSetup_FillInside) != 0U)))
+ {
+ uint attr = 0;
+ if (position.y == polygon.YTop)
+ attr |= 0x4U;
+ else if (position.y == polygon.YBot - 1)
+ attr |= 0x8U;
+
+ if (insideLeftEdge)
+ {
+ attr |= 0x1U;
+
+ int cov = xspan.EdgeCovL;
+ if (cov < 0)
+ {
+ int xcov = xspan.CovLInitial + (xspan.EdgeCovL & 0x3FF) * (position.x - xspan.X0);
+ cov = min(xcov >> 5, 31);
+ }
+
+ attr |= uint(cov) << 8;
+ }
+ else if (insideRightEdge)
+ {
+ attr |= 0x2U;
+
+ int cov = xspan.EdgeCovR;
+ if (cov < 0)
+ {
+ int xcov = xspan.CovRInitial + (xspan.EdgeCovR & 0x3FF) * (position.x - xspan.InsideEnd);
+ cov = max(0x1F - (xcov >> 5), 0);
+ }
+
+ attr |= uint(cov) << 8;
+ }
+
+ uint z;
+ int u, v, vr, vg, vb;
+
+ if (xspan.X0 == xspan.X1)
+ {
+ z = xspan.Z0;
+ u = xspan.TexcoordU0;
+ v = xspan.TexcoordV0;
+ vr = xspan.ColorR0;
+ vg = xspan.ColorG0;
+ vb = xspan.ColorB0;
+ }
+ else
+ {
+ int ifactor = CalcYFactorX(xspan, position.x);
+ int idiff = xspan.X1 - xspan.X0;
+ int i = position.x - xspan.X0;
+
+#ifdef ZBuffer
+ z = InterpolateZZBuffer(xspan.Z0, xspan.Z1, i, xspan.XRecip, idiff);
+#endif
+#ifdef WBuffer
+ z = InterpolateZWBuffer(xspan.Z0, xspan.Z1, ifactor);
+#endif
+ if ((xspan.Flags & XSpanSetup_Linear) == 0U)
+ {
+ u = InterpolateAttrPersp(xspan.TexcoordU0, xspan.TexcoordU1, ifactor);
+ v = InterpolateAttrPersp(xspan.TexcoordV0, xspan.TexcoordV1, ifactor);
+
+ vr = InterpolateAttrPersp(xspan.ColorR0, xspan.ColorR1, ifactor);
+ vg = InterpolateAttrPersp(xspan.ColorG0, xspan.ColorG1, ifactor);
+ vb = InterpolateAttrPersp(xspan.ColorB0, xspan.ColorB1, ifactor);
+ }
+ else
+ {
+ u = InterpolateAttrLinear(xspan.TexcoordU0, xspan.TexcoordU1, i, xspan.XRecip, idiff);
+ v = InterpolateAttrLinear(xspan.TexcoordV0, xspan.TexcoordV1, i, xspan.XRecip, idiff);
+
+ vr = InterpolateAttrLinear(xspan.ColorR0, xspan.ColorR1, i, xspan.XRecip, idiff);
+ vg = InterpolateAttrLinear(xspan.ColorG0, xspan.ColorG1, i, xspan.XRecip, idiff);
+ vb = InterpolateAttrLinear(xspan.ColorB0, xspan.ColorB1, i, xspan.XRecip, idiff);
+ }
+ }
+
+#ifndef ShadowMask
+ vr >>= 3;
+ vg >>= 3;
+ vb >>= 3;
+
+ uint r, g, b, a;
+ uint polyalpha = bitfieldExtract(polygon.Attr, 16, 5);
+
+#ifdef Toon
+ uint tooncolor = ToonTable[vr >> 1].r;
+ vr = int(bitfieldExtract(tooncolor, 0, 8));
+ vg = int(bitfieldExtract(tooncolor, 8, 8));
+ vb = int(bitfieldExtract(tooncolor, 16, 8));
+#endif
+#ifdef Highlight
+ vg = vr;
+ vb = vr;
+#endif
+
+#ifdef NoTexture
+ a = int(polyalpha);
+#endif
+ r = vr;
+ g = vg;
+ b = vb;
+
+#ifdef UseTexture
+ vec2 uvf = vec2(ivec2(u, v)) * vec2(1.0 / 16.0) * InvTextureSize;
+
+ uvec4 texcolor = texture(CurrentTexture, vec3(uvf, polygon.TextureLayer));
+#ifdef Decal
+ if (texcolor.a == 31)
+ {
+ r = int(texcolor.r);
+ g = int(texcolor.g);
+ b = int(texcolor.b);
+ }
+ else if (texcolor.a > 0)
+ {
+ r = int((texcolor.r * texcolor.a) + (vr * (31-texcolor.a))) >> 5;
+ g = int((texcolor.g * texcolor.a) + (vg * (31-texcolor.a))) >> 5;
+ b = int((texcolor.b * texcolor.a) + (vb * (31-texcolor.a))) >> 5;
+ }
+ a = int(polyalpha);
+#endif
+#if defined(Modulate) || defined(Toon) || defined(Highlight)
+ r = int((texcolor.r+1) * (vr+1) - 1) >> 6;
+ g = int((texcolor.g+1) * (vg+1) - 1) >> 6;
+ b = int((texcolor.b+1) * (vb+1) - 1) >> 6;
+ a = int((texcolor.a+1) * (polyalpha+1) - 1) >> 5;
+#endif
+#endif
+
+#ifdef Highlight
+ uint tooncolor = ToonTable[vr >> 1].r;
+
+ r = min(r + int(bitfieldExtract(tooncolor, 0, 8)), 63);
+ g = min(g + int(bitfieldExtract(tooncolor, 8, 8)), 63);
+ b = min(b + int(bitfieldExtract(tooncolor, 16, 8)), 63);
+#endif
+
+ if (polyalpha == 0)
+ a = 31;
+
+ if (a > AlphaRef)
+ {
+ color = r | (g << 8) | (b << 16) | (a << 24);
+
+ DepthTiles[tileOffset] = z;
+ AttrTiles[tileOffset] = attr;
+ }
+#else
+ color = 0xFFFFFFFF; // doesn't really matter as long as it's not 0
+ DepthTiles[tileOffset] = z;
+#endif
+ }
+ }
+
+ ColorTiles[tileOffset] = color;
+}
+
+)";
+
+const std::string DepthBlend = /* Compute shader that resolves the rasterised tiles of all
+ * binned polygons into the final two-layer colour / depth / attribute result.
+ * Local size is TileSize x TileSize: one work group per tile
+ * (linearTile = gl_WorkGroupID.x + gl_WorkGroupID.y * TilesPerLine), one invocation per pixel.
+ *
+ * main() starts with the clear values in layer .x and zeros in layer .y, then calls
+ * ProcessCoarseMask for the tile's two coarse mask words (the second one with a polygon
+ * group offset of BinStride/2) and finally stores both layers into ResultValue, the
+ * second layer at +FramebufferStride.
+ *
+ * ProcessCoarseMask() iterates the set bits of the coarse mask, then the set bits of each
+ * fine mask, advancing workIdx by one tile slot per polygon. Pixels whose ColorTiles
+ * value is 0 are skipped. For the rest it handles, per polygon:
+ *  - shadow mask polygons: update the stencil bits from failed depth tests
+ *    (bit 0 for layer .x, bit 1 for layer .y when the top pixel is an edge);
+ *  - shadow polygons: consult the stencil to decide whether and into which layer to draw;
+ *  - the depth test (equal-depth test or less-than), falling back to the second layer
+ *    when the top layer is an edge pixel;
+ *  - opaque pixels (alpha 0x1F): overwrite the chosen layer, pushing the old top layer
+ *    down when the new pixel is an edge;
+ *  - translucent pixels: blend via PlotTranslucent into the top and/or second layer.
+ *
+ * PlotTranslucent() blends tileColor over color with weight (srcA >> 24) + 1 out of 32
+ * when the destination alpha is non-zero, keeps the larger alpha, merges attributes, and
+ * optionally writes depth. It does nothing if the ID comparison in its condition fails. */
+ PolygonBuffer +
+ Tilebuffers +
+ ResultBuffer +
+ BinningBuffer + R"(
+
+layout (local_size_x = TileSize, local_size_y = TileSize) in;
+
+void PlotTranslucent(inout uint color, inout uint depth, inout uint attr, bool isShadow, uint tileColor, uint srcA, uint tileDepth, uint srcAttr, bool writeDepth)
+{
+ uint blendAttr = (srcAttr & 0xE0F0U) | ((srcAttr >> 8) & 0xFF0000U) | (1U<<22) | (attr & 0xFF001F0FU);
+
+ if ((!isShadow || (attr & (1U<<22)) != 0U)
+ ? (attr & 0x007F0000U) != (blendAttr & 0x007F0000U)
+ : (attr & 0x3F000000U) != (srcAttr & 0x3F000000U))
+ {
+ // le blend
+ if (writeDepth)
+ depth = tileDepth;
+
+ if ((attr & (1U<<15)) == 0)
+ blendAttr &= ~(1U<<15);
+ attr = blendAttr;
+
+ uint srcRB = tileColor & 0x3F003FU;
+ uint srcG = tileColor & 0x003F00U;
+ uint dstRB = color & 0x3F003FU;
+ uint dstG = color & 0x003F00U;
+ uint dstA = color & 0x1F000000U;
+
+ uint alpha = (srcA >> 24) + 1;
+ if (dstA != 0)
+ {
+ srcRB = ((srcRB * alpha) + (dstRB * (32-alpha))) >> 5;
+ srcG = ((srcG * alpha) + (dstG * (32-alpha))) >> 5;
+ }
+
+ color = (srcRB & 0x3F003FU) | (srcG & 0x003F00U) | max(dstA, srcA);
+ }
+}
+
+void ProcessCoarseMask(int linearTile, uint coarseMask, uint coarseOffset,
+ inout uvec2 color, inout uvec2 depth, inout uvec2 attr, inout uint stencil,
+ inout bool prevIsShadowMask)
+{
+ int tileInnerOffset = int(gl_LocalInvocationID.x) + int(gl_LocalInvocationID.y) * TileSize;
+
+ while (coarseMask != 0U)
+ {
+ uint coarseBit = findLSB(coarseMask);
+ coarseMask &= ~(1U << coarseBit);
+
+ uint tileOffset = linearTile * BinStride + coarseBit + coarseOffset;
+
+ uint fineMask = BinningMaskAndOffset[BinningMaskStart + tileOffset];
+ uint workIdx = BinningMaskAndOffset[BinningWorkOffsetsStart + tileOffset];
+
+ while (fineMask != 0U)
+ {
+ uint fineIdx = findLSB(fineMask);
+ fineMask &= ~(1U << fineIdx);
+
+ uint pixelindex = tileInnerOffset + workIdx * TileSize * TileSize;
+ uint tileColor = ColorTiles[pixelindex];
+ workIdx++;
+
+ uint polygonIdx = fineIdx + (coarseBit + coarseOffset) * 32;
+
+ if (tileColor != 0U)
+ {
+ uint polygonAttr = Polygons[polygonIdx].Attr;
+
+ bool isShadowMask = ((polygonAttr & 0x3F000030U) == 0x00000030U);
+ bool prevIsShadowMaskOld = prevIsShadowMask;
+ prevIsShadowMask = isShadowMask;
+
+ bool equalDepthTest = (polygonAttr & (1U << 14)) != 0U;
+
+ uint tileDepth = DepthTiles[pixelindex];
+ uint tileAttr = AttrTiles[pixelindex];
+
+ uint dstattr = attr.x;
+
+ if (!isShadowMask)
+ {
+ bool isShadow = (polygonAttr & 0x30U) == 0x30U;
+
+ bool writeSecondLayer = false;
+
+ if (isShadow)
+ {
+ if (stencil == 0U)
+ continue;
+ if ((stencil & 1U) == 0U)
+ writeSecondLayer = true;
+ if ((stencil & 2U) == 0U)
+ dstattr &= ~0x3U;
+ }
+
+ uint dstDepth = writeSecondLayer ? depth.y : depth.x;
+ if (!(equalDepthTest
+#ifdef WBuffer
+ ? dstDepth - tileDepth + 0xFFU <= 0x1FE
+#endif
+#ifdef ZBuffer
+ ? dstDepth - tileDepth + 0x200 <= 0x400
+#endif
+ : tileDepth < dstDepth))
+ {
+ if ((dstattr & 0x3U) == 0U || writeSecondLayer)
+ continue;
+
+ writeSecondLayer = true;
+ dstattr = attr.y;
+ if (!(equalDepthTest
+#ifdef WBuffer
+ ? depth.y - tileDepth + 0xFFU <= 0x1FE
+#endif
+#ifdef ZBuffer
+ ? depth.y - tileDepth + 0x200 <= 0x400
+#endif
+ : tileDepth < depth.y))
+ continue;
+ }
+
+ uint srcAttr = (polygonAttr & 0x3F008000U);
+
+ uint srcA = tileColor & 0x1F000000U;
+ if (srcA == 0x1F000000U)
+ {
+ srcAttr |= tileAttr;
+
+ if (!writeSecondLayer)
+ {
+ if ((srcAttr & 0x3U) != 0U)
+ {
+ color.y = color.x;
+ depth.y = depth.x;
+ attr.y = attr.x;
+ }
+
+ color.x = tileColor;
+ depth.x = tileDepth;
+ attr.x = srcAttr;
+ }
+ else
+ {
+ color.y = tileColor;
+ depth.y = tileDepth;
+ attr.y = srcAttr;
+ }
+ }
+ else
+ {
+ bool writeDepth = (polygonAttr & (1U<<11)) != 0;
+
+ if (!writeSecondLayer)
+ {
+ // blend into both layers
+ PlotTranslucent(color.x, depth.x, attr.x, isShadow, tileColor, srcA, tileDepth, srcAttr, writeDepth);
+ }
+ if (writeSecondLayer || (dstattr & 0x3U) != 0U)
+ {
+ PlotTranslucent(color.y, depth.y, attr.y, isShadow, tileColor, srcA, tileDepth, srcAttr, writeDepth);
+ }
+ }
+ }
+ else
+ {
+ if (!prevIsShadowMaskOld)
+ stencil = 0;
+
+ if (!(equalDepthTest
+#ifdef WBuffer
+ ? depth.x - tileDepth + 0xFFU <= 0x1FE
+#endif
+#ifdef ZBuffer
+ ? depth.x - tileDepth + 0x200 <= 0x400
+#endif
+ : tileDepth < depth.x))
+ stencil = 0x1U;
+
+ if ((dstattr & 0x3U) != 0U)
+ {
+ if (!(equalDepthTest
+#ifdef WBuffer
+ ? depth.y - tileDepth + 0xFFU <= 0x1FE
+#endif
+#ifdef ZBuffer
+ ? depth.y - tileDepth + 0x200 <= 0x400
+#endif
+ : tileDepth < depth.y))
+ stencil |= 0x2U;
+ }
+ }
+ }
+ }
+ }
+}
+
+void main()
+{
+ int linearTile = int(gl_WorkGroupID.x + (gl_WorkGroupID.y * TilesPerLine));
+
+ uint coarseMaskLo = BinningMaskAndOffset[BinningCoarseMaskStart + linearTile*CoarseBinStride + 0];
+ uint coarseMaskHi = BinningMaskAndOffset[BinningCoarseMaskStart + linearTile*CoarseBinStride + 1];
+
+ uvec2 color = uvec2(ClearColor, 0U);
+ uvec2 depth = uvec2(ClearDepth, 0U);
+ uvec2 attr = uvec2(ClearAttr, 0U);
+ uint stencil = 0U;
+ bool prevIsShadowMask = false;
+
+ ProcessCoarseMask(linearTile, coarseMaskLo, 0, color, depth, attr, stencil, prevIsShadowMask);
+ ProcessCoarseMask(linearTile, coarseMaskHi, BinStride/2, color, depth, attr, stencil, prevIsShadowMask);
+
+ int resultOffset = int(gl_GlobalInvocationID.x) + int(gl_GlobalInvocationID.y) * ScreenWidth;
+ ResultValue[ResultColorStart+resultOffset] = color.x;
+ ResultValue[ResultColorStart+resultOffset+FramebufferStride] = color.y;
+ ResultValue[ResultDepthStart+resultOffset] = depth.x;
+ ResultValue[ResultDepthStart+resultOffset+FramebufferStride] = depth.y;
+ ResultValue[ResultAttrStart+resultOffset] = attr.x;
+ ResultValue[ResultAttrStart+resultOffset+FramebufferStride] = attr.y;
+}
+
+)";
+
+const std::string FinalPass = /* Compute shader, 32 invocations per work group, one pixel per
+ * invocation, that post-processes the two-layer result and writes the output images.
+ *
+ * main() loads colour, depth and attributes of both layers from ResultValue (second layer
+ * at +FramebufferStride) and then applies, depending on preprocessor defines:
+ *  - EdgeMarking: for pixels with any of the low four attr bits set, compares polygon ID
+ *    (attr bits 24-29) and depth with the four direct neighbours (clear values are used
+ *    beyond the screen border); on a mismatch with a nearer depth the colour is replaced
+ *    by ToonTable[polyId >> 3].b, keeping the top byte, and the coverage field is set to 0x10.
+ *  - Fog: BlendFog on layer .x when attr bit 15 is set, and on layer .y when the top
+ *    pixel is an edge and the second layer has bit 15 set.
+ *  - AntiAliasing: for pixels with attr bit 0 or 1 set, mixes the top layer over the
+ *    second layer by (coverage + 1) / 32, or takes the second layer when coverage is 0.
+ *
+ * BlendFog() derives a density-table index and fraction from depth relative to FogOffset
+ * and FogShift, interpolates the density from the .g components of ToonTable, clamps it
+ * to 128, and blends FogColor into the colour; when DispCnt bit 6 is set only the alpha
+ * byte is replaced.
+ *
+ * Output: FinalFB receives a normalised vec4 built from bits 16-23, 8-15, 0-5 and 24-31
+ * of the colour (divided by 63, 63, 63 and 31). LowResFB receives the four raw bytes for
+ * every pixel whose coordinates are multiples of scale = ScreenWidth / 256. */
+ ResultBuffer + R"(
+
+layout (local_size_x = 32) in;
+
+layout (binding = 0, rgba8) writeonly uniform image2D FinalFB;
+layout (binding = 1, rgba8ui) writeonly uniform uimage2D LowResFB;
+
+uint BlendFog(uint color, uint depth)
+{
+ uint densityid = 0, densityfrac = 0;
+
+ if (depth >= FogOffset)
+ {
+ depth -= FogOffset;
+ depth = (depth >> 2) << FogShift;
+
+ densityid = depth >> 17;
+ if (densityid >= 32)
+ {
+ densityid = 32;
+ densityfrac = 0;
+ }
+ else
+ {
+ densityfrac = depth & 0x1FFFFU;
+ }
+ }
+
+ uint density =
+ ((ToonTable[densityid].g * (0x20000U-densityfrac)) +
+ (ToonTable[densityid+1].g * densityfrac)) >> 17;
+ density = min(density, 128U);
+
+ uint colorRB = color & 0x3F003FU;
+ uint colorGA = (color >> 8) & 0x3F003FU;
+
+ uint fogRB = FogColor & 0x3F003FU;
+ uint fogGA = (FogColor >> 8) & 0x1F003FU;
+
+ uint finalColorRB = ((fogRB * density) + (colorRB * (128-density))) >> 7;
+ uint finalColorGA = ((fogGA * density) + (colorGA * (128-density))) >> 7;
+
+ finalColorRB &= 0x3F003FU;
+ finalColorGA &= 0x1F003FU;
+
+ return (DispCnt & (1U<<6)) != 0
+ ? (bitfieldInsert(color, finalColorGA >> 16, 24, 8))
+ : (finalColorRB | (finalColorGA << 8));
+}
+
+void main()
+{
+ int srcX = int(gl_GlobalInvocationID.x);
+ int resultOffset = int(srcX) + int(gl_GlobalInvocationID.y) * ScreenWidth;
+
+ uvec2 color = uvec2(ResultValue[resultOffset+ResultColorStart], ResultValue[resultOffset+FramebufferStride+ResultColorStart]);
+ uvec2 depth = uvec2(ResultValue[resultOffset+ResultDepthStart], ResultValue[resultOffset+FramebufferStride+ResultDepthStart]);
+ uvec2 attr = uvec2(ResultValue[resultOffset+ResultAttrStart], ResultValue[resultOffset+FramebufferStride+ResultAttrStart]);
+
+#ifdef EdgeMarking
+ if ((attr.x & 0xFU) != 0U)
+ {
+ uvec4 otherAttr = uvec4(ClearAttr);
+ uvec4 otherDepth = uvec4(ClearDepth);
+
+ if (srcX > 0U)
+ {
+ otherAttr.x = ResultValue[resultOffset-1+ResultAttrStart];
+ otherDepth.x = ResultValue[resultOffset-1+ResultDepthStart];
+ }
+ if (srcX < ScreenWidth-1)
+ {
+ otherAttr.y = ResultValue[resultOffset+1+ResultAttrStart];
+ otherDepth.y = ResultValue[resultOffset+1+ResultDepthStart];
+ }
+ if (gl_GlobalInvocationID.y > 0U)
+ {
+ otherAttr.z = ResultValue[resultOffset-ScreenWidth+ResultAttrStart];
+ otherDepth.z = ResultValue[resultOffset-ScreenWidth+ResultDepthStart];
+ }
+ if (gl_GlobalInvocationID.y < ScreenHeight-1)
+ {
+ otherAttr.w = ResultValue[resultOffset+ScreenWidth+ResultAttrStart];
+ otherDepth.w = ResultValue[resultOffset+ScreenWidth+ResultDepthStart];
+ }
+
+ uint polyId = bitfieldExtract(attr.x, 24, 6);
+ uvec4 otherPolyId = bitfieldExtract(otherAttr, 24, 6);
+
+ bvec4 polyIdMismatch = notEqual(uvec4(polyId), otherPolyId);
+ bvec4 nearer = lessThan(uvec4(depth.x), otherDepth);
+
+ if ((polyIdMismatch.x && nearer.x)
+ || (polyIdMismatch.y && nearer.y)
+ || (polyIdMismatch.z && nearer.z)
+ || (polyIdMismatch.w && nearer.w))
+ {
+ color.x = ToonTable[polyId >> 3].b | (color.x & 0xFF000000U);
+ attr.x = (attr.x & 0xFFFFE0FFU) | 0x00001000U;
+ }
+ }
+#endif
+
+#ifdef Fog
+ if ((attr.x & (1U<<15)) != 0U)
+ {
+ color.x = BlendFog(color.x, depth.x);
+ }
+
+ if ((attr.x & 0xFU) != 0 && (attr.y & (1U<<15)) != 0U)
+ {
+ color.y = BlendFog(color.y, depth.y);
+ }
+#endif
+
+#ifdef AntiAliasing
+ // resolve anti-aliasing
+ if ((attr.x & 0x3U) != 0)
+ {
+ uint coverage = (attr.x >> 8) & 0x1FU;
+
+ if (coverage != 0)
+ {
+ uint topRB = color.x & 0x3F003FU;
+ uint topG = color.x & 0x003F00U;
+ uint topA = bitfieldExtract(color.x, 24, 5);
+
+ uint botRB = color.y & 0x3F003FU;
+ uint botG = color.y & 0x003F00U;
+ uint botA = bitfieldExtract(color.y, 24, 5);
+
+ coverage++;
+
+ if (botA > 0)
+ {
+ topRB = ((topRB * coverage) + (botRB * (32-coverage))) >> 5;
+ topG = ((topG * coverage) + (botG * (32-coverage))) >> 5;
+
+ topRB &= 0x3F003FU;
+ topG &= 0x003F00U;
+ }
+
+ topA = ((topA * coverage) + (botA * (32-coverage))) >> 5;
+
+ color.x = topRB | topG | (topA << 24);
+ }
+ else
+ {
+ color.x = color.y;
+ }
+ }
+#endif
+
+// if (bitfieldExtract(color.x, 24, 8) != 0U)
+// color.x |= 0x40000000U;
+// else
+// color.x = 0U;
+
+ //if ((gl_GlobalInvocationID.y % 8) == 7 || (gl_GlobalInvocationID.y % 8) == 7)
+ // color.x = 0x1F00001FU | 0x40000000U;
+
+ vec4 result = vec4(bitfieldExtract(color.x, 16, 8), bitfieldExtract(color.x, 8, 8), color.x & 0x3FU, bitfieldExtract(color.x, 24, 8));
+ result /= vec4(63.0, 63.0, 63.0, 31.0);
+ imageStore(FinalFB, ivec2(gl_GlobalInvocationID.xy), result);
+
+ // It's a division by constant, so using the builtin division is fine
+ const int scale = ScreenWidth/256;
+ ivec2 lowresCoordinate = ivec2(gl_GlobalInvocationID.xy) / scale;
+ ivec2 lowresCoordinateRest = ivec2(gl_GlobalInvocationID.xy) % scale;
+ if (lowresCoordinateRest == ivec2(0, 0))
+ {
+ uvec4 color8;
+ color8.x = bitfieldExtract(color.x, 0, 8);
+ color8.y = bitfieldExtract(color.x, 8, 8);
+ color8.z = bitfieldExtract(color.x, 16, 8);
+ color8.w = bitfieldExtract(color.x, 24, 8);
+ imageStore(LowResFB, lowresCoordinate, color8);
+ }
+}
+
+)";
+
+}
+
+}
+
+#endif
\ No newline at end of file
diff --git a/src/GPU3D_OpenGL.cpp b/src/GPU3D_OpenGL.cpp
index 3e9ce5b0..3f85db8c 100644
--- a/src/GPU3D_OpenGL.cpp
+++ b/src/GPU3D_OpenGL.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -28,46 +28,32 @@
namespace melonDS
{
-bool GLRenderer::BuildRenderShader(u32 flags, const char* vs, const char* fs)
+bool GLRenderer::BuildRenderShader(u32 flags, const std::string& vs, const std::string& fs)
{
char shadername[32];
snprintf(shadername, sizeof(shadername), "RenderShader%02X", flags);
int headerlen = strlen(kShaderHeader);
- int vslen = strlen(vs);
- int vsclen = strlen(kRenderVSCommon);
- char* vsbuf = new char[headerlen + vsclen + vslen + 1];
- strcpy(&vsbuf[0], kShaderHeader);
- strcpy(&vsbuf[headerlen], kRenderVSCommon);
- strcpy(&vsbuf[headerlen + vsclen], vs);
+ std::string vsbuf;
+ vsbuf += kShaderHeader;
+ vsbuf += kRenderVSCommon;
+ vsbuf += vs;
- int fslen = strlen(fs);
- int fsclen = strlen(kRenderFSCommon);
- char* fsbuf = new char[headerlen + fsclen + fslen + 1];
- strcpy(&fsbuf[0], kShaderHeader);
- strcpy(&fsbuf[headerlen], kRenderFSCommon);
- strcpy(&fsbuf[headerlen + fsclen], fs);
+ std::string fsbuf;
+ fsbuf += kShaderHeader;
+ fsbuf += kRenderFSCommon;
+ fsbuf += fs;
- bool ret = OpenGL::BuildShaderProgram(vsbuf, fsbuf, RenderShader[flags], shadername);
-
- delete[] vsbuf;
- delete[] fsbuf;
+ GLuint prog;
+ bool ret = OpenGL::CompileVertexFragmentProgram(prog,
+ vsbuf, fsbuf,
+ shadername,
+ {{"vPosition", 0}, {"vColor", 1}, {"vTexcoord", 2}, {"vPolygonAttr", 3}},
+ {{"oColor", 0}, {"oAttr", 1}});
if (!ret) return false;
- GLuint prog = RenderShader[flags][2];
-
- glBindAttribLocation(prog, 0, "vPosition");
- glBindAttribLocation(prog, 1, "vColor");
- glBindAttribLocation(prog, 2, "vTexcoord");
- glBindAttribLocation(prog, 3, "vPolygonAttr");
- glBindFragDataLocation(prog, 0, "oColor");
- glBindFragDataLocation(prog, 1, "oAttr");
-
- if (!OpenGL::LinkShaderProgram(RenderShader[flags]))
- return false;
-
GLint uni_id = glGetUniformBlockIndex(prog, "uConfig");
glUniformBlockBinding(prog, uni_id, 0);
@@ -78,13 +64,15 @@ bool GLRenderer::BuildRenderShader(u32 flags, const char* vs, const char* fs)
uni_id = glGetUniformLocation(prog, "TexPalMem");
glUniform1i(uni_id, 1);
+ RenderShader[flags] = prog;
+
return true;
}
 void GLRenderer::UseRenderShader(u32 flags)
 {
 if (CurShaderID == flags) return;
- glUseProgram(RenderShader[flags][2]);
+ glUseProgram(RenderShader[flags]); // RenderShader[flags] now holds the linked program handle itself (BuildRenderShader stores prog there)
 CurShaderID = flags;
 }
@@ -125,21 +113,17 @@ std::unique_ptr GLRenderer::New() noexcept
glDepthRange(0, 1);
glClearDepth(1.0);
-
- if (!OpenGL::BuildShaderProgram(kClearVS, kClearFS, result->ClearShaderPlain, "ClearShader"))
+ if (!OpenGL::CompileVertexFragmentProgram(result->ClearShaderPlain,
+ kClearVS, kClearFS,
+ "ClearShader",
+ {{"vPosition", 0}},
+ {{"oColor", 0}, {"oAttr", 1}}))
return nullptr;
- glBindAttribLocation(result->ClearShaderPlain[2], 0, "vPosition");
- glBindFragDataLocation(result->ClearShaderPlain[2], 0, "oColor");
- glBindFragDataLocation(result->ClearShaderPlain[2], 1, "oAttr");
-
- if (!OpenGL::LinkShaderProgram(result->ClearShaderPlain))
- return nullptr;
-
- result->ClearUniformLoc[0] = glGetUniformLocation(result->ClearShaderPlain[2], "uColor");
- result->ClearUniformLoc[1] = glGetUniformLocation(result->ClearShaderPlain[2], "uDepth");
- result->ClearUniformLoc[2] = glGetUniformLocation(result->ClearShaderPlain[2], "uOpaquePolyID");
- result->ClearUniformLoc[3] = glGetUniformLocation(result->ClearShaderPlain[2], "uFogFlag");
+ result->ClearUniformLoc[0] = glGetUniformLocation(result->ClearShaderPlain, "uColor");
+ result->ClearUniformLoc[1] = glGetUniformLocation(result->ClearShaderPlain, "uDepth");
+ result->ClearUniformLoc[2] = glGetUniformLocation(result->ClearShaderPlain, "uOpaquePolyID");
+ result->ClearUniformLoc[3] = glGetUniformLocation(result->ClearShaderPlain, "uFogFlag");
memset(result->RenderShader, 0, sizeof(RenderShader));
@@ -167,42 +151,35 @@ std::unique_ptr GLRenderer::New() noexcept
if (!result->BuildRenderShader(RenderFlag_ShadowMask | RenderFlag_WBuffer, kRenderVS_W, kRenderFS_WSM))
return nullptr;
- if (!OpenGL::BuildShaderProgram(kFinalPassVS, kFinalPassEdgeFS, result->FinalPassEdgeShader, "FinalPassEdgeShader"))
+ if (!OpenGL::CompileVertexFragmentProgram(result->FinalPassEdgeShader,
+ kFinalPassVS, kFinalPassEdgeFS,
+ "FinalPassEdgeShader",
+ {{"vPosition", 0}},
+ {{"oColor", 0}}))
+ return nullptr;
+ if (!OpenGL::CompileVertexFragmentProgram(result->FinalPassFogShader,
+ kFinalPassVS, kFinalPassFogFS,
+ "FinalPassFogShader",
+ {{"vPosition", 0}},
+ {{"oColor", 0}}))
return nullptr;
- if (!OpenGL::BuildShaderProgram(kFinalPassVS, kFinalPassFogFS, result->FinalPassFogShader, "FinalPassFogShader"))
- return nullptr;
+ GLuint uni_id = glGetUniformBlockIndex(result->FinalPassEdgeShader, "uConfig");
+ glUniformBlockBinding(result->FinalPassEdgeShader, uni_id, 0);
- glBindAttribLocation(result->FinalPassEdgeShader[2], 0, "vPosition");
- glBindFragDataLocation(result->FinalPassEdgeShader[2], 0, "oColor");
-
- if (!OpenGL::LinkShaderProgram(result->FinalPassEdgeShader))
- return nullptr;
-
- GLint uni_id = glGetUniformBlockIndex(result->FinalPassEdgeShader[2], "uConfig");
- glUniformBlockBinding(result->FinalPassEdgeShader[2], uni_id, 0);
-
- glUseProgram(result->FinalPassEdgeShader[2]);
-
- uni_id = glGetUniformLocation(result->FinalPassEdgeShader[2], "DepthBuffer");
+ glUseProgram(result->FinalPassEdgeShader);
+ uni_id = glGetUniformLocation(result->FinalPassEdgeShader, "DepthBuffer");
glUniform1i(uni_id, 0);
- uni_id = glGetUniformLocation(result->FinalPassEdgeShader[2], "AttrBuffer");
+ uni_id = glGetUniformLocation(result->FinalPassEdgeShader, "AttrBuffer");
glUniform1i(uni_id, 1);
- glBindAttribLocation(result->FinalPassFogShader[2], 0, "vPosition");
- glBindFragDataLocation(result->FinalPassFogShader[2], 0, "oColor");
+ uni_id = glGetUniformBlockIndex(result->FinalPassFogShader, "uConfig");
+ glUniformBlockBinding(result->FinalPassFogShader, uni_id, 0);
- if (!OpenGL::LinkShaderProgram(result->FinalPassFogShader))
- return nullptr;
-
- uni_id = glGetUniformBlockIndex(result->FinalPassFogShader[2], "uConfig");
- glUniformBlockBinding(result->FinalPassFogShader[2], uni_id, 0);
-
- glUseProgram(result->FinalPassFogShader[2]);
-
- uni_id = glGetUniformLocation(result->FinalPassFogShader[2], "DepthBuffer");
+ glUseProgram(result->FinalPassFogShader);
+ uni_id = glGetUniformLocation(result->FinalPassFogShader, "DepthBuffer");
glUniform1i(uni_id, 0);
- uni_id = glGetUniformLocation(result->FinalPassFogShader[2], "AttrBuffer");
+ uni_id = glGetUniformLocation(result->FinalPassFogShader, "AttrBuffer");
glUniform1i(uni_id, 1);
@@ -255,29 +232,27 @@ std::unique_ptr GLRenderer::New() noexcept
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, result->IndexBufferID);
glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(IndexBuffer), nullptr, GL_DYNAMIC_DRAW);
- glGenFramebuffers(4, &result->FramebufferID[0]);
- glBindFramebuffer(GL_FRAMEBUFFER, result->FramebufferID[0]);
-
- glGenTextures(8, &result->FramebufferTex[0]);
- result->FrontBuffer = 0;
+ glGenFramebuffers(1, &result->MainFramebuffer);
+ glGenFramebuffers(1, &result->DownscaleFramebuffer);
// color buffers
- SetupDefaultTexParams(result->FramebufferTex[0]);
- SetupDefaultTexParams(result->FramebufferTex[1]);
+ glGenTextures(1, &result->ColorBufferTex);
+ SetupDefaultTexParams(result->ColorBufferTex);
// depth/stencil buffer
- SetupDefaultTexParams(result->FramebufferTex[4]);
- SetupDefaultTexParams(result->FramebufferTex[6]);
+ glGenTextures(1, &result->DepthBufferTex);
+ SetupDefaultTexParams(result->DepthBufferTex);
// attribute buffer
// R: opaque polyID (for edgemarking)
// G: edge flag
// B: fog flag
- SetupDefaultTexParams(result->FramebufferTex[5]);
- SetupDefaultTexParams(result->FramebufferTex[7]);
+ glGenTextures(1, &result->AttrBufferTex);
+ SetupDefaultTexParams(result->AttrBufferTex);
// downscale framebuffer for display capture (always 256x192)
- SetupDefaultTexParams(result->FramebufferTex[3]);
+ glGenTextures(1, &result->DownScaleBufferTex);
+ SetupDefaultTexParams(result->DownScaleBufferTex);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, 256, 192, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
glEnable(GL_BLEND);
@@ -315,8 +290,12 @@ GLRenderer::~GLRenderer()
glDeleteTextures(1, &TexMemID);
glDeleteTextures(1, &TexPalMemID);
- glDeleteFramebuffers(4, &FramebufferID[0]);
- glDeleteTextures(8, &FramebufferTex[0]);
+ glDeleteFramebuffers(1, &MainFramebuffer);
+ glDeleteFramebuffers(1, &DownscaleFramebuffer);
+ glDeleteTextures(1, &ColorBufferTex);
+ glDeleteTextures(1, &DepthBufferTex);
+ glDeleteTextures(1, &AttrBufferTex);
+ glDeleteTextures(1, &DownScaleBufferTex);
glDeleteVertexArrays(1, &VertexArrayID);
glDeleteBuffers(1, &VertexBufferID);
@@ -327,8 +306,8 @@ GLRenderer::~GLRenderer()
for (int i = 0; i < 16; i++)
{
- if (!RenderShader[i][2]) continue;
- OpenGL::DeleteShaderProgram(RenderShader[i]);
+ if (!RenderShader[i]) continue;
+ glDeleteProgram(RenderShader[i]);
}
}
@@ -361,40 +340,25 @@ void GLRenderer::SetRenderSettings(bool betterpolygons, int scale) noexcept
ScreenW = 256 * scale;
ScreenH = 192 * scale;
- glBindTexture(GL_TEXTURE_2D, FramebufferTex[0]);
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, ScreenW, ScreenH, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
- glBindTexture(GL_TEXTURE_2D, FramebufferTex[1]);
+ glBindTexture(GL_TEXTURE_2D, ColorBufferTex);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, ScreenW, ScreenH, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
- glBindTexture(GL_TEXTURE_2D, FramebufferTex[4]);
+ glBindTexture(GL_TEXTURE_2D, DepthBufferTex);
glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH24_STENCIL8, ScreenW, ScreenH, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, NULL);
- glBindTexture(GL_TEXTURE_2D, FramebufferTex[5]);
+ glBindTexture(GL_TEXTURE_2D, AttrBufferTex);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, ScreenW, ScreenH, 0, GL_RGB, GL_UNSIGNED_BYTE, NULL);
- glBindTexture(GL_TEXTURE_2D, FramebufferTex[6]);
- glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH24_STENCIL8, ScreenW, ScreenH, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, NULL);
- glBindTexture(GL_TEXTURE_2D, FramebufferTex[7]);
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, ScreenW, ScreenH, 0, GL_RGB, GL_UNSIGNED_BYTE, NULL);
-
- glBindFramebuffer(GL_FRAMEBUFFER, FramebufferID[3]);
- glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, FramebufferTex[3], 0);
+ glBindFramebuffer(GL_FRAMEBUFFER, DownscaleFramebuffer);
+ glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, DownScaleBufferTex, 0);
GLenum fbassign[2] = {GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1};
- glBindFramebuffer(GL_FRAMEBUFFER, FramebufferID[0]);
- glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, FramebufferTex[0], 0);
- glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, FramebufferTex[4], 0);
- glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT1, FramebufferTex[5], 0);
+ glBindFramebuffer(GL_FRAMEBUFFER, MainFramebuffer);
+ glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, ColorBufferTex, 0);
+ glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, DepthBufferTex, 0);
+ glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT1, AttrBufferTex, 0);
glDrawBuffers(2, fbassign);
- glBindFramebuffer(GL_FRAMEBUFFER, FramebufferID[1]);
- glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, FramebufferTex[1], 0);
- glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, FramebufferTex[6], 0);
- glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT1, FramebufferTex[7], 0);
- glDrawBuffers(2, fbassign);
-
- glBindFramebuffer(GL_FRAMEBUFFER, FramebufferID[0]);
-
glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelbufferID);
glBufferData(GL_PIXEL_PACK_BUFFER, 256*192*4, NULL, GL_DYNAMIC_READ);
@@ -1103,9 +1067,9 @@ void GLRenderer::RenderSceneChunk(const GPU3D& gpu3d, int y, int h)
glStencilMask(0);
glActiveTexture(GL_TEXTURE0);
- glBindTexture(GL_TEXTURE_2D, FramebufferTex[FrontBuffer ? 6 : 4]);
+ glBindTexture(GL_TEXTURE_2D, DepthBufferTex);
glActiveTexture(GL_TEXTURE1);
- glBindTexture(GL_TEXTURE_2D, FramebufferTex[FrontBuffer ? 7 : 5]);
+ glBindTexture(GL_TEXTURE_2D, AttrBufferTex);
glBindBuffer(GL_ARRAY_BUFFER, ClearVertexBufferID);
glBindVertexArray(ClearVertexArrayID);
@@ -1115,7 +1079,7 @@ void GLRenderer::RenderSceneChunk(const GPU3D& gpu3d, int y, int h)
// edge marking
// TODO: depth/polyid values at screen edges
- glUseProgram(FinalPassEdgeShader[2]);
+ glUseProgram(FinalPassEdgeShader);
glBlendFuncSeparate(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ZERO, GL_ONE);
@@ -1126,7 +1090,7 @@ void GLRenderer::RenderSceneChunk(const GPU3D& gpu3d, int y, int h)
{
// fog
- glUseProgram(FinalPassFogShader[2]);
+ glUseProgram(FinalPassFogShader);
if (gpu3d.RenderDispCnt & (1<<6))
glBlendFuncSeparate(GL_ZERO, GL_ONE, GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_ALPHA);
@@ -1154,7 +1118,7 @@ void GLRenderer::RenderFrame(GPU& gpu)
CurShaderID = -1;
glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
- glBindFramebuffer(GL_DRAW_FRAMEBUFFER, FramebufferID[FrontBuffer]);
+ glBindFramebuffer(GL_DRAW_FRAMEBUFFER, MainFramebuffer);
ShaderConfig.uScreenSize[0] = ScreenW;
ShaderConfig.uScreenSize[1] = ScreenH;
@@ -1260,7 +1224,7 @@ void GLRenderer::RenderFrame(GPU& gpu)
// TODO: check whether 'clear polygon ID' affects translucent polyID
// (for example when alpha is 1..30)
{
- glUseProgram(ClearShaderPlain[2]);
+ glUseProgram(ClearShaderPlain);
glDepthFunc(GL_ALWAYS);
u32 r = gpu.GPU3D.RenderClearAttr1 & 0x1F;
@@ -1320,8 +1284,6 @@ void GLRenderer::RenderFrame(GPU& gpu)
RenderSceneChunk(gpu.GPU3D, 0, 192);
}
-
- FrontBuffer = FrontBuffer ? 0 : 1;
}
void GLRenderer::Stop(const GPU& gpu)
@@ -1331,16 +1293,14 @@ void GLRenderer::Stop(const GPU& gpu)
void GLRenderer::PrepareCaptureFrame()
{
- // TODO: make sure this picks the right buffer when doing antialiasing
- int original_fb = FrontBuffer^1;
-
- glBindFramebuffer(GL_READ_FRAMEBUFFER, FramebufferID[original_fb]);
+ glBindFramebuffer(GL_READ_FRAMEBUFFER, MainFramebuffer);
glReadBuffer(GL_COLOR_ATTACHMENT0);
- glBindFramebuffer(GL_DRAW_FRAMEBUFFER, FramebufferID[3]);
+ glBindFramebuffer(GL_DRAW_FRAMEBUFFER, DownscaleFramebuffer);
glDrawBuffer(GL_COLOR_ATTACHMENT0);
glBlitFramebuffer(0, 0, ScreenW, ScreenH, 0, 0, 256, 192, GL_COLOR_BUFFER_BIT, GL_NEAREST);
- glBindFramebuffer(GL_READ_FRAMEBUFFER, FramebufferID[3]);
+ glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelbufferID);
+ glBindFramebuffer(GL_READ_FRAMEBUFFER, DownscaleFramebuffer);
glReadPixels(0, 0, 256, 192, GL_BGRA, GL_UNSIGNED_BYTE, NULL);
}
@@ -1349,12 +1309,18 @@ void GLRenderer::Blit(const GPU& gpu)
CurGLCompositor.RenderFrame(gpu, *this);
}
+void GLRenderer::BindOutputTexture(int buffer) // forwards to the compositor owned by this renderer
+{
+ CurGLCompositor.BindOutputTexture(buffer); // `buffer` is passed through unchanged
+}
+
u32* GLRenderer::GetLine(int line)
{
int stride = 256;
if (line == 0)
{
+ glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelbufferID);
u8* data = (u8*)glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY);
if (data) memcpy(&Framebuffer[stride*0], data, 4*stride*192);
glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
@@ -1374,7 +1340,7 @@ u32* GLRenderer::GetLine(int line)
void GLRenderer::SetupAccelFrame()
{
- glBindTexture(GL_TEXTURE_2D, FramebufferTex[FrontBuffer]);
+ glBindTexture(GL_TEXTURE_2D, ColorBufferTex);
}
}
diff --git a/src/GPU3D_OpenGL.h b/src/GPU3D_OpenGL.h
index c30232ca..d69af324 100644
--- a/src/GPU3D_OpenGL.h
+++ b/src/GPU3D_OpenGL.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -44,12 +44,11 @@ public:
void Stop(const GPU& gpu) override;
u32* GetLine(int line) override;
- void SetupAccelFrame();
+ void SetupAccelFrame() override;
void PrepareCaptureFrame() override;
void Blit(const GPU& gpu) override;
- [[nodiscard]] const GLCompositor& GetCompositor() const noexcept { return CurGLCompositor; }
- GLCompositor& GetCompositor() noexcept { return CurGLCompositor; }
+ void BindOutputTexture(int buffer) override;
static std::unique_ptr New() noexcept;
private:
@@ -77,7 +76,7 @@ private:
GLCompositor CurGLCompositor;
RendererPolygon PolygonList[2048] {};
- bool BuildRenderShader(u32 flags, const char* vs, const char* fs);
+ bool BuildRenderShader(u32 flags, const std::string& vs, const std::string& fs);
void UseRenderShader(u32 flags);
void SetupPolygon(RendererPolygon* rp, Polygon* polygon) const;
u32* SetupVertex(const Polygon* poly, int vid, const Vertex* vtx, u32 vtxattr, u32* vptr) const;
@@ -96,13 +95,13 @@ private:
};
- GLuint ClearShaderPlain[3] {};
+ GLuint ClearShaderPlain {};
- GLuint RenderShader[16][3] {};
+ GLuint RenderShader[16] {};
GLuint CurShaderID = -1;
- GLuint FinalPassEdgeShader[3] {};
- GLuint FinalPassFogShader[3] {};
+ GLuint FinalPassEdgeShader {};
+ GLuint FinalPassFogShader {};
// std140 compliant structure
struct
@@ -155,12 +154,12 @@ private:
bool BetterPolygons {};
int ScreenW {}, ScreenH {};
- GLuint FramebufferTex[8] {};
- int FrontBuffer {};
- GLuint FramebufferID[4] {}, PixelbufferID {};
+ GLuint ColorBufferTex {}, DepthBufferTex {}, AttrBufferTex {};
+ GLuint DownScaleBufferTex {};
+ GLuint PixelbufferID {};
+
+ GLuint MainFramebuffer {}, DownscaleFramebuffer {};
u32 Framebuffer[256*192] {};
-
-
};
}
#endif
\ No newline at end of file
diff --git a/src/GPU3D_OpenGL_shaders.h b/src/GPU3D_OpenGL_shaders.h
index 13492b7f..03bd43f9 100644
--- a/src/GPU3D_OpenGL_shaders.h
+++ b/src/GPU3D_OpenGL_shaders.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp
index 74027d5b..a9d0bd64 100644
--- a/src/GPU3D_Soft.cpp
+++ b/src/GPU3D_Soft.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -95,8 +95,8 @@ void SoftRenderer::EnableRenderThread()
}
}
-SoftRenderer::SoftRenderer(bool threaded) noexcept
- : Renderer3D(false), Threaded(threaded)
+SoftRenderer::SoftRenderer() noexcept
+ : Renderer3D(false)
{
Sema_RenderStart = Platform::Semaphore_Create();
Sema_RenderDone = Platform::Semaphore_Create();
@@ -193,10 +193,10 @@ void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s
case 1: // A3I5
{
vramaddr += ((t * width) + s);
- u8 pixel = ReadVRAM_Texture(vramaddr, gpu);
+ u8 pixel = gpu.ReadVRAMFlat_Texture(vramaddr);
texpal <<= 4;
- *color = ReadVRAM_TexPal(texpal + ((pixel&0x1F)<<1), gpu);
+ *color = gpu.ReadVRAMFlat_TexPal(texpal + ((pixel&0x1F)<<1));
*alpha = ((pixel >> 3) & 0x1C) + (pixel >> 6);
}
break;
@@ -204,12 +204,12 @@ void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s
case 2: // 4-color
{
vramaddr += (((t * width) + s) >> 2);
- u8 pixel = ReadVRAM_Texture(vramaddr, gpu);
+ u8 pixel = gpu.ReadVRAMFlat_Texture(vramaddr);
pixel >>= ((s & 0x3) << 1);
pixel &= 0x3;
texpal <<= 3;
- *color = ReadVRAM_TexPal(texpal + (pixel<<1), gpu);
+ *color = gpu.ReadVRAMFlat_TexPal(texpal + (pixel<<1));
*alpha = (pixel==0) ? alpha0 : 31;
}
break;
@@ -217,12 +217,12 @@ void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s
case 3: // 16-color
{
vramaddr += (((t * width) + s) >> 1);
- u8 pixel = ReadVRAM_Texture(vramaddr, gpu);
+ u8 pixel = gpu.ReadVRAMFlat_Texture(vramaddr);
if (s & 0x1) pixel >>= 4;
else pixel &= 0xF;
texpal <<= 4;
- *color = ReadVRAM_TexPal(texpal + (pixel<<1), gpu);
+ *color = gpu.ReadVRAMFlat_TexPal(texpal + (pixel<<1));
*alpha = (pixel==0) ? alpha0 : 31;
}
break;
@@ -230,10 +230,10 @@ void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s
case 4: // 256-color
{
vramaddr += ((t * width) + s);
- u8 pixel = ReadVRAM_Texture(vramaddr, gpu);
+ u8 pixel = gpu.ReadVRAMFlat_Texture(vramaddr);
texpal <<= 4;
- *color = ReadVRAM_TexPal(texpal + (pixel<<1), gpu);
+ *color = gpu.ReadVRAMFlat_TexPal(texpal + (pixel<<1));
*alpha = (pixel==0) ? alpha0 : 31;
}
break;
@@ -242,35 +242,42 @@ void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s
{
vramaddr += ((t & 0x3FC) * (width>>2)) + (s & 0x3FC);
vramaddr += (t & 0x3);
+ vramaddr &= 0x7FFFF; // address used for all calcs wraps around after slot 3
u32 slot1addr = 0x20000 + ((vramaddr & 0x1FFFC) >> 1);
if (vramaddr >= 0x40000)
slot1addr += 0x10000;
- u8 val = ReadVRAM_Texture(vramaddr, gpu);
- val >>= (2 * (s & 0x3));
+ u8 val;
+ if (vramaddr >= 0x20000 && vramaddr < 0x40000) // reading slot 1 for texels should always read 0
+ val = 0;
+ else
+ {
+ val = gpu.ReadVRAMFlat_Texture(vramaddr);
+ val >>= (2 * (s & 0x3));
+ }
- u16 palinfo = ReadVRAM_Texture(slot1addr, gpu);
+ u16 palinfo = gpu.ReadVRAMFlat_Texture(slot1addr);
u32 paloffset = (palinfo & 0x3FFF) << 2;
texpal <<= 4;
switch (val & 0x3)
{
case 0:
- *color = ReadVRAM_TexPal(texpal + paloffset, gpu);
+ *color = gpu.ReadVRAMFlat_TexPal(texpal + paloffset);
*alpha = 31;
break;
case 1:
- *color = ReadVRAM_TexPal(texpal + paloffset + 2, gpu);
+ *color = gpu.ReadVRAMFlat_TexPal(texpal + paloffset + 2);
*alpha = 31;
break;
case 2:
if ((palinfo >> 14) == 1)
{
- u16 color0 = ReadVRAM_TexPal(texpal + paloffset, gpu);
- u16 color1 = ReadVRAM_TexPal(texpal + paloffset + 2, gpu);
+ u16 color0 = gpu.ReadVRAMFlat_TexPal(texpal + paloffset);
+ u16 color1 = gpu.ReadVRAMFlat_TexPal(texpal + paloffset + 2);
u32 r0 = color0 & 0x001F;
u32 g0 = color0 & 0x03E0;
@@ -287,8 +294,8 @@ void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s
}
else if ((palinfo >> 14) == 3)
{
- u16 color0 = ReadVRAM_TexPal(texpal + paloffset, gpu);
- u16 color1 = ReadVRAM_TexPal(texpal + paloffset + 2, gpu);
+ u16 color0 = gpu.ReadVRAMFlat_TexPal(texpal + paloffset);
+ u16 color1 = gpu.ReadVRAMFlat_TexPal(texpal + paloffset + 2);
u32 r0 = color0 & 0x001F;
u32 g0 = color0 & 0x03E0;
@@ -304,20 +311,20 @@ void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s
*color = r | g | b;
}
else
- *color = ReadVRAM_TexPal(texpal + paloffset + 4, gpu);
+ *color = gpu.ReadVRAMFlat_TexPal(texpal + paloffset + 4);
*alpha = 31;
break;
case 3:
if ((palinfo >> 14) == 2)
{
- *color = ReadVRAM_TexPal(texpal + paloffset + 6, gpu);
+ *color = gpu.ReadVRAMFlat_TexPal(texpal + paloffset + 6);
*alpha = 31;
}
else if ((palinfo >> 14) == 3)
{
- u16 color0 = ReadVRAM_TexPal(texpal + paloffset, gpu);
- u16 color1 = ReadVRAM_TexPal(texpal + paloffset + 2, gpu);
+ u16 color0 = gpu.ReadVRAMFlat_TexPal(texpal + paloffset);
+ u16 color1 = gpu.ReadVRAMFlat_TexPal(texpal + paloffset + 2);
u32 r0 = color0 & 0x001F;
u32 g0 = color0 & 0x03E0;
@@ -346,10 +353,10 @@ void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s
case 6: // A5I3
{
vramaddr += ((t * width) + s);
- u8 pixel = ReadVRAM_Texture(vramaddr, gpu);
+ u8 pixel = gpu.ReadVRAMFlat_Texture(vramaddr);
texpal <<= 4;
- *color = ReadVRAM_TexPal(texpal + ((pixel&0x7)<<1), gpu);
+ *color = gpu.ReadVRAMFlat_TexPal(texpal + ((pixel&0x7)<<1));
*alpha = (pixel >> 3);
}
break;
@@ -357,7 +364,7 @@ void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s
case 7: // direct color
{
vramaddr += (((t * width) + s) << 1);
- *color = ReadVRAM_Texture(vramaddr, gpu);
+ *color = gpu.ReadVRAMFlat_Texture(vramaddr);
*alpha = (*color & 0x8000) ? 31 : 0;
}
break;
@@ -1652,8 +1659,8 @@ void SoftRenderer::ClearBuffers(const GPU& gpu)
{
for (int x = 0; x < 256; x++)
{
- u16 val2 = ReadVRAM_Texture(0x40000 + (yoff << 9) + (xoff << 1), gpu);
- u16 val3 = ReadVRAM_Texture(0x60000 + (yoff << 9) + (xoff << 1), gpu);
+ u16 val2 = gpu.ReadVRAMFlat_Texture(0x40000 + (yoff << 9) + (xoff << 1));
+ u16 val3 = gpu.ReadVRAMFlat_Texture(0x60000 + (yoff << 9) + (xoff << 1));
// TODO: confirm color conversion
u32 r = (val2 << 1) & 0x3E; if (r) r++;
diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h
index 9cfdf9ad..73d02e4f 100644
--- a/src/GPU3D_Soft.h
+++ b/src/GPU3D_Soft.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -29,7 +29,7 @@ namespace melonDS
class SoftRenderer : public Renderer3D
{
public:
- SoftRenderer(bool threaded = false) noexcept;
+ SoftRenderer() noexcept;
~SoftRenderer() override;
void Reset(GPU& gpu) override;
@@ -430,16 +430,6 @@ private:
s32 ycoverage, ycov_incr;
};
- template
- inline T ReadVRAM_Texture(u32 addr, const GPU& gpu) const
- {
- return *(T*)&gpu.VRAMFlat_Texture[addr & 0x7FFFF];
- }
- template
- inline T ReadVRAM_TexPal(u32 addr, const GPU& gpu) const
- {
- return *(T*)&gpu.VRAMFlat_TexPal[addr & 0x1FFFF];
- }
u32 AlphaBlend(const GPU3D& gpu3d, u32 srccolor, u32 dstcolor, u32 alpha) const noexcept;
struct RendererPolygon
@@ -504,7 +494,7 @@ private:
// threading
- bool Threaded;
+ bool Threaded = false;
Platform::Thread* RenderThread;
std::atomic_bool RenderThreadRunning;
std::atomic_bool RenderThreadRendering;
diff --git a/src/GPU3D_Texcache.cpp b/src/GPU3D_Texcache.cpp
new file mode 100644
index 00000000..a6a40a04
--- /dev/null
+++ b/src/GPU3D_Texcache.cpp
@@ -0,0 +1,270 @@
+#include "GPU3D_Texcache.h"
+
+namespace melonDS
+{
+
+// Averages two RGB555 colors channel by channel, rounding each channel down.
+// Red is bits 0..4, green bits 5..9, blue bits 10..14.
+// Bit 15 of the inputs is not carried into the result.
+inline u16 ColorAvg(u16 color0, u16 color1)
+{
+    const u32 first = color0, second = color1;
+
+    // Channels are summed in place; green and blue are re-masked after the
+    // shift so the sum's lowest bit cannot leak into the channel below.
+    const u32 red = ((first & 0x001F) + (second & 0x001F)) >> 1;
+    const u32 green = (((first & 0x03E0) + (second & 0x03E0)) >> 1) & 0x03E0;
+    const u32 blue = (((first & 0x7C00) + (second & 0x7C00)) >> 1) & 0x7C00;
+
+    return red | green | blue;
+}
+
+// Blends two RGB555 colors channel by channel as (5*color0 + 3*color1) / 8,
+// rounding down. Red is bits 0..4, green bits 5..9, blue bits 10..14.
+// Bit 15 of the inputs is not carried into the result.
+inline u16 Color5of3(u16 color0, u16 color1)
+{
+    const u32 major = color0, minor = color1;
+
+    // Weighted sums are formed in place; green and blue are re-masked after
+    // the shift so low bits of the sum cannot leak into the channel below.
+    const u32 red = ((major & 0x001F) * 5 + (minor & 0x001F) * 3) >> 3;
+    const u32 green = (((major & 0x03E0) * 5 + (minor & 0x03E0) * 3) >> 3) & 0x03E0;
+    const u32 blue = (((major & 0x7C00) * 5 + (minor & 0x7C00) * 3) >> 3) & 0x7C00;
+
+    return red | green | blue;
+}
+
+// Blends two RGB555 colors channel by channel as (3*color0 + 5*color1) / 8,
+// rounding down. Red is bits 0..4, green bits 5..9, blue bits 10..14.
+// Bit 15 of the inputs is not carried into the result.
+inline u16 Color3of5(u16 color0, u16 color1)
+{
+    const u32 minor = color0, major = color1;
+
+    // Weighted sums are formed in place; green and blue are re-masked after
+    // the shift so low bits of the sum cannot leak into the channel below.
+    const u32 red = ((minor & 0x001F) * 3 + (major & 0x001F) * 5) >> 3;
+    const u32 green = (((minor & 0x03E0) * 3 + (major & 0x03E0) * 5) >> 3) & 0x03E0;
+    const u32 blue = (((minor & 0x7C00) * 3 + (major & 0x7C00) * 5) >> 3) & 0x7C00;
+
+    return red | green | blue;
+}
+
+inline u32 ConvertRGB5ToRGB8(u16 val)
+{
+    // Each 5-bit channel moves to the top five bits of its own byte: R byte 0, G byte 1, B byte 2.
+    const u32 rgb5 = val;
+    return ((rgb5 & 0x7C00) << 9) | ((rgb5 & 0x3E0) << 6) | ((rgb5 & 0x1F) << 3);
+}
+inline u32 ConvertRGB5ToBGR8(u16 val)
+{
+    // Each 5-bit channel moves to the top five bits of its own byte: B byte 0, G byte 1, R byte 2.
+    const u32 rgb5 = val;
+    return ((rgb5 & 0x1F) << 19) | ((rgb5 & 0x3E0) << 6) | ((rgb5 & 0x7C00) >> 7);
+}
+inline u32 ConvertRGB5ToRGB6(u16 val)
+{
+    // Each 5-bit channel c becomes 2*c + 1, except that 0 stays 0.
+    const u8 red = (val & 0x1F) << 1;
+    const u8 green = (val & 0x3E0) >> 4;
+    const u8 blue = (val & 0x7C00) >> 9;
+    const u32 r6 = red ? red + 1 : 0, g6 = green ? green + 1 : 0, b6 = blue ? blue + 1 : 0;
+    // One 6-bit channel per output byte: R in byte 0, G in byte 1, B in byte 2.
+    return r6 | (g6 << 8) | (b6 << 16);
+}
+
+template
+void ConvertBitmapTexture(u32 width, u32 height, u32* output, u32 addr, GPU& gpu) // decodes width*height 16-bit texels starting at addr into output[]
+{
+ for (u32 i = 0; i < width*height; i++)
+ {
+ u16 value = gpu.ReadVRAMFlat_Texture(addr + i * 2); // texels are stored 2 bytes apart
+ // bit 15 of the texel selects full alpha (set) or zero alpha (clear) in every output format
+ switch (outputFmt) // presumably the template parameter; its declaration is not visible in this view
+ {
+ case outputFmt_RGB6A5:
+ output[i] = ConvertRGB5ToRGB6(value) | (value & 0x8000 ? 0x1F000000 : 0);
+ break;
+ case outputFmt_RGBA8:
+ output[i] = ConvertRGB5ToRGB8(value) | (value & 0x8000 ? 0xFF000000 : 0);
+ break;
+ case outputFmt_BGRA8:
+ output[i] = ConvertRGB5ToBGR8(value) | (value & 0x8000 ? 0xFF000000 : 0);
+ break;
+ }
+ }
+}
+
+template void ConvertBitmapTexture(u32 width, u32 height, u32* output, u32 addr, GPU& gpu);
+
+template
+void ConvertCompressedTexture(u32 width, u32 height, u32* output, u32 addr, u32 addrAux, u32 palAddr, GPU& gpu)
+{
+ // decode one 4x4 texel block at a time: 4 bytes of indices at addr, 2 bytes of palette info at addrAux
+ for (int y = 0; y < height / 4; y++)
+ {
+ for (int x = 0; x < width / 4; x++)
+ {
+ u32 data = gpu.ReadVRAMFlat_Texture(addr + (x + y * (width / 4))*4); // 16 texels, 2 bits each
+ u16 auxData = gpu.ReadVRAMFlat_Texture(addrAux + (x + y * (width / 4))*2); // bits 0..13: palette offset, bits 14..15: mode
+ // bit 15 is set on every palette color so that it decodes as opaque below
+ u32 paletteOffset = palAddr + (auxData & 0x3FFF) * 4;
+ u16 color0 = gpu.ReadVRAMFlat_TexPal(paletteOffset) | 0x8000;
+ u16 color1 = gpu.ReadVRAMFlat_TexPal(paletteOffset+2) | 0x8000;
+ u16 color2 = gpu.ReadVRAMFlat_TexPal(paletteOffset+4) | 0x8000;
+ u16 color3 = gpu.ReadVRAMFlat_TexPal(paletteOffset+6) | 0x8000;
+
+ switch ((auxData >> 14) & 0x3)
+ {
+ case 0: // three palette colors, index 3 is transparent
+ color3 = 0;
+ break;
+ case 1: // color2 = average of color0 and color1, index 3 is transparent
+ {
+ u32 r0 = color0 & 0x001F;
+ u32 g0 = color0 & 0x03E0;
+ u32 b0 = color0 & 0x7C00;
+ u32 r1 = color1 & 0x001F;
+ u32 g1 = color1 & 0x03E0;
+ u32 b1 = color1 & 0x7C00;
+
+ u32 r = (r0 + r1) >> 1;
+ u32 g = ((g0 + g1) >> 1) & 0x03E0;
+ u32 b = ((b0 + b1) >> 1) & 0x7C00;
+ color2 = r | g | b | 0x8000;
+ }
+ color3 = 0;
+ break;
+ case 2: // all four palette colors are used as read
+ break;
+ case 3: // color2 and color3 are 5:3 and 3:5 blends of color0 and color1
+ {
+ u32 r0 = color0 & 0x001F;
+ u32 g0 = color0 & 0x03E0;
+ u32 b0 = color0 & 0x7C00;
+ u32 r1 = color1 & 0x001F;
+ u32 g1 = color1 & 0x03E0;
+ u32 b1 = color1 & 0x7C00;
+
+ u32 r = (r0*5 + r1*3) >> 3;
+ u32 g = ((g0*5 + g1*3) >> 3) & 0x03E0;
+ u32 b = ((b0*5 + b1*3) >> 3) & 0x7C00;
+
+ color2 = r | g | b | 0x8000;
+ }
+ {
+ u32 r0 = color0 & 0x001F;
+ u32 g0 = color0 & 0x03E0;
+ u32 b0 = color0 & 0x7C00;
+ u32 r1 = color1 & 0x001F;
+ u32 g1 = color1 & 0x03E0;
+ u32 b1 = color1 & 0x7C00;
+
+ u32 r = (r0*3 + r1*5) >> 3;
+ u32 g = ((g0*3 + g1*5) >> 3) & 0x03E0;
+ u32 b = ((b0*3 + b1*5) >> 3) & 0x7C00;
+
+ color3 = r | g | b | 0x8000;
+ }
+ break;
+ }
+
+ // pack the four 16-bit colors into one u64 so a texel index selects its color with a shift
+ u64 packed = color0 | ((u64)color1 << 16) | ((u64)color2 << 32) | ((u64)color3 << 48);
+
+ for (int j = 0; j < 4; j++)
+ {
+ for (int i = 0; i < 4; i++)
+ {
+ u32 colorIdx = 16 * ((data >> 2 * (i + j * 4)) & 0x3); // bit offset of this texel's color inside `packed`
+ u16 color = (packed >> colorIdx) & 0xFFFF;
+ u32 res;
+ switch (outputFmt) // presumably the template parameter; its declaration is not visible in this view
+ {
+ case outputFmt_RGB6A5: res = ConvertRGB5ToRGB6(color)
+ | ((color & 0x8000) ? 0x1F000000 : 0); break;
+ case outputFmt_RGBA8: res = ConvertRGB5ToRGB8(color)
+ | ((color & 0x8000) ? 0xFF000000 : 0); break;
+ case outputFmt_BGRA8: res = ConvertRGB5ToBGR8(color)
+ | ((color & 0x8000) ? 0xFF000000 : 0); break;
+ }
+ output[x * 4 + i + (y * 4 + j) * width] = res; // block (x, y), texel (i, j), row-major output
+ }
+ }
+ }
+ }
+}
+
+template void ConvertCompressedTexture(u32, u32, u32*, u32, u32, u32, GPU&);
+
+template
+void ConvertAXIYTexture(u32 width, u32 height, u32* output, u32 addr, u32 palAddr, GPU& gpu) // one byte per texel: low Y bits index the palette, next X bits are alpha
+{
+ for (int y = 0; y < height; y++)
+ {
+ for (int x = 0; x < width; x++)
+ {
+ u8 val = gpu.ReadVRAMFlat_Texture(addr + x + y * width);
+
+ u32 idx = val & ((1 << Y) - 1); // X and Y are presumably template parameters; their declaration is not visible in this view
+
+ u16 color = gpu.ReadVRAMFlat_TexPal(palAddr + idx * 2); // palette entries are 2 bytes each
+ u32 alpha = (val >> Y) & ((1 << X) - 1);
+ if (X != 5) // widen a narrower alpha to 5 bits; maps 0..7 onto 0..31 when X is 3
+ alpha = alpha * 4 + alpha / 2;
+
+ u32 res;
+ switch (outputFmt)
+ {
+ case outputFmt_RGB6A5: res = ConvertRGB5ToRGB6(color) | alpha << 24; break;
+ // 8-bit alpha = the 5 alpha bits followed by their top 3 bits again, so alpha 31 becomes 255
+ case outputFmt_RGBA8: res = ConvertRGB5ToRGB8(color) | (alpha << 27 | (alpha & 0x1C) << 22); break;
+ case outputFmt_BGRA8: res = ConvertRGB5ToBGR8(color) | (alpha << 27 | (alpha & 0x1C) << 22); break;
+ }
+ output[x + y * width] = res;
+ }
+ }
+}
+
+template void ConvertAXIYTexture(u32, u32, u32*, u32, u32, GPU&);
+template void ConvertAXIYTexture(u32, u32, u32*, u32, u32, GPU&);
+
+template
+void ConvertNColorsTexture(u32 width, u32 height, u32* output, u32 addr, u32 palAddr, bool color0Transparent, GPU& gpu)
+{
+ for (int y = 0; y < height; y++)
+ {
+ for (int x = 0; x < width / (16 / colorBits); x++) // x counts 16-bit words; each word holds 16 / colorBits texels
+ {
+ // texels are fetched one 16-bit word at a time; even the narrowest row (8 texels at 2 bits) fills a whole word
+ u16 val = gpu.ReadVRAMFlat_Texture(addr + 2 * (x + y * (width / (16 / colorBits))));
+
+ for (int i = 0; i < 16 / colorBits; i++)
+ {
+ u32 index = val & ((1 << colorBits) - 1); // lowest bits hold the leftmost texel of the word
+ val >>= colorBits;
+ u16 color = gpu.ReadVRAMFlat_TexPal(palAddr + index * 2); // palette entries are 2 bytes each
+
+ bool transparent = color0Transparent && index == 0;
+ u32 res;
+ switch (outputFmt) // presumably a template parameter, like colorBits; the declaration is not visible in this view
+ {
+ case outputFmt_RGB6A5: res = ConvertRGB5ToRGB6(color)
+ | (transparent ? 0 : 0x1F000000); break;
+ case outputFmt_RGBA8: res = ConvertRGB5ToRGB8(color)
+ | (transparent ? 0 : 0xFF000000); break;
+ case outputFmt_BGRA8: res = ConvertRGB5ToBGR8(color)
+ | (transparent ? 0 : 0xFF000000); break;
+ }
+ output[x * (16 / colorBits) + y * width + i] = res;
+ }
+ }
+ }
+}
+
+template void ConvertNColorsTexture(u32, u32, u32*, u32, u32, bool, GPU&);
+template void ConvertNColorsTexture(u32, u32, u32*, u32, u32, bool, GPU&);
+template void ConvertNColorsTexture(u32, u32, u32*, u32, u32, bool, GPU&);
+
+}
\ No newline at end of file
diff --git a/src/GPU3D_Texcache.h b/src/GPU3D_Texcache.h
new file mode 100644
index 00000000..f2cd6416
--- /dev/null
+++ b/src/GPU3D_Texcache.h
@@ -0,0 +1,330 @@
+#ifndef GPU3D_TEXCACHE
+#define GPU3D_TEXCACHE
+
+#include "types.h"
+#include "GPU.h"
+
+#include
+#include
+#include
+
+#define XXH_STATIC_LINKING_ONLY
+#include "xxhash/xxhash.h"
+
+namespace melonDS
+{
+
+inline u32 TextureWidth(u32 texparam) // width in texels: 8 << n, n = bits 20..22 of texparam
+{
+    return 8 << ((texparam & 0x00700000) >> 20);
+}
+
+inline u32 TextureHeight(u32 texparam) // height in texels: 8 << n, n = bits 23..25 of texparam
+{
+    return 8 << ((texparam & 0x03800000) >> 23);
+}
+
+enum
+{
+ outputFmt_RGB6A5,
+ outputFmt_RGBA8,
+ outputFmt_BGRA8
+};
+
+template
+void ConvertBitmapTexture(u32 width, u32 height, u32* output, u32 addr, GPU& gpu);
+template
+void ConvertCompressedTexture(u32 width, u32 height, u32* output, u32 addr, u32 addrAux, u32 palAddr, GPU& gpu);
+template
+void ConvertAXIYTexture(u32 width, u32 height, u32* output, u32 addr, u32 palAddr, GPU& gpu);
+template
+void ConvertNColorsTexture(u32 width, u32 height, u32* output, u32 addr, u32 palAddr, bool color0Transparent, GPU& gpu);
+
+template
+class Texcache
+{
+public:
+ Texcache(const TexLoaderT& texloader) // stores its own copy of the loader used to create, upload and delete textures
+ : TexLoader(texloader) // NOTE(review): the loader is copied here; moving it in may be preferable
+ {}
+
+    // Hashes `size` bytes of `vram` starting at `addr`. A range that runs past
+    // `vramSize` continues at offset 0. Each piece is hashed with XXH64, using
+    // the previous piece's hash as the seed.
+    u64 MaskedHash(u8* vram, u32 vramSize, u32 addr, u32 size)
+    {
+        u64 runningHash = 0;
+
+        for (u32 remaining = size; remaining > 0;)
+        {
+            // Take the rest of the range, or only the part up to the end of
+            // VRAM when the range wraps around.
+            const bool wraps = addr + remaining > vramSize;
+            const u32 chunk = wraps ? vramSize - addr : remaining;
+
+            runningHash = XXH64(&vram[addr], chunk, runningHash);
+
+            // The mask only wraps correctly when vramSize is a power of two.
+            addr = (addr + chunk) & (vramSize - 1);
+            assert(remaining >= chunk);
+            remaining -= chunk;
+        }
+
+        return runningHash;
+    }
+
+    // Returns true when the cached range [start, start+size) overlaps a set bit in `dirty`
+    // (one bit per VRAMDirtyGranularity bytes) and its hash no longer equals oldHash.
+    bool CheckInvalid(u32 start, u32 size, u64 oldHash, u64* dirty, u8* vram, u32 vramSize)
+    {
+        u32 startBit = start / VRAMDirtyGranularity;
+        u32 bitsCount = ((start + size + VRAMDirtyGranularity - 1) / VRAMDirtyGranularity) - startBit;
+        u32 startEntry = startBit >> 6;
+        u64 entriesCount = ((startBit + bitsCount + 0x3F) >> 6) - startEntry;
+        // dirty[] packs 64 bits per word, so a wrapping range must wrap at the word count, not the bit count
+        u32 entryMask = (((vramSize / VRAMDirtyGranularity) + 0x3F) >> 6) - 1;
+        for (u32 j = startEntry; j < startEntry + entriesCount; j++)
+        {
+            // the hash covers the whole range, so it is computed once, on the first dirty word
+            if (GetRangedBitMask(j, startBit, bitsCount) & dirty[j & entryMask])
+                return MaskedHash(vram, vramSize, start, size) != oldHash;
+        }
+        return false;
+    }
+
+ bool Update(GPU& gpu) // drops cache entries whose VRAM contents changed; returns true when texture or palette VRAM changed
+ {
+ auto textureDirty = gpu.VRAMDirty_Texture.DeriveState(gpu.VRAMMap_Texture, gpu);
+ auto texPalDirty = gpu.VRAMDirty_TexPal.DeriveState(gpu.VRAMMap_TexPal, gpu);
+
+ bool textureChanged = gpu.MakeVRAMFlat_TextureCoherent(textureDirty);
+ bool texPalChanged = gpu.MakeVRAMFlat_TexPalCoherent(texPalDirty);
+
+ if (textureChanged || texPalChanged)
+ {
+ //printf("check invalidation %d\n", TexCache.size());
+ for (auto it = Cache.begin(); it != Cache.end();) // the iterator is advanced inside the body because entries may be erased
+ {
+ TexCacheEntry& entry = it->second;
+ if (textureChanged)
+ {
+ for (u32 i = 0; i < 2; i++) // an entry tracks up to two texture RAM ranges
+ {
+ if (CheckInvalid(entry.TextureRAMStart[i], entry.TextureRAMSize[i],
+ entry.TextureHash[i],
+ textureDirty.Data,
+ gpu.VRAMFlat_Texture, sizeof(gpu.VRAMFlat_Texture)))
+ goto invalidate;
+ }
+ }
+
+ if (texPalChanged && entry.TexPalSize > 0) // entries without a palette range have TexPalSize == 0
+ {
+ if (CheckInvalid(entry.TexPalStart, entry.TexPalSize,
+ entry.TexPalHash,
+ texPalDirty.Data,
+ gpu.VRAMFlat_TexPal, sizeof(gpu.VRAMFlat_TexPal)))
+ goto invalidate;
+ }
+
+ it++; // entry is still valid: keep it and move on
+ continue;
+ invalidate:
+ FreeTextures[entry.WidthLog2][entry.HeightLog2].push_back(entry.Texture); // return the layer to the free list for its size
+
+ //printf("invalidating texture %d\n", entry.ImageDescriptor);
+
+ it = Cache.erase(it); // erase returns the iterator to the next entry
+ }
+
+ return true;
+ }
+
+ return false;
+ }
+
+ void GetTexture(GPU& gpu, u32 texParam, u32 palBase, TexHandleT& textureHandle, u32& layer, u32*& helper) // looks up or decodes+uploads a texture; outputs its handle, array layer and a pointer to the entry's LastVariant
+ {
+ // remove sampling and texcoord gen params
+ texParam &= ~0xC00F0000;
+
+ u32 fmt = (texParam >> 26) & 0x7;
+ u64 key = texParam; // cache key: masked texParam in the low 32 bits, palBase in the high 32 bits
+ if (fmt != 7) // format 7 is decoded without a palette, so palBase stays out of its key
+ {
+ key |= (u64)palBase << 32;
+ if (fmt == 5) // bit 29 is not passed to the format 5 decoder, so it is cleared from the key
+ key &= ~((u64)1 << 29);
+ }
+ //printf("%" PRIx64 " %" PRIx32 " %" PRIx32 "\n", key, texParam, palBase);
+
+ assert(fmt != 0 && "no texture is not a texture format!");
+
+ auto it = Cache.find(key);
+
+ if (it != Cache.end()) // cache hit: hand back the stored texture
+ {
+ textureHandle = it->second.Texture.TextureID;
+ layer = it->second.Texture.Layer;
+ helper = &it->second.LastVariant;
+ return;
+ }
+
+ u32 widthLog2 = (texParam >> 20) & 0x7;
+ u32 heightLog2 = (texParam >> 23) & 0x7;
+ u32 width = 8 << widthLog2;
+ u32 height = 8 << heightLog2;
+
+ u32 addr = (texParam & 0xFFFF) * 8; // texture address is stored in units of 8 bytes
+
+ TexCacheEntry entry = {0};
+
+ entry.TextureRAMStart[0] = addr;
+ entry.WidthLog2 = widthLog2;
+ entry.HeightLog2 = heightLog2;
+
+ // cache miss: decode the texture into DecodingBuffer and record the VRAM ranges it depends on
+ if (fmt == 7) // 16 bits per texel, no palette
+ {
+ entry.TextureRAMSize[0] = width*height*2;
+
+ ConvertBitmapTexture(width, height, DecodingBuffer, addr, gpu);
+ }
+ else if (fmt == 5) // 4x4 block-compressed: index data at addr, per-block palette info at slot1addr
+ {
+ u32 slot1addr = 0x20000 + ((addr & 0x1FFFC) >> 1);
+ if (addr >= 0x40000)
+ slot1addr += 0x10000;
+
+ entry.TextureRAMSize[0] = width*height/16*4; // 4 bytes of indices per 16-texel block
+ entry.TextureRAMStart[1] = slot1addr;
+ entry.TextureRAMSize[1] = width*height/16*2; // 2 bytes of palette info per block
+ entry.TexPalStart = palBase*16;
+ entry.TexPalSize = 0x10000; // block palette offsets are 14 bits in 4-byte units, i.e. up to 0x10000 bytes
+
+ ConvertCompressedTexture(width, height, DecodingBuffer, addr, slot1addr, entry.TexPalStart, gpu);
+ }
+ else
+ {
+ u32 texSize, palAddr = palBase*16, numPalEntries;
+ switch (fmt) // texSize in bytes and palette entry count per paletted format
+ {
+ case 1: texSize = width*height; numPalEntries = 32; break;
+ case 6: texSize = width*height; numPalEntries = 8; break;
+ case 2: texSize = width*height/4; numPalEntries = 4; palAddr >>= 1; break;
+ case 3: texSize = width*height/2; numPalEntries = 16; break;
+ case 4: texSize = width*height; numPalEntries = 256; break;
+ }
+
+ palAddr &= 0x1FFFF;
+
+ /*printf("creating texture | fmt: %d | %dx%d | %08x | %08x\n", fmt, width, height, addr, palAddr);
+ svcSleepThread(1000*1000);*/
+
+ entry.TextureRAMSize[0] = texSize;
+ entry.TexPalStart = palAddr;
+ entry.TexPalSize = numPalEntries*2; // palette entries are 2 bytes each
+
+ //assert(entry.TexPalStart+entry.TexPalSize <= 128*1024*1024);
+
+ bool color0Transparent = texParam & (1 << 29);
+
+ switch (fmt)
+ {
+ case 1: ConvertAXIYTexture(width, height, DecodingBuffer, addr, palAddr, gpu); break;
+ case 6: ConvertAXIYTexture(width, height, DecodingBuffer, addr, palAddr, gpu); break;
+ case 2: ConvertNColorsTexture(width, height, DecodingBuffer, addr, palAddr, color0Transparent, gpu); break;
+ case 3: ConvertNColorsTexture(width, height, DecodingBuffer, addr, palAddr, color0Transparent, gpu); break;
+ case 4: ConvertNColorsTexture(width, height, DecodingBuffer, addr, palAddr, color0Transparent, gpu); break;
+ }
+ }
+ // hash every range the entry depends on; these are the hashes Update() passes to CheckInvalid
+ for (int i = 0; i < 2; i++)
+ {
+ if (entry.TextureRAMSize[i])
+ entry.TextureHash[i] = MaskedHash(gpu.VRAMFlat_Texture, sizeof(gpu.VRAMFlat_Texture),
+ entry.TextureRAMStart[i], entry.TextureRAMSize[i]);
+ }
+ if (entry.TexPalSize)
+ entry.TexPalHash = MaskedHash(gpu.VRAMFlat_TexPal, sizeof(gpu.VRAMFlat_TexPal),
+ entry.TexPalStart, entry.TexPalSize);
+
+ auto& texArrays = TexArrays[widthLog2][heightLog2];
+ auto& freeTextures = FreeTextures[widthLog2][heightLog2];
+
+ if (freeTextures.size() == 0) // no free layer of this size: allocate a new array texture
+ {
+ texArrays.resize(texArrays.size()+1);
+ TexHandleT& array = texArrays[texArrays.size()-1];
+
+ u32 layers = std::min((8*1024*1024) / (width*height*4), 64); // at most 64 layers and at most 8 MiB (4 bytes per texel) per array
+
+ // allocate new array texture
+ //printf("allocating new layer set for %d %d %d %d\n", width, height, texArrays.size()-1, array.ImageDescriptor);
+ array = TexLoader.GenerateTexture(width, height, layers);
+
+ for (u32 i = 0; i < layers; i++)
+ {
+ freeTextures.push_back(TexArrayEntry{array, i});
+ }
+ }
+
+ TexArrayEntry storagePlace = freeTextures[freeTextures.size()-1]; // take the most recently freed layer
+ freeTextures.pop_back();
+
+ entry.Texture = storagePlace;
+
+ TexLoader.UploadTexture(storagePlace.TextureID, width, height, storagePlace.Layer, DecodingBuffer);
+ //printf("using storage place %d %d | %d %d (%d)\n", width, height, storagePlace.TexArrayIdx, storagePlace.LayerIdx, array.ImageDescriptor);
+
+ textureHandle = storagePlace.TextureID;
+ layer = storagePlace.Layer;
+ helper = &Cache.emplace(std::make_pair(key, entry)).first->second.LastVariant; // insert the entry and point helper at the stored copy
+ }
+
+    void Reset() // deletes every texture array through the loader, then clears all bookkeeping
+    {
+        for (u32 w = 0; w < 8; w++)
+        {
+            for (u32 h = 0; h < 8; h++)
+            {
+                for (auto& handle : TexArrays[w][h])
+                    TexLoader.DeleteTexture(handle);
+                TexArrays[w][h].clear();
+                FreeTextures[w][h].clear();
+            }
+        }
+        Cache.clear();
+    }
+private:
+ struct TexArrayEntry
+ {
+ TexHandleT TextureID;
+ u32 Layer;
+ };
+
+ struct TexCacheEntry
+ {
+ u32 LastVariant; // very cheap way to make variant lookup faster
+
+ u32 TextureRAMStart[2], TextureRAMSize[2];
+ u32 TexPalStart, TexPalSize;
+ u8 WidthLog2, HeightLog2;
+ TexArrayEntry Texture;
+
+ u64 TextureHash[2];
+ u64 TexPalHash;
+ };
+ std::unordered_map Cache;
+
+ TexLoaderT TexLoader;
+
+ std::vector FreeTextures[8][8];
+ std::vector TexArrays[8][8];
+
+ u32 DecodingBuffer[1024*1024];
+};
+
+}
+
+#endif
\ No newline at end of file
diff --git a/src/GPU3D_TexcacheOpenGL.cpp b/src/GPU3D_TexcacheOpenGL.cpp
new file mode 100644
index 00000000..95ca8cdc
--- /dev/null
+++ b/src/GPU3D_TexcacheOpenGL.cpp
@@ -0,0 +1,29 @@
+#include "GPU3D_TexcacheOpenGL.h"
+
+namespace melonDS
+{
+
+GLuint TexcacheOpenGLLoader::GenerateTexture(u32 width, u32 height, u32 layers) // creates a 2D array texture and returns its GL name
+{
+ GLuint texarray;
+ glGenTextures(1, &texarray);
+ glBindTexture(GL_TEXTURE_2D_ARRAY, texarray); // NOTE: the new texture stays bound to GL_TEXTURE_2D_ARRAY after this call
+ glTexStorage3D(GL_TEXTURE_2D_ARRAY, 1, GL_RGBA8UI, width, height, layers); // immutable storage: 1 mip level, 8-bit unsigned-integer RGBA, `layers` slices
+ return texarray;
+}
+
+void TexcacheOpenGLLoader::UploadTexture(GLuint handle, u32 width, u32 height, u32 layer, void* data) // copies one width x height image into layer `layer` of array texture `handle`
+{
+ glBindTexture(GL_TEXTURE_2D_ARRAY, handle); // NOTE: changes the current GL_TEXTURE_2D_ARRAY binding
+ glTexSubImage3D(GL_TEXTURE_2D_ARRAY,
+ 0, 0, 0, layer, // mip level 0, x/y offset 0, z offset selects the layer
+ width, height, 1, // a single slice of width x height texels
+ GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, data); // source texels: four unsigned bytes each, integer (non-normalized) format
+}
+
+void TexcacheOpenGLLoader::DeleteTexture(GLuint handle) // deletes the GL texture named by `handle`
+{
+ glDeleteTextures(1, &handle); // count of 1: `handle` is the only name in the array passed
+}
+
+}
\ No newline at end of file
diff --git a/src/GPU3D_TexcacheOpenGL.h b/src/GPU3D_TexcacheOpenGL.h
new file mode 100644
index 00000000..a8cfa576
--- /dev/null
+++ b/src/GPU3D_TexcacheOpenGL.h
@@ -0,0 +1,25 @@
+#ifndef GPU3D_TEXCACHEOPENGL
+#define GPU3D_TEXCACHEOPENGL
+
+#include "GPU3D_Texcache.h"
+#include "OpenGLSupport.h"
+
+namespace melonDS
+{
+
+template
+class Texcache;
+
+class TexcacheOpenGLLoader // OpenGL texture backend: wraps creation, upload and deletion of GL_TEXTURE_2D_ARRAY textures
+{
+public:
+ GLuint GenerateTexture(u32 width, u32 height, u32 layers); // returns the GL name of a new array texture with `layers` layers
+ void UploadTexture(GLuint handle, u32 width, u32 height, u32 layer, void* data); // writes `data` into one layer of `handle`
+ void DeleteTexture(GLuint handle); // deletes the texture named by `handle`
+};
+
+using TexcacheOpenGL = Texcache;
+
+}
+
+#endif
\ No newline at end of file
diff --git a/src/GPU_OpenGL.cpp b/src/GPU_OpenGL.cpp
index 2e2857ce..a58dbedb 100644
--- a/src/GPU_OpenGL.cpp
+++ b/src/GPU_OpenGL.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -36,32 +36,26 @@ using namespace OpenGL;
std::optional GLCompositor::New() noexcept
{
assert(glBindAttribLocation != nullptr);
+ GLuint CompShader {};
- std::array CompShader {};
- if (!OpenGL::BuildShaderProgram(kCompositorVS, kCompositorFS_Nearest, &CompShader[0], "CompositorShader"))
- return std::nullopt;
-
- glBindAttribLocation(CompShader[2], 0, "vPosition");
- glBindAttribLocation(CompShader[2], 1, "vTexcoord");
- glBindFragDataLocation(CompShader[2], 0, "oColor");
-
- if (!OpenGL::LinkShaderProgram(CompShader.data()))
- // OpenGL::LinkShaderProgram already deletes the shader program object
- // if linking the shaders together failed.
+ if (!OpenGL::CompileVertexFragmentProgram(CompShader,
+ kCompositorVS, kCompositorFS_Nearest,
+ "CompositorShader",
+ {{"vPosition", 0}, {"vTexcoord", 1}},
+ {{"oColor", 0}}))
return std::nullopt;
return { GLCompositor(CompShader) };
}
-GLCompositor::GLCompositor(std::array compShader) noexcept : CompShader(compShader)
+GLCompositor::GLCompositor(GLuint compShader) noexcept : CompShader(compShader)
{
- CompScaleLoc = glGetUniformLocation(CompShader[2], "u3DScale");
- Comp3DXPosLoc = glGetUniformLocation(CompShader[2], "u3DXPos");
+ CompScaleLoc = glGetUniformLocation(CompShader, "u3DScale");
- glUseProgram(CompShader[2]);
- GLuint screenTextureUniform = glGetUniformLocation(CompShader[2], "ScreenTex");
+ glUseProgram(CompShader);
+ GLuint screenTextureUniform = glGetUniformLocation(CompShader, "ScreenTex");
glUniform1i(screenTextureUniform, 0);
- GLuint _3dTextureUniform = glGetUniformLocation(CompShader[2], "_3DTex");
+ GLuint _3dTextureUniform = glGetUniformLocation(CompShader, "_3DTex");
glUniform1i(_3dTextureUniform, 1);
// all this mess is to prevent bleeding
@@ -136,7 +130,7 @@ GLCompositor::~GLCompositor()
glDeleteVertexArrays(1, &CompVertexArrayID);
glDeleteBuffers(1, &CompVertexBufferID);
- OpenGL::DeleteShaderProgram(CompShader.data());
+ glDeleteProgram(CompShader);
}
@@ -145,7 +139,6 @@ GLCompositor::GLCompositor(GLCompositor&& other) noexcept :
ScreenH(other.ScreenH),
ScreenW(other.ScreenW),
CompScaleLoc(other.CompScaleLoc),
- Comp3DXPosLoc(other.Comp3DXPosLoc),
CompVertices(other.CompVertices),
CompShader(other.CompShader),
CompVertexBufferID(other.CompVertexBufferID),
@@ -170,11 +163,10 @@ GLCompositor& GLCompositor::operator=(GLCompositor&& other) noexcept
ScreenH = other.ScreenH;
ScreenW = other.ScreenW;
CompScaleLoc = other.CompScaleLoc;
- Comp3DXPosLoc = other.Comp3DXPosLoc;
CompVertices = other.CompVertices;
// Clean up these resources before overwriting them
- OpenGL::DeleteShaderProgram(CompShader.data());
+ glDeleteProgram(CompShader);
CompShader = other.CompShader;
glDeleteBuffers(1, &CompVertexBufferID);
@@ -244,11 +236,11 @@ void GLCompositor::Stop(const GPU& gpu) noexcept
glBindFramebuffer(GL_FRAMEBUFFER, 0);
}
-void GLCompositor::RenderFrame(const GPU& gpu, GLRenderer& renderer) noexcept
+void GLCompositor::RenderFrame(const GPU& gpu, Renderer3D& renderer) noexcept
{
- int frontbuf = gpu.FrontBuffer;
+ int backbuf = gpu.FrontBuffer ^ 1;
glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
- glBindFramebuffer(GL_DRAW_FRAMEBUFFER, CompScreenOutputFB[frontbuf]);
+ glBindFramebuffer(GL_DRAW_FRAMEBUFFER, CompScreenOutputFB[backbuf]);
glDisable(GL_DEPTH_TEST);
glDisable(GL_STENCIL_TEST);
@@ -260,21 +252,18 @@ void GLCompositor::RenderFrame(const GPU& gpu, GLRenderer& renderer) noexcept
glClear(GL_COLOR_BUFFER_BIT);
// TODO: select more shaders (filtering, etc)
- OpenGL::UseShaderProgram(CompShader.data());
+ glUseProgram(CompShader);
glUniform1ui(CompScaleLoc, Scale);
- // TODO: support setting this midframe, if ever needed
- glUniform1i(Comp3DXPosLoc, ((int)gpu.GPU3D.GetRenderXPos() << 23) >> 23);
-
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, CompScreenInputTex);
- if (gpu.Framebuffer[frontbuf][0] && gpu.Framebuffer[frontbuf][1])
+ if (gpu.Framebuffer[backbuf][0] && gpu.Framebuffer[backbuf][1])
{
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, 256*3 + 1, 192, GL_RGBA_INTEGER,
- GL_UNSIGNED_BYTE, gpu.Framebuffer[frontbuf][0].get());
+ GL_UNSIGNED_BYTE, gpu.Framebuffer[backbuf][0].get());
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 192, 256*3 + 1, 192, GL_RGBA_INTEGER,
- GL_UNSIGNED_BYTE, gpu.Framebuffer[frontbuf][1].get());
+ GL_UNSIGNED_BYTE, gpu.Framebuffer[backbuf][1].get());
}
glActiveTexture(GL_TEXTURE1);
diff --git a/src/GPU_OpenGL.h b/src/GPU_OpenGL.h
index 9c040966..2e482861 100644
--- a/src/GPU_OpenGL.h
+++ b/src/GPU_OpenGL.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -28,6 +28,7 @@ namespace melonDS
class GPU;
struct RenderSettings;
class GLRenderer;
+class Renderer3D;
class GLCompositor
{
public:
@@ -42,16 +43,15 @@ public:
[[nodiscard]] int GetScaleFactor() const noexcept { return Scale; }
void Stop(const GPU& gpu) noexcept;
- void RenderFrame(const GPU& gpu, GLRenderer& renderer) noexcept;
+ void RenderFrame(const GPU& gpu, Renderer3D& renderer) noexcept;
void BindOutputTexture(int buf);
private:
- GLCompositor(std::array CompShader) noexcept;
+ GLCompositor(GLuint CompShader) noexcept;
int Scale = 0;
int ScreenH = 0, ScreenW = 0;
- std::array CompShader {};
+ GLuint CompShader {};
GLuint CompScaleLoc = 0;
- GLuint Comp3DXPosLoc = 0;
GLuint CompVertexBufferID = 0;
GLuint CompVertexArrayID = 0;
diff --git a/src/GPU_OpenGL_shaders.h b/src/GPU_OpenGL_shaders.h
index a8c5b951..3c463ab8 100644
--- a/src/GPU_OpenGL_shaders.h
+++ b/src/GPU_OpenGL_shaders.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -43,7 +43,6 @@ void main()
const char* kCompositorFS_Nearest = R"(#version 140
uniform uint u3DScale;
-uniform int u3DXPos;
uniform usampler2D ScreenTex;
uniform sampler2D _3DTex;
@@ -56,11 +55,13 @@ void main()
{
ivec4 pixel = ivec4(texelFetch(ScreenTex, ivec2(fTexcoord), 0));
- float _3dxpos = float(u3DXPos);
-
ivec4 mbright = ivec4(texelFetch(ScreenTex, ivec2(256*3, int(fTexcoord.y)), 0));
int dispmode = mbright.b & 0x3;
+ // mbright.a == HOFS bit0..7
+ // mbright.b bit7 == HOFS bit8 (sign)
+ float _3dxpos = float(mbright.a - ((mbright.b & 0x80) * 2));
+
if (dispmode == 1)
{
ivec4 val1 = pixel;
diff --git a/src/JitBlock.h b/src/JitBlock.h
index 9b31d6d7..2dc516fd 100644
--- a/src/JitBlock.h
+++ b/src/JitBlock.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/MemConstants.h b/src/MemConstants.h
index e9aa6b2b..3e10cbce 100644
--- a/src/MemConstants.h
+++ b/src/MemConstants.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/MemRegion.h b/src/MemRegion.h
index 11b3d1da..0a8212c7 100644
--- a/src/MemRegion.h
+++ b/src/MemRegion.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
diff --git a/src/NDS.cpp b/src/NDS.cpp
index 62a07b4e..7e8711bf 100644
--- a/src/NDS.cpp
+++ b/src/NDS.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -35,6 +35,7 @@
#include "Platform.h"
#include "FreeBIOS.h"
#include "Args.h"
+#include "version.h"
#include "DSi.h"
#include "DSi_SPI_TSC.h"
@@ -80,18 +81,19 @@ NDS::NDS() noexcept :
NDSArgs {
nullptr,
nullptr,
- bios_arm9_bin,
- bios_arm7_bin,
+ std::make_unique(bios_arm9_bin),
+ std::make_unique(bios_arm7_bin),
Firmware(0),
}
)
{
}
-NDS::NDS(NDSArgs&& args, int type) noexcept :
+NDS::NDS(NDSArgs&& args, int type, void* userdata) noexcept :
ConsoleType(type),
- ARM7BIOS(args.ARM7BIOS),
- ARM9BIOS(args.ARM9BIOS),
+ UserData(userdata),
+ ARM7BIOS(*args.ARM7BIOS),
+ ARM9BIOS(*args.ARM9BIOS),
ARM7BIOSNative(CRC32(ARM7BIOS.data(), ARM7BIOS.size()) == ARM7BIOSCRC32),
ARM9BIOSNative(CRC32(ARM9BIOS.data(), ARM9BIOS.size()) == ARM9BIOSCRC32),
JIT(*this, args.JIT),
@@ -101,10 +103,13 @@ NDS::NDS(NDSArgs&& args, int type) noexcept :
RTC(*this),
Wifi(*this),
NDSCartSlot(*this, std::move(args.NDSROM)),
- GBACartSlot(type == 1 ? nullptr : std::move(args.GBAROM)),
+ GBACartSlot(*this, type == 1 ? nullptr : std::move(args.GBAROM)),
AREngine(*this),
ARM9(*this, args.GDB, args.JIT.has_value()),
ARM7(*this, args.GDB, args.JIT.has_value()),
+#ifdef GDBSTUB_ENABLED
+ EnableGDBStub(args.GDB.has_value()),
+#endif
#ifdef JIT_ENABLED
EnableJIT(args.JIT.has_value()),
#endif
@@ -573,7 +578,7 @@ void NDS::Stop(Platform::StopReason reason)
Log(level, "Stopping emulated console (Reason: %s)\n", StopReasonName(reason));
Running = false;
- Platform::SignalStop(reason);
+ Platform::SignalStop(reason, UserData);
GPU.Stop();
SPU.Stop();
}
@@ -754,7 +759,7 @@ void NDS::SetDebugPrint(bool enabled) noexcept
void NDS::LoadGBAAddon(int type)
{
- GBACartSlot.LoadAddon(type);
+ GBACartSlot.LoadAddon(UserData, type);
}
void NDS::LoadBIOS()
@@ -889,7 +894,7 @@ void NDS::RunSystemSleep(u64 timestamp)
}
}
-template
+template
u32 NDS::RunFrame()
{
FrameStartTimestamp = SysTimestamp;
@@ -930,8 +935,11 @@ u32 NDS::RunFrame()
}
else
{
- ARM9.CheckGdbIncoming();
- ARM7.CheckGdbIncoming();
+ if (cpuMode == CPUExecuteMode::InterpreterGDB)
+ {
+ ARM9.CheckGdbIncoming();
+ ARM7.CheckGdbIncoming();
+ }
if (!(CPUStop & CPUStop_Wakeup))
{
@@ -966,12 +974,7 @@ u32 NDS::RunFrame()
}
else
{
-#ifdef JIT_ENABLED
- if (EnableJIT)
- ARM9.ExecuteJIT();
- else
-#endif
- ARM9.Execute();
+ ARM9.Execute();
}
RunTimers(0);
@@ -998,12 +1001,7 @@ u32 NDS::RunFrame()
}
else
{
-#ifdef JIT_ENABLED
- if (EnableJIT)
- ARM7.ExecuteJIT();
- else
-#endif
- ARM7.Execute();
+ ARM7.Execute();
}
RunTimers(1);
@@ -1048,10 +1046,18 @@ u32 NDS::RunFrame()
{
#ifdef JIT_ENABLED
if (EnableJIT)
- return RunFrame();
+ return RunFrame();
else
#endif
- return RunFrame();
+#ifdef GDBSTUB_ENABLED
+ if (EnableGDBStub)
+ {
+ return RunFrame();
+ } else
+#endif
+ {
+ return RunFrame();
+ }
}
void NDS::Reschedule(u64 target)
@@ -1466,7 +1472,7 @@ u64 NDS::GetSysClockCycles(int num)
return ret;
}
-void NDS::NocashPrint(u32 ncpu, u32 addr)
+void NDS::NocashPrint(u32 ncpu, u32 addr, bool appendNewline)
{
// addr: debug string
@@ -1544,7 +1550,7 @@ void NDS::NocashPrint(u32 ncpu, u32 addr)
}
output[ptr] = '\0';
- Log(LogLevel::Debug, "%s", output);
+ Log(LogLevel::Debug, appendNewline ? "%s\n" : "%s", output);
}
void NDS::MonitorARM9Jump(u32 addr)
@@ -1850,7 +1856,7 @@ void NDS::debug(u32 param)
//for (int i = 0; i < 9; i++)
// printf("VRAM %c: %02X\n", 'A'+i, GPU->VRAMCNT[i]);
- Platform::FileHandle* shit = Platform::OpenFile("debug/DSfirmware.bin", FileMode::Write);
+ Platform::FileHandle* shit = Platform::OpenFile("debug/pokeplat.bin", FileMode::Write);
Platform::FileWrite(ARM9.ITCM, 0x8000, 1, shit);
for (u32 i = 0x02000000; i < 0x02400000; i+=4)
{
@@ -2733,11 +2739,37 @@ u8 NDS::ARM9IORead8(u32 addr)
case 0x04000132: return KeyCnt[0] & 0xFF;
case 0x04000133: return KeyCnt[0] >> 8;
+ case 0x040001A0:
+ if (!(ExMemCnt[0] & (1<<11)))
+ return NDSCartSlot.GetSPICnt() & 0xFF;
+ return 0;
+ case 0x040001A1:
+ if (!(ExMemCnt[0] & (1<<11)))
+ return NDSCartSlot.GetSPICnt() >> 8;
+ return 0;
+
case 0x040001A2:
if (!(ExMemCnt[0] & (1<<11)))
return NDSCartSlot.ReadSPIData();
return 0;
+ case 0x040001A4:
+ if (!(ExMemCnt[0] & (1<<11)))
+ return NDSCartSlot.GetROMCnt() & 0xFF;
+ return 0;
+ case 0x040001A5:
+ if (!(ExMemCnt[0] & (1<<11)))
+ return (NDSCartSlot.GetROMCnt() >> 8) & 0xFF;
+ return 0;
+ case 0x040001A6:
+ if (!(ExMemCnt[0] & (1<<11)))
+ return (NDSCartSlot.GetROMCnt() >> 16) & 0xFF;
+ return 0;
+ case 0x040001A7:
+ if (!(ExMemCnt[0] & (1<<11)))
+ return NDSCartSlot.GetROMCnt() >> 24;
+ return 0;
+
case 0x040001A8:
if (!(ExMemCnt[0] & (1<<11)))
return NDSCartSlot.GetROMCommand(0);
@@ -2818,7 +2850,7 @@ u8 NDS::ARM9IORead8(u32 addr)
if(addr >= 0x04FFFA00 && addr < 0x04FFFA10)
{
// FIX: GBATek says this should be padded with spaces
- static char const emuID[16] = "melonDS " MELONDS_VERSION;
+ static char const emuID[16] = "melonDS " MELONDS_VERSION_BASE;
auto idx = addr - 0x04FFFA00;
return (u8)(emuID[idx]);
}
@@ -2889,6 +2921,15 @@ u16 NDS::ARM9IORead16(u32 addr)
return NDSCartSlot.ReadSPIData();
return 0;
+ case 0x040001A4:
+ if (!(ExMemCnt[0] & (1<<11)))
+ return NDSCartSlot.GetROMCnt() & 0xFFFF;
+ return 0;
+ case 0x040001A6:
+ if (!(ExMemCnt[0] & (1<<11)))
+ return NDSCartSlot.GetROMCnt() >> 16;
+ return 0;
+
case 0x040001A8:
if (!(ExMemCnt[0] & (1<<11)))
return NDSCartSlot.GetROMCommand(0) |
@@ -2914,6 +2955,8 @@ u16 NDS::ARM9IORead16(u32 addr)
case 0x04000208: return IME[0];
case 0x04000210: return IE[0] & 0xFFFF;
case 0x04000212: return IE[0] >> 16;
+ case 0x04000214: return IF[0] & 0xFFFF;
+ case 0x04000216: return IF[0] >> 16;
case 0x04000240: return GPU.VRAMCNT[0] | (GPU.VRAMCNT[1] << 8);
case 0x04000242: return GPU.VRAMCNT[2] | (GPU.VRAMCNT[3] << 8);
@@ -3152,6 +3195,23 @@ void NDS::ARM9IOWrite8(u32 addr, u8 val)
NDSCartSlot.WriteSPIData(val);
return;
+ case 0x040001A4:
+ if (!(ExMemCnt[0] & (1<<11)))
+ NDSCartSlot.WriteROMCnt((NDSCartSlot.GetROMCnt() & 0xFFFFFF00) | val);
+ return;
+ case 0x040001A5:
+ if (!(ExMemCnt[0] & (1<<11)))
+ NDSCartSlot.WriteROMCnt((NDSCartSlot.GetROMCnt() & 0xFFFF00FF) | (val << 8));
+ return;
+ case 0x040001A6:
+ if (!(ExMemCnt[0] & (1<<11)))
+ NDSCartSlot.WriteROMCnt((NDSCartSlot.GetROMCnt() & 0xFF00FFFF) | (val << 16));
+ return;
+ case 0x040001A7:
+ if (!(ExMemCnt[0] & (1<<11)))
+ NDSCartSlot.WriteROMCnt((NDSCartSlot.GetROMCnt() & 0x00FFFFFF) | (val << 24));
+ return;
+
case 0x040001A8: if (!(ExMemCnt[0] & (1<<11))) NDSCartSlot.SetROMCommand(0, val); return;
case 0x040001A9: if (!(ExMemCnt[0] & (1<<11))) NDSCartSlot.SetROMCommand(1, val); return;
case 0x040001AA: if (!(ExMemCnt[0] & (1<<11))) NDSCartSlot.SetROMCommand(2, val); return;
@@ -3208,6 +3268,9 @@ void NDS::ARM9IOWrite16(u32 addr, u16 val)
case 0x04000060: GPU.GPU3D.Write16(addr, val); return;
+ case 0x04000064:
+ case 0x04000066: GPU.GPU2D_A.Write16(addr, val); return;
+
case 0x04000068:
case 0x0400006A: GPU.GPU2D_A.Write16(addr, val); return;
@@ -3281,6 +3344,15 @@ void NDS::ARM9IOWrite16(u32 addr, u16 val)
NDSCartSlot.WriteSPIData(val & 0xFF);
return;
+ case 0x040001A4:
+ if (!(ExMemCnt[0] & (1<<11)))
+ NDSCartSlot.WriteROMCnt((NDSCartSlot.GetROMCnt() & 0xFFFF0000) | val);
+ return;
+ case 0x040001A6:
+ if (!(ExMemCnt[0] & (1<<11)))
+ NDSCartSlot.WriteROMCnt((NDSCartSlot.GetROMCnt() & 0x0000FFFF) | (val << 16));
+ return;
+
case 0x040001A8:
if (!(ExMemCnt[0] & (1<<11)))
{
@@ -3327,6 +3399,8 @@ void NDS::ARM9IOWrite16(u32 addr, u16 val)
case 0x04000210: IE[0] = (IE[0] & 0xFFFF0000) | val; UpdateIRQ(0); return;
case 0x04000212: IE[0] = (IE[0] & 0x0000FFFF) | (val << 16); UpdateIRQ(0); return;
// TODO: what happens when writing to IF this way??
+ case 0x04000214: IF[0] &= ~val; GPU.GPU3D.CheckFIFOIRQ(); UpdateIRQ(0); return;
+ case 0x04000216: IF[0] &= ~(val<<16); GPU.GPU3D.CheckFIFOIRQ(); UpdateIRQ(0); return;
case 0x04000240:
GPU.MapVRAM_AB(0, val & 0xFF);
@@ -3551,10 +3625,8 @@ void NDS::ARM9IOWrite32(u32 addr, u32 val)
case 0x04FFFA14:
case 0x04FFFA18:
{
- bool appendLF = 0x04FFFA18 == addr;
- NocashPrint(0, val);
- if(appendLF)
- Log(LogLevel::Debug, "\n");
+ NocashPrint(0, val, 0x04FFFA18 == addr);
+
return;
}
@@ -3597,11 +3669,37 @@ u8 NDS::ARM7IORead8(u32 addr)
case 0x04000138: return RTC.Read() & 0xFF;
+ case 0x040001A0:
+ if (ExMemCnt[0] & (1<<11))
+ return NDSCartSlot.GetSPICnt() & 0xFF;
+ return 0;
+ case 0x040001A1:
+ if (ExMemCnt[0] & (1<<11))
+ return NDSCartSlot.GetSPICnt() >> 8;
+ return 0;
+
case 0x040001A2:
if (ExMemCnt[0] & (1<<11))
return NDSCartSlot.ReadSPIData();
return 0;
+ case 0x040001A4:
+ if (ExMemCnt[0] & (1<<11))
+ return NDSCartSlot.GetROMCnt() & 0xFF;
+ return 0;
+ case 0x040001A5:
+ if (ExMemCnt[0] & (1<<11))
+ return (NDSCartSlot.GetROMCnt() >> 8) & 0xFF;
+ return 0;
+ case 0x040001A6:
+ if (ExMemCnt[0] & (1<<11))
+ return (NDSCartSlot.GetROMCnt() >> 16) & 0xFF;
+ return 0;
+ case 0x040001A7:
+ if (ExMemCnt[0] & (1<<11))
+ return NDSCartSlot.GetROMCnt() >> 24;
+ return 0;
+
case 0x040001A8:
if (ExMemCnt[0] & (1<<11))
return NDSCartSlot.GetROMCommand(0);
@@ -3702,6 +3800,15 @@ u16 NDS::ARM7IORead16(u32 addr)
case 0x040001A0: if (ExMemCnt[0] & (1<<11)) return NDSCartSlot.GetSPICnt(); return 0;
case 0x040001A2: if (ExMemCnt[0] & (1<<11)) return NDSCartSlot.ReadSPIData(); return 0;
+ case 0x040001A4:
+ if (ExMemCnt[0] & (1<<11))
+ return NDSCartSlot.GetROMCnt() & 0xFFFF;
+ return 0;
+ case 0x040001A6:
+ if (ExMemCnt[0] & (1<<11))
+ return NDSCartSlot.GetROMCnt() >> 16;
+ return 0;
+
case 0x040001A8:
if (ExMemCnt[0] & (1<<11))
return NDSCartSlot.GetROMCommand(0) |
@@ -3889,6 +3996,23 @@ void NDS::ARM7IOWrite8(u32 addr, u8 val)
NDSCartSlot.WriteSPIData(val);
return;
+ case 0x040001A4:
+ if (ExMemCnt[0] & (1<<11))
+ NDSCartSlot.WriteROMCnt((NDSCartSlot.GetROMCnt() & 0xFFFFFF00) | val);
+ return;
+ case 0x040001A5:
+ if (ExMemCnt[0] & (1<<11))
+ NDSCartSlot.WriteROMCnt((NDSCartSlot.GetROMCnt() & 0xFFFF00FF) | (val << 8));
+ return;
+ case 0x040001A6:
+ if (ExMemCnt[0] & (1<<11))
+ NDSCartSlot.WriteROMCnt((NDSCartSlot.GetROMCnt() & 0xFF00FFFF) | (val << 16));
+ return;
+ case 0x040001A7:
+ if (ExMemCnt[0] & (1<<11))
+ NDSCartSlot.WriteROMCnt((NDSCartSlot.GetROMCnt() & 0x00FFFFFF) | (val << 24));
+ return;
+
case 0x040001A8: if (ExMemCnt[0] & (1<<11)) NDSCartSlot.SetROMCommand(0, val); return;
case 0x040001A9: if (ExMemCnt[0] & (1<<11)) NDSCartSlot.SetROMCommand(1, val); return;
case 0x040001AA: if (ExMemCnt[0] & (1<<11)) NDSCartSlot.SetROMCommand(2, val); return;
@@ -3994,6 +4118,15 @@ void NDS::ARM7IOWrite16(u32 addr, u16 val)
NDSCartSlot.WriteSPIData(val & 0xFF);
return;
+ case 0x040001A4: // 16-bit write to the low half of ROMCnt: replace bits 0-15, keep bits 16-31
+ if (ExMemCnt[0] & (1<<11)) // same EXMEMCNT bit-11 gate as the other ARM7 cart-slot cases in this switch
+ NDSCartSlot.WriteROMCnt((NDSCartSlot.GetROMCnt() & 0xFFFF0000) | val); // mask must clear all 16 written bits (was 0xFFFFFF00, leaving stale bits 8-15 OR-ed in)
+ return;
+ case 0x040001A6: // 16-bit write to the high half of ROMCnt: replace bits 16-31, keep bits 0-15
+ if (ExMemCnt[0] & (1<<11))
+ NDSCartSlot.WriteROMCnt((NDSCartSlot.GetROMCnt() & 0x0000FFFF) | (val << 16)); // mask must clear all 16 written bits (was 0xFF00FFFF, leaving stale bits 24-31 OR-ed in)
+ return;
+
case 0x040001A8:
if (ExMemCnt[0] & (1<<11))
{
diff --git a/src/NDS.h b/src/NDS.h
index ea07a442..985b5cb8 100644
--- a/src/NDS.h
+++ b/src/NDS.h
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -210,6 +210,7 @@ enum
enum
{
GBAAddon_RAMExpansion = 1,
+ GBAAddon_RumblePak = 2,
};
class SPU;
@@ -227,8 +228,13 @@ private:
#ifdef JIT_ENABLED
bool EnableJIT;
#endif
+#ifdef GDBSTUB_ENABLED
+ bool EnableGDBStub = false;
+#endif
public: // TODO: Encapsulate the rest of these members
+ void* UserData;
+
int ConsoleType;
int CurCPU;
@@ -424,7 +430,7 @@ public: // TODO: Encapsulate the rest of these members
u32 GetPC(u32 cpu) const;
u64 GetSysClockCycles(int num);
- void NocashPrint(u32 cpu, u32 addr);
+ void NocashPrint(u32 cpu, u32 addr, bool appendNewline = true);
void MonitorARM9Jump(u32 addr);
@@ -523,10 +529,11 @@ private:
void SetWifiWaitCnt(u16 val);
void SetGBASlotTimings();
void EnterSleepMode();
- template
+ template
u32 RunFrame();
+
public:
- NDS(NDSArgs&& args) noexcept : NDS(std::move(args), 0) {}
+ NDS(NDSArgs&& args, void* userdata = nullptr) noexcept : NDS(std::move(args), 0, userdata) {}
NDS() noexcept;
virtual ~NDS() noexcept;
NDS(const NDS&) = delete;
@@ -536,7 +543,7 @@ public:
// The frontend should set and unset this manually after creating and destroying the NDS object.
[[deprecated("Temporary workaround until JIT code generation is revised to accommodate multiple NDS objects.")]] static NDS* Current;
protected:
- explicit NDS(NDSArgs&& args, int type) noexcept;
+ explicit NDS(NDSArgs&& args, int type, void* userdata) noexcept;
virtual void DoSavestateExtra(Savestate* file) {}
};
diff --git a/src/NDSCart.cpp b/src/NDSCart.cpp
index a64d8a27..b0eef56a 100644
--- a/src/NDSCart.cpp
+++ b/src/NDSCart.cpp
@@ -1,5 +1,5 @@
/*
- Copyright 2016-2023 melonDS team
+ Copyright 2016-2024 melonDS team
This file is part of melonDS.
@@ -173,17 +173,18 @@ void NDSCartSlot::Key2_Encrypt(const u8* data, u32 len) noexcept
}
-CartCommon::CartCommon(const u8* rom, u32 len, u32 chipid, bool badDSiDump, ROMListEntry romparams, melonDS::NDSCart::CartType type) :
- CartCommon(CopyToUnique(rom, len), len, chipid, badDSiDump, romparams, type)
+CartCommon::CartCommon(const u8* rom, u32 len, u32 chipid, bool badDSiDump, ROMListEntry romparams, melonDS::NDSCart::CartType type, void* userdata) :
+ CartCommon(CopyToUnique(rom, len), len, chipid, badDSiDump, romparams, type, userdata)
{
}
-CartCommon::CartCommon(std::unique_ptr&& rom, u32 len, u32 chipid, bool badDSiDump, ROMListEntry romparams, melonDS::NDSCart::CartType type) :
+CartCommon::CartCommon(std::unique_ptr&& rom, u32 len, u32 chipid, bool badDSiDump, ROMListEntry romparams, melonDS::NDSCart::CartType type, void* userdata) :
ROM(std::move(rom)),
ROMLength(len),
ChipID(chipid),
ROMParams(romparams),
- CartType(type)
+ CartType(type),
+ UserData(userdata)
{
memcpy(&Header, ROM.get(), sizeof(Header));
IsDSi = Header.IsDSi() && !badDSiDump;
@@ -375,13 +376,13 @@ const NDSBanner* CartCommon::Banner() const
return nullptr;
}
-CartRetail::CartRetail(const u8* rom, u32 len, u32 chipid, bool badDSiDump, ROMListEntry romparams, std::unique_ptr&& sram, u32 sramlen, melonDS::NDSCart::CartType type) :
- CartRetail(CopyToUnique(rom, len), len, chipid, badDSiDump, romparams, std::move(sram), sramlen, type)
+CartRetail::CartRetail(const u8* rom, u32 len, u32 chipid, bool badDSiDump, ROMListEntry romparams, std::unique_ptr&& sram, u32 sramlen, void* userdata, melonDS::NDSCart::CartType type) :
+ CartRetail(CopyToUnique(rom, len), len, chipid, badDSiDump, romparams, std::move(sram), sramlen, userdata, type)
{
}
-CartRetail::CartRetail(std::unique_ptr&& rom, u32 len, u32 chipid, bool badDSiDump, ROMListEntry romparams, std::unique_ptr&& sram, u32 sramlen, melonDS::NDSCart::CartType type) :
- CartCommon(std::move(rom), len, chipid, badDSiDump, romparams, type)
+CartRetail::CartRetail(std::unique_ptr&& rom, u32 len, u32 chipid, bool badDSiDump, ROMListEntry romparams, std::unique_ptr&& sram, u32 sramlen, void* userdata, melonDS::NDSCart::CartType type) :
+ CartCommon(std::move(rom), len, chipid, badDSiDump, romparams, type, userdata)
{
u32 savememtype = ROMParams.SaveMemType <= 10 ? ROMParams.SaveMemType : 0;
constexpr int sramlengths[] =
@@ -469,7 +470,7 @@ void CartRetail::DoSavestate(Savestate* file)
file->Var8(&SRAMStatus);
if ((!file->Saving) && SRAM)
- Platform::WriteNDSSave(SRAM.get(), SRAMLength, 0, SRAMLength);
+ Platform::WriteNDSSave(SRAM.get(), SRAMLength, 0, SRAMLength, UserData);
}
void CartRetail::SetSaveMemory(const u8* savedata, u32 savelen)
@@ -478,7 +479,7 @@ void CartRetail::SetSaveMemory(const u8* savedata, u32 savelen)
u32 len = std::min(savelen, SRAMLength);
memcpy(SRAM.get(), savedata, len);
- Platform::WriteNDSSave(savedata, len, 0, len);
+ Platform::WriteNDSSave(savedata, len, 0, len, UserData);
}
int CartRetail::ROMCommandStart(NDS& nds, NDSCart::NDSCartSlot& cartslot, const u8* cmd, u8* data, u32 len)
@@ -594,7 +595,8 @@ u8 CartRetail::SRAMWrite_EEPROMTiny(u8 val, u32 pos, bool last)
{
SRAMStatus &= ~(1<<1);
Platform::WriteNDSSave(SRAM.get(), SRAMLength,
- (SRAMFirstAddr + ((SRAMCmd==0x0A)?0x100:0)) & 0x1FF, SRAMAddr-SRAMFirstAddr);
+ (SRAMFirstAddr + ((SRAMCmd==0x0A)?0x100:0)) & 0x1FF, SRAMAddr-SRAMFirstAddr,
+ UserData);
}
return 0;
@@ -658,7 +660,8 @@ u8 CartRetail::SRAMWrite_EEPROM(u8 val, u32 pos, bool last)
{
SRAMStatus &= ~(1<<1);
Platform::WriteNDSSave(SRAM.get(), SRAMLength,
- SRAMFirstAddr & (SRAMLength-1), SRAMAddr-SRAMFirstAddr);
+ SRAMFirstAddr & (SRAMLength-1), SRAMAddr-SRAMFirstAddr,
+ UserData);
}
return 0;
@@ -715,7 +718,8 @@ u8 CartRetail::SRAMWrite_FLASH(u8 val, u32 pos, bool last)
{
SRAMStatus &= ~(1<<1);
Platform::WriteNDSSave(SRAM.get(), SRAMLength,
- SRAMFirstAddr & (SRAMLength-1), SRAMAddr-SRAMFirstAddr);
+ SRAMFirstAddr & (SRAMLength-1), SRAMAddr-SRAMFirstAddr,
+ UserData);
}
return 0;
@@ -752,7 +756,8 @@ u8 CartRetail::SRAMWrite_FLASH(u8 val, u32 pos, bool last)
{
SRAMStatus &= ~(1<<1);
Platform::WriteNDSSave(SRAM.get(), SRAMLength,
- SRAMFirstAddr & (SRAMLength-1), SRAMAddr-SRAMFirstAddr);
+ SRAMFirstAddr & (SRAMLength-1), SRAMAddr-SRAMFirstAddr,
+ UserData);
}
return 0;
@@ -798,7 +803,8 @@ u8 CartRetail::SRAMWrite_FLASH(u8 val, u32 pos, bool last)
{
SRAMStatus &= ~(1<<1);
Platform::WriteNDSSave(SRAM.get(), SRAMLength,
- SRAMFirstAddr & (SRAMLength-1), SRAMAddr-SRAMFirstAddr);
+ SRAMFirstAddr & (SRAMLength-1), SRAMAddr-SRAMFirstAddr,
+ UserData);
}
return 0;
@@ -821,7 +827,8 @@ u8 CartRetail::SRAMWrite_FLASH(u8 val, u32 pos, bool last)
{
SRAMStatus &= ~(1<<1);
Platform::WriteNDSSave(SRAM.get(), SRAMLength,
- SRAMFirstAddr & (SRAMLength-1), SRAMAddr-SRAMFirstAddr);
+ SRAMFirstAddr & (SRAMLength-1), SRAMAddr-SRAMFirstAddr,
+ UserData);
}
return 0;
@@ -832,13 +839,13 @@ u8 CartRetail::SRAMWrite_FLASH(u8 val, u32 pos, bool last)
}
}
-CartRetailNAND::CartRetailNAND(const u8* rom, u32 len, u32 chipid, ROMListEntry romparams, std::unique_ptr&& sram, u32 sramlen) :
- CartRetailNAND(CopyToUnique(rom, len), len, chipid, romparams, std::move(sram), sramlen)
+CartRetailNAND::CartRetailNAND(const u8* rom, u32 len, u32 chipid, ROMListEntry romparams, std::unique_ptr