diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml new file mode 100644 index 00000000..e2b942ab --- /dev/null +++ b/.github/workflows/build-macos.yml @@ -0,0 +1,39 @@ +name: CMake Build (macOS x86-64) + +on: + push: + branches: + - master + pull_request: + branches: + - master + +env: + BUILD_TYPE: Release + +jobs: + build: + + runs-on: macos-latest + + steps: + - uses: actions/checkout@v1 + - name: Install dependencies + working-directory: ${{runner.workspace}} + run: | + brew install cmake sdl2 qt5 libslirp + - name: Create build environment + run: mkdir ${{runner.workspace}}/build + - name: Configure + working-directory: ${{runner.workspace}}/build + run: cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DQt5_DIR=$(brew --prefix qt5)/lib/cmake/Qt5 + - name: Make + working-directory: ${{runner.workspace}}/build + run: | + make -j$(sysctl -n hw.ncpu) + mkdir dist + cp -r melonDS.app dist + - uses: actions/upload-artifact@v1 + with: + name: melonDS.app + path: ${{runner.workspace}}/build/dist diff --git a/.github/workflows/build-ubuntu-aarch64.yml b/.github/workflows/build-ubuntu-aarch64.yml index c5ce2eb2..6ea78ea8 100644 --- a/.github/workflows/build-ubuntu-aarch64.yml +++ b/.github/workflows/build-ubuntu-aarch64.yml @@ -29,6 +29,7 @@ jobs: shell: bash working-directory: ${{runner.workspace}} run: | + sudo rm -f /etc/apt/sources.list.d/kubernetes.list sudo dpkg --add-architecture arm64 sudo sh -c "sed \"s|^deb \([a-z\.:/]*\) \([a-z\-]*\) \(.*\)$|deb [arch=amd64] \1 \2 \3\ndeb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports \2 \3|\" /etc/apt/sources.list > /etc/apt/sources.list.new" sudo rm /etc/apt/sources.list diff --git a/.github/workflows/build-ubuntu.yml b/.github/workflows/build-ubuntu.yml index 97825f05..d2070d8b 100644 --- a/.github/workflows/build-ubuntu.yml +++ b/.github/workflows/build-ubuntu.yml @@ -8,10 +8,6 @@ on: branches: - master -env: - BUILD_TYPE: Release - CMAKE_VERSION: 3.15.2 - jobs: build: @@ -20,25 
+16,21 @@ jobs: steps: - uses: actions/checkout@v1 - name: Install dependencies - shell: bash - working-directory: ${{runner.workspace}} - run: | # Fetch a new version of CMake, because the default is too old. - sudo rm -f /etc/apt/sources.list.d/dotnetdev.list /etc/apt/sources.list.d/microsoft-prod.list \ - && sudo apt update \ - && sudo apt install cmake libcurl4-gnutls-dev libpcap0.8-dev libsdl2-dev qt5-default libslirp0=4.1.0-2ubuntu2.1 libslirp-dev libarchive-dev --allow-downgrades + run: | + sudo rm -f /etc/apt/sources.list.d/dotnetdev.list /etc/apt/sources.list.d/microsoft-prod.list + sudo apt update + sudo apt install cmake libcurl4-gnutls-dev libpcap0.8-dev libsdl2-dev qt5-default libslirp0 libslirp-dev libarchive-dev --allow-downgrades - name: Create build environment run: mkdir ${{runner.workspace}}/build - name: Configure - shell: bash working-directory: ${{runner.workspace}}/build - run: cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE + run: cmake $GITHUB_WORKSPACE - name: Make - shell: bash working-directory: ${{runner.workspace}}/build run: | - make -j$(nproc --all) \ - && mkdir dist \ - && cp melonDS dist + make -j$(nproc --all) + mkdir dist + cp melonDS dist - uses: actions/upload-artifact@v1 with: name: melonDS diff --git a/.gitignore b/.gitignore index 3c877403..a38b5a38 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ cmake-build-debug .idea *.exe + +.DS_Store diff --git a/CMakeLists.txt b/CMakeLists.txt index 9cecc50d..59a3f2d0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,20 +1,30 @@ cmake_minimum_required(VERSION 3.13) +include(CheckSymbolExists) +include(CheckLibraryExists) + cmake_policy(VERSION 3.13) if (POLICY CMP0076) cmake_policy(SET CMP0076 NEW) endif() -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_OSX_DEPLOYMENT_TARGET "10.9" CACHE STRING "Minimum OS X deployment version") + +project(melonDS CXX) + +set(CMAKE_C_STANDARD 11) +set(CMAKE_CXX_STANDARD 17) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) 
-project(melonDS) - -if (NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE Release) +check_library_exists(m pow "" LIBM) +if(LIBM) + link_libraries(m) +endif() + +if (NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) endif() -include(CheckSymbolExists) function(detect_architecture symbol arch) if (NOT DEFINED ARCHITECTURE) set(CMAKE_REQUIRED_QUIET 1) @@ -65,17 +75,36 @@ if (CMAKE_BUILD_TYPE STREQUAL Release) add_link_options(-s) endif() -add_compile_options(-fno-pic) -add_link_options(-no-pie) - -option(BUILD_QT_SDL "Build Qt/SDL frontend" ON) - if (WIN32) option(BUILD_STATIC "Statically link dependencies" OFF) endif() +if (ENABLE_LTO) + if (WIN32) + add_compile_options(-flto) + add_link_options(-flto) + else() + add_compile_options(-flto -fPIC) + add_link_options(-flto -fuse-linker-plugin -pie) + endif() +endif() + +if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + set(CMAKE_AR "gcc-ar") + set(CMAKE_RANLIB "gcc-ranlib") +elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + find_program(LLD NAMES ld.lld ld64.lld lld-link) + if (NOT LLD STREQUAL "LLD-NOTFOUND") + add_link_options(-fuse-ld=lld) + endif() + set(CMAKE_AR "llvm-ar") + set(CMAKE_RANLIB "llvm-ranlib") +endif() + +option(BUILD_QT_SDL "Build Qt/SDL frontend" ON) + add_subdirectory(src) if (BUILD_QT_SDL) add_subdirectory(src/frontend/qt_sdl) -endif() +endif() \ No newline at end of file diff --git a/README.md b/README.md index ec218dd7..8df34df2 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ As for the rest, the interface should be pretty straightforward. 
If you have a q * Install dependencies: ```sh -sudo apt-get install cmake libgtk-3-dev libcurl4-gnutls-dev libpcap0.8-dev libsdl2-dev qtbase5-dev qtdeclarative5-dev libslirp-dev libarchive-dev +sudo apt-get install cmake libcurl4-gnutls-dev libpcap0.8-dev libsdl2-dev qtbase5-dev qtdeclarative5-dev libslirp-dev libarchive-dev ``` * Compile: @@ -84,6 +84,21 @@ If everything went well, melonDS and the libraries it needs should now be in the ``` If everything went well, melonDS should now be in the `dist` folder. +### macOS: +1. Install the [Homebrew Package Manager](https://brew.sh) +2. Install dependencies: `brew install git pkg-config cmake sdl2 qt5 libslirp libarchive` +3. Compile: + ```zsh + git clone https://github.com/Arisotura/melonDS.git + cd melonDS + mkdir build && cd build + cmake .. -DQt5_DIR=$(brew --prefix qt5)/lib/cmake/Qt5 + make -j$(sysctl -n hw.ncpu) + mkdir dist && cp -r melonDS.app dist + ``` +If everything went well, melonDS.app should now be in the `dist` folder. + + ## TODO LIST * DSi emulation diff --git a/melonDS.icns b/melonDS.icns new file mode 100644 index 00000000..b4f37335 Binary files /dev/null and b/melonDS.icns differ diff --git a/melonDS.plist b/melonDS.plist new file mode 100644 index 00000000..1328777e --- /dev/null +++ b/melonDS.plist @@ -0,0 +1,24 @@ + + + + + CFBundleExecutable + melonDS + CFBundleIconFile + melonDS.icns + CFBundleIdentifier + net.kuribo64.melonDS + CFBundleDevelopmentRegion + English + CFBundlePackageType + APPL + CFBundleVersion + 0.9 + CFBundleShortVersionString + 0.9 + NSHumanReadableCopyright + Licensed under GPLv3 + NSHighResolutionCapable + + + \ No newline at end of file diff --git a/src/ARM.cpp b/src/ARM.cpp index 7eeacb7f..29110e56 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -592,7 +592,7 @@ void ARMv5::Execute() else AddCycles_C(); } - + // TODO optimize this shit!!! 
if (Halted) { @@ -651,7 +651,7 @@ void ARMv5::ExecuteJIT() return; } - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock(0, FastBlockLookup, + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock(0, FastBlockLookup, instrAddr - FastBlockLookupStart, instrAddr); if (block) ARM_Dispatch(this, block); @@ -802,7 +802,7 @@ void ARMv4::ExecuteJIT() return; } - ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock(1, FastBlockLookup, + ARMJIT::JitBlockEntry block = ARMJIT::LookUpBlock(1, FastBlockLookup, instrAddr - FastBlockLookupStart, instrAddr); if (block) ARM_Dispatch(this, block); @@ -879,4 +879,4 @@ void ARMv4::FillPipeline() NextInstr[0] = CodeRead32(R[15] - 4); NextInstr[1] = CodeRead32(R[15]); } -} \ No newline at end of file +} diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index c9d2b623..1921f132 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -176,7 +176,7 @@ T SlowRead9(u32 addr, ARMv5* cpu) } template -void SlowWrite9(u32 addr, ARMv5* cpu, T val) +void SlowWrite9(u32 addr, ARMv5* cpu, u32 val) { addr &= ~(sizeof(T) - 1); @@ -224,7 +224,7 @@ T SlowRead7(u32 addr) } template -void SlowWrite7(u32 addr, T val) +void SlowWrite7(u32 addr, u32 val) { addr &= ~(sizeof(T) - 1); @@ -266,16 +266,16 @@ void SlowBlockTransfer7(u32 addr, u64* data, u32 num) #define INSTANTIATE_SLOWMEM(consoleType) \ template void SlowWrite9(u32, ARMv5*, u32); \ - template void SlowWrite9(u32, ARMv5*, u16); \ - template void SlowWrite9(u32, ARMv5*, u8); \ + template void SlowWrite9(u32, ARMv5*, u32); \ + template void SlowWrite9(u32, ARMv5*, u32); \ \ template u32 SlowRead9(u32, ARMv5*); \ template u16 SlowRead9(u32, ARMv5*); \ template u8 SlowRead9(u32, ARMv5*); \ \ template void SlowWrite7(u32, u32); \ - template void SlowWrite7(u32, u16); \ - template void SlowWrite7(u32, u8); \ + template void SlowWrite7(u32, u32); \ + template void SlowWrite7(u32, u32); \ \ template u32 SlowRead7(u32); \ template u16 SlowRead7(u32); \ @@ -298,6 +298,7 @@ void Init() void DeInit() { + ResetBlockCache(); 
ARMJIT_Memory::DeInit(); delete JITCompiler; @@ -594,7 +595,8 @@ void CompileBlock(ARM* cpu) u32 r15 = cpu->R[15]; u32 addressRanges[Config::JIT_MaxBlockSize]; - u32 addressMasks[Config::JIT_MaxBlockSize] = {0}; + u32 addressMasks[Config::JIT_MaxBlockSize]; + memset(addressMasks, 0, Config::JIT_MaxBlockSize * sizeof(u32)); u32 numAddressRanges = 0; u32 numLiterals = 0; @@ -1116,6 +1118,7 @@ void ResetBlockCache() range->Blocks.Clear(); range->Code = 0; } + delete block; } JitBlocks9.clear(); JitBlocks7.clear(); diff --git a/src/ARMJIT_A64/ARMJIT_Compiler.cpp b/src/ARMJIT_A64/ARMJIT_Compiler.cpp index 80c7f041..5fe3fe77 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_A64/ARMJIT_Compiler.cpp @@ -1,5 +1,11 @@ +#include "ARMJIT_Compiler.h" + +#include "../ARMJIT_Internal.h" +#include "../ARMInterpreter.h" +#include "../Config.h" + #ifdef __SWITCH__ -#include "../switch/compat_switch.h" +#include extern char __start__; #else @@ -7,13 +13,7 @@ extern char __start__; #include #endif -#include "ARMJIT_Compiler.h" - -#include "../ARMJIT_Internal.h" -#include "../ARMInterpreter.h" -#include "../Config.h" - -#include +#include using namespace Arm64Gen; @@ -68,6 +68,11 @@ void Compiler::A_Comp_MRS() MOV(rd, RCPSR); } +void UpdateModeTrampoline(ARM* arm, u32 oldmode, u32 newmode) +{ + arm->UpdateMode(oldmode, newmode); +} + void Compiler::A_Comp_MSR() { Comp_AddCycles_C(); @@ -139,7 +144,7 @@ void Compiler::A_Comp_MSR() PushRegs(true); - QuickCallFunction(X3, (void*)&ARM::UpdateMode); + QuickCallFunction(X3, (void*)&UpdateModeTrampoline); PopRegs(true); } @@ -179,7 +184,7 @@ void Compiler::PopRegs(bool saveHiRegs) Compiler::Compiler() { #ifdef __SWITCH__ - JitRWBase = memalign(0x1000, JitMemSize); + JitRWBase = aligned_alloc(0x1000, JitMemSize); JitRXStart = (u8*)&__start__ - JitMemSize - 0x1000; JitRWStart = virtmemReserve(JitMemSize); @@ -915,4 +920,4 @@ void Compiler::Comp_AddCycles_CD() ConstantCycles += cycles; } -} \ No newline at end of file +} diff 
--git a/src/ARMJIT_A64/ARMJIT_Compiler.h b/src/ARMJIT_A64/ARMJIT_Compiler.h index af7497a3..a79e9daf 100644 --- a/src/ARMJIT_A64/ARMJIT_Compiler.h +++ b/src/ARMJIT_A64/ARMJIT_Compiler.h @@ -187,6 +187,7 @@ public: void Comp_RegShiftReg(int op, bool S, Op2& op2, Arm64Gen::ARM64Reg rs); bool Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr); + enum { memop_Writeback = 1 << 0, @@ -213,8 +214,8 @@ public: return (u8*)entry - GetRXBase(); } - bool IsJITFault(u64 pc); - s64 RewriteMemAccess(u64 pc); + bool IsJITFault(u8* pc); + u8* RewriteMemAccess(u8* pc); void SwapCodeRegion() { diff --git a/src/ARMJIT_A64/ARMJIT_Linkage.s b/src/ARMJIT_A64/ARMJIT_Linkage.S similarity index 100% rename from src/ARMJIT_A64/ARMJIT_Linkage.s rename to src/ARMJIT_A64/ARMJIT_Linkage.S diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp index 86e257a3..2c14dc6a 100644 --- a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp @@ -9,37 +9,34 @@ using namespace Arm64Gen; namespace ARMJIT { -bool Compiler::IsJITFault(u64 pc) +bool Compiler::IsJITFault(u8* pc) { - return pc >= (u64)GetRXBase() && pc - (u64)GetRXBase() < (JitMemMainSize + JitMemSecondarySize); + return (u64)pc >= (u64)GetRXBase() && (u64)pc - (u64)GetRXBase() < (JitMemMainSize + JitMemSecondarySize); } -s64 Compiler::RewriteMemAccess(u64 pc) +u8* Compiler::RewriteMemAccess(u8* pc) { - ptrdiff_t pcOffset = pc - (u64)GetRXBase(); + ptrdiff_t pcOffset = pc - GetRXBase(); auto it = LoadStorePatches.find(pcOffset); if (it != LoadStorePatches.end()) { LoadStorePatch patch = it->second; + LoadStorePatches.erase(it); ptrdiff_t curCodeOffset = GetCodeOffset(); SetCodePtrUnsafe(pcOffset + patch.PatchOffset); BL(patch.PatchFunc); - for (int i = 0; i < patch.PatchSize / 4 - 1; i++) HINT(HINT_NOP); - FlushIcacheSection((u8*)pc + patch.PatchOffset, (u8*)GetRXPtr()); SetCodePtrUnsafe(curCodeOffset); - LoadStorePatches.erase(it); - - return patch.PatchOffset; + return pc + 
(ptrdiff_t)patch.PatchOffset; } printf("this is a JIT bug! %08x\n", __builtin_bswap32(*(u32*)pc)); abort(); @@ -192,7 +189,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, Op2 offset, int size, int flags) else { LDRGeneric(size, flags & memop_SignExtend, rdMapped, size > 8 ? X1 : X0, X7); - if (size == 32) + if (size == 32 && !addrIsStatic) { UBFIZ(W0, W0, 3, 2); RORV(rdMapped, rdMapped, W0); diff --git a/src/ARMJIT_Internal.h b/src/ARMJIT_Internal.h index 42444701..b1e35f58 100644 --- a/src/ARMJIT_Internal.h +++ b/src/ARMJIT_Internal.h @@ -216,9 +216,9 @@ template void LinkBlock(ARM* cpu, u32 codeOffset); template T SlowRead9(u32 addr, ARMv5* cpu); -template void SlowWrite9(u32 addr, ARMv5* cpu, T val); +template void SlowWrite9(u32 addr, ARMv5* cpu, u32 val); template T SlowRead7(u32 addr); -template void SlowWrite7(u32 addr, T val); +template void SlowWrite7(u32 addr, u32 val); template void SlowBlockTransfer9(u32 addr, u64* data, u32 num, ARMv5* cpu); template void SlowBlockTransfer7(u32 addr, u64* data, u32 num); diff --git a/src/ARMJIT_Memory.cpp b/src/ARMJIT_Memory.cpp index d321d2f1..33d6bcfe 100644 --- a/src/ARMJIT_Memory.cpp +++ b/src/ARMJIT_Memory.cpp @@ -1,5 +1,6 @@ #if defined(__SWITCH__) -#include "switch/compat_switch.h" +#include +#include "frontend/switch/FaultHandler.h" #elif defined(_WIN32) #include #else @@ -10,6 +11,12 @@ #include #endif +#if defined(__ANDROID__) +#include +#include +#include +#endif + #include "ARMJIT_Memory.h" #include "ARMJIT_Internal.h" @@ -22,7 +29,7 @@ #include "NDSCart.h" #include "SPU.h" -#include +#include /* We're handling fastmem here. @@ -40,7 +47,8 @@ We handle this by only mapping those regions which are actually used and by praying the games don't go wild. - Beware, this file is full of platform specific code. + Beware, this file is full of platform specific code and copied + from Dolphin, so enjoy the copied comments! 
*/ @@ -49,12 +57,16 @@ namespace ARMJIT_Memory struct FaultDescription { u32 EmulatedFaultAddr; - u64 FaultPC; + u8* FaultPC; }; -bool FaultHandler(FaultDescription* faultDesc, s32& offset); +bool FaultHandler(FaultDescription& faultDesc); } +#if defined(__ANDROID__) +#define ASHMEM_DEVICE "/dev/ashmem" +#endif + #if defined(__SWITCH__) // with LTO the symbols seem to be not properly overriden // if they're somewhere else @@ -75,7 +87,7 @@ void __libnx_exception_handler(ThreadExceptionDump* ctx) ARMJIT_Memory::FaultDescription desc; u8* curArea = (u8*)(NDS::CurCPU == 0 ? ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start); desc.EmulatedFaultAddr = (u8*)ctx->far.x - curArea; - desc.FaultPC = ctx->pc.x; + desc.FaultPC = (u8*)ctx->pc.x; u64 integerRegisters[33]; memcpy(integerRegisters, &ctx->cpu_gprs[0].x, 8*29); @@ -84,23 +96,14 @@ void __libnx_exception_handler(ThreadExceptionDump* ctx) integerRegisters[31] = ctx->sp.x; integerRegisters[32] = ctx->pc.x; - s32 offset; - if (ARMJIT_Memory::FaultHandler(&desc, offset)) + if (ARMJIT_Memory::FaultHandler(desc)) { - integerRegisters[32] += offset; + integerRegisters[32] = (u64)desc.FaultPC; ARM_RestoreContext(integerRegisters); } - if (ctx->pc.x >= (u64)&__start__ && ctx->pc.x < (u64)&__rodata_start) - { - printf("unintentional fault in .text at 0x%x (type %d) (trying to access 0x%x?)\n", - ctx->pc.x - (u64)&__start__, ctx->error_desc, ctx->far.x); - } - else - { - printf("unintentional fault somewhere in deep (address) space at %x (type %d)\n", ctx->pc.x, ctx->error_desc); - } + HandleFault(ctx->pc.x, ctx->lr.x, ctx->fp.x, ctx->far.x, ctx->error_desc); } } @@ -117,12 +120,11 @@ static LONG ExceptionHandler(EXCEPTION_POINTERS* exceptionInfo) ARMJIT_Memory::FaultDescription desc; u8* curArea = (u8*)(NDS::CurCPU == 0 ? 
ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start); desc.EmulatedFaultAddr = (u8*)exceptionInfo->ExceptionRecord->ExceptionInformation[1] - curArea; - desc.FaultPC = exceptionInfo->ContextRecord->Rip; + desc.FaultPC = (u8*)exceptionInfo->ContextRecord->Rip; - s32 offset = 0; - if (ARMJIT_Memory::FaultHandler(&desc, offset)) + if (ARMJIT_Memory::FaultHandler(desc)) { - exceptionInfo->ContextRecord->Rip += offset; + exceptionInfo->ContextRecord->Rip = (u64)desc.FaultPC; return EXCEPTION_CONTINUE_EXECUTION; } @@ -131,50 +133,84 @@ static LONG ExceptionHandler(EXCEPTION_POINTERS* exceptionInfo) #else -struct sigaction NewSa; -struct sigaction OldSa; +static struct sigaction OldSaSegv; +static struct sigaction OldSaBus; static void SigsegvHandler(int sig, siginfo_t* info, void* rawContext) { + if (sig != SIGSEGV && sig != SIGBUS) + { + // We are not interested in other signals - handle it as usual. + return; + } + if (info->si_code != SEGV_MAPERR && info->si_code != SEGV_ACCERR) + { + // Huh? Return. + return; + } + ucontext_t* context = (ucontext_t*)rawContext; - + ARMJIT_Memory::FaultDescription desc; u8* curArea = (u8*)(NDS::CurCPU == 0 ? 
ARMJIT_Memory::FastMem9Start : ARMJIT_Memory::FastMem7Start); #ifdef __x86_64__ desc.EmulatedFaultAddr = (u8*)info->si_addr - curArea; - desc.FaultPC = context->uc_mcontext.gregs[REG_RIP]; + #ifdef __APPLE__ + desc.FaultPC = (u8*)context->uc_mcontext->__ss.__rip; + #else + desc.FaultPC = (u8*)context->uc_mcontext.gregs[REG_RIP]; + #endif + #else - desc.EmulatedFaultAddr = (u8*)context->uc_mcontext.fault_address - curArea; - desc.FaultPC = context->uc_mcontext.pc; + #ifdef __APPLE__ + desc.EmulatedFaultAddr = (u8*)context->uc_mcontext->__es.__far - curArea; + desc.FaultPC = (u8*)context->uc_mcontext->__ss.__pc; + #else + desc.EmulatedFaultAddr = (u8*)context->uc_mcontext.fault_address - curArea; + desc.FaultPC = (u8*)context->uc_mcontext.pc; + #endif #endif - s32 offset = 0; - if (ARMJIT_Memory::FaultHandler(&desc, offset)) + if (ARMJIT_Memory::FaultHandler(desc)) { #ifdef __x86_64__ - context->uc_mcontext.gregs[REG_RIP] += offset; + #ifdef __APPLE__ + context->uc_mcontext->__ss.__rip = (u64)desc.FaultPC; + #else + context->uc_mcontext.gregs[REG_RIP] = (u64)desc.FaultPC; + #endif #else - context->uc_mcontext.pc += offset; + #ifdef __APPLE__ + context->uc_mcontext->__ss.__pc = (u64)desc.FaultPC; + #else + context->uc_mcontext.pc = (u64)desc.FaultPC; + #endif #endif return; } - if (OldSa.sa_flags & SA_SIGINFO) + struct sigaction* oldSa; + if (sig == SIGSEGV) + oldSa = &OldSaSegv; + else + oldSa = &OldSaBus; + + if (oldSa->sa_flags & SA_SIGINFO) { - OldSa.sa_sigaction(sig, info, rawContext); + oldSa->sa_sigaction(sig, info, rawContext); return; } - if (OldSa.sa_handler == SIG_DFL) + if (oldSa->sa_handler == SIG_DFL) { signal(sig, SIG_DFL); return; } - if (OldSa.sa_handler == SIG_IGN) + if (oldSa->sa_handler == SIG_IGN) { // Ignore signal return; } - OldSa.sa_handler(sig); + oldSa->sa_handler(sig); } #endif @@ -231,7 +267,7 @@ enum { memstate_Unmapped, memstate_MappedRW, - // on switch this is unmapped as well + // on Switch this is unmapped as well 
memstate_MappedProtected, }; @@ -314,14 +350,16 @@ struct Mapping void Unmap(int region) { + u32 dtcmStart = NDS::ARM9->DTCMBase; + u32 dtcmSize = NDS::ARM9->DTCMSize; bool skipDTCM = Num == 0 && region != memregion_DTCM; u8* statuses = Num == 0 ? MappingStatus9 : MappingStatus7; u32 offset = 0; while (offset < Size) { - if (skipDTCM && Addr + offset == NDS::ARM9->DTCMBase) + if (skipDTCM && Addr + offset == dtcmStart) { - offset += NDS::ARM9->DTCMSize; + offset += dtcmSize; } else { @@ -329,7 +367,7 @@ struct Mapping u8 status = statuses[(Addr + offset) >> 12]; while (statuses[(Addr + offset) >> 12] == status && offset < Size - && (!skipDTCM || Addr + offset != NDS::ARM9->DTCMBase)) + && (!skipDTCM || Addr + offset != dtcmStart)) { assert(statuses[(Addr + offset) >> 12] != memstate_Unmapped); statuses[(Addr + offset) >> 12] = memstate_Unmapped; @@ -347,9 +385,33 @@ struct Mapping #endif } } + #ifndef __SWITCH__ - bool succeded = UnmapFromRange(Addr, Num, OffsetsPerRegion[region] + LocalOffset, Size); - assert(succeded); +#ifndef _WIN32 + u32 dtcmEnd = dtcmStart + dtcmSize; + if (Num == 0 + && dtcmEnd >= Addr + && dtcmStart < Addr + Size) + { + bool success; + if (dtcmStart > Addr) + { + success = UnmapFromRange(Addr, 0, OffsetsPerRegion[region] + LocalOffset, dtcmStart - Addr); + assert(success); + } + if (dtcmEnd < Addr + Size) + { + u32 offset = dtcmStart - Addr + dtcmSize; + success = UnmapFromRange(dtcmEnd, 0, OffsetsPerRegion[region] + LocalOffset + offset, Size - offset); + assert(success); + } + } + else +#endif + { + bool succeded = UnmapFromRange(Addr, Num, OffsetsPerRegion[region] + LocalOffset, Size); + assert(succeded); + } #endif } }; @@ -418,10 +480,10 @@ void RemapDTCM(u32 newBase, u32 newSize) printf("unmapping %d %x %x %x %x\n", region, mapping.Addr, mapping.Size, mapping.Num, mapping.LocalOffset); - bool oldOverlap = NDS::ARM9->DTCMSize > 0 && !(oldDTCMBase >= end || oldDTCBEnd <= start); - bool newOverlap = newSize > 0 && !(newBase >= end || 
newEnd <= start); + bool overlap = (NDS::ARM9->DTCMSize > 0 && oldDTCMBase < end && oldDTCBEnd > start) + || (newSize > 0 && newBase < end && newEnd > start); - if (mapping.Num == 0 && (oldOverlap || newOverlap)) + if (mapping.Num == 0 && overlap) { mapping.Unmap(region); Mappings[region].Remove(i); @@ -445,8 +507,8 @@ void RemapNWRAM(int num) for (int i = 0; i < Mappings[memregion_SharedWRAM].Length;) { Mapping& mapping = Mappings[memregion_SharedWRAM][i]; - if (!(DSi::NWRAMStart[mapping.Num][num] >= mapping.Addr + mapping.Size - || DSi::NWRAMEnd[mapping.Num][num] < mapping.Addr)) + if (DSi::NWRAMStart[mapping.Num][num] < mapping.Addr + mapping.Size + && DSi::NWRAMEnd[mapping.Num][num] > mapping.Addr) { mapping.Unmap(memregion_SharedWRAM); Mappings[memregion_SharedWRAM].Remove(i); @@ -469,7 +531,7 @@ void RemapSWRAM() for (int i = 0; i < Mappings[memregion_WRAM7].Length;) { Mapping& mapping = Mappings[memregion_WRAM7][i]; - if (mapping.Addr + mapping.Size < 0x03800000) + if (mapping.Addr + mapping.Size <= 0x03800000) { mapping.Unmap(memregion_WRAM7); Mappings[memregion_WRAM7].Remove(i); @@ -501,26 +563,53 @@ bool MapAtAddress(u32 addr) return false; u8* states = num == 0 ? 
MappingStatus9 : MappingStatus7; - printf("trying to create mapping %x, %x %x %d %d\n", mirrorStart, mirrorSize, memoryOffset, region, num); + printf("mapping mirror %x, %x %x %d %d\n", mirrorStart, mirrorSize, memoryOffset, region, num); bool isExecutable = ARMJIT::CodeMemRegions[region]; + u32 dtcmStart = NDS::ARM9->DTCMBase; + u32 dtcmSize = NDS::ARM9->DTCMSize; + u32 dtcmEnd = dtcmStart + dtcmSize; #ifndef __SWITCH__ - bool succeded = MapIntoRange(mirrorStart, num, OffsetsPerRegion[region] + memoryOffset, mirrorSize); - assert(succeded); +#ifndef _WIN32 + if (num == 0 + && dtcmEnd >= mirrorStart + && dtcmStart < mirrorStart + mirrorSize) + { + bool success; + if (dtcmStart > mirrorStart) + { + success = MapIntoRange(mirrorStart, 0, OffsetsPerRegion[region] + memoryOffset, dtcmStart - mirrorStart); + assert(success); + } + if (dtcmEnd < mirrorStart + mirrorSize) + { + u32 offset = dtcmStart - mirrorStart + dtcmSize; + success = MapIntoRange(dtcmEnd, 0, OffsetsPerRegion[region] + memoryOffset + offset, mirrorSize - offset); + assert(success); + } + } + else +#endif + { + bool succeded = MapIntoRange(mirrorStart, num, OffsetsPerRegion[region] + memoryOffset, mirrorSize); + assert(succeded); + } #endif ARMJIT::AddressRange* range = ARMJIT::CodeMemRegions[region] + memoryOffset / 512; // this overcomplicated piece of code basically just finds whole pieces of code memory - // which can be mapped + // which can be mapped/protected u32 offset = 0; bool skipDTCM = num == 0 && region != memregion_DTCM; while (offset < mirrorSize) { - if (skipDTCM && mirrorStart + offset == NDS::ARM9->DTCMBase) + if (skipDTCM && mirrorStart + offset == dtcmStart) { - SetCodeProtectionRange(NDS::ARM9->DTCMBase, NDS::ARM9->DTCMSize, 0, 0); - offset += NDS::ARM9->DTCMSize; +#ifdef _WIN32 + SetCodeProtectionRange(dtcmStart, dtcmSize, 0, 0); +#endif + offset += dtcmSize; } else { @@ -557,37 +646,36 @@ bool MapAtAddress(u32 addr) Mapping mapping{mirrorStart, mirrorSize, memoryOffset, num}; 
Mappings[region].Add(mapping); - printf("mapped mirror at %08x-%08x\n", mirrorStart, mirrorStart + mirrorSize - 1); + //printf("mapped mirror at %08x-%08x\n", mirrorStart, mirrorStart + mirrorSize - 1); return true; } -bool FaultHandler(FaultDescription* faultDesc, s32& offset) +bool FaultHandler(FaultDescription& faultDesc) { - if (ARMJIT::JITCompiler->IsJITFault(faultDesc->FaultPC)) + if (ARMJIT::JITCompiler->IsJITFault(faultDesc.FaultPC)) { bool rewriteToSlowPath = true; - u32 addr = faultDesc->EmulatedFaultAddr; + u8* memStatus = NDS::CurCPU == 0 ? MappingStatus9 : MappingStatus7; - if ((NDS::CurCPU == 0 ? MappingStatus9 : MappingStatus7)[addr >> 12] == memstate_Unmapped) - rewriteToSlowPath = !MapAtAddress(faultDesc->EmulatedFaultAddr); + if (memStatus[faultDesc.EmulatedFaultAddr >> 12] == memstate_Unmapped) + rewriteToSlowPath = !MapAtAddress(faultDesc.EmulatedFaultAddr); if (rewriteToSlowPath) - { - offset = ARMJIT::JITCompiler->RewriteMemAccess(faultDesc->FaultPC); - } + faultDesc.FaultPC = ARMJIT::JITCompiler->RewriteMemAccess(faultDesc.FaultPC); + return true; } return false; } +const u64 AddrSpaceSize = 0x100000000; + void Init() { - const u64 AddrSpaceSize = 0x100000000; - #if defined(__SWITCH__) - MemoryBase = (u8*)memalign(0x1000, MemoryTotalSize); + MemoryBase = (u8*)aligned_alloc(0x1000, MemoryTotalSize); MemoryBaseCodeMem = (u8*)virtmemReserve(MemoryTotalSize); bool succeded = R_SUCCEEDED(svcMapProcessCodeMemory(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, @@ -624,22 +712,52 @@ void Init() u8* basePtr = MemoryBase; #else - FastMem9Start = mmap(NULL, AddrSpaceSize, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0); - FastMem7Start = mmap(NULL, AddrSpaceSize, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0); + // this used to be allocated with three different mmaps + // The idea was to give the OS more freedom where to position the buffers, + // but something was bad about this so instead we take this vmem eating monster + // which seems to work better. 
+ MemoryBase = (u8*)mmap(NULL, AddrSpaceSize*4, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0); + munmap(MemoryBase, AddrSpaceSize*4); + FastMem9Start = MemoryBase; + FastMem7Start = MemoryBase + AddrSpaceSize; + MemoryBase = MemoryBase + AddrSpaceSize*2; - MemoryBase = (u8*)mmap(NULL, MemoryTotalSize, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0); +#if defined(__ANDROID__) + static void* libandroid = dlopen("libandroid.so", RTLD_LAZY | RTLD_LOCAL); + using type_ASharedMemory_create = int(*)(const char* name, size_t size); + static void* symbol = dlsym(libandroid, "ASharedMemory_create"); + static auto shared_memory_create = reinterpret_cast(symbol); + if (shared_memory_create) + { + MemoryFile = shared_memory_create("melondsfastmem", MemoryTotalSize); + } + else + { + int fd = open(ASHMEM_DEVICE, O_RDWR); + ioctl(fd, ASHMEM_SET_NAME, "melondsfastmem"); + ioctl(fd, ASHMEM_SET_SIZE, MemoryTotalSize); + MemoryFile = fd; + } +#elif defined(__APPLE__) + char* fastmemPidName = new char[snprintf(NULL, 0, "melondsfastmem%d", getpid()) + 1]; + sprintf(fastmemPidName, "melondsfastmem%d", getpid()); + MemoryFile = shm_open(fastmemPidName, O_RDWR|O_CREAT, 0600); + delete[] fastmemPidName; +#else MemoryFile = memfd_create("melondsfastmem", 0); +#endif ftruncate(MemoryFile, MemoryTotalSize); - NewSa.sa_flags = SA_SIGINFO; - sigemptyset(&NewSa.sa_mask); - NewSa.sa_sigaction = SigsegvHandler; - sigaction(SIGSEGV, &NewSa, &OldSa); - - munmap(MemoryBase, MemoryTotalSize); - munmap(FastMem9Start, AddrSpaceSize); - munmap(FastMem7Start, AddrSpaceSize); + struct sigaction sa; + sa.sa_handler = nullptr; + sa.sa_sigaction = &SigsegvHandler; + sa.sa_flags = SA_SIGINFO; + sigemptyset(&sa.sa_mask); + sigaction(SIGSEGV, &sa, &OldSaSegv); +#ifdef __APPLE__ + sigaction(SIGBUS, &sa, &OldSaBus); +#endif mmap(MemoryBase, MemoryTotalSize, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, MemoryFile, 0); @@ -657,17 +775,30 @@ void Init() void DeInit() { #if defined(__SWITCH__) - virtmemFree(FastMem9Start, 
0x100000000); - virtmemFree(FastMem7Start, 0x100000000); + virtmemFree(FastMem9Start, AddrSpaceSize); + virtmemFree(FastMem7Start, AddrSpaceSize); svcUnmapProcessCodeMemory(envGetOwnProcessHandle(), (u64)MemoryBaseCodeMem, (u64)MemoryBase, MemoryTotalSize); virtmemFree(MemoryBaseCodeMem, MemoryTotalSize); free(MemoryBase); +#elif defined(__APPLE__) + char* fastmemPidName = new char[snprintf(NULL, 0, "melondsfastmem%d", getpid()) + 1]; + sprintf(fastmemPidName, "melondsfastmem%d", getpid()); + shm_unlink(fastmemPidName); + delete[] fastmemPidName; #elif defined(_WIN32) assert(UnmapViewOfFile(MemoryBase)); CloseHandle(MemoryFile); RemoveVectoredExceptionHandler(ExceptionHandlerHandle); +#else + sigaction(SIGSEGV, &OldSaSegv, nullptr); +#ifdef __APPLE__ + sigaction(SIGBUS, &OldSaBus, nullptr); +#endif + + munmap(MemoryBase, MemoryTotalSize); + close(MemoryFile); #endif } @@ -997,9 +1128,11 @@ int ClassifyAddress7(u32 addr) case 0x06000000: case 0x06800000: return memregion_VWRAM; + + default: + return memregion_Other; } } - return memregion_Other; } void WifiWrite32(u32 addr, u32 val) @@ -1176,4 +1309,4 @@ void* GetFuncForAddr(ARM* cpu, u32 addr, bool store, int size) return NULL; } -} \ No newline at end of file +} diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index 819fe3cd..70ec781c 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -130,6 +130,16 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), Imm8(cycles)); } +void ARMv4JumpToTrampoline(ARMv4* arm, u32 addr, bool restorecpsr) +{ + arm->JumpTo(addr, restorecpsr); +} + +void ARMv5JumpToTrampoline(ARMv5* arm, u32 addr, bool restorecpsr) +{ + arm->JumpTo(addr, restorecpsr); +} + void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) { IrregularCycles = true; @@ -146,9 +156,9 @@ void Compiler::Comp_JumpTo(Gen::X64Reg addr, bool restoreCPSR) else MOV(32, R(ABI_PARAM3), 
Imm32(true)); // what a waste if (Num == 0) - CALL((void*)&ARMv5::JumpTo); + CALL((void*)&ARMv5JumpToTrampoline); else - CALL((void*)&ARMv4::JumpTo); + CALL((void*)&ARMv4JumpToTrampoline); PopRegs(restoreCPSR); @@ -269,4 +279,4 @@ void Compiler::T_Comp_BL_Merged() Comp_JumpTo(target); } -} \ No newline at end of file +} diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.cpp b/src/ARMJIT_x64/ARMJIT_Compiler.cpp index c6419c90..cc4ad800 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.cpp +++ b/src/ARMJIT_x64/ARMJIT_Compiler.cpp @@ -101,6 +101,11 @@ void Compiler::A_Comp_MRS() MOV(32, rd, R(RCPSR)); } +void UpdateModeTrampoline(ARM* arm, u32 oldmode, u32 newmode) +{ + arm->UpdateMode(oldmode, newmode); +} + void Compiler::A_Comp_MSR() { Comp_AddCycles_C(); @@ -185,7 +190,7 @@ void Compiler::A_Comp_MSR() MOV(32, R(ABI_PARAM3), R(RCPSR)); MOV(32, R(ABI_PARAM2), R(RSCRATCH3)); MOV(64, R(ABI_PARAM1), R(RCPU)); - CALL((void*)&ARM::UpdateMode); + CALL((void*)&UpdateModeTrampoline); PopRegs(true); } @@ -216,6 +221,8 @@ Compiler::Compiler() #ifdef _WIN32 DWORD dummy; VirtualProtect(pageAligned, alignedSize, PAGE_EXECUTE_READWRITE, &dummy); + #elif defined(__APPLE__) + pageAligned = (u8*)mmap(NULL, 1024*1024*32, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS ,-1, 0); #else mprotect(pageAligned, alignedSize, PROT_EXEC | PROT_READ | PROT_WRITE); #endif @@ -340,7 +347,7 @@ Compiler::Compiler() ABI_PushRegistersAndAdjustStack(CallerSavedPushRegs, 8); if (consoleType == 0) { - switch ((8 << size) | num) + switch ((8 << size) | num) { case 32: ABI_CallFunction(SlowWrite9); break; case 33: ABI_CallFunction(SlowWrite7); break; @@ -352,7 +359,7 @@ Compiler::Compiler() } else { - switch ((8 << size) | num) + switch ((8 << size) | num) { case 32: ABI_CallFunction(SlowWrite9); break; case 33: ABI_CallFunction(SlowWrite7); break; @@ -375,7 +382,7 @@ Compiler::Compiler() ABI_PushRegistersAndAdjustStack(CallerSavedPushRegs, 8); if (consoleType == 0) { - switch ((8 << size) | num) + 
switch ((8 << size) | num) { case 32: ABI_CallFunction(SlowRead9); break; case 33: ABI_CallFunction(SlowRead7); break; @@ -387,7 +394,7 @@ Compiler::Compiler() } else { - switch ((8 << size) | num) + switch ((8 << size) | num) { case 32: ABI_CallFunction(SlowRead9); break; case 33: ABI_CallFunction(SlowRead7); break; @@ -612,9 +619,9 @@ void Compiler::Reset() LoadStorePatches.clear(); } -bool Compiler::IsJITFault(u64 addr) +bool Compiler::IsJITFault(u8* addr) { - return addr >= (u64)CodeMemory && addr < (u64)CodeMemory + sizeof(CodeMemory); + return (u64)addr >= (u64)ResetStart && (u64)addr < (u64)ResetStart + CodeMemSize; } void Compiler::Comp_SpecialBranchBehaviour(bool taken) @@ -896,5 +903,4 @@ void Compiler::Comp_AddCycles_CD() else ConstantCycles += cycles; } - -} \ No newline at end of file +} diff --git a/src/ARMJIT_x64/ARMJIT_Compiler.h b/src/ARMJIT_x64/ARMJIT_Compiler.h index 3e900c33..57aab7b5 100644 --- a/src/ARMJIT_x64/ARMJIT_Compiler.h +++ b/src/ARMJIT_x64/ARMJIT_Compiler.h @@ -208,9 +208,9 @@ public: SetCodePtr(FarCode); } - bool IsJITFault(u64 addr); + bool IsJITFault(u8* addr); - s32 RewriteMemAccess(u64 pc); + u8* RewriteMemAccess(u8* pc); u8* FarCode; u8* NearCode; diff --git a/src/ARMJIT_x64/ARMJIT_Linkage.s b/src/ARMJIT_x64/ARMJIT_Linkage.S similarity index 89% rename from src/ARMJIT_x64/ARMJIT_Linkage.s rename to src/ARMJIT_x64/ARMJIT_Linkage.S index 0a84df07..8cc0b5f9 100644 --- a/src/ARMJIT_x64/ARMJIT_Linkage.s +++ b/src/ARMJIT_x64/ARMJIT_Linkage.S @@ -29,8 +29,13 @@ .p2align 4,,15 +#ifdef __APPLE__ +.global _ARM_Dispatch +_ARM_Dispatch: +#else .global ARM_Dispatch ARM_Dispatch: +#endif #ifdef WIN64 push rdi push rsi @@ -54,8 +59,13 @@ ARM_Dispatch: .p2align 4,,15 +#ifdef __APPLE__ +.global _ARM_Ret +_ARM_Ret: +#else .global ARM_Ret ARM_Ret: +#endif mov [RCPU + ARM_CPSR_offset], RCPSR #ifdef WIN64 diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 8b4e8fe9..d80b25b5 100644 --- 
a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -15,28 +15,24 @@ int squeezePointer(T* ptr) return truncated; } -s32 Compiler::RewriteMemAccess(u64 pc) +u8* Compiler::RewriteMemAccess(u8* pc) { - auto it = LoadStorePatches.find((u8*)pc); + auto it = LoadStorePatches.find(pc); if (it != LoadStorePatches.end()) { LoadStorePatch patch = it->second; LoadStorePatches.erase(it); - u8* curCodePtr = GetWritableCodePtr(); - u8* rewritePtr = (u8*)pc + (ptrdiff_t)patch.Offset; - SetCodePtr(rewritePtr); + //printf("rewriting memory access %p %d %d\n", (u8*)pc-ResetStart, patch.Offset, patch.Size); - CALL(patch.PatchFunc); - u32 remainingSize = patch.Size - (GetWritableCodePtr() - rewritePtr); + XEmitter emitter(pc + (ptrdiff_t)patch.Offset); + emitter.CALL(patch.PatchFunc); + ptrdiff_t remainingSize = (ptrdiff_t)patch.Size - 5; + assert(remainingSize >= 0); if (remainingSize > 0) - NOP(remainingSize); + emitter.NOP(remainingSize); - //printf("rewriting memory access %p %d %d\n", patch.PatchFunc, patch.Offset, patch.Size); - - SetCodePtr(curCodePtr); - - return patch.Offset; + return pc + (ptrdiff_t)patch.Offset; } printf("this is a JIT bug %llx\n", pc); @@ -192,6 +188,7 @@ void Compiler::Comp_MemAccess(int rd, int rn, const Op2& op2, int size, int flag u8* memopStart = GetWritableCodePtr(); LoadStorePatch patch; + assert(rdMapped.GetSimpleReg() >= 0 && rdMapped.GetSimpleReg() < 16); patch.PatchFunc = flags & memop_Store ? 
PatchedStoreFuncs[NDS::ConsoleType][Num][__builtin_ctz(size) - 3][rdMapped.GetSimpleReg()] : PatchedLoadFuncs[NDS::ConsoleType][Num][__builtin_ctz(size) - 3][!!(flags & memop_SignExtend)][rdMapped.GetSimpleReg()]; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d6c38971..9f07cea6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -27,6 +27,7 @@ add_library(core STATIC GBACart.cpp GPU.cpp GPU2D.cpp + GPU2D_Soft.cpp GPU3D.cpp GPU3D_Soft.cpp melonDLDI.h @@ -80,9 +81,8 @@ if (ENABLE_JIT) ARMJIT_x64/ARMJIT_LoadStore.cpp ARMJIT_x64/ARMJIT_Branch.cpp - ARMJIT_x64/ARMJIT_Linkage.s + ARMJIT_x64/ARMJIT_Linkage.S ) - set_source_files_properties(ARMJIT_x64/ARMJIT_Linkage.s PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp") endif() if (ARCHITECTURE STREQUAL ARM64) target_sources(core PRIVATE @@ -94,16 +94,22 @@ if (ENABLE_JIT) ARMJIT_A64/ARMJIT_LoadStore.cpp ARMJIT_A64/ARMJIT_Branch.cpp - ARMJIT_A64/ARMJIT_Linkage.s + ARMJIT_A64/ARMJIT_Linkage.S ) - set_source_files_properties(ARMJIT_A64/ARMJIT_Linkage.s PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp") endif() endif() +if (APPLE) + target_include_directories(core PUBLIC /usr/local/include) + target_link_directories(core PUBLIC /usr/local/lib) +endif() + if (ENABLE_OGLRENDERER) if (WIN32) target_link_libraries(core ole32 comctl32 ws2_32 opengl32) - else() + elseif (APPLE) + target_link_libraries(core "-framework OpenGL") + else() target_link_libraries(core GL EGL) endif() else() diff --git a/src/Config.cpp b/src/Config.cpp index 341b14c3..f7db2528 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -73,7 +73,11 @@ ConfigEntry ConfigFile[] = {"JIT_MaxBlockSize", 0, &JIT_MaxBlockSize, 32, NULL, 0}, {"JIT_BranchOptimisations", 0, &JIT_BranchOptimisations, 1, NULL, 0}, {"JIT_LiteralOptimisations", 0, &JIT_LiteralOptimisations, 1, NULL, 0}, - {"JIT_FastMemory", 0, &JIT_FastMemory, 1, NULL, 0}, + #ifdef __APPLE__ + {"JIT_FastMemory", 0, &JIT_FastMemory, 0, NULL, 0}, + #else + {"JIT_FastMemory", 0, &JIT_FastMemory, 
1, NULL, 0}, + #endif #endif {"", -1, NULL, 0, NULL, 0} diff --git a/src/DMA.cpp b/src/DMA.cpp index 18b8a2f1..cd3465f6 100644 --- a/src/DMA.cpp +++ b/src/DMA.cpp @@ -77,21 +77,6 @@ void DMA::Reset() Running = false; InProgress = false; - - if (NDS::ConsoleType == 1) - { - BusRead16 = (CPU==0) ? DSi::ARM9Read16 : DSi::ARM7Read16; - BusRead32 = (CPU==0) ? DSi::ARM9Read32 : DSi::ARM7Read32; - BusWrite16 = (CPU==0) ? DSi::ARM9Write16 : DSi::ARM7Write16; - BusWrite32 = (CPU==0) ? DSi::ARM9Write32 : DSi::ARM7Write32; - } - else - { - BusRead16 = (CPU==0) ? NDS::ARM9Read16 : NDS::ARM7Read16; - BusRead32 = (CPU==0) ? NDS::ARM9Read32 : NDS::ARM7Read32; - BusWrite16 = (CPU==0) ? NDS::ARM9Write16 : NDS::ARM7Write16; - BusWrite32 = (CPU==0) ? NDS::ARM9Write32 : NDS::ARM7Write32; - } } void DMA::DoSavestate(Savestate* file) @@ -198,13 +183,7 @@ void DMA::Start() NDS::StopCPU(CPU, 1< void DMA::Run9() { if (NDS::ARM9Timestamp >= NDS::ARM9Target) return; @@ -242,7 +221,10 @@ void DMA::Run9() { NDS::ARM9Timestamp += (unitcycles << NDS::ARM9ClockShift); - BusWrite16(CurDstAddr, BusRead16(CurSrcAddr)); + if (ConsoleType == 1) + DSi::ARM9Write16(CurDstAddr, DSi::ARM9Read16(CurSrcAddr)); + else + NDS::ARM9Write16(CurDstAddr, NDS::ARM9Read16(CurSrcAddr)); CurSrcAddr += SrcAddrInc<<1; CurDstAddr += DstAddrInc<<1; @@ -278,7 +260,10 @@ void DMA::Run9() { NDS::ARM9Timestamp += (unitcycles << NDS::ARM9ClockShift); - BusWrite32(CurDstAddr, BusRead32(CurSrcAddr)); + if (ConsoleType == 1) + DSi::ARM9Write32(CurDstAddr, DSi::ARM9Read32(CurSrcAddr)); + else + NDS::ARM9Write32(CurDstAddr, NDS::ARM9Read32(CurSrcAddr)); CurSrcAddr += SrcAddrInc<<2; CurDstAddr += DstAddrInc<<2; @@ -317,6 +302,7 @@ void DMA::Run9() NDS::ResumeCPU(0, 1< void DMA::Run7() { if (NDS::ARM7Timestamp >= NDS::ARM7Target) return; @@ -354,7 +340,10 @@ void DMA::Run7() { NDS::ARM7Timestamp += unitcycles; - BusWrite16(CurDstAddr, BusRead16(CurSrcAddr)); + if (ConsoleType == 1) + DSi::ARM7Write16(CurDstAddr, 
DSi::ARM7Read16(CurSrcAddr)); + else + NDS::ARM7Write16(CurDstAddr, NDS::ARM7Read16(CurSrcAddr)); CurSrcAddr += SrcAddrInc<<1; CurDstAddr += DstAddrInc<<1; @@ -390,7 +379,10 @@ void DMA::Run7() { NDS::ARM7Timestamp += unitcycles; - BusWrite32(CurDstAddr, BusRead32(CurSrcAddr)); + if (ConsoleType == 1) + DSi::ARM7Write32(CurDstAddr, DSi::ARM7Read32(CurSrcAddr)); + else + NDS::ARM7Write32(CurDstAddr, NDS::ARM7Read32(CurSrcAddr)); CurSrcAddr += SrcAddrInc<<2; CurDstAddr += DstAddrInc<<2; @@ -425,3 +417,14 @@ void DMA::Run7() InProgress = false; NDS::ResumeCPU(1, 1< +void DMA::Run() +{ + if (!Running) return; + if (CPU == 0) return Run9(); + else return Run7(); +} + +template void DMA::Run<0>(); +template void DMA::Run<1>(); diff --git a/src/DMA.h b/src/DMA.h index 0344fbac..b0b4ab2a 100644 --- a/src/DMA.h +++ b/src/DMA.h @@ -34,9 +34,12 @@ public: void WriteCnt(u32 val); void Start(); + template void Run(); + template void Run9(); + template void Run7(); bool IsInMode(u32 mode) @@ -86,11 +89,6 @@ private: bool Stall; bool IsGXFIFODMA; - - u16 (*BusRead16)(u32 addr); - u32 (*BusRead32)(u32 addr); - void (*BusWrite16)(u32 addr, u16 val); - void (*BusWrite32)(u32 addr, u32 val); }; #endif diff --git a/src/DSi.cpp b/src/DSi.cpp index e8b12315..bcc1f925 100644 --- a/src/DSi.cpp +++ b/src/DSi.cpp @@ -35,6 +35,7 @@ #include "DSi_I2C.h" #include "DSi_SD.h" #include "DSi_AES.h" +#include "DSi_Camera.h" #include "tiny-AES-c/aes.hpp" @@ -542,15 +543,15 @@ void MapNWRAM_A(u32 num, u8 val) return; } -#ifdef JIT_ENABLED - ARMJIT_Memory::RemapNWRAM(0); -#endif - int mbkn = 0, mbks = 8*num; u8 oldval = (MBK[0][mbkn] >> mbks) & 0xFF; if (oldval == val) return; +#ifdef JIT_ENABLED + ARMJIT_Memory::RemapNWRAM(0); +#endif + MBK[0][mbkn] &= ~(0xFF << mbks); MBK[0][mbkn] |= (val << mbks); MBK[1][mbkn] = MBK[0][mbkn]; @@ -577,15 +578,15 @@ void MapNWRAM_B(u32 num, u8 val) return; } -#ifdef JIT_ENABLED - ARMJIT_Memory::RemapNWRAM(1); -#endif - int mbkn = 1+(num>>2), mbks = 8*(num&3); u8 
oldval = (MBK[0][mbkn] >> mbks) & 0xFF; if (oldval == val) return; +#ifdef JIT_ENABLED + ARMJIT_Memory::RemapNWRAM(1); +#endif + MBK[0][mbkn] &= ~(0xFF << mbks); MBK[0][mbkn] |= (val << mbks); MBK[1][mbkn] = MBK[0][mbkn]; @@ -616,15 +617,15 @@ void MapNWRAM_C(u32 num, u8 val) return; } -#ifdef JIT_ENABLED - ARMJIT_Memory::RemapNWRAM(2); -#endif - int mbkn = 3+(num>>2), mbks = 8*(num&3); u8 oldval = (MBK[0][mbkn] >> mbks) & 0xFF; if (oldval == val) return; +#ifdef JIT_ENABLED + ARMJIT_Memory::RemapNWRAM(2); +#endif + MBK[0][mbkn] &= ~(0xFF << mbks); MBK[0][mbkn] |= (val << mbks); MBK[1][mbkn] = MBK[0][mbkn]; @@ -1406,6 +1407,12 @@ u8 ARM9IORead8(u32 addr) CASE_READ8_32BIT(0x04004060, MBK[0][8]) } + if ((addr & 0xFFFFFF00) == 0x04004200) + { + if (!(SCFG_EXT[0] & (1<<17))) return 0; + return DSi_Camera::Read8(addr); + } + return NDS::ARM9IORead8(addr); } @@ -1428,6 +1435,12 @@ u16 ARM9IORead16(u32 addr) CASE_READ16_32BIT(0x04004060, MBK[0][8]) } + if ((addr & 0xFFFFFF00) == 0x04004200) + { + if (!(SCFG_EXT[0] & (1<<17))) return 0; + return DSi_Camera::Read16(addr); + } + return NDS::ARM9IORead16(addr); } @@ -1480,6 +1493,12 @@ u32 ARM9IORead32(u32 addr) case 0x04004170: return NDMAs[3]->Cnt; } + if ((addr & 0xFFFFFF00) == 0x04004200) + { + if (!(SCFG_EXT[0] & (1<<17))) return 0; + return DSi_Camera::Read32(addr); + } + return NDS::ARM9IORead32(addr); } @@ -1519,6 +1538,12 @@ void ARM9IOWrite8(u32 addr, u8 val) case 0x04004053: MapNWRAM_C(7, val); return; } + if ((addr & 0xFFFFFF00) == 0x04004200) + { + if (!(SCFG_EXT[0] & (1<<17))) return; + return DSi_Camera::Write8(addr, val); + } + return NDS::ARM9IOWrite8(addr, val); } @@ -1572,6 +1597,12 @@ void ARM9IOWrite16(u32 addr, u16 val) return; } + if ((addr & 0xFFFFFF00) == 0x04004200) + { + if (!(SCFG_EXT[0] & (1<<17))) return; + return DSi_Camera::Write16(addr, val); + } + return NDS::ARM9IOWrite16(addr, val); } @@ -1678,6 +1709,12 @@ void ARM9IOWrite32(u32 addr, u32 val) case 0x04004170: NDMAs[3]->WriteCnt(val); 
return; } + if ((addr & 0xFFFFFF00) == 0x04004200) + { + if (!(SCFG_EXT[0] & (1<<17))) return; + return DSi_Camera::Write32(addr, val); + } + return NDS::ARM9IOWrite32(addr, val); } diff --git a/src/DSi_Camera.cpp b/src/DSi_Camera.cpp index 56cba1cb..79cfe3fd 100644 --- a/src/DSi_Camera.cpp +++ b/src/DSi_Camera.cpp @@ -18,12 +18,28 @@ #include #include +#include "DSi.h" #include "DSi_Camera.h" DSi_Camera* DSi_Camera0; // 78 / facing outside DSi_Camera* DSi_Camera1; // 7A / selfie cam +u16 DSi_Camera::ModuleCnt; +u16 DSi_Camera::Cnt; + +u8 DSi_Camera::FrameBuffer[640*480*4]; +u32 DSi_Camera::FrameLength; +u32 DSi_Camera::TransferPos; + +// note on camera data/etc intervals +// on hardware those are likely affected by several factors +// namely, how long cameras take to process frames +// camera IRQ is fired at roughly 15FPS with default config + +const u32 kIRQInterval = 1120000; // ~30 FPS +const u32 kTransferStart = 60000; + bool DSi_Camera::Init() { @@ -43,6 +59,87 @@ void DSi_Camera::Reset() { DSi_Camera0->ResetCam(); DSi_Camera1->ResetCam(); + + ModuleCnt = 0; // CHECKME + Cnt = 0; + + memset(FrameBuffer, 0, 640*480*4); + TransferPos = 0; + FrameLength = 256*192*2; // TODO: make it check frame size, data type, etc + + NDS::ScheduleEvent(NDS::Event_DSi_CamIRQ, true, kIRQInterval, IRQ, 0); +} + + +void DSi_Camera::IRQ(u32 param) +{ + DSi_Camera* activecam = nullptr; + + // TODO: check which camera has priority if both are activated + // (or does it just jumble both data sources together, like it + // does for, say, overlapping VRAM?) 
+ if (DSi_Camera0->IsActivated()) activecam = DSi_Camera0; + else if (DSi_Camera1->IsActivated()) activecam = DSi_Camera1; + + if (activecam) + { + RequestFrame(activecam->Num); + + if (Cnt & (1<<11)) + NDS::SetIRQ(0, NDS::IRQ_DSi_Camera); + + if (Cnt & (1<<15)) + NDS::ScheduleEvent(NDS::Event_DSi_CamTransfer, false, kTransferStart, Transfer, 0); + } + + NDS::ScheduleEvent(NDS::Event_DSi_CamIRQ, true, kIRQInterval, IRQ, 0); +} + +void DSi_Camera::RequestFrame(u32 cam) +{ + if (!(Cnt & (1<<13))) printf("CAMERA: !! REQUESTING YUV FRAME\n"); + + // TODO: picture size, data type, cropping, etc + // generate test pattern + // TODO: get picture from platform (actual camera, video file, whatever source) + for (u32 y = 0; y < 192; y++) + { + for (u32 x = 0; x < 256; x++) + { + u16* px = (u16*)&FrameBuffer[((y*256) + x) * 2]; + + if ((x & 0x8) ^ (y & 0x8)) + *px = 0x8000; + else + *px = 0xFC00 | ((y >> 3) << 5); + } + } +} + +void DSi_Camera::Transfer(u32 pos) +{ + u32 numscan = (Cnt & 0x000F) + 1; + u32 numpix = numscan * 256; // CHECKME + + // TODO: present data + //printf("CAM TRANSFER POS=%d/%d\n", pos, 0x6000*2); + + DSi::CheckNDMAs(0, 0x0B); + + pos += numpix; + if (pos >= 0x6000*2) // HACK + { + // transfer done + } + else + { + // keep going + + // TODO: must be tweaked such that each block has enough time to transfer + u32 delay = numpix*2 + 16; + + NDS::ScheduleEvent(NDS::Event_DSi_CamTransfer, false, delay, Transfer, pos); + } } @@ -62,16 +159,28 @@ void DSi_Camera::ResetCam() RegAddr = 0; RegData = 0; - PLLCnt = 0; + PLLDiv = 0x0366; + PLLPDiv = 0x00F5; + PLLCnt = 0x21F9; + ClocksCnt = 0; StandbyCnt = 0x4029; // checkme + MiscCnt = 0; +} + +bool DSi_Camera::IsActivated() +{ + if (StandbyCnt & (1<<14)) return false; // standby + if (!(MiscCnt & (1<<9))) return false; // data transfer not enabled + + return true; } -void DSi_Camera::Start() +void DSi_Camera::I2C_Start() { } -u8 DSi_Camera::Read(bool last) +u8 DSi_Camera::I2C_Read(bool last) { u8 ret; @@ -89,7 
+198,7 @@ u8 DSi_Camera::Read(bool last) } else { - RegData = ReadReg(RegAddr); + RegData = I2C_ReadReg(RegAddr); ret = RegData >> 8; } } @@ -100,7 +209,7 @@ u8 DSi_Camera::Read(bool last) return ret; } -void DSi_Camera::Write(u8 val, bool last) +void DSi_Camera::I2C_Write(u8 val, bool last) { if (DataPos < 2) { @@ -116,7 +225,7 @@ void DSi_Camera::Write(u8 val, bool last) if (DataPos & 0x1) { RegData |= val; - WriteReg(RegAddr, RegData); + I2C_WriteReg(RegAddr, RegData); RegAddr += 2; // checkme } else @@ -129,38 +238,172 @@ void DSi_Camera::Write(u8 val, bool last) else DataPos++; } -u16 DSi_Camera::ReadReg(u16 addr) +u16 DSi_Camera::I2C_ReadReg(u16 addr) { switch (addr) { case 0x0000: return 0x2280; // chip ID + case 0x0010: return PLLDiv; + case 0x0012: return PLLPDiv; case 0x0014: return PLLCnt; + case 0x0016: return ClocksCnt; case 0x0018: return StandbyCnt; + case 0x001A: return MiscCnt; case 0x301A: return ((~StandbyCnt) & 0x4000) >> 12; } - //printf("DSi_Camera%d: unknown read %04X\n", Num, addr); + if(Num==1)printf("DSi_Camera%d: unknown read %04X\n", Num, addr); return 0; } -void DSi_Camera::WriteReg(u16 addr, u16 val) +void DSi_Camera::I2C_WriteReg(u16 addr, u16 val) { switch (addr) { + case 0x0010: + PLLDiv = val & 0x3FFF; + return; + case 0x0012: + PLLPDiv = val & 0xBFFF; + return; case 0x0014: // shouldn't be instant either? 
val &= 0x7FFF; val |= ((val & 0x0002) << 14); PLLCnt = val; return; + case 0x0016: + ClocksCnt = val; + printf("ClocksCnt=%04X\n", val); + return; case 0x0018: // TODO: this shouldn't be instant, but uh val &= 0x003F; val |= ((val & 0x0001) << 14); StandbyCnt = val; + printf("CAM%d STBCNT=%04X (%04X)\n", Num, StandbyCnt, val); + return; + case 0x001A: + MiscCnt = val & 0x0B7B; + printf("CAM%d MISCCNT=%04X (%04X)\n", Num, MiscCnt, val); return; } - //printf("DSi_Camera%d: unknown write %04X %04X\n", Num, addr, val); + if(Num==1)printf("DSi_Camera%d: unknown write %04X %04X\n", Num, addr, val); +} + + +u8 DSi_Camera::Read8(u32 addr) +{ + // + + printf("unknown DSi cam read8 %08X\n", addr); + return 0; +} + +u16 DSi_Camera::Read16(u32 addr) +{printf("CAM READ %08X %08X\n", addr, NDS::GetPC(0)); + switch (addr) + { + case 0x04004200: return ModuleCnt; + case 0x04004202: return Cnt; + } + + printf("unknown DSi cam read16 %08X\n", addr); + return 0; +} +u32 dorp = 0; +u32 DSi_Camera::Read32(u32 addr) +{ + switch (addr) + { + case 0x04004204: + { + return 0xFC00801F; + if (!(Cnt & (1<<15))) return 0; // CHECKME + u32 ret = *(u32*)&FrameBuffer[TransferPos]; + TransferPos += 4; + if (TransferPos >= FrameLength) TransferPos = 0; + dorp += 4; + //if (dorp >= (256*4*2)) + if (TransferPos == 0) + { + dorp = 0; + Cnt &= ~(1<<4); + } + return ret; + } + } + + printf("unknown DSi cam read32 %08X\n", addr); + return 0; +} + +void DSi_Camera::Write8(u32 addr, u8 val) +{ + // + + printf("unknown DSi cam write8 %08X %02X\n", addr, val); +} + +void DSi_Camera::Write16(u32 addr, u16 val) +{printf("CAM WRITE %08X %04X %08X\n", addr, val, NDS::GetPC(0)); + switch (addr) + { + case 0x04004200: + { + u16 oldcnt = ModuleCnt; + ModuleCnt = val; + + if ((ModuleCnt & (1<<1)) && !(oldcnt & (1<<1))) + { + // reset shit to zero + // CHECKME + + Cnt = 0; + } + + if ((ModuleCnt & (1<<5)) && !(oldcnt & (1<<5))) + { + // TODO: reset I2C?? 
+ } + } + return; + + case 0x04004202: + { + // checkme + u16 oldmask; + if (Cnt & 0x8000) + { + val &= 0x8F20; + oldmask = 0x601F; + } + else + { + val &= 0xEF2F; + oldmask = 0x0010; + } + + Cnt = (Cnt & oldmask) | (val & ~0x0020); + if (val & (1<<5)) Cnt &= ~(1<<4); + + if ((val & (1<<15)) && !(Cnt & (1<<15))) + { + // start transfer + //DSi::CheckNDMAs(0, 0x0B); + } + } + return; + } + + printf("unknown DSi cam write16 %08X %04X\n", addr, val); +} + +void DSi_Camera::Write32(u32 addr, u32 val) +{ + // + + printf("unknown DSi cam write32 %08X %08X\n", addr, val); } diff --git a/src/DSi_Camera.h b/src/DSi_Camera.h index 844a4d28..108d76a5 100644 --- a/src/DSi_Camera.h +++ b/src/DSi_Camera.h @@ -28,27 +28,56 @@ public: static void DeInit(); static void Reset(); + static void IRQ(u32 param); + static void RequestFrame(u32 cam); + + static void Transfer(u32 pos); + DSi_Camera(u32 num); ~DSi_Camera(); void ResetCam(); + bool IsActivated(); - void Start(); - u8 Read(bool last); - void Write(u8 val, bool last); + void I2C_Start(); + u8 I2C_Read(bool last); + void I2C_Write(u8 val, bool last); + + static u8 Read8(u32 addr); + static u16 Read16(u32 addr); + static u32 Read32(u32 addr); + static void Write8(u32 addr, u8 val); + static void Write16(u32 addr, u16 val); + static void Write32(u32 addr, u32 val); -private: u32 Num; +private: u32 DataPos; u32 RegAddr; u16 RegData; - u16 ReadReg(u16 addr); - void WriteReg(u16 addr, u16 val); + u16 I2C_ReadReg(u16 addr); + void I2C_WriteReg(u16 addr, u16 val); + u16 PLLDiv; + u16 PLLPDiv; u16 PLLCnt; + u16 ClocksCnt; u16 StandbyCnt; + u16 MiscCnt; + + u16 MCUAddr; + u16* MCUData; + + u8 MCURegs[0x8000]; + + static u16 ModuleCnt; + static u16 Cnt; + + static u8 FrameBuffer[640*480*4]; + static u32 TransferPos; + static u32 FrameLength; }; diff --git a/src/DSi_I2C.cpp b/src/DSi_I2C.cpp index d58a38cd..76664e5e 100644 --- a/src/DSi_I2C.cpp +++ b/src/DSi_I2C.cpp @@ -50,7 +50,7 @@ void Reset() Registers[0x10] = 0x00; // power btn 
Registers[0x11] = 0x00; // reset Registers[0x12] = 0x00; // power btn tap - Registers[0x20] = 0x83; // battery + Registers[0x20] = 0x8F; // battery Registers[0x21] = 0x07; Registers[0x30] = 0x13; Registers[0x31] = 0x00; // camera power @@ -187,8 +187,10 @@ void WriteCnt(u8 val) switch (Device) { case 0x4A: Data = DSi_BPTWL::Read(islast); break; - case 0x78: Data = DSi_Camera0->Read(islast); break; - case 0x7A: Data = DSi_Camera1->Read(islast); break; + case 0x78: Data = DSi_Camera0->I2C_Read(islast); break; + case 0x7A: Data = DSi_Camera1->I2C_Read(islast); break; + case 0xA0: + case 0xE0: Data = 0xFF; break; default: printf("I2C: read on unknown device %02X, cnt=%02X, data=%02X, last=%d\n", Device, val, 0, islast); Data = 0xFF; @@ -211,8 +213,10 @@ void WriteCnt(u8 val) switch (Device) { case 0x4A: DSi_BPTWL::Start(); break; - case 0x78: DSi_Camera0->Start(); break; - case 0x7A: DSi_Camera1->Start(); break; + case 0x78: DSi_Camera0->I2C_Start(); break; + case 0x7A: DSi_Camera1->I2C_Start(); break; + case 0xA0: + case 0xE0: ack = false; break; default: printf("I2C: %s start on unknown device %02X\n", (Data&0x01)?"read":"write", Device); ack = false; @@ -226,8 +230,10 @@ void WriteCnt(u8 val) switch (Device) { case 0x4A: DSi_BPTWL::Write(Data, islast); break; - case 0x78: DSi_Camera0->Write(Data, islast); break; - case 0x7A: DSi_Camera1->Write(Data, islast); break; + case 0x78: DSi_Camera0->I2C_Write(Data, islast); break; + case 0x7A: DSi_Camera1->I2C_Write(Data, islast); break; + case 0xA0: + case 0xE0: ack = false; break; default: printf("I2C: write on unknown device %02X, cnt=%02X, data=%02X, last=%d\n", Device, val, Data, islast); ack = false; diff --git a/src/DSi_NDMA.cpp b/src/DSi_NDMA.cpp index 707c777b..d6d289d4 100644 --- a/src/DSi_NDMA.cpp +++ b/src/DSi_NDMA.cpp @@ -101,7 +101,7 @@ void DSi_NDMA::WriteCnt(u32 val) Start(); if (StartMode != 0x10 && StartMode != 0x30 && - StartMode != 0x04 && StartMode != 0x06 && StartMode != 0x07 && StartMode != 0x08 && 
StartMode != 0x09 && + StartMode != 0x04 && StartMode != 0x06 && StartMode != 0x07 && StartMode != 0x08 && StartMode != 0x09 && StartMode != 0x0B && StartMode != 0x24 && StartMode != 0x26 && StartMode != 0x28 && StartMode != 0x29 && StartMode != 0x2A && StartMode != 0x2B) printf("UNIMPLEMENTED ARM%d NDMA%d START MODE %02X, %08X->%08X LEN=%d BLK=%d CNT=%08X\n", CPU?7:9, Num, StartMode, SrcAddr, DstAddr, TotalLength, BlockLength, Cnt); diff --git a/src/DSi_SD.cpp b/src/DSi_SD.cpp index 45a597b7..de82edb5 100644 --- a/src/DSi_SD.cpp +++ b/src/DSi_SD.cpp @@ -778,6 +778,23 @@ void DSi_MMCStorage::SendCMD(u8 cmd, u32 param) Host->SendResponse(CSR, true); return; + case 1: // SEND_OP_COND + // CHECKME!! + // also TODO: it's different for the SD card + if (Internal) + { + param &= ~(1<<30); + OCR &= 0xBF000000; + OCR |= (param & 0x40FFFFFF); + Host->SendResponse(OCR, true); + SetState(0x01); + } + else + { + printf("CMD1 on SD card!!\n"); + } + return; + case 2: case 10: // get CID Host->SendResponse(*(u32*)&CID[12], false); @@ -801,6 +818,11 @@ void DSi_MMCStorage::SendCMD(u8 cmd, u32 param) } return; + case 6: // MMC: 'SWITCH' + // TODO! 
+ Host->SendResponse(CSR, true); + return; + case 7: // select card (by RCA) Host->SendResponse(CSR, true); return; diff --git a/src/GPU.cpp b/src/GPU.cpp index 7989750a..35ebaba1 100644 --- a/src/GPU.cpp +++ b/src/GPU.cpp @@ -49,8 +49,8 @@ u8 VRAM_F[ 16*1024]; u8 VRAM_G[ 16*1024]; u8 VRAM_H[ 32*1024]; u8 VRAM_I[ 16*1024]; -u8* VRAM[9] = {VRAM_A, VRAM_B, VRAM_C, VRAM_D, VRAM_E, VRAM_F, VRAM_G, VRAM_H, VRAM_I}; -u32 VRAMMask[9] = {0x1FFFF, 0x1FFFF, 0x1FFFF, 0x1FFFF, 0xFFFF, 0x3FFF, 0x3FFF, 0x7FFF, 0x3FFF}; +u8* const VRAM[9] = {VRAM_A, VRAM_B, VRAM_C, VRAM_D, VRAM_E, VRAM_F, VRAM_G, VRAM_H, VRAM_I}; +u32 const VRAMMask[9] = {0x1FFFF, 0x1FFFF, 0x1FFFF, 0x1FFFF, 0xFFFF, 0x3FFF, 0x3FFF, 0x7FFF, 0x3FFF}; u8 VRAMCNT[9]; u8 VRAMSTAT; @@ -85,11 +85,67 @@ bool Accelerated; GPU2D* GPU2D_A; GPU2D* GPU2D_B; +/* + VRAM invalidation tracking + + - we want to know when a VRAM region used for graphics changed + - for some regions unmapping is mandatory to modify them (Texture, TexPal and ExtPal) and + we don't want to completely invalidate them every time they're unmapped and remapped + + For this reason we don't track the dirtyness per mapping region, but instead per VRAM bank + with VRAMDirty. Writes to LCDC go directly into VRAMDirty, while writes via other mapping regions + like BG or OBJ are first tracked in VRAMWritten_* and need to be flushed using SyncDirtyFlags. + + This is more or less a description of VRAMTrackingSet::DeriveState + Each time before the memory is read two things could have happened + to each 16kb piece (16kb is the smallest unit in which mappings can + be made thus also the size VRAMMap_* use): + - this piece was remapped compared to last time we checked, + which means this location in memory is invalid. + - this piece wasn't remapped, which means we need to check whether + it was changed. This can be archived by checking VRAMDirty. + VRAMDirty need to be reset for the respective VRAM bank. 
+*/ + +VRAMTrackingSet<512*1024, 16*1024> VRAMDirty_ABG; +VRAMTrackingSet<256*1024, 16*1024> VRAMDirty_AOBJ; +VRAMTrackingSet<128*1024, 16*1024> VRAMDirty_BBG; +VRAMTrackingSet<128*1024, 16*1024> VRAMDirty_BOBJ; + +VRAMTrackingSet<32*1024, 8*1024> VRAMDirty_ABGExtPal; +VRAMTrackingSet<32*1024, 8*1024> VRAMDirty_BBGExtPal; +VRAMTrackingSet<8*1024, 8*1024> VRAMDirty_AOBJExtPal; +VRAMTrackingSet<8*1024, 8*1024> VRAMDirty_BOBJExtPal; + +VRAMTrackingSet<512*1024, 128*1024> VRAMDirty_Texture; +VRAMTrackingSet<128*1024, 16*1024> VRAMDirty_TexPal; + + +NonStupidBitField<512*1024/VRAMDirtyGranularity> VRAMWritten_ABG; +NonStupidBitField<256*1024/VRAMDirtyGranularity> VRAMWritten_AOBJ; +NonStupidBitField<128*1024/VRAMDirtyGranularity> VRAMWritten_BBG; +NonStupidBitField<128*1024/VRAMDirtyGranularity> VRAMWritten_BOBJ; +NonStupidBitField<256*1024/VRAMDirtyGranularity> VRAMWritten_ARM7; + +NonStupidBitField<128*1024/VRAMDirtyGranularity> VRAMDirty[9]; + +u8 VRAMFlat_ABG[512*1024]; +u8 VRAMFlat_BBG[128*1024]; +u8 VRAMFlat_AOBJ[256*1024]; +u8 VRAMFlat_BOBJ[128*1024]; + +u8 VRAMFlat_ABGExtPal[32*1024]; +u8 VRAMFlat_BBGExtPal[32*1024]; +u8 VRAMFlat_AOBJExtPal[8*1024]; +u8 VRAMFlat_BOBJExtPal[8*1024]; + +u8 VRAMFlat_Texture[512*1024]; +u8 VRAMFlat_TexPal[128*1024]; bool Init() { - GPU2D_A = new GPU2D(0); - GPU2D_B = new GPU2D(1); + GPU2D_A = new GPU2D_Soft(0); + GPU2D_B = new GPU2D_Soft(1); if (!GPU3D::Init()) return false; FrontBuffer = 0; @@ -113,6 +169,34 @@ void DeInit() if (Framebuffer[1][1]) delete[] Framebuffer[1][1]; } +void ResetVRAMCache() +{ + for (int i = 0; i < 9; i++) + VRAMDirty[i] = NonStupidBitField<128*1024/VRAMDirtyGranularity>(); + + VRAMDirty_ABG.Reset(); + VRAMDirty_BBG.Reset(); + VRAMDirty_AOBJ.Reset(); + VRAMDirty_BOBJ.Reset(); + VRAMDirty_ABGExtPal.Reset(); + VRAMDirty_BBGExtPal.Reset(); + VRAMDirty_AOBJExtPal.Reset(); + VRAMDirty_BOBJExtPal.Reset(); + VRAMDirty_Texture.Reset(); + VRAMDirty_TexPal.Reset(); + + memset(VRAMFlat_ABG, 0, sizeof(VRAMFlat_ABG)); 
+ memset(VRAMFlat_BBG, 0, sizeof(VRAMFlat_BBG)); + memset(VRAMFlat_AOBJ, 0, sizeof(VRAMFlat_AOBJ)); + memset(VRAMFlat_BOBJ, 0, sizeof(VRAMFlat_BOBJ)); + memset(VRAMFlat_ABGExtPal, 0, sizeof(VRAMFlat_ABGExtPal)); + memset(VRAMFlat_BBGExtPal, 0, sizeof(VRAMFlat_BBGExtPal)); + memset(VRAMFlat_AOBJExtPal, 0, sizeof(VRAMFlat_AOBJExtPal)); + memset(VRAMFlat_BOBJExtPal, 0, sizeof(VRAMFlat_BOBJExtPal)); + memset(VRAMFlat_Texture, 0, sizeof(VRAMFlat_Texture)); + memset(VRAMFlat_TexPal, 0, sizeof(VRAMFlat_TexPal)); +} + void Reset() { VCount = 0; @@ -186,6 +270,8 @@ void Reset() GPU2D_B->SetFramebuffer(Framebuffer[backbuf][0]); ResetRenderer(); + + ResetVRAMCache(); } void Stop() @@ -261,6 +347,8 @@ void DoSavestate(Savestate* file) GPU2D_A->DoSavestate(file); GPU2D_B->DoSavestate(file); GPU3D::DoSavestate(file); + + ResetVRAMCache(); } void AssignFramebuffers() @@ -411,18 +499,8 @@ void SetRenderSettings(int renderer, RenderSettings& settings) u8* GetUniqueBankPtr(u32 mask, u32 offset) { - if (!mask) return NULL; - - int num = 0; - if (!(mask & 0xFF)) { mask >>= 8; num += 8; } - else - { - if (!(mask & 0xF)) { mask >>= 4; num += 4; } - if (!(mask & 0x3)) { mask >>= 2; num += 2; } - if (!(mask & 0x1)) { mask >>= 1; num += 1; } - } - if (mask != 1) return NULL; - + if (!mask || (mask & (mask - 1)) != 0) return NULL; + int num = __builtin_ctz(mask); return &VRAM[num][offset & VRAMMask[num]]; } @@ -606,8 +684,6 @@ void MapVRAM_E(u32 bank, u8 cnt) case 4: // ABG ext palette UNMAP_RANGE(ABGExtPal, 0, 4); - GPU2D_A->BGExtPalDirty(0); - GPU2D_A->BGExtPalDirty(2); break; } } @@ -634,8 +710,6 @@ void MapVRAM_E(u32 bank, u8 cnt) case 4: // ABG ext palette MAP_RANGE(ABGExtPal, 0, 4); - GPU2D_A->BGExtPalDirty(0); - GPU2D_A->BGExtPalDirty(2); break; } } @@ -687,12 +761,10 @@ void MapVRAM_FG(u32 bank, u8 cnt) case 4: // ABG ext palette VRAMMap_ABGExtPal[((oldofs & 0x1) << 1)] &= ~bankmask; VRAMMap_ABGExtPal[((oldofs & 0x1) << 1) + 1] &= ~bankmask; - GPU2D_A->BGExtPalDirty((oldofs & 0x1) 
<< 1); break; case 5: // AOBJ ext palette VRAMMap_AOBJExtPal &= ~bankmask; - GPU2D_A->OBJExtPalDirty(); break; } } @@ -732,12 +804,10 @@ void MapVRAM_FG(u32 bank, u8 cnt) case 4: // ABG ext palette VRAMMap_ABGExtPal[((ofs & 0x1) << 1)] |= bankmask; VRAMMap_ABGExtPal[((ofs & 0x1) << 1) + 1] |= bankmask; - GPU2D_A->BGExtPalDirty((ofs & 0x1) << 1); break; case 5: // AOBJ ext palette VRAMMap_AOBJExtPal |= bankmask; - GPU2D_A->OBJExtPalDirty(); break; } } @@ -773,8 +843,6 @@ void MapVRAM_H(u32 bank, u8 cnt) case 2: // BBG ext palette UNMAP_RANGE(BBGExtPal, 0, 4); - GPU2D_B->BGExtPalDirty(0); - GPU2D_B->BGExtPalDirty(2); break; } } @@ -800,8 +868,6 @@ void MapVRAM_H(u32 bank, u8 cnt) case 2: // BBG ext palette MAP_RANGE(BBGExtPal, 0, 4); - GPU2D_B->BGExtPalDirty(0); - GPU2D_B->BGExtPalDirty(2); break; } } @@ -841,7 +907,6 @@ void MapVRAM_I(u32 bank, u8 cnt) case 3: // BOBJ ext palette VRAMMap_BOBJExtPal &= ~bankmask; - GPU2D_B->OBJExtPalDirty(); break; } } @@ -871,7 +936,6 @@ void MapVRAM_I(u32 bank, u8 cnt) case 3: // BOBJ ext palette VRAMMap_BOBJExtPal |= bankmask; - GPU2D_B->OBJExtPalDirty(); break; } } @@ -937,6 +1001,8 @@ void StartHBlank(u32 line) DispStat[0] |= (1<<1); DispStat[1] |= (1<<1); + SyncDirtyFlags(); + if (VCount < 192) { // draw @@ -1096,4 +1162,224 @@ void SetVCount(u16 val) NextVCount = val; } +template +NonStupidBitField VRAMTrackingSet::DeriveState(u32* currentMappings) +{ + NonStupidBitField result; + u16 banksToBeZeroed = 0; + for (u32 i = 0; i < Size / MappingGranularity; i++) + { + if (currentMappings[i] != Mapping[i]) + { + result |= NonStupidBitField(i*VRAMBitsPerMapping, VRAMBitsPerMapping); + banksToBeZeroed |= currentMappings[i]; + Mapping[i] = currentMappings[i]; + } + else + { + u32 mapping = Mapping[i]; + + banksToBeZeroed |= mapping; + + while (mapping != 0) + { + u32 num = __builtin_ctz(mapping); + mapping &= ~(1 << num); + + // hack for **speed** + // this could probably be done less ugly but then we would rely + // on the compiler 
for vectorisation + static_assert(VRAMDirtyGranularity == 512); + if (MappingGranularity == 16*1024) + { + u32 dirty = ((u32*)VRAMDirty[num].Data)[i & (VRAMMask[num] >> 14)]; + ((u32*)result.Data)[i] |= dirty; + } + else if (MappingGranularity == 8*1024) + { + u16 dirty = ((u16*)VRAMDirty[num].Data)[i & (VRAMMask[num] >> 13)]; + ((u16*)result.Data)[i] |= dirty; + } + else if (MappingGranularity == 128*1024) + { + ((u64*)result.Data)[i * 4 + 0] |= ((u64*)VRAMDirty[num].Data)[0]; + ((u64*)result.Data)[i * 4 + 1] |= ((u64*)VRAMDirty[num].Data)[1]; + ((u64*)result.Data)[i * 4 + 2] |= ((u64*)VRAMDirty[num].Data)[2]; + ((u64*)result.Data)[i * 4 + 3] |= ((u64*)VRAMDirty[num].Data)[3]; + } + else + { + // welp + abort(); + } + } + } + } + + while (banksToBeZeroed != 0) + { + u32 num = __builtin_ctz(banksToBeZeroed); + banksToBeZeroed &= ~(1 << num); + memset(VRAMDirty[num].Data, 0, sizeof(VRAMDirty[num].Data)); + } + + return result; } + +template NonStupidBitField<32*1024/VRAMDirtyGranularity> VRAMTrackingSet<32*1024, 8*1024>::DeriveState(u32*); +template NonStupidBitField<8*1024/VRAMDirtyGranularity> VRAMTrackingSet<8*1024, 8*1024>::DeriveState(u32*); +template NonStupidBitField<512*1024/VRAMDirtyGranularity> VRAMTrackingSet<512*1024, 128*1024>::DeriveState(u32*); +template NonStupidBitField<128*1024/VRAMDirtyGranularity> VRAMTrackingSet<128*1024, 16*1024>::DeriveState(u32*); +template NonStupidBitField<256*1024/VRAMDirtyGranularity> VRAMTrackingSet<256*1024, 16*1024>::DeriveState(u32*); +template NonStupidBitField<512*1024/VRAMDirtyGranularity> VRAMTrackingSet<512*1024, 16*1024>::DeriveState(u32*); + +template +void SyncDirtyFlags(u32* mappings, NonStupidBitField& writtenFlags) +{ + const u32 VRAMWrittenBitsPer16KB = 16*1024/VRAMDirtyGranularity; + + for (typename NonStupidBitField::Iterator it = writtenFlags.Begin(); it != writtenFlags.End(); it++) + { + u32 mapping = mappings[*it / VRAMWrittenBitsPer16KB]; + while (mapping != 0) + { + u32 num = __builtin_ctz(mapping); 
+ + VRAMDirty[num][*it & (VRAMMask[num] / VRAMDirtyGranularity)] = true; + + mapping &= ~(1 << num); + } + } + memset(writtenFlags.Data, 0, sizeof(writtenFlags.Data)); +} + +void SyncDirtyFlags() +{ + SyncDirtyFlags(VRAMMap_ABG, VRAMWritten_ABG); + SyncDirtyFlags(VRAMMap_AOBJ, VRAMWritten_AOBJ); + SyncDirtyFlags(VRAMMap_BBG, VRAMWritten_BBG); + SyncDirtyFlags(VRAMMap_BOBJ, VRAMWritten_BOBJ); + SyncDirtyFlags(VRAMMap_ARM7, VRAMWritten_ARM7); +} + +template +inline bool CopyLinearVRAM(u8* flat, u32* mappings, NonStupidBitField& dirty, u64 (*slowAccess)(u32 addr)) +{ + const u32 VRAMBitsPerMapping = MappingGranularity / VRAMDirtyGranularity; + + bool change = false; + + typename NonStupidBitField::Iterator it = dirty.Begin(); + while (it != dirty.End()) + { + u32 offset = *it * VRAMDirtyGranularity; + u8* dst = flat + offset; + u8* fastAccess = GetUniqueBankPtr(mappings[*it / VRAMBitsPerMapping], offset); + if (fastAccess) + { + memcpy(dst, fastAccess, VRAMDirtyGranularity); + } + else + { + for (u32 i = 0; i < VRAMDirtyGranularity; i += 8) + *(u64*)&dst[i] = slowAccess(offset + i); + } + change = true; + it++; + } + return change; +} + +bool MakeVRAMFlat_TextureCoherent(NonStupidBitField<512*1024/VRAMDirtyGranularity>& dirty) +{ + return CopyLinearVRAM<128*1024>(VRAMFlat_Texture, VRAMMap_Texture, dirty, ReadVRAM_Texture); +} +bool MakeVRAMFlat_TexPalCoherent(NonStupidBitField<128*1024/VRAMDirtyGranularity>& dirty) +{ + return CopyLinearVRAM<16*1024>(VRAMFlat_TexPal, VRAMMap_TexPal, dirty, ReadVRAM_TexPal); +} + +bool MakeVRAMFlat_ABGCoherent(NonStupidBitField<512*1024/VRAMDirtyGranularity>& dirty) +{ + return CopyLinearVRAM<16*1024>(VRAMFlat_ABG, VRAMMap_ABG, dirty, ReadVRAM_ABG); +} +bool MakeVRAMFlat_BBGCoherent(NonStupidBitField<128*1024/VRAMDirtyGranularity>& dirty) +{ + return CopyLinearVRAM<16*1024>(VRAMFlat_BBG, VRAMMap_BBG, dirty, ReadVRAM_BBG); +} + +bool MakeVRAMFlat_AOBJCoherent(NonStupidBitField<256*1024/VRAMDirtyGranularity>& dirty) +{ + return 
CopyLinearVRAM<16*1024>(VRAMFlat_AOBJ, VRAMMap_AOBJ, dirty, ReadVRAM_AOBJ); +} +bool MakeVRAMFlat_BOBJCoherent(NonStupidBitField<128*1024/VRAMDirtyGranularity>& dirty) +{ + return CopyLinearVRAM<16*1024>(VRAMFlat_BOBJ, VRAMMap_BOBJ, dirty, ReadVRAM_BOBJ); +} + +template +T ReadVRAM_ABGExtPal(u32 addr) +{ + u32 mask = VRAMMap_ABGExtPal[(addr >> 13) & 0x3]; + + T ret = 0; + if (mask & (1<<4)) ret |= *(T*)&VRAM_E[addr & 0x7FFF]; + if (mask & (1<<5)) ret |= *(T*)&VRAM_F[addr & 0x3FFF]; + if (mask & (1<<6)) ret |= *(T*)&VRAM_G[addr & 0x3FFF]; + + return ret; +} + +template +T ReadVRAM_BBGExtPal(u32 addr) +{ + u32 mask = VRAMMap_BBGExtPal[(addr >> 13) & 0x3]; + + T ret = 0; + if (mask & (1<<7)) ret |= *(T*)&VRAM_H[addr & 0x7FFF]; + + return ret; +} + +template +T ReadVRAM_AOBJExtPal(u32 addr) +{ + u32 mask = VRAMMap_AOBJExtPal; + + T ret = 0; + if (mask & (1<<4)) ret |= *(T*)&VRAM_F[addr & 0x1FFF]; + if (mask & (1<<5)) ret |= *(T*)&VRAM_G[addr & 0x1FFF]; + + return ret; +} + +template +T ReadVRAM_BOBJExtPal(u32 addr) +{ + u32 mask = VRAMMap_BOBJExtPal; + + T ret = 0; + if (mask & (1<<8)) ret |= *(T*)&VRAM_I[addr & 0x1FFF]; + + return ret; +} + +bool MakeVRAMFlat_ABGExtPalCoherent(NonStupidBitField<32*1024/VRAMDirtyGranularity>& dirty) +{ + return CopyLinearVRAM<8*1024>(VRAMFlat_ABGExtPal, VRAMMap_ABGExtPal, dirty, ReadVRAM_ABGExtPal); +} +bool MakeVRAMFlat_BBGExtPalCoherent(NonStupidBitField<32*1024/VRAMDirtyGranularity>& dirty) +{ + return CopyLinearVRAM<8*1024>(VRAMFlat_BBGExtPal, VRAMMap_BBGExtPal, dirty, ReadVRAM_BBGExtPal); +} + +bool MakeVRAMFlat_AOBJExtPalCoherent(NonStupidBitField<8*1024/VRAMDirtyGranularity>& dirty) +{ + return CopyLinearVRAM<8*1024>(VRAMFlat_AOBJExtPal, &VRAMMap_AOBJExtPal, dirty, ReadVRAM_AOBJExtPal); +} +bool MakeVRAMFlat_BOBJExtPalCoherent(NonStupidBitField<8*1024/VRAMDirtyGranularity>& dirty) +{ + return CopyLinearVRAM<8*1024>(VRAMFlat_BOBJExtPal, &VRAMMap_BOBJExtPal, dirty, ReadVRAM_BOBJExtPal); +} + +} \ No newline at end of file diff 
--git a/src/GPU.h b/src/GPU.h index 1564ef7f..cc62e1ea 100644 --- a/src/GPU.h +++ b/src/GPU.h @@ -20,6 +20,7 @@ #define GPU_H #include "GPU2D.h" +#include "NonStupidBitfield.h" namespace GPU { @@ -45,7 +46,7 @@ extern u8 VRAM_G[ 16*1024]; extern u8 VRAM_H[ 32*1024]; extern u8 VRAM_I[ 16*1024]; -extern u8* VRAM[9]; +extern u8* const VRAM[9]; extern u32 VRAMMap_LCDC; extern u32 VRAMMap_ABG[0x20]; @@ -73,6 +74,78 @@ extern GPU2D* GPU2D_B; extern int Renderer; +const u32 VRAMDirtyGranularity = 512; + +extern NonStupidBitField<512*1024/VRAMDirtyGranularity> VRAMWritten_ABG; +extern NonStupidBitField<256*1024/VRAMDirtyGranularity> VRAMWritten_AOBJ; +extern NonStupidBitField<128*1024/VRAMDirtyGranularity> VRAMWritten_BBG; +extern NonStupidBitField<128*1024/VRAMDirtyGranularity> VRAMWritten_BOBJ; +extern NonStupidBitField<256*1024/VRAMDirtyGranularity> VRAMWritten_ARM7; + +extern NonStupidBitField<128*1024/VRAMDirtyGranularity> VRAMDirty[9]; + +template +struct VRAMTrackingSet +{ + u16 Mapping[Size / MappingGranularity]; + + const u32 VRAMBitsPerMapping = MappingGranularity / VRAMDirtyGranularity; + + void Reset() + { + for (int i = 0; i < Size / MappingGranularity; i++) + { + // this is not a real VRAM bank + // so it will always be a mismatch => the bank will be completely invalidated + Mapping[i] = 0x8000; + } + } + NonStupidBitField DeriveState(u32* currentMappings); +}; + +extern VRAMTrackingSet<512*1024, 16*1024> VRAMDirty_ABG; +extern VRAMTrackingSet<256*1024, 16*1024> VRAMDirty_AOBJ; +extern VRAMTrackingSet<128*1024, 16*1024> VRAMDirty_BBG; +extern VRAMTrackingSet<128*1024, 16*1024> VRAMDirty_BOBJ; + +extern VRAMTrackingSet<32*1024, 8*1024> VRAMDirty_ABGExtPal; +extern VRAMTrackingSet<32*1024, 8*1024> VRAMDirty_BBGExtPal; +extern VRAMTrackingSet<8*1024, 8*1024> VRAMDirty_AOBJExtPal; +extern VRAMTrackingSet<8*1024, 8*1024> VRAMDirty_BOBJExtPal; + +extern VRAMTrackingSet<512*1024, 128*1024> VRAMDirty_Texture; +extern VRAMTrackingSet<128*1024, 16*1024> 
VRAMDirty_TexPal; + +extern u8 VRAMFlat_ABG[512*1024]; +extern u8 VRAMFlat_BBG[128*1024]; +extern u8 VRAMFlat_AOBJ[256*1024]; +extern u8 VRAMFlat_BOBJ[128*1024]; + +extern u8 VRAMFlat_ABGExtPal[32*1024]; +extern u8 VRAMFlat_BBGExtPal[32*1024]; + +extern u8 VRAMFlat_AOBJExtPal[8*1024]; +extern u8 VRAMFlat_BOBJExtPal[8*1024]; + +extern u8 VRAMFlat_Texture[512*1024]; +extern u8 VRAMFlat_TexPal[128*1024]; + +bool MakeVRAMFlat_ABGCoherent(NonStupidBitField<512*1024/VRAMDirtyGranularity>& dirty); +bool MakeVRAMFlat_BBGCoherent(NonStupidBitField<128*1024/VRAMDirtyGranularity>& dirty); + +bool MakeVRAMFlat_AOBJCoherent(NonStupidBitField<256*1024/VRAMDirtyGranularity>& dirty); +bool MakeVRAMFlat_BOBJCoherent(NonStupidBitField<128*1024/VRAMDirtyGranularity>& dirty); + +bool MakeVRAMFlat_ABGExtPalCoherent(NonStupidBitField<32*1024/VRAMDirtyGranularity>& dirty); +bool MakeVRAMFlat_BBGExtPalCoherent(NonStupidBitField<32*1024/VRAMDirtyGranularity>& dirty); + +bool MakeVRAMFlat_AOBJExtPalCoherent(NonStupidBitField<8*1024/VRAMDirtyGranularity>& dirty); +bool MakeVRAMFlat_BOBJExtPalCoherent(NonStupidBitField<8*1024/VRAMDirtyGranularity>& dirty); + +bool MakeVRAMFlat_TextureCoherent(NonStupidBitField<512*1024/VRAMDirtyGranularity>& dirty); +bool MakeVRAMFlat_TexPalCoherent(NonStupidBitField<128*1024/VRAMDirtyGranularity>& dirty); + +void SyncDirtyFlags(); typedef struct { @@ -233,7 +306,11 @@ void WriteVRAM_LCDC(u32 addr, T val) default: return; } - if (VRAMMap_LCDC & (1<> 14) & 0x1F]; + VRAMWritten_ABG[(addr & 0x7FFFF) / VRAMDirtyGranularity] = true; + if (mask & (1<<0)) *(T*)&VRAM_A[addr & 0x1FFFF] = val; if (mask & (1<<1)) *(T*)&VRAM_B[addr & 0x1FFFF] = val; if (mask & (1<<2)) *(T*)&VRAM_C[addr & 0x1FFFF] = val; @@ -295,6 +374,8 @@ void WriteVRAM_AOBJ(u32 addr, T val) { u32 mask = VRAMMap_AOBJ[(addr >> 14) & 0xF]; + VRAMWritten_AOBJ[(addr & 0x3FFFF) / VRAMDirtyGranularity] = true; + if (mask & (1<<0)) *(T*)&VRAM_A[addr & 0x1FFFF] = val; if (mask & (1<<1)) *(T*)&VRAM_B[addr & 
0x1FFFF] = val; if (mask & (1<<4)) *(T*)&VRAM_E[addr & 0xFFFF] = val; @@ -324,6 +405,8 @@ void WriteVRAM_BBG(u32 addr, T val) { u32 mask = VRAMMap_BBG[(addr >> 14) & 0x7]; + VRAMWritten_BBG[(addr & 0x1FFFF) / VRAMDirtyGranularity] = true; + if (mask & (1<<2)) *(T*)&VRAM_C[addr & 0x1FFFF] = val; if (mask & (1<<7)) *(T*)&VRAM_H[addr & 0x7FFF] = val; if (mask & (1<<8)) *(T*)&VRAM_I[addr & 0x3FFF] = val; @@ -350,11 +433,12 @@ void WriteVRAM_BOBJ(u32 addr, T val) { u32 mask = VRAMMap_BOBJ[(addr >> 14) & 0x7]; + VRAMWritten_BOBJ[(addr & 0x1FFFF) / VRAMDirtyGranularity] = true; + if (mask & (1<<3)) *(T*)&VRAM_D[addr & 0x1FFFF] = val; if (mask & (1<<8)) *(T*)&VRAM_I[addr & 0x3FFF] = val; } - template T ReadVRAM_ARM7(u32 addr) { @@ -372,6 +456,8 @@ void WriteVRAM_ARM7(u32 addr, T val) { u32 mask = VRAMMap_ARM7[(addr >> 17) & 0x1]; + VRAMWritten_ARM7[(addr & 0x1FFFF) / VRAMDirtyGranularity] = true; + if (mask & (1<<2)) *(T*)&VRAM_C[addr & 0x1FFFF] = val; if (mask & (1<<3)) *(T*)&VRAM_D[addr & 0x1FFFF] = val; } diff --git a/src/GPU2D.cpp b/src/GPU2D.cpp index 7774c650..fa05e795 100644 --- a/src/GPU2D.cpp +++ b/src/GPU2D.cpp @@ -84,20 +84,6 @@ GPU2D::GPU2D(u32 num) { Num = num; - - // initialize mosaic table - for (int m = 0; m < 16; m++) - { - for (int x = 0; x < 256; x++) - { - int offset = x % (m+1); - MosaicTable[m][x] = offset; - } - } -} - -GPU2D::~GPU2D() -{ } void GPU2D::Reset() @@ -131,8 +117,6 @@ void GPU2D::Reset() BGMosaicYMax = 0; OBJMosaicY = 0; OBJMosaicYMax = 0; - CurBGXMosaicTable = MosaicTable[0]; - CurOBJXMosaicTable = MosaicTable[0]; BlendCnt = 0; EVA = 16; @@ -149,11 +133,7 @@ void GPU2D::Reset() MasterBrightness = 0; - BGExtPalStatus[0] = 0; - BGExtPalStatus[1] = 0; - BGExtPalStatus[2] = 0; - BGExtPalStatus[3] = 0; - OBJExtPalStatus = 0; + MosaicXSizeChanged(); } void GPU2D::DoSavestate(Savestate* file) @@ -206,18 +186,7 @@ void GPU2D::DoSavestate(Savestate* file) file->Var32(&Win0Active); file->Var32(&Win1Active); - if (!file->Saving) - { - // refresh 
those - BGExtPalStatus[0] = 0; - BGExtPalStatus[1] = 0; - BGExtPalStatus[2] = 0; - BGExtPalStatus[3] = 0; - OBJExtPalStatus = 0; - - CurBGXMosaicTable = MosaicTable[BGMosaicSize[0]]; - CurOBJXMosaicTable = MosaicTable[OBJMosaicSize[0]]; - } + MosaicXSizeChanged(); } void GPU2D::SetFramebuffer(u32* buf) @@ -225,15 +194,6 @@ void GPU2D::SetFramebuffer(u32* buf) Framebuffer = buf; } -void GPU2D::SetRenderSettings(bool accel) -{ - Accelerated = accel; - - if (Accelerated) DrawPixel = DrawPixel_Accel; - else DrawPixel = DrawPixel_Normal; -} - - u8 GPU2D::Read8(u32 addr) { switch (addr & 0x00000FFF) @@ -328,6 +288,13 @@ void GPU2D::Write8(u32 addr, u8 val) DispCnt = (DispCnt & 0x00FFFFFF) | (val << 24); if (Num) DispCnt &= 0xC0B1FFF7; return; + + case 0x10: + if (!Num) GPU3D::SetRenderXPos((GPU3D::RenderXPos & 0xFF00) | val); + break; + case 0x11: + if (!Num) GPU3D::SetRenderXPos((GPU3D::RenderXPos & 0x00FF) | (val << 8)); + break; } if (!Enabled) return; @@ -378,12 +345,12 @@ void GPU2D::Write8(u32 addr, u8 val) case 0x04C: BGMosaicSize[0] = val & 0xF; BGMosaicSize[1] = val >> 4; - CurBGXMosaicTable = MosaicTable[BGMosaicSize[0]]; + MosaicXSizeChanged(); return; case 0x04D: OBJMosaicSize[0] = val & 0xF; OBJMosaicSize[1] = val >> 4; - CurOBJXMosaicTable = MosaicTable[OBJMosaicSize[0]]; + MosaicXSizeChanged(); return; case 0x050: BlendCnt = (BlendCnt & 0x3F00) | val; return; @@ -420,6 +387,10 @@ void GPU2D::Write16(u32 addr, u16 val) if (Num) DispCnt &= 0xC0B1FFF7; return; + case 0x010: + if (!Num) GPU3D::SetRenderXPos(val); + break; + case 0x068: DispFIFO[DispFIFOWritePtr] = val; return; @@ -526,10 +497,9 @@ void GPU2D::Write16(u32 addr, u16 val) case 0x04C: BGMosaicSize[0] = val & 0xF; BGMosaicSize[1] = (val >> 4) & 0xF; - CurBGXMosaicTable = MosaicTable[BGMosaicSize[0]]; OBJMosaicSize[0] = (val >> 8) & 0xF; OBJMosaicSize[1] = val >> 12; - CurOBJXMosaicTable = MosaicTable[OBJMosaicSize[0]]; + MosaicXSizeChanged(); return; case 0x050: BlendCnt = val & 0x3FFF; return; @@ 
-603,138 +573,6 @@ void GPU2D::Write32(u32 addr, u32 val) Write16(addr+2, val>>16); } - -u32 GPU2D::ColorBlend4(u32 val1, u32 val2, u32 eva, u32 evb) -{ - u32 r = (((val1 & 0x00003F) * eva) + ((val2 & 0x00003F) * evb)) >> 4; - u32 g = ((((val1 & 0x003F00) * eva) + ((val2 & 0x003F00) * evb)) >> 4) & 0x007F00; - u32 b = ((((val1 & 0x3F0000) * eva) + ((val2 & 0x3F0000) * evb)) >> 4) & 0x7F0000; - - if (r > 0x00003F) r = 0x00003F; - if (g > 0x003F00) g = 0x003F00; - if (b > 0x3F0000) b = 0x3F0000; - - return r | g | b | 0xFF000000; -} - -u32 GPU2D::ColorBlend5(u32 val1, u32 val2) -{ - u32 eva = ((val1 >> 24) & 0x1F) + 1; - u32 evb = 32 - eva; - - if (eva == 32) return val1; - - u32 r = (((val1 & 0x00003F) * eva) + ((val2 & 0x00003F) * evb)) >> 5; - u32 g = ((((val1 & 0x003F00) * eva) + ((val2 & 0x003F00) * evb)) >> 5) & 0x007F00; - u32 b = ((((val1 & 0x3F0000) * eva) + ((val2 & 0x3F0000) * evb)) >> 5) & 0x7F0000; - - if (eva <= 16) - { - r += 0x000001; - g += 0x000100; - b += 0x010000; - } - - if (r > 0x00003F) r = 0x00003F; - if (g > 0x003F00) g = 0x003F00; - if (b > 0x3F0000) b = 0x3F0000; - - return r | g | b | 0xFF000000; -} - -u32 GPU2D::ColorBrightnessUp(u32 val, u32 factor) -{ - u32 rb = val & 0x3F003F; - u32 g = val & 0x003F00; - - rb += ((((0x3F003F - rb) * factor) >> 4) & 0x3F003F); - g += ((((0x003F00 - g) * factor) >> 4) & 0x003F00); - - return rb | g | 0xFF000000; -} - -u32 GPU2D::ColorBrightnessDown(u32 val, u32 factor) -{ - u32 rb = val & 0x3F003F; - u32 g = val & 0x003F00; - - rb -= (((rb * factor) >> 4) & 0x3F003F); - g -= (((g * factor) >> 4) & 0x003F00); - - return rb | g | 0xFF000000; -} - -u32 GPU2D::ColorComposite(int i, u32 val1, u32 val2) -{ - u32 coloreffect = 0; - u32 eva, evb; - - u32 flag1 = val1 >> 24; - u32 flag2 = val2 >> 24; - - u32 target2; - if (flag2 & 0x80) target2 = 0x1000; - else if (flag2 & 0x40) target2 = 0x0100; - else target2 = flag2 << 8; - - if ((flag1 & 0x80) && (BlendCnt & target2)) - { - // sprite blending - - coloreffect 
= 1; - - if (flag1 & 0x40) - { - eva = flag1 & 0x1F; - evb = 16 - eva; - } - else - { - eva = EVA; - evb = EVB; - } - } - else if ((flag1 & 0x40) && (BlendCnt & target2)) - { - // 3D layer blending - - coloreffect = 4; - } - else - { - if (flag1 & 0x80) flag1 = 0x10; - else if (flag1 & 0x40) flag1 = 0x01; - - if ((BlendCnt & flag1) && (WindowMask[i] & 0x20)) - { - coloreffect = (BlendCnt >> 6) & 0x3; - - if (coloreffect == 1) - { - if (BlendCnt & target2) - { - eva = EVA; - evb = EVB; - } - else - coloreffect = 0; - } - } - } - - switch (coloreffect) - { - case 0: return val1; - case 1: return ColorBlend4(val1, val2, eva, evb); - case 2: return ColorBrightnessUp(val1, EVY); - case 3: return ColorBrightnessDown(val1, EVY); - case 4: return ColorBlend5(val1, val2); - } - - return val1; -} - - void GPU2D::UpdateMosaicCounters(u32 line) { // Y mosaic uses incrementing 4-bit counters @@ -752,183 +590,13 @@ void GPU2D::UpdateMosaicCounters(u32 line) } } - -void GPU2D::DrawScanline(u32 line) -{ - int stride = Accelerated ? (256*3 + 1) : 256; - u32* dst = &Framebuffer[stride * line]; - - int n3dline = line; - line = GPU::VCount; - - bool forceblank = false; - - // scanlines that end up outside of the GPU drawing range - // (as a result of writing to VCount) are filled white - if (line > 192) forceblank = true; - - // GPU B can be completely disabled by POWCNT1 - // oddly that's not the case for GPU A - if (Num && !Enabled) forceblank = true; - - if (forceblank) - { - for (int i = 0; i < 256; i++) - dst[i] = 0xFFFFFFFF; - - if (Accelerated) - { - dst[256*3] = 0; - } - return; - } - - u32 dispmode = DispCnt >> 16; - dispmode &= (Num ? 
0x1 : 0x3); - - if (Num == 0) - { - if (!Accelerated) - _3DLine = GPU3D::GetLine(n3dline); - else if ((CaptureCnt & (1<<31)) && (((CaptureCnt >> 29) & 0x3) != 1)) - { - _3DLine = GPU3D::GetLine(n3dline); - //GPU3D::GLRenderer::PrepareCaptureFrame(); - } - } - - // always render regular graphics - DrawScanline_BGOBJ(line); - UpdateMosaicCounters(line); - - switch (dispmode) - { - case 0: // screen off - { - for (int i = 0; i < 256; i++) - dst[i] = 0x003F3F3F; - } - break; - - case 1: // regular display - { - int i = 0; - for (; i < (stride & ~1); i+=2) - *(u64*)&dst[i] = *(u64*)&BGOBJLine[i]; - } - break; - - case 2: // VRAM display - { - u32 vrambank = (DispCnt >> 18) & 0x3; - if (GPU::VRAMMap_LCDC & (1<> 4; - u8 b = (color & 0x7C00) >> 9; - - dst[i] = r | (g << 8) | (b << 16); - } - } - else - { - for (int i = 0; i < 256; i++) - { - dst[i] = 0; - } - } - } - break; - - case 3: // FIFO display - { - for (int i = 0; i < 256; i++) - { - u16 color = DispFIFOBuffer[i]; - u8 r = (color & 0x001F) << 1; - u8 g = (color & 0x03E0) >> 4; - u8 b = (color & 0x7C00) >> 9; - - dst[i] = r | (g << 8) | (b << 16); - } - } - break; - } - - // capture - if ((Num == 0) && (CaptureCnt & (1<<31))) - { - u32 capwidth, capheight; - switch ((CaptureCnt >> 20) & 0x3) - { - case 0: capwidth = 128; capheight = 128; break; - case 1: capwidth = 256; capheight = 64; break; - case 2: capwidth = 256; capheight = 128; break; - case 3: capwidth = 256; capheight = 192; break; - } - - if (line < capheight) - DoCapture(line, capwidth); - } - - if (Accelerated) - { - dst[256*3] = MasterBrightness | (DispCnt & 0x30000); - return; - } - - // master brightness - if (dispmode != 0) - { - if ((MasterBrightness >> 14) == 1) - { - // up - u32 factor = MasterBrightness & 0x1F; - if (factor > 16) factor = 16; - - for (int i = 0; i < 256; i++) - { - dst[i] = ColorBrightnessUp(dst[i], factor); - } - } - else if ((MasterBrightness >> 14) == 2) - { - // down - u32 factor = MasterBrightness & 0x1F; - if (factor > 16) 
factor = 16; - - for (int i = 0; i < 256; i++) - { - dst[i] = ColorBrightnessDown(dst[i], factor); - } - } - } - - // convert to 32-bit BGRA - // note: 32-bit RGBA would be more straightforward, but - // BGRA seems to be more compatible (Direct2D soft, cairo...) - for (int i = 0; i < 256; i+=2) - { - u64 c = *(u64*)&dst[i]; - - u64 r = (c << 18) & 0xFC000000FC0000; - u64 g = (c << 2) & 0xFC000000FC00; - u64 b = (c >> 14) & 0xFC000000FC; - c = r | g | b; - - *(u64*)&dst[i] = c | ((c & 0x00C0C0C000C0C0C0) >> 6) | 0xFF000000FF000000; - } -} - void GPU2D::VBlank() { - CaptureCnt &= ~(1<<31); + if (CaptureLatch) + { + CaptureCnt &= ~(1<<31); + CaptureLatch = false; + } DispFIFOReadPtr = 0; DispFIFOWritePtr = 0; @@ -948,235 +616,6 @@ void GPU2D::VBlankEnd() //OBJMosaicYMax = OBJMosaicSize[1]; //OBJMosaicY = 0; //OBJMosaicYCount = 0; - -#ifdef OGLRENDERER_ENABLED - if (Accelerated) - { - if ((Num == 0) && (CaptureCnt & (1<<31)) && (((CaptureCnt >> 29) & 0x3) != 1)) - { - GPU3D::GLRenderer::PrepareCaptureFrame(); - } - } -#endif -} - - -void GPU2D::DoCapture(u32 line, u32 width) -{ - u32 dstvram = (CaptureCnt >> 16) & 0x3; - - // TODO: confirm this - // it should work like VRAM display mode, which requires VRAM to be mapped to LCDC - if (!(GPU::VRAMMap_LCDC & (1<> 18) & 0x3) << 14) + (line * width); - - // TODO: handle 3D in accelerated mode!! 
- - u32* srcA; - if (CaptureCnt & (1<<24)) - { - srcA = _3DLine; - } - else - { - srcA = BGOBJLine; - if (Accelerated) - { - // in accelerated mode, compositing is normally done on the GPU - // but when doing display capture, we do need the composited output - // so we do it here - - for (int i = 0; i < 256; i++) - { - u32 val1 = BGOBJLine[i]; - u32 val2 = BGOBJLine[256+i]; - u32 val3 = BGOBJLine[512+i]; - - u32 compmode = (val3 >> 24) & 0xF; - - if (compmode == 4) - { - // 3D on top, blending - - u32 _3dval = _3DLine[val3 & 0xFF]; - if ((_3dval >> 24) > 0) - val1 = ColorBlend5(_3dval, val1); - else - val1 = val2; - } - else if (compmode == 1) - { - // 3D on bottom, blending - - u32 _3dval = _3DLine[val3 & 0xFF]; - if ((_3dval >> 24) > 0) - { - u32 eva = (val3 >> 8) & 0x1F; - u32 evb = (val3 >> 16) & 0x1F; - - val1 = ColorBlend4(val1, _3dval, eva, evb); - } - else - val1 = val2; - } - else if (compmode <= 3) - { - // 3D on top, normal/fade - - u32 _3dval = _3DLine[val3 & 0xFF]; - if ((_3dval >> 24) > 0) - { - u32 evy = (val3 >> 8) & 0x1F; - - val1 = _3dval; - if (compmode == 2) val1 = ColorBrightnessUp(val1, evy); - else if (compmode == 3) val1 = ColorBrightnessDown(val1, evy); - } - else - val1 = val2; - } - - BGOBJLine[i] = val1; - } - } - } - - u16* srcB = NULL; - u32 srcBaddr = line * 256; - - if (CaptureCnt & (1<<25)) - { - srcB = &DispFIFOBuffer[0]; - srcBaddr = 0; - } - else - { - u32 srcvram = (DispCnt >> 18) & 0x3; - if (GPU::VRAMMap_LCDC & (1<> 16) & 0x3) != 2) - srcBaddr += ((CaptureCnt >> 26) & 0x3) << 14; - } - - dstaddr &= 0xFFFF; - srcBaddr &= 0xFFFF; - - switch ((CaptureCnt >> 29) & 0x3) - { - case 0: // source A - { - for (u32 i = 0; i < width; i++) - { - u32 val = srcA[i]; - - // TODO: check what happens when alpha=0 - - u32 r = (val >> 1) & 0x1F; - u32 g = (val >> 9) & 0x1F; - u32 b = (val >> 17) & 0x1F; - u32 a = ((val >> 24) != 0) ? 
0x8000 : 0; - - dst[dstaddr] = r | (g << 5) | (b << 10) | a; - dstaddr = (dstaddr + 1) & 0xFFFF; - } - } - break; - - case 1: // source B - { - if (srcB) - { - for (u32 i = 0; i < width; i++) - { - dst[dstaddr] = srcB[srcBaddr]; - srcBaddr = (srcBaddr + 1) & 0xFFFF; - dstaddr = (dstaddr + 1) & 0xFFFF; - } - } - else - { - for (u32 i = 0; i < width; i++) - { - dst[dstaddr] = 0; - dstaddr = (dstaddr + 1) & 0xFFFF; - } - } - } - break; - - case 2: // sources A+B - case 3: - { - u32 eva = CaptureCnt & 0x1F; - u32 evb = (CaptureCnt >> 8) & 0x1F; - - // checkme - if (eva > 16) eva = 16; - if (evb > 16) evb = 16; - - if (srcB) - { - for (u32 i = 0; i < width; i++) - { - u32 val = srcA[i]; - - // TODO: check what happens when alpha=0 - - u32 rA = (val >> 1) & 0x1F; - u32 gA = (val >> 9) & 0x1F; - u32 bA = (val >> 17) & 0x1F; - u32 aA = ((val >> 24) != 0) ? 1 : 0; - - val = srcB[srcBaddr]; - - u32 rB = val & 0x1F; - u32 gB = (val >> 5) & 0x1F; - u32 bB = (val >> 10) & 0x1F; - u32 aB = val >> 15; - - u32 rD = ((rA * aA * eva) + (rB * aB * evb)) >> 4; - u32 gD = ((gA * aA * eva) + (gB * aB * evb)) >> 4; - u32 bD = ((bA * aA * eva) + (bB * aB * evb)) >> 4; - u32 aD = (eva>0 ? aA : 0) | (evb>0 ? aB : 0); - - if (rD > 0x1F) rD = 0x1F; - if (gD > 0x1F) gD = 0x1F; - if (bD > 0x1F) bD = 0x1F; - - dst[dstaddr] = rD | (gD << 5) | (bD << 10) | (aD << 15); - srcBaddr = (srcBaddr + 1) & 0xFFFF; - dstaddr = (dstaddr + 1) & 0xFFFF; - } - } - else - { - for (u32 i = 0; i < width; i++) - { - u32 val = srcA[i]; - - // TODO: check what happens when alpha=0 - - u32 rA = (val >> 1) & 0x1F; - u32 gA = (val >> 9) & 0x1F; - u32 bA = (val >> 17) & 0x1F; - u32 aA = ((val >> 24) != 0) ? 1 : 0; - - u32 rD = (rA * aA * eva) >> 4; - u32 gD = (gA * aA * eva) >> 4; - u32 bD = (bA * aA * eva) >> 4; - u32 aD = (eva>0 ? 
aA : 0); - - dst[dstaddr] = rD | (gD << 5) | (bD << 10) | (aD << 15); - dstaddr = (dstaddr + 1) & 0xFFFF; - } - } - } - break; - } } void GPU2D::SampleFIFO(u32 offset, u32 num) @@ -1191,88 +630,22 @@ void GPU2D::SampleFIFO(u32 offset, u32 num) } } - -void GPU2D::BGExtPalDirty(u32 base) -{ - BGExtPalStatus[base] = 0; - BGExtPalStatus[base+1] = 0; -} - -void GPU2D::OBJExtPalDirty() -{ - OBJExtPalStatus = 0; -} - - u16* GPU2D::GetBGExtPal(u32 slot, u32 pal) { - u16* dst = &BGExtPalCache[slot][pal << 8]; - - if (!(BGExtPalStatus[slot] & (1< 0)) DrawBG_##type(line, num); else DrawBG_##type(line, num); } - -#define DoDrawBG_Large(line) \ - { if ((BGCnt[2] & 0x0040) && (BGMosaicSize[0] > 0)) DrawBG_Large(line); else DrawBG_Large(line); } - -template -void GPU2D::DrawScanlineBGMode(u32 line) +void GPU2D::GetBGVRAM(u8*& data, u32& mask) { - for (int i = 3; i >= 0; i--) + if (Num == 0) { - if ((BGCnt[3] & 0x3) == i) - { - if (DispCnt & 0x0800) - { - if (bgmode >= 3) - DoDrawBG(Extended, line, 3) - else if (bgmode >= 1) - DoDrawBG(Affine, line, 3) - else - DoDrawBG(Text, line, 3) - } - } - if ((BGCnt[2] & 0x3) == i) - { - if (DispCnt & 0x0400) - { - if (bgmode == 5) - DoDrawBG(Extended, line, 2) - else if (bgmode == 4 || bgmode == 2) - DoDrawBG(Affine, line, 2) - else - DoDrawBG(Text, line, 2) - } - } - if ((BGCnt[1] & 0x3) == i) - { - if (DispCnt & 0x0200) - { - DoDrawBG(Text, line, 1) - } - } - if ((BGCnt[0] & 0x3) == i) - { - if (DispCnt & 0x0100) - { - if ((!Num) && (DispCnt & 0x8)) - DrawBG_3D(); - else - DoDrawBG(Text, line, 0) - } - } - if ((DispCnt & 0x1000) && NumSprites) - InterleaveSprites(0x40000 | (i<<16)); - } -} - -void GPU2D::DrawScanlineBGMode6(u32 line) -{ - for (int i = 3; i >= 0; i--) - { - if ((BGCnt[2] & 0x3) == i) - { - if (DispCnt & 0x0400) - { - DoDrawBG_Large(line) - } - } - if ((BGCnt[0] & 0x3) == i) - { - if (DispCnt & 0x0100) - { - if ((!Num) && (DispCnt & 0x8)) - DrawBG_3D(); - } - } - if ((DispCnt & 0x1000) && NumSprites) - 
InterleaveSprites(0x40000 | (i<<16)); - } -} - -void GPU2D::DrawScanlineBGMode7(u32 line) -{ - // mode 7 only has text-mode BG0 and BG1 - - for (int i = 3; i >= 0; i--) - { - if ((BGCnt[1] & 0x3) == i) - { - if (DispCnt & 0x0200) - { - DoDrawBG(Text, line, 1) - } - } - if ((BGCnt[0] & 0x3) == i) - { - if (DispCnt & 0x0100) - { - if ((!Num) && (DispCnt & 0x8)) - DrawBG_3D(); - else - DoDrawBG(Text, line, 0) - } - } - if ((DispCnt & 0x1000) && NumSprites) - InterleaveSprites(0x40000 | (i<<16)); - } -} - -void GPU2D::DrawScanline_BGOBJ(u32 line) -{ - // forced blank disables BG/OBJ compositing - if (DispCnt & (1<<7)) - { - for (int i = 0; i < 256; i++) - BGOBJLine[i] = 0xFF3F3F3F; - - return; - } - - u64 backdrop; - if (Num) backdrop = *(u16*)&GPU::Palette[0x400]; - else backdrop = *(u16*)&GPU::Palette[0]; - - { - u8 r = (backdrop & 0x001F) << 1; - u8 g = (backdrop & 0x03E0) >> 4; - u8 b = (backdrop & 0x7C00) >> 9; - - backdrop = r | (g << 8) | (b << 16) | 0x20000000; - backdrop |= (backdrop << 32); - - for (int i = 0; i < 256; i+=2) - *(u64*)&BGOBJLine[i] = backdrop; - } - - if (DispCnt & 0xE000) - CalculateWindowMask(line); - else - memset(WindowMask, 0xFF, 256); - - ApplySpriteMosaicX(); - - switch (DispCnt & 0x7) - { - case 0: DrawScanlineBGMode<0>(line); break; - case 1: DrawScanlineBGMode<1>(line); break; - case 2: DrawScanlineBGMode<2>(line); break; - case 3: DrawScanlineBGMode<3>(line); break; - case 4: DrawScanlineBGMode<4>(line); break; - case 5: DrawScanlineBGMode<5>(line); break; - case 6: DrawScanlineBGMode6(line); break; - case 7: DrawScanlineBGMode7(line); break; - } - - // color special effects - // can likely be optimized - - if (!Accelerated) - { - for (int i = 0; i < 256; i++) - { - u32 val1 = BGOBJLine[i]; - u32 val2 = BGOBJLine[256+i]; - - BGOBJLine[i] = ColorComposite(i, val1, val2); - } + data = GPU::VRAMFlat_ABG; + mask = 0x7FFFF; } else { - if (Num == 0) - { - for (int i = 0; i < 256; i++) - { - u32 val1 = BGOBJLine[i]; - u32 val2 = 
BGOBJLine[256+i]; - u32 val3 = BGOBJLine[512+i]; - - u32 flag1 = val1 >> 24; - u32 flag2 = val2 >> 24; - - u32 bldcnteffect = (BlendCnt >> 6) & 0x3; - - u32 target1; - if (flag1 & 0x80) target1 = 0x0010; - else if (flag1 & 0x40) target1 = 0x0001; - else target1 = flag1; - - u32 target2; - if (flag2 & 0x80) target2 = 0x1000; - else if (flag2 & 0x40) target2 = 0x0100; - else target2 = flag2 << 8; - - if (((flag1 & 0xC0) == 0x40) && (BlendCnt & target2)) - { - // 3D on top, blending - - BGOBJLine[i] = val2; - BGOBJLine[256+i] = ColorComposite(i, val2, val3); - BGOBJLine[512+i] = 0x04000000 | (val1 & 0xFF); - } - else if ((flag1 & 0xC0) == 0x40) - { - // 3D on top, normal/fade - - if (bldcnteffect == 1) bldcnteffect = 0; - if (!(BlendCnt & 0x0001)) bldcnteffect = 0; - if (!(WindowMask[i] & 0x20)) bldcnteffect = 0; - - BGOBJLine[i] = val2; - BGOBJLine[256+i] = ColorComposite(i, val2, val3); - BGOBJLine[512+i] = (bldcnteffect << 24) | (EVY << 8) | (val1 & 0xFF); - } - else if (((flag2 & 0xC0) == 0x40) && ((BlendCnt & 0x01C0) == 0x0140)) - { - // 3D on bottom, blending - - u32 eva, evb; - if ((flag1 & 0xC0) == 0xC0) - { - eva = flag1 & 0x1F; - evb = 16 - eva; - } - else if (((BlendCnt & target1) && (WindowMask[i] & 0x20)) || - ((flag1 & 0xC0) == 0x80)) - { - eva = EVA; - evb = EVB; - } - else - bldcnteffect = 7; - - BGOBJLine[i] = val1; - BGOBJLine[256+i] = ColorComposite(i, val1, val3); - BGOBJLine[512+i] = (bldcnteffect << 24) | (EVB << 16) | (EVA << 8) | (val2 & 0xFF); - } - else - { - // no potential 3D pixel involved - - BGOBJLine[i] = ColorComposite(i, val1, val2); - BGOBJLine[256+i] = 0; - BGOBJLine[512+i] = 0x07000000; - } - } - } - else - { - for (int i = 0; i < 256; i++) - { - u32 val1 = BGOBJLine[i]; - u32 val2 = BGOBJLine[256+i]; - - BGOBJLine[i] = ColorComposite(i, val1, val2); - BGOBJLine[256+i] = 0; - BGOBJLine[512+i] = 0x07000000; - } - } + data = GPU::VRAMFlat_BBG; + mask = 0x1FFFF; } - - if (BGMosaicY >= BGMosaicYMax) - { - BGMosaicY = 0; - BGMosaicYMax 
= BGMosaicSize[1]; - } - else - BGMosaicY++; - - /*if (OBJMosaicY >= OBJMosaicYMax) - { - OBJMosaicY = 0; - OBJMosaicYMax = OBJMosaicSize[1]; - } - else - OBJMosaicY++;*/ } - -void GPU2D::DrawPixel_Normal(u32* dst, u16 color, u32 flag) +void GPU2D::GetOBJVRAM(u8*& data, u32& mask) { - u8 r = (color & 0x001F) << 1; - u8 g = (color & 0x03E0) >> 4; - u8 b = (color & 0x7C00) >> 9; - //g |= ((color & 0x8000) >> 15); - - *(dst+256) = *dst; - *dst = r | (g << 8) | (b << 16) | flag; -} - -void GPU2D::DrawPixel_Accel(u32* dst, u16 color, u32 flag) -{ - u8 r = (color & 0x001F) << 1; - u8 g = (color & 0x03E0) >> 4; - u8 b = (color & 0x7C00) >> 9; - - *(dst+512) = *(dst+256); - *(dst+256) = *dst; - *dst = r | (g << 8) | (b << 16) | flag; -} - -void GPU2D::DrawBG_3D() -{ - u16 xoff = BGXPos[0]; - int i = 0; - int iend = 256; - - if (xoff & 0x100) + if (Num == 0) { - i = (0x100 - (xoff & 0xFF)); - xoff += i; - } - if ((xoff - i + iend - 1) & 0x100) - { - iend -= (xoff & 0xFF); - } - - if (Accelerated) - { - for (; i < iend; i++) - { - int pos = xoff++; - - if (!(WindowMask[i] & 0x01)) continue; - - BGOBJLine[i+512] = BGOBJLine[i+256]; - BGOBJLine[i+256] = BGOBJLine[i]; - BGOBJLine[i] = 0x40000000 | pos; // 3D-layer placeholder - } + data = GPU::VRAMFlat_AOBJ; + mask = 0x3FFFF; } else { - for (; i < iend; i++) - { - u32 c = _3DLine[xoff]; - xoff++; - - if ((c >> 24) == 0) continue; - if (!(WindowMask[i] & 0x01)) continue; - - BGOBJLine[i+256] = BGOBJLine[i]; - BGOBJLine[i] = c | 0x40000000; - } - } -} - -template -void GPU2D::DrawBG_Text(u32 line, u32 bgnum) -{ - u16 bgcnt = BGCnt[bgnum]; - - u32 tilesetaddr, tilemapaddr; - u16* pal; - u32 extpal, extpalslot; - - u16 xoff = BGXPos[bgnum]; - u16 yoff = BGYPos[bgnum] + line; - - if (bgcnt & 0x0040) - { - // vertical mosaic - yoff -= BGMosaicY; - } - - u32 widexmask = (bgcnt & 0x4000) ? 0x100 : 0; - - extpal = (DispCnt & 0x40000000); - if (extpal) extpalslot = ((bgnum<2) && (bgcnt&0x2000)) ? 
(2+bgnum) : bgnum; - - if (Num) - { - tilesetaddr = 0x06200000 + ((bgcnt & 0x003C) << 12); - tilemapaddr = 0x06200000 + ((bgcnt & 0x1F00) << 3); - - pal = (u16*)&GPU::Palette[0x400]; - } - else - { - tilesetaddr = 0x06000000 + ((DispCnt & 0x07000000) >> 8) + ((bgcnt & 0x003C) << 12); - tilemapaddr = 0x06000000 + ((DispCnt & 0x38000000) >> 11) + ((bgcnt & 0x1F00) << 3); - - pal = (u16*)&GPU::Palette[0]; - } - - // adjust Y position in tilemap - if (bgcnt & 0x8000) - { - tilemapaddr += ((yoff & 0x1F8) << 3); - if (bgcnt & 0x4000) - tilemapaddr += ((yoff & 0x100) << 3); - } - else - tilemapaddr += ((yoff & 0xF8) << 3); - - u16 curtile; - u16* curpal; - u32 pixelsaddr; - u8 color; - u32 lastxpos; - - if (bgcnt & 0x0080) - { - // 256-color - - // preload shit as needed - if ((xoff & 0x7) || mosaic) - { - curtile = GPU::ReadVRAM_BG(tilemapaddr + ((xoff & 0xF8) >> 2) + ((xoff & widexmask) << 3)); - - if (extpal) curpal = GetBGExtPal(extpalslot, curtile>>12); - else curpal = pal; - - pixelsaddr = tilesetaddr + ((curtile & 0x03FF) << 6) - + (((curtile & 0x0800) ? (7-(yoff&0x7)) : (yoff&0x7)) << 3); - } - - if (mosaic) lastxpos = xoff; - - for (int i = 0; i < 256; i++) - { - u32 xpos; - if (mosaic) xpos = xoff - CurBGXMosaicTable[i]; - else xpos = xoff; - - if ((!mosaic && (!(xpos & 0x7))) || - (mosaic && ((xpos >> 3) != (lastxpos >> 3)))) - { - // load a new tile - curtile = GPU::ReadVRAM_BG(tilemapaddr + ((xpos & 0xF8) >> 2) + ((xpos & widexmask) << 3)); - - if (extpal) curpal = GetBGExtPal(extpalslot, curtile>>12); - else curpal = pal; - - pixelsaddr = tilesetaddr + ((curtile & 0x03FF) << 6) - + (((curtile & 0x0800) ? 
(7-(yoff&0x7)) : (yoff&0x7)) << 3); - - if (mosaic) lastxpos = xpos; - } - - // draw pixel - if (WindowMask[i] & (1<(pixelsaddr + tilexoff); - - if (color) - DrawPixel(&BGOBJLine[i], curpal[color], 0x01000000<(tilemapaddr + ((xoff & 0xF8) >> 2) + ((xoff & widexmask) << 3)); - curpal = pal + ((curtile & 0xF000) >> 8); - pixelsaddr = tilesetaddr + ((curtile & 0x03FF) << 5) - + (((curtile & 0x0800) ? (7-(yoff&0x7)) : (yoff&0x7)) << 2); - } - - if (mosaic) lastxpos = xoff; - - for (int i = 0; i < 256; i++) - { - u32 xpos; - if (mosaic) xpos = xoff - CurBGXMosaicTable[i]; - else xpos = xoff; - - if ((!mosaic && (!(xpos & 0x7))) || - (mosaic && ((xpos >> 3) != (lastxpos >> 3)))) - { - // load a new tile - curtile = GPU::ReadVRAM_BG(tilemapaddr + ((xpos & 0xF8) >> 2) + ((xpos & widexmask) << 3)); - curpal = pal + ((curtile & 0xF000) >> 8); - pixelsaddr = tilesetaddr + ((curtile & 0x03FF) << 5) - + (((curtile & 0x0800) ? (7-(yoff&0x7)) : (yoff&0x7)) << 2); - - if (mosaic) lastxpos = xpos; - } - - // draw pixel - if (WindowMask[i] & (1<(pixelsaddr + (tilexoff >> 1)) >> 4; - } - else - { - color = GPU::ReadVRAM_BG(pixelsaddr + (tilexoff >> 1)) & 0x0F; - } - - if (color) - DrawPixel(&BGOBJLine[i], curpal[color], 0x01000000< -void GPU2D::DrawBG_Affine(u32 line, u32 bgnum) -{ - u16 bgcnt = BGCnt[bgnum]; - - u32 tilesetaddr, tilemapaddr; - u16* pal; - - u32 coordmask; - u32 yshift; - switch (bgcnt & 0xC000) - { - case 0x0000: coordmask = 0x07800; yshift = 7; break; - case 0x4000: coordmask = 0x0F800; yshift = 8; break; - case 0x8000: coordmask = 0x1F800; yshift = 9; break; - case 0xC000: coordmask = 0x3F800; yshift = 10; break; - } - - u32 overflowmask; - if (bgcnt & 0x2000) overflowmask = 0; - else overflowmask = ~(coordmask | 0x7FF); - - s16 rotA = BGRotA[bgnum-2]; - s16 rotB = BGRotB[bgnum-2]; - s16 rotC = BGRotC[bgnum-2]; - s16 rotD = BGRotD[bgnum-2]; - - s32 rotX = BGXRefInternal[bgnum-2]; - s32 rotY = BGYRefInternal[bgnum-2]; - - if (bgcnt & 0x0040) - { - // vertical 
mosaic - rotX -= (BGMosaicY * rotB); - rotY -= (BGMosaicY * rotD); - } - - if (Num) - { - tilesetaddr = 0x06200000 + ((bgcnt & 0x003C) << 12); - tilemapaddr = 0x06200000 + ((bgcnt & 0x1F00) << 3); - - pal = (u16*)&GPU::Palette[0x400]; - } - else - { - tilesetaddr = 0x06000000 + ((DispCnt & 0x07000000) >> 8) + ((bgcnt & 0x003C) << 12); - tilemapaddr = 0x06000000 + ((DispCnt & 0x38000000) >> 11) + ((bgcnt & 0x1F00) << 3); - - pal = (u16*)&GPU::Palette[0]; - } - - u16 curtile; - u8 color; - - yshift -= 3; - - for (int i = 0; i < 256; i++) - { - if (WindowMask[i] & (1<(tilemapaddr + ((((finalY & coordmask) >> 11) << yshift) + ((finalX & coordmask) >> 11))); - - // draw pixel - u32 tilexoff = (finalX >> 8) & 0x7; - u32 tileyoff = (finalY >> 8) & 0x7; - - color = GPU::ReadVRAM_BG(tilesetaddr + (curtile << 6) + (tileyoff << 3) + tilexoff); - - if (color) - DrawPixel(&BGOBJLine[i], pal[color], 0x01000000< -void GPU2D::DrawBG_Extended(u32 line, u32 bgnum) -{ - u16 bgcnt = BGCnt[bgnum]; - - u32 tilesetaddr, tilemapaddr; - u16* pal; - u32 extpal; - - extpal = (DispCnt & 0x40000000); - - s16 rotA = BGRotA[bgnum-2]; - s16 rotB = BGRotB[bgnum-2]; - s16 rotC = BGRotC[bgnum-2]; - s16 rotD = BGRotD[bgnum-2]; - - s32 rotX = BGXRefInternal[bgnum-2]; - s32 rotY = BGYRefInternal[bgnum-2]; - - if (bgcnt & 0x0040) - { - // vertical mosaic - rotX -= (BGMosaicY * rotB); - rotY -= (BGMosaicY * rotD); - } - - if (bgcnt & 0x0080) - { - // bitmap modes - - u32 xmask, ymask; - u32 yshift; - switch (bgcnt & 0xC000) - { - case 0x0000: xmask = 0x07FFF; ymask = 0x07FFF; yshift = 7; break; - case 0x4000: xmask = 0x0FFFF; ymask = 0x0FFFF; yshift = 8; break; - case 0x8000: xmask = 0x1FFFF; ymask = 0x0FFFF; yshift = 9; break; - case 0xC000: xmask = 0x1FFFF; ymask = 0x1FFFF; yshift = 9; break; - } - - u32 ofxmask, ofymask; - if (bgcnt & 0x2000) - { - ofxmask = 0; - ofymask = 0; - } - else - { - ofxmask = ~xmask; - ofymask = ~ymask; - } - - if (Num) tilemapaddr = 0x06200000 + ((bgcnt & 0x1F00) << 6); - 
else tilemapaddr = 0x06000000 + ((bgcnt & 0x1F00) << 6); - - if (bgcnt & 0x0004) - { - // direct color bitmap - - u16 color; - - for (int i = 0; i < 256; i++) - { - if (WindowMask[i] & (1<(tilemapaddr + (((((finalY & ymask) >> 8) << yshift) + ((finalX & xmask) >> 8)) << 1)); - - if (color & 0x8000) - DrawPixel(&BGOBJLine[i], color, 0x01000000<(tilemapaddr + (((finalY & ymask) >> 8) << yshift) + ((finalX & xmask) >> 8)); - - if (color) - DrawPixel(&BGOBJLine[i], pal[color], 0x01000000<> 8) + ((bgcnt & 0x003C) << 12); - tilemapaddr = 0x06000000 + ((DispCnt & 0x38000000) >> 11) + ((bgcnt & 0x1F00) << 3); - - pal = (u16*)&GPU::Palette[0]; - } - - u16 curtile; - u16* curpal; - u8 color; - - yshift -= 3; - - for (int i = 0; i < 256; i++) - { - if (WindowMask[i] & (1<(tilemapaddr + (((((finalY & coordmask) >> 11) << yshift) + ((finalX & coordmask) >> 11)) << 1)); - - if (extpal) curpal = GetBGExtPal(bgnum, curtile>>12); - else curpal = pal; - - // draw pixel - u32 tilexoff = (finalX >> 8) & 0x7; - u32 tileyoff = (finalY >> 8) & 0x7; - - if (curtile & 0x0400) tilexoff = 7-tilexoff; - if (curtile & 0x0800) tileyoff = 7-tileyoff; - - color = GPU::ReadVRAM_BG(tilesetaddr + ((curtile & 0x03FF) << 6) + (tileyoff << 3) + tilexoff); - - if (color) - DrawPixel(&BGOBJLine[i], curpal[color], 0x01000000< -void GPU2D::DrawBG_Large(u32 line) // BG is always BG2 -{ - u16 bgcnt = BGCnt[2]; - - u32 tilesetaddr, tilemapaddr; - u16* pal; - - // large BG sizes: - // 0: 512x1024 - // 1: 1024x512 - // 2: 512x256 - // 3: 512x512 - u32 xmask, ymask; - u32 yshift; - switch (bgcnt & 0xC000) - { - case 0x0000: xmask = 0x1FFFF; ymask = 0x3FFFF; yshift = 9; break; - case 0x4000: xmask = 0x3FFFF; ymask = 0x1FFFF; yshift = 10; break; - case 0x8000: xmask = 0x1FFFF; ymask = 0x0FFFF; yshift = 9; break; - case 0xC000: xmask = 0x1FFFF; ymask = 0x1FFFF; yshift = 9; break; - } - - u32 ofxmask, ofymask; - if (bgcnt & 0x2000) - { - ofxmask = 0; - ofymask = 0; - } - else - { - ofxmask = ~xmask; - ofymask = 
~ymask; - } - - s16 rotA = BGRotA[0]; - s16 rotB = BGRotB[0]; - s16 rotC = BGRotC[0]; - s16 rotD = BGRotD[0]; - - s32 rotX = BGXRefInternal[0]; - s32 rotY = BGYRefInternal[0]; - - if (bgcnt & 0x0040) - { - // vertical mosaic - rotX -= (BGMosaicY * rotB); - rotY -= (BGMosaicY * rotD); - } - - if (Num) tilemapaddr = 0x06200000; - else tilemapaddr = 0x06000000; - - // 256-color bitmap - - if (Num) pal = (u16*)&GPU::Palette[0x400]; - else pal = (u16*)&GPU::Palette[0]; - - u8 color; - - for (int i = 0; i < 256; i++) - { - if (WindowMask[i] & (1<<2)) - { - s32 finalX, finalY; - if (mosaic) - { - int im = CurBGXMosaicTable[i]; - finalX = rotX - (im * rotA); - finalY = rotY - (im * rotC); - } - else - { - finalX = rotX; - finalY = rotY; - } - - if (!(finalX & ofxmask) && !(finalY & ofymask)) - { - color = GPU::ReadVRAM_BG(tilemapaddr + (((finalY & ymask) >> 8) << yshift) + ((finalX & xmask) >> 8)); - - if (color) - DrawPixel(&BGOBJLine[i], pal[color], 0x01000000<<2); - } - } - - rotX += rotA; - rotY += rotC; - } - - BGXRefInternal[0] += rotB; - BGYRefInternal[0] += rotD; -} - -// OBJ line buffer: -// * bit0-15: color (bit15=1: direct color, bit15=0: palette index, bit12=0 to indicate extpal) -// * bit16-17: BG-relative priority -// * bit18: non-transparent sprite pixel exists here -// * bit19: X mosaic should be applied here -// * bit24-31: compositor flags - -void GPU2D::ApplySpriteMosaicX() -{ - // apply X mosaic if needed - // X mosaic for sprites is applied after all sprites are rendered - - if (OBJMosaicSize[0] == 0) return; - - u32 lastcolor = OBJLine[0]; - - for (u32 i = 1; i < 256; i++) - { - if (!(OBJLine[i] & 0x100000)) - { - // not a mosaic'd sprite pixel - continue; - } - - if ((OBJIndex[i] != OBJIndex[i-1]) || (CurOBJXMosaicTable[i] == 0)) - lastcolor = OBJLine[i]; - else - OBJLine[i] = lastcolor; - } -} - -void GPU2D::InterleaveSprites(u32 prio) -{ - u16* pal = (u16*)&GPU::Palette[Num ? 
0x600 : 0x200]; - - if (DispCnt & 0x80000000) - { - u16* extpal = GetOBJExtPal(); - - for (u32 i = 0; i < 256; i++) - { - if ((OBJLine[i] & 0x70000) != prio) continue; - if (!(WindowMask[i] & 0x10)) continue; - - u16 color; - u32 pixel = OBJLine[i]; - - if (pixel & 0x8000) - color = pixel & 0x7FFF; - else if (pixel & 0x1000) - color = pal[pixel & 0xFF]; - else - color = extpal[pixel & 0xFFF]; - - DrawPixel(&BGOBJLine[i], color, pixel & 0xFF000000); - } - } - else - { - // optimized no-extpal version - - for (u32 i = 0; i < 256; i++) - { - if ((OBJLine[i] & 0x70000) != prio) continue; - if (!(WindowMask[i] & 0x10)) continue; - - u16 color; - u32 pixel = OBJLine[i]; - - if (pixel & 0x8000) - color = pixel & 0x7FFF; - else - color = pal[pixel & 0xFF]; - - DrawPixel(&BGOBJLine[i], color, pixel & 0xFF000000); - } - } -} - -#define DoDrawSprite(type, ...) \ - if (iswin) \ - { \ - DrawSprite_##type(__VA_ARGS__); \ - } \ - else \ - { \ - DrawSprite_##type(__VA_ARGS__); \ - } - -void GPU2D::DrawSprites(u32 line) -{ - if (line == 0) - { - // reset those counters here - // TODO: find out when those are supposed to be reset - // it would make sense to reset them at the end of VBlank - // however, sprites are rendered one scanline in advance - // so they need to be reset a bit earlier - - OBJMosaicY = 0; - OBJMosaicYCount = 0; - } - - NumSprites = 0; - memset(OBJLine, 0, 256*4); - memset(OBJWindow, 0, 256); - if (!(DispCnt & 0x1000)) return; - - memset(OBJIndex, 0xFF, 256); - - u16* oam = (u16*)&GPU::OAM[Num ? 
0x400 : 0]; - - const s32 spritewidth[16] = - { - 8, 16, 8, 8, - 16, 32, 8, 8, - 32, 32, 16, 8, - 64, 64, 32, 8 - }; - const s32 spriteheight[16] = - { - 8, 8, 16, 8, - 16, 8, 32, 8, - 32, 16, 32, 8, - 64, 32, 64, 8 - }; - - for (int bgnum = 0x0C00; bgnum >= 0x0000; bgnum -= 0x0400) - { - for (int sprnum = 127; sprnum >= 0; sprnum--) - { - u16* attrib = &oam[sprnum*4]; - - if ((attrib[2] & 0x0C00) != bgnum) - continue; - - bool iswin = (((attrib[0] >> 10) & 0x3) == 2); - - u32 sprline; - if ((attrib[0] & 0x1000) && !iswin) - { - // apply Y mosaic - sprline = OBJMosaicY; - } - else - sprline = line; - - if (attrib[0] & 0x0100) - { - u32 sizeparam = (attrib[0] >> 14) | ((attrib[1] & 0xC000) >> 12); - s32 width = spritewidth[sizeparam]; - s32 height = spriteheight[sizeparam]; - s32 boundwidth = width; - s32 boundheight = height; - - if (attrib[0] & 0x0200) - { - boundwidth <<= 1; - boundheight <<= 1; - } - - u32 ypos = attrib[0] & 0xFF; - ypos = (sprline - ypos) & 0xFF; - if (ypos >= (u32)boundheight) - continue; - - s32 xpos = (s32)(attrib[1] << 23) >> 23; - if (xpos <= -boundwidth) - continue; - - u32 rotparamgroup = (attrib[1] >> 9) & 0x1F; - - DoDrawSprite(Rotscale, sprnum, boundwidth, boundheight, width, height, xpos, ypos); - - NumSprites++; - } - else - { - if (attrib[0] & 0x0200) - continue; - - u32 sizeparam = (attrib[0] >> 14) | ((attrib[1] & 0xC000) >> 12); - s32 width = spritewidth[sizeparam]; - s32 height = spriteheight[sizeparam]; - - u32 ypos = attrib[0] & 0xFF; - ypos = (sprline - ypos) & 0xFF; - if (ypos >= (u32)height) - continue; - - s32 xpos = (s32)(attrib[1] << 23) >> 23; - if (xpos <= -width) - continue; - - DoDrawSprite(Normal, sprnum, width, height, xpos, ypos); - - NumSprites++; - } - } - } -} - -template -void GPU2D::DrawSprite_Rotscale(u32 num, u32 boundwidth, u32 boundheight, u32 width, u32 height, s32 xpos, s32 ypos) -{ - u16* oam = (u16*)&GPU::OAM[Num ? 
0x400 : 0]; - u16* attrib = &oam[num * 4]; - u16* rotparams = &oam[(((attrib[1] >> 9) & 0x1F) * 16) + 3]; - - u32 pixelattr = ((attrib[2] & 0x0C00) << 6) | 0xC0000; - u32 tilenum = attrib[2] & 0x03FF; - u32 spritemode = window ? 0 : ((attrib[0] >> 10) & 0x3); - - u32 ytilefactor; - - s32 centerX = boundwidth >> 1; - s32 centerY = boundheight >> 1; - - if ((attrib[0] & 0x1000) && !window) - { - // apply Y mosaic - pixelattr |= 0x100000; - } - - u32 xoff; - if (xpos >= 0) - { - xoff = 0; - if ((xpos+boundwidth) > 256) - boundwidth = 256-xpos; - } - else - { - xoff = -xpos; - xpos = 0; - } - - s16 rotA = (s16)rotparams[0]; - s16 rotB = (s16)rotparams[4]; - s16 rotC = (s16)rotparams[8]; - s16 rotD = (s16)rotparams[12]; - - s32 rotX = ((xoff-centerX) * rotA) + ((ypos-centerY) * rotB) + (width << 7); - s32 rotY = ((xoff-centerX) * rotC) + ((ypos-centerY) * rotD) + (height << 7); - - width <<= 8; - height <<= 8; - - u16 color = 0; // transparent in all cases - - if (spritemode == 3) - { - u32 alpha = attrib[2] >> 12; - if (!alpha) return; - alpha++; - - pixelattr |= (0xC0000000 | (alpha << 24)); - - if (DispCnt & 0x40) - { - if (DispCnt & 0x20) - { - // 'reserved' - // draws nothing - - return; - } - else - { - tilenum <<= (7 + ((DispCnt >> 22) & 0x1)); - ytilefactor = ((width >> 8) * 2); - } - } - else - { - if (DispCnt & 0x20) - { - tilenum = ((tilenum & 0x01F) << 4) + ((tilenum & 0x3E0) << 7); - ytilefactor = (256 * 2); - } - else - { - tilenum = ((tilenum & 0x00F) << 4) + ((tilenum & 0x3F0) << 7); - ytilefactor = (128 * 2); - } - } - - u32 pixelsaddr = (Num ? 
0x06600000 : 0x06400000) + tilenum; - - for (; xoff < boundwidth;) - { - if ((u32)rotX < width && (u32)rotY < height) - { - color = GPU::ReadVRAM_OBJ(pixelsaddr + ((rotY >> 8) * ytilefactor) + ((rotX >> 8) << 1)); - - if (color & 0x8000) - { - if (window) OBJWindow[xpos] = 1; - else { OBJLine[xpos] = color | pixelattr; OBJIndex[xpos] = num; } - } - else if (!window) - { - if (OBJLine[xpos] == 0) - { - OBJLine[xpos] = pixelattr & 0x180000; - OBJIndex[xpos] = num; - } - } - } - - rotX += rotA; - rotY += rotC; - xoff++; - xpos++; - } - } - else - { - if (DispCnt & 0x10) - { - tilenum <<= ((DispCnt >> 20) & 0x3); - ytilefactor = (width >> 11) << ((attrib[0] & 0x2000) ? 1:0); - } - else - { - ytilefactor = 0x20; - } - - if (spritemode == 1) pixelattr |= 0x80000000; - else pixelattr |= 0x10000000; - - if (attrib[0] & 0x2000) - { - // 256-color - tilenum <<= 5; - ytilefactor <<= 5; - u32 pixelsaddr = (Num ? 0x06600000 : 0x06400000) + tilenum; - - if (!window) - { - if (!(DispCnt & 0x80000000)) - pixelattr |= 0x1000; - else - pixelattr |= ((attrib[2] & 0xF000) >> 4); - } - - for (; xoff < boundwidth;) - { - if ((u32)rotX < width && (u32)rotY < height) - { - color = GPU::ReadVRAM_OBJ(pixelsaddr + ((rotY>>11)*ytilefactor) + ((rotY&0x700)>>5) + ((rotX>>11)*64) + ((rotX&0x700)>>8)); - - if (color) - { - if (window) OBJWindow[xpos] = 1; - else { OBJLine[xpos] = color | pixelattr; OBJIndex[xpos] = num; } - } - else if (!window) - { - if (OBJLine[xpos] == 0) - { - OBJLine[xpos] = pixelattr & 0x180000; - OBJIndex[xpos] = num; - } - } - } - - rotX += rotA; - rotY += rotC; - xoff++; - xpos++; - } - } - else - { - // 16-color - tilenum <<= 5; - ytilefactor <<= 5; - u32 pixelsaddr = (Num ? 
0x06600000 : 0x06400000) + tilenum; - - if (!window) - { - pixelattr |= 0x1000; - pixelattr |= ((attrib[2] & 0xF000) >> 8); - } - - for (; xoff < boundwidth;) - { - if ((u32)rotX < width && (u32)rotY < height) - { - color = GPU::ReadVRAM_OBJ(pixelsaddr + ((rotY>>11)*ytilefactor) + ((rotY&0x700)>>6) + ((rotX>>11)*32) + ((rotX&0x700)>>9)); - if (rotX & 0x100) - color >>= 4; - else - color &= 0x0F; - - if (color) - { - if (window) OBJWindow[xpos] = 1; - else { OBJLine[xpos] = color | pixelattr; OBJIndex[xpos] = num; } - } - else if (!window) - { - if (OBJLine[xpos] == 0) - { - OBJLine[xpos] = pixelattr & 0x180000; - OBJIndex[xpos] = num; - } - } - } - - rotX += rotA; - rotY += rotC; - xoff++; - xpos++; - } - } - } -} - -template -void GPU2D::DrawSprite_Normal(u32 num, u32 width, u32 height, s32 xpos, s32 ypos) -{ - u16* oam = (u16*)&GPU::OAM[Num ? 0x400 : 0]; - u16* attrib = &oam[num * 4]; - - u32 pixelattr = ((attrib[2] & 0x0C00) << 6) | 0xC0000; - u32 tilenum = attrib[2] & 0x03FF; - u32 spritemode = window ? 
0 : ((attrib[0] >> 10) & 0x3); - - u32 wmask = width - 8; // really ((width - 1) & ~0x7) - - if ((attrib[0] & 0x1000) && !window) - { - // apply Y mosaic - pixelattr |= 0x100000; - } - - // yflip - if (attrib[1] & 0x2000) - ypos = height-1 - ypos; - - u32 xoff; - u32 xend = width; - if (xpos >= 0) - { - xoff = 0; - if ((xpos+xend) > 256) - xend = 256-xpos; - } - else - { - xoff = -xpos; - xpos = 0; - } - - u16 color = 0; // transparent in all cases - - if (spritemode == 3) - { - // bitmap sprite - - u32 alpha = attrib[2] >> 12; - if (!alpha) return; - alpha++; - - pixelattr |= (0xC0000000 | (alpha << 24)); - - if (DispCnt & 0x40) - { - if (DispCnt & 0x20) - { - // 'reserved' - // draws nothing - - return; - } - else - { - tilenum <<= (7 + ((DispCnt >> 22) & 0x1)); - tilenum += (ypos * width * 2); - } - } - else - { - if (DispCnt & 0x20) - { - tilenum = ((tilenum & 0x01F) << 4) + ((tilenum & 0x3E0) << 7); - tilenum += (ypos * 256 * 2); - } - else - { - tilenum = ((tilenum & 0x00F) << 4) + ((tilenum & 0x3F0) << 7); - tilenum += (ypos * 128 * 2); - } - } - - u32 pixelsaddr = (Num ? 0x06600000 : 0x06400000) + tilenum; - s32 pixelstride; - - if (attrib[1] & 0x1000) // xflip - { - pixelsaddr += (width-1 << 1); - pixelsaddr -= (xoff << 1); - pixelstride = -2; - } - else - { - pixelsaddr += (xoff << 1); - pixelstride = 2; - } - - for (; xoff < xend;) - { - color = GPU::ReadVRAM_OBJ(pixelsaddr); - - pixelsaddr += pixelstride; - - if (color & 0x8000) - { - if (window) OBJWindow[xpos] = 1; - else { OBJLine[xpos] = color | pixelattr; OBJIndex[xpos] = num; } - } - else if (!window) - { - if (OBJLine[xpos] == 0) - { - OBJLine[xpos] = pixelattr & 0x180000; - OBJIndex[xpos] = num; - } - } - - xoff++; - xpos++; - } - } - else - { - if (DispCnt & 0x10) - { - tilenum <<= ((DispCnt >> 20) & 0x3); - tilenum += ((ypos >> 3) * (width >> 3)) << ((attrib[0] & 0x2000) ? 
1:0); - } - else - { - tilenum += ((ypos >> 3) * 0x20); - } - - if (spritemode == 1) pixelattr |= 0x80000000; - else pixelattr |= 0x10000000; - - if (attrib[0] & 0x2000) - { - // 256-color - tilenum <<= 5; - u32 pixelsaddr = (Num ? 0x06600000 : 0x06400000) + tilenum; - pixelsaddr += ((ypos & 0x7) << 3); - s32 pixelstride; - - if (!window) - { - if (!(DispCnt & 0x80000000)) - pixelattr |= 0x1000; - else - pixelattr |= ((attrib[2] & 0xF000) >> 4); - } - - if (attrib[1] & 0x1000) // xflip - { - pixelsaddr += (((width-1) & wmask) << 3); - pixelsaddr += ((width-1) & 0x7); - pixelsaddr -= ((xoff & wmask) << 3); - pixelsaddr -= (xoff & 0x7); - pixelstride = -1; - } - else - { - pixelsaddr += ((xoff & wmask) << 3); - pixelsaddr += (xoff & 0x7); - pixelstride = 1; - } - - for (; xoff < xend;) - { - color = GPU::ReadVRAM_OBJ(pixelsaddr); - - pixelsaddr += pixelstride; - - if (color) - { - if (window) OBJWindow[xpos] = 1; - else { OBJLine[xpos] = color | pixelattr; OBJIndex[xpos] = num; } - } - else if (!window) - { - if (OBJLine[xpos] == 0) - { - OBJLine[xpos] = pixelattr & 0x180000; - OBJIndex[xpos] = num; - } - } - - xoff++; - xpos++; - if (!(xoff & 0x7)) pixelsaddr += (56 * pixelstride); - } - } - else - { - // 16-color - tilenum <<= 5; - u32 pixelsaddr = (Num ? 0x06600000 : 0x06400000) + tilenum; - pixelsaddr += ((ypos & 0x7) << 2); - s32 pixelstride; - - if (!window) - { - pixelattr |= 0x1000; - pixelattr |= ((attrib[2] & 0xF000) >> 8); - } - - // TODO: optimize VRAM access!! - // TODO: do xflip better? 
the 'two pixels per byte' thing makes it a bit shitty - - if (attrib[1] & 0x1000) // xflip - { - pixelsaddr += (((width-1) & wmask) << 2); - pixelsaddr += (((width-1) & 0x7) >> 1); - pixelsaddr -= ((xoff & wmask) << 2); - pixelsaddr -= ((xoff & 0x7) >> 1); - pixelstride = -1; - } - else - { - pixelsaddr += ((xoff & wmask) << 2); - pixelsaddr += ((xoff & 0x7) >> 1); - pixelstride = 1; - } - - for (; xoff < xend;) - { - if (attrib[1] & 0x1000) - { - if (xoff & 0x1) { color = GPU::ReadVRAM_OBJ(pixelsaddr) & 0x0F; pixelsaddr--; } - else color = GPU::ReadVRAM_OBJ(pixelsaddr) >> 4; - } - else - { - if (xoff & 0x1) { color = GPU::ReadVRAM_OBJ(pixelsaddr) >> 4; pixelsaddr++; } - else color = GPU::ReadVRAM_OBJ(pixelsaddr) & 0x0F; - } - - if (color) - { - if (window) OBJWindow[xpos] = 1; - else { OBJLine[xpos] = color | pixelattr; OBJIndex[xpos] = num; } - } - else if (!window) - { - if (OBJLine[xpos] == 0) - { - OBJLine[xpos] = pixelattr & 0x180000; - OBJIndex[xpos] = num; - } - } - - xoff++; - xpos++; - if (!(xoff & 0x7)) pixelsaddr += ((attrib[1] & 0x1000) ? 
-28 : 28); - } - } + data = GPU::VRAMFlat_BOBJ; + mask = 0x1FFFF; } } diff --git a/src/GPU2D.h b/src/GPU2D.h index 521adf01..0f59ae36 100644 --- a/src/GPU2D.h +++ b/src/GPU2D.h @@ -19,11 +19,14 @@ #ifndef GPU2D_H #define GPU2D_H +#include "types.h" +#include "Savestate.h" + class GPU2D { public: GPU2D(u32 num); - ~GPU2D(); + virtual ~GPU2D() {} void Reset(); @@ -31,7 +34,7 @@ public: void SetEnabled(bool enable) { Enabled = enable; } void SetFramebuffer(u32* buf); - void SetRenderSettings(bool accel); + virtual void SetRenderSettings(bool accel) = 0; u8 Read8(u32 addr); u16 Read16(u32 addr); @@ -52,36 +55,24 @@ public: void SampleFIFO(u32 offset, u32 num); - void DrawScanline(u32 line); - void DrawSprites(u32 line); + virtual void DrawScanline(u32 line) = 0; + virtual void DrawSprites(u32 line) = 0; void VBlank(); - void VBlankEnd(); + virtual void VBlankEnd(); void CheckWindows(u32 line); - void BGExtPalDirty(u32 base); - void OBJExtPalDirty(); - u16* GetBGExtPal(u32 slot, u32 pal); u16* GetOBJExtPal(); -private: + void GetBGVRAM(u8*& data, u32& mask); + void GetOBJVRAM(u8*& data, u32& mask); + +protected: u32 Num; bool Enabled; u32* Framebuffer; - bool Accelerated; - - u32 BGOBJLine[256*3] __attribute__((aligned (8))); - u32* _3DLine; - - u8 WindowMask[256] __attribute__((aligned (8))); - u32 OBJLine[256] __attribute__((aligned (8))); - u8 OBJWindow[256] __attribute__((aligned (8))); - u8 OBJIndex[256] __attribute__((aligned (8))); - - u32 NumSprites; - u16 DispFIFO[16]; u32 DispFIFOReadPtr; u32 DispFIFOWritePtr; @@ -114,32 +105,61 @@ private: u8 BGMosaicY, BGMosaicYMax; u8 OBJMosaicYCount, OBJMosaicY, OBJMosaicYMax; - u8 MosaicTable[16][256]; - u8* CurBGXMosaicTable; - u8* CurOBJXMosaicTable; - u16 BlendCnt; u16 BlendAlpha; u8 EVA, EVB; u8 EVY; + bool CaptureLatch; u32 CaptureCnt; u16 MasterBrightness; - u16 BGExtPalCache[4][16*256]; - u16 OBJExtPalCache[16*256]; - u32 BGExtPalStatus[4]; - u32 OBJExtPalStatus; + u8 WindowMask[256] __attribute__((aligned (8))); + 
u8 OBJWindow[256] __attribute__((aligned (8))); + void UpdateMosaicCounters(u32 line); + void CalculateWindowMask(u32 line); + + virtual void MosaicXSizeChanged() = 0; +}; + +class GPU2D_Soft : public GPU2D +{ +public: + GPU2D_Soft(u32 num); + ~GPU2D_Soft() override {} + + void SetRenderSettings(bool accel) override; + + void DrawScanline(u32 line) override; + void DrawSprites(u32 line) override; + void VBlankEnd() override; + +protected: + void MosaicXSizeChanged() override; + +private: + bool Accelerated; + + u32 BGOBJLine[256*3] __attribute__((aligned (8))); + u32* _3DLine; + + u32 OBJLine[256] __attribute__((aligned (8))); + u8 OBJIndex[256] __attribute__((aligned (8))); + + u32 NumSprites; + + u8 MosaicTable[16][256]; + u8* CurBGXMosaicTable; + u8* CurOBJXMosaicTable; + u32 ColorBlend4(u32 val1, u32 val2, u32 eva, u32 evb); u32 ColorBlend5(u32 val1, u32 val2); u32 ColorBrightnessUp(u32 val, u32 factor); u32 ColorBrightnessDown(u32 val, u32 factor); u32 ColorComposite(int i, u32 val1, u32 val2); - void UpdateMosaicCounters(u32 line); - template void DrawScanlineBGMode(u32 line); void DrawScanlineBGMode6(u32 line); void DrawScanlineBGMode7(u32 line); @@ -147,22 +167,22 @@ private: static void DrawPixel_Normal(u32* dst, u16 color, u32 flag); static void DrawPixel_Accel(u32* dst, u16 color, u32 flag); - void (*DrawPixel)(u32* dst, u16 color, u32 flag); + + typedef void (*DrawPixel)(u32* dst, u16 color, u32 flag); void DrawBG_3D(); - template void DrawBG_Text(u32 line, u32 bgnum); - template void DrawBG_Affine(u32 line, u32 bgnum); - template void DrawBG_Extended(u32 line, u32 bgnum); - template void DrawBG_Large(u32 line); + template void DrawBG_Text(u32 line, u32 bgnum); + template void DrawBG_Affine(u32 line, u32 bgnum); + template void DrawBG_Extended(u32 line, u32 bgnum); + template void DrawBG_Large(u32 line); void ApplySpriteMosaicX(); + template void InterleaveSprites(u32 prio); template void DrawSprite_Rotscale(u32 num, u32 boundwidth, u32 boundheight, u32 
width, u32 height, s32 xpos, s32 ypos); template void DrawSprite_Normal(u32 num, u32 width, u32 height, s32 xpos, s32 ypos); void DoCapture(u32 line, u32 width); - - void CalculateWindowMask(u32 line); }; #endif diff --git a/src/GPU2D_Soft.cpp b/src/GPU2D_Soft.cpp new file mode 100644 index 00000000..c686bad7 --- /dev/null +++ b/src/GPU2D_Soft.cpp @@ -0,0 +1,2215 @@ +#include "GPU2D.h" +#include "GPU.h" + +GPU2D_Soft::GPU2D_Soft(u32 num) + : GPU2D(num) +{ + // initialize mosaic table + for (int m = 0; m < 16; m++) + { + for (int x = 0; x < 256; x++) + { + int offset = x % (m+1); + MosaicTable[m][x] = offset; + } + } +} + +void GPU2D_Soft::SetRenderSettings(bool accel) +{ + Accelerated = accel; +} + +u32 GPU2D_Soft::ColorBlend4(u32 val1, u32 val2, u32 eva, u32 evb) +{ + u32 r = (((val1 & 0x00003F) * eva) + ((val2 & 0x00003F) * evb)) >> 4; + u32 g = ((((val1 & 0x003F00) * eva) + ((val2 & 0x003F00) * evb)) >> 4) & 0x007F00; + u32 b = ((((val1 & 0x3F0000) * eva) + ((val2 & 0x3F0000) * evb)) >> 4) & 0x7F0000; + + if (r > 0x00003F) r = 0x00003F; + if (g > 0x003F00) g = 0x003F00; + if (b > 0x3F0000) b = 0x3F0000; + + return r | g | b | 0xFF000000; +} + +u32 GPU2D_Soft::ColorBlend5(u32 val1, u32 val2) +{ + u32 eva = ((val1 >> 24) & 0x1F) + 1; + u32 evb = 32 - eva; + + if (eva == 32) return val1; + + u32 r = (((val1 & 0x00003F) * eva) + ((val2 & 0x00003F) * evb)) >> 5; + u32 g = ((((val1 & 0x003F00) * eva) + ((val2 & 0x003F00) * evb)) >> 5) & 0x007F00; + u32 b = ((((val1 & 0x3F0000) * eva) + ((val2 & 0x3F0000) * evb)) >> 5) & 0x7F0000; + + if (eva <= 16) + { + r += 0x000001; + g += 0x000100; + b += 0x010000; + } + + if (r > 0x00003F) r = 0x00003F; + if (g > 0x003F00) g = 0x003F00; + if (b > 0x3F0000) b = 0x3F0000; + + return r | g | b | 0xFF000000; +} + +u32 GPU2D_Soft::ColorBrightnessUp(u32 val, u32 factor) +{ + u32 rb = val & 0x3F003F; + u32 g = val & 0x003F00; + + rb += ((((0x3F003F - rb) * factor) >> 4) & 0x3F003F); + g += ((((0x003F00 - g) * factor) >> 4) & 0x003F00); 
+ + return rb | g | 0xFF000000; +} + +u32 GPU2D_Soft::ColorBrightnessDown(u32 val, u32 factor) +{ + u32 rb = val & 0x3F003F; + u32 g = val & 0x003F00; + + rb -= (((rb * factor) >> 4) & 0x3F003F); + g -= (((g * factor) >> 4) & 0x003F00); + + return rb | g | 0xFF000000; +} + +u32 GPU2D_Soft::ColorComposite(int i, u32 val1, u32 val2) +{ + u32 coloreffect = 0; + u32 eva, evb; + + u32 flag1 = val1 >> 24; + u32 flag2 = val2 >> 24; + + u32 target2; + if (flag2 & 0x80) target2 = 0x1000; + else if (flag2 & 0x40) target2 = 0x0100; + else target2 = flag2 << 8; + + if ((flag1 & 0x80) && (BlendCnt & target2)) + { + // sprite blending + + coloreffect = 1; + + if (flag1 & 0x40) + { + eva = flag1 & 0x1F; + evb = 16 - eva; + } + else + { + eva = EVA; + evb = EVB; + } + } + else if ((flag1 & 0x40) && (BlendCnt & target2)) + { + // 3D layer blending + + coloreffect = 4; + } + else + { + if (flag1 & 0x80) flag1 = 0x10; + else if (flag1 & 0x40) flag1 = 0x01; + + if ((BlendCnt & flag1) && (WindowMask[i] & 0x20)) + { + coloreffect = (BlendCnt >> 6) & 0x3; + + if (coloreffect == 1) + { + if (BlendCnt & target2) + { + eva = EVA; + evb = EVB; + } + else + coloreffect = 0; + } + } + } + + switch (coloreffect) + { + case 0: return val1; + case 1: return ColorBlend4(val1, val2, eva, evb); + case 2: return ColorBrightnessUp(val1, EVY); + case 3: return ColorBrightnessDown(val1, EVY); + case 4: return ColorBlend5(val1, val2); + } + + return val1; +} + +void GPU2D_Soft::DrawScanline(u32 line) +{ + int stride = Accelerated ? 
(256*3 + 1) : 256; + u32* dst = &Framebuffer[stride * line]; + + int n3dline = line; + line = GPU::VCount; + + if (Num == 0) + { + auto bgDirty = GPU::VRAMDirty_ABG.DeriveState(GPU::VRAMMap_ABG); + GPU::MakeVRAMFlat_ABGCoherent(bgDirty); + auto bgExtPalDirty = GPU::VRAMDirty_ABGExtPal.DeriveState(GPU::VRAMMap_ABGExtPal); + GPU::MakeVRAMFlat_ABGExtPalCoherent(bgExtPalDirty); + auto objExtPalDirty = GPU::VRAMDirty_AOBJExtPal.DeriveState(&GPU::VRAMMap_AOBJExtPal); + GPU::MakeVRAMFlat_AOBJExtPalCoherent(objExtPalDirty); + } + else + { + auto bgDirty = GPU::VRAMDirty_BBG.DeriveState(GPU::VRAMMap_BBG); + GPU::MakeVRAMFlat_BBGCoherent(bgDirty); + auto bgExtPalDirty = GPU::VRAMDirty_BBGExtPal.DeriveState(GPU::VRAMMap_BBGExtPal); + GPU::MakeVRAMFlat_BBGExtPalCoherent(bgExtPalDirty); + auto objExtPalDirty = GPU::VRAMDirty_BOBJExtPal.DeriveState(&GPU::VRAMMap_BOBJExtPal); + GPU::MakeVRAMFlat_BOBJExtPalCoherent(objExtPalDirty); + } + + bool forceblank = false; + + // scanlines that end up outside of the GPU drawing range + // (as a result of writing to VCount) are filled white + if (line > 192) forceblank = true; + + // GPU B can be completely disabled by POWCNT1 + // oddly that's not the case for GPU A + if (Num && !Enabled) forceblank = true; + + if (forceblank) + { + for (int i = 0; i < 256; i++) + dst[i] = 0xFFFFFFFF; + + if (Accelerated) + { + dst[256*3] = 0; + } + return; + } + + u32 dispmode = DispCnt >> 16; + dispmode &= (Num ? 
0x1 : 0x3); + + if (Num == 0) + { + if (!Accelerated) + _3DLine = GPU3D::GetLine(n3dline); + else if ((CaptureCnt & (1<<31)) && (((CaptureCnt >> 29) & 0x3) != 1)) + { + _3DLine = GPU3D::GetLine(n3dline); + //GPU3D::GLRenderer::PrepareCaptureFrame(); + } + } + + if (line == 0 && CaptureCnt & (1 << 31)) + CaptureLatch = true; + + // always render regular graphics + DrawScanline_BGOBJ(line); + UpdateMosaicCounters(line); + + switch (dispmode) + { + case 0: // screen off + { + for (int i = 0; i < 256; i++) + dst[i] = 0x003F3F3F; + } + break; + + case 1: // regular display + { + int i = 0; + for (; i < (stride & ~1); i+=2) + *(u64*)&dst[i] = *(u64*)&BGOBJLine[i]; + } + break; + + case 2: // VRAM display + { + u32 vrambank = (DispCnt >> 18) & 0x3; + if (GPU::VRAMMap_LCDC & (1<> 4; + u8 b = (color & 0x7C00) >> 9; + + dst[i] = r | (g << 8) | (b << 16); + } + } + else + { + for (int i = 0; i < 256; i++) + { + dst[i] = 0; + } + } + } + break; + + case 3: // FIFO display + { + for (int i = 0; i < 256; i++) + { + u16 color = DispFIFOBuffer[i]; + u8 r = (color & 0x001F) << 1; + u8 g = (color & 0x03E0) >> 4; + u8 b = (color & 0x7C00) >> 9; + + dst[i] = r | (g << 8) | (b << 16); + } + } + break; + } + + // capture + if ((Num == 0) && CaptureLatch) + { + u32 capwidth, capheight; + switch ((CaptureCnt >> 20) & 0x3) + { + case 0: capwidth = 128; capheight = 128; break; + case 1: capwidth = 256; capheight = 64; break; + case 2: capwidth = 256; capheight = 128; break; + case 3: capwidth = 256; capheight = 192; break; + } + + if (line < capheight) + DoCapture(line, capwidth); + } + + if (Accelerated) + { + dst[256*3] = MasterBrightness | (DispCnt & 0x30000); + return; + } + + // master brightness + if (dispmode != 0) + { + if ((MasterBrightness >> 14) == 1) + { + // up + u32 factor = MasterBrightness & 0x1F; + if (factor > 16) factor = 16; + + for (int i = 0; i < 256; i++) + { + dst[i] = ColorBrightnessUp(dst[i], factor); + } + } + else if ((MasterBrightness >> 14) == 2) + { + // down 
+ u32 factor = MasterBrightness & 0x1F; + if (factor > 16) factor = 16; + + for (int i = 0; i < 256; i++) + { + dst[i] = ColorBrightnessDown(dst[i], factor); + } + } + } + + // convert to 32-bit BGRA + // note: 32-bit RGBA would be more straightforward, but + // BGRA seems to be more compatible (Direct2D soft, cairo...) + for (int i = 0; i < 256; i+=2) + { + u64 c = *(u64*)&dst[i]; + + u64 r = (c << 18) & 0xFC000000FC0000; + u64 g = (c << 2) & 0xFC000000FC00; + u64 b = (c >> 14) & 0xFC000000FC; + c = r | g | b; + + *(u64*)&dst[i] = c | ((c & 0x00C0C0C000C0C0C0) >> 6) | 0xFF000000FF000000; + } +} + +void GPU2D_Soft::VBlankEnd() +{ + GPU2D::VBlankEnd(); + +#ifdef OGLRENDERER_ENABLED + if (Accelerated) + { + if ((Num == 0) && (CaptureCnt & (1<<31)) && (((CaptureCnt >> 29) & 0x3) != 1)) + { + GPU3D::GLRenderer::PrepareCaptureFrame(); + } + } +#endif +} + +void GPU2D_Soft::DoCapture(u32 line, u32 width) +{ + u32 dstvram = (CaptureCnt >> 16) & 0x3; + + // TODO: confirm this + // it should work like VRAM display mode, which requires VRAM to be mapped to LCDC + if (!(GPU::VRAMMap_LCDC & (1<> 18) & 0x3) << 14) + (line * width); + + static_assert(GPU::VRAMDirtyGranularity == 512); + GPU::VRAMDirty[dstvram][(dstaddr & 0x1FFFF) / GPU::VRAMDirtyGranularity] = true; + + // TODO: handle 3D in accelerated mode!! 
+ + u32* srcA; + if (CaptureCnt & (1<<24)) + { + srcA = _3DLine; + } + else + { + srcA = BGOBJLine; + if (Accelerated) + { + // in accelerated mode, compositing is normally done on the GPU + // but when doing display capture, we do need the composited output + // so we do it here + + for (int i = 0; i < 256; i++) + { + u32 val1 = BGOBJLine[i]; + u32 val2 = BGOBJLine[256+i]; + u32 val3 = BGOBJLine[512+i]; + + u32 compmode = (val3 >> 24) & 0xF; + + if (compmode == 4) + { + // 3D on top, blending + + u32 _3dval = _3DLine[i]; + if ((_3dval >> 24) > 0) + val1 = ColorBlend5(_3dval, val1); + else + val1 = val2; + } + else if (compmode == 1) + { + // 3D on bottom, blending + + u32 _3dval = _3DLine[i]; + if ((_3dval >> 24) > 0) + { + u32 eva = (val3 >> 8) & 0x1F; + u32 evb = (val3 >> 16) & 0x1F; + + val1 = ColorBlend4(val1, _3dval, eva, evb); + } + else + val1 = val2; + } + else if (compmode <= 3) + { + // 3D on top, normal/fade + + u32 _3dval = _3DLine[i]; + if ((_3dval >> 24) > 0) + { + u32 evy = (val3 >> 8) & 0x1F; + + val1 = _3dval; + if (compmode == 2) val1 = ColorBrightnessUp(val1, evy); + else if (compmode == 3) val1 = ColorBrightnessDown(val1, evy); + } + else + val1 = val2; + } + + BGOBJLine[i] = val1; + } + } + } + + u16* srcB = NULL; + u32 srcBaddr = line * 256; + + if (CaptureCnt & (1<<25)) + { + srcB = &DispFIFOBuffer[0]; + srcBaddr = 0; + } + else + { + u32 srcvram = (DispCnt >> 18) & 0x3; + if (GPU::VRAMMap_LCDC & (1<> 16) & 0x3) != 2) + srcBaddr += ((CaptureCnt >> 26) & 0x3) << 14; + } + + dstaddr &= 0xFFFF; + srcBaddr &= 0xFFFF; + + switch ((CaptureCnt >> 29) & 0x3) + { + case 0: // source A + { + for (u32 i = 0; i < width; i++) + { + u32 val = srcA[i]; + + // TODO: check what happens when alpha=0 + + u32 r = (val >> 1) & 0x1F; + u32 g = (val >> 9) & 0x1F; + u32 b = (val >> 17) & 0x1F; + u32 a = ((val >> 24) != 0) ? 
0x8000 : 0; + + dst[dstaddr] = r | (g << 5) | (b << 10) | a; + dstaddr = (dstaddr + 1) & 0xFFFF; + } + } + break; + + case 1: // source B + { + if (srcB) + { + for (u32 i = 0; i < width; i++) + { + dst[dstaddr] = srcB[srcBaddr]; + srcBaddr = (srcBaddr + 1) & 0xFFFF; + dstaddr = (dstaddr + 1) & 0xFFFF; + } + } + else + { + for (u32 i = 0; i < width; i++) + { + dst[dstaddr] = 0; + dstaddr = (dstaddr + 1) & 0xFFFF; + } + } + } + break; + + case 2: // sources A+B + case 3: + { + u32 eva = CaptureCnt & 0x1F; + u32 evb = (CaptureCnt >> 8) & 0x1F; + + // checkme + if (eva > 16) eva = 16; + if (evb > 16) evb = 16; + + if (srcB) + { + for (u32 i = 0; i < width; i++) + { + u32 val = srcA[i]; + + // TODO: check what happens when alpha=0 + + u32 rA = (val >> 1) & 0x1F; + u32 gA = (val >> 9) & 0x1F; + u32 bA = (val >> 17) & 0x1F; + u32 aA = ((val >> 24) != 0) ? 1 : 0; + + val = srcB[srcBaddr]; + + u32 rB = val & 0x1F; + u32 gB = (val >> 5) & 0x1F; + u32 bB = (val >> 10) & 0x1F; + u32 aB = val >> 15; + + u32 rD = ((rA * aA * eva) + (rB * aB * evb)) >> 4; + u32 gD = ((gA * aA * eva) + (gB * aB * evb)) >> 4; + u32 bD = ((bA * aA * eva) + (bB * aB * evb)) >> 4; + u32 aD = (eva>0 ? aA : 0) | (evb>0 ? aB : 0); + + if (rD > 0x1F) rD = 0x1F; + if (gD > 0x1F) gD = 0x1F; + if (bD > 0x1F) bD = 0x1F; + + dst[dstaddr] = rD | (gD << 5) | (bD << 10) | (aD << 15); + srcBaddr = (srcBaddr + 1) & 0xFFFF; + dstaddr = (dstaddr + 1) & 0xFFFF; + } + } + else + { + for (u32 i = 0; i < width; i++) + { + u32 val = srcA[i]; + + // TODO: check what happens when alpha=0 + + u32 rA = (val >> 1) & 0x1F; + u32 gA = (val >> 9) & 0x1F; + u32 bA = (val >> 17) & 0x1F; + u32 aA = ((val >> 24) != 0) ? 1 : 0; + + u32 rD = (rA * aA * eva) >> 4; + u32 gD = (gA * aA * eva) >> 4; + u32 bD = (bA * aA * eva) >> 4; + u32 aD = (eva>0 ? 
aA : 0); + + dst[dstaddr] = rD | (gD << 5) | (bD << 10) | (aD << 15); + dstaddr = (dstaddr + 1) & 0xFFFF; + } + } + } + break; + } +} + +#define DoDrawBG(type, line, num) \ + do \ + { \ + if ((BGCnt[num] & 0x0040) && (BGMosaicSize[0] > 0)) \ + { \ + if (Accelerated) DrawBG_##type(line, num); \ + else DrawBG_##type(line, num); \ + } \ + else \ + { \ + if (Accelerated) DrawBG_##type(line, num); \ + else DrawBG_##type(line, num); \ + } \ + } while (false) + +#define DoDrawBG_Large(line) \ + do \ + { \ + if ((BGCnt[2] & 0x0040) && (BGMosaicSize[0] > 0)) \ + { \ + if (Accelerated) DrawBG_Large(line); \ + else DrawBG_Large(line); \ + } \ + else \ + { \ + if (Accelerated) DrawBG_Large(line); \ + else DrawBG_Large(line); \ + } \ + } while (false) + +#define DoInterleaveSprites(prio) \ + if (Accelerated) InterleaveSprites(prio); else InterleaveSprites(prio); + +template +void GPU2D_Soft::DrawScanlineBGMode(u32 line) +{ + for (int i = 3; i >= 0; i--) + { + if ((BGCnt[3] & 0x3) == i) + { + if (DispCnt & 0x0800) + { + if (bgmode >= 3) + DoDrawBG(Extended, line, 3); + else if (bgmode >= 1) + DoDrawBG(Affine, line, 3); + else + DoDrawBG(Text, line, 3); + } + } + if ((BGCnt[2] & 0x3) == i) + { + if (DispCnt & 0x0400) + { + if (bgmode == 5) + DoDrawBG(Extended, line, 2); + else if (bgmode == 4 || bgmode == 2) + DoDrawBG(Affine, line, 2); + else + DoDrawBG(Text, line, 2); + } + } + if ((BGCnt[1] & 0x3) == i) + { + if (DispCnt & 0x0200) + { + DoDrawBG(Text, line, 1); + } + } + if ((BGCnt[0] & 0x3) == i) + { + if (DispCnt & 0x0100) + { + if ((!Num) && (DispCnt & 0x8)) + DrawBG_3D(); + else + DoDrawBG(Text, line, 0); + } + } + if ((DispCnt & 0x1000) && NumSprites) + DoInterleaveSprites(0x40000 | (i<<16)); + } +} + +void GPU2D_Soft::DrawScanlineBGMode6(u32 line) +{ + for (int i = 3; i >= 0; i--) + { + if ((BGCnt[2] & 0x3) == i) + { + if (DispCnt & 0x0400) + { + DoDrawBG_Large(line); + } + } + if ((BGCnt[0] & 0x3) == i) + { + if (DispCnt & 0x0100) + { + if ((!Num) && (DispCnt & 0x8)) + 
DrawBG_3D(); + } + } + if ((DispCnt & 0x1000) && NumSprites) + DoInterleaveSprites(0x40000 | (i<<16)) + } +} + +void GPU2D_Soft::DrawScanlineBGMode7(u32 line) +{ + // mode 7 only has text-mode BG0 and BG1 + + for (int i = 3; i >= 0; i--) + { + if ((BGCnt[1] & 0x3) == i) + { + if (DispCnt & 0x0200) + { + DoDrawBG(Text, line, 1); + } + } + if ((BGCnt[0] & 0x3) == i) + { + if (DispCnt & 0x0100) + { + if ((!Num) && (DispCnt & 0x8)) + DrawBG_3D(); + else + DoDrawBG(Text, line, 0); + } + } + if ((DispCnt & 0x1000) && NumSprites) + DoInterleaveSprites(0x40000 | (i<<16)) + } +} + +void GPU2D_Soft::DrawScanline_BGOBJ(u32 line) +{ + // forced blank disables BG/OBJ compositing + if (DispCnt & (1<<7)) + { + for (int i = 0; i < 256; i++) + BGOBJLine[i] = 0xFF3F3F3F; + + return; + } + + u64 backdrop; + if (Num) backdrop = *(u16*)&GPU::Palette[0x400]; + else backdrop = *(u16*)&GPU::Palette[0]; + + { + u8 r = (backdrop & 0x001F) << 1; + u8 g = (backdrop & 0x03E0) >> 4; + u8 b = (backdrop & 0x7C00) >> 9; + + backdrop = r | (g << 8) | (b << 16) | 0x20000000; + backdrop |= (backdrop << 32); + + for (int i = 0; i < 256; i+=2) + *(u64*)&BGOBJLine[i] = backdrop; + } + + if (DispCnt & 0xE000) + CalculateWindowMask(line); + else + memset(WindowMask, 0xFF, 256); + + ApplySpriteMosaicX(); + + switch (DispCnt & 0x7) + { + case 0: DrawScanlineBGMode<0>(line); break; + case 1: DrawScanlineBGMode<1>(line); break; + case 2: DrawScanlineBGMode<2>(line); break; + case 3: DrawScanlineBGMode<3>(line); break; + case 4: DrawScanlineBGMode<4>(line); break; + case 5: DrawScanlineBGMode<5>(line); break; + case 6: DrawScanlineBGMode6(line); break; + case 7: DrawScanlineBGMode7(line); break; + } + + // color special effects + // can likely be optimized + + if (!Accelerated) + { + for (int i = 0; i < 256; i++) + { + u32 val1 = BGOBJLine[i]; + u32 val2 = BGOBJLine[256+i]; + + BGOBJLine[i] = ColorComposite(i, val1, val2); + } + } + else + { + if (Num == 0) + { + for (int i = 0; i < 256; i++) + { + u32 val1 = 
BGOBJLine[i]; + u32 val2 = BGOBJLine[256+i]; + u32 val3 = BGOBJLine[512+i]; + + u32 flag1 = val1 >> 24; + u32 flag2 = val2 >> 24; + + u32 bldcnteffect = (BlendCnt >> 6) & 0x3; + + u32 target1; + if (flag1 & 0x80) target1 = 0x0010; + else if (flag1 & 0x40) target1 = 0x0001; + else target1 = flag1; + + u32 target2; + if (flag2 & 0x80) target2 = 0x1000; + else if (flag2 & 0x40) target2 = 0x0100; + else target2 = flag2 << 8; + + if (((flag1 & 0xC0) == 0x40) && (BlendCnt & target2)) + { + // 3D on top, blending + + BGOBJLine[i] = val2; + BGOBJLine[256+i] = ColorComposite(i, val2, val3); + BGOBJLine[512+i] = 0x04000000; + } + else if ((flag1 & 0xC0) == 0x40) + { + // 3D on top, normal/fade + + if (bldcnteffect == 1) bldcnteffect = 0; + if (!(BlendCnt & 0x0001)) bldcnteffect = 0; + if (!(WindowMask[i] & 0x20)) bldcnteffect = 0; + + BGOBJLine[i] = val2; + BGOBJLine[256+i] = ColorComposite(i, val2, val3); + BGOBJLine[512+i] = (bldcnteffect << 24) | (EVY << 8); + } + else if (((flag2 & 0xC0) == 0x40) && ((BlendCnt & 0x01C0) == 0x0140)) + { + // 3D on bottom, blending + + u32 eva, evb; + if ((flag1 & 0xC0) == 0xC0) + { + eva = flag1 & 0x1F; + evb = 16 - eva; + } + else if (((BlendCnt & target1) && (WindowMask[i] & 0x20)) || + ((flag1 & 0xC0) == 0x80)) + { + eva = EVA; + evb = EVB; + } + else + bldcnteffect = 7; + + BGOBJLine[i] = val1; + BGOBJLine[256+i] = ColorComposite(i, val1, val3); + BGOBJLine[512+i] = (bldcnteffect << 24) | (EVB << 16) | (EVA << 8); + } + else + { + // no potential 3D pixel involved + + BGOBJLine[i] = ColorComposite(i, val1, val2); + BGOBJLine[256+i] = 0; + BGOBJLine[512+i] = 0x07000000; + } + } + } + else + { + for (int i = 0; i < 256; i++) + { + u32 val1 = BGOBJLine[i]; + u32 val2 = BGOBJLine[256+i]; + + BGOBJLine[i] = ColorComposite(i, val1, val2); + BGOBJLine[256+i] = 0; + BGOBJLine[512+i] = 0x07000000; + } + } + } + + if (BGMosaicY >= BGMosaicYMax) + { + BGMosaicY = 0; + BGMosaicYMax = BGMosaicSize[1]; + } + else + BGMosaicY++; + + /*if (OBJMosaicY 
>= OBJMosaicYMax) + { + OBJMosaicY = 0; + OBJMosaicYMax = OBJMosaicSize[1]; + } + else + OBJMosaicY++;*/ +} + + +void GPU2D_Soft::DrawPixel_Normal(u32* dst, u16 color, u32 flag) +{ + u8 r = (color & 0x001F) << 1; + u8 g = (color & 0x03E0) >> 4; + u8 b = (color & 0x7C00) >> 9; + //g |= ((color & 0x8000) >> 15); + + *(dst+256) = *dst; + *dst = r | (g << 8) | (b << 16) | flag; +} + +void GPU2D_Soft::DrawPixel_Accel(u32* dst, u16 color, u32 flag) +{ + u8 r = (color & 0x001F) << 1; + u8 g = (color & 0x03E0) >> 4; + u8 b = (color & 0x7C00) >> 9; + + *(dst+512) = *(dst+256); + *(dst+256) = *dst; + *dst = r | (g << 8) | (b << 16) | flag; +} + +void GPU2D_Soft::DrawBG_3D() +{ + int i = 0; + + if (Accelerated) + { + for (i = 0; i < 256; i++) + { + if (!(WindowMask[i] & 0x01)) continue; + + BGOBJLine[i+512] = BGOBJLine[i+256]; + BGOBJLine[i+256] = BGOBJLine[i]; + BGOBJLine[i] = 0x40000000; // 3D-layer placeholder + } + } + else + { + for (i = 0; i < 256; i++) + { + u32 c = _3DLine[i]; + + if ((c >> 24) == 0) continue; + if (!(WindowMask[i] & 0x01)) continue; + + BGOBJLine[i+256] = BGOBJLine[i]; + BGOBJLine[i] = c | 0x40000000; + } + } +} + +template +void GPU2D_Soft::DrawBG_Text(u32 line, u32 bgnum) +{ + u16 bgcnt = BGCnt[bgnum]; + + u32 tilesetaddr, tilemapaddr; + u16* pal; + u32 extpal, extpalslot; + + u16 xoff = BGXPos[bgnum]; + u16 yoff = BGYPos[bgnum] + line; + + if (bgcnt & 0x0040) + { + // vertical mosaic + yoff -= BGMosaicY; + } + + u32 widexmask = (bgcnt & 0x4000) ? 0x100 : 0; + + extpal = (DispCnt & 0x40000000); + if (extpal) extpalslot = ((bgnum<2) && (bgcnt&0x2000)) ? 
(2+bgnum) : bgnum; + + u8* bgvram; + u32 bgvrammask; + GetBGVRAM(bgvram, bgvrammask); + if (Num) + { + tilesetaddr = ((bgcnt & 0x003C) << 12); + tilemapaddr = ((bgcnt & 0x1F00) << 3); + + pal = (u16*)&GPU::Palette[0x400]; + } + else + { + tilesetaddr = ((DispCnt & 0x07000000) >> 8) + ((bgcnt & 0x003C) << 12); + tilemapaddr = ((DispCnt & 0x38000000) >> 11) + ((bgcnt & 0x1F00) << 3); + + pal = (u16*)&GPU::Palette[0]; + } + + // adjust Y position in tilemap + if (bgcnt & 0x8000) + { + tilemapaddr += ((yoff & 0x1F8) << 3); + if (bgcnt & 0x4000) + tilemapaddr += ((yoff & 0x100) << 3); + } + else + tilemapaddr += ((yoff & 0xF8) << 3); + + u16 curtile; + u16* curpal; + u32 pixelsaddr; + u8 color; + u32 lastxpos; + + if (bgcnt & 0x0080) + { + // 256-color + + // preload shit as needed + if ((xoff & 0x7) || mosaic) + { + curtile = *(u16*)&bgvram[(tilemapaddr + ((xoff & 0xF8) >> 2) + ((xoff & widexmask) << 3)) & bgvrammask]; + + if (extpal) curpal = GetBGExtPal(extpalslot, curtile>>12); + else curpal = pal; + + pixelsaddr = tilesetaddr + ((curtile & 0x03FF) << 6) + + (((curtile & 0x0800) ? (7-(yoff&0x7)) : (yoff&0x7)) << 3); + } + + if (mosaic) lastxpos = xoff; + + for (int i = 0; i < 256; i++) + { + u32 xpos; + if (mosaic) xpos = xoff - CurBGXMosaicTable[i]; + else xpos = xoff; + + if ((!mosaic && (!(xpos & 0x7))) || + (mosaic && ((xpos >> 3) != (lastxpos >> 3)))) + { + // load a new tile + curtile = *(u16*)&bgvram[(tilemapaddr + ((xpos & 0xF8) >> 2) + ((xpos & widexmask) << 3)) & bgvrammask]; + + if (extpal) curpal = GetBGExtPal(extpalslot, curtile>>12); + else curpal = pal; + + pixelsaddr = tilesetaddr + ((curtile & 0x03FF) << 6) + + (((curtile & 0x0800) ? (7-(yoff&0x7)) : (yoff&0x7)) << 3); + + if (mosaic) lastxpos = xpos; + } + + // draw pixel + if (WindowMask[i] & (1<> 2) + ((xoff & widexmask) << 3))) & bgvrammask]; + curpal = pal + ((curtile & 0xF000) >> 8); + pixelsaddr = tilesetaddr + ((curtile & 0x03FF) << 5) + + (((curtile & 0x0800) ? 
(7-(yoff&0x7)) : (yoff&0x7)) << 2); + } + + if (mosaic) lastxpos = xoff; + + for (int i = 0; i < 256; i++) + { + u32 xpos; + if (mosaic) xpos = xoff - CurBGXMosaicTable[i]; + else xpos = xoff; + + if ((!mosaic && (!(xpos & 0x7))) || + (mosaic && ((xpos >> 3) != (lastxpos >> 3)))) + { + // load a new tile + curtile = *(u16*)&bgvram[(tilemapaddr + ((xpos & 0xF8) >> 2) + ((xpos & widexmask) << 3)) & bgvrammask]; + curpal = pal + ((curtile & 0xF000) >> 8); + pixelsaddr = tilesetaddr + ((curtile & 0x03FF) << 5) + + (((curtile & 0x0800) ? (7-(yoff&0x7)) : (yoff&0x7)) << 2); + + if (mosaic) lastxpos = xpos; + } + + // draw pixel + if (WindowMask[i] & (1<> 1)) & bgvrammask] >> 4; + } + else + { + color = bgvram[(pixelsaddr + (tilexoff >> 1)) & bgvrammask] & 0x0F; + } + + if (color) + drawPixel(&BGOBJLine[i], curpal[color], 0x01000000< +void GPU2D_Soft::DrawBG_Affine(u32 line, u32 bgnum) +{ + u16 bgcnt = BGCnt[bgnum]; + + u32 tilesetaddr, tilemapaddr; + u16* pal; + + u32 coordmask; + u32 yshift; + switch (bgcnt & 0xC000) + { + case 0x0000: coordmask = 0x07800; yshift = 7; break; + case 0x4000: coordmask = 0x0F800; yshift = 8; break; + case 0x8000: coordmask = 0x1F800; yshift = 9; break; + case 0xC000: coordmask = 0x3F800; yshift = 10; break; + } + + u32 overflowmask; + if (bgcnt & 0x2000) overflowmask = 0; + else overflowmask = ~(coordmask | 0x7FF); + + s16 rotA = BGRotA[bgnum-2]; + s16 rotB = BGRotB[bgnum-2]; + s16 rotC = BGRotC[bgnum-2]; + s16 rotD = BGRotD[bgnum-2]; + + s32 rotX = BGXRefInternal[bgnum-2]; + s32 rotY = BGYRefInternal[bgnum-2]; + + if (bgcnt & 0x0040) + { + // vertical mosaic + rotX -= (BGMosaicY * rotB); + rotY -= (BGMosaicY * rotD); + } + + u8* bgvram; + u32 bgvrammask; + GetBGVRAM(bgvram, bgvrammask); + + if (Num) + { + tilesetaddr = ((bgcnt & 0x003C) << 12); + tilemapaddr = ((bgcnt & 0x1F00) << 3); + + pal = (u16*)&GPU::Palette[0x400]; + } + else + { + tilesetaddr = ((DispCnt & 0x07000000) >> 8) + ((bgcnt & 0x003C) << 12); + tilemapaddr = ((DispCnt & 
0x38000000) >> 11) + ((bgcnt & 0x1F00) << 3); + + pal = (u16*)&GPU::Palette[0]; + } + + u16 curtile; + u8 color; + + yshift -= 3; + + for (int i = 0; i < 256; i++) + { + if (WindowMask[i] & (1<> 11) << yshift) + ((finalX & coordmask) >> 11))) & bgvrammask]; + + // draw pixel + u32 tilexoff = (finalX >> 8) & 0x7; + u32 tileyoff = (finalY >> 8) & 0x7; + + color = bgvram[(tilesetaddr + (curtile << 6) + (tileyoff << 3) + tilexoff) & bgvrammask]; + + if (color) + drawPixel(&BGOBJLine[i], pal[color], 0x01000000< +void GPU2D_Soft::DrawBG_Extended(u32 line, u32 bgnum) +{ + u16 bgcnt = BGCnt[bgnum]; + + u32 tilesetaddr, tilemapaddr; + u16* pal; + u32 extpal; + + u8* bgvram; + u32 bgvrammask; + GetBGVRAM(bgvram, bgvrammask); + + extpal = (DispCnt & 0x40000000); + + s16 rotA = BGRotA[bgnum-2]; + s16 rotB = BGRotB[bgnum-2]; + s16 rotC = BGRotC[bgnum-2]; + s16 rotD = BGRotD[bgnum-2]; + + s32 rotX = BGXRefInternal[bgnum-2]; + s32 rotY = BGYRefInternal[bgnum-2]; + + if (bgcnt & 0x0040) + { + // vertical mosaic + rotX -= (BGMosaicY * rotB); + rotY -= (BGMosaicY * rotD); + } + + if (bgcnt & 0x0080) + { + // bitmap modes + + u32 xmask, ymask; + u32 yshift; + switch (bgcnt & 0xC000) + { + case 0x0000: xmask = 0x07FFF; ymask = 0x07FFF; yshift = 7; break; + case 0x4000: xmask = 0x0FFFF; ymask = 0x0FFFF; yshift = 8; break; + case 0x8000: xmask = 0x1FFFF; ymask = 0x0FFFF; yshift = 9; break; + case 0xC000: xmask = 0x1FFFF; ymask = 0x1FFFF; yshift = 9; break; + } + + u32 ofxmask, ofymask; + if (bgcnt & 0x2000) + { + ofxmask = 0; + ofymask = 0; + } + else + { + ofxmask = ~xmask; + ofymask = ~ymask; + } + + if (Num) tilemapaddr = ((bgcnt & 0x1F00) << 6); + else tilemapaddr = ((bgcnt & 0x1F00) << 6); + + if (bgcnt & 0x0004) + { + // direct color bitmap + + u16 color; + + for (int i = 0; i < 256; i++) + { + if (WindowMask[i] & (1<> 8) << yshift) + ((finalX & xmask) >> 8)) << 1)) & bgvrammask]; + + if (color & 0x8000) + drawPixel(&BGOBJLine[i], color, 0x01000000<> 8) << yshift) + ((finalX & 
xmask) >> 8)) & bgvrammask]; + + if (color) + drawPixel(&BGOBJLine[i], pal[color], 0x01000000<> 8) + ((bgcnt & 0x003C) << 12); + tilemapaddr = ((DispCnt & 0x38000000) >> 11) + ((bgcnt & 0x1F00) << 3); + + pal = (u16*)&GPU::Palette[0]; + } + + u16 curtile; + u16* curpal; + u8 color; + + yshift -= 3; + + for (int i = 0; i < 256; i++) + { + if (WindowMask[i] & (1<> 11) << yshift) + ((finalX & coordmask) >> 11)) << 1)) & bgvrammask]; + + if (extpal) curpal = GetBGExtPal(bgnum, curtile>>12); + else curpal = pal; + + // draw pixel + u32 tilexoff = (finalX >> 8) & 0x7; + u32 tileyoff = (finalY >> 8) & 0x7; + + if (curtile & 0x0400) tilexoff = 7-tilexoff; + if (curtile & 0x0800) tileyoff = 7-tileyoff; + + color = bgvram[(tilesetaddr + ((curtile & 0x03FF) << 6) + (tileyoff << 3) + tilexoff) & bgvrammask]; + + if (color) + drawPixel(&BGOBJLine[i], curpal[color], 0x01000000< +void GPU2D_Soft::DrawBG_Large(u32 line) // BG is always BG2 +{ + u16 bgcnt = BGCnt[2]; + + u32 tilesetaddr, tilemapaddr; + u16* pal; + + // large BG sizes: + // 0: 512x1024 + // 1: 1024x512 + // 2: 512x256 + // 3: 512x512 + u32 xmask, ymask; + u32 yshift; + switch (bgcnt & 0xC000) + { + case 0x0000: xmask = 0x1FFFF; ymask = 0x3FFFF; yshift = 9; break; + case 0x4000: xmask = 0x3FFFF; ymask = 0x1FFFF; yshift = 10; break; + case 0x8000: xmask = 0x1FFFF; ymask = 0x0FFFF; yshift = 9; break; + case 0xC000: xmask = 0x1FFFF; ymask = 0x1FFFF; yshift = 9; break; + } + + u32 ofxmask, ofymask; + if (bgcnt & 0x2000) + { + ofxmask = 0; + ofymask = 0; + } + else + { + ofxmask = ~xmask; + ofymask = ~ymask; + } + + s16 rotA = BGRotA[0]; + s16 rotB = BGRotB[0]; + s16 rotC = BGRotC[0]; + s16 rotD = BGRotD[0]; + + s32 rotX = BGXRefInternal[0]; + s32 rotY = BGYRefInternal[0]; + + if (bgcnt & 0x0040) + { + // vertical mosaic + rotX -= (BGMosaicY * rotB); + rotY -= (BGMosaicY * rotD); + } + + u8* bgvram; + u32 bgvrammask; + GetBGVRAM(bgvram, bgvrammask); + + // 256-color bitmap + + if (Num) pal = (u16*)&GPU::Palette[0x400]; + 
else pal = (u16*)&GPU::Palette[0]; + + u8 color; + + for (int i = 0; i < 256; i++) + { + if (WindowMask[i] & (1<<2)) + { + s32 finalX, finalY; + if (mosaic) + { + int im = CurBGXMosaicTable[i]; + finalX = rotX - (im * rotA); + finalY = rotY - (im * rotC); + } + else + { + finalX = rotX; + finalY = rotY; + } + + if (!(finalX & ofxmask) && !(finalY & ofymask)) + { + color = bgvram[(tilemapaddr + (((finalY & ymask) >> 8) << yshift) + ((finalX & xmask) >> 8)) & bgvrammask]; + + if (color) + drawPixel(&BGOBJLine[i], pal[color], 0x01000000<<2); + } + } + + rotX += rotA; + rotY += rotC; + } + + BGXRefInternal[0] += rotB; + BGYRefInternal[0] += rotD; +} + +// OBJ line buffer: +// * bit0-15: color (bit15=1: direct color, bit15=0: palette index, bit12=0 to indicate extpal) +// * bit16-17: BG-relative priority +// * bit18: non-transparent sprite pixel exists here +// * bit19: X mosaic should be applied here +// * bit24-31: compositor flags + +void GPU2D_Soft::ApplySpriteMosaicX() +{ + // apply X mosaic if needed + // X mosaic for sprites is applied after all sprites are rendered + + if (OBJMosaicSize[0] == 0) return; + + u32 lastcolor = OBJLine[0]; + + for (u32 i = 1; i < 256; i++) + { + if (!(OBJLine[i] & 0x100000)) + { + // not a mosaic'd sprite pixel + continue; + } + + if ((OBJIndex[i] != OBJIndex[i-1]) || (CurOBJXMosaicTable[i] == 0)) + lastcolor = OBJLine[i]; + else + OBJLine[i] = lastcolor; + } +} + +template +void GPU2D_Soft::InterleaveSprites(u32 prio) +{ + u16* pal = (u16*)&GPU::Palette[Num ? 
0x600 : 0x200]; + + if (DispCnt & 0x80000000) + { + u16* extpal = GetOBJExtPal(); + + for (u32 i = 0; i < 256; i++) + { + if ((OBJLine[i] & 0x70000) != prio) continue; + if (!(WindowMask[i] & 0x10)) continue; + + u16 color; + u32 pixel = OBJLine[i]; + + if (pixel & 0x8000) + color = pixel & 0x7FFF; + else if (pixel & 0x1000) + color = pal[pixel & 0xFF]; + else + color = extpal[pixel & 0xFFF]; + + drawPixel(&BGOBJLine[i], color, pixel & 0xFF000000); + } + } + else + { + // optimized no-extpal version + + for (u32 i = 0; i < 256; i++) + { + if ((OBJLine[i] & 0x70000) != prio) continue; + if (!(WindowMask[i] & 0x10)) continue; + + u16 color; + u32 pixel = OBJLine[i]; + + if (pixel & 0x8000) + color = pixel & 0x7FFF; + else + color = pal[pixel & 0xFF]; + + drawPixel(&BGOBJLine[i], color, pixel & 0xFF000000); + } + } +} + +#define DoDrawSprite(type, ...) \ + if (iswin) \ + { \ + DrawSprite_##type(__VA_ARGS__); \ + } \ + else \ + { \ + DrawSprite_##type(__VA_ARGS__); \ + } + +void GPU2D_Soft::DrawSprites(u32 line) +{ + if (line == 0) + { + // reset those counters here + // TODO: find out when those are supposed to be reset + // it would make sense to reset them at the end of VBlank + // however, sprites are rendered one scanline in advance + // so they need to be reset a bit earlier + + OBJMosaicY = 0; + OBJMosaicYCount = 0; + } + + if (Num == 0) + { + auto objDirty = GPU::VRAMDirty_AOBJ.DeriveState(GPU::VRAMMap_AOBJ); + GPU::MakeVRAMFlat_AOBJCoherent(objDirty); + } + else + { + auto objDirty = GPU::VRAMDirty_BOBJ.DeriveState(GPU::VRAMMap_BOBJ); + GPU::MakeVRAMFlat_BOBJCoherent(objDirty); + } + + NumSprites = 0; + memset(OBJLine, 0, 256*4); + memset(OBJWindow, 0, 256); + if (!(DispCnt & 0x1000)) return; + + memset(OBJIndex, 0xFF, 256); + + u16* oam = (u16*)&GPU::OAM[Num ? 
0x400 : 0]; + + const s32 spritewidth[16] = + { + 8, 16, 8, 8, + 16, 32, 8, 8, + 32, 32, 16, 8, + 64, 64, 32, 8 + }; + const s32 spriteheight[16] = + { + 8, 8, 16, 8, + 16, 8, 32, 8, + 32, 16, 32, 8, + 64, 32, 64, 8 + }; + + for (int bgnum = 0x0C00; bgnum >= 0x0000; bgnum -= 0x0400) + { + for (int sprnum = 127; sprnum >= 0; sprnum--) + { + u16* attrib = &oam[sprnum*4]; + + if ((attrib[2] & 0x0C00) != bgnum) + continue; + + bool iswin = (((attrib[0] >> 10) & 0x3) == 2); + + u32 sprline; + if ((attrib[0] & 0x1000) && !iswin) + { + // apply Y mosaic + sprline = OBJMosaicY; + } + else + sprline = line; + + if (attrib[0] & 0x0100) + { + u32 sizeparam = (attrib[0] >> 14) | ((attrib[1] & 0xC000) >> 12); + s32 width = spritewidth[sizeparam]; + s32 height = spriteheight[sizeparam]; + s32 boundwidth = width; + s32 boundheight = height; + + if (attrib[0] & 0x0200) + { + boundwidth <<= 1; + boundheight <<= 1; + } + + u32 ypos = attrib[0] & 0xFF; + ypos = (sprline - ypos) & 0xFF; + if (ypos >= (u32)boundheight) + continue; + + s32 xpos = (s32)(attrib[1] << 23) >> 23; + if (xpos <= -boundwidth) + continue; + + u32 rotparamgroup = (attrib[1] >> 9) & 0x1F; + + DoDrawSprite(Rotscale, sprnum, boundwidth, boundheight, width, height, xpos, ypos); + + NumSprites++; + } + else + { + if (attrib[0] & 0x0200) + continue; + + u32 sizeparam = (attrib[0] >> 14) | ((attrib[1] & 0xC000) >> 12); + s32 width = spritewidth[sizeparam]; + s32 height = spriteheight[sizeparam]; + + u32 ypos = attrib[0] & 0xFF; + ypos = (sprline - ypos) & 0xFF; + if (ypos >= (u32)height) + continue; + + s32 xpos = (s32)(attrib[1] << 23) >> 23; + if (xpos <= -width) + continue; + + DoDrawSprite(Normal, sprnum, width, height, xpos, ypos); + + NumSprites++; + } + } + } +} + +template +void GPU2D_Soft::DrawSprite_Rotscale(u32 num, u32 boundwidth, u32 boundheight, u32 width, u32 height, s32 xpos, s32 ypos) +{ + u16* oam = (u16*)&GPU::OAM[Num ? 
0x400 : 0]; + u16* attrib = &oam[num * 4]; + u16* rotparams = &oam[(((attrib[1] >> 9) & 0x1F) * 16) + 3]; + + u32 pixelattr = ((attrib[2] & 0x0C00) << 6) | 0xC0000; + u32 tilenum = attrib[2] & 0x03FF; + u32 spritemode = window ? 0 : ((attrib[0] >> 10) & 0x3); + + u32 ytilefactor; + + u8* objvram; + u32 objvrammask; + GetOBJVRAM(objvram, objvrammask); + + s32 centerX = boundwidth >> 1; + s32 centerY = boundheight >> 1; + + if ((attrib[0] & 0x1000) && !window) + { + // apply Y mosaic + pixelattr |= 0x100000; + } + + u32 xoff; + if (xpos >= 0) + { + xoff = 0; + if ((xpos+boundwidth) > 256) + boundwidth = 256-xpos; + } + else + { + xoff = -xpos; + xpos = 0; + } + + s16 rotA = (s16)rotparams[0]; + s16 rotB = (s16)rotparams[4]; + s16 rotC = (s16)rotparams[8]; + s16 rotD = (s16)rotparams[12]; + + s32 rotX = ((xoff-centerX) * rotA) + ((ypos-centerY) * rotB) + (width << 7); + s32 rotY = ((xoff-centerX) * rotC) + ((ypos-centerY) * rotD) + (height << 7); + + width <<= 8; + height <<= 8; + + u16 color = 0; // transparent in all cases + + if (spritemode == 3) + { + u32 alpha = attrib[2] >> 12; + if (!alpha) return; + alpha++; + + pixelattr |= (0xC0000000 | (alpha << 24)); + + u32 pixelsaddr; + if (DispCnt & 0x40) + { + if (DispCnt & 0x20) + { + // 'reserved' + // draws nothing + + return; + } + else + { + pixelsaddr = tilenum << (7 + ((DispCnt >> 22) & 0x1)); + ytilefactor = ((width >> 8) * 2); + } + } + else + { + if (DispCnt & 0x20) + { + pixelsaddr = ((tilenum & 0x01F) << 4) + ((tilenum & 0x3E0) << 7); + ytilefactor = (256 * 2); + } + else + { + pixelsaddr = ((tilenum & 0x00F) << 4) + ((tilenum & 0x3F0) << 7); + ytilefactor = (128 * 2); + } + } + + for (; xoff < boundwidth;) + { + if ((u32)rotX < width && (u32)rotY < height) + { + color = *(u16*)&objvram[(pixelsaddr + ((rotY >> 8) * ytilefactor) + ((rotX >> 8) << 1)) & objvrammask]; + + if (color & 0x8000) + { + if (window) OBJWindow[xpos] = 1; + else { OBJLine[xpos] = color | pixelattr; OBJIndex[xpos] = num; } + } + else if 
(!window) + { + if (OBJLine[xpos] == 0) + { + OBJLine[xpos] = pixelattr & 0x180000; + OBJIndex[xpos] = num; + } + } + } + + rotX += rotA; + rotY += rotC; + xoff++; + xpos++; + } + } + else + { + u32 pixelsaddr = tilenum; + if (DispCnt & 0x10) + { + pixelsaddr <<= ((DispCnt >> 20) & 0x3); + ytilefactor = (width >> 11) << ((attrib[0] & 0x2000) ? 1:0); + } + else + { + ytilefactor = 0x20; + } + + if (spritemode == 1) pixelattr |= 0x80000000; + else pixelattr |= 0x10000000; + + ytilefactor <<= 5; + pixelsaddr <<= 5; + + if (attrib[0] & 0x2000) + { + // 256-color + + if (!window) + { + if (!(DispCnt & 0x80000000)) + pixelattr |= 0x1000; + else + pixelattr |= ((attrib[2] & 0xF000) >> 4); + } + + for (; xoff < boundwidth;) + { + if ((u32)rotX < width && (u32)rotY < height) + { + color = objvram[(pixelsaddr + ((rotY>>11)*ytilefactor) + ((rotY&0x700)>>5) + ((rotX>>11)*64) + ((rotX&0x700)>>8)) & objvrammask]; + + if (color) + { + if (window) OBJWindow[xpos] = 1; + else { OBJLine[xpos] = color | pixelattr; OBJIndex[xpos] = num; } + } + else if (!window) + { + if (OBJLine[xpos] == 0) + { + OBJLine[xpos] = pixelattr & 0x180000; + OBJIndex[xpos] = num; + } + } + } + + rotX += rotA; + rotY += rotC; + xoff++; + xpos++; + } + } + else + { + // 16-color + if (!window) + { + pixelattr |= 0x1000; + pixelattr |= ((attrib[2] & 0xF000) >> 8); + } + + for (; xoff < boundwidth;) + { + if ((u32)rotX < width && (u32)rotY < height) + { + color = objvram[(pixelsaddr + ((rotY>>11)*ytilefactor) + ((rotY&0x700)>>6) + ((rotX>>11)*32) + ((rotX&0x700)>>9)) & objvrammask]; + if (rotX & 0x100) + color >>= 4; + else + color &= 0x0F; + + if (color) + { + if (window) OBJWindow[xpos] = 1; + else { OBJLine[xpos] = color | pixelattr; OBJIndex[xpos] = num; } + } + else if (!window) + { + if (OBJLine[xpos] == 0) + { + OBJLine[xpos] = pixelattr & 0x180000; + OBJIndex[xpos] = num; + } + } + } + + rotX += rotA; + rotY += rotC; + xoff++; + xpos++; + } + } + } +} + +template +void GPU2D_Soft::DrawSprite_Normal(u32 
num, u32 width, u32 height, s32 xpos, s32 ypos) +{ + u16* oam = (u16*)&GPU::OAM[Num ? 0x400 : 0]; + u16* attrib = &oam[num * 4]; + + u32 pixelattr = ((attrib[2] & 0x0C00) << 6) | 0xC0000; + u32 tilenum = attrib[2] & 0x03FF; + u32 spritemode = window ? 0 : ((attrib[0] >> 10) & 0x3); + + u32 wmask = width - 8; // really ((width - 1) & ~0x7) + + if ((attrib[0] & 0x1000) && !window) + { + // apply Y mosaic + pixelattr |= 0x100000; + } + + u8* objvram; + u32 objvrammask; + GetOBJVRAM(objvram, objvrammask); + + // yflip + if (attrib[1] & 0x2000) + ypos = height-1 - ypos; + + u32 xoff; + u32 xend = width; + if (xpos >= 0) + { + xoff = 0; + if ((xpos+xend) > 256) + xend = 256-xpos; + } + else + { + xoff = -xpos; + xpos = 0; + } + + u16 color = 0; // transparent in all cases + + if (spritemode == 3) + { + // bitmap sprite + + u32 alpha = attrib[2] >> 12; + if (!alpha) return; + alpha++; + + pixelattr |= (0xC0000000 | (alpha << 24)); + + u32 pixelsaddr = tilenum; + if (DispCnt & 0x40) + { + if (DispCnt & 0x20) + { + // 'reserved' + // draws nothing + + return; + } + else + { + pixelsaddr <<= (7 + ((DispCnt >> 22) & 0x1)); + pixelsaddr += (ypos * width * 2); + } + } + else + { + if (DispCnt & 0x20) + { + pixelsaddr = ((tilenum & 0x01F) << 4) + ((tilenum & 0x3E0) << 7); + pixelsaddr += (ypos * 256 * 2); + } + else + { + pixelsaddr = ((tilenum & 0x00F) << 4) + ((tilenum & 0x3F0) << 7); + pixelsaddr += (ypos * 128 * 2); + } + } + + s32 pixelstride; + + if (attrib[1] & 0x1000) // xflip + { + pixelsaddr += (width-1 << 1); + pixelsaddr -= (xoff << 1); + pixelstride = -2; + } + else + { + pixelsaddr += (xoff << 1); + pixelstride = 2; + } + + for (; xoff < xend;) + { + color = *(u16*)&objvram[pixelsaddr & objvrammask]; + + pixelsaddr += pixelstride; + + if (color & 0x8000) + { + if (window) OBJWindow[xpos] = 1; + else { OBJLine[xpos] = color | pixelattr; OBJIndex[xpos] = num; } + } + else if (!window) + { + if (OBJLine[xpos] == 0) + { + OBJLine[xpos] = pixelattr & 0x180000; + 
OBJIndex[xpos] = num; + } + } + + xoff++; + xpos++; + } + } + else + { + u32 pixelsaddr = tilenum; + if (DispCnt & 0x10) + { + pixelsaddr <<= ((DispCnt >> 20) & 0x3); + pixelsaddr += ((ypos >> 3) * (width >> 3)) << ((attrib[0] & 0x2000) ? 1:0); + } + else + { + pixelsaddr += ((ypos >> 3) * 0x20); + } + + if (spritemode == 1) pixelattr |= 0x80000000; + else pixelattr |= 0x10000000; + + if (attrib[0] & 0x2000) + { + // 256-color + pixelsaddr <<= 5; + pixelsaddr += ((ypos & 0x7) << 3); + s32 pixelstride; + + if (!window) + { + if (!(DispCnt & 0x80000000)) + pixelattr |= 0x1000; + else + pixelattr |= ((attrib[2] & 0xF000) >> 4); + } + + if (attrib[1] & 0x1000) // xflip + { + pixelsaddr += (((width-1) & wmask) << 3); + pixelsaddr += ((width-1) & 0x7); + pixelsaddr -= ((xoff & wmask) << 3); + pixelsaddr -= (xoff & 0x7); + pixelstride = -1; + } + else + { + pixelsaddr += ((xoff & wmask) << 3); + pixelsaddr += (xoff & 0x7); + pixelstride = 1; + } + + for (; xoff < xend;) + { + color = objvram[pixelsaddr & objvrammask]; // mask the address like every other OBJ VRAM read here + + pixelsaddr += pixelstride; + + if (color) + { + if (window) OBJWindow[xpos] = 1; + else { OBJLine[xpos] = color | pixelattr; OBJIndex[xpos] = num; } + } + else if (!window) + { + if (OBJLine[xpos] == 0) + { + OBJLine[xpos] = pixelattr & 0x180000; + OBJIndex[xpos] = num; + } + } + + xoff++; + xpos++; + if (!(xoff & 0x7)) pixelsaddr += (56 * pixelstride); + } + } + else + { + // 16-color + pixelsaddr <<= 5; + pixelsaddr += ((ypos & 0x7) << 2); + s32 pixelstride; + + if (!window) + { + pixelattr |= 0x1000; + pixelattr |= ((attrib[2] & 0xF000) >> 8); + } + + // TODO: optimize VRAM access!! + // TODO: do xflip better? 
the 'two pixels per byte' thing makes it a bit shitty + + if (attrib[1] & 0x1000) // xflip + { + pixelsaddr += (((width-1) & wmask) << 2); + pixelsaddr += (((width-1) & 0x7) >> 1); + pixelsaddr -= ((xoff & wmask) << 2); + pixelsaddr -= ((xoff & 0x7) >> 1); + pixelstride = -1; + } + else + { + pixelsaddr += ((xoff & wmask) << 2); + pixelsaddr += ((xoff & 0x7) >> 1); + pixelstride = 1; + } + + for (; xoff < xend;) + { + if (attrib[1] & 0x1000) + { + if (xoff & 0x1) { color = objvram[pixelsaddr & objvrammask] & 0x0F; pixelsaddr--; } + else color = objvram[pixelsaddr & objvrammask] >> 4; + } + else + { + if (xoff & 0x1) { color = objvram[pixelsaddr & objvrammask] >> 4; pixelsaddr++; } + else color = objvram[pixelsaddr & objvrammask] & 0x0F; + } + + if (color) + { + if (window) OBJWindow[xpos] = 1; + else { OBJLine[xpos] = color | pixelattr; OBJIndex[xpos] = num; } + } + else if (!window) + { + if (OBJLine[xpos] == 0) + { + OBJLine[xpos] = pixelattr & 0x180000; + OBJIndex[xpos] = num; + } + } + + xoff++; + xpos++; + if (!(xoff & 0x7)) pixelsaddr += ((attrib[1] & 0x1000) ? 
-28 : 28); + } + } + } +} + +void GPU2D_Soft::MosaicXSizeChanged() +{ + CurBGXMosaicTable = MosaicTable[BGMosaicSize[0]]; + CurOBJXMosaicTable = MosaicTable[OBJMosaicSize[1]]; +} diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index 74debfe0..7b304268 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -179,6 +179,10 @@ u8 RenderFogDensityTable[34]; u32 RenderClearAttr1, RenderClearAttr2; +bool RenderFrameIdentical; + +u16 RenderXPos; + u32 ZeroDotWLimit; u32 GXStat; @@ -383,6 +387,8 @@ void Reset() FlushAttributes = 0; ResetRenderingState(); + + RenderXPos = 0; } void DoSavestate(Savestate* file) @@ -428,6 +434,8 @@ void DoSavestate(Savestate* file) file->Var32(&RenderClearAttr1); file->Var32(&RenderClearAttr2); + file->Var16(&RenderXPos); + file->Var32(&ZeroDotWLimit); file->Var32(&GXStat); @@ -585,8 +593,6 @@ void DoSavestate(Savestate* file) } } - // probably not worth storing the vblank-latched Renderxxxxxx variables - CmdStallQueue->DoSavestate(file); file->Var32((u32*)&VertexPipeline); file->Var32((u32*)&NormalPipeline); @@ -606,6 +612,22 @@ void DoSavestate(Savestate* file) // might cause a blank frame but atleast it won't shit itself RenderNumPolygons = 0; } + + file->VarArray(CurVertex, sizeof(s16)*3); + file->VarArray(VertexColor, sizeof(u8)*3); + file->VarArray(TexCoords, sizeof(s16)*2); + file->VarArray(RawTexCoords, sizeof(s16)*2); + file->VarArray(Normal, sizeof(s16)*3); + + file->VarArray(LightDirection, sizeof(s16)*4*3); + file->VarArray(LightColor, sizeof(u8)*4*3); + file->VarArray(MatDiffuse, sizeof(u8)*3); + file->VarArray(MatAmbient, sizeof(u8)*3); + file->VarArray(MatSpecular, sizeof(u8)*3); + file->VarArray(MatEmission, sizeof(u8)*3); + + file->Bool32(&UseShininessTable); + file->VarArray(ShininessTable, 128*sizeof(u8)); } @@ -2491,6 +2513,19 @@ void VBlank() } RenderNumPolygons = NumPolygons; + RenderFrameIdentical = false; + } + else + { + RenderFrameIdentical = RenderDispCnt == DispCnt + && RenderAlphaRef == AlphaRef + && RenderClearAttr1 == 
ClearAttr1 + && RenderClearAttr2 == ClearAttr2 + && RenderFogColor == FogColor + && RenderFogOffset == FogOffset * 0x200 + && memcmp(RenderEdgeTable, EdgeTable, 8*2) == 0 + && memcmp(RenderFogDensityTable + 1, FogDensityTable, 32) == 0 + && memcmp(RenderToonTable, ToonTable, 32*2) == 0; } RenderDispCnt = DispCnt; @@ -2533,14 +2568,46 @@ void VCount215() #endif } +void SetRenderXPos(u16 xpos) +{ + if (!RenderingEnabled) return; + + RenderXPos = xpos & 0x01FF; +} + +u32 ScrolledLine[256]; + u32* GetLine(int line) { - if (GPU::Renderer == 0) return SoftRenderer::GetLine(line); + u32* rawline = NULL; + + if (GPU::Renderer == 0) rawline = SoftRenderer::GetLine(line); #ifdef OGLRENDERER_ENABLED - else return GLRenderer::GetLine(line); -#else - return NULL; + else rawline = GLRenderer::GetLine(line); #endif + + if (RenderXPos == 0) return rawline; + + // apply X scroll + + if (RenderXPos & 0x100) + { + int i = 0, j = RenderXPos; + for (; j < 512; i++, j++) + ScrolledLine[i] = 0; + for (j = 0; i < 256; i++, j++) + ScrolledLine[i] = rawline[j]; + } + else + { + int i = 0, j = RenderXPos; + for (; j < 256; i++, j++) + ScrolledLine[i] = rawline[j]; + for (; i < 256; i++) + ScrolledLine[i] = 0; + } + + return ScrolledLine; } diff --git a/src/GPU3D.h b/src/GPU3D.h index c69adde2..69b67fa7 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -87,6 +87,10 @@ extern u8 RenderFogDensityTable[34]; extern u32 RenderClearAttr1, RenderClearAttr2; +extern bool RenderFrameIdentical; + +extern u16 RenderXPos; + extern std::array RenderPolygonRAM; extern u32 RenderNumPolygons; @@ -112,6 +116,8 @@ void CheckFIFODMA(); void VCount144(); void VBlank(); void VCount215(); + +void SetRenderXPos(u16 xpos); u32* GetLine(int line); void WriteToGXFIFO(u32 val); diff --git a/src/GPU3D_OpenGL.cpp b/src/GPU3D_OpenGL.cpp index 658b2613..88ae77a9 100644 --- a/src/GPU3D_OpenGL.cpp +++ b/src/GPU3D_OpenGL.cpp @@ -53,17 +53,18 @@ GLuint CurShaderID = -1; GLuint FinalPassEdgeShader[3]; GLuint FinalPassFogShader[3]; 
+// std140 compliant structure struct { - float uScreenSize[2]; - u32 uDispCnt; + float uScreenSize[2]; // vec2 0 / 2 + u32 uDispCnt; // int 2 / 1 u32 __pad0; - float uToonColors[32][4]; - float uEdgeColors[8][4]; - float uFogColor[4]; - float uFogDensity[34][4]; - u32 uFogOffset; - u32 uFogShift; + float uToonColors[32][4]; // vec4[32] 4 / 128 + float uEdgeColors[8][4]; // vec4[8] 132 / 32 + float uFogColor[4]; // vec4 164 / 4 + float uFogDensity[34][4]; // float[34] 168 / 136 + u32 uFogOffset; // int 304 / 1 + u32 uFogShift; // int 305 / 1 } ShaderConfig; @@ -74,11 +75,11 @@ typedef struct Polygon* PolyData; u32 NumIndices; - u16* Indices; + u32 IndicesOffset; GLuint PrimType; u32 NumEdgeIndices; - u16* EdgeIndices; + u32 EdgeIndicesOffset; u32 RenderKey; @@ -107,7 +108,11 @@ u32 VertexBuffer[10240 * 7]; u32 NumVertices; GLuint VertexArrayID; +GLuint IndexBufferID; u16 IndexBuffer[2048 * 40]; +u32 NumIndices, NumEdgeIndices; + +const u32 EdgeIndicesOffset = 2048 * 30; GLuint TexMemID; GLuint TexPalMemID; @@ -280,7 +285,7 @@ bool Init() glGenBuffers(1, &ShaderConfigUBO); glBindBuffer(GL_UNIFORM_BUFFER, ShaderConfigUBO); - glBufferData(GL_UNIFORM_BUFFER, sizeof(ShaderConfig), &ShaderConfig, GL_STATIC_DRAW); + glBufferData(GL_UNIFORM_BUFFER, (sizeof(ShaderConfig) + 15) & ~15, &ShaderConfig, GL_STATIC_DRAW); glBindBufferBase(GL_UNIFORM_BUFFER, 0, ShaderConfigUBO); @@ -320,6 +325,9 @@ bool Init() glEnableVertexAttribArray(3); // attrib glVertexAttribIPointer(3, 3, GL_UNSIGNED_INT, 7*4, (void*)(4*4)); + glGenBuffers(1, &IndexBufferID); + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, IndexBufferID); + glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(IndexBuffer), NULL, GL_DYNAMIC_DRAW); glGenFramebuffers(4, &FramebufferID[0]); glBindFramebuffer(GL_FRAMEBUFFER, FramebufferID[0]); @@ -563,15 +571,15 @@ void BuildPolygons(RendererPolygon* polygons, int npolys) u32* vptr = &VertexBuffer[0]; u32 vidx = 0; - u16* iptr = &IndexBuffer[0]; - u16* eiptr = &IndexBuffer[2048*30]; + u32 iidx = 
0; + u32 eidx = EdgeIndicesOffset; for (int i = 0; i < npolys; i++) { RendererPolygon* rp = &polygons[i]; Polygon* poly = rp->PolyData; - rp->Indices = iptr; + rp->IndicesOffset = iidx; rp->NumIndices = 0; u32 vidx_first = vidx; @@ -606,7 +614,7 @@ void BuildPolygons(RendererPolygon* polygons, int npolys) vptr = SetupVertex(poly, j, vtx, vtxattr, vptr); - *iptr++ = vidx; + IndexBuffer[iidx++] = vidx; rp->NumIndices++; vidx++; @@ -627,9 +635,9 @@ void BuildPolygons(RendererPolygon* polygons, int npolys) } // build a triangle - *iptr++ = vidx_first; - *iptr++ = vidx - 2; - *iptr++ = vidx - 1; + IndexBuffer[iidx++] = vidx_first; + IndexBuffer[iidx++] = vidx - 2; + IndexBuffer[iidx++] = vidx - 1; rp->NumIndices += 3; } else // quad, pentagon, etc @@ -649,9 +657,9 @@ void BuildPolygons(RendererPolygon* polygons, int npolys) if (j >= 2) { // build a triangle - *iptr++ = vidx_first; - *iptr++ = vidx - 1; - *iptr++ = vidx; + IndexBuffer[iidx++] = vidx_first; + IndexBuffer[iidx++] = vidx - 1; + IndexBuffer[iidx++] = vidx; rp->NumIndices += 3; } @@ -743,46 +751,50 @@ void BuildPolygons(RendererPolygon* polygons, int npolys) if (j >= 1) { // build a triangle - *iptr++ = vidx_first; - *iptr++ = vidx - 1; - *iptr++ = vidx; + IndexBuffer[iidx++] = vidx_first; + IndexBuffer[iidx++] = vidx - 1; + IndexBuffer[iidx++] = vidx; rp->NumIndices += 3; } vidx++; } - *iptr++ = vidx_first; - *iptr++ = vidx - 1; - *iptr++ = vidx_first + 1; + IndexBuffer[iidx++] = vidx_first; + IndexBuffer[iidx++] = vidx - 1; + IndexBuffer[iidx++] = vidx_first + 1; rp->NumIndices += 3; } } - rp->EdgeIndices = eiptr; + rp->EdgeIndicesOffset = eidx; rp->NumEdgeIndices = 0; u32 vidx_cur = vidx_first; for (int j = 1; j < poly->NumVertices; j++) { - *eiptr++ = vidx_cur; - *eiptr++ = vidx_cur + 1; + IndexBuffer[eidx++] = vidx_cur; + IndexBuffer[eidx++] = vidx_cur + 1; vidx_cur++; rp->NumEdgeIndices += 2; } - *eiptr++ = vidx_cur; - *eiptr++ = vidx_first; + IndexBuffer[eidx++] = vidx_cur; + IndexBuffer[eidx++] = 
vidx_first; rp->NumEdgeIndices += 2; } NumVertices = vidx; + NumIndices = iidx; + NumEdgeIndices = eidx - EdgeIndicesOffset; } -void RenderSinglePolygon(int i) +int RenderSinglePolygon(int i) { RendererPolygon* rp = &PolygonList[i]; - glDrawElements(rp->PrimType, rp->NumIndices, GL_UNSIGNED_SHORT, rp->Indices); + glDrawElements(rp->PrimType, rp->NumIndices, GL_UNSIGNED_SHORT, (void*)(uintptr_t)(rp->IndicesOffset * 2)); + + return 1; } int RenderPolygonBatch(int i) @@ -803,7 +815,7 @@ int RenderPolygonBatch(int i) numindices += cur_rp->NumIndices; } - glDrawElements(primtype, numindices, GL_UNSIGNED_SHORT, rp->Indices); + glDrawElements(primtype, numindices, GL_UNSIGNED_SHORT, (void*)(uintptr_t)(rp->IndicesOffset * 2)); return numpolys; } @@ -823,7 +835,7 @@ int RenderPolygonEdgeBatch(int i) numindices += cur_rp->NumEdgeIndices; } - glDrawElements(GL_LINES, numindices, GL_UNSIGNED_SHORT, rp->EdgeIndices); + glDrawElements(GL_LINES, numindices, GL_UNSIGNED_SHORT, (void*)(uintptr_t)(rp->EdgeIndicesOffset * 2)); return numpolys; } @@ -857,6 +869,7 @@ void RenderSceneChunk(int y, int h) RendererPolygon* rp = &PolygonList[i]; if (rp->PolyData->IsShadowMask) { i++; continue; } + if (rp->PolyData->Translucent) { i++; continue; } if (rp->PolyData->Attr & (1<<14)) glDepthFunc(GL_LEQUAL); @@ -874,7 +887,8 @@ void RenderSceneChunk(int y, int h) } // if edge marking is enabled, mark all opaque edges - if (RenderDispCnt & (1<<5)) + // TODO BETTER EDGE MARKING!!! 
THIS SUCKS + /*if (RenderDispCnt & (1<<5)) { UseRenderShader(flags | RenderFlag_Edge); glLineWidth(1.5); @@ -899,7 +913,7 @@ void RenderSceneChunk(int y, int h) } glDepthMask(GL_TRUE); - } + }*/ glEnable(GL_BLEND); glBlendEquationSeparate(GL_FUNC_ADD, GL_MAX); @@ -944,15 +958,32 @@ void RenderSceneChunk(int y, int h) } else if (rp->PolyData->Translucent) { - UseRenderShader(flags | RenderFlag_Trans); + bool needopaque = ((rp->PolyData->Attr & 0x001F0000) == 0x001F0000); - if (rp->PolyData->Attr & (1<<14)) + u32 polyattr = rp->PolyData->Attr; + u32 polyid = (polyattr >> 24) & 0x3F; + + if (polyattr & (1<<14)) glDepthFunc(GL_LEQUAL); else glDepthFunc(GL_LESS); - u32 polyattr = rp->PolyData->Attr; - u32 polyid = (polyattr >> 24) & 0x3F; + if (needopaque) + { + UseRenderShader(flags); + + glDisable(GL_BLEND); + glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); + glColorMaski(1, GL_TRUE, GL_TRUE, fogenable, GL_FALSE); + + glStencilFunc(GL_ALWAYS, polyid, 0xFF); + glStencilOp(GL_KEEP, GL_KEEP, GL_REPLACE); + glStencilMask(0xFF); + + RenderSinglePolygon(i); + } + + UseRenderShader(flags | RenderFlag_Trans); GLboolean transfog; if (!(polyattr & (1<<15))) transfog = fogenable; @@ -975,7 +1006,7 @@ void RenderSceneChunk(int y, int h) if (polyattr & (1<<11)) glDepthMask(GL_TRUE); else glDepthMask(GL_FALSE); - i += RenderPolygonBatch(i); + i += needopaque ? RenderSinglePolygon(i) : RenderPolygonBatch(i); } else { @@ -989,7 +1020,7 @@ void RenderSceneChunk(int y, int h) if (polyattr & (1<<11)) glDepthMask(GL_TRUE); else glDepthMask(GL_FALSE); - i += RenderPolygonBatch(i); + i += needopaque ? 
RenderSinglePolygon(i) : RenderPolygonBatch(i); } } else @@ -1030,20 +1061,37 @@ void RenderSceneChunk(int y, int h) } else if (rp->PolyData->Translucent) { - UseRenderShader(flags | RenderFlag_Trans); + bool needopaque = ((rp->PolyData->Attr & 0x001F0000) == 0x001F0000); u32 polyattr = rp->PolyData->Attr; u32 polyid = (polyattr >> 24) & 0x3F; - GLboolean transfog; - if (!(polyattr & (1<<15))) transfog = fogenable; - else transfog = GL_FALSE; - - if (rp->PolyData->Attr & (1<<14)) + if (polyattr & (1<<14)) glDepthFunc(GL_LEQUAL); else glDepthFunc(GL_LESS); + if (needopaque) + { + UseRenderShader(flags); + + glDisable(GL_BLEND); + glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); + glColorMaski(1, GL_TRUE, GL_TRUE, fogenable, GL_FALSE); + + glStencilFunc(GL_ALWAYS, polyid, 0xFF); + glStencilOp(GL_KEEP, GL_KEEP, GL_REPLACE); + glStencilMask(0xFF); + + RenderSinglePolygon(i); + } + + UseRenderShader(flags | RenderFlag_Trans); + + GLboolean transfog; + if (!(polyattr & (1<<15))) transfog = fogenable; + else transfog = GL_FALSE; + if (rp->PolyData->IsShadow) { glDisable(GL_BLEND); @@ -1067,8 +1115,7 @@ void RenderSceneChunk(int y, int h) if (polyattr & (1<<11)) glDepthMask(GL_TRUE); else glDepthMask(GL_FALSE); - RenderSinglePolygon(i); - i++; + i += RenderSinglePolygon(i); } else { @@ -1083,7 +1130,7 @@ void RenderSceneChunk(int y, int h) if (polyattr & (1<<11)) glDepthMask(GL_TRUE); else glDepthMask(GL_FALSE); - i += RenderPolygonBatch(i); + i += needopaque ? 
RenderSinglePolygon(i) : RenderPolygonBatch(i); } } else @@ -1320,6 +1367,11 @@ void RenderFrame() glBindBuffer(GL_ARRAY_BUFFER, VertexBufferID); glBufferSubData(GL_ARRAY_BUFFER, 0, NumVertices*7*4, VertexBuffer); + // bind to access the index buffer + glBindVertexArray(VertexArrayID); + glBufferSubData(GL_ELEMENT_ARRAY_BUFFER, 0, NumIndices * 2, IndexBuffer); + glBufferSubData(GL_ELEMENT_ARRAY_BUFFER, EdgeIndicesOffset * 2, NumEdgeIndices * 2, IndexBuffer + EdgeIndicesOffset); + RenderSceneChunk(0, 192); } diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index e9d8e75f..d66eb76e 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -58,15 +58,17 @@ bool PrevIsShadowMask; bool Enabled; +bool FrameIdentical; + // threading bool Threaded; -void* RenderThread; +Platform::Thread* RenderThread; bool RenderThreadRunning; bool RenderThreadRendering; -void* Sema_RenderStart; -void* Sema_RenderDone; -void* Sema_ScanlineCount; +Platform::Semaphore* Sema_RenderStart; +Platform::Semaphore* Sema_RenderDone; +Platform::Semaphore* Sema_ScanlineCount; void RenderThreadFunc(); @@ -550,6 +552,16 @@ typedef struct RendererPolygon PolygonList[2048]; +template +inline T ReadVRAM_Texture(u32 addr) +{ + return *(T*)&GPU::VRAMFlat_Texture[addr & 0x7FFFF]; +} +template +inline T ReadVRAM_TexPal(u32 addr) +{ + return *(T*)&GPU::VRAMFlat_TexPal[addr & 0x1FFFF]; +} void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha) { @@ -606,10 +618,10 @@ void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha case 1: // A3I5 { vramaddr += ((t * width) + s); - u8 pixel = GPU::ReadVRAM_Texture(vramaddr); + u8 pixel = ReadVRAM_Texture(vramaddr); texpal <<= 4; - *color = GPU::ReadVRAM_TexPal(texpal + ((pixel&0x1F)<<1)); + *color = ReadVRAM_TexPal(texpal + ((pixel&0x1F)<<1)); *alpha = ((pixel >> 3) & 0x1C) + (pixel >> 6); } break; @@ -617,12 +629,12 @@ void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha case 2: // 
4-color { vramaddr += (((t * width) + s) >> 2); - u8 pixel = GPU::ReadVRAM_Texture(vramaddr); + u8 pixel = ReadVRAM_Texture(vramaddr); pixel >>= ((s & 0x3) << 1); pixel &= 0x3; texpal <<= 3; - *color = GPU::ReadVRAM_TexPal(texpal + (pixel<<1)); + *color = ReadVRAM_TexPal(texpal + (pixel<<1)); *alpha = (pixel==0) ? alpha0 : 31; } break; @@ -630,12 +642,12 @@ void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha case 3: // 16-color { vramaddr += (((t * width) + s) >> 1); - u8 pixel = GPU::ReadVRAM_Texture(vramaddr); + u8 pixel = ReadVRAM_Texture(vramaddr); if (s & 0x1) pixel >>= 4; else pixel &= 0xF; texpal <<= 4; - *color = GPU::ReadVRAM_TexPal(texpal + (pixel<<1)); + *color = ReadVRAM_TexPal(texpal + (pixel<<1)); *alpha = (pixel==0) ? alpha0 : 31; } break; @@ -643,10 +655,10 @@ void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha case 4: // 256-color { vramaddr += ((t * width) + s); - u8 pixel = GPU::ReadVRAM_Texture(vramaddr); + u8 pixel = ReadVRAM_Texture(vramaddr); texpal <<= 4; - *color = GPU::ReadVRAM_TexPal(texpal + (pixel<<1)); + *color = ReadVRAM_TexPal(texpal + (pixel<<1)); *alpha = (pixel==0) ? 
alpha0 : 31; } break; @@ -660,30 +672,30 @@ void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha if (vramaddr >= 0x40000) slot1addr += 0x10000; - u8 val = GPU::ReadVRAM_Texture(vramaddr); + u8 val = ReadVRAM_Texture(vramaddr); val >>= (2 * (s & 0x3)); - u16 palinfo = GPU::ReadVRAM_Texture(slot1addr); + u16 palinfo = ReadVRAM_Texture(slot1addr); u32 paloffset = (palinfo & 0x3FFF) << 2; texpal <<= 4; switch (val & 0x3) { case 0: - *color = GPU::ReadVRAM_TexPal(texpal + paloffset); + *color = ReadVRAM_TexPal(texpal + paloffset); *alpha = 31; break; case 1: - *color = GPU::ReadVRAM_TexPal(texpal + paloffset + 2); + *color = ReadVRAM_TexPal(texpal + paloffset + 2); *alpha = 31; break; case 2: if ((palinfo >> 14) == 1) { - u16 color0 = GPU::ReadVRAM_TexPal(texpal + paloffset); - u16 color1 = GPU::ReadVRAM_TexPal(texpal + paloffset + 2); + u16 color0 = ReadVRAM_TexPal(texpal + paloffset); + u16 color1 = ReadVRAM_TexPal(texpal + paloffset + 2); u32 r0 = color0 & 0x001F; u32 g0 = color0 & 0x03E0; @@ -700,8 +712,8 @@ void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha } else if ((palinfo >> 14) == 3) { - u16 color0 = GPU::ReadVRAM_TexPal(texpal + paloffset); - u16 color1 = GPU::ReadVRAM_TexPal(texpal + paloffset + 2); + u16 color0 = ReadVRAM_TexPal(texpal + paloffset); + u16 color1 = ReadVRAM_TexPal(texpal + paloffset + 2); u32 r0 = color0 & 0x001F; u32 g0 = color0 & 0x03E0; @@ -717,20 +729,20 @@ void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha *color = r | g | b; } else - *color = GPU::ReadVRAM_TexPal(texpal + paloffset + 4); + *color = ReadVRAM_TexPal(texpal + paloffset + 4); *alpha = 31; break; case 3: if ((palinfo >> 14) == 2) { - *color = GPU::ReadVRAM_TexPal(texpal + paloffset + 6); + *color = ReadVRAM_TexPal(texpal + paloffset + 6); *alpha = 31; } else if ((palinfo >> 14) == 3) { - u16 color0 = GPU::ReadVRAM_TexPal(texpal + paloffset); - u16 color1 = GPU::ReadVRAM_TexPal(texpal + 
paloffset + 2); + u16 color0 = ReadVRAM_TexPal(texpal + paloffset); + u16 color1 = ReadVRAM_TexPal(texpal + paloffset + 2); u32 r0 = color0 & 0x001F; u32 g0 = color0 & 0x03E0; @@ -759,10 +771,10 @@ void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha case 6: // A5I3 { vramaddr += ((t * width) + s); - u8 pixel = GPU::ReadVRAM_Texture(vramaddr); + u8 pixel = ReadVRAM_Texture(vramaddr); texpal <<= 4; - *color = GPU::ReadVRAM_TexPal(texpal + ((pixel&0x7)<<1)); + *color = ReadVRAM_TexPal(texpal + ((pixel&0x7)<<1)); *alpha = (pixel >> 3); } break; @@ -770,7 +782,7 @@ void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha case 7: // direct color { vramaddr += (((t * width) + s) << 1); - *color = GPU::ReadVRAM_Texture(vramaddr); + *color = ReadVRAM_Texture(vramaddr); *alpha = (*color & 0x8000) ? 31 : 0; } break; @@ -2007,8 +2019,8 @@ void ClearBuffers() { for (int x = 0; x < 256; x++) { - u16 val2 = GPU::ReadVRAM_Texture(0x40000 + (yoff << 9) + (xoff << 1)); - u16 val3 = GPU::ReadVRAM_Texture(0x60000 + (yoff << 9) + (xoff << 1)); + u16 val2 = ReadVRAM_Texture(0x40000 + (yoff << 9) + (xoff << 1)); + u16 val3 = ReadVRAM_Texture(0x60000 + (yoff << 9) + (xoff << 1)); // TODO: confirm color conversion u32 r = (val2 << 1) & 0x3E; if (r) r++; @@ -2088,11 +2100,19 @@ void VCount144() void RenderFrame() { + auto textureDirty = GPU::VRAMDirty_Texture.DeriveState(GPU::VRAMMap_Texture); + auto texPalDirty = GPU::VRAMDirty_TexPal.DeriveState(GPU::VRAMMap_TexPal); + + bool textureChanged = GPU::MakeVRAMFlat_TextureCoherent(textureDirty); + bool texPalChanged = GPU::MakeVRAMFlat_TexPalCoherent(texPalDirty); + + FrameIdentical = !(textureChanged || texPalChanged) && RenderFrameIdentical; + if (RenderThreadRunning) { Platform::Semaphore_Post(Sema_RenderStart); } - else + else if (!FrameIdentical) { ClearBuffers(); RenderPolygons(false, &RenderPolygonRAM[0], RenderNumPolygons); @@ -2107,8 +2127,15 @@ void RenderThreadFunc() if 
(!RenderThreadRunning) return; RenderThreadRendering = true; - ClearBuffers(); - RenderPolygons(true, &RenderPolygonRAM[0], RenderNumPolygons); + if (FrameIdentical) + { + Platform::Semaphore_Post(Sema_ScanlineCount, 192); + } + else + { + ClearBuffers(); + RenderPolygons(true, &RenderPolygonRAM[0], RenderNumPolygons); + } Platform::Semaphore_Post(Sema_RenderDone); RenderThreadRendering = false; diff --git a/src/GPU_OpenGL.cpp b/src/GPU_OpenGL.cpp index 359e9cd4..0c6cf004 100644 --- a/src/GPU_OpenGL.cpp +++ b/src/GPU_OpenGL.cpp @@ -36,6 +36,7 @@ int ScreenH, ScreenW; GLuint CompShader[1][3]; GLuint CompScaleLoc[1]; +GLuint Comp3DXPosLoc[1]; GLuint CompVertexBufferID; GLuint CompVertexArrayID; @@ -64,6 +65,7 @@ bool Init() return false; CompScaleLoc[i] = glGetUniformLocation(CompShader[i][2], "u3DScale"); + Comp3DXPosLoc[i] = glGetUniformLocation(CompShader[i][2], "u3DXPos"); glUseProgram(CompShader[i][2]); uni_id = glGetUniformLocation(CompShader[i][2], "ScreenTex"); @@ -180,6 +182,9 @@ void RenderFrame() OpenGL::UseShaderProgram(CompShader[0]); glUniform1ui(CompScaleLoc[0], Scale); + // TODO: support setting this midframe, if ever needed + glUniform1i(Comp3DXPosLoc[0], ((int)GPU3D::RenderXPos << 23) >> 23); + int frontbuf = GPU::FrontBuffer; glActiveTexture(GL_TEXTURE0); glBindTexture(GL_TEXTURE_2D, CompScreenInputTex); diff --git a/src/GPU_OpenGL_shaders.h b/src/GPU_OpenGL_shaders.h index 20ac7673..03ddb7af 100644 --- a/src/GPU_OpenGL_shaders.h +++ b/src/GPU_OpenGL_shaders.h @@ -40,6 +40,7 @@ void main() const char* kCompositorFS_Nearest = R"(#version 140 uniform uint u3DScale; +uniform int u3DXPos; uniform usampler2D ScreenTex; uniform sampler2D _3DTex; @@ -52,6 +53,8 @@ void main() { ivec4 pixel = ivec4(texelFetch(ScreenTex, ivec2(fTexcoord), 0)); + float _3dxpos = float(u3DXPos); + ivec4 mbright = ivec4(texelFetch(ScreenTex, ivec2(256*3, int(fTexcoord.y)), 0)); int dispmode = mbright.b & 0x3; @@ -68,7 +71,7 @@ void main() { // 3D on top, blending - float xpos 
= val3.r + fract(fTexcoord.x); + float xpos = fTexcoord.x + _3dxpos; float ypos = mod(fTexcoord.y, 192); ivec4 _3dpix = ivec4(texelFetch(_3DTex, ivec2(vec2(xpos, ypos)*u3DScale), 0).bgra * vec4(63,63,63,31)); @@ -89,7 +92,7 @@ void main() { // 3D on bottom, blending - float xpos = val3.r + fract(fTexcoord.x); + float xpos = fTexcoord.x + _3dxpos; float ypos = mod(fTexcoord.y, 192); ivec4 _3dpix = ivec4(texelFetch(_3DTex, ivec2(vec2(xpos, ypos)*u3DScale), 0).bgra * vec4(63,63,63,31)); @@ -109,7 +112,7 @@ void main() { // 3D on top, normal/fade - float xpos = val3.r + fract(fTexcoord.x); + float xpos = fTexcoord.x + _3dxpos; float ypos = mod(fTexcoord.y, 192); ivec4 _3dpix = ivec4(texelFetch(_3DTex, ivec2(vec2(xpos, ypos)*u3DScale), 0).bgra * vec4(63,63,63,31)); diff --git a/src/NDS.cpp b/src/NDS.cpp index 90149add..8b49328f 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -210,13 +210,13 @@ bool Init() void DeInit() { - delete ARM9; - delete ARM7; - #ifdef JIT_ENABLED ARMJIT::DeInit(); #endif + delete ARM9; + delete ARM7; + for (int i = 0; i < 8; i++) delete DMAs[i]; @@ -908,7 +908,7 @@ void RunSystem(u64 timestamp) } } -template +template u32 RunFrame() { FrameStartTimestamp = SysTimestamp; @@ -934,10 +934,10 @@ u32 RunFrame() } else if (CPUStop & 0x0FFF) { - DMAs[0]->Run(); - if (!(CPUStop & 0x80000000)) DMAs[1]->Run(); - if (!(CPUStop & 0x80000000)) DMAs[2]->Run(); - if (!(CPUStop & 0x80000000)) DMAs[3]->Run(); + DMAs[0]->Run(); + if (!(CPUStop & 0x80000000)) DMAs[1]->Run(); + if (!(CPUStop & 0x80000000)) DMAs[2]->Run(); + if (!(CPUStop & 0x80000000)) DMAs[3]->Run(); if (ConsoleType == 1) DSi::RunNDMAs(0); } else @@ -962,10 +962,10 @@ u32 RunFrame() if (CPUStop & 0x0FFF0000) { - DMAs[4]->Run(); - DMAs[5]->Run(); - DMAs[6]->Run(); - DMAs[7]->Run(); + DMAs[4]->Run(); + DMAs[5]->Run(); + DMAs[6]->Run(); + DMAs[7]->Run(); if (ConsoleType == 1) DSi::RunNDMAs(1); } else @@ -999,6 +999,9 @@ u32 RunFrame() ARM7Timestamp-SysTimestamp, GPU3D::Timestamp-SysTimestamp); #endif + 
SPU::TransferOutput(); + + NDSCart::FlushSRAMFile(); NumFrames++; @@ -1009,10 +1012,14 @@ u32 RunFrame() { #ifdef JIT_ENABLED if (Config::JIT_Enable) - return RunFrame(); + return NDS::ConsoleType == 1 + ? RunFrame() + : RunFrame(); else #endif - return RunFrame(); + return NDS::ConsoleType == 1 + ? RunFrame() + : RunFrame(); } void Reschedule(u64 target) @@ -1470,7 +1477,7 @@ void HandleTimerOverflow(u32 tid) { Timer* timer = &Timers[tid]; - timer->Counter += timer->Reload << 16; + timer->Counter += (timer->Reload << 10); if (timer->Cnt & (1<<6)) SetIRQ(tid >> 2, IRQ_Timer0 + (tid & 0x3)); @@ -1486,11 +1493,11 @@ void HandleTimerOverflow(u32 tid) if ((timer->Cnt & 0x84) != 0x84) break; - timer->Counter += 0x10000; - if (timer->Counter >> 16) + timer->Counter += (1 << 10); + if (!(timer->Counter >> 26)) break; - timer->Counter = timer->Reload << 16; + timer->Counter = timer->Reload << 10; if (timer->Cnt & (1<<6)) SetIRQ(tid >> 2, IRQ_Timer0 + (tid & 0x3)); @@ -1505,8 +1512,13 @@ void RunTimer(u32 tid, s32 cycles) u32 oldcount = timer->Counter; timer->Counter += (cycles << timer->CycleShift); - if (timer->Counter < oldcount) + //if (timer->Counter < oldcount) + // HandleTimerOverflow(tid); + while (timer->Counter >> 26) + { + timer->Counter -= (1 << 26); HandleTimerOverflow(tid); + } } void RunTimers(u32 cpu) @@ -1623,7 +1635,7 @@ u16 TimerGetCounter(u32 timer) RunTimers(timer>>2); u32 ret = Timers[timer].Counter; - return ret >> 16; + return ret >> 10; } void TimerStart(u32 id, u16 cnt) @@ -1633,11 +1645,11 @@ void TimerStart(u32 id, u16 cnt) u16 newstart = cnt & (1<<7); timer->Cnt = cnt; - timer->CycleShift = 16 - TimerPrescaler[cnt & 0x03]; + timer->CycleShift = 10 - TimerPrescaler[cnt & 0x03]; if ((!curstart) && newstart) { - timer->Counter = timer->Reload << 16; + timer->Counter = timer->Reload << 10; /*if ((cnt & 0x84) == 0x80) { @@ -1824,14 +1836,14 @@ void debug(u32 param) fclose(shit);*/ FILE* - shit = fopen("debug/picto9.bin", "wb"); + shit = 
fopen("debug/power9.bin", "wb"); for (u32 i = 0x02000000; i < 0x04000000; i+=4) { u32 val = DSi::ARM9Read32(i); fwrite(&val, 4, 1, shit); } fclose(shit); - shit = fopen("debug/picto7.bin", "wb"); + shit = fopen("debug/power7.bin", "wb"); for (u32 i = 0x02000000; i < 0x04000000; i+=4) { u32 val = DSi::ARM7Read32(i); @@ -3001,6 +3013,7 @@ u32 ARM9IORead32(u32 addr) case 0x04000130: return (KeyInput & 0xFFFF) | (KeyCnt << 16); case 0x04000180: return IPCSync9; + case 0x04000184: return ARM9IORead16(addr); case 0x040001A0: return NDSCart::SPICnt | (NDSCart::ReadSPIData() << 16); case 0x040001A4: return NDSCart::ROMCnt; @@ -3115,6 +3128,10 @@ void ARM9IOWrite8(u32 addr, u8 val) NDSCart::WriteSPIData(val); return; + case 0x04000188: + ARM9IOWrite32(addr, val | (val << 8) | (val << 16) | (val << 24)); + return; + case 0x040001A8: NDSCart::ROMCommand[0] = val; return; case 0x040001A9: NDSCart::ROMCommand[1] = val; return; case 0x040001AA: NDSCart::ROMCommand[2] = val; return; @@ -3228,7 +3245,11 @@ void ARM9IOWrite16(u32 addr, u16 val) SetIRQ(0, IRQ_IPCRecv); if (val & 0x4000) IPCFIFOCnt9 &= ~0x4000; - IPCFIFOCnt9 = val & 0x8404; + IPCFIFOCnt9 = (val & 0x8404) | (IPCFIFOCnt9 & 0x4000); + return; + + case 0x04000188: + ARM9IOWrite32(addr, val | (val << 16)); return; case 0x040001A0: @@ -3378,10 +3399,11 @@ void ARM9IOWrite32(u32 addr, u32 val) case 0x04000130: KeyCnt = val >> 16; return; + case 0x04000180: + case 0x04000184: ARM9IOWrite16(addr, val); return; - case 0x04000188: if (IPCFIFOCnt9 & 0x8000) { @@ -3640,6 +3662,7 @@ u32 ARM7IORead32(u32 addr) case 0x04000138: return RTC::Read(); case 0x04000180: return IPCSync7; + case 0x04000184: return ARM7IORead16(addr); case 0x040001A0: return NDSCart::SPICnt | (NDSCart::ReadSPIData() << 16); case 0x040001A4: return NDSCart::ROMCnt; @@ -3716,6 +3739,10 @@ void ARM7IOWrite8(u32 addr, u8 val) case 0x04000138: RTC::Write(val, true); return; + case 0x04000188: + ARM7IOWrite32(addr, val | (val << 8) | (val << 16) | (val << 24)); + 
return; + case 0x040001A0: if (ExMemCnt[0] & (1<<11)) { @@ -3821,7 +3848,11 @@ void ARM7IOWrite16(u32 addr, u16 val) SetIRQ(1, IRQ_IPCRecv); if (val & 0x4000) IPCFIFOCnt7 &= ~0x4000; - IPCFIFOCnt7 = val & 0x8404; + IPCFIFOCnt7 = (val & 0x8404) | (IPCFIFOCnt7 & 0x4000); + return; + + case 0x04000188: + ARM7IOWrite32(addr, val | (val << 16)); return; case 0x040001A0: @@ -3940,6 +3971,7 @@ void ARM7IOWrite32(u32 addr, u32 val) case 0x04000138: RTC::Write(val & 0xFFFF, false); return; case 0x04000180: + case 0x04000184: ARM7IOWrite16(addr, val); return; case 0x04000188: @@ -3984,6 +4016,11 @@ void ARM7IOWrite32(u32 addr, u32 val) case 0x040001B0: *(u32*)&ROMSeed0[8] = val; return; case 0x040001B4: *(u32*)&ROMSeed1[8] = val; return; + case 0x040001C0: + SPI::WriteCnt(val & 0xFFFF); + SPI::WriteData((val >> 16) & 0xFF); + return; + case 0x04000208: IME[1] = val & 0x1; UpdateIRQ(1); return; case 0x04000210: IE[1] = val; UpdateIRQ(1); return; case 0x04000214: IF[1] &= ~val; UpdateIRQ(1); return; diff --git a/src/NDS.h b/src/NDS.h index 046d84b6..98a0f7d6 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -46,6 +46,8 @@ enum Event_DSi_SDMMCTransfer, Event_DSi_SDIOTransfer, Event_DSi_NWifi, + Event_DSi_CamIRQ, + Event_DSi_CamTransfer, Event_DSi_RAMSizeChange, @@ -82,7 +84,7 @@ enum IRQ_IPCSendDone, IRQ_IPCRecv, IRQ_CartSendDone, // TODO: less misleading name - IRQ_CartIREQMC, // IRQ triggered by game cart (example: Pok�mon Typing Adventure, BT controller) + IRQ_CartIREQMC, // IRQ triggered by game cart (example: Pokémon Typing Adventure, BT controller) IRQ_GXFIFO, IRQ_LidOpen, IRQ_SPI, diff --git a/src/NDSCart.cpp b/src/NDSCart.cpp index 077bf48c..2d8396ad 100644 --- a/src/NDSCart.cpp +++ b/src/NDSCart.cpp @@ -37,6 +37,7 @@ u8* SRAM; u32 SRAMLength; char SRAMPath[1024]; +bool SRAMFileDirty; void (*WriteFunc)(u8 val, bool islast); @@ -445,14 +446,21 @@ void Write(u8 val, u32 hold) break; } - if (islast && (CurCmd == 0x02 || CurCmd == 0x0A) && (SRAMLength > 0)) + SRAMFileDirty |= 
islast && (CurCmd == 0x02 || CurCmd == 0x0A) && (SRAMLength > 0); +} + +void FlushSRAMFile() +{ + if (!SRAMFileDirty) + return; + + SRAMFileDirty = false; + + FILE* f = Platform::OpenFile(SRAMPath, "wb"); + if (f) { - FILE* f = Platform::OpenFile(SRAMPath, "wb"); - if (f) - { - fwrite(SRAM, SRAMLength, 1, f); - fclose(f); - } + fwrite(SRAM, SRAMLength, 1, f); + fclose(f); } } @@ -1034,6 +1042,11 @@ void RelocateSave(const char* path, bool write) NDSCart_SRAM::RelocateSave(path, write); } +void FlushSRAMFile() +{ + NDSCart_SRAM::FlushSRAMFile(); +} + int ImportSRAM(const u8* data, u32 length) { memcpy(NDSCart_SRAM::SRAM, data, std::min(length, NDSCart_SRAM::SRAMLength)); diff --git a/src/NDSCart.h b/src/NDSCart.h index 9fe916db..7d3f4a15 100644 --- a/src/NDSCart.h +++ b/src/NDSCart.h @@ -46,6 +46,9 @@ void DoSavestate(Savestate* file); void DecryptSecureArea(u8* out); bool LoadROM(const char* path, const char* sram, bool direct); + +void FlushSRAMFile(); + void RelocateSave(const char* path, bool write); int ImportSRAM(const u8* data, u32 length); diff --git a/src/NonStupidBitfield.h b/src/NonStupidBitfield.h new file mode 100644 index 00000000..124ba76f --- /dev/null +++ b/src/NonStupidBitfield.h @@ -0,0 +1,149 @@ +#ifndef NONSTUPIDBITFIELD_H +#define NONSTUPIDBITFIELD_H + +#include "types.h" + +#include + +#include +#include + +// like std::bitset but less stupid and optimised for +// our use case (keeping track of memory invalidations) + +template +struct NonStupidBitField +{ + static_assert((Size % 8) == 0, "bitfield size must be a multiple of 8"); + static const u32 DataLength = Size / 8; + u8 Data[DataLength]; + + struct Ref + { + NonStupidBitField& BitField; + u32 Idx; + + operator bool() + { + return BitField.Data[Idx >> 3] & (1 << (Idx & 0x7)); + } + + Ref& operator=(bool set) + { + BitField.Data[Idx >> 3] &= ~(1 << (Idx & 0x7)); + BitField.Data[Idx >> 3] |= ((u8)set << (Idx & 0x7)); + return *this; + } + }; + + struct Iterator + { + NonStupidBitField& 
BitField; + u32 DataIdx; + u32 BitIdx; + u64 RemainingBits; + + u32 operator*() { return DataIdx * 8 + BitIdx; } + + bool operator==(const Iterator& other) { return other.DataIdx == DataIdx; } + bool operator!=(const Iterator& other) { return other.DataIdx != DataIdx; } + + template + void Next() + { + while (RemainingBits == 0 && DataIdx < DataLength) + { + DataIdx += sizeof(T); + RemainingBits = *(T*)&BitField.Data[DataIdx]; + } + + BitIdx = __builtin_ctzll(RemainingBits); + RemainingBits &= ~(1ULL << BitIdx); + } + + Iterator operator++(int) + { + Iterator prev(*this); + ++*this; + return prev; + } + + Iterator& operator++() + { + if ((DataLength % 8) == 0) + Next(); + else if ((DataLength % 4) == 0) + Next(); + else if ((DataLength % 2) == 0) + Next(); + else + Next(); + + return *this; + } + }; + + NonStupidBitField(u32 start, u32 size) + { + memset(Data, 0, sizeof(Data)); + + if (size == 0) + return; + + u32 roundedStartBit = (start + 7) & ~7; + u32 roundedEndBit = (start + size) & ~7; + if (roundedStartBit != roundedEndBit) + memset(Data + roundedStartBit / 8, 0xFF, (roundedEndBit - roundedStartBit) / 8); + + if (start & 0x7) + Data[start >> 3] = 0xFF << (start & 0x7); + if ((start + size) & 0x7) + Data[(start + size) >> 3] = 0xFF >> ((start + size) & 0x7); + } + + NonStupidBitField() + { + memset(Data, 0, sizeof(Data)); + } + + Iterator End() + { + return Iterator{*this, DataLength, 0, 0}; + } + Iterator Begin() + { + if ((DataLength % 8) == 0) + return ++Iterator{*this, 0, 0, *(u64*)Data}; + else if ((DataLength % 4) == 0) + return ++Iterator{*this, 0, 0, *(u32*)Data}; + else if ((DataLength % 2) == 0) + return ++Iterator{*this, 0, 0, *(u16*)Data}; + else + return ++Iterator{*this, 0, 0, *Data}; + } + + Ref operator[](u32 idx) + { + return Ref{*this, idx}; + } + + NonStupidBitField& operator|=(const NonStupidBitField& other) + { + for (u32 i = 0; i < DataLength; i++) + { + Data[i] |= other.Data[i]; + } + return *this; + } + NonStupidBitField& 
operator&=(const NonStupidBitField& other) + { + for (u32 i = 0; i < DataLength; i++) + { + Data[i] &= other.Data[i]; + } + return *this; + } +}; + + +#endif \ No newline at end of file diff --git a/src/OpenGLSupport.h b/src/OpenGLSupport.h index 925c0ad0..44c511f5 100644 --- a/src/OpenGLSupport.h +++ b/src/OpenGLSupport.h @@ -23,8 +23,13 @@ #include // TODO: different includes for each platform -#include -#include +#ifdef __APPLE__ + #include + #include +#else + #include + #include +#endif #include "Platform.h" @@ -61,6 +66,11 @@ #endif +#ifdef __APPLE__ + +#define DO_PROCLIST(func) + +#else #define DO_PROCLIST(func) \ DO_PROCLIST_1_3(func) \ @@ -128,6 +138,7 @@ \ func(GLGETSTRINGI, glGetStringi); \ +#endif namespace OpenGL { diff --git a/src/Platform.h b/src/Platform.h index fea98dd5..b4dda9eb 100644 --- a/src/Platform.h +++ b/src/Platform.h @@ -67,15 +67,24 @@ inline bool LocalFileExists(const char* name) return true; } -void* Thread_Create(void (*func)()); -void Thread_Free(void* thread); -void Thread_Wait(void* thread); +struct Thread; +Thread* Thread_Create(void (*func)()); +void Thread_Free(Thread* thread); +void Thread_Wait(Thread* thread); -void* Semaphore_Create(); -void Semaphore_Free(void* sema); -void Semaphore_Reset(void* sema); -void Semaphore_Wait(void* sema); -void Semaphore_Post(void* sema); +struct Semaphore; +Semaphore* Semaphore_Create(); +void Semaphore_Free(Semaphore* sema); +void Semaphore_Reset(Semaphore* sema); +void Semaphore_Wait(Semaphore* sema); +void Semaphore_Post(Semaphore* sema, int count = 1); + +struct Mutex; +Mutex* Mutex_Create(); +void Mutex_Free(Mutex* mutex); +void Mutex_Lock(Mutex* mutex); +void Mutex_Unlock(Mutex* mutex); +bool Mutex_TryLock(Mutex* mutex); void* GL_GetProcAddress(const char* proc); diff --git a/src/ROMList.h b/src/ROMList.h index ead3ee4c..03252bb0 100644 --- a/src/ROMList.h +++ b/src/ROMList.h @@ -24,11 +24,11 @@ typedef struct u32 GameCode; u32 ROMSize; u32 SaveMemType; - + } ROMListEntry; -ROMListEntry 
ROMList[] = +ROMListEntry ROMList[] = { {0x41464141, 0x00800000, 0x00000004}, {0x414D4155, 0x00800000, 0x00000008}, @@ -1143,7 +1143,7 @@ ROMListEntry ROMList[] = {0x454A4943, 0x00800000, 0x00000001}, {0x454A4956, 0x04000000, 0x00000003}, {0x454A4A42, 0x01000000, 0x00000001}, - {0x454A4A43, 0x00800000, 0x00000001}, + {0x454A4A43, 0x00800000, 0x00000002}, {0x454A4C41, 0x01000000, 0x00000001}, {0x454A4C42, 0x04000000, 0x00000001}, {0x454A4C43, 0x08000000, 0x00000002}, diff --git a/src/SPU.cpp b/src/SPU.cpp index 5b74bdae..fe798c79 100644 --- a/src/SPU.cpp +++ b/src/SPU.cpp @@ -18,6 +18,7 @@ #include #include +#include "Platform.h" #include "NDS.h" #include "DSi.h" #include "SPU.h" @@ -61,13 +62,15 @@ const s16 PSGTable[8][8] = {-0x7FFF, -0x7FFF, -0x7FFF, -0x7FFF, -0x7FFF, -0x7FFF, -0x7FFF, -0x7FFF} }; -const u32 kSamplesPerRun = 1; +const u32 OutputBufferSize = 2*2048; +s16 OutputBackbuffer[2 * OutputBufferSize]; +u32 OutputBackbufferWritePosition; -const u32 OutputBufferSize = 2*1024; -s16 OutputBuffer[2 * OutputBufferSize]; -volatile u32 OutputReadOffset; -volatile u32 OutputWriteOffset; +s16 OutputFrontBuffer[2 * OutputBufferSize]; +u32 OutputFrontBufferWritePosition; +u32 OutputFrontBufferReadPosition; +Platform::Mutex* AudioLock; u16 Cnt; u8 MasterVolume; @@ -85,6 +88,8 @@ bool Init() Capture[0] = new CaptureUnit(0); Capture[1] = new CaptureUnit(1); + AudioLock = Platform::Mutex_Create(); + return true; } @@ -95,6 +100,8 @@ void DeInit() delete Capture[0]; delete Capture[1]; + + Platform::Mutex_Free(AudioLock); } void Reset() @@ -111,15 +118,18 @@ void Reset() Capture[0]->Reset(); Capture[1]->Reset(); - NDS::ScheduleEvent(NDS::Event_SPU, true, 1024*kSamplesPerRun, Mix, kSamplesPerRun); + NDS::ScheduleEvent(NDS::Event_SPU, true, 1024, Mix, 0); } void Stop() { - memset(OutputBuffer, 0, 2*OutputBufferSize*2); + Platform::Mutex_Lock(AudioLock); + memset(OutputFrontBuffer, 0, 2*OutputBufferSize*2); - OutputReadOffset = 0; - OutputWriteOffset = 0; + 
OutputBackbufferWritePosition = 0; + OutputFrontBufferReadPosition = 0; + OutputFrontBufferWritePosition = 0; + Platform::Mutex_Unlock(AudioLock); } void DoSavestate(Savestate* file) @@ -416,11 +426,11 @@ void Channel::NextSample_Noise() } template -void Channel::Run(s32* buf, u32 samples) +s32 Channel::Run() { - if (!(Cnt & (1<<31))) return; + if (!(Cnt & (1<<31))) return 0; - if ((type < 3) && ((Length+LoopPos) < 16)) return; + if ((type < 3) && ((Length+LoopPos) < 16)) return 0; if (KeyOn) { @@ -428,45 +438,32 @@ void Channel::Run(s32* buf, u32 samples) KeyOn = false; } - for (u32 s = 0; s < samples; s++) + Timer += 512; // 1 sample = 512 cycles at 16MHz + + while (Timer >> 16) { - Timer += 512; // 1 sample = 512 cycles at 16MHz + Timer = TimerReload + (Timer - 0x10000); - while (Timer >> 16) + switch (type) { - Timer = TimerReload + (Timer - 0x10000); - - switch (type) - { - case 0: NextSample_PCM8(); break; - case 1: NextSample_PCM16(); break; - case 2: NextSample_ADPCM(); break; - case 3: NextSample_PSG(); break; - case 4: NextSample_Noise(); break; - } + case 0: NextSample_PCM8(); break; + case 1: NextSample_PCM16(); break; + case 2: NextSample_ADPCM(); break; + case 3: NextSample_PSG(); break; + case 4: NextSample_Noise(); break; } - - s32 val = (s32)CurSample; - val <<= VolumeShift; - val *= Volume; - buf[s] = val; - - if (!(Cnt & (1<<31))) break; } + + s32 val = (s32)CurSample; + val <<= VolumeShift; + val *= Volume; + return val; } -void Channel::PanOutput(s32* inbuf, u32 samples, s32* leftbuf, s32* rightbuf) +void Channel::PanOutput(s32 in, s32& left, s32& right) { - for (u32 s = 0; s < samples; s++) - { - s32 val = (s32)inbuf[s]; - - s32 l = ((s64)val * (128-Pan)) >> 10; - s32 r = ((s64)val * Pan) >> 10; - - leftbuf[s] += l; - rightbuf[s] += r; - } + left += ((s64)in * (128-Pan)) >> 10; + right += ((s64)in * Pan) >> 10; } @@ -602,39 +599,31 @@ void CaptureUnit::Run(s32 sample) } -void Mix(u32 samples) +void Mix(u32 dummy) { - s32 channelbuf[32]; - s32 
leftbuf[32], rightbuf[32]; - s32 ch0buf[32], ch1buf[32], ch2buf[32], ch3buf[32]; - s32 leftoutput[32], rightoutput[32]; - - for (u32 s = 0; s < samples; s++) - { - leftbuf[s] = 0; rightbuf[s] = 0; - leftoutput[s] = 0; rightoutput[s] = 0; - } + s32 left = 0, right = 0; + s32 leftoutput = 0, rightoutput = 0; if (Cnt & (1<<15)) { - Channels[0]->DoRun(ch0buf, samples); - Channels[1]->DoRun(ch1buf, samples); - Channels[2]->DoRun(ch2buf, samples); - Channels[3]->DoRun(ch3buf, samples); + s32 ch0 = Channels[0]->DoRun(); + s32 ch1 = Channels[1]->DoRun(); + s32 ch2 = Channels[2]->DoRun(); + s32 ch3 = Channels[3]->DoRun(); // TODO: addition from capture registers - Channels[0]->PanOutput(ch0buf, samples, leftbuf, rightbuf); - Channels[2]->PanOutput(ch2buf, samples, leftbuf, rightbuf); + Channels[0]->PanOutput(ch0, left, right); + Channels[2]->PanOutput(ch2, left, right); - if (!(Cnt & (1<<12))) Channels[1]->PanOutput(ch1buf, samples, leftbuf, rightbuf); - if (!(Cnt & (1<<13))) Channels[3]->PanOutput(ch3buf, samples, leftbuf, rightbuf); + if (!(Cnt & (1<<12))) Channels[1]->PanOutput(ch1, left, right); + if (!(Cnt & (1<<13))) Channels[3]->PanOutput(ch3, left, right); for (int i = 4; i < 16; i++) { Channel* chan = Channels[i]; - chan->DoRun(channelbuf, samples); - chan->PanOutput(channelbuf, samples, leftbuf, rightbuf); + s32 channel = chan->DoRun(); + chan->PanOutput(channel, left, right); } // sound capture @@ -642,32 +631,24 @@ void Mix(u32 samples) if (Capture[0]->Cnt & (1<<7)) { - for (u32 s = 0; s < samples; s++) - { - s32 val = leftbuf[s]; + s32 val = left; - val >>= 8; - if (val < -0x8000) val = -0x8000; - else if (val > 0x7FFF) val = 0x7FFF; + val >>= 8; + if (val < -0x8000) val = -0x8000; + else if (val > 0x7FFF) val = 0x7FFF; - Capture[0]->Run(val); - if (!(Capture[0]->Cnt & (1<<7))) break; - } + Capture[0]->Run(val); } if (Capture[1]->Cnt & (1<<7)) { - for (u32 s = 0; s < samples; s++) - { - s32 val = rightbuf[s]; + s32 val = right; - val >>= 8; - if (val < -0x8000) 
val = -0x8000; - else if (val > 0x7FFF) val = 0x7FFF; + val >>= 8; + if (val < -0x8000) val = -0x8000; + else if (val > 0x7FFF) val = 0x7FFF; - Capture[1]->Run(val); - if (!(Capture[1]->Cnt & (1<<7))) break; - } + Capture[1]->Run(val); } // final output @@ -675,31 +656,25 @@ void Mix(u32 samples) switch (Cnt & 0x0300) { case 0x0000: // left mixer - { - for (u32 s = 0; s < samples; s++) - leftoutput[s] = leftbuf[s]; - } + leftoutput = left; break; case 0x0100: // channel 1 { s32 pan = 128 - Channels[1]->Pan; - for (u32 s = 0; s < samples; s++) - leftoutput[s] = ((s64)ch1buf[s] * pan) >> 10; + leftoutput = ((s64)ch1 * pan) >> 10; } break; case 0x0200: // channel 3 { s32 pan = 128 - Channels[3]->Pan; - for (u32 s = 0; s < samples; s++) - leftoutput[s] = ((s64)ch3buf[s] * pan) >> 10; + leftoutput = ((s64)ch3 * pan) >> 10; } break; case 0x0300: // channel 1+3 { s32 pan1 = 128 - Channels[1]->Pan; s32 pan3 = 128 - Channels[3]->Pan; - for (u32 s = 0; s < samples; s++) - leftoutput[s] = (((s64)ch1buf[s] * pan1) >> 10) + (((s64)ch3buf[s] * pan3) >> 10); + leftoutput = (((s64)ch1 * pan1) >> 10) + (((s64)ch3 * pan3) >> 10); } break; } @@ -707,105 +682,122 @@ void Mix(u32 samples) switch (Cnt & 0x0C00) { case 0x0000: // right mixer - { - for (u32 s = 0; s < samples; s++) - rightoutput[s] = rightbuf[s]; - } + rightoutput = right; break; case 0x0400: // channel 1 { s32 pan = Channels[1]->Pan; - for (u32 s = 0; s < samples; s++) - rightoutput[s] = ((s64)ch1buf[s] * pan) >> 10; + rightoutput = ((s64)ch1 * pan) >> 10; } break; case 0x0800: // channel 3 { s32 pan = Channels[3]->Pan; - for (u32 s = 0; s < samples; s++) - rightoutput[s] = ((s64)ch3buf[s] * pan) >> 10; + rightoutput = ((s64)ch3 * pan) >> 10; } break; case 0x0C00: // channel 1+3 { s32 pan1 = Channels[1]->Pan; s32 pan3 = Channels[3]->Pan; - for (u32 s = 0; s < samples; s++) - rightoutput[s] = (((s64)ch1buf[s] * pan1) >> 10) + (((s64)ch3buf[s] * pan3) >> 10); + rightoutput = (((s64)ch1 * pan1) >> 10) + (((s64)ch3 * pan3) 
>> 10); } break; } } - for (u32 s = 0; s < samples; s++) - { - s32 l = leftoutput[s]; - s32 r = rightoutput[s]; + leftoutput = ((s64)leftoutput * MasterVolume) >> 7; + rightoutput = ((s64)rightoutput * MasterVolume) >> 7; - l = ((s64)l * MasterVolume) >> 7; - r = ((s64)r * MasterVolume) >> 7; + leftoutput >>= 8; + if (leftoutput < -0x8000) leftoutput = -0x8000; + else if (leftoutput > 0x7FFF) leftoutput = 0x7FFF; + rightoutput >>= 8; + if (rightoutput < -0x8000) rightoutput = -0x8000; + else if (rightoutput > 0x7FFF) rightoutput = 0x7FFF; - l >>= 8; - if (l < -0x8000) l = -0x8000; - else if (l > 0x7FFF) l = 0x7FFF; - r >>= 8; - if (r < -0x8000) r = -0x8000; - else if (r > 0x7FFF) r = 0x7FFF; + // OutputBufferFrame can never get full because it's + // transfered to OutputBuffer at the end of the frame + OutputBackbuffer[OutputBackbufferWritePosition ] = leftoutput >> 1; + OutputBackbuffer[OutputBackbufferWritePosition + 1] = rightoutput >> 1; + OutputBackbufferWritePosition += 2; - OutputBuffer[OutputWriteOffset ] = l >> 1; - OutputBuffer[OutputWriteOffset + 1] = r >> 1; - OutputWriteOffset += 2; - OutputWriteOffset &= ((2*OutputBufferSize)-1); - if (OutputWriteOffset == OutputReadOffset) - { - //printf("!! 
SOUND FIFO OVERFLOW %d\n", OutputWriteOffset>>1); - // advance the read position too, to avoid losing the entire FIFO - OutputReadOffset += 2; - OutputReadOffset &= ((2*OutputBufferSize)-1); - } - } - - NDS::ScheduleEvent(NDS::Event_SPU, true, 1024*kSamplesPerRun, Mix, kSamplesPerRun); + NDS::ScheduleEvent(NDS::Event_SPU, true, 1024, Mix, 0); } +void TransferOutput() +{ + Platform::Mutex_Lock(AudioLock); + for (u32 i = 0; i < OutputBackbufferWritePosition; i += 2) + { + OutputFrontBuffer[OutputFrontBufferWritePosition ] = OutputBackbuffer[i ]; + OutputFrontBuffer[OutputFrontBufferWritePosition + 1] = OutputBackbuffer[i + 1]; + + OutputFrontBufferWritePosition += 2; + OutputFrontBufferWritePosition &= OutputBufferSize*2-1; + if (OutputFrontBufferWritePosition == OutputFrontBufferReadPosition) + { + // advance the read position too, to avoid losing the entire FIFO + OutputFrontBufferReadPosition += 2; + OutputFrontBufferReadPosition &= OutputBufferSize*2-1; + } + } + OutputBackbufferWritePosition = 0; + Platform::Mutex_Unlock(AudioLock); +} void TrimOutput() { + Platform::Mutex_Lock(AudioLock); const int halflimit = (OutputBufferSize / 2); - int readpos = OutputWriteOffset - (halflimit*2); + int readpos = OutputFrontBufferWritePosition - (halflimit*2); if (readpos < 0) readpos += (OutputBufferSize*2); - OutputReadOffset = readpos; + OutputFrontBufferReadPosition = readpos; + Platform::Mutex_Unlock(AudioLock); } void DrainOutput() { - OutputReadOffset = 0; - OutputWriteOffset = 0; + Platform::Mutex_Lock(AudioLock); + OutputFrontBufferWritePosition = 0; + OutputFrontBufferReadPosition = 0; + Platform::Mutex_Unlock(AudioLock); } void InitOutput() { - memset(OutputBuffer, 0, 2*OutputBufferSize*2); - OutputReadOffset = 0; - OutputWriteOffset = OutputBufferSize; + Platform::Mutex_Lock(AudioLock); + memset(OutputBackbuffer, 0, 2*OutputBufferSize*2); + memset(OutputFrontBuffer, 0, 2*OutputBufferSize*2); + OutputFrontBufferReadPosition = 0; + OutputFrontBufferWritePosition = 
0; + Platform::Mutex_Unlock(AudioLock); } int GetOutputSize() { + Platform::Mutex_Lock(AudioLock); + int ret; - if (OutputWriteOffset >= OutputReadOffset) - ret = OutputWriteOffset - OutputReadOffset; + if (OutputFrontBufferWritePosition >= OutputFrontBufferReadPosition) + ret = OutputFrontBufferWritePosition - OutputFrontBufferReadPosition; else - ret = (OutputBufferSize*2) - OutputReadOffset + OutputWriteOffset; + ret = (OutputBufferSize*2) - OutputFrontBufferReadPosition + OutputFrontBufferWritePosition; ret >>= 1; + + Platform::Mutex_Unlock(AudioLock); return ret; } void Sync(bool wait) { + // this function is currently not used anywhere + // depending on the usage context the thread safety measures could be made + // a lot faster + // sync to audio output in case the core is running too fast // * wait=true: wait until enough audio data has been played // * wait=false: merely skip some audio data to avoid a FIFO overflow @@ -819,32 +811,42 @@ void Sync(bool wait) } else if (GetOutputSize() > halflimit) { - int readpos = OutputWriteOffset - (halflimit*2); + Platform::Mutex_Lock(AudioLock); + + int readpos = OutputFrontBufferWritePosition - (halflimit*2); if (readpos < 0) readpos += (OutputBufferSize*2); - OutputReadOffset = readpos; + OutputFrontBufferReadPosition = readpos; + + Platform::Mutex_Unlock(AudioLock); } } int ReadOutput(s16* data, int samples) { - if (OutputReadOffset == OutputWriteOffset) + Platform::Mutex_Lock(AudioLock); + if (OutputFrontBufferReadPosition == OutputFrontBufferWritePosition) + { + Platform::Mutex_Unlock(AudioLock); return 0; + } for (int i = 0; i < samples; i++) { - *data++ = OutputBuffer[OutputReadOffset]; - *data++ = OutputBuffer[OutputReadOffset + 1]; + *data++ = OutputFrontBuffer[OutputFrontBufferReadPosition]; + *data++ = OutputFrontBuffer[OutputFrontBufferReadPosition + 1]; - //if (OutputReadOffset != OutputWriteOffset) + OutputFrontBufferReadPosition += 2; + OutputFrontBufferReadPosition &= ((2*OutputBufferSize)-1); + + if 
(OutputFrontBufferWritePosition == OutputFrontBufferReadPosition) { - OutputReadOffset += 2; - OutputReadOffset &= ((2*OutputBufferSize)-1); - } - if (OutputReadOffset == OutputWriteOffset) + Platform::Mutex_Unlock(AudioLock); return i+1; + } } + Platform::Mutex_Unlock(AudioLock); return samples; } diff --git a/src/SPU.h b/src/SPU.h index 964841d3..c6b1c7f1 100644 --- a/src/SPU.h +++ b/src/SPU.h @@ -33,7 +33,7 @@ void DoSavestate(Savestate* file); void SetBias(u16 bias); -void Mix(u32 samples); +void Mix(u32 dummy); void TrimOutput(); void DrainOutput(); @@ -41,6 +41,7 @@ void InitOutput(); int GetOutputSize(); void Sync(bool wait); int ReadOutput(s16* data, int samples); +void TransferOutput(); u8 Read8(u32 addr); u16 Read16(u32 addr); @@ -123,26 +124,24 @@ public: void NextSample_PSG(); void NextSample_Noise(); - template void Run(s32* buf, u32 samples); + template s32 Run(); - void DoRun(s32* buf, u32 samples) + s32 DoRun() { - for (u32 s = 0; s < samples; s++) - buf[s] = 0; - switch ((Cnt >> 29) & 0x3) { - case 0: Run<0>(buf, samples); break; - case 1: Run<1>(buf, samples); break; - case 2: Run<2>(buf, samples); break; + case 0: return Run<0>(); break; + case 1: return Run<1>(); break; + case 2: return Run<2>(); break; case 3: - if (Num >= 14) Run<4>(buf, samples); - else if (Num >= 8) Run<3>(buf, samples); - break; + if (Num >= 14) return Run<4>(); + else if (Num >= 8) return Run<3>(); + default: + return 0; } } - void PanOutput(s32* inbuf, u32 samples, s32* leftbuf, s32* rightbuf); + void PanOutput(s32 in, s32& left, s32& right); private: u32 (*BusRead32)(u32 addr); diff --git a/src/Savestate.h b/src/Savestate.h index c3c2e1d0..ae8fced1 100644 --- a/src/Savestate.h +++ b/src/Savestate.h @@ -22,7 +22,7 @@ #include #include "types.h" -#define SAVESTATE_MAJOR 6 +#define SAVESTATE_MAJOR 7 #define SAVESTATE_MINOR 0 class Savestate diff --git a/src/frontend/SharedConfig.h b/src/frontend/SharedConfig.h new file mode 100644 index 00000000..b4b18c5f --- /dev/null +++ 
b/src/frontend/SharedConfig.h @@ -0,0 +1,13 @@ +#ifndef SHAREDCONFIG_H +#define SHAREDCONFIG_H + +namespace Config +{ + +extern int ConsoleType; +extern int DirectBoot; +extern int SavestateRelocSRAM; + +} + +#endif \ No newline at end of file diff --git a/src/frontend/Util_ROM.cpp b/src/frontend/Util_ROM.cpp index f61c3e3e..9f22f5f5 100644 --- a/src/frontend/Util_ROM.cpp +++ b/src/frontend/Util_ROM.cpp @@ -21,7 +21,7 @@ #include "FrontendUtil.h" #include "Config.h" -#include "qt_sdl/PlatformConfig.h" // FIXME!!! +#include "SharedConfig.h" #include "Platform.h" #include "NDS.h" diff --git a/src/frontend/qt_sdl/CMakeLists.txt b/src/frontend/qt_sdl/CMakeLists.txt index f12a9ed8..865acca0 100644 --- a/src/frontend/qt_sdl/CMakeLists.txt +++ b/src/frontend/qt_sdl/CMakeLists.txt @@ -100,6 +100,19 @@ if (PORTABLE) add_definitions(-DPORTABLE) endif() +if (APPLE) + set_target_properties(melonDS PROPERTIES + MACOSX_BUNDLE true + MACOSX_BUNDLE_INFO_PLIST ${CMAKE_SOURCE_DIR}/melonDS.plist + OUTPUT_NAME melonDS + ) + + # Copy icon into the bundle + target_sources(melonDS PRIVATE "${CMAKE_SOURCE_DIR}/melonDS.icns") + set_source_files_properties("${CMAKE_SOURCE_DIR}/melonDS.icns" PROPERTIES MACOSX_PACKAGE_LOCATION Resources) + +endif() + install(FILES ../../../net.kuribo64.melonDS.desktop DESTINATION ${CMAKE_INSTALL_PREFIX}/share/applications) install(FILES ../../../icon/melon_16x16.png DESTINATION ${CMAKE_INSTALL_PREFIX}/share/icons/hicolor/16x16/apps RENAME net.kuribo64.melonDS.png) install(FILES ../../../icon/melon_32x32.png DESTINATION ${CMAKE_INSTALL_PREFIX}/share/icons/hicolor/32x32/apps RENAME net.kuribo64.melonDS.png) @@ -107,4 +120,4 @@ install(FILES ../../../icon/melon_48x48.png DESTINATION ${CMAKE_INSTALL_PREFIX}/ install(FILES ../../../icon/melon_64x64.png DESTINATION ${CMAKE_INSTALL_PREFIX}/share/icons/hicolor/64x64/apps RENAME net.kuribo64.melonDS.png) install(FILES ../../../icon/melon_128x128.png DESTINATION ${CMAKE_INSTALL_PREFIX}/share/icons/hicolor/128x128/apps 
RENAME net.kuribo64.melonDS.png) install(FILES ../../../icon/melon_256x256.png DESTINATION ${CMAKE_INSTALL_PREFIX}/share/icons/hicolor/256x256/apps RENAME net.kuribo64.melonDS.png) -install(TARGETS melonDS RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) +install(TARGETS melonDS BUNDLE DESTINATION ${CMAKE_BINARY_DIR} RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) diff --git a/src/frontend/qt_sdl/EmuSettingsDialog.cpp b/src/frontend/qt_sdl/EmuSettingsDialog.cpp index 79ce5ed0..31831826 100644 --- a/src/frontend/qt_sdl/EmuSettingsDialog.cpp +++ b/src/frontend/qt_sdl/EmuSettingsDialog.cpp @@ -65,6 +65,9 @@ EmuSettingsDialog::EmuSettingsDialog(QWidget* parent) : QDialog(parent), ui(new ui->chkJITBranchOptimisations->setChecked(Config::JIT_BranchOptimisations != 0); ui->chkJITLiteralOptimisations->setChecked(Config::JIT_LiteralOptimisations != 0); ui->chkJITFastMemory->setChecked(Config::JIT_FastMemory != 0); + #ifdef __APPLE__ + ui->chkJITFastMemory->setDisabled(true); + #endif ui->spnJITMaximumBlockSize->setValue(Config::JIT_MaxBlockSize); #else ui->chkEnableJIT->setDisabled(true); @@ -329,6 +332,8 @@ void EmuSettingsDialog::on_chkEnableJIT_toggled() bool disabled = !ui->chkEnableJIT->isChecked(); ui->chkJITBranchOptimisations->setDisabled(disabled); ui->chkJITLiteralOptimisations->setDisabled(disabled); - ui->chkJITFastMemory->setDisabled(disabled); + #ifndef __APPLE__ + ui->chkJITFastMemory->setDisabled(disabled); + #endif ui->spnJITMaximumBlockSize->setDisabled(disabled); } diff --git a/src/frontend/qt_sdl/InputConfigDialog.cpp b/src/frontend/qt_sdl/InputConfigDialog.cpp index 9f08731d..eaf1e9bd 100644 --- a/src/frontend/qt_sdl/InputConfigDialog.cpp +++ b/src/frontend/qt_sdl/InputConfigDialog.cpp @@ -216,6 +216,7 @@ KeyMapButton::KeyMapButton(int* mapping, bool hotkey) : QPushButton() setCheckable(true); setText(mappingText()); + setFocusPolicy(Qt::StrongFocus); //Fixes binding keys in macOS connect(this, &KeyMapButton::clicked, this, &KeyMapButton::onClick); } 
diff --git a/src/frontend/qt_sdl/LAN_PCap.cpp b/src/frontend/qt_sdl/LAN_PCap.cpp index ce278bcb..3381e809 100644 --- a/src/frontend/qt_sdl/LAN_PCap.cpp +++ b/src/frontend/qt_sdl/LAN_PCap.cpp @@ -33,7 +33,11 @@ #include #include #include - #include + #ifdef __linux__ + #include + #else + #include + #endif #endif @@ -66,6 +70,9 @@ const char* PCapLibNames[] = #ifdef __WIN32__ // TODO: name for npcap in non-WinPCap mode "wpcap.dll", +#elif defined(__APPLE__) + "libpcap.A.dylib", + "libpcap.dylib", #else // Linux lib names "libpcap.so.1", @@ -276,6 +283,7 @@ bool Init(bool open_adapter) struct sockaddr_in* sa = (sockaddr_in*)curaddr->ifa_addr; memcpy(adata->IP_v4, &sa->sin_addr, 4); } + #ifdef __linux__ else if (af == AF_PACKET) { struct sockaddr_ll* sa = (sockaddr_ll*)curaddr->ifa_addr; @@ -284,7 +292,16 @@ bool Init(bool open_adapter) else memcpy(adata->MAC, sa->sll_addr, 6); } - + #else + else if (af == AF_LINK) + { + struct sockaddr_dl* sa = (sockaddr_dl*)curaddr->ifa_addr; + if (sa->sdl_alen != 6) + printf("weird MAC length %d for %s\n", sa->sdl_alen, curaddr->ifa_name); + else + memcpy(adata->MAC, LLADDR(sa), 6); + } + #endif curaddr = curaddr->ifa_next; } } diff --git a/src/frontend/qt_sdl/Platform.cpp b/src/frontend/qt_sdl/Platform.cpp index a716feb6..d3480e44 100644 --- a/src/frontend/qt_sdl/Platform.cpp +++ b/src/frontend/qt_sdl/Platform.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "Platform.h" @@ -187,53 +188,77 @@ FILE* OpenLocalFile(const char* path, const char* mode) return OpenFile(fullpath.toUtf8(), mode, mode[0] != 'w'); } -void* Thread_Create(void (* func)()) +Thread* Thread_Create(void (* func)()) { QThread* t = QThread::create(func); t->start(); - return (void*) t; + return (Thread*) t; } -void Thread_Free(void* thread) +void Thread_Free(Thread* thread) { QThread* t = (QThread*) thread; t->terminate(); delete t; } -void Thread_Wait(void* thread) +void Thread_Wait(Thread* thread) { ((QThread*) thread)->wait(); } -void* 
Semaphore_Create() +Semaphore* Semaphore_Create() { - return new QSemaphore(); + return (Semaphore*)new QSemaphore(); } -void Semaphore_Free(void* sema) +void Semaphore_Free(Semaphore* sema) { delete (QSemaphore*) sema; } -void Semaphore_Reset(void* sema) +void Semaphore_Reset(Semaphore* sema) { QSemaphore* s = (QSemaphore*) sema; s->acquire(s->available()); } -void Semaphore_Wait(void* sema) +void Semaphore_Wait(Semaphore* sema) { ((QSemaphore*) sema)->acquire(); } -void Semaphore_Post(void* sema) +void Semaphore_Post(Semaphore* sema, int count) { - ((QSemaphore*) sema)->release(); + ((QSemaphore*) sema)->release(count); } +Mutex* Mutex_Create() +{ + return (Mutex*)new QMutex(); +} + +void Mutex_Free(Mutex* mutex) +{ + delete (QMutex*) mutex; +} + +void Mutex_Lock(Mutex* mutex) +{ + ((QMutex*) mutex)->lock(); +} + +void Mutex_Unlock(Mutex* mutex) +{ + ((QMutex*) mutex)->unlock(); +} + +bool Mutex_TryLock(Mutex* mutex) +{ + return ((QMutex*) mutex)->try_lock(); +} void* GL_GetProcAddress(const char* proc) { diff --git a/src/frontend/qt_sdl/PlatformConfig.cpp b/src/frontend/qt_sdl/PlatformConfig.cpp index c2d40c45..98616623 100644 --- a/src/frontend/qt_sdl/PlatformConfig.cpp +++ b/src/frontend/qt_sdl/PlatformConfig.cpp @@ -120,7 +120,7 @@ ConfigEntry PlatformConfigFile[] = {"HKJoy_Reset", 0, &HKJoyMapping[HK_Reset], -1, NULL, 0}, {"HKJoy_FastForward", 0, &HKJoyMapping[HK_FastForward], -1, NULL, 0}, {"HKJoy_FastForwardToggle", 0, &HKJoyMapping[HK_FastForwardToggle], -1, NULL, 0}, - {"HKJoy_FastForwardToggle", 0, &HKJoyMapping[HK_FullscreenToggle], -1, NULL, 0}, + {"HKJoy_FullscreenToggle", 0, &HKJoyMapping[HK_FullscreenToggle], -1, NULL, 0}, {"HKJoy_SolarSensorDecrease", 0, &HKJoyMapping[HK_SolarSensorDecrease], -1, NULL, 0}, {"HKJoy_SolarSensorIncrease", 0, &HKJoyMapping[HK_SolarSensorIncrease], -1, NULL, 0}, diff --git a/src/frontend/qt_sdl/WifiSettingsDialog.cpp b/src/frontend/qt_sdl/WifiSettingsDialog.cpp index 67297ad5..24b339da 100644 --- 
a/src/frontend/qt_sdl/WifiSettingsDialog.cpp +++ b/src/frontend/qt_sdl/WifiSettingsDialog.cpp @@ -54,7 +54,7 @@ WifiSettingsDialog::WifiSettingsDialog(QWidget* parent) : QDialog(parent), ui(ne LAN_Socket::Init(); haspcap = LAN_PCap::Init(false); - ui->cbDirectMode->setText("Direct mode (requires " PCAP_NAME " and ethernet connection)"); + ui->rbDirectMode->setText("Direct mode (requires " PCAP_NAME " and ethernet connection)"); ui->cbBindAnyAddr->setChecked(Config::SocketBindAnyAddr != 0); ui->cbRandomizeMAC->setChecked(Config::RandomizeMAC != 0); @@ -71,8 +71,9 @@ WifiSettingsDialog::WifiSettingsDialog(QWidget* parent) : QDialog(parent), ui(ne } ui->cbxDirectAdapter->setCurrentIndex(sel); - ui->cbDirectMode->setChecked(Config::DirectLAN != 0); - if (!haspcap) ui->cbDirectMode->setEnabled(false); + ui->rbDirectMode->setChecked(Config::DirectLAN != 0); + ui->rbIndirectMode->setChecked(Config::DirectLAN == 0); + if (!haspcap) ui->rbDirectMode->setEnabled(false); updateAdapterControls(); } @@ -101,7 +102,7 @@ void WifiSettingsDialog::done(int r) Config::SocketBindAnyAddr = ui->cbBindAnyAddr->isChecked() ? 1:0; Config::RandomizeMAC = randommac; - Config::DirectLAN = ui->cbDirectMode->isChecked() ? 1:0; + Config::DirectLAN = ui->rbDirectMode->isChecked() ? 
1:0; int sel = ui->cbxDirectAdapter->currentIndex(); if (sel < 0 || sel >= LAN_PCap::NumAdapters) sel = 0; @@ -125,11 +126,14 @@ void WifiSettingsDialog::done(int r) closeDlg(); } -void WifiSettingsDialog::on_cbDirectMode_stateChanged(int state) +void WifiSettingsDialog::on_rbDirectMode_clicked() +{ + updateAdapterControls(); +} +void WifiSettingsDialog::on_rbIndirectMode_clicked() { updateAdapterControls(); } - void WifiSettingsDialog::on_cbxDirectAdapter_currentIndexChanged(int sel) { if (!haspcap) return; @@ -153,7 +157,7 @@ void WifiSettingsDialog::on_cbxDirectAdapter_currentIndexChanged(int sel) void WifiSettingsDialog::updateAdapterControls() { - bool enable = haspcap && ui->cbDirectMode->isChecked(); + bool enable = haspcap && ui->rbDirectMode->isChecked(); ui->cbxDirectAdapter->setEnabled(enable); ui->lblAdapterMAC->setEnabled(enable); diff --git a/src/frontend/qt_sdl/WifiSettingsDialog.h b/src/frontend/qt_sdl/WifiSettingsDialog.h index 6c1f863d..600941fa 100644 --- a/src/frontend/qt_sdl/WifiSettingsDialog.h +++ b/src/frontend/qt_sdl/WifiSettingsDialog.h @@ -55,7 +55,8 @@ public: private slots: void done(int r); - void on_cbDirectMode_stateChanged(int state); + void on_rbDirectMode_clicked(); + void on_rbIndirectMode_clicked(); void on_cbxDirectAdapter_currentIndexChanged(int sel); private: diff --git a/src/frontend/qt_sdl/WifiSettingsDialog.ui b/src/frontend/qt_sdl/WifiSettingsDialog.ui index 6668d883..174a3dc6 100644 --- a/src/frontend/qt_sdl/WifiSettingsDialog.ui +++ b/src/frontend/qt_sdl/WifiSettingsDialog.ui @@ -6,8 +6,8 @@ 0 0 - 479 - 240 + 572 + 296 @@ -58,15 +58,81 @@ Online - - + + + + Direct Mode Settings + + + + + + Network adapter: + + + + + + + + 0 + 0 + + + + + 300 + 0 + + + + <html><head/><body><p>Selects the network adapter through which to route network traffic under direct mode.</p></body></html> + + + + + + + MAC address: + + + + + + + [PLACEHOLDER] + + + + + + + IP address: + + + + + + + [PLACEHOLDER] + + + + + + + + + + 
<html><head/><body><p>Indirect mode uses libslirp. It requires no extra setup and is easy to use.</p></body></html> + - MAC address: + Indirect Mode (uses libslirp, recommended) - - + + <html><head/><body><p>Direct mode directly routes network traffic to the host network. It is the most reliable, but requires an ethernet connection.</p><p><br/></p><p>Non-direct mode uses a layer of emulation to get around this, but is more prone to problems.</p></body></html> @@ -75,53 +141,6 @@ - - - - - 0 - 0 - - - - - 350 - 0 - - - - <html><head/><body><p>Selects the network adapter through which to route network traffic under direct mode.</p></body></html> - - - - - - - Network adapter: - - - - - - - IP address: - - - - - - - [PLACEHOLDER] - - - - - - - [PLACEHOLDER] - - - diff --git a/src/frontend/qt_sdl/main.cpp b/src/frontend/qt_sdl/main.cpp index 6c49803a..a4730627 100644 --- a/src/frontend/qt_sdl/main.cpp +++ b/src/frontend/qt_sdl/main.cpp @@ -274,6 +274,7 @@ EmuThread::EmuThread(QObject* parent) : QThread(parent) connect(this, SIGNAL(windowEmuStop()), mainWindow, SLOT(onEmuStop())); connect(this, SIGNAL(windowEmuPause()), mainWindow->actPause, SLOT(trigger())); connect(this, SIGNAL(windowEmuReset()), mainWindow->actReset, SLOT(trigger())); + connect(this, SIGNAL(windowLimitFPSChange()), mainWindow->actLimitFramerate, SLOT(trigger())); connect(this, SIGNAL(screenLayoutChange()), mainWindow->panel, SLOT(onScreenLayoutChanged())); connect(this, SIGNAL(windowFullscreenToggle()), mainWindow, SLOT(onFullscreenToggled())); @@ -363,10 +364,10 @@ void EmuThread::run() Input::Init(); u32 nframes = 0; - u32 starttick = SDL_GetTicks(); - u32 lasttick = starttick; - u32 lastmeasuretick = lasttick; - u32 fpslimitcount = 0; + double perfCountsSec = 1.0 / SDL_GetPerformanceFrequency(); + double lastTime = SDL_GetPerformanceCounter() * perfCountsSec; + double frameLimitError = 0.0; + double lastMeasureTime = lastTime; char melontitle[100]; @@ -378,7 +379,7 @@ void EmuThread::run() if 
(Input::HotkeyPressed(HK_Pause)) emit windowEmuPause(); if (Input::HotkeyPressed(HK_Reset)) emit windowEmuReset(); - + if (Input::HotkeyPressed(HK_FullscreenToggle)) emit windowFullscreenToggle(); if (GBACart::CartInserted && GBACart::HasSolarSensor) @@ -500,49 +501,43 @@ void EmuThread::run() SDL_UnlockMutex(audioSyncLock); } - float framerate = (1000.0f * nlines) / (60.0f * 263.0f); + double frametimeStep = nlines / (60.0 * 263.0); { - u32 curtick = SDL_GetTicks(); - u32 delay = curtick - lasttick; - bool limitfps = Config::LimitFPS && !fastforward; - if (limitfps) - { - float wantedtickF = starttick + (framerate * (fpslimitcount+1)); - u32 wantedtick = (u32)ceil(wantedtickF); - if (curtick < wantedtick) SDL_Delay(wantedtick - curtick); - lasttick = SDL_GetTicks(); - fpslimitcount++; - if ((abs(wantedtickF - (float)wantedtick) < 0.001312) || (fpslimitcount > 60)) - { - fpslimitcount = 0; - starttick = lasttick; - } - } - else + double practicalFramelimit = limitfps ? frametimeStep : 1.0 / 1000.0; + + double curtime = SDL_GetPerformanceCounter() * perfCountsSec; + + frameLimitError += practicalFramelimit - (curtime - lastTime); + if (frameLimitError < -practicalFramelimit) + frameLimitError = -practicalFramelimit; + if (frameLimitError > practicalFramelimit) + frameLimitError = practicalFramelimit; + + if (round(frameLimitError * 1000.0) > 0.0) { - if (delay < 1) SDL_Delay(1); - lasttick = SDL_GetTicks(); + SDL_Delay(round(frameLimitError * 1000.0)); + double timeBeforeSleep = curtime; + curtime = SDL_GetPerformanceCounter() * perfCountsSec; + frameLimitError -= curtime - timeBeforeSleep; } + + lastTime = curtime; } nframes++; if (nframes >= 30) { - u32 tick = SDL_GetTicks(); - u32 diff = tick - lastmeasuretick; - lastmeasuretick = tick; + double time = SDL_GetPerformanceCounter() * perfCountsSec; + double dt = time - lastMeasureTime; + lastMeasureTime = time; - u32 fps; - if (diff < 1) fps = 77777; - else fps = (nframes * 1000) / diff; + u32 fps = round(nframes / 
dt); nframes = 0; - float fpstarget; - if (framerate < 1) fpstarget = 999; - else fpstarget = 1000.0f/framerate; + float fpstarget = 1.0/frametimeStep; sprintf(melontitle, "[%d/%.0f] melonDS " MELONDS_VERSION, fps, fpstarget); changeWindowTitle(melontitle); @@ -552,10 +547,8 @@ void EmuThread::run() { // paused nframes = 0; - lasttick = SDL_GetTicks(); - starttick = lasttick; - lastmeasuretick = lasttick; - fpslimitcount = 0; + lastTime = SDL_GetPerformanceCounter() * perfCountsSec; + lastMeasureTime = lastTime; emit windowUpdate(); @@ -1339,6 +1332,7 @@ void MainWindow::keyPressEvent(QKeyEvent* event) { if (event->isAutoRepeat()) return; + // TODO!! REMOVE ME IN RELEASE BUILDS!! if (event->key() == Qt::Key_F11) NDS::debug(0); Input::KeyPress(event); @@ -1362,7 +1356,7 @@ void MainWindow::dragEnterEvent(QDragEnterEvent* event) QString filename = urls.at(0).toLocalFile(); QString ext = filename.right(3); - if (ext == "nds" || ext == "srl" || ext == "dsi" || (ext == "gba" && RunningSomething)) + if (ext == "nds" || ext == "srl" || ext == "dsi" || ext == "gba") event->acceptProposedAction(); } @@ -1986,9 +1980,9 @@ void MainWindow::onTitleUpdate(QString title) void MainWindow::onFullscreenToggled() { - if (!mainWindow->isFullScreen()) + if (!mainWindow->isFullScreen()) { - mainWindow->showFullScreen(); + mainWindow->showFullScreen(); mainWindow->menuBar()->hide(); } else