From 6e8bac39091d0321f43a3e7574d3359255996e9f Mon Sep 17 00:00:00 2001 From: RSDuck Date: Mon, 30 Nov 2020 16:58:52 +0100 Subject: [PATCH] Merge vram dirty tracking Squashed commit of the following: commit b463a05d4b909372f0cd1ad91caa0c77a25e5901 Author: RSDuck Date: Mon Nov 30 01:55:35 2020 +0100 minor fix commit ce73cebbdf5da243d7ebade82d8799ded9cd6b28 Author: RSDuck Date: Mon Nov 30 00:43:08 2020 +0100 fix dirty flags of BG/OBJ mappings not being reset commit fc5d73a6178e3adc444398bdd23de8314b5ca8f8 Author: RSDuck Date: Mon Nov 30 00:11:13 2020 +0100 use flat vram for gpu2d everywhere commit 34ee9fe2bf04fcfa2a5a1c8d78d70007e606f1a2 Author: RSDuck Date: Sat Nov 28 19:10:34 2020 +0100 mark VRAM dirty for display capture commit e8778fa2f429c6df0eece19d6a5ee83ae23a0cf4 Author: RSDuck Date: Sat Nov 28 18:59:31 2020 +0100 use flat VRAM for textures and texpals also skip rendering if nothing changed and a bunch of fixes commit 53f2041e2e1a28b35702a2ed51de885c36689f71 Author: RSDuck Date: Fri Nov 27 18:29:56 2020 +0100 use vram dirty tracking for extpals also preparations to take this further commit 4cdfa329e95aed26d3b21319c8fd86a04abf20f7 Author: RSDuck Date: Mon Nov 16 23:32:22 2020 +0100 VRAM dirty tracking --- src/GPU.cpp | 338 ++++++++++++++++++++++++++++--- src/GPU.h | 87 +++++++- src/GPU2D.cpp | 294 +++++++++++++-------------- src/GPU2D.h | 6 - src/GPU3D.cpp | 15 ++ src/GPU3D.h | 2 + src/GPU3D_Soft.cpp | 83 +++++--- src/NonStupidBitfield.h | 149 ++++++++++++++ src/Platform.h | 2 +- src/frontend/qt_sdl/Platform.cpp | 4 +- 10 files changed, 765 insertions(+), 215 deletions(-) create mode 100644 src/NonStupidBitfield.h diff --git a/src/GPU.cpp b/src/GPU.cpp index 7989750a..e6b24e07 100644 --- a/src/GPU.cpp +++ b/src/GPU.cpp @@ -49,8 +49,8 @@ u8 VRAM_F[ 16*1024]; u8 VRAM_G[ 16*1024]; u8 VRAM_H[ 32*1024]; u8 VRAM_I[ 16*1024]; -u8* VRAM[9] = {VRAM_A, VRAM_B, VRAM_C, VRAM_D, VRAM_E, VRAM_F, VRAM_G, VRAM_H, VRAM_I}; -u32 VRAMMask[9] = {0x1FFFF, 0x1FFFF, 0x1FFFF, 0x1FFFF, 0xFFFF, 0x3FFF, 0x3FFF, 0x7FFF, 0x3FFF}; +u8* const VRAM[9] = {VRAM_A, VRAM_B, VRAM_C, VRAM_D, VRAM_E, VRAM_F, VRAM_G, VRAM_H, VRAM_I}; +u32 const VRAMMask[9] = {0x1FFFF, 0x1FFFF, 0x1FFFF, 0x1FFFF, 0xFFFF, 0x3FFF, 0x3FFF, 0x7FFF, 0x3FFF}; u8 VRAMCNT[9]; u8 VRAMSTAT; @@ -85,6 +85,62 @@ bool Accelerated; GPU2D* GPU2D_A; GPU2D* GPU2D_B; +/* + VRAM invalidation tracking + + - we want to know when a VRAM region used for graphics changed + - for some regions unmapping is mandatory to modify them (Texture, TexPal and ExtPal) and + we don't want to completely invalidate them every time they're unmapped and remapped + + For this reason we don't track the dirtyness per mapping region, but instead per VRAM bank + with VRAMDirty. Writes to LCDC go directly into VRAMDirty, while writes via other mapping regions + like BG or OBJ are first tracked in VRAMWritten_* and need to be flushed using SyncDirtyFlags. + + This is more or less a description of VRAMTrackingSet::DeriveState + Each time before the memory is read two things could have happened + to each 16kb piece (16kb is the smallest unit in which mappings can + be made thus also the size VRAMMap_* use): + - this piece was remapped compared to last time we checked, + which means this location in memory is invalid. + - this piece wasn't remapped, which means we need to check whether + it was changed. This can be archived by checking VRAMDirty. + VRAMDirty need to be reset for the respective VRAM bank. +*/ + +VRAMTrackingSet<512*1024, 16*1024> VRAMDirty_ABG; +VRAMTrackingSet<256*1024, 16*1024> VRAMDirty_AOBJ; +VRAMTrackingSet<128*1024, 16*1024> VRAMDirty_BBG; +VRAMTrackingSet<128*1024, 16*1024> VRAMDirty_BOBJ; + +VRAMTrackingSet<32*1024, 8*1024> VRAMDirty_ABGExtPal; +VRAMTrackingSet<32*1024, 8*1024> VRAMDirty_BBGExtPal; +VRAMTrackingSet<8*1024, 8*1024> VRAMDirty_AOBJExtPal; +VRAMTrackingSet<8*1024, 8*1024> VRAMDirty_BOBJExtPal; + +VRAMTrackingSet<512*1024, 128*1024> VRAMDirty_Texture; +VRAMTrackingSet<128*1024, 16*1024> VRAMDirty_TexPal; + + +NonStupidBitField<512*1024/VRAMDirtyGranularity> VRAMWritten_ABG; +NonStupidBitField<256*1024/VRAMDirtyGranularity> VRAMWritten_AOBJ; +NonStupidBitField<128*1024/VRAMDirtyGranularity> VRAMWritten_BBG; +NonStupidBitField<128*1024/VRAMDirtyGranularity> VRAMWritten_BOBJ; +NonStupidBitField<256*1024/VRAMDirtyGranularity> VRAMWritten_ARM7; + +NonStupidBitField<128*1024/VRAMDirtyGranularity> VRAMDirty[9]; + +u8 VRAMFlat_ABG[512*1024]; +u8 VRAMFlat_BBG[128*1024]; +u8 VRAMFlat_AOBJ[256*1024]; +u8 VRAMFlat_BOBJ[128*1024]; + +u8 VRAMFlat_ABGExtPal[32*1024]; +u8 VRAMFlat_BBGExtPal[32*1024]; +u8 VRAMFlat_AOBJExtPal[8*1024]; +u8 VRAMFlat_BOBJExtPal[8*1024]; + +u8 VRAMFlat_Texture[512*1024]; +u8 VRAMFlat_TexPal[128*1024]; bool Init() { @@ -113,6 +169,30 @@ void DeInit() if (Framebuffer[1][1]) delete[] Framebuffer[1][1]; } +void ResetVRAMCache() +{ + for (int i = 0; i < 9; i++) + VRAMDirty[i] = NonStupidBitField<128*1024/VRAMDirtyGranularity>(); + + VRAMDirty_ABG.Reset(); + VRAMDirty_BBG.Reset(); + VRAMDirty_AOBJ.Reset(); + VRAMDirty_BOBJ.Reset(); + VRAMDirty_ABGExtPal.Reset(); + VRAMDirty_BBGExtPal.Reset(); + VRAMDirty_AOBJExtPal.Reset(); + VRAMDirty_BOBJExtPal.Reset(); + + memset(VRAMFlat_ABG, 0, sizeof(VRAMFlat_ABG)); + memset(VRAMFlat_BBG, 0, sizeof(VRAMFlat_BBG)); + memset(VRAMFlat_AOBJ, 0, sizeof(VRAMFlat_AOBJ)); + memset(VRAMFlat_BOBJ, 0, sizeof(VRAMFlat_BOBJ)); + memset(VRAMFlat_ABGExtPal, 0, sizeof(VRAMFlat_ABGExtPal)); + memset(VRAMFlat_BBGExtPal, 0, sizeof(VRAMFlat_BBGExtPal)); + memset(VRAMFlat_AOBJExtPal, 0, sizeof(VRAMFlat_AOBJExtPal)); + memset(VRAMFlat_BOBJExtPal, 0, sizeof(VRAMFlat_BOBJExtPal)); +} + void Reset() { VCount = 0; @@ -186,6 +266,8 @@ void Reset() GPU2D_B->SetFramebuffer(Framebuffer[backbuf][0]); ResetRenderer(); + + ResetVRAMCache(); } void Stop() @@ -261,6 +343,8 @@ void DoSavestate(Savestate* file) GPU2D_A->DoSavestate(file); GPU2D_B->DoSavestate(file); GPU3D::DoSavestate(file); + + ResetVRAMCache(); } void AssignFramebuffers() @@ -411,18 +495,8 @@ void SetRenderSettings(int renderer, RenderSettings& settings) u8* GetUniqueBankPtr(u32 mask, u32 offset) { - if (!mask) return NULL; - - int num = 0; - if (!(mask & 0xFF)) { mask >>= 8; num += 8; } - else - { - if (!(mask & 0xF)) { mask >>= 4; num += 4; } - if (!(mask & 0x3)) { mask >>= 2; num += 2; } - if (!(mask & 0x1)) { mask >>= 1; num += 1; } - } - if (mask != 1) return NULL; - + if (!mask || (mask & (mask - 1)) != 0) return NULL; + int num = __builtin_ctz(mask); return &VRAM[num][offset & VRAMMask[num]]; } @@ -606,8 +680,6 @@ void MapVRAM_E(u32 bank, u8 cnt) case 4: // ABG ext palette UNMAP_RANGE(ABGExtPal, 0, 4); - GPU2D_A->BGExtPalDirty(0); - GPU2D_A->BGExtPalDirty(2); break; } } @@ -634,8 +706,6 @@ void MapVRAM_E(u32 bank, u8 cnt) case 4: // ABG ext palette MAP_RANGE(ABGExtPal, 0, 4); - GPU2D_A->BGExtPalDirty(0); - GPU2D_A->BGExtPalDirty(2); break; } } @@ -687,12 +757,10 @@ void MapVRAM_FG(u32 bank, u8 cnt) case 4: // ABG ext palette VRAMMap_ABGExtPal[((oldofs & 0x1) << 1)] &= ~bankmask; VRAMMap_ABGExtPal[((oldofs & 0x1) << 1) + 1] &= ~bankmask; - GPU2D_A->BGExtPalDirty((oldofs & 0x1) << 1); break; case 5: // AOBJ ext palette VRAMMap_AOBJExtPal &= ~bankmask; - GPU2D_A->OBJExtPalDirty(); break; } } @@ -732,12 +800,10 @@ void MapVRAM_FG(u32 bank, u8 cnt) case 4: // ABG ext palette VRAMMap_ABGExtPal[((ofs & 0x1) << 1)] |= bankmask; VRAMMap_ABGExtPal[((ofs & 0x1) << 1) + 1] |= bankmask; - GPU2D_A->BGExtPalDirty((ofs & 0x1) << 1); break; case 5: // AOBJ ext palette VRAMMap_AOBJExtPal |= bankmask; - GPU2D_A->OBJExtPalDirty(); break; } } @@ -773,8 +839,6 @@ void MapVRAM_H(u32 bank, u8 cnt) case 2: // BBG ext palette UNMAP_RANGE(BBGExtPal, 0, 4); - GPU2D_B->BGExtPalDirty(0); - GPU2D_B->BGExtPalDirty(2); break; } } @@ -800,8 +864,6 @@ void MapVRAM_H(u32 bank, u8 cnt) case 2: // BBG ext palette MAP_RANGE(BBGExtPal, 0, 4); - GPU2D_B->BGExtPalDirty(0); - GPU2D_B->BGExtPalDirty(2); break; } } @@ -841,7 +903,6 @@ void MapVRAM_I(u32 bank, u8 cnt) case 3: // BOBJ ext palette VRAMMap_BOBJExtPal &= ~bankmask; - GPU2D_B->OBJExtPalDirty(); break; } } @@ -871,7 +932,6 @@ void MapVRAM_I(u32 bank, u8 cnt) case 3: // BOBJ ext palette VRAMMap_BOBJExtPal |= bankmask; - GPU2D_B->OBJExtPalDirty(); break; } } @@ -937,6 +997,8 @@ void StartHBlank(u32 line) DispStat[0] |= (1<<1); DispStat[1] |= (1<<1); + SyncDirtyFlags(); + if (VCount < 192) { // draw @@ -1096,4 +1158,224 @@ void SetVCount(u16 val) NextVCount = val; } +template +NonStupidBitField VRAMTrackingSet::DeriveState(u32* currentMappings) +{ + NonStupidBitField result; + u16 banksToBeZeroed = 0; + for (u32 i = 0; i < Size / MappingGranularity; i++) + { + if (currentMappings[i] != Mapping[i]) + { + result |= NonStupidBitField(i*VRAMBitsPerMapping, VRAMBitsPerMapping); + banksToBeZeroed |= currentMappings[i]; + Mapping[i] = currentMappings[i]; + } + else + { + u32 mapping = Mapping[i]; + + banksToBeZeroed |= mapping; + + while (mapping != 0) + { + u32 num = __builtin_ctz(mapping); + mapping &= ~(1 << num); + + // hack for **speed** + // this could probably be done less ugly but then we would rely + // on the compiler for vectorisation + static_assert(VRAMDirtyGranularity == 512); + if (MappingGranularity == 16*1024) + { + u32 dirty = ((u32*)VRAMDirty[num].Data)[i & (VRAMMask[num] >> 14)]; + ((u32*)result.Data)[i] |= dirty; + } + else if (MappingGranularity == 8*1024) + { + u16 dirty = ((u16*)VRAMDirty[num].Data)[i & (VRAMMask[num] >> 13)]; + ((u16*)result.Data)[i] |= dirty; + } + else if (MappingGranularity == 128*1024) + { + ((u64*)result.Data)[i * 4 + 0] |= ((u64*)VRAMDirty[num].Data)[0]; + ((u64*)result.Data)[i * 4 + 1] |= ((u64*)VRAMDirty[num].Data)[1]; + ((u64*)result.Data)[i * 4 + 2] |= ((u64*)VRAMDirty[num].Data)[2]; + ((u64*)result.Data)[i * 4 + 3] |= ((u64*)VRAMDirty[num].Data)[3]; + } + else + { + // welp + abort(); + } + } + } + } + + while (banksToBeZeroed != 0) + { + u32 num = __builtin_ctz(banksToBeZeroed); + banksToBeZeroed &= ~(1 << num); + memset(VRAMDirty[num].Data, 0, sizeof(VRAMDirty[num].Data)); + } + + return result; } + +template NonStupidBitField<32*1024/VRAMDirtyGranularity> VRAMTrackingSet<32*1024, 8*1024>::DeriveState(u32*); +template NonStupidBitField<8*1024/VRAMDirtyGranularity> VRAMTrackingSet<8*1024, 8*1024>::DeriveState(u32*); +template NonStupidBitField<512*1024/VRAMDirtyGranularity> VRAMTrackingSet<512*1024, 128*1024>::DeriveState(u32*); +template NonStupidBitField<128*1024/VRAMDirtyGranularity> VRAMTrackingSet<128*1024, 16*1024>::DeriveState(u32*); +template NonStupidBitField<256*1024/VRAMDirtyGranularity> VRAMTrackingSet<256*1024, 16*1024>::DeriveState(u32*); +template NonStupidBitField<512*1024/VRAMDirtyGranularity> VRAMTrackingSet<512*1024, 16*1024>::DeriveState(u32*); + +template +void SyncDirtyFlags(u32* mappings, NonStupidBitField& writtenFlags) +{ + const u32 VRAMWrittenBitsPer16KB = 16*1024/VRAMDirtyGranularity; + + for (typename NonStupidBitField::Iterator it = writtenFlags.Begin(); it != writtenFlags.End(); it++) + { + u32 mapping = mappings[*it / VRAMWrittenBitsPer16KB]; + while (mapping != 0) + { + u32 num = __builtin_ctz(mapping); + + VRAMDirty[num][*it & (VRAMMask[num] / VRAMDirtyGranularity)] = true; + + mapping &= ~(1 << num); + } + } + memset(writtenFlags.Data, 0, sizeof(writtenFlags.Data)); +} + +void SyncDirtyFlags() +{ + SyncDirtyFlags(VRAMMap_ABG, VRAMWritten_ABG); + SyncDirtyFlags(VRAMMap_AOBJ, VRAMWritten_AOBJ); + SyncDirtyFlags(VRAMMap_BBG, VRAMWritten_BBG); + SyncDirtyFlags(VRAMMap_BOBJ, VRAMWritten_BOBJ); + SyncDirtyFlags(VRAMMap_ARM7, VRAMWritten_ARM7); +} + +template +inline bool CopyLinearVRAM(u8* flat, u32* mappings, NonStupidBitField& dirty, u64 (*slowAccess)(u32 addr)) +{ + const u32 VRAMBitsPerMapping = MappingGranularity / VRAMDirtyGranularity; + + bool change = false; + + typename NonStupidBitField::Iterator it = dirty.Begin(); + while (it != dirty.End()) + { + u32 offset = *it * VRAMDirtyGranularity; + u8* dst = flat + offset; + u8* fastAccess = GetUniqueBankPtr(mappings[*it / VRAMBitsPerMapping], offset); + if (fastAccess) + { + memcpy(dst, fastAccess, VRAMDirtyGranularity); + } + else + { + for (u32 i = 0; i < VRAMDirtyGranularity; i += 8) + *(u64*)&dst[i] = slowAccess(offset + i); + } + change = true; + it++; + } + return change; +} + +bool MakeVRAMFlat_TextureCoherent(NonStupidBitField<512*1024/VRAMDirtyGranularity>& dirty) +{ + return CopyLinearVRAM<128*1024>(VRAMFlat_Texture, VRAMMap_Texture, dirty, ReadVRAM_Texture); +} +bool MakeVRAMFlat_TexPalCoherent(NonStupidBitField<128*1024/VRAMDirtyGranularity>& dirty) +{ + return CopyLinearVRAM<16*1024>(VRAMFlat_TexPal, VRAMMap_TexPal, dirty, ReadVRAM_TexPal); +} + +bool MakeVRAMFlat_ABGCoherent(NonStupidBitField<512*1024/VRAMDirtyGranularity>& dirty) +{ + return CopyLinearVRAM<16*1024>(VRAMFlat_ABG, VRAMMap_ABG, dirty, ReadVRAM_ABG); +} +bool MakeVRAMFlat_BBGCoherent(NonStupidBitField<128*1024/VRAMDirtyGranularity>& dirty) +{ + return CopyLinearVRAM<16*1024>(VRAMFlat_BBG, VRAMMap_BBG, dirty, ReadVRAM_BBG); +} + +bool MakeVRAMFlat_AOBJCoherent(NonStupidBitField<256*1024/VRAMDirtyGranularity>& dirty) +{ + return CopyLinearVRAM<16*1024>(VRAMFlat_AOBJ, VRAMMap_AOBJ, dirty, ReadVRAM_AOBJ); +} +bool MakeVRAMFlat_BOBJCoherent(NonStupidBitField<128*1024/VRAMDirtyGranularity>& dirty) +{ + return CopyLinearVRAM<16*1024>(VRAMFlat_BOBJ, VRAMMap_BOBJ, dirty, ReadVRAM_BOBJ); +} + +template +T ReadVRAM_ABGExtPal(u32 addr) +{ + u32 mask = VRAMMap_ABGExtPal[(addr >> 13) & 0x3]; + + T ret = 0; + if (mask & (1<<4)) ret |= *(T*)&VRAM_E[addr & 0x7FFF]; + if (mask & (1<<5)) ret |= *(T*)&VRAM_F[addr & 0x3FFF]; + if (mask & (1<<6)) ret |= *(T*)&VRAM_G[addr & 0x3FFF]; + + return ret; +} + +template +T ReadVRAM_BBGExtPal(u32 addr) +{ + u32 mask = VRAMMap_BBGExtPal[(addr >> 13) & 0x3]; + + T ret = 0; + if (mask & (1<<7)) ret |= *(T*)&VRAM_H[addr & 0x7FFF]; + + return ret; +} + +template +T ReadVRAM_AOBJExtPal(u32 addr) +{ + u32 mask = VRAMMap_AOBJExtPal; + + T ret = 0; + if (mask & (1<<4)) ret |= *(T*)&VRAM_F[addr & 0x1FFF]; + if (mask & (1<<5)) ret |= *(T*)&VRAM_G[addr & 0x1FFF]; + + return ret; +} + +template +T ReadVRAM_BOBJExtPal(u32 addr) +{ + u32 mask = VRAMMap_BOBJExtPal; + + T ret = 0; + if (mask & (1<<8)) ret |= *(T*)&VRAM_I[addr & 0x1FFF]; + + return ret; +} + +bool MakeVRAMFlat_ABGExtPalCoherent(NonStupidBitField<32*1024/VRAMDirtyGranularity>& dirty) +{ + return CopyLinearVRAM<8*1024>(VRAMFlat_ABGExtPal, VRAMMap_ABGExtPal, dirty, ReadVRAM_ABGExtPal); +} +bool MakeVRAMFlat_BBGExtPalCoherent(NonStupidBitField<32*1024/VRAMDirtyGranularity>& dirty) +{ + return CopyLinearVRAM<8*1024>(VRAMFlat_BBGExtPal, VRAMMap_BBGExtPal, dirty, ReadVRAM_BBGExtPal); +} + +bool MakeVRAMFlat_AOBJExtPalCoherent(NonStupidBitField<8*1024/VRAMDirtyGranularity>& dirty) +{ + return CopyLinearVRAM<8*1024>(VRAMFlat_AOBJExtPal, &VRAMMap_AOBJExtPal, dirty, ReadVRAM_AOBJExtPal); +} +bool MakeVRAMFlat_BOBJExtPalCoherent(NonStupidBitField<8*1024/VRAMDirtyGranularity>& dirty) +{ + return CopyLinearVRAM<8*1024>(VRAMFlat_BOBJExtPal, &VRAMMap_BOBJExtPal, dirty, ReadVRAM_BOBJExtPal); +} + +} \ No newline at end of file diff --git a/src/GPU.h b/src/GPU.h index 1564ef7f..2f71da69 100644 --- a/src/GPU.h +++ b/src/GPU.h @@ -20,6 +20,7 @@ #define GPU_H #include "GPU2D.h" +#include "NonStupidBitfield.h" namespace GPU { @@ -45,7 +46,7 @@ extern u8 VRAM_G[ 16*1024]; extern u8 VRAM_H[ 32*1024]; extern u8 VRAM_I[ 16*1024]; -extern u8* VRAM[9]; +extern u8* const VRAM[9]; extern u32 VRAMMap_LCDC; extern u32 VRAMMap_ABG[0x20]; @@ -73,6 +74,73 @@ extern GPU2D* GPU2D_B; extern int Renderer; +const u32 VRAMDirtyGranularity = 512; + +extern NonStupidBitField<512*1024/VRAMDirtyGranularity> VRAMWritten_ABG; +extern NonStupidBitField<256*1024/VRAMDirtyGranularity> VRAMWritten_AOBJ; +extern NonStupidBitField<128*1024/VRAMDirtyGranularity> VRAMWritten_BBG; +extern NonStupidBitField<128*1024/VRAMDirtyGranularity> VRAMWritten_BOBJ; +extern NonStupidBitField<256*1024/VRAMDirtyGranularity> VRAMWritten_ARM7; + +extern NonStupidBitField<128*1024/VRAMDirtyGranularity> VRAMDirty[9]; + +template +struct VRAMTrackingSet +{ + u16 Mapping[Size / MappingGranularity]; + + const u32 VRAMBitsPerMapping = MappingGranularity / VRAMDirtyGranularity; + + void Reset() + { + memset(Mapping, 0, sizeof(Mapping)); + } + NonStupidBitField DeriveState(u32* currentMappings); +}; + +extern VRAMTrackingSet<512*1024, 16*1024> VRAMDirty_ABG; +extern VRAMTrackingSet<256*1024, 16*1024> VRAMDirty_AOBJ; +extern VRAMTrackingSet<128*1024, 16*1024> VRAMDirty_BBG; +extern VRAMTrackingSet<128*1024, 16*1024> VRAMDirty_BOBJ; + +extern VRAMTrackingSet<32*1024, 8*1024> VRAMDirty_ABGExtPal; +extern VRAMTrackingSet<32*1024, 8*1024> VRAMDirty_BBGExtPal; +extern VRAMTrackingSet<8*1024, 8*1024> VRAMDirty_AOBJExtPal; +extern VRAMTrackingSet<8*1024, 8*1024> VRAMDirty_BOBJExtPal; + +extern VRAMTrackingSet<512*1024, 128*1024> VRAMDirty_Texture; +extern VRAMTrackingSet<128*1024, 16*1024> VRAMDirty_TexPal; + +extern u8 VRAMFlat_ABG[512*1024]; +extern u8 VRAMFlat_BBG[128*1024]; +extern u8 VRAMFlat_AOBJ[256*1024]; +extern u8 VRAMFlat_BOBJ[128*1024]; + +extern u8 VRAMFlat_ABGExtPal[32*1024]; +extern u8 VRAMFlat_BBGExtPal[32*1024]; + +extern u8 VRAMFlat_AOBJExtPal[8*1024]; +extern u8 VRAMFlat_BOBJExtPal[8*1024]; + +extern u8 VRAMFlat_Texture[512*1024]; +extern u8 VRAMFlat_TexPal[128*1024]; + +bool MakeVRAMFlat_ABGCoherent(NonStupidBitField<512*1024/VRAMDirtyGranularity>& dirty); +bool MakeVRAMFlat_BBGCoherent(NonStupidBitField<128*1024/VRAMDirtyGranularity>& dirty); + +bool MakeVRAMFlat_AOBJCoherent(NonStupidBitField<256*1024/VRAMDirtyGranularity>& dirty); +bool MakeVRAMFlat_BOBJCoherent(NonStupidBitField<128*1024/VRAMDirtyGranularity>& dirty); + +bool MakeVRAMFlat_ABGExtPalCoherent(NonStupidBitField<32*1024/VRAMDirtyGranularity>& dirty); +bool MakeVRAMFlat_BBGExtPalCoherent(NonStupidBitField<32*1024/VRAMDirtyGranularity>& dirty); + +bool MakeVRAMFlat_AOBJExtPalCoherent(NonStupidBitField<8*1024/VRAMDirtyGranularity>& dirty); +bool MakeVRAMFlat_BOBJExtPalCoherent(NonStupidBitField<8*1024/VRAMDirtyGranularity>& dirty); + +bool MakeVRAMFlat_TextureCoherent(NonStupidBitField<512*1024/VRAMDirtyGranularity>& dirty); +bool MakeVRAMFlat_TexPalCoherent(NonStupidBitField<128*1024/VRAMDirtyGranularity>& dirty); + +void SyncDirtyFlags(); typedef struct { @@ -233,7 +301,11 @@ void WriteVRAM_LCDC(u32 addr, T val) default: return; } - if (VRAMMap_LCDC & (1<> 14) & 0x1F]; + VRAMWritten_ABG[(addr & 0x7FFFF) / VRAMDirtyGranularity] = true; + if (mask & (1<<0)) *(T*)&VRAM_A[addr & 0x1FFFF] = val; if (mask & (1<<1)) *(T*)&VRAM_B[addr & 0x1FFFF] = val; if (mask & (1<<2)) *(T*)&VRAM_C[addr & 0x1FFFF] = val; @@ -295,6 +369,8 @@ void WriteVRAM_AOBJ(u32 addr, T val) { u32 mask = VRAMMap_AOBJ[(addr >> 14) & 0xF]; + VRAMWritten_AOBJ[(addr & 0x3FFFF) / VRAMDirtyGranularity] = true; + if (mask & (1<<0)) *(T*)&VRAM_A[addr & 0x1FFFF] = val; if (mask & (1<<1)) *(T*)&VRAM_B[addr & 0x1FFFF] = val; if (mask & (1<<4)) *(T*)&VRAM_E[addr & 0xFFFF] = val; @@ -324,6 +400,8 @@ void WriteVRAM_BBG(u32 addr, T val) { u32 mask = VRAMMap_BBG[(addr >> 14) & 0x7]; + VRAMWritten_BBG[(addr & 0x1FFFF) / VRAMDirtyGranularity] = true; + if (mask & (1<<2)) *(T*)&VRAM_C[addr & 0x1FFFF] = val; if (mask & (1<<7)) *(T*)&VRAM_H[addr & 0x7FFF] = val; if (mask & (1<<8)) *(T*)&VRAM_I[addr & 0x3FFF] = val; @@ -350,11 +428,12 @@ void WriteVRAM_BOBJ(u32 addr, T val) { u32 mask = VRAMMap_BOBJ[(addr >> 14) & 0x7]; + VRAMWritten_BOBJ[(addr & 0x1FFFF) / VRAMDirtyGranularity] = true; + if (mask & (1<<3)) *(T*)&VRAM_D[addr & 0x1FFFF] = val; if (mask & (1<<8)) *(T*)&VRAM_I[addr & 0x3FFF] = val; } - template T ReadVRAM_ARM7(u32 addr) { @@ -372,6 +451,8 @@ void WriteVRAM_ARM7(u32 addr, T val) { u32 mask = VRAMMap_ARM7[(addr >> 17) & 0x1]; + VRAMWritten_ARM7[(addr & 0x1FFFF) / VRAMDirtyGranularity] = true; + if (mask & (1<<2)) *(T*)&VRAM_C[addr & 0x1FFFF] = val; if (mask & (1<<3)) *(T*)&VRAM_D[addr & 0x1FFFF] = val; } diff --git a/src/GPU2D.cpp b/src/GPU2D.cpp index 27aa6083..07b5b21a 100644 --- a/src/GPU2D.cpp +++ b/src/GPU2D.cpp @@ -148,12 +148,6 @@ void GPU2D::Reset() CaptureCnt = 0; MasterBrightness = 0; - - BGExtPalStatus[0] = 0; - BGExtPalStatus[1] = 0; - BGExtPalStatus[2] = 0; - BGExtPalStatus[3] = 0; - OBJExtPalStatus = 0; } void GPU2D::DoSavestate(Savestate* file) @@ -208,13 +202,6 @@ void GPU2D::DoSavestate(Savestate* file) if (!file->Saving) { - // refresh those - BGExtPalStatus[0] = 0; - BGExtPalStatus[1] = 0; - BGExtPalStatus[2] = 0; - BGExtPalStatus[3] = 0; - OBJExtPalStatus = 0; - CurBGXMosaicTable = MosaicTable[BGMosaicSize[0]]; CurOBJXMosaicTable = MosaicTable[OBJMosaicSize[0]]; } @@ -758,6 +745,25 @@ void GPU2D::DrawScanline(u32 line) int n3dline = line; line = GPU::VCount; + if (Num == 0) + { + auto bgDirty = GPU::VRAMDirty_ABG.DeriveState(GPU::VRAMMap_ABG); + GPU::MakeVRAMFlat_ABGCoherent(bgDirty); + auto bgExtPalDirty = GPU::VRAMDirty_ABGExtPal.DeriveState(GPU::VRAMMap_ABGExtPal); + GPU::MakeVRAMFlat_ABGExtPalCoherent(bgExtPalDirty); + auto objExtPalDirty = GPU::VRAMDirty_AOBJExtPal.DeriveState(&GPU::VRAMMap_AOBJExtPal); + GPU::MakeVRAMFlat_AOBJExtPalCoherent(objExtPalDirty); + } + else + { + auto bgDirty = GPU::VRAMDirty_BBG.DeriveState(GPU::VRAMMap_BBG); + GPU::MakeVRAMFlat_BBGCoherent(bgDirty); + auto bgExtPalDirty = GPU::VRAMDirty_BBGExtPal.DeriveState(GPU::VRAMMap_BBGExtPal); + GPU::MakeVRAMFlat_BBGExtPalCoherent(bgExtPalDirty); + auto objExtPalDirty = GPU::VRAMDirty_BOBJExtPal.DeriveState(&GPU::VRAMMap_BOBJExtPal); + GPU::MakeVRAMFlat_BOBJExtPalCoherent(objExtPalDirty); + } + bool forceblank = false; // scanlines that end up outside of the GPU drawing range @@ -970,6 +976,9 @@ void GPU2D::DoCapture(u32 line, u32 width) u16* dst = (u16*)GPU::VRAM[dstvram]; u32 dstaddr = (((CaptureCnt >> 18) & 0x3) << 14) + (line * width); + static_assert(GPU::VRAMDirtyGranularity == 512); + GPU::VRAMDirty[dstvram][(dstaddr & 0x1FFFF) / GPU::VRAMDirtyGranularity] = true; + // TODO: handle 3D in accelerated mode!! u32* srcA; @@ -1188,85 +1197,20 @@ void GPU2D::SampleFIFO(u32 offset, u32 num) } } - -void GPU2D::BGExtPalDirty(u32 base) -{ - BGExtPalStatus[base] = 0; - BGExtPalStatus[base+1] = 0; -} - -void GPU2D::OBJExtPalDirty() -{ - OBJExtPalStatus = 0; -} - - u16* GPU2D::GetBGExtPal(u32 slot, u32 pal) { - u16* dst = &BGExtPalCache[slot][pal << 8]; - - if (!(BGExtPalStatus[slot] & (1< void GPU2D::DrawBG_Text(u32 line, u32 bgnum) { @@ -1720,17 +1678,20 @@ void GPU2D::DrawBG_Text(u32 line, u32 bgnum) extpal = (DispCnt & 0x40000000); if (extpal) extpalslot = ((bgnum<2) && (bgcnt&0x2000)) ? (2+bgnum) : bgnum; + u8* bgvram; + u32 bgvrammask; + GetBGVRAM(Num, bgvram, bgvrammask); if (Num) { - tilesetaddr = 0x06200000 + ((bgcnt & 0x003C) << 12); - tilemapaddr = 0x06200000 + ((bgcnt & 0x1F00) << 3); + tilesetaddr = ((bgcnt & 0x003C) << 12); + tilemapaddr = ((bgcnt & 0x1F00) << 3); pal = (u16*)&GPU::Palette[0x400]; } else { - tilesetaddr = 0x06000000 + ((DispCnt & 0x07000000) >> 8) + ((bgcnt & 0x003C) << 12); - tilemapaddr = 0x06000000 + ((DispCnt & 0x38000000) >> 11) + ((bgcnt & 0x1F00) << 3); + tilesetaddr = ((DispCnt & 0x07000000) >> 8) + ((bgcnt & 0x003C) << 12); + tilemapaddr = ((DispCnt & 0x38000000) >> 11) + ((bgcnt & 0x1F00) << 3); pal = (u16*)&GPU::Palette[0]; } @@ -1758,7 +1719,7 @@ void GPU2D::DrawBG_Text(u32 line, u32 bgnum) // preload shit as needed if ((xoff & 0x7) || mosaic) { - curtile = GPU::ReadVRAM_BG(tilemapaddr + ((xoff & 0xF8) >> 2) + ((xoff & widexmask) << 3)); + curtile = *(u16*)&bgvram[(tilemapaddr + ((xoff & 0xF8) >> 2) + ((xoff & widexmask) << 3)) & bgvrammask]; if (extpal) curpal = GetBGExtPal(extpalslot, curtile>>12); else curpal = pal; @@ -1779,7 +1740,7 @@ void GPU2D::DrawBG_Text(u32 line, u32 bgnum) (mosaic && ((xpos >> 3) != (lastxpos >> 3)))) { // load a new tile - curtile = GPU::ReadVRAM_BG(tilemapaddr + ((xpos & 0xF8) >> 2) + ((xpos & widexmask) << 3)); + curtile = *(u16*)&bgvram[(tilemapaddr + ((xpos & 0xF8) >> 2) + ((xpos & widexmask) << 3)) & bgvrammask]; if (extpal) curpal = GetBGExtPal(extpalslot, curtile>>12); else curpal = pal; @@ -1794,7 +1755,7 @@ void GPU2D::DrawBG_Text(u32 line, u32 bgnum) if (WindowMask[i] & (1<(pixelsaddr + tilexoff); + color = bgvram[(pixelsaddr + tilexoff) & bgvrammask]; if (color) drawPixel(&BGOBJLine[i], curpal[color], 0x01000000<(tilemapaddr + ((xoff & 0xF8) >> 2) + ((xoff & widexmask) << 3)); + curtile = *(u16*)&bgvram[((tilemapaddr + ((xoff & 0xF8) >> 2) + ((xoff & widexmask) << 3))) & bgvrammask]; curpal = pal + ((curtile & 0xF000) >> 8); pixelsaddr = tilesetaddr + ((curtile & 0x03FF) << 5) + (((curtile & 0x0800) ? (7-(yoff&0x7)) : (yoff&0x7)) << 2); @@ -1828,7 +1789,7 @@ void GPU2D::DrawBG_Text(u32 line, u32 bgnum) (mosaic && ((xpos >> 3) != (lastxpos >> 3)))) { // load a new tile - curtile = GPU::ReadVRAM_BG(tilemapaddr + ((xpos & 0xF8) >> 2) + ((xpos & widexmask) << 3)); + curtile = *(u16*)&bgvram[(tilemapaddr + ((xpos & 0xF8) >> 2) + ((xpos & widexmask) << 3)) & bgvrammask]; curpal = pal + ((curtile & 0xF000) >> 8); pixelsaddr = tilesetaddr + ((curtile & 0x03FF) << 5) + (((curtile & 0x0800) ? (7-(yoff&0x7)) : (yoff&0x7)) << 2); @@ -1842,11 +1803,11 @@ void GPU2D::DrawBG_Text(u32 line, u32 bgnum) u32 tilexoff = (curtile & 0x0400) ? (7-(xpos&0x7)) : (xpos&0x7); if (tilexoff & 0x1) { - color = GPU::ReadVRAM_BG(pixelsaddr + (tilexoff >> 1)) >> 4; + color = bgvram[(pixelsaddr + (tilexoff >> 1)) & bgvrammask] >> 4; } else { - color = GPU::ReadVRAM_BG(pixelsaddr + (tilexoff >> 1)) & 0x0F; + color = bgvram[(pixelsaddr + (tilexoff >> 1)) & bgvrammask] & 0x0F; } if (color) @@ -1895,17 +1856,20 @@ void GPU2D::DrawBG_Affine(u32 line, u32 bgnum) rotY -= (BGMosaicY * rotD); } + u8* bgvram; + u32 bgvrammask; + if (Num) { - tilesetaddr = 0x06200000 + ((bgcnt & 0x003C) << 12); - tilemapaddr = 0x06200000 + ((bgcnt & 0x1F00) << 3); + tilesetaddr = ((bgcnt & 0x003C) << 12); + tilemapaddr = ((bgcnt & 0x1F00) << 3); pal = (u16*)&GPU::Palette[0x400]; } else { - tilesetaddr = 0x06000000 + ((DispCnt & 0x07000000) >> 8) + ((bgcnt & 0x003C) << 12); - tilemapaddr = 0x06000000 + ((DispCnt & 0x38000000) >> 11) + ((bgcnt & 0x1F00) << 3); + tilesetaddr = ((DispCnt & 0x07000000) >> 8) + ((bgcnt & 0x003C) << 12); + tilemapaddr = ((DispCnt & 0x38000000) >> 11) + ((bgcnt & 0x1F00) << 3); pal = (u16*)&GPU::Palette[0]; } @@ -1934,13 +1898,13 @@ void GPU2D::DrawBG_Affine(u32 line, u32 bgnum) if ((!((finalX|finalY) & overflowmask))) { - curtile = GPU::ReadVRAM_BG(tilemapaddr + ((((finalY & coordmask) >> 11) << yshift) + ((finalX & coordmask) >> 11))); + curtile = bgvram[(tilemapaddr + ((((finalY & coordmask) >> 11) << yshift) + ((finalX & coordmask) >> 11))) & bgvrammask]; // draw pixel u32 tilexoff = (finalX >> 8) & 0x7; u32 tileyoff = (finalY >> 8) & 0x7; - color = GPU::ReadVRAM_BG(tilesetaddr + (curtile << 6) + (tileyoff << 3) + tilexoff); + color = bgvram[(tilesetaddr + (curtile << 6) + (tileyoff << 3) + tilexoff) & bgvrammask]; if (color) drawPixel(&BGOBJLine[i], pal[color], 0x01000000<(tilemapaddr + (((((finalY & ymask) >> 8) << yshift) + ((finalX & xmask) >> 8)) << 1)); + color = *(u16*)&bgvram[(tilemapaddr + (((((finalY & ymask) >> 8) << yshift) + ((finalX & xmask) >> 8)) << 1)) & bgvrammask]; if (color & 0x8000) drawPixel(&BGOBJLine[i], color, 0x01000000<(tilemapaddr + (((finalY & ymask) >> 8) << yshift) + ((finalX & xmask) >> 8)); + color = bgvram[(tilemapaddr + (((finalY & ymask) >> 8) << yshift) + ((finalX & xmask) >> 8)) & bgvrammask]; if (color) drawPixel(&BGOBJLine[i], pal[color], 0x01000000<> 8) + ((bgcnt & 0x003C) << 12); - tilemapaddr = 0x06000000 + ((DispCnt & 0x38000000) >> 11) + ((bgcnt & 0x1F00) << 3); + tilesetaddr = ((DispCnt & 0x07000000) >> 8) + ((bgcnt & 0x003C) << 12); + tilemapaddr = ((DispCnt & 0x38000000) >> 11) + ((bgcnt & 0x1F00) << 3); pal = (u16*)&GPU::Palette[0]; } @@ -2144,7 +2112,7 @@ void GPU2D::DrawBG_Extended(u32 line, u32 bgnum) if ((!((finalX|finalY) & overflowmask))) { - curtile = GPU::ReadVRAM_BG(tilemapaddr + (((((finalY & coordmask) >> 11) << yshift) + ((finalX & coordmask) >> 11)) << 1)); + curtile = *(u16*)&bgvram[(tilemapaddr + (((((finalY & coordmask) >> 11) << yshift) + ((finalX & coordmask) >> 11)) << 1)) & bgvrammask]; if (extpal) curpal = GetBGExtPal(bgnum, curtile>>12); else curpal = pal; @@ -2156,7 +2124,7 @@ void GPU2D::DrawBG_Extended(u32 line, u32 bgnum) if (curtile & 0x0400) tilexoff = 7-tilexoff; if (curtile & 0x0800) tileyoff = 7-tileyoff; - color = GPU::ReadVRAM_BG(tilesetaddr + ((curtile & 0x03FF) << 6) + (tileyoff << 3) + tilexoff); + color = bgvram[(tilesetaddr + ((curtile & 0x03FF) << 6) + (tileyoff << 3) + tilexoff) & bgvrammask]; if (color) drawPixel(&BGOBJLine[i], curpal[color], 0x01000000<(tilemapaddr + (((finalY & ymask) >> 8) << yshift) + ((finalX & xmask) >> 8)); + color = bgvram[(tilemapaddr + (((finalY & ymask) >> 8) << yshift) + ((finalX & xmask) >> 8)) & bgvrammask]; if (color) drawPixel(&BGOBJLine[i], pal[color], 0x01000000<<2); @@ -2346,6 +2315,20 @@ void GPU2D::InterleaveSprites(u32 prio) } } +void GetOBJVRAM(u32 num, u8*& data, u32& mask) +{ + if (num == 0) + { + data = GPU::VRAMFlat_AOBJ; + mask = 0x3FFFF; + } + else + { + data = GPU::VRAMFlat_BOBJ; + mask = 0x1FFFF; + } +} + #define DoDrawSprite(type, ...) \ if (iswin) \ { \ @@ -2370,6 +2353,17 @@ void GPU2D::DrawSprites(u32 line) OBJMosaicYCount = 0; } + if (Num == 0) + { + auto objDirty = GPU::VRAMDirty_AOBJ.DeriveState(GPU::VRAMMap_AOBJ); + GPU::MakeVRAMFlat_AOBJCoherent(objDirty); + } + else + { + auto objDirty = GPU::VRAMDirty_BOBJ.DeriveState(GPU::VRAMMap_BOBJ); + GPU::MakeVRAMFlat_BOBJCoherent(objDirty); + } + NumSprites = 0; memset(OBJLine, 0, 256*4); memset(OBJWindow, 0, 256); @@ -2482,6 +2476,10 @@ void GPU2D::DrawSprite_Rotscale(u32 num, u32 boundwidth, u32 boundheight, u32 wi u32 ytilefactor; + u8* objvram; + u32 objvrammask; + GetOBJVRAM(Num, objvram, objvrammask); + s32 centerX = boundwidth >> 1; s32 centerY = boundheight >> 1; @@ -2525,6 +2523,7 @@ void GPU2D::DrawSprite_Rotscale(u32 num, u32 boundwidth, u32 boundheight, u32 wi pixelattr |= (0xC0000000 | (alpha << 24)); + u32 pixelsaddr; if (DispCnt & 0x40) { if (DispCnt & 0x20) @@ -2536,7 +2535,7 @@ void GPU2D::DrawSprite_Rotscale(u32 num, u32 boundwidth, u32 boundheight, u32 wi } else { - tilenum <<= (7 + ((DispCnt >> 22) & 0x1)); + pixelsaddr = tilenum << (7 + ((DispCnt >> 22) & 0x1)); ytilefactor = ((width >> 8) * 2); } } @@ -2544,23 +2543,21 @@ void GPU2D::DrawSprite_Rotscale(u32 num, u32 boundwidth, u32 boundheight, u32 wi { if (DispCnt & 0x20) { - tilenum = ((tilenum & 0x01F) << 4) + ((tilenum & 0x3E0) << 7); + pixelsaddr = ((tilenum & 0x01F) << 4) + ((tilenum & 0x3E0) << 7); ytilefactor = (256 * 2); } else { - tilenum = ((tilenum & 0x00F) << 4) + ((tilenum & 0x3F0) << 7); + pixelsaddr = ((tilenum & 0x00F) << 4) + ((tilenum & 0x3F0) << 7); ytilefactor = (128 * 2); } } - u32 pixelsaddr = (Num ? 0x06600000 : 0x06400000) + tilenum; - for (; xoff < boundwidth;) { if ((u32)rotX < width && (u32)rotY < height) { - color = GPU::ReadVRAM_OBJ(pixelsaddr + ((rotY >> 8) * ytilefactor) + ((rotX >> 8) << 1)); + color = *(u16*)&objvram[(pixelsaddr + ((rotY >> 8) * ytilefactor) + ((rotX >> 8) << 1)) & objvrammask]; if (color & 0x8000) { @@ -2585,9 +2582,10 @@ void GPU2D::DrawSprite_Rotscale(u32 num, u32 boundwidth, u32 boundheight, u32 wi } else { + u32 pixelsaddr = tilenum; if (DispCnt & 0x10) { - tilenum <<= ((DispCnt >> 20) & 0x3); + pixelsaddr <<= ((DispCnt >> 20) & 0x3); ytilefactor = (width >> 11) << ((attrib[0] & 0x2000) ? 1:0); } else @@ -2601,9 +2599,8 @@ void GPU2D::DrawSprite_Rotscale(u32 num, u32 boundwidth, u32 boundheight, u32 wi if (attrib[0] & 0x2000) { // 256-color - tilenum <<= 5; ytilefactor <<= 5; - u32 pixelsaddr = (Num ? 0x06600000 : 0x06400000) + tilenum; + pixelsaddr <<= 5; if (!window) { @@ -2617,7 +2614,7 @@ void GPU2D::DrawSprite_Rotscale(u32 num, u32 boundwidth, u32 boundheight, u32 wi { if ((u32)rotX < width && (u32)rotY < height) { - color = GPU::ReadVRAM_OBJ(pixelsaddr + ((rotY>>11)*ytilefactor) + ((rotY&0x700)>>5) + ((rotX>>11)*64) + ((rotX&0x700)>>8)); + color = objvram[(pixelsaddr + ((rotY>>11)*ytilefactor) + ((rotY&0x700)>>5) + ((rotX>>11)*64) + ((rotX&0x700)>>8)) & objvrammask]; if (color) { @@ -2657,7 +2654,7 @@ void GPU2D::DrawSprite_Rotscale(u32 num, u32 boundwidth, u32 boundheight, u32 wi { if ((u32)rotX < width && (u32)rotY < height) { - color = GPU::ReadVRAM_OBJ(pixelsaddr + ((rotY>>11)*ytilefactor) + ((rotY&0x700)>>6) + ((rotX>>11)*32) + ((rotX&0x700)>>9)); + color = objvram[(pixelsaddr + ((rotY>>11)*ytilefactor) + ((rotY&0x700)>>6) + ((rotX>>11)*32) + ((rotX&0x700)>>9)) & objvrammask]; if (rotX & 0x100) color >>= 4; else @@ -2705,6 +2702,10 @@ void GPU2D::DrawSprite_Normal(u32 num, u32 width, u32 height, s32 xpos, s32 ypos pixelattr |= 0x100000; } + u8* objvram; + u32 objvrammask; + GetOBJVRAM(Num, objvram, objvrammask); + // yflip if (attrib[1] & 0x2000) ypos = height-1 - ypos; @@ -2735,6 +2736,7 @@ void GPU2D::DrawSprite_Normal(u32 num, u32 width, u32 height, s32 xpos, s32 ypos pixelattr |= (0xC0000000 | (alpha << 24)); + u32 pixelsaddr = tilenum; if (DispCnt & 0x40) { if (DispCnt & 0x20) @@ -2746,25 +2748,24 @@ void GPU2D::DrawSprite_Normal(u32 num, u32 width, u32 height, s32 xpos, s32 ypos } else { - tilenum <<= (7 + ((DispCnt >> 22) & 0x1)); - tilenum += (ypos * width * 2); + pixelsaddr <<= (7 + ((DispCnt >> 22) & 0x1)); + pixelsaddr += (ypos * width * 2); } } else { if (DispCnt & 0x20) { - tilenum = ((tilenum & 0x01F) << 4) + ((tilenum & 0x3E0) << 7); - tilenum += (ypos * 256 * 2); + pixelsaddr = ((tilenum & 0x01F) << 4) + ((tilenum & 0x3E0) << 7); + pixelsaddr += (ypos * 256 * 2); } else { - tilenum = ((tilenum & 0x00F) << 4) + ((tilenum & 0x3F0) << 7); - tilenum += (ypos * 128 * 2); + pixelsaddr = ((tilenum & 0x00F) << 4) + ((tilenum & 0x3F0) << 7); + pixelsaddr += (ypos * 128 * 2); } } - u32 pixelsaddr = (Num ? 0x06600000 : 0x06400000) + tilenum; s32 pixelstride; if (attrib[1] & 0x1000) // xflip @@ -2781,7 +2782,7 @@ void GPU2D::DrawSprite_Normal(u32 num, u32 width, u32 height, s32 xpos, s32 ypos for (; xoff < xend;) { - color = GPU::ReadVRAM_OBJ(pixelsaddr); + color = *(u16*)&objvram[pixelsaddr & objvrammask]; pixelsaddr += pixelstride; @@ -2805,14 +2806,15 @@ void GPU2D::DrawSprite_Normal(u32 num, u32 width, u32 height, s32 xpos, s32 ypos } else { + u32 pixelsaddr = tilenum; if (DispCnt & 0x10) { - tilenum <<= ((DispCnt >> 20) & 0x3); - tilenum += ((ypos >> 3) * (width >> 3)) << ((attrib[0] & 0x2000) ? 1:0); + pixelsaddr <<= ((DispCnt >> 20) & 0x3); + pixelsaddr += ((ypos >> 3) * (width >> 3)) << ((attrib[0] & 0x2000) ? 1:0); } else { - tilenum += ((ypos >> 3) * 0x20); + pixelsaddr += ((ypos >> 3) * 0x20); } if (spritemode == 1) pixelattr |= 0x80000000; @@ -2821,8 +2823,7 @@ void GPU2D::DrawSprite_Normal(u32 num, u32 width, u32 height, s32 xpos, s32 ypos if (attrib[0] & 0x2000) { // 256-color - tilenum <<= 5; - u32 pixelsaddr = (Num ? 0x06600000 : 0x06400000) + tilenum; + pixelsaddr <<= 5; pixelsaddr += ((ypos & 0x7) << 3); s32 pixelstride; @@ -2851,7 +2852,7 @@ void GPU2D::DrawSprite_Normal(u32 num, u32 width, u32 height, s32 xpos, s32 ypos for (; xoff < xend;) { - color = GPU::ReadVRAM_OBJ(pixelsaddr); + color = objvram[pixelsaddr]; pixelsaddr += pixelstride; @@ -2877,8 +2878,7 @@ void GPU2D::DrawSprite_Normal(u32 num, u32 width, u32 height, s32 xpos, s32 ypos else { // 16-color - tilenum <<= 5; - u32 pixelsaddr = (Num ? 0x06600000 : 0x06400000) + tilenum; + pixelsaddr <<= 5; pixelsaddr += ((ypos & 0x7) << 2); s32 pixelstride; @@ -2910,13 +2910,13 @@ void GPU2D::DrawSprite_Normal(u32 num, u32 width, u32 height, s32 xpos, s32 ypos { if (attrib[1] & 0x1000) { - if (xoff & 0x1) { color = GPU::ReadVRAM_OBJ(pixelsaddr) & 0x0F; pixelsaddr--; } - else color = GPU::ReadVRAM_OBJ(pixelsaddr) >> 4; + if (xoff & 0x1) { color = objvram[pixelsaddr & objvrammask] & 0x0F; pixelsaddr--; } + else color = objvram[pixelsaddr & objvrammask] >> 4; } else { - if (xoff & 0x1) { color = GPU::ReadVRAM_OBJ(pixelsaddr) >> 4; pixelsaddr++; } - else color = GPU::ReadVRAM_OBJ(pixelsaddr) & 0x0F; + if (xoff & 0x1) { color = objvram[pixelsaddr & objvrammask] >> 4; pixelsaddr++; } + else color = objvram[pixelsaddr & objvrammask] & 0x0F; } if (color) diff --git a/src/GPU2D.h b/src/GPU2D.h index 469d6a24..db15adc1 100644 --- a/src/GPU2D.h +++ b/src/GPU2D.h @@ -59,9 +59,6 @@ public: void CheckWindows(u32 line); - void BGExtPalDirty(u32 base); - void OBJExtPalDirty(); - u16* GetBGExtPal(u32 slot, u32 pal); u16* GetOBJExtPal(); @@ -128,9 +125,6 @@ private: u16 MasterBrightness; u16 BGExtPalCache[4][16*256]; - u16 OBJExtPalCache[16*256]; - u32 BGExtPalStatus[4]; - u32 OBJExtPalStatus; u32 ColorBlend4(u32 val1, u32 val2, u32 eva, u32 evb); u32 ColorBlend5(u32 val1, u32 val2); diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index 74debfe0..4e6ac424 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -179,6 +179,8 @@ u8 RenderFogDensityTable[34]; u32 RenderClearAttr1, RenderClearAttr2; +bool RenderFrameIdentical; + u32 ZeroDotWLimit; u32 GXStat; @@ -2491,6 +2493,19 @@ void VBlank() } RenderNumPolygons = NumPolygons; + RenderFrameIdentical = false; + } + else + { + RenderFrameIdentical = RenderDispCnt == DispCnt + && RenderAlphaRef == AlphaRef + && RenderClearAttr1 == ClearAttr1 + && RenderClearAttr2 == ClearAttr2 + && RenderFogColor == FogColor + && RenderFogOffset == FogOffset * 0x200 + && memcmp(RenderEdgeTable, EdgeTable, 8*2) == 0 + && memcmp(RenderFogDensityTable + 1, FogDensityTable, 32) == 0 + && memcmp(RenderToonTable, ToonTable, 32*2) == 0; } RenderDispCnt = DispCnt; diff --git a/src/GPU3D.h b/src/GPU3D.h index c69adde2..0477c4f1 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -87,6 +87,8 @@ extern u8 RenderFogDensityTable[34]; extern u32 RenderClearAttr1, RenderClearAttr2; +extern bool RenderFrameIdentical; + extern std::array RenderPolygonRAM; extern u32 RenderNumPolygons; diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 7ee9e5df..d66eb76e 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -58,6 +58,8 @@ bool PrevIsShadowMask; bool Enabled; +bool FrameIdentical; + // threading bool Threaded; @@ -550,6 +552,16 @@ typedef struct RendererPolygon PolygonList[2048]; +template +inline T ReadVRAM_Texture(u32 addr) +{ + return *(T*)&GPU::VRAMFlat_Texture[addr & 0x7FFFF]; +} +template +inline T ReadVRAM_TexPal(u32 addr) +{ + return *(T*)&GPU::VRAMFlat_TexPal[addr & 0x1FFFF]; +} void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha) { @@ -606,10 +618,10 @@ void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha case 1: // A3I5 { vramaddr += ((t * width) + s); - u8 pixel = GPU::ReadVRAM_Texture(vramaddr); + u8 pixel = ReadVRAM_Texture(vramaddr); texpal <<= 4; - *color = GPU::ReadVRAM_TexPal(texpal + ((pixel&0x1F)<<1)); + *color = ReadVRAM_TexPal(texpal + ((pixel&0x1F)<<1)); *alpha = ((pixel >> 3) & 0x1C) + (pixel >> 6); } break; @@ -617,12 +629,12 @@ void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha case 2: // 4-color { vramaddr += (((t * width) + s) >> 2); - u8 pixel = GPU::ReadVRAM_Texture(vramaddr); + u8 pixel = ReadVRAM_Texture(vramaddr); pixel >>= ((s & 0x3) << 1); pixel &= 0x3; texpal <<= 3; - *color = GPU::ReadVRAM_TexPal(texpal + (pixel<<1)); + *color = ReadVRAM_TexPal(texpal + (pixel<<1)); *alpha = (pixel==0) ? alpha0 : 31; } break; @@ -630,12 +642,12 @@ void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha case 3: // 16-color { vramaddr += (((t * width) + s) >> 1); - u8 pixel = GPU::ReadVRAM_Texture(vramaddr); + u8 pixel = ReadVRAM_Texture(vramaddr); if (s & 0x1) pixel >>= 4; else pixel &= 0xF; texpal <<= 4; - *color = GPU::ReadVRAM_TexPal(texpal + (pixel<<1)); + *color = ReadVRAM_TexPal(texpal + (pixel<<1)); *alpha = (pixel==0) ? alpha0 : 31; } break; @@ -643,10 +655,10 @@ void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha case 4: // 256-color { vramaddr += ((t * width) + s); - u8 pixel = GPU::ReadVRAM_Texture(vramaddr); + u8 pixel = ReadVRAM_Texture(vramaddr); texpal <<= 4; - *color = GPU::ReadVRAM_TexPal(texpal + (pixel<<1)); + *color = ReadVRAM_TexPal(texpal + (pixel<<1)); *alpha = (pixel==0) ? alpha0 : 31; } break; @@ -660,30 +672,30 @@ void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha if (vramaddr >= 0x40000) slot1addr += 0x10000; - u8 val = GPU::ReadVRAM_Texture(vramaddr); + u8 val = ReadVRAM_Texture(vramaddr); val >>= (2 * (s & 0x3)); - u16 palinfo = GPU::ReadVRAM_Texture(slot1addr); + u16 palinfo = ReadVRAM_Texture(slot1addr); u32 paloffset = (palinfo & 0x3FFF) << 2; texpal <<= 4; switch (val & 0x3) { case 0: - *color = GPU::ReadVRAM_TexPal(texpal + paloffset); + *color = ReadVRAM_TexPal(texpal + paloffset); *alpha = 31; break; case 1: - *color = GPU::ReadVRAM_TexPal(texpal + paloffset + 2); + *color = ReadVRAM_TexPal(texpal + paloffset + 2); *alpha = 31; break; case 2: if ((palinfo >> 14) == 1) { - u16 color0 = GPU::ReadVRAM_TexPal(texpal + paloffset); - u16 color1 = GPU::ReadVRAM_TexPal(texpal + paloffset + 2); + u16 color0 = ReadVRAM_TexPal(texpal + paloffset); + u16 color1 = ReadVRAM_TexPal(texpal + paloffset + 2); u32 r0 = color0 & 0x001F; u32 g0 = color0 & 0x03E0; @@ -700,8 +712,8 @@ void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha } else if ((palinfo >> 14) == 3) { - u16 color0 = GPU::ReadVRAM_TexPal(texpal + paloffset); - u16 color1 = GPU::ReadVRAM_TexPal(texpal + paloffset + 2); + u16 color0 = ReadVRAM_TexPal(texpal + paloffset); + u16 color1 = ReadVRAM_TexPal(texpal + paloffset + 2); u32 r0 = color0 & 0x001F; u32 g0 = color0 & 0x03E0; @@ -717,20 +729,20 @@ void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha *color = r | g | b; } else - *color = GPU::ReadVRAM_TexPal(texpal + paloffset + 4); + *color = ReadVRAM_TexPal(texpal + paloffset + 4); *alpha = 31; break; case 3: if ((palinfo >> 14) == 2) { - *color = GPU::ReadVRAM_TexPal(texpal + paloffset + 6); + *color = ReadVRAM_TexPal(texpal + paloffset + 6); *alpha = 31; } else if ((palinfo >> 14) == 3) { - u16 color0 = GPU::ReadVRAM_TexPal(texpal + paloffset); - u16 color1 = GPU::ReadVRAM_TexPal(texpal + paloffset + 2); + u16 color0 = ReadVRAM_TexPal(texpal + paloffset); + u16 color1 = ReadVRAM_TexPal(texpal + paloffset + 2); u32 r0 = color0 & 0x001F; u32 g0 = color0 & 0x03E0; @@ -759,10 +771,10 @@ void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha case 6: // A5I3 { vramaddr += ((t * width) + s); - u8 pixel = GPU::ReadVRAM_Texture(vramaddr); + u8 pixel = ReadVRAM_Texture(vramaddr); texpal <<= 4; - *color = GPU::ReadVRAM_TexPal(texpal + ((pixel&0x7)<<1)); + *color = ReadVRAM_TexPal(texpal + ((pixel&0x7)<<1)); *alpha = (pixel >> 3); } break; @@ -770,7 +782,7 @@ void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha case 7: // direct color { vramaddr += (((t * width) + s) << 1); - *color = GPU::ReadVRAM_Texture(vramaddr); + *color = ReadVRAM_Texture(vramaddr); *alpha = (*color & 0x8000) ? 31 : 0; } break; @@ -2007,8 +2019,8 @@ void ClearBuffers() { for (int x = 0; x < 256; x++) { - u16 val2 = GPU::ReadVRAM_Texture(0x40000 + (yoff << 9) + (xoff << 1)); - u16 val3 = GPU::ReadVRAM_Texture(0x60000 + (yoff << 9) + (xoff << 1)); + u16 val2 = ReadVRAM_Texture(0x40000 + (yoff << 9) + (xoff << 1)); + u16 val3 = ReadVRAM_Texture(0x60000 + (yoff << 9) + (xoff << 1)); // TODO: confirm color conversion u32 r = (val2 << 1) & 0x3E; if (r) r++; @@ -2088,11 +2100,19 @@ void VCount144() void RenderFrame() { + auto textureDirty = GPU::VRAMDirty_Texture.DeriveState(GPU::VRAMMap_Texture); + auto texPalDirty = GPU::VRAMDirty_TexPal.DeriveState(GPU::VRAMMap_TexPal); + + bool textureChanged = GPU::MakeVRAMFlat_TextureCoherent(textureDirty); + bool texPalChanged = GPU::MakeVRAMFlat_TexPalCoherent(texPalDirty); + + FrameIdentical = !(textureChanged || texPalChanged) && RenderFrameIdentical; + if (RenderThreadRunning) { Platform::Semaphore_Post(Sema_RenderStart); } - else + else if (!FrameIdentical) { ClearBuffers(); RenderPolygons(false, &RenderPolygonRAM[0], RenderNumPolygons); @@ -2107,8 +2127,15 @@ void RenderThreadFunc() if (!RenderThreadRunning) return; RenderThreadRendering = true; - ClearBuffers(); - RenderPolygons(true, &RenderPolygonRAM[0], RenderNumPolygons); + if (FrameIdentical) + { + Platform::Semaphore_Post(Sema_ScanlineCount, 192); + } + else + { + ClearBuffers(); + RenderPolygons(true, &RenderPolygonRAM[0], RenderNumPolygons); + } Platform::Semaphore_Post(Sema_RenderDone); RenderThreadRendering = false; diff --git a/src/NonStupidBitfield.h b/src/NonStupidBitfield.h new file mode 100644 index 00000000..124ba76f --- /dev/null +++ b/src/NonStupidBitfield.h @@ -0,0 +1,149 @@ +#ifndef NONSTUPIDBITFIELD_H +#define NONSTUPIDBITFIELD_H + +#include "types.h" + +#include + +#include +#include + +// like std::bitset but less stupid and optimised for +// our use case (keeping track of memory invalidations) + +template +struct NonStupidBitField +{ + static_assert((Size % 8) == 0, "bitfield size must be a multiple of 8"); + static const u32 DataLength = Size / 8; + u8 Data[DataLength]; + + struct Ref + { + NonStupidBitField& BitField; + u32 Idx; + + operator bool() + { + return BitField.Data[Idx >> 3] & (1 << (Idx & 0x7)); + } + + Ref& operator=(bool set) + { + BitField.Data[Idx >> 3] &= ~(1 << (Idx & 0x7)); + BitField.Data[Idx >> 3] |= ((u8)set << (Idx & 0x7)); + return *this; + } + }; + + struct Iterator + { + NonStupidBitField& BitField; + u32 DataIdx; + u32 BitIdx; + u64 RemainingBits; + + u32 operator*() { return DataIdx * 8 + BitIdx; } + + bool operator==(const Iterator& other) { return other.DataIdx == DataIdx; } + bool operator!=(const Iterator& other) { return other.DataIdx != DataIdx; } + + template + void Next() + { + while (RemainingBits == 0 && DataIdx < DataLength) + { + DataIdx += sizeof(T); + RemainingBits = *(T*)&BitField.Data[DataIdx]; + } + + BitIdx = __builtin_ctzll(RemainingBits); + RemainingBits &= ~(1ULL << BitIdx); + } + + Iterator operator++(int) + { + Iterator prev(*this); + ++*this; + return prev; + } + + Iterator& operator++() + { + if ((DataLength % 8) == 0) + Next(); + else if ((DataLength % 4) == 0) + Next(); + else if ((DataLength % 2) == 0) + Next(); + else + Next(); + + return *this; + } + }; + + NonStupidBitField(u32 start, u32 size) + { + memset(Data, 0, sizeof(Data)); + + if (size == 0) + return; + + u32 roundedStartBit = (start + 7) & ~7; + u32 roundedEndBit = (start + size) & ~7; + if (roundedStartBit != roundedEndBit) + memset(Data + roundedStartBit / 8, 0xFF, (roundedEndBit - roundedStartBit) / 8); + + if (start & 0x7) + Data[start >> 3] = 0xFF << (start & 0x7); + if ((start + size) & 0x7) + Data[(start + size) >> 3] = 0xFF >> ((start + size) & 0x7); + } + + NonStupidBitField() + { + memset(Data, 0, sizeof(Data)); + } + + Iterator End() + { + return Iterator{*this, DataLength, 0, 0}; + } + Iterator Begin() + { + if ((DataLength % 8) == 0) + return ++Iterator{*this, 0, 0, *(u64*)Data}; + else if ((DataLength % 4) == 0) + return ++Iterator{*this, 0, 0, *(u32*)Data}; + else if ((DataLength % 2) == 0) + return ++Iterator{*this, 0, 0, *(u16*)Data}; + else + return ++Iterator{*this, 0, 0, *Data}; + } + + Ref operator[](u32 idx) + { + return Ref{*this, idx}; + } + + NonStupidBitField& operator|=(const NonStupidBitField& other) + { + for (u32 i = 0; i < DataLength; i++) + { + Data[i] |= other.Data[i]; + } + return *this; + } + NonStupidBitField& operator&=(const NonStupidBitField& other) + { + for (u32 i = 0; i < DataLength; i++) + { + Data[i] &= other.Data[i]; + } + return *this; + } +}; + + +#endif \ No newline at end of file diff --git a/src/Platform.h b/src/Platform.h index deb37853..b4dda9eb 100644 --- a/src/Platform.h +++ b/src/Platform.h @@ -77,7 +77,7 @@ Semaphore* Semaphore_Create(); void Semaphore_Free(Semaphore* sema); void Semaphore_Reset(Semaphore* sema); void Semaphore_Wait(Semaphore* sema); -void Semaphore_Post(Semaphore* sema); +void Semaphore_Post(Semaphore* sema, int count = 1); struct Mutex; Mutex* Mutex_Create(); diff --git a/src/frontend/qt_sdl/Platform.cpp b/src/frontend/qt_sdl/Platform.cpp index a51a9854..d3480e44 100644 --- a/src/frontend/qt_sdl/Platform.cpp +++ b/src/frontend/qt_sdl/Platform.cpp @@ -230,9 +230,9 @@ void Semaphore_Wait(Semaphore* sema) ((QSemaphore*) sema)->acquire(); } -void Semaphore_Post(Semaphore* sema) +void Semaphore_Post(Semaphore* sema, int count) { - ((QSemaphore*) sema)->release(); + ((QSemaphore*) sema)->release(count); } Mutex* Mutex_Create()