From ed61867decb0924a2b1f005e433753dc97487d7c Mon Sep 17 00:00:00 2001 From: RSDuck Date: Sat, 21 Mar 2020 19:37:18 +0100 Subject: [PATCH] make things fasteeer! also fix alpha for A3I5 textures --- src/GPU.cpp | 21 ++--- src/GPU.h | 8 +- src/GPU3D.h | 3 - src/GPU3D_Soft.cpp | 29 ++++--- src/GPU3D_TexCache.cpp | 169 ++++++++++++++++++++++++++--------------- 5 files changed, 145 insertions(+), 85 deletions(-) diff --git a/src/GPU.cpp b/src/GPU.cpp index 7192cd41..2ecd8802 100644 --- a/src/GPU.cpp +++ b/src/GPU.cpp @@ -52,6 +52,8 @@ u8 VRAM_I[ 16*1024]; u8* VRAM[9] = {VRAM_A, VRAM_B, VRAM_C, VRAM_D, VRAM_E, VRAM_F, VRAM_G, VRAM_H, VRAM_I}; u32 VRAMMask[9] = {0x1FFFF, 0x1FFFF, 0x1FFFF, 0x1FFFF, 0xFFFF, 0x3FFF, 0x3FFF, 0x7FFF, 0x3FFF}; +u64 LCDCDirty[9][2]; + u8 VRAMCNT[9]; u8 VRAMSTAT; @@ -380,16 +382,17 @@ void MapVRAM_AB(u32 bank, u8 cnt) case 1: // ABG UNMAP_RANGE_PTR(ABG, oldofs<<3, 8); + LCDCDirty[bank][0] = LCDCDirty[bank][1] = 0xFFFFFFFFFFFFFFFF; break; case 2: // AOBJ oldofs &= 0x1; UNMAP_RANGE_PTR(AOBJ, oldofs<<3, 8); + LCDCDirty[bank][0] = LCDCDirty[bank][1] = 0xFFFFFFFFFFFFFFFF; break; case 3: // texture VRAMMap_Texture[oldofs] &= ~bankmask; - GPU3D::TexCache::InvalidateTexSlot(oldofs); break; } } @@ -413,7 +416,6 @@ void MapVRAM_AB(u32 bank, u8 cnt) case 3: // texture VRAMMap_Texture[ofs] |= bankmask; - GPU3D::TexCache::InvalidateTexSlot(ofs); break; } } @@ -442,16 +444,17 @@ void MapVRAM_CD(u32 bank, u8 cnt) case 1: // ABG UNMAP_RANGE_PTR(ABG, oldofs<<3, 8); + LCDCDirty[bank][0] = LCDCDirty[bank][1] = 0xFFFFFFFFFFFFFFFF; break; case 2: // ARM7 VRAM oldofs &= 0x1; VRAMMap_ARM7[oldofs] &= ~bankmask; + LCDCDirty[bank][0] = LCDCDirty[bank][1] = 0xFFFFFFFFFFFFFFFF; break; case 3: // texture VRAMMap_Texture[oldofs] &= ~bankmask; - GPU3D::TexCache::InvalidateTexSlot(oldofs); break; case 4: // BBG/BOBJ @@ -463,6 +466,7 @@ void MapVRAM_CD(u32 bank, u8 cnt) { UNMAP_RANGE_PTR(BOBJ, 0, 8); } + LCDCDirty[bank][0] = LCDCDirty[bank][1] = 0xFFFFFFFFFFFFFFFF; break; } } @@ -487,7 +491,6 @@ void MapVRAM_CD(u32 bank, u8 cnt) case 3: // texture VRAMMap_Texture[ofs] |= bankmask; - GPU3D::TexCache::InvalidateTexSlot(ofs); break; case 4: // BBG/BOBJ @@ -523,16 +526,16 @@ void MapVRAM_E(u32 bank, u8 cnt) case 1: // ABG UNMAP_RANGE_PTR(ABG, 0, 4); + LCDCDirty[bank][0] = 0xFFFFFFFFFFFFFFFF; break; case 2: // AOBJ UNMAP_RANGE_PTR(AOBJ, 0, 4); + LCDCDirty[bank][0] = 0xFFFFFFFFFFFFFFFF; break; case 3: // texture palette UNMAP_RANGE(TexPal, 0, 4); - for (int i = 0; i < 4; i++) - GPU3D::TexCache::InvalidatePalSlot(i); break; case 4: // ABG ext palette @@ -561,8 +564,6 @@ void MapVRAM_E(u32 bank, u8 cnt) case 3: // texture palette MAP_RANGE(TexPal, 0, 4); - for (int i = 0; i < 4; i++) - GPU3D::TexCache::InvalidatePalSlot(i); break; case 4: // ABG ext palette @@ -601,6 +602,7 @@ void MapVRAM_FG(u32 bank, u8 cnt) VRAMPtr_ABG[base] = GetUniqueBankPtr(VRAMMap_ABG[base], base << 14); VRAMPtr_ABG[base + 2] = GetUniqueBankPtr(VRAMMap_ABG[base + 2], (base + 2) << 14); } + LCDCDirty[bank][0] = 0xFFFFFFFF; break; case 2: // AOBJ @@ -611,11 +613,11 @@ void MapVRAM_FG(u32 bank, u8 cnt) VRAMPtr_AOBJ[base] = GetUniqueBankPtr(VRAMMap_AOBJ[base], base << 14); VRAMPtr_AOBJ[base + 2] = GetUniqueBankPtr(VRAMMap_AOBJ[base + 2], (base + 2) << 14); } + LCDCDirty[bank][0] = 0xFFFFFFFF; break; case 3: // texture palette VRAMMap_TexPal[(oldofs & 0x1) + ((oldofs & 0x2) << 1)] &= ~bankmask; - GPU3D::TexCache::InvalidatePalSlot((oldofs & 0x1) + ((oldofs & 0x2) << 1)); break; case 4: // ABG ext palette @@ -661,7 +663,6 @@ void MapVRAM_FG(u32 bank, u8 cnt) case 3: // texture palette VRAMMap_TexPal[(ofs & 0x1) + ((ofs & 0x2) << 1)] |= bankmask; - GPU3D::TexCache::InvalidatePalSlot((ofs & 0x1) + ((ofs & 0x2) << 1)); break; case 4: // ABG ext palette diff --git a/src/GPU.h b/src/GPU.h index f7fc4b35..0a13e0b0 100644 --- a/src/GPU.h +++ b/src/GPU.h @@ -49,6 +49,8 @@ extern u8 VRAM_I[ 16*1024]; extern u8* VRAM[9]; extern u32 VRAMMask[9]; +extern u64 LCDCDirty[9][2]; + extern u32 VRAMMap_LCDC; extern u32 VRAMMap_ABG[0x20]; extern u32 VRAMMap_AOBJ[0x10]; @@ -219,7 +221,11 @@ void WriteVRAM_LCDC(u32 addr, T val) default: return; } - if (VRAMMap_LCDC & (1<> 16] |= 1 << ((addr >> 10) & 0x3F); + } } diff --git a/src/GPU3D.h b/src/GPU3D.h index 3ed1b352..94c45a3f 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -155,9 +155,6 @@ void SaveTextures(); template ExternalTexHandle GetTexture(u32 texParam, u32 palBase); -void InvalidateTexSlot(u32 base); -void InvalidatePalSlot(u32 base); - } namespace SoftRenderer diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index b8d47c43..151de492 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -80,7 +80,7 @@ struct TextureAllocator // all sizes below 8*8 (log2(64)=6) can be ignored TextureAllocator TextureMem[14]; -TextureAllocator& GetTextureAllocator(u32 width, u32 height) +inline TextureAllocator& GetTextureAllocator(u32 width, u32 height) { return TextureMem[__builtin_ctz(width * height) - 6]; } @@ -1136,14 +1136,23 @@ void SetupPolygonRightEdge(RendererPolygon* rp, s32 y) polygon->FinalW[rp->CurVR], polygon->FinalW[rp->NextVR], y); } -void SetupPolygon(RendererPolygon* rp, Polygon* polygon) +void SetupPolygon(RendererPolygon* rp, Polygon* polygon, RendererPolygon* lastRp) { if (polygon->TexParam & 0x1C000000) { - TexCache::ExternalTexHandle handle = TexCache::GetTexture(polygon->TexParam, polygon->TexPalette); - u32 width = 8 << ((polygon->TexParam >> 20) & 0x7); - u32 height = 8 << ((polygon->TexParam >> 23) & 0x7); - rp->TextureData = &GetTextureAllocator(width, height).Pixels[handle]; + if (lastRp && lastRp->PolyData->TexParam == polygon->TexParam + && lastRp->PolyData->TexPalette == polygon->TexPalette) + { + rp->TextureData = lastRp->TextureData; + } + else + { + TexCache::ExternalTexHandle handle = + TexCache::GetTexture(polygon->TexParam, polygon->TexPalette); + u32 width = 8 << ((polygon->TexParam >> 20) & 0x7); + u32 height = 8 << ((polygon->TexParam >> 23) & 0x7); + rp->TextureData = &GetTextureAllocator(width, height).Pixels[handle]; + } } u32 nverts = polygon->NumVertices; @@ -2142,17 +2151,15 @@ void ClearBuffers() void RenderPolygons(bool threaded, Polygon** polygons, int npolys) { - u64 ticksStart = SDL_GetPerformanceCounter(); TexCache::UpdateTextures(); int j = 0; for (int i = 0; i < npolys; i++) { if (polygons[i]->Degenerate) continue; - SetupPolygon(&PolygonList[j++], polygons[i]); - } - u64 tickesEnd = SDL_GetPerformanceCounter(); - printf("time %fms\n", (tickesEnd-ticksStart)/(float)SDL_GetPerformanceFrequency()*1000.f); + SetupPolygon(&PolygonList[j], polygons[i], j > 0 ? &PolygonList[j - 1] : NULL); + j++; + } TexCache::SaveTextures(); RenderScanline(0, j); diff --git a/src/GPU3D_TexCache.cpp b/src/GPU3D_TexCache.cpp index 3bfdb8bb..13d3388b 100644 --- a/src/GPU3D_TexCache.cpp +++ b/src/GPU3D_TexCache.cpp @@ -165,7 +165,7 @@ void ConvertDirectColorTexture(u32 width, u32 height, u32* output, u8* texData) } template -void ConvertAXIYTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData, bool color0Transparent) +void ConvertAXIYTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData) { for (int y = 0; y < height; y++) { @@ -176,11 +176,9 @@ void ConvertAXIYTexture(u32 width, u32 height, u32* output, u8* texData, u16* pa u32 idx = val & ((1 << Y) - 1); u16 color = palData[idx]; - u32 alpha = (val >> X) & ((1 << X) - 1); + u32 alpha = (val >> Y) & ((1 << X) - 1); if (X != 5) alpha = alpha * 4 + alpha / 2; - if (color0Transparent && idx == 0) - alpha = 0; u32 res; switch (outputFmt) @@ -236,11 +234,11 @@ struct Texture ExternalTexHandle Handle; }; -u64 PaletteCacheStatus; +u64 PaletteCacheStatus[2]; u8 PaletteCache[128*1024]; -u64 PaletteDirty[2]; -u64 TexturesDirty[8]; +u32 TextureMap[4]; +u32 PaletteMap[8]; std::unordered_map TextureCache; @@ -260,11 +258,12 @@ void DeInit() void Reset() { - PaletteCacheStatus = 0; + memset(PaletteCacheStatus, 0, 2*8); - memset(PaletteDirty, 0, 8*2); - memset(TexturesDirty, 0, 8*8); TextureCache.clear(); + + memset(TextureMap, 0, 2*4); + memset(PaletteMap, 0, 8*4); } u8* GetTexturePtr(u32 addr, u32 size, u8** unpackBuffer) @@ -289,29 +288,32 @@ u8* GetTexturePtr(u32 addr, u32 size, u8** unpackBuffer) } } -void EnsurePaletteCoherent(u64 mask) +void EnsurePaletteCoherent(u64* mask) { - if ((PaletteCacheStatus & mask) != mask) + for (int i = 0; i < 2; i++) { - u32 updateField = ~PaletteCacheStatus & mask; - PaletteCacheStatus |= mask; - while (updateField != 0) + if ((PaletteCacheStatus[i] & mask[i]) != mask[i]) { - updatePalette = true; - int idx = __builtin_ctz(updateField); - u32 map = GPU::VRAMMap_TexPal[idx >> 3]; - if (map && (map & (map - 1)) == 0) + u64 updateField = ~PaletteCacheStatus[i] & mask[i]; + PaletteCacheStatus[i] |= mask[i]; + while (updateField != 0) { - u32 bank = __builtin_ctz(map); - memcpy( - PaletteCache + idx * 0x800, - GPU::VRAM[bank] + ((idx * 0x800) & GPU::VRAMMask[bank]), - 0x800); + updatePalette = true; + int idx = __builtin_ctzll(updateField); + u32 map = GPU::VRAMMap_TexPal[idx >> 4 + i * 4]; + if (map && (map & (map - 1)) == 0) + { + u32 bank = __builtin_ctz(map); + memcpy( + PaletteCache + i * 0x10000 + idx * 0x400, + GPU::VRAM[bank] + ((idx * 0x400) & GPU::VRAMMask[bank]), + 0x400); + } + else + for (int j = 0; j < 0x400; j += 8) + *(u64*)&PaletteCache[i * 0x10000 + idx * 0x400 + j] = GPU::ReadVRAM_TexPal(i * 0x10000 + idx * 0x400 + j); + updateField &= ~(1ULL << idx); } - else - for (int i = 0; i < 0x800; i += 8) - *(u64*)&PaletteCache[idx * 0x800 + i] = GPU::ReadVRAM_TexPal(idx * 0x800 + i); - updateField &= ~(1 << idx); } } } @@ -324,6 +326,62 @@ void UpdateTextures() copyTexture = false; textureUpdated = false; + u64 PaletteDirty[2] = {0}; + u64 TexturesDirty[8] = {0}; + + for (int i = 0; i < 4; i++) + { + if (GPU::VRAMMap_Texture[i] != TextureMap[i]) + { + TexturesDirty[(i << 1)] = 0xFFFFFFFFFFFFFFFF; + TexturesDirty[(i << 1) + 1] = 0xFFFFFFFFFFFFFFFF; + + TextureMap[i] = GPU::VRAMMap_Texture[i]; + } + else + { + for (int j = 0; j < 4; j++) + { + if (TextureMap[i] & (1<> 2] |= 0xFFFF << (i & 0x3) * 16; + PaletteCacheStatus[i >> 2] &= ~(0xFFFF << (i & 0x3) * 16); + PaletteMap[i] = GPU::VRAMMap_TexPal[i]; + } + else + { + // E + if (PaletteMap[i] & (1<<3)) + { + PaletteDirty[i >> 2] |= GPU::LCDCDirty[3][0]; + PaletteCacheStatus[i >> 2] &= ~GPU::LCDCDirty[3][0]; + GPU::LCDCDirty[3][0] = 0; + } + // FG + for (int j = 0; j < 2; j++) + { + if (PaletteMap[i] & (1<<(4+j))) + { + PaletteDirty[i >> 2] |= GPU::LCDCDirty[4+j][0] << (i & 0x3) * 16; + PaletteCacheStatus[i >> 2] &= ~(GPU::LCDCDirty[4+j][0] << (i & 0x3) * 16); + GPU::LCDCDirty[4+j][0] = 0; + } + } + } + } + bool paletteDirty = PaletteDirty[0] | PaletteDirty[1]; bool textureDirty = false; for (int i = 0; i < 8; i++) @@ -358,23 +416,25 @@ void UpdateTextures() } } -inline u64 MakePaletteMask(u32 addr, u32 size) -{ - return ((1ULL << (((addr + size + 0x7FF & ~0x7FF) >> 11) - (addr >> 11))) - 1) << (addr >> 11); -} - inline void MakeDirtyMask(u64* out, u32 addr, u32 size) { - u32 start = addr >> 10; - u32 count = (((addr + size + 0x3FF) & ~0x3FF) >> 10) - start; + u32 startBit = addr >> 10; + u32 bitsCount = ((addr + size + 0x3FF & ~0x3FF) >> 10) - startBit; - u32 firstIdx = start >> 6; - u32 indicesCount = (((count + 0x3F) & ~0x3F) >> 6) - firstIdx; + u32 startEntry = startBit >> 6; + u64 entriesCount = ((startBit + bitsCount + 0x3F & ~0x3F) >> 6) - startEntry; - out[firstIdx] = (1ULL << (63 - (start & 0x3F))) - 1 << (start & 0x3F); - out[firstIdx + indicesCount - 1] = (1ULL << (start & 0x3F)) - 1; - for (int i = firstIdx + 1; i < firstIdx + indicesCount - 1; i++) - out[i] |= 0xFFFFFFFFFFFFFFFF; + if (entriesCount > 1) + { + out[startEntry] |= 0xFFFFFFFFFFFFFFFF << (startBit & 0x3F); + out[startEntry + entriesCount - 1] |= (1ULL << (startBit & 0x3F)) - 1; + for (int i = startEntry + 1; i < startEntry + entriesCount - 1; i++) + out[i] = 0xFFFFFFFFFFFFFFFF; + } + else + { + out[startEntry] |= ((1ULL << bitsCount) - 1) << (startBit & 0x3F); + } } template @@ -427,8 +487,7 @@ ExternalTexHandle GetTexture(u32 texParam, u32 palBase) MakeDirtyMask(texture.TextureMask, slot1addr, width*height/16*2); MakeDirtyMask(texture.PaletteMask, palBase*16, 0x10000); - u64 paletteMask = MakePaletteMask(palBase*16, 0x10000); - EnsurePaletteCoherent(MakePaletteMask(palBase*16, 0x10000)); + EnsurePaletteCoherent(texture.PaletteMask); u16* palData = (u16*)(PaletteCache + palBase*16); ConvertCompressedTexture(width, height, data, texData, texAuxData, palData); @@ -445,19 +504,20 @@ ExternalTexHandle GetTexture(u32 texParam, u32 palBase) case 4: texSize = width*height; palSize = 256; break; } - u8* texData = GetTexturePtr(addr, texSize, &unpackBuffer); - EnsurePaletteCoherent(MakePaletteMask(palAddr, palSize*2)); - u16* palData = (u16*)(PaletteCache + palAddr); - MakeDirtyMask(texture.TextureMask, addr, texSize); MakeDirtyMask(texture.PaletteMask, palAddr, palSize); + EnsurePaletteCoherent(texture.PaletteMask); + + u8* texData = GetTexturePtr(addr, texSize, &unpackBuffer); + u16* palData = (u16*)(PaletteCache + palAddr); + bool color0Transparent = texParam & (1 << 29); switch (fmt) { - case 1: ConvertAXIYTexture(width, height, data, texData, palData, color0Transparent); break; - case 6: ConvertAXIYTexture(width, height, data, texData, palData, color0Transparent); break; + case 1: ConvertAXIYTexture(width, height, data, texData, palData); break; + case 6: ConvertAXIYTexture(width, height, data, texData, palData); break; case 2: ConvertNColorsTexture(width, height, data, texData, palData, color0Transparent); break; case 3: ConvertNColorsTexture(width, height, data, texData, palData, color0Transparent); break; case 4: ConvertNColorsTexture(width, height, data, texData, palData, color0Transparent); break; @@ -493,19 +553,8 @@ void SaveTextures() //printf("%d %d textures converted %d pixels %d %d %d\n", converted, TextureCache.size(), pixelsConverted, updatePalette, copyTexture, textureUpdated); } -void InvalidateTexSlot(u32 base) -{ - TexturesDirty[(base << 1)] = 0xFFFFFFFFFFFFFFFF; - TexturesDirty[(base << 1) + 1] = 0xFFFFFFFFFFFFFFFF; } -void InvalidatePalSlot(u32 base) -{ - PaletteDirty[base >> 2] |= 0xFFFF << (base & 0x3) * 16; - PaletteCacheStatus &= ~(0xFF << base * 8); -} - -} } template GPU3D::TexCache::ExternalTexHandle