put the texture cache into its own file

2023-05-14 19:52:40 +02:00 · 2023-05-14 19:52:40 +02:00 · 045829b0bd
parent 0ae19cffe6
commit 045829b0bd
15 changed files with 658 additions and 526 deletions
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -33,6 +33,8 @@ add_library(core STATIC
    GPU2D_Soft.cpp
    GPU3D.cpp
    GPU3D_Soft.cpp
+    GPU3D_Texcache.cpp
+    GPU3D_Texcache.h
    melonDLDI.h
    NDS.cpp
    NDSCart.cpp
@ -67,6 +69,8 @@ if (ENABLE_OGLRENDERER)
        GPU_OpenGL_shaders.h
        GPU3D_OpenGL.cpp
        GPU3D_Compute.cpp
+        GPU3D_TexcacheOpenGL.cpp
+        GPU3D_TexcacheOpenGL.h
        GPU3D_OpenGL_shaders.h
        OpenGLSupport.cpp)

--- a/src/DMA.cpp
+++ b/src/DMA.cpp
@ -21,6 +21,7 @@
 #include "DSi.h"
 #include "DMA.h"
 #include "GPU.h"
+#include "GPU3D.h"
 #include "DMA_Timings.h"
 #include "Platform.h"

--- a/src/DSi_NDMA.cpp
+++ b/src/DSi_NDMA.cpp
@ -22,6 +22,7 @@
 #include "DSi_NDMA.h"
 #include "GPU.h"
 #include "DSi_AES.h"
+#include "GPU3D.h"

 using Platform::Log;
 using Platform::LogLevel;
--- a/src/GPU.cpp
+++ b/src/GPU.cpp
@ -25,6 +25,7 @@
 #endif

 #include "GPU2D_Soft.h"
+#include "GPU3D.h"

 using Platform::Log;
 using Platform::LogLevel;
--- a/src/GPU.h
+++ b/src/GPU.h
@ -617,6 +617,4 @@ void SetDispStat(u32 cpu, u16 val);
 void SetVCount(u16 val);
 }

-#include "GPU3D.h"
-
 #endif
--- a/src/GPU2D.cpp
+++ b/src/GPU2D.cpp
@ -20,6 +20,7 @@
 #include <string.h>
 #include "NDS.h"
 #include "GPU.h"
+#include "GPU3D.h"

 using Platform::Log;
 using Platform::LogLevel;
--- a/src/GPU2D_Soft.cpp
+++ b/src/GPU2D_Soft.cpp
@ -18,6 +18,7 @@

 #include "GPU2D_Soft.h"
 #include "GPU.h"
+#include "GPU3D.h"

 namespace GPU2D
 {
--- a/src/GPU3D.cpp
+++ b/src/GPU3D.cpp
@ -23,6 +23,7 @@
 #include "GPU.h"
 #include "FIFO.h"
 #include "Platform.h"
+#include "GPU3D.h"

 using Platform::Log;
 using Platform::LogLevel;
--- a/src/GPU3D_Compute.cpp
+++ b/src/GPU3D_Compute.cpp
@ -20,9 +20,6 @@

 #include <assert.h>

-#define XXH_STATIC_LINKING_ONLY
-#include "xxhash/xxhash.h"
-
 #include "OpenGLSupport.h"

 #include "GPU3D_Compute_shaders.h"
@ -31,7 +28,7 @@ namespace GPU3D
 {

 ComputeRenderer::ComputeRenderer()
-    : Renderer3D(true)
+    : Renderer3D(true), Texcache(TexcacheOpenGLLoader())
 {}

 ComputeRenderer::~ComputeRenderer()
@ -72,8 +69,8 @@ void blah(GLenum source,GLenum type,GLuint id,GLenum severity,GLsizei length,con

 bool ComputeRenderer::Init()
 {
-    glDebugMessageCallback(blah, NULL);
-    glEnable(GL_DEBUG_OUTPUT);
+    //glDebugMessageCallback(blah, NULL);
+    //glEnable(GL_DEBUG_OUTPUT);
    glGenBuffers(1, &YSpanSetupMemory);
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, YSpanSetupMemory);
    glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupY)*MaxYSpanSetups, nullptr, GL_DYNAMIC_DRAW);
@ -120,7 +117,7 @@ bool ComputeRenderer::Init()

 void ComputeRenderer::DeInit()
 {
-    ResetTexcache();
+    Texcache.Reset();

    glDeleteBuffers(1, &YSpanSetupMemory);
    glDeleteBuffers(1, &RenderPolygonMemory);
@ -180,24 +177,9 @@ void ComputeRenderer::DeleteShaders()
        glDeleteProgram(program);
 }

-void ComputeRenderer::ResetTexcache()
-{
-    for (u32 i = 0; i < 8; i++)
-    {
-        for (u32 j = 0; j < 8; j++)
-        {
-            for (u32 k = 0; k < TexArrays[i][j].size(); k++)
-                glDeleteTextures(1, &TexArrays[i][j][k]);
-            TexArrays[i][j].clear();
-            FreeTextures[i][j].clear();
-        }
-    }
-    TexCache.clear();
-}
-
 void ComputeRenderer::Reset()
 {
-    ResetTexcache();
+    Texcache.Reset();
 }

 void ComputeRenderer::SetRenderSettings(GPU::RenderSettings& settings)
@ -496,402 +478,6 @@ void ComputeRenderer::SetupYSpan(RenderPolygon* rp, SpanSetupY* span, Polygon* p
    }
 }

-inline u32 TextureWidth(u32 texparam)
-{
-    return 8 << ((texparam >> 20) & 0x7);
-}
-
-inline u32 TextureHeight(u32 texparam)
-{
-    return 8 << ((texparam >> 23) & 0x7);
-}
-
-inline u16 ColorAvg(u16 color0, u16 color1)
-{
-    u32 r0 = color0 & 0x001F;
-    u32 g0 = color0 & 0x03E0;
-    u32 b0 = color0 & 0x7C00;
-    u32 r1 = color1 & 0x001F;
-    u32 g1 = color1 & 0x03E0;
-    u32 b1 = color1 & 0x7C00;
-
-    u32 r = (r0 + r1) >> 1;
-    u32 g = ((g0 + g1) >> 1) & 0x03E0;
-    u32 b = ((b0 + b1) >> 1) & 0x7C00;
-
-    return r | g | b;
-}
-
-inline u16 Color5of3(u16 color0, u16 color1)
-{
-    u32 r0 = color0 & 0x001F;
-    u32 g0 = color0 & 0x03E0;
-    u32 b0 = color0 & 0x7C00;
-    u32 r1 = color1 & 0x001F;
-    u32 g1 = color1 & 0x03E0;
-    u32 b1 = color1 & 0x7C00;
-
-    u32 r = (r0*5 + r1*3) >> 3;
-    u32 g = ((g0*5 + g1*3) >> 3) & 0x03E0;
-    u32 b = ((b0*5 + b1*3) >> 3) & 0x7C00;
-
-    return r | g | b;
-}
-
-inline u16 Color3of5(u16 color0, u16 color1)
-{
-    u32 r0 = color0 & 0x001F;
-    u32 g0 = color0 & 0x03E0;
-    u32 b0 = color0 & 0x7C00;
-    u32 r1 = color1 & 0x001F;
-    u32 g1 = color1 & 0x03E0;
-    u32 b1 = color1 & 0x7C00;
-
-    u32 r = (r0*3 + r1*5) >> 3;
-    u32 g = ((g0*3 + g1*5) >> 3) & 0x03E0;
-    u32 b = ((b0*3 + b1*5) >> 3) & 0x7C00;
-
-    return r | g | b;
-}
-
-inline u32 ConvertRGB5ToRGB8(u16 val)
-{
-    return (((u32)val & 0x1F) << 3)
-        | (((u32)val & 0x3E0) << 6)
-        | (((u32)val & 0x7C00) << 9);
-}
-inline u32 ConvertRGB5ToBGR8(u16 val)
-{
-    return (((u32)val & 0x1F) << 9)
-        | (((u32)val & 0x3E0) << 6)
-        | (((u32)val & 0x7C00) << 3);
-}
-inline u32 ConvertRGB5ToRGB6(u16 val)
-{
-    u8 r = (val & 0x1F) << 1;
-    u8 g = (val & 0x3E0) >> 4;
-    u8 b = (val & 0x7C00) >> 9;
-    if (r) r++;
-    if (g) g++;
-    if (b) b++;
-    return (u32)r | ((u32)g << 8) | ((u32)b << 16);
-}
-
-enum
-{
-    outputFmt_RGB6A5,
-    outputFmt_RGBA8,
-    outputFmt_BGRA8
-};
-
-template <int outputFmt>
-void ConvertCompressedTexture(u32 width, u32 height, u32* output, u8* texData, u8* texAuxData, u16* palData)
-{
-    // we process a whole block at the time
-    for (int y = 0; y < height / 4; y++)
-    {
-        for (int x = 0; x < width / 4; x++)
-        {
-            u32 data = ((u32*)texData)[x + y * (width / 4)];
-            u16 auxData = ((u16*)texAuxData)[x + y * (width / 4)];
-
-            u32 paletteOffset = auxData & 0x3FFF;
-            u16 color0 = palData[paletteOffset*2] | 0x8000;
-            u16 color1 = palData[paletteOffset*2+1] | 0x8000;
-            u16 color2, color3;
-
-            switch ((auxData >> 14) & 0x3)
-            {
-            case 0:
-                color2 = palData[paletteOffset*2+2] | 0x8000;
-                color3 = 0;
-                break;
-            case 1:
-                {
-                    u32 r0 = color0 & 0x001F;
-                    u32 g0 = color0 & 0x03E0;
-                    u32 b0 = color0 & 0x7C00;
-                    u32 r1 = color1 & 0x001F;
-                    u32 g1 = color1 & 0x03E0;
-                    u32 b1 = color1 & 0x7C00;
-
-                    u32 r = (r0 + r1) >> 1;
-                    u32 g = ((g0 + g1) >> 1) & 0x03E0;
-                    u32 b = ((b0 + b1) >> 1) & 0x7C00;
-                    color2 = r | g | b | 0x8000;
-                }
-                color3 = 0;
-                break;
-            case 2:
-                color2 = palData[paletteOffset*2+2] | 0x8000;
-                color3 = palData[paletteOffset*2+3] | 0x8000;
-                break;
-            case 3:
-                {
-                    u32 r0 = color0 & 0x001F;
-                    u32 g0 = color0 & 0x03E0;
-                    u32 b0 = color0 & 0x7C00;
-                    u32 r1 = color1 & 0x001F;
-                    u32 g1 = color1 & 0x03E0;
-                    u32 b1 = color1 & 0x7C00;
-
-                    u32 r = (r0*5 + r1*3) >> 3;
-                    u32 g = ((g0*5 + g1*3) >> 3) & 0x03E0;
-                    u32 b = ((b0*5 + b1*3) >> 3) & 0x7C00;
-
-                    color2 = r | g | b | 0x8000;
-                }
-                {
-                    u32 r0 = color0 & 0x001F;
-                    u32 g0 = color0 & 0x03E0;
-                    u32 b0 = color0 & 0x7C00;
-                    u32 r1 = color1 & 0x001F;
-                    u32 g1 = color1 & 0x03E0;
-                    u32 b1 = color1 & 0x7C00;
-
-                    u32 r = (r0*3 + r1*5) >> 3;
-                    u32 g = ((g0*3 + g1*5) >> 3) & 0x03E0;
-                    u32 b = ((b0*3 + b1*5) >> 3) & 0x7C00;
-
-                    color3 = r | g | b | 0x8000;
-                }
-                break;
-            }
-
-            // in 2020 our default data types are big enough to be used as lookup tables...
-            u64 packed = color0 | ((u64)color1 << 16) | ((u64)color2 << 32) | ((u64)color3 << 48);
-
-            for (int j = 0; j < 4; j++)
-            {
-                for (int i = 0; i < 4; i++)
-                {
-                    u16 color = (packed >> 16 * (data >> 2 * (i + j * 4))) & 0xFFFF;
-                    u32 res;
-                    switch (outputFmt)
-                    {
-                    case outputFmt_RGB6A5: res = ConvertRGB5ToRGB6(color)
-                        | ((color & 0x8000) ? 0x1F000000 : 0); break;
-                    case outputFmt_RGBA8: res = ConvertRGB5ToRGB8(color)
-                        | ((color & 0x8000) ? 0xFF000000 : 0); break;
-                    case outputFmt_BGRA8: res = ConvertRGB5ToBGR8(color)
-                        | ((color & 0x8000) ? 0xFF000000 : 0); break;
-                    }
-                    output[x * 4 + i + (y * 4 + j) * width] = res;
-                }
-            }
-        }
-    }
-}
-
-template <int outputFmt, int X, int Y>
-void ConvertAXIYTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData)
-{
-    for (int y = 0; y < height; y++)
-    {
-        for (int x = 0; x < width; x++)
-        {
-            u8 val = texData[x + y * width];
-
-            u32 idx = val & ((1 << Y) - 1);
-
-            u16 color = palData[idx];
-            u32 alpha = (val >> Y) & ((1 << X) - 1);
-            if (X != 5)
-                alpha = alpha * 4 + alpha / 2;
-
-            u32 res;
-            switch (outputFmt)
-            {
-            case outputFmt_RGB6A5: res = ConvertRGB5ToRGB6(color) | alpha << 24; break;
-            // make sure full alpha == 255
-            case outputFmt_RGBA8: res = ConvertRGB5ToRGB8(color) | (alpha << 27 | (alpha & 0x1C) << 22); break;
-            case outputFmt_BGRA8: res = ConvertRGB5ToBGR8(color) | (alpha << 27 | (alpha & 0x1C) << 22); break;
-            }
-            output[x + y * width] = res;
-        }
-    }
-}
-
-template <int outputFmt, int colorBits>
-void ConvertNColorsTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData, bool color0Transparent)
-{
-    for (int y = 0; y < height; y++)
-    {
-        for (int x = 0; x < width / (8 / colorBits); x++)
-        {
-            u8 val = texData[x + y * (width / (8 / colorBits))];
-
-            for (int i = 0; i < 8 / colorBits; i++)
-            {
-                u32 index = (val >> (i * colorBits)) & ((1 << colorBits) - 1);
-                u16 color = palData[index];
-
-                bool transparent = color0Transparent && index == 0;
-                u32 res;
-                switch (outputFmt)
-                {
-                case outputFmt_RGB6A5: res = ConvertRGB5ToRGB6(color)
-                    | (transparent ? 0 : 0x1F000000); break;
-                case outputFmt_RGBA8: res = ConvertRGB5ToRGB8(color)
-                    | (transparent ? 0 : 0xFF000000); break;
-                case outputFmt_BGRA8: res = ConvertRGB5ToBGR8(color)
-                    | (transparent ? 0 : 0xFF000000); break;
-                }
-                output[x * (8 / colorBits) + y * width + i] = res;
-            }
-        }
-    }
-}
-
-ComputeRenderer::TexCacheEntry& ComputeRenderer::GetTexture(u32 texParam, u32 palBase)
-{
-    // remove sampling and texcoord gen params
-    texParam &= ~0xC00F0000;
-
-    u32 fmt = (texParam >> 26) & 0x7;
-    u64 key = texParam;
-    if (fmt != 7)
-    {
-        key |= (u64)palBase << 32;
-        if (fmt == 5)
-            key &= ~((u64)1 << 29);
-    }
-    //printf("%" PRIx64 " %" PRIx32 " %" PRIx32 "\n", key, texParam, palBase);
-
-    assert(fmt != 0 && "no texture is not a texture format!");
-
-    auto it = TexCache.find(key);
-
-    if (it != TexCache.end())
-        return it->second;
-
-    u32 widthLog2 = (texParam >> 20) & 0x7;
-    u32 heightLog2 = (texParam >> 23) & 0x7;
-    u32 width = 8 << widthLog2;
-    u32 height = 8 << heightLog2;
-
-    u32 addr = (texParam & 0xFFFF) * 8;
-
-    TexCacheEntry entry = {0};
-
-    entry.TextureRAMStart[0] = addr;
-    entry.WidthLog2 = widthLog2;
-    entry.HeightLog2 = heightLog2;
-
-    // apparently a new texture
-    if (fmt == 7)
-    {
-        entry.TextureRAMSize[0] = width*height*2;
-
-        for (u32 i = 0; i < width*height; i++)
-        {
-            u16 value = *(u16*)&GPU::VRAMFlat_Texture[addr + i * 2];
-
-            TextureDecodingBuffer[i] = ConvertRGB5ToRGB6(value) | (value & 0x8000 ? 0x1F000000 : 0);
-        }
-    }
-    else if (fmt == 5)
-    {
-        u8* texData = &GPU::VRAMFlat_Texture[addr];
-        u32 slot1addr = 0x20000 + ((addr & 0x1FFFC) >> 1);
-        if (addr >= 0x40000)
-            slot1addr += 0x10000;
-        u8* texAuxData = &GPU::VRAMFlat_Texture[slot1addr];
-
-        u16* palData = (u16*)(GPU::VRAMFlat_TexPal + palBase*16);
-
-        entry.TextureRAMSize[0] = width*height/16*4;
-        entry.TextureRAMStart[1] = slot1addr;
-        entry.TextureRAMSize[1] = width*height/16*2;
-        entry.TexPalStart = palBase*16;
-        entry.TexPalSize = 0x10000;
-
-        ConvertCompressedTexture<outputFmt_RGB6A5>(width, height, TextureDecodingBuffer, texData, texAuxData, palData);
-    }
-    else
-    {
-        u32 texSize, palAddr = palBase*16, numPalEntries;
-        switch (fmt)
-        {
-        case 1: texSize = width*height; numPalEntries = 32; break;
-        case 6: texSize = width*height; numPalEntries = 8; break;
-        case 2: texSize = width*height/4; numPalEntries = 4; palAddr >>= 1; break;
-        case 3: texSize = width*height/2; numPalEntries = 16; break;
-        case 4: texSize = width*height; numPalEntries = 256; break;
-        }
-
-        palAddr &= 0x1FFFF;
-
-        /*printf("creating texture | fmt: %d | %dx%d | %08x | %08x\n", fmt, width, height, addr, palAddr);
-        svcSleepThread(1000*1000);*/
-
-        entry.TextureRAMSize[0] = texSize;
-        entry.TexPalStart = palAddr;
-        entry.TexPalSize = numPalEntries*2;
-
-        u8* texData = &GPU::VRAMFlat_Texture[addr];
-        u16* palData = (u16*)(GPU::VRAMFlat_TexPal + palAddr);
-
-        //assert(entry.TexPalStart+entry.TexPalSize <= 128*1024*1024);
-
-        bool color0Transparent = texParam & (1 << 29);
-
-        switch (fmt)
-        {
-        case 1: ConvertAXIYTexture<outputFmt_RGB6A5, 3, 5>(width, height, TextureDecodingBuffer, texData, palData); break;
-        case 6: ConvertAXIYTexture<outputFmt_RGB6A5, 5, 3>(width, height, TextureDecodingBuffer, texData, palData); break;
-        case 2: ConvertNColorsTexture<outputFmt_RGB6A5, 2>(width, height, TextureDecodingBuffer, texData, palData, color0Transparent); break;
-        case 3: ConvertNColorsTexture<outputFmt_RGB6A5, 4>(width, height, TextureDecodingBuffer, texData, palData, color0Transparent); break;
-        case 4: ConvertNColorsTexture<outputFmt_RGB6A5, 8>(width, height, TextureDecodingBuffer, texData, palData, color0Transparent); break;
-        }
-    }
-
-    for (int i = 0; i < 2; i++)
-    {
-        if (entry.TextureRAMSize[i])
-            entry.TextureHash[i] = XXH3_64bits(&GPU::VRAMFlat_Texture[entry.TextureRAMStart[i]], entry.TextureRAMSize[i]);
-    }
-    if (entry.TexPalSize)
-        entry.TexPalHash = XXH3_64bits(&GPU::VRAMFlat_TexPal[entry.TexPalStart], entry.TexPalSize);
-
-    auto& texArrays = TexArrays[widthLog2][heightLog2];
-    auto& freeTextures = FreeTextures[widthLog2][heightLog2];
-
-    if (freeTextures.size() == 0)
-    {
-        texArrays.resize(texArrays.size()+1);
-        GLuint& array = texArrays[texArrays.size()-1];
-
-        u32 layers = std::min<u32>((8*1024*1024) / (width*height*4), 64);
-
-        // allocate new array texture
-        glGenTextures(1, &array);
-        glBindTexture(GL_TEXTURE_2D_ARRAY, array);
-        glTexStorage3D(GL_TEXTURE_2D_ARRAY, 1, GL_RGBA8UI, width, height, layers);
-        //printf("allocating new layer set for %d %d %d %d\n", width, height, texArrays.size()-1, array.ImageDescriptor);
-
-        for (u32 i = 0; i < layers; i++)
-        {
-            freeTextures.push_back(TexArrayEntry{array, i});
-        }
-    }
-
-    TexArrayEntry storagePlace = freeTextures[freeTextures.size()-1];
-    freeTextures.pop_back();
-
-    //printf("using storage place %d %d | %d %d (%d)\n", width, height, storagePlace.TexArrayIdx, storagePlace.LayerIdx, array.ImageDescriptor);
-
-    glBindTexture(GL_TEXTURE_2D_ARRAY, storagePlace.TextureID);
-    glTexSubImage3D(GL_TEXTURE_2D_ARRAY,
-        0, 0, 0, storagePlace.Layer,
-        width, height, 1,
-        GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, TextureDecodingBuffer);
-
-    entry.Texture = storagePlace;
-
-    return TexCache.emplace(std::make_pair(key, entry)).first->second;
-}
-
 struct Variant
 {
    GLuint Texture, Sampler;
@ -921,69 +507,8 @@ struct Variant
 void ComputeRenderer::RenderFrame()
 {
    //printf("render frame\n");
-    auto textureDirty = GPU::VRAMDirty_Texture.DeriveState(GPU::VRAMMap_Texture);
-    auto texPalDirty = GPU::VRAMDirty_TexPal.DeriveState(GPU::VRAMMap_TexPal);

-    bool textureChanged = GPU::MakeVRAMFlat_TextureCoherent(textureDirty);
-    bool texPalChanged = GPU::MakeVRAMFlat_TexPalCoherent(texPalDirty);
-
-    if (textureChanged || texPalChanged)
-    {
-        //printf("check invalidation %d\n", TexCache.size());
-        for (auto it = TexCache.begin(); it != TexCache.end();)
-        {
-            TexCacheEntry& entry = it->second;
-            if (textureChanged)
-            {
-                for (u32 i = 0; i < 2; i++)
-                {
-                    u32 startBit = entry.TextureRAMStart[i] / GPU::VRAMDirtyGranularity;
-                    u32 bitsCount = ((entry.TextureRAMStart[i] + entry.TextureRAMSize[i] + GPU::VRAMDirtyGranularity - 1) / GPU::VRAMDirtyGranularity) - startBit;
-
-                    u32 startEntry = startBit >> 6;
-                    u64 entriesCount = ((startBit + bitsCount + 0x3F) >> 6) - startEntry;
-                    for (u32 j = startEntry; j < startEntry + entriesCount; j++)
-                    {
-                        if (GetRangedBitMask(j, startBit, bitsCount) & textureDirty.Data[j])
-                        {
-                            u64 newTexHash = XXH3_64bits(&GPU::VRAMFlat_Texture[entry.TextureRAMStart[i]], entry.TextureRAMSize[i]);
-
-                            if (newTexHash != entry.TextureHash[i])
-                                goto invalidate;
-                        }
-                    }
-                }
-            }
-
-            if (texPalChanged && entry.TexPalSize > 0)
-            {
-                u32 startBit = entry.TexPalStart / GPU::VRAMDirtyGranularity;
-                u32 bitsCount = ((entry.TexPalStart + entry.TexPalSize + GPU::VRAMDirtyGranularity - 1) / GPU::VRAMDirtyGranularity) - startBit;
-
-                u32 startEntry = startBit >> 6;
-                u64 entriesCount = ((startBit + bitsCount + 0x3F) >> 6) - startEntry;
-                for (u32 j = startEntry; j < startEntry + entriesCount; j++)
-                {
-                    if (GetRangedBitMask(j, startBit, bitsCount) & texPalDirty.Data[j])
-                    {
-                        u64 newPalHash = XXH3_64bits(&GPU::VRAMFlat_TexPal[entry.TexPalStart], entry.TexPalSize);
-                        if (newPalHash != entry.TexPalHash)
-                            goto invalidate;
-                    }
-                }
-            }
-
-            it++;
-            continue;
-        invalidate:
-            FreeTextures[entry.WidthLog2][entry.HeightLog2].push_back(entry.Texture);
-
-            //printf("invalidating texture %d\n", entry.ImageDescriptor);
-
-            it = TexCache.erase(it);
-        }
-    }
-    else if (RenderFrameIdentical)
+    if (!Texcache.Update() && RenderFrameIdentical)
    {
        return;
    }
@ -1006,8 +531,6 @@ void ComputeRenderer::RenderFrame()
    u32 numVariants = 0, prevVariant, prevTexLayer;
    Variant variants[MaxVariants];

-    int foundviatexcache = 0, foundviaprev = 0, numslow = 0;
-
    bool enableTextureMaps = RenderDispCnt & (1<<0);

    for (int i = 0; i < RenderNumPolygons; i++)
@ -1033,8 +556,6 @@ void ComputeRenderer::RenderFrame()
                && prevPolygon->TexPalette == polygon->TexPalette
                && (prevPolygon->Attr & 0x30) == (polygon->Attr & 0x30)
                && prevPolygon->IsShadowMask == polygon->IsShadowMask;
-            if (foundVariant)
-                foundviaprev++;
        }

        if (!foundVariant)
@ -1043,30 +564,26 @@ void ComputeRenderer::RenderFrame()
            variant.BlendMode = polygon->IsShadowMask ? 4 : ((polygon->Attr >> 4) & 0x3);
            variant.Texture = 0;
            variant.Sampler = 0;
-            TexCacheEntry* texcacheEntry = nullptr;
+            u32* textureLastVariant = nullptr;
            // we always need to look up the texture to get the layer of the array texture
            if (enableTextureMaps && (polygon->TexParam >> 26) & 0x7)
            {
-                texcacheEntry = &GetTexture(polygon->TexParam, polygon->TexPalette);
+                Texcache.GetTexture(polygon->TexParam, polygon->TexPalette, variant.Texture, prevTexLayer, textureLastVariant);
                bool wrapS = (polygon->TexParam >> 16) & 1;
                bool wrapT = (polygon->TexParam >> 17) & 1;
                bool mirrorS = (polygon->TexParam >> 18) & 1;
                bool mirrorT = (polygon->TexParam >> 19) & 1;
                variant.Sampler = Samplers[(wrapS ? (mirrorS ? 2 : 1) : 0) + (wrapT ? (mirrorT ? 2 : 1) : 0) * 3];
-                variant.Texture = texcacheEntry->Texture.TextureID;
-                prevTexLayer = texcacheEntry->Texture.Layer;

-                if (texcacheEntry->LastVariant < numVariants && variants[texcacheEntry->LastVariant] == variant)
+                if (*textureLastVariant < numVariants && variants[*textureLastVariant] == variant)
                {
                    foundVariant = true;
-                    prevVariant = texcacheEntry->LastVariant;
-                    foundviatexcache++;
+                    prevVariant = *textureLastVariant;
                }
            }

            if (!foundVariant)
            {
-                numslow++;
                for (int j = numVariants - 1; j >= 0; j--)
                {
                    if (variants[j] == variant)
@ -1085,8 +602,8 @@ void ComputeRenderer::RenderFrame()
                assert(numVariants <= MaxVariants);
            foundVariant:;

-                if (texcacheEntry)
-                    texcacheEntry->LastVariant = prevVariant;
+                if (textureLastVariant)
+                    *textureLastVariant = prevVariant;
            }
        }
        RenderPolygons[i].Variant = prevVariant;
--- a/src/GPU3D_Compute.h
+++ b/src/GPU3D_Compute.h
@ -23,9 +23,9 @@

 #include "OpenGLSupport.h"

-#include "NonStupidBitfield.h"
+#include "GPU3D_TexcacheOpenGL.h"

-#include <unordered_map>
+#include "NonStupidBitfield.h"

 namespace GPU3D
 {
@ -179,25 +179,7 @@ private:
    SpanSetupY YSpanSetups[MaxYSpanSetups];
    RenderPolygon RenderPolygons[2048];

-    struct TexArrayEntry
-    {
-        GLuint TextureID;
-        u32 Layer;
-    };
-
-    struct TexCacheEntry
-    {
-        u32 LastVariant; // very cheap way to make variant lookup faster
-
-        u32 TextureRAMStart[2], TextureRAMSize[2];
-        u32 TexPalStart, TexPalSize;
-        u8 WidthLog2, HeightLog2;
-        TexArrayEntry Texture;
-
-        u64 TextureHash[2];
-        u64 TexPalHash;
-    };
-    std::unordered_map<u64, TexCacheEntry> TexCache;
+    TexcacheOpenGL Texcache;

    struct MetaUniform
    {
@ -215,27 +197,19 @@ private:
    };
    GLuint MetaUniformMemory;

-    std::vector<TexArrayEntry> FreeTextures[8][8];
-    std::vector<GLuint> TexArrays[8][8];
-
    GLuint Samplers[9];

-    u32 TextureDecodingBuffer[1024*1024];
-
    GLuint Framebuffer = 0;
    GLuint LowResFramebuffer;
    GLuint PixelBuffer;

    u32 FramebufferCPU[256*192];

-    TexCacheEntry& GetTexture(u32 textureParam, u32 paletteParam);
-
    int ScreenWidth, ScreenHeight;
    int TilesPerLine, TileLines;
    int ScaleFactor = -1;
    int MaxWorkTiles;

-    void ResetTexcache();
    void DeleteShaders();

    void SetupAttrs(SpanSetupY* span, Polygon* poly, int from, int to);
--- a/src/GPU3D_Texcache.cpp
+++ b/src/GPU3D_Texcache.cpp
@ -0,0 +1,269 @@
+#include "GPU3D_Texcache.h"
+
+namespace GPU3D
+{
+
+// Returns the per-channel average of two 15-bit colors (red in bits 0-4,
+// green in bits 5-9, blue in bits 10-14). Bit 15 of the inputs is ignored
+// and is always clear in the result.
+inline u16 ColorAvg(u16 color0, u16 color1)
+{
+    const u32 maskR = 0x001F, maskG = 0x03E0, maskB = 0x7C00;
+
+    // The channels are summed in place; green and blue are re-masked after
+    // halving so a stray low bit cannot slide into the channel below.
+    const u32 avgR = ((color0 & maskR) + (color1 & maskR)) >> 1;
+    const u32 avgG = (((color0 & maskG) + (color1 & maskG)) >> 1) & maskG;
+    const u32 avgB = (((color0 & maskB) + (color1 & maskB)) >> 1) & maskB;
+
+    return avgR | avgG | avgB;
+}
+
+// Blends two 15-bit colors channel by channel, weighting color0 by 5/8 and
+// color1 by 3/8. Bit 15 of the inputs is ignored and is clear in the result.
+inline u16 Color5of3(u16 color0, u16 color1)
+{
+    // red, green and blue channel masks
+    const u32 channelMasks[3] = {0x001F, 0x03E0, 0x7C00};
+
+    u32 blended = 0;
+    for (u32 mask : channelMasks)
+    {
+        // The weighted sum is computed with the channel still in place and
+        // masked afterwards to drop bits shifted below the channel.
+        const u32 weighted = (color0 & mask)*5 + (color1 & mask)*3;
+        blended |= (weighted >> 3) & mask;
+    }
+
+    return blended;
+}
+
+// Blends two 15-bit colors channel by channel, weighting color0 by 3/8 and
+// color1 by 5/8. Bit 15 of the inputs is ignored and is clear in the result.
+inline u16 Color3of5(u16 color0, u16 color1)
+{
+    // 3*color0 + 5*color1 is the same sum as the 5:3 blend with the operands
+    // exchanged, so the arithmetic is shared with Color5of3.
+    return Color5of3(color1, color0);
+}
+
+// Expands a 15-bit color (red in bits 0-4, green in bits 5-9, blue in bits
+// 10-14) to one byte per channel: red in byte 0, green in byte 1, blue in
+// byte 2. Each 5-bit value occupies the top five bits of its byte; the low
+// three bits of every byte and bits 24-31 are zero.
+inline u32 ConvertRGB5ToRGB8(u16 val)
+{
+    const u32 color = val;
+
+    const u32 red = (color & 0x1F) << 3;
+    const u32 green = (color & 0x3E0) << 6;
+    const u32 blue = (color & 0x7C00) << 9;
+
+    return red | green | blue;
+}
+// Expands a 15-bit color (red in bits 0-4, green in bits 5-9, blue in bits
+// 10-14) to one byte per channel with the red and blue bytes swapped relative
+// to ConvertRGB5ToRGB8: blue lands in bits 3-7, green in bits 11-15 and red
+// in bits 19-23. The low three bits of each byte and bits 24-31 stay zero.
+inline u32 ConvertRGB5ToBGR8(u16 val)
+{
+    // Shifting red by 9 and blue by 3 would place both channels on top of
+    // the green byte, so red is moved up by 19 and blue down by 7 instead.
+    return (((u32)val & 0x1F) << 19)
+        | (((u32)val & 0x3E0) << 6)
+        | (((u32)val & 0x7C00) >> 7);
+}
+// Converts a 15-bit color to 6 bits per channel, one channel per byte
+// (red in byte 0, green in byte 1, blue in byte 2; byte 3 is zero).
+// A 5-bit channel value c becomes 0 when c is 0 and 2*c + 1 otherwise.
+inline u32 ConvertRGB5ToRGB6(u16 val)
+{
+    const u32 r5 = val & 0x1F;
+    const u32 g5 = (val >> 5) & 0x1F;
+    const u32 b5 = (val >> 10) & 0x1F;
+
+    const u32 r6 = r5 ? r5*2 + 1 : 0;
+    const u32 g6 = g5 ? g5*2 + 1 : 0;
+    const u32 b6 = b5 ? b5*2 + 1 : 0;
+
+    return r6 | (g6 << 8) | (b6 << 16);
+}
+
+// Decodes a direct-color texture: texData is read as width*height 16-bit
+// texels and each one is written to output as a u32 in the layout selected
+// by outputFmt. Bit 15 of a texel decides between full alpha (0x1F for
+// RGB6A5, 0xFF for the 8-bit layouts) and zero alpha.
+template <int outputFmt>
+void ConvertBitmapTexture(u32 width, u32 height, u32* output, u8* texData)
+{
+    const u16* texels = (const u16*)texData;
+    const u32 numTexels = width*height;
+
+    for (u32 idx = 0; idx < numTexels; idx++)
+    {
+        const u16 texel = texels[idx];
+        const bool opaque = (texel & 0x8000) != 0;
+
+        // outputFmt is a template argument, so only one branch survives
+        if (outputFmt == outputFmt_RGB6A5)
+            output[idx] = ConvertRGB5ToRGB6(texel) | (opaque ? 0x1F000000 : 0);
+        else if (outputFmt == outputFmt_RGBA8)
+            output[idx] = ConvertRGB5ToRGB8(texel) | (opaque ? 0xFF000000 : 0);
+        else if (outputFmt == outputFmt_BGRA8)
+            output[idx] = ConvertRGB5ToBGR8(texel) | (opaque ? 0xFF000000 : 0);
+    }
+}
+
+template void ConvertBitmapTexture<outputFmt_RGB6A5>(u32 width, u32 height, u32* output, u8* texData);
+
+// Decodes a texture stored as 4x4 texel blocks into one u32 per texel in the
+// layout selected by outputFmt.
+//
+// texData holds one u32 per block: sixteen 2-bit color selectors, with the
+// selector of texel (i, j) inside the block at bits 2*(i + j*4).
+// texAuxData holds one u16 per block: bits 0-13 are a palette offset counted
+// in pairs of colors, bits 14-15 choose how colors 2 and 3 are obtained:
+//   0: color 2 from the palette, color 3 transparent
+//   1: color 2 is the average of colors 0 and 1, color 3 transparent
+//   2: colors 2 and 3 from the palette
+//   3: color 2 is the 5:3 blend and color 3 the 3:5 blend of colors 0 and 1
+// output is written row-major with a stride of width texels.
+template <int outputFmt>
+void ConvertCompressedTexture(u32 width, u32 height, u32* output, u8* texData, u8* texAuxData, u16* palData)
+{
+    const u32 blocksPerRow = width / 4;
+
+    // we process a whole block at a time
+    for (u32 y = 0; y < height / 4; y++)
+    {
+        for (u32 x = 0; x < blocksPerRow; x++)
+        {
+            u32 data = ((u32*)texData)[x + y * blocksPerRow];
+            u16 auxData = ((u16*)texAuxData)[x + y * blocksPerRow];
+
+            // bit 15 is used below as the "opaque" marker of each block color
+            u32 paletteOffset = auxData & 0x3FFF;
+            u16 color0 = palData[paletteOffset*2] | 0x8000;
+            u16 color1 = palData[paletteOffset*2+1] | 0x8000;
+            u16 color2, color3;
+
+            switch ((auxData >> 14) & 0x3)
+            {
+            case 0:
+                color2 = palData[paletteOffset*2+2] | 0x8000;
+                color3 = 0;
+                break;
+            case 1:
+                color2 = ColorAvg(color0, color1) | 0x8000;
+                color3 = 0;
+                break;
+            case 2:
+                color2 = palData[paletteOffset*2+2] | 0x8000;
+                color3 = palData[paletteOffset*2+3] | 0x8000;
+                break;
+            case 3:
+                color2 = Color5of3(color0, color1) | 0x8000;
+                color3 = Color3of5(color0, color1) | 0x8000;
+                break;
+            }
+
+            // in 2020 our default data types are big enough to be used as lookup tables...
+            u64 packed = color0 | ((u64)color1 << 16) | ((u64)color2 << 32) | ((u64)color3 << 48);
+
+            for (u32 j = 0; j < 4; j++)
+            {
+                for (u32 i = 0; i < 4; i++)
+                {
+                    // Keep only this texel's two selector bits; without the
+                    // mask the selectors of the following texels would leak
+                    // into the shift count and push it past 63 bits.
+                    u32 selector = (data >> 2 * (i + j * 4)) & 0x3;
+                    u16 color = (packed >> (16 * selector)) & 0xFFFF;
+                    u32 res;
+                    switch (outputFmt)
+                    {
+                    case outputFmt_RGB6A5: res = ConvertRGB5ToRGB6(color)
+                        | ((color & 0x8000) ? 0x1F000000 : 0); break;
+                    case outputFmt_RGBA8: res = ConvertRGB5ToRGB8(color)
+                        | ((color & 0x8000) ? 0xFF000000 : 0); break;
+                    case outputFmt_BGRA8: res = ConvertRGB5ToBGR8(color)
+                        | ((color & 0x8000) ? 0xFF000000 : 0); break;
+                    }
+                    output[x * 4 + i + (y * 4 + j) * width] = res;
+                }
+            }
+        }
+    }
+}
+
+template void ConvertCompressedTexture<outputFmt_RGB6A5>(u32, u32, u32*, u8*, u8*, u16*);
+
+// Decodes a texture with one byte per texel, where the low Y bits of each
+// byte are a palette index and the next X bits are an alpha value.
+// An alpha narrower than 5 bits is widened with alpha*4 + alpha/2.
+// Each texel is written to output as a u32 in the layout selected by
+// outputFmt, row-major with a stride of width texels.
+template <int outputFmt, int X, int Y>
+void ConvertAXIYTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData)
+{
+    const u32 indexMask = (1 << Y) - 1;
+    const u32 alphaMask = (1 << X) - 1;
+
+    for (u32 row = 0; row < height; row++)
+    {
+        const u8* srcRow = texData + row * width;
+        u32* dstRow = output + row * width;
+
+        for (u32 col = 0; col < width; col++)
+        {
+            const u8 texel = srcRow[col];
+            const u16 color = palData[texel & indexMask];
+
+            u32 alpha = (texel >> Y) & alphaMask;
+            if (X != 5)
+                alpha = alpha * 4 + alpha / 2;
+
+            // 8-bit alpha: the 5-bit value in the top bits of the byte, with
+            // its upper three bits repeated below so that 31 becomes 255
+            const u32 alpha8 = (alpha << 27) | ((alpha & 0x1C) << 22);
+
+            u32 res;
+            switch (outputFmt)
+            {
+            case outputFmt_RGB6A5: res = ConvertRGB5ToRGB6(color) | (alpha << 24); break;
+            case outputFmt_RGBA8: res = ConvertRGB5ToRGB8(color) | alpha8; break;
+            case outputFmt_BGRA8: res = ConvertRGB5ToBGR8(color) | alpha8; break;
+            }
+            dstRow[col] = res;
+        }
+    }
+}
+
+template void ConvertAXIYTexture<outputFmt_RGB6A5, 5, 3>(u32, u32, u32*, u8*, u16*);
+template void ConvertAXIYTexture<outputFmt_RGB6A5, 3, 5>(u32, u32, u32*, u8*, u16*);
+
+// Decodes a paletted texture that packs 8/colorBits texels into every byte,
+// lowest bits first. Each palette index is looked up in palData and written
+// to output as a u32 in the layout selected by outputFmt, row-major with a
+// stride of width texels. Texels are fully opaque, except that index 0 gets
+// zero alpha when color0Transparent is set.
+template <int outputFmt, int colorBits>
+void ConvertNColorsTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData, bool color0Transparent)
+{
+    const int texelsPerByte = 8 / colorBits;
+    const u32 indexMask = (1 << colorBits) - 1;
+    const u32 bytesPerRow = width / texelsPerByte;
+
+    for (u32 row = 0; row < height; row++)
+    {
+        const u8* packedRow = texData + row * bytesPerRow;
+        u32* outRow = output + row * width;
+
+        for (u32 byteIdx = 0; byteIdx < bytesPerRow; byteIdx++)
+        {
+            const u8 packed = packedRow[byteIdx];
+
+            for (int sub = 0; sub < texelsPerByte; sub++)
+            {
+                const u32 index = (packed >> (sub * colorBits)) & indexMask;
+                const u16 color = palData[index];
+                const bool opaque = !(color0Transparent && index == 0);
+
+                u32 res;
+                switch (outputFmt)
+                {
+                case outputFmt_RGB6A5:
+                    res = ConvertRGB5ToRGB6(color) | (opaque ? 0x1F000000 : 0);
+                    break;
+                case outputFmt_RGBA8:
+                    res = ConvertRGB5ToRGB8(color) | (opaque ? 0xFF000000 : 0);
+                    break;
+                case outputFmt_BGRA8:
+                    res = ConvertRGB5ToBGR8(color) | (opaque ? 0xFF000000 : 0);
+                    break;
+                }
+                outRow[byteIdx * texelsPerByte + sub] = res;
+            }
+        }
+    }
+}
+
+template void ConvertNColorsTexture<outputFmt_RGB6A5, 2>(u32, u32, u32*, u8*, u16*, bool);
+template void ConvertNColorsTexture<outputFmt_RGB6A5, 4>(u32, u32, u32*, u8*, u16*, bool);
+template void ConvertNColorsTexture<outputFmt_RGB6A5, 8>(u32, u32, u32*, u8*, u16*, bool);
+
+}
--- a/src/GPU3D_Texcache.h
+++ b/src/GPU3D_Texcache.h
@ -0,0 +1,309 @@
+#ifndef GPU3D_TEXCACHE
+#define GPU3D_TEXCACHE
+
+#include "types.h"
+#include "GPU.h"
+
+#include <assert.h>
+#include <algorithm>
+#include <unordered_map>
+#include <vector>
+
+#define XXH_STATIC_LINKING_ONLY
+#include "xxhash/xxhash.h"
+
+namespace GPU3D
+{
+
+/// Texture width in texels: 8 shifted left by the 3-bit field at bits 20-22 of texparam.
+inline u32 TextureWidth(u32 texparam)
+{
+    const u32 widthLog2 = (texparam >> 20) & 0x7;
+    return 8 << widthLog2;
+}
+
+/// Texture height in texels: 8 shifted left by the 3-bit field at bits 23-25 of texparam.
+inline u32 TextureHeight(u32 texparam)
+{
+    const u32 heightLog2 = (texparam >> 23) & 0x7;
+    return 8 << heightLog2;
+}
+
+/// Texel layouts the Convert*Texture decoders can produce (one u32 per texel).
+/// Used as the `outputFmt` template argument.
+enum
+{
+    outputFmt_RGB6A5 = 0, // colour via ConvertRGB5ToRGB6, opaque alpha is 0x1F in the top byte
+    outputFmt_RGBA8 = 1,  // colour via ConvertRGB5ToRGB8, opaque alpha is 0xFF in the top byte
+    outputFmt_BGRA8 = 2   // colour via ConvertRGB5ToBGR8, opaque alpha is 0xFF in the top byte
+};
+
+// Texture decoders. Each writes width * height texels to `output` in the layout
+// selected by `outputFmt` (one of the outputFmt_* constants).
+
+/// Decoder used for texture format 7; takes no palette.
+template <int outputFmt>
+void ConvertBitmapTexture(
+    u32 width, u32 height,
+    u32* output,
+    u8* texData);
+
+/// Decoder used for texture format 5; reads a second data block (texAuxData) and a palette.
+template <int outputFmt>
+void ConvertCompressedTexture(
+    u32 width, u32 height,
+    u32* output,
+    u8* texData, u8* texAuxData,
+    u16* palData);
+
+/// Decoder used for texture formats 1 (<3, 5>) and 6 (<5, 3>).
+template <int outputFmt, int X, int Y>
+void ConvertAXIYTexture(
+    u32 width, u32 height,
+    u32* output,
+    u8* texData,
+    u16* palData);
+
+/// Decoder used for texture formats 2, 3 and 4 (colorBits = 2, 4 and 8).
+template <int outputFmt, int colorBits>
+void ConvertNColorsTexture(
+    u32 width, u32 height,
+    u32* output,
+    u8* texData,
+    u16* palData,
+    bool color0Transparent);
+
+/// Cache of decoded textures, keyed by the texture parameter word (plus the
+/// palette base for every format except 7). Decoded textures are stored as
+/// layers of array textures, with one pool of array textures per texture size.
+///
+/// TexLoaderT supplies the backend operations used here:
+///   - GenerateTexture(width, height, layers), returning a TexHandleT
+///   - UploadTexture(handle, width, height, layer, data)
+///   - DeleteTexture(handle)
+/// TexHandleT is the backend's handle type for one array texture.
+template <typename TexLoaderT, typename TexHandleT>
+class Texcache
+{
+public:
+    /// Stores a copy of `texloader`; all backend calls go through that copy.
+    Texcache(const TexLoaderT& texloader)
+        : TexLoader(texloader) // TODO: consider taking the loader by value and moving it in
+    {}
+
+    /// Derives the texture/palette dirty state, passes it to
+    /// GPU::MakeVRAMFlat_TextureCoherent / GPU::MakeVRAMFlat_TexPalCoherent and
+    /// evicts every cache entry whose source bytes now hash differently.
+    /// Evicted layers are returned to the free list of their size.
+    ///
+    /// @return true if either coherency call reported a change (even when no
+    ///         entry had to be evicted), false otherwise.
+    bool Update()
+    {
+        auto textureDirty = GPU::VRAMDirty_Texture.DeriveState(GPU::VRAMMap_Texture);
+        auto texPalDirty = GPU::VRAMDirty_TexPal.DeriveState(GPU::VRAMMap_TexPal);
+
+        bool textureChanged = GPU::MakeVRAMFlat_TextureCoherent(textureDirty);
+        bool texPalChanged = GPU::MakeVRAMFlat_TexPalCoherent(texPalDirty);
+
+        if (!textureChanged && !texPalChanged)
+            return false;
+
+        for (auto it = Cache.begin(); it != Cache.end();)
+        {
+            TexCacheEntry& entry = it->second;
+            bool stale = false;
+
+            if (textureChanged)
+            {
+                // an entry has up to two source ranges; an unused range has size 0
+                // and therefore never overlaps a dirty bit
+                for (u32 i = 0; i < 2 && !stale; i++)
+                {
+                    // A set dirty bit alone does not evict the entry: the range is
+                    // rehashed (once) and only a differing hash makes it stale.
+                    if (IsRangeDirty(textureDirty, entry.TextureRAMStart[i], entry.TextureRAMSize[i]))
+                    {
+                        u64 newTexHash = XXH3_64bits(&GPU::VRAMFlat_Texture[entry.TextureRAMStart[i]], entry.TextureRAMSize[i]);
+                        stale = newTexHash != entry.TextureHash[i];
+                    }
+                }
+            }
+
+            if (!stale && texPalChanged && entry.TexPalSize > 0)
+            {
+                if (IsRangeDirty(texPalDirty, entry.TexPalStart, entry.TexPalSize))
+                {
+                    u64 newPalHash = XXH3_64bits(&GPU::VRAMFlat_TexPal[entry.TexPalStart], entry.TexPalSize);
+                    stale = newPalHash != entry.TexPalHash;
+                }
+            }
+
+            if (stale)
+            {
+                // hand the layer back so a later texture of the same size can reuse it
+                FreeTextures[entry.WidthLog2][entry.HeightLog2].push_back(entry.Texture);
+                it = Cache.erase(it);
+            }
+            else
+            {
+                ++it;
+            }
+        }
+
+        return true;
+    }
+
+    /// Looks up the texture described by texParam/palBase, decoding and uploading
+    /// it first if it is not cached yet.
+    ///
+    /// @param texParam       texture parameter word: address in bits 0-15 (units of
+    ///                       8 bytes), width/height exponents in bits 20-22 / 23-25,
+    ///                       format in bits 26-28, colour-0-transparent flag in bit 29
+    /// @param palBase        palette base; scaled by 16 (by 8 for format 2) to get a
+    ///                       byte offset into the flat palette memory
+    /// @param textureHandle  out: handle of the array texture holding the texture
+    /// @param layer          out: layer inside that array texture
+    /// @param helper         out: pointer to the cache entry's LastVariant field;
+    ///                       it points into the cache and is only usable while the
+    ///                       entry has not been evicted by Update() or Reset()
+    ///
+    /// Format 0 is not a valid input (asserted).
+    void GetTexture(u32 texParam, u32 palBase, TexHandleT& textureHandle, u32& layer, u32*& helper)
+    {
+        // remove sampling and texcoord gen params
+        texParam &= ~0xC00F0000;
+
+        u32 fmt = (texParam >> 26) & 0x7;
+        u64 key = texParam;
+        if (fmt != 7)
+        {
+            // format 7 is decoded without a palette, so only the other formats
+            // are distinguished by palette base
+            key |= (u64)palBase << 32;
+            // the compressed decoder is not given the colour-0-transparent flag,
+            // so bit 29 must not split otherwise identical entries
+            if (fmt == 5)
+                key &= ~((u64)1 << 29);
+        }
+
+        assert(fmt != 0 && "no texture is not a texture format!");
+
+        auto it = Cache.find(key);
+
+        if (it != Cache.end())
+        {
+            textureHandle = it->second.Texture.TextureID;
+            layer = it->second.Texture.Layer;
+            helper = &it->second.LastVariant;
+            return;
+        }
+
+        // not cached yet: decode into DecodingBuffer, then upload
+        u32 widthLog2 = (texParam >> 20) & 0x7;
+        u32 heightLog2 = (texParam >> 23) & 0x7;
+        u32 width = 8 << widthLog2;
+        u32 height = 8 << heightLog2;
+
+        u32 addr = (texParam & 0xFFFF) * 8;
+
+        TexCacheEntry entry = {};
+
+        entry.TextureRAMStart[0] = addr;
+        entry.WidthLog2 = widthLog2;
+        entry.HeightLog2 = heightLog2;
+
+        if (fmt == 7)
+        {
+            // 2 bytes per texel, no palette
+            entry.TextureRAMSize[0] = width*height*2;
+
+            ConvertBitmapTexture<outputFmt_RGB6A5>(width, height, DecodingBuffer, &GPU::VRAMFlat_Texture[addr]);
+        }
+        else if (fmt == 5)
+        {
+            u8* texData = &GPU::VRAMFlat_Texture[addr];
+            // the auxiliary data lives at a second address derived from the first
+            u32 slot1addr = 0x20000 + ((addr & 0x1FFFC) >> 1);
+            if (addr >= 0x40000)
+                slot1addr += 0x10000;
+            u8* texAuxData = &GPU::VRAMFlat_Texture[slot1addr];
+
+            u16* palData = (u16*)(GPU::VRAMFlat_TexPal + palBase*16);
+
+            entry.TextureRAMSize[0] = width*height/16*4;
+            entry.TextureRAMStart[1] = slot1addr;
+            entry.TextureRAMSize[1] = width*height/16*2;
+            entry.TexPalStart = palBase*16;
+            // NOTE(review): a fixed 0x10000-byte palette window is hashed starting at
+            // palBase*16 -- confirm VRAMFlat_TexPal is large enough for every palBase.
+            entry.TexPalSize = 0x10000;
+
+            ConvertCompressedTexture<outputFmt_RGB6A5>(width, height, DecodingBuffer, texData, texAuxData, palData);
+        }
+        else
+        {
+            // zero-initialised so an unexpected format (asserts compiled out) tracks
+            // empty ranges instead of reading indeterminate sizes
+            u32 texSize = 0, palAddr = palBase*16, numPalEntries = 0;
+            switch (fmt)
+            {
+            case 1: texSize = width*height; numPalEntries = 32; break;
+            case 6: texSize = width*height; numPalEntries = 8; break;
+            case 2: texSize = width*height/4; numPalEntries = 4; palAddr >>= 1; break;
+            case 3: texSize = width*height/2; numPalEntries = 16; break;
+            case 4: texSize = width*height; numPalEntries = 256; break;
+            default: break;
+            }
+
+            palAddr &= 0x1FFFF;
+
+            entry.TextureRAMSize[0] = texSize;
+            entry.TexPalStart = palAddr;
+            entry.TexPalSize = numPalEntries*2;
+
+            u8* texData = &GPU::VRAMFlat_Texture[addr];
+            u16* palData = (u16*)(GPU::VRAMFlat_TexPal + palAddr);
+
+            bool color0Transparent = texParam & (1 << 29);
+
+            switch (fmt)
+            {
+            case 1: ConvertAXIYTexture<outputFmt_RGB6A5, 3, 5>(width, height, DecodingBuffer, texData, palData); break;
+            case 6: ConvertAXIYTexture<outputFmt_RGB6A5, 5, 3>(width, height, DecodingBuffer, texData, palData); break;
+            case 2: ConvertNColorsTexture<outputFmt_RGB6A5, 2>(width, height, DecodingBuffer, texData, palData, color0Transparent); break;
+            case 3: ConvertNColorsTexture<outputFmt_RGB6A5, 4>(width, height, DecodingBuffer, texData, palData, color0Transparent); break;
+            case 4: ConvertNColorsTexture<outputFmt_RGB6A5, 8>(width, height, DecodingBuffer, texData, palData, color0Transparent); break;
+            default: break;
+            }
+        }
+
+        // remember hashes of the source bytes so Update() can tell real changes
+        // apart from writes that left the data identical
+        for (int i = 0; i < 2; i++)
+        {
+            if (entry.TextureRAMSize[i])
+                entry.TextureHash[i] = XXH3_64bits(&GPU::VRAMFlat_Texture[entry.TextureRAMStart[i]], entry.TextureRAMSize[i]);
+        }
+        if (entry.TexPalSize)
+            entry.TexPalHash = XXH3_64bits(&GPU::VRAMFlat_TexPal[entry.TexPalStart], entry.TexPalSize);
+
+        auto& texArrays = TexArrays[widthLog2][heightLog2];
+        auto& freeTextures = FreeTextures[widthLog2][heightLog2];
+
+        if (freeTextures.empty())
+        {
+            // no free layer of this size: allocate a new array texture of at most
+            // 8 MB worth of 4-byte texels, capped at 64 layers
+            u32 layers = std::min<u32>((8*1024*1024) / (width*height*4), 64);
+
+            // the handle type comes from the template parameter, not from a backend
+            TexHandleT array = TexLoader.GenerateTexture(width, height, layers);
+            texArrays.push_back(array);
+
+            for (u32 i = 0; i < layers; i++)
+            {
+                freeTextures.push_back(TexArrayEntry{array, i});
+            }
+        }
+
+        TexArrayEntry storagePlace = freeTextures.back();
+        freeTextures.pop_back();
+
+        entry.Texture = storagePlace;
+
+        TexLoader.UploadTexture(storagePlace.TextureID, width, height, storagePlace.Layer, DecodingBuffer);
+
+        textureHandle = storagePlace.TextureID;
+        layer = storagePlace.Layer;
+        helper = &Cache.emplace(key, entry).first->second.LastVariant;
+    }
+
+    /// Deletes every array texture through the loader and empties the cache and
+    /// all free lists.
+    void Reset()
+    {
+        for (u32 i = 0; i < 8; i++)
+        {
+            for (u32 j = 0; j < 8; j++)
+            {
+                for (const TexHandleT& handle : TexArrays[i][j])
+                    TexLoader.DeleteTexture(handle);
+                TexArrays[i][j].clear();
+                FreeTextures[i][j].clear();
+            }
+        }
+        Cache.clear();
+    }
+private:
+    /// One layer of one array texture.
+    struct TexArrayEntry
+    {
+        TexHandleT TextureID;
+        u32 Layer;
+    };
+
+    struct TexCacheEntry
+    {
+        u32 LastVariant; // very cheap way to make variant lookup faster
+
+        // source ranges in the flat texture / palette memory; a size of 0 means unused
+        u32 TextureRAMStart[2], TextureRAMSize[2];
+        u32 TexPalStart, TexPalSize;
+        u8 WidthLog2, HeightLog2;
+        TexArrayEntry Texture;
+
+        // hashes of the source ranges taken when the texture was decoded
+        u64 TextureHash[2];
+        u64 TexPalHash;
+    };
+
+    /// Returns true if any dirty bit covering the byte range [start, start + size)
+    /// is set in `dirty`. One bit covers GPU::VRAMDirtyGranularity bytes and the
+    /// bits are stored 64 per element of dirty.Data.
+    template <typename DirtyT>
+    static bool IsRangeDirty(const DirtyT& dirty, u32 start, u32 size)
+    {
+        u32 startBit = start / GPU::VRAMDirtyGranularity;
+        u32 bitsCount = ((start + size + GPU::VRAMDirtyGranularity - 1) / GPU::VRAMDirtyGranularity) - startBit;
+
+        u32 startEntry = startBit >> 6;
+        u32 endEntry = (startBit + bitsCount + 0x3F) >> 6;
+        for (u32 j = startEntry; j < endEntry; j++)
+        {
+            if (GetRangedBitMask(j, startBit, bitsCount) & dirty.Data[j])
+                return true;
+        }
+        return false;
+    }
+
+    std::unordered_map<u64, TexCacheEntry> Cache;
+
+    TexLoaderT TexLoader;
+
+    // both indexed by [widthLog2][heightLog2]
+    std::vector<TexArrayEntry> FreeTextures[8][8];
+    std::vector<TexHandleT> TexArrays[8][8];
+
+    // scratch space the decoders write into before upload (1024*1024 texels)
+    u32 DecodingBuffer[1024*1024];
+};
+
+}
+
+#endif
--- a/src/GPU3D_TexcacheOpenGL.cpp
+++ b/src/GPU3D_TexcacheOpenGL.cpp
@ -0,0 +1,29 @@
+#include "GPU3D_TexcacheOpenGL.h"
+
+namespace GPU3D
+{
+
+// Creates a GL_TEXTURE_2D_ARRAY texture with `layers` layers of width x height
+// GL_RGBA8UI texels (a single mip level) and returns its name.
+// The new texture is left bound to GL_TEXTURE_2D_ARRAY.
+GLuint TexcacheOpenGLLoader::GenerateTexture(u32 width, u32 height, u32 layers)
+{
+    const GLsizei mipLevels = 1;
+
+    GLuint newArray;
+    glGenTextures(1, &newArray);
+    glBindTexture(GL_TEXTURE_2D_ARRAY, newArray);
+    glTexStorage3D(GL_TEXTURE_2D_ARRAY, mipLevels, GL_RGBA8UI, width, height, layers);
+
+    return newArray;
+}
+
+// Copies width x height texels from `data` into layer `layer` of the array
+// texture `handle` (mip level 0, starting at the origin).
+// The texture is left bound to GL_TEXTURE_2D_ARRAY.
+void TexcacheOpenGLLoader::UploadTexture(GLuint handle, u32 width, u32 height, u32 layer, void* data)
+{
+    const GLint mipLevel = 0;
+    const GLint xOffset = 0;
+    const GLint yOffset = 0;
+    const GLsizei layerCount = 1;
+
+    glBindTexture(GL_TEXTURE_2D_ARRAY, handle);
+    glTexSubImage3D(GL_TEXTURE_2D_ARRAY,
+        mipLevel, xOffset, yOffset, layer,
+        width, height, layerCount,
+        GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, data);
+}
+
+// Deletes the texture named by `handle`.
+void TexcacheOpenGLLoader::DeleteTexture(GLuint handle)
+{
+    const GLuint doomed[1] = {handle};
+    glDeleteTextures(1, doomed);
+}
+
+}
--- a/src/GPU3D_TexcacheOpenGL.h
+++ b/src/GPU3D_TexcacheOpenGL.h
@ -0,0 +1,25 @@
+#ifndef GPU3D_TEXCACHEOPENGL
+#define GPU3D_TEXCACHEOPENGL
+
+#include "GPU3D_Texcache.h"
+#include "OpenGLSupport.h"
+
+namespace GPU3D
+{
+
+// Forward declaration of the generic texture cache (loader type, handle type).
+template <typename TexLoaderT, typename TexHandleT>
+class Texcache;
+
+/// Loader used to instantiate Texcache for OpenGL: textures are identified by
+/// GLuint texture names and are addressed as (array texture, layer).
+class TexcacheOpenGLLoader
+{
+public:
+    /// Creates an array texture with `layers` layers of width x height texels
+    /// and returns its name.
+    GLuint GenerateTexture(u32 width, u32 height, u32 layers);
+
+    /// Uploads width x height texels from `data` into layer `layer` of `handle`.
+    void UploadTexture(GLuint handle, u32 width, u32 height, u32 layer, void* data);
+
+    /// Deletes the texture named by `handle`.
+    void DeleteTexture(GLuint handle);
+};
+
+/// Texture cache instantiated with the OpenGL loader and GLuint handles.
+using TexcacheOpenGL =
+    Texcache<TexcacheOpenGLLoader, GLuint>;
+
+}
+
+#endif
--- a/src/NDS.cpp
+++ b/src/NDS.cpp
@ -33,6 +33,7 @@
 #include "AREngine.h"
 #include "Platform.h"
 #include "FreeBIOS.h"
+#include "GPU3D.h"

 #ifdef JIT_ENABLED
 #include "ARMJIT.h"