From 9826afa063ac4d8ec030c54c643337d6e7225790 Mon Sep 17 00:00:00 2001
From: Flyinghead <raphael.jean@gmail.com>
Date: Fri, 7 Feb 2020 16:55:32 +0100
Subject: [PATCH] upload all texture mipmap levels to gpu

implemented for gl/gl4/vulkan/oit
don't auto-generate mipmaps in vulkan
simpler/smaller detwiddle table
use std::lock_guard with cMutex
---
 core/hw/mem/vmem32.cpp                |  21 +-
 core/rend/TexCache.cpp                | 313 ++++++++++++++------------
 core/rend/TexCache.h                  | 113 ++++++----
 core/rend/gl4/gles.cpp                |   9 +-
 core/rend/gles/gles.cpp               |   1 +
 core/rend/gles/gles.h                 |   3 +-
 core/rend/gles/gltex.cpp              |  65 +++++-
 core/rend/vulkan/oit/oit_renderer.cpp |   6 +-
 core/rend/vulkan/texture.cpp          | 104 +++------
 core/rend/vulkan/texture.h            |   5 +-
 core/rend/vulkan/vmu.cpp              |   2 +-
 core/rend/vulkan/vulkan_renderer.cpp  |   6 +-
 core/stdclass.h                       |   3 +
 core/wsi/gl_context.cpp               |   4 +
 core/wsi/gl_context.h                 |   2 +
 15 files changed, 362 insertions(+), 295 deletions(-)
diff --git a/core/hw/mem/vmem32.cpp b/core/hw/mem/vmem32.cpp
index 7a82ececd..3d0e211f9 100644
--- a/core/hw/mem/vmem32.cpp
+++ b/core/hw/mem/vmem32.cpp
@@ -19,6 +19,7 @@
     along with reicast.  If not, see <https://www.gnu.org/licenses/>.
  */
 #include <unordered_set>
+#include <mutex>
 #include "build.h"
 #include "vmem32.h"
 #include "_vmem.h"
@@ -267,20 +268,20 @@ static u32 vmem32_map_mmu(u32 address, bool write)
 			u32 end = start + page_size;
 			const vector<vram_lock>& blocks = vram_blocks[start / VRAM_PROT_SEGMENT];
 
-			vramlist_lock.Lock();
-			for (int i = blocks.size() - 1; i >= 0; i--)
 			{
-				if (blocks[i].start < end && blocks[i].end >= start)
+				std::lock_guard<cMutex> lock(vramlist_lock);
+				for (int i = blocks.size() - 1; i >= 0; i--)
 				{
-					u32 prot_start = max(start, blocks[i].start);
-					u32 prot_size = min(end, blocks[i].end + 1) - prot_start;
-					prot_size += prot_start % PAGE_SIZE;
-					prot_start &= ~PAGE_MASK;
-					vmem32_protect_buffer(vpn + (prot_start & (page_size - 1)), prot_size);
+					if (blocks[i].start < end && blocks[i].end >= start)
+					{
+						u32 prot_start = max(start, blocks[i].start);
+						u32 prot_size = min(end, blocks[i].end + 1) - prot_start;
+						prot_size += prot_start % PAGE_SIZE;
+						prot_start &= ~PAGE_MASK;
+						vmem32_protect_buffer(vpn + (prot_start & (page_size - 1)), prot_size);
+					}
 				}
 			}
-			vramlist_lock.Unlock();
-
 		}
 		else if (offset >= MAP_RAM_START_OFFSET && offset < MAP_RAM_START_OFFSET + RAM_SIZE)
 		{
diff --git a/core/rend/TexCache.cpp b/core/rend/TexCache.cpp
index 80d3fca62..57fa01bcc 100644
--- a/core/rend/TexCache.cpp
+++ b/core/rend/TexCache.cpp
@@ -1,4 +1,5 @@
 #include <algorithm>
+#include <mutex>
 #ifndef TARGET_NO_OPENMP
 #include <omp.h>
 #endif
@@ -22,60 +23,23 @@ u32 palette32_ram[1024];
 u32 pal_hash_256[4];
 u32 pal_hash_16[64];
 
-u32 detwiddle[2][8][1024];
-//input : address in the yyyyyxxxxx format
-//output : address in the xyxyxyxy format
-//U : x resolution , V : y resolution
-//twiddle works on 64b words
+u32 detwiddle[1024];
 
-
-static u32 twiddle_slow(u32 x,u32 y,u32 x_sz,u32 y_sz)
+void BuildTwiddleTable()
 {
-	u32 rv=0;//low 2 bits are directly passed  -> needs some misc stuff to work.However
-			 //Pvr internally maps the 64b banks "as if" they were twiddled :p
-
-	u32 sh=0;
-	x_sz>>=1;
-	y_sz>>=1;
-	while(x_sz!=0 || y_sz!=0)
-	{
-		if (y_sz)
-		{
-			u32 temp=y&1;
-			rv|=temp<<sh;
-
-			y_sz>>=1;
-			y>>=1;
-			sh++;
-		}
-		if (x_sz)
-		{
-			u32 temp=x&1;
-			rv|=temp<<sh;
-
-			x_sz>>=1;
-			x>>=1;
-			sh++;
-		}
-	}	
-	return rv;
+    // Precompute, for every table index, the same value with its low ten bits
+    // spread apart: bit b of the index ends up on bit 2*b of the entry.
+    for (u32 idx = 0; idx < ARRAY_SIZE(detwiddle); idx++)
+    {
+        u32 spread = 0;
+        for (int bit = 0; bit < 10; bit++)
+            spread |= (idx & (1u << bit)) << bit;
+
+        detwiddle[idx] = spread;
+    }
 }
 
-static void BuildTwiddleTables()
-{
-	for (u32 s=0;s<8;s++)
-	{
-		u32 x_sz=1024;
-		u32 y_sz=8<<s;
-		for (u32 i=0;i<x_sz;i++)
-		{
-			detwiddle[0][s][i]=twiddle_slow(i,0,x_sz,y_sz);
-			detwiddle[1][s][i]=twiddle_slow(0,i,y_sz,x_sz);
-		}
-	}
-}
-
-static OnLoad btt(&BuildTwiddleTables);
+static OnLoad btt(&BuildTwiddleTable);
 
 void palette_update()
 {
@@ -196,12 +160,10 @@ vram_block* libCore_vramlock_Lock(u32 start_offset64,u32 end_offset64,void* user
 	block->type=64;
 
 	{
-		vramlist_lock.Lock();
+		std::lock_guard<cMutex> lock(vramlist_lock);
 
 		// This also protects vram if needed
 		vramlock_list_add(block);
-
-		vramlist_lock.Unlock();
 	}
 
 	return block;
@@ -216,7 +178,7 @@ bool VramLockedWriteOffset(size_t offset)
 	vector<vram_block *>& list = VramLocks[addr_hash];
 
 	{
-		vramlist_lock.Lock();
+		std::lock_guard<cMutex> lock(vramlist_lock);
 
 		for (size_t i = 0; i < list.size(); i++)
 		{
@@ -235,8 +197,6 @@ bool VramLockedWriteOffset(size_t offset)
 		list.clear();
 
 		_vmem_unprotect_vram((u32)(offset & ~PAGE_MASK), PAGE_SIZE);
-
-		vramlist_lock.Unlock();
 	}
 
 	return true;
@@ -254,9 +214,8 @@ bool VramLockedWrite(u8* address)
 //also frees the handle
 void libCore_vramlock_Unlock_block(vram_block* block)
 {
-	vramlist_lock.Lock();
+	std::lock_guard<cMutex> lock(vramlist_lock);	// held for the whole call, released automatically on return
 	libCore_vramlock_Unlock_block_wb(block);
-	vramlist_lock.Unlock();
 }
 
 void libCore_vramlock_Unlock_block_wb(vram_block* block)
@@ -409,8 +368,11 @@ static const PvrTexInfo format[8] =
 	{"ns/1555", 0},																														// Not supported (1555)
 };
 
-static const u32 MipPoint[8] =
+static const u32 VQMipPoint[11] =
 {
+	0x00000,//1
+	0x00001,//2
+	0x00002,//4
 	0x00006,//8
 	0x00016,//16
 	0x00056,//32
@@ -420,6 +382,20 @@ static const u32 MipPoint[8] =
 	0x05556,//512
 	0x15556//1024
 };
+static const u32 OtherMipPoint[11] =
+{
+	0x00003,//1
+	0x00004,//2
+	0x00008,//4
+	0x00018,//8
+	0x00058,//16
+	0x00158,//32
+	0x00558,//64
+	0x01558,//128
+	0x05558,//256
+	0x15558,//512
+	0x55558//1024
+};
 
 static const TextureType PAL_TYPE[4] = {
 	TextureType::_5551, TextureType::_565, TextureType::_4444, TextureType::_8888
@@ -496,71 +472,50 @@ void BaseTextureCacheData::Create()
 	else if (tex->bpp == 8)
 		palette_index = (tcw.PalSelect >> 4) << 8;
 
-	//VQ table (if VQ tex)
-	if (tcw.VQ_Comp)
-		vq_codebook = sa;
-
-	//Convert a pvr texture into OpenGL
-	switch (tcw.PixelFmt)
+	if (tcw.ScanOrder && (tex->PL || tex->PL32))
 	{
+		//Texture is stored 'planar' in memory, no deswizzle is needed
+		//verify(tcw.VQ_Comp==0);
+		if (tcw.VQ_Comp != 0)
+			WARN_LOG(RENDERER, "Warning: planar texture with VQ set (invalid)");
 
-	case Pixel1555: 	//0     1555 value: 1 bit; RGB values: 5 bits each
-	case PixelReserved: //7     Reserved        Regarded as 1555
-	case Pixel565: 		//1     565      R value: 5 bits; G value: 6 bits; B value: 5 bits
-	case Pixel4444: 	//2     4444 value: 4 bits; RGB values: 4 bits each
-	case PixelYUV:		//3     YUV422 32 bits per 2 pixels; YUYV values: 8 bits each
-	case PixelBumpMap:	//4		Bump Map 	16 bits/pixel; S value: 8 bits; R value: 8 bits
-	case PixelPal4:		//5     4 BPP Palette   Palette texture with 4 bits/pixel
-	case PixelPal8:		//6     8 BPP Palette   Palette texture with 8 bits/pixel
-		if (tcw.ScanOrder && (tex->PL || tex->PL32))
+		//Planar textures support stride selection, mostly used for non power of 2 textures (videos)
+		int stride = w;
+		if (tcw.StrideSel)
+			stride = (TEXT_CONTROL & 31) * 32;
+
+		//Call the format specific conversion code
+		texconv = tex->PL;
+		texconv32 = tex->PL32;
+		//calculate the size, in bytes, for the locking
+		size = stride * h * tex->bpp / 8;
+	}
+	else
+	{
+		// Quake 3 Arena uses a non-square mipmapped texture
+		if (tcw.MipMapped)
+			// Mipmapped texture must be square and TexV is ignored
+			h = w;
+
+		if (tcw.VQ_Comp)
 		{
-			//Texture is stored 'planar' in memory, no deswizzle is needed
-			//verify(tcw.VQ_Comp==0);
-			if (tcw.VQ_Comp != 0)
-				WARN_LOG(RENDERER, "Warning: planar texture with VQ set (invalid)");
-
-			//Planar textures support stride selection, mostly used for non power of 2 textures (videos)
-			int stride = w;
-			if (tcw.StrideSel)
-				stride = (TEXT_CONTROL & 31) * 32;
-
-			//Call the format specific conversion code
-			texconv = tex->PL;
-			texconv32 = tex->PL32;
-			//calculate the size, in bytes, for the locking
-			size = stride * h * tex->bpp / 8;
+			verify(tex->VQ != NULL || tex->VQ32 != NULL);
+			vq_codebook = sa;
+			if (tcw.MipMapped)
+				sa += VQMipPoint[tsp.TexU + 3];
+			texconv = tex->VQ;
+			texconv32 = tex->VQ32;
+			size = w * h / 8;
 		}
 		else
 		{
-			// Quake 3 Arena uses one. Not sure if valid but no need to crash
-			//verify(w == h || !tcw.MipMapped); // are non square mipmaps supported ? i can't recall right now *WARN*
-
-			if (tcw.VQ_Comp)
-			{
-				verify(tex->VQ != NULL || tex->VQ32 != NULL);
-				vq_codebook = sa;
-				if (tcw.MipMapped)
-					sa += MipPoint[tsp.TexU];
-				texconv = tex->VQ;
-				texconv32 = tex->VQ32;
-				size = w * h / 8;
-			}
-			else
-			{
-				verify(tex->TW != NULL || tex->TW32 != NULL);
-				if (tcw.MipMapped)
-					sa += MipPoint[tsp.TexU] * tex->bpp / 2;
-				texconv = tex->TW;
-				texconv32 = tex->TW32;
-				size = w * h * tex->bpp / 8;
-			}
+			verify(tex->TW != NULL || tex->TW32 != NULL);
+			if (tcw.MipMapped)
+				sa += OtherMipPoint[tsp.TexU + 3] * tex->bpp / 8;
+			texconv = tex->TW;
+			texconv32 = tex->TW32;
+			size = w * h * tex->bpp / 8;
 		}
-		break;
-	default:
-		WARN_LOG(RENDERER, "Unhandled texture format %d", tcw.PixelFmt);
-		size = w * h * 2;
-		texconv = NULL;
-		texconv32 = NULL;
 	}
 }
 
@@ -631,58 +586,119 @@ void BaseTextureCacheData::Update()
 	PixelBuffer<u32> pb32;
 
 	// Figure out if we really need to use a 32-bit pixel buffer
+	bool textureUpscaling = settings.rend.TextureUpscale > 1
+			// Don't process textures that are too big
+			&& w * h <= settings.rend.MaxFilteredTextureSize * settings.rend.MaxFilteredTextureSize
+			// Don't process YUV textures
+			&& tcw.PixelFmt != PixelYUV;
 	bool need_32bit_buffer = true;
-	if ((settings.rend.TextureUpscale <= 1
-			|| w * h > settings.rend.MaxFilteredTextureSize
-				* settings.rend.MaxFilteredTextureSize		// Don't process textures that are too big
-			|| tcw.PixelFmt == PixelYUV)					// Don't process YUV textures
+	if (!textureUpscaling
 		&& (!IsPaletted() || tex_type != TextureType::_8888)
 		&& texconv != NULL
 		&& !Force32BitTexture(tex_type))
 		need_32bit_buffer = false;
 	// TODO avoid upscaling/depost. textures that change too often
 
+	bool mipmapped = IsMipmapped() && settings.rend.UseMipmaps;
+
 	if (texconv32 != NULL && need_32bit_buffer)
 	{
+		if (textureUpscaling)
+			// don't use mipmaps if upscaling
+			mipmapped = false;
 		// Force the texture type since that's the only 32-bit one we know
 		tex_type = TextureType::_8888;
 
-		pb32.init(w, h);
-
-		texconv32(&pb32, (u8*)&vram[sa], stride, h);
+		if (mipmapped)
+		{
+			pb32.init(w, h, true);
+			for (int i = 0; i <= tsp.TexU + 3; i++)
+			{
+				pb32.set_mipmap(i);
+				u32 vram_addr;
+				if (tcw.VQ_Comp)
+				{
+					vram_addr = sa_tex + VQMipPoint[i];
+					if (i == 0)
+					{
+						PixelBuffer<u32> pb0;
+						pb0.init(2, 2 ,false);
+						texconv32(&pb0, (u8*)&vram[vram_addr], 2, 2);
+						*pb32.data() = *pb0.data(1, 1);
+						continue;
+					}
+				}
+				else
+					vram_addr = sa_tex + OtherMipPoint[i] * tex->bpp / 8;
+				texconv32(&pb32, (u8*)&vram[vram_addr], 1 << i, 1 << i);
+			}
+			pb32.set_mipmap(0);
+		}
+		else
+		{
+			pb32.init(w, h);
+			texconv32(&pb32, (u8*)&vram[sa], stride, h);
 
 #ifdef DEPOSTERIZE
-		{
-			// Deposterization
-			PixelBuffer<u32> tmp_buf;
-			tmp_buf.init(w, h);
+			{
+				// Deposterization
+				PixelBuffer<u32> tmp_buf;
+				tmp_buf.init(w, h);
 
-			DePosterize(pb32.data(), tmp_buf.data(), w, h);
-			pb32.steal_data(tmp_buf);
-		}
+				DePosterize(pb32.data(), tmp_buf.data(), w, h);
+				pb32.steal_data(tmp_buf);
+			}
 #endif
 
-		// xBRZ scaling
-		if (settings.rend.TextureUpscale > 1)
-		{
-			PixelBuffer<u32> tmp_buf;
-			tmp_buf.init(w * settings.rend.TextureUpscale, h * settings.rend.TextureUpscale);
+			// xBRZ scaling
+			if (textureUpscaling)
+			{
+				PixelBuffer<u32> tmp_buf;
+				tmp_buf.init(w * settings.rend.TextureUpscale, h * settings.rend.TextureUpscale);
 
-			if (tcw.PixelFmt == Pixel1555 || tcw.PixelFmt == Pixel4444)
-				// Alpha channel formats. Palettes with alpha are already handled
-				has_alpha = true;
-			UpscalexBRZ(settings.rend.TextureUpscale, pb32.data(), tmp_buf.data(), w, h, has_alpha);
-			pb32.steal_data(tmp_buf);
-			upscaled_w *= settings.rend.TextureUpscale;
-			upscaled_h *= settings.rend.TextureUpscale;
+				if (tcw.PixelFmt == Pixel1555 || tcw.PixelFmt == Pixel4444)
+					// Alpha channel formats. Palettes with alpha are already handled
+					has_alpha = true;
+				UpscalexBRZ(settings.rend.TextureUpscale, pb32.data(), tmp_buf.data(), w, h, has_alpha);
+				pb32.steal_data(tmp_buf);
+				upscaled_w *= settings.rend.TextureUpscale;
+				upscaled_h *= settings.rend.TextureUpscale;
+			}
 		}
 		temp_tex_buffer = pb32.data();
 	}
 	else if (texconv != NULL)
 	{
-		pb16.init(w, h);
-
-		texconv(&pb16,(u8*)&vram[sa],stride,h);
+		if (mipmapped)
+		{
+			pb16.init(w, h, true);
+			for (int i = 0; i <= tsp.TexU + 3; i++)
+			{
+				pb16.set_mipmap(i);
+				u32 vram_addr;
+				if (tcw.VQ_Comp)
+				{
+					vram_addr = sa_tex + VQMipPoint[i];
+					if (i == 0)
+					{
+						PixelBuffer<u16> pb0;
+						pb0.init(2, 2 ,false);
+						texconv(&pb0, (u8*)&vram[vram_addr], 2, 2);
+						*pb16.data() = *pb0.data(1, 1);
+						continue;
+					}
+				}
+				else
+					vram_addr = sa_tex + OtherMipPoint[i] * tex->bpp / 8;
+				texconv(&pb16, (u8*)&vram[vram_addr], 1 << i, 1 << i);
+			}
+			pb16.set_mipmap(0);
+		}
+		else
+		{
+			pb16.init(w, h);
+			texconv(&pb16,(u8*)&vram[sa],stride,h);
+		}
 		temp_tex_buffer = pb16.data();
 	}
 	else
@@ -692,6 +708,7 @@ void BaseTextureCacheData::Update()
 		pb16.init(w, h);
 		memset(pb16.data(), 0x80, w * h * 2);
 		temp_tex_buffer = pb16.data();
+		mipmapped = false;
 	}
 	// Restore the original texture height if it was constrained to VRAM limits above
 	h = original_h;
@@ -699,7 +716,7 @@ void BaseTextureCacheData::Update()
 	//lock the texture to detect changes in it
 	lock_block = libCore_vramlock_Lock(sa_tex,sa+size-1,this);
 
-	UploadToGPU(upscaled_w, upscaled_h, (u8*)temp_tex_buffer);
+	UploadToGPU(upscaled_w, upscaled_h, (u8*)temp_tex_buffer, mipmapped);
 	if (settings.rend.DumpTextures)
 	{
 		ComputeHash();
@@ -713,7 +730,7 @@ void BaseTextureCacheData::CheckCustomTexture()
 	if (custom_load_in_progress == 0 && custom_image_data != NULL)
 	{
 		tex_type = TextureType::_8888;
-		UploadToGPU(custom_width, custom_height, custom_image_data);
+		UploadToGPU(custom_width, custom_height, custom_image_data, false);
 		delete [] custom_image_data;
 		custom_image_data = NULL;
 	}
diff --git a/core/rend/TexCache.h b/core/rend/TexCache.h
index 41021af1d..fae7e3db7 100644
--- a/core/rend/TexCache.h
+++ b/core/rend/TexCache.h
@@ -17,32 +17,46 @@ extern u32 pal_hash_256[4];
 extern u32 pal_hash_16[64];
 extern bool KillTex;
 
-extern u32 detwiddle[2][8][1024];
+extern u32 detwiddle[1024];
 
 template<class pixel_type>
 class PixelBuffer
 {
-	pixel_type* p_buffer_start;
-	pixel_type* p_current_line;
-	pixel_type* p_current_pixel;
+	pixel_type* p_buffer_start = nullptr;
+	pixel_type* p_current_mipmap = nullptr;
+	pixel_type* p_current_line = nullptr;
+	pixel_type* p_current_pixel = nullptr;
 
 	u32 pixels_per_line = 0;
 
 public:
-	PixelBuffer()
-	{
-		p_buffer_start = p_current_line = p_current_pixel = NULL;
-	}
-
 	~PixelBuffer()
 	{
 		deinit();
 	}
 
+	void init(u32 width, u32 height, bool mipmapped)	// allocate the image plus, if mipmapped, every smaller level
+	{
+		deinit();
+		size_t size = width * height * sizeof(pixel_type);
+		if (mipmapped)
+		{
+			do	// add each halved level until a dimension reaches zero
+			{
+				width /= 2;
+				height /= 2;
+				size += width * height * sizeof(pixel_type);
+			}
+			while (width != 0 && height != 0);
+		}
+		p_buffer_start = p_current_line = p_current_pixel = p_current_mipmap = (pixel_type *)malloc(size);
+		this->pixels_per_line = mipmapped ? 1 : width;	// mip chain starts at its 1-pixel level; a plain buffer keeps its real pitch (width is untouched when !mipmapped)
+	}
+
 	void init(u32 width, u32 height)
 	{
 		deinit();
-		p_buffer_start = p_current_line = p_current_pixel = (pixel_type *)malloc(width * height * sizeof(pixel_type));
+		p_buffer_start = p_current_line = p_current_pixel = p_current_mipmap = (pixel_type *)malloc(width * height * sizeof(pixel_type));	// single image: every cursor, including the mipmap one, starts at the buffer start
 		this->pixels_per_line = width;
 	}
 
@@ -51,47 +65,56 @@ public:
 		if (p_buffer_start != NULL)
 		{
 			free(p_buffer_start);
-			p_buffer_start = p_current_line = p_current_pixel = NULL;
+			p_buffer_start = p_current_mipmap = p_current_line = p_current_pixel = NULL;
 		}
 	}
 
 	void steal_data(PixelBuffer &buffer)
 	{
 		deinit();
-		p_buffer_start = p_current_line = p_current_pixel = buffer.p_buffer_start;
+		p_buffer_start = p_current_mipmap = p_current_line = p_current_pixel = buffer.p_buffer_start;	// take over the donor's allocation; all cursors rewind to its start
 		pixels_per_line = buffer.pixels_per_line;
-		buffer.p_buffer_start = buffer.p_current_line = buffer.p_current_pixel = NULL;
+		buffer.p_buffer_start = buffer.p_current_mipmap = buffer.p_current_line = buffer.p_current_pixel = NULL;	// detach the donor (not ourselves) so its deinit() won't free the stolen block
+	}
+
+	void set_mipmap(int level)
+	{
+		size_t skipped = 0;	// pixels occupied by the levels stored ahead of `level`
+		for (int lvl = level - 1; lvl >= 0; lvl--)
+			skipped += (1 << (2 * lvl));	// level lvl takes (1 << lvl) * (1 << lvl) pixels
+		pixels_per_line = 1 << level;
+		p_current_mipmap = p_current_line = p_current_pixel = p_buffer_start + skipped;
+	}
 
 	__forceinline pixel_type *data(u32 x = 0, u32 y = 0)
 	{
-		return p_buffer_start + pixels_per_line * y + x;
+		return p_current_mipmap + pixels_per_line * y + x;	// relative to the currently selected mip level (the buffer start unless set_mipmap() moved it)
 	}
 
-	__forceinline void prel(u32 x,pixel_type value)
+	__forceinline void prel(u32 x, pixel_type value)
 	{
-		p_current_pixel[x]=value;
+		p_current_pixel[x] = value;
 	}
 
-	__forceinline void prel(u32 x,u32 y,pixel_type value)
+	__forceinline void prel(u32 x, u32 y, pixel_type value)
 	{
-		p_current_pixel[y*pixels_per_line+x]=value;
+		p_current_pixel[y * pixels_per_line + x] = value;
 	}
 
 	__forceinline void rmovex(u32 value)
 	{
-		p_current_pixel+=value;
+		p_current_pixel += value;
 	}
 	__forceinline void rmovey(u32 value)
 	{
-		p_current_line+=pixels_per_line*value;
-		p_current_pixel=p_current_line;
+		p_current_line += pixels_per_line * value;
+		p_current_pixel = p_current_line;
 	}
-	__forceinline void amove(u32 x_m,u32 y_m)
+	__forceinline void amove(u32 x_m, u32 y_m)
 	{
 		//p_current_pixel=p_buffer_start;
-		p_current_line=p_buffer_start+pixels_per_line*y_m;
-		p_current_pixel=p_current_line + x_m;
+		p_current_line = p_current_mipmap + pixels_per_line * y_m;	// absolute move within the current mip level, not the whole buffer
+		p_current_pixel = p_current_line + x_m;
 	}
 };
 
@@ -145,8 +168,6 @@ __forceinline u32 YUV422(s32 Y,s32 Yu,s32 Yv)
 	return PixelPacker::packRGB(clamp(0,255,R),clamp(0,255,G),clamp(0,255,B));
 }
 
-#define twop(x,y,bcx,bcy) (detwiddle[0][bcy][x]+detwiddle[1][bcx][y])
-
 //pixel packers !
 struct pp_565
 {
@@ -496,24 +517,23 @@ void texture_PL(PixelBuffer<pixel_type>* pb,u8* p_in,u32 Width,u32 Height)
 	}
 }
 
+static inline u32 get_tw_texel_position(u32 x, u32 y)
+{
+    return (detwiddle[x] << 1) | detwiddle[y];	// interleave: spread x shifted up by one bit, OR'd with spread y
+}
+
 template<class PixelConvertor, class pixel_type>
 void texture_TW(PixelBuffer<pixel_type>* pb,u8* p_in,u32 Width,u32 Height)
 {
 	pb->amove(0,0);
 
-	const u32 divider=PixelConvertor::xpp*PixelConvertor::ypp;
+	const u32 divider = PixelConvertor::xpp * PixelConvertor::ypp;
 
-	unsigned long bcx_,bcy_;
-	bcx_=bitscanrev(Width);
-	bcy_=bitscanrev(Height);
-	const u32 bcx=bcx_-3;
-	const u32 bcy=bcy_-3;
-
-	for (u32 y=0;y<Height;y+=PixelConvertor::ypp)
+	for (u32 y = 0; y < Height; y += PixelConvertor::ypp)
 	{
-		for (u32 x=0;x<Width;x+=PixelConvertor::xpp)
+		for (u32 x = 0; x < Width; x += PixelConvertor::xpp)
 		{
-			u8* p = &p_in[(twop(x,y,bcx,bcy)/divider)<<3];
+			u8* p = &p_in[get_tw_texel_position(x, y) / divider * 8];
 			PixelConvertor::Convert(pb,p);
 
 			pb->rmovex(PixelConvertor::xpp);
@@ -528,18 +548,14 @@ void texture_VQ(PixelBuffer<pixel_type>* pb,u8* p_in,u32 Width,u32 Height)
 	p_in+=256*4*2;
 	pb->amove(0,0);
 
-	const u32 divider=PixelConvertor::xpp*PixelConvertor::ypp;
-	unsigned long bcx_,bcy_;
-	bcx_=bitscanrev(Width);
-	bcy_=bitscanrev(Height);
-	const u32 bcx=bcx_-3;
-	const u32 bcy=bcy_-3;
+	Height /= PixelConvertor::ypp;
+	Width /= PixelConvertor::xpp;
 
-	for (u32 y=0;y<Height;y+=PixelConvertor::ypp)
+	for (u32 y = 0; y < Height; y++)
 	{
-		for (u32 x=0;x<Width;x+=PixelConvertor::xpp)
+		for (u32 x = 0; x < Width; x++)
 		{
-			u8 p = p_in[twop(x,y,bcx,bcy)/divider];
+			u8 p = p_in[get_tw_texel_position(x, y)];
 			PixelConvertor::Convert(pb,&vq_codebook[p*8]);
 
 			pb->rmovex(PixelConvertor::xpp);
@@ -670,6 +686,11 @@ struct BaseTextureCacheData
 		return tcw.PixelFmt == PixelPal4 || tcw.PixelFmt == PixelPal8;
 	}
 
+	bool IsMipmapped()
+	{
+		return tcw.ScanOrder == 0 && tcw.MipMapped != 0;	// the MipMapped flag only counts when ScanOrder is clear
+	}
+
 	const char* GetPixelFormatName()
 	{
 		switch (tcw.PixelFmt)
@@ -688,7 +709,7 @@ struct BaseTextureCacheData
 	void Create();
 	void ComputeHash();
 	void Update();
-	virtual void UploadToGPU(int width, int height, u8 *temp_tex_buffer) = 0;
+	virtual void UploadToGPU(int width, int height, u8 *temp_tex_buffer, bool mipmapped) = 0;
 	virtual bool Force32BitTexture(TextureType type) const { return false; }
 	void CheckCustomTexture();
 	//true if : dirty or paletted texture and hashes don't match
diff --git a/core/rend/gl4/gles.cpp b/core/rend/gl4/gles.cpp
index 29de1697c..89ec73fe9 100644
--- a/core/rend/gl4/gles.cpp
+++ b/core/rend/gl4/gles.cpp
@@ -501,8 +501,6 @@ static bool gl_create_resources()
 		// Assume the resources have already been created
 		return true;
 
-	findGLVersion();
-
 	//create vao
 	glGenVertexArrays(1, &gl4.vbo.main_vao);
 	glGenVertexArrays(1, &gl4.vbo.modvol_vao);
@@ -538,11 +536,8 @@ extern void gl4CreateTextures(int width, int height);
 
 static bool gles_init()
 {
-	int major = 0;
-	int minor = 0;
-	glGetIntegerv(GL_MAJOR_VERSION, &major);
-	glGetIntegerv(GL_MINOR_VERSION, &minor);
-	if (major < 4 || (major == 4 && minor < 3))
+	findGLVersion();
+	if (gl.gl_major < 4 || (gl.gl_major == 4 && gl.gl_minor < 3))
 	{
 		WARN_LOG(RENDERER, "Warning: OpenGL version doesn't support per-pixel sorting.");
 		return false;
diff --git a/core/rend/gles/gles.cpp b/core/rend/gles/gles.cpp
index 5e21fc105..af7e5d143 100644
--- a/core/rend/gles/gles.cpp
+++ b/core/rend/gles/gles.cpp
@@ -469,6 +469,7 @@ void findGLVersion()
 {
 	gl.index_type = GL_UNSIGNED_INT;
 	gl.gl_major = theGLContext.GetMajorVersion();
+	gl.gl_minor = theGLContext.GetMinorVersion();
 	gl.is_gles = theGLContext.IsGLES();
 	if (gl.is_gles)
 	{
diff --git a/core/rend/gles/gles.h b/core/rend/gles/gles.h
index f6451ba66..2da72ad96 100755
--- a/core/rend/gles/gles.h
+++ b/core/rend/gles/gles.h
@@ -95,6 +95,7 @@ struct gl_ctx
 	const char *gl_version;
 	const char *glsl_version_header;
 	int gl_major;
+	int gl_minor;
 	bool is_gles;
 	GLuint fog_image_format;
 	GLenum index_type;
@@ -200,7 +201,7 @@ struct TextureCacheData : BaseTextureCacheData
 	GLuint texID;   //gl texture
 	u16* pData;
 	virtual std::string GetId() override { return std::to_string(texID); }
-	virtual void UploadToGPU(int width, int height, u8 *temp_tex_buffer) override;
+	virtual void UploadToGPU(int width, int height, u8 *temp_tex_buffer, bool mipmapped) override;
 	virtual bool Delete() override;
 };
 
diff --git a/core/rend/gles/gltex.cpp b/core/rend/gles/gltex.cpp
index d8825d432..4622e81ba 100644
--- a/core/rend/gles/gltex.cpp
+++ b/core/rend/gles/gltex.cpp
@@ -73,7 +73,7 @@ static void dumpRtTexture(u32 name, u32 w, u32 h) {
 	free(rows);
 }
 
-void TextureCacheData::UploadToGPU(int width, int height, u8 *temp_tex_buffer)
+void TextureCacheData::UploadToGPU(int width, int height, u8 *temp_tex_buffer, bool mipmapped)
 {
 	if (texID != 0)
 	{
@@ -100,9 +100,66 @@ void TextureCacheData::UploadToGPU(int width, int height, u8 *temp_tex_buffer)
 			die("Unsupported texture type");
 			break;
 		}
-		glTexImage2D(GL_TEXTURE_2D, 0,comps, width, height, 0, comps, gltype, temp_tex_buffer);
-		if (tcw.MipMapped && settings.rend.UseMipmaps)
-			glGenerateMipmap(GL_TEXTURE_2D);
+		if (mipmapped)
+		{
+			int mipmapLevels = 0;
+			int dim = width;
+			while (dim != 0)
+			{
+				mipmapLevels++;
+				dim >>= 1;
+			}
+#ifndef GLES2
+			// Open GL 4.2 or GLES 3.0 min
+			if (gl.gl_major > 4 || (gl.gl_major == 4 && gl.gl_minor >= 2)
+					|| (gl.is_gles && gl.gl_major >= 3))
+			{
+				GLuint internalFormat;
+				switch (tex_type)
+				{
+				case TextureType::_5551:
+					internalFormat = GL_RGB5_A1;
+					break;
+				case TextureType::_565:
+					internalFormat = GL_RGB565;
+					break;
+				case TextureType::_4444:
+					internalFormat = GL_RGBA4;
+					break;
+				case TextureType::_8888:
+					internalFormat = GL_RGBA8;
+					break;
+				}
+				if (Updates == 1)
+				{
+					glTexStorage2D(GL_TEXTURE_2D, mipmapLevels, internalFormat, width, height);
+					glCheck();
+				}
+				for (int i = 0; i < mipmapLevels; i++)
+				{
+					glTexSubImage2D(GL_TEXTURE_2D, mipmapLevels - i - 1, 0, 0, 1 << i, 1 << i, comps, gltype, temp_tex_buffer);
+					temp_tex_buffer += (1 << (2 * i)) * (tex_type == TextureType::_8888 ? 4 : 2);
+				}
+			}
+			else
+#endif
+			{
+				glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0);
+				glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, mipmapLevels - 1);
+				for (int i = 0; i < mipmapLevels; i++)
+				{
+					glTexImage2D(GL_TEXTURE_2D, mipmapLevels - i - 1, comps, 1 << i, 1 << i, 0, comps, gltype, temp_tex_buffer);
+					temp_tex_buffer += (1 << (2 * i)) * (tex_type == TextureType::_8888 ? 4 : 2);
+				}
+			}
+		}
+		else
+		{
+			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, 0);
+			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0);
+			glTexImage2D(GL_TEXTURE_2D, 0,comps, width, height, 0, comps, gltype, temp_tex_buffer);
+		}
+		glCheck();
 	}
 	else {
 		#if FEAT_HAS_SOFTREND
diff --git a/core/rend/vulkan/oit/oit_renderer.cpp b/core/rend/vulkan/oit/oit_renderer.cpp
index bcc0607f0..68a859592 100644
--- a/core/rend/vulkan/oit/oit_renderer.cpp
+++ b/core/rend/vulkan/oit/oit_renderer.cpp
@@ -65,7 +65,7 @@ public:
 					vjoyTexture->SetPhysicalDevice(GetContext()->GetPhysicalDevice());
 					vjoyTexture->SetDevice(GetContext()->GetDevice());
 					vjoyTexture->SetCommandBuffer(texCommandPool.Allocate());
-					vjoyTexture->UploadToGPU(OSD_TEX_W, OSD_TEX_H, image_data);
+					vjoyTexture->UploadToGPU(OSD_TEX_W, OSD_TEX_H, image_data, false);
 					vjoyTexture->SetCommandBuffer(nullptr);
 					texCommandPool.EndFrame();
 					delete [] image_data;
@@ -135,7 +135,7 @@ public:
 			curTexture->SetDevice(GetContext()->GetDevice());
 		}
 		curTexture->SetCommandBuffer(texCommandPool.Allocate());
-		curTexture->UploadToGPU(width, height, (u8*)pb.data());
+		curTexture->UploadToGPU(width, height, (u8*)pb.data(), false);
 		curTexture->SetCommandBuffer(nullptr);
 		texCommandPool.EndFrame();
 
@@ -287,7 +287,7 @@ private:
 		MakeFogTexture(texData);
 		fogTexture->SetCommandBuffer(texCommandPool.Allocate());
 
-		fogTexture->UploadToGPU(128, 2, texData);
+		fogTexture->UploadToGPU(128, 2, texData, false);
 
 		fogTexture->SetCommandBuffer(nullptr);
 	}
diff --git a/core/rend/vulkan/texture.cpp b/core/rend/vulkan/texture.cpp
index 2e4038632..fd3c3cbf8 100644
--- a/core/rend/vulkan/texture.cpp
+++ b/core/rend/vulkan/texture.cpp
@@ -143,7 +143,7 @@ void setImageLayout(vk::CommandBuffer const& commandBuffer, vk::Image image, vk:
 	commandBuffer.pipelineBarrier(sourceStage, destinationStage, {}, nullptr, nullptr, imageMemoryBarrier);
 }
 
-void Texture::UploadToGPU(int width, int height, u8 *data)
+void Texture::UploadToGPU(int width, int height, u8 *data, bool mipmapped)
 {
 	vk::Format format;
 	u32 dataSize = width * height * 2;
@@ -167,20 +167,31 @@ void Texture::UploadToGPU(int width, int height, u8 *data)
 		dataSize /= 2;
 		break;
 	}
+	if (mipmapped)
+	{
+		int w = width / 2;
+		u32 size = dataSize / 4;
+		while (w)
+		{
+			dataSize += size;
+			size /= 4;
+			w /= 2;
+		}
+	}
 	bool isNew = true;
 	if (width != extent.width || height != extent.height || format != this->format)
-		Init(width, height, format);
+		Init(width, height, format, dataSize);
 	else
 		isNew = false;
 	SetImage(dataSize, data, isNew);
 }
 
-void Texture::Init(u32 width, u32 height, vk::Format format)
+void Texture::Init(u32 width, u32 height, vk::Format format, u32 dataSize)
 {
 	this->extent = vk::Extent2D(width, height);
 	this->format = format;
 	mipmapLevels = 1;
-	if (tcw.MipMapped && settings.rend.UseMipmaps)
+	if (IsMipmapped() && settings.rend.UseMipmaps)
 		mipmapLevels += floor(log2(std::max(width, height)));
 
 	vk::FormatProperties formatProperties = physicalDevice.getFormatProperties(format);
@@ -195,7 +206,7 @@ void Texture::Init(u32 width, u32 height, vk::Format format)
 	vk::ImageUsageFlags usageFlags = vk::ImageUsageFlagBits::eSampled;
 	if (needsStaging)
 	{
-		stagingBufferData = std::unique_ptr<BufferData>(new BufferData(extent.width * extent.height * 4, vk::BufferUsageFlagBits::eTransferSrc));
+		stagingBufferData = std::unique_ptr<BufferData>(new BufferData(dataSize, vk::BufferUsageFlagBits::eTransferSrc));
 		usageFlags |= vk::ImageUsageFlagBits::eTransferDst;
 		initialLayout = vk::ImageLayout::eUndefined;
 		requirements = vk::MemoryPropertyFlagBits::eDeviceLocal;
@@ -206,8 +217,6 @@ void Texture::Init(u32 width, u32 height, vk::Format format)
 		initialLayout = vk::ImageLayout::ePreinitialized;
 		requirements = vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostVisible;
 	}
-	if (mipmapLevels > 1)
-		usageFlags |= vk::ImageUsageFlagBits::eTransferSrc | vk::ImageUsageFlagBits::eTransferDst;
 	CreateImage(imageTiling, usageFlags, initialLayout, requirements, vk::ImageAspectFlagBits::eColor);
 }
 
@@ -252,77 +261,34 @@ void Texture::SetImage(u32 srcSize, void *srcData, bool isNew)
 		// Since we're going to blit to the texture image, set its layout to eTransferDstOptimal
 		setImageLayout(commandBuffer, image.get(), format, mipmapLevels, isNew ? vk::ImageLayout::eUndefined : vk::ImageLayout::eShaderReadOnlyOptimal,
 				vk::ImageLayout::eTransferDstOptimal);
-		vk::BufferImageCopy copyRegion(0, extent.width, extent.height, vk::ImageSubresourceLayers(vk::ImageAspectFlagBits::eColor, 0, 0, 1), vk::Offset3D(0, 0, 0), vk::Extent3D(extent, 1));
-		commandBuffer.copyBufferToImage(stagingBufferData->buffer.get(), image.get(), vk::ImageLayout::eTransferDstOptimal, copyRegion);
 		if (mipmapLevels > 1)
-			GenerateMipmaps();
+		{
+			vk::DeviceSize bufferOffset = 0;
+			for (int i = 0; i < mipmapLevels; i++)
+			{
+				vk::BufferImageCopy copyRegion(bufferOffset, 1 << i, 1 << i, vk::ImageSubresourceLayers(vk::ImageAspectFlagBits::eColor, mipmapLevels - i - 1, 0, 1),
+						vk::Offset3D(0, 0, 0), vk::Extent3D(1 << i, 1 << i, 1));
+				commandBuffer.copyBufferToImage(stagingBufferData->buffer.get(), image.get(), vk::ImageLayout::eTransferDstOptimal, copyRegion);
+				bufferOffset += (1 << (2 * i)) * (tex_type == TextureType::_8888 ? 4 : 2);
+			}
+		}
 		else
-			// Set the layout for the texture image from eTransferDstOptimal to SHADER_READ_ONLY
-			setImageLayout(commandBuffer, image.get(), format, mipmapLevels, vk::ImageLayout::eTransferDstOptimal, vk::ImageLayout::eShaderReadOnlyOptimal);
+		{
+			vk::BufferImageCopy copyRegion(0, extent.width, extent.height, vk::ImageSubresourceLayers(vk::ImageAspectFlagBits::eColor, 0, 0, 1),
+					vk::Offset3D(0, 0, 0), vk::Extent3D(extent, 1));
+			commandBuffer.copyBufferToImage(stagingBufferData->buffer.get(), image.get(), vk::ImageLayout::eTransferDstOptimal, copyRegion);
+		}
+		// Set the layout for the texture image from eTransferDstOptimal to eShaderReadOnlyOptimal
+		setImageLayout(commandBuffer, image.get(), format, mipmapLevels, vk::ImageLayout::eTransferDstOptimal, vk::ImageLayout::eShaderReadOnlyOptimal);
 	}
 	else
 	{
-		if (mipmapLevels > 1)
-			GenerateMipmaps();
-		else
-			// If we can use the linear tiled image as a texture, just do it
-			setImageLayout(commandBuffer, image.get(), format, mipmapLevels, vk::ImageLayout::ePreinitialized, vk::ImageLayout::eShaderReadOnlyOptimal);
+		// If we can use the linear tiled image as a texture, just do it
+		setImageLayout(commandBuffer, image.get(), format, mipmapLevels, vk::ImageLayout::ePreinitialized, vk::ImageLayout::eShaderReadOnlyOptimal);
 	}
 	commandBuffer.end();
 }
 
-void Texture::GenerateMipmaps()
-{
-	u32 mipWidth = extent.width;
-	u32 mipHeight = extent.height;
-	vk::ImageMemoryBarrier barrier(vk::AccessFlagBits::eTransferWrite, vk::AccessFlagBits::eTransferRead,
-			vk::ImageLayout::eTransferDstOptimal, vk::ImageLayout::eTransferSrcOptimal, VK_QUEUE_FAMILY_IGNORED, VK_QUEUE_FAMILY_IGNORED,
-			*image, vk::ImageSubresourceRange(vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1));
-
-	for (int i = 1; i < mipmapLevels; i++)
-	{
-		// Transition previous mipmap level from dst optimal/preinit to src optimal
-		barrier.subresourceRange.baseMipLevel = i - 1;
-		if (i == 1 && !needsStaging)
-		{
-			barrier.oldLayout = vk::ImageLayout::ePreinitialized;
-			barrier.srcAccessMask = vk::AccessFlagBits::eHostWrite;
-		}
-		else
-		{
-			barrier.oldLayout = vk::ImageLayout::eTransferDstOptimal;
-			barrier.srcAccessMask = vk::AccessFlagBits::eTransferWrite;
-		}
-		barrier.newLayout = vk::ImageLayout::eTransferSrcOptimal;
-		barrier.dstAccessMask = vk::AccessFlagBits::eTransferRead;
-		commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eTransfer, {}, nullptr, nullptr, barrier);
-
-		// Blit previous mipmap level on current
-		vk::ImageBlit blit(vk::ImageSubresourceLayers(vk::ImageAspectFlagBits::eColor, i - 1, 0, 1),
-				 { { vk::Offset3D(0, 0, 0), vk::Offset3D(mipWidth, mipHeight, 1) } },
-				 vk::ImageSubresourceLayers(vk::ImageAspectFlagBits::eColor, i, 0, 1),
-				 { { vk::Offset3D(0, 0, 0), vk::Offset3D(std::max(mipWidth / 2, 1u), std::max(mipHeight / 2, 1u), 1) } });
-		commandBuffer.blitImage(*image, vk::ImageLayout::eTransferSrcOptimal, *image, vk::ImageLayout::eTransferDstOptimal, 1, &blit, vk::Filter::eLinear);
-
-		// Transition previous mipmap level from src optimal to shader read-only optimal
-		barrier.oldLayout = vk::ImageLayout::eTransferSrcOptimal;
-		barrier.newLayout = vk::ImageLayout::eShaderReadOnlyOptimal;
-		barrier.srcAccessMask = vk::AccessFlagBits::eTransferRead;
-		barrier.dstAccessMask = vk::AccessFlagBits::eShaderRead;
-		commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eFragmentShader, {}, nullptr, nullptr, barrier);
-
-		mipWidth = std::max(mipWidth / 2, 1u);
-		mipHeight = std::max(mipHeight / 2, 1u);
-	}
-	// Transition last mipmap level from dst optimal to shader read-only optimal
-	barrier.subresourceRange.baseMipLevel = mipmapLevels - 1;
-	barrier.oldLayout = vk::ImageLayout::eTransferDstOptimal;
-	barrier.newLayout = vk::ImageLayout::eShaderReadOnlyOptimal;
-	barrier.srcAccessMask = vk::AccessFlagBits::eTransferWrite;
-	barrier.dstAccessMask = vk::AccessFlagBits::eShaderRead;
-	commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eFragmentShader, {}, nullptr, nullptr, barrier);
-}
-
 void FramebufferAttachment::Init(u32 width, u32 height, vk::Format format, vk::ImageUsageFlags usage)
 {
 	this->format = format;
diff --git a/core/rend/vulkan/texture.h b/core/rend/vulkan/texture.h
index 4dfc76961..ec3b30cdb 100644
--- a/core/rend/vulkan/texture.h
+++ b/core/rend/vulkan/texture.h
@@ -30,7 +30,7 @@ void setImageLayout(vk::CommandBuffer const& commandBuffer, vk::Image image, vk:
 
 struct Texture : BaseTextureCacheData
 {
-	void UploadToGPU(int width, int height, u8 *data) override;
+	void UploadToGPU(int width, int height, u8 *data, bool mipmapped) override;
 	u64 GetIntId() { return (u64)reinterpret_cast<uintptr_t>(this); }
 	std::string GetId() override { char s[20]; sprintf(s, "%p", this); return s; }
 	bool IsNew() const { return !image.get(); }
@@ -43,11 +43,10 @@ struct Texture : BaseTextureCacheData
 	void SetDevice(vk::Device device) { this->device = device; }
 
 private:
-	void Init(u32 width, u32 height, vk::Format format);
+	void Init(u32 width, u32 height, vk::Format format ,u32 dataSize);
 	void SetImage(u32 size, void *data, bool isNew);
 	void CreateImage(vk::ImageTiling tiling, vk::ImageUsageFlags usage, vk::ImageLayout initialLayout,
 			vk::MemoryPropertyFlags memoryProperties, vk::ImageAspectFlags aspectMask);
-	void GenerateMipmaps();
 
 	vk::Format format = vk::Format::eUndefined;
 	vk::Extent2D extent;
diff --git a/core/rend/vulkan/vmu.cpp b/core/rend/vulkan/vmu.cpp
index 63b8f0211..71898e4e3 100644
--- a/core/rend/vulkan/vmu.cpp
+++ b/core/rend/vulkan/vmu.cpp
@@ -51,7 +51,7 @@ const std::vector<vk::UniqueCommandBuffer>* VulkanVMUs::PrepareVMUs(vk::CommandP
 				VulkanContext::Instance()->GetDevice().allocateCommandBuffersUnique(vk::CommandBufferAllocateInfo(commandPool, vk::CommandBufferLevel::ePrimary, 1))
 				.front()));
 		texture->SetCommandBuffer(*commandBuffers[context->GetCurrentImageIndex()].back());
-		texture->UploadToGPU(48, 32, (u8*)vmu_lcd_data[i]);
+		texture->UploadToGPU(48, 32, (u8*)vmu_lcd_data[i], false);
 		texture->SetCommandBuffer(nullptr);
 		vmu_lcd_changed[i] = false;
 	}
diff --git a/core/rend/vulkan/vulkan_renderer.cpp b/core/rend/vulkan/vulkan_renderer.cpp
index f94ee339c..f80c0d8c2 100644
--- a/core/rend/vulkan/vulkan_renderer.cpp
+++ b/core/rend/vulkan/vulkan_renderer.cpp
@@ -61,7 +61,7 @@ public:
 				vjoyTexture->SetPhysicalDevice(GetContext()->GetPhysicalDevice());
 				vjoyTexture->SetDevice(GetContext()->GetDevice());
 				vjoyTexture->SetCommandBuffer(texCommandPool.Allocate());
-				vjoyTexture->UploadToGPU(OSD_TEX_W, OSD_TEX_H, image_data);
+				vjoyTexture->UploadToGPU(OSD_TEX_W, OSD_TEX_H, image_data, false);
 				vjoyTexture->SetCommandBuffer(nullptr);
 				texCommandPool.EndFrame();
 				delete [] image_data;
@@ -122,7 +122,7 @@ public:
 			curTexture->SetDevice(GetContext()->GetDevice());
 		}
 		curTexture->SetCommandBuffer(texCommandPool.Allocate());
-		curTexture->UploadToGPU(width, height, (u8*)pb.data());
+		curTexture->UploadToGPU(width, height, (u8*)pb.data(), false);
 		curTexture->SetCommandBuffer(nullptr);
 		texCommandPool.EndFrame();
 
@@ -275,7 +275,7 @@ private:
 		MakeFogTexture(texData);
 		fogTexture->SetCommandBuffer(texCommandPool.Allocate());
 
-		fogTexture->UploadToGPU(128, 2, texData);
+		fogTexture->UploadToGPU(128, 2, texData, false);
 
 		fogTexture->SetCommandBuffer(nullptr);
 	}
diff --git a/core/stdclass.h b/core/stdclass.h
index f3e5a85a7..912d667c7 100644
--- a/core/stdclass.h
+++ b/core/stdclass.h
@@ -118,6 +118,9 @@ public :
 		pthread_mutex_unlock(&mutx);
 #endif
 	}
+	// Satisfy the BasicLockable named requirement (lock/unlock) so this mutex can be used with std::lock_guard
+	void lock() { Lock(); }
+	void unlock() { Unlock(); }
 };
 
 #if !defined(TARGET_IPHONE)
diff --git a/core/wsi/gl_context.cpp b/core/wsi/gl_context.cpp
index b36d31ae1..b0b1fc808 100644
--- a/core/wsi/gl_context.cpp
+++ b/core/wsi/gl_context.cpp
@@ -29,6 +29,10 @@ void GLGraphicsContext::findGLVersion()
 	glGetIntegerv(GL_MAJOR_VERSION, &majorVersion);
 	if (glGetError() == GL_INVALID_ENUM)
 		majorVersion = 2;
+	else
+	{
+		glGetIntegerv(GL_MINOR_VERSION, &minorVersion);
+	}
 	const char *version = (const char *)glGetString(GL_VERSION);
 	isGLES = !strncmp(version, "OpenGL ES", 9);
 	INFO_LOG(RENDERER, "OpenGL version: %s", version);
diff --git a/core/wsi/gl_context.h b/core/wsi/gl_context.h
index eda891299..0faec3413 100644
--- a/core/wsi/gl_context.h
+++ b/core/wsi/gl_context.h
@@ -29,6 +29,7 @@ class GLGraphicsContext
 {
 public:
 	int GetMajorVersion() const { return majorVersion; }
+	int GetMinorVersion() const { return minorVersion; }
 	bool IsGLES() const { return isGLES; }
 
 protected:
@@ -38,6 +39,7 @@ protected:
 
 private:
 	int majorVersion = 0;
+	int minorVersion = 0;
 	bool isGLES = false;
 };