// pcsx2/pcsx2/GS/Renderers/HW/GSRendererHW.cpp
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2023 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "PrecompiledHeader.h"
#include "GS/Renderers/HW/GSRendererHW.h"
#include "GS/Renderers/HW/GSTextureReplacements.h"
#include "GS/GSGL.h"
#include "GS/GSPerfMon.h"
#include "GS/GSUtil.h"
#include "Host.h"
#include "common/BitUtils.h"
#include "common/StringUtil.h"
#include <bit>
GSRendererHW::GSRendererHW()
: GSRenderer()
{
MULTI_ISA_SELECT(GSRendererHWPopulateFunctions)(*this);
m_mipmap = (GSConfig.HWMipmap >= HWMipmapLevel::Basic);
SetTCOffset();
pxAssert(!g_texture_cache);
g_texture_cache = std::make_unique<GSTextureCache>();
GSTextureReplacements::Initialize();
// Hope nothing requires too many draw calls.
m_drawlist.reserve(2048);
memset(&m_conf, 0, sizeof(m_conf));
ResetStates();
}
void GSRendererHW::SetTCOffset()
{
m_userhacks_tcoffset_x = std::max<s32>(GSConfig.UserHacks_TCOffsetX, 0) / -1000.0f;
m_userhacks_tcoffset_y = std::max<s32>(GSConfig.UserHacks_TCOffsetY, 0) / -1000.0f;
m_userhacks_tcoffset = m_userhacks_tcoffset_x < 0.0f || m_userhacks_tcoffset_y < 0.0f;
}
GSRendererHW::~GSRendererHW()
{
g_texture_cache.reset();
}
void GSRendererHW::Destroy()
{
g_texture_cache->RemoveAll();
GSRenderer::Destroy();
}
void GSRendererHW::PurgeTextureCache()
{
g_texture_cache->RemoveAll();
}
void GSRendererHW::ReadbackTextureCache()
{
g_texture_cache->ReadbackAll();
}
GSTexture* GSRendererHW::LookupPaletteSource(u32 CBP, u32 CPSM, u32 CBW, GSVector2i& offset, float* scale, const GSVector2i& size)
{
return g_texture_cache->LookupPaletteSource(CBP, CPSM, CBW, offset, scale, size);
}
bool GSRendererHW::CanUpscale()
{
return GSConfig.UpscaleMultiplier != 1.0f;
}
float GSRendererHW::GetUpscaleMultiplier()
{
return GSConfig.UpscaleMultiplier;
}
void GSRendererHW::Reset(bool hardware_reset)
{
// Read back on CSR Reset, conditional downloading on render swap etc handled elsewhere.
if (!hardware_reset)
g_texture_cache->ReadbackAll();
g_texture_cache->RemoveAll();
GSRenderer::Reset(hardware_reset);
}
void GSRendererHW::UpdateSettings(const Pcsx2Config::GSOptions& old_config)
{
GSRenderer::UpdateSettings(old_config);
m_mipmap = (GSConfig.HWMipmap >= HWMipmapLevel::Basic);
SetTCOffset();
}
void GSRendererHW::VSync(u32 field, bool registers_written, bool idle_frame)
{
if (GSConfig.LoadTextureReplacements)
GSTextureReplacements::ProcessAsyncLoadedTextures();
if (!idle_frame)
{
// If draws happened very recently, keep the recent transfers in case they haven't been preloaded/used yet.
// Rocky Legend does this with the main menu FMVs.
if (s_last_transfer_draw_n == s_n)
{
for (auto iter = m_draw_transfers.rbegin(); iter != m_draw_transfers.rend(); iter++)
{
if ((s_n - iter->draw) > 5)
{
m_draw_transfers.erase(m_draw_transfers.begin(), std::next(iter).base());
break;
}
}
}
else
{
m_draw_transfers.clear();
}
g_texture_cache->IncAge();
}
else
{
// Don't age the texture cache when no draws or EE writes have occurred.
// Xenosaga needs its targets kept around while it's loading, because it uses them for a fade transition.
GL_INS("No draws or transfers, not aging TC");
}
if (g_texture_cache->GetHashCacheMemoryUsage() > 1024 * 1024 * 1024)
{
Host::AddKeyedOSDMessage("HashCacheOverflow",
fmt::format(TRANSLATE_FS("GS", "Hash cache has used {:.2f} MB of VRAM, disabling."),
static_cast<float>(g_texture_cache->GetHashCacheMemoryUsage()) / 1048576.0f),
Host::OSD_ERROR_DURATION);
g_texture_cache->RemoveAll();
g_gs_device->PurgePool();
GSConfig.TexturePreloading = TexturePreloadingLevel::Partial;
}
m_skip = 0;
m_skip_offset = 0;
GSRenderer::VSync(field, registers_written, idle_frame);
}
GSTexture* GSRendererHW::GetOutput(int i, float& scale, int& y_offset)
{
int index = i >= 0 ? i : 1;
GSPCRTCRegs::PCRTCDisplay& curFramebuffer = PCRTCDisplays.PCRTCDisplays[index];
const GSVector2i framebufferSize(PCRTCDisplays.GetFramebufferSize(i));
PCRTCDisplays.RemoveFramebufferOffset(i);
// TRACE(_T("[%d] GetOutput %d %05x (%d)\n"), (int)m_perfmon.GetFrame(), i, (int)TEX0.TBP0, (int)TEX0.PSM);
GSTexture* t = nullptr;
GIFRegTEX0 TEX0 = {};
TEX0.TBP0 = curFramebuffer.Block();
TEX0.TBW = curFramebuffer.FBW;
TEX0.PSM = curFramebuffer.PSM;
if (GSTextureCache::Target* rt = g_texture_cache->LookupDisplayTarget(TEX0, framebufferSize, GetTextureScaleFactor()))
{
rt->Update();
t = rt->m_texture;
scale = rt->m_scale;
const int delta = TEX0.TBP0 - rt->m_TEX0.TBP0;
if (delta > 0 && curFramebuffer.FBW != 0)
{
const int pages = delta >> 5u;
int y_pages = pages / curFramebuffer.FBW;
y_offset = y_pages * GSLocalMemory::m_psm[curFramebuffer.PSM].pgs.y;
GL_CACHE("Frame y offset %d pixels, unit %d", y_offset, i);
}
#ifdef ENABLE_OGL_DEBUG
if (GSConfig.DumpGSData)
{
if (GSConfig.SaveFrame && s_n >= GSConfig.SaveN)
{
t->Save(GetDrawDumpPath("%05d_f%lld_fr%d_%05x_%s.bmp", s_n, g_perfmon.GetFrame(), i, static_cast<int>(TEX0.TBP0), psm_str(TEX0.PSM)));
}
}
#endif
}
return t;
}
GSTexture* GSRendererHW::GetFeedbackOutput(float& scale)
{
const int index = m_regs->EXTBUF.FBIN & 1;
const GSVector2i fb_size(PCRTCDisplays.GetFramebufferSize(index));
GIFRegTEX0 TEX0 = {};
TEX0.TBP0 = m_regs->EXTBUF.EXBP;
TEX0.TBW = m_regs->EXTBUF.EXBW;
TEX0.PSM = PCRTCDisplays.PCRTCDisplays[index].PSM;
GSTextureCache::Target* rt = g_texture_cache->LookupDisplayTarget(TEX0, fb_size, GetTextureScaleFactor());
if (!rt)
return nullptr;
rt->Update();
GSTexture* t = rt->m_texture;
scale = rt->m_scale;
#ifdef ENABLE_OGL_DEBUG
if (GSConfig.DumpGSData && GSConfig.SaveFrame && s_n >= GSConfig.SaveN)
t->Save(GetDrawDumpPath("%05d_f%lld_fr%d_%05x_%s.bmp", s_n, g_perfmon.GetFrame(), 3, static_cast<int>(TEX0.TBP0), psm_str(TEX0.PSM)));
#endif
return t;
}
void GSRendererHW::Lines2Sprites()
{
ASSERT(m_vt.m_primclass == GS_SPRITE_CLASS);
// each sprite converted to quad needs twice the space
while (m_vertex.tail * 2 > m_vertex.maxcount)
{
GrowVertexBuffer();
}
// assume vertices are tightly packed and sequentially indexed (it should be the case)
const bool predivide_q = PRIM->TME && !PRIM->FST && m_vt.m_accurate_stq;
if (m_vertex.next >= 2)
{
const u32 count = m_vertex.next;
int i = static_cast<int>(count) * 2 - 4;
GSVertex* s = &m_vertex.buff[count - 2];
GSVertex* q = &m_vertex.buff[count * 2 - 4];
u16* RESTRICT index = &m_index.buff[count * 3 - 6];
// Sprites are flat shaded, so the provoking vertex doesn't matter here.
constexpr GSVector4i indices = GSVector4i::cxpr16(0, 1, 2, 1, 2, 3, 0, 0);
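// Illustrative note: each sprite pair (v0 = one corner, v1 = opposite corner) expands to a quad
// q[0]=(x0,y0), q[1]=(x1,y0), q[2]=(x0,y1), q[3]=(x1,y1); the (0,1,2)(1,2,3) pattern above then
// covers that quad with two triangles relative to the current base index i.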
for (; i >= 0; i -= 4, s -= 2, q -= 4, index -= 6)
{
GSVertex v0 = s[0];
GSVertex v1 = s[1];
v0.RGBAQ = v1.RGBAQ;
v0.XYZ.Z = v1.XYZ.Z;
v0.FOG = v1.FOG;
if (predivide_q)
{
const GSVector4 st0 = GSVector4::loadl(&v0.ST.U64);
const GSVector4 st1 = GSVector4::loadl(&v1.ST.U64);
const GSVector4 Q = GSVector4(v1.RGBAQ.Q, v1.RGBAQ.Q, v1.RGBAQ.Q, v1.RGBAQ.Q);
const GSVector4 st = st0.upld(st1) / Q;
GSVector4::storel(&v0.ST.U64, st);
GSVector4::storeh(&v1.ST.U64, st);
v0.RGBAQ.Q = 1.0f;
v1.RGBAQ.Q = 1.0f;
}
q[0] = v0;
q[3] = v1;
// swap x, s, u
const u16 x = v0.XYZ.X;
v0.XYZ.X = v1.XYZ.X;
v1.XYZ.X = x;
const float s = v0.ST.S;
v0.ST.S = v1.ST.S;
v1.ST.S = s;
const u16 u = v0.U;
v0.U = v1.U;
v1.U = u;
q[1] = v0;
q[2] = v1;
const GSVector4i this_indices = GSVector4i::broadcast16(i).add16(indices);
const int high = this_indices.extract32<2>();
GSVector4i::storel(index, this_indices);
std::memcpy(&index[4], &high, sizeof(high));
}
m_vertex.head = m_vertex.tail = m_vertex.next = count * 2;
m_index.tail = count * 3;
}
}
void GSRendererHW::ExpandLineIndices()
{
const u32 process_count = (m_index.tail + 7) / 8 * 8;
const u32 expansion_factor = 3;
m_index.tail *= expansion_factor;
GSVector4i* end = reinterpret_cast<GSVector4i*>(m_index.buff);
GSVector4i* read = reinterpret_cast<GSVector4i*>(m_index.buff + process_count);
GSVector4i* write = reinterpret_cast<GSVector4i*>(m_index.buff + process_count * expansion_factor);
constexpr GSVector4i mask0 = GSVector4i::cxpr8(0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5);
constexpr GSVector4i mask1 = GSVector4i::cxpr8(6, 7, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 8, 9);
constexpr GSVector4i mask2 = GSVector4i::cxpr8(10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 12, 13, 14, 15, 14, 15);
constexpr GSVector4i low0 = GSVector4i::cxpr16(0, 1, 2, 1, 2, 3, 0, 1);
constexpr GSVector4i low1 = GSVector4i::cxpr16(2, 1, 2, 3, 0, 1, 2, 1);
constexpr GSVector4i low2 = GSVector4i::cxpr16(2, 3, 0, 1, 2, 1, 2, 3);
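// Rough idea (sketch of what the SIMD below does): each line's pair of indices becomes 6 indices
// (two triangles forming a quad), hence expansion_factor = 3. Input indices are multiplied by 4
// (sll16(2)), presumably because every line vertex has four expanded copies, and the low0/1/2
// constants OR in the 0-3 corner offset (safe since the scaled index has its low two bits clear).
// The loop walks backwards so expanded output never overwrites indices that haven't been read yet.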
while (read > end)
{
read -= 1;
write -= expansion_factor;
const GSVector4i in = read->sll16(2);
write[0] = in.shuffle8(mask0) | low0;
write[1] = in.shuffle8(mask1) | low1;
write[2] = in.shuffle8(mask2) | low2;
}
}
// Fix the vertex positions/texture coordinates when converting from 16-bit color to 32-bit color (texture shuffle)
void GSRendererHW::ConvertSpriteTextureShuffle(bool& write_ba, bool& read_ba, GSTextureCache::Target* rt, GSTextureCache::Source* tex)
{
const u32 count = m_vertex.next;
GSVertex* v = &m_vertex.buff[0];
const GIFRegXYOFFSET& o = m_context->XYOFFSET;
const GSVertex first_vert = (v[0].XYZ.X <= v[m_vertex.tail - 2].XYZ.X) ? v[0] : v[m_vertex.tail - 2];
// vertex position is 8 to 16 pixels, therefore it is the 16-31 bits of the colors
const int pos = (first_vert.XYZ.X - o.OFX) & 0xFF;
write_ba = (pos > 112 && pos < 136);
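// (pos is in 1/16 pixel units, so 112-136 is roughly 7.0-8.5 pixels; an ~8 pixel offset into the
// 16-pixel column means the draw is addressing the upper (BA) 16 bits of each 32-bit pixel.)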
// Read texture is 8 to 16 pixels (same as above)
const float tw = static_cast<float>(1u << m_cached_ctx.TEX0.TW);
int tex_pos = (PRIM->FST) ? first_vert.U : static_cast<int>(tw * first_vert.ST.S);
tex_pos &= 0xFF;
// "same group" means it can read blue and write alpha using C32 tricks
read_ba = (tex_pos > 112 && tex_pos < 144) || (m_same_group_texture_shuffle && (m_cached_ctx.FRAME.FBMSK & 0xFFFF0000) != 0xFFFF0000);
// Another way of selecting whether to read RG/BA is to use region repeat.
// Ace Combat 04 reads RG, writes to RGBA by setting a MINU of 1015.
if (m_cached_ctx.CLAMP.WMS == CLAMP_REGION_REPEAT)
{
GL_INS("REGION_REPEAT clamp with texture shuffle, FBMSK=%08x, MINU=%u, MINV=%u, MAXU=%u, MAXV=%u",
m_cached_ctx.FRAME.FBMSK, m_cached_ctx.CLAMP.MINU, m_cached_ctx.CLAMP.MINV, m_cached_ctx.CLAMP.MAXU,
m_cached_ctx.CLAMP.MAXV);
// offset coordinates swap around RG/BA.
const bool invert = read_ba; // (tex_pos > 112 && tex_pos < 144), i.e. 8 fixed point
const u32 minu = (m_cached_ctx.CLAMP.MINU & 8) ^ (invert ? 8 : 0);
read_ba = ((minu & 8) != 0);
}
if (m_split_texture_shuffle_pages > 0)
{
// Input vertices might be bad, so rewrite them.
// We can't use the draw rect exactly here, because if the target was actually larger
// for some reason... unhandled clears, maybe, it won't have been halved correctly.
// So, halve it ourselves.
const GSVector4i dr = m_r;
const GSVector4i r = dr.blend32<9>(dr.sra32(1));
GL_CACHE("ConvertSpriteTextureShuffle: Rewrite from %d,%d => %d,%d to %d,%d => %d,%d",
static_cast<int>(m_vt.m_min.p.x), static_cast<int>(m_vt.m_min.p.y), static_cast<int>(m_vt.m_min.p.z),
static_cast<int>(m_vt.m_min.p.w), r.x, r.y, r.z, r.w);
const GSVector4i fpr = r.sll32(4);
v[0].XYZ.X = static_cast<u16>(m_context->XYOFFSET.OFX + fpr.x);
v[0].XYZ.Y = static_cast<u16>(m_context->XYOFFSET.OFY + fpr.y);
v[1].XYZ.X = static_cast<u16>(m_context->XYOFFSET.OFX + fpr.z);
v[1].XYZ.Y = static_cast<u16>(m_context->XYOFFSET.OFY + fpr.w);
if (PRIM->FST)
{
v[0].U = fpr.x;
v[0].V = fpr.y;
v[1].U = fpr.z;
v[1].V = fpr.w;
}
else
{
const float th = static_cast<float>(1 << m_cached_ctx.TEX0.TH);
const GSVector4 st = GSVector4(r) / GSVector4(GSVector2(tw, th)).xyxy();
GSVector4::storel(&v[0].ST.S, st);
GSVector4::storeh(&v[1].ST.S, st);
}
m_vertex.head = m_vertex.tail = m_vertex.next = 2;
m_index.tail = 2;
return;
}
bool half_bottom_vert = true;
bool half_right_vert = true;
bool half_bottom_uv = true;
bool half_right_uv = true;
if (m_same_group_texture_shuffle)
{
if (m_cached_ctx.FRAME.FBW != rt->m_TEX0.TBW && m_cached_ctx.FRAME.FBW == rt->m_TEX0.TBW * 2)
half_right_vert = false;
else
half_bottom_vert = false;
}
else
{
// Different source (maybe?)
// If a game does the texture and frame doubling differently, they can burn in hell.
if (m_cached_ctx.TEX0.TBP0 != m_cached_ctx.FRAME.Block())
{
// There's no single source of truth here; since the width can get batted around, the valid rect is probably our best bet.
const int tex_width = tex->m_target ? tex->m_from_target->m_valid.z : (tex->m_TEX0.TBW * 64);
const int tex_tbw = tex->m_target ? tex->m_from_target_TEX0.TBW : tex->m_TEX0.TBW;
if ((static_cast<int>(m_cached_ctx.TEX0.TBW * 64) >= std::min(tex_width * 2, 1024) && tex_tbw != m_cached_ctx.TEX0.TBW) || (m_cached_ctx.TEX0.TBW * 64) < floor(m_vt.m_max.t.x))
{
half_right_uv = false;
half_right_vert = false;
}
else
{
half_bottom_uv = false;
half_bottom_vert = false;
}
}
else
{
if ((floor(m_vt.m_max.p.y) <= rt->m_valid.w) && ((floor(m_vt.m_max.p.x) > (m_cached_ctx.FRAME.FBW * 64)) || (rt->m_TEX0.TBW != m_cached_ctx.FRAME.FBW)))
{
half_right_vert = false;
half_right_uv = false;
}
else
{
half_bottom_vert = false;
half_bottom_uv = false;
}
}
}
if (PRIM->FST)
{
GL_INS("First vertex is P: %d => %d T: %d => %d", v[0].XYZ.X, v[1].XYZ.X, v[0].U, v[1].U);
for (u32 i = 0; i < count; i += 2)
{
if (write_ba)
v[i].XYZ.X -= 128u;
else
v[i + 1].XYZ.X += 128u;
if (read_ba)
v[i].U -= 128u;
else
v[i + 1].U += 128u;
if (!half_bottom_vert)
{
// Height is too big (2x).
const int tex_offset = v[i].V & 0xF;
const GSVector4i offset(o.OFY, tex_offset, o.OFY, tex_offset);
GSVector4i tmp(v[i].XYZ.Y, v[i].V, v[i + 1].XYZ.Y, v[i + 1].V);
tmp = GSVector4i(tmp - offset).srl32(1) + offset;
v[i].XYZ.Y = static_cast<u16>(tmp.x);
v[i + 1].XYZ.Y = static_cast<u16>(tmp.z);
if (!half_bottom_uv)
{
v[i].V = static_cast<u16>(tmp.y);
v[i + 1].V = static_cast<u16>(tmp.w);
}
}
}
}
else
{
const float offset_8pix = 8.0f / tw;
GL_INS("First vertex is P: %d => %d T: %f => %f (offset %f)", v[0].XYZ.X, v[1].XYZ.X, v[0].ST.S, v[1].ST.S, offset_8pix);
for (u32 i = 0; i < count; i += 2)
{
if (write_ba)
v[i].XYZ.X -= 128u;
else
v[i + 1].XYZ.X += 128u;
if (read_ba)
v[i].ST.S -= offset_8pix;
else
v[i + 1].ST.S += offset_8pix;
if (!half_bottom_vert)
{
// Height is too big (2x).
const GSVector4i offset(o.OFY, o.OFY);
GSVector4i tmp(v[i].XYZ.Y, v[i + 1].XYZ.Y);
tmp = GSVector4i(tmp - offset).srl32(1) + offset;
//fprintf(stderr, "Before %d, After %d\n", v[i + 1].XYZ.Y, tmp.y);
v[i].XYZ.Y = static_cast<u16>(tmp.x);
v[i + 1].XYZ.Y = static_cast<u16>(tmp.y);
if (!half_bottom_uv)
{
v[i].ST.T /= 2.0f;
v[i + 1].ST.T /= 2.0f;
}
}
}
}
// Update the vertex trace too, to avoid issues when computing the bounding box.
if (write_ba)
m_vt.m_min.p.x -= 8.0f;
else
m_vt.m_max.p.x += 8.0f;
if (!m_same_group_texture_shuffle)
{
if (read_ba)
m_vt.m_min.t.x -= 8.0f;
else
m_vt.m_max.t.x += 8.0f;
}
if (!half_right_vert)
{
m_vt.m_min.p.x /= 2.0f;
m_vt.m_max.p.x /= 2.0f;
m_context->scissor.in.x = m_vt.m_min.p.x;
m_context->scissor.in.z = m_vt.m_max.p.x + 8.0f;
}
if (!half_bottom_vert)
{
m_vt.m_min.p.y /= 2.0f;
m_vt.m_max.p.y /= 2.0f;
m_context->scissor.in.y = m_vt.m_min.p.y;
m_context->scissor.in.w = m_vt.m_max.p.y + 8.0f;
}
// Only do this if the source is being interpreted as 16bit
if (!half_bottom_uv)
{
m_vt.m_min.t.y /= 2.0f;
m_vt.m_max.t.y /= 2.0f;
}
if (!half_right_uv)
{
m_vt.m_min.t.y /= 2.0f;
m_vt.m_max.t.y /= 2.0f;
}
}
GSVector4 GSRendererHW::RealignTargetTextureCoordinate(const GSTextureCache::Source* tex)
{
if (GSConfig.UserHacks_HalfPixelOffset <= 1 || GetUpscaleMultiplier() == 1.0f)
return GSVector4(0.0f);
const GSVertex* v = &m_vertex.buff[0];
const float scale = tex->GetScale();
const bool linear = m_vt.IsRealLinear();
const int t_position = v[0].U;
GSVector4 half_offset(0.0f);
// FIXME Let's start with something wrong same mess on X and Y
// FIXME Maybe it will be enough to check linear
if (PRIM->FST)
{
if (GSConfig.UserHacks_HalfPixelOffset == 3)
{
if (!linear && t_position == 8)
{
half_offset.x = 8;
half_offset.y = 8;
}
else if (linear && t_position == 16)
{
half_offset.x = 16;
half_offset.y = 16;
}
else if (m_vt.m_min.p.x == -0.5f)
{
half_offset.x = 8;
half_offset.y = 8;
}
}
else
{
if (!linear && t_position == 8)
{
half_offset.x = 8 - 8 / scale;
half_offset.y = 8 - 8 / scale;
}
else if (linear && t_position == 16)
{
half_offset.x = 16 - 16 / scale;
half_offset.y = 16 - 16 / scale;
}
else if (m_vt.m_min.p.x == -0.5f)
{
half_offset.x = 8;
half_offset.y = 8;
}
}
GL_INS("offset detected %f,%f t_pos %d (linear %d, scale %f)",
half_offset.x, half_offset.y, t_position, linear, scale);
}
else if (m_vt.m_eq.q)
{
const float tw = static_cast<float>(1 << m_cached_ctx.TEX0.TW);
const float th = static_cast<float>(1 << m_cached_ctx.TEX0.TH);
const float q = v[0].RGBAQ.Q;
// Tales of Abyss
half_offset.x = 0.5f * q / tw;
half_offset.y = 0.5f * q / th;
GL_INS("ST offset detected %f,%f (linear %d, scale %f)",
half_offset.x, half_offset.y, linear, scale);
}
return half_offset;
}
GSVector4i GSRendererHW::ComputeBoundingBox(const GSVector2i& rtsize, float rtscale)
{
const GSVector4 offset = GSVector4(-1.0f, 1.0f); // Round value
const GSVector4 box = m_vt.m_min.p.upld(m_vt.m_max.p) + offset.xxyy();
return GSVector4i(box * GSVector4(rtscale)).rintersect(GSVector4i(0, 0, rtsize.x, rtsize.y));
}
void GSRendererHW::MergeSprite(GSTextureCache::Source* tex)
{
// Upscaling hack to avoid various line/grid issues
if (GSConfig.UserHacks_MergePPSprite && CanUpscale() && tex && tex->m_target && (m_vt.m_primclass == GS_SPRITE_CLASS))
{
if (PRIM->FST && GSLocalMemory::m_psm[tex->m_TEX0.PSM].fmt < 2 && ((m_vt.m_eq.value & 0xCFFFF) == 0xCFFFF))
{
// Ideally the hack ought to be enabled only in a true paving mode. I don't know how to do that
// accurately or quickly, so instead let's just take the hypothesis that all sprites must have the
// same size.
// Tested on Tekken 5.
const GSVertex* v = &m_vertex.buff[0];
bool is_paving = true;
// SSE optimization: shuffle m[1] to have (4*32 bits) X, Y, U, V
const int first_dpX = v[1].XYZ.X - v[0].XYZ.X;
const int first_dpU = v[1].U - v[0].U;
for (u32 i = 0; i < m_vertex.next; i += 2)
{
const int dpX = v[i + 1].XYZ.X - v[i].XYZ.X;
const int dpU = v[i + 1].U - v[i].U;
if (dpX != first_dpX || dpU != first_dpU)
{
is_paving = false;
break;
}
}
#if 0
const GSVector4 delta_p = m_vt.m_max.p - m_vt.m_min.p;
const GSVector4 delta_t = m_vt.m_max.t - m_vt.m_min.t;
const bool is_blit = PrimitiveOverlap() == PRIM_OVERLAP_NO;
GL_INS("PP SAMPLER: Dp %f %f Dt %f %f. Is blit %d, is paving %d, count %d", delta_p.x, delta_p.y, delta_t.x, delta_t.y, is_blit, is_paving, m_vertex.tail);
#endif
if (is_paving)
{
// Replace all sprites with a single fullscreen sprite.
GSVertex* s = &m_vertex.buff[0];
s[0].XYZ.X = static_cast<u16>((16.0f * m_vt.m_min.p.x) + m_context->XYOFFSET.OFX);
s[1].XYZ.X = static_cast<u16>((16.0f * m_vt.m_max.p.x) + m_context->XYOFFSET.OFX);
s[0].XYZ.Y = static_cast<u16>((16.0f * m_vt.m_min.p.y) + m_context->XYOFFSET.OFY);
s[1].XYZ.Y = static_cast<u16>((16.0f * m_vt.m_max.p.y) + m_context->XYOFFSET.OFY);
s[0].U = static_cast<u16>(16.0f * m_vt.m_min.t.x);
s[0].V = static_cast<u16>(16.0f * m_vt.m_min.t.y);
s[1].U = static_cast<u16>(16.0f * m_vt.m_max.t.x);
s[1].V = static_cast<u16>(16.0f * m_vt.m_max.t.y);
m_vertex.head = m_vertex.tail = m_vertex.next = 2;
m_index.tail = 2;
}
}
}
}
float GSRendererHW::GetTextureScaleFactor()
{
return GetUpscaleMultiplier();
}
GSVector2i GSRendererHW::GetValidSize(const GSTextureCache::Source* tex)
{
// Don't blindly expand out to the scissor size if we're not drawing to it.
// e.g. Burnout 3, God of War II, etc.
int height = std::min<int>(m_context->scissor.in.w, m_r.w);
// If the draw is less than a page high, FBW=0 is the same as FBW=1.
const GSLocalMemory::psm_t& frame_psm = GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM];
int width = std::min(std::max<int>(m_cached_ctx.FRAME.FBW, 1) * 64, m_context->scissor.in.z);
if (m_cached_ctx.FRAME.FBW == 0 && m_r.w > frame_psm.pgs.y)
{
GL_INS("FBW=0 when drawing more than 1 page in height (PSM %s, PGS %dx%d).", psm_str(m_cached_ctx.FRAME.PSM),
frame_psm.pgs.x, frame_psm.pgs.y);
}
// If it's a channel shuffle, it'll likely be just a single page, so assume full screen.
if (m_channel_shuffle)
{
const int page_x = frame_psm.pgs.x - 1;
const int page_y = frame_psm.pgs.y - 1;
pxAssert(tex);
// Round up the page as channel shuffles are generally done in pages at a time
width = (std::max(tex->GetUnscaledWidth(), width) + page_x) & ~page_x;
height = (std::max(tex->GetUnscaledHeight(), height) + page_y) & ~page_y;
}
// Align to page size. Since FRAME/Z has to always start on a page boundary, in theory no two should overlap.
width = Common::AlignUpPow2(width, frame_psm.pgs.x);
height = Common::AlignUpPow2(height, frame_psm.pgs.y);
// Early detection of texture shuffles. These double the input height because they're interpreting 64x32 C32 pages as 64x64 C16.
// Why? Well, we don't want to be doubling the heights of targets, but also we don't want to align C32 targets to 64 instead of 32.
// Yumeria's text breaks, and GOW goes to 512x448 instead of 512x416 if we don't.
const bool possible_texture_shuffle =
(tex && m_vt.m_primclass == GS_SPRITE_CLASS && frame_psm.bpp == 16 &&
GSLocalMemory::m_psm[m_cached_ctx.TEX0.PSM].bpp == 16 &&
(tex->m_32_bits_fmt ||
(m_cached_ctx.TEX0.TBP0 != m_cached_ctx.FRAME.Block() && IsOpaque() && !(m_context->TEX1.MMIN & 1) &&
m_cached_ctx.FRAME.FBMSK && g_texture_cache->Has32BitTarget(m_cached_ctx.FRAME.Block()))));
if (possible_texture_shuffle)
{
const u32 tex_width_pgs = (tex->m_target ? tex->m_from_target_TEX0.TBW : tex->m_TEX0.TBW);
const u32 half_draw_width_pgs = ((width + (frame_psm.pgs.x - 1)) / frame_psm.pgs.x) >> 1;
// Games such as Midnight Club 3 draw headlights with a texture shuffle, but instead of doubling the height, they doubled the width.
if (tex_width_pgs == half_draw_width_pgs)
{
GL_CACHE("Halving width due to texture shuffle with double width, %dx%d -> %dx%d", width, height, width / 2, height);
width /= 2;
}
else
{
GL_CACHE("Halving height due to texture shuffle, %dx%d -> %dx%d", width, height, width, height / 2);
height /= 2;
}
}
return GSVector2i(width, height);
}
GSVector2i GSRendererHW::GetTargetSize(const GSTextureCache::Source* tex)
{
const GSVector2i valid_size = GetValidSize(tex);
return g_texture_cache->GetTargetSize(m_cached_ctx.FRAME.Block(), m_cached_ctx.FRAME.FBW, m_cached_ctx.FRAME.PSM, valid_size.x, valid_size.y);
}
bool GSRendererHW::IsPossibleChannelShuffle() const
{
if (!PRIM->TME || m_cached_ctx.TEX0.PSM != PSMT8 || // 8-bit texture draw
m_vt.m_primclass != GS_SPRITE_CLASS) // draw_sprite_tex
{
return false;
}
const int mask = (((m_vt.m_max.p - m_vt.m_min.p) <= GSVector4(64.0f)).mask() & 0x3);
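// mask bit 0 = draw width <= 64, bit 1 = draw height <= 64.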
if (mask == 0x3) // single_page
return true;
else if (mask != 0x1) // Not a single page in width.
return false;
// WRC 4 does channel shuffles in vertical strips. So check for page alignment.
// Texture TBW should also be twice the framebuffer FBW, because the page is twice as wide.
if (m_cached_ctx.TEX0.TBW == (m_cached_ctx.FRAME.FBW * 2) &&
GSLocalMemory::IsPageAligned(m_cached_ctx.FRAME.PSM, GSVector4i(m_vt.m_min.p.upld(m_vt.m_max.p))))
{
return true;
}
return false;
}
bool GSRendererHW::NextDrawMatchesShuffle() const
{
// Make sure nothing unexpected has changed.
// Twinsanity seems to screw with ZBUF here despite it being irrelevant.
const GSDrawingContext& next_ctx = m_env.CTXT[m_backed_up_ctx];
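// Masking with ~0x3FFF ignores TBP0 (low 14 bits of TEX0) and ~0x1FF ignores FBP (low 9 bits of
// FRAME), so only the buffer base pointers are allowed to differ between the two draws.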
if (((m_context->TEX0.U64 ^ next_ctx.TEX0.U64) & (~0x3FFF)) != 0 ||
m_context->TEX1.U64 != next_ctx.TEX1.U64 ||
m_context->CLAMP.U64 != next_ctx.CLAMP.U64 ||
m_context->TEST.U64 != next_ctx.TEST.U64 ||
((m_context->FRAME.U64 ^ next_ctx.FRAME.U64) & (~0x1FF)) != 0 ||
m_context->ZBUF.ZMSK != next_ctx.ZBUF.ZMSK)
{
return false;
}
return true;
}
bool GSRendererHW::IsSplitTextureShuffle(u32 rt_tbw)
{
// For this to work, we're peeking into the next draw, therefore we need dirty registers.
if (m_dirty_gs_regs == 0)
return false;
if (!NextDrawMatchesShuffle())
return false;
// Different channel being shuffled, so needs to be handled separately (misdetection in 50 Cent)
if (m_vertex.buff[m_index.buff[0]].U != m_v.U)
return false;
// Check that both the position and texture coordinates are page aligned, so we can work in pages instead of coordinates.
// For texture shuffles, the U will be offset by 8.
const GSVector4i pos_rc = GSVector4i(m_vt.m_min.p.upld(m_vt.m_max.p + GSVector4::cxpr(0.5f)));
const GSVector4i tex_rc = GSVector4i(m_vt.m_min.t.upld(m_vt.m_max.t));
// Width/height should match.
if (std::abs(pos_rc.width() - tex_rc.width()) > 8 || pos_rc.height() != tex_rc.height())
return false;
// X might be offset by up to -8/+8, but either the position or UV should be aligned.
GSVector4i aligned_rc = pos_rc.min_i32(tex_rc).blend32<12>(pos_rc.max_i32(tex_rc));
const GSLocalMemory::psm_t& frame_psm = GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM];
const GSDrawingContext& next_ctx = m_env.CTXT[m_backed_up_ctx];
// Y should be page aligned. X should be too, but if it's doing a copy with a shuffle (which is kinda silly), both the
// position and coordinates may be offset by +8. See Psi-Ops - The Mindgate Conspiracy.
if ((aligned_rc.x & 7) != 0 || aligned_rc.x > 8 || (aligned_rc.z & 7) != 0 ||
aligned_rc.y != 0 || (aligned_rc.w & (frame_psm.pgs.y - 1)) != 0)
{
return false;
}
// Matrix Path of Neo draws 512x512 instead of 512x448, then scissors to 512x448.
aligned_rc = aligned_rc.rintersect(m_context->scissor.in);
// We should have the same number of pages in both the position and UV.
const u32 pages_high = static_cast<u32>(aligned_rc.height()) / frame_psm.pgs.y;
const u32 num_pages = m_context->FRAME.FBW * pages_high;
// If this is a split texture shuffle, the next draw's FRAME/TEX0 should line up.
// Re-add the offset we subtracted in Draw() to get the original FBP/TBP0.. this won't handle wrapping. Oh well.
// "Potential" ones are for Jak3 which does a split shuffle on a 128x128 texture with a width of 256, writing to the lower half then offsetting 2 pages.
const u32 expected_next_FBP = (m_cached_ctx.FRAME.FBP + m_split_texture_shuffle_pages) + num_pages;
const u32 potential_expected_next_FBP = m_cached_ctx.FRAME.FBP + ((m_context->FRAME.FBW * 64) / aligned_rc.width());
const u32 expected_next_TBP0 = (m_cached_ctx.TEX0.TBP0 + (m_split_texture_shuffle_pages + num_pages) * BLOCKS_PER_PAGE);
const u32 potential_expected_next_TBP0 = m_cached_ctx.TEX0.TBP0 + (BLOCKS_PER_PAGE * ((m_context->TEX0.TBW * 64) / aligned_rc.width()));
GL_CACHE("IsSplitTextureShuffle: Draw covers %ux%u pages, next FRAME %x TEX %x",
static_cast<u32>(aligned_rc.width()) / frame_psm.pgs.x, pages_high, expected_next_FBP * BLOCKS_PER_PAGE,
expected_next_TBP0);
if (next_ctx.TEX0.TBP0 != expected_next_TBP0 && next_ctx.TEX0.TBP0 != potential_expected_next_TBP0)
{
GL_CACHE("IsSplitTextureShuffle: Mismatch on TBP0, expecting %x, got %x", expected_next_TBP0, next_ctx.TEX0.TBP0);
return false;
}
// Some games don't offset the FBP.
if (next_ctx.FRAME.FBP != expected_next_FBP && next_ctx.FRAME.FBP != m_cached_ctx.FRAME.FBP && next_ctx.FRAME.FBP != potential_expected_next_FBP)
{
GL_CACHE("IsSplitTextureShuffle: Mismatch on FBP, expecting %x, got %x", expected_next_FBP * BLOCKS_PER_PAGE,
next_ctx.FRAME.FBP * BLOCKS_PER_PAGE);
return false;
}
// Great, everything lines up, so skip 'em.
GL_CACHE("IsSplitTextureShuffle: Match, buffering and skipping draw.");
if (m_split_texture_shuffle_pages == 0)
{
m_split_texture_shuffle_start_FBP = m_cached_ctx.FRAME.FBP;
m_split_texture_shuffle_start_TBP = m_cached_ctx.TEX0.TBP0;
// If the game has changed the texture width to 1, we need to retranslate it to whatever the RT has so the final rect is correct.
if (m_cached_ctx.FRAME.FBW == 1)
m_split_texture_shuffle_fbw = rt_tbw;
else
m_split_texture_shuffle_fbw = m_cached_ctx.FRAME.FBW;
}
if ((m_split_texture_shuffle_pages % m_split_texture_shuffle_fbw) == 0)
m_split_texture_shuffle_pages_high += pages_high;
m_split_texture_shuffle_pages += num_pages;
return true;
}
GSVector4i GSRendererHW::GetSplitTextureShuffleDrawRect() const
{
const GSLocalMemory::psm_t& frame_psm = GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM];
GSVector4i r = GSVector4i(m_vt.m_min.p.upld(m_vt.m_max.p + GSVector4::cxpr(0.5f))).rintersect(m_context->scissor.in);
// Some games (e.g. Crash Twinsanity) adjust both FBP and TBP0, so the rectangle will be half the size
// of the actual shuffle. Others leave the FBP alone, but only adjust TBP0, and offset the draw rectangle
// to the second half of the fb. In which case, the rectangle bounds will be correct.
if (m_context->FRAME.FBP != m_split_texture_shuffle_start_FBP)
{
const int pages_high = (r.height() + frame_psm.pgs.y - 1) / frame_psm.pgs.y;
r.w = (m_split_texture_shuffle_pages_high + pages_high) * frame_psm.pgs.y;
}
// But we still need to page align, because of the +/- 8 offset.
return r.insert64<0>(0).ralign<Align_Outside>(frame_psm.pgs);
}
u32 GSRendererHW::GetEffectiveTextureShuffleFbmsk() const
{
pxAssert(m_texture_shuffle);
const u32 m = m_cached_ctx.FRAME.FBMSK & GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk;
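// Remap the C32 write mask to C16 layout (keeping the high bit of each channel's masked byte):
// R[7:3] -> bits 0-4, G[7:3] -> bits 5-9, B[7:3] -> bits 10-14, A[7] -> bit 15. The low byte of
// the result then masks the R/B ("rb") halves and the high byte the G/A ("ga") halves of the
// shuffled 16-bit pixels.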
const u32 fbmask = ((m >> 3) & 0x1F) | ((m >> 6) & 0x3E0) | ((m >> 9) & 0x7C00) | ((m >> 16) & 0x8000);
const u32 rb_mask = fbmask & 0xFF;
const u32 ga_mask = (fbmask >> 8) & 0xFF;
const u32 eff_mask =
((rb_mask == 0xFF && ga_mask == 0xFF) ? 0x00FFFFFFu : 0) | ((ga_mask == 0xFF) ? 0xFF000000u : 0);
return eff_mask;
}
GSVector4i GSRendererHW::GetDrawRectForPages(u32 bw, u32 psm, u32 num_pages)
{
const GSVector2i& pgs = GSLocalMemory::m_psm[psm].pgs;
const GSVector2i size = GSVector2i(static_cast<int>(bw) * pgs.x, static_cast<int>(num_pages / std::max(1U, bw)) * pgs.y);
return GSVector4i::loadh(size);
}
bool GSRendererHW::TryToResolveSinglePageFramebuffer(GIFRegFRAME& FRAME, bool only_next_draw)
{
const u32 start_bp = FRAME.Block();
u32 new_bw = FRAME.FBW;
u32 new_psm = FRAME.PSM;
pxAssert(new_bw <= 1);
if (m_backed_up_ctx >= 0)
{
const GSDrawingContext& next_ctx = m_env.CTXT[m_backed_up_ctx];
if (next_ctx.FRAME.FBW != new_bw)
{
// Using it as a target/Z next (Superman Returns).
if (start_bp == next_ctx.FRAME.Block())
{
GL_INS("TryToResolveSinglePageWidth(): Next FBP is split clear, using FBW of %u", next_ctx.FRAME.FBW);
new_bw = next_ctx.FRAME.FBW;
new_psm = next_ctx.FRAME.PSM;
}
else if (start_bp == next_ctx.ZBUF.Block())
{
GL_INS("TryToResolveSinglePageWidth(): Next ZBP is split clear, using FBW of %u", next_ctx.FRAME.FBW);
new_bw = next_ctx.FRAME.FBW;
}
}
// Might be using it as a texture next (NARC).
if (new_bw <= 1 && next_ctx.TEX0.TBP0 == start_bp && new_bw != next_ctx.TEX0.TBW)
{
GL_INS("TryToResolveSinglePageWidth(): Next texture is using split clear, using FBW of %u", next_ctx.TEX0.TBW);
new_bw = next_ctx.TEX0.TBW;
new_psm = next_ctx.TEX0.PSM;
}
}
if (!only_next_draw)
{
// Try for an existing target at the start BP. (Tom & Jerry)
if (new_bw <= 1)
{
GSTextureCache::Target* tgt = g_texture_cache->GetTargetWithSharedBits(start_bp, new_psm);
if (!tgt)
{
// Try with Z or FRAME (whichever we're not using).
tgt = g_texture_cache->GetTargetWithSharedBits(start_bp, new_psm ^ 0x30);
}
if (tgt && ((start_bp + (m_split_clear_pages * BLOCKS_PER_PAGE)) - 1) <= tgt->m_end_block)
{
GL_INS("TryToResolveSinglePageWidth(): Using FBW of %u and PSM %s from existing target",
tgt->m_TEX0.TBW, psm_str(tgt->m_TEX0.PSM));
new_bw = tgt->m_TEX0.TBW;
new_psm = tgt->m_TEX0.PSM;
}
}
// Still bad FBW? Fall back to the resolution hack (Brave).
if (new_bw <= 1)
{
// Framebuffer is likely to be read as 16bit later, so we will need to double the width if the write is 32bit.
const bool double_width =
GSLocalMemory::m_psm[new_psm].bpp == 32 && PCRTCDisplays.GetFramebufferBitDepth() == 16;
const GSVector2i fb_size = PCRTCDisplays.GetFramebufferSize(-1);
u32 width =
std::ceil(static_cast<float>(m_split_clear_pages * GSLocalMemory::m_psm[new_psm].pgs.y) / fb_size.y) *
64;
width = std::max((width * (double_width ? 2 : 1)), static_cast<u32>(fb_size.x));
new_bw = (width + 63) / 64;
GL_INS("TryToResolveSinglePageWidth(): Fallback guess target FBW of %u", new_bw);
}
}
if (new_bw <= 1)
return false;
FRAME.FBW = new_bw;
FRAME.PSM = new_psm;
return true;
}
bool GSRendererHW::IsSplitClearActive() const
{
return (m_split_clear_pages != 0);
}
bool GSRendererHW::IsStartingSplitClear()
{
// Shouldn't have gaps.
if (m_vt.m_eq.rgba != 0xFFFF || (!m_cached_ctx.ZBUF.ZMSK && !m_vt.m_eq.z) || !PrimitiveCoversWithoutGaps())
return false;
// Limit to only single page wide tall draws for now. Too many false positives otherwise (e.g. NFSU).
if (m_context->FRAME.FBW > 1 || m_r.height() < 1024)
return false;
u32 pages_covered;
if (!CheckNextDrawForSplitClear(m_r, &pages_covered))
return false;
m_split_clear_start = m_cached_ctx.FRAME;
m_split_clear_start_Z = m_cached_ctx.ZBUF;
m_split_clear_pages = pages_covered;
m_split_clear_color = GetConstantDirectWriteMemClearColor();
GL_INS("Starting split clear at FBP %x FBW %u PSM %s with %dx%d rect covering %u pages",
m_cached_ctx.FRAME.Block(), m_cached_ctx.FRAME.FBW, psm_str(m_cached_ctx.FRAME.PSM),
m_r.width(), m_r.height(), pages_covered);
// Remove any targets which are directly at the start.
if (IsDiscardingDstColor())
{
const u32 bp = m_cached_ctx.FRAME.Block();
g_texture_cache->InvalidateVideoMemType(GSTextureCache::RenderTarget, bp, m_cached_ctx.FRAME.PSM);
g_texture_cache->InvalidateVideoMemType(GSTextureCache::DepthStencil, bp, m_cached_ctx.FRAME.PSM);
}
return true;
}
bool GSRendererHW::ContinueSplitClear()
{
// Should be a mem clear type draw.
if (!IsConstantDirectWriteMemClear())
return false;
// Shouldn't have gaps.
if (m_vt.m_eq.rgba != 0xFFFF || (!m_cached_ctx.ZBUF.ZMSK && !m_vt.m_eq.z) || !PrimitiveCoversWithoutGaps())
return false;
// Remove any targets which are directly at the start, since we checked this draw in the last.
if (IsDiscardingDstColor())
{
const u32 bp = m_cached_ctx.FRAME.Block();
g_texture_cache->InvalidateVideoMemType(GSTextureCache::RenderTarget, bp, m_cached_ctx.FRAME.PSM);
g_texture_cache->InvalidateVideoMemType(GSTextureCache::DepthStencil, bp, m_cached_ctx.FRAME.PSM);
}
// Check next draw.
u32 pages_covered;
const bool skip = CheckNextDrawForSplitClear(m_r, &pages_covered);
// We might've found the end, but this draw still counts.
m_split_clear_pages += pages_covered;
return skip;
}
bool GSRendererHW::CheckNextDrawForSplitClear(const GSVector4i& r, u32* pages_covered_by_this_draw) const
{
const u32 end_block = GSLocalMemory::GetEndBlockAddress(m_cached_ctx.FRAME.Block(), m_cached_ctx.FRAME.FBW, m_cached_ctx.FRAME.PSM, r);
if (pages_covered_by_this_draw)
{
if (end_block < m_cached_ctx.FRAME.Block())
*pages_covered_by_this_draw = (((MAX_BLOCKS - end_block) + m_cached_ctx.FRAME.Block()) + (BLOCKS_PER_PAGE)) / BLOCKS_PER_PAGE;
else
*pages_covered_by_this_draw = ((end_block - m_cached_ctx.FRAME.Block()) + (BLOCKS_PER_PAGE)) / BLOCKS_PER_PAGE;
}
// must be changing FRAME
if (m_backed_up_ctx < 0 || (m_dirty_gs_regs & (1u << DIRTY_REG_FRAME)) == 0)
return false;
// rect width should match the FBW (page aligned)
if (r.width() != m_cached_ctx.FRAME.FBW * 64)
return false;
// next FBP should point to the end of the rect
const GSDrawingContext& next_ctx = m_env.CTXT[m_backed_up_ctx];
if (next_ctx.FRAME.Block() != ((end_block + 1) % MAX_BLOCKS) ||
m_context->TEX0.U64 != next_ctx.TEX0.U64 ||
m_context->TEX1.U64 != next_ctx.TEX1.U64 || m_context->CLAMP.U64 != next_ctx.CLAMP.U64 ||
m_context->TEST.U64 != next_ctx.TEST.U64 || ((m_context->FRAME.U64 ^ next_ctx.FRAME.U64) & (~0x1FF)) != 0 ||
((m_context->ZBUF.U64 ^ next_ctx.ZBUF.U64) & (~0x1FF)) != 0)
{
return false;
}
// check ZBP if we're doing Z too
if (!m_cached_ctx.ZBUF.ZMSK && m_cached_ctx.FRAME.FBP != m_cached_ctx.ZBUF.ZBP)
{
const u32 end_z_block = GSLocalMemory::GetEndBlockAddress(
m_cached_ctx.ZBUF.Block(), m_cached_ctx.FRAME.FBW, m_cached_ctx.ZBUF.PSM, r);
if (next_ctx.ZBUF.Block() != ((end_z_block + 1) % MAX_BLOCKS))
return false;
}
return true;
}
void GSRendererHW::FinishSplitClear()
{
GL_INS("FinishSplitClear(): Start %x FBW %u PSM %s, %u pages, %08X color", m_split_clear_start.Block(),
m_split_clear_start.FBW, psm_str(m_split_clear_start.PSM), m_split_clear_pages, m_split_clear_color);
// If this was a tall single-page draw, try to get a better BW from somewhere.
if (m_split_clear_start.FBW <= 1 && m_split_clear_pages >= 16) // 1024 high
TryToResolveSinglePageFramebuffer(m_split_clear_start, false);
SetNewFRAME(m_split_clear_start.Block(), m_split_clear_start.FBW, m_split_clear_start.PSM);
SetNewZBUF(m_split_clear_start_Z.Block(), m_split_clear_start_Z.PSM);
ReplaceVerticesWithSprite(
GetDrawRectForPages(m_split_clear_start.FBW, m_split_clear_start.PSM, m_split_clear_pages), GSVector2i(1, 1));
GL_INS("FinishSplitClear(): New draw rect is (%d,%d=>%d,%d) with FBW %u and PSM %s", m_r.x, m_r.y, m_r.z, m_r.w,
m_split_clear_start.FBW, psm_str(m_split_clear_start.PSM));
m_split_clear_start.U64 = 0;
m_split_clear_start_Z.U64 = 0;
m_split_clear_pages = 0;
m_split_clear_color = 0;
}
bool GSRendererHW::IsTBPFrameOrZ(u32 tbp) const
{
const bool is_frame = (m_cached_ctx.FRAME.Block() == tbp);
const bool is_z = (m_cached_ctx.ZBUF.Block() == tbp);
if (!is_frame && !is_z)
return false;
const u32 fm = m_cached_ctx.FRAME.FBMSK;
const u32 zm = m_cached_ctx.ZBUF.ZMSK || m_cached_ctx.TEST.ZTE == 0 ? 0xffffffff : 0;
const u32 fm_mask = GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk;
const u32 max_z = (0xFFFFFFFF >> (GSLocalMemory::m_psm[m_cached_ctx.ZBUF.PSM].fmt * 8));
const bool no_rt = (m_context->ALPHA.IsCd() && PRIM->ABE && (m_cached_ctx.FRAME.PSM == 1))
|| (!m_cached_ctx.TEST.DATE && (fm & GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk) == GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk);
const bool no_ds = (
// Depth is always pass/fail (no read) and write are discarded.
(zm != 0 && m_cached_ctx.TEST.ZTST <= ZTST_ALWAYS) ||
// Depth test will always pass
(zm != 0 && m_cached_ctx.TEST.ZTST == ZTST_GEQUAL && m_vt.m_eq.z && std::min(m_vertex.buff[0].XYZ.Z, max_z) == max_z) ||
// Depth will be written through the RT
(!no_rt && m_cached_ctx.FRAME.FBP == m_cached_ctx.ZBUF.ZBP && !PRIM->TME && zm == 0 && (fm & fm_mask) == 0 && m_cached_ctx.TEST.ZTE));
// Relying a lot on the optimizer here... I don't like it.
return (is_frame && !no_rt) || (is_z && !no_ds);
}
void GSRendererHW::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
{
// printf("[%d] InvalidateVideoMem %d,%d - %d,%d %05x (%d)\n", static_cast<int>(g_perfmon.GetFrame()), r.left, r.top, r.right, r.bottom, static_cast<int>(BITBLTBUF.DBP), static_cast<int>(BITBLTBUF.DPSM));
// This is gross, but if the EE write loops, we need to split it on the 2048 border.
GSVector4i rect = r;
bool loop_h = false;
bool loop_w = false;
if (r.w > 2048)
{
rect.w = 2048;
loop_h = true;
}
if (r.z > 2048)
{
rect.z = 2048;
loop_w = true;
}
if (loop_h || loop_w)
{
g_texture_cache->InvalidateVideoMem(m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM), rect);
if (loop_h)
{
rect.y = 0;
rect.w = r.w - 2048;
}
if (loop_w)
{
rect.x = 0;
rect.z = r.z - 2048;
}
g_texture_cache->InvalidateVideoMem(m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM), rect);
}
else
g_texture_cache->InvalidateVideoMem(m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM), r);
}
void GSRendererHW::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut)
{
// printf("[%d] InvalidateLocalMem %d,%d - %d,%d %05x (%d)\n", static_cast<int>(g_perfmon.GetFrame()), r.left, r.top, r.right, r.bottom, static_cast<int>(BITBLTBUF.SBP), static_cast<int>(BITBLTBUF.SPSM));
if (clut)
return; // FIXME
auto iter = m_draw_transfers.end();
bool skip = false;
// If the EE write overlaps the readback and was done since the last draw, there's no need to read it back.
// Dog's life does this.
while (iter != m_draw_transfers.begin())
{
--iter;
if (!(iter->draw == s_n && BITBLTBUF.SBP == iter->blit.DBP && iter->blit.DPSM == BITBLTBUF.SPSM && r.eq(iter->rect)))
continue;
g_texture_cache->InvalidateVideoMem(m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM), r);
skip = true;
break;
}
if (!skip)
{
const bool recursive_copy = (BITBLTBUF.SBP == BITBLTBUF.DBP) && (m_env.TRXDIR.XDIR == 2);
g_texture_cache->InvalidateLocalMem(m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM), r, recursive_copy);
}
}
void GSRendererHW::Move()
{
if (m_mv && m_mv(*this))
{
// Handled by HW hack.
return;
}
if (m_env.TRXDIR.XDIR == 3)
return;
const int sx = m_env.TRXPOS.SSAX;
const int sy = m_env.TRXPOS.SSAY;
const int dx = m_env.TRXPOS.DSAX;
const int dy = m_env.TRXPOS.DSAY;
const int w = m_env.TRXREG.RRW;
const int h = m_env.TRXREG.RRH;
if (g_texture_cache->Move(m_env.BITBLTBUF.SBP, m_env.BITBLTBUF.SBW, m_env.BITBLTBUF.SPSM, sx, sy,
m_env.BITBLTBUF.DBP, m_env.BITBLTBUF.DBW, m_env.BITBLTBUF.DPSM, dx, dy, w, h))
{
m_env.TRXDIR.XDIR = 3;
// Handled entirely in TC, no need to update local memory.
return;
}
GSRenderer::Move();
}
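// Helpers for RoundSpriteOffset(). Vertex coordinates are 12.4 fixed point (1/16 pixel units):
// alpha0/alpha1 return the normalized position (roughly 0..1) along the sprite edge of the first/last
// covered whole-pixel boundary, and Interpolate_UV lerps the texel coordinate at that position,
// truncated to a whole texel. Worked example (illustrative): X0=8 (0.5px), X1=168 (10.5px), L=160:
// alpha0 = (16 - 8) / 160 = 0.05 and alpha1 = (160 - 8) / 160 = 0.95.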
u16 GSRendererHW::Interpolate_UV(float alpha, int t0, int t1)
{
const float t = (1.0f - alpha) * t0 + alpha * t1;
return static_cast<u16>(t) & ~0xF; // cheap rounding
}
float GSRendererHW::alpha0(int L, int X0, int X1)
{
const int x = (X0 + 15) & ~0xF; // Round up
return static_cast<float>(x - X0) / static_cast<float>(L);
}
float GSRendererHW::alpha1(int L, int X0, int X1)
{
const int x = (X1 - 1) & ~0xF; // Round down. Note -1 because right pixel isn't included in primitive so 0x100 must return 0.
return static_cast<float>(x - X0) / static_cast<float>(L);
}
void GSRendererHW::SwSpriteRender()
{
// Supported drawing attributes
ASSERT(PRIM->PRIM == GS_TRIANGLESTRIP || PRIM->PRIM == GS_SPRITE);
ASSERT(!PRIM->FGE); // No FOG
ASSERT(!PRIM->AA1); // No antialiasing
ASSERT(!PRIM->FIX); // Normal fragment value control
ASSERT(!m_draw_env->DTHE.DTHE); // No dithering
ASSERT(!m_cached_ctx.TEST.ATE); // No alpha test
ASSERT(!m_cached_ctx.TEST.DATE); // No destination alpha test
ASSERT(!m_cached_ctx.DepthRead() && !m_cached_ctx.DepthWrite()); // No depth handling
ASSERT(!m_cached_ctx.TEX0.CSM); // No CLUT usage
ASSERT(!m_draw_env->PABE.PABE); // No PABE
// PSMCT32 pixel format
ASSERT(!PRIM->TME || m_cached_ctx.TEX0.PSM == PSMCT32);
ASSERT(m_cached_ctx.FRAME.PSM == PSMCT32);
// No rasterization required
ASSERT(PRIM->PRIM == GS_SPRITE
|| ((PRIM->IIP || m_vt.m_eq.rgba == 0xffff)
&& m_vt.m_eq.z == 0x1
&& (!PRIM->TME || PRIM->FST || m_vt.m_eq.q == 0x1))); // Check Q equality only if texturing enabled and STQ coords used
const bool texture_mapping_enabled = PRIM->TME;
const GSVector4i r = m_r;
#ifndef NDEBUG
const int tw = 1 << m_cached_ctx.TEX0.TW;
const int th = 1 << m_cached_ctx.TEX0.TH;
const float meas_tw = m_vt.m_max.t.x - m_vt.m_min.t.x;
const float meas_th = m_vt.m_max.t.y - m_vt.m_min.t.y;
ASSERT(!PRIM->TME || (abs(meas_tw - r.width()) <= SSR_UV_TOLERANCE && abs(meas_th - r.height()) <= SSR_UV_TOLERANCE)); // No input texture min/mag, if any.
ASSERT(!PRIM->TME || (abs(m_vt.m_min.t.x) <= SSR_UV_TOLERANCE && abs(m_vt.m_min.t.y) <= SSR_UV_TOLERANCE && abs(meas_tw - tw) <= SSR_UV_TOLERANCE && abs(meas_th - th) <= SSR_UV_TOLERANCE)); // No texture UV wrap, if any.
#endif
GIFRegTRXPOS trxpos = {};
trxpos.DSAX = r.x;
trxpos.DSAY = r.y;
trxpos.SSAX = static_cast<int>(m_vt.m_min.t.x / 2) * 2; // Rounded down to closest even integer.
trxpos.SSAY = static_cast<int>(m_vt.m_min.t.y / 2) * 2;
ASSERT(r.x % 2 == 0 && r.y % 2 == 0);
GIFRegTRXREG trxreg = {};
trxreg.RRW = r.width();
trxreg.RRH = r.height();
ASSERT(r.width() % 2 == 0 && r.height() % 2 == 0);
// SW rendering code, mainly taken from GSState::Move(), TRXPOS.DIR{X,Y} management excluded
const int sx = trxpos.SSAX;
int sy = trxpos.SSAY;
const int dx = trxpos.DSAX;
int dy = trxpos.DSAY;
const int w = trxreg.RRW;
const int h = trxreg.RRH;
GL_INS("SwSpriteRender: Dest 0x%x W:%d F:%s, size(%d %d)", m_cached_ctx.FRAME.Block(), m_cached_ctx.FRAME.FBW, psm_str(m_cached_ctx.FRAME.PSM), w, h);
const GSOffset spo = m_mem.GetOffset(m_context->TEX0.TBP0, m_context->TEX0.TBW, m_context->TEX0.PSM);
const GSOffset& dpo = m_context->offset.fb;
const bool alpha_blending_enabled = PRIM->ABE;
const GSVertex& v = m_index.tail > 0 ? m_vertex.buff[m_index.buff[m_index.tail - 1]] : GSVertex(); // Last vertex if any.
const GSVector4i vc = GSVector4i(v.RGBAQ.R, v.RGBAQ.G, v.RGBAQ.B, v.RGBAQ.A) // 0x000000AA000000BB000000GG000000RR
.ps32(); // 0x00AA00BB00GG00RR00AA00BB00GG00RR
const GSVector4i a_mask = GSVector4i::xff000000().u8to16(); // 0x00FF00000000000000FF000000000000
const bool fb_mask_enabled = m_cached_ctx.FRAME.FBMSK != 0x0;
const GSVector4i fb_mask = GSVector4i(m_cached_ctx.FRAME.FBMSK).u8to16(); // 0x00AA00BB00GG00RR00AA00BB00GG00RR
const u8 tex0_tfx = m_cached_ctx.TEX0.TFX;
const u8 tex0_tcc = m_cached_ctx.TEX0.TCC;
const u8 alpha_a = m_context->ALPHA.A;
const u8 alpha_b = m_context->ALPHA.B;
const u8 alpha_c = m_context->ALPHA.C;
const u8 alpha_d = m_context->ALPHA.D;
const u8 alpha_fix = m_context->ALPHA.FIX;
if (texture_mapping_enabled)
g_texture_cache->InvalidateLocalMem(spo, GSVector4i(sx, sy, sx + w, sy + h));
constexpr bool invalidate_local_mem_before_fb_read = false;
if (invalidate_local_mem_before_fb_read && (alpha_blending_enabled || fb_mask_enabled))
g_texture_cache->InvalidateLocalMem(dpo, m_r);
for (int y = 0; y < h; y++, ++sy, ++dy)
{
u32* vm = m_mem.vm32();
const GSOffset::PAHelper spa = spo.paMulti(sx, sy);
const GSOffset::PAHelper dpa = dpo.paMulti(dx, dy);
ASSERT(w % 2 == 0);
for (int x = 0; x < w; x += 2)
{
u32* di = &vm[dpa.value(x)];
ASSERT(di + 1 == &vm[dpa.value(x + 1)]); // Destination pixel pair is adjacent in memory
GSVector4i sc = {};
if (texture_mapping_enabled)
{
const u32* si = &vm[spa.value(x)];
// Read 2 source pixel colors
ASSERT(si + 1 == &vm[spa.value(x + 1)]); // Source pixel pair is adjacent in memory
sc = GSVector4i::loadl(si).u8to16(); // 0x00AA00BB00GG00RR00aa00bb00gg00rr
// Apply TFX
ASSERT(tex0_tfx == 0 || tex0_tfx == 1);
if (tex0_tfx == 0)
sc = sc.mul16l(vc).srl16(7).clamp8(); // clamp((sc * vc) >> 7, 0, 255), srl16 is ok because 16 bit values are unsigned
if (tex0_tcc == 0)
sc = sc.blend(vc, a_mask);
}
else
sc = vc;
// No FOG
GSVector4i dc0 = {};
GSVector4i dc = {};
if (alpha_blending_enabled || fb_mask_enabled)
{
// Read 2 destination pixel colors
dc0 = GSVector4i::loadl(di).u8to16(); // 0x00AA00BB00GG00RR00aa00bb00gg00rr
}
if (alpha_blending_enabled)
{
// Blending
const GSVector4i A = alpha_a == 0 ? sc : alpha_a == 1 ? dc0 : GSVector4i::zero();
const GSVector4i B = alpha_b == 0 ? sc : alpha_b == 1 ? dc0 : GSVector4i::zero();
const GSVector4i C = alpha_c == 2 ? GSVector4i(alpha_fix).xxxx().ps32()
: (alpha_c == 0 ? sc : dc0).yyww() // 0x00AA00BB00AA00BB00aa00bb00aa00bb
.srl32(16) // 0x000000AA000000AA000000aa000000aa
.ps32() // 0x00AA00AA00aa00aa00AA00AA00aa00aa
.xxyy(); // 0x00AA00AA00AA00AA00aa00aa00aa00aa
const GSVector4i D = alpha_d == 0 ? sc : alpha_d == 1 ? dc0 : GSVector4i::zero();
dc = A.sub16(B).mul16l(C).sra16(7).add16(D); // (((A - B) * C) >> 7) + D, must use sra16 due to signed 16 bit values.
// dc alpha channels (dc.u16[3], dc.u16[7]) dirty
}
else
dc = sc;
// No dithering
// Clamping
if (m_draw_env->COLCLAMP.CLAMP)
dc = dc.clamp8(); // clamp(dc, 0, 255)
else
dc = dc.sll16(8).srl16(8); // Mask, lower 8 bits enabled per channel
// No Alpha Correction
ASSERT(m_context->FBA.FBA == 0);
dc = dc.blend(sc, a_mask);
// dc alpha channels valid
// Frame buffer mask
if (fb_mask_enabled)
dc = dc.blend(dc0, fb_mask);
// Store 2 pixel colors
dc = dc.pu16(GSVector4i::zero()); // 0x0000000000000000AABBGGRRaabbggrr
GSVector4i::storel(di, dc);
}
}
g_texture_cache->InvalidateVideoMem(dpo, m_r);
}
bool GSRendererHW::CanUseSwSpriteRender()
{
const GSVector4i r = m_r;
if (r.x % 2 != 0 || r.y % 2 != 0)
return false; // Even offset.
const int w = r.width();
const int h = r.height();
if (w % 2 != 0 || h % 2 != 0)
return false; // Even size.
if (w > 64 || h > 64)
return false; // Small draw.
if (PRIM->PRIM != GS_SPRITE
&& ((PRIM->IIP && m_vt.m_eq.rgba != 0xffff)
|| (PRIM->TME && !PRIM->FST && m_vt.m_eq.q != 0x1)
|| m_vt.m_eq.z != 0x1)) // No rasterization
return false;
if (m_vt.m_primclass != GS_TRIANGLE_CLASS && m_vt.m_primclass != GS_SPRITE_CLASS) // Triangle or sprite class prims
return false;
if (PRIM->PRIM != GS_TRIANGLESTRIP && PRIM->PRIM != GS_SPRITE) // Triangle strip or sprite draw
return false;
if (m_vt.m_primclass == GS_TRIANGLE_CLASS && (PRIM->PRIM != GS_TRIANGLESTRIP || m_vertex.tail != 4)) // If triangle class, strip draw with 4 vertices (two prims, emulating single sprite prim)
return false;
// TODO If GS_TRIANGLESTRIP draw, check that the draw is axis aligned
if (m_vt.m_primclass == GS_SPRITE_CLASS && (PRIM->PRIM != GS_SPRITE || m_vertex.tail != 2)) // If sprite class, sprite draw with 2 vertices (one prim)
return false;
if (m_cached_ctx.DepthRead() || m_cached_ctx.DepthWrite()) // No depth handling
return false;
if (m_cached_ctx.FRAME.PSM != PSMCT32) // Frame buffer format is 32 bit color
return false;
if (PRIM->TME)
{
// Texture mapping enabled
if (m_cached_ctx.TEX0.PSM != PSMCT32) // Input texture format is 32 bit color
return false;
if (IsMipMapDraw()) // No mipmapping.
return false;
const int tw = 1 << m_cached_ctx.TEX0.TW;
const int th = 1 << m_cached_ctx.TEX0.TH;
const float meas_tw = m_vt.m_max.t.x - m_vt.m_min.t.x;
const float meas_th = m_vt.m_max.t.y - m_vt.m_min.t.y;
if (abs(m_vt.m_min.t.x) > SSR_UV_TOLERANCE ||
abs(m_vt.m_min.t.y) > SSR_UV_TOLERANCE ||
abs(meas_tw - tw) > SSR_UV_TOLERANCE ||
abs(meas_th - th) > SSR_UV_TOLERANCE) // No UV wrapping.
return false;
if (abs(meas_tw - w) > SSR_UV_TOLERANCE || abs(meas_th - h) > SSR_UV_TOLERANCE) // No texture width or height mag/min.
return false;
}
// The draw call is a good candidate for using SwSpriteRender to replace the GPU draw.
// However, some draw attributes might not be supported yet by SwSpriteRender, so if any bug
// occurs when using it, building in debug mode will probably cause some of the assertions used
// in SwSpriteRender to fail, highlighting its limitations.
// In that case, either a condition can be added here to discard the draw, or
// SwSpriteRender can be improved by adding the missing features.
return true;
}
template <bool linear>
void GSRendererHW::RoundSpriteOffset()
{
//#define DEBUG_U
//#define DEBUG_V
#if defined(DEBUG_V) || defined(DEBUG_U)
bool debug = linear;
#endif
const u32 count = m_vertex.next;
GSVertex* v = &m_vertex.buff[0];
for (u32 i = 0; i < count; i += 2)
{
// Performance note: if it had any impact on perf, someone would port it to SSE (AKA GSVector)
// Compute the coordinates of the first and last texels (at native resolution, with linear filtering)
const int ox = m_context->XYOFFSET.OFX;
const int X0 = v[i].XYZ.X - ox;
const int X1 = v[i + 1].XYZ.X - ox;
const int Lx = (v[i + 1].XYZ.X - v[i].XYZ.X);
const float ax0 = alpha0(Lx, X0, X1);
const float ax1 = alpha1(Lx, X0, X1);
const u16 tx0 = Interpolate_UV(ax0, v[i].U, v[i + 1].U);
const u16 tx1 = Interpolate_UV(ax1, v[i].U, v[i + 1].U);
#ifdef DEBUG_U
if (debug)
{
fprintf(stderr, "u0:%d and u1:%d\n", v[i].U, v[i + 1].U);
fprintf(stderr, "a0:%f and a1:%f\n", ax0, ax1);
fprintf(stderr, "t0:%d and t1:%d\n", tx0, tx1);
}
#endif
const int oy = m_context->XYOFFSET.OFY;
const int Y0 = v[i].XYZ.Y - oy;
const int Y1 = v[i + 1].XYZ.Y - oy;
const int Ly = (v[i + 1].XYZ.Y - v[i].XYZ.Y);
const float ay0 = alpha0(Ly, Y0, Y1);
const float ay1 = alpha1(Ly, Y0, Y1);
const u16 ty0 = Interpolate_UV(ay0, v[i].V, v[i + 1].V);
const u16 ty1 = Interpolate_UV(ay1, v[i].V, v[i + 1].V);
#ifdef DEBUG_V
if (debug)
{
fprintf(stderr, "v0:%d and v1:%d\n", v[i].V, v[i + 1].V);
fprintf(stderr, "a0:%f and a1:%f\n", ay0, ay1);
fprintf(stderr, "t0:%d and t1:%d\n", ty0, ty1);
}
#endif
#ifdef DEBUG_U
if (debug)
fprintf(stderr, "GREP_BEFORE %d => %d\n", v[i].U, v[i + 1].U);
#endif
#ifdef DEBUG_V
if (debug)
fprintf(stderr, "GREP_BEFORE %d => %d\n", v[i].V, v[i + 1].V);
#endif
#if 1
// Use the rounded value of the newly computed texture coordinate. It ensures
// that sampling will remain inside the texture boundary.
//
// Note for bilinear: by definition it will never work correctly! A slight modification
// of the interpolation might trigger a discard (with alpha testing).
// Let's use something simple that corrects the really bad cases (for a couple of 2D games).
// I hope it won't create too many glitches.
if (linear)
{
const int Lu = v[i + 1].U - v[i].U;
// Note 32 is based on taisho-mononoke
if ((Lu > 0) && (Lu <= (Lx + 32)))
{
v[i + 1].U -= 8;
}
}
else
{
if (tx0 <= tx1)
{
v[i].U = tx0;
v[i + 1].U = tx1 + 16;
}
else
{
v[i].U = tx0 + 15;
v[i + 1].U = tx1;
}
}
#endif
#if 1
if (linear)
{
const int Lv = v[i + 1].V - v[i].V;
if ((Lv > 0) && (Lv <= (Ly + 32)))
{
v[i + 1].V -= 8;
}
}
else
{
if (ty0 <= ty1)
{
v[i].V = ty0;
v[i + 1].V = ty1 + 16;
}
else
{
v[i].V = ty0 + 15;
v[i + 1].V = ty1;
}
}
#endif
#ifdef DEBUG_U
if (debug)
fprintf(stderr, "GREP_AFTER %d => %d\n\n", v[i].U, v[i + 1].U);
#endif
#ifdef DEBUG_V
if (debug)
fprintf(stderr, "GREP_AFTER %d => %d\n\n", v[i].V, v[i + 1].V);
#endif
}
}
void GSRendererHW::Draw()
{
if (GSConfig.DumpGSData && (s_n >= GSConfig.SaveN))
{
std::string s;
// Dump Register state
s = GetDrawDumpPath("%05d_context.txt", s_n);
m_draw_env->Dump(s);
m_context->Dump(s);
// Dump vertices
s = GetDrawDumpPath("%05d_vertex.txt", s_n);
DumpVertices(s);
}
#ifdef ENABLE_OGL_DEBUG
static u32 num_skipped_channel_shuffle_draws = 0;
#endif
// We mess with this state as an optimization, so take a copy and use that instead.
const GSDrawingContext* context = m_context;
m_cached_ctx.TEX0 = context->TEX0;
m_cached_ctx.CLAMP = context->CLAMP;
m_cached_ctx.TEST = context->TEST;
m_cached_ctx.FRAME = context->FRAME;
m_cached_ctx.ZBUF = context->ZBUF;
m_primitive_covers_without_gaps.reset();
if (IsBadFrame())
{
GL_INS("Warning skipping a draw call (%d)", s_n);
return;
}
// Channel shuffles repeat lots of draws. Get out early if we can.
if (m_channel_shuffle)
{
// NFSU2 does consecutive channel shuffles with blending, reducing the alpha channel over time.
// Fortunately, it seems to change the FBMSK along the way, so this check alone is sufficient.
// Tomb Raider: Underworld does similar, except with R, G, B in separate palettes, therefore
// we need to split on those too.
m_channel_shuffle = IsPossibleChannelShuffle() && m_last_channel_shuffle_fbmsk == m_context->FRAME.FBMSK;
#ifdef ENABLE_OGL_DEBUG
if (m_channel_shuffle)
{
num_skipped_channel_shuffle_draws++;
return;
}
if (num_skipped_channel_shuffle_draws > 0)
GL_INS("Skipped %u channel shuffle draws", num_skipped_channel_shuffle_draws);
num_skipped_channel_shuffle_draws = 0;
#else
if (m_channel_shuffle)
return;
#endif
}
GL_PUSH("HW Draw %d (Context %u)", s_n, PRIM->CTXT);
GL_INS("FLUSH REASON: %s%s", GetFlushReasonString(m_state_flush_reason),
(m_state_flush_reason != GSFlushReason::CONTEXTCHANGE && m_dirty_gs_regs) ? " AND POSSIBLE CONTEXT CHANGE" :
"");
// When the format is 24bit (Z or C), DATE ceases to function.
// It was believed that in 24bit mode all pixels pass because alpha doesn't exist;
// however, after testing this on a PS2, it turns out nothing passes: it ignores the draw.
if ((m_cached_ctx.FRAME.PSM & 0xF) == PSMCT24 && m_context->TEST.DATE)
{
GL_CACHE("DATE on a 24bit format, Frame PSM %x", m_context->FRAME.PSM);
return;
}
// skip alpha test if possible
// Note: do it first so we know if frame/depth writes are masked
u32 fm = m_cached_ctx.FRAME.FBMSK;
u32 zm = m_cached_ctx.ZBUF.ZMSK || m_cached_ctx.TEST.ZTE == 0 ? 0xffffffff : 0;
const u32 fm_mask = GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk;
// Note: required to compute TryAlphaTest below, so do it now.
const GSDrawingEnvironment& env = *m_draw_env;
const GSLocalMemory::psm_t& tex_psm = GSLocalMemory::m_psm[context->TEX0.PSM];
if (PRIM->TME && tex_psm.pal > 0)
m_mem.m_clut.Read32(m_cached_ctx.TEX0, env.TEXA);
// Test if we can optimize Alpha Test as a NOP
m_cached_ctx.TEST.ATE = m_cached_ctx.TEST.ATE && !GSRenderer::TryAlphaTest(fm, fm_mask, zm);
// Need to fix the alpha test, since the alpha will be fixed to 1.0 if ABE is disabled and AA1 is enabled
// So if it doesn't meet the condition, always fail, if it does, always pass (turn off the test).
if (IsCoverageAlpha() && m_cached_ctx.TEST.ATE && m_cached_ctx.TEST.ATST > 1)
{
const float aref = static_cast<float>(m_cached_ctx.TEST.AREF);
const int old_ATST = m_cached_ctx.TEST.ATST;
m_cached_ctx.TEST.ATST = 0;
switch (old_ATST)
{
case ATST_LESS:
if (128.0f < aref)
m_cached_ctx.TEST.ATE = false;
break;
case ATST_LEQUAL:
if (128.0f <= aref)
m_cached_ctx.TEST.ATE = false;
break;
case ATST_EQUAL:
if (128.0f == aref)
m_cached_ctx.TEST.ATE = false;
break;
case ATST_GEQUAL:
if (128.0f >= aref)
m_cached_ctx.TEST.ATE = false;
break;
case ATST_GREATER:
if (128.0f > aref)
m_cached_ctx.TEST.ATE = false;
break;
case ATST_NOTEQUAL:
if (128.0f != aref)
m_cached_ctx.TEST.ATE = false;
break;
default:
break;
}
}
m_cached_ctx.FRAME.FBMSK = fm;
m_cached_ctx.ZBUF.ZMSK = zm != 0;
// It is allowed to use the depth and RT at the same location, however at least one must
// be disabled, or the written value must be the same on both channels.
// 1/ GoW uses Cd blending on a 24 bits buffer (no alpha)
// 2/ SuperMan really draws (0,0,0,0) color and a (0) 32 bits depth
// 3/ 50cents really draws (0,0,0,128) color and a (0) 24 bits depth
// Note: FF DoC has both buffers at the same location but disables the depth test (write?) with ZTE = 0
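// ZBUF fmt is 0/1/2 for Z32/Z24/Z16, so this yields a max Z of 0xFFFFFFFF, 0xFFFFFF or 0xFFFF respectively.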
const u32 max_z = (0xFFFFFFFF >> (GSLocalMemory::m_psm[m_cached_ctx.ZBUF.PSM].fmt * 8));
bool no_rt = (context->ALPHA.IsCd() && PRIM->ABE && (m_cached_ctx.FRAME.PSM == 1))
|| (!m_cached_ctx.TEST.DATE && (fm & GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk) == GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk);
const bool all_depth_tests_pass =
// Depth is always pass/fail (no read) and writes are discarded.
(!m_cached_ctx.TEST.ZTE || m_cached_ctx.TEST.ZTST <= ZTST_ALWAYS) ||
// Depth test will always pass
(m_cached_ctx.TEST.ZTST == ZTST_GEQUAL && m_vt.m_eq.z && std::min(m_vertex.buff[0].XYZ.Z, max_z) == max_z);
bool no_ds = (zm != 0 && all_depth_tests_pass) ||
// Depth will be written through the RT
(!no_rt && m_cached_ctx.FRAME.FBP == m_cached_ctx.ZBUF.ZBP && !PRIM->TME && zm == 0 && (fm & fm_mask) == 0 && m_cached_ctx.TEST.ZTE);
// No Z test if no z buffer.
if (no_ds || all_depth_tests_pass)
{
if (m_cached_ctx.TEST.ZTST != ZTST_ALWAYS)
GL_CACHE("Disabling Z tests because all tests will pass.");
m_cached_ctx.TEST.ZTST = ZTST_ALWAYS;
}
if (no_rt && no_ds)
{
GL_CACHE("Skipping draw with no color nor depth output.");
return;
}
const bool draw_sprite_tex = PRIM->TME && (m_vt.m_primclass == GS_SPRITE_CLASS);
// We trigger the sw prim render here super early, to avoid creating superfluous render targets.
if (CanUseSwPrimRender(no_rt, no_ds, draw_sprite_tex) && SwPrimRender(*this, true, true))
{
GL_CACHE("Possible texture decompression, drawn with SwPrimRender() (BP %x BW %u TBP0 %x TBW %u)",
m_cached_ctx.FRAME.Block(), m_cached_ctx.FRAME.FBMSK, m_cached_ctx.TEX0.TBP0, m_cached_ctx.TEX0.TBW);
return;
}
// GS doesn't fill the right or bottom edges of sprites/triangles, and for a pixel to be shaded, the vertex
// must cross the center. In other words, the range is equal to the floor of coordinates +0.5. Except for
// the case where the minimum equals the maximum, because at least one pixel is filled per line.
// Test cases for the math:
// --------------------------------------
// | Position range | Draw Range | Size |
// | -0.5,0.0 | 0-0 | 1 |
// | -0.5,0.5 | 0-0 | 1 |
// | 0,1 | 0-0 | 1 |
// | 0,1.5 | 0-1 | 2 |
// | 0.5,1.5 | 1-1 | 1 |
// | 0.5,1.75 | 1-1 | 1 |
// | 0.5,2.25 | 1-1 | 1 |
// | 0.5,2.5 | 1-2 | 2 |
// --------------------------------------
m_r = GSVector4i(m_vt.m_min.p.upld(m_vt.m_max.p) + GSVector4::cxpr(0.5f));
m_r = m_r.blend8(m_r + GSVector4i::cxpr(0, 0, 1, 1), (m_r.xyxy() == m_r.zwzw()));
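// Where min == max on an axis (the blend8 mask), extend the max by one pixel so at least one pixel per line is drawn, as in the table above.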
m_r = m_r.rintersect(context->scissor.in);
// We want to fix up the context if we're doing a double half clear, regardless of whether we do the CPU fill.
const bool is_possible_mem_clear = IsConstantDirectWriteMemClear();
if (!GSConfig.UserHacks_DisableSafeFeatures && is_possible_mem_clear)
{
if (!DetectStripedDoubleClear(no_rt, no_ds))
DetectDoubleHalfClear(no_rt, no_ds);
}
const bool process_texture = PRIM->TME && !(PRIM->ABE && m_context->ALPHA.IsBlack() && !m_cached_ctx.TEX0.TCC);
const u32 frame_end_bp = GSLocalMemory::GetUnwrappedEndBlockAddress(m_cached_ctx.FRAME.Block(), m_cached_ctx.FRAME.FBW, m_cached_ctx.FRAME.PSM, m_r);
const bool tex_is_rt = (process_texture && m_cached_ctx.TEX0.TBP0 >= m_cached_ctx.FRAME.Block() &&
m_cached_ctx.TEX0.TBP0 < frame_end_bp);
const bool not_writing_to_all = (!PrimitiveCoversWithoutGaps() || AreAnyPixelsDiscarded() || !all_depth_tests_pass);
const bool preserve_rt_rgb = (!no_rt && (!IsDiscardingDstRGB() || not_writing_to_all || tex_is_rt));
const bool preserve_rt_alpha =
(!no_rt && (!IsDiscardingDstAlpha() || not_writing_to_all ||
(tex_is_rt && GSLocalMemory::m_psm[m_cached_ctx.TEX0.PSM].trbpp != 24)));
bool preserve_rt_color = preserve_rt_rgb || preserve_rt_alpha;
bool preserve_depth =
not_writing_to_all || (!no_ds && (!all_depth_tests_pass || !m_cached_ctx.DepthWrite() || m_cached_ctx.TEST.ATE));
// SW CLUT Render enable.
bool force_preload = GSConfig.PreloadFrameWithGSData;
if (GSConfig.UserHacks_CPUCLUTRender > 0 || GSConfig.UserHacks_GPUTargetCLUTMode != GSGPUTargetCLUTMode::Disabled)
{
const CLUTDrawTestResult result = (GSConfig.UserHacks_CPUCLUTRender == 2) ? PossibleCLUTDrawAggressive() : PossibleCLUTDraw();
m_mem.m_clut.ClearDrawInvalidity();
if (result == CLUTDrawTestResult::CLUTDrawOnCPU && GSConfig.UserHacks_CPUCLUTRender > 0)
{
if (SwPrimRender(*this, true, true))
{
GL_CACHE("Possible clut draw, drawn with SwPrimRender()");
return;
}
}
else if (result != CLUTDrawTestResult::NotCLUTDraw)
{
// Force enable preloading if any of the existing data is needed.
// e.g. NFSMW only writes the alpha channel, and needs the RGB preloaded.
force_preload |= preserve_rt_color;
if (preserve_rt_color)
GL_INS("Forcing preload due to partial/blended CLUT draw");
}
}
if (!m_channel_shuffle && m_cached_ctx.FRAME.Block() == m_cached_ctx.TEX0.TBP0 &&
IsPossibleChannelShuffle())
{
// Special post-processing effect
GL_INS("Possible channel shuffle effect detected");
m_channel_shuffle = true;
m_last_channel_shuffle_fbmsk = m_context->FRAME.FBMSK;
}
else if (IsSplitClearActive())
{
if (ContinueSplitClear())
{
GL_INS("Skipping due to continued split clear, FBP %x FBW %u", m_cached_ctx.FRAME.Block(), m_cached_ctx.FRAME.FBW);
return;
}
else
{
FinishSplitClear();
}
}
m_texture_shuffle = false;
m_copy_16bit_to_target_shuffle = false;
m_same_group_texture_shuffle = false;
const bool is_split_texture_shuffle = (m_split_texture_shuffle_pages > 0);
if (is_split_texture_shuffle)
{
// Adjust the draw rectangle to the new page range, so we get the correct fb height.
const GSVector4i new_r = GetSplitTextureShuffleDrawRect();
GL_CACHE(
"Split texture shuffle: FBP %x -> %x, TBP0 %x -> %x, draw %d,%d => %d,%d -> %d,%d => %d,%d",
m_cached_ctx.FRAME.Block(), m_split_texture_shuffle_start_FBP * BLOCKS_PER_PAGE,
m_cached_ctx.TEX0.TBP0, m_split_texture_shuffle_start_TBP,
m_r.x, m_r.y, m_r.z, m_r.w,
new_r.x, new_r.y, new_r.z, new_r.w);
m_r = new_r;
// Adjust the scissor too, if it's in two parts, this will be wrong.
m_context->scissor.in = new_r;
// Fudge FRAME and TEX0 to point to the start of the shuffle.
m_cached_ctx.TEX0.TBP0 = m_split_texture_shuffle_start_TBP;
// We correct this again at the end of the split
SetNewFRAME(m_split_texture_shuffle_start_FBP << 5, m_context->FRAME.FBW, m_cached_ctx.FRAME.PSM);
// TEX0 may also just be using a single width with offsets, so let's deal with that.
if (m_split_texture_shuffle_pages > 1 && !NextDrawMatchesShuffle())
{
if (m_context->FRAME.FBW != m_split_texture_shuffle_fbw && m_cached_ctx.TEX0.TBW == 1)
{
const GSLocalMemory::psm_t& frame_psm = GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM];
const GSLocalMemory::psm_t& tex_psm = GSLocalMemory::m_psm[m_cached_ctx.TEX0.PSM];
// This is the final draw of the shuffle, so let's fudge the numbers
// Need to update the final rect as it could be wrong.
if (m_context->FRAME.FBW == 1 && m_split_texture_shuffle_fbw != m_context->FRAME.FBW)
{
m_r.x = 0; // Need to keep the X offset to calculate the shuffle.
m_r.z = m_split_texture_shuffle_fbw * frame_psm.pgs.x;
m_r.y = 0;
m_r.w = std::min(1024U, m_split_texture_shuffle_pages_high * frame_psm.pgs.y); // Max we can shuffle is 1024 (512)
// Fudge the scissor and frame
m_context->scissor.in = m_r;
SetNewFRAME(m_split_texture_shuffle_start_FBP << 5, m_split_texture_shuffle_fbw, m_cached_ctx.FRAME.PSM);
}
const int pages = m_split_texture_shuffle_pages + 1;
const int width = m_split_texture_shuffle_fbw;
const int height = (pages >= width) ? (pages / width) : 1;
// We must update the texture size! It will likely be 64x64, which is no good, so let's fudge that.
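// TW/TH store log2 of the texture dimensions, hence the ceil(log2()) of the page-aligned size (capped at 1024).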
m_cached_ctx.TEX0.TW = std::ceil(std::log2(std::min(1024, width * tex_psm.pgs.x)));
m_cached_ctx.TEX0.TH = std::ceil(std::log2(std::min(1024, height * tex_psm.pgs.y)));
m_cached_ctx.TEX0.TBW = m_split_texture_shuffle_fbw;
}
}
}
if (!GSConfig.UserHacks_DisableSafeFeatures && is_possible_mem_clear)
{
GL_INS("WARNING: Possible mem clear.");
// We'll finish things off later.
if (IsStartingSplitClear())
{
CleanupDraw(false);
return;
}
// Try to fix large single-page-wide draws.
bool height_invalid = m_r.w >= 1024;
if (height_invalid && m_cached_ctx.FRAME.FBW <= 1 &&
TryToResolveSinglePageFramebuffer(m_cached_ctx.FRAME, true))
{
const GSVector2i& pgs = GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].pgs;
ReplaceVerticesWithSprite(
GetDrawRectForPages(m_cached_ctx.FRAME.FBW, m_cached_ctx.FRAME.PSM, (m_r.w + (pgs.y - 1)) / pgs.y),
GSVector2i(1, 1));
height_invalid = false;
}
const bool is_zero_color_clear = (GetConstantDirectWriteMemClearColor() == 0 && !preserve_rt_color);
const bool is_zero_depth_clear = (GetConstantDirectWriteMemClearDepth() == 0 && !preserve_depth);
// If it's an invalid-sized draw, do the mem clear on the CPU, we don't want to create huge targets.
// If clearing to zero, don't bother creating the target. Games tend to clear more than they use, wasting VRAM/bandwidth.
if (is_zero_color_clear || is_zero_depth_clear || height_invalid)
{
const u32 rt_end_bp = GSLocalMemory::GetUnwrappedEndBlockAddress(
m_cached_ctx.FRAME.Block(), m_cached_ctx.FRAME.FBW, m_cached_ctx.FRAME.PSM, m_r);
const u32 ds_end_bp = GSLocalMemory::GetUnwrappedEndBlockAddress(
m_cached_ctx.ZBUF.Block(), m_cached_ctx.FRAME.FBW, m_cached_ctx.ZBUF.PSM, m_r);
// If this is a partial clear of a larger buffer, we can't invalidate the target, since we'll be losing data
// which only existed on the GPU. Assume a BW change is a new target, though. Test case: Persona 3 shadows.
GSTextureCache::Target* tgt;
const bool overwriting_whole_rt =
(no_rt || height_invalid ||
(tgt = g_texture_cache->GetExactTarget(m_cached_ctx.FRAME.Block(), m_cached_ctx.FRAME.FBW,
GSTextureCache::RenderTarget, rt_end_bp)) == nullptr ||
m_r.rintersect(tgt->m_valid).eq(tgt->m_valid));
const bool overwriting_whole_ds =
(no_ds || height_invalid ||
(tgt = g_texture_cache->GetExactTarget(m_cached_ctx.ZBUF.Block(), m_cached_ctx.FRAME.FBW,
GSTextureCache::DepthStencil, ds_end_bp)) == nullptr ||
m_r.rintersect(tgt->m_valid).eq(tgt->m_valid));
if (overwriting_whole_rt && overwriting_whole_ds &&
TryGSMemClear(no_rt, preserve_rt_color, is_zero_color_clear, rt_end_bp,
no_ds, preserve_depth, is_zero_depth_clear, ds_end_bp))
{
GL_INS("Skipping (%d,%d=>%d,%d) draw at FBP %x/ZBP %x due to invalid height or zero clear.", m_r.x, m_r.y,
m_r.z, m_r.w, m_cached_ctx.FRAME.Block(), m_cached_ctx.ZBUF.Block());
CleanupDraw(false);
return;
}
}
}
GIFRegTEX0 TEX0 = {};
GSTextureCache::Source* src = nullptr;
TextureMinMaxResult tmm;
// Disable texture mapping if the blend is black and using alpha from vertex.
if (process_texture)
{
GIFRegCLAMP MIP_CLAMP = m_cached_ctx.CLAMP;
GSVector2i hash_lod_range(0, 0);
m_lod = GSVector2i(0, 0);
// Code from the SW renderer
if (IsMipMapActive())
{
const int interpolation = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri
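// TEX1.K is a fixed-point LOD bias with 4 fractional bits; adding 8 rounds to the nearest integer before the shift.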
int k = (m_context->TEX1.K + 8) >> 4;
int lcm = m_context->TEX1.LCM;
const int mxl = std::min<int>(static_cast<int>(m_context->TEX1.MXL), 6);
if (static_cast<int>(m_vt.m_lod.x) >= mxl)
{
k = mxl; // set lod to max level
lcm = 1; // constant lod
}
if (PRIM->FST)
{
ASSERT(lcm == 1);
ASSERT(((m_vt.m_min.t.uph(m_vt.m_max.t) == GSVector4::zero()).mask() & 3) == 3); // ratchet and clank (menu)
lcm = 1;
}
if (lcm == 1)
{
m_lod.x = std::max<int>(k, 0);
m_lod.y = m_lod.x;
}
else
{
// Not constant, but who cares!
if (interpolation == 2)
{
// Mipmap Linear. Both layers are sampled, only take the big one
m_lod.x = std::max<int>(static_cast<int>(floor(m_vt.m_lod.x)), 0);
}
else
{
// On GS lod is a fixed float number 7:4 (4 bit for the frac part)
#if 0
m_lod.x = std::max<int>(static_cast<int>(round(m_vt.m_lod.x + 0.0625)), 0);
#else
// Same as above with a bigger margin on rounding.
// The goal is to avoid an undrawn pixel around the edge which triggers the load of the bigger
// layer.
if (ceil(m_vt.m_lod.x) < m_vt.m_lod.y)
m_lod.x = std::max<int>(static_cast<int>(round(m_vt.m_lod.x + 0.0625 + 0.01)), 0);
else
m_lod.x = std::max<int>(static_cast<int>(round(m_vt.m_lod.x + 0.0625)), 0);
#endif
}
m_lod.y = std::max<int>(static_cast<int>(ceil(m_vt.m_lod.y)), 0);
}
m_lod.x = std::min<int>(m_lod.x, mxl);
m_lod.y = std::min<int>(m_lod.y, mxl);
TEX0 = (m_lod.x == 0) ? m_cached_ctx.TEX0 : GetTex0Layer(m_lod.x);
// Upload the full chain (with offset) for the hash cache, in case some other texture uses more levels.
// For basic mipmapping, we can get away with just doing the base image, since all the mips get generated anyway.
hash_lod_range = GSVector2i(m_lod.x, (GSConfig.HWMipmap == HWMipmapLevel::Full) ? mxl : m_lod.x);
MIP_CLAMP.MINU >>= m_lod.x;
MIP_CLAMP.MINV >>= m_lod.x;
MIP_CLAMP.MAXU >>= m_lod.x;
MIP_CLAMP.MAXV >>= m_lod.x;
for (int i = 0; i < m_lod.x; i++)
{
m_vt.m_min.t *= 0.5f;
m_vt.m_max.t *= 0.5f;
}
GL_CACHE("Mipmap LOD %d %d (%f %f) new size %dx%d (K %d L %u)", m_lod.x, m_lod.y, m_vt.m_lod.x, m_vt.m_lod.y, 1 << TEX0.TW, 1 << TEX0.TH, m_context->TEX1.K, m_context->TEX1.L);
}
else
{
TEX0 = m_cached_ctx.TEX0;
}
tmm = GetTextureMinMax(TEX0, MIP_CLAMP, m_vt.IsLinear(), false);
// Snowblind games set TW/TH to 1024, and use UVs for smaller textures inside that.
// Such textures usually contain junk in local memory, so try to make them smaller based on UVs.
// We can only do this for UVs, because ST repeat won't be correct.
if (GSConfig.UserHacks_EstimateTextureRegion && // enabled
(PRIM->FST || (MIP_CLAMP.WMS == CLAMP_CLAMP && MIP_CLAMP.WMT == CLAMP_CLAMP)) && // UV or ST with clamp
TEX0.TW >= 9 && TEX0.TH >= 9 && // 512x512
MIP_CLAMP.WMS < CLAMP_REGION_CLAMP && MIP_CLAMP.WMT < CLAMP_REGION_CLAMP && // not using custom region
((m_vt.m_max.t >= GSVector4(512.0f)).mask() & 0x3) == 0) // If the UVs actually are large, don't optimize.
{
// Clamp to the UVs of the texture. We could align this to something, but it ends up working better to just duplicate
// for different sizes in the hash cache, rather than hashing more and duplicating based on local memory.
const GSVector4i maxt(m_vt.m_max.t + GSVector4(m_vt.IsLinear() ? 0.5f : 0.0f));
MIP_CLAMP.WMS = CLAMP_REGION_CLAMP;
MIP_CLAMP.WMT = CLAMP_REGION_CLAMP;
MIP_CLAMP.MINU = 0;
MIP_CLAMP.MAXU = maxt.x >> m_lod.x;
MIP_CLAMP.MINV = 0;
MIP_CLAMP.MAXV = maxt.y >> m_lod.x;
GL_CACHE("Estimated texture region: %u,%u -> %u,%u", MIP_CLAMP.MINU, MIP_CLAMP.MINV, MIP_CLAMP.MAXU + 1,
MIP_CLAMP.MAXV + 1);
}
GIFRegTEX0 FRAME_TEX0;
bool rt_32bit = false;
if (!no_rt && m_cached_ctx.FRAME.Block() != m_cached_ctx.TEX0.TBP0 && GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].bpp == 16)
{
// FBW is going to be wrong for channel shuffling into a new target, so take it from the source.
FRAME_TEX0.U64 = 0;
FRAME_TEX0.TBP0 = m_cached_ctx.FRAME.Block();
FRAME_TEX0.TBW = m_cached_ctx.FRAME.FBW;
FRAME_TEX0.PSM = m_cached_ctx.FRAME.PSM;
GSTextureCache::Target* tgt = g_texture_cache->LookupTarget(FRAME_TEX0, GSVector2i(m_vt.m_max.p.x, m_vt.m_max.p.y), GetTextureScaleFactor(), GSTextureCache::RenderTarget, true,
fm);
if (tgt)
rt_32bit = tgt->m_32_bits_fmt;
tgt = nullptr;
}
const bool possible_shuffle = ((rt_32bit && GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].bpp == 16) || m_cached_ctx.FRAME.Block() == m_cached_ctx.TEX0.TBP0) || IsPossibleChannelShuffle();
const bool req_color = (!PRIM->ABE || (PRIM->ABE && m_context->ALPHA.IsUsingCs())) && (possible_shuffle || (m_cached_ctx.FRAME.FBMSK & (fm_mask & 0x00FFFFFF)) != (fm_mask & 0x00FFFFFF));
const bool req_alpha = m_context->TEX0.TCC && ((m_cached_ctx.TEST.ATE && m_cached_ctx.TEST.ATST > ATST_ALWAYS) || (possible_shuffle || (m_cached_ctx.FRAME.FBMSK & (fm_mask & 0xFF000000)) != (fm_mask & 0xFF000000)));
src = tex_psm.depth ? g_texture_cache->LookupDepthSource(TEX0, env.TEXA, MIP_CLAMP, tmm.coverage, possible_shuffle, m_vt.IsLinear(), m_cached_ctx.FRAME.Block(), req_color, req_alpha) :
g_texture_cache->LookupSource(TEX0, env.TEXA, MIP_CLAMP, tmm.coverage, (GSConfig.HWMipmap >= HWMipmapLevel::Basic || GSConfig.TriFilter == TriFiltering::Forced) ? &hash_lod_range : nullptr,
possible_shuffle, m_vt.IsLinear(), m_cached_ctx.FRAME.Block(), req_color, req_alpha);
if (unlikely(!src))
{
GL_INS("ERROR: Source lookup failed, skipping.");
CleanupDraw(true);
return;
}
// We don't know the alpha range of direct sources when we first tried to optimize the alpha test.
// Moving the texture lookup before the ATST optimization complicates things a lot, so instead,
// recompute it, and everything derived from it again if it changes.
if (GSLocalMemory::m_psm[src->m_TEX0.PSM].pal == 0)
{
CalcAlphaMinMax(src->m_alpha_minmax.first, src->m_alpha_minmax.second);
u32 new_fm = m_context->FRAME.FBMSK;
u32 new_zm = m_context->ZBUF.ZMSK || m_context->TEST.ZTE == 0 ? 0xffffffff : 0;
if (m_cached_ctx.TEST.ATE && GSRenderer::TryAlphaTest(new_fm, fm_mask, new_zm))
{
m_cached_ctx.TEST.ATE = false;
m_cached_ctx.FRAME.FBMSK = new_fm;
m_cached_ctx.ZBUF.ZMSK = (new_zm != 0);
fm = new_fm;
zm = new_zm;
no_rt = no_rt || (!m_cached_ctx.TEST.DATE && (fm & GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk) == GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk);
no_ds = no_ds || (zm != 0 && all_depth_tests_pass);
if (no_rt && no_ds)
{
GL_INS("Late draw cancel because no pixels pass alpha test.");
CleanupDraw(true);
return;
}
}
}
}
// Estimate size based on the scissor rectangle and height cache.
const GSVector2i t_size = GetTargetSize(src);
const GSVector4i t_size_rect = GSVector4i::loadh(t_size);
// Ensure draw rect is clamped to framebuffer size. Necessary for updating valid area.
const GSVector4i unclamped_draw_rect = m_r;
// Don't clamp on shuffle, the height cache may troll us with the REAL height.
if (!m_texture_shuffle && m_split_texture_shuffle_pages == 0)
m_r = m_r.rintersect(t_size_rect);
float target_scale = GetTextureScaleFactor();
// This upscaling hack is for games which construct P8 textures by drawing a bunch of small sprites in C32,
// then reinterpreting it as P8. We need to keep the off-screen intermediate textures at native resolution,
// but not propagate that through to the normal render targets. Test Case: Crash Wrath of Cortex.
if (no_ds && src && !m_channel_shuffle && GSConfig.UserHacks_NativePaletteDraw && src->m_from_target &&
src->m_scale == 1.0f && (src->m_TEX0.PSM == PSMT8 || src->m_TEX0.TBP0 == m_cached_ctx.FRAME.Block()))
{
GL_CACHE("Using native resolution for target based on texture source");
target_scale = 1.0f;
}
GSTextureCache::Target* rt = nullptr;
GIFRegTEX0 FRAME_TEX0;
if (!no_rt)
{
// FBW is going to be wrong for channel shuffling into a new target, so take it from the source.
FRAME_TEX0.U64 = 0;
FRAME_TEX0.TBP0 = m_cached_ctx.FRAME.Block();
FRAME_TEX0.TBW = m_channel_shuffle ? src->m_from_target_TEX0.TBW : m_cached_ctx.FRAME.FBW;
FRAME_TEX0.PSM = m_cached_ctx.FRAME.PSM;
// Normally we would use 1024 here to match the clear above, but The Godfather does a 1023x1023 draw instead
// (very close to 1024x1024, but apparently the GS rounds down..). So, catch that here, we don't want to
// create that target, because the clear isn't black, it'll hang around and never get invalidated.
const bool is_square = (t_size.y == t_size.x) && m_r.w >= 1023 && PrimitiveCoversWithoutGaps();
const bool is_clear = is_possible_mem_clear && is_square;
rt = g_texture_cache->LookupTarget(FRAME_TEX0, t_size, target_scale, GSTextureCache::RenderTarget, true,
fm, false, force_preload, preserve_rt_rgb, preserve_rt_alpha, unclamped_draw_rect, IsPossibleChannelShuffle(), is_possible_mem_clear && FRAME_TEX0.TBP0 != m_cached_ctx.ZBUF.Block());
// Draw skipped because it was a clear and there was no target.
if (!rt)
{
if (is_clear)
{
GL_INS("Clear draw with no target, skipping.");
const bool is_zero_color_clear = (GetConstantDirectWriteMemClearColor() == 0 && !preserve_rt_color);
const bool is_zero_depth_clear = (GetConstantDirectWriteMemClearDepth() == 0 && !preserve_depth);
const u32 rt_end_bp = GSLocalMemory::GetUnwrappedEndBlockAddress(
m_cached_ctx.FRAME.Block(), m_cached_ctx.FRAME.FBW, m_cached_ctx.FRAME.PSM, m_r);
const u32 ds_end_bp = GSLocalMemory::GetUnwrappedEndBlockAddress(
m_cached_ctx.ZBUF.Block(), m_cached_ctx.FRAME.FBW, m_cached_ctx.ZBUF.PSM, m_r);
TryGSMemClear(no_rt, preserve_rt_color, is_zero_color_clear, rt_end_bp,
no_ds, preserve_depth, is_zero_depth_clear, ds_end_bp);
CleanupDraw(true);
return;
}
rt = g_texture_cache->CreateTarget(FRAME_TEX0, t_size, GetValidSize(src), target_scale, GSTextureCache::RenderTarget, true,
fm, false, force_preload, preserve_rt_color, m_r, src);
if (unlikely(!rt))
{
GL_INS("ERROR: Failed to create FRAME target, skipping.");
CleanupDraw(true);
return;
}
}
}
GSTextureCache::Target* ds = nullptr;
GIFRegTEX0 ZBUF_TEX0;
if (!no_ds)
{
ZBUF_TEX0.U64 = 0;
ZBUF_TEX0.TBP0 = m_cached_ctx.ZBUF.Block();
ZBUF_TEX0.TBW = m_channel_shuffle ? src->m_from_target_TEX0.TBW : m_cached_ctx.FRAME.FBW;
ZBUF_TEX0.PSM = m_cached_ctx.ZBUF.PSM;
ds = g_texture_cache->LookupTarget(ZBUF_TEX0, t_size, target_scale, GSTextureCache::DepthStencil,
m_cached_ctx.DepthWrite(), 0, false, force_preload, preserve_depth, preserve_depth, unclamped_draw_rect, IsPossibleChannelShuffle(), is_possible_mem_clear && ZBUF_TEX0.TBP0 != m_cached_ctx.FRAME.Block());
if (!ds)
{
ds = g_texture_cache->CreateTarget(ZBUF_TEX0, t_size, GetValidSize(src), target_scale, GSTextureCache::DepthStencil,
m_cached_ctx.DepthWrite(), 0, false, force_preload, preserve_depth, m_r, src);
if (unlikely(!ds))
{
GL_INS("ERROR: Failed to create ZBUF target, skipping.");
CleanupDraw(true);
return;
}
}
}
if (process_texture)
{
GIFRegCLAMP MIP_CLAMP = m_cached_ctx.CLAMP;
const GSVertex* v = &m_vertex.buff[0];
if (rt)
{
// Hypothesis: texture shuffle is used as a postprocessing effect so the texture will be an old target.
// Initially the code also tested the RT, but it gave too many false positives.
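// Positions and FST UVs are 12.4 fixed point (+8 rounds before the >> 4); for ST, U is rebuilt from S/Q scaled by the texture width.
// An 8 pixel parity difference between X and U ((first_x ^ first_u) & 8) is the usual texture shuffle signature.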
const int first_x = ((v[0].XYZ.X - m_context->XYOFFSET.OFX) + 8) >> 4;
const int first_u = PRIM->FST ? ((v[0].U + 8) >> 4) : static_cast<int>(((1 << m_cached_ctx.TEX0.TW) * (v[0].ST.S / v[1].RGBAQ.Q)) + 0.5f);
const bool shuffle_coords = (first_x ^ first_u) & 8;
const u32 draw_end = GSLocalMemory::GetEndBlockAddress(m_cached_ctx.FRAME.Block(), m_cached_ctx.FRAME.FBW, m_cached_ctx.FRAME.PSM, m_r) + 1;
const bool draw_uses_target = src->m_from_target && ((src->m_from_target_TEX0.TBP0 <= m_cached_ctx.FRAME.Block() &&
src->m_from_target->UnwrappedEndBlock() > m_cached_ctx.FRAME.Block()) ||
(m_cached_ctx.FRAME.Block() < src->m_from_target_TEX0.TBP0 && draw_end > src->m_from_target_TEX0.TBP0));
// Copy of a 16bit source into this target; make sure it's opaque and not bilinear to reduce false positives.
m_copy_16bit_to_target_shuffle = m_cached_ctx.TEX0.TBP0 != m_cached_ctx.FRAME.Block() && rt->m_32_bits_fmt == true && IsOpaque()
&& !(context->TEX1.MMIN & 1) && !src->m_32_bits_fmt && m_cached_ctx.FRAME.FBMSK;
// It's not actually possible to do a C16->C16 texture shuffle of B to A as they are the same group
// However you can do it by using C32 and offsetting the target vertices to point to B A, then masking as appropriate.
m_same_group_texture_shuffle = draw_uses_target && (m_cached_ctx.TEX0.PSM & 0xE) == PSMCT32 && (m_cached_ctx.FRAME.PSM & 0x7) == PSMCT16 && (m_vt.m_min.p.x == 8.0f);
// Both input and output are 16 bits and texture was initially 32 bits! Same for the target, Sonic Unleash makes a new target which really is 16bit.
m_texture_shuffle = ((m_same_group_texture_shuffle || (tex_psm.bpp == 16)) && (GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].bpp == 16) &&
(shuffle_coords || rt->m_32_bits_fmt))
&& draw_sprite_tex && (src->m_32_bits_fmt || m_copy_16bit_to_target_shuffle);
}
// Okami mustn't call this code
if (m_texture_shuffle && m_vertex.next < 3 && PRIM->FST && ((m_cached_ctx.FRAME.FBMSK & fm_mask) == 0))
{
// Avoid dubious calls to m_texture_shuffle on 16 bits games.
// The pattern is several columns of 8 pixels. A single sprite
// smells fishy but a big sprite is wrong.
// Shadow of Memories/Destiny shouldn't call this code.
// Causes shadow flickering.
m_texture_shuffle = ((v[1].U - v[0].U) < 256) ||
// Tomb Raider Angel of Darkness relies on this behavior to produce a fog effect.
// In this case, the address of the framebuffer and texture are the same.
// The game will take RG => BA and then the BA => RG of next pixels.
// However, only RG => BA needs to be emulated because RG isn't used.
m_cached_ctx.FRAME.Block() == m_cached_ctx.TEX0.TBP0 ||
// DMC3, Onimusha 3 rely on this behavior.
// They do fullscreen rectangle with scissor, then shift by 8 pixels, not done with recursion.
// So we check if it's a TS effect by checking the scissor.
((m_context->SCISSOR.SCAX1 - m_context->SCISSOR.SCAX0) < 32);
GL_INS("WARNING: Possible misdetection of effect, texture shuffle is %s", m_texture_shuffle ? "Enabled" : "Disabled");
}
if (m_texture_shuffle && IsSplitTextureShuffle(rt->m_TEX0.TBW))
{
// If TEX0 == FBP, we're going to have a source left in the TC.
// That source will get used in the actual draw unsafely, so kick it out.
if (m_cached_ctx.FRAME.Block() == m_cached_ctx.TEX0.TBP0)
g_texture_cache->InvalidateVideoMem(context->offset.fb, m_r, false);
CleanupDraw(true);
return;
}
if (src->m_target && IsPossibleChannelShuffle())
{
GL_INS("Channel shuffle effect detected (2nd shot)");
m_channel_shuffle = true;
m_last_channel_shuffle_fbmsk = m_context->FRAME.FBMSK;
}
else
{
m_channel_shuffle = false;
}
#if 0
// FIXME: We currently crop off the rightmost and bottommost pixel when upscaling clamps,
// until the issue is properly solved we should keep this disabled as it breaks many games when upscaling.
// See #5387, #5853, #5851 on GH for more details.
//
// Texture clamp optimizations (try to move everything to sampler hardware)
if (m_cached_ctx.CLAMP.WMS == CLAMP_REGION_CLAMP && MIP_CLAMP.MINU == 0 && MIP_CLAMP.MAXU == tw - 1)
m_cached_ctx.CLAMP.WMS = CLAMP_CLAMP;
else if (m_cached_ctx.CLAMP.WMS == CLAMP_REGION_REPEAT && MIP_CLAMP.MINU == tw - 1 && MIP_CLAMP.MAXU == 0)
m_cached_ctx.CLAMP.WMS = CLAMP_REPEAT;
else if ((m_cached_ctx.CLAMP.WMS & 2) && !(tmm.uses_boundary & TextureMinMaxResult::USES_BOUNDARY_U))
m_cached_ctx.CLAMP.WMS = CLAMP_CLAMP;
if (m_cached_ctx.CLAMP.WMT == CLAMP_REGION_CLAMP && MIP_CLAMP.MINV == 0 && MIP_CLAMP.MAXV == th - 1)
m_cached_ctx.CLAMP.WMT = CLAMP_CLAMP;
else if (m_cached_ctx.CLAMP.WMT == CLAMP_REGION_REPEAT && MIP_CLAMP.MINV == th - 1 && MIP_CLAMP.MAXV == 0)
m_cached_ctx.CLAMP.WMT = CLAMP_REPEAT;
else if ((m_cached_ctx.CLAMP.WMT & 2) && !(tmm.uses_boundary & TextureMinMaxResult::USES_BOUNDARY_V))
m_cached_ctx.CLAMP.WMT = CLAMP_CLAMP;
#endif
const int tw = 1 << TEX0.TW;
const int th = 1 << TEX0.TH;
const bool is_shuffle = m_channel_shuffle || m_texture_shuffle;
// If m_src is from a target that isn't the same size as the texture, texture sample edge modes won't work quite the same way.
// If the game actually tries to access stuff outside of the rendered target, it was going to get garbage anyway, so whatever.
// But the game could issue reads that wrap to valid areas, so move wrapping to the shader if wrapping is used.
const GSVector2i unscaled_size = src->m_target ? src->GetRegionSize() : src->GetUnscaledSize();
if (!is_shuffle && m_cached_ctx.CLAMP.WMS == CLAMP_REPEAT && (tmm.uses_boundary & TextureMinMaxResult::USES_BOUNDARY_U) && unscaled_size.x != tw)
{
// Our shader-emulated region repeat doesn't upscale :(
// Try to avoid it if possible
// TODO: Upscale-supporting shader-emulated region repeat
if (unscaled_size.x < tw && m_vt.m_min.t.x > -(tw - unscaled_size.x) && m_vt.m_max.t.x < tw)
{
// Game only extends into data we don't have (but doesn't wrap around back onto good data), clamp seems like the most reasonable solution
m_cached_ctx.CLAMP.WMS = CLAMP_CLAMP;
}
else
{
m_cached_ctx.CLAMP.WMS = CLAMP_REGION_REPEAT;
m_cached_ctx.CLAMP.MINU = (1 << m_cached_ctx.TEX0.TW) - 1;
m_cached_ctx.CLAMP.MAXU = 0;
}
}
if (!is_shuffle && m_cached_ctx.CLAMP.WMT == CLAMP_REPEAT && (tmm.uses_boundary & TextureMinMaxResult::USES_BOUNDARY_V) && unscaled_size.y != th)
{
if (unscaled_size.y < th && m_vt.m_min.t.y > -(th - unscaled_size.y) && m_vt.m_max.t.y < th)
{
m_cached_ctx.CLAMP.WMT = CLAMP_CLAMP;
}
else
{
m_cached_ctx.CLAMP.WMT = CLAMP_REGION_REPEAT;
m_cached_ctx.CLAMP.MINV = (1 << m_cached_ctx.TEX0.TH) - 1;
m_cached_ctx.CLAMP.MAXV = 0;
}
}
// Round 2
if (IsMipMapActive() && GSConfig.HWMipmap == HWMipmapLevel::Full && !tex_psm.depth && !src->m_from_hash_cache)
{
// Upload remaining texture layers
const GSVector4 tmin = m_vt.m_min.t;
const GSVector4 tmax = m_vt.m_max.t;
for (int layer = m_lod.x + 1; layer <= m_lod.y; layer++)
{
const GIFRegTEX0 MIP_TEX0(GetTex0Layer(layer));
MIP_CLAMP.MINU >>= 1;
MIP_CLAMP.MINV >>= 1;
MIP_CLAMP.MAXU >>= 1;
MIP_CLAMP.MAXV >>= 1;
m_vt.m_min.t *= 0.5f;
m_vt.m_max.t *= 0.5f;
tmm = GetTextureMinMax(MIP_TEX0, MIP_CLAMP, m_vt.IsLinear(), false);
src->UpdateLayer(MIP_TEX0, tmm.coverage, layer - m_lod.x);
}
// we don't need to generate mipmaps since they were provided
src->m_texture->ClearMipmapGenerationFlag();
m_vt.m_min.t = tmin;
m_vt.m_max.t = tmax;
}
}
if (rt)
{
// Be sure texture shuffle detection is properly propagated.
// Otherwise set or clear the flag (the code in the texture cache only sets the flag).
// Note: it is important to clear the flag when the RT is used as a real 16 bits target.
rt->m_32_bits_fmt = m_texture_shuffle || (GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].bpp != 16);
}
// Do the same for the depth target. Jackie Chan Adventures swaps from C32 to Z16 after a clear.
if (ds)
ds->m_32_bits_fmt = (GSLocalMemory::m_psm[m_cached_ctx.ZBUF.PSM].bpp != 16);
// Deferred update of TEX0. We don't want to change it when we're doing a shuffle/clear, because it
// may increase the buffer width, or change PSM, which breaks P8 conversion amongst other things.
const bool can_update_size = !is_possible_mem_clear && !m_texture_shuffle && !m_channel_shuffle;
if (!m_texture_shuffle && !m_channel_shuffle)
{
if (rt && (!is_possible_mem_clear || rt->m_TEX0.PSM != FRAME_TEX0.PSM))
{
if (rt->m_TEX0.TBW != FRAME_TEX0.TBW && !m_cached_ctx.ZBUF.ZMSK && (m_cached_ctx.FRAME.FBMSK & 0xFF000000))
{
// Alpha could be a font, and since the width is changing it's no longer valid.
// Be careful of downsize copies or other effects; checking ZMSK should hopefully be enough (Okami).
if (m_cached_ctx.FRAME.FBMSK & 0x0F000000)
rt->m_valid_alpha_low = false;
if (m_cached_ctx.FRAME.FBMSK & 0xF0000000)
rt->m_valid_alpha_high = false;
}
rt->m_TEX0 = FRAME_TEX0;
}
if (ds && (!is_possible_mem_clear || ds->m_TEX0.PSM != ZBUF_TEX0.PSM || (rt && ds->m_TEX0.TBW != rt->m_TEX0.TBW)))
ds->m_TEX0 = ZBUF_TEX0;
}
else if (!m_texture_shuffle)
{
// Allow FB PSM to update on channel shuffle, it should be correct, unlike texture shuffle.
// The FBW should also be okay, since it's coming from the source.
if (rt)
{
rt->m_TEX0.TBW = std::max(rt->m_TEX0.TBW, FRAME_TEX0.TBW);
rt->m_TEX0.PSM = FRAME_TEX0.PSM;
}
if (ds)
{
ds->m_TEX0.TBW = std::max(ds->m_TEX0.TBW, ZBUF_TEX0.TBW);
ds->m_TEX0.PSM = ZBUF_TEX0.PSM;
}
}
// Figure out which channels we're writing.
if (rt)
rt->UpdateValidChannels(rt->m_TEX0.PSM, m_texture_shuffle ? GetEffectiveTextureShuffleFbmsk() : fm);
if (ds)
ds->UpdateValidChannels(ZBUF_TEX0.PSM, zm);
if (rt)
rt->Update();
if (ds)
ds->Update();
const GSVector2i resolution = PCRTCDisplays.GetResolution();
GSTextureCache::Target* old_rt = nullptr;
GSTextureCache::Target* old_ds = nullptr;
{
GSVector2i new_size = t_size;
// We need to adjust the size if it's a texture shuffle as we could end up making the RT twice the size.
if (rt && m_texture_shuffle && m_split_texture_shuffle_pages == 0)
{
if ((new_size.x > rt->m_valid.z && m_vt.m_max.p.x == new_size.x) || (new_size.y > rt->m_valid.w && m_vt.m_max.p.y == new_size.y))
{
if (new_size.y <= rt->m_valid.w && (rt->m_TEX0.TBW != m_cached_ctx.FRAME.FBW))
new_size.x /= 2;
else
new_size.y /= 2;
}
}
// We still need to make sure the dimensions of the targets match.
const int new_w = std::max(new_size.x, std::max(rt ? rt->m_unscaled_size.x : 0, ds ? ds->m_unscaled_size.x : 0));
const int new_h = std::max(new_size.y, std::max(rt ? rt->m_unscaled_size.y : 0, ds ? ds->m_unscaled_size.y : 0));
if (rt)
{
const u32 old_end_block = rt->m_end_block;
const bool new_rect = rt->m_valid.rempty();
const bool new_height = new_h > rt->GetUnscaledHeight();
const int old_height = rt->m_texture->GetHeight();
pxAssert(rt->GetScale() == target_scale);
if (rt->GetUnscaledWidth() != new_w || rt->GetUnscaledHeight() != new_h)
GL_INS("Resize RT from %dx%d to %dx%d", rt->GetUnscaledWidth(), rt->GetUnscaledHeight(), new_w, new_h);
rt->ResizeTexture(new_w, new_h);
if (!m_texture_shuffle && !m_channel_shuffle)
{
rt->ResizeValidity(rt->GetUnscaledRect());
rt->ResizeDrawn(rt->GetUnscaledRect());
}
const GSVector4i update_rect = m_r.rintersect(GSVector4i::loadh(new_size));
// Limit to 2x the vertical height of the resolution (for double buffering)
rt->UpdateValidity(update_rect, can_update_size || (m_r.w <= (resolution.y * 2) && !m_texture_shuffle));
rt->UpdateDrawn(update_rect, can_update_size || (m_r.w <= (resolution.y * 2) && !m_texture_shuffle));
// Probably changing to double buffering, so invalidate any old target that was next to it.
// This resolves an issue where the PCRTC will find the old target in FMV's causing flashing.
// Grandia Xtreme, Onimusha Warlord.
if (!new_rect && new_height && old_end_block != rt->m_end_block)
{
old_rt = g_texture_cache->FindTargetOverlap(rt, GSTextureCache::RenderTarget, m_cached_ctx.FRAME.PSM);
if (old_rt && old_rt != rt && GSUtil::HasSharedBits(old_rt->m_TEX0.PSM, rt->m_TEX0.PSM))
{
const int copy_width = (old_rt->m_texture->GetWidth()) > (rt->m_texture->GetWidth()) ? (rt->m_texture->GetWidth()) : old_rt->m_texture->GetWidth();
const int copy_height = (old_rt->m_texture->GetHeight()) > (rt->m_texture->GetHeight() - old_height) ? (rt->m_texture->GetHeight() - old_height) : old_rt->m_texture->GetHeight();
GL_INS("RT double buffer copy from FBP 0x%x, %dx%d => %d,%d", old_rt->m_TEX0.TBP0, copy_width, copy_height, 0, old_height);
// Invalidate has been moved to after DrawPrims(), because we might kill the current sources' backing.
g_gs_device->CopyRect(old_rt->m_texture, rt->m_texture, GSVector4i(0, 0, copy_width, copy_height), 0, old_height);
preserve_rt_color = true;
}
else
{
old_rt = nullptr;
}
}
}
if (ds)
{
const u32 old_end_block = ds->m_end_block;
const bool new_rect = ds->m_valid.rempty();
const bool new_height = new_h > ds->GetUnscaledHeight();
const int old_height = ds->m_texture->GetHeight();
pxAssert(ds->GetScale() == target_scale);
if (ds->GetUnscaledWidth() != new_w || ds->GetUnscaledHeight() != new_h)
GL_INS("Resize DS from %dx%d to %dx%d", ds->GetUnscaledWidth(), ds->GetUnscaledHeight(), new_w, new_h);
ds->ResizeTexture(new_w, new_h);
if (!m_texture_shuffle && !m_channel_shuffle)
{
ds->ResizeValidity(ds->GetUnscaledRect());
ds->ResizeDrawn(ds->GetUnscaledRect());
}
// Limit to 2x the vertical height of the resolution (for double buffering)
ds->UpdateValidity(m_r, can_update_size || m_r.w <= (resolution.y * 2));
ds->UpdateDrawn(m_r, can_update_size || m_r.w <= (resolution.y * 2));
if (!new_rect && new_height && old_end_block != ds->m_end_block)
{
old_ds = g_texture_cache->FindTargetOverlap(ds, GSTextureCache::DepthStencil, m_cached_ctx.ZBUF.PSM);
if (old_ds && old_ds != ds && GSUtil::HasSharedBits(old_ds->m_TEX0.PSM, ds->m_TEX0.PSM))
{
const int copy_width = (old_ds->m_texture->GetWidth()) > (ds->m_texture->GetWidth()) ? (ds->m_texture->GetWidth()) : old_ds->m_texture->GetWidth();
const int copy_height = (old_ds->m_texture->GetHeight()) > (ds->m_texture->GetHeight() - old_height) ? (ds->m_texture->GetHeight() - old_height) : old_ds->m_texture->GetHeight();
GL_INS("DS double buffer copy from FBP 0x%x, %dx%d => %d,%d", old_ds->m_TEX0.TBP0, copy_width, copy_height, 0, old_height);
g_gs_device->CopyRect(old_ds->m_texture, ds->m_texture, GSVector4i(0, 0, copy_width, copy_height), 0, old_height);
preserve_depth = true;
}
else
{
old_ds = nullptr;
}
}
}
}
if (src && src->m_shared_texture && src->m_texture != src->m_from_target->m_texture)
{
// Target texture changed, update reference.
src->m_texture = src->m_from_target->m_texture;
}
if (GSConfig.DumpGSData)
{
const u64 frame = g_perfmon.GetFrame();
std::string s;
if (GSConfig.SaveTexture && s_n >= GSConfig.SaveN && src)
{
s = GetDrawDumpPath("%05d_f%lld_itex_%05x_%s_%d%d_%02x_%02x_%02x_%02x.dds",
s_n, frame, static_cast<int>(m_cached_ctx.TEX0.TBP0), psm_str(m_cached_ctx.TEX0.PSM),
static_cast<int>(m_cached_ctx.CLAMP.WMS), static_cast<int>(m_cached_ctx.CLAMP.WMT),
static_cast<int>(m_cached_ctx.CLAMP.MINU), static_cast<int>(m_cached_ctx.CLAMP.MAXU),
static_cast<int>(m_cached_ctx.CLAMP.MINV), static_cast<int>(m_cached_ctx.CLAMP.MAXV));
src->m_texture->Save(s);
if (src->m_palette)
{
s = GetDrawDumpPath("%05d_f%lld_itpx_%05x_%s.dds", s_n, frame, m_cached_ctx.TEX0.CBP, psm_str(m_cached_ctx.TEX0.CPSM));
src->m_palette->Save(s);
}
}
if (rt && GSConfig.SaveRT && s_n >= GSConfig.SaveN)
{
s = GetDrawDumpPath("%05d_f%lld_rt0_%05x_%s.bmp", s_n, frame, m_cached_ctx.FRAME.Block(), psm_str(m_cached_ctx.FRAME.PSM));
if (rt->m_texture)
rt->m_texture->Save(s);
}
if (ds && GSConfig.SaveDepth && s_n >= GSConfig.SaveN)
{
s = GetDrawDumpPath("%05d_f%lld_rz0_%05x_%s.bmp", s_n, frame, m_cached_ctx.ZBUF.Block(), psm_str(m_cached_ctx.ZBUF.PSM));
if (ds->m_texture)
ds->m_texture->Save(s);
}
}
if (m_oi && !m_oi(*this, rt ? rt->m_texture : nullptr, ds ? ds->m_texture : nullptr, src))
{
GL_INS("Warning skipping a draw call (%d)", s_n);
CleanupDraw(true);
return;
}
if (!OI_BlitFMV(rt, src, m_r))
{
GL_INS("Warning skipping a draw call (%d)", s_n);
CleanupDraw(true);
return;
}
bool skip_draw = false;
if (!GSConfig.UserHacks_DisableSafeFeatures && is_possible_mem_clear)
skip_draw = TryTargetClear(rt, ds, preserve_rt_color, preserve_depth);
// A couple of hacks to avoid upscaling issues. So far it seems to mostly impact sprites.
// Note: the first hack corrects both position and texture coordinate.
// Note: the second hack corrects only the texture coordinate.
if (CanUpscale() && (m_vt.m_primclass == GS_SPRITE_CLASS))
{
const u32 count = m_vertex.next;
GSVertex* v = &m_vertex.buff[0];
// Hack to avoid vertical black line in various games (ace combat/tekken)
if (GSConfig.UserHacks_AlignSpriteX)
{
// Note: for performance reasons the check is only done once, on the first
// primitive.
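// XYZ.X is 12.4 fixed point, so a low nibble of 8 means the sprite edge sits exactly on a half-pixel boundary.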
const int win_position = v[1].XYZ.X - context->XYOFFSET.OFX;
const bool unaligned_position = ((win_position & 0xF) == 8);
const bool unaligned_texture = ((v[1].U & 0xF) == 0) && PRIM->FST; // I'm not sure this check is useful
const bool hole_in_vertex = (count < 4) || (v[1].XYZ.X != v[2].XYZ.X);
if (hole_in_vertex && unaligned_position && (unaligned_texture || !PRIM->FST))
{
// Normally vertices are aligned on full pixels and textures on half
// pixels. Let's extend the coverage by half a pixel to avoid
// holes after upscaling.
for (u32 i = 0; i < count; i += 2)
{
v[i + 1].XYZ.X += 8;
// I really don't know if it is a good idea, nor what to do for !PRIM->FST.
if (unaligned_texture)
v[i + 1].U += 8;
}
}
}
// Nothing to do if no texture is sampled.
if (PRIM->FST && draw_sprite_tex)
{
if ((GSConfig.UserHacks_RoundSprite > 1) || (GSConfig.UserHacks_RoundSprite == 1 && !m_vt.IsLinear()))
{
if (m_vt.IsLinear())
RoundSpriteOffset<true>();
else
RoundSpriteOffset<false>();
}
}
else
{
; // vertical line in Yakuza (note check m_userhacks_align_sprite_X behavior)
}
}
//
if (!skip_draw)
DrawPrims(rt, ds, src, tmm);
//
// Temporary source *must* be invalidated before normal, because otherwise it'll be double freed.
g_texture_cache->InvalidateTemporarySource();
// Invalidation of old targets when changing to double-buffering.
if (old_rt)
g_texture_cache->InvalidateVideoMemType(GSTextureCache::RenderTarget, old_rt->m_TEX0.TBP0);
if (old_ds)
g_texture_cache->InvalidateVideoMemType(GSTextureCache::DepthStencil, old_ds->m_TEX0.TBP0);
if ((fm & fm_mask) != fm_mask && rt)
{
//rt->m_valid = rt->m_valid.runion(r);
// Limit to 2x the vertical height of the resolution (for double buffering)
rt->UpdateValidity(m_r, can_update_size || (m_r.w <= (resolution.y * 2) && !m_texture_shuffle));
g_texture_cache->InvalidateVideoMem(context->offset.fb, m_r, false);
// Remove overwritten Zs at the FBP.
g_texture_cache->InvalidateVideoMemType(GSTextureCache::DepthStencil, m_cached_ctx.FRAME.Block(),
m_cached_ctx.FRAME.PSM, m_texture_shuffle ? GetEffectiveTextureShuffleFbmsk() : fm);
}
if (zm != 0xffffffff && ds)
{
//ds->m_valid = ds->m_valid.runion(r);
// Limit to 2x the vertical height of the resolution (for double buffering)
ds->UpdateValidity(m_r, can_update_size || (m_r.w <= (resolution.y * 2) && !m_texture_shuffle));
g_texture_cache->InvalidateVideoMem(context->offset.zb, m_r, false);
// Remove overwritten RTs at the ZBP.
g_texture_cache->InvalidateVideoMemType(
GSTextureCache::RenderTarget, m_cached_ctx.ZBUF.Block(), m_cached_ctx.ZBUF.PSM, zm);
}
//
if (GSConfig.DumpGSData)
{
const u64 frame = g_perfmon.GetFrame();
std::string s;
if (GSConfig.SaveRT && s_n >= GSConfig.SaveN)
{
s = GetDrawDumpPath("%05d_f%lld_rt1_%05x_%s.bmp", s_n, frame, m_cached_ctx.FRAME.Block(), psm_str(m_cached_ctx.FRAME.PSM));
if (rt)
rt->m_texture->Save(s);
}
if (GSConfig.SaveDepth && s_n >= GSConfig.SaveN)
{
s = GetDrawDumpPath("%05d_f%lld_rz1_%05x_%s.bmp", s_n, frame, m_cached_ctx.ZBUF.Block(), psm_str(m_cached_ctx.ZBUF.PSM));
if (ds)
ds->m_texture->Save(s);
}
if (GSConfig.SaveL > 0 && (s_n - GSConfig.SaveN) > GSConfig.SaveL)
{
GSConfig.DumpGSData = 0;
}
}
#ifdef DISABLE_HW_TEXTURE_CACHE
if (rt)
g_texture_cache->Read(rt, m_r);
#endif
//
CleanupDraw(false);
}
/// Verifies assumptions we expect to hold about indices
bool GSRendererHW::VerifyIndices()
{
switch (m_vt.m_primclass)
{
case GS_SPRITE_CLASS:
if (m_index.tail % 2 != 0)
return false;
[[fallthrough]];
case GS_POINT_CLASS:
// Expect indices to be flat increasing
for (u32 i = 0; i < m_index.tail; i++)
{
if (m_index.buff[i] != i)
return false;
}
break;
case GS_LINE_CLASS:
if (m_index.tail % 2 != 0)
return false;
// Expect each line to be a pair next to each other
// VS expand relies on this!
if (g_gs_device->Features().provoking_vertex_last)
{
for (u32 i = 0; i < m_index.tail; i += 2)
{
if (m_index.buff[i] + 1 != m_index.buff[i + 1])
return false;
}
}
else
{
for (u32 i = 0; i < m_index.tail; i += 2)
{
if (m_index.buff[i] != m_index.buff[i + 1] + 1)
return false;
}
}
break;
case GS_TRIANGLE_CLASS:
if (m_index.tail % 3 != 0)
return false;
break;
case GS_INVALID_CLASS:
break;
}
return true;
}
void GSRendererHW::SetupIA(float target_scale, float sx, float sy)
{
GL_PUSH("IA");
if (GSConfig.UserHacks_WildHack && !m_isPackedUV_HackFlag && PRIM->TME && PRIM->FST)
{
for (u32 i = 0; i < m_vertex.next; i++)
m_vertex.buff[i].UV &= 0x3FEF3FEF;
}
const bool unscale_pt_ln = !GSConfig.UserHacks_DisableSafeFeatures && (target_scale != 1.0f);
const GSDevice::FeatureSupport features = g_gs_device->Features();
ASSERT(VerifyIndices());
switch (m_vt.m_primclass)
{
case GS_POINT_CLASS:
{
m_conf.topology = GSHWDrawConfig::Topology::Point;
m_conf.indices_per_prim = 1;
if (unscale_pt_ln)
{
if (features.point_expand)
{
m_conf.vs.point_size = true;
m_conf.cb_vs.point_size = GSVector2(target_scale);
}
else if (features.vs_expand)
{
m_conf.vs.expand = GSHWDrawConfig::VSExpand::Point;
m_conf.cb_vs.point_size = GSVector2(16.0f * sx, 16.0f * sy);
m_conf.topology = GSHWDrawConfig::Topology::Triangle;
m_conf.verts = m_vertex.buff;
m_conf.nverts = m_vertex.next;
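// Each point is a single index; the VS expand path emits a quad (2 triangles = 6 indices) per point, hence * 6.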
m_conf.nindices = m_index.tail * 6;
m_conf.indices_per_prim = 6;
return;
}
}
else
{
// Vulkan/GL still need to set point size.
m_conf.cb_vs.point_size = target_scale;
// M1 requires point size output on *all* points.
m_conf.vs.point_size = true;
}
}
break;
case GS_LINE_CLASS:
{
m_conf.topology = GSHWDrawConfig::Topology::Line;
m_conf.indices_per_prim = 2;
if (unscale_pt_ln)
{
if (features.line_expand)
{
m_conf.line_expand = true;
}
else if (features.vs_expand)
{
m_conf.vs.expand = GSHWDrawConfig::VSExpand::Line;
m_conf.cb_vs.point_size = GSVector2(16.0f * sx, 16.0f * sy);
m_conf.topology = GSHWDrawConfig::Topology::Triangle;
m_conf.indices_per_prim = 6;
ExpandLineIndices();
}
}
}
break;
case GS_SPRITE_CLASS:
{
// Need to pre-divide ST by Q if Q is very large, to avoid precision issues on some GPUs.
// May as well just expand the whole thing out with the CPU path in such a case.
if (features.vs_expand && !m_vt.m_accurate_stq)
{
m_conf.topology = GSHWDrawConfig::Topology::Triangle;
m_conf.vs.expand = GSHWDrawConfig::VSExpand::Sprite;
m_conf.verts = m_vertex.buff;
m_conf.nverts = m_vertex.next;
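// Each sprite contributes 2 indices; VS expand turns it into a quad (2 triangles = 6 indices), hence * 3.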
m_conf.nindices = m_index.tail * 3;
m_conf.indices_per_prim = 6;
return;
}
else
{
Lines2Sprites();
m_conf.topology = GSHWDrawConfig::Topology::Triangle;
m_conf.indices_per_prim = 6;
}
}
break;
case GS_TRIANGLE_CLASS:
{
m_conf.topology = GSHWDrawConfig::Topology::Triangle;
m_conf.indices_per_prim = 3;
}
break;
default:
__assume(0);
}
m_conf.verts = m_vertex.buff;
m_conf.nverts = m_vertex.next;
m_conf.indices = m_index.buff;
m_conf.nindices = m_index.tail;
}
void GSRendererHW::EmulateZbuffer(const GSTextureCache::Target* ds)
{
if (ds && m_cached_ctx.TEST.ZTE)
{
m_conf.depth.ztst = m_cached_ctx.TEST.ZTST;
// AA1: Z is not written on lines since coverage is always less than 0x80.
m_conf.depth.zwe = (m_cached_ctx.ZBUF.ZMSK || (PRIM->AA1 && m_vt.m_primclass == GS_LINE_CLASS)) ? 0 : 1;
}
else
{
m_conf.depth.ztst = ZTST_ALWAYS;
}
// On the real GS we appear to do clamping on the max z value the format allows.
// Clamping is done after rasterization.
const u32 max_z = 0xFFFFFFFF >> (GSLocalMemory::m_psm[m_cached_ctx.ZBUF.PSM].fmt * 8);
const bool clamp_z = static_cast<u32>(GSVector4i(m_vt.m_max.p).z) > max_z;
m_conf.cb_vs.max_depth = GSVector2i(0xFFFFFFFF);
//ps_cb.MaxDepth = GSVector4(0.0f, 0.0f, 0.0f, 1.0f);
m_conf.ps.zclamp = 0;
if (clamp_z)
{
if (m_vt.m_primclass == GS_SPRITE_CLASS || m_vt.m_primclass == GS_POINT_CLASS)
{
m_conf.cb_vs.max_depth = GSVector2i(max_z);
}
else if (!m_cached_ctx.ZBUF.ZMSK)
{
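// Convert the integer max Z to the normalized float depth used by the shader; the scale is 2^-32 when clip_control is available, 2^-24 otherwise.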
m_conf.cb_ps.TA_MaxDepth_Af.z = static_cast<float>(max_z) * (g_gs_device->Features().clip_control ? 0x1p-32f : 0x1p-24f);
m_conf.ps.zclamp = 1;
}
}
}
void GSRendererHW::EmulateTextureShuffleAndFbmask(GSTextureCache::Target* rt, GSTextureCache::Source* tex)
{
// Uncomment to disable texture shuffle emulation.
// m_texture_shuffle = false;
bool enable_fbmask_emulation = false;
const GSDevice::FeatureSupport features = g_gs_device->Features();
if (features.texture_barrier)
{
enable_fbmask_emulation = GSConfig.AccurateBlendingUnit != AccBlendLevel::Minimum;
}
else
{
// FBmask blend level selection.
// We do this because:
// 1. D3D sucks.
// 2. FB copy is slow, especially on triangle primitives, which makes some games unplayable.
// 3. SW blending isn't implemented yet.
switch (GSConfig.AccurateBlendingUnit)
{
case AccBlendLevel::Maximum:
case AccBlendLevel::Full:
case AccBlendLevel::High:
case AccBlendLevel::Medium:
enable_fbmask_emulation = true;
break;
case AccBlendLevel::Basic:
// Enable Fbmask emulation excluding triangle class because it is quite slow.
enable_fbmask_emulation = (m_vt.m_primclass != GS_TRIANGLE_CLASS);
break;
case AccBlendLevel::Minimum:
break;
}
}
if (m_texture_shuffle)
{
m_conf.ps.shuffle = 1;
m_conf.ps.dfmt = 0;
bool write_ba;
bool read_ba;
ConvertSpriteTextureShuffle(write_ba, read_ba, rt, tex);
// If DATE is enabled you need to test the green channel instead of the
// alpha channel. Only enable this code in DATE mode to reduce the number
// of shaders.
m_conf.ps.write_rg = !write_ba && features.texture_barrier && m_cached_ctx.TEST.DATE;
m_conf.ps.read_ba = read_ba;
m_conf.ps.real16src = m_copy_16bit_to_target_shuffle;
m_conf.ps.shuffle_same = m_same_group_texture_shuffle;
// Please bang my head against the wall!
// 1/ Reduce the frame mask to a 16 bit format
const u32 m = m_cached_ctx.FRAME.FBMSK & GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk;
// fbmask is converted to a 16bit version to represent the 2 32bit channels it's writing to.
// The lower 8 bits represent the Red/Blue channels, the top 8 bits the Green/Alpha, depending on write_ba.
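// e.g. with a 16 bit frame, FBMSK 0x00F800F8 (all 5 red and blue bits masked) packs down to 0x7C1F here.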
const u32 fbmask = ((m >> 3) & 0x1F) | ((m >> 6) & 0x3E0) | ((m >> 9) & 0x7C00) | ((m >> 16) & 0x8000);
// r = rb mask, g = ga mask
const GSVector2i rb_ga_mask = GSVector2i(fbmask & 0xFF, (fbmask >> 8) & 0xFF);
// Ace Combat 04 sets FBMSK to 0 for the shuffle, duplicating RG across RGBA.
// Given how touchy texture shuffles are, I'm not ready to make it 100% dependent on the real FBMSK yet.
// TODO: Remove this if, and see what breaks.
if (fbmask != 0)
{
m_conf.colormask.wrgba = 0;
}
else
{
m_conf.colormask.wr = m_conf.colormask.wg = (rb_ga_mask.r != 0xFF);
m_conf.colormask.wb = m_conf.colormask.wa = (rb_ga_mask.g != 0xFF);
}
// 2/ Select the new mask
if (rb_ga_mask.r != 0xFF)
{
if (write_ba)
{
GL_INS("Color shuffle %s => B", read_ba ? "B" : "R");
m_conf.colormask.wb = 1;
}
else
{
GL_INS("Color shuffle %s => R", read_ba ? "B" : "R");
m_conf.colormask.wr = 1;
}
if (rb_ga_mask.r)
m_conf.ps.fbmask = 1;
}
if (rb_ga_mask.g != 0xFF)
{
if (write_ba)
{
GL_INS("Color shuffle %s => A", read_ba ? "A" : "G");
m_conf.colormask.wa = 1;
}
else
{
GL_INS("Color shuffle %s => G", read_ba ? "A" : "G");
m_conf.colormask.wg = 1;
}
if (rb_ga_mask.g)
m_conf.ps.fbmask = 1;
}
if (m_conf.ps.fbmask && enable_fbmask_emulation)
{
m_conf.cb_ps.FbMask.r = rb_ga_mask.r;
m_conf.cb_ps.FbMask.g = rb_ga_mask.g;
m_conf.cb_ps.FbMask.b = rb_ga_mask.r;
m_conf.cb_ps.FbMask.a = rb_ga_mask.g;
// No blending so hit unsafe path.
if (!PRIM->ABE || !features.texture_barrier)
{
GL_INS("FBMASK Unsafe SW emulated fb_mask:%x on tex shuffle", fbmask);
m_conf.require_one_barrier = true;
}
else
{
GL_INS("FBMASK SW emulated fb_mask:%x on tex shuffle", fbmask);
m_conf.require_full_barrier = true;
}
}
else
{
m_conf.ps.fbmask = 0;
}
// Set dirty alpha on target, but only if we're actually writing to it.
if (rt)
{
rt->m_valid_alpha_low |= m_conf.colormask.wa;
rt->m_valid_alpha_high |= m_conf.colormask.wa;
}
// Once we draw the shuffle, no more buffering.
m_split_texture_shuffle_pages = 0;
m_split_texture_shuffle_pages_high = 0;
m_split_texture_shuffle_start_FBP = 0;
m_split_texture_shuffle_start_TBP = 0;
}
else
{
m_conf.ps.dfmt = GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmt;
// Don't allow unused bits of a 16bit format alone to enable fbmask;
// set the mask to 0 in such cases.
int fbmask = static_cast<int>(m_cached_ctx.FRAME.FBMSK);
const int fbmask_r = GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk;
fbmask &= fbmask_r;
const GSVector4i fbmask_v = GSVector4i::load(fbmask);
const GSVector4i fbmask_vr = GSVector4i::load(fbmask_r);
const int ff_fbmask = fbmask_v.eq8(fbmask_vr).mask();
const int zero_fbmask = fbmask_v.eq8(GSVector4i::zero()).mask();
m_conf.colormask.wrgba = ~ff_fbmask; // Enable channel if at least 1 bit is 0
m_conf.ps.fbmask = enable_fbmask_emulation && (~ff_fbmask & ~zero_fbmask & 0xF);
if (m_conf.ps.fbmask)
{
m_conf.cb_ps.FbMask = fbmask_v.u8to32();
// Only alpha is special here, I think we can take a very unsafe shortcut.
// Alpha isn't blended on the GS but directly copied into the RT.
//
// Behavior is clearly undefined, however there is a high probability that
// it will work. Masked bits will be constant and normally the same everywhere
// (RT / FS output / cached value).
//
// Just to be sure let's add a new safe hack for unsafe access :)
//
// Here is the GL spec quote to emphasize the unexpected behavior.
/*
- If a texel has been written, then in order to safely read the result
a texel fetch must be in a subsequent Draw separated by the command
void TextureBarrier(void);
TextureBarrier() will guarantee that writes have completed and caches
have been invalidated before subsequent Draws are executed.
*/
// No blending so hit unsafe path.
if (!PRIM->ABE || !(~ff_fbmask & ~zero_fbmask & 0x7) || !g_gs_device->Features().texture_barrier)
{
GL_INS("FBMASK Unsafe SW emulated fb_mask:%x on %d bits format", m_cached_ctx.FRAME.FBMSK,
(m_conf.ps.dfmt == 2) ? 16 : 32);
m_conf.require_one_barrier = true;
}
else
{
// The safe and accurate path (but slow)
GL_INS("FBMASK SW emulated fb_mask:%x on %d bits format", m_cached_ctx.FRAME.FBMSK,
(m_conf.ps.dfmt == 2) ? 16 : 32);
m_conf.require_full_barrier = true;
}
}
}
}
bool GSRendererHW::TestChannelShuffle(GSTextureCache::Target* src)
{
// We have to do the second test early here, because it might be a different source.
const bool shuffle = m_channel_shuffle || IsPossibleChannelShuffle();
// This is a little redundant since it'll get called twice, but it's the only way to stop us wasting time on copies.
m_channel_shuffle = (shuffle && EmulateChannelShuffle(src, true));
return m_channel_shuffle;
}
__ri bool GSRendererHW::EmulateChannelShuffle(GSTextureCache::Target* src, bool test_only)
{
if ((src->m_texture->GetType() == GSTexture::Type::DepthStencil) && !src->m_32_bits_fmt)
{
// So far 2 games hit this code path. Urban Chaos and Tales of Abyss
// UC: will copy depth to green channel
// ToA: will copy depth to alpha channel
if ((m_cached_ctx.FRAME.FBMSK & 0xFF0000) == 0xFF0000)
{
// Green channel is masked
GL_INS("Tales Of Abyss Crazyness (MSB 16b depth to Alpha)");
if (test_only)
return true;
m_conf.ps.tales_of_abyss_hle = 1;
}
else
{
GL_INS("Urban Chaos Crazyness (Green extraction)");
if (test_only)
return true;
m_conf.ps.urban_chaos_hle = 1;
}
}
else if (m_index.tail <= 64 && m_cached_ctx.CLAMP.WMT == 3)
{
// Blood Will Tell. I think it is a channel effect too, but again
// implemented in a different way. I don't want to add more CRC stuff, so
// let's disable the channel shuffle when the signature is different.
//
// Note: Tales Of Abyss and Tekken5 could hit this path too. Those games are
// handled above.
GL_INS("Maybe not a channel!");
if (test_only)
return false;
m_channel_shuffle = false;
return false;
}
else if (m_cached_ctx.CLAMP.WMS == 3 && ((m_cached_ctx.CLAMP.MAXU & 0x8) == 8))
{
// Read either blue or Alpha. Let's go for Blue ;)
// MGS3/Kill Zone
GL_INS("Blue channel");
if (test_only)
return true;
m_conf.ps.channel = ChannelFetch_BLUE;
}
else if (m_cached_ctx.CLAMP.WMS == 3 && ((m_cached_ctx.CLAMP.MINU & 0x8) == 0))
{
// Read either Red or Green. Let's check the V coordinate. 0-1 is likely top so
// red. 2-3 is likely bottom so green (actually depends on texture base pointer offset)
const bool green = PRIM->FST && (m_vertex.buff[0].V & 32);
if (green && (m_cached_ctx.FRAME.FBMSK & 0x00FFFFFF) == 0x00FFFFFF)
{
// Typically used in Terminator 3
const int blue_mask = m_cached_ctx.FRAME.FBMSK >> 24;
int blue_shift = -1;
// Note: potentially we could also check the value of the clut
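// blue_shift counts the unmasked low bits of the top FBMSK byte (e.g. 0xF0 -> 4, 0x80 -> 7);
// green_shift then covers the remaining 8 - blue_shift bits.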
switch (blue_mask)
{
case 0xFF: ASSERT(0); break;
case 0xFE: blue_shift = 1; break;
case 0xFC: blue_shift = 2; break;
case 0xF8: blue_shift = 3; break;
case 0xF0: blue_shift = 4; break;
case 0xE0: blue_shift = 5; break;
case 0xC0: blue_shift = 6; break;
case 0x80: blue_shift = 7; break;
default: break;
}
if (blue_shift >= 0)
{
const int green_mask = ~blue_mask & 0xFF;
const int green_shift = 8 - blue_shift;
GL_INS("Green/Blue channel (%d, %d)", blue_shift, green_shift);
if (test_only)
return true;
m_conf.cb_ps.ChannelShuffle = GSVector4i(blue_mask, blue_shift, green_mask, green_shift);
m_conf.ps.channel = ChannelFetch_GXBY;
m_cached_ctx.FRAME.FBMSK = 0x00FFFFFF;
}
else
{
GL_INS("Green channel (wrong mask) (fbmask %x)", blue_mask);
if (test_only)
return true;
m_conf.ps.channel = ChannelFetch_GREEN;
}
}
else if (green)
{
GL_INS("Green channel");
if (test_only)
return true;
m_conf.ps.channel = ChannelFetch_GREEN;
}
else
{
// Pop
GL_INS("Red channel");
if (test_only)
return true;
m_conf.ps.channel = ChannelFetch_RED;
}
}
else
{
// We can use the minimum UV to work out which channel it's grabbing.
// Used by Ape Escape 2, Everybody's Tennis/Golf, Okage, and Valkyrie Profile 2.
// Page align test to limit false detections (there are a few).
const GSVector4i min_uv = GSVector4i(m_vt.m_min.t.upld(GSVector4::zero()));
ChannelFetch channel = ChannelFetch_NONE;
if (GSLocalMemory::IsPageAligned(src->m_TEX0.PSM, m_r) &&
m_r.upl64(GSVector4i::zero()).eq(GSVector4i::zero()))
{
if (min_uv.eq(GSVector4i::cxpr(0, 0, 0, 0)))
channel = ChannelFetch_RED;
else if (min_uv.eq(GSVector4i::cxpr(0, 2, 0, 0)))
channel = ChannelFetch_GREEN;
else if (min_uv.eq(GSVector4i::cxpr(8, 0, 0, 0)))
channel = ChannelFetch_BLUE;
else if (min_uv.eq(GSVector4i::cxpr(8, 2, 0, 0)))
channel = ChannelFetch_ALPHA;
}
if (channel != ChannelFetch_NONE)
{
#ifdef ENABLE_OGL_DEBUG
static constexpr const char* channel_names[] = { "Red", "Green", "Blue", "Alpha" };
GL_INS("%s channel from min UV: r={%d,%d=>%d,%d} min uv = %d,%d", channel_names[static_cast<u32>(channel - 1)],
m_r.x, m_r.y, m_r.z, m_r.w, min_uv.x, min_uv.y);
#endif
if (test_only)
return true;
m_conf.ps.channel = channel;
}
else
{
GL_INS("Channel not supported r={%d,%d=>%d,%d} min uv = %d,%d",
m_r.x, m_r.y, m_r.z, m_r.w, min_uv.x, min_uv.y);
if (test_only)
return false;
m_channel_shuffle = false;
return false;
}
}
pxAssert(m_channel_shuffle);
// The effect really is a channel shuffle, so let's cheat a little
m_conf.tex = src->m_texture;
// Replace current draw with a fullscreen sprite
//
// GPU performance note: it could be wise to reduce the size to
// the rendered size of the framebuffer.
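// Vertex coordinates are in 12.4 fixed point, so the +16384 below is +1024 pixels:
// the two vertices describe a 1024x1024 sprite anchored at the context XY offset.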
GSVertex* s = &m_vertex.buff[0];
s[0].XYZ.X = static_cast<u16>(m_context->XYOFFSET.OFX + 0);
s[1].XYZ.X = static_cast<u16>(m_context->XYOFFSET.OFX + 16384);
s[0].XYZ.Y = static_cast<u16>(m_context->XYOFFSET.OFY + 0);
s[1].XYZ.Y = static_cast<u16>(m_context->XYOFFSET.OFY + 16384);
m_vertex.head = m_vertex.tail = m_vertex.next = 2;
m_index.tail = 2;
return true;
}
void GSRendererHW::EmulateBlending(int rt_alpha_min, int rt_alpha_max, bool& DATE_PRIMID, bool& DATE_BARRIER, bool& blending_alpha_pass)
{
{
// AA1: Don't enable blending on AA1; it's not yet implemented in hardware mode,
// and it requires coverage sampling, so it's safer to turn it off instead.
const bool AA1 = PRIM->AA1 && (m_vt.m_primclass == GS_LINE_CLASS || m_vt.m_primclass == GS_TRIANGLE_CLASS);
// PABE: Check condition early as an optimization.
const bool PABE = PRIM->ABE && m_draw_env->PABE.PABE && (GetAlphaMinMax().max < 128);
// FBMASK: Color is not written, no need to do blending.
const u32 temp_fbmask = m_conf.ps.dfmt == 2 ? 0x00F8F8F8 : 0x00FFFFFF;
const bool FBMASK = (m_cached_ctx.FRAME.FBMSK & temp_fbmask) == temp_fbmask;
// No blending or coverage anti-aliasing so early exit
if (FBMASK || PABE || !(PRIM->ABE || AA1))
{
m_conf.blend = {};
m_conf.ps.no_color1 = true;
return;
}
}
// Compute the blending equation to detect special case
const GSDevice::FeatureSupport features(g_gs_device->Features());
const GIFRegALPHA& ALPHA = m_context->ALPHA;
const GIFRegCOLCLAMP& COLCLAMP = m_draw_env->COLCLAMP;
// AFIX: Afix factor.
u8 AFIX = ALPHA.FIX;
// Set blending to shader bits
m_conf.ps.blend_a = ALPHA.A;
m_conf.ps.blend_b = ALPHA.B;
m_conf.ps.blend_c = ALPHA.C;
m_conf.ps.blend_d = ALPHA.D;
#ifdef ENABLE_OGL_DEBUG
static constexpr const char* col[3] = {"Cs", "Cd", "0"};
static constexpr const char* alpha[3] = {"As", "Ad", "Af"};
GL_INS("EmulateBlending(): (%s - %s) * %s + %s", col[ALPHA.A], col[ALPHA.B], alpha[ALPHA.C], col[ALPHA.D]);
GL_INS("Draw AlphaMinMax: %d-%d, RT AlphaMinMax: %d-%d", GetAlphaMinMax().min, GetAlphaMinMax().max, rt_alpha_min, rt_alpha_max);
#endif
bool blend_ad_improved = false;
const bool alpha_mask = (m_cached_ctx.FRAME.FBMSK & 0xFF000000) == 0xFF000000;
// When AA1 is enabled and Alpha Blending is disabled, alpha blending is done with coverage instead of alpha.
// We use a COV value of 128 (full coverage) in triangles (except the edge geometry, which we can't do easily).
if (IsCoverageAlpha())
{
m_conf.ps.fixed_one_a = 1;
m_conf.ps.blend_c = 0;
}
else if (m_conf.ps.blend_c == 1)
{
// When rt alpha min and max are equal, replace Ad with Af; it's easier to manage.
if (rt_alpha_min == rt_alpha_max)
{
AFIX = rt_alpha_min;
m_conf.ps.blend_c = 2;
}
// 24-bit formats don't have an alpha channel, so use a fix factor of 128 (1.0f) as the equivalent.
else if (m_conf.ps.dfmt == 1)
{
AFIX = 128;
m_conf.ps.blend_c = 2;
}
// Check whether we can use rt alpha min as the new alpha value; it will be more accurate.
else if (!alpha_mask && (rt_alpha_min >= (rt_alpha_max / 2)))
{
AFIX = rt_alpha_min;
m_conf.ps.blend_c = 2;
blend_ad_improved = true;
}
}
// Get alpha value
const bool alpha_c0_zero = (m_conf.ps.blend_c == 0 && GetAlphaMinMax().max == 0);
const bool alpha_c0_one = (m_conf.ps.blend_c == 0 && (GetAlphaMinMax().min == 128) && (GetAlphaMinMax().max == 128));
const bool alpha_c0_high_min_one = (m_conf.ps.blend_c == 0 && GetAlphaMinMax().min > 128);
const bool alpha_c0_high_max_one = (m_conf.ps.blend_c == 0 && GetAlphaMinMax().max > 128);
const bool alpha_c2_zero = (m_conf.ps.blend_c == 2 && AFIX == 0u);
const bool alpha_c2_one = (m_conf.ps.blend_c == 2 && AFIX == 128u);
const bool alpha_c2_high_one = (m_conf.ps.blend_c == 2 && AFIX > 128u);
const bool alpha_one = alpha_c0_one || alpha_c2_one;
// Optimize blending equations, must be done before index calculation
if ((m_conf.ps.blend_a == m_conf.ps.blend_b) || ((m_conf.ps.blend_b == m_conf.ps.blend_d) && alpha_one))
{
// Condition 1:
// A == B
// (A - B) * C, result will be 0.0f so set A B to Cs, C to As
// Condition 2:
// B == D
// Swap D with A
// A == B
// (A - B) * C, result will be 0.0f so set A B to Cs, C to As
if (m_conf.ps.blend_a != m_conf.ps.blend_b)
m_conf.ps.blend_d = m_conf.ps.blend_a;
m_conf.ps.blend_a = 0;
m_conf.ps.blend_b = 0;
m_conf.ps.blend_c = 0;
}
else if (alpha_c0_zero || alpha_c2_zero)
{
// C == 0.0f
// (A - B) * C, result will be 0.0f so set A B to Cs
m_conf.ps.blend_a = 0;
m_conf.ps.blend_b = 0;
}
else if (COLCLAMP.CLAMP && m_conf.ps.blend_a == 2
&& (m_conf.ps.blend_d == 2 || (m_conf.ps.blend_b == m_conf.ps.blend_d && (alpha_c0_high_min_one || alpha_c2_high_one))))
{
// CLAMP 1, negative result will be clamped to 0.
// Condition 1:
// (0 - Cs)*Alpha + 0, (0 - Cd)*Alpha + 0
// Condition 2:
// Alpha is either As or F higher than 1.0f
// (0 - Cd)*Alpha + Cd, (0 - Cs)*F + Cs
// Results will be 0.0f, make sure D is set to 2.
m_conf.ps.blend_a = 0;
m_conf.ps.blend_b = 0;
m_conf.ps.blend_c = 0;
m_conf.ps.blend_d = 2;
}
// Ad cases, alpha write is masked, one barrier is enough, for d3d11 read the fb
// Replace Ad with As, blend flags will be used from As since we are changing the blend_index value.
// Must be done before index calculation, after blending equation optimizations
const bool blend_ad = m_conf.ps.blend_c == 1;
bool blend_ad_alpha_masked = blend_ad && alpha_mask;
if (((GSConfig.AccurateBlendingUnit >= AccBlendLevel::Basic) || (COLCLAMP.CLAMP == 0))
&& g_gs_device->Features().texture_barrier && blend_ad_alpha_masked)
m_conf.ps.blend_c = 0;
else if (((GSConfig.AccurateBlendingUnit >= AccBlendLevel::Medium)
// Detect barrier aka fbmask on d3d11.
|| m_conf.require_one_barrier)
&& blend_ad_alpha_masked)
m_conf.ps.blend_c = 0;
else
blend_ad_alpha_masked = false;
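// blend_index packs the four blend selectors (each 0..2) into a single base-3 value,
// ((A*3 + B)*3 + C)*3 + D, indexing the 81-entry device blend table.
// e.g. the common (Cs - Cd)*As + Cd equation (A=0, B=1, C=0, D=1) gives index 10.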
u8 blend_index = static_cast<u8>(((m_conf.ps.blend_a * 3 + m_conf.ps.blend_b) * 3 + m_conf.ps.blend_c) * 3 + m_conf.ps.blend_d);
const HWBlend blend_preliminary = GSDevice::GetBlend(blend_index, false);
const int blend_flag = blend_preliminary.flags;
// Reset alpha since it was modified; must be done after index calculation.
if (blend_ad_alpha_masked)
m_conf.ps.blend_c = ALPHA.C;
// HW blend can handle Cd output.
bool color_dest_blend = !!(blend_flag & BLEND_CD);
// Do the multiplication in shader for blending accumulation: Cs*As + Cd or Cs*Af + Cd
bool accumulation_blend = !!(blend_flag & BLEND_ACCU);
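// With accumulation blend the shader outputs Cs*As (or Cs*Af) and the hardware unit
// only performs the ONE/ONE add with Cd (see the accumulation_blend branch further down).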
// If alpha == 1.0, almost everything is an accumulation blend!
// Ones that use (1 + Alpha) can't guarantee the mixed sw+hw blending this enables will give an identical result to sw due to clamping
// But enable for everything else that involves dst color
if (alpha_one && (m_conf.ps.blend_a != m_conf.ps.blend_d) && blend_preliminary.dst != GSDevice::CONST_ZERO)
accumulation_blend = true;
// Blending doesn't require barrier, or sampling of the rt
const bool blend_non_recursive = !!(blend_flag & BLEND_NO_REC);
// BLEND MIX selection, use a mix of hw/sw blending
const bool blend_mix1 = !!(blend_flag & BLEND_MIX1) &&
(features.dual_source_blend || !(m_conf.ps.blend_b == m_conf.ps.blend_d && (alpha_c0_high_min_one || alpha_c2_high_one)));
const bool blend_mix2 = !!(blend_flag & BLEND_MIX2);
const bool blend_mix3 = !!(blend_flag & BLEND_MIX3);
bool blend_mix = (blend_mix1 || blend_mix2 || blend_mix3) && COLCLAMP.CLAMP;
const bool one_barrier = m_conf.require_one_barrier || blend_ad_alpha_masked;
// Blend can be done on hw. As and F cases should be accurate.
// BLEND_HW_CLR1 with Ad, BLEND_HW_CLR3 Cs > 0.5f will require sw blend.
// BLEND_HW_CLR1 with As/F and BLEND_HW_CLR2 can be done in hw.
const bool clr_blend = !!(blend_flag & (BLEND_HW_CLR1 | BLEND_HW_CLR2 | BLEND_HW_CLR3));
bool clr_blend1_2 = (blend_flag & (BLEND_HW_CLR1 | BLEND_HW_CLR2)) && (m_conf.ps.blend_c != 1) && !blend_ad_improved // Make sure it isn't an Ad case
&& !m_draw_env->PABE.PABE // No PABE as it will require sw blending.
&& (COLCLAMP.CLAMP) // Let's add a colclamp check too, hw blend will clamp to 0-1.
&& !(one_barrier || m_conf.require_full_barrier); // Also don't run if there are barriers present.
// Warning: no break on purpose.
// Note: the [[fallthrough]] attribute tells compilers not to complain about missing breaks.
bool sw_blending = false;
if (features.texture_barrier)
{
// Condition 1: Require full sw blend for full barrier.
// Condition 2: One barrier is already enabled, prims don't overlap so let's use sw blend instead.
const bool prefer_sw_blend = m_conf.require_full_barrier || (one_barrier && m_prim_overlap == PRIM_OVERLAP_NO);
const bool no_prim_overlap = (m_prim_overlap == PRIM_OVERLAP_NO);
const bool free_blend = blend_non_recursive // Free sw blending, doesn't require barriers or reading fb
|| accumulation_blend; // Mix of hw/sw blending
const bool blend_requires_barrier = (blend_flag & BLEND_A_MAX) // Impossible blending
|| (m_conf.require_full_barrier) // Another effect (for example fbmask) already requires a full barrier
// Blend can be done in a single draw, and we already need a barrier
// On fbfetch, one barrier is like full barrier
|| (one_barrier && (no_prim_overlap || features.framebuffer_fetch))
|| ((alpha_c2_high_one || alpha_c0_high_max_one) && no_prim_overlap)
// Ad blends are completely wrong without sw blend (Ad is 0.5 not 1 for 128). We can spare a barrier for it.
|| ((blend_ad || blend_ad_improved) && no_prim_overlap);
switch (GSConfig.AccurateBlendingUnit)
{
case AccBlendLevel::Maximum:
clr_blend1_2 = false;
sw_blending |= true;
[[fallthrough]];
case AccBlendLevel::Full:
sw_blending |= m_conf.ps.blend_a != m_conf.ps.blend_b && alpha_c0_high_max_one;
[[fallthrough]];
case AccBlendLevel::High:
sw_blending |= m_conf.ps.blend_c == 1 || (m_conf.ps.blend_a != m_conf.ps.blend_b && alpha_c2_high_one);
[[fallthrough]];
case AccBlendLevel::Medium:
// The initial idea was to enable accurate blending for sprite rendering to correctly
// handle post-processing effects. Some games (ZoE) use tons of sprites as particles.
// In order to keep it fast, let's limit it to smaller draw calls.
sw_blending |= m_vt.m_primclass == GS_SPRITE_CLASS && m_drawlist.size() < 100;
[[fallthrough]];
case AccBlendLevel::Basic:
// SW FBMASK needs sw blend; avoid hitting any pre-enabled hw blend (accumulation, blend mix, blend cd).
// Fixes shadows in Superman: Shadow of Apokolips.
// DATE_BARRIER already does a full barrier, so it also makes more sense to do full sw blend.
color_dest_blend &= !prefer_sw_blend;
// If prims don't overlap prefer full sw blend on blend_ad_alpha_masked cases.
accumulation_blend &= !(prefer_sw_blend || (blend_ad_alpha_masked && m_prim_overlap == PRIM_OVERLAP_NO));
// Enable sw blending for barriers.
sw_blending |= blend_requires_barrier;
// Try to do hw blend for clr2 case.
sw_blending &= !clr_blend1_2;
// blend_ad_improved should only run if no other barrier blend is enabled, otherwise restore bit values.
if (blend_ad_improved && (sw_blending || prefer_sw_blend))
{
AFIX = 0;
m_conf.ps.blend_c = 1;
}
// Enable sw blending for free blending, should be done after blend_ad_improved check.
sw_blending |= free_blend;
// Do not run BLEND MIX if sw blending is already present, it's less accurate.
blend_mix &= !sw_blending;
sw_blending |= blend_mix;
// Disable dithering on blend mix.
m_conf.ps.dither &= !blend_mix;
[[fallthrough]];
case AccBlendLevel::Minimum:
break;
}
}
else
{
// FBMASK or channel shuffle already reads the fb so it is safe to enable sw blend when there is no overlap.
const bool fbmask_no_overlap = m_conf.require_one_barrier && (m_prim_overlap == PRIM_OVERLAP_NO);
switch (GSConfig.AccurateBlendingUnit)
{
case AccBlendLevel::Maximum:
if (m_prim_overlap == PRIM_OVERLAP_NO)
{
clr_blend1_2 = false;
sw_blending |= true;
}
[[fallthrough]];
case AccBlendLevel::Full:
sw_blending |= ((m_conf.ps.blend_c == 1 || (blend_mix && (alpha_c2_high_one || alpha_c0_high_max_one))) && (m_prim_overlap == PRIM_OVERLAP_NO));
[[fallthrough]];
case AccBlendLevel::High:
sw_blending |= (!(clr_blend || blend_mix) && (m_prim_overlap == PRIM_OVERLAP_NO));
[[fallthrough]];
case AccBlendLevel::Medium:
// If prims don't overlap prefer full sw blend on blend_ad_alpha_masked cases.
if (blend_ad_alpha_masked && m_prim_overlap == PRIM_OVERLAP_NO)
{
accumulation_blend = false;
sw_blending |= true;
}
[[fallthrough]];
case AccBlendLevel::Basic:
// Disable accumulation blend when there is fbmask with no overlap, will be faster.
color_dest_blend &= !fbmask_no_overlap;
accumulation_blend &= !fbmask_no_overlap;
// Blending requires reading the framebuffer when there's no overlap.
sw_blending |= fbmask_no_overlap;
// Try to do hw blend for clr2 case.
sw_blending &= !clr_blend1_2;
// blend_ad_improved should only run if no other barrier blend is enabled, otherwise restore bit values.
if (blend_ad_improved && (sw_blending || fbmask_no_overlap))
{
AFIX = 0;
m_conf.ps.blend_c = 1;
}
// Enable sw blending for free blending, should be done after blend_ad_improved check.
sw_blending |= accumulation_blend || blend_non_recursive;
// Do not run BLEND MIX if sw blending is already present, it's less accurate.
blend_mix &= !sw_blending;
sw_blending |= blend_mix;
// Disable dithering on blend mix.
m_conf.ps.dither &= !blend_mix;
[[fallthrough]];
case AccBlendLevel::Minimum:
break;
}
}
bool replace_dual_src = false;
if (!features.dual_source_blend && GSDevice::IsDualSourceBlend(blend_index))
{
// if we don't have an alpha channel, we don't need a second pass, just output the alpha blend
// in the single colour's alpha channel, and blend with it
if (!m_conf.colormask.wa)
{
GL_INS("Outputting alpha blend in col0 because of no alpha write");
m_conf.ps.no_ablend = true;
replace_dual_src = true;
}
else if (features.framebuffer_fetch || m_conf.require_one_barrier || m_conf.require_full_barrier)
{
// prefer single pass sw blend (if barrier) or framebuffer fetch over dual pass alpha when supported
sw_blending = true;
color_dest_blend = false;
accumulation_blend &= !features.framebuffer_fetch;
blend_mix = false;
}
else
{
// split the draw into two
blending_alpha_pass = true;
replace_dual_src = true;
}
}
else if (features.framebuffer_fetch)
{
// If we have fbfetch, use software blending when we need the fb value for anything else.
// This saves outputting the second color when it's not needed.
if (one_barrier || m_conf.require_full_barrier)
{
sw_blending = true;
color_dest_blend = false;
accumulation_blend = false;
blend_mix = false;
}
}
// Color clip
if (COLCLAMP.CLAMP == 0)
{
bool free_colclip = false;
if (features.framebuffer_fetch)
free_colclip = true;
else if (features.texture_barrier)
free_colclip = m_prim_overlap == PRIM_OVERLAP_NO || blend_non_recursive;
else
free_colclip = blend_non_recursive;
GL_DBG("COLCLIP Info (Blending: %u/%u/%u/%u, OVERLAP: %d)", m_conf.ps.blend_a, m_conf.ps.blend_b, m_conf.ps.blend_c, m_conf.ps.blend_d, m_prim_overlap);
if (color_dest_blend)
{
// No overflow, disable colclip.
GL_INS("COLCLIP mode DISABLED");
}
else if (free_colclip)
{
// The fastest algo that requires a single pass
GL_INS("COLCLIP Free mode ENABLED");
m_conf.ps.colclip = 1;
sw_blending = true;
// Disable the HDR algo
accumulation_blend = false;
blend_mix = false;
}
else if (accumulation_blend)
{
// A fast algo that requires 2 passes
GL_INS("COLCLIP Fast HDR mode ENABLED");
m_conf.ps.hdr = 1;
sw_blending = true; // Enable sw blending for the HDR algo
}
else if (sw_blending)
{
// A slow algo that could require several passes (barely used)
GL_INS("COLCLIP SW mode ENABLED");
m_conf.ps.colclip = 1;
}
else
{
GL_INS("COLCLIP HDR mode ENABLED");
m_conf.ps.hdr = 1;
}
}
// Per pixel alpha blending
if (m_draw_env->PABE.PABE)
{
// Breath of Fire Dragon Quarter, Strawberry Shortcake, Super Robot Wars, Cartoon Network Racing.
if (sw_blending)
{
GL_INS("PABE mode ENABLED");
if (features.texture_barrier)
{
// Disable hw/sw blend and do pure sw blend with reading the framebuffer.
color_dest_blend = false;
accumulation_blend = false;
blend_mix = false;
m_conf.ps.pabe = 1;
// HDR mode should be disabled when doing sw blend, swap with sw colclip.
if (m_conf.ps.hdr)
{
m_conf.ps.hdr = 0;
m_conf.ps.colclip = 1;
}
}
else
{
m_conf.ps.pabe = !(accumulation_blend || blend_mix);
}
}
else if (m_conf.ps.blend_a == 0 && m_conf.ps.blend_b == 1 && m_conf.ps.blend_c == 0 && m_conf.ps.blend_d == 1)
{
// This works because with PABE alpha blending is on when alpha >= 0x80, but since the pixel shader
// cannot output anything over 0x80 (== 1.0), blending with 0x80 or turning it off gives the same result.
blend_index = 0;
}
}
// For stat to optimize accurate option
#if 0
GL_INS("BLEND_INFO: %u/%u/%u/%u. Clamp:%u. Prim:%d number %u (drawlist %zu) (sw %d)",
m_conf.ps.blend_a, m_conf.ps.blend_b, m_conf.ps.blend_c, m_conf.ps.blend_d,
m_env.COLCLAMP.CLAMP, m_vt.m_primclass, m_vertex.next, m_drawlist.size(), sw_blending);
#endif
if (color_dest_blend)
{
// Blend output will be Cd, disable hw/sw blending.
m_conf.blend = {};
m_conf.ps.no_color1 = true;
m_conf.ps.blend_a = m_conf.ps.blend_b = m_conf.ps.blend_c = m_conf.ps.blend_d = 0;
sw_blending = false; // DATE_PRIMID
// Output is Cd, set rgb write to 0.
m_conf.colormask.wrgba &= 0x8;
}
else if (sw_blending)
{
// Requires the fixed alpha value
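// AFIX is in GS alpha units where 0x80 means 1.0, hence the divide by 128 below.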
if (m_conf.ps.blend_c == 2)
m_conf.cb_ps.TA_MaxDepth_Af.a = static_cast<float>(AFIX) / 128.0f;
const HWBlend blend = GSDevice::GetBlend(blend_index, replace_dual_src);
if (accumulation_blend)
{
// Keep HW blending to do the addition/subtraction
m_conf.blend = {true, GSDevice::CONST_ONE, GSDevice::CONST_ONE, blend.op, false, 0};
blending_alpha_pass = false;
// Remove Cd from sw blend, it's handled in hw
if (m_conf.ps.blend_a == 1)
m_conf.ps.blend_a = 2;
if (m_conf.ps.blend_b == 1)
m_conf.ps.blend_b = 2;
if (m_conf.ps.blend_d == 1)
m_conf.ps.blend_d = 2;
if (m_conf.ps.blend_a == 2)
{
// Accumulation blend is only available in (Cs - 0)*Something + Cd, or with alpha == 1
ASSERT(m_conf.ps.blend_d == 2 || alpha_one);
// A bit of normalization
m_conf.ps.blend_a = m_conf.ps.blend_d;
m_conf.ps.blend_d = 2;
}
if (blend.op == GSDevice::OP_REV_SUBTRACT)
{
ASSERT(m_conf.ps.blend_a == 2);
if (m_conf.ps.hdr)
{
// HDR uses unorm, which is always positive
// Have the shader do the inversion, then clip to remove the negative
m_conf.blend.op = GSDevice::OP_ADD;
}
else
{
// The blend unit does a reverse subtraction so it means
// the shader must output a positive value.
// Replace 0 - Cs by Cs - 0
m_conf.ps.blend_a = m_conf.ps.blend_b;
m_conf.ps.blend_b = 2;
}
}
// Dual source output not needed (accumulation blend replaces it with ONE).
m_conf.ps.no_color1 = true;
// Only Ad case will require one barrier
// No need to set a_masked bit for blend_ad_alpha_masked case
m_conf.require_one_barrier |= blend_ad_alpha_masked;
}
else if (blend_mix)
{
// For mixed blend, the source blend is done in the shader (so we use CONST_ONE as a factor).
m_conf.blend = {true, GSDevice::CONST_ONE, blend.dst, blend.op, m_conf.ps.blend_c == 2, AFIX};
m_conf.ps.blend_mix = (blend.op == GSDevice::OP_REV_SUBTRACT) ? 2 : 1;
// Elide DSB colour output if not used by dest.
m_conf.ps.no_color1 |= !GSDevice::IsDualSourceBlendFactor(blend.dst);
if (blend_mix1)
{
if (m_conf.ps.blend_b == m_conf.ps.blend_d && (alpha_c0_high_min_one || alpha_c2_high_one))
{
// Replace Cs*As + Cd*(1 - As) with Cs*As - Cd*(As - 1).
// Replace Cs*F + Cd*(1 - F) with Cs*F - Cd*(F - 1).
// As - 1 or F - 1 subtraction is only done for the dual source output (hw blending part) since we are changing the equation.
// Af will be replaced with As in the shader and sent to the dual source output.
m_conf.blend = {true, GSDevice::CONST_ONE, GSDevice::SRC1_COLOR, GSDevice::OP_SUBTRACT, false, 0};
// blend hw 1 will disable alpha clamp, we can reuse the old bits.
m_conf.ps.blend_hw = 1;
// DSB output will always be used.
m_conf.ps.no_color1 = false;
}
else if (m_conf.ps.blend_a == m_conf.ps.blend_d)
{
// Compensate slightly for Cd*(As + 1) - Cs*As.
// Try to compensate a bit by subtracting 1 (0.00392) * (Alpha + 1) from Cs.
m_conf.ps.blend_hw = 2;
}
m_conf.ps.blend_a = 0;
m_conf.ps.blend_b = 2;
m_conf.ps.blend_d = 2;
}
else if (blend_mix2)
{
// Allow compensation when Cs*(Alpha + 1) overflows; to compensate we change
// the alpha output value used for Cd*Alpha.
m_conf.blend = {true, GSDevice::CONST_ONE, GSDevice::SRC1_COLOR, blend.op, false, 0};
m_conf.ps.blend_hw = 3;
m_conf.ps.no_color1 = false;
m_conf.ps.blend_a = 0;
m_conf.ps.blend_b = 2;
m_conf.ps.blend_d = 0;
}
else if (blend_mix3)
{
m_conf.ps.blend_a = 2;
m_conf.ps.blend_b = 0;
m_conf.ps.blend_d = 0;
}
// Only Ad case will require one barrier
if (blend_ad_alpha_masked)
{
// Swap Ad with As for hw blend
m_conf.ps.a_masked = 1;
m_conf.require_one_barrier |= true;
}
}
else
{
// Disable HW blending
m_conf.blend = {};
m_conf.ps.no_color1 = true;
replace_dual_src = false;
blending_alpha_pass = false;
// No need to set a_masked bit for blend_ad_alpha_masked case
const bool blend_non_recursive_one_barrier = blend_non_recursive && blend_ad_alpha_masked;
if (blend_non_recursive_one_barrier)
m_conf.require_one_barrier |= true;
else if (features.texture_barrier)
m_conf.require_full_barrier |= !blend_non_recursive;
else
m_conf.require_one_barrier |= !blend_non_recursive;
}
}
else
{
// No sw blending
m_conf.ps.blend_a = 0;
m_conf.ps.blend_b = 0;
m_conf.ps.blend_d = 0;
// Care for hw blend value, 6 is for hw/sw, sw blending used.
if (blend_flag & BLEND_HW_CLR1)
{
m_conf.ps.blend_hw = 1;
}
else if (blend_flag & BLEND_HW_CLR2)
{
if (m_conf.ps.blend_c == 2)
m_conf.cb_ps.TA_MaxDepth_Af.a = static_cast<float>(AFIX) / 128.0f;
m_conf.ps.blend_hw = 2;
}
else if (blend_flag & BLEND_HW_CLR3)
{
m_conf.ps.blend_hw = 3;
}
if (blend_ad_alpha_masked)
{
m_conf.ps.a_masked = 1;
m_conf.require_one_barrier |= true;
}
const HWBlend blend(GSDevice::GetBlend(blend_index, replace_dual_src));
m_conf.blend = {true, blend.src, blend.dst, blend.op, m_conf.ps.blend_c == 2, AFIX};
// Remove second color output when unused. Works around bugs in some drivers (e.g. Intel).
m_conf.ps.no_color1 |= !GSDevice::IsDualSourceBlendFactor(m_conf.blend.src_factor) &&
!GSDevice::IsDualSourceBlendFactor(m_conf.blend.dst_factor);
}
// Notify the shader that it needs to invert rounding
if (m_conf.blend.op == GSDevice::OP_REV_SUBTRACT)
m_conf.ps.round_inv = 1;
// DATE_PRIMID interacts very badly with sw blending. DATE_PRIMID uses the primitiveID to find the primitive
// that writes the bad alpha value. Sw blending will force the draw to run primitive by primitive
// (therefore primitiveID will be constant at 1).
// Switch DATE_PRIMID to DATE_BARRIER in such cases to ensure accuracy.
// No mixing of COLCLIP + sw blend + DATE_PRIMID, nor of sw fbmask + DATE_PRIMID.
// Note: Do the swap at the end; it saves the expensive draw splitting/barriers when mixed software blending is used.
if (sw_blending && DATE_PRIMID && m_conf.require_full_barrier)
{
GL_PERF("DATE: Swap DATE_PRIMID with DATE_BARRIER");
m_conf.require_full_barrier = true;
DATE_PRIMID = false;
DATE_BARRIER = true;
}
}
__ri static constexpr bool IsRedundantClamp(u8 clamp, u32 clamp_min, u32 clamp_max, u32 tsize)
{
// Don't use shader sampling when the clamp/repeat is configured to the exact texture size.
// That way trilinear etc. still works.
const u32 textent = (1u << tsize) - 1u;
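// e.g. TW=8 (256 texels) with REGION_CLAMP MINU=0 / MAXU>=255 clamps to exactly the
// texture extent, so the plain hardware CLAMP/REPEAT behaves identically.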
if (clamp == CLAMP_REGION_CLAMP)
return (clamp_min == 0 && clamp_max >= textent);
else if (clamp == CLAMP_REGION_REPEAT)
return (clamp_max == 0 && clamp_min == textent);
else
return false;
}
__ri static constexpr u8 EffectiveClamp(u8 clamp, bool has_region)
{
// When we have extracted the region in the texture, we can use the hardware sampler for repeat/clamp.
// (weird flip here because clamp/repeat is inverted for region vs non-region).
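// With the standard wrap encoding (REPEAT=0, CLAMP=1, REGION_CLAMP=2, REGION_REPEAT=3),
// clamp ^ 3 maps REGION_CLAMP -> CLAMP and REGION_REPEAT -> REPEAT.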
return (clamp >= CLAMP_REGION_CLAMP && has_region) ? (clamp ^ 3) : clamp;
}
__ri void GSRendererHW::EmulateTextureSampler(const GSTextureCache::Target* rt, const GSTextureCache::Target* ds, GSTextureCache::Source* tex, const TextureMinMaxResult& tmm, GSTexture*& src_copy)
{
// don't overwrite the texture when using channel shuffle, but keep the palette
if (!m_channel_shuffle)
m_conf.tex = tex->m_texture;
m_conf.pal = tex->m_palette;
// Hazard handling (i.e. reading from the current RT/DS).
GSTextureCache::SourceRegion source_region = tex->GetRegion();
bool target_region = (tex->IsFromTarget() && source_region.HasEither());
GSVector2i unscaled_size = target_region ? tex->GetRegionSize() : tex->GetUnscaledSize();
float scale = tex->GetScale();
HandleTextureHazards(rt, ds, tex, tmm, source_region, target_region, unscaled_size, scale, src_copy);
// Warning: fetch the texture PSM format rather than the context format. The latter could have been corrected in the texture cache for depth.
//const GSLocalMemory::psm_t &psm = GSLocalMemory::m_psm[m_cached_ctx.TEX0.PSM];
const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[tex->m_TEX0.PSM];
const GSLocalMemory::psm_t& cpsm = psm.pal > 0 ? GSLocalMemory::m_psm[m_cached_ctx.TEX0.CPSM] : psm;
// Redundant clamp tests are restricted to local memory/1x sources only, if we're from a target,
// we keep the shader clamp. See #5851 on github, and the note in Draw().
[[maybe_unused]] static constexpr const char* clamp_modes[] = {"REPEAT", "CLAMP", "REGION_CLAMP", "REGION_REPEAT"};
const bool redundant_wms = IsRedundantClamp(m_cached_ctx.CLAMP.WMS, m_cached_ctx.CLAMP.MINU,
m_cached_ctx.CLAMP.MAXU, tex->m_TEX0.TW);
const bool redundant_wmt = IsRedundantClamp(m_cached_ctx.CLAMP.WMT, m_cached_ctx.CLAMP.MINV,
m_cached_ctx.CLAMP.MAXV, tex->m_TEX0.TH);
const u8 wms = EffectiveClamp(m_cached_ctx.CLAMP.WMS, !tex->m_target && (source_region.HasX() || redundant_wms));
const u8 wmt = EffectiveClamp(m_cached_ctx.CLAMP.WMT, !tex->m_target && (source_region.HasY() || redundant_wmt));
const bool complex_wms_wmt = !!((wms | wmt) & 2) || target_region;
GL_CACHE("FST: %s WMS: %s [%s%s] WMT: %s [%s%s] Complex: %d TargetRegion: %d MINU: %d MAXU: %d MINV: %d MAXV: %d",
PRIM->FST ? "UV" : "STQ", clamp_modes[m_cached_ctx.CLAMP.WMS], redundant_wms ? "redundant," : "",
clamp_modes[wms], clamp_modes[m_cached_ctx.CLAMP.WMT], redundant_wmt ? "redundant," : "", clamp_modes[wmt],
complex_wms_wmt, target_region, m_cached_ctx.CLAMP.MINU, m_cached_ctx.CLAMP.MAXU, m_cached_ctx.CLAMP.MINV,
m_cached_ctx.CLAMP.MAXV);
const bool need_mipmap = IsMipMapDraw();
const bool shader_emulated_sampler = tex->m_palette || (tex->m_target && !m_conf.ps.shuffle && cpsm.fmt != 0) ||
complex_wms_wmt || psm.depth || target_region;
const bool trilinear_manual = need_mipmap && GSConfig.HWMipmap == HWMipmapLevel::Full;
bool bilinear = m_vt.IsLinear();
int trilinear = 0;
bool trilinear_auto = false; // Generate mipmaps if needed (basic).
switch (GSConfig.TriFilter)
{
case TriFiltering::Forced:
{
// Force bilinear otherwise we can end up with min/mag nearest and mip linear.
// We don't need to check for HWMipmapLevel::Off here, because forced trilinear implies forced mipmaps.
bilinear = true;
trilinear = static_cast<u8>(GS_MIN_FILTER::Linear_Mipmap_Linear);
trilinear_auto = !tex->m_target && (!need_mipmap || GSConfig.HWMipmap != HWMipmapLevel::Full);
}
break;
case TriFiltering::PS2:
{
// Can only use PS2 trilinear when mipmapping is enabled.
if (need_mipmap && GSConfig.HWMipmap != HWMipmapLevel::Off)
{
trilinear = m_context->TEX1.MMIN;
trilinear_auto = !tex->m_target && GSConfig.HWMipmap != HWMipmapLevel::Full;
}
}
break;
case TriFiltering::Automatic:
case TriFiltering::Off:
default:
break;
}
// 1 and 0 are equivalent
m_conf.ps.wms = (wms & 2 || target_region) ? wms : 0;
m_conf.ps.wmt = (wmt & 2 || target_region) ? wmt : 0;
// Depth + bilinear filtering isn't done yet (and I'm not sure we need it anyway, but a game will prove me wrong).
// Of course GTA sets the linear mode, but sampling is done at the texel center so it is equivalent to nearest sampling.
// Other games worth testing: Area 51, Burnout
if (psm.depth && m_vt.IsLinear())
GL_INS("WARNING: Depth + bilinear filtering not supported");
// Performance note:
// 1/ Don't set 0 as it is the default value
// 2/ Only keep aem when it is useful (avoid useless shader permutation)
if (m_conf.ps.shuffle)
{
const GIFRegTEXA& TEXA = m_draw_env->TEXA;
// Force a 32-bit access (normally shuffle is done on 16 bits)
// m_ps_sel.tex_fmt = 0; // removed as an optimization
//ASSERT(tex->m_target);
// Requires a float conversion if the texture is a depth format, otherwise uses integral scaling
if (psm.depth)
{
m_conf.ps.depth_fmt = (tex->m_texture->GetType() != GSTexture::Type::DepthStencil) ? 3 : 1;
}
// Shuffle is a 16-bit format, so aem is always required
if (m_cached_ctx.TEX0.TCC)
{
m_conf.ps.aem = TEXA.AEM;
GSVector4 ta(TEXA & GSVector4i::x000000ff());
ta /= 255.0f;
m_conf.cb_ps.TA_MaxDepth_Af.x = ta.x;
m_conf.cb_ps.TA_MaxDepth_Af.y = ta.y;
}
else
{
m_conf.cb_ps.TA_MaxDepth_Af.x = 0;
m_conf.cb_ps.TA_MaxDepth_Af.y = 1.0f;
}
// The purpose of texture shuffle is to move color channels. Extra interpolation is likely a bad idea.
bilinear &= m_vt.IsLinear();
const GSVector4 half_pixel = RealignTargetTextureCoordinate(tex);
m_conf.cb_vs.texture_offset = GSVector2(half_pixel.x, half_pixel.y);
}
else if (tex->m_target)
{
const GIFRegTEXA& TEXA = m_draw_env->TEXA;
// Use an old target. AEM and index aren't resolved; it must be done
// on the GPU.
// Select the 32/24/16 bits color (AEM)
m_conf.ps.aem_fmt = cpsm.fmt;
m_conf.ps.aem = TEXA.AEM;
// Don't upload AEM if the format is 32-bit
if (cpsm.fmt)
{
GSVector4 ta(TEXA & GSVector4i::x000000ff());
ta /= 255.0f;
m_conf.cb_ps.TA_MaxDepth_Af.x = ta.x;
m_conf.cb_ps.TA_MaxDepth_Af.y = ta.y;
}
// Select the index format
if (tex->m_palette)
{
// FIXME Potentially improve fmt field in GSLocalMemory
if (m_cached_ctx.TEX0.PSM == PSMT4HL)
m_conf.ps.pal_fmt = 1;
else if (m_cached_ctx.TEX0.PSM == PSMT4HH)
m_conf.ps.pal_fmt = 2;
else
m_conf.ps.pal_fmt = 3;
// Alpha channel of the RT is reinterpreted as an index. Star
// Ocean 3 uses it to emulate a stencil buffer. It is a very
// bad idea to force bilinear filtering on it.
bilinear &= m_vt.IsLinear();
}
// Depth format
if (tex->m_texture->GetType() == GSTexture::Type::DepthStencil)
{
// Requires a float conversion if the texture is a depth format
m_conf.ps.depth_fmt = (psm.bpp == 16) ? 2 : 1;
// Don't force interpolation on depth format
bilinear &= m_vt.IsLinear();
}
else if (psm.depth)
{
// Use Integral scaling
m_conf.ps.depth_fmt = 3;
// Don't force interpolation on depth format
bilinear &= m_vt.IsLinear();
}
const GSVector4 half_pixel = RealignTargetTextureCoordinate(tex);
m_conf.cb_vs.texture_offset = GSVector2(half_pixel.x, half_pixel.y);
}
else if (tex->m_palette)
{
// Use a standard 8-bit texture. AEM is already done on the CLUT,
// therefore only the index needs to be set.
// m_conf.ps.aem = 0; // removed as an optimization
// Note: 4-bit indexes are converted to 8 bits
m_conf.ps.pal_fmt = 3;
}
else
{
// Standard texture. Both index and AEM expansion were already done by the CPU.
// m_conf.ps.tex_fmt = 0; // removed as an optimization
// m_conf.ps.aem = 0; // removed as an optimization
}
if (m_cached_ctx.TEX0.TFX == TFX_MODULATE && m_vt.m_eq.rgba == 0xFFFF && m_vt.m_min.c.eq(GSVector4i(128)))
{
// Micro optimization that reduces GPU load (removes 5 instructions on the FS program)
m_conf.ps.tfx = TFX_DECAL;
}
else
{
m_conf.ps.tfx = m_cached_ctx.TEX0.TFX;
}
m_conf.ps.tcc = m_cached_ctx.TEX0.TCC;
m_conf.ps.ltf = bilinear && shader_emulated_sampler;
m_conf.ps.point_sampler = g_gs_device->Features().broken_point_sampler && !target_region && (!bilinear || shader_emulated_sampler);
const int tw = static_cast<int>(1 << m_cached_ctx.TEX0.TW);
const int th = static_cast<int>(1 << m_cached_ctx.TEX0.TH);
const int miptw = 1 << tex->m_TEX0.TW;
const int mipth = 1 << tex->m_TEX0.TH;
const GSVector4 WH(static_cast<float>(tw), static_cast<float>(th), miptw * scale, mipth * scale);
// Reduction factor when source is a target and smaller/larger than TW/TH.
m_conf.cb_ps.STScale = GSVector2(static_cast<float>(miptw) / static_cast<float>(unscaled_size.x),
static_cast<float>(mipth) / static_cast<float>(unscaled_size.y));
if (target_region)
{
// Use texelFetch() and clamp. Subtract one because the upper bound is exclusive.
m_conf.cb_ps.STRange = GSVector4(tex->GetRegionRect() - GSVector4i::cxpr(0, 0, 1, 1)) * GSVector4(scale);
m_conf.ps.region_rect = true;
}
else if (!tex->m_target)
{
// Targets aren't currently offset, so STScale takes care of it.
if (source_region.HasX())
{
m_conf.cb_ps.STRange.x = static_cast<float>(source_region.GetMinX()) / static_cast<float>(miptw);
m_conf.cb_ps.STRange.z = static_cast<float>(miptw) / static_cast<float>(source_region.GetWidth());
m_conf.ps.adjs = 1;
}
if (source_region.HasY())
{
m_conf.cb_ps.STRange.y = static_cast<float>(source_region.GetMinY()) / static_cast<float>(mipth);
m_conf.cb_ps.STRange.w = static_cast<float>(mipth) / static_cast<float>(source_region.GetHeight());
m_conf.ps.adjt = 1;
}
}
m_conf.ps.fst = !!PRIM->FST;
m_conf.cb_ps.WH = WH;
m_conf.cb_ps.HalfTexel = GSVector4(-0.5f, 0.5f).xxyy() / WH.zwzw();
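// HalfTexel ends up as (-0.5, -0.5, +0.5, +0.5) divided by the scaled source size,
// i.e. a half-texel offset expressed in normalized texture coordinates.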
if (complex_wms_wmt)
{
const GSVector4i clamp(m_cached_ctx.CLAMP.MINU, m_cached_ctx.CLAMP.MINV, m_cached_ctx.CLAMP.MAXU, m_cached_ctx.CLAMP.MAXV);
const GSVector4 region_repeat(GSVector4::cast(clamp));
const GSVector4 region_clamp(GSVector4(clamp) / WH.xyxy());
if (wms >= CLAMP_REGION_CLAMP)
{
m_conf.cb_ps.MinMax.x = (wms == CLAMP_REGION_CLAMP && !m_conf.ps.depth_fmt) ? region_clamp.x : region_repeat.x;
m_conf.cb_ps.MinMax.z = (wms == CLAMP_REGION_CLAMP && !m_conf.ps.depth_fmt) ? region_clamp.z : region_repeat.z;
}
if (wmt >= CLAMP_REGION_CLAMP)
{
m_conf.cb_ps.MinMax.y = (wmt == CLAMP_REGION_CLAMP && !m_conf.ps.depth_fmt) ? region_clamp.y : region_repeat.y;
m_conf.cb_ps.MinMax.w = (wmt == CLAMP_REGION_CLAMP && !m_conf.ps.depth_fmt) ? region_clamp.w : region_repeat.w;
}
}
else if (trilinear_manual)
{
// Reuse uv_min_max for mipmap parameter to avoid an extension of the UBO
m_conf.cb_ps.MinMax.x = static_cast<float>(m_context->TEX1.K) / 16.0f;
m_conf.cb_ps.MinMax.y = static_cast<float>(1 << m_context->TEX1.L);
m_conf.cb_ps.MinMax.z = static_cast<float>(m_lod.x); // Offset because first layer is m_lod, dunno if we can do better
m_conf.cb_ps.MinMax.w = static_cast<float>(m_lod.y);
}
else if (trilinear_auto)
{
tex->m_texture->GenerateMipmapsIfNeeded();
}
// TC Offset Hack
m_conf.ps.tcoffsethack = m_userhacks_tcoffset;
const GSVector4 tc_oh_ts = GSVector4(1 / 16.0f, 1 / 16.0f, m_userhacks_tcoffset_x, m_userhacks_tcoffset_y) / WH.xyxy();
m_conf.cb_ps.TCOffsetHack = GSVector2(tc_oh_ts.z, tc_oh_ts.w);
m_conf.cb_vs.texture_scale = GSVector2(tc_oh_ts.x, tc_oh_ts.y);
// Only enable clamping in CLAMP mode. REGION_CLAMP will be done manually in the shader
m_conf.sampler.tau = (wms == CLAMP_REPEAT && !target_region);
m_conf.sampler.tav = (wmt == CLAMP_REPEAT && !target_region);
if (shader_emulated_sampler)
{
m_conf.sampler.biln = 0;
m_conf.sampler.aniso = 0;
m_conf.sampler.triln = 0;
}
else
{
m_conf.sampler.biln = bilinear;
// Aniso filtering doesn't work with textureLod so use texture (automatic_lod) instead.
// Enable aniso only for triangles. Sprites are flat so aniso is likely useless (it would save perf for other primitives).
const bool anisotropic = m_vt.m_primclass == GS_TRIANGLE_CLASS && !trilinear_manual;
m_conf.sampler.aniso = anisotropic;
m_conf.sampler.triln = trilinear;
if (trilinear_manual)
{
m_conf.ps.manual_lod = 1;
}
else if (trilinear_auto || anisotropic)
{
m_conf.ps.automatic_lod = 1;
}
}
// clamp to base level if we're not providing or generating mipmaps
// manual trilinear causes the chain to be uploaded, auto causes it to be generated
m_conf.sampler.lodclamp = !(trilinear_manual || trilinear_auto);
}
__ri void GSRendererHW::HandleTextureHazards(const GSTextureCache::Target* rt, const GSTextureCache::Target* ds,
const GSTextureCache::Source* tex, const TextureMinMaxResult& tmm, GSTextureCache::SourceRegion& source_region,
bool& target_region, GSVector2i& unscaled_size, float& scale, GSTexture*& src_copy)
{
// Detect framebuffer read that will need special handling
const GSTextureCache::Target* src_target = nullptr;
if (m_conf.tex == m_conf.rt)
{
// Can we read the framebuffer directly? (i.e. sample location matches up).
if (CanUseTexIsFB(rt, tex, tmm))
{
m_conf.tex = nullptr;
m_conf.ps.tex_is_fb = true;
if (m_prim_overlap == PRIM_OVERLAP_NO || !g_gs_device->Features().texture_barrier)
m_conf.require_one_barrier = true;
else
m_conf.require_full_barrier = true;
unscaled_size = rt->GetUnscaledSize();
scale = rt->GetScale();
return;
}
GL_CACHE("Source is render target, taking copy.");
src_target = rt;
}
else if (m_conf.tex == m_conf.ds)
{
// GL, Vulkan (in General layout), not DirectX!
const bool can_read_current_depth_buffer = g_gs_device->Features().test_and_sample_depth;
// If this is our current Z buffer, we might not be able to read it directly if it's being written to.
// Rather than leaving the backend to do it, we'll check it here.
if (can_read_current_depth_buffer && (m_cached_ctx.ZBUF.ZMSK || m_cached_ctx.TEST.ZTST == ZTST_NEVER))
{
// Safe to read!
GL_CACHE("Source is depth buffer, not writing, safe to read.");
unscaled_size = ds->GetUnscaledSize();
scale = ds->GetScale();
return;
}
// Can't safely read the depth buffer, so we need to take a copy of it.
GL_CACHE("Source is depth buffer, unsafe to read, taking copy.");
src_target = ds;
}
else
{
// No match.
return;
}
// We need to copy. Try to cut down the source range as much as possible so we don't copy texels we're not reading.
const GSVector2i& src_unscaled_size = src_target->GetUnscaledSize();
const GSVector4i src_bounds = src_target->GetUnscaledRect();
GSVector4i copy_range;
GSVector2i copy_size;
GSVector2i copy_dst_offset;
// Shuffles take the whole target. This should've already been halved.
// We can't partially copy depth targets in DirectX, and GL/Vulkan should use the direct read above.
// Restricting it also breaks Tom and Jerry...
if (m_channel_shuffle || tex->m_texture->GetType() == GSTexture::Type::DepthStencil)
{
copy_range = src_bounds;
copy_size = src_unscaled_size;
GSVector4i::storel(&copy_dst_offset, copy_range);
}
else
{
// If we're using TW/TH-based sizing, take the size from TEX0, not the target.
const GSVector2i tex_size = GSVector2i(1 << m_cached_ctx.TEX0.TW, 1 << m_cached_ctx.TEX0.TH);
copy_size.x = std::min(tex_size.x, src_unscaled_size.x);
copy_size.y = std::min(tex_size.y, src_unscaled_size.y);
// Use the texture min/max to get the copy range.
copy_range = tmm.coverage;
// Texture size above might be invalid (Timesplitters 2), extend if needed.
if (m_cached_ctx.CLAMP.WMS >= CLAMP_REGION_CLAMP && copy_range.z > copy_size.x)
copy_size.x = src_unscaled_size.x;
if (m_cached_ctx.CLAMP.WMT >= CLAMP_REGION_CLAMP && copy_range.w > copy_size.y)
copy_size.y = src_unscaled_size.y;
// Texture shuffles might read up to +/- 8 pixels on either side.
if (m_texture_shuffle)
copy_range = (copy_range + GSVector4i::cxpr(-8, 0, 8, 0)).max_i32(GSVector4i::zero());
// Apply target region offset.
// TODO: Shrink the output texture to only the copy size.
// Currently there's precision issues when using point sampling with normalized coordinates.
// Once we move those over to texelFetch(), we should be able to shrink the size of the copy textures.
if (target_region)
{
// Create a new texture using only the carved out region. Might save a bit of GPU time if we're lucky.
const GSVector4i src_offset = GSVector4i(source_region.GetMinX(), source_region.GetMinY()).xyxy();
copy_range += src_offset;
copy_range = copy_range.rintersect(source_region.GetRect(src_unscaled_size.x, src_unscaled_size.y));
GL_CACHE("Applying target region at copy: %dx%d @ %d,%d => %d,%d", copy_range.width(), copy_range.height(),
tmm.coverage.x, tmm.coverage.y, copy_range.x, copy_range.y);
// Remove target region flag, we don't need to offset the coordinates anymore.
source_region = {};
target_region = false;
// Make sure it's not out of the source's bounds.
copy_range = copy_range.rintersect(src_bounds);
// Unapply the region offset for the destination coordinates.
const GSVector4i dst_range = copy_range - src_offset;
GSVector4i::storel(&copy_dst_offset, dst_range);
// We shouldn't need a larger texture because of the TS2 check above, but just in case.
GSVector4i::storel(&copy_size, GSVector4i(copy_size).max_i32(dst_range.zwzw()));
}
else
{
// TODO: We also could use source region here to offset the coordinates.
copy_range = copy_range.rintersect(src_bounds);
GSVector4i::storel(&copy_dst_offset, copy_range);
}
}
if (copy_range.rempty())
{
// Reading outside of the RT range.
GL_CACHE("ERROR: Reading outside of the RT range, using null texture.");
unscaled_size = GSVector2i(1, 1);
scale = 1.0f;
m_conf.tex = nullptr;
m_conf.ps.tfx = 4;
return;
}
unscaled_size = copy_size;
scale = src_target->GetScale();
GL_CACHE("Copy size: %dx%d, range: %d,%d -> %d,%d (%dx%d) @ %.1f", copy_size.x, copy_size.y, copy_range.x,
copy_range.y, copy_range.z, copy_range.w, copy_range.width(), copy_range.height(), scale);
const GSVector2i scaled_copy_size = GSVector2i(static_cast<int>(std::ceil(static_cast<float>(copy_size.x) * scale)),
static_cast<int>(std::ceil(static_cast<float>(copy_size.y) * scale)));
const GSVector4i scaled_copy_range = GSVector4i((GSVector4(copy_range) * GSVector4(scale)).ceil());
const GSVector2i scaled_copy_dst_offset =
GSVector2i(static_cast<int>(std::ceil(static_cast<float>(copy_dst_offset.x) * scale)),
static_cast<int>(std::ceil(static_cast<float>(copy_dst_offset.y) * scale)));
src_copy = src_target->m_texture->IsDepthStencil() ?
g_gs_device->CreateDepthStencil(
scaled_copy_size.x, scaled_copy_size.y, src_target->m_texture->GetFormat(), false) :
g_gs_device->CreateTexture(
scaled_copy_size.x, scaled_copy_size.y, 1, src_target->m_texture->GetFormat(), true);
g_gs_device->CopyRect(
src_target->m_texture, src_copy, scaled_copy_range, scaled_copy_dst_offset.x, scaled_copy_dst_offset.y);
m_conf.tex = src_copy;
}
bool GSRendererHW::CanUseTexIsFB(const GSTextureCache::Target* rt, const GSTextureCache::Source* tex,
const TextureMinMaxResult& tmm)
{
// Minimum blending or no barriers -> we can't use tex-is-fb.
if (GSConfig.AccurateBlendingUnit == AccBlendLevel::Minimum || !g_gs_device->Features().texture_barrier)
{
GL_CACHE("Can't use tex-is-fb due to no barriers.");
return false;
}
// If we're a shuffle, tex-is-fb is always fine.
if (m_texture_shuffle || m_channel_shuffle)
{
GL_CACHE("Activating tex-is-fb for %s shuffle.", m_texture_shuffle ? "texture" : "channel");
return true;
}
static constexpr auto check_clamp = [](u32 clamp, u32 min, u32 max, s32 tmin, s32 tmax) {
if (clamp == CLAMP_REGION_CLAMP)
{
if (tmin < static_cast<s32>(min) || tmax > static_cast<s32>(max + 1))
{
GL_CACHE("Can't use tex-is-fb because of REGION_CLAMP [%d, %d] with TMM of [%d, %d]", min, max, tmin, tmax);
return false;
}
}
else if (clamp == CLAMP_REGION_REPEAT)
{
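// REGION_REPEAT applies u = (u & MINU) | MAXU, so MINU must preserve every bit that the
// draw's coverage (req_tbits) actually addresses, otherwise sampling would wrap away
// from the rendered area and tex-is-fb would read the wrong texels.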
const u32 req_tbits = (tmax > 1) ? (std::bit_ceil(static_cast<u32>(tmax - 1)) - 1) : 0x1;
if ((min & req_tbits) != req_tbits)
{
GL_CACHE("Can't use tex-is-fb because of REGION_REPEAT [%d, %d] with TMM of [%d, %d] and tbits of %d",
min, max, tmin, tmax, req_tbits);
return false;
}
}
return true;
};
if (!check_clamp(
m_cached_ctx.CLAMP.WMS, m_cached_ctx.CLAMP.MINU, m_cached_ctx.CLAMP.MAXU, tmm.coverage.x, tmm.coverage.z) ||
!check_clamp(
m_cached_ctx.CLAMP.WMT, m_cached_ctx.CLAMP.MINV, m_cached_ctx.CLAMP.MAXV, tmm.coverage.y, tmm.coverage.w))
{
return false;
}
// Texture is actually the framebuffer. Stencil emulation to compute shadows (Jak series/tri-Ace games).
// Will hit the "m_ps_sel.tex_is_fb = 1" path in the draw.
const bool is_quads = (m_vt.m_primclass == GS_SPRITE_CLASS || m_prim_overlap == PRIM_OVERLAP_NO);
if (is_quads)
{
// No bilinear for tex-is-fb.
if (m_vt.IsLinear())
{
GL_CACHE("Can't use tex-is-fb due to bilinear sampling.");
return false;
}
// Can't do tex-is-fb if paletted and we're not a shuffle (C32 -> P8).
// This one shouldn't happen anymore, because all conversion should be done already.
const GSLocalMemory::psm_t& tex_psm = GSLocalMemory::m_psm[tex->m_TEX0.PSM];
const GSLocalMemory::psm_t& rt_psm = GSLocalMemory::m_psm[rt->m_TEX0.PSM];
if (tex_psm.pal > 0 && tex_psm.bpp < rt_psm.bpp)
{
Console.Error("Draw %d: Can't use tex-is-fb due to palette conversion", s_n);
return true;
}
// Make sure that we're not sampling away from the area we're rendering.
// We need to take the absolute here, because Beyond Good and Evil undithers itself using a -1,-1 offset.
const GSVector4 diff(m_vt.m_min.p.upld(m_vt.m_max.p) - m_vt.m_min.t.upld(m_vt.m_max.t));
GL_CACHE("Coord diff: %f,%f", diff.x, diff.y);
if ((diff.abs() < GSVector4(1.0f)).alltrue())
{
GL_CACHE("Sampling from rendered texel, using tex-is-fb.");
return true;
}
GL_CACHE("Coord diff too large, not using tex-is-fb.");
return false;
}
if (m_vt.m_primclass == GS_TRIANGLE_CLASS)
{
// This pattern is used by several games to emulate a stencil (shadow)
// Ratchet & Clank, Jak do alpha integer multiplication (tfx) which is mostly equivalent to +1/-1
// Tri-Ace (Star Ocean 3/RadiataStories/VP2) uses a palette to handle the +1/-1
if (m_cached_ctx.FRAME.FBMSK == 0x00FFFFFF)
{
GL_CACHE("Tex-is-fb hack for Jak");
return true;
}
GL_CACHE("Triangle draw, not using tex-is-fb");
return false;
}
return false;
}
void GSRendererHW::EmulateATST(float& AREF, GSHWDrawConfig::PSSelector& ps, bool pass_2)
{
static const u32 inverted_atst[] = {ATST_ALWAYS, ATST_NEVER, ATST_GEQUAL, ATST_GREATER, ATST_NOTEQUAL, ATST_LESS, ATST_LEQUAL, ATST_EQUAL};
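// Each entry is the logical complement of the corresponding alpha test (LESS <-> GEQUAL,
// EQUAL <-> NOTEQUAL, ...), so the second ATE pass covers exactly the pixels the first pass rejected.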
if (!m_cached_ctx.TEST.ATE)
return;
// Check for pass 2, otherwise do pass 1.
const int atst = pass_2 ? inverted_atst[m_cached_ctx.TEST.ATST] : m_cached_ctx.TEST.ATST;
const float aref = static_cast<float>(m_cached_ctx.TEST.AREF);
switch (atst)
{
case ATST_LESS:
AREF = aref - 0.1f;
ps.atst = 1;
break;
case ATST_LEQUAL:
AREF = aref - 0.1f + 1.0f;
ps.atst = 1;
break;
case ATST_GEQUAL:
AREF = aref - 0.1f;
ps.atst = 2;
break;
case ATST_GREATER:
AREF = aref - 0.1f + 1.0f;
ps.atst = 2;
break;
case ATST_EQUAL:
AREF = aref;
ps.atst = 3;
break;
case ATST_NOTEQUAL:
AREF = aref;
ps.atst = 4;
break;
case ATST_NEVER: // Draw won't be done so no need to implement it in shader
case ATST_ALWAYS:
default:
ps.atst = 0;
break;
}
}
void GSRendererHW::CleanupDraw(bool invalidate_temp_src)
{
// Remove any RT source.
if (invalidate_temp_src)
g_texture_cache->InvalidateTemporarySource();
// Restore offsets.
if ((m_context->FRAME.U32[0] ^ m_cached_ctx.FRAME.U32[0]) & 0x3f3f01ff)
m_context->offset.fb = m_mem.GetOffset(m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM);
if ((m_context->ZBUF.U32[0] ^ m_cached_ctx.ZBUF.U32[0]) & 0x3f0001ff)
m_context->offset.zb = m_mem.GetOffset(m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM);
}
void GSRendererHW::ResetStates()
{
// We don't want to zero out the constant buffers, since fields used by the current draw could result in redundant uploads.
// This memset should be pretty efficient - the struct is 16 byte aligned, as is the cb_vs offset.
memset(&m_conf, 0, reinterpret_cast<const char*>(&m_conf.cb_vs) - reinterpret_cast<const char*>(&m_conf));
}
__ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Target* ds, GSTextureCache::Source* tex, const TextureMinMaxResult& tmm)
{
#ifdef ENABLE_OGL_DEBUG
const GSVector4i area_out = GSVector4i(m_vt.m_min.p.upld(m_vt.m_max.p)).rintersect(m_context->scissor.in);
const GSVector4i area_in = GSVector4i(m_vt.m_min.t.upld(m_vt.m_max.t));
GL_PUSH("GL Draw from (area %d,%d => %d,%d) in (area %d,%d => %d,%d)",
area_in.x, area_in.y, area_in.z, area_in.w,
area_out.x, area_out.y, area_out.z, area_out.w);
#endif
const GSDrawingEnvironment& env = *m_draw_env;
bool DATE = m_cached_ctx.TEST.DATE && m_cached_ctx.FRAME.PSM != PSMCT24;
bool DATE_PRIMID = false;
bool DATE_BARRIER = false;
bool DATE_one = false;
const bool ate_first_pass = m_cached_ctx.TEST.DoFirstPass();
const bool ate_second_pass = m_cached_ctx.TEST.DoSecondPass();
ResetStates();
const float scale_factor = rt ? rt->GetScale() : ds->GetScale();
m_conf.cb_vs.texture_offset = {};
m_conf.cb_ps.ScaleFactor = GSVector4(scale_factor * (1.0f / 16.0f), 1.0f / scale_factor, scale_factor, 0.0f);
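// ScaleFactor.x folds the 12.4 fixed-point to pixel conversion (1/16) into the upscale
// factor; .y and .z carry the reciprocal scale and the plain scale for the pixel shader.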
m_conf.ps.scanmsk = env.SCANMSK.MSK;
m_conf.rt = rt ? rt->m_texture : nullptr;
m_conf.ds = ds ? ds->m_texture : nullptr;
// Z setup has to come before channel shuffle
EmulateZbuffer(ds);
// HLE implementation of the channel selection effect
//
// Warning: it must be done at the beginning because it will change the
// vertex list (it will interact with PrimitiveOverlap and accurate
// blending)
if (m_channel_shuffle && tex && tex->m_from_target)
EmulateChannelShuffle(tex->m_from_target, false);
// Upscaling hack to avoid various line/grid issues
MergeSprite(tex);
m_prim_overlap = PrimitiveOverlap();
EmulateTextureShuffleAndFbmask(rt, tex);
const GSDevice::FeatureSupport features = g_gs_device->Features();
// Blend
int blend_alpha_min = 0, blend_alpha_max = 255;
if (rt)
{
blend_alpha_min = rt->m_alpha_min;
blend_alpha_max = rt->m_alpha_max;
const bool is_24_bit = (GSLocalMemory::m_psm[rt->m_TEX0.PSM].trbpp == 24);
if (is_24_bit)
{
// C24/Z24 - alpha is 1.
blend_alpha_min = 128;
blend_alpha_max = 128;
}
if (!m_channel_shuffle && !m_texture_shuffle)
{
const int fba_value = m_prev_env.CTXT[m_prev_env.PRIM.CTXT].FBA.FBA * 128;
if ((m_cached_ctx.FRAME.FBMSK & 0xff000000) == 0)
{
if (rt->m_valid.rintersect(m_r).eq(rt->m_valid) && PrimitiveCoversWithoutGaps() && !(m_cached_ctx.TEST.DATE || m_cached_ctx.TEST.ATE || m_cached_ctx.TEST.ZTST != ZTST_ALWAYS))
{
rt->m_alpha_max = GetAlphaMinMax().max | fba_value;
rt->m_alpha_min = GetAlphaMinMax().min | fba_value;
}
else
{
rt->m_alpha_max = std::max(GetAlphaMinMax().max | fba_value, rt->m_alpha_max);
rt->m_alpha_min = std::min(GetAlphaMinMax().min | fba_value, rt->m_alpha_min);
}
}
else if ((m_cached_ctx.FRAME.FBMSK & 0xff000000) != 0xff000000) // We can't be sure of the alpha if it's partially masked.
{
rt->m_alpha_max |= std::max(GetAlphaMinMax().max | fba_value, rt->m_alpha_max);
rt->m_alpha_min = std::min(GetAlphaMinMax().min | fba_value, rt->m_alpha_min);
}
else if (!is_24_bit)
{
// If both are zero then we probably don't know what the alpha is.
if (rt->m_alpha_max == 0 && rt->m_alpha_min == 0)
{
rt->m_alpha_max = 255;
rt->m_alpha_min = 0;
}
}
}
else if ((m_texture_shuffle && m_conf.ps.write_rg == false) || m_channel_shuffle)
{
rt->m_alpha_max = 255;
rt->m_alpha_min = 0;
}
GL_INS("RT Alpha Range: %d-%d => %d-%d", blend_alpha_min, blend_alpha_max, rt->m_alpha_min, rt->m_alpha_max);
// If there's no overlap, the values in the RT before FB write will be the old values.
if (m_prim_overlap != PRIM_OVERLAP_NO)
{
// Otherwise, it may be a mix of the old/new values.
blend_alpha_min = std::min(blend_alpha_min, rt->m_alpha_min);
blend_alpha_max = std::max(blend_alpha_max, rt->m_alpha_max);
}
if (!rt->m_32_bits_fmt)
{
rt->m_alpha_max &= 128;
rt->m_alpha_min &= 128;
}
}
// DATE: selection of the algorithm. Must be done before blending because GL42 is not compatible with blending
if (DATE)
{
if (m_cached_ctx.TEST.DATM)
{
if (rt)
{
// Destination and incoming pixels are all 1 or higher, no need for DATE.
if ((rt->m_alpha_min >= 128 || (m_cached_ctx.FRAME.FBMSK & 0x80000000)) && blend_alpha_min >= 128)
{
DATE = false;
m_cached_ctx.TEST.DATE = false;
}
else if (blend_alpha_max < 128) // All dest pixels are less than 1, everything fails.
{
rt->m_alpha_max = blend_alpha_max;
rt->m_alpha_min = blend_alpha_min;
return;
}
}
}
else
{
if (rt)
{
// Destination and incoming pixels are all less than 1, no need for DATE.
if ((rt->m_alpha_max < 128 || (m_cached_ctx.FRAME.FBMSK & 0x80000000)) && blend_alpha_max < 128)
{
DATE = false;
m_cached_ctx.TEST.DATE = false;
}
else if (blend_alpha_min >= 128) // All dest pixels are 1 or higher, everything fails.
{
rt->m_alpha_max = blend_alpha_max;
rt->m_alpha_min = blend_alpha_min;
return;
}
}
}
if (DATE)
{
// It is way too complex to emulate texture shuffle with DATE, so use accurate path.
// The no-overlap case should only be triggered on GL/Vulkan, as they support DATE_BARRIER.
if (features.framebuffer_fetch)
{
// Full DATE is "free" with framebuffer fetch. The barrier gets cleared below.
DATE_BARRIER = true;
m_conf.require_full_barrier = true;
}
else if ((features.texture_barrier && m_prim_overlap == PRIM_OVERLAP_NO) || m_texture_shuffle)
{
GL_PERF("DATE: Accurate with %s", (features.texture_barrier && m_prim_overlap == PRIM_OVERLAP_NO) ? "no overlap" : "texture shuffle");
if (features.texture_barrier)
{
m_conf.require_full_barrier = true;
DATE_BARRIER = true;
}
}
// When blending is disabled and edge anti-aliasing is enabled,
// the output alpha is coverage (which we force to 128), so DATE is guaranteed to fail/pass on the second pass.
else if (m_conf.colormask.wa && (m_context->FBA.FBA || IsCoverageAlpha()) && features.stencil_buffer)
{
GL_PERF("DATE: Fast with FBA, all pixels will be >= 128");
DATE_one = !m_cached_ctx.TEST.DATM;
}
else if (m_conf.colormask.wa && !m_cached_ctx.TEST.ATE && !(m_cached_ctx.FRAME.FBMSK & 0x80000000))
{
// Performance note: check alpha range with GetAlphaMinMax()
// Note: all my dumps are already above 120fps, but it seems to reduce GPU load
// with big upscaling
if (m_cached_ctx.TEST.DATM && GetAlphaMinMax().max < 128 && features.stencil_buffer)
{
// Only first pixel (write 0) will pass (alpha is 1)
GL_PERF("DATE: Fast with alpha %d-%d", GetAlphaMinMax().min, GetAlphaMinMax().max);
DATE_one = true;
}
else if (!m_cached_ctx.TEST.DATM && GetAlphaMinMax().min >= 128 && features.stencil_buffer)
{
// Only first pixel (write 1) will pass (alpha is 0)
GL_PERF("DATE: Fast with alpha %d-%d", GetAlphaMinMax().min, GetAlphaMinMax().max);
DATE_one = true;
}
else if (features.texture_barrier && ((m_vt.m_primclass == GS_SPRITE_CLASS && m_drawlist.size() < 10) || (m_index.tail < 30)))
{
// The texture barrier will split the draw call into n draw calls. It is very efficient for
// draws with few primitives. Otherwise it sucks.
GL_PERF("DATE: Accurate with alpha %d-%d", GetAlphaMinMax().min, GetAlphaMinMax().max);
m_conf.require_full_barrier = true;
DATE_BARRIER = true;
}
else if (features.primitive_id)
{
GL_PERF("DATE: Accurate with alpha %d-%d", GetAlphaMinMax().min, GetAlphaMinMax().max);
DATE_PRIMID = true;
}
else if (features.texture_barrier)
{
GL_PERF("DATE: Accurate with alpha %d-%d", GetAlphaMinMax().min, GetAlphaMinMax().max);
m_conf.require_full_barrier = true;
DATE_BARRIER = true;
}
else if (features.stencil_buffer)
{
// Might be inaccurate in some cases but we shouldn't hit this path.
GL_PERF("DATE: Fast with alpha %d-%d", GetAlphaMinMax().min, GetAlphaMinMax().max);
DATE_one = true;
}
}
else if (!m_conf.colormask.wa && !m_cached_ctx.TEST.ATE)
{
GL_PERF("DATE: Accurate with no alpha write");
if (g_gs_device->Features().texture_barrier)
{
m_conf.require_one_barrier = true;
DATE_BARRIER = true;
}
}
// Will save my life !
ASSERT(!(DATE_BARRIER && DATE_one));
ASSERT(!(DATE_PRIMID && DATE_one));
ASSERT(!(DATE_PRIMID && DATE_BARRIER));
}
}
// Dither must be set up before EmulateBlending, as it is used there.
m_conf.ps.dither = GSConfig.Dithering > 0 && m_conf.ps.dfmt == 2 && env.DTHE.DTHE;
if (m_conf.ps.dfmt == 1)
{
// Disable writing of the alpha channel
m_conf.colormask.wa = 0;
}
// Not gonna spend too much time with this, it's not likely to be used much, can't be less accurate than it was.
if (ds)
{
ds->m_alpha_max = std::max(ds->m_alpha_max, static_cast<int>(m_vt.m_max.p.z) >> 24);
ds->m_alpha_min = std::min(ds->m_alpha_min, static_cast<int>(m_vt.m_min.p.z) >> 24);
GL_INS("New DS Alpha Range: %d-%d", ds->m_alpha_min, ds->m_alpha_max);
if (GSLocalMemory::m_psm[ds->m_TEX0.PSM].bpp == 16)
{
ds->m_alpha_max &= 128;
ds->m_alpha_min &= 128;
}
}
bool blending_alpha_pass = false;
if ((!IsOpaque() || m_context->ALPHA.IsBlack()) && rt && (m_conf.colormask.wrgba & 0x7))
{
EmulateBlending(blend_alpha_min, blend_alpha_max, DATE_PRIMID, DATE_BARRIER, blending_alpha_pass);
}
else
{
m_conf.blend = {}; // No blending please
m_conf.ps.no_color1 = true;
}
// No point outputting colours if we're just writing depth.
// We might still need the framebuffer for DATE, though.
if (!rt || m_conf.colormask.wrgba == 0)
{
m_conf.ps.DisableColorOutput();
m_conf.colormask.wrgba = 0;
}
if (m_conf.ps.scanmsk & 2)
DATE_PRIMID = false; // so that discard in the shader works correctly
// DATE setup, no DATE_BARRIER please
if (!DATE)
m_conf.destination_alpha = GSHWDrawConfig::DestinationAlphaMode::Off;
else if (DATE_one)
m_conf.destination_alpha = GSHWDrawConfig::DestinationAlphaMode::StencilOne;
else if (DATE_PRIMID)
m_conf.destination_alpha = GSHWDrawConfig::DestinationAlphaMode::PrimIDTracking;
else if (DATE_BARRIER)
m_conf.destination_alpha = GSHWDrawConfig::DestinationAlphaMode::Full;
else if (features.stencil_buffer)
m_conf.destination_alpha = GSHWDrawConfig::DestinationAlphaMode::Stencil;
m_conf.datm = m_cached_ctx.TEST.DATM;
// If we're doing stencil DATE and we don't have a depth buffer, we need to allocate a temporary one.
GSTexture* temp_ds = nullptr;
if (m_conf.destination_alpha >= GSHWDrawConfig::DestinationAlphaMode::Stencil &&
m_conf.destination_alpha <= GSHWDrawConfig::DestinationAlphaMode::StencilOne && !m_conf.ds)
{
temp_ds = g_gs_device->CreateDepthStencil(m_conf.rt->GetWidth(), m_conf.rt->GetHeight(), GSTexture::Format::DepthStencil, false);
m_conf.ds = temp_ds;
}
// vs
m_conf.vs.tme = PRIM->TME;
m_conf.vs.fst = PRIM->FST;
// FIXME D3D11 and GL support half pixel center. Code could be easier!!!
const GSVector2i rtsize = m_conf.ds ? m_conf.ds->GetSize() : m_conf.rt->GetSize();
const float rtscale = (ds ? ds->GetScale() : rt->GetScale());
const float sx = 2.0f * rtscale / (rtsize.x << 4);
const float sy = 2.0f * rtscale / (rtsize.y << 4);
const float ox = static_cast<float>(static_cast<int>(m_context->XYOFFSET.OFX));
const float oy = static_cast<float>(static_cast<int>(m_context->XYOFFSET.OFY));
float ox2 = -1.0f / rtsize.x;
float oy2 = -1.0f / rtsize.y;
float mod_xy = 0.0f;
// This hack subtracts around half a pixel from OFX and OFY.
//
// The resulting shifted output aligns better with common blending / corona / blurring effects,
// but introduces a few bad pixels on the edges.
if (!rt)
{
mod_xy = GetModXYOffset();
}
else
mod_xy = rt->OffsetHack_modxy;
if (mod_xy > 1.0f)
{
ox2 *= mod_xy;
oy2 *= mod_xy;
}
m_conf.cb_vs.vertex_scale = GSVector2(sx, sy);
m_conf.cb_vs.vertex_offset = GSVector2(ox * sx + ox2 + 1, oy * sy + oy2 + 1);
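// Illustrative numbers (assuming the generated vertex shader computes ndc = pos * vertex_scale - vertex_offset):
// for a 640-wide target at 1.0 scale, sx = 2 / (640 << 4) ~= 1.95e-4, so a vertex at X = OFX + (320 << 4)
// (the horizontal centre, in 1/16-pixel units) lands at ndc.x ~= 0.0 plus the half-pixel bias -ox2 = 1/640.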
// END of FIXME
// GS_SPRITE_CLASS primitives are already flat shaded (either by the CPU or the GS)
m_conf.ps.iip = (m_vt.m_primclass == GS_SPRITE_CLASS) ? 0 : PRIM->IIP;
m_conf.vs.iip = m_conf.ps.iip;
if (DATE_BARRIER)
{
m_conf.ps.date = 5 + m_cached_ctx.TEST.DATM;
}
else if (DATE_one)
{
if (features.texture_barrier)
{
m_conf.require_one_barrier = true;
m_conf.ps.date = 5 + m_cached_ctx.TEST.DATM;
}
m_conf.depth.date = 1;
m_conf.depth.date_one = 1;
}
else if (DATE_PRIMID)
{
m_conf.ps.date = 1 + m_cached_ctx.TEST.DATM;
}
else if (DATE)
{
m_conf.depth.date = 1;
}
m_conf.ps.fba = m_context->FBA.FBA;
if (m_conf.ps.dither)
{
const GIFRegDIMX& DIMX = m_draw_env->DIMX;
GL_DBG("DITHERING mode ENABLED (%d)", GSConfig.Dithering);
m_conf.ps.dither = GSConfig.Dithering;
m_conf.cb_ps.DitherMatrix[0] = GSVector4(DIMX.DM00, DIMX.DM01, DIMX.DM02, DIMX.DM03);
m_conf.cb_ps.DitherMatrix[1] = GSVector4(DIMX.DM10, DIMX.DM11, DIMX.DM12, DIMX.DM13);
m_conf.cb_ps.DitherMatrix[2] = GSVector4(DIMX.DM20, DIMX.DM21, DIMX.DM22, DIMX.DM23);
m_conf.cb_ps.DitherMatrix[3] = GSVector4(DIMX.DM30, DIMX.DM31, DIMX.DM32, DIMX.DM33);
}
if (PRIM->FGE)
{
m_conf.ps.fog = 1;
const GSVector4 fc = GSVector4::rgba32(m_draw_env->FOGCOL.U32[0]);
// Blend in AREF to avoid loading a random value for alpha (dirty cache)
m_conf.cb_ps.FogColor_AREF = fc.blend32<8>(m_conf.cb_ps.FogColor_AREF);
}
// Warning: must be done after EmulateZbuffer.
// The depth test outcome doesn't depend on draw order here, so it can be executed in 2 passes (unlike colour).
// The idea is to compute first the color, which is independent of the alpha test, and then do a 2nd
// pass to handle the depth based on the alpha test.
bool ate_RGBA_then_Z = false;
bool ate_RGB_then_ZA = false;
if (ate_first_pass && ate_second_pass)
{
GL_DBG("Complex Alpha Test");
const bool commutative_depth = (m_conf.depth.ztst == ZTST_GEQUAL && m_vt.m_eq.z) || (m_conf.depth.ztst == ZTST_ALWAYS);
const bool commutative_alpha = (m_context->ALPHA.C != 1); // when blend alpha is the source alpha or a constant (not destination alpha)
ate_RGBA_then_Z = m_cached_ctx.TEST.GetAFAIL(m_cached_ctx.FRAME.PSM) == AFAIL_FB_ONLY && commutative_depth;
ate_RGB_then_ZA = m_cached_ctx.TEST.GetAFAIL(m_cached_ctx.FRAME.PSM) == AFAIL_RGB_ONLY && commutative_depth && commutative_alpha;
}
if (ate_RGBA_then_Z)
{
GL_DBG("Alternate ATE handling: ate_RGBA_then_Z");
// Render all color but don't update depth
// ATE is disabled here
m_conf.depth.zwe = false;
}
else if (ate_RGB_then_ZA)
{
GL_DBG("Alternate ATE handling: ate_RGB_then_ZA");
// Render RGB color but don't update depth/alpha
// ATE is disabled here
m_conf.depth.zwe = false;
m_conf.colormask.wa = false;
}
else
{
float aref = m_conf.cb_ps.FogColor_AREF.a;
EmulateATST(aref, m_conf.ps, false);
// avoid redundant cbuffer updates
m_conf.cb_ps.FogColor_AREF.a = aref;
m_conf.alpha_second_pass.ps_aref = aref;
}
GSTexture* tex_copy = nullptr;
if (tex)
{
EmulateTextureSampler(rt, ds, tex, tmm, tex_copy);
}
else
{
m_conf.ps.tfx = 4;
}
if (features.framebuffer_fetch)
{
// Intel GPUs on Metal lock up if you try to use DSB and framebuffer fetch at once
// We should never need to do that (since using framebuffer fetch means you should be able to do all blending in shader), but sometimes it slips through
if (m_conf.require_one_barrier || m_conf.require_full_barrier)
ASSERT(!m_conf.blend.enable);
// Barriers aren't needed with fbfetch.
m_conf.require_one_barrier = false;
m_conf.require_full_barrier = false;
}
// Multi-pass algorithms shouldn't be needed with full barrier and backends may not handle this correctly
ASSERT(!m_conf.require_full_barrier || !m_conf.ps.hdr);
// Swap full barrier for one barrier when there's no overlap.
if (m_conf.require_full_barrier && m_prim_overlap == PRIM_OVERLAP_NO)
{
m_conf.require_full_barrier = false;
m_conf.require_one_barrier = true;
}
// rs
const GSVector4i hacked_scissor = m_channel_shuffle ? GSVector4i::cxpr(0, 0, 1024, 1024) : m_context->scissor.in;
const GSVector4i scissor(GSVector4i(GSVector4(rtscale) * GSVector4(hacked_scissor)).rintersect(GSVector4i::loadh(rtsize)));
m_conf.drawarea = m_channel_shuffle ? scissor : scissor.rintersect(ComputeBoundingBox(rtsize, rtscale));
m_conf.scissor = (DATE && !DATE_BARRIER) ? m_conf.drawarea : scissor;
SetupIA(rtscale, sx, sy);
m_conf.alpha_second_pass.enable = ate_second_pass;
if (ate_second_pass)
{
ASSERT(!env.PABE.PABE);
memcpy(&m_conf.alpha_second_pass.ps, &m_conf.ps, sizeof(m_conf.ps));
memcpy(&m_conf.alpha_second_pass.colormask, &m_conf.colormask, sizeof(m_conf.colormask));
memcpy(&m_conf.alpha_second_pass.depth, &m_conf.depth, sizeof(m_conf.depth));
if (ate_RGBA_then_Z || ate_RGB_then_ZA)
{
// Enable ATE as first pass to update the depth
// of pixels that passed the alpha test
EmulateATST(m_conf.alpha_second_pass.ps_aref, m_conf.alpha_second_pass.ps, false);
}
else
{
// second pass will process the pixels that failed
// the alpha test
EmulateATST(m_conf.alpha_second_pass.ps_aref, m_conf.alpha_second_pass.ps, true);
}
bool z = m_conf.depth.zwe;
bool r = m_conf.colormask.wr;
bool g = m_conf.colormask.wg;
bool b = m_conf.colormask.wb;
bool a = m_conf.colormask.wa;
const int fail_type = m_cached_ctx.TEST.GetAFAIL(m_cached_ctx.FRAME.PSM);
switch (fail_type)
{
case AFAIL_KEEP: z = r = g = b = a = false; break; // none
case AFAIL_FB_ONLY: z = false; break; // rgba
case AFAIL_ZB_ONLY: r = g = b = a = false; break; // z
case AFAIL_RGB_ONLY: z = a = false; break; // rgb
default: __assume(0);
}
// The second pass shouldn't write depth when depth writes are masked (ZMSK), and similarly it shouldn't
// write alpha when all of the alpha bits in the framebuffer are masked.
if (ate_RGBA_then_Z)
{
z = !m_cached_ctx.ZBUF.ZMSK;
r = g = b = a = false;
}
else if (ate_RGB_then_ZA)
{
z = !m_cached_ctx.ZBUF.ZMSK;
a = (m_cached_ctx.FRAME.FBMSK & 0xFF000000) != 0xFF000000;
r = g = b = false;
}
if (z || r || g || b || a)
{
m_conf.alpha_second_pass.depth.zwe = z;
m_conf.alpha_second_pass.colormask.wr = r;
m_conf.alpha_second_pass.colormask.wg = g;
m_conf.alpha_second_pass.colormask.wb = b;
m_conf.alpha_second_pass.colormask.wa = a;
if (m_conf.alpha_second_pass.colormask.wrgba == 0)
m_conf.alpha_second_pass.ps.DisableColorOutput();
}
else
{
m_conf.alpha_second_pass.enable = false;
}
}
if (!ate_first_pass)
{
if (!m_conf.alpha_second_pass.enable)
{
CleanupDraw(true);
return;
}
// RenderHW always renders first pass, replace first pass with second
memcpy(&m_conf.ps, &m_conf.alpha_second_pass.ps, sizeof(m_conf.ps));
memcpy(&m_conf.colormask, &m_conf.alpha_second_pass.colormask, sizeof(m_conf.colormask));
memcpy(&m_conf.depth, &m_conf.alpha_second_pass.depth, sizeof(m_conf.depth));
m_conf.cb_ps.FogColor_AREF.a = m_conf.alpha_second_pass.ps_aref;
m_conf.alpha_second_pass.enable = false;
}
if (blending_alpha_pass)
{
// write alpha blend as the single alpha output
m_conf.ps.no_ablend = true;
// there's a case we can skip this: RGB_then_ZA alternate handling.
// but otherwise, we need to write alpha separately.
if (m_conf.colormask.wa)
{
m_conf.colormask.wa = false;
m_conf.separate_alpha_pass = true;
}
// do we need to do this for the failed alpha fragments?
if (m_conf.alpha_second_pass.enable)
{
// there's also a case we can skip here: when we're not writing RGB, there's
// no blending, so we can just write the normal alpha!
const u8 second_pass_wrgba = m_conf.alpha_second_pass.colormask.wrgba;
if ((second_pass_wrgba & (1 << 3)) != 0 && second_pass_wrgba != (1 << 3))
{
// this sucks. potentially up to 4 passes. but no way around it when we don't have dual-source blend.
m_conf.alpha_second_pass.ps.no_ablend = true;
m_conf.alpha_second_pass.colormask.wa = false;
m_conf.second_separate_alpha_pass = true;
}
}
}
m_conf.drawlist = (m_conf.require_full_barrier && m_vt.m_primclass == GS_SPRITE_CLASS) ? &m_drawlist : nullptr;
g_gs_device->RenderHW(m_conf);
if (tex_copy)
g_gs_device->Recycle(tex_copy);
if (temp_ds)
g_gs_device->Recycle(temp_ds);
}
// Returns true if the EE uploaded data covering this texture region (e.g. a new CLUT) just before this draw.
bool GSRendererHW::HasEEUpload(GSVector4i r)
{
for (auto iter = m_draw_transfers.begin(); iter != m_draw_transfers.end(); ++iter)
{
if (iter->draw == (s_n - 1) && iter->blit.DBP == m_cached_ctx.TEX0.TBP0 && GSUtil::HasSharedBits(iter->blit.DPSM, m_cached_ctx.TEX0.PSM))
{
GSVector4i rect = r;
if (!GSUtil::HasCompatibleBits(iter->blit.DPSM, m_cached_ctx.TEX0.PSM))
{
GSTextureCache::SurfaceOffsetKey sok;
sok.elems[0].bp = iter->blit.DBP;
sok.elems[0].bw = iter->blit.DBW;
sok.elems[0].psm = iter->blit.DPSM;
sok.elems[0].rect = iter->rect;
sok.elems[1].bp = m_cached_ctx.TEX0.TBP0;
sok.elems[1].bw = m_cached_ctx.TEX0.TBW;
sok.elems[1].psm = m_cached_ctx.TEX0.PSM;
sok.elems[1].rect = r;
rect = g_texture_cache->ComputeSurfaceOffset(sok).b2a_offset;
}
if (rect.rintersect(r).eq(r))
return true;
}
}
return false;
}
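// Heuristic check: is this draw (most likely) writing a CLUT into GS memory? If so, decide whether it should
// be run on the CPU (drawn into local memory) or kept on the GPU (when GPU target CLUT is enabled and doing it
// on the CPU would force a readback). Any failed check falls back to treating it as a normal draw.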
GSRendererHW::CLUTDrawTestResult GSRendererHW::PossibleCLUTDraw()
{
// No shuffles.
if (m_channel_shuffle || m_texture_shuffle)
return CLUTDrawTestResult::NotCLUTDraw;
// Keep the draws simple, no alpha testing, blending, mipmapping, Z writes, and make sure it's flat.
const bool fb_only = m_cached_ctx.TEST.ATE && m_cached_ctx.TEST.GetAFAIL(m_cached_ctx.FRAME.PSM) == AFAIL_FB_ONLY && m_cached_ctx.TEST.ATST == ATST_NEVER;
// No Z writes, unless it's points, then it's quite likely to be a palette and they left it on.
if (!m_cached_ctx.ZBUF.ZMSK && !fb_only && !(m_vt.m_primclass == GS_POINT_CLASS))
return CLUTDrawTestResult::NotCLUTDraw;
// Make sure it's flat.
if (m_vt.m_eq.z != 0x1)
return CLUTDrawTestResult::NotCLUTDraw;
// No mipmapping, please never be any mipmapping...
if (m_context->TEX1.MXL)
return CLUTDrawTestResult::NotCLUTDraw;
// Writing to the framebuffer that is currently being displayed. We're not interested. Note: this stops the NFS HP2 "Busted" screens from working, but they're glitchy anyway.
// What NFS HP2 really needs is a kind of shuffle with mask, where the 32-bit target is interpreted as 16-bit and masked.
if ((m_regs->DISP[0].DISPFB.Block() == m_cached_ctx.FRAME.Block()) || (m_regs->DISP[1].DISPFB.Block() == m_cached_ctx.FRAME.Block()) ||
(PRIM->TME && ((m_regs->DISP[0].DISPFB.Block() == m_cached_ctx.TEX0.TBP0) || (m_regs->DISP[1].DISPFB.Block() == m_cached_ctx.TEX0.TBP0)) && !(m_mem.m_clut.IsInvalid() & 2)))
return CLUTDrawTestResult::NotCLUTDraw;
// Ignore large render targets, make sure it's staying in page width.
if (PRIM->TME && (m_cached_ctx.FRAME.FBW != 1 && m_cached_ctx.TEX0.TBW == m_cached_ctx.FRAME.FBW))
return CLUTDrawTestResult::NotCLUTDraw;
// Hopefully no games draw a CLUT with a CLUT, that would be evil, most likely a channel shuffle.
if (PRIM->TME && GSLocalMemory::m_psm[m_cached_ctx.TEX0.PSM].pal > 0)
return CLUTDrawTestResult::NotCLUTDraw;
const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM];
// Make sure the CLUT formats are matching.
if (GSLocalMemory::m_psm[m_mem.m_clut.GetCLUTCPSM()].bpp != psm.bpp)
return CLUTDrawTestResult::NotCLUTDraw;
// Max size for a CLUT/Current page size.
constexpr float min_clut_width = 7.0f;
constexpr float min_clut_height = 1.0f;
const float page_width = static_cast<float>(psm.pgs.x);
const float page_height = static_cast<float>(psm.pgs.y);
// If the coordinates aren't starting within the page, it's likely not a CLUT draw.
if (floor(m_vt.m_min.p.x) < 0 || floor(m_vt.m_min.p.y) < 0 || floor(m_vt.m_min.p.x) > page_width || floor(m_vt.m_min.p.y) > page_height)
return CLUTDrawTestResult::NotCLUTDraw;
// Make sure the width is one of the expected CLUT widths (8/16/32/64) to avoid bad draws. Points will go from 0-7 inclusive, but sprites etc will do 0-16 exclusive.
bool draw_divider_match = false;
const int valid_sizes[] = {8, 16, 32, 64};
for (int i = 0; i < 4; i++)
{
draw_divider_match = ((m_vt.m_primclass == GS_POINT_CLASS) ? ((static_cast<int>(m_vt.m_max.p.x + 1) & ~1) == valid_sizes[i]) : (static_cast<int>(m_vt.m_max.p.x) == valid_sizes[i]));
if (draw_divider_match)
break;
}
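// Illustrative example (assuming a PSMCT32 frame, whose page is 64x32): a 16x2 sprite draw passes the checks
// above and below, since width 16 is in valid_sizes, and 16 < 64 with 2 <= 32 keeps it within a single page.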
// Make sure it's kinda CLUT sized, at least. Be wary, it can draw a line at a time (Guitar Hero - Metallica)
// Driver Parallel Lines draws a bunch of CLUT's at once, ending up as a 64x256 draw, very annoying.
const float draw_width = (m_vt.m_max.p.x - m_vt.m_min.p.x);
const float draw_height = (m_vt.m_max.p.y - m_vt.m_min.p.y);
const bool valid_size = ((draw_width >= min_clut_width || draw_height >= min_clut_height))
&& (((draw_width < page_width && draw_height <= page_height) || (draw_width == page_width)) && draw_divider_match); // Make sure the draw is a multiple of 8 wide (AC5 misdetection).
// Make sure the draw hits the next CLUT and it's marked as invalid (kind of a sanity check).
// We can also allow draws which are of a sensible size within the page, as they could also be CLUT draws (or gradients for the CLUT).
if (!valid_size)
return CLUTDrawTestResult::NotCLUTDraw;
if (PRIM->TME)
{
// If we're using a texture to draw our CLUT/whatever, we need the GPU to write back any dirty data we're about to read.
const GSVector4i r = GetTextureMinMax(m_cached_ctx.TEX0, m_cached_ctx.CLAMP, m_vt.IsLinear(), false).coverage;
// If we have GPU CLUT enabled, don't do a CPU draw when it would result in a download.
if (GSConfig.UserHacks_GPUTargetCLUTMode != GSGPUTargetCLUTMode::Disabled)
{
if (HasEEUpload(r))
return CLUTDrawTestResult::CLUTDrawOnCPU;
const GSTextureCache::Target* tgt = g_texture_cache->FindOverlappingTarget(
m_cached_ctx.TEX0.TBP0, m_cached_ctx.TEX0.TBW, m_cached_ctx.TEX0.PSM, r);
if (tgt)
{
bool is_dirty = false;
for (const GSDirtyRect& rc : tgt->m_dirty)
{
if (!rc.GetDirtyRect(m_cached_ctx.TEX0).rintersect(r).rempty())
{
is_dirty = true;
break;
}
}
if (!is_dirty)
{
GL_INS("GPU clut is enabled and this draw would readback, leaving on GPU");
return CLUTDrawTestResult::CLUTDrawOnGPU;
}
}
}
else
{
if (HasEEUpload(r))
return CLUTDrawTestResult::CLUTDrawOnCPU;
}
GIFRegBITBLTBUF BITBLTBUF = {};
BITBLTBUF.SBP = m_cached_ctx.TEX0.TBP0;
BITBLTBUF.SBW = m_cached_ctx.TEX0.TBW;
BITBLTBUF.SPSM = m_cached_ctx.TEX0.PSM;
InvalidateLocalMem(BITBLTBUF, r);
}
// Debugging stuff..
//const u32 startbp = psm.info.bn(m_vt.m_min.p.x, m_vt.m_min.p.y, m_FRAME.Block(), m_FRAME.FBW);
//const u32 endbp = psm.info.bn(m_vt.m_max.p.x, m_vt.m_max.p.y, m_FRAME.Block(), m_FRAME.FBW);
//DevCon.Warning("Draw width %f height %f page width %f height %f TPSM %x TBP0 %x FPSM %x FBP %x CBP %x valid size %d Invalid %d DISPFB0 %x DISPFB1 %x start %x end %x draw %d", draw_width, draw_height, page_width, page_height, m_cached_ctx.TEX0.PSM, m_cached_ctx.TEX0.TBP0, m_FRAME.PSM, m_FRAME.Block(), m_mem.m_clut.GetCLUTCBP(), valid_size, m_mem.m_clut.IsInvalid(), m_regs->DISP[0].DISPFB.Block(), m_regs->DISP[1].DISPFB.Block(), startbp, endbp, s_n);
return CLUTDrawTestResult::CLUTDrawOnCPU;
}
// Slightly more aggressive version that basically YOLO's it if the draw is anywhere near the CLUT or is a point/line draw (providing it's not too wide a draw, plus a few other parameters).
// This is pretty much tuned for the Sega Model 2 games, which draw a huge gradient, then pick lines out of it to make up CLUTs for about 4000 draws...
GSRendererHW::CLUTDrawTestResult GSRendererHW::PossibleCLUTDrawAggressive()
{
// Avoid any shuffles.
if (m_channel_shuffle || m_texture_shuffle)
return CLUTDrawTestResult::NotCLUTDraw;
// Keep the draws simple, no alpha testing, blending, mipmapping, Z writes, and make sure it's flat.
if (m_cached_ctx.TEST.ATE)
return CLUTDrawTestResult::NotCLUTDraw;
if (PRIM->ABE)
return CLUTDrawTestResult::NotCLUTDraw;
if (m_context->TEX1.MXL)
return CLUTDrawTestResult::NotCLUTDraw;
if (m_cached_ctx.FRAME.FBW != 1)
return CLUTDrawTestResult::NotCLUTDraw;
if (!m_cached_ctx.ZBUF.ZMSK)
return CLUTDrawTestResult::NotCLUTDraw;
if (m_vt.m_eq.z != 0x1)
return CLUTDrawTestResult::NotCLUTDraw;
if (!((m_vt.m_primclass == GS_POINT_CLASS || m_vt.m_primclass == GS_LINE_CLASS) || ((m_mem.m_clut.GetCLUTCBP() >> 5) >= m_cached_ctx.FRAME.FBP && (m_cached_ctx.FRAME.FBP + 1U) >= (m_mem.m_clut.GetCLUTCBP() >> 5) && m_vt.m_primclass == GS_SPRITE_CLASS)))
return CLUTDrawTestResult::NotCLUTDraw;
// Avoid invalidating anything here, we just want to avoid the thing being drawn on the GPU.
return CLUTDrawTestResult::CLUTDrawOnCPU;
}
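// Decide whether this draw can go through the software prim-render path (the CPU Sprite Render hack): narrow
// sprite draws with no depth buffer, whose texture (if any) isn't sourced from a live GPU render target, can
// be rasterised on the CPU directly into GS memory.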
bool GSRendererHW::CanUseSwPrimRender(bool no_rt, bool no_ds, bool draw_sprite_tex)
{
// Master enable.
const int bw = GSConfig.UserHacks_CPUSpriteRenderBW;
const int level = GSConfig.UserHacks_CPUSpriteRenderLevel;
if (bw == 0)
return false;
// We don't ever want to do this when we have a depth buffer, and only for textured sprites.
if (no_rt || !no_ds || (level == 0 && !draw_sprite_tex))
return false;
// Check the size threshold. Spider-man 2 uses a FBW of 32 for some silly reason...
if (m_cached_ctx.FRAME.FBW > static_cast<u32>(bw) && m_cached_ctx.FRAME.FBW != 32)
return false;
// We shouldn't be using mipmapping, and this shouldn't be a blended draw.
if (level < 2 && (IsMipMapActive() || !IsOpaque()))
return false;
// Make sure this isn't something we've actually rendered to (e.g. a texture shuffle).
if (PRIM->TME)
{
GSTextureCache::Target* src_target = g_texture_cache->GetTargetWithSharedBits(m_cached_ctx.TEX0.TBP0, m_cached_ctx.TEX0.PSM);
if (src_target)
{
// If the EE has written over our sample area, we're fine to do this on the CPU, despite the target.
if (!src_target->m_dirty.empty())
{
const GSVector4i tr(GetTextureMinMax(m_cached_ctx.TEX0, m_cached_ctx.CLAMP, m_vt.IsLinear(), false).coverage);
for (GSDirtyRect& rc : src_target->m_dirty)
{
if (!rc.GetDirtyRect(m_cached_ctx.TEX0).rintersect(tr).rempty())
return true;
}
}
return false;
}
}
// We can use the sw prim render path!
return true;
}
void GSRendererHW::SetNewFRAME(u32 bp, u32 bw, u32 psm)
{
m_cached_ctx.FRAME.FBP = bp >> 5;
m_cached_ctx.FRAME.FBW = bw;
m_cached_ctx.FRAME.PSM = psm;
m_context->offset.fb = m_mem.GetOffset(bp, bw, psm);
}
void GSRendererHW::SetNewZBUF(u32 bp, u32 psm)
{
m_cached_ctx.ZBUF.ZBP = bp >> 5;
m_cached_ctx.ZBUF.PSM = psm;
m_context->offset.zb = m_mem.GetOffset(bp, m_cached_ctx.FRAME.FBW, psm);
}
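// Detect the "striped double clear" pattern: FRAME and ZBUF point at (nearly) the same memory with different
// swizzles, and the game clears both by drawing alternating column strips through colour and depth. When
// detected, the draw is rewritten as a single sprite covering the full rect and Z writes are masked off.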
bool GSRendererHW::DetectStripedDoubleClear(bool& no_rt, bool& no_ds)
{
const bool single_page_offset =
std::abs(static_cast<int>(m_cached_ctx.FRAME.FBP) - static_cast<int>(m_cached_ctx.ZBUF.ZBP)) == 1;
const bool z_is_frame = (m_cached_ctx.FRAME.FBP == m_cached_ctx.ZBUF.ZBP ||
(m_cached_ctx.FRAME.FBW > 1 && single_page_offset)) && // GT4O Public Beta
!m_cached_ctx.ZBUF.ZMSK &&
(m_cached_ctx.FRAME.PSM & 0x30) != (m_cached_ctx.ZBUF.PSM & 0x30) &&
(m_cached_ctx.FRAME.PSM & 0xF) == (m_cached_ctx.ZBUF.PSM & 0xF) && m_vt.m_eq.z == 1 &&
m_vertex.buff[1].XYZ.Z == m_vertex.buff[1].RGBAQ.U32[0];
// Z and color must be constant and the same and must be drawing strips.
if (!z_is_frame || m_vt.m_eq.rgba != 0xFFFF)
return false;
const GSVector2i page_size = GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].pgs;
const int strip_size = ((single_page_offset) ? page_size.x : (page_size.x / 2));
// Find the biggest gap out of all the verts, most of the time games are nice and do strips,
// however Lord of the Rings - The Third Age draws the strips 8x8 per sprite, until it makes up 32x8, then does the next 32x8 below.
// I know, unnecessary, but that's what they did. But this loop should calculate the largest gap, then we can confirm it.
// LOTR has 4096 verts, so this isn't going to be super fast on that game, most games will be just 16 verts so they should be ok,
// and I could cheat and stop when we get a size that matches, but that might be a lucky misdetection, I don't wanna risk it.
int vertex_offset = 0;
int last_vertex = m_vertex.buff[0].XYZ.X;
for (u32 i = 1; i < m_vertex.tail; i++)
{
vertex_offset = std::max(static_cast<int>((m_vertex.buff[i].XYZ.X - last_vertex) >> 4), vertex_offset);
last_vertex = m_vertex.buff[i].XYZ.X;
// Found a gap which is much bigger, no point continuing to scan.
if (vertex_offset > strip_size)
break;
}
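// For example (PSMCT32, 64x32 pages): with FRAME and ZBUF exactly one page apart the strips are expected to
// be a full page (64 pixels) wide, otherwise half a page (32 pixels), which is what the comparison below checks.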
const bool is_strips = vertex_offset == strip_size;
if (!is_strips)
return false;
// Half a page of extra width is written through Z.
// When FRAME is lower than ZBUF, or the draw already starts at x == 0, extend the right edge by the strip width.
// When FRAME is higher, ZBUF starts one page earlier, so pull the left edge back by the strip width instead.
if (m_cached_ctx.FRAME.FBP < m_cached_ctx.ZBUF.ZBP || m_r.x == 0)
m_r.z += vertex_offset;
else
m_r.x -= vertex_offset;
GL_INS("DetectStripedDoubleClear(): %d,%d => %d,%d @ FBP %x FBW %u ZBP %x", m_r.x, m_r.y, m_r.z, m_r.w,
m_cached_ctx.FRAME.Block(), m_cached_ctx.FRAME.FBW, m_cached_ctx.ZBUF.Block());
// And replace the vertices with a single sprite covering the clear rect.
ReplaceVerticesWithSprite(m_r, GSVector2i(1, 1));
// Remove Z, we'll write it through colour.
m_cached_ctx.ZBUF.ZMSK = true;
no_rt = false;
no_ds = true;
return true;
}
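// Detect the classic "double half clear": the game points FRAME at one half of a buffer and ZBUF at the other
// half (or vice versa) and clears both with one draw. When confirmed, the clear rect is doubled to cover the
// whole buffer and only one of colour/depth is actually written (the other side is masked off).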
bool GSRendererHW::DetectDoubleHalfClear(bool& no_rt, bool& no_ds)
{
if (m_cached_ctx.TEST.ZTST != ZTST_ALWAYS || m_cached_ctx.ZBUF.ZMSK)
return false;
// Block when any bits are masked. Too many false positives if we don't.
// Siren does a C32+Z24 clear with A masked, GTA:LCS does C32+Z24 but doesn't set FBMSK, leaving half
// of the alpha channel untouched (no effect because it uses Z24 elsewhere).
const GSLocalMemory::psm_t& frame_psm = GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM];
const GSLocalMemory::psm_t& zbuf_psm = GSLocalMemory::m_psm[m_cached_ctx.ZBUF.PSM];
if (((m_cached_ctx.FRAME.FBMSK & frame_psm.fmsk) != 0 && (m_cached_ctx.FRAME.FBMSK & zbuf_psm.fmsk) != 0))
return false;
// Z and color must be constant and the same
GSVertex* v = &m_vertex.buff[0];
if (m_vt.m_eq.rgba != 0xFFFF || !m_vt.m_eq.z || v[1].XYZ.Z != v[1].RGBAQ.U32[0])
return false;
// Frame and depth pointer can be inverted
const bool clear_depth = (m_cached_ctx.FRAME.FBP > m_cached_ctx.ZBUF.ZBP);
const u32 base = clear_depth ? m_cached_ctx.ZBUF.ZBP : m_cached_ctx.FRAME.FBP;
const u32 half = clear_depth ? m_cached_ctx.FRAME.FBP : m_cached_ctx.ZBUF.ZBP;
const bool enough_bits = clear_depth ? (frame_psm.trbpp >= zbuf_psm.trbpp) : (zbuf_psm.trbpp >= frame_psm.trbpp);
// Size of the current draw
const u32 w_pages = (m_r.z + (frame_psm.pgs.x - 1)) / frame_psm.pgs.x;
const u32 h_pages = (m_r.w + (frame_psm.pgs.y - 1)) / frame_psm.pgs.y;
const u32 written_pages = w_pages * h_pages;
// The two buffers must be side by side (the "half" point falls within the pages written by this draw),
// otherwise this can't be an in-progress double-half clear.
if (half > (base + written_pages) || half <= base)
return false;
GSTextureCache::Target* half_point = g_texture_cache->GetExactTarget(half << 5, m_cached_ctx.FRAME.FBW, clear_depth ? GSTextureCache::RenderTarget : GSTextureCache::DepthStencil, half << 5);
if (half_point)
{
half_point = nullptr;
return false;
}
// Don't allow double half clear to go through when the number of bits written through FRAME and Z are different.
// GTA: LCS does this setup, along with a few other games. Thankfully if it's a zero clear, we'll clear both
// separately, and the end result is the same because it gets invalidated. That's better than falsely detecting
// double half clears, and ending up with 1024 high render targets which really shouldn't be.
if ((!enough_bits && frame_psm.fmt != zbuf_psm.fmt && m_cached_ctx.FRAME.FBMSK != ((zbuf_psm.fmt == 1) ? 0xFF000000u : 0)) ||
!GSUtil::HasCompatibleBits(m_cached_ctx.FRAME.PSM & ~0x30, m_cached_ctx.ZBUF.PSM & ~0x30)) // Bit depth is not the same (i.e. 32bit + 16bit).
{
GL_INS("Inconsistent FRAME [%s, %08x] and ZBUF [%s] formats, not using double-half clear.",
psm_str(m_cached_ctx.FRAME.PSM), m_cached_ctx.FRAME.FBMSK, psm_str(m_cached_ctx.ZBUF.PSM));
return false;
}
// Try peeking ahead to confirm whether this is a "normal" clear, where the two buffers just happen to be
// bang up next to each other, or a double half clear. The two are really difficult to differentiate.
// Have to check both contexts, because God of War 2 likes to do this in-between setting TRXDIR, which
// causes a flush, and we don't have the next context backed up index set.
bool horizontal = false;
const bool ctx0_match = ((((m_env.CTXT[0].FRAME.FBW + 1) & ~1) == m_cached_ctx.FRAME.FBW * 2) || (m_env.CTXT[0].FRAME.FBW == m_cached_ctx.FRAME.FBW)) &&
((m_env.CTXT[0].FRAME.FBP == base &&
(!m_env.CTXT[0].ZBUF.ZMSK || (m_env.CTXT[0].TEST.ZTE && m_env.CTXT[0].TEST.ZTST >= ZTST_GEQUAL)) &&
m_env.CTXT[0].ZBUF.ZBP != half) ||
(m_env.CTXT[0].ZBUF.ZBP == base && m_env.CTXT[0].FRAME.FBP != half));
const bool ctx1_match = ((((m_env.CTXT[1].FRAME.FBW + 1) & ~1) == m_cached_ctx.FRAME.FBW * 2) || (m_env.CTXT[1].FRAME.FBW == m_cached_ctx.FRAME.FBW)) &&
((m_env.CTXT[1].FRAME.FBP == base && m_env.CTXT[1].ZBUF.ZBP != half) ||
(m_env.CTXT[1].ZBUF.ZBP == base &&
(!m_env.CTXT[1].ZBUF.ZMSK || (m_env.CTXT[1].TEST.ZTE && m_env.CTXT[1].TEST.ZTST >= ZTST_GEQUAL)) &&
m_env.CTXT[1].FRAME.FBP != half));
if (ctx0_match || ctx1_match)
{
// Needed for Spider-Man 2 (target was previously half size, double half cleared at new size).
GL_INS("Confirmed double-half clear by next FBP/ZBP");
const int ctx = ctx1_match ? 1 : 0;
if (((m_env.CTXT[ctx].FRAME.FBW + 1) & ~1) == m_cached_ctx.FRAME.FBW * 2)
horizontal = true;
}
else
{
// Check for a target matching the starting point. It might be in Z or FRAME...
GSTextureCache::Target* tgt = g_texture_cache->GetTargetWithSharedBits(
base * BLOCKS_PER_PAGE, clear_depth ? m_cached_ctx.ZBUF.PSM : m_cached_ctx.FRAME.PSM);
if (!tgt)
{
tgt = g_texture_cache->GetTargetWithSharedBits(
base * BLOCKS_PER_PAGE, clear_depth ? m_cached_ctx.FRAME.PSM : m_cached_ctx.ZBUF.PSM);
}
u32 end_block = ((half + written_pages) * BLOCKS_PER_PAGE) - 1;
if (tgt)
{
// If the full size is an odd width and it's trying to do half (in the case of FF7 DoC it goes from 7 to 4), we need to recalculate our end check.
if ((m_cached_ctx.FRAME.FBW * 2) == (tgt->m_TEX0.TBW + 1))
end_block = GSLocalMemory::GetUnwrappedEndBlockAddress(tgt->m_TEX0.TBP0, tgt->m_TEX0.TBW + 1, tgt->m_TEX0.PSM, tgt->GetUnscaledRect());
else
end_block = GSLocalMemory::GetUnwrappedEndBlockAddress(tgt->m_TEX0.TBP0, tgt->m_TEX0.TBW, tgt->m_TEX0.PSM, tgt->GetUnscaledRect());
}
// Are we clearing over the middle of this target?
if (!tgt || (((half + written_pages) * BLOCKS_PER_PAGE) - 1) > end_block)
{
return false;
}
// Siren double half clears horizontally with half FBW instead of vertically.
// We could use the FBW here, but using the rectangle seems a bit safer, because changing FBW
// from one RT to another isn't uncommon.
const GSVector4 vr = GSVector4(m_r.rintersect(tgt->m_valid)) / GSVector4(tgt->m_valid);
horizontal = (vr.z < vr.w);
}
GL_INS("DetectDoubleHalfClear(): Clearing %s %s, fbp=%x, zbp=%x, pages=%u, base=%x, half=%x, rect=(%d,%d=>%d,%d)",
clear_depth ? "depth" : "color", horizontal ? "horizontally" : "vertically", m_cached_ctx.FRAME.Block(),
m_cached_ctx.ZBUF.Block(), written_pages, base * BLOCKS_PER_PAGE, half * BLOCKS_PER_PAGE, m_r.x, m_r.y, m_r.z,
m_r.w);
// Double the clear rect.
if (horizontal)
{
m_cached_ctx.FRAME.FBW *= 2;
m_r.z += m_r.x + m_r.width();
}
else
{
m_r.w += m_r.y + m_r.height();
}
ReplaceVerticesWithSprite(m_r, GSVector2i(1, 1));
// Prevent wasting time looking up and creating the target which is getting blown away.
if (!clear_depth)
{
SetNewFRAME(base * BLOCKS_PER_PAGE, m_cached_ctx.FRAME.FBW, m_cached_ctx.FRAME.PSM);
m_cached_ctx.ZBUF.ZMSK = true;
no_rt = false;
no_ds = true;
}
else
{
SetNewZBUF(base * BLOCKS_PER_PAGE, m_cached_ctx.ZBUF.PSM);
m_cached_ctx.FRAME.FBMSK = 0xFFFFFFFF;
no_rt = true;
no_ds = false;
}
// Remove any targets at the half-buffer point, they're getting overwritten.
g_texture_cache->InvalidateVideoMemType(GSTextureCache::RenderTarget, half * BLOCKS_PER_PAGE);
g_texture_cache->InvalidateVideoMemType(GSTextureCache::DepthStencil, half * BLOCKS_PER_PAGE);
return true;
}
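// If this draw is a constant-colour / constant-Z write that fully covers the existing targets, clear them
// directly on the GPU (ClearRenderTarget/ClearDepth) instead of issuing a draw. Returns true when every
// present target was cleared, i.e. the draw itself can be skipped.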
bool GSRendererHW::TryTargetClear(GSTextureCache::Target* rt, GSTextureCache::Target* ds, bool preserve_rt_color, bool preserve_depth)
{
if (m_vt.m_eq.rgba != 0xFFFF || !m_vt.m_eq.z)
return false;
bool skip = true;
if (rt)
{
if (!preserve_rt_color && !IsReallyDithered() && m_r.rintersect(rt->m_valid).eq(rt->m_valid))
{
const u32 c = GetConstantDirectWriteMemClearColor();
GL_INS("TryTargetClear(): RT at %x <= %08X", rt->m_TEX0.TBP0, c);
g_gs_device->ClearRenderTarget(rt->m_texture, c);
rt->m_alpha_max = c >> 24;
rt->m_alpha_min = c >> 24;
if (!rt->m_32_bits_fmt)
{
rt->m_alpha_max &= 128;
rt->m_alpha_min &= 128;
}
}
else
{
skip = false;
}
}
if (ds)
{
if (ds && !preserve_depth && m_r.rintersect(ds->m_valid).eq(ds->m_valid))
{
const u32 max_z = 0xFFFFFFFF >> (GSLocalMemory::m_psm[m_cached_ctx.ZBUF.PSM].fmt * 8);
const u32 z = std::min(max_z, m_vertex.buff[1].XYZ.Z);
const float d = static_cast<float>(z) * (g_gs_device->Features().clip_control ? 0x1p-32f : 0x1p-24f);
GL_INS("TryTargetClear(): DS at %x <= %f", ds->m_TEX0.TBP0, d);
g_gs_device->ClearDepth(ds->m_texture, d);
ds->m_alpha_max = z >> 24;
ds->m_alpha_min = z >> 24;
if (GSLocalMemory::m_psm[ds->m_TEX0.PSM].bpp == 16)
{
ds->m_alpha_max &= 128;
ds->m_alpha_min &= 128;
}
}
else
{
skip = false;
}
}
return skip;
}
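// Perform a full-buffer clear directly in GS local memory (and invalidate any overlapping targets) so
// CPU-side reads see the cleared values. Returns true when both the colour and depth sides were handled
// (absent or invalidated), in which case the caller can typically skip the hardware draw.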
bool GSRendererHW::TryGSMemClear(bool no_rt, bool preserve_rt, bool invalidate_rt, u32 rt_end_bp,
bool no_ds, bool preserve_z, bool invalidate_z, u32 ds_end_bp)
{
if (!PrimitiveCoversWithoutGaps())
return false;
// Limit the hack to a single full-buffer clear. Some games might use several columns to clear a screen,
// but hopefully this will be enough.
if (m_r.width() < ((static_cast<int>(m_cached_ctx.FRAME.FBW) - 1) * 64))
return false;
if (!no_rt && !preserve_rt)
{
ClearGSLocalMemory(m_context->offset.fb, m_r, GetConstantDirectWriteMemClearColor());
if (invalidate_rt)
{
g_texture_cache->InvalidateVideoMem(m_context->offset.fb, m_r, false);
g_texture_cache->InvalidateContainedTargets(
GSLocalMemory::GetStartBlockAddress(
m_cached_ctx.FRAME.Block(), m_cached_ctx.FRAME.FBW, m_cached_ctx.FRAME.PSM, m_r),
rt_end_bp, m_cached_ctx.FRAME.PSM);
GSUploadQueue clear_queue;
clear_queue.draw = s_n;
clear_queue.rect = m_r;
clear_queue.blit.DBP = m_cached_ctx.FRAME.Block();
clear_queue.blit.DBW = m_cached_ctx.FRAME.FBW;
clear_queue.blit.DPSM = m_cached_ctx.FRAME.PSM;
clear_queue.zero_clear = true;
m_draw_transfers.push_back(clear_queue);
}
}
if (!no_ds && !preserve_z)
{
ClearGSLocalMemory(m_context->offset.zb, m_r, m_vertex.buff[1].XYZ.Z);
if (invalidate_z)
{
g_texture_cache->InvalidateVideoMem(m_context->offset.zb, m_r, false);
g_texture_cache->InvalidateContainedTargets(
GSLocalMemory::GetStartBlockAddress(
m_cached_ctx.ZBUF.Block(), m_cached_ctx.FRAME.FBW, m_cached_ctx.ZBUF.PSM, m_r),
ds_end_bp, m_cached_ctx.ZBUF.PSM);
}
}
return ((invalidate_rt || no_rt) && (invalidate_z || no_ds));
}
void GSRendererHW::ClearGSLocalMemory(const GSOffset& off, const GSVector4i& r, u32 vert_color)
{
GL_INS("ClearGSLocalMemory(): %08X %d,%d => %d,%d @ BP %x BW %u %s", vert_color, r.x, r.y, r.z, r.w, off.bp(),
off.bw(), psm_str(off.psm()));
const u32 psm = (off.psm() == PSMCT32 && m_cached_ctx.FRAME.FBMSK == 0xFF000000u) ? PSMCT24 : off.psm();
const int format = GSLocalMemory::m_psm[psm].fmt;
const int left = r.left;
const int right = r.right;
const int bottom = r.bottom;
int top = r.top;
// Process the page aligned region first, then fall back to anything which is not.
// Since pages are linear in memory, we can do it basically with a vector memset.
// If the draw area is greater than the FBW.. I don't want to deal with that here..
const u32 fbw = m_cached_ctx.FRAME.FBW;
const u32 pages_wide = r.z / 64u;
const GSVector2i& pgs = GSLocalMemory::m_psm[psm].pgs;
if (left == 0 && top == 0 && (right & (pgs.x - 1)) == 0 && pages_wide <= fbw)
{
const u32 pixels_per_page = pgs.x * pgs.y;
const int page_aligned_bottom = (bottom & ~(pgs.y - 1));
if (format == 0)
{
const GSVector4i vcolor = GSVector4i(vert_color);
const u32 iterations_per_page = (pages_wide * pixels_per_page) / 4;
pxAssert((off.bp() & (BLOCKS_PER_PAGE - 1)) == 0);
for (u32 current_page = off.bp() >> 5; top < page_aligned_bottom; top += pgs.y, current_page += fbw)
{
GSVector4i* ptr = reinterpret_cast<GSVector4i*>(m_mem.vm8() + current_page * PAGE_SIZE);
GSVector4i* const ptr_end = ptr + iterations_per_page;
while (ptr != ptr_end)
*(ptr++) = vcolor;
}
}
else if (format == 1)
{
const GSVector4i mask = GSVector4i::xff000000();
const GSVector4i vcolor = GSVector4i(vert_color & 0x00ffffffu);
const u32 iterations_per_page = (pages_wide * pixels_per_page) / 4;
pxAssert((off.bp() & (BLOCKS_PER_PAGE - 1)) == 0);
for (u32 current_page = off.bp() >> 5; top < page_aligned_bottom; top += pgs.y, current_page += fbw)
{
GSVector4i* ptr = reinterpret_cast<GSVector4i*>(m_mem.vm8() + current_page * PAGE_SIZE);
GSVector4i* const ptr_end = ptr + iterations_per_page;
while (ptr != ptr_end)
{
*ptr = (*ptr & mask) | vcolor;
ptr++;
}
}
}
else if (format == 2)
{
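// Pack the 32-bit vertex colour into the 16-bit (1-5-5-5) layout: A bit 31 -> bit 15, B bits 23:19 -> 14:10,
// G bits 15:11 -> 9:5, R bits 7:3 -> 4:0. E.g. 0x80FF00FF (A=0x80, B=0xFF, G=0x00, R=0xFF) packs to 0xFC1F.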
const u16 converted_color = ((vert_color >> 16) & 0x8000) | ((vert_color >> 9) & 0x7C00) |
((vert_color >> 6) & 0x3E0) | ((vert_color >> 3) & 0x1F);
const GSVector4i vcolor = GSVector4i::broadcast16(converted_color);
const u32 iterations_per_page = (pages_wide * pixels_per_page) / 8;
pxAssert((off.bp() & (BLOCKS_PER_PAGE - 1)) == 0);
for (u32 current_page = off.bp() >> 5; top < page_aligned_bottom; top += pgs.y, current_page += fbw)
{
GSVector4i* ptr = reinterpret_cast<GSVector4i*>(m_mem.vm8() + current_page * PAGE_SIZE);
GSVector4i* const ptr_end = ptr + iterations_per_page;
while (ptr != ptr_end)
*(ptr++) = vcolor;
}
}
}
if (format == 0)
{
// Based on WritePixel32
u32* vm = m_mem.vm32();
for (int y = top; y < bottom; y++)
{
GSOffset::PAHelper pa = off.assertSizesMatch(GSLocalMemory::swizzle32).paMulti(0, y);
for (int x = left; x < right; x++)
vm[pa.value(x)] = vert_color;
}
}
else if (format == 1)
{
// Based on WritePixel24
u32* vm = m_mem.vm32();
const u32 write_color = vert_color & 0xffffffu;
for (int y = top; y < bottom; y++)
{
GSOffset::PAHelper pa = off.assertSizesMatch(GSLocalMemory::swizzle32).paMulti(0, y);
for (int x = left; x < right; x++)
vm[pa.value(x)] = (vm[pa.value(x)] & 0xff000000u) | write_color;
}
}
else if (format == 2)
{
const u16 converted_color = ((vert_color >> 16) & 0x8000) | ((vert_color >> 9) & 0x7C00) | ((vert_color >> 6) & 0x3E0) | ((vert_color >> 3) & 0x1F);
// Based on WritePixel16
u16* vm = m_mem.vm16();
for (int y = top; y < bottom; y++)
{
GSOffset::PAHelper pa = off.assertSizesMatch(GSLocalMemory::swizzle16).paMulti(0, y);
for (int x = left; x < right; x++)
vm[pa.value(x)] = converted_color;
}
}
}
bool GSRendererHW::OI_BlitFMV(GSTextureCache::Target* _rt, GSTextureCache::Source* tex, const GSVector4i& r_draw)
{
if (r_draw.w > 1024 && (m_vt.m_primclass == GS_SPRITE_CLASS) && (m_vertex.next == 2) && PRIM->TME && !PRIM->ABE && tex && !tex->m_target && m_cached_ctx.TEX0.TBW > 0)
{
GL_PUSH("OI_BlitFMV");
GL_INS("OI_BlitFMV");
// The draw is done past the RT at the location of the texture. To avoid various upscaling mess,
// we will blit the data from the top to the bottom of the texture manually.
// Expected memory representation
// -----------------------------------------------------------------
// RT (2 half frame)
// -----------------------------------------------------------------
// Top of Texture (full height frame)
//
// Bottom of Texture (half height frame, will be the copy of Top texture after the draw)
// -----------------------------------------------------------------
const int tw = static_cast<int>(1 << m_cached_ctx.TEX0.TW);
const int th = static_cast<int>(1 << m_cached_ctx.TEX0.TH);
// Compute the Bottom of texture rectangle
ASSERT(m_cached_ctx.TEX0.TBP0 > m_cached_ctx.FRAME.Block());
const int offset = (m_cached_ctx.TEX0.TBP0 - m_cached_ctx.FRAME.Block()) / m_cached_ctx.TEX0.TBW;
GSVector4i r_texture(r_draw);
r_texture.y -= offset;
r_texture.w -= offset;
if (GSTexture* rt = g_gs_device->CreateRenderTarget(tw, th, GSTexture::Format::Color))
{
// sRect is the top of texture
const GSVector4 sRect(m_vt.m_min.t.x / tw, m_vt.m_min.t.y / th, m_vt.m_max.t.x / tw, m_vt.m_max.t.y / th);
const GSVector4 dRect(r_texture);
const GSVector4i r_full(0, 0, tw, th);
g_gs_device->CopyRect(tex->m_texture, rt, r_full, 0, 0);
g_perfmon.Put(GSPerfMon::TextureCopies, 1);
g_gs_device->StretchRect(tex->m_texture, sRect, rt, dRect);
g_perfmon.Put(GSPerfMon::TextureCopies, 1);
g_gs_device->CopyRect(rt, tex->m_texture, r_full, 0, 0);
g_perfmon.Put(GSPerfMon::TextureCopies, 1);
g_gs_device->Recycle(rt);
}
// Copy back the texture into the GS mem. I don't know why but it will be
// reuploaded again later
g_texture_cache->Read(tex, r_texture.rintersect(tex->m_texture->GetRect()));
g_texture_cache->InvalidateVideoMemSubTarget(_rt);
return false; // skip current draw
}
// Nothing to see here, keep going
return true;
}
bool GSRendererHW::AreAnyPixelsDiscarded() const
{
return ((m_draw_env->SCANMSK.MSK & 2) || // skipping rows
m_cached_ctx.TEST.ATE || // testing alpha (might discard some pixels)
m_cached_ctx.TEST.DATE); // reading alpha
}
bool GSRendererHW::IsDiscardingDstColor()
{
return ((!PRIM->ABE || IsOpaque() || m_context->ALPHA.IsBlack()) && // no blending or writing black
!AreAnyPixelsDiscarded() && (m_cached_ctx.FRAME.FBMSK & GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk) == 0); // no channels masked
}
bool GSRendererHW::IsDiscardingDstRGB()
{
return ((!PRIM->ABE || IsOpaque() || m_context->ALPHA.IsBlack()) && // no blending or writing black
((m_cached_ctx.FRAME.FBMSK & GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk) & 0xFFFFFFu) == 0); // RGB isn't masked
}
bool GSRendererHW::IsDiscardingDstAlpha() const
{
return ((!PRIM->ABE || m_context->ALPHA.C != 1) && // not using Ad
((m_cached_ctx.FRAME.FBMSK & GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmsk) & 0xFF000000u) == 0); // alpha isn't masked
}
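// Returns true when the draw covers its bounding rect without gaps: a single sprite, two triangles forming a
// quad, or a run of sprites whose edges line up exactly (the result is cached for the current draw). Used to
// decide when a draw can be treated as a full-surface write, e.g. for memory clears.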
bool GSRendererHW::PrimitiveCoversWithoutGaps()
{
if (m_primitive_covers_without_gaps.has_value())
return m_primitive_covers_without_gaps.value();
// Draw shouldn't be offset.
if (((m_r.eq32(GSVector4i::zero())).mask() & 0xff) != 0xff)
{
m_primitive_covers_without_gaps = false;
return false;
}
if (m_vt.m_primclass == GS_POINT_CLASS)
{
m_primitive_covers_without_gaps = (m_vertex.next < 2);
return m_primitive_covers_without_gaps.value();
}
else if (m_vt.m_primclass == GS_TRIANGLE_CLASS)
{
m_primitive_covers_without_gaps = (m_index.tail == 6 && TrianglesAreQuads());
return m_primitive_covers_without_gaps.value();
}
else if (m_vt.m_primclass != GS_SPRITE_CLASS)
{
m_primitive_covers_without_gaps = false;
return false;
}
// Simple case: one sprite.
if (m_index.tail == 2)
{
m_primitive_covers_without_gaps = true;
return true;
}
// Check that the height matches. Xenosaga 3 draws a letterbox around
// the FMV with a sprite at the top and bottom of the framebuffer.
const GSVertex* v = &m_vertex.buff[0];
const u32 first_dpY = v[1].XYZ.Y - v[0].XYZ.Y;
const u32 first_dpX = v[1].XYZ.X - v[0].XYZ.X;
// Horizontal Match.
if ((first_dpX >> 4) == m_r.z)
{
// Borrowed from MergeSprite(), modified to calculate heights.
u32 last_pY = v[1].XYZ.Y;
for (u32 i = 2; i < m_vertex.next; i += 2)
{
const u32 dpY = v[i + 1].XYZ.Y - v[i].XYZ.Y;
if (dpY != first_dpY || v[i].XYZ.Y != last_pY)
{
m_primitive_covers_without_gaps = false;
return false;
}
last_pY = v[i + 1].XYZ.Y;
}
m_primitive_covers_without_gaps = true;
return true;
}
// Vertical Match.
if ((first_dpY >> 4) == m_r.w)
{
// Borrowed from MergeSprite().
u32 last_pX = v[1].XYZ.X;
for (u32 i = 2; i < m_vertex.next; i += 2)
{
if (v[i].XYZ.X < v[i-2].XYZ.X)
{
const u32 dpX = v[i + 1].XYZ.X - v[i].XYZ.X;
const u32 prev_X = v[i - 2].XYZ.X - m_context->XYOFFSET.OFX;
if (dpX != prev_X || v[i].XYZ.X != m_context->XYOFFSET.OFX)
{
m_primitive_covers_without_gaps = false;
return false;
}
}
else
{
const u32 dpX = v[i + 1].XYZ.X - v[i].XYZ.X;
if (dpX != first_dpX || v[i].XYZ.X != last_pX)
{
m_primitive_covers_without_gaps = false;
return false;
}
}
last_pX = v[i + 1].XYZ.X;
}
m_primitive_covers_without_gaps = true;
return true;
}
m_primitive_covers_without_gaps = false;
return false;
}
bool GSRendererHW::IsConstantDirectWriteMemClear()
{
const bool direct_draw = (m_vt.m_primclass == GS_SPRITE_CLASS) || (m_index.tail == 6 && m_vt.m_primclass == GS_TRIANGLE_CLASS);
// Constant Direct Write without texture/test/blending (aka a GS mem clear)
if (direct_draw && !PRIM->TME // Direct write
&& !(m_draw_env->SCANMSK.MSK & 2)
&& !m_cached_ctx.TEST.ATE // no alpha test
&& !m_cached_ctx.TEST.DATE // no destination alpha test
&& (!m_cached_ctx.TEST.ZTE || m_cached_ctx.TEST.ZTST == ZTST_ALWAYS) // no depth test
&& (m_vt.m_eq.rgba == 0xFFFF || m_vertex.next == 2) // constant color write
&& m_r.x == 0 && m_r.y == 0) // Likely full buffer write
return true;
return false;
}
u32 GSRendererHW::GetConstantDirectWriteMemClearColor() const
{
// Take the vertex colour, but check if the blending would make it black.
u32 vert_color = m_vertex.buff[1].RGBAQ.U32[0];
if (PRIM->ABE && m_context->ALPHA.IsBlack())
vert_color &= 0xFF000000u;
// 24-bit format? Drop alpha. Otherwise, FBA sets the high bit of alpha.
const u32 cfmt = GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmt;
if (cfmt == 1)
vert_color &= 0xFFFFFFu;
else
vert_color |= m_context->FBA.FBA << 31;
// Apply mask for 16-bit formats: keep the alpha MSB and the top 5 bits of each colour channel (what a 16-bit write preserves).
if (cfmt == 2)
vert_color &= 0x80F8F8F8u;
return vert_color;
}
u32 GSRendererHW::GetConstantDirectWriteMemClearDepth() const
{
const u32 max_z = (0xFFFFFFFF >> (GSLocalMemory::m_psm[m_cached_ctx.ZBUF.PSM].fmt * 8));
return std::min(m_vertex.buff[1].XYZ.Z, max_z);
}
bool GSRendererHW::IsReallyDithered() const
{
// Must have dither on, not disabled in config, and using 16-bit.
const GSDrawingEnvironment* env = m_draw_env;
if (!env->DTHE.DTHE || GSConfig.Dithering == 0 || GSLocalMemory::m_psm[m_cached_ctx.FRAME.PSM].fmt != 2)
return false;
// Dithering is still on, but if the matrix is all-zero, it has no effect.
if ((env->DIMX.U64 & UINT64_C(0x7777777777777777)) == 0)
return false;
return true;
}
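// Rewrite the vertex buffer as a single sprite covering unscaled_rect (UVs from unscaled_uv_rect), and patch
// the vertex trace, vertex/index counts, draw rect and scissor to match, so the rest of the draw pipeline
// sees a plain full-rect draw.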
void GSRendererHW::ReplaceVerticesWithSprite(const GSVector4i& unscaled_rect, const GSVector4i& unscaled_uv_rect,
const GSVector2i& unscaled_size, const GSVector4i& scissor)
{
const GSVector4i fpr = unscaled_rect.sll32(4);
const GSVector4i fpuv = unscaled_uv_rect.sll32(4);
GSVertex* v = m_vertex.buff;
v[0].XYZ.X = static_cast<u16>(m_context->XYOFFSET.OFX + fpr.x);
v[0].XYZ.Y = static_cast<u16>(m_context->XYOFFSET.OFY + fpr.y);
v[0].XYZ.Z = v[1].XYZ.Z;
v[0].RGBAQ = v[1].RGBAQ;
v[0].FOG = v[1].FOG;
v[1].XYZ.X = static_cast<u16>(m_context->XYOFFSET.OFX + fpr.z);
v[1].XYZ.Y = static_cast<u16>(m_context->XYOFFSET.OFY + fpr.w);
if (PRIM->FST)
{
v[0].U = fpuv.x;
v[0].V = fpuv.y;
v[1].U = fpuv.z;
v[1].V = fpuv.w;
}
else
{
const GSVector4 st = GSVector4(unscaled_uv_rect) / GSVector4(GSVector4i(unscaled_size).xyxy());
GSVector4::storel(&v[0].ST.S, st);
GSVector4::storeh(&v[1].ST.S, st);
}
// Fix up vertex trace.
m_vt.m_min.p.x = unscaled_rect.x;
m_vt.m_min.p.y = unscaled_rect.y;
m_vt.m_min.p.z = v[0].XYZ.Z;
m_vt.m_max.p.x = unscaled_rect.z;
m_vt.m_max.p.y = unscaled_rect.w;
m_vt.m_max.p.z = v[0].XYZ.Z;
m_vt.m_min.t.x = unscaled_uv_rect.x;
m_vt.m_min.t.y = unscaled_uv_rect.y;
m_vt.m_max.t.x = unscaled_uv_rect.z;
m_vt.m_max.t.y = unscaled_uv_rect.w;
m_vt.m_min.c = GSVector4i(v[0].RGBAQ.U32[0]).u8to32();
m_vt.m_max.c = m_vt.m_min.c;
m_vt.m_eq.rgba = 0xFFFF;
m_vt.m_eq.z = true;
m_vt.m_eq.f = true;
m_vertex.head = m_vertex.tail = m_vertex.next = 2;
m_index.tail = 2;
m_r = unscaled_rect;
m_context->scissor.in = scissor;
}
void GSRendererHW::ReplaceVerticesWithSprite(const GSVector4i& unscaled_rect, const GSVector2i& unscaled_size)
{
ReplaceVerticesWithSprite(unscaled_rect, unscaled_rect, unscaled_size, unscaled_rect);
}
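// Set up m_conf for an internal ("HLE") draw of a full-rect quad, bypassing the usual GS register translation.
// The caller receives the config, adjusts whichever selectors it needs, then submits it with EndHLEHardwareDraw().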
GSHWDrawConfig& GSRendererHW::BeginHLEHardwareDraw(
GSTexture* rt, GSTexture* ds, float rt_scale, GSTexture* tex, float tex_scale, const GSVector4i& unscaled_rect)
{
ResetStates();
// Bit gross, but really no other way to ensure there's nothing of the last draw left over.
GSHWDrawConfig& config = m_conf;
std::memset(&config.cb_vs, 0, sizeof(config.cb_vs));
std::memset(&config.cb_ps, 0, sizeof(config.cb_ps));
// Reused between draws, since the draw config is shared, you can't have multiple draws in flight anyway.
static GSVertex vertices[4];
static constexpr u16 indices[6] = {0, 1, 2, 2, 1, 3};
#define V(i, x, y, u, v) \
do \
{ \
vertices[i].XYZ.X = x; \
vertices[i].XYZ.Y = y; \
vertices[i].U = u; \
vertices[i].V = v; \
} while (0)
const GSVector4i fp_rect = unscaled_rect.sll32(4);
V(0, fp_rect.x, fp_rect.y, fp_rect.x, fp_rect.y); // top-left
V(1, fp_rect.z, fp_rect.y, fp_rect.z, fp_rect.y); // top-right
V(2, fp_rect.x, fp_rect.w, fp_rect.x, fp_rect.w); // bottom-left
V(3, fp_rect.z, fp_rect.w, fp_rect.z, fp_rect.w); // bottom-right
#undef V
GSTexture* rt_or_ds = rt ? rt : ds;
config.rt = rt;
config.ds = ds;
config.tex = tex;
config.pal = nullptr;
config.indices = indices;
config.verts = vertices;
config.nverts = static_cast<u32>(std::size(vertices));
config.nindices = static_cast<u32>(std::size(indices));
config.indices_per_prim = 3;
config.drawlist = nullptr;
config.scissor = rt_or_ds->GetRect().rintersect(GSVector4i(GSVector4(rt->GetRect()) * tex_scale));
config.drawarea = config.scissor;
config.topology = GSHWDrawConfig::Topology::Triangle;
config.blend = GSHWDrawConfig::BlendState();
config.depth = GSHWDrawConfig::DepthStencilSelector::NoDepth();
config.colormask = GSHWDrawConfig::ColorMaskSelector();
config.colormask.wrgba = 0xf;
config.require_one_barrier = false;
config.require_full_barrier = false;
config.destination_alpha = GSHWDrawConfig::DestinationAlphaMode::Off;
config.datm = false;
config.line_expand = false;
config.separate_alpha_pass = false;
config.second_separate_alpha_pass = false;
config.alpha_second_pass.enable = false;
config.vs.key = 0;
config.vs.tme = tex != nullptr;
config.vs.iip = true;
config.vs.fst = true;
config.ps.key_lo = 0;
config.ps.key_hi = 0;
config.ps.tfx = tex ? TFX_DECAL : TFX_NONE;
config.ps.iip = true;
config.ps.fst = true;
if (tex)
{
const GSVector2i texsize = tex->GetSize();
config.cb_ps.WH = GSVector4(static_cast<float>(texsize.x) / tex_scale,
static_cast<float>(texsize.y) / tex_scale, static_cast<float>(texsize.x), static_cast<float>(texsize.y));
config.cb_ps.STScale = GSVector2(1.0f);
config.cb_vs.texture_scale = GSVector2((1.0f / 16.0f) / config.cb_ps.WH.x, (1.0f / 16.0f) / config.cb_ps.WH.y);
}
const GSVector2i rtsize = rt_or_ds->GetSize();
config.cb_vs.vertex_scale = GSVector2(2.0f * rt_scale / (rtsize.x << 4), 2.0f * rt_scale / (rtsize.y << 4));
config.cb_vs.vertex_offset = GSVector2(-1.0f / rtsize.x + 1.0f, -1.0f / rtsize.y + 1.0f);
return config;
}
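// Minimal illustrative usage (hypothetical caller, the names below are not from this file):
//   GSHWDrawConfig& cfg = BeginHLEHardwareDraw(rt_tex, nullptr, rt_scale, src_tex, src_scale, draw_rect);
//   cfg.colormask.wrgba = 0x7; // e.g. leave alpha untouched
//   EndHLEHardwareDraw();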
void GSRendererHW::EndHLEHardwareDraw(bool force_copy_on_hazard /* = false */)
{
GSHWDrawConfig& config = m_conf;
GL_PUSH("HLE hardware draw in %d,%d => %d,%d", config.drawarea.left, config.drawarea.top, config.drawarea.right,
config.drawarea.bottom);
GSTexture* copy = nullptr;
if (config.tex && (config.tex == config.rt || config.tex == config.ds))
{
const GSDevice::FeatureSupport features = g_gs_device->Features();
if (!force_copy_on_hazard && config.tex == config.rt && features.texture_barrier)
{
// Sample RT 1:1.
config.require_one_barrier = !features.framebuffer_fetch;
config.ps.tex_is_fb = true;
}
else if (!force_copy_on_hazard && config.tex == config.ds && !config.depth.zwe &&
features.test_and_sample_depth)
{
// Safe to read depth buffer.
}
else
{
// Have to copy the texture. Assume the whole thing is read; in all the cases this is used, it is.
GSTexture* src = (config.tex == config.rt) ? config.rt : config.ds;
copy = g_gs_device->CreateTexture(src->GetWidth(), src->GetHeight(), 1, src->GetFormat(), true);
if (!copy)
{
Console.Error("Texture allocation failure in EndHLEHardwareDraw()");
return;
}
// DX11 can't partially copy depth textures.
const GSVector4i copy_rect = (src->IsDepthStencil() && !features.test_and_sample_depth) ?
src->GetRect() :
config.drawarea.rintersect(src->GetRect());
g_gs_device->CopyRect(src, copy, copy_rect - copy_rect.xyxy(), copy_rect.x, copy_rect.y);
config.tex = copy;
}
}
// Drop color1 if dual-source is not being used.
config.ps.no_color = !config.rt;
config.ps.no_color1 = !config.rt || !config.blend.enable ||
(!GSDevice::IsDualSourceBlendFactor(config.blend.src_factor) &&
!GSDevice::IsDualSourceBlendFactor(config.blend.dst_factor));
g_gs_device->RenderHW(m_conf);
if (copy)
g_gs_device->Recycle(copy);
}