GPU/HW: Split shadergen to separate class

2019-11-03 13:36:54 +10:00 · 2019-11-03 13:36:54 +10:00 · be81d08109
parent 91c99f0226
commit be81d08109
10 changed files with 624 additions and 590 deletions
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@ -21,6 +21,8 @@ add_library(core
    gpu_hw.h
    gpu_hw_opengl.cpp
    gpu_hw_opengl.h
+    gpu_hw_shadergen.cpp
+    gpu_hw_shadergen.h
    gpu_sw.cpp
    gpu_sw.h
    gte.cpp
--- a/src/core/core.vcxproj
+++ b/src/core/core.vcxproj
@ -41,6 +41,7 @@
    <ClCompile Include="cpu_disasm.cpp" />
    <ClCompile Include="digital_controller.cpp" />
    <ClCompile Include="gpu_commands.cpp" />
+    <ClCompile Include="gpu_hw_shadergen.cpp" />
    <ClCompile Include="gpu_sw.cpp" />
    <ClCompile Include="gte.cpp" />
    <ClCompile Include="dma.cpp" />
@ -64,6 +65,7 @@
    <ClInclude Include="cpu_core.h" />
    <ClInclude Include="cpu_disasm.h" />
    <ClInclude Include="digital_controller.h" />
+    <ClInclude Include="gpu_hw_shadergen.h" />
    <ClInclude Include="gpu_sw.h" />
    <ClInclude Include="gte.h" />
    <ClInclude Include="cpu_types.h" />
--- a/src/core/core.vcxproj.filters
+++ b/src/core/core.vcxproj.filters
@ -23,6 +23,7 @@
    <ClCompile Include="settings.cpp" />
    <ClCompile Include="gpu_commands.cpp" />
    <ClCompile Include="gpu_sw.cpp" />
+    <ClCompile Include="gpu_hw_shadergen.cpp" />
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="types.h" />
@ -50,6 +51,7 @@
    <ClInclude Include="memory_card.h" />
    <ClInclude Include="settings.h" />
    <ClInclude Include="gpu_sw.h" />
+    <ClInclude Include="gpu_hw_shadergen.h" />
  </ItemGroup>
  <ItemGroup>
    <None Include="cpu_core.inl" />
--- a/src/core/gpu.h
+++ b/src/core/gpu.h
@ -18,6 +18,57 @@ class Timers;
 class GPU
 {
 public:
+  enum class DMADirection : u32
+  {
+    Off = 0,
+    FIFO = 1,
+    CPUtoGP0 = 2,
+    GPUREADtoCPU = 3
+  };
+
+  enum class Primitive : u8
+  {
+    Reserved = 0,
+    Polygon = 1,
+    Line = 2,
+    Rectangle = 3
+  };
+
+  enum class DrawRectangleSize : u8
+  {
+    Variable = 0,
+    R1x1 = 1,
+    R8x8 = 2,
+    R16x16 = 3
+  };
+
+  enum class TextureMode : u8
+  {
+    Palette4Bit = 0,
+    Palette8Bit = 1,
+    Direct16Bit = 2,
+    Reserved_Direct16Bit = 3,
+
+    // Not register values.
+    RawTextureBit = 4,
+    RawPalette4Bit = RawTextureBit | Palette4Bit,
+    RawPalette8Bit = RawTextureBit | Palette8Bit,
+    RawDirect16Bit = RawTextureBit | Direct16Bit,
+    Reserved_RawDirect16Bit = RawTextureBit | Reserved_Direct16Bit,
+
+    Disabled = 8 // Not a register value
+  };
+
+  enum class TransparencyMode : u8
+  {
+    HalfBackgroundPlusHalfForeground = 0,
+    BackgroundPlusForeground = 1,
+    BackgroundMinusForeground = 2,
+    BackgroundPlusQuarterForeground = 3,
+
+    Disabled = 4 // Not a register value
+  };
+
  enum : u32
  {
    VRAM_WIDTH = 1024,
@ -29,6 +80,13 @@ public:
    HBLANK_TIMER_INDEX = 1
  };

+  // 4x4 dither matrix.
+  static constexpr s32 DITHER_MATRIX[4][4] = {{-4, +0, -3, +1},  // row 0
+                                              {+2, -2, +3, -1},  // row 1
+                                              {-3, +1, -4, +0},  // row 2
+                                              {+4, -1, +2, -2}}; // row 3
+
+  // Base class constructor.
  GPU();
  virtual ~GPU();

@ -112,57 +170,6 @@ protected:
  static bool DumpVRAMToFile(const char* filename, u32 width, u32 height, u32 stride, const void* buffer,
                             bool remove_alpha);

-  enum class DMADirection : u32
-  {
-    Off = 0,
-    FIFO = 1,
-    CPUtoGP0 = 2,
-    GPUREADtoCPU = 3
-  };
-
-  enum class Primitive : u8
-  {
-    Reserved = 0,
-    Polygon = 1,
-    Line = 2,
-    Rectangle = 3
-  };
-
-  enum class DrawRectangleSize : u8
-  {
-    Variable = 0,
-    R1x1 = 1,
-    R8x8 = 2,
-    R16x16 = 3
-  };
-
-  enum class TextureMode : u8
-  {
-    Palette4Bit = 0,
-    Palette8Bit = 1,
-    Direct16Bit = 2,
-    Reserved_Direct16Bit = 3,
-
-    // Not register values.
-    RawTextureBit = 4,
-    RawPalette4Bit = RawTextureBit | Palette4Bit,
-    RawPalette8Bit = RawTextureBit | Palette8Bit,
-    RawDirect16Bit = RawTextureBit | Direct16Bit,
-    Reserved_RawDirect16Bit = RawTextureBit | Reserved_Direct16Bit,
-
-    Disabled = 8 // Not a register value
-  };
-
-  enum class TransparencyMode : u8
-  {
-    HalfBackgroundPlusHalfForeground = 0,
-    BackgroundPlusForeground = 1,
-    BackgroundMinusForeground = 2,
-    BackgroundPlusQuarterForeground = 3,
-
-    Disabled = 4 // Not a register value
-  };
-
  union RenderCommand
  {
    u32 bits;
@ -258,12 +265,6 @@ protected:
    }
  };

-  // 4x4 dither matrix.
-  static constexpr s32 DITHER_MATRIX[4][4] = {{-4, +0, -3, +1},  // row 0
-                                              {+2, -2, +3, -1},  // row 1
-                                              {-3, +1, -4, +0},  // row 2
-                                              {+4, -1, +2, -2}}; // row 3
-
  void SoftReset();

  // Sets dots per scanline
@ -464,3 +465,5 @@ private:

  static const GP0CommandHandlerTable s_GP0_command_handler_table;
 };
+
+IMPLEMENT_ENUM_CLASS_BITWISE_OPERATORS(GPU::TextureMode);
--- a/src/core/gpu_hw.cpp
+++ b/src/core/gpu_hw.cpp
@ -163,7 +163,7 @@ void GPU_HW::LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command

 void GPU_HW::AddDuplicateVertex()
 {
-  std::memcpy(m_batch_current_vertex_ptr, m_batch_current_vertex_ptr - 1, sizeof(HWVertex));
+  std::memcpy(m_batch_current_vertex_ptr, m_batch_current_vertex_ptr - 1, sizeof(BatchVertex));
  m_batch_current_vertex_ptr++;
 }

@ -175,443 +175,14 @@ void GPU_HW::CalcScissorRect(int* left, int* top, int* right, int* bottom)
  *bottom = std::max<u32>((m_drawing_area.bottom + 1) * m_resolution_scale, *top + 1);
 }

-static void DefineMacro(std::stringstream& ss, const char* name, bool enabled)
-{
-  if (enabled)
-    ss << "#define " << name << " 1\n";
-  else
-    ss << "/* #define " << name << " 0 */\n";
-}
-
-void GPU_HW::GenerateShaderHeader(std::stringstream& ss)
-{
-  ss << "#version 330 core\n\n";
-  ss << "const int RESOLUTION_SCALE = " << m_resolution_scale << ";\n";
-  ss << "const ivec2 VRAM_SIZE = ivec2(" << VRAM_WIDTH << ", " << VRAM_HEIGHT << ") * RESOLUTION_SCALE;\n";
-  ss << "const vec2 RCP_VRAM_SIZE = vec2(1.0, 1.0) / vec2(VRAM_SIZE);\n";
-  ss << R"(
-
-float fixYCoord(float y)
-{
-  return 1.0 - RCP_VRAM_SIZE.y - y;
-}
-
-int fixYCoord(int y)
-{
-  return VRAM_SIZE.y - y - 1;
-}
-
-uint RGBA8ToRGBA5551(vec4 v)
-{
-  uint r = uint(v.r * 255.0) >> 3;
-  uint g = uint(v.g * 255.0) >> 3;
-  uint b = uint(v.b * 255.0) >> 3;
-  uint a = (v.a != 0.0) ? 1u : 0u;
-  return (r) | (g << 5) | (b << 10) | (a << 15);
-}
-
-vec4 RGBA5551ToRGBA8(uint v)
-{
-  uint r = (v & 31u);
-  uint g = ((v >> 5) & 31u);
-  uint b = ((v >> 10) & 31u);
-  uint a = ((v >> 15) & 1u);
-
-  // repeat lower bits
-  r = (r << 3) | (r & 7u);
-  g = (g << 3) | (g & 7u);
-  b = (b << 3) | (b & 7u);
-
-  return vec4(float(r) / 255.0, float(g) / 255.0, float(b) / 255.0, float(a));
-}
-)";
-}
-
-void GPU_HW::GenerateBatchUniformBuffer(std::stringstream& ss)
-{
-  ss << R"(
-uniform UBOBlock {
-  ivec2 u_pos_offset;
-  uvec2 u_texture_window_mask;
-  uvec2 u_texture_window_offset;
-  float u_src_alpha_factor;
-  float u_dst_alpha_factor;
-};
-)";
-}
-
-std::string GPU_HW::GenerateVertexShader(bool textured)
-{
-  std::stringstream ss;
-  GenerateShaderHeader(ss);
-  DefineMacro(ss, "TEXTURED", textured);
-  GenerateBatchUniformBuffer(ss);
-
-  ss << R"(
-in ivec2 a_pos;
-in vec4 a_col0;
-in int a_texcoord;
-in int a_texpage;
-
-out vec3 v_col0;
-#if TEXTURED
-  out vec2 v_tex0;
-  flat out ivec4 v_texpage;
-#endif
-
-void main()
-{
-  // 0..+1023 -> -1..1
-  float pos_x = (float(a_pos.x + u_pos_offset.x) / 512.0) - 1.0;
-  float pos_y = (float(a_pos.y + u_pos_offset.y) / -256.0) + 1.0;
-  gl_Position = vec4(pos_x, pos_y, 0.0, 1.0);
-
-  v_col0 = a_col0.rgb;
-  #if TEXTURED
-    v_tex0 = vec2(float(a_texcoord & 0xFFFF), float(a_texcoord >> 16)) / vec2(255.0);
-
-    // base_x,base_y,palette_x,palette_y
-    v_texpage.x = (a_texpage & 15) * 64 * RESOLUTION_SCALE;
-    v_texpage.y = ((a_texpage >> 4) & 1) * 256 * RESOLUTION_SCALE;
-    v_texpage.z = ((a_texpage >> 16) & 63) * 16 * RESOLUTION_SCALE;
-    v_texpage.w = ((a_texpage >> 22) & 511) * RESOLUTION_SCALE;
-  #endif
-}
-)";
-
-  return ss.str();
-}
-
-std::string GPU_HW::GenerateFragmentShader(HWBatchRenderMode transparency, TextureMode texture_mode, bool dithering)
-{
-  const TextureMode actual_texture_mode =
-    static_cast<TextureMode>(static_cast<u8>(texture_mode) & ~static_cast<u8>(TextureMode::RawTextureBit));
-  const bool raw_texture = (static_cast<u8>(texture_mode) & static_cast<u8>(TextureMode::RawTextureBit)) ==
-                           static_cast<u8>(TextureMode::RawTextureBit);
-
-  std::stringstream ss;
-  GenerateShaderHeader(ss);
-  GenerateBatchUniformBuffer(ss);
-  DefineMacro(ss, "TRANSPARENCY", transparency != HWBatchRenderMode::TransparencyDisabled);
-  DefineMacro(ss, "TRANSPARENCY_ONLY_OPAQUE", transparency == HWBatchRenderMode::OnlyOpaque);
-  DefineMacro(ss, "TRANSPARENCY_ONLY_TRANSPARENCY", transparency == HWBatchRenderMode::OnlyTransparent);
-  DefineMacro(ss, "TEXTURED", actual_texture_mode != TextureMode::Disabled);
-  DefineMacro(ss, "PALETTE",
-              actual_texture_mode == GPU::TextureMode::Palette4Bit ||
-                actual_texture_mode == GPU::TextureMode::Palette8Bit);
-  DefineMacro(ss, "PALETTE_4_BIT", actual_texture_mode == GPU::TextureMode::Palette4Bit);
-  DefineMacro(ss, "PALETTE_8_BIT", actual_texture_mode == GPU::TextureMode::Palette8Bit);
-  DefineMacro(ss, "RAW_TEXTURE", raw_texture);
-  DefineMacro(ss, "DITHERING", dithering);
-  DefineMacro(ss, "TRUE_COLOR", m_true_color);
-
-  ss << "const int[16] s_dither_values = int[16]( ";
-  for (u32 i = 0; i < 16; i++)
-  {
-    if (i > 0)
-      ss << ", ";
-    ss << DITHER_MATRIX[i / 4][i % 4];
-  }
-  ss << " );\n";
-
-  ss << R"(
-in vec3 v_col0;
-#if TEXTURED
-  in vec2 v_tex0;
-  flat in ivec4 v_texpage;
-  uniform sampler2D samp0;
-#endif
-
-out vec4 o_col0;
-
-ivec3 ApplyDithering(ivec3 icol)
-{
-  ivec2 fc = (ivec2(gl_FragCoord.xy) / ivec2(RESOLUTION_SCALE, RESOLUTION_SCALE)) & ivec2(3, 3);
-  int offset = s_dither_values[fc.y * 4 + fc.x];
-  return icol + ivec3(offset, offset, offset);
-}
-
-ivec3 TruncateTo15Bit(ivec3 icol)
-{
-  icol = clamp(icol, ivec3(0, 0, 0), ivec3(255, 255, 255));
-  return (icol & ivec3(~7, ~7, ~7)) | ((icol >> 3) & ivec3(7, 7, 7));
-}
-
-#if TEXTURED
-ivec2 ApplyNativeTextureWindow(ivec2 coords)
-{
-  uint x = (uint(coords.x) & ~(u_texture_window_mask.x * 8u)) | ((u_texture_window_offset.x & u_texture_window_mask.x) * 8u);
-  uint y = (uint(coords.y) & ~(u_texture_window_mask.y * 8u)) | ((u_texture_window_offset.y & u_texture_window_mask.y) * 8u);
-  return ivec2(int(x), int(y));
-}  
-
-ivec2 ApplyTextureWindow(ivec2 coords)
-{
-  if (RESOLUTION_SCALE == 1)
-    return ApplyNativeTextureWindow(coords);
-
-  ivec2 downscaled_coords = coords / ivec2(RESOLUTION_SCALE);
-  ivec2 coords_offset = coords % ivec2(RESOLUTION_SCALE);
-  return (ApplyNativeTextureWindow(downscaled_coords) * ivec2(RESOLUTION_SCALE)) + coords_offset;
-}
-
-ivec4 SampleFromVRAM(vec2 coord)
-{
-  // from 0..1 to 0..255
-  ivec2 icoord = ivec2(coord * vec2(255 * RESOLUTION_SCALE));
-  icoord = ApplyTextureWindow(icoord);
-
-  // adjust for tightly packed palette formats
-  ivec2 index_coord = icoord;
-  #if PALETTE_4_BIT
-    index_coord.x /= 4;
-  #elif PALETTE_8_BIT
-    index_coord.x /= 2;
-  #endif
-
-  // fixup coords
-  ivec2 vicoord = ivec2(v_texpage.x + index_coord.x, fixYCoord(v_texpage.y + index_coord.y));
-
-  // load colour/palette
-  vec4 color = texelFetch(samp0, vicoord, 0);
-
-  // apply palette
-  #if PALETTE
-    #if PALETTE_4_BIT
-      int subpixel = int(icoord.x / RESOLUTION_SCALE) & 3;
-      uint vram_value = RGBA8ToRGBA5551(color);
-      int palette_index = int((vram_value >> (subpixel * 4)) & 0x0Fu);
-    #elif PALETTE_8_BIT
-      int subpixel = int(icoord.x / RESOLUTION_SCALE) & 1;
-      uint vram_value = RGBA8ToRGBA5551(color);
-      int palette_index = int((vram_value >> (subpixel * 8)) & 0xFFu);
-    #endif
-    ivec2 palette_icoord = ivec2(v_texpage.z + (palette_index * RESOLUTION_SCALE), fixYCoord(v_texpage.w));
-    color = texelFetch(samp0, palette_icoord, 0);
-  #endif
-
-  return ivec4(color * vec4(255.0, 255.0, 255.0, 255.0));
-}
-#endif
-
-void main()
-{
-  ivec3 vertcol = ivec3(v_col0 * vec3(255.0, 255.0, 255.0));
-
-  bool semitransparent;
-  bool new_mask_bit;
-  ivec3 icolor;
-
-  #if TEXTURED
-    ivec4 texcol = SampleFromVRAM(v_tex0);
-    if (texcol == ivec4(0.0, 0.0, 0.0, 0.0))
-      discard;
-
-    // Grab semitransparent bit from the texture color.
-    semitransparent = (texcol.a != 0);
-
-    #if RAW_TEXTURE
-      icolor = texcol.rgb;
-    #else
-      icolor = (vertcol * texcol.rgb) >> 7;
-    #endif
-  #else
-    // All pixels are semitransparent for untextured polygons.
-    semitransparent = true;
-    icolor = vertcol;
-  #endif
-
-  // Apply dithering
-  #if DITHERING
-    icolor = ApplyDithering(icolor);
-  #endif
-
-  // Clip to 15-bit range
-  #if !TRUE_COLOR
-    icolor = TruncateTo15Bit(icolor);
-  #endif
-
-  // Normalize
-  vec3 color = vec3(icolor) / vec3(255.0, 255.0, 255.0);
-
-  #if TRANSPARENCY
-    // Apply semitransparency. If not a semitransparent texel, destination alpha is ignored.
-    if (semitransparent)
-    {
-      #if TRANSPARENCY_ONLY_OPAQUE
-        discard;
-      #endif
-      o_col0 = vec4(color * u_src_alpha_factor, u_dst_alpha_factor);
-    }
-    else
-    {
-      #if TRANSPARENCY_ONLY_TRANSPARENCY
-        discard;
-      #endif
-      o_col0 = vec4(color, 0.0);
-    }
-  #else
-    o_col0 = vec4(color, 0.0);
-  #endif
-}
-)";
-
-  return ss.str();
-}
-
-std::string GPU_HW::GenerateScreenQuadVertexShader()
-{
-  std::stringstream ss;
-  GenerateShaderHeader(ss);
-  ss << R"(
-
-out vec2 v_tex0;
-
-void main()
-{
-  v_tex0 = vec2(float((gl_VertexID << 1) & 2), float(gl_VertexID & 2));
-  gl_Position = vec4(v_tex0 * vec2(2.0f, -2.0f) + vec2(-1.0f, 1.0f), 0.0f, 1.0f);
-  gl_Position.y = -gl_Position.y;
-}
-)";
-
-  return ss.str();
-}
-
-std::string GPU_HW::GenerateFillFragmentShader()
-{
-  std::stringstream ss;
-  GenerateShaderHeader(ss);
-
-  ss << R"(
-uniform vec4 fill_color;
-out vec4 o_col0;
-
-void main()
-{
-  o_col0 = fill_color;
-}
-)";
-
-  return ss.str();
-}
-
-std::string GPU_HW::GenerateDisplayFragmentShader(bool depth_24bit, bool interlaced)
-{
-  std::stringstream ss;
-  GenerateShaderHeader(ss);
-  DefineMacro(ss, "DEPTH_24BIT", depth_24bit);
-  DefineMacro(ss, "INTERLACED", interlaced);
-
-  ss << R"(
-in vec2 v_tex0;
-out vec4 o_col0;
-
-uniform sampler2D samp0;
-uniform ivec3 u_base_coords;
-
-ivec2 GetCoords(vec2 fragcoord)
-{
-  ivec2 icoords = ivec2(fragcoord);
-  #if INTERLACED
-    if ((((icoords.y - u_base_coords.z) / RESOLUTION_SCALE) & 1) != 0)
-      discard;
-  #endif
-  return icoords;
-}
-
-void main()
-{
-  ivec2 icoords = GetCoords(gl_FragCoord.xy);
-
-  #if DEPTH_24BIT
-    // compute offset in dwords from the start of the 24-bit values
-    ivec2 base = ivec2(u_base_coords.x, u_base_coords.y + icoords.y);
-    int xoff = int(icoords.x);
-    int dword_index = (xoff / 2) + (xoff / 4);
-
-    // sample two adjacent dwords, or four 16-bit values as the 24-bit value will lie somewhere between these
-    uint s0 = RGBA8ToRGBA5551(texelFetch(samp0, ivec2(base.x + dword_index * 2 + 0, base.y), 0));
-    uint s1 = RGBA8ToRGBA5551(texelFetch(samp0, ivec2(base.x + dword_index * 2 + 1, base.y), 0));
-    uint s2 = RGBA8ToRGBA5551(texelFetch(samp0, ivec2(base.x + (dword_index + 1) * 2 + 0, base.y), 0));
-    uint s3 = RGBA8ToRGBA5551(texelFetch(samp0, ivec2(base.x + (dword_index + 1) * 2 + 1, base.y), 0));
-
-    // select the bit for this pixel depending on its offset in the 4-pixel block
-    uint r, g, b;
-    int block_offset = xoff & 3;
-    if (block_offset == 0)
-    {
-      r = s0 & 0xFFu;
-      g = s0 >> 8;
-      b = s1 & 0xFFu;
-    }
-    else if (block_offset == 1)
-    {
-      r = s1 >> 8;
-      g = s2 & 0xFFu;
-      b = s2 >> 8;
-    }
-    else if (block_offset == 2)
-    {
-      r = s1 & 0xFFu;
-      g = s1 >> 8;
-      b = s2 & 0xFFu;
-    }
-    else
-    {
-      r = s2 >> 8;
-      g = s3 & 0xFFu;
-      b = s3 >> 8;
-    }
-
-    // and normalize
-    o_col0 = vec4(float(r) / 255.0, float(g) / 255.0, float(b) / 255.0, 1.0);
-  #else
-    // load and return
-    o_col0 = texelFetch(samp0, u_base_coords.xy + icoords, 0);
-  #endif
-}
-)";
-
-  return ss.str();
-}
-
-std::string GPU_HW::GenerateVRAMWriteFragmentShader()
-{
-  std::stringstream ss;
-  GenerateShaderHeader(ss);
-
-  ss << R"(
-
-uniform ivec2 u_base_coords;
-uniform ivec2 u_size;
-uniform usamplerBuffer samp0;
-
-out vec4 o_col0;
-
-void main()
-{
-  ivec2 coords = ivec2(gl_FragCoord.xy) / ivec2(RESOLUTION_SCALE, RESOLUTION_SCALE);
-  ivec2 offset = coords - u_base_coords;
-  offset.y = u_size.y - offset.y - 1;
-
-  int buffer_offset = offset.y * u_size.x + offset.x;
-  uint value = texelFetch(samp0, buffer_offset).r;
-  
-  o_col0 = RGBA5551ToRGBA8(value);
-})";
-
-  return ss.str();
-}
-
-GPU_HW::HWPrimitive GPU_HW::GetPrimitiveForCommand(RenderCommand rc)
+GPU_HW::BatchPrimitive GPU_HW::GetPrimitiveForCommand(RenderCommand rc)
 {
  if (rc.primitive == Primitive::Line)
-    return rc.polyline ? HWPrimitive::LineStrip : HWPrimitive::Lines;
+    return rc.polyline ? BatchPrimitive::LineStrip : BatchPrimitive::Lines;
  else if ((rc.primitive == Primitive::Polygon && rc.quad_polygon) || rc.primitive == Primitive::Rectangle)
-    return HWPrimitive::TriangleStrip;
+    return BatchPrimitive::TriangleStrip;
  else
-    return HWPrimitive::Triangles;
+    return BatchPrimitive::Triangles;
 }

 void GPU_HW::DispatchRenderCommand(RenderCommand rc, u32 num_vertices, const u32* command_ptr)
@ -687,13 +258,13 @@ void GPU_HW::DispatchRenderCommand(RenderCommand rc, u32 num_vertices, const u32
  // has any state changed which requires a new batch?
  const TransparencyMode transparency_mode =
    rc.transparency_enable ? m_render_state.transparency_mode : TransparencyMode::Disabled;
-  const HWPrimitive rc_primitive = GetPrimitiveForCommand(rc);
+  const BatchPrimitive rc_primitive = GetPrimitiveForCommand(rc);
  const bool dithering_enable = (!m_true_color && rc.IsDitheringEnabled()) ? m_GPUSTAT.dither_enable : false;
  const u32 max_added_vertices = num_vertices + 2;
  if (!IsFlushed())
  {
    const bool buffer_overflow = GetBatchVertexSpace() < max_added_vertices;
-    if (buffer_overflow || rc_primitive == HWPrimitive::LineStrip || m_batch.texture_mode != texture_mode ||
+    if (buffer_overflow || rc_primitive == BatchPrimitive::LineStrip || m_batch.texture_mode != texture_mode ||
        m_batch.transparency_mode != transparency_mode || m_batch.primitive != rc_primitive ||
        dithering_enable != m_batch.dithering || m_render_state.IsTextureWindowChanged())
    {
--- a/src/core/gpu_hw.h
+++ b/src/core/gpu_hw.h
@ -8,6 +8,22 @@
 class GPU_HW : public GPU
 {
 public:
+  enum class BatchPrimitive : u8
+  {
+    Lines = 0,
+    LineStrip = 1,
+    Triangles = 2,
+    TriangleStrip = 3
+  };
+
+  enum class BatchRenderMode : u8
+  {
+    TransparencyDisabled,
+    TransparentAndOpaque,
+    OnlyOpaque,
+    OnlyTransparent
+  };
+
  GPU_HW();
  virtual ~GPU_HW();

@ -16,23 +32,7 @@ public:
  virtual void UpdateSettings() override;

 protected:
-  enum class HWPrimitive : u8
-  {
-    Lines = 0,
-    LineStrip = 1,
-    Triangles = 2,
-    TriangleStrip = 3
-  };
-
-  enum class HWBatchRenderMode : u8
-  {
-    TransparencyDisabled,
-    TransparentAndOpaque,
-    OnlyOpaque,
-    OnlyTransparent
-  };
-
-  struct HWVertex
+  struct BatchVertex
  {
    s32 x;
    s32 y;
@ -55,9 +55,9 @@ protected:
    }
  };

-  struct HWBatchConfig
+  struct BatchConfig
  {
-    HWPrimitive primitive;
+    BatchPrimitive primitive;
    TextureMode texture_mode;
    TransparencyMode transparency_mode;
    bool dithering;
@ -71,14 +71,14 @@ protected:
    }

    // Returns the render mode for this batch.
-    HWBatchRenderMode GetRenderMode() const
+    BatchRenderMode GetRenderMode() const
    {
-      return transparency_mode == TransparencyMode::Disabled ? HWBatchRenderMode::TransparencyDisabled :
-                                                               HWBatchRenderMode::TransparentAndOpaque;
+      return transparency_mode == TransparencyMode::Disabled ? BatchRenderMode::TransparencyDisabled :
+                                                               BatchRenderMode::TransparentAndOpaque;
    }
  };

-  struct HWBatchUBOData
+  struct BatchUBOData
  {
    s32 u_pos_offset[2];
    u32 u_texture_window_mask[2];
@ -90,7 +90,7 @@ protected:
  static constexpr u32 VRAM_UPDATE_TEXTURE_BUFFER_SIZE = VRAM_WIDTH * VRAM_HEIGHT * sizeof(u32);
  static constexpr u32 VERTEX_BUFFER_SIZE = 1 * 1024 * 1024;
  static constexpr u32 MIN_BATCH_VERTEX_COUNT = 6;
-  static constexpr u32 MAX_BATCH_VERTEX_COUNT = VERTEX_BUFFER_SIZE / sizeof(HWVertex);
+  static constexpr u32 MAX_BATCH_VERTEX_COUNT = VERTEX_BUFFER_SIZE / sizeof(BatchVertex);
  static constexpr u32 UNIFORM_BUFFER_SIZE = 512 * 1024;

  static constexpr std::tuple<float, float, float, float> RGBA8ToFloat(u32 rgba)
@ -121,31 +121,21 @@ protected:
    return std::make_tuple(x * s32(m_resolution_scale), y * s32(m_resolution_scale));
  }

-  std::string GenerateVertexShader(bool textured);
-  std::string GenerateFragmentShader(HWBatchRenderMode transparency, TextureMode texture_mode, bool dithering);
-  std::string GenerateScreenQuadVertexShader();
-  std::string GenerateFillFragmentShader();
-  std::string GenerateDisplayFragmentShader(bool depth_24bit, bool interlaced);
-  std::string GenerateVRAMWriteFragmentShader();
-
-  HWVertex* m_batch_start_vertex_ptr = nullptr;
-  HWVertex* m_batch_end_vertex_ptr = nullptr;
-  HWVertex* m_batch_current_vertex_ptr = nullptr;
+  BatchVertex* m_batch_start_vertex_ptr = nullptr;
+  BatchVertex* m_batch_end_vertex_ptr = nullptr;
+  BatchVertex* m_batch_current_vertex_ptr = nullptr;
  u32 m_batch_base_vertex = 0;

  u32 m_resolution_scale = 1;
  u32 m_max_resolution_scale = 1;
  bool m_true_color = false;

-  HWBatchConfig m_batch = {};
-  HWBatchUBOData m_batch_ubo_data = {};
+  BatchConfig m_batch = {};
+  BatchUBOData m_batch_ubo_data = {};
  bool m_batch_ubo_dirty = true;

 private:
-  static HWPrimitive GetPrimitiveForCommand(RenderCommand rc);
-
-  void GenerateShaderHeader(std::stringstream& ss);
-  void GenerateBatchUniformBuffer(std::stringstream& ss);
+  static BatchPrimitive GetPrimitiveForCommand(RenderCommand rc);

  void LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command_ptr);
  void AddDuplicateVertex();
--- a/src/core/gpu_hw_opengl.cpp
+++ b/src/core/gpu_hw_opengl.cpp
@ -2,6 +2,7 @@
 #include "YBaseLib/Assert.h"
 #include "YBaseLib/Log.h"
 #include "YBaseLib/String.h"
+#include "gpu_hw_shadergen.h"
 #include "host_interface.h"
 #include "imgui.h"
 #include "system.h"
@ -138,9 +139,9 @@ void GPU_HW_OpenGL::MapBatchVertexPointer(u32 required_vertices)
  Assert(!m_batch_start_vertex_ptr);

  const GL::StreamBuffer::MappingResult res =
-    m_vertex_stream_buffer->Map(sizeof(HWVertex), required_vertices * sizeof(HWVertex));
+    m_vertex_stream_buffer->Map(sizeof(BatchVertex), required_vertices * sizeof(BatchVertex));

-  m_batch_start_vertex_ptr = static_cast<HWVertex*>(res.pointer);
+  m_batch_start_vertex_ptr = static_cast<BatchVertex*>(res.pointer);
  m_batch_current_vertex_ptr = m_batch_start_vertex_ptr;
  m_batch_end_vertex_ptr = m_batch_start_vertex_ptr + res.space_aligned;
  m_batch_base_vertex = res.index_aligned;
@ -246,11 +247,11 @@ void GPU_HW_OpenGL::CreateVertexBuffer()
  glEnableVertexAttribArray(1);
  glEnableVertexAttribArray(2);
  glEnableVertexAttribArray(3);
-  glVertexAttribIPointer(0, 2, GL_INT, sizeof(HWVertex), reinterpret_cast<void*>(offsetof(HWVertex, x)));
-  glVertexAttribPointer(1, 4, GL_UNSIGNED_BYTE, true, sizeof(HWVertex),
-                        reinterpret_cast<void*>(offsetof(HWVertex, color)));
-  glVertexAttribIPointer(2, 2, GL_INT, sizeof(HWVertex), reinterpret_cast<void*>(offsetof(HWVertex, texcoord)));
-  glVertexAttribIPointer(3, 1, GL_INT, sizeof(HWVertex), reinterpret_cast<void*>(offsetof(HWVertex, texpage)));
+  glVertexAttribIPointer(0, 2, GL_INT, sizeof(BatchVertex), reinterpret_cast<void*>(offsetof(BatchVertex, x)));
+  glVertexAttribPointer(1, 4, GL_UNSIGNED_BYTE, true, sizeof(BatchVertex),
+                        reinterpret_cast<void*>(offsetof(BatchVertex, color)));
+  glVertexAttribIPointer(2, 2, GL_INT, sizeof(BatchVertex), reinterpret_cast<void*>(offsetof(BatchVertex, texcoord)));
+  glVertexAttribIPointer(3, 1, GL_INT, sizeof(BatchVertex), reinterpret_cast<void*>(offsetof(BatchVertex, texpage)));
  glBindVertexArray(0);

  glGenVertexArrays(1, &m_attributeless_vao_id);
@ -280,67 +281,21 @@ void GPU_HW_OpenGL::CreateTextureBuffer()

 bool GPU_HW_OpenGL::CompilePrograms()
 {
+  GPU_HW_ShaderGen shadergen(GPU_HW_ShaderGen::Backend::OpenGL, m_resolution_scale, m_true_color);
+
  for (u32 render_mode = 0; render_mode < 4; render_mode++)
  {
    for (u32 texture_mode = 0; texture_mode < 9; texture_mode++)
    {
      for (u8 dithering = 0; dithering < 2; dithering++)
      {
-        if (!CompileProgram(m_render_programs[render_mode][texture_mode][dithering],
-                            static_cast<HWBatchRenderMode>(render_mode), static_cast<TextureMode>(texture_mode),
-                            ConvertToBoolUnchecked(dithering)))
-        {
-          return false;
-        }
-      }
-    }
-  }
+        const bool textured = (static_cast<TextureMode>(texture_mode) != TextureMode::Disabled);
+        const std::string vs = shadergen.GenerateBatchVertexShader(textured);
+        const std::string fs = shadergen.GenerateBatchFragmentShader(static_cast<BatchRenderMode>(render_mode),
+                                                                     static_cast<TextureMode>(texture_mode),
+                                                                     ConvertToBoolUnchecked(dithering));

-  // TODO: Use string_view
-  for (u8 depth_24bit = 0; depth_24bit < 2; depth_24bit++)
-  {
-    for (u8 interlaced = 0; interlaced < 2; interlaced++)
-    {
-      GL::Program& prog = m_display_programs[depth_24bit][interlaced];
-      const std::string vs = GenerateScreenQuadVertexShader();
-      const std::string fs =
-        GenerateDisplayFragmentShader(ConvertToBoolUnchecked(depth_24bit), ConvertToBoolUnchecked(interlaced));
-      if (!prog.Compile(vs, fs))
-        return false;
-
-      prog.BindFragData(0, "o_col0");
-      if (!prog.Link())
-        return false;
-
-      prog.Bind();
-      prog.RegisterUniform("u_base_coords");
-      prog.RegisterUniform("samp0");
-      prog.Uniform1i(1, 0);
-    }
-  }
-
-  if (!m_vram_write_program.Compile(GenerateScreenQuadVertexShader(), GenerateVRAMWriteFragmentShader()))
-    return false;
-
-  m_vram_write_program.BindFragData(0, "o_col0");
-  if (!m_vram_write_program.Link())
-    return false;
-
-  m_vram_write_program.Bind();
-  m_vram_write_program.RegisterUniform("u_base_coords");
-  m_vram_write_program.RegisterUniform("u_size");
-  m_vram_write_program.RegisterUniform("samp0");
-  m_vram_write_program.Uniform1i(2, 0);
-
-  return true;
-}
-
-bool GPU_HW_OpenGL::CompileProgram(GL::Program& prog, HWBatchRenderMode render_mode, TextureMode texture_mode,
-                                   bool dithering)
-{
-  const bool textured = texture_mode != TextureMode::Disabled;
-  const std::string vs = GenerateVertexShader(textured);
-  const std::string fs = GenerateFragmentShader(render_mode, texture_mode, dithering);
+        GL::Program& prog = m_render_programs[render_mode][texture_mode][dithering];
        if (!prog.Compile(vs, fs))
          return false;

@ -358,18 +313,58 @@ bool GPU_HW_OpenGL::CompileProgram(GL::Program& prog, HWBatchRenderMode render_m
          return false;

        prog.BindUniformBlock("UBOBlock", 1);
-
        if (textured)
        {
          prog.Bind();
          prog.RegisterUniform("samp0");
          prog.Uniform1i(0, 0);
        }
+      }
+    }
+  }
+
+  for (u8 depth_24bit = 0; depth_24bit < 2; depth_24bit++)
+  {
+    for (u8 interlaced = 0; interlaced < 2; interlaced++)
+    {
+      GL::Program& prog = m_display_programs[depth_24bit][interlaced];
+      const std::string vs = shadergen.GenerateScreenQuadVertexShader();
+      const std::string fs = shadergen.GenerateDisplayFragmentShader(ConvertToBoolUnchecked(depth_24bit),
+                                                                     ConvertToBoolUnchecked(interlaced));
+      if (!prog.Compile(vs, fs))
+        return false;
+
+      prog.BindFragData(0, "o_col0");
+      if (!prog.Link())
+        return false;
+
+      prog.Bind();
+      prog.RegisterUniform("u_base_coords");
+      prog.RegisterUniform("samp0");
+      prog.Uniform1i(1, 0);
+    }
+  }
+
+  if (!m_vram_write_program.Compile(shadergen.GenerateScreenQuadVertexShader(),
+                                    shadergen.GenerateVRAMWriteFragmentShader()))
+  {
+    return false;
+  }
+
+  m_vram_write_program.BindFragData(0, "o_col0");
+  if (!m_vram_write_program.Link())
+    return false;
+
+  m_vram_write_program.Bind();
+  m_vram_write_program.RegisterUniform("u_base_coords");
+  m_vram_write_program.RegisterUniform("u_size");
+  m_vram_write_program.RegisterUniform("samp0");
+  m_vram_write_program.Uniform1i(2, 0);

  return true;
 }

-void GPU_HW_OpenGL::SetDrawState(HWBatchRenderMode render_mode)
+void GPU_HW_OpenGL::SetDrawState(BatchRenderMode render_mode)
 {
  const GL::Program& prog = m_render_programs[static_cast<u8>(render_mode)][static_cast<u8>(m_batch.texture_mode)]
                                             [BoolToUInt8(m_batch.dithering)];
@ -378,7 +373,7 @@ void GPU_HW_OpenGL::SetDrawState(HWBatchRenderMode render_mode)
  if (m_batch.texture_mode != TextureMode::Disabled)
    m_vram_read_texture->Bind();

-  if (m_batch.transparency_mode == TransparencyMode::Disabled || render_mode == HWBatchRenderMode::OnlyOpaque)
+  if (m_batch.transparency_mode == TransparencyMode::Disabled || render_mode == BatchRenderMode::OnlyOpaque)
  {
    glDisable(GL_BLEND);
  }
@ -732,7 +727,7 @@ void GPU_HW_OpenGL::FlushRender()
  m_stats.num_batches++;
  m_stats.num_vertices += vertex_count;

-  m_vertex_stream_buffer->Unmap(vertex_count * sizeof(HWVertex));
+  m_vertex_stream_buffer->Unmap(vertex_count * sizeof(BatchVertex));
  m_vertex_stream_buffer->Bind();
  m_batch_start_vertex_ptr = nullptr;
  m_batch_end_vertex_ptr = nullptr;
@ -742,9 +737,9 @@ void GPU_HW_OpenGL::FlushRender()

  if (m_batch.NeedsTwoPassRendering())
  {
-    SetDrawState(HWBatchRenderMode::OnlyTransparent);
+    SetDrawState(BatchRenderMode::OnlyTransparent);
    glDrawArrays(gl_primitives[static_cast<u8>(m_batch.primitive)], 0, vertex_count);
-    SetDrawState(HWBatchRenderMode::OnlyOpaque);
+    SetDrawState(BatchRenderMode::OnlyOpaque);
    glDrawArrays(gl_primitives[static_cast<u8>(m_batch.primitive)], 0, vertex_count);
  }
  else
--- a/src/core/gpu_hw_opengl.h
+++ b/src/core/gpu_hw_opengl.h
@ -58,8 +58,7 @@ private:
  void CreateTextureBuffer();

  bool CompilePrograms();
-  bool CompileProgram(GL::Program& prog, HWBatchRenderMode render_mode, TextureMode texture_mode, bool dithering);
-  void SetDrawState(HWBatchRenderMode render_mode);
+  void SetDrawState(BatchRenderMode render_mode);
  void UploadUniformBlock(const void* data, u32 data_size);

  // downsample texture - used for readbacks at >1xIR.
--- a/src/core/gpu_hw_shadergen.cpp
+++ b/src/core/gpu_hw_shadergen.cpp
@ -0,0 +1,436 @@
+#include "gpu_hw_shadergen.h"
+
+// Stores the three construction parameters in the matching members; no other work is performed.
+GPU_HW_ShaderGen::GPU_HW_ShaderGen(Backend backend, u32 resolution_scale, bool true_color)
+  : m_backend{backend}, m_resolution_scale{resolution_scale}, m_true_color{true_color}
+{
+}
+
+// Nothing is released explicitly, so the compiler-generated destructor body is used.
+GPU_HW_ShaderGen::~GPU_HW_ShaderGen() = default;
+
+// Appends a preprocessor definition for `name` to the shader source being built in `ss`.
+//
+// The macro is always defined: as 1 when `enabled` is true and as 0 otherwise. The generated
+// shaders test these names with `#if NAME`, `#elif NAME` and `#if !NAME`, and the GLSL
+// preprocessor rules make evaluating an undefined macro name in such an expression an error
+// rather than treating it as zero, so a disabled feature must still produce a definition.
+static void DefineMacro(std::stringstream& ss, const char* name, bool enabled)
+{
+  ss << "#define " << name << ' ' << (enabled ? 1 : 0) << '\n';
+}
+
+// Appends the preamble used by the generated shaders to `ss`: the `#version` line, the
+// RESOLUTION_SCALE / VRAM_SIZE / RCP_VRAM_SIZE constants (built from m_resolution_scale and the
+// GPU VRAM dimensions), and the fixYCoord / RGBA8ToRGBA5551 / RGBA5551ToRGBA8 helper functions.
+void GPU_HW_ShaderGen::GenerateShaderHeader(std::stringstream& ss)
+{
+  // Everything is written with one chained insertion; the pieces appear in the output in the
+  // order they are listed here.
+  ss << "#version 330 core\n\n"
+     << "const int RESOLUTION_SCALE = " << m_resolution_scale << ";\n"
+     << "const ivec2 VRAM_SIZE = ivec2(" << GPU::VRAM_WIDTH << ", " << GPU::VRAM_HEIGHT << ") * RESOLUTION_SCALE;\n"
+     << "const vec2 RCP_VRAM_SIZE = vec2(1.0, 1.0) / vec2(VRAM_SIZE);\n"
+     << R"(
+
+float fixYCoord(float y)
+{
+  return 1.0 - RCP_VRAM_SIZE.y - y;
+}
+
+int fixYCoord(int y)
+{
+  return VRAM_SIZE.y - y - 1;
+}
+
+uint RGBA8ToRGBA5551(vec4 v)
+{
+  uint r = uint(v.r * 255.0) >> 3;
+  uint g = uint(v.g * 255.0) >> 3;
+  uint b = uint(v.b * 255.0) >> 3;
+  uint a = (v.a != 0.0) ? 1u : 0u;
+  return (r) | (g << 5) | (b << 10) | (a << 15);
+}
+
+vec4 RGBA5551ToRGBA8(uint v)
+{
+  uint r = (v & 31u);
+  uint g = ((v >> 5) & 31u);
+  uint b = ((v >> 10) & 31u);
+  uint a = ((v >> 15) & 1u);
+
+  // repeat lower bits
+  r = (r << 3) | (r & 7u);
+  g = (g << 3) | (g & 7u);
+  b = (b << 3) | (b & 7u);
+
+  return vec4(float(r) / 255.0, float(g) / 255.0, float(b) / 255.0, float(a));
+}
+)";
+}
+
+// Appends the declaration of the `UBOBlock` uniform block to `ss`. The block carries the
+// position offset, the texture window mask and offset, and the source/destination alpha factors
+// referenced by the batch vertex and fragment shaders.
+void GPU_HW_ShaderGen::GenerateBatchUniformBuffer(std::stringstream& ss)
+{
+  static constexpr char uniform_block[] = R"(
+uniform UBOBlock {
+  ivec2 u_pos_offset;
+  uvec2 u_texture_window_mask;
+  uvec2 u_texture_window_offset;
+  float u_src_alpha_factor;
+  float u_dst_alpha_factor;
+};
+)";
+
+  ss << uniform_block;
+}
+
+// Builds and returns the source of the vertex shader used for batched primitives.
+//
+// `textured` is forwarded to DefineMacro for the TEXTURED macro, which gates the v_tex0 and
+// v_texpage outputs and the code that fills them. The source is assembled in this order: common
+// header, TEXTURED macro, uniform block, shader body.
+std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured)
+{
+  std::stringstream shader;
+
+  GenerateShaderHeader(shader);
+  DefineMacro(shader, "TEXTURED", textured);
+  GenerateBatchUniformBuffer(shader);
+
+  shader << R"(
+in ivec2 a_pos;
+in vec4 a_col0;
+in int a_texcoord;
+in int a_texpage;
+
+out vec3 v_col0;
+#if TEXTURED
+  out vec2 v_tex0;
+  flat out ivec4 v_texpage;
+#endif
+
+void main()
+{
+  // 0..+1023 -> -1..1
+  float pos_x = (float(a_pos.x + u_pos_offset.x) / 512.0) - 1.0;
+  float pos_y = (float(a_pos.y + u_pos_offset.y) / -256.0) + 1.0;
+  gl_Position = vec4(pos_x, pos_y, 0.0, 1.0);
+
+  v_col0 = a_col0.rgb;
+  #if TEXTURED
+    v_tex0 = vec2(float(a_texcoord & 0xFFFF), float(a_texcoord >> 16)) / vec2(255.0);
+
+    // base_x,base_y,palette_x,palette_y
+    v_texpage.x = (a_texpage & 15) * 64 * RESOLUTION_SCALE;
+    v_texpage.y = ((a_texpage >> 4) & 1) * 256 * RESOLUTION_SCALE;
+    v_texpage.z = ((a_texpage >> 16) & 63) * 16 * RESOLUTION_SCALE;
+    v_texpage.w = ((a_texpage >> 22) & 511) * RESOLUTION_SCALE;
+  #endif
+}
+)";
+
+  return shader.str();
+}
+
+// Builds and returns the source of the fragment shader used for batched primitives.
+//
+// transparency - render mode; drives the TRANSPARENCY, TRANSPARENCY_ONLY_OPAQUE and
+//                TRANSPARENCY_ONLY_TRANSPARENCY macros.
+// texture_mode - texture format, optionally combined with RawTextureBit; drives the TEXTURED,
+//                PALETTE, PALETTE_4_BIT, PALETTE_8_BIT and RAW_TEXTURE macros.
+// dithering    - drives the DITHERING macro.
+// The TRUE_COLOR macro comes from m_true_color. The source is assembled in this order: common
+// header, uniform block, feature macros, dither table, shader body.
+std::string GPU_HW_ShaderGen::GenerateBatchFragmentShader(GPU_HW::BatchRenderMode transparency,
+                                                     GPU::TextureMode texture_mode, bool dithering)
+{
+  // Separate the base texture format from the raw-texture flag carried in the same value.
+  const GPU::TextureMode base_mode = texture_mode & ~GPU::TextureMode::RawTextureBit;
+  const bool is_raw = (texture_mode & GPU::TextureMode::RawTextureBit) == GPU::TextureMode::RawTextureBit;
+  const bool is_palette_4bit = (base_mode == GPU::TextureMode::Palette4Bit);
+  const bool is_palette_8bit = (base_mode == GPU::TextureMode::Palette8Bit);
+
+  std::stringstream shader;
+  GenerateShaderHeader(shader);
+  GenerateBatchUniformBuffer(shader);
+
+  // Feature macros consumed by the #if blocks in the shader body below.
+  DefineMacro(shader, "TRANSPARENCY", transparency != GPU_HW::BatchRenderMode::TransparencyDisabled);
+  DefineMacro(shader, "TRANSPARENCY_ONLY_OPAQUE", transparency == GPU_HW::BatchRenderMode::OnlyOpaque);
+  DefineMacro(shader, "TRANSPARENCY_ONLY_TRANSPARENCY", transparency == GPU_HW::BatchRenderMode::OnlyTransparent);
+  DefineMacro(shader, "TEXTURED", base_mode != GPU::TextureMode::Disabled);
+  DefineMacro(shader, "PALETTE", is_palette_4bit || is_palette_8bit);
+  DefineMacro(shader, "PALETTE_4_BIT", is_palette_4bit);
+  DefineMacro(shader, "PALETTE_8_BIT", is_palette_8bit);
+  DefineMacro(shader, "RAW_TEXTURE", is_raw);
+  DefineMacro(shader, "DITHERING", dithering);
+  DefineMacro(shader, "TRUE_COLOR", m_true_color);
+
+  // Flatten the 4x4 dither matrix row by row into a 16-entry GLSL constant array.
+  shader << "const int[16] s_dither_values = int[16]( ";
+  for (u32 row = 0; row < 4; row++)
+  {
+    for (u32 col = 0; col < 4; col++)
+    {
+      if (row != 0 || col != 0)
+        shader << ", ";
+      shader << GPU::DITHER_MATRIX[row][col];
+    }
+  }
+  shader << " );\n";
+
+  shader << R"(
+in vec3 v_col0;
+#if TEXTURED
+  in vec2 v_tex0;
+  flat in ivec4 v_texpage;
+  uniform sampler2D samp0;
+#endif
+
+out vec4 o_col0;
+
+ivec3 ApplyDithering(ivec3 icol)
+{
+  ivec2 fc = (ivec2(gl_FragCoord.xy) / ivec2(RESOLUTION_SCALE, RESOLUTION_SCALE)) & ivec2(3, 3);
+  int offset = s_dither_values[fc.y * 4 + fc.x];
+  return icol + ivec3(offset, offset, offset);
+}
+
+ivec3 TruncateTo15Bit(ivec3 icol)
+{
+  icol = clamp(icol, ivec3(0, 0, 0), ivec3(255, 255, 255));
+  return (icol & ivec3(~7, ~7, ~7)) | ((icol >> 3) & ivec3(7, 7, 7));
+}
+
+#if TEXTURED
+ivec2 ApplyNativeTextureWindow(ivec2 coords)
+{
+  uint x = (uint(coords.x) & ~(u_texture_window_mask.x * 8u)) | ((u_texture_window_offset.x & u_texture_window_mask.x) * 8u);
+  uint y = (uint(coords.y) & ~(u_texture_window_mask.y * 8u)) | ((u_texture_window_offset.y & u_texture_window_mask.y) * 8u);
+  return ivec2(int(x), int(y));
+}  
+
+ivec2 ApplyTextureWindow(ivec2 coords)
+{
+  if (RESOLUTION_SCALE == 1)
+    return ApplyNativeTextureWindow(coords);
+
+  ivec2 downscaled_coords = coords / ivec2(RESOLUTION_SCALE);
+  ivec2 coords_offset = coords % ivec2(RESOLUTION_SCALE);
+  return (ApplyNativeTextureWindow(downscaled_coords) * ivec2(RESOLUTION_SCALE)) + coords_offset;
+}
+
+ivec4 SampleFromVRAM(vec2 coord)
+{
+  // from 0..1 to 0..255
+  ivec2 icoord = ivec2(coord * vec2(255 * RESOLUTION_SCALE));
+  icoord = ApplyTextureWindow(icoord);
+
+  // adjust for tightly packed palette formats
+  ivec2 index_coord = icoord;
+  #if PALETTE_4_BIT
+    index_coord.x /= 4;
+  #elif PALETTE_8_BIT
+    index_coord.x /= 2;
+  #endif
+
+  // fixup coords
+  ivec2 vicoord = ivec2(v_texpage.x + index_coord.x, fixYCoord(v_texpage.y + index_coord.y));
+
+  // load colour/palette
+  vec4 color = texelFetch(samp0, vicoord, 0);
+
+  // apply palette
+  #if PALETTE
+    #if PALETTE_4_BIT
+      int subpixel = int(icoord.x / RESOLUTION_SCALE) & 3;
+      uint vram_value = RGBA8ToRGBA5551(color);
+      int palette_index = int((vram_value >> (subpixel * 4)) & 0x0Fu);
+    #elif PALETTE_8_BIT
+      int subpixel = int(icoord.x / RESOLUTION_SCALE) & 1;
+      uint vram_value = RGBA8ToRGBA5551(color);
+      int palette_index = int((vram_value >> (subpixel * 8)) & 0xFFu);
+    #endif
+    ivec2 palette_icoord = ivec2(v_texpage.z + (palette_index * RESOLUTION_SCALE), fixYCoord(v_texpage.w));
+    color = texelFetch(samp0, palette_icoord, 0);
+  #endif
+
+  return ivec4(color * vec4(255.0, 255.0, 255.0, 255.0));
+}
+#endif
+
+void main()
+{
+  ivec3 vertcol = ivec3(v_col0 * vec3(255.0, 255.0, 255.0));
+
+  bool semitransparent;
+  bool new_mask_bit;
+  ivec3 icolor;
+
+  #if TEXTURED
+    ivec4 texcol = SampleFromVRAM(v_tex0);
+    if (texcol == ivec4(0.0, 0.0, 0.0, 0.0))
+      discard;
+
+    // Grab semitransparent bit from the texture color.
+    semitransparent = (texcol.a != 0);
+
+    #if RAW_TEXTURE
+      icolor = texcol.rgb;
+    #else
+      icolor = (vertcol * texcol.rgb) >> 7;
+    #endif
+  #else
+    // All pixels are semitransparent for untextured polygons.
+    semitransparent = true;
+    icolor = vertcol;
+  #endif
+
+  // Apply dithering
+  #if DITHERING
+    icolor = ApplyDithering(icolor);
+  #endif
+
+  // Clip to 15-bit range
+  #if !TRUE_COLOR
+    icolor = TruncateTo15Bit(icolor);
+  #endif
+
+  // Normalize
+  vec3 color = vec3(icolor) / vec3(255.0, 255.0, 255.0);
+
+  #if TRANSPARENCY
+    // Apply semitransparency. If not a semitransparent texel, destination alpha is ignored.
+    if (semitransparent)
+    {
+      #if TRANSPARENCY_ONLY_OPAQUE
+        discard;
+      #endif
+      o_col0 = vec4(color * u_src_alpha_factor, u_dst_alpha_factor);
+    }
+    else
+    {
+      #if TRANSPARENCY_ONLY_TRANSPARENCY
+        discard;
+      #endif
+      o_col0 = vec4(color, 0.0);
+    }
+  #else
+    o_col0 = vec4(color, 0.0);
+  #endif
+}
+)";
+
+  return shader.str();
+}
+
+// Builds and returns the source of a vertex shader that derives both the texture coordinate and
+// the clip-space position from gl_VertexID alone, so it declares no vertex inputs.
+std::string GPU_HW_ShaderGen::GenerateScreenQuadVertexShader()
+{
+  std::stringstream shader;
+  GenerateShaderHeader(shader);
+
+  shader << R"(
+
+out vec2 v_tex0;
+
+void main()
+{
+  v_tex0 = vec2(float((gl_VertexID << 1) & 2), float(gl_VertexID & 2));
+  gl_Position = vec4(v_tex0 * vec2(2.0f, -2.0f) + vec2(-1.0f, 1.0f), 0.0f, 1.0f);
+  gl_Position.y = -gl_Position.y;
+}
+)";
+
+  return shader.str();
+}
+
+// Builds and returns the source of a fragment shader that writes the `fill_color` uniform to
+// every fragment unchanged.
+std::string GPU_HW_ShaderGen::GenerateFillFragmentShader()
+{
+  std::stringstream shader;
+  GenerateShaderHeader(shader);
+
+  shader << R"(
+uniform vec4 fill_color;
+out vec4 o_col0;
+
+void main()
+{
+  o_col0 = fill_color;
+}
+)";
+
+  return shader.str();
+}
+
+// Builds and returns the source of the fragment shader that reads display output from `samp0`.
+//
+// depth_24bit - drives the DEPTH_24BIT macro: when set, each output pixel is reassembled from
+//               packed 24-bit data spread across adjacent 16-bit texels; otherwise the texel at
+//               u_base_coords.xy + fragment coordinate is returned directly.
+// interlaced  - drives the INTERLACED macro: when set, fragments on alternate scanlines
+//               (relative to u_base_coords.z, in native-resolution lines) are discarded.
+std::string GPU_HW_ShaderGen::GenerateDisplayFragmentShader(bool depth_24bit, bool interlaced)
+{
+  std::stringstream shader;
+
+  GenerateShaderHeader(shader);
+  DefineMacro(shader, "DEPTH_24BIT", depth_24bit);
+  DefineMacro(shader, "INTERLACED", interlaced);
+
+  shader << R"(
+in vec2 v_tex0;
+out vec4 o_col0;
+
+uniform sampler2D samp0;
+uniform ivec3 u_base_coords;
+
+ivec2 GetCoords(vec2 fragcoord)
+{
+  ivec2 icoords = ivec2(fragcoord);
+  #if INTERLACED
+    if ((((icoords.y - u_base_coords.z) / RESOLUTION_SCALE) & 1) != 0)
+      discard;
+  #endif
+  return icoords;
+}
+
+void main()
+{
+  ivec2 icoords = GetCoords(gl_FragCoord.xy);
+
+  #if DEPTH_24BIT
+    // compute offset in dwords from the start of the 24-bit values
+    ivec2 base = ivec2(u_base_coords.x, u_base_coords.y + icoords.y);
+    int xoff = int(icoords.x);
+    int dword_index = (xoff / 2) + (xoff / 4);
+
+    // sample two adjacent dwords, or four 16-bit values as the 24-bit value will lie somewhere between these
+    uint s0 = RGBA8ToRGBA5551(texelFetch(samp0, ivec2(base.x + dword_index * 2 + 0, base.y), 0));
+    uint s1 = RGBA8ToRGBA5551(texelFetch(samp0, ivec2(base.x + dword_index * 2 + 1, base.y), 0));
+    uint s2 = RGBA8ToRGBA5551(texelFetch(samp0, ivec2(base.x + (dword_index + 1) * 2 + 0, base.y), 0));
+    uint s3 = RGBA8ToRGBA5551(texelFetch(samp0, ivec2(base.x + (dword_index + 1) * 2 + 1, base.y), 0));
+
+    // select the bit for this pixel depending on its offset in the 4-pixel block
+    uint r, g, b;
+    int block_offset = xoff & 3;
+    if (block_offset == 0)
+    {
+      r = s0 & 0xFFu;
+      g = s0 >> 8;
+      b = s1 & 0xFFu;
+    }
+    else if (block_offset == 1)
+    {
+      r = s1 >> 8;
+      g = s2 & 0xFFu;
+      b = s2 >> 8;
+    }
+    else if (block_offset == 2)
+    {
+      r = s1 & 0xFFu;
+      g = s1 >> 8;
+      b = s2 & 0xFFu;
+    }
+    else
+    {
+      r = s2 >> 8;
+      g = s3 & 0xFFu;
+      b = s3 >> 8;
+    }
+
+    // and normalize
+    o_col0 = vec4(float(r) / 255.0, float(g) / 255.0, float(b) / 255.0, 1.0);
+  #else
+    // load and return
+    o_col0 = texelFetch(samp0, u_base_coords.xy + icoords, 0);
+  #endif
+}
+)";
+
+  return shader.str();
+}
+
+// Builds and returns the source of the fragment shader that fills a VRAM rectangle from a
+// buffer texture. Each fragment maps its native-resolution coordinate to an index inside the
+// u_size rectangle (rows flipped vertically), fetches the unsigned value at that index from
+// `samp0` and expands it with RGBA5551ToRGBA8.
+std::string GPU_HW_ShaderGen::GenerateVRAMWriteFragmentShader()
+{
+  std::stringstream shader;
+  GenerateShaderHeader(shader);
+
+  shader << R"(
+
+uniform ivec2 u_base_coords;
+uniform ivec2 u_size;
+uniform usamplerBuffer samp0;
+
+out vec4 o_col0;
+
+void main()
+{
+  ivec2 coords = ivec2(gl_FragCoord.xy) / ivec2(RESOLUTION_SCALE, RESOLUTION_SCALE);
+  ivec2 offset = coords - u_base_coords;
+  offset.y = u_size.y - offset.y - 1;
+
+  int buffer_offset = offset.y * u_size.x + offset.x;
+  uint value = texelFetch(samp0, buffer_offset).r;
+  
+  o_col0 = RGBA5551ToRGBA8(value);
+})";
+
+  return shader.str();
+}
--- a/src/core/gpu_hw_shadergen.h
+++ b/src/core/gpu_hw_shadergen.h
@ -0,0 +1,34 @@
+#pragma once
+#include <sstream>
+#include <string>
+#include "gpu_hw.h"
+
+// Generates the GLSL source strings used by the hardware GPU renderer. Each Generate* method
+// returns a complete shader source as a std::string; the settings supplied at construction are
+// stored in the m_* members below.
+class GPU_HW_ShaderGen
+{
+public:
+  // Target shading backend. OpenGL is the only value declared here.
+  enum class Backend
+  {
+    OpenGL
+  };
+
+public:
+  // Stores the backend, resolution scale and true-colour flag in the corresponding members.
+  GPU_HW_ShaderGen(Backend backend, u32 resolution_scale, bool true_color);
+  ~GPU_HW_ShaderGen();
+
+  // NOTE(review): no definition of Init appears in gpu_hw_shadergen.cpp as added by this
+  // change; confirm it is defined elsewhere before calling it, or drop the declaration.
+  void Init(Backend backend, u32 resolution_scale, bool true_color);
+
+  // Vertex shader for batched primitives; `textured` toggles the TEXTURED macro.
+  std::string GenerateBatchVertexShader(bool textured);
+  // Fragment shader for batched primitives, specialised by render mode, texture mode and dithering.
+  std::string GenerateBatchFragmentShader(GPU_HW::BatchRenderMode transparency, GPU::TextureMode texture_mode, bool dithering);
+  // Vertex shader that derives position and texture coordinate from gl_VertexID.
+  std::string GenerateScreenQuadVertexShader();
+  // Fragment shader that outputs the `fill_color` uniform.
+  std::string GenerateFillFragmentShader();
+  // Fragment shader for display output; the flags toggle the DEPTH_24BIT and INTERLACED macros.
+  std::string GenerateDisplayFragmentShader(bool depth_24bit, bool interlaced);
+  // Fragment shader that fills VRAM from a buffer texture of 16-bit RGBA5551 values.
+  std::string GenerateVRAMWriteFragmentShader();
+
+  // NOTE(review): these members are declared before the private: label and are therefore
+  // publicly accessible; confirm that is intended.
+  Backend m_backend;
+  // Written into every shader as the RESOLUTION_SCALE constant.
+  u32 m_resolution_scale;
+  // Drives the TRUE_COLOR macro of the batch fragment shader.
+  bool m_true_color;
+
+private:
+  // Appends the #version line, shared constants and helper functions.
+  void GenerateShaderHeader(std::stringstream& ss);
+  // Appends the UBOBlock uniform block declaration used by the batch shaders.
+  void GenerateBatchUniformBuffer(std::stringstream& ss);
+};