diff --git a/src/core/gpu_hw_shadergen.cpp b/src/core/gpu_hw_shadergen.cpp
index 2379e9a75..06e971ad2 100644
--- a/src/core/gpu_hw_shadergen.cpp
+++ b/src/core/gpu_hw_shadergen.cpp
@@ -637,13 +637,22 @@ float4 SampleFromVRAM(uint4 texpage, uint2 icoord)
     #endif
   #endif
 
-  // Clip to 15-bit range
-  #if !TRUE_COLOR
-    icolor = TruncateTo15Bit(icolor);
+  // Premultiply alpha so we don't need to use a colour output for it.
+  float premultiply_alpha = ialpha;
+  #if TRANSPARENCY
+    premultiply_alpha = ialpha * (semitransparent ? u_src_alpha_factor : 1.0);
   #endif
 
-  // Normalize
-  float3 color = float3(icolor) / float3(255.0, 255.0, 255.0);
+  float3 color;
+  #if !TRUE_COLOR
+    // We want to apply the alpha before the truncation to 15-bit (5 bits per channel), otherwise we'll be passing a
+    // 32-bit precision color into the blend unit, which can cause a small amount of error to accumulate.
+    icolor = int3(((float3(icolor) / float3(255.0, 255.0, 255.0)) * premultiply_alpha) * float3(255.0, 255.0, 255.0));
+    color = (float3(icolor >> 3) / float3(31.0, 31.0, 31.0));
+  #else
+    // True color is actually simpler here since we want to preserve the precision.
+    color = (float3(icolor) / float3(255.0, 255.0, 255.0)) * premultiply_alpha;
+  #endif
 
   #if TRANSPARENCY
     // Apply semitransparency. If not a semitransparent texel, destination alpha is ignored.
@@ -654,10 +663,10 @@ float4 SampleFromVRAM(uint4 texpage, uint2 icoord)
       #endif
 
       #if USE_DUAL_SOURCE
-        o_col0 = float4(color * (u_src_alpha_factor * ialpha), oalpha);
+        o_col0 = float4(color, oalpha);
         o_col1 = float4(0.0, 0.0, 0.0, u_dst_alpha_factor / ialpha);
       #else
-        o_col0 = float4(color * (u_src_alpha_factor * ialpha), u_dst_alpha_factor / ialpha);
+        o_col0 = float4(color, u_dst_alpha_factor / ialpha);
       #endif
     }
     else
@@ -667,15 +676,15 @@ float4 SampleFromVRAM(uint4 texpage, uint2 icoord)
       #endif
 
       #if USE_DUAL_SOURCE
-        o_col0 = float4(color * ialpha, oalpha);
+        o_col0 = float4(color, oalpha);
         o_col1 = float4(0.0, 0.0, 0.0, 0.0);
       #else
-        o_col0 = float4(color * ialpha, 1.0 - ialpha);
+        o_col0 = float4(color, 1.0 - ialpha);
       #endif
     }
   #else
     // Non-transparency won't enable blending so we can write the mask here regardless.
-    o_col0 = float4(color * ialpha, oalpha);
+    o_col0 = float4(color, oalpha);
 
     #if USE_DUAL_SOURCE
       o_col1 = float4(0.0, 0.0, 0.0, 1.0 - ialpha);