nv2a: Perspective-correct interpolation for w-buffering

When z_perspective is true, w-buffering is in use, and the w-coordinate stored in the depth buffer should also be interpolated in a perspective-correct way. We do this by calculating w and setting gl_FragDepth in the fragment shader. Since enabling polygon offset and setting values using glPolygonOffset won't have any effect when manually setting gl_FragDepth for w-buffering, we introduce the depthOffset variable to obtain similar behaviour (but the glPolygonOffset factor argument is currently not emulated). (Note that glPolygonOffset is OpenGL implementation-dependent and it might be good to use depthOffset for z-buffering as well, but this is not done here and we still use the OpenGL/Vulkan zbias functionality.) This also implements depth clipping and clamping in the fragment shader. If triangles are clipped, the shadows of the small rocks in the Halo 2 Beaver Creek map can have flickering horizontal lines. The shadows are drawn on the ground in another pass with the same models as for the ground, but for some reason with depth clamping enabled. The flickering happens when Xemu clips the ground triangles while the exact same shadow triangles are depth clamped, so there are small differences in the coordinates. The shadows are drawn with depth function GL_EQUAL, so there is no tolerance for any differences. Clipping in the fragment shader solves the problem because the ground and shadow triangles remain exactly the same regardless of depth clipping/clamping. For some performance gain, it might be a good idea to cull triangles by depth in the geometry shader, but this is not implemented here. In the programmable vertex shader we always multiply the position output by w because this improves numerical stability in subsequent floating point computations by modern GPUs. This usually means that the perspective divide done by the vertex program gets undone. The magic bounding constants 5.42101e-020 and 1.884467e+019 are replaced by 5.421011e-20 and 1.8446744e19, i.e. more decimals are added (and the old upper bound, which read 1.88... rather than 1.84..., is corrected).
This makes the 32-bit floating point numbers represent exactly 2^(-64) and 2^64 (raw bits 0x1f800000 and 0x5f800000), which seem more likely to be the correct values, although testing on hardware was not done to this precision. Testing indicates that the same RCC instruction magic constants are also applied to both the fixed function and the programmable vertex shader w-coordinate output. This bounding replaces the special test for w==0.0 and abs(w)==inf, which used to set vtx_inv_w=1.0 (and did not match Xbox hardware behaviour).
2025-02-28 20:28:19 +02:00 · 2025-02-28 20:28:19 +02:00 · 798ad30819
parent 45078ef51f
commit 798ad30819
15 changed files with 210 additions and 151 deletions
--- a/hw/xbox/nv2a/pgraph/gl/draw.c
+++ b/hw/xbox/nv2a/pgraph/gl/draw.c
@ -203,10 +203,6 @@ void pgraph_gl_draw_begin(NV2AState *d)
        glDisable(GL_CULL_FACE);
    }

-    /* Clipping */
-    glEnable(GL_CLIP_DISTANCE0);
-    glEnable(GL_CLIP_DISTANCE1);
-
    /* Front-face select */
    glFrontFace(pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER)
                    & NV_PGRAPH_SETUPRASTER_FRONTFACE
@ -240,6 +236,8 @@ void pgraph_gl_draw_begin(NV2AState *d)
        GLfloat zfactor = *(float*)&zfactor_u32;
        uint32_t zbias_u32 = pgraph_reg_r(pg, NV_PGRAPH_ZOFFSETBIAS);
        GLfloat zbias = *(float*)&zbias_u32;
+        // FIXME: with Linux and Mesa, zbias must be multiplied by 0.5 in
+        // order to have the same depth value offset as Xbox.
        glPolygonOffset(zfactor, zbias);
    }

@ -255,13 +253,7 @@ void pgraph_gl_draw_begin(NV2AState *d)
        glDisable(GL_DEPTH_TEST);
    }

-    if (GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_ZCOMPRESSOCCLUDE),
-                 NV_PGRAPH_ZCOMPRESSOCCLUDE_ZCLAMP_EN) ==
-        NV_PGRAPH_ZCOMPRESSOCCLUDE_ZCLAMP_EN_CLAMP) {
-        glEnable(GL_DEPTH_CLAMP);
-    } else {
-        glDisable(GL_DEPTH_CLAMP);
-    }
+    glEnable(GL_DEPTH_CLAMP);

    if (GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CONTROL_3),
                 NV_PGRAPH_CONTROL_3_SHADEMODE) ==
--- a/hw/xbox/nv2a/pgraph/gl/renderer.h
+++ b/hw/xbox/nv2a/pgraph/gl/renderer.h
@ -106,6 +106,7 @@ typedef struct ShaderBinding {

    GLint surface_size_loc;
    GLint clip_range_loc;
+    GLint depth_offset_loc;

    GLint vsh_constant_loc[NV2A_VERTEXSHADER_CONSTANTS];
    uint32_t vsh_constants[NV2A_VERTEXSHADER_CONSTANTS][4];
--- a/hw/xbox/nv2a/pgraph/gl/shaders.c
+++ b/hw/xbox/nv2a/pgraph/gl/shaders.c
@ -154,6 +154,7 @@ static void update_shader_constant_locations(ShaderBinding *binding)
    }
    binding->surface_size_loc = glGetUniformLocation(binding->gl_program, "surfaceSize");
    binding->clip_range_loc = glGetUniformLocation(binding->gl_program, "clipRange");
+    binding->depth_offset_loc = glGetUniformLocation(binding->gl_program, "depthOffset");
    binding->fog_color_loc = glGetUniformLocation(binding->gl_program, "fogColor");
    binding->fog_param_loc = glGetUniformLocation(binding->gl_program, "fogParam");

@ -886,11 +887,36 @@ static void shader_update_constants(PGRAPHState *pg, ShaderBinding *binding,
        uint32_t v[2];
        v[0] = pgraph_reg_r(pg, NV_PGRAPH_ZCLIPMIN);
        v[1] = pgraph_reg_r(pg, NV_PGRAPH_ZCLIPMAX);
-        float zclip_min = *(float*)&v[0] / zmax * 2.0 - 1.0;
-        float zclip_max = *(float*)&v[1] / zmax * 2.0 - 1.0;
+        float zclip_min = *(float *)&v[0];
+        float zclip_max = *(float *)&v[1];
        glUniform4f(binding->clip_range_loc, 0, zmax, zclip_min, zclip_max);
    }

+    if (binding->depth_offset_loc != -1) {
+        float zbias = 0.0f;
+
+        if (pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER) &
+            (NV_PGRAPH_SETUPRASTER_POFFSETFILLENABLE |
+             NV_PGRAPH_SETUPRASTER_POFFSETLINEENABLE |
+             NV_PGRAPH_SETUPRASTER_POFFSETPOINTENABLE)) {
+            uint32_t zbias_u32 = pgraph_reg_r(pg, NV_PGRAPH_ZOFFSETBIAS);
+            zbias = *(float *)&zbias_u32;
+
+            if (pgraph_reg_r(pg, NV_PGRAPH_ZOFFSETFACTOR) != 0 &&
+                (pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0) &
+                 NV_PGRAPH_CONTROL_0_Z_PERSPECTIVE_ENABLE)) {
+                /* TODO: emulate zfactor when z_perspective true, i.e.
+                 * w-buffering. Perhaps calculate an additional offset based on
+                 * triangle orientation in geometry shader and pass the result
+                 * to fragment shader and add it to gl_FragDepth as well.
+                 */
+                NV2A_UNIMPLEMENTED("NV_PGRAPH_ZOFFSETFACTOR for w-buffering");
+            }
+        }
+
+        glUniform1f(binding->depth_offset_loc, zbias);
+    }
+
    /* Clipping regions */
    unsigned int max_gl_width = pg->surface_binding_dim.width;
    unsigned int max_gl_height = pg->surface_binding_dim.height;
@ -956,6 +982,7 @@ static bool test_shaders_dirty(PGRAPHState *pg)
        CR_1(NV_PGRAPH_CSV1_B) \
        CR_1(NV_PGRAPH_SETUPRASTER) \
        CR_1(NV_PGRAPH_SHADERPROG) \
+        CR_1(NV_PGRAPH_ZCOMPRESSOCCLUDE) \
        CR_8(NV_PGRAPH_COMBINECOLORI0) \
        CR_8(NV_PGRAPH_COMBINECOLORO0) \
        CR_8(NV_PGRAPH_COMBINEALPHAI0) \
--- a/hw/xbox/nv2a/pgraph/glsl/common.c
+++ b/hw/xbox/nv2a/pgraph/glsl/common.c
@ -23,34 +23,32 @@

 MString *pgraph_get_glsl_vtx_header(MString *out, bool location, bool smooth, bool in, bool prefix, bool array)
 {
-    const char *flat_s = "flat";
-    const char *noperspective_s = "noperspective";
-    const char *qualifier_s = smooth ? noperspective_s : flat_s;
-    const char *qualifiers[11] = {
-        noperspective_s, flat_s,          qualifier_s,     qualifier_s,
-        qualifier_s,     qualifier_s,     noperspective_s, noperspective_s,
-        noperspective_s, noperspective_s, noperspective_s
-    };
+    const char *flat_s = "flat ";
+    const char *smooth_s = "";
+    const char *qualifier_s = smooth ? smooth_s : flat_s;
+    const char *qualifiers[9] = { qualifier_s, qualifier_s, qualifier_s,
+                                  qualifier_s, smooth_s,    smooth_s,
+                                  smooth_s,    smooth_s,    smooth_s };

    const char *in_out_s = in ? "in" : "out";

    const char *float_s = "float";
    const char *vec4_s = "vec4";
-    const char *types[11] = { float_s, float_s, vec4_s, vec4_s, vec4_s, vec4_s,
-                              float_s, vec4_s,  vec4_s, vec4_s, vec4_s };
+    const char *types[9] = { vec4_s, vec4_s, vec4_s, vec4_s, float_s,
+                             vec4_s, vec4_s, vec4_s, vec4_s };

    const char *prefix_s = prefix ? "v_" : "";
-    const char *names[11] = {
-        "vtx_inv_w", "vtx_inv_w_flat", "vtxD0", "vtxD1", "vtxB0", "vtxB1",
-        "vtxFog",    "vtxT0",          "vtxT1", "vtxT2", "vtxT3",
+    const char *names[9] = {
+        "vtxD0", "vtxD1", "vtxB0", "vtxB1", "vtxFog",
+        "vtxT0", "vtxT1", "vtxT2", "vtxT3",
    };
    const char *suffix_s = array ? "[]" : "";

-    for (int i = 0; i < 11; i++) {
+    for (int i = 0; i < 9; i++) {
        if (location) {
            mstring_append_fmt(out, "layout(location = %d) ", i);
        }
-        mstring_append_fmt(out, "%s %s %s %s%s%s;\n",
+        mstring_append_fmt(out, "%s%s %s %s%s%s;\n",
            qualifiers[i], in_out_s, types[i], prefix_s, names[i], suffix_s);
    }

--- a/hw/xbox/nv2a/pgraph/glsl/geom.c
+++ b/hw/xbox/nv2a/pgraph/glsl/geom.c
@ -182,10 +182,6 @@ MString *pgraph_gen_geom_glsl(enum ShaderPolygonMode polygon_front_mode,
                       "void emit_vertex(int index, int _unused) {\n"
                       "  gl_Position = gl_in[index].gl_Position;\n"
                       "  gl_PointSize = gl_in[index].gl_PointSize;\n"
-                       // "  gl_ClipDistance[0] = gl_in[index].gl_ClipDistance[0];\n"
-                       // "  gl_ClipDistance[1] = gl_in[index].gl_ClipDistance[1];\n"
-                       "  vtx_inv_w = v_vtx_inv_w[index];\n"
-                       "  vtx_inv_w_flat = v_vtx_inv_w[index];\n"
                       "  vtxD0 = v_vtxD0[index];\n"
                       "  vtxD1 = v_vtxD1[index];\n"
                       "  vtxB0 = v_vtxB0[index];\n"
@ -202,10 +198,6 @@ MString *pgraph_gen_geom_glsl(enum ShaderPolygonMode polygon_front_mode,
                       "void emit_vertex(int index, int provoking_index) {\n"
                       "  gl_Position = gl_in[index].gl_Position;\n"
                       "  gl_PointSize = gl_in[index].gl_PointSize;\n"
-                       // "  gl_ClipDistance[0] = gl_in[index].gl_ClipDistance[0];\n"
-                       // "  gl_ClipDistance[1] = gl_in[index].gl_ClipDistance[1];\n"
-                       "  vtx_inv_w = v_vtx_inv_w[index];\n"
-                       "  vtx_inv_w_flat = v_vtx_inv_w[provoking_index];\n"
                       "  vtxD0 = v_vtxD0[provoking_index];\n"
                       "  vtxD1 = v_vtxD1[provoking_index];\n"
                       "  vtxB0 = v_vtxB0[provoking_index];\n"
--- a/hw/xbox/nv2a/pgraph/glsl/psh.c
+++ b/hw/xbox/nv2a/pgraph/glsl/psh.c
@ -745,8 +745,10 @@ static MString* psh_convert(struct PixelShader *ps)

    mstring_append_fmt(preflight, "%sint alphaRef;\n"
                                  "%svec4  fogColor;\n"
-                                  "%sivec4 clipRegion[8];\n",
-                                  u, u, u);
+                                  "%sivec4 clipRegion[8];\n"
+                                  "%svec4  clipRange;\n"
+                                  "%sfloat depthOffset;\n",
+                                  u, u, u, u, u);
    for (int i = 0; i < 4; i++) {
        mstring_append_fmt(preflight, "%smat2  bumpMat%d;\n"
                                      "%sfloat bumpScale%d;\n"
@ -861,28 +863,62 @@ static MString* psh_convert(struct PixelShader *ps)
                             "}\n");
    }

-    /* calculate perspective-correct inputs */
-    MString *vars = mstring_new();
-    if (ps->state.smooth_shading) {
-        mstring_append(vars, "vec4 pD0 = vtxD0 / vtx_inv_w;\n");
-        mstring_append(vars, "vec4 pD1 = vtxD1 / vtx_inv_w;\n");
-        mstring_append(vars, "vec4 pB0 = vtxB0 / vtx_inv_w;\n");
-        mstring_append(vars, "vec4 pB1 = vtxB1 / vtx_inv_w;\n");
-    } else {
-        mstring_append(vars, "vec4 pD0 = vtxD0 / vtx_inv_w_flat;\n");
-        mstring_append(vars, "vec4 pD1 = vtxD1 / vtx_inv_w_flat;\n");
-        mstring_append(vars, "vec4 pB0 = vtxB0 / vtx_inv_w_flat;\n");
-        mstring_append(vars, "vec4 pB1 = vtxB1 / vtx_inv_w_flat;\n");
+    /* Depth clipping */
+    if (ps->state.depth_clipping) {
+        if (ps->state.z_perspective) {
+            mstring_append(
+                clip, "float zvalue = 1.0/gl_FragCoord.w + depthOffset;\n"
+                      "if (zvalue < clipRange.z || clipRange.w < zvalue) {\n"
+                      "  discard;\n"
+                      "}\n");
+        } else {
+            /* Take care of floating point precision problems. MS dashboard
+             * outputs exactly 0.0 z-coordinates and then our fixed function
+             * vertex shader outputs -w as the z-coordinate when OpenGL is
+             * used. Since -w/w = -1, this should give us exactly 0.0 as
+             * gl_FragCoord.z here. Unfortunately, with AMD Radeon RX 6600 the
+             * result is slightly greater than 0. MS dashboard sets the clip
+             * range to [0.0, 0.0] and so the imprecision causes unwanted
+             * clipping. Note that since Vulkan uses NDC range [0,1] it
+             * doesn't suffer from this problem with Radeon. Also, despite the
+             * imprecision OpenGL Radeon writes the correct value 0 to the depth
+             * buffer (if writing is enabled.) Radeon appears to write floored
+             * values. To compare, Intel integrated UHD 770 has gl_FragCoord.z
+             * exactly 0 (and writes rounded to closest integer values to the
+             * depth buffer.) Radeon OpenGL problem could also be fixed by using
+             * glClipControl(), but it requires OpenGL 4.5.
+             * Above is based on experiments with Linux and Mesa.
+             */
+            if (ps->state.vulkan) {
+                mstring_append(
+                    clip, "if (gl_FragCoord.z*clipRange.y < clipRange.z ||\n"
+                          "    gl_FragCoord.z*clipRange.y > clipRange.w) {\n"
+                          "  discard;\n"
+                          "}\n");
+            } else {
+                mstring_append(
+                    clip, "if ((gl_FragCoord.z + 1.0f/16777216.0f)*clipRange.y < clipRange.z ||\n"
+                          "    (gl_FragCoord.z - 1.0f/16777216.0f)*clipRange.y > clipRange.w) {\n"
+                          "  discard;\n"
+                          "}\n");
+            }
+        }
    }
-    mstring_append(vars, "vec4 pFog = vec4(fogColor.rgb, clamp(vtxFog / vtx_inv_w, 0.0, 1.0));\n");
-    mstring_append(vars, "vec4 pT0 = vtxT0 / vtx_inv_w;\n");
-    mstring_append(vars, "vec4 pT1 = vtxT1 / vtx_inv_w;\n");
-    mstring_append(vars, "vec4 pT2 = vtxT2 / vtx_inv_w;\n");
+
+    MString *vars = mstring_new();
+    mstring_append(vars, "vec4 pD0 = vtxD0;\n");
+    mstring_append(vars, "vec4 pD1 = vtxD1;\n");
+    mstring_append(vars, "vec4 pB0 = vtxB0;\n");
+    mstring_append(vars, "vec4 pB1 = vtxB1;\n");
+    mstring_append(vars, "vec4 pFog = vec4(fogColor.rgb, clamp(vtxFog, 0.0, 1.0));\n");
+    mstring_append(vars, "vec4 pT0 = vtxT0;\n");
+    mstring_append(vars, "vec4 pT1 = vtxT1;\n");
+    mstring_append(vars, "vec4 pT2 = vtxT2;\n");
    if (ps->state.point_sprite) {
        assert(!ps->state.rect_tex[3]);
        mstring_append(vars, "vec4 pT3 = vec4(gl_PointCoord, 1.0, 1.0);\n");
    } else {
-        mstring_append(vars, "vec4 pT3 = vtxT3 / vtx_inv_w;\n");
+        mstring_append(vars, "vec4 pT3 = vtxT3;\n");
    }
    mstring_append(vars, "\n");
    mstring_append(vars, "vec4 v0 = pD0;\n");
@ -1208,6 +1244,23 @@ static MString* psh_convert(struct PixelShader *ps)
        }
    }

+    if (ps->state.z_perspective) {
+        if (!ps->state.depth_clipping) {
+            mstring_append(ps->code,
+                           "float zvalue = 1.0/gl_FragCoord.w + depthOffset;\n");
+        }
+        /* TODO: With integer depth buffers Xbox hardware floors values and so
+         * does Radeon, but Intel UHD 770 rounds to nearest. Should probably
+         * floor here explicitly (in some way that doesn't also cause
+         * imprecision issues due to division by clipRange.y)
+         */
+        mstring_append(ps->code,
+                       "gl_FragDepth = clamp(zvalue, clipRange.z, clipRange.w)/clipRange.y;\n");
+    } else if (!ps->state.depth_clipping) {
+        mstring_append(ps->code,
+                       "gl_FragDepth = clamp(gl_FragCoord.z, clipRange.z/clipRange.y, clipRange.w/clipRange.y);\n");
+    }
+
    MString *final = mstring_new();
    mstring_append_fmt(final, "#version %d\n\n", ps->state.vulkan ? 450 : 400);
    mstring_append(final, mstring_get_str(preflight));
--- a/hw/xbox/nv2a/pgraph/glsl/vsh-ff.c
+++ b/hw/xbox/nv2a/pgraph/glsl/vsh-ff.c
@ -422,12 +422,11 @@ GLSL_DEFINE(materialEmissionColor, GLSL_LTCTXA(NV_IGRAPH_XF_LTCTXA_CM_COL) ".xyz

    mstring_append(body,
    "   oPos = invViewport * (tPosition * compositeMat);\n"
+    "   oPos.w = (2.0f * step(0.0f, oPos.w) - 1.0f) * clamp(abs(oPos.w), 5.421011e-20, 1.8446744e19);\n"
    );

    if (state->vulkan) {
        mstring_append(body, "   oPos.y *= -1;\n");
-    } else {
-        mstring_append(body, "   oPos.z = oPos.z * 2.0 - oPos.w;\n");
    }

    /* FIXME: Testing */
@ -445,14 +444,6 @@ GLSL_DEFINE(materialEmissionColor, GLSL_LTCTXA(NV_IGRAPH_XF_LTCTXA_CM_COL) ".xyz
        mstring_append_fmt(body, "  oPts.x = %f * %d;\n", state->point_size,
                           state->surface_scale_factor);
    }
-
-    mstring_append(body,
-                   "  if (oPos.w == 0.0 || isinf(oPos.w)) {\n"
-                   "    vtx_inv_w = 1.0;\n"
-                   "  } else {\n"
-                   "    vtx_inv_w = 1.0 / oPos.w;\n"
-                   "  }\n"
-                   "  vtx_inv_w_flat = vtx_inv_w;\n");
 }

 static void append_skinning_code(MString* str, bool mix,
--- a/hw/xbox/nv2a/pgraph/glsl/vsh-prog.c
+++ b/hw/xbox/nv2a/pgraph/glsl/vsh-prog.c
@ -735,13 +735,8 @@ static const char* vsh_header =
    "#define RCC(dest, mask, src) dest.mask = _RCC(_in(src).x).mask\n"
    "vec4 _RCC(float src)\n"
    "{\n"
-    "  float t = 1.0 / src;\n"
-    "  if (t > 0.0) {\n"
-    "    t = clamp(t, 5.42101e-020, 1.884467e+019);\n"
-    "  } else {\n"
-    "    t = clamp(t, -1.884467e+019, -5.42101e-020);\n"
-    "  }\n"
-    "  return vec4(t);\n"
+    "  src = (2.0f * step(0.0f, src) - 1.0f) * clamp(abs(src), 5.421011e-20, 1.8446744e19);\n"
+    "  return vec4(1.0 / src);\n"
    "}\n"
    "\n"
    "#define RSQ(dest, mask, src) dest.mask = _RSQ(_in(src).x).mask\n"
@ -797,7 +792,6 @@ static const char* vsh_header =
 void pgraph_gen_vsh_prog_glsl(uint16_t version,
                   const uint32_t *tokens,
                   unsigned int length,
-                   bool z_perspective,
                   bool vulkan,
                   MString *header, MString *body)
 {
@ -826,18 +820,6 @@ void pgraph_gen_vsh_prog_glsl(uint16_t version,
    }
    assert(has_final);

-    /* pre-divide and output the generated W so we can do persepctive correct
-     * interpolation manually. OpenGL can't, since we give it a W of 1 to work
-     * around the perspective divide */
-    mstring_append(body,
-        "  if (oPos.w == 0.0 || isinf(oPos.w)) {\n"
-        "    vtx_inv_w = 1.0;\n"
-        "  } else {\n"
-        "    vtx_inv_w = 1.0 / oPos.w;\n"
-        "  }\n"
-        "  vtx_inv_w_flat = vtx_inv_w;\n"
-    );
-
    mstring_append(body,
        /* the shaders leave the result in screen space, while
         * opengl expects it in clip space.
@ -854,32 +836,17 @@ void pgraph_gen_vsh_prog_glsl(uint16_t version,
                             "/ surfaceSize.y;\n");
    }

-    if (z_perspective) {
-        mstring_append(body, "  oPos.z = oPos.w;\n");
-    }
-
    mstring_append(body,
-        "  if (clipRange.y != clipRange.x) {\n");
-    if (vulkan) {
-        mstring_append(body, "      oPos.z /= clipRange.y;\n");
-    } else {
-        mstring_append(body,
-                       "    oPos.z = (oPos.z - clipRange.x)/(0.5*(clipRange.y "
-                       "- clipRange.x)) - 1;\n");
-    }
-    mstring_append(body,
-        "  }\n"
+        "  oPos.z = oPos.z / clipRange.y;\n"
+        "  oPos.w = (2.0f * step(0.0f, oPos.w) - 1.0f) * clamp(abs(oPos.w), 5.421011e-20, 1.8446744e19);\n"

-        /* Correct for the perspective divide */
-        "  if (oPos.w < 0.0) {\n"
-            /* undo the perspective divide in the case where the point would be
-             * clipped so opengl can clip it correctly */
-        "    oPos.xyz *= oPos.w;\n"
-        "  } else {\n"
-            /* we don't want the OpenGL perspective divide to happen, but we
-             * can't multiply by W because it could be meaningless here */
-        "    oPos.w = 1.0;\n"
-        "  }\n"
+        /* Undo perspective divide by w.
+         * Note that games may also have vertex shaders that do
+         * not divide by w (such as 2D-graphics menus or overlays), but since
+         * OpenGL will later on divide by the same w, we get back the same
+         * screen space coordinates (perhaps with some loss of floating point
+         * precision, though.)
+         */
+        "  oPos.xyz *= oPos.w;\n"
    );
-
 }
--- a/hw/xbox/nv2a/pgraph/glsl/vsh-prog.h
+++ b/hw/xbox/nv2a/pgraph/glsl/vsh-prog.h
@ -29,7 +29,7 @@
 #define HW_XBOX_NV2A_PGRAPH_GLSL_VSH_PROG_H

 void pgraph_gen_vsh_prog_glsl(uint16_t version, const uint32_t *tokens,
-                              unsigned int length, bool z_perspective,
+                              unsigned int length,
                              bool vulkan, MString *header, MString *body);

 #endif
--- a/hw/xbox/nv2a/pgraph/glsl/vsh.c
+++ b/hw/xbox/nv2a/pgraph/glsl/vsh.c
@ -79,8 +79,6 @@ MString *pgraph_gen_vsh_glsl(const ShaderState *state, bool prefix_outputs)

    if (prefix_outputs) {
        mstring_append(header,
-                       "#define vtx_inv_w v_vtx_inv_w\n"
-                       "#define vtx_inv_w_flat v_vtx_inv_w_flat\n"
                       "#define vtxD0 v_vtxD0\n"
                       "#define vtxD1 v_vtxD1\n"
                       "#define vtxB0 v_vtxB0\n"
@ -142,7 +140,7 @@ MString *pgraph_gen_vsh_glsl(const ShaderState *state, bool prefix_outputs)
    } else if (state->vertex_program) {
        pgraph_gen_vsh_prog_glsl(VSH_VERSION_XVS,
                                 (uint32_t *)state->program_data,
-                                 state->program_length, state->z_perspective,
+                                 state->program_length,
                                 state->vulkan, header, body);
    } else {
        assert(false);
@ -233,27 +231,30 @@ MString *pgraph_gen_vsh_glsl(const ShaderState *state, bool prefix_outputs)
    }

    /* Set outputs */
-    const char *shade_model_mult = state->smooth_shading ? "vtx_inv_w" : "vtx_inv_w_flat";
-    mstring_append_fmt(body, "\n"
-                      "  vtxD0 = clamp(oD0, 0.0, 1.0) * %s;\n"
-                      "  vtxD1 = clamp(oD1, 0.0, 1.0) * %s;\n"
-                      "  vtxB0 = clamp(oB0, 0.0, 1.0) * %s;\n"
-                      "  vtxB1 = clamp(oB1, 0.0, 1.0) * %s;\n"
-                      "  vtxFog = oFog.x * vtx_inv_w;\n"
-                      "  vtxT0 = oT0 * vtx_inv_w;\n"
-                      "  vtxT1 = oT1 * vtx_inv_w;\n"
-                      "  vtxT2 = oT2 * vtx_inv_w;\n"
-                      "  vtxT3 = oT3 * vtx_inv_w;\n"
-                      "  gl_Position = oPos;\n"
-                      "  gl_PointSize = oPts.x;\n"
-                      // "  gl_ClipDistance[0] = oPos.z - oPos.w*clipRange.z;\n" // Near
-                      // "  gl_ClipDistance[1] = oPos.w*clipRange.w - oPos.z;\n" // Far
-                      "\n"
-                      "}\n",
-                       shade_model_mult,
-                       shade_model_mult,
-                       shade_model_mult,
-                       shade_model_mult);
+    mstring_append(body, "\n"
+                   "  vtxD0 = clamp(oD0, 0.0, 1.0);\n"
+                   "  vtxD1 = clamp(oD1, 0.0, 1.0);\n"
+                   "  vtxB0 = clamp(oB0, 0.0, 1.0);\n"
+                   "  vtxB1 = clamp(oB1, 0.0, 1.0);\n"
+                   "  vtxFog = oFog.x;\n"
+                   "  vtxT0 = oT0;\n"
+                   "  vtxT1 = oT1;\n"
+                   "  vtxT2 = oT2;\n"
+                   "  vtxT3 = oT3;\n"
+                   "  gl_PointSize = oPts.x;\n"
+    );
+
+    if (state->vulkan) {
+        mstring_append(body,
+                   "  gl_Position = oPos;\n"
+        );
+    } else {
+        mstring_append(body,
+                   "  gl_Position = vec4(oPos.x, oPos.y, 2.0*oPos.z - oPos.w, oPos.w);\n"
+        );
+    }
+
+    mstring_append(body, "}\n");

    /* Return combined header + source */
    if (state->vulkan) {
--- a/hw/xbox/nv2a/pgraph/psh.h
+++ b/hw/xbox/nv2a/pgraph/psh.h
@ -85,6 +85,8 @@ typedef struct PshState {
    bool window_clip_exclusive;

    bool smooth_shading;
+    bool depth_clipping;
+    bool z_perspective;
 } PshState;

 #endif
--- a/hw/xbox/nv2a/pgraph/shaders.c
+++ b/hw/xbox/nv2a/pgraph/shaders.c
@ -94,6 +94,7 @@ ShaderState pgraph_get_shader_state(PGRAPHState *pg)
    state.vertex_program = vertex_program,
    state.z_perspective = pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0) &
                          NV_PGRAPH_CONTROL_0_Z_PERSPECTIVE_ENABLE;
+    state.psh.z_perspective = state.z_perspective;

    state.point_params_enable = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_CSV0_D),
                                         NV_PGRAPH_CSV0_D_POINTPARAMSENABLE);
@ -117,6 +118,10 @@ ShaderState pgraph_get_shader_state(PGRAPHState *pg)
                           NV_PGRAPH_CONTROL_3_SHADEMODE_SMOOTH;
    state.psh.smooth_shading = state.smooth_shading;

+    state.psh.depth_clipping = GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_ZCOMPRESSOCCLUDE),
+                                        NV_PGRAPH_ZCOMPRESSOCCLUDE_ZCLAMP_EN) ==
+                               NV_PGRAPH_ZCOMPRESSOCCLUDE_ZCLAMP_EN_CULL;
+
    state.program_length = 0;

    if (vertex_program) {
--- a/hw/xbox/nv2a/pgraph/vk/draw.c
+++ b/hw/xbox/nv2a/pgraph/vk/draw.c
@ -816,7 +816,7 @@ static void create_pipeline(PGRAPHState *pg)

    VkPipelineRasterizationStateCreateInfo rasterizer = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
-        .depthClampEnable = VK_FALSE,
+        .depthClampEnable = VK_TRUE,
        .rasterizerDiscardEnable = VK_FALSE,
        .polygonMode = pgraph_polygon_mode_vk_map[r->shader_binding->state
                                                      .polygon_front_mode],
@ -958,10 +958,6 @@ static void create_pipeline(PGRAPHState *pg)
        .pDynamicStates = dynamic_states,
    };

-    // /* Clipping */
-    // glEnable(GL_CLIP_DISTANCE0);
-    // glEnable(GL_CLIP_DISTANCE1);
-
    // /* Polygon offset */
    // /* FIXME: GL implementation-specific, maybe do this in VS? */
    // if (pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER) &
@ -983,12 +979,6 @@ static void create_pipeline(PGRAPHState *pg)
        rasterizer.depthBiasConstantFactor = zbias;
    }

-    if (GET_MASK(pgraph_reg_r(pg, NV_PGRAPH_ZCOMPRESSOCCLUDE),
-                 NV_PGRAPH_ZCOMPRESSOCCLUDE_ZCLAMP_EN) ==
-        NV_PGRAPH_ZCOMPRESSOCCLUDE_ZCLAMP_EN_CLAMP) {
-        rasterizer.depthClampEnable = VK_TRUE;
-    }
-
    // FIXME: Dither
    // if (pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0) &
    //         NV_PGRAPH_CONTROL_0_DITHERENABLE))
--- a/hw/xbox/nv2a/pgraph/vk/renderer.h
+++ b/hw/xbox/nv2a/pgraph/vk/renderer.h
@ -173,6 +173,8 @@ typedef struct ShaderBinding {

    int surface_size_loc;
    int clip_range_loc;
+    int clip_range_floc;
+    int depth_offset_loc;

    int vsh_constant_loc;
    uint32_t vsh_constants[NV2A_VERTEXSHADER_CONSTANTS][4];
--- a/hw/xbox/nv2a/pgraph/vk/shaders.c
+++ b/hw/xbox/nv2a/pgraph/vk/shaders.c
@ -276,6 +276,10 @@ static void update_shader_constant_locations(ShaderBinding *binding)
        uniform_index(&binding->vertex->uniforms, "surfaceSize");
    binding->clip_range_loc =
        uniform_index(&binding->vertex->uniforms, "clipRange");
+    binding->clip_range_floc =
+        uniform_index(&binding->fragment->uniforms, "clipRange");
+    binding->depth_offset_loc =
+        uniform_index(&binding->fragment->uniforms, "depthOffset");
    binding->fog_param_loc =
        uniform_index(&binding->vertex->uniforms, "fogParam");

@ -637,14 +641,47 @@ static void shader_update_constants(PGRAPHState *pg, ShaderBinding *binding,
                         pg->surface_binding_dim.height / aa_height);
    }

-    if (binding->clip_range_loc != -1) {
+    if (binding->clip_range_loc != -1 || binding->clip_range_floc != -1) {
        uint32_t v[2];
        v[0] = pgraph_reg_r(pg, NV_PGRAPH_ZCLIPMIN);
        v[1] = pgraph_reg_r(pg, NV_PGRAPH_ZCLIPMAX);
-        float zclip_min = *(float *)&v[0] / zmax * 2.0 - 1.0;
-        float zclip_max = *(float *)&v[1] / zmax * 2.0 - 1.0;
-        uniform4f(&binding->vertex->uniforms, binding->clip_range_loc, 0,
-                         zmax, zclip_min, zclip_max);
+        float zclip_min = *(float *)&v[0];
+        float zclip_max = *(float *)&v[1];
+
+        if (binding->clip_range_loc != -1) {
+            uniform4f(&binding->vertex->uniforms, binding->clip_range_loc, 0,
+                      zmax, zclip_min, zclip_max);
+        }
+        if (binding->clip_range_floc != -1) {
+            uniform4f(&binding->fragment->uniforms, binding->clip_range_floc, 0,
+                      zmax, zclip_min, zclip_max);
+        }
+    }
+
+    if (binding->depth_offset_loc != -1) {
+        float zbias = 0.0f;
+
+        if (pgraph_reg_r(pg, NV_PGRAPH_SETUPRASTER) &
+            (NV_PGRAPH_SETUPRASTER_POFFSETFILLENABLE |
+             NV_PGRAPH_SETUPRASTER_POFFSETLINEENABLE |
+             NV_PGRAPH_SETUPRASTER_POFFSETPOINTENABLE)) {
+            uint32_t zbias_u32 = pgraph_reg_r(pg, NV_PGRAPH_ZOFFSETBIAS);
+            zbias = *(float *)&zbias_u32;
+
+            if (pgraph_reg_r(pg, NV_PGRAPH_ZOFFSETFACTOR) != 0 &&
+                (pgraph_reg_r(pg, NV_PGRAPH_CONTROL_0) &
+                 NV_PGRAPH_CONTROL_0_Z_PERSPECTIVE_ENABLE)) {
+                /* TODO: emulate zfactor when z_perspective true, i.e.
+                 * w-buffering. Perhaps calculate an additional offset based on
+                 * triangle orientation in geometry shader and pass the result
+                 * to fragment shader and add it to gl_FragDepth as well.
+                 */
+                NV2A_UNIMPLEMENTED("NV_PGRAPH_ZOFFSETFACTOR for w-buffering");
+            }
+        }
+
+        uniform1f(&binding->fragment->uniforms, binding->depth_offset_loc,
+                  zbias);
    }

    /* Clipping regions */
@ -724,6 +761,7 @@ static bool check_shaders_dirty(PGRAPHState *pg)
        NV_PGRAPH_SHADERCTL,
        NV_PGRAPH_SHADERPROG,
        NV_PGRAPH_SHADOWCTL,
+        NV_PGRAPH_ZCOMPRESSOCCLUDE,
    };
    for (int i = 0; i < ARRAY_SIZE(regs); i++) {
        if (pgraph_is_reg_dirty(pg, regs[i])) {