From fb01ccaac6397a9d7e19e0338d4bbfb4a6015dfe Mon Sep 17 00:00:00 2001
From: Triang3l <triang3l@yandex.ru>
Date: Tue, 4 May 2021 21:39:34 +0300
Subject: [PATCH] [GPU] Viewport/clipping cleanup, don't clamp oDepth

---
 .../gpu/d3d12/d3d12_command_processor.cc      |  37 +-
 src/xenia/gpu/draw_util.cc                    | 518 ++++++++++++------
 src/xenia/gpu/draw_util.h                     |  41 +-
 src/xenia/gpu/dxbc_shader_translator.cc       |   3 +-
 src/xenia/gpu/dxbc_shader_translator.h        |  10 +-
 src/xenia/gpu/dxbc_shader_translator_om.cc    |  33 +-
 src/xenia/gpu/render_target_cache.cc          |  46 +-
 7 files changed, 428 insertions(+), 260 deletions(-)

diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
index b1419fc30..56bf3322c 100644
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
@@ -2026,20 +2026,21 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
       render_target_cache_->depth_float24_conversion();
   draw_util::ViewportInfo viewport_info;
   draw_util::GetHostViewportInfo(
-      regs, resolution_scale, true, float(D3D12_VIEWPORT_BOUNDS_MAX),
-      float(D3D12_VIEWPORT_BOUNDS_MAX), false,
+      regs, resolution_scale, true, D3D12_VIEWPORT_BOUNDS_MAX,
+      D3D12_VIEWPORT_BOUNDS_MAX, false,
       host_render_targets_used &&
           (depth_float24_conversion ==
                RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating ||
            depth_float24_conversion ==
                RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding),
-      host_render_targets_used, viewport_info);
+      host_render_targets_used, pixel_shader && pixel_shader->writes_depth(),
+      viewport_info);
   draw_util::Scissor scissor;
   draw_util::GetScissor(regs, scissor);
-  scissor.left *= resolution_scale;
-  scissor.top *= resolution_scale;
-  scissor.width *= resolution_scale;
-  scissor.height *= resolution_scale;
+  scissor.offset[0] *= resolution_scale;
+  scissor.offset[1] *= resolution_scale;
+  scissor.extent[0] *= resolution_scale;
+  scissor.extent[1] *= resolution_scale;
 
   // Update viewport, scissor, blend factor and stencil reference.
   UpdateFixedFunctionState(viewport_info, scissor, primitive_polygonal);
@@ -2811,20 +2812,20 @@ void D3D12CommandProcessor::UpdateFixedFunctionState(
 
   // Viewport.
   D3D12_VIEWPORT viewport;
-  viewport.TopLeftX = viewport_info.left;
-  viewport.TopLeftY = viewport_info.top;
-  viewport.Width = viewport_info.width;
-  viewport.Height = viewport_info.height;
+  viewport.TopLeftX = float(viewport_info.xy_offset[0]);
+  viewport.TopLeftY = float(viewport_info.xy_offset[1]);
+  viewport.Width = float(viewport_info.xy_extent[0]);
+  viewport.Height = float(viewport_info.xy_extent[1]);
   viewport.MinDepth = viewport_info.z_min;
   viewport.MaxDepth = viewport_info.z_max;
   SetViewport(viewport);
 
   // Scissor.
   D3D12_RECT scissor_rect;
-  scissor_rect.left = LONG(scissor.left);
-  scissor_rect.top = LONG(scissor.top);
-  scissor_rect.right = LONG(scissor.left + scissor.width);
-  scissor_rect.bottom = LONG(scissor.top + scissor.height);
+  scissor_rect.left = LONG(scissor.offset[0]);
+  scissor_rect.top = LONG(scissor.offset[1]);
+  scissor_rect.right = LONG(scissor.offset[0] + scissor.extent[0]);
+  scissor_rect.bottom = LONG(scissor.offset[1] + scissor.extent[1]);
   SetScissorRect(scissor_rect);
 
   if (render_target_cache_->GetPath() ==
@@ -3090,9 +3091,11 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
   system_constants_.point_size_min_max[0] = point_size_min;
   system_constants_.point_size_min_max[1] = point_size_max;
   float point_screen_to_ndc_x =
-      (0.5f * 2.0f * resolution_scale) / viewport_info.width;
+      (/* 0.5f * 2.0f * */ float(resolution_scale)) /
+      std::max(viewport_info.xy_extent[0], uint32_t(1));
   float point_screen_to_ndc_y =
-      (0.5f * 2.0f * resolution_scale) / viewport_info.height;
+      (/* 0.5f * 2.0f * */ float(resolution_scale)) /
+      std::max(viewport_info.xy_extent[1], uint32_t(1));
   dirty |= system_constants_.point_screen_to_ndc[0] != point_screen_to_ndc_x;
   dirty |= system_constants_.point_screen_to_ndc[1] != point_screen_to_ndc_y;
   system_constants_.point_screen_to_ndc[0] = point_screen_to_ndc_x;
diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc
index 315b0a3c9..87eaf092c 100644
--- a/src/xenia/gpu/draw_util.cc
+++ b/src/xenia/gpu/draw_util.cc
@@ -184,167 +184,353 @@ bool IsPixelShaderNeededWithRasterization(const Shader& shader,
 }
 
 void GetHostViewportInfo(const RegisterFile& regs, uint32_t resolution_scale,
-                         bool origin_bottom_left, float x_max, float y_max,
-                         bool allow_reverse_z, bool convert_z_to_float24,
-                         bool full_float24_in_0_to_1,
+                         bool origin_bottom_left, uint32_t x_max,
+                         uint32_t y_max, bool allow_reverse_z,
+                         bool convert_z_to_float24, bool full_float24_in_0_to_1,
+                         bool pixel_shader_writes_depth,
                          ViewportInfo& viewport_info_out) {
-  assert_true(resolution_scale >= 1);
-  assert_true(x_max >= 1.0f);
-  assert_true(y_max >= 1.0f);
+  assert_not_zero(resolution_scale);
 
-  // PA_CL_VTE_CNTL contains whether offsets and scales are enabled.
-  // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf
-  // In games, either all are enabled (for regular drawing) or none are (for
-  // rectangle lists usually).
+  // A vertex position goes the following path:
   //
-  // If scale/offset is enabled, the Xenos shader is writing (neglecting W
-  // division) position in the NDC (-1, -1, dx_clip_space_def - 1) -> (1, 1, 1)
-  // box. If it's not, the position is in screen space. Since we can only use
-  // the NDC in PC APIs, we use a viewport of the largest possible size, and
-  // divide the position by it in translated shaders.
+  // = Vertex shader output in clip space, (-w, -w, 0) ... (w, w, w) for
+  //   Direct3D or (-w, -w, -w) ... (w, w, w) for OpenGL.
+  // > Clipping to the boundaries of the clip space if enabled.
+  // > Division by W if not pre-divided.
+  // = Normalized device coordinates, (-1, -1, 0) ... (1, 1, 1) for Direct3D or
+  //   (-1, -1, -1) ... (1, 1, 1) for OpenGL.
+  // > Viewport scaling.
+  // > Viewport, window and half-pixel offsetting.
+  // = Actual position in render target pixels used for rasterization and depth
+  //   buffer coordinates.
+  //
+  // On modern PC graphics APIs, all drawing is done with clipping enabled (only
+  // Z clipping can be replaced with viewport depth range clamping).
+  //
+  // On the Xbox 360, however, there are two cases:
+  //
+  // - Clipping is enabled:
+  //
+  //   Drawing "as normal", primarily for the game world. Draws are clipped to
+  //   the (-w, -w, 0) ... (w, w, w) or (-w, -w, -w) ... (w, w, w) clip space.
+  //
+  //   Ideally all offsets in pixels (window offset, half-pixel offset) are
+  //   post-clip, and thus they would need to be applied via the host viewport
+  //   (also the Direct3D 11.3 specification defines this as the correct way of
+  //   reproducing the original Direct3D 9 half-pixel offset behavior).
+  //
+  //   However, in reality, only WARP actually truly clips to -W...W, with the
+  //   viewport fractional offset actually accurately making samples outside the
+  //   fractional rectangle unable to be covered. AMD, Intel and Nvidia, in
+  //   Direct3D 12, all don't truly clip even a really huge primitive to -W...W.
+  //   Instead, primitives still overflow the fractional rectangle and cover
+  //   samples outside of it. The actual viewport scissor is floor(TopLeftX,
+  //   TopLeftY) ... floor(TopLeftX + Width, TopLeftY + Height), with flooring
+  //   and addition in float32 (with 0x3F7FFFFF TopLeftXY, or 1.0f - ULP, all
+  //   the samples in the top row / left column can be covered, while with
+  //   0x3F800000, or 1.0f, none of them can be).
+  //
+  //   We are reproducing the same behavior here - what would happen if we'd be
+  //   passing the guest values directly to Direct3D 12. Also, for consistency
+  //   across hardware and APIs (especially Vulkan with viewportSubPixelBits
+  //   being 0 rather than at least 8 on some devices - Arm Mali, Imagination
+  //   PowerVR), and for simplicity of math, and also for exact calculations in
+  //   bounds checking in validation layers of the host APIs, we are returning
+  //   integer viewport coordinates, handling the fractional offset in the
+  //   vertex shaders instead, via ndc_scale and ndc_offset - it shouldn't
+  //   significantly affect precision that we will be doing the offsetting in
+  //   W-scaled rather than W-divided units, the ratios of exponents involved in
+  //   the calculations stay the same, and everything ends up being 16.8 anyway
+  //   on most hardware, so small precision differences are very unlikely to
+  //   affect coverage.
+  //
+  // FIXME(Triang3l): Overestimate or more properly round the viewport scissor
+  // boundaries if this flooring causes gaps on the bottom / right side in real
+  // games if any are found using fractional viewport coordinates. Viewport
+  // scissoring is not an inherent result of the viewport scale / offset, these
+  // are used merely for transformation of coordinates; rather, it's done by
+  // intersecting the viewport and scissor rectangles in the guest driver and
+  // writing the common portion to PA_SC_WINDOW_SCISSOR, so how the scissor is
+  // computed for a fractional viewport is entirely up to the guest.
+  //
+  //   Even though Xbox 360 games are designed for Direct3D, with 0...W range of
+  //   Z in clip space, the GPU also allows -W...W. Since Xenia is not targeting
+  //   OpenGL (where it would be toggled via glClipControl - or, on ES, it would
+  //   always be -W...W), this function always remaps it to 0...W, though
+  //   numerically not precisely (0 is moved to 0.5, locking the exponent near
+  //   what was the truly floating-point 0 originally). It is the guest
+  //   viewport's responsibility (haven't checked, but it's logical) to remap
+  //   from -1...1 in the NDC to glDepthRange within the 0...1 range. Also -Z
+  //   pointing forward in OpenGL doesn't matter here (the -W...W clip space is
+  //   symmetric).
+  //
+  // - Clipping is disabled:
+  //
+  //   The most common case of drawing without clipping in games is screen-space
+  //   draws, most prominently clears, directly in render target coordinates.
+  //
+  //   In this particular case (though all the general case arithmetic still
+  //   applies), the vertex shader returns a position in pixels, pre-divided by
+  //   W (though this doesn't matter if W is 1).
+  //
+  //   Because clipping is disabled, this huge polygon with, for example,
+  //   a (1280, 720, 0, 1) vertex, is not clipped to (-w, -w) ... (w, w), so the
+  //   vertex becomes (1280, 720) in the NDC as well (even though in regular 3D
+  //   draws with clipping, disregarding the guard band for simplicity, it can't
+  //   be bigger than (1, 1) after clipping and the division by W).
+  //
+  //   For these draws, the viewport is also usually disabled (though, again, it
+  //   doesn't have to be - an enabled viewport would likely still work as
+  //   usual) by disabling PA_CL_VTE_CNTL::VPORT_X/Y/Z_SCALE/OFFSET_ENA - which
+  //   amounts to having a viewport scale of (1, 1, 1) and offset of (0, 0, 0).
+  //   This results in the NDC being treated directly as pixel coordinates.
+  //   Normally, with clipping, this would make only a tiny 1x1 area in the
+  //   corner of the render target being possible to cover (and 3 unreachable
+  //   pixels outside of the render target). The window offset is then applied,
+  //   if needed, as well as the half-pixel offset.
+  //
+  //   It's also possible (though not verified) that without clipping, Z (as a
+  //   result of, for instance, polygon offset, or explicit calculations in the
+  //   vertex shader) may end up outside the viewport Z range. Direct3D 10
+  //   requires clamping to the viewport Z bounds in all cases in the
+  //   output-merger according to the Direct3D 11.3 functional specification. A
+  //   different behavior is likely on the Xbox 360, however, because while
+  //   Direct3D 10-compatible AMD GPUs such as the R600 have
+  //   PA_SC_VPORT_ZMIN/ZMAX registers, the Adreno 200 doesn't seem to have any
+  //   equivalents, neither in PA nor in RB. This probably also applies to
+  //   shader depth output - possibly doesn't need to be clamped as well.
+  //
+  //   On the PC, we need to emulate disabled clipping by using a viewport at
+  //   least as large as the scissor region within the render target, as well as
+  //   the full viewport depth range (plus changing Z clipping to Z clamping on
+  //   the host if possible), and rescale from the guest clip space to the host
+  //   "no clip" clip space, as well as apply the viewport, the window offset,
+  //   and the half-pixel offset, in the vertex shader. Ideally, the host
+  //   viewport should have a power of 2 size - so scaling doesn't affect
+  //   precision, and is merely an exponent bias.
+  //
+  // NDC XY point towards +XY on the render target - the viewport scale sign
+  // handles the remapping from Direct3D 9 -Y towards +U to a generic
+  // transformation from the NDC to pixel coordinates.
+  //
+  // TODO(Triang3l): Investigate the need for clamping of oDepth to 0...1 for
+  // D24FS8 as well.
 
   auto pa_cl_clip_cntl = regs.Get<reg::PA_CL_CLIP_CNTL>();
   auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
   auto pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>();
   auto pa_su_vtx_cntl = regs.Get<reg::PA_SU_VTX_CNTL>();
 
-  float viewport_left, viewport_top;
-  float viewport_width, viewport_height;
-  float ndc_scale_x, ndc_scale_y;
-  float ndc_offset_x, ndc_offset_y;
-  // To avoid zero size viewports, which would harm division and aren't allowed
-  // on Vulkan. Nothing will ever be covered by a viewport of this size - this
-  // is 2 orders of magnitude smaller than a .8 subpixel, and thus shouldn't
-  // have any effect on rounding, n and n + 1 / 1024 would be rounded to the
-  // same .8 fixed-point value, thus in fixed-point, the viewport would have
-  // zero size.
-  const float size_min = 1.0f / 1024.0f;
-
-  float viewport_offset_x = pa_cl_vte_cntl.vport_x_offset_ena
-                                ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32
-                                : 0.0f;
-  float viewport_offset_y = pa_cl_vte_cntl.vport_y_offset_ena
-                                ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32
-                                : 0.0f;
+  // Obtain the original viewport values in a normalized way.
+  float scale_xy[] = {
+      pa_cl_vte_cntl.vport_x_scale_ena ? regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32
+                                       : 1.0f,
+      pa_cl_vte_cntl.vport_y_scale_ena ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32
+                                       : 1.0f,
+  };
+  float scale_z = pa_cl_vte_cntl.vport_z_scale_ena
+                      ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32
+                      : 1.0f;
+  float offset_base_xy[] = {
+      pa_cl_vte_cntl.vport_x_offset_ena
+          ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32
+          : 0.0f,
+      pa_cl_vte_cntl.vport_y_offset_ena
+          ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32
+          : 0.0f,
+  };
+  float offset_z = pa_cl_vte_cntl.vport_z_offset_ena
+                       ? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32
+                       : 0.0f;
+  // Calculate all the integer.0 or integer.5 offsetting exactly at full
+  // precision, separately so it can be used in other integer calculations
+  // without double rounding if needed.
+  float offset_add_xy[2] = {};
   if (pa_su_sc_mode_cntl.vtx_window_offset_enable) {
     auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
-    viewport_offset_x += float(pa_sc_window_offset.window_x_offset);
-    viewport_offset_y += float(pa_sc_window_offset.window_y_offset);
+    offset_add_xy[0] += float(pa_sc_window_offset.window_x_offset);
+    offset_add_xy[1] += float(pa_sc_window_offset.window_y_offset);
   }
-
-  if (pa_cl_vte_cntl.vport_x_scale_ena) {
-    float pa_cl_vport_xscale = regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32;
-    float viewport_scale_x_abs =
-        std::abs(pa_cl_vport_xscale) * resolution_scale;
-    viewport_left = viewport_offset_x * resolution_scale - viewport_scale_x_abs;
-    float viewport_right = viewport_left + viewport_scale_x_abs * 2.0f;
-    // Keep the viewport in the positive quarter-plane for simplicity of
-    // clamping to the maximum supported bounds.
-    float cutoff_left = std::fmax(-viewport_left, 0.0f);
-    float cutoff_right = std::fmax(viewport_right - x_max, 0.0f);
-    viewport_left = std::fmax(viewport_left, 0.0f);
-    viewport_right = std::fmin(viewport_right, x_max);
-    viewport_width = viewport_right - viewport_left;
-    if (viewport_width > size_min) {
-      ndc_scale_x =
-          (viewport_width + cutoff_left + cutoff_right) / viewport_width;
-      if (pa_cl_vport_xscale < 0.0f) {
-        ndc_scale_x = -ndc_scale_x;
-      }
-      ndc_offset_x =
-          ((cutoff_right - cutoff_left) * (0.5f * 2.0f)) / viewport_width;
-    } else {
-      // Empty viewport, but don't pass 0 because that's against the Vulkan
-      // specification.
-      viewport_left = 0.0f;
-      viewport_width = size_min;
-      ndc_scale_x = 0.0f;
-      ndc_offset_x = 0.0f;
-    }
-  } else {
-    // Drawing without a viewport and without clipping to one - use a viewport
-    // covering the entire potential guest render target or the positive part of
-    // the host viewport area, whichever is smaller, and apply the offset, if
-    // enabled, via the shader.
-    viewport_left = 0.0f;
-    viewport_width = std::min(
-        float(xenos::kTexture2DCubeMaxWidthHeight) * resolution_scale, x_max);
-    ndc_scale_x = (2.0f * resolution_scale) / viewport_width;
-    ndc_offset_x = viewport_offset_x * ndc_scale_x - 1.0f;
-  }
-
-  if (pa_cl_vte_cntl.vport_y_scale_ena) {
-    float pa_cl_vport_yscale = regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32;
-    float viewport_scale_y_abs =
-        std::abs(pa_cl_vport_yscale) * resolution_scale;
-    viewport_top = viewport_offset_y * resolution_scale - viewport_scale_y_abs;
-    float viewport_bottom = viewport_top + viewport_scale_y_abs * 2.0f;
-    float cutoff_top = std::fmax(-viewport_top, 0.0f);
-    float cutoff_bottom = std::fmax(viewport_bottom - y_max, 0.0f);
-    viewport_top = std::fmax(viewport_top, 0.0f);
-    viewport_bottom = std::fmin(viewport_bottom, y_max);
-    viewport_height = viewport_bottom - viewport_top;
-    if (viewport_height > size_min) {
-      ndc_scale_y =
-          (viewport_height + cutoff_top + cutoff_bottom) / viewport_height;
-      if (pa_cl_vport_yscale < 0.0f) {
-        ndc_scale_y = -ndc_scale_y;
-      }
-      ndc_offset_y =
-          ((cutoff_bottom - cutoff_top) * (0.5f * 2.0f)) / viewport_height;
-    } else {
-      // Empty viewport, but don't pass 0 because that's against the Vulkan
-      // specification.
-      viewport_top = 0.0f;
-      viewport_height = size_min;
-      ndc_scale_y = 0.0f;
-      ndc_offset_y = 0.0f;
-    }
-  } else {
-    viewport_top = 0.0f;
-    viewport_height = std::min(
-        float(xenos::kTexture2DCubeMaxWidthHeight) * resolution_scale, y_max);
-    ndc_scale_y = (2.0f * resolution_scale) / viewport_height;
-    ndc_offset_y = viewport_offset_y * ndc_scale_y - 1.0f;
-  }
-
-  // Apply the vertex half-pixel offset via the shader (it must not affect
-  // clipping, otherwise with resolution scale, samples in the left/top half
-  // will never be covered).
   if (cvars::half_pixel_offset && !pa_su_vtx_cntl.pix_center) {
-    float half_pixel_offset_ndc_scale = 0.5f * 2.0f * resolution_scale;
-    ndc_offset_x += half_pixel_offset_ndc_scale / viewport_width;
-    ndc_offset_y += half_pixel_offset_ndc_scale / viewport_height;
+    offset_add_xy[0] += 0.5f;
+    offset_add_xy[1] += 0.5f;
   }
 
-  if (origin_bottom_left) {
-    ndc_scale_y = -ndc_scale_y;
-    ndc_offset_y = -ndc_offset_y;
+  // The maximum value is at least the maximum host render target size anyway -
+  // and a guest pixel is always treated as a whole with resolution scaling.
+  uint32_t xy_max_unscaled[] = {x_max / resolution_scale,
+                                y_max / resolution_scale};
+  assert_not_zero(xy_max_unscaled[0]);
+  assert_not_zero(xy_max_unscaled[1]);
+
+  float z_min;
+  float z_max;
+  float ndc_scale[3];
+  float ndc_offset[3];
+
+  if (pa_cl_clip_cntl.clip_disable) {
+    // Clipping is disabled - use a huge host viewport, perform pixel and depth
+    // offsetting in the vertex shader.
+
+    // XY.
+    for (uint32_t i = 0; i < 2; ++i) {
+      viewport_info_out.xy_offset[i] = 0;
+      uint32_t extent_axis_unscaled =
+          std::min(xenos::kTexture2DCubeMaxWidthHeight, xy_max_unscaled[i]);
+      viewport_info_out.xy_extent[i] = extent_axis_unscaled * resolution_scale;
+      float extent_axis_unscaled_float = float(extent_axis_unscaled);
+      float pixels_to_ndc_axis = 2.0f / extent_axis_unscaled_float;
+      ndc_scale[i] = scale_xy[i] * pixels_to_ndc_axis;
+      ndc_offset[i] = (offset_base_xy[i] - extent_axis_unscaled_float * 0.5f +
+                       offset_add_xy[i]) *
+                      pixels_to_ndc_axis;
+    }
+
+    // Z.
+    z_min = 0.0f;
+    z_max = 1.0f;
+    ndc_scale[2] = scale_z;
+    ndc_offset[2] = offset_z;
+  } else {
+    // Clipping is enabled - perform pixel and depth offsetting via the host
+    // viewport.
+
+    // XY.
+    for (uint32_t i = 0; i < 2; ++i) {
+      // With resolution scaling, do all viewport XY scissoring in guest pixels
+      // (for fractional viewports and for the half-pixel offset) - we treat
+      // guest pixels as a whole, and the half-pixel offset would be
+      // irreversible in guest vertices if we did flooring in host pixels. Also
+      // truncating instead of flooring for simplicity, since maxing with 0 is
+      // done anyway (we only return viewports in the positive quarter-plane).
+      float offset_axis = offset_base_xy[i] + offset_add_xy[i];
+      float scale_axis = scale_xy[i];
+      float scale_axis_abs = std::abs(scale_xy[i]);
+      float axis_0 = offset_axis - scale_axis_abs;
+      float axis_1 = offset_axis + scale_axis_abs;
+      float axis_max_unscaled_float = float(xy_max_unscaled[i]);
+      // fmax to drop NaN and < 0, min as float (axis_max_unscaled_float is well
+      // below 2^24) to safely drop very large values.
+      uint32_t axis_0_int =
+          uint32_t(std::min(std::fmax(axis_0, 0.0f), axis_max_unscaled_float));
+      uint32_t axis_1_int =
+          uint32_t(std::min(std::fmax(axis_1, 0.0f), axis_max_unscaled_float));
+      uint32_t axis_extent_int = axis_1_int - axis_0_int;
+      viewport_info_out.xy_offset[i] = axis_0_int * resolution_scale;
+      viewport_info_out.xy_extent[i] = axis_extent_int * resolution_scale;
+      float ndc_scale_axis;
+      float ndc_offset_axis;
+      if (axis_extent_int) {
+        // Rescale from the old bounds to the new ones, and also apply the sign.
+        // If the new bounds are smaller than the old, for instance, we're
+        // cropping - the new -W...W clip space is a subregion of the old one -
+        // the scale should be > 1 so the area being cut off ends up outside
+        // -W...W. If the new region should include more than the original clip
+        // space, a region previously outside -W...W should end up within it, so
+        // the scale should be < 1.
+        float axis_extent_rounded = float(axis_extent_int);
+        ndc_scale_axis = scale_axis * 2.0f / axis_extent_rounded;
+        // Move the origin of the snapped coordinates back to the original one.
+        ndc_offset_axis = (float(offset_axis) -
+                           (float(axis_0_int) + axis_extent_rounded * 0.5f)) *
+                          2.0f / axis_extent_rounded;
+      } else {
+        // Empty viewport (everything outside the viewport scissor).
+        ndc_scale_axis = 1.0f;
+        ndc_offset_axis = 0.0f;
+      }
+      ndc_scale[i] = ndc_scale_axis;
+      ndc_offset[i] = ndc_offset_axis;
+    }
+
+    // Z.
+    float host_clip_offset_z;
+    float host_clip_scale_z;
+    if (pa_cl_clip_cntl.dx_clip_space_def) {
+      host_clip_offset_z = offset_z;
+      host_clip_scale_z = scale_z;
+      ndc_scale[2] = 1.0f;
+      ndc_offset[2] = 0.0f;
+    } else {
+      // Normalizing both Direct3D / Vulkan 0...W and OpenGL -W...W clip spaces
+      // to 0...W. We are not targeting OpenGL, but there we could accept the
+      // wanted clip space (Direct3D, OpenGL, or any) and return the actual one
+      // (Direct3D or OpenGL).
+      //
+      // If the guest wants to use -W...W clip space (-1...1 NDC) and a 0...1
+      // depth range in the end, it's expected to use ZSCALE of 0.5 and ZOFFSET
+      // of 0.5.
+      //
+      // We are providing the near and the far (or offset and offset + scale)
+      // plane distances to the host API in a way that the near maps to Z = 0
+      // and the far maps to Z = W in clip space (or Z = 1 in NDC).
+      //
+      // With D3D offset and scale that we want, assuming D3D clip space input,
+      // the formula for the depth would be:
+      //
+      // depth = offset_d3d + scale_d3d * ndc_z_d3d
+      //
+      // We are remapping the incoming OpenGL Z from -W...W to 0...W by scaling
+      // it by 0.5 and adding 0.5 * W to the result. So, our depth formula would
+      // be:
+      //
+      // depth = offset_d3d + scale_d3d * (ndc_z_gl * 0.5 + 0.5)
+      //
+      // The guest registers, however, contain the offset and the scale for
+      // remapping not from 0...W to near...far, but from -W...W to near...far,
+      // or:
+      //
+      // depth = offset_gl + scale_gl * ndc_z_gl
+      //
+      // Knowing offset_gl, scale_gl and how ndc_z_d3d can be obtained from
+      // ndc_z_gl, we need to derive the formulas for the needed offset_d3d and
+      // scale_d3d to apply them to the incoming ndc_z_d3d.
+      //
+      // depth = offset_gl + scale_gl * (ndc_z_d3d * 2 - 1)
+      //
+      // Expanding:
+      //
+      // depth = offset_gl + (scale_gl * ndc_z_d3d * 2 - scale_gl)
+      //
+      // Reordering:
+      //
+      // depth = (offset_gl - scale_gl) + (scale_gl * 2) * ndc_z_d3d
+      // offset_d3d = offset_gl - scale_gl
+      // scale_d3d = scale_gl * 2
+      host_clip_offset_z = offset_z - scale_z;
+      host_clip_scale_z = scale_z * 2.0f;
+      // Need to remap -W...W clip space to 0...W via ndc_scale and ndc_offset -
+      // by scaling Z by 0.5 and adding 0.5 * W to it.
+      ndc_scale[2] = 0.5f;
+      ndc_offset[2] = 0.5f;
+    }
+    if (pixel_shader_writes_depth) {
+      // Allow the pixel shader to write any depth value since
+      // PA_SC_VPORT_ZMIN/ZMAX isn't present on the Adreno 200; guest pixel
+      // shaders don't have access to the original Z in the viewport space
+      // anyway and likely must write the depth on all execution paths.
+      z_min = 0.0f;
+      z_max = 1.0f;
+    } else {
+      // This clamping is not very correct, but just for safety. Direct3D
+      // doesn't allow an unrestricted depth range. Vulkan does, as an
+      // extension. But cases when this really matters are yet to be found -
+      // trying to fix this will result in more correct depth values, but
+      // incorrect clipping.
+      z_min = std::min(std::fmax(host_clip_offset_z, 0.0f), 1.0f);
+      z_max = std::min(std::fmax(host_clip_offset_z + host_clip_scale_z, 0.0f),
+                       1.0f);
+      // Direct3D 12 doesn't allow reverse depth range - on some drivers it
+      // works, on some drivers it doesn't, actually, but it was never
+      // explicitly allowed by the specification.
+      if (!allow_reverse_z && z_min > z_max) {
+        std::swap(z_min, z_max);
+        ndc_scale[2] = -ndc_scale[2];
+        ndc_offset[2] = 1.0f - ndc_offset[2];
+      }
+    }
   }
 
-  float viewport_scale_z = pa_cl_vte_cntl.vport_z_scale_ena
-                               ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32
-                               : 1.0f;
-  float viewport_offset_z = pa_cl_vte_cntl.vport_z_offset_ena
-                                ? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32
-                                : 0.0f;
-  // Vulkan requires the depth bounds to be in the 0 to 1 range without
-  // VK_EXT_depth_range_unrestricted (which isn't used on the Xbox 360).
-  float viewport_z_min = std::min(std::fmax(viewport_offset_z, 0.0f), 1.0f);
-  float viewport_z_max =
-      std::min(std::fmax(viewport_offset_z + viewport_scale_z, 0.0f), 1.0f);
-  // When VPORT_Z_SCALE_ENA is disabled, Z/W is directly what is expected to be
-  // written to the depth buffer, and for some reason DX_CLIP_SPACE_DEF isn't
-  // set in this case in draws in games.
-  bool gl_clip_space_def =
-      !pa_cl_clip_cntl.dx_clip_space_def && pa_cl_vte_cntl.vport_z_scale_ena;
-  float ndc_scale_z = gl_clip_space_def ? 0.5f : 1.0f;
-  float ndc_offset_z = gl_clip_space_def ? 0.5f : 0.0f;
-  if (viewport_z_min > viewport_z_max && !allow_reverse_z) {
-    std::swap(viewport_z_min, viewport_z_max);
-    ndc_scale_z = -ndc_scale_z;
-    ndc_offset_z = 1.0f - ndc_offset_z;
-  }
   if (GetDepthControlForCurrentEdramMode(regs).z_enable &&
       regs.Get<reg::RB_DEPTH_INFO>().depth_format ==
           xenos::DepthRenderTargetFormat::kD24FS8) {
@@ -352,34 +538,30 @@ void GetHostViewportInfo(const RegisterFile& regs, uint32_t resolution_scale,
       // Need to adjust the bounds that the resulting depth values will be
       // clamped to after the pixel shader. Preferring adding some error to
       // interpolated Z instead if conversion can't be done exactly, without
-      // modifying clipping bounds by adjusting Z in vertex shaders, as that may
-      // cause polygons placed explicitly at Z = 0 or Z = W to be clipped.
-      viewport_z_min =
-          xenos::Float20e4To32(xenos::Float32To20e4(viewport_z_min));
-      viewport_z_max =
-          xenos::Float20e4To32(xenos::Float32To20e4(viewport_z_max));
+      // modifying clipping bounds by adjusting Z in vertex shaders, as that
+      // may cause polygons placed explicitly at Z = 0 or Z = W to be clipped.
+      z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min));
+      z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max));
     }
     if (full_float24_in_0_to_1) {
       // Remap the full [0...2) float24 range to [0...1) support data round-trip
       // during render target ownership transfer of EDRAM tiles through depth
       // input without unrestricted depth range.
-      viewport_z_min *= 0.5f;
-      viewport_z_max *= 0.5f;
+      z_min *= 0.5f;
+      z_max *= 0.5f;
     }
   }
+  viewport_info_out.z_min = z_min;
+  viewport_info_out.z_max = z_max;
 
-  viewport_info_out.left = viewport_left;
-  viewport_info_out.top = viewport_top;
-  viewport_info_out.width = viewport_width;
-  viewport_info_out.height = viewport_height;
-  viewport_info_out.z_min = viewport_z_min;
-  viewport_info_out.z_max = viewport_z_max;
-  viewport_info_out.ndc_scale[0] = ndc_scale_x;
-  viewport_info_out.ndc_scale[1] = ndc_scale_y;
-  viewport_info_out.ndc_scale[2] = ndc_scale_z;
-  viewport_info_out.ndc_offset[0] = ndc_offset_x;
-  viewport_info_out.ndc_offset[1] = ndc_offset_y;
-  viewport_info_out.ndc_offset[2] = ndc_offset_z;
+  if (origin_bottom_left) {
+    ndc_scale[1] = -ndc_scale[1];
+    ndc_offset[1] = -ndc_offset[1];
+  }
+  for (uint32_t i = 0; i < 3; ++i) {
+    viewport_info_out.ndc_scale[i] = ndc_scale[i];
+    viewport_info_out.ndc_offset[i] = ndc_offset[i];
+  }
 }
 
 void GetScissor(const RegisterFile& regs, Scissor& scissor_out) {
@@ -420,10 +602,10 @@ void GetScissor(const RegisterFile& regs, Scissor& scissor_out) {
   // console, but no evidence of such has ever been seen).
   br_x = std::max(br_x, tl_x);
   br_y = std::max(br_y, tl_y);
-  scissor_out.left = tl_x;
-  scissor_out.top = tl_y;
-  scissor_out.width = br_x - tl_x;
-  scissor_out.height = br_y - tl_y;
+  scissor_out.offset[0] = tl_x;
+  scissor_out.offset[1] = tl_y;
+  scissor_out.extent[0] = br_x - tl_x;
+  scissor_out.extent[1] = br_y - tl_y;
 }
 
 xenos::CopySampleSelect SanitizeCopySampleSelect(
diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h
index f95dece17..193aabca5 100644
--- a/src/xenia/gpu/draw_util.h
+++ b/src/xenia/gpu/draw_util.h
@@ -77,15 +77,27 @@ bool IsPixelShaderNeededWithRasterization(const Shader& shader,
                                           const RegisterFile& regs);
 
 struct ViewportInfo {
-  // The returned viewport will always be in the positive quarter-plane for
-  // simplicity of clamping to the maximum size supported by the host, negative
-  // offset will be applied via ndc_offset.
-  float left;
-  float top;
-  float width;
-  float height;
+  // Offset from render target UV = 0 to +UV.
+  // For simplicity of cropping to the maximum size on the host; to match the
+  // Direct3D 12 clipping / scissoring behavior with a fractional viewport, to
+  // floor(TopLeftXY) ... floor(TopLeftXY + WidthHeight), on the real AMD, Intel
+  // and Nvidia hardware (not WARP); as well as to hide the differences between
+  // 0 and 8+ viewportSubPixelBits on Vulkan, and to prevent any numerical error
+  // in bound checking in host APIs, viewport bounds are returned as integers.
+  // They're also returned as non-negative, again to make cropping easier (so
+  // Vulkan maxViewportDimensions and viewportBoundsRange don't have to be
+  // handled separately - maxViewportDimensions is greater than or equal to the
+  // largest framebuffer image size, so it's safe, and viewportBoundsRange is
+  // always bigger than maxViewportDimensions). All fractional offsetting,
+  // including the half-pixel offset, and cropping are handled via ndc_scale and
+  // ndc_offset.
+  uint32_t xy_offset[2];
+  // Extent can be zero for an empty viewport - host APIs not supporting empty
+  // viewports need to use an empty scissor rectangle.
+  uint32_t xy_extent[2];
   float z_min;
   float z_max;
+  // The scale is applied before the offset (like using multiply-add).
   float ndc_scale[3];
   float ndc_offset[3];
 };
@@ -94,16 +106,17 @@ struct ViewportInfo {
 // host graphics APIs such as Direct3D 11+ and Vulkan, also forcing it to the
 // Direct3D clip space with 0...W Z rather than -W...W.
 void GetHostViewportInfo(const RegisterFile& regs, uint32_t resolution_scale,
-                         bool origin_bottom_left, float x_max, float y_max,
-                         bool allow_reverse_z, bool convert_z_to_float24,
-                         bool full_float24_in_0_to_1,
+                         bool origin_bottom_left, uint32_t x_max,
+                         uint32_t y_max, bool allow_reverse_z,
+                         bool convert_z_to_float24, bool full_float24_in_0_to_1,
+                         bool pixel_shader_writes_depth,
                          ViewportInfo& viewport_info_out);
 
 struct Scissor {
-  uint32_t left;
-  uint32_t top;
-  uint32_t width;
-  uint32_t height;
+  // Offset from render target UV = 0 to +UV.
+  uint32_t offset[2];
+  // Extent can be zero.
+  uint32_t extent[2];
 };
 void GetScissor(const RegisterFile& regs, Scissor& scissor_out);
 
diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc
index 154a64e86..5ea7e797e 100644
--- a/src/xenia/gpu/dxbc_shader_translator.cc
+++ b/src/xenia/gpu/dxbc_shader_translator.cc
@@ -1511,8 +1511,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
       // 20e4-as-32 conversion and with 0...1 to 0...0.5 float24 remapping.
       // Though 20e4 float depth can store values between 1 and 2, it's a very
       // unusual case. Direct3D 10+ SV_Depth, however, can accept any values,
-      // including specials, when the depth buffer is floating-point; but depth
-      // is clamped to the viewport bounds anyway.
+      // including specials, when the depth buffer is floating-point.
       is_clamped = true;
       break;
   }
diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h
index f513b2a22..3b749d654 100644
--- a/src/xenia/gpu/dxbc_shader_translator.h
+++ b/src/xenia/gpu/dxbc_shader_translator.h
@@ -771,11 +771,11 @@ class DxbcShaderTranslator : public ShaderTranslator {
     return !is_depth_only_pixel_shader_ && !current_shader().writes_depth() &&
            !current_shader().is_valid_memexport_used();
   }
-  // Converts the depth value to 24-bit (storing the result in bits 0:23 and
-  // zeros in 24:31, not creating room for stencil - since this may be involved
-  // in comparisons) according to the format specified in the system constants.
-  // Source and destination may be the same, temporary must be different than
-  // both.
+  // Converts the pre-clamped depth value to 24-bit (storing the result in bits
+  // 0:23 and zeros in 24:31, not creating room for stencil - since this may be
+  // involved in comparisons) according to the format specified in the system
+  // constants. Source and destination may be the same, temporary must be
+  // different than both.
   void ROV_DepthTo24Bit(uint32_t d24_temp, uint32_t d24_temp_component,
                         uint32_t d32_temp, uint32_t d32_temp_component,
                         uint32_t temp_temp, uint32_t temp_temp_component);
diff --git a/src/xenia/gpu/dxbc_shader_translator_om.cc b/src/xenia/gpu/dxbc_shader_translator_om.cc
index 38a6bf9a3..15984c8db 100644
--- a/src/xenia/gpu/dxbc_shader_translator_om.cc
+++ b/src/xenia/gpu/dxbc_shader_translator_om.cc
@@ -457,37 +457,10 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() {
 
     if (!i) {
       if (shader_writes_depth) {
-        // Clamp oDepth to the lower viewport depth bound (depth clamp happens
-        // after the pixel shader in the pipeline, at least on Direct3D 11 and
-        // Vulkan, thus applies to the shader's depth output too).
-        system_constants_used_ |= 1ull << kSysConst_EdramDepthRange_Index;
-        a_.OpMax(dxbc::Dest::R(system_temp_depth_stencil_, 0b0001),
-                 dxbc::Src::R(system_temp_depth_stencil_, dxbc::Src::kXXXX),
-                 dxbc::Src::CB(cbuffer_index_system_constants_,
-                               uint32_t(CbufferRegister::kSystemConstants),
-                               kSysConst_EdramDepthRange_Vec)
-                     .Select(kSysConst_EdramDepthRangeOffset_Comp));
-        // Calculate the upper Z range bound to temp.x for clamping after
-        // biasing.
-        // temp.x = viewport maximum depth
-        system_constants_used_ |= 1ull << kSysConst_EdramDepthRange_Index;
-        a_.OpAdd(temp_x_dest,
-                 dxbc::Src::CB(cbuffer_index_system_constants_,
-                               uint32_t(CbufferRegister::kSystemConstants),
-                               kSysConst_EdramDepthRange_Vec)
-                     .Select(kSysConst_EdramDepthRangeOffset_Comp),
-                 dxbc::Src::CB(cbuffer_index_system_constants_,
-                               uint32_t(CbufferRegister::kSystemConstants),
-                               kSysConst_EdramDepthRange_Vec)
-                     .Select(kSysConst_EdramDepthRangeScale_Comp));
-        // Clamp oDepth to the upper viewport depth bound (already not above 1,
-        // but saturate for total safety).
-        // temp.x = free
-        a_.OpMin(dxbc::Dest::R(system_temp_depth_stencil_, 0b0001),
-                 dxbc::Src::R(system_temp_depth_stencil_, dxbc::Src::kXXXX),
-                 temp_x_src, true);
         // Convert the shader-generated depth to 24-bit, using temp.x as
-        // temporary.
+        // temporary. oDepth is already written by StoreResult with saturation,
+        // no need to clamp here. Adreno 200 doesn't have PA_SC_VPORT_ZMIN/ZMAX,
+        // so likely there's no need to clamp to the viewport depth bounds.
         ROV_DepthTo24Bit(system_temp_depth_stencil_, 0,
                          system_temp_depth_stencil_, 0, temp, 0);
       } else {
diff --git a/src/xenia/gpu/render_target_cache.cc b/src/xenia/gpu/render_target_cache.cc
index 72ed63e57..70935c3c9 100644
--- a/src/xenia/gpu/render_target_cache.cc
+++ b/src/xenia/gpu/render_target_cache.cc
@@ -22,6 +22,7 @@
 #include "xenia/base/logging.h"
 #include "xenia/base/math.h"
 #include "xenia/gpu/draw_util.h"
+#include "xenia/gpu/gpu_flags.h"
 #include "xenia/gpu/register_file.h"
 #include "xenia/gpu/registers.h"
 #include "xenia/gpu/xenos.h"
@@ -562,35 +563,32 @@ bool RenderTargetCache::Update(bool is_rasterization_done,
       GetRenderTargetHeight(pitch_tiles_at_32bpp, msaa_samples);
   int32_t window_y_offset =
       regs.Get<reg::PA_SC_WINDOW_OFFSET>().window_y_offset;
-  auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
-  if (pa_cl_vte_cntl.vport_y_scale_ena) {
+  if (!regs.Get<reg::PA_CL_CLIP_CNTL>().clip_disable) {
+    auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
     float viewport_bottom = 0.0f;
-    if (pa_cl_vte_cntl.vport_y_offset_ena) {
-      viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
-    }
+    // First calculate all the integer.0 or integer.5 offsetting exactly at full
+    // precision.
     if (regs.Get<reg::PA_SU_SC_MODE_CNTL>().vtx_window_offset_enable) {
       viewport_bottom += float(window_y_offset);
     }
-    viewport_bottom += std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32);
-    uint32_t viewport_bottom_fixed = uint32_t(std::max(
-        draw_util::FloatToD3D11Fixed16p8(viewport_bottom), int32_t(0)));
-    uint32_t viewport_bottom_pixels = viewport_bottom_fixed >> 8;
-    // Without MSAA, the center must be covered - according to the top-left
-    // rasterization rule, for the bottom, the test is exclusive. If the last
-    // row is included in the viewport only partially, check if its center is
-    // precisely potentially covered to round - to more safely catch, for
-    // example, if the game does something with the half-pixel offset through
-    // the viewport.
-    // With MSAA, it's less likely that the game will use the viewport to
-    // manipulate the half-pixel offset - different host implementations may
-    // also use different sample positions (up to the topmost row - possible to
-    // set such sample positions in PC APIs), so just check if the last row's
-    // area is at least slightly covered.
-    if ((viewport_bottom_fixed & uint32_t(0xFF)) >
-        uint32_t(msaa_samples != xenos::MsaaSamples::k1X ? 0 : 0x80)) {
-      ++viewport_bottom_pixels;
+    if (cvars::half_pixel_offset &&
+        !regs.Get<reg::PA_SU_VTX_CNTL>().pix_center) {
+      viewport_bottom += 0.5f;
     }
-    height_used = std::min(height_used, viewport_bottom_pixels);
+    // Then apply the floating-point viewport offset.
+    if (pa_cl_vte_cntl.vport_y_offset_ena) {
+      viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
+    }
+    viewport_bottom += pa_cl_vte_cntl.vport_y_scale_ena
+                           ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32)
+                           : 1.0f;
+    // Using floor, or, rather, truncation (it's maxed with zero anyway),
+    // similar to how viewport scissoring behaves on real AMD, Intel and Nvidia
+    // GPUs on Direct3D 12, also like in draw_util::GetHostViewportInfo.
+    // fmax to drop NaN and < 0, min as float (height_used is well below 2^24)
+    // to safely drop very large values.
+    height_used = uint32_t(
+        std::min(std::fmax(viewport_bottom, 0.0f), float(height_used)));
   }
   uint32_t scissor_bottom = regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>().br_y;
   if (!regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>().window_offset_disable) {