Merge branch 'master' of https://github.com/xenia-project/xenia into canary_experimental

2022-06-22 21:05:45 +02:00 · 2022-06-22 21:05:45 +02:00 · ce3b159683
parent e7a122d943 e9f129f67f
commit ce3b159683
3 changed files with 110 additions and 33 deletions
--- a/src/xenia/gpu/d3d12/pipeline_cache.cc
+++ b/src/xenia/gpu/d3d12/pipeline_cache.cc
@ -1435,16 +1435,8 @@ bool PipelineCache::GetCurrentStateDescription(
    float polygon_offset, polygon_offset_scale;
    draw_util::GetPreferredFacePolygonOffset(
        regs, primitive_polygonal, polygon_offset_scale, polygon_offset);
-    float polygon_offset_host_scale = draw_util::GetD3D10PolygonOffsetFactor(
-        regs.Get<reg::RB_DEPTH_INFO>().depth_format, true);
-    // Using ceil here just in case a game wants the offset but passes a value
-    // that is too small - it's better to apply more offset than to make depth
-    // fighting worse or to disable the offset completely (Direct3D 12 takes an
-    // integer value).
-    description_out.depth_bias =
-        int32_t(
-            std::ceil(std::abs(polygon_offset * polygon_offset_host_scale))) *
-        (polygon_offset < 0.0f ? -1 : 1);
+    description_out.depth_bias = draw_util::GetD3D10IntegerPolygonOffset(
+        regs.Get<reg::RB_DEPTH_INFO>().depth_format, polygon_offset);
    description_out.depth_bias_slope_scaled =
        polygon_offset_scale * xenos::kPolygonOffsetScaleSubpixelUnit;
  }
--- a/src/xenia/gpu/draw_util.h
+++ b/src/xenia/gpu/draw_util.h
@ -10,6 +10,7 @@
 #ifndef XENIA_GPU_DRAW_UTIL_H_
 #define XENIA_GPU_DRAW_UTIL_H_

+#include <cmath>
 #include <cstdint>
 #include <utility>

@ -113,29 +114,113 @@ extern const int8_t kD3D10StandardSamplePositions4x[4][2];

 reg::RB_DEPTHCONTROL GetNormalizedDepthControl(const RegisterFile& regs);

-constexpr float GetD3D10PolygonOffsetFactor(
-    xenos::DepthRenderTargetFormat depth_format, bool float24_as_0_to_0_5) {
-  if (depth_format == xenos::DepthRenderTargetFormat::kD24S8) {
-    return float(1 << 24);
+// Direct3D 9 and Xenos constant polygon offset is an absolute floating-point
+// value.
+// It's possibly treated just as an absolute offset by the Xenos too - the
+// PA_SU_POLY_OFFSET_DB_FMT_CNTL::POLY_OFFSET_DB_IS_FLOAT_FMT switch was added
+// later in the R6xx, though this needs verification.
+// 5454082B, for example, for float24, sets the bias to 0.000002 - or slightly
+// above 2^-19.
+// The total polygon offset formula specified by Direct3D 9 is:
+// `offset = slope * slope factor + constant offset`
+//
+// Direct3D 10, Metal, OpenGL and Vulkan, however, take the constant polygon
+// offset factor as a relative value, with the formula being:
+// `offset = slope * slope factor +
+//           maximum resolvable difference * constant factor`
+// where the maximum resolvable difference is:
+// - For a fixed-point depth buffer, the minimum representable non-zero value in
+//   the depth buffer, that is 1 / (2^24 - 1) for unorm24 (on Vulkan though it's
+//   allowed to be up to 2 / 2^24 in this case).
+// - For a floating-point depth buffer, it's:
+//   2 ^ (exponent of the maximum Z in the primitive - the number of explicitly
+//        stored mantissa bits)
+//   (23 explicitly stored bits for float32 - and 20 explicitly stored bits for
+//    float24).
+//
+// While the polygon offset is a fixed-function feature in the pipeline, and the
+// formula can't be toggled between absolute and relative, it's important that
+// Xenia translates the guest absolute depth bias into the host relative depth
+// bias in a way that the values that separate coplanar geometry on the guest
+// qualitatively also still correctly separate them on the host.
+//
+// It also should be taken into account that on Xenia, float32 depth values may
+// be snapped to float24 directly in the translated pixel shaders (to prevent
+// data loss if after reuploading a depth buffer to the EDRAM there's no way to
+// recover the full-precision value, that results in the inability to perform
+// more rendering passes for the same geometry), and not only to the nearest
+// value, but also just truncating them (so in case of data loss, the "greater
+// or equal" depth test function still works).
+//
+// Because of this, the depth bias may be lost if Xenia translates it into a too
+// small value, and the conversion of the depth is done in the pixel shader.
+// Specifically, Xenia should not simply convert a value that separates coplanar
+// primitives as float24 just into something that still separates them as
+// float32. Essentially, if conversion to float24 is done in the pixel shader,
+// Xenia should make sure the polygon offset on the host is calculated as if the
+// host had float24 depth too, not float32.
+
+// Applies to both native host unorm24, and unorm24 emulated as host float32.
+// For native unorm24, this is exactly the inverse of the minimum representable
+// non-zero value.
+// For unorm24 emulated as float32, the minimum representable non-zero value for
+// a primitive in the [0.5, 1) range (the worst case that forward depth reaches
+// very quickly, at nearly `2 * near clipping plane distance`) is 2 ^ (-1 - 23),
+// or 2^-24, and this factor is almost 2^24.
+constexpr float kD3D10PolygonOffsetFactorUnorm24 =
+    float((UINT32_C(1) << 24) - 1);
+
+// For a host floating-point depth buffer, the integer value of the depth bias
+// is roughly how many ULPs primitives should be separated by.
+//
+// Float24, however, has 3 mantissa bits fewer than float32 - so one float24 ULP
+// corresponds to 8 float32 ULPs - which means each conceptual "layer" of the
+// guest value should correspond to a polygon offset of 8. So, after the guest
+// absolute value is converted to "layers", it should be multiplied by 8 before
+// being used on the host with a float32 depth buffer.
+//
+// The scale for converting the guest absolute depth bias to the "layers" needs
+// to be determined for the worst case - specifically, the [0.5, 1) range (1 is
+// a single value, so there's no need to take it into consideration). In this
+// range, Z values have the exponent of -1. Therefore, for float24, the absolute
+// offset is obtained from the "layer index" in this range as (disregarding the
+// slope term):
+// offset = 2 ^ (-1 - 20) * constant factor
+// Thus, to obtain the constant factor from the absolute offset in the range
+// with the lowest absolute precision, the offset needs to be multiplied by
+// 2^21.
+//
+// Finally, the 0...0.5 range may be used on the host to represent the 0...1
+// guest depth range to be able to copy all possible encodings, which are
+// [0, 2), via a [0, 1] depth output variable, during EDRAM contents
+// reinterpretation. This is done by scaling the viewport depth bounds by 0.5.
+// However, there's no need to do anything to handle this scenario in the
+// polygon offset - it's calculated after applying the viewport transformation,
+// and the maximum Z value in the primitive will have an exponent lowered by 1,
+// thus the result will also have an exponent lowered by 1 - exactly what's
+// needed for remapping 0...1 to 0...0.5.
+constexpr float kD3D10PolygonOffsetFactorFloat24 =
+    float(UINT32_C(1) << (21 + 3));
+
+inline int32_t GetD3D10IntegerPolygonOffset(
+    xenos::DepthRenderTargetFormat depth_format, float polygon_offset) {
+  bool is_float24 = depth_format == xenos::DepthRenderTargetFormat::kD24FS8;
+  // Using `ceil` because more offset is better, especially if flooring would
+  // result in 0 - conceptually, if the offset is used at all, primitives need
+  // to be separated in the depth buffer.
+  int32_t polygon_offset_int = int32_t(
+      std::ceil(std::abs(polygon_offset) *
+                (is_float24 ? kD3D10PolygonOffsetFactorFloat24 * (1.0f / 8.0f)
+                            : kD3D10PolygonOffsetFactorUnorm24)));
+  // For float24, the conversion may be done in the translated pixel shaders,
+  // including via truncation rather than rounding to the nearest. So, making
+  // the integer bias always in the increments of 2^3 (2 ^ the difference in the
+  // mantissa bit count between float32 and float24), and because of that, doing
+  // `ceil` before changing the units from float24 ULPs to float32 ULPs.
+  if (is_float24) {
+    polygon_offset_int <<= 3;
  }
-  // 20 explicit + 1 implicit (1.) mantissa bits.
-  // 2^20 is not enough for 415607E6 retail version's training mission shooting
-  // range floor (with the number 1) on Direct3D 12. Tested on Nvidia GeForce
-  // GTX 1070, the exact formula (taking into account the 0...1 to 0...0.5
-  // remapping described below) used for testing is
-  // `int(ceil(offset * 2^20 * 0.5)) * sign(offset)`. With 2^20 * 0.5, there
-  // are various kinds of stripes dependending on the view angle in that
-  // location. With 2^21 * 0.5, the issue is not present.
-  constexpr float kFloat24Scale = float(1 << 21);
-  // 0...0.5 range may be used on the host to represent the 0...1 guest depth
-  // range to be able to copy all possible encodings, which are [0, 2), via a
-  // [0, 1] depth output variable, during EDRAM contents reinterpretation.
-  // This is done by scaling the viewport depth bounds by 0.5. However, the
-  // depth bias is applied after the viewport. This adjustment is only needed
-  // for the constant bias - for slope-scaled, the derivatives of Z are
-  // calculated after the viewport as well, and will already include the 0.5
-  // scaling from the viewport.
-  return float24_as_0_to_0_5 ? kFloat24Scale * 0.5f : kFloat24Scale;
+  return polygon_offset < 0 ? -polygon_offset_int : polygon_offset_int;
 }

 // For hosts not supporting separate front and back polygon offsets, returns the
--- a/src/xenia/gpu/xenos.cc
+++ b/src/xenia/gpu/xenos.cc
@ -146,7 +146,7 @@ uint32_t Float32To20e4(float f32, bool round_to_nearest_even) {
    f32u32 += 0xC8000000u;
  }
  if (round_to_nearest_even) {
-    f32u32 += f32u32 + 3 + ((f32u32 >> 3) & 1);
+    f32u32 += 3 + ((f32u32 >> 3) & 1);
  }
  return (f32u32 >> 3) & 0xFFFFFF;
 }