Merge branch 'master' into vulkan

This commit is contained in:
Triang3l 2022-06-22 13:15:50 +03:00
commit 0d8bd0e0c6
14 changed files with 213 additions and 185 deletions

View File

@ -2239,18 +2239,13 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
// Get dynamic rasterizer state.
uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x();
uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y();
RenderTargetCache::DepthFloat24Conversion depth_float24_conversion =
render_target_cache_->depth_float24_conversion();
draw_util::ViewportInfo viewport_info;
draw_util::GetHostViewportInfo(
regs, draw_resolution_scale_x, draw_resolution_scale_y, true,
D3D12_VIEWPORT_BOUNDS_MAX, D3D12_VIEWPORT_BOUNDS_MAX, false,
normalized_depth_control,
host_render_targets_used &&
(depth_float24_conversion ==
RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating ||
depth_float24_conversion ==
RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding),
render_target_cache_->depth_float24_convert_in_pixel_shader(),
host_render_targets_used, pixel_shader && pixel_shader->writes_depth(),
viewport_info);
draw_util::Scissor scissor;

View File

@ -457,7 +457,9 @@ bool D3D12RenderTargetCache::Initialize() {
gamma_render_target_as_srgb_ = cvars::gamma_render_target_as_srgb;
depth_float24_conversion_ = GetConfigDepthFloat24Conversion();
depth_float24_round_ = cvars::depth_float24_round;
depth_float24_convert_in_pixel_shader_ =
cvars::depth_float24_convert_in_pixel_shader;
// Check if 2x MSAA is supported or needs to be emulated with 4x MSAA
// instead.
@ -1013,8 +1015,9 @@ bool D3D12RenderTargetCache::Initialize() {
// Blending is done in linear space directly in shaders.
gamma_render_target_as_srgb_ = false;
// Always true float24 depth.
depth_float24_conversion_ = DepthFloat24Conversion::kOnOutputRounding;
// Always true float24 depth rounded to the nearest even.
depth_float24_round_ = true;
depth_float24_convert_in_pixel_shader_ = true;
// Only ForcedSampleCount, which doesn't support 2x.
msaa_2x_supported_ = false;
@ -2091,7 +2094,7 @@ RenderTargetCache::RenderTarget* D3D12RenderTargetCache::CreateRenderTarget(
bool D3D12RenderTargetCache::IsHostDepthEncodingDifferent(
xenos::DepthRenderTargetFormat format) const {
if (format == xenos::DepthRenderTargetFormat::kD24FS8) {
return depth_float24_conversion_ == DepthFloat24Conversion::kOnCopy;
return !depth_float24_convert_in_pixel_shader_;
}
return false;
}
@ -3542,8 +3545,13 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
} break;
case xenos::DepthRenderTargetFormat::kD24FS8: {
// Convert using r1.y as temporary.
DxbcShaderTranslator::PreClampedDepthTo20e4(a, i, 3, i, 3, 1, 1,
true);
// When converting the depth in pixel shaders, it's always exact,
// truncating not to insert additional rounding instructions.
DxbcShaderTranslator::PreClampedDepthTo20e4(
a, i, 3, i, 3, 1, 1,
!depth_float24_convert_in_pixel_shader() &&
depth_float24_round(),
true);
} break;
}
// Merge depth and stencil into r0/r1.x.
@ -3729,8 +3737,13 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
} break;
case xenos::DepthRenderTargetFormat::kD24FS8: {
// Convert using r1.y as temporary.
DxbcShaderTranslator::PreClampedDepthTo20e4(a, 1, 3, 1, 3, 1, 1,
true);
// When converting the depth in pixel shaders, it's always exact,
// truncating not to insert additional rounding instructions.
DxbcShaderTranslator::PreClampedDepthTo20e4(
a, 1, 3, 1, 3, 1, 1,
!depth_float24_convert_in_pixel_shader() &&
depth_float24_round(),
true);
} break;
}
if (dest_is_color) {
@ -4105,8 +4118,14 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
dxbc::Src::R(0, dxbc::Src::kYYYY));
} break;
case xenos::DepthRenderTargetFormat::kD24FS8: {
DxbcShaderTranslator::PreClampedDepthTo20e4(a, 0, 1, 0, 0, 0, 2,
true);
// When converting the depth in pixel shaders, it's always
// exact, truncating not to insert additional rounding
// instructions.
DxbcShaderTranslator::PreClampedDepthTo20e4(
a, 0, 1, 0, 0, 0, 2,
!depth_float24_convert_in_pixel_shader() &&
depth_float24_round(),
true);
} break;
}
a.OpIEq(dxbc::Dest::R(0, 0b0010), dxbc::Src::R(0, dxbc::Src::kYYYY),
@ -6167,7 +6186,12 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline(
case xenos::DepthRenderTargetFormat::kD24FS8:
// Convert to [0, 2) float24 from [0, 1) float32, using r0.x as
// temporary.
DxbcShaderTranslator::PreClampedDepthTo20e4(a, 1, 0, 1, 0, 0, 0, true);
// When converting the depth in pixel shaders, it's always exact,
// truncating not to insert additional rounding instructions.
DxbcShaderTranslator::PreClampedDepthTo20e4(
a, 1, 0, 1, 0, 0, 0,
!depth_float24_convert_in_pixel_shader() && depth_float24_round(),
true);
break;
}
// Combine 24-bit depth and stencil into r1.x.

View File

@ -107,8 +107,9 @@ class D3D12RenderTargetCache final : public RenderTargetCache {
!cvars::snorm16_render_target_full_range;
}
DepthFloat24Conversion depth_float24_conversion() const {
return depth_float24_conversion_;
bool depth_float24_round() const { return depth_float24_round_; }
bool depth_float24_convert_in_pixel_shader() const {
return depth_float24_convert_in_pixel_shader_;
}
DXGI_FORMAT GetColorResourceDXGIFormat(
@ -720,8 +721,8 @@ class D3D12RenderTargetCache final : public RenderTargetCache {
bool gamma_render_target_as_srgb_ = false;
DepthFloat24Conversion depth_float24_conversion_ =
DepthFloat24Conversion::kOnCopy;
bool depth_float24_round_ = false;
bool depth_float24_convert_in_pixel_shader_ = false;
bool msaa_2x_supported_ = false;

View File

@ -882,20 +882,14 @@ PipelineCache::GetCurrentPixelShaderModification(
RenderTargetCache::Path::kHostRenderTargets) {
using DepthStencilMode =
DxbcShaderTranslator::Modification::DepthStencilMode;
RenderTargetCache::DepthFloat24Conversion depth_float24_conversion =
render_target_cache_.depth_float24_conversion();
if ((depth_float24_conversion ==
RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating ||
depth_float24_conversion ==
RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding) &&
if (render_target_cache_.depth_float24_convert_in_pixel_shader() &&
normalized_depth_control.z_enable &&
regs.Get<reg::RB_DEPTH_INFO>().depth_format ==
xenos::DepthRenderTargetFormat::kD24FS8) {
modification.pixel.depth_stencil_mode =
depth_float24_conversion ==
RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating
? DepthStencilMode::kFloat24Truncating
: DepthStencilMode::kFloat24Rounding;
render_target_cache_.depth_float24_round()
? DepthStencilMode::kFloat24Rounding
: DepthStencilMode::kFloat24Truncating;
} else {
if (shader.implicit_early_z_write_allowed() &&
(!shader.writes_color_target(0) ||
@ -2917,20 +2911,16 @@ ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline(
state_desc.PS.pShaderBytecode = depth_only_pixel_shader_.data();
state_desc.PS.BytecodeLength = depth_only_pixel_shader_.size();
} else {
if ((description.depth_func != xenos::CompareFunction::kAlways ||
if (render_target_cache_.depth_float24_convert_in_pixel_shader() &&
(description.depth_func != xenos::CompareFunction::kAlways ||
description.depth_write) &&
description.depth_format == xenos::DepthRenderTargetFormat::kD24FS8) {
switch (render_target_cache_.depth_float24_conversion()) {
case RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating:
state_desc.PS.pShaderBytecode = shaders::float24_truncate_ps;
state_desc.PS.BytecodeLength = sizeof(shaders::float24_truncate_ps);
break;
case RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding:
state_desc.PS.pShaderBytecode = shaders::float24_round_ps;
state_desc.PS.BytecodeLength = sizeof(shaders::float24_round_ps);
break;
default:
break;
if (render_target_cache_.depth_float24_round()) {
state_desc.PS.pShaderBytecode = shaders::float24_round_ps;
state_desc.PS.BytecodeLength = sizeof(shaders::float24_round_ps);
} else {
state_desc.PS.pShaderBytecode = shaders::float24_truncate_ps;
state_desc.PS.BytecodeLength = sizeof(shaders::float24_truncate_ps);
}
}
}

View File

@ -532,8 +532,10 @@ void GetHostViewportInfo(const RegisterFile& regs,
// interpolated Z instead if conversion can't be done exactly, without
// modifying clipping bounds by adjusting Z in vertex shaders, as that
// may cause polygons placed explicitly at Z = 0 or Z = W to be clipped.
z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min));
z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max));
// Rounding the bounds to the nearest even regardless of the depth
// rounding mode not to add even more error by truncating twice.
z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min, true));
z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max, true));
}
if (full_float24_in_0_to_1) {
// Remap the full [0...2) float24 range to [0...1) to support data round-trip

View File

@ -533,13 +533,14 @@ class DxbcShaderTranslator : public ShaderTranslator {
uint32_t temp2_temp_component);
// Converts the depth value externally clamped to the representable [0, 2)
// range to 20e4 floating point, with zeros in bits 24:31, rounding to the
// nearest even. Source and destination may be the same, temporary must be
// different than both. If remap_from_0_to_0_5 is true, it's assumed that
// 0...1 is pre-remapped to 0...0.5 in the input.
// nearest even or towards zero. Source and destination may be the same,
// temporary must be different than both. If remap_from_0_to_0_5 is true, it's
// assumed that 0...1 is pre-remapped to 0...0.5 in the input.
static void PreClampedDepthTo20e4(
dxbc::Assembler& a, uint32_t f24_temp, uint32_t f24_temp_component,
uint32_t f32_temp, uint32_t f32_temp_component, uint32_t temp_temp,
uint32_t temp_temp_component, bool remap_from_0_to_0_5);
uint32_t temp_temp_component, bool round_to_nearest_even,
bool remap_from_0_to_0_5);
// Converts the 20e4 number in bits [f24_shift, f24_shift + 10) to a 32-bit
// float. Two temporaries must be different, but one can be the same as the
// source. The destination may be anything writable. If remap_to_0_to_0_5 is

View File

@ -1921,7 +1921,7 @@ void DxbcShaderTranslator::CompletePixelShader_DSV_DepthTo24Bit() {
} else {
// Properly convert to 20e4, with rounding to the nearest even (the bias was
// pre-applied by multiplying by 2), then convert back restoring the bias.
PreClampedDepthTo20e4(a_, temp, 0, temp, 0, temp, 1, false);
PreClampedDepthTo20e4(a_, temp, 0, temp, 0, temp, 1, true, false);
Depth20e4To32(a_, dxbc::Dest::ODepth(), temp, 0, 0, temp, 0, temp, 1, true);
}
@ -3217,7 +3217,8 @@ void DxbcShaderTranslator::Float7e3To32(
void DxbcShaderTranslator::PreClampedDepthTo20e4(
dxbc::Assembler& a, uint32_t f24_temp, uint32_t f24_temp_component,
uint32_t f32_temp, uint32_t f32_temp_component, uint32_t temp_temp,
uint32_t temp_temp_component, bool remap_from_0_to_0_5) {
uint32_t temp_temp_component, bool round_to_nearest_even,
bool remap_from_0_to_0_5) {
assert_true(temp_temp != f24_temp ||
temp_temp_component != f24_temp_component);
assert_true(temp_temp != f32_temp ||
@ -3268,13 +3269,18 @@ void DxbcShaderTranslator::PreClampedDepthTo20e4(
// Close the denormal check.
a.OpEndIf();
// Build the 20e4 number.
// temp = (biased_f32 >> 3) & 1
a.OpUBFE(temp_dest, dxbc::Src::LU(1), dxbc::Src::LU(3), f24_src);
// f24 = biased_f32 + 3
a.OpIAdd(f24_dest, f24_src, dxbc::Src::LU(3));
// f24 = biased_f32 + 3 + ((biased_f32 >> 3) & 1)
a.OpIAdd(f24_dest, f24_src, temp_src);
if (round_to_nearest_even) {
// temp = (biased_f32 >> 3) & 1
a.OpUBFE(temp_dest, dxbc::Src::LU(1), dxbc::Src::LU(3), f24_src);
// f24 = biased_f32 + 3
a.OpIAdd(f24_dest, f24_src, dxbc::Src::LU(3));
// f24 = biased_f32 + 3 + ((biased_f32 >> 3) & 1)
a.OpIAdd(f24_dest, f24_src, temp_src);
}
// For rounding to the nearest even:
// f24 = ((biased_f32 + 3 + ((biased_f32 >> 3) & 1)) >> 3) & 0xFFFFFF
// For rounding towards zero:
// f24 = (biased_f32 >> 3) & 0xFFFFFF
a.OpUBFE(f24_dest, dxbc::Src::LU(24), dxbc::Src::LU(3), f24_src);
}
@ -3377,7 +3383,7 @@ void DxbcShaderTranslator::ROV_DepthTo24Bit(uint32_t d24_temp,
// 20e4 conversion.
PreClampedDepthTo20e4(a_, d24_temp, d24_temp_component, d32_temp,
d32_temp_component, temp_temp, temp_temp_component,
false);
true, false);
}
a_.OpElse();
{

View File

@ -39,42 +39,109 @@ DEFINE_bool(
"reduce bandwidth usage during transfers as the previous depth won't need "
"to be read.",
"GPU");
// The round trip is done, in particular, in 545407F2.
DEFINE_string(
depth_float24_conversion, "",
"Method for converting 32-bit Z values to 20e4 floating point when using "
"host depth buffers without native 20e4 support (when not using rasterizer-"
"ordered views / fragment shader interlocks to perform depth testing "
"manually).\n"
"Use: [any, on_copy, truncate, round]\n"
" on_copy:\n"
" Do depth testing at host precision, converting when copying between "
"color and depth buffers (or between depth buffers of different formats) "
"to support reinterpretation, but keeps the last host depth buffer used "
"for each EDRAM range and reloads the host precision value if it's still "
"up to date after the EDRAM range was used with a different pixel format.\n"
" + Highest performance, allows early depth test and writing.\n"
" + Host MSAA is possible with pixel-rate shading where supported.\n"
" - EDRAM > RAM > EDRAM depth buffer round trip done in certain games "
"destroys precision irreparably, causing artifacts if another rendering "
"pass is done after the EDRAM reupload.\n"
" truncate:\n"
" Convert to 20e4 directly in pixel shaders, always rounding down.\n"
" + Average performance, conservative early depth test is possible.\n"
" + No precision loss when anything changes in the storage of the depth "
"buffer, EDRAM > RAM > EDRAM copying preserves precision.\n"
" - Rounding mode is incorrect, sometimes giving results smaller than "
"they should be - may cause inaccuracy especially in edge cases when the "
"game wants to write an exact value.\n"
" - Host MSAA is only possible at SSAA speed, with per-sample shading.\n"
" round:\n"
" Convert to 20e4 directly in pixel shaders, correctly rounding to the "
"nearest even.\n"
" + Highest accuracy.\n"
" - Significantly limited performance, early depth test is not possible.\n"
" - Host MSAA is only possible at SSAA speed, with per-sample shading.\n"
" Any other value:\n"
" Choose what is considered the most optimal (currently \"on_copy\").",
// Lossless round trip: 545407F2.
// Lossy round trip with the "greater or equal" test afterwards: 4D530919.
// Lossy round trip with the "equal" test afterwards: 535107F5, 565507EF.
DEFINE_bool(
depth_float24_round, false,
"Whether to round to the nearest even, rather than truncating (rounding "
"towards zero), the depth when converting it to 24-bit floating-point "
"(20e4) from the host precision (32-bit floating point) when using a host "
"depth buffer.\n"
"false:\n"
" Recommended.\n"
" The conversion may move the depth values farther away from the camera.\n"
" Without depth_float24_convert_in_pixel_shader:\n"
" The \"greater or equal\" depth test function continues to work fine if "
"the full host precision depth data is lost, it's still possible to draw "
"another pass of the same geometry with it.\n"
" (See the description of depth_float24_convert_in_pixel_shader for more "
"information about full precision depth data loss.)\n"
" With depth_float24_convert_in_pixel_shader:\n"
" Faster - the pixel shader for hidden surfaces may still be skipped "
"(using conservative depth output).\n"
"true:\n"
" Only for special cases of issues caused by minor 32-bit floating-point "
"rounding errors, for instance, when the game tries to draw something at "
"the camera plane by setting Z of the vertex position to W.\n"
" The conversion may move the depth values closer or farther.\n"
" Using the same rounding mode as in the Direct3D 9 reference rasterizer.\n"
" Without depth_float24_convert_in_pixel_shader:\n"
" Not possible to recover from a full host precision depth data loss - in "
"subsequent passes of rendering the same geometry, half of the samples "
"will be failing the depth test with the \"greater or equal\" depth test "
"function.\n"
" With depth_float24_convert_in_pixel_shader:\n"
" Slower - depth rejection before the pixel shader is not possible.\n"
"When the depth buffer is emulated in software (via the fragment shader "
"interlock / rasterizer-ordered view), this is ignored, and rounding to "
"the nearest even is always done.",
"GPU");
// With MSAA, when converting the depth in pixel shaders, they must run at
// sample frequency - otherwise, if the depth is the same for the entire pixel,
// intersections of polygons cannot be antialiased.
//
// Important usage note: When using this mode, bounds of the fixed-function
// viewport must be converted to and back from float24 too (preferably using
// rounding to the nearest even regardless of whether truncation was requested
// for the values, to reduce the error already caused by truncation rather than
// to amplify it). This ensures that clamping to the viewport bounds, which
// happens after the pixel shader even if it overwrites the resulting depth, is
// never done to a value not representable as float24 (for example, if the
// minimum Z is a number too small to be represented as float24, but not zero,
// it won't be possible to write what should become 0x000000 to the depth
// buffer). Note that this may add some error to the depth values from the
// rasterizer; however, modifying Z in the vertex shader to make interpolated
// depth values would cause clipping to be done to different bounds, which may
// be more undesirable, especially in cases when Z is explicitly set to a value
// like 0 or W (in such cases, the adjusted polygon may go outside 0...W in clip
// space and disappear).
//
// If false, doing the depth test at the host precision, converting to 20e4 to
// support reinterpretation, but keeping track of both the last color (or
// non-20e4 depth) value (let's call it stored_f24) and the last host depth
// value (stored_host) for each EDRAM pixel, reloading the last host depth value
// if stored_f24 == to_f24(stored_host) (otherwise it was overwritten by
// something else, like clearing, or an actually used color buffer; this is
// inexact though, and will incorrectly load pixels that were overwritten by
// something else in the EDRAM, but turned out to have the same value on the
// guest as before - an outdated host-precision value will be loaded in these
// cases instead).
DEFINE_bool(
depth_float24_convert_in_pixel_shader, false,
"Whether to convert the depth values to 24-bit floating-point (20e4) from "
"the host precision (32-bit floating point) directly in the pixel shaders "
"of guest draws when using a host depth buffer.\n"
"This prevents visual artifacts (interleaved stripes of parts of surfaces "
"rendered and parts not rendered, having either the same width in case of "
"the \"greater or equal\" depth test function, or the former being much "
"thinner than the latter with the \"equal\" function) if the full host "
"precision depth data is lost.\n"
"This issue may happen if the game reloads the depth data previously "
"evicted from the EDRAM to the RAM back to the EDRAM, but the EDRAM region "
"that previously contained that depth buffer was overwritten by another "
"depth buffer, or the game loads it to a different location in the EDRAM "
"than it was previously placed at, thus Xenia is unable to restore the "
"depth data with the original precision, and instead falls back to "
"converting the lower-precision values, so in subsequent rendering passes "
"for the same geometry, the actual depth values of the surfaces don't "
"match those stored in the depth buffer anymore.\n"
"This is a costly option because it makes the GPU unable to use depth "
"buffer compression, and also with MSAA, forces the pixel shader to run "
"for every subpixel sample rather than for the entire pixel, making pixel "
"shading 2 or 4 times heavier depending on the MSAA sample count.\n"
"The rounding direction is controlled by the depth_float24_round "
"configuration variable.\n"
"Note that with depth_float24_round = true, this becomes even more costly "
"because pixel shaders must be executed regardless of whether the surface "
"is behind the previously drawn surfaces. With depth_float24_round = "
"false, conservative depth output is used, however, so depth rejection "
"before the pixel shader may still work.\n"
"If sample-rate shading is not supported by the host GPU, the conversion "
"in the pixel shader is done only when MSAA is not used.\n"
"When the depth buffer is emulated in software (via the fragment shader "
"interlock / rasterizer-ordered view), this is ignored because 24-bit "
"depth is always used directly.",
"GPU");
DEFINE_bool(
draw_resolution_scaled_texture_offsets, true,
@ -790,17 +857,6 @@ uint32_t RenderTargetCache::GetLastUpdateBoundRenderTargets(
return rts_used;
}
RenderTargetCache::DepthFloat24Conversion
RenderTargetCache::GetConfigDepthFloat24Conversion() {
if (cvars::depth_float24_conversion == "truncate") {
return DepthFloat24Conversion::kOnOutputTruncating;
}
if (cvars::depth_float24_conversion == "round") {
return DepthFloat24Conversion::kOnOutputRounding;
}
return DepthFloat24Conversion::kOnCopy;
}
uint32_t RenderTargetCache::GetRenderTargetHeight(
uint32_t pitch_tiles_at_32bpp, xenos::MsaaSamples msaa_samples) const {
if (!pitch_tiles_at_32bpp) {

View File

@ -29,6 +29,8 @@
#include "xenia/gpu/xenos.h"
DECLARE_bool(depth_transfer_not_equal_test);
DECLARE_bool(depth_float24_round);
DECLARE_bool(depth_float24_convert_in_pixel_shader);
DECLARE_bool(draw_resolution_scaled_texture_offsets);
DECLARE_bool(gamma_render_target_as_srgb);
DECLARE_bool(native_2x_msaa);
@ -89,60 +91,6 @@ class RenderTargetCache {
kPixelShaderInterlock,
};
enum class DepthFloat24Conversion {
// Doing depth test at the host precision, converting to 20e4 to support
// reinterpretation, but keeping track of both the last color (or non-20e4
// depth) value (let's call it stored_f24) and the last host depth value
// (stored_host) for each EDRAM pixel, reloading the last host depth value
// if stored_f24 == to_f24(stored_host) (otherwise it was overwritten by
// something else, like clearing, or an actually used color buffer; this is
// inexact though, and will incorrectly load pixels that were overwritten by
// something else in the EDRAM, but turned out to have the same value on the
// guest as before - an outdated host-precision value will be loaded in
// these cases instead).
//
// EDRAM > RAM, then reusing the EDRAM region for something else > EDRAM
// round trip destroys precision beyond repair.
//
// Full host early Z and MSAA with pixel-rate shading are supported.
kOnCopy,
// Converting the depth to the closest host value representable exactly as a
// 20e4 float in pixel shaders, to support invariance in cases when the
// guest reuploads a previously resolved depth buffer to the EDRAM, rounding
// towards zero (which contradicts the rounding used by the Direct3D 9
// reference rasterizer, but allows less-than-or-equal pixel shader depth
// output to be used to preserve most of early Z culling when the game is
// using reversed depth, which is the usual way of doing depth testing on
// the Xbox 360 and of utilizing the advantages of a floating-point
// encoding).
//
// With MSAA, pixel shaders must run at sample frequency - otherwise, if the
// depth is the same for the entire pixel, intersections of polygons cannot
// be antialiased.
//
// Important usage note: When using this mode, bounds of the fixed-function
// viewport must be converted to and back from float24 too (preferably using
// correct rounding to the nearest even, to reduce the error already caused
// by truncation rather than to amplify it). This ensures that clamping to
// the viewport bounds, which happens after the pixel shader even if it
// overwrites the resulting depth, is never done to a value not
// representable as float24 (for example, if the minimum Z is a number too
// small to be represented as float24, but not zero, it won't be possible to
// write what should become 0x000000 to the depth buffer). Note that this
// may add some error to the depth values from the rasterizer; however,
// modifying Z in the vertex shader to make interpolated depth values would
// cause clipping to be done to different bounds, which may be more
// undesirable, especially in cases when Z is explicitly set to a value like
// 0 or W (in such cases, the adjusted polygon may go outside 0...W in clip
// space and disappear).
kOnOutputTruncating,
// Similar to kOnOutputTruncating, but rounding to the nearest even, more
// correctly, however, because the resulting depth can be bigger than the
// original host value, early depth testing can't be used at all. Same
// viewport usage rules apply.
kOnOutputRounding,
};
// Useful host-specific values.
// sRGB conversion from the Direct3D 11.3 functional specification.
static constexpr float kSrgbToLinearDenominator1 = 12.92f;
@ -512,8 +460,6 @@ class RenderTargetCache {
}
};
static DepthFloat24Conversion GetConfigDepthFloat24Conversion();
virtual uint32_t GetMaxRenderTargetWidth() const = 0;
virtual uint32_t GetMaxRenderTargetHeight() const = 0;

View File

@ -53,9 +53,9 @@ ushr [precise(y)] r0.y, r0.y, r0.z
ult [precise(z)] r0.z, r0.x, l(0x38800000)
iadd [precise(x)] r0.x, r0.x, l(0xc8000000)
movc [precise(x)] r0.x, r0.z, r0.y, r0.x
iadd [precise(y)] r0.y, r0.x, l(3)
ubfe [precise(x)] r0.x, l(1), l(3), r0.x
iadd [precise(x)] r0.x, r0.x, r0.y
ubfe [precise(y)] r0.y, l(1), l(3), r0.x
iadd [precise(x)] r0.x, r0.y, r0.x
iadd [precise(x)] r0.x, r0.x, l(3)
ubfe [precise(xyz)] r0.xyz, l(24, 20, 4, 0), l(3, 3, 23, 0), r0.xxxx
firstbit_hi [precise(w)] r0.w, r0.y
iadd [precise(w)] r0.w, r0.w, l(-11)
@ -76,10 +76,10 @@ ret
const BYTE float24_round_ps[] =
{
68, 88, 66, 67, 229, 54,
46, 1, 194, 31, 164, 202,
193, 71, 175, 129, 44, 52,
218, 154, 1, 0, 0, 0,
68, 88, 66, 67, 110, 79,
84, 202, 151, 165, 237, 180,
64, 17, 0, 132, 236, 126,
142, 105, 1, 0, 0, 0,
8, 7, 0, 0, 5, 0,
0, 0, 52, 0, 0, 0,
160, 0, 0, 0, 120, 2,
@ -259,22 +259,22 @@ const BYTE float24_round_ps[] =
0, 0, 0, 0, 26, 0,
16, 0, 0, 0, 0, 0,
10, 0, 16, 0, 0, 0,
0, 0, 30, 0, 16, 7,
0, 0, 138, 0, 16, 9,
34, 0, 16, 0, 0, 0,
0, 0, 10, 0, 16, 0,
0, 0, 0, 0, 1, 64,
0, 0, 3, 0, 0, 0,
138, 0, 8, 9, 18, 0,
16, 0, 0, 0, 0, 0,
1, 64, 0, 0, 1, 0,
0, 0, 1, 64, 0, 0,
3, 0, 0, 0, 10, 0,
1, 0, 0, 0, 1, 64,
0, 0, 3, 0, 0, 0,
10, 0, 16, 0, 0, 0,
0, 0, 30, 0, 8, 7,
18, 0, 16, 0, 0, 0,
0, 0, 26, 0, 16, 0,
0, 0, 0, 0, 10, 0,
16, 0, 0, 0, 0, 0,
30, 0, 8, 7, 18, 0,
16, 0, 0, 0, 0, 0,
10, 0, 16, 0, 0, 0,
0, 0, 26, 0, 16, 0,
0, 0, 0, 0, 138, 0,
0, 0, 1, 64, 0, 0,
3, 0, 0, 0, 138, 0,
56, 15, 114, 0, 16, 0,
0, 0, 0, 0, 2, 64,
0, 0, 24, 0, 0, 0,

View File

@ -12,5 +12,6 @@ precise float main(XePSInput xe_input) : SV_Depth {
// allow for safe reinterpretation of any 24-bit value to and from float24
// depth using depth output without unrestricted depth range.
return asfloat(XeFloat20e4To32(
XeFloat32To20e4(asuint(saturate(xe_input.position.z * 2.0f))), true));
XeFloat32To20e4(asuint(saturate(xe_input.position.z * 2.0f)), true),
true));
}

View File

@ -587,14 +587,17 @@ xesl_uint4 XeRG16SNormToRG16Float(xesl_uint4 packed_texels) {
// 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2).
// We also can't clamp the stored value to 1 as load->store->load must be exact.
uint XeFloat32To20e4(uint f32u32) {
uint XeFloat32To20e4(uint f32u32, bool round_to_nearest_even) {
// Keep only positive (high bit set means negative for both float and int) and
// saturate to the maximum representable value near 2 (also dropping NaNs).
f32u32 = min((f32u32 <= 0x7FFFFFFFu) ? f32u32 : 0u, 0x3FFFFFF8u);
uint denormalized =
((f32u32 & 0x7FFFFFu) | 0x800000u) >> min(113u - (f32u32 >> 23u), 24u);
uint f24u32 = (f32u32 < 0x38800000u) ? denormalized : (f32u32 + 0xC8000000u);
return ((f24u32 + 3u + ((f24u32 >> 3u) & 1u)) >> 3u) & 0xFFFFFFu;
if (round_to_nearest_even) {
f24u32 += 3u + ((f24u32 >> 3u) & 1u);
}
return (f24u32 >> 3u) & 0xFFFFFFu;
}
uint XeFloat20e4To32(uint f24u32, bool remap_to_0_to_0_5) {

View File

@ -126,7 +126,7 @@ float Float7e3To32(uint32_t f10) {
// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
// 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2).
uint32_t Float32To20e4(float f32) {
uint32_t Float32To20e4(float f32, bool round_to_nearest_even) {
if (!(f32 > 0.0f)) {
// Positive only, and not -0 or NaN.
return 0;
@ -145,7 +145,10 @@ uint32_t Float32To20e4(float f32) {
// Rebias the exponent to represent the value as a normalized 20e4.
f32u32 += 0xC8000000u;
}
return ((f32u32 + 3 + ((f32u32 >> 3) & 1)) >> 3) & 0xFFFFFF;
if (round_to_nearest_even) {
// Apply only the round-to-nearest-even bias before the low 3 bits are
// dropped: 3, plus the lowest bit that will be kept (bit 3), so that exact
// ties are rounded towards the even result. The value itself must not be
// added to itself here - that would double it instead of biasing it.
f32u32 += 3 + ((f32u32 >> 3) & 1);
}
return (f32u32 >> 3) & 0xFFFFFF;
}
float Float20e4To32(uint32_t f24) {

View File

@ -336,8 +336,8 @@ float Float7e3To32(uint32_t f10);
// Converts 24-bit unorm depth in the value (not clamping) to an IEEE-754 32-bit
// floating-point number.
// Converts an IEEE-754 32-bit floating-point number to Xenos floating-point
// depth, rounding to the nearest even.
uint32_t Float32To20e4(float f32);
// depth, rounding to the nearest even or towards zero.
uint32_t Float32To20e4(float f32, bool round_to_nearest_even);
// Converts Xenos floating-point depth in bits 0:23 (not clamping) to an
// IEEE-754 32-bit floating-point number.
float Float20e4To32(uint32_t f24);