Merge branch 'master' into vulkan
This commit is contained in:
commit
0d8bd0e0c6
|
@ -2239,18 +2239,13 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
|
||||||
// Get dynamic rasterizer state.
|
// Get dynamic rasterizer state.
|
||||||
uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x();
|
uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x();
|
||||||
uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y();
|
uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y();
|
||||||
RenderTargetCache::DepthFloat24Conversion depth_float24_conversion =
|
|
||||||
render_target_cache_->depth_float24_conversion();
|
|
||||||
draw_util::ViewportInfo viewport_info;
|
draw_util::ViewportInfo viewport_info;
|
||||||
draw_util::GetHostViewportInfo(
|
draw_util::GetHostViewportInfo(
|
||||||
regs, draw_resolution_scale_x, draw_resolution_scale_y, true,
|
regs, draw_resolution_scale_x, draw_resolution_scale_y, true,
|
||||||
D3D12_VIEWPORT_BOUNDS_MAX, D3D12_VIEWPORT_BOUNDS_MAX, false,
|
D3D12_VIEWPORT_BOUNDS_MAX, D3D12_VIEWPORT_BOUNDS_MAX, false,
|
||||||
normalized_depth_control,
|
normalized_depth_control,
|
||||||
host_render_targets_used &&
|
host_render_targets_used &&
|
||||||
(depth_float24_conversion ==
|
render_target_cache_->depth_float24_convert_in_pixel_shader(),
|
||||||
RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating ||
|
|
||||||
depth_float24_conversion ==
|
|
||||||
RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding),
|
|
||||||
host_render_targets_used, pixel_shader && pixel_shader->writes_depth(),
|
host_render_targets_used, pixel_shader && pixel_shader->writes_depth(),
|
||||||
viewport_info);
|
viewport_info);
|
||||||
draw_util::Scissor scissor;
|
draw_util::Scissor scissor;
|
||||||
|
|
|
@ -457,7 +457,9 @@ bool D3D12RenderTargetCache::Initialize() {
|
||||||
|
|
||||||
gamma_render_target_as_srgb_ = cvars::gamma_render_target_as_srgb;
|
gamma_render_target_as_srgb_ = cvars::gamma_render_target_as_srgb;
|
||||||
|
|
||||||
depth_float24_conversion_ = GetConfigDepthFloat24Conversion();
|
depth_float24_round_ = cvars::depth_float24_round;
|
||||||
|
depth_float24_convert_in_pixel_shader_ =
|
||||||
|
cvars::depth_float24_convert_in_pixel_shader;
|
||||||
|
|
||||||
// Check if 2x MSAA is supported or needs to be emulated with 4x MSAA
|
// Check if 2x MSAA is supported or needs to be emulated with 4x MSAA
|
||||||
// instead.
|
// instead.
|
||||||
|
@ -1013,8 +1015,9 @@ bool D3D12RenderTargetCache::Initialize() {
|
||||||
// Blending is done in linear space directly in shaders.
|
// Blending is done in linear space directly in shaders.
|
||||||
gamma_render_target_as_srgb_ = false;
|
gamma_render_target_as_srgb_ = false;
|
||||||
|
|
||||||
// Always true float24 depth.
|
// Always true float24 depth rounded to the nearest even.
|
||||||
depth_float24_conversion_ = DepthFloat24Conversion::kOnOutputRounding;
|
depth_float24_round_ = true;
|
||||||
|
depth_float24_convert_in_pixel_shader_ = true;
|
||||||
|
|
||||||
// Only ForcedSampleCount, which doesn't support 2x.
|
// Only ForcedSampleCount, which doesn't support 2x.
|
||||||
msaa_2x_supported_ = false;
|
msaa_2x_supported_ = false;
|
||||||
|
@ -2091,7 +2094,7 @@ RenderTargetCache::RenderTarget* D3D12RenderTargetCache::CreateRenderTarget(
|
||||||
bool D3D12RenderTargetCache::IsHostDepthEncodingDifferent(
|
bool D3D12RenderTargetCache::IsHostDepthEncodingDifferent(
|
||||||
xenos::DepthRenderTargetFormat format) const {
|
xenos::DepthRenderTargetFormat format) const {
|
||||||
if (format == xenos::DepthRenderTargetFormat::kD24FS8) {
|
if (format == xenos::DepthRenderTargetFormat::kD24FS8) {
|
||||||
return depth_float24_conversion_ == DepthFloat24Conversion::kOnCopy;
|
return !depth_float24_convert_in_pixel_shader_;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -3542,7 +3545,12 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
|
||||||
} break;
|
} break;
|
||||||
case xenos::DepthRenderTargetFormat::kD24FS8: {
|
case xenos::DepthRenderTargetFormat::kD24FS8: {
|
||||||
// Convert using r1.y as temporary.
|
// Convert using r1.y as temporary.
|
||||||
DxbcShaderTranslator::PreClampedDepthTo20e4(a, i, 3, i, 3, 1, 1,
|
// When converting the depth in pixel shaders, it's always exact,
|
||||||
|
// truncating not to insert additional rounding instructions.
|
||||||
|
DxbcShaderTranslator::PreClampedDepthTo20e4(
|
||||||
|
a, i, 3, i, 3, 1, 1,
|
||||||
|
!depth_float24_convert_in_pixel_shader() &&
|
||||||
|
depth_float24_round(),
|
||||||
true);
|
true);
|
||||||
} break;
|
} break;
|
||||||
}
|
}
|
||||||
|
@ -3729,7 +3737,12 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
|
||||||
} break;
|
} break;
|
||||||
case xenos::DepthRenderTargetFormat::kD24FS8: {
|
case xenos::DepthRenderTargetFormat::kD24FS8: {
|
||||||
// Convert using r1.y as temporary.
|
// Convert using r1.y as temporary.
|
||||||
DxbcShaderTranslator::PreClampedDepthTo20e4(a, 1, 3, 1, 3, 1, 1,
|
// When converting the depth in pixel shaders, it's always exact,
|
||||||
|
// truncating not to insert additional rounding instructions.
|
||||||
|
DxbcShaderTranslator::PreClampedDepthTo20e4(
|
||||||
|
a, 1, 3, 1, 3, 1, 1,
|
||||||
|
!depth_float24_convert_in_pixel_shader() &&
|
||||||
|
depth_float24_round(),
|
||||||
true);
|
true);
|
||||||
} break;
|
} break;
|
||||||
}
|
}
|
||||||
|
@ -4105,7 +4118,13 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
|
||||||
dxbc::Src::R(0, dxbc::Src::kYYYY));
|
dxbc::Src::R(0, dxbc::Src::kYYYY));
|
||||||
} break;
|
} break;
|
||||||
case xenos::DepthRenderTargetFormat::kD24FS8: {
|
case xenos::DepthRenderTargetFormat::kD24FS8: {
|
||||||
DxbcShaderTranslator::PreClampedDepthTo20e4(a, 0, 1, 0, 0, 0, 2,
|
// When converting the depth in pixel shaders, it's always
|
||||||
|
// exact, truncating not to insert additional rounding
|
||||||
|
// instructions.
|
||||||
|
DxbcShaderTranslator::PreClampedDepthTo20e4(
|
||||||
|
a, 0, 1, 0, 0, 0, 2,
|
||||||
|
!depth_float24_convert_in_pixel_shader() &&
|
||||||
|
depth_float24_round(),
|
||||||
true);
|
true);
|
||||||
} break;
|
} break;
|
||||||
}
|
}
|
||||||
|
@ -6167,7 +6186,12 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline(
|
||||||
case xenos::DepthRenderTargetFormat::kD24FS8:
|
case xenos::DepthRenderTargetFormat::kD24FS8:
|
||||||
// Convert to [0, 2) float24 from [0, 1) float32, using r0.x as
|
// Convert to [0, 2) float24 from [0, 1) float32, using r0.x as
|
||||||
// temporary.
|
// temporary.
|
||||||
DxbcShaderTranslator::PreClampedDepthTo20e4(a, 1, 0, 1, 0, 0, 0, true);
|
// When converting the depth in pixel shaders, it's always exact,
|
||||||
|
// truncating not to insert additional rounding instructions.
|
||||||
|
DxbcShaderTranslator::PreClampedDepthTo20e4(
|
||||||
|
a, 1, 0, 1, 0, 0, 0,
|
||||||
|
!depth_float24_convert_in_pixel_shader() && depth_float24_round(),
|
||||||
|
true);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
// Combine 24-bit depth and stencil into r1.x.
|
// Combine 24-bit depth and stencil into r1.x.
|
||||||
|
|
|
@ -107,8 +107,9 @@ class D3D12RenderTargetCache final : public RenderTargetCache {
|
||||||
!cvars::snorm16_render_target_full_range;
|
!cvars::snorm16_render_target_full_range;
|
||||||
}
|
}
|
||||||
|
|
||||||
DepthFloat24Conversion depth_float24_conversion() const {
|
bool depth_float24_round() const { return depth_float24_round_; }
|
||||||
return depth_float24_conversion_;
|
bool depth_float24_convert_in_pixel_shader() const {
|
||||||
|
return depth_float24_convert_in_pixel_shader_;
|
||||||
}
|
}
|
||||||
|
|
||||||
DXGI_FORMAT GetColorResourceDXGIFormat(
|
DXGI_FORMAT GetColorResourceDXGIFormat(
|
||||||
|
@ -720,8 +721,8 @@ class D3D12RenderTargetCache final : public RenderTargetCache {
|
||||||
|
|
||||||
bool gamma_render_target_as_srgb_ = false;
|
bool gamma_render_target_as_srgb_ = false;
|
||||||
|
|
||||||
DepthFloat24Conversion depth_float24_conversion_ =
|
bool depth_float24_round_ = false;
|
||||||
DepthFloat24Conversion::kOnCopy;
|
bool depth_float24_convert_in_pixel_shader_ = false;
|
||||||
|
|
||||||
bool msaa_2x_supported_ = false;
|
bool msaa_2x_supported_ = false;
|
||||||
|
|
||||||
|
|
|
@ -882,20 +882,14 @@ PipelineCache::GetCurrentPixelShaderModification(
|
||||||
RenderTargetCache::Path::kHostRenderTargets) {
|
RenderTargetCache::Path::kHostRenderTargets) {
|
||||||
using DepthStencilMode =
|
using DepthStencilMode =
|
||||||
DxbcShaderTranslator::Modification::DepthStencilMode;
|
DxbcShaderTranslator::Modification::DepthStencilMode;
|
||||||
RenderTargetCache::DepthFloat24Conversion depth_float24_conversion =
|
if (render_target_cache_.depth_float24_convert_in_pixel_shader() &&
|
||||||
render_target_cache_.depth_float24_conversion();
|
|
||||||
if ((depth_float24_conversion ==
|
|
||||||
RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating ||
|
|
||||||
depth_float24_conversion ==
|
|
||||||
RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding) &&
|
|
||||||
normalized_depth_control.z_enable &&
|
normalized_depth_control.z_enable &&
|
||||||
regs.Get<reg::RB_DEPTH_INFO>().depth_format ==
|
regs.Get<reg::RB_DEPTH_INFO>().depth_format ==
|
||||||
xenos::DepthRenderTargetFormat::kD24FS8) {
|
xenos::DepthRenderTargetFormat::kD24FS8) {
|
||||||
modification.pixel.depth_stencil_mode =
|
modification.pixel.depth_stencil_mode =
|
||||||
depth_float24_conversion ==
|
render_target_cache_.depth_float24_round()
|
||||||
RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating
|
? DepthStencilMode::kFloat24Rounding
|
||||||
? DepthStencilMode::kFloat24Truncating
|
: DepthStencilMode::kFloat24Truncating;
|
||||||
: DepthStencilMode::kFloat24Rounding;
|
|
||||||
} else {
|
} else {
|
||||||
if (shader.implicit_early_z_write_allowed() &&
|
if (shader.implicit_early_z_write_allowed() &&
|
||||||
(!shader.writes_color_target(0) ||
|
(!shader.writes_color_target(0) ||
|
||||||
|
@ -2917,20 +2911,16 @@ ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline(
|
||||||
state_desc.PS.pShaderBytecode = depth_only_pixel_shader_.data();
|
state_desc.PS.pShaderBytecode = depth_only_pixel_shader_.data();
|
||||||
state_desc.PS.BytecodeLength = depth_only_pixel_shader_.size();
|
state_desc.PS.BytecodeLength = depth_only_pixel_shader_.size();
|
||||||
} else {
|
} else {
|
||||||
if ((description.depth_func != xenos::CompareFunction::kAlways ||
|
if (render_target_cache_.depth_float24_convert_in_pixel_shader() &&
|
||||||
|
(description.depth_func != xenos::CompareFunction::kAlways ||
|
||||||
description.depth_write) &&
|
description.depth_write) &&
|
||||||
description.depth_format == xenos::DepthRenderTargetFormat::kD24FS8) {
|
description.depth_format == xenos::DepthRenderTargetFormat::kD24FS8) {
|
||||||
switch (render_target_cache_.depth_float24_conversion()) {
|
if (render_target_cache_.depth_float24_round()) {
|
||||||
case RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating:
|
|
||||||
state_desc.PS.pShaderBytecode = shaders::float24_truncate_ps;
|
|
||||||
state_desc.PS.BytecodeLength = sizeof(shaders::float24_truncate_ps);
|
|
||||||
break;
|
|
||||||
case RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding:
|
|
||||||
state_desc.PS.pShaderBytecode = shaders::float24_round_ps;
|
state_desc.PS.pShaderBytecode = shaders::float24_round_ps;
|
||||||
state_desc.PS.BytecodeLength = sizeof(shaders::float24_round_ps);
|
state_desc.PS.BytecodeLength = sizeof(shaders::float24_round_ps);
|
||||||
break;
|
} else {
|
||||||
default:
|
state_desc.PS.pShaderBytecode = shaders::float24_truncate_ps;
|
||||||
break;
|
state_desc.PS.BytecodeLength = sizeof(shaders::float24_truncate_ps);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -532,8 +532,10 @@ void GetHostViewportInfo(const RegisterFile& regs,
|
||||||
// interpolated Z instead if conversion can't be done exactly, without
|
// interpolated Z instead if conversion can't be done exactly, without
|
||||||
// modifying clipping bounds by adjusting Z in vertex shaders, as that
|
// modifying clipping bounds by adjusting Z in vertex shaders, as that
|
||||||
// may cause polygons placed explicitly at Z = 0 or Z = W to be clipped.
|
// may cause polygons placed explicitly at Z = 0 or Z = W to be clipped.
|
||||||
z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min));
|
// Rounding the bounds to the nearest even regardless of the depth
|
||||||
z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max));
|
// rounding mode not to add even more error by truncating twice.
|
||||||
|
z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min, true));
|
||||||
|
z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max, true));
|
||||||
}
|
}
|
||||||
if (full_float24_in_0_to_1) {
|
if (full_float24_in_0_to_1) {
|
||||||
// Remap the full [0...2) float24 range to [0...1) to support data round-trip
|
// Remap the full [0...2) float24 range to [0...1) to support data round-trip
|
||||||
|
|
|
@ -533,13 +533,14 @@ class DxbcShaderTranslator : public ShaderTranslator {
|
||||||
uint32_t temp2_temp_component);
|
uint32_t temp2_temp_component);
|
||||||
// Converts the depth value externally clamped to the representable [0, 2)
|
// Converts the depth value externally clamped to the representable [0, 2)
|
||||||
// range to 20e4 floating point, with zeros in bits 24:31, rounding to the
|
// range to 20e4 floating point, with zeros in bits 24:31, rounding to the
|
||||||
// nearest even. Source and destination may be the same, temporary must be
|
// nearest even or towards zero. Source and destination may be the same,
|
||||||
// different than both. If remap_from_0_to_0_5 is true, it's assumed that
|
// temporary must be different than both. If remap_from_0_to_0_5 is true, it's
|
||||||
// 0...1 is pre-remapped to 0...0.5 in the input.
|
// assumed that 0...1 is pre-remapped to 0...0.5 in the input.
|
||||||
static void PreClampedDepthTo20e4(
|
static void PreClampedDepthTo20e4(
|
||||||
dxbc::Assembler& a, uint32_t f24_temp, uint32_t f24_temp_component,
|
dxbc::Assembler& a, uint32_t f24_temp, uint32_t f24_temp_component,
|
||||||
uint32_t f32_temp, uint32_t f32_temp_component, uint32_t temp_temp,
|
uint32_t f32_temp, uint32_t f32_temp_component, uint32_t temp_temp,
|
||||||
uint32_t temp_temp_component, bool remap_from_0_to_0_5);
|
uint32_t temp_temp_component, bool round_to_nearest_even,
|
||||||
|
bool remap_from_0_to_0_5);
|
||||||
// Converts the 20e4 number in bits [f24_shift, f24_shift + 10) to a 32-bit
|
// Converts the 20e4 number in bits [f24_shift, f24_shift + 10) to a 32-bit
|
||||||
// float. Two temporaries must be different, but one can be the same as the
|
// float. Two temporaries must be different, but one can be the same as the
|
||||||
// source. The destination may be anything writable. If remap_to_0_to_0_5 is
|
// source. The destination may be anything writable. If remap_to_0_to_0_5 is
|
||||||
|
|
|
@ -1921,7 +1921,7 @@ void DxbcShaderTranslator::CompletePixelShader_DSV_DepthTo24Bit() {
|
||||||
} else {
|
} else {
|
||||||
// Properly convert to 20e4, with rounding to the nearest even (the bias was
|
// Properly convert to 20e4, with rounding to the nearest even (the bias was
|
||||||
// pre-applied by multiplying by 2), then convert back restoring the bias.
|
// pre-applied by multiplying by 2), then convert back restoring the bias.
|
||||||
PreClampedDepthTo20e4(a_, temp, 0, temp, 0, temp, 1, false);
|
PreClampedDepthTo20e4(a_, temp, 0, temp, 0, temp, 1, true, false);
|
||||||
Depth20e4To32(a_, dxbc::Dest::ODepth(), temp, 0, 0, temp, 0, temp, 1, true);
|
Depth20e4To32(a_, dxbc::Dest::ODepth(), temp, 0, 0, temp, 0, temp, 1, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3217,7 +3217,8 @@ void DxbcShaderTranslator::Float7e3To32(
|
||||||
void DxbcShaderTranslator::PreClampedDepthTo20e4(
|
void DxbcShaderTranslator::PreClampedDepthTo20e4(
|
||||||
dxbc::Assembler& a, uint32_t f24_temp, uint32_t f24_temp_component,
|
dxbc::Assembler& a, uint32_t f24_temp, uint32_t f24_temp_component,
|
||||||
uint32_t f32_temp, uint32_t f32_temp_component, uint32_t temp_temp,
|
uint32_t f32_temp, uint32_t f32_temp_component, uint32_t temp_temp,
|
||||||
uint32_t temp_temp_component, bool remap_from_0_to_0_5) {
|
uint32_t temp_temp_component, bool round_to_nearest_even,
|
||||||
|
bool remap_from_0_to_0_5) {
|
||||||
assert_true(temp_temp != f24_temp ||
|
assert_true(temp_temp != f24_temp ||
|
||||||
temp_temp_component != f24_temp_component);
|
temp_temp_component != f24_temp_component);
|
||||||
assert_true(temp_temp != f32_temp ||
|
assert_true(temp_temp != f32_temp ||
|
||||||
|
@ -3268,13 +3269,18 @@ void DxbcShaderTranslator::PreClampedDepthTo20e4(
|
||||||
// Close the denormal check.
|
// Close the denormal check.
|
||||||
a.OpEndIf();
|
a.OpEndIf();
|
||||||
// Build the 20e4 number.
|
// Build the 20e4 number.
|
||||||
|
if (round_to_nearest_even) {
|
||||||
// temp = (biased_f32 >> 3) & 1
|
// temp = (biased_f32 >> 3) & 1
|
||||||
a.OpUBFE(temp_dest, dxbc::Src::LU(1), dxbc::Src::LU(3), f24_src);
|
a.OpUBFE(temp_dest, dxbc::Src::LU(1), dxbc::Src::LU(3), f24_src);
|
||||||
// f24 = biased_f32 + 3
|
// f24 = biased_f32 + 3
|
||||||
a.OpIAdd(f24_dest, f24_src, dxbc::Src::LU(3));
|
a.OpIAdd(f24_dest, f24_src, dxbc::Src::LU(3));
|
||||||
// f24 = biased_f32 + 3 + ((biased_f32 >> 3) & 1)
|
// f24 = biased_f32 + 3 + ((biased_f32 >> 3) & 1)
|
||||||
a.OpIAdd(f24_dest, f24_src, temp_src);
|
a.OpIAdd(f24_dest, f24_src, temp_src);
|
||||||
|
}
|
||||||
|
// For rounding to the nearest even:
|
||||||
// f24 = ((biased_f32 + 3 + ((biased_f32 >> 3) & 1)) >> 3) & 0xFFFFFF
|
// f24 = ((biased_f32 + 3 + ((biased_f32 >> 3) & 1)) >> 3) & 0xFFFFFF
|
||||||
|
// For rounding towards zero:
|
||||||
|
// f24 = (biased_f32 >> 3) & 0xFFFFFF
|
||||||
a.OpUBFE(f24_dest, dxbc::Src::LU(24), dxbc::Src::LU(3), f24_src);
|
a.OpUBFE(f24_dest, dxbc::Src::LU(24), dxbc::Src::LU(3), f24_src);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3377,7 +3383,7 @@ void DxbcShaderTranslator::ROV_DepthTo24Bit(uint32_t d24_temp,
|
||||||
// 20e4 conversion.
|
// 20e4 conversion.
|
||||||
PreClampedDepthTo20e4(a_, d24_temp, d24_temp_component, d32_temp,
|
PreClampedDepthTo20e4(a_, d24_temp, d24_temp_component, d32_temp,
|
||||||
d32_temp_component, temp_temp, temp_temp_component,
|
d32_temp_component, temp_temp, temp_temp_component,
|
||||||
false);
|
true, false);
|
||||||
}
|
}
|
||||||
a_.OpElse();
|
a_.OpElse();
|
||||||
{
|
{
|
||||||
|
|
|
@ -39,42 +39,109 @@ DEFINE_bool(
|
||||||
"reduce bandwidth usage during transfers as the previous depth won't need "
|
"reduce bandwidth usage during transfers as the previous depth won't need "
|
||||||
"to be read.",
|
"to be read.",
|
||||||
"GPU");
|
"GPU");
|
||||||
// The round trip is done, in particular, in 545407F2.
|
// Lossless round trip: 545407F2.
|
||||||
DEFINE_string(
|
// Lossy round trip with the "greater or equal" test afterwards: 4D530919.
|
||||||
depth_float24_conversion, "",
|
// Lossy round trip with the "equal" test afterwards: 535107F5, 565507EF.
|
||||||
"Method for converting 32-bit Z values to 20e4 floating point when using "
|
DEFINE_bool(
|
||||||
"host depth buffers without native 20e4 support (when not using rasterizer-"
|
depth_float24_round, false,
|
||||||
"ordered views / fragment shader interlocks to perform depth testing "
|
"Whether to round to the nearest even, rather than truncating (rounding "
|
||||||
"manually).\n"
|
"towards zero), the depth when converting it to 24-bit floating-point "
|
||||||
"Use: [any, on_copy, truncate, round]\n"
|
"(20e4) from the host precision (32-bit floating point) when using a host "
|
||||||
" on_copy:\n"
|
"depth buffer.\n"
|
||||||
" Do depth testing at host precision, converting when copying between "
|
"false:\n"
|
||||||
"color and depth buffers (or between depth buffers of different formats) "
|
" Recommended.\n"
|
||||||
"to support reinterpretation, but keeps the last host depth buffer used "
|
" The conversion may move the depth values farther away from the camera.\n"
|
||||||
"for each EDRAM range and reloads the host precision value if it's still "
|
" Without depth_float24_convert_in_pixel_shader:\n"
|
||||||
"up to date after the EDRAM range was used with a different pixel format.\n"
|
" The \"greater or equal\" depth test function continues to work fine if "
|
||||||
" + Highest performance, allows early depth test and writing.\n"
|
"the full host precision depth data is lost, it's still possible to draw "
|
||||||
" + Host MSAA is possible with pixel-rate shading where supported.\n"
|
"another pass of the same geometry with it.\n"
|
||||||
" - EDRAM > RAM > EDRAM depth buffer round trip done in certain games "
|
" (See the description of depth_float24_convert_in_pixel_shader for more "
|
||||||
"destroys precision irreparably, causing artifacts if another rendering "
|
"information about full precision depth data loss.)\n"
|
||||||
"pass is done after the EDRAM reupload.\n"
|
" With depth_float24_convert_in_pixel_shader:\n"
|
||||||
" truncate:\n"
|
" Faster - the pixel shader for hidden surfaces may still be skipped "
|
||||||
" Convert to 20e4 directly in pixel shaders, always rounding down.\n"
|
"(using conservative depth output).\n"
|
||||||
" + Average performance, conservative early depth test is possible.\n"
|
"true:\n"
|
||||||
" + No precision loss when anything changes in the storage of the depth "
|
" Only for special cases of issues caused by minor 32-bit floating-point "
|
||||||
"buffer, EDRAM > RAM > EDRAM copying preserves precision.\n"
|
"rounding errors, for instance, when the game tries to draw something at "
|
||||||
" - Rounding mode is incorrect, sometimes giving results smaller than "
|
"the camera plane by setting Z of the vertex position to W.\n"
|
||||||
"they should be - may cause inaccuracy especially in edge cases when the "
|
" The conversion may move the depth values closer or farther.\n"
|
||||||
"game wants to write an exact value.\n"
|
" Using the same rounding mode as in the Direct3D 9 reference rasterizer.\n"
|
||||||
" - Host MSAA is only possible at SSAA speed, with per-sample shading.\n"
|
" Without depth_float24_convert_in_pixel_shader:\n"
|
||||||
" round:\n"
|
" Not possible to recover from a full host precision depth data loss - in "
|
||||||
" Convert to 20e4 directly in pixel shaders, correctly rounding to the "
|
"subsequent passes of rendering the same geometry, half of the samples "
|
||||||
"nearest even.\n"
|
"will be failing the depth test with the \"greater or equal\" depth test "
|
||||||
" + Highest accuracy.\n"
|
"function.\n"
|
||||||
" - Significantly limited performance, early depth test is not possible.\n"
|
" With depth_float24_convert_in_pixel_shader:\n"
|
||||||
" - Host MSAA is only possible at SSAA speed, with per-sample shading.\n"
|
" Slower - depth rejection before the pixel shader is not possible.\n"
|
||||||
" Any other value:\n"
|
"When the depth buffer is emulated in software (via the fragment shader "
|
||||||
" Choose what is considered the most optimal (currently \"on_copy\").",
|
"interlock / rasterizer-ordered view), this is ignored, and rounding to "
|
||||||
|
"the nearest even is always done.",
|
||||||
|
"GPU");
|
||||||
|
// With MSAA, when converting the depth in pixel shaders, they must run at
|
||||||
|
// sample frequency - otherwise, if the depth is the same for the entire pixel,
|
||||||
|
// intersections of polygons cannot be antialiased.
|
||||||
|
//
|
||||||
|
// Important usage note: When using this mode, bounds of the fixed-function
|
||||||
|
// viewport must be converted to and back from float24 too (preferably using
|
||||||
|
// rounding to the nearest even regardless of whether truncation was requested
|
||||||
|
// for the values, to reduce the error already caused by truncation rather than
|
||||||
|
// to amplify it). This ensures that clamping to the viewport bounds, which
|
||||||
|
// happens after the pixel shader even if it overwrites the resulting depth, is
|
||||||
|
// never done to a value not representable as float24 (for example, if the
|
||||||
|
// minimum Z is a number too small to be represented as float24, but not zero,
|
||||||
|
// it won't be possible to write what should become 0x000000 to the depth
|
||||||
|
// buffer). Note that this may add some error to the depth values from the
|
||||||
|
// rasterizer; however, modifying Z in the vertex shader to make interpolated
|
||||||
|
// depth values would cause clipping to be done to different bounds, which may
|
||||||
|
// be more undesirable, especially in cases when Z is explicitly set to a value
|
||||||
|
// like 0 or W (in such cases, the adjusted polygon may go outside 0...W in clip
|
||||||
|
// space and disappear).
|
||||||
|
//
|
||||||
|
// If false, doing the depth test at the host precision, converting to 20e4 to
|
||||||
|
// support reinterpretation, but keeping track of both the last color (or
|
||||||
|
// non-20e4 depth) value (let's call it stored_f24) and the last host depth
|
||||||
|
// value (stored_host) for each EDRAM pixel, reloading the last host depth value
|
||||||
|
// if stored_f24 == to_f24(stored_host) (otherwise it was overwritten by
|
||||||
|
// something else, like clearing, or an actually used color buffer; this is
|
||||||
|
// inexact though, and will incorrectly load pixels that were overwritten by
|
||||||
|
// something else in the EDRAM, but turned out to have the same value on the
|
||||||
|
// guest as before - an outdated host-precision value will be loaded in these
|
||||||
|
// cases instead).
|
||||||
|
DEFINE_bool(
|
||||||
|
depth_float24_convert_in_pixel_shader, false,
|
||||||
|
"Whether to convert the depth values to 24-bit floating-point (20e4) from "
|
||||||
|
"the host precision (32-bit floating point) directly in the pixel shaders "
|
||||||
|
"of guest draws when using a host depth buffer.\n"
|
||||||
|
"This prevents visual artifacts (interleaved stripes of parts of surfaces "
|
||||||
|
"rendered and parts not rendered, having either the same width in case of "
|
||||||
|
"the \"greater or equal\" depth test function, or the former being much "
|
||||||
|
"thinner than the latter with the \"equal\" function) if the full host "
|
||||||
|
"precision depth data is lost.\n"
|
||||||
|
"This issue may happen if the game reloads the depth data previously "
|
||||||
|
"evicted from the EDRAM to the RAM back to the EDRAM, but the EDRAM region "
|
||||||
|
"that previously contained that depth buffer was overwritten by another "
|
||||||
|
"depth buffer, or the game loads it to a different location in the EDRAM "
|
||||||
|
"than it was previously placed at, thus Xenia is unable to restore the "
|
||||||
|
"depth data with the original precision, and instead falls back to "
|
||||||
|
"converting the lower-precision values, so in subsequent rendering passes "
|
||||||
|
"for the same geometry, the actual depth values of the surfaces don't "
|
||||||
|
"match those stored in the depth buffer anymore.\n"
|
||||||
|
"This is a costly option because it makes the GPU unable to use depth "
|
||||||
|
"buffer compression, and also with MSAA, forces the pixel shader to run "
|
||||||
|
"for every subpixel sample rather than for the entire pixel, making pixel "
|
||||||
|
"shading 2 or 4 times heavier depending on the MSAA sample count.\n"
|
||||||
|
"The rounding direction is controlled by the depth_float24_round "
|
||||||
|
"configuration variable.\n"
|
||||||
|
"Note that with depth_float24_round = true, this becomes even more costly "
|
||||||
|
"because pixel shaders must be executed regardless of whether the surface "
|
||||||
|
"is behind the previously drawn surfaces. With depth_float24_round = "
|
||||||
|
"false, conservative depth output is used, however, so depth rejection "
|
||||||
|
"before the pixel shader may still work.\n"
|
||||||
|
"If sample-rate shading is not supported by the host GPU, the conversion "
|
||||||
|
"in the pixel shader is done only when MSAA is not used.\n"
|
||||||
|
"When the depth buffer is emulated in software (via the fragment shader "
|
||||||
|
"interlock / rasterizer-ordered view), this is ignored because 24-bit "
|
||||||
|
"depth is always used directly.",
|
||||||
"GPU");
|
"GPU");
|
||||||
DEFINE_bool(
|
DEFINE_bool(
|
||||||
draw_resolution_scaled_texture_offsets, true,
|
draw_resolution_scaled_texture_offsets, true,
|
||||||
|
@ -790,17 +857,6 @@ uint32_t RenderTargetCache::GetLastUpdateBoundRenderTargets(
|
||||||
return rts_used;
|
return rts_used;
|
||||||
}
|
}
|
||||||
|
|
||||||
RenderTargetCache::DepthFloat24Conversion
|
|
||||||
RenderTargetCache::GetConfigDepthFloat24Conversion() {
|
|
||||||
if (cvars::depth_float24_conversion == "truncate") {
|
|
||||||
return DepthFloat24Conversion::kOnOutputTruncating;
|
|
||||||
}
|
|
||||||
if (cvars::depth_float24_conversion == "round") {
|
|
||||||
return DepthFloat24Conversion::kOnOutputRounding;
|
|
||||||
}
|
|
||||||
return DepthFloat24Conversion::kOnCopy;
|
|
||||||
}
|
|
||||||
|
|
||||||
uint32_t RenderTargetCache::GetRenderTargetHeight(
|
uint32_t RenderTargetCache::GetRenderTargetHeight(
|
||||||
uint32_t pitch_tiles_at_32bpp, xenos::MsaaSamples msaa_samples) const {
|
uint32_t pitch_tiles_at_32bpp, xenos::MsaaSamples msaa_samples) const {
|
||||||
if (!pitch_tiles_at_32bpp) {
|
if (!pitch_tiles_at_32bpp) {
|
||||||
|
|
|
@ -29,6 +29,8 @@
|
||||||
#include "xenia/gpu/xenos.h"
|
#include "xenia/gpu/xenos.h"
|
||||||
|
|
||||||
DECLARE_bool(depth_transfer_not_equal_test);
|
DECLARE_bool(depth_transfer_not_equal_test);
|
||||||
|
DECLARE_bool(depth_float24_round);
|
||||||
|
DECLARE_bool(depth_float24_convert_in_pixel_shader);
|
||||||
DECLARE_bool(draw_resolution_scaled_texture_offsets);
|
DECLARE_bool(draw_resolution_scaled_texture_offsets);
|
||||||
DECLARE_bool(gamma_render_target_as_srgb);
|
DECLARE_bool(gamma_render_target_as_srgb);
|
||||||
DECLARE_bool(native_2x_msaa);
|
DECLARE_bool(native_2x_msaa);
|
||||||
|
@ -89,60 +91,6 @@ class RenderTargetCache {
|
||||||
kPixelShaderInterlock,
|
kPixelShaderInterlock,
|
||||||
};
|
};
|
||||||
|
|
||||||
enum class DepthFloat24Conversion {
|
|
||||||
// Doing depth test at the host precision, converting to 20e4 to support
|
|
||||||
// reinterpretation, but keeping track of both the last color (or non-20e4
|
|
||||||
// depth) value (let's call it stored_f24) and the last host depth value
|
|
||||||
// (stored_host) for each EDRAM pixel, reloading the last host depth value
|
|
||||||
// if stored_f24 == to_f24(stored_host) (otherwise it was overwritten by
|
|
||||||
// something else, like clearing, or an actually used color buffer; this is
|
|
||||||
// inexact though, and will incorrectly load pixels that were overwritten by
|
|
||||||
// something else in the EDRAM, but turned out to have the same value on the
|
|
||||||
// guest as before - an outdated host-precision value will be loaded in
|
|
||||||
// these cases instead).
|
|
||||||
//
|
|
||||||
// EDRAM > RAM, then reusing the EDRAM region for something else > EDRAM
|
|
||||||
// round trip destroys precision beyond repair.
|
|
||||||
//
|
|
||||||
// Full host early Z and MSAA with pixel-rate shading are supported.
|
|
||||||
kOnCopy,
|
|
||||||
// Converting the depth to the closest host value representable exactly as a
|
|
||||||
// 20e4 float in pixel shaders, to support invariance in cases when the
|
|
||||||
// guest reuploads a previously resolved depth buffer to the EDRAM, rounding
|
|
||||||
// towards zero (which contradicts the rounding used by the Direct3D 9
|
|
||||||
// reference rasterizer, but allows less-than-or-equal pixel shader depth
|
|
||||||
// output to be used to preserve most of early Z culling when the game is
|
|
||||||
// using reversed depth, which is the usual way of doing depth testing on
|
|
||||||
// the Xbox 360 and of utilizing the advantages of a floating-point
|
|
||||||
// encoding).
|
|
||||||
//
|
|
||||||
// With MSAA, pixel shaders must run at sample frequency - otherwise, if the
|
|
||||||
// depth is the same for the entire pixel, intersections of polygons cannot
|
|
||||||
// be antialiased.
|
|
||||||
//
|
|
||||||
// Important usage note: When using this mode, bounds of the fixed-function
|
|
||||||
// viewport must be converted to and back from float24 too (preferably using
|
|
||||||
// correct rounding to the nearest even, to reduce the error already caused
|
|
||||||
// by truncation rather than to amplify it). This ensures that clamping to
|
|
||||||
// the viewport bounds, which happens after the pixel shader even if it
|
|
||||||
// overwrites the resulting depth, is never done to a value not
|
|
||||||
// representable as float24 (for example, if the minimum Z is a number too
|
|
||||||
// small to be represented as float24, but not zero, it won't be possible to
|
|
||||||
// write what should become 0x000000 to the depth buffer). Note that this
|
|
||||||
// may add some error to the depth values from the rasterizer; however,
|
|
||||||
// modifying Z in the vertex shader to make interpolated depth values would
|
|
||||||
// cause clipping to be done to different bounds, which may be more
|
|
||||||
// undesirable, especially in cases when Z is explicitly set to a value like
|
|
||||||
// 0 or W (in such cases, the adjusted polygon may go outside 0...W in clip
|
|
||||||
// space and disappear).
|
|
||||||
kOnOutputTruncating,
|
|
||||||
// Similar to kOnOutputTruncating, but rounding to the nearest even, more
|
|
||||||
// correctly, however, because the resulting depth can be bigger than the
|
|
||||||
// original host value, early depth testing can't be used at all. Same
|
|
||||||
// viewport usage rules apply.
|
|
||||||
kOnOutputRounding,
|
|
||||||
};
|
|
||||||
|
|
||||||
// Useful host-specific values.
|
// Useful host-specific values.
|
||||||
// sRGB conversion from the Direct3D 11.3 functional specification.
|
// sRGB conversion from the Direct3D 11.3 functional specification.
|
||||||
static constexpr float kSrgbToLinearDenominator1 = 12.92f;
|
static constexpr float kSrgbToLinearDenominator1 = 12.92f;
|
||||||
|
@ -512,8 +460,6 @@ class RenderTargetCache {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
static DepthFloat24Conversion GetConfigDepthFloat24Conversion();
|
|
||||||
|
|
||||||
virtual uint32_t GetMaxRenderTargetWidth() const = 0;
|
virtual uint32_t GetMaxRenderTargetWidth() const = 0;
|
||||||
virtual uint32_t GetMaxRenderTargetHeight() const = 0;
|
virtual uint32_t GetMaxRenderTargetHeight() const = 0;
|
||||||
|
|
||||||
|
|
|
@ -53,9 +53,9 @@ ushr [precise(y)] r0.y, r0.y, r0.z
|
||||||
ult [precise(z)] r0.z, r0.x, l(0x38800000)
|
ult [precise(z)] r0.z, r0.x, l(0x38800000)
|
||||||
iadd [precise(x)] r0.x, r0.x, l(0xc8000000)
|
iadd [precise(x)] r0.x, r0.x, l(0xc8000000)
|
||||||
movc [precise(x)] r0.x, r0.z, r0.y, r0.x
|
movc [precise(x)] r0.x, r0.z, r0.y, r0.x
|
||||||
iadd [precise(y)] r0.y, r0.x, l(3)
|
ubfe [precise(y)] r0.y, l(1), l(3), r0.x
|
||||||
ubfe [precise(x)] r0.x, l(1), l(3), r0.x
|
iadd [precise(x)] r0.x, r0.y, r0.x
|
||||||
iadd [precise(x)] r0.x, r0.x, r0.y
|
iadd [precise(x)] r0.x, r0.x, l(3)
|
||||||
ubfe [precise(xyz)] r0.xyz, l(24, 20, 4, 0), l(3, 3, 23, 0), r0.xxxx
|
ubfe [precise(xyz)] r0.xyz, l(24, 20, 4, 0), l(3, 3, 23, 0), r0.xxxx
|
||||||
firstbit_hi [precise(w)] r0.w, r0.y
|
firstbit_hi [precise(w)] r0.w, r0.y
|
||||||
iadd [precise(w)] r0.w, r0.w, l(-11)
|
iadd [precise(w)] r0.w, r0.w, l(-11)
|
||||||
|
@ -76,10 +76,10 @@ ret
|
||||||
|
|
||||||
const BYTE float24_round_ps[] =
|
const BYTE float24_round_ps[] =
|
||||||
{
|
{
|
||||||
68, 88, 66, 67, 229, 54,
|
68, 88, 66, 67, 110, 79,
|
||||||
46, 1, 194, 31, 164, 202,
|
84, 202, 151, 165, 237, 180,
|
||||||
193, 71, 175, 129, 44, 52,
|
64, 17, 0, 132, 236, 126,
|
||||||
218, 154, 1, 0, 0, 0,
|
142, 105, 1, 0, 0, 0,
|
||||||
8, 7, 0, 0, 5, 0,
|
8, 7, 0, 0, 5, 0,
|
||||||
0, 0, 52, 0, 0, 0,
|
0, 0, 52, 0, 0, 0,
|
||||||
160, 0, 0, 0, 120, 2,
|
160, 0, 0, 0, 120, 2,
|
||||||
|
@ -259,22 +259,22 @@ const BYTE float24_round_ps[] =
|
||||||
0, 0, 0, 0, 26, 0,
|
0, 0, 0, 0, 26, 0,
|
||||||
16, 0, 0, 0, 0, 0,
|
16, 0, 0, 0, 0, 0,
|
||||||
10, 0, 16, 0, 0, 0,
|
10, 0, 16, 0, 0, 0,
|
||||||
0, 0, 30, 0, 16, 7,
|
0, 0, 138, 0, 16, 9,
|
||||||
34, 0, 16, 0, 0, 0,
|
34, 0, 16, 0, 0, 0,
|
||||||
0, 0, 10, 0, 16, 0,
|
|
||||||
0, 0, 0, 0, 1, 64,
|
|
||||||
0, 0, 3, 0, 0, 0,
|
|
||||||
138, 0, 8, 9, 18, 0,
|
|
||||||
16, 0, 0, 0, 0, 0,
|
|
||||||
1, 64, 0, 0, 1, 0,
|
|
||||||
0, 0, 1, 64, 0, 0,
|
0, 0, 1, 64, 0, 0,
|
||||||
3, 0, 0, 0, 10, 0,
|
1, 0, 0, 0, 1, 64,
|
||||||
|
0, 0, 3, 0, 0, 0,
|
||||||
|
10, 0, 16, 0, 0, 0,
|
||||||
|
0, 0, 30, 0, 8, 7,
|
||||||
|
18, 0, 16, 0, 0, 0,
|
||||||
|
0, 0, 26, 0, 16, 0,
|
||||||
|
0, 0, 0, 0, 10, 0,
|
||||||
16, 0, 0, 0, 0, 0,
|
16, 0, 0, 0, 0, 0,
|
||||||
30, 0, 8, 7, 18, 0,
|
30, 0, 8, 7, 18, 0,
|
||||||
16, 0, 0, 0, 0, 0,
|
16, 0, 0, 0, 0, 0,
|
||||||
10, 0, 16, 0, 0, 0,
|
10, 0, 16, 0, 0, 0,
|
||||||
0, 0, 26, 0, 16, 0,
|
0, 0, 1, 64, 0, 0,
|
||||||
0, 0, 0, 0, 138, 0,
|
3, 0, 0, 0, 138, 0,
|
||||||
56, 15, 114, 0, 16, 0,
|
56, 15, 114, 0, 16, 0,
|
||||||
0, 0, 0, 0, 2, 64,
|
0, 0, 0, 0, 2, 64,
|
||||||
0, 0, 24, 0, 0, 0,
|
0, 0, 24, 0, 0, 0,
|
||||||
|
|
|
@ -12,5 +12,6 @@ precise float main(XePSInput xe_input) : SV_Depth {
|
||||||
// allow for safe reinterpretation of any 24-bit value to and from float24
|
// allow for safe reinterpretation of any 24-bit value to and from float24
|
||||||
// depth using depth output without unrestricted depth range.
|
// depth using depth output without unrestricted depth range.
|
||||||
return asfloat(XeFloat20e4To32(
|
return asfloat(XeFloat20e4To32(
|
||||||
XeFloat32To20e4(asuint(saturate(xe_input.position.z * 2.0f))), true));
|
XeFloat32To20e4(asuint(saturate(xe_input.position.z * 2.0f)), true),
|
||||||
|
true));
|
||||||
}
|
}
|
||||||
|
|
|
@ -587,14 +587,17 @@ xesl_uint4 XeRG16SNormToRG16Float(xesl_uint4 packed_texels) {
|
||||||
// 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2).
|
// 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2).
|
||||||
// We also can't clamp the stored value to 1 as load->store->load must be exact.
|
// We also can't clamp the stored value to 1 as load->store->load must be exact.
|
||||||
|
|
||||||
uint XeFloat32To20e4(uint f32u32) {
|
uint XeFloat32To20e4(uint f32u32, bool round_to_nearest_even) {
|
||||||
// Keep only positive (high bit set means negative for both float and int) and
|
// Keep only positive (high bit set means negative for both float and int) and
|
||||||
// saturate to the maximum representable value near 2 (also dropping NaNs).
|
// saturate to the maximum representable value near 2 (also dropping NaNs).
|
||||||
f32u32 = min((f32u32 <= 0x7FFFFFFFu) ? f32u32 : 0u, 0x3FFFFFF8u);
|
f32u32 = min((f32u32 <= 0x7FFFFFFFu) ? f32u32 : 0u, 0x3FFFFFF8u);
|
||||||
uint denormalized =
|
uint denormalized =
|
||||||
((f32u32 & 0x7FFFFFu) | 0x800000u) >> min(113u - (f32u32 >> 23u), 24u);
|
((f32u32 & 0x7FFFFFu) | 0x800000u) >> min(113u - (f32u32 >> 23u), 24u);
|
||||||
uint f24u32 = (f32u32 < 0x38800000u) ? denormalized : (f32u32 + 0xC8000000u);
|
uint f24u32 = (f32u32 < 0x38800000u) ? denormalized : (f32u32 + 0xC8000000u);
|
||||||
return ((f24u32 + 3u + ((f24u32 >> 3u) & 1u)) >> 3u) & 0xFFFFFFu;
|
if (round_to_nearest_even) {
|
||||||
|
f24u32 += 3u + ((f24u32 >> 3u) & 1u);
|
||||||
|
}
|
||||||
|
return (f24u32 >> 3u) & 0xFFFFFFu;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint XeFloat20e4To32(uint f24u32, bool remap_to_0_to_0_5) {
|
uint XeFloat20e4To32(uint f24u32, bool remap_to_0_to_0_5) {
|
||||||
|
|
|
@ -126,7 +126,7 @@ float Float7e3To32(uint32_t f10) {
|
||||||
// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
|
// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
|
||||||
// 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2).
|
// 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2).
|
||||||
|
|
||||||
uint32_t Float32To20e4(float f32) {
|
uint32_t Float32To20e4(float f32, bool round_to_nearest_even) {
|
||||||
if (!(f32 > 0.0f)) {
|
if (!(f32 > 0.0f)) {
|
||||||
// Positive only, and not -0 or NaN.
|
// Positive only, and not -0 or NaN.
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -145,7 +145,10 @@ uint32_t Float32To20e4(float f32) {
|
||||||
// Rebias the exponent to represent the value as a normalized 20e4.
|
// Rebias the exponent to represent the value as a normalized 20e4.
|
||||||
f32u32 += 0xC8000000u;
|
f32u32 += 0xC8000000u;
|
||||||
}
|
}
|
||||||
return ((f32u32 + 3 + ((f32u32 >> 3) & 1)) >> 3) & 0xFFFFFF;
|
if (round_to_nearest_even) {
|
||||||
|
// Round to the nearest even: add 0b011 plus the lowest bit that survives the
// 3-bit shift below. The operand must not be added to itself here.
f32u32 += 3 + ((f32u32 >> 3) & 1);
|
||||||
|
}
|
||||||
|
return (f32u32 >> 3) & 0xFFFFFF;
|
||||||
}
|
}
|
||||||
|
|
||||||
float Float20e4To32(uint32_t f24) {
|
float Float20e4To32(uint32_t f24) {
|
||||||
|
|
|
@ -336,8 +336,8 @@ float Float7e3To32(uint32_t f10);
|
||||||
// Converts 24-bit unorm depth in the value (not clamping) to an IEEE-754 32-bit
|
// Converts 24-bit unorm depth in the value (not clamping) to an IEEE-754 32-bit
|
||||||
// floating-point number.
|
// floating-point number.
|
||||||
// Converts an IEEE-754 32-bit floating-point number to Xenos floating-point
|
// Converts an IEEE-754 32-bit floating-point number to Xenos floating-point
|
||||||
// depth, rounding to the nearest even.
|
// depth, rounding to the nearest even or towards zero.
|
||||||
uint32_t Float32To20e4(float f32);
|
uint32_t Float32To20e4(float f32, bool round_to_nearest_even);
|
||||||
// Converts Xenos floating-point depth in bits 0:23 (not clamping) to an
|
// Converts Xenos floating-point depth in bits 0:23 (not clamping) to an
|
||||||
// IEEE-754 32-bit floating-point number.
|
// IEEE-754 32-bit floating-point number.
|
||||||
float Float20e4To32(uint32_t f24);
|
float Float20e4To32(uint32_t f24);
|
||||||
|
|
Loading…
Reference in New Issue