[GPU] Viewport/clipping cleanup, don't clamp oDepth

This commit is contained in:
Triang3l 2021-05-04 21:39:34 +03:00
parent 95031d9471
commit fb01ccaac6
7 changed files with 428 additions and 260 deletions

View File

@ -2026,20 +2026,21 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
render_target_cache_->depth_float24_conversion();
draw_util::ViewportInfo viewport_info;
draw_util::GetHostViewportInfo(
regs, resolution_scale, true, float(D3D12_VIEWPORT_BOUNDS_MAX),
float(D3D12_VIEWPORT_BOUNDS_MAX), false,
regs, resolution_scale, true, D3D12_VIEWPORT_BOUNDS_MAX,
D3D12_VIEWPORT_BOUNDS_MAX, false,
host_render_targets_used &&
(depth_float24_conversion ==
RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating ||
depth_float24_conversion ==
RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding),
host_render_targets_used, viewport_info);
host_render_targets_used, pixel_shader && pixel_shader->writes_depth(),
viewport_info);
draw_util::Scissor scissor;
draw_util::GetScissor(regs, scissor);
scissor.left *= resolution_scale;
scissor.top *= resolution_scale;
scissor.width *= resolution_scale;
scissor.height *= resolution_scale;
scissor.offset[0] *= resolution_scale;
scissor.offset[1] *= resolution_scale;
scissor.extent[0] *= resolution_scale;
scissor.extent[1] *= resolution_scale;
// Update viewport, scissor, blend factor and stencil reference.
UpdateFixedFunctionState(viewport_info, scissor, primitive_polygonal);
@ -2811,20 +2812,20 @@ void D3D12CommandProcessor::UpdateFixedFunctionState(
// Viewport.
D3D12_VIEWPORT viewport;
viewport.TopLeftX = viewport_info.left;
viewport.TopLeftY = viewport_info.top;
viewport.Width = viewport_info.width;
viewport.Height = viewport_info.height;
viewport.TopLeftX = float(viewport_info.xy_offset[0]);
viewport.TopLeftY = float(viewport_info.xy_offset[1]);
viewport.Width = float(viewport_info.xy_extent[0]);
viewport.Height = float(viewport_info.xy_extent[1]);
viewport.MinDepth = viewport_info.z_min;
viewport.MaxDepth = viewport_info.z_max;
SetViewport(viewport);
// Scissor.
D3D12_RECT scissor_rect;
scissor_rect.left = LONG(scissor.left);
scissor_rect.top = LONG(scissor.top);
scissor_rect.right = LONG(scissor.left + scissor.width);
scissor_rect.bottom = LONG(scissor.top + scissor.height);
scissor_rect.left = LONG(scissor.offset[0]);
scissor_rect.top = LONG(scissor.offset[1]);
scissor_rect.right = LONG(scissor.offset[0] + scissor.extent[0]);
scissor_rect.bottom = LONG(scissor.offset[1] + scissor.extent[1]);
SetScissorRect(scissor_rect);
if (render_target_cache_->GetPath() ==
@ -3090,9 +3091,11 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
system_constants_.point_size_min_max[0] = point_size_min;
system_constants_.point_size_min_max[1] = point_size_max;
float point_screen_to_ndc_x =
(0.5f * 2.0f * resolution_scale) / viewport_info.width;
(/* 0.5f * 2.0f * */ float(resolution_scale)) /
std::max(viewport_info.xy_extent[0], uint32_t(1));
float point_screen_to_ndc_y =
(0.5f * 2.0f * resolution_scale) / viewport_info.height;
(/* 0.5f * 2.0f * */ float(resolution_scale)) /
std::max(viewport_info.xy_extent[1], uint32_t(1));
dirty |= system_constants_.point_screen_to_ndc[0] != point_screen_to_ndc_x;
dirty |= system_constants_.point_screen_to_ndc[1] != point_screen_to_ndc_y;
system_constants_.point_screen_to_ndc[0] = point_screen_to_ndc_x;

View File

@ -184,167 +184,353 @@ bool IsPixelShaderNeededWithRasterization(const Shader& shader,
}
void GetHostViewportInfo(const RegisterFile& regs, uint32_t resolution_scale,
bool origin_bottom_left, float x_max, float y_max,
bool allow_reverse_z, bool convert_z_to_float24,
bool full_float24_in_0_to_1,
bool origin_bottom_left, uint32_t x_max,
uint32_t y_max, bool allow_reverse_z,
bool convert_z_to_float24, bool full_float24_in_0_to_1,
bool pixel_shader_writes_depth,
ViewportInfo& viewport_info_out) {
assert_true(resolution_scale >= 1);
assert_true(x_max >= 1.0f);
assert_true(y_max >= 1.0f);
assert_not_zero(resolution_scale);
// PA_CL_VTE_CNTL contains whether offsets and scales are enabled.
// http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf
// In games, either all are enabled (for regular drawing) or none are (for
// rectangle lists usually).
// A vertex position goes the following path:
//
// If scale/offset is enabled, the Xenos shader is writing (neglecting W
// division) position in the NDC (-1, -1, dx_clip_space_def - 1) -> (1, 1, 1)
// box. If it's not, the position is in screen space. Since we can only use
// the NDC in PC APIs, we use a viewport of the largest possible size, and
// divide the position by it in translated shaders.
// = Vertex shader output in clip space, (-w, -w, 0) ... (w, w, w) for
// Direct3D or (-w, -w, -w) ... (w, w, w) for OpenGL.
// > Clipping to the boundaries of the clip space if enabled.
// > Division by W if not pre-divided.
// = Normalized device coordinates, (-1, -1, 0) ... (1, 1, 1) for Direct3D or
// (-1, -1, -1) ... (1, 1, 1) for OpenGL.
// > Viewport scaling.
// > Viewport, window and half-pixel offsetting.
// = Actual position in render target pixels used for rasterization and depth
// buffer coordinates.
//
// On modern PC graphics APIs, all drawing is done with clipping enabled (only
// Z clipping can be replaced with viewport depth range clamping).
//
// On the Xbox 360, however, there are two cases:
//
// - Clipping is enabled:
//
// Drawing "as normal", primarily for the game world. Draws are clipped to
// the (-w, -w, 0) ... (w, w, w) or (-w, -w, -w) ... (w, w, w) clip space.
//
// Ideally all offsets in pixels (window offset, half-pixel offset) are
// post-clip, and thus they would need to be applied via the host viewport
// (also the Direct3D 11.3 specification defines this as the correct way of
// reproducing the original Direct3D 9 half-pixel offset behavior).
//
// However, in reality, only WARP actually truly clips to -W...W, with the
// viewport fractional offset actually accurately making samples outside the
// fractional rectangle unable to be covered. AMD, Intel and Nvidia, in
// Direct3D 12, all don't truly clip even a really huge primitive to -W...W.
// Instead, primitives still overflow the fractional rectangle and cover
// samples outside of it. The actual viewport scissor is floor(TopLeftX,
// TopLeftY) ... floor(TopLeftX + Width, TopLeftY + Height), with flooring
// and addition in float32 (with 0x3F7FFFFF TopLeftXY, or 1.0f - ULP, all
// the samples in the top row / left column can be covered, while with
// 0x3F800000, or 1.0f, none of them can be).
//
// We are reproducing the same behavior here - what would happen if we were
// passing the guest values directly to Direct3D 12. Also, for consistency
// across hardware and APIs (especially Vulkan with viewportSubPixelBits
// being 0 rather than at least 8 on some devices - Arm Mali, Imagination
// PowerVR), and for simplicity of math, and also for exact calculations in
// bounds checking in validation layers of the host APIs, we are returning
// integer viewport coordinates, handling the fractional offset in the
// vertex shaders instead, via ndc_scale and ndc_offset - it shouldn't
// significantly affect precision that we will be doing the offsetting in
// W-scaled rather than W-divided units, the ratios of exponents involved in
// the calculations stay the same, and everything ends up being 16.8 anyway
// on most hardware, so small precision differences are very unlikely to
// affect coverage.
//
// FIXME(Triang3l): Overestimate or more properly round the viewport scissor
// boundaries if this flooring causes gaps on the bottom / right side in real
// games if any are found using fractional viewport coordinates. Viewport
// scissoring is not an inherent result of the viewport scale / offset, these
// are used merely for transformation of coordinates; rather, it's done by
// intersecting the viewport and scissor rectangles in the guest driver and
// writing the common portion to PA_SC_WINDOW_SCISSOR, so how the scissor is
// computed for a fractional viewport is entirely up to the guest.
//
// Even though Xbox 360 games are designed for Direct3D, with 0...W range of
// Z in clip space, the GPU also allows -W...W. Since Xenia is not targeting
// OpenGL (where it would be toggled via glClipControl - or, on ES, it would
// always be -W...W), this function always remaps it to 0...W, though
// numerically not precisely (0 is moved to 0.5, locking the exponent near
// what was the truly floating-point 0 originally). It is the guest
// viewport's responsibility (haven't checked, but it's logical) to remap
// from -1...1 in the NDC to glDepthRange within the 0...1 range. Also -Z
// pointing forward in OpenGL doesn't matter here (the -W...W clip space is
// symmetric).
//
// - Clipping is disabled:
//
// The most common case of drawing without clipping in games is screen-space
// draws, most prominently clears, directly in render target coordinates.
//
// In this particular case (though all the general case arithmetic still
// applies), the vertex shader returns a position in pixels, pre-divided by
// W (though this doesn't matter if W is 1).
//
// Because clipping is disabled, this huge polygon with, for example,
// a (1280, 720, 0, 1) vertex, is not clipped to (-w, -w) ... (w, w), so the
// vertex becomes (1280, 720) in the NDC as well (even though in regular 3D
// draws with clipping, disregarding the guard band for simplicity, it can't
// be bigger than (1, 1) after clipping and the division by W).
//
// For these draws, the viewport is also usually disabled (though, again, it
// doesn't have to be - an enabled viewport would likely still work as
// usual) by disabling PA_CL_VTE_CNTL::VPORT_X/Y/Z_SCALE/OFFSET_ENA - which
// is equivalent to a viewport scale of (1, 1, 1) and offset of (0, 0, 0).
// This results in the NDC being treated directly as pixel coordinates.
// Normally, with clipping, this would make it possible to cover only a tiny
// 1x1 area in the corner of the render target (and 3 unreachable pixels
// outside of the render target). The window offset is then applied, if
// needed, as well as the half-pixel offset.
//
// It's also possible (though not verified) that without clipping, Z (as a
// result of, for instance, polygon offset, or explicit calculations in the
// vertex shader) may end up outside the viewport Z range. Direct3D 10
// requires clamping to the viewport Z bounds in all cases in the
// output-merger according to the Direct3D 11.3 functional specification. A
// different behavior is likely on the Xbox 360, however, because while
// Direct3D 10-compatible AMD GPUs such as the R600 have
// PA_SC_VPORT_ZMIN/ZMAX registers, the Adreno 200 doesn't seem to have any
// equivalents, neither in PA nor in RB. This probably also applies to
// shader depth output - possibly doesn't need to be clamped as well.
//
// On the PC, we need to emulate disabled clipping by using a viewport at
// least as large as the scissor region within the render target, as well as
// the full viewport depth range (plus changing Z clipping to Z clamping on
// the host if possible), and rescale from the guest clip space to the host
// "no clip" clip space, as well as apply the viewport, the window offset,
// and the half-pixel offset, in the vertex shader. Ideally, the host
// viewport should have a power of 2 size - so scaling doesn't affect
// precision, and is merely an exponent bias.
//
// NDC XY point towards +XY on the render target - the viewport scale sign
// handles the remapping from Direct3D 9 -Y towards +U to a generic
// transformation from the NDC to pixel coordinates.
//
// TODO(Triang3l): Investigate the need for clamping of oDepth to 0...1 for
// D24FS8 as well.
auto pa_cl_clip_cntl = regs.Get<reg::PA_CL_CLIP_CNTL>();
auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
auto pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>();
auto pa_su_vtx_cntl = regs.Get<reg::PA_SU_VTX_CNTL>();
float viewport_left, viewport_top;
float viewport_width, viewport_height;
float ndc_scale_x, ndc_scale_y;
float ndc_offset_x, ndc_offset_y;
// To avoid zero size viewports, which would harm division and aren't allowed
// on Vulkan. Nothing will ever be covered by a viewport of this size - this
// is 2 orders of magnitude smaller than a .8 subpixel, and thus shouldn't
// have any effect on rounding, n and n + 1 / 1024 would be rounded to the
// same .8 fixed-point value, thus in fixed-point, the viewport would have
// zero size.
const float size_min = 1.0f / 1024.0f;
float viewport_offset_x = pa_cl_vte_cntl.vport_x_offset_ena
? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32
: 0.0f;
float viewport_offset_y = pa_cl_vte_cntl.vport_y_offset_ena
? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32
: 0.0f;
// Obtain the original viewport values in a normalized way.
float scale_xy[] = {
pa_cl_vte_cntl.vport_x_scale_ena ? regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32
: 1.0f,
pa_cl_vte_cntl.vport_y_scale_ena ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32
: 1.0f,
};
float scale_z = pa_cl_vte_cntl.vport_z_scale_ena
? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32
: 1.0f;
float offset_base_xy[] = {
pa_cl_vte_cntl.vport_x_offset_ena
? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32
: 0.0f,
pa_cl_vte_cntl.vport_y_offset_ena
? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32
: 0.0f,
};
float offset_z = pa_cl_vte_cntl.vport_z_offset_ena
? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32
: 0.0f;
// Calculate all the integer.0 or integer.5 offsetting exactly at full
// precision, separately so it can be used in other integer calculations
// without double rounding if needed.
float offset_add_xy[2] = {};
if (pa_su_sc_mode_cntl.vtx_window_offset_enable) {
auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
viewport_offset_x += float(pa_sc_window_offset.window_x_offset);
viewport_offset_y += float(pa_sc_window_offset.window_y_offset);
offset_add_xy[0] += float(pa_sc_window_offset.window_x_offset);
offset_add_xy[1] += float(pa_sc_window_offset.window_y_offset);
}
if (pa_cl_vte_cntl.vport_x_scale_ena) {
float pa_cl_vport_xscale = regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32;
float viewport_scale_x_abs =
std::abs(pa_cl_vport_xscale) * resolution_scale;
viewport_left = viewport_offset_x * resolution_scale - viewport_scale_x_abs;
float viewport_right = viewport_left + viewport_scale_x_abs * 2.0f;
// Keep the viewport in the positive quarter-plane for simplicity of
// clamping to the maximum supported bounds.
float cutoff_left = std::fmax(-viewport_left, 0.0f);
float cutoff_right = std::fmax(viewport_right - x_max, 0.0f);
viewport_left = std::fmax(viewport_left, 0.0f);
viewport_right = std::fmin(viewport_right, x_max);
viewport_width = viewport_right - viewport_left;
if (viewport_width > size_min) {
ndc_scale_x =
(viewport_width + cutoff_left + cutoff_right) / viewport_width;
if (pa_cl_vport_xscale < 0.0f) {
ndc_scale_x = -ndc_scale_x;
}
ndc_offset_x =
((cutoff_right - cutoff_left) * (0.5f * 2.0f)) / viewport_width;
} else {
// Empty viewport, but don't pass 0 because that's against the Vulkan
// specification.
viewport_left = 0.0f;
viewport_width = size_min;
ndc_scale_x = 0.0f;
ndc_offset_x = 0.0f;
}
} else {
// Drawing without a viewport and without clipping to one - use a viewport
// covering the entire potential guest render target or the positive part of
// the host viewport area, whichever is smaller, and apply the offset, if
// enabled, via the shader.
viewport_left = 0.0f;
viewport_width = std::min(
float(xenos::kTexture2DCubeMaxWidthHeight) * resolution_scale, x_max);
ndc_scale_x = (2.0f * resolution_scale) / viewport_width;
ndc_offset_x = viewport_offset_x * ndc_scale_x - 1.0f;
}
if (pa_cl_vte_cntl.vport_y_scale_ena) {
float pa_cl_vport_yscale = regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32;
float viewport_scale_y_abs =
std::abs(pa_cl_vport_yscale) * resolution_scale;
viewport_top = viewport_offset_y * resolution_scale - viewport_scale_y_abs;
float viewport_bottom = viewport_top + viewport_scale_y_abs * 2.0f;
float cutoff_top = std::fmax(-viewport_top, 0.0f);
float cutoff_bottom = std::fmax(viewport_bottom - y_max, 0.0f);
viewport_top = std::fmax(viewport_top, 0.0f);
viewport_bottom = std::fmin(viewport_bottom, y_max);
viewport_height = viewport_bottom - viewport_top;
if (viewport_height > size_min) {
ndc_scale_y =
(viewport_height + cutoff_top + cutoff_bottom) / viewport_height;
if (pa_cl_vport_yscale < 0.0f) {
ndc_scale_y = -ndc_scale_y;
}
ndc_offset_y =
((cutoff_bottom - cutoff_top) * (0.5f * 2.0f)) / viewport_height;
} else {
// Empty viewport, but don't pass 0 because that's against the Vulkan
// specification.
viewport_top = 0.0f;
viewport_height = size_min;
ndc_scale_y = 0.0f;
ndc_offset_y = 0.0f;
}
} else {
viewport_top = 0.0f;
viewport_height = std::min(
float(xenos::kTexture2DCubeMaxWidthHeight) * resolution_scale, y_max);
ndc_scale_y = (2.0f * resolution_scale) / viewport_height;
ndc_offset_y = viewport_offset_y * ndc_scale_y - 1.0f;
}
// Apply the vertex half-pixel offset via the shader (it must not affect
// clipping, otherwise with resolution scale, samples in the left/top half
// will never be covered).
if (cvars::half_pixel_offset && !pa_su_vtx_cntl.pix_center) {
float half_pixel_offset_ndc_scale = 0.5f * 2.0f * resolution_scale;
ndc_offset_x += half_pixel_offset_ndc_scale / viewport_width;
ndc_offset_y += half_pixel_offset_ndc_scale / viewport_height;
offset_add_xy[0] += 0.5f;
offset_add_xy[1] += 0.5f;
}
if (origin_bottom_left) {
ndc_scale_y = -ndc_scale_y;
ndc_offset_y = -ndc_offset_y;
// The maximum value is at least the maximum host render target size anyway -
// and a guest pixel is always treated as a whole with resolution scaling.
uint32_t xy_max_unscaled[] = {x_max / resolution_scale,
y_max / resolution_scale};
assert_not_zero(xy_max_unscaled[0]);
assert_not_zero(xy_max_unscaled[1]);
float z_min;
float z_max;
float ndc_scale[3];
float ndc_offset[3];
if (pa_cl_clip_cntl.clip_disable) {
// Clipping is disabled - use a huge host viewport, perform pixel and depth
// offsetting in the vertex shader.
// XY.
for (uint32_t i = 0; i < 2; ++i) {
viewport_info_out.xy_offset[i] = 0;
uint32_t extent_axis_unscaled =
std::min(xenos::kTexture2DCubeMaxWidthHeight, xy_max_unscaled[i]);
viewport_info_out.xy_extent[i] = extent_axis_unscaled * resolution_scale;
float extent_axis_unscaled_float = float(extent_axis_unscaled);
float pixels_to_ndc_axis = 2.0f / extent_axis_unscaled_float;
ndc_scale[i] = scale_xy[i] * pixels_to_ndc_axis;
ndc_offset[i] = (offset_base_xy[i] - extent_axis_unscaled_float * 0.5f +
offset_add_xy[i]) *
pixels_to_ndc_axis;
}
// Z.
z_min = 0.0f;
z_max = 1.0f;
ndc_scale[2] = scale_z;
ndc_offset[2] = offset_z;
} else {
// Clipping is enabled - perform pixel and depth offsetting via the host
// viewport.
// XY.
for (uint32_t i = 0; i < 2; ++i) {
// With resolution scaling, do all viewport XY scissoring in guest pixels
// if fractional and for the half-pixel offset - we treat guest pixels as
// a whole, and also the half-pixel offset would be irreversible in guest
// vertices if we did flooring in host pixels. Also, truncation is done
// instead of flooring for simplicity - since maxing with 0 is done anyway
// (we only return viewports in the positive quarter-plane).
float offset_axis = offset_base_xy[i] + offset_add_xy[i];
float scale_axis = scale_xy[i];
float scale_axis_abs = std::abs(scale_xy[i]);
float axis_0 = offset_axis - scale_axis_abs;
float axis_1 = offset_axis + scale_axis_abs;
float axis_max_unscaled_float = float(xy_max_unscaled[i]);
// fmax to drop NaN and < 0, min as float (axis_max_unscaled_float is well
// below 2^24) to safely drop very large values.
uint32_t axis_0_int =
uint32_t(std::min(std::fmax(axis_0, 0.0f), axis_max_unscaled_float));
uint32_t axis_1_int =
uint32_t(std::min(std::fmax(axis_1, 0.0f), axis_max_unscaled_float));
uint32_t axis_extent_int = axis_1_int - axis_0_int;
viewport_info_out.xy_offset[i] = axis_0_int * resolution_scale;
viewport_info_out.xy_extent[i] = axis_extent_int * resolution_scale;
float ndc_scale_axis;
float ndc_offset_axis;
if (axis_extent_int) {
// Rescale from the old bounds to the new ones, and also apply the sign.
// If the new bounds are smaller than the old, for instance, we're
// cropping - the new -W...W clip space is a subregion of the old one -
// the scale should be > 1 so the area being cut off ends up outside
// -W...W. If the new region should include more than the original clip
// space, a region previously outside -W...W should end up within it, so
// the scale should be < 1.
float axis_extent_rounded = float(axis_extent_int);
ndc_scale_axis = scale_axis * 2.0f / axis_extent_rounded;
// Move the origin of the snapped coordinates back to the original one.
ndc_offset_axis = (float(offset_axis) -
(float(axis_0_int) + axis_extent_rounded * 0.5f)) *
2.0f / axis_extent_rounded;
} else {
// Empty viewport (everything outside the viewport scissor).
ndc_scale_axis = 1.0f;
ndc_offset_axis = 0.0f;
}
ndc_scale[i] = ndc_scale_axis;
ndc_offset[i] = ndc_offset_axis;
}
// Z.
float host_clip_offset_z;
float host_clip_scale_z;
if (pa_cl_clip_cntl.dx_clip_space_def) {
host_clip_offset_z = offset_z;
host_clip_scale_z = scale_z;
ndc_scale[2] = 1.0f;
ndc_offset[2] = 0.0f;
} else {
// Normalizing both Direct3D / Vulkan 0...W and OpenGL -W...W clip spaces
// to 0...W. We are not targeting OpenGL, but there we could accept the
// wanted clip space (Direct3D, OpenGL, or any) and return the actual one
// (Direct3D or OpenGL).
//
// If the guest wants to use -W...W clip space (-1...1 NDC) and a 0...1
// depth range in the end, it's expected to use ZSCALE of 0.5 and ZOFFSET
// of 0.5.
//
// We are providing the near and the far (or offset and offset + scale)
// plane distances to the host API in a way that the near maps to Z = 0
// and the far maps to Z = W in clip space (or Z = 1 in NDC).
//
// With D3D offset and scale that we want, assuming D3D clip space input,
// the formula for the depth would be:
//
// depth = offset_d3d + scale_d3d * ndc_z_d3d
//
// We are remapping the incoming OpenGL Z from -W...W to 0...W by scaling
// it by 0.5 and adding 0.5 * W to the result. So, our depth formula would
// be:
//
// depth = offset_d3d + scale_d3d * (ndc_z_gl * 0.5 + 0.5)
//
// The guest registers, however, contain the offset and the scale for
// remapping not from 0...W to near...far, but from -W...W to near...far,
// or:
//
// depth = offset_gl + scale_gl * ndc_z_gl
//
// Knowing offset_gl, scale_gl and how ndc_z_d3d can be obtained from
// ndc_z_gl, we need to derive the formulas for the needed offset_d3d and
// scale_d3d to apply them to the incoming ndc_z_d3d.
//
// depth = offset_gl + scale_gl * (ndc_z_d3d * 2 - 1)
//
// Expanding:
//
// depth = offset_gl + (scale_gl * ndc_z_d3d * 2 - scale_gl)
//
// Reordering:
//
// depth = (offset_gl - scale_gl) + (scale_gl * 2) * ndc_z_d3d
// offset_d3d = offset_gl - scale_gl
// scale_d3d = scale_gl * 2
host_clip_offset_z = offset_z - scale_z;
host_clip_scale_z = scale_z * 2.0f;
// Need to remap -W...W clip space to 0...W via ndc_scale and ndc_offset -
// by scaling Z by 0.5 and adding 0.5 * W to it.
ndc_scale[2] = 0.5f;
ndc_offset[2] = 0.5f;
}
if (pixel_shader_writes_depth) {
// Allow the pixel shader to write any depth value since
// PA_SC_VPORT_ZMIN/ZMAX isn't present on the Adreno 200; guest pixel
// shaders don't have access to the original Z in the viewport space
// anyway and likely must write the depth on all execution paths.
z_min = 0.0f;
z_max = 1.0f;
} else {
// This clamping is not very correct, but just for safety. Direct3D
// doesn't allow an unrestricted depth range. Vulkan does, as an
// extension. But cases when this really matters are yet to be found -
// trying to fix this will result in more correct depth values, but
// incorrect clipping.
z_min = std::min(std::fmax(host_clip_offset_z, 0.0f), 1.0f);
z_max = std::min(std::fmax(host_clip_offset_z + host_clip_scale_z, 0.0f),
1.0f);
// Direct3D 12 doesn't allow reverse depth range - on some drivers it
// works, on some drivers it doesn't, actually, but it was never
// explicitly allowed by the specification.
if (!allow_reverse_z && z_min > z_max) {
std::swap(z_min, z_max);
ndc_scale[2] = -ndc_scale[2];
ndc_offset[2] = 1.0f - ndc_offset[2];
}
}
}
float viewport_scale_z = pa_cl_vte_cntl.vport_z_scale_ena
? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32
: 1.0f;
float viewport_offset_z = pa_cl_vte_cntl.vport_z_offset_ena
? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32
: 0.0f;
// Vulkan requires the depth bounds to be in the 0 to 1 range without
// VK_EXT_depth_range_unrestricted (which isn't used on the Xbox 360).
float viewport_z_min = std::min(std::fmax(viewport_offset_z, 0.0f), 1.0f);
float viewport_z_max =
std::min(std::fmax(viewport_offset_z + viewport_scale_z, 0.0f), 1.0f);
// When VPORT_Z_SCALE_ENA is disabled, Z/W is directly what is expected to be
// written to the depth buffer, and for some reason DX_CLIP_SPACE_DEF isn't
// set in this case in draws in games.
bool gl_clip_space_def =
!pa_cl_clip_cntl.dx_clip_space_def && pa_cl_vte_cntl.vport_z_scale_ena;
float ndc_scale_z = gl_clip_space_def ? 0.5f : 1.0f;
float ndc_offset_z = gl_clip_space_def ? 0.5f : 0.0f;
if (viewport_z_min > viewport_z_max && !allow_reverse_z) {
std::swap(viewport_z_min, viewport_z_max);
ndc_scale_z = -ndc_scale_z;
ndc_offset_z = 1.0f - ndc_offset_z;
}
if (GetDepthControlForCurrentEdramMode(regs).z_enable &&
regs.Get<reg::RB_DEPTH_INFO>().depth_format ==
xenos::DepthRenderTargetFormat::kD24FS8) {
@ -352,34 +538,30 @@ void GetHostViewportInfo(const RegisterFile& regs, uint32_t resolution_scale,
// Need to adjust the bounds that the resulting depth values will be
// clamped to after the pixel shader. Preferring adding some error to
// interpolated Z instead if conversion can't be done exactly, without
// modifying clipping bounds by adjusting Z in vertex shaders, as that may
// cause polygons placed explicitly at Z = 0 or Z = W to be clipped.
viewport_z_min =
xenos::Float20e4To32(xenos::Float32To20e4(viewport_z_min));
viewport_z_max =
xenos::Float20e4To32(xenos::Float32To20e4(viewport_z_max));
// modifying clipping bounds by adjusting Z in vertex shaders, as that
// may cause polygons placed explicitly at Z = 0 or Z = W to be clipped.
z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min));
z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max));
}
if (full_float24_in_0_to_1) {
// Remap the full [0...2) float24 range to [0...1) to support data
// round-trip during render target ownership transfer of EDRAM tiles through
// depth input without unrestricted depth range.
viewport_z_min *= 0.5f;
viewport_z_max *= 0.5f;
z_min *= 0.5f;
z_max *= 0.5f;
}
}
viewport_info_out.z_min = z_min;
viewport_info_out.z_max = z_max;
viewport_info_out.left = viewport_left;
viewport_info_out.top = viewport_top;
viewport_info_out.width = viewport_width;
viewport_info_out.height = viewport_height;
viewport_info_out.z_min = viewport_z_min;
viewport_info_out.z_max = viewport_z_max;
viewport_info_out.ndc_scale[0] = ndc_scale_x;
viewport_info_out.ndc_scale[1] = ndc_scale_y;
viewport_info_out.ndc_scale[2] = ndc_scale_z;
viewport_info_out.ndc_offset[0] = ndc_offset_x;
viewport_info_out.ndc_offset[1] = ndc_offset_y;
viewport_info_out.ndc_offset[2] = ndc_offset_z;
if (origin_bottom_left) {
ndc_scale[1] = -ndc_scale[1];
ndc_offset[1] = -ndc_offset[1];
}
for (uint32_t i = 0; i < 3; ++i) {
viewport_info_out.ndc_scale[i] = ndc_scale[i];
viewport_info_out.ndc_offset[i] = ndc_offset[i];
}
}
void GetScissor(const RegisterFile& regs, Scissor& scissor_out) {
@ -420,10 +602,10 @@ void GetScissor(const RegisterFile& regs, Scissor& scissor_out) {
// console, but no evidence of such has ever been seen).
br_x = std::max(br_x, tl_x);
br_y = std::max(br_y, tl_y);
scissor_out.left = tl_x;
scissor_out.top = tl_y;
scissor_out.width = br_x - tl_x;
scissor_out.height = br_y - tl_y;
scissor_out.offset[0] = tl_x;
scissor_out.offset[1] = tl_y;
scissor_out.extent[0] = br_x - tl_x;
scissor_out.extent[1] = br_y - tl_y;
}
xenos::CopySampleSelect SanitizeCopySampleSelect(

View File

@ -77,15 +77,27 @@ bool IsPixelShaderNeededWithRasterization(const Shader& shader,
const RegisterFile& regs);
struct ViewportInfo {
// The returned viewport will always be in the positive quarter-plane for
// simplicity of clamping to the maximum size supported by the host, negative
// offset will be applied via ndc_offset.
float left;
float top;
float width;
float height;
// Offset from render target UV = 0 to +UV.
// For simplicity of cropping to the maximum size on the host; to match the
// Direct3D 12 clipping / scissoring behavior with a fractional viewport, to
// floor(TopLeftXY) ... floor(TopLeftXY + WidthHeight), on the real AMD, Intel
// and Nvidia hardware (not WARP); as well as to hide the differences between
// 0 and 8+ viewportSubPixelBits on Vulkan, and to prevent any numerical error
// in bound checking in host APIs, viewport bounds are returned as integers.
// They're also returned as non-negative to make it easier to crop (so
// Vulkan maxViewportDimensions and viewportBoundsRange don't have to be
// handled separately - maxViewportDimensions is greater than or equal to the
// largest framebuffer image size, so it's safe, and viewportBoundsRange is
// always bigger than maxViewportDimensions). All fractional offsetting,
// including the half-pixel offset, and cropping are handled via ndc_scale and
// ndc_offset.
uint32_t xy_offset[2];
// Extent can be zero for an empty viewport - host APIs not supporting empty
// viewports need to use an empty scissor rectangle.
uint32_t xy_extent[2];
float z_min;
float z_max;
// The scale is applied before the offset (like using multiply-add).
float ndc_scale[3];
float ndc_offset[3];
};
@ -94,16 +106,17 @@ struct ViewportInfo {
// host graphics APIs such as Direct3D 11+ and Vulkan, also forcing it to the
// Direct3D clip space with 0...W Z rather than -W...W.
void GetHostViewportInfo(const RegisterFile& regs, uint32_t resolution_scale,
bool origin_bottom_left, float x_max, float y_max,
bool allow_reverse_z, bool convert_z_to_float24,
bool full_float24_in_0_to_1,
bool origin_bottom_left, uint32_t x_max,
uint32_t y_max, bool allow_reverse_z,
bool convert_z_to_float24, bool full_float24_in_0_to_1,
bool pixel_shader_writes_depth,
ViewportInfo& viewport_info_out);
struct Scissor {
uint32_t left;
uint32_t top;
uint32_t width;
uint32_t height;
// Offset from render target UV = 0 to +UV.
uint32_t offset[2];
// Extent can be zero.
uint32_t extent[2];
};
void GetScissor(const RegisterFile& regs, Scissor& scissor_out);

View File

@ -1511,8 +1511,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result,
// 20e4-as-32 conversion and with 0...1 to 0...0.5 float24 remapping.
// Though 20e4 float depth can store values between 1 and 2, it's a very
// unusual case. Direct3D 10+ SV_Depth, however, can accept any values,
// including specials, when the depth buffer is floating-point; but depth
// is clamped to the viewport bounds anyway.
// including specials, when the depth buffer is floating-point.
is_clamped = true;
break;
}

View File

@ -771,11 +771,11 @@ class DxbcShaderTranslator : public ShaderTranslator {
return !is_depth_only_pixel_shader_ && !current_shader().writes_depth() &&
!current_shader().is_valid_memexport_used();
}
// Converts the depth value to 24-bit (storing the result in bits 0:23 and
// zeros in 24:31, not creating room for stencil - since this may be involved
// in comparisons) according to the format specified in the system constants.
// Source and destination may be the same, temporary must be different than
// both.
// Converts the pre-clamped depth value to 24-bit (storing the result in bits
// 0:23 and zeros in 24:31, not creating room for stencil - since this may be
// involved in comparisons) according to the format specified in the system
// constants. Source and destination may be the same, temporary must be
// different than both.
// NOTE(review): judging by the parameter names, d24_temp / d24_temp_component
// presumably select the destination register and component, d32_temp /
// d32_temp_component the source, and temp_temp / temp_temp_component the
// scratch register - confirm against the definition.
void ROV_DepthTo24Bit(uint32_t d24_temp, uint32_t d24_temp_component,
uint32_t d32_temp, uint32_t d32_temp_component,
uint32_t temp_temp, uint32_t temp_temp_component);

View File

@ -457,37 +457,10 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() {
if (!i) {
if (shader_writes_depth) {
// Clamp oDepth to the lower viewport depth bound (depth clamp happens
// after the pixel shader in the pipeline, at least on Direct3D 11 and
// Vulkan, thus applies to the shader's depth output too).
system_constants_used_ |= 1ull << kSysConst_EdramDepthRange_Index;
a_.OpMax(dxbc::Dest::R(system_temp_depth_stencil_, 0b0001),
dxbc::Src::R(system_temp_depth_stencil_, dxbc::Src::kXXXX),
dxbc::Src::CB(cbuffer_index_system_constants_,
uint32_t(CbufferRegister::kSystemConstants),
kSysConst_EdramDepthRange_Vec)
.Select(kSysConst_EdramDepthRangeOffset_Comp));
// Calculate the upper Z range bound to temp.x for clamping after
// biasing.
// temp.x = viewport maximum depth
system_constants_used_ |= 1ull << kSysConst_EdramDepthRange_Index;
a_.OpAdd(temp_x_dest,
dxbc::Src::CB(cbuffer_index_system_constants_,
uint32_t(CbufferRegister::kSystemConstants),
kSysConst_EdramDepthRange_Vec)
.Select(kSysConst_EdramDepthRangeOffset_Comp),
dxbc::Src::CB(cbuffer_index_system_constants_,
uint32_t(CbufferRegister::kSystemConstants),
kSysConst_EdramDepthRange_Vec)
.Select(kSysConst_EdramDepthRangeScale_Comp));
// Clamp oDepth to the upper viewport depth bound (already not above 1,
// but saturate for total safety).
// temp.x = free
a_.OpMin(dxbc::Dest::R(system_temp_depth_stencil_, 0b0001),
dxbc::Src::R(system_temp_depth_stencil_, dxbc::Src::kXXXX),
temp_x_src, true);
// Convert the shader-generated depth to 24-bit, using temp.x as
// temporary.
// temporary. oDepth is already written by StoreResult with saturation,
// no need to clamp here. Adreno 200 doesn't have PA_SC_VPORT_ZMIN/ZMAX,
// so likely there's no need to clamp to the viewport depth bounds.
ROV_DepthTo24Bit(system_temp_depth_stencil_, 0,
system_temp_depth_stencil_, 0, temp, 0);
} else {

View File

@ -22,6 +22,7 @@
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/gpu/draw_util.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/xenos.h"
@ -562,35 +563,32 @@ bool RenderTargetCache::Update(bool is_rasterization_done,
GetRenderTargetHeight(pitch_tiles_at_32bpp, msaa_samples);
int32_t window_y_offset =
regs.Get<reg::PA_SC_WINDOW_OFFSET>().window_y_offset;
auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
if (pa_cl_vte_cntl.vport_y_scale_ena) {
if (!regs.Get<reg::PA_CL_CLIP_CNTL>().clip_disable) {
auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
float viewport_bottom = 0.0f;
if (pa_cl_vte_cntl.vport_y_offset_ena) {
viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
}
// First calculate all the integer.0 or integer.5 offsetting exactly at full
// precision.
if (regs.Get<reg::PA_SU_SC_MODE_CNTL>().vtx_window_offset_enable) {
viewport_bottom += float(window_y_offset);
}
viewport_bottom += std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32);
uint32_t viewport_bottom_fixed = uint32_t(std::max(
draw_util::FloatToD3D11Fixed16p8(viewport_bottom), int32_t(0)));
uint32_t viewport_bottom_pixels = viewport_bottom_fixed >> 8;
// Without MSAA, the center must be covered - according to the top-left
// rasterization rule, for the bottom, the test is exclusive. If the last
// row is included in the viewport only partially, check if its center is
// precisely potentially covered to round - to more safely catch, for
// example, if the game does something with the half-pixel offset through
// the viewport.
// With MSAA, it's less likely that the game will use the viewport to
// manipulate the half-pixel offset - different host implementations may
// also use different sample positions (up to the topmost row - possible to
// set such sample positions in PC APIs), so just check if the last row's
// area is at least slightly covered.
if ((viewport_bottom_fixed & uint32_t(0xFF)) >
uint32_t(msaa_samples != xenos::MsaaSamples::k1X ? 0 : 0x80)) {
++viewport_bottom_pixels;
if (cvars::half_pixel_offset &&
!regs.Get<reg::PA_SU_VTX_CNTL>().pix_center) {
viewport_bottom += 0.5f;
}
height_used = std::min(height_used, viewport_bottom_pixels);
// Then apply the floating-point viewport offset.
if (pa_cl_vte_cntl.vport_y_offset_ena) {
viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
}
viewport_bottom += pa_cl_vte_cntl.vport_y_scale_ena
? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32)
: 1.0f;
// Using floor, or, rather, truncation (because maxing with zero anyway)
// similar to how viewport scissoring behaves on real AMD, Intel and Nvidia
// GPUs on Direct3D 12, also like in draw_util::GetHostViewportInfo.
// fmax to drop NaN and < 0, min as float (height_used is well below 2^24)
// to safely drop very large values.
height_used = uint32_t(
std::min(std::fmax(viewport_bottom, 0.0f), float(height_used)));
}
uint32_t scissor_bottom = regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>().br_y;
if (!regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>().window_offset_disable) {