From fb01ccaac6397a9d7e19e0338d4bbfb4a6015dfe Mon Sep 17 00:00:00 2001 From: Triang3l Date: Tue, 4 May 2021 21:39:34 +0300 Subject: [PATCH] [GPU] Viewport/clipping cleanup, don't clamp oDepth --- .../gpu/d3d12/d3d12_command_processor.cc | 37 +- src/xenia/gpu/draw_util.cc | 518 ++++++++++++------ src/xenia/gpu/draw_util.h | 41 +- src/xenia/gpu/dxbc_shader_translator.cc | 3 +- src/xenia/gpu/dxbc_shader_translator.h | 10 +- src/xenia/gpu/dxbc_shader_translator_om.cc | 33 +- src/xenia/gpu/render_target_cache.cc | 46 +- 7 files changed, 428 insertions(+), 260 deletions(-) diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index b1419fc30..56bf3322c 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -2026,20 +2026,21 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, render_target_cache_->depth_float24_conversion(); draw_util::ViewportInfo viewport_info; draw_util::GetHostViewportInfo( - regs, resolution_scale, true, float(D3D12_VIEWPORT_BOUNDS_MAX), - float(D3D12_VIEWPORT_BOUNDS_MAX), false, + regs, resolution_scale, true, D3D12_VIEWPORT_BOUNDS_MAX, + D3D12_VIEWPORT_BOUNDS_MAX, false, host_render_targets_used && (depth_float24_conversion == RenderTargetCache::DepthFloat24Conversion::kOnOutputTruncating || depth_float24_conversion == RenderTargetCache::DepthFloat24Conversion::kOnOutputRounding), - host_render_targets_used, viewport_info); + host_render_targets_used, pixel_shader && pixel_shader->writes_depth(), + viewport_info); draw_util::Scissor scissor; draw_util::GetScissor(regs, scissor); - scissor.left *= resolution_scale; - scissor.top *= resolution_scale; - scissor.width *= resolution_scale; - scissor.height *= resolution_scale; + scissor.offset[0] *= resolution_scale; + scissor.offset[1] *= resolution_scale; + scissor.extent[0] *= resolution_scale; + scissor.extent[1] *= resolution_scale; // Update viewport, scissor, blend factor and stencil reference. UpdateFixedFunctionState(viewport_info, scissor, primitive_polygonal); @@ -2811,20 +2812,20 @@ void D3D12CommandProcessor::UpdateFixedFunctionState( // Viewport. D3D12_VIEWPORT viewport; - viewport.TopLeftX = viewport_info.left; - viewport.TopLeftY = viewport_info.top; - viewport.Width = viewport_info.width; - viewport.Height = viewport_info.height; + viewport.TopLeftX = float(viewport_info.xy_offset[0]); + viewport.TopLeftY = float(viewport_info.xy_offset[1]); + viewport.Width = float(viewport_info.xy_extent[0]); + viewport.Height = float(viewport_info.xy_extent[1]); viewport.MinDepth = viewport_info.z_min; viewport.MaxDepth = viewport_info.z_max; SetViewport(viewport); // Scissor. D3D12_RECT scissor_rect; - scissor_rect.left = LONG(scissor.left); - scissor_rect.top = LONG(scissor.top); - scissor_rect.right = LONG(scissor.left + scissor.width); - scissor_rect.bottom = LONG(scissor.top + scissor.height); + scissor_rect.left = LONG(scissor.offset[0]); + scissor_rect.top = LONG(scissor.offset[1]); + scissor_rect.right = LONG(scissor.offset[0] + scissor.extent[0]); + scissor_rect.bottom = LONG(scissor.offset[1] + scissor.extent[1]); SetScissorRect(scissor_rect); if (render_target_cache_->GetPath() == @@ -3090,9 +3091,11 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( system_constants_.point_size_min_max[0] = point_size_min; system_constants_.point_size_min_max[1] = point_size_max; float point_screen_to_ndc_x = - (0.5f * 2.0f * resolution_scale) / viewport_info.width; + (/* 0.5f * 2.0f * */ float(resolution_scale)) / + std::max(viewport_info.xy_extent[0], uint32_t(1)); float point_screen_to_ndc_y = - (0.5f * 2.0f * resolution_scale) / viewport_info.height; + (/* 0.5f * 2.0f * */ float(resolution_scale)) / + std::max(viewport_info.xy_extent[1], uint32_t(1)); dirty |= system_constants_.point_screen_to_ndc[0] != point_screen_to_ndc_x; dirty |= system_constants_.point_screen_to_ndc[1] != point_screen_to_ndc_y; system_constants_.point_screen_to_ndc[0] = point_screen_to_ndc_x; diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index 315b0a3c9..87eaf092c 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -184,167 +184,353 @@ bool IsPixelShaderNeededWithRasterization(const Shader& shader, } void GetHostViewportInfo(const RegisterFile& regs, uint32_t resolution_scale, - bool origin_bottom_left, float x_max, float y_max, - bool allow_reverse_z, bool convert_z_to_float24, - bool full_float24_in_0_to_1, + bool origin_bottom_left, uint32_t x_max, + uint32_t y_max, bool allow_reverse_z, + bool convert_z_to_float24, bool full_float24_in_0_to_1, + bool pixel_shader_writes_depth, ViewportInfo& viewport_info_out) { - assert_true(resolution_scale >= 1); - assert_true(x_max >= 1.0f); - assert_true(y_max >= 1.0f); + assert_not_zero(resolution_scale); - // PA_CL_VTE_CNTL contains whether offsets and scales are enabled. - // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf - // In games, either all are enabled (for regular drawing) or none are (for - // rectangle lists usually). + // A vertex position goes the following path: // - // If scale/offset is enabled, the Xenos shader is writing (neglecting W - // division) position in the NDC (-1, -1, dx_clip_space_def - 1) -> (1, 1, 1) - // box. If it's not, the position is in screen space. Since we can only use - // the NDC in PC APIs, we use a viewport of the largest possible size, and - // divide the position by it in translated shaders. + // = Vertex shader output in clip space, (-w, -w, 0) ... (w, w, w) for + // Direct3D or (-w, -w, -w) ... (w, w, w) for OpenGL. + // > Clipping to the boundaries of the clip space if enabled. + // > Division by W if not pre-divided. + // = Normalized device coordinates, (-1, -1, 0) ... (1, 1, 1) for Direct3D or + // (-1, -1, -1) ... (1, 1, 1) for OpenGL. + // > Viewport scaling. + // > Viewport, window and half-pixel offsetting. + // = Actual position in render target pixels used for rasterization and depth + // buffer coordinates. + // + // On modern PC graphics APIs, all drawing is done with clipping enabled (only + // Z clipping can be replaced with viewport depth range clamping). + // + // On the Xbox 360, however, there are two cases: + // + // - Clipping is enabled: + // + // Drawing "as normal", primarily for the game world. Draws are clipped to + // the (-w, -w, 0) ... (w, w, w) or (-w, -w, -w) ... (w, w, w) clip space. + // + // Ideally all offsets in pixels (window offset, half-pixel offset) are + // post-clip, and thus they would need to be applied via the host viewport + // (also the Direct3D 11.3 specification defines this as the correct way of + // reproducing the original Direct3D 9 half-pixel offset behavior). + // + // However, in reality, only WARP actually truly clips to -W...W, with the + // viewport fractional offset actually accurately making samples outside the + // fractional rectangle unable to be covered. AMD, Intel and Nvidia, in + // Direct3D 12, all don't truly clip even a really huge primitive to -W...W. + // Instead, primitives still overflow the fractional rectangle and cover + // samples outside of it. The actual viewport scissor is floor(TopLeftX, + // TopLeftY) ... floor(TopLeftX + Width, TopLeftY + Height), with flooring + // and addition in float32 (with 0x3F7FFFFF TopLeftXY, or 1.0f - ULP, all + // the samples in the top row / left column can be covered, while with + // 0x3F800000, or 1.0f, none of them can be). + // + // We are reproducing the same behavior here - what would happen if we'd be + // passing the guest values directly to Direct3D 12. Also, for consistency + // across hardware and APIs (especially Vulkan with viewportSubPixelBits + // being 0 rather than at least 8 on some devices - Arm Mali, Imagination + // PowerVR), and for simplicity of math, and also for exact calculations in + // bounds checking in validation layers of the host APIs, we are returning + // integer viewport coordinates, handling the fractional offset in the + // vertex shaders instead, via ndc_scale and ndc_offset - it shouldn't + // significantly affect precision that we will be doing the offsetting in + // W-scaled rather than W-divided units, the ratios of exponents involved in + // the calculations stay the same, and everything ends up being 16.8 anyway + // on most hardware, so small precision differences are very unlikely to + // affect coverage. + // + // FIXME(Triang3l): Overestimate or more properly round the viewport scissor + // boundaries if this flooring causes gaps on the bottom / right side in real + // games if any are found using fractional viewport coordinates. Viewport + // scissoring is not an inherent result of the viewport scale / offset, these + // are used merely for transformation of coordinates; rather, it's done by + // intersecting the viewport and scissor rectangles in the guest driver and + // writing the common portion to PA_SC_WINDOW_SCISSOR, so how the scissor is + // computed for a fractional viewport is entirely up to the guest. + // + // Even though Xbox 360 games are designed for Direct3D, with 0...W range of + // Z in clip space, the GPU also allows -W...W. Since Xenia is not targeting + // OpenGL (where it would be toggled via glClipControl - or, on ES, it would + // always be -W...W), this function always remaps it to 0...W, though + // numerically not precisely (0 is moved to 0.5, locking the exponent near + // what was the truly floating-point 0 originally). It is the guest + // viewport's responsibility (haven't checked, but it's logical) to remap + // from -1...1 in the NDC to glDepthRange within the 0...1 range. Also -Z + // pointing forward in OpenGL doesn't matter here (the -W...W clip space is + // symmetric). + // + // - Clipping is disabled: + // + // The most common case of drawing without clipping in games is screen-space + // draws, most prominently clears, directly in render target coordinates. + // + // In this particular case (though all the general case arithmetic still + // applies), the vertex shader returns a position in pixels, pre-divided by + // W (though this doesn't matter if W is 1). + // + // Because clipping is disabled, this huge polygon with, for example, + // a (1280, 720, 0, 1) vertex, is not clipped to (-w, -w) ... (w, w), so the + // vertex becomes (1280, 720) in the NDC as well (even though in regular 3D + // draws with clipping, disregarding the guard band for simplicity, it can't + // be bigger than (1, 1) after clipping and the division by W). + // + // For these draws, the viewport is also usually disabled (though, again, it + // doesn't have to be - an enabled viewport would likely still work as + // usual) by disabling PA_CL_VTE_CNTL::VPORT_X/Y/Z_SCALE/OFFSET_ENA - which + // equals to having a viewport scale of (1, 1, 1) and offset of (0, 0, 0). + // This results in the NDC being treated directly as pixel coordinates. + // Normally, with clipping, this would make only a tiny 1x1 area in the + // corner of the render target being possible to cover (and 3 unreachable + // pixels outside of the render target). The window offset is then applied, + // if needed, as well as the half-pixel offset. + // + // It's also possible (though not verified) that without clipping, Z (as a + // result of, for instance, polygon offset, or explicit calculations in the + // vertex shader) may end up outside the viewport Z range. Direct3D 10 + // requires clamping to the viewport Z bounds in all cases in the + // output-merger according to the Direct3D 11.3 functional specification. A + // different behavior is likely on the Xbox 360, however, because while + // Direct3D 10-compatible AMD GPUs such as the R600 have + // PA_SC_VPORT_ZMIN/ZMAX registers, the Adreno 200 doesn't seem to have any + // equivalents, neither in PA nor in RB. This probably also applies to + // shader depth output - possibly doesn't need to be clamped as well. + // + // On the PC, we need to emulate disabled clipping by using a viewport at + // least as large as the scissor region within the render target, as well as + // the full viewport depth range (plus changing Z clipping to Z clamping on + // the host if possible), and rescale from the guest clip space to the host + // "no clip" clip space, as well as apply the viewport, the window offset, + // and the half-pixel offset, in the vertex shader. Ideally, the host + // viewport should have a power of 2 size - so scaling doesn't affect + // precision, and is merely an exponent bias. + // + // NDC XY point towards +XY on the render target - the viewport scale sign + // handles the remapping from Direct3D 9 -Y towards +U to a generic + // transformation from the NDC to pixel coordinates. + // + // TODO(Triang3l): Investigate the need for clamping of oDepth to 0...1 for + // D24FS8 as well. auto pa_cl_clip_cntl = regs.Get(); auto pa_cl_vte_cntl = regs.Get(); auto pa_su_sc_mode_cntl = regs.Get(); auto pa_su_vtx_cntl = regs.Get(); - float viewport_left, viewport_top; - float viewport_width, viewport_height; - float ndc_scale_x, ndc_scale_y; - float ndc_offset_x, ndc_offset_y; - // To avoid zero size viewports, which would harm division and aren't allowed - // on Vulkan. Nothing will ever be covered by a viewport of this size - this - // is 2 orders of magnitude smaller than a .8 subpixel, and thus shouldn't - // have any effect on rounding, n and n + 1 / 1024 would be rounded to the - // same .8 fixed-point value, thus in fixed-point, the viewport would have - // zero size. - const float size_min = 1.0f / 1024.0f; - - float viewport_offset_x = pa_cl_vte_cntl.vport_x_offset_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32 - : 0.0f; - float viewport_offset_y = pa_cl_vte_cntl.vport_y_offset_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32 - : 0.0f; + // Obtain the original viewport values in a normalized way. + float scale_xy[] = { + pa_cl_vte_cntl.vport_x_scale_ena ? regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32 + : 1.0f, + pa_cl_vte_cntl.vport_y_scale_ena ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32 + : 1.0f, + }; + float scale_z = pa_cl_vte_cntl.vport_z_scale_ena + ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 + : 1.0f; + float offset_base_xy[] = { + pa_cl_vte_cntl.vport_x_offset_ena + ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32 + : 0.0f, + pa_cl_vte_cntl.vport_y_offset_ena + ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32 + : 0.0f, + }; + float offset_z = pa_cl_vte_cntl.vport_z_offset_ena + ? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32 + : 0.0f; + // Calculate all the integer.0 or integer.5 offsetting exactly at full + // precision, separately so it can be used in other integer calculations + // without double rounding if needed. + float offset_add_xy[2] = {}; if (pa_su_sc_mode_cntl.vtx_window_offset_enable) { auto pa_sc_window_offset = regs.Get(); - viewport_offset_x += float(pa_sc_window_offset.window_x_offset); - viewport_offset_y += float(pa_sc_window_offset.window_y_offset); + offset_add_xy[0] += float(pa_sc_window_offset.window_x_offset); + offset_add_xy[1] += float(pa_sc_window_offset.window_y_offset); } - - if (pa_cl_vte_cntl.vport_x_scale_ena) { - float pa_cl_vport_xscale = regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32; - float viewport_scale_x_abs = - std::abs(pa_cl_vport_xscale) * resolution_scale; - viewport_left = viewport_offset_x * resolution_scale - viewport_scale_x_abs; - float viewport_right = viewport_left + viewport_scale_x_abs * 2.0f; - // Keep the viewport in the positive quarter-plane for simplicity of - // clamping to the maximum supported bounds. - float cutoff_left = std::fmax(-viewport_left, 0.0f); - float cutoff_right = std::fmax(viewport_right - x_max, 0.0f); - viewport_left = std::fmax(viewport_left, 0.0f); - viewport_right = std::fmin(viewport_right, x_max); - viewport_width = viewport_right - viewport_left; - if (viewport_width > size_min) { - ndc_scale_x = - (viewport_width + cutoff_left + cutoff_right) / viewport_width; - if (pa_cl_vport_xscale < 0.0f) { - ndc_scale_x = -ndc_scale_x; - } - ndc_offset_x = - ((cutoff_right - cutoff_left) * (0.5f * 2.0f)) / viewport_width; - } else { - // Empty viewport, but don't pass 0 because that's against the Vulkan - // specification. - viewport_left = 0.0f; - viewport_width = size_min; - ndc_scale_x = 0.0f; - ndc_offset_x = 0.0f; - } - } else { - // Drawing without a viewport and without clipping to one - use a viewport - // covering the entire potential guest render target or the positive part of - // the host viewport area, whichever is smaller, and apply the offset, if - // enabled, via the shader. - viewport_left = 0.0f; - viewport_width = std::min( - float(xenos::kTexture2DCubeMaxWidthHeight) * resolution_scale, x_max); - ndc_scale_x = (2.0f * resolution_scale) / viewport_width; - ndc_offset_x = viewport_offset_x * ndc_scale_x - 1.0f; - } - - if (pa_cl_vte_cntl.vport_y_scale_ena) { - float pa_cl_vport_yscale = regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32; - float viewport_scale_y_abs = - std::abs(pa_cl_vport_yscale) * resolution_scale; - viewport_top = viewport_offset_y * resolution_scale - viewport_scale_y_abs; - float viewport_bottom = viewport_top + viewport_scale_y_abs * 2.0f; - float cutoff_top = std::fmax(-viewport_top, 0.0f); - float cutoff_bottom = std::fmax(viewport_bottom - y_max, 0.0f); - viewport_top = std::fmax(viewport_top, 0.0f); - viewport_bottom = std::fmin(viewport_bottom, y_max); - viewport_height = viewport_bottom - viewport_top; - if (viewport_height > size_min) { - ndc_scale_y = - (viewport_height + cutoff_top + cutoff_bottom) / viewport_height; - if (pa_cl_vport_yscale < 0.0f) { - ndc_scale_y = -ndc_scale_y; - } - ndc_offset_y = - ((cutoff_bottom - cutoff_top) * (0.5f * 2.0f)) / viewport_height; - } else { - // Empty viewport, but don't pass 0 because that's against the Vulkan - // specification. - viewport_top = 0.0f; - viewport_height = size_min; - ndc_scale_y = 0.0f; - ndc_offset_y = 0.0f; - } - } else { - viewport_top = 0.0f; - viewport_height = std::min( - float(xenos::kTexture2DCubeMaxWidthHeight) * resolution_scale, y_max); - ndc_scale_y = (2.0f * resolution_scale) / viewport_height; - ndc_offset_y = viewport_offset_y * ndc_scale_y - 1.0f; - } - - // Apply the vertex half-pixel offset via the shader (it must not affect - // clipping, otherwise with resolution scale, samples in the left/top half - // will never be covered). if (cvars::half_pixel_offset && !pa_su_vtx_cntl.pix_center) { - float half_pixel_offset_ndc_scale = 0.5f * 2.0f * resolution_scale; - ndc_offset_x += half_pixel_offset_ndc_scale / viewport_width; - ndc_offset_y += half_pixel_offset_ndc_scale / viewport_height; + offset_add_xy[0] += 0.5f; + offset_add_xy[1] += 0.5f; } - if (origin_bottom_left) { - ndc_scale_y = -ndc_scale_y; - ndc_offset_y = -ndc_offset_y; + // The maximum value is at least the maximum host render target size anyway - + // and a guest pixel is always treated as a whole with resolution scaling. + uint32_t xy_max_unscaled[] = {x_max / resolution_scale, + y_max / resolution_scale}; + assert_not_zero(xy_max_unscaled[0]); + assert_not_zero(xy_max_unscaled[1]); + + float z_min; + float z_max; + float ndc_scale[3]; + float ndc_offset[3]; + + if (pa_cl_clip_cntl.clip_disable) { + // Clipping is disabled - use a huge host viewport, perform pixel and depth + // offsetting in the vertex shader. + + // XY. + for (uint32_t i = 0; i < 2; ++i) { + viewport_info_out.xy_offset[i] = 0; + uint32_t extent_axis_unscaled = + std::min(xenos::kTexture2DCubeMaxWidthHeight, xy_max_unscaled[i]); + viewport_info_out.xy_extent[i] = extent_axis_unscaled * resolution_scale; + float extent_axis_unscaled_float = float(extent_axis_unscaled); + float pixels_to_ndc_axis = 2.0f / extent_axis_unscaled_float; + ndc_scale[i] = scale_xy[i] * pixels_to_ndc_axis; + ndc_offset[i] = (offset_base_xy[i] - extent_axis_unscaled_float * 0.5f + + offset_add_xy[i]) * + pixels_to_ndc_axis; + } + + // Z. + z_min = 0.0f; + z_max = 1.0f; + ndc_scale[2] = scale_z; + ndc_offset[2] = offset_z; + } else { + // Clipping is enabled - perform pixel and depth offsetting via the host + // viewport. + + // XY. + for (uint32_t i = 0; i < 2; ++i) { + // With resolution scaling, do all viewport XY scissoring in guest pixels + // if fractional and for the half-pixel offset - we treat guest pixels as + // a whole, and also the half-pixel offset would be irreversible in guest + // vertices if we did flooring in host pixels. Instead of flooring, also + // doing truncation for simplicity - since maxing with 0 is done anyway + // (we only return viewports in the positive quarter-plane). + float offset_axis = offset_base_xy[i] + offset_add_xy[i]; + float scale_axis = scale_xy[i]; + float scale_axis_abs = std::abs(scale_xy[i]); + float axis_0 = offset_axis - scale_axis_abs; + float axis_1 = offset_axis + scale_axis_abs; + float axis_max_unscaled_float = float(xy_max_unscaled[i]); + // fmax to drop NaN and < 0, min as float (axis_max_unscaled_float is well + // below 2^24) to safely drop very large values. + uint32_t axis_0_int = + uint32_t(std::min(std::fmax(axis_0, 0.0f), axis_max_unscaled_float)); + uint32_t axis_1_int = + uint32_t(std::min(std::fmax(axis_1, 0.0f), axis_max_unscaled_float)); + uint32_t axis_extent_int = axis_1_int - axis_0_int; + viewport_info_out.xy_offset[i] = axis_0_int * resolution_scale; + viewport_info_out.xy_extent[i] = axis_extent_int * resolution_scale; + float ndc_scale_axis; + float ndc_offset_axis; + if (axis_extent_int) { + // Rescale from the old bounds to the new ones, and also apply the sign. + // If the new bounds are smaller than the old, for instance, we're + // cropping - the new -W...W clip space is a subregion of the old one - + // the scale should be > 1 so the area being cut off ends up outside + // -W...W. If the new region should include more than the original clip + // space, a region previously outside -W...W should end up within it, so + // the scale should be < 1. + float axis_extent_rounded = float(axis_extent_int); + ndc_scale_axis = scale_axis * 2.0f / axis_extent_rounded; + // Move the origin of the snapped coordinates back to the original one. + ndc_offset_axis = (float(offset_axis) - + (float(axis_0_int) + axis_extent_rounded * 0.5f)) * + 2.0f / axis_extent_rounded; + } else { + // Empty viewport (everything outside the viewport scissor). + ndc_scale_axis = 1.0f; + ndc_offset_axis = 0.0f; + } + ndc_scale[i] = ndc_scale_axis; + ndc_offset[i] = ndc_offset_axis; + } + + // Z. + float host_clip_offset_z; + float host_clip_scale_z; + if (pa_cl_clip_cntl.dx_clip_space_def) { + host_clip_offset_z = offset_z; + host_clip_scale_z = scale_z; + ndc_scale[2] = 1.0f; + ndc_offset[2] = 0.0f; + } else { + // Normalizing both Direct3D / Vulkan 0...W and OpenGL -W...W clip spaces + // to 0...W. We are not targeting OpenGL, but there we could accept the + // wanted clip space (Direct3D, OpenGL, or any) and return the actual one + // (Direct3D or OpenGL). + // + // If the guest wants to use -W...W clip space (-1...1 NDC) and a 0...1 + // depth range in the end, it's expected to use ZSCALE of 0.5 and ZOFFSET + // of 0.5. + // + // We are providing the near and the far (or offset and offset + scale) + // plane distances to the host API in a way that the near maps to Z = 0 + // and the far maps to Z = W in clip space (or Z = 1 in NDC). + // + // With D3D offset and scale that we want, assuming D3D clip space input, + // the formula for the depth would be: + // + // depth = offset_d3d + scale_d3d * ndc_z_d3d + // + // We are remapping the incoming OpenGL Z from -W...W to 0...W by scaling + // it by 0.5 and adding 0.5 * W to the result. So, our depth formula would + // be: + // + // depth = offset_d3d + scale_d3d * (ndc_z_gl * 0.5 + 0.5) + // + // The guest registers, however, contain the offset and the scale for + // remapping not from 0...W to near...far, but from -W...W to near...far, + // or: + // + // depth = offset_gl + scale_gl * ndc_z_gl + // + // Knowing offset_gl, scale_gl and how ndc_z_d3d can be obtained from + // ndc_z_gl, we need to derive the formulas for the needed offset_d3d and + // scale_d3d to apply them to the incoming ndc_z_d3d. + // + // depth = offset_gl + scale_gl * (ndc_z_d3d * 2 - 1) + // + // Expanding: + // + // depth = offset_gl + (scale_gl * ndc_z_d3d * 2 - scale_gl) + // + // Reordering: + // + // depth = (offset_gl - scale_gl) + (scale_gl * 2) * ndc_z_d3d + // offset_d3d = offset_gl - scale_gl + // scale_d3d = scale_gl * 2 + host_clip_offset_z = offset_z - scale_z; + host_clip_scale_z = scale_z * 2.0f; + // Need to remap -W...W clip space to 0...W via ndc_scale and ndc_offset - + // by scaling Z by 0.5 and adding 0.5 * W to it. + ndc_scale[2] = 0.5f; + ndc_offset[2] = 0.5f; + } + if (pixel_shader_writes_depth) { + // Allow the pixel shader to write any depth value since + // PA_SC_VPORT_ZMIN/ZMAX isn't present on the Adreno 200; guest pixel + // shaders don't have access to the original Z in the viewport space + // anyway and likely must write the depth on all execution paths. + z_min = 0.0f; + z_max = 1.0f; + } else { + // This clamping is not very correct, but just for safety. Direct3D + // doesn't allow an unrestricted depth range. Vulkan does, as an + // extension. But cases when this really matters are yet to be found - + // trying to fix this will result in more correct depth values, but + // incorrect clipping. + z_min = std::min(std::fmax(host_clip_offset_z, 0.0f), 1.0f); + z_max = std::min(std::fmax(host_clip_offset_z + host_clip_scale_z, 0.0f), + 1.0f); + // Direct3D 12 doesn't allow reverse depth range - on some drivers it + // works, on some drivers it doesn't, actually, but it was never + // explicitly allowed by the specification. + if (!allow_reverse_z && z_min > z_max) { + std::swap(z_min, z_max); + ndc_scale[2] = -ndc_scale[2]; + ndc_offset[2] = 1.0f - ndc_offset[2]; + } + } } - float viewport_scale_z = pa_cl_vte_cntl.vport_z_scale_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 - : 1.0f; - float viewport_offset_z = pa_cl_vte_cntl.vport_z_offset_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32 - : 0.0f; - // Vulkan requires the depth bounds to be in the 0 to 1 range without - // VK_EXT_depth_range_unrestricted (which isn't used on the Xbox 360). - float viewport_z_min = std::min(std::fmax(viewport_offset_z, 0.0f), 1.0f); - float viewport_z_max = - std::min(std::fmax(viewport_offset_z + viewport_scale_z, 0.0f), 1.0f); - // When VPORT_Z_SCALE_ENA is disabled, Z/W is directly what is expected to be - // written to the depth buffer, and for some reason DX_CLIP_SPACE_DEF isn't - // set in this case in draws in games. - bool gl_clip_space_def = - !pa_cl_clip_cntl.dx_clip_space_def && pa_cl_vte_cntl.vport_z_scale_ena; - float ndc_scale_z = gl_clip_space_def ? 0.5f : 1.0f; - float ndc_offset_z = gl_clip_space_def ? 0.5f : 0.0f; - if (viewport_z_min > viewport_z_max && !allow_reverse_z) { - std::swap(viewport_z_min, viewport_z_max); - ndc_scale_z = -ndc_scale_z; - ndc_offset_z = 1.0f - ndc_offset_z; - } if (GetDepthControlForCurrentEdramMode(regs).z_enable && regs.Get().depth_format == xenos::DepthRenderTargetFormat::kD24FS8) { @@ -352,34 +538,30 @@ void GetHostViewportInfo(const RegisterFile& regs, uint32_t resolution_scale, // Need to adjust the bounds that the resulting depth values will be // clamped to after the pixel shader. Preferring adding some error to // interpolated Z instead if conversion can't be done exactly, without - // modifying clipping bounds by adjusting Z in vertex shaders, as that may - // cause polygons placed explicitly at Z = 0 or Z = W to be clipped. - viewport_z_min = - xenos::Float20e4To32(xenos::Float32To20e4(viewport_z_min)); - viewport_z_max = - xenos::Float20e4To32(xenos::Float32To20e4(viewport_z_max)); + // modifying clipping bounds by adjusting Z in vertex shaders, as that + // may cause polygons placed explicitly at Z = 0 or Z = W to be clipped. + z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min)); + z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max)); } if (full_float24_in_0_to_1) { // Remap the full [0...2) float24 range to [0...1) support data round-trip // during render target ownership transfer of EDRAM tiles through depth // input without unrestricted depth range. - viewport_z_min *= 0.5f; - viewport_z_max *= 0.5f; + z_min *= 0.5f; + z_max *= 0.5f; } } + viewport_info_out.z_min = z_min; + viewport_info_out.z_max = z_max; - viewport_info_out.left = viewport_left; - viewport_info_out.top = viewport_top; - viewport_info_out.width = viewport_width; - viewport_info_out.height = viewport_height; - viewport_info_out.z_min = viewport_z_min; - viewport_info_out.z_max = viewport_z_max; - viewport_info_out.ndc_scale[0] = ndc_scale_x; - viewport_info_out.ndc_scale[1] = ndc_scale_y; - viewport_info_out.ndc_scale[2] = ndc_scale_z; - viewport_info_out.ndc_offset[0] = ndc_offset_x; - viewport_info_out.ndc_offset[1] = ndc_offset_y; - viewport_info_out.ndc_offset[2] = ndc_offset_z; + if (origin_bottom_left) { + ndc_scale[1] = -ndc_scale[1]; + ndc_offset[1] = -ndc_offset[1]; + } + for (uint32_t i = 0; i < 3; ++i) { + viewport_info_out.ndc_scale[i] = ndc_scale[i]; + viewport_info_out.ndc_offset[i] = ndc_offset[i]; + } } void GetScissor(const RegisterFile& regs, Scissor& scissor_out) { @@ -420,10 +602,10 @@ void GetScissor(const RegisterFile& regs, Scissor& scissor_out) { // console, but no evidence of such has ever been seen). br_x = std::max(br_x, tl_x); br_y = std::max(br_y, tl_y); - scissor_out.left = tl_x; - scissor_out.top = tl_y; - scissor_out.width = br_x - tl_x; - scissor_out.height = br_y - tl_y; + scissor_out.offset[0] = tl_x; + scissor_out.offset[1] = tl_y; + scissor_out.extent[0] = br_x - tl_x; + scissor_out.extent[1] = br_y - tl_y; } xenos::CopySampleSelect SanitizeCopySampleSelect( diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index f95dece17..193aabca5 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -77,15 +77,27 @@ bool IsPixelShaderNeededWithRasterization(const Shader& shader, const RegisterFile& regs); struct ViewportInfo { - // The returned viewport will always be in the positive quarter-plane for - // simplicity of clamping to the maximum size supported by the host, negative - // offset will be applied via ndc_offset. - float left; - float top; - float width; - float height; + // Offset from render target UV = 0 to +UV. + // For simplicity of cropping to the maximum size on the host; to match the + // Direct3D 12 clipping / scissoring behavior with a fractional viewport, to + // floor(TopLeftXY) ... floor(TopLeftXY + WidthHeight), on the real AMD, Intel + // and Nvidia hardware (not WARP); as well as to hide the differences between + // 0 and 8+ viewportSubPixelBits on Vulkan, and to prevent any numerical error + // in bound checking in host APIs, viewport bounds are returned as integers. + // Also they're returned as non-negative, also to make it easier to crop (so + // Vulkan maxViewportDimensions and viewportBoundsRange don't have to be + // handled separately - maxViewportDimensions is greater than or equal to the + // largest framebuffer image size, so it's safe, and viewportBoundsRange is + // always bigger than maxViewportDimensions. All fractional offsetting, + // including the half-pixel offset, and cropping are handled via ndc_scale and + // ndc_offset. + uint32_t xy_offset[2]; + // Extent can be zero for an empty viewport - host APIs not supporting empty + // viewports need to use an empty scissor rectangle. + uint32_t xy_extent[2]; float z_min; float z_max; + // The scale is applied before the offset (like using multiply-add). float ndc_scale[3]; float ndc_offset[3]; }; @@ -94,16 +106,17 @@ struct ViewportInfo { // host graphics APIs such as Direct3D 11+ and Vulkan, also forcing it to the // Direct3D clip space with 0...W Z rather than -W...W. void GetHostViewportInfo(const RegisterFile& regs, uint32_t resolution_scale, - bool origin_bottom_left, float x_max, float y_max, - bool allow_reverse_z, bool convert_z_to_float24, - bool full_float24_in_0_to_1, + bool origin_bottom_left, uint32_t x_max, + uint32_t y_max, bool allow_reverse_z, + bool convert_z_to_float24, bool full_float24_in_0_to_1, + bool pixel_shader_writes_depth, ViewportInfo& viewport_info_out); struct Scissor { - uint32_t left; - uint32_t top; - uint32_t width; - uint32_t height; + // Offset from render target UV = 0 to +UV. + uint32_t offset[2]; + // Extent can be zero. + uint32_t extent[2]; }; void GetScissor(const RegisterFile& regs, Scissor& scissor_out); diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index 154a64e86..5ea7e797e 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -1511,8 +1511,7 @@ void DxbcShaderTranslator::StoreResult(const InstructionResult& result, // 20e4-as-32 conversion and with 0...1 to 0...0.5 float24 remapping. // Though 20e4 float depth can store values between 1 and 2, it's a very // unusual case. Direct3D 10+ SV_Depth, however, can accept any values, - // including specials, when the depth buffer is floating-point; but depth - // is clamped to the viewport bounds anyway. + // including specials, when the depth buffer is floating-point. is_clamped = true; break; } diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index f513b2a22..3b749d654 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -771,11 +771,11 @@ class DxbcShaderTranslator : public ShaderTranslator { return !is_depth_only_pixel_shader_ && !current_shader().writes_depth() && !current_shader().is_valid_memexport_used(); } - // Converts the depth value to 24-bit (storing the result in bits 0:23 and - // zeros in 24:31, not creating room for stencil - since this may be involved - // in comparisons) according to the format specified in the system constants. - // Source and destination may be the same, temporary must be different than - // both. + // Converts the pre-clamped depth value to 24-bit (storing the result in bits + // 0:23 and zeros in 24:31, not creating room for stencil - since this may be + // involved in comparisons) according to the format specified in the system + // constants. Source and destination may be the same, temporary must be + // different than both. void ROV_DepthTo24Bit(uint32_t d24_temp, uint32_t d24_temp_component, uint32_t d32_temp, uint32_t d32_temp_component, uint32_t temp_temp, uint32_t temp_temp_component); diff --git a/src/xenia/gpu/dxbc_shader_translator_om.cc b/src/xenia/gpu/dxbc_shader_translator_om.cc index 38a6bf9a3..15984c8db 100644 --- a/src/xenia/gpu/dxbc_shader_translator_om.cc +++ b/src/xenia/gpu/dxbc_shader_translator_om.cc @@ -457,37 +457,10 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() { if (!i) { if (shader_writes_depth) { - // Clamp oDepth to the lower viewport depth bound (depth clamp happens - // after the pixel shader in the pipeline, at least on Direct3D 11 and - // Vulkan, thus applies to the shader's depth output too). - system_constants_used_ |= 1ull << kSysConst_EdramDepthRange_Index; - a_.OpMax(dxbc::Dest::R(system_temp_depth_stencil_, 0b0001), - dxbc::Src::R(system_temp_depth_stencil_, dxbc::Src::kXXXX), - dxbc::Src::CB(cbuffer_index_system_constants_, - uint32_t(CbufferRegister::kSystemConstants), - kSysConst_EdramDepthRange_Vec) - .Select(kSysConst_EdramDepthRangeOffset_Comp)); - // Calculate the upper Z range bound to temp.x for clamping after - // biasing. - // temp.x = viewport maximum depth - system_constants_used_ |= 1ull << kSysConst_EdramDepthRange_Index; - a_.OpAdd(temp_x_dest, - dxbc::Src::CB(cbuffer_index_system_constants_, - uint32_t(CbufferRegister::kSystemConstants), - kSysConst_EdramDepthRange_Vec) - .Select(kSysConst_EdramDepthRangeOffset_Comp), - dxbc::Src::CB(cbuffer_index_system_constants_, - uint32_t(CbufferRegister::kSystemConstants), - kSysConst_EdramDepthRange_Vec) - .Select(kSysConst_EdramDepthRangeScale_Comp)); - // Clamp oDepth to the upper viewport depth bound (already not above 1, - // but saturate for total safety). - // temp.x = free - a_.OpMin(dxbc::Dest::R(system_temp_depth_stencil_, 0b0001), - dxbc::Src::R(system_temp_depth_stencil_, dxbc::Src::kXXXX), - temp_x_src, true); // Convert the shader-generated depth to 24-bit, using temp.x as - // temporary. + // temporary. oDepth is already written by StoreResult with saturation, + // no need to clamp here. Adreno 200 doesn't have PA_SC_VPORT_ZMIN/ZMAX, + // so likely there's no need to clamp to the viewport depth bounds. ROV_DepthTo24Bit(system_temp_depth_stencil_, 0, system_temp_depth_stencil_, 0, temp, 0); } else { diff --git a/src/xenia/gpu/render_target_cache.cc b/src/xenia/gpu/render_target_cache.cc index 72ed63e57..70935c3c9 100644 --- a/src/xenia/gpu/render_target_cache.cc +++ b/src/xenia/gpu/render_target_cache.cc @@ -22,6 +22,7 @@ #include "xenia/base/logging.h" #include "xenia/base/math.h" #include "xenia/gpu/draw_util.h" +#include "xenia/gpu/gpu_flags.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/registers.h" #include "xenia/gpu/xenos.h" @@ -562,35 +563,32 @@ bool RenderTargetCache::Update(bool is_rasterization_done, GetRenderTargetHeight(pitch_tiles_at_32bpp, msaa_samples); int32_t window_y_offset = regs.Get().window_y_offset; - auto pa_cl_vte_cntl = regs.Get(); - if (pa_cl_vte_cntl.vport_y_scale_ena) { + if (!regs.Get().clip_disable) { + auto pa_cl_vte_cntl = regs.Get(); float viewport_bottom = 0.0f; - if (pa_cl_vte_cntl.vport_y_offset_ena) { - viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32; - } + // First calculate all the integer.0 or integer.5 offsetting exactly at full + // precision. if (regs.Get().vtx_window_offset_enable) { viewport_bottom += float(window_y_offset); } - viewport_bottom += std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32); - uint32_t viewport_bottom_fixed = uint32_t(std::max( - draw_util::FloatToD3D11Fixed16p8(viewport_bottom), int32_t(0))); - uint32_t viewport_bottom_pixels = viewport_bottom_fixed >> 8; - // Without MSAA, the center must be covered - according to the top-left - // rasterization rule, for the bottom, the test is exclusive. If the last - // row is included in the viewport only partially, check if its center is - // precisely potentially covered to round - to more safely catch, for - // example, if the game does something with the half-pixel offset through - // the viewport. - // With MSAA, it's less likely that the game will use the viewport to - // manipulate the half-pixel offset - different host implementations may - // also use different sample positions (up to the topmost row - possible to - // set such sample positions in PC APIs), so just check if the last row's - // area is at least slightly covered. - if ((viewport_bottom_fixed & uint32_t(0xFF)) > - uint32_t(msaa_samples != xenos::MsaaSamples::k1X ? 0 : 0x80)) { - ++viewport_bottom_pixels; + if (cvars::half_pixel_offset && + !regs.Get().pix_center) { + viewport_bottom += 0.5f; } - height_used = std::min(height_used, viewport_bottom_pixels); + // Then apply the floating-point viewport offset. + if (pa_cl_vte_cntl.vport_y_offset_ena) { + viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32; + } + viewport_bottom += pa_cl_vte_cntl.vport_y_scale_ena + ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32) + : 1.0f; + // Using floor, or, rather, truncation (because maxing with zero anyway) + // similar to how viewport scissoring behaves on real AMD, Intel and Nvidia + // GPUs on Direct3D 12, also like in draw_util::GetHostViewportInfo. + // fmax to drop NaN and < 0, min as float (height_used is well below 2^24) + // to safely drop very large values. + height_used = uint32_t( + std::min(std::fmax(viewport_bottom, 0.0f), float(height_used))); } uint32_t scissor_bottom = regs.Get().br_y; if (!regs.Get().window_offset_disable) {