From 10d7bcde930f9f395ee2bf6321449e0095408d5a Mon Sep 17 00:00:00 2001 From: Triang3l Date: Mon, 16 Nov 2020 23:03:42 +0300 Subject: [PATCH] [GPU] Viewport in draw_util with host API flexibility --- .../gpu/d3d12/d3d12_command_processor.cc | 254 +++++------------ src/xenia/gpu/d3d12/d3d12_command_processor.h | 9 +- .../shaders/dxbc/primitive_point_list_gs.cso | Bin 7692 -> 7712 bytes .../shaders/dxbc/primitive_point_list_gs.h | 256 +++++++++--------- .../shaders/dxbc/primitive_point_list_gs.txt | 114 ++++---- .../shaders/primitive_point_list.gs.hlsl | 19 +- src/xenia/gpu/draw_util.cc | 172 ++++++++++++ src/xenia/gpu/draw_util.h | 22 ++ src/xenia/gpu/dxbc_shader_translator.cc | 18 +- src/xenia/gpu/dxbc_shader_translator.h | 5 +- 10 files changed, 471 insertions(+), 398 deletions(-) diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 8db6f1626..ff9041fbd 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -1996,15 +1996,44 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, current_external_pipeline_ = nullptr; } + // Get dynamic rasterizer state. + // Supersampling replacing multisampling due to difficulties of emulating + // EDRAM with multisampling with RTV/DSV (with ROV, there's MSAA), and also + // resolution scale. + uint32_t pixel_size_x, pixel_size_y; + if (edram_rov_used_) { + pixel_size_x = 1; + pixel_size_y = 1; + } else { + xenos::MsaaSamples msaa_samples = + regs.Get().msaa_samples; + pixel_size_x = msaa_samples >= xenos::MsaaSamples::k4X ? 2 : 1; + pixel_size_y = msaa_samples >= xenos::MsaaSamples::k2X ? 2 : 1; + } + if (texture_cache_->IsResolutionScale2X()) { + pixel_size_x *= 2; + pixel_size_y *= 2; + } + draw_util::ViewportInfo viewport_info; + draw_util::GetHostViewportInfo(regs, float(pixel_size_x), float(pixel_size_y), + true, float(D3D12_VIEWPORT_BOUNDS_MAX), false, + viewport_info); + draw_util::Scissor scissor; + draw_util::GetScissor(regs, scissor); + scissor.left *= pixel_size_x; + scissor.top *= pixel_size_y; + scissor.width *= pixel_size_x; + scissor.height *= pixel_size_y; + // Update viewport, scissor, blend factor and stencil reference. - UpdateFixedFunctionState(primitive_two_faced); + UpdateFixedFunctionState(viewport_info, scissor, primitive_two_faced); // Update system constants before uploading them. UpdateSystemConstantValues( memexport_used, primitive_two_faced, line_loop_closing_index, indexed ? index_buffer_info->endianness : xenos::Endian::kNone, - used_texture_mask, early_z, GetCurrentColorMask(pixel_shader), - pipeline_render_targets); + viewport_info, pixel_size_x, pixel_size_y, used_texture_mask, early_z, + GetCurrentColorMask(pixel_shader), pipeline_render_targets); // Update constant buffers, descriptors and root parameters. if (!UpdateBindings(vertex_shader, pixel_shader, root_signature)) { @@ -2753,87 +2782,21 @@ void D3D12CommandProcessor::ClearCommandAllocatorCache() { command_allocator_writable_last_ = nullptr; } -void D3D12CommandProcessor::UpdateFixedFunctionState(bool primitive_two_faced) { +void D3D12CommandProcessor::UpdateFixedFunctionState( + const draw_util::ViewportInfo& viewport_info, + const draw_util::Scissor& scissor, bool primitive_two_faced) { #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES - const RegisterFile& regs = *register_file_; - - // Window parameters. - // http://ftp.tku.edu.tw/NetBSD/NetBSD-current/xsrc/external/mit/xf86-video-ati/dist/src/r600_reg_auto_r6xx.h - // See r200UpdateWindow: - // https://github.com/freedreno/mesa/blob/master/src/mesa/drivers/dri/r200/r200_state.c - auto pa_sc_window_offset = regs.Get(); - - // Supersampling replacing multisampling due to difficulties of emulating - // EDRAM with multisampling with RTV/DSV (with ROV, there's MSAA), and also - // resolution scale. - uint32_t pixel_size_x, pixel_size_y; - if (edram_rov_used_) { - pixel_size_x = 1; - pixel_size_y = 1; - } else { - xenos::MsaaSamples msaa_samples = - regs.Get().msaa_samples; - pixel_size_x = msaa_samples >= xenos::MsaaSamples::k4X ? 2 : 1; - pixel_size_y = msaa_samples >= xenos::MsaaSamples::k2X ? 2 : 1; - } - if (texture_cache_->IsResolutionScale2X()) { - pixel_size_x *= 2; - pixel_size_y *= 2; - } - // Viewport. - // PA_CL_VTE_CNTL contains whether offsets and scales are enabled. - // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf - // In games, either all are enabled (for regular drawing) or none are (for - // rectangle lists usually). - // - // If scale/offset is enabled, the Xenos shader is writing (neglecting W - // division) position in the NDC (-1, -1, dx_clip_space_def - 1) -> (1, 1, 1) - // box. If it's not, the position is in screen space. Since we can only use - // the NDC in PC APIs, we use a viewport of the largest possible size, and - // divide the position by it in translated shaders. - auto pa_cl_vte_cntl = regs.Get(); - float viewport_scale_x = - pa_cl_vte_cntl.vport_x_scale_ena - ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32) - : 4096.0f; - float viewport_scale_y = - pa_cl_vte_cntl.vport_y_scale_ena - ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32) - : 4096.0f; - float viewport_scale_z = pa_cl_vte_cntl.vport_z_scale_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 - : 1.0f; - float viewport_offset_x = pa_cl_vte_cntl.vport_x_offset_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32 - : std::abs(viewport_scale_x); - float viewport_offset_y = pa_cl_vte_cntl.vport_y_offset_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32 - : std::abs(viewport_scale_y); - float viewport_offset_z = pa_cl_vte_cntl.vport_z_offset_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32 - : 0.0f; - if (regs.Get().vtx_window_offset_enable) { - viewport_offset_x += float(pa_sc_window_offset.window_x_offset); - viewport_offset_y += float(pa_sc_window_offset.window_y_offset); - } D3D12_VIEWPORT viewport; - viewport.TopLeftX = - (viewport_offset_x - viewport_scale_x) * float(pixel_size_x); - viewport.TopLeftY = - (viewport_offset_y - viewport_scale_y) * float(pixel_size_y); - viewport.Width = viewport_scale_x * 2.0f * float(pixel_size_x); - viewport.Height = viewport_scale_y * 2.0f * float(pixel_size_y); - viewport.MinDepth = viewport_offset_z; - viewport.MaxDepth = viewport_offset_z + viewport_scale_z; - if (viewport_scale_z < 0.0f) { - // MinDepth > MaxDepth doesn't work on Nvidia, emulating it in vertex - // shaders and when applying polygon offset. - std::swap(viewport.MinDepth, viewport.MaxDepth); - } + viewport.TopLeftX = viewport_info.left; + viewport.TopLeftY = viewport_info.top; + viewport.Width = viewport_info.width; + viewport.Height = viewport_info.height; + viewport.MinDepth = viewport_info.z_min; + viewport.MaxDepth = viewport_info.z_max; ff_viewport_update_needed_ |= ff_viewport_.TopLeftX != viewport.TopLeftX; ff_viewport_update_needed_ |= ff_viewport_.TopLeftY != viewport.TopLeftY; ff_viewport_update_needed_ |= ff_viewport_.Width != viewport.Width; @@ -2847,13 +2810,11 @@ void D3D12CommandProcessor::UpdateFixedFunctionState(bool primitive_two_faced) { } // Scissor. - draw_util::Scissor scissor; - draw_util::GetScissor(regs, scissor); D3D12_RECT scissor_rect; - scissor_rect.left = LONG(scissor.left * pixel_size_x); - scissor_rect.top = LONG(scissor.top * pixel_size_y); - scissor_rect.right = LONG((scissor.left + scissor.width) * pixel_size_x); - scissor_rect.bottom = LONG((scissor.top + scissor.height) * pixel_size_y); + scissor_rect.left = LONG(scissor.left); + scissor_rect.top = LONG(scissor.top); + scissor_rect.right = LONG(scissor.left + scissor.width); + scissor_rect.bottom = LONG(scissor.top + scissor.height); ff_scissor_update_needed_ |= ff_scissor_.left != scissor_rect.left; ff_scissor_update_needed_ |= ff_scissor_.top != scissor_rect.top; ff_scissor_update_needed_ |= ff_scissor_.right != scissor_rect.right; @@ -2865,6 +2826,8 @@ void D3D12CommandProcessor::UpdateFixedFunctionState(bool primitive_two_faced) { } if (!edram_rov_used_) { + const RegisterFile& regs = *register_file_; + // Blend factor. ff_blend_factor_update_needed_ |= ff_blend_factor_[0] != regs[XE_GPU_REG_RB_BLEND_RED].f32; @@ -2908,7 +2871,9 @@ void D3D12CommandProcessor::UpdateFixedFunctionState(bool primitive_two_faced) { void D3D12CommandProcessor::UpdateSystemConstantValues( bool shared_memory_is_uav, bool primitive_two_faced, uint32_t line_loop_closing_index, xenos::Endian index_endian, - uint32_t used_texture_mask, bool early_z, uint32_t color_mask, + const draw_util::ViewportInfo& viewport_info, uint32_t pixel_size_x, + uint32_t pixel_size_y, uint32_t used_texture_mask, bool early_z, + uint32_t color_mask, const RenderTargetCache::PipelineRenderTarget render_targets[4]) { #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); @@ -2920,7 +2885,6 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( auto pa_su_point_minmax = regs.Get(); auto pa_su_point_size = regs.Get(); auto pa_su_sc_mode_cntl = regs.Get(); - auto pa_su_vtx_cntl = regs.Get(); float rb_alpha_ref = regs[XE_GPU_REG_RB_ALPHA_REF].f32; auto rb_colorcontrol = regs.Get(); auto rb_depth_info = regs.Get(); @@ -2986,11 +2950,6 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( } } - // Get viewport Z scale - needed for flags and ROV output. - float viewport_scale_z = pa_cl_vte_cntl.vport_z_scale_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 - : 1.0f; - bool dirty = false; // Flags. @@ -3023,10 +2982,6 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( flags |= (pa_cl_clip_cntl.value & 0b111111) << DxbcShaderTranslator::kSysFlag_UserClipPlane0_Shift; } - // Reversed depth. - if (viewport_scale_z < 0.0f) { - flags |= DxbcShaderTranslator::kSysFlag_ReverseZ; - } // Whether SV_IsFrontFace matters. if (primitive_two_faced) { flags |= DxbcShaderTranslator::kSysFlag_PrimitiveTwoFaced; @@ -3122,81 +3077,24 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( } // Conversion to Direct3D 12 normalized device coordinates. - // See viewport configuration in UpdateFixedFunctionState for explanations. - // X and Y scale/offset is to convert unnormalized coordinates generated by - // shaders (for rectangle list drawing, for instance) to the viewport of the - // largest possible render target size that is used to emulate unnormalized - // coordinates. Z scale/offset is to convert from OpenGL NDC to Direct3D NDC - // if needed. Also apply half-pixel offset to reproduce Direct3D 9 - // rasterization rules - must be done before clipping, not through the - // viewport, for SSAA and resolution scale to work correctly. - float viewport_scale_x = regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32; - float viewport_scale_y = regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32; // Kill all primitives if multipass or both faces are culled, but still need // to do memexport. if (sq_program_cntl.vs_export_mode == xenos::VertexShaderExportMode::kMultipass || (primitive_two_faced && pa_su_sc_mode_cntl.cull_front && pa_su_sc_mode_cntl.cull_back)) { - dirty |= !std::isnan(system_constants_.ndc_scale[0]); - dirty |= !std::isnan(system_constants_.ndc_scale[1]); - dirty |= !std::isnan(system_constants_.ndc_scale[2]); - dirty |= !std::isnan(system_constants_.ndc_offset[0]); - dirty |= !std::isnan(system_constants_.ndc_offset[1]); - dirty |= !std::isnan(system_constants_.ndc_offset[2]); float nan_value = std::nanf(""); - system_constants_.ndc_scale[0] = nan_value; - system_constants_.ndc_scale[1] = nan_value; - system_constants_.ndc_scale[2] = nan_value; - system_constants_.ndc_offset[0] = nan_value; - system_constants_.ndc_offset[1] = nan_value; - system_constants_.ndc_offset[2] = nan_value; - } else { - // When VPORT_Z_SCALE_ENA is disabled, Z/W is directly what is expected to - // be written to the depth buffer, and for some reason DX_CLIP_SPACE_DEF - // isn't set in this case in draws in games. - bool gl_clip_space_def = - !pa_cl_clip_cntl.dx_clip_space_def && pa_cl_vte_cntl.vport_z_scale_ena; - float ndc_scale_x = pa_cl_vte_cntl.vport_x_scale_ena - ? (viewport_scale_x >= 0.0f ? 1.0f : -1.0f) - : (1.0f / 4096.0f); - float ndc_scale_y = pa_cl_vte_cntl.vport_y_scale_ena - ? (viewport_scale_y >= 0.0f ? -1.0f : 1.0f) - : (-1.0f / 4096.0f); - float ndc_scale_z = gl_clip_space_def ? 0.5f : 1.0f; - float ndc_offset_x = pa_cl_vte_cntl.vport_x_offset_ena ? 0.0f : -1.0f; - float ndc_offset_y = pa_cl_vte_cntl.vport_y_offset_ena ? 0.0f : 1.0f; - float ndc_offset_z = gl_clip_space_def ? 0.5f : 0.0f; - if (cvars::half_pixel_offset && !pa_su_vtx_cntl.pix_center) { - // Signs are hopefully correct here, tested in GTA IV on both clearing - // (without a viewport) and drawing things near the edges of the screen. - if (pa_cl_vte_cntl.vport_x_scale_ena) { - if (viewport_scale_x != 0.0f) { - ndc_offset_x += 0.5f / viewport_scale_x; - } - } else { - ndc_offset_x += 1.0f / xenos::kTexture2DCubeMaxWidthHeight; - } - if (pa_cl_vte_cntl.vport_y_scale_ena) { - if (viewport_scale_y != 0.0f) { - ndc_offset_y += 0.5f / viewport_scale_y; - } - } else { - ndc_offset_y -= 1.0f / xenos::kTexture2DCubeMaxWidthHeight; - } + for (uint32_t i = 0; i < 3; ++i) { + dirty |= !std::isnan(system_constants_.ndc_scale[i]); + system_constants_.ndc_scale[i] = nan_value; + } + } else { + for (uint32_t i = 0; i < 3; ++i) { + dirty |= system_constants_.ndc_scale[i] != viewport_info.ndc_scale[i]; + dirty |= system_constants_.ndc_offset[i] != viewport_info.ndc_offset[i]; + system_constants_.ndc_scale[i] = viewport_info.ndc_scale[i]; + system_constants_.ndc_offset[i] = viewport_info.ndc_offset[i]; } - dirty |= system_constants_.ndc_scale[0] != ndc_scale_x; - dirty |= system_constants_.ndc_scale[1] != ndc_scale_y; - dirty |= system_constants_.ndc_scale[2] != ndc_scale_z; - dirty |= system_constants_.ndc_offset[0] != ndc_offset_x; - dirty |= system_constants_.ndc_offset[1] != ndc_offset_y; - dirty |= system_constants_.ndc_offset[2] != ndc_offset_z; - system_constants_.ndc_scale[0] = ndc_scale_x; - system_constants_.ndc_scale[1] = ndc_scale_y; - system_constants_.ndc_scale[2] = ndc_scale_z; - system_constants_.ndc_offset[0] = ndc_offset_x; - system_constants_.ndc_offset[1] = ndc_offset_y; - system_constants_.ndc_offset[2] = ndc_offset_z; } // Point size. @@ -3212,19 +3110,10 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( system_constants_.point_size[1] = point_size_y; system_constants_.point_size_min_max[0] = point_size_min; system_constants_.point_size_min_max[1] = point_size_max; - float point_screen_to_ndc_x, point_screen_to_ndc_y; - if (pa_cl_vte_cntl.vport_x_scale_ena) { - point_screen_to_ndc_x = - (viewport_scale_x != 0.0f) ? (0.5f / viewport_scale_x) : 0.0f; - } else { - point_screen_to_ndc_x = 1.0f / xenos::kTexture2DCubeMaxWidthHeight; - } - if (pa_cl_vte_cntl.vport_y_scale_ena) { - point_screen_to_ndc_y = - (viewport_scale_y != 0.0f) ? (-0.5f / viewport_scale_y) : 0.0f; - } else { - point_screen_to_ndc_y = -1.0f / xenos::kTexture2DCubeMaxWidthHeight; - } + float point_screen_to_ndc_x = + (0.5f * 2.0f * pixel_size_x) / viewport_info.width; + float point_screen_to_ndc_y = + (0.5f * 2.0f * pixel_size_y) / viewport_info.height; dirty |= system_constants_.point_screen_to_ndc[0] != point_screen_to_ndc_x; dirty |= system_constants_.point_screen_to_ndc[1] != point_screen_to_ndc_y; system_constants_.point_screen_to_ndc[0] = point_screen_to_ndc_x; @@ -3374,20 +3263,11 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( dirty |= system_constants_.edram_depth_base_dwords != depth_base_dwords; system_constants_.edram_depth_base_dwords = depth_base_dwords; - // The Z range is reversed in the vertex shader if it's reverse - use the - // absolute value of the scale. - float depth_range_scale = std::abs(viewport_scale_z); + float depth_range_scale = viewport_info.z_max - viewport_info.z_min; dirty |= system_constants_.edram_depth_range_scale != depth_range_scale; system_constants_.edram_depth_range_scale = depth_range_scale; - float depth_range_offset = pa_cl_vte_cntl.vport_z_offset_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32 - : 0.0f; - if (viewport_scale_z < 0.0f) { - // Similar to MinDepth in fixed-function viewport calculation. - depth_range_offset += viewport_scale_z; - } - dirty |= system_constants_.edram_depth_range_offset != depth_range_offset; - system_constants_.edram_depth_range_offset = depth_range_offset; + dirty |= system_constants_.edram_depth_range_offset != viewport_info.z_min; + system_constants_.edram_depth_range_offset = viewport_info.z_min; // For non-polygons, front polygon offset is used, and it's enabled if // POLY_OFFSET_PARA_ENABLED is set, for polygons, separate front and back diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index ceffe5fd0..982f9eac5 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -26,6 +26,7 @@ #include "xenia/gpu/d3d12/primitive_converter.h" #include "xenia/gpu/d3d12/render_target_cache.h" #include "xenia/gpu/d3d12/texture_cache.h" +#include "xenia/gpu/draw_util.h" #include "xenia/gpu/dxbc_shader_translator.h" #include "xenia/gpu/xenos.h" #include "xenia/kernel/kernel_state.h" @@ -345,11 +346,15 @@ class D3D12CommandProcessor : public CommandProcessor { D3D12_CPU_DESCRIPTOR_HANDLE& cpu_handle_out, D3D12_GPU_DESCRIPTOR_HANDLE& gpu_handle_out); - void UpdateFixedFunctionState(bool primitive_two_faced); + void UpdateFixedFunctionState(const draw_util::ViewportInfo& viewport_info, + const draw_util::Scissor& scissor, + bool primitive_two_faced); void UpdateSystemConstantValues( bool shared_memory_is_uav, bool primitive_two_faced, uint32_t line_loop_closing_index, xenos::Endian index_endian, - uint32_t used_texture_mask, bool early_z, uint32_t color_mask, + const draw_util::ViewportInfo& viewport_info, uint32_t pixel_size_x, + uint32_t pixel_size_y, uint32_t used_texture_mask, bool early_z, + uint32_t color_mask, const RenderTargetCache::PipelineRenderTarget render_targets[4]); bool UpdateBindings(const D3D12Shader* vertex_shader, const D3D12Shader* pixel_shader, diff --git a/src/xenia/gpu/d3d12/shaders/dxbc/primitive_point_list_gs.cso b/src/xenia/gpu/d3d12/shaders/dxbc/primitive_point_list_gs.cso index b871af09c7e2d2e073cddb2798377c9642239b47..13cca4e6c777e64e1027f9828128f8b9a205ce69 100644 GIT binary patch delta 260 zcmeCNSzu%665-@5)*>MDa&6hqNuRE!)^x=mWn^GrP>^F_UwYIy(Sk(urM-Bz9lW}9Z#ya_&RE(kms}s~PEvVrP3=$khP_Z@!fz9_N7Bfy(lkl2+Mk;1< Zf~*8<8PGQ7$+fcXlc&fEOuiy(0RYmUIm`e6 delta 256 zcmZ2r(_>@k65-^WU-e)?(}%ut?Iah5#U}0!j0_A6JaP;StU%fXh%az4Flg{HFs$Kc zU?|vVSkKS72gnO#U}DJMypzA1k&$V$tKfPjGYf_au1^dC3?Th(ECN81fq}^ZNH8?) z2V#3D2C^$S;Oh7`>xryjoXjWT#_9&rJ6TaI9B6R@$Dz$zB{ninz9i$u`iu)`7)U~x zfkA@P2&fDMfXW*j85lNeOD_g06ZM-cBO9@~LTnP_= 1.0f); + assert_true(pixel_size_y >= 1.0f); + assert_true(xy_max >= 1.0f); + + // PA_CL_VTE_CNTL contains whether offsets and scales are enabled. + // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf + // In games, either all are enabled (for regular drawing) or none are (for + // rectangle lists usually). + // + // If scale/offset is enabled, the Xenos shader is writing (neglecting W + // division) position in the NDC (-1, -1, dx_clip_space_def - 1) -> (1, 1, 1) + // box. If it's not, the position is in screen space. Since we can only use + // the NDC in PC APIs, we use a viewport of the largest possible size, and + // divide the position by it in translated shaders. + + auto pa_cl_clip_cntl = regs.Get(); + auto pa_cl_vte_cntl = regs.Get(); + auto pa_su_sc_mode_cntl = regs.Get(); + auto pa_su_vtx_cntl = regs.Get(); + + float viewport_left, viewport_top; + float viewport_width, viewport_height; + float ndc_scale_x, ndc_scale_y; + float ndc_offset_x, ndc_offset_y; + // To avoid zero size viewports, which would harm division and aren't allowed + // on Vulkan. Nothing will ever be covered by a viewport of this size - this + // is 2 orders of magnitude smaller than a .8 subpixel, and thus shouldn't + // have any effect on rounding, n and n + 1 / 1024 would be rounded to the + // same .8 fixed-point value, thus in fixed-point, the viewport would have + // zero size. + const float size_min = 1.0f / 1024.0f; + + float viewport_offset_x = pa_cl_vte_cntl.vport_x_offset_ena + ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32 + : 0.0f; + float viewport_offset_y = pa_cl_vte_cntl.vport_y_offset_ena + ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32 + : 0.0f; + if (pa_su_sc_mode_cntl.vtx_window_offset_enable) { + auto pa_sc_window_offset = regs.Get(); + viewport_offset_x += float(pa_sc_window_offset.window_x_offset); + viewport_offset_y += float(pa_sc_window_offset.window_y_offset); + } + + if (pa_cl_vte_cntl.vport_x_scale_ena) { + float pa_cl_vport_xscale = regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32; + float viewport_scale_x_abs = std::abs(pa_cl_vport_xscale) * pixel_size_x; + viewport_left = viewport_offset_x * pixel_size_x - viewport_scale_x_abs; + float viewport_right = viewport_left + viewport_scale_x_abs * 2.0f; + // Keep the viewport in the positive quarter-plane for simplicity of + // clamping to the maximum supported bounds. + float cutoff_left = std::fmax(-viewport_left, 0.0f); + float cutoff_right = std::fmax(viewport_right - xy_max, 0.0f); + viewport_left = std::fmax(viewport_left, 0.0f); + viewport_right = std::fmin(viewport_right, xy_max); + viewport_width = viewport_right - viewport_left; + if (viewport_width > size_min) { + ndc_scale_x = + (viewport_width + cutoff_left + cutoff_right) / viewport_width; + if (pa_cl_vport_xscale < 0.0f) { + ndc_scale_x = -ndc_scale_x; + } + ndc_offset_x = + ((cutoff_right - cutoff_left) * (0.5f * 2.0f)) / viewport_width; + } else { + // Empty viewport, but don't pass 0 because that's against the Vulkan + // specification. + viewport_left = 0.0f; + viewport_width = size_min; + ndc_scale_x = 0.0f; + ndc_offset_x = 0.0f; + } + } else { + // Drawing without a viewport and without clipping to one - use a viewport + // covering the entire potential guest render target or the positive part of + // the host viewport area, whichever is smaller, and apply the offset, if + // enabled, via the shader. + viewport_left = 0.0f; + viewport_width = std::min( + float(xenos::kTexture2DCubeMaxWidthHeight) * pixel_size_x, xy_max); + ndc_scale_x = (2.0f * pixel_size_x) / viewport_width; + ndc_offset_x = viewport_offset_x * ndc_scale_x - 1.0f; + } + + if (pa_cl_vte_cntl.vport_y_scale_ena) { + float pa_cl_vport_yscale = regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32; + float viewport_scale_y_abs = std::abs(pa_cl_vport_yscale) * pixel_size_y; + viewport_top = viewport_offset_y * pixel_size_y - viewport_scale_y_abs; + float viewport_bottom = viewport_top + viewport_scale_y_abs * 2.0f; + float cutoff_top = std::fmax(-viewport_top, 0.0f); + float cutoff_bottom = std::fmax(viewport_bottom - xy_max, 0.0f); + viewport_top = std::fmax(viewport_top, 0.0f); + viewport_bottom = std::fmin(viewport_bottom, xy_max); + viewport_height = viewport_bottom - viewport_top; + if (viewport_height > size_min) { + ndc_scale_y = + (viewport_height + cutoff_top + cutoff_bottom) / viewport_height; + if (pa_cl_vport_yscale < 0.0f) { + ndc_scale_y = -ndc_scale_y; + } + ndc_offset_y = + ((cutoff_bottom - cutoff_top) * (0.5f * 2.0f)) / viewport_height; + } else { + // Empty viewport, but don't pass 0 because that's against the Vulkan + // specification. + viewport_top = 0.0f; + viewport_height = size_min; + ndc_scale_y = 0.0f; + ndc_offset_y = 0.0f; + } + } else { + viewport_height = std::min( + float(xenos::kTexture2DCubeMaxWidthHeight) * pixel_size_y, xy_max); + ndc_scale_y = (2.0f * pixel_size_y) / viewport_height; + ndc_offset_y = viewport_offset_y * ndc_scale_y - 1.0f; + } + + // Apply the vertex half-pixel offset via the shader (it must not affect + // clipping, otherwise with SSAA or resolution scale, samples in the left/top + // half will never be covered). + if (cvars::half_pixel_offset && !pa_su_vtx_cntl.pix_center) { + ndc_offset_x += (0.5f * 2.0f * pixel_size_x) / viewport_width; + ndc_offset_y += (0.5f * 2.0f * pixel_size_y) / viewport_height; + } + + if (origin_bottom_left) { + ndc_scale_y = -ndc_scale_y; + ndc_offset_y = -ndc_offset_y; + } + + float viewport_scale_z = pa_cl_vte_cntl.vport_z_scale_ena + ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 + : 1.0f; + float viewport_offset_z = pa_cl_vte_cntl.vport_z_offset_ena + ? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32 + : 0.0f; + // Vulkan requires the depth bounds to be in the 0 to 1 range without + // VK_EXT_depth_range_unrestricted (which isn't used on the Xbox 360). + float viewport_z_min = std::min(std::fmax(viewport_offset_z, 0.0f), 1.0f); + float viewport_z_max = + std::min(std::fmax(viewport_offset_z + viewport_scale_z, 0.0f), 1.0f); + // When VPORT_Z_SCALE_ENA is disabled, Z/W is directly what is expected to be + // written to the depth buffer, and for some reason DX_CLIP_SPACE_DEF isn't + // set in this case in draws in games. + bool gl_clip_space_def = + !pa_cl_clip_cntl.dx_clip_space_def && pa_cl_vte_cntl.vport_z_scale_ena; + float ndc_scale_z = gl_clip_space_def ? 0.5f : 1.0f; + float ndc_offset_z = gl_clip_space_def ? 0.5f : 0.0f; + if (viewport_z_min > viewport_z_max && !allow_reverse_z) { + std::swap(viewport_z_min, viewport_z_max); + ndc_scale_z = -ndc_scale_z; + ndc_offset_z = 1.0f - ndc_offset_z; + } + + viewport_info_out.left = viewport_left; + viewport_info_out.top = viewport_top; + viewport_info_out.width = viewport_width; + viewport_info_out.height = viewport_height; + viewport_info_out.z_min = viewport_z_min; + viewport_info_out.z_max = viewport_z_max; + viewport_info_out.ndc_scale[0] = ndc_scale_x; + viewport_info_out.ndc_scale[1] = ndc_scale_y; + viewport_info_out.ndc_scale[2] = ndc_scale_z; + viewport_info_out.ndc_offset[0] = ndc_offset_x; + viewport_info_out.ndc_offset[1] = ndc_offset_y; + viewport_info_out.ndc_offset[2] = ndc_offset_z; +} + void GetScissor(const RegisterFile& regs, Scissor& scissor_out) { // FIXME(Triang3l): Screen scissor isn't applied here, but it seems to be // unused on Xbox 360 Direct3D 9. diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index 7ef3186a0..2cee26de7 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -33,6 +33,28 @@ namespace draw_util { // for use with the top-left rasterization rule later. int32_t FloatToD3D11Fixed16p8(float f32); +struct ViewportInfo { + // The returned viewport will always be in the positive quarter-plane for + // simplicity of clamping to the maximum size supported by the host, negative + // offset will be applied via ndc_offset. + float left; + float top; + float width; + float height; + float z_min; + float z_max; + float ndc_scale[3]; + float ndc_offset[3]; +}; +// Converts the guest viewport (or fakes one if drawing without a viewport) to +// a viewport, plus values to multiply-add the returned position by, usable on +// host graphics APIs such as Direct3D 11+ and Vulkan, also forcing it to the +// Direct3D clip space with 0...W Z rather than -W...W. +void GetHostViewportInfo(const RegisterFile& regs, float pixel_size_x, + float pixel_size_y, bool origin_bottom_left, + float xy_max, bool allow_reverse_z, + ViewportInfo& viewport_info_out); + struct Scissor { uint32_t left; uint32_t top; diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index 3f9140158..56278157d 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -1044,10 +1044,9 @@ void DxbcShaderTranslator::CompleteVertexOrDomainShader() { DxbcOpEndIf(); } - // Apply scale for drawing without a viewport, and also remap from OpenGL - // Z clip space to Direct3D if needed. Also, if the vertex shader is - // multipass, the NDC scale constant can be used to set position to NaN to - // kill all primitives. + // Apply scale for guest to host viewport and clip space conversion. Also, if + // the vertex shader is multipass, the NDC scale constant can be used to set + // position to NaN to kill all primitives. system_constants_used_ |= 1ull << kSysConst_NDCScale_Index; DxbcOpMul(DxbcDest::R(system_temp_position_, 0b0111), DxbcSrc::R(system_temp_position_), @@ -1056,16 +1055,7 @@ void DxbcShaderTranslator::CompleteVertexOrDomainShader() { kSysConst_NDCScale_Vec, kSysConst_NDCScale_Comp * 0b010101 + 0b100100)); - // Reverse Z (Z = W - Z) if the viewport depth is inverted. - DxbcOpAnd(temp_x_dest, flags_src, DxbcSrc::LU(kSysFlag_ReverseZ)); - DxbcOpIf(true, temp_x_src); - DxbcOpAdd(DxbcDest::R(system_temp_position_, 0b0100), - DxbcSrc::R(system_temp_position_, DxbcSrc::kWWWW), - -DxbcSrc::R(system_temp_position_, DxbcSrc::kZZZZ)); - DxbcOpEndIf(); - - // Apply offset (multiplied by W) for drawing without a viewport and for half - // pixel offset. + // Apply offset (multiplied by W) used for the same purposes. system_constants_used_ |= 1ull << kSysConst_NDCOffset_Index; DxbcOpMAd(DxbcDest::R(system_temp_position_, 0b0111), DxbcSrc::CB(cbuffer_index_system_constants_, diff --git a/src/xenia/gpu/dxbc_shader_translator.h b/src/xenia/gpu/dxbc_shader_translator.h index c45cfc4d9..997be5fe7 100644 --- a/src/xenia/gpu/dxbc_shader_translator.h +++ b/src/xenia/gpu/dxbc_shader_translator.h @@ -123,7 +123,6 @@ class DxbcShaderTranslator : public ShaderTranslator { kSysFlag_UserClipPlane3_Shift, kSysFlag_UserClipPlane4_Shift, kSysFlag_UserClipPlane5_Shift, - kSysFlag_ReverseZ_Shift, kSysFlag_KillIfAnyVertexKilled_Shift, kSysFlag_PrimitiveTwoFaced_Shift, kSysFlag_AlphaPassIfLess_Shift, @@ -165,7 +164,6 @@ class DxbcShaderTranslator : public ShaderTranslator { kSysFlag_UserClipPlane3 = 1u << kSysFlag_UserClipPlane3_Shift, kSysFlag_UserClipPlane4 = 1u << kSysFlag_UserClipPlane4_Shift, kSysFlag_UserClipPlane5 = 1u << kSysFlag_UserClipPlane5_Shift, - kSysFlag_ReverseZ = 1u << kSysFlag_ReverseZ_Shift, kSysFlag_KillIfAnyVertexKilled = 1u << kSysFlag_KillIfAnyVertexKilled_Shift, kSysFlag_PrimitiveTwoFaced = 1u << kSysFlag_PrimitiveTwoFaced_Shift, kSysFlag_AlphaPassIfLess = 1u << kSysFlag_AlphaPassIfLess_Shift, @@ -220,8 +218,7 @@ class DxbcShaderTranslator : public ShaderTranslator { float point_size[2]; float point_size_min_max[2]; - // Inverse scale of the host viewport (but not supersampled), with signs - // pre-applied. + // Screen point size * 2 (but not supersampled) -> size in NDC. float point_screen_to_ndc[2]; float user_clip_planes[6][4];