diff --git a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc index 9da117bf4..34c3654ed 100644 --- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc +++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc @@ -2918,73 +2918,29 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) { uint32_t draw_resolution_scale_x = this->draw_resolution_scale_x(); uint32_t draw_resolution_scale_y = this->draw_resolution_scale_y(); - uint32_t tile_width_samples_scaled = + uint32_t tile_width_samples = xenos::kEdramTileWidthSamples * draw_resolution_scale_x; - uint32_t tile_height_samples_scaled = + uint32_t tile_height_samples = xenos::kEdramTileHeightSamples * draw_resolution_scale_y; - // Split the destination pixel index into 32bpp tile in r0.z and + // Split the destination pixel index into 32bpp tile in r0.zw and // 32bpp-tile-relative pixel index in r0.xy. // r0.xy = pixel XY as uint a.OpFToU(dxbc::Dest::R(0, 0b0011), dxbc::Src::V1D(kInputRegisterPosition)); - uint32_t dest_sample_width_log2 = - uint32_t(dest_is_64bpp) + - uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k4X); - uint32_t dest_sample_height_log2 = + uint32_t dest_tile_width_pixels = + tile_width_samples >> + (uint32_t(dest_is_64bpp) + + uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k4X)); + uint32_t dest_tile_height_pixels = + tile_height_samples >> uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k2X); - uint32_t dest_tile_width_divide_scale, dest_tile_width_divide_upper_shift; - draw_util::GetEdramTileWidthDivideScaleAndUpperShift( - draw_resolution_scale_x, dest_tile_width_divide_scale, - dest_tile_width_divide_upper_shift); - assert_true(dest_tile_width_divide_upper_shift >= dest_sample_width_log2); - // Need the host tile size in pixels, not samples. 
- dest_tile_width_divide_upper_shift -= dest_sample_width_log2; - static_assert( - TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3, - "D3D12RenderTargetCache EDRAM range ownership transfer shader generation " - "supports Y draw resolution scaling factors of only up to 3"); - if (draw_resolution_scale_y == 3) { - // r0.zw = upper 32 bits in the division process of pixel XY by pixel count - // in a 32bpp tile - a.OpUMul(dxbc::Dest::R(0, 0b1100), dxbc::Dest::Null(), - dxbc::Src::R(0, 0b0100 << 4), - dxbc::Src::LU(0, 0, dest_tile_width_divide_scale, - draw_util::kDivideScale3)); - // r0.zw = 32bpp tile XY index - a.OpUShR(dxbc::Dest::R(0, 0b1100), dxbc::Src::R(0), - dxbc::Src::LU( - 0, 0, dest_tile_width_divide_upper_shift, - draw_util::kDivideUpperShift3 + 4 - dest_sample_height_log2)); - // r0.xy = destination pixel XY index within the 32bpp tile - a.OpIMAd( - dxbc::Dest::R(0, 0b0011), dxbc::Src::R(0, 0b1110), - dxbc::Src::LI( - -int32_t((80 * draw_resolution_scale_x) >> dest_sample_width_log2), - -int32_t((16 * draw_resolution_scale_y) >> dest_sample_height_log2), - 0, 0), - dxbc::Src::R(0, 0b0100)); - } else { - assert_true(draw_resolution_scale_y <= 2); - uint32_t dest_tile_height_pixels_log2 = - (draw_resolution_scale_y == 2 ? 
5 : 4) - dest_sample_height_log2; - // r0.z = upper 32 bits in the division process of pixel X by pixel count in - // a 32bpp tile - a.OpUMul(dxbc::Dest::R(0, 0b0100), dxbc::Dest::Null(), - dxbc::Src::R(0, dxbc::Src::kXXXX), - dxbc::Src::LU(dest_tile_width_divide_scale)); - // r0.zw = 32bpp tile XY index - a.OpUShR(dxbc::Dest::R(0, 0b1100), dxbc::Src::R(0, 0b0110 << 4), - dxbc::Src::LU(0, 0, dest_tile_width_divide_upper_shift, - dest_tile_height_pixels_log2)); - // r0.x = destination pixel X index within the 32bpp tile - a.OpIMAd(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(0, dxbc::Src::kZZZZ), - dxbc::Src::LI(-int32_t((80 * draw_resolution_scale_x) >> - dest_sample_width_log2)), - dxbc::Src::R(0, dxbc::Src::kXXXX)); - // r0.y = destination pixel Y index within the 32bpp tile - a.OpAnd(dxbc::Dest::R(0, 0b0010), dxbc::Src::R(0, dxbc::Src::kYYYY), - dxbc::Src::LU((1 << dest_tile_height_pixels_log2) - 1)); - } + // r0.xy = destination pixel XY index within the 32bpp tile + // r0.zw = 32bpp tile XY index + a.OpUDiv(dxbc::Dest::R(0, 0b1100), dxbc::Dest::R(0, 0b0011), + dxbc::Src::R(0, 0b01000100), + dxbc::Src::LU(dest_tile_width_pixels, dest_tile_height_pixels, + dest_tile_width_pixels, dest_tile_height_pixels)); + // r1.x = destination pitch in 32bpp tiles a.OpUBFE(dxbc::Dest::R(1, 0b0001), dxbc::Src::LU(xenos::kEdramPitchTilesBits), dxbc::Src::LU(0), @@ -3305,7 +3261,7 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) { // Copying between color and depth / stencil - swap 40-32bpp-sample columns // in the pixel index within the source 32bpp tile using r1.w as temporary. 
uint32_t source_32bpp_tile_half_pixels = - tile_width_samples_scaled >> (1 + source_pixel_width_dwords_log2); + tile_width_samples >> (1 + source_pixel_width_dwords_log2); a.OpULT(dxbc::Dest::R(1, 0b1000), dxbc::Src::R(source_tile_pixel_x_reg, dxbc::Src::kXXXX), dxbc::Src::LU(source_32bpp_tile_half_pixels)); @@ -3348,18 +3304,17 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) { // r1.x = pixel X within the source texture // r2.x = free a.OpUMAd(dxbc::Dest::R(1, 0b0001), - dxbc::Src::LU(tile_width_samples_scaled >> - source_pixel_width_dwords_log2), + dxbc::Src::LU(tile_width_samples >> source_pixel_width_dwords_log2), dxbc::Src::R(2, dxbc::Src::kXXXX), dxbc::Src::R(source_tile_pixel_x_reg, dxbc::Src::kXXXX)); // r1.y = pixel Y within the source texture // r1.w = free - a.OpUMAd(dxbc::Dest::R(1, 0b0010), - dxbc::Src::LU( - tile_height_samples_scaled >> - uint32_t(key.source_msaa_samples >= xenos::MsaaSamples::k2X)), - dxbc::Src::R(1, dxbc::Src::kWWWW), - dxbc::Src::R(source_tile_pixel_y_reg, dxbc::Src::kYYYY)); + a.OpUMAd( + dxbc::Dest::R(1, 0b0010), + dxbc::Src::LU(tile_height_samples >> uint32_t(key.source_msaa_samples >= + xenos::MsaaSamples::k2X)), + dxbc::Src::R(1, dxbc::Src::kWWWW), + dxbc::Src::R(source_tile_pixel_y_reg, dxbc::Src::kYYYY)); // Load the source to r1, or, for 32bpp | 32bpp -> 64bpp, the first dword to // r0 since addressing will not be needed anymore for color, and the second @@ -3575,9 +3530,9 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) { for (uint32_t i = 0; i < 2; ++i) { switch (source_depth_format) { case xenos::DepthRenderTargetFormat::kD24S8: { - // Round to the nearest even integer. This seems to be the correct, - // adding +0.5 and rounding towards zero results in red instead of - // black in the 4D5307E6 clear shader. + // Round to the nearest even integer. 
This seems to be the correct + // conversion, adding +0.5 and rounding towards zero results in red + // instead of black in the 4D5307E6 clear shader. a.OpMul(dxbc::Dest::R(i, 0b1000), dxbc::Src::R(i, dxbc::Src::kWWWW), dxbc::Src::LF(float(0xFFFFFF))); a.OpRoundNE(dxbc::Dest::R(i, 0b1000), @@ -3762,9 +3717,9 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) { depth_loaded_in_guest_format = true; switch (source_depth_format) { case xenos::DepthRenderTargetFormat::kD24S8: { - // Round to the nearest even integer. This seems to be the correct, - // adding +0.5 and rounding towards zero results in red instead of - // black in the 4D5307E6 clear shader. + // Round to the nearest even integer. This seems to be the correct + // conversion, adding +0.5 and rounding towards zero results in red + // instead of black in the 4D5307E6 clear shader. a.OpMul(dxbc::Dest::R(1, 0b1000), dxbc::Src::R(1, dxbc::Src::kWWWW), dxbc::Src::LF(float(0xFFFFFF))); a.OpRoundNE(dxbc::Dest::R(1, 0b1000), @@ -3920,12 +3875,11 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) { // Combine the tile sample index and the tile index into buffer // address to r0.x. a.OpUMAd(dxbc::Dest::R(0, 0b0001), - dxbc::Src::LU(tile_width_samples_scaled), + dxbc::Src::LU(tile_width_samples), dxbc::Src::R(0, dxbc::Src::kYYYY), dxbc::Src::R(0, dxbc::Src::kXXXX)); a.OpUMAd(dxbc::Dest::R(0, 0b0001), - dxbc::Src::LU(tile_width_samples_scaled * - tile_height_samples_scaled), + dxbc::Src::LU(tile_width_samples * tile_height_samples), dxbc::Src::R(0, dxbc::Src::kZZZZ), dxbc::Src::R(0, dxbc::Src::kXXXX)); // Load from the buffer. 
@@ -4102,7 +4056,7 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) { // r1.x = free a.OpUMAd( dxbc::Dest::R(0, 0b0001), - dxbc::Src::LU(tile_width_samples_scaled >> + dxbc::Src::LU(tile_width_samples >> uint32_t(key.host_depth_source_msaa_samples >= xenos::MsaaSamples::k4X)), dxbc::Src::R(1, dxbc::Src::kXXXX), @@ -4111,7 +4065,7 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) { // r0.z = free a.OpUMAd( dxbc::Dest::R(0, 0b0010), - dxbc::Src::LU(tile_height_samples_scaled >> + dxbc::Src::LU(tile_height_samples >> uint32_t(key.host_depth_source_msaa_samples >= xenos::MsaaSamples::k2X)), dxbc::Src::R(0, dxbc::Src::kZZZZ), @@ -5933,97 +5887,42 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline( // 32bpp is unknown, treating 64bpp tiles as storing 40x16 samples rather than // 80x16 for simplicity of addressing into the texture. - // Get the parts of the address along Y - tile row index within the dispatch - // to r0.w, sample Y within the tile to r0.y. - static_assert( - TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3, - "D3D12RenderTargetCache render target dump shader generation supports Y " - "draw resolution scaling factors of only up to 3"); - if (draw_resolution_scale_y == 3) { - // Multiplication part of the division by the (16 * scale) tile height, - // specifically 48 here, or 16 * 3. - // r0.w = (Y * kDivideScale3) >> 32 - a.OpUMul(dxbc::Dest::R(0, 0b1000), dxbc::Dest::Null(), - dxbc::Src::VThreadID(dxbc::Src::kYYYY), - dxbc::Src::LU(draw_util::kDivideScale3)); - // Shift part of the division by 16 * scale. - // r0.w = Y tile position - a.OpUShR(dxbc::Dest::R(0, 0b1000), dxbc::Src::R(0, dxbc::Src::kWWWW), - dxbc::Src::LU(draw_util::kDivideUpperShift3 + 4)); - // Take the remainder of the performed division to r0.y. 
- // r0.y = Y sample position within the tile - // r0.w = Y tile position - a.OpIMAd(dxbc::Dest::R(0, 0b0010), dxbc::Src::R(0, dxbc::Src::kWWWW), - dxbc::Src::LI(-16 * draw_resolution_scale_y), - dxbc::Src::VThreadID(dxbc::Src::kYYYY)); - } else { - assert_true(draw_resolution_scale_y <= 2); - // Tile height is a power of two, can use bit operations. - // Get the tile row index into r0.w. - // r0.w = Y tile position. - a.OpUShR(dxbc::Dest::R(0, 0b1000), dxbc::Src::VThreadID(dxbc::Src::kYYYY), - dxbc::Src::LU(draw_resolution_scale_y == 2 ? 5 : 4)); - // Get the Y sample position within the tile into r0.y. - // r0.y = Y sample position within the tile - // r0.w = Y tile position - a.OpAnd(dxbc::Dest::R(0, 0b0010), dxbc::Src::VThreadID(dxbc::Src::kYYYY), - dxbc::Src::LU((16 * draw_resolution_scale_y) - 1)); - } + uint32_t tile_width = + (xenos::kEdramTileWidthSamples * draw_resolution_scale_x) >> + uint32_t(format_is_64bpp); + uint32_t tile_height = + xenos::kEdramTileHeightSamples * draw_resolution_scale_y; - // Get the X tile offset within the dispatch to r0.z. - uint32_t tile_width = xenos::kEdramTileWidthSamples * draw_resolution_scale_x; - uint32_t tile_width_divide_scale; - uint32_t tile_width_divide_upper_shift; - draw_util::GetEdramTileWidthDivideScaleAndUpperShift( - draw_resolution_scale_x, tile_width_divide_scale, - tile_width_divide_upper_shift); - if (format_is_64bpp) { - tile_width >>= 1; - assert_not_zero(tile_width_divide_upper_shift); - --tile_width_divide_upper_shift; - } - // Multiplication part of the division by 80|40 * scale. - // r0.y = Y sample position within the tile - // r0.z = (X * tile_width_divide_scale) >> 32 - // r0.w = Y tile position - a.OpUMul(dxbc::Dest::R(0, 0b0100), dxbc::Dest::Null(), - dxbc::Src::VThreadID(dxbc::Src::kXXXX), - dxbc::Src::LU(tile_width_divide_scale)); - // Shift part of the division by 80|40 * scale. 
+ // Get the parts of the address - tile XY index within the dispatch to r0.zw, + // sample XY position within the tile to r0.xy. + // r0.x = X sample position within the tile // r0.y = Y sample position within the tile // r0.z = X tile position // r0.w = Y tile position - a.OpUShR(dxbc::Dest::R(0, 0b0100), dxbc::Src::R(0, dxbc::Src::kZZZZ), - dxbc::Src::LU(tile_width_divide_upper_shift)); + a.OpUDiv(dxbc::Dest::R(0, 0b1100), dxbc::Dest::R(0, 0b0011), + dxbc::Src::VThreadID(0b01000100), + dxbc::Src::LU(tile_width, tile_height, tile_width, tile_height)); - // Extract the dump rectangle tile row pitch to r0.x. - // r0.x = dump rectangle pitch in tiles + // Extract the dump rectangle tile row pitch to r1.x. + // r0.x = X sample position within the tile // r0.y = Y sample position within the tile // r0.z = X tile position // r0.w = Y tile position - a.OpUBFE(dxbc::Dest::R(0, 0b0001), dxbc::Src::LU(xenos::kEdramPitchTilesBits), + // r1.x = dump rectangle pitch in tiles + a.OpUBFE(dxbc::Dest::R(1, 0b0001), dxbc::Src::LU(xenos::kEdramPitchTilesBits), dxbc::Src::LU(0), dxbc::Src::CB(kDumpCbufferPitches, kDumpCbufferPitches, 0, dxbc::Src::kXXXX)); // Get the tile index in the EDRAM relative to the dump rectangle base tile to // r0.w. - // r0.x = free - // r0.y = Y sample position within the tile - // r0.z = X tile position - // r0.w = tile index relative to the dump rectangle base - a.OpUMAd(dxbc::Dest::R(0, 0b1000), dxbc::Src::R(0, dxbc::Src::kWWWW), - dxbc::Src::R(0, dxbc::Src::kXXXX), - dxbc::Src::R(0, dxbc::Src::kZZZZ)); - - // Take the X sample index within the tile as the remainder of the division of - // the thread index by tile width to r0.x. 
// r0.x = X sample position within the tile // r0.y = Y sample position within the tile // r0.z = free // r0.w = tile index relative to the dump rectangle base - a.OpIMAd(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(0, dxbc::Src::kZZZZ), - dxbc::Src::LI(-int32_t(tile_width)), - dxbc::Src::VThreadID(dxbc::Src::kXXXX)); + // r1.x = free + a.OpUMAd(dxbc::Dest::R(0, 0b1000), dxbc::Src::R(0, dxbc::Src::kWWWW), + dxbc::Src::R(1, dxbc::Src::kXXXX), + dxbc::Src::R(0, dxbc::Src::kZZZZ)); // Extract the index of the first tile of the dispatch in the EDRAM to r0.z. // r0.x = X sample position within the tile @@ -6053,7 +5952,7 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline( xenos::kEdramTileHeightSamples), dxbc::Src::R(0, dxbc::Src::kXXXX)); // Add the contribution of the Y sample position within the tile to the sample - // address in the EDRAM to r0.w. + // address in the EDRAM to r0.z. // r0.x = X sample position within the tile // r0.y = Y sample position within the tile // r0.z = sample offset in the EDRAM without the depth column swapping @@ -6119,7 +6018,6 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline( dxbc::Src::CB(kDumpCbufferPitches, kDumpCbufferPitches, 0, dxbc::Src::kXXXX)); // Split the linear tile index in the source texture into X and Y in tiles. - // Get the source texture pitch in tiles to r1.x. // r0.x = X sample position within the tile // r0.y = Y sample position within the tile // r0.z = sample offset in the EDRAM @@ -6257,9 +6155,9 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline( if (key.is_depth) { switch (key.GetDepthFormat()) { case xenos::DepthRenderTargetFormat::kD24S8: - // Round to the nearest even integer. This seems to be the correct, - // adding +0.5 and rounding towards zero results in red instead of - // black in the 4D5307E6 clear shader. + // Round to the nearest even integer. 
This seems to be the correct + // conversion, adding +0.5 and rounding towards zero results in red + // instead of black in the 4D5307E6 clear shader. a.OpMul(dxbc::Dest::R(1, 0b0001), dxbc::Src::R(1, dxbc::Src::kXXXX), dxbc::Src::LF(float(0xFFFFFF))); a.OpRoundNE(dxbc::Dest::R(1, 0b0001), diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index 70bf76df7..e894a925d 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -649,31 +649,6 @@ uint32_t GetNormalizedColorMask(const RegisterFile& regs, return normalized_color_mask; } -void GetEdramTileWidthDivideScaleAndUpperShift( - uint32_t draw_resolution_scale_x, uint32_t& divide_scale_out, - uint32_t& divide_upper_shift_out) { - static_assert( - TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3, - "GetEdramTileWidthDivideScaleAndUpperShift provides values only for draw " - "resolution scaling factors of up to 3"); - switch (draw_resolution_scale_x) { - case 1: - divide_scale_out = kDivideScale5; - divide_upper_shift_out = kDivideUpperShift5 + 4; - break; - case 2: - divide_scale_out = kDivideScale5; - divide_upper_shift_out = kDivideUpperShift5 + 5; - break; - case 3: - divide_scale_out = kDivideScale15; - divide_upper_shift_out = kDivideUpperShift15 + 4; - break; - default: - assert_unhandled_case(draw_resolution_scale_x); - } -} - xenos::CopySampleSelect SanitizeCopySampleSelect( xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples, bool is_depth) { diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index f270de673..3a52c7440 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -226,20 +226,6 @@ void GetScissor(const RegisterFile& regs, Scissor& scissor_out, uint32_t GetNormalizedColorMask(const RegisterFile& regs, uint32_t pixel_shader_writes_color_targets); -// Scales, and shift amounts of the upper 32 bits of the 32x32=64-bit -// multiplication result, for fast division and multiplication by -// EDRAM-tile-related 
amounts. -constexpr uint32_t kDivideScale3 = 0xAAAAAAABu; -constexpr uint32_t kDivideUpperShift3 = 1; -constexpr uint32_t kDivideScale5 = 0xCCCCCCCDu; -constexpr uint32_t kDivideUpperShift5 = 2; -constexpr uint32_t kDivideScale15 = 0x88888889u; -constexpr uint32_t kDivideUpperShift15 = 3; - -void GetEdramTileWidthDivideScaleAndUpperShift( - uint32_t draw_resolution_scale_x, uint32_t& divide_scale_out, - uint32_t& divide_upper_shift_out); - // Never an identity conversion - can always write conditional move instructions // to shaders that will be no-ops for conversion from guest to host samples. // While we don't know the exact guest sample pattern, due to the way diff --git a/src/xenia/gpu/dxbc_shader_translator_memexport.cc b/src/xenia/gpu/dxbc_shader_translator_memexport.cc index 8d1295ee7..d34053839 100644 --- a/src/xenia/gpu/dxbc_shader_translator_memexport.cc +++ b/src/xenia/gpu/dxbc_shader_translator_memexport.cc @@ -120,80 +120,49 @@ void DxbcShaderTranslator::ExportToMemory() { a_.OpIf(true, dxbc::Src::R(control_temp, dxbc::Src::kXXXX)); // Check more fine-grained limitations. - // The flag in control_temp.x can be 0 or 1 for simplicity, not necessarily - // 0 or 0xFFFFFFFF. bool inner_condition_provided = false; if (is_pixel_shader()) { uint32_t resolution_scaled_axes = uint32_t(draw_resolution_scale_x_ > 1) | (uint32_t(draw_resolution_scale_y_ > 1) << 1); if (resolution_scaled_axes) { - // Only do memexport for one host pixel in a guest pixel. - // For 2x - pixel 1 because it's covered with half-pixel offset that - // becomes full-pixel. - // For 3x - also pixel 1 because it's still covered with half-pixel - // offset, but close to the center. - // If X needs resolution scaling, writing 1 or 0 - whether the column is - // the one where memexport should be done - to control_temp.y. - // For Y, doing that to control_temp.z. - // Then, if both axes are resolution-scaled, merging the conditions for - // the two. 
+ // Only do memexport for one host pixel in a guest pixel - prefer the + // host pixel closer to the center of the guest pixel, but one that's + // covered with the half-pixel offset according to the top-left rule (1 + // for 2x because 0 isn't covered with the half-pixel offset, 1 for 3x + // because it's the center and is covered with the half-pixel offset too). + // Using control_temp.yz as per-axis temporary variables. in_position_used_ |= resolution_scaled_axes; a_.OpFToU( dxbc::Dest::R(control_temp, resolution_scaled_axes << 1), dxbc::Src::V1D(uint32_t(InOutRegister::kPSInPosition), 0b0100 << 2)); - dxbc::Dest resolution_scaling_temp_dest( - dxbc::Dest::R(control_temp, 0b1000)); - dxbc::Src resolution_scaling_temp_src( - dxbc::Src::R(control_temp, dxbc::Src::kWWWW)); + a_.OpUDiv(dxbc::Dest::Null(), + dxbc::Dest::R(control_temp, resolution_scaled_axes << 1), + dxbc::Src::R(control_temp, 0b1001 << 2), + dxbc::Src::LU(0, draw_resolution_scale_x_, + draw_resolution_scale_y_, 0)); for (uint32_t i = 0; i < 2; ++i) { if (!(resolution_scaled_axes & (1 << i))) { continue; } // If there's no inner condition in control_temp.x yet, the condition // for the current axis can go directly to it. Otherwise, need to merge - // with the previous condition, using control_temp.w as an intermediate - // variable. - dxbc::Dest resolution_scaled_axis_result( - inner_condition_provided ? resolution_scaling_temp_dest - : dxbc::Dest::R(control_temp, 0b0001)); + // with the previous condition, using control_temp.y or .z as an + // intermediate variable. dxbc::Src resolution_scaled_axis_src( dxbc::Src::R(control_temp).Select(1 + i)); - uint32_t axis_resolution_scale = - i ? 
draw_resolution_scale_y_ : draw_resolution_scale_x_; - static_assert( - TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3, - "DxbcShaderTranslator memexport draw resolution scaling " - "conditional generation supports draw resolution scaling factors " - "of only up to 3"); - switch (axis_resolution_scale) { - case 2: - // xy & 1 == 1. - a_.OpAnd(resolution_scaled_axis_result, resolution_scaled_axis_src, - dxbc::Src::LU(1)); - // No need to do IEq - already 1 for right / bottom, 0 for left / - // top. - break; - case 3: - // xy % 3 == 1. - a_.OpUMul(resolution_scaling_temp_dest, dxbc::Dest::Null(), - resolution_scaled_axis_src, - dxbc::Src::LU(draw_util::kDivideScale3)); - a_.OpUShR(resolution_scaling_temp_dest, resolution_scaling_temp_src, - dxbc::Src::LU(draw_util::kDivideUpperShift3)); - a_.OpIMAd(resolution_scaling_temp_dest, resolution_scaling_temp_src, - dxbc::Src::LI(-3), resolution_scaled_axis_src); - a_.OpIEq(resolution_scaled_axis_result, resolution_scaling_temp_src, - dxbc::Src::LU(1)); - break; - default: - assert_unhandled_case(axis_resolution_scale); - } + a_.OpIEq( + dxbc::Dest::R(control_temp, + inner_condition_provided ? 1 << (1 + i) : 0b0001), + resolution_scaled_axis_src, + dxbc::Src::LU( + (i ? draw_resolution_scale_y_ : draw_resolution_scale_x_) >> + 1)); if (inner_condition_provided) { // Merge with the previous condition in control_temp.x. a_.OpAnd(dxbc::Dest::R(control_temp, 0b0001), dxbc::Src::R(control_temp, dxbc::Src::kXXXX), - resolution_scaling_temp_src); + resolution_scaled_axis_src); } inner_condition_provided = true; } diff --git a/src/xenia/gpu/dxbc_shader_translator_om.cc b/src/xenia/gpu/dxbc_shader_translator_om.cc index eb5a8bd38..df800b9c2 100644 --- a/src/xenia/gpu/dxbc_shader_translator_om.cc +++ b/src/xenia/gpu/dxbc_shader_translator_om.cc @@ -190,91 +190,22 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() { // dividing by 40, not by 80. 
// For depth-only: // Same, but for full 80x16 tiles, not 40x16 half-tiles. - uint32_t tile_or_half_tile_width = 80 * draw_resolution_scale_x_; - uint32_t tile_or_half_tile_width_divide_scale; - uint32_t tile_or_half_tile_width_divide_upper_shift; - draw_util::GetEdramTileWidthDivideScaleAndUpperShift( - draw_resolution_scale_x_, tile_or_half_tile_width_divide_scale, - tile_or_half_tile_width_divide_upper_shift); - if (any_color_targets_written) { - tile_or_half_tile_width >>= 1; - assert_not_zero(tile_or_half_tile_width_divide_upper_shift); - --tile_or_half_tile_width_divide_upper_shift; - } - static_assert( - TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3, - "DxbcShaderTranslator ROV sample address calculation supports Y draw " - "resolution scaling factors of only up to 3"); - if (draw_resolution_scale_y_ == 3) { - // Multiplication part of the division by 40|80 x 16 x scale (specifically - // 40|80 * scale width here, and 48 height, or 16 * 3 height). - // system_temp_rov_params_.x = X sample 0 position - // system_temp_rov_params_.y = Y sample 0 position - // system_temp_rov_params_.z = (X * tile_or_half_tile_width_divide_scale) >> - // 32 - // system_temp_rov_params_.w = (Y * kDivideScale3) >> 32 - a_.OpUMul(dxbc::Dest::R(system_temp_rov_params_, 0b1100), - dxbc::Dest::Null(), - dxbc::Src::R(system_temp_rov_params_, 0b0100 << 4), - dxbc::Src::LU(0, 0, tile_or_half_tile_width_divide_scale, - draw_util::kDivideScale3)); - // Shift part of the division by 40|80 x 16 x scale. 
- // system_temp_rov_params_.x = X sample 0 position - // system_temp_rov_params_.y = Y sample 0 position - // system_temp_rov_params_.z = X half-tile or tile position - // system_temp_rov_params_.w = Y tile position - a_.OpUShR(dxbc::Dest::R(system_temp_rov_params_, 0b1100), - dxbc::Src::R(system_temp_rov_params_), - dxbc::Src::LU(0, 0, tile_or_half_tile_width_divide_upper_shift, - draw_util::kDivideUpperShift3 + 4)); - // Take the remainder of the performed division to - // system_temp_rov_params_.xy. - // system_temp_rov_params_.x = X sample 0 position within the half-tile - // system_temp_rov_params_.y = Y sample 0 position within the (half-)tile - // system_temp_rov_params_.z = X half-tile or tile position - // system_temp_rov_params_.w = Y tile position - a_.OpIMAd(dxbc::Dest::R(system_temp_rov_params_, 0b0011), - dxbc::Src::R(system_temp_rov_params_, 0b1110), - dxbc::Src::LI(-int32_t(tile_or_half_tile_width), - -16 * draw_resolution_scale_y_, 0, 0), - dxbc::Src::R(system_temp_rov_params_)); - } else { - assert_true(draw_resolution_scale_y_ <= 2); - // Multiplication part of the division of X by 40|80 * scale. - // system_temp_rov_params_.x = X sample 0 position - // system_temp_rov_params_.y = Y sample 0 position - // system_temp_rov_params_.z = (X * tile_or_half_tile_width_divide_scale) >> - // 32 - a_.OpUMul(dxbc::Dest::R(system_temp_rov_params_, 0b0100), - dxbc::Dest::Null(), - dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX), - dxbc::Src::LU(tile_or_half_tile_width_divide_scale)); - // Shift part of the division of X by 40 * scale, division of Y by - // 16 * scale as it's power of two in this case. 
- // system_temp_rov_params_.x = X sample 0 position - // system_temp_rov_params_.y = Y sample 0 position - // system_temp_rov_params_.z = X half-tile or tile position - // system_temp_rov_params_.w = Y tile position - a_.OpUShR(dxbc::Dest::R(system_temp_rov_params_, 0b1100), - dxbc::Src::R(system_temp_rov_params_, 0b0110 << 4), - dxbc::Src::LU(0, 0, tile_or_half_tile_width_divide_upper_shift, - draw_resolution_scale_y_ == 2 ? 5 : 4)); - // Take the remainder of the performed division (via multiply-subtract for - // X, via AND for Y which is power-of-two here) to - // system_temp_rov_params_.xy. - // system_temp_rov_params_.x = X sample 0 position within the half-tile or - // tile - // system_temp_rov_params_.y = Y sample 0 position within the (half-)tile - // system_temp_rov_params_.z = X half-tile or tile position - // system_temp_rov_params_.w = Y tile position - a_.OpIMAd(dxbc::Dest::R(system_temp_rov_params_, 0b0001), - dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kZZZZ), - dxbc::Src::LI(-int32_t(tile_or_half_tile_width)), - dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX)); - a_.OpAnd(dxbc::Dest::R(system_temp_rov_params_, 0b0010), - dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY), - dxbc::Src::LU((16 * draw_resolution_scale_y_) - 1)); - } + uint32_t tile_width = + xenos::kEdramTileWidthSamples * draw_resolution_scale_x_; + uint32_t tile_or_tile_half_width = + tile_width >> uint32_t(any_color_targets_written); + uint32_t tile_height = + xenos::kEdramTileHeightSamples * draw_resolution_scale_y_; + // system_temp_rov_params_.x = X sample 0 position within the half-tile or + // tile + // system_temp_rov_params_.y = Y sample 0 position within the (half-)tile + // system_temp_rov_params_.z = X half-tile or tile position + // system_temp_rov_params_.w = Y tile position + a_.OpUDiv(dxbc::Dest::R(system_temp_rov_params_, 0b1100), + dxbc::Dest::R(system_temp_rov_params_, 0b0011), + dxbc::Src::R(system_temp_rov_params_, 0b01000100), + 
dxbc::Src::LU(tile_or_tile_half_width, tile_height, + tile_or_tile_half_width, tile_height)); // Convert the Y sample 0 position within the half-tile or tile to the dword // offset of the row within a 80x16 32bpp tile or a 40x16 64bpp half-tile to @@ -287,8 +218,10 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() { // system_temp_rov_params_.w = Y tile position a_.OpUMul(dxbc::Dest::Null(), dxbc::Dest::R(system_temp_rov_params_, 0b0010), dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY), - dxbc::Src::LU(80 * draw_resolution_scale_x_)); + dxbc::Src::LU(tile_width)); + uint32_t tile_size = tile_width * tile_height; + uint32_t tile_half_width = tile_width >> 1; if (any_color_targets_written) { // Depth, 32bpp color, 64bpp color are all needed. @@ -336,12 +269,10 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() { // system_temp_rov_params_.w = Y tile row dword origin in a 32bpp surface // rov_address_temp.x = dword offset of the beginning of the row of samples // within a row of 32bpp tiles - a_.OpUMAd( - dxbc::Dest::R(rov_address_temp, 0b0001), - dxbc::Src::R(rov_address_temp, dxbc::Src::kXXXX), - dxbc::Src::LU(80 * 16 * - (draw_resolution_scale_x_ * draw_resolution_scale_y_)), - dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY)); + a_.OpUMAd(dxbc::Dest::R(rov_address_temp, 0b0001), + dxbc::Src::R(rov_address_temp, dxbc::Src::kXXXX), + dxbc::Src::LU(tile_size), + dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY)); // Get the dword offset of the beginning of the row of samples within a // 32bpp surface to rov_address_temp.x. 
// system_temp_rov_params_.x = X sample 0 position within the half-tile @@ -365,12 +296,10 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() { // system_temp_rov_params_.w = Y tile row dword origin in a 32bpp surface // rov_address_temp.x = dword offset of the beginning of the row of samples // within a 32bpp surface - a_.OpUMAd( - dxbc::Dest::R(system_temp_rov_params_, 0b0010), - dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kZZZZ), - dxbc::Src::LU(80 * 16 * - (draw_resolution_scale_x_ * draw_resolution_scale_y_)), - dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY)); + a_.OpUMAd(dxbc::Dest::R(system_temp_rov_params_, 0b0010), + dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kZZZZ), + dxbc::Src::LU(tile_size), + dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY)); // Get the dword offset of the beginning of the row of samples within a // 64bpp surface to system_temp_rov_params_.w (last time the Y tile row // offset is needed). @@ -420,7 +349,7 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() { // within a 32bpp surface a_.OpUMAd(dxbc::Dest::R(system_temp_rov_params_, 0b0100), dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY), - dxbc::Src::LU(40 * draw_resolution_scale_x_), + dxbc::Src::LU(tile_half_width), dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX)); // Get the final offset of the sample 0 within a 32bpp color surface to // system_temp_rov_params_.z (last time the 32bpp row offset is needed). 
@@ -439,8 +368,8 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() { // system_temp_rov_params_.w = dword sample 0 offset within a 64bpp surface a_.OpMovC(dxbc::Dest::R(system_temp_rov_params_, 0b0010), dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY), - dxbc::Src::LI(-40 * draw_resolution_scale_x_), - dxbc::Src::LI(40 * draw_resolution_scale_x_)); + dxbc::Src::LI(-int32_t(tile_half_width)), + dxbc::Src::LI(int32_t(tile_half_width))); // Flip the 40x16 half-tiles for depth / stencil as opposed to 32bpp color - // get the final offset of the sample 0 within a 32bpp depth / stencil // surface to system_temp_rov_params_.y. @@ -466,12 +395,10 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() { // system_temp_rov_params_.z = dword offset of the beginning of the row of // samples within a row of 32bpp tiles // system_temp_rov_params_.w = Y tile position - a_.OpUMAd( - dxbc::Dest::R(system_temp_rov_params_, 0b0100), - dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kZZZZ), - dxbc::Src::LU(80 * 16 * - (draw_resolution_scale_x_ * draw_resolution_scale_y_)), - dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY)); + a_.OpUMAd(dxbc::Dest::R(system_temp_rov_params_, 0b0100), + dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kZZZZ), + dxbc::Src::LU(tile_size), + dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY)); // Get the dword offset of the beginning of the row of samples within a // 32bpp surface to system_temp_rov_params_.y (last time anything Y-related // is needed, as well as the sample row offset within the tile row). 
@@ -502,15 +429,15 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() { // otherwise a_.OpUGE(dxbc::Dest::R(system_temp_rov_params_, 0b0001), dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX), - dxbc::Src::LU(40 * draw_resolution_scale_x_)); + dxbc::Src::LU(tile_half_width)); // Flip the 40x16 half-tiles for depth / stencil as opposed to 32bpp color - // get the dword offset to add for flipping to system_temp_rov_params_.x. // system_temp_rov_params_.x = depth half-tile flipping offset // system_temp_rov_params_.y = dword sample 0 offset within a 32bpp surface a_.OpMovC(dxbc::Dest::R(system_temp_rov_params_, 0b0001), dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX), - dxbc::Src::LI(-40 * draw_resolution_scale_x_), - dxbc::Src::LI(40 * draw_resolution_scale_x_)); + dxbc::Src::LI(-int32_t(tile_half_width)), + dxbc::Src::LI(int32_t(tile_half_width))); // Flip the 40x16 half-tiles for depth / stencil as opposed to 32bpp color - // get the final offset of the sample 0 within a 32bpp depth / stencil // surface to system_temp_rov_params_.y. @@ -1288,10 +1215,12 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() { // Go to the next sample (samples are at +0, +(80*scale_x), +1, // +(80*scale_x+1), so need to do +(80*scale_x), -(80*scale_x-1), // +(80*scale_x) and -(80*scale_x+1) after each sample). + uint32_t tile_width = + xenos::kEdramTileWidthSamples * draw_resolution_scale_x_; a_.OpIAdd(dxbc::Dest::R(system_temp_rov_params_, 0b0010), dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY), - dxbc::Src::LI((i & 1) ? -80 * draw_resolution_scale_x_ + 2 - i - : 80 * draw_resolution_scale_x_)); + dxbc::Src::LI((i & 1) ? 
-int32_t(tile_width) + 2 - i + : int32_t(tile_width))); } if (ROV_IsDepthStencilEarly()) { @@ -2181,6 +2110,9 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { dxbc::Dest temp_w_dest(dxbc::Dest::R(temp, 0b1000)); dxbc::Src temp_w_src(dxbc::Src::R(temp, dxbc::Src::kWWWW)); + uint32_t tile_width = + xenos::kEdramTileWidthSamples * draw_resolution_scale_x_; + // Do late depth/stencil test (which includes writing) if needed or deferred // depth writing. if (ROV_IsDepthStencilEarly()) { @@ -2212,8 +2144,8 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { if (i < 3) { a_.OpIAdd(dxbc::Dest::R(system_temp_rov_params_, 0b0010), dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY), - dxbc::Src::LI((i & 1) ? -80 * draw_resolution_scale_x_ + 2 - i - : 80 * draw_resolution_scale_x_)); + dxbc::Src::LI((i & 1) ? -int32_t(tile_width) + 2 - i + : int32_t(tile_width))); } } } else { @@ -3021,8 +2953,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() { // +(80*scale_x+1), so need to do +(80*scale_x), -(80*scale_x-1), // +(80*scale_x) and -(80*scale_x+1) after each sample). int32_t next_sample_distance = - (j & 1) ? -80 * draw_resolution_scale_x_ + 2 - j - : 80 * draw_resolution_scale_x_; + (j & 1) ? -int32_t(tile_width) + 2 - j : int32_t(tile_width); a_.OpIAdd( dxbc::Dest::R(system_temp_rov_params_, 0b1100), dxbc::Src::R(system_temp_rov_params_),