[D3D12] Use udiv by constant tile size + minor transfer cleanup

Drivers compile that to a multiplication and a shift anyway.
This commit is contained in:
Triang3l 2022-06-20 22:39:30 +03:00
parent 207e11c8d2
commit e2f632f8fa
5 changed files with 125 additions and 366 deletions

View File

@ -2918,73 +2918,29 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
uint32_t draw_resolution_scale_x = this->draw_resolution_scale_x();
uint32_t draw_resolution_scale_y = this->draw_resolution_scale_y();
uint32_t tile_width_samples_scaled =
uint32_t tile_width_samples =
xenos::kEdramTileWidthSamples * draw_resolution_scale_x;
uint32_t tile_height_samples_scaled =
uint32_t tile_height_samples =
xenos::kEdramTileHeightSamples * draw_resolution_scale_y;
// Split the destination pixel index into 32bpp tile in r0.z and
// Split the destination pixel index into 32bpp tile in r0.zw and
// 32bpp-tile-relative pixel index in r0.xy.
// r0.xy = pixel XY as uint
a.OpFToU(dxbc::Dest::R(0, 0b0011), dxbc::Src::V1D(kInputRegisterPosition));
uint32_t dest_sample_width_log2 =
uint32_t(dest_is_64bpp) +
uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k4X);
uint32_t dest_sample_height_log2 =
uint32_t dest_tile_width_pixels =
tile_width_samples >>
(uint32_t(dest_is_64bpp) +
uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k4X));
uint32_t dest_tile_height_pixels =
tile_height_samples >>
uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k2X);
uint32_t dest_tile_width_divide_scale, dest_tile_width_divide_upper_shift;
draw_util::GetEdramTileWidthDivideScaleAndUpperShift(
draw_resolution_scale_x, dest_tile_width_divide_scale,
dest_tile_width_divide_upper_shift);
assert_true(dest_tile_width_divide_upper_shift >= dest_sample_width_log2);
// Need the host tile size in pixels, not samples.
dest_tile_width_divide_upper_shift -= dest_sample_width_log2;
static_assert(
TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3,
"D3D12RenderTargetCache EDRAM range ownership transfer shader generation "
"supports Y draw resolution scaling factors of only up to 3");
if (draw_resolution_scale_y == 3) {
// r0.zw = upper 32 bits in the division process of pixel XY by pixel count
// in a 32bpp tile
a.OpUMul(dxbc::Dest::R(0, 0b1100), dxbc::Dest::Null(),
dxbc::Src::R(0, 0b0100 << 4),
dxbc::Src::LU(0, 0, dest_tile_width_divide_scale,
draw_util::kDivideScale3));
// r0.zw = 32bpp tile XY index
a.OpUShR(dxbc::Dest::R(0, 0b1100), dxbc::Src::R(0),
dxbc::Src::LU(
0, 0, dest_tile_width_divide_upper_shift,
draw_util::kDivideUpperShift3 + 4 - dest_sample_height_log2));
// r0.xy = destination pixel XY index within the 32bpp tile
a.OpIMAd(
dxbc::Dest::R(0, 0b0011), dxbc::Src::R(0, 0b1110),
dxbc::Src::LI(
-int32_t((80 * draw_resolution_scale_x) >> dest_sample_width_log2),
-int32_t((16 * draw_resolution_scale_y) >> dest_sample_height_log2),
0, 0),
dxbc::Src::R(0, 0b0100));
} else {
assert_true(draw_resolution_scale_y <= 2);
uint32_t dest_tile_height_pixels_log2 =
(draw_resolution_scale_y == 2 ? 5 : 4) - dest_sample_height_log2;
// r0.z = upper 32 bits in the division process of pixel X by pixel count in
// a 32bpp tile
a.OpUMul(dxbc::Dest::R(0, 0b0100), dxbc::Dest::Null(),
dxbc::Src::R(0, dxbc::Src::kXXXX),
dxbc::Src::LU(dest_tile_width_divide_scale));
// r0.zw = 32bpp tile XY index
a.OpUShR(dxbc::Dest::R(0, 0b1100), dxbc::Src::R(0, 0b0110 << 4),
dxbc::Src::LU(0, 0, dest_tile_width_divide_upper_shift,
dest_tile_height_pixels_log2));
// r0.x = destination pixel X index within the 32bpp tile
a.OpIMAd(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(0, dxbc::Src::kZZZZ),
dxbc::Src::LI(-int32_t((80 * draw_resolution_scale_x) >>
dest_sample_width_log2)),
dxbc::Src::R(0, dxbc::Src::kXXXX));
// r0.y = destination pixel Y index within the 32bpp tile
a.OpAnd(dxbc::Dest::R(0, 0b0010), dxbc::Src::R(0, dxbc::Src::kYYYY),
dxbc::Src::LU((1 << dest_tile_height_pixels_log2) - 1));
}
// r0.xy = destination pixel XY index within the 32bpp tile
// r0.zw = 32bpp tile XY index
a.OpUDiv(dxbc::Dest::R(0, 0b1100), dxbc::Dest::R(0, 0b0011),
dxbc::Src::R(0, 0b01000100),
dxbc::Src::LU(dest_tile_width_pixels, dest_tile_height_pixels,
dest_tile_width_pixels, dest_tile_height_pixels));
// r1.x = destination pitch in 32bpp tiles
a.OpUBFE(dxbc::Dest::R(1, 0b0001), dxbc::Src::LU(xenos::kEdramPitchTilesBits),
dxbc::Src::LU(0),
@ -3305,7 +3261,7 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
// Copying between color and depth / stencil - swap 40-32bpp-sample columns
// in the pixel index within the source 32bpp tile using r1.w as temporary.
uint32_t source_32bpp_tile_half_pixels =
tile_width_samples_scaled >> (1 + source_pixel_width_dwords_log2);
tile_width_samples >> (1 + source_pixel_width_dwords_log2);
a.OpULT(dxbc::Dest::R(1, 0b1000),
dxbc::Src::R(source_tile_pixel_x_reg, dxbc::Src::kXXXX),
dxbc::Src::LU(source_32bpp_tile_half_pixels));
@ -3348,18 +3304,17 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
// r1.x = pixel X within the source texture
// r2.x = free
a.OpUMAd(dxbc::Dest::R(1, 0b0001),
dxbc::Src::LU(tile_width_samples_scaled >>
source_pixel_width_dwords_log2),
dxbc::Src::LU(tile_width_samples >> source_pixel_width_dwords_log2),
dxbc::Src::R(2, dxbc::Src::kXXXX),
dxbc::Src::R(source_tile_pixel_x_reg, dxbc::Src::kXXXX));
// r1.y = pixel Y within the source texture
// r1.w = free
a.OpUMAd(dxbc::Dest::R(1, 0b0010),
dxbc::Src::LU(
tile_height_samples_scaled >>
uint32_t(key.source_msaa_samples >= xenos::MsaaSamples::k2X)),
dxbc::Src::R(1, dxbc::Src::kWWWW),
dxbc::Src::R(source_tile_pixel_y_reg, dxbc::Src::kYYYY));
a.OpUMAd(
dxbc::Dest::R(1, 0b0010),
dxbc::Src::LU(tile_height_samples >> uint32_t(key.source_msaa_samples >=
xenos::MsaaSamples::k2X)),
dxbc::Src::R(1, dxbc::Src::kWWWW),
dxbc::Src::R(source_tile_pixel_y_reg, dxbc::Src::kYYYY));
// Load the source to r1, or, for 32bpp | 32bpp -> 64bpp, the first dword to
// r0 since addressing will not be needed anymore for color, and the second
@ -3575,9 +3530,9 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
for (uint32_t i = 0; i < 2; ++i) {
switch (source_depth_format) {
case xenos::DepthRenderTargetFormat::kD24S8: {
// Round to the nearest even integer. This seems to be the correct,
// adding +0.5 and rounding towards zero results in red instead of
// black in the 4D5307E6 clear shader.
// Round to the nearest even integer. This seems to be the correct
// conversion, adding +0.5 and rounding towards zero results in red
// instead of black in the 4D5307E6 clear shader.
a.OpMul(dxbc::Dest::R(i, 0b1000), dxbc::Src::R(i, dxbc::Src::kWWWW),
dxbc::Src::LF(float(0xFFFFFF)));
a.OpRoundNE(dxbc::Dest::R(i, 0b1000),
@ -3762,9 +3717,9 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
depth_loaded_in_guest_format = true;
switch (source_depth_format) {
case xenos::DepthRenderTargetFormat::kD24S8: {
// Round to the nearest even integer. This seems to be the correct,
// adding +0.5 and rounding towards zero results in red instead of
// black in the 4D5307E6 clear shader.
// Round to the nearest even integer. This seems to be the correct
// conversion, adding +0.5 and rounding towards zero results in red
// instead of black in the 4D5307E6 clear shader.
a.OpMul(dxbc::Dest::R(1, 0b1000), dxbc::Src::R(1, dxbc::Src::kWWWW),
dxbc::Src::LF(float(0xFFFFFF)));
a.OpRoundNE(dxbc::Dest::R(1, 0b1000),
@ -3920,12 +3875,11 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
// Combine the tile sample index and the tile index into buffer
// address to r0.x.
a.OpUMAd(dxbc::Dest::R(0, 0b0001),
dxbc::Src::LU(tile_width_samples_scaled),
dxbc::Src::LU(tile_width_samples),
dxbc::Src::R(0, dxbc::Src::kYYYY),
dxbc::Src::R(0, dxbc::Src::kXXXX));
a.OpUMAd(dxbc::Dest::R(0, 0b0001),
dxbc::Src::LU(tile_width_samples_scaled *
tile_height_samples_scaled),
dxbc::Src::LU(tile_width_samples * tile_height_samples),
dxbc::Src::R(0, dxbc::Src::kZZZZ),
dxbc::Src::R(0, dxbc::Src::kXXXX));
// Load from the buffer.
@ -4102,7 +4056,7 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
// r1.x = free
a.OpUMAd(
dxbc::Dest::R(0, 0b0001),
dxbc::Src::LU(tile_width_samples_scaled >>
dxbc::Src::LU(tile_width_samples >>
uint32_t(key.host_depth_source_msaa_samples >=
xenos::MsaaSamples::k4X)),
dxbc::Src::R(1, dxbc::Src::kXXXX),
@ -4111,7 +4065,7 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
// r0.z = free
a.OpUMAd(
dxbc::Dest::R(0, 0b0010),
dxbc::Src::LU(tile_height_samples_scaled >>
dxbc::Src::LU(tile_height_samples >>
uint32_t(key.host_depth_source_msaa_samples >=
xenos::MsaaSamples::k2X)),
dxbc::Src::R(0, dxbc::Src::kZZZZ),
@ -5933,97 +5887,42 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline(
// 32bpp is unknown, treating 64bpp tiles as storing 40x16 samples rather than
// 80x16 for simplicity of addressing into the texture.
// Get the parts of the address along Y - tile row index within the dispatch
// to r0.w, sample Y within the tile to r0.y.
static_assert(
TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3,
"D3D12RenderTargetCache render target dump shader generation supports Y "
"draw resolution scaling factors of only up to 3");
if (draw_resolution_scale_y == 3) {
// Multiplication part of the division by the (16 * scale) tile height,
// specifically 48 here, or 16 * 3.
// r0.w = (Y * kDivideScale3) >> 32
a.OpUMul(dxbc::Dest::R(0, 0b1000), dxbc::Dest::Null(),
dxbc::Src::VThreadID(dxbc::Src::kYYYY),
dxbc::Src::LU(draw_util::kDivideScale3));
// Shift part of the division by 16 * scale.
// r0.w = Y tile position
a.OpUShR(dxbc::Dest::R(0, 0b1000), dxbc::Src::R(0, dxbc::Src::kWWWW),
dxbc::Src::LU(draw_util::kDivideUpperShift3 + 4));
// Take the remainder of the performed division to r0.y.
// r0.y = Y sample position within the tile
// r0.w = Y tile position
a.OpIMAd(dxbc::Dest::R(0, 0b0010), dxbc::Src::R(0, dxbc::Src::kWWWW),
dxbc::Src::LI(-16 * draw_resolution_scale_y),
dxbc::Src::VThreadID(dxbc::Src::kYYYY));
} else {
assert_true(draw_resolution_scale_y <= 2);
// Tile height is a power of two, can use bit operations.
// Get the tile row index into r0.w.
// r0.w = Y tile position.
a.OpUShR(dxbc::Dest::R(0, 0b1000), dxbc::Src::VThreadID(dxbc::Src::kYYYY),
dxbc::Src::LU(draw_resolution_scale_y == 2 ? 5 : 4));
// Get the Y sample position within the tile into r0.y.
// r0.y = Y sample position within the tile
// r0.w = Y tile position
a.OpAnd(dxbc::Dest::R(0, 0b0010), dxbc::Src::VThreadID(dxbc::Src::kYYYY),
dxbc::Src::LU((16 * draw_resolution_scale_y) - 1));
}
uint32_t tile_width =
(xenos::kEdramTileWidthSamples * draw_resolution_scale_x) >>
uint32_t(format_is_64bpp);
uint32_t tile_height =
xenos::kEdramTileHeightSamples * draw_resolution_scale_y;
// Get the X tile offset within the dispatch to r0.z.
uint32_t tile_width = xenos::kEdramTileWidthSamples * draw_resolution_scale_x;
uint32_t tile_width_divide_scale;
uint32_t tile_width_divide_upper_shift;
draw_util::GetEdramTileWidthDivideScaleAndUpperShift(
draw_resolution_scale_x, tile_width_divide_scale,
tile_width_divide_upper_shift);
if (format_is_64bpp) {
tile_width >>= 1;
assert_not_zero(tile_width_divide_upper_shift);
--tile_width_divide_upper_shift;
}
// Multiplication part of the division by 80|40 * scale.
// r0.y = Y sample position within the tile
// r0.z = (X * tile_width_divide_scale) >> 32
// r0.w = Y tile position
a.OpUMul(dxbc::Dest::R(0, 0b0100), dxbc::Dest::Null(),
dxbc::Src::VThreadID(dxbc::Src::kXXXX),
dxbc::Src::LU(tile_width_divide_scale));
// Shift part of the division by 80|40 * scale.
// Get the parts of the address - tile XY position within the dispatch to
// r0.zw, sample XY position within the tile to r0.xy.
// r0.x = X sample position within the tile
// r0.y = Y sample position within the tile
// r0.z = X tile position
// r0.w = Y tile position
a.OpUShR(dxbc::Dest::R(0, 0b0100), dxbc::Src::R(0, dxbc::Src::kZZZZ),
dxbc::Src::LU(tile_width_divide_upper_shift));
a.OpUDiv(dxbc::Dest::R(0, 0b1100), dxbc::Dest::R(0, 0b0011),
dxbc::Src::VThreadID(0b01000100),
dxbc::Src::LU(tile_width, tile_height, tile_width, tile_height));
// Extract the dump rectangle tile row pitch to r0.x.
// r0.x = dump rectangle pitch in tiles
// Extract the dump rectangle tile row pitch to r1.x.
// r0.x = X sample position within the tile
// r0.y = Y sample position within the tile
// r0.z = X tile position
// r0.w = Y tile position
a.OpUBFE(dxbc::Dest::R(0, 0b0001), dxbc::Src::LU(xenos::kEdramPitchTilesBits),
// r1.x = dump rectangle pitch in tiles
a.OpUBFE(dxbc::Dest::R(1, 0b0001), dxbc::Src::LU(xenos::kEdramPitchTilesBits),
dxbc::Src::LU(0),
dxbc::Src::CB(kDumpCbufferPitches, kDumpCbufferPitches, 0,
dxbc::Src::kXXXX));
// Get the tile index in the EDRAM relative to the dump rectangle base tile to
// r0.w.
// r0.x = free
// r0.y = Y sample position within the tile
// r0.z = X tile position
// r0.w = tile index relative to the dump rectangle base
a.OpUMAd(dxbc::Dest::R(0, 0b1000), dxbc::Src::R(0, dxbc::Src::kWWWW),
dxbc::Src::R(0, dxbc::Src::kXXXX),
dxbc::Src::R(0, dxbc::Src::kZZZZ));
// Take the X sample index within the tile as the remainder of the division of
// the thread index by tile width to r0.x.
// r0.x = X sample position within the tile
// r0.y = Y sample position within the tile
// r0.z = free
// r0.w = tile index relative to the dump rectangle base
a.OpIMAd(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(0, dxbc::Src::kZZZZ),
dxbc::Src::LI(-int32_t(tile_width)),
dxbc::Src::VThreadID(dxbc::Src::kXXXX));
// r1.x = free
a.OpUMAd(dxbc::Dest::R(0, 0b1000), dxbc::Src::R(0, dxbc::Src::kWWWW),
dxbc::Src::R(1, dxbc::Src::kXXXX),
dxbc::Src::R(0, dxbc::Src::kZZZZ));
// Extract the index of the first tile of the dispatch in the EDRAM to r0.z.
// r0.x = X sample position within the tile
@ -6053,7 +5952,7 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline(
xenos::kEdramTileHeightSamples),
dxbc::Src::R(0, dxbc::Src::kXXXX));
// Add the contribution of the Y sample position within the tile to the sample
// address in the EDRAM to r0.w.
// address in the EDRAM to r0.z.
// r0.x = X sample position within the tile
// r0.y = Y sample position within the tile
// r0.z = sample offset in the EDRAM without the depth column swapping
@ -6119,7 +6018,6 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline(
dxbc::Src::CB(kDumpCbufferPitches, kDumpCbufferPitches, 0,
dxbc::Src::kXXXX));
// Split the linear tile index in the source texture into X and Y in tiles.
// Get the source texture pitch in tiles to r1.x.
// r0.x = X sample position within the tile
// r0.y = Y sample position within the tile
// r0.z = sample offset in the EDRAM
@ -6257,9 +6155,9 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline(
if (key.is_depth) {
switch (key.GetDepthFormat()) {
case xenos::DepthRenderTargetFormat::kD24S8:
// Round to the nearest even integer. This seems to be the correct,
// adding +0.5 and rounding towards zero results in red instead of
// black in the 4D5307E6 clear shader.
// Round to the nearest even integer. This seems to be the correct
// conversion, adding +0.5 and rounding towards zero results in red
// instead of black in the 4D5307E6 clear shader.
a.OpMul(dxbc::Dest::R(1, 0b0001), dxbc::Src::R(1, dxbc::Src::kXXXX),
dxbc::Src::LF(float(0xFFFFFF)));
a.OpRoundNE(dxbc::Dest::R(1, 0b0001),

View File

@ -649,31 +649,6 @@ uint32_t GetNormalizedColorMask(const RegisterFile& regs,
return normalized_color_mask;
}
// Provides the constants for dividing an unsigned 32-bit X coordinate by the
// width of an EDRAM tile in host samples (80 guest samples multiplied by the
// horizontal draw resolution scale) without an integer division instruction.
//
// The quotient is obtained as
//   (upper 32 bits of (x * divide_scale_out)) >> divide_upper_shift_out.
//
// draw_resolution_scale_x: horizontal draw resolution scale, 1 to 3.
// divide_scale_out: receives the 32-bit multiplier.
// divide_upper_shift_out: receives the right shift to apply to the upper
//   32 bits of the 64-bit product.
//
// For an unsupported scale, the function asserts, and then falls back to the
// 1x constants so the outputs are never left unassigned in builds where
// assertions are compiled out.
void GetEdramTileWidthDivideScaleAndUpperShift(
    uint32_t draw_resolution_scale_x, uint32_t& divide_scale_out,
    uint32_t& divide_upper_shift_out) {
  static_assert(
      TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3,
      "GetEdramTileWidthDivideScaleAndUpperShift provides values only for draw "
      "resolution scaling factors of up to 3");
  switch (draw_resolution_scale_x) {
    case 1:
      // 80 = 5 << 4: divide by 5, then by 16.
      divide_scale_out = kDivideScale5;
      divide_upper_shift_out = kDivideUpperShift5 + 4;
      break;
    case 2:
      // 160 = 5 << 5: divide by 5, then by 32.
      divide_scale_out = kDivideScale5;
      divide_upper_shift_out = kDivideUpperShift5 + 5;
      break;
    case 3:
      // 240 = 15 << 4: divide by 15, then by 16.
      divide_scale_out = kDivideScale15;
      divide_upper_shift_out = kDivideUpperShift15 + 4;
      break;
    default:
      assert_unhandled_case(draw_resolution_scale_x);
      // Keep the outputs defined even if the assertion is compiled out - use
      // the unscaled tile width constants as the fallback.
      divide_scale_out = kDivideScale5;
      divide_upper_shift_out = kDivideUpperShift5 + 4;
      break;
  }
}
xenos::CopySampleSelect SanitizeCopySampleSelect(
xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples,
bool is_depth) {

View File

@ -226,20 +226,6 @@ void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
uint32_t GetNormalizedColorMask(const RegisterFile& regs,
uint32_t pixel_shader_writes_color_targets);
// Scales, and shift amounts of the upper 32 bits of the 32x32=64-bit
// multiplication result, for fast division and multiplication by
// EDRAM-tile-related amounts.
constexpr uint32_t kDivideScale3 = 0xAAAAAAABu;
constexpr uint32_t kDivideUpperShift3 = 1;
constexpr uint32_t kDivideScale5 = 0xCCCCCCCDu;
constexpr uint32_t kDivideUpperShift5 = 2;
constexpr uint32_t kDivideScale15 = 0x88888889u;
constexpr uint32_t kDivideUpperShift15 = 3;
void GetEdramTileWidthDivideScaleAndUpperShift(
uint32_t draw_resolution_scale_x, uint32_t& divide_scale_out,
uint32_t& divide_upper_shift_out);
// Never an identity conversion - can always write conditional move instructions
// to shaders that will be no-ops for conversion from guest to host samples.
// While we don't know the exact guest sample pattern, due to the way

View File

@ -120,80 +120,49 @@ void DxbcShaderTranslator::ExportToMemory() {
a_.OpIf(true, dxbc::Src::R(control_temp, dxbc::Src::kXXXX));
// Check more fine-grained limitations.
// The flag in control_temp.x can be 0 or 1 for simplicity, not necessarily
// 0 or 0xFFFFFFFF.
bool inner_condition_provided = false;
if (is_pixel_shader()) {
uint32_t resolution_scaled_axes =
uint32_t(draw_resolution_scale_x_ > 1) |
(uint32_t(draw_resolution_scale_y_ > 1) << 1);
if (resolution_scaled_axes) {
// Only do memexport for one host pixel in a guest pixel.
// For 2x - pixel 1 because it's covered with half-pixel offset that
// becomes full-pixel.
// For 3x - also pixel 1 because it's still covered with half-pixel
// offset, but close to the center.
// If X needs resolution scaling, writing 1 or 0 - whether the column is
// the one where memexport should be done - to control_temp.y.
// For Y, doing that to control_temp.z.
// Then, if both axes are resolution-scaled, merging the conditions for
// the two.
// Only do memexport for one host pixel in a guest pixel - prefer the
// host pixel closer to the center of the guest pixel, but one that's
// covered with the half-pixel offset according to the top-left rule (1
// for 2x because 0 isn't covered with the half-pixel offset, 1 for 3x
// because it's the center and is covered with the half-pixel offset too).
// Using control_temp.yz as per-axis temporary variables.
in_position_used_ |= resolution_scaled_axes;
a_.OpFToU(
dxbc::Dest::R(control_temp, resolution_scaled_axes << 1),
dxbc::Src::V1D(uint32_t(InOutRegister::kPSInPosition), 0b0100 << 2));
dxbc::Dest resolution_scaling_temp_dest(
dxbc::Dest::R(control_temp, 0b1000));
dxbc::Src resolution_scaling_temp_src(
dxbc::Src::R(control_temp, dxbc::Src::kWWWW));
a_.OpUDiv(dxbc::Dest::Null(),
dxbc::Dest::R(control_temp, resolution_scaled_axes << 1),
dxbc::Src::R(control_temp, 0b1001 << 2),
dxbc::Src::LU(0, draw_resolution_scale_x_,
draw_resolution_scale_y_, 0));
for (uint32_t i = 0; i < 2; ++i) {
if (!(resolution_scaled_axes & (1 << i))) {
continue;
}
// If there's no inner condition in control_temp.x yet, the condition
// for the current axis can go directly to it. Otherwise, need to merge
// with the previous condition, using control_temp.w as an intermediate
// variable.
dxbc::Dest resolution_scaled_axis_result(
inner_condition_provided ? resolution_scaling_temp_dest
: dxbc::Dest::R(control_temp, 0b0001));
// with the previous condition, using control_temp.y or .z as an
// intermediate variable.
dxbc::Src resolution_scaled_axis_src(
dxbc::Src::R(control_temp).Select(1 + i));
uint32_t axis_resolution_scale =
i ? draw_resolution_scale_y_ : draw_resolution_scale_x_;
static_assert(
TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3,
"DxbcShaderTranslator memexport draw resolution scaling "
"conditional generation supports draw resolution scaling factors "
"of only up to 3");
switch (axis_resolution_scale) {
case 2:
// xy & 1 == 1.
a_.OpAnd(resolution_scaled_axis_result, resolution_scaled_axis_src,
dxbc::Src::LU(1));
// No need to do IEq - already 1 for right / bottom, 0 for left /
// top.
break;
case 3:
// xy % 3 == 1.
a_.OpUMul(resolution_scaling_temp_dest, dxbc::Dest::Null(),
resolution_scaled_axis_src,
dxbc::Src::LU(draw_util::kDivideScale3));
a_.OpUShR(resolution_scaling_temp_dest, resolution_scaling_temp_src,
dxbc::Src::LU(draw_util::kDivideUpperShift3));
a_.OpIMAd(resolution_scaling_temp_dest, resolution_scaling_temp_src,
dxbc::Src::LI(-3), resolution_scaled_axis_src);
a_.OpIEq(resolution_scaled_axis_result, resolution_scaling_temp_src,
dxbc::Src::LU(1));
break;
default:
assert_unhandled_case(axis_resolution_scale);
}
a_.OpIEq(
dxbc::Dest::R(control_temp,
inner_condition_provided ? 1 << (1 + i) : 0b0001),
resolution_scaled_axis_src,
dxbc::Src::LU(
(i ? draw_resolution_scale_y_ : draw_resolution_scale_x_) >>
1));
if (inner_condition_provided) {
// Merge with the previous condition in control_temp.x.
a_.OpAnd(dxbc::Dest::R(control_temp, 0b0001),
dxbc::Src::R(control_temp, dxbc::Src::kXXXX),
resolution_scaling_temp_src);
resolution_scaled_axis_src);
}
inner_condition_provided = true;
}

View File

@ -190,91 +190,22 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
// dividing by 40, not by 80.
// For depth-only:
// Same, but for full 80x16 tiles, not 40x16 half-tiles.
uint32_t tile_or_half_tile_width = 80 * draw_resolution_scale_x_;
uint32_t tile_or_half_tile_width_divide_scale;
uint32_t tile_or_half_tile_width_divide_upper_shift;
draw_util::GetEdramTileWidthDivideScaleAndUpperShift(
draw_resolution_scale_x_, tile_or_half_tile_width_divide_scale,
tile_or_half_tile_width_divide_upper_shift);
if (any_color_targets_written) {
tile_or_half_tile_width >>= 1;
assert_not_zero(tile_or_half_tile_width_divide_upper_shift);
--tile_or_half_tile_width_divide_upper_shift;
}
static_assert(
TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3,
"DxbcShaderTranslator ROV sample address calculation supports Y draw "
"resolution scaling factors of only up to 3");
if (draw_resolution_scale_y_ == 3) {
// Multiplication part of the division by 40|80 x 16 x scale (specifically
// 40|80 * scale width here, and 48 height, or 16 * 3 height).
// system_temp_rov_params_.x = X sample 0 position
// system_temp_rov_params_.y = Y sample 0 position
// system_temp_rov_params_.z = (X * tile_or_half_tile_width_divide_scale) >>
// 32
// system_temp_rov_params_.w = (Y * kDivideScale3) >> 32
a_.OpUMul(dxbc::Dest::R(system_temp_rov_params_, 0b1100),
dxbc::Dest::Null(),
dxbc::Src::R(system_temp_rov_params_, 0b0100 << 4),
dxbc::Src::LU(0, 0, tile_or_half_tile_width_divide_scale,
draw_util::kDivideScale3));
// Shift part of the division by 40|80 x 16 x scale.
// system_temp_rov_params_.x = X sample 0 position
// system_temp_rov_params_.y = Y sample 0 position
// system_temp_rov_params_.z = X half-tile or tile position
// system_temp_rov_params_.w = Y tile position
a_.OpUShR(dxbc::Dest::R(system_temp_rov_params_, 0b1100),
dxbc::Src::R(system_temp_rov_params_),
dxbc::Src::LU(0, 0, tile_or_half_tile_width_divide_upper_shift,
draw_util::kDivideUpperShift3 + 4));
// Take the remainder of the performed division to
// system_temp_rov_params_.xy.
// system_temp_rov_params_.x = X sample 0 position within the half-tile
// system_temp_rov_params_.y = Y sample 0 position within the (half-)tile
// system_temp_rov_params_.z = X half-tile or tile position
// system_temp_rov_params_.w = Y tile position
a_.OpIMAd(dxbc::Dest::R(system_temp_rov_params_, 0b0011),
dxbc::Src::R(system_temp_rov_params_, 0b1110),
dxbc::Src::LI(-int32_t(tile_or_half_tile_width),
-16 * draw_resolution_scale_y_, 0, 0),
dxbc::Src::R(system_temp_rov_params_));
} else {
assert_true(draw_resolution_scale_y_ <= 2);
// Multiplication part of the division of X by 40|80 * scale.
// system_temp_rov_params_.x = X sample 0 position
// system_temp_rov_params_.y = Y sample 0 position
// system_temp_rov_params_.z = (X * tile_or_half_tile_width_divide_scale) >>
// 32
a_.OpUMul(dxbc::Dest::R(system_temp_rov_params_, 0b0100),
dxbc::Dest::Null(),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX),
dxbc::Src::LU(tile_or_half_tile_width_divide_scale));
// Shift part of the division of X by 40 * scale, division of Y by
// 16 * scale as it's power of two in this case.
// system_temp_rov_params_.x = X sample 0 position
// system_temp_rov_params_.y = Y sample 0 position
// system_temp_rov_params_.z = X half-tile or tile position
// system_temp_rov_params_.w = Y tile position
a_.OpUShR(dxbc::Dest::R(system_temp_rov_params_, 0b1100),
dxbc::Src::R(system_temp_rov_params_, 0b0110 << 4),
dxbc::Src::LU(0, 0, tile_or_half_tile_width_divide_upper_shift,
draw_resolution_scale_y_ == 2 ? 5 : 4));
// Take the remainder of the performed division (via multiply-subtract for
// X, via AND for Y which is power-of-two here) to
// system_temp_rov_params_.xy.
// system_temp_rov_params_.x = X sample 0 position within the half-tile or
// tile
// system_temp_rov_params_.y = Y sample 0 position within the (half-)tile
// system_temp_rov_params_.z = X half-tile or tile position
// system_temp_rov_params_.w = Y tile position
a_.OpIMAd(dxbc::Dest::R(system_temp_rov_params_, 0b0001),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kZZZZ),
dxbc::Src::LI(-int32_t(tile_or_half_tile_width)),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX));
a_.OpAnd(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
dxbc::Src::LU((16 * draw_resolution_scale_y_) - 1));
}
uint32_t tile_width =
xenos::kEdramTileWidthSamples * draw_resolution_scale_x_;
uint32_t tile_or_tile_half_width =
tile_width >> uint32_t(any_color_targets_written);
uint32_t tile_height =
xenos::kEdramTileHeightSamples * draw_resolution_scale_y_;
// system_temp_rov_params_.x = X sample 0 position within the half-tile or
// tile
// system_temp_rov_params_.y = Y sample 0 position within the (half-)tile
// system_temp_rov_params_.z = X half-tile or tile position
// system_temp_rov_params_.w = Y tile position
a_.OpUDiv(dxbc::Dest::R(system_temp_rov_params_, 0b1100),
dxbc::Dest::R(system_temp_rov_params_, 0b0011),
dxbc::Src::R(system_temp_rov_params_, 0b01000100),
dxbc::Src::LU(tile_or_tile_half_width, tile_height,
tile_or_tile_half_width, tile_height));
// Convert the Y sample 0 position within the half-tile or tile to the dword
// offset of the row within a 80x16 32bpp tile or a 40x16 64bpp half-tile to
@ -287,8 +218,10 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
// system_temp_rov_params_.w = Y tile position
a_.OpUMul(dxbc::Dest::Null(), dxbc::Dest::R(system_temp_rov_params_, 0b0010),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
dxbc::Src::LU(80 * draw_resolution_scale_x_));
dxbc::Src::LU(tile_width));
uint32_t tile_size = tile_width * tile_height;
uint32_t tile_half_width = tile_width >> 1;
if (any_color_targets_written) {
// Depth, 32bpp color, 64bpp color are all needed.
@ -336,12 +269,10 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
// system_temp_rov_params_.w = Y tile row dword origin in a 32bpp surface
// rov_address_temp.x = dword offset of the beginning of the row of samples
// within a row of 32bpp tiles
a_.OpUMAd(
dxbc::Dest::R(rov_address_temp, 0b0001),
dxbc::Src::R(rov_address_temp, dxbc::Src::kXXXX),
dxbc::Src::LU(80 * 16 *
(draw_resolution_scale_x_ * draw_resolution_scale_y_)),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY));
a_.OpUMAd(dxbc::Dest::R(rov_address_temp, 0b0001),
dxbc::Src::R(rov_address_temp, dxbc::Src::kXXXX),
dxbc::Src::LU(tile_size),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY));
// Get the dword offset of the beginning of the row of samples within a
// 32bpp surface to rov_address_temp.x.
// system_temp_rov_params_.x = X sample 0 position within the half-tile
@ -365,12 +296,10 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
// system_temp_rov_params_.w = Y tile row dword origin in a 32bpp surface
// rov_address_temp.x = dword offset of the beginning of the row of samples
// within a 32bpp surface
a_.OpUMAd(
dxbc::Dest::R(system_temp_rov_params_, 0b0010),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kZZZZ),
dxbc::Src::LU(80 * 16 *
(draw_resolution_scale_x_ * draw_resolution_scale_y_)),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY));
a_.OpUMAd(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kZZZZ),
dxbc::Src::LU(tile_size),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY));
// Get the dword offset of the beginning of the row of samples within a
// 64bpp surface to system_temp_rov_params_.w (last time the Y tile row
// offset is needed).
@ -420,7 +349,7 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
// within a 32bpp surface
a_.OpUMAd(dxbc::Dest::R(system_temp_rov_params_, 0b0100),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
dxbc::Src::LU(40 * draw_resolution_scale_x_),
dxbc::Src::LU(tile_half_width),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX));
// Get the final offset of the sample 0 within a 32bpp color surface to
// system_temp_rov_params_.z (last time the 32bpp row offset is needed).
@ -439,8 +368,8 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
// system_temp_rov_params_.w = dword sample 0 offset within a 64bpp surface
a_.OpMovC(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
dxbc::Src::LI(-40 * draw_resolution_scale_x_),
dxbc::Src::LI(40 * draw_resolution_scale_x_));
dxbc::Src::LI(-int32_t(tile_half_width)),
dxbc::Src::LI(int32_t(tile_half_width)));
// Flip the 40x16 half-tiles for depth / stencil as opposed to 32bpp color -
// get the final offset of the sample 0 within a 32bpp depth / stencil
// surface to system_temp_rov_params_.y.
@ -466,12 +395,10 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
// system_temp_rov_params_.z = dword offset of the beginning of the row of
// samples within a row of 32bpp tiles
// system_temp_rov_params_.w = Y tile position
a_.OpUMAd(
dxbc::Dest::R(system_temp_rov_params_, 0b0100),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kZZZZ),
dxbc::Src::LU(80 * 16 *
(draw_resolution_scale_x_ * draw_resolution_scale_y_)),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY));
a_.OpUMAd(dxbc::Dest::R(system_temp_rov_params_, 0b0100),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kZZZZ),
dxbc::Src::LU(tile_size),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY));
// Get the dword offset of the beginning of the row of samples within a
// 32bpp surface to system_temp_rov_params_.y (last time anything Y-related
// is needed, as well as the sample row offset within the tile row).
@ -502,15 +429,15 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
// otherwise
a_.OpUGE(dxbc::Dest::R(system_temp_rov_params_, 0b0001),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX),
dxbc::Src::LU(40 * draw_resolution_scale_x_));
dxbc::Src::LU(tile_half_width));
// Flip the 40x16 half-tiles for depth / stencil as opposed to 32bpp color -
// get the dword offset to add for flipping to system_temp_rov_params_.x.
// system_temp_rov_params_.x = depth half-tile flipping offset
// system_temp_rov_params_.y = dword sample 0 offset within a 32bpp surface
a_.OpMovC(dxbc::Dest::R(system_temp_rov_params_, 0b0001),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX),
dxbc::Src::LI(-40 * draw_resolution_scale_x_),
dxbc::Src::LI(40 * draw_resolution_scale_x_));
dxbc::Src::LI(-int32_t(tile_half_width)),
dxbc::Src::LI(int32_t(tile_half_width)));
// Flip the 40x16 half-tiles for depth / stencil as opposed to 32bpp color -
// get the final offset of the sample 0 within a 32bpp depth / stencil
// surface to system_temp_rov_params_.y.
@ -1288,10 +1215,12 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() {
// Go to the next sample (samples are at +0, +(80*scale_x), +1,
// +(80*scale_x+1), so need to do +(80*scale_x), -(80*scale_x-1),
// +(80*scale_x) and -(80*scale_x+1) after each sample).
uint32_t tile_width =
xenos::kEdramTileWidthSamples * draw_resolution_scale_x_;
a_.OpIAdd(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
dxbc::Src::LI((i & 1) ? -80 * draw_resolution_scale_x_ + 2 - i
: 80 * draw_resolution_scale_x_));
dxbc::Src::LI((i & 1) ? -int32_t(tile_width) + 2 - i
: int32_t(tile_width)));
}
if (ROV_IsDepthStencilEarly()) {
@ -2181,6 +2110,9 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
dxbc::Dest temp_w_dest(dxbc::Dest::R(temp, 0b1000));
dxbc::Src temp_w_src(dxbc::Src::R(temp, dxbc::Src::kWWWW));
uint32_t tile_width =
xenos::kEdramTileWidthSamples * draw_resolution_scale_x_;
// Do late depth/stencil test (which includes writing) if needed or deferred
// depth writing.
if (ROV_IsDepthStencilEarly()) {
@ -2212,8 +2144,8 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
if (i < 3) {
a_.OpIAdd(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
dxbc::Src::LI((i & 1) ? -80 * draw_resolution_scale_x_ + 2 - i
: 80 * draw_resolution_scale_x_));
dxbc::Src::LI((i & 1) ? -int32_t(tile_width) + 2 - i
: int32_t(tile_width)));
}
}
} else {
@ -3021,8 +2953,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
// +(80*scale_x+1), so need to do +(80*scale_x), -(80*scale_x-1),
// +(80*scale_x) and -(80*scale_x+1) after each sample).
int32_t next_sample_distance =
(j & 1) ? -80 * draw_resolution_scale_x_ + 2 - j
: 80 * draw_resolution_scale_x_;
(j & 1) ? -int32_t(tile_width) + 2 - j : int32_t(tile_width);
a_.OpIAdd(
dxbc::Dest::R(system_temp_rov_params_, 0b1100),
dxbc::Src::R(system_temp_rov_params_),