[D3D12] Use udiv by constant tile size + minor transfer cleanup

Drivers compile that to a multiplication and a shift anyway.
This commit is contained in:
Triang3l 2022-06-20 22:39:30 +03:00
parent 207e11c8d2
commit e2f632f8fa
5 changed files with 125 additions and 366 deletions

View File

@ -2918,73 +2918,29 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
uint32_t draw_resolution_scale_x = this->draw_resolution_scale_x(); uint32_t draw_resolution_scale_x = this->draw_resolution_scale_x();
uint32_t draw_resolution_scale_y = this->draw_resolution_scale_y(); uint32_t draw_resolution_scale_y = this->draw_resolution_scale_y();
uint32_t tile_width_samples_scaled = uint32_t tile_width_samples =
xenos::kEdramTileWidthSamples * draw_resolution_scale_x; xenos::kEdramTileWidthSamples * draw_resolution_scale_x;
uint32_t tile_height_samples_scaled = uint32_t tile_height_samples =
xenos::kEdramTileHeightSamples * draw_resolution_scale_y; xenos::kEdramTileHeightSamples * draw_resolution_scale_y;
// Split the destination pixel index into 32bpp tile in r0.z and // Split the destination pixel index into 32bpp tile in r0.zw and
// 32bpp-tile-relative pixel index in r0.xy. // 32bpp-tile-relative pixel index in r0.xy.
// r0.xy = pixel XY as uint // r0.xy = pixel XY as uint
a.OpFToU(dxbc::Dest::R(0, 0b0011), dxbc::Src::V1D(kInputRegisterPosition)); a.OpFToU(dxbc::Dest::R(0, 0b0011), dxbc::Src::V1D(kInputRegisterPosition));
uint32_t dest_sample_width_log2 = uint32_t dest_tile_width_pixels =
uint32_t(dest_is_64bpp) + tile_width_samples >>
uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k4X); (uint32_t(dest_is_64bpp) +
uint32_t dest_sample_height_log2 = uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k4X));
uint32_t dest_tile_height_pixels =
tile_height_samples >>
uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k2X); uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k2X);
uint32_t dest_tile_width_divide_scale, dest_tile_width_divide_upper_shift; // r0.xy = destination pixel XY index within the 32bpp tile
draw_util::GetEdramTileWidthDivideScaleAndUpperShift( // r0.zw = 32bpp tile XY index
draw_resolution_scale_x, dest_tile_width_divide_scale, a.OpUDiv(dxbc::Dest::R(0, 0b1100), dxbc::Dest::R(0, 0b0011),
dest_tile_width_divide_upper_shift); dxbc::Src::R(0, 0b01000100),
assert_true(dest_tile_width_divide_upper_shift >= dest_sample_width_log2); dxbc::Src::LU(dest_tile_width_pixels, dest_tile_height_pixels,
// Need the host tile size in pixels, not samples. dest_tile_width_pixels, dest_tile_height_pixels));
dest_tile_width_divide_upper_shift -= dest_sample_width_log2;
static_assert(
TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3,
"D3D12RenderTargetCache EDRAM range ownership transfer shader generation "
"supports Y draw resolution scaling factors of only up to 3");
if (draw_resolution_scale_y == 3) {
// r0.zw = upper 32 bits in the division process of pixel XY by pixel count
// in a 32bpp tile
a.OpUMul(dxbc::Dest::R(0, 0b1100), dxbc::Dest::Null(),
dxbc::Src::R(0, 0b0100 << 4),
dxbc::Src::LU(0, 0, dest_tile_width_divide_scale,
draw_util::kDivideScale3));
// r0.zw = 32bpp tile XY index
a.OpUShR(dxbc::Dest::R(0, 0b1100), dxbc::Src::R(0),
dxbc::Src::LU(
0, 0, dest_tile_width_divide_upper_shift,
draw_util::kDivideUpperShift3 + 4 - dest_sample_height_log2));
// r0.xy = destination pixel XY index within the 32bpp tile
a.OpIMAd(
dxbc::Dest::R(0, 0b0011), dxbc::Src::R(0, 0b1110),
dxbc::Src::LI(
-int32_t((80 * draw_resolution_scale_x) >> dest_sample_width_log2),
-int32_t((16 * draw_resolution_scale_y) >> dest_sample_height_log2),
0, 0),
dxbc::Src::R(0, 0b0100));
} else {
assert_true(draw_resolution_scale_y <= 2);
uint32_t dest_tile_height_pixels_log2 =
(draw_resolution_scale_y == 2 ? 5 : 4) - dest_sample_height_log2;
// r0.z = upper 32 bits in the division process of pixel X by pixel count in
// a 32bpp tile
a.OpUMul(dxbc::Dest::R(0, 0b0100), dxbc::Dest::Null(),
dxbc::Src::R(0, dxbc::Src::kXXXX),
dxbc::Src::LU(dest_tile_width_divide_scale));
// r0.zw = 32bpp tile XY index
a.OpUShR(dxbc::Dest::R(0, 0b1100), dxbc::Src::R(0, 0b0110 << 4),
dxbc::Src::LU(0, 0, dest_tile_width_divide_upper_shift,
dest_tile_height_pixels_log2));
// r0.x = destination pixel X index within the 32bpp tile
a.OpIMAd(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(0, dxbc::Src::kZZZZ),
dxbc::Src::LI(-int32_t((80 * draw_resolution_scale_x) >>
dest_sample_width_log2)),
dxbc::Src::R(0, dxbc::Src::kXXXX));
// r0.y = destination pixel Y index within the 32bpp tile
a.OpAnd(dxbc::Dest::R(0, 0b0010), dxbc::Src::R(0, dxbc::Src::kYYYY),
dxbc::Src::LU((1 << dest_tile_height_pixels_log2) - 1));
}
// r1.x = destination pitch in 32bpp tiles // r1.x = destination pitch in 32bpp tiles
a.OpUBFE(dxbc::Dest::R(1, 0b0001), dxbc::Src::LU(xenos::kEdramPitchTilesBits), a.OpUBFE(dxbc::Dest::R(1, 0b0001), dxbc::Src::LU(xenos::kEdramPitchTilesBits),
dxbc::Src::LU(0), dxbc::Src::LU(0),
@ -3305,7 +3261,7 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
// Copying between color and depth / stencil - swap 40-32bpp-sample columns // Copying between color and depth / stencil - swap 40-32bpp-sample columns
// in the pixel index within the source 32bpp tile using r1.w as temporary. // in the pixel index within the source 32bpp tile using r1.w as temporary.
uint32_t source_32bpp_tile_half_pixels = uint32_t source_32bpp_tile_half_pixels =
tile_width_samples_scaled >> (1 + source_pixel_width_dwords_log2); tile_width_samples >> (1 + source_pixel_width_dwords_log2);
a.OpULT(dxbc::Dest::R(1, 0b1000), a.OpULT(dxbc::Dest::R(1, 0b1000),
dxbc::Src::R(source_tile_pixel_x_reg, dxbc::Src::kXXXX), dxbc::Src::R(source_tile_pixel_x_reg, dxbc::Src::kXXXX),
dxbc::Src::LU(source_32bpp_tile_half_pixels)); dxbc::Src::LU(source_32bpp_tile_half_pixels));
@ -3348,18 +3304,17 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
// r1.x = pixel X within the source texture // r1.x = pixel X within the source texture
// r2.x = free // r2.x = free
a.OpUMAd(dxbc::Dest::R(1, 0b0001), a.OpUMAd(dxbc::Dest::R(1, 0b0001),
dxbc::Src::LU(tile_width_samples_scaled >> dxbc::Src::LU(tile_width_samples >> source_pixel_width_dwords_log2),
source_pixel_width_dwords_log2),
dxbc::Src::R(2, dxbc::Src::kXXXX), dxbc::Src::R(2, dxbc::Src::kXXXX),
dxbc::Src::R(source_tile_pixel_x_reg, dxbc::Src::kXXXX)); dxbc::Src::R(source_tile_pixel_x_reg, dxbc::Src::kXXXX));
// r1.y = pixel Y within the source texture // r1.y = pixel Y within the source texture
// r1.w = free // r1.w = free
a.OpUMAd(dxbc::Dest::R(1, 0b0010), a.OpUMAd(
dxbc::Src::LU( dxbc::Dest::R(1, 0b0010),
tile_height_samples_scaled >> dxbc::Src::LU(tile_height_samples >> uint32_t(key.source_msaa_samples >=
uint32_t(key.source_msaa_samples >= xenos::MsaaSamples::k2X)), xenos::MsaaSamples::k2X)),
dxbc::Src::R(1, dxbc::Src::kWWWW), dxbc::Src::R(1, dxbc::Src::kWWWW),
dxbc::Src::R(source_tile_pixel_y_reg, dxbc::Src::kYYYY)); dxbc::Src::R(source_tile_pixel_y_reg, dxbc::Src::kYYYY));
// Load the source to r1, or, for 32bpp | 32bpp -> 64bpp, the first dword to // Load the source to r1, or, for 32bpp | 32bpp -> 64bpp, the first dword to
// r0 since addressing will not be needed anymore for color, and the second // r0 since addressing will not be needed anymore for color, and the second
@ -3575,9 +3530,9 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
for (uint32_t i = 0; i < 2; ++i) { for (uint32_t i = 0; i < 2; ++i) {
switch (source_depth_format) { switch (source_depth_format) {
case xenos::DepthRenderTargetFormat::kD24S8: { case xenos::DepthRenderTargetFormat::kD24S8: {
// Round to the nearest even integer. This seems to be the correct, // Round to the nearest even integer. This seems to be the correct
// adding +0.5 and rounding towards zero results in red instead of // conversion, adding +0.5 and rounding towards zero results in red
// black in the 4D5307E6 clear shader. // instead of black in the 4D5307E6 clear shader.
a.OpMul(dxbc::Dest::R(i, 0b1000), dxbc::Src::R(i, dxbc::Src::kWWWW), a.OpMul(dxbc::Dest::R(i, 0b1000), dxbc::Src::R(i, dxbc::Src::kWWWW),
dxbc::Src::LF(float(0xFFFFFF))); dxbc::Src::LF(float(0xFFFFFF)));
a.OpRoundNE(dxbc::Dest::R(i, 0b1000), a.OpRoundNE(dxbc::Dest::R(i, 0b1000),
@ -3762,9 +3717,9 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
depth_loaded_in_guest_format = true; depth_loaded_in_guest_format = true;
switch (source_depth_format) { switch (source_depth_format) {
case xenos::DepthRenderTargetFormat::kD24S8: { case xenos::DepthRenderTargetFormat::kD24S8: {
// Round to the nearest even integer. This seems to be the correct, // Round to the nearest even integer. This seems to be the correct
// adding +0.5 and rounding towards zero results in red instead of // conversion, adding +0.5 and rounding towards zero results in red
// black in the 4D5307E6 clear shader. // instead of black in the 4D5307E6 clear shader.
a.OpMul(dxbc::Dest::R(1, 0b1000), dxbc::Src::R(1, dxbc::Src::kWWWW), a.OpMul(dxbc::Dest::R(1, 0b1000), dxbc::Src::R(1, dxbc::Src::kWWWW),
dxbc::Src::LF(float(0xFFFFFF))); dxbc::Src::LF(float(0xFFFFFF)));
a.OpRoundNE(dxbc::Dest::R(1, 0b1000), a.OpRoundNE(dxbc::Dest::R(1, 0b1000),
@ -3920,12 +3875,11 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
// Combine the tile sample index and the tile index into buffer // Combine the tile sample index and the tile index into buffer
// address to r0.x. // address to r0.x.
a.OpUMAd(dxbc::Dest::R(0, 0b0001), a.OpUMAd(dxbc::Dest::R(0, 0b0001),
dxbc::Src::LU(tile_width_samples_scaled), dxbc::Src::LU(tile_width_samples),
dxbc::Src::R(0, dxbc::Src::kYYYY), dxbc::Src::R(0, dxbc::Src::kYYYY),
dxbc::Src::R(0, dxbc::Src::kXXXX)); dxbc::Src::R(0, dxbc::Src::kXXXX));
a.OpUMAd(dxbc::Dest::R(0, 0b0001), a.OpUMAd(dxbc::Dest::R(0, 0b0001),
dxbc::Src::LU(tile_width_samples_scaled * dxbc::Src::LU(tile_width_samples * tile_height_samples),
tile_height_samples_scaled),
dxbc::Src::R(0, dxbc::Src::kZZZZ), dxbc::Src::R(0, dxbc::Src::kZZZZ),
dxbc::Src::R(0, dxbc::Src::kXXXX)); dxbc::Src::R(0, dxbc::Src::kXXXX));
// Load from the buffer. // Load from the buffer.
@ -4102,7 +4056,7 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
// r1.x = free // r1.x = free
a.OpUMAd( a.OpUMAd(
dxbc::Dest::R(0, 0b0001), dxbc::Dest::R(0, 0b0001),
dxbc::Src::LU(tile_width_samples_scaled >> dxbc::Src::LU(tile_width_samples >>
uint32_t(key.host_depth_source_msaa_samples >= uint32_t(key.host_depth_source_msaa_samples >=
xenos::MsaaSamples::k4X)), xenos::MsaaSamples::k4X)),
dxbc::Src::R(1, dxbc::Src::kXXXX), dxbc::Src::R(1, dxbc::Src::kXXXX),
@ -4111,7 +4065,7 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
// r0.z = free // r0.z = free
a.OpUMAd( a.OpUMAd(
dxbc::Dest::R(0, 0b0010), dxbc::Dest::R(0, 0b0010),
dxbc::Src::LU(tile_height_samples_scaled >> dxbc::Src::LU(tile_height_samples >>
uint32_t(key.host_depth_source_msaa_samples >= uint32_t(key.host_depth_source_msaa_samples >=
xenos::MsaaSamples::k2X)), xenos::MsaaSamples::k2X)),
dxbc::Src::R(0, dxbc::Src::kZZZZ), dxbc::Src::R(0, dxbc::Src::kZZZZ),
@ -5933,97 +5887,42 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline(
// 32bpp is unknown, treating 64bpp tiles as storing 40x16 samples rather than // 32bpp is unknown, treating 64bpp tiles as storing 40x16 samples rather than
// 80x16 for simplicity of addressing into the texture. // 80x16 for simplicity of addressing into the texture.
// Get the parts of the address along Y - tile row index within the dispatch uint32_t tile_width =
// to r0.w, sample Y within the tile to r0.y. (xenos::kEdramTileWidthSamples * draw_resolution_scale_x) >>
static_assert( uint32_t(format_is_64bpp);
TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3, uint32_t tile_height =
"D3D12RenderTargetCache render target dump shader generation supports Y " xenos::kEdramTileHeightSamples * draw_resolution_scale_y;
"draw resolution scaling factors of only up to 3");
if (draw_resolution_scale_y == 3) {
// Multiplication part of the division by the (16 * scale) tile height,
// specifically 48 here, or 16 * 3.
// r0.w = (Y * kDivideScale3) >> 32
a.OpUMul(dxbc::Dest::R(0, 0b1000), dxbc::Dest::Null(),
dxbc::Src::VThreadID(dxbc::Src::kYYYY),
dxbc::Src::LU(draw_util::kDivideScale3));
// Shift part of the division by 16 * scale.
// r0.w = Y tile position
a.OpUShR(dxbc::Dest::R(0, 0b1000), dxbc::Src::R(0, dxbc::Src::kWWWW),
dxbc::Src::LU(draw_util::kDivideUpperShift3 + 4));
// Take the remainder of the performed division to r0.y.
// r0.y = Y sample position within the tile
// r0.w = Y tile position
a.OpIMAd(dxbc::Dest::R(0, 0b0010), dxbc::Src::R(0, dxbc::Src::kWWWW),
dxbc::Src::LI(-16 * draw_resolution_scale_y),
dxbc::Src::VThreadID(dxbc::Src::kYYYY));
} else {
assert_true(draw_resolution_scale_y <= 2);
// Tile height is a power of two, can use bit operations.
// Get the tile row index into r0.w.
// r0.w = Y tile position.
a.OpUShR(dxbc::Dest::R(0, 0b1000), dxbc::Src::VThreadID(dxbc::Src::kYYYY),
dxbc::Src::LU(draw_resolution_scale_y == 2 ? 5 : 4));
// Get the Y sample position within the tile into r0.y.
// r0.y = Y sample position within the tile
// r0.w = Y tile position
a.OpAnd(dxbc::Dest::R(0, 0b0010), dxbc::Src::VThreadID(dxbc::Src::kYYYY),
dxbc::Src::LU((16 * draw_resolution_scale_y) - 1));
}
// Get the X tile offset within the dispatch to r0.z. // Get the parts of the address - tile row index within the dispatch to r0.zw,
uint32_t tile_width = xenos::kEdramTileWidthSamples * draw_resolution_scale_x; // sample Y within the tile to r0.xy.
uint32_t tile_width_divide_scale; // r0.x = X sample position within the tile
uint32_t tile_width_divide_upper_shift;
draw_util::GetEdramTileWidthDivideScaleAndUpperShift(
draw_resolution_scale_x, tile_width_divide_scale,
tile_width_divide_upper_shift);
if (format_is_64bpp) {
tile_width >>= 1;
assert_not_zero(tile_width_divide_upper_shift);
--tile_width_divide_upper_shift;
}
// Multiplication part of the division by 80|40 * scale.
// r0.y = Y sample position within the tile
// r0.z = (X * tile_width_divide_scale) >> 32
// r0.w = Y tile position
a.OpUMul(dxbc::Dest::R(0, 0b0100), dxbc::Dest::Null(),
dxbc::Src::VThreadID(dxbc::Src::kXXXX),
dxbc::Src::LU(tile_width_divide_scale));
// Shift part of the division by 80|40 * scale.
// r0.y = Y sample position within the tile // r0.y = Y sample position within the tile
// r0.z = X tile position // r0.z = X tile position
// r0.w = Y tile position // r0.w = Y tile position
a.OpUShR(dxbc::Dest::R(0, 0b0100), dxbc::Src::R(0, dxbc::Src::kZZZZ), a.OpUDiv(dxbc::Dest::R(0, 0b1100), dxbc::Dest::R(0, 0b0011),
dxbc::Src::LU(tile_width_divide_upper_shift)); dxbc::Src::VThreadID(0b01000100),
dxbc::Src::LU(tile_width, tile_height, tile_width, tile_height));
// Extract the dump rectangle tile row pitch to r0.x. // Extract the dump rectangle tile row pitch to r1.x.
// r0.x = dump rectangle pitch in tiles // r0.x = X sample position within the tile
// r0.y = Y sample position within the tile // r0.y = Y sample position within the tile
// r0.z = X tile position // r0.z = X tile position
// r0.w = Y tile position // r0.w = Y tile position
a.OpUBFE(dxbc::Dest::R(0, 0b0001), dxbc::Src::LU(xenos::kEdramPitchTilesBits), // r1.x = dump rectangle pitch in tiles
a.OpUBFE(dxbc::Dest::R(1, 0b0001), dxbc::Src::LU(xenos::kEdramPitchTilesBits),
dxbc::Src::LU(0), dxbc::Src::LU(0),
dxbc::Src::CB(kDumpCbufferPitches, kDumpCbufferPitches, 0, dxbc::Src::CB(kDumpCbufferPitches, kDumpCbufferPitches, 0,
dxbc::Src::kXXXX)); dxbc::Src::kXXXX));
// Get the tile index in the EDRAM relative to the dump rectangle base tile to // Get the tile index in the EDRAM relative to the dump rectangle base tile to
// r0.w. // r0.w.
// r0.x = free
// r0.y = Y sample position within the tile
// r0.z = X tile position
// r0.w = tile index relative to the dump rectangle base
a.OpUMAd(dxbc::Dest::R(0, 0b1000), dxbc::Src::R(0, dxbc::Src::kWWWW),
dxbc::Src::R(0, dxbc::Src::kXXXX),
dxbc::Src::R(0, dxbc::Src::kZZZZ));
// Take the X sample index within the tile as the remainder of the division of
// the thread index by tile width to r0.x.
// r0.x = X sample position within the tile // r0.x = X sample position within the tile
// r0.y = Y sample position within the tile // r0.y = Y sample position within the tile
// r0.z = free // r0.z = free
// r0.w = tile index relative to the dump rectangle base // r0.w = tile index relative to the dump rectangle base
a.OpIMAd(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(0, dxbc::Src::kZZZZ), // r1.x = free
dxbc::Src::LI(-int32_t(tile_width)), a.OpUMAd(dxbc::Dest::R(0, 0b1000), dxbc::Src::R(0, dxbc::Src::kWWWW),
dxbc::Src::VThreadID(dxbc::Src::kXXXX)); dxbc::Src::R(1, dxbc::Src::kXXXX),
dxbc::Src::R(0, dxbc::Src::kZZZZ));
// Extract the index of the first tile of the dispatch in the EDRAM to r0.z. // Extract the index of the first tile of the dispatch in the EDRAM to r0.z.
// r0.x = X sample position within the tile // r0.x = X sample position within the tile
@ -6053,7 +5952,7 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline(
xenos::kEdramTileHeightSamples), xenos::kEdramTileHeightSamples),
dxbc::Src::R(0, dxbc::Src::kXXXX)); dxbc::Src::R(0, dxbc::Src::kXXXX));
// Add the contribution of the Y sample position within the tile to the sample // Add the contribution of the Y sample position within the tile to the sample
// address in the EDRAM to r0.w. // address in the EDRAM to r0.z.
// r0.x = X sample position within the tile // r0.x = X sample position within the tile
// r0.y = Y sample position within the tile // r0.y = Y sample position within the tile
// r0.z = sample offset in the EDRAM without the depth column swapping // r0.z = sample offset in the EDRAM without the depth column swapping
@ -6119,7 +6018,6 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline(
dxbc::Src::CB(kDumpCbufferPitches, kDumpCbufferPitches, 0, dxbc::Src::CB(kDumpCbufferPitches, kDumpCbufferPitches, 0,
dxbc::Src::kXXXX)); dxbc::Src::kXXXX));
// Split the linear tile index in the source texture into X and Y in tiles. // Split the linear tile index in the source texture into X and Y in tiles.
// Get the source texture pitch in tiles to r1.x.
// r0.x = X sample position within the tile // r0.x = X sample position within the tile
// r0.y = Y sample position within the tile // r0.y = Y sample position within the tile
// r0.z = sample offset in the EDRAM // r0.z = sample offset in the EDRAM
@ -6257,9 +6155,9 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline(
if (key.is_depth) { if (key.is_depth) {
switch (key.GetDepthFormat()) { switch (key.GetDepthFormat()) {
case xenos::DepthRenderTargetFormat::kD24S8: case xenos::DepthRenderTargetFormat::kD24S8:
// Round to the nearest even integer. This seems to be the correct, // Round to the nearest even integer. This seems to be the correct
// adding +0.5 and rounding towards zero results in red instead of // conversion, adding +0.5 and rounding towards zero results in red
// black in the 4D5307E6 clear shader. // instead of black in the 4D5307E6 clear shader.
a.OpMul(dxbc::Dest::R(1, 0b0001), dxbc::Src::R(1, dxbc::Src::kXXXX), a.OpMul(dxbc::Dest::R(1, 0b0001), dxbc::Src::R(1, dxbc::Src::kXXXX),
dxbc::Src::LF(float(0xFFFFFF))); dxbc::Src::LF(float(0xFFFFFF)));
a.OpRoundNE(dxbc::Dest::R(1, 0b0001), a.OpRoundNE(dxbc::Dest::R(1, 0b0001),

View File

@ -649,31 +649,6 @@ uint32_t GetNormalizedColorMask(const RegisterFile& regs,
return normalized_color_mask; return normalized_color_mask;
} }
// Provides the multiplier and the shift for dividing an X coordinate by the
// width of an EDRAM tile (80 samples multiplied by the horizontal draw
// resolution scale) without a division instruction: the quotient is the upper
// 32 bits of the 64-bit product of the coordinate and divide_scale_out, shifted
// right by divide_upper_shift_out.
//
// The tile width is split into its odd factor, which selects the multiplier,
// and its power-of-two factor, which is folded into the shift:
//   1x:  80 = 5 * 2^4
//   2x: 160 = 5 * 2^5
//   3x: 240 = 15 * 2^4
//
// For any other scale, the unhandled case assertion is triggered and the
// outputs are not written.
void GetEdramTileWidthDivideScaleAndUpperShift(
    uint32_t draw_resolution_scale_x, uint32_t& divide_scale_out,
    uint32_t& divide_upper_shift_out) {
  static_assert(
      TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3,
      "GetEdramTileWidthDivideScaleAndUpperShift provides values only for draw "
      "resolution scaling factors of up to 3");
  if (draw_resolution_scale_x == 1 || draw_resolution_scale_x == 2) {
    // Odd factor 5, power-of-two factor 2^4 at 1x or 2^5 at 2x.
    divide_scale_out = kDivideScale5;
    divide_upper_shift_out = kDivideUpperShift5 + 3 + draw_resolution_scale_x;
    return;
  }
  if (draw_resolution_scale_x == 3) {
    // Odd factor 15, power-of-two factor 2^4.
    divide_scale_out = kDivideScale15;
    divide_upper_shift_out = kDivideUpperShift15 + 4;
    return;
  }
  assert_unhandled_case(draw_resolution_scale_x);
}
xenos::CopySampleSelect SanitizeCopySampleSelect( xenos::CopySampleSelect SanitizeCopySampleSelect(
xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples, xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples,
bool is_depth) { bool is_depth) {

View File

@ -226,20 +226,6 @@ void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
uint32_t GetNormalizedColorMask(const RegisterFile& regs, uint32_t GetNormalizedColorMask(const RegisterFile& regs,
uint32_t pixel_shader_writes_color_targets); uint32_t pixel_shader_writes_color_targets);
// Multipliers and shift amounts replacing unsigned 32-bit division by the odd
// factors of EDRAM-tile-related sizes with a multiplication and a shift:
//   x / N == (upper 32 bits of the 64-bit product x * kDivideScaleN) >>
//            kDivideUpperShiftN
// Each scale is 2^(32 + kDivideUpperShiftN) / N rounded up to an integer.
constexpr uint32_t kDivideUpperShift3 = 1;
constexpr uint32_t kDivideScale3 =
    uint32_t((uint64_t(1) << (32 + kDivideUpperShift3)) / 3 + 1);  // 0xAAAAAAAB
constexpr uint32_t kDivideUpperShift5 = 2;
constexpr uint32_t kDivideScale5 =
    uint32_t((uint64_t(1) << (32 + kDivideUpperShift5)) / 5 + 1);  // 0xCCCCCCCD
constexpr uint32_t kDivideUpperShift15 = 3;
constexpr uint32_t kDivideScale15 =
    uint32_t((uint64_t(1) << (32 + kDivideUpperShift15)) / 15 + 1);  // 0x88888889
// Writes the multiplier and the total shift of the upper 32 bits of the
// product for dividing by the EDRAM tile width in samples at the specified
// horizontal draw resolution scale.
void GetEdramTileWidthDivideScaleAndUpperShift(
    uint32_t draw_resolution_scale_x, uint32_t& divide_scale_out,
    uint32_t& divide_upper_shift_out);
// Never an identity conversion - can always write conditional move instructions // Never an identity conversion - can always write conditional move instructions
// to shaders that will be no-ops for conversion from guest to host samples. // to shaders that will be no-ops for conversion from guest to host samples.
// While we don't know the exact guest sample pattern, due to the way // While we don't know the exact guest sample pattern, due to the way

View File

@ -120,80 +120,49 @@ void DxbcShaderTranslator::ExportToMemory() {
a_.OpIf(true, dxbc::Src::R(control_temp, dxbc::Src::kXXXX)); a_.OpIf(true, dxbc::Src::R(control_temp, dxbc::Src::kXXXX));
// Check more fine-grained limitations. // Check more fine-grained limitations.
// The flag in control_temp.x can be 0 or 1 for simplicity, not necessarily
// 0 or 0xFFFFFFFF.
bool inner_condition_provided = false; bool inner_condition_provided = false;
if (is_pixel_shader()) { if (is_pixel_shader()) {
uint32_t resolution_scaled_axes = uint32_t resolution_scaled_axes =
uint32_t(draw_resolution_scale_x_ > 1) | uint32_t(draw_resolution_scale_x_ > 1) |
(uint32_t(draw_resolution_scale_y_ > 1) << 1); (uint32_t(draw_resolution_scale_y_ > 1) << 1);
if (resolution_scaled_axes) { if (resolution_scaled_axes) {
// Only do memexport for one host pixel in a guest pixel. // Only do memexport for one host pixel in a guest pixel - prefer the
// For 2x - pixel 1 because it's covered with half-pixel offset that // host pixel closer to the center of the guest pixel, but one that's
// becomes full-pixel. // covered with the half-pixel offset according to the top-left rule (1
// For 3x - also pixel 1 because it's still covered with half-pixel // for 2x because 0 isn't covered with the half-pixel offset, 1 for 3x
// offset, but close to the center. // because it's the center and is covered with the half-pixel offset too).
// If X needs resolution scaling, writing 1 or 0 - whether the column is // Using control_temp.yz as per-axis temporary variables.
// the one where memexport should be done - to control_temp.y.
// For Y, doing that to control_temp.z.
// Then, if both axes are resolution-scaled, merging the conditions for
// the two.
in_position_used_ |= resolution_scaled_axes; in_position_used_ |= resolution_scaled_axes;
a_.OpFToU( a_.OpFToU(
dxbc::Dest::R(control_temp, resolution_scaled_axes << 1), dxbc::Dest::R(control_temp, resolution_scaled_axes << 1),
dxbc::Src::V1D(uint32_t(InOutRegister::kPSInPosition), 0b0100 << 2)); dxbc::Src::V1D(uint32_t(InOutRegister::kPSInPosition), 0b0100 << 2));
dxbc::Dest resolution_scaling_temp_dest( a_.OpUDiv(dxbc::Dest::Null(),
dxbc::Dest::R(control_temp, 0b1000)); dxbc::Dest::R(control_temp, resolution_scaled_axes << 1),
dxbc::Src resolution_scaling_temp_src( dxbc::Src::R(control_temp, 0b1001 << 2),
dxbc::Src::R(control_temp, dxbc::Src::kWWWW)); dxbc::Src::LU(0, draw_resolution_scale_x_,
draw_resolution_scale_y_, 0));
for (uint32_t i = 0; i < 2; ++i) { for (uint32_t i = 0; i < 2; ++i) {
if (!(resolution_scaled_axes & (1 << i))) { if (!(resolution_scaled_axes & (1 << i))) {
continue; continue;
} }
// If there's no inner condition in control_temp.x yet, the condition // If there's no inner condition in control_temp.x yet, the condition
// for the current axis can go directly to it. Otherwise, need to merge // for the current axis can go directly to it. Otherwise, need to merge
// with the previous condition, using control_temp.w as an intermediate // with the previous condition, using control_temp.y or .z as an
// variable. // intermediate variable.
dxbc::Dest resolution_scaled_axis_result(
inner_condition_provided ? resolution_scaling_temp_dest
: dxbc::Dest::R(control_temp, 0b0001));
dxbc::Src resolution_scaled_axis_src( dxbc::Src resolution_scaled_axis_src(
dxbc::Src::R(control_temp).Select(1 + i)); dxbc::Src::R(control_temp).Select(1 + i));
uint32_t axis_resolution_scale = a_.OpIEq(
i ? draw_resolution_scale_y_ : draw_resolution_scale_x_; dxbc::Dest::R(control_temp,
static_assert( inner_condition_provided ? 1 << (1 + i) : 0b0001),
TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3, resolution_scaled_axis_src,
"DxbcShaderTranslator memexport draw resolution scaling " dxbc::Src::LU(
"conditional generation supports draw resolution scaling factors " (i ? draw_resolution_scale_y_ : draw_resolution_scale_x_) >>
"of only up to 3"); 1));
switch (axis_resolution_scale) {
case 2:
// xy & 1 == 1.
a_.OpAnd(resolution_scaled_axis_result, resolution_scaled_axis_src,
dxbc::Src::LU(1));
// No need to do IEq - already 1 for right / bottom, 0 for left /
// top.
break;
case 3:
// xy % 3 == 1.
a_.OpUMul(resolution_scaling_temp_dest, dxbc::Dest::Null(),
resolution_scaled_axis_src,
dxbc::Src::LU(draw_util::kDivideScale3));
a_.OpUShR(resolution_scaling_temp_dest, resolution_scaling_temp_src,
dxbc::Src::LU(draw_util::kDivideUpperShift3));
a_.OpIMAd(resolution_scaling_temp_dest, resolution_scaling_temp_src,
dxbc::Src::LI(-3), resolution_scaled_axis_src);
a_.OpIEq(resolution_scaled_axis_result, resolution_scaling_temp_src,
dxbc::Src::LU(1));
break;
default:
assert_unhandled_case(axis_resolution_scale);
}
if (inner_condition_provided) { if (inner_condition_provided) {
// Merge with the previous condition in control_temp.x. // Merge with the previous condition in control_temp.x.
a_.OpAnd(dxbc::Dest::R(control_temp, 0b0001), a_.OpAnd(dxbc::Dest::R(control_temp, 0b0001),
dxbc::Src::R(control_temp, dxbc::Src::kXXXX), dxbc::Src::R(control_temp, dxbc::Src::kXXXX),
resolution_scaling_temp_src); resolution_scaled_axis_src);
} }
inner_condition_provided = true; inner_condition_provided = true;
} }

View File

@ -190,91 +190,22 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
// dividing by 40, not by 80. // dividing by 40, not by 80.
// For depth-only: // For depth-only:
// Same, but for full 80x16 tiles, not 40x16 half-tiles. // Same, but for full 80x16 tiles, not 40x16 half-tiles.
uint32_t tile_or_half_tile_width = 80 * draw_resolution_scale_x_; uint32_t tile_width =
uint32_t tile_or_half_tile_width_divide_scale; xenos::kEdramTileWidthSamples * draw_resolution_scale_x_;
uint32_t tile_or_half_tile_width_divide_upper_shift; uint32_t tile_or_tile_half_width =
draw_util::GetEdramTileWidthDivideScaleAndUpperShift( tile_width >> uint32_t(any_color_targets_written);
draw_resolution_scale_x_, tile_or_half_tile_width_divide_scale, uint32_t tile_height =
tile_or_half_tile_width_divide_upper_shift); xenos::kEdramTileHeightSamples * draw_resolution_scale_y_;
if (any_color_targets_written) { // system_temp_rov_params_.x = X sample 0 position within the half-tile or
tile_or_half_tile_width >>= 1; // tile
assert_not_zero(tile_or_half_tile_width_divide_upper_shift); // system_temp_rov_params_.y = Y sample 0 position within the (half-)tile
--tile_or_half_tile_width_divide_upper_shift; // system_temp_rov_params_.z = X half-tile or tile position
} // system_temp_rov_params_.w = Y tile position
static_assert( a_.OpUDiv(dxbc::Dest::R(system_temp_rov_params_, 0b1100),
TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3, dxbc::Dest::R(system_temp_rov_params_, 0b0011),
"DxbcShaderTranslator ROV sample address calculation supports Y draw " dxbc::Src::R(system_temp_rov_params_, 0b01000100),
"resolution scaling factors of only up to 3"); dxbc::Src::LU(tile_or_tile_half_width, tile_height,
if (draw_resolution_scale_y_ == 3) { tile_or_tile_half_width, tile_height));
// Multiplication part of the division by 40|80 x 16 x scale (specifically
// 40|80 * scale width here, and 48 height, or 16 * 3 height).
// system_temp_rov_params_.x = X sample 0 position
// system_temp_rov_params_.y = Y sample 0 position
// system_temp_rov_params_.z = (X * tile_or_half_tile_width_divide_scale) >>
// 32
// system_temp_rov_params_.w = (Y * kDivideScale3) >> 32
a_.OpUMul(dxbc::Dest::R(system_temp_rov_params_, 0b1100),
dxbc::Dest::Null(),
dxbc::Src::R(system_temp_rov_params_, 0b0100 << 4),
dxbc::Src::LU(0, 0, tile_or_half_tile_width_divide_scale,
draw_util::kDivideScale3));
// Shift part of the division by 40|80 x 16 x scale.
// system_temp_rov_params_.x = X sample 0 position
// system_temp_rov_params_.y = Y sample 0 position
// system_temp_rov_params_.z = X half-tile or tile position
// system_temp_rov_params_.w = Y tile position
a_.OpUShR(dxbc::Dest::R(system_temp_rov_params_, 0b1100),
dxbc::Src::R(system_temp_rov_params_),
dxbc::Src::LU(0, 0, tile_or_half_tile_width_divide_upper_shift,
draw_util::kDivideUpperShift3 + 4));
// Take the remainder of the performed division to
// system_temp_rov_params_.xy.
// system_temp_rov_params_.x = X sample 0 position within the half-tile
// system_temp_rov_params_.y = Y sample 0 position within the (half-)tile
// system_temp_rov_params_.z = X half-tile or tile position
// system_temp_rov_params_.w = Y tile position
a_.OpIMAd(dxbc::Dest::R(system_temp_rov_params_, 0b0011),
dxbc::Src::R(system_temp_rov_params_, 0b1110),
dxbc::Src::LI(-int32_t(tile_or_half_tile_width),
-16 * draw_resolution_scale_y_, 0, 0),
dxbc::Src::R(system_temp_rov_params_));
} else {
assert_true(draw_resolution_scale_y_ <= 2);
// Multiplication part of the division of X by 40|80 * scale.
// system_temp_rov_params_.x = X sample 0 position
// system_temp_rov_params_.y = Y sample 0 position
// system_temp_rov_params_.z = (X * tile_or_half_tile_width_divide_scale) >>
// 32
a_.OpUMul(dxbc::Dest::R(system_temp_rov_params_, 0b0100),
dxbc::Dest::Null(),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX),
dxbc::Src::LU(tile_or_half_tile_width_divide_scale));
// Shift part of the division of X by 40 * scale, division of Y by
// 16 * scale as it's power of two in this case.
// system_temp_rov_params_.x = X sample 0 position
// system_temp_rov_params_.y = Y sample 0 position
// system_temp_rov_params_.z = X half-tile or tile position
// system_temp_rov_params_.w = Y tile position
a_.OpUShR(dxbc::Dest::R(system_temp_rov_params_, 0b1100),
dxbc::Src::R(system_temp_rov_params_, 0b0110 << 4),
dxbc::Src::LU(0, 0, tile_or_half_tile_width_divide_upper_shift,
draw_resolution_scale_y_ == 2 ? 5 : 4));
// Take the remainder of the performed division (via multiply-subtract for
// X, via AND for Y which is power-of-two here) to
// system_temp_rov_params_.xy.
// system_temp_rov_params_.x = X sample 0 position within the half-tile or
// tile
// system_temp_rov_params_.y = Y sample 0 position within the (half-)tile
// system_temp_rov_params_.z = X half-tile or tile position
// system_temp_rov_params_.w = Y tile position
a_.OpIMAd(dxbc::Dest::R(system_temp_rov_params_, 0b0001),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kZZZZ),
dxbc::Src::LI(-int32_t(tile_or_half_tile_width)),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX));
a_.OpAnd(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
dxbc::Src::LU((16 * draw_resolution_scale_y_) - 1));
}
// Convert the Y sample 0 position within the half-tile or tile to the dword // Convert the Y sample 0 position within the half-tile or tile to the dword
// offset of the row within a 80x16 32bpp tile or a 40x16 64bpp half-tile to // offset of the row within a 80x16 32bpp tile or a 40x16 64bpp half-tile to
@ -287,8 +218,10 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
// system_temp_rov_params_.w = Y tile position // system_temp_rov_params_.w = Y tile position
a_.OpUMul(dxbc::Dest::Null(), dxbc::Dest::R(system_temp_rov_params_, 0b0010), a_.OpUMul(dxbc::Dest::Null(), dxbc::Dest::R(system_temp_rov_params_, 0b0010),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY), dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
dxbc::Src::LU(80 * draw_resolution_scale_x_)); dxbc::Src::LU(tile_width));
uint32_t tile_size = tile_width * tile_height;
uint32_t tile_half_width = tile_width >> 1;
if (any_color_targets_written) { if (any_color_targets_written) {
// Depth, 32bpp color, 64bpp color are all needed. // Depth, 32bpp color, 64bpp color are all needed.
@ -336,12 +269,10 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
// system_temp_rov_params_.w = Y tile row dword origin in a 32bpp surface // system_temp_rov_params_.w = Y tile row dword origin in a 32bpp surface
// rov_address_temp.x = dword offset of the beginning of the row of samples // rov_address_temp.x = dword offset of the beginning of the row of samples
// within a row of 32bpp tiles // within a row of 32bpp tiles
a_.OpUMAd( a_.OpUMAd(dxbc::Dest::R(rov_address_temp, 0b0001),
dxbc::Dest::R(rov_address_temp, 0b0001), dxbc::Src::R(rov_address_temp, dxbc::Src::kXXXX),
dxbc::Src::R(rov_address_temp, dxbc::Src::kXXXX), dxbc::Src::LU(tile_size),
dxbc::Src::LU(80 * 16 * dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY));
(draw_resolution_scale_x_ * draw_resolution_scale_y_)),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY));
// Get the dword offset of the beginning of the row of samples within a // Get the dword offset of the beginning of the row of samples within a
// 32bpp surface to rov_address_temp.x. // 32bpp surface to rov_address_temp.x.
// system_temp_rov_params_.x = X sample 0 position within the half-tile // system_temp_rov_params_.x = X sample 0 position within the half-tile
@ -365,12 +296,10 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
// system_temp_rov_params_.w = Y tile row dword origin in a 32bpp surface // system_temp_rov_params_.w = Y tile row dword origin in a 32bpp surface
// rov_address_temp.x = dword offset of the beginning of the row of samples // rov_address_temp.x = dword offset of the beginning of the row of samples
// within a 32bpp surface // within a 32bpp surface
a_.OpUMAd( a_.OpUMAd(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
dxbc::Dest::R(system_temp_rov_params_, 0b0010), dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kZZZZ),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kZZZZ), dxbc::Src::LU(tile_size),
dxbc::Src::LU(80 * 16 * dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY));
(draw_resolution_scale_x_ * draw_resolution_scale_y_)),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY));
// Get the dword offset of the beginning of the row of samples within a // Get the dword offset of the beginning of the row of samples within a
// 64bpp surface to system_temp_rov_params_.w (last time the Y tile row // 64bpp surface to system_temp_rov_params_.w (last time the Y tile row
// offset is needed). // offset is needed).
@ -420,7 +349,7 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
// within a 32bpp surface // within a 32bpp surface
a_.OpUMAd(dxbc::Dest::R(system_temp_rov_params_, 0b0100), a_.OpUMAd(dxbc::Dest::R(system_temp_rov_params_, 0b0100),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY), dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
dxbc::Src::LU(40 * draw_resolution_scale_x_), dxbc::Src::LU(tile_half_width),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX)); dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX));
// Get the final offset of the sample 0 within a 32bpp color surface to // Get the final offset of the sample 0 within a 32bpp color surface to
// system_temp_rov_params_.z (last time the 32bpp row offset is needed). // system_temp_rov_params_.z (last time the 32bpp row offset is needed).
@ -439,8 +368,8 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
// system_temp_rov_params_.w = dword sample 0 offset within a 64bpp surface // system_temp_rov_params_.w = dword sample 0 offset within a 64bpp surface
a_.OpMovC(dxbc::Dest::R(system_temp_rov_params_, 0b0010), a_.OpMovC(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY), dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
dxbc::Src::LI(-40 * draw_resolution_scale_x_), dxbc::Src::LI(-int32_t(tile_half_width)),
dxbc::Src::LI(40 * draw_resolution_scale_x_)); dxbc::Src::LI(int32_t(tile_half_width)));
// Flip the 40x16 half-tiles for depth / stencil as opposed to 32bpp color - // Flip the 40x16 half-tiles for depth / stencil as opposed to 32bpp color -
// get the final offset of the sample 0 within a 32bpp depth / stencil // get the final offset of the sample 0 within a 32bpp depth / stencil
// surface to system_temp_rov_params_.y. // surface to system_temp_rov_params_.y.
@ -466,12 +395,10 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
// system_temp_rov_params_.z = dword offset of the beginning of the row of // system_temp_rov_params_.z = dword offset of the beginning of the row of
// samples within a row of 32bpp tiles // samples within a row of 32bpp tiles
// system_temp_rov_params_.w = Y tile position // system_temp_rov_params_.w = Y tile position
a_.OpUMAd( a_.OpUMAd(dxbc::Dest::R(system_temp_rov_params_, 0b0100),
dxbc::Dest::R(system_temp_rov_params_, 0b0100), dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kZZZZ),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kZZZZ), dxbc::Src::LU(tile_size),
dxbc::Src::LU(80 * 16 * dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY));
(draw_resolution_scale_x_ * draw_resolution_scale_y_)),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY));
// Get the dword offset of the beginning of the row of samples within a // Get the dword offset of the beginning of the row of samples within a
// 32bpp surface to system_temp_rov_params_.y (last time anything Y-related // 32bpp surface to system_temp_rov_params_.y (last time anything Y-related
// is needed, as well as the sample row offset within the tile row). // is needed, as well as the sample row offset within the tile row).
@ -502,15 +429,15 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
// otherwise // otherwise
a_.OpUGE(dxbc::Dest::R(system_temp_rov_params_, 0b0001), a_.OpUGE(dxbc::Dest::R(system_temp_rov_params_, 0b0001),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX), dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX),
dxbc::Src::LU(40 * draw_resolution_scale_x_)); dxbc::Src::LU(tile_half_width));
// Flip the 40x16 half-tiles for depth / stencil as opposed to 32bpp color - // Flip the 40x16 half-tiles for depth / stencil as opposed to 32bpp color -
// get the dword offset to add for flipping to system_temp_rov_params_.x. // get the dword offset to add for flipping to system_temp_rov_params_.x.
// system_temp_rov_params_.x = depth half-tile flipping offset // system_temp_rov_params_.x = depth half-tile flipping offset
// system_temp_rov_params_.y = dword sample 0 offset within a 32bpp surface // system_temp_rov_params_.y = dword sample 0 offset within a 32bpp surface
a_.OpMovC(dxbc::Dest::R(system_temp_rov_params_, 0b0001), a_.OpMovC(dxbc::Dest::R(system_temp_rov_params_, 0b0001),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX), dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX),
dxbc::Src::LI(-40 * draw_resolution_scale_x_), dxbc::Src::LI(-int32_t(tile_half_width)),
dxbc::Src::LI(40 * draw_resolution_scale_x_)); dxbc::Src::LI(int32_t(tile_half_width)));
// Flip the 40x16 half-tiles for depth / stencil as opposed to 32bpp color - // Flip the 40x16 half-tiles for depth / stencil as opposed to 32bpp color -
// get the final offset of the sample 0 within a 32bpp depth / stencil // get the final offset of the sample 0 within a 32bpp depth / stencil
// surface to system_temp_rov_params_.y. // surface to system_temp_rov_params_.y.
@ -1288,10 +1215,12 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() {
// Go to the next sample (samples are at +0, +(80*scale_x), +1, // Go to the next sample (samples are at +0, +(80*scale_x), +1,
// +(80*scale_x+1), so need to do +(80*scale_x), -(80*scale_x-1), // +(80*scale_x+1), so need to do +(80*scale_x), -(80*scale_x-1),
// +(80*scale_x) and -(80*scale_x+1) after each sample). // +(80*scale_x) and -(80*scale_x+1) after each sample).
uint32_t tile_width =
xenos::kEdramTileWidthSamples * draw_resolution_scale_x_;
a_.OpIAdd(dxbc::Dest::R(system_temp_rov_params_, 0b0010), a_.OpIAdd(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY), dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
dxbc::Src::LI((i & 1) ? -80 * draw_resolution_scale_x_ + 2 - i dxbc::Src::LI((i & 1) ? -int32_t(tile_width) + 2 - i
: 80 * draw_resolution_scale_x_)); : int32_t(tile_width)));
} }
if (ROV_IsDepthStencilEarly()) { if (ROV_IsDepthStencilEarly()) {
@ -2181,6 +2110,9 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
dxbc::Dest temp_w_dest(dxbc::Dest::R(temp, 0b1000)); dxbc::Dest temp_w_dest(dxbc::Dest::R(temp, 0b1000));
dxbc::Src temp_w_src(dxbc::Src::R(temp, dxbc::Src::kWWWW)); dxbc::Src temp_w_src(dxbc::Src::R(temp, dxbc::Src::kWWWW));
uint32_t tile_width =
xenos::kEdramTileWidthSamples * draw_resolution_scale_x_;
// Do late depth/stencil test (which includes writing) if needed or deferred // Do late depth/stencil test (which includes writing) if needed or deferred
// depth writing. // depth writing.
if (ROV_IsDepthStencilEarly()) { if (ROV_IsDepthStencilEarly()) {
@ -2212,8 +2144,8 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
if (i < 3) { if (i < 3) {
a_.OpIAdd(dxbc::Dest::R(system_temp_rov_params_, 0b0010), a_.OpIAdd(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY), dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
dxbc::Src::LI((i & 1) ? -80 * draw_resolution_scale_x_ + 2 - i dxbc::Src::LI((i & 1) ? -int32_t(tile_width) + 2 - i
: 80 * draw_resolution_scale_x_)); : int32_t(tile_width)));
} }
} }
} else { } else {
@ -3021,8 +2953,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
// +(80*scale_x+1), so need to do +(80*scale_x), -(80*scale_x-1), // +(80*scale_x+1), so need to do +(80*scale_x), -(80*scale_x-1),
// +(80*scale_x) and -(80*scale_x+1) after each sample). // +(80*scale_x) and -(80*scale_x+1) after each sample).
int32_t next_sample_distance = int32_t next_sample_distance =
(j & 1) ? -80 * draw_resolution_scale_x_ + 2 - j (j & 1) ? -int32_t(tile_width) + 2 - j : int32_t(tile_width);
: 80 * draw_resolution_scale_x_;
a_.OpIAdd( a_.OpIAdd(
dxbc::Dest::R(system_temp_rov_params_, 0b1100), dxbc::Dest::R(system_temp_rov_params_, 0b1100),
dxbc::Src::R(system_temp_rov_params_), dxbc::Src::R(system_temp_rov_params_),