Merge branch 'master' into vulkan
This commit is contained in:
commit
c0703e64db
|
@ -2918,73 +2918,29 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
|
||||||
uint32_t draw_resolution_scale_x = this->draw_resolution_scale_x();
|
uint32_t draw_resolution_scale_x = this->draw_resolution_scale_x();
|
||||||
uint32_t draw_resolution_scale_y = this->draw_resolution_scale_y();
|
uint32_t draw_resolution_scale_y = this->draw_resolution_scale_y();
|
||||||
|
|
||||||
uint32_t tile_width_samples_scaled =
|
uint32_t tile_width_samples =
|
||||||
xenos::kEdramTileWidthSamples * draw_resolution_scale_x;
|
xenos::kEdramTileWidthSamples * draw_resolution_scale_x;
|
||||||
uint32_t tile_height_samples_scaled =
|
uint32_t tile_height_samples =
|
||||||
xenos::kEdramTileHeightSamples * draw_resolution_scale_y;
|
xenos::kEdramTileHeightSamples * draw_resolution_scale_y;
|
||||||
|
|
||||||
// Split the destination pixel index into 32bpp tile in r0.z and
|
// Split the destination pixel index into 32bpp tile in r0.zw and
|
||||||
// 32bpp-tile-relative pixel index in r0.xy.
|
// 32bpp-tile-relative pixel index in r0.xy.
|
||||||
// r0.xy = pixel XY as uint
|
// r0.xy = pixel XY as uint
|
||||||
a.OpFToU(dxbc::Dest::R(0, 0b0011), dxbc::Src::V1D(kInputRegisterPosition));
|
a.OpFToU(dxbc::Dest::R(0, 0b0011), dxbc::Src::V1D(kInputRegisterPosition));
|
||||||
uint32_t dest_sample_width_log2 =
|
uint32_t dest_tile_width_pixels =
|
||||||
uint32_t(dest_is_64bpp) +
|
tile_width_samples >>
|
||||||
uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k4X);
|
(uint32_t(dest_is_64bpp) +
|
||||||
uint32_t dest_sample_height_log2 =
|
uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k4X));
|
||||||
|
uint32_t dest_tile_height_pixels =
|
||||||
|
tile_height_samples >>
|
||||||
uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k2X);
|
uint32_t(key.dest_msaa_samples >= xenos::MsaaSamples::k2X);
|
||||||
uint32_t dest_tile_width_divide_scale, dest_tile_width_divide_upper_shift;
|
// r0.xy = destination pixel XY index within the 32bpp tile
|
||||||
draw_util::GetEdramTileWidthDivideScaleAndUpperShift(
|
// r0.zw = 32bpp tile XY index
|
||||||
draw_resolution_scale_x, dest_tile_width_divide_scale,
|
a.OpUDiv(dxbc::Dest::R(0, 0b1100), dxbc::Dest::R(0, 0b0011),
|
||||||
dest_tile_width_divide_upper_shift);
|
dxbc::Src::R(0, 0b01000100),
|
||||||
assert_true(dest_tile_width_divide_upper_shift >= dest_sample_width_log2);
|
dxbc::Src::LU(dest_tile_width_pixels, dest_tile_height_pixels,
|
||||||
// Need the host tile size in pixels, not samples.
|
dest_tile_width_pixels, dest_tile_height_pixels));
|
||||||
dest_tile_width_divide_upper_shift -= dest_sample_width_log2;
|
|
||||||
static_assert(
|
|
||||||
TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3,
|
|
||||||
"D3D12RenderTargetCache EDRAM range ownership transfer shader generation "
|
|
||||||
"supports Y draw resolution scaling factors of only up to 3");
|
|
||||||
if (draw_resolution_scale_y == 3) {
|
|
||||||
// r0.zw = upper 32 bits in the division process of pixel XY by pixel count
|
|
||||||
// in a 32bpp tile
|
|
||||||
a.OpUMul(dxbc::Dest::R(0, 0b1100), dxbc::Dest::Null(),
|
|
||||||
dxbc::Src::R(0, 0b0100 << 4),
|
|
||||||
dxbc::Src::LU(0, 0, dest_tile_width_divide_scale,
|
|
||||||
draw_util::kDivideScale3));
|
|
||||||
// r0.zw = 32bpp tile XY index
|
|
||||||
a.OpUShR(dxbc::Dest::R(0, 0b1100), dxbc::Src::R(0),
|
|
||||||
dxbc::Src::LU(
|
|
||||||
0, 0, dest_tile_width_divide_upper_shift,
|
|
||||||
draw_util::kDivideUpperShift3 + 4 - dest_sample_height_log2));
|
|
||||||
// r0.xy = destination pixel XY index within the 32bpp tile
|
|
||||||
a.OpIMAd(
|
|
||||||
dxbc::Dest::R(0, 0b0011), dxbc::Src::R(0, 0b1110),
|
|
||||||
dxbc::Src::LI(
|
|
||||||
-int32_t((80 * draw_resolution_scale_x) >> dest_sample_width_log2),
|
|
||||||
-int32_t((16 * draw_resolution_scale_y) >> dest_sample_height_log2),
|
|
||||||
0, 0),
|
|
||||||
dxbc::Src::R(0, 0b0100));
|
|
||||||
} else {
|
|
||||||
assert_true(draw_resolution_scale_y <= 2);
|
|
||||||
uint32_t dest_tile_height_pixels_log2 =
|
|
||||||
(draw_resolution_scale_y == 2 ? 5 : 4) - dest_sample_height_log2;
|
|
||||||
// r0.z = upper 32 bits in the division process of pixel X by pixel count in
|
|
||||||
// a 32bpp tile
|
|
||||||
a.OpUMul(dxbc::Dest::R(0, 0b0100), dxbc::Dest::Null(),
|
|
||||||
dxbc::Src::R(0, dxbc::Src::kXXXX),
|
|
||||||
dxbc::Src::LU(dest_tile_width_divide_scale));
|
|
||||||
// r0.zw = 32bpp tile XY index
|
|
||||||
a.OpUShR(dxbc::Dest::R(0, 0b1100), dxbc::Src::R(0, 0b0110 << 4),
|
|
||||||
dxbc::Src::LU(0, 0, dest_tile_width_divide_upper_shift,
|
|
||||||
dest_tile_height_pixels_log2));
|
|
||||||
// r0.x = destination pixel X index within the 32bpp tile
|
|
||||||
a.OpIMAd(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(0, dxbc::Src::kZZZZ),
|
|
||||||
dxbc::Src::LI(-int32_t((80 * draw_resolution_scale_x) >>
|
|
||||||
dest_sample_width_log2)),
|
|
||||||
dxbc::Src::R(0, dxbc::Src::kXXXX));
|
|
||||||
// r0.y = destination pixel Y index within the 32bpp tile
|
|
||||||
a.OpAnd(dxbc::Dest::R(0, 0b0010), dxbc::Src::R(0, dxbc::Src::kYYYY),
|
|
||||||
dxbc::Src::LU((1 << dest_tile_height_pixels_log2) - 1));
|
|
||||||
}
|
|
||||||
// r1.x = destination pitch in 32bpp tiles
|
// r1.x = destination pitch in 32bpp tiles
|
||||||
a.OpUBFE(dxbc::Dest::R(1, 0b0001), dxbc::Src::LU(xenos::kEdramPitchTilesBits),
|
a.OpUBFE(dxbc::Dest::R(1, 0b0001), dxbc::Src::LU(xenos::kEdramPitchTilesBits),
|
||||||
dxbc::Src::LU(0),
|
dxbc::Src::LU(0),
|
||||||
|
@ -3305,7 +3261,7 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
|
||||||
// Copying between color and depth / stencil - swap 40-32bpp-sample columns
|
// Copying between color and depth / stencil - swap 40-32bpp-sample columns
|
||||||
// in the pixel index within the source 32bpp tile using r1.w as temporary.
|
// in the pixel index within the source 32bpp tile using r1.w as temporary.
|
||||||
uint32_t source_32bpp_tile_half_pixels =
|
uint32_t source_32bpp_tile_half_pixels =
|
||||||
tile_width_samples_scaled >> (1 + source_pixel_width_dwords_log2);
|
tile_width_samples >> (1 + source_pixel_width_dwords_log2);
|
||||||
a.OpULT(dxbc::Dest::R(1, 0b1000),
|
a.OpULT(dxbc::Dest::R(1, 0b1000),
|
||||||
dxbc::Src::R(source_tile_pixel_x_reg, dxbc::Src::kXXXX),
|
dxbc::Src::R(source_tile_pixel_x_reg, dxbc::Src::kXXXX),
|
||||||
dxbc::Src::LU(source_32bpp_tile_half_pixels));
|
dxbc::Src::LU(source_32bpp_tile_half_pixels));
|
||||||
|
@ -3348,18 +3304,17 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
|
||||||
// r1.x = pixel X within the source texture
|
// r1.x = pixel X within the source texture
|
||||||
// r2.x = free
|
// r2.x = free
|
||||||
a.OpUMAd(dxbc::Dest::R(1, 0b0001),
|
a.OpUMAd(dxbc::Dest::R(1, 0b0001),
|
||||||
dxbc::Src::LU(tile_width_samples_scaled >>
|
dxbc::Src::LU(tile_width_samples >> source_pixel_width_dwords_log2),
|
||||||
source_pixel_width_dwords_log2),
|
|
||||||
dxbc::Src::R(2, dxbc::Src::kXXXX),
|
dxbc::Src::R(2, dxbc::Src::kXXXX),
|
||||||
dxbc::Src::R(source_tile_pixel_x_reg, dxbc::Src::kXXXX));
|
dxbc::Src::R(source_tile_pixel_x_reg, dxbc::Src::kXXXX));
|
||||||
// r1.y = pixel Y within the source texture
|
// r1.y = pixel Y within the source texture
|
||||||
// r1.w = free
|
// r1.w = free
|
||||||
a.OpUMAd(dxbc::Dest::R(1, 0b0010),
|
a.OpUMAd(
|
||||||
dxbc::Src::LU(
|
dxbc::Dest::R(1, 0b0010),
|
||||||
tile_height_samples_scaled >>
|
dxbc::Src::LU(tile_height_samples >> uint32_t(key.source_msaa_samples >=
|
||||||
uint32_t(key.source_msaa_samples >= xenos::MsaaSamples::k2X)),
|
xenos::MsaaSamples::k2X)),
|
||||||
dxbc::Src::R(1, dxbc::Src::kWWWW),
|
dxbc::Src::R(1, dxbc::Src::kWWWW),
|
||||||
dxbc::Src::R(source_tile_pixel_y_reg, dxbc::Src::kYYYY));
|
dxbc::Src::R(source_tile_pixel_y_reg, dxbc::Src::kYYYY));
|
||||||
|
|
||||||
// Load the source to r1, or, for 32bpp | 32bpp -> 64bpp, the first dword to
|
// Load the source to r1, or, for 32bpp | 32bpp -> 64bpp, the first dword to
|
||||||
// r0 since addressing will not be needed anymore for color, and the second
|
// r0 since addressing will not be needed anymore for color, and the second
|
||||||
|
@ -3575,9 +3530,9 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
|
||||||
for (uint32_t i = 0; i < 2; ++i) {
|
for (uint32_t i = 0; i < 2; ++i) {
|
||||||
switch (source_depth_format) {
|
switch (source_depth_format) {
|
||||||
case xenos::DepthRenderTargetFormat::kD24S8: {
|
case xenos::DepthRenderTargetFormat::kD24S8: {
|
||||||
// Round to the nearest even integer. This seems to be the correct,
|
// Round to the nearest even integer. This seems to be the correct
|
||||||
// adding +0.5 and rounding towards zero results in red instead of
|
// conversion, adding +0.5 and rounding towards zero results in red
|
||||||
// black in the 4D5307E6 clear shader.
|
// instead of black in the 4D5307E6 clear shader.
|
||||||
a.OpMul(dxbc::Dest::R(i, 0b1000), dxbc::Src::R(i, dxbc::Src::kWWWW),
|
a.OpMul(dxbc::Dest::R(i, 0b1000), dxbc::Src::R(i, dxbc::Src::kWWWW),
|
||||||
dxbc::Src::LF(float(0xFFFFFF)));
|
dxbc::Src::LF(float(0xFFFFFF)));
|
||||||
a.OpRoundNE(dxbc::Dest::R(i, 0b1000),
|
a.OpRoundNE(dxbc::Dest::R(i, 0b1000),
|
||||||
|
@ -3762,9 +3717,9 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
|
||||||
depth_loaded_in_guest_format = true;
|
depth_loaded_in_guest_format = true;
|
||||||
switch (source_depth_format) {
|
switch (source_depth_format) {
|
||||||
case xenos::DepthRenderTargetFormat::kD24S8: {
|
case xenos::DepthRenderTargetFormat::kD24S8: {
|
||||||
// Round to the nearest even integer. This seems to be the correct,
|
// Round to the nearest even integer. This seems to be the correct
|
||||||
// adding +0.5 and rounding towards zero results in red instead of
|
// conversion, adding +0.5 and rounding towards zero results in red
|
||||||
// black in the 4D5307E6 clear shader.
|
// instead of black in the 4D5307E6 clear shader.
|
||||||
a.OpMul(dxbc::Dest::R(1, 0b1000), dxbc::Src::R(1, dxbc::Src::kWWWW),
|
a.OpMul(dxbc::Dest::R(1, 0b1000), dxbc::Src::R(1, dxbc::Src::kWWWW),
|
||||||
dxbc::Src::LF(float(0xFFFFFF)));
|
dxbc::Src::LF(float(0xFFFFFF)));
|
||||||
a.OpRoundNE(dxbc::Dest::R(1, 0b1000),
|
a.OpRoundNE(dxbc::Dest::R(1, 0b1000),
|
||||||
|
@ -3920,12 +3875,11 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
|
||||||
// Combine the tile sample index and the tile index into buffer
|
// Combine the tile sample index and the tile index into buffer
|
||||||
// address to r0.x.
|
// address to r0.x.
|
||||||
a.OpUMAd(dxbc::Dest::R(0, 0b0001),
|
a.OpUMAd(dxbc::Dest::R(0, 0b0001),
|
||||||
dxbc::Src::LU(tile_width_samples_scaled),
|
dxbc::Src::LU(tile_width_samples),
|
||||||
dxbc::Src::R(0, dxbc::Src::kYYYY),
|
dxbc::Src::R(0, dxbc::Src::kYYYY),
|
||||||
dxbc::Src::R(0, dxbc::Src::kXXXX));
|
dxbc::Src::R(0, dxbc::Src::kXXXX));
|
||||||
a.OpUMAd(dxbc::Dest::R(0, 0b0001),
|
a.OpUMAd(dxbc::Dest::R(0, 0b0001),
|
||||||
dxbc::Src::LU(tile_width_samples_scaled *
|
dxbc::Src::LU(tile_width_samples * tile_height_samples),
|
||||||
tile_height_samples_scaled),
|
|
||||||
dxbc::Src::R(0, dxbc::Src::kZZZZ),
|
dxbc::Src::R(0, dxbc::Src::kZZZZ),
|
||||||
dxbc::Src::R(0, dxbc::Src::kXXXX));
|
dxbc::Src::R(0, dxbc::Src::kXXXX));
|
||||||
// Load from the buffer.
|
// Load from the buffer.
|
||||||
|
@ -4102,7 +4056,7 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
|
||||||
// r1.x = free
|
// r1.x = free
|
||||||
a.OpUMAd(
|
a.OpUMAd(
|
||||||
dxbc::Dest::R(0, 0b0001),
|
dxbc::Dest::R(0, 0b0001),
|
||||||
dxbc::Src::LU(tile_width_samples_scaled >>
|
dxbc::Src::LU(tile_width_samples >>
|
||||||
uint32_t(key.host_depth_source_msaa_samples >=
|
uint32_t(key.host_depth_source_msaa_samples >=
|
||||||
xenos::MsaaSamples::k4X)),
|
xenos::MsaaSamples::k4X)),
|
||||||
dxbc::Src::R(1, dxbc::Src::kXXXX),
|
dxbc::Src::R(1, dxbc::Src::kXXXX),
|
||||||
|
@ -4111,7 +4065,7 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
|
||||||
// r0.z = free
|
// r0.z = free
|
||||||
a.OpUMAd(
|
a.OpUMAd(
|
||||||
dxbc::Dest::R(0, 0b0010),
|
dxbc::Dest::R(0, 0b0010),
|
||||||
dxbc::Src::LU(tile_height_samples_scaled >>
|
dxbc::Src::LU(tile_height_samples >>
|
||||||
uint32_t(key.host_depth_source_msaa_samples >=
|
uint32_t(key.host_depth_source_msaa_samples >=
|
||||||
xenos::MsaaSamples::k2X)),
|
xenos::MsaaSamples::k2X)),
|
||||||
dxbc::Src::R(0, dxbc::Src::kZZZZ),
|
dxbc::Src::R(0, dxbc::Src::kZZZZ),
|
||||||
|
@ -5933,97 +5887,42 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline(
|
||||||
// 32bpp is unknown, treating 64bpp tiles as storing 40x16 samples rather than
|
// 32bpp is unknown, treating 64bpp tiles as storing 40x16 samples rather than
|
||||||
// 80x16 for simplicity of addressing into the texture.
|
// 80x16 for simplicity of addressing into the texture.
|
||||||
|
|
||||||
// Get the parts of the address along Y - tile row index within the dispatch
|
uint32_t tile_width =
|
||||||
// to r0.w, sample Y within the tile to r0.y.
|
(xenos::kEdramTileWidthSamples * draw_resolution_scale_x) >>
|
||||||
static_assert(
|
uint32_t(format_is_64bpp);
|
||||||
TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3,
|
uint32_t tile_height =
|
||||||
"D3D12RenderTargetCache render target dump shader generation supports Y "
|
xenos::kEdramTileHeightSamples * draw_resolution_scale_y;
|
||||||
"draw resolution scaling factors of only up to 3");
|
|
||||||
if (draw_resolution_scale_y == 3) {
|
|
||||||
// Multiplication part of the division by the (16 * scale) tile height,
|
|
||||||
// specifically 48 here, or 16 * 3.
|
|
||||||
// r0.w = (Y * kDivideScale3) >> 32
|
|
||||||
a.OpUMul(dxbc::Dest::R(0, 0b1000), dxbc::Dest::Null(),
|
|
||||||
dxbc::Src::VThreadID(dxbc::Src::kYYYY),
|
|
||||||
dxbc::Src::LU(draw_util::kDivideScale3));
|
|
||||||
// Shift part of the division by 16 * scale.
|
|
||||||
// r0.w = Y tile position
|
|
||||||
a.OpUShR(dxbc::Dest::R(0, 0b1000), dxbc::Src::R(0, dxbc::Src::kWWWW),
|
|
||||||
dxbc::Src::LU(draw_util::kDivideUpperShift3 + 4));
|
|
||||||
// Take the remainder of the performed division to r0.y.
|
|
||||||
// r0.y = Y sample position within the tile
|
|
||||||
// r0.w = Y tile position
|
|
||||||
a.OpIMAd(dxbc::Dest::R(0, 0b0010), dxbc::Src::R(0, dxbc::Src::kWWWW),
|
|
||||||
dxbc::Src::LI(-16 * draw_resolution_scale_y),
|
|
||||||
dxbc::Src::VThreadID(dxbc::Src::kYYYY));
|
|
||||||
} else {
|
|
||||||
assert_true(draw_resolution_scale_y <= 2);
|
|
||||||
// Tile height is a power of two, can use bit operations.
|
|
||||||
// Get the tile row index into r0.w.
|
|
||||||
// r0.w = Y tile position.
|
|
||||||
a.OpUShR(dxbc::Dest::R(0, 0b1000), dxbc::Src::VThreadID(dxbc::Src::kYYYY),
|
|
||||||
dxbc::Src::LU(draw_resolution_scale_y == 2 ? 5 : 4));
|
|
||||||
// Get the Y sample position within the tile into r0.y.
|
|
||||||
// r0.y = Y sample position within the tile
|
|
||||||
// r0.w = Y tile position
|
|
||||||
a.OpAnd(dxbc::Dest::R(0, 0b0010), dxbc::Src::VThreadID(dxbc::Src::kYYYY),
|
|
||||||
dxbc::Src::LU((16 * draw_resolution_scale_y) - 1));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get the X tile offset within the dispatch to r0.z.
|
// Get the parts of the address - tile row index within the dispatch to r0.zw,
|
||||||
uint32_t tile_width = xenos::kEdramTileWidthSamples * draw_resolution_scale_x;
|
// sample Y within the tile to r0.xy.
|
||||||
uint32_t tile_width_divide_scale;
|
// r0.x = X sample position within the tile
|
||||||
uint32_t tile_width_divide_upper_shift;
|
|
||||||
draw_util::GetEdramTileWidthDivideScaleAndUpperShift(
|
|
||||||
draw_resolution_scale_x, tile_width_divide_scale,
|
|
||||||
tile_width_divide_upper_shift);
|
|
||||||
if (format_is_64bpp) {
|
|
||||||
tile_width >>= 1;
|
|
||||||
assert_not_zero(tile_width_divide_upper_shift);
|
|
||||||
--tile_width_divide_upper_shift;
|
|
||||||
}
|
|
||||||
// Multiplication part of the division by 80|40 * scale.
|
|
||||||
// r0.y = Y sample position within the tile
|
|
||||||
// r0.z = (X * tile_width_divide_scale) >> 32
|
|
||||||
// r0.w = Y tile position
|
|
||||||
a.OpUMul(dxbc::Dest::R(0, 0b0100), dxbc::Dest::Null(),
|
|
||||||
dxbc::Src::VThreadID(dxbc::Src::kXXXX),
|
|
||||||
dxbc::Src::LU(tile_width_divide_scale));
|
|
||||||
// Shift part of the division by 80|40 * scale.
|
|
||||||
// r0.y = Y sample position within the tile
|
// r0.y = Y sample position within the tile
|
||||||
// r0.z = X tile position
|
// r0.z = X tile position
|
||||||
// r0.w = Y tile position
|
// r0.w = Y tile position
|
||||||
a.OpUShR(dxbc::Dest::R(0, 0b0100), dxbc::Src::R(0, dxbc::Src::kZZZZ),
|
a.OpUDiv(dxbc::Dest::R(0, 0b1100), dxbc::Dest::R(0, 0b0011),
|
||||||
dxbc::Src::LU(tile_width_divide_upper_shift));
|
dxbc::Src::VThreadID(0b01000100),
|
||||||
|
dxbc::Src::LU(tile_width, tile_height, tile_width, tile_height));
|
||||||
|
|
||||||
// Extract the dump rectangle tile row pitch to r0.x.
|
// Extract the dump rectangle tile row pitch to r1.x.
|
||||||
// r0.x = dump rectangle pitch in tiles
|
// r0.x = X sample position within the tile
|
||||||
// r0.y = Y sample position within the tile
|
// r0.y = Y sample position within the tile
|
||||||
// r0.z = X tile position
|
// r0.z = X tile position
|
||||||
// r0.w = Y tile position
|
// r0.w = Y tile position
|
||||||
a.OpUBFE(dxbc::Dest::R(0, 0b0001), dxbc::Src::LU(xenos::kEdramPitchTilesBits),
|
// r1.x = dump rectangle pitch in tiles
|
||||||
|
a.OpUBFE(dxbc::Dest::R(1, 0b0001), dxbc::Src::LU(xenos::kEdramPitchTilesBits),
|
||||||
dxbc::Src::LU(0),
|
dxbc::Src::LU(0),
|
||||||
dxbc::Src::CB(kDumpCbufferPitches, kDumpCbufferPitches, 0,
|
dxbc::Src::CB(kDumpCbufferPitches, kDumpCbufferPitches, 0,
|
||||||
dxbc::Src::kXXXX));
|
dxbc::Src::kXXXX));
|
||||||
// Get the tile index in the EDRAM relative to the dump rectangle base tile to
|
// Get the tile index in the EDRAM relative to the dump rectangle base tile to
|
||||||
// r0.w.
|
// r0.w.
|
||||||
// r0.x = free
|
|
||||||
// r0.y = Y sample position within the tile
|
|
||||||
// r0.z = X tile position
|
|
||||||
// r0.w = tile index relative to the dump rectangle base
|
|
||||||
a.OpUMAd(dxbc::Dest::R(0, 0b1000), dxbc::Src::R(0, dxbc::Src::kWWWW),
|
|
||||||
dxbc::Src::R(0, dxbc::Src::kXXXX),
|
|
||||||
dxbc::Src::R(0, dxbc::Src::kZZZZ));
|
|
||||||
|
|
||||||
// Take the X sample index within the tile as the remainder of the division of
|
|
||||||
// the thread index by tile width to r0.x.
|
|
||||||
// r0.x = X sample position within the tile
|
// r0.x = X sample position within the tile
|
||||||
// r0.y = Y sample position within the tile
|
// r0.y = Y sample position within the tile
|
||||||
// r0.z = free
|
// r0.z = free
|
||||||
// r0.w = tile index relative to the dump rectangle base
|
// r0.w = tile index relative to the dump rectangle base
|
||||||
a.OpIMAd(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(0, dxbc::Src::kZZZZ),
|
// r1.x = free
|
||||||
dxbc::Src::LI(-int32_t(tile_width)),
|
a.OpUMAd(dxbc::Dest::R(0, 0b1000), dxbc::Src::R(0, dxbc::Src::kWWWW),
|
||||||
dxbc::Src::VThreadID(dxbc::Src::kXXXX));
|
dxbc::Src::R(1, dxbc::Src::kXXXX),
|
||||||
|
dxbc::Src::R(0, dxbc::Src::kZZZZ));
|
||||||
|
|
||||||
// Extract the index of the first tile of the dispatch in the EDRAM to r0.z.
|
// Extract the index of the first tile of the dispatch in the EDRAM to r0.z.
|
||||||
// r0.x = X sample position within the tile
|
// r0.x = X sample position within the tile
|
||||||
|
@ -6053,7 +5952,7 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline(
|
||||||
xenos::kEdramTileHeightSamples),
|
xenos::kEdramTileHeightSamples),
|
||||||
dxbc::Src::R(0, dxbc::Src::kXXXX));
|
dxbc::Src::R(0, dxbc::Src::kXXXX));
|
||||||
// Add the contribution of the Y sample position within the tile to the sample
|
// Add the contribution of the Y sample position within the tile to the sample
|
||||||
// address in the EDRAM to r0.w.
|
// address in the EDRAM to r0.z.
|
||||||
// r0.x = X sample position within the tile
|
// r0.x = X sample position within the tile
|
||||||
// r0.y = Y sample position within the tile
|
// r0.y = Y sample position within the tile
|
||||||
// r0.z = sample offset in the EDRAM without the depth column swapping
|
// r0.z = sample offset in the EDRAM without the depth column swapping
|
||||||
|
@ -6119,7 +6018,6 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline(
|
||||||
dxbc::Src::CB(kDumpCbufferPitches, kDumpCbufferPitches, 0,
|
dxbc::Src::CB(kDumpCbufferPitches, kDumpCbufferPitches, 0,
|
||||||
dxbc::Src::kXXXX));
|
dxbc::Src::kXXXX));
|
||||||
// Split the linear tile index in the source texture into X and Y in tiles.
|
// Split the linear tile index in the source texture into X and Y in tiles.
|
||||||
// Get the source texture pitch in tiles to r1.x.
|
|
||||||
// r0.x = X sample position within the tile
|
// r0.x = X sample position within the tile
|
||||||
// r0.y = Y sample position within the tile
|
// r0.y = Y sample position within the tile
|
||||||
// r0.z = sample offset in the EDRAM
|
// r0.z = sample offset in the EDRAM
|
||||||
|
@ -6257,9 +6155,9 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline(
|
||||||
if (key.is_depth) {
|
if (key.is_depth) {
|
||||||
switch (key.GetDepthFormat()) {
|
switch (key.GetDepthFormat()) {
|
||||||
case xenos::DepthRenderTargetFormat::kD24S8:
|
case xenos::DepthRenderTargetFormat::kD24S8:
|
||||||
// Round to the nearest even integer. This seems to be the correct,
|
// Round to the nearest even integer. This seems to be the correct
|
||||||
// adding +0.5 and rounding towards zero results in red instead of
|
// conversion, adding +0.5 and rounding towards zero results in red
|
||||||
// black in the 4D5307E6 clear shader.
|
// instead of black in the 4D5307E6 clear shader.
|
||||||
a.OpMul(dxbc::Dest::R(1, 0b0001), dxbc::Src::R(1, dxbc::Src::kXXXX),
|
a.OpMul(dxbc::Dest::R(1, 0b0001), dxbc::Src::R(1, dxbc::Src::kXXXX),
|
||||||
dxbc::Src::LF(float(0xFFFFFF)));
|
dxbc::Src::LF(float(0xFFFFFF)));
|
||||||
a.OpRoundNE(dxbc::Dest::R(1, 0b0001),
|
a.OpRoundNE(dxbc::Dest::R(1, 0b0001),
|
||||||
|
|
|
@ -649,31 +649,6 @@ uint32_t GetNormalizedColorMask(const RegisterFile& regs,
|
||||||
return normalized_color_mask;
|
return normalized_color_mask;
|
||||||
}
|
}
|
||||||
|
|
||||||
void GetEdramTileWidthDivideScaleAndUpperShift(
|
|
||||||
uint32_t draw_resolution_scale_x, uint32_t& divide_scale_out,
|
|
||||||
uint32_t& divide_upper_shift_out) {
|
|
||||||
static_assert(
|
|
||||||
TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3,
|
|
||||||
"GetEdramTileWidthDivideScaleAndUpperShift provides values only for draw "
|
|
||||||
"resolution scaling factors of up to 3");
|
|
||||||
switch (draw_resolution_scale_x) {
|
|
||||||
case 1:
|
|
||||||
divide_scale_out = kDivideScale5;
|
|
||||||
divide_upper_shift_out = kDivideUpperShift5 + 4;
|
|
||||||
break;
|
|
||||||
case 2:
|
|
||||||
divide_scale_out = kDivideScale5;
|
|
||||||
divide_upper_shift_out = kDivideUpperShift5 + 5;
|
|
||||||
break;
|
|
||||||
case 3:
|
|
||||||
divide_scale_out = kDivideScale15;
|
|
||||||
divide_upper_shift_out = kDivideUpperShift15 + 4;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
assert_unhandled_case(draw_resolution_scale_x);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
xenos::CopySampleSelect SanitizeCopySampleSelect(
|
xenos::CopySampleSelect SanitizeCopySampleSelect(
|
||||||
xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples,
|
xenos::CopySampleSelect copy_sample_select, xenos::MsaaSamples msaa_samples,
|
||||||
bool is_depth) {
|
bool is_depth) {
|
||||||
|
|
|
@ -226,20 +226,6 @@ void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
|
||||||
uint32_t GetNormalizedColorMask(const RegisterFile& regs,
|
uint32_t GetNormalizedColorMask(const RegisterFile& regs,
|
||||||
uint32_t pixel_shader_writes_color_targets);
|
uint32_t pixel_shader_writes_color_targets);
|
||||||
|
|
||||||
// Scales, and shift amounts of the upper 32 bits of the 32x32=64-bit
|
|
||||||
// multiplication result, for fast division and multiplication by
|
|
||||||
// EDRAM-tile-related amounts.
|
|
||||||
constexpr uint32_t kDivideScale3 = 0xAAAAAAABu;
|
|
||||||
constexpr uint32_t kDivideUpperShift3 = 1;
|
|
||||||
constexpr uint32_t kDivideScale5 = 0xCCCCCCCDu;
|
|
||||||
constexpr uint32_t kDivideUpperShift5 = 2;
|
|
||||||
constexpr uint32_t kDivideScale15 = 0x88888889u;
|
|
||||||
constexpr uint32_t kDivideUpperShift15 = 3;
|
|
||||||
|
|
||||||
void GetEdramTileWidthDivideScaleAndUpperShift(
|
|
||||||
uint32_t draw_resolution_scale_x, uint32_t& divide_scale_out,
|
|
||||||
uint32_t& divide_upper_shift_out);
|
|
||||||
|
|
||||||
// Never an identity conversion - can always write conditional move instructions
|
// Never an identity conversion - can always write conditional move instructions
|
||||||
// to shaders that will be no-ops for conversion from guest to host samples.
|
// to shaders that will be no-ops for conversion from guest to host samples.
|
||||||
// While we don't know the exact guest sample pattern, due to the way
|
// While we don't know the exact guest sample pattern, due to the way
|
||||||
|
|
|
@ -120,80 +120,49 @@ void DxbcShaderTranslator::ExportToMemory() {
|
||||||
a_.OpIf(true, dxbc::Src::R(control_temp, dxbc::Src::kXXXX));
|
a_.OpIf(true, dxbc::Src::R(control_temp, dxbc::Src::kXXXX));
|
||||||
|
|
||||||
// Check more fine-grained limitations.
|
// Check more fine-grained limitations.
|
||||||
// The flag in control_temp.x can be 0 or 1 for simplicity, not necessarily
|
|
||||||
// 0 or 0xFFFFFFFF.
|
|
||||||
bool inner_condition_provided = false;
|
bool inner_condition_provided = false;
|
||||||
if (is_pixel_shader()) {
|
if (is_pixel_shader()) {
|
||||||
uint32_t resolution_scaled_axes =
|
uint32_t resolution_scaled_axes =
|
||||||
uint32_t(draw_resolution_scale_x_ > 1) |
|
uint32_t(draw_resolution_scale_x_ > 1) |
|
||||||
(uint32_t(draw_resolution_scale_y_ > 1) << 1);
|
(uint32_t(draw_resolution_scale_y_ > 1) << 1);
|
||||||
if (resolution_scaled_axes) {
|
if (resolution_scaled_axes) {
|
||||||
// Only do memexport for one host pixel in a guest pixel.
|
// Only do memexport for one host pixel in a guest pixel - prefer the
|
||||||
// For 2x - pixel 1 because it's covered with half-pixel offset that
|
// host pixel closer to the center of the guest pixel, but one that's
|
||||||
// becomes full-pixel.
|
// covered with the half-pixel offset according to the top-left rule (1
|
||||||
// For 3x - also pixel 1 because it's still covered with half-pixel
|
// for 2x because 0 isn't covered with the half-pixel offset, 1 for 3x
|
||||||
// offset, but close to the center.
|
// because it's the center and is covered with the half-pixel offset too).
|
||||||
// If X needs resolution scaling, writing 1 or 0 - whether the column is
|
// Using control_temp.yz as per-axis temporary variables.
|
||||||
// the one where memexport should be done - to control_temp.y.
|
|
||||||
// For Y, doing that to control_temp.z.
|
|
||||||
// Then, if both axes are resolution-scaled, merging the conditions for
|
|
||||||
// the two.
|
|
||||||
in_position_used_ |= resolution_scaled_axes;
|
in_position_used_ |= resolution_scaled_axes;
|
||||||
a_.OpFToU(
|
a_.OpFToU(
|
||||||
dxbc::Dest::R(control_temp, resolution_scaled_axes << 1),
|
dxbc::Dest::R(control_temp, resolution_scaled_axes << 1),
|
||||||
dxbc::Src::V1D(uint32_t(InOutRegister::kPSInPosition), 0b0100 << 2));
|
dxbc::Src::V1D(uint32_t(InOutRegister::kPSInPosition), 0b0100 << 2));
|
||||||
dxbc::Dest resolution_scaling_temp_dest(
|
a_.OpUDiv(dxbc::Dest::Null(),
|
||||||
dxbc::Dest::R(control_temp, 0b1000));
|
dxbc::Dest::R(control_temp, resolution_scaled_axes << 1),
|
||||||
dxbc::Src resolution_scaling_temp_src(
|
dxbc::Src::R(control_temp, 0b1001 << 2),
|
||||||
dxbc::Src::R(control_temp, dxbc::Src::kWWWW));
|
dxbc::Src::LU(0, draw_resolution_scale_x_,
|
||||||
|
draw_resolution_scale_y_, 0));
|
||||||
for (uint32_t i = 0; i < 2; ++i) {
|
for (uint32_t i = 0; i < 2; ++i) {
|
||||||
if (!(resolution_scaled_axes & (1 << i))) {
|
if (!(resolution_scaled_axes & (1 << i))) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// If there's no inner condition in control_temp.x yet, the condition
|
// If there's no inner condition in control_temp.x yet, the condition
|
||||||
// for the current axis can go directly to it. Otherwise, need to merge
|
// for the current axis can go directly to it. Otherwise, need to merge
|
||||||
// with the previous condition, using control_temp.w as an intermediate
|
// with the previous condition, using control_temp.y or .z as an
|
||||||
// variable.
|
// intermediate variable.
|
||||||
dxbc::Dest resolution_scaled_axis_result(
|
|
||||||
inner_condition_provided ? resolution_scaling_temp_dest
|
|
||||||
: dxbc::Dest::R(control_temp, 0b0001));
|
|
||||||
dxbc::Src resolution_scaled_axis_src(
|
dxbc::Src resolution_scaled_axis_src(
|
||||||
dxbc::Src::R(control_temp).Select(1 + i));
|
dxbc::Src::R(control_temp).Select(1 + i));
|
||||||
uint32_t axis_resolution_scale =
|
a_.OpIEq(
|
||||||
i ? draw_resolution_scale_y_ : draw_resolution_scale_x_;
|
dxbc::Dest::R(control_temp,
|
||||||
static_assert(
|
inner_condition_provided ? 1 << (1 + i) : 0b0001),
|
||||||
TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3,
|
resolution_scaled_axis_src,
|
||||||
"DxbcShaderTranslator memexport draw resolution scaling "
|
dxbc::Src::LU(
|
||||||
"conditional generation supports draw resolution scaling factors "
|
(i ? draw_resolution_scale_y_ : draw_resolution_scale_x_) >>
|
||||||
"of only up to 3");
|
1));
|
||||||
switch (axis_resolution_scale) {
|
|
||||||
case 2:
|
|
||||||
// xy & 1 == 1.
|
|
||||||
a_.OpAnd(resolution_scaled_axis_result, resolution_scaled_axis_src,
|
|
||||||
dxbc::Src::LU(1));
|
|
||||||
// No need to do IEq - already 1 for right / bottom, 0 for left /
|
|
||||||
// top.
|
|
||||||
break;
|
|
||||||
case 3:
|
|
||||||
// xy % 3 == 1.
|
|
||||||
a_.OpUMul(resolution_scaling_temp_dest, dxbc::Dest::Null(),
|
|
||||||
resolution_scaled_axis_src,
|
|
||||||
dxbc::Src::LU(draw_util::kDivideScale3));
|
|
||||||
a_.OpUShR(resolution_scaling_temp_dest, resolution_scaling_temp_src,
|
|
||||||
dxbc::Src::LU(draw_util::kDivideUpperShift3));
|
|
||||||
a_.OpIMAd(resolution_scaling_temp_dest, resolution_scaling_temp_src,
|
|
||||||
dxbc::Src::LI(-3), resolution_scaled_axis_src);
|
|
||||||
a_.OpIEq(resolution_scaled_axis_result, resolution_scaling_temp_src,
|
|
||||||
dxbc::Src::LU(1));
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
assert_unhandled_case(axis_resolution_scale);
|
|
||||||
}
|
|
||||||
if (inner_condition_provided) {
|
if (inner_condition_provided) {
|
||||||
// Merge with the previous condition in control_temp.x.
|
// Merge with the previous condition in control_temp.x.
|
||||||
a_.OpAnd(dxbc::Dest::R(control_temp, 0b0001),
|
a_.OpAnd(dxbc::Dest::R(control_temp, 0b0001),
|
||||||
dxbc::Src::R(control_temp, dxbc::Src::kXXXX),
|
dxbc::Src::R(control_temp, dxbc::Src::kXXXX),
|
||||||
resolution_scaling_temp_src);
|
resolution_scaled_axis_src);
|
||||||
}
|
}
|
||||||
inner_condition_provided = true;
|
inner_condition_provided = true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -190,91 +190,22 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
|
||||||
// dividing by 40, not by 80.
|
// dividing by 40, not by 80.
|
||||||
// For depth-only:
|
// For depth-only:
|
||||||
// Same, but for full 80x16 tiles, not 40x16 half-tiles.
|
// Same, but for full 80x16 tiles, not 40x16 half-tiles.
|
||||||
uint32_t tile_or_half_tile_width = 80 * draw_resolution_scale_x_;
|
uint32_t tile_width =
|
||||||
uint32_t tile_or_half_tile_width_divide_scale;
|
xenos::kEdramTileWidthSamples * draw_resolution_scale_x_;
|
||||||
uint32_t tile_or_half_tile_width_divide_upper_shift;
|
uint32_t tile_or_tile_half_width =
|
||||||
draw_util::GetEdramTileWidthDivideScaleAndUpperShift(
|
tile_width >> uint32_t(any_color_targets_written);
|
||||||
draw_resolution_scale_x_, tile_or_half_tile_width_divide_scale,
|
uint32_t tile_height =
|
||||||
tile_or_half_tile_width_divide_upper_shift);
|
xenos::kEdramTileHeightSamples * draw_resolution_scale_y_;
|
||||||
if (any_color_targets_written) {
|
// system_temp_rov_params_.x = X sample 0 position within the half-tile or
|
||||||
tile_or_half_tile_width >>= 1;
|
// tile
|
||||||
assert_not_zero(tile_or_half_tile_width_divide_upper_shift);
|
// system_temp_rov_params_.y = Y sample 0 position within the (half-)tile
|
||||||
--tile_or_half_tile_width_divide_upper_shift;
|
// system_temp_rov_params_.z = X half-tile or tile position
|
||||||
}
|
// system_temp_rov_params_.w = Y tile position
|
||||||
static_assert(
|
a_.OpUDiv(dxbc::Dest::R(system_temp_rov_params_, 0b1100),
|
||||||
TextureCache::kMaxDrawResolutionScaleAlongAxis <= 3,
|
dxbc::Dest::R(system_temp_rov_params_, 0b0011),
|
||||||
"DxbcShaderTranslator ROV sample address calculation supports Y draw "
|
dxbc::Src::R(system_temp_rov_params_, 0b01000100),
|
||||||
"resolution scaling factors of only up to 3");
|
dxbc::Src::LU(tile_or_tile_half_width, tile_height,
|
||||||
if (draw_resolution_scale_y_ == 3) {
|
tile_or_tile_half_width, tile_height));
|
||||||
// Multiplication part of the division by 40|80 x 16 x scale (specifically
|
|
||||||
// 40|80 * scale width here, and 48 height, or 16 * 3 height).
|
|
||||||
// system_temp_rov_params_.x = X sample 0 position
|
|
||||||
// system_temp_rov_params_.y = Y sample 0 position
|
|
||||||
// system_temp_rov_params_.z = (X * tile_or_half_tile_width_divide_scale) >>
|
|
||||||
// 32
|
|
||||||
// system_temp_rov_params_.w = (Y * kDivideScale3) >> 32
|
|
||||||
a_.OpUMul(dxbc::Dest::R(system_temp_rov_params_, 0b1100),
|
|
||||||
dxbc::Dest::Null(),
|
|
||||||
dxbc::Src::R(system_temp_rov_params_, 0b0100 << 4),
|
|
||||||
dxbc::Src::LU(0, 0, tile_or_half_tile_width_divide_scale,
|
|
||||||
draw_util::kDivideScale3));
|
|
||||||
// Shift part of the division by 40|80 x 16 x scale.
|
|
||||||
// system_temp_rov_params_.x = X sample 0 position
|
|
||||||
// system_temp_rov_params_.y = Y sample 0 position
|
|
||||||
// system_temp_rov_params_.z = X half-tile or tile position
|
|
||||||
// system_temp_rov_params_.w = Y tile position
|
|
||||||
a_.OpUShR(dxbc::Dest::R(system_temp_rov_params_, 0b1100),
|
|
||||||
dxbc::Src::R(system_temp_rov_params_),
|
|
||||||
dxbc::Src::LU(0, 0, tile_or_half_tile_width_divide_upper_shift,
|
|
||||||
draw_util::kDivideUpperShift3 + 4));
|
|
||||||
// Take the remainder of the performed division to
|
|
||||||
// system_temp_rov_params_.xy.
|
|
||||||
// system_temp_rov_params_.x = X sample 0 position within the half-tile
|
|
||||||
// system_temp_rov_params_.y = Y sample 0 position within the (half-)tile
|
|
||||||
// system_temp_rov_params_.z = X half-tile or tile position
|
|
||||||
// system_temp_rov_params_.w = Y tile position
|
|
||||||
a_.OpIMAd(dxbc::Dest::R(system_temp_rov_params_, 0b0011),
|
|
||||||
dxbc::Src::R(system_temp_rov_params_, 0b1110),
|
|
||||||
dxbc::Src::LI(-int32_t(tile_or_half_tile_width),
|
|
||||||
-16 * draw_resolution_scale_y_, 0, 0),
|
|
||||||
dxbc::Src::R(system_temp_rov_params_));
|
|
||||||
} else {
|
|
||||||
assert_true(draw_resolution_scale_y_ <= 2);
|
|
||||||
// Multiplication part of the division of X by 40|80 * scale.
|
|
||||||
// system_temp_rov_params_.x = X sample 0 position
|
|
||||||
// system_temp_rov_params_.y = Y sample 0 position
|
|
||||||
// system_temp_rov_params_.z = (X * tile_or_half_tile_width_divide_scale) >>
|
|
||||||
// 32
|
|
||||||
a_.OpUMul(dxbc::Dest::R(system_temp_rov_params_, 0b0100),
|
|
||||||
dxbc::Dest::Null(),
|
|
||||||
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX),
|
|
||||||
dxbc::Src::LU(tile_or_half_tile_width_divide_scale));
|
|
||||||
// Shift part of the division of X by 40 * scale, division of Y by
|
|
||||||
// 16 * scale as it's power of two in this case.
|
|
||||||
// system_temp_rov_params_.x = X sample 0 position
|
|
||||||
// system_temp_rov_params_.y = Y sample 0 position
|
|
||||||
// system_temp_rov_params_.z = X half-tile or tile position
|
|
||||||
// system_temp_rov_params_.w = Y tile position
|
|
||||||
a_.OpUShR(dxbc::Dest::R(system_temp_rov_params_, 0b1100),
|
|
||||||
dxbc::Src::R(system_temp_rov_params_, 0b0110 << 4),
|
|
||||||
dxbc::Src::LU(0, 0, tile_or_half_tile_width_divide_upper_shift,
|
|
||||||
draw_resolution_scale_y_ == 2 ? 5 : 4));
|
|
||||||
// Take the remainder of the performed division (via multiply-subtract for
|
|
||||||
// X, via AND for Y which is power-of-two here) to
|
|
||||||
// system_temp_rov_params_.xy.
|
|
||||||
// system_temp_rov_params_.x = X sample 0 position within the half-tile or
|
|
||||||
// tile
|
|
||||||
// system_temp_rov_params_.y = Y sample 0 position within the (half-)tile
|
|
||||||
// system_temp_rov_params_.z = X half-tile or tile position
|
|
||||||
// system_temp_rov_params_.w = Y tile position
|
|
||||||
a_.OpIMAd(dxbc::Dest::R(system_temp_rov_params_, 0b0001),
|
|
||||||
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kZZZZ),
|
|
||||||
dxbc::Src::LI(-int32_t(tile_or_half_tile_width)),
|
|
||||||
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX));
|
|
||||||
a_.OpAnd(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
|
|
||||||
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
|
|
||||||
dxbc::Src::LU((16 * draw_resolution_scale_y_) - 1));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Convert the Y sample 0 position within the half-tile or tile to the dword
|
// Convert the Y sample 0 position within the half-tile or tile to the dword
|
||||||
// offset of the row within a 80x16 32bpp tile or a 40x16 64bpp half-tile to
|
// offset of the row within a 80x16 32bpp tile or a 40x16 64bpp half-tile to
|
||||||
|
@ -287,8 +218,10 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
|
||||||
// system_temp_rov_params_.w = Y tile position
|
// system_temp_rov_params_.w = Y tile position
|
||||||
a_.OpUMul(dxbc::Dest::Null(), dxbc::Dest::R(system_temp_rov_params_, 0b0010),
|
a_.OpUMul(dxbc::Dest::Null(), dxbc::Dest::R(system_temp_rov_params_, 0b0010),
|
||||||
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
|
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
|
||||||
dxbc::Src::LU(80 * draw_resolution_scale_x_));
|
dxbc::Src::LU(tile_width));
|
||||||
|
|
||||||
|
uint32_t tile_size = tile_width * tile_height;
|
||||||
|
uint32_t tile_half_width = tile_width >> 1;
|
||||||
if (any_color_targets_written) {
|
if (any_color_targets_written) {
|
||||||
// Depth, 32bpp color, 64bpp color are all needed.
|
// Depth, 32bpp color, 64bpp color are all needed.
|
||||||
|
|
||||||
|
@ -336,12 +269,10 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
|
||||||
// system_temp_rov_params_.w = Y tile row dword origin in a 32bpp surface
|
// system_temp_rov_params_.w = Y tile row dword origin in a 32bpp surface
|
||||||
// rov_address_temp.x = dword offset of the beginning of the row of samples
|
// rov_address_temp.x = dword offset of the beginning of the row of samples
|
||||||
// within a row of 32bpp tiles
|
// within a row of 32bpp tiles
|
||||||
a_.OpUMAd(
|
a_.OpUMAd(dxbc::Dest::R(rov_address_temp, 0b0001),
|
||||||
dxbc::Dest::R(rov_address_temp, 0b0001),
|
dxbc::Src::R(rov_address_temp, dxbc::Src::kXXXX),
|
||||||
dxbc::Src::R(rov_address_temp, dxbc::Src::kXXXX),
|
dxbc::Src::LU(tile_size),
|
||||||
dxbc::Src::LU(80 * 16 *
|
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY));
|
||||||
(draw_resolution_scale_x_ * draw_resolution_scale_y_)),
|
|
||||||
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY));
|
|
||||||
// Get the dword offset of the beginning of the row of samples within a
|
// Get the dword offset of the beginning of the row of samples within a
|
||||||
// 32bpp surface to rov_address_temp.x.
|
// 32bpp surface to rov_address_temp.x.
|
||||||
// system_temp_rov_params_.x = X sample 0 position within the half-tile
|
// system_temp_rov_params_.x = X sample 0 position within the half-tile
|
||||||
|
@ -365,12 +296,10 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
|
||||||
// system_temp_rov_params_.w = Y tile row dword origin in a 32bpp surface
|
// system_temp_rov_params_.w = Y tile row dword origin in a 32bpp surface
|
||||||
// rov_address_temp.x = dword offset of the beginning of the row of samples
|
// rov_address_temp.x = dword offset of the beginning of the row of samples
|
||||||
// within a 32bpp surface
|
// within a 32bpp surface
|
||||||
a_.OpUMAd(
|
a_.OpUMAd(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
|
||||||
dxbc::Dest::R(system_temp_rov_params_, 0b0010),
|
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kZZZZ),
|
||||||
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kZZZZ),
|
dxbc::Src::LU(tile_size),
|
||||||
dxbc::Src::LU(80 * 16 *
|
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY));
|
||||||
(draw_resolution_scale_x_ * draw_resolution_scale_y_)),
|
|
||||||
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY));
|
|
||||||
// Get the dword offset of the beginning of the row of samples within a
|
// Get the dword offset of the beginning of the row of samples within a
|
||||||
// 64bpp surface to system_temp_rov_params_.w (last time the Y tile row
|
// 64bpp surface to system_temp_rov_params_.w (last time the Y tile row
|
||||||
// offset is needed).
|
// offset is needed).
|
||||||
|
@ -420,7 +349,7 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
|
||||||
// within a 32bpp surface
|
// within a 32bpp surface
|
||||||
a_.OpUMAd(dxbc::Dest::R(system_temp_rov_params_, 0b0100),
|
a_.OpUMAd(dxbc::Dest::R(system_temp_rov_params_, 0b0100),
|
||||||
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
|
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
|
||||||
dxbc::Src::LU(40 * draw_resolution_scale_x_),
|
dxbc::Src::LU(tile_half_width),
|
||||||
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX));
|
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX));
|
||||||
// Get the final offset of the sample 0 within a 32bpp color surface to
|
// Get the final offset of the sample 0 within a 32bpp color surface to
|
||||||
// system_temp_rov_params_.z (last time the 32bpp row offset is needed).
|
// system_temp_rov_params_.z (last time the 32bpp row offset is needed).
|
||||||
|
@ -439,8 +368,8 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
|
||||||
// system_temp_rov_params_.w = dword sample 0 offset within a 64bpp surface
|
// system_temp_rov_params_.w = dword sample 0 offset within a 64bpp surface
|
||||||
a_.OpMovC(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
|
a_.OpMovC(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
|
||||||
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
|
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
|
||||||
dxbc::Src::LI(-40 * draw_resolution_scale_x_),
|
dxbc::Src::LI(-int32_t(tile_half_width)),
|
||||||
dxbc::Src::LI(40 * draw_resolution_scale_x_));
|
dxbc::Src::LI(int32_t(tile_half_width)));
|
||||||
// Flip the 40x16 half-tiles for depth / stencil as opposed to 32bpp color -
|
// Flip the 40x16 half-tiles for depth / stencil as opposed to 32bpp color -
|
||||||
// get the final offset of the sample 0 within a 32bpp depth / stencil
|
// get the final offset of the sample 0 within a 32bpp depth / stencil
|
||||||
// surface to system_temp_rov_params_.y.
|
// surface to system_temp_rov_params_.y.
|
||||||
|
@ -466,12 +395,10 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
|
||||||
// system_temp_rov_params_.z = dword offset of the beginning of the row of
|
// system_temp_rov_params_.z = dword offset of the beginning of the row of
|
||||||
// samples within a row of 32bpp tiles
|
// samples within a row of 32bpp tiles
|
||||||
// system_temp_rov_params_.w = Y tile position
|
// system_temp_rov_params_.w = Y tile position
|
||||||
a_.OpUMAd(
|
a_.OpUMAd(dxbc::Dest::R(system_temp_rov_params_, 0b0100),
|
||||||
dxbc::Dest::R(system_temp_rov_params_, 0b0100),
|
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kZZZZ),
|
||||||
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kZZZZ),
|
dxbc::Src::LU(tile_size),
|
||||||
dxbc::Src::LU(80 * 16 *
|
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY));
|
||||||
(draw_resolution_scale_x_ * draw_resolution_scale_y_)),
|
|
||||||
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY));
|
|
||||||
// Get the dword offset of the beginning of the row of samples within a
|
// Get the dword offset of the beginning of the row of samples within a
|
||||||
// 32bpp surface to system_temp_rov_params_.y (last time anything Y-related
|
// 32bpp surface to system_temp_rov_params_.y (last time anything Y-related
|
||||||
// is needed, as well as the sample row offset within the tile row).
|
// is needed, as well as the sample row offset within the tile row).
|
||||||
|
@ -502,15 +429,15 @@ void DxbcShaderTranslator::StartPixelShader_LoadROVParameters() {
|
||||||
// otherwise
|
// otherwise
|
||||||
a_.OpUGE(dxbc::Dest::R(system_temp_rov_params_, 0b0001),
|
a_.OpUGE(dxbc::Dest::R(system_temp_rov_params_, 0b0001),
|
||||||
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX),
|
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX),
|
||||||
dxbc::Src::LU(40 * draw_resolution_scale_x_));
|
dxbc::Src::LU(tile_half_width));
|
||||||
// Flip the 40x16 half-tiles for depth / stencil as opposed to 32bpp color -
|
// Flip the 40x16 half-tiles for depth / stencil as opposed to 32bpp color -
|
||||||
// get the dword offset to add for flipping to system_temp_rov_params_.x.
|
// get the dword offset to add for flipping to system_temp_rov_params_.x.
|
||||||
// system_temp_rov_params_.x = depth half-tile flipping offset
|
// system_temp_rov_params_.x = depth half-tile flipping offset
|
||||||
// system_temp_rov_params_.y = dword sample 0 offset within a 32bpp surface
|
// system_temp_rov_params_.y = dword sample 0 offset within a 32bpp surface
|
||||||
a_.OpMovC(dxbc::Dest::R(system_temp_rov_params_, 0b0001),
|
a_.OpMovC(dxbc::Dest::R(system_temp_rov_params_, 0b0001),
|
||||||
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX),
|
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kXXXX),
|
||||||
dxbc::Src::LI(-40 * draw_resolution_scale_x_),
|
dxbc::Src::LI(-int32_t(tile_half_width)),
|
||||||
dxbc::Src::LI(40 * draw_resolution_scale_x_));
|
dxbc::Src::LI(int32_t(tile_half_width)));
|
||||||
// Flip the 40x16 half-tiles for depth / stencil as opposed to 32bpp color -
|
// Flip the 40x16 half-tiles for depth / stencil as opposed to 32bpp color -
|
||||||
// get the final offset of the sample 0 within a 32bpp depth / stencil
|
// get the final offset of the sample 0 within a 32bpp depth / stencil
|
||||||
// surface to system_temp_rov_params_.y.
|
// surface to system_temp_rov_params_.y.
|
||||||
|
@ -1288,10 +1215,12 @@ void DxbcShaderTranslator::ROV_DepthStencilTest() {
|
||||||
// Go to the next sample (samples are at +0, +(80*scale_x), +1,
|
// Go to the next sample (samples are at +0, +(80*scale_x), +1,
|
||||||
// +(80*scale_x+1), so need to do +(80*scale_x), -(80*scale_x-1),
|
// +(80*scale_x+1), so need to do +(80*scale_x), -(80*scale_x-1),
|
||||||
// +(80*scale_x) and -(80*scale_x+1) after each sample).
|
// +(80*scale_x) and -(80*scale_x+1) after each sample).
|
||||||
|
uint32_t tile_width =
|
||||||
|
xenos::kEdramTileWidthSamples * draw_resolution_scale_x_;
|
||||||
a_.OpIAdd(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
|
a_.OpIAdd(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
|
||||||
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
|
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
|
||||||
dxbc::Src::LI((i & 1) ? -80 * draw_resolution_scale_x_ + 2 - i
|
dxbc::Src::LI((i & 1) ? -int32_t(tile_width) + 2 - i
|
||||||
: 80 * draw_resolution_scale_x_));
|
: int32_t(tile_width)));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ROV_IsDepthStencilEarly()) {
|
if (ROV_IsDepthStencilEarly()) {
|
||||||
|
@ -2181,6 +2110,9 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
|
||||||
dxbc::Dest temp_w_dest(dxbc::Dest::R(temp, 0b1000));
|
dxbc::Dest temp_w_dest(dxbc::Dest::R(temp, 0b1000));
|
||||||
dxbc::Src temp_w_src(dxbc::Src::R(temp, dxbc::Src::kWWWW));
|
dxbc::Src temp_w_src(dxbc::Src::R(temp, dxbc::Src::kWWWW));
|
||||||
|
|
||||||
|
uint32_t tile_width =
|
||||||
|
xenos::kEdramTileWidthSamples * draw_resolution_scale_x_;
|
||||||
|
|
||||||
// Do late depth/stencil test (which includes writing) if needed or deferred
|
// Do late depth/stencil test (which includes writing) if needed or deferred
|
||||||
// depth writing.
|
// depth writing.
|
||||||
if (ROV_IsDepthStencilEarly()) {
|
if (ROV_IsDepthStencilEarly()) {
|
||||||
|
@ -2212,8 +2144,8 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
|
||||||
if (i < 3) {
|
if (i < 3) {
|
||||||
a_.OpIAdd(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
|
a_.OpIAdd(dxbc::Dest::R(system_temp_rov_params_, 0b0010),
|
||||||
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
|
dxbc::Src::R(system_temp_rov_params_, dxbc::Src::kYYYY),
|
||||||
dxbc::Src::LI((i & 1) ? -80 * draw_resolution_scale_x_ + 2 - i
|
dxbc::Src::LI((i & 1) ? -int32_t(tile_width) + 2 - i
|
||||||
: 80 * draw_resolution_scale_x_));
|
: int32_t(tile_width)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -3021,8 +2953,7 @@ void DxbcShaderTranslator::CompletePixelShader_WriteToROV() {
|
||||||
// +(80*scale_x+1), so need to do +(80*scale_x), -(80*scale_x-1),
|
// +(80*scale_x+1), so need to do +(80*scale_x), -(80*scale_x-1),
|
||||||
// +(80*scale_x) and -(80*scale_x+1) after each sample).
|
// +(80*scale_x) and -(80*scale_x+1) after each sample).
|
||||||
int32_t next_sample_distance =
|
int32_t next_sample_distance =
|
||||||
(j & 1) ? -80 * draw_resolution_scale_x_ + 2 - j
|
(j & 1) ? -int32_t(tile_width) + 2 - j : int32_t(tile_width);
|
||||||
: 80 * draw_resolution_scale_x_;
|
|
||||||
a_.OpIAdd(
|
a_.OpIAdd(
|
||||||
dxbc::Dest::R(system_temp_rov_params_, 0b1100),
|
dxbc::Dest::R(system_temp_rov_params_, 0b1100),
|
||||||
dxbc::Src::R(system_temp_rov_params_),
|
dxbc::Src::R(system_temp_rov_params_),
|
||||||
|
|
Loading…
Reference in New Issue