forked from ShuriZma/suyu
DMA & InlineToMemory Engines Rework.
This commit is contained in:
parent
b2099fbdcc
commit
f5fd6b5c86
|
@ -24,4 +24,12 @@ template <class ForwardIt, class T, class Compare = std::less<>>
|
||||||
return first != last && !comp(value, *first) ? first : last;
|
return first != last && !comp(value, *first) ? first : last;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Folds a pack of arguments into a single accumulated value.
 *
 * Starting from `initial_value`, every argument is combined into the accumulator with
 * `value = func(value, arg)`. The arguments are consumed in the order they are written
 * (first to last), because the comma operator sequences its operands left to right.
 *
 * @param initial_value Seed for the accumulator; also returned as-is when no arguments are given.
 * @param func          Binary callable invoked as `func(accumulator, argument)`; its result must
 *                      be assignable to `T`.
 * @param args          Values folded into the accumulator, each passed to `func` by value.
 * @return The accumulator after every argument has been applied.
 */
template <typename T, typename Func, typename... Args>
T FoldRight(T initial_value, Func&& func, Args&&... args) {
    T value{initial_value};
    // A generic lambda is used on purpose: spelling the parameter as `<typename T>(T x)` would
    // redeclare this function template's own `T`, which is ill-formed.
    const auto high_func = [&value, &func](auto x) { value = func(value, x); };
    (std::invoke(high_func, std::forward<Args>(args)), ...);
    return value;
}
|
||||||
|
|
||||||
} // namespace Common
|
} // namespace Common
|
||||||
|
|
|
@ -126,7 +126,7 @@ public:
|
||||||
|
|
||||||
void DownloadMemory(VAddr cpu_addr, u64 size);
|
void DownloadMemory(VAddr cpu_addr, u64 size);
|
||||||
|
|
||||||
bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<u8> inlined_buffer);
|
bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<const u8> inlined_buffer);
|
||||||
|
|
||||||
void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size);
|
void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size);
|
||||||
|
|
||||||
|
@ -1685,7 +1685,7 @@ void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
|
||||||
|
|
||||||
template <class P>
|
template <class P>
|
||||||
bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size,
|
bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size,
|
||||||
std::span<u8> inlined_buffer) {
|
std::span<const u8> inlined_buffer) {
|
||||||
const bool is_dirty = IsRegionRegistered(dest_address, copy_size);
|
const bool is_dirty = IsRegionRegistered(dest_address, copy_size);
|
||||||
if (!is_dirty) {
|
if (!is_dirty) {
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -3,6 +3,7 @@
|
||||||
|
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
|
||||||
|
#include "common/algorithm.h"
|
||||||
#include "common/assert.h"
|
#include "common/assert.h"
|
||||||
#include "video_core/engines/engine_upload.h"
|
#include "video_core/engines/engine_upload.h"
|
||||||
#include "video_core/memory_manager.h"
|
#include "video_core/memory_manager.h"
|
||||||
|
@ -34,21 +35,48 @@ void State::ProcessData(const u32 data, const bool is_last_call) {
|
||||||
if (!is_last_call) {
|
if (!is_last_call) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
ProcessData(inner_buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
void State::ProcessData(const u32* data, size_t num_data) {
|
||||||
|
std::span<const u8> read_buffer(reinterpret_cast<const u8*>(data), num_data * sizeof(u32));
|
||||||
|
ProcessData(read_buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
void State::ProcessData(std::span<const u8> read_buffer) {
|
||||||
const GPUVAddr address{regs.dest.Address()};
|
const GPUVAddr address{regs.dest.Address()};
|
||||||
if (is_linear) {
|
if (is_linear) {
|
||||||
rasterizer->AccelerateInlineToMemory(address, copy_size, inner_buffer);
|
if (regs.line_count == 1) {
|
||||||
|
rasterizer->AccelerateInlineToMemory(address, copy_size, read_buffer);
|
||||||
|
} else {
|
||||||
|
for (u32 line = 0; line < regs.line_count; ++line) {
|
||||||
|
const GPUVAddr dest_line = address + static_cast<size_t>(line) * regs.dest.pitch;
|
||||||
|
memory_manager.WriteBlockUnsafe(
|
||||||
|
dest_line, read_buffer.data() + static_cast<size_t>(line) * regs.line_length_in,
|
||||||
|
regs.line_length_in);
|
||||||
|
}
|
||||||
|
memory_manager.InvalidateRegion(address, regs.dest.pitch * regs.line_count);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
UNIMPLEMENTED_IF(regs.dest.z != 0);
|
u32 width = regs.dest.width;
|
||||||
UNIMPLEMENTED_IF(regs.dest.depth != 1);
|
u32 x_elements = regs.line_length_in;
|
||||||
UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 0);
|
u32 x_offset = regs.dest.x;
|
||||||
UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 0);
|
const u32 bpp_shift = Common::FoldRight(
|
||||||
|
4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); },
|
||||||
|
width, x_elements, x_offset, static_cast<u32>(address));
|
||||||
|
width >>= bpp_shift;
|
||||||
|
x_elements >>= bpp_shift;
|
||||||
|
x_offset >>= bpp_shift;
|
||||||
|
const u32 bytes_per_pixel = 1U << bpp_shift;
|
||||||
const std::size_t dst_size = Tegra::Texture::CalculateSize(
|
const std::size_t dst_size = Tegra::Texture::CalculateSize(
|
||||||
true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 0);
|
true, bytes_per_pixel, width, regs.dest.height, regs.dest.depth,
|
||||||
|
regs.dest.BlockHeight(), regs.dest.BlockDepth());
|
||||||
tmp_buffer.resize(dst_size);
|
tmp_buffer.resize(dst_size);
|
||||||
memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size);
|
memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size);
|
||||||
Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x, regs.dest.y,
|
Tegra::Texture::SwizzleSubrect(tmp_buffer, read_buffer, bytes_per_pixel, width,
|
||||||
regs.dest.BlockHeight(), copy_size, inner_buffer.data(),
|
regs.dest.height, regs.dest.depth, x_offset, regs.dest.y,
|
||||||
tmp_buffer.data());
|
x_elements, regs.line_count, regs.dest.BlockHeight(),
|
||||||
|
regs.dest.BlockDepth(), regs.line_length_in);
|
||||||
memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
|
memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,6 +3,7 @@
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include <span>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include "common/bit_field.h"
|
#include "common/bit_field.h"
|
||||||
#include "common/common_types.h"
|
#include "common/common_types.h"
|
||||||
|
@ -33,7 +34,7 @@ struct Registers {
|
||||||
u32 width;
|
u32 width;
|
||||||
u32 height;
|
u32 height;
|
||||||
u32 depth;
|
u32 depth;
|
||||||
u32 z;
|
u32 layer;
|
||||||
u32 x;
|
u32 x;
|
||||||
u32 y;
|
u32 y;
|
||||||
|
|
||||||
|
@ -62,11 +63,14 @@ public:
|
||||||
|
|
||||||
void ProcessExec(bool is_linear_);
|
void ProcessExec(bool is_linear_);
|
||||||
void ProcessData(u32 data, bool is_last_call);
|
void ProcessData(u32 data, bool is_last_call);
|
||||||
|
void ProcessData(const u32* data, size_t num_data);
|
||||||
|
|
||||||
/// Binds a rasterizer to this engine.
|
/// Binds a rasterizer to this engine.
|
||||||
void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);
|
void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
void ProcessData(std::span<const u8> read_buffer);
|
||||||
|
|
||||||
u32 write_offset = 0;
|
u32 write_offset = 0;
|
||||||
u32 copy_size = 0;
|
u32 copy_size = 0;
|
||||||
std::vector<u8> inner_buffer;
|
std::vector<u8> inner_buffer;
|
||||||
|
|
|
@ -36,8 +36,6 @@ void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_cal
|
||||||
}
|
}
|
||||||
case KEPLER_COMPUTE_REG_INDEX(data_upload): {
|
case KEPLER_COMPUTE_REG_INDEX(data_upload): {
|
||||||
upload_state.ProcessData(method_argument, is_last_call);
|
upload_state.ProcessData(method_argument, is_last_call);
|
||||||
if (is_last_call) {
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case KEPLER_COMPUTE_REG_INDEX(launch):
|
case KEPLER_COMPUTE_REG_INDEX(launch):
|
||||||
|
@ -50,8 +48,15 @@ void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_cal
|
||||||
|
|
||||||
void KeplerCompute::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
|
void KeplerCompute::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
|
||||||
u32 methods_pending) {
|
u32 methods_pending) {
|
||||||
for (std::size_t i = 0; i < amount; i++) {
|
switch (method) {
|
||||||
CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
|
case KEPLER_COMPUTE_REG_INDEX(data_upload):
|
||||||
|
upload_state.ProcessData(base_start, static_cast<size_t>(amount));
|
||||||
|
return;
|
||||||
|
default:
|
||||||
|
for (std::size_t i = 0; i < amount; i++) {
|
||||||
|
CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
|
||||||
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -33,8 +33,6 @@ void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call
|
||||||
}
|
}
|
||||||
case KEPLERMEMORY_REG_INDEX(data): {
|
case KEPLERMEMORY_REG_INDEX(data): {
|
||||||
upload_state.ProcessData(method_argument, is_last_call);
|
upload_state.ProcessData(method_argument, is_last_call);
|
||||||
if (is_last_call) {
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -42,8 +40,15 @@ void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call
|
||||||
|
|
||||||
void KeplerMemory::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
|
void KeplerMemory::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
|
||||||
u32 methods_pending) {
|
u32 methods_pending) {
|
||||||
for (std::size_t i = 0; i < amount; i++) {
|
switch (method) {
|
||||||
CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
|
case KEPLERMEMORY_REG_INDEX(data):
|
||||||
|
upload_state.ProcessData(base_start, static_cast<size_t>(amount));
|
||||||
|
return;
|
||||||
|
default:
|
||||||
|
for (std::size_t i = 0; i < amount; i++) {
|
||||||
|
CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
|
||||||
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -239,8 +239,6 @@ void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argume
|
||||||
return upload_state.ProcessExec(regs.exec_upload.linear != 0);
|
return upload_state.ProcessExec(regs.exec_upload.linear != 0);
|
||||||
case MAXWELL3D_REG_INDEX(data_upload):
|
case MAXWELL3D_REG_INDEX(data_upload):
|
||||||
upload_state.ProcessData(argument, is_last_call);
|
upload_state.ProcessData(argument, is_last_call);
|
||||||
if (is_last_call) {
|
|
||||||
}
|
|
||||||
return;
|
return;
|
||||||
case MAXWELL3D_REG_INDEX(fragment_barrier):
|
case MAXWELL3D_REG_INDEX(fragment_barrier):
|
||||||
return rasterizer->FragmentBarrier();
|
return rasterizer->FragmentBarrier();
|
||||||
|
@ -316,6 +314,9 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
|
||||||
case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 15:
|
case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 15:
|
||||||
ProcessCBMultiData(base_start, amount);
|
ProcessCBMultiData(base_start, amount);
|
||||||
break;
|
break;
|
||||||
|
case MAXWELL3D_REG_INDEX(data_upload):
|
||||||
|
upload_state.ProcessData(base_start, static_cast<size_t>(amount));
|
||||||
|
return;
|
||||||
default:
|
default:
|
||||||
for (std::size_t i = 0; i < amount; i++) {
|
for (std::size_t i = 0; i < amount; i++) {
|
||||||
CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
|
CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
|
// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
|
||||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||||
|
|
||||||
|
#include "common/algorithm.h"
|
||||||
#include "common/assert.h"
|
#include "common/assert.h"
|
||||||
#include "common/logging/log.h"
|
#include "common/logging/log.h"
|
||||||
#include "common/microprofile.h"
|
#include "common/microprofile.h"
|
||||||
|
@ -54,8 +55,6 @@ void MaxwellDMA::Launch() {
|
||||||
const LaunchDMA& launch = regs.launch_dma;
|
const LaunchDMA& launch = regs.launch_dma;
|
||||||
ASSERT(launch.interrupt_type == LaunchDMA::InterruptType::NONE);
|
ASSERT(launch.interrupt_type == LaunchDMA::InterruptType::NONE);
|
||||||
ASSERT(launch.data_transfer_type == LaunchDMA::DataTransferType::NON_PIPELINED);
|
ASSERT(launch.data_transfer_type == LaunchDMA::DataTransferType::NON_PIPELINED);
|
||||||
ASSERT(regs.dst_params.origin.x == 0);
|
|
||||||
ASSERT(regs.dst_params.origin.y == 0);
|
|
||||||
|
|
||||||
const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH;
|
const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH;
|
||||||
const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH;
|
const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH;
|
||||||
|
@ -121,12 +120,13 @@ void MaxwellDMA::CopyPitchToPitch() {
|
||||||
|
|
||||||
void MaxwellDMA::CopyBlockLinearToPitch() {
|
void MaxwellDMA::CopyBlockLinearToPitch() {
|
||||||
UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0);
|
UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0);
|
||||||
UNIMPLEMENTED_IF(regs.src_params.block_size.depth != 0);
|
|
||||||
UNIMPLEMENTED_IF(regs.src_params.layer != 0);
|
UNIMPLEMENTED_IF(regs.src_params.layer != 0);
|
||||||
|
|
||||||
|
const bool is_remapping = regs.launch_dma.remap_enable != 0;
|
||||||
|
|
||||||
// Optimized path for micro copies.
|
// Optimized path for micro copies.
|
||||||
const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count;
|
const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count;
|
||||||
if (dst_size < GOB_SIZE && regs.pitch_out <= GOB_SIZE_X &&
|
if (!is_remapping && dst_size < GOB_SIZE && regs.pitch_out <= GOB_SIZE_X &&
|
||||||
regs.src_params.height > GOB_SIZE_Y) {
|
regs.src_params.height > GOB_SIZE_Y) {
|
||||||
FastCopyBlockLinearToPitch();
|
FastCopyBlockLinearToPitch();
|
||||||
return;
|
return;
|
||||||
|
@ -134,10 +134,27 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
|
||||||
|
|
||||||
// Deswizzle the input and copy it over.
|
// Deswizzle the input and copy it over.
|
||||||
UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0);
|
UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0);
|
||||||
const u32 bytes_per_pixel =
|
|
||||||
regs.launch_dma.remap_enable ? regs.pitch_out / regs.line_length_in : 1;
|
|
||||||
const Parameters& src_params = regs.src_params;
|
const Parameters& src_params = regs.src_params;
|
||||||
const u32 width = src_params.width;
|
|
||||||
|
const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1;
|
||||||
|
const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1;
|
||||||
|
|
||||||
|
const u32 base_bpp = !is_remapping ? 1U : num_remap_components * remap_components_size;
|
||||||
|
|
||||||
|
u32 width = src_params.width;
|
||||||
|
u32 x_elements = regs.line_length_in;
|
||||||
|
u32 x_offset = src_params.origin.x;
|
||||||
|
u32 bpp_shift = 0U;
|
||||||
|
if (!is_remapping) {
|
||||||
|
bpp_shift = Common::FoldRight(
|
||||||
|
4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); },
|
||||||
|
width, x_elements, x_offset, static_cast<u32>(regs.offset_in));
|
||||||
|
width >>= bpp_shift;
|
||||||
|
x_elements >>= bpp_shift;
|
||||||
|
x_offset >>= bpp_shift;
|
||||||
|
}
|
||||||
|
|
||||||
|
const u32 bytes_per_pixel = base_bpp << bpp_shift;
|
||||||
const u32 height = src_params.height;
|
const u32 height = src_params.height;
|
||||||
const u32 depth = src_params.depth;
|
const u32 depth = src_params.depth;
|
||||||
const u32 block_height = src_params.block_size.height;
|
const u32 block_height = src_params.block_size.height;
|
||||||
|
@ -155,30 +172,46 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
|
||||||
memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
|
memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
|
||||||
memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
|
memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
|
||||||
|
|
||||||
UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, width, bytes_per_pixel,
|
UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset,
|
||||||
block_height, src_params.origin.x, src_params.origin.y, write_buffer.data(),
|
src_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
|
||||||
read_buffer.data());
|
regs.pitch_out);
|
||||||
|
|
||||||
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
|
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
void MaxwellDMA::CopyPitchToBlockLinear() {
|
void MaxwellDMA::CopyPitchToBlockLinear() {
|
||||||
UNIMPLEMENTED_IF_MSG(regs.dst_params.block_size.width != 0, "Block width is not one");
|
UNIMPLEMENTED_IF_MSG(regs.dst_params.block_size.width != 0, "Block width is not one");
|
||||||
|
UNIMPLEMENTED_IF(regs.dst_params.layer != 0);
|
||||||
UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0);
|
UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0);
|
||||||
|
|
||||||
|
const bool is_remapping = regs.launch_dma.remap_enable != 0;
|
||||||
|
const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1;
|
||||||
|
const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1;
|
||||||
|
|
||||||
const auto& dst_params = regs.dst_params;
|
const auto& dst_params = regs.dst_params;
|
||||||
const u32 bytes_per_pixel =
|
|
||||||
regs.launch_dma.remap_enable ? regs.pitch_in / regs.line_length_in : 1;
|
const u32 base_bpp = !is_remapping ? 1U : num_remap_components * remap_components_size;
|
||||||
const u32 width = dst_params.width;
|
|
||||||
|
u32 width = dst_params.width;
|
||||||
|
u32 x_elements = regs.line_length_in;
|
||||||
|
u32 x_offset = dst_params.origin.x;
|
||||||
|
u32 bpp_shift = 0U;
|
||||||
|
if (!is_remapping) {
|
||||||
|
bpp_shift = Common::FoldRight(
|
||||||
|
4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); },
|
||||||
|
width, x_elements, x_offset, static_cast<u32>(regs.offset_out));
|
||||||
|
width >>= bpp_shift;
|
||||||
|
x_elements >>= bpp_shift;
|
||||||
|
x_offset >>= bpp_shift;
|
||||||
|
}
|
||||||
|
|
||||||
|
const u32 bytes_per_pixel = base_bpp << bpp_shift;
|
||||||
const u32 height = dst_params.height;
|
const u32 height = dst_params.height;
|
||||||
const u32 depth = dst_params.depth;
|
const u32 depth = dst_params.depth;
|
||||||
const u32 block_height = dst_params.block_size.height;
|
const u32 block_height = dst_params.block_size.height;
|
||||||
const u32 block_depth = dst_params.block_size.depth;
|
const u32 block_depth = dst_params.block_size.depth;
|
||||||
const size_t dst_size =
|
const size_t dst_size =
|
||||||
CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth);
|
CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth);
|
||||||
const size_t dst_layer_size =
|
|
||||||
CalculateSize(true, bytes_per_pixel, width, height, 1, block_height, block_depth);
|
|
||||||
|
|
||||||
const size_t src_size = static_cast<size_t>(regs.pitch_in) * regs.line_count;
|
const size_t src_size = static_cast<size_t>(regs.pitch_in) * regs.line_count;
|
||||||
|
|
||||||
if (read_buffer.size() < src_size) {
|
if (read_buffer.size() < src_size) {
|
||||||
|
@ -188,32 +221,23 @@ void MaxwellDMA::CopyPitchToBlockLinear() {
|
||||||
write_buffer.resize(dst_size);
|
write_buffer.resize(dst_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
|
||||||
if (Settings::IsGPULevelExtreme()) {
|
if (Settings::IsGPULevelExtreme()) {
|
||||||
memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
|
|
||||||
memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
|
memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
|
||||||
} else {
|
} else {
|
||||||
memory_manager.ReadBlockUnsafe(regs.offset_in, read_buffer.data(), src_size);
|
|
||||||
memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size);
|
memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
// If the input is linear and the output is tiled, swizzle the input and copy it over.
|
// If the input is linear and the output is tiled, swizzle the input and copy it over.
|
||||||
if (regs.dst_params.block_size.depth > 0) {
|
SwizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset,
|
||||||
ASSERT(dst_params.layer == 0);
|
dst_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
|
||||||
SwizzleSliceToVoxel(regs.line_length_in, regs.line_count, regs.pitch_in, width, height,
|
regs.pitch_in);
|
||||||
bytes_per_pixel, block_height, block_depth, dst_params.origin.x,
|
|
||||||
dst_params.origin.y, write_buffer.data(), read_buffer.data());
|
|
||||||
} else {
|
|
||||||
SwizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_in, width, bytes_per_pixel,
|
|
||||||
write_buffer.data() + dst_layer_size * dst_params.layer, read_buffer.data(),
|
|
||||||
block_height, dst_params.origin.x, dst_params.origin.y);
|
|
||||||
}
|
|
||||||
|
|
||||||
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
|
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
void MaxwellDMA::FastCopyBlockLinearToPitch() {
|
void MaxwellDMA::FastCopyBlockLinearToPitch() {
|
||||||
const u32 bytes_per_pixel =
|
const u32 bytes_per_pixel = 1U;
|
||||||
regs.launch_dma.remap_enable ? regs.pitch_out / regs.line_length_in : 1;
|
|
||||||
const size_t src_size = GOB_SIZE;
|
const size_t src_size = GOB_SIZE;
|
||||||
const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count;
|
const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count;
|
||||||
u32 pos_x = regs.src_params.origin.x;
|
u32 pos_x = regs.src_params.origin.x;
|
||||||
|
@ -239,9 +263,10 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() {
|
||||||
memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size);
|
memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, regs.src_params.width,
|
UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, regs.src_params.width,
|
||||||
bytes_per_pixel, regs.src_params.block_size.height, pos_x, pos_y,
|
regs.src_params.height, 1, pos_x, pos_y, regs.line_length_in, regs.line_count,
|
||||||
write_buffer.data(), read_buffer.data());
|
regs.src_params.block_size.height, regs.src_params.block_size.depth,
|
||||||
|
regs.pitch_out);
|
||||||
|
|
||||||
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
|
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
|
||||||
}
|
}
|
||||||
|
|
|
@ -189,10 +189,16 @@ public:
|
||||||
BitField<4, 3, Swizzle> dst_y;
|
BitField<4, 3, Swizzle> dst_y;
|
||||||
BitField<8, 3, Swizzle> dst_z;
|
BitField<8, 3, Swizzle> dst_z;
|
||||||
BitField<12, 3, Swizzle> dst_w;
|
BitField<12, 3, Swizzle> dst_w;
|
||||||
|
BitField<0, 12, u32> dst_components_raw;
|
||||||
BitField<16, 2, u32> component_size_minus_one;
|
BitField<16, 2, u32> component_size_minus_one;
|
||||||
BitField<20, 2, u32> num_src_components_minus_one;
|
BitField<20, 2, u32> num_src_components_minus_one;
|
||||||
BitField<24, 2, u32> num_dst_components_minus_one;
|
BitField<24, 2, u32> num_dst_components_minus_one;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Returns the destination swizzle selector for component index `i`.
///
/// Each selector is 3 bits wide but the fields are laid out on 4-bit boundaries
/// (dst_y at bit 4, dst_z at bit 8, dst_w at bit 12), so the index is scaled by 4, not 3.
///
/// @param i Component index; 0 through 3 are meaningful, anything larger is treated as 3.
Swizzle GetComponent(size_t i) {
    if (i >= 3) {
        // dst_components_raw only spans bits 0-11, so the fourth selector (bits 12-14) cannot be
        // extracted from it and is read through its dedicated field instead.
        return dst_w;
    }
    const u32 raw = dst_components_raw;
    return static_cast<Swizzle>((raw >> (i * 4)) & 0x7);
}
|
||||||
};
|
};
|
||||||
static_assert(sizeof(RemapConst) == 12);
|
static_assert(sizeof(RemapConst) == 12);
|
||||||
|
|
||||||
|
|
|
@ -156,8 +156,9 @@ void Vic::WriteRGBFrame(const AVFrame* frame, const VicConfig& config) {
|
||||||
const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
|
const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
|
||||||
const auto size = Texture::CalculateSize(true, 4, width, height, 1, block_height, 0);
|
const auto size = Texture::CalculateSize(true, 4, width, height, 1, block_height, 0);
|
||||||
luma_buffer.resize(size);
|
luma_buffer.resize(size);
|
||||||
Texture::SwizzleSubrect(width, height, width * 4, width, 4, luma_buffer.data(),
|
std::span<const u8> frame_buff(converted_frame_buf_addr, 4 * width * height);
|
||||||
converted_frame_buf_addr, block_height, 0, 0);
|
Texture::SwizzleSubrect(luma_buffer, frame_buff, 4, width, height, 1,
|
||||||
|
0, 0, width, height, block_height, 0, width * 4);
|
||||||
|
|
||||||
host1x.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size);
|
host1x.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -462,6 +462,97 @@ void MemoryManager::FlushRegion(GPUVAddr gpu_addr, size_t size) const {
|
||||||
MemoryOperation<true>(gpu_addr, size, mapped_big, do_nothing, flush_short_pages);
|
MemoryOperation<true>(gpu_addr, size, mapped_big, do_nothing, flush_short_pages);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Asks the rasterizer, via MustFlushRegion, about the CPU ranges backing the GPU range
/// [gpu_addr, gpu_addr + size) and returns true if any of those queries returned true.
/// NOTE(review): the callbacks return the running flag; presumably MemoryOperation uses a true
/// return to stop walking early - confirm against its definition.
bool MemoryManager::IsMemoryDirty(GPUVAddr gpu_addr, size_t size) const {
    bool is_dirty = false;

    // Folds one rasterizer query into the running flag and reports the flag back.
    const auto query_range = [&](VAddr cpu_addr, std::size_t length) {
        is_dirty |= rasterizer->MustFlushRegion(cpu_addr, length);
        return is_dirty;
    };

    // Used for the two callback slots that have nothing to query.
    auto ignore_range = [&]([[maybe_unused]] std::size_t page_index,
                            [[maybe_unused]] std::size_t offset,
                            [[maybe_unused]] std::size_t copy_amount) { return false; };

    // Small pages resolve their CPU address through page_table.
    auto on_small_page = [&](std::size_t page_index, std::size_t offset,
                             std::size_t copy_amount) {
        const VAddr page_base = static_cast<VAddr>(page_table[page_index]) << cpu_page_bits;
        return query_range(page_base + offset, copy_amount);
    };

    // Big pages resolve their CPU address through big_page_table_cpu.
    auto on_big_page = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
        const VAddr page_base = static_cast<VAddr>(big_page_table_cpu[page_index])
                                << cpu_page_bits;
        return query_range(page_base + offset, copy_amount);
    };

    // Third callback of the big-page walk: re-walk the same span at small-page granularity.
    auto descend_to_small_pages = [&](std::size_t page_index, std::size_t offset,
                                      std::size_t copy_amount) {
        const GPUVAddr span_start = (page_index << big_page_bits) + offset;
        MemoryOperation<false>(span_start, copy_amount, on_small_page, ignore_range, ignore_range);
        return is_dirty;
    };

    MemoryOperation<true>(gpu_addr, size, on_big_page, ignore_range, descend_to_small_pages);
    return is_dirty;
}
|
||||||
|
|
||||||
|
/// Measures how many bytes, starting at gpu_addr and limited by the walk over `size` bytes, map
/// to one unbroken run of CPU addresses. Counting stops at the first chunk whose CPU address
/// does not directly follow the previous chunk, or when one of the `abort_walk` callbacks fires.
/// NOTE(review): relies on MemoryOperation ending the walk when a callback returns true - confirm.
size_t MemoryManager::MaxContinousRange(GPUVAddr gpu_addr, size_t size) const {
    std::optional<VAddr> expected_next{};
    size_t contiguous_bytes = 0;
    bool stop_requested{false};

    // Appends a chunk to the run. Returns true (and leaves the run untouched) when the chunk
    // does not start where the previous one ended.
    const auto try_extend = [&](VAddr cpu_addr, std::size_t amount) {
        if (expected_next && *expected_next != cpu_addr) {
            return true;
        }
        contiguous_bytes += amount;
        expected_next = cpu_addr + amount;
        return false;
    };

    // Callback for the slots that always end the measurement.
    auto abort_walk = [&]([[maybe_unused]] std::size_t page_index,
                          [[maybe_unused]] std::size_t offset,
                          [[maybe_unused]] std::size_t copy_amount) {
        stop_requested = true;
        return true;
    };

    // Small-page chunk: a discontinuity is also latched into stop_requested so the enclosing
    // big-page callback can report it.
    auto on_small_page = [&](std::size_t page_index, std::size_t offset,
                             std::size_t copy_amount) {
        const VAddr cpu_addr =
            (static_cast<VAddr>(page_table[page_index]) << cpu_page_bits) + offset;
        const bool is_broken = try_extend(cpu_addr, copy_amount);
        if (is_broken) {
            stop_requested = true;
        }
        return is_broken;
    };

    // Big-page chunk: a discontinuity is reported directly through the return value.
    auto on_big_page = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
        const VAddr cpu_addr =
            (static_cast<VAddr>(big_page_table_cpu[page_index]) << cpu_page_bits) + offset;
        return try_extend(cpu_addr, copy_amount);
    };

    // Third callback of the big-page walk: re-walk the same span at small-page granularity.
    auto descend_to_small_pages = [&](std::size_t page_index, std::size_t offset,
                                      std::size_t copy_amount) {
        const GPUVAddr span_start = (page_index << big_page_bits) + offset;
        MemoryOperation<false>(span_start, copy_amount, on_small_page, abort_walk, abort_walk);
        return stop_requested;
    };

    MemoryOperation<true>(gpu_addr, size, on_big_page, abort_walk, descend_to_small_pages);
    return contiguous_bytes;
}
|
||||||
|
|
||||||
|
/// Forwards an invalidation of the GPU range [gpu_addr, gpu_addr + size) to the rasterizer,
/// one CPU range at a time, by translating each chunk through the page tables.
void MemoryManager::InvalidateRegion(GPUVAddr gpu_addr, size_t size) const {
    // Hands one translated CPU range to the rasterizer.
    const auto invalidate_range = [&](VAddr cpu_addr, std::size_t length) {
        rasterizer->InvalidateRegion(cpu_addr, length);
    };

    // Used for the two callback slots that have nothing to invalidate.
    auto ignore_range = [&]([[maybe_unused]] std::size_t page_index,
                            [[maybe_unused]] std::size_t offset,
                            [[maybe_unused]] std::size_t copy_amount) {};

    // Small pages resolve their CPU address through page_table.
    auto on_small_page = [&](std::size_t page_index, std::size_t offset,
                             std::size_t copy_amount) {
        const VAddr page_base = static_cast<VAddr>(page_table[page_index]) << cpu_page_bits;
        invalidate_range(page_base + offset, copy_amount);
    };

    // Big pages resolve their CPU address through big_page_table_cpu.
    auto on_big_page = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
        const VAddr page_base = static_cast<VAddr>(big_page_table_cpu[page_index])
                                << cpu_page_bits;
        invalidate_range(page_base + offset, copy_amount);
    };

    // Third callback of the big-page walk: re-walk the same span at small-page granularity.
    auto descend_to_small_pages = [&](std::size_t page_index, std::size_t offset,
                                      std::size_t copy_amount) {
        const GPUVAddr span_start = (page_index << big_page_bits) + offset;
        MemoryOperation<false>(span_start, copy_amount, on_small_page, ignore_range, ignore_range);
    };

    MemoryOperation<true>(gpu_addr, size, on_big_page, ignore_range, descend_to_small_pages);
}
|
||||||
|
|
||||||
void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size) {
|
void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size) {
|
||||||
std::vector<u8> tmp_buffer(size);
|
std::vector<u8> tmp_buffer(size);
|
||||||
ReadBlock(gpu_src_addr, tmp_buffer.data(), size);
|
ReadBlock(gpu_src_addr, tmp_buffer.data(), size);
|
||||||
|
|
|
@ -104,6 +104,12 @@ public:
|
||||||
|
|
||||||
void FlushRegion(GPUVAddr gpu_addr, size_t size) const;
|
void FlushRegion(GPUVAddr gpu_addr, size_t size) const;
|
||||||
|
|
||||||
|
void InvalidateRegion(GPUVAddr gpu_addr, size_t size) const;
|
||||||
|
|
||||||
|
bool IsMemoryDirty(GPUVAddr gpu_addr, size_t size) const;
|
||||||
|
|
||||||
|
size_t MaxContinousRange(GPUVAddr gpu_addr, size_t size) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
template <bool is_big_pages, typename FuncMapped, typename FuncReserved, typename FuncUnmapped>
|
template <bool is_big_pages, typename FuncMapped, typename FuncReserved, typename FuncUnmapped>
|
||||||
inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped,
|
inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped,
|
||||||
|
|
|
@ -129,7 +129,7 @@ public:
|
||||||
[[nodiscard]] virtual Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() = 0;
|
[[nodiscard]] virtual Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() = 0;
|
||||||
|
|
||||||
virtual void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
|
virtual void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
|
||||||
std::span<u8> memory) = 0;
|
std::span<const u8> memory) = 0;
|
||||||
|
|
||||||
/// Attempt to use a faster method to display the framebuffer to screen
|
/// Attempt to use a faster method to display the framebuffer to screen
|
||||||
[[nodiscard]] virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config,
|
[[nodiscard]] virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config,
|
||||||
|
|
|
@ -476,7 +476,7 @@ Tegra::Engines::AccelerateDMAInterface& RasterizerOpenGL::AccessAccelerateDMA()
|
||||||
}
|
}
|
||||||
|
|
||||||
void RasterizerOpenGL::AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
|
void RasterizerOpenGL::AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
|
||||||
std::span<u8> memory) {
|
std::span<const u8> memory) {
|
||||||
auto cpu_addr = gpu_memory->GpuToCpuAddress(address);
|
auto cpu_addr = gpu_memory->GpuToCpuAddress(address);
|
||||||
if (!cpu_addr) [[unlikely]] {
|
if (!cpu_addr) [[unlikely]] {
|
||||||
gpu_memory->WriteBlock(address, memory.data(), copy_size);
|
gpu_memory->WriteBlock(address, memory.data(), copy_size);
|
||||||
|
|
|
@ -99,7 +99,7 @@ public:
|
||||||
const Tegra::Engines::Fermi2D::Config& copy_config) override;
|
const Tegra::Engines::Fermi2D::Config& copy_config) override;
|
||||||
Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override;
|
Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override;
|
||||||
void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
|
void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
|
||||||
std::span<u8> memory) override;
|
std::span<const u8> memory) override;
|
||||||
bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
|
bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
|
||||||
u32 pixel_stride) override;
|
u32 pixel_stride) override;
|
||||||
void LoadDiskResources(u64 title_id, std::stop_token stop_loading,
|
void LoadDiskResources(u64 title_id, std::stop_token stop_loading,
|
||||||
|
|
|
@ -26,8 +26,6 @@
|
||||||
|
|
||||||
namespace Vulkan {
|
namespace Vulkan {
|
||||||
|
|
||||||
using Tegra::Texture::SWIZZLE_TABLE;
|
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0;
|
constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0;
|
||||||
|
|
|
@ -548,7 +548,7 @@ Tegra::Engines::AccelerateDMAInterface& RasterizerVulkan::AccessAccelerateDMA()
|
||||||
}
|
}
|
||||||
|
|
||||||
void RasterizerVulkan::AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
|
void RasterizerVulkan::AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
|
||||||
std::span<u8> memory) {
|
std::span<const u8> memory) {
|
||||||
auto cpu_addr = gpu_memory->GpuToCpuAddress(address);
|
auto cpu_addr = gpu_memory->GpuToCpuAddress(address);
|
||||||
if (!cpu_addr) [[unlikely]] {
|
if (!cpu_addr) [[unlikely]] {
|
||||||
gpu_memory->WriteBlock(address, memory.data(), copy_size);
|
gpu_memory->WriteBlock(address, memory.data(), copy_size);
|
||||||
|
|
|
@ -95,7 +95,7 @@ public:
|
||||||
const Tegra::Engines::Fermi2D::Config& copy_config) override;
|
const Tegra::Engines::Fermi2D::Config& copy_config) override;
|
||||||
Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override;
|
Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override;
|
||||||
void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
|
void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
|
||||||
std::span<u8> memory) override;
|
std::span<const u8> memory) override;
|
||||||
bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
|
bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
|
||||||
u32 pixel_stride) override;
|
u32 pixel_stride) override;
|
||||||
void LoadDiskResources(u64 title_id, std::stop_token stop_loading,
|
void LoadDiskResources(u64 title_id, std::stop_token stop_loading,
|
||||||
|
|
|
@ -517,7 +517,6 @@ void SwizzleBlockLinearImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr
|
||||||
const u32 host_bytes_per_layer = num_blocks_per_layer * bytes_per_block;
|
const u32 host_bytes_per_layer = num_blocks_per_layer * bytes_per_block;
|
||||||
|
|
||||||
UNIMPLEMENTED_IF(info.tile_width_spacing > 0);
|
UNIMPLEMENTED_IF(info.tile_width_spacing > 0);
|
||||||
|
|
||||||
UNIMPLEMENTED_IF(copy.image_offset.x != 0);
|
UNIMPLEMENTED_IF(copy.image_offset.x != 0);
|
||||||
UNIMPLEMENTED_IF(copy.image_offset.y != 0);
|
UNIMPLEMENTED_IF(copy.image_offset.y != 0);
|
||||||
UNIMPLEMENTED_IF(copy.image_offset.z != 0);
|
UNIMPLEMENTED_IF(copy.image_offset.z != 0);
|
||||||
|
|
|
@ -89,6 +89,69 @@ void SwizzleImpl(std::span<u8> output, std::span<const u8> input, u32 width, u32
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Copies a rectangle of pixels between a pitch-linear buffer and a GOB-tiled (block linear)
/// buffer, one pixel (BYTES_PER_PIXEL bytes) at a time.
///
/// @tparam TO_LINEAR When true, `output` is indexed with the swizzled (tiled) offset and `input`
///                   with the linear offset; when false the roles are reversed.
///                   NOTE(review): despite the name, the `true` instantiation is the one the
///                   SwizzleSubrect wrapper uses (linear -> tiled); `false` is used by
///                   UnswizzleSubrect.
/// @tparam BYTES_PER_PIXEL Size in bytes of one pixel; also the size of each memcpy.
/// @param output       Destination bytes.
/// @param input        Source bytes.
/// @param width        Width in pixels of the tiled surface (used to derive its row stride).
/// @param height       Height in lines of the tiled surface (used for the slice size and for the
///                     linear slice stride `pitch * height`).
/// @param depth        Number of slices iterated.
/// @param origin_x     Pixel column inside the tiled surface where the rectangle starts.
/// @param origin_y     Line inside the tiled surface where the rectangle starts.
/// @param extent_x     Number of pixels copied per line.
/// @param num_lines    Total number of lines to copy, summed over all slices.
/// @param block_height Used as a shift amount (log2 of the block height in GOBs).
/// @param block_depth  Used as a shift amount (log2 of the block depth).
/// @param pitch_linear Bytes between consecutive lines of the linear buffer.
template <bool TO_LINEAR, u32 BYTES_PER_PIXEL>
|
||||||
|
void SwizzleSubrectImpl(std::span<u8> output, std::span<const u8> input, u32 width, u32 height,
|
||||||
|
u32 depth, u32 origin_x, u32 origin_y, u32 extent_x, u32 num_lines,
|
||||||
|
u32 block_height, u32 block_depth, u32 pitch_linear) {
|
||||||
|
// The origin of the transformation can be configured here, leave it as zero as the current API
|
||||||
|
// doesn't expose it.
|
||||||
|
static constexpr u32 origin_z = 0;
|
||||||
|
|
||||||
|
// We can configure here a custom pitch
|
||||||
|
// As it's not exposed 'width * BYTES_PER_PIXEL' will be the expected pitch.
|
||||||
|
// NOTE(review): the two comment lines above no longer match the code; the pitch is taken from
// the pitch_linear argument rather than derived from width.
const u32 pitch = pitch_linear;
|
||||||
|
// Byte size of one tiled row; presumably rounded up to a multiple of (1 << GOB_SIZE_X_SHIFT).
const u32 stride = Common::AlignUpLog2(width * BYTES_PER_PIXEL, GOB_SIZE_X_SHIFT);
|
||||||
|
|
||||||
|
const u32 gobs_in_x = Common::DivCeilLog2(stride, GOB_SIZE_X_SHIFT);
|
||||||
|
// Bytes covered by one row of blocks: GOBs per row times the block height/depth factors.
const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth);
|
||||||
|
// Bytes covered by one full layer of blocks in the tiled buffer.
const u32 slice_size =
|
||||||
|
Common::DivCeilLog2(height, block_height + GOB_SIZE_Y_SHIFT) * block_size;
|
||||||
|
|
||||||
|
const u32 block_height_mask = (1U << block_height) - 1;
|
||||||
|
const u32 block_depth_mask = (1U << block_depth) - 1;
|
||||||
|
const u32 x_shift = GOB_SIZE_SHIFT + block_height + block_depth;
|
||||||
|
|
||||||
|
u32 unprocessed_lines = num_lines;
|
||||||
|
// Lines copied per slice are capped at the lines remaining from origin_y to height.
// NOTE(review): height - origin_y is unsigned and wraps if origin_y > height - confirm callers
// never pass such an origin.
u32 extent_y = std::min(num_lines, height - origin_y);
|
||||||
|
|
||||||
|
for (u32 slice = 0; slice < depth; ++slice) {
|
||||||
|
const u32 z = slice + origin_z;
|
||||||
|
// Tiled offset of this slice: whole layers of blocks plus the slice's position in its block.
const u32 offset_z = (z >> block_depth) * slice_size +
|
||||||
|
((z & block_depth_mask) << (GOB_SIZE_SHIFT + block_height));
|
||||||
|
const u32 lines_in_y = std::min(unprocessed_lines, extent_y);
|
||||||
|
for (u32 line = 0; line < lines_in_y; ++line) {
|
||||||
|
const u32 y = line + origin_y;
|
||||||
|
const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(y);
|
||||||
|
|
||||||
|
const u32 block_y = y >> GOB_SIZE_Y_SHIFT;
|
||||||
|
// Tiled offset of the GOB row holding this line.
const u32 offset_y = (block_y >> block_height) * block_size +
|
||||||
|
((block_y & block_height_mask) << GOB_SIZE_SHIFT);
|
||||||
|
|
||||||
|
// swizzled_x starts at the origin column and is stepped together with `column` below.
u32 swizzled_x = pdep<SWIZZLE_X_BITS>(origin_x * BYTES_PER_PIXEL);
|
||||||
|
for (u32 column = 0; column < extent_x;
|
||||||
|
++column, incrpdep<SWIZZLE_X_BITS, BYTES_PER_PIXEL>(swizzled_x)) {
|
||||||
|
const u32 x = (column + origin_x) * BYTES_PER_PIXEL;
|
||||||
|
const u32 offset_x = (x >> GOB_SIZE_X_SHIFT) << x_shift;
|
||||||
|
|
||||||
|
const u32 base_swizzled_offset = offset_z + offset_y + offset_x;
|
||||||
|
const u32 swizzled_offset = base_swizzled_offset + (swizzled_x | swizzled_y);
|
||||||
|
|
||||||
|
// The linear buffer is addressed relative to the rectangle: slices are pitch * height
// bytes apart and lines are pitch bytes apart.
const u32 unswizzled_offset =
|
||||||
|
slice * pitch * height + line * pitch + column * BYTES_PER_PIXEL;
|
||||||
|
|
||||||
|
u8* const dst = &output[TO_LINEAR ? swizzled_offset : unswizzled_offset];
|
||||||
|
const u8* const src = &input[TO_LINEAR ? unswizzled_offset : swizzled_offset];
|
||||||
|
|
||||||
|
std::memcpy(dst, src, BYTES_PER_PIXEL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Stop as soon as the requested number of lines has been copied, even if slices remain.
unprocessed_lines -= lines_in_y;
|
||||||
|
if (unprocessed_lines == 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template <bool TO_LINEAR>
|
template <bool TO_LINEAR>
|
||||||
void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
|
void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
|
||||||
u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) {
|
u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) {
|
||||||
|
@ -111,97 +174,6 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Copies a subrect_width x subrect_height rectangle of pixels from a linear buffer into a
/// GOB-tiled buffer, BYTES_PER_PIXEL bytes at a time.
///
/// @param subrect_width    Number of pixels copied per line.
/// @param subrect_height   Number of lines copied.
/// @param source_pitch     Bytes between consecutive lines of `unswizzled_data`.
/// @param swizzled_width   Width in pixels of the tiled surface (used to count GOBs per row).
/// @param swizzled_data    Destination (tiled) bytes.
/// @param unswizzled_data  Source (linear) bytes.
/// @param block_height_bit log2 of the block height; the height itself is 1 << block_height_bit.
/// @param offset_x         Pixel column in the tiled surface where the rectangle is written.
/// @param offset_y         Line in the tiled surface where the rectangle is written.
template <u32 BYTES_PER_PIXEL>
|
|
||||||
void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
|
|
||||||
u8* swizzled_data, const u8* unswizzled_data, u32 block_height_bit,
|
|
||||||
u32 offset_x, u32 offset_y) {
|
|
||||||
const u32 block_height = 1U << block_height_bit;
|
|
||||||
// Number of GOBs needed to cover one row of the tiled surface (rounded up).
const u32 image_width_in_gobs =
|
|
||||||
(swizzled_width * BYTES_PER_PIXEL + (GOB_SIZE_X - 1)) / GOB_SIZE_X;
|
|
||||||
for (u32 line = 0; line < subrect_height; ++line) {
|
|
||||||
const u32 dst_y = line + offset_y;
|
|
||||||
// Byte offset of the GOB row containing dst_y: whole rows of blocks first, then the GOB
// inside the block.
const u32 gob_address_y =
|
|
||||||
(dst_y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs +
|
|
||||||
((dst_y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE;
|
|
||||||
|
|
||||||
const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(dst_y);
|
|
||||||
// swizzled_x starts at the destination column and is stepped together with x below.
u32 swizzled_x = pdep<SWIZZLE_X_BITS>(offset_x * BYTES_PER_PIXEL);
|
|
||||||
for (u32 x = 0; x < subrect_width;
|
|
||||||
++x, incrpdep<SWIZZLE_X_BITS, BYTES_PER_PIXEL>(swizzled_x)) {
|
|
||||||
const u32 dst_x = x + offset_x;
|
|
||||||
const u32 gob_address =
|
|
||||||
gob_address_y + (dst_x * BYTES_PER_PIXEL / GOB_SIZE_X) * GOB_SIZE * block_height;
|
|
||||||
const u32 swizzled_offset = gob_address + (swizzled_x | swizzled_y);
|
|
||||||
// The linear source is addressed relative to the rectangle, not to the tiled surface.
const u32 unswizzled_offset = line * source_pitch + x * BYTES_PER_PIXEL;
|
|
||||||
|
|
||||||
const u8* const source_line = unswizzled_data + unswizzled_offset;
|
|
||||||
u8* const dest_addr = swizzled_data + swizzled_offset;
|
|
||||||
std::memcpy(dest_addr, source_line, BYTES_PER_PIXEL);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Copies a rectangle of pixels out of a GOB-tiled buffer into a linear buffer,
/// BYTES_PER_PIXEL bytes at a time.
///
/// @param line_length_in Number of pixels copied per line.
/// @param line_count     Number of lines copied.
/// @param pitch          Bytes between consecutive lines of `output`.
/// @param width          Width in pixels of the tiled surface.
/// @param block_height   Used as a shift amount (log2 of the block height).
/// @param origin_x       Pixel column in the tiled surface where reading starts.
/// @param origin_y       Line in the tiled surface where reading starts.
/// @param output         Destination (linear) bytes.
/// @param input          Source (tiled) bytes.
template <u32 BYTES_PER_PIXEL>
|
|
||||||
void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 block_height,
|
|
||||||
u32 origin_x, u32 origin_y, u8* output, const u8* input) {
|
|
||||||
const u32 stride = width * BYTES_PER_PIXEL;
|
|
||||||
const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) / GOB_SIZE_X;
|
|
||||||
// Bytes covered by one row of blocks in the tiled buffer.
const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height);
|
|
||||||
|
|
||||||
const u32 block_height_mask = (1U << block_height) - 1;
|
|
||||||
const u32 x_shift = GOB_SIZE_SHIFT + block_height;
|
|
||||||
|
|
||||||
for (u32 line = 0; line < line_count; ++line) {
|
|
||||||
const u32 src_y = line + origin_y;
|
|
||||||
const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(src_y);
|
|
||||||
|
|
||||||
const u32 block_y = src_y >> GOB_SIZE_Y_SHIFT;
|
|
||||||
// Tiled offset of the GOB row holding this line.
const u32 src_offset_y = (block_y >> block_height) * block_size +
|
|
||||||
((block_y & block_height_mask) << GOB_SIZE_SHIFT);
|
|
||||||
|
|
||||||
// swizzled_x starts at the origin column and is stepped together with `column` below.
u32 swizzled_x = pdep<SWIZZLE_X_BITS>(origin_x * BYTES_PER_PIXEL);
|
|
||||||
for (u32 column = 0; column < line_length_in;
|
|
||||||
++column, incrpdep<SWIZZLE_X_BITS, BYTES_PER_PIXEL>(swizzled_x)) {
|
|
||||||
const u32 src_x = (column + origin_x) * BYTES_PER_PIXEL;
|
|
||||||
const u32 src_offset_x = (src_x >> GOB_SIZE_X_SHIFT) << x_shift;
|
|
||||||
|
|
||||||
const u32 swizzled_offset = src_offset_y + src_offset_x + (swizzled_x | swizzled_y);
|
|
||||||
// The linear destination is addressed relative to the rectangle.
const u32 unswizzled_offset = line * pitch + column * BYTES_PER_PIXEL;
|
|
||||||
|
|
||||||
std::memcpy(output + unswizzled_offset, input + swizzled_offset, BYTES_PER_PIXEL);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Writes a linear 2D array of pixels into a GOB-tiled buffer whose block layout includes a
/// depth factor (block_depth is part of the block size and X shift).
///
/// Non-zero origins are reported through UNIMPLEMENTED_IF and otherwise ignored: the loops below
/// always start at line 0 and column 0.
///
/// @param line_length_in Number of pixels copied per line.
/// @param line_count     Number of lines copied.
/// @param pitch          Bytes between consecutive lines of `input`.
/// @param width          Width in pixels of the tiled surface.
/// @param height         Not referenced by this implementation.
/// @param block_height   Used as a shift amount (log2 of the block height).
/// @param block_depth    Used as a shift amount (log2 of the block depth).
/// @param origin_x       Must be 0 (see above).
/// @param origin_y       Must be 0 (see above).
/// @param output         Destination (tiled) bytes.
/// @param input          Source (linear) bytes.
template <u32 BYTES_PER_PIXEL>
|
|
||||||
void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height,
|
|
||||||
u32 block_height, u32 block_depth, u32 origin_x, u32 origin_y, u8* output,
|
|
||||||
const u8* input) {
|
|
||||||
UNIMPLEMENTED_IF(origin_x > 0);
|
|
||||||
UNIMPLEMENTED_IF(origin_y > 0);
|
|
||||||
|
|
||||||
const u32 stride = width * BYTES_PER_PIXEL;
|
|
||||||
const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) / GOB_SIZE_X;
|
|
||||||
const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth);
|
|
||||||
|
|
||||||
const u32 block_height_mask = (1U << block_height) - 1;
|
|
||||||
const u32 x_shift = static_cast<u32>(GOB_SIZE_SHIFT) + block_height + block_depth;
|
|
||||||
|
|
||||||
for (u32 line = 0; line < line_count; ++line) {
|
|
||||||
const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(line);
|
|
||||||
const u32 block_y = line / GOB_SIZE_Y;
|
|
||||||
// Tiled offset of the GOB row holding this line.
const u32 dst_offset_y =
|
|
||||||
(block_y >> block_height) * block_size + (block_y & block_height_mask) * GOB_SIZE;
|
|
||||||
|
|
||||||
u32 swizzled_x = 0;
|
|
||||||
// NOTE(review): the X terms here advance per pixel (incrpdep<..., 1> and x / GOB_SIZE_X)
// while BYTES_PER_PIXEL bytes are copied each iteration; the other subrect helpers in this
// file step by BYTES_PER_PIXEL - confirm this is intended for BYTES_PER_PIXEL > 1.
for (u32 x = 0; x < line_length_in; ++x, incrpdep<SWIZZLE_X_BITS, 1>(swizzled_x)) {
|
|
||||||
const u32 dst_offset =
|
|
||||||
((x / GOB_SIZE_X) << x_shift) + dst_offset_y + (swizzled_x | swizzled_y);
|
|
||||||
const u32 src_offset = x * BYTES_PER_PIXEL + line * pitch;
|
|
||||||
std::memcpy(output + dst_offset, input + src_offset, BYTES_PER_PIXEL);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} // Anonymous namespace
|
} // Anonymous namespace
|
||||||
|
|
||||||
void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
|
void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
|
||||||
|
@ -218,15 +190,15 @@ void SwizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_p
|
||||||
stride_alignment);
|
stride_alignment);
|
||||||
}
|
}
|
||||||
|
|
||||||
void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
|
void SwizzleSubrect(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
|
||||||
u32 bytes_per_pixel, u8* swizzled_data, const u8* unswizzled_data,
|
u32 height, u32 depth, u32 origin_x, u32 origin_y, u32 extent_x, u32 extent_y,
|
||||||
u32 block_height_bit, u32 offset_x, u32 offset_y) {
|
u32 block_height, u32 block_depth, u32 pitch_linear) {
|
||||||
switch (bytes_per_pixel) {
|
switch (bytes_per_pixel) {
|
||||||
#define BPP_CASE(x) \
|
#define BPP_CASE(x) \
|
||||||
case x: \
|
case x: \
|
||||||
return SwizzleSubrect<x>(subrect_width, subrect_height, source_pitch, swizzled_width, \
|
return SwizzleSubrectImpl<true, x>(output, input, width, height, depth, origin_x, \
|
||||||
swizzled_data, unswizzled_data, block_height_bit, offset_x, \
|
origin_y, extent_x, extent_y, block_height, \
|
||||||
offset_y);
|
block_depth, pitch_linear);
|
||||||
BPP_CASE(1)
|
BPP_CASE(1)
|
||||||
BPP_CASE(2)
|
BPP_CASE(2)
|
||||||
BPP_CASE(3)
|
BPP_CASE(3)
|
||||||
|
@ -241,13 +213,15 @@ void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 bytes_per_pixel,
|
void UnswizzleSubrect(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
|
||||||
u32 block_height, u32 origin_x, u32 origin_y, u8* output, const u8* input) {
|
u32 width, u32 height, u32 depth, u32 origin_x, u32 origin_y, u32 extent_x,
|
||||||
|
u32 extent_y, u32 block_height, u32 block_depth, u32 pitch_linear) {
|
||||||
switch (bytes_per_pixel) {
|
switch (bytes_per_pixel) {
|
||||||
#define BPP_CASE(x) \
|
#define BPP_CASE(x) \
|
||||||
case x: \
|
case x: \
|
||||||
return UnswizzleSubrect<x>(line_length_in, line_count, pitch, width, block_height, \
|
return SwizzleSubrectImpl<false, x>(output, input, width, height, depth, origin_x, \
|
||||||
origin_x, origin_y, output, input);
|
origin_y, extent_x, extent_y, block_height, \
|
||||||
|
block_depth, pitch_linear);
|
||||||
BPP_CASE(1)
|
BPP_CASE(1)
|
||||||
BPP_CASE(2)
|
BPP_CASE(2)
|
||||||
BPP_CASE(3)
|
BPP_CASE(3)
|
||||||
|
@ -262,55 +236,6 @@ void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Runtime dispatcher for the templated SwizzleSliceToVoxel: selects the instantiation matching
/// bytes_per_pixel and forwards every other argument unchanged.
///
/// Handled pixel sizes are 1, 2, 3, 4, 6, 8, 12 and 16 bytes; any other value reaches the default
/// label, which raises ASSERT_MSG and performs no copy.
void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height,
|
|
||||||
u32 bytes_per_pixel, u32 block_height, u32 block_depth, u32 origin_x,
|
|
||||||
u32 origin_y, u8* output, const u8* input) {
|
|
||||||
switch (bytes_per_pixel) {
|
|
||||||
// BPP_CASE(x) expands to a `case x:` label that returns the BYTES_PER_PIXEL == x instantiation.
#define BPP_CASE(x) \
|
|
||||||
case x: \
|
|
||||||
return SwizzleSliceToVoxel<x>(line_length_in, line_count, pitch, width, height, \
|
|
||||||
block_height, block_depth, origin_x, origin_y, output, \
|
|
||||||
input);
|
|
||||||
BPP_CASE(1)
|
|
||||||
BPP_CASE(2)
|
|
||||||
BPP_CASE(3)
|
|
||||||
BPP_CASE(4)
|
|
||||||
BPP_CASE(6)
|
|
||||||
BPP_CASE(8)
|
|
||||||
BPP_CASE(12)
|
|
||||||
BPP_CASE(16)
|
|
||||||
#undef BPP_CASE
|
|
||||||
default:
|
|
||||||
ASSERT_MSG(false, "Invalid bytes_per_pixel={}", bytes_per_pixel);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Writes up to copy_size bytes from a sequential source buffer into a GOB-tiled destination,
/// one byte per step.
///
/// Lines run from dst_y up to height and columns from dst_x up to width; every line restarts at
/// dst_x. The copy stops as soon as copy_size bytes have been written or the bounds are reached.
///
/// @param width            Row width; each x step copies one byte, so it is treated as bytes here.
/// @param height           Exclusive upper bound for the line index.
/// @param dst_x            First column written on every line.
/// @param dst_y            First line written.
/// @param block_height_bit log2 of the block height; the height itself is 1 << block_height_bit.
/// @param copy_size        Maximum number of bytes to copy.
/// @param source_data      Source bytes, consumed sequentially.
/// @param swizzle_data     Destination (tiled) bytes.
void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y,
|
|
||||||
const u32 block_height_bit, const std::size_t copy_size, const u8* source_data,
|
|
||||||
u8* swizzle_data) {
|
|
||||||
const u32 block_height = 1U << block_height_bit;
|
|
||||||
const u32 image_width_in_gobs{(width + GOB_SIZE_X - 1) / GOB_SIZE_X};
|
|
||||||
// Number of bytes copied so far; also the read position in source_data.
std::size_t count = 0;
|
|
||||||
for (std::size_t y = dst_y; y < height && count < copy_size; ++y) {
|
|
||||||
// Byte offset of the GOB row containing y: whole rows of blocks first, then the GOB inside
// the block.
const std::size_t gob_address_y =
|
|
||||||
(y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs +
|
|
||||||
((y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE;
|
|
||||||
const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(static_cast<u32>(y));
|
|
||||||
u32 swizzled_x = pdep<SWIZZLE_X_BITS>(dst_x);
|
|
||||||
for (std::size_t x = dst_x; x < width && count < copy_size;
|
|
||||||
++x, incrpdep<SWIZZLE_X_BITS, 1>(swizzled_x)) {
|
|
||||||
const std::size_t gob_address =
|
|
||||||
gob_address_y + (x / GOB_SIZE_X) * GOB_SIZE * block_height;
|
|
||||||
const std::size_t swizzled_offset = gob_address + (swizzled_x | swizzled_y);
|
|
||||||
const u8* source_line = source_data + count;
|
|
||||||
u8* dest_addr = swizzle_data + swizzled_offset;
|
|
||||||
count++;
|
|
||||||
|
|
||||||
*dest_addr = *source_line;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height, u32 depth,
|
std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height, u32 depth,
|
||||||
u32 block_height, u32 block_depth) {
|
u32 block_height, u32 block_depth) {
|
||||||
if (tiled) {
|
if (tiled) {
|
||||||
|
|
|
@ -40,7 +40,6 @@ constexpr SwizzleTable MakeSwizzleTable() {
|
||||||
}
|
}
|
||||||
return table;
|
return table;
|
||||||
}
|
}
|
||||||
constexpr SwizzleTable SWIZZLE_TABLE = MakeSwizzleTable();
|
|
||||||
|
|
||||||
/// Unswizzles a block linear texture into linear memory.
|
/// Unswizzles a block linear texture into linear memory.
|
||||||
void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
|
void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
|
||||||
|
@ -57,34 +56,14 @@ std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height
|
||||||
u32 block_height, u32 block_depth);
|
u32 block_height, u32 block_depth);
|
||||||
|
|
||||||
/// Copies an untiled subrectangle into a tiled surface.
|
/// Copies an untiled subrectangle into a tiled surface.
|
||||||
void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
|
void SwizzleSubrect(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
|
||||||
u32 bytes_per_pixel, u8* swizzled_data, const u8* unswizzled_data,
|
u32 height, u32 depth, u32 origin_x, u32 origin_y, u32 extent_x, u32 extent_y,
|
||||||
u32 block_height_bit, u32 offset_x, u32 offset_y);
|
u32 block_height, u32 block_depth, u32 pitch_linear);
|
||||||
|
|
||||||
/// Copies a tiled subrectangle into a linear surface.
|
/// Copies a tiled subrectangle into a linear surface.
|
||||||
void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 bytes_per_pixel,
|
void UnswizzleSubrect(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
|
||||||
u32 block_height, u32 origin_x, u32 origin_y, u8* output, const u8* input);
|
u32 width, u32 height, u32 depth, u32 origin_x, u32 origin_y, u32 extent_x,
|
||||||
|
u32 extent_y, u32 block_height, u32 block_depth, u32 pitch_linear);
|
||||||
/// @brief Swizzles a 2D array of pixels into a 3D texture
|
|
||||||
/// @param line_length_in Number of pixels per line
|
|
||||||
/// @param line_count Number of lines
|
|
||||||
/// @param pitch Number of bytes per line
|
|
||||||
/// @param width Width of the swizzled texture
|
|
||||||
/// @param height Height of the swizzled texture
|
|
||||||
/// @param bytes_per_pixel Number of bytes used per pixel
|
|
||||||
/// @param block_height Block height shift
|
|
||||||
/// @param block_depth Block depth shift
|
|
||||||
/// @param origin_x Column offset in pixels of the swizzled texture
|
|
||||||
/// @param origin_y Row offset in pixels of the swizzled texture
|
|
||||||
/// @param output Pointer to the pixels of the swizzled texture
|
|
||||||
/// @param input Pointer to the 2D array of pixels used as input
|
|
||||||
/// @pre input and output points to an array large enough to hold the number of bytes used
|
|
||||||
void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height,
|
|
||||||
u32 bytes_per_pixel, u32 block_height, u32 block_depth, u32 origin_x,
|
|
||||||
u32 origin_y, u8* output, const u8* input);
|
|
||||||
|
|
||||||
void SwizzleKepler(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height,
|
|
||||||
std::size_t copy_size, const u8* source_data, u8* swizzle_data);
|
|
||||||
|
|
||||||
/// Obtains the offset of the gob for positions 'dst_x' & 'dst_y'
|
/// Obtains the offset of the gob for positions 'dst_x' & 'dst_y'
|
||||||
u64 GetGOBOffset(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height,
|
u64 GetGOBOffset(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height,
|
||||||
|
|
Loading…
Reference in New Issue