diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index a0a9662e7..2c098860d 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -51,6 +51,8 @@ add_library(core
gpu_hw.h
gpu_hw_shadergen.cpp
gpu_hw_shadergen.h
+ gpu_hw_texture_cache.cpp
+ gpu_hw_texture_cache.h
gpu_shadergen.cpp
gpu_shadergen.h
gpu_sw.cpp
diff --git a/src/core/core.vcxproj b/src/core/core.vcxproj
index 2144c2873..7cf309b9d 100644
--- a/src/core/core.vcxproj
+++ b/src/core/core.vcxproj
@@ -47,6 +47,7 @@
+
@@ -131,6 +132,7 @@
+
diff --git a/src/core/core.vcxproj.filters b/src/core/core.vcxproj.filters
index f623ed9f2..ffa65a8a7 100644
--- a/src/core/core.vcxproj.filters
+++ b/src/core/core.vcxproj.filters
@@ -69,6 +69,7 @@
+
@@ -143,6 +144,7 @@
+
diff --git a/src/core/gpu.cpp b/src/core/gpu.cpp
index bc0743296..9c8a4f908 100644
--- a/src/core/gpu.cpp
+++ b/src/core/gpu.cpp
@@ -1719,8 +1719,8 @@ void GPU::SetDrawMode(u16 value)
if (new_mode_reg.bits == m_draw_mode.mode_reg.bits)
return;
- m_draw_mode.texture_page_changed |= ((new_mode_reg.bits & GPUDrawModeReg::TEXTURE_PAGE_MASK) !=
- (m_draw_mode.mode_reg.bits & GPUDrawModeReg::TEXTURE_PAGE_MASK));
+ m_draw_mode.texture_page_changed |= ((new_mode_reg.bits & GPUDrawModeReg::TEXTURE_MODE_AND_PAGE_MASK) !=
+ (m_draw_mode.mode_reg.bits & GPUDrawModeReg::TEXTURE_MODE_AND_PAGE_MASK));
m_draw_mode.mode_reg.bits = new_mode_reg.bits;
if (m_GPUSTAT.draw_to_displayed_field != new_mode_reg.draw_to_displayed_field)
diff --git a/src/core/gpu_commands.cpp b/src/core/gpu_commands.cpp
index 6bc3effee..ad12627c3 100644
--- a/src/core/gpu_commands.cpp
+++ b/src/core/gpu_commands.cpp
@@ -532,7 +532,7 @@ void GPU::FinishVRAMWrite()
m_vram_transfer.height, sizeof(u16) * m_vram_transfer.width, m_blit_buffer.data(), true);
}
- if (g_settings.texture_replacements.ShouldDumpVRAMWrite(m_vram_transfer.width, m_vram_transfer.height))
+ if (TextureReplacements::ShouldDumpVRAMWrite(m_vram_transfer.width, m_vram_transfer.height))
{
TextureReplacements::DumpVRAMWrite(m_vram_transfer.width, m_vram_transfer.height,
reinterpret_cast(m_blit_buffer.data()));
diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp
index 92b4a3ba3..0a39db2b1 100644
--- a/src/core/gpu_hw.cpp
+++ b/src/core/gpu_hw.cpp
@@ -193,6 +193,8 @@ GPU_HW::GPU_HW() : GPU()
GPU_HW::~GPU_HW()
{
+ GPUTextureCache::Shutdown();
+
if (m_sw_renderer)
{
m_sw_renderer->Shutdown();
@@ -261,6 +263,8 @@ bool GPU_HW::Initialize()
m_clamp_uvs = ShouldClampUVs(m_texture_filtering) || ShouldClampUVs(m_sprite_texture_filtering);
m_compute_uv_range = m_clamp_uvs;
m_allow_sprite_mode = ShouldAllowSpriteMode(m_resolution_scale, m_texture_filtering, m_sprite_texture_filtering);
+ m_use_texture_cache = g_settings.gpu_texture_cache;
+ m_texture_dumping = m_use_texture_cache && g_settings.texture_replacements.dump_textures;
CheckSettings();
@@ -281,13 +285,27 @@ bool GPU_HW::Initialize()
return false;
}
+ if (m_use_texture_cache)
+ {
+ if (!GPUTextureCache::Initialize())
+ {
+ ERROR_LOG("Failed to initialize texture cache, disabling.");
+ m_use_texture_cache = false;
+ }
+ }
+
UpdateDownsamplingLevels();
+
RestoreDeviceContext();
return true;
}
void GPU_HW::Reset(bool clear_vram)
{
+ // Texture cache needs to be invalidated before we load, otherwise we dump black.
+ if (m_use_texture_cache)
+ GPUTextureCache::Invalidate();
+
if (m_batch_vertex_ptr)
UnmapGPUBuffer(0, 0);
@@ -365,6 +383,7 @@ bool GPU_HW::DoState(StateWrapper& sw, GPUTexture** host_texture, bool update_di
else if (sw.IsReading())
{
// Need to update the VRAM copy on the GPU with the state data.
+ // Would invalidate the TC, but base DoState() calls Reset().
UpdateVRAMOnGPU(0, 0, VRAM_WIDTH, VRAM_HEIGHT, g_vram, VRAM_WIDTH * sizeof(u16), false, false, VRAM_SIZE_RECT);
}
@@ -374,10 +393,12 @@ bool GPU_HW::DoState(StateWrapper& sw, GPUTexture** host_texture, bool update_di
DebugAssert(!m_batch_vertex_ptr && !m_batch_index_ptr);
ClearVRAMDirtyRectangle();
SetFullVRAMDirtyRectangle();
+ UpdateVRAMReadTexture(true, false);
+ ClearVRAMDirtyRectangle();
ResetBatchVertexDepth();
}
- return true;
+ return GPUTextureCache::DoState(sw, !m_use_texture_cache);
}
void GPU_HW::RestoreDeviceContext()
@@ -471,6 +492,8 @@ void GPU_HW::UpdateSettings(const Settings& old_settings)
m_clamp_uvs = clamp_uvs;
m_compute_uv_range = m_clamp_uvs;
m_allow_sprite_mode = ShouldAllowSpriteMode(resolution_scale, m_texture_filtering, m_sprite_texture_filtering);
+ m_use_texture_cache = g_settings.gpu_texture_cache;
+ m_texture_dumping = m_use_texture_cache && g_settings.texture_replacements.dump_textures;
m_batch.sprite_mode = (m_allow_sprite_mode && m_batch.sprite_mode);
const bool depth_buffer_changed = (m_pgxp_depth_buffer != g_settings.UsingPGXPDepthBuffer());
@@ -524,6 +547,23 @@ void GPU_HW::UpdateSettings(const Settings& old_settings)
UpdateDepthBufferFromMaskBit();
}
+ if (m_use_texture_cache && !old_settings.gpu_texture_cache)
+ {
+ if (!GPUTextureCache::Initialize())
+ {
+ ERROR_LOG("Failed to initialize texture cache, disabling.");
+ m_use_texture_cache = false;
+ }
+ }
+ else if (!m_use_texture_cache && old_settings.gpu_texture_cache)
+ {
+ GPUTextureCache::Shutdown();
+ }
+ else if (m_use_texture_cache)
+ {
+ GPUTextureCache::UpdateSettings(old_settings);
+ }
+
if (g_settings.gpu_downsample_mode != old_settings.gpu_downsample_mode ||
(g_settings.gpu_downsample_mode == GPUDownsampleMode::Box &&
g_settings.gpu_downsample_scale != old_settings.gpu_downsample_scale))
@@ -717,6 +757,9 @@ void GPU_HW::AddWrittenRectangle(const GSVector4i rect)
{
m_vram_dirty_write_rect = m_vram_dirty_write_rect.runion(rect);
SetTexPageChangedOnOverlap(m_vram_dirty_write_rect);
+
+ if (m_use_texture_cache)
+ GPUTextureCache::AddWrittenRectangle(rect);
}
void GPU_HW::AddDrawnRectangle(const GSVector4i rect)
@@ -724,13 +767,22 @@ void GPU_HW::AddDrawnRectangle(const GSVector4i rect)
// Normally, we would check for overlap here. But the GPU's texture cache won't actually reload until the page
// changes, or it samples a larger region, so we can get away without doing so. This reduces copies considerably in
// games like Mega Man Legends 2.
- m_vram_dirty_draw_rect = m_vram_dirty_draw_rect.runion(rect);
+ if (m_current_draw_rect.rcontains(rect))
+ return;
+
+ m_current_draw_rect = m_current_draw_rect.runion(rect);
+ m_vram_dirty_draw_rect = m_vram_dirty_draw_rect.runion(m_current_draw_rect);
+
+ if (m_use_texture_cache)
+ GPUTextureCache::AddDrawnRectangle(m_current_draw_rect);
}
void GPU_HW::AddUnclampedDrawnRectangle(const GSVector4i rect)
{
m_vram_dirty_draw_rect = m_vram_dirty_draw_rect.runion(rect);
SetTexPageChangedOnOverlap(m_vram_dirty_draw_rect);
+ if (m_use_texture_cache)
+ GPUTextureCache::AddDrawnRectangle(rect);
}
void GPU_HW::SetTexPageChangedOnOverlap(const GSVector4i update_rect)
@@ -738,9 +790,9 @@ void GPU_HW::SetTexPageChangedOnOverlap(const GSVector4i update_rect)
// the vram area can include the texture page, but the game can leave it as-is. in this case, set it as dirty so the
// shadow texture is updated
if (!m_draw_mode.IsTexturePageChanged() && m_batch.texture_mode != BatchTextureMode::Disabled &&
- (m_draw_mode.mode_reg.GetTexturePageRectangle().rintersects(update_rect) ||
+ (GetTextureRect(m_draw_mode.mode_reg.texture_page, m_draw_mode.mode_reg.texture_mode).rintersects(update_rect) ||
(m_draw_mode.mode_reg.IsUsingPalette() &&
- m_draw_mode.palette_reg.GetRectangle(m_draw_mode.mode_reg.texture_mode).rintersects(update_rect))))
+ GetPaletteRect(m_draw_mode.palette_reg, m_draw_mode.mode_reg.texture_mode).rintersects(update_rect))))
{
m_draw_mode.SetTexturePageChanged();
}
@@ -878,6 +930,8 @@ void GPU_HW::ClearFramebuffer()
g_gpu_device->ClearDepth(m_vram_depth_texture.get(), m_pgxp_depth_buffer ? 1.0f : 0.0f);
}
ClearVRAMDirtyRectangle();
+ if (m_use_texture_cache)
+ GPUTextureCache::Invalidate();
m_last_depth_z = 1.0f;
}
@@ -982,7 +1036,7 @@ bool GPU_HW::CompilePipelines(Error* error)
const u32 active_texture_modes =
m_allow_sprite_mode ? NUM_TEXTURE_MODES :
(NUM_TEXTURE_MODES - (NUM_TEXTURE_MODES - static_cast(BatchTextureMode::SpriteStart)));
- const u32 total_vertex_shaders = (m_allow_sprite_mode ? 5 : 3);
+ const u32 total_vertex_shaders = (m_allow_sprite_mode ? 7 : 3);
const u32 total_fragment_shaders =
(active_texture_modes * 5 * 9 * 2 * (1 + BoolToUInt32(!true_color)) *
(1 + BoolToUInt32(!m_force_progressive_scan)) * (1 + BoolToUInt32(needs_rov_depth)));
@@ -1009,7 +1063,7 @@ bool GPU_HW::CompilePipelines(Error* error)
// vertex shaders - [textured/palette/sprite]
// fragment shaders - [depth_test][render_mode][transparency_mode][texture_mode][check_mask][dithering][interlacing]
static constexpr auto destroy_shader = [](std::unique_ptr& s) { s.reset(); };
- DimensionalArray, 2, 2, 2> batch_vertex_shaders{};
+ DimensionalArray, 2, 3, 2> batch_vertex_shaders{};
DimensionalArray, 2, 2, 2, NUM_TEXTURE_MODES, 5, 5, 2> batch_fragment_shaders{};
ScopedGuard batch_shader_guard([&batch_vertex_shaders, &batch_fragment_shaders]() {
batch_vertex_shaders.enumerate(destroy_shader);
@@ -1018,13 +1072,13 @@ bool GPU_HW::CompilePipelines(Error* error)
for (u8 textured = 0; textured < 2; textured++)
{
- for (u8 palette = 0; palette < (textured ? 2 : 1); palette++)
+ for (u8 palette = 0; palette < (textured ? 3 : 1); palette++)
{
for (u8 sprite = 0; sprite < (textured ? 2 : 1); sprite++)
{
const bool uv_limits = ShouldClampUVs(sprite ? m_sprite_texture_filtering : m_texture_filtering);
const std::string vs = shadergen.GenerateBatchVertexShader(
- textured != 0, palette != 0, uv_limits, !sprite && force_round_texcoords, m_pgxp_depth_buffer);
+ textured != 0, palette == 1, palette == 2, uv_limits, !sprite && force_round_texcoords, m_pgxp_depth_buffer);
if (!(batch_vertex_shaders[textured][palette][sprite] =
g_gpu_device->CreateShader(GPUShaderStage::Vertex, shadergen.GetLanguage(), vs, error)))
{
@@ -1191,6 +1245,8 @@ bool GPU_HW::CompilePipelines(Error* error)
static_cast(texture_mode) == BatchTextureMode::Palette8Bit ||
static_cast(texture_mode) == BatchTextureMode::SpritePalette4Bit ||
static_cast(texture_mode) == BatchTextureMode::SpritePalette8Bit);
+ const bool page_texture =
+ (static_cast(texture_mode) == BatchTextureMode::PageTexture);
const bool sprite = (static_cast(texture_mode) >= BatchTextureMode::SpriteStart);
const bool uv_limits = ShouldClampUVs(sprite ? m_sprite_texture_filtering : m_texture_filtering);
const bool use_shader_blending = (render_mode == static_cast(BatchRenderMode::ShaderBlend));
@@ -1204,7 +1260,9 @@ bool GPU_HW::CompilePipelines(Error* error)
std::span(vertex_attributes, NUM_BATCH_VERTEX_ATTRIBUTES);
plconfig.vertex_shader =
- batch_vertex_shaders[BoolToUInt8(textured)][BoolToUInt8(palette)][BoolToUInt8(sprite)].get();
+ batch_vertex_shaders[BoolToUInt8(textured)][page_texture ? 2 : BoolToUInt8(palette)]
+ [BoolToUInt8(sprite)]
+ .get();
plconfig.fragment_shader =
batch_fragment_shaders[BoolToUInt8(depth_test && needs_rov_depth)][render_mode]
[use_shader_blending ? transparency_mode :
@@ -1834,19 +1892,26 @@ void GPU_HW::UnmapGPUBuffer(u32 used_vertices, u32 used_indices)
}
ALWAYS_INLINE_RELEASE void GPU_HW::DrawBatchVertices(BatchRenderMode render_mode, u32 num_indices, u32 base_index,
- u32 base_vertex)
+ u32 base_vertex, const GPUTextureCache::Source* texture)
{
// [depth_test][transparency_mode][render_mode][texture_mode][dithering][interlacing][check_mask]
- const u8 texture_mode = static_cast(m_batch.texture_mode) +
- ((m_batch.texture_mode != BatchTextureMode::Disabled && m_batch.sprite_mode) ?
- static_cast(BatchTextureMode::SpriteStart) :
- 0);
+ const u8 texture_mode = texture ? static_cast(BatchTextureMode::PageTexture) :
+ (static_cast(m_batch.texture_mode) +
+ ((m_batch.texture_mode < BatchTextureMode::PageTexture && m_batch.sprite_mode) ?
+ static_cast(BatchTextureMode::SpriteStart) :
+ 0));
const u8 depth_test = BoolToUInt8(m_batch.use_depth_buffer);
const u8 check_mask = BoolToUInt8(m_batch.check_mask_before_draw);
g_gpu_device->SetPipeline(m_batch_pipelines[depth_test][static_cast(m_batch.transparency_mode)][static_cast(
render_mode)][texture_mode][BoolToUInt8(m_batch.dithering)][BoolToUInt8(m_batch.interlacing)][check_mask]
.get());
+ // TODO: Totally not optimized.
+ if (texture)
+ g_gpu_device->SetTextureSampler(0, texture->texture, g_gpu_device->GetNearestSampler());
+ else if (texture_mode != static_cast(BatchTextureMode::Disabled))
+ g_gpu_device->SetTextureSampler(0, m_vram_read_texture.get(), g_gpu_device->GetNearestSampler());
+
GL_INS_FMT("Texture mode: {}", s_batch_texture_modes[texture_mode]);
GL_INS_FMT("Transparency mode: {}", s_transparency_modes[static_cast(m_batch.transparency_mode)]);
GL_INS_FMT("Render mode: {}", s_batch_render_modes[static_cast(render_mode)]);
@@ -2197,7 +2262,7 @@ void GPU_HW::ComputePolygonUVLimits(BatchVertex* vertices, u32 num_vertices)
for (u32 i = 0; i < num_vertices; i++)
vertices[i].SetUVLimits(min_u, max_u, min_v, max_v);
- if (m_texpage_dirty != 0)
+ if (ShouldCheckForTexPageOverlap())
CheckForTexPageOverlap(GSVector4i(min).upl32(GSVector4i(max)).u16to32());
}
@@ -2618,7 +2683,7 @@ void GPU_HW::LoadVertices()
const u32 tex_right = tex_left + quad_width;
const u32 uv_limits = BatchVertex::PackUVLimits(tex_left, tex_right - 1, tex_top, tex_bottom - 1);
- if (rc.texture_enable && m_texpage_dirty != 0)
+ if (rc.texture_enable && ShouldCheckForTexPageOverlap())
{
CheckForTexPageOverlap(GSVector4i(static_cast(tex_left), static_cast(tex_top),
static_cast(tex_right), static_cast(tex_bottom)));
@@ -2842,7 +2907,7 @@ bool GPU_HW::BlitVRAMReplacementTexture(const TextureReplacements::ReplacementIm
ALWAYS_INLINE_RELEASE void GPU_HW::CheckForTexPageOverlap(GSVector4i uv_rect)
{
- DebugAssert(m_texpage_dirty != 0 && m_batch.texture_mode != BatchTextureMode::Disabled);
+ DebugAssert((m_texpage_dirty != 0 || m_texture_dumping) && m_batch.texture_mode != BatchTextureMode::Disabled);
if (m_texture_window_active)
{
@@ -2869,6 +2934,34 @@ ALWAYS_INLINE_RELEASE void GPU_HW::CheckForTexPageOverlap(GSVector4i uv_rect)
m_current_uv_rect = new_uv_rect;
bool update_drawn = false, update_written = false;
+ if (m_texpage_dirty & TEXPAGE_DIRTY_PAGE_RECT)
+ {
+ DebugAssert(!(m_texpage_dirty & (TEXPAGE_DIRTY_DRAWN_RECT | TEXPAGE_DIRTY_WRITTEN_RECT)));
+ DebugAssert(m_batch.texture_mode == BatchTextureMode::PageTexture &&
+ m_batch.texture_cache_key.page < NUM_VRAM_PAGES);
+
+ if (GPUTextureCache::AreSourcePagesDrawn(m_batch.texture_cache_key, m_current_uv_rect))
+ {
+ // UVs intersect with drawn area, can't use TC
+ if (m_batch_index_count > 0)
+ {
+ FlushRender();
+ EnsureVertexBufferSpaceForCurrentCommand();
+ }
+
+ // We need to swap the dirty tracking over to drawn/written.
+ const GSVector4i page_rect = GetTextureRect(m_batch.texture_cache_key.page, m_batch.texture_cache_key.mode);
+ m_texpage_dirty = (m_vram_dirty_draw_rect.rintersects(page_rect) ? TEXPAGE_DIRTY_DRAWN_RECT : 0) |
+ (m_vram_dirty_write_rect.rintersects(page_rect) ? TEXPAGE_DIRTY_WRITTEN_RECT : 0);
+ m_compute_uv_range = (ShouldCheckForTexPageOverlap() || m_clamp_uvs);
+ m_batch.texture_mode = static_cast(m_draw_mode.mode_reg.texture_mode.GetValue());
+ }
+ else
+ {
+ // Page isn't drawn, we're done.
+ return;
+ }
+ }
if (m_texpage_dirty & TEXPAGE_DIRTY_DRAWN_RECT)
{
DebugAssert(!m_vram_dirty_draw_rect.eq(INVALID_RECT));
@@ -2903,6 +2996,11 @@ ALWAYS_INLINE_RELEASE void GPU_HW::CheckForTexPageOverlap(GSVector4i uv_rect)
}
}
+bool GPU_HW::ShouldCheckForTexPageOverlap() const
+{
+ return (m_texpage_dirty != 0);
+}
+
ALWAYS_INLINE bool GPU_HW::IsFlushed() const
{
return (m_batch_index_count == 0);
@@ -2999,8 +3097,9 @@ ALWAYS_INLINE float GPU_HW::GetCurrentNormalizedVertexDepth() const
void GPU_HW::UpdateSoftwareRenderer(bool copy_vram_from_hw)
{
+ // TODO: SW-for-readbacks is currently incompatible with the texture cache, due to threading races.
const bool current_enabled = (m_sw_renderer != nullptr);
- const bool new_enabled = g_settings.gpu_use_software_renderer_for_readbacks;
+ const bool new_enabled = !m_use_texture_cache && g_settings.gpu_use_software_renderer_for_readbacks;
if (current_enabled == new_enabled)
return;
@@ -3078,7 +3177,21 @@ void GPU_HW::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color)
GL_INS_FMT("Dirty draw area before: {}", m_vram_dirty_draw_rect);
const GSVector4i bounds = GetVRAMTransferBounds(x, y, width, height);
- AddUnclampedDrawnRectangle(bounds);
+
+ // If TC is enabled, we have to update local memory.
+ if (m_use_texture_cache && !IsInterlacedRenderingEnabled())
+ {
+ AddWrittenRectangle(bounds);
+
+ if (m_sw_renderer)
+ m_sw_renderer->Sync(true);
+ else
+ GPU::FillVRAM(x, y, width, height, color);
+ }
+ else
+ {
+ AddUnclampedDrawnRectangle(bounds);
+ }
GL_INS_FMT("Dirty draw area after: {}", m_vram_dirty_draw_rect);
@@ -3124,6 +3237,8 @@ void GPU_HW::ReadVRAM(u32 x, u32 y, u32 width, u32 height)
return;
}
+ // TODO: Only read if it's in the drawn area
+
// Get bounds with wrap-around handled.
GSVector4i copy_rect = GetVRAMTransferBounds(x, y, width, height);
@@ -3175,7 +3290,21 @@ void GPU_HW::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, b
{
GL_SCOPE_FMT("UpdateVRAM({},{} => {},{} ({}x{})", x, y, x + width, y + height, width, height);
- if (m_sw_renderer)
+ // TODO: Handle wrapped transfers... break them up or something
+ const GSVector4i bounds = GetVRAMTransferBounds(x, y, width, height);
+ DebugAssert(bounds.right <= static_cast(VRAM_WIDTH) && bounds.bottom <= static_cast(VRAM_HEIGHT));
+ AddWrittenRectangle(bounds);
+
+ // We want to dump *before* the write goes through, otherwise we dump bad data.
+ if (m_use_texture_cache)
+ {
+ if (m_sw_renderer)
+ m_sw_renderer->Sync(true);
+
+ GPU::UpdateVRAM(x, y, width, height, data, set_mask, check_mask);
+ GPUTextureCache::TrackVRAMWrite(bounds);
+ }
+ else if (m_sw_renderer)
{
const u32 num_words = width * height;
GPUBackendUpdateVRAMCommand* cmd = m_sw_renderer->NewUpdateVRAMCommand(num_words);
@@ -3190,10 +3319,6 @@ void GPU_HW::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, b
m_sw_renderer->PushCommand(cmd);
}
- const GSVector4i bounds = GetVRAMTransferBounds(x, y, width, height);
- DebugAssert(bounds.right <= static_cast(VRAM_WIDTH) && bounds.bottom <= static_cast(VRAM_HEIGHT));
- AddWrittenRectangle(bounds);
-
if (check_mask)
{
// set new vertex counter since we want this to take into consideration previous masked pixels
@@ -3281,7 +3406,32 @@ void GPU_HW::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32
{
GL_SCOPE_FMT("CopyVRAM({}x{} @ {},{} => {},{}", width, height, src_x, src_y, dst_x, dst_y);
- if (m_sw_renderer)
+ // masking enabled, oversized, or overlapping
+ const bool use_shader =
+ (m_GPUSTAT.IsMaskingEnabled() || ((src_x % VRAM_WIDTH) + width) > VRAM_WIDTH ||
+ ((src_y % VRAM_HEIGHT) + height) > VRAM_HEIGHT || ((dst_x % VRAM_WIDTH) + width) > VRAM_WIDTH ||
+ ((dst_y % VRAM_HEIGHT) + height) > VRAM_HEIGHT);
+ const GSVector4i src_bounds = GetVRAMTransferBounds(src_x, src_y, width, height);
+ const GSVector4i dst_bounds = GetVRAMTransferBounds(dst_x, dst_y, width, height);
+
+ // If we're copying a region that hasn't been drawn to, and we're using the TC, we can do it in local memory.
+ if (m_use_texture_cache && !GPUTextureCache::IsRectDrawn(src_bounds))
+ {
+ GL_INS("Performed in local memory.");
+
+ if (m_sw_renderer)
+ m_sw_renderer->Sync(true);
+
+ GPUTextureCache::AddWrittenRectangle(dst_bounds);
+ // GPUTextureCache::AddCopiedRectanglePart1(dst_bounds); // needed for FF8 because it animates textures by copying
+ GPU::CopyVRAM(src_x, src_y, dst_x, dst_y, width, height);
+ UpdateVRAMOnGPU(dst_bounds.left, dst_bounds.top, dst_bounds.width(), dst_bounds.height(),
+ &g_vram[dst_bounds.top * VRAM_WIDTH + dst_bounds.left], VRAM_WIDTH * sizeof(u16), false, false,
+ dst_bounds);
+ // GPUTextureCache::AddCopiedRectanglePart2(dst_bounds);
+ return;
+ }
+ else if (m_sw_renderer)
{
GPUBackendCopyVRAMCommand* cmd = m_sw_renderer->NewCopyVRAMCommand();
FillBackendCommandParameters(cmd);
@@ -3294,16 +3444,8 @@ void GPU_HW::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32
m_sw_renderer->PushCommand(cmd);
}
- // masking enabled, oversized, or overlapping
- const bool use_shader =
- (m_GPUSTAT.IsMaskingEnabled() || ((src_x % VRAM_WIDTH) + width) > VRAM_WIDTH ||
- ((src_y % VRAM_HEIGHT) + height) > VRAM_HEIGHT || ((dst_x % VRAM_WIDTH) + width) > VRAM_WIDTH ||
- ((dst_y % VRAM_HEIGHT) + height) > VRAM_HEIGHT);
- const GSVector4i src_bounds = GetVRAMTransferBounds(src_x, src_y, width, height);
- const GSVector4i dst_bounds = GetVRAMTransferBounds(dst_x, dst_y, width, height);
const bool intersect_with_draw = m_vram_dirty_draw_rect.rintersects(src_bounds);
const bool intersect_with_write = m_vram_dirty_write_rect.rintersects(src_bounds);
-
if (use_shader || IsUsingMultisampling())
{
if (intersect_with_draw || intersect_with_write)
@@ -3341,6 +3483,7 @@ void GPU_HW::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32
g_gpu_device->SetViewportAndScissor(dst_bounds_scaled);
g_gpu_device->SetPipeline(
m_vram_copy_pipelines[BoolToUInt8(m_GPUSTAT.check_mask_before_draw && m_write_mask_as_depth)].get());
+ g_gpu_device->SetTextureSampler(0, m_vram_read_texture.get(), g_gpu_device->GetNearestSampler());
g_gpu_device->PushUniformBuffer(&uniforms, sizeof(uniforms));
g_gpu_device->Draw(3, 0);
RestoreDeviceContext();
@@ -3360,7 +3503,8 @@ void GPU_HW::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32
UpdateVRAMReadTexture(intersect_with_draw, intersect_with_write);
}
- if (intersect_with_draw)
+ // We don't have it in local memory, so TC can't read it.
+ if (intersect_with_draw || m_use_texture_cache)
{
AddUnclampedDrawnRectangle(dst_bounds);
}
@@ -3396,77 +3540,112 @@ void GPU_HW::DispatchRenderCommand()
{
const GPURenderCommand rc{m_render_command.bits};
- BatchTextureMode texture_mode = BatchTextureMode::Disabled;
+ // TODO: avoid all this for vertex loading, only do when the type of draw changes
+ BatchTextureMode texture_mode = rc.IsTexturingEnabled() ? m_batch.texture_mode : BatchTextureMode::Disabled;
+ GPUTextureCache::SourceKey texture_cache_key = m_batch.texture_cache_key;
if (rc.IsTexturingEnabled())
{
// texture page changed - check that the new page doesn't intersect the drawing area
- if (m_draw_mode.IsTexturePageChanged())
+ if (m_draw_mode.IsTexturePageChanged() || texture_mode == BatchTextureMode::Disabled)
{
m_draw_mode.ClearTexturePageChangedFlag();
-#if 0
- if (!m_vram_dirty_draw_rect.eq(INVALID_RECT) || !m_vram_dirty_write_rect.eq(INVALID_RECT))
- {
- GL_INS_FMT("VRAM DIRTY: {} {}", m_vram_dirty_draw_rect, m_vram_dirty_write_rect);
- GL_INS_FMT("PAGE RECT: {}", m_draw_mode.mode_reg.GetTexturePageRectangle());
- if (m_draw_mode.mode_reg.IsUsingPalette())
- GL_INS_FMT("PALETTE RECT: {}", m_draw_mode.palette_reg.GetRectangle(m_draw_mode.mode_reg.texture_mode));
- }
-#endif
+ // start by assuming we can use the TC
+ bool use_texture_cache = m_use_texture_cache;
+ // check that the palette isn't in a drawn area
if (m_draw_mode.mode_reg.IsUsingPalette())
{
- const GSVector4i palette_rect = m_draw_mode.palette_reg.GetRectangle(m_draw_mode.mode_reg.texture_mode);
- const bool update_drawn = palette_rect.rintersects(m_vram_dirty_draw_rect);
- const bool update_written = palette_rect.rintersects(m_vram_dirty_write_rect);
- if (update_drawn || update_written)
+ const GSVector4i palette_rect =
+ GetPaletteRect(m_draw_mode.palette_reg, m_draw_mode.mode_reg.texture_mode, use_texture_cache);
+ if (!use_texture_cache || GPUTextureCache::IsRectDrawn(palette_rect))
{
- GL_INS("Palette in VRAM dirty area, flushing cache");
- if (!IsFlushed())
- FlushRender();
+ if (use_texture_cache)
+ GL_INS_FMT("Palette at {} is in drawn area, can't use TC", palette_rect);
+ use_texture_cache = false;
- UpdateVRAMReadTexture(update_drawn, update_written);
+ const bool update_drawn = palette_rect.rintersects(m_vram_dirty_draw_rect);
+ const bool update_written = palette_rect.rintersects(m_vram_dirty_write_rect);
+ if (update_drawn || update_written)
+ {
+ GL_INS("Palette in VRAM dirty area, flushing cache");
+ if (!IsFlushed())
+ FlushRender();
+
+ UpdateVRAMReadTexture(update_drawn, update_written);
+ }
}
}
- const GSVector4i page_rect = m_draw_mode.mode_reg.GetTexturePageRectangle();
- GSVector4i::storel(m_current_texture_page_offset, page_rect);
+ m_compute_uv_range = (m_clamp_uvs || m_texture_dumping);
- u8 new_texpage_dirty = m_vram_dirty_draw_rect.rintersects(page_rect) ? TEXPAGE_DIRTY_DRAWN_RECT : 0;
- new_texpage_dirty |= m_vram_dirty_write_rect.rintersects(page_rect) ? TEXPAGE_DIRTY_WRITTEN_RECT : 0;
+ const GPUTextureMode gpu_texture_mode =
+ (m_draw_mode.mode_reg.texture_mode == GPUTextureMode::Reserved_Direct16Bit) ? GPUTextureMode::Direct16Bit :
+ m_draw_mode.mode_reg.texture_mode;
+ const GSVector4i page_rect = GetTextureRect(m_draw_mode.mode_reg.texture_page, m_draw_mode.mode_reg.texture_mode);
- if (new_texpage_dirty != 0)
+ // TODO: This will result in incorrect global-space UVs when the texture page wraps around.
+ // Need to deal with it if it becomes a problem.
+ m_current_texture_page_offset[0] = static_cast(m_draw_mode.mode_reg.GetTexturePageBaseX());
+ m_current_texture_page_offset[1] = static_cast(m_draw_mode.mode_reg.GetTexturePageBaseY());
+
+ if (use_texture_cache)
{
- GL_INS("Texpage is in dirty area, checking UV ranges");
- m_texpage_dirty = new_texpage_dirty;
- m_compute_uv_range = true;
- m_current_uv_rect = INVALID_RECT;
+ texture_mode = BatchTextureMode::PageTexture;
+ texture_cache_key =
+ GPUTextureCache::SourceKey(m_draw_mode.mode_reg.texture_page, m_draw_mode.palette_reg, gpu_texture_mode);
+
+ const bool is_drawn = GPUTextureCache::IsRectDrawn(page_rect);
+ if (is_drawn)
+ GL_INS_FMT("Texpage [{}] {} is drawn in TC, checking UV ranges", texture_cache_key.page, page_rect);
+
+ m_texpage_dirty =
+ (is_drawn ? TEXPAGE_DIRTY_PAGE_RECT : 0) | (m_texture_dumping ? TEXPAGE_DIRTY_ONLY_UV_RECT : 0);
+ m_compute_uv_range |= ShouldCheckForTexPageOverlap();
}
else
{
- m_compute_uv_range = m_clamp_uvs;
- if (m_texpage_dirty)
- GL_INS("Texpage is no longer dirty");
- m_texpage_dirty = 0;
+ texture_mode = static_cast(gpu_texture_mode);
+ m_texpage_dirty = (m_vram_dirty_draw_rect.rintersects(page_rect) ? TEXPAGE_DIRTY_DRAWN_RECT : 0) |
+ (m_vram_dirty_write_rect.rintersects(page_rect) ? TEXPAGE_DIRTY_WRITTEN_RECT : 0);
+ if (m_texpage_dirty & TEXPAGE_DIRTY_DRAWN_RECT)
+ GL_INS_FMT("Texpage {} is in dirty DRAWN area {}", page_rect, m_vram_dirty_draw_rect);
+ if (m_texpage_dirty & TEXPAGE_DIRTY_WRITTEN_RECT)
+ GL_INS_FMT("Texpage {} is in dirty WRITTEN area {}", page_rect, m_vram_dirty_write_rect);
+
+ // Current UV rect _must_ be cleared here, because we only check for texpage intersection when it grows in
+ // size, a switch from a non-contained page to a contained page would go undetected otherwise.
+ if (m_texpage_dirty != 0)
+ {
+ m_compute_uv_range = true;
+ m_current_uv_rect = INVALID_RECT;
+ }
}
}
-
- texture_mode = (m_draw_mode.mode_reg.texture_mode == GPUTextureMode::Reserved_Direct16Bit) ?
- BatchTextureMode::Direct16Bit :
- static_cast(m_draw_mode.mode_reg.texture_mode.GetValue());
}
+ DebugAssert((rc.IsTexturingEnabled() && (texture_mode == BatchTextureMode::PageTexture &&
+ texture_cache_key.mode == m_draw_mode.mode_reg.texture_mode) ||
+ texture_mode == static_cast(m_draw_mode.mode_reg.texture_mode.GetValue())) ||
+ (!rc.IsTexturingEnabled() && texture_mode == BatchTextureMode::Disabled));
+ DebugAssert(!(m_texpage_dirty & TEXPAGE_DIRTY_PAGE_RECT) || texture_mode == BatchTextureMode::PageTexture ||
+ !rc.IsTexturingEnabled());
+
// has any state changed which requires a new batch?
// Reverse blending breaks with mixed transparent and opaque pixels, so we have to do one draw per polygon.
// If we have fbfetch, we don't need to draw it in two passes. Test case: Suikoden 2 shadows.
const GPUTransparencyMode transparency_mode =
rc.transparency_enable ? m_draw_mode.mode_reg.transparency_mode : GPUTransparencyMode::Disabled;
const bool dithering_enable = (!m_true_color && rc.IsDitheringEnabled()) ? m_GPUSTAT.dither_enable : false;
- if (texture_mode != m_batch.texture_mode || transparency_mode != m_batch.transparency_mode ||
- (transparency_mode == GPUTransparencyMode::BackgroundMinusForeground && !m_allow_shader_blend) ||
- dithering_enable != m_batch.dithering)
+ if (!IsFlushed())
{
- FlushRender();
+ if (texture_mode != m_batch.texture_mode || transparency_mode != m_batch.transparency_mode ||
+ (transparency_mode == GPUTransparencyMode::BackgroundMinusForeground && !m_allow_shader_blend) ||
+ dithering_enable != m_batch.dithering ||
+ (texture_mode == BatchTextureMode::PageTexture && m_batch.texture_cache_key != texture_cache_key))
+ {
+ FlushRender();
+ }
}
EnsureVertexBufferSpaceForCurrentCommand();
@@ -3510,6 +3689,7 @@ void GPU_HW::DispatchRenderCommand()
m_batch.texture_mode = texture_mode;
m_batch.transparency_mode = transparency_mode;
m_batch.dithering = dithering_enable;
+ m_batch.texture_cache_key = texture_cache_key;
if (m_draw_mode.IsTextureWindowChanged())
{
@@ -3575,10 +3755,21 @@ void GPU_HW::FlushRender()
return;
#ifdef _DEBUG
- GL_SCOPE_FMT("Hardware Draw {}", ++s_draw_number);
+ GL_SCOPE_FMT("Hardware Draw {}: {}", ++s_draw_number, m_current_draw_rect);
#endif
GL_INS_FMT("Dirty draw area: {}", m_vram_dirty_draw_rect);
+ if (m_compute_uv_range)
+ GL_INS_FMT("UV rect: {}", m_current_uv_rect);
+
+ const GPUTextureCache::Source* texture = nullptr;
+ if (m_batch.texture_mode == BatchTextureMode::PageTexture)
+ {
+ texture = LookupSource(m_batch.texture_cache_key, m_current_uv_rect,
+ m_batch.transparency_mode != GPUTransparencyMode::Disabled ?
+ GPUTextureCache::PaletteRecordFlags::HasSemiTransparentDraws :
+ GPUTextureCache::PaletteRecordFlags::None);
+ }
if (m_batch_ubo_dirty)
{
@@ -3587,21 +3778,24 @@ void GPU_HW::FlushRender()
m_batch_ubo_dirty = false;
}
+ m_current_draw_rect = INVALID_RECT;
+ m_current_uv_rect = INVALID_RECT;
+
if (m_wireframe_mode != GPUWireframeMode::OnlyWireframe)
{
if (NeedsShaderBlending(m_batch.transparency_mode, m_batch.texture_mode, m_batch.check_mask_before_draw) ||
m_rov_active || (m_use_rov_for_shader_blend && m_pgxp_depth_buffer))
{
- DrawBatchVertices(BatchRenderMode::ShaderBlend, index_count, base_index, base_vertex);
+ DrawBatchVertices(BatchRenderMode::ShaderBlend, index_count, base_index, base_vertex, texture);
}
else if (NeedsTwoPassRendering())
{
- DrawBatchVertices(BatchRenderMode::OnlyOpaque, index_count, base_index, base_vertex);
- DrawBatchVertices(BatchRenderMode::OnlyTransparent, index_count, base_index, base_vertex);
+ DrawBatchVertices(BatchRenderMode::OnlyOpaque, index_count, base_index, base_vertex, texture);
+ DrawBatchVertices(BatchRenderMode::OnlyTransparent, index_count, base_index, base_vertex, texture);
}
else
{
- DrawBatchVertices(m_batch.GetRenderMode(), index_count, base_index, base_vertex);
+ DrawBatchVertices(m_batch.GetRenderMode(), index_count, base_index, base_vertex, texture);
}
}
@@ -3621,6 +3815,8 @@ void GPU_HW::UpdateDisplay()
GL_SCOPE("UpdateDisplay()");
+ GPUTextureCache::Compact();
+
if (g_settings.debugging.show_vram)
{
if (IsUsingMultisampling())
diff --git a/src/core/gpu_hw.h b/src/core/gpu_hw.h
index 1ada1e17f..d94c75cfd 100644
--- a/src/core/gpu_hw.h
+++ b/src/core/gpu_hw.h
@@ -4,6 +4,7 @@
#pragma once
#include "gpu.h"
+#include "gpu_hw_texture_cache.h"
#include "texture_replacements.h"
#include "util/gpu_device.h"
@@ -38,6 +39,7 @@ public:
Palette4Bit,
Palette8Bit,
Direct16Bit,
+ PageTexture,
Disabled,
SpritePalette4Bit,
@@ -52,6 +54,11 @@ public:
static_cast(BatchTextureMode::Palette8Bit) == static_cast(GPUTextureMode::Palette8Bit) &&
static_cast(BatchTextureMode::Direct16Bit) == static_cast(GPUTextureMode::Direct16Bit));
+ static constexpr GSVector4i VRAM_SIZE_RECT = GSVector4i::cxpr(0, 0, VRAM_WIDTH, VRAM_HEIGHT);
+ static constexpr GSVector4i INVALID_RECT =
+ GSVector4i::cxpr(std::numeric_limits::max(), std::numeric_limits::max(), std::numeric_limits::min(),
+ std::numeric_limits::min());
+
GPU_HW();
~GPU_HW() override;
@@ -83,6 +90,8 @@ private:
{
TEXPAGE_DIRTY_DRAWN_RECT = (1 << 0),
TEXPAGE_DIRTY_WRITTEN_RECT = (1 << 1),
+ TEXPAGE_DIRTY_PAGE_RECT = (1 << 2),
+ TEXPAGE_DIRTY_ONLY_UV_RECT = (1 << 3),
};
static_assert(GPUDevice::MIN_TEXEL_BUFFER_ELEMENTS >= (VRAM_WIDTH * VRAM_HEIGHT));
@@ -116,6 +125,8 @@ private:
bool use_depth_buffer = false;
bool sprite_mode = false;
+ GPUTextureCache::SourceKey texture_cache_key = {};
+
// Returns the render mode for this batch.
BatchRenderMode GetRenderMode() const;
};
@@ -136,11 +147,6 @@ private:
u32 num_uniform_buffer_updates;
};
- static constexpr GSVector4i VRAM_SIZE_RECT = GSVector4i::cxpr(0, 0, VRAM_WIDTH, VRAM_HEIGHT);
- static constexpr GSVector4i INVALID_RECT =
- GSVector4i::cxpr(std::numeric_limits::max(), std::numeric_limits::max(), std::numeric_limits::min(),
- std::numeric_limits::min());
-
/// Returns true if a depth buffer should be created.
GPUTexture::Format GetDepthBufferFormat() const;
@@ -165,7 +171,8 @@ private:
void DeactivateROV();
void MapGPUBuffer(u32 required_vertices, u32 required_indices);
void UnmapGPUBuffer(u32 used_vertices, u32 used_indices);
- void DrawBatchVertices(BatchRenderMode render_mode, u32 num_indices, u32 base_index, u32 base_vertex);
+ void DrawBatchVertices(BatchRenderMode render_mode, u32 num_indices, u32 base_index, u32 base_vertex,
+ const GPUTextureCache::Source* texture);
u32 CalculateResolutionScale() const;
GPUDownsampleMode GetDownsampleMode(u32 resolution_scale) const;
@@ -182,6 +189,7 @@ private:
void SetTexPageChangedOnOverlap(const GSVector4i update_rect);
void CheckForTexPageOverlap(GSVector4i uv_rect);
+ bool ShouldCheckForTexPageOverlap() const;
bool IsFlushed() const;
void EnsureVertexBufferSpace(u32 required_vertices, u32 required_indices);
@@ -286,6 +294,9 @@ private:
bool m_texture_window_active : 1 = false;
bool m_rov_active : 1 = false;
+ bool m_use_texture_cache : 1 = false;
+ bool m_texture_dumping : 1 = false;
+
u8 m_texpage_dirty = 0;
BatchConfig m_batch;
@@ -296,8 +307,9 @@ private:
// Bounding box of VRAM area that the GPU has drawn into.
GSVector4i m_vram_dirty_draw_rect = INVALID_RECT;
- GSVector4i m_vram_dirty_write_rect = INVALID_RECT;
+ GSVector4i m_vram_dirty_write_rect = INVALID_RECT; // TODO: Don't use in TC mode, should be kept at zero.
GSVector4i m_current_uv_rect = INVALID_RECT;
+ GSVector4i m_current_draw_rect = INVALID_RECT;
s32 m_current_texture_page_offset[2] = {};
std::unique_ptr m_wireframe_pipeline;
diff --git a/src/core/gpu_hw_shadergen.cpp b/src/core/gpu_hw_shadergen.cpp
index 07b10500b..2f35777b9 100644
--- a/src/core/gpu_hw_shadergen.cpp
+++ b/src/core/gpu_hw_shadergen.cpp
@@ -60,13 +60,14 @@ void GPU_HW_ShaderGen::WriteBatchUniformBuffer(std::stringstream& ss)
false);
}
-std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured, bool palette, bool uv_limits,
+std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured, bool palette, bool page_texture, bool uv_limits,
bool force_round_texcoords, bool pgxp_depth)
{
std::stringstream ss;
WriteHeader(ss);
DefineMacro(ss, "TEXTURED", textured);
DefineMacro(ss, "PALETTE", palette);
+ DefineMacro(ss, "PAGE_TEXTURE", page_texture);
DefineMacro(ss, "UV_LIMITS", uv_limits);
DefineMacro(ss, "FORCE_ROUND_TEXCOORDS", force_round_texcoords);
DefineMacro(ss, "PGXP_DEPTH", pgxp_depth);
@@ -74,7 +75,22 @@ std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured, bool pale
WriteCommonFunctions(ss);
WriteBatchUniformBuffer(ss);
- if (textured)
+ if (textured && page_texture)
+ {
+ if (uv_limits)
+ {
+ DeclareVertexEntryPoint(
+ ss, {"float4 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage", "float4 a_uv_limits"}, 1, 1,
+ {{"nointerpolation", "float4 v_uv_limits"}}, false, "", UsingMSAA(), UsingPerSampleShading(),
+ m_disable_color_perspective);
+ }
+ else
+ {
+ DeclareVertexEntryPoint(ss, {"float4 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage"}, 1, 1, {},
+ false, "", UsingMSAA(), UsingPerSampleShading(), m_disable_color_perspective);
+ }
+ }
+ else if (textured)
{
if (uv_limits)
{
@@ -132,16 +148,18 @@ std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured, bool pale
v_col0 = a_col0;
#if TEXTURED
v_tex0 = float2(uint2(a_texcoord & 0xFFFFu, a_texcoord >> 16));
- #if !PALETTE
+ #if !PALETTE && !PAGE_TEXTURE
v_tex0 *= float(RESOLUTION_SCALE);
#endif
- // base_x,base_y,palette_x,palette_y
- v_texpage.x = (a_texpage & 15u) * 64u;
- v_texpage.y = ((a_texpage >> 4) & 1u) * 256u;
- #if PALETTE
- v_texpage.z = ((a_texpage >> 16) & 63u) * 16u;
- v_texpage.w = ((a_texpage >> 22) & 511u);
+ #if !PAGE_TEXTURE
+ // base_x,base_y,palette_x,palette_y
+ v_texpage.x = (a_texpage & 15u) * 64u;
+ v_texpage.y = ((a_texpage >> 4) & 1u) * 256u;
+ #if PALETTE
+ v_texpage.z = ((a_texpage >> 16) & 63u) * 16u;
+ v_texpage.w = ((a_texpage >> 22) & 511u);
+ #endif
#endif
#if UV_LIMITS
@@ -151,7 +169,7 @@ std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured, bool pale
// Add 0.5 to the upper bounds when upscaling, to work around interpolation differences.
// Limited to force-round-texcoord hack, to avoid breaking other games.
v_uv_limits.zw += 0.5;
- #elif !PALETTE
+ #elif !PAGE_TEXTURE && !PALETTE
// Treat coordinates as being in upscaled space, and extend the UV range to all "upscaled"
// pixels. This means 1-pixel-high polygon-based framebuffer effects won't be downsampled.
// (e.g. Mega Man Legends 2 haze effect)
@@ -712,6 +730,7 @@ std::string GPU_HW_ShaderGen::GenerateBatchFragmentShader(
const bool textured = (texture_mode != GPU_HW::BatchTextureMode::Disabled);
const bool palette =
(texture_mode == GPU_HW::BatchTextureMode::Palette4Bit || texture_mode == GPU_HW::BatchTextureMode::Palette8Bit);
+ const bool page_texture = (texture_mode == GPU_HW::BatchTextureMode::PageTexture);
const bool shader_blending = (render_mode == GPU_HW::BatchRenderMode::ShaderBlend);
const bool use_dual_source = (!shader_blending && !use_rov && m_supports_dual_source_blend &&
((render_mode != GPU_HW::BatchRenderMode::TransparencyDisabled &&
@@ -730,6 +749,7 @@ std::string GPU_HW_ShaderGen::GenerateBatchFragmentShader(
DefineMacro(ss, "PALETTE", palette);
DefineMacro(ss, "PALETTE_4_BIT", texture_mode == GPU_HW::BatchTextureMode::Palette4Bit);
DefineMacro(ss, "PALETTE_8_BIT", texture_mode == GPU_HW::BatchTextureMode::Palette8Bit);
+ DefineMacro(ss, "PAGE_TEXTURE", page_texture);
DefineMacro(ss, "DITHERING", dithering);
DefineMacro(ss, "DITHERING_SCALED", m_scaled_dithering);
DefineMacro(ss, "INTERLACING", interlacing);
@@ -810,6 +830,8 @@ uint2 FloatToIntegerCoords(float2 coords)
return uint2((RESOLUTION_SCALE == 1u || FORCE_ROUND_TEXCOORDS != 0) ? roundEven(coords) : floor(coords));
}
+#if !PAGE_TEXTURE
+
float4 SampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords)
{
#if PALETTE
@@ -863,11 +885,43 @@ float4 SampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords)
#endif
}
+#else
+
+float4 SampleFromPageTexture(float2 coords)
+{
+ // Cached textures.
+#if FORCE_ROUND_TEXCOORDS
+ float2 fpart = coords - roundEven(coords);
+#else
+ float2 fpart = frac(coords);
#endif
+ uint2 icoord = ApplyTextureWindow(FloatToIntegerCoords(coords));
+ coords = (float2(icoord) + fpart) * (1.0f / 256.0f);
+ return SAMPLE_TEXTURE(samp0, coords);
+}
+
+#endif
+
+#endif // TEXTURED
)";
const u32 num_fragment_outputs = use_rov ? 0 : (use_dual_source ? 2 : 1);
- if (textured)
+ if (textured && page_texture)
+ {
+ if (uv_limits)
+ {
+ DeclareFragmentEntryPoint(ss, 1, 1, {{"nointerpolation", "float4 v_uv_limits"}}, true, num_fragment_outputs,
+ use_dual_source, m_write_mask_as_depth, UsingMSAA(), UsingPerSampleShading(), false,
+ m_disable_color_perspective, shader_blending && !use_rov, use_rov);
+ }
+ else
+ {
+ DeclareFragmentEntryPoint(ss, 1, 1, {}, true, num_fragment_outputs, use_dual_source, m_write_mask_as_depth,
+ UsingMSAA(), UsingPerSampleShading(), false, m_disable_color_perspective,
+ shader_blending && !use_rov, use_rov);
+ }
+ }
+ else if (textured)
{
if (texture_filtering != GPUTextureFilter::Nearest)
WriteBatchTextureFilter(ss, texture_filtering);
@@ -913,7 +967,17 @@ float4 SampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords)
#if TEXTURED
float4 texcol;
- #if TEXTURE_FILTERING
+ #if PAGE_TEXTURE
+ #if UV_LIMITS
+ texcol = SampleFromPageTexture(clamp(v_tex0, v_uv_limits.xy, v_uv_limits.zw));
+ #else
+ texcol = SampleFromPageTexture(v_tex0);
+ #endif
+ if (VECTOR_EQ(texcol, TRANSPARENT_PIXEL_COLOR))
+ discard;
+
+ ialpha = 1.0;
+ #elif TEXTURE_FILTERING
FilteredSampleFromVRAM(v_texpage, v_tex0, v_uv_limits, texcol, ialpha);
if (ialpha < 0.5)
discard;
@@ -1712,3 +1776,33 @@ std::string GPU_HW_ShaderGen::GenerateBoxSampleDownsampleFragmentShader(u32 fact
return ss.str();
}
+
+std::string GPU_HW_ShaderGen::GenerateReplacementMergeFragmentShader(bool semitransparent)
+{
+ std::stringstream ss;
+ WriteHeader(ss);
+ DefineMacro(ss, "SEMITRANSPARENT", semitransparent);
+ DeclareUniformBuffer(ss, {"float4 u_src_rect"}, true);
+ DeclareTexture(ss, "samp0", 0);
+ DeclareFragmentEntryPoint(ss, 0, 1);
+
+ ss << R"(
+{
+ float2 coords = u_src_rect.xy + v_tex0 * u_src_rect.zw;
+ float4 color = SAMPLE_TEXTURE(samp0, coords);
+ o_col0.rgb = color.rgb;
+
+ // Alpha processing.
+ #if SEMITRANSPARENT
+ // Map anything not 255 to 1 for semitransparent, otherwise zero for opaque.
+ o_col0.a = (color.a <= 0.95f) ? 1.0f : 0.0f;
+ o_col0.a = VECTOR_EQ(color, float4(0.0, 0.0, 0.0, 0.0)) ? 0.0f : o_col0.a;
+ #else
+ // Leave (0,0,0,0) as 0000 for opaque replacements for cutout alpha.
+ o_col0.a = color.a;
+ #endif
+}
+)";
+
+ return ss.str();
+}
\ No newline at end of file
diff --git a/src/core/gpu_hw_shadergen.h b/src/core/gpu_hw_shadergen.h
index ef442b836..e07bb6cc9 100644
--- a/src/core/gpu_hw_shadergen.h
+++ b/src/core/gpu_hw_shadergen.h
@@ -15,8 +15,8 @@ public:
bool supports_dual_source_blend, bool supports_framebuffer_fetch);
~GPU_HW_ShaderGen();
- std::string GenerateBatchVertexShader(bool textured, bool palette, bool uv_limits, bool force_round_texcoords,
- bool pgxp_depth);
+ std::string GenerateBatchVertexShader(bool textured, bool palette, bool page_texture, bool uv_limits,
+ bool force_round_texcoords, bool pgxp_depth);
std::string GenerateBatchFragmentShader(GPU_HW::BatchRenderMode render_mode, GPUTransparencyMode transparency,
GPU_HW::BatchTextureMode texture_mode, GPUTextureFilter texture_filtering,
bool uv_limits, bool force_round_texcoords, bool dithering, bool interlacing,
@@ -36,6 +36,8 @@ public:
std::string GenerateAdaptiveDownsampleCompositeFragmentShader();
std::string GenerateBoxSampleDownsampleFragmentShader(u32 factor);
+ std::string GenerateReplacementMergeFragmentShader(bool semitransparent);
+
private:
ALWAYS_INLINE bool UsingMSAA() const { return m_multisamples > 1; }
ALWAYS_INLINE bool UsingPerSampleShading() const { return m_multisamples > 1 && m_per_sample_shading; }
diff --git a/src/core/gpu_hw_texture_cache.cpp b/src/core/gpu_hw_texture_cache.cpp
new file mode 100644
index 000000000..826a7ac32
--- /dev/null
+++ b/src/core/gpu_hw_texture_cache.cpp
@@ -0,0 +1,1915 @@
+// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin
+// SPDX-License-Identifier: CC-BY-NC-ND-4.0
+
+#include "gpu_hw_texture_cache.h"
+#include "gpu_hw.h"
+#include "gpu_hw_shadergen.h"
+#include "settings.h"
+#include "system.h"
+
+#include "util/gpu_device.h"
+#include "util/state_wrapper.h"
+
+#include "common/gsvector_formatter.h"
+#include "common/log.h"
+#include "common/string_util.h"
+
+#define XXH_STATIC_LINKING_ONLY
+#include "xxhash.h"
+#ifdef CPU_ARCH_SSE
+#include "xxh_x86dispatch.h"
+#endif
+
+#include
+#include
+#include
+
+Log_SetChannel(GPUTextureCache);
+
+// TODO: Fix copy-as-write.
+// TODO: Write coalescing, xenogears.
+
+// #define ALWAYS_TRACK_VRAM_WRITES 1
+
+namespace GPUTextureCache {
+static constexpr u32 MAX_CLUT_SIZE = 256;
+
+struct VRAMWrite
+{
+ GSVector4i active_rect;
+ GSVector4i write_rect;
+ HashType hash;
+
+ struct PaletteRecord
+ {
+ // TODO: Texture window, for sub texture dumping.
+ GSVector4i rect;
+ SourceKey key;
+ PaletteRecordFlags flags;
+
+ // Awkward to store, but we need to keep a backup copy of each CLUT, because if the CLUT gets overwritten
+ // before the VRAM write, when we go to dump the texture, it'll be incorrect.
+ HashType palette_hash;
+ u16 palette[MAX_CLUT_SIZE];
+ };
+
+ // List of palettes and rectangles drawn for dumping.
+ // TODO: Keep these in texel-local space, not global space, that way texture sizes aren't aligned to 4 pixels.
+ // But realistically, that probably isn't super common, and also requires modifying the renderer side of things.
+ std::vector palette_records;
+
+ u32 num_splits;
+ u32 num_page_refs;
+ std::array, MAX_PAGE_REFS_PER_WRITE> page_refs;
+};
+
+struct PageEntry
+{
+ TList