/**
|
|
******************************************************************************
|
|
* Xenia : Xbox 360 Emulator Research Project *
|
|
******************************************************************************
|
|
* Copyright 2022 Ben Vanik. All rights reserved. *
|
|
* Released under the BSD license - see LICENSE in the root for more details. *
|
|
******************************************************************************
|
|
*/
|
|
|
|
#ifndef XENIA_GPU_TEXTURE_CACHE_H_
|
|
#define XENIA_GPU_TEXTURE_CACHE_H_
|
|
|
|
#include <array>
|
|
#include <atomic>
|
|
#include <cstdint>
|
|
#include <cstring>
|
|
#include <memory>
|
|
#include <unordered_map>
|
|
|
|
#include "xenia/base/assert.h"
|
|
#include "xenia/base/hash.h"
|
|
#include "xenia/base/math.h"
|
|
#include "xenia/base/mutex.h"
|
|
#include "xenia/gpu/register_file.h"
|
|
#include "xenia/gpu/shared_memory.h"
|
|
#include "xenia/gpu/texture_util.h"
|
|
#include "xenia/gpu/xenos.h"
|
|
|
|
namespace xe {
|
|
namespace gpu {
|
|
|
|
// Manages host copies of guest textures, performing untiling, format and endian
|
|
// conversion of textures stored in the shared memory, and also handling
|
|
// invalidation.
|
|
//
|
|
// Mipmaps are treated the following way, according to the GPU hang message
|
|
// found in game executables explaining the valid usage of BaseAddress when
|
|
// streaming the largest LOD (it says games should not use 0 as the base address
|
|
// when the largest LOD isn't loaded, but rather, either allocate a valid
|
|
// address for it or make it the same as mip_address):
|
|
// - If the texture has a base address, but no mip address, it's not mipmapped -
|
|
// the host texture has only the largest level too.
|
|
// - If the texture has different non-zero base address and mip address, a host
|
|
// texture with mip_max_level+1 mipmaps is created - mip_min_level is ignored
|
|
// and treated purely as sampler state because there are tfetch instructions
|
|
// working directly with LOD values - including fetching with an explicit LOD.
|
|
// However, the max level is not ignored because any mip count can be
|
|
// specified when creating a texture, and another texture may be placed after
|
|
// the last one.
|
|
// - If the texture has a mip address, but the base address is 0 or the same as
|
|
// the mip address, a mipmapped texture is created, but min/max LOD is clamped
|
|
// to the lower bound of 1 - the game is expected to do that anyway until the
|
|
// largest LOD is loaded.
|
|
// TODO(Triang3l): Attach the largest LOD to existing textures with a valid
|
|
// mip_address but no base ever used yet (no base_address) to save memory
|
|
// because textures are streamed this way anyway.
|
|
class TextureCache {
 public:
  // Hard limit, originating from the half-pixel offset filling hack in the
  // resolve shaders only filling up to 3 pixels, due to the bit counts used for
  // passing the scale to shaders, and because the full 490 MB EDRAM buffer is
  // within the minimum Direct3D 12 requirement of 128 * 2^20 texels in a single
  // buffer binding (counted as R32 for a byte address buffer).
  static constexpr uint32_t kMaxDrawResolutionScaleAlongAxis = 7;

  TextureCache(const TextureCache& texture_cache) = delete;
  TextureCache& operator=(const TextureCache& texture_cache) = delete;
  virtual ~TextureCache();

  // Returns whether the actual scale is not smaller than the requested one.
  static bool GetConfigDrawResolutionScale(uint32_t& x_out, uint32_t& y_out);
  uint32_t draw_resolution_scale_x() const { return draw_resolution_scale_x_; }
  uint32_t draw_resolution_scale_y() const { return draw_resolution_scale_y_; }

  // Precomputed magic-number divisors for the resolution scales, for
  // division-free conversion of scaled coordinates back to guest ones.
  divisors::MagicDiv draw_resolution_scale_x_divisor() const {
    return draw_resolution_scale_x_divisor_;
  }
  divisors::MagicDiv draw_resolution_scale_y_divisor() const {
    return draw_resolution_scale_y_divisor_;
  }

  bool IsDrawResolutionScaled() const {
    return draw_resolution_scale_x_ > 1 || draw_resolution_scale_y_ > 1;
  }

  virtual void ClearCache();

  virtual void CompletedSubmissionUpdated(uint64_t completed_submission_index);
  virtual void BeginSubmission(uint64_t new_submission_index);
  virtual void BeginFrame();

  void MarkRangeAsResolved(uint32_t start_unscaled, uint32_t length_unscaled);
  // Ensures the memory backing the range in the scaled resolve address space is
  // allocated and returns whether it is.
  virtual bool EnsureScaledResolveMemoryCommitted(
      uint32_t start_unscaled, uint32_t length_unscaled,
      uint32_t length_scaled_alignment_log2 = 0) {
    return false;
  }

  static uint32_t GuestToHostSwizzle(uint32_t guest_swizzle,
                                     uint32_t host_format_swizzle);

  // Marks a single fetch constant's binding as out of sync so it's re-parsed
  // by the next RequestTextures call.
  void TextureFetchConstantWritten(uint32_t index) {
    texture_bindings_in_sync_ &= ~(UINT32_C(1) << index);
  }
  void TextureFetchConstantsWritten(uint32_t first_index, uint32_t last_index) {
    // XOR of the "all bits below first_index" mask with the "all bits through
    // last_index" mask leaves exactly the bits in [first_index, last_index]
    // inclusive set. 1ULL is used for the second mask so last_index == 31
    // doesn't overflow the shift width of uint32_t.
    uint32_t res = ((1U << first_index) - 1) ^
                   static_cast<uint32_t>((1ULL << (last_index + 1)) - 1ULL);
    texture_bindings_in_sync_ &= ~res;
  }

  virtual void RequestTextures(uint32_t used_texture_mask);

  // "ActiveTexture" means as of the latest RequestTextures call.

  uint32_t GetActiveTextureHostSwizzle(uint32_t fetch_constant_index) const {
    const TextureBinding* binding =
        GetValidTextureBinding(fetch_constant_index);
    return binding ? binding->host_swizzle : xenos::XE_GPU_TEXTURE_SWIZZLE_0000;
  }
  uint8_t GetActiveTextureSwizzledSigns(uint32_t fetch_constant_index) const {
    const TextureBinding* binding =
        GetValidTextureBinding(fetch_constant_index);
    return binding ? binding->swizzled_signs : kSwizzledSignsUnsigned;
  }
  bool IsActiveTextureResolved(uint32_t fetch_constant_index) const {
    const TextureBinding* binding =
        GetValidTextureBinding(fetch_constant_index);
    if (!binding) {
      return false;
    }
    return (binding->texture && binding->texture->IsResolved()) ||
           (binding->texture_signed && binding->texture_signed->IsResolved());
  }
  template <swcache::PrefetchTag tag>
  void PrefetchTextureBinding(uint32_t fetch_constant_index) const {
    swcache::Prefetch<tag>(&texture_bindings_[fetch_constant_index]);
    swcache::Prefetch<tag>(
        &texture_bindings_[fetch_constant_index +
                           1]);  // we may cross a cache line boundary :( size
                                 // of the structure is 0x28
  }

 protected:
  struct TextureKey {
    // Dimensions minus 1 are stored similarly to how they're stored in fetch
    // constants so fewer bits can be used, while the maximum size (8192 for 2D)
    // can still be encoded (a 8192x sky texture is used in 4D530910).

    // Physical 4 KB page with the base mip level, disregarding A/C/E address
    // range prefix.
    uint32_t base_page : 17;             // 17 total
    xenos::DataDimension dimension : 2;  // 19
    uint32_t width_minus_1 : 13;         // 32

    uint32_t height_minus_1 : 13;  // 45
    uint32_t tiled : 1;            // 46
    uint32_t packed_mips : 1;      // 47
    // Physical 4 KB page with mip 1 and smaller.
    uint32_t mip_page : 17;  // 64

    // (Layers for stacked and 3D, 6 for cube, 1 for other dimensions) - 1.
    uint32_t depth_or_array_size_minus_1 : 10;  // 74
    uint32_t pitch : 9;                         // 83
    uint32_t mip_max_level : 4;                 // 87
    xenos::TextureFormat format : 6;            // 93
    xenos::Endian endianness : 2;               // 95
    // Whether this texture is signed and has a different host representation
    // than an unsigned view of the same guest texture.
    uint32_t signed_separate : 1;  // 96

    // Whether this texture is a resolution-scaled resolve target.
    uint32_t scaled_resolve : 1;  // 97
    // Least important in ==, so placed last.
    uint32_t is_valid : 1;  // 98

    TextureKey() { MakeInvalid(); }
    // Copying is done with memcpy rather than memberwise assignment so the
    // padding bits are carried over too, keeping the memcmp-based equality
    // and the raw-bytes hash below stable.
    TextureKey(const TextureKey& key) {
      std::memcpy(this, &key, sizeof(*this));
    }
    TextureKey& operator=(const TextureKey& key) {
      std::memcpy(this, &key, sizeof(*this));
      return *this;
    }
    void MakeInvalid() {
      // Zero everything, including the padding, for a stable hash.
      std::memset(this, 0, sizeof(*this));
    }

    using Hasher = xe::hash::XXHasher<TextureKey>;
    bool operator==(const TextureKey& key) const {
      return !std::memcmp(this, &key, sizeof(*this));
    }
    bool operator!=(const TextureKey& key) const { return !(*this == key); }

    uint32_t GetWidth() const { return width_minus_1 + 1; }
    uint32_t GetHeight() const { return height_minus_1 + 1; }
    uint32_t GetDepthOrArraySize() const {
      return depth_or_array_size_minus_1 + 1;
    }

    texture_util::TextureGuestLayout GetGuestLayout() const {
      return texture_util::GetGuestTextureLayout(
          dimension, pitch, GetWidth(), GetHeight(), GetDepthOrArraySize(),
          tiled, format, packed_mips, base_page != 0, mip_max_level);
    }

    static const char* GetLogDimensionName(xenos::DataDimension dimension);
    const char* GetLogDimensionName() const {
      return GetLogDimensionName(dimension);
    }
    void LogAction(const char* action) const;
  };

  // Host representation of one guest texture (or of the signed variant of
  // one). Lifetime and invalidation are managed by the owning TextureCache.
  class Texture {
   public:
    Texture(const Texture& texture) = delete;
    Texture& operator=(const Texture& texture) = delete;
    virtual ~Texture();

    TextureCache& texture_cache() const { return texture_cache_; }

    const TextureKey& key() const { return key_; }

    const texture_util::TextureGuestLayout& guest_layout() const {
      return guest_layout_;
    }
    uint32_t GetGuestBaseSize() const {
      return guest_layout().base.level_data_extent_bytes;
    }
    uint32_t GetGuestMipsSize() const {
      return guest_layout().mips_total_extent_bytes;
    }

    uint64_t GetHostMemoryUsage() const { return host_memory_usage_; }

    uint64_t last_usage_submission_index() const {
      return last_usage_submission_index_;
    }
    uint64_t last_usage_time() const { return last_usage_time_; }

    bool GetBaseResolved() const { return base_resolved_; }
    void SetBaseResolved(bool base_resolved) {
      // A scaled-resolve texture's base must always stay flagged as resolved.
      assert_false(!base_resolved && key().scaled_resolve);
      base_resolved_ = base_resolved;
    }
    bool GetMipsResolved() const { return mips_resolved_; }
    void SetMipsResolved(bool mips_resolved) {
      assert_false(!mips_resolved && key().scaled_resolve);
      mips_resolved_ = mips_resolved;
    }
    bool IsResolved() const { return base_resolved_ || mips_resolved_; }

    // The unused global_lock parameters document (and enforce at the call
    // site) that these must only be read while holding the global critical
    // region lock, to synchronize with the shared memory watches.
    bool base_outdated(const global_unique_lock_type& global_lock) const {
      return base_outdated_;
    }
    bool mips_outdated(const global_unique_lock_type& global_lock) const {
      return mips_outdated_;
    }
    void MakeUpToDateAndWatch(const global_unique_lock_type& global_lock);

    void WatchCallback(const global_unique_lock_type& global_lock, bool is_mip);

    // For LRU caching - updates the last usage frame and moves the texture to
    // the end of the usage queue. Must be called any time the texture is
    // referenced by any GPU work in the implementation to make sure it's not
    // destroyed while still in use.
    void MarkAsUsed();

    void LogAction(const char* action) const;

   protected:
    explicit Texture(TextureCache& texture_cache, const TextureKey& key);

    // Updates this texture's host memory usage, keeping the cache-wide total
    // in sync.
    void SetHostMemoryUsage(uint64_t new_host_memory_usage) {
      texture_cache_.UpdateTexturesTotalHostMemoryUsage(new_host_memory_usage,
                                                        host_memory_usage_);
      host_memory_usage_ = new_host_memory_usage;
    }

   private:
    TextureCache& texture_cache_;

    TextureKey key_;

    texture_util::TextureGuestLayout guest_layout_;

    uint64_t host_memory_usage_ = 0;

    // LRU bookkeeping - usage stamps and the intrusive doubly-linked usage
    // list maintained by MarkAsUsed.
    uint64_t last_usage_submission_index_;
    uint64_t last_usage_time_;
    Texture* used_previous_;
    Texture* used_next_;

    // Whether the most up-to-date base / mips contain pages with data from a
    // resolve operation (rather than from the CPU or memexport), primarily for
    // choosing between piecewise linear gamma and sRGB when the former is
    // emulated with the latter.
    bool base_resolved_;
    bool mips_resolved_;

    // These are to be accessed within the global critical region to synchronize
    // with shared memory.
    // Whether the recent base level data needs reloading from the memory.
    bool base_outdated_ = false;
    // Whether the recent mip data needs reloading from the memory.
    bool mips_outdated_ = false;
    // Watch handles for the memory ranges.
    SharedMemory::WatchHandle base_watch_handle_ = nullptr;
    SharedMemory::WatchHandle mips_watch_handle_ = nullptr;
  };

  // Rules of data access in load shaders:
  // - Source reading (from the shared memory or the scaled resolve buffer):
  //   - Guest data may be stored in a sparsely-allocated buffer, or, in
  //     Direct3D 12 terms, a tiled buffer. This means that some regions of the
  //     buffer may not be mapped. On tiled resources tier 1 hardware, accessing
  //     unmapped tiles results in undefined behavior, including a GPU page
  //     fault and device removal. So, shaders must not try to access
  //     potentially unmapped regions (that are outside the texture memory
  //     extents calculated on the CPU, taking into account that Xenia can't
  //     overestimate texture sizes freely since it must not try to upload
  //     unallocated pages on the CPU).
  //   - Buffer tiles have 64 KB size on Direct3D 12. Vulkan has its own
  //     alignment requirements for sparse binding. But overall, we're
  //     allocating pretty large regions.
  //   - Resolution scaling disabled:
  //     - Shared memory allocates regions of power of two sizes that map
  //       directly to the same portions of the 512 MB of the console's
  //       physical memory. So, a 64 KB-aligned host buffer region is also 64
  //       KB-aligned in the guest address space.
  //     - Tiled textures: 32x32x4-block tiles are always resident each as a
  //       whole. If the width is bigger than the pitch, the overflowing 32x32x4
  //       tiles are also loaded as entire tiles. We do not have separate
  //       shaders for 2D and 3D. So, for tiled textures, it's safe to consider
  //       that if any location within a 32x32-aligned portion is within the
  //       texture bounds, the entire 32x32 portion also can be read.
  //     - Linear textures: Pitch is aligned to 256 bytes. Row count, however,
  //       is not aligned to anything (unless the mip tail is being loaded). The
  //       overflowing last row in case `width > pitch`, however, is made
  //       resident up to the last texel in it. But row start alignment is 256,
  //       which is a power of two, and is smaller than the Direct3D 12 tile
  //       size of 64 KB. So, if any block within a 256-aligned region is within
  //       the texture bounds, without resolution scaling, reading from any
  //       location in that 256-aligned region is safe.
  //     - Since we use the same shaders for tiled and linear textures (as well
  //       as 1D textures), this means that without resolution scaling, it's
  //       safe to access a min(256 bytes, 32 blocks)-aligned portion along X,
  //       but only within the same row of blocks, with bounds checking only for
  //       such portion as a whole, but without additional bounds checking
  //       inside of it.
  //     - Therefore, it's recommended that shaders read power-of-two amounts of
  //       blocks (so there will naturally be some alignment to some power of
  //       two), and this way, each thread may read at most 16 16bpb blocks or
  //       at most 32 8bpb or smaller blocks with in a single `if (x < width)`
  //       for the whole aligned range of the same length.
  //   - Resolution scaling enabled:
  //     - For simplicity, unlike in the shared memory, buffer tile boundaries
  //       are not aligned to powers of 2 the same way as guest addresses are.
  //       While for 2x2 resolution scaling it still happens to be the case
  //       because `host scaling unit address = guest scaling unit address << 2`
  //       (similarly for 2x1 and 1x2), for 3x or x3, it's not - a 64 KB host
  //       tile would represent 7281.777 guest bytes with 3x3 (disregarding that
  //       sequences of texels that are adjacent in memory alongside the
  //       horizontal axis, not individual bytes, are scaled, but even in that
  //       case it's not scaling by 2^n still).
  //     - The above would affect the `width > pitch` case for linear textures,
  //       requiring overestimating the width in calculation of the range of the
  //       tiles to map, while not doing this overestimation on the guest memory
  //       extent calculation side (otherwise it may result in attempting to
  //       upload unallocated memory on the CPU). For example, let's take look
  //       at an extreme case of a 369x28 k_8 texture with a pitch of 256 bytes.
  //       The last row, in guest memory, would be loaded from the [7168, 7281)
  //       range, or, with 3x3 resolution scaling, from bytes [64512, 65529).
  //       However, if we try to unconditionally load 2 pixels, like the texture
  //       is 370x28, we will be accessing the bytes [64512, 65538). But bytes
  //       65536 and 65537 will be in another 64 KB tile, which may be not
  //       mapped yet. However, none of this is an issue for one simple reason -
  //       resolving is only possible to tiled textures, so linear textures will
  //       never be resolution-scaled.
  //     - Tiled textures have potentially referenced guest 32x32-block tiles
  //       loaded in their entirety. So, just like for unscaled textures, if any
  //       block within a tile is available, the entire tile is as well.
  // - Destination writing (to the linear buffer):
  //   - host_x_blocks_per_thread specifies how many pixels can be written
  //     without bounds checking within increments of that amount - the pitch of
  //     the destination buffer is manually overaligned if needed.

  // In textures, resolution scaling is done for 8-byte portions of memory for
  // 8bpp textures, and for 16-byte portions for textures of higher bit depths
  // (these are the sizes of regions where contiguous texels in memory are also
  // contiguous in the texture along the horizontal axis, so 64-bit and 128-bit
  // loads / stores, for 8bpp and 16bpp+ respectively, can be used for untiling
  // regardless of the resolution scale).

  // Constant buffer layout passed to the load (untiling / conversion)
  // shaders. Field grouping follows std140 16-byte vector boundaries.
  struct LoadConstants {
    uint32_t is_tiled_3d_endian_scale;
    // Base offset in bytes, resolution-scaled.
    uint32_t guest_offset;
    // For tiled textures - row pitch in blocks, aligned to 32, unscaled.
    // For linear textures - row pitch in bytes.
    uint32_t guest_pitch_aligned;
    // For 3D textures only (ignored otherwise) - aligned to 32, unscaled.
    uint32_t guest_z_stride_block_rows_aligned;

    // - std140 vector boundary -

    // If this is a packed mip tail, this is aligned to tile dimensions.
    // Resolution-scaled.
    uint32_t size_blocks[3];
    // Base offset in bytes.
    uint32_t host_offset;

    // - std140 vector boundary -

    uint32_t host_pitch;
    uint32_t height_texels;
  };

  // Load shader thread group dimensions (log2): 4 threads along X, 32 block
  // rows along Y per group.
  static constexpr uint32_t kLoadGuestXThreadsPerGroupLog2 = 2;
  static constexpr uint32_t kLoadGuestYBlocksPerGroupLog2 = 5;

  // Indices into load_shader_info_ - one entry per untiling / format
  // conversion compute shader variant.
  enum LoadShaderIndex {
    kLoadShaderIndex8bpb,
    kLoadShaderIndex16bpb,
    kLoadShaderIndex32bpb,
    kLoadShaderIndex64bpb,
    kLoadShaderIndex128bpb,
    kLoadShaderIndexR5G5B5A1ToB5G5R5A1,
    kLoadShaderIndexR5G6B5ToB5G6R5,
    kLoadShaderIndexR5G5B6ToB5G6R5WithRBGASwizzle,
    kLoadShaderIndexRGBA4ToBGRA4,
    kLoadShaderIndexRGBA4ToARGB4,
    kLoadShaderIndexGBGR8ToGRGB8,
    kLoadShaderIndexGBGR8ToRGB8,
    kLoadShaderIndexBGRG8ToRGBG8,
    kLoadShaderIndexBGRG8ToRGB8,
    kLoadShaderIndexR10G11B11ToRGBA16,
    kLoadShaderIndexR10G11B11ToRGBA16SNorm,
    kLoadShaderIndexR11G11B10ToRGBA16,
    kLoadShaderIndexR11G11B10ToRGBA16SNorm,
    kLoadShaderIndexR16UNormToFloat,
    kLoadShaderIndexR16SNormToFloat,
    kLoadShaderIndexRG16UNormToFloat,
    kLoadShaderIndexRG16SNormToFloat,
    kLoadShaderIndexRGBA16UNormToFloat,
    kLoadShaderIndexRGBA16SNormToFloat,
    kLoadShaderIndexDXT1ToRGBA8,
    kLoadShaderIndexDXT3ToRGBA8,
    kLoadShaderIndexDXT5ToRGBA8,
    kLoadShaderIndexDXNToRG8,
    kLoadShaderIndexDXT3A,
    kLoadShaderIndexDXT3AAs1111ToBGRA4,
    kLoadShaderIndexDXT3AAs1111ToARGB4,
    kLoadShaderIndexDXT5AToR8,
    kLoadShaderIndexCTX1,
    kLoadShaderIndexDepthUnorm,
    kLoadShaderIndexDepthFloat,

    kLoadShaderCount,
    kLoadShaderIndexUnknown = kLoadShaderCount,
  };

  struct LoadShaderInfo {
    // Log2 of the sizes, in bytes, of the elements in the source (guest) and
    // the destination (host) buffer bindings accessed by the copying shader,
    // since the shader may copy multiple blocks per one invocation.
    uint32_t source_bpe_log2;
    uint32_t dest_bpe_log2;
    // Number of bytes in a host resolution-scaled block (corresponding to a
    // guest block if not decompressing, or a host texel if decompressing)
    // written by the shader.
    uint32_t bytes_per_host_block;
    // Log2 of the number of guest resolution-scaled blocks along the X axis
    // loaded by a single thread shader group.
    uint32_t guest_x_blocks_per_thread_log2;

    uint32_t GetGuestXBlocksPerGroupLog2() const {
      return kLoadGuestXThreadsPerGroupLog2 + guest_x_blocks_per_thread_log2;
    }
  };

  // kUnsigned replicated into all four 2-bit component sign slots.
  static constexpr uint8_t kSwizzledSignsUnsigned =
      uint8_t(xenos::TextureSign::kUnsigned) * uint8_t(0b01010101);

  struct TextureBinding {
    TextureKey key;
    // Destination swizzle merged with guest to host format swizzle.
    uint32_t host_swizzle;
    // Packed TextureSign values, 2 bit per each component, with guest-side
    // destination swizzle from the fetch constant applied to them.
    uint8_t swizzled_signs;
    // Unsigned version of the texture (or signed if they have the same data).
    Texture* texture;
    // Signed version of the texture if the data in the signed version is
    // different on the host.
    Texture* texture_signed;

    TextureBinding() { Reset(); }

    void Reset() {
      // Zero all fields (including padding, and making key invalid), then
      // restore the non-zero defaults.
      std::memset(this, 0, sizeof(*this));
      host_swizzle = xenos::XE_GPU_TEXTURE_SWIZZLE_0000;
      swizzled_signs = kSwizzledSignsUnsigned;
    }
  };

  explicit TextureCache(const RegisterFile& register_file,
                        SharedMemory& shared_memory,
                        uint32_t draw_resolution_scale_x,
                        uint32_t draw_resolution_scale_y);

  const RegisterFile& register_file() const { return register_file_; }
  SharedMemory& shared_memory() const { return shared_memory_; }

  // May be called for purposes like clearing the cache, as well as in the
  // destructor of the implementation if textures, for instance, have references
  // to the implementation that are used in their destructor, and will become
  // invalid if the implementation is destroyed before the texture.
  void DestroyAllTextures(bool from_destructor = false);

  // Whether the signed version of the texture has a different representation on
  // the host than its unsigned version (for example, if it's a fixed-point
  // texture emulated with a larger host pixel format).
  virtual bool IsSignedVersionSeparateForFormat(TextureKey key) const {
    return false;
  }
  // Parameters like whether the texture is tiled and its dimensions are checked
  // externally, the implementation should take only format-related parameters
  // such as the format itself and the signedness into account.
  virtual bool IsScaledResolveSupportedForFormat(TextureKey key) const {
    return false;
  }
  // For formats with less than 4 components, implementations normally should
  // replicate the last component into the non-existent ones, similar to what is
  // done for unused components of operands in shaders by Microsoft's Xbox 360
  // shader compiler (.xxxx, .xyyy, .xyzz, .xyzw).
  // For DXT3A and DXT5A, RRRR swizzle is specified in:
  // http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf
  // 4D5307E6 also expects replicated components in k_8 sprites.
  // DXN is read as RG in 4D5307E6, but as RA in 415607E6.
  // TODO(Triang3l): Find out the correct contents of unused texture components.
  virtual uint32_t GetHostFormatSwizzle(TextureKey key) const = 0;

  virtual uint32_t GetMaxHostTextureWidthHeight(
      xenos::DataDimension dimension) const = 0;
  virtual uint32_t GetMaxHostTextureDepthOrArraySize(
      xenos::DataDimension dimension) const = 0;

  // The texture must be created exactly with this key (if the implementation
  // supports the texture with this key, otherwise, or in case of a runtime
  // failure, it should return nullptr), modifying it is not allowed.
  virtual std::unique_ptr<Texture> CreateTexture(TextureKey key) = 0;

  // Returns nullptr not only if the key is not supported, but also if couldn't
  // create the texture - if it's nullptr, occasionally a recreation attempt
  // should be made.
  Texture* FindOrCreateTexture(TextureKey key);

  static const LoadShaderInfo& GetLoadShaderInfo(
      LoadShaderIndex load_shader_index) {
    assert_true(load_shader_index < kLoadShaderCount);
    return load_shader_info_[load_shader_index];
  }
  bool LoadTextureData(Texture& texture);
  void LoadTexturesData(Texture** textures, uint32_t n_textures);
  // Writes the texture data (for base, mips or both - but not neither) from the
  // shared memory or the scaled resolve memory. The shared memory management is
  // done outside this function, the implementation just needs to load the data
  // into the texture object.
  virtual bool LoadTextureDataFromResidentMemoryImpl(Texture& texture,
                                                     bool load_base,
                                                     bool load_mips) = 0;

  // Converts a texture fetch constant to a texture key, normalizing and
  // validating the values, or creating an invalid key, and also gets the
  // post-guest-swizzle signedness.
  static void BindingInfoFromFetchConstant(
      const xenos::xe_gpu_texture_fetch_t& fetch, TextureKey& key_out,
      uint8_t* swizzled_signs_out);

  // Makes all texture bindings invalid. Also requesting textures after calling
  // this will cause another attempt to create a texture or to untile it if
  // there was an error.
  void ResetTextureBindings(bool from_destructor = false);

  // Returns the binding for the fetch constant, or nullptr if its key is
  // invalid (not yet parsed, reset, or not a supported texture).
  const TextureBinding* GetValidTextureBinding(
      uint32_t fetch_constant_index) const {
    const TextureBinding& binding = texture_bindings_[fetch_constant_index];
    return binding.key.is_valid ? &binding : nullptr;
  }
  // Called when something in a texture binding is changed for the
  // implementation to update the internal dependencies of the binding.
  virtual void UpdateTextureBindingsImpl(uint32_t fetch_constant_mask) {}

 private:
  void UpdateTexturesTotalHostMemoryUsage(uint64_t add, uint64_t subtract);

  // Shared memory callback for texture data invalidation.
  static void WatchCallback(const global_unique_lock_type& global_lock,
                            void* context, void* data, uint64_t argument,
                            bool invalidated_by_gpu);

  // Checks if there are any pages that contain scaled resolve data within the
  // range.
  bool IsRangeScaledResolved(uint32_t start_unscaled, uint32_t length_unscaled);
  // Global shared memory invalidation callback for invalidating scaled resolved
  // texture data.
  static void ScaledResolveGlobalWatchCallbackThunk(
      const global_unique_lock_type& global_lock, void* context,
      uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
  void ScaledResolveGlobalWatchCallback(
      const global_unique_lock_type& global_lock, uint32_t address_first,
      uint32_t address_last, bool invalidated_by_gpu);

  const RegisterFile& register_file_;
  SharedMemory& shared_memory_;
  uint32_t draw_resolution_scale_x_;
  uint32_t draw_resolution_scale_y_;
  divisors::MagicDiv draw_resolution_scale_x_divisor_;
  divisors::MagicDiv draw_resolution_scale_y_divisor_;
  // Per-shader metadata, indexed by LoadShaderIndex.
  static const LoadShaderInfo load_shader_info_[kLoadShaderCount];

  xe::global_critical_region global_critical_region_;
  // Bit vector storing whether each 4 KB physical memory page contains scaled
  // resolve data. uint32_t rather than uint64_t because parts of it can be sent
  // to shaders.
  std::unique_ptr<uint32_t[]> scaled_resolve_pages_;
  // Second level of the bit vector for faster rejection of non-scaled textures.
  // >> 12 for 4 KB pages, >> 5 for uint32_t level 1 bits, >> 6 for uint64_t
  // level 2 bits.
  uint64_t scaled_resolve_pages_l2_[SharedMemory::kBufferSize >> (12 + 5 + 6)];

  // Global watch for scaled resolve data invalidation.
  SharedMemory::GlobalWatchHandle scaled_resolve_global_watch_handle_ = nullptr;

  uint64_t current_submission_index_ = 0;
  uint64_t current_submission_time_ = 0;

  std::unordered_map<TextureKey, std::unique_ptr<Texture>, TextureKey::Hasher>
      textures_;

  uint64_t textures_total_host_memory_usage_ = 0;

  // Head and tail of the LRU usage list of textures (see Texture::MarkAsUsed).
  Texture* texture_used_first_ = nullptr;
  Texture* texture_used_last_ = nullptr;

  // Whether a texture has become outdated (a memory watch has been triggered),
  // so need to recheck if textures aren't outdated, disregarding whether fetch
  // constants have been changed.
  std::atomic<bool> texture_became_outdated_{false};

  std::array<TextureBinding, xenos::kTextureFetchConstantCount>
      texture_bindings_;
  // Bit vector with bits reset on fetch constant writes to avoid parsing fetch
  // constants again and again.
  uint32_t texture_bindings_in_sync_ = 0;
};
|
|
|
|
} // namespace gpu
|
|
} // namespace xe
|
|
|
|
#endif // XENIA_GPU_TEXTURE_CACHE_H_
|