// xenia-canary/src/xenia/gpu/texture_cache.cc

/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2022 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#include "xenia/gpu/texture_cache.h"
#include "xenia/base/clock.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/profiling.h"
#include "xenia/gpu/gpu_flags.h"
DEFINE_int32(
draw_resolution_scale_x, 1,
"Integer pixel width scale used for scaling the rendering resolution "
"opaquely to the game.\n"
"1, 2 and 3 may be supported, but support of anything above 1 depends on "
"the device properties, such as whether it supports sparse binding / tiled "
"resources, the number of virtual address bits per resource, and other "
"factors.\n"
"Various effects and parts of game rendering pipelines may work "
"incorrectly as pixels become ambiguous from the game's perspective and "
"because half-pixel offset (which normally doesn't affect coverage when "
"MSAA isn't used) becomes full-pixel.",
"GPU");
DEFINE_int32(
draw_resolution_scale_y, 1,
"Integer pixel width scale used for scaling the rendering resolution "
"opaquely to the game.\n"
"See draw_resolution_scale_x for more information.",
"GPU");
DEFINE_uint32(
texture_cache_memory_limit_soft, 384,
"Maximum host texture memory usage (in megabytes) above which old textures "
"will be destroyed.",
"GPU");
DEFINE_uint32(
texture_cache_memory_limit_soft_lifetime, 30,
"Seconds a texture should be unused to be considered old enough to be "
"deleted if texture memory usage exceeds texture_cache_memory_limit_soft.",
"GPU");
DEFINE_uint32(
texture_cache_memory_limit_hard, 768,
"Maximum host texture memory usage (in megabytes) above which textures "
"will be destroyed as soon as possible.",
"GPU");
DEFINE_uint32(
texture_cache_memory_limit_render_to_texture, 24,
"Part of the host texture memory budget (in megabytes) that will be scaled "
"by the current drawing resolution scale.\n"
"If texture_cache_memory_limit_soft, for instance, is 384, and this is 24, "
"it will be assumed that the game will be using roughly 24 MB of "
"render-to-texture (resolve) targets and 384 - 24 = 360 MB of regular "
"textures - so with 2x2 resolution scaling, the soft limit will be 360 + "
"96 MB, and with 3x3, it will be 360 + 216 MB.",
"GPU");
namespace xe {
namespace gpu {
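// Row order must match the LoadShader enumeration in texture_cache.h - each
// row (commented with its enum name) is the LoadShaderInfo for the
// correspondingly named load shader.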
const TextureCache::LoadShaderInfo
TextureCache::load_shader_info_[kLoadShaderCount] = {
// k8bpb
{3, 4, 1, 4},
// k16bpb
{4, 4, 2, 4},
// k32bpb
{4, 4, 4, 3},
// k64bpb
{4, 4, 8, 2},
// k128bpb
{4, 4, 16, 1},
// kR5G5B5A1ToB5G5R5A1
{4, 4, 2, 4},
// kR5G6B5ToB5G6R5
{4, 4, 2, 4},
// kR5G5B6ToB5G6R5WithRBGASwizzle
{4, 4, 2, 4},
// kRGBA4ToBGRA4
{4, 4, 2, 4},
// kRGBA4ToARGB4
{4, 4, 2, 4},
// kGBGR8ToGRGB8
{4, 4, 4, 3},
// kGBGR8ToRGB8
{4, 4, 8, 3},
// kBGRG8ToRGBG8
{4, 4, 4, 3},
// kBGRG8ToRGB8
{4, 4, 8, 3},
// kR10G11B11ToRGBA16
{4, 4, 8, 3},
// kR10G11B11ToRGBA16SNorm
{4, 4, 8, 3},
// kR11G11B10ToRGBA16
{4, 4, 8, 3},
// kR11G11B10ToRGBA16SNorm
{4, 4, 8, 3},
// kR16UNormToFloat
{4, 4, 2, 4},
// kR16SNormToFloat
{4, 4, 2, 4},
// kRG16UNormToFloat
{4, 4, 4, 3},
// kRG16SNormToFloat
{4, 4, 4, 3},
// kRGBA16UNormToFloat
{4, 4, 8, 2},
// kRGBA16SNormToFloat
{4, 4, 8, 2},
// kDXT1ToRGBA8
{4, 4, 4, 2},
// kDXT3ToRGBA8
{4, 4, 4, 1},
// kDXT5ToRGBA8
{4, 4, 4, 1},
// kDXNToRG8
{4, 4, 2, 1},
// kDXT3A
{4, 4, 1, 2},
// kDXT3AAs1111ToBGRA4
{4, 4, 2, 2},
// kDXT3AAs1111ToARGB4
{4, 4, 2, 2},
// kDXT5AToR8
{4, 4, 1, 2},
// kCTX1
{4, 4, 2, 2},
// kDepthUnorm
{4, 4, 4, 3},
// kDepthFloat
{4, 4, 4, 3},
};
TextureCache::TextureCache(const RegisterFile& register_file,
SharedMemory& shared_memory,
uint32_t draw_resolution_scale_x,
uint32_t draw_resolution_scale_y)
: register_file_(register_file),
shared_memory_(shared_memory),
draw_resolution_scale_x_(draw_resolution_scale_x),
draw_resolution_scale_y_(draw_resolution_scale_y),
draw_resolution_scale_x_divisor_(draw_resolution_scale_x),
draw_resolution_scale_y_divisor_(draw_resolution_scale_y) {
assert_true(draw_resolution_scale_x >= 1);
assert_true(draw_resolution_scale_x <= kMaxDrawResolutionScaleAlongAxis);
assert_true(draw_resolution_scale_y >= 1);
assert_true(draw_resolution_scale_y <= kMaxDrawResolutionScaleAlongAxis);
if (draw_resolution_scale_x > 1 || draw_resolution_scale_y > 1) {
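// Two-level bitmap of the shared memory pages that currently contain scaled
// resolve data: one bit per 4 KiB page (32 pages per uint32_t) in
// scaled_resolve_pages_, plus, in scaled_resolve_pages_l2_, one uint64_t bit
// per uint32_t of the first level for faster scanning.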
constexpr uint32_t kScaledResolvePageDwordCount =
SharedMemory::kBufferSize / 4096 / 32;
scaled_resolve_pages_ =
std::unique_ptr<uint32_t[]>(new uint32_t[kScaledResolvePageDwordCount]);
std::memset(scaled_resolve_pages_.get(), 0,
kScaledResolvePageDwordCount * sizeof(uint32_t));
std::memset(scaled_resolve_pages_l2_, 0, sizeof(scaled_resolve_pages_l2_));
scaled_resolve_global_watch_handle_ = shared_memory.RegisterGlobalWatch(
ScaledResolveGlobalWatchCallbackThunk, this);
}
}
TextureCache::~TextureCache() {
DestroyAllTextures(true);
if (scaled_resolve_global_watch_handle_) {
shared_memory().UnregisterGlobalWatch(scaled_resolve_global_watch_handle_);
}
}
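// Returns true if the scale requested via the cvars could be used as-is, or
// false if it had to be sanitized (clamped to the supported range).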
bool TextureCache::GetConfigDrawResolutionScale(uint32_t& x_out,
uint32_t& y_out) {
uint32_t config_x =
uint32_t(std::max(INT32_C(1), cvars::draw_resolution_scale_x));
uint32_t config_y =
uint32_t(std::max(INT32_C(1), cvars::draw_resolution_scale_y));
uint32_t clamped_x = std::min(kMaxDrawResolutionScaleAlongAxis, config_x);
uint32_t clamped_y = std::min(kMaxDrawResolutionScaleAlongAxis, config_y);
x_out = clamped_x;
y_out = clamped_y;
return clamped_x == config_x && clamped_y == config_y;
}
void TextureCache::ClearCache() { DestroyAllTextures(); }
void TextureCache::CompletedSubmissionUpdated(
uint64_t completed_submission_index) {
// If memory usage is too high, destroy unused textures.
uint64_t current_time = xe::Clock::QueryHostUptimeMillis();
// texture_cache_memory_limit_render_to_texture is assumed to be included in
// texture_cache_memory_limit_soft and texture_cache_memory_limit_hard, at 1x,
// so subtracting 1 from the scale.
uint32_t limit_scaled_resolve_add_mb =
cvars::texture_cache_memory_limit_render_to_texture *
(draw_resolution_scale_x() * draw_resolution_scale_y() - 1);
uint32_t limit_soft_mb =
cvars::texture_cache_memory_limit_soft + limit_scaled_resolve_add_mb;
uint32_t limit_hard_mb =
cvars::texture_cache_memory_limit_hard + limit_scaled_resolve_add_mb;
uint32_t limit_soft_lifetime =
cvars::texture_cache_memory_limit_soft_lifetime * 1000;
bool destroyed_any = false;
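// Walk the usage list from its least recently used end, stopping at the
// first texture that may still be in use on the GPU or is too new to evict.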
while (texture_used_first_ != nullptr) {
uint64_t total_host_memory_usage_mb =
(textures_total_host_memory_usage_ + ((UINT32_C(1) << 20) - 1)) >> 20;
bool limit_hard_exceeded = total_host_memory_usage_mb > limit_hard_mb;
if (total_host_memory_usage_mb <= limit_soft_mb && !limit_hard_exceeded) {
break;
}
Texture* texture = texture_used_first_;
if (texture->last_usage_submission_index() > completed_submission_index) {
break;
}
if (!limit_hard_exceeded &&
(texture->last_usage_time() + limit_soft_lifetime) > current_time) {
break;
}
if (!destroyed_any) {
destroyed_any = true;
// The texture being destroyed might have been bound in the previous
// submissions, and nothing has overwritten the binding yet, so completion
// of the submission where the texture was last actually used on the GPU
// doesn't imply that it's not bound currently. Reset bindings if
// any texture has been destroyed.
ResetTextureBindings();
}
// Remove the texture from the map and destroy it via its unique_ptr.
auto found_texture_it = textures_.find(texture->key());
assert_true(found_texture_it != textures_.end());
if (found_texture_it != textures_.end()) {
assert_true(found_texture_it->second.get() == texture);
textures_.erase(found_texture_it);
// `texture` is invalid now.
}
}
if (destroyed_any) {
COUNT_profile_set("gpu/texture_cache/textures", textures_.size());
}
}
void TextureCache::BeginSubmission(uint64_t new_submission_index) {
assert_true(new_submission_index > current_submission_index_);
current_submission_index_ = new_submission_index;
current_submission_time_ = xe::Clock::QueryHostUptimeMillis();
}
void TextureCache::BeginFrame() {
// In case there was a failure to create something in the previous frame, make
// sure bindings are reset so a new attempt will surely be made if the texture
// is requested again.
ResetTextureBindings();
}
void TextureCache::MarkRangeAsResolved(uint32_t start_unscaled,
uint32_t length_unscaled) {
if (length_unscaled == 0) {
return;
}
start_unscaled &= 0x1FFFFFFF;
length_unscaled = std::min(length_unscaled, 0x20000000 - start_unscaled);
if (IsDrawResolutionScaled()) {
uint32_t page_first = start_unscaled >> 12;
uint32_t page_last = (start_unscaled + length_unscaled - 1) >> 12;
uint32_t block_first = page_first >> 5;
uint32_t block_last = page_last >> 5;
auto global_lock = global_critical_region_.Acquire();
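// For the first and the last 32-page block, mask off the bits below
// page_first and above page_last so only pages inside the range are marked.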
for (uint32_t i = block_first; i <= block_last; ++i) {
uint32_t add_bits = UINT32_MAX;
if (i == block_first) {
add_bits &= ~((UINT32_C(1) << (page_first & 31)) - 1);
}
if (i == block_last && (page_last & 31) != 31) {
add_bits &= (UINT32_C(1) << ((page_last & 31) + 1)) - 1;
}
scaled_resolve_pages_[i] |= add_bits;
scaled_resolve_pages_l2_[i >> 6] |= UINT64_C(1) << (i & 63);
}
}
// Invalidate textures. Toggling individual textures between scaled and
// unscaled also relies on invalidation through shared memory.
shared_memory().RangeWrittenByGpu(start_unscaled, length_unscaled, true);
}
uint32_t TextureCache::GuestToHostSwizzle(uint32_t guest_swizzle,
uint32_t host_format_swizzle) {
uint32_t host_swizzle = 0;
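// Both swizzles are packed as four 3-bit component selectors, starting from
// the lowest bits; guest values 0-3 select a source component, while 4 and 5
// are the constant 0 and 1 selectors.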
for (uint32_t i = 0; i < 4; ++i) {
uint32_t guest_swizzle_component = (guest_swizzle >> (3 * i)) & 0b111;
uint32_t host_swizzle_component;
if (guest_swizzle_component >= xenos::XE_GPU_TEXTURE_SWIZZLE_0) {
// Get rid of 6 and 7 values (to prevent host GPU errors if the game has
// something broken) the simple way - by changing them to 4 (0) and 5 (1).
host_swizzle_component = guest_swizzle_component & 0b101;
} else {
host_swizzle_component =
(host_format_swizzle >> (3 * guest_swizzle_component)) & 0b111;
}
host_swizzle |= host_swizzle_component << (3 * i);
}
return host_swizzle;
}
void TextureCache::RequestTextures(uint32_t used_texture_mask) {
const auto& regs = register_file();
if (texture_became_outdated_.exchange(false, std::memory_order_acquire)) {
// A texture has become outdated - make sure whether textures are outdated
// is rechecked in this draw and in subsequent ones to reload the new data
// if needed.
ResetTextureBindings();
}
// Update the texture keys and the textures.
uint32_t bindings_changed = 0;
uint32_t textures_remaining = used_texture_mask & ~texture_bindings_in_sync_;
uint32_t index = 0;
Texture* textures_to_load[64];  // at most 32 fetch constants, each possibly
                                // needing both an unsigned and a signed
                                // texture, so at most 64 loads
uint32_t num_textures_to_load = 0;
while (xe::bit_scan_forward(textures_remaining, &index)) {
uint32_t index_bit = UINT32_C(1) << index;
textures_remaining = xe::clear_lowest_bit(textures_remaining);
TextureBinding& binding = texture_bindings_[index];
xenos::xe_gpu_texture_fetch_t fetch = regs.GetTextureFetch(index);
TextureKey old_key = binding.key;
uint8_t old_swizzled_signs = binding.swizzled_signs;
BindingInfoFromFetchConstant(fetch, binding.key, &binding.swizzled_signs);
texture_bindings_in_sync_ |= index_bit;
if (!binding.key.is_valid) {
if (old_key.is_valid) {
bindings_changed |= index_bit;
}
binding.Reset();
continue;
}
uint32_t old_host_swizzle = binding.host_swizzle;
binding.host_swizzle =
GuestToHostSwizzle(fetch.swizzle, GetHostFormatSwizzle(binding.key));
// Check whether the unsigned and the signed versions of the texture need to
// be loaded (if the format is emulated with different host bit
// representations for signed and unsigned - otherwise only the unsigned one
// is loaded).
bool key_changed = binding.key != old_key;
bool any_sign_was_not_signed =
texture_util::IsAnySignNotSigned(old_swizzled_signs);
bool any_sign_was_signed =
texture_util::IsAnySignSigned(old_swizzled_signs);
bool any_sign_is_not_signed =
texture_util::IsAnySignNotSigned(binding.swizzled_signs);
bool any_sign_is_signed =
texture_util::IsAnySignSigned(binding.swizzled_signs);
if (key_changed || binding.host_swizzle != old_host_swizzle ||
any_sign_is_not_signed != any_sign_was_not_signed ||
any_sign_is_signed != any_sign_was_signed) {
bindings_changed |= index_bit;
}
bool load_unsigned_data = false, load_signed_data = false;
if (IsSignedVersionSeparateForFormat(binding.key)) {
// Can reuse previously loaded unsigned/signed versions if the key is the
// same and the texture was previously bound as unsigned/signed
// respectively (checking the previous values of signedness rather than
// binding.texture != nullptr and binding.texture_signed != nullptr also
// prevents repeated attempts to load the texture if it has failed to
// load).
if (any_sign_is_not_signed) {
if (key_changed || !any_sign_was_not_signed) {
binding.texture = FindOrCreateTexture(binding.key);
load_unsigned_data = true;
}
} else {
binding.texture = nullptr;
}
if (any_sign_is_signed) {
if (key_changed || !any_sign_was_signed) {
TextureKey signed_key = binding.key;
signed_key.signed_separate = 1;
binding.texture_signed = FindOrCreateTexture(signed_key);
load_signed_data = true;
}
} else {
binding.texture_signed = nullptr;
}
} else {
// Same resource for both unsigned and signed, but descriptor formats may
// be different.
if (key_changed) {
binding.texture = FindOrCreateTexture(binding.key);
load_unsigned_data = true;
}
binding.texture_signed = nullptr;
}
if (load_unsigned_data && binding.texture != nullptr) {
textures_to_load[num_textures_to_load++] = binding.texture;
}
if (load_signed_data && binding.texture_signed != nullptr) {
textures_to_load[num_textures_to_load++] = binding.texture_signed;
}
}
LoadTexturesData(textures_to_load, num_textures_to_load);
if (bindings_changed) {
UpdateTextureBindingsImpl(bindings_changed);
}
}
const char* TextureCache::TextureKey::GetLogDimensionName(
xenos::DataDimension dimension) {
switch (dimension) {
case xenos::DataDimension::k1D:
return "1D";
case xenos::DataDimension::k2DOrStacked:
return "2D";
case xenos::DataDimension::k3D:
return "3D";
case xenos::DataDimension::kCube:
return "cube";
default:
assert_unhandled_case(dimension);
return "unknown";
}
}
void TextureCache::TextureKey::LogAction(const char* action) const {
XELOGGPU(
"{} {} {}{}x{}x{} {} {} texture with {} {}packed mip level{}, "
"base at 0x{:08X} (pitch {}), mips at 0x{:08X}",
action, tiled ? "tiled" : "linear", scaled_resolve ? "scaled " : "",
GetWidth(), GetHeight(), GetDepthOrArraySize(), GetLogDimensionName(),
FormatInfo::GetName(format), mip_max_level + 1, packed_mips ? "" : "un",
mip_max_level != 0 ? "s" : "", base_page << 12, pitch << 5,
mip_page << 12);
}
void TextureCache::Texture::LogAction(const char* action) const {
XELOGGPU(
"{} {} {}{}x{}x{} {} {} texture with {} {}packed mip level{}, "
"base at 0x{:08X} (pitch {}, size 0x{:08X}), mips at 0x{:08X} (size "
"0x{:08X})",
action, key_.tiled ? "tiled" : "linear",
key_.scaled_resolve ? "scaled " : "", key_.GetWidth(), key_.GetHeight(),
key_.GetDepthOrArraySize(), key_.GetLogDimensionName(),
FormatInfo::GetName(key_.format), key_.mip_max_level + 1,
key_.packed_mips ? "" : "un", key_.mip_max_level != 0 ? "s" : "",
key_.base_page << 12, key_.pitch << 5, GetGuestBaseSize(),
key_.mip_page << 12, GetGuestMipsSize());
}
// The texture must be placed in the recent-usage list immediately, at the
// most recently used end, because after creation it will likely be used
// right away, and it should not be destroyed by old-texture eviction before
// that happens. The list is maintained by the Texture itself, not by the
// TextureCache (unlike the `textures_` container).
TextureCache::Texture::Texture(TextureCache& texture_cache,
const TextureKey& key)
: texture_cache_(texture_cache),
key_(key),
guest_layout_(key.GetGuestLayout()),
base_resolved_(key.scaled_resolve),
mips_resolved_(key.scaled_resolve),
last_usage_submission_index_(texture_cache.current_submission_index_),
last_usage_time_(texture_cache.current_submission_time_),
used_previous_(texture_cache.texture_used_last_),
used_next_(nullptr) {
if (texture_cache.texture_used_last_) {
texture_cache.texture_used_last_->used_next_ = this;
} else {
texture_cache.texture_used_first_ = this;
}
texture_cache.texture_used_last_ = this;
// Never try to upload data that doesn't exist.
base_outdated_ = guest_layout().base.level_data_extent_bytes != 0;
mips_outdated_ = guest_layout().mips_total_extent_bytes != 0;
}
TextureCache::Texture::~Texture() {
if (mips_watch_handle_) {
texture_cache().shared_memory().UnwatchMemoryRange(mips_watch_handle_);
}
if (base_watch_handle_) {
texture_cache().shared_memory().UnwatchMemoryRange(base_watch_handle_);
}
if (used_previous_) {
used_previous_->used_next_ = used_next_;
} else {
texture_cache_.texture_used_first_ = used_next_;
}
if (used_next_) {
used_next_->used_previous_ = used_previous_;
} else {
texture_cache_.texture_used_last_ = used_previous_;
}
texture_cache_.UpdateTexturesTotalHostMemoryUsage(0, host_memory_usage_);
}
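// Must be called with the global critical region locked (the lock guard is
// passed in to enforce this); clears the outdated flags and re-registers the
// memory watches that WatchCallback released.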
void TextureCache::Texture::MakeUpToDateAndWatch(
const global_unique_lock_type& global_lock) {
SharedMemory& shared_memory = texture_cache().shared_memory();
if (base_outdated_) {
assert_not_zero(GetGuestBaseSize());
base_outdated_ = false;
base_watch_handle_ = shared_memory.WatchMemoryRange(
key().base_page << 12, GetGuestBaseSize(), TextureCache::WatchCallback,
this, nullptr, 0);
}
if (mips_outdated_) {
assert_not_zero(GetGuestMipsSize());
mips_outdated_ = false;
mips_watch_handle_ = shared_memory.WatchMemoryRange(
key().mip_page << 12, GetGuestMipsSize(), TextureCache::WatchCallback,
this, nullptr, 1);
}
}
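// Moves this texture to the most recently used end of the usage list, doing
// the relinking at most once per submission.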
void TextureCache::Texture::MarkAsUsed() {
assert_true(last_usage_submission_index_ <=
texture_cache_.current_submission_index_);
// This is called very frequently; don't relink the usage list unless it's
// actually needed.
if (last_usage_submission_index_ >=
texture_cache_.current_submission_index_) {
return;
}
last_usage_submission_index_ = texture_cache_.current_submission_index_;
last_usage_time_ = texture_cache_.current_submission_time_;
if (used_next_ == nullptr) {
// Already the most recently used.
return;
}
if (used_previous_ != nullptr) {
used_previous_->used_next_ = used_next_;
} else {
texture_cache_.texture_used_first_ = used_next_;
}
used_next_->used_previous_ = used_previous_;
used_previous_ = texture_cache_.texture_used_last_;
used_next_ = nullptr;
texture_cache_.texture_used_last_->used_next_ = this;
texture_cache_.texture_used_last_ = this;
}
void TextureCache::Texture::WatchCallback(
[[maybe_unused]] const global_unique_lock_type& global_lock, bool is_mip) {
if (is_mip) {
assert_not_zero(GetGuestMipsSize());
mips_outdated_ = true;
mips_watch_handle_ = nullptr;
} else {
assert_not_zero(GetGuestBaseSize());
base_outdated_ = true;
base_watch_handle_ = nullptr;
}
}
void TextureCache::WatchCallback(const global_unique_lock_type& global_lock,
void* context, void* data, uint64_t argument,
bool invalidated_by_gpu) {
Texture& texture = *static_cast<Texture*>(context);
texture.WatchCallback(global_lock, argument != 0);
texture.texture_cache().texture_became_outdated_.store(
true, std::memory_order_release);
}
void TextureCache::DestroyAllTextures(bool from_destructor) {
ResetTextureBindings(from_destructor);
textures_.clear();
COUNT_profile_set("gpu/texture_cache/textures", 0);
}
TextureCache::Texture* TextureCache::FindOrCreateTexture(TextureKey key) {
// Check if the texture is a scaled resolve texture.
if (IsDrawResolutionScaled() && key.tiled &&
IsScaledResolveSupportedForFormat(key)) {
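// If any page of the base level data or of the mip data has been written by
// a scaled resolve, treat the whole texture as scaled.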
texture_util::TextureGuestLayout scaled_resolve_guest_layout =
key.GetGuestLayout();
if ((scaled_resolve_guest_layout.base.level_data_extent_bytes &&
IsRangeScaledResolved(
key.base_page << 12,
scaled_resolve_guest_layout.base.level_data_extent_bytes)) ||
(scaled_resolve_guest_layout.mips_total_extent_bytes &&
IsRangeScaledResolved(
key.mip_page << 12,
scaled_resolve_guest_layout.mips_total_extent_bytes))) {
key.scaled_resolve = 1;
}
}
uint32_t host_width = key.GetWidth();
uint32_t host_height = key.GetHeight();
if (key.scaled_resolve) {
host_width *= draw_resolution_scale_x();
host_height *= draw_resolution_scale_y();
}
// With 3x resolution scaling, a 2D texture may become bigger than the
// Direct3D 11 limit, and with 2x, a 3D one as well.
// TODO(Triang3l): Skip mips on Vulkan in this case - the minimum requirement
// there is 4096, which is below the Xenos maximum texture size of 8192.
uint32_t max_host_width_height = GetMaxHostTextureWidthHeight(key.dimension);
uint32_t max_host_depth_or_array_size =
GetMaxHostTextureDepthOrArraySize(key.dimension);
if (host_width > max_host_width_height ||
host_height > max_host_width_height ||
key.GetDepthOrArraySize() > max_host_depth_or_array_size) {
return nullptr;
}
// Try to find an existing texture.
// TODO(Triang3l): Reuse a texture with mip_page unchanged, but base_page
// previously 0, now not 0, to save memory - common case in streaming.
auto found_texture_it = textures_.find(key);
if (found_texture_it != textures_.end()) {
return found_texture_it->second.get();
}
// Create the texture and add it to the map.
Texture* texture;
{
std::unique_ptr<Texture> new_texture = CreateTexture(key);
if (!new_texture) {
key.LogAction("Failed to create");
return nullptr;
}
assert_true(new_texture->key() == key);
texture =
textures_.emplace(key, std::move(new_texture)).first->second.get();
}
COUNT_profile_set("gpu/texture_cache/textures", textures_.size());
texture->LogAction("Created");
return texture;
}
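// Batch variant of LoadTextureData. The outdated state of up to 64 textures
// (the limit comes from the uint64_t masks below) is checked under a single
// acquisition of the global lock, to reduce how often the lock is released
// and reacquired - frequent acquire/release pairs are more likely to cause
// contention than briefly holding the lock once.
// Usage sketch (the gathering helper here is a hypothetical illustration;
// RequestTextures is the batch caller):
//   Texture* textures[64];
//   uint32_t count = CollectBoundTextures(textures);  // hypothetical
//   LoadTexturesData(textures, count);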
void TextureCache::LoadTexturesData(Texture** textures, uint32_t n_textures) {
assert_true(n_textures <= 64);
  if (n_textures < 2) {
    if (n_textures) {
      LoadTextureData(*textures[0]);
    }
    return;
  }
uint64_t index_base_outdated = 0;
uint64_t index_mips_outdated = 0;
uint32_t nkept = 0;
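  // First pass: under a single acquisition of the global lock, record which
  // textures have outdated base / mip data in per-texture bitmask entries,
  // and null out the entries that are already fully up to date.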
{
auto global_lock = global_critical_region_.Acquire();
for (uint32_t i = 0; i < n_textures; ++i) {
Texture* current = textures[i];
auto base_outdated = current->base_outdated(global_lock);
auto mips_outdated = current->mips_outdated(global_lock);
index_base_outdated |= static_cast<uint64_t>(base_outdated) << i;
index_mips_outdated |= static_cast<uint64_t>(mips_outdated) << i;
if (!base_outdated && !mips_outdated) {
textures[i] = nullptr;
} else {
nkept++;
}
}
}
if (nkept == 0) {
return;
}
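  // Second pass: request and load the data of each remaining texture. Each
  // entry is nulled on entry to the loop body and restored only after a fully
  // successful load, so textures that fail any step are skipped by the final
  // pass.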
for (uint32_t i = 0; i < n_textures; ++i) {
Texture* p_texture = textures[i];
if (!p_texture) {
continue;
}
textures[i] = nullptr;
Texture& texture = *p_texture;
TextureKey texture_key = texture.key();
    // The implementation may load multiple blocks at once via accesses of up
    // to 128 bits (R32G32B32A32_UINT), so align the size to 16 bytes to make
    // sure that if the texture is small (especially if it's linear), the last
    // blocks won't be cut off (hosts may return 0, 0, 0, 0 for the whole
    // R32G32B32A32_UINT access for a non-16-aligned tail even if 1...15 bytes
    // are actually provided for it).
// Request uploading of the texture data to the shared memory.
// This is also necessary when resolution scaling is used - the texture
// cache relies on shared memory for invalidation of both unscaled and
// scaled textures. Plus a texture may be unscaled partially, when only a
// portion of its pages is invalidated, in this case we'll need the texture
// from the shared memory to load the unscaled parts.
// TODO(Triang3l): Load unscaled parts.
bool base_resolved = texture.GetBaseResolved();
if (index_base_outdated & (1ULL << i)) {
if (!shared_memory().RequestRange(
texture_key.base_page << 12,
xe::align(texture.GetGuestBaseSize(), UINT32_C(16)),
texture_key.scaled_resolve ? nullptr : &base_resolved)) {
continue;
}
}
bool mips_resolved = texture.GetMipsResolved();
if (index_mips_outdated & (1ULL << i)) {
if (!shared_memory().RequestRange(
texture_key.mip_page << 12,
xe::align(texture.GetGuestMipsSize(), UINT32_C(16)),
texture_key.scaled_resolve ? nullptr : &mips_resolved)) {
continue;
}
}
if (texture_key.scaled_resolve) {
// Make sure all the scaled resolve memory is resident and accessible from
// the shader, including any possible padding that hasn't yet been touched
// by an actual resolve, but is still included in the texture size, so the
// GPU won't be trying to access unmapped memory.
if (!EnsureScaledResolveMemoryCommitted(texture_key.base_page << 12,
texture.GetGuestBaseSize(), 4)) {
continue;
}
if (!EnsureScaledResolveMemoryCommitted(texture_key.mip_page << 12,
texture.GetGuestMipsSize(), 4)) {
continue;
}
}
// Actually load the texture data.
if (!LoadTextureDataFromResidentMemoryImpl(
texture, (index_base_outdated & (1ULL << i)) != 0,
(index_mips_outdated & (1ULL << i)) != 0)) {
continue;
}
// Update the source of the texture (resolve vs. CPU or memexport) for
// purposes of handling piecewise gamma emulation via sRGB and for
// resolution scale in sampling offsets.
if (!texture_key.scaled_resolve) {
texture.SetBaseResolved(base_resolved);
texture.SetMipsResolved(mips_resolved);
}
    // Restore the entry so the final pass below will run MakeUpToDateAndWatch
    // on this texture.
textures[i] = &texture;
}
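  // Final pass: reacquire the global lock once for all successfully loaded
  // textures instead of locking per texture.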
{
auto crit = global_critical_region_.Acquire();
for (uint32_t i = 0; i < n_textures; ++i) {
auto texture = textures[i];
if (!texture) {
continue;
}
// Mark the ranges as uploaded and watch them. This is needed for scaled
// resolves as well to detect when the CPU wants to reuse the memory for a
// regular texture or a vertex buffer, and thus the scaled resolve version
// is not up to date anymore.
texture->MakeUpToDateAndWatch(crit);
texture->LogAction("Loaded");
}
}
}
bool TextureCache::LoadTextureData(Texture& texture) {
// Check what needs to be uploaded.
bool base_outdated, mips_outdated;
{
auto global_lock = global_critical_region_.Acquire();
base_outdated = texture.base_outdated(global_lock);
mips_outdated = texture.mips_outdated(global_lock);
}
if (!base_outdated && !mips_outdated) {
return true;
}
TextureKey texture_key = texture.key();
  // The implementation may load multiple blocks at once via accesses of up to
  // 128 bits (R32G32B32A32_UINT), so align the size to 16 bytes to make sure
  // that if the texture is small (especially if it's linear), the last blocks
  // won't be cut off (hosts may return 0, 0, 0, 0 for the whole
  // R32G32B32A32_UINT access for a non-16-aligned tail even if 1...15 bytes
  // are actually provided for it).
// Request uploading of the texture data to the shared memory.
// This is also necessary when resolution scaling is used - the texture cache
// relies on shared memory for invalidation of both unscaled and scaled
// textures. Plus a texture may be unscaled partially, when only a portion of
// its pages is invalidated, in this case we'll need the texture from the
// shared memory to load the unscaled parts.
// TODO(Triang3l): Load unscaled parts.
bool base_resolved = texture.GetBaseResolved();
if (base_outdated) {
if (!shared_memory().RequestRange(
texture_key.base_page << 12,
xe::align(texture.GetGuestBaseSize(), UINT32_C(16)),
texture_key.scaled_resolve ? nullptr : &base_resolved)) {
return false;
}
}
bool mips_resolved = texture.GetMipsResolved();
if (mips_outdated) {
if (!shared_memory().RequestRange(
texture_key.mip_page << 12,
xe::align(texture.GetGuestMipsSize(), UINT32_C(16)),
texture_key.scaled_resolve ? nullptr : &mips_resolved)) {
return false;
}
}
if (texture_key.scaled_resolve) {
// Make sure all the scaled resolve memory is resident and accessible from
// the shader, including any possible padding that hasn't yet been touched
// by an actual resolve, but is still included in the texture size, so the
// GPU won't be trying to access unmapped memory.
if (!EnsureScaledResolveMemoryCommitted(texture_key.base_page << 12,
texture.GetGuestBaseSize(), 4)) {
return false;
}
if (!EnsureScaledResolveMemoryCommitted(texture_key.mip_page << 12,
texture.GetGuestMipsSize(), 4)) {
return false;
}
}
// Actually load the texture data.
if (!LoadTextureDataFromResidentMemoryImpl(texture, base_outdated,
mips_outdated)) {
return false;
}
// Update the source of the texture (resolve vs. CPU or memexport) for
// purposes of handling piecewise gamma emulation via sRGB and for resolution
// scale in sampling offsets.
if (!texture_key.scaled_resolve) {
texture.SetBaseResolved(base_resolved);
texture.SetMipsResolved(mips_resolved);
}
// Mark the ranges as uploaded and watch them. This is needed for scaled
// resolves as well to detect when the CPU wants to reuse the memory for a
// regular texture or a vertex buffer, and thus the scaled resolve version is
// not up to date anymore.
texture.MakeUpToDateAndWatch(global_critical_region_.Acquire());
texture.LogAction("Loaded");
return true;
}
void TextureCache::BindingInfoFromFetchConstant(
const xenos::xe_gpu_texture_fetch_t& fetch, TextureKey& key_out,
uint8_t* swizzled_signs_out) {
// Reset the key and the signedness.
key_out.MakeInvalid();
if (swizzled_signs_out != nullptr) {
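    // Broadcast the 2-bit kUnsigned value into all four 2-bit per-component
    // sign fields of the byte.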
*swizzled_signs_out =
uint8_t(xenos::TextureSign::kUnsigned) * uint8_t(0b01010101);
}
switch (fetch.type) {
case xenos::FetchConstantType::kTexture:
break;
case xenos::FetchConstantType::kInvalidTexture:
if (cvars::gpu_allow_invalid_fetch_constants) {
break;
}
XELOGW(
"Texture fetch constant ({:08X} {:08X} {:08X} {:08X} {:08X} {:08X}) "
"has \"invalid\" type! This is incorrect behavior, but you can try "
"bypassing this by launching Xenia with "
"--gpu_allow_invalid_fetch_constants=true.",
fetch.dword_0, fetch.dword_1, fetch.dword_2, fetch.dword_3,
fetch.dword_4, fetch.dword_5);
return;
default:
XELOGW(
"Texture fetch constant ({:08X} {:08X} {:08X} {:08X} {:08X} {:08X}) "
"is completely invalid!",
fetch.dword_0, fetch.dword_1, fetch.dword_2, fetch.dword_3,
fetch.dword_4, fetch.dword_5);
return;
}
uint32_t width_minus_1, height_minus_1, depth_or_array_size_minus_1;
uint32_t base_page, mip_page, mip_max_level;
texture_util::GetSubresourcesFromFetchConstant(
fetch, &width_minus_1, &height_minus_1, &depth_or_array_size_minus_1,
&base_page, &mip_page, nullptr, &mip_max_level);
if (base_page == 0 && mip_page == 0) {
// No texture data at all.
return;
}
if (fetch.dimension == xenos::DataDimension::k1D) {
bool is_invalid_1d = false;
// TODO(Triang3l): Support long 1D textures.
if (width_minus_1 >= xenos::kTexture2DCubeMaxWidthHeight) {
XELOGE(
"1D texture is too wide ({}) - ignoring! Report the game to Xenia "
"developers",
width_minus_1 + 1);
is_invalid_1d = true;
}
assert_false(fetch.tiled);
if (fetch.tiled) {
XELOGE(
"1D texture has tiling enabled in the fetch constant, but this "
"appears to be completely wrong - ignoring! Report the game to Xenia "
"developers");
is_invalid_1d = true;
}
assert_false(fetch.packed_mips);
if (fetch.packed_mips) {
XELOGE(
"1D texture has packed mips enabled in the fetch constant, but this "
"appears to be completely wrong - ignoring! Report the game to Xenia "
"developers");
is_invalid_1d = true;
}
if (is_invalid_1d) {
return;
}
}
xenos::TextureFormat format = GetBaseFormat(fetch.format);
key_out.base_page = base_page;
key_out.mip_page = mip_page;
key_out.dimension = fetch.dimension;
key_out.width_minus_1 = width_minus_1;
key_out.height_minus_1 = height_minus_1;
key_out.depth_or_array_size_minus_1 = depth_or_array_size_minus_1;
key_out.pitch = fetch.pitch;
key_out.mip_max_level = mip_max_level;
key_out.tiled = fetch.tiled;
key_out.packed_mips = fetch.packed_mips;
key_out.format = format;
key_out.endianness = fetch.endianness;
key_out.is_valid = 1;
if (swizzled_signs_out != nullptr) {
*swizzled_signs_out = texture_util::SwizzleSigns(fetch);
}
}
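// Invalidates all currently valid texture bindings. Unless called from the
// destructor (when there's no backend state left to update), also notifies
// the implementation about which binding slots were reset.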
void TextureCache::ResetTextureBindings(bool from_destructor) {
uint32_t bindings_reset = 0;
for (size_t i = 0; i < texture_bindings_.size(); ++i) {
TextureBinding& binding = texture_bindings_[i];
if (!binding.key.is_valid) {
continue;
}
binding.Reset();
bindings_reset |= UINT32_C(1) << i;
}
texture_bindings_in_sync_ &= ~bindings_reset;
if (!from_destructor && bindings_reset) {
UpdateTextureBindingsImpl(bindings_reset);
}
}
void TextureCache::UpdateTexturesTotalHostMemoryUsage(uint64_t add,
uint64_t subtract) {
textures_total_host_memory_usage_ =
textures_total_host_memory_usage_ - subtract + add;
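  // Expose the counter in megabytes, rounding up so any nonzero usage is
  // visible.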
COUNT_profile_set("gpu/texture_cache/total_host_memory_usage_mb",
uint32_t((textures_total_host_memory_usage_ +
((UINT32_C(1) << 20) - 1)) >>
20));
}
bool TextureCache::IsRangeScaledResolved(uint32_t start_unscaled,
uint32_t length_unscaled) {
if (!IsDrawResolutionScaled()) {
return false;
}
start_unscaled = std::min(start_unscaled, SharedMemory::kBufferSize);
length_unscaled =
std::min(length_unscaled, SharedMemory::kBufferSize - start_unscaled);
if (!length_unscaled) {
return false;
}
// Two-level check for faster rejection since resolve targets are usually
// placed in relatively small and localized memory portions (confirmed by
// testing - pretty much all times the deeper level was entered, the texture
// was a resolve target).
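  // scaled_resolve_pages_ stores one bit per 4 KB page, packed into 32-bit
  // words; scaled_resolve_pages_l2_ stores one bit per such word.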
uint32_t page_first = start_unscaled >> 12;
uint32_t page_last = (start_unscaled + length_unscaled - 1) >> 12;
uint32_t block_first = page_first >> 5;
uint32_t block_last = page_last >> 5;
uint32_t l2_block_first = block_first >> 6;
uint32_t l2_block_last = block_last >> 6;
auto global_lock = global_critical_region_.Acquire();
for (uint32_t i = l2_block_first; i <= l2_block_last; ++i) {
uint64_t l2_block = scaled_resolve_pages_l2_[i];
if (i == l2_block_first) {
l2_block &= ~((UINT64_C(1) << (block_first & 63)) - 1);
}
if (i == l2_block_last && (block_last & 63) != 63) {
l2_block &= (UINT64_C(1) << ((block_last & 63) + 1)) - 1;
}
uint32_t block_relative_index;
while (xe::bit_scan_forward(l2_block, &block_relative_index)) {
l2_block &= ~(UINT64_C(1) << block_relative_index);
uint32_t block_index = (i << 6) + block_relative_index;
uint32_t check_bits = UINT32_MAX;
if (block_index == block_first) {
check_bits &= ~((UINT32_C(1) << (page_first & 31)) - 1);
}
if (block_index == block_last && (page_last & 31) != 31) {
check_bits &= (UINT32_C(1) << ((page_last & 31) + 1)) - 1;
}
if (scaled_resolve_pages_[block_index] & check_bits) {
return true;
}
}
}
return false;
}
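// Static thunk registered as the shared memory global watch callback -
// recovers the TextureCache instance from the opaque context pointer and
// forwards the arguments to the member function.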
void TextureCache::ScaledResolveGlobalWatchCallbackThunk(
const global_unique_lock_type& global_lock, void* context,
uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu) {
TextureCache* texture_cache = reinterpret_cast<TextureCache*>(context);
texture_cache->ScaledResolveGlobalWatchCallback(
global_lock, address_first, address_last, invalidated_by_gpu);
}
void TextureCache::ScaledResolveGlobalWatchCallback(
const global_unique_lock_type& global_lock, uint32_t address_first,
uint32_t address_last, bool invalidated_by_gpu) {
assert_true(IsDrawResolutionScaled());
if (invalidated_by_gpu) {
    // GPU-caused invalidation comes from resolves, which mark these ranges as
    // scaled-resolved - exactly the opposite of what this callback does.
return;
}
// Mark scaled resolve ranges as non-scaled. Textures themselves will be
// invalidated by their shared memory watches.
uint32_t resolve_page_first = address_first >> 12;
uint32_t resolve_page_last = address_last >> 12;
uint32_t resolve_block_first = resolve_page_first >> 5;
uint32_t resolve_block_last = resolve_page_last >> 5;
uint32_t resolve_l2_block_first = resolve_block_first >> 6;
uint32_t resolve_l2_block_last = resolve_block_last >> 6;
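  // Clear the page bits within [address_first, address_last], preserving the
  // bits outside the range in the first and last touched 32-bit words, and
  // clear a word's L2 bit once the word becomes empty.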
for (uint32_t i = resolve_l2_block_first; i <= resolve_l2_block_last; ++i) {
uint64_t resolve_l2_block = scaled_resolve_pages_l2_[i];
uint32_t resolve_block_relative_index;
while (
xe::bit_scan_forward(resolve_l2_block, &resolve_block_relative_index)) {
resolve_l2_block &= ~(UINT64_C(1) << resolve_block_relative_index);
uint32_t resolve_block_index = (i << 6) + resolve_block_relative_index;
uint32_t resolve_keep_bits = 0;
if (resolve_block_index == resolve_block_first) {
resolve_keep_bits |= (UINT32_C(1) << (resolve_page_first & 31)) - 1;
}
if (resolve_block_index == resolve_block_last &&
(resolve_page_last & 31) != 31) {
resolve_keep_bits |=
~((UINT32_C(1) << ((resolve_page_last & 31) + 1)) - 1);
}
scaled_resolve_pages_[resolve_block_index] &= resolve_keep_bits;
if (scaled_resolve_pages_[resolve_block_index] == 0) {
scaled_resolve_pages_l2_[i] &=
~(UINT64_C(1) << resolve_block_relative_index);
}
}
}
}
} // namespace gpu
} // namespace xe