Adding basic short-term vb reuse, removing index counting.
Regression for some games (that don't size their fetches), massive improvement for others.
This commit is contained in:
parent
159ebb4295
commit
11b0c076bd
|
@ -36,69 +36,39 @@ size_t page_size() {
|
||||||
// http://gnuradio.org/redmine/projects/gnuradio/repository/revisions/f2bc76cc65ffba51a141950f98e75364e49df874/entry/volk/kernels/volk/volk_32u_byteswap.h
|
// http://gnuradio.org/redmine/projects/gnuradio/repository/revisions/f2bc76cc65ffba51a141950f98e75364e49df874/entry/volk/kernels/volk/volk_32u_byteswap.h
|
||||||
// http://gnuradio.org/redmine/projects/gnuradio/repository/revisions/2c4c371885c31222362f70a1cd714415d1398021/entry/volk/kernels/volk/volk_64u_byteswap.h
|
// http://gnuradio.org/redmine/projects/gnuradio/repository/revisions/2c4c371885c31222362f70a1cd714415d1398021/entry/volk/kernels/volk/volk_64u_byteswap.h
|
||||||
|
|
||||||
void copy_and_swap_16_aligned(uint16_t* dest, const uint16_t* src, size_t count,
|
void copy_and_swap_16_aligned(uint16_t* dest, const uint16_t* src,
|
||||||
uint16_t* out_max_value) {
|
size_t count) {
|
||||||
return copy_and_swap_16_unaligned(dest, src, count, out_max_value);
|
return copy_and_swap_16_unaligned(dest, src, count);
|
||||||
}
|
}
|
||||||
|
|
||||||
void copy_and_swap_16_unaligned(uint16_t* dest, const uint16_t* src,
|
void copy_and_swap_16_unaligned(uint16_t* dest, const uint16_t* src,
|
||||||
size_t count, uint16_t* out_max_value) {
|
size_t count) {
|
||||||
if (out_max_value) {
|
for (size_t i = 0; i < count; ++i) {
|
||||||
uint16_t max_value = 0;
|
dest[i] = byte_swap(src[i]);
|
||||||
for (size_t i = 0; i < count; ++i) {
|
|
||||||
uint16_t value = byte_swap(src[i]);
|
|
||||||
max_value = std::max(max_value, value);
|
|
||||||
dest[i] = value;
|
|
||||||
}
|
|
||||||
*out_max_value = max_value;
|
|
||||||
} else {
|
|
||||||
for (size_t i = 0; i < count; ++i) {
|
|
||||||
dest[i] = byte_swap(src[i]);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void copy_and_swap_32_aligned(uint32_t* dest, const uint32_t* src, size_t count,
|
void copy_and_swap_32_aligned(uint32_t* dest, const uint32_t* src,
|
||||||
uint32_t* out_max_value) {
|
size_t count) {
|
||||||
return copy_and_swap_32_unaligned(dest, src, count, out_max_value);
|
return copy_and_swap_32_unaligned(dest, src, count);
|
||||||
}
|
}
|
||||||
|
|
||||||
void copy_and_swap_32_unaligned(uint32_t* dest, const uint32_t* src,
|
void copy_and_swap_32_unaligned(uint32_t* dest, const uint32_t* src,
|
||||||
size_t count, uint32_t* out_max_value) {
|
size_t count) {
|
||||||
if (out_max_value) {
|
for (size_t i = 0; i < count; ++i) {
|
||||||
uint32_t max_value = 0;
|
dest[i] = byte_swap(src[i]);
|
||||||
for (size_t i = 0; i < count; ++i) {
|
|
||||||
uint32_t value = byte_swap(src[i]);
|
|
||||||
max_value = std::max(max_value, value);
|
|
||||||
dest[i] = value;
|
|
||||||
}
|
|
||||||
*out_max_value = max_value;
|
|
||||||
} else {
|
|
||||||
for (size_t i = 0; i < count; ++i) {
|
|
||||||
dest[i] = byte_swap(src[i]);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void copy_and_swap_64_aligned(uint64_t* dest, const uint64_t* src, size_t count,
|
void copy_and_swap_64_aligned(uint64_t* dest, const uint64_t* src,
|
||||||
uint64_t* out_max_value) {
|
size_t count) {
|
||||||
return copy_and_swap_64_unaligned(dest, src, count, out_max_value);
|
return copy_and_swap_64_unaligned(dest, src, count);
|
||||||
}
|
}
|
||||||
|
|
||||||
void copy_and_swap_64_unaligned(uint64_t* dest, const uint64_t* src,
|
void copy_and_swap_64_unaligned(uint64_t* dest, const uint64_t* src,
|
||||||
size_t count, uint64_t* out_max_value) {
|
size_t count) {
|
||||||
if (out_max_value) {
|
for (size_t i = 0; i < count; ++i) {
|
||||||
uint64_t max_value = 0;
|
dest[i] = byte_swap(src[i]);
|
||||||
for (size_t i = 0; i < count; ++i) {
|
|
||||||
uint64_t value = byte_swap(src[i]);
|
|
||||||
max_value = std::max(max_value, value);
|
|
||||||
dest[i] = value;
|
|
||||||
}
|
|
||||||
*out_max_value = max_value;
|
|
||||||
} else {
|
|
||||||
for (size_t i = 0; i < count; ++i) {
|
|
||||||
dest[i] = byte_swap(src[i]);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -29,21 +29,18 @@ size_t hash_combine(size_t seed, const T& v, const Ts&... vs) {
|
||||||
|
|
||||||
size_t page_size();
|
size_t page_size();
|
||||||
|
|
||||||
void copy_and_swap_16_aligned(uint16_t* dest, const uint16_t* src, size_t count,
|
void copy_and_swap_16_aligned(uint16_t* dest, const uint16_t* src,
|
||||||
uint16_t* out_max_value = nullptr);
|
size_t count);
|
||||||
void copy_and_swap_16_unaligned(uint16_t* dest, const uint16_t* src,
|
void copy_and_swap_16_unaligned(uint16_t* dest, const uint16_t* src,
|
||||||
size_t count,
|
size_t count);
|
||||||
uint16_t* out_max_value = nullptr);
|
void copy_and_swap_32_aligned(uint32_t* dest, const uint32_t* src,
|
||||||
void copy_and_swap_32_aligned(uint32_t* dest, const uint32_t* src, size_t count,
|
size_t count);
|
||||||
uint32_t* out_max_value = nullptr);
|
|
||||||
void copy_and_swap_32_unaligned(uint32_t* dest, const uint32_t* src,
|
void copy_and_swap_32_unaligned(uint32_t* dest, const uint32_t* src,
|
||||||
size_t count,
|
size_t count);
|
||||||
uint32_t* out_max_value = nullptr);
|
void copy_and_swap_64_aligned(uint64_t* dest, const uint64_t* src,
|
||||||
void copy_and_swap_64_aligned(uint64_t* dest, const uint64_t* src, size_t count,
|
size_t count);
|
||||||
uint64_t* out_max_value = nullptr);
|
|
||||||
void copy_and_swap_64_unaligned(uint64_t* dest, const uint64_t* src,
|
void copy_and_swap_64_unaligned(uint64_t* dest, const uint64_t* src,
|
||||||
size_t count,
|
size_t count);
|
||||||
uint64_t* out_max_value = nullptr);
|
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void copy_and_swap(T* dest, const T* src, size_t count) {
|
void copy_and_swap(T* dest, const T* src, size_t count) {
|
||||||
|
|
|
@ -86,10 +86,32 @@ CircularBuffer::Allocation CircularBuffer::Acquire(size_t length) {
|
||||||
allocation.offset = write_head_;
|
allocation.offset = write_head_;
|
||||||
allocation.length = length;
|
allocation.length = length;
|
||||||
allocation.aligned_length = aligned_length;
|
allocation.aligned_length = aligned_length;
|
||||||
|
allocation.cache_key = 0;
|
||||||
write_head_ += aligned_length;
|
write_head_ += aligned_length;
|
||||||
return allocation;
|
return allocation;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool CircularBuffer::AcquireCached(uint32_t key, size_t length,
|
||||||
|
Allocation* out_allocation) {
|
||||||
|
uint64_t full_key = key | (length << 32);
|
||||||
|
auto& it = allocation_cache_.find(full_key);
|
||||||
|
if (it != allocation_cache_.end()) {
|
||||||
|
uintptr_t write_head = it->second;
|
||||||
|
size_t aligned_length = poly::round_up(length, alignment_);
|
||||||
|
out_allocation->host_ptr = host_base_ + write_head;
|
||||||
|
out_allocation->gpu_ptr = gpu_base_ + write_head;
|
||||||
|
out_allocation->offset = write_head;
|
||||||
|
out_allocation->length = length;
|
||||||
|
out_allocation->aligned_length = aligned_length;
|
||||||
|
out_allocation->cache_key = full_key;
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
*out_allocation = Acquire(length);
|
||||||
|
out_allocation->cache_key = full_key;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void CircularBuffer::Discard(Allocation allocation) {
|
void CircularBuffer::Discard(Allocation allocation) {
|
||||||
write_head_ -= allocation.aligned_length;
|
write_head_ -= allocation.aligned_length;
|
||||||
}
|
}
|
||||||
|
@ -100,6 +122,9 @@ void CircularBuffer::Commit(Allocation allocation) {
|
||||||
dirty_start_ = std::min(dirty_start_, start);
|
dirty_start_ = std::min(dirty_start_, start);
|
||||||
dirty_end_ = std::max(dirty_end_, end);
|
dirty_end_ = std::max(dirty_end_, end);
|
||||||
assert_true(dirty_end_ <= capacity_);
|
assert_true(dirty_end_ <= capacity_);
|
||||||
|
if (allocation.cache_key) {
|
||||||
|
allocation_cache_.insert({allocation.cache_key, allocation.offset});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void CircularBuffer::Flush() {
|
void CircularBuffer::Flush() {
|
||||||
|
@ -112,10 +137,13 @@ void CircularBuffer::Flush() {
|
||||||
dirty_end_ = 0;
|
dirty_end_ = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Drops every entry from the allocation cache.
void CircularBuffer::ClearCache() {
  allocation_cache_.clear();
}
|
||||||
|
|
||||||
void CircularBuffer::WaitUntilClean() {
|
void CircularBuffer::WaitUntilClean() {
|
||||||
Flush();
|
Flush();
|
||||||
glFinish();
|
glFinish();
|
||||||
write_head_ = 0;
|
write_head_ = 0;
|
||||||
|
ClearCache();
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace gl4
|
} // namespace gl4
|
||||||
|
|
|
@ -10,6 +10,8 @@
|
||||||
#ifndef XENIA_GPU_GL4_CIRCULAR_BUFFER_H_
|
#ifndef XENIA_GPU_GL4_CIRCULAR_BUFFER_H_
|
||||||
#define XENIA_GPU_GL4_CIRCULAR_BUFFER_H_
|
#define XENIA_GPU_GL4_CIRCULAR_BUFFER_H_
|
||||||
|
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
#include "xenia/gpu/gl4/gl_context.h"
|
#include "xenia/gpu/gl4/gl_context.h"
|
||||||
|
|
||||||
namespace xe {
|
namespace xe {
|
||||||
|
@ -29,6 +31,7 @@ class CircularBuffer {
|
||||||
size_t offset;
|
size_t offset;
|
||||||
size_t length;
|
size_t length;
|
||||||
size_t aligned_length;
|
size_t aligned_length;
|
||||||
|
uint64_t cache_key; // 0 if caching disabled.
|
||||||
};
|
};
|
||||||
|
|
||||||
bool Initialize();
|
bool Initialize();
|
||||||
|
@ -40,9 +43,11 @@ class CircularBuffer {
|
||||||
|
|
||||||
bool CanAcquire(size_t length);
|
bool CanAcquire(size_t length);
|
||||||
Allocation Acquire(size_t length);
|
Allocation Acquire(size_t length);
|
||||||
|
bool AcquireCached(uint32_t key, size_t length, Allocation* out_allocation);
|
||||||
void Discard(Allocation allocation);
|
void Discard(Allocation allocation);
|
||||||
void Commit(Allocation allocation);
|
void Commit(Allocation allocation);
|
||||||
void Flush();
|
void Flush();
|
||||||
|
void ClearCache();
|
||||||
|
|
||||||
void WaitUntilClean();
|
void WaitUntilClean();
|
||||||
|
|
||||||
|
@ -55,6 +60,8 @@ class CircularBuffer {
|
||||||
GLuint buffer_;
|
GLuint buffer_;
|
||||||
GLuint64 gpu_base_;
|
GLuint64 gpu_base_;
|
||||||
uint8_t* host_base_;
|
uint8_t* host_base_;
|
||||||
|
|
||||||
|
std::unordered_map<uint64_t, uintptr_t> allocation_cache_;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace gl4
|
} // namespace gl4
|
||||||
|
|
|
@ -524,6 +524,8 @@ void CommandProcessor::MakeCoherent() {
|
||||||
// Mark coherent.
|
// Mark coherent.
|
||||||
status_host &= ~0x80000000ul;
|
status_host &= ~0x80000000ul;
|
||||||
regs->values[XE_GPU_REG_COHER_STATUS_HOST].u32 = status_host;
|
regs->values[XE_GPU_REG_COHER_STATUS_HOST].u32 = status_host;
|
||||||
|
|
||||||
|
scratch_buffer_.ClearCache();
|
||||||
}
|
}
|
||||||
|
|
||||||
void CommandProcessor::PrepareForWait() {
|
void CommandProcessor::PrepareForWait() {
|
||||||
|
@ -1431,8 +1433,6 @@ bool CommandProcessor::ExecutePacketType3_INVALIDATE_STATE(
|
||||||
bool CommandProcessor::LoadShader(ShaderType shader_type,
|
bool CommandProcessor::LoadShader(ShaderType shader_type,
|
||||||
const uint32_t* address,
|
const uint32_t* address,
|
||||||
uint32_t dword_count) {
|
uint32_t dword_count) {
|
||||||
SCOPE_profile_cpu_f("gpu");
|
|
||||||
|
|
||||||
// Hash the input memory and lookup the shader.
|
// Hash the input memory and lookup the shader.
|
||||||
GL4Shader* shader_ptr = nullptr;
|
GL4Shader* shader_ptr = nullptr;
|
||||||
uint64_t hash = XXH64(address, dword_count * sizeof(uint32_t), 0);
|
uint64_t hash = XXH64(address, dword_count * sizeof(uint32_t), 0);
|
||||||
|
@ -2288,30 +2288,29 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateIndexBuffer() {
|
||||||
assert_true(info.endianness == Endian::k8in16 ||
|
assert_true(info.endianness == Endian::k8in16 ||
|
||||||
info.endianness == Endian::k8in32);
|
info.endianness == Endian::k8in32);
|
||||||
|
|
||||||
|
trace_writer_.WriteMemoryRead(info.guest_base, info.length);
|
||||||
|
|
||||||
size_t total_size =
|
size_t total_size =
|
||||||
info.count * (info.format == IndexFormat::kInt32 ? sizeof(uint32_t)
|
info.count * (info.format == IndexFormat::kInt32 ? sizeof(uint32_t)
|
||||||
: sizeof(uint16_t));
|
: sizeof(uint16_t));
|
||||||
auto allocation = scratch_buffer_.Acquire(total_size);
|
CircularBuffer::Allocation allocation;
|
||||||
|
if (!scratch_buffer_.AcquireCached(info.guest_base, total_size,
|
||||||
trace_writer_.WriteMemoryRead(info.guest_base, info.length);
|
&allocation)) {
|
||||||
if (info.format == IndexFormat::kInt32) {
|
if (info.format == IndexFormat::kInt32) {
|
||||||
auto dest = reinterpret_cast<uint32_t*>(allocation.host_ptr);
|
auto dest = reinterpret_cast<uint32_t*>(allocation.host_ptr);
|
||||||
auto src = reinterpret_cast<const uint32_t*>(membase_ + info.guest_base);
|
auto src = reinterpret_cast<const uint32_t*>(membase_ + info.guest_base);
|
||||||
uint32_t max_index_found;
|
poly::copy_and_swap_32_aligned(dest, src, info.count);
|
||||||
poly::copy_and_swap_32_aligned(dest, src, info.count, &max_index_found);
|
} else {
|
||||||
index_buffer_info_.max_index_found = max_index_found;
|
auto dest = reinterpret_cast<uint16_t*>(allocation.host_ptr);
|
||||||
|
auto src = reinterpret_cast<const uint16_t*>(membase_ + info.guest_base);
|
||||||
|
poly::copy_and_swap_16_aligned(dest, src, info.count);
|
||||||
|
}
|
||||||
|
draw_batcher_.set_index_buffer(allocation);
|
||||||
|
scratch_buffer_.Commit(std::move(allocation));
|
||||||
} else {
|
} else {
|
||||||
auto dest = reinterpret_cast<uint16_t*>(allocation.host_ptr);
|
draw_batcher_.set_index_buffer(allocation);
|
||||||
auto src = reinterpret_cast<const uint16_t*>(membase_ + info.guest_base);
|
|
||||||
uint16_t max_index_found;
|
|
||||||
poly::copy_and_swap_16_aligned(dest, src, info.count, &max_index_found);
|
|
||||||
index_buffer_info_.max_index_found = max_index_found;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
draw_batcher_.set_index_buffer(allocation);
|
|
||||||
|
|
||||||
scratch_buffer_.Commit(std::move(allocation));
|
|
||||||
|
|
||||||
return UpdateStatus::kCompatible;
|
return UpdateStatus::kCompatible;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2344,44 +2343,56 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateVertexBuffers() {
|
||||||
}
|
}
|
||||||
assert_true(fetch->endian == 2);
|
assert_true(fetch->endian == 2);
|
||||||
|
|
||||||
// Constrain the vertex upload to just what we are interested in.
|
size_t valid_range = size_t(fetch->size * 4);
|
||||||
const size_t kRangeKludge = 5; // could pick index count based on prim.
|
|
||||||
uint32_t max_index = index_buffer_info_.guest_base
|
|
||||||
? index_buffer_info_.max_index_found
|
|
||||||
: draw_index_count_;
|
|
||||||
size_t valid_range = (max_index + kRangeKludge) * desc.stride_words * 4;
|
|
||||||
valid_range = std::min(valid_range, size_t(fetch->size * 4));
|
|
||||||
|
|
||||||
auto allocation = scratch_buffer_.Acquire(valid_range);
|
|
||||||
|
|
||||||
trace_writer_.WriteMemoryRead(fetch->address << 2, valid_range);
|
trace_writer_.WriteMemoryRead(fetch->address << 2, valid_range);
|
||||||
|
|
||||||
// Copy and byte swap the entire buffer.
|
CircularBuffer::Allocation allocation;
|
||||||
// We could be smart about this to save GPU bandwidth by building a CRC
|
if (!scratch_buffer_.AcquireCached(fetch->address << 2, valid_range,
|
||||||
// as we copy and only if it differs from the previous value committing
|
&allocation)) {
|
||||||
// it (and if it matches just discard and reuse).
|
// Copy and byte swap the entire buffer.
|
||||||
poly::copy_and_swap_32_aligned(
|
// We could be smart about this to save GPU bandwidth by building a CRC
|
||||||
reinterpret_cast<uint32_t*>(allocation.host_ptr),
|
// as we copy and only if it differs from the previous value committing
|
||||||
reinterpret_cast<const uint32_t*>(membase_ + (fetch->address << 2)),
|
// it (and if it matches just discard and reuse).
|
||||||
valid_range / 4);
|
poly::copy_and_swap_32_aligned(
|
||||||
|
reinterpret_cast<uint32_t*>(allocation.host_ptr),
|
||||||
|
reinterpret_cast<const uint32_t*>(membase_ + (fetch->address << 2)),
|
||||||
|
valid_range / 4);
|
||||||
|
|
||||||
if (!has_bindless_vbos_) {
|
if (!has_bindless_vbos_) {
|
||||||
// TODO(benvanik): if we could find a way to avoid this, we could use
|
// TODO(benvanik): if we could find a way to avoid this, we could use
|
||||||
// multidraw without flushing.
|
// multidraw without flushing.
|
||||||
glVertexArrayVertexBuffer(active_vertex_shader_->vao(), buffer_index,
|
glVertexArrayVertexBuffer(active_vertex_shader_->vao(), buffer_index,
|
||||||
scratch_buffer_.handle(), allocation.offset,
|
scratch_buffer_.handle(), allocation.offset,
|
||||||
desc.stride_words * 4);
|
desc.stride_words * 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (has_bindless_vbos_) {
|
if (has_bindless_vbos_) {
|
||||||
for (uint32_t i = 0; i < desc.element_count; ++i, ++el_index) {
|
for (uint32_t i = 0; i < desc.element_count; ++i, ++el_index) {
|
||||||
const auto& el = desc.elements[i];
|
const auto& el = desc.elements[i];
|
||||||
draw_batcher_.set_vertex_buffer(el_index, 0, desc.stride_words * 4,
|
draw_batcher_.set_vertex_buffer(el_index, 0, desc.stride_words * 4,
|
||||||
allocation);
|
allocation);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
scratch_buffer_.Commit(std::move(allocation));
|
||||||
|
} else {
|
||||||
|
if (!has_bindless_vbos_) {
|
||||||
|
// TODO(benvanik): if we could find a way to avoid this, we could use
|
||||||
|
// multidraw without flushing.
|
||||||
|
glVertexArrayVertexBuffer(active_vertex_shader_->vao(), buffer_index,
|
||||||
|
scratch_buffer_.handle(), allocation.offset,
|
||||||
|
desc.stride_words * 4);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (has_bindless_vbos_) {
|
||||||
|
for (uint32_t i = 0; i < desc.element_count; ++i, ++el_index) {
|
||||||
|
const auto& el = desc.elements[i];
|
||||||
|
draw_batcher_.set_vertex_buffer(el_index, 0, desc.stride_words * 4,
|
||||||
|
allocation);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
scratch_buffer_.Commit(std::move(allocation));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return UpdateStatus::kCompatible;
|
return UpdateStatus::kCompatible;
|
||||||
|
|
|
@ -277,7 +277,6 @@ class CommandProcessor {
|
||||||
uint32_t count;
|
uint32_t count;
|
||||||
uint32_t guest_base;
|
uint32_t guest_base;
|
||||||
size_t length;
|
size_t length;
|
||||||
uint32_t max_index_found;
|
|
||||||
} index_buffer_info_;
|
} index_buffer_info_;
|
||||||
uint32_t draw_index_count_;
|
uint32_t draw_index_count_;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue