diff --git a/src/poly/memory.cc b/src/poly/memory.cc
index 585e38a82..7514663d4 100644
--- a/src/poly/memory.cc
+++ b/src/poly/memory.cc
@@ -36,69 +36,39 @@ size_t page_size() {
 // http://gnuradio.org/redmine/projects/gnuradio/repository/revisions/f2bc76cc65ffba51a141950f98e75364e49df874/entry/volk/kernels/volk/volk_32u_byteswap.h
 // http://gnuradio.org/redmine/projects/gnuradio/repository/revisions/2c4c371885c31222362f70a1cd714415d1398021/entry/volk/kernels/volk/volk_64u_byteswap.h

-void copy_and_swap_16_aligned(uint16_t* dest, const uint16_t* src, size_t count,
-                              uint16_t* out_max_value) {
-  return copy_and_swap_16_unaligned(dest, src, count, out_max_value);
+void copy_and_swap_16_aligned(uint16_t* dest, const uint16_t* src,
+                              size_t count) {
+  return copy_and_swap_16_unaligned(dest, src, count);
 }

 void copy_and_swap_16_unaligned(uint16_t* dest, const uint16_t* src,
-                                size_t count, uint16_t* out_max_value) {
-  if (out_max_value) {
-    uint16_t max_value = 0;
-    for (size_t i = 0; i < count; ++i) {
-      uint16_t value = byte_swap(src[i]);
-      max_value = std::max(max_value, value);
-      dest[i] = value;
-    }
-    *out_max_value = max_value;
-  } else {
-    for (size_t i = 0; i < count; ++i) {
-      dest[i] = byte_swap(src[i]);
-    }
+                                size_t count) {
+  for (size_t i = 0; i < count; ++i) {
+    dest[i] = byte_swap(src[i]);
   }
 }

-void copy_and_swap_32_aligned(uint32_t* dest, const uint32_t* src, size_t count,
-                              uint32_t* out_max_value) {
-  return copy_and_swap_32_unaligned(dest, src, count, out_max_value);
+void copy_and_swap_32_aligned(uint32_t* dest, const uint32_t* src,
+                              size_t count) {
+  return copy_and_swap_32_unaligned(dest, src, count);
 }

 void copy_and_swap_32_unaligned(uint32_t* dest, const uint32_t* src,
-                                size_t count, uint32_t* out_max_value) {
-  if (out_max_value) {
-    uint32_t max_value = 0;
-    for (size_t i = 0; i < count; ++i) {
-      uint32_t value = byte_swap(src[i]);
-      max_value = std::max(max_value, value);
-      dest[i] = value;
-    }
-    *out_max_value = max_value;
-  } else {
-    for (size_t i = 0; i < count; ++i) {
-      dest[i] = byte_swap(src[i]);
-    }
+                                size_t count) {
+  for (size_t i = 0; i < count; ++i) {
+    dest[i] = byte_swap(src[i]);
   }
 }

-void copy_and_swap_64_aligned(uint64_t* dest, const uint64_t* src, size_t count,
-                              uint64_t* out_max_value) {
-  return copy_and_swap_64_unaligned(dest, src, count, out_max_value);
+void copy_and_swap_64_aligned(uint64_t* dest, const uint64_t* src,
+                              size_t count) {
+  return copy_and_swap_64_unaligned(dest, src, count);
 }

 void copy_and_swap_64_unaligned(uint64_t* dest, const uint64_t* src,
-                                size_t count, uint64_t* out_max_value) {
-  if (out_max_value) {
-    uint64_t max_value = 0;
-    for (size_t i = 0; i < count; ++i) {
-      uint64_t value = byte_swap(src[i]);
-      max_value = std::max(max_value, value);
-      dest[i] = value;
-    }
-    *out_max_value = max_value;
-  } else {
-    for (size_t i = 0; i < count; ++i) {
-      dest[i] = byte_swap(src[i]);
-    }
+                                size_t count) {
+  for (size_t i = 0; i < count; ++i) {
+    dest[i] = byte_swap(src[i]);
   }
 }

diff --git a/src/poly/memory.h b/src/poly/memory.h
index 8214b6c5c..be3e643ea 100644
--- a/src/poly/memory.h
+++ b/src/poly/memory.h
@@ -29,21 +29,18 @@ size_t hash_combine(size_t seed, const T& v, const Ts&... vs) {
 size_t page_size();

-void copy_and_swap_16_aligned(uint16_t* dest, const uint16_t* src, size_t count,
-                              uint16_t* out_max_value = nullptr);
+void copy_and_swap_16_aligned(uint16_t* dest, const uint16_t* src,
+                              size_t count);
 void copy_and_swap_16_unaligned(uint16_t* dest, const uint16_t* src,
-                                size_t count,
-                                uint16_t* out_max_value = nullptr);
-void copy_and_swap_32_aligned(uint32_t* dest, const uint32_t* src, size_t count,
-                              uint32_t* out_max_value = nullptr);
+                                size_t count);
+void copy_and_swap_32_aligned(uint32_t* dest, const uint32_t* src,
+                              size_t count);
 void copy_and_swap_32_unaligned(uint32_t* dest, const uint32_t* src,
-                                size_t count,
-                                uint32_t* out_max_value = nullptr);
-void copy_and_swap_64_aligned(uint64_t* dest, const uint64_t* src, size_t count,
-                              uint64_t* out_max_value = nullptr);
+                                size_t count);
+void copy_and_swap_64_aligned(uint64_t* dest, const uint64_t* src,
+                              size_t count);
 void copy_and_swap_64_unaligned(uint64_t* dest, const uint64_t* src,
-                                size_t count,
-                                uint64_t* out_max_value = nullptr);
+                                size_t count);

 template <typename T>
 void copy_and_swap(T* dest, const T* src, size_t count) {
diff --git a/src/xenia/gpu/gl4/circular_buffer.cc b/src/xenia/gpu/gl4/circular_buffer.cc
index bfd7cbf2d..538b71e82 100644
--- a/src/xenia/gpu/gl4/circular_buffer.cc
+++ b/src/xenia/gpu/gl4/circular_buffer.cc
@@ -86,10 +86,32 @@ CircularBuffer::Allocation CircularBuffer::Acquire(size_t length) {
   allocation.offset = write_head_;
   allocation.length = length;
   allocation.aligned_length = aligned_length;
+  allocation.cache_key = 0;
   write_head_ += aligned_length;
   return allocation;
 }

+bool CircularBuffer::AcquireCached(uint32_t key, size_t length,
+                                   Allocation* out_allocation) {
+  uint64_t full_key = key | (length << 32);
+  auto it = allocation_cache_.find(full_key);
+  if (it != allocation_cache_.end()) {
+    uintptr_t write_head = it->second;
+    size_t aligned_length = poly::round_up(length, alignment_);
+    out_allocation->host_ptr = host_base_ + write_head;
+    out_allocation->gpu_ptr = gpu_base_ + write_head;
+    out_allocation->offset = write_head;
+    out_allocation->length = length;
+    out_allocation->aligned_length = aligned_length;
+    out_allocation->cache_key = full_key;
+    return true;
+  } else {
+    *out_allocation = Acquire(length);
+    out_allocation->cache_key = full_key;
+    return false;
+  }
+}
+
 void CircularBuffer::Discard(Allocation allocation) {
   write_head_ -= allocation.aligned_length;
 }
@@ -100,6 +122,9 @@ void CircularBuffer::Commit(Allocation allocation) {
   dirty_start_ = std::min(dirty_start_, start);
   dirty_end_ = std::max(dirty_end_, end);
   assert_true(dirty_end_ <= capacity_);
+  if (allocation.cache_key) {
+    allocation_cache_.insert({allocation.cache_key, allocation.offset});
+  }
 }

 void CircularBuffer::Flush() {
@@ -112,10 +137,13 @@ void CircularBuffer::Flush() {
   dirty_end_ = 0;
 }

+void CircularBuffer::ClearCache() { allocation_cache_.clear(); }
+
 void CircularBuffer::WaitUntilClean() {
   Flush();
   glFinish();
   write_head_ = 0;
+  ClearCache();
 }

 }  // namespace gl4
diff --git a/src/xenia/gpu/gl4/circular_buffer.h b/src/xenia/gpu/gl4/circular_buffer.h
index 7a0232693..da1ebd788 100644
--- a/src/xenia/gpu/gl4/circular_buffer.h
+++ b/src/xenia/gpu/gl4/circular_buffer.h
@@ -10,6 +10,8 @@
 #ifndef XENIA_GPU_GL4_CIRCULAR_BUFFER_H_
 #define XENIA_GPU_GL4_CIRCULAR_BUFFER_H_

+#include <unordered_map>
+
 #include "xenia/gpu/gl4/gl_context.h"

 namespace xe {
@@ -29,6 +31,7 @@ class CircularBuffer {
     size_t offset;
     size_t length;
     size_t aligned_length;
+    uint64_t cache_key;  // 0 if caching disabled.
   };

   bool Initialize();
@@ -40,9 +43,11 @@ class CircularBuffer {

   bool CanAcquire(size_t length);
   Allocation Acquire(size_t length);
+  bool AcquireCached(uint32_t key, size_t length, Allocation* out_allocation);
   void Discard(Allocation allocation);
   void Commit(Allocation allocation);
   void Flush();
+  void ClearCache();

   void WaitUntilClean();

@@ -55,6 +60,8 @@ class CircularBuffer {
   GLuint buffer_;
   GLuint64 gpu_base_;
   uint8_t* host_base_;
+
+  std::unordered_map<uint64_t, uintptr_t> allocation_cache_;
 };

 }  // namespace gl4
diff --git a/src/xenia/gpu/gl4/command_processor.cc b/src/xenia/gpu/gl4/command_processor.cc
index 8e3a5eed1..fd50eee70 100644
--- a/src/xenia/gpu/gl4/command_processor.cc
+++ b/src/xenia/gpu/gl4/command_processor.cc
@@ -524,6 +524,8 @@ void CommandProcessor::MakeCoherent() {
   // Mark coherent.
   status_host &= ~0x80000000ul;
   regs->values[XE_GPU_REG_COHER_STATUS_HOST].u32 = status_host;
+
+  scratch_buffer_.ClearCache();
 }

 void CommandProcessor::PrepareForWait() {
@@ -1431,8 +1433,6 @@ bool CommandProcessor::ExecutePacketType3_INVALIDATE_STATE(
 bool CommandProcessor::LoadShader(ShaderType shader_type,
                                   const uint32_t* address,
                                   uint32_t dword_count) {
-  SCOPE_profile_cpu_f("gpu");
-
   // Hash the input memory and lookup the shader.
   GL4Shader* shader_ptr = nullptr;
   uint64_t hash = XXH64(address, dword_count * sizeof(uint32_t), 0);
@@ -2288,30 +2288,29 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateIndexBuffer() {
   assert_true(info.endianness == Endian::k8in16 ||
               info.endianness == Endian::k8in32);

+  trace_writer_.WriteMemoryRead(info.guest_base, info.length);
+
   size_t total_size =
       info.count * (info.format == IndexFormat::kInt32 ? sizeof(uint32_t)
                                                        : sizeof(uint16_t));
-  auto allocation = scratch_buffer_.Acquire(total_size);
-
-  trace_writer_.WriteMemoryRead(info.guest_base, info.length);
-
-  if (info.format == IndexFormat::kInt32) {
-    auto dest = reinterpret_cast<uint32_t*>(allocation.host_ptr);
-    auto src = reinterpret_cast<const uint32_t*>(membase_ + info.guest_base);
-    uint32_t max_index_found;
-    poly::copy_and_swap_32_aligned(dest, src, info.count, &max_index_found);
-    index_buffer_info_.max_index_found = max_index_found;
+  CircularBuffer::Allocation allocation;
+  if (!scratch_buffer_.AcquireCached(info.guest_base, total_size,
+                                     &allocation)) {
+    if (info.format == IndexFormat::kInt32) {
+      auto dest = reinterpret_cast<uint32_t*>(allocation.host_ptr);
+      auto src = reinterpret_cast<const uint32_t*>(membase_ + info.guest_base);
+      poly::copy_and_swap_32_aligned(dest, src, info.count);
+    } else {
+      auto dest = reinterpret_cast<uint16_t*>(allocation.host_ptr);
+      auto src = reinterpret_cast<const uint16_t*>(membase_ + info.guest_base);
+      poly::copy_and_swap_16_aligned(dest, src, info.count);
+    }
+    draw_batcher_.set_index_buffer(allocation);
+    scratch_buffer_.Commit(std::move(allocation));
   } else {
-    auto dest = reinterpret_cast<uint16_t*>(allocation.host_ptr);
-    auto src = reinterpret_cast<const uint16_t*>(membase_ + info.guest_base);
-    uint16_t max_index_found;
-    poly::copy_and_swap_16_aligned(dest, src, info.count, &max_index_found);
-    index_buffer_info_.max_index_found = max_index_found;
+    draw_batcher_.set_index_buffer(allocation);
   }
-  draw_batcher_.set_index_buffer(allocation);
-
-  scratch_buffer_.Commit(std::move(allocation));
-
   return UpdateStatus::kCompatible;
 }

@@ -2344,44 +2343,56 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateVertexBuffers() {
     }
     assert_true(fetch->endian == 2);

-    // Constrain the vertex upload to just what we are interested in.
-    const size_t kRangeKludge = 5;  // could pick index count based on prim.
-    uint32_t max_index = index_buffer_info_.guest_base
-                             ? index_buffer_info_.max_index_found
-                             : draw_index_count_;
-    size_t valid_range = (max_index + kRangeKludge) * desc.stride_words * 4;
-    valid_range = std::min(valid_range, size_t(fetch->size * 4));
-
-    auto allocation = scratch_buffer_.Acquire(valid_range);
+    size_t valid_range = size_t(fetch->size * 4);

     trace_writer_.WriteMemoryRead(fetch->address << 2, valid_range);

-    // Copy and byte swap the entire buffer.
-    // We could be smart about this to save GPU bandwidth by building a CRC
-    // as we copy and only if it differs from the previous value committing
-    // it (and if it matches just discard and reuse).
-    poly::copy_and_swap_32_aligned(
-        reinterpret_cast<uint32_t*>(allocation.host_ptr),
-        reinterpret_cast<const uint32_t*>(membase_ + (fetch->address << 2)),
-        valid_range / 4);
+    CircularBuffer::Allocation allocation;
+    if (!scratch_buffer_.AcquireCached(fetch->address << 2, valid_range,
+                                       &allocation)) {
+      // Copy and byte swap the entire buffer.
+      // We could be smart about this to save GPU bandwidth by building a CRC
+      // as we copy and only if it differs from the previous value committing
+      // it (and if it matches just discard and reuse).
+      poly::copy_and_swap_32_aligned(
+          reinterpret_cast<uint32_t*>(allocation.host_ptr),
+          reinterpret_cast<const uint32_t*>(membase_ + (fetch->address << 2)),
+          valid_range / 4);

-    if (!has_bindless_vbos_) {
-      // TODO(benvanik): if we could find a way to avoid this, we could use
-      // multidraw without flushing.
-      glVertexArrayVertexBuffer(active_vertex_shader_->vao(), buffer_index,
-                                scratch_buffer_.handle(), allocation.offset,
-                                desc.stride_words * 4);
-    }
+      if (!has_bindless_vbos_) {
+        // TODO(benvanik): if we could find a way to avoid this, we could use
+        // multidraw without flushing.
+        glVertexArrayVertexBuffer(active_vertex_shader_->vao(), buffer_index,
+                                  scratch_buffer_.handle(), allocation.offset,
+                                  desc.stride_words * 4);
+      }

-    if (has_bindless_vbos_) {
-      for (uint32_t i = 0; i < desc.element_count; ++i, ++el_index) {
-        const auto& el = desc.elements[i];
-        draw_batcher_.set_vertex_buffer(el_index, 0, desc.stride_words * 4,
-                                        allocation);
+      if (has_bindless_vbos_) {
+        for (uint32_t i = 0; i < desc.element_count; ++i, ++el_index) {
+          const auto& el = desc.elements[i];
+          draw_batcher_.set_vertex_buffer(el_index, 0, desc.stride_words * 4,
+                                          allocation);
+        }
+      }
+
+      scratch_buffer_.Commit(std::move(allocation));
+    } else {
+      if (!has_bindless_vbos_) {
+        // TODO(benvanik): if we could find a way to avoid this, we could use
+        // multidraw without flushing.
+        glVertexArrayVertexBuffer(active_vertex_shader_->vao(), buffer_index,
+                                  scratch_buffer_.handle(), allocation.offset,
+                                  desc.stride_words * 4);
+      }
+
+      if (has_bindless_vbos_) {
+        for (uint32_t i = 0; i < desc.element_count; ++i, ++el_index) {
+          const auto& el = desc.elements[i];
+          draw_batcher_.set_vertex_buffer(el_index, 0, desc.stride_words * 4,
+                                          allocation);
+        }
       }
     }
-
-    scratch_buffer_.Commit(std::move(allocation));
   }

   return UpdateStatus::kCompatible;
diff --git a/src/xenia/gpu/gl4/command_processor.h b/src/xenia/gpu/gl4/command_processor.h
index ffe7bd3cd..1d8aa4142 100644
--- a/src/xenia/gpu/gl4/command_processor.h
+++ b/src/xenia/gpu/gl4/command_processor.h
@@ -277,7 +277,6 @@ class CommandProcessor {
     uint32_t count;
     uint32_t guest_base;
     size_t length;
-    uint32_t max_index_found;
   } index_buffer_info_;
   uint32_t draw_index_count_;
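
Note on the caching scheme: AcquireCached() packs the 32-bit guest address into the low half of a 64-bit key and the upload length into the high half, so a range re-uploaded unchanged between cache flushes (MakeCoherent() and WaitUntilClean() both clear the cache) reuses the scratch-buffer region committed for it earlier instead of copying and byte-swapping again, while the same address with a different length produces a distinct key and falls back to a fresh Acquire(). A minimal standalone sketch of just the key packing and hit/miss behavior (hypothetical make_cache_key helper, assuming a 64-bit size_t; not the CircularBuffer implementation itself):

#include <cstdint>
#include <unordered_map>

// Mirrors the full_key computation in CircularBuffer::AcquireCached():
// guest address in the low 32 bits, upload length in the high 32 bits.
uint64_t make_cache_key(uint32_t guest_address, size_t length) {
  return uint64_t(guest_address) | (uint64_t(length) << 32);
}

int main() {
  std::unordered_map<uint64_t, uintptr_t> allocation_cache;

  // First upload of a 16KB buffer: miss; the caller copies/swaps into the
  // scratch buffer and Commit() records the offset under the key.
  uint64_t key = make_cache_key(0x8012F000u, 0x4000);
  allocation_cache.insert({key, uintptr_t(0)});

  // Same address and length again: hit; the swapped bytes are reused as-is.
  bool hit = allocation_cache.count(key) != 0;

  // Same address, different length: distinct key, forcing a fresh copy.
  bool miss = allocation_cache.count(make_cache_key(0x8012F000u, 0x2000)) == 0;

  return (hit && miss) ? 0 : 1;
}

Dropping max_index_found in favor of uploading the full fetch range is what makes the key stable: valid_range is now fetch->size * 4 alone, so it no longer varies with the indices seen in the previous draw.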