/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2020 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */

#include "xenia/memory.h"

#include <algorithm>
#include <cstring>
#include <utility>

#include "third_party/fmt/include/fmt/format.h"
#include "xenia/base/assert.h"
#include "xenia/base/byte_stream.h"
#include "xenia/base/clock.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/threading.h"
#include "xenia/cpu/mmio_handler.h"

// TODO(benvanik): move xbox.h out
#include "xenia/xbox.h"

DEFINE_bool(protect_zero, true, "Protect the zero page from reads and writes.",
            "Memory");
DEFINE_bool(protect_on_release, false,
            "Protect released memory to prevent accesses.", "Memory");
DEFINE_bool(scribble_heap, false,
            "Scribble 0xCD into all allocated heap memory.", "Memory");

namespace xe {

uint32_t get_page_count(uint32_t value, uint32_t page_size) {
  return xe::round_up(value, page_size) / page_size;
}
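
// For example (illustrative values): with 4 KiB pages, a 0x1001-byte request
// rounds up to two pages, so get_page_count(0x1001, 0x1000) == 2, while an
// exact fit such as get_page_count(0x2000, 0x1000) == 2 adds no extra page.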

/**
 * Memory map:
 * 0x00000000 - 0x3FFFFFFF (1024mb) - virtual 4k pages
 * 0x40000000 - 0x7FFFFFFF (1024mb) - virtual 64k pages
 * 0x80000000 - 0x8BFFFFFF ( 192mb) - xex 64k pages
 * 0x8C000000 - 0x8FFFFFFF (  64mb) - xex 64k pages (encrypted)
 * 0x90000000 - 0x9FFFFFFF ( 256mb) - xex 4k pages
 * 0xA0000000 - 0xBFFFFFFF ( 512mb) - physical 64k pages
 * 0xC0000000 - 0xDFFFFFFF          - physical 16mb pages
 * 0xE0000000 - 0xFFFFFFFF          - physical 4k pages
 *
 * We use the host OS to create an entire addressable range for this. That way
 * we don't have to emulate a TLB. It'd be really cool to pass through page
 * sizes or use madvise to let the OS know what to expect.
 *
 * We create our own heap of committed memory that lives at
 * memory_HEAP_LOW to memory_HEAP_HIGH - all normal user allocations
 * come from there. Since the Xbox has no paging, we know that the size of
 * this heap will never need to be larger than ~512MB (realistically, smaller
 * than that). We place it far away from the XEX data and keep the memory
 * around it uncommitted so that we have some warning if things go astray.
 *
 * For XEX/GPU/etc data we allow placement allocations (base_address != 0) and
 * commit the requested memory as needed. This bypasses the standard heap, but
 * XEXs should never be overwriting anything so that's fine. We can also query
 * for previous commits and assert that we really aren't committing twice.
 *
 * GPU memory is mapped onto the lower 512mb of the virtual 4k range (0).
 * So 0xA0000000 = 0x00000000. A more sophisticated allocator could handle
 * this.
 */
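
// For illustration: under this layout a title's XEX image is typically loaded
// around 0x82000000 (inside the 64k-page xex range), while a physical
// allocation at 0x00100000 is also reachable through the 64k-page physical
// window at 0xA0100000, since 0xA0000000 aliases physical 0x00000000.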

static Memory* active_memory_ = nullptr;

void CrashDump() {
  static std::atomic<int> in_crash_dump(0);
  if (in_crash_dump.fetch_add(1)) {
    xe::FatalError(
        "Hard crash: the memory system crashed while dumping a crash dump.");
    return;
  }
  active_memory_->DumpMap();
  --in_crash_dump;
}

Memory::Memory() {
  system_page_size_ = uint32_t(xe::memory::page_size());
  system_allocation_granularity_ =
      uint32_t(xe::memory::allocation_granularity());
  assert_zero(active_memory_);
  active_memory_ = this;
}

Memory::~Memory() {
  assert_true(active_memory_ == this);
  active_memory_ = nullptr;

  // Uninstall the MMIO handler, as we won't be able to service more
  // requests.
  mmio_handler_.reset();

  for (auto invalidation_callback : physical_memory_invalidation_callbacks_) {
    delete invalidation_callback;
  }

  heaps_.v00000000.Dispose();
  heaps_.v40000000.Dispose();
  heaps_.v80000000.Dispose();
  heaps_.v90000000.Dispose();
  heaps_.vA0000000.Dispose();
  heaps_.vC0000000.Dispose();
  heaps_.vE0000000.Dispose();
  heaps_.physical.Dispose();

  // Unmap all views and close mapping.
  if (mapping_ != xe::memory::kFileMappingHandleInvalid) {
    UnmapViews();
    xe::memory::CloseFileMappingHandle(mapping_, file_name_);
    mapping_base_ = nullptr;
    mapping_ = xe::memory::kFileMappingHandleInvalid;
  }

  virtual_membase_ = nullptr;
  physical_membase_ = nullptr;
}

bool Memory::Initialize() {
  file_name_ = fmt::format("xenia_memory_{}", Clock::QueryHostTickCount());

  // Create the main page-file-backed mapping. This is all reserved but
  // uncommitted (so it shouldn't expand the page file).
  mapping_ = xe::memory::CreateFileMappingHandle(
      file_name_,
      // entire 4gb space + 512mb physical:
      0x11FFFFFFF, xe::memory::PageAccess::kReadWrite, false);
  if (mapping_ == xe::memory::kFileMappingHandleInvalid) {
    XELOGE("Unable to reserve the 4gb guest address space.");
    assert_always();
    return false;
  }

  // Attempt to create our views. This may fail at the first address
  // we pick, so try a few times.
  mapping_base_ = 0;
  for (size_t n = 32; n < 64; n++) {
    auto mapping_base = reinterpret_cast<uint8_t*>(1ull << n);
    if (!MapViews(mapping_base)) {
      mapping_base_ = mapping_base;
      break;
    }
  }
  if (!mapping_base_) {
    XELOGE("Unable to find a contiguous block in the 64-bit address space.");
    assert_always();
    return false;
  }
  virtual_membase_ = mapping_base_;
  physical_membase_ = mapping_base_ + 0x100000000ull;

  // Prepare virtual heaps.
  heaps_.v00000000.Initialize(this, virtual_membase_, HeapType::kGuestVirtual,
                              0x00000000, 0x40000000, 4096);
  heaps_.v40000000.Initialize(this, virtual_membase_, HeapType::kGuestVirtual,
                              0x40000000, 0x40000000 - 0x01000000, 64 * 1024);
  heaps_.v80000000.Initialize(this, virtual_membase_, HeapType::kGuestXex,
                              0x80000000, 0x10000000, 64 * 1024);
  heaps_.v90000000.Initialize(this, virtual_membase_, HeapType::kGuestXex,
                              0x90000000, 0x10000000, 4096);

  // Prepare physical heaps.
  heaps_.physical.Initialize(this, physical_membase_, HeapType::kGuestPhysical,
                             0x00000000, 0x20000000, 4096);
  heaps_.vA0000000.Initialize(this, virtual_membase_, HeapType::kGuestPhysical,
                              0xA0000000, 0x20000000, 64 * 1024,
                              &heaps_.physical);
  heaps_.vC0000000.Initialize(this, virtual_membase_, HeapType::kGuestPhysical,
                              0xC0000000, 0x20000000, 16 * 1024 * 1024,
                              &heaps_.physical);
  heaps_.vE0000000.Initialize(this, virtual_membase_, HeapType::kGuestPhysical,
                              0xE0000000, 0x1FD00000, 4096, &heaps_.physical);

  // Protect the first and last 64kb of memory.
  heaps_.v00000000.AllocFixed(
      0x00000000, 0x10000, 0x10000,
      kMemoryAllocationReserve | kMemoryAllocationCommit,
      !cvars::protect_zero ? kMemoryProtectRead | kMemoryProtectWrite
                           : kMemoryProtectNoAccess);
  heaps_.physical.AllocFixed(0x1FFF0000, 0x10000, 0x10000,
                             kMemoryAllocationReserve, kMemoryProtectNoAccess);

  // GPU writeback.
  // 0xC... is physical, 0x7F... is virtual. We may need to overlay these.
  heaps_.vC0000000.AllocFixed(
      0xC0000000, 0x01000000, 32,
      kMemoryAllocationReserve | kMemoryAllocationCommit,
      kMemoryProtectRead | kMemoryProtectWrite);

  // Add handlers for MMIO.
  mmio_handler_ = cpu::MMIOHandler::Install(
      virtual_membase_, physical_membase_, physical_membase_ + 0x1FFFFFFF,
      HostToGuestVirtualThunk, this, AccessViolationCallbackThunk, this,
      nullptr, nullptr);
  if (!mmio_handler_) {
    XELOGE("Unable to install MMIO handlers");
    assert_always();
    return false;
  }

  // ?
  uint32_t unk_phys_alloc;
  heaps_.vA0000000.Alloc(0x340000, 64 * 1024, kMemoryAllocationReserve,
                         kMemoryProtectNoAccess, true, &unk_phys_alloc);

  return true;
}
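
// For illustration: after Initialize() succeeds, a guest virtual address is
// simply an offset from mapping_base_, e.g. translating guest 0x82000000:
//   uint8_t* host_ptr = memory->TranslateVirtual(0x82000000);
// which (outside the specially offset 0xE0000000 range) resolves to roughly
// virtual_membase_ + 0x82000000. Physical pages live in the second window at
// physical_membase_ = mapping_base_ + 0x100000000ull.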

void Memory::SetMMIOExceptionRecordingCallback(
    cpu::MmioAccessRecordCallback callback, void* context) {
  mmio_handler_->SetMMIOExceptionRecordingCallback(callback, context);
}

static const struct {
  uint64_t virtual_address_start;
  uint64_t virtual_address_end;
  uint64_t target_address;
} map_info[] = {
    // (1024mb) - virtual 4k pages
    {
        0x00000000,
        0x3FFFFFFF,
        0x0000000000000000ull,
    },
    // (1024mb) - virtual 64k pages (cont)
    {
        0x40000000,
        0x7EFFFFFF,
        0x0000000040000000ull,
    },
    // (16mb) - GPU writeback + 15mb of XPS?
    {
        0x7F000000,
        0x7FFFFFFF,
        0x0000000100000000ull,
    },
    // (256mb) - xex 64k pages
    {
        0x80000000,
        0x8FFFFFFF,
        0x0000000080000000ull,
    },
    // (256mb) - xex 4k pages
    {
        0x90000000,
        0x9FFFFFFF,
        0x0000000080000000ull,
    },
    // (512mb) - physical 64k pages
    {
        0xA0000000,
        0xBFFFFFFF,
        0x0000000100000000ull,
    },
    // - physical 16mb pages
    {
        0xC0000000,
        0xDFFFFFFF,
        0x0000000100000000ull,
    },
    // - physical 4k pages
    {
        0xE0000000,
        0xFFFFFFFF,
        0x0000000100001000ull,
    },
    // - physical raw
    {
        0x100000000,
        0x11FFFFFFF,
        0x0000000100000000ull,
    },
};
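
// Note that several views share a target_address: the 0x7F000000, 0xA0000000,
// 0xC0000000 and raw 0x100000000 entries all target 0x0000000100000000ull, so
// they alias the same underlying pages of the file mapping.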

int Memory::MapViews(uint8_t* mapping_base) {
  assert_true(xe::countof(map_info) == xe::countof(views_.all_views));
  // 0xE0000000 4 KB offset is emulated via host_address_offset and on the CPU
  // side if system allocation granularity is bigger than 4 KB.
  uint64_t granularity_mask = ~uint64_t(system_allocation_granularity_ - 1);
  for (size_t n = 0; n < xe::countof(map_info); n++) {
    views_.all_views[n] = reinterpret_cast<uint8_t*>(xe::memory::MapFileView(
        mapping_, mapping_base + map_info[n].virtual_address_start,
        map_info[n].virtual_address_end - map_info[n].virtual_address_start + 1,
        xe::memory::PageAccess::kReadWrite,
        map_info[n].target_address & granularity_mask));
    if (!views_.all_views[n]) {
      // Failed, so bail and try again.
      UnmapViews();
      return 1;
    }
  }
  return 0;
}
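
// MapViews returns 0 on success and nonzero on failure, which is why
// Initialize() treats !MapViews(mapping_base) as "the views were created".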

void Memory::UnmapViews() {
  for (size_t n = 0; n < xe::countof(views_.all_views); n++) {
    if (views_.all_views[n]) {
      size_t length = map_info[n].virtual_address_end -
                      map_info[n].virtual_address_start + 1;
      xe::memory::UnmapFileView(mapping_, views_.all_views[n], length);
    }
  }
}

void Memory::Reset() {
  heaps_.v00000000.Reset();
  heaps_.v40000000.Reset();
  heaps_.v80000000.Reset();
  heaps_.v90000000.Reset();
  heaps_.physical.Reset();
}

// clang does not like non-standard layout offsetof
#if XE_COMPILER_MSVC == 1 && XE_COMPILER_CLANG_CL == 0
XE_NOALIAS
const BaseHeap* Memory::LookupHeap(uint32_t address) const {
#define HEAP_INDEX(name) \
  offsetof(Memory, heaps_.name) - offsetof(Memory, heaps_)

  const char* heap_select = (const char*)&this->heaps_;

  unsigned selected_heap_offset = 0;
  unsigned high_nibble = address >> 28;

  if (high_nibble < 0x4) {
    selected_heap_offset = HEAP_INDEX(v00000000);
  } else if (address < 0x7F000000) {
    selected_heap_offset = HEAP_INDEX(v40000000);
  } else if (high_nibble < 0x8) {
    heap_select = nullptr;
    // return nullptr;
  } else if (high_nibble < 0x9) {
    selected_heap_offset = HEAP_INDEX(v80000000);
    // return &heaps_.v80000000;
  } else if (high_nibble < 0xA) {
    // return &heaps_.v90000000;
    selected_heap_offset = HEAP_INDEX(v90000000);
  } else if (high_nibble < 0xC) {
    // return &heaps_.vA0000000;
    selected_heap_offset = HEAP_INDEX(vA0000000);
  } else if (high_nibble < 0xE) {
    // return &heaps_.vC0000000;
    selected_heap_offset = HEAP_INDEX(vC0000000);
  } else if (address < 0xFFD00000) {
    // return &heaps_.vE0000000;
    selected_heap_offset = HEAP_INDEX(vE0000000);
  } else {
    // return nullptr;
    heap_select = nullptr;
  }
  return reinterpret_cast<const BaseHeap*>(selected_heap_offset + heap_select);
}
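
// When the address falls in an unmapped range, heap_select is nulled while
// selected_heap_offset stays 0, so the pointer arithmetic above still yields
// nullptr, matching the straightforward #else implementation below.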
#else
XE_NOALIAS
const BaseHeap* Memory::LookupHeap(uint32_t address) const {
  if (address < 0x40000000) {
    return &heaps_.v00000000;
  } else if (address < 0x7F000000) {
    return &heaps_.v40000000;
  } else if (address < 0x80000000) {
    return nullptr;
  } else if (address < 0x90000000) {
    return &heaps_.v80000000;
  } else if (address < 0xA0000000) {
    return &heaps_.v90000000;
  } else if (address < 0xC0000000) {
    return &heaps_.vA0000000;
  } else if (address < 0xE0000000) {
    return &heaps_.vC0000000;
  } else if (address < 0xFFD00000) {
    return &heaps_.vE0000000;
  } else {
    return nullptr;
  }
}
#endif
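
// For example, LookupHeap(0x82010000) resolves to heaps_.v80000000, while
// addresses in the unmapped 0x7F000000-0x7FFFFFFF gap or at/above 0xFFD00000
// return nullptr, so callers should null-check the result.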

BaseHeap* Memory::LookupHeapByType(bool physical, uint32_t page_size) {
  if (physical) {
    if (page_size <= 4096) {
      return &heaps_.vE0000000;
    } else if (page_size <= 64 * 1024) {
      return &heaps_.vA0000000;
    } else {
      return &heaps_.vC0000000;
    }
  } else {
    if (page_size <= 4096) {
      return &heaps_.v00000000;
    } else {
      return &heaps_.v40000000;
    }
  }
}

VirtualHeap* Memory::GetPhysicalHeap() { return &heaps_.physical; }
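
// Note: GetHeapsPageStatsSummary below accumulates its counters with "+=",
// so callers are expected to zero-initialize unreserved_pages, reserved_pages,
// used_pages and reserved_bytes before calling it.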

void Memory::GetHeapsPageStatsSummary(const BaseHeap* const* provided_heaps,
                                      size_t heaps_count,
                                      uint32_t& unreserved_pages,
                                      uint32_t& reserved_pages,
                                      uint32_t& used_pages,
                                      uint32_t& reserved_bytes) {
  auto lock = global_critical_region_.Acquire();
  for (size_t i = 0; i < heaps_count; i++) {
    const BaseHeap* heap = provided_heaps[i];
    uint32_t heap_unreserved_pages = heap->unreserved_page_count();
    uint32_t heap_reserved_pages = heap->reserved_page_count();

    unreserved_pages += heap_unreserved_pages;
    reserved_pages += heap_reserved_pages;
    used_pages += ((heap->total_page_count() - heap_unreserved_pages) *
                   heap->page_size()) /
                  4096;
    reserved_bytes += heap_reserved_pages * heap->page_size();
  }
}

uint32_t Memory::HostToGuestVirtual(const void* host_address) const {
  size_t virtual_address = reinterpret_cast<size_t>(host_address) -
                           reinterpret_cast<size_t>(virtual_membase_);
  uint32_t vE0000000_host_offset = heaps_.vE0000000.host_address_offset();
  size_t vE0000000_host_base =
      size_t(heaps_.vE0000000.heap_base()) + vE0000000_host_offset;
  if (virtual_address >= vE0000000_host_base &&
      virtual_address <=
          (vE0000000_host_base + (heaps_.vE0000000.heap_size() - 1))) {
    virtual_address -= vE0000000_host_offset;
  }
  return uint32_t(virtual_address);
}

uint32_t Memory::HostToGuestVirtualThunk(const void* context,
                                         const void* host_address) {
  return reinterpret_cast<const Memory*>(context)->HostToGuestVirtual(
      host_address);
}

uint32_t Memory::GetPhysicalAddress(uint32_t address) const {
  const BaseHeap* heap = LookupHeap(address);
  if (!heap || heap->heap_type() != HeapType::kGuestPhysical) {
    return UINT32_MAX;
  }
  return static_cast<const PhysicalHeap*>(heap)->GetPhysicalAddress(address);
}
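
// For example (per the memory map above, where 0xA0000000 aliases physical
// 0x00000000), GetPhysicalAddress(0xA0010000) yields 0x00010000, while a
// non-physical address such as 0x80000000 yields UINT32_MAX.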

void Memory::Zero(uint32_t address, uint32_t size) {
  std::memset(TranslateVirtual(address), 0, size);
}

void Memory::Fill(uint32_t address, uint32_t size, uint8_t value) {
  std::memset(TranslateVirtual(address), value, size);
}

void Memory::Copy(uint32_t dest, uint32_t src, uint32_t size) {
  uint8_t* pdest = TranslateVirtual(dest);
  const uint8_t* psrc = TranslateVirtual(src);
  std::memcpy(pdest, psrc, size);
}

uint32_t Memory::SearchAligned(uint32_t start, uint32_t end,
                               const uint32_t* values, size_t value_count) {
  assert_true(start <= end);
  auto p = TranslateVirtual<const uint32_t*>(start);
  auto pe = TranslateVirtual<const uint32_t*>(end);
  while (p != pe) {
    if (*p == values[0]) {
      const uint32_t* pc = p + 1;
      size_t matched = 1;
      for (size_t n = 1; n < value_count; n++, pc++) {
        if (*pc != values[n]) {
          break;
        }
        matched++;
      }
      if (matched == value_count) {
        return HostToGuestVirtual(p);
      }
    }
    p++;
  }
  return 0;
}
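
// Example usage (illustrative guest addresses and pattern): scan part of the
// XEX region for a two-word pattern and get the guest address of the first
// hit, or 0 if none is found:
//   const uint32_t pattern[] = {0x38600000, 0x4E800020};
//   uint32_t hit = memory->SearchAligned(0x82000000, 0x82100000, pattern,
//                                        xe::countof(pattern));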

bool Memory::AddVirtualMappedRange(uint32_t virtual_address, uint32_t mask,
                                   uint32_t size, void* context,
                                   cpu::MMIOReadCallback read_callback,
                                   cpu::MMIOWriteCallback write_callback) {
  if (!xe::memory::AllocFixed(TranslateVirtual(virtual_address), size,
                              xe::memory::AllocationType::kCommit,
                              xe::memory::PageAccess::kNoAccess)) {
    XELOGE("Unable to map range; commit/protect failed");
    return false;
  }
  return mmio_handler_->RegisterRange(virtual_address, mask, size, context,
                                      read_callback, write_callback);
}
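
// Sketch of a caller (hypothetical range and callbacks): a subsystem wanting
// 64 KiB of MMIO-trapped guest address space might register it as
//   memory->AddVirtualMappedRange(0x7FC80000, 0xFFFF0000, 0x0000FFFF, this,
//                                 MMIOReadThunk, MMIOWriteThunk);
// The pages are committed with no access, so every guest read/write faults
// into the registered callbacks.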

cpu::MMIORange* Memory::LookupVirtualMappedRange(uint32_t virtual_address) {
  return mmio_handler_->LookupRange(virtual_address);
}
|
|
|
|
|
2020-02-15 18:35:24 +00:00
|
|
|
bool Memory::AccessViolationCallback(
|
use Sleep(0) instead of SwitchToThread, should waste less power and help the os with scheduling.
PM4 buffer handling made a virtual member of commandprocessor, place the implementation/declaration into reusable macro files. this is probably the biggest boost here.
Optimized SET_CONSTANT/ LOAD_CONSTANT pm4 ops based on the register range they start writing at, this was also a nice boost
Expose X64 extension flags to code outside of x64 backend, so we can detect and use things like avx512, xop, avx2, etc in normal code
Add freelists for HIR structures to try to reduce the number of last level cache misses during optimization (currently disabled... fixme later)
Analyzed PGO feedback and reordered branches, uninlined functions, moved code out into different functions based on info from it in the PM4 functions, this gave like a 2% boost at best.
Added support for the db16cyc opcode, which is used often in xb360 spinlocks. before it was just being translated to nop, now on x64 we translate it to _mm_pause but may change that in the future to reduce cpu time wasted
texture util - all our divisors were powers of 2, instead we look up a shift. this made texture scaling slightly faster, more so on intel processors which seem to be worse at int divs. GetGuestTextureLayout is now a little faster, although it is still one of the heaviest functions in the emulator when scaling is on.
xe_unlikely_mutex was not a good choice for the guest clock lock, (running theory) on intel processors another thread may take a significant time to update the clock? maybe because of the uint64 division? really not sure, but switched it to xe_mutex. This fixed audio stutter that i had introduced to 1 or 2 games, fixed performance on that n64 rare game with the monkeys.
Took another crack at DMA implementation, another failure.
Instead of passing as a parameter, keep the ringbuffer reader as the first member of commandprocessor so it can be accessed through this
Added macro for noalias
Applied noalias to Memory::LookupHeap. This reduced the size of the executable by 7 kb.
Reworked kernel shim template, this shaved like 100kb off the exe and eliminated the indirect calls from the shim to the actual implementation. We still unconditionally generate string representations of kernel calls though :(, unless it is kHighFrequency
Add nvapi extensions support, currently unused. Will use CPUVISIBLE memory at some point
Inserted prefetches in a few places based on feedback from vtune.
Add native implementation of SHA int8 if all elements are the same
Vectorized comparisons for SetViewport, SetScissorRect
Vectorized ranged comparisons for WriteRegister
Add XE_MSVC_ASSUME
Move FormatInfo::name out of the structure, instead look up the name in a different table. Debug related data and critical runtime data are best kept apart
Templated UpdateSystemConstantValues based on ROV/RTV and primitive_polygonal
Add ArchFloatMask functions, these are for storing the results of floating point comparisons without doing costly float->int pipeline transfers (vucomiss/setb)
Use floatmasks in UpdateSystemConstantValues for checking if dirty, only transfer to int at end of function.
Instead of dirty |= (x == y) in UpdateSystemConstantValues, now we do dirty_u32 |= (x^y). if any of them are not equal, dirty_u32 will be nz, else if theyre all equal it will be zero. This is more friendly to register renaming and the lack of dependencies on EFLAGS lets the compiler reorder better
Add PrefetchSamplerParameters to D3D12TextureCache
use PrefetchSamplerParameters in UpdateBindings to eliminate cache misses that vtune detected
Add PrefetchTextureBinding to D3D12TextureCache
Prefetch texture bindings to get rid of more misses vtune detected (more accesses out of order with random strides)
Rewrote DMAC, still terrible though and have disabled it for now.
Replace tiny memcmp of 6 U64 in render_target_cache with inline loop, msvc fails to make it a loop and instead does a thunk to their memcmp function, which is optimized for larger sizes
PrefetchTextureBinding in AreActiveTextureSRVKeysUpToDate
Replace memcmp calls for pipelinedescription with handwritten cmp
Directly write some registers that dont have special handling in PM4 functions
Changed EstimateMaxY to try to eliminate mispredictions that vtune was reporting, msvc ended up turning the changed code into a series of blends
in ExecutePacketType3_EVENT_WRITE_EXT, instead of writing extents to an array on the stack and then doing xe_copy_and_swap_16 of the data to its dest, pre-swap each constant and then store those. msvc manages to unroll that into wider stores
stop logging XE_SWAP every time we receive XE_SWAP, stop logging the start and end of each viz query
Prefetch watch nodes in FireWatches based on feedback from vtune
Removed dead code from texture_info.cc
NOINLINE on GpuSwap, PGO builds did it so we should too.
2022-09-11 21:14:48 +00:00
|
|
|
global_unique_lock_type global_lock_locked_once, void* host_address,
|
|
|
|
bool is_write) {
|
2020-02-15 18:35:24 +00:00
|
|
|
// Access via physical_membase_ is special, when need to bypass everything
|
|
|
|
// (for instance, for a data provider to actually write the data) so only
|
|
|
|
// triggering callbacks on virtual memory regions.
|
2019-08-13 20:49:49 +00:00
|
|
|
if (reinterpret_cast<size_t>(host_address) <
|
|
|
|
reinterpret_cast<size_t>(virtual_membase_) ||
|
|
|
|
reinterpret_cast<size_t>(host_address) >=
|
|
|
|
reinterpret_cast<size_t>(physical_membase_)) {
|
2019-07-30 05:00:20 +00:00
|
|
|
return false;
|
|
|
|
}
|
2019-08-14 05:28:30 +00:00
|
|
|
uint32_t virtual_address = HostToGuestVirtual(host_address);
|
2019-07-30 05:00:20 +00:00
|
|
|
BaseHeap* heap = LookupHeap(virtual_address);
|
2020-09-16 18:31:53 +00:00
|
|
|
if (heap->heap_type() != HeapType::kGuestPhysical) {
|
2020-02-15 18:35:24 +00:00
|
|
|
return false;
|
2019-07-30 05:00:20 +00:00
|
|
|
}
|
|
|
|
|
2020-02-15 18:35:24 +00:00
|
|
|
// Access violation callbacks from the guest are triggered when the global
|
|
|
|
// critical region mutex is locked once.
|
|
|
|
//
|
|
|
|
// Will be rounded to physical page boundaries internally, so just pass 1 as
|
|
|
|
// the length - a single byte is also guaranteed not to cross page boundaries.
|
|
|
|
auto physical_heap = static_cast<PhysicalHeap*>(heap);
|
|
|
|
return physical_heap->TriggerCallbacks(std::move(global_lock_locked_once),
|
|
|
|
virtual_address, 1, is_write, false);
|
2019-07-30 05:00:20 +00:00
|
|
|
}
|
|
|
|
|
2020-02-15 18:35:24 +00:00
|
|
|
bool Memory::AccessViolationCallbackThunk(
|
2022-09-11 21:14:48 +00:00
|
|
|
global_unique_lock_type global_lock_locked_once, void* context,
|
|
|
|
void* host_address, bool is_write) {
|
2019-07-30 05:00:20 +00:00
|
|
|
return reinterpret_cast<Memory*>(context)->AccessViolationCallback(
|
2020-02-15 18:35:24 +00:00
|
|
|
std::move(global_lock_locked_once), host_address, is_write);
|
2019-07-30 05:00:20 +00:00
|
|
|
}
|
|
|
|
|
2020-02-15 18:35:24 +00:00
|
|
|
bool Memory::TriggerPhysicalMemoryCallbacks(
|
2022-09-11 21:14:48 +00:00
|
|
|
global_unique_lock_type global_lock_locked_once, uint32_t virtual_address,
|
|
|
|
uint32_t length, bool is_write, bool unwatch_exact_range, bool unprotect) {
|
2019-07-30 06:06:23 +00:00
|
|
|
BaseHeap* heap = LookupHeap(virtual_address);
|
2020-09-16 18:31:53 +00:00
|
|
|
if (heap->heap_type() == HeapType::kGuestPhysical) {
|
2020-02-15 18:35:24 +00:00
|
|
|
auto physical_heap = static_cast<PhysicalHeap*>(heap);
|
|
|
|
return physical_heap->TriggerCallbacks(std::move(global_lock_locked_once),
|
|
|
|
virtual_address, length, is_write,
|
|
|
|
unwatch_exact_range, unprotect);
|
2019-07-30 06:06:23 +00:00
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2020-02-15 18:35:24 +00:00
|
|
|
void* Memory::RegisterPhysicalMemoryInvalidationCallback(
|
|
|
|
PhysicalMemoryInvalidationCallback callback, void* callback_context) {
|
|
|
|
auto entry = new std::pair<PhysicalMemoryInvalidationCallback, void*>(
|
|
|
|
callback, callback_context);
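// The heap-allocated pair doubles as the opaque handle returned to the
// caller; UnregisterPhysicalMemoryInvalidationCallback finds it in the list
// and deletes it.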
|
2019-07-30 05:00:20 +00:00
|
|
|
auto lock = global_critical_region_.Acquire();
|
2020-02-15 18:35:24 +00:00
|
|
|
physical_memory_invalidation_callbacks_.push_back(entry);
|
2019-07-30 05:00:20 +00:00
|
|
|
return entry;
|
2018-07-26 19:52:26 +00:00
|
|
|
}
|
|
|
|
|
2020-02-15 18:35:24 +00:00
|
|
|
void Memory::UnregisterPhysicalMemoryInvalidationCallback(
|
|
|
|
void* callback_handle) {
|
|
|
|
auto entry =
|
|
|
|
reinterpret_cast<std::pair<PhysicalMemoryInvalidationCallback, void*>*>(
|
|
|
|
callback_handle);
|
2019-07-30 05:00:20 +00:00
|
|
|
{
|
|
|
|
auto lock = global_critical_region_.Acquire();
|
2020-02-15 18:35:24 +00:00
|
|
|
auto it = std::find(physical_memory_invalidation_callbacks_.begin(),
|
|
|
|
physical_memory_invalidation_callbacks_.end(), entry);
|
|
|
|
assert_true(it != physical_memory_invalidation_callbacks_.end());
|
|
|
|
if (it != physical_memory_invalidation_callbacks_.end()) {
|
|
|
|
physical_memory_invalidation_callbacks_.erase(it);
|
2019-07-30 05:00:20 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
delete entry;
|
2018-07-26 19:52:26 +00:00
|
|
|
}
|
|
|
|
|
2020-02-15 18:35:24 +00:00
|
|
|
void Memory::EnablePhysicalMemoryAccessCallbacks(
|
|
|
|
uint32_t physical_address, uint32_t length,
|
|
|
|
bool enable_invalidation_notifications, bool enable_data_providers) {
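// The same physical range is mirrored into all three physical-view heaps
// (0xA0000000, 0xC0000000, 0xE0000000), so callbacks are enabled on each.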
|
|
|
|
heaps_.vA0000000.EnableAccessCallbacks(physical_address, length,
|
|
|
|
enable_invalidation_notifications,
|
|
|
|
enable_data_providers);
|
|
|
|
heaps_.vC0000000.EnableAccessCallbacks(physical_address, length,
|
|
|
|
enable_invalidation_notifications,
|
|
|
|
enable_data_providers);
|
|
|
|
heaps_.vE0000000.EnableAccessCallbacks(physical_address, length,
|
|
|
|
enable_invalidation_notifications,
|
|
|
|
enable_data_providers);
|
2018-07-26 19:52:26 +00:00
|
|
|
}
|
|
|
|
|
2015-03-28 22:54:44 +00:00
|
|
|
uint32_t Memory::SystemHeapAlloc(uint32_t size, uint32_t alignment,
|
|
|
|
uint32_t system_heap_flags) {
|
|
|
|
// TODO(benvanik): lightweight pool.
|
|
|
|
bool is_physical = !!(system_heap_flags & kSystemHeapPhysical);
|
2019-08-16 18:11:55 +00:00
|
|
|
auto heap = LookupHeapByType(is_physical, 4096);
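// System allocations always come from a 4 KiB-page heap; the flag only
// selects between the virtual and physical variants.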
|
2015-05-16 07:23:13 +00:00
|
|
|
uint32_t address;
|
2022-02-28 13:20:55 +00:00
|
|
|
if (!heap->AllocSystemHeap(
|
|
|
|
size, alignment, kMemoryAllocationReserve | kMemoryAllocationCommit,
|
2022-06-17 20:23:39 +00:00
|
|
|
kMemoryProtectRead | kMemoryProtectWrite, false, &address)) {
|
2015-05-16 07:23:13 +00:00
|
|
|
return 0;
|
2015-03-28 22:54:44 +00:00
|
|
|
}
|
2015-05-19 01:48:48 +00:00
|
|
|
Zero(address, size);
|
2015-05-16 07:23:13 +00:00
|
|
|
return address;
|
2015-03-28 22:54:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void Memory::SystemHeapFree(uint32_t address) {
|
|
|
|
if (!address) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
// TODO(benvanik): lightweight pool.
|
2015-06-05 02:18:00 +00:00
|
|
|
auto heap = LookupHeap(address);
|
2015-05-16 07:23:13 +00:00
|
|
|
heap->Release(address);
|
|
|
|
}
|
|
|
|
|
|
|
|
void Memory::DumpMap() {
|
|
|
|
XELOGE("==================================================================");
|
|
|
|
XELOGE("Memory Dump");
|
|
|
|
XELOGE("==================================================================");
|
2020-02-28 20:30:48 +00:00
|
|
|
XELOGE(" System Page Size: {0} ({0:08X})", system_page_size_);
|
|
|
|
XELOGE(" System Allocation Granularity: {0} ({0:08X})",
|
|
|
|
system_allocation_granularity_);
|
|
|
|
XELOGE(" Virtual Membase: {}", virtual_membase_);
|
|
|
|
XELOGE(" Physical Membase: {}", physical_membase_);
|
2015-05-16 07:23:13 +00:00
|
|
|
XELOGE("");
|
|
|
|
XELOGE("------------------------------------------------------------------");
|
|
|
|
XELOGE("Virtual Heaps");
|
|
|
|
XELOGE("------------------------------------------------------------------");
|
|
|
|
XELOGE("");
|
|
|
|
heaps_.v00000000.DumpMap();
|
|
|
|
heaps_.v40000000.DumpMap();
|
|
|
|
heaps_.v80000000.DumpMap();
|
|
|
|
heaps_.v90000000.DumpMap();
|
|
|
|
XELOGE("");
|
|
|
|
XELOGE("------------------------------------------------------------------");
|
|
|
|
XELOGE("Physical Heaps");
|
|
|
|
XELOGE("------------------------------------------------------------------");
|
|
|
|
XELOGE("");
|
|
|
|
heaps_.physical.DumpMap();
|
|
|
|
heaps_.vA0000000.DumpMap();
|
|
|
|
heaps_.vC0000000.DumpMap();
|
|
|
|
heaps_.vE0000000.DumpMap();
|
|
|
|
XELOGE("");
|
|
|
|
}
|
|
|
|
|
2015-12-01 23:26:55 +00:00
|
|
|
bool Memory::Save(ByteStream* stream) {
|
|
|
|
XELOGD("Serializing memory...");
|
|
|
|
heaps_.v00000000.Save(stream);
|
|
|
|
heaps_.v40000000.Save(stream);
|
|
|
|
heaps_.v80000000.Save(stream);
|
|
|
|
heaps_.v90000000.Save(stream);
|
|
|
|
heaps_.physical.Save(stream);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Memory::Restore(ByteStream* stream) {
|
2016-03-18 02:55:16 +00:00
|
|
|
XELOGD("Restoring memory...");
|
2015-12-01 23:26:55 +00:00
|
|
|
heaps_.v00000000.Restore(stream);
|
|
|
|
heaps_.v40000000.Restore(stream);
|
|
|
|
heaps_.v80000000.Restore(stream);
|
|
|
|
heaps_.v90000000.Restore(stream);
|
|
|
|
heaps_.physical.Restore(stream);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-07-16 01:20:05 +00:00
|
|
|
xe::memory::PageAccess ToPageAccess(uint32_t protect) {
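// Maps guest protection bits to host page access, e.g.
// kMemoryProtectRead | kMemoryProtectWrite -> kReadWrite; a write-only or
// empty protection falls through to kNoAccess.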
|
|
|
|
if ((protect & kMemoryProtectRead) && !(protect & kMemoryProtectWrite)) {
|
|
|
|
return xe::memory::PageAccess::kReadOnly;
|
|
|
|
} else if ((protect & kMemoryProtectRead) &&
|
|
|
|
(protect & kMemoryProtectWrite)) {
|
|
|
|
return xe::memory::PageAccess::kReadWrite;
|
|
|
|
} else {
|
|
|
|
return xe::memory::PageAccess::kNoAccess;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-07-25 02:41:47 +00:00
|
|
|
uint32_t FromPageAccess(xe::memory::PageAccess protect) {
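// Inverse of ToPageAccess; execute access collapses to its read/write bits,
// since guest pages are never mapped executable (asserted below).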
|
|
|
|
switch (protect) {
|
|
|
|
case memory::PageAccess::kNoAccess:
|
|
|
|
return kMemoryProtectNoAccess;
|
|
|
|
case memory::PageAccess::kReadOnly:
|
|
|
|
return kMemoryProtectRead;
|
|
|
|
case memory::PageAccess::kReadWrite:
|
|
|
|
return kMemoryProtectRead | kMemoryProtectWrite;
|
2020-11-24 19:18:50 +00:00
|
|
|
case memory::PageAccess::kExecuteReadOnly:
|
|
|
|
// Guest memory cannot be executable - this should never happen :)
|
|
|
|
assert_always();
|
|
|
|
return kMemoryProtectRead;
|
2017-07-25 02:41:47 +00:00
|
|
|
case memory::PageAccess::kExecuteReadWrite:
|
|
|
|
// Guest memory cannot be executable - this should never happen :)
|
|
|
|
assert_always();
|
|
|
|
return kMemoryProtectRead | kMemoryProtectWrite;
|
|
|
|
}
|
|
|
|
|
|
|
|
return kMemoryProtectNoAccess;
|
|
|
|
}
|
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
BaseHeap::BaseHeap()
|
|
|
|
: membase_(nullptr), heap_base_(0), heap_size_(0), page_size_(0) {}
|
|
|
|
|
|
|
|
BaseHeap::~BaseHeap() = default;
|
|
|
|
|
2020-09-16 18:09:32 +00:00
|
|
|
void BaseHeap::Initialize(Memory* memory, uint8_t* membase, HeapType heap_type,
|
|
|
|
uint32_t heap_base, uint32_t heap_size,
|
|
|
|
uint32_t page_size, uint32_t host_address_offset) {
|
2019-07-30 05:00:20 +00:00
|
|
|
memory_ = memory;
|
2015-05-16 07:23:13 +00:00
|
|
|
membase_ = membase;
|
2020-09-16 18:09:32 +00:00
|
|
|
heap_type_ = heap_type;
|
2015-05-16 07:23:13 +00:00
|
|
|
heap_base_ = heap_base;
|
2020-02-22 11:55:28 +00:00
|
|
|
heap_size_ = heap_size;
|
2015-05-16 07:23:13 +00:00
|
|
|
page_size_ = page_size;
|
2022-09-17 11:04:53 +00:00
|
|
|
xenia_assert(xe::is_pow2(page_size_));
|
|
|
|
page_size_shift_ = xe::log2_floor(page_size_);
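// With the power-of-two assert above the shift is exact: for 4 KiB pages
// page_size_shift_ is 12, so address >> page_size_shift_ replaces a divide
// by page_size_ elsewhere in the heap code.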
|
2019-08-04 20:55:54 +00:00
|
|
|
host_address_offset_ = host_address_offset;
|
2015-05-16 07:23:13 +00:00
|
|
|
page_table_.resize(heap_size / page_size);
|
2021-05-30 10:52:34 +00:00
|
|
|
unreserved_page_count_ = uint32_t(page_table_.size());
|
2015-05-16 07:23:13 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void BaseHeap::Dispose() {
|
|
|
|
// Walk table and release all regions.
|
|
|
|
for (uint32_t page_number = 0; page_number < page_table_.size();
|
|
|
|
++page_number) {
|
|
|
|
auto& page_entry = page_table_[page_number];
|
|
|
|
if (page_entry.state) {
|
2019-08-14 21:31:21 +00:00
|
|
|
xe::memory::DeallocFixed(TranslateRelative(page_number * page_size_), 0,
|
|
|
|
xe::memory::DeallocationType::kRelease);
|
2015-05-16 07:23:13 +00:00
|
|
|
page_number += page_entry.region_page_count;
|
2013-10-20 20:42:34 +00:00
|
|
|
}
|
2015-05-16 07:23:13 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void BaseHeap::DumpMap() {
|
2015-09-06 16:30:54 +00:00
|
|
|
auto global_lock = global_critical_region_.Acquire();
|
2015-05-16 07:23:13 +00:00
|
|
|
XELOGE("------------------------------------------------------------------");
|
2020-02-28 20:30:48 +00:00
|
|
|
XELOGE("Heap: {:08X}-{:08X}", heap_base_, heap_base_ + (heap_size_ - 1));
|
2015-05-16 07:23:13 +00:00
|
|
|
XELOGE("------------------------------------------------------------------");
|
2020-02-28 20:30:48 +00:00
|
|
|
XELOGE(" Heap Base: {:08X}", heap_base_);
|
|
|
|
XELOGE(" Heap Size: {0} ({0:08X})", heap_size_);
|
|
|
|
XELOGE(" Page Size: {0} ({0:08X})", page_size_);
|
|
|
|
XELOGE(" Page Count: {}", page_table_.size());
|
|
|
|
XELOGE(" Host Address Offset: {0} ({0:08X})", host_address_offset_);
|
2015-05-16 07:23:13 +00:00
|
|
|
bool is_empty_span = false;
|
|
|
|
uint32_t empty_span_start = 0;
|
|
|
|
for (uint32_t i = 0; i < uint32_t(page_table_.size()); ++i) {
|
|
|
|
auto& page = page_table_[i];
|
|
|
|
if (!page.state) {
|
|
|
|
if (!is_empty_span) {
|
|
|
|
is_empty_span = true;
|
|
|
|
empty_span_start = i;
|
2013-10-23 04:50:10 +00:00
|
|
|
}
|
2015-05-16 07:23:13 +00:00
|
|
|
continue;
|
2013-10-23 04:50:10 +00:00
|
|
|
}
|
2015-05-16 07:23:13 +00:00
|
|
|
if (is_empty_span) {
|
2020-02-28 20:30:48 +00:00
|
|
|
XELOGE(" {:08X}-{:08X} {:6d}p {:10d}b unreserved",
|
2015-05-16 07:23:13 +00:00
|
|
|
heap_base_ + empty_span_start * page_size_,
|
|
|
|
heap_base_ + i * page_size_, i - empty_span_start,
|
|
|
|
(i - empty_span_start) * page_size_);
|
|
|
|
is_empty_span = false;
|
2013-05-30 04:00:55 +00:00
|
|
|
}
|
2015-05-16 07:23:13 +00:00
|
|
|
const char* state_name = " ";
|
|
|
|
if (page.state & kMemoryAllocationCommit) {
|
|
|
|
state_name = "COM";
|
|
|
|
} else if (page.state & kMemoryAllocationReserve) {
|
|
|
|
state_name = "RES";
|
2013-05-30 04:00:55 +00:00
|
|
|
}
|
2015-05-16 07:23:13 +00:00
|
|
|
char access_r = (page.current_protect & kMemoryProtectRead) ? 'R' : ' ';
|
|
|
|
char access_w = (page.current_protect & kMemoryProtectWrite) ? 'W' : ' ';
|
2020-02-28 20:30:48 +00:00
|
|
|
XELOGE(" {:08X}-{:08X} {:6d}p {:10d}b {} {}{}",
|
|
|
|
heap_base_ + i * page_size_,
|
2015-05-16 07:23:13 +00:00
|
|
|
heap_base_ + (i + page.region_page_count) * page_size_,
|
|
|
|
page.region_page_count, page.region_page_count * page_size_,
|
|
|
|
state_name, access_r, access_w);
|
2015-05-17 17:17:32 +00:00
|
|
|
i += page.region_page_count - 1;
|
2015-05-16 07:23:13 +00:00
|
|
|
}
|
|
|
|
if (is_empty_span) {
|
2020-02-28 20:30:48 +00:00
|
|
|
XELOGE(" {:08X}-{:08X} - {} unreserved pages)",
|
2020-02-22 11:55:28 +00:00
|
|
|
heap_base_ + empty_span_start * page_size_,
|
|
|
|
heap_base_ + (heap_size_ - 1),
|
2015-05-16 07:23:13 +00:00
|
|
|
page_table_.size() - empty_span_start);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-12-01 23:26:55 +00:00
|
|
|
bool BaseHeap::Save(ByteStream* stream) {
|
2020-02-28 20:30:48 +00:00
|
|
|
XELOGD("Heap {:08X}-{:08X}", heap_base_, heap_base_ + (heap_size_ - 1));
|
2015-12-01 23:26:55 +00:00
|
|
|
|
|
|
|
for (size_t i = 0; i < page_table_.size(); i++) {
|
|
|
|
auto& page = page_table_[i];
|
|
|
|
stream->Write(page.qword);
|
|
|
|
if (!page.state) {
|
|
|
|
// Unallocated.
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2015-12-30 04:26:01 +00:00
|
|
|
// TODO(DrChat): write compressed with snappy.
|
2015-12-07 00:02:59 +00:00
|
|
|
if (page.state & kMemoryAllocationCommit) {
|
2019-08-14 21:31:21 +00:00
|
|
|
void* addr = TranslateRelative(i * page_size_);
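// Temporarily grant read/write so the stream write below cannot fault on a
// protected page; the original protection is restored immediately after.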
|
2015-12-07 00:02:59 +00:00
|
|
|
|
|
|
|
memory::PageAccess old_access;
|
|
|
|
memory::Protect(addr, page_size_, memory::PageAccess::kReadWrite,
|
|
|
|
&old_access);
|
|
|
|
|
|
|
|
stream->Write(addr, page_size_);
|
|
|
|
|
|
|
|
memory::Protect(addr, page_size_, old_access, nullptr);
|
2015-12-01 23:26:55 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool BaseHeap::Restore(ByteStream* stream) {
|
2020-02-28 20:30:48 +00:00
|
|
|
XELOGD("Heap {:08X}-{:08X}", heap_base_, heap_base_ + (heap_size_ - 1));
|
2016-03-18 02:55:16 +00:00
|
|
|
|
2015-12-01 23:26:55 +00:00
|
|
|
for (size_t i = 0; i < page_table_.size(); i++) {
|
|
|
|
auto& page = page_table_[i];
|
|
|
|
page.qword = stream->Read<uint64_t>();
|
|
|
|
if (!page.state) {
|
|
|
|
// Unallocated.
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
memory::PageAccess page_access = memory::PageAccess::kNoAccess;
|
2015-12-07 16:53:45 +00:00
|
|
|
if ((page.current_protect & kMemoryProtectRead) &&
|
|
|
|
(page.current_protect & kMemoryProtectWrite)) {
|
2015-12-01 23:26:55 +00:00
|
|
|
page_access = memory::PageAccess::kReadWrite;
|
|
|
|
} else if (page.current_protect & kMemoryProtectRead) {
|
|
|
|
page_access = memory::PageAccess::kReadOnly;
|
|
|
|
}
|
|
|
|
|
2015-12-07 00:02:59 +00:00
|
|
|
// Commit the memory if it isn't already. We do not need to reserve any
|
|
|
|
// memory, as the mapping has already taken care of that.
|
|
|
|
if (page.state & kMemoryAllocationCommit) {
|
2019-08-14 21:31:21 +00:00
|
|
|
xe::memory::AllocFixed(TranslateRelative(i * page_size_), page_size_,
|
2015-12-07 00:02:59 +00:00
|
|
|
memory::AllocationType::kCommit,
|
|
|
|
memory::PageAccess::kReadWrite);
|
2015-12-01 23:26:55 +00:00
|
|
|
}
|
|
|
|
|
2015-12-07 00:02:59 +00:00
|
|
|
// Now read into memory. We'll set R/W protection first, then set the
|
|
|
|
// protection back to its previous state.
|
2015-12-30 04:26:01 +00:00
|
|
|
// TODO(DrChat): read compressed with snappy.
|
2015-12-07 00:02:59 +00:00
|
|
|
if (page.state & kMemoryAllocationCommit) {
|
2019-08-14 21:31:21 +00:00
|
|
|
void* addr = TranslateRelative(i * page_size_);
|
2015-12-07 00:02:59 +00:00
|
|
|
xe::memory::Protect(addr, page_size_, memory::PageAccess::kReadWrite,
|
|
|
|
nullptr);
|
|
|
|
|
|
|
|
stream->Read(addr, page_size_);
|
2015-12-01 23:26:55 +00:00
|
|
|
|
2015-12-07 00:02:59 +00:00
|
|
|
xe::memory::Protect(addr, page_size_, page_access, nullptr);
|
2015-12-01 23:26:55 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
void BaseHeap::Reset() {
|
2015-12-30 04:26:01 +00:00
|
|
|
// TODO(DrChat): protect pages.
|
|
|
|
std::memset(page_table_.data(), 0, sizeof(PageEntry) * page_table_.size());
|
2020-02-15 18:35:24 +00:00
|
|
|
// TODO(Triang3l): Remove access callbacks from pages if this is a physical
|
|
|
|
// memory heap.
|
2015-12-01 23:26:55 +00:00
|
|
|
}
|
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
bool BaseHeap::Alloc(uint32_t size, uint32_t alignment,
|
|
|
|
uint32_t allocation_type, uint32_t protect, bool top_down,
|
|
|
|
uint32_t* out_address) {
|
|
|
|
*out_address = 0;
|
|
|
|
size = xe::round_up(size, page_size_);
|
|
|
|
alignment = xe::round_up(alignment, page_size_);
|
2022-02-14 18:26:31 +00:00
|
|
|
uint32_t heap_virtual_guest_offset = 0;
|
|
|
|
if (heap_type_ == HeapType::kGuestVirtual) {
|
|
|
|
heap_virtual_guest_offset = 0x10000000;
|
|
|
|
}
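// For guest-virtual heaps the top 256 MiB (0x10000000 bytes) is excluded
// from the search range below; high_address is pulled down accordingly.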
|
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
uint32_t low_address = heap_base_;
|
2021-08-03 10:14:48 +00:00
|
|
|
uint32_t high_address =
|
2022-02-14 18:26:31 +00:00
|
|
|
heap_base_ + (heap_size_ - 1) - heap_virtual_guest_offset;
|
2015-05-16 07:23:13 +00:00
|
|
|
return AllocRange(low_address, high_address, size, alignment, allocation_type,
|
|
|
|
protect, top_down, out_address);
|
|
|
|
}
|
2013-05-30 04:00:55 +00:00
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
bool BaseHeap::AllocFixed(uint32_t base_address, uint32_t size,
|
|
|
|
uint32_t alignment, uint32_t allocation_type,
|
|
|
|
uint32_t protect) {
|
|
|
|
alignment = xe::round_up(alignment, page_size_);
|
|
|
|
size = xe::align(size, alignment);
|
|
|
|
assert_true(base_address % alignment == 0);
|
|
|
|
uint32_t page_count = get_page_count(size, page_size_);
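// e.g. with 4 KiB pages, size=0x1800 and alignment=0x1000: size aligns up to
// 0x2000 and page_count becomes 2.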
|
|
|
|
uint32_t start_page_number = (base_address - heap_base_) / page_size_;
|
|
|
|
uint32_t end_page_number = start_page_number + page_count - 1;
|
|
|
|
if (start_page_number >= page_table_.size() ||
|
|
|
|
end_page_number > page_table_.size()) {
|
2015-05-30 04:47:19 +00:00
|
|
|
XELOGE("BaseHeap::AllocFixed passed out of range address range");
|
2015-05-16 07:23:13 +00:00
|
|
|
return false;
|
|
|
|
}
|
2013-05-30 04:00:55 +00:00
|
|
|
|
2015-09-06 16:30:54 +00:00
|
|
|
auto global_lock = global_critical_region_.Acquire();
|
2015-05-16 07:23:13 +00:00
|
|
|
|
|
|
|
// - If we are reserving, the entire requested range must not already be
|
|
|
|
// reserved.
|
|
|
|
// - If we are committing, it's OK for pages within the range to already be
|
|
|
|
// committed.
|
|
|
|
for (uint32_t page_number = start_page_number; page_number <= end_page_number;
|
|
|
|
++page_number) {
|
|
|
|
uint32_t state = page_table_[page_number].state;
|
|
|
|
if ((allocation_type == kMemoryAllocationReserve) && state) {
|
|
|
|
// Already reserved.
|
2015-05-30 04:47:19 +00:00
|
|
|
XELOGE(
|
|
|
|
"BaseHeap::AllocFixed attempting to reserve an already reserved "
|
|
|
|
"range");
|
2015-05-16 07:23:13 +00:00
|
|
|
return false;
|
2013-05-30 04:00:55 +00:00
|
|
|
}
|
2015-05-16 07:23:13 +00:00
|
|
|
if ((allocation_type == kMemoryAllocationCommit) &&
|
|
|
|
!(state & kMemoryAllocationReserve)) {
|
|
|
|
// Attempting a commit-only op on an unreserved page.
|
2015-05-30 04:47:19 +00:00
|
|
|
// This may be OK.
|
|
|
|
XELOGW("BaseHeap::AllocFixed attempting commit on unreserved page");
|
|
|
|
allocation_type |= kMemoryAllocationReserve;
|
|
|
|
break;
|
2015-05-16 07:23:13 +00:00
|
|
|
}
|
|
|
|
}
|
2013-01-29 05:36:03 +00:00
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
// Allocate from host.
|
|
|
|
if (allocation_type == kMemoryAllocationReserve) {
|
|
|
|
// Reserve is not needed, as we are mapped already.
|
|
|
|
} else {
|
2015-07-16 02:05:08 +00:00
|
|
|
auto alloc_type = (allocation_type & kMemoryAllocationCommit)
|
|
|
|
? xe::memory::AllocationType::kCommit
|
|
|
|
: xe::memory::AllocationType::kReserve;
|
|
|
|
void* result = xe::memory::AllocFixed(
|
2019-08-14 21:31:21 +00:00
|
|
|
TranslateRelative(start_page_number * page_size_),
|
2015-07-16 02:05:08 +00:00
|
|
|
page_count * page_size_, alloc_type, ToPageAccess(protect));
|
2015-05-16 07:23:13 +00:00
|
|
|
if (!result) {
|
2015-05-30 04:47:19 +00:00
|
|
|
XELOGE("BaseHeap::AllocFixed failed to alloc range from host");
|
2015-05-16 07:23:13 +00:00
|
|
|
return false;
|
2013-10-23 04:50:10 +00:00
|
|
|
}
|
2015-05-16 23:41:18 +00:00
|
|
|
|
2019-04-17 19:49:29 +00:00
|
|
|
if (cvars::scribble_heap && protect & kMemoryProtectWrite) {
|
2015-05-28 10:28:59 +00:00
|
|
|
std::memset(result, 0xCD, page_count * page_size_);
|
2015-05-16 23:41:18 +00:00
|
|
|
}
|
2015-05-16 07:23:13 +00:00
|
|
|
}
|
2013-10-23 04:50:10 +00:00
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
// Set page state.
|
|
|
|
for (uint32_t page_number = start_page_number; page_number <= end_page_number;
|
|
|
|
++page_number) {
|
|
|
|
auto& page_entry = page_table_[page_number];
|
|
|
|
if (allocation_type & kMemoryAllocationReserve) {
|
|
|
|
// Region is based on reservation.
|
|
|
|
page_entry.base_address = start_page_number;
|
|
|
|
page_entry.region_page_count = page_count;
|
|
|
|
}
|
|
|
|
page_entry.allocation_protect = protect;
|
|
|
|
page_entry.current_protect = protect;
|
2021-05-30 10:52:34 +00:00
|
|
|
if (!(page_entry.state & kMemoryAllocationReserve)) {
|
|
|
|
unreserved_page_count_--;
|
|
|
|
}
|
2015-05-16 07:23:13 +00:00
|
|
|
page_entry.state = kMemoryAllocationReserve | allocation_type;
|
2013-05-30 04:00:55 +00:00
|
|
|
}
|
2015-05-16 07:23:13 +00:00
|
|
|
|
|
|
|
return true;
|
2013-01-29 05:36:03 +00:00
|
|
|
}
|
"Fix" debug console, we were checking the cvar before any cvars were loaded, and the condition it checks in AttachConsole is somehow always false
Remove dead #if 0'd code in math.h
On amd64, page_size == 4096 constant, on amd64 w/ win32, allocation_granularity == 65536. These values for x86 windows havent changed over the last 20 years so this is probably safe
and gives a modest code size reduction
Enable XE_USE_KUSER_SHARED. This sources host time from KUSER_SHARED instead of from QueryPerformanceCounter, which is far faster, but only has a granularity of 100 nanoseconds.
In some games seemingly random crashes were happening that were hard to trace because
the faulting thread was actually not the one that was misbehaving, another threads stack was underflowing into the faulting thread.
Added a bunch of code to synchronize the guest stack and host stack so that if a guest longjmps the host's stack will be adjusted.
Changes were also made to allow the guest to call into a piece of an existing x64 function.
This synchronization might have a slight performance impact on lower end cpus, to disable it set enable_host_guest_stack_synchronization to false.
It is possible it may have introduced regressions, but i dont know of any yet
So far, i know the synchronization change fixes the "hub crash" in super sonic and allows the game "london 2012" to go ingame.
Removed emit_useless_fpscr_updates, not emitting these updates breaks the raiden game
MapGuestAddressToMachineCode now returns nullptr if no address was found, instead of the start of the function
add Processor::LookupModule
Add Backend::DeinitializeBackendContext
Use WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF> in WriteRegisterRangeFromRing for inlining (previously regressed on performance of ExecutePacketType0)
add notes about flags that trap in XamInputGetCapabilities
0 == 3 in XamInputGetCapabilities
Name arg 2 of XamInputSetState
PrefetchW in critical section kernel funcs if available & doing cmpxchg
Add terminated field to X_KTHREAD, set it on termination
Expanded the logic of NtResumeThread/NtSuspendThread to include checking the type of the handle (in release, LookupObject doesnt seem to do anything with the type)
and returning X_STATUS_OBJECT_TYPE_MISMATCH if invalid. Do termination check in NtSuspendThread.
Add basic host exception messagebox, need to flesh it out more (maybe use the new stack tracking stuff if on guest thrd?)
Add rdrand patching hack, mostly affects users with nvidia cards who have many threads on zen
Use page_size_shift in more places
Once again disable precompilation! Raiden is mostly weird ppc asm which probably breaks the precompilation. The code is still useful for running the compiler over the whole of an xex in debug to test for issues
"Fix" debug console, we were checking the cvar before any cvars were loaded, and the condition it checks in AttachConsole is somehow always false
Remove dead #if 0'd code in math.h
On amd64, page_size == 4096 constant, on amd64 w/ win32, allocation_granularity == 65536. These values for x86 windows havent changed over the last 20 years so this is probably safe
and gives a modest code size reduction
Enable XE_USE_KUSER_SHARED. This sources host time from KUSER_SHARED instead of from QueryPerformanceCounter, which is far faster, but only has a granularity of 100 nanoseconds.
In some games seemingly random crashes were happening that were hard to trace because
the faulting thread was actually not the one that was misbehaving, another threads stack was underflowing into the faulting thread.
Added a bunch of code to synchronize the guest stack and host stack so that if a guest longjmps the host's stack will be adjusted.
Changes were also made to allow the guest to call into a piece of an existing x64 function.
This synchronization might have a slight performance impact on lower end cpus, to disable it set enable_host_guest_stack_synchronization to false.
It is possible it may have introduced regressions, but i dont know of any yet
So far, i know the synchronization change fixes the "hub crash" in super sonic and allows the game "london 2012" to go ingame.
Removed emit_useless_fpscr_updates, not emitting these updates breaks the raiden game
MapGuestAddressToMachineCode now returns nullptr if no address was found, instead of the start of the function
add Processor::LookupModule
Add Backend::DeinitializeBackendContext
Use WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF> in WriteRegisterRangeFromRing for inlining (previously regressed on performance of ExecutePacketType0)
add notes about flags that trap in XamInputGetCapabilities
0 == 3 in XamInputGetCapabilities
Name arg 2 of XamInputSetState
PrefetchW in critical section kernel funcs if available & doing cmpxchg
Add terminated field to X_KTHREAD, set it on termination
Expanded the logic of NtResumeThread/NtSuspendThread to include checking the type of the handle (in release, LookupObject doesnt seem to do anything with the type)
and returning X_STATUS_OBJECT_TYPE_MISMATCH if invalid. Do termination check in NtSuspendThread.
Add basic host exception messagebox, need to flesh it out more (maybe use the new stack tracking stuff if on guest thrd?)
Add rdrand patching hack, mostly affects users with nvidia cards who have many threads on zen
Use page_size_shift in more places
Once again disable precompilation! Raiden is mostly weird ppc asm which probably breaks the precompilation. The code is still useful for running the compiler over the whole of an xex in debug to test for issues
2022-11-27 17:37:06 +00:00
|
|
|
template<typename T>
|
|
|
|
static inline T QuickMod(T value, uint32_t modv) {
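// Fast modulo: a power-of-two divisor reduces to a mask, e.g.
// QuickMod(37u, 16u) == (37 & 15) == 5; any other divisor falls back to %.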
|
|
|
|
if (xe::is_pow2(modv)) {
|
|
|
|
return value & (modv - 1);
|
|
|
|
} else {
|
|
|
|
return value % modv;
|
|
|
|
}
|
|
|
|
}
|
2013-01-29 05:36:03 +00:00
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
bool BaseHeap::AllocRange(uint32_t low_address, uint32_t high_address,
|
|
|
|
uint32_t size, uint32_t alignment,
|
|
|
|
uint32_t allocation_type, uint32_t protect,
|
|
|
|
bool top_down, uint32_t* out_address) {
|
|
|
|
*out_address = 0;
|
|
|
|
|
|
|
|
alignment = xe::round_up(alignment, page_size_);
|
|
|
|
uint32_t page_count = get_page_count(size, page_size_);
|
|
|
|
low_address = std::max(heap_base_, xe::align(low_address, alignment));
|
2020-02-22 11:55:28 +00:00
|
|
|
high_address = std::min(heap_base_ + (heap_size_ - 1),
|
|
|
|
xe::align(high_address, alignment));
|
"Fix" debug console, we were checking the cvar before any cvars were loaded, and the condition it checks in AttachConsole is somehow always false
Remove dead #if 0'd code in math.h
On amd64, page_size == 4096 constant, on amd64 w/ win32, allocation_granularity == 65536. These values for x86 windows havent changed over the last 20 years so this is probably safe
and gives a modest code size reduction
Enable XE_USE_KUSER_SHARED. This sources host time from KUSER_SHARED instead of from QueryPerformanceCounter, which is far faster, but only has a granularity of 100 nanoseconds.
In some games seemingly random crashes were happening that were hard to trace because
the faulting thread was actually not the one that was misbehaving, another threads stack was underflowing into the faulting thread.
Added a bunch of code to synchronize the guest stack and host stack so that if a guest longjmps the host's stack will be adjusted.
Changes were also made to allow the guest to call into a piece of an existing x64 function.
This synchronization might have a slight performance impact on lower end cpus, to disable it set enable_host_guest_stack_synchronization to false.
It is possible it may have introduced regressions, but i dont know of any yet
So far, i know the synchronization change fixes the "hub crash" in super sonic and allows the game "london 2012" to go ingame.
Removed emit_useless_fpscr_updates, not emitting these updates breaks the raiden game
MapGuestAddressToMachineCode now returns nullptr if no address was found, instead of the start of the function
add Processor::LookupModule
Add Backend::DeinitializeBackendContext
Use WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF> in WriteRegisterRangeFromRing for inlining (previously regressed on performance of ExecutePacketType0)
add notes about flags that trap in XamInputGetCapabilities
0 == 3 in XamInputGetCapabilities
Name arg 2 of XamInputSetState
PrefetchW in critical section kernel funcs if available & doing cmpxchg
Add terminated field to X_KTHREAD, set it on termination
Expanded the logic of NtResumeThread/NtSuspendThread to include checking the type of the handle (in release, LookupObject doesnt seem to do anything with the type)
and returning X_STATUS_OBJECT_TYPE_MISMATCH if invalid. Do termination check in NtSuspendThread.
Add basic host exception messagebox, need to flesh it out more (maybe use the new stack tracking stuff if on guest thrd?)
Add rdrand patching hack, mostly affects users with nvidia cards who have many threads on zen
Use page_size_shift in more places
Once again disable precompilation! Raiden is mostly weird ppc asm which probably breaks the precompilation. The code is still useful for running the compiler over the whole of an xex in debug to test for issues
"Fix" debug console, we were checking the cvar before any cvars were loaded, and the condition it checks in AttachConsole is somehow always false
Remove dead #if 0'd code in math.h
On amd64, page_size == 4096 constant, on amd64 w/ win32, allocation_granularity == 65536. These values for x86 windows havent changed over the last 20 years so this is probably safe
and gives a modest code size reduction
Enable XE_USE_KUSER_SHARED. This sources host time from KUSER_SHARED instead of from QueryPerformanceCounter, which is far faster, but only has a granularity of 100 nanoseconds.
In some games seemingly random crashes were happening that were hard to trace because
the faulting thread was actually not the one that was misbehaving, another threads stack was underflowing into the faulting thread.
Added a bunch of code to synchronize the guest stack and host stack so that if a guest longjmps the host's stack will be adjusted.
Changes were also made to allow the guest to call into a piece of an existing x64 function.
This synchronization might have a slight performance impact on lower end cpus, to disable it set enable_host_guest_stack_synchronization to false.
It is possible it may have introduced regressions, but i dont know of any yet
So far, i know the synchronization change fixes the "hub crash" in super sonic and allows the game "london 2012" to go ingame.
Removed emit_useless_fpscr_updates, not emitting these updates breaks the raiden game
MapGuestAddressToMachineCode now returns nullptr if no address was found, instead of the start of the function
add Processor::LookupModule
Add Backend::DeinitializeBackendContext
Use WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF> in WriteRegisterRangeFromRing for inlining (previously regressed on performance of ExecutePacketType0)
add notes about flags that trap in XamInputGetCapabilities
0 == 3 in XamInputGetCapabilities
Name arg 2 of XamInputSetState
PrefetchW in critical section kernel funcs if available & doing cmpxchg
Add terminated field to X_KTHREAD, set it on termination
Expanded the logic of NtResumeThread/NtSuspendThread to include checking the type of the handle (in release, LookupObject doesnt seem to do anything with the type)
and returning X_STATUS_OBJECT_TYPE_MISMATCH if invalid. Do termination check in NtSuspendThread.
Add basic host exception messagebox, need to flesh it out more (maybe use the new stack tracking stuff if on guest thrd?)
Add rdrand patching hack, mostly affects users with nvidia cards who have many threads on zen
Use page_size_shift in more places
Once again disable precompilation! Raiden is mostly weird ppc asm which probably breaks the precompilation. The code is still useful for running the compiler over the whole of an xex in debug to test for issues
2022-11-27 17:37:06 +00:00
|
|
|
|
|
|
|
uint32_t low_page_number = (low_address - heap_base_) >> page_size_shift_;
|
|
|
|
uint32_t high_page_number = (high_address - heap_base_) >> page_size_shift_;
|
2015-05-16 07:23:13 +00:00
|
|
|
low_page_number = std::min(uint32_t(page_table_.size()) - 1, low_page_number);
|
|
|
|
high_page_number =
|
2015-05-26 04:10:28 +00:00
|
|
|
std::min(uint32_t(page_table_.size()) - 1, high_page_number);
|
2015-05-16 07:23:13 +00:00
|
|
|
|
2015-07-30 06:27:55 +00:00
|
|
|
if (page_count > (high_page_number - low_page_number)) {
|
|
|
|
XELOGE("BaseHeap::Alloc page count too big for requested range");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2015-09-06 16:30:54 +00:00
|
|
|
auto global_lock = global_critical_region_.Acquire();
|
2015-05-16 07:23:13 +00:00
|
|
|
|
|
|
|
// Find a free page range.
|
|
|
|
// The base page must match the requested alignment, so we first scan for
|
|
|
|
// a free aligned page and only then check for contiguous free pages.
|
|
|
|
// TODO(benvanik): optimized searching (free list buckets, bitmap, etc).
|
|
|
|
uint32_t start_page_number = UINT_MAX;
|
|
|
|
uint32_t end_page_number = UINT_MAX;
|
"Fix" debug console, we were checking the cvar before any cvars were loaded, and the condition it checks in AttachConsole is somehow always false
Remove dead #if 0'd code in math.h
On amd64, page_size == 4096 constant, on amd64 w/ win32, allocation_granularity == 65536. These values for x86 windows havent changed over the last 20 years so this is probably safe
and gives a modest code size reduction
Enable XE_USE_KUSER_SHARED. This sources host time from KUSER_SHARED instead of from QueryPerformanceCounter, which is far faster, but only has a granularity of 100 nanoseconds.
In some games seemingly random crashes were happening that were hard to trace because
the faulting thread was actually not the one that was misbehaving, another threads stack was underflowing into the faulting thread.
Added a bunch of code to synchronize the guest stack and host stack so that if a guest longjmps the host's stack will be adjusted.
Changes were also made to allow the guest to call into a piece of an existing x64 function.
This synchronization might have a slight performance impact on lower end cpus, to disable it set enable_host_guest_stack_synchronization to false.
It is possible it may have introduced regressions, but i dont know of any yet
So far, i know the synchronization change fixes the "hub crash" in super sonic and allows the game "london 2012" to go ingame.
Removed emit_useless_fpscr_updates, not emitting these updates breaks the raiden game
MapGuestAddressToMachineCode now returns nullptr if no address was found, instead of the start of the function
add Processor::LookupModule
Add Backend::DeinitializeBackendContext
Use WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF> in WriteRegisterRangeFromRing for inlining (previously regressed on performance of ExecutePacketType0)
add notes about flags that trap in XamInputGetCapabilities
0 == 3 in XamInputGetCapabilities
Name arg 2 of XamInputSetState
PrefetchW in critical section kernel funcs if available & doing cmpxchg
Add terminated field to X_KTHREAD, set it on termination
Expanded the logic of NtResumeThread/NtSuspendThread to include checking the type of the handle (in release, LookupObject doesnt seem to do anything with the type)
and returning X_STATUS_OBJECT_TYPE_MISMATCH if invalid. Do termination check in NtSuspendThread.
Add basic host exception messagebox, need to flesh it out more (maybe use the new stack tracking stuff if on guest thrd?)
Add rdrand patching hack, mostly affects users with nvidia cards who have many threads on zen
Use page_size_shift in more places
Once again disable precompilation! Raiden is mostly weird ppc asm which probably breaks the precompilation. The code is still useful for running the compiler over the whole of an xex in debug to test for issues
"Fix" debug console, we were checking the cvar before any cvars were loaded, and the condition it checks in AttachConsole is somehow always false
Remove dead #if 0'd code in math.h
On amd64, page_size == 4096 constant, on amd64 w/ win32, allocation_granularity == 65536. These values for x86 windows havent changed over the last 20 years so this is probably safe
and gives a modest code size reduction
Enable XE_USE_KUSER_SHARED. This sources host time from KUSER_SHARED instead of from QueryPerformanceCounter, which is far faster, but only has a granularity of 100 nanoseconds.
In some games seemingly random crashes were happening that were hard to trace because
the faulting thread was actually not the one that was misbehaving, another threads stack was underflowing into the faulting thread.
Added a bunch of code to synchronize the guest stack and host stack so that if a guest longjmps the host's stack will be adjusted.
Changes were also made to allow the guest to call into a piece of an existing x64 function.
This synchronization might have a slight performance impact on lower end cpus, to disable it set enable_host_guest_stack_synchronization to false.
It is possible it may have introduced regressions, but i dont know of any yet
So far, i know the synchronization change fixes the "hub crash" in super sonic and allows the game "london 2012" to go ingame.
Removed emit_useless_fpscr_updates, not emitting these updates breaks the raiden game
MapGuestAddressToMachineCode now returns nullptr if no address was found, instead of the start of the function
add Processor::LookupModule
Add Backend::DeinitializeBackendContext
Use WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF> in WriteRegisterRangeFromRing for inlining (previously regressed on performance of ExecutePacketType0)
add notes about flags that trap in XamInputGetCapabilities
0 == 3 in XamInputGetCapabilities
Name arg 2 of XamInputSetState
PrefetchW in critical section kernel funcs if available & doing cmpxchg
Add terminated field to X_KTHREAD, set it on termination
Expanded the logic of NtResumeThread/NtSuspendThread to include checking the type of the handle (in release, LookupObject doesnt seem to do anything with the type)
and returning X_STATUS_OBJECT_TYPE_MISMATCH if invalid. Do termination check in NtSuspendThread.
Add basic host exception messagebox, need to flesh it out more (maybe use the new stack tracking stuff if on guest thrd?)
Add rdrand patching hack, mostly affects users with nvidia cards who have many threads on zen
Use page_size_shift in more places
Once again disable precompilation! Raiden is mostly weird ppc asm which probably breaks the precompilation. The code is still useful for running the compiler over the whole of an xex in debug to test for issues
2022-11-27 17:37:06 +00:00
|
|
|
// TODO(chrispy): page_scan_stride is probably always a power of two...
|
|
|
|
uint32_t page_scan_stride = alignment >> page_size_shift_;
|
|
|
|
high_page_number =
|
|
|
|
high_page_number - QuickMod(high_page_number, page_scan_stride);
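// e.g. a 64 KiB alignment on a 4 KiB-page heap gives a stride of 16 pages,
// so only every 16th page is considered as a candidate base page below.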
|
2015-05-16 07:23:13 +00:00
|
|
|
if (top_down) {
|
2016-06-20 17:02:48 +00:00
|
|
|
for (int64_t base_page_number =
|
|
|
|
high_page_number - xe::round_up(page_count, page_scan_stride);
|
2015-05-16 07:23:13 +00:00
|
|
|
base_page_number >= low_page_number;
|
|
|
|
base_page_number -= page_scan_stride) {
|
|
|
|
if (page_table_[base_page_number].state != 0) {
|
|
|
|
// Base page not free, skip to next usable page.
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
// Check requested range to ensure free.
|
|
|
|
start_page_number = uint32_t(base_page_number);
|
|
|
|
end_page_number = uint32_t(base_page_number) + page_count - 1;
|
|
|
|
assert_true(end_page_number < page_table_.size());
|
|
|
|
bool any_taken = false;
|
|
|
|
for (uint32_t page_number = uint32_t(base_page_number);
|
|
|
|
!any_taken && page_number <= end_page_number; ++page_number) {
|
|
|
|
bool is_free = page_table_[page_number].state == 0;
|
|
|
|
if (!is_free) {
|
|
|
|
// At least one page in the range is used, skip to next.
|
2015-06-04 04:24:09 +00:00
|
|
|
// We know we'll be starting at least before this page.
|
2015-05-16 07:23:13 +00:00
|
|
|
any_taken = true;
|
2015-09-05 13:53:05 +00:00
|
|
|
if (page_count > page_number) {
|
|
|
|
// Not enough space left to fit entire page range. Breaks outer
|
|
|
|
// loop.
|
|
|
|
base_page_number = -1;
|
|
|
|
} else {
|
|
|
|
base_page_number = page_number - page_count;
|
"Fix" debug console, we were checking the cvar before any cvars were loaded, and the condition it checks in AttachConsole is somehow always false
Remove dead #if 0'd code in math.h
On amd64, page_size == 4096 constant, on amd64 w/ win32, allocation_granularity == 65536. These values for x86 windows havent changed over the last 20 years so this is probably safe
and gives a modest code size reduction
Enable XE_USE_KUSER_SHARED. This sources host time from KUSER_SHARED instead of from QueryPerformanceCounter, which is far faster, but only has a granularity of 100 nanoseconds.
In some games seemingly random crashes were happening that were hard to trace because
the faulting thread was actually not the one that was misbehaving, another threads stack was underflowing into the faulting thread.
Added a bunch of code to synchronize the guest stack and host stack so that if a guest longjmps the host's stack will be adjusted.
Changes were also made to allow the guest to call into a piece of an existing x64 function.
This synchronization might have a slight performance impact on lower end cpus, to disable it set enable_host_guest_stack_synchronization to false.
It is possible it may have introduced regressions, but i dont know of any yet
So far, i know the synchronization change fixes the "hub crash" in super sonic and allows the game "london 2012" to go ingame.
Removed emit_useless_fpscr_updates, not emitting these updates breaks the raiden game
MapGuestAddressToMachineCode now returns nullptr if no address was found, instead of the start of the function
add Processor::LookupModule
Add Backend::DeinitializeBackendContext
Use WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF> in WriteRegisterRangeFromRing for inlining (previously regressed on performance of ExecutePacketType0)
add notes about flags that trap in XamInputGetCapabilities
0 == 3 in XamInputGetCapabilities
Name arg 2 of XamInputSetState
PrefetchW in critical section kernel funcs if available & doing cmpxchg
Add terminated field to X_KTHREAD, set it on termination
Expanded the logic of NtResumeThread/NtSuspendThread to include checking the type of the handle (in release, LookupObject doesnt seem to do anything with the type)
and returning X_STATUS_OBJECT_TYPE_MISMATCH if invalid. Do termination check in NtSuspendThread.
Add basic host exception messagebox, need to flesh it out more (maybe use the new stack tracking stuff if on guest thrd?)
Add rdrand patching hack, mostly affects users with nvidia cards who have many threads on zen
Use page_size_shift in more places
Once again disable precompilation! Raiden is mostly weird ppc asm which probably breaks the precompilation. The code is still useful for running the compiler over the whole of an xex in debug to test for issues
"Fix" debug console, we were checking the cvar before any cvars were loaded, and the condition it checks in AttachConsole is somehow always false
Remove dead #if 0'd code in math.h
On amd64, page_size == 4096 constant, on amd64 w/ win32, allocation_granularity == 65536. These values for x86 windows havent changed over the last 20 years so this is probably safe
and gives a modest code size reduction
Enable XE_USE_KUSER_SHARED. This sources host time from KUSER_SHARED instead of from QueryPerformanceCounter, which is far faster, but only has a granularity of 100 nanoseconds.
In some games seemingly random crashes were happening that were hard to trace because
the faulting thread was actually not the one that was misbehaving, another threads stack was underflowing into the faulting thread.
Added a bunch of code to synchronize the guest stack and host stack so that if a guest longjmps the host's stack will be adjusted.
Changes were also made to allow the guest to call into a piece of an existing x64 function.
This synchronization might have a slight performance impact on lower end cpus, to disable it set enable_host_guest_stack_synchronization to false.
It is possible it may have introduced regressions, but i dont know of any yet
So far, i know the synchronization change fixes the "hub crash" in super sonic and allows the game "london 2012" to go ingame.
Removed emit_useless_fpscr_updates, not emitting these updates breaks the raiden game
MapGuestAddressToMachineCode now returns nullptr if no address was found, instead of the start of the function
add Processor::LookupModule
Add Backend::DeinitializeBackendContext
Use WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF> in WriteRegisterRangeFromRing for inlining (previously regressed on performance of ExecutePacketType0)
add notes about flags that trap in XamInputGetCapabilities
0 == 3 in XamInputGetCapabilities
Name arg 2 of XamInputSetState
PrefetchW in critical section kernel funcs if available & doing cmpxchg
Add terminated field to X_KTHREAD, set it on termination
Expanded the logic of NtResumeThread/NtSuspendThread to include checking the type of the handle (in release, LookupObject doesnt seem to do anything with the type)
and returning X_STATUS_OBJECT_TYPE_MISMATCH if invalid. Do termination check in NtSuspendThread.
Add basic host exception messagebox, need to flesh it out more (maybe use the new stack tracking stuff if on guest thrd?)
Add rdrand patching hack, mostly affects users with nvidia cards who have many threads on zen
Use page_size_shift in more places
Once again disable precompilation! Raiden is mostly weird ppc asm which probably breaks the precompilation. The code is still useful for running the compiler over the whole of an xex in debug to test for issues
2022-11-27 17:37:06 +00:00
|
|
|
base_page_number -= QuickMod(base_page_number, page_scan_stride);
|
2015-09-05 13:53:05 +00:00
|
|
|
base_page_number += page_scan_stride; // cancel out loop logic
|
|
|
|
}
|
2015-05-16 07:23:13 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!any_taken) {
|
|
|
|
// Found our place.
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
// Retry.
|
|
|
|
start_page_number = end_page_number = UINT_MAX;
|
|
|
|
}
|
2013-05-30 04:00:55 +00:00
|
|
|
} else {
|
2015-05-16 07:23:13 +00:00
|
|
|
for (uint32_t base_page_number = low_page_number;
|
2015-05-26 04:10:28 +00:00
|
|
|
base_page_number <= high_page_number - page_count;
|
2015-05-16 07:23:13 +00:00
|
|
|
base_page_number += page_scan_stride) {
|
|
|
|
if (page_table_[base_page_number].state != 0) {
|
|
|
|
// Base page not free, skip to next usable page.
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
// Check requested range to ensure free.
|
|
|
|
start_page_number = base_page_number;
|
|
|
|
end_page_number = base_page_number + page_count - 1;
|
|
|
|
bool any_taken = false;
|
|
|
|
for (uint32_t page_number = base_page_number;
|
|
|
|
!any_taken && page_number <= end_page_number; ++page_number) {
|
|
|
|
bool is_free = page_table_[page_number].state == 0;
|
|
|
|
if (!is_free) {
|
|
|
|
// At least one page in the range is used, skip to next.
|
2015-06-04 04:24:09 +00:00
|
|
|
// We know we'll be starting at least after this page.
|
2015-05-16 07:23:13 +00:00
|
|
|
any_taken = true;
|
2015-06-04 04:24:09 +00:00
|
|
|
base_page_number = xe::round_up(page_number + 1, page_scan_stride);
|
|
|
|
base_page_number -= page_scan_stride; // cancel out loop logic
|
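// Net effect: round_up(page_number + 1, page_scan_stride) minus one stride,
// so the outer loop's own `+= page_scan_stride` resumes the scan at the first
// stride-aligned page past the conflicting one.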
2015-05-16 07:23:13 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!any_taken) {
|
|
|
|
// Found our place.
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
// Retry.
|
|
|
|
start_page_number = end_page_number = UINT_MAX;
|
|
|
|
}
|
2013-05-30 04:00:55 +00:00
|
|
|
}
|
2015-05-16 07:23:13 +00:00
|
|
|
if (start_page_number == UINT_MAX || end_page_number == UINT_MAX) {
|
|
|
|
// Out of memory.
|
|
|
|
XELOGE("BaseHeap::Alloc failed to find contiguous range");
|
"Fix" debug console, we were checking the cvar before any cvars were loaded, and the condition it checks in AttachConsole is somehow always false
Remove dead #if 0'd code in math.h
On amd64, page_size == 4096 constant, on amd64 w/ win32, allocation_granularity == 65536. These values for x86 windows havent changed over the last 20 years so this is probably safe
and gives a modest code size reduction
Enable XE_USE_KUSER_SHARED. This sources host time from KUSER_SHARED instead of from QueryPerformanceCounter, which is far faster, but only has a granularity of 100 nanoseconds.
In some games seemingly random crashes were happening that were hard to trace because
the faulting thread was actually not the one that was misbehaving, another threads stack was underflowing into the faulting thread.
Added a bunch of code to synchronize the guest stack and host stack so that if a guest longjmps the host's stack will be adjusted.
Changes were also made to allow the guest to call into a piece of an existing x64 function.
This synchronization might have a slight performance impact on lower end cpus, to disable it set enable_host_guest_stack_synchronization to false.
It is possible it may have introduced regressions, but i dont know of any yet
So far, i know the synchronization change fixes the "hub crash" in super sonic and allows the game "london 2012" to go ingame.
Removed emit_useless_fpscr_updates, not emitting these updates breaks the raiden game
MapGuestAddressToMachineCode now returns nullptr if no address was found, instead of the start of the function
add Processor::LookupModule
Add Backend::DeinitializeBackendContext
Use WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF> in WriteRegisterRangeFromRing for inlining (previously regressed on performance of ExecutePacketType0)
add notes about flags that trap in XamInputGetCapabilities
0 == 3 in XamInputGetCapabilities
Name arg 2 of XamInputSetState
PrefetchW in critical section kernel funcs if available & doing cmpxchg
Add terminated field to X_KTHREAD, set it on termination
Expanded the logic of NtResumeThread/NtSuspendThread to include checking the type of the handle (in release, LookupObject doesnt seem to do anything with the type)
and returning X_STATUS_OBJECT_TYPE_MISMATCH if invalid. Do termination check in NtSuspendThread.
Add basic host exception messagebox, need to flesh it out more (maybe use the new stack tracking stuff if on guest thrd?)
Add rdrand patching hack, mostly affects users with nvidia cards who have many threads on zen
Use page_size_shift in more places
Once again disable precompilation! Raiden is mostly weird ppc asm which probably breaks the precompilation. The code is still useful for running the compiler over the whole of an xex in debug to test for issues
"Fix" debug console, we were checking the cvar before any cvars were loaded, and the condition it checks in AttachConsole is somehow always false
Remove dead #if 0'd code in math.h
On amd64, page_size == 4096 constant, on amd64 w/ win32, allocation_granularity == 65536. These values for x86 windows havent changed over the last 20 years so this is probably safe
and gives a modest code size reduction
Enable XE_USE_KUSER_SHARED. This sources host time from KUSER_SHARED instead of from QueryPerformanceCounter, which is far faster, but only has a granularity of 100 nanoseconds.
In some games seemingly random crashes were happening that were hard to trace because
the faulting thread was actually not the one that was misbehaving, another threads stack was underflowing into the faulting thread.
Added a bunch of code to synchronize the guest stack and host stack so that if a guest longjmps the host's stack will be adjusted.
Changes were also made to allow the guest to call into a piece of an existing x64 function.
This synchronization might have a slight performance impact on lower end cpus, to disable it set enable_host_guest_stack_synchronization to false.
It is possible it may have introduced regressions, but i dont know of any yet
So far, i know the synchronization change fixes the "hub crash" in super sonic and allows the game "london 2012" to go ingame.
Removed emit_useless_fpscr_updates, not emitting these updates breaks the raiden game
MapGuestAddressToMachineCode now returns nullptr if no address was found, instead of the start of the function
add Processor::LookupModule
Add Backend::DeinitializeBackendContext
Use WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF> in WriteRegisterRangeFromRing for inlining (previously regressed on performance of ExecutePacketType0)
add notes about flags that trap in XamInputGetCapabilities
0 == 3 in XamInputGetCapabilities
Name arg 2 of XamInputSetState
PrefetchW in critical section kernel funcs if available & doing cmpxchg
Add terminated field to X_KTHREAD, set it on termination
Expanded the logic of NtResumeThread/NtSuspendThread to include checking the type of the handle (in release, LookupObject doesnt seem to do anything with the type)
and returning X_STATUS_OBJECT_TYPE_MISMATCH if invalid. Do termination check in NtSuspendThread.
Add basic host exception messagebox, need to flesh it out more (maybe use the new stack tracking stuff if on guest thrd?)
Add rdrand patching hack, mostly affects users with nvidia cards who have many threads on zen
Use page_size_shift in more places
Once again disable precompilation! Raiden is mostly weird ppc asm which probably breaks the precompilation. The code is still useful for running the compiler over the whole of an xex in debug to test for issues
2022-11-27 17:37:06 +00:00
|
|
|
// assert_always("Heap exhausted!");
|
2014-08-19 05:12:21 +00:00
|
|
|
return false;
|
|
|
|
}
|
2014-08-15 06:14:57 +00:00
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
// Allocate from host.
|
|
|
|
if (allocation_type == kMemoryAllocationReserve) {
|
|
|
|
// Reserve is not needed, as we are mapped already.
|
2014-01-05 19:19:02 +00:00
|
|
|
} else {
|
2015-07-16 02:05:08 +00:00
|
|
|
auto alloc_type = (allocation_type & kMemoryAllocationCommit)
|
|
|
|
? xe::memory::AllocationType::kCommit
|
|
|
|
: xe::memory::AllocationType::kReserve;
|
|
|
|
void* result = xe::memory::AllocFixed(
|
"Fix" debug console, we were checking the cvar before any cvars were loaded, and the condition it checks in AttachConsole is somehow always false
Remove dead #if 0'd code in math.h
On amd64, page_size == 4096 constant, on amd64 w/ win32, allocation_granularity == 65536. These values for x86 windows havent changed over the last 20 years so this is probably safe
and gives a modest code size reduction
Enable XE_USE_KUSER_SHARED. This sources host time from KUSER_SHARED instead of from QueryPerformanceCounter, which is far faster, but only has a granularity of 100 nanoseconds.
In some games seemingly random crashes were happening that were hard to trace because
the faulting thread was actually not the one that was misbehaving, another threads stack was underflowing into the faulting thread.
Added a bunch of code to synchronize the guest stack and host stack so that if a guest longjmps the host's stack will be adjusted.
Changes were also made to allow the guest to call into a piece of an existing x64 function.
This synchronization might have a slight performance impact on lower end cpus, to disable it set enable_host_guest_stack_synchronization to false.
It is possible it may have introduced regressions, but i dont know of any yet
So far, i know the synchronization change fixes the "hub crash" in super sonic and allows the game "london 2012" to go ingame.
Removed emit_useless_fpscr_updates, not emitting these updates breaks the raiden game
MapGuestAddressToMachineCode now returns nullptr if no address was found, instead of the start of the function
add Processor::LookupModule
Add Backend::DeinitializeBackendContext
Use WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF> in WriteRegisterRangeFromRing for inlining (previously regressed on performance of ExecutePacketType0)
add notes about flags that trap in XamInputGetCapabilities
0 == 3 in XamInputGetCapabilities
Name arg 2 of XamInputSetState
PrefetchW in critical section kernel funcs if available & doing cmpxchg
Add terminated field to X_KTHREAD, set it on termination
Expanded the logic of NtResumeThread/NtSuspendThread to include checking the type of the handle (in release, LookupObject doesnt seem to do anything with the type)
and returning X_STATUS_OBJECT_TYPE_MISMATCH if invalid. Do termination check in NtSuspendThread.
Add basic host exception messagebox, need to flesh it out more (maybe use the new stack tracking stuff if on guest thrd?)
Add rdrand patching hack, mostly affects users with nvidia cards who have many threads on zen
Use page_size_shift in more places
Once again disable precompilation! Raiden is mostly weird ppc asm which probably breaks the precompilation. The code is still useful for running the compiler over the whole of an xex in debug to test for issues
"Fix" debug console, we were checking the cvar before any cvars were loaded, and the condition it checks in AttachConsole is somehow always false
Remove dead #if 0'd code in math.h
On amd64, page_size == 4096 constant, on amd64 w/ win32, allocation_granularity == 65536. These values for x86 windows havent changed over the last 20 years so this is probably safe
and gives a modest code size reduction
Enable XE_USE_KUSER_SHARED. This sources host time from KUSER_SHARED instead of from QueryPerformanceCounter, which is far faster, but only has a granularity of 100 nanoseconds.
In some games seemingly random crashes were happening that were hard to trace because
the faulting thread was actually not the one that was misbehaving, another threads stack was underflowing into the faulting thread.
Added a bunch of code to synchronize the guest stack and host stack so that if a guest longjmps the host's stack will be adjusted.
Changes were also made to allow the guest to call into a piece of an existing x64 function.
This synchronization might have a slight performance impact on lower end cpus, to disable it set enable_host_guest_stack_synchronization to false.
It is possible it may have introduced regressions, but i dont know of any yet
So far, i know the synchronization change fixes the "hub crash" in super sonic and allows the game "london 2012" to go ingame.
Removed emit_useless_fpscr_updates, not emitting these updates breaks the raiden game
MapGuestAddressToMachineCode now returns nullptr if no address was found, instead of the start of the function
add Processor::LookupModule
Add Backend::DeinitializeBackendContext
Use WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF> in WriteRegisterRangeFromRing for inlining (previously regressed on performance of ExecutePacketType0)
add notes about flags that trap in XamInputGetCapabilities
0 == 3 in XamInputGetCapabilities
Name arg 2 of XamInputSetState
PrefetchW in critical section kernel funcs if available & doing cmpxchg
Add terminated field to X_KTHREAD, set it on termination
Expanded the logic of NtResumeThread/NtSuspendThread to include checking the type of the handle (in release, LookupObject doesnt seem to do anything with the type)
and returning X_STATUS_OBJECT_TYPE_MISMATCH if invalid. Do termination check in NtSuspendThread.
Add basic host exception messagebox, need to flesh it out more (maybe use the new stack tracking stuff if on guest thrd?)
Add rdrand patching hack, mostly affects users with nvidia cards who have many threads on zen
Use page_size_shift in more places
Once again disable precompilation! Raiden is mostly weird ppc asm which probably breaks the precompilation. The code is still useful for running the compiler over the whole of an xex in debug to test for issues
2022-11-27 17:37:06 +00:00
|
|
|
TranslateRelative(start_page_number << page_size_shift_),
|
|
|
|
page_count << page_size_shift_, alloc_type, ToPageAccess(protect));
|
2015-05-16 07:23:13 +00:00
|
|
|
if (!result) {
|
|
|
|
XELOGE("BaseHeap::Alloc failed to alloc range from host");
|
|
|
|
return false;
|
2014-01-05 19:19:02 +00:00
|
|
|
}
|
2015-05-16 23:41:18 +00:00
|
|
|
|
2019-04-17 19:49:29 +00:00
|
|
|
if (cvars::scribble_heap && (protect & kMemoryProtectWrite)) {
|
"Fix" debug console, we were checking the cvar before any cvars were loaded, and the condition it checks in AttachConsole is somehow always false
Remove dead #if 0'd code in math.h
On amd64, page_size == 4096 constant, on amd64 w/ win32, allocation_granularity == 65536. These values for x86 windows havent changed over the last 20 years so this is probably safe
and gives a modest code size reduction
Enable XE_USE_KUSER_SHARED. This sources host time from KUSER_SHARED instead of from QueryPerformanceCounter, which is far faster, but only has a granularity of 100 nanoseconds.
In some games seemingly random crashes were happening that were hard to trace because
the faulting thread was actually not the one that was misbehaving, another threads stack was underflowing into the faulting thread.
Added a bunch of code to synchronize the guest stack and host stack so that if a guest longjmps the host's stack will be adjusted.
Changes were also made to allow the guest to call into a piece of an existing x64 function.
This synchronization might have a slight performance impact on lower end cpus, to disable it set enable_host_guest_stack_synchronization to false.
It is possible it may have introduced regressions, but i dont know of any yet
So far, i know the synchronization change fixes the "hub crash" in super sonic and allows the game "london 2012" to go ingame.
Removed emit_useless_fpscr_updates, not emitting these updates breaks the raiden game
MapGuestAddressToMachineCode now returns nullptr if no address was found, instead of the start of the function
add Processor::LookupModule
Add Backend::DeinitializeBackendContext
Use WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF> in WriteRegisterRangeFromRing for inlining (previously regressed on performance of ExecutePacketType0)
add notes about flags that trap in XamInputGetCapabilities
0 == 3 in XamInputGetCapabilities
Name arg 2 of XamInputSetState
PrefetchW in critical section kernel funcs if available & doing cmpxchg
Add terminated field to X_KTHREAD, set it on termination
Expanded the logic of NtResumeThread/NtSuspendThread to include checking the type of the handle (in release, LookupObject doesnt seem to do anything with the type)
and returning X_STATUS_OBJECT_TYPE_MISMATCH if invalid. Do termination check in NtSuspendThread.
Add basic host exception messagebox, need to flesh it out more (maybe use the new stack tracking stuff if on guest thrd?)
Add rdrand patching hack, mostly affects users with nvidia cards who have many threads on zen
Use page_size_shift in more places
Once again disable precompilation! Raiden is mostly weird ppc asm which probably breaks the precompilation. The code is still useful for running the compiler over the whole of an xex in debug to test for issues
"Fix" debug console, we were checking the cvar before any cvars were loaded, and the condition it checks in AttachConsole is somehow always false
Remove dead #if 0'd code in math.h
On amd64, page_size == 4096 constant, on amd64 w/ win32, allocation_granularity == 65536. These values for x86 windows havent changed over the last 20 years so this is probably safe
and gives a modest code size reduction
Enable XE_USE_KUSER_SHARED. This sources host time from KUSER_SHARED instead of from QueryPerformanceCounter, which is far faster, but only has a granularity of 100 nanoseconds.
In some games seemingly random crashes were happening that were hard to trace because
the faulting thread was actually not the one that was misbehaving, another threads stack was underflowing into the faulting thread.
Added a bunch of code to synchronize the guest stack and host stack so that if a guest longjmps the host's stack will be adjusted.
Changes were also made to allow the guest to call into a piece of an existing x64 function.
This synchronization might have a slight performance impact on lower end cpus, to disable it set enable_host_guest_stack_synchronization to false.
It is possible it may have introduced regressions, but i dont know of any yet
So far, i know the synchronization change fixes the "hub crash" in super sonic and allows the game "london 2012" to go ingame.
Removed emit_useless_fpscr_updates, not emitting these updates breaks the raiden game
MapGuestAddressToMachineCode now returns nullptr if no address was found, instead of the start of the function
add Processor::LookupModule
Add Backend::DeinitializeBackendContext
Use WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF> in WriteRegisterRangeFromRing for inlining (previously regressed on performance of ExecutePacketType0)
add notes about flags that trap in XamInputGetCapabilities
0 == 3 in XamInputGetCapabilities
Name arg 2 of XamInputSetState
PrefetchW in critical section kernel funcs if available & doing cmpxchg
Add terminated field to X_KTHREAD, set it on termination
Expanded the logic of NtResumeThread/NtSuspendThread to include checking the type of the handle (in release, LookupObject doesnt seem to do anything with the type)
and returning X_STATUS_OBJECT_TYPE_MISMATCH if invalid. Do termination check in NtSuspendThread.
Add basic host exception messagebox, need to flesh it out more (maybe use the new stack tracking stuff if on guest thrd?)
Add rdrand patching hack, mostly affects users with nvidia cards who have many threads on zen
Use page_size_shift in more places
Once again disable precompilation! Raiden is mostly weird ppc asm which probably breaks the precompilation. The code is still useful for running the compiler over the whole of an xex in debug to test for issues
2022-11-27 17:37:06 +00:00
|
|
|
std::memset(result, 0xCD, page_count << page_size_shift_);
|
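// 0xCD matches the MSVC debug-heap "newly allocated" fill pattern, making
// reads of uninitialized guest allocations easier to spot in a debugger.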
2015-05-16 23:41:18 +00:00
|
|
|
}
|
2014-01-05 19:19:02 +00:00
|
|
|
}
|
2015-05-16 07:23:13 +00:00
|
|
|
|
|
|
|
// Set page state.
|
|
|
|
for (uint32_t page_number = start_page_number; page_number <= end_page_number;
|
|
|
|
++page_number) {
|
|
|
|
auto& page_entry = page_table_[page_number];
|
|
|
|
page_entry.base_address = start_page_number;
|
|
|
|
page_entry.region_page_count = page_count;
|
|
|
|
page_entry.allocation_protect = protect;
|
|
|
|
page_entry.current_protect = protect;
|
|
|
|
page_entry.state = kMemoryAllocationReserve | allocation_type;
|
2021-05-30 10:52:34 +00:00
|
|
|
unreserved_page_count_--;
|
2015-05-16 07:23:13 +00:00
|
|
|
}
|
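// Note: page_entry.base_address stores the first page *number* of the region,
// not a byte address; consumers shift by page_size_shift_ to get addresses.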
|
|
|
|
"Fix" debug console, we were checking the cvar before any cvars were loaded, and the condition it checks in AttachConsole is somehow always false
Remove dead #if 0'd code in math.h
On amd64, page_size == 4096 constant, on amd64 w/ win32, allocation_granularity == 65536. These values for x86 windows havent changed over the last 20 years so this is probably safe
and gives a modest code size reduction
Enable XE_USE_KUSER_SHARED. This sources host time from KUSER_SHARED instead of from QueryPerformanceCounter, which is far faster, but only has a granularity of 100 nanoseconds.
In some games seemingly random crashes were happening that were hard to trace because
the faulting thread was actually not the one that was misbehaving, another threads stack was underflowing into the faulting thread.
Added a bunch of code to synchronize the guest stack and host stack so that if a guest longjmps the host's stack will be adjusted.
Changes were also made to allow the guest to call into a piece of an existing x64 function.
This synchronization might have a slight performance impact on lower end cpus, to disable it set enable_host_guest_stack_synchronization to false.
It is possible it may have introduced regressions, but i dont know of any yet
So far, i know the synchronization change fixes the "hub crash" in super sonic and allows the game "london 2012" to go ingame.
Removed emit_useless_fpscr_updates, not emitting these updates breaks the raiden game
MapGuestAddressToMachineCode now returns nullptr if no address was found, instead of the start of the function
add Processor::LookupModule
Add Backend::DeinitializeBackendContext
Use WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF> in WriteRegisterRangeFromRing for inlining (previously regressed on performance of ExecutePacketType0)
add notes about flags that trap in XamInputGetCapabilities
0 == 3 in XamInputGetCapabilities
Name arg 2 of XamInputSetState
PrefetchW in critical section kernel funcs if available & doing cmpxchg
Add terminated field to X_KTHREAD, set it on termination
Expanded the logic of NtResumeThread/NtSuspendThread to include checking the type of the handle (in release, LookupObject doesnt seem to do anything with the type)
and returning X_STATUS_OBJECT_TYPE_MISMATCH if invalid. Do termination check in NtSuspendThread.
Add basic host exception messagebox, need to flesh it out more (maybe use the new stack tracking stuff if on guest thrd?)
Add rdrand patching hack, mostly affects users with nvidia cards who have many threads on zen
Use page_size_shift in more places
Once again disable precompilation! Raiden is mostly weird ppc asm which probably breaks the precompilation. The code is still useful for running the compiler over the whole of an xex in debug to test for issues
"Fix" debug console, we were checking the cvar before any cvars were loaded, and the condition it checks in AttachConsole is somehow always false
Remove dead #if 0'd code in math.h
On amd64, page_size == 4096 constant, on amd64 w/ win32, allocation_granularity == 65536. These values for x86 windows havent changed over the last 20 years so this is probably safe
and gives a modest code size reduction
Enable XE_USE_KUSER_SHARED. This sources host time from KUSER_SHARED instead of from QueryPerformanceCounter, which is far faster, but only has a granularity of 100 nanoseconds.
In some games seemingly random crashes were happening that were hard to trace because
the faulting thread was actually not the one that was misbehaving, another threads stack was underflowing into the faulting thread.
Added a bunch of code to synchronize the guest stack and host stack so that if a guest longjmps the host's stack will be adjusted.
Changes were also made to allow the guest to call into a piece of an existing x64 function.
This synchronization might have a slight performance impact on lower end cpus, to disable it set enable_host_guest_stack_synchronization to false.
It is possible it may have introduced regressions, but i dont know of any yet
So far, i know the synchronization change fixes the "hub crash" in super sonic and allows the game "london 2012" to go ingame.
Removed emit_useless_fpscr_updates, not emitting these updates breaks the raiden game
MapGuestAddressToMachineCode now returns nullptr if no address was found, instead of the start of the function
add Processor::LookupModule
Add Backend::DeinitializeBackendContext
Use WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF> in WriteRegisterRangeFromRing for inlining (previously regressed on performance of ExecutePacketType0)
add notes about flags that trap in XamInputGetCapabilities
0 == 3 in XamInputGetCapabilities
Name arg 2 of XamInputSetState
PrefetchW in critical section kernel funcs if available & doing cmpxchg
Add terminated field to X_KTHREAD, set it on termination
Expanded the logic of NtResumeThread/NtSuspendThread to include checking the type of the handle (in release, LookupObject doesnt seem to do anything with the type)
and returning X_STATUS_OBJECT_TYPE_MISMATCH if invalid. Do termination check in NtSuspendThread.
Add basic host exception messagebox, need to flesh it out more (maybe use the new stack tracking stuff if on guest thrd?)
Add rdrand patching hack, mostly affects users with nvidia cards who have many threads on zen
Use page_size_shift in more places
Once again disable precompilation! Raiden is mostly weird ppc asm which probably breaks the precompilation. The code is still useful for running the compiler over the whole of an xex in debug to test for issues
2022-11-27 17:37:06 +00:00
|
|
|
*out_address = heap_base_ + (start_page_number << page_size_shift_);
|
2015-05-16 07:23:13 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2022-02-28 13:20:55 +00:00
|
|
|
bool BaseHeap::AllocSystemHeap(uint32_t size, uint32_t alignment,
|
|
|
|
uint32_t allocation_type, uint32_t protect,
|
|
|
|
bool top_down, uint32_t* out_address) {
|
|
|
|
*out_address = 0;
|
|
|
|
size = xe::round_up(size, page_size_);
|
|
|
|
alignment = xe::round_up(alignment, page_size_);
|
|
|
|
|
|
|
|
uint32_t low_address = heap_base_;
|
|
|
|
if (heap_type_ == xe::HeapType::kGuestVirtual) {
|
|
|
|
// Both virtual heaps are the same size, so we can simply subtract a
|
|
|
|
// constant value.
|
|
|
|
low_address = heap_base_ + heap_size_ - 0x10000000;
|
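// 0x10000000 bytes = 256 MiB, i.e. system allocations are carved out of the
// top 256 MiB of the virtual heap.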
|
|
|
}
|
|
|
|
uint32_t high_address = heap_base_ + (heap_size_ - 1);
|
|
|
|
return AllocRange(low_address, high_address, size, alignment, allocation_type,
|
|
|
|
protect, top_down, out_address);
|
|
|
|
}
|
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
bool BaseHeap::Decommit(uint32_t address, uint32_t size) {
|
|
|
|
uint32_t page_count = get_page_count(size, page_size_);
|
|
|
|
uint32_t start_page_number = (address - heap_base_) / page_size_;
|
|
|
|
uint32_t end_page_number = start_page_number + page_count - 1;
|
|
|
|
start_page_number =
|
|
|
|
std::min(uint32_t(page_table_.size()) - 1, start_page_number);
|
|
|
|
end_page_number = std::min(uint32_t(page_table_.size()) - 1, end_page_number);
|
|
|
|
|
2015-09-06 16:30:54 +00:00
|
|
|
auto global_lock = global_critical_region_.Acquire();
|
2015-05-16 07:23:13 +00:00
|
|
|
|
|
|
|
// Release from host.
|
|
|
|
// TODO(benvanik): find a way to actually decommit memory;
|
|
|
|
// mapped memory cannot be decommitted.
|
|
|
|
/*BOOL result =
|
2019-08-14 21:31:21 +00:00
|
|
|
VirtualFree(TranslateRelative(start_page_number * page_size_),
|
2015-05-16 07:23:13 +00:00
|
|
|
page_count * page_size_, MEM_DECOMMIT);
|
|
|
|
if (!result) {
|
|
|
|
PLOGW("BaseHeap::Decommit failed due to host VirtualFree failure");
|
|
|
|
return false;
|
|
|
|
}*/
|
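// (A possible approach, untested here: on Windows, VirtualAlloc with MEM_RESET
// or DiscardVirtualMemory could hint that the backing pages may be dropped
// without unmapping them, though neither releases the commit charge.)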
|
|
|
|
|
|
|
// Perform table change.
|
|
|
|
for (uint32_t page_number = start_page_number; page_number <= end_page_number;
|
|
|
|
++page_number) {
|
|
|
|
auto& page_entry = page_table_[page_number];
|
|
|
|
page_entry.state &= ~kMemoryAllocationCommit;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
2014-01-05 19:19:02 +00:00
|
|
|
}
|
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
bool BaseHeap::Release(uint32_t base_address, uint32_t* out_region_size) {
|
2015-09-06 16:30:54 +00:00
|
|
|
auto global_lock = global_critical_region_.Acquire();
|
2013-01-29 05:36:03 +00:00
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
// Given address must be a region base address.
|
|
|
|
uint32_t base_page_number = (base_address - heap_base_) / page_size_;
|
|
|
|
auto base_page_entry = page_table_[base_page_number];
|
|
|
|
if (base_page_entry.base_address != base_page_number) {
|
|
|
|
XELOGE("BaseHeap::Release failed because address is not a region start");
|
2016-03-18 02:55:16 +00:00
|
|
|
return false;
|
2015-05-16 07:23:13 +00:00
|
|
|
}
|
2013-10-20 20:42:34 +00:00
|
|
|
|
2017-05-17 09:44:48 +00:00
|
|
|
if (heap_base_ == 0x00000000 && base_page_number == 0) {
|
|
|
|
XELOGE("BaseHeap::Release: Attempt to free 0!");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
if (out_region_size) {
|
2015-07-16 02:05:08 +00:00
|
|
|
*out_region_size = (base_page_entry.region_page_count * page_size_);
|
2015-05-16 07:23:13 +00:00
|
|
|
}
|
2013-10-23 06:34:24 +00:00
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
// Release from host not needed as mapping reserves the range for us.
|
|
|
|
// TODO(benvanik): protect with NOACCESS?
|
|
|
|
/*BOOL result = VirtualFree(
|
2019-08-14 21:31:21 +00:00
|
|
|
TranslateRelative(base_page_number * page_size_), 0, MEM_RELEASE);
|
2015-05-16 07:23:13 +00:00
|
|
|
if (!result) {
|
|
|
|
PLOGE("BaseHeap::Release failed due to host VirtualFree failure");
|
|
|
|
return false;
|
|
|
|
}*/
|
2015-05-19 03:47:26 +00:00
|
|
|
// Instead, we just protect it, if we can.
|
2015-07-16 01:20:05 +00:00
|
|
|
if (page_size_ == xe::memory::page_size() ||
|
|
|
|
((base_page_entry.region_page_count * page_size_) %
|
2015-07-20 01:32:48 +00:00
|
|
|
xe::memory::page_size() ==
|
|
|
|
0 &&
|
|
|
|
((base_page_number * page_size_) % xe::memory::page_size() == 0))) {
|
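// Host protection can only be applied at host-page granularity: either the
// guest and host page sizes match, or the region's start and size are both
// multiples of the host page size.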
2019-04-17 19:49:29 +00:00
|
|
|
// TODO(benvanik): figure out why games are using memory after releasing
|
|
|
|
// it. It's possible this is some virtual/physical stuff where the GPU
|
|
|
|
// still can access it.
|
|
|
|
if (cvars::protect_on_release) {
|
2019-08-14 21:31:21 +00:00
|
|
|
if (!xe::memory::Protect(TranslateRelative(base_page_number * page_size_),
|
|
|
|
base_page_entry.region_page_count * page_size_,
|
|
|
|
xe::memory::PageAccess::kNoAccess, nullptr)) {
|
2015-12-06 07:27:25 +00:00
|
|
|
XELOGW("BaseHeap::Release failed due to host VirtualProtect failure");
|
|
|
|
}
|
2015-05-19 03:47:26 +00:00
|
|
|
}
|
2015-05-16 07:23:13 +00:00
|
|
|
}
|
2013-10-21 07:57:48 +00:00
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
// Perform table change.
|
|
|
|
uint32_t end_page_number =
|
|
|
|
base_page_number + base_page_entry.region_page_count - 1;
|
|
|
|
for (uint32_t page_number = base_page_number; page_number <= end_page_number;
|
|
|
|
++page_number) {
|
|
|
|
auto& page_entry = page_table_[page_number];
|
|
|
|
page_entry.qword = 0;
|
2021-05-30 10:52:34 +00:00
|
|
|
unreserved_page_count_++;
|
2013-12-07 06:57:16 +00:00
|
|
|
}
|
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
return true;
|
2013-12-07 06:57:16 +00:00
|
|
|
}
|
2013-10-21 07:57:48 +00:00
|
|
|
|
2017-07-25 02:41:47 +00:00
|
|
|
bool BaseHeap::Protect(uint32_t address, uint32_t size, uint32_t protect,
|
|
|
|
uint32_t* old_protect) {
|
2020-09-01 09:44:37 +00:00
|
|
|
if (!size) {
|
|
|
|
XELOGE("BaseHeap::Protect failed due to zero size");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// From the VirtualProtect MSDN page:
|
|
|
|
//
|
|
|
|
// "The region of affected pages includes all pages containing one or more
|
|
|
|
// bytes in the range from the lpAddress parameter to (lpAddress+dwSize).
|
|
|
|
// This means that a 2-byte range straddling a page boundary causes the
|
|
|
|
// protection attributes of both pages to be changed."
|
|
|
|
//
|
|
|
|
// "The access protection value can be set only on committed pages. If the
|
|
|
|
// state of any page in the specified region is not committed, the function
|
|
|
|
// fails and returns without modifying the access protection of any pages in
|
|
|
|
// the specified region."
|
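// Accordingly, the request below is widened to whole guest pages, and it is
// rejected unless every page is committed and belongs to the same region.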
|
|
|
|
2022-09-17 11:04:53 +00:00
|
|
|
uint32_t start_page_number = (address - heap_base_) >> page_size_shift_;
|
2020-09-01 09:44:37 +00:00
|
|
|
if (start_page_number >= page_table_.size()) {
|
|
|
|
XELOGE("BaseHeap::Protect failed due to out-of-bounds base address {:08X}",
|
|
|
|
address);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
uint32_t end_page_number =
|
2022-09-17 11:04:53 +00:00
|
|
|
uint32_t((uint64_t(address) + size - 1 - heap_base_) >> page_size_shift_);
|
2020-09-01 09:44:37 +00:00
|
|
|
if (end_page_number >= page_table_.size()) {
|
|
|
|
XELOGE(
|
|
|
|
"BaseHeap::Protect failed due to out-of-bounds range ({:08X} bytes "
|
|
|
|
"from {:08x})",
|
|
|
|
size, address);
|
|
|
|
return false;
|
|
|
|
}
|
2015-05-16 07:23:13 +00:00
|
|
|
|
2015-09-06 16:30:54 +00:00
|
|
|
auto global_lock = global_critical_region_.Acquire();
|
2015-05-16 07:23:13 +00:00
|
|
|
|
|
|
|
// Ensure all pages are in the same reserved region and all are committed.
|
|
|
|
uint32_t first_base_address = UINT_MAX;
|
|
|
|
for (uint32_t page_number = start_page_number; page_number <= end_page_number;
|
|
|
|
++page_number) {
|
|
|
|
auto page_entry = page_table_[page_number];
|
|
|
|
if (first_base_address == UINT_MAX) {
|
|
|
|
first_base_address = page_entry.base_address;
|
|
|
|
} else if (first_base_address != page_entry.base_address) {
|
|
|
|
XELOGE("BaseHeap::Protect failed due to request spanning regions");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (!(page_entry.state & kMemoryAllocationCommit)) {
|
|
|
|
XELOGE("BaseHeap::Protect failed due to uncommitted page");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
2022-09-17 11:04:53 +00:00
|
|
|
uint32_t xe_page_size = static_cast<uint32_t>(xe::memory::page_size());
|
|
|
|
|
|
|
|
uint32_t page_size_mask = xe_page_size - 1;
|
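// xe_page_size is a power of two, so `x & page_size_mask` is equivalent to
// `x % xe_page_size` in the alignment checks below.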
2015-05-16 07:23:13 +00:00
|
|
|
|
|
|
|
// Attempt host change (hopefully won't fail).
|
2015-05-19 03:25:15 +00:00
|
|
|
// We can only do this if our size matches system page granularity.
|
2020-09-01 09:44:37 +00:00
|
|
|
uint32_t page_count = end_page_number - start_page_number + 1;
|
2022-09-17 11:04:53 +00:00
|
|
|
if (page_size_ == xe_page_size ||
|
|
|
|
((((page_count << page_size_shift_) & page_size_mask) == 0) &&
|
|
|
|
(((start_page_number << page_size_shift_) & page_size_mask) == 0))) {
|
2017-07-25 02:41:47 +00:00
|
|
|
memory::PageAccess old_protect_access;
|
2022-09-17 11:04:53 +00:00
|
|
|
if (!xe::memory::Protect(
|
|
|
|
TranslateRelative(start_page_number << page_size_shift_),
|
|
|
|
page_count << page_size_shift_, ToPageAccess(protect),
|
|
|
|
old_protect ? &old_protect_access : nullptr)) {
|
2015-05-19 03:25:15 +00:00
|
|
|
XELOGE("BaseHeap::Protect failed due to host VirtualProtect failure");
|
|
|
|
return false;
|
|
|
|
}
|
2017-07-25 02:41:47 +00:00
|
|
|
|
|
|
|
if (old_protect) {
|
|
|
|
*old_protect = FromPageAccess(old_protect_access);
|
|
|
|
}
|
2015-05-19 03:25:15 +00:00
|
|
|
} else {
|
2017-07-25 02:41:47 +00:00
|
|
|
XELOGW("BaseHeap::Protect: ignoring request as not 4k page aligned");
|
|
|
|
return false;
|
2013-12-07 06:57:16 +00:00
|
|
|
}
|
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
// Perform table change.
|
|
|
|
for (uint32_t page_number = start_page_number; page_number <= end_page_number;
|
|
|
|
++page_number) {
|
|
|
|
auto& page_entry = page_table_[page_number];
|
|
|
|
page_entry.current_protect = protect;
|
2013-12-07 06:57:16 +00:00
|
|
|
}
|
2015-05-16 07:23:13 +00:00
|
|
|
|
|
|
|
return true;
|
2013-12-07 06:57:16 +00:00
|
|
|
}
|
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
bool BaseHeap::QueryRegionInfo(uint32_t base_address,
|
|
|
|
HeapAllocationInfo* out_info) {
|
2022-09-17 11:04:53 +00:00
|
|
|
uint32_t start_page_number = (base_address - heap_base_) >> page_size_shift_;
|
2015-05-16 07:23:13 +00:00
|
|
|
if (start_page_number > page_table_.size()) {
|
|
|
|
XELOGE("BaseHeap::QueryRegionInfo base page out of range");
|
|
|
|
return false;
|
2013-10-21 07:57:48 +00:00
|
|
|
}
|
|
|
|
|
2015-09-06 16:30:54 +00:00
|
|
|
auto global_lock = global_critical_region_.Acquire();
|
2015-05-16 07:23:13 +00:00
|
|
|
|
|
|
|
auto start_page_entry = page_table_[start_page_number];
|
|
|
|
out_info->base_address = base_address;
|
|
|
|
out_info->allocation_base = 0;
|
|
|
|
out_info->allocation_protect = 0;
|
|
|
|
out_info->region_size = 0;
|
|
|
|
out_info->state = 0;
|
|
|
|
out_info->protect = 0;
|
|
|
|
if (start_page_entry.state) {
|
|
|
|
// Committed/reserved region.
|
2021-12-30 17:17:48 +00:00
|
|
|
out_info->allocation_base =
|
2022-09-17 11:04:53 +00:00
|
|
|
heap_base_ + (start_page_entry.base_address << page_size_shift_);
|
2015-05-16 07:23:13 +00:00
|
|
|
out_info->allocation_protect = start_page_entry.allocation_protect;
|
2022-09-17 11:04:53 +00:00
|
|
|
out_info->allocation_size = start_page_entry.region_page_count
|
|
|
|
<< page_size_shift_;
|
2015-05-16 07:23:13 +00:00
|
|
|
out_info->state = start_page_entry.state;
|
|
|
|
out_info->protect = start_page_entry.current_protect;
|
2018-02-11 01:14:58 +00:00
|
|
|
|
|
|
|
// Scan forward and report the size of the region matching the initial
|
|
|
|
// base address's attributes.
|
2015-05-16 07:23:13 +00:00
|
|
|
for (uint32_t page_number = start_page_number;
|
2018-02-11 01:14:58 +00:00
|
|
|
page_number <
|
|
|
|
start_page_entry.base_address + start_page_entry.region_page_count;
|
2015-05-16 07:23:13 +00:00
|
|
|
++page_number) {
|
|
|
|
auto page_entry = page_table_[page_number];
|
|
|
|
if (page_entry.base_address != start_page_entry.base_address ||
|
|
|
|
page_entry.state != start_page_entry.state ||
|
|
|
|
page_entry.current_protect != start_page_entry.current_protect) {
|
|
|
|
// Different region or different properties within the region; done.
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
out_info->region_size += page_size_;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// Free region.
|
|
|
|
for (uint32_t page_number = start_page_number;
|
|
|
|
page_number < page_table_.size(); ++page_number) {
|
|
|
|
auto page_entry = page_table_[page_number];
|
|
|
|
if (page_entry.state) {
|
|
|
|
// First non-free page; done with region.
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
out_info->region_size += page_size_;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
2013-10-21 07:57:48 +00:00
|
|
|
}
|
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
bool BaseHeap::QuerySize(uint32_t address, uint32_t* out_size) {
|
2022-09-17 11:04:53 +00:00
|
|
|
uint32_t page_number = (address - heap_base_) >> page_size_shift_;
|
2015-05-16 07:23:13 +00:00
|
|
|
if (page_number > page_table_.size()) {
|
|
|
|
XELOGE("BaseHeap::QuerySize base page out of range");
|
|
|
|
*out_size = 0;
|
|
|
|
return false;
|
2013-10-21 07:57:48 +00:00
|
|
|
}
|
2015-09-06 16:30:54 +00:00
|
|
|
auto global_lock = global_critical_region_.Acquire();
|
2015-05-16 07:23:13 +00:00
|
|
|
auto page_entry = page_table_[page_number];
|
2022-09-17 11:04:53 +00:00
|
|
|
*out_size = (page_entry.region_page_count << page_size_shift_);
|
2015-05-16 07:23:13 +00:00
|
|
|
return true;
|
|
|
|
}
|
2013-10-22 02:28:25 +00:00
|
|
|
|
2018-02-11 03:58:44 +00:00
|
|
|
bool BaseHeap::QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size) {
|
2022-09-17 11:04:53 +00:00
|
|
|
uint32_t page_number = (*in_out_address - heap_base_) >> page_size_shift_;
|
2018-02-11 03:58:44 +00:00
|
|
|
if (page_number > page_table_.size()) {
|
|
|
|
XELOGE("BaseHeap::QuerySize base page out of range");
|
|
|
|
*out_size = 0;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
auto global_lock = global_critical_region_.Acquire();
|
|
|
|
auto page_entry = page_table_[page_number];
|
2022-09-17 11:04:53 +00:00
|
|
|
*in_out_address = (page_entry.base_address << page_size_shift_);
|
|
|
|
*out_size = (page_entry.region_page_count << page_size_shift_);
|
2018-02-11 03:58:44 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
bool BaseHeap::QueryProtect(uint32_t address, uint32_t* out_protect) {
|
2022-09-17 11:04:53 +00:00
|
|
|
uint32_t page_number = (address - heap_base_) >> page_size_shift_;
|
2015-05-16 07:23:13 +00:00
|
|
|
if (page_number > page_table_.size()) {
|
|
|
|
XELOGE("BaseHeap::QueryProtect base page out of range");
|
|
|
|
*out_protect = 0;
|
|
|
|
return false;
|
2014-08-19 05:12:21 +00:00
|
|
|
}
|
2015-09-06 16:30:54 +00:00
|
|
|
auto global_lock = global_critical_region_.Acquire();
|
2015-05-16 07:23:13 +00:00
|
|
|
auto page_entry = page_table_[page_number];
|
|
|
|
*out_protect = page_entry.current_protect;
|
|
|
|
return true;
|
|
|
|
}
|
2014-08-19 05:12:21 +00:00
|
|
|
|
2020-02-22 15:06:56 +00:00
|
|
|
xe::memory::PageAccess BaseHeap::QueryRangeAccess(uint32_t low_address,
|
|
|
|
uint32_t high_address) {
|
2020-02-22 15:12:46 +00:00
|
|
|
if (low_address > high_address || low_address < heap_base_ ||
|
2020-02-22 15:06:56 +00:00
|
|
|
(high_address - heap_base_) >= heap_size_) {
|
|
|
|
return xe::memory::PageAccess::kNoAccess;
|
|
|
|
}
|
2022-09-17 11:04:53 +00:00
|
|
|
uint32_t low_page_number = (low_address - heap_base_) >> page_size_shift_;
|
|
|
|
uint32_t high_page_number = (high_address - heap_base_) >> page_size_shift_;
|
2020-02-22 15:06:56 +00:00
|
|
|
uint32_t protect = kMemoryProtectRead | kMemoryProtectWrite;
|
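// The range's effective access is the intersection (bitwise AND) of every
// page's current protection; the loop exits early once no rights remain.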
|
|
|
{
|
|
|
|
auto global_lock = global_critical_region_.Acquire();
|
|
|
|
for (uint32_t i = low_page_number; protect && i <= high_page_number; ++i) {
|
|
|
|
protect &= page_table_[i].current_protect;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return ToPageAccess(protect);
|
|
|
|
}
|
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
VirtualHeap::VirtualHeap() = default;
|
2013-12-07 06:57:16 +00:00
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
VirtualHeap::~VirtualHeap() = default;
|
|
|
|
|
2019-07-30 05:00:20 +00:00
|
|
|
void VirtualHeap::Initialize(Memory* memory, uint8_t* membase,
|
2020-09-16 18:09:32 +00:00
|
|
|
HeapType heap_type, uint32_t heap_base,
|
|
|
|
uint32_t heap_size, uint32_t page_size) {
|
|
|
|
BaseHeap::Initialize(memory, membase, heap_type, heap_base, heap_size,
|
|
|
|
page_size);
|
2013-10-21 07:57:48 +00:00
|
|
|
}
|
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
PhysicalHeap::PhysicalHeap() : parent_heap_(nullptr) {}
|
2013-10-21 07:57:48 +00:00
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
PhysicalHeap::~PhysicalHeap() = default;
|
|
|
|
|
2019-07-30 05:00:20 +00:00
|
|
|
void PhysicalHeap::Initialize(Memory* memory, uint8_t* membase,
|
2020-09-16 18:09:32 +00:00
|
|
|
HeapType heap_type, uint32_t heap_base,
|
|
|
|
uint32_t heap_size, uint32_t page_size,
|
|
|
|
VirtualHeap* parent_heap) {
|
2019-08-04 20:55:54 +00:00
|
|
|
uint32_t host_address_offset;
|
2019-08-15 20:55:33 +00:00
|
|
|
if (heap_base >= 0xE0000000 &&
|
2019-08-04 20:10:59 +00:00
|
|
|
xe::memory::allocation_granularity() > 0x1000) {
|
2019-08-04 20:55:54 +00:00
|
|
|
host_address_offset = 0x1000;
|
2019-08-04 20:10:59 +00:00
|
|
|
} else {
|
2019-08-04 20:55:54 +00:00
|
|
|
host_address_offset = 0;
|
2019-08-04 20:10:59 +00:00
|
|
|
}
|
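// Presumably this compensates for the 0xE0000000 physical view being mapped at
// a 4 KiB offset from its backing memory: when the host allocation granularity
// is coarser than 4 KiB, the offset must be applied in software via
// host_address_offset instead of in the mapping itself.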
|
|
|
|
2020-09-16 18:09:32 +00:00
|
|
|
BaseHeap::Initialize(memory, membase, heap_type, heap_base, heap_size,
|
|
|
|
page_size, host_address_offset);
|
2019-08-04 20:55:54 +00:00
|
|
|
parent_heap_ = parent_heap;
|
|
|
|
system_page_size_ = uint32_t(xe::memory::page_size());
|
2022-09-17 11:04:53 +00:00
|
|
|
xenia_assert(xe::is_pow2(system_page_size_));
|
|
|
|
system_page_shift_ = xe::log2_floor(system_page_size_);
|
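// e.g. a 4096-byte host page gives a shift of 12, so
// `offset >> system_page_shift_` yields the host page index.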
2019-08-04 20:55:54 +00:00
|
|
|
|
|
|
|
system_page_count_ =
|
2020-02-22 11:55:28 +00:00
|
|
|
(size_t(heap_size_) + host_address_offset + (system_page_size_ - 1)) /
|
2019-08-04 20:55:54 +00:00
|
|
|
system_page_size_;
|
2020-02-15 18:35:24 +00:00
|
|
|
system_page_flags_.resize((system_page_count_ + 63) / 64);
|
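// One flag bit per host page, packed 64 to a word; the +63 rounds up so the
// final partial word is still allocated.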
2015-05-16 07:23:13 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
bool PhysicalHeap::Alloc(uint32_t size, uint32_t alignment,
|
|
|
|
uint32_t allocation_type, uint32_t protect,
|
|
|
|
bool top_down, uint32_t* out_address) {
|
|
|
|
*out_address = 0;
|
|
|
|
|
2019-04-17 19:49:29 +00:00
|
|
|
// Default top-down. Since parent heap is bottom-up this prevents
|
|
|
|
// collisions.
|
2015-05-16 07:23:13 +00:00
|
|
|
top_down = true;
|
|
|
|
|
|
|
|
// Adjust alignment and size since our page size differs from the parent's.
|
|
|
|
size = xe::round_up(size, page_size_);
|
|
|
|
alignment = xe::round_up(alignment, page_size_);
|
|
|
|
|
2015-09-06 16:30:54 +00:00
|
|
|
auto global_lock = global_critical_region_.Acquire();
|
2015-05-16 07:23:13 +00:00
|
|
|
|
|
|
|
// Allocate from parent heap (gets our physical address in 0-512mb).
|
2019-08-15 20:55:33 +00:00
|
|
|
uint32_t parent_heap_start = GetPhysicalAddress(heap_base_);
|
2020-02-22 11:55:28 +00:00
|
|
|
uint32_t parent_heap_end = GetPhysicalAddress(heap_base_ + (heap_size_ - 1));
|
2015-05-16 07:23:13 +00:00
|
|
|
uint32_t parent_address;
|
2019-08-15 20:55:33 +00:00
|
|
|
if (!parent_heap_->AllocRange(parent_heap_start, parent_heap_end, size,
|
2015-05-16 07:23:13 +00:00
|
|
|
alignment, allocation_type, protect, top_down,
|
|
|
|
&parent_address)) {
|
|
|
|
XELOGE(
|
|
|
|
"PhysicalHeap::Alloc unable to alloc physical memory in parent heap");
|
|
|
|
return false;
|
2013-10-21 07:57:48 +00:00
|
|
|
}
|
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
// Given the address we've reserved in the parent heap, pin that here.
|
|
|
|
// Shouldn't be possible for it to be allocated already.
|
2019-08-15 20:55:33 +00:00
|
|
|
uint32_t address = heap_base_ + parent_address - parent_heap_start;
|
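// Translate the physical address handed back by the parent heap into this
// heap's guest address range (the two ranges differ only by a constant base).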
2015-05-16 07:23:13 +00:00
|
|
|
if (!BaseHeap::AllocFixed(address, size, alignment, allocation_type,
|
|
|
|
protect)) {
|
|
|
|
XELOGE(
|
|
|
|
"PhysicalHeap::Alloc unable to pin physical memory in physical heap");
|
|
|
|
// TODO(benvanik): don't leak parent memory.
|
|
|
|
return false;
|
2013-10-27 19:06:02 +00:00
|
|
|
}
|
2015-05-16 07:23:13 +00:00
|
|
|
*out_address = address;
|
|
|
|
return true;
|
|
|
|
}
|
2013-10-27 19:06:02 +00:00
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
bool PhysicalHeap::AllocFixed(uint32_t base_address, uint32_t size,
|
|
|
|
uint32_t alignment, uint32_t allocation_type,
|
|
|
|
uint32_t protect) {
|
|
|
|
// Adjust alignment and size since our page size differs from the parent's.
|
|
|
|
size = xe::round_up(size, page_size_);
|
|
|
|
alignment = xe::round_up(alignment, page_size_);
|
|
|
|
|
2015-09-06 16:30:54 +00:00
|
|
|
auto global_lock = global_critical_region_.Acquire();
|
2015-05-16 07:23:13 +00:00
|
|
|
|
|
|
|
// Allocate from parent heap (gets our physical address in 0-512mb).
|
|
|
|
// NOTE: this can potentially overwrite heap contents if there are already
|
|
|
|
// committed pages in the requested physical range.
|
|
|
|
// TODO(benvanik): flag for ensure-not-committed?
|
|
|
|
uint32_t parent_base_address = GetPhysicalAddress(base_address);
|
|
|
|
if (!parent_heap_->AllocFixed(parent_base_address, size, alignment,
|
|
|
|
allocation_type, protect)) {
|
|
|
|
XELOGE(
|
|
|
|
"PhysicalHeap::Alloc unable to alloc physical memory in parent heap");
|
|
|
|
return false;
|
2013-10-21 07:57:48 +00:00
|
|
|
}
|
2015-05-16 07:23:13 +00:00
|
|
|
|
|
|
|
// Given the address we've reserved in the parent heap, pin that here.
|
|
|
|
// Shouldn't be possible for it to be allocated already.
|
2019-08-15 20:55:33 +00:00
|
|
|
uint32_t address =
|
|
|
|
heap_base_ + parent_base_address - GetPhysicalAddress(heap_base_);
|
2015-05-16 07:23:13 +00:00
|
|
|
if (!BaseHeap::AllocFixed(address, size, alignment, allocation_type,
|
|
|
|
protect)) {
|
|
|
|
XELOGE(
|
|
|
|
"PhysicalHeap::Alloc unable to pin physical memory in physical heap");
|
|
|
|
// TODO(benvanik): don't leak parent memory.
|
|
|
|
return false;
|
2013-10-21 07:57:48 +00:00
|
|
|
}
|
2013-10-22 02:28:25 +00:00
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool PhysicalHeap::AllocRange(uint32_t low_address, uint32_t high_address,
|
|
|
|
uint32_t size, uint32_t alignment,
|
|
|
|
uint32_t allocation_type, uint32_t protect,
|
|
|
|
bool top_down, uint32_t* out_address) {
|
|
|
|
*out_address = 0;
|
|
|
|
|
|
|
|
// Adjust alignment and size since our page size differs from the parent's.
|
|
|
|
size = xe::round_up(size, page_size_);
|
|
|
|
alignment = xe::round_up(alignment, page_size_);
|
|
|
|
|
2015-09-06 16:30:54 +00:00
|
|
|
auto global_lock = global_critical_region_.Acquire();
|
2015-05-16 07:23:13 +00:00
|
|
|
|
|
|
|
// Allocate from parent heap (gets our physical address in 0-512mb).
|
|
|
|
low_address = std::max(heap_base_, low_address);
|
2020-02-22 11:55:28 +00:00
|
|
|
high_address = std::min(heap_base_ + (heap_size_ - 1), high_address);
|
2015-05-16 07:23:13 +00:00
|
|
|
uint32_t parent_low_address = GetPhysicalAddress(low_address);
|
|
|
|
uint32_t parent_high_address = GetPhysicalAddress(high_address);
|
|
|
|
uint32_t parent_address;
|
|
|
|
if (!parent_heap_->AllocRange(parent_low_address, parent_high_address, size,
|
|
|
|
alignment, allocation_type, protect, top_down,
|
|
|
|
&parent_address)) {
|
|
|
|
XELOGE(
|
|
|
|
"PhysicalHeap::Alloc unable to alloc physical memory in parent heap");
|
|
|
|
return false;
|
2013-10-22 02:28:25 +00:00
|
|
|
}
|
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
// Given the address we've reserved in the parent heap, pin that here.
|
|
|
|
// Shouldn't be possible for it to be allocated already.
|
2019-08-15 20:55:33 +00:00
|
|
|
uint32_t address =
|
|
|
|
heap_base_ + parent_address - GetPhysicalAddress(heap_base_);
|
2015-05-16 07:23:13 +00:00
|
|
|
if (!BaseHeap::AllocFixed(address, size, alignment, allocation_type,
|
|
|
|
protect)) {
|
|
|
|
XELOGE(
|
|
|
|
"PhysicalHeap::Alloc unable to pin physical memory in physical heap");
|
|
|
|
// TODO(benvanik): don't leak parent memory.
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
*out_address = address;
|
|
|
|
return true;
|
2013-10-21 07:57:48 +00:00
|
|
|
}
|
2013-10-23 06:34:24 +00:00
|
|
|
|
2022-02-14 18:26:31 +00:00
|
|
|
bool PhysicalHeap::AllocSystemHeap(uint32_t size, uint32_t alignment,
|
2022-08-13 19:59:00 +00:00
|
|
|
uint32_t allocation_type, uint32_t protect,
|
|
|
|
bool top_down, uint32_t* out_address) {
|
|
|
|
return Alloc(size, alignment, allocation_type, protect, top_down,
|
|
|
|
out_address);
|
2022-02-14 18:26:31 +00:00
|
|
|
}
|
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
bool PhysicalHeap::Decommit(uint32_t address, uint32_t size) {
|
2015-09-06 16:30:54 +00:00
|
|
|
auto global_lock = global_critical_region_.Acquire();
|
2020-02-23 22:04:30 +00:00
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
uint32_t parent_address = GetPhysicalAddress(address);
|
|
|
|
if (!parent_heap_->Decommit(parent_address, size)) {
|
|
|
|
XELOGE("PhysicalHeap::Decommit failed due to parent heap failure");
|
|
|
|
return false;
|
|
|
|
}
|
2020-02-23 22:04:30 +00:00
|
|
|
|
|
|
|
// Not caring about the contents anymore.
|
|
|
|
TriggerCallbacks(std::move(global_lock), address, size, true, true);
|
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
return BaseHeap::Decommit(address, size);
|
|
|
|
}
|
2014-01-05 19:19:02 +00:00
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
bool PhysicalHeap::Release(uint32_t base_address, uint32_t* out_region_size) {
|
2015-09-06 16:30:54 +00:00
|
|
|
auto global_lock = global_critical_region_.Acquire();
|
2020-02-23 22:04:30 +00:00
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
uint32_t parent_base_address = GetPhysicalAddress(base_address);
|
|
|
|
if (!parent_heap_->Release(parent_base_address, out_region_size)) {
|
|
|
|
XELOGE("PhysicalHeap::Release failed due to parent heap failure");
|
|
|
|
return false;
|
2014-01-05 19:19:02 +00:00
|
|
|
}
|
2020-02-23 22:04:30 +00:00
|
|
|
|
|
|
|
// Must invalidate here because the range being released may be reused in
|
|
|
|
// another mapping of physical memory - but callback flags are set in each
|
|
|
|
// heap separately (https://github.com/xenia-project/xenia/issues/1559 -
|
2021-09-05 18:03:05 +00:00
|
|
|
// dynamic vertices in 4D5307F2 start screen and menu allocated in 0xA0000000
|
|
|
|
// at addresses that overlap intro video textures in 0xE0000000, with the
|
|
|
|
// state of the allocator as of February 24th, 2020). If memory is invalidated
|
|
|
|
// in Alloc instead, Alloc won't be aware of callbacks enabled in other heaps,
|
|
|
|
// thus callback handlers will keep considering this range valid forever.
|
2020-02-23 22:04:30 +00:00
|
|
|
uint32_t region_size;
|
|
|
|
if (QuerySize(base_address, &region_size)) {
|
|
|
|
TriggerCallbacks(std::move(global_lock), base_address, region_size, true,
|
|
|
|
true);
|
|
|
|
}
|
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
return BaseHeap::Release(base_address, out_region_size);
|
|
|
|
}
|
2014-01-05 19:19:02 +00:00
|
|
|
|
2017-07-25 02:41:47 +00:00
|
|
|
bool PhysicalHeap::Protect(uint32_t address, uint32_t size, uint32_t protect,
|
|
|
|
uint32_t* old_protect) {
|
2015-09-06 16:30:54 +00:00
|
|
|
auto global_lock = global_critical_region_.Acquire();
|
2015-11-09 01:14:06 +00:00
|
|
|
|
2019-11-10 11:21:36 +00:00
|
|
|
// Only invalidate if making writable again, for simplicity - not when simply
|
|
|
|
// marking some range as immutable, for instance.
|
|
|
|
if (protect & kMemoryProtectWrite) {
|
2020-02-15 18:35:24 +00:00
|
|
|
TriggerCallbacks(std::move(global_lock), address, size, true, true, false);
|
2019-11-10 11:21:36 +00:00
|
|
|
}
|
2018-09-24 20:18:16 +00:00
|
|
|
|
|
|
|
if (!parent_heap_->Protect(GetPhysicalAddress(address), size, protect,
|
|
|
|
old_protect)) {
|
2015-05-16 07:23:13 +00:00
|
|
|
XELOGE("PhysicalHeap::Protect failed due to parent heap failure");
|
|
|
|
return false;
|
2014-01-12 19:09:52 +00:00
|
|
|
}
|
2015-11-09 01:14:06 +00:00
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
return BaseHeap::Protect(address, size, protect);
|
2013-10-23 06:34:24 +00:00
|
|
|
}
|
2015-05-16 07:23:13 +00:00
|
|
|
|
2020-02-15 18:35:24 +00:00
|
|
|
void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
|
|
|
|
uint32_t length,
|
|
|
|
bool enable_invalidation_notifications,
|
|
|
|
bool enable_data_providers) {
|
|
|
|
// TODO(Triang3l): Implement data providers.
|
|
|
|
assert_false(enable_data_providers);
|
|
|
|
if (!enable_invalidation_notifications && !enable_data_providers) {
|
|
|
|
return;
|
|
|
|
}
|
2019-07-30 05:00:20 +00:00
|
|
|
uint32_t physical_address_offset = GetPhysicalAddress(heap_base_);
|
|
|
|
if (physical_address < physical_address_offset) {
|
|
|
|
if (physical_address_offset - physical_address >= length) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
length -= physical_address_offset - physical_address;
|
|
|
|
physical_address = physical_address_offset;
|
|
|
|
}
|
|
|
|
uint32_t heap_relative_address = physical_address - physical_address_offset;
|
2020-02-22 11:55:28 +00:00
|
|
|
if (heap_relative_address >= heap_size_) {
|
2018-09-24 20:18:16 +00:00
|
|
|
return;
|
|
|
|
}
|
2020-02-22 11:55:28 +00:00
|
|
|
length = std::min(length, heap_size_ - heap_relative_address);
|
2019-07-30 05:00:20 +00:00
|
|
|
if (length == 0) {
|
2018-09-24 20:18:16 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2019-07-30 05:00:20 +00:00
|
|
|
uint32_t system_page_first =
|
2022-09-17 11:04:53 +00:00
|
|
|
(heap_relative_address + host_address_offset()) >> system_page_shift_;
|
|
|
|
swcache::PrefetchL1(&system_page_flags_[system_page_first >> 6]);
|
2019-07-30 05:00:20 +00:00
|
|
|
uint32_t system_page_last =
|
2022-09-17 11:04:53 +00:00
|
|
|
(heap_relative_address + length - 1 + host_address_offset()) >>
|
|
|
|
system_page_shift_;
|
2019-07-30 05:00:20 +00:00
|
|
|
system_page_last = std::min(system_page_last, system_page_count_ - 1);
|
|
|
|
assert_true(system_page_first <= system_page_last);
|
2018-09-24 20:18:16 +00:00
|
|
|
|
2020-02-15 18:35:24 +00:00
|
|
|
// Update callback flags for system pages and make their protection stricter
|
|
|
|
// if needed.
|
|
|
|
xe::memory::PageAccess protect_access =
|
|
|
|
enable_data_providers ? xe::memory::PageAccess::kNoAccess
|
|
|
|
: xe::memory::PageAccess::kReadOnly;
|
2022-09-17 11:04:53 +00:00
|
|
|
|
|
|
|
auto global_lock = global_critical_region_.Acquire();
|
|
|
|
if (enable_invalidation_notifications) {
|
|
|
|
EnableAccessCallbacksInner<true>(system_page_first, system_page_last,
|
|
|
|
protect_access);
|
|
|
|
} else {
|
|
|
|
EnableAccessCallbacksInner<false>(system_page_first, system_page_last,
|
|
|
|
protect_access);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
template <bool enable_invalidation_notifications>
|
|
|
|
XE_NOINLINE void PhysicalHeap::EnableAccessCallbacksInner(
|
|
|
|
const uint32_t system_page_first, const uint32_t system_page_last,
|
|
|
|
xe::memory::PageAccess protect_access) XE_RESTRICT {
|
2019-08-16 05:49:48 +00:00
|
|
|
uint8_t* protect_base = membase_ + heap_base_;
|
2019-07-30 05:00:20 +00:00
|
|
|
uint32_t protect_system_page_first = UINT32_MAX;
|
2022-09-17 11:04:53 +00:00
|
|
|
|
|
|
|
SystemPageFlagsBlock* XE_RESTRICT sys_page_flags = system_page_flags_.data();
|
|
|
|
PageEntry* XE_RESTRICT page_table_ptr = page_table_.data();
|
|
|
|
|
|
|
|
// chrispy: a lot of time is spent in this loop, and I think some of the work
|
|
|
|
// may be avoidable and repetitive; profiling shows quite a bit of time spent
|
|
|
|
// in this loop, but very little spent actually calling Protect.
|
|
|
|
uint32_t i = system_page_first;
|
|
|
|
|
|
|
|
uint32_t first_guest_page = SystemPagenumToGuestPagenum(system_page_first);
|
|
|
|
uint32_t last_guest_page = SystemPagenumToGuestPagenum(system_page_last);
|
|
|
|
|
"Fix" debug console, we were checking the cvar before any cvars were loaded, and the condition it checks in AttachConsole is somehow always false
Remove dead #if 0'd code in math.h
On amd64, page_size == 4096 constant, on amd64 w/ win32, allocation_granularity == 65536. These values for x86 windows havent changed over the last 20 years so this is probably safe
and gives a modest code size reduction
Enable XE_USE_KUSER_SHARED. This sources host time from KUSER_SHARED instead of from QueryPerformanceCounter, which is far faster, but only has a granularity of 100 nanoseconds.
In some games seemingly random crashes were happening that were hard to trace because
the faulting thread was actually not the one that was misbehaving, another threads stack was underflowing into the faulting thread.
Added a bunch of code to synchronize the guest stack and host stack so that if a guest longjmps the host's stack will be adjusted.
Changes were also made to allow the guest to call into a piece of an existing x64 function.
This synchronization might have a slight performance impact on lower end cpus, to disable it set enable_host_guest_stack_synchronization to false.
It is possible it may have introduced regressions, but i dont know of any yet
So far, i know the synchronization change fixes the "hub crash" in super sonic and allows the game "london 2012" to go ingame.
Removed emit_useless_fpscr_updates, not emitting these updates breaks the raiden game
MapGuestAddressToMachineCode now returns nullptr if no address was found, instead of the start of the function
add Processor::LookupModule
Add Backend::DeinitializeBackendContext
Use WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF> in WriteRegisterRangeFromRing for inlining (previously regressed on performance of ExecutePacketType0)
add notes about flags that trap in XamInputGetCapabilities
0 == 3 in XamInputGetCapabilities
Name arg 2 of XamInputSetState
PrefetchW in critical section kernel funcs if available & doing cmpxchg
Add terminated field to X_KTHREAD, set it on termination
Expanded the logic of NtResumeThread/NtSuspendThread to include checking the type of the handle (in release, LookupObject doesnt seem to do anything with the type)
and returning X_STATUS_OBJECT_TYPE_MISMATCH if invalid. Do termination check in NtSuspendThread.
Add basic host exception messagebox, need to flesh it out more (maybe use the new stack tracking stuff if on guest thrd?)
Add rdrand patching hack, mostly affects users with nvidia cards who have many threads on zen
Use page_size_shift in more places
Once again disable precompilation! Raiden is mostly weird ppc asm which probably breaks the precompilation. The code is still useful for running the compiler over the whole of an xex in debug to test for issues
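For reference, a minimal sketch of the KUSER_SHARED time-read technique mentioned above, assuming the standard Windows x64 details (the fixed 0x7FFE0000 user-mode mapping and the documented KSYSTEM_TIME layout at offset 0x14); this is an illustration, not the emulator's actual XE_USE_KUSER_SHARED code.

#include <cstdint>

namespace {
// Layout of KSYSTEM_TIME as published in the Windows SDK/WDK headers.
struct KSystemTime {
  volatile uint32_t LowPart;
  volatile int32_t High1Time;
  volatile int32_t High2Time;
};

// Reads the 100ns-unit system time from KUSER_SHARED_DATA without a syscall.
// The kernel writes High1Time, then LowPart, then High2Time; readers retry
// until both high words match so they never observe a torn update.
inline uint64_t ReadKUserSharedSystemTime() {
  auto* system_time = reinterpret_cast<const KSystemTime*>(
      uintptr_t(0x7FFE0000) + 0x14);  // KUSER_SHARED_DATA::SystemTime.
  while (true) {
    int32_t high1 = system_time->High1Time;
    uint32_t low = system_time->LowPart;
    int32_t high2 = system_time->High2Time;
    if (high1 == high2) {
      return (uint64_t(uint32_t(high1)) << 32) | low;
    }
    // A tick landed mid-read; just retry.
  }
}
}  // namespace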
"Fix" debug console, we were checking the cvar before any cvars were loaded, and the condition it checks in AttachConsole is somehow always false
Remove dead #if 0'd code in math.h
On amd64, page_size == 4096 constant, on amd64 w/ win32, allocation_granularity == 65536. These values for x86 windows havent changed over the last 20 years so this is probably safe
and gives a modest code size reduction
Enable XE_USE_KUSER_SHARED. This sources host time from KUSER_SHARED instead of from QueryPerformanceCounter, which is far faster, but only has a granularity of 100 nanoseconds.
In some games seemingly random crashes were happening that were hard to trace because
the faulting thread was actually not the one that was misbehaving, another threads stack was underflowing into the faulting thread.
Added a bunch of code to synchronize the guest stack and host stack so that if a guest longjmps the host's stack will be adjusted.
Changes were also made to allow the guest to call into a piece of an existing x64 function.
This synchronization might have a slight performance impact on lower end cpus, to disable it set enable_host_guest_stack_synchronization to false.
It is possible it may have introduced regressions, but i dont know of any yet
So far, i know the synchronization change fixes the "hub crash" in super sonic and allows the game "london 2012" to go ingame.
Removed emit_useless_fpscr_updates, not emitting these updates breaks the raiden game
MapGuestAddressToMachineCode now returns nullptr if no address was found, instead of the start of the function
add Processor::LookupModule
Add Backend::DeinitializeBackendContext
Use WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF> in WriteRegisterRangeFromRing for inlining (previously regressed on performance of ExecutePacketType0)
add notes about flags that trap in XamInputGetCapabilities
0 == 3 in XamInputGetCapabilities
Name arg 2 of XamInputSetState
PrefetchW in critical section kernel funcs if available & doing cmpxchg
Add terminated field to X_KTHREAD, set it on termination
Expanded the logic of NtResumeThread/NtSuspendThread to include checking the type of the handle (in release, LookupObject doesnt seem to do anything with the type)
and returning X_STATUS_OBJECT_TYPE_MISMATCH if invalid. Do termination check in NtSuspendThread.
Add basic host exception messagebox, need to flesh it out more (maybe use the new stack tracking stuff if on guest thrd?)
Add rdrand patching hack, mostly affects users with nvidia cards who have many threads on zen
Use page_size_shift in more places
Once again disable precompilation! Raiden is mostly weird ppc asm which probably breaks the precompilation. The code is still useful for running the compiler over the whole of an xex in debug to test for issues
2022-11-27 17:37:06 +00:00
|
|
|
uint32_t guest_one = SystemPagenumToGuestPagenum(1);
|
2022-09-17 11:04:53 +00:00
|
|
|
|
|
|
|
uint32_t system_one = GuestPagenumToSystemPagenum(1);
|
|
|
|
for (; i <= system_page_last; ++i) {
|
2020-02-15 18:35:24 +00:00
|
|
|
// Check if need to enable callbacks for the page and raise its protection.
|
|
|
|
//
|
|
|
|
// If enabling invalidation notifications:
|
|
|
|
// - Page writable and not watched for changes yet - protect and enable
|
|
|
|
// invalidation notifications.
|
|
|
|
// - Page seen as writable by the guest, but only needs data providers -
|
|
|
|
// just set the bits to enable invalidation notifications (already has
|
|
|
|
// even stricter protection than needed).
|
|
|
|
// - Page not writable as requested by the game - don't do anything (need
|
|
|
|
// real access violations here).
|
|
|
|
// If enabling data providers:
|
|
|
|
// - Page accessible (either read/write or read-only) and didn't need data
|
|
|
|
// providers initially - protect and enable data providers.
|
|
|
|
// - Otherwise - do nothing.
|
|
|
|
//
|
|
|
|
// It's safe not to await data provider completion here before protecting as
|
|
|
|
// this never makes protection lighter, so it can't interfere with page
|
|
|
|
// faults that await data providers.
|
|
|
|
//
|
|
|
|
// Enabling data providers doesn't need to be deferred - providers will be
|
|
|
|
// polled for the last time without releasing the lock.
|
2022-09-17 11:04:53 +00:00
|
|
|
SystemPageFlagsBlock& page_flags_block = sys_page_flags[i >> 6];
|
|
|
|
|
|
|
|
#if XE_ARCH_AMD64 == 1
|
|
|
|
// x86 shift instructions mask the count modulo the operand width (64 here),
// so the explicit & 63 is implicit in the hardware.
|
|
|
|
uint64_t page_flags_bit = uint64_t(1) << i;
|
|
|
|
#else
|
2020-02-15 18:35:24 +00:00
|
|
|
uint64_t page_flags_bit = uint64_t(1) << (i & 63);
|
2022-09-17 11:04:53 +00:00
|
|
|
#endif
|
|
|
|
|
|
|
|
uint32_t guest_page_number = SystemPagenumToGuestPagenum(i);
|
2020-02-15 18:35:24 +00:00
|
|
|
xe::memory::PageAccess current_page_access =
|
2022-09-17 11:04:53 +00:00
|
|
|
ToPageAccess(page_table_ptr[guest_page_number].current_protect);
|
2020-02-15 18:35:24 +00:00
|
|
|
bool protect_system_page = false;
|
|
|
|
// Don't do anything with inaccessible pages - don't protect, don't enable
|
|
|
|
// callbacks - because real access violations are needed there. And don't
|
|
|
|
// enable invalidation notifications for read-only pages for the same
|
|
|
|
// reason.
|
|
|
|
if (current_page_access != xe::memory::PageAccess::kNoAccess) {
|
|
|
|
// TODO(Triang3l): Enable data providers.
|
2022-09-17 11:04:53 +00:00
|
|
|
if constexpr (enable_invalidation_notifications) {
|
2020-02-15 18:35:24 +00:00
|
|
|
if (current_page_access != xe::memory::PageAccess::kReadOnly &&
|
|
|
|
(page_flags_block.notify_on_invalidation & page_flags_bit) == 0) {
|
|
|
|
// TODO(Triang3l): Check if data providers are already enabled.
|
|
|
|
// If data providers are already enabled for the page, it has even
|
|
|
|
// stricter protection.
|
|
|
|
protect_system_page = true;
|
|
|
|
page_flags_block.notify_on_invalidation |= page_flags_bit;
|
|
|
|
}
|
2019-07-30 05:00:20 +00:00
|
|
|
}
|
|
|
|
}
|
2020-02-15 18:35:24 +00:00
|
|
|
if (protect_system_page) {
|
2019-07-30 05:00:20 +00:00
|
|
|
if (protect_system_page_first == UINT32_MAX) {
|
|
|
|
protect_system_page_first = i;
|
2018-09-24 20:18:16 +00:00
|
|
|
}
|
|
|
|
} else {
|
2019-07-30 05:00:20 +00:00
|
|
|
if (protect_system_page_first != UINT32_MAX) {
|
|
|
|
xe::memory::Protect(
|
2022-09-17 11:04:53 +00:00
|
|
|
protect_base + (protect_system_page_first << system_page_shift_),
|
|
|
|
(i - protect_system_page_first) << system_page_shift_,
|
2020-02-15 18:35:24 +00:00
|
|
|
protect_access);
|
2019-07-30 05:00:20 +00:00
|
|
|
protect_system_page_first = UINT32_MAX;
|
2018-09-24 20:18:16 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2022-09-17 11:04:53 +00:00
|
|
|
|
2019-07-30 05:00:20 +00:00
|
|
|
if (protect_system_page_first != UINT32_MAX) {
|
|
|
|
xe::memory::Protect(
|
2022-09-17 11:04:53 +00:00
|
|
|
protect_base + (protect_system_page_first << system_page_shift_),
|
|
|
|
(system_page_last + 1 - protect_system_page_first)
|
|
|
|
<< system_page_shift_,
|
2020-02-15 18:35:24 +00:00
|
|
|
protect_access);
|
2018-09-24 20:18:16 +00:00
|
|
|
}
|
2019-07-30 05:00:20 +00:00
|
|
|
}
|
2020-02-15 18:35:24 +00:00
|
|
|
bool PhysicalHeap::TriggerCallbacks(
|
Use Sleep(0) instead of SwitchToThread; it should waste less power and help the OS with scheduling.
PM4 buffer handling made a virtual member of CommandProcessor, with the implementation/declaration placed into reusable macro files. This is probably the biggest boost here.
Optimized SET_CONSTANT/LOAD_CONSTANT PM4 ops based on the register range they start writing at; this was also a nice boost.
Expose x64 extension flags to code outside of the x64 backend, so we can detect and use things like AVX-512, XOP, AVX2, etc. in normal code.
Add freelists for HIR structures to try to reduce the number of last-level cache misses during optimization (currently disabled... fixme later).
Analyzed PGO feedback and, based on it, reordered branches, uninlined functions, and moved code out into different functions in the PM4 functions; this gave like a 2% boost at best.
Added support for the db16cyc opcode, which is used often in xb360 spinlocks. Before, it was just being translated to nop; now on x64 we translate it to _mm_pause, but we may change that in the future to reduce wasted CPU time.
Texture util: all our divisors were powers of 2, so instead we look up a shift. This made texture scaling slightly faster, more so on Intel processors, which seem to be worse at integer division. GetGuestTextureLayout is now a little faster, although it is still one of the heaviest functions in the emulator when scaling is on.
xe_unlikely_mutex was not a good choice for the guest clock lock; (running theory) on Intel processors another thread may take a significant time to update the clock? Maybe because of the uint64 division? Really not sure, but switched it to xe_mutex. This fixed audio stutter that I had introduced to 1 or 2 games and fixed performance on that N64 Rare game with the monkeys.
Took another crack at a DMA implementation; another failure.
Instead of passing it as a parameter, keep the ringbuffer reader as the first member of CommandProcessor so it can be accessed through this.
Added a macro for noalias.
Applied noalias to Memory::LookupHeap. This reduced the size of the executable by 7 kb.
Reworked the kernel shim template; this shaved like 100 kb off the exe and eliminated the indirect calls from the shim to the actual implementation. We still unconditionally generate string representations of kernel calls though :(, unless it is kHighFrequency.
Add nvapi extensions support, currently unused. Will use CPUVISIBLE memory at some point.
Inserted prefetches in a few places based on feedback from vtune.
Add a native implementation of SHA int8 if all elements are the same.
Vectorized comparisons for SetViewport and SetScissorRect.
Vectorized ranged comparisons for WriteRegister.
Add XE_MSVC_ASSUME.
Move FormatInfo::name out of the structure; instead, look up the name in a different table. Debug-related data and critical runtime data are best kept apart.
Templated UpdateSystemConstantValues based on ROV/RTV and primitive_polygonal.
Add ArchFloatMask functions; these are for storing the results of floating-point comparisons without doing costly float->int pipeline transfers (vucomiss/setb).
Use float masks in UpdateSystemConstantValues for checking if dirty; only transfer to int at the end of the function.
Instead of dirty |= (x != y) in UpdateSystemConstantValues, we now do dirty_u32 |= (x ^ y). If any of them are not equal, dirty_u32 will be nonzero; if they are all equal, it will be zero. This is more friendly to register renaming, and the lack of dependencies on EFLAGS lets the compiler reorder better (see the sketch after this list).
Add PrefetchSamplerParameters to D3D12TextureCache.
Use PrefetchSamplerParameters in UpdateBindings to eliminate cache misses that vtune detected.
Add PrefetchTextureBinding to D3D12TextureCache.
Prefetch texture bindings to get rid of more misses vtune detected (more accesses out of order with random strides).
Rewrote the DMAC; it's still terrible though and I have disabled it for now.
Replace a tiny memcmp of 6 u64s in render_target_cache with an inline loop; msvc fails to make it a loop and instead does a thunk to their memcmp function, which is optimized for larger sizes.
PrefetchTextureBinding in AreActiveTextureSRVKeysUpToDate.
Replace memcmp calls for the pipeline description with a handwritten compare.
Directly write some registers that don't have special handling in PM4 functions.
Changed EstimateMaxY to try to eliminate the mispredictions vtune was reporting; msvc ended up turning the changed code into a series of blends.
In ExecutePacketType3_EVENT_WRITE_EXT, instead of writing extents to an array on the stack and then doing xe_copy_and_swap_16 of the data to its destination, pre-swap each constant and then store those; msvc manages to unroll that into wider stores.
Stop logging XE_SWAP every time we receive XE_SWAP; stop logging the start and end of each viz query.
Prefetch watch nodes in FireWatches based on feedback from vtune.
Removed dead code from texture_info.cc.
NOINLINE on GpuSwap; PGO builds did it, so we should too.
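A minimal sketch of the XOR-based dirty accumulation described above; the SystemConstants fields and the UpdateConstants name are hypothetical, not the actual UpdateSystemConstantValues code.

#include <cstdint>

// Hypothetical constant block; the real system constants differ.
struct SystemConstants {
  uint32_t viewport_scale;
  uint32_t viewport_offset;
  uint32_t alpha_test_ref;
};

// Returns true if any field changed, updating the cached copy in place.
// Differences are accumulated as XORs into one integer register, so there is
// no per-field branch or EFLAGS dependency; the single test happens at the end.
inline bool UpdateConstants(SystemConstants& cached,
                            const SystemConstants& current) {
  uint32_t dirty_u32 = 0;
  dirty_u32 |= cached.viewport_scale ^ current.viewport_scale;
  dirty_u32 |= cached.viewport_offset ^ current.viewport_offset;
  dirty_u32 |= cached.alpha_test_ref ^ current.alpha_test_ref;
  cached = current;
  return dirty_u32 != 0;  // Equivalent to dirty |= (x != y) per field.
}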
2022-09-11 21:14:48 +00:00
|
|
|
global_unique_lock_type global_lock_locked_once, uint32_t virtual_address,
|
|
|
|
uint32_t length, bool is_write, bool unwatch_exact_range, bool unprotect) {
|
2019-07-30 05:00:20 +00:00
|
|
|
// TODO(Triang3l): Support read watches.
|
|
|
|
assert_true(is_write);
|
|
|
|
if (!is_write) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (virtual_address < heap_base_) {
|
|
|
|
if (heap_base_ - virtual_address >= length) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
length -= heap_base_ - virtual_address;
|
|
|
|
virtual_address = heap_base_;
|
|
|
|
}
|
|
|
|
uint32_t heap_relative_address = virtual_address - heap_base_;
|
2020-02-22 11:55:28 +00:00
|
|
|
if (heap_relative_address >= heap_size_) {
|
2019-07-30 05:00:20 +00:00
|
|
|
return false;
|
|
|
|
}
|
2020-02-22 11:55:28 +00:00
|
|
|
length = std::min(length, heap_size_ - heap_relative_address);
|
2019-07-30 05:00:20 +00:00
|
|
|
if (length == 0) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint32_t system_page_first =
|
2022-09-17 11:04:53 +00:00
|
|
|
(heap_relative_address + host_address_offset()) >> system_page_shift_;
|
2019-07-30 05:00:20 +00:00
|
|
|
uint32_t system_page_last =
|
2022-09-17 11:04:53 +00:00
|
|
|
(heap_relative_address + length - 1 + host_address_offset()) >>
|
|
|
|
system_page_shift_;
|
2019-07-30 05:00:20 +00:00
|
|
|
system_page_last = std::min(system_page_last, system_page_count_ - 1);
|
|
|
|
assert_true(system_page_first <= system_page_last);
|
|
|
|
uint32_t block_index_first = system_page_first >> 6;
|
|
|
|
uint32_t block_index_last = system_page_last >> 6;
|
|
|
|
|
|
|
|
// Check if watching any page, whether need to call the callback at all.
|
|
|
|
bool any_watched = false;
|
|
|
|
for (uint32_t i = block_index_first; i <= block_index_last; ++i) {
|
2020-02-15 18:35:24 +00:00
|
|
|
uint64_t block = system_page_flags_[i].notify_on_invalidation;
|
2019-07-30 05:00:20 +00:00
|
|
|
if (i == block_index_first) {
|
|
|
|
block &= ~((uint64_t(1) << (system_page_first & 63)) - 1);
|
|
|
|
}
|
|
|
|
if (i == block_index_last && (system_page_last & 63) != 63) {
|
|
|
|
block &= (uint64_t(1) << ((system_page_last & 63) + 1)) - 1;
|
|
|
|
}
|
|
|
|
if (block) {
|
|
|
|
any_watched = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!any_watched) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Trigger callbacks.
|
2019-07-30 21:18:12 +00:00
|
|
|
if (!unprotect) {
|
|
|
|
// If not doing anything with protection, no point in unwatching excess
|
|
|
|
// pages.
|
|
|
|
unwatch_exact_range = true;
|
|
|
|
}
|
2019-07-30 05:00:20 +00:00
|
|
|
uint32_t physical_address_offset = GetPhysicalAddress(heap_base_);
|
|
|
|
uint32_t physical_address_start =
|
2022-09-17 11:04:53 +00:00
|
|
|
xe::sat_sub(system_page_first << system_page_shift_,
|
2019-08-04 20:55:54 +00:00
|
|
|
host_address_offset()) +
|
2019-07-30 05:00:20 +00:00
|
|
|
physical_address_offset;
|
|
|
|
uint32_t physical_length = std::min(
|
2022-09-17 11:04:53 +00:00
|
|
|
xe::sat_sub((system_page_last << system_page_shift_) + system_page_size_,
|
2019-08-04 20:55:54 +00:00
|
|
|
host_address_offset()) +
|
2019-07-30 05:00:20 +00:00
|
|
|
physical_address_offset - physical_address_start,
|
2020-02-22 11:55:28 +00:00
|
|
|
heap_size_ - (physical_address_start - physical_address_offset));
|
2019-07-30 21:18:12 +00:00
|
|
|
uint32_t unwatch_first = 0;
|
|
|
|
uint32_t unwatch_last = UINT32_MAX;
|
2020-02-15 18:35:24 +00:00
|
|
|
for (auto invalidation_callback :
|
|
|
|
memory_->physical_memory_invalidation_callbacks_) {
|
2019-07-30 21:18:12 +00:00
|
|
|
std::pair<uint32_t, uint32_t> callback_unwatch_range =
|
2020-02-15 18:35:24 +00:00
|
|
|
invalidation_callback->first(invalidation_callback->second,
|
|
|
|
physical_address_start, physical_length,
|
|
|
|
unwatch_exact_range);
|
2019-07-30 21:18:12 +00:00
|
|
|
if (!unwatch_exact_range) {
|
|
|
|
unwatch_first = std::max(unwatch_first, callback_unwatch_range.first);
|
|
|
|
unwatch_last = std::min(
|
|
|
|
unwatch_last,
|
|
|
|
xe::sat_add(
|
|
|
|
callback_unwatch_range.first,
|
|
|
|
std::max(callback_unwatch_range.second, uint32_t(1)) - 1));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!unwatch_exact_range) {
|
|
|
|
// Always unwatch at least the requested pages.
|
|
|
|
unwatch_first = std::min(unwatch_first, physical_address_start);
|
|
|
|
unwatch_last =
|
|
|
|
std::max(unwatch_last, physical_address_start + physical_length - 1);
|
|
|
|
// Don't unprotect too much if not caring much about the region (limit to
|
|
|
|
// 4 MB - somewhat random, but max 1024 iterations of the page loop).
|
|
|
|
const uint32_t kMaxUnwatchExcess = 4 * 1024 * 1024;
|
|
|
|
unwatch_first = std::max(unwatch_first,
|
|
|
|
physical_address_start & ~(kMaxUnwatchExcess - 1));
|
|
|
|
unwatch_last =
|
|
|
|
std::min(unwatch_last, (physical_address_start + physical_length - 1) |
|
|
|
|
(kMaxUnwatchExcess - 1));
|
|
|
|
// Convert to heap-relative addresses.
|
|
|
|
unwatch_first = xe::sat_sub(unwatch_first, physical_address_offset);
|
|
|
|
unwatch_last = xe::sat_sub(unwatch_last, physical_address_offset);
|
|
|
|
// Clamp to the heap upper bound.
|
2020-02-22 11:55:28 +00:00
|
|
|
unwatch_first = std::min(unwatch_first, heap_size_ - 1);
|
|
|
|
unwatch_last = std::min(unwatch_last, heap_size_ - 1);
|
2019-07-30 21:18:12 +00:00
|
|
|
// Convert to system pages and update the range.
|
2019-08-04 20:55:54 +00:00
|
|
|
unwatch_first += host_address_offset();
|
|
|
|
unwatch_last += host_address_offset();
|
2019-07-30 21:18:12 +00:00
|
|
|
assert_true(unwatch_first <= unwatch_last);
|
2022-09-17 11:04:53 +00:00
|
|
|
system_page_first = unwatch_first >> system_page_shift_;
|
|
|
|
system_page_last = unwatch_last >> system_page_shift_;
|
2019-07-30 21:18:12 +00:00
|
|
|
block_index_first = system_page_first >> 6;
|
|
|
|
block_index_last = system_page_last >> 6;
|
2019-07-30 05:00:20 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Unprotect ranges that need unprotection.
|
|
|
|
if (unprotect) {
|
2019-08-16 05:49:48 +00:00
|
|
|
uint8_t* protect_base = membase_ + heap_base_;
|
2019-07-30 05:00:20 +00:00
|
|
|
uint32_t unprotect_system_page_first = UINT32_MAX;
|
|
|
|
for (uint32_t i = system_page_first; i <= system_page_last; ++i) {
|
|
|
|
// Check if need to allow writing to this page.
|
2020-02-15 18:35:24 +00:00
|
|
|
bool unprotect_page = (system_page_flags_[i >> 6].notify_on_invalidation &
|
2019-07-30 05:00:20 +00:00
|
|
|
(uint64_t(1) << (i & 63))) != 0;
|
|
|
|
if (unprotect_page) {
|
2020-02-15 18:35:24 +00:00
|
|
|
uint32_t guest_page_number =
|
2022-09-17 11:04:53 +00:00
|
|
|
xe::sat_sub(i << system_page_shift_, host_address_offset()) >>
|
|
|
|
page_size_shift_;
|
2020-02-15 18:35:24 +00:00
|
|
|
if (ToPageAccess(page_table_[guest_page_number].current_protect) !=
|
2019-07-30 05:00:20 +00:00
|
|
|
xe::memory::PageAccess::kReadWrite) {
|
|
|
|
unprotect_page = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (unprotect_page) {
|
|
|
|
if (unprotect_system_page_first == UINT32_MAX) {
|
|
|
|
unprotect_system_page_first = i;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (unprotect_system_page_first != UINT32_MAX) {
|
|
|
|
xe::memory::Protect(
|
2022-09-17 11:04:53 +00:00
|
|
|
protect_base +
|
|
|
|
(unprotect_system_page_first << system_page_shift_),
|
|
|
|
(i - unprotect_system_page_first) << system_page_shift_,
|
2019-07-30 05:00:20 +00:00
|
|
|
xe::memory::PageAccess::kReadWrite);
|
|
|
|
unprotect_system_page_first = UINT32_MAX;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (unprotect_system_page_first != UINT32_MAX) {
|
|
|
|
xe::memory::Protect(
|
2022-09-17 11:04:53 +00:00
|
|
|
protect_base + (unprotect_system_page_first << system_page_shift_),
|
|
|
|
(system_page_last + 1 - unprotect_system_page_first)
|
|
|
|
<< system_page_shift_,
|
2019-07-30 05:00:20 +00:00
|
|
|
xe::memory::PageAccess::kReadWrite);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Mark pages as not write-watched.
|
|
|
|
for (uint32_t i = block_index_first; i <= block_index_last; ++i) {
|
|
|
|
uint64_t mask = 0;
|
|
|
|
if (i == block_index_first) {
|
|
|
|
mask |= (uint64_t(1) << (system_page_first & 63)) - 1;
|
|
|
|
}
|
|
|
|
if (i == block_index_last && (system_page_last & 63) != 63) {
|
|
|
|
mask |= ~((uint64_t(1) << ((system_page_last & 63) + 1)) - 1);
|
|
|
|
}
|
2020-02-15 18:35:24 +00:00
|
|
|
system_page_flags_[i].notify_on_invalidation &= mask;
|
2019-07-30 05:00:20 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
2018-09-24 20:18:16 +00:00
|
|
|
}
|
|
|
|
|
2019-08-24 14:40:59 +00:00
|
|
|
uint32_t PhysicalHeap::GetPhysicalAddress(uint32_t address) const {
|
|
|
|
assert_true(address >= heap_base_);
|
|
|
|
address -= heap_base_;
|
2020-02-22 11:55:28 +00:00
|
|
|
assert_true(address < heap_size_);
|
2019-08-24 14:40:59 +00:00
|
|
|
if (heap_base_ >= 0xE0000000) {
|
|
|
|
address += 0x1000;
|
|
|
|
}
|
|
|
|
return address;
|
|
|
|
}
|
|
|
|
|
2015-05-16 07:23:13 +00:00
|
|
|
} // namespace xe
|