JitArm64: Increase farcode & nearcode cache size

This is a JitArm64 version of 219610d8a0.

Because a single AArch64 branch (B) instruction can't jump farther than
+/- 128 MiB, going above the former limit of 128 MiB of code (nearcode
and farcode combined) requires a bit of restructuring. With the
restructuring in place, the limit is now 256 MiB. See the new large
comment in Jit.h for a description of the new memory layout.
This commit is contained in:
JosJuice 2024-03-24 11:49:47 +01:00
parent b6f0e8876e
commit e8154a529f
3 changed files with 148 additions and 47 deletions

View File

@ -82,6 +82,10 @@ public:
}
// Returns true if ptr lies in the half-open range [region, region + region_size).
bool IsInSpace(const u8* ptr) const { return ptr >= region && ptr < (region + region_size); }
// Returns true if ptr lies in the half-open range [region, region + total_region_size).
// Unlike IsInSpace, the upper bound uses total_region_size rather than region_size, so the
// range also covers bytes that AllocChildCodeSpace has carved off the end of this region.
bool IsInSpaceOrChildSpace(const u8* ptr) const
{
return ptr >= region && ptr < (region + total_region_size);
}
void WriteProtect(bool allow_execute)
{
Common::WriteProtectMemory(region, region_size, allow_execute);
@ -106,7 +110,7 @@ public:
// Returns true when region_size differs from total_region_size. AllocChildCodeSpace subtracts
// each child's size from region_size, so the two differ once a child has been allocated.
bool HasChildren() const { return region_size != total_region_size; }
u8* AllocChildCodeSpace(size_t child_size)
{
ASSERT_MSG(DYNA_REC, child_size < GetSpaceLeft(), "Insufficient space for child allocation.");
ASSERT_MSG(DYNA_REC, child_size <= GetSpaceLeft(), "Insufficient space for child allocation.");
u8* child_region = region + region_size - child_size;
region_size -= child_size;
ResetCodePtr();

View File

@ -4,6 +4,7 @@
#include "Core/PowerPC/JitArm64/Jit.h"
#include <cstdio>
#include <optional>
#include "Common/Arm64Emitter.h"
#include "Common/CommonTypes.h"
@ -29,13 +30,13 @@
using namespace Arm64Gen;
constexpr size_t CODE_SIZE = 1024 * 1024 * 32;
constexpr size_t NEAR_CODE_SIZE = 1024 * 1024 * 64;
// We use a bigger farcode size for JitArm64 than Jit64, because JitArm64 always emits farcode
// for the slow path of each loadstore instruction. Jit64 postpones emitting farcode until the
// farcode actually is needed, saving it from having to emit farcode for most instructions.
// TODO: Perhaps implement something similar to Jit64. But using more RAM isn't much of a problem.
constexpr size_t FARCODE_SIZE = 1024 * 1024 * 64;
constexpr size_t FARCODE_SIZE_MMU = 1024 * 1024 * 64;
// Size in bytes of one far code region (64 MiB).
constexpr size_t FAR_CODE_SIZE = 1024 * 1024 * 64;
// Size of the whole allocation: two near regions plus two far regions,
// i.e. 2 * 64 MiB + 2 * 64 MiB = 256 MiB.
constexpr size_t TOTAL_CODE_SIZE = NEAR_CODE_SIZE * 2 + FAR_CODE_SIZE * 2;
JitArm64::JitArm64(Core::System& system) : JitBase(system), m_float_emit(this)
{
@ -49,9 +50,18 @@ void JitArm64::Init()
RefreshConfig();
const size_t child_code_size = jo.memcheck ? FARCODE_SIZE_MMU : FARCODE_SIZE;
AllocCodeSpace(CODE_SIZE + child_code_size);
AddChildCodeSpace(&m_far_code, child_code_size);
// We want the regions to be laid out in this order in memory:
// m_far_code_0, m_near_code_0, m_near_code_1, m_far_code_1.
// AddChildCodeSpace grabs space from the end of the parent region,
// so we have to call AddChildCodeSpace in reverse order.
AllocCodeSpace(TOTAL_CODE_SIZE);
AddChildCodeSpace(&m_far_code_1, FAR_CODE_SIZE);
AddChildCodeSpace(&m_near_code_1, NEAR_CODE_SIZE);
AddChildCodeSpace(&m_near_code_0, NEAR_CODE_SIZE);
AddChildCodeSpace(&m_far_code_0, FAR_CODE_SIZE);
ASSERT(m_far_code_0.GetCodeEnd() == m_near_code_0.GetCodePtr());
ASSERT(m_near_code_0.GetCodeEnd() == m_near_code_1.GetCodePtr());
ASSERT(m_near_code_1.GetCodeEnd() == m_far_code_1.GetCodePtr());
jo.optimizeGatherPipe = true;
SetBlockLinkingEnabled(true);
@ -66,9 +76,7 @@ void JitArm64::Init()
InitBLROptimization();
GenerateAsm();
ResetFreeMemoryRanges();
GenerateAsmAndResetFreeMemoryRanges();
}
void JitArm64::SetBlockLinkingEnabled(bool enabled)
@ -113,7 +121,7 @@ bool JitArm64::HandleFault(uintptr_t access_address, SContext* ctx)
success = HandleStackFault();
// If the fault is in JIT code space, look for fastmem areas.
if (!success && IsInSpace(reinterpret_cast<u8*>(ctx->CTX_PC)))
if (!success && IsInSpaceOrChildSpace(reinterpret_cast<u8*>(ctx->CTX_PC)))
{
auto& memory = m_system.GetMemory();
if (memory.IsAddressInFastmemArea(reinterpret_cast<u8*>(access_address)))
@ -153,22 +161,47 @@ void JitArm64::ClearCache()
blocks.Clear();
blocks.ClearRangesToFree();
const Common::ScopedJITPageWriteAndNoExecute enable_jit_page_writes;
ClearCodeSpace();
m_far_code.ClearCodeSpace();
m_far_code_0.ClearCodeSpace();
m_near_code_0.ClearCodeSpace();
m_near_code_1.ClearCodeSpace();
m_far_code_1.ClearCodeSpace();
RefreshConfig();
GenerateAsmAndResetFreeMemoryRanges();
}
// Points the near emitter (this) and the far emitter (m_far_code) at m_near_code_1 and
// m_far_code_1 respectively, runs GenerateAsm(), and then rebuilds the free-range sets,
// telling ResetFreeMemoryRanges how many bytes GenerateAsm() emitted through each emitter.
void JitArm64::GenerateAsmAndResetFreeMemoryRanges()
{
// Emit into region 1: near code goes to m_near_code_1, far code to m_far_code_1.
SetCodePtr(m_near_code_1.GetWritableCodePtr(), m_near_code_1.GetWritableCodeEnd());
m_far_code.SetCodePtr(m_far_code_1.GetWritableCodePtr(), m_far_code_1.GetWritableCodeEnd());
// Remember where emission starts so the emitted sizes can be computed afterwards.
const u8* routines_near_start = GetCodePtr();
const u8* routines_far_start = m_far_code.GetCodePtr();
GenerateAsm();
ResetFreeMemoryRanges();
// The emitters' code pointers have advanced past whatever GenerateAsm() wrote.
const u8* routines_near_end = GetCodePtr();
const u8* routines_far_end = m_far_code.GetCodePtr();
// Pass the near and far byte counts so those bytes are excluded from the free ranges.
ResetFreeMemoryRanges(routines_near_end - routines_near_start,
routines_far_end - routines_far_start);
}
// Clears the free-range sets and re-inserts one range per code region, skipping the leading
// bytes given by the size arguments.
//
// routines_near_size: number of bytes to skip at the start of m_near_code_1 (and, see below,
//                     at the start of m_far_code_0).
// routines_far_size:  number of bytes to skip at the start of m_far_code_1.
void JitArm64::ResetFreeMemoryRanges()
void JitArm64::ResetFreeMemoryRanges(size_t routines_near_size, size_t routines_far_size)
{
// Set the near and far code regions as unused.
m_free_ranges_near.clear();
m_free_ranges_near.insert(GetWritableCodePtr(), GetWritableCodeEnd());
m_free_ranges_far.clear();
m_free_ranges_far.insert(m_far_code.GetWritableCodePtr(), m_far_code.GetWritableCodeEnd());
// The offset here is routines_near_size, not routines_far_size. This matches the layout
// comment in Jit.h: m_far_code_0 starts with as many unused bytes as the asm routines occupy
// at the start of m_near_code_1. Presumably this keeps all of m_far_code_0 within branch
// range of those routines - confirm before changing.
m_free_ranges_far_0.clear();
m_free_ranges_far_0.insert(m_far_code_0.GetWritableCodePtr() + routines_near_size,
m_far_code_0.GetWritableCodeEnd());
// m_near_code_0 is inserted in full, with no leading bytes skipped.
m_free_ranges_near_0.clear();
m_free_ranges_near_0.insert(m_near_code_0.GetWritableCodePtr(),
m_near_code_0.GetWritableCodeEnd());
// Skip the bytes emitted through the near emitter at the start of m_near_code_1.
m_free_ranges_near_1.clear();
m_free_ranges_near_1.insert(m_near_code_1.GetWritableCodePtr() + routines_near_size,
m_near_code_1.GetWritableCodeEnd());
// Skip the bytes emitted through the far emitter at the start of m_far_code_1.
m_free_ranges_far_1.clear();
m_free_ranges_far_1.insert(m_far_code_1.GetWritableCodePtr() + routines_far_size,
m_far_code_1.GetWritableCodeEnd());
}
void JitArm64::Shutdown()
@ -889,11 +922,17 @@ void JitArm64::Jit(u32 em_address, bool clear_cache_and_retry_on_failure)
++last_fastmem_area;
m_fault_to_handler.erase(first_fastmem_area, last_fastmem_area);
m_free_ranges_near.insert(range.first, range.second);
if (range.first < m_near_code_0.GetCodeEnd())
m_free_ranges_near_0.insert(range.first, range.second);
else
m_free_ranges_near_1.insert(range.first, range.second);
}
for (auto range : blocks.GetRangesToFreeFar())
{
m_free_ranges_far.insert(range.first, range.second);
if (range.first < m_far_code_0.GetCodeEnd())
m_free_ranges_far_0.insert(range.first, range.second);
else
m_free_ranges_far_1.insert(range.first, range.second);
}
blocks.ClearRangesToFree();
@ -939,7 +978,7 @@ void JitArm64::Jit(u32 em_address, bool clear_cache_and_retry_on_failure)
return;
}
if (SetEmitterStateToFreeCodeRegion())
if (std::optional<size_t> code_region_index = SetEmitterStateToFreeCodeRegion())
{
u8* near_start = GetWritableCodePtr();
u8* far_start = m_far_code.GetWritableCodePtr();
@ -952,10 +991,16 @@ void JitArm64::Jit(u32 em_address, bool clear_cache_and_retry_on_failure)
// Mark the memory regions that this code block uses as used in the local rangesets.
u8* near_end = GetWritableCodePtr();
if (near_start != near_end)
m_free_ranges_near.erase(near_start, near_end);
{
(code_region_index == 0 ? m_free_ranges_near_0 : m_free_ranges_near_1)
.erase(near_start, near_end);
}
u8* far_end = m_far_code.GetWritableCodePtr();
if (far_start != far_end)
m_free_ranges_far.erase(far_start, far_end);
{
(code_region_index == 0 ? m_free_ranges_far_0 : m_free_ranges_far_1)
.erase(far_start, far_end);
}
// Store the used memory regions in the block so we know what to mark as unused when the
// block gets invalidated.
@ -984,27 +1029,52 @@ void JitArm64::Jit(u32 em_address, bool clear_cache_and_retry_on_failure)
exit(-1);
}
// Chooses a code region (0 or 1), points the near emitter (this) and the far emitter
// (m_far_code) at the first by-size entry of that region's free-range sets, and returns the
// chosen region index. Returns std::nullopt when both regions have an empty near or far range.
//
// Selection order:
//  1. Region 1, if both its near and far candidates are at least 1 MiB.
//  2. Otherwise, whichever region has the larger "smaller of near/far" candidate size
//     (region 1 wins ties).
bool JitArm64::SetEmitterStateToFreeCodeRegion()
std::optional<size_t> JitArm64::SetEmitterStateToFreeCodeRegion()
{
// Find the largest free memory blocks and set code emitters to point at them.
// If we can't find a free block return false instead, which will trigger a JIT cache clear.
auto free_near = m_free_ranges_near.by_size_begin();
if (free_near == m_free_ranges_near.by_size_end())
{
WARN_LOG_FMT(DYNA_REC, "Failed to find free memory region in near code region.");
return false;
}
SetCodePtr(free_near.from(), free_near.to());
// Find some large free memory blocks and set code emitters to point at them. If we can't find
// free blocks, return std::nullopt instead, which will trigger a JIT cache clear.
// NOTE(review): these iterators are read through from()/to() below without being compared
// against by_size_end(). This presumably relies on RangeSizeSet yielding a zero-length range
// for an end iterator - confirm against the rangeset implementation.
const auto free_near_0 = m_free_ranges_near_0.by_size_begin();
const auto free_near_1 = m_free_ranges_near_1.by_size_begin();
const auto free_far_0 = m_free_ranges_far_0.by_size_begin();
const auto free_far_1 = m_free_ranges_far_1.by_size_begin();
auto free_far = m_free_ranges_far.by_size_begin();
if (free_far == m_free_ranges_far.by_size_end())
{
WARN_LOG_FMT(DYNA_REC, "Failed to find free memory region in far code region.");
return false;
}
m_far_code.SetCodePtr(free_far.from(), free_far.to());
// A region is only as usable as the smaller of its near and far candidates.
const size_t free_near_1_size = free_near_1.to() - free_near_1.from();
const size_t free_far_1_size = free_far_1.to() - free_far_1.from();
const size_t free_1_smallest_size = std::min(free_near_1_size, free_far_1_size);
return true;
// 1024 * 1024 bytes = 1 MiB threshold for preferring region 1.
if (free_1_smallest_size >= 1024 * 1024)
{
// Don't use region 0 unless region 1 is getting full. This improves cache friendliness.
SetCodePtr(free_near_1.from(), free_near_1.to());
m_far_code.SetCodePtr(free_far_1.from(), free_far_1.to());
return std::make_optional(1);
}
const size_t free_near_0_size = free_near_0.to() - free_near_0.from();
const size_t free_far_0_size = free_far_0.to() - free_far_0.from();
const size_t free_0_smallest_size = std::min(free_near_0_size, free_far_0_size);
// Neither region has both a near and a far candidate: report failure to the caller.
if (free_0_smallest_size == 0 && free_1_smallest_size == 0)
{
// NOTE(review): no warning is logged when the empty sets are split across regions
// (e.g. near 0 and far 1 are empty but near 1 and far 0 are not).
if (free_near_0_size == 0 && free_near_1_size == 0)
WARN_LOG_FMT(DYNA_REC, "Failed to find free memory region in near code regions.");
else if (free_far_0_size == 0 && free_far_1_size == 0)
WARN_LOG_FMT(DYNA_REC, "Failed to find free memory region in far code regions.");
return std::nullopt;
}
// Region 1 is below the 1 MiB threshold here; use whichever region offers more room.
if (free_0_smallest_size > free_1_smallest_size)
{
SetCodePtr(free_near_0.from(), free_near_0.to());
m_far_code.SetCodePtr(free_far_0.from(), free_far_0.to());
return std::make_optional(0);
}
else
{
SetCodePtr(free_near_1.from(), free_near_1.to());
m_far_code.SetCodePtr(free_far_1.from(), free_far_1.to());
return std::make_optional(1);
}
}
bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)

View File

@ -5,6 +5,7 @@
#include <cstddef>
#include <map>
#include <optional>
#include <tuple>
#include <rangeset/rangesizeset.h>
@ -285,14 +286,16 @@ protected:
void Trace();
// Finds a free memory region and sets the near and far code emitters to point at that region.
// Returns false if no free memory region can be found for either of the two.
bool SetEmitterStateToFreeCodeRegion();
// On success, returns the index of the memory region (either 0 or 1).
// If either near code or far code is full, returns std::nullopt.
std::optional<size_t> SetEmitterStateToFreeCodeRegion();
void DoDownCount();
void Cleanup();
void ResetStack();
void ResetFreeMemoryRanges();
// Runs GenerateAsm() with the emitters pointed at m_near_code_1 / m_far_code_1, then calls
// ResetFreeMemoryRanges with the number of bytes that was emitted through each emitter.
void GenerateAsmAndResetFreeMemoryRanges();
// Rebuilds the per-region free-range sets, leaving out routines_near_size / routines_far_size
// bytes at the start of the regions as described in the memory layout comment further down.
void ResetFreeMemoryRanges(size_t routines_near_size, size_t routines_far_size);
void IntializeSpeculativeConstants();
@ -372,6 +375,28 @@ protected:
Arm64Gen::ARM64FloatEmitter m_float_emit;
// Because B instructions can't jump farther than +/- 128 MiB, code memory is allocated like this:
//
// m_far_code_0: x MiB of unused space, followed by 64 - x MiB of far code
// m_near_code_0: 64 MiB of near code
// m_near_code_1: x MiB of asm routines, followed by 64 - x MiB of near code
// m_far_code_1: 64 MiB of far code
//
// This ensures that:
//
// * Any code in m_near_code_0 can reach any code in m_far_code_0, and vice versa
// * Any code in m_near_code_1 can reach any code in m_far_code_1, and vice versa
// * Any near code can reach any near code
// * Any code can reach any asm routine
//
// m_far_code_0 and m_far_code_1 can't reach each other, but that isn't needed, because all blocks
// have their entry points in near code.
Arm64Gen::ARM64CodeBlock m_near_code_0;
Arm64Gen::ARM64CodeBlock m_near_code_1;
Arm64Gen::ARM64CodeBlock m_far_code_0;
Arm64Gen::ARM64CodeBlock m_far_code_1;
// The far code emitter. Its code pointer is set (via SetCodePtr) to a range inside either
// m_far_code_0 or m_far_code_1, depending on which region is being emitted to.
Arm64Gen::ARM64CodeBlock m_far_code;
bool m_in_far_code = false;
@ -380,6 +405,8 @@ protected:
u8* m_near_code_end = nullptr;
bool m_near_code_write_failed = false;
HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_near;
HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_far;
HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_near_0;
HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_near_1;
HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_far_0;
HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_far_1;
};