Implement multi-level page tables.

Define L1_MAP_ADDR_SPACE_BITS to be either the virtual address size
(in user mode) or physical address size (in system mode), and use
that to size l1_map.  This rewrites page_find_alloc, page_flush_tb,
and walk_memory_regions.

Use TARGET_PHYS_ADDR_SPACE_BITS for the physical memory map based
on l1_phys_map.  This rewrites phys_page_find_alloc and
phys_page_for_each.
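
The scheme is a conventional radix-tree split: each interior level consumes
L2_BITS of the page index and is allocated on demand, while any remainder
bits are folded into the statically sized top level.  The sketch below is an
editorial illustration only (the names LEVEL_BITS, LEVELS, Leaf, root and
lookup_alloc are invented for the example and do not appear in the patch):

/* Editorial sketch, not part of this commit: a minimal multi-level
 * (radix-tree) page lookup in the same style as the patch. */
#include <stdlib.h>

#define LEVEL_BITS 10
#define LEVEL_SIZE (1 << LEVEL_BITS)
#define LEVELS     3              /* e.g. a 30-bit page index split 10/10/10 */

typedef struct { int flags; } Leaf;   /* stands in for PageDesc */

static void *root[LEVEL_SIZE];        /* top level, statically sized */

static Leaf *lookup_alloc(unsigned long index, int alloc)
{
    /* Level 1: always present, indexed by the top LEVEL_BITS of the index. */
    void **lp = &root[(index >> ((LEVELS - 1) * LEVEL_BITS)) & (LEVEL_SIZE - 1)];
    int i;

    /* Levels 2..N-1: arrays of pointers, allocated on demand. */
    for (i = LEVELS - 2; i > 0; i--) {
        void **p = *lp;
        if (p == NULL) {
            if (!alloc) {
                return NULL;
            }
            *lp = p = calloc(LEVEL_SIZE, sizeof(void *));
        }
        lp = &p[(index >> (i * LEVEL_BITS)) & (LEVEL_SIZE - 1)];
    }

    /* Bottom level: a block of leaf descriptors. */
    if (*lp == NULL) {
        if (!alloc) {
            return NULL;
        }
        *lp = calloc(LEVEL_SIZE, sizeof(Leaf));
    }
    return (Leaf *)*lp + (index & (LEVEL_SIZE - 1));
}

With LEVEL_BITS = 10 this mirrors the patch's L2_BITS = 10; the real
page_find_alloc additionally folds remainder bits into the L1 table (the
*_BITS_REM macros) and, in user mode, queues page_set_flags() calls for the
freshly mmap'd tables.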

Signed-off-by: Richard Henderson <rth@twiddle.net>
Author:    Richard Henderson <rth@twiddle.net>, 2010-03-10 15:53:37 -08:00
Committer: Paul Brook
Commit:    5cd2c5b6ad (parent 14f24e1465)
2 changed files with 280 additions and 182 deletions


@@ -745,8 +745,11 @@ extern unsigned long qemu_host_page_mask;
 #define PAGE_RESERVED  0x0020
 
 void page_dump(FILE *f);
-int walk_memory_regions(void *,
-    int (*fn)(void *, unsigned long, unsigned long, unsigned long));
+
+typedef int (*walk_memory_regions_fn)(void *, unsigned long,
+                                      unsigned long, unsigned long);
+int walk_memory_regions(void *, walk_memory_regions_fn);
+
 int page_get_flags(target_ulong address);
 void page_set_flags(target_ulong start, target_ulong end, int flags);
 int page_check_range(target_ulong start, target_ulong len, int flags);

exec.c

@@ -141,30 +141,56 @@ typedef struct PhysPageDesc {
     ram_addr_t region_offset;
 } PhysPageDesc;
 
-#define L2_BITS 10
-#if defined(CONFIG_USER_ONLY) && defined(TARGET_VIRT_ADDR_SPACE_BITS)
-/* XXX: this is a temporary hack for alpha target.
- *      In the future, this is to be replaced by a multi-level table
- *      to actually be able to handle the complete 64 bits address space.
- */
-#define L1_BITS (TARGET_VIRT_ADDR_SPACE_BITS - L2_BITS - TARGET_PAGE_BITS)
+/* In system mode we want L1_MAP to be based on physical addresses,
+   while in user mode we want it to be based on virtual addresses.  */
+#if !defined(CONFIG_USER_ONLY)
+# define L1_MAP_ADDR_SPACE_BITS  TARGET_PHYS_ADDR_SPACE_BITS
 #else
-#define L1_BITS (32 - L2_BITS - TARGET_PAGE_BITS)
+# define L1_MAP_ADDR_SPACE_BITS  TARGET_VIRT_ADDR_SPACE_BITS
 #endif
 
-#define L1_SIZE (1 << L1_BITS)
+/* Size of the L2 (and L3, etc) page tables.  */
+#define L2_BITS 10
 #define L2_SIZE (1 << L2_BITS)
 
+/* The bits remaining after N lower levels of page tables.  */
+#define P_L1_BITS_REM \
+    ((TARGET_PHYS_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % L2_BITS)
+#define V_L1_BITS_REM \
+    ((L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % L2_BITS)
+
+/* Size of the L1 page table.  Avoid silly small sizes.  */
+#if P_L1_BITS_REM < 4
+#define P_L1_BITS  (P_L1_BITS_REM + L2_BITS)
+#else
+#define P_L1_BITS  P_L1_BITS_REM
+#endif
+
+#if V_L1_BITS_REM < 4
+#define V_L1_BITS  (V_L1_BITS_REM + L2_BITS)
+#else
+#define V_L1_BITS  V_L1_BITS_REM
+#endif
+
+#define P_L1_SIZE  ((target_phys_addr_t)1 << P_L1_BITS)
+#define V_L1_SIZE  ((target_ulong)1 << V_L1_BITS)
+
+#define P_L1_SHIFT (TARGET_PHYS_ADDR_SPACE_BITS - TARGET_PAGE_BITS - P_L1_BITS)
+#define V_L1_SHIFT (L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS - V_L1_BITS)
+
 unsigned long qemu_real_host_page_size;
 unsigned long qemu_host_page_bits;
 unsigned long qemu_host_page_size;
 unsigned long qemu_host_page_mask;
 
-/* XXX: for system emulation, it could just be an array */
-static PageDesc *l1_map[L1_SIZE];
+/* This is a multi-level map on the virtual address space.
+   The bottom level has pointers to PageDesc.  */
+static void *l1_map[V_L1_SIZE];
 
 #if !defined(CONFIG_USER_ONLY)
-static PhysPageDesc **l1_phys_map;
+/* This is a multi-level map on the physical address space.
+   The bottom level has pointers to PhysPageDesc.  */
+static void *l1_phys_map[P_L1_SIZE];
 
 static void io_mem_init(void);
@@ -239,133 +265,159 @@ static void page_init(void)
     while ((1 << qemu_host_page_bits) < qemu_host_page_size)
         qemu_host_page_bits++;
     qemu_host_page_mask = ~(qemu_host_page_size - 1);
-#if !defined(CONFIG_USER_ONLY)
-    l1_phys_map = qemu_vmalloc(L1_SIZE * sizeof(void *));
-    memset(l1_phys_map, 0, L1_SIZE * sizeof(void *));
-#endif
 
 #if !defined(_WIN32) && defined(CONFIG_USER_ONLY)
     {
-        long long startaddr, endaddr;
         FILE *f;
-        int n;
 
-        mmap_lock();
         last_brk = (unsigned long)sbrk(0);
+
         f = fopen("/proc/self/maps", "r");
         if (f) {
+            mmap_lock();
+
             do {
-                n = fscanf (f, "%llx-%llx %*[^\n]\n", &startaddr, &endaddr);
-                if (n == 2) {
-                    startaddr = MIN(startaddr,
-                        (1ULL << TARGET_PHYS_ADDR_SPACE_BITS) - 1);
-                    endaddr = MIN(endaddr,
-                        (1ULL << TARGET_PHYS_ADDR_SPACE_BITS) - 1);
-                    page_set_flags(startaddr & TARGET_PAGE_MASK,
-                                   TARGET_PAGE_ALIGN(endaddr),
-                                   PAGE_RESERVED);
+                unsigned long startaddr, endaddr;
+                int n;
+
+                n = fscanf (f, "%lx-%lx %*[^\n]\n", &startaddr, &endaddr);
+
+                if (n == 2 && h2g_valid(startaddr)) {
+                    startaddr = h2g(startaddr) & TARGET_PAGE_MASK;
+
+                    if (h2g_valid(endaddr)) {
+                        endaddr = h2g(endaddr);
+                    } else {
+                        endaddr = ~0ul;
+                    }
+                    page_set_flags(startaddr, endaddr, PAGE_RESERVED);
                 }
             } while (!feof(f));
+
             fclose(f);
+            mmap_unlock();
         }
-        mmap_unlock();
     }
 #endif
 }
 
-static inline PageDesc **page_l1_map(target_ulong index)
+static PageDesc *page_find_alloc(target_ulong index, int alloc)
 {
-#if TARGET_LONG_BITS > 32
-    /* Host memory outside guest VM.  For 32-bit targets we have already
-       excluded high addresses.  */
-    if (index > ((target_ulong)L2_SIZE * L1_SIZE))
-        return NULL;
-#endif
-    return &l1_map[index >> L2_BITS];
-}
-
-static inline PageDesc *page_find_alloc(target_ulong index)
-{
-    PageDesc **lp, *p;
-    lp = page_l1_map(index);
-    if (!lp)
-        return NULL;
-
-    p = *lp;
-    if (!p) {
-        /* allocate if not found */
 #if defined(CONFIG_USER_ONLY)
-        size_t len = sizeof(PageDesc) * L2_SIZE;
-        /* Don't use qemu_malloc because it may recurse.  */
-        p = mmap(NULL, len, PROT_READ | PROT_WRITE,
-                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-        *lp = p;
-        if (h2g_valid(p)) {
-            unsigned long addr = h2g(p);
-            page_set_flags(addr & TARGET_PAGE_MASK,
-                           TARGET_PAGE_ALIGN(addr + len),
-                           PAGE_RESERVED);
-        }
+    /* We can't use qemu_malloc because it may recurse into a locked mutex.
+       Neither can we record the new pages we reserve while allocating a
+       given page because that may recurse into an unallocated page table
+       entry.  Stuff the allocations we do make into a queue and process
+       them after having completed one entire page table allocation.  */
+
+    unsigned long reserve[2 * (V_L1_SHIFT / L2_BITS)];
+    int reserve_idx = 0;
+
+# define ALLOC(P, SIZE)                                 \
+    do {                                                \
+        P = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,    \
+                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);   \
+        if (h2g_valid(P)) {                             \
+            reserve[reserve_idx] = h2g(P);              \
+            reserve[reserve_idx + 1] = SIZE;            \
+            reserve_idx += 2;                           \
+        }                                               \
+    } while (0)
 #else
-        p = qemu_mallocz(sizeof(PageDesc) * L2_SIZE);
-        *lp = p;
+# define ALLOC(P, SIZE) \
+    do { P = qemu_mallocz(SIZE); } while (0)
 #endif
+
+    PageDesc *pd;
+    void **lp;
+    int i;
+
+    /* Level 1.  Always allocated.  */
+    lp = l1_map + ((index >> V_L1_SHIFT) & (V_L1_SIZE - 1));
+
+    /* Level 2..N-1.  */
+    for (i = V_L1_SHIFT / L2_BITS - 1; i > 0; i--) {
+        void **p = *lp;
+
+        if (p == NULL) {
+            if (!alloc) {
+                return NULL;
+            }
+            ALLOC(p, sizeof(void *) * L2_SIZE);
+            *lp = p;
+        }
+
+        lp = p + ((index >> (i * L2_BITS)) & (L2_SIZE - 1));
     }
-    return p + (index & (L2_SIZE - 1));
+
+    pd = *lp;
+    if (pd == NULL) {
+        if (!alloc) {
+            return NULL;
+        }
+        ALLOC(pd, sizeof(PageDesc) * L2_SIZE);
+        *lp = pd;
+    }
+
+#undef ALLOC
+#if defined(CONFIG_USER_ONLY)
+    for (i = 0; i < reserve_idx; i += 2) {
+        unsigned long addr = reserve[i];
+        unsigned long len = reserve[i + 1];
+
+        page_set_flags(addr & TARGET_PAGE_MASK,
+                       TARGET_PAGE_ALIGN(addr + len),
+                       PAGE_RESERVED);
+    }
+#endif
+
+    return pd + (index & (L2_SIZE - 1));
 }
 
 static inline PageDesc *page_find(target_ulong index)
 {
-    PageDesc **lp, *p;
-    lp = page_l1_map(index);
-    if (!lp)
-        return NULL;
-
-    p = *lp;
-    if (!p) {
-        return NULL;
-    }
-    return p + (index & (L2_SIZE - 1));
+    return page_find_alloc(index, 0);
 }
 
 #if !defined(CONFIG_USER_ONLY)
 static PhysPageDesc *phys_page_find_alloc(target_phys_addr_t index, int alloc)
 {
-    void **lp, **p;
     PhysPageDesc *pd;
+    void **lp;
+    int i;
 
-    p = (void **)l1_phys_map;
-#if TARGET_PHYS_ADDR_SPACE_BITS > 32
-#if TARGET_PHYS_ADDR_SPACE_BITS > (32 + L1_BITS)
-#error unsupported TARGET_PHYS_ADDR_SPACE_BITS
-#endif
-    lp = p + ((index >> (L1_BITS + L2_BITS)) & (L1_SIZE - 1));
-    p = *lp;
-    if (!p) {
-        /* allocate if not found */
-        if (!alloc)
-            return NULL;
-        p = qemu_vmalloc(sizeof(void *) * L1_SIZE);
-        memset(p, 0, sizeof(void *) * L1_SIZE);
-        *lp = p;
+    /* Level 1.  Always allocated.  */
+    lp = l1_phys_map + ((index >> P_L1_SHIFT) & (P_L1_SIZE - 1));
+
+    /* Level 2..N-1.  */
+    for (i = P_L1_SHIFT / L2_BITS - 1; i > 0; i--) {
+        void **p = *lp;
+        if (p == NULL) {
+            if (!alloc) {
+                return NULL;
+            }
+            *lp = p = qemu_mallocz(sizeof(void *) * L2_SIZE);
+        }
+        lp = p + ((index >> (i * L2_BITS)) & (L2_SIZE - 1));
     }
-#endif
-    lp = p + ((index >> L2_BITS) & (L1_SIZE - 1));
+
     pd = *lp;
-    if (!pd) {
+    if (pd == NULL) {
         int i;
-        /* allocate if not found */
-        if (!alloc)
-            return NULL;
-        pd = qemu_vmalloc(sizeof(PhysPageDesc) * L2_SIZE);
-        *lp = pd;
+
+        if (!alloc) {
+            return NULL;
+        }
+
+        *lp = pd = qemu_malloc(sizeof(PhysPageDesc) * L2_SIZE);
+
         for (i = 0; i < L2_SIZE; i++) {
             pd[i].phys_offset = IO_MEM_UNASSIGNED;
             pd[i].region_offset = (index + i) << TARGET_PAGE_BITS;
         }
     }
-    return ((PhysPageDesc *)pd) + (index & (L2_SIZE - 1));
+
+    return pd + (index & (L2_SIZE - 1));
 }
 
 static inline PhysPageDesc *phys_page_find(target_phys_addr_t index)
@@ -573,21 +625,34 @@ static inline void invalidate_page_bitmap(PageDesc *p)
     p->code_write_count = 0;
 }
 
-/* set to NULL all the 'first_tb' fields in all PageDescs */
+/* Set to NULL all the 'first_tb' fields in all PageDescs. */
+
+static void page_flush_tb_1 (int level, void **lp)
+{
+    int i;
+
+    if (*lp == NULL) {
+        return;
+    }
+    if (level == 0) {
+        PageDesc *pd = *lp;
+        for (i = 0; i < L2_BITS; ++i) {
+            pd[i].first_tb = NULL;
+            invalidate_page_bitmap(pd + i);
+        }
+    } else {
+        void **pp = *lp;
+        for (i = 0; i < L2_BITS; ++i) {
+            page_flush_tb_1 (level - 1, pp + i);
+        }
+    }
+}
+
 static void page_flush_tb(void)
 {
-    int i, j;
-    PageDesc *p;
-
-    for(i = 0; i < L1_SIZE; i++) {
-        p = l1_map[i];
-        if (p) {
-            for(j = 0; j < L2_SIZE; j++) {
-                p->first_tb = NULL;
-                invalidate_page_bitmap(p);
-                p++;
-            }
-        }
+    int i;
+    for (i = 0; i < V_L1_SIZE; i++) {
+        page_flush_tb_1(V_L1_SHIFT / L2_BITS - 1, l1_map + i);
     }
 }
@@ -1081,7 +1146,7 @@ static inline void tb_alloc_page(TranslationBlock *tb,
     TranslationBlock *last_first_tb;
 
     tb->page_addr[n] = page_addr;
-    p = page_find_alloc(page_addr >> TARGET_PAGE_BITS);
+    p = page_find_alloc(page_addr >> TARGET_PAGE_BITS, 1);
     tb->page_next[n] = p->first_tb;
     last_first_tb = p->first_tb;
     p->first_tb = (TranslationBlock *)((long)tb | n);
@@ -1641,50 +1706,37 @@ static int cpu_notify_migration_log(int enable)
     return 0;
 }
 
-static void phys_page_for_each_in_l1_map(PhysPageDesc **phys_map,
-                                         CPUPhysMemoryClient *client)
+static void phys_page_for_each_1(CPUPhysMemoryClient *client,
+                                 int level, void **lp)
 {
-    PhysPageDesc *pd;
-    int l1, l2;
+    int i;
 
-    for (l1 = 0; l1 < L1_SIZE; ++l1) {
-        pd = phys_map[l1];
-        if (!pd) {
-            continue;
-        }
-        for (l2 = 0; l2 < L2_SIZE; ++l2) {
-            if (pd[l2].phys_offset == IO_MEM_UNASSIGNED) {
-                continue;
+    if (*lp == NULL) {
+        return;
+    }
+    if (level == 0) {
+        PhysPageDesc *pd = *lp;
+        for (i = 0; i < L2_BITS; ++i) {
+            if (pd[i].phys_offset != IO_MEM_UNASSIGNED) {
+                client->set_memory(client, pd[i].region_offset,
+                                   TARGET_PAGE_SIZE, pd[i].phys_offset);
             }
-            client->set_memory(client, pd[l2].region_offset,
-                               TARGET_PAGE_SIZE, pd[l2].phys_offset);
+        }
+    } else {
+        void **pp = *lp;
+        for (i = 0; i < L2_BITS; ++i) {
+            phys_page_for_each_1(client, level - 1, pp + i);
         }
     }
 }
 
 static void phys_page_for_each(CPUPhysMemoryClient *client)
 {
-#if TARGET_PHYS_ADDR_SPACE_BITS > 32
-#if TARGET_PHYS_ADDR_SPACE_BITS > (32 + L1_BITS)
-#error unsupported TARGET_PHYS_ADDR_SPACE_BITS
-#endif
-    void **phys_map = (void **)l1_phys_map;
-    int l1;
-    if (!l1_phys_map) {
-        return;
-    }
-    for (l1 = 0; l1 < L1_SIZE; ++l1) {
-        if (phys_map[l1]) {
-            phys_page_for_each_in_l1_map(phys_map[l1], client);
-        }
-    }
-#else
-    if (!l1_phys_map) {
-        return;
-    }
-    phys_page_for_each_in_l1_map(l1_phys_map, client);
-#endif
+    int i;
+    for (i = 0; i < P_L1_SIZE; ++i) {
+        phys_page_for_each_1(client, P_L1_SHIFT / L2_BITS - 1,
+                             l1_phys_map + 1);
+    }
 }
 
 void cpu_register_phys_memory_client(CPUPhysMemoryClient *client)
@@ -2148,44 +2200,87 @@ void tlb_flush_page(CPUState *env, target_ulong addr)
  * Walks guest process memory "regions" one by one
  * and calls callback function 'fn' for each region.
  */
-int walk_memory_regions(void *priv,
-    int (*fn)(void *, unsigned long, unsigned long, unsigned long))
+
+struct walk_memory_regions_data
 {
-    unsigned long start, end;
-    PageDesc *p = NULL;
-    int i, j, prot, prot1;
-    int rc = 0;
+    walk_memory_regions_fn fn;
+    void *priv;
+    unsigned long start;
+    int prot;
+};
 
-    start = end = -1;
-    prot = 0;
+static int walk_memory_regions_end(struct walk_memory_regions_data *data,
+                                   unsigned long end, int new_prot)
+{
+    if (data->start != -1ul) {
+        int rc = data->fn(data->priv, data->start, end, data->prot);
+        if (rc != 0) {
+            return rc;
+        }
+    }
 
-    for (i = 0; i <= L1_SIZE; i++) {
-        p = (i < L1_SIZE) ? l1_map[i] : NULL;
-        for (j = 0; j < L2_SIZE; j++) {
-            prot1 = (p == NULL) ? 0 : p[j].flags;
-            /*
-             * "region" is one continuous chunk of memory
-             * that has same protection flags set.
-             */
-            if (prot1 != prot) {
-                end = (i << (32 - L1_BITS)) | (j << TARGET_PAGE_BITS);
-                if (start != -1) {
-                    rc = (*fn)(priv, start, end, prot);
-                    /* callback can stop iteration by returning != 0 */
-                    if (rc != 0)
-                        return (rc);
-                }
-                if (prot1 != 0)
-                    start = end;
-                else
-                    start = -1;
-                prot = prot1;
-            }
-            if (p == NULL)
-                break;
+    data->start = (new_prot ? end : -1ul);
+    data->prot = new_prot;
+
+    return 0;
+}
+
+static int walk_memory_regions_1(struct walk_memory_regions_data *data,
+                                 unsigned long base, int level, void **lp)
+{
+    unsigned long pa;
+    int i, rc;
+
+    if (*lp == NULL) {
+        return walk_memory_regions_end(data, base, 0);
+    }
+
+    if (level == 0) {
+        PageDesc *pd = *lp;
+        for (i = 0; i < L2_BITS; ++i) {
+            int prot = pd[i].flags;
+
+            pa = base | (i << TARGET_PAGE_BITS);
+            if (prot != data->prot) {
+                rc = walk_memory_regions_end(data, pa, prot);
+                if (rc != 0) {
+                    return rc;
+                }
+            }
+        }
+    } else {
+        void **pp = *lp;
+        for (i = 0; i < L2_BITS; ++i) {
+            pa = base | (i << (TARGET_PAGE_BITS + L2_BITS * level));
+            rc = walk_memory_regions_1(data, pa, level - 1, pp + i);
+            if (rc != 0) {
+                return rc;
+            }
         }
     }
-    return (rc);
+
+    return 0;
+}
+
+int walk_memory_regions(void *priv, walk_memory_regions_fn fn)
+{
+    struct walk_memory_regions_data data;
+    unsigned long i;
+
+    data.fn = fn;
+    data.priv = priv;
+    data.start = -1ul;
+    data.prot = 0;
+
+    for (i = 0; i < V_L1_SIZE; i++) {
+        int rc = walk_memory_regions_1(&data, i << V_L1_SHIFT,
+                                       V_L1_SHIFT / L2_BITS - 1, l1_map + i);
+        if (rc != 0) {
+            return rc;
+        }
+    }
+
+    return walk_memory_regions_end(&data, 0, 0);
 }
 
 static int dump_region(void *priv, unsigned long start,