2003-06-15 19:58:51 +00:00
|
|
|
/*
|
|
|
|
* Host code generation
|
2007-09-16 21:08:06 +00:00
|
|
|
*
|
2003-06-15 19:58:51 +00:00
|
|
|
* Copyright (c) 2003 Fabrice Bellard
|
|
|
|
*
|
|
|
|
* This library is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
|
|
* License as published by the Free Software Foundation; either
|
2019-01-23 14:08:56 +00:00
|
|
|
* version 2.1 of the License, or (at your option) any later version.
|
2003-06-15 19:58:51 +00:00
|
|
|
*
|
|
|
|
* This library is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* Lesser General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Lesser General Public
|
2009-07-16 20:47:01 +00:00
|
|
|
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
|
2003-06-15 19:58:51 +00:00
|
|
|
*/
|
2019-05-23 14:35:05 +00:00
|
|
|
|
2016-01-26 18:16:56 +00:00
|
|
|
#include "qemu/osdep.h"
|
2003-06-15 19:58:51 +00:00
|
|
|
|
2004-01-04 23:28:12 +00:00
|
|
|
#define NO_CPU_IO_DEFS
|
2017-06-02 06:06:45 +00:00
|
|
|
#include "trace.h"
|
2012-10-24 09:12:21 +00:00
|
|
|
#include "disas/disas.h"
|
2016-03-15 12:18:37 +00:00
|
|
|
#include "exec/exec-all.h"
|
2020-01-01 11:23:00 +00:00
|
|
|
#include "tcg/tcg.h"
|
2012-12-02 16:04:43 +00:00
|
|
|
#if defined(CONFIG_USER_ONLY)
|
|
|
|
#include "qemu.h"
|
|
|
|
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
|
|
|
|
#include <sys/param.h>
|
|
|
|
#if __FreeBSD_version >= 700104
|
|
|
|
#define HAVE_KINFO_GETVMMAP
|
|
|
|
#define sigqueue sigqueue_freebsd /* avoid redefinition */
|
|
|
|
#include <sys/proc.h>
|
|
|
|
#include <machine/profile.h>
|
|
|
|
#define _KERNEL
|
|
|
|
#include <sys/user.h>
|
|
|
|
#undef _KERNEL
|
|
|
|
#undef sigqueue
|
|
|
|
#include <libutil.h>
|
|
|
|
#endif
|
|
|
|
#endif
|
2013-04-08 15:29:59 +00:00
|
|
|
#else
|
2018-05-30 09:58:36 +00:00
|
|
|
#include "exec/ram_addr.h"
|
2012-12-02 16:04:43 +00:00
|
|
|
#endif
|
|
|
|
|
2012-12-17 17:19:49 +00:00
|
|
|
#include "exec/cputlb.h"
|
2020-12-16 12:27:58 +00:00
|
|
|
#include "exec/translate-all.h"
|
2022-08-11 20:48:03 +00:00
|
|
|
#include "exec/translator.h"
|
2015-04-22 21:50:52 +00:00
|
|
|
#include "qemu/bitmap.h"
|
2019-04-17 19:17:52 +00:00
|
|
|
#include "qemu/qemu-print.h"
|
2013-04-22 07:42:50 +00:00
|
|
|
#include "qemu/timer.h"
|
tcg: drop global lock during TCG code execution
This finally allows TCG to benefit from the iothread introduction: Drop
the global mutex while running pure TCG CPU code. Reacquire the lock
when entering MMIO or PIO emulation, or when leaving the TCG loop.
We have to revert a few optimizations for the current TCG threading
model, namely kicking the TCG thread in qemu_mutex_lock_iothread and not
kicking it in qemu_cpu_kick. We also need to disable RAM block
reordering until we have a more efficient locking mechanism at hand.
Still, a Linux x86 UP guest and my Musicpal ARM model boot fine here.
These numbers demonstrate where we gain something:
20338 jan 20 0 331m 75m 6904 R 99 0.9 0:50.95 qemu-system-arm
20337 jan 20 0 331m 75m 6904 S 20 0.9 0:26.50 qemu-system-arm
The guest CPU was fully loaded, but the iothread could still run mostly
independent on a second core. Without the patch we don't get beyond
32206 jan 20 0 330m 73m 7036 R 82 0.9 1:06.00 qemu-system-arm
32204 jan 20 0 330m 73m 7036 S 21 0.9 0:17.03 qemu-system-arm
We don't benefit significantly, though, when the guest is not fully
loading a host CPU.
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Message-Id: <1439220437-23957-10-git-send-email-fred.konrad@greensocs.com>
[FK: Rebase, fix qemu_devices_reset deadlock, rm address_space_* mutex]
Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
[EGC: fixed iothread lock for cpu-exec IRQ handling]
Signed-off-by: Emilio G. Cota <cota@braap.org>
[AJB: -smp single-threaded fix, clean commit msg, BQL fixes]
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Reviewed-by: Pranith Kumar <bobby.prani@gmail.com>
[PM: target-arm changes]
Acked-by: Peter Maydell <peter.maydell@linaro.org>
2017-02-23 18:29:11 +00:00
|
|
|
#include "qemu/main-loop.h"
|
2022-02-08 20:08:55 +00:00
|
|
|
#include "qemu/cacheinfo.h"
|
2016-01-07 13:55:28 +00:00
|
|
|
#include "exec/log.h"
|
2017-03-03 11:01:16 +00:00
|
|
|
#include "sysemu/cpus.h"
|
2020-08-19 11:17:19 +00:00
|
|
|
#include "sysemu/cpu-timers.h"
|
2019-05-23 14:35:05 +00:00
|
|
|
#include "sysemu/tcg.h"
|
2020-10-29 03:14:54 +00:00
|
|
|
#include "qapi/error.h"
|
2021-02-13 13:03:13 +00:00
|
|
|
#include "hw/core/tcg-cpu-ops.h"
|
2022-08-15 20:13:05 +00:00
|
|
|
#include "tb-jmp-cache.h"
|
2021-05-24 17:04:53 +00:00
|
|
|
#include "tb-hash.h"
|
|
|
|
#include "tb-context.h"
|
2021-01-21 06:15:06 +00:00
|
|
|
#include "internal.h"
|
2012-12-02 16:04:43 +00:00
|
|
|
|
|
|
|
/* make various TB consistency checks */
|
2017-07-12 19:31:57 +00:00
|
|
|
|
2017-07-27 00:22:51 +00:00
|
|
|
/**
|
|
|
|
* struct page_entry - page descriptor entry
|
|
|
|
* @pd: pointer to the &struct PageDesc of the page this entry represents
|
|
|
|
* @index: page index of the page
|
|
|
|
* @locked: whether the page is locked
|
|
|
|
*
|
|
|
|
* This struct helps us keep track of the locked state of a page, without
|
|
|
|
* bloating &struct PageDesc.
|
|
|
|
*
|
|
|
|
* A page lock protects accesses to all fields of &struct PageDesc.
|
|
|
|
*
|
|
|
|
* See also: &struct page_collection.
|
|
|
|
*/
|
|
|
|
struct page_entry {
|
|
|
|
PageDesc *pd;
|
|
|
|
tb_page_addr_t index;
|
|
|
|
bool locked;
|
|
|
|
};
|
|
|
|
|
|
|
|
/**
|
|
|
|
* struct page_collection - tracks a set of pages (i.e. &struct page_entry's)
|
|
|
|
* @tree: Binary search tree (BST) of the pages, with key == page index
|
|
|
|
* @max: Pointer to the page in @tree with the highest page index
|
|
|
|
*
|
|
|
|
* To avoid deadlock we lock pages in ascending order of page index.
|
|
|
|
* When operating on a set of pages, we need to keep track of them so that
|
|
|
|
* we can lock them in order and also unlock them later. For this we collect
|
|
|
|
* pages (i.e. &struct page_entry's) in a binary search @tree. Given that the
|
|
|
|
* @tree implementation we use does not provide an O(1) operation to obtain the
|
|
|
|
* highest-ranked element, we use @max to keep track of the inserted page
|
|
|
|
* with the highest index. This is valuable because if a page is not in
|
|
|
|
* the tree and its index is higher than @max's, then we can lock it
|
|
|
|
* without breaking the locking order rule.
|
|
|
|
*
|
|
|
|
* Note on naming: 'struct page_set' would be shorter, but we already have a few
|
|
|
|
* page_set_*() helpers, so page_collection is used instead to avoid confusion.
|
|
|
|
*
|
|
|
|
* See also: page_collection_lock().
|
|
|
|
*/
|
|
|
|
struct page_collection {
|
|
|
|
GTree *tree;
|
|
|
|
struct page_entry *max;
|
|
|
|
};
|
|
|
|
|
2020-05-13 17:51:30 +00:00
|
|
|
/*
 * Width, in bits, of the address space covered by l1_map.
 *
 * User mode indexes the map by guest virtual address, so the width is the
 * smaller of the host long size and the target ABI size.  System mode
 * indexes it by ram offset, so the width is the target physical address
 * space size, capped at the host long size.
 *
 * TODO: For user mode, see the caveat re host vs guest virtual
 * address spaces near GUEST_ADDR_MAX.
 */
#ifdef CONFIG_USER_ONLY
# define L1_MAP_ADDR_SPACE_BITS  MIN(HOST_LONG_BITS, TARGET_ABI_BITS)
#elif HOST_LONG_BITS < TARGET_PHYS_ADDR_SPACE_BITS
# define L1_MAP_ADDR_SPACE_BITS  HOST_LONG_BITS
#else
# define L1_MAP_ADDR_SPACE_BITS  TARGET_PHYS_ADDR_SPACE_BITS
#endif
|
|
|
|
|
2017-07-04 08:42:32 +00:00
|
|
|
/* Make sure all possible CPU event bits fit in tb->trace_vcpu_dstate */
|
|
|
|
QEMU_BUILD_BUG_ON(CPU_TRACE_DSTATE_MAX_EVENTS >
|
2018-06-14 16:44:31 +00:00
|
|
|
sizeof_field(TranslationBlock, trace_vcpu_dstate)
|
2017-07-04 08:42:32 +00:00
|
|
|
* BITS_PER_BYTE);
|
|
|
|
|
2016-10-24 15:26:49 +00:00
|
|
|
/*
|
|
|
|
* L1 Mapping properties
|
|
|
|
*/
|
2022-09-20 05:17:44 +00:00
|
|
|
int v_l1_size;
|
|
|
|
int v_l1_shift;
|
|
|
|
int v_l2_levels;
|
2016-10-24 15:26:49 +00:00
|
|
|
|
2022-09-20 05:17:44 +00:00
|
|
|
void *l1_map[V_L1_MAX_SIZE];
|
2012-12-02 16:04:43 +00:00
|
|
|
|
2017-06-24 00:04:43 +00:00
|
|
|
TBContext tb_ctx;
|
2003-06-15 19:58:51 +00:00
|
|
|
|
2016-10-24 15:26:49 +00:00
|
|
|
static void page_table_config_init(void)
|
|
|
|
{
|
|
|
|
uint32_t v_l1_bits;
|
|
|
|
|
|
|
|
assert(TARGET_PAGE_BITS);
|
|
|
|
/* The bits remaining after N lower levels of page tables. */
|
|
|
|
v_l1_bits = (L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % V_L2_BITS;
|
|
|
|
if (v_l1_bits < V_L1_MIN_BITS) {
|
|
|
|
v_l1_bits += V_L2_BITS;
|
|
|
|
}
|
|
|
|
|
|
|
|
v_l1_size = 1 << v_l1_bits;
|
|
|
|
v_l1_shift = L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS - v_l1_bits;
|
|
|
|
v_l2_levels = v_l1_shift / V_L2_BITS - 1;
|
|
|
|
|
|
|
|
assert(v_l1_bits <= V_L1_MAX_BITS);
|
|
|
|
assert(v_l1_shift % V_L2_BITS == 0);
|
|
|
|
assert(v_l2_levels >= 0);
|
|
|
|
}
|
|
|
|
|
2015-09-02 02:11:45 +00:00
|
|
|
/* Encode VAL as a signed leb128 sequence at P.
|
|
|
|
Return P incremented past the encoded value. */
|
|
|
|
static uint8_t *encode_sleb128(uint8_t *p, target_long val)
|
|
|
|
{
|
|
|
|
int more, byte;
|
|
|
|
|
|
|
|
do {
|
|
|
|
byte = val & 0x7f;
|
|
|
|
val >>= 7;
|
|
|
|
more = !((val == 0 && (byte & 0x40) == 0)
|
|
|
|
|| (val == -1 && (byte & 0x40) != 0));
|
|
|
|
if (more) {
|
|
|
|
byte |= 0x80;
|
|
|
|
}
|
|
|
|
*p++ = byte;
|
|
|
|
} while (more);
|
|
|
|
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Decode a signed leb128 sequence at *PP; increment *PP past the
|
|
|
|
decoded value. Return the decoded value. */
|
2020-10-28 19:05:44 +00:00
|
|
|
static target_long decode_sleb128(const uint8_t **pp)
|
2015-09-02 02:11:45 +00:00
|
|
|
{
|
2020-10-28 19:05:44 +00:00
|
|
|
const uint8_t *p = *pp;
|
2015-09-02 02:11:45 +00:00
|
|
|
target_long val = 0;
|
|
|
|
int byte, shift = 0;
|
|
|
|
|
|
|
|
do {
|
|
|
|
byte = *p++;
|
|
|
|
val |= (target_ulong)(byte & 0x7f) << shift;
|
|
|
|
shift += 7;
|
|
|
|
} while (byte & 0x80);
|
|
|
|
if (shift < TARGET_LONG_BITS && (byte & 0x40)) {
|
|
|
|
val |= -(target_ulong)1 << shift;
|
|
|
|
}
|
|
|
|
|
|
|
|
*pp = p;
|
|
|
|
return val;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Encode the data collected about the instructions while compiling TB.
|
|
|
|
Place the data at BLOCK, and return the number of bytes consumed.
|
|
|
|
|
2017-10-18 22:01:42 +00:00
|
|
|
The logical table consists of TARGET_INSN_START_WORDS target_ulong's,
|
2015-09-02 02:11:45 +00:00
|
|
|
which come from the target's insn_start data, followed by a uintptr_t
|
|
|
|
which comes from the host pc of the end of the code implementing the insn.
|
|
|
|
|
|
|
|
Each line of the table is encoded as sleb128 deltas from the previous
|
2017-07-12 04:08:21 +00:00
|
|
|
line. The seed for the first line is { tb->pc, 0..., tb->tc.ptr }.
|
2015-09-02 02:11:45 +00:00
|
|
|
That is, the first column is seeded with the guest pc, the last column
|
|
|
|
with the host pc, and the middle columns with zeros. */
|
|
|
|
|
|
|
|
static int encode_search(TranslationBlock *tb, uint8_t *block)
|
|
|
|
{
|
2017-07-12 21:15:52 +00:00
|
|
|
uint8_t *highwater = tcg_ctx->code_gen_highwater;
|
2015-09-02 02:11:45 +00:00
|
|
|
uint8_t *p = block;
|
|
|
|
int i, j, n;
|
|
|
|
|
|
|
|
for (i = 0, n = tb->icount; i < n; ++i) {
|
|
|
|
target_ulong prev;
|
|
|
|
|
|
|
|
for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
|
|
|
|
if (i == 0) {
|
2022-08-12 16:53:53 +00:00
|
|
|
prev = (!TARGET_TB_PCREL && j == 0 ? tb_pc(tb) : 0);
|
2015-09-02 02:11:45 +00:00
|
|
|
} else {
|
2017-07-12 21:15:52 +00:00
|
|
|
prev = tcg_ctx->gen_insn_data[i - 1][j];
|
2015-09-02 02:11:45 +00:00
|
|
|
}
|
2017-07-12 21:15:52 +00:00
|
|
|
p = encode_sleb128(p, tcg_ctx->gen_insn_data[i][j] - prev);
|
2015-09-02 02:11:45 +00:00
|
|
|
}
|
2017-07-12 21:15:52 +00:00
|
|
|
prev = (i == 0 ? 0 : tcg_ctx->gen_insn_end_off[i - 1]);
|
|
|
|
p = encode_sleb128(p, tcg_ctx->gen_insn_end_off[i] - prev);
|
2015-09-22 20:01:15 +00:00
|
|
|
|
|
|
|
/* Test for (pending) buffer overflow. The assumption is that any
|
|
|
|
one row beginning below the high water mark cannot overrun
|
|
|
|
the buffer completely. Thus we can test for overflow after
|
|
|
|
encoding a row without having to check during encoding. */
|
|
|
|
if (unlikely(p > highwater)) {
|
|
|
|
return -1;
|
|
|
|
}
|
2015-09-02 02:11:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return p - block;
|
|
|
|
}
|
|
|
|
|
2022-10-24 12:15:04 +00:00
|
|
|
/*
 * Decode the search data stored after the code of @tb until the instruction
 * containing @host_pc is found.
 *
 * On success, @data (TARGET_INSN_START_WORDS entries) holds the
 * reconstructed words for that instruction.  The return value is the number
 * of instructions from that one to the end of the TB, inclusive.
 *
 * Returns -1 if @host_pc, once adjusted by GETPC_ADJ, lies before the start
 * of the TB's code or beyond the end of its last instruction.  In the
 * latter case @data has been overwritten.
 */
static int cpu_unwind_data_from_tb(TranslationBlock *tb, uintptr_t host_pc,
                                   uint64_t *data)
{
    const uint8_t *cursor = tb->tc.ptr + tb->tc.size;
    uintptr_t insn_end = (uintptr_t)tb->tc.ptr;
    const int total = tb->icount;
    int remaining, word;

    host_pc -= GETPC_ADJ;
    if (host_pc < insn_end) {
        return -1;
    }

    /* Seed row: zeros, plus the TB's guest pc unless TBs are pc-relative. */
    for (word = 0; word < TARGET_INSN_START_WORDS; ++word) {
        data[word] = 0;
    }
    if (!TARGET_TB_PCREL) {
        data[0] = tb_pc(tb);
    }

    /*
     * Accumulate the stored deltas row by row, stopping at the first
     * instruction whose host code ends beyond host_pc.
     */
    for (remaining = total; remaining > 0; --remaining) {
        for (word = 0; word < TARGET_INSN_START_WORDS; ++word) {
            data[word] += decode_sleb128(&cursor);
        }
        insn_end += decode_sleb128(&cursor);
        if (insn_end > host_pc) {
            return remaining;
        }
    }

    return -1;
}
|
|
|
|
|
|
|
|
/*
|
2022-10-24 13:12:56 +00:00
|
|
|
* The cpu state corresponding to 'host_pc' is restored in
|
|
|
|
* preparation for exiting the TB.
|
2022-10-24 12:15:04 +00:00
|
|
|
*/
|
|
|
|
void cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
|
2022-10-24 13:12:56 +00:00
|
|
|
uintptr_t host_pc)
|
2022-10-24 12:15:04 +00:00
|
|
|
{
|
|
|
|
uint64_t data[TARGET_INSN_START_WORDS];
|
|
|
|
#ifdef CONFIG_PROFILER
|
|
|
|
TCGProfile *prof = &tcg_ctx->prof;
|
|
|
|
int64_t ti = profile_getclock();
|
|
|
|
#endif
|
|
|
|
int insns_left = cpu_unwind_data_from_tb(tb, host_pc, data);
|
|
|
|
|
|
|
|
if (insns_left < 0) {
|
|
|
|
return;
|
|
|
|
}
|
2007-09-17 08:09:54 +00:00
|
|
|
|
2022-10-24 13:12:56 +00:00
|
|
|
if (tb_cflags(tb) & CF_USE_ICOUNT) {
|
2020-08-19 11:17:19 +00:00
|
|
|
assert(icount_enabled());
|
2022-10-24 12:15:04 +00:00
|
|
|
/*
|
|
|
|
* Reset the cycle counter to the start of the block and
|
|
|
|
* shift if to the number of actually executed instructions.
|
|
|
|
*/
|
|
|
|
cpu_neg(cpu)->icount_decr.u16.low += insns_left;
|
2008-06-29 01:03:05 +00:00
|
|
|
}
|
2022-10-24 09:43:40 +00:00
|
|
|
|
2022-10-24 11:17:39 +00:00
|
|
|
cpu->cc->tcg_ops->restore_state_to_opc(cpu, tb, data);
|
2008-02-01 10:50:11 +00:00
|
|
|
|
|
|
|
#ifdef CONFIG_PROFILER
|
2020-09-23 10:56:46 +00:00
|
|
|
qatomic_set(&prof->restore_time,
|
2017-07-05 23:35:06 +00:00
|
|
|
prof->restore_time + profile_getclock() - ti);
|
2020-09-23 10:56:46 +00:00
|
|
|
qatomic_set(&prof->restore_count, prof->restore_count + 1);
|
2008-02-01 10:50:11 +00:00
|
|
|
#endif
|
2003-06-15 19:58:51 +00:00
|
|
|
}
|
2012-12-02 16:04:43 +00:00
|
|
|
|
2022-10-24 13:09:57 +00:00
|
|
|
/*
 * Restore the cpu state for the translated code at @host_pc.
 *
 * Returns true if @host_pc belongs to a known TB, in which case
 * cpu_restore_state_from_tb() has been called for it; false otherwise.
 */
bool cpu_restore_state(CPUState *cpu, uintptr_t host_pc)
{
    TranslationBlock *tb;

    /*
     * The host_pc has to be in the rx region of the code buffer.
     * If it is not we will not be able to resolve it here.
     * The two cases where host_pc will not be correct are:
     *
     *  - fault during translation (instruction fetch)
     *  - fault from helper (not using GETPC() macro)
     *
     * Either way we need to return early as we can't resolve it here.
     */
    if (!in_code_gen_buffer((const void *)(host_pc - tcg_splitwx_diff))) {
        return false;
    }

    tb = tcg_tb_lookup(host_pc);
    if (tb == NULL) {
        return false;
    }

    cpu_restore_state_from_tb(cpu, tb, host_pc);
    return true;
}
|
|
|
|
|
2022-10-24 12:15:04 +00:00
|
|
|
/*
 * Fill @data with the insn_start words of the guest instruction whose
 * translated code contains @host_pc, without touching any cpu state.
 *
 * Returns true on success.  Returns false if @host_pc is outside the code
 * buffer, belongs to no TB, or cannot be matched to an instruction.
 */
bool cpu_unwind_state_data(CPUState *cpu, uintptr_t host_pc, uint64_t *data)
{
    TranslationBlock *tb;

    if (!in_code_gen_buffer((const void *)(host_pc - tcg_splitwx_diff))) {
        return false;
    }

    tb = tcg_tb_lookup(host_pc);
    if (tb == NULL) {
        return false;
    }

    return cpu_unwind_data_from_tb(tb, host_pc, data) >= 0;
}
|
|
|
|
|
2021-03-09 23:42:16 +00:00
|
|
|
/*
 * Initialise the page size and the page descriptor map configuration.
 *
 * On BSD user-mode builds, additionally walk the memory mappings the host
 * process already has and flag every range that is representable as a
 * guest address with PAGE_RESERVED.
 */
void page_init(void)
{
    page_size_init();
    page_table_config_init();

#if defined(CONFIG_BSD) && defined(CONFIG_USER_ONLY)
    {
#ifdef HAVE_KINFO_GETVMMAP
        struct kinfo_vmentry *freep;
        int i, cnt;

        freep = kinfo_getvmmap(getpid(), &cnt);
        if (freep) {
            mmap_lock();
            for (i = 0; i < cnt; i++) {
                unsigned long startaddr, endaddr;

                startaddr = freep[i].kve_start;
                endaddr = freep[i].kve_end;
                if (h2g_valid(startaddr)) {
                    startaddr = h2g(startaddr) & TARGET_PAGE_MASK;

                    if (h2g_valid(endaddr)) {
                        endaddr = h2g(endaddr);
                        page_set_flags(startaddr, endaddr, PAGE_RESERVED);
                    } else {
#if TARGET_ABI_BITS <= L1_MAP_ADDR_SPACE_BITS
                        /* Mapping extends past the guest space: clamp. */
                        endaddr = ~0ul;
                        page_set_flags(startaddr, endaddr, PAGE_RESERVED);
#endif
                    }
                }
            }
            free(freep);
            mmap_unlock();
        }
#else
        FILE *f;

        last_brk = (unsigned long)sbrk(0);

        f = fopen("/compat/linux/proc/self/maps", "r");
        if (f) {
            mmap_lock();

            for (;;) {
                unsigned long startaddr, endaddr;
                int n;

                n = fscanf(f, "%lx-%lx %*[^\n]\n", &startaddr, &endaddr);
                if (n == EOF) {
                    /* End of file or read error: nothing more to parse. */
                    break;
                }
                if (n != 2) {
                    /*
                     * fscanf() leaves unmatched input in the stream, so
                     * retrying would fail on the same bytes forever.
                     * Discard the rest of the offending line instead.
                     */
                    int c;

                    do {
                        c = fgetc(f);
                    } while (c != EOF && c != '\n');
                    if (c == EOF) {
                        break;
                    }
                    continue;
                }

                if (h2g_valid(startaddr)) {
                    startaddr = h2g(startaddr) & TARGET_PAGE_MASK;

                    if (h2g_valid(endaddr)) {
                        endaddr = h2g(endaddr);
                    } else {
                        /* Mapping extends past the guest space: clamp. */
                        endaddr = ~0ul;
                    }
                    page_set_flags(startaddr, endaddr, PAGE_RESERVED);
                }
            }

            fclose(f);
            mmap_unlock();
        }
#endif
    }
#endif
}
|
|
|
|
|
2022-09-19 10:28:15 +00:00
|
|
|
/*
 * Return the PageDesc for page @index, walking the multi-level table
 * rooted at l1_map.
 *
 * With @alloc false, NULL is returned as soon as a table on the path turns
 * out to be missing.  With @alloc true, each missing table is allocated
 * zero-filled and installed with a compare-and-swap; if another table was
 * installed in the meantime, the fresh one is released and the installed
 * one is used.
 */
PageDesc *page_find_alloc(tb_page_addr_t index, bool alloc)
{
    PageDesc *leaf;
    void **slot;
    int level;

    /* Level 1.  Always allocated.  */
    slot = l1_map + ((index >> v_l1_shift) & (v_l1_size - 1));

    /* Level 2..N-1: tables of V_L2_SIZE pointers.  */
    for (level = v_l2_levels; level > 0; level--) {
        void **table = qatomic_rcu_read(slot);

        if (table == NULL) {
            void *winner;

            if (!alloc) {
                return NULL;
            }
            table = g_new0(void *, V_L2_SIZE);
            winner = qatomic_cmpxchg(slot, NULL, table);
            if (unlikely(winner)) {
                /* The slot was filled meanwhile; use that table. */
                g_free(table);
                table = winner;
            }
        }

        slot = table + ((index >> (level * V_L2_BITS)) & (V_L2_SIZE - 1));
    }

    /* Last level: an array of V_L2_SIZE page descriptors.  */
    leaf = qatomic_rcu_read(slot);
    if (leaf == NULL) {
        void *winner;

        if (!alloc) {
            return NULL;
        }
        leaf = g_new0(PageDesc, V_L2_SIZE);
#ifndef CONFIG_USER_ONLY
        {
            int k;

            /* Locks must be usable before the array becomes visible. */
            for (k = 0; k < V_L2_SIZE; k++) {
                qemu_spin_init(&leaf[k].lock);
            }
        }
#endif
        winner = qatomic_cmpxchg(slot, NULL, leaf);
        if (unlikely(winner)) {
#ifndef CONFIG_USER_ONLY
            {
                int k;

                for (k = 0; k < V_L2_SIZE; k++) {
                    qemu_spin_destroy(&leaf[k].lock);
                }
            }
#endif
            g_free(leaf);
            leaf = winner;
        }
    }

    return leaf + (index & (V_L2_SIZE - 1));
}
|
|
|
|
|
2017-07-27 00:22:51 +00:00
|
|
|
/* In user-mode page locks aren't used; mmap_lock is enough */
|
|
|
|
#ifdef CONFIG_USER_ONLY
|
|
|
|
struct page_collection *
|
|
|
|
page_collection_lock(tb_page_addr_t start, tb_page_addr_t end)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* User-mode stub: there is nothing to unlock or free. */
void page_collection_unlock(struct page_collection *set)
{
}
|
|
|
|
#else /* !CONFIG_USER_ONLY */
|
|
|
|
|
2018-04-05 23:52:53 +00:00
|
|
|
#ifdef CONFIG_DEBUG_TCG
|
|
|
|
|
|
|
|
static __thread GHashTable *ht_pages_locked_debug;
|
|
|
|
|
|
|
|
static void ht_pages_locked_debug_init(void)
|
|
|
|
{
|
|
|
|
if (ht_pages_locked_debug) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
ht_pages_locked_debug = g_hash_table_new(NULL, NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool page_is_locked(const PageDesc *pd)
|
|
|
|
{
|
|
|
|
PageDesc *found;
|
|
|
|
|
|
|
|
ht_pages_locked_debug_init();
|
|
|
|
found = g_hash_table_lookup(ht_pages_locked_debug, pd);
|
|
|
|
return !!found;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Record @pd as locked by the calling thread.
 * Asserts that it was not already recorded.
 */
static void page_lock__debug(PageDesc *pd)
{
    ht_pages_locked_debug_init();

    g_assert(!page_is_locked(pd));
    /* The descriptor's address serves as both key and value. */
    g_hash_table_insert(ht_pages_locked_debug, pd, pd);
}
|
|
|
|
|
|
|
|
static void page_unlock__debug(const PageDesc *pd)
|
|
|
|
{
|
|
|
|
bool removed;
|
|
|
|
|
|
|
|
ht_pages_locked_debug_init();
|
|
|
|
g_assert(page_is_locked(pd));
|
|
|
|
removed = g_hash_table_remove(ht_pages_locked_debug, pd);
|
|
|
|
g_assert(removed);
|
|
|
|
}
|
|
|
|
|
2022-09-20 05:17:44 +00:00
|
|
|
/*
 * Abort, after reporting @file and @line, unless @pd is recorded as locked
 * by the calling thread.
 */
void do_assert_page_locked(const PageDesc *pd, const char *file, int line)
{
    const bool held = page_is_locked(pd);

    if (unlikely(!held)) {
        error_report("assert_page_lock: PageDesc %p not locked @ %s:%d",
                     pd, file, line);
        abort();
    }
}
|
|
|
|
|
2018-02-23 01:50:29 +00:00
|
|
|
void assert_no_pages_locked(void)
|
|
|
|
{
|
|
|
|
ht_pages_locked_debug_init();
|
|
|
|
g_assert(g_hash_table_size(ht_pages_locked_debug) == 0);
|
|
|
|
}
|
|
|
|
|
2018-04-05 23:52:53 +00:00
|
|
|
#else /* !CONFIG_DEBUG_TCG */
|
|
|
|
|
2022-09-20 05:17:44 +00:00
|
|
|
/* Without CONFIG_DEBUG_TCG the lock bookkeeping compiles to nothing. */
static inline void page_lock__debug(const PageDesc *pd)
{
}

static inline void page_unlock__debug(const PageDesc *pd)
{
}
|
2018-04-05 23:52:53 +00:00
|
|
|
|
|
|
|
#endif /* CONFIG_DEBUG_TCG */
|
|
|
|
|
2022-09-20 05:17:44 +00:00
|
|
|
/*
 * Take the lock of @pd.  The debug bookkeeping runs first, so a page that
 * is already recorded as locked is caught before spinning on it.
 */
void page_lock(PageDesc *pd)
{
    page_lock__debug(pd);
    qemu_spin_lock(&pd->lock);
}
|
|
|
|
|
2022-09-20 05:17:44 +00:00
|
|
|
/* Release the lock of @pd, then update the debug bookkeeping. */
void page_unlock(PageDesc *pd)
{
    qemu_spin_unlock(&pd->lock);
    page_unlock__debug(pd);
}
|
|
|
|
|
|
|
|
static inline struct page_entry *
|
|
|
|
page_entry_new(PageDesc *pd, tb_page_addr_t index)
|
|
|
|
{
|
|
|
|
struct page_entry *pe = g_malloc(sizeof(*pe));
|
|
|
|
|
|
|
|
pe->index = index;
|
|
|
|
pe->pd = pd;
|
|
|
|
pe->locked = false;
|
|
|
|
return pe;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Value destructor for the collection's tree: unlock the entry's page and
 * free the entry.  The entry is asserted to be locked.
 */
static void page_entry_destroy(gpointer p)
{
    struct page_entry *entry = p;

    g_assert(entry->locked);
    page_unlock(entry->pd);
    g_free(entry);
}
|
|
|
|
|
|
|
|
/* returns false on success */
|
|
|
|
static bool page_entry_trylock(struct page_entry *pe)
|
|
|
|
{
|
|
|
|
bool busy;
|
|
|
|
|
|
|
|
busy = qemu_spin_trylock(&pe->pd->lock);
|
|
|
|
if (!busy) {
|
|
|
|
g_assert(!pe->locked);
|
|
|
|
pe->locked = true;
|
2018-04-05 23:52:53 +00:00
|
|
|
page_lock__debug(pe->pd);
|
2017-07-27 00:22:51 +00:00
|
|
|
}
|
|
|
|
return busy;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void do_page_entry_lock(struct page_entry *pe)
|
|
|
|
{
|
|
|
|
page_lock(pe->pd);
|
|
|
|
g_assert(!pe->locked);
|
|
|
|
pe->locked = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Tree traversal callback: lock the page entry stored in @value.
 * @key and @data are unused.  Always returns FALSE.
 */
static gboolean page_entry_lock(gpointer key, gpointer value, gpointer data)
{
    do_page_entry_lock((struct page_entry *)value);
    return FALSE;
}
|
|
|
|
|
|
|
|
/*
 * Tree traversal callback: unlock the page entry stored in @value if it is
 * currently locked.  @key and @data are unused.  Always returns FALSE.
 */
static gboolean page_entry_unlock(gpointer key, gpointer value, gpointer data)
{
    struct page_entry *entry = value;

    if (!entry->locked) {
        return FALSE;
    }

    entry->locked = false;
    page_unlock(entry->pd);
    return FALSE;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Trylock a page, and if successful, add the page to a collection.
|
|
|
|
* Returns true ("busy") if the page could not be locked; false otherwise.
|
|
|
|
*/
|
|
|
|
static bool page_trylock_add(struct page_collection *set, tb_page_addr_t addr)
|
|
|
|
{
|
|
|
|
tb_page_addr_t index = addr >> TARGET_PAGE_BITS;
|
|
|
|
struct page_entry *pe;
|
|
|
|
PageDesc *pd;
|
|
|
|
|
|
|
|
pe = g_tree_lookup(set->tree, &index);
|
|
|
|
if (pe) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
pd = page_find(index);
|
|
|
|
if (pd == NULL) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
pe = page_entry_new(pd, index);
|
|
|
|
g_tree_insert(set->tree, &pe->index, pe);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If this is either (1) the first insertion or (2) a page whose index
|
|
|
|
* is higher than any other so far, just lock the page and move on.
|
|
|
|
*/
|
|
|
|
if (set->max == NULL || pe->index > set->max->index) {
|
|
|
|
set->max = pe;
|
|
|
|
do_page_entry_lock(pe);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Try to acquire out-of-order lock; if busy, return busy so that we acquire
|
|
|
|
* locks in order.
|
|
|
|
*/
|
|
|
|
return page_entry_trylock(pe);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Tree key comparator: @ap and @bp point to tb_page_addr_t values.
 * Returns -1, 0 or 1 as *@ap is less than, equal to or greater than *@bp.
 * @udata is unused.
 */
static gint tb_page_addr_cmp(gconstpointer ap, gconstpointer bp, gpointer udata)
{
    const tb_page_addr_t lhs = *(const tb_page_addr_t *)ap;
    const tb_page_addr_t rhs = *(const tb_page_addr_t *)bp;

    return (lhs > rhs) - (lhs < rhs);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Lock a range of pages ([@start,@end[) as well as the pages of all
|
|
|
|
* intersecting TBs.
|
|
|
|
* Locking order: acquire locks in ascending order of page index.
|
|
|
|
*/
|
|
|
|
struct page_collection *
|
|
|
|
page_collection_lock(tb_page_addr_t start, tb_page_addr_t end)
|
|
|
|
{
|
|
|
|
struct page_collection *set = g_malloc(sizeof(*set));
|
|
|
|
tb_page_addr_t index;
|
|
|
|
PageDesc *pd;
|
|
|
|
|
|
|
|
start >>= TARGET_PAGE_BITS;
|
|
|
|
end >>= TARGET_PAGE_BITS;
|
|
|
|
g_assert(start <= end);
|
|
|
|
|
|
|
|
set->tree = g_tree_new_full(tb_page_addr_cmp, NULL, NULL,
|
|
|
|
page_entry_destroy);
|
|
|
|
set->max = NULL;
|
2018-02-23 01:50:29 +00:00
|
|
|
assert_no_pages_locked();
|
2017-07-27 00:22:51 +00:00
|
|
|
|
|
|
|
retry:
|
|
|
|
g_tree_foreach(set->tree, page_entry_lock, NULL);
|
|
|
|
|
|
|
|
for (index = start; index <= end; index++) {
|
|
|
|
TranslationBlock *tb;
|
|
|
|
int n;
|
|
|
|
|
|
|
|
pd = page_find(index);
|
|
|
|
if (pd == NULL) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (page_trylock_add(set, index << TARGET_PAGE_BITS)) {
|
|
|
|
g_tree_foreach(set->tree, page_entry_unlock, NULL);
|
|
|
|
goto retry;
|
|
|
|
}
|
2018-04-05 23:52:53 +00:00
|
|
|
assert_page_locked(pd);
|
2017-07-27 00:22:51 +00:00
|
|
|
PAGE_FOR_EACH_TB(pd, tb, n) {
|
2022-09-20 11:21:40 +00:00
|
|
|
if (page_trylock_add(set, tb_page_addr0(tb)) ||
|
|
|
|
(tb_page_addr1(tb) != -1 &&
|
|
|
|
page_trylock_add(set, tb_page_addr1(tb)))) {
|
2017-07-27 00:22:51 +00:00
|
|
|
/* drop all locks, and reacquire in order */
|
|
|
|
g_tree_foreach(set->tree, page_entry_unlock, NULL);
|
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return set;
|
|
|
|
}
|
|
|
|
|
|
|
|
void page_collection_unlock(struct page_collection *set)
|
|
|
|
{
|
|
|
|
/* entries are unlocked and freed via page_entry_destroy */
|
|
|
|
g_tree_destroy(set->tree);
|
|
|
|
g_free(set);
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* !CONFIG_USER_ONLY */
|
|
|
|
|
2015-08-11 08:59:50 +00:00
|
|
|
/* Called with mmap_lock held for user mode emulation. */
|
2013-09-01 15:43:17 +00:00
|
|
|
TranslationBlock *tb_gen_code(CPUState *cpu,
|
2012-12-02 16:04:43 +00:00
|
|
|
target_ulong pc, target_ulong cs_base,
|
2016-04-07 17:19:22 +00:00
|
|
|
uint32_t flags, int cflags)
|
2012-12-02 16:04:43 +00:00
|
|
|
{
|
2013-09-01 15:43:17 +00:00
|
|
|
CPUArchState *env = cpu->env_ptr;
|
2017-08-01 19:40:16 +00:00
|
|
|
TranslationBlock *tb, *existing_tb;
|
2022-08-11 04:39:29 +00:00
|
|
|
tb_page_addr_t phys_pc;
|
2015-08-28 01:17:40 +00:00
|
|
|
tcg_insn_unit *gen_code_buf;
|
2019-04-16 06:54:54 +00:00
|
|
|
int gen_code_size, search_size, max_insns;
|
2015-08-28 01:17:40 +00:00
|
|
|
#ifdef CONFIG_PROFILER
|
2017-07-05 23:35:06 +00:00
|
|
|
TCGProfile *prof = &tcg_ctx->prof;
|
2015-08-28 01:17:40 +00:00
|
|
|
int64_t ti;
|
|
|
|
#endif
|
2022-08-11 20:48:03 +00:00
|
|
|
void *host_pc;
|
2019-10-23 16:20:47 +00:00
|
|
|
|
2016-10-27 15:10:05 +00:00
|
|
|
assert_memory_lock();
|
2021-01-13 03:28:07 +00:00
|
|
|
qemu_thread_jit_write();
|
2012-12-02 16:04:43 +00:00
|
|
|
|
2022-08-11 20:48:03 +00:00
|
|
|
phys_pc = get_page_addr_code_hostp(env, pc, &host_pc);
|
2015-09-22 20:01:15 +00:00
|
|
|
|
2018-08-14 16:17:19 +00:00
|
|
|
if (phys_pc == -1) {
|
2021-02-13 13:03:20 +00:00
|
|
|
/* Generate a one-shot TB with 1 insn in it */
|
2021-04-15 16:24:53 +00:00
|
|
|
cflags = (cflags & ~CF_COUNT_MASK) | CF_LAST_IO | 1;
|
2018-08-14 16:17:19 +00:00
|
|
|
}
|
|
|
|
|
2019-04-16 06:54:54 +00:00
|
|
|
max_insns = cflags & CF_COUNT_MASK;
|
|
|
|
if (max_insns == 0) {
|
|
|
|
max_insns = TCG_MAX_INSNS;
|
|
|
|
}
|
2021-07-17 22:18:39 +00:00
|
|
|
QEMU_BUILD_BUG_ON(CF_COUNT_MASK + 1 != TCG_MAX_INSNS);
|
|
|
|
|
tcg: introduce regions to split code_gen_buffer
This is groundwork for supporting multiple TCG contexts.
The naive solution here is to split code_gen_buffer statically
among the TCG threads; this however results in poor utilization
if translation needs are different across TCG threads.
What we do here is to add an extra layer of indirection, assigning
regions that act just like pages do in virtual memory allocation.
(BTW if you are wondering about the chosen naming, I did not want
to use blocks or pages because those are already heavily used in QEMU).
We use a global lock to serialize allocations as well as statistics
reporting (we now export the size of the used code_gen_buffer with
tcg_code_size()). Note that for the allocator we could just use
a counter and atomic_inc; however, that would complicate the gathering
of tcg_code_size()-like stats. So given that the region operations are
not a fast path, a lock seems the most reasonable choice.
The effectiveness of this approach is clear after seeing some numbers.
I used the bootup+shutdown of debian-arm with '-tb-size 80' as a benchmark.
Note that I'm evaluating this after enabling per-thread TCG (which
is done by a subsequent commit).
* -smp 1, 1 region (entire buffer):
qemu: flush code_size=83885014 nb_tbs=154739 avg_tb_size=357
qemu: flush code_size=83884902 nb_tbs=153136 avg_tb_size=363
qemu: flush code_size=83885014 nb_tbs=152777 avg_tb_size=364
qemu: flush code_size=83884950 nb_tbs=150057 avg_tb_size=373
qemu: flush code_size=83884998 nb_tbs=150234 avg_tb_size=373
qemu: flush code_size=83885014 nb_tbs=154009 avg_tb_size=360
qemu: flush code_size=83885014 nb_tbs=151007 avg_tb_size=370
qemu: flush code_size=83885014 nb_tbs=151816 avg_tb_size=367
That is, 8 flushes.
* -smp 8, 32 regions (80/32 MB per region) [i.e. this patch]:
qemu: flush code_size=76328008 nb_tbs=141040 avg_tb_size=356
qemu: flush code_size=75366534 nb_tbs=138000 avg_tb_size=361
qemu: flush code_size=76864546 nb_tbs=140653 avg_tb_size=361
qemu: flush code_size=76309084 nb_tbs=135945 avg_tb_size=375
qemu: flush code_size=74581856 nb_tbs=132909 avg_tb_size=375
qemu: flush code_size=73927256 nb_tbs=135616 avg_tb_size=360
qemu: flush code_size=78629426 nb_tbs=142896 avg_tb_size=365
qemu: flush code_size=76667052 nb_tbs=138508 avg_tb_size=368
Again, 8 flushes. Note how buffer utilization is not 100%, but it
is close. Smaller region sizes would yield higher utilization,
but we want region allocation to be rare (it acquires a lock), so
we do not want to go too small.
* -smp 8, static partitioning of 8 regions (10 MB per region):
qemu: flush code_size=21936504 nb_tbs=40570 avg_tb_size=354
qemu: flush code_size=11472174 nb_tbs=20633 avg_tb_size=370
qemu: flush code_size=11603976 nb_tbs=21059 avg_tb_size=365
qemu: flush code_size=23254872 nb_tbs=41243 avg_tb_size=377
qemu: flush code_size=28289496 nb_tbs=52057 avg_tb_size=358
qemu: flush code_size=43605160 nb_tbs=78896 avg_tb_size=367
qemu: flush code_size=45166552 nb_tbs=82158 avg_tb_size=364
qemu: flush code_size=63289640 nb_tbs=116494 avg_tb_size=358
qemu: flush code_size=51389960 nb_tbs=93937 avg_tb_size=362
qemu: flush code_size=59665928 nb_tbs=107063 avg_tb_size=372
qemu: flush code_size=38380824 nb_tbs=68597 avg_tb_size=374
qemu: flush code_size=44884568 nb_tbs=79901 avg_tb_size=376
qemu: flush code_size=50782632 nb_tbs=90681 avg_tb_size=374
qemu: flush code_size=39848888 nb_tbs=71433 avg_tb_size=372
qemu: flush code_size=64708840 nb_tbs=119052 avg_tb_size=359
qemu: flush code_size=49830008 nb_tbs=90992 avg_tb_size=362
qemu: flush code_size=68372408 nb_tbs=123442 avg_tb_size=368
qemu: flush code_size=33555560 nb_tbs=59514 avg_tb_size=378
qemu: flush code_size=44748344 nb_tbs=80974 avg_tb_size=367
qemu: flush code_size=37104248 nb_tbs=67609 avg_tb_size=364
That is, 20 flushes. Note how a static partitioning approach uses
the code buffer poorly, leading to many unnecessary flushes.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-07-07 23:24:20 +00:00
|
|
|
buffer_overflow:
|
2019-10-23 16:20:47 +00:00
|
|
|
tb = tcg_tb_alloc(tcg_ctx);
|
2015-09-22 20:01:15 +00:00
|
|
|
if (unlikely(!tb)) {
|
2012-12-02 16:04:43 +00:00
|
|
|
/* flush must be done */
|
2015-06-24 02:31:15 +00:00
|
|
|
tb_flush(cpu);
|
2016-08-02 17:27:43 +00:00
|
|
|
mmap_unlock();
|
2017-01-26 12:34:18 +00:00
|
|
|
/* Make the execution loop process the flush as soon as possible. */
|
|
|
|
cpu->exception_index = EXCP_INTERRUPT;
|
2016-08-02 17:27:43 +00:00
|
|
|
cpu_loop_exit(cpu);
|
2012-12-02 16:04:43 +00:00
|
|
|
}
|
2015-08-28 01:17:40 +00:00
|
|
|
|
2017-07-12 21:15:52 +00:00
|
|
|
gen_code_buf = tcg_ctx->code_gen_ptr;
|
2020-10-28 19:05:44 +00:00
|
|
|
tb->tc.ptr = tcg_splitwx_to_rx(gen_code_buf);
|
2022-08-12 16:53:53 +00:00
|
|
|
#if !TARGET_TB_PCREL
|
2017-06-09 19:55:22 +00:00
|
|
|
tb->pc = pc;
|
2022-08-12 16:53:53 +00:00
|
|
|
#endif
|
2012-12-02 16:04:43 +00:00
|
|
|
tb->cs_base = cs_base;
|
|
|
|
tb->flags = flags;
|
|
|
|
tb->cflags = cflags;
|
2017-07-04 08:42:32 +00:00
|
|
|
tb->trace_vcpu_dstate = *cpu->trace_dstate;
|
2022-09-20 11:21:40 +00:00
|
|
|
tb_set_page_addr0(tb, phys_pc);
|
|
|
|
tb_set_page_addr1(tb, -1);
|
2017-07-12 21:15:52 +00:00
|
|
|
tcg_ctx->tb_cflags = cflags;
|
2019-04-16 08:06:39 +00:00
|
|
|
tb_overflow:
|
2015-08-28 01:17:40 +00:00
|
|
|
|
|
|
|
#ifdef CONFIG_PROFILER
|
2017-07-05 23:35:06 +00:00
|
|
|
/* includes aborted translations because of exceptions */
|
2020-09-23 10:56:46 +00:00
|
|
|
qatomic_set(&prof->tb_count1, prof->tb_count1 + 1);
|
2015-08-28 01:17:40 +00:00
|
|
|
ti = profile_getclock();
|
|
|
|
#endif
|
|
|
|
|
2021-01-23 22:11:17 +00:00
|
|
|
gen_code_size = sigsetjmp(tcg_ctx->jmp_trans, 0);
|
|
|
|
if (unlikely(gen_code_size != 0)) {
|
|
|
|
goto error_return;
|
|
|
|
}
|
|
|
|
|
2017-07-12 21:15:52 +00:00
|
|
|
tcg_func_start(tcg_ctx);
|
2015-08-28 01:17:40 +00:00
|
|
|
|
2019-03-22 23:07:18 +00:00
|
|
|
tcg_ctx->cpu = env_cpu(env);
|
2022-08-11 20:48:03 +00:00
|
|
|
gen_intermediate_code(cpu, tb, max_insns, pc, host_pc);
|
2021-04-16 15:49:39 +00:00
|
|
|
assert(tb->size != 0);
|
2017-07-12 21:15:52 +00:00
|
|
|
tcg_ctx->cpu = NULL;
|
2021-01-23 22:11:17 +00:00
|
|
|
max_insns = tb->icount;
|
2015-08-28 01:17:40 +00:00
|
|
|
|
2022-08-15 20:16:06 +00:00
|
|
|
trace_translate_block(tb, pc, tb->tc.ptr);
|
2015-08-28 01:17:40 +00:00
|
|
|
|
|
|
|
/* generate machine code */
|
2016-04-10 20:35:45 +00:00
|
|
|
tb->jmp_reset_offset[0] = TB_JMP_RESET_OFFSET_INVALID;
|
|
|
|
tb->jmp_reset_offset[1] = TB_JMP_RESET_OFFSET_INVALID;
|
2017-07-12 21:15:52 +00:00
|
|
|
tcg_ctx->tb_jmp_reset_offset = tb->jmp_reset_offset;
|
2017-08-01 05:02:31 +00:00
|
|
|
if (TCG_TARGET_HAS_direct_jump) {
|
2017-07-12 21:15:52 +00:00
|
|
|
tcg_ctx->tb_jmp_insn_offset = tb->jmp_target_arg;
|
|
|
|
tcg_ctx->tb_jmp_target_addr = NULL;
|
2017-08-01 05:02:31 +00:00
|
|
|
} else {
|
2017-07-12 21:15:52 +00:00
|
|
|
tcg_ctx->tb_jmp_insn_offset = NULL;
|
|
|
|
tcg_ctx->tb_jmp_target_addr = tb->jmp_target_arg;
|
2017-08-01 05:02:31 +00:00
|
|
|
}
|
2015-08-28 01:17:40 +00:00
|
|
|
|
|
|
|
#ifdef CONFIG_PROFILER
|
2020-09-23 10:56:46 +00:00
|
|
|
qatomic_set(&prof->tb_count, prof->tb_count + 1);
|
|
|
|
qatomic_set(&prof->interm_time,
|
|
|
|
prof->interm_time + profile_getclock() - ti);
|
2017-07-07 22:22:49 +00:00
|
|
|
ti = profile_getclock();
|
2015-08-28 01:17:40 +00:00
|
|
|
#endif
|
|
|
|
|
2022-08-15 20:16:06 +00:00
|
|
|
gen_code_size = tcg_gen_code(tcg_ctx, tb, pc);
|
2015-09-22 20:01:15 +00:00
|
|
|
if (unlikely(gen_code_size < 0)) {
|
2021-01-23 22:11:17 +00:00
|
|
|
error_return:
|
2019-04-16 08:06:39 +00:00
|
|
|
switch (gen_code_size) {
|
|
|
|
case -1:
|
|
|
|
/*
|
|
|
|
* Overflow of code_gen_buffer, or the current slice of it.
|
|
|
|
*
|
|
|
|
* TODO: We don't need to re-do gen_intermediate_code, nor
|
|
|
|
* should we re-do the tcg optimization currently hidden
|
|
|
|
* inside tcg_gen_code. All that should be required is to
|
|
|
|
* flush the TBs, allocate a new TB, re-initialize it per
|
|
|
|
* above, and re-do the actual code generation.
|
|
|
|
*/
|
2021-01-23 22:11:17 +00:00
|
|
|
qemu_log_mask(CPU_LOG_TB_OP | CPU_LOG_TB_OP_OPT,
|
|
|
|
"Restarting code generation for "
|
|
|
|
"code_gen_buffer overflow\n");
|
2019-04-16 08:06:39 +00:00
|
|
|
goto buffer_overflow;
|
|
|
|
|
|
|
|
case -2:
|
|
|
|
/*
|
|
|
|
* The code generated for the TranslationBlock is too large.
|
|
|
|
* The maximum size allowed by the unwind info is 64k.
|
|
|
|
* There may be stricter constraints from relocations
|
|
|
|
* in the tcg backend.
|
|
|
|
*
|
|
|
|
* Try again with half as many insns as we attempted this time.
|
|
|
|
* If a single insn overflows, there's a bug somewhere...
|
|
|
|
*/
|
|
|
|
assert(max_insns > 1);
|
|
|
|
max_insns /= 2;
|
2021-01-23 22:11:17 +00:00
|
|
|
qemu_log_mask(CPU_LOG_TB_OP | CPU_LOG_TB_OP_OPT,
|
|
|
|
"Restarting code generation with "
|
|
|
|
"smaller translation block (max %d insns)\n",
|
|
|
|
max_insns);
|
2019-04-16 08:06:39 +00:00
|
|
|
goto tb_overflow;
|
|
|
|
|
|
|
|
default:
|
|
|
|
g_assert_not_reached();
|
|
|
|
}
|
2015-09-22 20:01:15 +00:00
|
|
|
}
|
2015-09-02 02:11:45 +00:00
|
|
|
search_size = encode_search(tb, (void *)gen_code_buf + gen_code_size);
|
2015-09-22 20:01:15 +00:00
|
|
|
if (unlikely(search_size < 0)) {
|
|
|
|
goto buffer_overflow;
|
|
|
|
}
|
translate-all: use a binary search tree to track TBs in TBContext
This is a prerequisite for supporting multiple TCG contexts, since
we will have threads generating code in separate regions of
code_gen_buffer.
For this we need a new field (.size) in struct tb_tc to keep
track of the size of the translated code. This field uses a size_t
to avoid adding a hole to the struct, although really an unsigned
int would have been enough.
The comparison function we use is optimized for the common case:
insertions. Profiling shows that upon booting debian-arm, 98%
of comparisons are between existing tb's (i.e. a->size and b->size
are both !0), which happens during insertions (and removals, but
those are rare). The remaining cases are lookups. From reading the glib
sources we see that the first key is always the lookup key. However,
the code does not assume this to always be the case because this
behaviour is not guaranteed in the glib docs. However, we embed
this knowledge in the code as a branch hint for the compiler.
Note that tb_free does not free space in the code_gen_buffer anymore,
since we cannot easily know whether the tb is the last one inserted
in code_gen_buffer. The next patch in this series renames tb_free
to tb_remove to reflect this.
Performance-wise, lookups in tb_find_pc are the same as before:
O(log n). However, insertions are O(log n) instead of O(1), which
results in a small slowdown when booting debian-arm:
Performance counter stats for 'build/arm-softmmu/qemu-system-arm \
-machine type=virt -nographic -smp 1 -m 4096 \
-netdev user,id=unet,hostfwd=tcp::2222-:22 \
-device virtio-net-device,netdev=unet \
-drive file=img/arm/jessie-arm32.qcow2,id=myblock,index=0,if=none \
-device virtio-blk-device,drive=myblock \
-kernel img/arm/aarch32-current-linux-kernel-only.img \
-append console=ttyAMA0 root=/dev/vda1 \
-name arm,debug-threads=on -smp 1' (10 runs):
- Before:
8048.598422 task-clock (msec) # 0.931 CPUs utilized ( +- 0.28% )
16,974 context-switches # 0.002 M/sec ( +- 0.12% )
0 cpu-migrations # 0.000 K/sec
10,125 page-faults # 0.001 M/sec ( +- 1.23% )
35,144,901,879 cycles # 4.367 GHz ( +- 0.14% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
65,758,252,643 instructions # 1.87 insns per cycle ( +- 0.33% )
10,871,298,668 branches # 1350.707 M/sec ( +- 0.41% )
192,322,212 branch-misses # 1.77% of all branches ( +- 0.32% )
8.640869419 seconds time elapsed ( +- 0.57% )
- After:
8146.242027 task-clock (msec) # 0.923 CPUs utilized ( +- 1.23% )
17,016 context-switches # 0.002 M/sec ( +- 0.40% )
0 cpu-migrations # 0.000 K/sec
18,769 page-faults # 0.002 M/sec ( +- 0.45% )
35,660,956,120 cycles # 4.378 GHz ( +- 1.22% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
65,095,366,607 instructions # 1.83 insns per cycle ( +- 1.73% )
10,803,480,261 branches # 1326.192 M/sec ( +- 1.95% )
195,601,289 branch-misses # 1.81% of all branches ( +- 0.39% )
8.828660235 seconds time elapsed ( +- 0.38% )
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-06-23 23:00:11 +00:00
|
|
|
tb->tc.size = gen_code_size;
|
2015-08-28 01:17:40 +00:00
|
|
|
|
|
|
|
#ifdef CONFIG_PROFILER
|
2020-09-23 10:56:46 +00:00
|
|
|
qatomic_set(&prof->code_time, prof->code_time + profile_getclock() - ti);
|
|
|
|
qatomic_set(&prof->code_in_len, prof->code_in_len + tb->size);
|
|
|
|
qatomic_set(&prof->code_out_len, prof->code_out_len + gen_code_size);
|
|
|
|
qatomic_set(&prof->search_out_len, prof->search_out_len + search_size);
|
2015-08-28 01:17:40 +00:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef DEBUG_DISAS
|
2016-03-15 14:30:21 +00:00
|
|
|
if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM) &&
|
2022-08-15 20:16:06 +00:00
|
|
|
qemu_log_in_addr_range(pc)) {
|
2022-04-17 18:29:47 +00:00
|
|
|
FILE *logfile = qemu_log_trylock();
|
2022-04-17 18:29:49 +00:00
|
|
|
if (logfile) {
|
|
|
|
int code_size, data_size;
|
|
|
|
const tcg_target_ulong *rx_data_gen_ptr;
|
|
|
|
size_t chunk_start;
|
|
|
|
int insn = 0;
|
|
|
|
|
|
|
|
if (tcg_ctx->data_gen_ptr) {
|
|
|
|
rx_data_gen_ptr = tcg_splitwx_to_rx(tcg_ctx->data_gen_ptr);
|
|
|
|
code_size = (const void *)rx_data_gen_ptr - tb->tc.ptr;
|
|
|
|
data_size = gen_code_size - code_size;
|
|
|
|
} else {
|
|
|
|
rx_data_gen_ptr = 0;
|
|
|
|
code_size = gen_code_size;
|
|
|
|
data_size = 0;
|
|
|
|
}
|
2017-07-30 20:13:21 +00:00
|
|
|
|
2022-04-17 18:29:49 +00:00
|
|
|
/* Dump header and the first instruction */
|
|
|
|
fprintf(logfile, "OUT: [size=%d]\n", gen_code_size);
|
|
|
|
fprintf(logfile,
|
|
|
|
" -- guest addr 0x" TARGET_FMT_lx " + tb prologue\n",
|
|
|
|
tcg_ctx->gen_insn_data[insn][0]);
|
|
|
|
chunk_start = tcg_ctx->gen_insn_end_off[insn];
|
|
|
|
disas(logfile, tb->tc.ptr, chunk_start);
|
2017-07-30 20:13:21 +00:00
|
|
|
|
2022-04-17 18:29:49 +00:00
|
|
|
/*
|
|
|
|
* Dump each instruction chunk, wrapping up empty chunks into
|
|
|
|
* the next instruction. The whole array is offset so the
|
|
|
|
* first entry is the beginning of the 2nd instruction.
|
|
|
|
*/
|
|
|
|
while (insn < tb->icount) {
|
|
|
|
size_t chunk_end = tcg_ctx->gen_insn_end_off[insn];
|
|
|
|
if (chunk_end > chunk_start) {
|
|
|
|
fprintf(logfile, " -- guest addr 0x" TARGET_FMT_lx "\n",
|
|
|
|
tcg_ctx->gen_insn_data[insn][0]);
|
|
|
|
disas(logfile, tb->tc.ptr + chunk_start,
|
|
|
|
chunk_end - chunk_start);
|
|
|
|
chunk_start = chunk_end;
|
|
|
|
}
|
|
|
|
insn++;
|
2020-05-13 17:51:34 +00:00
|
|
|
}
|
|
|
|
|
2022-04-17 18:29:49 +00:00
|
|
|
if (chunk_start < code_size) {
|
|
|
|
fprintf(logfile, " -- tb slow paths + alignment\n");
|
|
|
|
disas(logfile, tb->tc.ptr + chunk_start,
|
|
|
|
code_size - chunk_start);
|
|
|
|
}
|
2020-09-10 19:15:04 +00:00
|
|
|
|
2022-04-17 18:29:49 +00:00
|
|
|
/* Finally dump any data we may have after the block */
|
|
|
|
if (data_size) {
|
|
|
|
int i;
|
|
|
|
fprintf(logfile, " data: [size=%d]\n", data_size);
|
|
|
|
for (i = 0; i < data_size / sizeof(tcg_target_ulong); i++) {
|
|
|
|
if (sizeof(tcg_target_ulong) == 8) {
|
|
|
|
fprintf(logfile,
|
|
|
|
"0x%08" PRIxPTR ": .quad 0x%016" TCG_PRIlx "\n",
|
|
|
|
(uintptr_t)&rx_data_gen_ptr[i], rx_data_gen_ptr[i]);
|
|
|
|
} else if (sizeof(tcg_target_ulong) == 4) {
|
|
|
|
fprintf(logfile,
|
|
|
|
"0x%08" PRIxPTR ": .long 0x%08" TCG_PRIlx "\n",
|
|
|
|
(uintptr_t)&rx_data_gen_ptr[i], rx_data_gen_ptr[i]);
|
|
|
|
} else {
|
|
|
|
qemu_build_not_reached();
|
|
|
|
}
|
2021-05-15 10:42:02 +00:00
|
|
|
}
|
2017-07-30 20:13:21 +00:00
|
|
|
}
|
2022-04-17 18:29:49 +00:00
|
|
|
fprintf(logfile, "\n");
|
|
|
|
qemu_log_unlock(logfile);
|
2017-07-30 20:13:21 +00:00
|
|
|
}
|
2015-08-28 01:17:40 +00:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2020-09-23 10:56:46 +00:00
|
|
|
qatomic_set(&tcg_ctx->code_gen_ptr, (void *)
|
2015-09-02 02:11:45 +00:00
|
|
|
ROUND_UP((uintptr_t)gen_code_buf + gen_code_size + search_size,
|
tcg: introduce regions to split code_gen_buffer
This is groundwork for supporting multiple TCG contexts.
The naive solution here is to split code_gen_buffer statically
among the TCG threads; this however results in poor utilization
if translation needs are different across TCG threads.
What we do here is to add an extra layer of indirection, assigning
regions that act just like pages do in virtual memory allocation.
(BTW if you are wondering about the chosen naming, I did not want
to use blocks or pages because those are already heavily used in QEMU).
We use a global lock to serialize allocations as well as statistics
reporting (we now export the size of the used code_gen_buffer with
tcg_code_size()). Note that for the allocator we could just use
a counter and atomic_inc; however, that would complicate the gathering
of tcg_code_size()-like stats. So given that the region operations are
not a fast path, a lock seems the most reasonable choice.
The effectiveness of this approach is clear after seeing some numbers.
I used the bootup+shutdown of debian-arm with '-tb-size 80' as a benchmark.
Note that I'm evaluating this after enabling per-thread TCG (which
is done by a subsequent commit).
* -smp 1, 1 region (entire buffer):
qemu: flush code_size=83885014 nb_tbs=154739 avg_tb_size=357
qemu: flush code_size=83884902 nb_tbs=153136 avg_tb_size=363
qemu: flush code_size=83885014 nb_tbs=152777 avg_tb_size=364
qemu: flush code_size=83884950 nb_tbs=150057 avg_tb_size=373
qemu: flush code_size=83884998 nb_tbs=150234 avg_tb_size=373
qemu: flush code_size=83885014 nb_tbs=154009 avg_tb_size=360
qemu: flush code_size=83885014 nb_tbs=151007 avg_tb_size=370
qemu: flush code_size=83885014 nb_tbs=151816 avg_tb_size=367
That is, 8 flushes.
* -smp 8, 32 regions (80/32 MB per region) [i.e. this patch]:
qemu: flush code_size=76328008 nb_tbs=141040 avg_tb_size=356
qemu: flush code_size=75366534 nb_tbs=138000 avg_tb_size=361
qemu: flush code_size=76864546 nb_tbs=140653 avg_tb_size=361
qemu: flush code_size=76309084 nb_tbs=135945 avg_tb_size=375
qemu: flush code_size=74581856 nb_tbs=132909 avg_tb_size=375
qemu: flush code_size=73927256 nb_tbs=135616 avg_tb_size=360
qemu: flush code_size=78629426 nb_tbs=142896 avg_tb_size=365
qemu: flush code_size=76667052 nb_tbs=138508 avg_tb_size=368
Again, 8 flushes. Note how buffer utilization is not 100%, but it
is close. Smaller region sizes would yield higher utilization,
but we want region allocation to be rare (it acquires a lock), so
we do not want to go too small.
* -smp 8, static partitioning of 8 regions (10 MB per region):
qemu: flush code_size=21936504 nb_tbs=40570 avg_tb_size=354
qemu: flush code_size=11472174 nb_tbs=20633 avg_tb_size=370
qemu: flush code_size=11603976 nb_tbs=21059 avg_tb_size=365
qemu: flush code_size=23254872 nb_tbs=41243 avg_tb_size=377
qemu: flush code_size=28289496 nb_tbs=52057 avg_tb_size=358
qemu: flush code_size=43605160 nb_tbs=78896 avg_tb_size=367
qemu: flush code_size=45166552 nb_tbs=82158 avg_tb_size=364
qemu: flush code_size=63289640 nb_tbs=116494 avg_tb_size=358
qemu: flush code_size=51389960 nb_tbs=93937 avg_tb_size=362
qemu: flush code_size=59665928 nb_tbs=107063 avg_tb_size=372
qemu: flush code_size=38380824 nb_tbs=68597 avg_tb_size=374
qemu: flush code_size=44884568 nb_tbs=79901 avg_tb_size=376
qemu: flush code_size=50782632 nb_tbs=90681 avg_tb_size=374
qemu: flush code_size=39848888 nb_tbs=71433 avg_tb_size=372
qemu: flush code_size=64708840 nb_tbs=119052 avg_tb_size=359
qemu: flush code_size=49830008 nb_tbs=90992 avg_tb_size=362
qemu: flush code_size=68372408 nb_tbs=123442 avg_tb_size=368
qemu: flush code_size=33555560 nb_tbs=59514 avg_tb_size=378
qemu: flush code_size=44748344 nb_tbs=80974 avg_tb_size=367
qemu: flush code_size=37104248 nb_tbs=67609 avg_tb_size=364
That is, 20 flushes. Note how a static partitioning approach uses
the code buffer poorly, leading to many unnecessary flushes.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-07-07 23:24:20 +00:00
|
|
|
CODE_GEN_ALIGN));
|
2012-12-02 16:04:43 +00:00
|
|
|
|
2016-03-22 16:00:12 +00:00
|
|
|
/* init jump list */
|
translate-all: protect TB jumps with a per-destination-TB lock
This applies to both user-mode and !user-mode emulation.
Instead of relying on a global lock, protect the list of incoming
jumps with tb->jmp_lock. This lock also protects tb->cflags,
so update all tb->cflags readers outside tb->jmp_lock to use
atomic reads via tb_cflags().
In order to find the destination TB (and therefore its jmp_lock)
from the origin TB, we introduce tb->jmp_dest[].
I considered not using a linked list of jumps, which simplifies
code and makes the struct smaller. However, it unnecessarily increases
memory usage, which results in a performance decrease. See for
instance these numbers booting+shutting down debian-arm:
Time (s) Rel. err (%) Abs. err (s) Rel. slowdown (%)
------------------------------------------------------------------------------
before 20.88 0.74 0.154512 0.
after 20.81 0.38 0.079078 -0.33524904
GTree 21.02 0.28 0.058856 0.67049808
GHashTable + xxhash 21.63 1.08 0.233604 3.5919540
Using a hash table or a binary tree to keep track of the jumps
doesn't really pay off, not only due to the increased memory usage,
but also because most TBs have only 0 or 1 jumps to them. The maximum
number of jumps when booting debian-arm that I measured is 35, but
as we can see in the histogram below a TB with that many incoming jumps
is extremely rare; the average TB has 0.80 incoming jumps.
n_jumps: 379208; avg jumps/tb: 0.801099
dist: [0.0,1.0)|▄█▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁ ▁▁▁ ▁▁▁ ▁|[34.0,35.0]
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-08-03 00:34:06 +00:00
|
|
|
qemu_spin_init(&tb->jmp_lock);
|
|
|
|
tb->jmp_list_head = (uintptr_t)NULL;
|
2016-03-22 16:00:12 +00:00
|
|
|
tb->jmp_list_next[0] = (uintptr_t)NULL;
|
|
|
|
tb->jmp_list_next[1] = (uintptr_t)NULL;
|
translate-all: protect TB jumps with a per-destination-TB lock
This applies to both user-mode and !user-mode emulation.
Instead of relying on a global lock, protect the list of incoming
jumps with tb->jmp_lock. This lock also protects tb->cflags,
so update all tb->cflags readers outside tb->jmp_lock to use
atomic reads via tb_cflags().
In order to find the destination TB (and therefore its jmp_lock)
from the origin TB, we introduce tb->jmp_dest[].
I considered not using a linked list of jumps, which simplifies
code and makes the struct smaller. However, it unnecessarily increases
memory usage, which results in a performance decrease. See for
instance these numbers booting+shutting down debian-arm:
Time (s) Rel. err (%) Abs. err (s) Rel. slowdown (%)
------------------------------------------------------------------------------
before 20.88 0.74 0.154512 0.
after 20.81 0.38 0.079078 -0.33524904
GTree 21.02 0.28 0.058856 0.67049808
GHashTable + xxhash 21.63 1.08 0.233604 3.5919540
Using a hash table or a binary tree to keep track of the jumps
doesn't really pay off, not only due to the increased memory usage,
but also because most TBs have only 0 or 1 jumps to them. The maximum
number of jumps when booting debian-arm that I measured is 35, but
as we can see in the histogram below a TB with that many incoming jumps
is extremely rare; the average TB has 0.80 incoming jumps.
n_jumps: 379208; avg jumps/tb: 0.801099
dist: [0.0,1.0)|▄█▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁ ▁▁▁ ▁▁▁ ▁|[34.0,35.0]
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-08-03 00:34:06 +00:00
|
|
|
tb->jmp_dest[0] = (uintptr_t)NULL;
|
|
|
|
tb->jmp_dest[1] = (uintptr_t)NULL;
|
2016-03-22 16:00:12 +00:00
|
|
|
|
2018-07-12 19:44:54 +00:00
|
|
|
/* init original jump addresses which have been set during tcg_gen_code() */
|
2016-03-22 16:00:12 +00:00
|
|
|
if (tb->jmp_reset_offset[0] != TB_JMP_RESET_OFFSET_INVALID) {
|
|
|
|
tb_reset_jump(tb, 0);
|
|
|
|
}
|
|
|
|
if (tb->jmp_reset_offset[1] != TB_JMP_RESET_OFFSET_INVALID) {
|
|
|
|
tb_reset_jump(tb, 1);
|
|
|
|
}
|
|
|
|
|
2021-02-13 13:03:20 +00:00
|
|
|
/*
|
2022-08-11 04:39:29 +00:00
|
|
|
* If the TB is not associated with a physical RAM page then it must be
|
|
|
|
* a temporary one-insn TB, and we have nothing left to do. Return early
|
|
|
|
* before attempting to link to other TBs or add to the lookup table.
|
2021-02-13 13:03:20 +00:00
|
|
|
*/
|
2022-09-20 11:21:40 +00:00
|
|
|
if (tb_page_addr0(tb) == -1) {
|
2021-02-13 13:03:20 +00:00
|
|
|
return tb;
|
|
|
|
}
|
|
|
|
|
2021-07-04 14:31:26 +00:00
|
|
|
/*
|
|
|
|
* Insert TB into the corresponding region tree before publishing it
|
|
|
|
* through QHT. Otherwise rewinding happened in the TB might fail to
|
|
|
|
* lookup itself using host PC.
|
|
|
|
*/
|
|
|
|
tcg_tb_insert(tb);
|
|
|
|
|
2017-08-05 03:46:31 +00:00
|
|
|
/*
|
|
|
|
* No explicit memory barrier is required -- tb_link_page() makes the
|
|
|
|
* TB visible in a consistent state.
|
2016-03-22 16:00:12 +00:00
|
|
|
*/
|
2022-09-20 11:21:40 +00:00
|
|
|
existing_tb = tb_link_page(tb, tb_page_addr0(tb), tb_page_addr1(tb));
|
2017-08-01 19:40:16 +00:00
|
|
|
/* if the TB already exists, discard what we just translated */
|
|
|
|
if (unlikely(existing_tb != tb)) {
|
|
|
|
uintptr_t orig_aligned = (uintptr_t)gen_code_buf;
|
|
|
|
|
|
|
|
orig_aligned -= ROUND_UP(sizeof(*tb), qemu_icache_linesize);
|
2020-09-23 10:56:46 +00:00
|
|
|
qatomic_set(&tcg_ctx->code_gen_ptr, (void *)orig_aligned);
|
2021-07-04 14:31:26 +00:00
|
|
|
tcg_tb_remove(tb);
|
2017-08-01 19:40:16 +00:00
|
|
|
return existing_tb;
|
|
|
|
}
|
2012-12-02 16:04:43 +00:00
|
|
|
return tb;
|
|
|
|
}
|
|
|
|
|
2017-08-05 03:46:31 +00:00
|
|
|
/* user-mode: call with mmap_lock held */
|
2019-09-22 03:24:12 +00:00
|
|
|
void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr)
|
2012-12-02 16:04:43 +00:00
|
|
|
{
|
|
|
|
TranslationBlock *tb;
|
|
|
|
|
2017-08-05 03:46:31 +00:00
|
|
|
assert_memory_lock();
|
|
|
|
|
2019-09-22 03:24:12 +00:00
|
|
|
tb = tcg_tb_lookup(retaddr);
|
2015-06-12 22:45:59 +00:00
|
|
|
if (tb) {
|
|
|
|
/* We can use retranslation to find the PC. */
|
2022-10-24 13:12:56 +00:00
|
|
|
cpu_restore_state_from_tb(cpu, tb, retaddr);
|
2015-06-12 22:45:59 +00:00
|
|
|
tb_phys_invalidate(tb, -1);
|
|
|
|
} else {
|
|
|
|
/* The exception probably happened in a helper. The CPU state should
|
|
|
|
have been saved before calling it. Fetch the PC from there. */
|
|
|
|
CPUArchState *env = cpu->env_ptr;
|
|
|
|
target_ulong pc, cs_base;
|
|
|
|
tb_page_addr_t addr;
|
2016-04-07 17:19:22 +00:00
|
|
|
uint32_t flags;
|
2015-06-12 22:45:59 +00:00
|
|
|
|
|
|
|
cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
|
|
|
|
addr = get_page_addr_code(env, pc);
|
2018-08-14 16:17:19 +00:00
|
|
|
if (addr != -1) {
|
|
|
|
tb_invalidate_phys_range(addr, addr + 1);
|
|
|
|
}
|
2012-12-02 16:04:43 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef CONFIG_USER_ONLY
|
2021-02-13 13:03:22 +00:00
|
|
|
/*
|
|
|
|
* In deterministic execution mode, instructions doing device I/Os
|
tcg: drop global lock during TCG code execution
This finally allows TCG to benefit from the iothread introduction: Drop
the global mutex while running pure TCG CPU code. Reacquire the lock
when entering MMIO or PIO emulation, or when leaving the TCG loop.
We have to revert a few optimizations for the current TCG threading
model, namely kicking the TCG thread in qemu_mutex_lock_iothread and not
kicking it in qemu_cpu_kick. We also need to disable RAM block
reordering until we have a more efficient locking mechanism at hand.
Still, a Linux x86 UP guest and my Musicpal ARM model boot fine here.
These numbers demonstrate where we gain something:
20338 jan 20 0 331m 75m 6904 R 99 0.9 0:50.95 qemu-system-arm
20337 jan 20 0 331m 75m 6904 S 20 0.9 0:26.50 qemu-system-arm
The guest CPU was fully loaded, but the iothread could still run mostly
independently on a second core. Without the patch we don't get beyond
32206 jan 20 0 330m 73m 7036 R 82 0.9 1:06.00 qemu-system-arm
32204 jan 20 0 330m 73m 7036 S 21 0.9 0:17.03 qemu-system-arm
We don't benefit significantly, though, when the guest is not fully
loading a host CPU.
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Message-Id: <1439220437-23957-10-git-send-email-fred.konrad@greensocs.com>
[FK: Rebase, fix qemu_devices_reset deadlock, rm address_space_* mutex]
Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
[EGC: fixed iothread lock for cpu-exec IRQ handling]
Signed-off-by: Emilio G. Cota <cota@braap.org>
[AJB: -smp single-threaded fix, clean commit msg, BQL fixes]
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Reviewed-by: Pranith Kumar <bobby.prani@gmail.com>
[PM: target-arm changes]
Acked-by: Peter Maydell <peter.maydell@linaro.org>
2017-02-23 18:29:11 +00:00
|
|
|
* must be at the end of the TB.
|
|
|
|
*
|
|
|
|
* Called by softmmu_template.h, with iothread mutex not held.
|
|
|
|
*/
|
2013-09-01 15:21:47 +00:00
|
|
|
void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
{
    TranslationBlock *tb;
    CPUClass *cc;
    uint32_t n;

    /* Look up the TB associated with the host return address. */
    tb = tcg_tb_lookup(retaddr);
    if (!tb) {
        cpu_abort(cpu, "cpu_io_recompile: could not find TB for pc=%p",
                  (void *)retaddr);
    }
    cpu_restore_state_from_tb(cpu, tb, retaddr);

    /*
     * Some guests must re-execute the branch when re-executing a delay
     * slot instruction.  When this is the case, adjust icount and N
     * to account for the re-execution of the branch.
     */
    n = 1;
    cc = CPU_GET_CLASS(cpu);
    /* The hook is optional: only call it when the target provides one. */
    if (cc->tcg_ops->io_recompile_replay_branch &&
        cc->tcg_ops->io_recompile_replay_branch(cpu, tb)) {
        cpu_neg(cpu)->icount_decr.u16.low++;
        n = 2;
    }

    /*
     * Exit the loop and potentially generate a new TB executing
     * just the I/O insns.  We also limit instrumentation to memory
     * operations only (which execute after completion) so we don't
     * double instrument the instruction.
     *
     * N is OR'ed into the low bits of the flags for the next TB.
     */
    cpu->cflags_next_tb = curr_cflags(cpu) | CF_MEMI_ONLY | CF_LAST_IO | n;

    /* Trace the rewind only when exec logging is on and pc is in range. */
    if (qemu_loglevel_mask(CPU_LOG_EXEC)) {
        target_ulong pc = log_pc(cpu, tb);
        if (qemu_log_in_addr_range(pc)) {
            qemu_log("cpu_io_recompile: rewound execution of TB to "
                     TARGET_FMT_lx "\n", pc);
        }
    }

    cpu_loop_exit_noexc(cpu);
}
|
|
|
|
|
2021-09-08 09:35:43 +00:00
|
|
|
/*
 * Append a three-line summary of the TB hash table statistics in @hst to
 * @buf: head-bucket usage, average chain occupancy and average chain length,
 * the latter two each followed by a textual histogram.
 *
 * Nothing is appended when @hst reports zero head buckets.
 *
 * NOTE(review): any column padding inside the "TB hash ..." labels may have
 * been collapsed in this copy of the file -- confirm against upstream.
 */
static void print_qht_statistics(struct qht_stats hst, GString *buf)
{
    uint32_t hgram_opts;
    size_t hgram_bins;
    char *hgram;

    /* No head buckets: skip the report (also avoids dividing by zero). */
    if (!hst.head_buckets) {
        return;
    }
    g_string_append_printf(buf, "TB hash buckets %zu/%zu "
                           "(%0.2f%% head buckets used)\n",
                           hst.used_head_buckets, hst.head_buckets,
                           (double)hst.used_head_buckets /
                           hst.head_buckets * 100);

    /* Occupancy histogram: always 10 bins, printed as percentages. */
    hgram_opts = QDIST_PR_BORDER | QDIST_PR_LABELS;
    hgram_opts |= QDIST_PR_100X | QDIST_PR_PERCENT;
    if (qdist_xmax(&hst.occupancy) - qdist_xmin(&hst.occupancy) == 1) {
        hgram_opts |= QDIST_PR_NODECIMAL;
    }
    hgram = qdist_pr(&hst.occupancy, 10, hgram_opts);
    g_string_append_printf(buf, "TB hash occupancy %0.2f%% avg chain occ. "
                           "Histogram: %s\n",
                           qdist_avg(&hst.occupancy) * 100, hgram);
    g_free(hgram);

    /*
     * Chain-length histogram: cap at 10 bins when the range is wider than
     * that; otherwise pass 0 bins and drop decimals and bin ranges.
     */
    hgram_opts = QDIST_PR_BORDER | QDIST_PR_LABELS;
    hgram_bins = qdist_xmax(&hst.chain) - qdist_xmin(&hst.chain);
    if (hgram_bins > 10) {
        hgram_bins = 10;
    } else {
        hgram_bins = 0;
        hgram_opts |= QDIST_PR_NODECIMAL | QDIST_PR_NOBINRANGE;
    }
    hgram = qdist_pr(&hst.chain, hgram_bins, hgram_opts);
    g_string_append_printf(buf, "TB hash avg chain %0.3f buckets. "
                           "Histogram: %s\n",
                           qdist_avg(&hst.chain), hgram);
    g_free(hgram);
}
|
|
|
|
|
translate-all: use a binary search tree to track TBs in TBContext
This is a prerequisite for supporting multiple TCG contexts, since
we will have threads generating code in separate regions of
code_gen_buffer.
For this we need a new field (.size) in struct tb_tc to keep
track of the size of the translated code. This field uses a size_t
to avoid adding a hole to the struct, although really an unsigned
int would have been enough.
The comparison function we use is optimized for the common case:
insertions. Profiling shows that upon booting debian-arm, 98%
of comparisons are between existing tb's (i.e. a->size and b->size
are both !0), which happens during insertions (and removals, but
those are rare). The remaining cases are lookups. From reading the glib
sources we see that the first key is always the lookup key. However,
the code does not assume this to always be the case because this
behaviour is not guaranteed in the glib docs. However, we embed
this knowledge in the code as a branch hint for the compiler.
Note that tb_free does not free space in the code_gen_buffer anymore,
since we cannot easily know whether the tb is the last one inserted
in code_gen_buffer. The next patch in this series renames tb_free
to tb_remove to reflect this.
Performance-wise, lookups in tb_find_pc are the same as before:
O(log n). However, insertions are O(log n) instead of O(1), which
results in a small slowdown when booting debian-arm:
Performance counter stats for 'build/arm-softmmu/qemu-system-arm \
-machine type=virt -nographic -smp 1 -m 4096 \
-netdev user,id=unet,hostfwd=tcp::2222-:22 \
-device virtio-net-device,netdev=unet \
-drive file=img/arm/jessie-arm32.qcow2,id=myblock,index=0,if=none \
-device virtio-blk-device,drive=myblock \
-kernel img/arm/aarch32-current-linux-kernel-only.img \
-append console=ttyAMA0 root=/dev/vda1 \
-name arm,debug-threads=on -smp 1' (10 runs):
- Before:
8048.598422 task-clock (msec) # 0.931 CPUs utilized ( +- 0.28% )
16,974 context-switches # 0.002 M/sec ( +- 0.12% )
0 cpu-migrations # 0.000 K/sec
10,125 page-faults # 0.001 M/sec ( +- 1.23% )
35,144,901,879 cycles # 4.367 GHz ( +- 0.14% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
65,758,252,643 instructions # 1.87 insns per cycle ( +- 0.33% )
10,871,298,668 branches # 1350.707 M/sec ( +- 0.41% )
192,322,212 branch-misses # 1.77% of all branches ( +- 0.32% )
8.640869419 seconds time elapsed ( +- 0.57% )
- After:
8146.242027 task-clock (msec) # 0.923 CPUs utilized ( +- 1.23% )
17,016 context-switches # 0.002 M/sec ( +- 0.40% )
0 cpu-migrations # 0.000 K/sec
18,769 page-faults # 0.002 M/sec ( +- 0.45% )
35,660,956,120 cycles # 4.378 GHz ( +- 1.22% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
65,095,366,607 instructions # 1.83 insns per cycle ( +- 1.73% )
10,803,480,261 branches # 1326.192 M/sec ( +- 1.95% )
195,601,289 branch-misses # 1.81% of all branches ( +- 0.39% )
8.828660235 seconds time elapsed ( +- 0.38% )
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-06-23 23:00:11 +00:00
|
|
|
/*
 * Aggregate counters over a set of translation blocks, filled in one TB at
 * a time by tb_tree_stats_iter().
 */
struct tb_tree_stats {
    size_t nb_tbs;            /* number of TBs visited */
    size_t host_size;         /* sum of tb->tc.size over all TBs */
    size_t target_size;       /* sum of tb->size over all TBs */
    size_t max_target_size;   /* largest tb->size seen */
    size_t direct_jmp_count;  /* TBs with a valid jmp_reset_offset[0] */
    size_t direct_jmp2_count; /* ... that also have a valid jmp_reset_offset[1] */
    size_t cross_page;        /* TBs for which tb_page_addr1() != -1 */
};
|
|
|
|
|
|
|
|
/*
 * Per-TB callback that folds one translation block into a tb_tree_stats
 * accumulator.
 *
 * @key:   unused
 * @value: the TranslationBlock to account for
 * @data:  the struct tb_tree_stats being accumulated
 *
 * Always returns false.  NOTE(review): the signature matches GLib's
 * GTraverseFunc, where a false return presumably means "keep traversing" --
 * confirm against the caller.
 */
static gboolean tb_tree_stats_iter(gpointer key, gpointer value, gpointer data)
{
    const TranslationBlock *tb = value;
    struct tb_tree_stats *tst = data;

    tst->nb_tbs++;
    tst->host_size += tb->tc.size;
    tst->target_size += tb->size;
    if (tb->size > tst->max_target_size) {
        tst->max_target_size = tb->size;
    }
    /* A second page address other than -1 is counted as a cross-page TB. */
    if (tb_page_addr1(tb) != -1) {
        tst->cross_page++;
    }
    /* The second jump slot is only counted when the first is valid too. */
    if (tb->jmp_reset_offset[0] != TB_JMP_RESET_OFFSET_INVALID) {
        tst->direct_jmp_count++;
        if (tb->jmp_reset_offset[1] != TB_JMP_RESET_OFFSET_INVALID) {
            tst->direct_jmp2_count++;
        }
    }
    return false;
}
|
|
|
|
|
2021-09-08 09:35:43 +00:00
|
|
|
void dump_exec_info(GString *buf)
|
2012-12-02 16:04:43 +00:00
|
|
|
{
|
translate-all: use a binary search tree to track TBs in TBContext
This is a prerequisite for supporting multiple TCG contexts, since
we will have threads generating code in separate regions of
code_gen_buffer.
For this we need a new field (.size) in struct tb_tc to keep
track of the size of the translated code. This field uses a size_t
to avoid adding a hole to the struct, although really an unsigned
int would have been enough.
The comparison function we use is optimized for the common case:
insertions. Profiling shows that upon booting debian-arm, 98%
of comparisons are between existing tb's (i.e. a->size and b->size
are both !0), which happens during insertions (and removals, but
those are rare). The remaining cases are lookups. From reading the glib
sources we see that the first key is always the lookup key. However,
the code does not assume this to always be the case because this
behaviour is not guaranteed in the glib docs. However, we embed
this knowledge in the code as a branch hint for the compiler.
Note that tb_free does not free space in the code_gen_buffer anymore,
since we cannot easily know whether the tb is the last one inserted
in code_gen_buffer. The next patch in this series renames tb_free
to tb_remove to reflect this.
Performance-wise, lookups in tb_find_pc are the same as before:
O(log n). However, insertions are O(log n) instead of O(1), which
results in a small slowdown when booting debian-arm:
Performance counter stats for 'build/arm-softmmu/qemu-system-arm \
-machine type=virt -nographic -smp 1 -m 4096 \
-netdev user,id=unet,hostfwd=tcp::2222-:22 \
-device virtio-net-device,netdev=unet \
-drive file=img/arm/jessie-arm32.qcow2,id=myblock,index=0,if=none \
-device virtio-blk-device,drive=myblock \
-kernel img/arm/aarch32-current-linux-kernel-only.img \
-append console=ttyAMA0 root=/dev/vda1 \
-name arm,debug-threads=on -smp 1' (10 runs):
- Before:
8048.598422 task-clock (msec) # 0.931 CPUs utilized ( +- 0.28% )
16,974 context-switches # 0.002 M/sec ( +- 0.12% )
0 cpu-migrations # 0.000 K/sec
10,125 page-faults # 0.001 M/sec ( +- 1.23% )
35,144,901,879 cycles # 4.367 GHz ( +- 0.14% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
65,758,252,643 instructions # 1.87 insns per cycle ( +- 0.33% )
10,871,298,668 branches # 1350.707 M/sec ( +- 0.41% )
192,322,212 branch-misses # 1.77% of all branches ( +- 0.32% )
8.640869419 seconds time elapsed ( +- 0.57% )
- After:
8146.242027 task-clock (msec) # 0.923 CPUs utilized ( +- 1.23% )
17,016 context-switches # 0.002 M/sec ( +- 0.40% )
0 cpu-migrations # 0.000 K/sec
18,769 page-faults # 0.002 M/sec ( +- 0.45% )
35,660,956,120 cycles # 4.378 GHz ( +- 1.22% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
65,095,366,607 instructions # 1.83 insns per cycle ( +- 1.73% )
10,803,480,261 branches # 1326.192 M/sec ( +- 1.95% )
195,601,289 branch-misses # 1.81% of all branches ( +- 0.39% )
8.828660235 seconds time elapsed ( +- 0.38% )
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-06-23 23:00:11 +00:00
|
|
|
struct tb_tree_stats tst = {};
|
translate-all: add tb hash bucket info to 'info jit' dump
Examples:
- Good hashing, i.e. tb_hash_func5(phys_pc, pc, flags):
TB count 715135/2684354
[...]
TB hash buckets 388775/524288 (74.15% head buckets used)
TB hash occupancy 33.04% avg chain occ. Histogram: [0,10)%|▆ █ ▅▁▃▁▁|[90,100]%
TB hash avg chain 1.017 buckets. Histogram: 1|█▁▁|3
- Not-so-good hashing, i.e. tb_hash_func5(phys_pc, pc, 0):
TB count 712636/2684354
[...]
TB hash buckets 344924/524288 (65.79% head buckets used)
TB hash occupancy 31.64% avg chain occ. Histogram: [0,10)%|█ ▆ ▅▁▃▁▂|[90,100]%
TB hash avg chain 1.047 buckets. Histogram: 1|█▁▁▁|4
- Bad hashing, i.e. tb_hash_func5(phys_pc, 0, 0):
TB count 702818/2684354
[...]
TB hash buckets 112741/524288 (21.50% head buckets used)
TB hash occupancy 10.15% avg chain occ. Histogram: [0,10)%|█ ▁ ▁▁▁▁▁|[90,100]%
TB hash avg chain 2.107 buckets. Histogram: [1.0,10.2)|█▁▁▁▁▁▁▁▁▁|[83.8,93.0]
- Good hashing, but no auto-resize:
TB count 715634/2684354
TB hash buckets 8192/8192 (100.00% head buckets used)
TB hash occupancy 98.30% avg chain occ. Histogram: [95.3,95.8)%|▁▁▃▄▃▄▁▇▁█|[99.5,100.0]%
TB hash avg chain 22.070 buckets. Histogram: [15.0,16.7)|▁▂▅▄█▅▁▁▁▁|[30.3,32.0]
Acked-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Suggested-by: Richard Henderson <rth@twiddle.net>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1465412133-3029-16-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-06-08 18:55:33 +00:00
|
|
|
struct qht_stats hst;
|
2018-10-19 21:36:43 +00:00
|
|
|
size_t nb_tbs, flush_full, flush_part, flush_elide;
|
2012-12-02 16:04:43 +00:00
|
|
|
|
tcg: track TBs with per-region BST's
This paves the way for enabling scalable parallel generation of TCG code.
Instead of tracking TBs with a single binary search tree (BST), use a
BST for each TCG region, protecting it with a lock. This is as scalable
as it gets, since each TCG thread operates on a separate region.
The core of this change is the introduction of struct tcg_region_tree,
which contains a pointer to a GTree and an associated lock to serialize
accesses to it. We then allocate an array of tcg_region_tree's, adding
the appropriate padding to avoid false sharing based on
qemu_dcache_linesize.
Given a tc_ptr, we first find the corresponding region_tree. This
is done by special-casing the first and last regions first, since they
might be of size != region.size; otherwise we just divide the offset
by region.stride. I was worried about this division (several dozen
cycles of latency), but profiling shows that this is not a fast path.
Note that region.stride is not required to be a power of two; it
is only required to be a multiple of the host's page size.
Note that with this design we can also provide consistent snapshots
about all region trees at once; for instance, tcg_tb_foreach
acquires/releases all region_tree locks before/after iterating over them.
For this reason we now drop tb_lock in dump_exec_info().
As an alternative I considered implementing a concurrent BST, but this
can be tricky to get right, offers no consistent snapshots of the BST,
and performance and scalability-wise I don't think it could ever beat
having separate GTrees, given that our workload is insert-mostly (all
concurrent BST designs I've seen focus, understandably, on making
lookups fast, which comes at the expense of convoluted, non-wait-free
insertions/removals).
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-07-26 20:58:05 +00:00
|
|
|
tcg_tb_foreach(tb_tree_stats_iter, &tst);
|
|
|
|
nb_tbs = tst.nb_tbs;
|
2012-12-02 16:04:43 +00:00
|
|
|
/* XXX: avoid using doubles ? */
|
2021-09-08 09:35:43 +00:00
|
|
|
g_string_append_printf(buf, "Translation buffer state:\n");
|
2017-06-24 00:57:44 +00:00
|
|
|
/*
|
|
|
|
* Report total code size including the padding and TB structs;
|
2020-12-10 15:58:05 +00:00
|
|
|
* otherwise users might think "-accel tcg,tb-size" is not honoured.
|
2017-06-24 00:57:44 +00:00
|
|
|
* For avg host size we use the precise numbers from tb_tree_stats though.
|
|
|
|
*/
|
2021-09-08 09:35:43 +00:00
|
|
|
g_string_append_printf(buf, "gen code size %zu/%zu\n",
|
|
|
|
tcg_code_size(), tcg_code_capacity());
|
|
|
|
g_string_append_printf(buf, "TB count %zu\n", nb_tbs);
|
|
|
|
g_string_append_printf(buf, "TB avg target size %zu max=%zu bytes\n",
|
|
|
|
nb_tbs ? tst.target_size / nb_tbs : 0,
|
|
|
|
tst.max_target_size);
|
|
|
|
g_string_append_printf(buf, "TB avg host size %zu bytes "
|
|
|
|
"(expansion ratio: %0.1f)\n",
|
|
|
|
nb_tbs ? tst.host_size / nb_tbs : 0,
|
|
|
|
tst.target_size ?
|
|
|
|
(double)tst.host_size / tst.target_size : 0);
|
|
|
|
g_string_append_printf(buf, "cross page TB count %zu (%zu%%)\n",
|
|
|
|
tst.cross_page,
|
|
|
|
nb_tbs ? (tst.cross_page * 100) / nb_tbs : 0);
|
|
|
|
g_string_append_printf(buf, "direct jump count %zu (%zu%%) "
|
|
|
|
"(2 jumps=%zu %zu%%)\n",
|
|
|
|
tst.direct_jmp_count,
|
|
|
|
nb_tbs ? (tst.direct_jmp_count * 100) / nb_tbs : 0,
|
|
|
|
tst.direct_jmp2_count,
|
|
|
|
nb_tbs ? (tst.direct_jmp2_count * 100) / nb_tbs : 0);
|
translate-all: add tb hash bucket info to 'info jit' dump
Examples:
- Good hashing, i.e. tb_hash_func5(phys_pc, pc, flags):
TB count 715135/2684354
[...]
TB hash buckets 388775/524288 (74.15% head buckets used)
TB hash occupancy 33.04% avg chain occ. Histogram: [0,10)%|▆ █ ▅▁▃▁▁|[90,100]%
TB hash avg chain 1.017 buckets. Histogram: 1|█▁▁|3
- Not-so-good hashing, i.e. tb_hash_func5(phys_pc, pc, 0):
TB count 712636/2684354
[...]
TB hash buckets 344924/524288 (65.79% head buckets used)
TB hash occupancy 31.64% avg chain occ. Histogram: [0,10)%|█ ▆ ▅▁▃▁▂|[90,100]%
TB hash avg chain 1.047 buckets. Histogram: 1|█▁▁▁|4
- Bad hashing, i.e. tb_hash_func5(phys_pc, 0, 0):
TB count 702818/2684354
[...]
TB hash buckets 112741/524288 (21.50% head buckets used)
TB hash occupancy 10.15% avg chain occ. Histogram: [0,10)%|█ ▁ ▁▁▁▁▁|[90,100]%
TB hash avg chain 2.107 buckets. Histogram: [1.0,10.2)|█▁▁▁▁▁▁▁▁▁|[83.8,93.0]
- Good hashing, but no auto-resize:
TB count 715634/2684354
TB hash buckets 8192/8192 (100.00% head buckets used)
TB hash occupancy 98.30% avg chain occ. Histogram: [95.3,95.8)%|▁▁▃▄▃▄▁▇▁█|[99.5,100.0]%
TB hash avg chain 22.070 buckets. Histogram: [15.0,16.7)|▁▂▅▄█▅▁▁▁▁|[30.3,32.0]
Acked-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Suggested-by: Richard Henderson <rth@twiddle.net>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1465412133-3029-16-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-06-08 18:55:33 +00:00
|
|
|
|
2017-06-24 00:04:43 +00:00
|
|
|
qht_statistics_init(&tb_ctx.htable, &hst);
|
2021-09-08 09:35:43 +00:00
|
|
|
print_qht_statistics(hst, buf);
|
translate-all: add tb hash bucket info to 'info jit' dump
Examples:
- Good hashing, i.e. tb_hash_func5(phys_pc, pc, flags):
TB count 715135/2684354
[...]
TB hash buckets 388775/524288 (74.15% head buckets used)
TB hash occupancy 33.04% avg chain occ. Histogram: [0,10)%|▆ █ ▅▁▃▁▁|[90,100]%
TB hash avg chain 1.017 buckets. Histogram: 1|█▁▁|3
- Not-so-good hashing, i.e. tb_hash_func5(phys_pc, pc, 0):
TB count 712636/2684354
[...]
TB hash buckets 344924/524288 (65.79% head buckets used)
TB hash occupancy 31.64% avg chain occ. Histogram: [0,10)%|█ ▆ ▅▁▃▁▂|[90,100]%
TB hash avg chain 1.047 buckets. Histogram: 1|█▁▁▁|4
- Bad hashing, i.e. tb_hash_func5(phys_pc, 0, 0):
TB count 702818/2684354
[...]
TB hash buckets 112741/524288 (21.50% head buckets used)
TB hash occupancy 10.15% avg chain occ. Histogram: [0,10)%|█ ▁ ▁▁▁▁▁|[90,100]%
TB hash avg chain 2.107 buckets. Histogram: [1.0,10.2)|█▁▁▁▁▁▁▁▁▁|[83.8,93.0]
- Good hashing, but no auto-resize:
TB count 715634/2684354
TB hash buckets 8192/8192 (100.00% head buckets used)
TB hash occupancy 98.30% avg chain occ. Histogram: [95.3,95.8)%|▁▁▃▄▃▄▁▇▁█|[99.5,100.0]%
TB hash avg chain 22.070 buckets. Histogram: [15.0,16.7)|▁▂▅▄█▅▁▁▁▁|[30.3,32.0]
Acked-by: Sergey Fedorov <sergey.fedorov@linaro.org>
Suggested-by: Richard Henderson <rth@twiddle.net>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1465412133-3029-16-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-06-08 18:55:33 +00:00
|
|
|
qht_statistics_destroy(&hst);
|
|
|
|
|
2021-09-08 09:35:43 +00:00
|
|
|
g_string_append_printf(buf, "\nStatistics:\n");
|
|
|
|
g_string_append_printf(buf, "TB flush count %u\n",
|
|
|
|
qatomic_read(&tb_ctx.tb_flush_count));
|
|
|
|
g_string_append_printf(buf, "TB invalidate count %u\n",
|
|
|
|
qatomic_read(&tb_ctx.tb_phys_invalidate_count));
|
2018-10-19 21:36:43 +00:00
|
|
|
|
|
|
|
tlb_flush_counts(&flush_full, &flush_part, &flush_elide);
|
2021-09-08 09:35:43 +00:00
|
|
|
g_string_append_printf(buf, "TLB full flushes %zu\n", flush_full);
|
|
|
|
g_string_append_printf(buf, "TLB partial flushes %zu\n", flush_part);
|
|
|
|
g_string_append_printf(buf, "TLB elided flushes %zu\n", flush_elide);
|
|
|
|
tcg_dump_info(buf);
|
2012-12-02 16:04:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#else /* CONFIG_USER_ONLY */
|
|
|
|
|
2013-01-18 14:03:43 +00:00
|
|
|
/*
 * Record a pending interrupt on @cpu (CONFIG_USER_ONLY variant).
 *
 * @cpu:  CPU that should see the interrupt.
 * @mask: interrupt bits to OR into cpu->interrupt_request.
 *
 * The iothread mutex must be held by the caller; this is asserted.
 * After updating interrupt_request, the high half of the CPU's
 * icount_decr is set to -1 with an atomic store.
 * NOTE(review): presumably that store is what makes executing translated
 * code notice the request and leave its TB -- confirm against the TB
 * entry check.
 */
void cpu_interrupt(CPUState *cpu, int mask)
{
    g_assert(qemu_mutex_iothread_locked());

    cpu->interrupt_request = cpu->interrupt_request | mask;
    qatomic_set(&cpu_neg(cpu)->icount_decr.u16.high, -1);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Walks guest process memory "regions" one by one
|
|
|
|
* and calls callback function 'fn' for each region.
|
|
|
|
*/
|
|
|
|
struct walk_memory_regions_data {
|
|
|
|
walk_memory_regions_fn fn;
|
|
|
|
void *priv;
|
2014-09-08 13:28:56 +00:00
|
|
|
target_ulong start;
|
2012-12-02 16:04:43 +00:00
|
|
|
int prot;
|
|
|
|
};
|
|
|
|
|
|
|
|
static int walk_memory_regions_end(struct walk_memory_regions_data *data,
|
2014-09-08 13:28:56 +00:00
|
|
|
target_ulong end, int new_prot)
|
2012-12-02 16:04:43 +00:00
|
|
|
{
|
2014-09-08 13:28:56 +00:00
|
|
|
if (data->start != -1u) {
|
2012-12-02 16:04:43 +00:00
|
|
|
int rc = data->fn(data->priv, data->start, end, data->prot);
|
|
|
|
if (rc != 0) {
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-09-08 13:28:56 +00:00
|
|
|
data->start = (new_prot ? end : -1u);
|
2012-12-02 16:04:43 +00:00
|
|
|
data->prot = new_prot;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int walk_memory_regions_1(struct walk_memory_regions_data *data,
|
2014-09-08 13:28:56 +00:00
|
|
|
target_ulong base, int level, void **lp)
|
2012-12-02 16:04:43 +00:00
|
|
|
{
|
2014-09-08 13:28:56 +00:00
|
|
|
target_ulong pa;
|
2012-12-02 16:04:43 +00:00
|
|
|
int i, rc;
|
|
|
|
|
|
|
|
if (*lp == NULL) {
|
|
|
|
return walk_memory_regions_end(data, base, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (level == 0) {
|
|
|
|
PageDesc *pd = *lp;
|
|
|
|
|
2013-11-07 16:14:36 +00:00
|
|
|
for (i = 0; i < V_L2_SIZE; ++i) {
|
2012-12-02 16:04:43 +00:00
|
|
|
int prot = pd[i].flags;
|
|
|
|
|
|
|
|
pa = base | (i << TARGET_PAGE_BITS);
|
|
|
|
if (prot != data->prot) {
|
|
|
|
rc = walk_memory_regions_end(data, pa, prot);
|
|
|
|
if (rc != 0) {
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
void **pp = *lp;
|
|
|
|
|
2013-11-07 16:14:36 +00:00
|
|
|
for (i = 0; i < V_L2_SIZE; ++i) {
|
2014-09-08 13:28:56 +00:00
|
|
|
pa = base | ((target_ulong)i <<
|
2013-11-07 16:14:36 +00:00
|
|
|
(TARGET_PAGE_BITS + V_L2_BITS * level));
|
2012-12-02 16:04:43 +00:00
|
|
|
rc = walk_memory_regions_1(data, pa, level - 1, pp + i);
|
|
|
|
if (rc != 0) {
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Walk every top-level slot of l1_map and call @fn once per region, where
 * a region is a maximal run of pages sharing the same non-zero flags.
 *
 * @priv: opaque pointer handed to @fn on every call.
 * @fn:   callback receiving (priv, start, end, prot) for each region.
 *
 * The walk stops at, and returns, the first non-zero value returned by
 * @fn; otherwise 0 is returned.
 */
int walk_memory_regions(void *priv, walk_memory_regions_fn fn)
{
    struct walk_memory_regions_data data = {
        .fn = fn,
        .priv = priv,
        .start = -1u,           /* no region open yet */
        .prot = 0,
    };
    const uintptr_t l1_sz = v_l1_size;
    uintptr_t slot;

    for (slot = 0; slot < l1_sz; slot++) {
        target_ulong base = slot << (v_l1_shift + TARGET_PAGE_BITS);
        int rc = walk_memory_regions_1(&data, base, v_l2_levels,
                                       l1_map + slot);

        if (rc != 0) {
            return rc;
        }
    }

    /* Flush a region that is still open after the last slot (end = 0). */
    return walk_memory_regions_end(&data, 0, 0);
}
|
|
|
|
|
2014-09-08 13:28:56 +00:00
|
|
|
/*
 * walk_memory_regions() callback used by page_dump(): print one region.
 *
 * @priv:  the FILE * to write to.
 * @start: first address of the region.
 * @end:   address at which the region ends.
 * @prot:  page flags of the region; only PAGE_READ, PAGE_WRITE and
 *         PAGE_EXEC are shown, as "rwx" with '-' for a missing bit.
 *
 * Always returns 0, so the walk is never aborted from here.
 */
static int dump_region(void *priv, target_ulong start,
                       target_ulong end, unsigned long prot)
{
    FILE *out = priv;
    const char r = (prot & PAGE_READ) ? 'r' : '-';
    const char w = (prot & PAGE_WRITE) ? 'w' : '-';
    const char x = (prot & PAGE_EXEC) ? 'x' : '-';

    (void) fprintf(out, TARGET_FMT_lx"-"TARGET_FMT_lx
                   " "TARGET_FMT_lx" %c%c%c\n",
                   start, end, end - start, r, w, x);

    return 0;
}
|
|
|
|
|
|
|
|
/* dump memory mappings */
|
|
|
|
void page_dump(FILE *f)
|
|
|
|
{
|
2014-09-08 13:28:56 +00:00
|
|
|
const int length = sizeof(target_ulong) * 2;
|
2013-09-12 18:09:06 +00:00
|
|
|
(void) fprintf(f, "%-*s %-*s %-*s %s\n",
|
|
|
|
length, "start", length, "end", length, "size", "prot");
|
2012-12-02 16:04:43 +00:00
|
|
|
walk_memory_regions(f, dump_region);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Return the flags stored for the page containing @address, or 0 when
 * page_find() has no descriptor for that page.
 */
int page_get_flags(target_ulong address)
{
    PageDesc *desc = page_find(address >> TARGET_PAGE_BITS);

    return desc ? desc->flags : 0;
}
|
|
|
|
|
2022-07-11 03:14:20 +00:00
|
|
|
/*
 * PAGE_TARGET_STICKY lets a target choose which of its private page flag
 * bits (PAGE_TARGET_[12]) are kept rather than reset.  Unless the target
 * has already defined it, it defaults to 0, i.e. none of them are kept.
 *
 * PAGE_STICKY is the full set of bits that page_set_flags() carries over
 * from the old flags when the page is not being reset.
 */
#ifndef PAGE_TARGET_STICKY
#define PAGE_TARGET_STICKY  0
#endif
#define PAGE_STICKY  (PAGE_ANON | PAGE_PASSTHROUGH | PAGE_TARGET_STICKY)
|
2022-07-11 03:14:20 +00:00
|
|
|
|
2012-12-02 16:04:43 +00:00
|
|
|
/* Modify the flags of a page and invalidate the code if necessary.
|
|
|
|
The flag PAGE_WRITE_ORG is positioned automatically depending
|
|
|
|
on PAGE_WRITE. The mmap_lock should already be held. */
|
|
|
|
void page_set_flags(target_ulong start, target_ulong end, int flags)
|
|
|
|
{
|
|
|
|
target_ulong addr, len;
|
2022-10-05 19:56:46 +00:00
|
|
|
bool reset, inval_tb = false;
|
2012-12-02 16:04:43 +00:00
|
|
|
|
|
|
|
/* This function should never be called with addresses outside the
|
|
|
|
guest address space. If this assert fires, it probably indicates
|
|
|
|
a missing call to h2g_valid. */
|
2020-05-13 17:51:30 +00:00
|
|
|
assert(end - 1 <= GUEST_ADDR_MAX);
|
2012-12-02 16:04:43 +00:00
|
|
|
assert(start < end);
|
2021-04-06 17:40:20 +00:00
|
|
|
/* Only set PAGE_ANON with new mappings. */
|
|
|
|
assert(!(flags & PAGE_ANON) || (flags & PAGE_RESET));
|
2016-10-27 15:10:05 +00:00
|
|
|
assert_memory_lock();
|
2012-12-02 16:04:43 +00:00
|
|
|
|
|
|
|
start = start & TARGET_PAGE_MASK;
|
|
|
|
end = TARGET_PAGE_ALIGN(end);
|
|
|
|
|
|
|
|
if (flags & PAGE_WRITE) {
|
|
|
|
flags |= PAGE_WRITE_ORG;
|
|
|
|
}
|
2022-10-05 16:44:52 +00:00
|
|
|
reset = !(flags & PAGE_VALID) || (flags & PAGE_RESET);
|
2022-10-05 19:56:14 +00:00
|
|
|
if (reset) {
|
|
|
|
page_reset_target_data(start, end);
|
|
|
|
}
|
2021-02-12 18:48:32 +00:00
|
|
|
flags &= ~PAGE_RESET;
|
2012-12-02 16:04:43 +00:00
|
|
|
|
|
|
|
for (addr = start, len = end - start;
|
|
|
|
len != 0;
|
|
|
|
len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) {
|
2022-08-10 22:27:46 +00:00
|
|
|
PageDesc *p = page_find_alloc(addr >> TARGET_PAGE_BITS, true);
|
2012-12-02 16:04:43 +00:00
|
|
|
|
2022-10-05 16:44:52 +00:00
|
|
|
/*
|
|
|
|
* If the page was executable, but is reset, or is no longer
|
|
|
|
* executable, or has become writable, then invalidate any code.
|
|
|
|
*/
|
|
|
|
if ((p->flags & PAGE_EXEC)
|
|
|
|
&& (reset ||
|
|
|
|
!(flags & PAGE_EXEC) ||
|
|
|
|
(flags & ~p->flags & PAGE_WRITE))) {
|
2022-10-05 19:56:46 +00:00
|
|
|
inval_tb = true;
|
2012-12-02 16:04:43 +00:00
|
|
|
}
|
2022-10-05 19:56:14 +00:00
|
|
|
/* Using mprotect on a page does not change sticky bits. */
|
|
|
|
p->flags = (reset ? 0 : p->flags & PAGE_STICKY) | flags;
|
2012-12-02 16:04:43 +00:00
|
|
|
}
|
2022-10-05 19:56:46 +00:00
|
|
|
|
|
|
|
if (inval_tb) {
|
|
|
|
tb_invalidate_phys_range(start, end);
|
|
|
|
}
|
2012-12-02 16:04:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Check that every page overlapping [@start, @start + @len) is valid and
 * permits the access described by @flags.
 *
 * @start: first guest address of the range.
 * @len:   length of the range in bytes; 0 trivially succeeds.
 * @flags: PAGE_READ and/or PAGE_WRITE are the bits examined here.
 *
 * For PAGE_WRITE, a page that has PAGE_WRITE_ORG but currently lacks
 * PAGE_WRITE is unprotected through page_unprotect() before continuing.
 *
 * Returns 0 when the whole range passes, -1 otherwise (including when the
 * range wraps around the address space).
 */
int page_check_range(target_ulong start, target_ulong len, int flags)
{
    target_ulong addr, end;

    /*
     * This function should never be called with addresses outside the
     * guest address space.  If this assert fires, it probably indicates
     * a missing call to h2g_valid.
     */
    if (TARGET_ABI_BITS > L1_MAP_ADDR_SPACE_BITS) {
        assert(start < ((target_ulong)1 << L1_MAP_ADDR_SPACE_BITS));
    }

    if (len == 0) {
        return 0;
    }
    if (start + len - 1 < start) {
        /* We've wrapped around. */
        return -1;
    }

    /* Compute the end first, before we lose bits in the next step. */
    end = TARGET_PAGE_ALIGN(start + len);
    start = start & TARGET_PAGE_MASK;

    /* Iterate by remaining length, not by comparing addr against end. */
    addr = start;
    len = end - start;
    while (len != 0) {
        PageDesc *p = page_find(addr >> TARGET_PAGE_BITS);

        if (!p || !(p->flags & PAGE_VALID)) {
            return -1;
        }
        if ((flags & PAGE_READ) && !(p->flags & PAGE_READ)) {
            return -1;
        }
        if (flags & PAGE_WRITE) {
            if (!(p->flags & PAGE_WRITE_ORG)) {
                return -1;
            }
            /*
             * Unprotect the page if it was put read-only because it
             * contains translated code.
             */
            if (!(p->flags & PAGE_WRITE) && !page_unprotect(addr, 0)) {
                return -1;
            }
        }

        len -= TARGET_PAGE_SIZE;
        addr += TARGET_PAGE_SIZE;
    }

    return 0;
}
|
|
|
|
|
2021-08-05 20:48:35 +00:00
|
|
|
/*
 * Write-protect the host page containing @page_addr.
 *
 * Nothing happens unless the target page at @page_addr has a descriptor
 * with PAGE_WRITE set.  Otherwise PAGE_WRITE is cleared on every target
 * page that has a descriptor inside the surrounding host page, and the
 * host page is mprotect()ed with the union of their previous protection
 * bits minus PAGE_WRITE.  The mprotect() result is not checked.
 */
void page_protect(tb_page_addr_t page_addr)
{
    target_ulong addr;
    PageDesc *p;
    int prot = 0;

    p = page_find(page_addr >> TARGET_PAGE_BITS);
    if (!p || !(p->flags & PAGE_WRITE)) {
        return;
    }

    /*
     * Force the host page as non writable (writes will have a page fault +
     * mprotect overhead).
     */
    page_addr &= qemu_host_page_mask;

    for (addr = page_addr; addr < page_addr + qemu_host_page_size;
         addr += TARGET_PAGE_SIZE) {
        p = page_find(addr >> TARGET_PAGE_BITS);
        if (p) {
            /* Collect the old bits before dropping write permission. */
            prot |= p->flags;
            p->flags &= ~PAGE_WRITE;
        }
    }

    mprotect(g2h_untagged(page_addr), qemu_host_page_size,
             (prot & PAGE_BITS) & ~PAGE_WRITE);
}
|
|
|
|
|
2012-12-02 16:04:43 +00:00
|
|
|
/* called from signal handler: invalidate the code and unprotect the
|
2016-05-17 14:18:03 +00:00
|
|
|
* page. Return 0 if the fault was not handled, 1 if it was handled,
|
|
|
|
* and 2 if it was handled but the caller must cause the TB to be
|
|
|
|
* immediately exited. (We can only return 2 if the 'pc' argument is
|
|
|
|
* non-zero.)
|
|
|
|
*/
|
|
|
|
int page_unprotect(target_ulong address, uintptr_t pc)
|
2012-12-02 16:04:43 +00:00
|
|
|
{
|
|
|
|
unsigned int prot;
|
2016-07-07 08:33:12 +00:00
|
|
|
bool current_tb_invalidated;
|
2012-12-02 16:04:43 +00:00
|
|
|
PageDesc *p;
|
|
|
|
target_ulong host_start, host_end, addr;
|
|
|
|
|
|
|
|
/* Technically this isn't safe inside a signal handler. However we
|
|
|
|
know this only ever happens in a synchronous SEGV handler, so in
|
|
|
|
practice it seems to be ok. */
|
|
|
|
mmap_lock();
|
|
|
|
|
|
|
|
p = page_find(address >> TARGET_PAGE_BITS);
|
|
|
|
if (!p) {
|
|
|
|
mmap_unlock();
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* if the page was really writable, then we change its
|
|
|
|
protection back to writable */
|
2017-11-28 14:35:25 +00:00
|
|
|
if (p->flags & PAGE_WRITE_ORG) {
|
2016-07-07 08:33:12 +00:00
|
|
|
current_tb_invalidated = false;
|
2017-11-28 14:35:25 +00:00
|
|
|
if (p->flags & PAGE_WRITE) {
|
|
|
|
/* If the page is actually marked WRITE then assume this is because
|
|
|
|
* this thread raced with another one which got here first and
|
|
|
|
* set the page to PAGE_WRITE and did the TB invalidate for us.
|
|
|
|
*/
|
|
|
|
#ifdef TARGET_HAS_PRECISE_SMC
|
tcg: track TBs with per-region BST's
This paves the way for enabling scalable parallel generation of TCG code.
Instead of tracking TBs with a single binary search tree (BST), use a
BST for each TCG region, protecting it with a lock. This is as scalable
as it gets, since each TCG thread operates on a separate region.
The core of this change is the introduction of struct tcg_region_tree,
which contains a pointer to a GTree and an associated lock to serialize
accesses to it. We then allocate an array of tcg_region_tree's, adding
the appropriate padding to avoid false sharing based on
qemu_dcache_linesize.
Given a tc_ptr, we first find the corresponding region_tree. This
is done by special-casing the first and last regions first, since they
might be of size != region.size; otherwise we just divide the offset
by region.stride. I was worried about this division (several dozen
cycles of latency), but profiling shows that this is not a fast path.
Note that region.stride is not required to be a power of two; it
is only required to be a multiple of the host's page size.
Note that with this design we can also provide consistent snapshots
about all region trees at once; for instance, tcg_tb_foreach
acquires/releases all region_tree locks before/after iterating over them.
For this reason we now drop tb_lock in dump_exec_info().
As an alternative I considered implementing a concurrent BST, but this
can be tricky to get right, offers no consistent snapshots of the BST,
and performance and scalability-wise I don't think it could ever beat
having separate GTrees, given that our workload is insert-mostly (all
concurrent BST designs I've seen focus, understandably, on making
lookups fast, which comes at the expense of convoluted, non-wait-free
insertions/removals).
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-07-26 20:58:05 +00:00
|
|
|
TranslationBlock *current_tb = tcg_tb_lookup(pc);
|
2017-11-28 14:35:25 +00:00
|
|
|
if (current_tb) {
|
|
|
|
current_tb_invalidated = tb_cflags(current_tb) & CF_INVALID;
|
2017-07-12 19:31:57 +00:00
|
|
|
}
|
2012-12-02 16:04:43 +00:00
|
|
|
#endif
|
2017-11-28 14:35:25 +00:00
|
|
|
} else {
|
|
|
|
host_start = address & qemu_host_page_mask;
|
|
|
|
host_end = host_start + qemu_host_page_size;
|
|
|
|
|
|
|
|
prot = 0;
|
|
|
|
for (addr = host_start; addr < host_end; addr += TARGET_PAGE_SIZE) {
|
|
|
|
p = page_find(addr >> TARGET_PAGE_BITS);
|
|
|
|
p->flags |= PAGE_WRITE;
|
|
|
|
prot |= p->flags;
|
|
|
|
|
|
|
|
/* and since the content will be modified, we must invalidate
|
|
|
|
the corresponding translated code. */
|
2022-10-05 16:18:39 +00:00
|
|
|
current_tb_invalidated |=
|
|
|
|
tb_invalidate_phys_page_unwind(addr, pc);
|
2017-11-28 14:35:25 +00:00
|
|
|
}
|
2021-02-12 18:48:43 +00:00
|
|
|
mprotect((void *)g2h_untagged(host_start), qemu_host_page_size,
|
2017-11-28 14:35:25 +00:00
|
|
|
prot & PAGE_BITS);
|
2012-12-02 16:04:43 +00:00
|
|
|
}
|
|
|
|
mmap_unlock();
|
2016-07-07 08:33:12 +00:00
|
|
|
/* If current TB was invalidated return to main loop */
|
|
|
|
return current_tb_invalidated ? 2 : 1;
|
2012-12-02 16:04:43 +00:00
|
|
|
}
|
|
|
|
mmap_unlock();
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_USER_ONLY */
|
2017-06-26 05:22:55 +00:00
|
|
|
|
2022-08-15 20:13:05 +00:00
|
|
|
/*
|
|
|
|
* Called by generic code at e.g. cpu reset after cpu creation,
|
|
|
|
* therefore we must be prepared to allocate the jump cache.
|
|
|
|
*/
|
|
|
|
void tcg_flush_jmp_cache(CPUState *cpu)
|
|
|
|
{
|
|
|
|
CPUJumpCache *jc = cpu->tb_jmp_cache;
|
|
|
|
|
2022-10-31 02:26:36 +00:00
|
|
|
/* During early initialization, the cache may not yet be allocated. */
|
|
|
|
if (unlikely(jc == NULL)) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (int i = 0; i < TB_JMP_CACHE_SIZE; i++) {
|
|
|
|
qatomic_set(&jc->array[i].tb, NULL);
|
2022-08-15 20:13:05 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-06-26 05:22:55 +00:00
|
|
|
/* This is a wrapper for common code that can not use CONFIG_SOFTMMU */
|
|
|
|
void tcg_flush_softmmu_tlb(CPUState *cs)
|
|
|
|
{
|
|
|
|
#ifdef CONFIG_SOFTMMU
|
|
|
|
tlb_flush(cs);
|
|
|
|
#endif
|
|
|
|
}
|