From 00c8a933d95add3ce4afebbe491ca0fa398a9007 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 14 Aug 2024 03:54:23 -0400 Subject: [PATCH 01/26] target/i386: Don't construct an all-zero entry for CPUID[0xD 0x3f] Currently, QEMU always constructs an all-zero CPUID entry for CPUID[0xD 0x3f]. It's meaningless to construct such a leaf as the end of leaf 0xD. Rework the logic of how subleaves of 0xD are constructed to get rid of such an all-zero value for subleaf 0x3f. Signed-off-by: Xiaoyao Li Link: https://lore.kernel.org/r/20240814075431.339209-2-xiaoyao.li@intel.com Signed-off-by: Paolo Bonzini --- target/i386/kvm/kvm.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index e6f94900f3..6f6301460d 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -1864,10 +1864,6 @@ static uint32_t kvm_x86_build_cpuid(CPUX86State *env, case 0xb: case 0xd: for (j = 0; ; j++) { - if (i == 0xd && j == 64) { - break; - } - c->function = i; c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX; c->index = j; @@ -1883,7 +1879,12 @@ static uint32_t kvm_x86_build_cpuid(CPUX86State *env, break; } if (i == 0xd && c->eax == 0) { - continue; + if (j < 63) { + continue; + } else { + cpuid_i--; + break; + } } if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { goto full; From 7dddc3bb875e7141ab25931d0f30a1c319bc8457 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 14 Aug 2024 03:54:24 -0400 Subject: [PATCH 02/26] target/i386: Enable fdp-excptn-only and zero-fcs-fds - CPUID.(EAX=07H,ECX=0H):EBX[bit 6]: x87 FPU Data Pointer updated only on x87 exceptions if 1. - CPUID.(EAX=07H,ECX=0H):EBX[bit 13]: Deprecates FPU CS and FPU DS values if 1. i.e., X87 FCS and FDS are always zero. Define names for them so that they can be exposed to the guest with -cpu host. Also define the bit field MACROs so that named cpu models can add them as well in the future.
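Not part of the patch itself, but as a minimal standalone sketch (assumes an x86 host and GCC/Clang's <cpuid.h>), the two bits can be probed like this:

    /* Illustrative only: report CPUID.(EAX=07H,ECX=0H):EBX bits 6 and 13. */
    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
            return 1;   /* leaf 7 not available on this CPU */
        }
        printf("fdp-excptn-only: %u\n", (ebx >> 6) & 1);
        printf("zero-fcs-fds:    %u\n", (ebx >> 13) & 1);
        return 0;
    }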
Signed-off-by: Xiaoyao Li Link: https://lore.kernel.org/r/20240814075431.339209-3-xiaoyao.li@intel.com Signed-off-by: Paolo Bonzini --- target/i386/cpu.c | 4 ++-- target/i386/cpu.h | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 0d30191482..089b651591 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -1054,9 +1054,9 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { .type = CPUID_FEATURE_WORD, .feat_names = { "fsgsbase", "tsc-adjust", "sgx", "bmi1", - "hle", "avx2", NULL, "smep", + "hle", "avx2", "fdp-excptn-only", "smep", "bmi2", "erms", "invpcid", "rtm", - NULL, NULL, "mpx", NULL, + NULL, "zero-fcs-fds", "mpx", NULL, "avx512f", "avx512dq", "rdseed", "adx", "smap", "avx512ifma", "pcommit", "clflushopt", "clwb", "intel-pt", "avx512pf", "avx512er", diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 4c63e7b045..4c84cd41fd 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -820,6 +820,8 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w); #define CPUID_7_0_EBX_HLE (1U << 4) /* Intel Advanced Vector Extensions 2 */ #define CPUID_7_0_EBX_AVX2 (1U << 5) +/* FPU data pointer updated only on x87 exceptions */ +#define CPUID_7_0_EBX_FDP_EXCPTN_ONLY (1u << 6) /* Supervisor-mode Execution Prevention */ #define CPUID_7_0_EBX_SMEP (1U << 7) /* 2nd Group of Advanced Bit Manipulation Extensions */ @@ -830,6 +832,8 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w); #define CPUID_7_0_EBX_INVPCID (1U << 10) /* Restricted Transactional Memory */ #define CPUID_7_0_EBX_RTM (1U << 11) +/* Zero out FPU CS and FPU DS */ +#define CPUID_7_0_EBX_ZERO_FCS_FDS (1U << 13) /* Memory Protection Extension */ #define CPUID_7_0_EBX_MPX (1U << 14) /* AVX-512 Foundation */ From 5ab639141b6d916a6f4041d4ec46f2f1a1e4a365 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 14 Aug 2024 03:54:27 -0400 Subject: [PATCH 03/26] target/i386: Construct CPUID 2 as stateful iff times > 1 When times == 1, the CPUID leaf 2 is not stateful. Signed-off-by: Xiaoyao Li Link: https://lore.kernel.org/r/20240814075431.339209-6-xiaoyao.li@intel.com Signed-off-by: Paolo Bonzini --- target/i386/kvm/kvm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 6f6301460d..77e8816570 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -1838,10 +1838,12 @@ static uint32_t kvm_x86_build_cpuid(CPUX86State *env, int times; c->function = i; - c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC | - KVM_CPUID_FLAG_STATE_READ_NEXT; cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx); times = c->eax & 0xff; + if (times > 1) { + c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC | + KVM_CPUID_FLAG_STATE_READ_NEXT; + } for (j = 1; j < times; ++j) { if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { From 87c88db3143e91076d167a62dd7febf49afca8a2 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 14 Aug 2024 03:54:31 -0400 Subject: [PATCH 04/26] target/i386: Make invtsc migratable when user sets tsc-khz explicitly When the user sets tsc-frequency explicitly, the invtsc feature is actually migratable because the tsc-frequency is supposed to be fixed during migration. See commit d99569d9d856 ("kvm: Allow invtsc migration if tsc-khz is set explicitly") for reference.
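As a usage sketch (the frequency value below is made up; pick the source host's actual TSC frequency in Hz), invtsc can then stay enabled together with migratable=on:

    qemu-system-x86_64 -accel kvm \
        -cpu host,migratable=on,+invtsc,tsc-frequency=2200000000 ...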
Signed-off-by: Xiaoyao Li Link: https://lore.kernel.org/r/20240814075431.339209-10-xiaoyao.li@intel.com Signed-off-by: Paolo Bonzini --- target/i386/cpu.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 089b651591..5535f4e6ab 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -1865,9 +1865,10 @@ static inline uint64_t x86_cpu_xsave_xss_components(X86CPU *cpu) * Returns the set of feature flags that are supported and migratable by * QEMU, for a given FeatureWord. */ -static uint64_t x86_cpu_get_migratable_flags(FeatureWord w) +static uint64_t x86_cpu_get_migratable_flags(X86CPU *cpu, FeatureWord w) { FeatureWordInfo *wi = &feature_word_info[w]; + CPUX86State *env = &cpu->env; uint64_t r = 0; int i; @@ -1881,6 +1882,12 @@ static uint64_t x86_cpu_get_migratable_flags(FeatureWord w) r |= f; } } + + /* when tsc-khz is set explicitly, invtsc is migratable */ + if ((w == FEAT_8000_0007_EDX) && env->user_tsc_khz) { + r |= CPUID_APM_INVTSC; + } + return r; } @@ -6124,7 +6131,7 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w) r &= ~unavail; if (cpu && cpu->migratable) { - r &= x86_cpu_get_migratable_flags(w); + r &= x86_cpu_get_migratable_flags(cpu, w); } return r; } From 10eaf9c0fb7060f45807becbb2742a9de9bc3632 Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Thu, 19 Sep 2024 13:10:11 +0800 Subject: [PATCH 05/26] target/i386: Add more features enumerated by CPUID.7.2.EDX The following 5 bits in CPUID.7.2.EDX are supported by KVM. Add support for them in QEMU. Each of them indicates that certain bits of IA32_SPEC_CTRL are supported. Those bits can control CPU speculation behavior which can be used to defend against side-channel attacks. bit0: intel-psfd If 1, indicates bit 7 of the IA32_SPEC_CTRL MSR is supported. Bit 7 of this MSR disables Fast Store Forwarding Predictor without disabling Speculative Store Bypass. bit1: ipred-ctrl If 1, indicates bits 3 and 4 of the IA32_SPEC_CTRL MSR are supported. Bit 3 of this MSR enables IPRED_DIS control for CPL3. Bit 4 of this MSR enables IPRED_DIS control for CPL0/1/2. bit2: rrsba-ctrl If 1, indicates bits 5 and 6 of the IA32_SPEC_CTRL MSR are supported. Bit 5 of this MSR disables RRSBA behavior for CPL3. Bit 6 of this MSR disables RRSBA behavior for CPL0/1/2. bit3: ddpd-u If 1, indicates bit 8 of the IA32_SPEC_CTRL MSR is supported. Bit 8 of this MSR disables Data Dependent Prefetcher. bit4: bhi-ctrl If 1, indicates bit 10 of the IA32_SPEC_CTRL MSR is supported. Bit 10 of this MSR enables BHI_DIS_S behavior.
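Not part of the patch, but as a standalone sketch (assumes an x86 host and GCC/Clang's <cpuid.h>), the five bits named above can be read back with:

    /* Illustrative only: report the CPUID.(EAX=07H,ECX=2H):EDX bits named above. */
    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
        static const char *name[] = {
            "intel-psfd", "ipred-ctrl", "rrsba-ctrl", "ddpd-u", "bhi-ctrl"
        };
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid_count(7, 2, &eax, &ebx, &ecx, &edx)) {
            return 1;   /* leaf 7 not available; subleaf 2 reads as zero if absent */
        }
        for (int i = 0; i < 5; i++) {
            printf("%-10s: %u\n", name[i], (edx >> i) & 1);
        }
        return 0;
    }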
Signed-off-by: Chao Gao Link: https://lore.kernel.org/r/20240919051011.118309-1-chao.gao@intel.com Signed-off-by: Paolo Bonzini --- target/i386/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 5535f4e6ab..9a6b9e9e51 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -1148,8 +1148,8 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { [FEAT_7_2_EDX] = { .type = CPUID_FEATURE_WORD, .feat_names = { - NULL, NULL, NULL, NULL, - NULL, "mcdt-no", NULL, NULL, + "intel-psfd", "ipred-ctrl", "rrsba-ctrl", "ddpd-u", + "bhi-ctrl", "mcdt-no", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, From b5151ace58ba1db6bdfeedf4c17336f7195ee849 Mon Sep 17 00:00:00 2001 From: Gao Shiyuan Date: Wed, 9 Oct 2024 17:51:09 +0800 Subject: [PATCH 06/26] target/i386: Add support save/load HWCR MSR KVM commit 191c8137a939 ("x86/kvm: Implement HWCR support") introduced support for emulating HWCR MSR. Add support for QEMU to save/load this MSR for migration purposes. Signed-off-by: Gao Shiyuan Signed-off-by: Wang Liang Link: https://lore.kernel.org/r/20241009095109.66843-1-gaoshiyuan@baidu.com Signed-off-by: Paolo Bonzini --- target/i386/cpu.h | 5 +++++ target/i386/kvm/kvm.c | 12 ++++++++++++ target/i386/machine.c | 20 ++++++++++++++++++++ 3 files changed, 37 insertions(+) diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 4c84cd41fd..74886d1580 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -533,6 +533,8 @@ typedef enum X86Seg { #define MSR_AMD64_TSC_RATIO_DEFAULT 0x100000000ULL +#define MSR_K7_HWCR 0xc0010015 + #define MSR_VM_HSAVE_PA 0xc0010117 #define MSR_IA32_XFD 0x000001c4 @@ -1858,6 +1860,9 @@ typedef struct CPUArchState { uint64_t msr_lbr_depth; LBREntry lbr_records[ARCH_LBR_NR_ENTRIES]; + /* AMD MSRC001_0015 Hardware Configuration */ + uint64_t msr_hwcr; + /* exception/interrupt handling */ int error_code; int exception_is_int; diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 77e8816570..7c3fcb8698 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -165,6 +165,7 @@ static bool has_msr_ucode_rev; static bool has_msr_vmx_procbased_ctls2; static bool has_msr_perf_capabs; static bool has_msr_pkrs; +static bool has_msr_hwcr; static uint32_t has_architectural_pmu_version; static uint32_t num_architectural_pmu_gp_counters; @@ -2577,6 +2578,8 @@ static int kvm_get_supported_msrs(KVMState *s) case MSR_IA32_PKRS: has_msr_pkrs = true; break; + case MSR_K7_HWCR: + has_msr_hwcr = true; } } } @@ -3919,6 +3922,9 @@ static int kvm_put_msrs(X86CPU *cpu, int level) if (has_msr_virt_ssbd) { kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, env->virt_ssbd); } + if (has_msr_hwcr) { + kvm_msr_entry_add(cpu, MSR_K7_HWCR, env->msr_hwcr); + } #ifdef TARGET_X86_64 if (lm_capable_kernel) { @@ -4403,6 +4409,9 @@ static int kvm_get_msrs(X86CPU *cpu) kvm_msr_entry_add(cpu, MSR_IA32_TSC, 0); env->tsc_valid = !runstate_is_running(); } + if (has_msr_hwcr) { + kvm_msr_entry_add(cpu, MSR_K7_HWCR, 0); + } #ifdef TARGET_X86_64 if (lm_capable_kernel) { @@ -4922,6 +4931,9 @@ static int kvm_get_msrs(X86CPU *cpu) case MSR_ARCH_LBR_INFO_0 ... 
MSR_ARCH_LBR_INFO_0 + 31: env->lbr_records[index - MSR_ARCH_LBR_INFO_0].info = msrs[i].data; break; + case MSR_K7_HWCR: + env->msr_hwcr = msrs[i].data; + break; } } diff --git a/target/i386/machine.c b/target/i386/machine.c index 39f8294f27..b4610325aa 100644 --- a/target/i386/machine.c +++ b/target/i386/machine.c @@ -1543,6 +1543,25 @@ static const VMStateDescription vmstate_msr_xfd = { } }; +static bool msr_hwcr_needed(void *opaque) +{ + X86CPU *cpu = opaque; + CPUX86State *env = &cpu->env; + + return env->msr_hwcr != 0; +} + +static const VMStateDescription vmstate_msr_hwcr = { + .name = "cpu/msr_hwcr", + .version_id = 1, + .minimum_version_id = 1, + .needed = msr_hwcr_needed, + .fields = (VMStateField[]) { + VMSTATE_UINT64(env.msr_hwcr, X86CPU), + VMSTATE_END_OF_LIST() + } +}; + #ifdef TARGET_X86_64 static bool intel_fred_msrs_needed(void *opaque) { @@ -1773,6 +1792,7 @@ const VMStateDescription vmstate_x86_cpu = { &vmstate_msr_intel_sgx, &vmstate_pdptrs, &vmstate_msr_xfd, + &vmstate_msr_hwcr, #ifdef TARGET_X86_64 &vmstate_msr_fred, &vmstate_amx_xtile, From bbf3810f2c4f97bd7a1982d3e0ff0f00295b8169 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 17 Sep 2024 18:00:48 +0200 Subject: [PATCH 07/26] target/i386: Fix conditional CONFIG_SYNDBG enablement Putting HYPERV_FEAT_SYNDBG entry under "#ifdef CONFIG_SYNDBG" in 'kvm_hyperv_properties' array is wrong: as HYPERV_FEAT_SYNDBG is not the highest feature number, the result is an empty (zeroed) entry in the array (and not a skipped entry!). hyperv_feature_supported() is designed to check that all CPUID bits are set but for a zeroed feature in 'kvm_hyperv_properties' it returns 'true' so QEMU considers HYPERV_FEAT_SYNDBG as always supported, regardless of whether KVM host actually supports it. To fix the issue, leave HYPERV_FEAT_SYNDBG's definition in 'kvm_hyperv_properties' array, there's nothing wrong in having it defined even when 'CONFIG_SYNDBG' is not set. Instead, put "hv-syndbg" CPU property under '#ifdef CONFIG_SYNDBG' to alter the existing behavior when the flag is silently skipped in !CONFIG_SYNDBG builds. Leave an 'assert' sentinel in hyperv_feature_supported() making sure there are no 'holes' or improperly defined features in 'kvm_hyperv_properties'. 
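A minimal standalone illustration of the pitfall (names are made up, not QEMU code): an #ifdef around a designated initializer that is not the last entry does not drop the entry, it leaves a zero-filled hole that lookups happily read back:

    #include <stdio.h>

    struct feat { const char *desc; unsigned bits; };

    static const struct feat props[] = {
        [0] = { "relaxed", 0x1 },
    #ifdef WITH_SYNDBG              /* not defined: entry 1 stays all zeroes */
        [1] = { "syndbg",  0x2 },
    #endif
        [2] = { "emsr-bitmap", 0x4 },
    };

    int main(void)
    {
        /* props[1].bits == 0, so an "all required bits are set" check passes trivially. */
        printf("desc=%s bits=%#x\n",
               props[1].desc ? props[1].desc : "(null)", props[1].bits);
        return 0;
    }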
Fixes: d8701185f40c ("hw: hyperv: Initial commit for Synthetic Debugging device") Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240917160051.2637594-2-vkuznets@redhat.com Signed-off-by: Paolo Bonzini --- target/i386/cpu.c | 2 ++ target/i386/kvm/kvm.c | 11 +++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 9a6b9e9e51..565aad02ea 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -8299,8 +8299,10 @@ static Property x86_cpu_properties[] = { HYPERV_FEAT_TLBFLUSH_DIRECT, 0), DEFINE_PROP_ON_OFF_AUTO("hv-no-nonarch-coresharing", X86CPU, hyperv_no_nonarch_cs, ON_OFF_AUTO_OFF), +#ifdef CONFIG_SYNDBG DEFINE_PROP_BIT64("hv-syndbg", X86CPU, hyperv_features, HYPERV_FEAT_SYNDBG, 0), +#endif DEFINE_PROP_BOOL("hv-passthrough", X86CPU, hyperv_passthrough, false), DEFINE_PROP_BOOL("hv-enforce-cpuid", X86CPU, hyperv_enforce_cpuid, false), diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 7c3fcb8698..1ec4977a8e 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -1056,7 +1056,6 @@ static struct { .bits = HV_DEPRECATING_AEOI_RECOMMENDED} } }, -#ifdef CONFIG_SYNDBG [HYPERV_FEAT_SYNDBG] = { .desc = "Enable synthetic kernel debugger channel (hv-syndbg)", .flags = { @@ -1065,7 +1064,6 @@ static struct { }, .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_RELAXED) }, -#endif [HYPERV_FEAT_MSR_BITMAP] = { .desc = "enlightened MSR-Bitmap (hv-emsr-bitmap)", .flags = { @@ -1317,6 +1315,13 @@ static bool hyperv_feature_supported(CPUState *cs, int feature) uint32_t func, bits; int i, reg; + /* + * kvm_hyperv_properties needs to define at least one CPUID flag which + * must be used to detect the feature, it's hard to say whether it is + * supported or not otherwise. + */ + assert(kvm_hyperv_properties[feature].flags[0].func); + for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties[feature].flags); i++) { func = kvm_hyperv_properties[feature].flags[i].func; @@ -4025,13 +4030,11 @@ static int kvm_put_msrs(X86CPU *cpu, int level) kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS, env->msr_hv_tsc_emulation_status); } -#ifdef CONFIG_SYNDBG if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG) && has_msr_hv_syndbg_options) { kvm_msr_entry_add(cpu, HV_X64_MSR_SYNDBG_OPTIONS, hyperv_syndbg_query_options()); } -#endif } if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) { kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE, From 7d7b9c7655a26e09c800ef40373078a80e90d9f3 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 17 Sep 2024 18:00:49 +0200 Subject: [PATCH 08/26] target/i386: Exclude 'hv-syndbg' from 'hv-passthrough' Windows with Hyper-V role enabled doesn't boot with 'hv-passthrough' when no debugger is configured, this significantly limits the usefulness of the feature as there's no support for subtracting Hyper-V features from CPU flags at this moment (e.g. "-cpu host,hv-passthrough,-hv-syndbg" does not work). While this is also theoretically fixable, 'hv-syndbg' is likely very special and unneeded in the default set. Genuine Hyper-V doesn't seem to enable it either. Introduce 'skip_passthrough' flag to 'kvm_hyperv_properties' and use it as one-off to skip 'hv-syndbg' when enabling features in 'hv-passthrough' mode. Note, "-cpu host,hv-passthrough,hv-syndbg" can still be used if needed. As both 'hv-passthrough' and 'hv-syndbg' are debug features, the change should not have any effect on production environments. 
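As a usage sketch of the resulting behaviour:

    -cpu host,hv-passthrough              # hv-syndbg is no longer included
    -cpu host,hv-passthrough,hv-syndbg    # opt back in when a debugger is configured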
Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240917160051.2637594-3-vkuznets@redhat.com Signed-off-by: Paolo Bonzini --- docs/system/i386/hyperv.rst | 13 +++++++++---- target/i386/kvm/kvm.c | 7 +++++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/docs/system/i386/hyperv.rst b/docs/system/i386/hyperv.rst index 2505dc4c86..009947e391 100644 --- a/docs/system/i386/hyperv.rst +++ b/docs/system/i386/hyperv.rst @@ -262,14 +262,19 @@ Supplementary features ``hv-passthrough`` In some cases (e.g. during development) it may make sense to use QEMU in 'pass-through' mode and give Windows guests all enlightenments currently - supported by KVM. This pass-through mode is enabled by "hv-passthrough" CPU - flag. + supported by KVM. Note: ``hv-passthrough`` flag only enables enlightenments which are known to QEMU (have corresponding 'hv-' flag) and copies ``hv-spinlocks`` and ``hv-vendor-id`` values from KVM to QEMU. ``hv-passthrough`` overrides all other 'hv-' settings on - the command line. Also, enabling this flag effectively prevents migration as the - list of enabled enlightenments may differ between target and destination hosts. + the command line. + + Note: ``hv-passthrough`` does not enable ``hv-syndbg`` which can prevent certain + Windows guests from booting when used without proper configuration. If needed, + ``hv-syndbg`` can be enabled additionally. + + Note: ``hv-passthrough`` effectively prevents migration as the list of enabled + enlightenments may differ between target and destination hosts. ``hv-enforce-cpuid`` By default, KVM allows the guest to use all currently supported Hyper-V diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 1ec4977a8e..fd9f198892 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -934,6 +934,7 @@ static struct { uint32_t bits; } flags[2]; uint64_t dependencies; + bool skip_passthrough; } kvm_hyperv_properties[] = { [HYPERV_FEAT_RELAXED] = { .desc = "relaxed timing (hv-relaxed)", @@ -1062,7 +1063,8 @@ static struct { {.func = HV_CPUID_FEATURES, .reg = R_EDX, .bits = HV_FEATURE_DEBUG_MSRS_AVAILABLE} }, - .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_RELAXED) + .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_RELAXED), + .skip_passthrough = true, }, [HYPERV_FEAT_MSR_BITMAP] = { .desc = "enlightened MSR-Bitmap (hv-emsr-bitmap)", @@ -1471,7 +1473,8 @@ bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp) * hv_build_cpuid_leaf() uses this info to build guest CPUIDs. */ for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) { - if (hyperv_feature_supported(cs, feat)) { + if (hyperv_feature_supported(cs, feat) && + !kvm_hyperv_properties[feat].skip_passthrough) { cpu->hyperv_features |= BIT(feat); } } From d3177e2e4353824a650434c57471615d43507500 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 17 Sep 2024 18:00:50 +0200 Subject: [PATCH 09/26] target/i386: Make sure SynIC state is really updated before KVM_RUN 'hyperv_synic' test from KVM unittests was observed to be flaky on certain hardware (hangs sometimes). Debugging shows that the problem happens in hyperv_sint_route_new() when the test tries to set up a new SynIC route. The function bails out on: if (!synic->sctl_enabled) { goto cleanup; } but the test writes to HV_X64_MSR_SCONTROL just before it starts establishing SINT routes. Further investigation shows that synic_update() (called from async_synic_update()) happens after the SINT setup attempt and not before. 
Apparently, the comment before async_safe_run_on_cpu() in kvm_hv_handle_exit() does not correctly describe the guarantees async_safe_run_on_cpu() gives. In particular, async work added to a CPU is actually processed from qemu_wait_io_event() which is not always called before KVM_RUN, i.e. kvm_cpu_exec() checks whether an exit request is pending for a CPU and if not, keeps running the vCPU until it meets an exit it can't handle internally. Hyper-V specific MSR writes do not automatically trigger an exit. Fix the issue by simply raising an exit request for the vCPU where the SynIC update was queued. This is not a performance critical path as SynIC state does not get updated so often (and async_safe_run_on_cpu() is a big hammer anyway). Reported-by: Jan Richter Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240917160051.2637594-4-vkuznets@redhat.com Signed-off-by: Paolo Bonzini --- target/i386/kvm/hyperv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/target/i386/kvm/hyperv.c b/target/i386/kvm/hyperv.c index b94f12acc2..70b89cacf9 100644 --- a/target/i386/kvm/hyperv.c +++ b/target/i386/kvm/hyperv.c @@ -80,6 +80,7 @@ int kvm_hv_handle_exit(X86CPU *cpu, struct kvm_hyperv_exit *exit) * necessary because memory hierarchy is being changed */ async_safe_run_on_cpu(CPU(cpu), async_synic_update, RUN_ON_CPU_NULL); + cpu_exit(CPU(cpu)); return EXCP_INTERRUPT; case KVM_EXIT_HYPERV_HCALL: { From 45f519950d4f70d092b552661323ef3c851efdf7 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 17 Sep 2024 18:00:51 +0200 Subject: [PATCH 10/26] docs/system: Add recommendations to Hyper-V enlightenments doc While hyperv.rst already has all currently implemented Hyper-V enlightenments documented, it may be unclear what the recommended set is to achieve the best result. Add the corresponding section to the doc. Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240917160051.2637594-5-vkuznets@redhat.com Signed-off-by: Paolo Bonzini --- docs/system/i386/hyperv.rst | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/docs/system/i386/hyperv.rst b/docs/system/i386/hyperv.rst index 009947e391..1c1de77feb 100644 --- a/docs/system/i386/hyperv.rst +++ b/docs/system/i386/hyperv.rst @@ -283,6 +283,36 @@ Supplementary features feature alters this behavior and only allows the guest to use exposed Hyper-V enlightenments. +Recommendations +--------------- + +To achieve the best performance of Windows and Hyper-V guests and unless there +are any specific requirements (e.g. migration to older QEMU/KVM versions, +emulating specific Hyper-V version, ...), it is recommended to enable all +currently implemented Hyper-V enlightenments with the following exceptions: + +- ``hv-syndbg``, ``hv-passthrough``, ``hv-enforce-cpuid`` should not be enabled + in production configurations as these are debugging/development features. +- ``hv-reset`` can be avoided as modern Hyper-V versions don't expose it. +- ``hv-evmcs`` can (and should) be enabled on Intel CPUs only. While the feature + is only used in nested configurations (Hyper-V, WSL2), enabling it for regular + Windows guests should not have any negative effects. +- ``hv-no-nonarch-coresharing`` must only be enabled if vCPUs are properly pinned + so no non-architectural core sharing is possible.
+- ``hv-vendor-id``, ``hv-version-id-build``, ``hv-version-id-major``, + ``hv-version-id-minor``, ``hv-version-id-spack``, ``hv-version-id-sbranch``, + ``hv-version-id-snumber`` can be left unchanged, guests are not supposed to + behave differently when different Hyper-V version is presented to them. +- ``hv-crash`` must only be enabled if the crash information is consumed via + QAPI by higher levels of the virtualization stack. Enabling this feature + effectively prevents Windows from creating dumps upon crashes. +- ``hv-reenlightenment`` can only be used on hardware which supports TSC + scaling or when guest migration is not needed. +- ``hv-spinlocks`` should be set to e.g. 0xfff when host CPUs are overcommited + (meaning there are other scheduled tasks or guests) and can be left unchanged + from the default value (0xffffffff) otherwise. +- ``hv-avic``/``hv-apicv`` should not be enabled if the hardware does not + support APIC virtualization (Intel APICv, AMD AVIC). Useful links ------------ From 615586cb356811e46c2e5f85c36db4b93f8381cd Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 17 Oct 2024 11:09:52 +0200 Subject: [PATCH 11/26] tcg/s390x: fix constraint for 32-bit TSTEQ/TSTNE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 32-bit TSTEQ and TSTNE is subject to the same constraints as for 64-bit, but setcond_i32 and negsetcond_i32 were incorrectly using TCG_CT_CONST ("i") instead of TCG_CT_CONST_CMP ("C"). Adjust the constraint and make tcg_target_const_match use the same sequence as tgen_cmp2: first check if the constant is a valid operand for TSTEQ/TSTNE, then accept everything for 32-bit non-test comparisons, finally check if the constant is a valid operand for 64-bit non-test comparisons. Reported-by: Philippe Mathieu-Daudé Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini --- tcg/s390x/tcg-target.c.inc | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc index a5d57197a4..27bccc14e5 100644 --- a/tcg/s390x/tcg-target.c.inc +++ b/tcg/s390x/tcg-target.c.inc @@ -565,6 +565,20 @@ static bool tcg_target_const_match(int64_t val, int ct, } if (ct & TCG_CT_CONST_CMP) { + if (is_tst_cond(cond)) { + if (is_const_p16(uval) >= 0) { + return true; /* TMxx */ + } + if (risbg_mask(uval)) { + return true; /* RISBG */ + } + return false; + } + + if (type == TCG_TYPE_I32) { + return true; + } + switch (cond) { case TCG_COND_EQ: case TCG_COND_NE: @@ -584,13 +598,7 @@ static bool tcg_target_const_match(int64_t val, int ct, break; case TCG_COND_TSTNE: case TCG_COND_TSTEQ: - if (is_const_p16(uval) >= 0) { - return true; /* TMxx */ - } - if (risbg_mask(uval)) { - return true; /* RISBG */ - } - break; + /* checked above, fallthru */ default: g_assert_not_reached(); } @@ -3231,9 +3239,9 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_rotl_i64: case INDEX_op_rotr_i32: case INDEX_op_rotr_i64: + return C_O1_I2(r, r, ri); case INDEX_op_setcond_i32: case INDEX_op_negsetcond_i32: - return C_O1_I2(r, r, ri); case INDEX_op_setcond_i64: case INDEX_op_negsetcond_i64: return C_O1_I2(r, r, rC); From 10eae89937d3211ce100b7f6a3718df66324bdf5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 20 Jun 2024 11:42:12 +0200 Subject: [PATCH 12/26] target/i386: convert bit test instructions to new decoder Code generation was rewritten; it reuses the same trick to use the CC_OP_SAR values for cc_op, but it tries to use CC_OP_ADCX or CC_OP_ADCOX 
instead of CC_OP_EFLAGS. This is a tiny bit more efficient in the common case where only CF is checked in the resulting flags. Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- target/i386/tcg/decode-new.c.inc | 41 ++++++--- target/i386/tcg/decode-new.h | 3 + target/i386/tcg/emit.c.inc | 150 ++++++++++++++++++++++++++++++- target/i386/tcg/translate.c | 147 +----------------------------- 4 files changed, 183 insertions(+), 158 deletions(-) diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc index 30be9237c3..55629f6825 100644 --- a/target/i386/tcg/decode-new.c.inc +++ b/target/i386/tcg/decode-new.c.inc @@ -205,6 +205,7 @@ #define sextT0 .special = X86_SPECIAL_SExtT0, #define zextT0 .special = X86_SPECIAL_ZExtT0, #define op0_Mw .special = X86_SPECIAL_Op0_Mw, +#define btEvGv .special = X86_SPECIAL_BitTest, #define vex1 .vex_class = 1, #define vex1_rep3 .vex_class = 1, .vex_special = X86_VEX_REPScalar, @@ -269,6 +270,24 @@ static inline const X86OpEntry *decode_by_prefix(DisasContext *s, const X86OpEnt } } +static void decode_group8(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86GenFunc group8_gen[8] = { + NULL, NULL, NULL, NULL, + gen_BT, gen_BTS, gen_BTR, gen_BTC, + }; + int op = (get_modrm(s, env) >> 3) & 7; + entry->gen = group8_gen[op]; + if (op == 4) { + /* prevent writeback and LOCK for BT */ + entry->op1 = entry->op0; + entry->op0 = X86_TYPE_None; + entry->s0 = X86_SIZE_None; + } else { + entry->special = X86_SPECIAL_HasLock; + } +} + static void decode_group15(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) { static const X86OpEntry group15_reg[8] = { @@ -1162,12 +1181,14 @@ static const X86OpEntry opcodes_0F[256] = { [0xa0] = X86_OP_ENTRYr(PUSH, FS, w), [0xa1] = X86_OP_ENTRYw(POP, FS, w), [0xa2] = X86_OP_ENTRY0(CPUID), + [0xa3] = X86_OP_ENTRYrr(BT, E,v, G,v, btEvGv), [0xa4] = X86_OP_ENTRY4(SHLD, E,v, 2op,v, G,v), [0xa5] = X86_OP_ENTRY3(SHLD, E,v, 2op,v, G,v), [0xb0] = X86_OP_ENTRY2(CMPXCHG,E,b, G,b, lock), [0xb1] = X86_OP_ENTRY2(CMPXCHG,E,v, G,v, lock), [0xb2] = X86_OP_ENTRY3(LSS, G,v, EM,p, None, None), + [0xb3] = X86_OP_ENTRY2(BTR, E,v, G,v, btEvGv), [0xb4] = X86_OP_ENTRY3(LFS, G,v, EM,p, None, None), [0xb5] = X86_OP_ENTRY3(LGS, G,v, EM,p, None, None), [0xb6] = X86_OP_ENTRY3(MOV, G,v, E,b, None, None, zextT0), /* MOVZX */ @@ -1294,6 +1315,7 @@ static const X86OpEntry opcodes_0F[256] = { [0xa8] = X86_OP_ENTRYr(PUSH, GS, w), [0xa9] = X86_OP_ENTRYw(POP, GS, w), [0xaa] = X86_OP_ENTRY0(RSM, chk(smm) svm(RSM)), + [0xab] = X86_OP_ENTRY2(BTS, E,v, G,v, btEvGv), [0xac] = X86_OP_ENTRY4(SHRD, E,v, 2op,v, G,v), [0xad] = X86_OP_ENTRY3(SHRD, E,v, 2op,v, G,v), [0xae] = X86_OP_GROUP0(group15), @@ -1306,6 +1328,8 @@ static const X86OpEntry opcodes_0F[256] = { [0xb8] = X86_OP_GROUP0(0FB8), /* decoded as modrm, which is visible as a difference between page fault and #UD */ [0xb9] = X86_OP_ENTRYr(UD, nop,v), /* UD1 */ + [0xba] = X86_OP_GROUP2(group8, E,v, I,b), + [0xbb] = X86_OP_ENTRY2(BTC, E,v, G,v, btEvGv), [0xbc] = X86_OP_GROUP0(0FBC), [0xbd] = X86_OP_GROUP0(0FBD), [0xbe] = X86_OP_ENTRY3(MOV, G,v, E,b, None, None, sextT0), /* MOVSX */ @@ -2428,6 +2452,7 @@ static void disas_insn(DisasContext *s, CPUState *cpu) CPUX86State *env = cpu_env(cpu); X86DecodedInsn decode; X86DecodeFunc decode_func = decode_root; + bool accept_lock = false; uint8_t cc_live, b; s->pc = s->base.pc_next; @@ -2601,10 +2626,6 @@ static void disas_insn(DisasContext *s, CPUState *cpu) switch (b) { case 0x00 ... 
0x01: /* mostly privileged instructions */ case 0x1a ... 0x1b: /* MPX */ - case 0xa3: /* bt */ - case 0xab: /* bts */ - case 0xb3: /* btr */ - case 0xba ... 0xbb: /* grp8, btc */ case 0xc7: /* grp9 */ disas_insn_old(s, cpu, b + 0x100); return; @@ -2666,9 +2687,10 @@ static void disas_insn(DisasContext *s, CPUState *cpu) if (decode.op[0].has_ea) { s->prefix |= PREFIX_LOCK; } - decode.e.special = X86_SPECIAL_HasLock; /* fallthrough */ case X86_SPECIAL_HasLock: + case X86_SPECIAL_BitTest: + accept_lock = decode.op[0].has_ea; break; case X86_SPECIAL_Op0_Rd: @@ -2710,10 +2732,8 @@ static void disas_insn(DisasContext *s, CPUState *cpu) break; } - if (s->prefix & PREFIX_LOCK) { - if (decode.e.special != X86_SPECIAL_HasLock || !decode.op[0].has_ea) { - goto illegal_op; - } + if ((s->prefix & PREFIX_LOCK) && !accept_lock) { + goto illegal_op; } if (!validate_vex(s, &decode)) { @@ -2759,9 +2779,10 @@ static void disas_insn(DisasContext *s, CPUState *cpu) if (decode.e.special != X86_SPECIAL_NoLoadEA && (decode.op[0].has_ea || decode.op[1].has_ea || decode.op[2].has_ea)) { - gen_load_ea(s, &decode.mem, decode.e.vex_class == 12); + gen_load_ea(s, &decode); } if (s->prefix & PREFIX_LOCK) { + assert(decode.op[0].has_ea && !decode.op[2].has_ea); gen_load(s, &decode, 2, s->T1); decode.e.gen(s, &decode); } else { diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h index f9bf9a6041..e4cdf5e3c4 100644 --- a/target/i386/tcg/decode-new.h +++ b/target/i386/tcg/decode-new.h @@ -190,6 +190,9 @@ typedef enum X86InsnSpecial { /* Always locked if it has a memory operand (XCHG) */ X86_SPECIAL_Locked, + /* Like HasLock, but also operand 2 provides bit displacement into memory. */ + X86_SPECIAL_BitTest, + /* Do not load effective address in s->A0 */ X86_SPECIAL_NoLoadEA, diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc index 9b50419918..bb498464ea 100644 --- a/target/i386/tcg/emit.c.inc +++ b/target/i386/tcg/emit.c.inc @@ -78,9 +78,26 @@ static void gen_NM_exception(DisasContext *s) gen_exception(s, EXCP07_PREX); } -static void gen_load_ea(DisasContext *s, AddressParts *mem, bool is_vsib) +static void gen_load_ea(DisasContext *s, X86DecodedInsn *decode) { - TCGv ea = gen_lea_modrm_1(s, *mem, is_vsib); + AddressParts *mem = &decode->mem; + TCGv ea; + + ea = gen_lea_modrm_1(s, *mem, decode->e.vex_class == 12); + if (decode->e.special == X86_SPECIAL_BitTest) { + MemOp ot = decode->op[1].ot; + int poslen = 8 << ot; + int opn = decode->op[2].n; + TCGv ofs = tcg_temp_new(); + + /* Extract memory displacement from the second operand. */ + assert(decode->op[2].unit == X86_OP_INT && decode->op[2].ot != MO_8); + tcg_gen_sextract_tl(ofs, cpu_regs[opn], 3, poslen - 3); + tcg_gen_andi_tl(ofs, ofs, -1 << ot); + tcg_gen_add_tl(s->A0, ea, ofs); + ea = s->A0; + } + gen_lea_v_seg(s, ea, mem->def_seg, s->override); } @@ -412,6 +429,32 @@ static void prepare_update3_cc(X86DecodedInsn *decode, DisasContext *s, CCOp op, decode->cc_op = op; } +/* Set up decode->cc_* to modify CF while keeping other flags unchanged. 
*/ +static void prepare_update_cf(X86DecodedInsn *decode, DisasContext *s, TCGv cf) +{ + switch (s->cc_op) { + case CC_OP_ADOX: + case CC_OP_ADCOX: + decode->cc_src2 = cpu_cc_src2; + decode->cc_src = cpu_cc_src; + decode->cc_op = CC_OP_ADCOX; + break; + + case CC_OP_EFLAGS: + case CC_OP_ADCX: + decode->cc_src = cpu_cc_src; + decode->cc_op = CC_OP_ADCX; + break; + + default: + decode->cc_src = tcg_temp_new(); + gen_mov_eflags(s, decode->cc_src); + decode->cc_op = CC_OP_ADCX; + break; + } + decode->cc_dst = cf; +} + static void gen_store_sse(DisasContext *s, X86DecodedInsn *decode, int src_ofs) { MemOp ot = decode->op[0].ot; @@ -1390,6 +1433,109 @@ static void gen_BSWAP(DisasContext *s, X86DecodedInsn *decode) tcg_gen_bswap32_tl(s->T0, s->T0, TCG_BSWAP_OZ); } +static TCGv gen_bt_mask(DisasContext *s, X86DecodedInsn *decode) +{ + MemOp ot = decode->op[1].ot; + TCGv mask = tcg_temp_new(); + + tcg_gen_andi_tl(s->T1, s->T1, (8 << ot) - 1); + tcg_gen_shl_tl(mask, tcg_constant_tl(1), s->T1); + return mask; +} + +/* Expects truncated bit index in s->T1, 1 << s->T1 in MASK. */ +static void gen_bt_flags(DisasContext *s, X86DecodedInsn *decode, TCGv src, TCGv mask) +{ + TCGv cf; + + /* + * C is the result of the test, Z is unchanged, and the others + * are all undefined. + */ + switch (s->cc_op) { + case CC_OP_DYNAMIC: + case CC_OP_CLR: + case CC_OP_EFLAGS: + case CC_OP_ADCX: + case CC_OP_ADOX: + case CC_OP_ADCOX: + /* Generate EFLAGS and replace the C bit. */ + cf = tcg_temp_new(); + tcg_gen_setcond_tl(TCG_COND_TSTNE, cf, src, mask); + prepare_update_cf(decode, s, cf); + break; + default: + /* + * Z was going to be computed from the non-zero status of CC_DST. + * We can get that same Z value (and the new C value) by leaving + * CC_DST alone, setting CC_SRC, and using a CC_OP_SAR of the + * same width. 
+ */ + decode->cc_src = tcg_temp_new(); + decode->cc_dst = cpu_cc_dst; + decode->cc_op = ((s->cc_op - CC_OP_MULB) & 3) + CC_OP_SARB; + tcg_gen_shr_tl(decode->cc_src, src, s->T1); + break; + } +} + +static void gen_BT(DisasContext *s, X86DecodedInsn *decode) +{ + TCGv mask = gen_bt_mask(s, decode); + + gen_bt_flags(s, decode, s->T0, mask); +} + +static void gen_BTC(DisasContext *s, X86DecodedInsn *decode) +{ + MemOp ot = decode->op[0].ot; + TCGv old = tcg_temp_new(); + TCGv mask = gen_bt_mask(s, decode); + + if (s->prefix & PREFIX_LOCK) { + tcg_gen_atomic_fetch_xor_tl(old, s->A0, mask, s->mem_index, ot | MO_LE); + } else { + tcg_gen_mov_tl(old, s->T0); + tcg_gen_xor_tl(s->T0, s->T0, mask); + } + + gen_bt_flags(s, decode, old, mask); +} + +static void gen_BTR(DisasContext *s, X86DecodedInsn *decode) +{ + MemOp ot = decode->op[0].ot; + TCGv old = tcg_temp_new(); + TCGv mask = gen_bt_mask(s, decode); + + if (s->prefix & PREFIX_LOCK) { + TCGv maskc = tcg_temp_new(); + tcg_gen_not_tl(maskc, mask); + tcg_gen_atomic_fetch_and_tl(old, s->A0, maskc, s->mem_index, ot | MO_LE); + } else { + tcg_gen_mov_tl(old, s->T0); + tcg_gen_andc_tl(s->T0, s->T0, mask); + } + + gen_bt_flags(s, decode, old, mask); +} + +static void gen_BTS(DisasContext *s, X86DecodedInsn *decode) +{ + MemOp ot = decode->op[0].ot; + TCGv old = tcg_temp_new(); + TCGv mask = gen_bt_mask(s, decode); + + if (s->prefix & PREFIX_LOCK) { + tcg_gen_atomic_fetch_or_tl(old, s->A0, mask, s->mem_index, ot | MO_LE); + } else { + tcg_gen_mov_tl(old, s->T0); + tcg_gen_or_tl(s->T0, s->T0, mask); + } + + gen_bt_flags(s, decode, old, mask); +} + static void gen_BZHI(DisasContext *s, X86DecodedInsn *decode) { MemOp ot = decode->op[0].ot; diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 98f5fe61ed..59b67042cd 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -693,11 +693,6 @@ static TCGv gen_ext_tl(TCGv dst, TCGv src, MemOp size, bool sign) return dst; } -static void gen_exts(MemOp ot, TCGv reg) -{ - gen_ext_tl(reg, reg, ot, true); -} - static void gen_op_j_ecx(DisasContext *s, TCGCond cond, TCGLabel *label1) { TCGv tmp = gen_ext_tl(NULL, cpu_regs[R_ECX], s->aflag, false); @@ -2980,7 +2975,7 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) int prefixes = s->prefix; MemOp dflag = s->dflag; MemOp ot; - int modrm, reg, rm, mod, op, val; + int modrm, reg, rm, mod, op; /* now check op code */ switch (b) { @@ -3046,146 +3041,6 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) } break; - /************************/ - /* bit operations */ - case 0x1ba: /* bt/bts/btr/btc Gv, im */ - ot = dflag; - modrm = x86_ldub_code(env, s); - op = (modrm >> 3) & 7; - mod = (modrm >> 6) & 3; - rm = (modrm & 7) | REX_B(s); - if (mod != 3) { - s->rip_offset = 1; - gen_lea_modrm(env, s, modrm); - if (!(s->prefix & PREFIX_LOCK)) { - gen_op_ld_v(s, ot, s->T0, s->A0); - } - } else { - gen_op_mov_v_reg(s, ot, s->T0, rm); - } - /* load shift */ - val = x86_ldub_code(env, s); - tcg_gen_movi_tl(s->T1, val); - if (op < 4) - goto unknown_op; - op -= 4; - goto bt_op; - case 0x1a3: /* bt Gv, Ev */ - op = 0; - goto do_btx; - case 0x1ab: /* bts */ - op = 1; - goto do_btx; - case 0x1b3: /* btr */ - op = 2; - goto do_btx; - case 0x1bb: /* btc */ - op = 3; - do_btx: - ot = dflag; - modrm = x86_ldub_code(env, s); - reg = ((modrm >> 3) & 7) | REX_R(s); - mod = (modrm >> 6) & 3; - rm = (modrm & 7) | REX_B(s); - gen_op_mov_v_reg(s, MO_32, s->T1, reg); - if (mod != 3) { - AddressParts a = 
gen_lea_modrm_0(env, s, modrm, false); - /* specific case: we need to add a displacement */ - gen_exts(ot, s->T1); - tcg_gen_sari_tl(s->tmp0, s->T1, 3 + ot); - tcg_gen_shli_tl(s->tmp0, s->tmp0, ot); - tcg_gen_add_tl(s->A0, gen_lea_modrm_1(s, a, false), s->tmp0); - gen_lea_v_seg(s, s->A0, a.def_seg, s->override); - if (!(s->prefix & PREFIX_LOCK)) { - gen_op_ld_v(s, ot, s->T0, s->A0); - } - } else { - gen_op_mov_v_reg(s, ot, s->T0, rm); - } - bt_op: - tcg_gen_andi_tl(s->T1, s->T1, (1 << (3 + ot)) - 1); - tcg_gen_movi_tl(s->tmp0, 1); - tcg_gen_shl_tl(s->tmp0, s->tmp0, s->T1); - if (s->prefix & PREFIX_LOCK) { - switch (op) { - case 0: /* bt */ - /* Needs no atomic ops; we suppressed the normal - memory load for LOCK above so do it now. */ - gen_op_ld_v(s, ot, s->T0, s->A0); - break; - case 1: /* bts */ - tcg_gen_atomic_fetch_or_tl(s->T0, s->A0, s->tmp0, - s->mem_index, ot | MO_LE); - break; - case 2: /* btr */ - tcg_gen_not_tl(s->tmp0, s->tmp0); - tcg_gen_atomic_fetch_and_tl(s->T0, s->A0, s->tmp0, - s->mem_index, ot | MO_LE); - break; - default: - case 3: /* btc */ - tcg_gen_atomic_fetch_xor_tl(s->T0, s->A0, s->tmp0, - s->mem_index, ot | MO_LE); - break; - } - tcg_gen_shr_tl(s->tmp4, s->T0, s->T1); - } else { - tcg_gen_shr_tl(s->tmp4, s->T0, s->T1); - switch (op) { - case 0: /* bt */ - /* Data already loaded; nothing to do. */ - break; - case 1: /* bts */ - tcg_gen_or_tl(s->T0, s->T0, s->tmp0); - break; - case 2: /* btr */ - tcg_gen_andc_tl(s->T0, s->T0, s->tmp0); - break; - default: - case 3: /* btc */ - tcg_gen_xor_tl(s->T0, s->T0, s->tmp0); - break; - } - if (op != 0) { - if (mod != 3) { - gen_op_st_v(s, ot, s->T0, s->A0); - } else { - gen_op_mov_reg_v(s, ot, rm, s->T0); - } - } - } - - /* Delay all CC updates until after the store above. Note that - C is the result of the test, Z is unchanged, and the others - are all undefined. */ - switch (s->cc_op) { - case CC_OP_MULB ... CC_OP_MULQ: - case CC_OP_ADDB ... CC_OP_ADDQ: - case CC_OP_ADCB ... CC_OP_ADCQ: - case CC_OP_SUBB ... CC_OP_SUBQ: - case CC_OP_SBBB ... CC_OP_SBBQ: - case CC_OP_LOGICB ... CC_OP_LOGICQ: - case CC_OP_INCB ... CC_OP_INCQ: - case CC_OP_DECB ... CC_OP_DECQ: - case CC_OP_SHLB ... CC_OP_SHLQ: - case CC_OP_SARB ... CC_OP_SARQ: - case CC_OP_BMILGB ... CC_OP_BMILGQ: - case CC_OP_POPCNT: - /* Z was going to be computed from the non-zero status of CC_DST. - We can get that same Z value (and the new C value) by leaving - CC_DST alone, setting CC_SRC, and using a CC_OP_SAR of the - same width. */ - tcg_gen_mov_tl(cpu_cc_src, s->tmp4); - set_cc_op(s, ((s->cc_op - CC_OP_MULB) & 3) + CC_OP_SARB); - break; - default: - /* Otherwise, generate EFLAGS and replace the C bit. */ - gen_compute_eflags(s); - tcg_gen_deposit_tl(cpu_cc_src, cpu_cc_src, s->tmp4, - ctz32(CC_C), 1); - break; - } - break; case 0x100: modrm = x86_ldub_code(env, s); mod = (modrm >> 6) & 3; From a2e2c78d2af0cfcc3d59542e70503c56d9ae7369 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 9 May 2024 17:03:59 +0200 Subject: [PATCH 13/26] target/i386: decode address before going back to translate.c There are now relatively few unconverted opcodes in translate.c (there are 13 of them including 8 for x87), and all of them have the same format with a mod/rm byte and no immediate. A good next step is to remove the early bail out to disas_insn_x87/disas_insn_old, instead giving these legacy translator functions the same prototype as the other gen_* functions. 
To do this, the X86DecodeInsn can be passed down to the places that used to fetch address bytes from the instruction stream. To make sure that everything is done cleanly, the CPUX86State* argument is removed. As part of the unification, the gen_lea_modrm() name is now free, so rename gen_load_ea() to gen_lea_modrm(). This is as good a name and it makes the changes to translate.c easier to review. Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- target/i386/tcg/decode-new.c.inc | 53 ++++++----- target/i386/tcg/decode-new.h | 14 ++- target/i386/tcg/emit.c.inc | 2 +- target/i386/tcg/translate.c | 152 +++++++++++++------------------ 4 files changed, 103 insertions(+), 118 deletions(-) diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc index 55629f6825..ee3ba16116 100644 --- a/target/i386/tcg/decode-new.c.inc +++ b/target/i386/tcg/decode-new.c.inc @@ -1092,6 +1092,8 @@ static void decode_MOV_CR_DR(DisasContext *s, CPUX86State *env, X86OpEntry *entr } static const X86OpEntry opcodes_0F[256] = { + [0x00] = X86_OP_ENTRY1(multi0F, nop,v, nolea), /* unconverted */ + [0x01] = X86_OP_ENTRY1(multi0F, nop,v, nolea), /* unconverted */ [0x02] = X86_OP_ENTRYwr(LAR, G,v, E,w, chk(prot)), [0x03] = X86_OP_ENTRYwr(LSL, G,v, E,w, chk(prot)), [0x05] = X86_OP_ENTRY0(SYSCALL, chk(o64_intel)), @@ -1201,6 +1203,7 @@ static const X86OpEntry opcodes_0F[256] = { [0xc4] = X86_OP_ENTRY4(PINSRW, V,dq,H,dq,E,w, vex5 mmx p_00_66), [0xc5] = X86_OP_ENTRY3(PEXTRW, G,d, U,dq,I,b, vex5 mmx p_00_66), [0xc6] = X86_OP_ENTRY4(VSHUF, V,x, H,x, W,x, vex4 p_00_66), + [0xc7] = X86_OP_ENTRY1(multi0F, nop,v, nolea), /* unconverted */ [0xd0] = X86_OP_ENTRY3(VADDSUB, V,x, H,x, W,x, vex2 cpuid(SSE3) p_66_f2), [0xd1] = X86_OP_ENTRY3(PSRLW_r, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), @@ -1243,6 +1246,8 @@ static const X86OpEntry opcodes_0F[256] = { [0x18] = X86_OP_ENTRY1(NOP, nop,v), /* prefetch/reserved NOP */ [0x19] = X86_OP_ENTRY1(NOP, nop,v), /* reserved NOP */ + [0x1a] = X86_OP_ENTRY1(multi0F, nop,v, nolea), /* unconverted MPX */ + [0x1b] = X86_OP_ENTRY1(multi0F, nop,v, nolea), /* unconverted MPX */ [0x1c] = X86_OP_ENTRY1(NOP, nop,v), /* reserved NOP */ [0x1d] = X86_OP_ENTRY1(NOP, nop,v), /* reserved NOP */ [0x1e] = X86_OP_ENTRY1(NOP, nop,v), /* reserved NOP */ @@ -1780,6 +1785,19 @@ static const X86OpEntry opcodes_root[256] = { [0xCE] = X86_OP_ENTRY0(INTO), [0xCF] = X86_OP_ENTRY0(IRET, chk(vm86_iopl) svm(IRET)), + /* + * x87 is nolea because it needs the address without segment base, + * in order to store it in fdp. + */ + [0xD8] = X86_OP_ENTRY1(x87, nop,v, nolea), + [0xD9] = X86_OP_ENTRY1(x87, nop,v, nolea), + [0xDA] = X86_OP_ENTRY1(x87, nop,v, nolea), + [0xDB] = X86_OP_ENTRY1(x87, nop,v, nolea), + [0xDC] = X86_OP_ENTRY1(x87, nop,v, nolea), + [0xDD] = X86_OP_ENTRY1(x87, nop,v, nolea), + [0xDE] = X86_OP_ENTRY1(x87, nop,v, nolea), + [0xDF] = X86_OP_ENTRY1(x87, nop,v, nolea), + [0xE8] = X86_OP_ENTRYr(CALL, J,z_f64), [0xE9] = X86_OP_ENTRYr(JMP, J,z_f64), [0xEA] = X86_OP_ENTRYrr(JMPF, I_unsigned,p, I_unsigned,w, chk(i64)), @@ -2612,30 +2630,6 @@ static void disas_insn(DisasContext *s, CPUState *cpu) } } - /* Go back to old decoder for unconverted opcodes. */ - if (!(s->prefix & PREFIX_VEX)) { - if ((b & ~7) == 0xd8) { - if (!disas_insn_x87(s, cpu, b)) { - goto unknown_op; - } - return; - } - - if (b == 0x0f) { - b = x86_ldub_code(env, s); - switch (b) { - case 0x00 ... 0x01: /* mostly privileged instructions */ - case 0x1a ... 
0x1b: /* MPX */ - case 0xc7: /* grp9 */ - disas_insn_old(s, cpu, b + 0x100); - return; - default: - decode_func = do_decode_0F; - break; - } - } - } - memset(&decode, 0, sizeof(decode)); decode.cc_op = -1; decode.b = b; @@ -2732,6 +2726,15 @@ static void disas_insn(DisasContext *s, CPUState *cpu) break; } + /* + * hack for old decoder: 0F C7 has both instructions that accept LOCK + * and instructions that don't, but also needs X86_SPECIAL_NoLoadEA. + * Keep this here until CMPXCHG8B/CMPXCHG16B is separated from the + * other unconverted opcodes. + */ + if (decode.e.gen == gen_multi0F) { + accept_lock = true; + } if ((s->prefix & PREFIX_LOCK) && !accept_lock) { goto illegal_op; } @@ -2779,7 +2782,7 @@ static void disas_insn(DisasContext *s, CPUState *cpu) if (decode.e.special != X86_SPECIAL_NoLoadEA && (decode.op[0].has_ea || decode.op[1].has_ea || decode.op[2].has_ea)) { - gen_load_ea(s, &decode); + gen_lea_modrm(s, &decode); } if (s->prefix & PREFIX_LOCK) { assert(decode.op[0].has_ea && !decode.op[2].has_ea); diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h index e4cdf5e3c4..bebc77bd54 100644 --- a/target/i386/tcg/decode-new.h +++ b/target/i386/tcg/decode-new.h @@ -264,12 +264,13 @@ typedef enum X86VEXSpecial { typedef struct X86OpEntry X86OpEntry; typedef struct X86DecodedInsn X86DecodedInsn; +struct DisasContext; /* Decode function for multibyte opcodes. */ -typedef void (*X86DecodeFunc)(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b); +typedef void (*X86DecodeFunc)(struct DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b); /* Code generation function. */ -typedef void (*X86GenFunc)(DisasContext *s, X86DecodedInsn *decode); +typedef void (*X86GenFunc)(struct DisasContext *s, X86DecodedInsn *decode); struct X86OpEntry { /* Based on the is_decode flags. */ @@ -316,6 +317,14 @@ typedef struct X86DecodedOp { }; } X86DecodedOp; +typedef struct AddressParts { + int def_seg; + int base; + int index; + int scale; + target_long disp; +} AddressParts; + struct X86DecodedInsn { X86OpEntry e; X86DecodedOp op[3]; @@ -333,3 +342,4 @@ struct X86DecodedInsn { uint8_t b; }; +static void gen_lea_modrm(struct DisasContext *s, X86DecodedInsn *decode); diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc index bb498464ea..29de8bba6f 100644 --- a/target/i386/tcg/emit.c.inc +++ b/target/i386/tcg/emit.c.inc @@ -78,7 +78,7 @@ static void gen_NM_exception(DisasContext *s) gen_exception(s, EXCP07_PREX); } -static void gen_load_ea(DisasContext *s, X86DecodedInsn *decode) +static void gen_lea_modrm(DisasContext *s, X86DecodedInsn *decode) { AddressParts *mem = &decode->mem; TCGv ea; diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 59b67042cd..be68cde1ba 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -29,6 +29,7 @@ #include "exec/helper-proto.h" #include "exec/helper-gen.h" #include "helper-tcg.h" +#include "decode-new.h" #include "exec/log.h" @@ -1520,14 +1521,6 @@ static inline uint64_t x86_ldq_code(CPUX86State *env, DisasContext *s) /* Decompose an address. 
*/ -typedef struct AddressParts { - int def_seg; - int base; - int index; - int scale; - target_long disp; -} AddressParts; - static AddressParts gen_lea_modrm_0(CPUX86State *env, DisasContext *s, int modrm, bool is_vsib) { @@ -1686,24 +1679,11 @@ static TCGv gen_lea_modrm_1(DisasContext *s, AddressParts a, bool is_vsib) return ea; } -static void gen_lea_modrm(CPUX86State *env, DisasContext *s, int modrm) -{ - AddressParts a = gen_lea_modrm_0(env, s, modrm, false); - TCGv ea = gen_lea_modrm_1(s, a, false); - gen_lea_v_seg(s, ea, a.def_seg, s->override); -} - -static void gen_nop_modrm(CPUX86State *env, DisasContext *s, int modrm) -{ - (void)gen_lea_modrm_0(env, s, modrm, false); -} - /* Used for BNDCL, BNDCU, BNDCN. */ -static void gen_bndck(CPUX86State *env, DisasContext *s, int modrm, +static void gen_bndck(DisasContext *s, X86DecodedInsn *decode, TCGCond cond, TCGv_i64 bndv) { - AddressParts a = gen_lea_modrm_0(env, s, modrm, false); - TCGv ea = gen_lea_modrm_1(s, a, false); + TCGv ea = gen_lea_modrm_1(s, decode->mem, false); tcg_gen_extu_tl_i64(s->tmp1_i64, ea); if (!CODE64(s)) { @@ -1715,8 +1695,9 @@ static void gen_bndck(CPUX86State *env, DisasContext *s, int modrm, } /* generate modrm load of memory or register. */ -static void gen_ld_modrm(CPUX86State *env, DisasContext *s, int modrm, MemOp ot) +static void gen_ld_modrm(DisasContext *s, X86DecodedInsn *decode, MemOp ot) { + int modrm = s->modrm; int mod, rm; mod = (modrm >> 6) & 3; @@ -1724,14 +1705,15 @@ static void gen_ld_modrm(CPUX86State *env, DisasContext *s, int modrm, MemOp ot) if (mod == 3) { gen_op_mov_v_reg(s, ot, s->T0, rm); } else { - gen_lea_modrm(env, s, modrm); + gen_lea_modrm(s, decode); gen_op_ld_v(s, ot, s->T0, s->A0); } } /* generate modrm store of memory or register. */ -static void gen_st_modrm(CPUX86State *env, DisasContext *s, int modrm, MemOp ot) +static void gen_st_modrm(DisasContext *s, X86DecodedInsn *decode, MemOp ot) { + int modrm = s->modrm; int mod, rm; mod = (modrm >> 6) & 3; @@ -1739,7 +1721,7 @@ static void gen_st_modrm(CPUX86State *env, DisasContext *s, int modrm, MemOp ot) if (mod == 3) { gen_op_mov_reg_v(s, ot, rm, s->T0); } else { - gen_lea_modrm(env, s, modrm); + gen_lea_modrm(s, decode); gen_op_st_v(s, ot, s->T0, s->A0); } } @@ -2307,12 +2289,12 @@ static void gen_sty_env_A0(DisasContext *s, int offset, bool align) tcg_gen_qemu_st_i128(t, s->tmp0, mem_index, mop); } -static void gen_cmpxchg8b(DisasContext *s, CPUX86State *env, int modrm) +static void gen_cmpxchg8b(DisasContext *s, X86DecodedInsn *decode) { TCGv_i64 cmp, val, old; TCGv Z; - gen_lea_modrm(env, s, modrm); + gen_lea_modrm(s, decode); cmp = tcg_temp_new_i64(); val = tcg_temp_new_i64(); @@ -2361,13 +2343,13 @@ static void gen_cmpxchg8b(DisasContext *s, CPUX86State *env, int modrm) } #ifdef TARGET_X86_64 -static void gen_cmpxchg16b(DisasContext *s, CPUX86State *env, int modrm) +static void gen_cmpxchg16b(DisasContext *s, X86DecodedInsn *decode) { MemOp mop = MO_TE | MO_128 | MO_ALIGN; TCGv_i64 t0, t1; TCGv_i128 cmp, val; - gen_lea_modrm(env, s, modrm); + gen_lea_modrm(s, decode); cmp = tcg_temp_new_i128(); val = tcg_temp_new_i128(); @@ -2405,31 +2387,32 @@ static void gen_cmpxchg16b(DisasContext *s, CPUX86State *env, int modrm) } #endif -static bool disas_insn_x87(DisasContext *s, CPUState *cpu, int b) +#include "emit.c.inc" + +static void gen_x87(DisasContext *s, X86DecodedInsn *decode) { - CPUX86State *env = cpu_env(cpu); bool update_fip = true; - int modrm, mod, rm, op; + int b = decode->b; + int modrm = s->modrm; + int mod, rm, 
op; if (s->flags & (HF_EM_MASK | HF_TS_MASK)) { /* if CR0.EM or CR0.TS are set, generate an FPU exception */ /* XXX: what to do if illegal op ? */ gen_exception(s, EXCP07_PREX); - return true; + return; } - modrm = x86_ldub_code(env, s); mod = (modrm >> 6) & 3; rm = modrm & 7; op = ((b & 7) << 3) | ((modrm >> 3) & 7); if (mod != 3) { /* memory op */ - AddressParts a = gen_lea_modrm_0(env, s, modrm, false); - TCGv ea = gen_lea_modrm_1(s, a, false); + TCGv ea = gen_lea_modrm_1(s, decode->mem, false); TCGv last_addr = tcg_temp_new(); bool update_fdp = true; tcg_gen_mov_tl(last_addr, ea); - gen_lea_v_seg(s, ea, a.def_seg, s->override); + gen_lea_v_seg(s, ea, decode->mem.def_seg, s->override); switch (op) { case 0x00 ... 0x07: /* fxxxs */ @@ -2619,11 +2602,11 @@ static bool disas_insn_x87(DisasContext *s, CPUState *cpu, int b) gen_helper_fpop(tcg_env); break; default: - return false; + goto illegal_op; } if (update_fdp) { - int last_seg = s->override >= 0 ? s->override : a.def_seg; + int last_seg = s->override >= 0 ? s->override : decode->mem.def_seg; tcg_gen_ld_i32(s->tmp2_i32, tcg_env, offsetof(CPUX86State, @@ -2660,7 +2643,7 @@ static bool disas_insn_x87(DisasContext *s, CPUState *cpu, int b) update_fip = false; break; default: - return false; + goto illegal_op; } break; case 0x0c: /* grp d9/4 */ @@ -2679,7 +2662,7 @@ static bool disas_insn_x87(DisasContext *s, CPUState *cpu, int b) gen_helper_fxam_ST0(tcg_env); break; default: - return false; + goto illegal_op; } break; case 0x0d: /* grp d9/5 */ @@ -2714,7 +2697,7 @@ static bool disas_insn_x87(DisasContext *s, CPUState *cpu, int b) gen_helper_fldz_ST0(tcg_env); break; default: - return false; + goto illegal_op; } } break; @@ -2816,7 +2799,7 @@ static bool disas_insn_x87(DisasContext *s, CPUState *cpu, int b) gen_helper_fpop(tcg_env); break; default: - return false; + goto illegal_op; } break; case 0x1c: @@ -2836,7 +2819,7 @@ static bool disas_insn_x87(DisasContext *s, CPUState *cpu, int b) case 4: /* fsetpm (287 only, just do nop here) */ break; default: - return false; + goto illegal_op; } break; case 0x1d: /* fucomi */ @@ -2888,7 +2871,7 @@ static bool disas_insn_x87(DisasContext *s, CPUState *cpu, int b) gen_helper_fpop(tcg_env); break; default: - return false; + goto illegal_op; } break; case 0x38: /* ffreep sti, undocumented op */ @@ -2903,7 +2886,7 @@ static bool disas_insn_x87(DisasContext *s, CPUState *cpu, int b) gen_op_mov_reg_v(s, MO_16, R_EAX, s->T0); break; default: - return false; + goto illegal_op; } break; case 0x3d: /* fucomip */ @@ -2950,7 +2933,7 @@ static bool disas_insn_x87(DisasContext *s, CPUState *cpu, int b) } break; default: - return false; + goto illegal_op; } } @@ -2962,25 +2945,24 @@ static bool disas_insn_x87(DisasContext *s, CPUState *cpu, int b) tcg_gen_st_tl(eip_cur_tl(s), tcg_env, offsetof(CPUX86State, fpip)); } - return true; + return; illegal_op: gen_illegal_opcode(s); - return true; } -static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) +static void gen_multi0F(DisasContext *s, X86DecodedInsn *decode) { - CPUX86State *env = cpu_env(cpu); int prefixes = s->prefix; MemOp dflag = s->dflag; + int b = decode->b + 0x100; + int modrm = s->modrm; MemOp ot; - int modrm, reg, rm, mod, op; + int reg, rm, mod, op; /* now check op code */ switch (b) { case 0x1c7: /* cmpxchg8b */ - modrm = x86_ldub_code(env, s); mod = (modrm >> 6) & 3; switch ((modrm >> 3) & 7) { case 1: /* CMPXCHG8, CMPXCHG16 */ @@ -2992,14 +2974,14 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) if 
(!(s->cpuid_ext_features & CPUID_EXT_CX16)) { goto illegal_op; } - gen_cmpxchg16b(s, env, modrm); + gen_cmpxchg16b(s, decode); break; } #endif if (!(s->cpuid_features & CPUID_CX8)) { goto illegal_op; } - gen_cmpxchg8b(s, env, modrm); + gen_cmpxchg8b(s, decode); break; case 7: /* RDSEED, RDPID with f3 prefix */ @@ -3042,7 +3024,6 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) break; case 0x100: - modrm = x86_ldub_code(env, s); mod = (modrm >> 6) & 3; op = (modrm >> 3) & 7; switch(op) { @@ -3056,14 +3037,14 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) tcg_gen_ld32u_tl(s->T0, tcg_env, offsetof(CPUX86State, ldt.selector)); ot = mod == 3 ? dflag : MO_16; - gen_st_modrm(env, s, modrm, ot); + gen_st_modrm(s, decode, ot); break; case 2: /* lldt */ if (!PE(s) || VM86(s)) goto illegal_op; if (check_cpl0(s)) { gen_svm_check_intercept(s, SVM_EXIT_LDTR_WRITE); - gen_ld_modrm(env, s, modrm, MO_16); + gen_ld_modrm(s, decode, MO_16); tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0); gen_helper_lldt(tcg_env, s->tmp2_i32); } @@ -3078,14 +3059,14 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) tcg_gen_ld32u_tl(s->T0, tcg_env, offsetof(CPUX86State, tr.selector)); ot = mod == 3 ? dflag : MO_16; - gen_st_modrm(env, s, modrm, ot); + gen_st_modrm(s, decode, ot); break; case 3: /* ltr */ if (!PE(s) || VM86(s)) goto illegal_op; if (check_cpl0(s)) { gen_svm_check_intercept(s, SVM_EXIT_TR_WRITE); - gen_ld_modrm(env, s, modrm, MO_16); + gen_ld_modrm(s, decode, MO_16); tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0); gen_helper_ltr(tcg_env, s->tmp2_i32); } @@ -3094,7 +3075,7 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) case 5: /* verw */ if (!PE(s) || VM86(s)) goto illegal_op; - gen_ld_modrm(env, s, modrm, MO_16); + gen_ld_modrm(s, decode, MO_16); gen_update_cc_op(s); if (op == 4) { gen_helper_verr(tcg_env, s->T0); @@ -3104,19 +3085,18 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) assume_cc_op(s, CC_OP_EFLAGS); break; default: - goto unknown_op; + goto illegal_op; } break; case 0x101: - modrm = x86_ldub_code(env, s); switch (modrm) { CASE_MODRM_MEM_OP(0): /* sgdt */ if (s->flags & HF_UMIP_MASK && !check_cpl0(s)) { break; } gen_svm_check_intercept(s, SVM_EXIT_GDTR_READ); - gen_lea_modrm(env, s, modrm); + gen_lea_modrm(s, decode); tcg_gen_ld32u_tl(s->T0, tcg_env, offsetof(CPUX86State, gdt.limit)); gen_op_st_v(s, MO_16, s->T0, s->A0); @@ -3172,7 +3152,7 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) break; } gen_svm_check_intercept(s, SVM_EXIT_IDTR_READ); - gen_lea_modrm(env, s, modrm); + gen_lea_modrm(s, decode); tcg_gen_ld32u_tl(s->T0, tcg_env, offsetof(CPUX86State, idt.limit)); gen_op_st_v(s, MO_16, s->T0, s->A0); gen_add_A0_im(s, 2); @@ -3322,7 +3302,7 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) break; } gen_svm_check_intercept(s, SVM_EXIT_GDTR_WRITE); - gen_lea_modrm(env, s, modrm); + gen_lea_modrm(s, decode); gen_op_ld_v(s, MO_16, s->T1, s->A0); gen_add_A0_im(s, 2); gen_op_ld_v(s, CODE64(s) + MO_32, s->T0, s->A0); @@ -3338,7 +3318,7 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) break; } gen_svm_check_intercept(s, SVM_EXIT_IDTR_WRITE); - gen_lea_modrm(env, s, modrm); + gen_lea_modrm(s, decode); gen_op_ld_v(s, MO_16, s->T1, s->A0); gen_add_A0_im(s, 2); gen_op_ld_v(s, CODE64(s) + MO_32, s->T0, s->A0); @@ -3362,7 +3342,7 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) */ mod = (modrm >> 6) & 3; ot = (mod != 3 ? 
MO_16 : s->dflag); - gen_st_modrm(env, s, modrm, ot); + gen_st_modrm(s, decode, ot); break; case 0xee: /* rdpkru */ if (s->prefix & (PREFIX_LOCK | PREFIX_DATA @@ -3389,7 +3369,7 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) break; } gen_svm_check_intercept(s, SVM_EXIT_WRITE_CR0); - gen_ld_modrm(env, s, modrm, MO_16); + gen_ld_modrm(s, decode, MO_16); /* * Only the 4 lower bits of CR0 are modified. * PE cannot be set to zero if already set to one. @@ -3407,7 +3387,7 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) break; } gen_svm_check_intercept(s, SVM_EXIT_INVLPG); - gen_lea_modrm(env, s, modrm); + gen_lea_modrm(s, decode); gen_helper_flush_page(tcg_env, s->A0); s->base.is_jmp = DISAS_EOB_NEXT; break; @@ -3440,12 +3420,11 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) break; default: - goto unknown_op; + goto illegal_op; } break; case 0x11a: - modrm = x86_ldub_code(env, s); if (s->flags & HF_MPX_EN_MASK) { mod = (modrm >> 6) & 3; reg = ((modrm >> 3) & 7) | REX_R(s); @@ -3456,7 +3435,7 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) || s->aflag == MO_16) { goto illegal_op; } - gen_bndck(env, s, modrm, TCG_COND_LTU, cpu_bndl[reg]); + gen_bndck(s, decode, TCG_COND_LTU, cpu_bndl[reg]); } else if (prefixes & PREFIX_REPNZ) { /* bndcu */ if (reg >= 4 @@ -3466,7 +3445,7 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) } TCGv_i64 notu = tcg_temp_new_i64(); tcg_gen_not_i64(notu, cpu_bndu[reg]); - gen_bndck(env, s, modrm, TCG_COND_GTU, notu); + gen_bndck(s, decode, TCG_COND_GTU, notu); } else if (prefixes & PREFIX_DATA) { /* bndmov -- from reg/mem */ if (reg >= 4 || s->aflag == MO_16) { @@ -3482,7 +3461,7 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) tcg_gen_mov_i64(cpu_bndu[reg], cpu_bndu[reg2]); } } else { - gen_lea_modrm(env, s, modrm); + gen_lea_modrm(s, decode); if (CODE64(s)) { tcg_gen_qemu_ld_i64(cpu_bndl[reg], s->A0, s->mem_index, MO_LEUQ); @@ -3501,7 +3480,7 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) } } else if (mod != 3) { /* bndldx */ - AddressParts a = gen_lea_modrm_0(env, s, modrm, false); + AddressParts a = decode->mem; if (reg >= 4 || (prefixes & PREFIX_LOCK) || s->aflag == MO_16 @@ -3531,10 +3510,8 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) gen_set_hflag(s, HF_MPX_IU_MASK); } } - gen_nop_modrm(env, s, modrm); break; case 0x11b: - modrm = x86_ldub_code(env, s); if (s->flags & HF_MPX_EN_MASK) { mod = (modrm >> 6) & 3; reg = ((modrm >> 3) & 7) | REX_R(s); @@ -3545,7 +3522,7 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) || s->aflag == MO_16) { goto illegal_op; } - AddressParts a = gen_lea_modrm_0(env, s, modrm, false); + AddressParts a = decode->mem; if (a.base >= 0) { tcg_gen_extu_tl_i64(cpu_bndl[reg], cpu_regs[a.base]); if (!CODE64(s)) { @@ -3558,7 +3535,7 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) /* rip-relative generates #ud */ goto illegal_op; } - tcg_gen_not_tl(s->A0, gen_lea_modrm_1(s, a, false)); + tcg_gen_not_tl(s->A0, gen_lea_modrm_1(s, decode->mem, false)); if (!CODE64(s)) { tcg_gen_ext32u_tl(s->A0, s->A0); } @@ -3573,7 +3550,7 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) || s->aflag == MO_16) { goto illegal_op; } - gen_bndck(env, s, modrm, TCG_COND_GTU, cpu_bndu[reg]); + gen_bndck(s, decode, TCG_COND_GTU, cpu_bndu[reg]); } else if (prefixes & PREFIX_DATA) { /* bndmov -- to reg/mem */ if (reg >= 4 || s->aflag == MO_16) { @@ 
-3589,7 +3566,7 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) tcg_gen_mov_i64(cpu_bndu[reg2], cpu_bndu[reg]); } } else { - gen_lea_modrm(env, s, modrm); + gen_lea_modrm(s, decode); if (CODE64(s)) { tcg_gen_qemu_st_i64(cpu_bndl[reg], s->A0, s->mem_index, MO_LEUQ); @@ -3606,7 +3583,7 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) } } else if (mod != 3) { /* bndstx */ - AddressParts a = gen_lea_modrm_0(env, s, modrm, false); + AddressParts a = decode->mem; if (reg >= 4 || (prefixes & PREFIX_LOCK) || s->aflag == MO_16 @@ -3633,7 +3610,6 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) } } } - gen_nop_modrm(env, s, modrm); break; default: g_assert_not_reached(); @@ -3642,12 +3618,8 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b) illegal_op: gen_illegal_opcode(s); return; - unknown_op: - gen_unknown_opcode(env, s); } -#include "decode-new.h" -#include "emit.c.inc" #include "decode-new.c.inc" void tcg_x86_init(void) From fcd16539ebfe2c9e4cd568e6ce5da566a027a524 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 10 Jun 2024 10:39:00 +0200 Subject: [PATCH 14/26] target/i386: convert CMPXCHG8B/CMPXCHG16B to new decoder The gen_cmpxchg8b and gen_cmpxchg16b functions even have the correct prototype already; the only thing that needs to be done is removing the gen_lea_modrm() call. This moves the last LOCK-enabled instructions to the new decoder. It is now possible to assume that gen_multi0F is called only after checking that PREFIX_LOCK was not specified. Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- target/i386/tcg/decode-new.c.inc | 34 ++++++--- target/i386/tcg/decode-new.h | 2 + target/i386/tcg/emit.c.inc | 96 ++++++++++++++++++++++++ target/i386/tcg/translate.c | 121 +------------------------------ 4 files changed, 124 insertions(+), 129 deletions(-) diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc index ee3ba16116..fe3bfed147 100644 --- a/target/i386/tcg/decode-new.c.inc +++ b/target/i386/tcg/decode-new.c.inc @@ -288,6 +288,25 @@ static void decode_group8(DisasContext *s, CPUX86State *env, X86OpEntry *entry, } } +static void decode_group9(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) +{ + static const X86OpEntry group9_reg = + X86_OP_ENTRY0(multi0F); /* unconverted */ + static const X86OpEntry cmpxchg8b = + X86_OP_ENTRY1(CMPXCHG8B, M,q, lock p_00 cpuid(CX8)); + static const X86OpEntry cmpxchg16b = + X86_OP_ENTRY1(CMPXCHG16B, M,dq, lock p_00 cpuid(CX16)); + + int modrm = get_modrm(s, env); + int op = (modrm >> 3) & 7; + + if ((modrm >> 6) == 3) { + *entry = group9_reg; + } else if (op == 1) { + *entry = REX_W(s) ? 
cmpxchg16b : cmpxchg8b; + } +} + static void decode_group15(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b) { static const X86OpEntry group15_reg[8] = { @@ -1203,7 +1222,7 @@ static const X86OpEntry opcodes_0F[256] = { [0xc4] = X86_OP_ENTRY4(PINSRW, V,dq,H,dq,E,w, vex5 mmx p_00_66), [0xc5] = X86_OP_ENTRY3(PEXTRW, G,d, U,dq,I,b, vex5 mmx p_00_66), [0xc6] = X86_OP_ENTRY4(VSHUF, V,x, H,x, W,x, vex4 p_00_66), - [0xc7] = X86_OP_ENTRY1(multi0F, nop,v, nolea), /* unconverted */ + [0xc7] = X86_OP_GROUP0(group9), [0xd0] = X86_OP_ENTRY3(VADDSUB, V,x, H,x, W,x, vex2 cpuid(SSE3) p_66_f2), [0xd1] = X86_OP_ENTRY3(PSRLW_r, V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66), @@ -2245,8 +2264,12 @@ static bool has_cpuid_feature(DisasContext *s, X86CPUIDFeature cpuid) return (s->cpuid_features & CPUID_CMOV); case X86_FEAT_CLFLUSH: return (s->cpuid_features & CPUID_CLFLUSH); + case X86_FEAT_CX8: + return (s->cpuid_features & CPUID_CX8); case X86_FEAT_FXSR: return (s->cpuid_features & CPUID_FXSR); + case X86_FEAT_CX16: + return (s->cpuid_ext_features & CPUID_EXT_CX16); case X86_FEAT_F16C: return (s->cpuid_ext_features & CPUID_EXT_F16C); case X86_FEAT_FMA: @@ -2726,15 +2749,6 @@ static void disas_insn(DisasContext *s, CPUState *cpu) break; } - /* - * hack for old decoder: 0F C7 has both instructions that accept LOCK - * and instructions that don't, but also needs X86_SPECIAL_NoLoadEA. - * Keep this here until CMPXCHG8B/CMPXCHG16B is separated from the - * other unconverted opcodes. - */ - if (decode.e.gen == gen_multi0F) { - accept_lock = true; - } if ((s->prefix & PREFIX_LOCK) && !accept_lock) { goto illegal_op; } diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h index bebc77bd54..7f23d373ea 100644 --- a/target/i386/tcg/decode-new.h +++ b/target/i386/tcg/decode-new.h @@ -114,6 +114,8 @@ typedef enum X86CPUIDFeature { X86_FEAT_CLWB, X86_FEAT_CMOV, X86_FEAT_CMPCCXADD, + X86_FEAT_CX8, + X86_FEAT_CX16, X86_FEAT_F16C, X86_FEAT_FMA, X86_FEAT_FSGSBASE, diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc index 29de8bba6f..fd17a9b1ec 100644 --- a/target/i386/tcg/emit.c.inc +++ b/target/i386/tcg/emit.c.inc @@ -1788,6 +1788,102 @@ static void gen_CMPXCHG(DisasContext *s, X86DecodedInsn *decode) decode->cc_op = CC_OP_SUBB + ot; } +static void gen_CMPXCHG16B(DisasContext *s, X86DecodedInsn *decode) +{ +#ifdef TARGET_X86_64 + MemOp mop = MO_TE | MO_128 | MO_ALIGN; + TCGv_i64 t0, t1; + TCGv_i128 cmp, val; + + cmp = tcg_temp_new_i128(); + val = tcg_temp_new_i128(); + tcg_gen_concat_i64_i128(cmp, cpu_regs[R_EAX], cpu_regs[R_EDX]); + tcg_gen_concat_i64_i128(val, cpu_regs[R_EBX], cpu_regs[R_ECX]); + + /* Only require atomic with LOCK; non-parallel handled in generator. */ + if (s->prefix & PREFIX_LOCK) { + tcg_gen_atomic_cmpxchg_i128(val, s->A0, cmp, val, s->mem_index, mop); + } else { + tcg_gen_nonatomic_cmpxchg_i128(val, s->A0, cmp, val, s->mem_index, mop); + } + + tcg_gen_extr_i128_i64(s->T0, s->T1, val); + + /* Determine success after the fact. */ + t0 = tcg_temp_new_i64(); + t1 = tcg_temp_new_i64(); + tcg_gen_xor_i64(t0, s->T0, cpu_regs[R_EAX]); + tcg_gen_xor_i64(t1, s->T1, cpu_regs[R_EDX]); + tcg_gen_or_i64(t0, t0, t1); + + /* Update Z. */ + gen_compute_eflags(s); + tcg_gen_setcondi_i64(TCG_COND_EQ, t0, t0, 0); + tcg_gen_deposit_tl(cpu_cc_src, cpu_cc_src, t0, ctz32(CC_Z), 1); + + /* + * Extract the result values for the register pair. We may do this + * unconditionally, because on success (Z=1), the old value matches + * the previous value in RDX:RAX. 
+ */ + tcg_gen_mov_i64(cpu_regs[R_EAX], s->T0); + tcg_gen_mov_i64(cpu_regs[R_EDX], s->T1); +#else + abort(); +#endif +} + +static void gen_CMPXCHG8B(DisasContext *s, X86DecodedInsn *decode) +{ + TCGv_i64 cmp, val, old; + TCGv Z; + + cmp = tcg_temp_new_i64(); + val = tcg_temp_new_i64(); + old = tcg_temp_new_i64(); + + /* Construct the comparison values from the register pair. */ + tcg_gen_concat_tl_i64(cmp, cpu_regs[R_EAX], cpu_regs[R_EDX]); + tcg_gen_concat_tl_i64(val, cpu_regs[R_EBX], cpu_regs[R_ECX]); + + /* Only require atomic with LOCK; non-parallel handled in generator. */ + if (s->prefix & PREFIX_LOCK) { + tcg_gen_atomic_cmpxchg_i64(old, s->A0, cmp, val, s->mem_index, MO_TEUQ); + } else { + tcg_gen_nonatomic_cmpxchg_i64(old, s->A0, cmp, val, + s->mem_index, MO_TEUQ); + } + + /* Set tmp0 to match the required value of Z. */ + tcg_gen_setcond_i64(TCG_COND_EQ, cmp, old, cmp); + Z = tcg_temp_new(); + tcg_gen_trunc_i64_tl(Z, cmp); + + /* + * Extract the result values for the register pair. + * For 32-bit, we may do this unconditionally, because on success (Z=1), + * the old value matches the previous value in EDX:EAX. For x86_64, + * the store must be conditional, because we must leave the source + * registers unchanged on success, and zero-extend the writeback + * on failure (Z=0). + */ + if (TARGET_LONG_BITS == 32) { + tcg_gen_extr_i64_tl(cpu_regs[R_EAX], cpu_regs[R_EDX], old); + } else { + TCGv zero = tcg_constant_tl(0); + + tcg_gen_extr_i64_tl(s->T0, s->T1, old); + tcg_gen_movcond_tl(TCG_COND_EQ, cpu_regs[R_EAX], Z, zero, + s->T0, cpu_regs[R_EAX]); + tcg_gen_movcond_tl(TCG_COND_EQ, cpu_regs[R_EDX], Z, zero, + s->T1, cpu_regs[R_EDX]); + } + + /* Update Z. */ + gen_compute_eflags(s); + tcg_gen_deposit_tl(cpu_cc_src, cpu_cc_src, Z, ctz32(CC_Z), 1); +} + static void gen_CPUID(DisasContext *s, X86DecodedInsn *decode) { gen_update_cc_op(s); diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index be68cde1ba..1d3b5f35c3 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -2289,104 +2289,6 @@ static void gen_sty_env_A0(DisasContext *s, int offset, bool align) tcg_gen_qemu_st_i128(t, s->tmp0, mem_index, mop); } -static void gen_cmpxchg8b(DisasContext *s, X86DecodedInsn *decode) -{ - TCGv_i64 cmp, val, old; - TCGv Z; - - gen_lea_modrm(s, decode); - - cmp = tcg_temp_new_i64(); - val = tcg_temp_new_i64(); - old = tcg_temp_new_i64(); - - /* Construct the comparison values from the register pair. */ - tcg_gen_concat_tl_i64(cmp, cpu_regs[R_EAX], cpu_regs[R_EDX]); - tcg_gen_concat_tl_i64(val, cpu_regs[R_EBX], cpu_regs[R_ECX]); - - /* Only require atomic with LOCK; non-parallel handled in generator. */ - if (s->prefix & PREFIX_LOCK) { - tcg_gen_atomic_cmpxchg_i64(old, s->A0, cmp, val, s->mem_index, MO_TEUQ); - } else { - tcg_gen_nonatomic_cmpxchg_i64(old, s->A0, cmp, val, - s->mem_index, MO_TEUQ); - } - - /* Set tmp0 to match the required value of Z. */ - tcg_gen_setcond_i64(TCG_COND_EQ, cmp, old, cmp); - Z = tcg_temp_new(); - tcg_gen_trunc_i64_tl(Z, cmp); - - /* - * Extract the result values for the register pair. - * For 32-bit, we may do this unconditionally, because on success (Z=1), - * the old value matches the previous value in EDX:EAX. For x86_64, - * the store must be conditional, because we must leave the source - * registers unchanged on success, and zero-extend the writeback - * on failure (Z=0). 
- */ - if (TARGET_LONG_BITS == 32) { - tcg_gen_extr_i64_tl(cpu_regs[R_EAX], cpu_regs[R_EDX], old); - } else { - TCGv zero = tcg_constant_tl(0); - - tcg_gen_extr_i64_tl(s->T0, s->T1, old); - tcg_gen_movcond_tl(TCG_COND_EQ, cpu_regs[R_EAX], Z, zero, - s->T0, cpu_regs[R_EAX]); - tcg_gen_movcond_tl(TCG_COND_EQ, cpu_regs[R_EDX], Z, zero, - s->T1, cpu_regs[R_EDX]); - } - - /* Update Z. */ - gen_compute_eflags(s); - tcg_gen_deposit_tl(cpu_cc_src, cpu_cc_src, Z, ctz32(CC_Z), 1); -} - -#ifdef TARGET_X86_64 -static void gen_cmpxchg16b(DisasContext *s, X86DecodedInsn *decode) -{ - MemOp mop = MO_TE | MO_128 | MO_ALIGN; - TCGv_i64 t0, t1; - TCGv_i128 cmp, val; - - gen_lea_modrm(s, decode); - - cmp = tcg_temp_new_i128(); - val = tcg_temp_new_i128(); - tcg_gen_concat_i64_i128(cmp, cpu_regs[R_EAX], cpu_regs[R_EDX]); - tcg_gen_concat_i64_i128(val, cpu_regs[R_EBX], cpu_regs[R_ECX]); - - /* Only require atomic with LOCK; non-parallel handled in generator. */ - if (s->prefix & PREFIX_LOCK) { - tcg_gen_atomic_cmpxchg_i128(val, s->A0, cmp, val, s->mem_index, mop); - } else { - tcg_gen_nonatomic_cmpxchg_i128(val, s->A0, cmp, val, s->mem_index, mop); - } - - tcg_gen_extr_i128_i64(s->T0, s->T1, val); - - /* Determine success after the fact. */ - t0 = tcg_temp_new_i64(); - t1 = tcg_temp_new_i64(); - tcg_gen_xor_i64(t0, s->T0, cpu_regs[R_EAX]); - tcg_gen_xor_i64(t1, s->T1, cpu_regs[R_EDX]); - tcg_gen_or_i64(t0, t0, t1); - - /* Update Z. */ - gen_compute_eflags(s); - tcg_gen_setcondi_i64(TCG_COND_EQ, t0, t0, 0); - tcg_gen_deposit_tl(cpu_cc_src, cpu_cc_src, t0, ctz32(CC_Z), 1); - - /* - * Extract the result values for the register pair. We may do this - * unconditionally, because on success (Z=1), the old value matches - * the previous value in RDX:RAX. - */ - tcg_gen_mov_i64(cpu_regs[R_EAX], s->T0); - tcg_gen_mov_i64(cpu_regs[R_EDX], s->T1); -} -#endif - #include "emit.c.inc" static void gen_x87(DisasContext *s, X86DecodedInsn *decode) @@ -2962,29 +2864,10 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn *decode) /* now check op code */ switch (b) { - case 0x1c7: /* cmpxchg8b */ + case 0x1c7: /* RDSEED, RDPID with f3 prefix */ mod = (modrm >> 6) & 3; switch ((modrm >> 3) & 7) { - case 1: /* CMPXCHG8, CMPXCHG16 */ - if (mod == 3) { - goto illegal_op; - } -#ifdef TARGET_X86_64 - if (dflag == MO_64) { - if (!(s->cpuid_ext_features & CPUID_EXT_CX16)) { - goto illegal_op; - } - gen_cmpxchg16b(s, decode); - break; - } -#endif - if (!(s->cpuid_features & CPUID_CX8)) { - goto illegal_op; - } - gen_cmpxchg8b(s, decode); - break; - - case 7: /* RDSEED, RDPID with f3 prefix */ + case 7: if (mod != 3 || (s->prefix & (PREFIX_LOCK | PREFIX_REPNZ))) { goto illegal_op; From f091a3f3247440d75561d1e5f2ee667ea10ebfeb Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 9 May 2024 15:58:02 +0200 Subject: [PATCH 15/26] target/i386: do not check PREFIX_LOCK in old-style decoder It is already checked before getting there. 
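As a rough illustration (not from the QEMU sources; the names and the prefix value below are made up for the sketch), the centralized check that makes the old-decoder test redundant has this shape:

    #include <stdbool.h>

    #define PREFIX_LOCK 0x04            /* illustrative constant only */

    /*
     * Validate the LOCK prefix once, up front: it is only legal when the
     * decode-table entry opts in and the destination is in memory.  Any
     * instruction that reaches the legacy decoder afterwards has already
     * passed (or skipped) this check, so rechecking PREFIX_LOCK there is
     * dead code.
     */
    static bool lock_prefix_valid(int prefixes, bool entry_accepts_lock,
                                  bool has_memory_operand)
    {
        if (!(prefixes & PREFIX_LOCK)) {
            return true;
        }
        return entry_accepts_lock && has_memory_operand;
    }
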
Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- target/i386/tcg/translate.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 1d3b5f35c3..f4bffef9e2 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -2869,7 +2869,7 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn *decode) switch ((modrm >> 3) & 7) { case 7: if (mod != 3 || - (s->prefix & (PREFIX_LOCK | PREFIX_REPNZ))) { + (s->prefix & PREFIX_REPNZ)) { goto illegal_op; } if (s->prefix & PREFIX_REPZ) { @@ -2889,7 +2889,7 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn *decode) case 6: /* RDRAND */ if (mod != 3 || - (s->prefix & (PREFIX_LOCK | PREFIX_REPZ | PREFIX_REPNZ)) || + (s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) || !(s->cpuid_ext_features & CPUID_EXT_RDRAND)) { goto illegal_op; } @@ -3049,8 +3049,7 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn *decode) case 0xd0: /* xgetbv */ if ((s->cpuid_ext_features & CPUID_EXT_XSAVE) == 0 - || (s->prefix & (PREFIX_LOCK | PREFIX_DATA - | PREFIX_REPZ | PREFIX_REPNZ))) { + || (s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ))) { goto illegal_op; } tcg_gen_trunc_tl_i32(s->tmp2_i32, cpu_regs[R_ECX]); @@ -3060,8 +3059,7 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn *decode) case 0xd1: /* xsetbv */ if ((s->cpuid_ext_features & CPUID_EXT_XSAVE) == 0 - || (s->prefix & (PREFIX_LOCK | PREFIX_DATA - | PREFIX_REPZ | PREFIX_REPNZ))) { + || (s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ))) { goto illegal_op; } gen_svm_check_intercept(s, SVM_EXIT_XSETBV); @@ -3228,8 +3226,7 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn *decode) gen_st_modrm(s, decode, ot); break; case 0xee: /* rdpkru */ - if (s->prefix & (PREFIX_LOCK | PREFIX_DATA - | PREFIX_REPZ | PREFIX_REPNZ)) { + if (s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ)) { goto illegal_op; } tcg_gen_trunc_tl_i32(s->tmp2_i32, cpu_regs[R_ECX]); @@ -3237,8 +3234,7 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn *decode) tcg_gen_extr_i64_tl(cpu_regs[R_EAX], cpu_regs[R_EDX], s->tmp1_i64); break; case 0xef: /* wrpkru */ - if (s->prefix & (PREFIX_LOCK | PREFIX_DATA - | PREFIX_REPZ | PREFIX_REPNZ)) { + if (s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ)) { goto illegal_op; } tcg_gen_concat_tl_i64(s->tmp1_i64, cpu_regs[R_EAX], @@ -3314,7 +3310,6 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn *decode) if (prefixes & PREFIX_REPZ) { /* bndcl */ if (reg >= 4 - || (prefixes & PREFIX_LOCK) || s->aflag == MO_16) { goto illegal_op; } @@ -3322,7 +3317,6 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn *decode) } else if (prefixes & PREFIX_REPNZ) { /* bndcu */ if (reg >= 4 - || (prefixes & PREFIX_LOCK) || s->aflag == MO_16) { goto illegal_op; } @@ -3336,7 +3330,7 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn *decode) } if (mod == 3) { int reg2 = (modrm & 7) | REX_B(s); - if (reg2 >= 4 || (prefixes & PREFIX_LOCK)) { + if (reg2 >= 4) { goto illegal_op; } if (s->flags & HF_MPX_IU_MASK) { @@ -3365,7 +3359,6 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn *decode) /* bndldx */ AddressParts a = decode->mem; if (reg >= 4 - || (prefixes & PREFIX_LOCK) || s->aflag == MO_16 || a.base < -1) { goto illegal_op; @@ -3401,7 +3394,6 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn *decode) if (mod != 3 && (prefixes & PREFIX_REPZ)) { /* bndmk */ if (reg >= 4 - || (prefixes & PREFIX_LOCK) || s->aflag 
== MO_16) { goto illegal_op; } @@ -3429,7 +3421,6 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn *decode) } else if (prefixes & PREFIX_REPNZ) { /* bndcn */ if (reg >= 4 - || (prefixes & PREFIX_LOCK) || s->aflag == MO_16) { goto illegal_op; } @@ -3441,7 +3432,7 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn *decode) } if (mod == 3) { int reg2 = (modrm & 7) | REX_B(s); - if (reg2 >= 4 || (prefixes & PREFIX_LOCK)) { + if (reg2 >= 4) { goto illegal_op; } if (s->flags & HF_MPX_IU_MASK) { @@ -3468,7 +3459,6 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn *decode) /* bndstx */ AddressParts a = decode->mem; if (reg >= 4 - || (prefixes & PREFIX_LOCK) || s->aflag == MO_16 || a.base < -1) { goto illegal_op; From 7e62a554afba229ef472119ebd93079838978cd0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 7 May 2024 09:51:32 +0200 Subject: [PATCH 16/26] target/i386: list instructions still in translate.c Group them so that it is easier to figure out which two-byte opcodes to tackle together. Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- target/i386/tcg/decode-new.c.inc | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc index fe3bfed147..487c376032 100644 --- a/target/i386/tcg/decode-new.c.inc +++ b/target/i386/tcg/decode-new.c.inc @@ -129,6 +129,37 @@ * * (^) these are the two cases in which Intel and AMD disagree on the * primary exception class + * + * Instructions still in translate.c + * --------------------------------- + * Generation of TCG opcodes for almost all instructions is in emit.c.inc; + * this file interprets the prefixes and opcode bytes down to individual + * instruction mnemonics. There is only a handful of opcodes still using + * a switch statement to decode modrm bits 3-5 and prefixes after decoding + * is complete; these are relics of the older x86 decoder and their code + * generation is performed in translate.c. + * + * These unconverted opcodes also perform their own effective address + * generation using the gen_lea_modrm() function. + * + * There is nothing particularly complicated about them; simply, they don't + * need any nasty hacks in the decoder, and they shouldn't get in the way + * of the implementation of new x86 instructions, so they are left alone + * for the time being. + * + * x87: + * 0xD8 - 0xDF + * + * privileged/system: + * 0x0F 0x00 group 6 (SLDT, STR, LLDT, LTR, VERR, VERW) + * 0x0F 0x01 group 7 (SGDT, SIDT, LGDT, LIDT, SMSW, LMSW, INVLPG, + * MONITOR, MWAIT, CLAC, STAC, XGETBV, XSETBV, + * SWAPGS, RDTSCP) + * 0x0F 0xC7 (reg operand) group 9 (RDRAND, RDSEED, RDPID) + * + * MPX: + * 0x0F 0x1A BNDLDX, BNDMOV, BNDCL, BNDCU + * 0x0F 0x1B BNDSTX, BNDMOV, BNDMK, BNDCN */ #define X86_OP_NONE { 0 }, From ac92afd19e4017b6973f06a760b9c61ff9fc63c4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 9 May 2024 15:53:11 +0200 Subject: [PATCH 17/26] target/i386: assert that cc_op* and pc_save are preserved Now all decoding has been done before any code generation. There is no need anymore to save and restore cc_op* and pc_save but, for the time being, assert that this is indeed the case. 
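To make the intent concrete, here is a minimal standalone sketch (not the QEMU code; 'state' stands in for cc_op, cc_op_dirty and pc_save) of turning a defensive restore into an assertion once nothing before the checkpoint may modify the saved state:

    #include <assert.h>

    struct disas { int state; };

    static void decode_only(struct disas *dc)   { (void)dc;    /* must not touch state */ }
    static void generate_code(struct disas *dc) { dc->state++; /* may touch state */ }

    static void translate_one(struct disas *dc)
    {
        int orig_state = dc->state;

        decode_only(dc);
        /* Previously: dc->state = orig_state;   (silent restore, hides bugs)
         * Now:        assert(...)               (fail loudly if decoding dirtied it) */
        assert(dc->state == orig_state);

        generate_code(dc);
    }
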
Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- target/i386/tcg/translate.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index f4bffef9e2..ef190416b4 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -3700,15 +3700,9 @@ static void i386_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu) case 2: /* Restore state that may affect the next instruction. */ dc->pc = dc->base.pc_next; - /* - * TODO: These save/restore can be removed after the table-based - * decoder is complete; we will be decoding the insn completely - * before any code generation that might affect these variables. - */ - dc->cc_op_dirty = orig_cc_op_dirty; - dc->cc_op = orig_cc_op; - dc->pc_save = orig_pc_save; - /* END TODO */ + assert(dc->cc_op_dirty == orig_cc_op_dirty); + assert(dc->cc_op == orig_cc_op); + assert(dc->pc_save == orig_pc_save); dc->base.num_insns--; tcg_remove_ops_after(dc->prev_insn_end); dc->base.insn_start = dc->prev_insn_start; From 5504a8126115d173687b37e657312a8ffe29fc0c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 17 Sep 2024 12:38:32 -0400 Subject: [PATCH 18/26] KVM: Dynamic sized kvm memslots array Zhiyi reported an infinite loop issue in a VFIO use case. The cause of that is a separate discussion; however, while profiling that case I found a regression in dirty sync slowness. Each KVMMemoryListener maintains an array of kvm memslots. Currently it's statically allocated to be the max supported by the kernel. However, after Linux commit 4fc096a99e ("KVM: Raise the maximum number of user memslots"), the max supported memslots reported now grows to a number large enough that it may not be wise to always statically allocate with the max reported. What's worse, QEMU kvm code still walks all the allocated memslot entries to do any form of lookup. This can drastically slow down all memslot operations because each such loop can run over 32K times on the new kernels. Fix this issue by making the memslots array dynamically allocated. Here the initial size is set to 16 because it should cover basic VM usages; the hope is that the majority of VM use cases may not even need to grow at all (e.g. if one starts a VM with ./qemu-system-x86_64 by default it'll consume 9 memslots), while not being so large as to waste memory. There may also be even better ways to address this, but so far this is the simplest, and it should already be better than before the max supported memslots grew. For example, in the case of the above issue, when VFIO was attached on a 32GB system only ~10 memslots were used, so it should be good enough for now. In the above VFIO context, measurement shows that the precopy dirty sync shrank from ~86ms to ~3ms after this patch is applied. It should also apply to any KVM-enabled VM even without VFIO. NOTE: we don't have a Fixes tag for this patch because there's no real commit that regressed this in QEMU. Such behavior has existed for a long time, but it only started to be a problem when the kernel reports a very large nr_slots_max value. However, that's pretty common now (the kernel change was merged in 2021), so we attach cc:stable because we'll want this change to be backported to stable branches.
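As a rough illustration of the allocation strategy (a standalone C sketch with invented names, using plain libc instead of GLib; it is not the QEMU code): the array grows on demand, doubles when full, is capped at the kernel-reported maximum, and the newly grown tail is zeroed explicitly because realloc() does not do that.

    #include <stdlib.h>
    #include <string.h>

    struct slot { int id; size_t memory_size; };

    struct slot_array {
        struct slot *slots;
        unsigned int allocated;     /* current length of slots[] */
        unsigned int max;           /* hard cap, e.g. what the kernel reports */
    };

    /* Grow to at least 'want' entries (capped at 'max'). */
    static int grow(struct slot_array *a, unsigned int want)
    {
        unsigned int cur = a->allocated;

        if (want > a->max) {
            want = a->max;
        }
        if (cur >= want) {
            return 0;               /* big enough already, or at the cap */
        }

        struct slot *slots = realloc(a->slots, want * sizeof(*slots));
        if (!slots) {
            return -1;
        }
        /* realloc() leaves the extension uninitialized; zero it. */
        memset(&slots[cur], 0, (want - cur) * sizeof(*slots));
        for (unsigned int i = cur; i < want; i++) {
            slots[i].id = (int)i;
        }
        a->slots = slots;
        a->allocated = want;
        return 0;
    }

    /* Start small (16 in this series) and double when no free slot is found. */
    static int grow_double(struct slot_array *a)
    {
        return grow(a, a->allocated ? a->allocated * 2 : 16);
    }
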
Cc: qemu-stable Reported-by: Zhiyi Guo Tested-by: Zhiyi Guo Signed-off-by: Peter Xu Acked-by: David Hildenbrand Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240917163835.194664-2-peterx@redhat.com Signed-off-by: Paolo Bonzini --- accel/kvm/kvm-all.c | 87 +++++++++++++++++++++++++++++++++------- accel/kvm/trace-events | 1 + include/sysemu/kvm_int.h | 1 + 3 files changed, 74 insertions(+), 15 deletions(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 905fb844e4..f84413b795 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -69,6 +69,9 @@ #define KVM_GUESTDBG_BLOCKIRQ 0 #endif +/* Default num of memslots to be allocated when VM starts */ +#define KVM_MEMSLOTS_NR_ALLOC_DEFAULT 16 + struct KVMParkedVcpu { unsigned long vcpu_id; int kvm_fd; @@ -165,6 +168,57 @@ void kvm_resample_fd_notify(int gsi) } } +/** + * kvm_slots_grow(): Grow the slots[] array in the KVMMemoryListener + * + * @kml: The KVMMemoryListener* to grow the slots[] array + * @nr_slots_new: The new size of slots[] array + * + * Returns: True if the array grows larger, false otherwise. + */ +static bool kvm_slots_grow(KVMMemoryListener *kml, unsigned int nr_slots_new) +{ + unsigned int i, cur = kml->nr_slots_allocated; + KVMSlot *slots; + + if (nr_slots_new > kvm_state->nr_slots) { + nr_slots_new = kvm_state->nr_slots; + } + + if (cur >= nr_slots_new) { + /* Big enough, no need to grow, or we reached max */ + return false; + } + + if (cur == 0) { + slots = g_new0(KVMSlot, nr_slots_new); + } else { + assert(kml->slots); + slots = g_renew(KVMSlot, kml->slots, nr_slots_new); + /* + * g_renew() doesn't initialize extended buffers, however kvm + * memslots require fields to be zero-initialized. E.g. pointers, + * memory_size field, etc. + */ + memset(&slots[cur], 0x0, sizeof(slots[0]) * (nr_slots_new - cur)); + } + + for (i = cur; i < nr_slots_new; i++) { + slots[i].slot = i; + } + + kml->slots = slots; + kml->nr_slots_allocated = nr_slots_new; + trace_kvm_slots_grow(cur, nr_slots_new); + + return true; +} + +static bool kvm_slots_double(KVMMemoryListener *kml) +{ + return kvm_slots_grow(kml, kml->nr_slots_allocated * 2); +} + unsigned int kvm_get_max_memslots(void) { KVMState *s = KVM_STATE(current_accel()); @@ -193,15 +247,26 @@ unsigned int kvm_get_free_memslots(void) /* Called with KVMMemoryListener.slots_lock held */ static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml) { - KVMState *s = kvm_state; + unsigned int n; int i; - for (i = 0; i < s->nr_slots; i++) { + for (i = 0; i < kml->nr_slots_allocated; i++) { if (kml->slots[i].memory_size == 0) { return &kml->slots[i]; } } + /* + * If no free slots, try to grow first by doubling. Cache the old size + * here to avoid another round of search: if the grow succeeded, it + * means slots[] now must have the existing "n" slots occupied, + * followed by one or more free slots starting from slots[n]. 
+ */ + n = kml->nr_slots_allocated; + if (kvm_slots_double(kml)) { + return &kml->slots[n]; + } + return NULL; } @@ -222,10 +287,9 @@ static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml, hwaddr start_addr, hwaddr size) { - KVMState *s = kvm_state; int i; - for (i = 0; i < s->nr_slots; i++) { + for (i = 0; i < kml->nr_slots_allocated; i++) { KVMSlot *mem = &kml->slots[i]; if (start_addr == mem->start_addr && size == mem->memory_size) { @@ -267,7 +331,7 @@ int kvm_physical_memory_addr_from_host(KVMState *s, void *ram, int i, ret = 0; kvm_slots_lock(); - for (i = 0; i < s->nr_slots; i++) { + for (i = 0; i < kml->nr_slots_allocated; i++) { KVMSlot *mem = &kml->slots[i]; if (ram >= mem->ram && ram < mem->ram + mem->memory_size) { @@ -1071,7 +1135,7 @@ static int kvm_physical_log_clear(KVMMemoryListener *kml, kvm_slots_lock(); - for (i = 0; i < s->nr_slots; i++) { + for (i = 0; i < kml->nr_slots_allocated; i++) { mem = &kml->slots[i]; /* Discard slots that are empty or do not overlap the section */ if (!mem->memory_size || @@ -1715,12 +1779,8 @@ static void kvm_log_sync_global(MemoryListener *l, bool last_stage) /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */ kvm_dirty_ring_flush(); - /* - * TODO: make this faster when nr_slots is big while there are - * only a few used slots (small VMs). - */ kvm_slots_lock(); - for (i = 0; i < s->nr_slots; i++) { + for (i = 0; i < kml->nr_slots_allocated; i++) { mem = &kml->slots[i]; if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { kvm_slot_sync_dirty_pages(mem); @@ -1835,12 +1895,9 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, { int i; - kml->slots = g_new0(KVMSlot, s->nr_slots); kml->as_id = as_id; - for (i = 0; i < s->nr_slots; i++) { - kml->slots[i].slot = i; - } + kvm_slots_grow(kml, KVM_MEMSLOTS_NR_ALLOC_DEFAULT); QSIMPLEQ_INIT(&kml->transaction_add); QSIMPLEQ_INIT(&kml->transaction_del); diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events index 82c65fd2ab..e43d18a869 100644 --- a/accel/kvm/trace-events +++ b/accel/kvm/trace-events @@ -36,3 +36,4 @@ kvm_io_window_exit(void) "" kvm_run_exit_system_event(int cpu_index, uint32_t event_type) "cpu_index %d, system_even_type %"PRIu32 kvm_convert_memory(uint64_t start, uint64_t size, const char *msg) "start 0x%" PRIx64 " size 0x%" PRIx64 " %s" kvm_memory_fault(uint64_t start, uint64_t size, uint64_t flags) "start 0x%" PRIx64 " size 0x%" PRIx64 " flags 0x%" PRIx64 +kvm_slots_grow(unsigned int old, unsigned int new) "%u -> %u" diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h index 17483ff53b..2304537b93 100644 --- a/include/sysemu/kvm_int.h +++ b/include/sysemu/kvm_int.h @@ -46,6 +46,7 @@ typedef struct KVMMemoryListener { MemoryListener listener; KVMSlot *slots; unsigned int nr_used_slots; + unsigned int nr_slots_allocated; int as_id; QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_add; QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_del; From b34a908c8f24eedb0a8e5ff486b059b58fd793f4 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 17 Sep 2024 12:38:33 -0400 Subject: [PATCH 19/26] KVM: Define KVM_MEMSLOTS_NUM_MAX_DEFAULT Make the default max nr_slots a macro, it's only used when KVM reports nothing. 
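For context, a hedged sketch (raw ioctl form under my own assumptions, error handling omitted; not QEMU code) of where the defaulted value comes from: KVM_CHECK_EXTENSION with KVM_CAP_NR_MEMSLOTS returns the memslot limit, or 0 on kernels that do not report one.

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Mirrors KVM_MEMSLOTS_NR_MAX_DEFAULT above. */
    #define MEMSLOTS_NR_MAX_DEFAULT 32

    static int query_nr_slots_max(int kvm_fd)
    {
        int n = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_NR_MEMSLOTS);

        return n > 0 ? n : MEMSLOTS_NR_MAX_DEFAULT;
    }
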
Reviewed-by: David Hildenbrand Signed-off-by: Peter Xu Link: https://lore.kernel.org/r/20240917163835.194664-3-peterx@redhat.com Signed-off-by: Paolo Bonzini --- accel/kvm/kvm-all.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index f84413b795..c32a84eb5a 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -71,6 +71,8 @@ /* Default num of memslots to be allocated when VM starts */ #define KVM_MEMSLOTS_NR_ALLOC_DEFAULT 16 +/* Default max allowed memslots if kernel reported nothing */ +#define KVM_MEMSLOTS_NR_MAX_DEFAULT 32 struct KVMParkedVcpu { unsigned long vcpu_id; @@ -2613,7 +2615,7 @@ static int kvm_init(MachineState *ms) /* If unspecified, use the default value */ if (!s->nr_slots) { - s->nr_slots = 32; + s->nr_slots_max = KVM_MEMSLOTS_NR_MAX_DEFAULT; } s->nr_as = kvm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE); From dbdc00ba5b136bba80d850f61cc79a9cafaae1cd Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 17 Sep 2024 12:38:34 -0400 Subject: [PATCH 20/26] KVM: Rename KVMMemoryListener.nr_used_slots to nr_slots_used This will make all nr_slots counters to be named in the same manner. Reviewed-by: David Hildenbrand Signed-off-by: Peter Xu Link: https://lore.kernel.org/r/20240917163835.194664-4-peterx@redhat.com Signed-off-by: Paolo Bonzini --- accel/kvm/kvm-all.c | 6 +++--- include/sysemu/kvm_int.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index c32a84eb5a..24d76a1890 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -239,7 +239,7 @@ unsigned int kvm_get_free_memslots(void) if (!s->as[i].ml) { continue; } - used_slots = MAX(used_slots, s->as[i].ml->nr_used_slots); + used_slots = MAX(used_slots, s->as[i].ml->nr_slots_used); } kvm_slots_unlock(); @@ -1516,7 +1516,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, } start_addr += slot_size; size -= slot_size; - kml->nr_used_slots--; + kml->nr_slots_used--; } while (size); return; } @@ -1555,7 +1555,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, ram_start_offset += slot_size; ram += slot_size; size -= slot_size; - kml->nr_used_slots++; + kml->nr_slots_used++; } while (size); } diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h index 2304537b93..914c5c9ec5 100644 --- a/include/sysemu/kvm_int.h +++ b/include/sysemu/kvm_int.h @@ -45,7 +45,7 @@ typedef struct KVMMemoryUpdate { typedef struct KVMMemoryListener { MemoryListener listener; KVMSlot *slots; - unsigned int nr_used_slots; + unsigned int nr_slots_used; unsigned int nr_slots_allocated; int as_id; QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_add; From 943c742868c739c0b14fd996bad3adf744156fec Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 17 Sep 2024 12:38:35 -0400 Subject: [PATCH 21/26] KVM: Rename KVMState->nr_slots to nr_slots_max This value used to reflect the maximum supported memslots from KVM kernel. Rename it to be clearer. 
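Putting the renamed counters side by side (my reading of the series, not text from the patches), they are meant to satisfy a simple ordering; a self-contained sketch of that invariant:

    #include <assert.h>

    struct listener_sketch {
        unsigned int nr_slots_used;       /* slots with memory_size != 0 */
        unsigned int nr_slots_allocated;  /* current length of slots[] */
    };

    /* nr_slots_max lives on the KVMState side: the ceiling reported by
     * KVM_CAP_NR_MEMSLOTS (or the default when the kernel reports nothing). */
    static void check_counters(const struct listener_sketch *kml,
                               unsigned int nr_slots_max)
    {
        assert(kml->nr_slots_used <= kml->nr_slots_allocated);
        assert(kml->nr_slots_allocated <= nr_slots_max);
    }
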
Reviewed-by: David Hildenbrand Signed-off-by: Peter Xu Link: https://lore.kernel.org/r/20240917163835.194664-5-peterx@redhat.com Signed-off-by: Paolo Bonzini --- accel/kvm/kvm-all.c | 12 ++++++------ include/sysemu/kvm_int.h | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 24d76a1890..8be731cfee 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -183,8 +183,8 @@ static bool kvm_slots_grow(KVMMemoryListener *kml, unsigned int nr_slots_new) unsigned int i, cur = kml->nr_slots_allocated; KVMSlot *slots; - if (nr_slots_new > kvm_state->nr_slots) { - nr_slots_new = kvm_state->nr_slots; + if (nr_slots_new > kvm_state->nr_slots_max) { + nr_slots_new = kvm_state->nr_slots_max; } if (cur >= nr_slots_new) { @@ -225,7 +225,7 @@ unsigned int kvm_get_max_memslots(void) { KVMState *s = KVM_STATE(current_accel()); - return s->nr_slots; + return s->nr_slots_max; } unsigned int kvm_get_free_memslots(void) @@ -243,7 +243,7 @@ unsigned int kvm_get_free_memslots(void) } kvm_slots_unlock(); - return s->nr_slots - used_slots; + return s->nr_slots_max - used_slots; } /* Called with KVMMemoryListener.slots_lock held */ @@ -2611,10 +2611,10 @@ static int kvm_init(MachineState *ms) (kvm_supported_memory_attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE); kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT); - s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS); + s->nr_slots_max = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS); /* If unspecified, use the default value */ - if (!s->nr_slots) { + if (!s->nr_slots_max) { s->nr_slots_max = KVM_MEMSLOTS_NR_MAX_DEFAULT; } diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h index 914c5c9ec5..a1e72763da 100644 --- a/include/sysemu/kvm_int.h +++ b/include/sysemu/kvm_int.h @@ -103,8 +103,8 @@ struct KVMDirtyRingReaper { struct KVMState { AccelState parent_obj; - - int nr_slots; + /* Max number of KVM slots supported */ + int nr_slots_max; int fd; int vmfd; int coalesced_mmio; From e136648c5c95ee4ea233cccf999c07e065bef26d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 18 Jun 2024 08:53:19 +0200 Subject: [PATCH 22/26] target/i386/tcg: Use DPL-level accesses for interrupts and call gates Stack accesses should be explicit and use the privilege level of the target stack. This ensures that SMAP is not applied when the target stack is in ring 3. This fixes a bug wherein i386/tcg assumed that an interrupt return, or a far call using the CALL or JMP instruction, was always going from kernel or user mode to kernel mode when using a call gate. This assumption is violated if the call gate has a DPL that is greater than 0. Analyzed-by: Robert R. 
Henry Resolves: https://gitlab.com/qemu-project/qemu/-/issues/249 Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- target/i386/tcg/seg_helper.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c index 3b8fd827e1..02ae6a0d1f 100644 --- a/target/i386/tcg/seg_helper.c +++ b/target/i386/tcg/seg_helper.c @@ -695,7 +695,6 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int, sa.env = env; sa.ra = 0; - sa.mmu_index = cpu_mmu_index_kernel(env); if (type == 5) { /* task gate */ @@ -705,7 +704,9 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int, } shift = switch_tss(env, intno * 8, e1, e2, SWITCH_TSS_CALL, old_eip); if (has_error_code) { - /* push the error code */ + /* push the error code on the destination stack */ + cpl = env->hflags & HF_CPL_MASK; + sa.mmu_index = x86_mmu_index_pl(env, cpl); if (env->segs[R_SS].flags & DESC_B_MASK) { sa.sp_mask = 0xffffffff; } else { @@ -750,6 +751,7 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int, if (e2 & DESC_C_MASK) { dpl = cpl; } + sa.mmu_index = x86_mmu_index_pl(env, dpl); if (dpl < cpl) { /* to inner privilege */ uint32_t esp; @@ -1001,7 +1003,7 @@ static void do_interrupt64(CPUX86State *env, int intno, int is_int, sa.env = env; sa.ra = 0; - sa.mmu_index = cpu_mmu_index_kernel(env); + sa.mmu_index = x86_mmu_index_pl(env, dpl); sa.sp_mask = -1; sa.ss_base = 0; if (dpl < cpl || ist != 0) { @@ -1135,7 +1137,7 @@ static void do_interrupt_real(CPUX86State *env, int intno, int is_int, sa.sp = env->regs[R_ESP]; sa.sp_mask = 0xffff; sa.ss_base = env->segs[R_SS].base; - sa.mmu_index = cpu_mmu_index_kernel(env); + sa.mmu_index = x86_mmu_index_pl(env, 0); if (is_int) { old_eip = next_eip; @@ -1599,7 +1601,7 @@ void helper_lcall_real(CPUX86State *env, uint32_t new_cs, uint32_t new_eip, sa.sp = env->regs[R_ESP]; sa.sp_mask = get_sp_mask(env->segs[R_SS].flags); sa.ss_base = env->segs[R_SS].base; - sa.mmu_index = cpu_mmu_index_kernel(env); + sa.mmu_index = x86_mmu_index_pl(env, 0); if (shift) { pushl(&sa, env->segs[R_CS].selector); @@ -1639,9 +1641,9 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip, sa.env = env; sa.ra = GETPC(); - sa.mmu_index = cpu_mmu_index_kernel(env); if (e2 & DESC_S_MASK) { + /* "normal" far call, no stack switch possible */ if (!(e2 & DESC_CS_MASK)) { raise_exception_err_ra(env, EXCP0D_GPF, new_cs & 0xfffc, GETPC()); } @@ -1665,6 +1667,7 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip, raise_exception_err_ra(env, EXCP0B_NOSEG, new_cs & 0xfffc, GETPC()); } + sa.mmu_index = x86_mmu_index_pl(env, cpl); #ifdef TARGET_X86_64 /* XXX: check 16/32 bit cases in long mode */ if (shift == 2) { @@ -1792,6 +1795,7 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip, if (!(e2 & DESC_C_MASK) && dpl < cpl) { /* to inner privilege */ + sa.mmu_index = x86_mmu_index_pl(env, dpl); #ifdef TARGET_X86_64 if (shift == 2) { ss = dpl; /* SS = NULL selector with RPL = new CPL */ @@ -1870,6 +1874,7 @@ void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip, new_stack = 1; } else { /* to same privilege */ + sa.mmu_index = x86_mmu_index_pl(env, cpl); sa.sp = env->regs[R_ESP]; sa.sp_mask = get_sp_mask(env->segs[R_SS].flags); sa.ss_base = env->segs[R_SS].base; From 64e0e63ea16aa0122dc0c41a0679da0ae4616208 Mon Sep 17 00:00:00 2001 From: Tom Dohrmann Date: Tue, 3 Sep 2024 
06:29:53 +0000 Subject: [PATCH 23/26] accel/kvm: check for KVM_CAP_READONLY_MEM on VM KVM_CAP_READONLY_MEM used to be a global capability, but with the introduction of AMD SEV-SNP confidential VMs, this extension is not always available on all VM types [1,2]. Query the extension on the VM level instead of on the KVM level. [1] https://patchwork.kernel.org/project/kvm/patch/20240809190319.1710470-2-seanjc@google.com/ [2] https://patchwork.kernel.org/project/kvm/patch/20240902144219.3716974-1-erbse.13@gmx.de/ Cc: Paolo Bonzini Signed-off-by: Tom Dohrmann Link: https://lore.kernel.org/r/20240903062953.3926498-1-erbse.13@gmx.de Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini --- accel/kvm/kvm-all.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 8be731cfee..99746436bf 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -2689,7 +2689,7 @@ static int kvm_init(MachineState *ms) } kvm_readonly_mem_allowed = - (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0); + (kvm_vm_check_extension(s, KVM_CAP_READONLY_MEM) > 0); kvm_resamplefds_allowed = (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0); From 60de433d4cb17211dbea5c1e6b74d476dec16370 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 11 Oct 2024 10:39:58 +0200 Subject: [PATCH 24/26] accel/kvm: check for KVM_CAP_MULTI_ADDRESS_SPACE on vm KVM_CAP_MULTI_ADDRESS_SPACE used to be a global capability, but with the introduction of AMD SEV-SNP confidential VMs, the number of address spaces can vary by VM type. Query the extension on the VM level instead of on the KVM level. Inspired by an analogous patch by Tom Dohrmann. Signed-off-by: Paolo Bonzini --- accel/kvm/kvm-all.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 99746436bf..d985486a05 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -2618,12 +2618,6 @@ static int kvm_init(MachineState *ms) s->nr_slots_max = KVM_MEMSLOTS_NR_MAX_DEFAULT; } - s->nr_as = kvm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE); - if (s->nr_as <= 1) { - s->nr_as = 1; - } - s->as = g_new0(struct KVMAs, s->nr_as); - type = find_kvm_machine_type(ms); if (type < 0) { ret = -EINVAL; @@ -2637,6 +2631,12 @@ static int kvm_init(MachineState *ms) s->vmfd = ret; + s->nr_as = kvm_vm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE); + if (s->nr_as <= 1) { + s->nr_as = 1; + } + s->as = g_new0(struct KVMAs, s->nr_as); + /* check the vcpu limits */ soft_vcpus_limit = kvm_recommended_vcpus(s); hard_vcpus_limit = kvm_max_vcpus(s); From 586d708c1e3e5e29a0b3c05c347290aed9478854 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 11 Oct 2024 10:39:58 +0200 Subject: [PATCH 25/26] accel/kvm: check for KVM_CAP_MEMORY_ATTRIBUTES on vm The exact set of available memory attributes can vary by VM. In the future it might vary depending on enabled capabilities, too. Query the extension on the VM level instead of on the KVM level, and only after architecture-specific initialization. Inspired by an analogous patch by Tom Dohrmann. 
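As background, a raw-ioctl sketch under my own assumptions (error handling elided; not code from the patch): KVM accepts KVM_CHECK_EXTENSION both on the /dev/kvm fd and on a VM fd, and the two answers can differ once the VM type restricts a capability, which is why these queries move to after KVM_CREATE_VM.

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static void query_example(void)
    {
        int kvm_fd = open("/dev/kvm", O_RDWR);
        int vm_fd  = ioctl(kvm_fd, KVM_CREATE_VM, 0);    /* default VM type */

        /* Global answer: what the kvm module supports at all. */
        int global = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_READONLY_MEM);
        /* Per-VM answer: what this particular VM (type) supports. */
        int per_vm = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_READONLY_MEM);

        /* This series switches QEMU to trust the per-VM answer for
         * KVM_CAP_READONLY_MEM, KVM_CAP_MULTI_ADDRESS_SPACE and
         * KVM_CAP_MEMORY_ATTRIBUTES. */
        (void)global; (void)per_vm;
    }
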
Signed-off-by: Paolo Bonzini --- accel/kvm/kvm-all.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index d985486a05..801cff16a5 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -2604,12 +2604,6 @@ static int kvm_init(MachineState *ms) goto err; } - kvm_supported_memory_attributes = kvm_check_extension(s, KVM_CAP_MEMORY_ATTRIBUTES); - kvm_guest_memfd_supported = - kvm_check_extension(s, KVM_CAP_GUEST_MEMFD) && - kvm_check_extension(s, KVM_CAP_USER_MEMORY2) && - (kvm_supported_memory_attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE); - kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT); s->nr_slots_max = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS); @@ -2723,6 +2717,12 @@ static int kvm_init(MachineState *ms) goto err; } + kvm_supported_memory_attributes = kvm_vm_check_extension(s, KVM_CAP_MEMORY_ATTRIBUTES); + kvm_guest_memfd_supported = + kvm_check_extension(s, KVM_CAP_GUEST_MEMFD) && + kvm_check_extension(s, KVM_CAP_USER_MEMORY2) && + (kvm_supported_memory_attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE); + if (s->kernel_irqchip_split == ON_OFF_AUTO_AUTO) { s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; } From 15d955975bd484c2c66af0d6daaa02a7d04d2256 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Mon, 14 Oct 2024 17:41:44 -0700 Subject: [PATCH 26/26] target/i386: Use only 16 and 32-bit operands for IN/OUT The REX.W prefix is ignored for these instructions. Mirror the solution already used for INS/OUTS: X86_SIZE_z. Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2581 Signed-off-by: Richard Henderson Cc: qemu-stable@nongnu.org Link: https://lore.kernel.org/r/20241015004144.2111817-1-richard.henderson@linaro.org Signed-off-by: Paolo Bonzini --- target/i386/tcg/decode-new.c.inc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc index 487c376032..1f19371646 100644 --- a/target/i386/tcg/decode-new.c.inc +++ b/target/i386/tcg/decode-new.c.inc @@ -1706,9 +1706,9 @@ static const X86OpEntry opcodes_root[256] = { [0xE2] = X86_OP_ENTRYr(LOOP, J,b), /* implicit: CX with aflag size */ [0xE3] = X86_OP_ENTRYr(JCXZ, J,b), /* implicit: CX with aflag size */ [0xE4] = X86_OP_ENTRYwr(IN, 0,b, I_unsigned,b), /* AL */ - [0xE5] = X86_OP_ENTRYwr(IN, 0,v, I_unsigned,b), /* AX/EAX */ + [0xE5] = X86_OP_ENTRYwr(IN, 0,z, I_unsigned,b), /* AX/EAX */ [0xE6] = X86_OP_ENTRYrr(OUT, 0,b, I_unsigned,b), /* AL */ - [0xE7] = X86_OP_ENTRYrr(OUT, 0,v, I_unsigned,b), /* AX/EAX */ + [0xE7] = X86_OP_ENTRYrr(OUT, 0,z, I_unsigned,b), /* AX/EAX */ [0xF1] = X86_OP_ENTRY0(INT1, svm(ICEBP)), [0xF4] = X86_OP_ENTRY0(HLT, chk(cpl0) svm(HLT)), @@ -1853,9 +1853,9 @@ static const X86OpEntry opcodes_root[256] = { [0xEA] = X86_OP_ENTRYrr(JMPF, I_unsigned,p, I_unsigned,w, chk(i64)), [0xEB] = X86_OP_ENTRYr(JMP, J,b), [0xEC] = X86_OP_ENTRYwr(IN, 0,b, 2,w), /* AL, DX */ - [0xED] = X86_OP_ENTRYwr(IN, 0,v, 2,w), /* AX/EAX, DX */ + [0xED] = X86_OP_ENTRYwr(IN, 0,z, 2,w), /* AX/EAX, DX */ [0xEE] = X86_OP_ENTRYrr(OUT, 0,b, 2,w), /* DX, AL */ - [0xEF] = X86_OP_ENTRYrr(OUT, 0,v, 2,w), /* DX, AX/EAX */ + [0xEF] = X86_OP_ENTRYrr(OUT, 0,z, 2,w), /* DX, AX/EAX */ [0xF8] = X86_OP_ENTRY0(CLC), [0xF9] = X86_OP_ENTRY0(STC),