From 334a1a1e1a5f8b4b172683e7507cfec566313a7c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 6 Jan 2026 17:37:05 +0000 Subject: [PATCH 001/267] KVM: arm64: Fix comment in fpsimd_lazy_switch_to_host() The comment in fpsimd_lazy_switch_to_host() erroneously says guest traps for FPSIMD/SVE/SME are disabled by fpsimd_lazy_switch_to_guest(). In reality, the traps are disabled by __activate_cptr_traps(), and fpsimd_lazy_switch_to_guest() only manipulates the SVE vector length. This was mistake; I accidentally copy+pasted the wrong function name in commit: 59419f10045b ("KVM: arm64: Eagerly switch ZCR_EL{1,2}") Fix the comment. Fixes: 59419f10045b ("KVM: arm64: Eagerly switch ZCR_EL{1,2}") Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Link: https://patch.msgid.link/20260106173707.3292074-2-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index c5d5e5b86eaf..8dce3da85da3 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -495,7 +495,7 @@ static inline void fpsimd_lazy_switch_to_host(struct kvm_vcpu *vcpu) /* * When the guest owns the FP regs, we know that guest+hyp traps for * any FPSIMD/SVE/SME features exposed to the guest have been disabled - * by either fpsimd_lazy_switch_to_guest() or kvm_hyp_handle_fpsimd() + * by either __activate_cptr_traps() or kvm_hyp_handle_fpsimd() * prior to __guest_entry(). As __guest_entry() guarantees a context * synchronization event, we don't need an ISB here to avoid taking * traps for anything that was exposed to the guest. From acd8bfaa9384300a509c8aff5ee4a59a06eb2b2b Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 6 Jan 2026 17:37:06 +0000 Subject: [PATCH 002/267] KVM: arm64: Shuffle KVM_HOST_DATA_FLAG_* indices There's a gap in the KVM_HOST_DATA_FLAG_* indices since the removal of KVM_HOST_DATA_FLAG_HOST_SVE_ENABLED and KVM_HOST_DATA_FLAG_HOST_SME_ENABLED in commits: * 459f059be702 ("KVM: arm64: Remove VHE host restore of CPACR_EL1.ZEN") * 407a99c4654e ("KVM: arm64: Remove VHE host restore of CPACR_EL1.SMEN") Shuffle the indices to remove the gap, as Oliver requested at the time of the removals: https://lore.kernel.org/linux-arm-kernel/Z6qC4qn47ONfDCSH@linux.dev/ There should be no functional change as a result of this patch. 
Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Reviewed-by: Mark Brown Link: https://patch.msgid.link/20260106173707.3292074-3-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ac7f970c7883..4c8b4274f669 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -710,11 +710,11 @@ struct cpu_sve_state { struct kvm_host_data { #define KVM_HOST_DATA_FLAG_HAS_SPE 0 #define KVM_HOST_DATA_FLAG_HAS_TRBE 1 -#define KVM_HOST_DATA_FLAG_TRBE_ENABLED 4 -#define KVM_HOST_DATA_FLAG_EL1_TRACING_CONFIGURED 5 -#define KVM_HOST_DATA_FLAG_VCPU_IN_HYP_CONTEXT 6 -#define KVM_HOST_DATA_FLAG_L1_VNCR_MAPPED 7 -#define KVM_HOST_DATA_FLAG_HAS_BRBE 8 +#define KVM_HOST_DATA_FLAG_TRBE_ENABLED 2 +#define KVM_HOST_DATA_FLAG_EL1_TRACING_CONFIGURED 3 +#define KVM_HOST_DATA_FLAG_VCPU_IN_HYP_CONTEXT 4 +#define KVM_HOST_DATA_FLAG_L1_VNCR_MAPPED 5 +#define KVM_HOST_DATA_FLAG_HAS_BRBE 6 unsigned long flags; struct kvm_cpu_context host_ctxt; From b1a9a9b96169bf1f9cb94b8aa33601996fad1e9c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 6 Jan 2026 17:37:07 +0000 Subject: [PATCH 003/267] KVM: arm64: Remove ISB after writing FPEXC32_EL2 The value of FPEX32_EL2 has no effect on execution in AArch64 state, and consequently there's no need for an ISB after writing to it in the hyp code (which executes in AArch64 state). When performing an exception return to AArch32 state, the exception return will provide the necessary context synchronization event. Remove the redundant ISB. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Link: https://patch.msgid.link/20260106173707.3292074-4-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 8dce3da85da3..91aa1862349c 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -59,10 +59,8 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) * If FP/ASIMD is not implemented, FPEXC is UNDEFINED and any access to * it will cause an exception. */ - if (vcpu_el1_is_32bit(vcpu) && system_supports_fpsimd()) { + if (vcpu_el1_is_32bit(vcpu) && system_supports_fpsimd()) write_sysreg(1 << 30, fpexc32_el2); - isb(); - } } static inline void __activate_cptr_traps_nvhe(struct kvm_vcpu *vcpu) From b47b93c15b12c7a9e560729aa6ad609466f83f0e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 12:56:41 -0800 Subject: [PATCH 004/267] KVM: x86: Disallow setting CPUID and/or feature MSRs if L2 is active Extend KVM's restriction on CPUID and feature MSR changes to disallow updates while L2 is active in addition to rejecting updates after the vCPU has run at least once. Like post-run vCPU model updates, attempting to react to model changes while L2 is active is practically infeasible, e.g. KVM would need to do _something_ in response to impossible situations where userspace has a removed a feature that was consumed as parted of nested VM-Enter. 
In practice, disallowing vCPU model changes while L2 is active is largely uninteresting, as the only way for L2 to be active without the vCPU having run at least once is if userspace stuffed state via KVM_SET_NESTED_STATE. And because KVM_SET_NESTED_STATE can't put the vCPU into L2 without userspace first defining the vCPU model, e.g. to enable SVM/VMX, modifying the vCPU model while L2 is active would require deliberately setting the vCPU model, then loading nested state, and then changing the model. I.e. no sane VMM should run afoul of the new restriction, and any VMM that does encounter problems has likely been running a broken setup for a long time. Cc: Yosry Ahmed Cc: Kevin Cheng Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230205641.4092235-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 19 +++++++++++-------- arch/x86/kvm/mmu/mmu.c | 6 +----- arch/x86/kvm/pmu.c | 2 +- arch/x86/kvm/x86.c | 13 +++++++------ arch/x86/kvm/x86.h | 15 +++++++++++++-- 5 files changed, 33 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 88a5426674a1..f37331ad3ad8 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -534,17 +534,20 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, BUILD_BUG_ON(sizeof(vcpu_caps) != sizeof(vcpu->arch.cpu_caps)); /* - * KVM does not correctly handle changing guest CPUID after KVM_RUN, as - * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't - * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page - * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with - * the core vCPU model on the fly. It would've been better to forbid any - * KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately - * some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do + * KVM does not correctly handle changing guest CPUID after KVM_RUN or + * while L2 is active, as MAXPHYADDR, GBPAGES support, AMD reserved bit + * behavior, etc. aren't tracked in kvm_mmu_page_role, and L2 state + * can't be adjusted (without breaking L2 in some way). As a result, + * KVM may reuse SPs/SPTEs and/or run L2 with bad/misconfigured state. + * + * In practice, no sane VMM mucks with the core vCPU model on the fly. + * It would've been better to forbid any KVM_SET_CPUID{,2} calls after + * KVM_RUN or KVM_SET_NESTED_STATE altogether, but unfortunately some + * VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do * KVM_SET_CPUID{,2} again. To support this legacy behavior, check * whether the supplied CPUID data is equal to what's already set. */ - if (kvm_vcpu_has_run(vcpu)) { + if (!kvm_can_set_cpuid_and_feature_msrs(vcpu)) { r = kvm_cpuid_check_equal(vcpu, e2, nent); if (r) goto err; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 02c450686b4a..f17324546900 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6031,11 +6031,7 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.nested_mmu.cpu_role.ext.valid = 0; kvm_mmu_reset_context(vcpu); - /* - * Changing guest CPUID after KVM_RUN is forbidden, see the comment in - * kvm_arch_vcpu_ioctl(). 
- */ - KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm); + KVM_BUG_ON(!kvm_can_set_cpuid_and_feature_msrs(vcpu), vcpu->kvm); } void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 487ad19a236e..ff20b4102173 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -853,7 +853,7 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm)) + if (KVM_BUG_ON(!kvm_can_set_cpuid_and_feature_msrs(vcpu), vcpu->kvm)) return; /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ff8812f3a129..211d8c24a4b1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2314,13 +2314,14 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) u64 val; /* - * Disallow writes to immutable feature MSRs after KVM_RUN. KVM does - * not support modifying the guest vCPU model on the fly, e.g. changing - * the nVMX capabilities while L2 is running is nonsensical. Allow - * writes of the same value, e.g. to allow userspace to blindly stuff - * all MSRs when emulating RESET. + * Reject writes to immutable feature MSRs if the vCPU model is frozen, + * as KVM doesn't support modifying the guest vCPU model on the fly, + * e.g. changing the VMX capabilities MSRs while L2 is active is + * nonsensical. Allow writes of the same value, e.g. so that userspace + * can blindly stuff all MSRs when emulating RESET. */ - if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index) && + if (!kvm_can_set_cpuid_and_feature_msrs(vcpu) && + kvm_is_immutable_feature_msr(index) && (do_get_msr(vcpu, index, &val) || *data != val)) return -EINVAL; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index fdab0ad49098..5cee34f6a0e0 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -172,9 +172,20 @@ static inline void kvm_nested_vmexit_handle_ibrs(struct kvm_vcpu *vcpu) indirect_branch_prediction_barrier(); } -static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu) +/* + * Disallow modifying CPUID and feature MSRs, which affect the core virtual CPU + * model exposed to the guest and virtualized by KVM, if the vCPU has already + * run or is in guest mode (L2). In both cases, KVM has already consumed the + * current virtual CPU model, and doesn't support "unwinding" to react to the + * new model. + * + * Note, the only way is_guest_mode() can be true with 'last_vmentry_cpu == -1' + * is if userspace sets CPUID and feature MSRs (to enable VMX/SVM), then sets + * nested state, and then attempts to set CPUID and/or feature MSRs *again*. + */ +static inline bool kvm_can_set_cpuid_and_feature_msrs(struct kvm_vcpu *vcpu) { - return vcpu->arch.last_vmentry_cpu != -1; + return vcpu->arch.last_vmentry_cpu == -1 && !is_guest_mode(vcpu); } static inline void kvm_set_mp_state(struct kvm_vcpu *vcpu, int mp_state) From 0b28194c4c8e3a6c2552bfa6451f71b1879dd61f Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Fri, 5 Dec 2025 14:49:37 -0800 Subject: [PATCH 005/267] KVM: selftests: Test TPR / CR8 sync and interrupt masking Add a few extra TPR / CR8 tests to x86's xapic_state_test to see if: * TPR is 0 on reset, * TPR, PPR and CR8 are equal inside the guest, * TPR and CR8 read equal by the host after a VMExit * TPR borderline values set by the host correctly mask interrupts in the guest. These hopefully will catch the most obvious cases of improper TPR sync or interrupt masking. Do these tests both in x2APIC and xAPIC modes. 
The x2APIC mode uses SELF_IPI register to trigger interrupts to give it a bit of exercise too. Signed-off-by: Maciej S. Szmigiero Acked-by: Naveen N Rao (AMD) [sean: put code in separate test] Link: https://patch.msgid.link/20251205224937.428122-1-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 1 + .../testing/selftests/kvm/include/x86/apic.h | 3 + .../selftests/kvm/x86/xapic_tpr_test.c | 276 ++++++++++++++++++ 3 files changed, 280 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86/xapic_tpr_test.c diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index ba5c2b643efa..3789890421bd 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -124,6 +124,7 @@ TEST_GEN_PROGS_x86 += x86/vmx_set_nested_state_test TEST_GEN_PROGS_x86 += x86/apic_bus_clock_test TEST_GEN_PROGS_x86 += x86/xapic_ipi_test TEST_GEN_PROGS_x86 += x86/xapic_state_test +TEST_GEN_PROGS_x86 += x86/xapic_tpr_test TEST_GEN_PROGS_x86 += x86/xcr0_cpuid_test TEST_GEN_PROGS_x86 += x86/xss_msr_test TEST_GEN_PROGS_x86 += x86/debug_regs diff --git a/tools/testing/selftests/kvm/include/x86/apic.h b/tools/testing/selftests/kvm/include/x86/apic.h index 80fe9f69b38d..e9b9aebaac97 100644 --- a/tools/testing/selftests/kvm/include/x86/apic.h +++ b/tools/testing/selftests/kvm/include/x86/apic.h @@ -28,6 +28,8 @@ #define GET_APIC_ID_FIELD(x) (((x) >> 24) & 0xFF) #define APIC_TASKPRI 0x80 #define APIC_PROCPRI 0xA0 +#define GET_APIC_PRI(x) (((x) & GENMASK(7, 4)) >> 4) +#define SET_APIC_PRI(x, y) (((x) & ~GENMASK(7, 4)) | (y << 4)) #define APIC_EOI 0xB0 #define APIC_SPIV 0xF0 #define APIC_SPIV_FOCUS_DISABLED (1 << 9) @@ -67,6 +69,7 @@ #define APIC_TMICT 0x380 #define APIC_TMCCT 0x390 #define APIC_TDCR 0x3E0 +#define APIC_SELF_IPI 0x3F0 void apic_disable(void); void xapic_enable(void); diff --git a/tools/testing/selftests/kvm/x86/xapic_tpr_test.c b/tools/testing/selftests/kvm/x86/xapic_tpr_test.c new file mode 100644 index 000000000000..3862134d9d40 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/xapic_tpr_test.c @@ -0,0 +1,276 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include + +#include "apic.h" +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + +static bool is_x2apic; + +#define IRQ_VECTOR 0x20 + +/* See also the comment at similar assertion in memslot_perf_test.c */ +static_assert(ATOMIC_INT_LOCK_FREE == 2, "atomic int is not lockless"); + +static atomic_uint tpr_guest_irq_sync_val; + +static void tpr_guest_irq_sync_flag_reset(void) +{ + atomic_store_explicit(&tpr_guest_irq_sync_val, 0, + memory_order_release); +} + +static unsigned int tpr_guest_irq_sync_val_get(void) +{ + return atomic_load_explicit(&tpr_guest_irq_sync_val, + memory_order_acquire); +} + +static void tpr_guest_irq_sync_val_inc(void) +{ + atomic_fetch_add_explicit(&tpr_guest_irq_sync_val, 1, + memory_order_acq_rel); +} + +static void tpr_guest_irq_handler_xapic(struct ex_regs *regs) +{ + tpr_guest_irq_sync_val_inc(); + + xapic_write_reg(APIC_EOI, 0); +} + +static void tpr_guest_irq_handler_x2apic(struct ex_regs *regs) +{ + tpr_guest_irq_sync_val_inc(); + + x2apic_write_reg(APIC_EOI, 0); +} + +static void tpr_guest_irq_queue(void) +{ + if (is_x2apic) { + x2apic_write_reg(APIC_SELF_IPI, IRQ_VECTOR); + } else { + uint32_t icr, icr2; + + icr = APIC_DEST_SELF | APIC_DEST_PHYSICAL | APIC_DM_FIXED | + IRQ_VECTOR; + icr2 = 0; + + 
xapic_write_reg(APIC_ICR2, icr2); + xapic_write_reg(APIC_ICR, icr); + } +} + +static uint8_t tpr_guest_tpr_get(void) +{ + uint32_t taskpri; + + if (is_x2apic) + taskpri = x2apic_read_reg(APIC_TASKPRI); + else + taskpri = xapic_read_reg(APIC_TASKPRI); + + return GET_APIC_PRI(taskpri); +} + +static uint8_t tpr_guest_ppr_get(void) +{ + uint32_t procpri; + + if (is_x2apic) + procpri = x2apic_read_reg(APIC_PROCPRI); + else + procpri = xapic_read_reg(APIC_PROCPRI); + + return GET_APIC_PRI(procpri); +} + +static uint8_t tpr_guest_cr8_get(void) +{ + uint64_t cr8; + + asm volatile ("mov %%cr8, %[cr8]\n\t" : [cr8] "=r"(cr8)); + + return cr8 & GENMASK(3, 0); +} + +static void tpr_guest_check_tpr_ppr_cr8_equal(void) +{ + uint8_t tpr; + + tpr = tpr_guest_tpr_get(); + + GUEST_ASSERT_EQ(tpr_guest_ppr_get(), tpr); + GUEST_ASSERT_EQ(tpr_guest_cr8_get(), tpr); +} + +static void tpr_guest_code(void) +{ + cli(); + + if (is_x2apic) + x2apic_enable(); + else + xapic_enable(); + + GUEST_ASSERT_EQ(tpr_guest_tpr_get(), 0); + tpr_guest_check_tpr_ppr_cr8_equal(); + + tpr_guest_irq_queue(); + + /* TPR = 0 but IRQ masked by IF=0, should not fire */ + udelay(1000); + GUEST_ASSERT_EQ(tpr_guest_irq_sync_val_get(), 0); + + sti(); + + /* IF=1 now, IRQ should fire */ + while (tpr_guest_irq_sync_val_get() == 0) + cpu_relax(); + GUEST_ASSERT_EQ(tpr_guest_irq_sync_val_get(), 1); + + GUEST_SYNC(true); + tpr_guest_check_tpr_ppr_cr8_equal(); + + tpr_guest_irq_queue(); + + /* IRQ masked by barely high enough TPR now, should not fire */ + udelay(1000); + GUEST_ASSERT_EQ(tpr_guest_irq_sync_val_get(), 1); + + GUEST_SYNC(false); + tpr_guest_check_tpr_ppr_cr8_equal(); + + /* TPR barely low enough now to unmask IRQ, should fire */ + while (tpr_guest_irq_sync_val_get() == 1) + cpu_relax(); + GUEST_ASSERT_EQ(tpr_guest_irq_sync_val_get(), 2); + + GUEST_DONE(); +} + +static uint8_t lapic_tpr_get(struct kvm_lapic_state *xapic) +{ + return GET_APIC_PRI(*((u32 *)&xapic->regs[APIC_TASKPRI])); +} + +static void lapic_tpr_set(struct kvm_lapic_state *xapic, uint8_t val) +{ + u32 *taskpri = (u32 *)&xapic->regs[APIC_TASKPRI]; + + *taskpri = SET_APIC_PRI(*taskpri, val); +} + +static uint8_t sregs_tpr(struct kvm_sregs *sregs) +{ + return sregs->cr8 & GENMASK(3, 0); +} + +static void test_tpr_check_tpr_zero(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic_state xapic; + + vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); + + TEST_ASSERT_EQ(lapic_tpr_get(&xapic), 0); +} + +static void test_tpr_check_tpr_cr8_equal(struct kvm_vcpu *vcpu) +{ + struct kvm_sregs sregs; + struct kvm_lapic_state xapic; + + vcpu_sregs_get(vcpu, &sregs); + vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); + + TEST_ASSERT_EQ(sregs_tpr(&sregs), lapic_tpr_get(&xapic)); +} + +static void test_tpr_set_tpr_for_irq(struct kvm_vcpu *vcpu, bool mask) +{ + struct kvm_lapic_state xapic; + uint8_t tpr; + + static_assert(IRQ_VECTOR >= 16, "invalid IRQ vector number"); + tpr = IRQ_VECTOR / 16; + if (!mask) + tpr--; + + vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); + lapic_tpr_set(&xapic, tpr); + vcpu_ioctl(vcpu, KVM_SET_LAPIC, &xapic); +} + +static void test_tpr(bool __is_x2apic) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + bool done = false; + + is_x2apic = __is_x2apic; + + vm = vm_create_with_one_vcpu(&vcpu, tpr_guest_code); + if (is_x2apic) { + vm_install_exception_handler(vm, IRQ_VECTOR, + tpr_guest_irq_handler_x2apic); + } else { + vm_install_exception_handler(vm, IRQ_VECTOR, + tpr_guest_irq_handler_xapic); + vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_X2APIC); + virt_pg_map(vm, APIC_DEFAULT_GPA, 
APIC_DEFAULT_GPA); + } + + sync_global_to_guest(vcpu->vm, is_x2apic); + + /* According to the SDM/APM the TPR value at reset is 0 */ + test_tpr_check_tpr_zero(vcpu); + test_tpr_check_tpr_cr8_equal(vcpu); + + tpr_guest_irq_sync_flag_reset(); + sync_global_to_guest(vcpu->vm, tpr_guest_irq_sync_val); + + while (!done) { + struct ucall uc; + + alarm(2); + vcpu_run(vcpu); + alarm(0); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + test_tpr_check_tpr_cr8_equal(vcpu); + done = true; + break; + case UCALL_SYNC: + test_tpr_check_tpr_cr8_equal(vcpu); + test_tpr_set_tpr_for_irq(vcpu, uc.args[1]); + break; + default: + TEST_FAIL("Unknown ucall result 0x%lx", uc.cmd); + break; + } + } + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + /* + * Use separate VMs for the xAPIC and x2APIC tests so that x2APIC can + * be fully hidden from the guest. KVM disallows changing CPUID after + * KVM_RUN and AVIC is disabled if _any_ vCPU is allowed to use x2APIC. + */ + test_tpr(false); + test_tpr(true); +} From ff8071eb3aa54e96336ecce7c88b25f9a4d62383 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Nov 2025 14:20:18 -0800 Subject: [PATCH 006/267] KVM: VMX: Always reflect SGX EPCM #PFs back into the guest When handling intercepted #PFs, reflect EPCM (Enclave Page Cache Map) violations, i.e. #PFs with the SGX flag set, back into the guest. KVM doesn't shadow EPCM entries (the EPCM deals only with virtual/linear addresses), and so EPCM violation cannot be due to KVM interference, and more importantly can't be resolved by KVM. On pre-SGX2 hardware, EPCM violations are delivered as #GP(0) faults, but on SGX2+ hardware, they are delivered as #PF(SGX). Failure to account for the SGX2 behavior could put a vCPU into an infinite loop due to KVM not realizing the #PF is the guest's responsibility. Take care to deliver the EPCM violation as a #GP(0) if the _guest_ CPU model is only SGX1. Fixes: 72add915fbd5 ("KVM: VMX: Enable SGX virtualization for SGX1, SGX2 and LC") Cc: Kai Huang Reviewed-by: Richard Lyu Reviewed-by: Kai Huang Link: https://patch.msgid.link/20251121222018.348987-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 58 ++++++++++++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6b96f7aea20b..97d696014f9a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5303,12 +5303,53 @@ static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu) !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS); } +static int vmx_handle_page_fault(struct kvm_vcpu *vcpu, u32 error_code) +{ + unsigned long cr2 = vmx_get_exit_qual(vcpu); + + if (vcpu->arch.apf.host_apf_flags) + goto handle_pf; + + /* When using EPT, KVM intercepts #PF only to detect illegal GPAs. */ + WARN_ON_ONCE(enable_ept && !allow_smaller_maxphyaddr); + + /* + * On SGX2 hardware, EPCM violations are delivered as #PF with the SGX + * flag set in the error code (SGX1 hardware generates #GP(0)). EPCM + * violations have nothing to do with shadow paging and can never be + * resolved by KVM; always reflect them into the guest. 
+ */ + if (error_code & PFERR_SGX_MASK) { + WARN_ON_ONCE(!IS_ENABLED(CONFIG_X86_SGX_KVM) || + !cpu_feature_enabled(X86_FEATURE_SGX2)); + + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2)) + kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); + else + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* + * If EPT is enabled, fixup and inject the #PF. KVM intercepts #PFs + * only to set PFERR_RSVD as appropriate (hardware won't set RSVD due + * to the GPA being legal with respect to host.MAXPHYADDR). + */ + if (enable_ept) { + kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); + return 1; + } + +handle_pf: + return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); +} + static int handle_exception_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_run *kvm_run = vcpu->run; u32 intr_info, ex_no, error_code; - unsigned long cr2, dr6; + unsigned long dr6; u32 vect_info; vect_info = vmx->idt_vectoring_info; @@ -5383,19 +5424,8 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) return 0; } - if (is_page_fault(intr_info)) { - cr2 = vmx_get_exit_qual(vcpu); - if (enable_ept && !vcpu->arch.apf.host_apf_flags) { - /* - * EPT will cause page fault only if we need to - * detect illegal GPAs. - */ - WARN_ON_ONCE(!allow_smaller_maxphyaddr); - kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); - return 1; - } else - return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); - } + if (is_page_fault(intr_info)) + return vmx_handle_page_fault(vcpu, error_code); ex_no = intr_info & INTR_INFO_VECTOR_MASK; From 4b24910c056995c0c0fa7c1b142696443b05fd8e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:16:50 -0800 Subject: [PATCH 007/267] KVM: Add a simplified wrapper for registering perf callbacks Add a parameter-less API for registering perf callbacks in anticipation of introducing another x86-only parameter for handling mediated PMU PMIs. No functional change intended. Acked-by: Anup Patel Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-15-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/arm64/kvm/arm.c | 2 +- arch/loongarch/kvm/main.c | 2 +- arch/riscv/kvm/main.c | 2 +- arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 11 +++++++++-- virt/kvm/kvm_main.c | 5 +++-- 6 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 4f80da0c0d1d..3e6f184d6d04 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2357,7 +2357,7 @@ static int __init init_subsystems(void) if (err) goto out; - kvm_register_perf_callbacks(NULL); + kvm_register_perf_callbacks(); out: if (err) diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c index 80ea63d465b8..f62326fe29fa 100644 --- a/arch/loongarch/kvm/main.c +++ b/arch/loongarch/kvm/main.c @@ -394,7 +394,7 @@ static int kvm_loongarch_env_init(void) } kvm_init_gcsr_flag(); - kvm_register_perf_callbacks(NULL); + kvm_register_perf_callbacks(); /* Register LoongArch IPI interrupt controller interface. 
*/ ret = kvm_loongarch_register_ipi_device(); diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index 45536af521f0..0f3fe3986fc0 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -174,7 +174,7 @@ static int __init riscv_kvm_init(void) kvm_riscv_setup_vendor_features(); - kvm_register_perf_callbacks(NULL); + kvm_register_perf_callbacks(); rc = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); if (rc) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0c6d899d53dd..1b2827cecf38 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10107,7 +10107,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) set_hv_tscchange_cb(kvm_hyperv_tsc_notifier); #endif - kvm_register_perf_callbacks(ops->handle_intel_pt_intr); + __kvm_register_perf_callbacks(ops->handle_intel_pt_intr, NULL); if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled) kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d93f75b05ae2..8e410d1a63df 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1749,10 +1749,17 @@ static inline bool kvm_arch_intc_initialized(struct kvm *kvm) #ifdef CONFIG_GUEST_PERF_EVENTS unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu); -void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)); +void __kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void), + void (*mediated_pmi_handler)(void)); + +static inline void kvm_register_perf_callbacks(void) +{ + __kvm_register_perf_callbacks(NULL, NULL); +} + void kvm_unregister_perf_callbacks(void); #else -static inline void kvm_register_perf_callbacks(void *ign) {} +static inline void kvm_register_perf_callbacks(void) {} static inline void kvm_unregister_perf_callbacks(void) {} #endif /* CONFIG_GUEST_PERF_EVENTS */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 21a0d226d63f..d59cb53af76a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -6470,10 +6470,11 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { .handle_mediated_pmi = NULL, }; -void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)) +void __kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void), + void (*mediated_pmi_handler)(void)) { kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler; - kvm_guest_cbs.handle_mediated_pmi = NULL; + kvm_guest_cbs.handle_mediated_pmi = mediated_pmi_handler; perf_register_guest_info_callbacks(&kvm_guest_cbs); } From 3e51822b2fdf695bda50c2c3f88d6ab022a9e6f3 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 5 Dec 2025 16:16:52 -0800 Subject: [PATCH 008/267] KVM: x86/pmu: Start stubbing in mediated PMU support Introduce enable_mediated_pmu as a global variable, with the intent of exposing it to userspace a vendor module parameter, to control and reflect mediated vPMU support. Wire up the perf plumbing to create+release a mediated PMU, but defer exposing the parameter to userspace until KVM support for a mediated PMUs is fully landed. To (a) minimize compatibility issues, (b) to give userspace a chance to opt out of the restrictive side-effects of perf_create_mediated_pmu(), and (c) to avoid adding new dependencies between enabling an in-kernel irqchip and a mediated vPMU, defer "creating" a mediated PMU in perf until the first vCPU is created. Regarding userspace compatibility, an alternative solution would be to make the mediated PMU fully opt-in, e.g. to avoid unexpected failure due to perf_create_mediated_pmu() failing. 
Ironically, that approach creates an even bigger compatibility issue, as turning on enable_mediated_pmu would silently break VMMs that don't utilize KVM_CAP_PMU_CAPABILITY (well, silently until the guest tried to access PMU assets). Regarding an in-kernel irqchip, create a mediated PMU if and only if the VM has an in-kernel local APIC, as the mediated PMU will take a hard dependency on forwarding PMIs to the guest without bouncing through host userspace. Silently "drop" the PMU instead of rejecting KVM_CREATE_VCPU, as KVM's existing vPMU support doesn't function correctly if the local APIC is emulated by userspace, e.g. PMIs will never be delivered. I.e. it's far, far more likely that rejecting KVM_CREATE_VCPU would cause problems, e.g. for tests or userspace daemons that just want to probe basic KVM functionality. Note! Deliberately make mediated PMU creation "sticky", i.e. don't unwind it on failure to create a vCPU. Practically speaking, there's no harm to having a VM with a mediated PMU and no vCPUs. To avoid an "impossible" VM setup, reject KVM_CAP_PMU_CAPABILITY if a mediated PMU has been created, i.e. don't let userspace disable PMU support after failed vCPU creation (with PMU support enabled). Defer vendor specific requirements and constraints to the future. Suggested-by: Sean Christopherson Signed-off-by: Dapeng Mi Co-developed-by: Mingwei Zhang Signed-off-by: Mingwei Zhang Tested-by: Xudong Hao Co-developed-by: Sean Christopherson Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-17-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/pmu.c | 4 ++++ arch/x86/kvm/pmu.h | 7 +++++++ arch/x86/kvm/x86.c | 37 +++++++++++++++++++++++++++++++-- arch/x86/kvm/x86.h | 1 + 5 files changed, 48 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5a3bfa293e8b..defd979003be 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1484,6 +1484,7 @@ struct kvm_arch { bool bus_lock_detection_enabled; bool enable_pmu; + bool created_mediated_pmu; u32 notify_window; u32 notify_vmexit_flags; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 487ad19a236e..131e24246b09 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -135,6 +135,10 @@ void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) enable_pmu = false; } + if (!enable_pmu || !enable_mediated_pmu || !kvm_host_pmu.mediated || + !pmu_ops->is_mediated_pmu_supported(&kvm_host_pmu)) + enable_mediated_pmu = false; + if (!enable_pmu) { memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); return; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 5c3939e91f1d..a5c7c026b919 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -37,6 +37,8 @@ struct kvm_pmu_ops { void (*deliver_pmi)(struct kvm_vcpu *vcpu); void (*cleanup)(struct kvm_vcpu *vcpu); + bool (*is_mediated_pmu_supported)(struct x86_pmu_capability *host_pmu); + const u64 EVENTSEL_EVENT; const int MAX_NR_GP_COUNTERS; const int MIN_NR_GP_COUNTERS; @@ -58,6 +60,11 @@ static inline bool kvm_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) return pmu->version > 1; } +static inline bool kvm_vcpu_has_mediated_pmu(struct kvm_vcpu *vcpu) +{ + return enable_mediated_pmu && vcpu_to_pmu(vcpu)->version; +} + /* * KVM tracks all counters in 64-bit bitmaps, with general purpose counters * mapped to bits 31:0 and fixed counters mapped to 63:32, e.g. 
fixed counter 0 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1b2827cecf38..fb3a5e861553 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -183,6 +183,10 @@ bool __read_mostly enable_pmu = true; EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_pmu); module_param(enable_pmu, bool, 0444); +/* Enable/disabled mediated PMU virtualization. */ +bool __read_mostly enable_mediated_pmu; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_mediated_pmu); + bool __read_mostly eager_page_split = true; module_param(eager_page_split, bool, 0644); @@ -6854,7 +6858,7 @@ disable_exits_unlock: break; mutex_lock(&kvm->lock); - if (!kvm->created_vcpus) { + if (!kvm->created_vcpus && !kvm->arch.created_mediated_pmu) { kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE); r = 0; } @@ -12641,8 +12645,13 @@ static int sync_regs(struct kvm_vcpu *vcpu) return 0; } +#define PERF_MEDIATED_PMU_MSG \ + "Failed to enable mediated vPMU, try disabling system wide perf events and nmi_watchdog.\n" + int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { + int r; + if (kvm_check_tsc_unstable() && kvm->created_vcpus) pr_warn_once("SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); @@ -12653,7 +12662,29 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) if (id >= kvm->arch.max_vcpu_ids) return -EINVAL; - return kvm_x86_call(vcpu_precreate)(kvm); + /* + * Note, any actions done by .vcpu_create() must be idempotent with + * respect to creating multiple vCPUs, and therefore are not undone if + * creating a vCPU fails (including failure during pre-create). + */ + r = kvm_x86_call(vcpu_precreate)(kvm); + if (r) + return r; + + if (enable_mediated_pmu && kvm->arch.enable_pmu && + !kvm->arch.created_mediated_pmu) { + if (irqchip_in_kernel(kvm)) { + r = perf_create_mediated_pmu(); + if (r) { + pr_warn_ratelimited(PERF_MEDIATED_PMU_MSG); + return r; + } + kvm->arch.created_mediated_pmu = true; + } else { + kvm->arch.enable_pmu = false; + } + } + return 0; } int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) @@ -13319,6 +13350,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); mutex_unlock(&kvm->slots_lock); } + if (kvm->arch.created_mediated_pmu) + perf_release_mediated_pmu(); kvm_destroy_vcpus(kvm); kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); #ifdef CONFIG_KVM_IOAPIC diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index fdab0ad49098..6e1fb1680c0a 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -470,6 +470,7 @@ extern struct kvm_caps kvm_caps; extern struct kvm_host_values kvm_host; extern bool enable_pmu; +extern bool enable_mediated_pmu; /* * Get a filtered version of KVM's supported XCR0 that strips out dynamic From bfee4f07d88038f7e662652718e21c60b62ef3a1 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 5 Dec 2025 16:16:53 -0800 Subject: [PATCH 009/267] KVM: x86/pmu: Implement Intel mediated PMU requirements and constraints Implement Intel PMU requirements and constraints for mediated PMU support. Require host PMU version 4+ so that PERF_GLOBAL_STATUS_SET can be used to precisely load the guest's status value into hardware, and require full- width writes so that KVM can precisely load guest counter values. Disable PEBS and LBRs if mediated PMU support is enabled, as they won't be supported in the initial implementation. 
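As a rough illustration of why both capabilities are required, consider a hypothetical helper that loads guest counter and overflow state into hardware (the function name and placement are assumptions for illustration, not code from this series):

  static void load_guest_gp_state(struct kvm_pmu *pmu)
  {
          int i;

          /*
           * Full-width aliases: writes to the legacy counter MSRs take only
           * bits 31:0 and sign-extend bit 31, so they cannot restore an
           * arbitrary guest counter value.
           */
          for (i = 0; i < pmu->nr_arch_gp_counters; i++)
                  wrmsrq(MSR_IA32_PMC0 + i, pmu->gp_counters[i].counter);

          /*
           * PERF_GLOBAL_STATUS is read-only and OVF_CTRL can only clear
           * bits; the v4 GLOBAL_STATUS_SET MSR is the only way to re-arm
           * the guest's pending overflow bits.
           */
          wrmsrq(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status);
  }

Without either capability the restored state would be approximate at best, which defeats the point of a mediated PMU.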
Signed-off-by: Dapeng Mi Co-developed-by: Mingwei Zhang Signed-off-by: Mingwei Zhang [sean: split to separate patch, add full-width writes dependency] Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-18-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/capabilities.h | 3 ++- arch/x86/kvm/vmx/pmu_intel.c | 17 +++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 3 ++- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 02aadb9d730e..26302fd6dd9c 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -395,7 +395,8 @@ static inline bool vmx_pt_mode_is_host_guest(void) static inline bool vmx_pebs_supported(void) { - return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept; + return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept && + !enable_mediated_pmu; } static inline bool cpu_has_notify_vmexit(void) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index de1d9785c01f..050c21298213 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -767,6 +767,20 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) } } +static bool intel_pmu_is_mediated_pmu_supported(struct x86_pmu_capability *host_pmu) +{ + u64 host_perf_cap = 0; + + if (boot_cpu_has(X86_FEATURE_PDCM)) + rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); + + /* + * Require v4+ for MSR_CORE_PERF_GLOBAL_STATUS_SET, and full-width + * writes so that KVM can precisely load guest counter values. + */ + return host_pmu->version >= 4 && host_perf_cap & PERF_CAP_FW_WRITES; +} + struct kvm_pmu_ops intel_pmu_ops __initdata = { .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = intel_msr_idx_to_pmc, @@ -778,6 +792,9 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = { .reset = intel_pmu_reset, .deliver_pmi = intel_pmu_deliver_pmi, .cleanup = intel_pmu_cleanup, + + .is_mediated_pmu_supported = intel_pmu_is_mediated_pmu_supported, + .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_MAX_NR_INTEL_GP_COUNTERS, .MIN_NR_GP_COUNTERS = 1, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4cbe8c84b636..fdd18ad1ede3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7958,7 +7958,8 @@ static __init u64 vmx_get_perf_capabilities(void) if (boot_cpu_has(X86_FEATURE_PDCM)) rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); - if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { + if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR) && + !enable_mediated_pmu) { x86_perf_get_lbr(&vmx_lbr_caps); /* From 9ba0bb4ae76a8ee037257499165a4370306c0eac Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:16:54 -0800 Subject: [PATCH 010/267] KVM: x86/pmu: Implement AMD mediated PMU requirements Require host PMU version 2+ for AMD mediated PMU support, as PERF_GLOBAL_CTRL and friends are hard requirements for the mediated PMU. 
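Concretely, "PERF_GLOBAL_CTRL and friends" are the PerfMonV2 global MSRs. A hypothetical sketch (not part of this patch) of the guest state load they make possible:

  static void load_guest_global_state(struct kvm_pmu *pmu)
  {
          /* Quiesce all counters before touching per-counter state. */
          wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0);

          /* ... load per-counter event selectors and counts here ... */

          /* Re-arm pending guest overflows, then the guest's enables. */
          wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, pmu->global_status);
          wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, pmu->global_ctrl);
  }

Without PerfMonV2 there is no global enable, so counters could only be started and stopped one event selector at a time, leaving windows where host and guest activity mix.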
Signed-off-by: Dapeng Mi Co-developed-by: Mingwei Zhang Signed-off-by: Mingwei Zhang [sean: extract to separate patch, write changelog] Reviewed-by: Sandipan Das Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-19-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/pmu.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index bc062285fbf5..16c88b2a2eb8 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -227,6 +227,11 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu) } } +static bool amd_pmu_is_mediated_pmu_supported(struct x86_pmu_capability *host_pmu) +{ + return host_pmu->version >= 2; +} + struct kvm_pmu_ops amd_pmu_ops __initdata = { .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = amd_msr_idx_to_pmc, @@ -236,6 +241,9 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .set_msr = amd_pmu_set_msr, .refresh = amd_pmu_refresh, .init = amd_pmu_init, + + .is_mediated_pmu_supported = amd_pmu_is_mediated_pmu_supported, + .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_MAX_NR_AMD_GP_COUNTERS, .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS, From 1c4ba7286afba9842f295fc7a3dbe74acc6a92af Mon Sep 17 00:00:00 2001 From: Xiong Zhang Date: Fri, 5 Dec 2025 16:16:55 -0800 Subject: [PATCH 011/267] KVM: x86/pmu: Register PMI handler for mediated vPMU Register a dedicated PMI handler with perf's callback when mediated PMU support is enabled. Perf routes PMIs that arrive while guest context is loaded to the provided callback, by modifying the CPU's LVTPC to point at a dedicated mediated PMI IRQ vector. WARN upon receipt of a mediated PMI if there is no active vCPU, or if the vCPU doesn't have a mediated PMU. Even if a PMI manages to skid past VM-Exit, it should never be delayed all the way beyond unloading the vCPU. And while running vCPUs without a mediated PMU, the LVTPC should never be wired up to the mediated PMI IRQ vector, i.e. should always be routed through perf's NMI handler. 
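Sketching the intended delivery path end to end (the perf-side steps are assumptions based on the description above, not code from this patch):

  guest counter overflows while guest PMU state is loaded
    -> the local APIC delivers the dedicated mediated PMI vector, because
       perf re-pointed LVTPC at that vector instead of its usual NMI path
    -> perf's vector handler invokes the registered callback,
       kvm_handle_guest_mediated_pmi()
    -> kvm_make_request(KVM_REQ_PMI, vcpu)
    -> KVM emulates the PMI for the guest (kvm_pmu_deliver_pmi()) the next
       time vCPU requests are processed before re-entering the guest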
Signed-off-by: Xiong Zhang Signed-off-by: Mingwei Zhang Tested-by: Xudong Hao Co-developed-by: Sean Christopherson Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-20-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 10 ++++++++++ arch/x86/kvm/pmu.h | 2 ++ arch/x86/kvm/x86.c | 3 ++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 131e24246b09..0b67920fa069 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -157,6 +157,16 @@ void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); } +void kvm_handle_guest_mediated_pmi(void) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + + if (WARN_ON_ONCE(!vcpu || !kvm_vcpu_has_mediated_pmu(vcpu))) + return; + + kvm_make_request(KVM_REQ_PMI, vcpu); +} + static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index a5c7c026b919..9849c2bb720d 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -46,6 +46,8 @@ struct kvm_pmu_ops { void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); +void kvm_handle_guest_mediated_pmi(void); + static inline bool kvm_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) { /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fb3a5e861553..1623afddff3b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10111,7 +10111,8 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) set_hv_tscchange_cb(kvm_hyperv_tsc_notifier); #endif - __kvm_register_perf_callbacks(ops->handle_intel_pt_intr, NULL); + __kvm_register_perf_callbacks(ops->handle_intel_pt_intr, + enable_mediated_pmu ? kvm_handle_guest_mediated_pmi : NULL); if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled) kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM); From 80624272129eacc10cecb30f004cfa611be04770 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 5 Dec 2025 16:16:56 -0800 Subject: [PATCH 012/267] KVM: x86/pmu: Disable RDPMC interception for compatible mediated vPMU Disable RDPMC interception for vCPUs with a mediated vPMU that is compatible with the host PMU, i.e. that doesn't require KVM emulation of RDPMC to honor the guest's vCPU model. With a mediated vPMU, all guest state accessible via RDPMC is loaded into hardware while the guest is running. Adust RDPMC interception only for non-TDX guests, as the TDX module is responsible for managing RDPMC intercepts based on the TD configuration. 
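For illustration, the guest-side view of passthrough (a hypothetical guest snippet, not part of this patch): with the intercept cleared, the sequence below executes directly against the physical counter backing the guest PMC, without an exit to KVM.

  static inline u64 guest_rdpmc(u32 idx)
  {
          u32 lo, hi;

          /* ECX selects the counter; the count is returned in EDX:EAX. */
          asm volatile("rdpmc" : "=a"(lo), "=d"(hi) : "c"(idx));
          return lo | ((u64)hi << 32);
  }

This is also why the intercept must remain set whenever the guest's counter count or width differs from the host's: the raw hardware value would expose capabilities the vCPU model does not have.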
Co-developed-by: Mingwei Zhang Signed-off-by: Mingwei Zhang Co-developed-by: Sandipan Das Signed-off-by: Sandipan Das Signed-off-by: Dapeng Mi Tested-by: Xudong Hao Co-developed-by: Sean Christopherson Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-21-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 26 ++++++++++++++++++++++++++ arch/x86/kvm/pmu.h | 1 + arch/x86/kvm/svm/svm.c | 5 +++++ arch/x86/kvm/vmx/vmx.c | 7 +++++++ arch/x86/kvm/x86.c | 1 + 5 files changed, 40 insertions(+) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 0b67920fa069..fcecb4c21599 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -714,6 +714,32 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) return 0; } +bool kvm_need_rdpmc_intercept(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + if (!kvm_vcpu_has_mediated_pmu(vcpu)) + return true; + + /* + * VMware allows access to these Pseduo-PMCs even when read via RDPMC + * in Ring3 when CR4.PCE=0. + */ + if (enable_vmware_backdoor) + return true; + + /* + * Note! Check *host* PMU capabilities, not KVM's PMU capabilities, as + * KVM's capabilities are constrained based on KVM support, i.e. KVM's + * capabilities themselves may be a subset of hardware capabilities. + */ + return pmu->nr_arch_gp_counters != kvm_host_pmu.num_counters_gp || + pmu->nr_arch_fixed_counters != kvm_host_pmu.num_counters_fixed || + pmu->counter_bitmask[KVM_PMC_GP] != (BIT_ULL(kvm_host_pmu.bit_width_gp) - 1) || + pmu->counter_bitmask[KVM_PMC_FIXED] != (BIT_ULL(kvm_host_pmu.bit_width_fixed) - 1); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_need_rdpmc_intercept); + void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) { if (lapic_in_kernel(vcpu)) { diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 9849c2bb720d..506c203587ea 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -238,6 +238,7 @@ void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu); void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu); bool is_vmware_backdoor_pmc(u32 pmc_idx); +bool kvm_need_rdpmc_intercept(struct kvm_vcpu *vcpu); extern struct kvm_pmu_ops intel_pmu_ops; extern struct kvm_pmu_ops amd_pmu_ops; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f56c2d895011..ef43360b9282 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1011,6 +1011,11 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; } } + + if (kvm_need_rdpmc_intercept(vcpu)) + svm_set_intercept(svm, INTERCEPT_RDPMC); + else + svm_clr_intercept(svm, INTERCEPT_RDPMC); } static void svm_recalc_intercepts(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index fdd18ad1ede3..9f71ba99cf70 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4300,8 +4300,15 @@ static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) */ } +static void vmx_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) +{ + exec_controls_changebit(to_vmx(vcpu), CPU_BASED_RDPMC_EXITING, + kvm_need_rdpmc_intercept(vcpu)); +} + void vmx_recalc_intercepts(struct kvm_vcpu *vcpu) { + vmx_recalc_instruction_intercepts(vcpu); vmx_recalc_msr_intercepts(vcpu); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1623afddff3b..76e86eb358df 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3945,6 +3945,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 
vcpu->arch.perf_capabilities = data; kvm_pmu_refresh(vcpu); + kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu); break; case MSR_IA32_PRED_CMD: { u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB); From d3ba32d1ff2a206621475325c009ab5b51882de1 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 5 Dec 2025 16:16:57 -0800 Subject: [PATCH 013/267] KVM: x86/pmu: Load/save GLOBAL_CTRL via entry/exit fields for mediated PMU When running a guest with a mediated PMU, context switch PERF_GLOBAL_CTRL via the dedicated VMCS fields for both host and guest. For the host, always zero GLOBAL_CTRL on exit as the guest's state will still be loaded in hardware (KVM will context switch the bulk of PMU state outside of the inner run loop). For the guest, use the dedicated fields to atomically load and save PERF_GLOBAL_CTRL on all entry/exits. For now, require VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL support (introduced by Sapphire Rapids). KVM can support such CPUs by saving PERF_GLOBAL_CTRL via the MSR save list, a.k.a. the MSR auto-store list, but defer that support as it adds a small amount of complexity and is somewhat unique. To minimize VM-Entry latency, propagate IA32_PERF_GLOBAL_CTRL to the VMCS on-demand. But to minimize complexity, read IA32_PERF_GLOBAL_CTRL out of the VMCS on all non-failing VM-Exits. I.e. partially cache the MSR. KVM could track GLOBAL_CTRL as an EXREG and defer all reads, but writes are rare, i.e. the dirty tracking for an EXREG is unnecessary, and it's not obvious that shaving ~15-20 cycles per exit is meaningful given the total overhead associated with mediated PMU context switches. Suggested-by: Sean Christopherson Signed-off-by: Dapeng Mi Co-developed-by: Mingwei Zhang Signed-off-by: Mingwei Zhang Tested-by: Xudong Hao Co-developed-by: Sean Christopherson Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-22-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm-x86-pmu-ops.h | 2 ++ arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/pmu.c | 13 +++++++++-- arch/x86/kvm/pmu.h | 3 ++- arch/x86/kvm/vmx/capabilities.h | 6 +++++ arch/x86/kvm/vmx/pmu_intel.c | 25 ++++++++++++++++++++- arch/x86/kvm/vmx/vmx.c | 31 +++++++++++++++++++++++++- arch/x86/kvm/vmx/vmx.h | 3 ++- 8 files changed, 78 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index 9159bf1a4730..ad2cc82abf79 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -23,5 +23,7 @@ KVM_X86_PMU_OP_OPTIONAL(reset) KVM_X86_PMU_OP_OPTIONAL(deliver_pmi) KVM_X86_PMU_OP_OPTIONAL(cleanup) +KVM_X86_PMU_OP_OPTIONAL(write_global_ctrl) + #undef KVM_X86_PMU_OP #undef KVM_X86_PMU_OP_OPTIONAL diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index c85c50019523..b92ff87e3560 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -107,6 +107,7 @@ #define VM_EXIT_PT_CONCEAL_PIP 0x01000000 #define VM_EXIT_CLEAR_IA32_RTIT_CTL 0x02000000 #define VM_EXIT_LOAD_CET_STATE 0x10000000 +#define VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL 0x40000000 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index fcecb4c21599..4b896cbb3d53 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -103,7 +103,7 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) #undef __KVM_X86_PMU_OP } -void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) +void kvm_init_pmu_capability(struct kvm_pmu_ops 
*pmu_ops) { bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; @@ -139,6 +139,9 @@ void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) !pmu_ops->is_mediated_pmu_supported(&kvm_host_pmu)) enable_mediated_pmu = false; + if (!enable_mediated_pmu) + pmu_ops->write_global_ctrl = NULL; + if (!enable_pmu) { memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); return; @@ -834,6 +837,9 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) diff = pmu->global_ctrl ^ data; pmu->global_ctrl = data; reprogram_counters(pmu, diff); + + if (kvm_vcpu_has_mediated_pmu(vcpu)) + kvm_pmu_call(write_global_ctrl)(data); } break; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: @@ -928,8 +934,11 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) * in the global controls). Emulate that behavior when refreshing the * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL. */ - if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters) + if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters) { pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0); + if (kvm_vcpu_has_mediated_pmu(vcpu)) + kvm_pmu_call(write_global_ctrl)(pmu->global_ctrl); + } bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); bitmap_set(pmu->all_valid_pmc_idx, KVM_FIXED_PMC_BASE_IDX, diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 506c203587ea..2ff469334c1a 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -38,6 +38,7 @@ struct kvm_pmu_ops { void (*cleanup)(struct kvm_vcpu *vcpu); bool (*is_mediated_pmu_supported)(struct x86_pmu_capability *host_pmu); + void (*write_global_ctrl)(u64 global_ctrl); const u64 EVENTSEL_EVENT; const int MAX_NR_GP_COUNTERS; @@ -183,7 +184,7 @@ static inline bool pmc_is_locally_enabled(struct kvm_pmc *pmc) extern struct x86_pmu_capability kvm_pmu_cap; -void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops); +void kvm_init_pmu_capability(struct kvm_pmu_ops *pmu_ops); void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc); diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 26302fd6dd9c..4e371c93ae16 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -109,6 +109,12 @@ static inline bool cpu_has_load_cet_ctrl(void) { return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_CET_STATE); } + +static inline bool cpu_has_save_perf_global_ctrl(void) +{ + return vmcs_config.vmexit_ctrl & VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL; +} + static inline bool cpu_has_vmx_mpx(void) { return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 050c21298213..dbab7cca7a62 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -778,7 +778,29 @@ static bool intel_pmu_is_mediated_pmu_supported(struct x86_pmu_capability *host_ * Require v4+ for MSR_CORE_PERF_GLOBAL_STATUS_SET, and full-width * writes so that KVM can precisely load guest counter values. */ - return host_pmu->version >= 4 && host_perf_cap & PERF_CAP_FW_WRITES; + if (host_pmu->version < 4 || !(host_perf_cap & PERF_CAP_FW_WRITES)) + return false; + + /* + * All CPUs that support a mediated PMU are expected to support loading + * PERF_GLOBAL_CTRL via dedicated VMCS fields. 
+ */ + if (WARN_ON_ONCE(!cpu_has_load_perf_global_ctrl())) + return false; + + /* + * KVM doesn't yet support mediated PMU on CPUs without support for + * saving PERF_GLOBAL_CTRL via a dedicated VMCS field. + */ + if (!cpu_has_save_perf_global_ctrl()) + return false; + + return true; +} + +static void intel_pmu_write_global_ctrl(u64 global_ctrl) +{ + vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, global_ctrl); } struct kvm_pmu_ops intel_pmu_ops __initdata = { @@ -794,6 +816,7 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = { .cleanup = intel_pmu_cleanup, .is_mediated_pmu_supported = intel_pmu_is_mediated_pmu_supported, + .write_global_ctrl = intel_pmu_write_global_ctrl, .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_MAX_NR_INTEL_GP_COUNTERS, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9f71ba99cf70..72b92cea9d72 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4294,6 +4294,18 @@ static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept); } + if (enable_mediated_pmu) { + bool is_mediated_pmu = kvm_vcpu_has_mediated_pmu(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + + vm_entry_controls_changebit(vmx, + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, is_mediated_pmu); + + vm_exit_controls_changebit(vmx, + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | + VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL, is_mediated_pmu); + } + /* * x2APIC and LBR MSR intercepts are modified on-demand and cannot be * filtered by userspace. @@ -4476,6 +4488,16 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) vmcs_writel(HOST_SSP, 0); vmcs_writel(HOST_INTR_SSP_TABLE, 0); } + + /* + * When running a guest with a mediated PMU, guest state is resident in + * hardware after VM-Exit. Zero PERF_GLOBAL_CTRL on exit so that host + * activity doesn't bleed into the guest counters. When running with + * an emulated PMU, PERF_GLOBAL_CTRL is dynamically computed on every + * entry/exit to merge guest and host PMU usage. 
+ */ + if (enable_mediated_pmu) + vmcs_write64(HOST_IA32_PERF_GLOBAL_CTRL, 0); } void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) @@ -4543,7 +4565,8 @@ static u32 vmx_get_initial_vmexit_ctrl(void) VM_EXIT_CLEAR_IA32_RTIT_CTL); /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ return vmexit_ctrl & - ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); + ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER | + VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL); } void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) @@ -7270,6 +7293,9 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) struct perf_guest_switch_msr *msrs; struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); + if (kvm_vcpu_has_mediated_pmu(&vmx->vcpu)) + return; + pmu->host_cross_mapped_mask = 0; if (pmu->pebs_enable & pmu->global_ctrl) intel_pmu_cross_mapped_check(pmu); @@ -7572,6 +7598,9 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) vmx->loaded_vmcs->launched = 1; + if (!msr_write_intercepted(vmx, MSR_CORE_PERF_GLOBAL_CTRL)) + vcpu_to_pmu(vcpu)->global_ctrl = vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL); + vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index bc3ed3145d7e..d7a96c84371f 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -510,7 +510,8 @@ static inline u8 vmx_get_rvi(void) VM_EXIT_CLEAR_BNDCFGS | \ VM_EXIT_PT_CONCEAL_PIP | \ VM_EXIT_CLEAR_IA32_RTIT_CTL | \ - VM_EXIT_LOAD_CET_STATE) + VM_EXIT_LOAD_CET_STATE | \ + VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL) #define KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL \ (PIN_BASED_EXT_INTR_MASK | \ From 2904df6692f429853cf99b4b47c8592b2f49edaa Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 5 Dec 2025 16:16:58 -0800 Subject: [PATCH 014/267] KVM: x86/pmu: Disable interception of select PMU MSRs for mediated vPMUs For vCPUs with a mediated vPMU, disable interception of counter MSRs for PMCs that are exposed to the guest, and for GLOBAL_CTRL and related MSRs if they are fully supported according to the vCPU model, i.e. if the MSRs and all bits supported by hardware exist from the guest's point of view. Do NOT passthrough event selector or fixed counter control MSRs, so that KVM can enforce userspace-defined event filters, e.g. to prevent use of AnyThread events (which is unfortunately a setting in the fixed counter control MSR). Defer support for nested passthrough of mediated PMU MSRs to the future, as the logic for nested MSR interception is unfortunately vendor specific. 
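As a minimal sketch of why the fixed counter control MSR in particular must stay intercepted, assuming a filter policy that forbids AnyThread events outright (the helper below is hypothetical): each fixed counter owns a 4-bit field in IA32_FIXED_CTR_CTRL, and bit 2 of that field is the AnyThread enable, so the only way to enforce such a filter is to sanitize the value before it reaches hardware.

  static u64 strip_anythread_bits(struct kvm_pmu *pmu, u64 fixed_ctr_ctrl)
  {
          int i;

          /* Fixed counter i is programmed via bits [4i+3:4i]; 4i+2 = AnyThread. */
          for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
                  fixed_ctr_ctrl &= ~BIT_ULL(i * 4 + 2);

          return fixed_ctr_ctrl;
  }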
Suggested-by: Sean Christopherson Co-developed-by: Mingwei Zhang Signed-off-by: Mingwei Zhang Co-developed-by: Sandipan Das Signed-off-by: Sandipan Das Signed-off-by: Dapeng Mi [sean: squash patches, massage changelog, refresh VMX MSRs on filter change] Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-23-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 41 +++++++++++++++++-------- arch/x86/kvm/pmu.h | 1 + arch/x86/kvm/svm/svm.c | 36 ++++++++++++++++++++++ arch/x86/kvm/vmx/pmu_intel.c | 13 -------- arch/x86/kvm/vmx/pmu_intel.h | 15 +++++++++ arch/x86/kvm/vmx/vmx.c | 59 +++++++++++++++++++++++++++++------- 6 files changed, 128 insertions(+), 37 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 4b896cbb3d53..3e048c170b97 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -717,27 +717,41 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) return 0; } -bool kvm_need_rdpmc_intercept(struct kvm_vcpu *vcpu) +static bool kvm_need_any_pmc_intercept(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); if (!kvm_vcpu_has_mediated_pmu(vcpu)) return true; - /* - * VMware allows access to these Pseduo-PMCs even when read via RDPMC - * in Ring3 when CR4.PCE=0. - */ - if (enable_vmware_backdoor) - return true; - /* * Note! Check *host* PMU capabilities, not KVM's PMU capabilities, as * KVM's capabilities are constrained based on KVM support, i.e. KVM's * capabilities themselves may be a subset of hardware capabilities. */ return pmu->nr_arch_gp_counters != kvm_host_pmu.num_counters_gp || - pmu->nr_arch_fixed_counters != kvm_host_pmu.num_counters_fixed || + pmu->nr_arch_fixed_counters != kvm_host_pmu.num_counters_fixed; +} + +bool kvm_need_perf_global_ctrl_intercept(struct kvm_vcpu *vcpu) +{ + return kvm_need_any_pmc_intercept(vcpu) || + !kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_need_perf_global_ctrl_intercept); + +bool kvm_need_rdpmc_intercept(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + /* + * VMware allows access to these Pseduo-PMCs even when read via RDPMC + * in Ring3 when CR4.PCE=0. + */ + if (enable_vmware_backdoor) + return true; + + return kvm_need_any_pmc_intercept(vcpu) || pmu->counter_bitmask[KVM_PMC_GP] != (BIT_ULL(kvm_host_pmu.bit_width_gp) - 1) || pmu->counter_bitmask[KVM_PMC_FIXED] != (BIT_ULL(kvm_host_pmu.bit_width_fixed) - 1); } @@ -934,11 +948,12 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) * in the global controls). Emulate that behavior when refreshing the * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL. 
*/ - if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters) { + if (pmu->nr_arch_gp_counters && + (kvm_pmu_has_perf_global_ctrl(pmu) || kvm_vcpu_has_mediated_pmu(vcpu))) pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0); - if (kvm_vcpu_has_mediated_pmu(vcpu)) - kvm_pmu_call(write_global_ctrl)(pmu->global_ctrl); - } + + if (kvm_vcpu_has_mediated_pmu(vcpu)) + kvm_pmu_call(write_global_ctrl)(pmu->global_ctrl); bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); bitmap_set(pmu->all_valid_pmc_idx, KVM_FIXED_PMC_BASE_IDX, diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 2ff469334c1a..356b08e92bc9 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -239,6 +239,7 @@ void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu); void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu); bool is_vmware_backdoor_pmc(u32 pmc_idx); +bool kvm_need_perf_global_ctrl_intercept(struct kvm_vcpu *vcpu); bool kvm_need_rdpmc_intercept(struct kvm_vcpu *vcpu); extern struct kvm_pmu_ops intel_pmu_ops; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ef43360b9282..dca45f5151f9 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -730,6 +730,40 @@ void svm_vcpu_free_msrpm(void *msrpm) __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE)); } +static void svm_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu) +{ + bool intercept = !kvm_vcpu_has_mediated_pmu(vcpu); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + int i; + + if (!enable_mediated_pmu) + return; + + /* Legacy counters are always available for AMD CPUs with a PMU. */ + for (i = 0; i < min(pmu->nr_arch_gp_counters, AMD64_NUM_COUNTERS); i++) + svm_set_intercept_for_msr(vcpu, MSR_K7_PERFCTR0 + i, + MSR_TYPE_RW, intercept); + + intercept |= !guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE); + for (i = 0; i < pmu->nr_arch_gp_counters; i++) + svm_set_intercept_for_msr(vcpu, MSR_F15H_PERF_CTR + 2 * i, + MSR_TYPE_RW, intercept); + + for ( ; i < kvm_pmu_cap.num_counters_gp; i++) + svm_enable_intercept_for_msr(vcpu, MSR_F15H_PERF_CTR + 2 * i, + MSR_TYPE_RW); + + intercept = kvm_need_perf_global_ctrl_intercept(vcpu); + svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL, + MSR_TYPE_RW, intercept); + svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, + MSR_TYPE_RW, intercept); + svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, + MSR_TYPE_RW, intercept); + svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, + MSR_TYPE_RW, intercept); +} + static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -798,6 +832,8 @@ static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu) if (sev_es_guest(vcpu->kvm)) sev_es_recalc_msr_intercepts(vcpu); + svm_recalc_pmu_msr_intercepts(vcpu); + /* * x2APIC intercepts are modified on-demand and cannot be filtered by * userspace. 
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index dbab7cca7a62..820da47454d7 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -128,19 +128,6 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, return &counters[array_index_nospec(idx, num_counters)]; } -static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu) -{ - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) - return 0; - - return vcpu->arch.perf_capabilities; -} - -static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu) -{ - return (vcpu_get_perf_capabilities(vcpu) & PERF_CAP_FW_WRITES) != 0; -} - static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) { if (!fw_writes_is_enabled(pmu_to_vcpu(pmu))) diff --git a/arch/x86/kvm/vmx/pmu_intel.h b/arch/x86/kvm/vmx/pmu_intel.h index 5620d0882cdc..5d9357640aa1 100644 --- a/arch/x86/kvm/vmx/pmu_intel.h +++ b/arch/x86/kvm/vmx/pmu_intel.h @@ -4,6 +4,21 @@ #include +#include "cpuid.h" + +static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu) +{ + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) + return 0; + + return vcpu->arch.perf_capabilities; +} + +static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu) +{ + return (vcpu_get_perf_capabilities(vcpu) & PERF_CAP_FW_WRITES) != 0; +} + bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu); int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 72b92cea9d72..f0a20ff2a941 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4228,6 +4228,53 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) } } +static void vmx_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu) +{ + bool has_mediated_pmu = kvm_vcpu_has_mediated_pmu(vcpu); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool intercept = !has_mediated_pmu; + int i; + + if (!enable_mediated_pmu) + return; + + vm_entry_controls_changebit(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, + has_mediated_pmu); + + vm_exit_controls_changebit(vmx, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | + VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL, + has_mediated_pmu); + + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PERFCTR0 + i, + MSR_TYPE_RW, intercept); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PMC0 + i, MSR_TYPE_RW, + intercept || !fw_writes_is_enabled(vcpu)); + } + for ( ; i < kvm_pmu_cap.num_counters_gp; i++) { + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PERFCTR0 + i, + MSR_TYPE_RW, true); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PMC0 + i, + MSR_TYPE_RW, true); + } + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) + vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_FIXED_CTR0 + i, + MSR_TYPE_RW, intercept); + for ( ; i < kvm_pmu_cap.num_counters_fixed; i++) + vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_FIXED_CTR0 + i, + MSR_TYPE_RW, true); + + intercept = kvm_need_perf_global_ctrl_intercept(vcpu); + vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_STATUS, + MSR_TYPE_RW, intercept); + vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + MSR_TYPE_RW, intercept); + vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, + MSR_TYPE_RW, intercept); +} + static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) { bool intercept; @@ -4294,17 +4341,7 @@ static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept); } - if (enable_mediated_pmu) { 
-		bool is_mediated_pmu = kvm_vcpu_has_mediated_pmu(vcpu);
-		struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-		vm_entry_controls_changebit(vmx,
-			VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, is_mediated_pmu);
-
-		vm_exit_controls_changebit(vmx,
-			VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
-			VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL, is_mediated_pmu);
-	}
+	vmx_recalc_pmu_msr_intercepts(vcpu);

 	/*
 	 * x2APIC and LBR MSR intercepts are modified on-demand and cannot be

From 0ea0d6314870493ac723afefb6257be71f4c636f Mon Sep 17 00:00:00 2001
From: Dapeng Mi
Date: Fri, 5 Dec 2025 16:16:59 -0800
Subject: [PATCH 015/267] KVM: x86/pmu: Bypass perf checks when emulating mediated PMU counter accesses

When emulating a PMC counter read or write for a mediated PMU, bypass the
perf checks and emulated_counter logic as the counters aren't proxied
through perf, i.e. pmc->counter always holds the guest's up-to-date value,
and thus there's no need to defer emulated overflow checks.

Suggested-by: Sean Christopherson
Signed-off-by: Dapeng Mi
Co-developed-by: Mingwei Zhang
Signed-off-by: Mingwei Zhang
[sean: split from event filtering change, write shortlog+changelog]
Reviewed-by: Sandipan Das
Tested-by: Xudong Hao
Tested-by: Manali Shukla
Link: https://patch.msgid.link/20251206001720.468579-24-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/pmu.c | 5 +++++
 arch/x86/kvm/pmu.h | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 3e048c170b97..3a901587ca6b 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -379,6 +379,11 @@ static void pmc_update_sample_period(struct kvm_pmc *pmc)
 void pmc_write_counter(struct kvm_pmc *pmc, u64 val)
 {
+	if (kvm_vcpu_has_mediated_pmu(pmc->vcpu)) {
+		pmc->counter = val & pmc_bitmask(pmc);
+		return;
+	}
+
 	/*
 	 * Drop any unconsumed accumulated counts, the WRMSR is a write, not a
 	 * read-modify-write. Adjust the counter value so that its value is
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 356b08e92bc9..9a199109d672 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -111,6 +111,9 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
 {
 	u64 counter, enabled, running;

+	if (kvm_vcpu_has_mediated_pmu(pmc->vcpu))
+		return pmc->counter & pmc_bitmask(pmc);
+
 	counter = pmc->counter + pmc->emulated_counter;

 	if (pmc->perf_event && !pmc->is_paused)

From 02918f0077925994b04be147875b6de8b63ca249 Mon Sep 17 00:00:00 2001
From: Mingwei Zhang
Date: Fri, 5 Dec 2025 16:17:00 -0800
Subject: [PATCH 016/267] KVM: x86/pmu: Introduce eventsel_hw to prepare for pmu event filtering

Introduce eventsel_hw and fixed_ctr_ctrl_hw to store the actual HW value
in PMU event selector MSRs. The mediated PMU checks events before allowing
the event values to be written to the PMU MSRs. However, to match the HW
behavior, when a PMU event check fails, KVM should still allow the guest
to read back the value it wrote. This essentially requires an extra
variable to separate the guest-requested value from the actual PMU MSR
value.

Note, this only applies to event selectors.
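As a rough illustration of the split (not code from this patch; the filtering
behavior shown on the last line is added by a later patch in this series):

	/* Guest writes an event selector that the filter disallows. */
	pmc->eventsel = data;		/* guest-visible value, read back as written */
	pmc->eventsel_hw = data;	/* copy that is destined for hardware */

	/* Later, kvm_mediated_pmu_refresh_event_filter() neuters only the HW copy. */
	pmc->eventsel_hw &= ~ARCH_PERFMON_EVENTSEL_ENABLE;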
Signed-off-by: Mingwei Zhang Co-developed-by: Dapeng Mi Signed-off-by: Dapeng Mi Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-25-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/pmu.c | 7 +++++-- arch/x86/kvm/svm/pmu.c | 1 + arch/x86/kvm/vmx/pmu_intel.c | 2 ++ 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index defd979003be..e72357f64b19 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -529,6 +529,7 @@ struct kvm_pmc { */ u64 emulated_counter; u64 eventsel; + u64 eventsel_hw; struct perf_event *perf_event; struct kvm_vcpu *vcpu; /* @@ -557,6 +558,7 @@ struct kvm_pmu { unsigned nr_arch_fixed_counters; unsigned available_event_types; u64 fixed_ctr_ctrl; + u64 fixed_ctr_ctrl_hw; u64 fixed_ctr_ctrl_rsvd; u64 global_ctrl; u64 global_status; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 3a901587ca6b..a05366e4eef2 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -900,11 +900,14 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu) pmc->counter = 0; pmc->emulated_counter = 0; - if (pmc_is_gp(pmc)) + if (pmc_is_gp(pmc)) { pmc->eventsel = 0; + pmc->eventsel_hw = 0; + } } - pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; + pmu->fixed_ctr_ctrl = pmu->fixed_ctr_ctrl_hw = 0; + pmu->global_ctrl = pmu->global_status = 0; kvm_pmu_call(reset)(vcpu); } diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 16c88b2a2eb8..c1ec1962314e 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -166,6 +166,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) data &= ~pmu->reserved_bits; if (data != pmc->eventsel) { pmc->eventsel = data; + pmc->eventsel_hw = data; kvm_pmu_request_counter_reprogram(pmc); } return 0; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 820da47454d7..855240678300 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -61,6 +61,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) int i; pmu->fixed_ctr_ctrl = data; + pmu->fixed_ctr_ctrl_hw = data; for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { u8 new_ctrl = fixed_ctrl_field(data, i); u8 old_ctrl = fixed_ctrl_field(old_fixed_ctr_ctrl, i); @@ -430,6 +431,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data != pmc->eventsel) { pmc->eventsel = data; + pmc->eventsel_hw = data; kvm_pmu_request_counter_reprogram(pmc); } break; From 3db871fe185baca66e78b56a230e236af40f1027 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 5 Dec 2025 16:17:01 -0800 Subject: [PATCH 017/267] KVM: x86/pmu: Reprogram mediated PMU event selectors on event filter updates Refresh the event selectors that are programmed into hardware when a PMC is "reprogrammed" for a mediated PMU, i.e. if userspace changes the PMU event filters Note, KVM doesn't utilize the reprogramming infrastructure to handle counter overflow for mediated PMUs, as there's no need to reprogram a non-existent perf event. 
Suggested-by: Sean Christopherson
Signed-off-by: Dapeng Mi
Co-developed-by: Mingwei Zhang
Signed-off-by: Mingwei Zhang
[sean: add a helper to document behavior, split patch and rewrite changelog]
Tested-by: Xudong Hao
Tested-by: Manali Shukla
Link: https://patch.msgid.link/20251206001720.468579-26-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/pmu.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index a05366e4eef2..24f5c14715ef 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -520,6 +520,25 @@ static bool pmc_is_event_allowed(struct kvm_pmc *pmc)
 	return is_fixed_event_allowed(filter, pmc->idx);
 }

+static void kvm_mediated_pmu_refresh_event_filter(struct kvm_pmc *pmc)
+{
+	bool allowed = pmc_is_event_allowed(pmc);
+	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+	if (pmc_is_gp(pmc)) {
+		pmc->eventsel_hw &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+		if (allowed)
+			pmc->eventsel_hw |= pmc->eventsel &
+					    ARCH_PERFMON_EVENTSEL_ENABLE;
+	} else {
+		u64 mask = intel_fixed_bits_by_idx(pmc->idx - KVM_FIXED_PMC_BASE_IDX, 0xf);
+
+		pmu->fixed_ctr_ctrl_hw &= ~mask;
+		if (allowed)
+			pmu->fixed_ctr_ctrl_hw |= pmu->fixed_ctr_ctrl & mask;
+	}
+}
+
 static int reprogram_counter(struct kvm_pmc *pmc)
 {
 	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
@@ -528,6 +547,11 @@ static int reprogram_counter(struct kvm_pmc *pmc)
 	bool emulate_overflow;
 	u8 fixed_ctr_ctrl;

+	if (kvm_vcpu_has_mediated_pmu(pmu_to_vcpu(pmu))) {
+		kvm_mediated_pmu_refresh_event_filter(pmc);
+		return 0;
+	}
+
 	emulate_overflow = pmc_pause_counter(pmc);

 	if (!pmc_is_globally_enabled(pmc) || !pmc_is_locally_enabled(pmc) ||

From a2f4ba534cc5d681a2d017c82e282bb32d8447df Mon Sep 17 00:00:00 2001
From: Sandipan Das
Date: Fri, 5 Dec 2025 16:17:02 -0800
Subject: [PATCH 018/267] KVM: x86/pmu: Always stuff GuestOnly=1,HostOnly=0 for mediated PMCs on AMD

On AMD platforms, there is no way to restore PerfCntrGlobalCtl at VM-Entry
or clear it at VM-Exit. Since the register states will be restored before
entering and saved after exiting guest context, the counters can keep
ticking and even overflow, leading to chaos while still in host context.

To avoid this, intercept event selectors, which is already done by
mediated PMU.

In addition, always set the GuestOnly bit and clear the HostOnly bit for
PMU selectors on AMD. Doing so allows the counters to run only in guest
context even if their enable bits are still set after VM exit and before
the host/guest PMU context switch.
Signed-off-by: Sandipan Das Signed-off-by: Mingwei Zhang [sean: massage shortlog] Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-27-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/pmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index c1ec1962314e..6d5f791126b1 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -166,7 +166,8 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) data &= ~pmu->reserved_bits; if (data != pmc->eventsel) { pmc->eventsel = data; - pmc->eventsel_hw = data; + pmc->eventsel_hw = (data & ~AMD64_EVENTSEL_HOSTONLY) | + AMD64_EVENTSEL_GUESTONLY; kvm_pmu_request_counter_reprogram(pmc); } return 0; From 56bb2736975068cc03648718bb8e50a456ce7173 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 5 Dec 2025 16:17:03 -0800 Subject: [PATCH 019/267] KVM: x86/pmu: Load/put mediated PMU context when entering/exiting guest Implement the PMU "world switch" between host perf and guest mediated PMU. When loading guest state, call into perf to switch from host to guest, and then load guest state into hardware, and then reverse those actions when putting guest state. On the KVM side, when loading guest state, zero PERF_GLOBAL_CTRL to ensure all counters are disabled, then load selectors and counters, and finally call into vendor code to load control/status information. While VMX and SVM use different mechanisms to avoid counting host activity while guest controls are loaded, both implementations require PERF_GLOBAL_CTRL to be zeroed when the event selectors are in flux. When putting guest state, reverse the order, and save and zero controls and status prior to saving+zeroing selectors and counters. Defer clearing PERF_GLOBAL_CTRL to vendor code, as only SVM needs to manually clear the MSR; VMX configures PERF_GLOBAL_CTRL to be atomically cleared by the CPU on VM-Exit. Handle the difference in MSR layouts between Intel and AMD by communicating the bases and stride via kvm_pmu_ops. Because KVM requires Intel v4 (and full-width writes) and AMD v2, the MSRs to load/save are constant for a given vendor, i.e. do not vary based on the guest PMU, and do not vary based on host PMU (because KVM will simply disable mediated PMU support if the necessary MSRs are unsupported). Except for retrieving the guest's PERF_GLOBAL_CTRL, which needs to be read before invoking any fastpath handler (spoiler alert), perform the context switch around KVM's inner run loop. State only needs to be synchronized from hardware before KVM can access the software "caches". Note, VMX already grabs the guest's PERF_GLOBAL_CTRL immediately after VM-Exit, as hardware saves value into the VMCS. 
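Distilled from the code below, the intended ordering is roughly as follows (an
illustrative sketch, with LVTPC handling elided):

	/* kvm_mediated_pmu_load(), IRQs disabled: */
	perf_load_guest_context();			/* host perf stops using the PMU */
	wrmsrq(kvm_pmu_ops.PERF_GLOBAL_CTRL, 0);	/* quiesce counters before loading state */
	kvm_pmu_load_guest_pmcs(vcpu);			/* event selectors and counters */
	kvm_pmu_call(mediated_load)(vcpu);		/* vendor control/status state last */

	/* kvm_mediated_pmu_put(), reverse order: */
	kvm_pmu_call(mediated_put)(vcpu);		/* save and clear controls/status first */
	kvm_pmu_put_guest_pmcs(vcpu);			/* save counters, zero selectors/counters */
	perf_put_guest_context();			/* hand the PMU back to host perf */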
Co-developed-by: Mingwei Zhang Signed-off-by: Mingwei Zhang Co-developed-by: Sandipan Das Signed-off-by: Sandipan Das Signed-off-by: Dapeng Mi Tested-by: Xudong Hao Co-developed-by: Sean Christopherson Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-28-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm-x86-pmu-ops.h | 2 + arch/x86/include/asm/msr-index.h | 1 + arch/x86/kvm/pmu.c | 130 ++++++++++++++++++++++++- arch/x86/kvm/pmu.h | 10 ++ arch/x86/kvm/svm/pmu.c | 34 +++++++ arch/x86/kvm/svm/svm.c | 3 + arch/x86/kvm/vmx/pmu_intel.c | 44 +++++++++ arch/x86/kvm/x86.c | 4 + 8 files changed, 225 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index ad2cc82abf79..f0aa6996811f 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -24,6 +24,8 @@ KVM_X86_PMU_OP_OPTIONAL(deliver_pmi) KVM_X86_PMU_OP_OPTIONAL(cleanup) KVM_X86_PMU_OP_OPTIONAL(write_global_ctrl) +KVM_X86_PMU_OP(mediated_load) +KVM_X86_PMU_OP(mediated_put) #undef KVM_X86_PMU_OP #undef KVM_X86_PMU_OP_OPTIONAL diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3d0a0950d20a..4d3566bb1a93 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1219,6 +1219,7 @@ #define MSR_CORE_PERF_GLOBAL_STATUS 0x0000038e #define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f #define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390 +#define MSR_CORE_PERF_GLOBAL_STATUS_SET 0x00000391 #define MSR_PERF_METRICS 0x00000329 diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 24f5c14715ef..f6387c67b25c 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -880,10 +880,13 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) diff = pmu->global_ctrl ^ data; pmu->global_ctrl = data; reprogram_counters(pmu, diff); - - if (kvm_vcpu_has_mediated_pmu(vcpu)) - kvm_pmu_call(write_global_ctrl)(data); } + /* + * Unconditionally forward writes to vendor code, i.e. to the + * VMC{B,S}, as pmu->global_ctrl is per-VCPU, not per-VMC{B,S}. + */ + if (kvm_vcpu_has_mediated_pmu(vcpu)) + kvm_pmu_call(write_global_ctrl)(data); break; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: /* @@ -1244,3 +1247,124 @@ cleanup: kfree(filter); return r; } + +static __always_inline u32 fixed_counter_msr(u32 idx) +{ + return kvm_pmu_ops.FIXED_COUNTER_BASE + idx * kvm_pmu_ops.MSR_STRIDE; +} + +static __always_inline u32 gp_counter_msr(u32 idx) +{ + return kvm_pmu_ops.GP_COUNTER_BASE + idx * kvm_pmu_ops.MSR_STRIDE; +} + +static __always_inline u32 gp_eventsel_msr(u32 idx) +{ + return kvm_pmu_ops.GP_EVENTSEL_BASE + idx * kvm_pmu_ops.MSR_STRIDE; +} + +static void kvm_pmu_load_guest_pmcs(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + u32 i; + + /* + * No need to zero out unexposed GP/fixed counters/selectors since RDPMC + * is intercepted if hardware has counters that aren't visible to the + * guest (KVM will inject #GP as appropriate). 
+ */ + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { + pmc = &pmu->gp_counters[i]; + + wrmsrl(gp_counter_msr(i), pmc->counter); + wrmsrl(gp_eventsel_msr(i), pmc->eventsel_hw); + } + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmc = &pmu->fixed_counters[i]; + + wrmsrl(fixed_counter_msr(i), pmc->counter); + } +} + +void kvm_mediated_pmu_load(struct kvm_vcpu *vcpu) +{ + if (!kvm_vcpu_has_mediated_pmu(vcpu) || + KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm)) + return; + + lockdep_assert_irqs_disabled(); + + perf_load_guest_context(); + + /* + * Explicitly clear PERF_GLOBAL_CTRL, as "loading" the guest's context + * disables all individual counters (if any were enabled), but doesn't + * globally disable the entire PMU. Loading event selectors and PMCs + * with guest values while PERF_GLOBAL_CTRL is non-zero will generate + * unexpected events and PMIs. + * + * VMX will enable/disable counters at VM-Enter/VM-Exit by atomically + * loading PERF_GLOBAL_CONTROL. SVM effectively performs the switch by + * configuring all events to be GUEST_ONLY. Clear PERF_GLOBAL_CONTROL + * even for SVM to minimize the damage if a perf event is left enabled, + * and to ensure a consistent starting state. + */ + wrmsrq(kvm_pmu_ops.PERF_GLOBAL_CTRL, 0); + + perf_load_guest_lvtpc(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTPC)); + + kvm_pmu_load_guest_pmcs(vcpu); + + kvm_pmu_call(mediated_load)(vcpu); +} + +static void kvm_pmu_put_guest_pmcs(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + u32 i; + + /* + * Clear selectors and counters to ensure hardware doesn't count using + * guest controls when the host (perf) restores its state. + */ + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { + pmc = &pmu->gp_counters[i]; + + pmc->counter = rdpmc(i); + if (pmc->counter) + wrmsrq(gp_counter_msr(i), 0); + if (pmc->eventsel_hw) + wrmsrq(gp_eventsel_msr(i), 0); + } + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmc = &pmu->fixed_counters[i]; + + pmc->counter = rdpmc(INTEL_PMC_FIXED_RDPMC_BASE | i); + if (pmc->counter) + wrmsrq(fixed_counter_msr(i), 0); + } +} + +void kvm_mediated_pmu_put(struct kvm_vcpu *vcpu) +{ + if (!kvm_vcpu_has_mediated_pmu(vcpu) || + KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm)) + return; + + lockdep_assert_irqs_disabled(); + + /* + * Defer handling of PERF_GLOBAL_CTRL to vendor code. On Intel, it's + * atomically cleared on VM-Exit, i.e. doesn't need to be clear here. 
+ */ + kvm_pmu_call(mediated_put)(vcpu); + + kvm_pmu_put_guest_pmcs(vcpu); + + perf_put_guest_lvtpc(); + + perf_put_guest_context(); +} diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 9a199109d672..25b583da9ee2 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -38,11 +38,19 @@ struct kvm_pmu_ops { void (*cleanup)(struct kvm_vcpu *vcpu); bool (*is_mediated_pmu_supported)(struct x86_pmu_capability *host_pmu); + void (*mediated_load)(struct kvm_vcpu *vcpu); + void (*mediated_put)(struct kvm_vcpu *vcpu); void (*write_global_ctrl)(u64 global_ctrl); const u64 EVENTSEL_EVENT; const int MAX_NR_GP_COUNTERS; const int MIN_NR_GP_COUNTERS; + + const u32 PERF_GLOBAL_CTRL; + const u32 GP_EVENTSEL_BASE; + const u32 GP_COUNTER_BASE; + const u32 FIXED_COUNTER_BASE; + const u32 MSR_STRIDE; }; void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); @@ -240,6 +248,8 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu); void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu); +void kvm_mediated_pmu_load(struct kvm_vcpu *vcpu); +void kvm_mediated_pmu_put(struct kvm_vcpu *vcpu); bool is_vmware_backdoor_pmc(u32 pmc_idx); bool kvm_need_perf_global_ctrl_intercept(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 6d5f791126b1..7aa298eeb072 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -234,6 +234,32 @@ static bool amd_pmu_is_mediated_pmu_supported(struct x86_pmu_capability *host_pm return host_pmu->version >= 2; } +static void amd_mediated_pmu_load(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u64 global_status; + + rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, global_status); + /* Clear host global_status MSR if non-zero. 
*/ + if (global_status) + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, global_status); + + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, pmu->global_status); + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, pmu->global_ctrl); +} + +static void amd_mediated_pmu_put(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); + rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, pmu->global_status); + + /* Clear global status bits if non-zero */ + if (pmu->global_status) + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, pmu->global_status); +} + struct kvm_pmu_ops amd_pmu_ops __initdata = { .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = amd_msr_idx_to_pmc, @@ -245,8 +271,16 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .init = amd_pmu_init, .is_mediated_pmu_supported = amd_pmu_is_mediated_pmu_supported, + .mediated_load = amd_mediated_pmu_load, + .mediated_put = amd_mediated_pmu_put, .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_MAX_NR_AMD_GP_COUNTERS, .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS, + + .PERF_GLOBAL_CTRL = MSR_AMD64_PERF_CNTR_GLOBAL_CTL, + .GP_EVENTSEL_BASE = MSR_F15H_PERF_CTL0, + .GP_COUNTER_BASE = MSR_F15H_PERF_CTR0, + .FIXED_COUNTER_BASE = 0, + .MSR_STRIDE = 2, }; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index dca45f5151f9..1a616eb3ff1c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4367,6 +4367,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET; + if (!msr_write_intercepted(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL)) + rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, vcpu_to_pmu(vcpu)->global_ctrl); + trace_kvm_exit(vcpu, KVM_ISA_SVM); svm_complete_interrupts(vcpu); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 855240678300..55249fa4db95 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -792,6 +792,42 @@ static void intel_pmu_write_global_ctrl(u64 global_ctrl) vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, global_ctrl); } + +static void intel_mediated_pmu_load(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u64 global_status, toggle; + + rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, global_status); + toggle = pmu->global_status ^ global_status; + if (global_status & toggle) + wrmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status & toggle); + if (pmu->global_status & toggle) + wrmsrq(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status & toggle); + + wrmsrq(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl_hw); +} + +static void intel_mediated_pmu_put(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + /* MSR_CORE_PERF_GLOBAL_CTRL is already saved at VM-exit. */ + rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status); + + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */ + if (pmu->global_status) + wrmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status); + + /* + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and + * also to avoid accidentally enabling fixed counters (based on guest + * state) while running in the host, e.g. when setting global ctrl. 
+ */ + if (pmu->fixed_ctr_ctrl_hw) + wrmsrq(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); +} + struct kvm_pmu_ops intel_pmu_ops __initdata = { .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = intel_msr_idx_to_pmc, @@ -805,9 +841,17 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = { .cleanup = intel_pmu_cleanup, .is_mediated_pmu_supported = intel_pmu_is_mediated_pmu_supported, + .mediated_load = intel_mediated_pmu_load, + .mediated_put = intel_mediated_pmu_put, .write_global_ctrl = intel_pmu_write_global_ctrl, .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_MAX_NR_INTEL_GP_COUNTERS, .MIN_NR_GP_COUNTERS = 1, + + .PERF_GLOBAL_CTRL = MSR_CORE_PERF_GLOBAL_CTRL, + .GP_EVENTSEL_BASE = MSR_P6_EVNTSEL0, + .GP_COUNTER_BASE = MSR_IA32_PMC0, + .FIXED_COUNTER_BASE = MSR_CORE_PERF_FIXED_CTR0, + .MSR_STRIDE = 1, }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 76e86eb358df..589a309259f4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11334,6 +11334,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) run_flags |= KVM_RUN_LOAD_DEBUGCTL; vcpu->arch.host_debugctl = debug_ctl; + kvm_mediated_pmu_load(vcpu); + guest_timing_enter_irqoff(); /* @@ -11372,6 +11374,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_load_host_pkru(vcpu); + kvm_mediated_pmu_put(vcpu); + /* * Do this here before restoring debug registers on the host. And * since we do this before handling the vmexit, a DR access vmexit From f7a65e58d64340c3c0e390ea4e1c4857cd451f1f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:04 -0800 Subject: [PATCH 020/267] KVM: x86/pmu: Disallow emulation in the fastpath if mediated PMCs are active Don't handle exits in the fastpath if emulation is required, i.e. if an instruction needs to be skipped, the mediated PMU is enabled, and one or more PMCs is counting instructions. With the mediated PMU, KVM's cache of PMU state is inconsistent with respect to hardware until KVM exits the inner run loop (when the mediated PMU is "put"). 
Reviewed-by: Sandipan Das Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-29-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.h | 10 ++++++++++ arch/x86/kvm/x86.c | 9 +++++++++ 2 files changed, 19 insertions(+) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 25b583da9ee2..0925246731cb 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -234,6 +234,16 @@ static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc) return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); } +static inline bool kvm_pmu_is_fastpath_emulation_allowed(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + return !kvm_vcpu_has_mediated_pmu(vcpu) || + !bitmap_intersects(pmu->pmc_counting_instructions, + (unsigned long *)&pmu->global_ctrl, + X86_PMC_IDX_MAX); +} + void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 589a309259f4..4683df775b0a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2215,6 +2215,9 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_invd); fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu) { + if (!kvm_pmu_is_fastpath_emulation_allowed(vcpu)) + return EXIT_FASTPATH_NONE; + if (!kvm_emulate_invd(vcpu)) return EXIT_FASTPATH_EXIT_USERSPACE; @@ -2271,6 +2274,9 @@ static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) static fastpath_t __handle_fastpath_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { + if (!kvm_pmu_is_fastpath_emulation_allowed(vcpu)) + return EXIT_FASTPATH_NONE; + switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic) || @@ -11714,6 +11720,9 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_halt); fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu) { + if (!kvm_pmu_is_fastpath_emulation_allowed(vcpu)) + return EXIT_FASTPATH_NONE; + if (!kvm_emulate_halt(vcpu)) return EXIT_FASTPATH_EXIT_USERSPACE; From 283a5aa57b2223abf2f73afcc714c4d4553660f2 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 5 Dec 2025 16:17:05 -0800 Subject: [PATCH 021/267] KVM: x86/pmu: Handle emulated instruction for mediated vPMU Mediated vPMU needs to accumulate the emulated instructions into counter and load the counter into HW at vm-entry. Moreover, if the accumulation leads to counter overflow, KVM needs to update GLOBAL_STATUS and inject PMI into guest as well. 
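As a concrete example, using the helpers added by the diff below (illustrative
only; assumes a 48-bit wide counter, so pmc_bitmask() yields (1ULL << 48) - 1):

	/* Counter is at its maximum value when an instruction is emulated. */
	pmc->counter = (0xffffffffffffULL + 1) & pmc_bitmask(pmc);	/* wraps to 0 */
	if (!pmc->counter) {
		pmu->global_status |= BIT_ULL(pmc->idx);	/* latch overflow for the guest */
		if (pmc_is_pmi_enabled(pmc))
			kvm_make_request(KVM_REQ_PMI, vcpu);	/* deliver the PMI */
	}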
Suggested-by: Sean Christopherson Signed-off-by: Dapeng Mi Signed-off-by: Mingwei Zhang Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-30-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index f6387c67b25c..b78ad897886d 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -1031,10 +1031,45 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) kvm_pmu_reset(vcpu); } +static bool pmc_is_pmi_enabled(struct kvm_pmc *pmc) +{ + u8 fixed_ctr_ctrl; + + if (pmc_is_gp(pmc)) + return pmc->eventsel & ARCH_PERFMON_EVENTSEL_INT; + + fixed_ctr_ctrl = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, + pmc->idx - KVM_FIXED_PMC_BASE_IDX); + return fixed_ctr_ctrl & INTEL_FIXED_0_ENABLE_PMI; +} + static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) { - pmc->emulated_counter++; - kvm_pmu_request_counter_reprogram(pmc); + struct kvm_vcpu *vcpu = pmc->vcpu; + + /* + * For perf-based PMUs, accumulate software-emulated events separately + * from pmc->counter, as pmc->counter is offset by the count of the + * associated perf event. Request reprogramming, which will consult + * both emulated and hardware-generated events to detect overflow. + */ + if (!kvm_vcpu_has_mediated_pmu(vcpu)) { + pmc->emulated_counter++; + kvm_pmu_request_counter_reprogram(pmc); + return; + } + + /* + * For mediated PMUs, pmc->counter is updated when the vCPU's PMU is + * put, and will be loaded into hardware when the PMU is loaded. Simply + * increment the counter and signal overflow if it wraps to zero. + */ + pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); + if (!pmc->counter) { + pmc_to_pmu(pmc)->global_status |= BIT_ULL(pmc->idx); + if (pmc_is_pmi_enabled(pmc)) + kvm_make_request(KVM_REQ_PMI, vcpu); + } } static inline bool cpl_is_matched(struct kvm_pmc *pmc) From cb58327c4c8ad9e81d3a2f17adaf3ab57066f369 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 5 Dec 2025 16:17:06 -0800 Subject: [PATCH 022/267] KVM: nVMX: Add macros to simplify nested MSR interception setting Add macros nested_vmx_merge_msr_bitmaps_xxx() to simplify nested MSR interception setting. No function change intended. Suggested-by: Sean Christopherson Signed-off-by: Dapeng Mi Signed-off-by: Mingwei Zhang Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-31-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 40777278eabb..b56ed2b1ac67 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -617,6 +617,19 @@ static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, msr_bitmap_l0, msr); } +#define nested_vmx_merge_msr_bitmaps(msr, type) \ + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, \ + msr_bitmap_l0, msr, type) + +#define nested_vmx_merge_msr_bitmaps_read(msr) \ + nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_R) + +#define nested_vmx_merge_msr_bitmaps_write(msr) \ + nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_W) + +#define nested_vmx_merge_msr_bitmaps_rw(msr) \ + nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_RW) + /* * Merge L0's and L1's MSR bitmap, return false to indicate that * we do not use the hardware. 
@@ -700,23 +713,13 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. */ #ifdef CONFIG_X86_64 - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, - MSR_FS_BASE, MSR_TYPE_RW); - - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, - MSR_GS_BASE, MSR_TYPE_RW); - - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, - MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + nested_vmx_merge_msr_bitmaps_rw(MSR_FS_BASE); + nested_vmx_merge_msr_bitmaps_rw(MSR_GS_BASE); + nested_vmx_merge_msr_bitmaps_rw(MSR_KERNEL_GS_BASE); #endif - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, - MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); - - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, - MSR_IA32_PRED_CMD, MSR_TYPE_W); - - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, - MSR_IA32_FLUSH_CMD, MSR_TYPE_W); + nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_SPEC_CTRL); + nested_vmx_merge_msr_bitmaps_write(MSR_IA32_PRED_CMD); + nested_vmx_merge_msr_bitmaps_write(MSR_IA32_FLUSH_CMD); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_APERF, MSR_TYPE_R); From 88ebc2a3199cb5f16aff20673ed97b63a4295989 Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Fri, 5 Dec 2025 16:17:07 -0800 Subject: [PATCH 023/267] KVM: nVMX: Disable PMU MSR interception as appropriate while running L2 Merge KVM's PMU MSR interception bitmaps with those of L1, i.e. merge the bitmaps of vmcs01 and vmcs12, e.g. so that KVM doesn't interpose on MSR accesses unnecessarily if L1 exposes a mediated PMU (or equivalent) to L2. Signed-off-by: Mingwei Zhang Co-developed-by: Dapeng Mi Signed-off-by: Dapeng Mi [sean: rewrite changelog and comment, omit MSRs that are always intercepted] Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-32-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b56ed2b1ac67..729cc1f05ac8 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -630,6 +630,34 @@ static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, #define nested_vmx_merge_msr_bitmaps_rw(msr) \ nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_RW) +static void nested_vmx_merge_pmu_msr_bitmaps(struct kvm_vcpu *vcpu, + unsigned long *msr_bitmap_l1, + unsigned long *msr_bitmap_l0) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + int i; + + /* + * Skip the merges if the vCPU doesn't have a mediated PMU MSR, i.e. if + * none of the MSRs can possibly be passed through to L1. + */ + if (!kvm_vcpu_has_mediated_pmu(vcpu)) + return; + + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { + nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_PERFCTR0 + i); + nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_PMC0 + i); + } + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) + nested_vmx_merge_msr_bitmaps_rw(MSR_CORE_PERF_FIXED_CTR0 + i); + + nested_vmx_merge_msr_bitmaps_rw(MSR_CORE_PERF_GLOBAL_CTRL); + nested_vmx_merge_msr_bitmaps_read(MSR_CORE_PERF_GLOBAL_STATUS); + nested_vmx_merge_msr_bitmaps_write(MSR_CORE_PERF_GLOBAL_OVF_CTRL); +} + /* * Merge L0's and L1's MSR bitmap, return false to indicate that * we do not use the hardware. 
@@ -745,6 +773,8 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_PL3_SSP, MSR_TYPE_RW); + nested_vmx_merge_pmu_msr_bitmaps(vcpu, msr_bitmap_l1, msr_bitmap_l0); + kvm_vcpu_unmap(vcpu, &map); vmx->nested.force_msr_bitmap_recalc = false; From 3b36160d9406863812883c96c1efc8bc5c04e2cc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:08 -0800 Subject: [PATCH 024/267] KVM: nSVM: Disable PMU MSR interception as appropriate while running L2 Add MSRs that might be passed through to L1 when running with a mediated PMU to the nested SVM's set of to-be-merged MSR indices, i.e. disable interception of PMU MSRs when running L2 if both KVM (L0) and L1 disable interception. There is no need for KVM to interpose on such MSR accesses, e.g. if L1 exposes a mediated PMU (or equivalent) to L2. Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-33-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index c81005b24522..9ca8dad9a7f3 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -194,7 +194,7 @@ void recalc_intercepts(struct vcpu_svm *svm) * Hardcode the capacity of the array based on the maximum number of _offsets_. * MSRs are batched together, so there are fewer offsets than MSRs. */ -static int nested_svm_msrpm_merge_offsets[7] __ro_after_init; +static int nested_svm_msrpm_merge_offsets[10] __ro_after_init; static int nested_svm_nr_msrpm_merge_offsets __ro_after_init; typedef unsigned long nsvm_msrpm_merge_t; @@ -222,6 +222,22 @@ int __init nested_svm_init_msrpm_merge_offsets(void) MSR_IA32_LASTBRANCHTOIP, MSR_IA32_LASTINTFROMIP, MSR_IA32_LASTINTTOIP, + + MSR_K7_PERFCTR0, + MSR_K7_PERFCTR1, + MSR_K7_PERFCTR2, + MSR_K7_PERFCTR3, + MSR_F15H_PERF_CTR0, + MSR_F15H_PERF_CTR1, + MSR_F15H_PERF_CTR2, + MSR_F15H_PERF_CTR3, + MSR_F15H_PERF_CTR4, + MSR_F15H_PERF_CTR5, + + MSR_AMD64_PERF_CNTR_GLOBAL_CTL, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, }; int i, j; From 860bcb1021f5234820592853d56ca12f69e9c81f Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 5 Dec 2025 16:17:09 -0800 Subject: [PATCH 025/267] KVM: x86/pmu: Expose enable_mediated_pmu parameter to user space Expose enable_mediated_pmu parameter to user space, i.e. allow userspace to enable/disable mediated vPMU support. Document the mediated versus perf-based behavior as part of the kernel-parameters.txt entry, and opportunistically add an entry for the core enable_pmu param as well. Signed-off-by: Dapeng Mi Signed-off-by: Mingwei Zhang Tested-by: Xudong Hao Co-developed-by: Sean Christopherson Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-34-seanjc@google.com Signed-off-by: Sean Christopherson --- .../admin-guide/kernel-parameters.txt | 49 +++++++++++++++++++ arch/x86/kvm/svm/svm.c | 2 + arch/x86/kvm/vmx/vmx.c | 2 + 3 files changed, 53 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a8d0afde7f85..c13a8877f5b3 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3044,6 +3044,26 @@ Kernel parameters Default is Y (on). 
+ kvm.enable_pmu=[KVM,X86] + If enabled, KVM will virtualize PMU functionality based + on the virtual CPU model defined by userspace. This + can be overridden on a per-VM basis via + KVM_CAP_PMU_CAPABILITY. + + If disabled, KVM will not virtualize PMU functionality, + e.g. MSRs, PMCs, PMIs, etc., even if userspace defines + a virtual CPU model that contains PMU assets. + + Note, KVM's vPMU support implicitly requires running + with an in-kernel local APIC, e.g. to deliver PMIs to + the guest. Running without an in-kernel local APIC is + not supported, though KVM will allow such a combination + (with severely degraded functionality). + + See also enable_mediated_pmu. + + Default is Y (on). + kvm.enable_virt_at_load=[KVM,ARM64,LOONGARCH,MIPS,RISCV,X86] If enabled, KVM will enable virtualization in hardware when KVM is loaded, and disable virtualization when KVM @@ -3090,6 +3110,35 @@ Kernel parameters If the value is 0 (the default), KVM will pick a period based on the ratio, such that a page is zapped after 1 hour on average. + kvm-{amd,intel}.enable_mediated_pmu=[KVM,AMD,INTEL] + If enabled, KVM will provide a mediated virtual PMU, + instead of the default perf-based virtual PMU (if + kvm.enable_pmu is true and PMU is enumerated via the + virtual CPU model). + + With a perf-based vPMU, KVM operates as a user of perf, + i.e. emulates guest PMU counters using perf events. + KVM-created perf events are managed by perf as regular + (guest-only) events, e.g. are scheduled in/out, contend + for hardware resources, etc. Using a perf-based vPMU + allows guest and host usage of the PMU to co-exist, but + incurs non-trivial overhead and can result in silently + dropped guest events (due to resource contention). + + With a mediated vPMU, hardware PMU state is context + switched around the world switch to/from the guest. + KVM mediates which events the guest can utilize, but + gives the guest direct access to all other PMU assets + when possible (KVM may intercept some accesses if the + virtual CPU model provides a subset of hardware PMU + functionality). Using a mediated vPMU significantly + reduces PMU virtualization overhead and eliminates lost + guest events, but is mutually exclusive with using perf + to profile KVM guests and adds latency to most VM-Exits + (to context switch PMU state). + + Default is N (off). + kvm-amd.nested= [KVM,AMD] Control nested virtualization feature in KVM/SVM. Default is 1 (enabled). 
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 1a616eb3ff1c..5910088fe22a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -170,6 +170,8 @@ module_param(intercept_smi, bool, 0444); bool vnmi = true; module_param(vnmi, bool, 0444); +module_param(enable_mediated_pmu, bool, 0444); + static bool svm_gp_erratum_intercept = true; static u8 rsm_ins_bytes[] = "\x0f\xaa"; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f0a20ff2a941..62ba2a2b9e98 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -150,6 +150,8 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); extern bool __read_mostly allow_smaller_maxphyaddr; module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); +module_param(enable_mediated_pmu, bool, 0444); + #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE #define KVM_VM_CR0_ALWAYS_ON \ From b0b6a8d3be16ea742bf835407e9968378c0c753c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:10 -0800 Subject: [PATCH 026/267] KVM: x86/pmu: Elide WRMSRs when loading guest PMCs if values already match When loading a mediated PMU state, elide the WRMSRs to load PMCs with the guest's value if the value in hardware already matches the guest's value. For the relatively common case where neither the guest nor the host is actively using the PMU, i.e. when all/many counters are '0', eliding the WRMSRs reduces the latency of handling VM-Exit by a measurable amount (WRMSR is significantly more expensive than RDPMC). As measured by KVM-Unit-Tests' CPUID VM-Exit testcase, this provides a a ~25% reduction in latency (4k => 3k cycles) on Intel Emerald Rapids, and a ~13% reduction (6.2k => 5.3k cycles) on AMD Turin. Cc: Manali Shukla Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-35-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b78ad897886d..954622f8f817 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -1312,13 +1312,15 @@ static void kvm_pmu_load_guest_pmcs(struct kvm_vcpu *vcpu) for (i = 0; i < pmu->nr_arch_gp_counters; i++) { pmc = &pmu->gp_counters[i]; - wrmsrl(gp_counter_msr(i), pmc->counter); + if (pmc->counter != rdpmc(i)) + wrmsrl(gp_counter_msr(i), pmc->counter); wrmsrl(gp_eventsel_msr(i), pmc->eventsel_hw); } for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { pmc = &pmu->fixed_counters[i]; - wrmsrl(fixed_counter_msr(i), pmc->counter); + if (pmc->counter != rdpmc(INTEL_PMC_FIXED_RDPMC_BASE | i)) + wrmsrl(fixed_counter_msr(i), pmc->counter); } } From 462f092dc55c0eb97da02dd0c773a4394850dd1b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:11 -0800 Subject: [PATCH 027/267] KVM: VMX: Drop intermediate "guest" field from msr_autostore Drop the intermediate "guest" field from vcpu_vmx.msr_autostore as the value saved on VM-Exit isn't guaranteed to be the guest's value, it's purely whatever is in hardware at the time of VM-Exit. E.g. KVM's only use of the store list at the momemnt is to snapshot TSC at VM-Exit, and the value saved is always the raw TSC even if TSC-offseting and/or TSC-scaling is enabled for the guest. And unlike msr_autoload, there is no need differentiate between "on-entry" and "on-exit". No functional change intended. 
Cc: Jim Mattson Reviewed-by: Dapeng Mi Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-36-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 10 +++++----- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/vmx/vmx.h | 4 +--- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 729cc1f05ac8..486789dac515 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1076,11 +1076,11 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, * VM-exit in L0, use the more accurate value. */ if (msr_index == MSR_IA32_TSC) { - int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest, + int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore, MSR_IA32_TSC); if (i >= 0) { - u64 val = vmx->msr_autostore.guest.val[i].value; + u64 val = vmx->msr_autostore.val[i].value; *data = kvm_read_l1_tsc(vcpu, val); return true; @@ -1167,7 +1167,7 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, u32 msr_index) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmx_msrs *autostore = &vmx->msr_autostore.guest; + struct vmx_msrs *autostore = &vmx->msr_autostore; bool in_vmcs12_store_list; int msr_autostore_slot; bool in_autostore_list; @@ -2366,7 +2366,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) * addresses are constant (for vmcs02), the counts can change based * on L2's behavior, e.g. switching to/from long mode. */ - vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); + vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.val)); vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); @@ -2704,7 +2704,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) */ prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 62ba2a2b9e98..23c92c41fd83 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6567,7 +6567,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu) if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0) vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest); if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) - vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest); + vmx_dump_msrs("autostore", &vmx->msr_autostore); if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE) pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n", diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index d7a96c84371f..4ce653d729ca 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -245,9 +245,7 @@ struct vcpu_vmx { struct vmx_msrs host; } msr_autoload; - struct msr_autostore { - struct vmx_msrs guest; - } msr_autostore; + struct vmx_msrs msr_autostore; struct { int vm86_active; From 58f21a01417f273b4246c885558c252e345681b3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:12 -0800 Subject: [PATCH 028/267] KVM: nVMX: Don't update msr_autostore count when saving TSC for vmcs12 Rework nVMX's use of the MSR auto-store list to snapshot TSC to sneak MSR_IA32_TSC into the list _without_ updating KVM's software tracking, 
and drop the generic functionality so that future usage of the store list for nested specific logic needs to consider the implications of modifying the list. Updating the list only for vmcs02 and only on nested VM-Enter is a disaster waiting to happen, as it means vmcs01 is stale relative to the software tracking, and KVM could unintentionally leave an MSR in the store list in perpetuity while running L1, e.g. if KVM addressed the first issue and updated vmcs01 on nested VM-Exit without removing TSC from the list. Furthermore, mixing KVM's desire to save an MSR with L1's desire to save an MSR result KVM clobbering/ignoring the needs of vmcs01 or vmcs02. E.g. if KVM added MSR_IA32_TSC to the store list for its own purposes, and then _removed_ MSR_IA32_TSC from the list after emulating nested VM-Enter, then KVM would remove MSR_IA32_TSC from the list even though saving TSC on VM-Exit from L2 is still desirable (to provide L1 with an accurate TSC). Similarly, removing an MSR from the list based on vmcs12's settings could drop an MSR that KVM wants to save for its own purposes. In practice, the issues are currently benign, because KVM doesn't use the store list for vmcs01. But that will change with upcoming mediated PMU support. Alternatively, a "full" solution would be to track MSR list entries for vmcs12 separately from KVM's standard lists, but MSR_IA32_TSC is likely the only MSR that KVM would ever want to save on _every_ VM-Exit purely based on vmcs12. I.e. the added complexity isn't remotely justified at this time. Opportunistically escalate from a pr_warn_ratelimited() to a full WARN as KVM reserves eight entries in each MSR list, and as above KVM uses at most one entry. Opportunistically make vmx_find_loadstore_msr_slot() local to vmx.c as using it directly from nested code is unsafe due to the potential for mixing vmcs01 and vmcs02 state (see above). Cc: Jim Mattson Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-37-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 71 ++++++++++++--------------------------- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/vmx/vmx.h | 2 +- 3 files changed, 24 insertions(+), 51 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 486789dac515..614b789ecf16 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1075,16 +1075,12 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, * does not include the time taken for emulation of the L2->L1 * VM-exit in L0, use the more accurate value. 
*/ - if (msr_index == MSR_IA32_TSC) { - int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore, - MSR_IA32_TSC); + if (msr_index == MSR_IA32_TSC && vmx->nested.tsc_autostore_slot >= 0) { + int slot = vmx->nested.tsc_autostore_slot; + u64 host_tsc = vmx->msr_autostore.val[slot].value; - if (i >= 0) { - u64 val = vmx->msr_autostore.val[i].value; - - *data = kvm_read_l1_tsc(vcpu, val); - return true; - } + *data = kvm_read_l1_tsc(vcpu, host_tsc); + return true; } if (kvm_emulate_msr_read(vcpu, msr_index, data)) { @@ -1163,42 +1159,6 @@ static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) return false; } -static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, - u32 msr_index) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmx_msrs *autostore = &vmx->msr_autostore; - bool in_vmcs12_store_list; - int msr_autostore_slot; - bool in_autostore_list; - int last; - - msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index); - in_autostore_list = msr_autostore_slot >= 0; - in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); - - if (in_vmcs12_store_list && !in_autostore_list) { - if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { - /* - * Emulated VMEntry does not fail here. Instead a less - * accurate value will be returned by - * nested_vmx_get_vmexit_msr_value() by reading KVM's - * internal MSR state instead of reading the value from - * the vmcs02 VMExit MSR-store area. - */ - pr_warn_ratelimited( - "Not enough msr entries in msr_autostore. Can't add msr %x\n", - msr_index); - return; - } - last = autostore->nr++; - autostore->val[last].index = msr_index; - } else if (!in_vmcs12_store_list && in_autostore_list) { - last = --autostore->nr; - autostore->val[msr_autostore_slot] = autostore->val[last]; - } -} - /* * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are * emulating VM-Entry into a guest with EPT enabled. On failure, the expected @@ -2699,12 +2659,25 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) } /* - * Make sure the msr_autostore list is up to date before we set the - * count in the vmcs02. + * If vmcs12 is configured to save TSC on exit via the auto-store list, + * append the MSR to vmcs02's auto-store list so that KVM effectively + * reads TSC at the time of VM-Exit from L2. The saved value will be + * propagated to vmcs12's list on nested VM-Exit. + * + * Don't increment the number of MSRs in the vCPU structure, as saving + * TSC is specific to this particular incarnation of vmcs02, i.e. must + * not bleed into vmcs01.
*/ - prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); + if (nested_msr_store_list_has_msr(&vmx->vcpu, MSR_IA32_TSC) && + !WARN_ON_ONCE(vmx->msr_autostore.nr >= ARRAY_SIZE(vmx->msr_autostore.val))) { + vmx->nested.tsc_autostore_slot = vmx->msr_autostore.nr; + vmx->msr_autostore.val[vmx->msr_autostore.nr].index = MSR_IA32_TSC; - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr); + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr + 1); + } else { + vmx->nested.tsc_autostore_slot = -1; + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr); + } vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 23c92c41fd83..52bcb817cc15 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1029,7 +1029,7 @@ static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx vm_exit_controls_clearbit(vmx, exit); } -int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) +static int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) { unsigned int i; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 4ce653d729ca..3175fedb5a4d 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -191,6 +191,7 @@ struct nested_vmx { u16 vpid02; u16 last_vpid; + int tsc_autostore_slot; struct nested_vmx_msrs msrs; /* SMM related state */ @@ -383,7 +384,6 @@ void vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, unsigned int flags); unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx); bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, unsigned int flags); -int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set); From 0bd29379114b9c669cdabf7d6c08c0c1ea41861c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:13 -0800 Subject: [PATCH 029/267] KVM: VMX: Dedup code for removing MSR from VMCS's auto-load list Add a helper to remove an MSR from an auto-{load,store} list to dedup the msr_autoload code, and in anticipation of adding similar functionality for msr_autostore. No functional change intended. 
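For reference, the helper uses the same swap-with-last removal idiom as the open-coded paths it replaces: the victim entry is overwritten by the last entry and the count is shrunk, so removal is O(1) at the cost of not preserving order. A minimal standalone sketch of that idiom (illustrative only, not kernel code):

  #include <stdint.h>
  #include <stdio.h>

  struct msr_entry { uint32_t index; uint64_t value; };

  /* Remove entry i by overwriting it with the last entry and shrinking nr. */
  static void remove_entry(struct msr_entry *val, unsigned int *nr, unsigned int i)
  {
          --*nr;
          val[i] = val[*nr];
  }

  int main(void)
  {
          struct msr_entry val[3] = { { 0x10, 1 }, { 0x20, 2 }, { 0x30, 3 } };
          unsigned int nr = 3;

          remove_entry(val, &nr, 0);
          printf("nr=%u, val[0].index=0x%x\n", nr, val[0].index); /* nr=2, 0x30 */
          return 0;
  }

Not preserving order is acceptable for the VMCS auto lists, which are looked up by MSR index (vmx_find_loadstore_msr_slot()) rather than by position.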
Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-38-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 52bcb817cc15..a51f66d1b201 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1040,9 +1040,22 @@ static int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) return -ENOENT; } -static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) +static void vmx_remove_auto_msr(struct vmx_msrs *m, u32 msr, + unsigned long vmcs_count_field) { int i; + + i = vmx_find_loadstore_msr_slot(m, msr); + if (i < 0) + return; + + --m->nr; + m->val[i] = m->val[m->nr]; + vmcs_write32(vmcs_count_field, m->nr); +} + +static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) +{ struct msr_autoload *m = &vmx->msr_autoload; switch (msr) { @@ -1063,21 +1076,9 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) } break; } - i = vmx_find_loadstore_msr_slot(&m->guest, msr); - if (i < 0) - goto skip_guest; - --m->guest.nr; - m->guest.val[i] = m->guest.val[m->guest.nr]; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); -skip_guest: - i = vmx_find_loadstore_msr_slot(&m->host, msr); - if (i < 0) - return; - - --m->host.nr; - m->host.val[i] = m->host.val[m->host.nr]; - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); + vmx_remove_auto_msr(&m->guest, msr, VM_ENTRY_MSR_LOAD_COUNT); + vmx_remove_auto_msr(&m->host, msr, VM_EXIT_MSR_LOAD_COUNT); } static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, From 84ac00042a28642cc974a0a250fab7df050a5dd5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:14 -0800 Subject: [PATCH 030/267] KVM: VMX: Drop unused @entry_only param from add_atomic_switch_msr() Drop the "on VM-Enter only" parameter from add_atomic_switch_msr() as it is no longer used, and for all intents and purposes was never used. The functionality was added, under embargo, by commit 989e3992d2ec ("x86/KVM/VMX: Extend add_atomic_switch_msr() to allow VMENTER only MSRs"), and then ripped out by commit 2f055947ae5e ("x86/kvm: Drop L1TF MSR list approach") just a few commits later. 2f055947ae5e x86/kvm: Drop L1TF MSR list approach 72c6d2db64fa x86/litf: Introduce vmx status variable 215af5499d9e cpu/hotplug: Online siblings when SMT control is turned on 390d975e0c4e x86/KVM/VMX: Use MSR save list for IA32_FLUSH_CMD if required 989e3992d2ec x86/KVM/VMX: Extend add_atomic_switch_msr() to allow VMENTER only MSRs Furthermore, it's extremely unlikely KVM will ever _need_ to load an MSR value via the auto-load lists only on VM-Enter. MSR writes via the lists aren't optimized in any way, and so the only reason to use the lists instead of a WRMSR is for cases where the MSR _must_ be loaded atomically with respect to VM-Enter (and/or VM-Exit). While one could argue that command MSRs, e.g. IA32_FLUSH_CMD, "need" to be done exactly at VM-Enter, in practice doing such flushes within a few instructions of VM-Enter is more than sufficient. Note, the shortlog and changelog for commit 390d975e0c4e ("x86/KVM/VMX: Use MSR save list for IA32_FLUSH_CMD if required") are misleading and wrong. That commit added MSR_IA32_FLUSH_CMD to the VM-Enter _load_ list, not the VM-Enter save list (which doesn't exist, only VM-Exit has a store/save list).
Reviewed-by: Dapeng Mi Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-39-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a51f66d1b201..38491962b2c1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1094,7 +1094,7 @@ static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, } static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, - u64 guest_val, u64 host_val, bool entry_only) + u64 guest_val, u64 host_val) { int i, j = 0; struct msr_autoload *m = &vmx->msr_autoload; @@ -1132,8 +1132,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, } i = vmx_find_loadstore_msr_slot(&m->guest, msr); - if (!entry_only) - j = vmx_find_loadstore_msr_slot(&m->host, msr); + j = vmx_find_loadstore_msr_slot(&m->host, msr); if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) || (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) { @@ -1148,9 +1147,6 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, m->guest.val[i].index = msr; m->guest.val[i].value = guest_val; - if (entry_only) - return; - if (j < 0) { j = m->host.nr++; vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); @@ -1190,8 +1186,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx) if (!(guest_efer & EFER_LMA)) guest_efer &= ~EFER_LME; if (guest_efer != kvm_host.efer) - add_atomic_switch_msr(vmx, MSR_EFER, - guest_efer, kvm_host.efer, false); + add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, kvm_host.efer); else clear_atomic_switch_msr(vmx, MSR_EFER); return false; @@ -7350,7 +7345,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) clear_atomic_switch_msr(vmx, msrs[i].msr); else add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, - msrs[i].host, false); + msrs[i].host); } static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit) From 2ed57bb8997610b33cb92c26ccb9a91b2966fff8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:15 -0800 Subject: [PATCH 031/267] KVM: VMX: Bug the VM if either MSR auto-load list is full WARN and bug the VM if either MSR auto-load list is full when adding an MSR to the lists, as the set of MSRs that KVM loads via the lists is finite and entirely KVM controlled, i.e. overflowing the lists shouldn't be possible in a fully released version of KVM. Terminate the VM as the core KVM infrastructure has no insight as to _why_ an MSR is being added to the list, and failure to load an MSR on VM-Enter and/or VM-Exit could be fatal to the host. E.g. running the host with a guest-controlled PEBS MSR could generate unexpected writes to the DS buffer and crash the host. 
Reviewed-by: Dapeng Mi Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-40-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 38491962b2c1..2c50ebf4ff1b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1098,6 +1098,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, { int i, j = 0; struct msr_autoload *m = &vmx->msr_autoload; + struct kvm *kvm = vmx->vcpu.kvm; switch (msr) { case MSR_EFER: @@ -1134,12 +1135,10 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, i = vmx_find_loadstore_msr_slot(&m->guest, msr); j = vmx_find_loadstore_msr_slot(&m->host, msr); - if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) || - (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) { - printk_once(KERN_WARNING "Not enough msr switch entries. " - "Can't add msr %x\n", msr); + if (KVM_BUG_ON(i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS, kvm) || + KVM_BUG_ON(j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS, kvm)) return; - } + if (i < 0) { i = m->guest.nr++; vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); From 0c4ff0866fc1b0bf8c1d8d5f27fedc6dd9c51183 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:16 -0800 Subject: [PATCH 032/267] KVM: VMX: Set MSR index auto-load entry if and only if entry is "new" When adding an MSR to the auto-load lists, update the MSR index in the list entry if and only if a new entry is being inserted, as 'i' can only be non-negative if vmx_find_loadstore_msr_slot() found an entry with the MSR's index. Unnecessarily setting the index is benign, but it makes it harder to see that updating the value is necessary even when an existing entry for the MSR was found. No functional change intended. Reviewed-by: Dapeng Mi Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-41-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2c50ebf4ff1b..be2a2580e8f1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1141,16 +1141,16 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, if (i < 0) { i = m->guest.nr++; + m->guest.val[i].index = msr; vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); } - m->guest.val[i].index = msr; m->guest.val[i].value = guest_val; if (j < 0) { j = m->host.nr++; + m->host.val[j].index = msr; vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); } - m->host.val[j].index = msr; m->host.val[j].value = host_val; } From 2239d137a71d77c7610434473b0c8cfde90d4116 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:17 -0800 Subject: [PATCH 033/267] KVM: VMX: Compartmentalize adding MSRs to host vs. guest auto-load list Undo the bundling of the "host" and "guest" MSR auto-load list logic so that the code can be deduplicated by factoring out the logic to a separate helper. Now that "list full" situations are treated as fatal to the VM, there is no need to pre-check both lists. For all intents and purposes, this reverts the add_atomic_switch_msr() changes made by commit 3190709335dd ("x86/KVM/VMX: Separate the VMX AUTOLOAD guest/host number accounting"). 
Reviewed-by: Dapeng Mi Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-42-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index be2a2580e8f1..018e01daab68 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1096,9 +1096,9 @@ static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, u64 guest_val, u64 host_val) { - int i, j = 0; struct msr_autoload *m = &vmx->msr_autoload; struct kvm *kvm = vmx->vcpu.kvm; + int i; switch (msr) { case MSR_EFER: @@ -1133,25 +1133,26 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, } i = vmx_find_loadstore_msr_slot(&m->guest, msr); - j = vmx_find_loadstore_msr_slot(&m->host, msr); - - if (KVM_BUG_ON(i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS, kvm) || - KVM_BUG_ON(j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS, kvm)) - return; - if (i < 0) { + if (KVM_BUG_ON(m->guest.nr == MAX_NR_LOADSTORE_MSRS, kvm)) + return; + i = m->guest.nr++; m->guest.val[i].index = msr; vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); } m->guest.val[i].value = guest_val; - if (j < 0) { - j = m->host.nr++; - m->host.val[j].index = msr; + i = vmx_find_loadstore_msr_slot(&m->host, msr); + if (i < 0) { + if (KVM_BUG_ON(m->host.nr == MAX_NR_LOADSTORE_MSRS, kvm)) + return; + + i = m->host.nr++; + m->host.val[i].index = msr; vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); } - m->host.val[j].value = host_val; + m->host.val[i].value = host_val; } static bool update_transition_efer(struct vcpu_vmx *vmx) From c3d6a7210a4de909683a36779f5b8567f79a3688 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:18 -0800 Subject: [PATCH 034/267] KVM: VMX: Dedup code for adding MSR to VMCS's auto list Add a helper to add an MSR to a VMCS's "auto" list to deduplicate the code in add_atomic_switch_msr(), and so that the functionality can be used in the future for managing the MSR auto-store list. No functional change intended. 
Reviewed-by: Dapeng Mi Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-43-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 018e01daab68..3f64d4b1b19c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1093,12 +1093,28 @@ static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, vm_exit_controls_setbit(vmx, exit); } +static void vmx_add_auto_msr(struct vmx_msrs *m, u32 msr, u64 value, + unsigned long vmcs_count_field, struct kvm *kvm) +{ + int i; + + i = vmx_find_loadstore_msr_slot(m, msr); + if (i < 0) { + if (KVM_BUG_ON(m->nr == MAX_NR_LOADSTORE_MSRS, kvm)) + return; + + i = m->nr++; + m->val[i].index = msr; + vmcs_write32(vmcs_count_field, m->nr); + } + m->val[i].value = value; +} + static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, u64 guest_val, u64 host_val) { struct msr_autoload *m = &vmx->msr_autoload; struct kvm *kvm = vmx->vcpu.kvm; - int i; switch (msr) { case MSR_EFER: @@ -1132,27 +1148,8 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, wrmsrq(MSR_IA32_PEBS_ENABLE, 0); } - i = vmx_find_loadstore_msr_slot(&m->guest, msr); - if (i < 0) { - if (KVM_BUG_ON(m->guest.nr == MAX_NR_LOADSTORE_MSRS, kvm)) - return; - - i = m->guest.nr++; - m->guest.val[i].index = msr; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); - } - m->guest.val[i].value = guest_val; - - i = vmx_find_loadstore_msr_slot(&m->host, msr); - if (i < 0) { - if (KVM_BUG_ON(m->host.nr == MAX_NR_LOADSTORE_MSRS, kvm)) - return; - - i = m->host.nr++; - m->host.val[i].index = msr; - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); - } - m->host.val[i].value = host_val; + vmx_add_auto_msr(&m->guest, msr, guest_val, VM_ENTRY_MSR_LOAD_COUNT, kvm); + vmx_add_auto_msr(&m->host, msr, host_val, VM_EXIT_MSR_LOAD_COUNT, kvm); } static bool update_transition_efer(struct vcpu_vmx *vmx) From 9757a5aebcd6ca808d5b80831649438a017478ad Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:19 -0800 Subject: [PATCH 035/267] KVM: VMX: Initialize vmcs01.VM_EXIT_MSR_STORE_ADDR with list address Initialize vmcs01.VM_EXIT_MSR_STORE_ADDR to point at the vCPU's msr_autostore list in anticipation of utilizing the auto-store functionality, and to harden KVM against stray reads to pfn 0 (or, in theory, a random pfn if the underlying CPU uses a complex scheme for encoding VMCS data). The MSR auto lists are supposed to be ignored if the associated COUNT VMCS field is '0', but leaving the ADDR field zero-initialized in memory is an unnecessary risk (albeit a minuscule risk) given that the cost is a single VMWRITE during vCPU creation.
Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-44-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3f64d4b1b19c..6a17cb90eaf4 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4933,6 +4933,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write64(VM_FUNCTION_CONTROL, 0); vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.val)); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); From d374b89edbb9a8d552e03348f59287ff779b4c9d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:20 -0800 Subject: [PATCH 036/267] KVM: VMX: Add mediated PMU support for CPUs without "save perf global ctrl" Extend mediated PMU support for Intel CPUs without support for saving PERF_GLOBAL_CONTROL into the guest VMCS field on VM-Exit, e.g. for Skylake and its derivatives, as well as Icelake. While supporting CPUs without VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL isn't completely trivial, it's not that complex either. And not supporting such CPUs would mean not supporting 7+ years of Intel CPUs released in the past 10 years. On VM-Exit, immediately propagate the saved PERF_GLOBAL_CTRL to the VMCS as well as KVM's software cache so that KVM doesn't need to add full EXREG tracking of PERF_GLOBAL_CTRL. In practice, the vast majority of VM-Exits won't trigger software writes to guest PERF_GLOBAL_CTRL, so deferring the VMWRITE to the next VM-Enter would only delay the inevitable without batching/avoiding VMWRITEs. Note! Take care to refresh VM_EXIT_MSR_STORE_COUNT on nested VM-Exit, as it's unfortunately possible that KVM could recalculate MSR intercepts while L2 is active, e.g. if userspace loads nested state and _then_ sets PERF_CAPABILITIES. Eating the VMWRITE on every nested VM-Exit is unfortunate, but that's a pre-existing problem and can/should be solved separately, e.g. modifying the number of auto-load entries while L2 is active is also uncommon on modern CPUs. Reviewed-by: Dapeng Mi Tested-by: Dapeng Mi Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-45-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 6 ++++- arch/x86/kvm/vmx/pmu_intel.c | 7 ----- arch/x86/kvm/vmx/vmx.c | 52 ++++++++++++++++++++++++++++++++---- 3 files changed, 52 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 614b789ecf16..1ee1edc8419d 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5142,7 +5142,11 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, kvm_nested_vmexit_handle_ibrs(vcpu); - /* Update any VMCS fields that might have changed while L2 ran */ + /* + * Update any VMCS fields that might have changed while vmcs02 was the + * active VMCS. The tracking is per-vCPU, not per-VMCS. 
+ */ + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 55249fa4db95..27eb76e6b6a0 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -777,13 +777,6 @@ static bool intel_pmu_is_mediated_pmu_supported(struct x86_pmu_capability *host_ if (WARN_ON_ONCE(!cpu_has_load_perf_global_ctrl())) return false; - /* - * KVM doesn't yet support mediated PMU on CPUs without support for - * saving PERF_GLOBAL_CTRL via a dedicated VMCS field. - */ - if (!cpu_has_save_perf_global_ctrl()) - return false; - return true; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6a17cb90eaf4..ba1262c3e3ff 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1204,6 +1204,17 @@ static bool update_transition_efer(struct vcpu_vmx *vmx) return true; } +static void vmx_add_autostore_msr(struct vcpu_vmx *vmx, u32 msr) +{ + vmx_add_auto_msr(&vmx->msr_autostore, msr, 0, VM_EXIT_MSR_STORE_COUNT, + vmx->vcpu.kvm); +} + +static void vmx_remove_autostore_msr(struct vcpu_vmx *vmx, u32 msr) +{ + vmx_remove_auto_msr(&vmx->msr_autostore, msr, VM_EXIT_MSR_STORE_COUNT); +} + #ifdef CONFIG_X86_32 /* * On 32-bit kernels, VM exits still load the FS and GS bases from the @@ -4225,6 +4236,8 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) static void vmx_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu) { + u64 vm_exit_controls_bits = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | + VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL; bool has_mediated_pmu = kvm_vcpu_has_mediated_pmu(vcpu); struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4234,12 +4247,19 @@ static void vmx_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu) if (!enable_mediated_pmu) return; + if (!cpu_has_save_perf_global_ctrl()) { + vm_exit_controls_bits &= ~VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL; + + if (has_mediated_pmu) + vmx_add_autostore_msr(vmx, MSR_CORE_PERF_GLOBAL_CTRL); + else + vmx_remove_autostore_msr(vmx, MSR_CORE_PERF_GLOBAL_CTRL); + } + vm_entry_controls_changebit(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, has_mediated_pmu); - vm_exit_controls_changebit(vmx, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | - VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL, - has_mediated_pmu); + vm_exit_controls_changebit(vmx, vm_exit_controls_bits, has_mediated_pmu); for (i = 0; i < pmu->nr_arch_gp_counters; i++) { vmx_set_intercept_for_msr(vcpu, MSR_IA32_PERFCTR0 + i, @@ -7346,6 +7366,29 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host); } +static void vmx_refresh_guest_perf_global_control(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (msr_write_intercepted(vmx, MSR_CORE_PERF_GLOBAL_CTRL)) + return; + + if (!cpu_has_save_perf_global_ctrl()) { + int slot = vmx_find_loadstore_msr_slot(&vmx->msr_autostore, + MSR_CORE_PERF_GLOBAL_CTRL); + + if (WARN_ON_ONCE(slot < 0)) + return; + + pmu->global_ctrl = vmx->msr_autostore.val[slot].value; + vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, pmu->global_ctrl); + return; + } + + pmu->global_ctrl = vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL); +} + static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7631,8 +7674,7 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 
run_flags) vmx->loaded_vmcs->launched = 1; - if (!msr_write_intercepted(vmx, MSR_CORE_PERF_GLOBAL_CTRL)) - vcpu_to_pmu(vcpu)->global_ctrl = vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL); + vmx_refresh_guest_perf_global_control(vcpu); vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); From 44da6629d2820c8fd9ffa58cc7e46c2215828cb8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Nov 2025 14:34:40 -0800 Subject: [PATCH 037/267] KVM: Use vCPU specific memslots in __kvm_vcpu_map() When establishing a "host access map", lookup the gfn in the vCPU specific memslots, as the intent is that the mapping will be established for the current vCPU context. Specifically, using __kvm_vcpu_map() in x86's SMM context should create mappings based on the SMM memslots, not the non-SMM memslots. Luckily, the bug is benign as x86 is the only architecture with multiple memslot address spaces, and all of x86's usage is limited to non-SMM. The calls in (or reachable by) {svm,vmx}_enter_smm() are made before enter_smm() sets HF_SMM_MASK, and the calls in {svm,vmx}_leave_smm() are made after emulator_leave_smm() clears HF_SMM_MASK. Note, kvm_vcpu_unmap() uses the vCPU specific memslots, only the map() side of things is broken. Fixes: 357a18ad230f ("KVM: Kill kvm_map_gfn() / kvm_unmap_gfn() and gfn_to_pfn_cache") Link: https://patch.msgid.link/20251121223444.355422-2-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5b5b69c97665..6b1097e76288 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3134,7 +3134,7 @@ int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, bool writable) { struct kvm_follow_pfn kfp = { - .slot = gfn_to_memslot(vcpu->kvm, gfn), + .slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn), .gfn = gfn, .flags = writable ? FOLL_WRITE : 0, .refcounted_page = &map->pinned_page, From 70b02809ded96ec790721cd5061e20b63b622310 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Nov 2025 14:34:41 -0800 Subject: [PATCH 038/267] KVM: x86: Mark vmcs12 pages as dirty if and only if they're mapped Mark vmcs12 pages as dirty (in KVM's dirty log bitmap) if and only if the page is mapped, i.e. if the page is actually "active" in vmcs02. For some pages, KVM simply disables the associated VMCS control if the vmcs12 page is unreachable, i.e. it's possible for nested VM-Enter to succeed with a "bad" vmcs12 page. Link: https://patch.msgid.link/20251121223444.355422-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 15 +++------------ include/linux/kvm_host.h | 9 ++++++++- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 6137e5307d0f..72fcb1228af4 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3984,23 +3984,14 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - gfn_t gfn; + struct vcpu_vmx *vmx = to_vmx(vcpu); /* * Don't need to mark the APIC access page dirty; it is never * written to by the CPU during APIC virtualization. 
*/ - - if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { - gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; - kvm_vcpu_mark_page_dirty(vcpu, gfn); - } - - if (nested_cpu_has_posted_intr(vmcs12)) { - gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; - kvm_vcpu_mark_page_dirty(vcpu, gfn); - } + kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.virtual_apic_map); + kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.pi_desc_map); } static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d93f75b05ae2..536d05e2726f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1381,6 +1381,7 @@ bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn); void mark_page_dirty_in_slot(struct kvm *kvm, const struct kvm_memory_slot *memslot, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); +void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map, bool writable); @@ -1398,6 +1399,13 @@ static inline int kvm_vcpu_map_readonly(struct kvm_vcpu *vcpu, gpa_t gpa, return __kvm_vcpu_map(vcpu, gpa, map, false); } +static inline void kvm_vcpu_map_mark_dirty(struct kvm_vcpu *vcpu, + struct kvm_host_map *map) +{ + if (kvm_vcpu_mapped(map)) + kvm_vcpu_mark_page_dirty(vcpu, map->gfn); +} + unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, @@ -1410,7 +1418,6 @@ int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, const void *data int offset, int len); int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, unsigned long len); -void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); /** * kvm_gpc_init - initialize gfn_to_pfn_cache. From f74bb1d2eda1b77c37f35876ca0c44be345e2b1f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Nov 2025 14:34:42 -0800 Subject: [PATCH 039/267] KVM: nVMX: Precisely mark vAPIC and PID maps dirty when delivering nested PI Explicitly mark the vmcs12 vAPIC and PI descriptor pages as dirty when delivering a nested posted interrupt instead of marking all vmcs12 pages as dirty. This will allow marking the APIC access page (and any future vmcs12 pages) as dirty in nested_mark_vmcs12_pages_dirty() without over- dirtying in the nested PI case. Manually marking the vAPIC and PID pages as dirty also makes the flow a bit more self-documenting, e.g. it's not obvious at first glance that vmx->nested.pi_desc is actually a host kernel mapping of a vmcs12 page. No functional change intended. 
Link: https://patch.msgid.link/20251121223444.355422-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 72fcb1228af4..f5ebd84a2f6b 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4028,7 +4028,8 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) } } - nested_mark_vmcs12_pages_dirty(vcpu); + kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.virtual_apic_map); + kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.pi_desc_map); return 0; mmio_needed: From 57dfa61f6248e0fb23a3f767a7dc9c8fcecec5d7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Nov 2025 14:34:43 -0800 Subject: [PATCH 040/267] KVM: VMX: Move nested_mark_vmcs12_pages_dirty() to vmx.c, and rename Move nested_mark_vmcs12_pages_dirty() to vmx.c now that it's only used in the VM-Exit path, and add "all" to its name to document that its purpose is to mark all (mapped-out-of-band) vmcs12 pages as dirty. No functional change intended. Link: https://patch.msgid.link/20251121223444.355422-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 13 ------------- arch/x86/kvm/vmx/vmx.c | 14 +++++++++++++- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index f5ebd84a2f6b..6dd1da94f00c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3981,19 +3981,6 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, } } - -void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * Don't need to mark the APIC access page dirty; it is never - * written to by the CPU during APIC virtualization. - */ - kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.virtual_apic_map); - kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.pi_desc_map); -} - static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6b96f7aea20b..b38bdbd7a7fa 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6378,6 +6378,18 @@ static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX); } +static void nested_vmx_mark_all_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * Don't need to mark the APIC access page dirty; it is never + * written to by the CPU during APIC virtualization. + */ + kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.virtual_apic_map); + kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.pi_desc_map); +} + static void vmx_dump_sel(char *name, uint32_t sel) { pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", @@ -6655,7 +6667,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * Mark them dirty on every exit from L2 to prevent them from * getting out of sync with dirty tracking. */ - nested_mark_vmcs12_pages_dirty(vcpu); + nested_vmx_mark_all_vmcs12_pages_dirty(vcpu); /* * Synthesize a triple fault if L2 state is invalid. 
In normal From c9d7134679ebf038c3d1d0b324e378dfc464d198 Mon Sep 17 00:00:00 2001 From: Fred Griffoul Date: Fri, 21 Nov 2025 14:34:44 -0800 Subject: [PATCH 041/267] KVM: nVMX: Mark APIC access page dirty when syncing vmcs12 pages For consistency with commit 7afe79f5734a ("KVM: nVMX: Mark vmcs12's APIC access page dirty when unmapping"), which marks the page dirty during unmap operations, also mark it dirty during vmcs12 page synchronization. Signed-off-by: Fred Griffoul [sean: use kvm_vcpu_map_mark_dirty()] Link: https://patch.msgid.link/20251121223444.355422-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b38bdbd7a7fa..1f4d15cc898e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6382,10 +6382,7 @@ static void nested_vmx_mark_all_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - /* - * Don't need to mark the APIC access page dirty; it is never - * written to by the CPU during APIC virtualization. - */ + kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.apic_access_page_map); kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.virtual_apic_map); kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.pi_desc_map); } From 5bb9ac1865123356337a389af935d3913ee917ed Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 12:59:48 -0800 Subject: [PATCH 042/267] KVM: x86: Return "unsupported" instead of "invalid" on access to unsupported PV MSR Return KVM_MSR_RET_UNSUPPORTED instead of '1' (which for all intents and purposes means "invalid") when rejecting accesses to KVM PV MSRs to adhere to KVM's ABI of allowing host reads and writes of '0' to MSRs that are advertised to userspace via KVM_GET_MSR_INDEX_LIST, even if the vCPU model doesn't support the MSR. E.g. running a QEMU VM with -cpu host,-kvmclock,kvm-pv-enforce-cpuid yields: qemu: error: failed to set MSR 0x12 to 0x0 qemu: target/i386/kvm/kvm.c:3301: kvm_buf_set_msrs: Assertion `ret == cpu->kvm_msr_buf->nmsrs' failed. 
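The difference between the two return values is what makes the host-initiated case work: KVM_MSR_RET_UNSUPPORTED lets the common MSR access path apply KVM's "host may read, and write '0' to, any advertised MSR" policy, whereas returning '1' unconditionally fails the access. A simplified standalone sketch of that policy (illustrative only; the real logic lives in KVM's MSR access helpers and differs in detail):

  #include <stdbool.h>
  #include <stdint.h>

  /* Illustrative return codes, not KVM's actual values. */
  enum msr_ret { MSR_OK = 0, MSR_INVALID = 1, MSR_UNSUPPORTED = 2 };

  static int filter_msr_result(enum msr_ret ret, bool host_initiated,
                               bool advertised, bool is_write, uint64_t data)
  {
          if (ret != MSR_UNSUPPORTED)
                  return ret;

          /* Host reads, and writes of '0', succeed for advertised MSRs. */
          if (host_initiated && advertised && (!is_write || !data))
                  return MSR_OK;

          return MSR_INVALID;
  }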
Fixes: 66570e966dd9 ("kvm: x86: only provide PV features if enabled in guest's CPUID") Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson Link: https://patch.msgid.link/20251230205948.4094097-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 211d8c24a4b1..e4418409b468 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4097,47 +4097,47 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_WALL_CLOCK_NEW: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; vcpu->kvm->arch.wall_clock = data; kvm_write_wall_clock(vcpu->kvm, data, 0); break; case MSR_KVM_WALL_CLOCK: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; vcpu->kvm->arch.wall_clock = data; kvm_write_wall_clock(vcpu->kvm, data, 0); break; case MSR_KVM_SYSTEM_TIME_NEW: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; kvm_write_system_time(vcpu, data, false, msr_info->host_initiated); break; case MSR_KVM_SYSTEM_TIME: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; kvm_write_system_time(vcpu, data, true, msr_info->host_initiated); break; case MSR_KVM_ASYNC_PF_EN: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; if (kvm_pv_enable_async_pf(vcpu, data)) return 1; break; case MSR_KVM_ASYNC_PF_INT: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; if (kvm_pv_enable_async_pf_int(vcpu, data)) return 1; break; case MSR_KVM_ASYNC_PF_ACK: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; if (data & 0x1) { /* * Pairs with the smp_mb__after_atomic() in @@ -4150,7 +4150,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_STEAL_TIME: if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; if (unlikely(!sched_info_on())) return 1; @@ -4168,7 +4168,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_PV_EOI_EN: if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8))) return 1; @@ -4176,7 +4176,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KVM_POLL_CONTROL: if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; /* only enable bit supported */ if (data & (-1ULL << 1)) @@ -4477,61 +4477,61 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_WALL_CLOCK: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->kvm->arch.wall_clock; break; case MSR_KVM_WALL_CLOCK_NEW: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->kvm->arch.wall_clock; break; case MSR_KVM_SYSTEM_TIME: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.time; break; case MSR_KVM_SYSTEM_TIME_NEW: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = 
vcpu->arch.time; break; case MSR_KVM_ASYNC_PF_EN: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.apf.msr_en_val; break; case MSR_KVM_ASYNC_PF_INT: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.apf.msr_int_val; break; case MSR_KVM_ASYNC_PF_ACK: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = 0; break; case MSR_KVM_STEAL_TIME: if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.st.msr_val; break; case MSR_KVM_PV_EOI_EN: if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.pv_eoi.msr_val; break; case MSR_KVM_POLL_CONTROL: if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.msr_kvm_poll_control; break; From 7fe9f5366bd5d7ee3cfd9f66868a4410d6e4792d Mon Sep 17 00:00:00 2001 From: MJ Pooladkhay Date: Mon, 22 Dec 2025 17:42:07 +0000 Subject: [PATCH 043/267] KVM: selftests: Fix sign extension bug in get_desc64_base() The function get_desc64_base() performs a series of bitwise left shifts on fields of various sizes. More specifically, when performing '<< 24' on 'desc->base2' (which is a u8), 'base2' is promoted to a signed integer before shifting. In a scenario where base2 >= 0x80, the shift places a 1 into bit 31, causing the 32-bit intermediate value to become negative. When this result is cast to uint64_t or ORed into the return value, sign extension occurs, corrupting the upper 32 bits of the address (base3). Example: Given: base0 = 0x5000 base1 = 0xd6 base2 = 0xf8 base3 = 0xfffffe7c Expected return: 0xfffffe7cf8d65000 Actual return: 0xfffffffff8d65000 Fix this by explicitly casting the fields to 'uint64_t' before shifting to prevent sign extension. Signed-off-by: MJ Pooladkhay Link: https://patch.msgid.link/20251222174207.107331-1-mj@pooladkhay.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/processor.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 57d62a425109..26a91bb73c93 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -436,8 +436,10 @@ struct kvm_x86_state { static inline uint64_t get_desc64_base(const struct desc64 *desc) { - return ((uint64_t)desc->base3 << 32) | - (desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24)); + return (uint64_t)desc->base3 << 32 | + (uint64_t)desc->base2 << 24 | + (uint64_t)desc->base1 << 16 | + (uint64_t)desc->base0; } static inline uint64_t rdtsc(void) From 69e81ed5e6a59c12c0c6756c3f0524e2ddb023f4 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:30 -0800 Subject: [PATCH 044/267] KVM: selftests: Make __vm_get_page_table_entry() static The function is only used in processor.c, drop the declaration in processor.h and make it static. No functional change intended. 
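Circling back to the get_desc64_base() fix above, the integer-promotion pitfall is easy to reproduce in isolation; a standalone demonstration (illustrative only):

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          uint8_t base2 = 0xf8;

          /*
           * 'base2' is promoted to (signed) int before the shift, so the
           * 32-bit result ends up negative and the conversion to uint64_t
           * sign-extends into the upper bits.
           */
          uint64_t bad  = (base2 << 24);           /* 0xfffffffff8000000 */
          uint64_t good = ((uint64_t)base2 << 24); /* 0x00000000f8000000 */

          printf("bad  = 0x%016llx\n", (unsigned long long)bad);
          printf("good = 0x%016llx\n", (unsigned long long)good);
          return 0;
  }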
Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-2-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/processor.h | 2 -- tools/testing/selftests/kvm/lib/x86/processor.c | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 26a91bb73c93..1cb5b4c46b99 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1369,8 +1369,6 @@ static inline bool kvm_is_ignore_msrs(void) return get_kvm_param_bool("ignore_msrs"); } -uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, - int *level); uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr); uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index 36104d27f3d9..c14bf2b5f28f 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -306,8 +306,8 @@ static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level) return *level == current_level; } -uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, - int *level) +static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, + int *level) { int va_width = 12 + (vm->pgtable_levels) * 9; uint64_t *pte = &vm->pgd; From 97dfbdfea405a0820ccfcf00afdda4c0f47c3df8 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:31 -0800 Subject: [PATCH 045/267] KVM: selftests: Stop passing a memslot to nested_map_memslot() On x86, KVM selftests use memslot 0 for all the default regions used by the test infrastructure. This is an implementation detail. nested_map_memslot() is currently used to map the default regions by explicitly passing slot 0, which leaks the library implementation into the caller. Rename the function to a very verbose nested_identity_map_default_memslots() to reflect what it actually does. Add an assertion that only memslot 0 is being used so that the implementation does not change from under us. No functional change intended. 
Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-3-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/vmx.h | 4 ++-- tools/testing/selftests/kvm/lib/x86/vmx.c | 12 ++++++++---- tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c | 2 +- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h index 96e2b4c630a9..91916b8aa94b 100644 --- a/tools/testing/selftests/kvm/include/x86/vmx.h +++ b/tools/testing/selftests/kvm/include/x86/vmx.h @@ -563,8 +563,8 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr); void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size); -void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t memslot); +void nested_identity_map_default_memslots(struct vmx_pages *vmx, + struct kvm_vm *vm); void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t addr, uint64_t size); bool kvm_cpu_has_ept(void); diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index 29b082a58daa..eec33ec63811 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -494,12 +494,16 @@ void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, /* Prepare an identity extended page table that maps all the * physical pages in VM. */ -void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t memslot) +void nested_identity_map_default_memslots(struct vmx_pages *vmx, + struct kvm_vm *vm) { + uint32_t s, memslot = 0; sparsebit_idx_t i, last; - struct userspace_mem_region *region = - memslot2region(vm, memslot); + struct userspace_mem_region *region = memslot2region(vm, memslot); + + /* Only memslot 0 is mapped here, ensure it's the only one being used */ + for (s = 0; s < NR_MEM_REGIONS; s++) + TEST_ASSERT_EQ(vm->memslots[s], 0); i = (region->region.guest_phys_addr >> vm->page_shift) - 1; last = i + (region->region.memory_size >> vm->page_shift); diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c index 98cb6bdab3e6..aab7333aaef0 100644 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c @@ -121,7 +121,7 @@ static void test_vmx_dirty_log(bool enable_ept) */ if (enable_ept) { prepare_eptp(vmx, vm); - nested_map_memslot(vmx, vm, 0); + nested_identity_map_default_memslots(vmx, vm); nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); } From 60de423781ad9967bfdd3a2f02ba0a31787b0d2c Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:32 -0800 Subject: [PATCH 046/267] KVM: selftests: Rename nested TDP mapping functions Rename the functions from nested_* to tdp_* to make their purpose clearer. No functional change intended. 
Suggested-by: Sean Christopherson Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-4-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/vmx.h | 16 +++--- .../testing/selftests/kvm/lib/x86/memstress.c | 4 +- tools/testing/selftests/kvm/lib/x86/vmx.c | 50 +++++++++---------- .../selftests/kvm/x86/vmx_dirty_log_test.c | 6 +-- 4 files changed, 37 insertions(+), 39 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h index 91916b8aa94b..04b8231d032a 100644 --- a/tools/testing/selftests/kvm/include/x86/vmx.h +++ b/tools/testing/selftests/kvm/include/x86/vmx.h @@ -559,14 +559,14 @@ bool load_vmcs(struct vmx_pages *vmx); bool ept_1g_pages_supported(void); -void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr); -void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size); -void nested_identity_map_default_memslots(struct vmx_pages *vmx, - struct kvm_vm *vm); -void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t addr, uint64_t size); +void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, + uint64_t paddr); +void tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, + uint64_t paddr, uint64_t size); +void tdp_identity_map_default_memslots(struct vmx_pages *vmx, + struct kvm_vm *vm); +void tdp_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t addr, uint64_t size); bool kvm_cpu_has_ept(void); void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm); void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 0b1f288ad556..1928b00bde51 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -70,11 +70,11 @@ void memstress_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm) * KVM can shadow the EPT12 with the maximum huge page size supported * by the backing source. 
*/ - nested_identity_map_1g(vmx, vm, 0, 0x100000000ULL); + tdp_identity_map_1g(vmx, vm, 0, 0x100000000ULL); start = align_down(memstress_args.gpa, PG_SIZE_1G); end = align_up(memstress_args.gpa + memstress_args.size, PG_SIZE_1G); - nested_identity_map_1g(vmx, vm, start, end - start); + tdp_identity_map_1g(vmx, vm, start, end - start); } void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]) diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index eec33ec63811..1954ccdfc353 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -362,12 +362,12 @@ void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp) init_vmcs_guest_state(guest_rip, guest_rsp); } -static void nested_create_pte(struct kvm_vm *vm, - struct eptPageTableEntry *pte, - uint64_t nested_paddr, - uint64_t paddr, - int current_level, - int target_level) +static void tdp_create_pte(struct kvm_vm *vm, + struct eptPageTableEntry *pte, + uint64_t nested_paddr, + uint64_t paddr, + int current_level, + int target_level) { if (!pte->readable) { pte->writable = true; @@ -394,8 +394,8 @@ static void nested_create_pte(struct kvm_vm *vm, } -void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, int target_level) +void __tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, int target_level) { const uint64_t page_size = PG_LEVEL_SIZE(target_level); struct eptPageTableEntry *pt = vmx->eptp_hva, *pte; @@ -428,7 +428,7 @@ void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, index = (nested_paddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; pte = &pt[index]; - nested_create_pte(vm, pte, nested_paddr, paddr, level, target_level); + tdp_create_pte(vm, pte, nested_paddr, paddr, level, target_level); if (pte->page_size) break; @@ -445,10 +445,10 @@ void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, } -void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr) +void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr) { - __nested_pg_map(vmx, vm, nested_paddr, paddr, PG_LEVEL_4K); + __tdp_pg_map(vmx, vm, nested_paddr, paddr, PG_LEVEL_4K); } /* @@ -468,8 +468,8 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, * Within the VM given by vm, creates a nested guest translation for the * page range starting at nested_paddr to the page range starting at paddr. 
*/ -void __nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size, +void __tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint64_t size, int level) { size_t page_size = PG_LEVEL_SIZE(level); @@ -479,23 +479,23 @@ void __nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); while (npages--) { - __nested_pg_map(vmx, vm, nested_paddr, paddr, level); + __tdp_pg_map(vmx, vm, nested_paddr, paddr, level); nested_paddr += page_size; paddr += page_size; } } -void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size) +void tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint64_t size) { - __nested_map(vmx, vm, nested_paddr, paddr, size, PG_LEVEL_4K); + __tdp_map(vmx, vm, nested_paddr, paddr, size, PG_LEVEL_4K); } /* Prepare an identity extended page table that maps all the * physical pages in VM. */ -void nested_identity_map_default_memslots(struct vmx_pages *vmx, - struct kvm_vm *vm) +void tdp_identity_map_default_memslots(struct vmx_pages *vmx, + struct kvm_vm *vm) { uint32_t s, memslot = 0; sparsebit_idx_t i, last; @@ -512,18 +512,16 @@ void nested_identity_map_default_memslots(struct vmx_pages *vmx, if (i > last) break; - nested_map(vmx, vm, - (uint64_t)i << vm->page_shift, - (uint64_t)i << vm->page_shift, - 1 << vm->page_shift); + tdp_map(vmx, vm, (uint64_t)i << vm->page_shift, + (uint64_t)i << vm->page_shift, 1 << vm->page_shift); } } /* Identity map a region with 1GiB Pages. */ -void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, +void tdp_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t addr, uint64_t size) { - __nested_map(vmx, vm, addr, addr, size, PG_LEVEL_1G); + __tdp_map(vmx, vm, addr, addr, size, PG_LEVEL_1G); } bool kvm_cpu_has_ept(void) diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c index aab7333aaef0..e7d0c08ba29d 100644 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c @@ -121,9 +121,9 @@ static void test_vmx_dirty_log(bool enable_ept) */ if (enable_ept) { prepare_eptp(vmx, vm); - nested_identity_map_default_memslots(vmx, vm); - nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); - nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); + tdp_identity_map_default_memslots(vmx, vm); + tdp_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); + tdp_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); } bmap = bitmap_zalloc(TEST_MEM_PAGES); From b320c03d685704df51cf0774edd799e96c505c74 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:33 -0800 Subject: [PATCH 047/267] KVM: selftests: Kill eptPageTablePointer Replace the struct overlay with explicit bitmasks, which is clearer and less error-prone. See commit f18b4aebe107 ("kvm: selftests: do not use bitfields larger than 32-bits for PTEs") for an example of why bitfields are not preferable. Remove the unused PAGE_SHIFT_4K definition while at it. No functional change intended. 
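For a concrete feel of the mask-based construction, the EPTP value is just an OR of the page-aligned EPT root with a handful of low control bits; a standalone sketch using the same shifts the patch defines (the WB memtype encoding of 6 is per the SDM, and the example root address is arbitrary):

  #include <stdint.h>
  #include <stdio.h>

  #define X86_MEMTYPE_WB          6ULL
  #define EPTP_MT_SHIFT           0   /* memtype, bits 2:0 */
  #define EPTP_PWL_SHIFT          3   /* page walk length, bits 5:3 */
  #define EPTP_AD_ENABLED_SHIFT   6   /* A/D enable, bit 6 */

  #define EPTP_WB         (X86_MEMTYPE_WB << EPTP_MT_SHIFT)
  #define EPTP_PWL_4      (3ULL << EPTP_PWL_SHIFT)  /* PWL field is (levels - 1) */
  #define EPTP_AD_ENABLED (1ULL << EPTP_AD_ENABLED_SHIFT)

  int main(void)
  {
          uint64_t eptp_gpa = 0x1c0000;   /* page-aligned EPT root (example) */
          uint64_t eptp = eptp_gpa | EPTP_WB | EPTP_PWL_4 | EPTP_AD_ENABLED;

          printf("EPTP = 0x%llx\n", (unsigned long long)eptp);  /* 0x1c005e */
          return 0;
  }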
Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-5-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86/vmx.c | 35 +++++++++++------------ 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index 1954ccdfc353..85043bb1ec4d 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -10,10 +10,16 @@ #include "processor.h" #include "vmx.h" -#define PAGE_SHIFT_4K 12 - #define KVM_EPT_PAGE_TABLE_MIN_PADDR 0x1c0000 +#define EPTP_MT_SHIFT 0 /* EPTP memtype bits 2:0 */ +#define EPTP_PWL_SHIFT 3 /* EPTP page walk length bits 5:3 */ +#define EPTP_AD_ENABLED_SHIFT 6 /* EPTP AD enabled bit 6 */ + +#define EPTP_WB (X86_MEMTYPE_WB << EPTP_MT_SHIFT) +#define EPTP_PWL_4 (3ULL << EPTP_PWL_SHIFT) /* PWL is (levels - 1) */ +#define EPTP_AD_ENABLED (1ULL << EPTP_AD_ENABLED_SHIFT) + bool enable_evmcs; struct hv_enlightened_vmcs *current_evmcs; @@ -34,14 +40,6 @@ struct eptPageTableEntry { uint64_t suppress_ve:1; }; -struct eptPageTablePointer { - uint64_t memory_type:3; - uint64_t page_walk_length:3; - uint64_t ad_enabled:1; - uint64_t reserved_11_07:5; - uint64_t address:40; - uint64_t reserved_63_52:12; -}; int vcpu_enable_evmcs(struct kvm_vcpu *vcpu) { uint16_t evmcs_ver; @@ -196,16 +194,15 @@ static inline void init_vmcs_control_fields(struct vmx_pages *vmx) vmwrite(PIN_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PINBASED_CTLS)); if (vmx->eptp_gpa) { - uint64_t ept_paddr; - struct eptPageTablePointer eptp = { - .memory_type = X86_MEMTYPE_WB, - .page_walk_length = 3, /* + 1 */ - .ad_enabled = ept_vpid_cap_supported(VMX_EPT_VPID_CAP_AD_BITS), - .address = vmx->eptp_gpa >> PAGE_SHIFT_4K, - }; + uint64_t eptp = vmx->eptp_gpa | EPTP_WB | EPTP_PWL_4; - memcpy(&ept_paddr, &eptp, sizeof(ept_paddr)); - vmwrite(EPT_POINTER, ept_paddr); + TEST_ASSERT((vmx->eptp_gpa & ~PHYSICAL_PAGE_MASK) == 0, + "Illegal bits set in vmx->eptp_gpa"); + + if (ept_vpid_cap_supported(VMX_EPT_VPID_CAP_AD_BITS)) + eptp |= EPTP_AD_ENABLED; + + vmwrite(EPT_POINTER, eptp); sec_exec_ctl |= SECONDARY_EXEC_ENABLE_EPT; } From 3cd5002807bebf504cbb6645e73d01204324e54a Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:34 -0800 Subject: [PATCH 048/267] KVM: selftests: Stop setting A/D bits when creating EPT PTEs Stop setting Accessed/Dirty bits when creating EPT entries for L2 so that the stage-1 and stage-2 (a.k.a. TDP) page table APIs can use common code without bleeding the EPT hack into the common APIs. While commit 094444204570 ("selftests: kvm: add test for dirty logging inside nested guests") is _very_ light on details, the most likely explanation is that vmx_dirty_log_test was attempting to avoid taking an EPT Violation on the first _write_ from L2. static void l2_guest_code(u64 *a, u64 *b) { READ_ONCE(*a); WRITE_ONCE(*a, 1); <=== GUEST_SYNC(true); ... } When handling read faults in the shadow MMU, KVM opportunistically creates a writable SPTE if the mapping can be writable *and* the gPTE is dirty (or doesn't support the Dirty bit), i.e. if KVM doesn't need to intercept writes in order to emulate Dirty-bit updates. By setting A/D bits in the test's EPT entries, the above READ+WRITE will fault only on the read, and in theory expose the bug fixed by KVM commit 1f4e5fc83a42 ("KVM: x86: fix nested guest live migration with PML"). 
If the Dirty bit is NOT set, the test will get a false pass; though again, in theory. However, the test is flawed (and always was, at least in the versions posted publicly), as KVM (correctly) marks the corresponding L1 GFN as dirty (in the dirty bitmap) when creating the writable SPTE. I.e. without a check on the dirty bitmap after the READ_ONCE(), the check after the first WRITE_ONCE() will get a false pass due to the dirty bitmap/log having been updated by the read fault, not by PML. Furthermore, the subsequent behavior in the test's l2_guest_code() effectively hides the flawed test behavior, as the straight writes to a new L2 GPA fault also trigger the KVM bug, and so the test will still detect the failure due to lack of isolation between the two testcases (Read=>Write vs. Write=>Write). WRITE_ONCE(*b, 1); GUEST_SYNC(true); WRITE_ONCE(*b, 1); GUEST_SYNC(true); GUEST_SYNC(false); Punt on fixing vmx_dirty_log_test for the moment as it will be easier to properly fix the test once the TDP code uses the common MMU APIs, at which point it will be trivially easy for the test to retrieve the EPT PTE and set the Dirty bit as needed. Signed-off-by: Yosry Ahmed [sean: rewrite changelog to explain the situation] Link: https://patch.msgid.link/20251230230150.4150236-6-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86/vmx.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index 85043bb1ec4d..a3e2eae981da 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -432,14 +432,6 @@ void __tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, pt = addr_gpa2hva(vm, pte->address * vm->page_size); } - - /* - * For now mark these as accessed and dirty because the only - * testcase we have needs that. Can be reconsidered later. - */ - pte->accessed = true; - pte->dirty = true; - } void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, From 9f073ac25b4c4cf3b3ea13b155035108c54148bb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 15:01:35 -0800 Subject: [PATCH 049/267] KVM: selftests: Add "struct kvm_mmu" to track a given MMU instance Add a "struct kvm_mmu" to track a given MMU instance, e.g. a VM's stage-1 MMU versus a VM's stage-2 MMU, so that x86 can share MMU functionality for both stage-1 and stage-2 MMUs, without creating the potential for subtle bugs, e.g. due to consuming vm->pgtable_levels when operating on a stage-2 MMU. Encapsulate the existing de facto MMU in "struct kvm_vm", e.g. instead of burying the MMU details in "struct kvm_vm_arch", to avoid more #ifdefs in ____vm_create(), and in the hopes that other architectures can utilize the formalized MMU structure if/when they too support stage-2 page tables. No functional change intended.
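Abridged, the new structure and its placement look roughly like this (the struct itself is copied from the kvm_util.h hunk below; the kvm_vm snippet is trimmed to the relevant field):

        struct kvm_mmu {
                bool pgd_created;
                uint64_t pgd;
                int pgtable_levels;
        };

        struct kvm_vm {
                /* ... other fields unchanged ... */
                struct kvm_mmu mmu;     /* the VM's stage-1 MMU */
                /* ... */
        };

Accordingly, every former vm->pgd, vm->pgd_created and vm->pgtable_levels access becomes vm->mmu.<field>.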
Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-7-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/include/kvm_util.h | 11 ++++-- .../selftests/kvm/lib/arm64/processor.c | 38 +++++++++---------- tools/testing/selftests/kvm/lib/kvm_util.c | 28 +++++++------- .../selftests/kvm/lib/loongarch/processor.c | 28 +++++++------- .../selftests/kvm/lib/riscv/processor.c | 31 +++++++-------- .../selftests/kvm/lib/s390/processor.c | 16 ++++---- .../testing/selftests/kvm/lib/x86/processor.c | 28 +++++++------- .../kvm/x86/vmx_nested_la57_state_test.c | 2 +- 8 files changed, 94 insertions(+), 88 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 81f4355ff28a..39558c05c0bf 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -88,12 +88,17 @@ enum kvm_mem_region_type { NR_MEM_REGIONS, }; +struct kvm_mmu { + bool pgd_created; + uint64_t pgd; + int pgtable_levels; +}; + struct kvm_vm { int mode; unsigned long type; int kvm_fd; int fd; - unsigned int pgtable_levels; unsigned int page_size; unsigned int page_shift; unsigned int pa_bits; @@ -104,13 +109,13 @@ struct kvm_vm { struct sparsebit *vpages_valid; struct sparsebit *vpages_mapped; bool has_irqchip; - bool pgd_created; vm_paddr_t ucall_mmio_addr; - vm_paddr_t pgd; vm_vaddr_t handlers; uint32_t dirty_ring_size; uint64_t gpa_tag_mask; + struct kvm_mmu mmu; + struct kvm_vm_arch arch; struct kvm_binary_stats stats; diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c index d46e4b13b92c..c40f59d48311 100644 --- a/tools/testing/selftests/kvm/lib/arm64/processor.c +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -28,7 +28,7 @@ static uint64_t page_align(struct kvm_vm *vm, uint64_t v) static uint64_t pgd_index(struct kvm_vm *vm, vm_vaddr_t gva) { - unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; + unsigned int shift = (vm->mmu.pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; uint64_t mask = (1UL << (vm->va_bits - shift)) - 1; return (gva >> shift) & mask; @@ -39,7 +39,7 @@ static uint64_t pud_index(struct kvm_vm *vm, vm_vaddr_t gva) unsigned int shift = 2 * (vm->page_shift - 3) + vm->page_shift; uint64_t mask = (1UL << (vm->page_shift - 3)) - 1; - TEST_ASSERT(vm->pgtable_levels == 4, + TEST_ASSERT(vm->mmu.pgtable_levels == 4, "Mode %d does not have 4 page table levels", vm->mode); return (gva >> shift) & mask; @@ -50,7 +50,7 @@ static uint64_t pmd_index(struct kvm_vm *vm, vm_vaddr_t gva) unsigned int shift = (vm->page_shift - 3) + vm->page_shift; uint64_t mask = (1UL << (vm->page_shift - 3)) - 1; - TEST_ASSERT(vm->pgtable_levels >= 3, + TEST_ASSERT(vm->mmu.pgtable_levels >= 3, "Mode %d does not have >= 3 page table levels", vm->mode); return (gva >> shift) & mask; @@ -104,7 +104,7 @@ static uint64_t pte_addr(struct kvm_vm *vm, uint64_t pte) static uint64_t ptrs_per_pgd(struct kvm_vm *vm) { - unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; + unsigned int shift = (vm->mmu.pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; return 1 << (vm->va_bits - shift); } @@ -117,13 +117,13 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) { size_t nr_pages = page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size; - if (vm->pgd_created) + if (vm->mmu.pgd_created) return; - vm->pgd = vm_phy_pages_alloc(vm, 
nr_pages, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, - vm->memslots[MEM_REGION_PT]); - vm->pgd_created = true; + vm->mmu.pgd = vm_phy_pages_alloc(vm, nr_pages, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, + vm->memslots[MEM_REGION_PT]); + vm->mmu.pgd_created = true; } static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, @@ -147,12 +147,12 @@ static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", paddr, vm->max_gfn, vm->page_size); - ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, vaddr) * 8; + ptep = addr_gpa2hva(vm, vm->mmu.pgd) + pgd_index(vm, vaddr) * 8; if (!*ptep) *ptep = addr_pte(vm, vm_alloc_page_table(vm), PGD_TYPE_TABLE | PTE_VALID); - switch (vm->pgtable_levels) { + switch (vm->mmu.pgtable_levels) { case 4: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, vaddr) * 8; if (!*ptep) @@ -190,16 +190,16 @@ uint64_t *virt_get_pte_hva_at_level(struct kvm_vm *vm, vm_vaddr_t gva, int level { uint64_t *ptep; - if (!vm->pgd_created) + if (!vm->mmu.pgd_created) goto unmapped_gva; - ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, gva) * 8; + ptep = addr_gpa2hva(vm, vm->mmu.pgd) + pgd_index(vm, gva) * 8; if (!ptep) goto unmapped_gva; if (level == 0) return ptep; - switch (vm->pgtable_levels) { + switch (vm->mmu.pgtable_levels) { case 4: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, gva) * 8; if (!ptep) @@ -263,13 +263,13 @@ static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t p void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { - int level = 4 - (vm->pgtable_levels - 1); + int level = 4 - (vm->mmu.pgtable_levels - 1); uint64_t pgd, *ptep; - if (!vm->pgd_created) + if (!vm->mmu.pgd_created) return; - for (pgd = vm->pgd; pgd < vm->pgd + ptrs_per_pgd(vm) * 8; pgd += 8) { + for (pgd = vm->mmu.pgd; pgd < vm->mmu.pgd + ptrs_per_pgd(vm) * 8; pgd += 8) { ptep = addr_gpa2hva(vm, pgd); if (!*ptep) continue; @@ -350,7 +350,7 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); } - ttbr0_el1 = vm->pgd & GENMASK(47, vm->page_shift); + ttbr0_el1 = vm->mmu.pgd & GENMASK(47, vm->page_shift); /* Configure output size */ switch (vm->mode) { @@ -358,7 +358,7 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) case VM_MODE_P52V48_16K: case VM_MODE_P52V48_64K: tcr_el1 |= TCR_IPS_52_BITS; - ttbr0_el1 |= FIELD_GET(GENMASK(51, 48), vm->pgd) << 2; + ttbr0_el1 |= FIELD_GET(GENMASK(51, 48), vm->mmu.pgd) << 2; break; case VM_MODE_P48V48_4K: case VM_MODE_P48V48_16K: diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 8279b6ced8d2..65752daeed90 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -281,34 +281,34 @@ struct kvm_vm *____vm_create(struct vm_shape shape) /* Setup mode specific traits. 
*/ switch (vm->mode) { case VM_MODE_P52V48_4K: - vm->pgtable_levels = 4; + vm->mmu.pgtable_levels = 4; break; case VM_MODE_P52V48_64K: - vm->pgtable_levels = 3; + vm->mmu.pgtable_levels = 3; break; case VM_MODE_P48V48_4K: - vm->pgtable_levels = 4; + vm->mmu.pgtable_levels = 4; break; case VM_MODE_P48V48_64K: - vm->pgtable_levels = 3; + vm->mmu.pgtable_levels = 3; break; case VM_MODE_P40V48_4K: case VM_MODE_P36V48_4K: - vm->pgtable_levels = 4; + vm->mmu.pgtable_levels = 4; break; case VM_MODE_P40V48_64K: case VM_MODE_P36V48_64K: - vm->pgtable_levels = 3; + vm->mmu.pgtable_levels = 3; break; case VM_MODE_P52V48_16K: case VM_MODE_P48V48_16K: case VM_MODE_P40V48_16K: case VM_MODE_P36V48_16K: - vm->pgtable_levels = 4; + vm->mmu.pgtable_levels = 4; break; case VM_MODE_P47V47_16K: case VM_MODE_P36V47_16K: - vm->pgtable_levels = 3; + vm->mmu.pgtable_levels = 3; break; case VM_MODE_PXXVYY_4K: #ifdef __x86_64__ @@ -321,22 +321,22 @@ struct kvm_vm *____vm_create(struct vm_shape shape) vm->va_bits); if (vm->va_bits == 57) { - vm->pgtable_levels = 5; + vm->mmu.pgtable_levels = 5; } else { TEST_ASSERT(vm->va_bits == 48, "Unexpected guest virtual address width: %d", vm->va_bits); - vm->pgtable_levels = 4; + vm->mmu.pgtable_levels = 4; } #else TEST_FAIL("VM_MODE_PXXVYY_4K not supported on non-x86 platforms"); #endif break; case VM_MODE_P47V64_4K: - vm->pgtable_levels = 5; + vm->mmu.pgtable_levels = 5; break; case VM_MODE_P44V64_4K: - vm->pgtable_levels = 5; + vm->mmu.pgtable_levels = 5; break; default: TEST_FAIL("Unknown guest mode: 0x%x", vm->mode); @@ -1956,8 +1956,8 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) fprintf(stream, "%*sMapped Virtual Pages:\n", indent, ""); sparsebit_dump(stream, vm->vpages_mapped, indent + 2); fprintf(stream, "%*spgd_created: %u\n", indent, "", - vm->pgd_created); - if (vm->pgd_created) { + vm->mmu.pgd_created); + if (vm->mmu.pgd_created) { fprintf(stream, "%*sVirtual Translation Tables:\n", indent + 2, ""); virt_dump(stream, vm, indent + 4); diff --git a/tools/testing/selftests/kvm/lib/loongarch/processor.c b/tools/testing/selftests/kvm/lib/loongarch/processor.c index 07c103369ddb..17aa55a2047a 100644 --- a/tools/testing/selftests/kvm/lib/loongarch/processor.c +++ b/tools/testing/selftests/kvm/lib/loongarch/processor.c @@ -50,11 +50,11 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) int i; vm_paddr_t child, table; - if (vm->pgd_created) + if (vm->mmu.pgd_created) return; child = table = 0; - for (i = 0; i < vm->pgtable_levels; i++) { + for (i = 0; i < vm->mmu.pgtable_levels; i++) { invalid_pgtable[i] = child; table = vm_phy_page_alloc(vm, LOONGARCH_PAGE_TABLE_PHYS_MIN, vm->memslots[MEM_REGION_PT]); @@ -62,8 +62,8 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) virt_set_pgtable(vm, table, child); child = table; } - vm->pgd = table; - vm->pgd_created = true; + vm->mmu.pgd = table; + vm->mmu.pgd_created = true; } static int virt_pte_none(uint64_t *ptep, int level) @@ -77,11 +77,11 @@ static uint64_t *virt_populate_pte(struct kvm_vm *vm, vm_vaddr_t gva, int alloc) uint64_t *ptep; vm_paddr_t child; - if (!vm->pgd_created) + if (!vm->mmu.pgd_created) goto unmapped_gva; - child = vm->pgd; - level = vm->pgtable_levels - 1; + child = vm->mmu.pgd; + level = vm->mmu.pgtable_levels - 1; while (level > 0) { ptep = addr_gpa2hva(vm, child) + virt_pte_index(vm, gva, level) * 8; if (virt_pte_none(ptep, level)) { @@ -161,11 +161,11 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { int level; - if (!vm->pgd_created) + if (!vm->mmu.pgd_created) 
return; - level = vm->pgtable_levels - 1; - pte_dump(stream, vm, indent, vm->pgd, level); + level = vm->mmu.pgtable_levels - 1; + pte_dump(stream, vm, indent, vm->mmu.pgd, level); } void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) @@ -297,7 +297,7 @@ static void loongarch_vcpu_setup(struct kvm_vcpu *vcpu) width = vm->page_shift - 3; - switch (vm->pgtable_levels) { + switch (vm->mmu.pgtable_levels) { case 4: /* pud page shift and width */ val = (vm->page_shift + width * 2) << 20 | (width << 25); @@ -309,15 +309,15 @@ static void loongarch_vcpu_setup(struct kvm_vcpu *vcpu) val |= vm->page_shift | width << 5; break; default: - TEST_FAIL("Got %u page table levels, expected 3 or 4", vm->pgtable_levels); + TEST_FAIL("Got %u page table levels, expected 3 or 4", vm->mmu.pgtable_levels); } loongarch_set_csr(vcpu, LOONGARCH_CSR_PWCTL0, val); /* PGD page shift and width */ - val = (vm->page_shift + width * (vm->pgtable_levels - 1)) | width << 6; + val = (vm->page_shift + width * (vm->mmu.pgtable_levels - 1)) | width << 6; loongarch_set_csr(vcpu, LOONGARCH_CSR_PWCTL1, val); - loongarch_set_csr(vcpu, LOONGARCH_CSR_PGDL, vm->pgd); + loongarch_set_csr(vcpu, LOONGARCH_CSR_PGDL, vm->mmu.pgd); /* * Refill exception runs on real mode diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index 2eac7d4b59e9..e6ec7c224fc3 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -60,7 +60,7 @@ static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva, int level) { TEST_ASSERT(level > -1, "Negative page table level (%d) not possible", level); - TEST_ASSERT(level < vm->pgtable_levels, + TEST_ASSERT(level < vm->mmu.pgtable_levels, "Invalid page table level (%d)", level); return (gva & pte_index_mask[level]) >> pte_index_shift[level]; @@ -70,19 +70,19 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) { size_t nr_pages = page_align(vm, ptrs_per_pte(vm) * 8) / vm->page_size; - if (vm->pgd_created) + if (vm->mmu.pgd_created) return; - vm->pgd = vm_phy_pages_alloc(vm, nr_pages, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, - vm->memslots[MEM_REGION_PT]); - vm->pgd_created = true; + vm->mmu.pgd = vm_phy_pages_alloc(vm, nr_pages, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, + vm->memslots[MEM_REGION_PT]); + vm->mmu.pgd_created = true; } void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) { uint64_t *ptep, next_ppn; - int level = vm->pgtable_levels - 1; + int level = vm->mmu.pgtable_levels - 1; TEST_ASSERT((vaddr % vm->page_size) == 0, "Virtual address not on page boundary,\n" @@ -98,7 +98,7 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", paddr, vm->max_gfn, vm->page_size); - ptep = addr_gpa2hva(vm, vm->pgd) + pte_index(vm, vaddr, level) * 8; + ptep = addr_gpa2hva(vm, vm->mmu.pgd) + pte_index(vm, vaddr, level) * 8; if (!*ptep) { next_ppn = vm_alloc_page_table(vm) >> PGTBL_PAGE_SIZE_SHIFT; *ptep = (next_ppn << PGTBL_PTE_ADDR_SHIFT) | @@ -126,12 +126,12 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { uint64_t *ptep; - int level = vm->pgtable_levels - 1; + int level = vm->mmu.pgtable_levels - 1; - if (!vm->pgd_created) + if (!vm->mmu.pgd_created) goto unmapped_gva; - ptep = addr_gpa2hva(vm, vm->pgd) + pte_index(vm, gva, level) * 8; + ptep = addr_gpa2hva(vm, vm->mmu.pgd) + pte_index(vm, gva, level) * 8; if 
(!ptep) goto unmapped_gva; level--; @@ -176,13 +176,14 @@ static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { - int level = vm->pgtable_levels - 1; + struct kvm_mmu *mmu = &vm->mmu; + int level = mmu->pgtable_levels - 1; uint64_t pgd, *ptep; - if (!vm->pgd_created) + if (!mmu->pgd_created) return; - for (pgd = vm->pgd; pgd < vm->pgd + ptrs_per_pte(vm) * 8; pgd += 8) { + for (pgd = mmu->pgd; pgd < mmu->pgd + ptrs_per_pte(vm) * 8; pgd += 8) { ptep = addr_gpa2hva(vm, pgd); if (!*ptep) continue; @@ -211,7 +212,7 @@ void riscv_vcpu_mmu_setup(struct kvm_vcpu *vcpu) TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); } - satp = (vm->pgd >> PGTBL_PAGE_SIZE_SHIFT) & SATP_PPN; + satp = (vm->mmu.pgd >> PGTBL_PAGE_SIZE_SHIFT) & SATP_PPN; satp |= SATP_MODE_48; vcpu_set_reg(vcpu, RISCV_GENERAL_CSR_REG(satp), satp); diff --git a/tools/testing/selftests/kvm/lib/s390/processor.c b/tools/testing/selftests/kvm/lib/s390/processor.c index 8ceeb17c819a..6a9a660413a7 100644 --- a/tools/testing/selftests/kvm/lib/s390/processor.c +++ b/tools/testing/selftests/kvm/lib/s390/processor.c @@ -17,7 +17,7 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", vm->page_size); - if (vm->pgd_created) + if (vm->mmu.pgd_created) return; paddr = vm_phy_pages_alloc(vm, PAGES_PER_REGION, @@ -25,8 +25,8 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) vm->memslots[MEM_REGION_PT]); memset(addr_gpa2hva(vm, paddr), 0xff, PAGES_PER_REGION * vm->page_size); - vm->pgd = paddr; - vm->pgd_created = true; + vm->mmu.pgd = paddr; + vm->mmu.pgd_created = true; } /* @@ -70,7 +70,7 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa) gva, vm->max_gfn, vm->page_size); /* Walk through region and segment tables */ - entry = addr_gpa2hva(vm, vm->pgd); + entry = addr_gpa2hva(vm, vm->mmu.pgd); for (ri = 1; ri <= 4; ri++) { idx = (gva >> (64 - 11 * ri)) & 0x7ffu; if (entry[idx] & REGION_ENTRY_INVALID) @@ -94,7 +94,7 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x", vm->page_size); - entry = addr_gpa2hva(vm, vm->pgd); + entry = addr_gpa2hva(vm, vm->mmu.pgd); for (ri = 1; ri <= 4; ri++) { idx = (gva >> (64 - 11 * ri)) & 0x7ffu; TEST_ASSERT(!(entry[idx] & REGION_ENTRY_INVALID), @@ -149,10 +149,10 @@ static void virt_dump_region(FILE *stream, struct kvm_vm *vm, uint8_t indent, void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { - if (!vm->pgd_created) + if (!vm->mmu.pgd_created) return; - virt_dump_region(stream, vm, indent, vm->pgd); + virt_dump_region(stream, vm, indent, vm->mmu.pgd); } void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code) @@ -184,7 +184,7 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) vcpu_sregs_get(vcpu, &sregs); sregs.crs[0] |= 0x00040000; /* Enable floating point regs */ - sregs.crs[1] = vm->pgd | 0xf; /* Primary region table */ + sregs.crs[1] = vm->mmu.pgd | 0xf; /* Primary region table */ vcpu_sregs_set(vcpu, &sregs); vcpu->run->psw_mask = 0x0400000180000000ULL; /* DAT enabled + 64 bit mode */ diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index c14bf2b5f28f..f027f86d1535 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -162,9 +162,9 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) 
"Unknown or unsupported guest mode: 0x%x", vm->mode); /* If needed, create the top-level page table. */ - if (!vm->pgd_created) { - vm->pgd = vm_alloc_page_table(vm); - vm->pgd_created = true; + if (!vm->mmu.pgd_created) { + vm->mmu.pgd = vm_alloc_page_table(vm); + vm->mmu.pgd_created = true; } } @@ -175,7 +175,7 @@ static void *virt_get_pte(struct kvm_vm *vm, uint64_t *parent_pte, uint64_t *page_table = addr_gpa2hva(vm, pt_gpa); int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; - TEST_ASSERT((*parent_pte & PTE_PRESENT_MASK) || parent_pte == &vm->pgd, + TEST_ASSERT((*parent_pte & PTE_PRESENT_MASK) || parent_pte == &vm->mmu.pgd, "Parent PTE (level %d) not PRESENT for gva: 0x%08lx", level + 1, vaddr); @@ -218,7 +218,7 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) { const uint64_t pg_size = PG_LEVEL_SIZE(level); - uint64_t *pte = &vm->pgd; + uint64_t *pte = &vm->mmu.pgd; int current_level; TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, @@ -243,7 +243,7 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) * Allocate upper level page tables, if not already present. Return * early if a hugepage was created. */ - for (current_level = vm->pgtable_levels; + for (current_level = vm->mmu.pgtable_levels; current_level > PG_LEVEL_4K; current_level--) { pte = virt_create_upper_pte(vm, pte, vaddr, paddr, @@ -309,14 +309,14 @@ static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level) static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, int *level) { - int va_width = 12 + (vm->pgtable_levels) * 9; - uint64_t *pte = &vm->pgd; + int va_width = 12 + (vm->mmu.pgtable_levels) * 9; + uint64_t *pte = &vm->mmu.pgd; int current_level; TEST_ASSERT(!vm->arch.is_pt_protected, "Walking page tables of protected guests is impossible"); - TEST_ASSERT(*level >= PG_LEVEL_NONE && *level <= vm->pgtable_levels, + TEST_ASSERT(*level >= PG_LEVEL_NONE && *level <= vm->mmu.pgtable_levels, "Invalid PG_LEVEL_* '%d'", *level); TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, @@ -332,7 +332,7 @@ static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, (((int64_t)vaddr << (64 - va_width) >> (64 - va_width))), "Canonical check failed. 
The virtual address is invalid."); - for (current_level = vm->pgtable_levels; + for (current_level = vm->mmu.pgtable_levels; current_level > PG_LEVEL_4K; current_level--) { pte = virt_get_pte(vm, pte, vaddr, current_level); @@ -357,7 +357,7 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) uint64_t *pde, *pde_start; uint64_t *pte, *pte_start; - if (!vm->pgd_created) + if (!vm->mmu.pgd_created) return; fprintf(stream, "%*s " @@ -365,7 +365,7 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) fprintf(stream, "%*s index hvaddr gpaddr " "addr w exec dirty\n", indent, ""); - pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd); + pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->mmu.pgd); for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) { pml4e = &pml4e_start[n1]; if (!(*pml4e & PTE_PRESENT_MASK)) @@ -538,7 +538,7 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR; if (kvm_cpu_has(X86_FEATURE_XSAVE)) sregs.cr4 |= X86_CR4_OSXSAVE; - if (vm->pgtable_levels == 5) + if (vm->mmu.pgtable_levels == 5) sregs.cr4 |= X86_CR4_LA57; sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); @@ -549,7 +549,7 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) kvm_seg_set_kernel_data_64bit(&sregs.gs); kvm_seg_set_tss_64bit(vm->arch.tss, &sregs.tr); - sregs.cr3 = vm->pgd; + sregs.cr3 = vm->mmu.pgd; vcpu_sregs_set(vcpu, &sregs); } diff --git a/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c b/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c index cf1d2d1f2a8f..915c42001dba 100644 --- a/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c @@ -90,7 +90,7 @@ int main(int argc, char *argv[]) * L1 needs to read its own PML5 table to set up L2. Identity map * the PML5 table to facilitate this. */ - virt_map(vm, vm->pgd, vm->pgd, 1); + virt_map(vm, vm->mmu.pgd, vm->mmu.pgd, 1); vcpu_alloc_vmx(vm, &vmx_pages_gva); vcpu_args_set(vcpu, 1, vmx_pages_gva); From 11825209f5494098da6cab666d8a767650c1c0cb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 15:01:36 -0800 Subject: [PATCH 050/267] KVM: selftests: Plumb "struct kvm_mmu" into x86's MMU APIs In preparation for generalizing the x86 virt mapping APIs to work with TDP (stage-2) page tables, plumb "struct kvm_mmu" into all of the helper functions instead of operating on vm->mmu directly. Opportunistically swap the order of the check in virt_get_pte() to first assert that the parent is the PGD, and then check that the PTE is present, as it makes more sense to check if the parent PTE is the PGD/root (i.e. not a PTE) before checking that the PTE is PRESENT. No functional change intended. 
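As a sketch of the resulting calling convention (mirroring the processor.h/processor.c hunks below), the helpers take the MMU as an explicit parameter and existing stage-1 callers simply pass &vm->mmu:

        void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr,
                           uint64_t paddr, int level);

        /* e.g. in virt_arch_pg_map(): */
        __virt_pg_map(vm, &vm->mmu, vaddr, paddr, PG_LEVEL_4K);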
Suggested-by: Sean Christopherson Signed-off-by: Yosry Ahmed [sean: rebase on common kvm_mmu structure, rewrite changelog] Link: https://patch.msgid.link/20251230230150.4150236-8-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/include/x86/processor.h | 3 +- .../testing/selftests/kvm/lib/x86/processor.c | 64 +++++++++++-------- 2 files changed, 39 insertions(+), 28 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 1cb5b4c46b99..43970a96baed 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1451,7 +1451,8 @@ enum pg_level { #define PG_SIZE_2M PG_LEVEL_SIZE(PG_LEVEL_2M) #define PG_SIZE_1G PG_LEVEL_SIZE(PG_LEVEL_1G) -void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level); +void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, + uint64_t paddr, int level); void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, uint64_t nr_bytes, int level); diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index f027f86d1535..f25742a804b0 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -156,26 +156,31 @@ bool kvm_is_tdp_enabled(void) return get_kvm_amd_param_bool("npt"); } +static void virt_mmu_init(struct kvm_vm *vm, struct kvm_mmu *mmu) +{ + /* If needed, create the top-level page table. */ + if (!mmu->pgd_created) { + mmu->pgd = vm_alloc_page_table(vm); + mmu->pgd_created = true; + } +} + void virt_arch_pgd_alloc(struct kvm_vm *vm) { TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, "Unknown or unsupported guest mode: 0x%x", vm->mode); - /* If needed, create the top-level page table. 
*/ - if (!vm->mmu.pgd_created) { - vm->mmu.pgd = vm_alloc_page_table(vm); - vm->mmu.pgd_created = true; - } + virt_mmu_init(vm, &vm->mmu); } -static void *virt_get_pte(struct kvm_vm *vm, uint64_t *parent_pte, - uint64_t vaddr, int level) +static void *virt_get_pte(struct kvm_vm *vm, struct kvm_mmu *mmu, + uint64_t *parent_pte, uint64_t vaddr, int level) { uint64_t pt_gpa = PTE_GET_PA(*parent_pte); uint64_t *page_table = addr_gpa2hva(vm, pt_gpa); int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; - TEST_ASSERT((*parent_pte & PTE_PRESENT_MASK) || parent_pte == &vm->mmu.pgd, + TEST_ASSERT((*parent_pte == mmu->pgd) || (*parent_pte & PTE_PRESENT_MASK), "Parent PTE (level %d) not PRESENT for gva: 0x%08lx", level + 1, vaddr); @@ -183,13 +188,14 @@ static void *virt_get_pte(struct kvm_vm *vm, uint64_t *parent_pte, } static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, + struct kvm_mmu *mmu, uint64_t *parent_pte, uint64_t vaddr, uint64_t paddr, int current_level, int target_level) { - uint64_t *pte = virt_get_pte(vm, parent_pte, vaddr, current_level); + uint64_t *pte = virt_get_pte(vm, mmu, parent_pte, vaddr, current_level); paddr = vm_untag_gpa(vm, paddr); @@ -215,10 +221,11 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, return pte; } -void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) +void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, + uint64_t paddr, int level) { const uint64_t pg_size = PG_LEVEL_SIZE(level); - uint64_t *pte = &vm->mmu.pgd; + uint64_t *pte = &mmu->pgd; int current_level; TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, @@ -243,17 +250,17 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) * Allocate upper level page tables, if not already present. Return * early if a hugepage was created. */ - for (current_level = vm->mmu.pgtable_levels; + for (current_level = mmu->pgtable_levels; current_level > PG_LEVEL_4K; current_level--) { - pte = virt_create_upper_pte(vm, pte, vaddr, paddr, + pte = virt_create_upper_pte(vm, mmu, pte, vaddr, paddr, current_level, level); if (*pte & PTE_LARGE_MASK) return; } /* Fill in page table entry. 
*/ - pte = virt_get_pte(vm, pte, vaddr, PG_LEVEL_4K); + pte = virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K); TEST_ASSERT(!(*pte & PTE_PRESENT_MASK), "PTE already present for 4k page at vaddr: 0x%lx", vaddr); *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK); @@ -270,7 +277,7 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) { - __virt_pg_map(vm, vaddr, paddr, PG_LEVEL_4K); + __virt_pg_map(vm, &vm->mmu, vaddr, paddr, PG_LEVEL_4K); } void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, @@ -285,7 +292,7 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, nr_bytes, pg_size); for (i = 0; i < nr_pages; i++) { - __virt_pg_map(vm, vaddr, paddr, level); + __virt_pg_map(vm, &vm->mmu, vaddr, paddr, level); sparsebit_set_num(vm->vpages_mapped, vaddr >> vm->page_shift, nr_bytes / PAGE_SIZE); @@ -294,7 +301,8 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, } } -static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level) +static bool vm_is_target_pte(struct kvm_mmu *mmu, uint64_t *pte, + int *level, int current_level) { if (*pte & PTE_LARGE_MASK) { TEST_ASSERT(*level == PG_LEVEL_NONE || @@ -306,17 +314,19 @@ static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level) return *level == current_level; } -static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, +static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, + struct kvm_mmu *mmu, + uint64_t vaddr, int *level) { - int va_width = 12 + (vm->mmu.pgtable_levels) * 9; - uint64_t *pte = &vm->mmu.pgd; + int va_width = 12 + (mmu->pgtable_levels) * 9; + uint64_t *pte = &mmu->pgd; int current_level; TEST_ASSERT(!vm->arch.is_pt_protected, "Walking page tables of protected guests is impossible"); - TEST_ASSERT(*level >= PG_LEVEL_NONE && *level <= vm->mmu.pgtable_levels, + TEST_ASSERT(*level >= PG_LEVEL_NONE && *level <= mmu->pgtable_levels, "Invalid PG_LEVEL_* '%d'", *level); TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, @@ -332,22 +342,22 @@ static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, (((int64_t)vaddr << (64 - va_width) >> (64 - va_width))), "Canonical check failed. 
The virtual address is invalid."); - for (current_level = vm->mmu.pgtable_levels; + for (current_level = mmu->pgtable_levels; current_level > PG_LEVEL_4K; current_level--) { - pte = virt_get_pte(vm, pte, vaddr, current_level); - if (vm_is_target_pte(pte, level, current_level)) + pte = virt_get_pte(vm, mmu, pte, vaddr, current_level); + if (vm_is_target_pte(mmu, pte, level, current_level)) return pte; } - return virt_get_pte(vm, pte, vaddr, PG_LEVEL_4K); + return virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K); } uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr) { int level = PG_LEVEL_4K; - return __vm_get_page_table_entry(vm, vaddr, &level); + return __vm_get_page_table_entry(vm, &vm->mmu, vaddr, &level); } void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) @@ -497,7 +507,7 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_segment *segp) vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { int level = PG_LEVEL_NONE; - uint64_t *pte = __vm_get_page_table_entry(vm, gva, &level); + uint64_t *pte = __vm_get_page_table_entry(vm, &vm->mmu, gva, &level); TEST_ASSERT(*pte & PTE_PRESENT_MASK, "Leaf PTE not PRESENT for gva: 0x%08lx", gva); From 3d0e7595e81017558189d2252ae9453c57ab3436 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 15:01:37 -0800 Subject: [PATCH 051/267] KVM: selftests: Add a "struct kvm_mmu_arch arch" member to kvm_mmu Add an arch structure+field in "struct kvm_mmu" so that architectures can track arch-specific information for a given MMU. No functional change intended. Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-9-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h | 2 ++ tools/testing/selftests/kvm/include/kvm_util.h | 2 ++ tools/testing/selftests/kvm/include/loongarch/kvm_util_arch.h | 1 + tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h | 1 + tools/testing/selftests/kvm/include/s390/kvm_util_arch.h | 1 + tools/testing/selftests/kvm/include/x86/kvm_util_arch.h | 2 ++ 6 files changed, 9 insertions(+) diff --git a/tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h b/tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h index b973bb2c64a6..4a2033708227 100644 --- a/tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h @@ -2,6 +2,8 @@ #ifndef SELFTEST_KVM_UTIL_ARCH_H #define SELFTEST_KVM_UTIL_ARCH_H +struct kvm_mmu_arch {}; + struct kvm_vm_arch { bool has_gic; int gic_fd; diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 39558c05c0bf..c1497515fa6a 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -92,6 +92,8 @@ struct kvm_mmu { bool pgd_created; uint64_t pgd; int pgtable_levels; + + struct kvm_mmu_arch arch; }; struct kvm_vm { diff --git a/tools/testing/selftests/kvm/include/loongarch/kvm_util_arch.h b/tools/testing/selftests/kvm/include/loongarch/kvm_util_arch.h index e43a57d99b56..d5095900e442 100644 --- a/tools/testing/selftests/kvm/include/loongarch/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/loongarch/kvm_util_arch.h @@ -2,6 +2,7 @@ #ifndef SELFTEST_KVM_UTIL_ARCH_H #define SELFTEST_KVM_UTIL_ARCH_H +struct kvm_mmu_arch {}; struct kvm_vm_arch {}; #endif // SELFTEST_KVM_UTIL_ARCH_H diff --git a/tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h 
b/tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h index e43a57d99b56..d5095900e442 100644 --- a/tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h @@ -2,6 +2,7 @@ #ifndef SELFTEST_KVM_UTIL_ARCH_H #define SELFTEST_KVM_UTIL_ARCH_H +struct kvm_mmu_arch {}; struct kvm_vm_arch {}; #endif // SELFTEST_KVM_UTIL_ARCH_H diff --git a/tools/testing/selftests/kvm/include/s390/kvm_util_arch.h b/tools/testing/selftests/kvm/include/s390/kvm_util_arch.h index e43a57d99b56..d5095900e442 100644 --- a/tools/testing/selftests/kvm/include/s390/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/s390/kvm_util_arch.h @@ -2,6 +2,7 @@ #ifndef SELFTEST_KVM_UTIL_ARCH_H #define SELFTEST_KVM_UTIL_ARCH_H +struct kvm_mmu_arch {}; struct kvm_vm_arch {}; #endif // SELFTEST_KVM_UTIL_ARCH_H diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h index 972bb1c4ab4c..456e5ca170df 100644 --- a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h @@ -10,6 +10,8 @@ extern bool is_forced_emulation_enabled; +struct kvm_mmu_arch {}; + struct kvm_vm_arch { vm_vaddr_t gdt; vm_vaddr_t tss; From 6dd70757213fc046e5f86901e4baf11ed894da6b Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:38 -0800 Subject: [PATCH 052/267] KVM: selftests: Move PTE bitmasks to kvm_mmu Move the PTE bitmasks into kvm_mmu to parameterize them for virt mapping functions. Introduce helpers to read/write different PTE bits given a kvm_mmu. Drop the 'global' bit definition as it's currently unused, but leave the 'user' bit as it will be used in coming changes. Opportunistically rename 'large' to 'huge' as it's more consistent with the kernel naming. Leave PHYSICAL_PAGE_MASK alone; it's fixed in all page table formats and a lot of other macros depend on it. It's tempting to move all the other macros to be per-struct instead, but it would be too much noise for little benefit. Keep c_bit and s_bit in vm->arch as they are used before the MMU is initialized, through __vm_create() -> vm_userspace_mem_region_add() -> vm_mem_add() -> vm_arch_has_protected_memory(). No functional change intended.
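For illustration, an abridged view of the per-MMU parameterization (the field values are the x86 stage-1 defaults set up in the processor.c hunk below, and only a subset of the masks is shown):

        struct pte_masks pte_masks = {
                .present  = BIT_ULL(0),
                .writable = BIT_ULL(1),
                .huge     = BIT_ULL(7),         /* formerly PTE_LARGE_MASK */
                .nx       = BIT_ULL(63),
                /* user/accessed/dirty and the C/S bits are omitted here */
        };

        /* Open-coded bit tests become helpers that take the MMU, e.g.: */
        TEST_ASSERT(!is_present_pte(mmu, pte),
                    "PTE already present for 4k page at vaddr: 0x%lx", vaddr);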
Signed-off-by: Yosry Ahmed [sean: rename accessors to is__pte()] Link: https://patch.msgid.link/20251230230150.4150236-10-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/include/x86/kvm_util_arch.h | 16 ++++- .../selftests/kvm/include/x86/processor.h | 28 +++++--- .../testing/selftests/kvm/lib/x86/processor.c | 71 +++++++++++-------- 3 files changed, 76 insertions(+), 39 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h index 456e5ca170df..bad381d63b6a 100644 --- a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h @@ -10,7 +10,21 @@ extern bool is_forced_emulation_enabled; -struct kvm_mmu_arch {}; +struct pte_masks { + uint64_t present; + uint64_t writable; + uint64_t user; + uint64_t accessed; + uint64_t dirty; + uint64_t huge; + uint64_t nx; + uint64_t c; + uint64_t s; +}; + +struct kvm_mmu_arch { + struct pte_masks pte_masks; +}; struct kvm_vm_arch { vm_vaddr_t gdt; diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 43970a96baed..55dac84cd4a7 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -362,16 +362,6 @@ static inline unsigned int x86_model(unsigned int eax) return ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f); } -/* Page table bitfield declarations */ -#define PTE_PRESENT_MASK BIT_ULL(0) -#define PTE_WRITABLE_MASK BIT_ULL(1) -#define PTE_USER_MASK BIT_ULL(2) -#define PTE_ACCESSED_MASK BIT_ULL(5) -#define PTE_DIRTY_MASK BIT_ULL(6) -#define PTE_LARGE_MASK BIT_ULL(7) -#define PTE_GLOBAL_MASK BIT_ULL(8) -#define PTE_NX_MASK BIT_ULL(63) - #define PHYSICAL_PAGE_MASK GENMASK_ULL(51, 12) #define PAGE_SHIFT 12 @@ -1451,6 +1441,24 @@ enum pg_level { #define PG_SIZE_2M PG_LEVEL_SIZE(PG_LEVEL_2M) #define PG_SIZE_1G PG_LEVEL_SIZE(PG_LEVEL_1G) +#define PTE_PRESENT_MASK(mmu) ((mmu)->arch.pte_masks.present) +#define PTE_WRITABLE_MASK(mmu) ((mmu)->arch.pte_masks.writable) +#define PTE_USER_MASK(mmu) ((mmu)->arch.pte_masks.user) +#define PTE_ACCESSED_MASK(mmu) ((mmu)->arch.pte_masks.accessed) +#define PTE_DIRTY_MASK(mmu) ((mmu)->arch.pte_masks.dirty) +#define PTE_HUGE_MASK(mmu) ((mmu)->arch.pte_masks.huge) +#define PTE_NX_MASK(mmu) ((mmu)->arch.pte_masks.nx) +#define PTE_C_BIT_MASK(mmu) ((mmu)->arch.pte_masks.c) +#define PTE_S_BIT_MASK(mmu) ((mmu)->arch.pte_masks.s) + +#define is_present_pte(mmu, pte) (!!(*(pte) & PTE_PRESENT_MASK(mmu))) +#define is_writable_pte(mmu, pte) (!!(*(pte) & PTE_WRITABLE_MASK(mmu))) +#define is_user_pte(mmu, pte) (!!(*(pte) & PTE_USER_MASK(mmu))) +#define is_accessed_pte(mmu, pte) (!!(*(pte) & PTE_ACCESSED_MASK(mmu))) +#define is_dirty_pte(mmu, pte) (!!(*(pte) & PTE_DIRTY_MASK(mmu))) +#define is_huge_pte(mmu, pte) (!!(*(pte) & PTE_HUGE_MASK(mmu))) +#define is_nx_pte(mmu, pte) (!!(*(pte) & PTE_NX_MASK(mmu))) + void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, uint64_t paddr, int level); void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index f25742a804b0..3800f4ff6770 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -156,12 +156,14 @@ bool kvm_is_tdp_enabled(void) return get_kvm_amd_param_bool("npt"); } -static void 
virt_mmu_init(struct kvm_vm *vm, struct kvm_mmu *mmu) +static void virt_mmu_init(struct kvm_vm *vm, struct kvm_mmu *mmu, + struct pte_masks *pte_masks) { /* If needed, create the top-level page table. */ if (!mmu->pgd_created) { mmu->pgd = vm_alloc_page_table(vm); mmu->pgd_created = true; + mmu->arch.pte_masks = *pte_masks; } } @@ -170,7 +172,19 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, "Unknown or unsupported guest mode: 0x%x", vm->mode); - virt_mmu_init(vm, &vm->mmu); + struct pte_masks pte_masks = (struct pte_masks){ + .present = BIT_ULL(0), + .writable = BIT_ULL(1), + .user = BIT_ULL(2), + .accessed = BIT_ULL(5), + .dirty = BIT_ULL(6), + .huge = BIT_ULL(7), + .nx = BIT_ULL(63), + .c = vm->arch.c_bit, + .s = vm->arch.s_bit, + }; + + virt_mmu_init(vm, &vm->mmu, &pte_masks); } static void *virt_get_pte(struct kvm_vm *vm, struct kvm_mmu *mmu, @@ -180,7 +194,7 @@ static void *virt_get_pte(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t *page_table = addr_gpa2hva(vm, pt_gpa); int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; - TEST_ASSERT((*parent_pte == mmu->pgd) || (*parent_pte & PTE_PRESENT_MASK), + TEST_ASSERT((*parent_pte == mmu->pgd) || is_present_pte(mmu, parent_pte), "Parent PTE (level %d) not PRESENT for gva: 0x%08lx", level + 1, vaddr); @@ -199,10 +213,10 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, paddr = vm_untag_gpa(vm, paddr); - if (!(*pte & PTE_PRESENT_MASK)) { - *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK; + if (!is_present_pte(mmu, pte)) { + *pte = PTE_PRESENT_MASK(mmu) | PTE_WRITABLE_MASK(mmu); if (current_level == target_level) - *pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK); + *pte |= PTE_HUGE_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK); else *pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK; } else { @@ -214,7 +228,7 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, TEST_ASSERT(current_level != target_level, "Cannot create hugepage at level: %u, vaddr: 0x%lx", current_level, vaddr); - TEST_ASSERT(!(*pte & PTE_LARGE_MASK), + TEST_ASSERT(!is_huge_pte(mmu, pte), "Cannot create page table at level: %u, vaddr: 0x%lx", current_level, vaddr); } @@ -255,24 +269,24 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, current_level--) { pte = virt_create_upper_pte(vm, mmu, pte, vaddr, paddr, current_level, level); - if (*pte & PTE_LARGE_MASK) + if (is_huge_pte(mmu, pte)) return; } /* Fill in page table entry. */ pte = virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K); - TEST_ASSERT(!(*pte & PTE_PRESENT_MASK), + TEST_ASSERT(!is_present_pte(mmu, pte), "PTE already present for 4k page at vaddr: 0x%lx", vaddr); - *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK); + *pte = PTE_PRESENT_MASK(mmu) | PTE_WRITABLE_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK); /* * Neither SEV nor TDX supports shared page tables, so only the final * leaf PTE needs manually set the C/S-bit. 
*/ if (vm_is_gpa_protected(vm, paddr)) - *pte |= vm->arch.c_bit; + *pte |= PTE_C_BIT_MASK(mmu); else - *pte |= vm->arch.s_bit; + *pte |= PTE_S_BIT_MASK(mmu); } void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) @@ -304,7 +318,7 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, static bool vm_is_target_pte(struct kvm_mmu *mmu, uint64_t *pte, int *level, int current_level) { - if (*pte & PTE_LARGE_MASK) { + if (is_huge_pte(mmu, pte)) { TEST_ASSERT(*level == PG_LEVEL_NONE || *level == current_level, "Unexpected hugepage at level %d", current_level); @@ -362,12 +376,13 @@ uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr) void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { + struct kvm_mmu *mmu = &vm->mmu; uint64_t *pml4e, *pml4e_start; uint64_t *pdpe, *pdpe_start; uint64_t *pde, *pde_start; uint64_t *pte, *pte_start; - if (!vm->mmu.pgd_created) + if (!mmu->pgd_created) return; fprintf(stream, "%*s " @@ -375,47 +390,47 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) fprintf(stream, "%*s index hvaddr gpaddr " "addr w exec dirty\n", indent, ""); - pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->mmu.pgd); + pml4e_start = (uint64_t *) addr_gpa2hva(vm, mmu->pgd); for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) { pml4e = &pml4e_start[n1]; - if (!(*pml4e & PTE_PRESENT_MASK)) + if (!is_present_pte(mmu, pml4e)) continue; fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u " " %u\n", indent, "", pml4e - pml4e_start, pml4e, addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e), - !!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK)); + is_writable_pte(mmu, pml4e), is_nx_pte(mmu, pml4e)); pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK); for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) { pdpe = &pdpe_start[n2]; - if (!(*pdpe & PTE_PRESENT_MASK)) + if (!is_present_pte(mmu, pdpe)) continue; fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10llx " "%u %u\n", indent, "", pdpe - pdpe_start, pdpe, addr_hva2gpa(vm, pdpe), - PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK), - !!(*pdpe & PTE_NX_MASK)); + PTE_GET_PFN(*pdpe), is_writable_pte(mmu, pdpe), + is_nx_pte(mmu, pdpe)); pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK); for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) { pde = &pde_start[n3]; - if (!(*pde & PTE_PRESENT_MASK)) + if (!is_present_pte(mmu, pde)) continue; fprintf(stream, "%*spde 0x%-3zx %p " "0x%-12lx 0x%-10llx %u %u\n", indent, "", pde - pde_start, pde, addr_hva2gpa(vm, pde), - PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK), - !!(*pde & PTE_NX_MASK)); + PTE_GET_PFN(*pde), is_writable_pte(mmu, pde), + is_nx_pte(mmu, pde)); pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK); for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) { pte = &pte_start[n4]; - if (!(*pte & PTE_PRESENT_MASK)) + if (!is_present_pte(mmu, pte)) continue; fprintf(stream, "%*spte 0x%-3zx %p " "0x%-12lx 0x%-10llx %u %u " @@ -424,9 +439,9 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) pte - pte_start, pte, addr_hva2gpa(vm, pte), PTE_GET_PFN(*pte), - !!(*pte & PTE_WRITABLE_MASK), - !!(*pte & PTE_NX_MASK), - !!(*pte & PTE_DIRTY_MASK), + is_writable_pte(mmu, pte), + is_nx_pte(mmu, pte), + is_dirty_pte(mmu, pte), ((uint64_t) n1 << 27) | ((uint64_t) n2 << 18) | ((uint64_t) n3 << 9) @@ -509,7 +524,7 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) int level = PG_LEVEL_NONE; uint64_t *pte = __vm_get_page_table_entry(vm, &vm->mmu, gva, &level); - TEST_ASSERT(*pte & PTE_PRESENT_MASK, + 
TEST_ASSERT(is_present_pte(&vm->mmu, pte), "Leaf PTE not PRESENT for gva: 0x%08lx", gva); /* From f00f519cebcd4280c4ce4fab8133ed2dcfa8f95a Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:39 -0800 Subject: [PATCH 053/267] KVM: selftests: Use a TDP MMU to share EPT page tables between vCPUs prepare_eptp() currently allocates new EPTs for each vCPU. memstress has its own hack to share the EPTs between vCPUs. Currently, there is no reason to have separate EPTs for each vCPU, and the complexity is significant. The only reason it doesn't matter now is because memstress is the only user with multiple vCPUs. Add vm_enable_ept() to allocate EPT page tables for an entire VM, and use it everywhere to replace prepare_eptp(). Drop 'eptp' and 'eptp_hva' from 'struct vmx_pages' as they serve no purpose (e.g. the EPTP can be built from the PGD), but keep 'eptp_gpa' so that the MMU structure doesn't need to be passed in along with vmx_pages. Dynamically allocate the TDP MMU structure to avoid a cyclical dependency between kvm_util_arch.h and kvm_util.h. Remove the workaround in memstress to copy the EPT root between vCPUs since that's now the default behavior. Name the MMU tdp_mmu instead of e.g. nested_mmu or nested.mmu to avoid recreating the same mess that KVM has with respect to "nested" MMUs, e.g. does nested refer to the stage-2 page tables created by L1, or the stage-1 page tables created by L2? Signed-off-by: Yosry Ahmed Co-developed-by: Sean Christopherson Link: https://patch.msgid.link/20251230230150.4150236-11-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/include/x86/kvm_util_arch.h | 4 +++ .../selftests/kvm/include/x86/processor.h | 3 ++ tools/testing/selftests/kvm/include/x86/vmx.h | 8 ++--- .../testing/selftests/kvm/lib/x86/memstress.c | 19 ++++-------- .../testing/selftests/kvm/lib/x86/processor.c | 9 ++++++ tools/testing/selftests/kvm/lib/x86/vmx.c | 30 ++++++++++++------- .../selftests/kvm/x86/vmx_dirty_log_test.c | 7 ++--- 7 files changed, 48 insertions(+), 32 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h index bad381d63b6a..05a1fc1780f2 100644 --- a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h @@ -26,6 +26,8 @@ struct kvm_mmu_arch { struct pte_masks pte_masks; }; +struct kvm_mmu; + struct kvm_vm_arch { vm_vaddr_t gdt; vm_vaddr_t tss; @@ -35,6 +37,8 @@ struct kvm_vm_arch { uint64_t s_bit; int sev_fd; bool is_pt_protected; + + struct kvm_mmu *tdp_mmu; }; static inline bool __vm_arch_has_protected_memory(struct kvm_vm_arch *arch) diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 55dac84cd4a7..0164ef090787 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1459,6 +1459,9 @@ enum pg_level { #define is_huge_pte(mmu, pte) (!!(*(pte) & PTE_HUGE_MASK(mmu))) #define is_nx_pte(mmu, pte) (!!(*(pte) & PTE_NX_MASK(mmu))) +void tdp_mmu_init(struct kvm_vm *vm, int pgtable_levels, + struct pte_masks *pte_masks); + void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, uint64_t paddr, int level); void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h index 04b8231d032a..1fd83c23529a 100644 --- 
a/tools/testing/selftests/kvm/include/x86/vmx.h +++ b/tools/testing/selftests/kvm/include/x86/vmx.h @@ -520,13 +520,11 @@ struct vmx_pages { uint64_t vmwrite_gpa; void *vmwrite; - void *eptp_hva; - uint64_t eptp_gpa; - void *eptp; - void *apic_access_hva; uint64_t apic_access_gpa; void *apic_access; + + uint64_t eptp_gpa; }; union vmx_basic { @@ -568,7 +566,7 @@ void tdp_identity_map_default_memslots(struct vmx_pages *vmx, void tdp_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t addr, uint64_t size); bool kvm_cpu_has_ept(void); -void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm); +void vm_enable_ept(struct kvm_vm *vm); void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); #endif /* SELFTEST_KVM_VMX_H */ diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 1928b00bde51..00f7f11e5f0e 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -59,12 +59,10 @@ uint64_t memstress_nested_pages(int nr_vcpus) return 513 + 10 * nr_vcpus; } -void memstress_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm) +static void memstress_setup_ept_mappings(struct vmx_pages *vmx, struct kvm_vm *vm) { uint64_t start, end; - prepare_eptp(vmx, vm); - /* * Identity map the first 4G and the test region with 1G pages so that * KVM can shadow the EPT12 with the maximum huge page size supported @@ -79,7 +77,7 @@ void memstress_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm) void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]) { - struct vmx_pages *vmx, *vmx0 = NULL; + struct vmx_pages *vmx; struct kvm_regs regs; vm_vaddr_t vmx_gva; int vcpu_id; @@ -87,18 +85,13 @@ void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); TEST_REQUIRE(kvm_cpu_has_ept()); + vm_enable_ept(vm); for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { vmx = vcpu_alloc_vmx(vm, &vmx_gva); - if (vcpu_id == 0) { - memstress_setup_ept(vmx, vm); - vmx0 = vmx; - } else { - /* Share the same EPT table across all vCPUs. 
*/ - vmx->eptp = vmx0->eptp; - vmx->eptp_hva = vmx0->eptp_hva; - vmx->eptp_gpa = vmx0->eptp_gpa; - } + /* The EPTs are shared across vCPUs, setup the mappings once */ + if (vcpu_id == 0) + memstress_setup_ept_mappings(vmx, vm); /* * Override the vCPU to run memstress_l1_guest_code() which will diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index 3800f4ff6770..8a9298a72897 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -187,6 +187,15 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) virt_mmu_init(vm, &vm->mmu, &pte_masks); } +void tdp_mmu_init(struct kvm_vm *vm, int pgtable_levels, + struct pte_masks *pte_masks) +{ + TEST_ASSERT(!vm->arch.tdp_mmu, "TDP MMU already initialized"); + + vm->arch.tdp_mmu = calloc(1, sizeof(*vm->arch.tdp_mmu)); + virt_mmu_init(vm, vm->arch.tdp_mmu, pte_masks); +} + static void *virt_get_pte(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t *parent_pte, uint64_t vaddr, int level) { diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index a3e2eae981da..9d4e391fdf2c 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -56,6 +56,21 @@ int vcpu_enable_evmcs(struct kvm_vcpu *vcpu) return evmcs_ver; } +void vm_enable_ept(struct kvm_vm *vm) +{ + TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT"); + if (vm->arch.tdp_mmu) + return; + + /* TODO: Drop eptPageTableEntry in favor of PTE masks. */ + struct pte_masks pte_masks = (struct pte_masks) { + + }; + + /* TODO: Add support for 5-level EPT. */ + tdp_mmu_init(vm, 4, &pte_masks); +} + /* Allocate memory regions for nested VMX tests. * * Input Args: @@ -105,6 +120,9 @@ vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva) vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite); memset(vmx->vmwrite_hva, 0, getpagesize()); + if (vm->arch.tdp_mmu) + vmx->eptp_gpa = vm->arch.tdp_mmu->pgd; + *p_vmx_gva = vmx_gva; return vmx; } @@ -395,7 +413,8 @@ void __tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, int target_level) { const uint64_t page_size = PG_LEVEL_SIZE(target_level); - struct eptPageTableEntry *pt = vmx->eptp_hva, *pte; + void *eptp_hva = addr_gpa2hva(vm, vm->arch.tdp_mmu->pgd); + struct eptPageTableEntry *pt = eptp_hva, *pte; uint16_t index; TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, @@ -525,15 +544,6 @@ bool kvm_cpu_has_ept(void) return ctrl & SECONDARY_EXEC_ENABLE_EPT; } -void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm) -{ - TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT"); - - vmx->eptp = (void *)vm_vaddr_alloc_page(vm); - vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp); - vmx->eptp_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->eptp); -} - void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm) { vmx->apic_access = (void *)vm_vaddr_alloc_page(vm); diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c index e7d0c08ba29d..5c8cf8ac42a2 100644 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c @@ -93,6 +93,9 @@ static void test_vmx_dirty_log(bool enable_ept) /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + if (enable_ept) + vm_enable_ept(vm); + vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); vcpu_args_set(vcpu, 1, vmx_pages_gva); @@ 
-113,14 +116,10 @@ static void test_vmx_dirty_log(bool enable_ept) * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to * 0xc0000000. * - * Note that prepare_eptp should be called only L1's GPA map is done, - * meaning after the last call to virt_map. - * * When EPT is disabled, the L2 guest code will still access the same L1 * GPAs as the EPT enabled case. */ if (enable_ept) { - prepare_eptp(vmx, vm); tdp_identity_map_default_memslots(vmx, vm); tdp_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); tdp_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); From e40e72fec0dea9ac55aea84a0d76ccb7d7f32204 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:40 -0800 Subject: [PATCH 054/267] KVM: selftests: Stop passing VMX metadata to TDP mapping functions The root GPA is now retrieved from the nested MMU, stop passing VMX metadata. This is in preparation for making these functions work for NPTs as well. Opportunistically drop tdp_pg_map() since it's unused. No functional change intended. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-12-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/vmx.h | 11 ++----- .../testing/selftests/kvm/lib/x86/memstress.c | 11 +++---- tools/testing/selftests/kvm/lib/x86/vmx.c | 33 +++++++------------ .../selftests/kvm/x86/vmx_dirty_log_test.c | 9 +++-- 4 files changed, 24 insertions(+), 40 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h index 1fd83c23529a..4dd4c2094ee6 100644 --- a/tools/testing/selftests/kvm/include/x86/vmx.h +++ b/tools/testing/selftests/kvm/include/x86/vmx.h @@ -557,14 +557,9 @@ bool load_vmcs(struct vmx_pages *vmx); bool ept_1g_pages_supported(void); -void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, - uint64_t paddr); -void tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr, - uint64_t paddr, uint64_t size); -void tdp_identity_map_default_memslots(struct vmx_pages *vmx, - struct kvm_vm *vm); -void tdp_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t addr, uint64_t size); +void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size); +void tdp_identity_map_default_memslots(struct kvm_vm *vm); +void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size); bool kvm_cpu_has_ept(void); void vm_enable_ept(struct kvm_vm *vm); void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 00f7f11e5f0e..3319cb57a78d 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -59,7 +59,7 @@ uint64_t memstress_nested_pages(int nr_vcpus) return 513 + 10 * nr_vcpus; } -static void memstress_setup_ept_mappings(struct vmx_pages *vmx, struct kvm_vm *vm) +static void memstress_setup_ept_mappings(struct kvm_vm *vm) { uint64_t start, end; @@ -68,16 +68,15 @@ static void memstress_setup_ept_mappings(struct vmx_pages *vmx, struct kvm_vm *v * KVM can shadow the EPT12 with the maximum huge page size supported * by the backing source. 
*/ - tdp_identity_map_1g(vmx, vm, 0, 0x100000000ULL); + tdp_identity_map_1g(vm, 0, 0x100000000ULL); start = align_down(memstress_args.gpa, PG_SIZE_1G); end = align_up(memstress_args.gpa + memstress_args.size, PG_SIZE_1G); - tdp_identity_map_1g(vmx, vm, start, end - start); + tdp_identity_map_1g(vm, start, end - start); } void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]) { - struct vmx_pages *vmx; struct kvm_regs regs; vm_vaddr_t vmx_gva; int vcpu_id; @@ -87,11 +86,11 @@ void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc vm_enable_ept(vm); for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { - vmx = vcpu_alloc_vmx(vm, &vmx_gva); + vcpu_alloc_vmx(vm, &vmx_gva); /* The EPTs are shared across vCPUs, setup the mappings once */ if (vcpu_id == 0) - memstress_setup_ept_mappings(vmx, vm); + memstress_setup_ept_mappings(vm); /* * Override the vCPU to run memstress_l1_guest_code() which will diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index 9d4e391fdf2c..ea1c09f9e8ab 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -409,8 +409,8 @@ static void tdp_create_pte(struct kvm_vm *vm, } -void __tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, int target_level) +void __tdp_pg_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, + int target_level) { const uint64_t page_size = PG_LEVEL_SIZE(target_level); void *eptp_hva = addr_gpa2hva(vm, vm->arch.tdp_mmu->pgd); @@ -453,12 +453,6 @@ void __tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, } } -void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr) -{ - __tdp_pg_map(vmx, vm, nested_paddr, paddr, PG_LEVEL_4K); -} - /* * Map a range of EPT guest physical addresses to the VM's physical address * @@ -476,9 +470,8 @@ void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, * Within the VM given by vm, creates a nested guest translation for the * page range starting at nested_paddr to the page range starting at paddr. */ -void __tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size, - int level) +void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, + uint64_t size, int level) { size_t page_size = PG_LEVEL_SIZE(level); size_t npages = size / page_size; @@ -487,23 +480,22 @@ void __tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); while (npages--) { - __tdp_pg_map(vmx, vm, nested_paddr, paddr, level); + __tdp_pg_map(vm, nested_paddr, paddr, level); nested_paddr += page_size; paddr += page_size; } } -void tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size) +void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, + uint64_t size) { - __tdp_map(vmx, vm, nested_paddr, paddr, size, PG_LEVEL_4K); + __tdp_map(vm, nested_paddr, paddr, size, PG_LEVEL_4K); } /* Prepare an identity extended page table that maps all the * physical pages in VM. 
*/ -void tdp_identity_map_default_memslots(struct vmx_pages *vmx, - struct kvm_vm *vm) +void tdp_identity_map_default_memslots(struct kvm_vm *vm) { uint32_t s, memslot = 0; sparsebit_idx_t i, last; @@ -520,16 +512,15 @@ void tdp_identity_map_default_memslots(struct vmx_pages *vmx, if (i > last) break; - tdp_map(vmx, vm, (uint64_t)i << vm->page_shift, + tdp_map(vm, (uint64_t)i << vm->page_shift, (uint64_t)i << vm->page_shift, 1 << vm->page_shift); } } /* Identity map a region with 1GiB Pages. */ -void tdp_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t addr, uint64_t size) +void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size) { - __tdp_map(vmx, vm, addr, addr, size, PG_LEVEL_1G); + __tdp_map(vm, addr, addr, size, PG_LEVEL_1G); } bool kvm_cpu_has_ept(void) diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c index 5c8cf8ac42a2..370f8d3117c2 100644 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c @@ -80,7 +80,6 @@ void l1_guest_code(struct vmx_pages *vmx) static void test_vmx_dirty_log(bool enable_ept) { vm_vaddr_t vmx_pages_gva = 0; - struct vmx_pages *vmx; unsigned long *bmap; uint64_t *host_test_mem; @@ -96,7 +95,7 @@ static void test_vmx_dirty_log(bool enable_ept) if (enable_ept) vm_enable_ept(vm); - vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_alloc_vmx(vm, &vmx_pages_gva); vcpu_args_set(vcpu, 1, vmx_pages_gva); /* Add an extra memory slot for testing dirty logging */ @@ -120,9 +119,9 @@ static void test_vmx_dirty_log(bool enable_ept) * GPAs as the EPT enabled case. */ if (enable_ept) { - tdp_identity_map_default_memslots(vmx, vm); - tdp_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); - tdp_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); + tdp_identity_map_default_memslots(vm); + tdp_map(vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); + tdp_map(vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); } bmap = bitmap_zalloc(TEST_MEM_PAGES); From 8296b16c0a2ba018c3235db5325d679e603899d6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 15:01:41 -0800 Subject: [PATCH 055/267] KVM: selftests: Add a stage-2 MMU instance to kvm_vm Add a stage-2 MMU instance so that architectures that support nested virtualization (more specifically, nested stage-2 page tables) can create and track stage-2 page tables for running L2 guests. Plumb the structure into common code to avoid cyclical dependencies, and to provide some line of sight to having common APIs for creating stage-2 mappings. As a bonus, putting the member in common code justifies using stage2_mmu instead of tdp_mmu for x86. Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-13-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/kvm_util.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index c1497515fa6a..371d55e0366e 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -116,7 +116,12 @@ struct kvm_vm { uint32_t dirty_ring_size; uint64_t gpa_tag_mask; + /* + * "mmu" is the guest's stage-1, with a short name because the vast + * majority of tests only care about the stage-1 MMU. 
+ */ struct kvm_mmu mmu; + struct kvm_mmu stage2_mmu; struct kvm_vm_arch arch; From 508d1cc3ca0ac428b1d5d614519bc497868c2e9f Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:42 -0800 Subject: [PATCH 056/267] KVM: selftests: Reuse virt mapping functions for nested EPTs Rework tdp_map() and friends to use __virt_pg_map() and drop the custom EPT code in __tdp_pg_map() and tdp_create_pte(). The EPT code and __virt_pg_map() are practically identical, the main differences are: - EPT uses the EPT struct overlay instead of the PTE masks. - EPT always assumes 4-level EPTs. To reuse __virt_pg_map(), extend the PTE masks to work with EPT's RWX and X-only capabilities, and provide a tdp_mmu_init() API so that EPT can pass in the EPT PTE masks along with the root page level (which is currently hardcoded to '4'). Don't reuse KVM's insane overloading of the USER bit for EPT_R as there's no reason to multiplex bits in the selftests, e.g. selftests aren't trying to shadow guest PTEs and thus don't care about funnelling protections into a common permissions check. Another benefit of reusing the code is having separate handling for upper-level PTEs vs 4K PTEs, which avoids some quirks like setting the large bit on a 4K PTE in the EPTs. For all intents and purposes, no functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Yosry Ahmed Co-developed-by: Sean Christopherson Link: https://patch.msgid.link/20251230230150.4150236-14-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/include/x86/kvm_util_arch.h | 4 +- .../selftests/kvm/include/x86/processor.h | 16 ++- .../testing/selftests/kvm/lib/x86/processor.c | 21 +++- tools/testing/selftests/kvm/lib/x86/vmx.c | 119 +++--------------- 4 files changed, 52 insertions(+), 108 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h index 05a1fc1780f2..1cf84b8212c6 100644 --- a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h @@ -14,6 +14,8 @@ struct pte_masks { uint64_t present; uint64_t writable; uint64_t user; + uint64_t readable; + uint64_t executable; uint64_t accessed; uint64_t dirty; uint64_t huge; @@ -37,8 +39,6 @@ struct kvm_vm_arch { uint64_t s_bit; int sev_fd; bool is_pt_protected; - - struct kvm_mmu *tdp_mmu; }; static inline bool __vm_arch_has_protected_memory(struct kvm_vm_arch *arch) diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 0164ef090787..e17cbbe71b8f 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1444,6 +1444,8 @@ enum pg_level { #define PTE_PRESENT_MASK(mmu) ((mmu)->arch.pte_masks.present) #define PTE_WRITABLE_MASK(mmu) ((mmu)->arch.pte_masks.writable) #define PTE_USER_MASK(mmu) ((mmu)->arch.pte_masks.user) +#define PTE_READABLE_MASK(mmu) ((mmu)->arch.pte_masks.readable) +#define PTE_EXECUTABLE_MASK(mmu) ((mmu)->arch.pte_masks.executable) #define PTE_ACCESSED_MASK(mmu) ((mmu)->arch.pte_masks.accessed) #define PTE_DIRTY_MASK(mmu) ((mmu)->arch.pte_masks.dirty) #define PTE_HUGE_MASK(mmu) ((mmu)->arch.pte_masks.huge) @@ -1451,13 +1453,23 @@ enum pg_level { #define PTE_C_BIT_MASK(mmu) ((mmu)->arch.pte_masks.c) #define PTE_S_BIT_MASK(mmu) ((mmu)->arch.pte_masks.s) -#define is_present_pte(mmu, pte) (!!(*(pte) & PTE_PRESENT_MASK(mmu))) +/* + * For PTEs without a PRESENT bit (i.e. 
EPT entries), treat the PTE as present + * if it's executable or readable, as EPT supports execute-only PTEs, but not + * write-only PTEs. + */ +#define is_present_pte(mmu, pte) \ + (PTE_PRESENT_MASK(mmu) ? \ + !!(*(pte) & PTE_PRESENT_MASK(mmu)) : \ + !!(*(pte) & (PTE_READABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu)))) +#define is_executable_pte(mmu, pte) \ + ((*(pte) & (PTE_EXECUTABLE_MASK(mmu) | PTE_NX_MASK(mmu))) == PTE_EXECUTABLE_MASK(mmu)) #define is_writable_pte(mmu, pte) (!!(*(pte) & PTE_WRITABLE_MASK(mmu))) #define is_user_pte(mmu, pte) (!!(*(pte) & PTE_USER_MASK(mmu))) #define is_accessed_pte(mmu, pte) (!!(*(pte) & PTE_ACCESSED_MASK(mmu))) #define is_dirty_pte(mmu, pte) (!!(*(pte) & PTE_DIRTY_MASK(mmu))) #define is_huge_pte(mmu, pte) (!!(*(pte) & PTE_HUGE_MASK(mmu))) -#define is_nx_pte(mmu, pte) (!!(*(pte) & PTE_NX_MASK(mmu))) +#define is_nx_pte(mmu, pte) (!is_executable_pte(mmu, pte)) void tdp_mmu_init(struct kvm_vm *vm, int pgtable_levels, struct pte_masks *pte_masks); diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index 8a9298a72897..41316cac94e0 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -165,6 +165,10 @@ static void virt_mmu_init(struct kvm_vm *vm, struct kvm_mmu *mmu, mmu->pgd_created = true; mmu->arch.pte_masks = *pte_masks; } + + TEST_ASSERT(mmu->pgtable_levels == 4 || mmu->pgtable_levels == 5, + "Selftests MMU only supports 4-level and 5-level paging, not %u-level paging", + mmu->pgtable_levels); } void virt_arch_pgd_alloc(struct kvm_vm *vm) @@ -180,6 +184,7 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) .dirty = BIT_ULL(6), .huge = BIT_ULL(7), .nx = BIT_ULL(63), + .executable = 0, .c = vm->arch.c_bit, .s = vm->arch.s_bit, }; @@ -190,10 +195,10 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm) void tdp_mmu_init(struct kvm_vm *vm, int pgtable_levels, struct pte_masks *pte_masks) { - TEST_ASSERT(!vm->arch.tdp_mmu, "TDP MMU already initialized"); + TEST_ASSERT(!vm->stage2_mmu.pgtable_levels, "TDP MMU already initialized"); - vm->arch.tdp_mmu = calloc(1, sizeof(*vm->arch.tdp_mmu)); - virt_mmu_init(vm, vm->arch.tdp_mmu, pte_masks); + vm->stage2_mmu.pgtable_levels = pgtable_levels; + virt_mmu_init(vm, &vm->stage2_mmu, pte_masks); } static void *virt_get_pte(struct kvm_vm *vm, struct kvm_mmu *mmu, @@ -223,7 +228,8 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, paddr = vm_untag_gpa(vm, paddr); if (!is_present_pte(mmu, pte)) { - *pte = PTE_PRESENT_MASK(mmu) | PTE_WRITABLE_MASK(mmu); + *pte = PTE_PRESENT_MASK(mmu) | PTE_READABLE_MASK(mmu) | + PTE_WRITABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu); if (current_level == target_level) *pte |= PTE_HUGE_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK); else @@ -269,6 +275,9 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, TEST_ASSERT(vm_untag_gpa(vm, paddr) == paddr, "Unexpected bits in paddr: %lx", paddr); + TEST_ASSERT(!PTE_EXECUTABLE_MASK(mmu) || !PTE_NX_MASK(mmu), + "X and NX bit masks cannot be used simultaneously"); + /* * Allocate upper level page tables, if not already present. Return * early if a hugepage was created. 
@@ -286,7 +295,9 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, pte = virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K); TEST_ASSERT(!is_present_pte(mmu, pte), "PTE already present for 4k page at vaddr: 0x%lx", vaddr); - *pte = PTE_PRESENT_MASK(mmu) | PTE_WRITABLE_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK); + *pte = PTE_PRESENT_MASK(mmu) | PTE_READABLE_MASK(mmu) | + PTE_WRITABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu) | + (paddr & PHYSICAL_PAGE_MASK); /* * Neither SEV nor TDX supports shared page tables, so only the final diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index ea1c09f9e8ab..e3737b3d9120 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -25,21 +25,6 @@ bool enable_evmcs; struct hv_enlightened_vmcs *current_evmcs; struct hv_vp_assist_page *current_vp_assist; -struct eptPageTableEntry { - uint64_t readable:1; - uint64_t writable:1; - uint64_t executable:1; - uint64_t memory_type:3; - uint64_t ignore_pat:1; - uint64_t page_size:1; - uint64_t accessed:1; - uint64_t dirty:1; - uint64_t ignored_11_10:2; - uint64_t address:40; - uint64_t ignored_62_52:11; - uint64_t suppress_ve:1; -}; - int vcpu_enable_evmcs(struct kvm_vcpu *vcpu) { uint16_t evmcs_ver; @@ -58,13 +43,24 @@ int vcpu_enable_evmcs(struct kvm_vcpu *vcpu) void vm_enable_ept(struct kvm_vm *vm) { + struct pte_masks pte_masks; + TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT"); - if (vm->arch.tdp_mmu) - return; - - /* TODO: Drop eptPageTableEntry in favor of PTE masks. */ - struct pte_masks pte_masks = (struct pte_masks) { + /* + * EPTs do not have 'present' or 'user' bits, instead bit 0 is the + * 'readable' bit. + */ + pte_masks = (struct pte_masks) { + .present = 0, + .user = 0, + .readable = BIT_ULL(0), + .writable = BIT_ULL(1), + .executable = BIT_ULL(2), + .huge = BIT_ULL(7), + .accessed = BIT_ULL(8), + .dirty = BIT_ULL(9), + .nx = 0, }; /* TODO: Add support for 5-level EPT. */ @@ -120,8 +116,8 @@ vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva) vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite); memset(vmx->vmwrite_hva, 0, getpagesize()); - if (vm->arch.tdp_mmu) - vmx->eptp_gpa = vm->arch.tdp_mmu->pgd; + if (vm->stage2_mmu.pgd_created) + vmx->eptp_gpa = vm->stage2_mmu.pgd; *p_vmx_gva = vmx_gva; return vmx; @@ -377,82 +373,6 @@ void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp) init_vmcs_guest_state(guest_rip, guest_rsp); } -static void tdp_create_pte(struct kvm_vm *vm, - struct eptPageTableEntry *pte, - uint64_t nested_paddr, - uint64_t paddr, - int current_level, - int target_level) -{ - if (!pte->readable) { - pte->writable = true; - pte->readable = true; - pte->executable = true; - pte->page_size = (current_level == target_level); - if (pte->page_size) - pte->address = paddr >> vm->page_shift; - else - pte->address = vm_alloc_page_table(vm) >> vm->page_shift; - } else { - /* - * Entry already present. Assert that the caller doesn't want - * a hugepage at this level, and that there isn't a hugepage at - * this level. 
- */ - TEST_ASSERT(current_level != target_level, - "Cannot create hugepage at level: %u, nested_paddr: 0x%lx", - current_level, nested_paddr); - TEST_ASSERT(!pte->page_size, - "Cannot create page table at level: %u, nested_paddr: 0x%lx", - current_level, nested_paddr); - } -} - - -void __tdp_pg_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, - int target_level) -{ - const uint64_t page_size = PG_LEVEL_SIZE(target_level); - void *eptp_hva = addr_gpa2hva(vm, vm->arch.tdp_mmu->pgd); - struct eptPageTableEntry *pt = eptp_hva, *pte; - uint16_t index; - - TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, - "Unknown or unsupported guest mode: 0x%x", vm->mode); - - TEST_ASSERT((nested_paddr >> 48) == 0, - "Nested physical address 0x%lx is > 48-bits and requires 5-level EPT", - nested_paddr); - TEST_ASSERT((nested_paddr % page_size) == 0, - "Nested physical address not on page boundary,\n" - " nested_paddr: 0x%lx page_size: 0x%lx", - nested_paddr, page_size); - TEST_ASSERT((nested_paddr >> vm->page_shift) <= vm->max_gfn, - "Physical address beyond beyond maximum supported,\n" - " nested_paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", - paddr, vm->max_gfn, vm->page_size); - TEST_ASSERT((paddr % page_size) == 0, - "Physical address not on page boundary,\n" - " paddr: 0x%lx page_size: 0x%lx", - paddr, page_size); - TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, - "Physical address beyond beyond maximum supported,\n" - " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", - paddr, vm->max_gfn, vm->page_size); - - for (int level = PG_LEVEL_512G; level >= PG_LEVEL_4K; level--) { - index = (nested_paddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; - pte = &pt[index]; - - tdp_create_pte(vm, pte, nested_paddr, paddr, level, target_level); - - if (pte->page_size) - break; - - pt = addr_gpa2hva(vm, pte->address * vm->page_size); - } -} - /* * Map a range of EPT guest physical addresses to the VM's physical address * @@ -473,6 +393,7 @@ void __tdp_pg_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size, int level) { + struct kvm_mmu *mmu = &vm->stage2_mmu; size_t page_size = PG_LEVEL_SIZE(level); size_t npages = size / page_size; @@ -480,7 +401,7 @@ void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); while (npages--) { - __tdp_pg_map(vm, nested_paddr, paddr, level); + __virt_pg_map(vm, mmu, nested_paddr, paddr, level); nested_paddr += page_size; paddr += page_size; } From 07676c04bd753f32a9fe2f247b0ba2d213dd7a99 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 15:01:43 -0800 Subject: [PATCH 057/267] KVM: selftests: Move TDP mapping functions outside of vmx.c Now that the functions are no longer VMX-specific, move them to processor.c. Do a minor comment tweak replacing 'EPT' with 'TDP'. No functional change intended. 
Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-15-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/include/x86/processor.h | 4 ++ tools/testing/selftests/kvm/include/x86/vmx.h | 3 - .../testing/selftests/kvm/lib/x86/processor.c | 53 ++++++++++++++ tools/testing/selftests/kvm/lib/x86/vmx.c | 71 ------------------- 4 files changed, 57 insertions(+), 74 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index e17cbbe71b8f..461cf155b96e 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1479,6 +1479,10 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, uint64_t nr_bytes, int level); +void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size); +void tdp_identity_map_default_memslots(struct kvm_vm *vm); +void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size); + /* * Basic CPU control in CR0 */ diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h index 4dd4c2094ee6..92b918700d24 100644 --- a/tools/testing/selftests/kvm/include/x86/vmx.h +++ b/tools/testing/selftests/kvm/include/x86/vmx.h @@ -557,9 +557,6 @@ bool load_vmcs(struct vmx_pages *vmx); bool ept_1g_pages_supported(void); -void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size); -void tdp_identity_map_default_memslots(struct kvm_vm *vm); -void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size); bool kvm_cpu_has_ept(void); void vm_enable_ept(struct kvm_vm *vm); void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index 41316cac94e0..29e7d172f945 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -472,6 +472,59 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) } } +void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, + uint64_t size, int level) +{ + size_t page_size = PG_LEVEL_SIZE(level); + size_t npages = size / page_size; + + TEST_ASSERT(nested_paddr + size > nested_paddr, "Vaddr overflow"); + TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); + + while (npages--) { + __virt_pg_map(vm, &vm->stage2_mmu, nested_paddr, paddr, level); + nested_paddr += page_size; + paddr += page_size; + } +} + +void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, + uint64_t size) +{ + __tdp_map(vm, nested_paddr, paddr, size, PG_LEVEL_4K); +} + +/* Prepare an identity extended page table that maps all the + * physical pages in VM. 
+ */ +void tdp_identity_map_default_memslots(struct kvm_vm *vm) +{ + uint32_t s, memslot = 0; + sparsebit_idx_t i, last; + struct userspace_mem_region *region = memslot2region(vm, memslot); + + /* Only memslot 0 is mapped here, ensure it's the only one being used */ + for (s = 0; s < NR_MEM_REGIONS; s++) + TEST_ASSERT_EQ(vm->memslots[s], 0); + + i = (region->region.guest_phys_addr >> vm->page_shift) - 1; + last = i + (region->region.memory_size >> vm->page_shift); + for (;;) { + i = sparsebit_next_clear(region->unused_phy_pages, i); + if (i > last) + break; + + tdp_map(vm, (uint64_t)i << vm->page_shift, + (uint64_t)i << vm->page_shift, 1 << vm->page_shift); + } +} + +/* Identity map a region with 1GiB Pages. */ +void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size) +{ + __tdp_map(vm, addr, addr, size, PG_LEVEL_1G); +} + /* * Set Unusable Segment * diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index e3737b3d9120..448a63457467 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -373,77 +373,6 @@ void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp) init_vmcs_guest_state(guest_rip, guest_rsp); } -/* - * Map a range of EPT guest physical addresses to the VM's physical address - * - * Input Args: - * vm - Virtual Machine - * nested_paddr - Nested guest physical address to map - * paddr - VM Physical Address - * size - The size of the range to map - * level - The level at which to map the range - * - * Output Args: None - * - * Return: None - * - * Within the VM given by vm, creates a nested guest translation for the - * page range starting at nested_paddr to the page range starting at paddr. - */ -void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, - uint64_t size, int level) -{ - struct kvm_mmu *mmu = &vm->stage2_mmu; - size_t page_size = PG_LEVEL_SIZE(level); - size_t npages = size / page_size; - - TEST_ASSERT(nested_paddr + size > nested_paddr, "Vaddr overflow"); - TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); - - while (npages--) { - __virt_pg_map(vm, mmu, nested_paddr, paddr, level); - nested_paddr += page_size; - paddr += page_size; - } -} - -void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, - uint64_t size) -{ - __tdp_map(vm, nested_paddr, paddr, size, PG_LEVEL_4K); -} - -/* Prepare an identity extended page table that maps all the - * physical pages in VM. - */ -void tdp_identity_map_default_memslots(struct kvm_vm *vm) -{ - uint32_t s, memslot = 0; - sparsebit_idx_t i, last; - struct userspace_mem_region *region = memslot2region(vm, memslot); - - /* Only memslot 0 is mapped here, ensure it's the only one being used */ - for (s = 0; s < NR_MEM_REGIONS; s++) - TEST_ASSERT_EQ(vm->memslots[s], 0); - - i = (region->region.guest_phys_addr >> vm->page_shift) - 1; - last = i + (region->region.memory_size >> vm->page_shift); - for (;;) { - i = sparsebit_next_clear(region->unused_phy_pages, i); - if (i > last) - break; - - tdp_map(vm, (uint64_t)i << vm->page_shift, - (uint64_t)i << vm->page_shift, 1 << vm->page_shift); - } -} - -/* Identity map a region with 1GiB Pages. 
*/ -void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size) -{ - __tdp_map(vm, addr, addr, size, PG_LEVEL_1G); -} - bool kvm_cpu_has_ept(void) { uint64_t ctrl; From 9cb1944f6bf09ecebcc7609f35178b85aa26f165 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:44 -0800 Subject: [PATCH 058/267] KVM: selftests: Allow kvm_cpu_has_ept() to be called on AMD CPUs In preparation for generalizing the nested dirty logging test, checking if either EPT or NPT is enabled will be needed. To avoid needing to gate the kvm_cpu_has_ept() call by the CPU type, make sure the function returns false if VMX is not available instead of trying to read VMX-only MSRs. No functional change intended. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-16-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86/vmx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index 448a63457467..c87b340362a9 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -377,6 +377,9 @@ bool kvm_cpu_has_ept(void) { uint64_t ctrl; + if (!kvm_cpu_has(X86_FEATURE_VMX)) + return false; + ctrl = kvm_get_feature_msr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS) >> 32; if (!(ctrl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) return false; From 753c0d5a507b939d6efc60c7b437d5330880cce3 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:45 -0800 Subject: [PATCH 059/267] KVM: selftests: Add support for nested NPTs Implement nCR3 and NPT initialization functions, similar to the EPT equivalents, and create common TDP helpers for enablement checking and initialization. Enable NPT for nested guests by default if the TDP MMU was initialized, similar to VMX. Reuse the PTE masks from the main MMU in the NPT MMU, except for the C and S bits related to confidential VMs. 
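[Editor's illustration, not part of this patch: a rough usage sketch assuming the selftest framework headers and the helpers added in this series; "l1_guest_code" stands in for whatever L1 guest function a given test defines. With the generic helpers, a test that wants nested stage-2 paging looks the same on both vendors, and the EPTP/nCR3 wiring happens when the VMX/SVM state is allocated:

	struct kvm_vcpu *vcpu;
	vm_vaddr_t nested_gva;
	struct kvm_vm *vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);

	if (kvm_cpu_has_tdp())
		vm_enable_tdp(vm);	/* selects EPT on Intel, NPT on AMD */

	/* The stage-2 root (EPTP/nCR3) is picked up here if it was created. */
	if (kvm_cpu_has(X86_FEATURE_VMX))
		vcpu_alloc_vmx(vm, &nested_gva);
	else
		vcpu_alloc_svm(vm, &nested_gva);

	if (kvm_cpu_has_tdp())
		tdp_identity_map_default_memslots(vm);

	vcpu_args_set(vcpu, 1, nested_gva);
]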
Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-17-seanjc@google.com [sean: apply Yosry's fixup for ncr3_gpa] Signed-off-by: Sean Christopherson --- .../selftests/kvm/include/x86/processor.h | 2 ++ .../selftests/kvm/include/x86/svm_util.h | 9 +++++++ .../testing/selftests/kvm/lib/x86/memstress.c | 4 ++-- .../testing/selftests/kvm/lib/x86/processor.c | 15 ++++++++++++ tools/testing/selftests/kvm/lib/x86/svm.c | 24 +++++++++++++++++++ .../selftests/kvm/x86/vmx_dirty_log_test.c | 4 ++-- 6 files changed, 54 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 461cf155b96e..115bec5eb1eb 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1479,6 +1479,8 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, uint64_t nr_bytes, int level); +void vm_enable_tdp(struct kvm_vm *vm); +bool kvm_cpu_has_tdp(void); void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size); void tdp_identity_map_default_memslots(struct kvm_vm *vm); void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size); diff --git a/tools/testing/selftests/kvm/include/x86/svm_util.h b/tools/testing/selftests/kvm/include/x86/svm_util.h index b74c6dcddcbd..5d7c42534bc4 100644 --- a/tools/testing/selftests/kvm/include/x86/svm_util.h +++ b/tools/testing/selftests/kvm/include/x86/svm_util.h @@ -27,6 +27,9 @@ struct svm_test_data { void *msr; /* gva */ void *msr_hva; uint64_t msr_gpa; + + /* NPT */ + uint64_t ncr3_gpa; }; static inline void vmmcall(void) @@ -57,6 +60,12 @@ struct svm_test_data *vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva); void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp); void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa); +static inline bool kvm_cpu_has_npt(void) +{ + return kvm_cpu_has(X86_FEATURE_NPT); +} +void vm_enable_npt(struct kvm_vm *vm); + int open_sev_dev_path_or_exit(void); #endif /* SELFTEST_KVM_SVM_UTILS_H */ diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 3319cb57a78d..407abfc34909 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -82,9 +82,9 @@ void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc int vcpu_id; TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - TEST_REQUIRE(kvm_cpu_has_ept()); + TEST_REQUIRE(kvm_cpu_has_tdp()); - vm_enable_ept(vm); + vm_enable_tdp(vm); for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { vcpu_alloc_vmx(vm, &vmx_gva); diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index 29e7d172f945..a3a4c9a4cbcb 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -8,7 +8,9 @@ #include "kvm_util.h" #include "pmu.h" #include "processor.h" +#include "svm_util.h" #include "sev.h" +#include "vmx.h" #ifndef NUM_INTERRUPTS #define NUM_INTERRUPTS 256 @@ -472,6 +474,19 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) } } +void vm_enable_tdp(struct kvm_vm *vm) +{ + if (kvm_cpu_has(X86_FEATURE_VMX)) + vm_enable_ept(vm); + else + vm_enable_npt(vm); +} + +bool kvm_cpu_has_tdp(void) +{ + return kvm_cpu_has_ept() || 
kvm_cpu_has_npt(); +} + void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size, int level) { diff --git a/tools/testing/selftests/kvm/lib/x86/svm.c b/tools/testing/selftests/kvm/lib/x86/svm.c index d239c2097391..a25a3471f5f6 100644 --- a/tools/testing/selftests/kvm/lib/x86/svm.c +++ b/tools/testing/selftests/kvm/lib/x86/svm.c @@ -46,6 +46,9 @@ vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva) svm->msr_gpa = addr_gva2gpa(vm, (uintptr_t)svm->msr); memset(svm->msr_hva, 0, getpagesize()); + if (vm->stage2_mmu.pgd_created) + svm->ncr3_gpa = vm->stage2_mmu.pgd; + *p_svm_gva = svm_gva; return svm; } @@ -59,6 +62,22 @@ static void vmcb_set_seg(struct vmcb_seg *seg, u16 selector, seg->base = base; } +void vm_enable_npt(struct kvm_vm *vm) +{ + struct pte_masks pte_masks; + + TEST_ASSERT(kvm_cpu_has_npt(), "KVM doesn't supported nested NPT"); + + /* + * NPTs use the same PTE format, but deliberately drop the C-bit as the + * per-VM shared vs. private information is only meant for stage-1. + */ + pte_masks = vm->mmu.arch.pte_masks; + pte_masks.c = 0; + + tdp_mmu_init(vm, vm->mmu.pgtable_levels, &pte_masks); +} + void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp) { struct vmcb *vmcb = svm->vmcb; @@ -102,6 +121,11 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r vmcb->save.rip = (u64)guest_rip; vmcb->save.rsp = (u64)guest_rsp; guest_regs.rdi = (u64)svm; + + if (svm->ncr3_gpa) { + ctrl->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; + ctrl->nested_cr3 = svm->ncr3_gpa; + } } /* diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c index 370f8d3117c2..032ab8bf60a4 100644 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c @@ -93,7 +93,7 @@ static void test_vmx_dirty_log(bool enable_ept) /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); if (enable_ept) - vm_enable_ept(vm); + vm_enable_tdp(vm); vcpu_alloc_vmx(vm, &vmx_pages_gva); vcpu_args_set(vcpu, 1, vmx_pages_gva); @@ -170,7 +170,7 @@ int main(int argc, char *argv[]) test_vmx_dirty_log(/*enable_ept=*/false); - if (kvm_cpu_has_ept()) + if (kvm_cpu_has_tdp()) test_vmx_dirty_log(/*enable_ept=*/true); return 0; From 251e4849a79b258fd3e889ff095ba083ce301c13 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:46 -0800 Subject: [PATCH 060/267] KVM: selftests: Set the user bit on nested NPT PTEs According to the APM, NPT walks are treated as user accesses. In preparation for supporting NPT mappings, set the 'user' bit on NPTs by adding a mask of bits to always be set on PTEs in kvm_mmu. 
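[Editor's illustration, not part of this patch: the bit values below are the standard x86 long-mode PTE bits and stand in for the selftests' pte_masks fields. With pte_masks.always_set equal to the 'user' bit, every leaf PTE that __virt_pg_map() builds for the NPT comes out with U/S=1, so the NPT walk, which hardware treats as a user access, is permitted:

	uint64_t user_bit   = 1ULL << 2;	/* pte_masks.user */
	uint64_t always_set = user_bit;		/* pte_masks.always_set for NPT */
	uint64_t paddr      = 0xc0001000ULL;

	/*
	 * Mirrors the 4K leaf construction in __virt_pg_map(); ~0xfffULL
	 * stands in for PHYSICAL_PAGE_MASK.
	 */
	uint64_t pte = (1ULL << 0) /* present */ | (1ULL << 1) /* writable */ |
		       always_set | (paddr & ~0xfffULL);
]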
Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-18-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/kvm_util_arch.h | 2 ++ tools/testing/selftests/kvm/include/x86/processor.h | 1 + tools/testing/selftests/kvm/lib/x86/processor.c | 5 +++-- tools/testing/selftests/kvm/lib/x86/svm.c | 3 +++ 4 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h index 1cf84b8212c6..be35d26bb320 100644 --- a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h +++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h @@ -22,6 +22,8 @@ struct pte_masks { uint64_t nx; uint64_t c; uint64_t s; + + uint64_t always_set; }; struct kvm_mmu_arch { diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 115bec5eb1eb..995277cae94e 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1452,6 +1452,7 @@ enum pg_level { #define PTE_NX_MASK(mmu) ((mmu)->arch.pte_masks.nx) #define PTE_C_BIT_MASK(mmu) ((mmu)->arch.pte_masks.c) #define PTE_S_BIT_MASK(mmu) ((mmu)->arch.pte_masks.s) +#define PTE_ALWAYS_SET_MASK(mmu) ((mmu)->arch.pte_masks.always_set) /* * For PTEs without a PRESENT bit (i.e. EPT entries), treat the PTE as present diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index a3a4c9a4cbcb..5a3385d48902 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -231,7 +231,8 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, if (!is_present_pte(mmu, pte)) { *pte = PTE_PRESENT_MASK(mmu) | PTE_READABLE_MASK(mmu) | - PTE_WRITABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu); + PTE_WRITABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu) | + PTE_ALWAYS_SET_MASK(mmu); if (current_level == target_level) *pte |= PTE_HUGE_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK); else @@ -299,7 +300,7 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr, "PTE already present for 4k page at vaddr: 0x%lx", vaddr); *pte = PTE_PRESENT_MASK(mmu) | PTE_READABLE_MASK(mmu) | PTE_WRITABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu) | - (paddr & PHYSICAL_PAGE_MASK); + PTE_ALWAYS_SET_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK); /* * Neither SEV nor TDX supports shared page tables, so only the final diff --git a/tools/testing/selftests/kvm/lib/x86/svm.c b/tools/testing/selftests/kvm/lib/x86/svm.c index a25a3471f5f6..2e5c480c9afd 100644 --- a/tools/testing/selftests/kvm/lib/x86/svm.c +++ b/tools/testing/selftests/kvm/lib/x86/svm.c @@ -75,6 +75,9 @@ void vm_enable_npt(struct kvm_vm *vm) pte_masks = vm->mmu.arch.pte_masks; pte_masks.c = 0; + /* NPT walks are treated as user accesses, so set the 'user' bit. */ + pte_masks.always_set = pte_masks.user; + tdp_mmu_init(vm, vm->mmu.pgtable_levels, &pte_masks); } From 6794d916f87e0a6cd51a3d8c2c0e6ffd48fa7a79 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:47 -0800 Subject: [PATCH 061/267] KVM: selftests: Extend vmx_dirty_log_test to cover SVM Generalize the code in vmx_dirty_log_test.c by adding SVM-specific L1 code, doing some renaming (e.g. EPT -> TDP), and having setup code for both SVM and VMX in test_dirty_log(). 
Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-19-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 2 +- ...rty_log_test.c => nested_dirty_log_test.c} | 73 ++++++++++++++----- 2 files changed, 54 insertions(+), 21 deletions(-) rename tools/testing/selftests/kvm/x86/{vmx_dirty_log_test.c => nested_dirty_log_test.c} (71%) diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 3789890421bd..ffbf891b31d3 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -89,6 +89,7 @@ TEST_GEN_PROGS_x86 += x86/kvm_buslock_test TEST_GEN_PROGS_x86 += x86/monitor_mwait_test TEST_GEN_PROGS_x86 += x86/msrs_test TEST_GEN_PROGS_x86 += x86/nested_close_kvm_test +TEST_GEN_PROGS_x86 += x86/nested_dirty_log_test TEST_GEN_PROGS_x86 += x86/nested_emulation_test TEST_GEN_PROGS_x86 += x86/nested_exceptions_test TEST_GEN_PROGS_x86 += x86/nested_invalid_cr3_test @@ -115,7 +116,6 @@ TEST_GEN_PROGS_x86 += x86/ucna_injection_test TEST_GEN_PROGS_x86 += x86/userspace_io_test TEST_GEN_PROGS_x86 += x86/userspace_msr_exit_test TEST_GEN_PROGS_x86 += x86/vmx_apic_access_test -TEST_GEN_PROGS_x86 += x86/vmx_dirty_log_test TEST_GEN_PROGS_x86 += x86/vmx_exception_with_invalid_guest_state TEST_GEN_PROGS_x86 += x86/vmx_msrs_test TEST_GEN_PROGS_x86 += x86/vmx_invalid_nested_guest_state diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c similarity index 71% rename from tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c rename to tools/testing/selftests/kvm/x86/nested_dirty_log_test.c index 032ab8bf60a4..89d2e86a0db9 100644 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c @@ -12,6 +12,7 @@ #include "test_util.h" #include "kvm_util.h" #include "processor.h" +#include "svm_util.h" #include "vmx.h" /* The memory slot index to track dirty pages */ @@ -25,6 +26,8 @@ #define NESTED_TEST_MEM1 0xc0001000 #define NESTED_TEST_MEM2 0xc0002000 +#define L2_GUEST_STACK_SIZE 64 + static void l2_guest_code(u64 *a, u64 *b) { READ_ONCE(*a); @@ -42,20 +45,19 @@ static void l2_guest_code(u64 *a, u64 *b) vmcall(); } -static void l2_guest_code_ept_enabled(void) +static void l2_guest_code_tdp_enabled(void) { l2_guest_code((u64 *)NESTED_TEST_MEM1, (u64 *)NESTED_TEST_MEM2); } -static void l2_guest_code_ept_disabled(void) +static void l2_guest_code_tdp_disabled(void) { - /* Access the same L1 GPAs as l2_guest_code_ept_enabled() */ + /* Access the same L1 GPAs as l2_guest_code_tdp_enabled() */ l2_guest_code((u64 *)GUEST_TEST_MEM, (u64 *)GUEST_TEST_MEM); } -void l1_guest_code(struct vmx_pages *vmx) +void l1_vmx_code(struct vmx_pages *vmx) { -#define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; void *l2_rip; @@ -64,22 +66,49 @@ void l1_guest_code(struct vmx_pages *vmx) GUEST_ASSERT(load_vmcs(vmx)); if (vmx->eptp_gpa) - l2_rip = l2_guest_code_ept_enabled; + l2_rip = l2_guest_code_tdp_enabled; else - l2_rip = l2_guest_code_ept_disabled; + l2_rip = l2_guest_code_tdp_disabled; prepare_vmcs(vmx, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]); GUEST_SYNC(false); GUEST_ASSERT(!vmlaunch()); GUEST_SYNC(false); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_VMCALL); GUEST_DONE(); } -static void test_vmx_dirty_log(bool enable_ept) +static void 
l1_svm_code(struct svm_test_data *svm) { - vm_vaddr_t vmx_pages_gva = 0; + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + void *l2_rip; + + if (svm->ncr3_gpa) + l2_rip = l2_guest_code_tdp_enabled; + else + l2_rip = l2_guest_code_tdp_disabled; + + generic_svm_setup(svm, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_SYNC(false); + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_SYNC(false); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL); + GUEST_DONE(); +} + +static void l1_guest_code(void *data) +{ + if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(data); + else + l1_svm_code(data); +} + +static void test_dirty_log(bool nested_tdp) +{ + vm_vaddr_t nested_gva = 0; unsigned long *bmap; uint64_t *host_test_mem; @@ -88,15 +117,19 @@ static void test_vmx_dirty_log(bool enable_ept) struct ucall uc; bool done = false; - pr_info("Nested EPT: %s\n", enable_ept ? "enabled" : "disabled"); + pr_info("Nested TDP: %s\n", nested_tdp ? "enabled" : "disabled"); /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - if (enable_ept) + if (nested_tdp) vm_enable_tdp(vm); - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &nested_gva); + else + vcpu_alloc_svm(vm, &nested_gva); + + vcpu_args_set(vcpu, 1, nested_gva); /* Add an extra memory slot for testing dirty logging */ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, @@ -115,10 +148,10 @@ static void test_vmx_dirty_log(bool enable_ept) * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to * 0xc0000000. * - * When EPT is disabled, the L2 guest code will still access the same L1 - * GPAs as the EPT enabled case. + * When TDP is disabled, the L2 guest code will still access the same L1 + * GPAs as the TDP enabled case. */ - if (enable_ept) { + if (nested_tdp) { tdp_identity_map_default_memslots(vm); tdp_map(vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); tdp_map(vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); @@ -166,12 +199,12 @@ static void test_vmx_dirty_log(bool enable_ept) int main(int argc, char *argv[]) { - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || kvm_cpu_has(X86_FEATURE_SVM)); - test_vmx_dirty_log(/*enable_ept=*/false); + test_dirty_log(/*nested_tdp=*/false); if (kvm_cpu_has_tdp()) - test_vmx_dirty_log(/*enable_ept=*/true); + test_dirty_log(/*nested_tdp=*/true); return 0; } From 59eef1a47b8c264d09ee84c909a1da60b4e70bd7 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 30 Dec 2025 15:01:48 -0800 Subject: [PATCH 062/267] KVM: selftests: Extend memstress to run on nested SVM Add L1 SVM code and generalize the setup code to work for both VMX and SVM. This allows running 'dirty_log_perf_test -n' on AMD CPUs. 
Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-20-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/lib/x86/memstress.c | 42 +++++++++++++++---- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 407abfc34909..86f4c5e4c430 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -13,6 +13,7 @@ #include "kvm_util.h" #include "memstress.h" #include "processor.h" +#include "svm_util.h" #include "vmx.h" void memstress_l2_guest_code(uint64_t vcpu_id) @@ -29,9 +30,10 @@ __asm__( " ud2;" ); -static void memstress_l1_guest_code(struct vmx_pages *vmx, uint64_t vcpu_id) -{ #define L2_GUEST_STACK_SIZE 64 + +static void l1_vmx_code(struct vmx_pages *vmx, uint64_t vcpu_id) +{ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; unsigned long *rsp; @@ -45,10 +47,34 @@ static void memstress_l1_guest_code(struct vmx_pages *vmx, uint64_t vcpu_id) prepare_vmcs(vmx, memstress_l2_guest_entry, rsp); GUEST_ASSERT(!vmlaunch()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_VMCALL); GUEST_DONE(); } +static void l1_svm_code(struct svm_test_data *svm, uint64_t vcpu_id) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + unsigned long *rsp; + + + rsp = &l2_guest_stack[L2_GUEST_STACK_SIZE - 1]; + *rsp = vcpu_id; + generic_svm_setup(svm, memstress_l2_guest_entry, rsp); + + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL); + GUEST_DONE(); +} + + +static void memstress_l1_guest_code(void *data, uint64_t vcpu_id) +{ + if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(data, vcpu_id); + else + l1_svm_code(data, vcpu_id); +} + uint64_t memstress_nested_pages(int nr_vcpus) { /* @@ -78,15 +104,17 @@ static void memstress_setup_ept_mappings(struct kvm_vm *vm) void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]) { struct kvm_regs regs; - vm_vaddr_t vmx_gva; + vm_vaddr_t nested_gva; int vcpu_id; - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); TEST_REQUIRE(kvm_cpu_has_tdp()); vm_enable_tdp(vm); for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { - vcpu_alloc_vmx(vm, &vmx_gva); + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &nested_gva); + else + vcpu_alloc_svm(vm, &nested_gva); /* The EPTs are shared across vCPUs, setup the mappings once */ if (vcpu_id == 0) @@ -99,6 +127,6 @@ void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc vcpu_regs_get(vcpus[vcpu_id], ®s); regs.rip = (unsigned long) memstress_l1_guest_code; vcpu_regs_set(vcpus[vcpu_id], ®s); - vcpu_args_set(vcpus[vcpu_id], 2, vmx_gva, vcpu_id); + vcpu_args_set(vcpus[vcpu_id], 2, nested_gva, vcpu_id); } } From e353850499c717f1f984f7c208f49a8618beff2f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 15:01:49 -0800 Subject: [PATCH 063/267] KVM: selftests: Rename vm_get_page_table_entry() to vm_get_pte() Shorten the API to get a PTE as the "PTE" acronym is ubiquitous, and the "page table entry" makes it unnecessarily difficult to quickly understand what callers are doing. No functional change intended. 
Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230230150.4150236-21-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86/processor.h | 2 +- tools/testing/selftests/kvm/lib/x86/processor.c | 2 +- tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c | 2 +- .../selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c | 4 +--- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 995277cae94e..8f130e7d7048 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1359,7 +1359,7 @@ static inline bool kvm_is_ignore_msrs(void) return get_kvm_param_bool("ignore_msrs"); } -uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr); +uint64_t *vm_get_pte(struct kvm_vm *vm, uint64_t vaddr); uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index 5a3385d48902..ab869a98bbdc 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -390,7 +390,7 @@ static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, return virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K); } -uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr) +uint64_t *vm_get_pte(struct kvm_vm *vm, uint64_t vaddr) { int level = PG_LEVEL_4K; diff --git a/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c b/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c index a3b7ce155981..c542cc4762b1 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c +++ b/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c @@ -619,7 +619,7 @@ int main(int argc, char *argv[]) */ gva = vm_vaddr_unused_gap(vm, NTEST_PAGES * PAGE_SIZE, KVM_UTIL_MIN_VADDR); for (i = 0; i < NTEST_PAGES; i++) { - pte = vm_get_page_table_entry(vm, data->test_pages + i * PAGE_SIZE); + pte = vm_get_pte(vm, data->test_pages + i * PAGE_SIZE); gpa = addr_hva2gpa(vm, pte); virt_pg_map(vm, gva + PAGE_SIZE * i, gpa & PAGE_MASK); data->test_pages_pte[i] = gva + (gpa & ~PAGE_MASK); diff --git a/tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c b/tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c index fabeeaddfb3a..0e8aec568010 100644 --- a/tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c +++ b/tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c @@ -47,7 +47,6 @@ int main(int argc, char *argv[]) struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct ucall uc; - uint64_t *pte; uint64_t *hva; uint64_t gpa; int rc; @@ -73,8 +72,7 @@ int main(int argc, char *argv[]) hva = addr_gpa2hva(vm, MEM_REGION_GPA); memset(hva, 0, PAGE_SIZE); - pte = vm_get_page_table_entry(vm, MEM_REGION_GVA); - *pte |= BIT_ULL(MAXPHYADDR); + *vm_get_pte(vm, MEM_REGION_GVA) |= BIT_ULL(MAXPHYADDR); vcpu_run(vcpu); From 57a7b47ab30f6201769988392dc8b1c0822a3369 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Fri, 12 Dec 2025 21:50:51 +0800 Subject: [PATCH 064/267] KVM: x86: Don't read guest CR3 when doing async pf while the MMU is direct Don't read guest CR3 in kvm_arch_setup_async_pf() if the MMU is direct and use INVALID_GPA instead. 
When KVM tries to perform the host-only async page fault for the shared memory of TDX guests, the following WARNING is triggered: WARNING: CPU: 1 PID: 90922 at arch/x86/kvm/vmx/main.c:483 vt_cache_reg+0x16/0x20 Call Trace: __kvm_mmu_faultin_pfn kvm_mmu_faultin_pfn kvm_tdp_page_fault kvm_mmu_do_page_fault kvm_mmu_page_fault tdx_handle_ept_violation This WARNING is triggered when calling kvm_mmu_get_guest_pgd() to cache the guest CR3 in kvm_arch_setup_async_pf() for later use in kvm_arch_async_page_ready() to determine if it's possible to fix the page fault in the current vCPU context to save one VM exit. However, when guest state is protected, KVM cannot read the guest CR3. Since protected guests aren't compatible with shadow paging, i.e., they must use a direct MMU, avoid calling kvm_mmu_get_guest_pgd() to read guest CR3 when the MMU is direct and use INVALID_GPA instead. Note that for protected guests mmu->root_role.direct is always true, so that kvm_mmu_get_guest_pgd() in kvm_arch_async_page_ready() won't be reached. Reported-by: Farrah Chen Suggested-by: Sean Christopherson Signed-off-by: Xiaoyao Li Link: https://patch.msgid.link/20251212135051.2155280-1-xiaoyao.li@intel.com [sean: explicitly cast to "unsigned long" to make 32-bit builds happy] Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f17324546900..3911ac9bddfd 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4521,7 +4521,10 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, arch.gfn = fault->gfn; arch.error_code = fault->error_code; arch.direct_map = vcpu->arch.mmu->root_role.direct; - arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu); + if (arch.direct_map) + arch.cr3 = (unsigned long)INVALID_GPA; + else + arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu); return kvm_setup_async_pf(vcpu, fault->addr, kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch); From fc3ba56385d03501eb582e4b86691ba378e556f9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 16 Dec 2025 08:17:54 -0800 Subject: [PATCH 065/267] KVM: nSVM: Remove a user-triggerable WARN on nested_svm_load_cr3() failing Drop the WARN in svm_set_nested_state() on nested_svm_load_cr3() failing as it is trivially easy to trigger from userspace by modifying CPUID after loading CR3. E.g. modifying the state restoration selftest like so: --- tools/testing/selftests/kvm/x86/state_test.c +++ tools/testing/selftests/kvm/x86/state_test.c @@ -280,7 +280,16 @@ int main(int argc, char *argv[]) /* Restore state in a new VM. 
*/ vcpu = vm_recreate_with_one_vcpu(vm); - vcpu_load_state(vcpu, state); + + if (stage == 4) { + state->sregs.cr3 = BIT(44); + vcpu_load_state(vcpu, state); + + vcpu_set_cpuid_property(vcpu, X86_PROPERTY_MAX_PHY_ADDR, 36); + __vcpu_nested_state_set(vcpu, &state->nested); + } else { + vcpu_load_state(vcpu, state); + } /* * Restore XSAVE state in a dummy vCPU, first without doing generates: WARNING: CPU: 30 PID: 938 at arch/x86/kvm/svm/nested.c:1877 svm_set_nested_state+0x34a/0x360 [kvm_amd] Modules linked in: kvm_amd kvm irqbypass [last unloaded: kvm] CPU: 30 UID: 1000 PID: 938 Comm: state_test Tainted: G W 6.18.0-rc7-58e10b63777d-next-vm Tainted: [W]=WARN Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:svm_set_nested_state+0x34a/0x360 [kvm_amd] Call Trace: kvm_arch_vcpu_ioctl+0xf33/0x1700 [kvm] kvm_vcpu_ioctl+0x4e6/0x8f0 [kvm] __x64_sys_ioctl+0x8f/0xd0 do_syscall_64+0x61/0xad0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Simply delete the WARN instead of trying to prevent userspace from shoving "illegal" state into CR3. For better or worse, KVM's ABI allows userspace to set CPUID after SREGS, and vice versa, and KVM is very permissive when it comes to guest CPUID. I.e. attempting to enforce the virtual CPU model when setting CPUID could break userspace. Given that the WARN doesn't provide any meaningful protection for KVM or benefit for userspace, simply drop it even though the odds of breaking userspace are minuscule. Opportunistically delete a spurious newline. Fixes: b222b0b88162 ("KVM: nSVM: refactor the CR3 reload on migration") Cc: stable@vger.kernel.org Cc: Yosry Ahmed Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251216161755.1775409-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index ba0f11c68372..9be67040e94d 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1870,10 +1870,9 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, * thus MMU might not be initialized correctly. * Set it again to fix this. */ - ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3, nested_npt_enabled(svm), false); - if (WARN_ON_ONCE(ret)) + if (ret) goto out_free; svm->nested.force_msr_bitmap_recalc = true; From 737f2a382f89f2ff3d9d6a737004d97bfb98dc56 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Nov 2025 14:16:41 -0800 Subject: [PATCH 066/267] KVM: SVM: Rename "fault_address" to "gpa" in npf_interception() Rename "fault_address" to "gpa" in KVM's #NPF handler and track it as a gpa_t to more precisely document what type of address is being captured, and because "gpa" is much more succinct. No functional change intended. 
Link: https://patch.msgid.link/20251113221642.1673023-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 24d59ccfa40d..af018c1196b5 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1867,8 +1867,8 @@ static int npf_interception(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); int rc; - u64 fault_address = svm->vmcb->control.exit_info_2; u64 error_code = svm->vmcb->control.exit_info_1; + gpa_t gpa = svm->vmcb->control.exit_info_2; /* * WARN if hardware generates a fault with an error code that collides @@ -1882,14 +1882,14 @@ static int npf_interception(struct kvm_vcpu *vcpu) if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK)) error_code |= PFERR_PRIVATE_ACCESS; - trace_kvm_page_fault(vcpu, fault_address, error_code); - rc = kvm_mmu_page_fault(vcpu, fault_address, error_code, + trace_kvm_page_fault(vcpu, gpa, error_code); + rc = kvm_mmu_page_fault(vcpu, gpa, error_code, static_cpu_has(X86_FEATURE_DECODEASSISTS) ? svm->vmcb->control.insn_bytes : NULL, svm->vmcb->control.insn_len); if (rc > 0 && error_code & PFERR_GUEST_RMP_MASK) - sev_handle_rmp_fault(vcpu, fault_address, error_code); + sev_handle_rmp_fault(vcpu, gpa, error_code); return rc; } From 01cde4eaaecaf5df158234f0a52b4a1c55796858 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Nov 2025 14:16:42 -0800 Subject: [PATCH 067/267] KVM: SVM: Add support for expedited writes to the fast MMIO bus Wire up SVM's #NPF handler to fast MMIO. While SVM doesn't provide a dedicated exit reason, it's trivial to key off PFERR_RSVD_MASK. Like VMX, restrict the fast path to L1 to avoid having to deal with nGPA=>GPA translations. For simplicity, use the fast path if and only if the next RIP is known. While KVM could utilize EMULTYPE_SKIP, doing so would require additional logic to deal with SEV guests, e.g. to go down the slow path if the instruction buffer is empty. All modern CPUs support next RIP, and in practice the next RIP will be available for any guest fast path. Copy+paste the kvm_io_bus_write() + trace_kvm_fast_mmio() logic even though KVM would ideally provide a small helper, as such a helper would need to either be a macro or non-inline to avoid including trace.h in a header (trace.h must not be included by x86.c prior to CREATE_TRACE_POINTS being defined). Link: https://patch.msgid.link/20251113221642.1673023-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index af018c1196b5..d1ff23e02ecd 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1862,6 +1862,9 @@ static int pf_interception(struct kvm_vcpu *vcpu) svm->vmcb->control.insn_len); } +static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len); + static int npf_interception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1879,6 +1882,24 @@ static int npf_interception(struct kvm_vcpu *vcpu) if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK)) error_code &= ~PFERR_SYNTHETIC_MASK; + /* + * Expedite fast MMIO kicks if the next RIP is known and KVM is allowed + * emulate a page fault, e.g. skipping the current instruction is wrong + * if the #NPF occurred while vectoring an event. 
+ */ + if ((error_code & PFERR_RSVD_MASK) && !is_guest_mode(vcpu)) { + const int emul_type = EMULTYPE_PF | EMULTYPE_NO_DECODE; + + if (svm_check_emulate_instruction(vcpu, emul_type, NULL, 0)) + return 1; + + if (nrips && svm->vmcb->control.next_rip && + !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { + trace_kvm_fast_mmio(gpa); + return kvm_skip_emulated_instruction(vcpu); + } + } + if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK)) error_code |= PFERR_PRIVATE_ACCESS; From 1d1722e52fcd70deb53d8c192f958fe34be14f5e Mon Sep 17 00:00:00 2001 From: Kevin Cheng Date: Mon, 15 Dec 2025 19:25:10 +0000 Subject: [PATCH 068/267] KVM: SVM: Don't allow L1 intercepts for instructions not advertised If a feature is not advertised in the guest's CPUID, prevent L1 from intercepting the unsupported instructions by clearing the corresponding intercept in KVM's cached vmcb12. When an L2 guest executes an instruction that is not advertised to L1, we expect a #UD exception to be injected by L0. However, the nested svm exit handler first checks if the instruction intercept is set in vmcb12, and if so, synthesizes an exit from L2 to L1 instead of a #UD exception. If a feature is not advertised, the L1 intercept should be ignored. While creating KVM's cached vmcb12, sanitize the intercepts for instructions that are not advertised in the guest CPUID. This effectively ignores the L1 intercept on nested vm exit handling. It also ignores the L1 intercept when computing the intercepts in vmcb02, so if L0 (for some reason) does not intercept the instruction, KVM won't intercept it at all. Signed-off-by: Kevin Cheng Co-developed-by: Sean Christopherson Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251215192510.2300816-1-chengkev@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 19 +++++++++++++++++++ arch/x86/kvm/svm/svm.h | 35 +++++++++++++++++++++++++++-------- 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 9be67040e94d..aa1bea134ace 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -403,6 +403,19 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu) return __nested_vmcb_check_controls(vcpu, ctl); } +/* + * If a feature is not advertised to L1, clear the corresponding vmcb12 + * intercept. 
+ */ +#define __nested_svm_sanitize_intercept(__vcpu, __control, fname, iname) \ +do { \ + if (!guest_cpu_cap_has(__vcpu, X86_FEATURE_##fname)) \ + vmcb12_clr_intercept(__control, INTERCEPT_##iname); \ +} while (0) + +#define nested_svm_sanitize_intercept(__vcpu, __control, name) \ + __nested_svm_sanitize_intercept(__vcpu, __control, name, name) + static void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, struct vmcb_ctrl_area_cached *to, @@ -413,6 +426,12 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, for (i = 0; i < MAX_INTERCEPT; i++) to->intercepts[i] = from->intercepts[i]; + __nested_svm_sanitize_intercept(vcpu, to, XSAVE, XSETBV); + nested_svm_sanitize_intercept(vcpu, to, INVPCID); + nested_svm_sanitize_intercept(vcpu, to, RDTSCP); + nested_svm_sanitize_intercept(vcpu, to, SKINIT); + nested_svm_sanitize_intercept(vcpu, to, RDPRU); + to->iopm_base_pa = from->iopm_base_pa; to->msrpm_base_pa = from->msrpm_base_pa; to->tsc_offset = from->tsc_offset; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 01be93a53d07..806e68ba821b 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -434,28 +434,47 @@ static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) */ #define SVM_REGS_LAZY_LOAD_SET (1 << VCPU_EXREG_PDPTR) -static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit) +static inline void __vmcb_set_intercept(unsigned long *intercepts, u32 bit) { WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); - __set_bit(bit, (unsigned long *)&control->intercepts); + __set_bit(bit, intercepts); +} + +static inline void __vmcb_clr_intercept(unsigned long *intercepts, u32 bit) +{ + WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); + __clear_bit(bit, intercepts); +} + +static inline bool __vmcb_is_intercept(unsigned long *intercepts, u32 bit) +{ + WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); + return test_bit(bit, intercepts); +} + +static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit) +{ + __vmcb_set_intercept((unsigned long *)&control->intercepts, bit); } static inline void vmcb_clr_intercept(struct vmcb_control_area *control, u32 bit) { - WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); - __clear_bit(bit, (unsigned long *)&control->intercepts); + __vmcb_clr_intercept((unsigned long *)&control->intercepts, bit); } static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit) { - WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); - return test_bit(bit, (unsigned long *)&control->intercepts); + return __vmcb_is_intercept((unsigned long *)&control->intercepts, bit); +} + +static inline void vmcb12_clr_intercept(struct vmcb_ctrl_area_cached *control, u32 bit) +{ + __vmcb_clr_intercept((unsigned long *)&control->intercepts, bit); } static inline bool vmcb12_is_intercept(struct vmcb_ctrl_area_cached *control, u32 bit) { - WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); - return test_bit(bit, (unsigned long *)&control->intercepts); + return __vmcb_is_intercept((unsigned long *)&control->intercepts, bit); } static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit) From db5e82496492b4890b1c3356581c016767ed527f Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Fri, 7 Nov 2025 10:32:39 +0100 Subject: [PATCH 069/267] KVM: SVM: Virtualize and advertise support for ERAPS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AMD CPUs with the Enhanced Return Address Predictor Security (ERAPS) feature (available on Zen5+) obviate the need for FILL_RETURN_BUFFER sequences 
right after VMEXITs.

ERAPS adds guest/host tags to entries in the RSB (a.k.a. RAP). This helps with speculation protection across the VM boundary, and it also preserves host and guest entries in the RSB that can improve software performance (which would otherwise be flushed due to the FILL_RETURN_BUFFER sequences).

Importantly, ERAPS also improves cross-domain security by clearing the RAP in certain situations. Specifically, the RAP is cleared in response to actions that are typically tied to software context switching between tasks. Per the APM:

  The ERAPS feature eliminates the need to execute CALL instructions to
  clear the return address predictor in most cases. On processors that
  support ERAPS, return addresses from CALL instructions executed in host
  mode are not used in guest mode, and vice versa. Additionally, the
  return address predictor is cleared in all cases when the TLB is
  implicitly invalidated and in the following cases:

  • MOV CR3 instruction
  • INVPCID other than single address invalidation (operation type 0)

ERAPS also allows CPUs to extend the size of the RSB/RAP from the older standard (of 32 entries) to a new size, enumerated in CPUID leaf 0x80000021:EBX bits 23:16 (64 entries in Zen5 CPUs).

In hardware, ERAPS is always-on: when running in host context, the CPU uses the full RSB/RAP size without any software changes necessary. However, when running in guest context, the CPU utilizes the full size of the RSB/RAP if and only if the new ALLOW_LARGER_RAP flag is set in the VMCB; if the flag is not set, the CPU limits itself to the historical size of 32 entries.

Requiring software to opt in for guest usage of RAPs larger than 32 entries allows hypervisors, i.e. KVM, to emulate the aforementioned conditions in which the RAP is cleared as well as the guest/host split. E.g. if the CPU unconditionally used the full RAP for guests, failure to clear the RAP on transitions between L1 and L2, or on emulated guest TLB flushes, would expose the guest to RAP-based attacks, as a guest without support for ERAPS wouldn't know that its FILL_RETURN_BUFFER sequence is insufficient.

Address the ~two broad categories of ERAPS emulation, and advertise ERAPS support to userspace, along with the RAP size enumerated in CPUID.

1. Architectural RAP clearing: as above, CPUs with ERAPS clear RAP entries on several conditions, including CR3 updates. To handle scenarios where a relevant operation is handled in common code (emulation of INVPCID and to a lesser extent MOV CR3), piggyback VCPU_EXREG_CR3 and create an alias, VCPU_EXREG_ERAPS. SVM doesn't utilize CR3 dirty tracking, and so for all intents and purposes VCPU_EXREG_CR3 is unused. Aliasing VCPU_EXREG_ERAPS ensures that any flow that writes CR3 will also clear the guest's RAP, and allows common x86 to mark ERAPS vCPUs as needing a RAP clear without having to add a new request (or other mechanism).

2. Nested guests: the ERAPS feature adds host/guest tagging to entries in the RSB, but does not distinguish between the guest ASIDs. To prevent the case of an L2 guest poisoning the RSB to attack the L1 guest, the CPU exposes a new VMCB bit (CLEAR_RAP). The next VMRUN with a VMCB that has this bit set causes the CPU to flush the RSB before entering the guest context. Set the bit in VMCB01 after a nested #VMEXIT to ensure the next time the L1 guest runs, its RSB contents aren't polluted by the L2's contents. Similarly, before entry into a nested guest, set the bit for VMCB02, so that the L1 guest's RSB contents are not leaked/used in the L2 context.
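As a concrete reference for the CPUID enumeration described above, a VMM or guest could probe ERAPS and the advertised RAP size roughly as follows. This is an illustrative sketch, not part of the patch: it assumes ERAPS is reported in CPUID leaf 0x80000021 EAX bit 24 (mirroring the X86_FEATURE_ERAPS definition added below), with the RAP size in EBX bits 23:16 as noted above.

  #include <cpuid.h>
  #include <stdio.h>

  int main(void)
  {
          unsigned int eax, ebx, ecx, edx;

          /* __get_cpuid_count() fails if leaf 0x80000021 isn't available. */
          if (!__get_cpuid_count(0x80000021, 0, &eax, &ebx, &ecx, &edx) ||
              !(eax & (1u << 24))) {
                  printf("ERAPS not supported\n");
                  return 0;
          }

          /* RAP size is enumerated in EBX bits 23:16, e.g. 64 on Zen5. */
          printf("ERAPS supported, RAP size: %u entries\n", (ebx >> 16) & 0xff);
          return 0;
  }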
Enable ALLOW_LARGER_RAP (and emulate RAP clears) if and only if ERAPS is exposed to the guest. Enabling ALLOW_LARGER_RAP unconditionally wouldn't cause any functional issues, but ignoring userspace's (and L1's) desires would put KVM into a grey area, which is especially undesirable due to the potential security implications. E.g. if a use case wants to have L1 do manual RAP clearing even when ERAPS is present in hardware, enabling ALLOW_LARGER_RAP could result in L1 leaving stale entries in the RAP. ERAPS is documented in AMD APM Vol 2 (Pub 24593), in revisions 3.43 and later. Signed-off-by: Amit Shah Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Reviewed-by: Amit Shah Link: https://patch.msgid.link/aR913X8EqO6meCqa@google.com --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/kvm_host.h | 8 ++++++++ arch/x86/include/asm/svm.h | 6 +++++- arch/x86/kvm/cpuid.c | 9 ++++++++- arch/x86/kvm/svm/nested.c | 18 ++++++++++++++++++ arch/x86/kvm/svm/svm.c | 25 ++++++++++++++++++++++++- arch/x86/kvm/svm/svm.h | 1 + arch/x86/kvm/x86.c | 12 ++++++++++++ 8 files changed, 77 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index c3b53beb1300..81f7b3b91986 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -472,6 +472,7 @@ #define X86_FEATURE_GP_ON_USER_CPUID (20*32+17) /* User CPUID faulting */ #define X86_FEATURE_PREFETCHI (20*32+20) /* Prefetch Data/Instruction to Cache Level */ +#define X86_FEATURE_ERAPS (20*32+24) /* Enhanced Return Address Predictor Security */ #define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */ #define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */ #define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5a3bfa293e8b..0353d8b6988c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -195,7 +195,15 @@ enum kvm_reg { VCPU_EXREG_PDPTR = NR_VCPU_REGS, VCPU_EXREG_CR0, + /* + * Alias AMD's ERAPS (not a real register) to CR3 so that common code + * can trigger emulation of the RAP (Return Address Predictor) with + * minimal support required in common code. Piggyback CR3 as the RAP + * is cleared on writes to CR3, i.e. marking CR3 dirty will naturally + * mark ERAPS dirty as well. 
+ */ VCPU_EXREG_CR3, + VCPU_EXREG_ERAPS = VCPU_EXREG_CR3, VCPU_EXREG_CR4, VCPU_EXREG_RFLAGS, VCPU_EXREG_SEGMENTS, diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 56aa99503dc4..50ece197c98a 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -131,7 +131,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u64 tsc_offset; u32 asid; u8 tlb_ctl; - u8 reserved_2[3]; + u8 erap_ctl; + u8 reserved_2[2]; u32 int_ctl; u32 int_vector; u32 int_state; @@ -182,6 +183,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define TLB_CONTROL_FLUSH_ASID 3 #define TLB_CONTROL_FLUSH_ASID_LOCAL 7 +#define ERAP_CONTROL_ALLOW_LARGER_RAP BIT(0) +#define ERAP_CONTROL_CLEAR_RAP BIT(1) + #define V_TPR_MASK 0x0f #define V_IRQ_SHIFT 8 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 88a5426674a1..c590a5bd3196 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1223,6 +1223,7 @@ void kvm_set_cpu_caps(void) /* PrefetchCtlMsr */ /* GpOnUserCpuid */ /* EPSF */ + F(ERAPS), SYNTHESIZED_F(SBPB), SYNTHESIZED_F(IBPB_BRTYPE), SYNTHESIZED_F(SRSO_NO), @@ -1803,8 +1804,14 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; case 0x80000021: - entry->ebx = entry->edx = 0; + entry->edx = 0; cpuid_entry_override(entry, CPUID_8000_0021_EAX); + + if (kvm_cpu_cap_has(X86_FEATURE_ERAPS)) + entry->ebx &= GENMASK(23, 16); + else + entry->ebx = 0; + cpuid_entry_override(entry, CPUID_8000_0021_ECX); break; /* AMD Extended Performance Monitoring and Debug */ diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index aa1bea134ace..5a1e1164c197 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -436,6 +436,7 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, to->msrpm_base_pa = from->msrpm_base_pa; to->tsc_offset = from->tsc_offset; to->tlb_ctl = from->tlb_ctl; + to->erap_ctl = from->erap_ctl; to->int_ctl = from->int_ctl; to->int_vector = from->int_vector; to->int_state = from->int_state; @@ -885,6 +886,19 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, } } + /* + * Take ALLOW_LARGER_RAP from vmcb12 even though it should be safe to + * let L2 use a larger RAP since KVM will emulate the necessary clears, + * as it's possible L1 deliberately wants to restrict L2 to the legacy + * RAP size. Unconditionally clear the RAP on nested VMRUN, as KVM is + * responsible for emulating the host vs. guest tags (L1 is the "host", + * L2 is the "guest"). + */ + if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS)) + vmcb02->control.erap_ctl = (svm->nested.ctl.erap_ctl & + ERAP_CONTROL_ALLOW_LARGER_RAP) | + ERAP_CONTROL_CLEAR_RAP; + /* * Merge guest and host intercepts - must be called with vcpu in * guest-mode to take effect. 
@@ -1180,6 +1194,9 @@ int nested_svm_vmexit(struct vcpu_svm *svm) kvm_nested_vmexit_handle_ibrs(vcpu); + if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS)) + vmcb01->control.erap_ctl |= ERAP_CONTROL_CLEAR_RAP; + svm_switch_vmcb(svm, &svm->vmcb01); /* @@ -1686,6 +1703,7 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, dst->tsc_offset = from->tsc_offset; dst->asid = from->asid; dst->tlb_ctl = from->tlb_ctl; + dst->erap_ctl = from->erap_ctl; dst->int_ctl = from->int_ctl; dst->int_vector = from->int_vector; dst->int_state = from->int_state; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d1ff23e02ecd..34c8a94b1b81 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1141,6 +1141,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event) svm_clr_intercept(svm, INTERCEPT_PAUSE); } + if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS)) + svm->vmcb->control.erap_ctl |= ERAP_CONTROL_ALLOW_LARGER_RAP; + if (kvm_vcpu_apicv_active(vcpu)) avic_init_vmcb(svm, vmcb); @@ -3293,6 +3296,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset); pr_err("%-20s%d\n", "asid:", control->asid); pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl); + pr_err("%-20s%d\n", "erap_ctl:", control->erap_ctl); pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl); pr_err("%-20s%08x\n", "int_vector:", control->int_vector); pr_err("%-20s%08x\n", "int_state:", control->int_state); @@ -4004,6 +4008,13 @@ static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) invlpga(gva, svm->vmcb->control.asid); } +static void svm_flush_tlb_guest(struct kvm_vcpu *vcpu) +{ + kvm_register_mark_dirty(vcpu, VCPU_EXREG_ERAPS); + + svm_flush_tlb_asid(vcpu); +} + static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4262,6 +4273,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) } svm->vmcb->save.cr2 = vcpu->arch.cr2; + if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS) && + kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS)) + svm->vmcb->control.erap_ctl |= ERAP_CONTROL_CLEAR_RAP; + svm_hv_update_vp_id(svm->vmcb, vcpu); /* @@ -4339,6 +4354,14 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) } svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; + + /* + * Unconditionally mask off the CLEAR_RAP bit, the AND is just as cheap + * as the TEST+Jcc to avoid it. 
+ */ + if (cpu_feature_enabled(X86_FEATURE_ERAPS)) + svm->vmcb->control.erap_ctl &= ~ERAP_CONTROL_CLEAR_RAP; + vmcb_mark_all_clean(svm->vmcb); /* if exit due to PF check for async PF */ @@ -5094,7 +5117,7 @@ struct kvm_x86_ops svm_x86_ops __initdata = { .flush_tlb_all = svm_flush_tlb_all, .flush_tlb_current = svm_flush_tlb_current, .flush_tlb_gva = svm_flush_tlb_gva, - .flush_tlb_guest = svm_flush_tlb_asid, + .flush_tlb_guest = svm_flush_tlb_guest, .vcpu_pre_run = svm_vcpu_pre_run, .vcpu_run = svm_vcpu_run, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 806e68ba821b..7d28a739865f 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -156,6 +156,7 @@ struct vmcb_ctrl_area_cached { u64 tsc_offset; u32 asid; u8 tlb_ctl; + u8 erap_ctl; u32 int_ctl; u32 int_vector; u32 int_state; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ff8812f3a129..e013392fe20c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -14130,6 +14130,13 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) return 1; } + /* + * When ERAPS is supported, invalidating a specific PCID clears + * the RAP (Return Address Predicator). + */ + if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS)) + kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS); + kvm_invalidate_pcid(vcpu, operand.pcid); return kvm_skip_emulated_instruction(vcpu); @@ -14143,6 +14150,11 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) fallthrough; case INVPCID_TYPE_ALL_INCL_GLOBAL: + /* + * Don't bother marking VCPU_EXREG_ERAPS dirty, SVM will take + * care of doing so when emulating the full guest TLB flush + * (the RAP is cleared on all implicit TLB flushes). + */ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); return kvm_skip_emulated_instruction(vcpu); From 8312f1b9dd71340b5fff65e56c6c163187bfa5d0 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 21 Nov 2025 20:48:01 +0000 Subject: [PATCH 070/267] KVM: SVM: Don't set GIF when clearing EFER.SVME Clearing EFER.SVME is not architected to set GIF. Don't set GIF when emulating a change to EFER that clears EFER.SVME. However, keep setting GIF if clearing EFER.SVME causes force-leaving the nested guest through svm_leave_nested(), to maintain a sane behavior of not leaving GIF cleared after exiting the guest. In every other path, setting GIF is either correct/desirable, or irrelevant because the caller immediately and unconditionally sets/clears GIF. This is more-or-less KVM defining HW behavior, but leaving GIF cleared would also be defining HW behavior anyway. Note that if force-leaving the nested guest is considered a SHUTDOWN, then this could violate the APM-specified behavior: If the processor enters the shutdown state (due to a triple fault for instance) while GIF is clear, it can only be restarted by means of a RESET. However, a SHUTDOWN leaves the VMCB undefined, so there's not a lot that KVM can do in this case. Also, if vGIF is enabled on SHUTDOWN, KVM has no way of finding out of GIF was cleared. The only way for KVM to handle this without making up HW behavior is to completely terminate the VM, so settle for doing the relatively "sane" thing of setting GIF when force-leaving nested. 
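For context, the GIF=0 with EFER.SVME=0 state that the next patch makes restorable via KVM_SET_NESTED_STATE can be reached by a guest roughly as follows. This is an illustrative selftest-style sketch, not part of this series, assuming the KVM selftest framework's guest-side rdmsr()/wrmsr() and GUEST_DONE() helpers:

  static void guest_code(void)
  {
          /* CLGI is only usable once EFER.SVME is set. */
          wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_SVME);

          /* GIF <- 0 */
          asm volatile("clgi");

          /* Clearing SVME is not architected to set GIF, so GIF stays 0. */
          wrmsr(MSR_EFER, rdmsr(MSR_EFER) & ~EFER_SVME);

          GUEST_DONE();
  }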
Fixes: c513f484c558 ("KVM: nSVM: leave guest mode when clearing EFER.SVME") Signed-off-by: Jim Mattson Co-developed-by: Sean Christopherson Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251121204803.991707-3-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 2 ++ arch/x86/kvm/svm/svm.c | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 5a1e1164c197..47e8ce7d360a 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1399,6 +1399,8 @@ void svm_leave_nested(struct kvm_vcpu *vcpu) nested_svm_uninit_mmu_context(vcpu); vmcb_mark_all_dirty(svm->vmcb); + svm_set_gif(svm, true); + if (kvm_apicv_activated(vcpu->kvm)) kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 34c8a94b1b81..c7bd78f5a2c7 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -215,7 +215,6 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) { if (!(efer & EFER_SVME)) { svm_leave_nested(vcpu); - svm_set_gif(svm, true); /* #GP intercept is still needed for vmware backdoor */ if (!enable_vmware_backdoor) clr_exception_intercept(svm, GP_VECTOR); From 6f4d3ebc24c6ef92e196ebbd389a3f2bfdc7a144 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 21 Nov 2025 20:48:00 +0000 Subject: [PATCH 071/267] KVM: SVM: Allow KVM_SET_NESTED_STATE to clear GIF when SVME==0 GIF==0 together with EFER.SVME==0 is a valid architectural state. Don't return -EINVAL for KVM_SET_NESTED_STATE when this combination is specified. Fixes: cc440cdad5b7 ("KVM: nSVM: implement KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE") Signed-off-by: Jim Mattson Reviewed-by: Yosry Ahmed Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251121204803.991707-2-yosry.ahmed@linux.dev [sean: disallow KVM_STATE_NESTED_RUN_PENDING with SVME=0] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 47e8ce7d360a..5b741f8ed170 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1821,12 +1821,12 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, /* * If in guest mode, vcpu->arch.efer actually refers to the L2 guest's * EFER.SVME, but EFER.SVME still has to be 1 for VMRUN to succeed. + * If SVME is disabled, the only valid states are "none" and GIF=1 + * (clearing SVME does NOT set GIF, i.e. GIF=0 is allowed). */ - if (!(vcpu->arch.efer & EFER_SVME)) { - /* GIF=1 and no guest mode are required if SVME=0. */ - if (kvm_state->flags != KVM_STATE_NESTED_GIF_SET) - return -EINVAL; - } + if (!(vcpu->arch.efer & EFER_SVME) && kvm_state->flags && + kvm_state->flags != KVM_STATE_NESTED_GIF_SET) + return -EINVAL; /* SMM temporarily disables SVM, so we cannot be in guest mode. */ if (is_smm(vcpu) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) From bda6ae6f29664b659671f872a2adda3c1c2f5dd6 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 21 Nov 2025 20:48:02 +0000 Subject: [PATCH 072/267] KVM: selftests: Use TEST_ASSERT_EQ() in test_vmx_nested_state() The assert messages do not add much value, so use TEST_ASSERT_EQ(), which also nicely displays the addresses in hex. While at it, also assert the values of state->flags. 
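For reference, TEST_ASSERT_EQ() evaluates each operand once and dumps both values on failure, roughly along the lines below (a simplified sketch of the macro in tools/testing/selftests/kvm/include/test_util.h, not a verbatim copy), which is why the hand-written messages add little and why the values are displayed in hex:

  #define TEST_ASSERT_EQ(a, b)					\
  do {								\
  	typeof(a) __a = (a);					\
  	typeof(b) __b = (b);					\
  	TEST_ASSERT(__a == __b,					\
  		    "ASSERT_EQ(%s, %s) failed.\n"		\
  		    "\t%s is %#lx\n"				\
  		    "\t%s is %#lx",				\
  		    #a, #b,					\
  		    #a, (unsigned long)__a,			\
  		    #b, (unsigned long)__b);			\
  } while (0)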
Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251121204803.991707-4-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c index 67a62a5a8895..b59a8a17084d 100644 --- a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c @@ -241,8 +241,10 @@ void test_vmx_nested_state(struct kvm_vcpu *vcpu) TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz, "Size must be between %ld and %d. The size returned was %d.", sizeof(*state), state_sz, state->size); - TEST_ASSERT(state->hdr.vmx.vmxon_pa == -1ull, "vmxon_pa must be -1ull."); - TEST_ASSERT(state->hdr.vmx.vmcs12_pa == -1ull, "vmcs_pa must be -1ull."); + + TEST_ASSERT_EQ(state->hdr.vmx.vmxon_pa, -1ull); + TEST_ASSERT_EQ(state->hdr.vmx.vmcs12_pa, -1ull); + TEST_ASSERT_EQ(state->flags, 0); free(state); } From ca2eccb953fd33ef38701e33e660b21f7e84aa14 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 21 Nov 2025 20:48:03 +0000 Subject: [PATCH 073/267] KVM: selftests: Extend vmx_set_nested_state_test to cover SVM Add test cases for the validation checks in svm_set_nested_state(), and allow the test to run with SVM as well as VMX. The SVM test also makes sure that KVM_SET_NESTED_STATE accepts GIF being set or cleared if EFER.SVME is cleared, verifying a recently fixed bug where GIF was incorrectly expected to always be set when EFER.SVME is cleared. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251121204803.991707-5-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 2 +- ...d_state_test.c => nested_set_state_test.c} | 122 ++++++++++++++++-- 2 files changed, 112 insertions(+), 12 deletions(-) rename tools/testing/selftests/kvm/x86/{vmx_set_nested_state_test.c => nested_set_state_test.c} (71%) diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index ba5c2b643efa..4ddece4ee365 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -92,6 +92,7 @@ TEST_GEN_PROGS_x86 += x86/nested_close_kvm_test TEST_GEN_PROGS_x86 += x86/nested_emulation_test TEST_GEN_PROGS_x86 += x86/nested_exceptions_test TEST_GEN_PROGS_x86 += x86/nested_invalid_cr3_test +TEST_GEN_PROGS_x86 += x86/nested_set_state_test TEST_GEN_PROGS_x86 += x86/nested_tsc_adjust_test TEST_GEN_PROGS_x86 += x86/nested_tsc_scaling_test TEST_GEN_PROGS_x86 += x86/platform_info_test @@ -120,7 +121,6 @@ TEST_GEN_PROGS_x86 += x86/vmx_exception_with_invalid_guest_state TEST_GEN_PROGS_x86 += x86/vmx_msrs_test TEST_GEN_PROGS_x86 += x86/vmx_invalid_nested_guest_state TEST_GEN_PROGS_x86 += x86/vmx_nested_la57_state_test -TEST_GEN_PROGS_x86 += x86/vmx_set_nested_state_test TEST_GEN_PROGS_x86 += x86/apic_bus_clock_test TEST_GEN_PROGS_x86 += x86/xapic_ipi_test TEST_GEN_PROGS_x86 += x86/xapic_state_test diff --git a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86/nested_set_state_test.c similarity index 71% rename from tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c rename to tools/testing/selftests/kvm/x86/nested_set_state_test.c index b59a8a17084d..0f2102b43629 100644 --- a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c +++ 
b/tools/testing/selftests/kvm/x86/nested_set_state_test.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * vmx_set_nested_state_test - * * Copyright (C) 2019, Google LLC. * * This test verifies the integrity of calling the ioctl KVM_SET_NESTED_STATE. @@ -11,6 +9,7 @@ #include "kvm_util.h" #include "processor.h" #include "vmx.h" +#include "svm_util.h" #include #include @@ -249,6 +248,104 @@ void test_vmx_nested_state(struct kvm_vcpu *vcpu) free(state); } +static void vcpu_efer_enable_svm(struct kvm_vcpu *vcpu) +{ + uint64_t old_efer = vcpu_get_msr(vcpu, MSR_EFER); + + vcpu_set_msr(vcpu, MSR_EFER, old_efer | EFER_SVME); +} + +static void vcpu_efer_disable_svm(struct kvm_vcpu *vcpu) +{ + uint64_t old_efer = vcpu_get_msr(vcpu, MSR_EFER); + + vcpu_set_msr(vcpu, MSR_EFER, old_efer & ~EFER_SVME); +} + +void set_default_svm_state(struct kvm_nested_state *state, int size) +{ + memset(state, 0, size); + state->format = 1; + state->size = size; + state->hdr.svm.vmcb_pa = 0x3000; +} + +void test_svm_nested_state(struct kvm_vcpu *vcpu) +{ + /* Add a page for VMCB. */ + const int state_sz = sizeof(struct kvm_nested_state) + getpagesize(); + struct kvm_nested_state *state = + (struct kvm_nested_state *)malloc(state_sz); + + vcpu_set_cpuid_feature(vcpu, X86_FEATURE_SVM); + + /* The format must be set to 1. 0 for VMX, 1 for SVM. */ + set_default_svm_state(state, state_sz); + state->format = 0; + test_nested_state_expect_einval(vcpu, state); + + /* Invalid flags are rejected, KVM_STATE_NESTED_EVMCS is VMX-only */ + set_default_svm_state(state, state_sz); + state->flags = KVM_STATE_NESTED_EVMCS; + test_nested_state_expect_einval(vcpu, state); + + /* + * If EFER.SVME is clear, guest mode is disallowed and GIF can be set or + * cleared. + */ + vcpu_efer_disable_svm(vcpu); + + set_default_svm_state(state, state_sz); + state->flags = KVM_STATE_NESTED_GUEST_MODE; + test_nested_state_expect_einval(vcpu, state); + + state->flags = 0; + test_nested_state(vcpu, state); + + state->flags = KVM_STATE_NESTED_GIF_SET; + test_nested_state(vcpu, state); + + /* Enable SVM in the guest EFER. */ + vcpu_efer_enable_svm(vcpu); + + /* Setting vmcb_pa to a non-aligned address is only fine when not entering guest mode */ + set_default_svm_state(state, state_sz); + state->hdr.svm.vmcb_pa = -1ull; + state->flags = 0; + test_nested_state(vcpu, state); + state->flags = KVM_STATE_NESTED_GUEST_MODE; + test_nested_state_expect_einval(vcpu, state); + + /* + * Size must be large enough to fit kvm_nested_state and VMCB + * only when entering guest mode. + */ + set_default_svm_state(state, state_sz/2); + state->flags = 0; + test_nested_state(vcpu, state); + state->flags = KVM_STATE_NESTED_GUEST_MODE; + test_nested_state_expect_einval(vcpu, state); + + /* + * Test that if we leave nesting the state reflects that when we get it + * again, except for vmcb_pa, which is always returned as 0 when not in + * guest mode. + */ + set_default_svm_state(state, state_sz); + state->hdr.svm.vmcb_pa = -1ull; + state->flags = KVM_STATE_NESTED_GIF_SET; + test_nested_state(vcpu, state); + vcpu_nested_state_get(vcpu, state); + TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz, + "Size must be between %ld and %d. 
The size returned was %d.", + sizeof(*state), state_sz, state->size); + + TEST_ASSERT_EQ(state->hdr.svm.vmcb_pa, 0); + TEST_ASSERT_EQ(state->flags, KVM_STATE_NESTED_GIF_SET); + + free(state); +} + int main(int argc, char *argv[]) { struct kvm_vm *vm; @@ -257,20 +354,20 @@ int main(int argc, char *argv[]) have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || + kvm_cpu_has(X86_FEATURE_SVM)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); - /* - * AMD currently does not implement set_nested_state, so for now we - * just early out. - */ - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - vm = vm_create_with_one_vcpu(&vcpu, NULL); /* - * First run tests with VMX disabled to check error handling. + * First run tests with VMX/SVM disabled to check error handling. + * test_{vmx/svm}_nested_state() will re-enable as needed. */ - vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_VMX); + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_VMX); + else + vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_SVM); /* Passing a NULL kvm_nested_state causes a EFAULT. */ test_nested_state_expect_efault(vcpu, NULL); @@ -299,7 +396,10 @@ int main(int argc, char *argv[]) state.flags = KVM_STATE_NESTED_RUN_PENDING; test_nested_state_expect_einval(vcpu, &state); - test_vmx_nested_state(vcpu); + if (kvm_cpu_has(X86_FEATURE_VMX)) + test_vmx_nested_state(vcpu); + else + test_svm_nested_state(vcpu); kvm_vm_free(vm); return 0; From fc4d3a6558af99e7b6a2ede6a6e6605be41da0ee Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Nov 2025 11:05:14 -0800 Subject: [PATCH 074/267] KVM: x86: Enforce use of EXPORT_SYMBOL_FOR_KVM_INTERNAL Add a (gnarly) inline "script" in the Makefile to fail the build if there is EXPORT_SYMBOL_GPL or EXPORT_SYMBOL usage in virt/kvm or arch/x86/kvm beyond the known-good/expected exports for other modules. Remembering to use EXPORT_SYMBOL_FOR_KVM_INTERNAL is surprisingly difficult, and hoping to detect "bad" exports via code review is not a robust long-term strategy. Jump through a pile of hoops to coerce make into printing a human-friendly error message, with the offending files+lines cleanly separated. E.g. where is the resolution of $(srctree), i.e. '.' for in-tree builds, and the absolute path for out-of-tree-builds: /arch/x86/kvm/Makefile:97: *** ERROR *** found 2 unwanted occurrences of EXPORT_SYMBOL_GPL: /arch/x86/kvm/x86.c:686:EXPORT_SYMBOL_GPL(__kvm_set_user_return_msr); /arch/x86/kvm/x86.c:703:EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); in directories: /arch/x86/kvm /virt/kvm Use EXPORT_SYMBOL_FOR_KVM_INTERNAL, not EXPORT_SYMBOL_GPL. Stop. and /arch/x86/kvm/Makefile:98: *** ERROR *** found 1 unwanted occurrences of EXPORT_SYMBOL: /arch/x86/kvm/x86.c:709:EXPORT_SYMBOL(kvm_get_user_return_msr); in directories: /arch/x86/kvm /virt/kvm Use EXPORT_SYMBOL_FOR_KVM_INTERNAL, not EXPORT_SYMBOL. Stop. Put the enforcement in x86's Makefile even though the rule itself applies to virt/kvm, as putting the enforcement in virt/kvm/Makefile.kvm would effectively require exempting every architecture except x86. PPC is the only other architecture with sub-modules, and PPC hasn't been switched to use EXPORT_SYMBOL_FOR_KVM_INTERNAL (and given its nearly-orphaned state, likely never will). And for KVM architectures without sub-modules, that means that, barring truly spurious exports, the exports are intended for non-KVM usage and thus shouldn't be using EXPORT_SYMBOL_FOR_KVM_INTERNAL. 
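For illustration, the conversion the new check demands looks like the following, using one of the symbols from the example error message above; exports genuinely intended for other modules (the grep allowlist, e.g. the page-track notifiers) keep EXPORT_SYMBOL_GPL:

  -EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
  +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr);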
Tested-by: Chao Gao Link: https://patch.msgid.link/20251121190514.293385-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/Makefile | 49 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index c4b8950c7abe..77337c37324b 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -47,3 +47,52 @@ $(obj)/kvm-asm-offsets.h: $(obj)/kvm-asm-offsets.s FORCE targets += kvm-asm-offsets.s clean-files += kvm-asm-offsets.h + + +# Fail the build if there is unexpected EXPORT_SYMBOL_GPL (or EXPORT_SYMBOL) +# usage. All KVM-internal exports should use EXPORT_SYMBOL_FOR_KVM_INTERNAL. +# Only a handful of exports intended for other modules (VFIO, KVMGT) should +# use EXPORT_SYMBOL_GPL, and EXPORT_SYMBOL should never be used. +ifdef CONFIG_KVM_X86 +# Search recursively for whole words and print line numbers. Filter out the +# allowed set of exports, i.e. those that are intended for external usage. +exports_grep_trailer := --include='*.[ch]' -nrw $(srctree)/virt/kvm $(srctree)/arch/x86/kvm | \ + grep -v -e kvm_page_track_register_notifier \ + -e kvm_page_track_unregister_notifier \ + -e kvm_write_track_add_gfn \ + -e kvm_write_track_remove_gfn \ + -e kvm_get_kvm \ + -e kvm_get_kvm_safe \ + -e kvm_put_kvm + +# Force grep to emit a goofy group separator that can in turn be replaced with +# the above newline macro (newlines in Make are a nightmare). Note, grep only +# prints the group separator when N lines of context are requested via -C, +# a.k.a. --NUM. Simply request zero lines. Print the separator only after +# filtering out expected exports to avoid extra newlines in the error message. +define get_kvm_exports +$(shell grep "$(1)" -C0 $(exports_grep_trailer) | grep "$(1)" -C0 --group-separator="!SEP!") +endef + +define check_kvm_exports +nr_kvm_exports := $(shell grep "$(1)" $(exports_grep_trailer) | wc -l) + +ifneq (0,$$(nr_kvm_exports)) +$$(error ERROR ***\ +$$(newline)found $$(nr_kvm_exports) unwanted occurrences of $(1):\ +$$(newline) $(subst !SEP!,$$(newline) ,$(call get_kvm_exports,$(1)))\ +$$(newline)in directories:\ +$$(newline) $(srctree)/arch/x86/kvm\ +$$(newline) $(srctree)/virt/kvm\ +$$(newline)Use EXPORT_SYMBOL_FOR_KVM_INTERNAL, not $(1)) +endif # nr_kvm_exports != 0 +undefine nr_kvm_exports +endef # check_kvm_exports + +$(eval $(call check_kvm_exports,EXPORT_SYMBOL_GPL)) +$(eval $(call check_kvm_exports,EXPORT_SYMBOL)) + +undefine check_kvm_exports +undefine get_kvm_exports +undefine exports_grep_trailer +endif # CONFIG_KVM_X86 From a4978324e4bd80427313e0baa16e53709e14d878 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:43:03 -0800 Subject: [PATCH 075/267] KVM: x86: Drop ASSERT()s on APIC/vCPU being non-NULL Remove ASSERT()s on vCPU and APIC structures being non-NULL in the local APIC code as the DEBUG=1 path of ASSERT() ends with BUG(), i.e. isn't meaningfully better for debugging than a NULL pointer dereference. For all intents and purposes, no functional change intended. 
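For reference, the ASSERT() in question (defined in ioapic.h and removed entirely a few patches later in this series) is a thin DEBUG-only wrapper that ends in BUG(), i.e. results in an oops either way:

  #ifdef DEBUG
  #define ASSERT(x)							\
  do {									\
  	if (!(x)) {							\
  		printk(KERN_EMERG "assertion failed %s: %d: %s\n",	\
  		       __FILE__, __LINE__, #x);				\
  		BUG();							\
  	}								\
  } while (0)
  #else
  #define ASSERT(x) do { } while (0)
  #endif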
Link: https://patch.msgid.link/20251206004311.479939-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1597dd0b0cc6..558adcb67171 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1048,7 +1048,6 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, struct kvm_lapic *target = vcpu->arch.apic; u32 mda = kvm_apic_mda(vcpu, dest, source, target); - ASSERT(target); switch (shorthand) { case APIC_DEST_NOSHORT: if (dest_mode == APIC_DEST_PHYSICAL) @@ -1607,8 +1606,6 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) ktime_t remaining, now; s64 ns; - ASSERT(apic != NULL); - /* if initial count is 0, current count should also be 0 */ if (kvm_lapic_get_reg(apic, APIC_TMICT) == 0 || apic->lapic_timer.period == 0) @@ -3000,8 +2997,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic; - ASSERT(vcpu != NULL); - if (!irqchip_in_kernel(vcpu->kvm)) { static_branch_inc(&kvm_has_noapic_vcpu); return 0; From 37187992dd821e0283da3f7664ef3d6ab4220ef8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:43:04 -0800 Subject: [PATCH 076/267] KVM: x86: Drop guest/user-triggerable asserts on IRR/ISR vectors Remove the ASSERT()s in apic_find_highest_i{r,s}r() that exist to detect illegal vectors (0-15 are reserved and never recognized by the local APIC), as the asserts, if they were ever to be enabled by #defining DEBUG, can be trivially triggered from both the guest and from userspace, and ultimately because the ASSERT()s are useless. In large part due to lack of emulation for the Error Status Register and its "delayed" read semantics, KVM doesn't filter out bad IRQs (IPIs or otherwise) when IRQs are sent or received. Instead, probably by dumb luck on KVM's part, KVM effectively ignores pending illegal vectors in the IRR due vector 0-15 having priority '0', and thus never being higher priority than PPR. As for ISR, a misbehaving userspace could stuff illegal vector bits, but again the end result is mostly benign (aside from userspace likely breaking the VM), as processing illegal vectors "works" and doesn't cause functional problems. Regardless of the safety and correctness of KVM's illegal vector handling, one thing is for certain: the ASSERT()s have done absolutely nothing to help detect such issues since they were added 18+ years ago by commit 97222cc83163 ("KVM: Emulate local APIC in kernel"). For all intents and purposes, no functional change intended. Link: https://patch.msgid.link/20251206004311.479939-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 558adcb67171..785c0352fa0e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -666,8 +666,6 @@ static inline int apic_search_irr(struct kvm_lapic *apic) static inline int apic_find_highest_irr(struct kvm_lapic *apic) { - int result; - /* * Note that irr_pending is just a hint. It will be always * true with virtual interrupt delivery enabled. 
@@ -675,10 +673,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) if (!apic->irr_pending) return -1; - result = apic_search_irr(apic); - ASSERT(result == -1 || result >= 16); - - return result; + return apic_search_irr(apic); } static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) @@ -731,8 +726,6 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) static inline int apic_find_highest_isr(struct kvm_lapic *apic) { - int result; - /* * Note that isr_count is always 1, and highest_isr_cache * is always -1, with APIC virtualization enabled. @@ -742,10 +735,7 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic) if (likely(apic->highest_isr_cache != -1)) return apic->highest_isr_cache; - result = apic_find_highest_vector(apic->regs + APIC_ISR); - ASSERT(result == -1 || result >= 16); - - return result; + return apic_find_highest_vector(apic->regs + APIC_ISR); } static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) From ca909f9ea8cb19032c9d9d7841a8e2395b6a9e8b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:43:05 -0800 Subject: [PATCH 077/267] KVM: x86: Drop ASSERT() on I/O APIC EOIs being only for LEVEL_to WARN_ON_ONCE Remove kvm_ioapic_update_eoi_one()'s ASSERT() that the vector's entry is configured to be level-triggered, as KVM intercepts and forward EOIs to the I/O APIC even for edge-triggered IRQs (see kvm_ioapic_scan_entry()), and nothing guarantees the local APIC's TMR register is synchronized with the I/O APIC redirection table, i.e. the @trigger_mode check just out of sight doesn't provide any meaningful protection. Given that roughly half of the historic ASSERT()s are/were guest- and/or user-triggerable, it's safe to assume no one has run meaningful workloads with DEBUG=1, i.e. that the ASSERT() has been dead code since it was added 18+ years ago. Opportunistically drop the unnecessary forward declaration of kvm_ioapic_update_eoi_one(). For all intents and purposes, no functional change intended. Link: https://patch.msgid.link/20251206004311.479939-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/ioapic.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 2c2783296aed..e7315b9311d3 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -37,11 +37,6 @@ static int ioapic_service(struct kvm_ioapic *vioapic, int irq, bool line_status); -static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu, - struct kvm_ioapic *ioapic, - int trigger_mode, - int pin); - static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic) { unsigned long result = 0; @@ -564,7 +559,6 @@ static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu, kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) return; - ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); ent->fields.remote_irr = 0; if (!ent->fields.mask && (ioapic->irr & (1 << pin))) { ++ioapic->irq_eoi[pin]; From 9eabb2a5e499ec3e5a013157844335a487c7cb5e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:43:06 -0800 Subject: [PATCH 078/267] KVM: x86: Drop guest-triggerable ASSERT()s on I/O APIC access alignment Drop the asserts on the guest-controlled address being 16-byte aligned when emulating I/O APIC accesses, as the ASSERT()s are guest-triggerable and ultimately pointless since KVM requires exact register matches, i.e. will ultimately ignore unaligned accesses anyways. 
Drop the ASSERT() definition itself now that all users are gone. For all intents and purposes, no functional change intended. Link: https://patch.msgid.link/20251206004311.479939-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/ioapic.c | 4 ---- arch/x86/kvm/ioapic.h | 13 ------------- 2 files changed, 17 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index e7315b9311d3..955c781ba623 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -618,8 +618,6 @@ static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, if (!ioapic_in_range(ioapic, addr)) return -EOPNOTSUPP; - ASSERT(!(addr & 0xf)); /* check alignment */ - addr &= 0xff; spin_lock(&ioapic->lock); switch (addr) { @@ -660,8 +658,6 @@ static int ioapic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, if (!ioapic_in_range(ioapic, addr)) return -EOPNOTSUPP; - ASSERT(!(addr & 0xf)); /* check alignment */ - switch (len) { case 8: case 4: diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index bf28dbc11ff6..913016acbbd5 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -104,19 +104,6 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, bool mask); -#ifdef DEBUG -#define ASSERT(x) \ -do { \ - if (!(x)) { \ - printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ - __FILE__, __LINE__, #x); \ - BUG(); \ - } \ -} while (0) -#else -#define ASSERT(x) do { } while (0) -#endif - static inline int ioapic_in_kernel(struct kvm *kvm) { return irqchip_full(kvm); From 4d846f183897606ba5e9c76494d19acbe1ba88f9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:43:07 -0800 Subject: [PATCH 079/267] KVM: x86: Drop MAX_NR_RESERVED_IOAPIC_PINS, use KVM_MAX_IRQ_ROUTES directly Directly use KVM_MAX_IRQ_ROUTES when checking the number of routes being defined by userspace when creating a split IRQCHIP. The restriction has nothing to do with the I/O APIC, e.g. most modern userspace usage is for routing MSIs. Breaking the unnecessary dependency on the I/O APIC will allow burying all of ioapic.h behind CONFIG_KVM_IOAPIC=y. No functional change intended. 
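For illustration (userspace side, not part of the patch), the limit being checked here caps what a VMM passes when enabling the split irqchip. A minimal sketch, assuming vm_fd is an open KVM VM file descriptor and the usual <linux/kvm.h>, <sys/ioctl.h> and <err.h> includes; the value 24 is simply the conventional number of I/O APIC pins:

  struct kvm_enable_cap cap = {
          .cap = KVM_CAP_SPLIT_IRQCHIP,
          .args[0] = 24,  /* routes reserved for the userspace "IOAPIC" */
  };

  if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
          err(1, "KVM_ENABLE_CAP(KVM_CAP_SPLIT_IRQCHIP)");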
Link: https://patch.msgid.link/20251206004311.479939-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/ioapic.h | 1 - arch/x86/kvm/x86.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 913016acbbd5..ad238a6e63dc 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -10,7 +10,6 @@ struct kvm; struct kvm_vcpu; #define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS -#define MAX_NR_RESERVED_IOAPIC_PINS KVM_MAX_IRQ_ROUTES #define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ #define IOAPIC_EDGE_TRIG 0 #define IOAPIC_LEVEL_TRIG 1 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ff8812f3a129..a847353588a1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6717,7 +6717,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, case KVM_CAP_SPLIT_IRQCHIP: { mutex_lock(&kvm->lock); r = -EINVAL; - if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS) + if (cap->args[0] > KVM_MAX_IRQ_ROUTES) goto split_irqchip_unlock; r = -EEXIST; if (irqchip_in_kernel(kvm)) From 1a5d7f9540af1416402887435e4537dcef36a4b2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:43:08 -0800 Subject: [PATCH 080/267] KVM: x86: Add a wrapper to handle common case of IRQ delivery without dest_map Turn kvm_irq_delivery_to_apic() into a wrapper that passes NULL for the @dest_map param, as only the ugly I/O APIC RTC hackery needs to know which vCPUs received the IRQ. No functional change intended. Link: https://patch.msgid.link/20251206004311.479939-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/hyperv.c | 2 +- arch/x86/kvm/ioapic.c | 6 +++--- arch/x86/kvm/irq.c | 4 ++-- arch/x86/kvm/lapic.c | 23 ++++++++++++++++------- arch/x86/kvm/lapic.h | 16 ++++++++++++---- arch/x86/kvm/x86.c | 2 +- arch/x86/kvm/xen.c | 2 +- 7 files changed, 36 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index de92292eb1f5..49bf744ca8e3 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -492,7 +492,7 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) irq.vector = vector; irq.level = 1; - ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL); + ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq); trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret); return ret; } diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 955c781ba623..4b49f9728362 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -485,11 +485,11 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) * if rtc_irq_check_coalesced returns false). */ BUG_ON(ioapic->rtc_status.pending_eoi != 0); - ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, - &ioapic->rtc_status.dest_map); + ret = __kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, + &ioapic->rtc_status.dest_map); ioapic->rtc_status.pending_eoi = (ret < 0 ? 
0 : ret); } else - ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL); + ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); if (ret && irqe.trig_mode == IOAPIC_LEVEL_TRIG) entry->fields.remote_irr = 1; diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 7cc8950005b6..a52115441c07 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -235,7 +235,7 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, kvm_msi_to_lapic_irq(kvm, e, &irq); - return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL); + return kvm_irq_delivery_to_apic(kvm, NULL, &irq); } int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, @@ -258,7 +258,7 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, kvm_msi_to_lapic_irq(kvm, e, &irq); - if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) + if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r)) return r; break; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 785c0352fa0e..769facb27d3d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1175,8 +1175,9 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, return true; } -bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map) +static bool __kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, int *r, + struct dest_map *dest_map) { struct kvm_apic_map *map; unsigned long bitmap; @@ -1212,6 +1213,13 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, return ret; } + +bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, int *r) +{ + return __kvm_irq_delivery_to_apic_fast(kvm, src, irq, r, NULL); +} + /* * This routine tries to handle interrupts in posted mode, here is how * it deals with different cases: @@ -1283,15 +1291,16 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_intr_is_single_vcpu); -int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, struct dest_map *dest_map) +int __kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, + struct dest_map *dest_map) { int r = -1; struct kvm_vcpu *vcpu, *lowest = NULL; unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; unsigned int dest_vcpus = 0; - if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) + if (__kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) return r; if (irq->dest_mode == APIC_DEST_PHYSICAL && @@ -1587,7 +1596,7 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) trace_kvm_apic_ipi(icr_low, irq.dest_id); - kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL); + kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_send_ipi); @@ -2556,7 +2565,7 @@ static int __kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data, bool fast) kvm_icr_to_lapic_irq(apic, (u32)data, (u32)(data >> 32), &irq); if (!kvm_irq_delivery_to_apic_fast(apic->vcpu->kvm, apic, &irq, - &ignored, NULL)) + &ignored)) return -EWOULDBLOCK; trace_kvm_apic_ipi((u32)data, irq.dest_id); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 282b9b7da98c..901c05a5ac60 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -117,10 +117,18 @@ int kvm_alloc_apic_access_page(struct kvm *kvm); void 
kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu); bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map); -int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, - struct dest_map *dest_map); + struct kvm_lapic_irq *irq, int *r); +int __kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, + struct dest_map *dest_map); + +static inline int kvm_irq_delivery_to_apic(struct kvm *kvm, + struct kvm_lapic *src, + struct kvm_lapic_irq *irq) +{ + return __kvm_irq_delivery_to_apic(kvm, src, irq, NULL); +} + void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high); int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a847353588a1..0c1933a303ca 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10245,7 +10245,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid) .dest_id = apicid, }; - kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); + kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq); } bool kvm_apicv_activated(struct kvm *kvm) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index d6b2a665b499..28eeb1b2a16c 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -626,7 +626,7 @@ void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v) irq.delivery_mode = APIC_DM_FIXED; irq.level = 1; - kvm_irq_delivery_to_apic(v->kvm, NULL, &irq, NULL); + kvm_irq_delivery_to_apic(v->kvm, NULL, &irq); } /* From 5cd6b1a6eebd564104897ca87be2812d8feafad4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:43:09 -0800 Subject: [PATCH 081/267] KVM: x86: Fold "struct dest_map" into "struct rtc_status" Drop "struct dest_map" and fold its members into its one and only user, "struct rtc_status". Tracking "pending" EOIs and associated vCPUs is very much a hack for legacy RTC behavior, and should never be needed for other IRQ delivery. In addition to making it more obvious why KVM tracks target vCPUs, this will allow burying the "struct rtc_status" definition behind CONFIG_KVM_IOAPIC=y, which in turn will make it even harder for KVM to misuse the structure. No functional change intended. 
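To make the result of the fold concrete, the combined structure ends up as below (an annotated copy of the ioapic.h hunk that follows; comments beyond the original are editorial):

  struct rtc_status {
          /* Number of EOIs still outstanding for the last RTC interrupt. */
          int pending_eoi;

          /* vcpu bitmap where IRQ has been sent */
          DECLARE_BITMAP(map, KVM_MAX_VCPU_IDS);

          /* Vector delivered to each vCPU, valid only if its bit in map is set. */
          u8 vectors[KVM_MAX_VCPU_IDS];
  };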
Link: https://patch.msgid.link/20251206004311.479939-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/ioapic.c | 29 ++++++++++++++--------------- arch/x86/kvm/ioapic.h | 10 +++------- arch/x86/kvm/lapic.c | 28 ++++++++++++++-------------- arch/x86/kvm/lapic.h | 6 +++--- 4 files changed, 34 insertions(+), 39 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 4b49f9728362..9a99d01b111c 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -77,7 +77,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic) static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) { ioapic->rtc_status.pending_eoi = 0; - bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_IDS); + bitmap_zero(ioapic->rtc_status.map, KVM_MAX_VCPU_IDS); } static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic); @@ -92,7 +92,7 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) { bool new_val, old_val; struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; - struct dest_map *dest_map = &ioapic->rtc_status.dest_map; + struct rtc_status *status = &ioapic->rtc_status; union kvm_ioapic_redirect_entry *e; e = &ioapic->redirtbl[RTC_GSI]; @@ -102,17 +102,17 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) return; new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector); - old_val = test_bit(vcpu->vcpu_id, dest_map->map); + old_val = test_bit(vcpu->vcpu_id, status->map); if (new_val == old_val) return; if (new_val) { - __set_bit(vcpu->vcpu_id, dest_map->map); - dest_map->vectors[vcpu->vcpu_id] = e->fields.vector; + __set_bit(vcpu->vcpu_id, status->map); + status->vectors[vcpu->vcpu_id] = e->fields.vector; ioapic->rtc_status.pending_eoi++; } else { - __clear_bit(vcpu->vcpu_id, dest_map->map); + __clear_bit(vcpu->vcpu_id, status->map); ioapic->rtc_status.pending_eoi--; rtc_status_pending_eoi_check_valid(ioapic); } @@ -143,13 +143,12 @@ static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic) static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu, int vector) { - struct dest_map *dest_map = &ioapic->rtc_status.dest_map; + struct rtc_status *status = &ioapic->rtc_status; /* RTC special handling */ - if (test_bit(vcpu->vcpu_id, dest_map->map) && - (vector == dest_map->vectors[vcpu->vcpu_id]) && - (test_and_clear_bit(vcpu->vcpu_id, - ioapic->rtc_status.dest_map.map))) { + if (test_bit(vcpu->vcpu_id, status->map) && + (vector == status->vectors[vcpu->vcpu_id]) && + (test_and_clear_bit(vcpu->vcpu_id, status->map))) { --ioapic->rtc_status.pending_eoi; rtc_status_pending_eoi_check_valid(ioapic); } @@ -260,15 +259,15 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr) void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) { struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; - struct dest_map *dest_map = &ioapic->rtc_status.dest_map; + struct rtc_status *status = &ioapic->rtc_status; union kvm_ioapic_redirect_entry *e; int index; spin_lock(&ioapic->lock); /* Make sure we see any missing RTC EOI */ - if (test_bit(vcpu->vcpu_id, dest_map->map)) - __set_bit(dest_map->vectors[vcpu->vcpu_id], + if (test_bit(vcpu->vcpu_id, status->map)) + __set_bit(status->vectors[vcpu->vcpu_id], ioapic_handled_vectors); for (index = 0; index < IOAPIC_NUM_PINS; index++) { @@ -486,7 +485,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) */ BUG_ON(ioapic->rtc_status.pending_eoi != 0); ret = 
__kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, - &ioapic->rtc_status.dest_map); + &ioapic->rtc_status); ioapic->rtc_status.pending_eoi = (ret < 0 ? 0 : ret); } else ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index ad238a6e63dc..868ed593a5c9 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -36,7 +36,9 @@ struct kvm_vcpu; #define RTC_GSI 8 -struct dest_map { +struct rtc_status { + int pending_eoi; + /* vcpu bitmap where IRQ has been sent */ DECLARE_BITMAP(map, KVM_MAX_VCPU_IDS); @@ -47,12 +49,6 @@ struct dest_map { u8 vectors[KVM_MAX_VCPU_IDS]; }; - -struct rtc_status { - int pending_eoi; - struct dest_map dest_map; -}; - union kvm_ioapic_redirect_entry { u64 bits; struct { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 769facb27d3d..0a44765aba12 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -784,15 +784,15 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_find_highest_irr); static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode, - struct dest_map *dest_map); + struct rtc_status *rtc_status); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, - struct dest_map *dest_map) + struct rtc_status *rtc_status) { struct kvm_lapic *apic = vcpu->arch.apic; return __apic_accept_irq(apic, irq->delivery_mode, irq->vector, - irq->level, irq->trig_mode, dest_map); + irq->level, irq->trig_mode, rtc_status); } static int __pv_send_ipi(unsigned long *ipi_bitmap, struct kvm_apic_map *map, @@ -1177,7 +1177,7 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, static bool __kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, - struct dest_map *dest_map) + struct rtc_status *rtc_status) { struct kvm_apic_map *map; unsigned long bitmap; @@ -1192,7 +1192,7 @@ static bool __kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *s *r = 0; return true; } - *r = kvm_apic_set_irq(src->vcpu, irq, dest_map); + *r = kvm_apic_set_irq(src->vcpu, irq, rtc_status); return true; } @@ -1205,7 +1205,7 @@ static bool __kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *s for_each_set_bit(i, &bitmap, 16) { if (!dst[i]) continue; - *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); + *r += kvm_apic_set_irq(dst[i]->vcpu, irq, rtc_status); } } @@ -1293,14 +1293,14 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_intr_is_single_vcpu); int __kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, - struct dest_map *dest_map) + struct rtc_status *rtc_status) { int r = -1; struct kvm_vcpu *vcpu, *lowest = NULL; unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; unsigned int dest_vcpus = 0; - if (__kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) + if (__kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, rtc_status)) return r; if (irq->dest_mode == APIC_DEST_PHYSICAL && @@ -1322,7 +1322,7 @@ int __kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, if (!kvm_lowest_prio_delivery(irq)) { if (r < 0) r = 0; - r += kvm_apic_set_irq(vcpu, irq, dest_map); + r += kvm_apic_set_irq(vcpu, irq, rtc_status); } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) { if (!vector_hashing_enabled) { if (!lowest) @@ -1344,7 +1344,7 @@ int __kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, } if (lowest) - r = kvm_apic_set_irq(lowest, irq, dest_map); + r = kvm_apic_set_irq(lowest, irq, rtc_status); 
return r; } @@ -1355,7 +1355,7 @@ int __kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, */ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode, - struct dest_map *dest_map) + struct rtc_status *rtc_status) { int result = 0; struct kvm_vcpu *vcpu = apic->vcpu; @@ -1376,9 +1376,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, result = 1; - if (dest_map) { - __set_bit(vcpu->vcpu_id, dest_map->map); - dest_map->vectors[vcpu->vcpu_id] = vector; + if (rtc_status) { + __set_bit(vcpu->vcpu_id, rtc_status->map); + rtc_status->vectors[vcpu->vcpu_id] = vector; } if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) { diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 901c05a5ac60..71c80fa020e0 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -88,7 +88,7 @@ struct kvm_lapic { int nr_lvt_entries; }; -struct dest_map; +struct rtc_status; int kvm_create_lapic(struct kvm_vcpu *vcpu); void kvm_free_lapic(struct kvm_vcpu *vcpu); @@ -110,7 +110,7 @@ bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr); void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, - struct dest_map *dest_map); + struct rtc_status *rtc_status); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); void kvm_apic_update_apicv(struct kvm_vcpu *vcpu); int kvm_alloc_apic_access_page(struct kvm *kvm); @@ -120,7 +120,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r); int __kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, - struct dest_map *dest_map); + struct rtc_status *rtc_status); static inline int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, From 59c3e0603d8614e0f5e17089182ca596cf3fc552 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:43:10 -0800 Subject: [PATCH 082/267] KVM: x86: Bury ioapic.h definitions behind CONFIG_KVM_IOAPIC Now that almost everything in ioapic.h is used only by code guarded by CONFIG_KVM_IOAPIC=y, bury (almost) the entire thing behind the Kconfig. 
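The net shape of the header after this change, condensed from the hunk below (only a couple of representative declarations shown):

  /* arch/x86/kvm/ioapic.h, condensed sketch */
  #ifdef CONFIG_KVM_IOAPIC

  struct kvm;
  struct kvm_vcpu;

  void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode);
  /* ... all other declarations used only by in-kernel I/O APIC code ... */

  #endif /* CONFIG_KVM_IOAPIC */

  /* Still visible to common code regardless of the Kconfig. */
  static inline int ioapic_in_kernel(struct kvm *kvm)
  {
  	return irqchip_full(kvm);
  }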
Link: https://patch.msgid.link/20251206004311.479939-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/ioapic.h | 14 +++++++++----- arch/x86/kvm/lapic.c | 2 ++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 868ed593a5c9..3dadae093690 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -6,6 +6,8 @@ #include #include "irq.h" +#ifdef CONFIG_KVM_IOAPIC + struct kvm; struct kvm_vcpu; @@ -99,11 +101,6 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, bool mask); -static inline int ioapic_in_kernel(struct kvm *kvm) -{ - return irqchip_full(kvm); -} - void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode); @@ -116,6 +113,13 @@ void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors); +#endif /* CONFIG_KVM_IOAPIC */ + +static inline int ioapic_in_kernel(struct kvm *kvm) +{ + return irqchip_full(kvm); +} + void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors); void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode, diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0a44765aba12..78c39341b2a5 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1376,10 +1376,12 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, result = 1; +#ifdef CONFIG_KVM_IOAPIC if (rtc_status) { __set_bit(vcpu->vcpu_id, rtc_status->map); rtc_status->vectors[vcpu->vcpu_id] = vector; } +#endif if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) { if (trig_mode) From fd09d259c161086849774f9261cab16c58ba7dff Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:43:11 -0800 Subject: [PATCH 083/267] KVM: x86: Hide KVM_IRQCHIP_KERNEL behind CONFIG_KVM_IOAPIC=y Enumerate KVM_IRQCHIP_KERNEL if and only if support for an in-kernel I/O APIC is enabled, as all usage is likewise guarded by CONFIG_KVM_IOAPIC=y. Link: https://patch.msgid.link/20251206004311.479939-10-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5a3bfa293e8b..8a979c389bc0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1222,7 +1222,9 @@ struct kvm_xen { enum kvm_irqchip_mode { KVM_IRQCHIP_NONE, +#ifdef CONFIG_KVM_IOAPIC KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */ +#endif KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ }; From 60b590de8b30dad8b11e9e4fba0df2eae81afb98 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Thu, 8 Jan 2026 15:46:17 -0600 Subject: [PATCH 084/267] KVM: SVM: Fix a missing kunmap_local() in sev_gmem_post_populate() sev_gmem_post_populate() needs to unmap the target vaddr after copy_from_user() to the vaddr fails. 
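A minimal sketch of the corrected error path (context taken from the hunk below; the surrounding loop, 'src', 'pfn', 'i', 'ret', and the 'err' label all belong to sev_gmem_post_populate()):

  void *vaddr = kmap_local_pfn(pfn + i);

  if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) {
  	kunmap_local(vaddr);	/* drop the local mapping before bailing out */
  	ret = -EFAULT;
  	goto err;
  }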
Fixes: dee5a47cc7a4 ("KVM: SEV: Add KVM_SEV_SNP_LAUNCH_UPDATE command") Signed-off-by: Yan Zhao Signed-off-by: Michael Roth Link: https://patch.msgid.link/20260108214622.1084057-2-michael.roth@amd.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index f59c65abe3cf..261d9ef8631b 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2296,6 +2296,7 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf void *vaddr = kmap_local_pfn(pfn + i); if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) { + kunmap_local(vaddr); ret = -EFAULT; goto err; } From c3a9a27c79e4e5d8bdb20a26d16230111207e98e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 8 Jan 2026 19:45:25 -0800 Subject: [PATCH 085/267] KVM: selftests: Add a test to verify APICv updates (while L2 is active) Add a test to verify KVM correctly handles a variety of edge cases related to APICv updates, and in particular updates that are triggered while L2 is actively running. Reviewed-by: Chao Gao Link: https://patch.msgid.link/20260109034532.1012993-2-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 1 + .../testing/selftests/kvm/include/x86/apic.h | 4 + .../kvm/x86/vmx_apicv_updates_test.c | 155 ++++++++++++++++++ 3 files changed, 160 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index ba5c2b643efa..6f00bd8271c2 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -115,6 +115,7 @@ TEST_GEN_PROGS_x86 += x86/ucna_injection_test TEST_GEN_PROGS_x86 += x86/userspace_io_test TEST_GEN_PROGS_x86 += x86/userspace_msr_exit_test TEST_GEN_PROGS_x86 += x86/vmx_apic_access_test +TEST_GEN_PROGS_x86 += x86/vmx_apicv_updates_test TEST_GEN_PROGS_x86 += x86/vmx_dirty_log_test TEST_GEN_PROGS_x86 += x86/vmx_exception_with_invalid_guest_state TEST_GEN_PROGS_x86 += x86/vmx_msrs_test diff --git a/tools/testing/selftests/kvm/include/x86/apic.h b/tools/testing/selftests/kvm/include/x86/apic.h index 80fe9f69b38d..d42a0998d868 100644 --- a/tools/testing/selftests/kvm/include/x86/apic.h +++ b/tools/testing/selftests/kvm/include/x86/apic.h @@ -32,6 +32,7 @@ #define APIC_SPIV 0xF0 #define APIC_SPIV_FOCUS_DISABLED (1 << 9) #define APIC_SPIV_APIC_ENABLED (1 << 8) +#define APIC_ISR 0x100 #define APIC_IRR 0x200 #define APIC_ICR 0x300 #define APIC_LVTCMCI 0x2f0 @@ -68,6 +69,9 @@ #define APIC_TMCCT 0x390 #define APIC_TDCR 0x3E0 +#define APIC_VECTOR_TO_BIT_NUMBER(v) ((unsigned int)(v) % 32) +#define APIC_VECTOR_TO_REG_OFFSET(v) ((unsigned int)(v) / 32 * 0x10) + void apic_disable(void); void xapic_enable(void); void x2apic_enable(void); diff --git a/tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c b/tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c new file mode 100644 index 000000000000..337c53fddeff --- /dev/null +++ b/tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#define GOOD_IPI_VECTOR 0xe0 +#define BAD_IPI_VECTOR 0xf0 + +static volatile int good_ipis_received; + +static void good_ipi_handler(struct ex_regs *regs) +{ + good_ipis_received++; +} + +static void bad_ipi_handler(struct ex_regs 
*regs) +{ + GUEST_FAIL("Received \"bad\" IPI; ICR MMIO write should have been ignored"); +} + +static void l2_guest_code(void) +{ + x2apic_enable(); + vmcall(); + + xapic_enable(); + xapic_write_reg(APIC_ID, 1 << 24); + vmcall(); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uint32_t control; + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + /* Prepare the VMCS for L2 execution. */ + prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); + control |= CPU_BASED_USE_MSR_BITMAPS; + vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); + + /* Modify APIC ID to coerce KVM into inhibiting APICv. */ + xapic_enable(); + xapic_write_reg(APIC_ID, 1 << 24); + + /* + * Generate+receive an IRQ without doing EOI to get an IRQ set in vISR + * but not SVI. APICv should be inhibited due to running with a + * modified APIC ID. + */ + xapic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_DM_FIXED | GOOD_IPI_VECTOR); + GUEST_ASSERT_EQ(xapic_read_reg(APIC_ID), 1 << 24); + + /* Enable IRQs and verify the IRQ was received. */ + sti_nop(); + GUEST_ASSERT_EQ(good_ipis_received, 1); + + /* + * Run L2 to switch to x2APIC mode, which in turn will uninhibit APICv, + * as KVM should force the APIC ID back to its default. + */ + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + vmreadz(VM_EXIT_INSTRUCTION_LEN)); + GUEST_ASSERT(rdmsr(MSR_IA32_APICBASE) & MSR_IA32_APICBASE_EXTD); + + /* + * Scribble the APIC access page to verify KVM disabled xAPIC + * virtualization in vmcs01, and to verify that KVM flushes L1's TLB + * when L2 switches back to accelerated xAPIC mode. + */ + xapic_write_reg(APIC_ICR2, 0xdeadbeefu); + xapic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_DM_FIXED | BAD_IPI_VECTOR); + + /* + * Verify the IRQ is still in-service and emit an EOI to verify KVM + * propagates the highest vISR vector to SVI when APICv is activated + * (and does so even if APICv was uninhibited while L2 was active). + */ + GUEST_ASSERT_EQ(x2apic_read_reg(APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(GOOD_IPI_VECTOR)), + BIT(APIC_VECTOR_TO_BIT_NUMBER(GOOD_IPI_VECTOR))); + x2apic_write_reg(APIC_EOI, 0); + GUEST_ASSERT_EQ(x2apic_read_reg(APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(GOOD_IPI_VECTOR)), 0); + + /* + * Run L2 one more time to switch back to xAPIC mode to verify that KVM + * handles the x2APIC => xAPIC transition and inhibits APICv while L2 + * is active. + */ + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_ASSERT(!(rdmsr(MSR_IA32_APICBASE) & MSR_IA32_APICBASE_EXTD)); + + xapic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_DM_FIXED | GOOD_IPI_VECTOR); + /* Re-enable IRQs, as VM-Exit clears RFLAGS.IF. 
*/ + sti_nop(); + GUEST_ASSERT_EQ(good_ipis_received, 2); + + GUEST_ASSERT_EQ(xapic_read_reg(APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(GOOD_IPI_VECTOR)), + BIT(APIC_VECTOR_TO_BIT_NUMBER(GOOD_IPI_VECTOR))); + xapic_write_reg(APIC_EOI, 0); + GUEST_ASSERT_EQ(xapic_read_reg(APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(GOOD_IPI_VECTOR)), 0); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva; + struct vmx_pages *vmx; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + + vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); + prepare_virtualize_apic_accesses(vmx, vm); + vcpu_args_set(vcpu, 1, vmx_pages_gva); + + virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); + vm_install_exception_handler(vm, BAD_IPI_VECTOR, bad_ipi_handler); + vm_install_exception_handler(vm, GOOD_IPI_VECTOR, good_ipi_handler); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_DONE: + break; + default: + TEST_FAIL("Unexpected ucall %lu", uc.cmd); + } + + /* + * Verify at least two IRQs were injected. Unfortunately, KVM counts + * re-injected IRQs (e.g. if delivering the IRQ hits an EPT violation), + * so being more precise isn't possible given the current stats. + */ + TEST_ASSERT(vcpu_get_stat(vcpu, irq_injections) >= 2, + "Wanted at least 2 IRQ injections, got %lu\n", + vcpu_get_stat(vcpu, irq_injections)); + + kvm_vm_free(vm); + return 0; +} From 3e013d0a70994df2c2ba78d599a6c039ab0977a5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 8 Jan 2026 19:45:26 -0800 Subject: [PATCH 086/267] KVM: nVMX: Switch to vmcs01 to update PML controls on-demand if L2 is active If KVM toggles "CPU dirty logging", a.k.a. Page-Modification Logging (PML), while L2 is active, temporarily load vmcs01 and immediately update the relevant controls instead of deferring the update until the next nested VM-Exit. For PML, deferring the update is relatively straightforward, but for several APICv related updates, deferring updates creates ordering and state consistency problems, e.g. KVM at-large thinks APICv is enabled, but vmcs01 is still running with stale (and effectively unknown) state. Convert PML first precisely because it's the simplest case to handle: if something is broken with the vmcs01 <=> vmcs02 dance, then hopefully bugs will bisect here. 
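The mechanism used here (and by the following patches in this run) is the kernel's scope-based cleanup support from <linux/cleanup.h>: DEFINE_GUARD(name, type, lock, unlock) pairs an "acquire" action with a "release" action, and guard(name)(arg) runs the former immediately and the latter automatically at scope exit. A minimal usage sketch, based on the hunk below:

  DEFINE_GUARD(vmx_vmcs01, struct kvm_vcpu *,
  	     vmx_load_vmcs01(_T), vmx_put_vmcs01(_T))

  void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
  {
  	/* Switches to vmcs01 here; a no-op if L2 isn't active. */
  	guard(vmx_vmcs01)(vcpu);

  	/*
  	 * vmcs01 is guaranteed to be the current VMCS for the rest of this
  	 * scope; vmcs02 is re-loaded automatically on return if L2 is active.
  	 */
  }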
Reviewed-by: Chao Gao Link: https://patch.msgid.link/20260109034532.1012993-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 5 ----- arch/x86/kvm/vmx/vmx.c | 40 +++++++++++++++++++++++++++++++++++---- arch/x86/kvm/vmx/vmx.h | 1 - 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 6dd1da94f00c..e8adff614bd5 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5131,11 +5131,6 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx_set_virtual_apic_mode(vcpu); } - if (vmx->nested.update_vmcs01_cpu_dirty_logging) { - vmx->nested.update_vmcs01_cpu_dirty_logging = false; - vmx_update_cpu_dirty_logging(vcpu); - } - nested_put_vmcs12_pages(vcpu); if (vmx->nested.reload_vmcs01_apic_access_page) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1f4d15cc898e..3aa0e055e2b0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1594,6 +1594,41 @@ void vmx_vcpu_put(struct kvm_vcpu *vcpu) vmx_prepare_switch_to_host(to_vmx(vcpu)); } +static void vmx_switch_loaded_vmcs(struct kvm_vcpu *vcpu, + struct loaded_vmcs *vmcs) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int cpu; + + cpu = get_cpu(); + vmx->loaded_vmcs = vmcs; + vmx_vcpu_load_vmcs(vcpu, cpu); + put_cpu(); +} + +static void vmx_load_vmcs01(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!is_guest_mode(vcpu)) { + WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); + return; + } + + WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->nested.vmcs02); + vmx_switch_loaded_vmcs(vcpu, &vmx->vmcs01); +} + +static void vmx_put_vmcs01(struct kvm_vcpu *vcpu) +{ + if (!is_guest_mode(vcpu)) + return; + + vmx_switch_loaded_vmcs(vcpu, &to_vmx(vcpu)->nested.vmcs02); +} +DEFINE_GUARD(vmx_vmcs01, struct kvm_vcpu *, + vmx_load_vmcs01(_T), vmx_put_vmcs01(_T)) + bool vmx_emulation_required(struct kvm_vcpu *vcpu) { return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu); @@ -8276,10 +8311,7 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) if (WARN_ON_ONCE(!enable_pml)) return; - if (is_guest_mode(vcpu)) { - vmx->nested.update_vmcs01_cpu_dirty_logging = true; - return; - } + guard(vmx_vmcs01)(vcpu); /* * Note, nr_memslots_dirty_logging can be changed concurrent with this diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index bc3ed3145d7e..b44eda6225f4 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -133,7 +133,6 @@ struct nested_vmx { bool change_vmcs01_virtual_apic_mode; bool reload_vmcs01_apic_access_page; - bool update_vmcs01_cpu_dirty_logging; bool update_vmcs01_apicv_status; bool update_vmcs01_hwapic_isr; From 51ca2746078ef9390db91251805e54eff3601d63 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 8 Jan 2026 19:45:27 -0800 Subject: [PATCH 087/267] KVM: nVMX: Switch to vmcs01 to update TPR threshold on-demand if L2 is active If KVM updates L1's TPR Threshold while L2 is active, temporarily load vmcs01 and immediately update TPR_THRESHOLD instead of deferring the update until the next nested VM-Exit. Deferring the TPR Threshold update is relatively straightforward, but for several APICv related updates, deferring updates creates ordering and state consistency problems, e.g. KVM at-large thinks APICv is enabled, but vmcs01 is still running with stale (and effectively unknown) state. 
Reviewed-by: Chao Gao Link: https://patch.msgid.link/20260109034532.1012993-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 4 ---- arch/x86/kvm/vmx/vmx.c | 7 +++---- arch/x86/kvm/vmx/vmx.h | 3 --- 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e8adff614bd5..b8c07fbc024e 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2402,7 +2402,6 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 exec_control &= ~CPU_BASED_TPR_SHADOW; exec_control |= vmcs12->cpu_based_vm_exec_control; - vmx->nested.l1_tpr_threshold = -1; if (exec_control & CPU_BASED_TPR_SHADOW) vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); #ifdef CONFIG_X86_64 @@ -5123,9 +5122,6 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, if (kvm_caps.has_tsc_control) vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); - if (vmx->nested.l1_tpr_threshold != -1) - vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); - if (vmx->nested.change_vmcs01_virtual_apic_mode) { vmx->nested.change_vmcs01_virtual_apic_mode = false; vmx_set_virtual_apic_mode(vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3aa0e055e2b0..074c9d01e7b3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6836,11 +6836,10 @@ void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) return; + guard(vmx_vmcs01)(vcpu); + tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr; - if (is_guest_mode(vcpu)) - to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold; - else - vmcs_write32(TPR_THRESHOLD, tpr_threshold); + vmcs_write32(TPR_THRESHOLD, tpr_threshold); } void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index b44eda6225f4..36f48c4b39c0 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -184,9 +184,6 @@ struct nested_vmx { u64 pre_vmenter_ssp; u64 pre_vmenter_ssp_tbl; - /* to migrate it to L1 if L2 writes to L1's CR8 directly */ - int l1_tpr_threshold; - u16 vpid02; u16 last_vpid; From f0044429b257c015f95b8e110c652446d4fcfe4c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 8 Jan 2026 19:45:28 -0800 Subject: [PATCH 088/267] KVM: nVMX: Switch to vmcs01 to update SVI on-demand if L2 is active If APICv is activated while L2 is running and triggers an SVI update, temporarily load vmcs01 and immediately update SVI instead of deferring the update until the next nested VM-Exit. This will eventually allow killing off kvm_apic_update_hwapic_isr(), and all of nVMX's deferred APICv updates. 
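For context on the SVI manipulation in the hunk below: the VMCS guest interrupt status is a 16-bit field with RVI (requesting virtual interrupt) in the low byte and SVI (servicing virtual interrupt) in the high byte, so updating SVI boils down to rewriting bits 15:8. A rough sketch of the update done by vmx_hwapic_isr_update(), using the same variables as the hunk:

  u16 status = vmcs_read16(GUEST_INTR_STATUS);
  u8 old = status >> 8;			/* currently in-service vector (SVI) */

  if (max_isr != old) {
  	status &= 0xff;			/* preserve RVI */
  	status |= max_isr << 8;		/* install the new SVI */
  	vmcs_write16(GUEST_INTR_STATUS, status);
  }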
Reviewed-by: Chao Gao Link: https://patch.msgid.link/20260109034532.1012993-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 5 ----- arch/x86/kvm/vmx/vmx.c | 19 +++++++------------ arch/x86/kvm/vmx/vmx.h | 1 - 3 files changed, 7 insertions(+), 18 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b8c07fbc024e..a24a6f5f1a06 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5139,11 +5139,6 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx_refresh_apicv_exec_ctrl(vcpu); } - if (vmx->nested.update_vmcs01_hwapic_isr) { - vmx->nested.update_vmcs01_hwapic_isr = false; - kvm_apic_update_hwapic_isr(vcpu); - } - if ((vm_exit_reason != -1) && (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) vmx->nested.need_vmcs12_to_shadow_sync = true; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 074c9d01e7b3..5d3746c49c4a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6972,21 +6972,16 @@ void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) u16 status; u8 old; - /* - * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI - * is only relevant for if and only if Virtual Interrupt Delivery is - * enabled in vmcs12, and if VID is enabled then L2 EOIs affect L2's - * vAPIC, not L1's vAPIC. KVM must update vmcs01 on the next nested - * VM-Exit, otherwise L1 with run with a stale SVI. - */ - if (is_guest_mode(vcpu)) { - to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true; - return; - } - if (max_isr == -1) max_isr = 0; + /* + * Always update SVI in vmcs01, as SVI is only relevant for L2 if and + * only if Virtual Interrupt Delivery is enabled in vmcs12, and if VID + * is enabled then L2 EOIs affect L2's vAPIC, not L1's vAPIC. + */ + guard(vmx_vmcs01)(vcpu); + status = vmcs_read16(GUEST_INTR_STATUS); old = status >> 8; if (max_isr != old) { diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 36f48c4b39c0..53969e49d9d1 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -134,7 +134,6 @@ struct nested_vmx { bool change_vmcs01_virtual_apic_mode; bool reload_vmcs01_apic_access_page; bool update_vmcs01_apicv_status; - bool update_vmcs01_hwapic_isr; /* * Enlightened VMCS has been enabled. It does not mean that L1 has to From 2bf889a68fbab33133ef9ec1000399913b2c65c4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 8 Jan 2026 19:45:29 -0800 Subject: [PATCH 089/267] KVM: nVMX: Switch to vmcs01 to refresh APICv controls on-demand if L2 is active If APICv is (un)inhibited while L2 is running, temporarily load vmcs01 and immediately refresh the APICv controls in vmcs01 instead of deferring the update until the next nested VM-Exit. This all but eliminates potential ordering issues due to vmcs01 not being synchronized with kvm_lapic.apicv_active, e.g. where KVM _thinks_ it refreshed APICv, but vmcs01 still contains stale state. 
Reviewed-by: Chao Gao Link: https://patch.msgid.link/20260109034532.1012993-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 5 ----- arch/x86/kvm/vmx/vmx.c | 5 +---- arch/x86/kvm/vmx/vmx.h | 1 - 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index a24a6f5f1a06..b141bc1fb247 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5134,11 +5134,6 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); } - if (vmx->nested.update_vmcs01_apicv_status) { - vmx->nested.update_vmcs01_apicv_status = false; - vmx_refresh_apicv_exec_ctrl(vcpu); - } - if ((vm_exit_reason != -1) && (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) vmx->nested.need_vmcs12_to_shadow_sync = true; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5d3746c49c4a..e7bab908cf6c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4578,10 +4578,7 @@ void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (is_guest_mode(vcpu)) { - vmx->nested.update_vmcs01_apicv_status = true; - return; - } + guard(vmx_vmcs01)(vcpu); pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 53969e49d9d1..dfc9766a7fa3 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -133,7 +133,6 @@ struct nested_vmx { bool change_vmcs01_virtual_apic_mode; bool reload_vmcs01_apic_access_page; - bool update_vmcs01_apicv_status; /* * Enlightened VMCS has been enabled. It does not mean that L1 has to From 51c821d6d0ba038506d8b1c522f0b2b0ed756dd3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 8 Jan 2026 19:45:30 -0800 Subject: [PATCH 090/267] KVM: nVMX: Switch to vmcs01 to update APIC page on-demand if L2 is active If the KVM-owned APIC-access page is migrated while L2 is running, temporarily load vmcs01 and immediately update APIC_ACCESS_ADDR instead of deferring the update until the next nested VM-Exit. Once changing the virtual APIC mode is converted to always do on-demand updates, all of the "defer until vmcs01 is active" logic will be gone. Reviewed-by: Chao Gao Link: https://patch.msgid.link/20260109034532.1012993-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 5 ----- arch/x86/kvm/vmx/vmx.c | 7 ++----- arch/x86/kvm/vmx/vmx.h | 1 - 3 files changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b141bc1fb247..9354d2017fd9 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5129,11 +5129,6 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, nested_put_vmcs12_pages(vcpu); - if (vmx->nested.reload_vmcs01_apic_access_page) { - vmx->nested.reload_vmcs01_apic_access_page = false; - kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - } - if ((vm_exit_reason != -1) && (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) vmx->nested.need_vmcs12_to_shadow_sync = true; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e7bab908cf6c..a8ddf0266072 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6904,11 +6904,8 @@ void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) kvm_pfn_t pfn; bool writable; - /* Defer reload until vmcs01 is the current VMCS. 
*/ - if (is_guest_mode(vcpu)) { - to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true; - return; - } + /* Note, the VIRTUALIZE_APIC_ACCESSES check needs to query vmcs01. */ + guard(vmx_vmcs01)(vcpu); if (!(secondary_exec_controls_get(to_vmx(vcpu)) & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index dfc9766a7fa3..078bc6fef7e6 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -132,7 +132,6 @@ struct nested_vmx { bool vmcs02_initialized; bool change_vmcs01_virtual_apic_mode; - bool reload_vmcs01_apic_access_page; /* * Enlightened VMCS has been enabled. It does not mean that L1 has to From 249cc1ab4b9a5caa63d7e9c5a5b7862046089dd4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 8 Jan 2026 19:45:31 -0800 Subject: [PATCH 091/267] KVM: nVMX: Switch to vmcs01 to set virtual APICv mode on-demand if L2 is active If L1's virtual APIC mode changes while L2 is active, e.g. because L1 doesn't intercept writes to the APIC_BASE MSR and L2 changes the mode, temporarily load vmcs01 and do all of the necessary actions instead of deferring the update until the next nested VM-Exit. This will help in fixing yet more issues related to updates while L2 is active, e.g. KVM neglects to update vmcs02 MSR intercepts if vmcs01's MSR intercepts are modified while L2 is active. Not updating x2APIC MSRs is benign because vmcs01's settings are not factored into vmcs02's bitmap, but deferring the x2APIC MSR updates would create a weird, inconsistent state. Reviewed-by: Chao Gao Link: https://patch.msgid.link/20260109034532.1012993-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 5 ----- arch/x86/kvm/vmx/vmx.c | 17 +++++++++++------ arch/x86/kvm/vmx/vmx.h | 2 -- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 9354d2017fd9..685f016a2233 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5122,11 +5122,6 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, if (kvm_caps.has_tsc_control) vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); - if (vmx->nested.change_vmcs01_virtual_apic_mode) { - vmx->nested.change_vmcs01_virtual_apic_mode = false; - vmx_set_virtual_apic_mode(vcpu); - } - nested_put_vmcs12_pages(vcpu); if ((vm_exit_reason != -1) && diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a8ddf0266072..0d9897f79b71 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6851,11 +6851,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) !cpu_has_vmx_virtualize_x2apic_mode()) return; - /* Postpone execution until vmcs01 is the current VMCS. */ - if (is_guest_mode(vcpu)) { - vmx->nested.change_vmcs01_virtual_apic_mode = true; - return; - } + guard(vmx_vmcs01)(vcpu); sec_exec_control = secondary_exec_controls_get(vmx); sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | @@ -6878,8 +6874,17 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) * only do so if its physical address has changed, but * the guest may have inserted a non-APIC mapping into * the TLB while the APIC access page was disabled. + * + * If L2 is active, immediately flush L1's TLB instead + * of requesting a flush of the current TLB, because + * the current TLB context is L2's. 
*/ - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + if (!is_guest_mode(vcpu)) + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + else if (!enable_ept) + vpid_sync_context(vmx->vpid); + else if (VALID_PAGE(vcpu->arch.root_mmu.root.hpa)) + vmx_flush_tlb_ept_root(vcpu->arch.root_mmu.root.hpa); } break; case LAPIC_MODE_X2APIC: diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 078bc6fef7e6..a926ce43ad40 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -131,8 +131,6 @@ struct nested_vmx { */ bool vmcs02_initialized; - bool change_vmcs01_virtual_apic_mode; - /* * Enlightened VMCS has been enabled. It does not mean that L1 has to * use it. However, VMX features available to L1 will be limited based From 000d75b0b18622e7454c3955631a3cf39e0353e7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 8 Jan 2026 19:45:32 -0800 Subject: [PATCH 092/267] KVM: x86: Update APICv ISR (a.k.a. SVI) as part of kvm_apic_update_apicv() Fold the calls to .hwapic_isr_update() in kvm_apic_set_state(), kvm_lapic_reset(), and __kvm_vcpu_update_apicv() into kvm_apic_update_apicv(), as updating SVI is directly related to updating KVM's own cache of ISR information, e.g. SVI is more or less the APICv equivalent of highest_isr_cache. Note, calling .hwapic_isr_update() during kvm_apic_update_apicv() has benign side effects, as doing so changes the order of the calls in kvm_lapic_reset() and kvm_apic_set_state(), specifically with respect to the order between .hwapic_isr_update() and .apicv_post_state_restore(). However, the changes in ordering are glorified nops as the former hook is VMX-only and the latter is SVM-only. Reviewed-by: Chao Gao Link: https://patch.msgid.link/20260109034532.1012993-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 31 ++++++++++++------------------- arch/x86/kvm/lapic.h | 1 - arch/x86/kvm/x86.c | 7 ------- 3 files changed, 12 insertions(+), 27 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 78c39341b2a5..2e513f1c8988 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -760,17 +760,6 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) } } -void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (WARN_ON_ONCE(!lapic_in_kernel(vcpu)) || !apic->apicv_active) - return; - - kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_update_hwapic_isr); - int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { /* This may race with setting of irr in __apic_accept_irq() and @@ -2783,10 +2772,18 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) */ apic->irr_pending = true; - if (apic->apicv_active) + /* + * Update SVI when APICv gets enabled, otherwise SVI won't reflect the + * highest bit in vISR and the next accelerated EOI in the guest won't + * be virtualized correctly (the CPU uses SVI to determine which vISR + * vector to clear).
+ */ + if (apic->apicv_active) { apic->isr_count = 1; - else + kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); + } else { apic->isr_count = count_vectors(apic->regs + APIC_ISR); + } apic->highest_isr_cache = -1; } @@ -2914,10 +2911,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); - if (apic->apicv_active) { + if (apic->apicv_active) kvm_x86_call(apicv_post_state_restore)(vcpu); - kvm_x86_call(hwapic_isr_update)(vcpu, -1); - } vcpu->arch.apic_arb_prio = 0; vcpu->arch.apic_attention = 0; @@ -3228,10 +3223,8 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) __start_apic_timer(apic, APIC_TMCCT); kvm_lapic_set_reg(apic, APIC_TMCCT, 0); kvm_apic_update_apicv(vcpu); - if (apic->apicv_active) { + if (apic->apicv_active) kvm_x86_call(apicv_post_state_restore)(vcpu); - kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); - } kvm_make_request(KVM_REQ_EVENT, vcpu); #ifdef CONFIG_KVM_IOAPIC diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 71c80fa020e0..adf04a9bd57d 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -134,7 +134,6 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high); int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated); int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); -void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0c1933a303ca..bf8059179edb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10886,16 +10886,9 @@ void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was * still active when the interrupt got accepted. Make sure * kvm_check_and_inject_events() is called to check for that. - * - * Update SVI when APICv gets enabled, otherwise SVI won't reflect the - * highest bit in vISR and the next accelerated EOI in the guest won't - * be virtualized correctly (the CPU uses SVI to determine which vISR - * vector to clear). */ if (!apic->apicv_active) kvm_make_request(KVM_REQ_EVENT, vcpu); - else - kvm_apic_update_hwapic_isr(vcpu); out: preempt_enable(); From 9587dd7a7ebd7be3c36815a4c4f90f7e2cedbe03 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 8 Jan 2026 19:31:00 -0800 Subject: [PATCH 093/267] KVM: SVM: Drop the module param to control SEV-ES DebugSwap Rip out the DebugSwap module param, as the sequence of events that led to its inclusion was one big mistake, the param no longer serves any purpose. Commit d1f85fbe836e ("KVM: SEV: Enable data breakpoints in SEV-ES") goofed by not adding a way for the userspace VMM to control the feature. Functionally, that was fine, but it broke attestation signatures because SEV_FEATURES are included in the signature. Commit 5abf6dceb066 ("SEV: disable SEV-ES DebugSwap by default") fixed that issue, but the underlying flaw of userspace not having a way to control SEV_FEATURES was still there. That flaw was addressed by commit 4f5defae7089 ("KVM: SEV: introduce KVM_SEV_INIT2 operation"), and so then 4dd5ecacb9a4 ("KVM: SEV: allow SEV-ES DebugSwap again") re-enabled DebugSwap by default. Now that the dust has settled, the module param doesn't serve any meaningful purpose. 
Cc: Tom Lendacky Reviewed-by: Tom Lendacky Link: https://patch.msgid.link/20260109033101.1005769-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index f59c65abe3cf..9b92f0cccfe6 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -53,9 +53,6 @@ module_param_named(sev_es, sev_es_enabled, bool, 0444); static bool sev_snp_enabled = true; module_param_named(sev_snp, sev_snp_enabled, bool, 0444); -/* enable/disable SEV-ES DebugSwap support */ -static bool sev_es_debug_swap_enabled = true; -module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444); static u64 sev_supported_vmsa_features; static unsigned int nr_ciphertext_hiding_asids; @@ -3150,12 +3147,10 @@ out: sev_es_enabled = sev_es_supported; sev_snp_enabled = sev_snp_supported; - if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) || - !cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP)) - sev_es_debug_swap_enabled = false; - sev_supported_vmsa_features = 0; - if (sev_es_debug_swap_enabled) + + if (sev_es_enabled && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) && + cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP)) sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP; if (sev_snp_enabled && tsc_khz && cpu_feature_enabled(X86_FEATURE_SNP_SECURE_TSC)) From d23051f59a5b4eb1f6163cf27e07b8cfcaeb4758 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 8 Jan 2026 19:31:01 -0800 Subject: [PATCH 094/267] KVM: SVM: Tag sev_supported_vmsa_features as read-only after init Tag sev_supported_vmsa_features with __ro_after_init as it's configured by sev_hardware_setup() and never written after initial configuration (and if it were, that'd be a blatant bug). Opportunistically relocate the variable out of the module params area now that sev_es_debug_swap_enabled is gone (which largely motivated its original location). Reviewed-by: Tom Lendacky Link: https://patch.msgid.link/20260109033101.1005769-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 9b92f0cccfe6..28150506b18c 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -53,8 +53,6 @@ module_param_named(sev_es, sev_es_enabled, bool, 0444); static bool sev_snp_enabled = true; module_param_named(sev_snp, sev_snp_enabled, bool, 0444); -static u64 sev_supported_vmsa_features; - static unsigned int nr_ciphertext_hiding_asids; module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 0444); @@ -81,6 +79,8 @@ module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 04 static u64 snp_supported_policy_bits __ro_after_init; +static u64 sev_supported_vmsa_features __ro_after_init; + #define INITIAL_VMSA_GPA 0xFFFFFFFFF000 static u8 sev_enc_bit; From ead63640d4e72e6f6d464f4e31f7fecb79af8869 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 8 Jan 2026 19:06:57 -0800 Subject: [PATCH 095/267] KVM: x86: Ignore -EBUSY when checking nested events from vcpu_block() Ignore -EBUSY when checking nested events after exiting a blocking state while L2 is active, as exiting to userspace will generate a spurious userspace exit, usually with KVM_EXIT_UNKNOWN, and likely lead to the VM's demise. 
Continuing with the wakeup isn't perfect either, as *something* has gone sideways if a vCPU is awakened in L2 with an injected event (or worse, a nested run pending), but continuing on gives the VM a decent chance of surviving without any major side effects. As explained in the Fixes commits, it _should_ be impossible for a vCPU to be put into a blocking state with an already-injected event (exception, IRQ, or NMI). Unfortunately, userspace can stuff MP_STATE and/or injected events, and thus put the vCPU into what should be an impossible state. Don't bother trying to preserve the WARN, e.g. with an anti-syzkaller Kconfig, as WARNs can (hopefully) be added in paths where _KVM_ would be violating x86 architecture, e.g. by WARNing if KVM attempts to inject an exception or interrupt while the vCPU isn't running. Cc: Alessandro Ratti Cc: stable@vger.kernel.org Fixes: 26844fee6ade ("KVM: x86: never write to memory from kvm_vcpu_check_block()") Fixes: 45405155d876 ("KVM: x86: WARN if a vCPU gets a valid wakeup that KVM can't yet inject") Link: https://syzkaller.appspot.com/text?tag=ReproC&x=10d4261a580000 Reported-by: syzbot+1522459a74d26b0ac33a@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/671bc7a7.050a0220.455e8.022a.GAE@google.com Link: https://patch.msgid.link/20260109030657.994759-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e4418409b468..fe9d324da72a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11597,8 +11597,7 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu)) { int r = kvm_check_nested_events(vcpu); - WARN_ON_ONCE(r == -EBUSY); - if (r < 0) + if (r < 0 && r != -EBUSY) return 0; } From de0dc71188ca54cfe13fac5c4334715fb37fe8ce Mon Sep 17 00:00:00 2001 From: Jun Miao Date: Mon, 5 Jan 2026 14:54:23 +0800 Subject: [PATCH 096/267] KVM: x86: align the code with kvm_x86_call() static_call_cond() is now essentially the same as static_call() on x86 (static_call() handles a NULL pointer as a NOP), so convert the remaining static_call_cond() user to kvm_x86_call() to improve code readability and keep the code style consistent. Fixes: 8d032b683c29 ("KVM: TDX: create/destroy VM structure") Link: https://lore.kernel.org/all/3916caa1dcd114301a49beafa5030eca396745c1.1679456900.git.jpoimboe@kernel.org/ Link: https://lore.kernel.org/r/20240507133103.15052-3-wei.w.wang@intel.com Signed-off-by: Jun Miao Link: https://patch.msgid.link/20260105065423.1870622-1-jun.miao@intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fe9d324da72a..386cdb775fd4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13307,7 +13307,7 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm) #endif kvm_mmu_pre_destroy_vm(kvm); - static_call_cond(kvm_x86_vm_pre_destroy)(kvm); + kvm_x86_call(vm_pre_destroy)(kvm); } void kvm_arch_destroy_vm(struct kvm *kvm) From 217463aa329ea9a2efafd1bbfa6787e8df9091b9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 13:13:40 -0800 Subject: [PATCH 097/267] KVM: SVM: Add a helper to detect VMRUN failures Add a helper to detect VMRUN failures so that KVM can guard against its own long-standing bug, where KVM neglects to set exitcode[63:32] when synthesizing a nested VMEXIT_INVALID VM-Exit.
This will allow fixing KVM's mess of treating exitcode as two separate 32-bit values without breaking KVM-on-KVM when running on an older, unfixed KVM. Cc: Jim Mattson Cc: Yosry Ahmed Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230211347.4099600-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 16 +++++++--------- arch/x86/kvm/svm/svm.c | 4 ++-- arch/x86/kvm/svm/svm.h | 5 +++++ 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 5b741f8ed170..666b5a36c15d 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1167,7 +1167,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.exit_info_1 = vmcb02->control.exit_info_1; vmcb12->control.exit_info_2 = vmcb02->control.exit_info_2; - if (vmcb12->control.exit_code != SVM_EXIT_ERR) + if (!svm_is_vmrun_failure(vmcb12->control.exit_code)) nested_save_pending_event_to_vmcb12(svm, vmcb12); if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) @@ -1463,6 +1463,9 @@ static int nested_svm_intercept(struct vcpu_svm *svm) u32 exit_code = svm->vmcb->control.exit_code; int vmexit = NESTED_EXIT_HOST; + if (svm_is_vmrun_failure(exit_code)) + return NESTED_EXIT_DONE; + switch (exit_code) { case SVM_EXIT_MSR: vmexit = nested_svm_exit_handled_msr(svm); @@ -1470,7 +1473,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm) case SVM_EXIT_IOIO: vmexit = nested_svm_intercept_ioio(svm); break; - case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { + case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: /* * Host-intercepted exceptions have been checked already in * nested_svm_exit_special. There is nothing to do here, @@ -1478,15 +1481,10 @@ static int nested_svm_intercept(struct vcpu_svm *svm) */ vmexit = NESTED_EXIT_DONE; break; - } - case SVM_EXIT_ERR: { - vmexit = NESTED_EXIT_DONE; - break; - } - default: { + default: if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; - } + break; } return vmexit; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c7bd78f5a2c7..e20b40f346af 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3564,7 +3564,7 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return 1; } - if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { + if (svm_is_vmrun_failure(svm->vmcb->control.exit_code)) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason = svm->vmcb->control.exit_code; @@ -4346,7 +4346,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) /* Track VMRUNs that have made past consistency checking */ if (svm->nested.nested_run_pending && - svm->vmcb->control.exit_code != SVM_EXIT_ERR) + !svm_is_vmrun_failure(svm->vmcb->control.exit_code)) ++vcpu->stat.nested_run; svm->nested.nested_run_pending = 0; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 7d28a739865f..3360ac36e071 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -425,6 +425,11 @@ static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_svm, vcpu); } +static inline bool svm_is_vmrun_failure(u64 exit_code) +{ + return (u32)exit_code == (u32)SVM_EXIT_ERR; +} + /* * Only the PDPTRs are loaded on demand into the shadow MMU. All other * fields are synchronized on VM-Exit, because accessing the VMCB is cheap. 
From 2450c9774510e45c506df4a1b46d129435993ff6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 13:13:41 -0800 Subject: [PATCH 098/267] KVM: SVM: Open code handling of unexpected exits in svm_invoke_exit_handler() Fold svm_check_exit_valid() and svm_handle_invalid_exit() into their sole caller, svm_invoke_exit_handler(), as having tiny single-use helpers makes the code unnecessarily difficult to follow. This will also allow for additional cleanups in svm_invoke_exit_handler(). No functional change intended. Suggested-by: Yosry Ahmed Reviewed-by: Yosry Ahmed Reviewed-by: Pankaj Gupta Link: https://patch.msgid.link/20251230211347.4099600-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e20b40f346af..ddb07c6408de 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3467,23 +3467,13 @@ no_vmsa: sev_free_decrypted_vmsa(vcpu, save); } -static bool svm_check_exit_valid(u64 exit_code) -{ - return (exit_code < ARRAY_SIZE(svm_exit_handlers) && - svm_exit_handlers[exit_code]); -} - -static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) -{ - dump_vmcb(vcpu); - kvm_prepare_unexpected_reason_exit(vcpu, exit_code); - return 0; -} - int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) { - if (!svm_check_exit_valid(exit_code)) - return svm_handle_invalid_exit(vcpu, exit_code); + if (exit_code >= ARRAY_SIZE(svm_exit_handlers)) + goto unexpected_vmexit; + + if (!svm_exit_handlers[exit_code]) + goto unexpected_vmexit; #ifdef CONFIG_MITIGATION_RETPOLINE if (exit_code == SVM_EXIT_MSR) return msr_interception(vcpu); @@ -3502,6 +3492,11 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) #endif #endif return svm_exit_handlers[exit_code](vcpu); + +unexpected_vmexit: + dump_vmcb(vcpu); + kvm_prepare_unexpected_reason_exit(vcpu, exit_code); + return 0; } static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, From 194c17bf5ebadd2fcf52ac641793e3d755a7af55 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 13:13:42 -0800 Subject: [PATCH 099/267] KVM: SVM: Check for an unexpected VM-Exit after RETPOLINE "fast" handling Check for an unexpected/unhandled VM-Exit after the manual RETPOLINE=y handling. The entire point of the RETPOLINE checks is to optimize for common VM-Exits, i.e. checking for the rare case of an unsupported VM-Exit is counter-productive. This also aligns SVM and VMX exit handling. No functional change intended.
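As a side note on the svm_is_vmrun_failure() helper that this run of patches builds on: SVM_EXIT_ERR is -1 (the APM's VMEXIT_INVALID), so comparing only the low 32 bits matches both a properly all-ones exit code and the truncated value an older KVM may have synthesized. A quick numeric illustration (example values only):

  /* Properly encoded VMRUN failure, all 64 bits set. */
  svm_is_vmrun_failure(0xffffffffffffffffull);	/* true */

  /* Synthesized by an older KVM that left bits 63:32 clear. */
  svm_is_vmrun_failure(0x00000000ffffffffull);	/* true */

  /* A normal exit code, e.g. SVM_EXIT_MSR (0x7c). */
  svm_is_vmrun_failure(0x7c);			/* false */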
Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230211347.4099600-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ddb07c6408de..d2f997965a96 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3469,12 +3469,6 @@ no_vmsa: int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) { - if (exit_code >= ARRAY_SIZE(svm_exit_handlers)) - goto unexpected_vmexit; - - if (!svm_exit_handlers[exit_code]) - goto unexpected_vmexit; - #ifdef CONFIG_MITIGATION_RETPOLINE if (exit_code == SVM_EXIT_MSR) return msr_interception(vcpu); @@ -3491,6 +3485,12 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) return sev_handle_vmgexit(vcpu); #endif #endif + if (exit_code >= ARRAY_SIZE(svm_exit_handlers)) + goto unexpected_vmexit; + + if (!svm_exit_handlers[exit_code]) + goto unexpected_vmexit; + return svm_exit_handlers[exit_code](vcpu); unexpected_vmexit: From 405fce694bd1589082a7ffd500b5a4b841c22f0d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 13:13:43 -0800 Subject: [PATCH 100/267] KVM: SVM: Filter out 64-bit exit codes when invoking exit handlers on bare metal Explicitly filter out 64-bit exit codes when invoking exit handlers, as svm_exit_handlers[] will never be sized with entries that use bits 63:32. Processing the non-failing exit code as a 32-bit value will allow tracking exit_code as a single 64-bit value (which it is, architecturally). This will also allow hardening KVM against Spectre-like attacks without needing to do silly things to avoid build failures on 32-bit kernels (array_index_nospec() rightly asserts that the index fits in an "unsigned long"). Omit the check when running as a VM, as KVM has historically failed to set bits 63:32 appropriately when synthesizing VM-Exits, i.e. KVM could get false positives when running as a VM on an older, broken KVM/kernel. From a functional perspective, omitting the check is "fine", as any unwanted collision between e.g. VMEXIT_INVALID and a 32-bit exit code will be fatal to KVM-on-KVM regardless of what KVM-as-L1 does. Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230211347.4099600-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d2f997965a96..3caf7a21679f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3467,8 +3467,22 @@ no_vmsa: sev_free_decrypted_vmsa(vcpu, save); } -int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) +int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 __exit_code) { + u32 exit_code = __exit_code; + + /* + * SVM uses negative values, i.e. 64-bit values, to indicate that VMRUN + * failed. Report all such errors to userspace (note, VMEXIT_INVALID, + * a.k.a. SVM_EXIT_ERR, is special cased by svm_handle_exit()). Skip + * the check when running as a VM, as KVM has historically left garbage + * in bits 63:32, i.e. running KVM-on-KVM would hit false positives if + * the underlying kernel is buggy. 
+ */ + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR) && + (u64)exit_code != __exit_code) + goto unexpected_vmexit; + #ifdef CONFIG_MITIGATION_RETPOLINE if (exit_code == SVM_EXIT_MSR) return msr_interception(vcpu); @@ -3495,7 +3509,7 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) unexpected_vmexit: dump_vmcb(vcpu); - kvm_prepare_unexpected_reason_exit(vcpu, exit_code); + kvm_prepare_unexpected_reason_exit(vcpu, __exit_code); return 0; } From d7507a94a07202234236d7f94bed6015ca645ae6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 13:13:44 -0800 Subject: [PATCH 101/267] KVM: SVM: Treat exit_code as an unsigned 64-bit value through all of KVM Fix KVM's long-standing buggy handling of SVM's exit_code as a 32-bit value. Per the APM and Xen commit d1bd157fbc ("Big merge the HVM full-virtualisation abstractions.") (which is arguably more trustworthy than KVM), offset 0x70 is a single 64-bit value: 070h 63:0 EXITCODE Track exit_code as a single u64 to prevent reintroducing bugs where KVM neglects to correctly set bits 63:32. Fixes: 6aa8b732ca01 ("[PATCH] kvm: userspace interface") Cc: Jim Mattson Cc: Yosry Ahmed Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230211347.4099600-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/svm.h | 3 +- arch/x86/include/uapi/asm/svm.h | 32 ++++++++--------- arch/x86/kvm/svm/hyperv.c | 1 - arch/x86/kvm/svm/nested.c | 13 ++----- arch/x86/kvm/svm/sev.c | 36 +++++++------------ arch/x86/kvm/svm/svm.c | 7 ++-- arch/x86/kvm/svm/svm.h | 4 +-- arch/x86/kvm/trace.h | 6 ++-- include/hyperv/hvgdk.h | 2 +- tools/testing/selftests/kvm/include/x86/svm.h | 3 +- .../kvm/x86/svm_nested_soft_inject_test.c | 4 +-- 11 files changed, 42 insertions(+), 69 deletions(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 50ece197c98a..edde36097ddc 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -137,8 +137,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u32 int_vector; u32 int_state; u8 reserved_3[4]; - u32 exit_code; - u32 exit_code_hi; + u64 exit_code; u64 exit_info_1; u64 exit_info_2; u32 exit_int_info; diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 650e3256ea7d..010a45c9f614 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -103,38 +103,38 @@ #define SVM_EXIT_VMGEXIT 0x403 /* SEV-ES software-defined VMGEXIT events */ -#define SVM_VMGEXIT_MMIO_READ 0x80000001 -#define SVM_VMGEXIT_MMIO_WRITE 0x80000002 -#define SVM_VMGEXIT_NMI_COMPLETE 0x80000003 -#define SVM_VMGEXIT_AP_HLT_LOOP 0x80000004 -#define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005 +#define SVM_VMGEXIT_MMIO_READ 0x80000001ull +#define SVM_VMGEXIT_MMIO_WRITE 0x80000002ull +#define SVM_VMGEXIT_NMI_COMPLETE 0x80000003ull +#define SVM_VMGEXIT_AP_HLT_LOOP 0x80000004ull +#define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005ull #define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0 #define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1 -#define SVM_VMGEXIT_PSC 0x80000010 -#define SVM_VMGEXIT_GUEST_REQUEST 0x80000011 -#define SVM_VMGEXIT_EXT_GUEST_REQUEST 0x80000012 -#define SVM_VMGEXIT_AP_CREATION 0x80000013 +#define SVM_VMGEXIT_PSC 0x80000010ull +#define SVM_VMGEXIT_GUEST_REQUEST 0x80000011ull +#define SVM_VMGEXIT_EXT_GUEST_REQUEST 0x80000012ull +#define SVM_VMGEXIT_AP_CREATION 0x80000013ull #define SVM_VMGEXIT_AP_CREATE_ON_INIT 0 #define SVM_VMGEXIT_AP_CREATE 1 #define SVM_VMGEXIT_AP_DESTROY 2 -#define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018 -#define 
SVM_VMGEXIT_SAVIC 0x8000001a +#define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018ull +#define SVM_VMGEXIT_SAVIC 0x8000001aull #define SVM_VMGEXIT_SAVIC_REGISTER_GPA 0 #define SVM_VMGEXIT_SAVIC_UNREGISTER_GPA 1 #define SVM_VMGEXIT_SAVIC_SELF_GPA ~0ULL -#define SVM_VMGEXIT_HV_FEATURES 0x8000fffd -#define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe +#define SVM_VMGEXIT_HV_FEATURES 0x8000fffdull +#define SVM_VMGEXIT_TERM_REQUEST 0x8000fffeull #define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \ /* SW_EXITINFO1[3:0] */ \ (((((u64)reason_set) & 0xf)) | \ /* SW_EXITINFO1[11:4] */ \ ((((u64)reason_code) & 0xff) << 4)) -#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff +#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffffull /* Exit code reserved for hypervisor/software use */ -#define SVM_EXIT_SW 0xf0000000 +#define SVM_EXIT_SW 0xf0000000ull -#define SVM_EXIT_ERR -1 +#define SVM_EXIT_ERR -1ull #define SVM_EXIT_REASONS \ { SVM_EXIT_READ_CR0, "read_cr0" }, \ diff --git a/arch/x86/kvm/svm/hyperv.c b/arch/x86/kvm/svm/hyperv.c index 088f6429b24c..3ec580d687f5 100644 --- a/arch/x86/kvm/svm/hyperv.c +++ b/arch/x86/kvm/svm/hyperv.c @@ -11,7 +11,6 @@ void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.exit_code = HV_SVM_EXITCODE_ENL; - svm->vmcb->control.exit_code_hi = 0; svm->vmcb->control.exit_info_1 = HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH; svm->vmcb->control.exit_info_2 = 0; nested_svm_vmexit(svm); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 666b5a36c15d..5aa0512e09c9 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -45,7 +45,6 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, * correctly fill in the high bits of exit_info_1. */ vmcb->control.exit_code = SVM_EXIT_NPF; - vmcb->control.exit_code_hi = 0; vmcb->control.exit_info_1 = (1ULL << 32); vmcb->control.exit_info_2 = fault->address; } @@ -441,7 +440,6 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, to->int_vector = from->int_vector; to->int_state = from->int_state; to->exit_code = from->exit_code; - to->exit_code_hi = from->exit_code_hi; to->exit_info_1 = from->exit_info_1; to->exit_info_2 = from->exit_info_2; to->exit_int_info = from->exit_int_info; @@ -747,8 +745,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, enter_guest_mode(vcpu); /* - * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, - * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. + * Filled at exit: exit_code, exit_info_1, exit_info_2, exit_int_info, + * exit_int_info_err, next_rip, insn_len, insn_bytes. 
*/ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VGIF) && @@ -1018,7 +1016,6 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) if (!nested_vmcb_check_save(vcpu) || !nested_vmcb_check_controls(vcpu)) { vmcb12->control.exit_code = SVM_EXIT_ERR; - vmcb12->control.exit_code_hi = -1u; vmcb12->control.exit_info_1 = 0; vmcb12->control.exit_info_2 = 0; goto out; @@ -1051,7 +1048,6 @@ out_exit_err: svm->soft_int_injected = false; svm->vmcb->control.exit_code = SVM_EXIT_ERR; - svm->vmcb->control.exit_code_hi = -1u; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; @@ -1163,7 +1159,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.int_state = vmcb02->control.int_state; vmcb12->control.exit_code = vmcb02->control.exit_code; - vmcb12->control.exit_code_hi = vmcb02->control.exit_code_hi; vmcb12->control.exit_info_1 = vmcb02->control.exit_info_1; vmcb12->control.exit_info_2 = vmcb02->control.exit_info_2; @@ -1460,7 +1455,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm) static int nested_svm_intercept(struct vcpu_svm *svm) { - u32 exit_code = svm->vmcb->control.exit_code; + u64 exit_code = svm->vmcb->control.exit_code; int vmexit = NESTED_EXIT_HOST; if (svm_is_vmrun_failure(exit_code)) @@ -1532,7 +1527,6 @@ static void nested_svm_inject_exception_vmexit(struct kvm_vcpu *vcpu) struct vmcb *vmcb = svm->vmcb; vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + ex->vector; - vmcb->control.exit_code_hi = 0; if (ex->has_error_code) vmcb->control.exit_info_1 = ex->error_code; @@ -1708,7 +1702,6 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, dst->int_vector = from->int_vector; dst->int_state = from->int_state; dst->exit_code = from->exit_code; - dst->exit_code_hi = from->exit_code_hi; dst->exit_info_1 = from->exit_info_1; dst->exit_info_2 = from->exit_info_2; dst->exit_int_info = from->exit_int_info; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 28150506b18c..f67525007089 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3270,11 +3270,6 @@ skip_vmsa_free: kvfree(svm->sev_es.ghcb_sa); } -static u64 kvm_get_cached_sw_exit_code(struct vmcb_control_area *control) -{ - return (((u64)control->exit_code_hi) << 32) | control->exit_code; -} - static void dump_ghcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -3296,7 +3291,7 @@ static void dump_ghcb(struct vcpu_svm *svm) */ pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", - kvm_get_cached_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm)); + control->exit_code, kvm_ghcb_sw_exit_code_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", @@ -3330,7 +3325,6 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; struct ghcb *ghcb = svm->sev_es.ghcb; - u64 exit_code; /* * The GHCB protocol so far allows for the following data @@ -3364,9 +3358,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) __kvm_emulate_msr_write(vcpu, MSR_IA32_XSS, kvm_ghcb_get_xss(svm)); /* Copy the GHCB exit information into the VMCB fields */ - exit_code = kvm_ghcb_get_sw_exit_code(svm); - control->exit_code = lower_32_bits(exit_code); - control->exit_code_hi = upper_32_bits(exit_code); + control->exit_code = 
kvm_ghcb_get_sw_exit_code(svm); control->exit_info_1 = kvm_ghcb_get_sw_exit_info_1(svm); control->exit_info_2 = kvm_ghcb_get_sw_exit_info_2(svm); svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm); @@ -3379,15 +3371,8 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; - u64 exit_code; u64 reason; - /* - * Retrieve the exit code now even though it may not be marked valid - * as it could help with debugging. - */ - exit_code = kvm_get_cached_sw_exit_code(control); - /* Only GHCB Usage code 0 is supported */ if (svm->sev_es.ghcb->ghcb_usage) { reason = GHCB_ERR_INVALID_USAGE; @@ -3401,7 +3386,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) !kvm_ghcb_sw_exit_info_2_is_valid(svm)) goto vmgexit_err; - switch (exit_code) { + switch (control->exit_code) { case SVM_EXIT_READ_DR7: break; case SVM_EXIT_WRITE_DR7: @@ -3502,15 +3487,19 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) return 0; vmgexit_err: + /* + * Print the exit code even though it may not be marked valid as it + * could help with debugging. + */ if (reason == GHCB_ERR_INVALID_USAGE) { vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n", svm->sev_es.ghcb->ghcb_usage); } else if (reason == GHCB_ERR_INVALID_EVENT) { vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n", - exit_code); + control->exit_code); } else { vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n", - exit_code); + control->exit_code); dump_ghcb(svm); } @@ -4349,7 +4338,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; - u64 ghcb_gpa, exit_code; + u64 ghcb_gpa; int ret; /* Validate the GHCB */ @@ -4391,8 +4380,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) svm_vmgexit_success(svm, 0); - exit_code = kvm_get_cached_sw_exit_code(control); - switch (exit_code) { + switch (control->exit_code) { case SVM_VMGEXIT_MMIO_READ: ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); if (ret) @@ -4484,7 +4472,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = -EINVAL; break; default: - ret = svm_invoke_exit_handler(vcpu, exit_code); + ret = svm_invoke_exit_handler(vcpu, control->exit_code); } return ret; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 3caf7a21679f..a28cd61d87ea 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2466,7 +2466,6 @@ static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu, if (cr0 ^ val) { svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; - svm->vmcb->control.exit_code_hi = 0; ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE); } @@ -3299,7 +3298,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl); pr_err("%-20s%08x\n", "int_vector:", control->int_vector); pr_err("%-20s%08x\n", "int_state:", control->int_state); - pr_err("%-20s%08x\n", "exit_code:", control->exit_code); + pr_err("%-20s%016llx\n", "exit_code:", control->exit_code); pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1); pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2); pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info); @@ -3549,7 +3548,6 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_run *kvm_run = vcpu->run; - u32 exit_code = svm->vmcb->control.exit_code; /* SEV-ES guests must use the CR 
write traps to track CR registers. */ if (!sev_es_guest(vcpu->kvm)) { @@ -3585,7 +3583,7 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (exit_fastpath != EXIT_FASTPATH_NONE) return 1; - return svm_invoke_exit_handler(vcpu, exit_code); + return svm_invoke_exit_handler(vcpu, svm->vmcb->control.exit_code); } static int pre_svm_run(struct kvm_vcpu *vcpu) @@ -4670,7 +4668,6 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, if (static_cpu_has(X86_FEATURE_NRIPS)) vmcb->control.next_rip = info->next_rip; vmcb->control.exit_code = icpt_info.exit_code; - vmcb->control.exit_code_hi = 0; vmexit = nested_svm_exit_handled(svm); ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 3360ac36e071..a22433680c73 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -160,8 +160,7 @@ struct vmcb_ctrl_area_cached { u32 int_ctl; u32 int_vector; u32 int_state; - u32 exit_code; - u32 exit_code_hi; + u64 exit_code; u64 exit_info_1; u64 exit_info_2; u32 exit_int_info; @@ -787,7 +786,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm); static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code) { svm->vmcb->control.exit_code = exit_code; - svm->vmcb->control.exit_code_hi = 0; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; return nested_svm_vmexit(svm); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index e79bc9cb7162..e7fdbe9efc90 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -383,10 +383,10 @@ TRACE_EVENT(kvm_apic, #define kvm_print_exit_reason(exit_reason, isa) \ (isa == KVM_ISA_VMX) ? \ __print_symbolic(exit_reason & 0xffff, VMX_EXIT_REASONS) : \ - __print_symbolic(exit_reason, SVM_EXIT_REASONS), \ + __print_symbolic_u64(exit_reason, SVM_EXIT_REASONS), \ (isa == KVM_ISA_VMX && exit_reason & ~0xffff) ? " " : "", \ (isa == KVM_ISA_VMX) ? 
\ - __print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : "" + __print_flags_u64(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : "" #define TRACE_EVENT_KVM_EXIT(name) \ TRACE_EVENT(name, \ @@ -781,7 +781,7 @@ TRACE_EVENT_KVM_EXIT(kvm_nested_vmexit); * Tracepoint for #VMEXIT reinjected to the guest */ TRACE_EVENT(kvm_nested_vmexit_inject, - TP_PROTO(__u32 exit_code, + TP_PROTO(__u64 exit_code, __u64 exit_info1, __u64 exit_info2, __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), TP_ARGS(exit_code, exit_info1, exit_info2, diff --git a/include/hyperv/hvgdk.h b/include/hyperv/hvgdk.h index dd6d4939ea29..384c3f3ff4a5 100644 --- a/include/hyperv/hvgdk.h +++ b/include/hyperv/hvgdk.h @@ -281,7 +281,7 @@ struct hv_vmcb_enlightenments { #define HV_VMCB_NESTED_ENLIGHTENMENTS 31 /* Synthetic VM-Exit */ -#define HV_SVM_EXITCODE_ENL 0xf0000000 +#define HV_SVM_EXITCODE_ENL 0xf0000000ull #define HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH (1) /* VM_PARTITION_ASSIST_PAGE */ diff --git a/tools/testing/selftests/kvm/include/x86/svm.h b/tools/testing/selftests/kvm/include/x86/svm.h index 29cffd0a9181..10b30b38bb3f 100644 --- a/tools/testing/selftests/kvm/include/x86/svm.h +++ b/tools/testing/selftests/kvm/include/x86/svm.h @@ -92,8 +92,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u32 int_vector; u32 int_state; u8 reserved_3[4]; - u32 exit_code; - u32 exit_code_hi; + u64 exit_code; u64 exit_info_1; u64 exit_info_2; u32 exit_int_info; diff --git a/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c index 7b6481d6c0d3..4bd1655f9e6d 100644 --- a/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c +++ b/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c @@ -103,7 +103,7 @@ static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t i run_guest(vmcb, svm->vmcb_gpa); __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL, - "Expected VMMCAL #VMEXIT, got '0x%x', info1 = '0x%lx, info2 = '0x%lx'", + "Expected VMMCAL #VMEXIT, got '0x%lx', info1 = '0x%lx, info2 = '0x%lx'", vmcb->control.exit_code, vmcb->control.exit_info_1, vmcb->control.exit_info_2); @@ -133,7 +133,7 @@ static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t i run_guest(vmcb, svm->vmcb_gpa); __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_HLT, - "Expected HLT #VMEXIT, got '0x%x', info1 = '0x%lx, info2 = '0x%lx'", + "Expected HLT #VMEXIT, got '0x%lx', info1 = '0x%lx, info2 = '0x%lx'", vmcb->control.exit_code, vmcb->control.exit_info_1, vmcb->control.exit_info_2); From a08ca6691fd3ab40e40eb6600193672d50c7a7ba Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 13:13:45 -0800 Subject: [PATCH 102/267] KVM: SVM: Limit incorrect check on SVM_EXIT_ERR to running as a VM Limit KVM's incorrect check for VMXEXIT_INVALID, a.k.a. SVM_EXIT_ERR, to running as a VM, as detected by X86_FEATURE_HYPERVISOR. The exit_code and all failure codes, e.g. VMXEXIT_INVALID, are 64-bit values, and so checking only bits 31:0 could result in false positives when running on non-broken hardware, e.g. in the extremely unlikely scenario exit code 0xffffffffull is ever generated by hardware. Keep the 32-bit check to play nice with running on broken KVM (for years, KVM has not set bits 63:32 when synthesizing nested SVM VM-Exits). 
Reviewed-by: Yosry Ahmed
Link: https://patch.msgid.link/20251230211347.4099600-7-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/svm/svm.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index a22433680c73..338fc4f5cc4c 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -426,7 +426,10 @@ static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)

 static inline bool svm_is_vmrun_failure(u64 exit_code)
 {
- return (u32)exit_code == (u32)SVM_EXIT_ERR;
+ if (cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
+ return (u32)exit_code == (u32)SVM_EXIT_ERR;
+
+ return exit_code == SVM_EXIT_ERR;
 }

 /*
From 1e3dddafeceeb8d2cd182b78456cb9ca9d042a01 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Tue, 30 Dec 2025 13:13:46 -0800
Subject: [PATCH 103/267] KVM: SVM: Harden exit_code against being used in
 Spectre-like attacks

Explicitly clamp the exit code used to index KVM's exit handlers to guard
against Spectre-like attacks, mainly to provide consistency between VMX and
SVM (VMX was given the same treatment by commit c926f2f7230b ("KVM: x86:
Protect exit_reason from being used in Spectre-v1/L1TF attacks")).

For normal VMs, it's _extremely_ unlikely the exit code could be used to
exploit a speculation vulnerability, as the exit code is set by hardware and
unexpected/unknown exit codes should be quite well bounded (as is/was the
case with VMX). But with SEV-ES+, the exit code is guest-controlled as it
comes from the GHCB, not from hardware, i.e. an attack from the guest is at
least somewhat plausible.

Irrespective of SEV-ES+, hardening KVM is easy and inexpensive, and such an
attack is theoretically possible.

Link: https://patch.msgid.link/20251230211347.4099600-8-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/svm/svm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index a28cd61d87ea..e454ae095cf7 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3501,6 +3501,7 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 __exit_code)
 if (exit_code >= ARRAY_SIZE(svm_exit_handlers))
 goto unexpected_vmexit;

+ exit_code = array_index_nospec(exit_code, ARRAY_SIZE(svm_exit_handlers));
 if (!svm_exit_handlers[exit_code])
 goto unexpected_vmexit;

From d6c20d19f7d3de14d02b47221988cdb19504bb84 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Tue, 30 Dec 2025 13:13:47 -0800
Subject: [PATCH 104/267] KVM: SVM: Assert that Hyper-V's HV_SVM_EXITCODE_ENL
 == SVM_EXIT_SW

Add a build-time assertion that Hyper-V's "enlightened" exit code is the
same as the AMD-defined "Reserved for Host" exit code, mostly to help
readers connect the dots and understand why synthesizing a software-defined
exit code is safe/ok.

Reviewed-by: Vitaly Kuznetsov
Link: https://patch.msgid.link/20251230211347.4099600-9-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/svm/hyperv.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/kvm/svm/hyperv.c b/arch/x86/kvm/svm/hyperv.c
index 3ec580d687f5..4f24dcb45116 100644
--- a/arch/x86/kvm/svm/hyperv.c
+++ b/arch/x86/kvm/svm/hyperv.c
@@ -10,6 +10,12 @@ void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu)
 {
 struct vcpu_svm *svm = to_svm(vcpu);

+ /*
+ * The exit code used by Hyper-V for software-defined exits is reserved
+ * by AMD specifically for such use cases.
+ */ + BUILD_BUG_ON(HV_SVM_EXITCODE_ENL != SVM_EXIT_SW); + svm->vmcb->control.exit_code = HV_SVM_EXITCODE_ENL; svm->vmcb->control.exit_info_1 = HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH; svm->vmcb->control.exit_info_2 = 0; From 8e8eb10c107e67f22f87cd8c963d30ea73f04d5f Mon Sep 17 00:00:00 2001 From: Petteri Kangaslampi Date: Tue, 13 Jan 2026 19:44:09 +0000 Subject: [PATCH 105/267] KVM: arm64: Calculate hyp VA size only once Calculate the hypervisor's VA size only once to maintain consistency between the memory layout and MMU initialization logic. Previously the two would be inconsistent when the kernel is configured for less than IDMAP_VA_BITS of VA space. Signed-off-by: Petteri Kangaslampi Tested-by: Vincent Donnefort Link: https://patch.msgid.link/20260113194409.2970324-2-pekangas@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_mmu.h | 3 ++- arch/arm64/kvm/arm.c | 4 ++-- arch/arm64/kvm/mmu.c | 28 ++++----------------------- arch/arm64/kvm/va_layout.c | 33 +++++++++++++++++++++++++++----- 4 files changed, 36 insertions(+), 32 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 2dc5e6e742bb..d968aca0461a 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -103,6 +103,7 @@ alternative_cb_end void kvm_update_va_mask(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, int nr_inst); void kvm_compute_layout(void); +u32 kvm_hyp_va_bits(void); void kvm_apply_hyp_relocations(void); #define __hyp_pa(x) (((phys_addr_t)(x)) + hyp_physvirt_offset) @@ -185,7 +186,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu); phys_addr_t kvm_mmu_get_httbr(void); phys_addr_t kvm_get_idmap_vector(void); -int __init kvm_mmu_init(u32 *hyp_va_bits); +int __init kvm_mmu_init(u32 hyp_va_bits); static inline void *__kvm_vector_slot2addr(void *base, enum arm64_hyp_spectre_vector slot) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 4f80da0c0d1d..4703f0e15102 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2568,7 +2568,7 @@ static void pkvm_hyp_init_ptrauth(void) /* Inits Hyp-mode on all online CPUs */ static int __init init_hyp_mode(void) { - u32 hyp_va_bits; + u32 hyp_va_bits = kvm_hyp_va_bits(); int cpu; int err = -ENOMEM; @@ -2582,7 +2582,7 @@ static int __init init_hyp_mode(void) /* * Allocate Hyp PGD and setup Hyp identity mapping */ - err = kvm_mmu_init(&hyp_va_bits); + err = kvm_mmu_init(hyp_va_bits); if (err) goto out_err; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 48d7c372a4cd..d5a506c99f73 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -2284,11 +2284,9 @@ static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = { .virt_to_phys = kvm_host_pa, }; -int __init kvm_mmu_init(u32 *hyp_va_bits) +int __init kvm_mmu_init(u32 hyp_va_bits) { int err; - u32 idmap_bits; - u32 kernel_bits; hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start); hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); @@ -2302,25 +2300,7 @@ int __init kvm_mmu_init(u32 *hyp_va_bits) */ BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); - /* - * The ID map is always configured for 48 bits of translation, which - * may be fewer than the number of VA bits used by the regular kernel - * stage 1, when VA_BITS=52. - * - * At EL2, there is only one TTBR register, and we can't switch between - * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom - * line: we need to use the extended range with *both* our translation - * tables. 
- * - * So use the maximum of the idmap VA bits and the regular kernel stage - * 1 VA bits to assure that the hypervisor can both ID map its code page - * and map any kernel memory. - */ - idmap_bits = IDMAP_VA_BITS; - kernel_bits = vabits_actual; - *hyp_va_bits = max(idmap_bits, kernel_bits); - - kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits); + kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits); kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); kvm_debug("HYP VA range: %lx:%lx\n", kern_hyp_va(PAGE_OFFSET), @@ -2345,7 +2325,7 @@ int __init kvm_mmu_init(u32 *hyp_va_bits) goto out; } - err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops); + err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits, &kvm_hyp_mm_ops); if (err) goto out_free_pgtable; @@ -2354,7 +2334,7 @@ int __init kvm_mmu_init(u32 *hyp_va_bits) goto out_destroy_pgtable; io_map_base = hyp_idmap_start; - __hyp_va_bits = *hyp_va_bits; + __hyp_va_bits = hyp_va_bits; return 0; out_destroy_pgtable: diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c index 91b22a014610..2346f9435a71 100644 --- a/arch/arm64/kvm/va_layout.c +++ b/arch/arm64/kvm/va_layout.c @@ -46,9 +46,31 @@ static void init_hyp_physvirt_offset(void) hyp_physvirt_offset = (s64)__pa(kern_va) - (s64)hyp_va; } +/* + * Calculate the actual VA size used by the hypervisor + */ +__init u32 kvm_hyp_va_bits(void) +{ + /* + * The ID map is always configured for 48 bits of translation, which may + * be different from the number of VA bits used by the regular kernel + * stage 1. + * + * At EL2, there is only one TTBR register, and we can't switch between + * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom + * line: we need to use the extended range with *both* our translation + * tables. + * + * So use the maximum of the idmap VA bits and the regular kernel stage + * 1 VA bits as the hypervisor VA size to assure that the hypervisor can + * both ID map its code page and map any kernel memory. + */ + return max(IDMAP_VA_BITS, vabits_actual); +} + /* * We want to generate a hyp VA with the following format (with V == - * vabits_actual): + * hypervisor VA bits): * * 63 ... V | V-1 | V-2 .. tag_lsb | tag_lsb - 1 .. 0 * --------------------------------------------------------- @@ -61,10 +83,11 @@ __init void kvm_compute_layout(void) { phys_addr_t idmap_addr = __pa_symbol(__hyp_idmap_text_start); u64 hyp_va_msb; + u32 hyp_va_bits = kvm_hyp_va_bits(); /* Where is my RAM region? */ - hyp_va_msb = idmap_addr & BIT(vabits_actual - 1); - hyp_va_msb ^= BIT(vabits_actual - 1); + hyp_va_msb = idmap_addr & BIT(hyp_va_bits - 1); + hyp_va_msb ^= BIT(hyp_va_bits - 1); tag_lsb = fls64((u64)phys_to_virt(memblock_start_of_DRAM()) ^ (u64)(high_memory - 1)); @@ -72,9 +95,9 @@ __init void kvm_compute_layout(void) va_mask = GENMASK_ULL(tag_lsb - 1, 0); tag_val = hyp_va_msb; - if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && tag_lsb != (vabits_actual - 1)) { + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && tag_lsb != (hyp_va_bits - 1)) { /* We have some free bits to insert a random tag. 
*/ - tag_val |= get_random_long() & GENMASK_ULL(vabits_actual - 2, tag_lsb); + tag_val |= get_random_long() & GENMASK_ULL(hyp_va_bits - 2, tag_lsb); } tag_val >>= tag_lsb; From 4b16ad0bf821d4aceb050e9f569dc329883f1c5b Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Mon, 12 Jan 2026 16:04:13 +0000 Subject: [PATCH 106/267] KVM: arm64: Fix missing include Include for kvm_arm_hyp_stack_base declaration which fixes the following sparse warning: arch/arm64/kvm/arm.c:63:1: warning: symbol 'kvm_arm_hyp_stack_base' was not declared. Should it be static? Signed-off-by: Ben Dooks Link: https://patch.msgid.link/20260112160413.603493-1-ben.dooks@codethink.co.uk Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 4703f0e15102..0e1d18b90376 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include From 69555130dccb39df4d40f90fafc7fc79a5d55b8a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 8 Jan 2026 19:50:37 -0800 Subject: [PATCH 107/267] KVM: SVM: Fix an off-by-one typo in the comment for enabling AVIC by default Fix a goof in the comment that documents KVM's logic for enabling AVIC by default to reference Zen5+ as family 0x1A (Zen5), not family 0x19 (Zen4). The code is correct (checks for _greater_ than 0x19), only the comment is flawed. Opportunistically tweak the check too, even though it's already correct, so that both the comment and the code reference 0x1A, and so that the checks are "ascending", i.e. check Zen4 and then Zen5+. No functional change intended. Fixes: ca2967de5a5b ("KVM: SVM: Enable AVIC by default for Zen4+ if x2AVIC is support") Acked-by: Naveen N Rao (AMD) Link: https://patch.msgid.link/20260109035037.1015073-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 6b77b2033208..e8acac56da5b 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -1224,13 +1224,13 @@ static bool __init avic_want_avic_enabled(void) * In "auto" mode, enable AVIC by default for Zen4+ if x2AVIC is * supported (to avoid enabling partial support by default, and because * x2AVIC should be supported by all Zen4+ CPUs). Explicitly check for - * family 0x19 and later (Zen5+), as the kernel's synthetic ZenX flags + * family 0x1A and later (Zen5+), as the kernel's synthetic ZenX flags * aren't inclusive of previous generations, i.e. the kernel will set * at most one ZenX feature flag. */ if (avic == AVIC_AUTO_MODE) avic = boot_cpu_has(X86_FEATURE_X2AVIC) && - (boot_cpu_data.x86 > 0x19 || cpu_feature_enabled(X86_FEATURE_ZEN4)); + (cpu_feature_enabled(X86_FEATURE_ZEN4) || boot_cpu_data.x86 >= 0x1A); if (!avic || !npt_enabled) return false; From ac4f869c56301831a60706a84acbf13b4f0f9886 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Tue, 13 Jan 2026 16:47:48 +0800 Subject: [PATCH 108/267] KVM: VMX: Remove declaration of nested_mark_vmcs12_pages_dirty() Remove the declaration of nested_mark_vmcs12_pages_dirty() from the header file since it has been moved and renamed to nested_vmx_mark_all_vmcs12_pages_dirty(), which is a static function. 
Signed-off-by: Binbin Wu Link: https://patch.msgid.link/20260113084748.1714633-1-binbin.wu@linux.intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 983484d42ebf..b844c5d59025 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -51,7 +51,6 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, u32 vmx_instruction_info, bool wr, int len, gva_t *ret); -void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu); bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, int size); From f756ed82c62aa2725757ac011710492d4cc8c7d8 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 13 Jan 2026 17:14:56 +0000 Subject: [PATCH 109/267] KVM: selftests: Slightly simplify memstress_setup_nested() Instead of calling memstress_setup_ept_mappings() only in the first iteration in the loop, move it before the loop. The call needed to happen within the loop before commit e40e72fec0de ("KVM: selftests: Stop passing VMX metadata to TDP mapping functions"), as memstress_setup_ept_mappings() used to take in a pointer to vmx_pages and pass it into tdp_identity_map_1g() (to get the EPT root GPA). This is no longer the case, as tdp_identity_map_1g() gets the EPT root through stage2 MMU. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20260113171456.2097312-1-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/x86/memstress.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 86f4c5e4c430..f53414ba7103 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -110,16 +110,13 @@ void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc TEST_REQUIRE(kvm_cpu_has_tdp()); vm_enable_tdp(vm); + memstress_setup_ept_mappings(vm); for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { if (kvm_cpu_has(X86_FEATURE_VMX)) vcpu_alloc_vmx(vm, &nested_gva); else vcpu_alloc_svm(vm, &nested_gva); - /* The EPTs are shared across vCPUs, setup the mappings once */ - if (vcpu_id == 0) - memstress_setup_ept_mappings(vm); - /* * Override the vCPU to run memstress_l1_guest_code() which will * bounce it into L2 before calling memstress_guest_code(). From f00ccdede3c84df2287e59b546fd92d58b7e07af Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 13 Jan 2026 17:28:07 +0000 Subject: [PATCH 110/267] KVM: nSVM: Drop redundant/wrong comment in nested_vmcb02_prepare_save() The comment above DR6 and DR7 initializations is redundant, because the entire function follows the same pattern of only initializing the fields in vmcb02 if the vmcb12 changed or the fields are dirty, which handles the first execution case. Also, the comment refers to new_vmcb12 as new_vmcs12. Just drop the comment. No functional change intended. 
Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20260113172807.2178526-1-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 5aa0512e09c9..79cb85b8a156 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -681,7 +681,6 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 vmcb02->save.rsp = vmcb12->save.rsp; vmcb02->save.rip = vmcb12->save.rip; - /* These bits will be set properly on the first execution when new_vmc12 is true */ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) { vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1; svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW; From 6c8512a5b7f44caf981cee4ffa2a4ac73e627732 Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Tue, 13 Jan 2026 19:56:50 +0800 Subject: [PATCH 111/267] KVM: VMX: Don't register posted interrupt wakeup handler if alloc_kvm_area() fails Unregistering the posted interrupt wakeup handler only happens during hardware unsetup. Therefore, if alloc_kvm_area() fails and continue to register the posted interrupt wakeup handler, this will leave the global posted interrupt wakeup handler pointer in an incorrect state. Although it should not be an issue, it's still better to change it. Signed-off-by: Hou Wenlong Fixes: ec5a4919fa7b ("KVM: VMX: Unregister posted interrupt wakeup handler on hardware unsetup") Link: https://patch.msgid.link/0ac6908b608cf80eab7437004334fedd0f5f5317.1768304590.git.houwenlong.hwl@antgroup.com [sean: use a goto] Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 97d696014f9a..ccca182a346d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8708,8 +8708,8 @@ __init int vmx_hardware_setup(void) } r = alloc_kvm_area(); - if (r && nested) - nested_vmx_hardware_unsetup(); + if (r) + goto err_kvm_area; kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); @@ -8736,6 +8736,11 @@ __init int vmx_hardware_setup(void) kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; + return 0; + +err_kvm_area: + if (nested) + nested_vmx_hardware_unsetup(); return r; } From 127ccae2c185f62e6ecb4bf24f9cb307e9b9c619 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Sat, 10 Jan 2026 00:48:18 +0000 Subject: [PATCH 112/267] KVM: nSVM: Always use vmcb01 in VMLOAD/VMSAVE emulation Commit cc3ed80ae69f ("KVM: nSVM: always use vmcb01 to for vmsave/vmload of guest state") made KVM always use vmcb01 for the fields controlled by VMSAVE/VMLOAD, but it missed updating the VMLOAD/VMSAVE emulation code to always use vmcb01. As a result, if VMSAVE/VMLOAD is executed by an L2 guest and is not intercepted by L1, KVM will mistakenly use vmcb02. Always use vmcb01 instead of the current VMCB. 
Fixes: cc3ed80ae69f ("KVM: nSVM: always use vmcb01 to for vmsave/vmload of guest state") Cc: Maxim Levitsky Cc: stable@vger.kernel.org Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20260110004821.3411245-2-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e454ae095cf7..f1a5b61bdb5b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2122,12 +2122,13 @@ static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) ret = kvm_skip_emulated_instruction(vcpu); + /* KVM always performs VMLOAD/VMSAVE on VMCB01 (see __svm_vcpu_run()) */ if (vmload) { - svm_copy_vmloadsave_state(svm->vmcb, vmcb12); + svm_copy_vmloadsave_state(svm->vmcb01.ptr, vmcb12); svm->sysenter_eip_hi = 0; svm->sysenter_esp_hi = 0; } else { - svm_copy_vmloadsave_state(vmcb12, svm->vmcb); + svm_copy_vmloadsave_state(vmcb12, svm->vmcb01.ptr); } kvm_vcpu_unmap(vcpu, &map); From 55780d8a1dcc93d2c4b33c565ada88df12c9f206 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Sat, 10 Jan 2026 00:48:19 +0000 Subject: [PATCH 113/267] KVM: SVM: Stop toggling virtual VMSAVE/VMLOAD on intercept recalc Virtual VMSAVE/VMLOAD enablement (i.e. VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK) is set/cleared by svm_recalc_instruction_intercepts() when the intercepts are cleared/set. This is unnecessary because the bit is meaningless when intercepts are set and KVM emulates the instructions. Initialize the bit in vmcb01 base on vls, and keep it unchanged. This is similar-ish to how vGIF is handled. It is enabled in init_vmcb() if vgif=1 and remains unchanged when the STGI intercept is enabled (e.g. for NMI windows). This fixes a bug in svm_recalc_instruction_intercepts(). The intercepts for VMSAVE/VMLOAD are always toggled in vmcb01, but VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK is toggled in the current VMCB, which could be vmcb02 instead of vmcb01 if L2 is active. Virtual VMSAVE/VMLOAD enablement in vmcb02 is separately controlled by nested_vmcb02_prepare_control() based on the vCPU features and VMCB12, and if intercepts are needed they are set by recalc_intercepts(). The bug is benign though. Not toggling the bit for vmcb01 is harmless because it's useless anyway. For vmcb02: - The bit could be incorrectly cleared when intercepts are set in vmcb01. This is harmless because VMSAVE/VMLOAD will be emulated by KVM anyway. - The bit could be incorrectly set when the intercepts are cleared in vmcb01. However, if the bit was originally clear in vmcb02, then recalc_intercepts() will enable in the intercepts in vmcb02 anyway and VMSAVE/VMLOAD will be emulated by KVM. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20260110004821.3411245-3-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f1a5b61bdb5b..5eadecc5246c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -995,10 +995,14 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) svm_set_intercept(svm, INTERCEPT_RDTSCP); } + /* + * No need to toggle VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK here, it is + * always set if vls is enabled. If the intercepts are set, the bit is + * meaningless anyway. 
+ */
 if (guest_cpuid_is_intel_compatible(vcpu)) {
 svm_set_intercept(svm, INTERCEPT_VMLOAD);
 svm_set_intercept(svm, INTERCEPT_VMSAVE);
- svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
 } else {
 /*
 * If hardware supports Virtual VMLOAD VMSAVE then enable it
@@ -1007,7 +1011,6 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu)
 if (vls) {
 svm_clr_intercept(svm, INTERCEPT_VMLOAD);
 svm_clr_intercept(svm, INTERCEPT_VMSAVE);
- svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
 }
 }
 }

@@ -1155,6 +1158,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event)
 svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
 }

+ if (vls)
+ svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
 if (vcpu->kvm->arch.bus_lock_detection_enabled)
 svm_set_intercept(svm, INTERCEPT_BUSLOCK);

From 55058e32151f95dc5badd62381d184e89f15de99 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed
Date: Sat, 10 Jan 2026 00:48:20 +0000
Subject: [PATCH 114/267] KVM: selftests: Add a selftest for nested
 VMLOAD/VMSAVE

Add a test for VMLOAD/VMSAVE in an L2 guest. The test verifies that L1
intercepts for VMSAVE/VMLOAD always work regardless of
VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK. Then, more interestingly, it makes sure
that when L1 does not intercept VMLOAD/VMSAVE, they work as intended in L2.

When VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK is enabled by L1, VMSAVE/VMLOAD from
L2 should interpret the GPA as an L2 GPA and translate it through the NPT.
When VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK is disabled by L1, VMSAVE/VMLOAD from
L2 should interpret the GPA as an L1 GPA.

To test this, put two VMCBs (0 and 1) in L1's physical address space, and
have a single L2 GPA where:
- L2 VMCB GPA == L1 VMCB(0) GPA
- L2 VMCB GPA maps to L1 VMCB(1) via the NPT in L1.

This setup allows detecting how the GPA is interpreted based on which L1
VMCB is actually accessed.

In both cases, L2 sets KERNEL_GS_BASE (one of the fields handled by
VMSAVE/VMLOAD), and executes VMSAVE to write its value to the VMCB. The
test userspace code then checks that the write was made to the correct VMCB
(based on whether VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK is set by L1), and
writes a new value to that VMCB. L2 then executes VMLOAD to load the new
value and makes sure it's reflected correctly in KERNEL_GS_BASE.
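For quick reference, the address setup the test builds (values taken from the test's constants; 4KiB pages assumed, so VMCB1 sits at TEST_MEM_BASE + 0x1000):

  L2 GPA 0xc0000000 -- used directly as an L1 GPA --> 0xc0000000 (VMCB0)
  L2 GPA 0xc0000000 -- translated via L1's NPT    --> 0xc0001000 (VMCB1)

Which of the two translations applies is exactly what VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK selects, so checking which VMCB received the VMSAVE data tells the test how the GPA was interpreted.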
Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20260110004821.3411245-4-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 1 + .../selftests/kvm/include/x86/processor.h | 1 + .../kvm/x86/nested_vmsave_vmload_test.c | 197 ++++++++++++++++++ 3 files changed, 199 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index ffbf891b31d3..fc78d4508ebd 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -95,6 +95,7 @@ TEST_GEN_PROGS_x86 += x86/nested_exceptions_test TEST_GEN_PROGS_x86 += x86/nested_invalid_cr3_test TEST_GEN_PROGS_x86 += x86/nested_tsc_adjust_test TEST_GEN_PROGS_x86 += x86/nested_tsc_scaling_test +TEST_GEN_PROGS_x86 += x86/nested_vmsave_vmload_test TEST_GEN_PROGS_x86 += x86/platform_info_test TEST_GEN_PROGS_x86 += x86/pmu_counters_test TEST_GEN_PROGS_x86 += x86/pmu_event_filter_test diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 8f130e7d7048..6bfffc3b0a33 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -201,6 +201,7 @@ struct kvm_x86_cpu_feature { #define X86_FEATURE_TSCRATEMSR KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 4) #define X86_FEATURE_PAUSEFILTER KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 10) #define X86_FEATURE_PFTHRESHOLD KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 12) +#define X86_FEATURE_V_VMSAVE_VMLOAD KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 15) #define X86_FEATURE_VGIF KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 16) #define X86_FEATURE_IDLE_HLT KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 30) #define X86_FEATURE_SEV KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 1) diff --git a/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c b/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c new file mode 100644 index 000000000000..6764a48f9d4d --- /dev/null +++ b/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2026, Google LLC. + */ +#include "kvm_util.h" +#include "vmx.h" +#include "svm_util.h" +#include "kselftest.h" + +/* + * Allocate two VMCB pages for testing. Both pages have different GVAs (shared + * by both L1 and L2) and L1 GPAs. A single L2 GPA is used such that: + * - L2 GPA == L1 GPA for VMCB0. + * - L2 GPA is mapped to L1 GPA for VMCB1 using NPT in L1. + * + * This allows testing whether the GPA used by VMSAVE/VMLOAD in L2 is + * interpreted as a direct L1 GPA or translated using NPT as an L2 GPA, depends + * on which VMCB is accessed. 
+ */ +#define TEST_MEM_SLOT_INDEX 1 +#define TEST_MEM_PAGES 2 +#define TEST_MEM_BASE 0xc0000000 + +#define TEST_GUEST_ADDR(idx) (TEST_MEM_BASE + (idx) * PAGE_SIZE) + +#define TEST_VMCB_L1_GPA(idx) TEST_GUEST_ADDR(idx) +#define TEST_VMCB_GVA(idx) TEST_GUEST_ADDR(idx) + +#define TEST_VMCB_L2_GPA TEST_VMCB_L1_GPA(0) + +#define L2_GUEST_STACK_SIZE 64 + +static void l2_guest_code_vmsave(void) +{ + asm volatile("vmsave %0" : : "a"(TEST_VMCB_L2_GPA) : "memory"); +} + +static void l2_guest_code_vmload(void) +{ + asm volatile("vmload %0" : : "a"(TEST_VMCB_L2_GPA) : "memory"); +} + +static void l2_guest_code_vmcb(int vmcb_idx) +{ + wrmsr(MSR_KERNEL_GS_BASE, 0xaaaa); + l2_guest_code_vmsave(); + + /* Verify the VMCB used by VMSAVE and update KERNEL_GS_BASE to 0xbbbb */ + GUEST_SYNC(vmcb_idx); + + l2_guest_code_vmload(); + GUEST_ASSERT_EQ(rdmsr(MSR_KERNEL_GS_BASE), 0xbbbb); + + /* Reset MSR_KERNEL_GS_BASE */ + wrmsr(MSR_KERNEL_GS_BASE, 0); + l2_guest_code_vmsave(); + + vmmcall(); +} + +static void l2_guest_code_vmcb0(void) +{ + l2_guest_code_vmcb(0); +} + +static void l2_guest_code_vmcb1(void) +{ + l2_guest_code_vmcb(1); +} + +static void l1_guest_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + /* Each test case initializes the guest RIP below */ + generic_svm_setup(svm, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* Set VMSAVE/VMLOAD intercepts and make sure they work with.. */ + svm->vmcb->control.intercept |= (BIT_ULL(INTERCEPT_VMSAVE) | + BIT_ULL(INTERCEPT_VMLOAD)); + + /* ..VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK cleared.. */ + svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + + svm->vmcb->save.rip = (u64)l2_guest_code_vmsave; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMSAVE); + + svm->vmcb->save.rip = (u64)l2_guest_code_vmload; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMLOAD); + + /* ..and VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK set */ + svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + + svm->vmcb->save.rip = (u64)l2_guest_code_vmsave; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMSAVE); + + svm->vmcb->save.rip = (u64)l2_guest_code_vmload; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMLOAD); + + /* Now clear the intercepts to test VMSAVE/VMLOAD behavior */ + svm->vmcb->control.intercept &= ~(BIT_ULL(INTERCEPT_VMSAVE) | + BIT_ULL(INTERCEPT_VMLOAD)); + + /* + * Without VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK, the GPA will be + * interpreted as an L1 GPA, so VMCB0 should be used. + */ + svm->vmcb->save.rip = (u64)l2_guest_code_vmcb0; + svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL); + + /* + * With VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK, the GPA will be interpeted as + * an L2 GPA, and translated through the NPT to VMCB1. 
+ */ + svm->vmcb->save.rip = (u64)l2_guest_code_vmcb1; + svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t nested_gva = 0; + struct vmcb *test_vmcb[2]; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int i; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_NPT)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + vm_enable_tdp(vm); + + vcpu_alloc_svm(vm, &nested_gva); + vcpu_args_set(vcpu, 1, nested_gva); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + TEST_MEM_BASE, TEST_MEM_SLOT_INDEX, + TEST_MEM_PAGES, 0); + + for (i = 0; i <= 1; i++) { + virt_map(vm, TEST_VMCB_GVA(i), TEST_VMCB_L1_GPA(i), 1); + test_vmcb[i] = (struct vmcb *)addr_gva2hva(vm, TEST_VMCB_GVA(i)); + } + + tdp_identity_map_default_memslots(vm); + + /* + * L2 GPA == L1_GPA(0), but map it to L1_GPA(1), to allow testing + * whether the L2 GPA is interpreted as an L1 GPA or translated through + * the NPT. + */ + TEST_ASSERT_EQ(TEST_VMCB_L2_GPA, TEST_VMCB_L1_GPA(0)); + tdp_map(vm, TEST_VMCB_L2_GPA, TEST_VMCB_L1_GPA(1), PAGE_SIZE); + + for (;;) { + struct ucall uc; + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + case UCALL_SYNC: + i = uc.args[1]; + TEST_ASSERT(i == 0 || i == 1, "Unexpected VMCB idx: %d", i); + + /* + * Check that only the expected VMCB has KERNEL_GS_BASE + * set to 0xaaaa, and update it to 0xbbbb. + */ + TEST_ASSERT_EQ(test_vmcb[i]->save.kernel_gs_base, 0xaaaa); + TEST_ASSERT_EQ(test_vmcb[1-i]->save.kernel_gs_base, 0); + test_vmcb[i]->save.kernel_gs_base = 0xbbbb; + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + kvm_vm_free(vm); + return 0; +} From f1640174c8a769511641bfd5b7da16c4943e2c64 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 10 Dec 2025 17:30:20 +0000 Subject: [PATCH 115/267] arm64: Convert ID_AA64MMFR0_EL1.TGRAN{4,16,64}_2 to UnsignedEnum ID_AA64MMFR0_EL1.TGRAN{4,16,64}_2 are currently represented as unordered enumerations. However, the architecture treats them as Unsigned, as hinted to by the MRS data: (FEAT_S2TGran4K <=> (((UInt(ID_AA64MMFR0_EL1.TGran4_2) == 0) && FEAT_TGran4K) || (UInt(ID_AA64MMFR0_EL1.TGran4_2) >= 2)))) and similar descriptions exist for 16 and 64k. This is also confirmed by D24.1.3.3 ("Alternative ID scheme used for ID_AA64MMFR0_EL1 stage 2 granule sizes") in the L.b revision of the ARM ARM. Turn these fields into UnsignedEnum so that we can use the above description more or less literally. 
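As a concrete illustration of how the unsigned treatment lets the quoted ASL be used nearly verbatim, a self-contained sketch (illustration only; the helper name is made up here and the field is extracted by hand rather than with the kernel's generated accessors):

  /* ID_AA64MMFR0_EL1.TGran4_2 occupies bits 43:40. */
  static bool s2_tgran4_supported(unsigned long mmfr0, bool stage1_tgran4)
  {
          unsigned int tgran4_2 = (mmfr0 >> 40) & 0xf;

          /* 0b0000: follow stage-1 TGran4; 0b0001: not implemented; >= 0b0010: implemented. */
          return (tgran4_2 == 0 && stage1_tgran4) || tgran4_2 >= 2;
  }

The 16K and 64K variants follow the same pattern with their respective fields.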
Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20251210173024.561160-3-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/tools/sysreg | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 8921b51866d6..063b13b7c731 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2098,18 +2098,18 @@ UnsignedEnum 47:44 EXS 0b0000 NI 0b0001 IMP EndEnum -Enum 43:40 TGRAN4_2 +UnsignedEnum 43:40 TGRAN4_2 0b0000 TGRAN4 0b0001 NI 0b0010 IMP 0b0011 52_BIT EndEnum -Enum 39:36 TGRAN64_2 +UnsignedEnum 39:36 TGRAN64_2 0b0000 TGRAN64 0b0001 NI 0b0010 IMP EndEnum -Enum 35:32 TGRAN16_2 +UnsignedEnum 35:32 TGRAN16_2 0b0000 TGRAN16 0b0001 NI 0b0010 IMP From a035001dea37b885efb934e25057430ae1193d0a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 10 Dec 2025 17:30:21 +0000 Subject: [PATCH 116/267] arm64: Convert VTCR_EL2 to sysreg infratructure Our definition of VTCR_EL2 is both partial (tons of fields are missing) and totally inconsistent (some constants are shifted, some are not). They are also expressed in terms of TCR, which is rather inconvenient. Replace the ad-hoc definitions with the the generated version. This results in a bunch of additional changes to make the code with the unshifted nature of generated enumerations. The register data was extracted from the BSD licenced AARCHMRS (AARCHMRS_OPENSOURCE_A_profile_FAT-2025-09_ASL0). Reviewed-by: Alexandru Elisei Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20251210173024.561160-4-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_arm.h | 52 +++++++---------------------- arch/arm64/include/asm/sysreg.h | 1 - arch/arm64/kvm/hyp/pgtable.c | 8 ++--- arch/arm64/kvm/nested.c | 8 ++--- arch/arm64/tools/sysreg | 57 ++++++++++++++++++++++++++++++++ 5 files changed, 76 insertions(+), 50 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index e500600e4b9b..dfdbd2b65db9 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -124,37 +124,7 @@ #define TCR_EL2_MASK (TCR_EL2_TG0_MASK | TCR_EL2_SH0_MASK | \ TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK) -/* VTCR_EL2 Registers bits */ -#define VTCR_EL2_DS TCR_EL2_DS -#define VTCR_EL2_RES1 (1U << 31) -#define VTCR_EL2_HD (1 << 22) -#define VTCR_EL2_HA (1 << 21) -#define VTCR_EL2_PS_SHIFT TCR_EL2_PS_SHIFT -#define VTCR_EL2_PS_MASK TCR_EL2_PS_MASK -#define VTCR_EL2_TG0_MASK TCR_TG0_MASK -#define VTCR_EL2_TG0_4K TCR_TG0_4K -#define VTCR_EL2_TG0_16K TCR_TG0_16K -#define VTCR_EL2_TG0_64K TCR_TG0_64K -#define VTCR_EL2_SH0_MASK TCR_SH0_MASK -#define VTCR_EL2_SH0_INNER TCR_SH0_INNER -#define VTCR_EL2_ORGN0_MASK TCR_ORGN0_MASK -#define VTCR_EL2_ORGN0_WBWA TCR_ORGN0_WBWA -#define VTCR_EL2_IRGN0_MASK TCR_IRGN0_MASK -#define VTCR_EL2_IRGN0_WBWA TCR_IRGN0_WBWA -#define VTCR_EL2_SL0_SHIFT 6 -#define VTCR_EL2_SL0_MASK (3 << VTCR_EL2_SL0_SHIFT) -#define VTCR_EL2_T0SZ_MASK 0x3f -#define VTCR_EL2_VS_SHIFT 19 -#define VTCR_EL2_VS_8BIT (0 << VTCR_EL2_VS_SHIFT) -#define VTCR_EL2_VS_16BIT (1 << VTCR_EL2_VS_SHIFT) - -#define VTCR_EL2_T0SZ(x) TCR_T0SZ(x) - /* - * We configure the Stage-2 page tables to always restrict the IPA space to be - * 40 bits wide (T0SZ = 24). Systems with a PARange smaller than 40 bits are - * not known to exist and will break with this configuration. - * * The VTCR_EL2 is configured per VM and is initialised in kvm_init_stage2_mmu. 
* * Note that when using 4K pages, we concatenate two first level page tables @@ -162,9 +132,6 @@ * */ -#define VTCR_EL2_COMMON_BITS (VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \ - VTCR_EL2_IRGN0_WBWA | VTCR_EL2_RES1) - /* * VTCR_EL2:SL0 indicates the entry level for Stage2 translation. * Interestingly, it depends on the page size. @@ -196,30 +163,35 @@ */ #ifdef CONFIG_ARM64_64K_PAGES -#define VTCR_EL2_TGRAN VTCR_EL2_TG0_64K +#define VTCR_EL2_TGRAN 64K #define VTCR_EL2_TGRAN_SL0_BASE 3UL #elif defined(CONFIG_ARM64_16K_PAGES) -#define VTCR_EL2_TGRAN VTCR_EL2_TG0_16K +#define VTCR_EL2_TGRAN 16K #define VTCR_EL2_TGRAN_SL0_BASE 3UL #else /* 4K */ -#define VTCR_EL2_TGRAN VTCR_EL2_TG0_4K +#define VTCR_EL2_TGRAN 4K #define VTCR_EL2_TGRAN_SL0_BASE 2UL #endif #define VTCR_EL2_LVLS_TO_SL0(levels) \ - ((VTCR_EL2_TGRAN_SL0_BASE - (4 - (levels))) << VTCR_EL2_SL0_SHIFT) + FIELD_PREP(VTCR_EL2_SL0, (VTCR_EL2_TGRAN_SL0_BASE - (4 - (levels)))) #define VTCR_EL2_SL0_TO_LVLS(sl0) \ ((sl0) + 4 - VTCR_EL2_TGRAN_SL0_BASE) #define VTCR_EL2_LVLS(vtcr) \ - VTCR_EL2_SL0_TO_LVLS(((vtcr) & VTCR_EL2_SL0_MASK) >> VTCR_EL2_SL0_SHIFT) + VTCR_EL2_SL0_TO_LVLS(FIELD_GET(VTCR_EL2_SL0, (vtcr))) -#define VTCR_EL2_FLAGS (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN) -#define VTCR_EL2_IPA(vtcr) (64 - ((vtcr) & VTCR_EL2_T0SZ_MASK)) +#define VTCR_EL2_FLAGS (SYS_FIELD_PREP_ENUM(VTCR_EL2, SH0, INNER) | \ + SYS_FIELD_PREP_ENUM(VTCR_EL2, ORGN0, WBWA) | \ + SYS_FIELD_PREP_ENUM(VTCR_EL2, IRGN0, WBWA) | \ + SYS_FIELD_PREP_ENUM(VTCR_EL2, TG0, VTCR_EL2_TGRAN) | \ + VTCR_EL2_RES1) + +#define VTCR_EL2_IPA(vtcr) (64 - FIELD_GET(VTCR_EL2_T0SZ, (vtcr))) /* * ARM VMSAv8-64 defines an algorithm for finding the translation table diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 106b15eb232a..939f9c5bbae6 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -517,7 +517,6 @@ #define SYS_TTBR1_EL2 sys_reg(3, 4, 2, 0, 1) #define SYS_TCR_EL2 sys_reg(3, 4, 2, 0, 2) #define SYS_VTTBR_EL2 sys_reg(3, 4, 2, 1, 0) -#define SYS_VTCR_EL2 sys_reg(3, 4, 2, 1, 2) #define SYS_HAFGRTR_EL2 sys_reg(3, 4, 3, 1, 6) #define SYS_SPSR_EL2 sys_reg(3, 4, 4, 0, 0) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 9abc0a6cf448..a99a7d6e7d0c 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -583,8 +583,8 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) u64 vtcr = VTCR_EL2_FLAGS; s8 lvls; - vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT; - vtcr |= VTCR_EL2_T0SZ(phys_shift); + vtcr |= FIELD_PREP(VTCR_EL2_PS, kvm_get_parange(mmfr0)); + vtcr |= FIELD_PREP(VTCR_EL2_T0SZ, (UL(64) - phys_shift)); /* * Use a minimum 2 level page table to prevent splitting * host PMD huge pages at stage2. @@ -624,9 +624,7 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) vtcr |= VTCR_EL2_DS; /* Set the vmid bits */ - vtcr |= (get_vmid_bits(mmfr1) == 16) ? - VTCR_EL2_VS_16BIT : - VTCR_EL2_VS_8BIT; + vtcr |= (get_vmid_bits(mmfr1) == 16) ? 
VTCR_EL2_VS : 0; return vtcr; } diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index cdeeb8f09e72..522fa3cc9578 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -377,7 +377,7 @@ static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi) { wi->t0sz = vtcr & TCR_EL2_T0SZ_MASK; - switch (vtcr & VTCR_EL2_TG0_MASK) { + switch (FIELD_GET(VTCR_EL2_TG0_MASK, vtcr)) { case VTCR_EL2_TG0_4K: wi->pgshift = 12; break; case VTCR_EL2_TG0_16K: @@ -513,7 +513,7 @@ static u8 get_guest_mapping_ttl(struct kvm_s2_mmu *mmu, u64 addr) lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(mmu)->mmu_lock); - switch (vtcr & VTCR_EL2_TG0_MASK) { + switch (FIELD_GET(VTCR_EL2_TG0_MASK, vtcr)) { case VTCR_EL2_TG0_4K: ttl = (TLBI_TTL_TG_4K << 2); break; @@ -530,7 +530,7 @@ static u8 get_guest_mapping_ttl(struct kvm_s2_mmu *mmu, u64 addr) again: /* Iteratively compute the block sizes for a particular granule size */ - switch (vtcr & VTCR_EL2_TG0_MASK) { + switch (FIELD_GET(VTCR_EL2_TG0_MASK, vtcr)) { case VTCR_EL2_TG0_4K: if (sz < SZ_4K) sz = SZ_4K; else if (sz < SZ_2M) sz = SZ_2M; @@ -593,7 +593,7 @@ unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val) if (!max_size) { /* Compute the maximum extent of the invalidation */ - switch (mmu->tlb_vtcr & VTCR_EL2_TG0_MASK) { + switch (FIELD_GET(VTCR_EL2_TG0_MASK, mmu->tlb_vtcr)) { case VTCR_EL2_TG0_4K: max_size = SZ_1G; break; diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 063b13b7c731..a0f6249bd4f9 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -4400,6 +4400,63 @@ Field 56:12 BADDR Res0 11:0 EndSysreg +Sysreg VTCR_EL2 3 4 2 1 2 +Res0 63:46 +Field 45 HDBSS +Field 44 HAFT +Res0 43:42 +Field 41 TL0 +Field 40 GCSH +Res0 39 +Field 38 D128 +Field 37 S2POE +Field 36 S2PIE +Field 35 TL1 +Field 34 AssuredOnly +Field 33 SL2 +Field 32 DS +Res1 31 +Field 30 NSA +Field 29 NSW +Field 28 HWU62 +Field 27 HWU61 +Field 26 HWU60 +Field 25 HWU59 +Res0 24:23 +Field 22 HD +Field 21 HA +Res0 20 +Enum 19 VS + 0b0 8BIT + 0b1 16BIT +EndEnum +Field 18:16 PS +Enum 15:14 TG0 + 0b00 4K + 0b01 64K + 0b10 16K +EndEnum +Enum 13:12 SH0 + 0b00 NONE + 0b01 OUTER + 0b11 INNER +EndEnum +Enum 11:10 ORGN0 + 0b00 NC + 0b01 WBWA + 0b10 WT + 0b11 WBnWA +EndEnum +Enum 9:8 IRGN0 + 0b00 NC + 0b01 WBWA + 0b10 WT + 0b11 WBnWA +EndEnum +Field 7:6 SL0 +Field 5:0 T0SZ +EndSysreg + Sysreg GCSCR_EL2 3 4 2 5 0 Fields GCSCR_ELx EndSysreg From c259d763e6b09a463c85ff6b1d20ede92da48d24 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 10 Dec 2025 17:30:22 +0000 Subject: [PATCH 117/267] KVM: arm64: Account for RES1 bits in DECLARE_FEAT_MAP() and co None of the registers we manage in the feature dependency infrastructure so far has any RES1 bit. This is about to change, as VTCR_EL2 has its bit 31 being RES1. In order to not fail the consistency checks by not describing a bit, add RES1 bits to the set of immutable bits. This requires some extra surgery for the FGT handling, as we now need to track RES1 bits there as well. There are no RES1 FGT bits *yet*. Watch this space. 
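To make the new invariant concrete, here is a minimal sketch of what the updated consistency check below enforces (field names mirror struct fgt_masks; the helper itself is only illustrative):

    #include <stdbool.h>
    #include <stdint.h>

    struct masks { uint64_t res0, res1, mask, nmask; };

    /*
     * res0, res1, mask and nmask must be pairwise disjoint and together
     * cover all 64 bits, otherwise some bit's behaviour is undescribed.
     */
    static bool masks_consistent(const struct masks *m)
    {
            uint64_t all = m->res0 | m->res1 | m->mask | m->nmask;

            return all == UINT64_MAX &&
                   !(m->res0 & (m->res1 | m->mask | m->nmask)) &&
                   !(m->res1 & (m->mask | m->nmask)) &&
                   !(m->mask & m->nmask);
    }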
Reviewed-by: Fuad Tabba Tested-by: Sascha Bischoff Reviewed-by: Jonathan Cameron Tested-by: Fuad Tabba Link: https://patch.msgid.link/20251210173024.561160-5-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/config.c | 25 ++++++++------- arch/arm64/kvm/emulate-nested.c | 53 +++++++++++++++++-------------- 3 files changed, 44 insertions(+), 35 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ac7f970c7883..b552a1e03848 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -638,6 +638,7 @@ struct fgt_masks { u64 mask; u64 nmask; u64 res0; + u64 res1; }; extern struct fgt_masks hfgrtr_masks; diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c index 24bb3f36e9d5..3845b188551b 100644 --- a/arch/arm64/kvm/config.c +++ b/arch/arm64/kvm/config.c @@ -16,14 +16,14 @@ */ struct reg_bits_to_feat_map { union { - u64 bits; - u64 *res0p; + u64 bits; + struct fgt_masks *masks; }; #define NEVER_FGU BIT(0) /* Can trap, but never UNDEF */ #define CALL_FUNC BIT(1) /* Needs to evaluate tons of crap */ #define FIXED_VALUE BIT(2) /* RAZ/WI or RAO/WI in KVM */ -#define RES0_POINTER BIT(3) /* Pointer to RES0 value instead of bits */ +#define MASKS_POINTER BIT(3) /* Pointer to fgt_masks struct instead of bits */ unsigned long flags; @@ -92,8 +92,8 @@ struct reg_feat_map_desc { #define NEEDS_FEAT_FIXED(m, ...) \ __NEEDS_FEAT_FLAG(m, FIXED_VALUE, bits, __VA_ARGS__, 0) -#define NEEDS_FEAT_RES0(p, ...) \ - __NEEDS_FEAT_FLAG(p, RES0_POINTER, res0p, __VA_ARGS__) +#define NEEDS_FEAT_MASKS(p, ...) \ + __NEEDS_FEAT_FLAG(p, MASKS_POINTER, masks, __VA_ARGS__) /* * Declare the dependency between a set of bits and a set of features, @@ -109,19 +109,20 @@ struct reg_feat_map_desc { #define DECLARE_FEAT_MAP(n, r, m, f) \ struct reg_feat_map_desc n = { \ .name = #r, \ - .feat_map = NEEDS_FEAT(~r##_RES0, f), \ + .feat_map = NEEDS_FEAT(~(r##_RES0 | \ + r##_RES1), f), \ .bit_feat_map = m, \ .bit_feat_map_sz = ARRAY_SIZE(m), \ } /* * Specialised version of the above for FGT registers that have their - * RES0 masks described as struct fgt_masks. + * RESx masks described as struct fgt_masks. */ #define DECLARE_FEAT_MAP_FGT(n, msk, m, f) \ struct reg_feat_map_desc n = { \ .name = #msk, \ - .feat_map = NEEDS_FEAT_RES0(&msk.res0, f),\ + .feat_map = NEEDS_FEAT_MASKS(&msk, f), \ .bit_feat_map = m, \ .bit_feat_map_sz = ARRAY_SIZE(m), \ } @@ -1168,21 +1169,21 @@ static const DECLARE_FEAT_MAP(mdcr_el2_desc, MDCR_EL2, mdcr_el2_feat_map, FEAT_AA64EL2); static void __init check_feat_map(const struct reg_bits_to_feat_map *map, - int map_size, u64 res0, const char *str) + int map_size, u64 resx, const char *str) { u64 mask = 0; for (int i = 0; i < map_size; i++) mask |= map[i].bits; - if (mask != ~res0) + if (mask != ~resx) kvm_err("Undefined %s behaviour, bits %016llx\n", - str, mask ^ ~res0); + str, mask ^ ~resx); } static u64 reg_feat_map_bits(const struct reg_bits_to_feat_map *map) { - return map->flags & RES0_POINTER ? ~(*map->res0p) : map->bits; + return map->flags & MASKS_POINTER ? 
(map->masks->mask | map->masks->nmask) : map->bits; } static void __init check_reg_desc(const struct reg_feat_map_desc *r) diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 834f13fb1fb7..75d49f83342a 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -2105,23 +2105,24 @@ static u32 encoding_next(u32 encoding) } #define FGT_MASKS(__n, __m) \ - struct fgt_masks __n = { .str = #__m, .res0 = __m, } + struct fgt_masks __n = { .str = #__m, .res0 = __m ## _RES0, .res1 = __m ## _RES1 } -FGT_MASKS(hfgrtr_masks, HFGRTR_EL2_RES0); -FGT_MASKS(hfgwtr_masks, HFGWTR_EL2_RES0); -FGT_MASKS(hfgitr_masks, HFGITR_EL2_RES0); -FGT_MASKS(hdfgrtr_masks, HDFGRTR_EL2_RES0); -FGT_MASKS(hdfgwtr_masks, HDFGWTR_EL2_RES0); -FGT_MASKS(hafgrtr_masks, HAFGRTR_EL2_RES0); -FGT_MASKS(hfgrtr2_masks, HFGRTR2_EL2_RES0); -FGT_MASKS(hfgwtr2_masks, HFGWTR2_EL2_RES0); -FGT_MASKS(hfgitr2_masks, HFGITR2_EL2_RES0); -FGT_MASKS(hdfgrtr2_masks, HDFGRTR2_EL2_RES0); -FGT_MASKS(hdfgwtr2_masks, HDFGWTR2_EL2_RES0); +FGT_MASKS(hfgrtr_masks, HFGRTR_EL2); +FGT_MASKS(hfgwtr_masks, HFGWTR_EL2); +FGT_MASKS(hfgitr_masks, HFGITR_EL2); +FGT_MASKS(hdfgrtr_masks, HDFGRTR_EL2); +FGT_MASKS(hdfgwtr_masks, HDFGWTR_EL2); +FGT_MASKS(hafgrtr_masks, HAFGRTR_EL2); +FGT_MASKS(hfgrtr2_masks, HFGRTR2_EL2); +FGT_MASKS(hfgwtr2_masks, HFGWTR2_EL2); +FGT_MASKS(hfgitr2_masks, HFGITR2_EL2); +FGT_MASKS(hdfgrtr2_masks, HDFGRTR2_EL2); +FGT_MASKS(hdfgwtr2_masks, HDFGWTR2_EL2); static __init bool aggregate_fgt(union trap_config tc) { struct fgt_masks *rmasks, *wmasks; + u64 rresx, wresx; switch (tc.fgt) { case HFGRTR_GROUP: @@ -2154,24 +2155,27 @@ static __init bool aggregate_fgt(union trap_config tc) break; } + rresx = rmasks->res0 | rmasks->res1; + if (wmasks) + wresx = wmasks->res0 | wmasks->res1; + /* * A bit can be reserved in either the R or W register, but * not both. 
*/ - if ((BIT(tc.bit) & rmasks->res0) && - (!wmasks || (BIT(tc.bit) & wmasks->res0))) + if ((BIT(tc.bit) & rresx) && (!wmasks || (BIT(tc.bit) & wresx))) return false; if (tc.pol) - rmasks->mask |= BIT(tc.bit) & ~rmasks->res0; + rmasks->mask |= BIT(tc.bit) & ~rresx; else - rmasks->nmask |= BIT(tc.bit) & ~rmasks->res0; + rmasks->nmask |= BIT(tc.bit) & ~rresx; if (wmasks) { if (tc.pol) - wmasks->mask |= BIT(tc.bit) & ~wmasks->res0; + wmasks->mask |= BIT(tc.bit) & ~wresx; else - wmasks->nmask |= BIT(tc.bit) & ~wmasks->res0; + wmasks->nmask |= BIT(tc.bit) & ~wresx; } return true; @@ -2180,7 +2184,6 @@ static __init bool aggregate_fgt(union trap_config tc) static __init int check_fgt_masks(struct fgt_masks *masks) { unsigned long duplicate = masks->mask & masks->nmask; - u64 res0 = masks->res0; int ret = 0; if (duplicate) { @@ -2194,10 +2197,14 @@ static __init int check_fgt_masks(struct fgt_masks *masks) ret = -EINVAL; } - masks->res0 = ~(masks->mask | masks->nmask); - if (masks->res0 != res0) - kvm_info("Implicit %s = %016llx, expecting %016llx\n", - masks->str, masks->res0, res0); + if ((masks->res0 | masks->res1 | masks->mask | masks->nmask) != GENMASK(63, 0) || + (masks->res0 & masks->res1) || (masks->res0 & masks->mask) || + (masks->res0 & masks->nmask) || (masks->res1 & masks->mask) || + (masks->res1 & masks->nmask) || (masks->mask & masks->nmask)) { + kvm_info("Inconsistent masks for %s (%016llx, %016llx, %016llx, %016llx)\n", + masks->str, masks->res0, masks->res1, masks->mask, masks->nmask); + masks->res0 = ~(masks->res1 | masks->mask | masks->nmask); + } return ret; } From 9d2de51825598fc8e76f596c16368362753d8021 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 10 Dec 2025 17:30:23 +0000 Subject: [PATCH 118/267] KVM: arm64: Convert VTCR_EL2 to config-driven sanitisation Describe all the VTCR_EL2 fields and their respective configurations, making sure that we correctly ignore the bits that are not defined for a given guest configuration. 
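The net effect on sanitisation can be summed up in one line; the helper below is a sketch of the semantics only (assumed from how res0/res1 masks are applied), not code from the patch:

    #include <stdint.h>

    /*
     * Once res0/res1 have been derived from the feature map, sanitising a
     * guest-written VTCR_EL2 value amounts to forcing RES0 bits to 0 and
     * RES1 bits to 1.
     */
    static uint64_t sanitise_vtcr(uint64_t val, uint64_t res0, uint64_t res1)
    {
            return (val & ~res0) | res1;
    }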
Reviewed-by: Alexandru Elisei Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20251210173024.561160-6-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/config.c | 69 +++++++++++++++++++++++++++++++++++++++++ arch/arm64/kvm/nested.c | 3 +- 2 files changed, 70 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c index 3845b188551b..9c04f895d376 100644 --- a/arch/arm64/kvm/config.c +++ b/arch/arm64/kvm/config.c @@ -141,6 +141,7 @@ struct reg_feat_map_desc { #define FEAT_AA64EL1 ID_AA64PFR0_EL1, EL1, IMP #define FEAT_AA64EL2 ID_AA64PFR0_EL1, EL2, IMP #define FEAT_AA64EL3 ID_AA64PFR0_EL1, EL3, IMP +#define FEAT_SEL2 ID_AA64PFR0_EL1, SEL2, IMP #define FEAT_AIE ID_AA64MMFR3_EL1, AIE, IMP #define FEAT_S2POE ID_AA64MMFR3_EL1, S2POE, IMP #define FEAT_S1POE ID_AA64MMFR3_EL1, S1POE, IMP @@ -202,6 +203,8 @@ struct reg_feat_map_desc { #define FEAT_ASID2 ID_AA64MMFR4_EL1, ASID2, IMP #define FEAT_MEC ID_AA64MMFR3_EL1, MEC, IMP #define FEAT_HAFT ID_AA64MMFR1_EL1, HAFDBS, HAFT +#define FEAT_HDBSS ID_AA64MMFR1_EL1, HAFDBS, HDBSS +#define FEAT_HPDS2 ID_AA64MMFR1_EL1, HPDS, HPDS2 #define FEAT_BTI ID_AA64PFR1_EL1, BT, IMP #define FEAT_ExS ID_AA64MMFR0_EL1, EXS, IMP #define FEAT_IESB ID_AA64MMFR2_EL1, IESB, IMP @@ -219,6 +222,7 @@ struct reg_feat_map_desc { #define FEAT_FGT2 ID_AA64MMFR0_EL1, FGT, FGT2 #define FEAT_MTPMU ID_AA64DFR0_EL1, MTPMU, IMP #define FEAT_HCX ID_AA64MMFR1_EL1, HCX, IMP +#define FEAT_S2PIE ID_AA64MMFR3_EL1, S2PIE, IMP static bool not_feat_aa64el3(struct kvm *kvm) { @@ -362,6 +366,28 @@ static bool feat_pmuv3p9(struct kvm *kvm) return check_pmu_revision(kvm, V3P9); } +#define has_feat_s2tgran(k, s) \ + ((kvm_has_feat_enum(kvm, ID_AA64MMFR0_EL1, TGRAN##s##_2, TGRAN##s) && \ + kvm_has_feat(kvm, ID_AA64MMFR0_EL1, TGRAN##s, IMP)) || \ + kvm_has_feat(kvm, ID_AA64MMFR0_EL1, TGRAN##s##_2, IMP)) + +static bool feat_lpa2(struct kvm *kvm) +{ + return ((kvm_has_feat(kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT) || + !kvm_has_feat(kvm, ID_AA64MMFR0_EL1, TGRAN4, IMP)) && + (kvm_has_feat(kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT) || + !kvm_has_feat(kvm, ID_AA64MMFR0_EL1, TGRAN16, IMP)) && + (kvm_has_feat(kvm, ID_AA64MMFR0_EL1, TGRAN4_2, 52_BIT) || + !has_feat_s2tgran(kvm, 4)) && + (kvm_has_feat(kvm, ID_AA64MMFR0_EL1, TGRAN16_2, 52_BIT) || + !has_feat_s2tgran(kvm, 16))); +} + +static bool feat_vmid16(struct kvm *kvm) +{ + return kvm_has_feat_enum(kvm, ID_AA64MMFR1_EL1, VMIDBits, 16); +} + static bool compute_hcr_rw(struct kvm *kvm, u64 *bits) { /* This is purely academic: AArch32 and NV are mutually exclusive */ @@ -1168,6 +1194,44 @@ static const struct reg_bits_to_feat_map mdcr_el2_feat_map[] = { static const DECLARE_FEAT_MAP(mdcr_el2_desc, MDCR_EL2, mdcr_el2_feat_map, FEAT_AA64EL2); +static const struct reg_bits_to_feat_map vtcr_el2_feat_map[] = { + NEEDS_FEAT(VTCR_EL2_HDBSS, FEAT_HDBSS), + NEEDS_FEAT(VTCR_EL2_HAFT, FEAT_HAFT), + NEEDS_FEAT(VTCR_EL2_TL0 | + VTCR_EL2_TL1 | + VTCR_EL2_AssuredOnly | + VTCR_EL2_GCSH, + FEAT_THE), + NEEDS_FEAT(VTCR_EL2_D128, FEAT_D128), + NEEDS_FEAT(VTCR_EL2_S2POE, FEAT_S2POE), + NEEDS_FEAT(VTCR_EL2_S2PIE, FEAT_S2PIE), + NEEDS_FEAT(VTCR_EL2_SL2 | + VTCR_EL2_DS, + feat_lpa2), + NEEDS_FEAT(VTCR_EL2_NSA | + VTCR_EL2_NSW, + FEAT_SEL2), + NEEDS_FEAT(VTCR_EL2_HWU62 | + VTCR_EL2_HWU61 | + VTCR_EL2_HWU60 | + VTCR_EL2_HWU59, + FEAT_HPDS2), + NEEDS_FEAT(VTCR_EL2_HD, ID_AA64MMFR1_EL1, HAFDBS, DBM), + NEEDS_FEAT(VTCR_EL2_HA, ID_AA64MMFR1_EL1, HAFDBS, AF), + NEEDS_FEAT(VTCR_EL2_VS, feat_vmid16), + 
NEEDS_FEAT(VTCR_EL2_PS | + VTCR_EL2_TG0 | + VTCR_EL2_SH0 | + VTCR_EL2_ORGN0 | + VTCR_EL2_IRGN0 | + VTCR_EL2_SL0 | + VTCR_EL2_T0SZ, + FEAT_AA64EL1), +}; + +static const DECLARE_FEAT_MAP(vtcr_el2_desc, VTCR_EL2, + vtcr_el2_feat_map, FEAT_AA64EL2); + static void __init check_feat_map(const struct reg_bits_to_feat_map *map, int map_size, u64 resx, const char *str) { @@ -1211,6 +1275,7 @@ void __init check_feature_map(void) check_reg_desc(&tcr2_el2_desc); check_reg_desc(&sctlr_el1_desc); check_reg_desc(&mdcr_el2_desc); + check_reg_desc(&vtcr_el2_desc); } static bool idreg_feat_match(struct kvm *kvm, const struct reg_bits_to_feat_map *map) @@ -1425,6 +1490,10 @@ void get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg, u64 *res0, u64 *r *res0 = compute_reg_res0_bits(kvm, &mdcr_el2_desc, 0, 0); *res1 = MDCR_EL2_RES1; break; + case VTCR_EL2: + *res0 = compute_reg_res0_bits(kvm, &vtcr_el2_desc, 0, 0); + *res1 = VTCR_EL2_RES1; + break; default: WARN_ON_ONCE(1); *res0 = *res1 = 0; diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 522fa3cc9578..486eba72bb02 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1719,8 +1719,7 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu) set_sysreg_masks(kvm, VTTBR_EL2, res0, res1); /* VTCR_EL2 */ - res0 = GENMASK(63, 32) | GENMASK(30, 20); - res1 = BIT(31); + get_reg_fixed_bits(kvm, VTCR_EL2, &res0, &res1); set_sysreg_masks(kvm, VTCR_EL2, res0, res1); /* VMPIDR_EL2 */ From 80cbfd7174f31010982f065e8ae73bf337992105 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 10 Dec 2025 17:30:24 +0000 Subject: [PATCH 119/267] KVM: arm64: Honor UX/PX attributes for EL2 S1 mappings Now that we potentially have two bits to deal with when setting execution permissions, make sure we correctly handle them when both when building the page tables and when reading back from them. Reported-by: Alexandru Elisei Reviewed-by: Fuad Tabba Reviewed-by: Joey Gouly Tested-by: Fuad Tabba Link: https://patch.msgid.link/20251210173024.561160-7-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_pgtable.h | 12 +++--------- arch/arm64/kvm/hyp/pgtable.c | 24 +++++++++++++++++++++--- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index c0ad262a8289..56bd08be9630 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -87,15 +87,9 @@ typedef u64 kvm_pte_t; #define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) -#define __KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) -#define __KVM_PTE_LEAF_ATTR_HI_S1_UXN BIT(54) -#define __KVM_PTE_LEAF_ATTR_HI_S1_PXN BIT(53) - -#define KVM_PTE_LEAF_ATTR_HI_S1_XN \ - ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 
\
- (__KVM_PTE_LEAF_ATTR_HI_S1_UXN | \
- __KVM_PTE_LEAF_ATTR_HI_S1_PXN) : \
- __KVM_PTE_LEAF_ATTR_HI_S1_XN; })
+#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54)
+#define KVM_PTE_LEAF_ATTR_HI_S1_UXN BIT(54)
+#define KVM_PTE_LEAF_ATTR_HI_S1_PXN BIT(53)
 #define KVM_PTE_LEAF_ATTR_HI_S2_XN GENMASK(54, 53) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index a99a7d6e7d0c..13e34d66b6a7 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -342,6 +342,9 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) if (!(prot & KVM_PGTABLE_PROT_R)) return -EINVAL;
+ if (!cpus_have_final_cap(ARM64_KVM_HVHE))
+ prot &= ~KVM_PGTABLE_PROT_UX;
+
 if (prot & KVM_PGTABLE_PROT_X) { if (prot & KVM_PGTABLE_PROT_W) return -EINVAL; @@ -351,8 +354,16 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) if (system_supports_bti_kernel()) attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP;
+ }
+
+ if (cpus_have_final_cap(ARM64_KVM_HVHE)) {
+ if (!(prot & KVM_PGTABLE_PROT_PX))
+ attr |= KVM_PTE_LEAF_ATTR_HI_S1_PXN;
+ if (!(prot & KVM_PGTABLE_PROT_UX))
+ attr |= KVM_PTE_LEAF_ATTR_HI_S1_UXN;
 } else {
- attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
+ if (!(prot & KVM_PGTABLE_PROT_PX))
+ attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
 } attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap); @@ -373,8 +384,15 @@ enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte) if (!kvm_pte_valid(pte)) return prot;
- if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN))
- prot |= KVM_PGTABLE_PROT_X;
+ if (cpus_have_final_cap(ARM64_KVM_HVHE)) {
+ if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_PXN))
+ prot |= KVM_PGTABLE_PROT_PX;
+ if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_UXN))
+ prot |= KVM_PGTABLE_PROT_UX;
+ } else {
+ if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN))
+ prot |= KVM_PGTABLE_PROT_PX;
+ }
 ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte); if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO) From 4a7fe842b8a3f3c173c3075f03c60c3f9f62e299 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 8 Jan 2026 17:32:25 +0000 Subject: [PATCH 120/267] arm64: Repaint ID_AA64MMFR2_EL1.IDS description ID_AA64MMFR2_EL1.IDS, as described in the sysreg file, is pretty horrible as it directly gives the ESR value. Repaint it using the usual NI/IMP identifiers to describe the absence/presence of FEAT_IDST. Also add the new EL3 routing feature, even if we really don't care about it.
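With the field expressed as an UnsignedEnum, a FEAT_IDST check reduces to an unsigned comparison against IMP, which is what the kvm_has_feat(..., ID_AA64MMFR2_EL1, IDS, IMP) calls later in the series rely on. A standalone sketch of the decode (bit positions taken from the sysreg description above; the helper name is illustrative):

    #include <stdbool.h>
    #include <stdint.h>

    /* IDS lives in ID_AA64MMFR2_EL1[39:36]: 0b0000 NI, 0b0001 IMP, 0b0010 EL3 */
    static bool has_feat_idst(uint64_t id_aa64mmfr2)
    {
            return ((id_aa64mmfr2 >> 36) & 0xf) >= 1;
    }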
Reviewed-by: Joey Gouly Link: https://patch.msgid.link/20260108173233.2911955-2-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/sys_regs.c | 2 +- arch/arm64/tools/sysreg | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 3108b5185c20..4db0562f5bfa 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -134,7 +134,7 @@ static const struct pvm_ftr_bits pvmid_aa64mmfr2[] = { MAX_FEAT(ID_AA64MMFR2_EL1, UAO, IMP), MAX_FEAT(ID_AA64MMFR2_EL1, IESB, IMP), MAX_FEAT(ID_AA64MMFR2_EL1, AT, IMP), - MAX_FEAT_ENUM(ID_AA64MMFR2_EL1, IDS, 0x18), + MAX_FEAT(ID_AA64MMFR2_EL1, IDS, IMP), MAX_FEAT(ID_AA64MMFR2_EL1, TTL, IMP), MAX_FEAT(ID_AA64MMFR2_EL1, BBM, 2), MAX_FEAT(ID_AA64MMFR2_EL1, E0PD, IMP), diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 8921b51866d6..d0ddfd572b89 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2256,9 +2256,10 @@ UnsignedEnum 43:40 FWB 0b0000 NI 0b0001 IMP EndEnum -Enum 39:36 IDS - 0b0000 0x0 - 0b0001 0x18 +UnsignedEnum 39:36 IDS + 0b0000 NI + 0b0001 IMP + 0b0010 EL3 EndEnum UnsignedEnum 35:32 AT 0b0000 NI From 1ad9767accfcb81f404aa3d37d46b3eb494dce2f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 8 Jan 2026 17:32:26 +0000 Subject: [PATCH 121/267] KVM: arm64: Add trap routing for GMID_EL1 HCR_EL2.TID5 is currently ignored by the trap routing infrastructure. Wire it in the routing table so that GMID_EL1, the sole register trapped by this bit, is correctly handled in the NV case. Link: https://patch.msgid.link/20260108173233.2911955-3-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/emulate-nested.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 834f13fb1fb7..616eb6ad6870 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -70,6 +70,7 @@ enum cgt_group_id { CGT_HCR_ENSCXT, CGT_HCR_TTLBIS, CGT_HCR_TTLBOS, + CGT_HCR_TID5, CGT_MDCR_TPMCR, CGT_MDCR_TPM, @@ -308,6 +309,12 @@ static const struct trap_bits coarse_trap_bits[] = { .mask = HCR_TTLBOS, .behaviour = BEHAVE_FORWARD_RW, }, + [CGT_HCR_TID5] = { + .index = HCR_EL2, + .value = HCR_TID5, + .mask = HCR_TID5, + .behaviour = BEHAVE_FORWARD_RW, + }, [CGT_MDCR_TPMCR] = { .index = MDCR_EL2, .value = MDCR_EL2_TPMCR, @@ -665,6 +672,7 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_CCSIDR2_EL1, CGT_HCR_TID2_TID4), SR_TRAP(SYS_CLIDR_EL1, CGT_HCR_TID2_TID4), SR_TRAP(SYS_CSSELR_EL1, CGT_HCR_TID2_TID4), + SR_TRAP(SYS_GMID_EL1, CGT_HCR_TID5), SR_RANGE_TRAP(SYS_ID_PFR0_EL1, sys_reg(3, 0, 0, 7, 7), CGT_HCR_TID3), SR_TRAP(SYS_ICC_SGI0R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), From 19f75678238734ef383f9e10d8e1020873e97170 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 8 Jan 2026 17:32:27 +0000 Subject: [PATCH 122/267] KVM: arm64: Add a generic synchronous exception injection primitive Maybe in a surprising way, we don't currently have a generic way to inject a synchronous exception at the EL the vcpu is currently running at. Extract such primitive from the UNDEF injection code. 
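For context, this is how the new primitive gets used by the FEAT_IDST handling introduced later in this series (a caller-side sketch mirroring that patch, not additional code here):

    /*
     * Reflect a trapped feature-ID-space access back to the guest with its
     * original EC=0x18 syndrome, or UNDEF it when FEAT_IDST is not exposed.
     */
    if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, IDS, IMP))
            kvm_inject_sync(vcpu, kvm_vcpu_get_esr(vcpu));
    else
            kvm_inject_undefined(vcpu);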
Reviewed-by: Ben Horgan Reviewed-by: Yuan Yao Link: https://patch.msgid.link/20260108173233.2911955-4-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 1 + arch/arm64/kvm/inject_fault.c | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index c9eab316398e..df20d47f0d25 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -45,6 +45,7 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu); void kvm_skip_instr32(struct kvm_vcpu *vcpu); void kvm_inject_undefined(struct kvm_vcpu *vcpu);
+void kvm_inject_sync(struct kvm_vcpu *vcpu, u64 esr);
 int kvm_inject_serror_esr(struct kvm_vcpu *vcpu, u64 esr); int kvm_inject_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr); void kvm_inject_size_fault(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index dfcd66c65517..7102424a3fa5 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -162,12 +162,16 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu)); }
+void kvm_inject_sync(struct kvm_vcpu *vcpu, u64 esr)
+{
+ pend_sync_exception(vcpu);
+ vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu));
+}
+
 static void inject_undef64(struct kvm_vcpu *vcpu) { u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT);
- pend_sync_exception(vcpu);
-
 /* * Build an unknown exception, depending on the instruction * set. @@ -175,7 +179,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu) if (kvm_vcpu_trap_il_is32bit(vcpu)) esr |= ESR_ELx_IL;
- vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu));
+ kvm_inject_sync(vcpu, esr);
 } #define DFSR_FSC_EXTABT_LPAE 0x10 From d78a14decd494caf72ea0144624621e7e43ae451 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 8 Jan 2026 17:32:28 +0000 Subject: [PATCH 123/267] KVM: arm64: Handle FEAT_IDST for sysregs without specific handlers Add a bit of infrastructure to triage_sysreg_trap() to handle the case of registers falling into the Feature ID space that do not have a local handler. For these, we can directly apply the FEAT_IDST semantics and inject an EC=0x18 exception. Otherwise, an UNDEF will do. Reviewed-by: Joey Gouly Reviewed-by: Yuan Yao Link: https://patch.msgid.link/20260108173233.2911955-5-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/emulate-nested.c | 13 +++++++++++++ arch/arm64/kvm/sys_regs.h | 10 ++++++++++ 2 files changed, 23 insertions(+) diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 616eb6ad6870..4aabd624c4be 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -2588,6 +2588,19 @@ local: params = esr_sys64_to_params(esr);
+ /*
+ * This implements the pseudocode UnimplementedIDRegister()
+ * helper for the purpose of dealing with FEAT_IDST.
+ */ + if (in_feat_id_space(¶ms)) { + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, IDS, IMP)) + kvm_inject_sync(vcpu, kvm_vcpu_get_esr(vcpu)); + else + kvm_inject_undefined(vcpu); + + return true; + } + /* * Check for the IMPDEF range, as per DDI0487 J.a, * D18.3.2 Reserved encodings for IMPLEMENTATION diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index b3f904472fac..2a983664220c 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -49,6 +49,16 @@ struct sys_reg_params { .Op2 = ((esr) >> 17) & 0x7, \ .is_write = !((esr) & 1) }) +/* + * The Feature ID space is defined as the System register space in AArch64 + * with op0==3, op1=={0, 1, 3}, CRn==0, CRm=={0-7}, op2=={0-7}. + */ +static inline bool in_feat_id_space(struct sys_reg_params *p) +{ + return (p->Op0 == 3 && !(p->Op1 & 0b100) && p->Op1 != 2 && + p->CRn == 0 && !(p->CRm & 0b1000)); +} + struct sys_reg_desc { /* Sysreg string for debug */ const char *name; From f07ef1bef67ca08799df262cc901971ac274783d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 8 Jan 2026 17:32:29 +0000 Subject: [PATCH 124/267] KVM: arm64: Handle CSSIDR2_EL1 and SMIDR_EL1 in a generic way Now that we can handle ID registers using the FEAT_IDST infrastrcuture, get rid of the handling of CSSIDR2_EL1 and SMIDR_EL1. Reviewed-by: Yuan Yao Link: https://patch.msgid.link/20260108173233.2911955-6-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c8fd7c6a12a1..a2b14ca2a702 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -3414,8 +3414,6 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_CCSIDR_EL1), access_ccsidr }, { SYS_DESC(SYS_CLIDR_EL1), access_clidr, reset_clidr, CLIDR_EL1, .set_user = set_clidr, .val = ~CLIDR_EL1_RES0 }, - { SYS_DESC(SYS_CCSIDR2_EL1), undef_access }, - { SYS_DESC(SYS_SMIDR_EL1), undef_access }, IMPLEMENTATION_ID(AIDR_EL1, GENMASK_ULL(63, 0)), { SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 }, ID_FILTERED(CTR_EL0, ctr_el0, From 70a5ce4efc0e1194718aad6f26332c99e6a119db Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 8 Jan 2026 17:32:30 +0000 Subject: [PATCH 125/267] KVM: arm64: Force trap of GMID_EL1 when the guest doesn't have MTE If our host has MTE, but the guest doesn't, make sure we set HCR_EL2.TID5 to force GMID_EL1 being trapped. Such trap will be handled by the FEAT_IDST handling. Reviewed-by: Joey Gouly Reviewed-by: Yuan Yao Link: https://patch.msgid.link/20260108173233.2911955-7-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index a2b14ca2a702..3de206b49484 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -5576,6 +5576,8 @@ static void vcpu_set_hcr(struct kvm_vcpu *vcpu) if (kvm_has_mte(vcpu->kvm)) vcpu->arch.hcr_el2 |= HCR_ATA; + else + vcpu->arch.hcr_el2 |= HCR_TID5; /* * In the absence of FGT, we cannot independently trap TLBI From e5d40a5a97c1d57e89aa5f324734065c6580b436 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 8 Jan 2026 17:32:31 +0000 Subject: [PATCH 126/267] KVM: arm64: pkvm: Add a generic synchronous exception injection primitive Similarly to the "classic" KVM code, pKVM doesn't have an "anything goes" synchronous exception injection primitive. Carve one out of the UNDEF injection code. 
Link: https://patch.msgid.link/20260108173233.2911955-8-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/sys_regs.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 4db0562f5bfa..b197f3eb272c 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -243,16 +243,15 @@ static u64 pvm_calc_id_reg(const struct kvm_vcpu *vcpu, u32 id) } } -/* - * Inject an unknown/undefined exception to an AArch64 guest while most of its - * sysregs are live. - */ -static void inject_undef64(struct kvm_vcpu *vcpu) +static void inject_sync64(struct kvm_vcpu *vcpu, u64 esr) { - u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); - *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); + + /* + * Make sure we have the latest update to VBAR_EL1, as pKVM + * handles traps very early, before sysregs are resync'ed + */ __vcpu_assign_sys_reg(vcpu, VBAR_EL1, read_sysreg_el1(SYS_VBAR)); kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); @@ -265,6 +264,15 @@ static void inject_undef64(struct kvm_vcpu *vcpu) write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); } +/* + * Inject an unknown/undefined exception to an AArch64 guest while most of its + * sysregs are live. + */ +static void inject_undef64(struct kvm_vcpu *vcpu) +{ + inject_sync64(vcpu, (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT)); +} + static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r) { From 592dc2c020686536dae1c427c78cf558a3df4414 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 8 Jan 2026 17:32:32 +0000 Subject: [PATCH 127/267] KVM: arm64: pkvm: Report optional ID register traps with a 0x18 syndrome With FEAT_IDST, unimplemented system registers in the feature ID space must be reported using EC=0x18 at the closest handling EL, rather than with an UNDEF. Most of these system registers are always implemented thanks to their dependency on FEAT_AA64, except for a set of (currently) three registers: GMID_EL1 (depending on MTE2), CCSIDR2_EL1 (depending on FEAT_CCIDX), and SMIDR_EL1 (depending on SME). For these three registers, report their trap as EC=0x18 if they end-up trapping into KVM and that FEAT_IDST is implemented in the guest. Otherwise, just make them UNDEF. Link: https://patch.msgid.link/20260108173233.2911955-9-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/sys_regs.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index b197f3eb272c..06d28621722e 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -347,6 +347,18 @@ static bool pvm_gic_read_sre(struct kvm_vcpu *vcpu, return true; } +static bool pvm_idst_access(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, IDS, IMP)) + inject_sync64(vcpu, kvm_vcpu_get_esr(vcpu)); + else + inject_undef64(vcpu); + + return false; +} + /* Mark the specified system register as an AArch32 feature id register. 
*/
 #define AARCH32(REG) { SYS_DESC(REG), .access = pvm_access_id_aarch32 } @@ -477,6 +489,9 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { HOST_HANDLED(SYS_CCSIDR_EL1), HOST_HANDLED(SYS_CLIDR_EL1),
+ { SYS_DESC(SYS_CCSIDR2_EL1), .access = pvm_idst_access },
+ { SYS_DESC(SYS_GMID_EL1), .access = pvm_idst_access },
+ { SYS_DESC(SYS_SMIDR_EL1), .access = pvm_idst_access },
 HOST_HANDLED(SYS_AIDR_EL1), HOST_HANDLED(SYS_CSSELR_EL1), HOST_HANDLED(SYS_CTR_EL0), From b638a9d0f8965b98403022cb91d8f3b31170eb35 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 8 Jan 2026 17:32:33 +0000 Subject: [PATCH 128/267] KVM: arm64: selftests: Add a test for FEAT_IDST Add a very basic test checking that FEAT_IDST actually works for the {GMID,SMIDR,CCSIDR2}_EL1 registers. Link: https://patch.msgid.link/20260108173233.2911955-10-maz@kernel.org Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/Makefile.kvm | 1 + .../testing/selftests/kvm/arm64/idreg-idst.c | 117 ++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 tools/testing/selftests/kvm/arm64/idreg-idst.c diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index ba5c2b643efa..bfa2fad0aba1 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -175,6 +175,7 @@ TEST_GEN_PROGS_arm64 += arm64/vgic_irq TEST_GEN_PROGS_arm64 += arm64/vgic_lpi_stress TEST_GEN_PROGS_arm64 += arm64/vpmu_counter_access TEST_GEN_PROGS_arm64 += arm64/no-vgic-v3
+TEST_GEN_PROGS_arm64 += arm64/idreg-idst
 TEST_GEN_PROGS_arm64 += arm64/kvm-uuid TEST_GEN_PROGS_arm64 += access_tracking_perf_test TEST_GEN_PROGS_arm64 += arch_timer diff --git a/tools/testing/selftests/kvm/arm64/idreg-idst.c b/tools/testing/selftests/kvm/arm64/idreg-idst.c new file mode 100644 index 000000000000..9ca9f125abdb --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/idreg-idst.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Access all FEAT_IDST-handled registers that depend on more than + * just FEAT_AA64, and fail if we don't get a trap with an 0x18 EC.
+ */ + +#include +#include +#include + +static volatile bool sys64, undef; + +#define __check_sr_read(r) \ + ({ \ + uint64_t val; \ + \ + sys64 = false; \ + undef = false; \ + dsb(sy); \ + val = read_sysreg_s(SYS_ ## r); \ + val; \ + }) + +/* Fatal checks */ +#define check_sr_read(r) \ + do { \ + __check_sr_read(r); \ + __GUEST_ASSERT(!undef, #r " unexpected UNDEF"); \ + __GUEST_ASSERT(sys64, #r " didn't trap"); \ + } while(0) + + +static void guest_code(void) +{ + check_sr_read(CCSIDR2_EL1); + check_sr_read(SMIDR_EL1); + check_sr_read(GMID_EL1); + + GUEST_DONE(); +} + +static void guest_sys64_handler(struct ex_regs *regs) +{ + sys64 = true; + undef = false; + regs->pc += 4; +} + +static void guest_undef_handler(struct ex_regs *regs) +{ + sys64 = false; + undef = true; + regs->pc += 4; +} + +static void test_run_vcpu(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + do { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_PRINTF: + printf("%s", uc.buffer); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } while (uc.cmd != UCALL_DONE); +} + +static void test_guest_feat_idst(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + /* This VM has no MTE, no SME, no CCIDX */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vcpu); + + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_ELx_EC_SYS64, guest_sys64_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_ELx_EC_UNKNOWN, guest_undef_handler); + + test_run_vcpu(vcpu); + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + uint64_t mmfr2; + + test_disable_default_vgic(); + + vm = vm_create_with_one_vcpu(&vcpu, NULL); + mmfr2 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64MMFR2_EL1)); + __TEST_REQUIRE(FIELD_GET(ID_AA64MMFR2_EL1_IDS, mmfr2) > 0, + "FEAT_IDST not supported"); + kvm_vm_free(vm); + + test_guest_feat_idst(); + + return 0; +} From 7e03d07d03a486c66d5c084c7185b1bef29049e9 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 9 Jan 2026 08:22:14 +0000 Subject: [PATCH 129/267] KVM: arm64: selftests: Disable unused TTBR1_EL1 translations KVM selftests map all guest code and data into the lower virtual address range (0x0000...) managed by TTBR0_EL1. The upper range (0xFFFF...) managed by TTBR1_EL1 is unused and uninitialized. If a guest accesses the upper range, the MMU attempts a translation table walk using uninitialized registers, leading to unpredictable behavior. Set `TCR_EL1.EPD1` to disable translation table walks for TTBR1_EL1, ensuring that any access to the upper range generates an immediate Translation Fault. Additionally, set `TCR_EL1.TBI1` (Top Byte Ignore) to ensure that tagged pointers in the upper range also deterministically trigger a Translation Fault via EPD1. Define `TCR_EPD1_MASK`, `TCR_EPD1_SHIFT`, and `TCR_TBI1` in `processor.h` to support this configuration. These are based on their definitions in `arch/arm64/include/asm/pgtable-hwdef.h`. 
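A small standalone illustration of why TBI1 keeps the failure mode deterministic (the helper is illustrative only, based on the architectural rule that, with Top Byte Ignore, VA bit 55 alone selects the translation regime): any tagged upper-range pointer still resolves to TTBR1, where EPD1 now forces a Translation Fault.

    #include <stdbool.h>
    #include <stdint.h>

    /* With TBI in effect, bit 55 of the VA picks the regime: 0 -> TTBR0, 1 -> TTBR1 */
    static bool va_uses_ttbr1(uint64_t va)
    {
            return (va >> 55) & 1;
    }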
Suggested-by: Will Deacon Reviewed-by: Itaru Kitayama Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260109082218.3236580-2-tabba@google.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/include/arm64/processor.h | 4 ++++ tools/testing/selftests/kvm/lib/arm64/processor.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/tools/testing/selftests/kvm/include/arm64/processor.h b/tools/testing/selftests/kvm/include/arm64/processor.h index ff928716574d..ac97a1c436fc 100644 --- a/tools/testing/selftests/kvm/include/arm64/processor.h +++ b/tools/testing/selftests/kvm/include/arm64/processor.h @@ -90,6 +90,9 @@ #define TCR_TG0_64K (UL(1) << TCR_TG0_SHIFT) #define TCR_TG0_16K (UL(2) << TCR_TG0_SHIFT) +#define TCR_EPD1_SHIFT 23 +#define TCR_EPD1_MASK (UL(1) << TCR_EPD1_SHIFT) + #define TCR_IPS_SHIFT 32 #define TCR_IPS_MASK (UL(7) << TCR_IPS_SHIFT) #define TCR_IPS_52_BITS (UL(6) << TCR_IPS_SHIFT) @@ -97,6 +100,7 @@ #define TCR_IPS_40_BITS (UL(2) << TCR_IPS_SHIFT) #define TCR_IPS_36_BITS (UL(1) << TCR_IPS_SHIFT) +#define TCR_TBI1 (UL(1) << 38) #define TCR_HA (UL(1) << 39) #define TCR_DS (UL(1) << 59) diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c index d46e4b13b92c..5b379da8cb90 100644 --- a/tools/testing/selftests/kvm/lib/arm64/processor.c +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -384,6 +384,8 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) tcr_el1 |= TCR_IRGN0_WBWA | TCR_ORGN0_WBWA | TCR_SH0_INNER; tcr_el1 |= TCR_T0SZ(vm->va_bits); + tcr_el1 |= TCR_TBI1; + tcr_el1 |= TCR_EPD1_MASK; if (use_lpa2_pte_format(vm)) tcr_el1 |= TCR_DS; From dd0c5d04d13cae8ff2694ef83d1ae5804d6d9798 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 9 Jan 2026 08:22:15 +0000 Subject: [PATCH 130/267] KVM: arm64: selftests: Fix incorrect rounding in page_align() The implementation of `page_align()` in `processor.c` calculates alignment incorrectly for values that are already aligned. Specifically, `(v + vm->page_size) & ~(vm->page_size - 1)` aligns to the *next* page boundary even if `v` is already page-aligned, potentially wasting a page of memory. Fix the calculation to use standard alignment logic: `(v + vm->page_size - 1) & ~(vm->page_size - 1)`. Fixes: 7a6629ef746d ("kvm: selftests: add virt mem support for aarch64") Reviewed-by: Andrew Jones Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260109082218.3236580-3-tabba@google.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/lib/arm64/processor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c index 5b379da8cb90..607a4e462984 100644 --- a/tools/testing/selftests/kvm/lib/arm64/processor.c +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -23,7 +23,7 @@ static vm_vaddr_t exception_handlers; static uint64_t page_align(struct kvm_vm *vm, uint64_t v) { - return (v + vm->page_size) & ~(vm->page_size - 1); + return (v + vm->page_size - 1) & ~(vm->page_size - 1); } static uint64_t pgd_index(struct kvm_vm *vm, vm_vaddr_t gva) From 582b39463f1c0774e0b3cb5be2118e8564b7941e Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 9 Jan 2026 08:22:16 +0000 Subject: [PATCH 131/267] KVM: riscv: selftests: Fix incorrect rounding in page_align() The implementation of `page_align()` in `processor.c` calculates alignment incorrectly for values that are already aligned. 
Specifically, `(v + vm->page_size) & ~(vm->page_size - 1)` aligns to the *next* page boundary even if `v` is already page-aligned, potentially wasting a page of memory. Fix the calculation to use standard alignment logic: `(v + vm->page_size - 1) & ~(vm->page_size - 1)`. Fixes: 3e06cdf10520 ("KVM: selftests: Add initial support for RISC-V 64-bit") Reviewed-by: Andrew Jones Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260109082218.3236580-4-tabba@google.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/lib/riscv/processor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index 2eac7d4b59e9..d5e8747b5e69 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -28,7 +28,7 @@ bool __vcpu_has_ext(struct kvm_vcpu *vcpu, uint64_t ext) static uint64_t page_align(struct kvm_vm *vm, uint64_t v) { - return (v + vm->page_size) & ~(vm->page_size - 1); + return (v + vm->page_size - 1) & ~(vm->page_size - 1); } static uint64_t pte_addr(struct kvm_vm *vm, uint64_t entry) From de00d07321cf3f182762de2308c08062d5b824c0 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 9 Jan 2026 08:22:17 +0000 Subject: [PATCH 132/267] KVM: selftests: Move page_align() to shared header To avoid code duplication, move page_align() to the shared `kvm_util.h` header file. Rename it to vm_page_align(), to make it clear that the alignment is done with respect to the guest's base page size. No functional change intended. Reviewed-by: Andrew Jones Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260109082218.3236580-5-tabba@google.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/include/kvm_util.h | 5 +++++ tools/testing/selftests/kvm/lib/arm64/processor.c | 7 +------ tools/testing/selftests/kvm/lib/riscv/processor.c | 7 +------ 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 81f4355ff28a..747effa614f1 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -1258,6 +1258,11 @@ static inline int __vm_disable_nx_huge_pages(struct kvm_vm *vm) return __vm_enable_cap(vm, KVM_CAP_VM_DISABLE_NX_HUGE_PAGES, 0); } +static inline uint64_t vm_page_align(struct kvm_vm *vm, uint64_t v) +{ + return (v + vm->page_size - 1) & ~(vm->page_size - 1); +} + /* * Arch hook that is invoked via a constructor, i.e. before exeucting main(), * to allow for arch-specific setup that is common to all tests, e.g. 
computing diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c index 607a4e462984..1605dc740d1e 100644 --- a/tools/testing/selftests/kvm/lib/arm64/processor.c +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -21,11 +21,6 @@ static vm_vaddr_t exception_handlers; -static uint64_t page_align(struct kvm_vm *vm, uint64_t v) -{ - return (v + vm->page_size - 1) & ~(vm->page_size - 1); -} - static uint64_t pgd_index(struct kvm_vm *vm, vm_vaddr_t gva) { unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; @@ -115,7 +110,7 @@ static uint64_t __maybe_unused ptrs_per_pte(struct kvm_vm *vm) void virt_arch_pgd_alloc(struct kvm_vm *vm) { - size_t nr_pages = page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size; + size_t nr_pages = vm_page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size; if (vm->pgd_created) return; diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index d5e8747b5e69..401245fe31db 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -26,11 +26,6 @@ bool __vcpu_has_ext(struct kvm_vcpu *vcpu, uint64_t ext) return !ret && !!value; } -static uint64_t page_align(struct kvm_vm *vm, uint64_t v) -{ - return (v + vm->page_size - 1) & ~(vm->page_size - 1); -} - static uint64_t pte_addr(struct kvm_vm *vm, uint64_t entry) { return ((entry & PGTBL_PTE_ADDR_MASK) >> PGTBL_PTE_ADDR_SHIFT) << @@ -68,7 +63,7 @@ static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva, int level) void virt_arch_pgd_alloc(struct kvm_vm *vm) { - size_t nr_pages = page_align(vm, ptrs_per_pte(vm) * 8) / vm->page_size; + size_t nr_pages = vm_page_align(vm, ptrs_per_pte(vm) * 8) / vm->page_size; if (vm->pgd_created) return; From e0a99a2b72f3c6365d9f4d6943ed45f7fc286b70 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 9 Jan 2026 08:22:18 +0000 Subject: [PATCH 133/267] KVM: selftests: Fix typos and stale comments in kvm_util Fix minor documentation errors in `kvm_util.h` and `kvm_util.c`. - Correct the argument description for `vcpu_args_set` in `kvm_util.h`, which incorrectly listed `vm` instead of `vcpu`. - Fix a typo in the comment for `kvm_selftest_arch_init` ("exeucting" -> "executing"). - Correct the return value description for `vm_vaddr_unused_gap` in `kvm_util.c` to match the implementation, which returns an address "at or above" `vaddr_min`, not "at or below". No functional change intended. Reviewed-by: Andrew Jones Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260109082218.3236580-6-tabba@google.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/include/kvm_util.h | 4 ++-- tools/testing/selftests/kvm/lib/kvm_util.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 747effa614f1..97f9251eb073 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -939,7 +939,7 @@ void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu); * VM VCPU Args Set * * Input Args: - * vm - Virtual Machine + * vcpu - vCPU * num - number of arguments * ... - arguments, each of type uint64_t * @@ -1264,7 +1264,7 @@ static inline uint64_t vm_page_align(struct kvm_vm *vm, uint64_t v) } /* - * Arch hook that is invoked via a constructor, i.e. 
before exeucting main(), + * Arch hook that is invoked via a constructor, i.e. before executing main(), * to allow for arch-specific setup that is common to all tests, e.g. computing * the default guest "mode". */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 8279b6ced8d2..fab6b62d7810 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1351,7 +1351,7 @@ struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) * Output Args: None * * Return: - * Lowest virtual address at or below vaddr_min, with at least + * Lowest virtual address at or above vaddr_min, with at least * sz unused bytes. TEST_ASSERT failure if no area of at least * size sz is available. * From 288eb55483c05bc37379a781d0d18b8e6c280f92 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 11 Dec 2025 10:47:01 +0000 Subject: [PATCH 134/267] KVM: arm64: Fix Trace Buffer trapping for protected VMs For protected VMs in pKVM, the hypervisor should trap accesses to trace buffer system registers if Trace Buffer isn't supported by the VM. However, the current code only traps if Trace Buffer External Mode isn't supported. Fix this by checking for FEAT_TRBE (Trace Buffer) rather than FEAT_TRBE_EXT. Fixes: 9d5261269098 ("KVM: arm64: Trap external trace for protected VMs") Reported-by: Suzuki K Poulose Reviewed-by: Suzuki K Poulose Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20251211104710.151771-2-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 8911338961c5..f0bfab99c334 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -117,7 +117,7 @@ static void pvm_init_traps_mdcr(struct kvm_vcpu *vcpu) if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP)) val |= MDCR_EL2_TTRF; - if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, ExtTrcBuff, IMP)) + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, IMP)) val |= MDCR_EL2_E2TB_MASK; /* Trap Debug Communications Channel registers */ From e913c7ce9e6f62038a486218f43f699fc443e3e1 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 11 Dec 2025 10:47:02 +0000 Subject: [PATCH 135/267] KVM: arm64: Fix Trace Buffer trap polarity for protected VMs The E2TB bits in MDCR_EL2 control trapping of Trace Buffer system register accesses. These accesses are trapped to EL2 when the bits are clear. The trap initialization logic for protected VMs in pvm_init_traps_mdcr() had the polarity inverted. When a guest did not support the Trace Buffer feature, the code was setting E2TB. This incorrectly disabled the trap, potentially allowing a protected guest to access registers for a feature it was not given. Fix this by inverting the operation. 
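Since MDCR_EL2 mixes both polarities, a compact sketch of the corrected logic (mirroring pvm_init_traps_mdcr() as it looks across this and the previous patch; shown for illustration only):

    /* TTRF traps when the bit is SET; E2TB traps when the field is CLEAR */
    if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP))
            val |= MDCR_EL2_TTRF;
    if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, IMP))
            val &= ~MDCR_EL2_E2TB_MASK;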
Fixes: f50758260bff ("KVM: arm64: Group setting traps for protected VMs by control register") Reviewed-by: Suzuki K Poulose Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20251211104710.151771-3-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index f0bfab99c334..6bd539646204 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -118,7 +118,7 @@ static void pvm_init_traps_mdcr(struct kvm_vcpu *vcpu) val |= MDCR_EL2_TTRF; if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, IMP)) - val |= MDCR_EL2_E2TB_MASK; + val &= ~MDCR_EL2_E2TB_MASK; /* Trap Debug Communications Channel registers */ if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, FGT, IMP)) From ebbcaece84738f71b35f32339bdeb8776004e641 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 11 Dec 2025 10:47:03 +0000 Subject: [PATCH 136/267] KVM: arm64: Fix MTE flag initialization for protected VMs The function pkvm_init_features_from_host() initializes guest features, propagating them from the host. The logic to propagate KVM_ARCH_FLAG_MTE_ENABLED (Memory Tagging Extension) has a couple of issues. First, the check was in the common path, before the divergence for protected and non-protected VMs. For non-protected VMs, this was unnecessary, as 'kvm->arch.flags' is completely overwritten by host_arch_flags immediately after, which already contains the MTE flag. For protected VMs, this was setting the flag even if the feature is not allowed. Second, the check was reading 'host_kvm->arch.flags' instead of using the local 'host_arch_flags', which is read once from the host flags. Fix these by moving the MTE flag check inside the protected-VM-only path, checking if the feature is allowed, and changing it to use the correct host_arch_flags local variable. This ensures non-protected VMs get the flag via the bulk copy, and protected VMs get it via an explicit check. Fixes: b7f345fbc32a ("KVM: arm64: Fix FEAT_MTE in pKVM") Reviewed-by: Ben Horgan Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20251211104710.151771-4-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 6bd539646204..d99137dddb1f 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -340,9 +340,6 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc /* Preserve the vgic model so that GICv3 emulation works */ hyp_vm->kvm.arch.vgic.vgic_model = host_kvm->arch.vgic.vgic_model; - if (test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &host_kvm->arch.flags)) - set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags); - /* No restrictions for non-protected VMs. 
*/ if (!kvm_vm_is_protected(kvm)) { hyp_vm->kvm.arch.flags = host_arch_flags; @@ -357,6 +354,9 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc return; } + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_MTE)) + kvm->arch.flags |= host_arch_flags & BIT(KVM_ARCH_FLAG_MTE_ENABLED); + bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES); set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features); From c273feee70bd3d8c6c4d5efaf6b3ae945c839378 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 11 Dec 2025 10:47:04 +0000 Subject: [PATCH 137/267] KVM: arm64: Introduce helper to calculate fault IPA offset This 12-bit FAR fault IPA offset mask is hard-coded as 'GENMASK(11, 0)' in several places to reconstruct the full fault IPA. Introduce FAR_TO_FIPA_OFFSET() to calculate this value in a shared header and replace all open-coded instances to improve readability. No functional change intended. Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20251211104710.151771-5-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_arm.h | 2 ++ arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c | 2 +- arch/arm64/kvm/inject_fault.c | 2 +- arch/arm64/kvm/mmu.c | 4 ++-- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index e500600e4b9b..58495593570c 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -344,6 +344,8 @@ #define PAR_TO_HPFAR(par) \ (((par) & GENMASK_ULL(52 - 1, 12)) >> 8) +#define FAR_TO_FIPA_OFFSET(far) ((far) & GENMASK_ULL(11, 0)) + #define ECN(x) { ESR_ELx_EC_##x, #x } #define kvm_arm_exception_class \ diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c index 5fd99763b54d..f0605836821e 100644 --- a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c +++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c @@ -44,7 +44,7 @@ int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) /* Build the full address */ fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); - fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); + fault_ipa |= FAR_TO_FIPA_OFFSET(kvm_vcpu_get_hfar(vcpu)); /* If not for GICV, move on */ if (fault_ipa < vgic->vgic_cpu_base || diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index dfcd66c65517..c9bbce017dee 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -258,7 +258,7 @@ void kvm_inject_size_fault(struct kvm_vcpu *vcpu) unsigned long addr, esr; addr = kvm_vcpu_get_fault_ipa(vcpu); - addr |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); + addr |= FAR_TO_FIPA_OFFSET(kvm_vcpu_get_hfar(vcpu)); __kvm_inject_sea(vcpu, kvm_vcpu_trap_is_iabt(vcpu), addr); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 48d7c372a4cd..ba44a29b75aa 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -2070,7 +2070,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) /* Falls between the IPA range and the PARange? */ if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) { - fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); + fault_ipa |= FAR_TO_FIPA_OFFSET(kvm_vcpu_get_hfar(vcpu)); return kvm_inject_sea(vcpu, is_iabt, fault_ipa); } @@ -2175,7 +2175,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) * faulting VA. This is always 12 bits, irrespective * of the page size. 
*/ - ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); + ipa |= FAR_TO_FIPA_OFFSET(kvm_vcpu_get_hfar(vcpu)); ret = io_mem_abort(vcpu, ipa); goto out_unlock; } From 43a21a0f0c4ab7de755f2cee2ff4700f26fe0bba Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 11 Dec 2025 10:47:05 +0000 Subject: [PATCH 138/267] KVM: arm64: Include VM type when checking VM capabilities in pKVM Certain features and capabilities are restricted in protected mode. Most of these features are restricted only for protected VMs, but some are restricted for ALL VMs in protected mode. Extend the pKVM capability check to pass the VM (kvm), and use that when determining supported features. Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20251211104710.151771-6-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_pkvm.h | 10 ++++++---- arch/arm64/kvm/arm.c | 4 ++-- arch/arm64/kvm/hyp/nvhe/pkvm.c | 10 +++++----- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 0aecd4ac5f45..cccfff96f062 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -23,10 +23,12 @@ void pkvm_destroy_hyp_vm(struct kvm *kvm); int pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu); /* - * This functions as an allow-list of protected VM capabilities. - * Features not explicitly allowed by this function are denied. + * Check whether the specific capability is allowed in pKVM. + * + * Certain features are allowed only for non-protected VMs in pKVM, which is why + * this takes the VM (kvm) as a parameter. */ -static inline bool kvm_pvm_ext_allowed(long ext) +static inline bool kvm_pkvm_ext_allowed(struct kvm *kvm, long ext) { switch (ext) { case KVM_CAP_IRQCHIP: @@ -43,7 +45,7 @@ static inline bool kvm_pvm_ext_allowed(long ext) case KVM_CAP_ARM_PTRAUTH_GENERIC: return true; default: - return false; + return !kvm || !kvm_vm_is_protected(kvm); } } diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 4f80da0c0d1d..e906ad414e72 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -87,7 +87,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, if (cap->flags) return -EINVAL; - if (kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(cap->cap)) + if (is_protected_kvm_enabled() && !kvm_pkvm_ext_allowed(kvm, cap->cap)) return -EINVAL; switch (cap->cap) { @@ -303,7 +303,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r; - if (kvm && kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(ext)) + if (is_protected_kvm_enabled() && !kvm_pkvm_ext_allowed(kvm, ext)) return 0; switch (ext) { diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index d99137dddb1f..191d6f516113 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -354,23 +354,23 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc return; } - if (kvm_pvm_ext_allowed(KVM_CAP_ARM_MTE)) + if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_MTE)) kvm->arch.flags |= host_arch_flags & BIT(KVM_ARCH_FLAG_MTE_ENABLED); bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES); set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features); - if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PMU_V3)) + if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_PMU_V3)) set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features); - if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_ADDRESS)) + if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_PTRAUTH_ADDRESS)) set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features); - if 
(kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_GENERIC)) + if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_PTRAUTH_GENERIC)) set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features); - if (kvm_pvm_ext_allowed(KVM_CAP_ARM_SVE)) { + if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_SVE)) { set_bit(KVM_ARM_VCPU_SVE, allowed_features); kvm->arch.flags |= host_arch_flags & BIT(KVM_ARCH_FLAG_GUEST_HAS_SVE); } From f4eee308c8f4013a52bd7d7735e64b5127c1b4a8 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 11 Dec 2025 10:47:06 +0000 Subject: [PATCH 139/267] KVM: arm64: Do not allow KVM_CAP_ARM_MTE for any guest in pKVM Supporting MTE in pKVM introduces significant complexity to the hypervisor at EL2, even for non-protected VMs, since it would require EL2 to handle tag management. For now, do not allow KVM_CAP_ARM_MTE for any VM type in protected mode. Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20251211104710.151771-7-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_pkvm.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index cccfff96f062..09a759971653 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -44,6 +44,8 @@ static inline bool kvm_pkvm_ext_allowed(struct kvm *kvm, long ext) case KVM_CAP_ARM_PTRAUTH_ADDRESS: case KVM_CAP_ARM_PTRAUTH_GENERIC: return true; + case KVM_CAP_ARM_MTE: + return false; default: return !kvm || !kvm_vm_is_protected(kvm); } From 8823485a697de5c280a7a2632620338722f16663 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 11 Dec 2025 10:47:07 +0000 Subject: [PATCH 140/267] KVM: arm64: Track KVM IOCTLs and their associated KVM caps Track KVM IOCTLs (VM IOCTLs for now), and the associated KVM capability that enables that IOCTL. Add a function that performs the lookup. This will be used by CoCo VM Hypervisors (e.g., pKVM) to determine whether a particular KVM IOCTL is allowed for its VMs. Suggested-by: Oliver Upton Signed-off-by: Fuad Tabba [maz: don't expose KVM_CAP_BASIC to userspace, and rely on NR_VCPUS as a proxy for this] Link: https://patch.msgid.link/20251211104710.151771-8-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 2 ++ arch/arm64/kvm/arm.c | 45 +++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ac7f970c7883..395778c8ecab 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1655,4 +1655,6 @@ static __always_inline enum fgt_group_id __fgt_reg_to_group_id(enum vcpu_sysreg p; \ }) +long kvm_get_cap_for_kvm_ioctl(unsigned int ioctl, long *ext); + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e906ad414e72..749274483185 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -58,6 +58,51 @@ enum kvm_wfx_trap_policy { static enum kvm_wfx_trap_policy kvm_wfi_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK; static enum kvm_wfx_trap_policy kvm_wfe_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK; +/* + * Tracks KVM IOCTLs and their associated KVM capabilities. + */ +struct kvm_ioctl_cap_map { + unsigned int ioctl; + long ext; +}; + +/* Make KVM_CAP_NR_VCPUS the reference for features we always supported */ +#define KVM_CAP_ARM_BASIC KVM_CAP_NR_VCPUS + +/* + * Sorted by ioctl to allow for potential binary search, + * though linear scan is sufficient for this size. 
+ */ +static const struct kvm_ioctl_cap_map vm_ioctl_caps[] = { + { KVM_CREATE_IRQCHIP, KVM_CAP_IRQCHIP }, + { KVM_ARM_SET_DEVICE_ADDR, KVM_CAP_ARM_SET_DEVICE_ADDR }, + { KVM_ARM_MTE_COPY_TAGS, KVM_CAP_ARM_MTE }, + { KVM_SET_DEVICE_ATTR, KVM_CAP_DEVICE_CTRL }, + { KVM_GET_DEVICE_ATTR, KVM_CAP_DEVICE_CTRL }, + { KVM_HAS_DEVICE_ATTR, KVM_CAP_DEVICE_CTRL }, + { KVM_ARM_SET_COUNTER_OFFSET, KVM_CAP_COUNTER_OFFSET }, + { KVM_ARM_GET_REG_WRITABLE_MASKS, KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES }, + { KVM_ARM_PREFERRED_TARGET, KVM_CAP_ARM_BASIC }, +}; + +/* + * Set *ext to the capability. + * Return 0 if found, or -EINVAL if no IOCTL matches. + */ +long kvm_get_cap_for_kvm_ioctl(unsigned int ioctl, long *ext) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(vm_ioctl_caps); i++) { + if (vm_ioctl_caps[i].ioctl == ioctl) { + *ext = vm_ioctl_caps[i].ext; + return 0; + } + } + + return -EINVAL; +} + DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_base); From b12b3b04f6ba072ca5a618a75e546c996be94bd1 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 11 Dec 2025 10:47:08 +0000 Subject: [PATCH 141/267] KVM: arm64: Check whether a VM IOCTL is allowed in pKVM Certain VM IOCTLs are tied to specific VM features. Since pKVM does not support all features, restrict which IOCTLs are allowed depending on whether the associated feature is supported. Use the existing VM capability check as the source of truth to whether an IOCTL is allowed for a particular VM by mapping the IOCTLs with their associated capabilities. Suggested-by: Oliver Upton Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20251211104710.151771-9-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_pkvm.h | 20 ++++++++++++++++++++ arch/arm64/kvm/arm.c | 3 +++ 2 files changed, 23 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 09a759971653..757076ad4ec9 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -9,6 +9,7 @@ #include #include #include +#include #include /* Maximum number of VMs that can co-exist under pKVM. */ @@ -51,6 +52,25 @@ static inline bool kvm_pkvm_ext_allowed(struct kvm *kvm, long ext) } } +/* + * Check whether the KVM VM IOCTL is allowed in pKVM. + * + * Certain features are allowed only for non-protected VMs in pKVM, which is why + * this takes the VM (kvm) as a parameter. 
+ */ +static inline bool kvm_pkvm_ioctl_allowed(struct kvm *kvm, unsigned int ioctl) +{ + long ext; + int r; + + r = kvm_get_cap_for_kvm_ioctl(ioctl, &ext); + + if (WARN_ON_ONCE(r < 0)) + return false; + + return kvm_pkvm_ext_allowed(kvm, ext); +} + extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; extern unsigned int kvm_nvhe_sym(hyp_memblock_nr); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 749274483185..a5d48bae8f6a 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1938,6 +1938,9 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) void __user *argp = (void __user *)arg; struct kvm_device_attr attr; + if (is_protected_kvm_enabled() && !kvm_pkvm_ioctl_allowed(kvm, ioctl)) + return -EINVAL; + switch (ioctl) { case KVM_CREATE_IRQCHIP: { int ret; From f7d05ee84a6a8d3775e0f0c3070d9380bed844a9 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 11 Dec 2025 10:47:09 +0000 Subject: [PATCH 142/267] KVM: arm64: Prevent host from managing timer offsets for protected VMs For protected VMs, the guest's timer offset state should not be controlled by the host and must always run with a virtual counter offset of 0. The existing timer logic allowed the host to set and manage the timer counter offsets for protected VMs in certain cases. Disable all host-side management of timer offsets for protected VMs by adding checks in the relevant code paths. Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20251211104710.151771-10-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 99a07972068d..600f250753b4 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -1056,10 +1056,14 @@ static void timer_context_init(struct kvm_vcpu *vcpu, int timerid) ctxt->timer_id = timerid; - if (timerid == TIMER_VTIMER) - ctxt->offset.vm_offset = &kvm->arch.timer_data.voffset; - else - ctxt->offset.vm_offset = &kvm->arch.timer_data.poffset; + if (!kvm_vm_is_protected(vcpu->kvm)) { + if (timerid == TIMER_VTIMER) + ctxt->offset.vm_offset = &kvm->arch.timer_data.voffset; + else + ctxt->offset.vm_offset = &kvm->arch.timer_data.poffset; + } else { + ctxt->offset.vm_offset = NULL; + } hrtimer_setup(&ctxt->hrtimer, kvm_hrtimer_expire, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); @@ -1083,7 +1087,8 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) timer_context_init(vcpu, i); /* Synchronize offsets across timers of a VM if not already provided */ - if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) { + if (!vcpu_is_protected(vcpu) && + !test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) { timer_set_offset(vcpu_vtimer(vcpu), kvm_phys_timer_read()); timer_set_offset(vcpu_ptimer(vcpu), 0); } @@ -1687,6 +1692,9 @@ int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm, if (offset->reserved) return -EINVAL; + if (kvm_vm_is_protected(kvm)) + return -EINVAL; + mutex_lock(&kvm->lock); if (!kvm_trylock_all_vcpus(kvm)) { From 6538b6221cc2feda415ca1946e66a5ef02dc6a0a Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 8 Jan 2026 15:46:18 -0600 Subject: [PATCH 143/267] KVM: guest_memfd: Remove partial hugepage handling from kvm_gmem_populate() kvm_gmem_populate(), and the associated post-populate callbacks, have some limited support for dealing with guests backed by hugepages by passing the order information along to each post-populate callback 
and iterating through the pages passed to kvm_gmem_populate() in hugepage-chunks. However, guest_memfd doesn't yet support hugepages, and in most cases additional changes in the kvm_gmem_populate() path would also be needed to actually allow for this functionality. This makes the existing code unnecessarily complex, and makes changes difficult to work through upstream due to theoretical impacts on hugepage support that can't be considered properly without an actual hugepage implementation to reference. So for now, remove what's there so changes for things like in-place conversion can be implemented/reviewed more efficiently. Suggested-by: Vishal Annapurve Co-developed-by: Vishal Annapurve Signed-off-by: Vishal Annapurve Tested-by: Vishal Annapurve Tested-by: Kai Huang Signed-off-by: Michael Roth Tested-by: Yan Zhao Reviewed-by: Yan Zhao Link: https://patch.msgid.link/20260108214622.1084057-3-michael.roth@amd.com [sean: check for !IS_ERR() before checking folio_order()] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 102 ++++++++++++++++----------------------- arch/x86/kvm/vmx/tdx.c | 2 +- include/linux/kvm_host.h | 2 +- virt/kvm/guest_memfd.c | 30 +++++++----- 4 files changed, 60 insertions(+), 76 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 261d9ef8631b..a70bd3f19e29 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2267,67 +2267,53 @@ struct sev_gmem_populate_args { int fw_error; }; -static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn, - void __user *src, int order, void *opaque) +static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, + void __user *src, void *opaque) { struct sev_gmem_populate_args *sev_populate_args = opaque; + struct sev_data_snp_launch_update fw_args = {0}; struct kvm_sev_info *sev = to_kvm_sev_info(kvm); - int n_private = 0, ret, i; - int npages = (1 << order); - gfn_t gfn; + bool assigned = false; + int level; + int ret; if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src)) return -EINVAL; - for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) { - struct sev_data_snp_launch_update fw_args = {0}; - bool assigned = false; - int level; - - ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level); - if (ret || assigned) { - pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n", - __func__, gfn, ret, assigned); - ret = ret ? -EINVAL : -EEXIST; - goto err; - } - - if (src) { - void *vaddr = kmap_local_pfn(pfn + i); - - if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) { - kunmap_local(vaddr); - ret = -EFAULT; - goto err; - } - kunmap_local(vaddr); - } - - ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K, - sev_get_asid(kvm), true); - if (ret) - goto err; - - n_private++; - - fw_args.gctx_paddr = __psp_pa(sev->snp_context); - fw_args.address = __sme_set(pfn_to_hpa(pfn + i)); - fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K); - fw_args.page_type = sev_populate_args->type; - - ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, - &fw_args, &sev_populate_args->fw_error); - if (ret) - goto fw_err; + ret = snp_lookup_rmpentry((u64)pfn, &assigned, &level); + if (ret || assigned) { + pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n", + __func__, gfn, ret, assigned); + ret = ret ? 
-EINVAL : -EEXIST; + goto out; } - return 0; + if (src) { + void *vaddr = kmap_local_pfn(pfn); -fw_err: + if (copy_from_user(vaddr, src, PAGE_SIZE)) { + kunmap_local(vaddr); + ret = -EFAULT; + goto out; + } + kunmap_local(vaddr); + } + + ret = rmp_make_private(pfn, gfn << PAGE_SHIFT, PG_LEVEL_4K, + sev_get_asid(kvm), true); + if (ret) + goto out; + + fw_args.gctx_paddr = __psp_pa(sev->snp_context); + fw_args.address = __sme_set(pfn_to_hpa(pfn)); + fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K); + fw_args.page_type = sev_populate_args->type; + + ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, + &fw_args, &sev_populate_args->fw_error); /* * If the firmware command failed handle the reclaim and cleanup of that - * PFN specially vs. prior pages which can be cleaned up below without - * needing to reclaim in advance. + * PFN before reporting an error. * * Additionally, when invalid CPUID function entries are detected, * firmware writes the expected values into the page and leaves it @@ -2337,26 +2323,20 @@ fw_err: * information to provide information on which CPUID leaves/fields * failed CPUID validation. */ - if (!snp_page_reclaim(kvm, pfn + i) && + if (ret && !snp_page_reclaim(kvm, pfn) && sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID && sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) { - void *vaddr = kmap_local_pfn(pfn + i); + void *vaddr = kmap_local_pfn(pfn); - if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE)) + if (copy_to_user(src, vaddr, PAGE_SIZE)) pr_debug("Failed to write CPUID page back to userspace\n"); kunmap_local(vaddr); } - /* pfn + i is hypervisor-owned now, so skip below cleanup for it. */ - n_private--; - -err: - pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n", - __func__, ret, sev_populate_args->fw_error, n_private); - for (i = 0; i < n_private; i++) - kvm_rmp_make_shared(kvm, pfn + i, PG_LEVEL_4K); - +out: + pr_debug("%s: exiting with return code %d (fw_error %d)\n", + __func__, ret, sev_populate_args->fw_error); return ret; } diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 2d7a4d52ccfb..4fb042ce8ed1 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -3118,7 +3118,7 @@ struct tdx_gmem_post_populate_arg { }; static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, - void __user *src, int order, void *_arg) + void __user *src, void *_arg) { struct tdx_gmem_post_populate_arg *arg = _arg; struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d93f75b05ae2..1d0cee72e560 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2581,7 +2581,7 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord * Returns the number of pages that were populated. 
*/ typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, - void __user *src, int order, void *opaque); + void __user *src, void *opaque); long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque); diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index fdaea3422c30..24eb33c7948d 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -151,6 +151,15 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) mapping_gfp_mask(inode->i_mapping), policy); mpol_cond_put(policy); + /* + * External interfaces like kvm_gmem_get_pfn() support dealing + * with hugepages to a degree, but internally, guest_memfd currently + * assumes that all folios are order-0 and handling would need + * to be updated for anything otherwise (e.g. page-clearing + * operations). + */ + WARN_ON_ONCE(!IS_ERR(folio) && folio_order(folio)); + return folio; } @@ -829,7 +838,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long struct kvm_memory_slot *slot; void __user *p; - int ret = 0, max_order; + int ret = 0; long i; lockdep_assert_held(&kvm->slots_lock); @@ -848,7 +857,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long filemap_invalidate_lock(file->f_mapping); npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages); - for (i = 0; i < npages; i += (1 << max_order)) { + for (i = 0; i < npages; i++) { struct folio *folio; gfn_t gfn = start_gfn + i; pgoff_t index = kvm_gmem_get_index(slot, gfn); @@ -860,7 +869,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long break; } - folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order); + folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, NULL); if (IS_ERR(folio)) { ret = PTR_ERR(folio); break; @@ -874,20 +883,15 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long } folio_unlock(folio); - WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) || - (npages - i) < (1 << max_order)); ret = -EINVAL; - while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order), - KVM_MEMORY_ATTRIBUTE_PRIVATE, - KVM_MEMORY_ATTRIBUTE_PRIVATE)) { - if (!max_order) - goto put_folio_and_exit; - max_order--; - } + if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1, + KVM_MEMORY_ATTRIBUTE_PRIVATE, + KVM_MEMORY_ATTRIBUTE_PRIVATE)) + goto put_folio_and_exit; p = src ? src + i * PAGE_SIZE : NULL; - ret = post_populate(kvm, gfn, pfn, p, max_order, opaque); + ret = post_populate(kvm, gfn, pfn, p, opaque); if (!ret) kvm_gmem_mark_prepared(folio); From 8622ef05709fbc4903f54cdf1ac8c3725e479dd8 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 8 Jan 2026 15:46:19 -0600 Subject: [PATCH 144/267] KVM: guest_memfd: Remove preparation tracking guest_memfd currently uses the folio uptodate flag to track: 1) whether or not a page has been cleared before initial usage 2) whether or not the architecture hooks have been issued to put the page in a private state as defined by the architecture In practice, (2) is only actually being tracked for SEV-SNP VMs, and there do not seem to be any plans/reasons that would suggest this will change in the future, so this additional tracking/complexity is not really providing any general benefit to guest_memfd users. 
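As a rough sketch of the current behaviour (condensed from the kvm_gmem_get_pfn()/kvm_gmem_prepare_folio() flow, not a verbatim quote), both states are keyed off the same uptodate bit today:

  if (!folio_test_uptodate(folio)) {
          /* (1) zero the backing page(s) on first use... */
          clear_highpage(folio_page(folio, 0));
          /* (2) ...and run the arch hook, recording "prepared" via uptodate */
          r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
          if (!r)
                  kvm_gmem_mark_prepared(folio);  /* i.e. folio_mark_uptodate() */
  }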
On the other hand, future plans around in-place conversion and hugepage support will make the burden of tracking this information within guest_memfd even more complex. With in-place conversion and hugepage support, the plan is to use the per-folio uptodate flag purely to track the initial clearing of folios, whereas conversion operations could trigger multiple transitions between 'prepared' and 'unprepared' and thus need separate tracking. Since preparation generally happens during fault time, i.e. on the "read-side" of any VM-wide locks that might protect state tracked by guest_memfd, supporting concurrent handling of page faults would likely require more complex locking schemes if the "preparedness" state were tracked by guest_memfd, i.e. if it needs to be updated as part of handling the fault. Instead of keeping this current/future complexity within guest_memfd for what is essentially just SEV-SNP, just drop the tracking for (2) and have the arch-specific preparation hooks get triggered unconditionally on every fault so the arch-specific hooks can check the preparation state directly and decide whether or not a folio still needs additional preparation. In the case of SEV-SNP, the preparation state is already checked again via the preparation hooks to avoid double-preparation, so nothing extra needs to be done to update the handling of things there. Reviewed-by: Vishal Annapurve Tested-by: Vishal Annapurve Reviewed-by: Pankaj Gupta Tested-by: Kai Huang Signed-off-by: Michael Roth Link: https://patch.msgid.link/20260108214622.1084057-4-michael.roth@amd.com [sean: massage changelog] Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 44 ++++++++++++------------------------------ 1 file changed, 12 insertions(+), 32 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 24eb33c7948d..e90879322fd0 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -76,11 +76,6 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo return 0; } -static inline void kvm_gmem_mark_prepared(struct folio *folio) -{ - folio_mark_uptodate(folio); -} - /* * Process @folio, which contains @gfn, so that the guest can use it. * The folio must be locked and the gfn must be contained in @slot. 
@@ -90,13 +85,7 @@ static inline void kvm_gmem_mark_prepared(struct folio *folio) static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, struct folio *folio) { - unsigned long nr_pages, i; pgoff_t index; - int r; - - nr_pages = folio_nr_pages(folio); - for (i = 0; i < nr_pages; i++) - clear_highpage(folio_page(folio, i)); /* * Preparing huge folios should always be safe, since it should @@ -114,11 +103,8 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio))); index = kvm_gmem_get_index(slot, gfn); index = ALIGN_DOWN(index, folio_nr_pages(folio)); - r = __kvm_gmem_prepare_folio(kvm, slot, index, folio); - if (!r) - kvm_gmem_mark_prepared(folio); - return r; + return __kvm_gmem_prepare_folio(kvm, slot, index, folio); } /* @@ -429,7 +415,7 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) if (!folio_test_uptodate(folio)) { clear_highpage(folio_page(folio, 0)); - kvm_gmem_mark_prepared(folio); + folio_mark_uptodate(folio); } vmf->page = folio_file_page(folio, vmf->pgoff); @@ -766,7 +752,7 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) static struct folio *__kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, pgoff_t index, kvm_pfn_t *pfn, - bool *is_prepared, int *max_order) + int *max_order) { struct file *slot_file = READ_ONCE(slot->gmem.file); struct gmem_file *f = file->private_data; @@ -796,7 +782,6 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file, if (max_order) *max_order = 0; - *is_prepared = folio_test_uptodate(folio); return folio; } @@ -806,19 +791,22 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, { pgoff_t index = kvm_gmem_get_index(slot, gfn); struct folio *folio; - bool is_prepared = false; int r = 0; CLASS(gmem_get_file, file)(slot); if (!file) return -EFAULT; - folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order); + folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order); if (IS_ERR(folio)) return PTR_ERR(folio); - if (!is_prepared) - r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio); + if (!folio_test_uptodate(folio)) { + clear_highpage(folio_page(folio, 0)); + folio_mark_uptodate(folio); + } + + r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio); folio_unlock(folio); @@ -861,7 +849,6 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long struct folio *folio; gfn_t gfn = start_gfn + i; pgoff_t index = kvm_gmem_get_index(slot, gfn); - bool is_prepared = false; kvm_pfn_t pfn; if (signal_pending(current)) { @@ -869,19 +856,12 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long break; } - folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, NULL); + folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, NULL); if (IS_ERR(folio)) { ret = PTR_ERR(folio); break; } - if (is_prepared) { - folio_unlock(folio); - folio_put(folio); - ret = -EEXIST; - break; - } - folio_unlock(folio); ret = -EINVAL; @@ -893,7 +873,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long p = src ? 
src + i * PAGE_SIZE : NULL; ret = post_populate(kvm, gfn, pfn, p, opaque); if (!ret) - kvm_gmem_mark_prepared(folio); + folio_mark_uptodate(folio); put_folio_and_exit: folio_put(folio); From dcbcc2323c806b55939d765c13d0728421756017 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 8 Jan 2026 15:46:20 -0600 Subject: [PATCH 145/267] KVM: SEV: Document/enforce page-alignment for KVM_SEV_SNP_LAUNCH_UPDATE In the past, KVM_SEV_SNP_LAUNCH_UPDATE accepted a non-page-aligned 'uaddr' parameter to copy data from, but continuing to support this with new functionality like in-place conversion and hugepages in the pipeline has proven to be more trouble than it is worth, since there are no known users that have been identified who use a non-page-aligned 'uaddr' parameter. Rather than locking guest_memfd into continuing to support this, go ahead and document page-alignment as a requirement and begin enforcing this in the handling function. Reviewed-by: Vishal Annapurve Tested-by: Kai Huang Signed-off-by: Michael Roth Link: https://patch.msgid.link/20260108214622.1084057-5-michael.roth@amd.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/x86/amd-memory-encryption.rst | 2 +- arch/x86/kvm/svm/sev.c | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst index 1ddb6a86ce7f..5a88d0197cb3 100644 --- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst @@ -523,7 +523,7 @@ Returns: 0 on success, < 0 on error, -EAGAIN if caller should retry struct kvm_sev_snp_launch_update { __u64 gfn_start; /* Guest page number to load/encrypt data into. */ - __u64 uaddr; /* Userspace address of data to be loaded/encrypted. */ + __u64 uaddr; /* 4k-aligned address of data to be loaded/encrypted. */ __u64 len; /* 4k-aligned length in bytes to copy into guest memory.*/ __u8 type; /* The type of the guest pages being initialized. */ __u8 pad0; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index a70bd3f19e29..b4409bc652d1 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2367,6 +2367,11 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID)) return -EINVAL; + src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr); + + if (!PAGE_ALIGNED(src)) + return -EINVAL; + npages = params.len / PAGE_SIZE; /* @@ -2398,7 +2403,6 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) sev_populate_args.sev_fd = argp->sev_fd; sev_populate_args.type = params.type; - src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr); count = kvm_gmem_populate(kvm, params.gfn_start, src, npages, sev_gmem_post_populate, &sev_populate_args); From 189fd1b059a9c7ed22750bc8a4a1182c58ccf138 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 8 Jan 2026 15:46:21 -0600 Subject: [PATCH 146/267] KVM: TDX: Document alignment requirements for KVM_TDX_INIT_MEM_REGION Since it was never possible to use a non-PAGE_SIZE-aligned @source_addr, go ahead and document this as a requirement. This is in preparation for enforcing page-aligned @source_addr for all architectures in guest_memfd. 
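On the userspace side, meeting the documented alignment is straightforward; a minimal sketch (the helper name is hypothetical, not part of any KVM API):

  #include <stdlib.h>
  #include <unistd.h>

  /* Hypothetical helper: allocate a page-aligned staging buffer suitable for
   * passing as @source_addr / @uaddr to the launch-update ioctls. */
  static void *alloc_aligned_src(size_t len)
  {
          void *buf = NULL;

          if (posix_memalign(&buf, sysconf(_SC_PAGESIZE), len))
                  return NULL;
          return buf;
  }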
Reviewed-by: Vishal Annapurve Tested-by: Kai Huang Signed-off-by: Michael Roth Reviewed-by: Yan Zhao Link: https://patch.msgid.link/20260108214622.1084057-6-michael.roth@amd.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/x86/intel-tdx.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/x86/intel-tdx.rst b/Documentation/virt/kvm/x86/intel-tdx.rst index 5efac62c92c7..6a222e9d0954 100644 --- a/Documentation/virt/kvm/x86/intel-tdx.rst +++ b/Documentation/virt/kvm/x86/intel-tdx.rst @@ -156,7 +156,7 @@ KVM_TDX_INIT_MEM_REGION :Returns: 0 on success, <0 on error Initialize @nr_pages TDX guest private memory starting from @gpa with userspace -provided data from @source_addr. +provided data from @source_addr. @source_addr must be PAGE_SIZE-aligned. Note, before calling this sub command, memory attribute of the range [gpa, gpa + nr_pages] needs to be private. Userspace can use From 2a62345b30529e488beb6a1220577b3495933724 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 8 Jan 2026 15:46:22 -0600 Subject: [PATCH 147/267] KVM: guest_memfd: GUP source pages prior to populating guest memory Currently the post-populate callbacks handle copying source pages into private GPA ranges backed by guest_memfd, where kvm_gmem_populate() acquires the filemap invalidate lock, then calls a post-populate callback which may issue a get_user_pages() on the source pages prior to copying them into the private GPA (e.g. TDX). This will not be compatible with in-place conversion, where the userspace page fault path will attempt to acquire the filemap invalidate lock while holding the mm->mmap_lock, leading to a potential ABBA deadlock. Address this by hoisting the GUP above the filemap invalidate lock so that these page faults path can be taken early, prior to acquiring the filemap invalidate lock. It's not currently clear whether this issue is reachable with the current implementation of guest_memfd, which doesn't support in-place conversion, however it does provide a consistent mechanism to provide stable source/target PFNs to callbacks rather than punting to vendor-specific code, which allows for more commonality across architectures, which may be worthwhile even without in-place conversion. As part of this change, also begin enforcing that the 'src' argument to kvm_gmem_populate() must be page-aligned, as this greatly reduces the complexity around how the post-populate callbacks are implemented, and since no current in-tree users support using a non-page-aligned 'src' argument. 
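A condensed sketch of the resulting per-page flow (not the full function; see the diff below), showing why the pin must happen before the lock:

  /*
   * Lock chains described above (with in-place conversion):
   *
   *   user fault on a gmem mapping: mmap_lock -> filemap_invalidate_lock
   *   old kvm_gmem_populate():      filemap_invalidate_lock -> GUP -> mmap_lock
   *
   * Hoisting the pin avoids the inversion:
   */
  ret = get_user_pages_fast(uaddr, 1, 0, &src_page);     /* may take mmap_lock */
  if (ret == 1) {
          filemap_invalidate_lock(file->f_mapping);       /* only after the pin */
          ret = post_populate(kvm, gfn, pfn, src_page, opaque);
          filemap_invalidate_unlock(file->f_mapping);
          put_page(src_page);
  }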
Suggested-by: Sean Christopherson Co-developed-by: Sean Christopherson Co-developed-by: Vishal Annapurve Signed-off-by: Vishal Annapurve Tested-by: Vishal Annapurve Tested-by: Kai Huang Signed-off-by: Michael Roth Tested-by: Yan Zhao Reviewed-by: Yan Zhao Link: https://patch.msgid.link/20260108214622.1084057-7-michael.roth@amd.com [sean: avoid local "p" variable] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 33 ++++++++-------- arch/x86/kvm/vmx/tdx.c | 16 ++------ include/linux/kvm_host.h | 4 +- virt/kvm/guest_memfd.c | 83 +++++++++++++++++++++++++++------------- 4 files changed, 78 insertions(+), 58 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index b4409bc652d1..0ab7c89262fb 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2268,7 +2268,7 @@ struct sev_gmem_populate_args { }; static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, - void __user *src, void *opaque) + struct page *src_page, void *opaque) { struct sev_gmem_populate_args *sev_populate_args = opaque; struct sev_data_snp_launch_update fw_args = {0}; @@ -2277,7 +2277,7 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int level; int ret; - if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src)) + if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src_page)) return -EINVAL; ret = snp_lookup_rmpentry((u64)pfn, &assigned, &level); @@ -2288,15 +2288,14 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, goto out; } - if (src) { - void *vaddr = kmap_local_pfn(pfn); + if (src_page) { + void *src_vaddr = kmap_local_page(src_page); + void *dst_vaddr = kmap_local_pfn(pfn); - if (copy_from_user(vaddr, src, PAGE_SIZE)) { - kunmap_local(vaddr); - ret = -EFAULT; - goto out; - } - kunmap_local(vaddr); + memcpy(dst_vaddr, src_vaddr, PAGE_SIZE); + + kunmap_local(src_vaddr); + kunmap_local(dst_vaddr); } ret = rmp_make_private(pfn, gfn << PAGE_SHIFT, PG_LEVEL_4K, @@ -2326,17 +2325,19 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, if (ret && !snp_page_reclaim(kvm, pfn) && sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID && sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) { - void *vaddr = kmap_local_pfn(pfn); + void *src_vaddr = kmap_local_page(src_page); + void *dst_vaddr = kmap_local_pfn(pfn); - if (copy_to_user(src, vaddr, PAGE_SIZE)) - pr_debug("Failed to write CPUID page back to userspace\n"); + memcpy(src_vaddr, dst_vaddr, PAGE_SIZE); - kunmap_local(vaddr); + kunmap_local(src_vaddr); + kunmap_local(dst_vaddr); } out: - pr_debug("%s: exiting with return code %d (fw_error %d)\n", - __func__, ret, sev_populate_args->fw_error); + if (ret) + pr_debug("%s: error updating GFN %llx, return code %d (fw_error %d)\n", + __func__, gfn, ret, sev_populate_args->fw_error); return ret; } diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 4fb042ce8ed1..5df9d32d2058 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -3118,34 +3118,24 @@ struct tdx_gmem_post_populate_arg { }; static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, - void __user *src, void *_arg) + struct page *src_page, void *_arg) { struct tdx_gmem_post_populate_arg *arg = _arg; struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); u64 err, entry, level_state; gpa_t gpa = gfn_to_gpa(gfn); - struct page *src_page; int ret, i; if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm)) return -EIO; - /* - * Get the source page if it 
has been faulted in. Return failure if the - * source page has been swapped out or unmapped in primary memory. - */ - ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page); - if (ret < 0) - return ret; - if (ret != 1) - return -ENOMEM; + if (!src_page) + return -EOPNOTSUPP; kvm_tdx->page_add_src = src_page; ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn); kvm_tdx->page_add_src = NULL; - put_page(src_page); - if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION)) return ret; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1d0cee72e560..49c0cfe24fd8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2566,7 +2566,7 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord * @gfn: starting GFN to be populated * @src: userspace-provided buffer containing data to copy into GFN range * (passed to @post_populate, and incremented on each iteration - * if not NULL) + * if not NULL). Must be page-aligned. * @npages: number of pages to copy from userspace-buffer * @post_populate: callback to issue for each gmem page that backs the GPA * range @@ -2581,7 +2581,7 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord * Returns the number of pages that were populated. */ typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, - void __user *src, void *opaque); + struct page *page, void *opaque); long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque); diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index e90879322fd0..923c51a3a525 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -820,12 +820,48 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn); #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE + +static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot, + struct file *file, gfn_t gfn, struct page *src_page, + kvm_gmem_populate_cb post_populate, void *opaque) +{ + pgoff_t index = kvm_gmem_get_index(slot, gfn); + struct folio *folio; + kvm_pfn_t pfn; + int ret; + + filemap_invalidate_lock(file->f_mapping); + + folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, NULL); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + goto out_unlock; + } + + folio_unlock(folio); + + if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1, + KVM_MEMORY_ATTRIBUTE_PRIVATE, + KVM_MEMORY_ATTRIBUTE_PRIVATE)) { + ret = -EINVAL; + goto out_put_folio; + } + + ret = post_populate(kvm, gfn, pfn, src_page, opaque); + if (!ret) + folio_mark_uptodate(folio); + +out_put_folio: + folio_put(folio); +out_unlock: + filemap_invalidate_unlock(file->f_mapping); + return ret; +} + long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque) { struct kvm_memory_slot *slot; - void __user *p; - int ret = 0; long i; @@ -834,6 +870,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long if (WARN_ON_ONCE(npages <= 0)) return -EINVAL; + if (WARN_ON_ONCE(!PAGE_ALIGNED(src))) + return -EINVAL; + slot = gfn_to_memslot(kvm, start_gfn); if (!kvm_slot_has_gmem(slot)) return -EINVAL; @@ -842,47 +881,37 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long if (!file) return -EFAULT; - filemap_invalidate_lock(file->f_mapping); - npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages); 
for (i = 0; i < npages; i++) { - struct folio *folio; - gfn_t gfn = start_gfn + i; - pgoff_t index = kvm_gmem_get_index(slot, gfn); - kvm_pfn_t pfn; + struct page *src_page = NULL; if (signal_pending(current)) { ret = -EINTR; break; } - folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, NULL); - if (IS_ERR(folio)) { - ret = PTR_ERR(folio); - break; + if (src) { + unsigned long uaddr = (unsigned long)src + i * PAGE_SIZE; + + ret = get_user_pages_fast(uaddr, 1, 0, &src_page); + if (ret < 0) + break; + if (ret != 1) { + ret = -ENOMEM; + break; + } } - folio_unlock(folio); + ret = __kvm_gmem_populate(kvm, slot, file, start_gfn + i, src_page, + post_populate, opaque); - ret = -EINVAL; - if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1, - KVM_MEMORY_ATTRIBUTE_PRIVATE, - KVM_MEMORY_ATTRIBUTE_PRIVATE)) - goto put_folio_and_exit; + if (src_page) + put_page(src_page); - p = src ? src + i * PAGE_SIZE : NULL; - ret = post_populate(kvm, gfn, pfn, p, opaque); - if (!ret) - folio_mark_uptodate(folio); - -put_folio_and_exit: - folio_put(folio); if (ret) break; } - filemap_invalidate_unlock(file->f_mapping); - return ret && !i ? ret : i; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate); From 582234b0d8419e0b6cbfd87ae3f80568c8d0917e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kornel=20Dul=C4=99ba?= Date: Fri, 14 Nov 2025 11:11:53 +0000 Subject: [PATCH 148/267] KVM: arm64: Fix error checking for FFA_VERSION MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to section 13.2 of the DEN0077 FF-A specification, when firmware does not support the requested version, it should reply with FFA_RET_NOT_SUPPORTED(-1). Table 13.6 specifies the type of the error code as int32. Currently, the error checking logic compares the unsigned long return value it got from the SMC layer, against a "-1" literal. This fails due to a type mismatch: the literal is extended to 64 bits, whereas the register contains only 32 bits of ones(0x00000000ffffffff). Consequently, hyp_ffa_init misinterprets the "-1" return value as an invalid FF-A version. This prevents pKVM initialization on devices where FF-A is not supported in firmware. Fix this by explicitly casting res.a0 to s32. 
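To make the width issue concrete, a small standalone sketch (userspace C on an LP64 target; FFA_RET_NOT_SUPPORTED is -1 per the spec):

  #include <stdio.h>

  #define FFA_RET_NOT_SUPPORTED (-1)

  int main(void)
  {
          unsigned long a0 = 0x00000000ffffffffUL;  /* 32-bit -1 from firmware */

          /* The -1 literal widens to 64 bits, so this comparison is false. */
          printf("unsigned compare: %d\n", a0 == FFA_RET_NOT_SUPPORTED);

          /* Truncating to a signed 32-bit type first recovers the intended -1. */
          printf("s32 compare: %d\n", (int)a0 == FFA_RET_NOT_SUPPORTED);
          return 0;
  }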
Signed-off-by: Kornel Dulęba Acked-by: Will Deacon Link: https://patch.msgid.link/20251114-pkvm_init_noffa-v1-1-87a82e87c345@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/ffa.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index f731cc4c3f28..94161ea1cd60 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -792,7 +792,7 @@ static void do_ffa_version(struct arm_smccc_1_2_regs *res, .a0 = FFA_VERSION, .a1 = ffa_req_version, }, res); - if (res->a0 == FFA_RET_NOT_SUPPORTED) + if ((s32)res->a0 == FFA_RET_NOT_SUPPORTED) goto unlock; hyp_ffa_version = ffa_req_version; @@ -943,7 +943,7 @@ int hyp_ffa_init(void *pages) .a0 = FFA_VERSION, .a1 = FFA_VERSION_1_2, }, &res); - if (res.a0 == FFA_RET_NOT_SUPPORTED) + if ((s32)res.a0 == FFA_RET_NOT_SUPPORTED) return 0; /* From 26304e0e694f4cacc30bcf757663f26533351fbd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 15 Jan 2026 09:34:24 -0800 Subject: [PATCH 149/267] KVM: nVMX: Setup VMX MSRs on loading CPU during nested_vmx_hardware_setup() Move the call to nested_vmx_setup_ctls_msrs() from vmx_hardware_setup() to nested_vmx_hardware_setup() so that the nested code can deal with ordering dependencies without having to straddle vmx_hardware_setup() and nested_vmx_hardware_setup(). Specifically, an upcoming change will sanitize the vmcs12 fields based on hardware support, and that code needs to run _before_ the MSRs are configured, because the lovely vmcs_enum MSR depends on the max supported vmcs12 field. No functional change intended. Reviewed-by: Xiaoyao Li Link: https://patch.msgid.link/20260115173427.716021-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 2 ++ arch/x86/kvm/vmx/vmx.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 6137e5307d0f..61113ead3d7b 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -7407,6 +7407,8 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) { int i; + nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); + if (!cpu_has_vmx_shadow_vmcs()) enable_shadow_vmcs = 0; if (enable_shadow_vmcs) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ccca182a346d..8438799eed74 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8700,8 +8700,6 @@ __init int vmx_hardware_setup(void) * can hide/show features based on kvm_cpu_cap_has(). */ if (nested) { - nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); - r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); if (r) return r; From c68feb605cc47431b3d86e6c2fe4f8342ebc87eb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 15 Jan 2026 09:34:25 -0800 Subject: [PATCH 150/267] KVM: VMX: Add a wrapper around ROL16() to get a vmcs12 from a field encoding Add a wrapper macro, ENC_TO_VMCS12_IDX(), to get a vmcs12 index given a field encoding in anticipation of adding a macro to get from a vmcs12 index back to the field encoding. And because open coding ROL16(n, 6) everywhere is gross. No functional change intended.
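For reference, a standalone sketch of what the wrapper computes (userspace types; the example encoding is GUEST_ES_SELECTOR, 0x0800 in the VMX spec):

  #include <stdint.h>
  #include <stdio.h>

  #define ROL16(val, n)          ((uint16_t)(((uint16_t)(val) << (n)) | ((uint16_t)(val) >> (16 - (n)))))
  #define ENC_TO_VMCS12_IDX(enc) ROL16(enc, 6)

  int main(void)
  {
          uint16_t enc = 0x0800;

          /* Rotating left by 6 moves the width/type bits of the encoding into
           * the low bits, yielding a compact index into the vmcs12 offset table. */
          printf("index = %u\n", (unsigned)ENC_TO_VMCS12_IDX(enc));  /* prints 2 */
          return 0;
  }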
Suggested-by: Xiaoyao Li Reviewed-by: Xiaoyao Li Link: https://patch.msgid.link/20260115173427.716021-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/hyperv_evmcs.c | 2 +- arch/x86/kvm/vmx/hyperv_evmcs.h | 2 +- arch/x86/kvm/vmx/vmcs.h | 1 + arch/x86/kvm/vmx/vmcs12.c | 4 ++-- arch/x86/kvm/vmx/vmcs12.h | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.c b/arch/x86/kvm/vmx/hyperv_evmcs.c index 904bfcd1519b..cc728c9a3de5 100644 --- a/arch/x86/kvm/vmx/hyperv_evmcs.c +++ b/arch/x86/kvm/vmx/hyperv_evmcs.c @@ -7,7 +7,7 @@ #include "hyperv_evmcs.h" #define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) -#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ +#define EVMCS1_FIELD(number, name, clean_field)[ENC_TO_VMCS12_IDX(number)] = \ {EVMCS1_OFFSET(name), clean_field} const struct evmcs_field vmcs_field_to_evmcs_1[] = { diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.h b/arch/x86/kvm/vmx/hyperv_evmcs.h index 6536290f4274..fc7c4e7bd1bf 100644 --- a/arch/x86/kvm/vmx/hyperv_evmcs.h +++ b/arch/x86/kvm/vmx/hyperv_evmcs.h @@ -130,7 +130,7 @@ static __always_inline int evmcs_field_offset(unsigned long field, u16 *clean_field) { const struct evmcs_field *evmcs_field; - unsigned int index = ROL16(field, 6); + unsigned int index = ENC_TO_VMCS12_IDX(field); if (unlikely(index >= nr_evmcs_1_fields)) return -ENOENT; diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index b25625314658..9aa204c87661 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -12,6 +12,7 @@ #include "capabilities.h" #define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) +#define ENC_TO_VMCS12_IDX(enc) ROL16(enc, 6) struct vmcs_hdr { u32 revision_id:31; diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index 4233b5ca9461..c2ac9e1a50b3 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -4,10 +4,10 @@ #include "vmcs12.h" #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) -#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name) +#define FIELD(number, name) [ENC_TO_VMCS12_IDX(number)] = VMCS12_OFFSET(name) #define FIELD64(number, name) \ FIELD(number, name), \ - [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32) + [ENC_TO_VMCS12_IDX(number##_HIGH)] = VMCS12_OFFSET(name) + sizeof(u32) const unsigned short vmcs12_field_offsets[] = { FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 4ad6b16525b9..7a5fdd9b27ba 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -385,7 +385,7 @@ static inline short get_vmcs12_field_offset(unsigned long field) if (field >> 15) return -ENOENT; - index = ROL16(field, 6); + index = ENC_TO_VMCS12_IDX(field); if (index >= nr_vmcs12_fields) return -ENOENT; From a91cc48246605af9aeef1edd32232976d74d9502 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 15 Jan 2026 09:21:54 -0800 Subject: [PATCH 151/267] KVM: selftests: Test READ=>WRITE dirty logging behavior for shadow MMU Update the nested dirty log test to validate KVM's handling of READ faults when dirty logging is enabled. Specifically, set the Dirty bit in the guest PTEs used to map L2 GPAs, so that KVM will create writable SPTEs when handling L2 read faults. When handling read faults in the shadow MMU, KVM opportunistically creates a writable SPTE if the mapping can be writable *and* the gPTE is dirty (or doesn't support the Dirty bit), i.e. 
if KVM doesn't need to intercept writes in order to emulate Dirty-bit updates. To actually test the L2 READ=>WRITE sequence, e.g. without masking a false pass by other test activity, route the READ=>WRITE and WRITE=>WRITE sequences to separate L1 pages, and differentiate between "marked dirty due to a WRITE access/fault" and "marked dirty due to creating a writable SPTE for a READ access/fault". The updated sequence exposes the bug fixed by KVM commit 1f4e5fc83a42 ("KVM: x86: fix nested guest live migration with PML") when the guest performs a READ=>WRITE sequence with dirty guest PTEs. Opportunistically tweak and rename the address macros, and add comments, to make it more obvious what the test is doing. E.g. NESTED_TEST_MEM1 vs. GUEST_TEST_MEM doesn't make it all that obvious that the test is creating aliases in both the L2 GPA and GVA address spaces, but only when L1 is using TDP to run L2. Cc: Yosry Ahmed Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20260115172154.709024-1-seanjc@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/include/x86/processor.h | 1 + .../testing/selftests/kvm/lib/x86/processor.c | 7 + .../selftests/kvm/x86/nested_dirty_log_test.c | 187 +++++++++++++----- 3 files changed, 143 insertions(+), 52 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 6bfffc3b0a33..4ebae4269e68 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1486,6 +1486,7 @@ bool kvm_cpu_has_tdp(void); void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size); void tdp_identity_map_default_memslots(struct kvm_vm *vm); void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size); +uint64_t *tdp_get_pte(struct kvm_vm *vm, uint64_t l2_gpa); /* * Basic CPU control in CR0 diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index ab869a98bbdc..fab18e9be66c 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -390,6 +390,13 @@ static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, return virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K); } +uint64_t *tdp_get_pte(struct kvm_vm *vm, uint64_t l2_gpa) +{ + int level = PG_LEVEL_4K; + + return __vm_get_page_table_entry(vm, &vm->stage2_mmu, l2_gpa, &level); +} + uint64_t *vm_get_pte(struct kvm_vm *vm, uint64_t vaddr) { int level = PG_LEVEL_4K; diff --git a/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c b/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c index 89d2e86a0db9..619229bbd693 100644 --- a/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c @@ -17,29 +17,54 @@ /* The memory slot index to track dirty pages */ #define TEST_MEM_SLOT_INDEX 1 -#define TEST_MEM_PAGES 3 -/* L1 guest test virtual memory offset */ -#define GUEST_TEST_MEM 0xc0000000 +/* + * Allocate four pages total. Two pages are used to verify that the KVM marks + * the accessed page/GFN as marked dirty, but not the "other" page. Times two + * so that each "normal" page can be accessed from L2 via an aliased L2 GVA+GPA + * (when TDP is enabled), to verify KVM marks _L1's_ page/GFN as dirty (to + * detect failures, L2 => L1 GPAs can't be identity mapped in the TDP page + * tables, as marking L2's GPA dirty would get a false pass if L1 == L2). 
+ */ +#define TEST_MEM_PAGES 4 -/* L2 guest test virtual memory offset */ -#define NESTED_TEST_MEM1 0xc0001000 -#define NESTED_TEST_MEM2 0xc0002000 +#define TEST_MEM_BASE 0xc0000000 +#define TEST_MEM_ALIAS_BASE 0xc0002000 + +#define TEST_GUEST_ADDR(base, idx) ((base) + (idx) * PAGE_SIZE) + +#define TEST_GVA(idx) TEST_GUEST_ADDR(TEST_MEM_BASE, idx) +#define TEST_GPA(idx) TEST_GUEST_ADDR(TEST_MEM_BASE, idx) + +#define TEST_ALIAS_GPA(idx) TEST_GUEST_ADDR(TEST_MEM_ALIAS_BASE, idx) + +#define TEST_HVA(vm, idx) addr_gpa2hva(vm, TEST_GPA(idx)) #define L2_GUEST_STACK_SIZE 64 -static void l2_guest_code(u64 *a, u64 *b) -{ - READ_ONCE(*a); - WRITE_ONCE(*a, 1); - GUEST_SYNC(true); - GUEST_SYNC(false); +/* Use the page offset bits to communicate the access+fault type. */ +#define TEST_SYNC_READ_FAULT BIT(0) +#define TEST_SYNC_WRITE_FAULT BIT(1) +#define TEST_SYNC_NO_FAULT BIT(2) - WRITE_ONCE(*b, 1); - GUEST_SYNC(true); - WRITE_ONCE(*b, 1); - GUEST_SYNC(true); - GUEST_SYNC(false); +static void l2_guest_code(vm_vaddr_t base) +{ + vm_vaddr_t page0 = TEST_GUEST_ADDR(base, 0); + vm_vaddr_t page1 = TEST_GUEST_ADDR(base, 1); + + READ_ONCE(*(u64 *)page0); + GUEST_SYNC(page0 | TEST_SYNC_READ_FAULT); + WRITE_ONCE(*(u64 *)page0, 1); + GUEST_SYNC(page0 | TEST_SYNC_WRITE_FAULT); + READ_ONCE(*(u64 *)page0); + GUEST_SYNC(page0 | TEST_SYNC_NO_FAULT); + + WRITE_ONCE(*(u64 *)page1, 1); + GUEST_SYNC(page1 | TEST_SYNC_WRITE_FAULT); + WRITE_ONCE(*(u64 *)page1, 1); + GUEST_SYNC(page1 | TEST_SYNC_WRITE_FAULT); + READ_ONCE(*(u64 *)page1); + GUEST_SYNC(page1 | TEST_SYNC_NO_FAULT); /* Exit to L1 and never come back. */ vmcall(); @@ -47,13 +72,22 @@ static void l2_guest_code(u64 *a, u64 *b) static void l2_guest_code_tdp_enabled(void) { - l2_guest_code((u64 *)NESTED_TEST_MEM1, (u64 *)NESTED_TEST_MEM2); + /* + * Use the aliased virtual addresses when running with TDP to verify + * that KVM correctly handles the case where a page is dirtied via a + * different GPA than would be used by L1. + */ + l2_guest_code(TEST_MEM_ALIAS_BASE); } static void l2_guest_code_tdp_disabled(void) { - /* Access the same L1 GPAs as l2_guest_code_tdp_enabled() */ - l2_guest_code((u64 *)GUEST_TEST_MEM, (u64 *)GUEST_TEST_MEM); + /* + * Use the "normal" virtual addresses when running without TDP enabled, + * in which case L2 will use the same page tables as L1, and thus needs + * to use the same virtual addresses that are mapped into L1. + */ + l2_guest_code(TEST_MEM_BASE); } void l1_vmx_code(struct vmx_pages *vmx) @@ -72,9 +106,9 @@ void l1_vmx_code(struct vmx_pages *vmx) prepare_vmcs(vmx, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]); - GUEST_SYNC(false); + GUEST_SYNC(TEST_SYNC_NO_FAULT); GUEST_ASSERT(!vmlaunch()); - GUEST_SYNC(false); + GUEST_SYNC(TEST_SYNC_NO_FAULT); GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_VMCALL); GUEST_DONE(); } @@ -91,9 +125,9 @@ static void l1_svm_code(struct svm_test_data *svm) generic_svm_setup(svm, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]); - GUEST_SYNC(false); + GUEST_SYNC(TEST_SYNC_NO_FAULT); run_guest(svm->vmcb, svm->vmcb_gpa); - GUEST_SYNC(false); + GUEST_SYNC(TEST_SYNC_NO_FAULT); GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL); GUEST_DONE(); } @@ -106,12 +140,66 @@ static void l1_guest_code(void *data) l1_svm_code(data); } +static void test_handle_ucall_sync(struct kvm_vm *vm, u64 arg, + unsigned long *bmap) +{ + vm_vaddr_t gva = arg & ~(PAGE_SIZE - 1); + int page_nr, i; + + /* + * Extract the page number of underlying physical page, which is also + * the _L1_ page number. 
The dirty bitmap _must_ be updated based on + * the L1 GPA, not L2 GPA, i.e. whether or not L2 used an aliased GPA + * (i.e. if TDP enabled for L2) is irrelevant with respect to the dirty + * bitmap and which underlying physical page is accessed. + * + * Note, gva will be '0' if there was no access, i.e. if the purpose of + * the sync is to verify all pages are clean. + */ + if (!gva) + page_nr = 0; + else if (gva >= TEST_MEM_ALIAS_BASE) + page_nr = (gva - TEST_MEM_ALIAS_BASE) >> PAGE_SHIFT; + else + page_nr = (gva - TEST_MEM_BASE) >> PAGE_SHIFT; + TEST_ASSERT(page_nr == 0 || page_nr == 1, + "Test bug, unexpected frame number '%u' for arg = %lx", page_nr, arg); + TEST_ASSERT(gva || (arg & TEST_SYNC_NO_FAULT), + "Test bug, gva must be valid if a fault is expected"); + + kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); + + /* + * Check all pages to verify the correct physical page was modified (or + * not), and that all pages are clean/dirty as expected. + * + * If a fault of any kind is expected, the target page should be dirty + * as the Dirty bit is set in the gPTE. KVM should create a writable + * SPTE even on a read fault, *and* KVM must mark the GFN as dirty + * when doing so. + */ + for (i = 0; i < TEST_MEM_PAGES; i++) { + if (i == page_nr && (arg & TEST_SYNC_WRITE_FAULT)) + TEST_ASSERT(*(u64 *)TEST_HVA(vm, i) == 1, + "Page %u incorrectly not written by guest", i); + else + TEST_ASSERT(*(u64 *)TEST_HVA(vm, i) == 0xaaaaaaaaaaaaaaaaULL, + "Page %u incorrectly written by guest", i); + + if (i == page_nr && !(arg & TEST_SYNC_NO_FAULT)) + TEST_ASSERT(test_bit(i, bmap), + "Page %u incorrectly reported clean on %s fault", + i, arg & TEST_SYNC_READ_FAULT ? "read" : "write"); + else + TEST_ASSERT(!test_bit(i, bmap), + "Page %u incorrectly reported dirty", i); + } +} + static void test_dirty_log(bool nested_tdp) { vm_vaddr_t nested_gva = 0; unsigned long *bmap; - uint64_t *host_test_mem; - struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct ucall uc; @@ -133,35 +221,46 @@ static void test_dirty_log(bool nested_tdp) /* Add an extra memory slot for testing dirty logging */ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - GUEST_TEST_MEM, + TEST_MEM_BASE, TEST_MEM_SLOT_INDEX, TEST_MEM_PAGES, KVM_MEM_LOG_DIRTY_PAGES); /* - * Add an identity map for GVA range [0xc0000000, 0xc0002000). This + * Add an identity map for GVA range [0xc0000000, 0xc0004000). This * affects both L1 and L2. However... */ - virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES); + virt_map(vm, TEST_MEM_BASE, TEST_MEM_BASE, TEST_MEM_PAGES); /* - * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to - * 0xc0000000. + * ... pages in the L2 GPA address range [0xc0002000, 0xc0004000) will + * map to [0xc0000000, 0xc0002000) when TDP is enabled (for L2). * * When TDP is disabled, the L2 guest code will still access the same L1 * GPAs as the TDP enabled case. + * + * Set the Dirty bit in the PTEs used by L2 so that KVM will create + * writable SPTEs when handling read faults (if the Dirty bit isn't + * set, KVM must intercept the next write to emulate the Dirty bit + * update). 
*/ if (nested_tdp) { tdp_identity_map_default_memslots(vm); - tdp_map(vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); - tdp_map(vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); + tdp_map(vm, TEST_ALIAS_GPA(0), TEST_GPA(0), PAGE_SIZE); + tdp_map(vm, TEST_ALIAS_GPA(1), TEST_GPA(1), PAGE_SIZE); + + *tdp_get_pte(vm, TEST_ALIAS_GPA(0)) |= PTE_DIRTY_MASK(&vm->stage2_mmu); + *tdp_get_pte(vm, TEST_ALIAS_GPA(1)) |= PTE_DIRTY_MASK(&vm->stage2_mmu); + } else { + *vm_get_pte(vm, TEST_GVA(0)) |= PTE_DIRTY_MASK(&vm->mmu); + *vm_get_pte(vm, TEST_GVA(1)) |= PTE_DIRTY_MASK(&vm->mmu); } bmap = bitmap_zalloc(TEST_MEM_PAGES); - host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); while (!done) { - memset(host_test_mem, 0xaa, TEST_MEM_PAGES * PAGE_SIZE); + memset(TEST_HVA(vm, 0), 0xaa, TEST_MEM_PAGES * PAGE_SIZE); + vcpu_run(vcpu); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); @@ -170,23 +269,7 @@ static void test_dirty_log(bool nested_tdp) REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: - /* - * The nested guest wrote at offset 0x1000 in the memslot, but the - * dirty bitmap must be filled in according to L1 GPA, not L2. - */ - kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); - if (uc.args[1]) { - TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean"); - TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest"); - } else { - TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest"); - } - - TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[PAGE_SIZE / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest"); - TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[PAGE_SIZE*2 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest"); + test_handle_ucall_sync(vm, uc.args[1], bmap); break; case UCALL_DONE: done = true; From 929fde5e8457ac9af0ab63b59009102de4a08704 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Wed, 17 Dec 2025 04:01:07 +0100 Subject: [PATCH 152/267] KVM: s390: vsie: retry SIE when unable to get vsie_page SIE may exit because of pending host work, such as handling an interrupt, in which case VSIE rewinds the guest PSW such that it is transparently resumed (see Fixes tag). Unlike those other places that return rc=0, this return leaves the guest PSW in place, requiring the guest to handle a spurious intercept. This showed up when testing heavy I/O workloads, when multiple vcpus attempted to dispatch the same SIE block and incurred failures inserting them into the radix tree. 
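For readers less familiar with the vsie flow, the retry idiom adopted here is sketched below (a simplified sketch that mirrors the hunk in the patch): back the guest PSW up over the 4-byte SIE instruction so the guest simply re-executes it instead of having to handle a spurious intercept.

	vsie_page = get_vsie_page(vcpu->kvm, scb_addr);
	if (!vsie_page) {
		/*
		 * Another vCPU already owns this SIE control block. Rewind
		 * the guest PSW past the 4-byte SIE instruction so the guest
		 * transparently retries it rather than seeing a spurious
		 * intercept.
		 */
		kvm_s390_rewind_psw(vcpu, 4);
		return 0;
	}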
Fixes: 33a729a1770b ("KVM: s390: vsie: retry SIE instruction on host intercepts") Signed-off-by: Eric Farman Reviewed-by: Christoph Schlameuss Reviewed-by: Claudio Imbrenda [frankja@linux.ibm.com: Replaced commit message as agreed on the list] Signed-off-by: Janosch Frank --- arch/s390/kvm/vsie.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index b526621d2a1b..b8064c6a4b57 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -1498,11 +1498,13 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) } vsie_page = get_vsie_page(vcpu->kvm, scb_addr); - if (IS_ERR(vsie_page)) + if (IS_ERR(vsie_page)) { return PTR_ERR(vsie_page); - else if (!vsie_page) + } else if (!vsie_page) { /* double use of sie control block - simply do nothing */ + kvm_s390_rewind_psw(vcpu, 4); return 0; + } rc = pin_scb(vcpu, vsie_page, scb_addr); if (rc) From 54adbfe40e3b6d238293cc36ef071ed9f35c8af7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 22 Jan 2026 08:51:53 +0000 Subject: [PATCH 153/267] KVM: arm64: Always populate FGT masks at boot time We currently only populate the FGT masks if the underlying HW supports FEAT_FGT. However, with the addition of the RES1 support for system registers, this results in a lot of noise at boot time, as reported by Nathan. That's because even if FGT isn't supported, we still check for the attribution of the bits to particular features, and not keeping the masks up-to-date leads to (fairly harmless) warnings. Given that we want these checks to be enforced even if the HW doesn't support FGT, enable the generation of FGT masks unconditionally (this is rather cheap anyway). Only the storage of the FGT configuration is avoided, which will save a tiny bit of memory on these machines.
Reported-by: Nathan Chancellor Tested-by: Nathan Chancellor Fixes: c259d763e6b09 ("KVM: arm64: Account for RES1 bits in DECLARE_FEAT_MAP() and co") Link: https://lore.kernel.org/r/20260120211558.GA834868@ax162 Link: https://patch.msgid.link/20260122085153.535538-1-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/emulate-nested.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 75d49f83342a..e5874effdf16 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -2276,9 +2276,6 @@ int __init populate_nv_trap_config(void) kvm_info("nv: %ld coarse grained trap handlers\n", ARRAY_SIZE(encoding_to_cgt)); - if (!cpus_have_final_cap(ARM64_HAS_FGT)) - goto check_mcb; - for (int i = 0; i < ARRAY_SIZE(encoding_to_fgt); i++) { const struct encoding_to_trap_config *fgt = &encoding_to_fgt[i]; union trap_config tc; @@ -2298,6 +2295,15 @@ int __init populate_nv_trap_config(void) } tc.val |= fgt->tc.val; + + if (!aggregate_fgt(tc)) { + ret = -EINVAL; + print_nv_trap_error(fgt, "FGT bit is reserved", ret); + } + + if (!cpus_have_final_cap(ARM64_HAS_FGT)) + continue; + prev = xa_store(&sr_forward_xa, enc, xa_mk_value(tc.val), GFP_KERNEL); @@ -2305,11 +2311,6 @@ int __init populate_nv_trap_config(void) ret = xa_err(prev); print_nv_trap_error(fgt, "Failed FGT insertion", ret); } - - if (!aggregate_fgt(tc)) { - ret = -EINVAL; - print_nv_trap_error(fgt, "FGT bit is reserved", ret); - } } } @@ -2325,7 +2326,6 @@ int __init populate_nv_trap_config(void) kvm_info("nv: %ld fine grained trap handlers\n", ARRAY_SIZE(encoding_to_fgt)); -check_mcb: for (int id = __MULTIPLE_CONTROL_BITS__; id < __COMPLEX_CONDITIONS__; id++) { const enum cgt_group_id *cgids; From 2eb80a2eee18762a33aa770d742d64fe47852c7e Mon Sep 17 00:00:00 2001 From: "Zenghui Yu (Huawei)" Date: Wed, 21 Jan 2026 18:16:31 +0800 Subject: [PATCH 154/267] KVM: arm64: nv: Return correct RES0 bits for FGT registers We had extended the sysreg masking infrastructure to more general registers, instead of restricting it to VNCR-backed registers, since commit a0162020095e ("KVM: arm64: Extend masking facility to arbitrary registers"). Fix kvm_get_sysreg_res0() to reflect this fact. Note that we're sure that we only deal with FGT registers in kvm_get_sysreg_res0(), so the if (sr < __VNCR_START__) check is actually never false and should probably be removed later.
Fixes: 69c19e047dfe ("KVM: arm64: Add TCR2_EL2 to the sysreg arrays") Signed-off-by: Zenghui Yu (Huawei) Link: https://patch.msgid.link/20260121101631.41037-1-zenghui.yu@linux.dev Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org --- arch/arm64/kvm/emulate-nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index e5874effdf16..774cfbf5b43b 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -2435,7 +2435,7 @@ static u64 kvm_get_sysreg_res0(struct kvm *kvm, enum vcpu_sysreg sr) masks = kvm->arch.sysreg_masks; - return masks->mask[sr - __VNCR_START__].res0; + return masks->mask[sr - __SANITISED_REG_START__].res0; } static bool check_fgt_bit(struct kvm_vcpu *vcpu, enum vcpu_sysreg sr, From c103c2dfe4975da31b67a0fcb95761359f30992d Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 22 Jan 2026 11:22:15 +0000 Subject: [PATCH 155/267] KVM: arm64: Remove dead code resetting HCR_EL2 for pKVM The pKVM lifecycle does not support tearing down the hypervisor and returning to the hyp stub once initialized. The transition to protected mode is one-way. Consequently, the code path in hyp-init.S responsible for resetting EL2 registers (triggered by kexec or hibernation) is unreachable in protected mode. Remove the dead code handling HCR_EL2 reset for ARM64_KVM_PROTECTED_MODE. No functional change intended. Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260122112218.531948-2-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/hyp-init.S | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index aada42522e7b..0d42eedc7167 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -260,11 +260,6 @@ reset: msr sctlr_el2, x5 isb -alternative_if ARM64_KVM_PROTECTED_MODE - mov_q x5, HCR_HOST_NVHE_FLAGS - msr_hcr_el2 x5 -alternative_else_nop_endif - /* Install stub vectors */ adr_l x5, __hyp_stub_vectors msr vbar_el2, x5 From f35abcbb8a084db4c24b66ccc8db0405c08e2f61 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 22 Jan 2026 11:22:16 +0000 Subject: [PATCH 156/267] KVM: arm64: Trap MTE access and discovery when MTE is disabled If MTE is not supported by the hardware, or is disabled in the kernel configuration (`CONFIG_ARM64_MTE=n`) or command line (`arm64.nomte`), the kernel stops advertising MTE to userspace and avoids using MTE instructions. However, this is a software-level disable only. When MTE hardware is present and enabled by EL3 firmware, leaving `HCR_EL2.ATA` set allows the host to execute MTE instructions (STG, LDG, etc.) and access allocation tags in physical memory. Prevent this by clearing `HCR_EL2.ATA` when MTE is disabled. Remove it from the `HCR_HOST_NVHE_FLAGS` default, and conditionally set it in `cpu_prepare_hyp_mode()` only when `system_supports_mte()` returns true. This causes MTE instructions to trap to EL2 when `HCR_EL2.ATA` is cleared. Additionally, set `HCR_EL2.TID5` when MTE is disabled. This traps reads of `GMID_EL1` (Multiple tag transfer ID register) to EL2, preventing the discovery of MTE parameters (such as tag block size) when the feature is suppressed. Early boot code in `head.S` temporarily keeps `HCR_ATA` set to avoid special-casing initialization paths. This is safe because this code executes before untrusted code runs and will clear `HCR_ATA` if MTE is disabled. 
Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260122112218.531948-3-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_arm.h | 2 +- arch/arm64/kernel/head.S | 2 +- arch/arm64/kvm/arm.c | 6 ++++++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index e500600e4b9b..752e3e1604e8 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -101,7 +101,7 @@ HCR_BSU_IS | HCR_FB | HCR_TACR | \ HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \ HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3 | HCR_TID1) -#define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA) +#define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK) #define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC) #define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H | HCR_AMO | HCR_IMO | HCR_FMO) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index ca04b338cb0d..87a822e5c4ca 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -299,7 +299,7 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) isb 0: - init_el2_hcr HCR_HOST_NVHE_FLAGS + init_el2_hcr HCR_HOST_NVHE_FLAGS | HCR_ATA init_el2_state /* Hypervisor stub */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 4f80da0c0d1d..aeac113e5e74 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2044,6 +2044,12 @@ static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS; else params->hcr_el2 = HCR_HOST_NVHE_FLAGS; + + if (system_supports_mte()) + params->hcr_el2 |= HCR_ATA; + else + params->hcr_el2 |= HCR_TID5; + if (cpus_have_final_cap(ARM64_KVM_HVHE)) params->hcr_el2 |= HCR_E2H; params->vttbr = params->vtcr = 0; From 5ee8ad69da07d0e2cffa0ce2f2339c9ad2d587f2 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 22 Jan 2026 11:22:17 +0000 Subject: [PATCH 157/267] KVM: arm64: Inject UNDEF when accessing MTE sysregs with MTE disabled When MTE hardware is present but disabled via software (`arm64.nomte` or `CONFIG_ARM64_MTE=n`), the kernel clears `HCR_EL2.ATA` and sets `HCR_EL2.TID5`, to prevent the use of MTE instructions. Additionally, accesses to certain MTE system registers trap to EL2 with exception class ESR_ELx_EC_SYS64. To emulate hardware without MTE (where such accesses would cause an Undefined Instruction exception), inject UNDEF into the host. Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260122112218.531948-4-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 67 ++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index a7c689152f68..faed1b38e6cc 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -687,6 +687,69 @@ static void handle_host_smc(struct kvm_cpu_context *host_ctxt) kvm_skip_host_instr(); } +/* + * Inject an Undefined Instruction exception into the host. + * + * This is open-coded to allow control over PSTATE construction without + * complicating the generic exception entry helpers. 
+ */ +static void inject_undef64(void) +{ + u64 spsr_mask, vbar, sctlr, old_spsr, new_spsr, esr, offset; + + spsr_mask = PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT | PSR_DIT_BIT | PSR_PAN_BIT; + + vbar = read_sysreg_el1(SYS_VBAR); + sctlr = read_sysreg_el1(SYS_SCTLR); + old_spsr = read_sysreg_el2(SYS_SPSR); + + new_spsr = old_spsr & spsr_mask; + new_spsr |= PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT; + new_spsr |= PSR_MODE_EL1h; + + if (!(sctlr & SCTLR_EL1_SPAN)) + new_spsr |= PSR_PAN_BIT; + + if (sctlr & SCTLR_ELx_DSSBS) + new_spsr |= PSR_SSBS_BIT; + + if (system_supports_mte()) + new_spsr |= PSR_TCO_BIT; + + esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT) | ESR_ELx_IL; + offset = CURRENT_EL_SP_ELx_VECTOR + except_type_sync; + + write_sysreg_el1(esr, SYS_ESR); + write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR); + write_sysreg_el1(old_spsr, SYS_SPSR); + write_sysreg_el2(vbar + offset, SYS_ELR); + write_sysreg_el2(new_spsr, SYS_SPSR); +} + +static bool handle_host_mte(u64 esr) +{ + switch (esr_sys64_to_sysreg(esr)) { + case SYS_RGSR_EL1: + case SYS_GCR_EL1: + case SYS_TFSR_EL1: + case SYS_TFSRE0_EL1: + /* If we're here for any reason other than MTE, it's a bug. */ + if (read_sysreg(HCR_EL2) & HCR_ATA) + return false; + break; + case SYS_GMID_EL1: + /* If we're here for any reason other than MTE, it's a bug. */ + if (!(read_sysreg(HCR_EL2) & HCR_TID5)) + return false; + break; + default: + return false; + } + + inject_undef64(); + return true; +} + void handle_trap(struct kvm_cpu_context *host_ctxt) { u64 esr = read_sysreg_el2(SYS_ESR); @@ -702,6 +765,10 @@ void handle_trap(struct kvm_cpu_context *host_ctxt) case ESR_ELx_EC_DABT_LOW: handle_host_mem_abort(host_ctxt); break; + case ESR_ELx_EC_SYS64: + if (handle_host_mte(esr)) + break; + fallthrough; default: BUG(); } From 230b080623fec2e1302df2afe2cf2dcb34f9c89b Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 22 Jan 2026 11:22:18 +0000 Subject: [PATCH 158/267] KVM: arm64: Use kvm_has_mte() in pKVM trap initialization When initializing HCR traps in protected mode, use kvm_has_mte() to check for MTE support rather than kvm_has_feat(kvm, ID_AA64PFR1_EL1, MTE, IMP). 
kvm_has_mte() provides a more comprehensive check: - kvm_has_feat() only checks if MTE is in the guest's ID register view (i.e., what we advertise to the guest) - kvm_has_mte() checks both system_supports_mte() AND whether KVM_ARCH_FLAG_MTE_ENABLED is set for this VM instance Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260122112218.531948-5-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 8911338961c5..9b933424e70a 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -82,7 +82,7 @@ static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu) if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP)) val &= ~(HCR_AMVOFFEN); - if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, MTE, IMP)) { + if (!kvm_has_mte(kvm)) { val |= HCR_TID5; val &= ~(HCR_DCT | HCR_ATA); } From c4a365cd4a4ec105012ab3ae5ff5cf11f8533771 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 Jan 2026 18:28:16 -0800 Subject: [PATCH 159/267] KVM: x86: Drop WARN on INIT/SIPI being blocked when vCPU is in Wait-For-SIPI Drop the sanity check in kvm_apic_accept_events() that attempts to detect KVM bugs by asserting that a vCPU isn't in Wait-For-SIPI if INIT/SIPI are blocked, because if INIT is blocked, then it should be impossible for a vCPU to get into WFS in the first place. Unfortunately, syzbot is smarter than KVM (and its maintainers), and circumvented the guards put in place by commit 0fe3e8d804fd ("KVM: x86: Move INIT_RECEIVED vs. INIT/SIPI blocked check to KVM_RUN") by swapping the order and stuffing VMXON after INIT, and then triggering kvm_apic_accept_events() by way of KVM_GET_MP_STATE. Simply drop the WARN as it hasn't detected any meaningful KVM bugs in years (if ever?), and preventing userspace from clobbering guest state is generally a non-goal. More importantly, fully closing the hole would likely require enforcing some amount of ordering in KVM's ioctls, which is a much bigger risk than simply deleting the WARN. Reported-by: syzbot+59f2c3a3fc4f6c09b8cd@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/6925da1b.a70a0220.d98e3.00b0.GAE@google.com Link: https://patch.msgid.link/20260123022816.2283567-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1597dd0b0cc6..01c2557bfa05 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -3498,7 +3498,6 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu) * wait-for-SIPI (WFS). */ if (!kvm_apic_init_sipi_allowed(vcpu)) { - WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); clear_bit(KVM_APIC_SIPI, &apic->pending_events); return 0; } From fa9893fadbc245e179cb17f3c371c67471b5a8a8 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Fri, 9 Jan 2026 17:17:32 -0600 Subject: [PATCH 160/267] KVM: Introduce KVM_EXIT_SNP_REQ_CERTS for SNP certificate-fetching For SEV-SNP, the host can optionally provide a certificate table to the guest when it issues an attestation request to firmware (see GHCB 2.0 specification regarding "SNP Extended Guest Requests"). This certificate table can then be used to verify the endorsement key used by firmware to sign the attestation report. 
While it is possible for guests to obtain the certificates through other means, handling it via the host provides more flexibility in being able to keep the certificate data in sync with the endorsement key throughout host-side operations that might resulting in the endorsement key changing. In the case of KVM, userspace will be responsible for fetching the certificate table and keeping it in sync with any modifications to the endorsement key by other userspace management tools. Define a new KVM_EXIT_SNP_REQ_CERTS event where userspace is provided with the GPA of the buffer the guest has provided as part of the attestation request so that userspace can write the certificate data into it while relying on filesystem-based locking to keep the certificates up-to-date relative to the endorsement keys installed/utilized by firmware at the time the certificates are fetched. [Melody: Update the documentation scheme about how file locking is expected to happen.] Reviewed-by: Liam Merwick Tested-by: Liam Merwick Tested-by: Dionna Glaze Signed-off-by: Michael Roth Signed-off-by: Melody Wang Signed-off-by: Michael Roth Link: https://patch.msgid.link/20260109231732.1160759-2-michael.roth@amd.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 44 ++++++++++++++++++++++++ arch/x86/kvm/svm/sev.c | 62 ++++++++++++++++++++++++++++++---- arch/x86/kvm/svm/svm.h | 1 + include/uapi/linux/kvm.h | 9 +++++ 4 files changed, 110 insertions(+), 6 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 01a3abef8abb..428d7d9cb4d6 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7353,6 +7353,50 @@ Please note that the kernel is allowed to use the kvm_run structure as the primary storage for certain register types. Therefore, the kernel may use the values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set. +:: + + /* KVM_EXIT_SNP_REQ_CERTS */ + struct kvm_exit_snp_req_certs { + __u64 gpa; + __u64 npages; + __u64 ret; + }; + +KVM_EXIT_SNP_REQ_CERTS indicates an SEV-SNP guest with certificate-fetching +enabled (see KVM_SEV_SNP_ENABLE_REQ_CERTS) has generated an Extended Guest +Request NAE #VMGEXIT (SNP_GUEST_REQUEST) with message type MSG_REPORT_REQ, +i.e. has requested an attestation report from firmware, and would like the +certificate data corresponding to the attestation report signature to be +provided by the hypervisor as part of the request. + +To allow for userspace to provide the certificate, the 'gpa' and 'npages' +are forwarded verbatim from the guest request (the RAX and RBX GHCB fields +respectively). 'ret' is not an "output" from KVM, and is always '0' on +exit. KVM verifies the 'gpa' is 4KiB aligned prior to exiting to userspace, +but otherwise the information from the guest isn't validated. + +Upon the next KVM_RUN, e.g. after userspace has serviced the request (or not), +KVM will complete the #VMGEXIT, using the 'ret' field to determine whether to +signal success or failure to the guest, and on failure, what reason code will +be communicated via SW_EXITINFO2. If 'ret' is set to an unsupported value (see +the table below), KVM_RUN will fail with -EINVAL. For a 'ret' of 'ENOSPC', KVM +also consumes the 'npages' field, i.e. userspace can use the field to inform +the guest of the number of pages needed to hold all the certificate data. 
+ +The supported 'ret' values and their respective SW_EXITINFO2 encodings: + + ====== ============================================================= + 0 0x0, i.e. success. KVM will emit an SNP_GUEST_REQUEST command + to SNP firmware. + ENOSPC 0x0000000100000000, i.e. not enough guest pages to hold the + certificate table and certificate data. KVM will also set the + RBX field in the GHBC to 'npages'. + EAGAIN 0x0000000200000000, i.e. the host is busy and the guest should + retry the request. + EIO 0xffffffff00000000, for all other errors (this return code is + a KVM-defined hypervisor value, as allowed by the GHCB) + ====== ============================================================= + .. _cap_enable: diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index f67525007089..9e6a78e448f2 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -41,6 +41,16 @@ #define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION) +/* + * The GHCB spec essentially states that all non-zero error codes other than + * those explicitly defined above should be treated as an error by the guest. + * Define a generic error to cover that case, and choose a value that is not + * likely to overlap with new explicit error codes should more be added to + * the GHCB spec later. KVM will use this to report generic errors when + * handling SNP guest requests. + */ +#define SNP_GUEST_VMM_ERR_GENERIC (~0U) + /* enable/disable SEV support */ static bool sev_enabled = true; module_param_named(sev, sev_enabled, bool, 0444); @@ -4139,6 +4149,36 @@ out_unlock: return ret; } +static int snp_req_certs_err(struct vcpu_svm *svm, u32 vmm_error) +{ + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(vmm_error, 0)); + + return 1; /* resume guest */ +} + +static int snp_complete_req_certs(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; + + switch (READ_ONCE(vcpu->run->snp_req_certs.ret)) { + case 0: + return snp_handle_guest_req(svm, control->exit_info_1, + control->exit_info_2); + case ENOSPC: + vcpu->arch.regs[VCPU_REGS_RBX] = vcpu->run->snp_req_certs.npages; + return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_INVALID_LEN); + case EAGAIN: + return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_BUSY); + case EIO: + return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_GENERIC); + default: + break; + } + + return -EINVAL; +} + static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) { struct kvm *kvm = svm->vcpu.kvm; @@ -4154,14 +4194,15 @@ static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t r /* * As per GHCB spec, requests of type MSG_REPORT_REQ also allow for * additional certificate data to be provided alongside the attestation - * report via the guest-provided data pages indicated by RAX/RBX. The - * certificate data is optional and requires additional KVM enablement - * to provide an interface for userspace to provide it, but KVM still - * needs to be able to handle extended guest requests either way. So - * provide a stub implementation that will always return an empty - * certificate table in the guest-provided data pages. + * report via the guest-provided data pages indicated by RAX/RBX. If + * userspace enables KVM_EXIT_SNP_REQ_CERTS, then exit to userspace + * to give userspace an opportunity to provide the certificate data + * before issuing/completing the attestation request. 
Otherwise, return + * an empty certificate table in the guest-provided data pages and + * handle the attestation request immediately. */ if (msg_type == SNP_MSG_REPORT_REQ) { + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_vcpu *vcpu = &svm->vcpu; u64 data_npages; gpa_t data_gpa; @@ -4175,6 +4216,15 @@ static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t r if (!PAGE_ALIGNED(data_gpa)) goto request_invalid; + if (sev->snp_certs_enabled) { + vcpu->run->exit_reason = KVM_EXIT_SNP_REQ_CERTS; + vcpu->run->snp_req_certs.gpa = data_gpa; + vcpu->run->snp_req_certs.npages = data_npages; + vcpu->run->snp_req_certs.ret = 0; + vcpu->arch.complete_userspace_io = snp_complete_req_certs; + return 0; + } + /* * As per GHCB spec (see "SNP Extended Guest Request"), the * certificate table is terminated by 24-bytes of zeroes. diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 338fc4f5cc4c..ebd7b36b1ceb 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -115,6 +115,7 @@ struct kvm_sev_info { void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */ struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */ cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */ + bool snp_certs_enabled; /* SNP certificate-fetching support. */ }; struct kvm_svm { diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index dddb781b0507..8cd107cdcf0b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -135,6 +135,12 @@ struct kvm_xen_exit { } u; }; +struct kvm_exit_snp_req_certs { + __u64 gpa; + __u64 npages; + __u64 ret; +}; + #define KVM_S390_GET_SKEYS_NONE 1 #define KVM_S390_SKEYS_MAX 1048576 @@ -180,6 +186,7 @@ struct kvm_xen_exit { #define KVM_EXIT_MEMORY_FAULT 39 #define KVM_EXIT_TDX 40 #define KVM_EXIT_ARM_SEA 41 +#define KVM_EXIT_SNP_REQ_CERTS 42 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -482,6 +489,8 @@ struct kvm_run { __u64 gva; __u64 gpa; } arm_sea; + /* KVM_EXIT_SNP_REQ_CERTS */ + struct kvm_exit_snp_req_certs snp_req_certs; /* Fix the size of the union. */ char padding[256]; }; From 20c3c4108d58f87c711bf44cb0b498b3ac5af6bf Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Fri, 9 Jan 2026 17:17:33 -0600 Subject: [PATCH 161/267] KVM: SEV: Add KVM_SEV_SNP_ENABLE_REQ_CERTS command Introduce a new command for KVM_MEMORY_ENCRYPT_OP ioctl that can be used to enable fetching of endorsement key certificates from userspace via the new KVM_EXIT_SNP_REQ_CERTS exit type. Also introduce a new KVM_X86_SEV_SNP_REQ_CERTS KVM device attribute so that userspace can query whether the kernel supports the new command/exit. 
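For illustration, the intended userspace flow could look roughly like the sketch below. This is only a sketch: error handling, the certificate file locking described in the documentation, and the VMM's mapping of guest physical addresses are omitted; vm_fd and sev_fd are assumed to be the VM and /dev/sev file descriptors the VMM already holds, run is the mmap()ed struct kvm_run, and copy_certs_into_guest() is a hypothetical helper, not part of any real VMM.

	/* Opt in once per SNP VM, before any vCPUs are created. */
	struct kvm_sev_cmd cmd = {
		.id     = KVM_SEV_SNP_ENABLE_REQ_CERTS,
		.sev_fd = sev_fd,
	};
	ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);

	/* Later, in the vCPU run loop: */
	if (run->exit_reason == KVM_EXIT_SNP_REQ_CERTS) {
		uint64_t need_npages;

		/*
		 * Hypothetical helper: writes the certificate table into the
		 * guest buffer at run->snp_req_certs.gpa, which spans
		 * run->snp_req_certs.npages 4KiB pages, and reports how many
		 * pages would be needed if the buffer is too small.
		 */
		if (copy_certs_into_guest(run->snp_req_certs.gpa,
					  run->snp_req_certs.npages,
					  &need_npages)) {
			run->snp_req_certs.ret = 0;		/* success */
		} else {
			/* Tell the guest how many pages it actually needs. */
			run->snp_req_certs.npages = need_npages;
			run->snp_req_certs.ret = ENOSPC;
		}
		/* KVM completes the #VMGEXIT on the next KVM_RUN. */
	}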
Suggested-by: Sean Christopherson Reviewed-by: Liam Merwick Tested-by: Liam Merwick Signed-off-by: Michael Roth Link: https://patch.msgid.link/20260109231732.1160759-3-michael.roth@amd.com Signed-off-by: Sean Christopherson --- .../virt/kvm/x86/amd-memory-encryption.rst | 52 ++++++++++++++++++- arch/x86/include/uapi/asm/kvm.h | 2 + arch/x86/kvm/svm/sev.c | 16 ++++++ 3 files changed, 69 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst index 1ddb6a86ce7f..543b5e5dd8d4 100644 --- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst @@ -572,6 +572,52 @@ Returns: 0 on success, -negative on error See SNP_LAUNCH_FINISH in the SEV-SNP specification [snp-fw-abi]_ for further details on the input parameters in ``struct kvm_sev_snp_launch_finish``. +21. KVM_SEV_SNP_ENABLE_REQ_CERTS +-------------------------------- + +The KVM_SEV_SNP_ENABLE_REQ_CERTS command will configure KVM to exit to +userspace with a ``KVM_EXIT_SNP_REQ_CERTS`` exit type as part of handling +a guest attestation report, which will to allow userspace to provide a +certificate corresponding to the endorsement key used by firmware to sign +that attestation report. + +Returns: 0 on success, -negative on error + +NOTE: The endorsement key used by firmware may change as a result of +management activities like updating SEV-SNP firmware or loading new +endorsement keys, so some care should be taken to keep the returned +certificate data in sync with the actual endorsement key in use by +firmware at the time the attestation request is sent to SNP firmware. The +recommended scheme to do this is to use file locking (e.g. via fcntl()'s +F_OFD_SETLK) in the following manner: + + - Prior to obtaining/providing certificate data as part of servicing an + exit type of ``KVM_EXIT_SNP_REQ_CERTS``, the VMM should obtain a + shared/read or exclusive/write lock on the certificate blob file before + reading it and returning it to KVM, and continue to hold the lock until + the attestation request is actually sent to firmware. To facilitate + this, the VMM can set the ``immediate_exit`` flag of kvm_run just after + supplying the certificate data, and just before resuming the vCPU. + This will ensure the vCPU will exit again to userspace with ``-EINTR`` + after it finishes fetching the attestation request from firmware, at + which point the VMM can safely drop the file lock. + + - Tools/libraries that perform updates to SNP firmware TCB values or + endorsement keys (e.g. via /dev/sev interfaces such as ``SNP_COMMIT``, + ``SNP_SET_CONFIG``, or ``SNP_VLEK_LOAD``, see + Documentation/virt/coco/sev-guest.rst for more details) in such a way + that the certificate blob needs to be updated, should similarly take an + exclusive lock on the certificate blob for the duration of any updates + to endorsement keys or the certificate blob contents to ensure that + VMMs using the above scheme will not return certificate blob data that + is out of sync with the endorsement key used by firmware at the time + the attestation request is actually issued. + +This scheme is recommended so that tools can use a fairly generic/natural +approach to synchronizing firmware/certificate updates via file-locking, +which should make it easier to maintain interoperability across +tools/VMMs/vendors. 
+ Device attribute API ==================== @@ -579,11 +625,15 @@ Attributes of the SEV implementation can be retrieved through the ``KVM_HAS_DEVICE_ATTR`` and ``KVM_GET_DEVICE_ATTR`` ioctls on the ``/dev/kvm`` device node, using group ``KVM_X86_GRP_SEV``. -Currently only one attribute is implemented: +The following attributes are currently implemented: * ``KVM_X86_SEV_VMSA_FEATURES``: return the set of all bits that are accepted in the ``vmsa_features`` of ``KVM_SEV_INIT2``. +* ``KVM_X86_SEV_SNP_REQ_CERTS``: return a value of 1 if the kernel supports the + ``KVM_EXIT_SNP_REQ_CERTS`` exit, which allows for fetching endorsement key + certificates from userspace for each SNP attestation request the guest issues. + Firmware Management =================== diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 7ceff6583652..b2c928c5965d 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -503,6 +503,7 @@ struct kvm_sync_regs { #define KVM_X86_GRP_SEV 1 # define KVM_X86_SEV_VMSA_FEATURES 0 # define KVM_X86_SNP_POLICY_BITS 1 +# define KVM_X86_SEV_SNP_REQ_CERTS 2 struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; @@ -743,6 +744,7 @@ enum sev_cmd_id { KVM_SEV_SNP_LAUNCH_START = 100, KVM_SEV_SNP_LAUNCH_UPDATE, KVM_SEV_SNP_LAUNCH_FINISH, + KVM_SEV_SNP_ENABLE_REQ_CERTS, KVM_SEV_NR_MAX, }; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 9e6a78e448f2..f9aad5c1447e 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2158,6 +2158,9 @@ int sev_dev_get_attr(u32 group, u64 attr, u64 *val) *val = snp_supported_policy_bits; return 0; + case KVM_X86_SEV_SNP_REQ_CERTS: + *val = sev_snp_enabled ? 1 : 0; + return 0; default: return -ENXIO; } @@ -2574,6 +2577,16 @@ e_free: return ret; } +static int snp_enable_certs(struct kvm *kvm) +{ + if (kvm->created_vcpus || !sev_snp_guest(kvm)) + return -EINVAL; + + to_kvm_sev_info(kvm)->snp_certs_enabled = true; + + return 0; +} + int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -2679,6 +2692,9 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) case KVM_SEV_SNP_LAUNCH_FINISH: r = snp_launch_finish(kvm, &sev_cmd); break; + case KVM_SEV_SNP_ENABLE_REQ_CERTS: + r = snp_enable_certs(kvm); + break; default: r = -EINVAL; goto out; From f24ef0093dd8cf60ed7f93a82fb16335fdcbc310 Mon Sep 17 00:00:00 2001 From: Zhao Liu Date: Thu, 20 Nov 2025 13:07:17 +0800 Subject: [PATCH 162/267] KVM: x86: Advertise MOVRS CPUID to userspace Define the feature flag for MOVRS and advertise support to userspace when the feature is supported by the host. MOVRS is a new set of instructions introduced in the Intel platform Diamond Rapids, to provide load instructions that carry a read-shared hint. Functionally, MOVRS family is equivalent to existing load instructions, but its read-shared hint indicates that the source memory location is likely to become read-shared by multiple processors, i.e., read in the future by at least one other processor before it is written (assuming it is ever written in the future). This hint could optimize the behavior of the caches, especially shared caches, for this data for future reads by multiple processors. Additionally, MOVRS family also includes a software prefetch instruction, PREFETCHRST2, that carries the same read-shared hint. [*] MOVRS family is enumerated by CPUID single-bit (0x7.0x1.EAX[bit 31]). 
Since it's on a densely-populated CPUID leaf and some other bits on this leaf have kernel usages, define this new feature in cpufeatures.h, but hide it in /proc/cpuinfo due to lack of current kernel usage. Advertise MOVRS bit to userspace directly. It's safe, since there are no new VMX controls or additional host enabling required for guests to use this feature. [*]: Intel Architecture Instruction Set Extensions and Future Features (rev.059). Tested-by: Xudong Hao Signed-off-by: Zhao Liu Reviewed-by: Xiaoyao Li Reviewed-by: Paolo Bonzini Link: https://patch.msgid.link/20251120050720.931449-2-zhao1.liu@intel.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kvm/cpuid.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index c3b53beb1300..33d0f5764e24 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -326,6 +326,7 @@ #define X86_FEATURE_AMX_FP16 (12*32+21) /* AMX fp16 Support */ #define X86_FEATURE_AVX_IFMA (12*32+23) /* Support for VPMADD52[H,L]UQ */ #define X86_FEATURE_LAM (12*32+26) /* "lam" Linear Address Masking */ +#define X86_FEATURE_MOVRS (12*32+31) /* MOVRS instructions */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index f37331ad3ad8..e83b73a4aad8 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1028,6 +1028,7 @@ void kvm_set_cpu_caps(void) F(AMX_FP16), F(AVX_IFMA), F(LAM), + F(MOVRS), ); kvm_cpu_cap_init(CPUID_7_1_ECX, From 58cbaf64e6530fa37752ff1362129842ac94fee0 Mon Sep 17 00:00:00 2001 From: Zhao Liu Date: Thu, 20 Nov 2025 13:07:18 +0800 Subject: [PATCH 163/267] KVM: x86: Advertise AMX CPUIDs in subleaf 0x1E.0x1 to userspace Define and advertise AMX CPUIDs (0x1E.0x1) to userspace when the leaf is supported by the host. Intel Diamond Rapids adds new AMX instructions to support new formats and memory operations [*], and introduces the CPUID subleaf 0x1E.0x1 to centralize the discrete AMX feature bits within EAX. Since these AMX features have no actual kernel usages, define them as KVM-only features in reverse_cpuid.h. In addition to the new features, CPUID 0x1E.0x1.EAX[bits 0-3] are aliased positions of existing AMX feature bits distributed across the 0x7 leaves. To avoid duplicate feature names, name these aliases with an *_ALIAS suffix, and define them in reverse_cpuid.h as KVM-only features as well. Advertise new CPUID subleaf 0x1E.0x1 with its AMX CPUID feature bits to userspace for guest use. It's safe since no additional enabling work is needed in the host kernel. [*]: Intel Architecture Instruction Set Extensions and Future Features (rev.059).
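For illustration only, a guest (or a VMM building a vCPU model) could probe the new sub-leaf roughly as follows, using the compiler-provided <cpuid.h> helpers. The bit positions are the ones defined in the hunks that follow, and the sub-leaf 0 EAX check mirrors how KVM caps the maximum sub-leaf at 1; this is a sketch, not an exhaustive enumeration routine.

	#include <cpuid.h>
	#include <stdbool.h>

	static bool amx_movrs_supported(void)
	{
		unsigned int eax, ebx, ecx, edx;

		/* Leaf 0x1e must exist; its sub-leaf 0 EAX gives the max sub-leaf. */
		if (__get_cpuid_max(0, NULL) < 0x1e)
			return false;
		__cpuid_count(0x1e, 0, eax, ebx, ecx, edx);
		if (eax < 1)
			return false;

		/*
		 * Sub-leaf 1 EAX: bits 0-3 alias AMX-INT8/BF16/COMPLEX/FP16 from
		 * the 0x7 leaves, bit 4 = AMX-FP8, bit 6 = AMX-TF32,
		 * bit 7 = AMX-AVX512, bit 8 = AMX-MOVRS.
		 */
		__cpuid_count(0x1e, 1, eax, ebx, ecx, edx);
		return eax & (1u << 8);
	}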
Tested-by: Xudong Hao Signed-off-by: Zhao Liu Reviewed-by: Xiaoyao Li Reviewed-by: Paolo Bonzini Link: https://patch.msgid.link/20251120050720.931449-3-zhao1.liu@intel.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c | 25 +++++++++++++++++++++++++ arch/x86/kvm/reverse_cpuid.h | 15 +++++++++++++++ 3 files changed, 41 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5a3bfa293e8b..c1859c65453f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -776,6 +776,7 @@ enum kvm_only_cpuid_leafs { CPUID_24_0_EBX, CPUID_8000_0021_ECX, CPUID_7_1_ECX, + CPUID_1E_1_EAX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index e83b73a4aad8..750c353e0dbf 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1067,6 +1067,17 @@ void kvm_set_cpu_caps(void) SCATTERED_F(SGX_EDECCSSA), ); + kvm_cpu_cap_init(CPUID_1E_1_EAX, + F(AMX_INT8_ALIAS), + F(AMX_BF16_ALIAS), + F(AMX_COMPLEX_ALIAS), + F(AMX_FP16_ALIAS), + F(AMX_FP8), + F(AMX_TF32), + F(AMX_AVX512), + F(AMX_MOVRS), + ); + kvm_cpu_cap_init(CPUID_24_0_EBX, F(AVX10_128), F(AVX10_256), @@ -1627,6 +1638,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } + + max_idx = entry->eax = min(entry->eax, 1u); + + /* KVM only supports up to 0x1e.0x1, capped above via min(). */ + if (max_idx >= 1) { + entry = do_host_cpuid(array, function, 1); + if (!entry) + goto out; + + cpuid_entry_override(entry, CPUID_1E_1_EAX); + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; + } break; case 0x24: { u8 avx10_version; diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 81b4a7acf72e..7dc4200a26f1 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -44,6 +44,20 @@ #define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) #define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5) +/* + * Intel-defined sub-features, CPUID level 0x0000001E:1 (EAX). Note, several + * of the bits are aliases to features of the same name that are enumerated via + * various CPUID.0x7 sub-leafs. 
+ */ +#define X86_FEATURE_AMX_INT8_ALIAS KVM_X86_FEATURE(CPUID_1E_1_EAX, 0) +#define X86_FEATURE_AMX_BF16_ALIAS KVM_X86_FEATURE(CPUID_1E_1_EAX, 1) +#define X86_FEATURE_AMX_COMPLEX_ALIAS KVM_X86_FEATURE(CPUID_1E_1_EAX, 2) +#define X86_FEATURE_AMX_FP16_ALIAS KVM_X86_FEATURE(CPUID_1E_1_EAX, 3) +#define X86_FEATURE_AMX_FP8 KVM_X86_FEATURE(CPUID_1E_1_EAX, 4) +#define X86_FEATURE_AMX_TF32 KVM_X86_FEATURE(CPUID_1E_1_EAX, 6) +#define X86_FEATURE_AMX_AVX512 KVM_X86_FEATURE(CPUID_1E_1_EAX, 7) +#define X86_FEATURE_AMX_MOVRS KVM_X86_FEATURE(CPUID_1E_1_EAX, 8) + /* Intel-defined sub-features, CPUID level 0x00000024:0 (EBX) */ #define X86_FEATURE_AVX10_128 KVM_X86_FEATURE(CPUID_24_0_EBX, 16) #define X86_FEATURE_AVX10_256 KVM_X86_FEATURE(CPUID_24_0_EBX, 17) @@ -90,6 +104,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX}, [CPUID_8000_0021_ECX] = {0x80000021, 0, CPUID_ECX}, [CPUID_7_1_ECX] = { 7, 1, CPUID_ECX}, + [CPUID_1E_1_EAX] = { 0x1e, 1, CPUID_EAX}, }; /* From 2ff8fb1e65e1a97f75f15935f84b9d2882a49623 Mon Sep 17 00:00:00 2001 From: Zhao Liu Date: Thu, 20 Nov 2025 13:07:19 +0800 Subject: [PATCH 164/267] KVM: x86: Advertise AVX10.2 CPUID to userspace Bump up the maximum supported AVX10 version and enumerate AVX10.2 to userspace when it's supported by the host. Intel AVX10 Version 2 (Intel AVX10.2) includes a suite of new instructions delivering new AI features and performance, accelerated media processing, expanded Web Assembly, and Cryptography support, along with enhancements to existing legacy instructions for completeness and efficiency, and it is enumerated as version 2 in CPUID 0x24.0x0.EBX[bits 0-7] [1]. AVX10.2 has no current kernel usage and requires no additional host kernel enabling work (based on AVX10.1 support) and provides no new VMX controls [2]. Moreover, since AVX10.2 is the superset of AVX10.1, there's no need to worry about AVX10.1 and AVX10.2 compatibility issues in KVM. Therefore, it's safe to advertise AVX10.2 version to userspace directly if host supports AVX10.2. [1]: Intel Advanced Vector Extensions 10.2 Architecture Specification (rev 5.0). [2]: Note: Since AVX10.2 spec (rev 4.0), it has been declared "AVX10/512 will be used in all Intel products, supporting vector lengths of 128, 256, and 512 in all product lines", and the VMX support (in earlier revisions) for AVX10/256 guest on AVX10/512 host has been dropped. Tested-by: Xudong Hao Signed-off-by: Zhao Liu Reviewed-by: Xiaoyao Li Reviewed-by: Paolo Bonzini Link: https://patch.msgid.link/20251120050720.931449-4-zhao1.liu@intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 750c353e0dbf..ecb4ca9af788 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1666,7 +1666,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) * is guaranteed to be >=1 if AVX10 is supported. Note #2, the * version needs to be captured before overriding EBX features! */ - avx10_version = min_t(u8, entry->ebx & 0xff, 1); + avx10_version = min_t(u8, entry->ebx & 0xff, 2); cpuid_entry_override(entry, CPUID_24_0_EBX); entry->ebx |= avx10_version; From 062768f426895b958417c85582826e20c44f477c Mon Sep 17 00:00:00 2001 From: Zhao Liu Date: Thu, 20 Nov 2025 13:07:20 +0800 Subject: [PATCH 165/267] KVM: x86: Advertise AVX10_VNNI_INT CPUID to userspace Define and advertise AVX10_VNNI_INT CPUID to userspace when it's supported by the host. 
AVX10_VNNI_INT (0x24.0x1.ECX[bit 2]) is a discrete feature bit introduced on Intel Diamond Rapids, which enumerates the support for EVEX VPDP* instructions for INT8/INT16 [*]. Since this feature has no actual kernel usages, define it as a KVM-only feature in reverse_cpuid.h. Advertise new CPUID subleaf 0x24.0x1 with AVX10_VNNI_INT bit to userspace for guest use. It's safe since no additional enabling work is needed in the host kernel. [*]: Intel Advanced Vector Extensions 10.2 Architecture Specification (rev 5.0). Tested-by: Xudong Hao Signed-off-by: Zhao Liu Reviewed-by: Xiaoyao Li Reviewed-by: Paolo Bonzini Link: https://patch.msgid.link/20251120050720.931449-5-zhao1.liu@intel.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c | 18 +++++++++++++++++- arch/x86/kvm/reverse_cpuid.h | 4 ++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c1859c65453f..80fd7ca69c87 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -777,6 +777,7 @@ enum kvm_only_cpuid_leafs { CPUID_8000_0021_ECX, CPUID_7_1_ECX, CPUID_1E_1_EAX, + CPUID_24_1_ECX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ecb4ca9af788..0172b7ef2ba4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1084,6 +1084,10 @@ void kvm_set_cpu_caps(void) F(AVX10_512), ); + kvm_cpu_cap_init(CPUID_24_1_ECX, + F(AVX10_VNNI_INT), + ); + kvm_cpu_cap_init(CPUID_8000_0001_ECX, F(LAHF_LM), F(CMP_LEGACY), @@ -1661,6 +1665,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; } + max_idx = entry->eax = min(entry->eax, 1u); /* * The AVX10 version is encoded in EBX[7:0]. Note, the version * is guaranteed to be >=1 if AVX10 is supported. Note #2, the @@ -1670,9 +1675,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) cpuid_entry_override(entry, CPUID_24_0_EBX); entry->ebx |= avx10_version; - entry->eax = 0; entry->ecx = 0; entry->edx = 0; + + /* KVM only supports up to 0x24.0x1, capped above via min(). */ + if (max_idx >= 1) { + entry = do_host_cpuid(array, function, 1); + if (!entry) + goto out; + + cpuid_entry_override(entry, CPUID_24_1_ECX); + entry->eax = 0; + entry->ebx = 0; + entry->edx = 0; + } break; } case KVM_CPUID_SIGNATURE: { diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 7dc4200a26f1..657f5f743ed9 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -63,6 +63,9 @@ #define X86_FEATURE_AVX10_256 KVM_X86_FEATURE(CPUID_24_0_EBX, 17) #define X86_FEATURE_AVX10_512 KVM_X86_FEATURE(CPUID_24_0_EBX, 18) +/* Intel-defined sub-features, CPUID level 0x00000024:1 (ECX) */ +#define X86_FEATURE_AVX10_VNNI_INT KVM_X86_FEATURE(CPUID_24_1_ECX, 2) + /* CPUID level 0x80000007 (EDX). 
*/ #define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8) @@ -105,6 +108,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_8000_0021_ECX] = {0x80000021, 0, CPUID_ECX}, [CPUID_7_1_ECX] = { 7, 1, CPUID_ECX}, [CPUID_1E_1_EAX] = { 0x1e, 1, CPUID_EAX}, + [CPUID_24_1_ECX] = { 0x24, 1, CPUID_ECX}, }; /* From 95d848dc7e639988dbb385a8cba9b484607cf98c Mon Sep 17 00:00:00 2001 From: Vasiliy Kovalev Date: Sat, 24 Jan 2026 01:28:01 +0300 Subject: [PATCH 166/267] KVM: x86: Add SRCU protection for reading PDPTRs in __get_sregs2() Add SRCU read-side protection when reading PDPTR registers in __get_sregs2(). Reading PDPTRs may trigger access to guest memory: kvm_pdptr_read() -> svm_cache_reg() -> load_pdptrs() -> kvm_vcpu_read_guest_page() -> kvm_vcpu_gfn_to_memslot() kvm_vcpu_gfn_to_memslot() dereferences memslots via __kvm_memslots(), which uses srcu_dereference_check() and requires either kvm->srcu or kvm->slots_lock to be held. Currently only vcpu->mutex is held, triggering lockdep warning: ============================= WARNING: suspicious RCU usage in kvm_vcpu_gfn_to_memslot 6.12.59+ #3 Not tainted include/linux/kvm_host.h:1062 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by syz.5.1717/15100: #0: ff1100002f4b00b0 (&vcpu->mutex){+.+.}-{3:3}, at: kvm_vcpu_ioctl+0x1d5/0x1590 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0xf0/0x120 lib/dump_stack.c:120 lockdep_rcu_suspicious+0x1e3/0x270 kernel/locking/lockdep.c:6824 __kvm_memslots include/linux/kvm_host.h:1062 [inline] __kvm_memslots include/linux/kvm_host.h:1059 [inline] kvm_vcpu_memslots include/linux/kvm_host.h:1076 [inline] kvm_vcpu_gfn_to_memslot+0x518/0x5e0 virt/kvm/kvm_main.c:2617 kvm_vcpu_read_guest_page+0x27/0x50 virt/kvm/kvm_main.c:3302 load_pdptrs+0xff/0x4b0 arch/x86/kvm/x86.c:1065 svm_cache_reg+0x1c9/0x230 arch/x86/kvm/svm/svm.c:1688 kvm_pdptr_read arch/x86/kvm/kvm_cache_regs.h:141 [inline] __get_sregs2 arch/x86/kvm/x86.c:11784 [inline] kvm_arch_vcpu_ioctl+0x3e20/0x4aa0 arch/x86/kvm/x86.c:6279 kvm_vcpu_ioctl+0x856/0x1590 virt/kvm/kvm_main.c:4663 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:907 [inline] __se_sys_ioctl fs/ioctl.c:893 [inline] __x64_sys_ioctl+0x18b/0x210 fs/ioctl.c:893 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xbd/0x1d0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Found by Linux Verification Center (linuxtesting.org) with Syzkaller. 
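Stated as a general rule (a simplified sketch using the names from __get_sregs2()): any vCPU ioctl path that can end up dereferencing the memslots must do so inside the vCPU's SRCU read-side critical section, or while holding kvm->slots_lock:

	kvm_vcpu_srcu_read_lock(vcpu);
	/*
	 * Reading the PDPTRs may reach kvm_vcpu_read_guest_page() and thus
	 * kvm_vcpu_gfn_to_memslot(), so the memslots must be protected here.
	 */
	for (i = 0; i < 4; i++)
		sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i);
	kvm_vcpu_srcu_read_unlock(vcpu);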
Suggested-by: Sean Christopherson Cc: stable@vger.kernel.org Fixes: 6dba94035203 ("KVM: x86: Introduce KVM_GET_SREGS2 / KVM_SET_SREGS2") Signed-off-by: Vasiliy Kovalev Link: https://patch.msgid.link/20260123222801.646123-1-kovalev@altlinux.org Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 386cdb775fd4..1ea94f4a3dcb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12145,9 +12145,11 @@ static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) return; if (is_pae_paging(vcpu)) { + kvm_vcpu_srcu_read_lock(vcpu); for (i = 0 ; i < 4 ; i++) sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i); sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID; + kvm_vcpu_srcu_read_unlock(vcpu); } } From d4236f1ef270347b26ed34bcd0eb28d285b86bcb Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 23 Jan 2026 19:16:33 +0000 Subject: [PATCH 167/267] arm64: Add MT_S2{,_FWB}_AS_S1 encodings pKVM usage of S2 translation on the host is purely for isolation purposes, not translation. To that effect, the memory attributes being used must be that of S1. With FWB=0, this is easily achieved by using the Normal Cacheable type (which is the weakest possible memory type) at S2, and let S1 pick something stronger as required. With FWB=1, the attributes are combined in a different way, and we cannot arbitrarily use Normal Cacheable. We can, however, use a memattr encoding that indicates that the final attributes are that of Stage-1. Add these encoding and a few pointers to the relevant parts of the specification. It might come handy some day. Reviewed-by: Joey Gouly Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260123191637.715429-2-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/memory.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 9d54b2ea49d6..a2b7a33966ff 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -175,19 +175,24 @@ #define MT_DEVICE_nGnRE 4 /* - * Memory types for Stage-2 translation + * Memory types for Stage-2 translation when HCR_EL2.FWB=0. See R_HMNDG, + * R_TNHFM, R_GQFSF and I_MCQKW for the details on how these attributes get + * combined with Stage-1. */ #define MT_S2_NORMAL 0xf #define MT_S2_NORMAL_NC 0x5 #define MT_S2_DEVICE_nGnRE 0x1 +#define MT_S2_AS_S1 MT_S2_NORMAL /* - * Memory types for Stage-2 translation when ID_AA64MMFR2_EL1.FWB is 0001 - * Stage-2 enforces Normal-WB and Device-nGnRE + * Memory types for Stage-2 translation when HCR_EL2.FWB=1. Stage-2 enforces + * Normal-WB and Device-nGnRE, unless we actively say that S1 wins. See + * R_VRJSW and R_RHWZM for details. */ #define MT_S2_FWB_NORMAL 6 #define MT_S2_FWB_NORMAL_NC 5 #define MT_S2_FWB_DEVICE_nGnRE 1 +#define MT_S2_FWB_AS_S1 7 #ifdef CONFIG_ARM64_4K_PAGES #define IOREMAP_MAX_ORDER (PUD_SHIFT) From 17d7b15131f3ed7fff29f2a43b6ead547e495653 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 23 Jan 2026 19:16:34 +0000 Subject: [PATCH 168/267] KVM: arm64: Add KVM_PGTABLE_S2_AS_S1 flag Plumb the MT_S2{,_FWB}_AS_S1 memory types into the KVM_S2_MEMATTR() macro with a new KVM_PGTABLE_S2_AS_S1 flag. Nobody selects it yet. 
Reviewed-by: Joey Gouly Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260123191637.715429-3-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_pgtable.h | 2 ++ arch/arm64/kvm/hyp/pgtable.c | 14 +++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index fc02de43c68d..9ce51a637da0 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -232,10 +232,12 @@ struct kvm_pgtable_mm_ops { * @KVM_PGTABLE_S2_NOFWB: Don't enforce Normal-WB even if the CPUs have * ARM64_HAS_STAGE2_FWB. * @KVM_PGTABLE_S2_IDMAP: Only use identity mappings. + * @KVM_PGTABLE_S2_AS_S1: Final memory attributes are that of Stage-1. */ enum kvm_pgtable_stage2_flags { KVM_PGTABLE_S2_NOFWB = BIT(0), KVM_PGTABLE_S2_IDMAP = BIT(1), + KVM_PGTABLE_S2_AS_S1 = BIT(2), }; /** diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 947ac1a951a5..c52a24c15ff2 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -659,7 +659,19 @@ void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, } } -#define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt)) +#define KVM_S2_MEMATTR(pgt, attr) \ + ({ \ + kvm_pte_t __attr; \ + \ + if ((pgt)->flags & KVM_PGTABLE_S2_AS_S1) \ + __attr = PAGE_S2_MEMATTR(AS_S1, \ + stage2_has_fwb(pgt)); \ + else \ + __attr = PAGE_S2_MEMATTR(attr, \ + stage2_has_fwb(pgt)); \ + \ + __attr; \ + }) static int stage2_set_xn_attr(enum kvm_pgtable_prot prot, kvm_pte_t *attr) { From a373930ec9406aeb9f82b5b78db56ca8d0919d99 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 23 Jan 2026 19:16:35 +0000 Subject: [PATCH 169/267] KVM: arm64: Switch pKVM host S2 over to KVM_PGTABLE_S2_AS_S1 Since we have the basics to use the S1 memory attributes as the final ones with FWB, flip the host over to that when FWB is present. Reviewed-by: Joey Gouly Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Link: https://patch.msgid.link/20260123191637.715429-4-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 49db32f3ddf7..38f66a56a766 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -19,7 +19,7 @@ #include #include -#define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_NOFWB | KVM_PGTABLE_S2_IDMAP) +#define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_AS_S1 | KVM_PGTABLE_S2_IDMAP) struct host_mmu host_mmu; @@ -324,6 +324,8 @@ int __pkvm_prot_finalize(void) params->vttbr = kvm_get_vttbr(mmu); params->vtcr = mmu->vtcr; params->hcr_el2 |= HCR_VM; + if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) + params->hcr_el2 |= HCR_FWB; /* * The CMO below not only cleans the updated params to the From 4f27fe82aa304c50b24f92da72b4895cf73b54ba Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 23 Jan 2026 19:16:36 +0000 Subject: [PATCH 170/267] KVM: arm64: Kill KVM_PGTABLE_S2_NOFWB Nobody is using this flag anymore, so remove it. This allows some cleanup by removing stage2_has_fwb(), which is can be replaced by a direct check on the capability. 
Reviewed-by: Joey Gouly Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260123191637.715429-5-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_pgtable.h | 7 ++----- arch/arm64/kvm/hyp/pgtable.c | 21 ++++++--------------- 2 files changed, 8 insertions(+), 20 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 9ce51a637da0..2198b6242883 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -229,15 +229,12 @@ struct kvm_pgtable_mm_ops { /** * enum kvm_pgtable_stage2_flags - Stage-2 page-table flags. - * @KVM_PGTABLE_S2_NOFWB: Don't enforce Normal-WB even if the CPUs have - * ARM64_HAS_STAGE2_FWB. * @KVM_PGTABLE_S2_IDMAP: Only use identity mappings. * @KVM_PGTABLE_S2_AS_S1: Final memory attributes are that of Stage-1. */ enum kvm_pgtable_stage2_flags { - KVM_PGTABLE_S2_NOFWB = BIT(0), - KVM_PGTABLE_S2_IDMAP = BIT(1), - KVM_PGTABLE_S2_AS_S1 = BIT(2), + KVM_PGTABLE_S2_IDMAP = BIT(0), + KVM_PGTABLE_S2_AS_S1 = BIT(1), }; /** diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index c52a24c15ff2..00e33a16494b 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -631,14 +631,6 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) return vtcr; } -static bool stage2_has_fwb(struct kvm_pgtable *pgt) -{ - if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) - return false; - - return !(pgt->flags & KVM_PGTABLE_S2_NOFWB); -} - void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, size_t size) { @@ -661,14 +653,13 @@ void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, #define KVM_S2_MEMATTR(pgt, attr) \ ({ \ + bool __fwb = cpus_have_final_cap(ARM64_HAS_STAGE2_FWB); \ kvm_pte_t __attr; \ \ if ((pgt)->flags & KVM_PGTABLE_S2_AS_S1) \ - __attr = PAGE_S2_MEMATTR(AS_S1, \ - stage2_has_fwb(pgt)); \ + __attr = PAGE_S2_MEMATTR(AS_S1, __fwb); \ else \ - __attr = PAGE_S2_MEMATTR(attr, \ - stage2_has_fwb(pgt)); \ + __attr = PAGE_S2_MEMATTR(attr, __fwb); \ \ __attr; \ }) @@ -880,7 +871,7 @@ static bool stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt) * system supporting FWB as the optimization is entirely * pointless when the unmap walker needs to perform CMOs. */ - return system_supports_tlb_range() && stage2_has_fwb(pgt); + return system_supports_tlb_range() && cpus_have_final_cap(ARM64_HAS_STAGE2_FWB); } static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx, @@ -1160,7 +1151,7 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, if (mm_ops->page_count(childp) != 1) return 0; } else if (stage2_pte_cacheable(pgt, ctx->old)) { - need_flush = !stage2_has_fwb(pgt); + need_flush = !cpus_have_final_cap(ARM64_HAS_STAGE2_FWB); } /* @@ -1390,7 +1381,7 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) .arg = pgt, }; - if (stage2_has_fwb(pgt)) + if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) return 0; return kvm_pgtable_walk(pgt, addr, size, &walker); From 65d00e37b17f46a21e7a25e0fb211a4c33274a83 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 23 Jan 2026 19:16:37 +0000 Subject: [PATCH 171/267] KVM: arm64: Simplify PAGE_S2_MEMATTR Restore PAGE_S2_MEMATTR() to its former glory, keeping the use of FWB as an implementation detail. 
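The effect on callers is simply that the has_fwb argument disappears, e.g. (sketch mirroring the hunk below):

	/* before */
	__attr = PAGE_S2_MEMATTR(AS_S1, __fwb);

	/* after */
	__attr = PAGE_S2_MEMATTR(AS_S1);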
Reviewed-by: Joey Gouly Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260123191637.715429-6-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/pgtable-prot.h | 4 ++-- arch/arm64/kvm/hyp/pgtable.c | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 161e8660eddd..d27e8872fe3c 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -109,10 +109,10 @@ static inline bool __pure lpa2_is_enabled(void) #define PAGE_KERNEL_EXEC __pgprot(_PAGE_KERNEL_EXEC) #define PAGE_KERNEL_EXEC_CONT __pgprot(_PAGE_KERNEL_EXEC_CONT) -#define PAGE_S2_MEMATTR(attr, has_fwb) \ +#define PAGE_S2_MEMATTR(attr) \ ({ \ u64 __val; \ - if (has_fwb) \ + if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) \ __val = PTE_S2_MEMATTR(MT_S2_FWB_ ## attr); \ else \ __val = PTE_S2_MEMATTR(MT_S2_ ## attr); \ diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 00e33a16494b..a6f7533ce497 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -653,13 +653,12 @@ void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, #define KVM_S2_MEMATTR(pgt, attr) \ ({ \ - bool __fwb = cpus_have_final_cap(ARM64_HAS_STAGE2_FWB); \ kvm_pte_t __attr; \ \ if ((pgt)->flags & KVM_PGTABLE_S2_AS_S1) \ - __attr = PAGE_S2_MEMATTR(AS_S1, __fwb); \ + __attr = PAGE_S2_MEMATTR(AS_S1); \ else \ - __attr = PAGE_S2_MEMATTR(attr, __fwb); \ + __attr = PAGE_S2_MEMATTR(attr); \ \ __attr; \ }) From 5fdf86e7353ce2d91a24dcbe7320935b40d55367 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 15 Jan 2026 09:34:26 -0800 Subject: [PATCH 172/267] KVM: nVMX: Disallow access to vmcs12 fields that aren't supported by "hardware" Disallow access (VMREAD/VMWRITE), both emulated and via a shadow VMCS, to VMCS fields that the loaded incarnation of KVM doesn't support, e.g. due to lack of hardware support, as a middle ground between allowing access to any vmcs12 field defined by KVM (current behavior) and gating access based on the userspace-defined vCPU model (the most functionally correct, but very costly, implementation). Disallowing access to unsupported fields helps a tiny bit in terms of closing the virtualization hole (see below), but the main motivation is to avoid having to weed out unsupported fields when synchronizing between vmcs12 and a shadow VMCS. Because shadow VMCS accesses are done via VMREAD and VMWRITE, KVM _must_ filter out unsupported fields (or eat VMREAD/VMWRITE failures), and filtering out just shadow VMCS fields is about the same amount of effort, and arguably much more confusing. As a bonus, this also fixes a KVM-Unit-Test failure bug when running on _hardware_ without support for TSC Scaling, which fails with the same signature as the bug fixed by commit ba1f82456ba8 ("KVM: nVMX: Dynamically compute max VMCS index for vmcs12"): FAIL: VMX_VMCS_ENUM.MAX_INDEX expected: 19, actual: 17 Dynamically computing the max VMCS index only resolved the issue where KVM was hardcoding max index, but for CPUs with TSC Scaling, that was "good enough". 
Reviewed-by: Chao Gao Reviewed-by: Xin Li Cc: Xiaoyao Li Cc: Yosry Ahmed Link: https://lore.kernel.org/all/20251026201911.505204-22-xin@zytor.com Link: https://lore.kernel.org/all/YR2Tf9WPNEzrE7Xg@google.com Reviewed-by: Xiaoyao Li Link: https://patch.msgid.link/20260115173427.716021-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 18 ++++++---- arch/x86/kvm/vmx/vmcs.h | 8 +++++ arch/x86/kvm/vmx/vmcs12.c | 70 +++++++++++++++++++++++++++++++++++++-- arch/x86/kvm/vmx/vmcs12.h | 6 ++-- 4 files changed, 92 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 61113ead3d7b..f7950d8465a9 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -86,6 +86,9 @@ static void init_vmcs_shadow_fields(void) pr_err("Missing field from shadow_read_only_field %x\n", field + 1); + if (get_vmcs12_field_offset(field) < 0) + continue; + clear_bit(field, vmx_vmread_bitmap); if (field & 1) #ifdef CONFIG_X86_64 @@ -111,6 +114,9 @@ static void init_vmcs_shadow_fields(void) field <= GUEST_TR_AR_BYTES, "Update vmcs12_write_any() to drop reserved bits from AR_BYTES"); + if (get_vmcs12_field_offset(field) < 0) + continue; + /* * PML and the preemption timer can be emulated, but the * processor cannot vmwrite to fields that don't exist @@ -7074,12 +7080,6 @@ void nested_vmx_set_vmcs_shadowing_bitmap(void) } } -/* - * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo - * that madness to get the encoding for comparison. - */ -#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10))) - static u64 nested_vmx_calc_vmcs_enum_msr(void) { /* @@ -7407,6 +7407,12 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) { int i; + /* + * Note! The set of supported vmcs12 fields is consumed by both VMX + * MSR and shadow VMCS setup. + */ + nested_vmx_setup_vmcs12_fields(); + nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); if (!cpu_has_vmx_shadow_vmcs()) diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 9aa204c87661..66d747e265b1 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -11,7 +11,15 @@ #include "capabilities.h" +/* + * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6 as a very + * rudimentary compression of the range of indices. The compression ratio is + * good enough to allow KVM to use a (very sparsely populated) array without + * wasting too much memory, while the "algorithm" is fast enough to be used to + * lookup vmcs12 fields on-demand, e.g. for emulation. 
+ */ #define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) +#define VMCS12_IDX_TO_ENC(idx) ROL16(idx, 10) #define ENC_TO_VMCS12_IDX(enc) ROL16(enc, 6) struct vmcs_hdr { diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index c2ac9e1a50b3..1ebe67c384ad 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -9,7 +9,7 @@ FIELD(number, name), \ [ENC_TO_VMCS12_IDX(number##_HIGH)] = VMCS12_OFFSET(name) + sizeof(u32) -const unsigned short vmcs12_field_offsets[] = { +static const u16 kvm_supported_vmcs12_field_offsets[] __initconst = { FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), FIELD(POSTED_INTR_NV, posted_intr_nv), FIELD(GUEST_ES_SELECTOR, guest_es_selector), @@ -158,4 +158,70 @@ const unsigned short vmcs12_field_offsets[] = { FIELD(HOST_SSP, host_ssp), FIELD(HOST_INTR_SSP_TABLE, host_ssp_tbl), }; -const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs12_field_offsets); + +u16 vmcs12_field_offsets[ARRAY_SIZE(kvm_supported_vmcs12_field_offsets)] __ro_after_init; +unsigned int nr_vmcs12_fields __ro_after_init; + +#define VMCS12_CASE64(enc) case enc##_HIGH: case enc + +static __init bool cpu_has_vmcs12_field(unsigned int idx) +{ + switch (VMCS12_IDX_TO_ENC(idx)) { + case VIRTUAL_PROCESSOR_ID: + return cpu_has_vmx_vpid(); + case POSTED_INTR_NV: + return cpu_has_vmx_posted_intr(); + VMCS12_CASE64(TSC_MULTIPLIER): + return cpu_has_vmx_tsc_scaling(); + case TPR_THRESHOLD: + VMCS12_CASE64(VIRTUAL_APIC_PAGE_ADDR): + return cpu_has_vmx_tpr_shadow(); + VMCS12_CASE64(APIC_ACCESS_ADDR): + return cpu_has_vmx_virtualize_apic_accesses(); + VMCS12_CASE64(POSTED_INTR_DESC_ADDR): + return cpu_has_vmx_posted_intr(); + case GUEST_INTR_STATUS: + return cpu_has_vmx_virtual_intr_delivery(); + VMCS12_CASE64(VM_FUNCTION_CONTROL): + VMCS12_CASE64(EPTP_LIST_ADDRESS): + return cpu_has_vmx_vmfunc(); + VMCS12_CASE64(EPT_POINTER): + return cpu_has_vmx_ept(); + VMCS12_CASE64(XSS_EXIT_BITMAP): + return cpu_has_vmx_xsaves(); + VMCS12_CASE64(ENCLS_EXITING_BITMAP): + return cpu_has_vmx_encls_vmexit(); + VMCS12_CASE64(GUEST_IA32_PERF_GLOBAL_CTRL): + VMCS12_CASE64(HOST_IA32_PERF_GLOBAL_CTRL): + return cpu_has_load_perf_global_ctrl(); + case SECONDARY_VM_EXEC_CONTROL: + return cpu_has_secondary_exec_ctrls(); + case GUEST_S_CET: + case GUEST_SSP: + case GUEST_INTR_SSP_TABLE: + case HOST_S_CET: + case HOST_SSP: + case HOST_INTR_SSP_TABLE: + return cpu_has_load_cet_ctrl(); + + /* KVM always emulates PML and the VMX preemption timer in software. 
*/ + case GUEST_PML_INDEX: + case VMX_PREEMPTION_TIMER_VALUE: + default: + return true; + } +} + +void __init nested_vmx_setup_vmcs12_fields(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(kvm_supported_vmcs12_field_offsets); i++) { + if (!kvm_supported_vmcs12_field_offsets[i] || + !cpu_has_vmcs12_field(i)) + continue; + + vmcs12_field_offsets[i] = kvm_supported_vmcs12_field_offsets[i]; + nr_vmcs12_fields = i + 1; + } +} diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 7a5fdd9b27ba..21cd1b75e4fd 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -374,8 +374,10 @@ static inline void vmx_check_vmcs12_offsets(void) CHECK_OFFSET(guest_pml_index, 996); } -extern const unsigned short vmcs12_field_offsets[]; -extern const unsigned int nr_vmcs12_fields; +extern u16 vmcs12_field_offsets[] __ro_after_init; +extern unsigned int nr_vmcs12_fields __ro_after_init; + +void __init nested_vmx_setup_vmcs12_fields(void); static inline short get_vmcs12_field_offset(unsigned long field) { From 1dc643205953c2ff2e1d95dbdc2e784675abb38c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 15 Jan 2026 09:34:27 -0800 Subject: [PATCH 173/267] KVM: nVMX: Remove explicit filtering of GUEST_INTR_STATUS from shadow VMCS fields Drop KVM's filtering of GUEST_INTR_STATUS when generating the shadow VMCS bitmap now that KVM drops GUEST_INTR_STATUS from the set of supported vmcs12 fields if the field isn't supported by hardware, and initialization of the shadow VMCS fields omits unsupported vmcs12 fields. Note, there is technically a small functional change here, as the vmcs12 filtering only requires support for Virtual Interrupt Delivery, whereas the shadow VMCS code being removed required "full" APICv support, i.e. required Virtual Interrupt Delivery *and* APIC Register Virtualizaton *and* Posted Interrupt support. Opportunistically tweak the comment to more precisely explain why the PML and VMX preemption timer fields need to be explicitly checked. Reviewed-by: Xiaoyao Li Link: https://patch.msgid.link/20260115173427.716021-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index f7950d8465a9..a5529a4bc87d 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -118,9 +118,10 @@ static void init_vmcs_shadow_fields(void) continue; /* - * PML and the preemption timer can be emulated, but the - * processor cannot vmwrite to fields that don't exist - * on bare metal. + * KVM emulates PML and the VMX preemption timer irrespective + * of hardware support, but shadowing their related VMCS fields + * requires hardware support as the CPU will reject VMWRITEs to + * fields that don't exist. */ switch (field) { case GUEST_PML_INDEX: @@ -131,10 +132,6 @@ static void init_vmcs_shadow_fields(void) if (!cpu_has_vmx_preemption_timer()) continue; break; - case GUEST_INTR_STATUS: - if (!cpu_has_vmx_apicv()) - continue; - break; default: break; } From da63758c1876d899031066a9d4b8050af767ceb8 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Mon, 8 Dec 2025 15:28:22 +0000 Subject: [PATCH 174/267] KVM: arm64: gic: Enable GICv3 CPUIF trapping on GICv5 hosts if required Factor out the enable (and printing of) the GICv3 CPUIF traps from the main GICv3 probe into a separate function. 
Call said function from the GICv5 probe for legacy support, ensuring that any required GICv3 CPUIF traps on GICv5 hosts will be correctly handled, rather than injecting an undef into the guest. Signed-off-by: Sascha Bischoff Link: https://patch.msgid.link/20251208152724.3637157-3-sascha.bischoff@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-v3.c | 25 +++++++++++++++---------- arch/arm64/kvm/vgic/vgic-v5.c | 2 ++ arch/arm64/kvm/vgic/vgic.h | 1 + 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 1d6dd1b545bd..4b03a726f164 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -880,6 +880,20 @@ void noinstr kvm_compute_ich_hcr_trap_bits(struct alt_instr *alt, *updptr = cpu_to_le32(insn); } +void vgic_v3_enable_cpuif_traps(void) +{ + u64 traps = vgic_ich_hcr_trap_bits(); + + if (traps) { + kvm_info("GICv3 sysreg trapping enabled ([%s%s%s%s], reduced performance)\n", + (traps & ICH_HCR_EL2_TALL0) ? "G0" : "", + (traps & ICH_HCR_EL2_TALL1) ? "G1" : "", + (traps & ICH_HCR_EL2_TC) ? "C" : "", + (traps & ICH_HCR_EL2_TDIR) ? "D" : ""); + static_branch_enable(&vgic_v3_cpuif_trap); + } +} + /** * vgic_v3_probe - probe for a VGICv3 compatible interrupt controller * @info: pointer to the GIC description @@ -891,7 +905,6 @@ int vgic_v3_probe(const struct gic_kvm_info *info) { u64 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config); bool has_v2; - u64 traps; int ret; has_v2 = ich_vtr_el2 >> 63; @@ -955,15 +968,7 @@ int vgic_v3_probe(const struct gic_kvm_info *info) kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_EL2_SEIS; } - traps = vgic_ich_hcr_trap_bits(); - if (traps) { - kvm_info("GICv3 sysreg trapping enabled ([%s%s%s%s], reduced performance)\n", - (traps & ICH_HCR_EL2_TALL0) ? "G0" : "", - (traps & ICH_HCR_EL2_TALL1) ? "G1" : "", - (traps & ICH_HCR_EL2_TC) ? "C" : "", - (traps & ICH_HCR_EL2_TDIR) ? 
"D" : ""); - static_branch_enable(&vgic_v3_cpuif_trap); - } + vgic_v3_enable_cpuif_traps(); kvm_vgic_global_state.vctrl_base = NULL; kvm_vgic_global_state.type = VGIC_V3; diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c index 2d3811f4e117..331651087e2c 100644 --- a/arch/arm64/kvm/vgic/vgic-v5.c +++ b/arch/arm64/kvm/vgic/vgic-v5.c @@ -48,5 +48,7 @@ int vgic_v5_probe(const struct gic_kvm_info *info) static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif); kvm_info("GCIE legacy system register CPU interface\n"); + vgic_v3_enable_cpuif_traps(); + return 0; } diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 5f0fc96b4dc2..c9b3bb07e483 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -324,6 +324,7 @@ void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als); void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_reset(struct kvm_vcpu *vcpu); +void vgic_v3_enable_cpuif_traps(void); int vgic_v3_probe(const struct gic_kvm_info *info); int vgic_v3_map_resources(struct kvm *kvm); int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq); From 28e505d81766dcbe25c60d57ab9fc941cd3d38bf Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Mon, 8 Dec 2025 15:28:23 +0000 Subject: [PATCH 175/267] KVM: arm64: Correct test for ICH_HCR_EL2_TDIR cap for GICv5 hosts The original order of checks in the ICH_HCR_EL2_TDIR test returned with false early in the case where the native GICv3 CPUIF was not present. The result was that on GICv5 hosts with legacy support - which do not have the GICv3 CPUIF - the test always returned false. Reshuffle the checks such that support for GICv5 legacy is checked prior to checking for the native GICv3 CPUIF. Signed-off-by: Sascha Bischoff Fixes: 2a28810cbb8b2 ("KVM: arm64: GICv3: Detect and work around the lack of ICV_DIR_EL1 trapping") Link: https://patch.msgid.link/20251208152724.3637157-4-sascha.bischoff@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kernel/cpufeature.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index c840a93b9ef9..65aaea68adaa 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2326,16 +2326,16 @@ static bool can_trap_icv_dir_el1(const struct arm64_cpu_capabilities *entry, BUILD_BUG_ON(ARM64_HAS_ICH_HCR_EL2_TDIR <= ARM64_HAS_GICV3_CPUIF); BUILD_BUG_ON(ARM64_HAS_ICH_HCR_EL2_TDIR <= ARM64_HAS_GICV5_LEGACY); - if (!this_cpu_has_cap(ARM64_HAS_GICV3_CPUIF) && - !is_midr_in_range_list(has_vgic_v3)) - return false; - if (!is_hyp_mode_available()) return false; if (this_cpu_has_cap(ARM64_HAS_GICV5_LEGACY)) return true; + if (!this_cpu_has_cap(ARM64_HAS_GICV3_CPUIF) && + !is_midr_in_range_list(has_vgic_v3)) + return false; + if (is_kernel_in_hyp_mode()) res.a1 = read_sysreg_s(SYS_ICH_VTR_EL2); else From 9d46e83a72392a644604458448a72d7c45977f0f Mon Sep 17 00:00:00 2001 From: "Zenghui Yu (Huawei)" Date: Fri, 30 Jan 2026 17:44:35 +0800 Subject: [PATCH 176/267] KVM: arm64: nv: Add trap config for DBGWCR<15>_EL1 Seems that it was missed when MDCR_EL2 was first added to the trap forwarding infrastructure. Add it back. 
Fixes: cb31632c4452 ("KVM: arm64: nv: Add trap forwarding for MDCR_EL2") Signed-off-by: Zenghui Yu (Huawei) Link: https://patch.msgid.link/20260130094435.39942-1-zenghui.yu@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/kvm/emulate-nested.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 834f13fb1fb7..34a5460adaf0 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -1166,6 +1166,7 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_DBGWCRn_EL1(12), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(13), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(14), CGT_MDCR_TDE_TDA), + SR_TRAP(SYS_DBGWCRn_EL1(15), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGCLAIMSET_EL1, CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGCLAIMCLR_EL1, CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGAUTHSTATUS_EL1, CGT_MDCR_TDE_TDA), From 82a32eacbacc6f7e372f98999e5ee1ee0dd7462d Mon Sep 17 00:00:00 2001 From: "Zenghui Yu (Huawei)" Date: Wed, 28 Jan 2026 15:52:08 +0800 Subject: [PATCH 177/267] KVM: arm64: Fix various comments Use tab instead of whitespaces, as well as 2 minor typo fixes. Signed-off-by: Zenghui Yu (Huawei) Link: https://patch.msgid.link/20260128075208.23024-1-zenghui.yu@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 2 +- arch/arm64/kvm/hyp/vhe/sysreg-sr.c | 2 +- arch/arm64/kvm/vgic/vgic-v3-nested.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 4c8b4274f669..fa01877a7ba1 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -201,7 +201,7 @@ struct kvm_s2_mmu { * host to parse the guest S2. * This either contains: * - the virtual VTTBR programmed by the guest hypervisor with - * CnP cleared + * CnP cleared * - The value 1 (VMID=0, BADDR=0, CnP=1) if invalid * * We also cache the full VTCR which gets used for TLB invalidation, diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index f28c6cf4fe1b..b254d442e54e 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -205,7 +205,7 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) /* * When running a normal EL1 guest, we only load a new vcpu - * after a context switch, which imvolves a DSB, so all + * after a context switch, which involves a DSB, so all * speculative EL1&0 walks will have already completed. * If running NV, the vcpu may transition between vEL1 and * vEL2 without a context switch, so make sure we complete diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c index 61b44f3f2bf1..a2ccef116483 100644 --- a/arch/arm64/kvm/vgic/vgic-v3-nested.c +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -57,7 +57,7 @@ static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx) * as the L1 guest is in charge of provisioning the interrupts via its own * view of the ICH_LR*_EL2 registers, which conveniently live in the VNCR * page. 
This means that the flow described above does work (there is no - * state to rebuild in the L0 hypervisor), and that most things happed on L2 + * state to rebuild in the L0 hypervisor), and that most things happen on L2 * load/put: * * - on L2 load: move the in-memory L1 vGIC configuration into a shadow, From 4a03431b742b4edc24fe1a14d355de1df6d80f86 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 28 Jan 2026 17:59:50 +0000 Subject: [PATCH 178/267] KVM: arm64: gic-v3: Switch vGIC-v3 to use generated ICH_VMCR_EL2 The VGIC-v3 code relied on hand-written definitions for the ICH_VMCR_EL2 register. This register, and the associated fields, is now generated as part of the sysreg framework. Move to using the generated definitions instead of the hand-written ones. There are no functional changes as part of this change. Signed-off-by: Sascha Bischoff Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260128175919.3828384-3-sascha.bischoff@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/sysreg.h | 21 --------- arch/arm64/kvm/hyp/vgic-v3-sr.c | 69 ++++++++++------------------ arch/arm64/kvm/vgic/vgic-v3-nested.c | 8 ++-- arch/arm64/kvm/vgic/vgic-v3.c | 48 +++++++++---------- 4 files changed, 51 insertions(+), 95 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 9df51accbb02..b3b8b8cd7bf1 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -560,7 +560,6 @@ #define SYS_ICC_SRE_EL2 sys_reg(3, 4, 12, 9, 5) #define SYS_ICH_EISR_EL2 sys_reg(3, 4, 12, 11, 3) #define SYS_ICH_ELRSR_EL2 sys_reg(3, 4, 12, 11, 5) -#define SYS_ICH_VMCR_EL2 sys_reg(3, 4, 12, 11, 7) #define __SYS__LR0_EL2(x) sys_reg(3, 4, 12, 12, x) #define SYS_ICH_LR0_EL2 __SYS__LR0_EL2(0) @@ -988,26 +987,6 @@ #define ICH_LR_PRIORITY_SHIFT 48 #define ICH_LR_PRIORITY_MASK (0xffULL << ICH_LR_PRIORITY_SHIFT) -/* ICH_VMCR_EL2 bit definitions */ -#define ICH_VMCR_ACK_CTL_SHIFT 2 -#define ICH_VMCR_ACK_CTL_MASK (1 << ICH_VMCR_ACK_CTL_SHIFT) -#define ICH_VMCR_FIQ_EN_SHIFT 3 -#define ICH_VMCR_FIQ_EN_MASK (1 << ICH_VMCR_FIQ_EN_SHIFT) -#define ICH_VMCR_CBPR_SHIFT 4 -#define ICH_VMCR_CBPR_MASK (1 << ICH_VMCR_CBPR_SHIFT) -#define ICH_VMCR_EOIM_SHIFT 9 -#define ICH_VMCR_EOIM_MASK (1 << ICH_VMCR_EOIM_SHIFT) -#define ICH_VMCR_BPR1_SHIFT 18 -#define ICH_VMCR_BPR1_MASK (7 << ICH_VMCR_BPR1_SHIFT) -#define ICH_VMCR_BPR0_SHIFT 21 -#define ICH_VMCR_BPR0_MASK (7 << ICH_VMCR_BPR0_SHIFT) -#define ICH_VMCR_PMR_SHIFT 24 -#define ICH_VMCR_PMR_MASK (0xffUL << ICH_VMCR_PMR_SHIFT) -#define ICH_VMCR_ENG0_SHIFT 0 -#define ICH_VMCR_ENG0_MASK (1 << ICH_VMCR_ENG0_SHIFT) -#define ICH_VMCR_ENG1_SHIFT 1 -#define ICH_VMCR_ENG1_MASK (1 << ICH_VMCR_ENG1_SHIFT) - /* * Permission Indirection Extension (PIE) permission encodings. * Encodings with the _O suffix, have overlays applied (Permission Overlay Extension). diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 0b670a033fd8..c4d2f1feea8b 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -569,11 +569,11 @@ static int __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu, u32 vmcr, continue; /* Group-0 interrupt, but Group-0 disabled? */ - if (!(val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG0_MASK)) + if (!(val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_EL2_VENG0_MASK)) continue; /* Group-1 interrupt, but Group-1 disabled? 
*/ - if ((val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG1_MASK)) + if ((val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_EL2_VENG1_MASK)) continue; /* Not the highest priority? */ @@ -646,19 +646,19 @@ static int __vgic_v3_get_highest_active_priority(void) static unsigned int __vgic_v3_get_bpr0(u32 vmcr) { - return (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; + return FIELD_GET(ICH_VMCR_EL2_VBPR0, vmcr); } static unsigned int __vgic_v3_get_bpr1(u32 vmcr) { unsigned int bpr; - if (vmcr & ICH_VMCR_CBPR_MASK) { + if (vmcr & ICH_VMCR_EL2_VCBPR_MASK) { bpr = __vgic_v3_get_bpr0(vmcr); if (bpr < 7) bpr++; } else { - bpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; + bpr = FIELD_GET(ICH_VMCR_EL2_VBPR1, vmcr); } return bpr; @@ -758,7 +758,7 @@ static void __vgic_v3_read_iar(struct kvm_vcpu *vcpu, u32 vmcr, int rt) if (grp != !!(lr_val & ICH_LR_GROUP)) goto spurious; - pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; + pmr = FIELD_GET(ICH_VMCR_EL2_VPMR, vmcr); lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; if (pmr <= lr_prio) goto spurious; @@ -806,7 +806,7 @@ static int ___vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) int lr; /* EOImode == 0, nothing to be done here */ - if (!(vmcr & ICH_VMCR_EOIM_MASK)) + if (!(vmcr & ICH_VMCR_EL2_VEOIM_MASK)) return 1; /* No deactivate to be performed on an LPI */ @@ -849,7 +849,7 @@ static void __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) } /* EOImode == 1 and not an LPI, nothing to be done here */ - if ((vmcr & ICH_VMCR_EOIM_MASK) && !(vid >= VGIC_MIN_LPI)) + if ((vmcr & ICH_VMCR_EL2_VEOIM_MASK) && !(vid >= VGIC_MIN_LPI)) return; lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; @@ -865,22 +865,19 @@ static void __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) static void __vgic_v3_read_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { - vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG0_MASK)); + vcpu_set_reg(vcpu, rt, FIELD_GET(ICH_VMCR_EL2_VENG0, vmcr)); } static void __vgic_v3_read_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { - vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG1_MASK)); + vcpu_set_reg(vcpu, rt, FIELD_GET(ICH_VMCR_EL2_VENG1, vmcr)); } static void __vgic_v3_write_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u64 val = vcpu_get_reg(vcpu, rt); - if (val & 1) - vmcr |= ICH_VMCR_ENG0_MASK; - else - vmcr &= ~ICH_VMCR_ENG0_MASK; + FIELD_MODIFY(ICH_VMCR_EL2_VENG0, &vmcr, val & 1); __vgic_v3_write_vmcr(vmcr); } @@ -889,10 +886,7 @@ static void __vgic_v3_write_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u64 val = vcpu_get_reg(vcpu, rt); - if (val & 1) - vmcr |= ICH_VMCR_ENG1_MASK; - else - vmcr &= ~ICH_VMCR_ENG1_MASK; + FIELD_MODIFY(ICH_VMCR_EL2_VENG1, &vmcr, val & 1); __vgic_v3_write_vmcr(vmcr); } @@ -916,10 +910,7 @@ static void __vgic_v3_write_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) if (val < bpr_min) val = bpr_min; - val <<= ICH_VMCR_BPR0_SHIFT; - val &= ICH_VMCR_BPR0_MASK; - vmcr &= ~ICH_VMCR_BPR0_MASK; - vmcr |= val; + FIELD_MODIFY(ICH_VMCR_EL2_VBPR0, &vmcr, val); __vgic_v3_write_vmcr(vmcr); } @@ -929,17 +920,14 @@ static void __vgic_v3_write_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) u64 val = vcpu_get_reg(vcpu, rt); u8 bpr_min = __vgic_v3_bpr_min(); - if (vmcr & ICH_VMCR_CBPR_MASK) + if (FIELD_GET(ICH_VMCR_EL2_VCBPR, val)) return; /* Enforce BPR limiting */ if (val < bpr_min) val = bpr_min; - val <<= ICH_VMCR_BPR1_SHIFT; - val &= ICH_VMCR_BPR1_MASK; - vmcr &= ~ICH_VMCR_BPR1_MASK; - vmcr |= val; + FIELD_MODIFY(ICH_VMCR_EL2_VBPR1, 
&vmcr, val); __vgic_v3_write_vmcr(vmcr); } @@ -1029,19 +1017,14 @@ spurious: static void __vgic_v3_read_pmr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { - vmcr &= ICH_VMCR_PMR_MASK; - vmcr >>= ICH_VMCR_PMR_SHIFT; - vcpu_set_reg(vcpu, rt, vmcr); + vcpu_set_reg(vcpu, rt, FIELD_GET(ICH_VMCR_EL2_VPMR, vmcr)); } static void __vgic_v3_write_pmr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u32 val = vcpu_get_reg(vcpu, rt); - val <<= ICH_VMCR_PMR_SHIFT; - val &= ICH_VMCR_PMR_MASK; - vmcr &= ~ICH_VMCR_PMR_MASK; - vmcr |= val; + FIELD_MODIFY(ICH_VMCR_EL2_VPMR, &vmcr, val); write_gicreg(vmcr, ICH_VMCR_EL2); } @@ -1064,9 +1047,11 @@ static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) /* A3V */ val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT; /* EOImode */ - val |= ((vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT) << ICC_CTLR_EL1_EOImode_SHIFT; + val |= FIELD_PREP(ICC_CTLR_EL1_EOImode_MASK, + FIELD_GET(ICH_VMCR_EL2_VEOIM, vmcr)); /* CBPR */ - val |= (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT; + val |= FIELD_PREP(ICC_CTLR_EL1_CBPR_MASK, + FIELD_GET(ICH_VMCR_EL2_VCBPR, vmcr)); vcpu_set_reg(vcpu, rt, val); } @@ -1075,15 +1060,11 @@ static void __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u32 val = vcpu_get_reg(vcpu, rt); - if (val & ICC_CTLR_EL1_CBPR_MASK) - vmcr |= ICH_VMCR_CBPR_MASK; - else - vmcr &= ~ICH_VMCR_CBPR_MASK; + FIELD_MODIFY(ICH_VMCR_EL2_VCBPR, &vmcr, + FIELD_GET(ICC_CTLR_EL1_CBPR_MASK, val)); - if (val & ICC_CTLR_EL1_EOImode_MASK) - vmcr |= ICH_VMCR_EOIM_MASK; - else - vmcr &= ~ICH_VMCR_EOIM_MASK; + FIELD_MODIFY(ICH_VMCR_EL2_VEOIM, &vmcr, + FIELD_GET(ICC_CTLR_EL1_EOImode_MASK, val)); write_gicreg(vmcr, ICH_VMCR_EL2); } diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c index 61b44f3f2bf1..c9e35ec67117 100644 --- a/arch/arm64/kvm/vgic/vgic-v3-nested.c +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -202,16 +202,16 @@ u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu) if ((hcr & ICH_HCR_EL2_NPIE) && !mi_state.pend) reg |= ICH_MISR_EL2_NP; - if ((hcr & ICH_HCR_EL2_VGrp0EIE) && (vmcr & ICH_VMCR_ENG0_MASK)) + if ((hcr & ICH_HCR_EL2_VGrp0EIE) && (vmcr & ICH_VMCR_EL2_VENG0_MASK)) reg |= ICH_MISR_EL2_VGrp0E; - if ((hcr & ICH_HCR_EL2_VGrp0DIE) && !(vmcr & ICH_VMCR_ENG0_MASK)) + if ((hcr & ICH_HCR_EL2_VGrp0DIE) && !(vmcr & ICH_VMCR_EL2_VENG0_MASK)) reg |= ICH_MISR_EL2_VGrp0D; - if ((hcr & ICH_HCR_EL2_VGrp1EIE) && (vmcr & ICH_VMCR_ENG1_MASK)) + if ((hcr & ICH_HCR_EL2_VGrp1EIE) && (vmcr & ICH_VMCR_EL2_VENG1_MASK)) reg |= ICH_MISR_EL2_VGrp1E; - if ((hcr & ICH_HCR_EL2_VGrp1DIE) && !(vmcr & ICH_VMCR_ENG1_MASK)) + if ((hcr & ICH_HCR_EL2_VGrp1DIE) && !(vmcr & ICH_VMCR_EL2_VENG1_MASK)) reg |= ICH_MISR_EL2_VGrp1D; return reg; diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 1d6dd1b545bd..2afc04167231 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -41,9 +41,9 @@ void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, if (!als->nr_sgi) cpuif->vgic_hcr |= ICH_HCR_EL2_vSGIEOICount; - cpuif->vgic_hcr |= (cpuif->vgic_vmcr & ICH_VMCR_ENG0_MASK) ? + cpuif->vgic_hcr |= (cpuif->vgic_vmcr & ICH_VMCR_EL2_VENG0_MASK) ? ICH_HCR_EL2_VGrp0DIE : ICH_HCR_EL2_VGrp0EIE; - cpuif->vgic_hcr |= (cpuif->vgic_vmcr & ICH_VMCR_ENG1_MASK) ? + cpuif->vgic_hcr |= (cpuif->vgic_vmcr & ICH_VMCR_EL2_VENG1_MASK) ? ICH_HCR_EL2_VGrp1DIE : ICH_HCR_EL2_VGrp1EIE; /* @@ -215,7 +215,7 @@ void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val) * We only deal with DIR when EOIMode==1, and only for SGI, * PPI or SPI. 
*/ - if (!(cpuif->vgic_vmcr & ICH_VMCR_EOIM_MASK) || + if (!(cpuif->vgic_vmcr & ICH_VMCR_EL2_VEOIM_MASK) || val >= vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS) return; @@ -408,25 +408,23 @@ void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) u32 vmcr; if (model == KVM_DEV_TYPE_ARM_VGIC_V2) { - vmcr = (vmcrp->ackctl << ICH_VMCR_ACK_CTL_SHIFT) & - ICH_VMCR_ACK_CTL_MASK; - vmcr |= (vmcrp->fiqen << ICH_VMCR_FIQ_EN_SHIFT) & - ICH_VMCR_FIQ_EN_MASK; + vmcr = FIELD_PREP(ICH_VMCR_EL2_VAckCtl, vmcrp->ackctl); + vmcr |= FIELD_PREP(ICH_VMCR_EL2_VFIQEn, vmcrp->fiqen); } else { /* * When emulating GICv3 on GICv3 with SRE=1 on the * VFIQEn bit is RES1 and the VAckCtl bit is RES0. */ - vmcr = ICH_VMCR_FIQ_EN_MASK; + vmcr = ICH_VMCR_EL2_VFIQEn_MASK; } - vmcr |= (vmcrp->cbpr << ICH_VMCR_CBPR_SHIFT) & ICH_VMCR_CBPR_MASK; - vmcr |= (vmcrp->eoim << ICH_VMCR_EOIM_SHIFT) & ICH_VMCR_EOIM_MASK; - vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK; - vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK; - vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK; - vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK; - vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK; + vmcr |= FIELD_PREP(ICH_VMCR_EL2_VCBPR, vmcrp->cbpr); + vmcr |= FIELD_PREP(ICH_VMCR_EL2_VEOIM, vmcrp->eoim); + vmcr |= FIELD_PREP(ICH_VMCR_EL2_VBPR1, vmcrp->abpr); + vmcr |= FIELD_PREP(ICH_VMCR_EL2_VBPR0, vmcrp->bpr); + vmcr |= FIELD_PREP(ICH_VMCR_EL2_VPMR, vmcrp->pmr); + vmcr |= FIELD_PREP(ICH_VMCR_EL2_VENG0, vmcrp->grpen0); + vmcr |= FIELD_PREP(ICH_VMCR_EL2_VENG1, vmcrp->grpen1); cpu_if->vgic_vmcr = vmcr; } @@ -440,10 +438,8 @@ void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) vmcr = cpu_if->vgic_vmcr; if (model == KVM_DEV_TYPE_ARM_VGIC_V2) { - vmcrp->ackctl = (vmcr & ICH_VMCR_ACK_CTL_MASK) >> - ICH_VMCR_ACK_CTL_SHIFT; - vmcrp->fiqen = (vmcr & ICH_VMCR_FIQ_EN_MASK) >> - ICH_VMCR_FIQ_EN_SHIFT; + vmcrp->ackctl = FIELD_GET(ICH_VMCR_EL2_VAckCtl, vmcr); + vmcrp->fiqen = FIELD_GET(ICH_VMCR_EL2_VFIQEn, vmcr); } else { /* * When emulating GICv3 on GICv3 with SRE=1 on the @@ -453,13 +449,13 @@ void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) vmcrp->ackctl = 0; } - vmcrp->cbpr = (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT; - vmcrp->eoim = (vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT; - vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; - vmcrp->bpr = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; - vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; - vmcrp->grpen0 = (vmcr & ICH_VMCR_ENG0_MASK) >> ICH_VMCR_ENG0_SHIFT; - vmcrp->grpen1 = (vmcr & ICH_VMCR_ENG1_MASK) >> ICH_VMCR_ENG1_SHIFT; + vmcrp->cbpr = FIELD_GET(ICH_VMCR_EL2_VCBPR, vmcr); + vmcrp->eoim = FIELD_GET(ICH_VMCR_EL2_VEOIM, vmcr); + vmcrp->abpr = FIELD_GET(ICH_VMCR_EL2_VBPR1, vmcr); + vmcrp->bpr = FIELD_GET(ICH_VMCR_EL2_VBPR0, vmcr); + vmcrp->pmr = FIELD_GET(ICH_VMCR_EL2_VPMR, vmcr); + vmcrp->grpen0 = FIELD_GET(ICH_VMCR_EL2_VENG0, vmcr); + vmcrp->grpen1 = FIELD_GET(ICH_VMCR_EL2_VENG1, vmcr); } #define INITIAL_PENDBASER_VALUE \ From b583177aafe3ca753ddd3624c8731a93d0cd0b37 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 28 Jan 2026 18:00:06 +0000 Subject: [PATCH 179/267] arm64/sysreg: Drop ICH_HFGRTR_EL2.ICC_HAPR_EL1 and make RES1 The GICv5 architecture is dropping the ICC_HAPR_EL1 and ICV_HAPR_EL1 system registers. These registers were never added to the sysregs, but the traps for them were. 
Drop the trap bit from the ICH_HFGRTR_EL2 and make it Res1 as per the upcoming GICv5 spec change. Additionally, update the EL2 setup code to not attempt to set that bit. Signed-off-by: Sascha Bischoff Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260128175919.3828384-4-sascha.bischoff@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/el2_setup.h | 1 - arch/arm64/tools/sysreg | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index cacd20df1786..07c12f4a69b4 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -225,7 +225,6 @@ ICH_HFGRTR_EL2_ICC_ICSR_EL1 | \ ICH_HFGRTR_EL2_ICC_PCR_EL1 | \ ICH_HFGRTR_EL2_ICC_HPPIR_EL1 | \ - ICH_HFGRTR_EL2_ICC_HAPR_EL1 | \ ICH_HFGRTR_EL2_ICC_CR0_EL1 | \ ICH_HFGRTR_EL2_ICC_IDRn_EL1 | \ ICH_HFGRTR_EL2_ICC_APR_EL1) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 8921b51866d6..dab5bfe8c968 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -4579,7 +4579,7 @@ Field 7 ICC_IAFFIDR_EL1 Field 6 ICC_ICSR_EL1 Field 5 ICC_PCR_EL1 Field 4 ICC_HPPIR_EL1 -Field 3 ICC_HAPR_EL1 +Res1 3 Field 2 ICC_CR0_EL1 Field 1 ICC_IDRn_EL1 Field 0 ICC_APR_EL1 From 9435c1e1431003e23aa34ef8e46c30d09c3dbcb5 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 28 Jan 2026 18:00:52 +0000 Subject: [PATCH 180/267] KVM: arm64: gic: Set vgic_model before initing private IRQs Different GIC types require the private IRQs to be initialised differently. GICv5 is the culprit as it supports both a different number of private IRQs, and all of these are PPIs (there are no SGIs). Moreover, as GICv5 uses the top bits of the interrupt ID to encode the type, the intid also needs to computed differently. Up until now, the GIC model has been set after initialising the private IRQs for a VCPU. Move this earlier to ensure that the GIC model is available when configuring the private IRQs. While we're at it, also move the setting of the in_kernel flag and implementation revision to keep them grouped together as before. 
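The resulting order in kvm_vgic_create() is, in sketch form:

	kvm->arch.vgic.in_kernel = true;
	kvm->arch.vgic.vgic_model = type;
	kvm->arch.vgic.implementation_rev = KVM_VGIC_IMP_REV_LATEST;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		/* can now rely on vgic_model when picking the intid layout */
		ret = vgic_allocate_private_irqs_locked(vcpu, type);
		if (ret)
			goto out_unlock;
	}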
Signed-off-by: Sascha Bischoff Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260128175919.3828384-7-sascha.bischoff@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-init.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index dc9f9db31026..86c149537493 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -140,6 +140,10 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) goto out_unlock; } + kvm->arch.vgic.in_kernel = true; + kvm->arch.vgic.vgic_model = type; + kvm->arch.vgic.implementation_rev = KVM_VGIC_IMP_REV_LATEST; + kvm_for_each_vcpu(i, vcpu, kvm) { ret = vgic_allocate_private_irqs_locked(vcpu, type); if (ret) @@ -156,10 +160,6 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) goto out_unlock; } - kvm->arch.vgic.in_kernel = true; - kvm->arch.vgic.vgic_model = type; - kvm->arch.vgic.implementation_rev = KVM_VGIC_IMP_REV_LATEST; - kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC; From 3227c3a89d65fe7482312b7b27038d9ebd86f210 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 28 Jan 2026 18:07:33 +0000 Subject: [PATCH 181/267] irqchip/gic-v5: Check if impl is virt capable Now that there is support for creating a GICv5-based guest with KVM, check that the hardware itself supports virtualisation, skipping the setting of struct gic_kvm_info if not. Note: If native GICv5 virt is not supported, then nor is FEAT_GCIE_LEGACY, so we are able to skip altogether. Signed-off-by: Sascha Bischoff Reviewed-by: Lorenzo Pieralisi Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260128175919.3828384-33-sascha.bischoff@arm.com [maz: cosmetic changes] Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v5-irs.c | 2 ++ drivers/irqchip/irq-gic-v5.c | 10 ++++++++++ include/linux/irqchip/arm-gic-v5.h | 4 ++++ 3 files changed, 16 insertions(+) diff --git a/drivers/irqchip/irq-gic-v5-irs.c b/drivers/irqchip/irq-gic-v5-irs.c index ce2732d649a3..eeeb40fb0eaa 100644 --- a/drivers/irqchip/irq-gic-v5-irs.c +++ b/drivers/irqchip/irq-gic-v5-irs.c @@ -743,6 +743,8 @@ static int __init gicv5_irs_init(struct device_node *node) * be consistent across IRSes by the architecture. */ if (list_empty(&irs_nodes)) { + idr = irs_readl_relaxed(irs_data, GICV5_IRS_IDR0); + gicv5_global_data.virt_capable = !FIELD_GET(GICV5_IRS_IDR0_VIRT, idr); idr = irs_readl_relaxed(irs_data, GICV5_IRS_IDR1); irs_setup_pri_bits(idr); diff --git a/drivers/irqchip/irq-gic-v5.c b/drivers/irqchip/irq-gic-v5.c index 41ef286c4d78..3c86bbc05761 100644 --- a/drivers/irqchip/irq-gic-v5.c +++ b/drivers/irqchip/irq-gic-v5.c @@ -1064,6 +1064,16 @@ static struct gic_kvm_info gic_v5_kvm_info __initdata; static void __init gic_of_setup_kvm_info(struct device_node *node) { + /* + * If we don't have native GICv5 virtualisation support, then + * we also don't have FEAT_GCIE_LEGACY - the architecture + * forbids this combination. 
+ */ + if (!gicv5_global_data.virt_capable) { + pr_info("GIC implementation is not virtualization capable\n"); + return; + } + gic_v5_kvm_info.type = GIC_V5; /* GIC Virtual CPU interface maintenance interrupt */ diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h index 68ddcdb1cec5..4cb71ce6e8ad 100644 --- a/include/linux/irqchip/arm-gic-v5.h +++ b/include/linux/irqchip/arm-gic-v5.h @@ -43,6 +43,7 @@ /* * IRS registers and tables structures */ +#define GICV5_IRS_IDR0 0x0000 #define GICV5_IRS_IDR1 0x0004 #define GICV5_IRS_IDR2 0x0008 #define GICV5_IRS_IDR5 0x0014 @@ -63,6 +64,8 @@ #define GICV5_IRS_IST_STATUSR 0x0194 #define GICV5_IRS_MAP_L2_ISTR 0x01c0 +#define GICV5_IRS_IDR0_VIRT BIT(6) + #define GICV5_IRS_IDR1_PRIORITY_BITS GENMASK(22, 20) #define GICV5_IRS_IDR1_IAFFID_BITS GENMASK(19, 16) @@ -278,6 +281,7 @@ struct gicv5_chip_data { u8 cpuif_pri_bits; u8 cpuif_id_bits; u8 irs_pri_bits; + bool virt_capable; struct { __le64 *l1ist_addr; u32 l2_size; From c0d6b8bbbced660e9c2efe079e2b2cb34b27d97f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 27 Jan 2026 17:43:10 -0800 Subject: [PATCH 182/267] KVM: VMX: Print out "bad" offsets+value on VMCS config mismatch When kvm-intel.ko refuses to load due to a mismatched VMCS config, print all mismatching offsets+values to make it easier to debug goofs during development, and to make it at least feasible to triage failures that occur during production. E.g. if a physical core is flaky or is running with the "wrong" microcode patch loaded, then a CPU can get a legitimate mismatch even without KVM bugs. Print the mismatches as 32-bit values as a compromise between hand coding every field (to provide precise information) and printing individual bytes (requires more effort to deduce the mismatch bit(s)). All fields in the VMCS config are either 32-bit or 64-bit values, i.e. in many cases, printing 32-bit values will be 100% precise, and in the others it's close enough, especially when considering that MSR values are split into EDX:EAX anyways. E.g. on mismatch CET entry/exit controls, KVM will print: kvm_intel: VMCS config on CPU 0 doesn't match reference config: Offset 76 REF = 0x107fffff, CPU0 = 0x007fffff, mismatch = 0x10000000 Offset 84 REF = 0x0010f3ff, CPU0 = 0x0000f3ff, mismatch = 0x00100000 Opportunistically tweak the wording on the initial error message to say "mismatch" instead of "inconsistent", as the VMCS config itself isn't inconsistent, and the wording conflates the cross-CPU compatibility check with the error_on_inconsistent_vmcs_config knob that treats inconsistent VMCS configurations as errors (e.g. if a CPU supports CET entry controls but no CET exit controls). 
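To translate a printed offset back to a field, divide by sizeof(u32) to get the index of the 32-bit word within struct vmcs_config, or compare against offsetof() directly, e.g. with a throwaway debug print along these lines (purely illustrative, not part of this patch):

	pr_info("vmexit_ctrl at %zu, vmentry_ctrl at %zu\n",
		offsetof(struct vmcs_config, vmexit_ctrl),
		offsetof(struct vmcs_config, vmentry_ctrl));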
Cc: Jim Mattson Reviewed-by: Xiaoyao Li Reviewed-by: Binbin Wu Link: https://patch.msgid.link/20260128014310.3255561-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 8438799eed74..c105756e59f0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2921,8 +2921,23 @@ int vmx_check_processor_compat(void) } if (nested) nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); + if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) { - pr_err("Inconsistent VMCS config on CPU %d\n", cpu); + u32 *gold = (void *)&vmcs_config; + u32 *mine = (void *)&vmcs_conf; + int i; + + BUILD_BUG_ON(sizeof(struct vmcs_config) % sizeof(u32)); + + pr_err("VMCS config on CPU %d doesn't match reference config:", cpu); + for (i = 0; i < sizeof(struct vmcs_config) / sizeof(u32); i++) { + if (gold[i] == mine[i]) + continue; + + pr_cont("\n Offset %u REF = 0x%08x, CPU%u = 0x%08x, mismatch = 0x%08x", + i * (int)sizeof(u32), gold[i], cpu, mine[i], gold[i] ^ mine[i]); + } + pr_cont("\n"); return -EIO; } return 0; From 3f2757dbf32a31cef738a983bde6ecd2641484c0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 27 Jan 2026 17:43:09 -0800 Subject: [PATCH 183/267] KVM: x86: Harden against unexpected adjustments to kvm_cpu_caps Add a flag to track when KVM is actively configuring its CPU caps, and WARN if a cap is set or cleared if KVM isn't in its configuration stage. Modifying CPU caps after {svm,vmx}_set_cpu_caps() can be fatal to KVM, as vendor setup code expects the CPU caps to be frozen at that point, e.g. will do additional configuration based on the caps. Rename kvm_set_cpu_caps() to kvm_initialize_cpu_caps() to pair with the new "finalize", and to make it more obvious that KVM's CPU caps aren't fully configured within the function. Reviewed-by: Xiaoyao Li Reviewed-by: Binbin Wu Link: https://patch.msgid.link/20260128014310.3255561-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 10 ++++++++-- arch/x86/kvm/cpuid.h | 12 +++++++++++- arch/x86/kvm/svm/svm.c | 4 +++- arch/x86/kvm/vmx/vmx.c | 4 +++- 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0172b7ef2ba4..3508fb8606e9 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -36,6 +36,9 @@ u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_caps); +bool kvm_is_configuring_cpu_caps __read_mostly; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_configuring_cpu_caps); + struct cpuid_xstate_sizes { u32 eax; u32 ebx; @@ -826,10 +829,13 @@ do { \ /* DS is defined by ptrace-abi.h on 32-bit builds. 
*/ #undef DS -void kvm_set_cpu_caps(void) +void kvm_initialize_cpu_caps(void) { memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps)); + WARN_ON_ONCE(kvm_is_configuring_cpu_caps); + kvm_is_configuring_cpu_caps = true; + BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) > sizeof(boot_cpu_data.x86_capability)); @@ -1288,7 +1294,7 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_RDPID); } } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cpu_caps); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_initialize_cpu_caps); #undef F #undef SCATTERED_F diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index d3f5ae15a7ca..039b8e6f40ba 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -8,7 +8,15 @@ #include extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; -void kvm_set_cpu_caps(void); +extern bool kvm_is_configuring_cpu_caps __read_mostly; + +void kvm_initialize_cpu_caps(void); + +static inline void kvm_finalize_cpu_caps(void) +{ + WARN_ON_ONCE(!kvm_is_configuring_cpu_caps); + kvm_is_configuring_cpu_caps = false; +} void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(struct kvm_cpuid_entry2 *entries, @@ -188,6 +196,7 @@ static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature) { unsigned int x86_leaf = __feature_leaf(x86_feature); + WARN_ON_ONCE(!kvm_is_configuring_cpu_caps); kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature); } @@ -195,6 +204,7 @@ static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature) { unsigned int x86_leaf = __feature_leaf(x86_feature); + WARN_ON_ONCE(!kvm_is_configuring_cpu_caps); kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 24d59ccfa40d..8cf1a47304df 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5202,7 +5202,7 @@ static __init void svm_adjust_mmio_mask(void) static __init void svm_set_cpu_caps(void) { - kvm_set_cpu_caps(); + kvm_initialize_cpu_caps(); kvm_caps.supported_perf_cap = 0; @@ -5284,6 +5284,8 @@ static __init void svm_set_cpu_caps(void) */ kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT); kvm_cpu_cap_clear(X86_FEATURE_MSR_IMM); + + kvm_finalize_cpu_caps(); } static __init int svm_hardware_setup(void) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6b96f7aea20b..30fa20f8b72d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7994,7 +7994,7 @@ static __init u64 vmx_get_perf_capabilities(void) static __init void vmx_set_cpu_caps(void) { - kvm_set_cpu_caps(); + kvm_initialize_cpu_caps(); /* CPUID 0x1 */ if (nested) @@ -8051,6 +8051,8 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_SHSTK); kvm_cpu_cap_clear(X86_FEATURE_IBT); } + + kvm_finalize_cpu_caps(); } static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, From 6517dfbcc918f970a928d9dc17586904bac06893 Mon Sep 17 00:00:00 2001 From: Khushit Shah Date: Fri, 23 Jan 2026 12:56:25 +0000 Subject: [PATCH 184/267] KVM: x86: Add x2APIC "features" to control EOI broadcast suppression Add two flags for KVM_CAP_X2APIC_API to allow userspace to control support for Suppress EOI Broadcasts when using a split IRQCHIP (I/O APIC emulated by userspace), which KVM completely mishandles. When x2APIC support was first added, KVM incorrectly advertised and "enabled" Suppress EOI Broadcast, without fully supporting the I/O APIC side of the equation, i.e. without adding directed EOI to KVM's in-kernel I/O APIC. That flaw was carried over to split IRQCHIP support, i.e. 
KVM advertised support for Suppress EOI Broadcasts irrespective of whether or not the userspace I/O APIC implementation supported directed EOIs. Even worse, KVM didn't actually suppress EOI broadcasts, i.e. userspace VMMs without support for directed EOI came to rely on the "spurious" broadcasts. KVM "fixed" the in-kernel I/O APIC implementation by completely disabling support for Suppress EOI Broadcasts in commit 0bcc3fb95b97 ("KVM: lapic: stop advertising DIRECTED_EOI when in-kernel IOAPIC is in use"), but didn't do anything to remedy userspace I/O APIC implementations. KVM's bogus handling of Suppress EOI Broadcast is problematic when the guest relies on interrupts being masked in the I/O APIC until well after the initial local APIC EOI. E.g. Windows with Credential Guard enabled handles interrupts in the following order: 1. Interrupt for L2 arrives. 2. L1 APIC EOIs the interrupt. 3. L1 resumes L2 and injects the interrupt. 4. L2 EOIs after servicing. 5. L1 performs the I/O APIC EOI. Because KVM EOIs the I/O APIC at step #2, the guest can get an interrupt storm, e.g. if the IRQ line is still asserted and userspace reacts to the EOI by re-injecting the IRQ, because the guest doesn't de-assert the line until step #4, and doesn't expect the interrupt to be re-enabled until step #5. Unfortunately, simply "fixing" the bug isn't an option, as KVM has no way of knowing if the userspace I/O APIC supports directed EOIs, i.e. suppressing EOI broadcasts would result in interrupts being stuck masked in the userspace I/O APIC due to step #5 being ignored by userspace. And fully disabling support for Suppress EOI Broadcast is also undesirable, as picking up the fix would require a guest reboot, *and* more importantly would change the virtual CPU model exposed to the guest without any buy-in from userspace. Add KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST and KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST flags to allow userspace to explicitly enable or disable support for Suppress EOI Broadcasts. This gives userspace control over the virtual CPU model exposed to the guest, as KVM should never have enabled support for Suppress EOI Broadcast without userspace opt-in. Not setting either flag will result in legacy quirky behavior for backward compatibility. Disallow fully enabling SUPPRESS_EOI_BROADCAST when using an in-kernel I/O APIC, as KVM's history/support is just as tragic. E.g. it's not clear that commit c806a6ad35bf ("KVM: x86: call irq notifiers with directed EOI") was entirely correct, i.e. it may have simply papered over the lack of Directed EOI emulation in the I/O APIC. Note, Suppress EOI Broadcasts is defined only in Intel's SDM, not in AMD's APM. But the bit is writable on some AMD CPUs, e.g. Turin, and KVM's ABI is to support Directed EOI (KVM's name) irrespective of guest CPU vendor. 
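For example, a userspace VMM whose I/O APIC model implements Directed EOI might opt in along these lines (illustrative snippet; vm_fd is the VM file descriptor, other KVM_X2APIC_API_* flags can be ORed in as needed, and the capability must be enabled before any vCPU is created):

	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_X2APIC_API,
		.args[0] = KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST,
	};

	/* VM-scoped capability, so issue the ioctl on the VM fd. */
	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);

A VMM without Directed EOI support in its I/O APIC model should instead set KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST so that the feature is never advertised to the guest.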
Fixes: 7543a635aa09 ("KVM: x86: Add KVM exit for IOAPIC EOIs") Closes: https://lore.kernel.org/kvm/7D497EF1-607D-4D37-98E7-DAF95F099342@nutanix.com Cc: stable@vger.kernel.org Suggested-by: David Woodhouse Signed-off-by: Khushit Shah Link: https://patch.msgid.link/20260123125657.3384063-1-khushit.shah@nutanix.com [sean: clean up minor formatting goofs and fix a comment typo] Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 28 +++++++++++- arch/x86/include/asm/kvm_host.h | 7 +++ arch/x86/include/uapi/asm/kvm.h | 6 ++- arch/x86/kvm/ioapic.c | 2 +- arch/x86/kvm/lapic.c | 76 +++++++++++++++++++++++++++++---- arch/x86/kvm/lapic.h | 2 + arch/x86/kvm/x86.c | 21 ++++++++- 7 files changed, 127 insertions(+), 15 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 01a3abef8abb..f1f1d2e5dc7c 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7835,8 +7835,10 @@ Will return -EBUSY if a VCPU has already been created. Valid feature flags in args[0] are:: - #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) - #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) + #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) + #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) + #define KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST (1ULL << 2) + #define KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST (1ULL << 3) Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC, @@ -7849,6 +7851,28 @@ as a broadcast even in x2APIC mode in order to support physical x2APIC without interrupt remapping. This is undesirable in logical mode, where 0xff represents CPUs 0-7 in cluster 0. +Setting KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST instructs KVM to enable +Suppress EOI Broadcasts. KVM will advertise support for Suppress EOI +Broadcast to the guest and suppress LAPIC EOI broadcasts when the guest +sets the Suppress EOI Broadcast bit in the SPIV register. This flag is +supported only when using a split IRQCHIP. + +Setting KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST disables support for +Suppress EOI Broadcasts entirely, i.e. instructs KVM to NOT advertise +support to the guest. + +Modern VMMs should either enable KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST +or KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST. If not, legacy quirky +behavior will be used by KVM: in split IRQCHIP mode, KVM will advertise +support for Suppress EOI Broadcasts but not actually suppress EOI +broadcasts; for in-kernel IRQCHIP mode, KVM will not advertise support for +Suppress EOI Broadcasts. + +Setting both KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST and +KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST will fail with an EINVAL error, +as will setting KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST without a split +IRCHIP. 
+ 7.8 KVM_CAP_S390_USER_INSTR0 ---------------------------- diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 80fd7ca69c87..0414859bfa66 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1228,6 +1228,12 @@ enum kvm_irqchip_mode { KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ }; +enum kvm_suppress_eoi_broadcast_mode { + KVM_SUPPRESS_EOI_BROADCAST_QUIRKED, /* Legacy behavior */ + KVM_SUPPRESS_EOI_BROADCAST_ENABLED, /* Enable Suppress EOI broadcast */ + KVM_SUPPRESS_EOI_BROADCAST_DISABLED /* Disable Suppress EOI broadcast */ +}; + struct kvm_x86_msr_filter { u8 count; bool default_allow:1; @@ -1477,6 +1483,7 @@ struct kvm_arch { bool x2apic_format; bool x2apic_broadcast_quirk_disabled; + enum kvm_suppress_eoi_broadcast_mode suppress_eoi_broadcast_mode; bool has_mapped_host_mmio; bool guest_can_read_msr_platform_info; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 7ceff6583652..1208932e5cc3 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -914,8 +914,10 @@ struct kvm_sev_snp_launch_finish { __u64 pad1[4]; }; -#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) -#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) +#define KVM_X2APIC_API_USE_32BIT_IDS _BITULL(0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK _BITULL(1) +#define KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST _BITULL(2) +#define KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST _BITULL(3) struct kvm_hyperv_eventfd { __u32 conn_id; diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 2c2783296aed..a26fa4222f29 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -561,7 +561,7 @@ static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu, spin_lock(&ioapic->lock); if (trigger_mode != IOAPIC_LEVEL_TRIG || - kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) + kvm_lapic_suppress_eoi_broadcast(apic)) return; ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 01c2557bfa05..738ec3c1b0b5 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -105,6 +105,63 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector) apic_test_vector(vector, apic->regs + APIC_IRR); } +static bool kvm_lapic_advertise_suppress_eoi_broadcast(struct kvm *kvm) +{ + switch (kvm->arch.suppress_eoi_broadcast_mode) { + case KVM_SUPPRESS_EOI_BROADCAST_ENABLED: + return true; + case KVM_SUPPRESS_EOI_BROADCAST_DISABLED: + return false; + case KVM_SUPPRESS_EOI_BROADCAST_QUIRKED: + /* + * The default in-kernel I/O APIC emulates the 82093AA and does not + * implement an EOI register. Some guests (e.g. Windows with the + * Hyper-V role enabled) disable LAPIC EOI broadcast without + * checking the I/O APIC version, which can cause level-triggered + * interrupts to never be EOI'd. + * + * To avoid this, KVM doesn't advertise Suppress EOI Broadcast + * support when using the default in-kernel I/O APIC. + * + * Historically, in split IRQCHIP mode, KVM always advertised + * Suppress EOI Broadcast support but did not actually suppress + * EOIs, resulting in quirky behavior. 
+ */ + return !ioapic_in_kernel(kvm); + default: + WARN_ON_ONCE(1); + return false; + } +} + +bool kvm_lapic_suppress_eoi_broadcast(struct kvm_lapic *apic) +{ + struct kvm *kvm = apic->vcpu->kvm; + + if (!(kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) + return false; + + switch (kvm->arch.suppress_eoi_broadcast_mode) { + case KVM_SUPPRESS_EOI_BROADCAST_ENABLED: + return true; + case KVM_SUPPRESS_EOI_BROADCAST_DISABLED: + return false; + case KVM_SUPPRESS_EOI_BROADCAST_QUIRKED: + /* + * Historically, in split IRQCHIP mode, KVM ignored the suppress + * EOI broadcast bit set by the guest and broadcasts EOIs to the + * userspace I/O APIC. For In-kernel I/O APIC, the support itself + * is not advertised, can only be enabled via KVM_SET_APIC_STATE, + * and KVM's I/O APIC doesn't emulate Directed EOIs; but if the + * feature is enabled, it is respected (with odd behavior). + */ + return ioapic_in_kernel(kvm); + default: + WARN_ON_ONCE(1); + return false; + } +} + __read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu); EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_has_noapic_vcpu); @@ -554,15 +611,9 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) v = APIC_VERSION | ((apic->nr_lvt_entries - 1) << 16); - /* - * KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation) - * which doesn't have EOI register; Some buggy OSes (e.g. Windows with - * Hyper-V role) disable EOI broadcast in lapic not checking for IOAPIC - * version first and level-triggered interrupts never get EOIed in - * IOAPIC. - */ + if (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) && - !ioapic_in_kernel(vcpu->kvm)) + kvm_lapic_advertise_suppress_eoi_broadcast(vcpu->kvm)) v |= APIC_LVR_DIRECTED_EOI; kvm_lapic_set_reg(apic, APIC_LVR, v); } @@ -1517,6 +1568,15 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) /* Request a KVM exit to inform the userspace IOAPIC. */ if (irqchip_split(apic->vcpu->kvm)) { + /* + * Don't exit to userspace if the guest has enabled Directed + * EOI, a.k.a. Suppress EOI Broadcasts, in which case the local + * APIC doesn't broadcast EOIs (the guest must EOI the target + * I/O APIC(s) directly). 
+ */ + if (kvm_lapic_suppress_eoi_broadcast(apic)) + return; + apic->vcpu->arch.pending_ioapic_eoi = vector; kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu); return; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 282b9b7da98c..e5f5a222eced 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -231,6 +231,8 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); +bool kvm_lapic_suppress_eoi_broadcast(struct kvm_lapic *apic); + void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu); void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1ea94f4a3dcb..67e666921a12 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -121,8 +121,10 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE -#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ - KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) +#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ + KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK | \ + KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST | \ + KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST) static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); @@ -4932,6 +4934,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_X2APIC_API: r = KVM_X2APIC_API_VALID_FLAGS; + if (kvm && !irqchip_split(kvm)) + r &= ~KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST; break; case KVM_CAP_NESTED_STATE: r = kvm_x86_ops.nested_ops->get_state ? @@ -6740,11 +6744,24 @@ split_irqchip_unlock: if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS) break; + if ((cap->args[0] & KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST) && + (cap->args[0] & KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST)) + break; + + if ((cap->args[0] & KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST) && + !irqchip_split(kvm)) + break; + if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS) kvm->arch.x2apic_format = true; if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) kvm->arch.x2apic_broadcast_quirk_disabled = true; + if (cap->args[0] & KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST) + kvm->arch.suppress_eoi_broadcast_mode = KVM_SUPPRESS_EOI_BROADCAST_ENABLED; + if (cap->args[0] & KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST) + kvm->arch.suppress_eoi_broadcast_mode = KVM_SUPPRESS_EOI_BROADCAST_DISABLED; + r = 0; break; case KVM_CAP_X86_DISABLE_EXITS: From dcd79ed450934421158d81600f1be4c2e2af20bf Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 2 Feb 2026 08:57:19 +0000 Subject: [PATCH 185/267] KVM: arm64: Use standard seq_file iterator for idregs debugfs The current implementation uses `idreg_debugfs_iter` in `struct kvm_arch` to track the sequence position. This effectively makes the iterator shared across all open file descriptors for the VM. This approach has significant drawbacks: - It enforces mutual exclusion, preventing concurrent reads of the debugfs file (returning -EBUSY). - It relies on storing transient iterator state in the long-lived VM structure (`kvm_arch`). - The use of `u8` for the iterator index imposes an implicit limit of 255 registers. While not currently exceeded, this is fragile against future architectural growth. Switching to `loff_t` eliminates this overflow risk. Refactor the implementation to use the standard `seq_file` iterator. 
Instead of storing state in `kvm_arch`, rely on the `pos` argument passed to the `start` and `next` callbacks, which tracks the logical index specific to the file descriptor. This change enables concurrent access and eliminates the `idreg_debugfs_iter` field from `struct kvm_arch`. Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260202085721.3954942-2-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 3 -- arch/arm64/kvm/sys_regs.c | 50 +++++-------------------------- 2 files changed, 8 insertions(+), 45 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ac7f970c7883..643a199f6e9e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -373,9 +373,6 @@ struct kvm_arch { /* Maximum number of counters for the guest */ u8 nr_pmu_counters; - /* Iterator for idreg debugfs */ - u8 idreg_debugfs_iter; - /* Hypercall features firmware registers' descriptor */ struct kvm_smccc_features smccc_feat; struct maple_tree smccc_filter; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c8fd7c6a12a1..ef6b76ecfb11 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -4992,7 +4992,7 @@ static bool emulate_sys_reg(struct kvm_vcpu *vcpu, return false; } -static const struct sys_reg_desc *idregs_debug_find(struct kvm *kvm, u8 pos) +static const struct sys_reg_desc *idregs_debug_find(struct kvm *kvm, loff_t pos) { unsigned long i, idreg_idx = 0; @@ -5002,10 +5002,8 @@ static const struct sys_reg_desc *idregs_debug_find(struct kvm *kvm, u8 pos) if (!is_vm_ftr_id_reg(reg_to_encoding(r))) continue; - if (idreg_idx == pos) + if (idreg_idx++ == pos) return r; - - idreg_idx++; } return NULL; @@ -5014,23 +5012,11 @@ static const struct sys_reg_desc *idregs_debug_find(struct kvm *kvm, u8 pos) static void *idregs_debug_start(struct seq_file *s, loff_t *pos) { struct kvm *kvm = s->private; - u8 *iter; - mutex_lock(&kvm->arch.config_lock); + if (!test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags)) + return NULL; - iter = &kvm->arch.idreg_debugfs_iter; - if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags) && - *iter == (u8)~0) { - *iter = *pos; - if (!idregs_debug_find(kvm, *iter)) - iter = NULL; - } else { - iter = ERR_PTR(-EBUSY); - } - - mutex_unlock(&kvm->arch.config_lock); - - return iter; + return (void *)idregs_debug_find(kvm, *pos); } static void *idregs_debug_next(struct seq_file *s, void *v, loff_t *pos) @@ -5039,37 +5025,19 @@ static void *idregs_debug_next(struct seq_file *s, void *v, loff_t *pos) (*pos)++; - if (idregs_debug_find(kvm, kvm->arch.idreg_debugfs_iter + 1)) { - kvm->arch.idreg_debugfs_iter++; - - return &kvm->arch.idreg_debugfs_iter; - } - - return NULL; + return (void *)idregs_debug_find(kvm, *pos); } static void idregs_debug_stop(struct seq_file *s, void *v) { - struct kvm *kvm = s->private; - - if (IS_ERR(v)) - return; - - mutex_lock(&kvm->arch.config_lock); - - kvm->arch.idreg_debugfs_iter = ~0; - - mutex_unlock(&kvm->arch.config_lock); } static int idregs_debug_show(struct seq_file *s, void *v) { - const struct sys_reg_desc *desc; + const struct sys_reg_desc *desc = v; struct kvm *kvm = s->private; - desc = idregs_debug_find(kvm, kvm->arch.idreg_debugfs_iter); - - if (!desc->name) + if (!desc) return 0; seq_printf(s, "%20s:\t%016llx\n", @@ -5089,8 +5057,6 @@ DEFINE_SEQ_ATTRIBUTE(idregs_debug); void kvm_sys_regs_create_debugfs(struct kvm *kvm) { - kvm->arch.idreg_debugfs_iter = ~0; - 
debugfs_create_file("idregs", 0444, kvm->debugfs_dentry, kvm, &idregs_debug_fops); } From 5ab24969705a9adadbc1d3cff4c1c15df174eafb Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 2 Feb 2026 08:57:20 +0000 Subject: [PATCH 186/267] KVM: arm64: Reimplement vgic-debug XArray iteration The vgic-debug interface implementation uses XArray marks (`LPI_XA_MARK_DEBUG_ITER`) to "snapshot" LPIs at the start of iteration. This modifies global state for a read-only operation and complicates reference counting, leading to leaks if iteration is aborted or fails. Reimplement the iterator to use dynamic iteration logic: - Remove `lpi_idx` from `struct vgic_state_iter`. - Replace the XArray marking mechanism with dynamic iteration using `xa_find_after(..., XA_PRESENT)`. - Wrap XArray traversals in `rcu_read_lock()`/`rcu_read_unlock()` to ensure safety against concurrent modifications (e.g., LPI unmapping). - Handle potential races where an LPI is removed during iteration by gracefully skipping it in `show()`, rather than warning. - Remove the unused `LPI_XA_MARK_DEBUG_ITER` definition. This simplifies the lifecycle management of the iterator and prevents resource leaks associated with the marking mechanism, and paves the way for using a standard seq_file iterator. Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260202085721.3954942-3-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-debug.c | 68 ++++++++++---------------------- include/kvm/arm_vgic.h | 1 - 2 files changed, 20 insertions(+), 49 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index bb92853d1fd3..ec3d0c1fe703 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -25,11 +25,9 @@ struct vgic_state_iter { int nr_cpus; int nr_spis; - int nr_lpis; int dist_id; int vcpu_id; unsigned long intid; - int lpi_idx; }; static void iter_next(struct kvm *kvm, struct vgic_state_iter *iter) @@ -45,13 +43,15 @@ static void iter_next(struct kvm *kvm, struct vgic_state_iter *iter) * Let the xarray drive the iterator after the last SPI, as the iterator * has exhausted the sequentially-allocated INTID space. 
*/ - if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS - 1) && - iter->nr_lpis) { - if (iter->lpi_idx < iter->nr_lpis) - xa_find_after(&dist->lpi_xa, &iter->intid, - VGIC_LPI_MAX_INTID, - LPI_XA_MARK_DEBUG_ITER); - iter->lpi_idx++; + if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS - 1)) { + if (iter->intid == VGIC_LPI_MAX_INTID + 1) + return; + + rcu_read_lock(); + if (!xa_find_after(&dist->lpi_xa, &iter->intid, + VGIC_LPI_MAX_INTID, XA_PRESENT)) + iter->intid = VGIC_LPI_MAX_INTID + 1; + rcu_read_unlock(); return; } @@ -61,44 +61,21 @@ static void iter_next(struct kvm *kvm, struct vgic_state_iter *iter) iter->intid = 0; } -static int iter_mark_lpis(struct kvm *kvm) +static int vgic_count_lpis(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; - unsigned long intid, flags; struct vgic_irq *irq; + unsigned long intid; int nr_lpis = 0; - xa_lock_irqsave(&dist->lpi_xa, flags); - - xa_for_each(&dist->lpi_xa, intid, irq) { - if (!vgic_try_get_irq_ref(irq)) - continue; - - __xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); + rcu_read_lock(); + xa_for_each(&dist->lpi_xa, intid, irq) nr_lpis++; - } - - xa_unlock_irqrestore(&dist->lpi_xa, flags); + rcu_read_unlock(); return nr_lpis; } -static void iter_unmark_lpis(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - unsigned long intid, flags; - struct vgic_irq *irq; - - xa_for_each_marked(&dist->lpi_xa, intid, irq, LPI_XA_MARK_DEBUG_ITER) { - xa_lock_irqsave(&dist->lpi_xa, flags); - __xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); - xa_unlock_irqrestore(&dist->lpi_xa, flags); - - /* vgic_put_irq() expects to be called outside of the xa_lock */ - vgic_put_irq(kvm, irq); - } -} - static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter, loff_t pos) { @@ -108,8 +85,6 @@ static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter, iter->nr_cpus = nr_cpus; iter->nr_spis = kvm->arch.vgic.nr_spis; - if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) - iter->nr_lpis = iter_mark_lpis(kvm); /* Fast forward to the right position if needed */ while (pos--) @@ -121,7 +96,7 @@ static bool end_of_vgic(struct vgic_state_iter *iter) return iter->dist_id > 0 && iter->vcpu_id == iter->nr_cpus && iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) && - (!iter->nr_lpis || iter->lpi_idx > iter->nr_lpis); + iter->intid > VGIC_LPI_MAX_INTID; } static void *vgic_debug_start(struct seq_file *s, loff_t *pos) @@ -178,7 +153,6 @@ static void vgic_debug_stop(struct seq_file *s, void *v) mutex_lock(&kvm->arch.config_lock); iter = kvm->arch.vgic.iter; - iter_unmark_lpis(kvm); kfree(iter); kvm->arch.vgic.iter = NULL; mutex_unlock(&kvm->arch.config_lock); @@ -188,13 +162,14 @@ static void print_dist_state(struct seq_file *s, struct vgic_dist *dist, struct vgic_state_iter *iter) { bool v3 = dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3; + struct kvm *kvm = s->private; seq_printf(s, "Distributor\n"); seq_printf(s, "===========\n"); seq_printf(s, "vgic_model:\t%s\n", v3 ? "GICv3" : "GICv2"); seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis); if (v3) - seq_printf(s, "nr_lpis:\t%d\n", iter->nr_lpis); + seq_printf(s, "nr_lpis:\t%d\n", vgic_count_lpis(kvm)); seq_printf(s, "enabled:\t%d\n", dist->enabled); seq_printf(s, "\n"); @@ -291,16 +266,13 @@ static int vgic_debug_show(struct seq_file *s, void *v) if (iter->vcpu_id < iter->nr_cpus) vcpu = kvm_get_vcpu(kvm, iter->vcpu_id); - /* - * Expect this to succeed, as iter_mark_lpis() takes a reference on - * every LPI to be visited. 
- */ if (iter->intid < VGIC_NR_PRIVATE_IRQS) irq = vgic_get_vcpu_irq(vcpu, iter->intid); else irq = vgic_get_irq(kvm, iter->intid); - if (WARN_ON_ONCE(!irq)) - return -EINVAL; + + if (!irq) + return 0; raw_spin_lock_irqsave(&irq->irq_lock, flags); print_irq_state(s, irq, vcpu); diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index b261fb3968d0..d32fafbd2907 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -300,7 +300,6 @@ struct vgic_dist { */ u64 propbaser; -#define LPI_XA_MARK_DEBUG_ITER XA_MARK_0 struct xarray lpi_xa; /* used by vgic-debug */ From fb21cb08566ebed91d5c876db5c013cc8af83b89 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 2 Feb 2026 08:57:21 +0000 Subject: [PATCH 187/267] KVM: arm64: Use standard seq_file iterator for vgic-debug debugfs The current implementation uses `vgic_state_iter` in `struct vgic_dist` to track the sequence position. This effectively makes the iterator shared across all open file descriptors for the VM. This approach has significant drawbacks: - It enforces mutual exclusion, preventing concurrent reads of the debugfs file (returning -EBUSY). - It relies on storing transient iterator state in the long-lived VM structure (`vgic_dist`). Refactor the implementation to use the standard `seq_file` iterator. Instead of storing state in `kvm_arch`, rely on the `pos` argument passed to the `start` and `next` callbacks, which tracks the logical index specific to the file descriptor. This change enables concurrent access and eliminates the `vgic_state_iter` field from `struct vgic_dist`. Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260202085721.3954942-4-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-debug.c | 40 ++++++++++---------------------- include/kvm/arm_vgic.h | 3 --- 2 files changed, 12 insertions(+), 31 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index ec3d0c1fe703..2c6776a1779b 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -104,58 +104,42 @@ static void *vgic_debug_start(struct seq_file *s, loff_t *pos) struct kvm *kvm = s->private; struct vgic_state_iter *iter; - mutex_lock(&kvm->arch.config_lock); - iter = kvm->arch.vgic.iter; - if (iter) { - iter = ERR_PTR(-EBUSY); - goto out; - } - iter = kmalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) { - iter = ERR_PTR(-ENOMEM); - goto out; - } + if (!iter) + return ERR_PTR(-ENOMEM); iter_init(kvm, iter, *pos); - kvm->arch.vgic.iter = iter; - if (end_of_vgic(iter)) + if (end_of_vgic(iter)) { + kfree(iter); iter = NULL; -out: - mutex_unlock(&kvm->arch.config_lock); + } + return iter; } static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos) { struct kvm *kvm = s->private; - struct vgic_state_iter *iter = kvm->arch.vgic.iter; + struct vgic_state_iter *iter = v; ++*pos; iter_next(kvm, iter); - if (end_of_vgic(iter)) + if (end_of_vgic(iter)) { + kfree(iter); iter = NULL; + } return iter; } static void vgic_debug_stop(struct seq_file *s, void *v) { - struct kvm *kvm = s->private; - struct vgic_state_iter *iter; + struct vgic_state_iter *iter = v; - /* - * If the seq file wasn't properly opened, there's nothing to clearn - * up. 
- */ - if (IS_ERR(v)) + if (IS_ERR_OR_NULL(v)) return; - mutex_lock(&kvm->arch.config_lock); - iter = kvm->arch.vgic.iter; kfree(iter); - kvm->arch.vgic.iter = NULL; - mutex_unlock(&kvm->arch.config_lock); } static void print_dist_state(struct seq_file *s, struct vgic_dist *dist, diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index d32fafbd2907..f2eafc65bbf4 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -302,9 +302,6 @@ struct vgic_dist { struct xarray lpi_xa; - /* used by vgic-debug */ - struct vgic_state_iter *iter; - /* * GICv4 ITS per-VM data, containing the IRQ domain, the VPE * array, the property table pointer as well as allocation From 0c4762e26879acc101790269382f230f22fd6905 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 2 Feb 2026 15:22:53 +0000 Subject: [PATCH 188/267] KVM: arm64: nv: Avoid NV stage-2 code when NV is not supported The NV stage-2 manipulation functions kvm_nested_s2_unmap(), kvm_nested_s2_wp(), and others, are being called for any stage-2 manipulation regardless of whether nested virtualization is supported or enabled for the VM. For protected KVM (pKVM), `struct kvm_pgtable` uses the `pkvm_mappings` member of the union. This member aliases `ia_bits`, which is used by the non-protected NV code paths. Attempting to read `pgt->ia_bits` in these functions results in treating protected mapping pointers or state values as bit-shift amounts. This triggers a UBSAN shift-out-of-bounds error: UBSAN: shift-out-of-bounds in arch/arm64/kvm/nested.c:1127:34 shift exponent 174565952 is too large for 64-bit type 'unsigned long' Call trace: __ubsan_handle_shift_out_of_bounds+0x28c/0x2c0 kvm_nested_s2_unmap+0x228/0x248 kvm_arch_flush_shadow_memslot+0x98/0xc0 kvm_set_memslot+0x248/0xce0 Since pKVM and NV are mutually exclusive, prevent entry into these NV handling functions if the VM has not allocated any nested MMUs (i.e., `kvm->arch.nested_mmus_size` is 0). 
Fixes: 7270cc9157f47 ("KVM: arm64: nv: Handle VNCR_EL2 invalidation from MMU notifiers") Suggested-by: Marc Zyngier Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260202152310.113467-1-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/nested.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index cdeeb8f09e72..d03e9b71bf6c 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1101,6 +1101,9 @@ void kvm_nested_s2_wp(struct kvm *kvm) lockdep_assert_held_write(&kvm->mmu_lock); + if (!kvm->arch.nested_mmus_size) + return; + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; @@ -1117,6 +1120,9 @@ void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block) lockdep_assert_held_write(&kvm->mmu_lock); + if (!kvm->arch.nested_mmus_size) + return; + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; @@ -1133,6 +1139,9 @@ void kvm_nested_s2_flush(struct kvm *kvm) lockdep_assert_held_write(&kvm->mmu_lock); + if (!kvm->arch.nested_mmus_size) + return; + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; @@ -1145,6 +1154,9 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) { int i; + if (!kvm->arch.nested_mmus_size) + return; + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; From 05664e002129f5404c9d066844ef04a2223bcfea Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:30 +0100 Subject: [PATCH 189/267] KVM: s390: Refactor pgste lock and unlock functions Move the pgste lock and unlock functions back into mm/pgtable.c and duplicate them in mm/gmap_helpers.c to avoid function name collisions later on. Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/include/asm/pgtable.h | 22 ---------------------- arch/s390/mm/gmap_helpers.c | 23 ++++++++++++++++++++++- arch/s390/mm/pgtable.c | 23 ++++++++++++++++++++++- 3 files changed, 44 insertions(+), 24 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index bca9b29778c3..8194a2b12ecf 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -2040,26 +2040,4 @@ static inline unsigned long gmap_pgste_get_pgt_addr(unsigned long *pgt) return res; } -static inline pgste_t pgste_get_lock(pte_t *ptep) -{ - unsigned long value = 0; -#ifdef CONFIG_PGSTE - unsigned long *ptr = (unsigned long *)(ptep + PTRS_PER_PTE); - - do { - value = __atomic64_or_barrier(PGSTE_PCL_BIT, ptr); - } while (value & PGSTE_PCL_BIT); - value |= PGSTE_PCL_BIT; -#endif - return __pgste(value); -} - -static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) -{ -#ifdef CONFIG_PGSTE - barrier(); - WRITE_ONCE(*(unsigned long *)(ptep + PTRS_PER_PTE), pgste_val(pgste) & ~PGSTE_PCL_BIT); -#endif -} - #endif /* _S390_PAGE_H */ diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c index d41b19925a5a..4fba13675950 100644 --- a/arch/s390/mm/gmap_helpers.c +++ b/arch/s390/mm/gmap_helpers.c @@ -15,7 +15,6 @@ #include #include #include -#include /** * ptep_zap_softleaf_entry() - discard a software leaf entry. 
@@ -35,6 +34,28 @@ static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) free_swap_and_cache(entry); } +static inline pgste_t pgste_get_lock(pte_t *ptep) +{ + unsigned long value = 0; +#ifdef CONFIG_PGSTE + unsigned long *ptr = (unsigned long *)(ptep + PTRS_PER_PTE); + + do { + value = __atomic64_or_barrier(PGSTE_PCL_BIT, ptr); + } while (value & PGSTE_PCL_BIT); + value |= PGSTE_PCL_BIT; +#endif + return __pgste(value); +} + +static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) +{ +#ifdef CONFIG_PGSTE + barrier(); + WRITE_ONCE(*(unsigned long *)(ptep + PTRS_PER_PTE), pgste_val(pgste) & ~PGSTE_PCL_BIT); +#endif +} + /** * gmap_helper_zap_one_page() - discard a page if it was swapped. * @mm: the mm diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 666adcd681ab..08743c1dac2f 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -24,7 +24,6 @@ #include #include #include -#include #include pgprot_t pgprot_writecombine(pgprot_t prot) @@ -116,6 +115,28 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, return old; } +static inline pgste_t pgste_get_lock(pte_t *ptep) +{ + unsigned long value = 0; +#ifdef CONFIG_PGSTE + unsigned long *ptr = (unsigned long *)(ptep + PTRS_PER_PTE); + + do { + value = __atomic64_or_barrier(PGSTE_PCL_BIT, ptr); + } while (value & PGSTE_PCL_BIT); + value |= PGSTE_PCL_BIT; +#endif + return __pgste(value); +} + +static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) +{ +#ifdef CONFIG_PGSTE + barrier(); + WRITE_ONCE(*(unsigned long *)(ptep + PTRS_PER_PTE), pgste_val(pgste) & ~PGSTE_PCL_BIT); +#endif +} + static inline pgste_t pgste_get(pte_t *ptep) { unsigned long pgste = 0; From 013bf0f57e12b7cbed3ec28739b49606c3b1c054 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:31 +0100 Subject: [PATCH 190/267] KVM: s390: Add P bit in table entry bitfields, move union vaddress Add P bit in hardware definition of region 3 and segment table entries. Move union vaddress from kvm/gaccess.c to asm/dat_bits.h Reviewed-by: Christian Borntraeger Reviewed-by: Steffen Eiden Reviewed-by: Christoph Schlameuss Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/include/asm/dat-bits.h | 32 ++++++++++++++++++++++++++++++-- arch/s390/kvm/gaccess.c | 26 -------------------------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/arch/s390/include/asm/dat-bits.h b/arch/s390/include/asm/dat-bits.h index 8d65eec2f124..c40874e0e426 100644 --- a/arch/s390/include/asm/dat-bits.h +++ b/arch/s390/include/asm/dat-bits.h @@ -9,6 +9,32 @@ #ifndef _S390_DAT_BITS_H #define _S390_DAT_BITS_H +/* + * vaddress union in order to easily decode a virtual address into its + * region first index, region second index etc. parts. 
+ */ +union vaddress { + unsigned long addr; + struct { + unsigned long rfx : 11; + unsigned long rsx : 11; + unsigned long rtx : 11; + unsigned long sx : 11; + unsigned long px : 8; + unsigned long bx : 12; + }; + struct { + unsigned long rfx01 : 2; + unsigned long : 9; + unsigned long rsx01 : 2; + unsigned long : 9; + unsigned long rtx01 : 2; + unsigned long : 9; + unsigned long sx01 : 2; + unsigned long : 29; + }; +}; + union asce { unsigned long val; struct { @@ -98,7 +124,8 @@ union region3_table_entry { struct { unsigned long : 53; unsigned long fc: 1; /* Format-Control */ - unsigned long : 4; + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long : 3; unsigned long i : 1; /* Region-Invalid Bit */ unsigned long cr: 1; /* Common-Region Bit */ unsigned long tt: 2; /* Table-Type Bits */ @@ -140,7 +167,8 @@ union segment_table_entry { struct { unsigned long : 53; unsigned long fc: 1; /* Format-Control */ - unsigned long : 4; + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long : 3; unsigned long i : 1; /* Segment-Invalid Bit */ unsigned long cs: 1; /* Common-Segment Bit */ unsigned long tt: 2; /* Table-Type Bits */ diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 41ca6b0ee7a9..d8347f7cbe51 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -20,32 +20,6 @@ #define GMAP_SHADOW_FAKE_TABLE 1ULL -/* - * vaddress union in order to easily decode a virtual address into its - * region first index, region second index etc. parts. - */ -union vaddress { - unsigned long addr; - struct { - unsigned long rfx : 11; - unsigned long rsx : 11; - unsigned long rtx : 11; - unsigned long sx : 11; - unsigned long px : 8; - unsigned long bx : 12; - }; - struct { - unsigned long rfx01 : 2; - unsigned long : 9; - unsigned long rsx01 : 2; - unsigned long : 9; - unsigned long rtx01 : 2; - unsigned long : 9; - unsigned long sx01 : 2; - unsigned long : 29; - }; -}; - /* * raddress union which will contain the result (real or absolute address) * after a page table walk. The rfaa, sfaa and pfra members are used to From 21401ce497062400f24cdd268c62fd47d126dd2e Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:32 +0100 Subject: [PATCH 191/267] s390: Make UV folio operations work on whole folio uv_destroy_folio() and uv_convert_from_secure_folio() should work on all pages in the folio, not just the first one. This was fine until now, but it will become a problem with upcoming patches. 
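As an aside (illustrative only, not part of the patch): a folio is a physically contiguous, naturally aligned block of 2^order pages, which is why stepping the physical address in PAGE_SIZE increments reaches every page, and why the loop bound used below is equivalent to folio_nr_pages():

	long i, nr = 1L << folio_order(folio);	/* == folio_nr_pages(folio) */

	for (i = 0; i < nr; i++)
		rc = uv_destroy(folio_to_phys(folio) + i * PAGE_SIZE);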
Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/kernel/uv.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index ed46950be86f..ca0849008c0d 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -134,14 +134,15 @@ static int uv_destroy(unsigned long paddr) */ int uv_destroy_folio(struct folio *folio) { + unsigned long i; int rc; - /* Large folios cannot be secure */ - if (unlikely(folio_test_large(folio))) - return 0; - folio_get(folio); - rc = uv_destroy(folio_to_phys(folio)); + for (i = 0; i < (1 << folio_order(folio)); i++) { + rc = uv_destroy(folio_to_phys(folio) + i * PAGE_SIZE); + if (rc) + break; + } if (!rc) clear_bit(PG_arch_1, &folio->flags.f); folio_put(folio); @@ -183,14 +184,15 @@ EXPORT_SYMBOL_GPL(uv_convert_from_secure); */ int uv_convert_from_secure_folio(struct folio *folio) { + unsigned long i; int rc; - /* Large folios cannot be secure */ - if (unlikely(folio_test_large(folio))) - return 0; - folio_get(folio); - rc = uv_convert_from_secure(folio_to_phys(folio)); + for (i = 0; i < (1 << folio_order(folio)); i++) { + rc = uv_convert_from_secure(folio_to_phys(folio) + i * PAGE_SIZE); + if (rc) + break; + } if (!rc) clear_bit(PG_arch_1, &folio->flags.f); folio_put(folio); From 4dadf64d9b75916081a55f9fbdc70e33be8d3ead Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:33 +0100 Subject: [PATCH 192/267] s390: Move sske_frame() to a header Move the sske_frame() function to asm/pgtable.h, so it can be used in other modules too. Opportunistically convert the .insn opcode specification to the appropriate mnemonic. Reviewed-by: Christian Borntraeger Reviewed-by: Steffen Eiden Reviewed-by: Christoph Schlameuss Reviewed-by: Nina Schoetterl-Glausch Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/include/asm/pgtable.h | 7 +++++++ arch/s390/mm/pageattr.c | 7 ------- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 8194a2b12ecf..73c30b811b98 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1136,6 +1136,13 @@ static inline pte_t pte_mkhuge(pte_t pte) } #endif +static inline unsigned long sske_frame(unsigned long addr, unsigned char skey) +{ + asm volatile("sske %[skey],%[addr],1" + : [addr] "+a" (addr) : [skey] "d" (skey)); + return addr; +} + #define IPTE_GLOBAL 0 #define IPTE_LOCAL 1 diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index d3ce04a4b248..bb29c38ae624 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -16,13 +16,6 @@ #include #include -static inline unsigned long sske_frame(unsigned long addr, unsigned char skey) -{ - asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],1,0" - : [addr] "+a" (addr) : [skey] "d" (skey)); - return addr; -} - void __storage_key_init_range(unsigned long start, unsigned long end) { unsigned long boundary, size; From c98175b7917fa81cd499b1527c4a57fd7d36711e Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:34 +0100 Subject: [PATCH 193/267] KVM: s390: Add gmap_helper_set_unused() Add gmap_helper_set_unused() to mark userspace ptes as unused. Core mm code will use that information to discard unused pages instead of attempting to swap them. 
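A minimal sketch of the intended usage, with a hypothetical caller name (the real call sites are wired up in later patches); per the function's locking contract, holding the mmap lock is sufficient:

	/* Hypothetical helper, for illustration only. */
	static void example_mark_range_unused(struct mm_struct *mm,
					      unsigned long start, unsigned long end)
	{
		unsigned long vmaddr;

		/* The mmap lock satisfies the locking requirements. */
		mmap_read_lock(mm);
		for (vmaddr = start; vmaddr < end; vmaddr += PAGE_SIZE)
			gmap_helper_try_set_pte_unused(mm, vmaddr);
		mmap_read_unlock(mm);
	}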
Reviewed-by: Nico Boehr Tested-by: Nico Boehr Acked-by: Christoph Schlameuss Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/include/asm/gmap_helpers.h | 1 + arch/s390/mm/gmap_helpers.c | 79 ++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/arch/s390/include/asm/gmap_helpers.h b/arch/s390/include/asm/gmap_helpers.h index 5356446a61c4..2d3ae421077e 100644 --- a/arch/s390/include/asm/gmap_helpers.h +++ b/arch/s390/include/asm/gmap_helpers.h @@ -11,5 +11,6 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr); void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end); int gmap_helper_disable_cow_sharing(void); +void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr); #endif /* _ASM_S390_GMAP_HELPERS_H */ diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c index 4fba13675950..4864cb35fc25 100644 --- a/arch/s390/mm/gmap_helpers.c +++ b/arch/s390/mm/gmap_helpers.c @@ -129,6 +129,85 @@ void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned lo } EXPORT_SYMBOL_GPL(gmap_helper_discard); +/** + * gmap_helper_try_set_pte_unused() - mark a pte entry as unused + * @mm: the mm + * @vmaddr: the userspace address whose pte is to be marked + * + * Mark the pte corresponding the given address as unused. This will cause + * core mm code to just drop this page instead of swapping it. + * + * This function needs to be called with interrupts disabled (for example + * while holding a spinlock), or while holding the mmap lock. Normally this + * function is called as a result of an unmap operation, and thus KVM common + * code will already hold kvm->mmu_lock in write mode. + * + * Context: Needs to be called while holding the mmap lock or with interrupts + * disabled. + */ +void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr) +{ + pmd_t *pmdp, pmd, pmdval; + pud_t *pudp, pud; + p4d_t *p4dp, p4d; + pgd_t *pgdp, pgd; + spinlock_t *ptl; /* Lock for the host (userspace) page table */ + pte_t *ptep; + + pgdp = pgd_offset(mm, vmaddr); + pgd = pgdp_get(pgdp); + if (pgd_none(pgd) || !pgd_present(pgd)) + return; + + p4dp = p4d_offset(pgdp, vmaddr); + p4d = p4dp_get(p4dp); + if (p4d_none(p4d) || !p4d_present(p4d)) + return; + + pudp = pud_offset(p4dp, vmaddr); + pud = pudp_get(pudp); + if (pud_none(pud) || pud_leaf(pud) || !pud_present(pud)) + return; + + pmdp = pmd_offset(pudp, vmaddr); + pmd = pmdp_get_lockless(pmdp); + if (pmd_none(pmd) || pmd_leaf(pmd) || !pmd_present(pmd)) + return; + + ptep = pte_offset_map_rw_nolock(mm, pmdp, vmaddr, &pmdval, &ptl); + if (!ptep) + return; + + /* + * Several paths exists that takes the ptl lock and then call the + * mmu_notifier, which takes the mmu_lock. The unmap path, instead, + * takes the mmu_lock in write mode first, and then potentially + * calls this function, which takes the ptl lock. This can lead to a + * deadlock. + * The unused page mechanism is only an optimization, if the + * _PAGE_UNUSED bit is not set, the unused page is swapped as normal + * instead of being discarded. + * If the lock is contended the bit is not set and the deadlock is + * avoided. + */ + if (spin_trylock(ptl)) { + /* + * Make sure the pte we are touching is still the correct + * one. In theory this check should not be needed, but + * better safe than sorry. + * Disabling interrupts or holding the mmap lock is enough to + * guarantee that no concurrent updates to the page tables + * are possible. 
+ */ + if (likely(pmd_same(pmdval, pmdp_get_lockless(pmdp)))) + __atomic64_or(_PAGE_UNUSED, (long *)ptep); + spin_unlock(ptl); + } + + pte_unmap(ptep); +} +EXPORT_SYMBOL_GPL(gmap_helper_try_set_pte_unused); + static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr, unsigned long end, struct mm_walk *walk) { From cf3ce3d69d56bd3ef2bc5f6b500ebc5181637161 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:35 +0100 Subject: [PATCH 194/267] KVM: s390: Introduce import_lock Introduce import_lock to avoid future races when converting pages to secure. Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/include/asm/kvm_host.h | 2 ++ arch/s390/kvm/kvm-s390.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index ae1223264d3c..3dbddb7c60a9 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -630,6 +630,8 @@ struct kvm_s390_pv { void *set_aside; struct list_head need_cleanup; struct mmu_notifier mmu_notifier; + /* Protects against concurrent import-like operations */ + struct mutex import_lock; }; struct kvm_arch { diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 56a50524b3ee..cd39b2f099ca 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3330,6 +3330,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) char debug_name[16]; int i, rc; + mutex_init(&kvm->arch.pv.import_lock); + rc = -EINVAL; #ifdef CONFIG_KVM_S390_UCONTROL if (type & ~KVM_VM_S390_UCONTROL) From 3ddee7e651661a2c3cbe12bb36215a64295f8b01 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:36 +0100 Subject: [PATCH 195/267] KVM: s390: Export two functions Export __make_folio_secure() and s390_wiggle_split_folio(), as they will be needed to be used by KVM. Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/include/asm/uv.h | 2 ++ arch/s390/kernel/uv.c | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index 8018549a1ad2..0744874ca6df 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -632,6 +632,8 @@ int uv_destroy_folio(struct folio *folio); int uv_destroy_pte(pte_t pte); int uv_convert_from_secure_pte(pte_t pte); int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header *uvcb); +int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio); +int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb); int uv_convert_from_secure(unsigned long paddr); int uv_convert_from_secure_folio(struct folio *folio); diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index ca0849008c0d..cb4e8089fbca 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -281,7 +281,7 @@ static int expected_folio_refs(struct folio *folio) * (it's the same logic as split_folio()), and the folio must be * locked. */ -static int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) +int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) { int expected, cc = 0; @@ -311,6 +311,7 @@ static int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) return -EAGAIN; return uvcb->rc == 0x10a ? 
-ENXIO : -EINVAL; } +EXPORT_SYMBOL(__make_folio_secure); static int make_folio_secure(struct mm_struct *mm, struct folio *folio, struct uv_cb_header *uvcb) { @@ -339,7 +340,7 @@ static int make_folio_secure(struct mm_struct *mm, struct folio *folio, struct u * but another attempt can be made; * -EINVAL in case of other folio splitting errors. See split_folio(). */ -static int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio) +int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio) { int rc, tried_splits; @@ -411,6 +412,7 @@ static int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio) } return -EAGAIN; } +EXPORT_SYMBOL_GPL(s390_wiggle_split_folio); int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header *uvcb) { From 850103b0ce6de85aec3a8722c2f48c20d822b10c Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:37 +0100 Subject: [PATCH 196/267] s390/mm: Warn if uv_convert_from_secure_pte() fails If uv_convert_from_secure_pte() fails, the page becomes unusable by the host. The failure can only occour in case of hardware malfunction or a serious KVM bug. When the unusable page is reused, the system can have issues and hang. Print a warning to aid debugging such unlikely scenarios. Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/include/asm/pgtable.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 73c30b811b98..04335f5e7f47 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1239,7 +1239,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); /* At this point the reference through the mapping is still present */ if (mm_is_protected(mm) && pte_present(res)) - uv_convert_from_secure_pte(res); + WARN_ON_ONCE(uv_convert_from_secure_pte(res)); return res; } @@ -1257,7 +1257,7 @@ static inline pte_t ptep_clear_flush(struct vm_area_struct *vma, res = ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID)); /* At this point the reference through the mapping is still present */ if (mm_is_protected(vma->vm_mm) && pte_present(res)) - uv_convert_from_secure_pte(res); + WARN_ON_ONCE(uv_convert_from_secure_pte(res)); return res; } @@ -1294,9 +1294,10 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, /* * If something went wrong and the page could not be destroyed, or * if this is not a mm teardown, the slower export is used as - * fallback instead. + * fallback instead. If even that fails, print a warning and leak + * the page, to avoid crashing the whole system. */ - uv_convert_from_secure_pte(res); + WARN_ON_ONCE(uv_convert_from_secure_pte(res)); return res; } From f0e1ca6cc32fd3b1616409354da2637e21ff9a6f Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:38 +0100 Subject: [PATCH 197/267] KVM: s390: vsie: Pass gmap explicitly as parameter Pass the gmap explicitly as parameter, instead of just using vsie_page->gmap. This will be used in upcoming patches. 
Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/vsie.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index b8064c6a4b57..1da1d0f460e1 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -652,7 +652,7 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, * - -EAGAIN if the caller can retry immediately * - -ENOMEM if out of memory */ -static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT; @@ -667,10 +667,9 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* with mso/msl, the prefix lies at offset *mso* */ prefix += scb_s->mso; - rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL); + rc = kvm_s390_shadow_fault(vcpu, sg, prefix, NULL); if (!rc && (scb_s->ecb & ECB_TE)) - rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - prefix + PAGE_SIZE, NULL); + rc = kvm_s390_shadow_fault(vcpu, sg, prefix + PAGE_SIZE, NULL); /* * We don't have to mprotect, we will be called for all unshadows. * SIE will detect if protection applies and trigger a validity. @@ -951,7 +950,7 @@ static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr, * - > 0 if control has to be given to guest 2 * - < 0 if an error occurred */ -static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg) { int rc; @@ -960,8 +959,7 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) return inject_fault(vcpu, PGM_PROTECTION, current->thread.gmap_teid.addr * PAGE_SIZE, 1); - rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - current->thread.gmap_teid.addr * PAGE_SIZE, NULL); + rc = kvm_s390_shadow_fault(vcpu, sg, current->thread.gmap_teid.addr * PAGE_SIZE, NULL); if (rc > 0) { rc = inject_fault(vcpu, rc, current->thread.gmap_teid.addr * PAGE_SIZE, @@ -978,12 +976,10 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * * Will ignore any errors. The next SIE fault will do proper fault handling. 
*/ -static void handle_last_fault(struct kvm_vcpu *vcpu, - struct vsie_page *vsie_page) +static void handle_last_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg) { if (vsie_page->fault_addr) - kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - vsie_page->fault_addr, NULL); + kvm_s390_shadow_fault(vcpu, sg, vsie_page->fault_addr, NULL); vsie_page->fault_addr = 0; } @@ -1065,7 +1061,7 @@ static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, } } -static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; unsigned long pei_dest, pei_src, src, dest, mask, prefix; @@ -1083,8 +1079,8 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask; src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso; - rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest); - rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src); + rc_dest = kvm_s390_shadow_fault(vcpu, sg, dest, &pei_dest); + rc_src = kvm_s390_shadow_fault(vcpu, sg, src, &pei_src); /* * Either everything went well, or something non-critical went wrong * e.g. because of a race. In either case, simply retry. @@ -1144,7 +1140,7 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * - > 0 if control has to be given to guest 2 * - < 0 if an error occurred */ -static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg) __releases(vcpu->kvm->srcu) __acquires(vcpu->kvm->srcu) { @@ -1153,7 +1149,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) int guest_bp_isolation; int rc = 0; - handle_last_fault(vcpu, vsie_page); + handle_last_fault(vcpu, vsie_page, sg); kvm_vcpu_srcu_read_unlock(vcpu); @@ -1191,7 +1187,7 @@ xfer_to_guest_mode_check: goto xfer_to_guest_mode_check; } guest_timing_enter_irqoff(); - rc = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, vsie_page->gmap->asce); + rc = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, sg->asce); guest_timing_exit_irqoff(); local_irq_enable(); } @@ -1215,7 +1211,7 @@ skip_sie: if (rc > 0) rc = 0; /* we could still have an icpt */ else if (current->thread.gmap_int_code) - return handle_fault(vcpu, vsie_page); + return handle_fault(vcpu, vsie_page, sg); switch (scb_s->icptcode) { case ICPT_INST: @@ -1233,7 +1229,7 @@ skip_sie: break; case ICPT_PARTEXEC: if (scb_s->ipa == 0xb254) - rc = vsie_handle_mvpg(vcpu, vsie_page); + rc = vsie_handle_mvpg(vcpu, vsie_page, sg); break; } return rc; @@ -1330,15 +1326,17 @@ static void unregister_shadow_scb(struct kvm_vcpu *vcpu) static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct gmap *sg; int rc = 0; while (1) { rc = acquire_gmap_shadow(vcpu, vsie_page); + sg = vsie_page->gmap; if (!rc) - rc = map_prefix(vcpu, vsie_page); + rc = map_prefix(vcpu, vsie_page, sg); if (!rc) { update_intervention_requests(vsie_page); - rc = do_vsie_run(vcpu, vsie_page); + rc = do_vsie_run(vcpu, vsie_page, sg); } atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20); From a2f2798fa6d18d91766b19a4896bd9fd0654d730 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:39 +0100 Subject: 
[PATCH 198/267] KVM: s390: Enable KVM_GENERIC_MMU_NOTIFIER Enable KVM_GENERIC_MMU_NOTIFIER, for now with empty placeholder callbacks. Also enable KVM_MMU_LOCKLESS_AGING and define KVM_HAVE_MMU_RWLOCK. Acked-by: Christian Borntraeger Reviewed-by: Steffen Eiden Reviewed-by: Christoph Schlameuss Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/include/asm/kvm_host.h | 1 + arch/s390/kvm/Kconfig | 2 ++ arch/s390/kvm/kvm-s390.c | 45 +++++++++++++++++++++++++++++++- 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 3dbddb7c60a9..6ba99870fc32 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -27,6 +27,7 @@ #include #include +#define KVM_HAVE_MMU_RWLOCK #define KVM_MAX_VCPUS 255 #define KVM_INTERNAL_MEM_SLOTS 1 diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index f4ec8c1ce214..917ac740513e 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -30,6 +30,8 @@ config KVM select KVM_VFIO select MMU_NOTIFIER select VIRT_XFER_TO_GUEST_WORK + select KVM_GENERIC_MMU_NOTIFIER + select KVM_MMU_LOCKLESS_AGING help Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. This should work diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index cd39b2f099ca..ec92e6361eab 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -4805,7 +4805,7 @@ try_again: rc = fixup_user_fault(vcpu->arch.gmap->mm, vmaddr, fault_flags, &unlocked); if (!rc) rc = __gmap_link(vcpu->arch.gmap, gaddr, vmaddr); - scoped_guard(spinlock, &vcpu->kvm->mmu_lock) { + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) { kvm_release_faultin_page(vcpu->kvm, page, false, writable); } mmap_read_unlock(vcpu->arch.gmap->mm); @@ -6021,6 +6021,49 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, return; } +/** + * kvm_test_age_gfn() - test young + * @kvm: the kvm instance + * @range: the range of guest addresses whose young status needs to be cleared + * + * Context: called by KVM common code without holding the kvm mmu lock + * Return: true if any page in the given range is young, otherwise 0. + */ +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + return false; +} + +/** + * kvm_age_gfn() - clear young + * @kvm: the kvm instance + * @range: the range of guest addresses whose young status needs to be cleared + * + * Context: called by KVM common code without holding the kvm mmu lock + * Return: true if any page in the given range was young, otherwise 0. + */ +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + return false; +} + +/** + * kvm_unmap_gfn_range() - Unmap a range of guest addresses + * @kvm: the kvm instance + * @range: the range of guest page frames to invalidate + * + * This function always returns false because every DAT table modification + * has to use the appropriate DAT table manipulation instructions, which will + * keep the TLB coherent, hence no additional TLB flush is ever required. 
+ * + * Context: called by KVM common code with the kvm mmu write lock held + * Return: false + */ +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) +{ + return false; +} + static inline unsigned long nonhyp_mask(int i) { unsigned int nonhyp_fai = (sclp.hmfai << i * 2) >> 30; From 2f71db5eb9f8340ee58ae4b4faa6279b3a7f4a0e Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:40 +0100 Subject: [PATCH 199/267] KVM: s390: Rename some functions in gaccess.c Rename some functions in gaccess.c to add a _gva or _gpa suffix to indicate whether the function accepts a virtual or a guest-absolute address. This makes it easier to understand the code. Reviewed-by: Steffen Eiden Reviewed-by: Christoph Schlameuss Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/gaccess.c | 51 +++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index d8347f7cbe51..9df868bddf9a 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -397,7 +397,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val) } /** - * guest_translate - translate a guest virtual into a guest absolute address + * guest_translate_gva() - translate a guest virtual into a guest absolute address * @vcpu: virtual cpu * @gva: guest virtual address * @gpa: points to where guest physical (absolute) address should be stored @@ -417,9 +417,9 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val) * the returned value is the program interruption code as defined * by the architecture */ -static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, - unsigned long *gpa, const union asce asce, - enum gacc_mode mode, enum prot_type *prot) +static unsigned long guest_translate_gva(struct kvm_vcpu *vcpu, unsigned long gva, + unsigned long *gpa, const union asce asce, + enum gacc_mode mode, enum prot_type *prot) { union vaddress vaddr = {.addr = gva}; union raddress raddr = {.addr = gva}; @@ -600,8 +600,8 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu, return 1; } -static int vm_check_access_key(struct kvm *kvm, u8 access_key, - enum gacc_mode mode, gpa_t gpa) +static int vm_check_access_key_gpa(struct kvm *kvm, u8 access_key, + enum gacc_mode mode, gpa_t gpa) { u8 storage_key, access_control; bool fetch_protected; @@ -663,9 +663,9 @@ static bool storage_prot_override_applies(u8 access_control) return access_control == PAGE_SPO_ACC; } -static int vcpu_check_access_key(struct kvm_vcpu *vcpu, u8 access_key, - enum gacc_mode mode, union asce asce, gpa_t gpa, - unsigned long ga, unsigned int len) +static int vcpu_check_access_key_gpa(struct kvm_vcpu *vcpu, u8 access_key, + enum gacc_mode mode, union asce asce, gpa_t gpa, + unsigned long ga, unsigned int len) { u8 storage_key, access_control; unsigned long hva; @@ -757,7 +757,7 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode, PROT_TYPE_LA); if (psw_bits(*psw).dat) { - rc = guest_translate(vcpu, ga, &gpa, asce, mode, &prot); + rc = guest_translate_gva(vcpu, ga, &gpa, asce, mode, &prot); if (rc < 0) return rc; } else { @@ -769,8 +769,7 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, } if (rc) return trans_exc(vcpu, rc, ga, ar, mode, prot); - rc = vcpu_check_access_key(vcpu, access_key, mode, asce, gpa, ga, - fragment_len); + rc = 
vcpu_check_access_key_gpa(vcpu, access_key, mode, asce, gpa, ga, fragment_len); if (rc) return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_KEYC); if (gpas) @@ -782,8 +781,8 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, return 0; } -static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, - void *data, unsigned int len) +static int access_guest_page_gpa(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, + void *data, unsigned int len) { const unsigned int offset = offset_in_page(gpa); const gfn_t gfn = gpa_to_gfn(gpa); @@ -798,9 +797,8 @@ static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, return rc; } -static int -access_guest_page_with_key(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, - void *data, unsigned int len, u8 access_key) +static int access_guest_page_with_key_gpa(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, + void *data, unsigned int len, u8 access_key) { struct kvm_memory_slot *slot; bool writable; @@ -808,7 +806,7 @@ access_guest_page_with_key(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, hva_t hva; int rc; - gfn = gpa >> PAGE_SHIFT; + gfn = gpa_to_gfn(gpa); slot = gfn_to_memslot(kvm, gfn); hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); @@ -841,7 +839,7 @@ int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data, while (min(PAGE_SIZE - offset, len) > 0) { fragment_len = min(PAGE_SIZE - offset, len); - rc = access_guest_page_with_key(kvm, mode, gpa, data, fragment_len, access_key); + rc = access_guest_page_with_key_gpa(kvm, mode, gpa, data, fragment_len, access_key); if (rc) return rc; offset = 0; @@ -901,15 +899,14 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, for (idx = 0; idx < nr_pages; idx++) { fragment_len = min(PAGE_SIZE - offset_in_page(gpas[idx]), len); if (try_fetch_prot_override && fetch_prot_override_applies(ga, fragment_len)) { - rc = access_guest_page(vcpu->kvm, mode, gpas[idx], - data, fragment_len); + rc = access_guest_page_gpa(vcpu->kvm, mode, gpas[idx], data, fragment_len); } else { - rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx], - data, fragment_len, access_key); + rc = access_guest_page_with_key_gpa(vcpu->kvm, mode, gpas[idx], + data, fragment_len, access_key); } if (rc == PGM_PROTECTION && try_storage_prot_override) - rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx], - data, fragment_len, PAGE_SPO_ACC); + rc = access_guest_page_with_key_gpa(vcpu->kvm, mode, gpas[idx], + data, fragment_len, PAGE_SPO_ACC); if (rc) break; len -= fragment_len; @@ -943,7 +940,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, while (len && !rc) { gpa = kvm_s390_real_to_abs(vcpu, gra); fragment_len = min(PAGE_SIZE - offset_in_page(gpa), len); - rc = access_guest_page(vcpu->kvm, mode, gpa, data, fragment_len); + rc = access_guest_page_gpa(vcpu->kvm, mode, gpa, data, fragment_len); len -= fragment_len; gra += fragment_len; data += fragment_len; @@ -1134,7 +1131,7 @@ int check_gpa_range(struct kvm *kvm, unsigned long gpa, unsigned long length, while (length && !rc) { fragment_len = min(PAGE_SIZE - offset_in_page(gpa), length); - rc = vm_check_access_key(kvm, access_key, mode, gpa); + rc = vm_check_access_key_gpa(kvm, access_key, mode, gpa); length -= fragment_len; gpa += fragment_len; } From 5a74e3d934171fe54115d1f504d1a34382ff1aab Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:41 +0100 Subject: [PATCH 200/267] KVM: s390: KVM-specific bitfields and helper functions Add 
KVM-s390 specific bitfields and helper functions to manipulate DAT tables. Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/dat.h | 720 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 720 insertions(+) create mode 100644 arch/s390/kvm/dat.h diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h new file mode 100644 index 000000000000..d5e1a45813bc --- /dev/null +++ b/arch/s390/kvm/dat.h @@ -0,0 +1,720 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KVM guest address space mapping code + * + * Copyright IBM Corp. 2024, 2025 + * Author(s): Claudio Imbrenda + */ + +#ifndef __KVM_S390_DAT_H +#define __KVM_S390_DAT_H + +#include +#include +#include +#include +#include +#include +#include + +#define _ASCE(x) ((union asce) { .val = (x), }) +#define NULL_ASCE _ASCE(0) + +enum { + _DAT_TOKEN_NONE = 0, + _DAT_TOKEN_PIC, +}; + +#define _CRSTE_TOK(l, t, p) ((union crste) { \ + .tok.i = 1, \ + .tok.tt = (l), \ + .tok.type = (t), \ + .tok.par = (p) \ + }) +#define _CRSTE_PIC(l, p) _CRSTE_TOK(l, _DAT_TOKEN_PIC, p) + +#define _CRSTE_HOLE(l) _CRSTE_PIC(l, PGM_ADDRESSING) +#define _CRSTE_EMPTY(l) _CRSTE_TOK(l, _DAT_TOKEN_NONE, 0) + +#define _PMD_EMPTY _CRSTE_EMPTY(TABLE_TYPE_SEGMENT) + +#define _PTE_TOK(t, p) ((union pte) { .tok.i = 1, .tok.type = (t), .tok.par = (p) }) +#define _PTE_EMPTY _PTE_TOK(_DAT_TOKEN_NONE, 0) + +/* This fake table type is used for page table walks (both for normal page tables and vSIE) */ +#define TABLE_TYPE_PAGE_TABLE -1 + +enum dat_walk_flags { + DAT_WALK_CONTINUE = 0x20, + DAT_WALK_IGN_HOLES = 0x10, + DAT_WALK_SPLIT = 0x08, + DAT_WALK_ALLOC = 0x04, + DAT_WALK_ANY = 0x02, + DAT_WALK_LEAF = 0x01, + DAT_WALK_DEFAULT = 0 +}; + +#define DAT_WALK_SPLIT_ALLOC (DAT_WALK_SPLIT | DAT_WALK_ALLOC) +#define DAT_WALK_ALLOC_CONTINUE (DAT_WALK_CONTINUE | DAT_WALK_ALLOC) +#define DAT_WALK_LEAF_ALLOC (DAT_WALK_LEAF | DAT_WALK_ALLOC) + +union pte { + unsigned long val; + union page_table_entry h; + struct { + unsigned long :56; /* Hardware bits */ + unsigned long u : 1; /* Page unused */ + unsigned long s : 1; /* Special */ + unsigned long w : 1; /* Writable */ + unsigned long r : 1; /* Readable */ + unsigned long d : 1; /* Dirty */ + unsigned long y : 1; /* Young */ + unsigned long sd: 1; /* Soft dirty */ + unsigned long pr: 1; /* Present */ + } s; + struct { + unsigned char hwbytes[7]; + unsigned char swbyte; + }; + union { + struct { + unsigned long type :16; /* Token type */ + unsigned long par :16; /* Token parameter */ + unsigned long :20; + unsigned long : 1; /* Must be 0 */ + unsigned long i : 1; /* Must be 1 */ + unsigned long : 2; + unsigned long : 7; + unsigned long pr : 1; /* Must be 0 */ + }; + struct { + unsigned long token:32; /* Token and parameter */ + unsigned long :32; + }; + } tok; +}; + +/* Soft dirty, needed as macro for atomic operations on ptes */ +#define _PAGE_SD 0x002 + +/* Needed as macro to perform atomic operations */ +#define PGSTE_CMMA_D_BIT 0x0000000000008000UL /* CMMA dirty soft-bit */ + +enum pgste_gps_usage { + PGSTE_GPS_USAGE_STABLE = 0, + PGSTE_GPS_USAGE_UNUSED, + PGSTE_GPS_USAGE_POT_VOLATILE, + PGSTE_GPS_USAGE_VOLATILE, +}; + +union pgste { + unsigned long val; + struct { + unsigned long acc : 4; + unsigned long fp : 1; + unsigned long : 3; + unsigned long pcl : 1; + unsigned long hr : 1; + unsigned long hc : 1; + unsigned long : 2; + unsigned long gr : 1; + unsigned long gc : 1; + unsigned long : 1; + unsigned long :16; /* val16 */ + unsigned long zero : 1; + unsigned long nodat : 1; + unsigned long : 4; + unsigned 
long usage : 2; + unsigned long : 8; + unsigned long cmma_d : 1; /* Dirty flag for CMMA bits */ + unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ + unsigned long vsie_notif : 1; /* Referenced in a shadow table */ + unsigned long : 5; + unsigned long : 8; + }; + struct { + unsigned short hwbytes0; + unsigned short val16; /* Used to store chunked values, see dat_{s,g}et_ptval() */ + unsigned short hwbytes4; + unsigned char flags; /* Maps to the software bits */ + unsigned char hwbyte7; + } __packed; +}; + +union pmd { + unsigned long val; + union segment_table_entry h; + struct { + struct { + unsigned long :44; /* HW */ + unsigned long : 3; /* Unused */ + unsigned long : 1; /* HW */ + unsigned long w : 1; /* Writable soft-bit */ + unsigned long r : 1; /* Readable soft-bit */ + unsigned long d : 1; /* Dirty */ + unsigned long y : 1; /* Young */ + unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ + unsigned long : 3; /* HW */ + unsigned long vsie_notif : 1; /* Referenced in a shadow table */ + unsigned long : 1; /* Unused */ + unsigned long : 4; /* HW */ + unsigned long sd : 1; /* Soft-Dirty */ + unsigned long pr : 1; /* Present */ + } fc1; + } s; +}; + +union pud { + unsigned long val; + union region3_table_entry h; + struct { + struct { + unsigned long :33; /* HW */ + unsigned long :14; /* Unused */ + unsigned long : 1; /* HW */ + unsigned long w : 1; /* Writable soft-bit */ + unsigned long r : 1; /* Readable soft-bit */ + unsigned long d : 1; /* Dirty */ + unsigned long y : 1; /* Young */ + unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ + unsigned long : 3; /* HW */ + unsigned long vsie_notif : 1; /* Referenced in a shadow table */ + unsigned long : 1; /* Unused */ + unsigned long : 4; /* HW */ + unsigned long sd : 1; /* Soft-Dirty */ + unsigned long pr : 1; /* Present */ + } fc1; + } s; +}; + +union p4d { + unsigned long val; + union region2_table_entry h; +}; + +union pgd { + unsigned long val; + union region1_table_entry h; +}; + +union crste { + unsigned long val; + union { + struct { + unsigned long :52; + unsigned long : 1; + unsigned long fc: 1; + unsigned long p : 1; + unsigned long : 1; + unsigned long : 2; + unsigned long i : 1; + unsigned long : 1; + unsigned long tt: 2; + unsigned long : 2; + }; + struct { + unsigned long to:52; + unsigned long : 1; + unsigned long fc: 1; + unsigned long p : 1; + unsigned long : 1; + unsigned long tf: 2; + unsigned long i : 1; + unsigned long : 1; + unsigned long tt: 2; + unsigned long tl: 2; + } fc0; + struct { + unsigned long :47; + unsigned long av : 1; /* ACCF-Validity Control */ + unsigned long acc: 4; /* Access-Control Bits */ + unsigned long f : 1; /* Fetch-Protection Bit */ + unsigned long fc : 1; /* Format-Control */ + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long iep: 1; /* Instruction-Execution-Protection */ + unsigned long : 2; + unsigned long i : 1; /* Segment-Invalid Bit */ + unsigned long cs : 1; /* Common-Segment Bit */ + unsigned long tt : 2; /* Table-Type Bits */ + unsigned long : 2; + } fc1; + } h; + struct { + struct { + unsigned long :47; + unsigned long : 1; /* HW (should be 0) */ + unsigned long w : 1; /* Writable */ + unsigned long r : 1; /* Readable */ + unsigned long d : 1; /* Dirty */ + unsigned long y : 1; /* Young */ + unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ + unsigned long : 3; /* HW */ + unsigned long vsie_notif : 1; /* Referenced in a shadow table */ + unsigned long : 1; + 
unsigned long : 4; /* HW */ + unsigned long sd : 1; /* Soft-Dirty */ + unsigned long pr : 1; /* Present */ + } fc1; + } s; + union { + struct { + unsigned long type :16; /* Token type */ + unsigned long par :16; /* Token parameter */ + unsigned long :26; + unsigned long i : 1; /* Must be 1 */ + unsigned long : 1; + unsigned long tt : 2; + unsigned long : 1; + unsigned long pr : 1; /* Must be 0 */ + }; + struct { + unsigned long token:32; /* Token and parameter */ + unsigned long :32; + }; + } tok; + union pmd pmd; + union pud pud; + union p4d p4d; + union pgd pgd; +}; + +union skey { + unsigned char skey; + struct { + unsigned char acc :4; + unsigned char fp :1; + unsigned char r :1; + unsigned char c :1; + unsigned char zero:1; + }; +}; + +static_assert(sizeof(union pgste) == sizeof(unsigned long)); +static_assert(sizeof(union pte) == sizeof(unsigned long)); +static_assert(sizeof(union pmd) == sizeof(unsigned long)); +static_assert(sizeof(union pud) == sizeof(unsigned long)); +static_assert(sizeof(union p4d) == sizeof(unsigned long)); +static_assert(sizeof(union pgd) == sizeof(unsigned long)); +static_assert(sizeof(union crste) == sizeof(unsigned long)); +static_assert(sizeof(union skey) == sizeof(char)); + +struct segment_table { + union pmd pmds[_CRST_ENTRIES]; +}; + +struct region3_table { + union pud puds[_CRST_ENTRIES]; +}; + +struct region2_table { + union p4d p4ds[_CRST_ENTRIES]; +}; + +struct region1_table { + union pgd pgds[_CRST_ENTRIES]; +}; + +struct crst_table { + union { + union crste crstes[_CRST_ENTRIES]; + struct segment_table segment; + struct region3_table region3; + struct region2_table region2; + struct region1_table region1; + }; +}; + +struct page_table { + union pte ptes[_PAGE_ENTRIES]; + union pgste pgstes[_PAGE_ENTRIES]; +}; + +static_assert(sizeof(struct crst_table) == _CRST_TABLE_SIZE); +static_assert(sizeof(struct page_table) == PAGE_SIZE); + +/** + * _pte() - Useful constructor for union pte + * @pfn: the pfn this pte should point to. + * @writable: whether the pte should be writable. + * @dirty: whether the pte should be dirty. + * @special: whether the pte should be marked as special + * + * The pte is also marked as young and present. If the pte is marked as dirty, + * it gets marked as soft-dirty too. If the pte is not dirty, the hardware + * protect bit is set (independently of the write softbit); this way proper + * dirty tracking can be performed. + * + * Return: a union pte value. + */ +static inline union pte _pte(kvm_pfn_t pfn, bool writable, bool dirty, bool special) +{ + union pte res = { .val = PFN_PHYS(pfn) }; + + res.h.p = !dirty; + res.s.y = 1; + res.s.pr = 1; + res.s.w = writable; + res.s.d = dirty; + res.s.sd = dirty; + res.s.s = special; + return res; +} + +static inline union crste _crste_fc0(kvm_pfn_t pfn, int tt) +{ + union crste res = { .val = PFN_PHYS(pfn) }; + + res.h.tt = tt; + res.h.fc0.tl = _REGION_ENTRY_LENGTH; + res.h.fc0.tf = 0; + return res; +} + +/** + * _crste() - Useful constructor for union crste with FC=1 + * @pfn: the pfn this pte should point to. + * @tt: the table type + * @writable: whether the pte should be writable. + * @dirty: whether the pte should be dirty. + * + * The crste is also marked as young and present. If the crste is marked as + * dirty, it gets marked as soft-dirty too. If the crste is not dirty, the + * hardware protect bit is set (independently of the write softbit); this way + * proper dirty tracking can be performed. + * + * Return: a union crste value. 
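+ * + * For example, _crste_fc1(pfn, TABLE_TYPE_SEGMENT, true, false) yields a present, young, writable but clean segment entry with the hardware protection bit set, so that the first write can be tracked.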
+ */ +static inline union crste _crste_fc1(kvm_pfn_t pfn, int tt, bool writable, bool dirty) +{ + union crste res = { .val = PFN_PHYS(pfn) & _SEGMENT_MASK }; + + res.h.tt = tt; + res.h.p = !dirty; + res.h.fc = 1; + res.s.fc1.y = 1; + res.s.fc1.pr = 1; + res.s.fc1.w = writable; + res.s.fc1.d = dirty; + res.s.fc1.sd = dirty; + return res; +} + +/** + * struct vsie_rmap - reverse mapping for shadow page table entries + * @next: pointer to next rmap in the list + * @r_gfn: virtual rmap address in the shadow guest address space + */ +struct vsie_rmap { + struct vsie_rmap *next; + union { + unsigned long val; + struct { + long level: 8; + unsigned long : 4; + unsigned long r_gfn:52; + }; + }; +}; + +static_assert(sizeof(struct vsie_rmap) == 2 * sizeof(long)); + +static inline struct crst_table *crste_table_start(union crste *crstep) +{ + return (struct crst_table *)ALIGN_DOWN((unsigned long)crstep, _CRST_TABLE_SIZE); +} + +static inline struct page_table *pte_table_start(union pte *ptep) +{ + return (struct page_table *)ALIGN_DOWN((unsigned long)ptep, _PAGE_TABLE_SIZE); +} + +static inline bool crdte_crste(union crste *crstep, union crste old, union crste new, gfn_t gfn, + union asce asce) +{ + unsigned long dtt = 0x10 | new.h.tt << 2; + void *table = crste_table_start(crstep); + + return crdte(old.val, new.val, table, dtt, gfn_to_gpa(gfn), asce.val); +} + +/** + * idte_crste() - invalidate a crste entry using idte + * @crstep: pointer to the crste to be invalidated + * @gfn: a gfn mapped by the crste + * @opt: options for the idte instruction + * @asce: the asce + * @local: whether the operation is cpu-local + */ +static __always_inline void idte_crste(union crste *crstep, gfn_t gfn, unsigned long opt, + union asce asce, int local) +{ + unsigned long table_origin = __pa(crste_table_start(crstep)); + unsigned long gaddr = gfn_to_gpa(gfn) & HPAGE_MASK; + + if (__builtin_constant_p(opt) && opt == 0) { + /* flush without guest asce */ + asm volatile("idte %[table_origin],0,%[gaddr],%[local]" + : "+m" (*crstep) + : [table_origin] "a" (table_origin), [gaddr] "a" (gaddr), + [local] "i" (local) + : "cc"); + } else { + /* flush with guest asce */ + asm volatile("idte %[table_origin],%[asce],%[gaddr_opt],%[local]" + : "+m" (*crstep) + : [table_origin] "a" (table_origin), [gaddr_opt] "a" (gaddr | opt), + [asce] "a" (asce.val), [local] "i" (local) + : "cc"); + } +} + +static inline void dat_init_pgstes(struct page_table *pt, unsigned long val) +{ + memset64((void *)pt->pgstes, val, PTRS_PER_PTE); +} + +static inline void dat_init_page_table(struct page_table *pt, unsigned long ptes, + unsigned long pgstes) +{ + memset64((void *)pt->ptes, ptes, PTRS_PER_PTE); + dat_init_pgstes(pt, pgstes); +} + +static inline gfn_t asce_end(union asce asce) +{ + return 1ULL << ((asce.dt + 1) * 11 + _SEGMENT_SHIFT - PAGE_SHIFT); +} + +#define _CRSTE(x) ((union crste) { .val = _Generic((x), \ + union pgd : (x).val, \ + union p4d : (x).val, \ + union pud : (x).val, \ + union pmd : (x).val, \ + union crste : (x).val)}) + +#define _CRSTEP(x) ((union crste *)_Generic((*(x)), \ + union pgd : (x), \ + union p4d : (x), \ + union pud : (x), \ + union pmd : (x), \ + union crste : (x))) + +#define _CRSTP(x) ((struct crst_table *)_Generic((*(x)), \ + struct crst_table : (x), \ + struct segment_table : (x), \ + struct region3_table : (x), \ + struct region2_table : (x), \ + struct region1_table : (x))) + +static inline bool asce_contains_gfn(union asce asce, gfn_t gfn) +{ + return gfn < asce_end(asce); +} + +static inline bool 
is_pmd(union crste crste) +{ + return crste.h.tt == TABLE_TYPE_SEGMENT; +} + +static inline bool is_pud(union crste crste) +{ + return crste.h.tt == TABLE_TYPE_REGION3; +} + +static inline bool is_p4d(union crste crste) +{ + return crste.h.tt == TABLE_TYPE_REGION2; +} + +static inline bool is_pgd(union crste crste) +{ + return crste.h.tt == TABLE_TYPE_REGION1; +} + +static inline phys_addr_t pmd_origin_large(union pmd pmd) +{ + return pmd.val & _SEGMENT_ENTRY_ORIGIN_LARGE; +} + +static inline phys_addr_t pud_origin_large(union pud pud) +{ + return pud.val & _REGION3_ENTRY_ORIGIN_LARGE; +} + +/** + * crste_origin_large() - Return the large frame origin of a large crste + * @crste: The crste whose origin is to be returned. Should be either a + * region-3 table entry or a segment table entry, in both cases with + * FC set to 1 (large pages). + * + * Return: The origin of the large frame pointed to by @crste, or -1 if the + * crste was not large (wrong table type, or FC==0) + */ +static inline phys_addr_t crste_origin_large(union crste crste) +{ + if (unlikely(!crste.h.fc || crste.h.tt > TABLE_TYPE_REGION3)) + return -1; + if (is_pmd(crste)) + return pmd_origin_large(crste.pmd); + return pud_origin_large(crste.pud); +} + +#define crste_origin(x) (_Generic((x), \ + union pmd : (x).val & _SEGMENT_ENTRY_ORIGIN, \ + union pud : (x).val & _REGION_ENTRY_ORIGIN, \ + union p4d : (x).val & _REGION_ENTRY_ORIGIN, \ + union pgd : (x).val & _REGION_ENTRY_ORIGIN)) + +static inline unsigned long pte_origin(union pte pte) +{ + return pte.val & PAGE_MASK; +} + +static inline bool pmd_prefix(union pmd pmd) +{ + return pmd.h.fc && pmd.s.fc1.prefix_notif; +} + +static inline bool pud_prefix(union pud pud) +{ + return pud.h.fc && pud.s.fc1.prefix_notif; +} + +static inline bool crste_leaf(union crste crste) +{ + return (crste.h.tt <= TABLE_TYPE_REGION3) && crste.h.fc; +} + +static inline bool crste_prefix(union crste crste) +{ + return crste_leaf(crste) && crste.s.fc1.prefix_notif; +} + +static inline bool crste_dirty(union crste crste) +{ + return crste_leaf(crste) && crste.s.fc1.d; +} + +static inline union pgste *pgste_of(union pte *pte) +{ + return (union pgste *)(pte + _PAGE_ENTRIES); +} + +static inline bool pte_hole(union pte pte) +{ + return pte.h.i && !pte.tok.pr && pte.tok.type != _DAT_TOKEN_NONE; +} + +static inline bool _crste_hole(union crste crste) +{ + return crste.h.i && !crste.tok.pr && crste.tok.type != _DAT_TOKEN_NONE; +} + +#define crste_hole(x) _crste_hole(_CRSTE(x)) + +static inline bool _crste_none(union crste crste) +{ + return crste.h.i && !crste.tok.pr && crste.tok.type == _DAT_TOKEN_NONE; +} + +#define crste_none(x) _crste_none(_CRSTE(x)) + +static inline phys_addr_t large_pud_to_phys(union pud pud, gfn_t gfn) +{ + return pud_origin_large(pud) | (gfn_to_gpa(gfn) & ~_REGION3_MASK); +} + +static inline phys_addr_t large_pmd_to_phys(union pmd pmd, gfn_t gfn) +{ + return pmd_origin_large(pmd) | (gfn_to_gpa(gfn) & ~_SEGMENT_MASK); +} + +static inline phys_addr_t large_crste_to_phys(union crste crste, gfn_t gfn) +{ + if (unlikely(!crste.h.fc || crste.h.tt > TABLE_TYPE_REGION3)) + return -1; + if (is_pmd(crste)) + return large_pmd_to_phys(crste.pmd, gfn); + return large_pud_to_phys(crste.pud, gfn); +} + +static inline bool cspg_crste(union crste *crstep, union crste old, union crste new) +{ + return cspg(&crstep->val, old.val, new.val); +} + +static inline struct page_table *dereference_pmd(union pmd pmd) +{ + return phys_to_virt(crste_origin(pmd)); +} + +static inline struct segment_table 
*dereference_pud(union pud pud) +{ + return phys_to_virt(crste_origin(pud)); +} + +static inline struct region3_table *dereference_p4d(union p4d p4d) +{ + return phys_to_virt(crste_origin(p4d)); +} + +static inline struct region2_table *dereference_pgd(union pgd pgd) +{ + return phys_to_virt(crste_origin(pgd)); +} + +static inline struct crst_table *_dereference_crste(union crste crste) +{ + if (unlikely(is_pmd(crste))) + return NULL; + return phys_to_virt(crste_origin(crste.pud)); +} + +#define dereference_crste(x) (_Generic((x), \ + union pud : _dereference_crste(_CRSTE(x)), \ + union p4d : _dereference_crste(_CRSTE(x)), \ + union pgd : _dereference_crste(_CRSTE(x)), \ + union crste : _dereference_crste(_CRSTE(x)))) + +static inline struct crst_table *dereference_asce(union asce asce) +{ + return phys_to_virt(asce.val & _ASCE_ORIGIN); +} + +static inline void asce_flush_tlb(union asce asce) +{ + __tlb_flush_idte(asce.val); +} + +static inline bool pgste_get_trylock(union pte *ptep, union pgste *res) +{ + union pgste *pgstep = pgste_of(ptep); + union pgste old_pgste; + + if (READ_ONCE(pgstep->val) & PGSTE_PCL_BIT) + return false; + old_pgste.val = __atomic64_or_barrier(PGSTE_PCL_BIT, &pgstep->val); + if (old_pgste.pcl) + return false; + old_pgste.pcl = 1; + *res = old_pgste; + return true; +} + +static inline union pgste pgste_get_lock(union pte *ptep) +{ + union pgste res; + + while (!pgste_get_trylock(ptep, &res)) + cpu_relax(); + return res; +} + +static inline void pgste_set_unlock(union pte *ptep, union pgste pgste) +{ + pgste.pcl = 0; + barrier(); + WRITE_ONCE(*pgste_of(ptep), pgste); +} + +#endif /* __KVM_S390_DAT_H */ From 12f2f61a9e1a3e54f57432c0f9d63ba7299993e1 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:42 +0100 Subject: [PATCH 201/267] KVM: s390: KVM page table management functions: allocation Add page table management functions to be used for KVM guest (gmap) page tables. This patch adds the boilerplate and functions for the allocation and deallocation of DAT tables. Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/Makefile | 1 + arch/s390/kvm/dat.c | 103 +++++++++++++++++++++++++++++++++++++ arch/s390/kvm/dat.h | 77 +++++++++++++++++++++++++++ arch/s390/mm/page-states.c | 1 + 4 files changed, 182 insertions(+) create mode 100644 arch/s390/kvm/dat.c diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 9a723c48b05a..84315d2f75fb 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -9,6 +9,7 @@ ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap-vsie.o +kvm-y += dat.o kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c new file mode 100644 index 000000000000..c324a27f379f --- /dev/null +++ b/arch/s390/kvm/dat.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM guest address space mapping code + * + * Copyright IBM Corp. 
2007, 2020, 2024 + * Author(s): Claudio Imbrenda + * Martin Schwidefsky + * David Hildenbrand + * Janosch Frank + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "dat.h" + +int kvm_s390_mmu_cache_topup(struct kvm_s390_mmu_cache *mc) +{ + void *o; + + for ( ; mc->n_crsts < KVM_S390_MMU_CACHE_N_CRSTS; mc->n_crsts++) { + o = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_COMP, CRST_ALLOC_ORDER); + if (!o) + return -ENOMEM; + mc->crsts[mc->n_crsts] = o; + } + for ( ; mc->n_pts < KVM_S390_MMU_CACHE_N_PTS; mc->n_pts++) { + o = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); + if (!o) + return -ENOMEM; + mc->pts[mc->n_pts] = o; + } + for ( ; mc->n_rmaps < KVM_S390_MMU_CACHE_N_RMAPS; mc->n_rmaps++) { + o = kzalloc(sizeof(*mc->rmaps[0]), GFP_KERNEL_ACCOUNT); + if (!o) + return -ENOMEM; + mc->rmaps[mc->n_rmaps] = o; + } + return 0; +} + +static inline struct page_table *dat_alloc_pt_noinit(struct kvm_s390_mmu_cache *mc) +{ + struct page_table *res; + + res = kvm_s390_mmu_cache_alloc_pt(mc); + if (res) + __arch_set_page_dat(res, 1); + return res; +} + +static inline struct crst_table *dat_alloc_crst_noinit(struct kvm_s390_mmu_cache *mc) +{ + struct crst_table *res; + + res = kvm_s390_mmu_cache_alloc_crst(mc); + if (res) + __arch_set_page_dat(res, 1UL << CRST_ALLOC_ORDER); + return res; +} + +struct crst_table *dat_alloc_crst_sleepable(unsigned long init) +{ + struct page *page; + void *virt; + + page = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_COMP, CRST_ALLOC_ORDER); + if (!page) + return NULL; + virt = page_to_virt(page); + __arch_set_page_dat(virt, 1UL << CRST_ALLOC_ORDER); + crst_table_init(virt, init); + return virt; +} + +void dat_free_level(struct crst_table *table, bool owns_ptes) +{ + unsigned int i; + + for (i = 0; i < _CRST_ENTRIES; i++) { + if (table->crstes[i].h.fc || table->crstes[i].h.i) + continue; + if (!is_pmd(table->crstes[i])) + dat_free_level(dereference_crste(table->crstes[i]), owns_ptes); + else if (owns_ptes) + dat_free_pt(dereference_pmd(table->crstes[i].pmd)); + } + dat_free_crst(table); +} diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h index d5e1a45813bc..a053f0d49bae 100644 --- a/arch/s390/kvm/dat.h +++ b/arch/s390/kvm/dat.h @@ -418,6 +418,46 @@ struct vsie_rmap { static_assert(sizeof(struct vsie_rmap) == 2 * sizeof(long)); +#define KVM_S390_MMU_CACHE_N_CRSTS 6 +#define KVM_S390_MMU_CACHE_N_PTS 2 +#define KVM_S390_MMU_CACHE_N_RMAPS 16 +struct kvm_s390_mmu_cache { + void *crsts[KVM_S390_MMU_CACHE_N_CRSTS]; + void *pts[KVM_S390_MMU_CACHE_N_PTS]; + void *rmaps[KVM_S390_MMU_CACHE_N_RMAPS]; + short int n_crsts; + short int n_pts; + short int n_rmaps; +}; + +void dat_free_level(struct crst_table *table, bool owns_ptes); +struct crst_table *dat_alloc_crst_sleepable(unsigned long init); + +int kvm_s390_mmu_cache_topup(struct kvm_s390_mmu_cache *mc); + +#define GFP_KVM_S390_MMU_CACHE (GFP_ATOMIC | __GFP_ACCOUNT | __GFP_NOWARN) + +static inline struct page_table *kvm_s390_mmu_cache_alloc_pt(struct kvm_s390_mmu_cache *mc) +{ + if (mc->n_pts) + return mc->pts[--mc->n_pts]; + return (void *)__get_free_page(GFP_KVM_S390_MMU_CACHE); +} + +static inline struct crst_table *kvm_s390_mmu_cache_alloc_crst(struct kvm_s390_mmu_cache *mc) +{ + if (mc->n_crsts) + return mc->crsts[--mc->n_crsts]; + return (void *)__get_free_pages(GFP_KVM_S390_MMU_CACHE | __GFP_COMP, CRST_ALLOC_ORDER); +} + +static inline struct vsie_rmap *kvm_s390_mmu_cache_alloc_rmap(struct 
kvm_s390_mmu_cache *mc) +{ + if (mc->n_rmaps) + return mc->rmaps[--mc->n_rmaps]; + return kzalloc(sizeof(struct vsie_rmap), GFP_KVM_S390_MMU_CACHE); +} + static inline struct crst_table *crste_table_start(union crste *crstep) { return (struct crst_table *)ALIGN_DOWN((unsigned long)crstep, _CRST_TABLE_SIZE); @@ -717,4 +757,41 @@ static inline void pgste_set_unlock(union pte *ptep, union pgste pgste) WRITE_ONCE(*pgste_of(ptep), pgste); } +static inline void dat_free_pt(struct page_table *pt) +{ + free_page((unsigned long)pt); +} + +static inline void _dat_free_crst(struct crst_table *table) +{ + free_pages((unsigned long)table, CRST_ALLOC_ORDER); +} + +#define dat_free_crst(x) _dat_free_crst(_CRSTP(x)) + +static inline void kvm_s390_free_mmu_cache(struct kvm_s390_mmu_cache *mc) +{ + if (!mc) + return; + while (mc->n_pts) + dat_free_pt(mc->pts[--mc->n_pts]); + while (mc->n_crsts) + _dat_free_crst(mc->crsts[--mc->n_crsts]); + while (mc->n_rmaps) + kfree(mc->rmaps[--mc->n_rmaps]); + kfree(mc); +} + +DEFINE_FREE(kvm_s390_mmu_cache, struct kvm_s390_mmu_cache *, if (_T) kvm_s390_free_mmu_cache(_T)) + +static inline struct kvm_s390_mmu_cache *kvm_s390_new_mmu_cache(void) +{ + struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL; + + mc = kzalloc(sizeof(*mc), GFP_KERNEL_ACCOUNT); + if (mc && !kvm_s390_mmu_cache_topup(mc)) + return_ptr(mc); + return NULL; +} + #endif /* __KVM_S390_DAT_H */ diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index 01f9b39e65f5..5bee173db72e 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -13,6 +13,7 @@ #include int __bootdata_preserved(cmma_flag); +EXPORT_SYMBOL(cmma_flag); void arch_free_page(struct page *page, int order) { From 589071eaaa8f8459d24584d54fbe6db5a0be27a5 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:43 +0100 Subject: [PATCH 202/267] KVM: s390: KVM page table management functions: clear and replace Add page table management functions to be used for KVM guest (gmap) page tables. This patch adds functions to clear, replace or exchange DAT table entries. Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/dat.c | 115 ++++++++++++++++++++++++++++++++++++++++++++ arch/s390/kvm/dat.h | 40 +++++++++++++++ 2 files changed, 155 insertions(+) diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c index c324a27f379f..e38b1a139fbb 100644 --- a/arch/s390/kvm/dat.c +++ b/arch/s390/kvm/dat.c @@ -101,3 +101,118 @@ void dat_free_level(struct crst_table *table, bool owns_ptes) } dat_free_crst(table); } + +/** + * dat_crstep_xchg() - Exchange a gmap CRSTE with another. + * @crstep: Pointer to the CRST entry + * @new: Replacement entry. + * @gfn: The affected guest address. + * @asce: The ASCE of the address space. + * + * Context: This function is assumed to be called with kvm->mmu_lock held. + */ +void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce) +{ + if (crstep->h.i) { + WRITE_ONCE(*crstep, new); + return; + } else if (cpu_has_edat2()) { + crdte_crste(crstep, *crstep, new, gfn, asce); + return; + } + + if (machine_has_tlb_guest()) + idte_crste(crstep, gfn, IDTE_GUEST_ASCE, asce, IDTE_GLOBAL); + else + idte_crste(crstep, gfn, 0, NULL_ASCE, IDTE_GLOBAL); + WRITE_ONCE(*crstep, new); +} + +/** + * dat_crstep_xchg_atomic() - Atomically exchange a gmap CRSTE with another. + * @crstep: Pointer to the CRST entry. + * @old: Expected old value. + * @new: Replacement entry. + * @gfn: The affected guest address. 
+ * @asce: The asce of the address space. + * + * This function is needed to atomically exchange a CRSTE that potentially + * maps a prefix area, without having to invalidate it inbetween. + * + * Context: This function is assumed to be called with kvm->mmu_lock held. + * + * Return: %true if the exchange was successful. + */ +bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, gfn_t gfn, + union asce asce) +{ + if (old.h.i) + return arch_try_cmpxchg((long *)crstep, &old.val, new.val); + if (cpu_has_edat2()) + return crdte_crste(crstep, old, new, gfn, asce); + return cspg_crste(crstep, old, new); +} + +static void dat_set_storage_key_from_pgste(union pte pte, union pgste pgste) +{ + union skey nkey = { .acc = pgste.acc, .fp = pgste.fp }; + + page_set_storage_key(pte_origin(pte), nkey.skey, 0); +} + +static void dat_move_storage_key(union pte old, union pte new) +{ + page_set_storage_key(pte_origin(new), page_get_storage_key(pte_origin(old)), 1); +} + +static union pgste dat_save_storage_key_into_pgste(union pte pte, union pgste pgste) +{ + union skey skey; + + skey.skey = page_get_storage_key(pte_origin(pte)); + + pgste.acc = skey.acc; + pgste.fp = skey.fp; + pgste.gr |= skey.r; + pgste.gc |= skey.c; + + return pgste; +} + +union pgste __dat_ptep_xchg(union pte *ptep, union pgste pgste, union pte new, gfn_t gfn, + union asce asce, bool uses_skeys) +{ + union pte old = READ_ONCE(*ptep); + + /* Updating only the software bits while holding the pgste lock. */ + if (!((ptep->val ^ new.val) & ~_PAGE_SW_BITS)) { + WRITE_ONCE(ptep->swbyte, new.swbyte); + return pgste; + } + + if (!old.h.i) { + unsigned long opts = IPTE_GUEST_ASCE | (pgste.nodat ? IPTE_NODAT : 0); + + if (machine_has_tlb_guest()) + __ptep_ipte(gfn_to_gpa(gfn), (void *)ptep, opts, asce.val, IPTE_GLOBAL); + else + __ptep_ipte(gfn_to_gpa(gfn), (void *)ptep, 0, 0, IPTE_GLOBAL); + } + + if (uses_skeys) { + if (old.h.i && !new.h.i) + /* Invalid to valid: restore storage keys from PGSTE. */ + dat_set_storage_key_from_pgste(new, pgste); + else if (!old.h.i && new.h.i) + /* Valid to invalid: save storage keys to PGSTE. */ + pgste = dat_save_storage_key_into_pgste(old, pgste); + else if (!old.h.i && !new.h.i) + /* Valid to valid: move storage keys. */ + if (old.h.pfra != new.h.pfra) + dat_move_storage_key(old, new); + /* Invalid to invalid: nothing to do. 
*/ + } + + WRITE_ONCE(*ptep, new); + return pgste; +} diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h index a053f0d49bae..ee070d18bd36 100644 --- a/arch/s390/kvm/dat.h +++ b/arch/s390/kvm/dat.h @@ -430,6 +430,12 @@ struct kvm_s390_mmu_cache { short int n_rmaps; }; +union pgste __must_check __dat_ptep_xchg(union pte *ptep, union pgste pgste, union pte new, + gfn_t gfn, union asce asce, bool uses_skeys); +bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, gfn_t gfn, + union asce asce); +void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce); + void dat_free_level(struct crst_table *table, bool owns_ptes); struct crst_table *dat_alloc_crst_sleepable(unsigned long init); @@ -757,6 +763,21 @@ static inline void pgste_set_unlock(union pte *ptep, union pgste pgste) WRITE_ONCE(*pgste_of(ptep), pgste); } +static inline void dat_ptep_xchg(union pte *ptep, union pte new, gfn_t gfn, union asce asce, + bool has_skeys) +{ + union pgste pgste; + + pgste = pgste_get_lock(ptep); + pgste = __dat_ptep_xchg(ptep, pgste, new, gfn, asce, has_skeys); + pgste_set_unlock(ptep, pgste); +} + +static inline void dat_ptep_clear(union pte *ptep, gfn_t gfn, union asce asce, bool has_skeys) +{ + dat_ptep_xchg(ptep, _PTE_EMPTY, gfn, asce, has_skeys); +} + static inline void dat_free_pt(struct page_table *pt) { free_page((unsigned long)pt); @@ -794,4 +815,23 @@ static inline struct kvm_s390_mmu_cache *kvm_s390_new_mmu_cache(void) return NULL; } +static inline bool dat_pmdp_xchg_atomic(union pmd *pmdp, union pmd old, union pmd new, + gfn_t gfn, union asce asce) +{ + return dat_crstep_xchg_atomic(_CRSTEP(pmdp), _CRSTE(old), _CRSTE(new), gfn, asce); +} + +static inline bool dat_pudp_xchg_atomic(union pud *pudp, union pud old, union pud new, + gfn_t gfn, union asce asce) +{ + return dat_crstep_xchg_atomic(_CRSTEP(pudp), _CRSTE(old), _CRSTE(new), gfn, asce); +} + +static inline void dat_crstep_clear(union crste *crstep, gfn_t gfn, union asce asce) +{ + union crste newcrste = _CRSTE_EMPTY(crstep->h.tt); + + dat_crstep_xchg(crstep, newcrste, gfn, asce); +} + #endif /* __KVM_S390_DAT_H */ From 2db149a0a6c5e81473298e57e762240901764a94 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:44 +0100 Subject: [PATCH 203/267] KVM: s390: KVM page table management functions: walks Add page table management functions to be used for KVM guest (gmap) page tables. This patch adds functions to walk to specific table entries, or to perform actions on a range of entries. Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/dat.c | 386 ++++++++++++++++++++++++++++++++++++++++++++ arch/s390/kvm/dat.h | 39 +++++ 2 files changed, 425 insertions(+) diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c index e38b1a139fbb..c31838107752 100644 --- a/arch/s390/kvm/dat.c +++ b/arch/s390/kvm/dat.c @@ -216,3 +216,389 @@ union pgste __dat_ptep_xchg(union pte *ptep, union pgste pgste, union pte new, g WRITE_ONCE(*ptep, new); return pgste; } + +/* + * dat_split_ste() - Split a segment table entry into page table entries. + * + * Context: This function is assumed to be called with kvm->mmu_lock held. + * + * Return: 0 in case of success, -ENOMEM if running out of memory. 
+ */ +static int dat_split_ste(struct kvm_s390_mmu_cache *mc, union pmd *pmdp, gfn_t gfn, + union asce asce, bool uses_skeys) +{ + union pgste pgste_init; + struct page_table *pt; + union pmd new, old; + union pte init; + int i; + + BUG_ON(!mc); + old = READ_ONCE(*pmdp); + + /* Already split, nothing to do. */ + if (!old.h.i && !old.h.fc) + return 0; + + pt = dat_alloc_pt_noinit(mc); + if (!pt) + return -ENOMEM; + new.val = virt_to_phys(pt); + + while (old.h.i || old.h.fc) { + init.val = pmd_origin_large(old); + init.h.p = old.h.p; + init.h.i = old.h.i; + init.s.d = old.s.fc1.d; + init.s.w = old.s.fc1.w; + init.s.y = old.s.fc1.y; + init.s.sd = old.s.fc1.sd; + init.s.pr = old.s.fc1.pr; + pgste_init.val = 0; + if (old.h.fc) { + for (i = 0; i < _PAGE_ENTRIES; i++) + pt->ptes[i].val = init.val | i * PAGE_SIZE; + /* No need to take locks as the page table is not installed yet. */ + pgste_init.prefix_notif = old.s.fc1.prefix_notif; + pgste_init.pcl = uses_skeys && init.h.i; + dat_init_pgstes(pt, pgste_init.val); + } else { + dat_init_page_table(pt, init.val, 0); + } + + if (dat_pmdp_xchg_atomic(pmdp, old, new, gfn, asce)) { + if (!pgste_init.pcl) + return 0; + for (i = 0; i < _PAGE_ENTRIES; i++) { + union pgste pgste = pt->pgstes[i]; + + pgste = dat_save_storage_key_into_pgste(pt->ptes[i], pgste); + pgste_set_unlock(pt->ptes + i, pgste); + } + return 0; + } + old = READ_ONCE(*pmdp); + } + + dat_free_pt(pt); + return 0; +} + +/* + * dat_split_crste() - Split a crste into smaller crstes. + * + * Context: This function is assumed to be called with kvm->mmu_lock held. + * + * Return: %0 in case of success, %-ENOMEM if running out of memory. + */ +static int dat_split_crste(struct kvm_s390_mmu_cache *mc, union crste *crstep, + gfn_t gfn, union asce asce, bool uses_skeys) +{ + struct crst_table *table; + union crste old, new, init; + int i; + + old = READ_ONCE(*crstep); + if (is_pmd(old)) + return dat_split_ste(mc, &crstep->pmd, gfn, asce, uses_skeys); + + BUG_ON(!mc); + + /* Already split, nothing to do. */ + if (!old.h.i && !old.h.fc) + return 0; + + table = dat_alloc_crst_noinit(mc); + if (!table) + return -ENOMEM; + + new.val = virt_to_phys(table); + new.h.tt = old.h.tt; + new.h.fc0.tl = _REGION_ENTRY_LENGTH; + + while (old.h.i || old.h.fc) { + init = old; + init.h.tt--; + if (old.h.fc) { + for (i = 0; i < _CRST_ENTRIES; i++) + table->crstes[i].val = init.val | i * HPAGE_SIZE; + } else { + crst_table_init((void *)table, init.val); + } + if (dat_crstep_xchg_atomic(crstep, old, new, gfn, asce)) + return 0; + old = READ_ONCE(*crstep); + } + + dat_free_crst(table); + return 0; +} + +/** + * dat_entry_walk() - Walk the gmap page tables. + * @mc: Cache to use to allocate dat tables, if needed; can be NULL if neither + * %DAT_WALK_SPLIT or %DAT_WALK_ALLOC is specified in @flags. + * @gfn: Guest frame. + * @asce: The ASCE of the address space. + * @flags: Flags from WALK_* macros. + * @walk_level: Level to walk to, from LEVEL_* macros. + * @last: Will be filled the last visited non-pte DAT entry. + * @ptepp: Will be filled the last visited pte entry, if any, otherwise NULL. + * + * Returns a table entry pointer for the given guest address and @walk_level. 
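+ * (The pointer is returned through @last or @ptepp; the function's return value itself only indicates success or the reason why the walk stopped.)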
+ * + * The @flags have the following meanings: + * * %DAT_WALK_IGN_HOLES: consider holes as normal table entries + * * %DAT_WALK_ALLOC: allocate new tables to reach the requested level, if needed + * * %DAT_WALK_SPLIT: split existing large pages to reach the requested level, if needed + * * %DAT_WALK_LEAF: return successfully whenever a large page is encountered + * * %DAT_WALK_ANY: return successfully even if the requested level could not be reached + * * %DAT_WALK_CONTINUE: walk to the requested level with the specified flags, and then try to + * continue walking to ptes with only DAT_WALK_ANY + * * %DAT_WALK_USES_SKEYS: storage keys are in use + * + * Context: called with kvm->mmu_lock held. + * + * Return: + * * %PGM_ADDRESSING if the requested address lies outside memory + * * a PIC number if the requested address lies in a memory hole of type _DAT_TOKEN_PIC + * * %-EFAULT if the requested address lies inside a memory hole of a different type + * * %-EINVAL if the given ASCE is not compatible with the requested level + * * %-EFBIG if the requested level could not be reached because a larger frame was found + * * %-ENOENT if the requested level could not be reached for other reasons + * * %-ENOMEM if running out of memory while allocating or splitting a table + */ +int dat_entry_walk(struct kvm_s390_mmu_cache *mc, gfn_t gfn, union asce asce, int flags, + int walk_level, union crste **last, union pte **ptepp) +{ + union vaddress vaddr = { .addr = gfn_to_gpa(gfn) }; + bool continue_anyway = flags & DAT_WALK_CONTINUE; + bool uses_skeys = flags & DAT_WALK_USES_SKEYS; + bool ign_holes = flags & DAT_WALK_IGN_HOLES; + bool allocate = flags & DAT_WALK_ALLOC; + bool split = flags & DAT_WALK_SPLIT; + bool leaf = flags & DAT_WALK_LEAF; + bool any = flags & DAT_WALK_ANY; + struct page_table *pgtable; + struct crst_table *table; + union crste entry; + int rc; + + *last = NULL; + *ptepp = NULL; + if (WARN_ON_ONCE(unlikely(!asce.val))) + return -EINVAL; + if (WARN_ON_ONCE(unlikely(walk_level > asce.dt))) + return -EINVAL; + if (!asce_contains_gfn(asce, gfn)) + return PGM_ADDRESSING; + + table = dereference_asce(asce); + if (asce.dt >= ASCE_TYPE_REGION1) { + *last = table->crstes + vaddr.rfx; + entry = READ_ONCE(**last); + if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION1)) + return -EINVAL; + if (crste_hole(entry) && !ign_holes) + return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT; + if (walk_level == TABLE_TYPE_REGION1) + return 0; + if (entry.pgd.h.i) { + if (!allocate) + return any ? 0 : -ENOENT; + rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + table = dereference_crste(entry.pgd); + } + + if (asce.dt >= ASCE_TYPE_REGION2) { + *last = table->crstes + vaddr.rsx; + entry = READ_ONCE(**last); + if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION2)) + return -EINVAL; + if (crste_hole(entry) && !ign_holes) + return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT; + if (walk_level == TABLE_TYPE_REGION2) + return 0; + if (entry.p4d.h.i) { + if (!allocate) + return any ? 0 : -ENOENT; + rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + table = dereference_crste(entry.p4d); + } + + if (asce.dt >= ASCE_TYPE_REGION3) { + *last = table->crstes + vaddr.rtx; + entry = READ_ONCE(**last); + if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION3)) + return -EINVAL; + if (crste_hole(entry) && !ign_holes) + return entry.tok.type == _DAT_TOKEN_PIC ? 
entry.tok.par : -EFAULT; + if (walk_level == TABLE_TYPE_REGION3 && + continue_anyway && !entry.pud.h.fc && !entry.h.i) { + walk_level = TABLE_TYPE_PAGE_TABLE; + allocate = false; + } + if (walk_level == TABLE_TYPE_REGION3 || ((leaf || any) && entry.pud.h.fc)) + return 0; + if (entry.pud.h.i && !entry.pud.h.fc) { + if (!allocate) + return any ? 0 : -ENOENT; + rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + if (walk_level <= TABLE_TYPE_SEGMENT && entry.pud.h.fc) { + if (!split) + return -EFBIG; + rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + table = dereference_crste(entry.pud); + } + + *last = table->crstes + vaddr.sx; + entry = READ_ONCE(**last); + if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_SEGMENT)) + return -EINVAL; + if (crste_hole(entry) && !ign_holes) + return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT; + if (continue_anyway && !entry.pmd.h.fc && !entry.h.i) { + walk_level = TABLE_TYPE_PAGE_TABLE; + allocate = false; + } + if (walk_level == TABLE_TYPE_SEGMENT || ((leaf || any) && entry.pmd.h.fc)) + return 0; + + if (entry.pmd.h.i && !entry.pmd.h.fc) { + if (!allocate) + return any ? 0 : -ENOENT; + rc = dat_split_ste(mc, &(*last)->pmd, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + if (walk_level <= TABLE_TYPE_PAGE_TABLE && entry.pmd.h.fc) { + if (!split) + return -EFBIG; + rc = dat_split_ste(mc, &(*last)->pmd, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + pgtable = dereference_pmd(entry.pmd); + *ptepp = pgtable->ptes + vaddr.px; + if (pte_hole(**ptepp) && !ign_holes) + return (*ptepp)->tok.type == _DAT_TOKEN_PIC ? (*ptepp)->tok.par : -EFAULT; + return 0; +} + +static long dat_pte_walk_range(gfn_t gfn, gfn_t end, struct page_table *table, struct dat_walk *w) +{ + unsigned int idx = gfn & (_PAGE_ENTRIES - 1); + long rc = 0; + + for ( ; gfn < end; idx++, gfn++) { + if (pte_hole(READ_ONCE(table->ptes[idx]))) { + if (!(w->flags & DAT_WALK_IGN_HOLES)) + return -EFAULT; + if (!(w->flags & DAT_WALK_ANY)) + continue; + } + + rc = w->ops->pte_entry(table->ptes + idx, gfn, gfn + 1, w); + if (rc) + break; + } + return rc; +} + +static long dat_crste_walk_range(gfn_t start, gfn_t end, struct crst_table *table, + struct dat_walk *walk) +{ + unsigned long idx, cur_shift, cur_size; + dat_walk_op the_op; + union crste crste; + gfn_t cur, next; + long rc = 0; + + cur_shift = 8 + table->crstes[0].h.tt * 11; + idx = (start >> cur_shift) & (_CRST_ENTRIES - 1); + cur_size = 1UL << cur_shift; + + for (cur = ALIGN_DOWN(start, cur_size); cur < end; idx++, cur = next) { + next = cur + cur_size; + walk->last = table->crstes + idx; + crste = READ_ONCE(*walk->last); + + if (crste_hole(crste)) { + if (!(walk->flags & DAT_WALK_IGN_HOLES)) + return -EFAULT; + if (!(walk->flags & DAT_WALK_ANY)) + continue; + } + + the_op = walk->ops->crste_ops[crste.h.tt]; + if (the_op) { + rc = the_op(walk->last, cur, next, walk); + crste = READ_ONCE(*walk->last); + } + if (rc) + break; + if (!crste.h.i && !crste.h.fc) { + if (!is_pmd(crste)) + rc = dat_crste_walk_range(max(start, cur), min(end, next), + _dereference_crste(crste), walk); + else if (walk->ops->pte_entry) + rc = dat_pte_walk_range(max(start, cur), min(end, next), + dereference_pmd(crste.pmd), walk); + } + } + return rc; +} + +/** + * _dat_walk_gfn_range() - Walk DAT tables. + * @start: The first guest page frame to walk. 
+ * @end: The guest page frame immediately after the last one to walk. + * @asce: The ASCE of the guest mapping. + * @ops: The gmap_walk_ops that will be used to perform the walk. + * @flags: Flags from WALK_* (currently only WALK_IGN_HOLES is supported). + * @priv: Will be passed as-is to the callbacks. + * + * Any callback returning non-zero causes the walk to stop immediately. + * + * Return: %-EINVAL in case of error, %-EFAULT if @start is too high for the + * given ASCE unless the DAT_WALK_IGN_HOLES flag is specified, + * otherwise it returns whatever the callbacks return. + */ +long _dat_walk_gfn_range(gfn_t start, gfn_t end, union asce asce, + const struct dat_walk_ops *ops, int flags, void *priv) +{ + struct crst_table *table = dereference_asce(asce); + struct dat_walk walk = { + .ops = ops, + .asce = asce, + .priv = priv, + .flags = flags, + .start = start, + .end = end, + }; + + if (WARN_ON_ONCE(unlikely(!asce.val))) + return -EINVAL; + if (!asce_contains_gfn(asce, start)) + return (flags & DAT_WALK_IGN_HOLES) ? 0 : -EFAULT; + + return dat_crste_walk_range(start, min(end, asce_end(asce)), table, &walk); +} diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h index ee070d18bd36..409064bd20a5 100644 --- a/arch/s390/kvm/dat.h +++ b/arch/s390/kvm/dat.h @@ -45,6 +45,7 @@ enum { #define TABLE_TYPE_PAGE_TABLE -1 enum dat_walk_flags { + DAT_WALK_USES_SKEYS = 0x40, DAT_WALK_CONTINUE = 0x20, DAT_WALK_IGN_HOLES = 0x10, DAT_WALK_SPLIT = 0x08, @@ -332,6 +333,34 @@ struct page_table { static_assert(sizeof(struct crst_table) == _CRST_TABLE_SIZE); static_assert(sizeof(struct page_table) == PAGE_SIZE); +struct dat_walk; + +typedef long (*dat_walk_op)(union crste *crste, gfn_t gfn, gfn_t next, struct dat_walk *w); + +struct dat_walk_ops { + union { + dat_walk_op crste_ops[4]; + struct { + dat_walk_op pmd_entry; + dat_walk_op pud_entry; + dat_walk_op p4d_entry; + dat_walk_op pgd_entry; + }; + }; + long (*pte_entry)(union pte *pte, gfn_t gfn, gfn_t next, struct dat_walk *w); +}; + +struct dat_walk { + const struct dat_walk_ops *ops; + union crste *last; + union pte *last_pte; + union asce asce; + gfn_t start; + gfn_t end; + int flags; + void *priv; +}; + /** * _pte() - Useful constructor for union pte * @pfn: the pfn this pte should point to. @@ -436,6 +465,11 @@ bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste ne union asce asce); void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce); +long _dat_walk_gfn_range(gfn_t start, gfn_t end, union asce asce, + const struct dat_walk_ops *ops, int flags, void *priv); + +int dat_entry_walk(struct kvm_s390_mmu_cache *mc, gfn_t gfn, union asce asce, int flags, + int walk_level, union crste **last, union pte **ptepp); void dat_free_level(struct crst_table *table, bool owns_ptes); struct crst_table *dat_alloc_crst_sleepable(unsigned long init); @@ -834,4 +868,9 @@ static inline void dat_crstep_clear(union crste *crstep, gfn_t gfn, union asce a dat_crstep_xchg(crstep, newcrste, gfn, asce); } +static inline int get_level(union crste *crstep, union pte *ptep) +{ + return ptep ? TABLE_TYPE_PAGE_TABLE : crstep->h.tt; +} + #endif /* __KVM_S390_DAT_H */ From 8e03e8316eb242af307d1c789d09f1a1457cf9a3 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:45 +0100 Subject: [PATCH 204/267] KVM: s390: KVM page table management functions: storage keys Add page table management functions to be used for KVM guest (gmap) page tables. This patch adds functions related to storage key handling. 
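As an illustration (this sketch is not part of the patch; the caller-side function and the choice of the mmu write lock are assumptions), a typical user of the new helpers fills an mmu cache while sleeping is still allowed and then updates the key under the mmu lock:

	static int example_set_skey(struct kvm *kvm, struct kvm_s390_mmu_cache *mc,
				    union asce asce, gfn_t gfn, union skey skey)
	{
		int rc;

		/* Fill the cache before taking the lock; this may sleep. */
		rc = kvm_s390_mmu_cache_topup(mc);
		if (rc)
			return rc;
		/* The dat_* helpers expect kvm->mmu_lock to be held. */
		scoped_guard(write_lock, &kvm->mmu_lock)
			rc = dat_set_storage_key(mc, asce, gfn, skey, false);
		return rc;
	}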
Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/dat.c | 223 ++++++++++++++++++++++++++++++++++++++++++++ arch/s390/kvm/dat.h | 7 ++ 2 files changed, 230 insertions(+) diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c index c31838107752..99682792d9ee 100644 --- a/arch/s390/kvm/dat.c +++ b/arch/s390/kvm/dat.c @@ -602,3 +602,226 @@ long _dat_walk_gfn_range(gfn_t start, gfn_t end, union asce asce, return dat_crste_walk_range(start, min(end, asce_end(asce)), table, &walk); } + +int dat_get_storage_key(union asce asce, gfn_t gfn, union skey *skey) +{ + union crste *crstep; + union pgste pgste; + union pte *ptep; + int rc; + + skey->skey = 0; + rc = dat_entry_walk(NULL, gfn, asce, DAT_WALK_ANY, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep); + if (rc) + return rc; + + if (!ptep) { + union crste crste; + + crste = READ_ONCE(*crstep); + if (!crste.h.fc || !crste.s.fc1.pr) + return 0; + skey->skey = page_get_storage_key(large_crste_to_phys(crste, gfn)); + return 0; + } + pgste = pgste_get_lock(ptep); + if (ptep->h.i) { + skey->acc = pgste.acc; + skey->fp = pgste.fp; + } else { + skey->skey = page_get_storage_key(pte_origin(*ptep)); + } + skey->r |= pgste.gr; + skey->c |= pgste.gc; + pgste_set_unlock(ptep, pgste); + return 0; +} + +static void dat_update_ptep_sd(union pgste old, union pgste pgste, union pte *ptep) +{ + if (pgste.acc != old.acc || pgste.fp != old.fp || pgste.gr != old.gr || pgste.gc != old.gc) + __atomic64_or(_PAGE_SD, &ptep->val); +} + +int dat_set_storage_key(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn, + union skey skey, bool nq) +{ + union pgste pgste, old; + union crste *crstep; + union pte *ptep; + int rc; + + rc = dat_entry_walk(mc, gfn, asce, DAT_WALK_LEAF_ALLOC, TABLE_TYPE_PAGE_TABLE, + &crstep, &ptep); + if (rc) + return rc; + + if (!ptep) { + page_set_storage_key(large_crste_to_phys(*crstep, gfn), skey.skey, !nq); + return 0; + } + + old = pgste_get_lock(ptep); + pgste = old; + + pgste.acc = skey.acc; + pgste.fp = skey.fp; + pgste.gc = skey.c; + pgste.gr = skey.r; + + if (!ptep->h.i) { + union skey old_skey; + + old_skey.skey = page_get_storage_key(pte_origin(*ptep)); + pgste.hc |= old_skey.c; + pgste.hr |= old_skey.r; + old_skey.c = old.gc; + old_skey.r = old.gr; + skey.r = 0; + skey.c = 0; + page_set_storage_key(pte_origin(*ptep), skey.skey, !nq); + } + + dat_update_ptep_sd(old, pgste, ptep); + pgste_set_unlock(ptep, pgste); + return 0; +} + +static bool page_cond_set_storage_key(phys_addr_t paddr, union skey skey, union skey *oldkey, + bool nq, bool mr, bool mc) +{ + oldkey->skey = page_get_storage_key(paddr); + if (oldkey->acc == skey.acc && oldkey->fp == skey.fp && + (oldkey->r == skey.r || mr) && (oldkey->c == skey.c || mc)) + return false; + page_set_storage_key(paddr, skey.skey, !nq); + return true; +} + +int dat_cond_set_storage_key(struct kvm_s390_mmu_cache *mmc, union asce asce, gfn_t gfn, + union skey skey, union skey *oldkey, bool nq, bool mr, bool mc) +{ + union pgste pgste, old; + union crste *crstep; + union skey prev; + union pte *ptep; + int rc; + + rc = dat_entry_walk(mmc, gfn, asce, DAT_WALK_LEAF_ALLOC, TABLE_TYPE_PAGE_TABLE, + &crstep, &ptep); + if (rc) + return rc; + + if (!ptep) + return page_cond_set_storage_key(large_crste_to_phys(*crstep, gfn), skey, oldkey, + nq, mr, mc); + + old = pgste_get_lock(ptep); + pgste = old; + + rc = 1; + pgste.acc = skey.acc; + pgste.fp = skey.fp; + pgste.gc = skey.c; + pgste.gr = skey.r; + + if (!ptep->h.i) { + rc = page_cond_set_storage_key(pte_origin(*ptep), skey, &prev, nq, mr, 
mc); + pgste.hc |= prev.c; + pgste.hr |= prev.r; + prev.c |= old.gc; + prev.r |= old.gr; + } else { + prev.acc = old.acc; + prev.fp = old.fp; + prev.c = old.gc; + prev.r = old.gr; + } + if (oldkey) + *oldkey = prev; + + dat_update_ptep_sd(old, pgste, ptep); + pgste_set_unlock(ptep, pgste); + return rc; +} + +int dat_reset_reference_bit(union asce asce, gfn_t gfn) +{ + union pgste pgste, old; + union crste *crstep; + union pte *ptep; + int rc; + + rc = dat_entry_walk(NULL, gfn, asce, DAT_WALK_ANY, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep); + if (rc) + return rc; + + if (!ptep) { + union crste crste = READ_ONCE(*crstep); + + if (!crste.h.fc || !crste.s.fc1.pr) + return 0; + return page_reset_referenced(large_crste_to_phys(*crstep, gfn)); + } + old = pgste_get_lock(ptep); + pgste = old; + + if (!ptep->h.i) { + rc = page_reset_referenced(pte_origin(*ptep)); + pgste.hr = rc >> 1; + } + rc |= (pgste.gr << 1) | pgste.gc; + pgste.gr = 0; + + dat_update_ptep_sd(old, pgste, ptep); + pgste_set_unlock(ptep, pgste); + return rc; +} + +static long dat_reset_skeys_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + union pgste pgste; + + pgste = pgste_get_lock(ptep); + pgste.acc = 0; + pgste.fp = 0; + pgste.gr = 0; + pgste.gc = 0; + if (ptep->s.pr) + page_set_storage_key(pte_origin(*ptep), PAGE_DEFAULT_KEY, 1); + pgste_set_unlock(ptep, pgste); + + if (need_resched()) + return next; + return 0; +} + +static long dat_reset_skeys_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + phys_addr_t addr, end, origin = crste_origin_large(*crstep); + + if (!crstep->h.fc || !crstep->s.fc1.pr) + return 0; + + addr = ((max(gfn, walk->start) - gfn) << PAGE_SHIFT) + origin; + end = ((min(next, walk->end) - gfn) << PAGE_SHIFT) + origin; + while (ALIGN(addr + 1, _SEGMENT_SIZE) <= end) + addr = sske_frame(addr, PAGE_DEFAULT_KEY); + for ( ; addr < end; addr += PAGE_SIZE) + page_set_storage_key(addr, PAGE_DEFAULT_KEY, 1); + + if (need_resched()) + return next; + return 0; +} + +long dat_reset_skeys(union asce asce, gfn_t start) +{ + const struct dat_walk_ops ops = { + .pte_entry = dat_reset_skeys_pte, + .pmd_entry = dat_reset_skeys_crste, + .pud_entry = dat_reset_skeys_crste, + }; + + return _dat_walk_gfn_range(start, asce_end(asce), asce, &ops, DAT_WALK_IGN_HOLES, NULL); +} diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h index 409064bd20a5..6c7815d4f07f 100644 --- a/arch/s390/kvm/dat.h +++ b/arch/s390/kvm/dat.h @@ -472,6 +472,13 @@ int dat_entry_walk(struct kvm_s390_mmu_cache *mc, gfn_t gfn, union asce asce, in int walk_level, union crste **last, union pte **ptepp); void dat_free_level(struct crst_table *table, bool owns_ptes); struct crst_table *dat_alloc_crst_sleepable(unsigned long init); +int dat_get_storage_key(union asce asce, gfn_t gfn, union skey *skey); +int dat_set_storage_key(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn, + union skey skey, bool nq); +int dat_cond_set_storage_key(struct kvm_s390_mmu_cache *mmc, union asce asce, gfn_t gfn, + union skey skey, union skey *oldkey, bool nq, bool mr, bool mc); +int dat_reset_reference_bit(union asce asce, gfn_t gfn); +long dat_reset_skeys(union asce asce, gfn_t start); int kvm_s390_mmu_cache_topup(struct kvm_s390_mmu_cache *mc); From 94fd9b16cc678ad440425b6fc4d1d505556fb29f Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:46 +0100 Subject: [PATCH 205/267] KVM: s390: KVM page table management functions: lifecycle management Add page table management functions to be used for KVM 
guest (gmap) page tables. This patch adds functions to handle memslot creation and destruction, additional per-pagetable data stored in the PGSTEs, mapping physical addresses into the gmap, and marking address ranges as prefix. Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/dat.c | 289 ++++++++++++++++++++++++++++++++++++++++++++ arch/s390/kvm/dat.h | 59 +++++++++ 2 files changed, 348 insertions(+) diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c index 99682792d9ee..bc27405cdea1 100644 --- a/arch/s390/kvm/dat.c +++ b/arch/s390/kvm/dat.c @@ -102,6 +102,38 @@ void dat_free_level(struct crst_table *table, bool owns_ptes) dat_free_crst(table); } +int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newtype) +{ + struct crst_table *table; + union crste crste; + + while (asce->dt > newtype) { + table = dereference_asce(*asce); + crste = table->crstes[0]; + if (crste.h.fc) + return 0; + if (!crste.h.i) { + asce->rsto = crste.h.fc0.to; + dat_free_crst(table); + } else { + crste.h.tt--; + crst_table_init((void *)table, crste.val); + } + asce->dt--; + } + while (asce->dt < newtype) { + crste = _crste_fc0(asce->rsto, asce->dt + 1); + table = dat_alloc_crst_noinit(mc); + if (!table) + return -ENOMEM; + crst_table_init((void *)table, _CRSTE_HOLE(crste.h.tt).val); + table->crstes[0] = crste; + asce->rsto = __pa(table) >> PAGE_SHIFT; + asce->dt++; + } + return 0; +} + /** * dat_crstep_xchg() - Exchange a gmap CRSTE with another. * @crstep: Pointer to the CRST entry @@ -825,3 +857,260 @@ long dat_reset_skeys(union asce asce, gfn_t start) return _dat_walk_gfn_range(start, asce_end(asce), asce, &ops, DAT_WALK_IGN_HOLES, NULL); } + +struct slot_priv { + unsigned long token; + struct kvm_s390_mmu_cache *mc; +}; + +static long _dat_slot_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct slot_priv *p = walk->priv; + union crste dummy = { .val = p->token }; + union pte new_pte, pte = READ_ONCE(*ptep); + + new_pte = _PTE_TOK(dummy.tok.type, dummy.tok.par); + + /* Table entry already in the desired state. */ + if (pte.val == new_pte.val) + return 0; + + dat_ptep_xchg(ptep, new_pte, gfn, walk->asce, false); + return 0; +} + +static long _dat_slot_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + union crste new_crste, crste = READ_ONCE(*crstep); + struct slot_priv *p = walk->priv; + + new_crste.val = p->token; + new_crste.h.tt = crste.h.tt; + + /* Table entry already in the desired state. */ + if (crste.val == new_crste.val) + return 0; + + /* This table entry needs to be updated. */ + if (walk->start <= gfn && walk->end >= next) { + dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce); + /* A lower level table was present, needs to be freed. */ + if (!crste.h.fc && !crste.h.i) { + if (is_pmd(crste)) + dat_free_pt(dereference_pmd(crste.pmd)); + else + dat_free_level(dereference_crste(crste), true); + } + return 0; + } + + /* A lower level table is present, things will handled there. */ + if (!crste.h.fc && !crste.h.i) + return 0; + /* Split (install a lower level table), and handle things there. 
*/ + return dat_split_crste(p->mc, crstep, gfn, walk->asce, false); +} + +static const struct dat_walk_ops dat_slot_ops = { + .pte_entry = _dat_slot_pte, + .crste_ops = { _dat_slot_crste, _dat_slot_crste, _dat_slot_crste, _dat_slot_crste, }, +}; + +int dat_set_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, gfn_t end, + u16 type, u16 param) +{ + struct slot_priv priv = { + .token = _CRSTE_TOK(0, type, param).val, + .mc = mc, + }; + + return _dat_walk_gfn_range(start, end, asce, &dat_slot_ops, + DAT_WALK_IGN_HOLES | DAT_WALK_ANY, &priv); +} + +static void pgste_set_unlock_multiple(union pte *first, int n, union pgste *pgstes) +{ + int i; + + for (i = 0; i < n; i++) { + if (!pgstes[i].pcl) + break; + pgste_set_unlock(first + i, pgstes[i]); + } +} + +static bool pgste_get_trylock_multiple(union pte *first, int n, union pgste *pgstes) +{ + int i; + + for (i = 0; i < n; i++) { + if (!pgste_get_trylock(first + i, pgstes + i)) + break; + } + if (i == n) + return true; + pgste_set_unlock_multiple(first, n, pgstes); + return false; +} + +unsigned long dat_get_ptval(struct page_table *table, struct ptval_param param) +{ + union pgste pgstes[4] = {}; + unsigned long res = 0; + int i, n; + + n = param.len + 1; + + while (!pgste_get_trylock_multiple(table->ptes + param.offset, n, pgstes)) + cpu_relax(); + + for (i = 0; i < n; i++) + res = res << 16 | pgstes[i].val16; + + pgste_set_unlock_multiple(table->ptes + param.offset, n, pgstes); + return res; +} + +void dat_set_ptval(struct page_table *table, struct ptval_param param, unsigned long val) +{ + union pgste pgstes[4] = {}; + int i, n; + + n = param.len + 1; + + while (!pgste_get_trylock_multiple(table->ptes + param.offset, n, pgstes)) + cpu_relax(); + + for (i = param.len; i >= 0; i--) { + pgstes[i].val16 = val; + val = val >> 16; + } + + pgste_set_unlock_multiple(table->ptes + param.offset, n, pgstes); +} + +static long _dat_test_young_pte(union pte *ptep, gfn_t start, gfn_t end, struct dat_walk *walk) +{ + return ptep->s.y; +} + +static long _dat_test_young_crste(union crste *crstep, gfn_t start, gfn_t end, + struct dat_walk *walk) +{ + return crstep->h.fc && crstep->s.fc1.y; +} + +static const struct dat_walk_ops test_age_ops = { + .pte_entry = _dat_test_young_pte, + .pmd_entry = _dat_test_young_crste, + .pud_entry = _dat_test_young_crste, +}; + +/** + * dat_test_age_gfn() - Test young. + * @asce: The ASCE whose address range is to be tested. + * @start: The first guest frame of the range to check. + * @end: The guest frame after the last in the range. + * + * Context: called by KVM common code with the kvm mmu write lock held. + * + * Return: %true if any page in the given range is young, otherwise %false. 
+ */ +bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end) +{ + return _dat_walk_gfn_range(start, end, asce, &test_age_ops, 0, NULL) > 0; +} + +int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level, + bool uses_skeys, struct guest_fault *f) +{ + union crste oldval, newval; + union pte newpte, oldpte; + union pgste pgste; + int rc = 0; + + rc = dat_entry_walk(mc, f->gfn, asce, DAT_WALK_ALLOC_CONTINUE, level, &f->crstep, &f->ptep); + if (rc == -EINVAL || rc == -ENOMEM) + return rc; + if (rc) + return -EAGAIN; + + if (WARN_ON_ONCE(unlikely(get_level(f->crstep, f->ptep) > level))) + return -EINVAL; + + if (f->ptep) { + pgste = pgste_get_lock(f->ptep); + oldpte = *f->ptep; + newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page); + newpte.s.sd = oldpte.s.sd; + oldpte.s.sd = 0; + if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) { + pgste = __dat_ptep_xchg(f->ptep, pgste, newpte, f->gfn, asce, uses_skeys); + if (f->callback) + f->callback(f); + } else { + rc = -EAGAIN; + } + pgste_set_unlock(f->ptep, pgste); + } else { + oldval = READ_ONCE(*f->crstep); + newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable, + f->write_attempt | oldval.s.fc1.d); + newval.s.fc1.sd = oldval.s.fc1.sd; + if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val && + crste_origin_large(oldval) != crste_origin_large(newval)) + return -EAGAIN; + if (!dat_crstep_xchg_atomic(f->crstep, oldval, newval, f->gfn, asce)) + return -EAGAIN; + if (f->callback) + f->callback(f); + } + + return rc; +} + +static long dat_set_pn_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + union crste crste = READ_ONCE(*crstep); + int *n = walk->priv; + + if (!crste.h.fc || crste.h.i || crste.h.p) + return 0; + + *n = 2; + if (crste.s.fc1.prefix_notif) + return 0; + crste.s.fc1.prefix_notif = 1; + dat_crstep_xchg(crstep, crste, gfn, walk->asce); + return 0; +} + +static long dat_set_pn_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + int *n = walk->priv; + union pgste pgste; + + pgste = pgste_get_lock(ptep); + if (!ptep->h.i && !ptep->h.p) { + pgste.prefix_notif = 1; + *n += 1; + } + pgste_set_unlock(ptep, pgste); + return 0; +} + +int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn) +{ + static const struct dat_walk_ops ops = { + .pte_entry = dat_set_pn_pte, + .pmd_entry = dat_set_pn_crste, + .pud_entry = dat_set_pn_crste, + }; + + int n = 0; + + _dat_walk_gfn_range(gfn, gfn + 2, asce, &ops, DAT_WALK_IGN_HOLES, &n); + if (n != 2) + return -EAGAIN; + return 0; +} diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h index 6c7815d4f07f..fe8d790a297d 100644 --- a/arch/s390/kvm/dat.h +++ b/arch/s390/kvm/dat.h @@ -361,6 +361,11 @@ struct dat_walk { void *priv; }; +struct ptval_param { + unsigned char offset : 6; + unsigned char len : 2; +}; + /** * _pte() - Useful constructor for union pte * @pfn: the pfn this pte should point to. 
@@ -459,6 +464,32 @@ struct kvm_s390_mmu_cache { short int n_rmaps; }; +struct guest_fault { + gfn_t gfn; /* Guest frame */ + kvm_pfn_t pfn; /* Host PFN */ + struct page *page; /* Host page */ + union pte *ptep; /* Used to resolve the fault, or NULL */ + union crste *crstep; /* Used to resolve the fault, or NULL */ + bool writable; /* Mapping is writable */ + bool write_attempt; /* Write access attempted */ + bool attempt_pfault; /* Attempt a pfault first */ + bool valid; /* This entry contains valid data */ + void (*callback)(struct guest_fault *f); + void *priv; +}; + +/* + * 0 1 2 3 4 5 6 7 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | | PGT_ADDR | + * 8 | VMADDR | | + * 16 | | + * 24 | | + */ +#define MKPTVAL(o, l) ((struct ptval_param) { .offset = (o), .len = ((l) + 1) / 2 - 1}) +#define PTVAL_PGT_ADDR MKPTVAL(4, 8) +#define PTVAL_VMADDR MKPTVAL(8, 6) + union pgste __must_check __dat_ptep_xchg(union pte *ptep, union pgste pgste, union pte new, gfn_t gfn, union asce asce, bool uses_skeys); bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, gfn_t gfn, @@ -472,6 +503,7 @@ int dat_entry_walk(struct kvm_s390_mmu_cache *mc, gfn_t gfn, union asce asce, in int walk_level, union crste **last, union pte **ptepp); void dat_free_level(struct crst_table *table, bool owns_ptes); struct crst_table *dat_alloc_crst_sleepable(unsigned long init); +int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newtype); int dat_get_storage_key(union asce asce, gfn_t gfn, union skey *skey); int dat_set_storage_key(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn, union skey skey, bool nq); @@ -480,6 +512,16 @@ int dat_cond_set_storage_key(struct kvm_s390_mmu_cache *mmc, union asce asce, gf int dat_reset_reference_bit(union asce asce, gfn_t gfn); long dat_reset_skeys(union asce asce, gfn_t start); +unsigned long dat_get_ptval(struct page_table *table, struct ptval_param param); +void dat_set_ptval(struct page_table *table, struct ptval_param param, unsigned long val); + +int dat_set_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, gfn_t end, + u16 type, u16 param); +int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn); +bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end); +int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level, + bool uses_skeys, struct guest_fault *f); + int kvm_s390_mmu_cache_topup(struct kvm_s390_mmu_cache *mc); #define GFP_KVM_S390_MMU_CACHE (GFP_ATOMIC | __GFP_ACCOUNT | __GFP_NOWARN) @@ -880,4 +922,21 @@ static inline int get_level(union crste *crstep, union pte *ptep) return ptep ? 
TABLE_TYPE_PAGE_TABLE : crstep->h.tt; } +static inline int dat_delete_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, + unsigned long npages) +{ + return dat_set_slot(mc, asce, start, start + npages, _DAT_TOKEN_PIC, PGM_ADDRESSING); +} + +static inline int dat_create_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, + unsigned long npages) +{ + return dat_set_slot(mc, asce, start, start + npages, _DAT_TOKEN_NONE, 0); +} + +static inline bool crste_is_ucas(union crste crste) +{ + return is_pmd(crste) && crste.h.i && crste.h.fc0.tl == 1 && crste.h.fc == 0; +} + #endif /* __KVM_S390_DAT_H */ From 7b368470e1a406cdb0fab3ea1a5a8ebffb5ad91e Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:47 +0100 Subject: [PATCH 206/267] KVM: s390: KVM page table management functions: CMMA Add page table management functions to be used for KVM guest (gmap) page tables. This patch adds functions to handle CMMA and the ESSA instruction. Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/dat.c | 275 ++++++++++++++++++++++++++++++++++++++++++++ arch/s390/kvm/dat.h | 27 +++++ 2 files changed, 302 insertions(+) diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c index bc27405cdea1..129dc55a4a0d 100644 --- a/arch/s390/kvm/dat.c +++ b/arch/s390/kvm/dat.c @@ -1114,3 +1114,278 @@ int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn) return -EAGAIN; return 0; } + +/** + * dat_perform_essa() - Perform ESSA actions on the PGSTE. + * @asce: The asce to operate on. + * @gfn: The guest page frame to operate on. + * @orc: The specific action to perform, see the ESSA_SET_* macros. + * @state: The storage attributes to be returned to the guest. + * @dirty: Returns whether the function dirtied a previously clean entry. + * + * Context: Called with kvm->mmu_lock held. + * + * Return: + * * %1 if the page state has been altered and the page is to be added to the CBRL + * * %0 if the page state has been altered, but the page is not to be added to the CBRL + * * %-1 if the page state has not been altered and the page is not to be added to the CBRL + */ +int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty) +{ + union crste *crstep; + union pgste pgste; + union pte *ptep; + int res = 0; + + if (dat_entry_walk(NULL, gfn, asce, 0, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep)) { + *state = (union essa_state) { .exception = 1 }; + return -1; + } + + pgste = pgste_get_lock(ptep); + + *state = (union essa_state) { + .content = (ptep->h.i << 1) + (ptep->h.i && pgste.zero), + .nodat = pgste.nodat, + .usage = pgste.usage, + }; + + switch (orc) { + case ESSA_GET_STATE: + res = -1; + break; + case ESSA_SET_STABLE: + pgste.usage = PGSTE_GPS_USAGE_STABLE; + pgste.nodat = 0; + break; + case ESSA_SET_UNUSED: + pgste.usage = PGSTE_GPS_USAGE_UNUSED; + if (ptep->h.i) + res = 1; + break; + case ESSA_SET_VOLATILE: + pgste.usage = PGSTE_GPS_USAGE_VOLATILE; + if (ptep->h.i) + res = 1; + break; + case ESSA_SET_POT_VOLATILE: + if (!ptep->h.i) { + pgste.usage = PGSTE_GPS_USAGE_POT_VOLATILE; + } else if (pgste.zero) { + pgste.usage = PGSTE_GPS_USAGE_VOLATILE; + } else if (!pgste.gc) { + pgste.usage = PGSTE_GPS_USAGE_VOLATILE; + res = 1; + } + break; + case ESSA_SET_STABLE_RESIDENT: + pgste.usage = PGSTE_GPS_USAGE_STABLE; + /* + * Since the resident state can go away any time after this + * call, we will not make this page resident. We can revisit + * this decision if a guest will ever start using this. 
+ */ + break; + case ESSA_SET_STABLE_IF_RESIDENT: + if (!ptep->h.i) + pgste.usage = PGSTE_GPS_USAGE_STABLE; + break; + case ESSA_SET_STABLE_NODAT: + pgste.usage = PGSTE_GPS_USAGE_STABLE; + pgste.nodat = 1; + break; + default: + WARN_ONCE(1, "Invalid ORC!"); + res = -1; + break; + } + /* If we are discarding a page, set it to logical zero. */ + pgste.zero = res == 1; + if (orc > 0) { + *dirty = !pgste.cmma_d; + pgste.cmma_d = 1; + } + + pgste_set_unlock(ptep, pgste); + + return res; +} + +static long dat_reset_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + union pgste pgste; + + pgste = pgste_get_lock(ptep); + pgste.usage = 0; + pgste.nodat = 0; + pgste.cmma_d = 0; + pgste_set_unlock(ptep, pgste); + if (need_resched()) + return next; + return 0; +} + +long dat_reset_cmma(union asce asce, gfn_t start) +{ + const struct dat_walk_ops dat_reset_cmma_ops = { + .pte_entry = dat_reset_cmma_pte, + }; + + return _dat_walk_gfn_range(start, asce_end(asce), asce, &dat_reset_cmma_ops, + DAT_WALK_IGN_HOLES, NULL); +} + +struct dat_get_cmma_state { + gfn_t start; + gfn_t end; + unsigned int count; + u8 *values; + atomic64_t *remaining; +}; + +static long __dat_peek_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct dat_get_cmma_state *state = walk->priv; + union pgste pgste; + + pgste = pgste_get_lock(ptep); + state->values[gfn - walk->start] = pgste.usage | (pgste.nodat << 6); + pgste_set_unlock(ptep, pgste); + state->end = next; + + return 0; +} + +static long __dat_peek_cmma_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct dat_get_cmma_state *state = walk->priv; + + if (crstep->h.i) + state->end = min(walk->end, next); + return 0; +} + +int dat_peek_cmma(gfn_t start, union asce asce, unsigned int *count, u8 *values) +{ + const struct dat_walk_ops ops = { + .pte_entry = __dat_peek_cmma_pte, + .pmd_entry = __dat_peek_cmma_crste, + .pud_entry = __dat_peek_cmma_crste, + .p4d_entry = __dat_peek_cmma_crste, + .pgd_entry = __dat_peek_cmma_crste, + }; + struct dat_get_cmma_state state = { .values = values, }; + int rc; + + rc = _dat_walk_gfn_range(start, start + *count, asce, &ops, DAT_WALK_DEFAULT, &state); + *count = state.end - start; + /* Return success if at least one value was saved, otherwise an error. */ + return (rc == -EFAULT && *count > 0) ? 
0 : rc; +} + +static long __dat_get_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct dat_get_cmma_state *state = walk->priv; + union pgste pgste; + + if (state->start != -1) { + if ((gfn - state->end) > KVM_S390_MAX_BIT_DISTANCE) + return 1; + if (gfn - state->start >= state->count) + return 1; + } + + if (!READ_ONCE(*pgste_of(ptep)).cmma_d) + return 0; + + pgste = pgste_get_lock(ptep); + if (pgste.cmma_d) { + if (state->start == -1) + state->start = gfn; + pgste.cmma_d = 0; + atomic64_dec(state->remaining); + state->values[gfn - state->start] = pgste.usage | pgste.nodat << 6; + state->end = next; + } + pgste_set_unlock(ptep, pgste); + return 0; +} + +int dat_get_cmma(union asce asce, gfn_t *start, unsigned int *count, u8 *values, atomic64_t *rem) +{ + const struct dat_walk_ops ops = { .pte_entry = __dat_get_cmma_pte, }; + struct dat_get_cmma_state state = { + .remaining = rem, + .values = values, + .count = *count, + .start = -1, + }; + + _dat_walk_gfn_range(*start, asce_end(asce), asce, &ops, DAT_WALK_IGN_HOLES, &state); + + if (state.start == -1) { + *count = 0; + } else { + *count = state.end - state.start; + *start = state.start; + } + + return 0; +} + +struct dat_set_cmma_state { + unsigned long mask; + const u8 *bits; +}; + +static long __dat_set_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct dat_set_cmma_state *state = walk->priv; + union pgste pgste, tmp; + + tmp.val = (state->bits[gfn - walk->start] << 24) & state->mask; + + pgste = pgste_get_lock(ptep); + pgste.usage = tmp.usage; + pgste.nodat = tmp.nodat; + pgste_set_unlock(ptep, pgste); + + return 0; +} + +/** + * dat_set_cmma_bits() - Set CMMA bits for a range of guest pages. + * @mc: Cache used for allocations. + * @asce: The ASCE of the guest. + * @gfn: The guest frame of the fist page whose CMMA bits are to set. + * @count: How many pages need to be processed. + * @mask: Which PGSTE bits should be set. + * @bits: Points to an array with the CMMA attributes. + * + * This function sets the CMMA attributes for the given pages. If the input + * buffer has zero length, no action is taken, otherwise the attributes are + * set and the mm->context.uses_cmm flag is set. + * + * Each byte in @bits contains new values for bits 32-39 of the PGSTE. + * Currently, only the fields NT and US are applied. + * + * Return: %0 in case of success, a negative error value otherwise. + */ +int dat_set_cmma_bits(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn, + unsigned long count, unsigned long mask, const uint8_t *bits) +{ + const struct dat_walk_ops ops = { .pte_entry = __dat_set_cmma_pte, }; + struct dat_set_cmma_state state = { .mask = mask, .bits = bits, }; + union crste *crstep; + union pte *ptep; + gfn_t cur; + int rc; + + for (cur = ALIGN_DOWN(gfn, _PAGE_ENTRIES); cur < gfn + count; cur += _PAGE_ENTRIES) { + rc = dat_entry_walk(mc, cur, asce, DAT_WALK_ALLOC, TABLE_TYPE_PAGE_TABLE, + &crstep, &ptep); + if (rc) + return rc; + } + return _dat_walk_gfn_range(gfn, gfn + count, asce, &ops, DAT_WALK_IGN_HOLES, &state); +} diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h index fe8d790a297d..358b756ca8c9 100644 --- a/arch/s390/kvm/dat.h +++ b/arch/s390/kvm/dat.h @@ -17,6 +17,15 @@ #include #include +/* + * Base address and length must be sent at the start of each block, therefore + * it's cheaper to send some clean data, as long as it's less than the size of + * two longs. 
+ */ +#define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *)) +/* For consistency */ +#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX) + #define _ASCE(x) ((union asce) { .val = (x), }) #define NULL_ASCE _ASCE(0) @@ -433,6 +442,17 @@ static inline union crste _crste_fc1(kvm_pfn_t pfn, int tt, bool writable, bool return res; } +union essa_state { + unsigned char val; + struct { + unsigned char : 2; + unsigned char nodat : 1; + unsigned char exception : 1; + unsigned char usage : 2; + unsigned char content : 2; + }; +}; + /** * struct vsie_rmap - reverse mapping for shadow page table entries * @next: pointer to next rmap in the list @@ -522,6 +542,13 @@ bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end); int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level, bool uses_skeys, struct guest_fault *f); +int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty); +long dat_reset_cmma(union asce asce, gfn_t start_gfn); +int dat_peek_cmma(gfn_t start, union asce asce, unsigned int *count, u8 *values); +int dat_get_cmma(union asce asce, gfn_t *start, unsigned int *count, u8 *values, atomic64_t *rem); +int dat_set_cmma_bits(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn, + unsigned long count, unsigned long mask, const uint8_t *bits); + int kvm_s390_mmu_cache_topup(struct kvm_s390_mmu_cache *mc); #define GFP_KVM_S390_MMU_CACHE (GFP_ATOMIC | __GFP_ACCOUNT | __GFP_NOWARN) From a2c17f9270cc3f1c55c35e8b1d151f41d75b40f5 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:48 +0100 Subject: [PATCH 207/267] KVM: s390: New gmap code New gmap (guest map) code. This new gmap code will only be used by KVM. This will replace the existing gmap. Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/Makefile | 2 +- arch/s390/kvm/gmap.c | 1165 ++++++++++++++++++++++++++++++++++++++++ arch/s390/kvm/gmap.h | 244 +++++++++ 3 files changed, 1410 insertions(+), 1 deletion(-) create mode 100644 arch/s390/kvm/gmap.c create mode 100644 arch/s390/kvm/gmap.h diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 84315d2f75fb..21088265402c 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -9,7 +9,7 @@ ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap-vsie.o -kvm-y += dat.o +kvm-y += dat.o gmap.o kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c new file mode 100644 index 000000000000..5736145ef53a --- /dev/null +++ b/arch/s390/kvm/gmap.c @@ -0,0 +1,1165 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Guest memory management for KVM/s390 + * + * Copyright IBM Corp. 
2008, 2020, 2024 + * + * Author(s): Claudio Imbrenda + * Martin Schwidefsky + * David Hildenbrand + * Janosch Frank + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dat.h" +#include "gmap.h" +#include "kvm-s390.h" + +static inline bool kvm_s390_is_in_sie(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.sie_block->prog0c & PROG_IN_SIE; +} + +static int gmap_limit_to_type(gfn_t limit) +{ + if (!limit) + return TABLE_TYPE_REGION1; + if (limit <= _REGION3_SIZE >> PAGE_SHIFT) + return TABLE_TYPE_SEGMENT; + if (limit <= _REGION2_SIZE >> PAGE_SHIFT) + return TABLE_TYPE_REGION3; + if (limit <= _REGION1_SIZE >> PAGE_SHIFT) + return TABLE_TYPE_REGION2; + return TABLE_TYPE_REGION1; +} + +/** + * gmap_new() - Allocate and initialize a guest address space. + * @kvm: The kvm owning the guest. + * @limit: Maximum address of the gmap address space. + * + * Return: A guest address space structure. + */ +struct gmap *gmap_new(struct kvm *kvm, gfn_t limit) +{ + struct crst_table *table; + struct gmap *gmap; + int type; + + type = gmap_limit_to_type(limit); + + gmap = kzalloc(sizeof(*gmap), GFP_KERNEL_ACCOUNT); + if (!gmap) + return NULL; + INIT_LIST_HEAD(&gmap->children); + INIT_LIST_HEAD(&gmap->list); + INIT_LIST_HEAD(&gmap->scb_users); + INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_KVM_S390_MMU_CACHE); + spin_lock_init(&gmap->children_lock); + spin_lock_init(&gmap->host_to_rmap_lock); + refcount_set(&gmap->refcount, 1); + + table = dat_alloc_crst_sleepable(_CRSTE_EMPTY(type).val); + if (!table) { + kfree(gmap); + return NULL; + } + + gmap->asce.val = __pa(table); + gmap->asce.dt = type; + gmap->asce.tl = _ASCE_TABLE_LENGTH; + gmap->asce.x = 1; + gmap->asce.p = 1; + gmap->asce.s = 1; + gmap->kvm = kvm; + set_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags); + + return gmap; +} + +static void gmap_add_child(struct gmap *parent, struct gmap *child) +{ + KVM_BUG_ON(is_ucontrol(parent) && parent->parent, parent->kvm); + KVM_BUG_ON(is_ucontrol(parent) && !owns_page_tables(parent), parent->kvm); + KVM_BUG_ON(!refcount_read(&child->refcount), parent->kvm); + lockdep_assert_held(&parent->children_lock); + + child->parent = parent; + + if (is_ucontrol(parent)) + set_bit(GMAP_FLAG_IS_UCONTROL, &child->flags); + else + clear_bit(GMAP_FLAG_IS_UCONTROL, &child->flags); + + if (test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &parent->flags)) + set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags); + else + clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags); + + if (kvm_is_ucontrol(parent->kvm)) + clear_bit(GMAP_FLAG_OWNS_PAGETABLES, &child->flags); + list_add(&child->list, &parent->children); +} + +struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit) +{ + struct gmap *res; + + lockdep_assert_not_held(&parent->children_lock); + res = gmap_new(parent->kvm, limit); + if (res) { + scoped_guard(spinlock, &parent->children_lock) + gmap_add_child(parent, res); + } + return res; +} + +int gmap_set_limit(struct gmap *gmap, gfn_t limit) +{ + struct kvm_s390_mmu_cache *mc; + int rc, type; + + type = gmap_limit_to_type(limit); + + mc = kvm_s390_new_mmu_cache(); + if (!mc) + return -ENOMEM; + + do { + rc = kvm_s390_mmu_cache_topup(mc); + if (rc) + return rc; + scoped_guard(write_lock, &gmap->kvm->mmu_lock) + rc = dat_set_asce_limit(mc, &gmap->asce, type); + } while (rc == -ENOMEM); + + kvm_s390_free_mmu_cache(mc); + return 0; +} + +static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) +{ + struct vsie_rmap *rmap, *rnext, *head; + struct radix_tree_iter iter; + unsigned long indices[16]; + 
unsigned long index; + void __rcu **slot; + int i, nr; + + /* A radix tree is freed by deleting all of its entries */ + index = 0; + do { + nr = 0; + radix_tree_for_each_slot(slot, root, &iter, index) { + indices[nr] = iter.index; + if (++nr == 16) + break; + } + for (i = 0; i < nr; i++) { + index = indices[i]; + head = radix_tree_delete(root, index); + gmap_for_each_rmap_safe(rmap, rnext, head) + kfree(rmap); + } + } while (nr > 0); +} + +void gmap_remove_child(struct gmap *child) +{ + if (KVM_BUG_ON(!child->parent, child->kvm)) + return; + lockdep_assert_held(&child->parent->children_lock); + + list_del(&child->list); + child->parent = NULL; +} + +/** + * gmap_dispose() - Remove and free a guest address space and its children. + * @gmap: Pointer to the guest address space structure. + */ +void gmap_dispose(struct gmap *gmap) +{ + /* The gmap must have been removed from the parent beforehands */ + KVM_BUG_ON(gmap->parent, gmap->kvm); + /* All children of this gmap must have been removed beforehands */ + KVM_BUG_ON(!list_empty(&gmap->children), gmap->kvm); + /* No VSIE shadow block is allowed to use this gmap */ + KVM_BUG_ON(!list_empty(&gmap->scb_users), gmap->kvm); + /* The ASCE must be valid */ + KVM_BUG_ON(!gmap->asce.val, gmap->kvm); + /* The refcount must be 0 */ + KVM_BUG_ON(refcount_read(&gmap->refcount), gmap->kvm); + + /* Flush tlb of all gmaps */ + asce_flush_tlb(gmap->asce); + + /* Free all DAT tables. */ + dat_free_level(dereference_asce(gmap->asce), owns_page_tables(gmap)); + + /* Free additional data for a shadow gmap */ + if (is_shadow(gmap)) + gmap_rmap_radix_tree_free(&gmap->host_to_rmap); + + kfree(gmap); +} + +/** + * s390_replace_asce() - Try to replace the current ASCE of a gmap with a copy. + * @gmap: The gmap whose ASCE needs to be replaced. + * + * If the ASCE is a SEGMENT type then this function will return -EINVAL, + * otherwise the pointers in the host_to_guest radix tree will keep pointing + * to the wrong pages, causing use-after-free and memory corruption. + * If the allocation of the new top level page table fails, the ASCE is not + * replaced. + * In any case, the old ASCE is always removed from the gmap CRST list. + * Therefore the caller has to make sure to save a pointer to it + * beforehand, unless a leak is actually intended. + * + * Return: 0 in case of success, -EINVAL if the ASCE is segment type ASCE, + * -ENOMEM if runinng out of memory. 
+ */ +int s390_replace_asce(struct gmap *gmap) +{ + struct crst_table *table; + union asce asce; + + /* Replacing segment type ASCEs would cause serious issues */ + if (gmap->asce.dt == ASCE_TYPE_SEGMENT) + return -EINVAL; + + table = dat_alloc_crst_sleepable(0); + if (!table) + return -ENOMEM; + memcpy(table, dereference_asce(gmap->asce), sizeof(*table)); + + /* Set new table origin while preserving existing ASCE control bits */ + asce = gmap->asce; + asce.rsto = virt_to_pfn(table); + WRITE_ONCE(gmap->asce, asce); + + return 0; +} + +bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint) +{ + struct kvm *kvm = gmap->kvm; + struct kvm_vcpu *vcpu; + gfn_t prefix_gfn; + unsigned long i; + + if (is_shadow(gmap)) + return false; + kvm_for_each_vcpu(i, vcpu, kvm) { + /* Match against both prefix pages */ + prefix_gfn = gpa_to_gfn(kvm_s390_get_prefix(vcpu)); + if (prefix_gfn < end && gfn <= prefix_gfn + 1) { + if (hint && kvm_s390_is_in_sie(vcpu)) + return false; + VCPU_EVENT(vcpu, 2, "gmap notifier for %llx-%llx", + gfn_to_gpa(gfn), gfn_to_gpa(end)); + kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); + } + } + return true; +} + +struct clear_young_pte_priv { + struct gmap *gmap; + bool young; +}; + +static long gmap_clear_young_pte(union pte *ptep, gfn_t gfn, gfn_t end, struct dat_walk *walk) +{ + struct clear_young_pte_priv *p = walk->priv; + union pgste pgste; + union pte pte, new; + + pte = READ_ONCE(*ptep); + + if (!pte.s.pr || (!pte.s.y && pte.h.i)) + return 0; + + pgste = pgste_get_lock(ptep); + if (!pgste.prefix_notif || gmap_mkold_prefix(p->gmap, gfn, end)) { + new = pte; + new.h.i = 1; + new.s.y = 0; + if ((new.s.d || !new.h.p) && !new.s.s) + folio_set_dirty(pfn_folio(pte.h.pfra)); + new.s.d = 0; + new.h.p = 1; + + pgste.prefix_notif = 0; + pgste = __dat_ptep_xchg(ptep, pgste, new, gfn, walk->asce, uses_skeys(p->gmap)); + } + p->young = 1; + pgste_set_unlock(ptep, pgste); + return 0; +} + +static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, struct dat_walk *walk) +{ + struct clear_young_pte_priv *priv = walk->priv; + union crste crste, new; + + crste = READ_ONCE(*crstep); + + if (!crste.h.fc) + return 0; + if (!crste.s.fc1.y && crste.h.i) + return 0; + if (!crste_prefix(crste) || gmap_mkold_prefix(priv->gmap, gfn, end)) { + new = crste; + new.h.i = 1; + new.s.fc1.y = 0; + new.s.fc1.prefix_notif = 0; + if (new.s.fc1.d || !new.h.p) + folio_set_dirty(phys_to_folio(crste_origin_large(crste))); + new.s.fc1.d = 0; + new.h.p = 1; + dat_crstep_xchg(crstep, new, gfn, walk->asce); + } + priv->young = 1; + return 0; +} + +/** + * gmap_age_gfn() - Clear young. + * @gmap: The guest gmap. + * @start: The first gfn to test. + * @end: The gfn after the last one to test. + * + * Context: Called with the kvm mmu write lock held. + * Return: 1 if any page in the given range was young, otherwise 0. 
+ */ +bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end) +{ + const struct dat_walk_ops ops = { + .pte_entry = gmap_clear_young_pte, + .pmd_entry = gmap_clear_young_crste, + .pud_entry = gmap_clear_young_crste, + }; + struct clear_young_pte_priv priv = { + .gmap = gmap, + .young = false, + }; + + _dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv); + + return priv.young; +} + +struct gmap_unmap_priv { + struct gmap *gmap; + struct kvm_memory_slot *slot; +}; + +static long _gmap_unmap_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *w) +{ + struct gmap_unmap_priv *priv = w->priv; + struct folio *folio = NULL; + unsigned long vmaddr; + union pgste pgste; + + pgste = pgste_get_lock(ptep); + if (ptep->s.pr && pgste.usage == PGSTE_GPS_USAGE_UNUSED) { + vmaddr = __gfn_to_hva_memslot(priv->slot, gfn); + gmap_helper_try_set_pte_unused(priv->gmap->kvm->mm, vmaddr); + } + if (ptep->s.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags)) + folio = pfn_folio(ptep->h.pfra); + pgste = gmap_ptep_xchg(priv->gmap, ptep, _PTE_EMPTY, pgste, gfn); + pgste_set_unlock(ptep, pgste); + if (folio) + uv_convert_from_secure_folio(folio); + + return 0; +} + +static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct gmap_unmap_priv *priv = walk->priv; + struct folio *folio = NULL; + + if (crstep->h.fc) { + if (crstep->s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags)) + folio = phys_to_folio(crste_origin_large(*crstep)); + gmap_crstep_xchg(priv->gmap, crstep, _CRSTE_EMPTY(crstep->h.tt), gfn); + if (folio) + uv_convert_from_secure_folio(folio); + } + + return 0; +} + +/** + * gmap_unmap_gfn_range() - Unmap a range of guest addresses. + * @gmap: The gmap to act on. + * @slot: The memslot in which the range is located. + * @start: The first gfn to unmap. + * @end: The gfn after the last one to unmap. + * + * Context: Called with the kvm mmu write lock held. + * Return: false + */ +bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end) +{ + const struct dat_walk_ops ops = { + .pte_entry = _gmap_unmap_pte, + .pmd_entry = _gmap_unmap_crste, + .pud_entry = _gmap_unmap_crste, + }; + struct gmap_unmap_priv priv = { + .gmap = gmap, + .slot = slot, + }; + + lockdep_assert_held_write(&gmap->kvm->mmu_lock); + + _dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv); + return false; +} + +static union pgste __pte_test_and_clear_softdirty(union pte *ptep, union pgste pgste, gfn_t gfn, + struct gmap *gmap) +{ + union pte pte = READ_ONCE(*ptep); + + if (!pte.s.pr || (pte.h.p && !pte.s.sd)) + return pgste; + + /* + * If this page contains one or more prefixes of vCPUS that are currently + * running, do not reset the protection, leave it marked as dirty. 
+ */ + if (!pgste.prefix_notif || gmap_mkold_prefix(gmap, gfn, gfn + 1)) { + pte.h.p = 1; + pte.s.sd = 0; + pgste = gmap_ptep_xchg(gmap, ptep, pte, pgste, gfn); + } + + mark_page_dirty(gmap->kvm, gfn); + + return pgste; +} + +static long _pte_test_and_clear_softdirty(union pte *ptep, gfn_t gfn, gfn_t end, + struct dat_walk *walk) +{ + struct gmap *gmap = walk->priv; + union pgste pgste; + + pgste = pgste_get_lock(ptep); + pgste = __pte_test_and_clear_softdirty(ptep, pgste, gfn, gmap); + pgste_set_unlock(ptep, pgste); + return 0; +} + +static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t end, + struct dat_walk *walk) +{ + struct gmap *gmap = walk->priv; + union crste crste, new; + + if (fatal_signal_pending(current)) + return 1; + crste = READ_ONCE(*table); + if (!crste.h.fc) + return 0; + if (crste.h.p && !crste.s.fc1.sd) + return 0; + + /* + * If this large page contains one or more prefixes of vCPUs that are + * currently running, do not reset the protection, leave it marked as + * dirty. + */ + if (!crste.s.fc1.prefix_notif || gmap_mkold_prefix(gmap, gfn, end)) { + new = crste; + new.h.p = 1; + new.s.fc1.sd = 0; + gmap_crstep_xchg(gmap, table, new, gfn); + } + + for ( ; gfn < end; gfn++) + mark_page_dirty(gmap->kvm, gfn); + + return 0; +} + +void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end) +{ + const struct dat_walk_ops walk_ops = { + .pte_entry = _pte_test_and_clear_softdirty, + .pmd_entry = _crste_test_and_clear_softdirty, + .pud_entry = _crste_test_and_clear_softdirty, + }; + + lockdep_assert_held(&gmap->kvm->mmu_lock); + + _dat_walk_gfn_range(start, end, gmap->asce, &walk_ops, 0, gmap); +} + +static int gmap_handle_minor_crste_fault(union asce asce, struct guest_fault *f) +{ + union crste newcrste, oldcrste = READ_ONCE(*f->crstep); + + /* Somehow the crste is not large anymore, let the slow path deal with it. */ + if (!oldcrste.h.fc) + return 1; + + f->pfn = PHYS_PFN(large_crste_to_phys(oldcrste, f->gfn)); + f->writable = oldcrste.s.fc1.w; + + /* Appropriate permissions already (race with another handler), nothing to do. */ + if (!oldcrste.h.i && !(f->write_attempt && oldcrste.h.p)) + return 0; + + if (!f->write_attempt || oldcrste.s.fc1.w) { + f->write_attempt |= oldcrste.s.fc1.w && oldcrste.s.fc1.d; + newcrste = oldcrste; + newcrste.h.i = 0; + newcrste.s.fc1.y = 1; + if (f->write_attempt) { + newcrste.h.p = 0; + newcrste.s.fc1.d = 1; + newcrste.s.fc1.sd = 1; + } + if (!oldcrste.s.fc1.d && newcrste.s.fc1.d) + SetPageDirty(phys_to_page(crste_origin_large(newcrste))); + /* In case of races, let the slow path deal with it. */ + return !dat_crstep_xchg_atomic(f->crstep, oldcrste, newcrste, f->gfn, asce); + } + /* Trying to write on a read-only page, let the slow path deal with it. */ + return 1; +} + +static int _gmap_handle_minor_pte_fault(struct gmap *gmap, union pgste *pgste, + struct guest_fault *f) +{ + union pte newpte, oldpte = READ_ONCE(*f->ptep); + + f->pfn = oldpte.h.pfra; + f->writable = oldpte.s.w; + + /* Appropriate permissions already (race with another handler), nothing to do. */ + if (!oldpte.h.i && !(f->write_attempt && oldpte.h.p)) + return 0; + /* Trying to write on a read-only page, let the slow path deal with it. 
*/ + if (!oldpte.s.pr || (f->write_attempt && !oldpte.s.w)) + return 1; + + newpte = oldpte; + newpte.h.i = 0; + newpte.s.y = 1; + if (f->write_attempt) { + newpte.h.p = 0; + newpte.s.d = 1; + newpte.s.sd = 1; + } + if (!oldpte.s.d && newpte.s.d) + SetPageDirty(pfn_to_page(newpte.h.pfra)); + *pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn); + + return 0; +} + +/** + * gmap_try_fixup_minor() -- Try to fixup a minor gmap fault. + * @gmap: The gmap whose fault needs to be resolved. + * @fault: Describes the fault that is being resolved. + * + * A minor fault is a fault that can be resolved quickly within gmap. + * The page is already mapped, the fault is only due to dirty/young tracking. + * + * Return: 0 in case of success, < 0 in case of error, > 0 if the fault could + * not be resolved and needs to go through the slow path. + */ +int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault) +{ + union pgste pgste; + int rc; + + lockdep_assert_held(&gmap->kvm->mmu_lock); + + rc = dat_entry_walk(NULL, fault->gfn, gmap->asce, DAT_WALK_LEAF, TABLE_TYPE_PAGE_TABLE, + &fault->crstep, &fault->ptep); + /* If a PTE or a leaf CRSTE could not be reached, slow path. */ + if (rc) + return 1; + + if (fault->ptep) { + pgste = pgste_get_lock(fault->ptep); + rc = _gmap_handle_minor_pte_fault(gmap, &pgste, fault); + if (!rc && fault->callback) + fault->callback(fault); + pgste_set_unlock(fault->ptep, pgste); + } else { + rc = gmap_handle_minor_crste_fault(gmap->asce, fault); + if (!rc && fault->callback) + fault->callback(fault); + } + return rc; +} + +static inline bool gmap_2g_allowed(struct gmap *gmap, gfn_t gfn) +{ + return false; +} + +static inline bool gmap_1m_allowed(struct gmap *gmap, gfn_t gfn) +{ + return false; +} + +int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f) +{ + unsigned int order; + int rc, level; + + lockdep_assert_held(&gmap->kvm->mmu_lock); + + level = TABLE_TYPE_PAGE_TABLE; + if (f->page) { + order = folio_order(page_folio(f->page)); + if (order >= get_order(_REGION3_SIZE) && gmap_2g_allowed(gmap, f->gfn)) + level = TABLE_TYPE_REGION3; + else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f->gfn)) + level = TABLE_TYPE_SEGMENT; + } + rc = dat_link(mc, gmap->asce, level, uses_skeys(gmap), f); + KVM_BUG_ON(rc == -EINVAL, gmap->kvm); + return rc; +} + +static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, + gfn_t p_gfn, gfn_t c_gfn, bool force_alloc) +{ + struct page_table *pt; + union crste newcrste; + union crste *crstep; + union pte *ptep; + int rc; + + if (force_alloc) + rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC, + TABLE_TYPE_PAGE_TABLE, &crstep, &ptep); + else + rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC_CONTINUE, + TABLE_TYPE_SEGMENT, &crstep, &ptep); + if (rc) + return rc; + if (!ptep) { + newcrste = _crste_fc0(p_gfn, TABLE_TYPE_SEGMENT); + newcrste.h.i = 1; + newcrste.h.fc0.tl = 1; + } else { + pt = pte_table_start(ptep); + dat_set_ptval(pt, PTVAL_VMADDR, p_gfn >> (_SEGMENT_SHIFT - PAGE_SHIFT)); + newcrste = _crste_fc0(virt_to_pfn(pt), TABLE_TYPE_SEGMENT); + } + rc = dat_entry_walk(mc, c_gfn, gmap->asce, DAT_WALK_ALLOC, TABLE_TYPE_SEGMENT, + &crstep, &ptep); + if (rc) + return rc; + dat_crstep_xchg(crstep, newcrste, c_gfn, gmap->asce); + return 0; +} + +static int gmap_ucas_translate_simple(struct gmap *gmap, gpa_t *gaddr, union crste **crstepp) +{ + union pte *ptep; + int rc; + + rc = dat_entry_walk(NULL, gpa_to_gfn(*gaddr), 
gmap->asce, DAT_WALK_CONTINUE, + TABLE_TYPE_SEGMENT, crstepp, &ptep); + if (rc || (!ptep && !crste_is_ucas(**crstepp))) + return -EREMOTE; + if (!ptep) + return 1; + *gaddr &= ~_SEGMENT_MASK; + *gaddr |= dat_get_ptval(pte_table_start(ptep), PTVAL_VMADDR) << _SEGMENT_SHIFT; + return 0; +} + +/** + * gmap_ucas_translate() - Translate a vcpu address into a host gmap address + * @mc: The memory cache to be used for allocations. + * @gmap: The per-cpu gmap. + * @gaddr: Pointer to the address to be translated, will get overwritten with + * the translated address in case of success. + * Translates the per-vCPU guest address into a fake guest address, which can + * then be used with the fake memslots that are identity mapping userspace. + * This allows ucontrol VMs to use the normal fault resolution path, like + * normal VMs. + * + * Return: %0 in case of success, otherwise %-EREMOTE. + */ +int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr) +{ + gpa_t translated_address; + union crste *crstep; + gfn_t gfn; + int rc; + + gfn = gpa_to_gfn(*gaddr); + + scoped_guard(read_lock, &gmap->kvm->mmu_lock) { + rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep); + if (rc <= 0) + return rc; + } + do { + scoped_guard(write_lock, &gmap->kvm->mmu_lock) { + rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep); + if (rc <= 0) + return rc; + translated_address = (*gaddr & ~_SEGMENT_MASK) | + (crstep->val & _SEGMENT_MASK); + rc = gmap_ucas_map_one(mc, gmap, gpa_to_gfn(translated_address), gfn, true); + } + if (!rc) { + *gaddr = translated_address; + return 0; + } + if (rc != -ENOMEM) + return -EREMOTE; + rc = kvm_s390_mmu_cache_topup(mc); + if (rc) + return rc; + } while (1); + return 0; +} + +int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count) +{ + struct kvm_s390_mmu_cache *mc; + int rc; + + mc = kvm_s390_new_mmu_cache(); + if (!mc) + return -ENOMEM; + + while (count) { + scoped_guard(write_lock, &gmap->kvm->mmu_lock) + rc = gmap_ucas_map_one(mc, gmap, p_gfn, c_gfn, false); + if (rc == -ENOMEM) { + rc = kvm_s390_mmu_cache_topup(mc); + if (rc) + return rc; + continue; + } + if (rc) + return rc; + + count--; + c_gfn += _PAGE_ENTRIES; + p_gfn += _PAGE_ENTRIES; + } + return rc; +} + +static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn) +{ + union crste *crstep; + union pte *ptep; + int rc; + + rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep); + if (!rc) + dat_crstep_xchg(crstep, _PMD_EMPTY, c_gfn, gmap->asce); +} + +void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count) +{ + guard(read_lock)(&gmap->kvm->mmu_lock); + + for ( ; count; count--, c_gfn += _PAGE_ENTRIES) + gmap_ucas_unmap_one(gmap, c_gfn); +} + +static long _gmap_split_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct gmap *gmap = walk->priv; + union crste crste, newcrste; + + crste = READ_ONCE(*crstep); + newcrste = _CRSTE_EMPTY(crste.h.tt); + + while (crste_leaf(crste)) { + if (crste_prefix(crste)) + gmap_unmap_prefix(gmap, gfn, next); + if (crste.s.fc1.vsie_notif) + gmap_handle_vsie_unshadow_event(gmap, gfn); + if (dat_crstep_xchg_atomic(crstep, crste, newcrste, gfn, walk->asce)) + break; + crste = READ_ONCE(*crstep); + } + + if (need_resched()) + return next; + + return 0; +} + +void gmap_split_huge_pages(struct gmap *gmap) +{ + const struct dat_walk_ops ops = { + .pmd_entry = _gmap_split_crste, + .pud_entry = _gmap_split_crste, + }; + gfn_t start = 0; + + do { + 
scoped_guard(read_lock, &gmap->kvm->mmu_lock) + start = _dat_walk_gfn_range(start, asce_end(gmap->asce), gmap->asce, + &ops, DAT_WALK_IGN_HOLES, gmap); + cond_resched(); + } while (start); +} + +static int _gmap_enable_skeys(struct gmap *gmap) +{ + gfn_t start = 0; + int rc; + + if (uses_skeys(gmap)) + return 0; + + set_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags); + rc = gmap_helper_disable_cow_sharing(); + if (rc) { + clear_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags); + return rc; + } + + do { + scoped_guard(write_lock, &gmap->kvm->mmu_lock) + start = dat_reset_skeys(gmap->asce, start); + cond_resched(); + } while (start); + return 0; +} + +int gmap_enable_skeys(struct gmap *gmap) +{ + int rc; + + mmap_write_lock(gmap->kvm->mm); + rc = _gmap_enable_skeys(gmap); + mmap_write_unlock(gmap->kvm->mm); + return rc; +} + +static long _destroy_pages_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + if (!ptep->s.pr) + return 0; + __kvm_s390_pv_destroy_page(phys_to_page(pte_origin(*ptep))); + if (need_resched()) + return next; + return 0; +} + +static long _destroy_pages_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + phys_addr_t origin, cur, end; + + if (!crstep->h.fc || !crstep->s.fc1.pr) + return 0; + + origin = crste_origin_large(*crstep); + cur = ((max(gfn, walk->start) - gfn) << PAGE_SHIFT) + origin; + end = ((min(next, walk->end) - gfn) << PAGE_SHIFT) + origin; + for ( ; cur < end; cur += PAGE_SIZE) + __kvm_s390_pv_destroy_page(phys_to_page(cur)); + if (need_resched()) + return next; + return 0; +} + +int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible) +{ + const struct dat_walk_ops ops = { + .pte_entry = _destroy_pages_pte, + .pmd_entry = _destroy_pages_crste, + .pud_entry = _destroy_pages_crste, + }; + + do { + scoped_guard(read_lock, &gmap->kvm->mmu_lock) + start = _dat_walk_gfn_range(start, end, gmap->asce, &ops, + DAT_WALK_IGN_HOLES, NULL); + if (interruptible && fatal_signal_pending(current)) + return -EINTR; + cond_resched(); + } while (start && start < end); + return 0; +} + +int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level) +{ + struct vsie_rmap *rmap __free(kvfree) = NULL; + struct vsie_rmap *temp; + void __rcu **slot; + int rc = 0; + + KVM_BUG_ON(!is_shadow(sg), sg->kvm); + lockdep_assert_held(&sg->host_to_rmap_lock); + + rmap = kzalloc(sizeof(*rmap), GFP_ATOMIC); + if (!rmap) + return -ENOMEM; + + rmap->r_gfn = r_gfn; + rmap->level = level; + slot = radix_tree_lookup_slot(&sg->host_to_rmap, p_gfn); + if (slot) { + rmap->next = radix_tree_deref_slot_protected(slot, &sg->host_to_rmap_lock); + for (temp = rmap->next; temp; temp = temp->next) { + if (temp->val == rmap->val) + return 0; + } + radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); + } else { + rmap->next = NULL; + rc = radix_tree_insert(&sg->host_to_rmap, p_gfn, rmap); + if (rc) + return rc; + } + rmap = NULL; + + return 0; +} + +int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, + kvm_pfn_t pfn, int level, bool wr) +{ + union crste *crstep; + union pgste pgste; + union pte *ptep; + union pte pte; + int flags, rc; + + KVM_BUG_ON(!is_shadow(sg), sg->kvm); + lockdep_assert_held(&sg->parent->children_lock); + + flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? 
DAT_WALK_USES_SKEYS : 0); + rc = dat_entry_walk(mc, p_gfn, sg->parent->asce, flags, + TABLE_TYPE_PAGE_TABLE, &crstep, &ptep); + if (rc) + return rc; + if (level <= TABLE_TYPE_REGION1) { + scoped_guard(spinlock, &sg->host_to_rmap_lock) + rc = gmap_insert_rmap(sg, p_gfn, r_gfn, level); + } + if (rc) + return rc; + + if (!pgste_get_trylock(ptep, &pgste)) + return -EAGAIN; + pte = ptep->s.pr ? *ptep : _pte(pfn, wr, false, false); + pte.h.p = 1; + pgste = _gmap_ptep_xchg(sg->parent, ptep, pte, pgste, p_gfn, false); + pgste.vsie_notif = 1; + pgste_set_unlock(ptep, pgste); + + return 0; +} + +static long __set_cmma_dirty_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + __atomic64_or(PGSTE_CMMA_D_BIT, &pgste_of(ptep)->val); + if (need_resched()) + return next; + return 0; +} + +void gmap_set_cmma_all_dirty(struct gmap *gmap) +{ + const struct dat_walk_ops ops = { .pte_entry = __set_cmma_dirty_pte, }; + gfn_t gfn = 0; + + do { + scoped_guard(read_lock, &gmap->kvm->mmu_lock) + gfn = _dat_walk_gfn_range(gfn, asce_end(gmap->asce), gmap->asce, &ops, + DAT_WALK_IGN_HOLES, NULL); + cond_resched(); + } while (gfn); +} + +static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level) +{ + unsigned long align = PAGE_SIZE; + gpa_t gaddr = gfn_to_gpa(r_gfn); + union crste *crstep; + union crste crste; + union pte *ptep; + + if (level > TABLE_TYPE_PAGE_TABLE) + align = 1UL << (11 * level + _SEGMENT_SHIFT); + kvm_s390_vsie_gmap_notifier(sg, ALIGN_DOWN(gaddr, align), ALIGN(gaddr + 1, align)); + if (dat_entry_walk(NULL, r_gfn, sg->asce, 0, level, &crstep, &ptep)) + return; + if (ptep) { + if (READ_ONCE(*ptep).val != _PTE_EMPTY.val) + dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg)); + return; + } + crste = READ_ONCE(*crstep); + dat_crstep_clear(crstep, r_gfn, sg->asce); + if (crste_leaf(crste) || crste.h.i) + return; + if (is_pmd(crste)) + dat_free_pt(dereference_pmd(crste.pmd)); + else + dat_free_level(dereference_crste(crste), true); +} + +static void gmap_unshadow(struct gmap *sg) +{ + struct gmap_cache *gmap_cache, *next; + + KVM_BUG_ON(!is_shadow(sg), sg->kvm); + KVM_BUG_ON(!sg->parent, sg->kvm); + + lockdep_assert_held(&sg->parent->children_lock); + + gmap_remove_child(sg); + kvm_s390_vsie_gmap_notifier(sg, 0, -1UL); + + list_for_each_entry_safe(gmap_cache, next, &sg->scb_users, list) { + gmap_cache->gmap = NULL; + list_del(&gmap_cache->list); + } + + gmap_put(sg); +} + +void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn) +{ + struct vsie_rmap *rmap, *rnext, *head; + struct gmap *sg, *next; + gfn_t start, end; + + list_for_each_entry_safe(sg, next, &parent->children, list) { + start = sg->guest_asce.rsto; + end = start + sg->guest_asce.tl + 1; + if (!sg->guest_asce.r && gfn >= start && gfn < end) { + gmap_unshadow(sg); + continue; + } + scoped_guard(spinlock, &sg->host_to_rmap_lock) + head = radix_tree_delete(&sg->host_to_rmap, gfn); + gmap_for_each_rmap_safe(rmap, rnext, head) + gmap_unshadow_level(sg, rmap->r_gfn, rmap->level); + } +} + +/** + * gmap_find_shadow() - Find a specific ASCE in the list of shadow tables. + * @parent: Pointer to the parent gmap. + * @asce: ASCE for which the shadow table is created. + * @edat_level: Edat level to be used for the shadow translation. + * + * Context: Called with parent->children_lock held. + * + * Return: The pointer to a gmap if a shadow table with the given asce is + * already available, ERR_PTR(-EAGAIN) if another one is just being created, + * otherwise NULL. 
+ */ +static struct gmap *gmap_find_shadow(struct gmap *parent, union asce asce, int edat_level) +{ + struct gmap *sg; + + lockdep_assert_held(&parent->children_lock); + list_for_each_entry(sg, &parent->children, list) { + if (!gmap_is_shadow_valid(sg, asce, edat_level)) + continue; + return sg; + } + return NULL; +} + +static int gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg) +{ + KVM_BUG_ON(1, sg->kvm); + return -EINVAL; +} + +/** + * gmap_create_shadow() - Create/find a shadow guest address space. + * @mc: The cache to use to allocate dat tables. + * @parent: Pointer to the parent gmap. + * @asce: ASCE for which the shadow table is created. + * @edat_level: Edat level to be used for the shadow translation. + * + * The pages of the top level page table referred by the asce parameter + * will be set to read-only and marked in the PGSTEs of the kvm process. + * The shadow table will be removed automatically on any change to the + * PTE mapping for the source table. + * + * Return: A guest address space structure, ERR_PTR(-ENOMEM) if out of memory, + * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the + * parent gmap table could not be protected. + */ +struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *parent, + union asce asce, int edat_level) +{ + struct gmap *sg, *new; + int rc; + + scoped_guard(spinlock, &parent->children_lock) + sg = gmap_find_shadow(parent, asce, edat_level); + if (sg) + return sg; + /* Create a new shadow gmap. */ + new = gmap_new(parent->kvm, asce.r ? 1UL << (64 - PAGE_SHIFT) : asce_end(asce)); + if (!new) + return ERR_PTR(-ENOMEM); + new->guest_asce = asce; + new->edat_level = edat_level; + set_bit(GMAP_FLAG_SHADOW, &new->flags); + + scoped_guard(spinlock, &parent->children_lock) { + /* Recheck if another CPU created the same shadow. */ + sg = gmap_find_shadow(parent, asce, edat_level); + if (sg) { + gmap_put(new); + return sg; + } + if (asce.r) { + /* Only allow one real-space gmap shadow. */ + list_for_each_entry(sg, &parent->children, list) { + if (sg->guest_asce.r) { + scoped_guard(write_lock, &parent->kvm->mmu_lock) + gmap_unshadow(sg); + break; + } + } + gmap_add_child(parent, new); + /* Nothing to protect, return right away. */ + return new; + } + } + + new->parent = parent; + /* Protect while inserting, protects against invalidation races. */ + rc = gmap_protect_asce_top_level(mc, new); + if (rc) { + new->parent = NULL; + gmap_put(new); + return ERR_PTR(rc); + } + return new; +} diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h new file mode 100644 index 000000000000..ccb5cd751e31 --- /dev/null +++ b/arch/s390/kvm/gmap.h @@ -0,0 +1,244 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KVM guest address space mapping code + * + * Copyright IBM Corp. 2007, 2016, 2025 + * Author(s): Martin Schwidefsky + * Claudio Imbrenda + */ + +#ifndef ARCH_KVM_S390_GMAP_H +#define ARCH_KVM_S390_GMAP_H + +#include "dat.h" + +/** + * enum gmap_flags - Flags of a gmap. + * + * @GMAP_FLAG_SHADOW: The gmap is a vsie shadow gmap. + * @GMAP_FLAG_OWNS_PAGETABLES: The gmap owns all dat levels; normally 1, is 0 + * only for ucontrol per-cpu gmaps, since they + * share the page tables with the main gmap. + * @GMAP_FLAG_IS_UCONTROL: The gmap is ucontrol (main gmap or per-cpu gmap). + * @GMAP_FLAG_ALLOW_HPAGE_1M: 1M hugepages are allowed for this gmap, + * independently of the page size used by userspace. 
+ * @GMAP_FLAG_ALLOW_HPAGE_2G: 2G hugepages are allowed for this gmap, + * independently of the page size used by userspace. + * @GMAP_FLAG_PFAULT_ENABLED: Pfault is enabled for the gmap. + * @GMAP_FLAG_USES_SKEYS: If the guest uses storage keys. + * @GMAP_FLAG_USES_CMM: Whether the guest uses CMMA. + * @GMAP_FLAG_EXPORT_ON_UNMAP: Whether to export guest pages when unmapping. + */ +enum gmap_flags { + GMAP_FLAG_SHADOW = 0, + GMAP_FLAG_OWNS_PAGETABLES, + GMAP_FLAG_IS_UCONTROL, + GMAP_FLAG_ALLOW_HPAGE_1M, + GMAP_FLAG_ALLOW_HPAGE_2G, + GMAP_FLAG_PFAULT_ENABLED, + GMAP_FLAG_USES_SKEYS, + GMAP_FLAG_USES_CMM, + GMAP_FLAG_EXPORT_ON_UNMAP, +}; + +/** + * struct gmap_struct - Guest address space. + * + * @flags: GMAP_FLAG_* flags. + * @edat_level: The edat level of this shadow gmap. + * @kvm: The vm. + * @asce: The ASCE used by this gmap. + * @list: List head used in children gmaps for the children gmap list. + * @children_lock: Protects children and scb_users. + * @children: List of child gmaps of this gmap. + * @scb_users: List of vsie_scb that use this shadow gmap. + * @parent: Parent gmap of a child gmap. + * @guest_asce: Original ASCE of this shadow gmap. + * @host_to_rmap_lock: Protects host_to_rmap. + * @host_to_rmap: Radix tree mapping host addresses to guest addresses. + */ +struct gmap { + unsigned long flags; + unsigned char edat_level; + struct kvm *kvm; + union asce asce; + struct list_head list; + spinlock_t children_lock; /* Protects: children, scb_users */ + struct list_head children; + struct list_head scb_users; + struct gmap *parent; + union asce guest_asce; + spinlock_t host_to_rmap_lock; /* Protects host_to_rmap */ + struct radix_tree_root host_to_rmap; + refcount_t refcount; +}; + +struct gmap_cache { + struct list_head list; + struct gmap *gmap; +}; + +#define gmap_for_each_rmap_safe(pos, n, head) \ + for (pos = (head); n = pos ? 
pos->next : NULL, pos; pos = n) + +int s390_replace_asce(struct gmap *gmap); +bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint); +bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end); +bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end); +int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault); +struct gmap *gmap_new(struct kvm *kvm, gfn_t limit); +struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit); +void gmap_remove_child(struct gmap *child); +void gmap_dispose(struct gmap *gmap); +int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *fault); +void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end); +int gmap_set_limit(struct gmap *gmap, gfn_t limit); +int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr); +int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count); +void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count); +int gmap_enable_skeys(struct gmap *gmap); +int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible); +int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level); +int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, + kvm_pfn_t pfn, int level, bool wr); +void gmap_set_cmma_all_dirty(struct gmap *gmap); +void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn); +struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, + union asce asce, int edat_level); +void gmap_split_huge_pages(struct gmap *gmap); + +static inline bool uses_skeys(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags); +} + +static inline bool uses_cmm(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_USES_CMM, &gmap->flags); +} + +static inline bool pfault_enabled(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_PFAULT_ENABLED, &gmap->flags); +} + +static inline bool is_ucontrol(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_IS_UCONTROL, &gmap->flags); +} + +static inline bool is_shadow(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_SHADOW, &gmap->flags); +} + +static inline bool owns_page_tables(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags); +} + +static inline struct gmap *gmap_put(struct gmap *gmap) +{ + if (refcount_dec_and_test(&gmap->refcount)) + gmap_dispose(gmap); + return NULL; +} + +static inline void gmap_get(struct gmap *gmap) +{ + WARN_ON_ONCE(unlikely(!refcount_inc_not_zero(&gmap->refcount))); +} + +static inline void gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn) +{ + scoped_guard(spinlock, &parent->children_lock) + _gmap_handle_vsie_unshadow_event(parent, gfn); +} + +static inline bool gmap_mkold_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end) +{ + return _gmap_unmap_prefix(gmap, gfn, end, true); +} + +static inline bool gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end) +{ + return _gmap_unmap_prefix(gmap, gfn, end, false); +} + +static inline union pgste _gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, union pte newpte, + union pgste pgste, gfn_t gfn, bool needs_lock) +{ + lockdep_assert_held(&gmap->kvm->mmu_lock); + if (!needs_lock) + lockdep_assert_held(&gmap->children_lock); + else + lockdep_assert_not_held(&gmap->children_lock); + + if (pgste.prefix_notif && (newpte.h.p || newpte.h.i)) { + pgste.prefix_notif = 0; + gmap_unmap_prefix(gmap, 
gfn, gfn + 1); + } + if (pgste.vsie_notif && (ptep->h.p != newpte.h.p || newpte.h.i)) { + pgste.vsie_notif = 0; + if (needs_lock) + gmap_handle_vsie_unshadow_event(gmap, gfn); + else + _gmap_handle_vsie_unshadow_event(gmap, gfn); + } + return __dat_ptep_xchg(ptep, pgste, newpte, gfn, gmap->asce, uses_skeys(gmap)); +} + +static inline union pgste gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, union pte newpte, + union pgste pgste, gfn_t gfn) +{ + return _gmap_ptep_xchg(gmap, ptep, newpte, pgste, gfn, true); +} + +static inline void _gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne, + gfn_t gfn, bool needs_lock) +{ + unsigned long align = 8 + (is_pmd(*crstep) ? 0 : 11); + + lockdep_assert_held(&gmap->kvm->mmu_lock); + if (!needs_lock) + lockdep_assert_held(&gmap->children_lock); + + gfn = ALIGN_DOWN(gfn, align); + if (crste_prefix(*crstep) && (ne.h.p || ne.h.i || !crste_prefix(ne))) { + ne.s.fc1.prefix_notif = 0; + gmap_unmap_prefix(gmap, gfn, gfn + align); + } + if (crste_leaf(*crstep) && crstep->s.fc1.vsie_notif && + (ne.h.p || ne.h.i || !ne.s.fc1.vsie_notif)) { + ne.s.fc1.vsie_notif = 0; + if (needs_lock) + gmap_handle_vsie_unshadow_event(gmap, gfn); + else + _gmap_handle_vsie_unshadow_event(gmap, gfn); + } + dat_crstep_xchg(crstep, ne, gfn, gmap->asce); +} + +static inline void gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne, + gfn_t gfn) +{ + return _gmap_crstep_xchg(gmap, crstep, ne, gfn, true); +} + +/** + * gmap_is_shadow_valid() - check if a shadow guest address space matches the + * given properties and is still valid. + * @sg: Pointer to the shadow guest address space structure. + * @asce: ASCE for which the shadow table is requested. + * @edat_level: Edat level to be used for the shadow translation. + * + * Return: true if the gmap shadow is still valid and matches the given + * properties and the caller can continue using it; false otherwise, the + * caller has to request a new shadow gmap in this case. + */ +static inline bool gmap_is_shadow_valid(struct gmap *sg, union asce asce, int edat_level) +{ + return sg->guest_asce.val == asce.val && sg->edat_level == edat_level; +} + +#endif /* ARCH_KVM_S390_GMAP_H */ From e907ae5301338e7c1109311bbb3449fd9bd5d1ca Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:49 +0100 Subject: [PATCH 208/267] KVM: s390: Add helper functions for fault handling Add some helper functions for handling multiple guest faults at the same time. This will be needed for VSIE, where a nested guest access also needs to access all the page tables that map it. 
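As an illustration of how the helpers below are meant to be used together (a sketch only, not code added by this patch; the local names, the four-entry array and start_gfn are made up for the example), a caller that needs several guest pages pinned at once would do roughly:

	struct guest_fault f[4] = {};
	unsigned long seq;
	int rc;

	seq = kvm->mmu_invalidate_seq;
	/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
	smp_rmb();

	/* Pin a contiguous range of guest pages, read-only. */
	rc = kvm_s390_get_guest_pages(kvm, f, start_gfn, ARRAY_SIZE(f), false);
	if (rc) {
		kvm_s390_release_faultin_array(kvm, f, true);
		return rc;
	}
	scoped_guard(write_lock, &kvm->mmu_lock) {
		/* Drop everything and let the caller retry if an invalidation raced. */
		if (kvm_s390_array_needs_retry_safe(kvm, seq, f))
			rc = -EAGAIN;
		/* ... otherwise consume the pinned pages under the lock ... */
	}
	kvm_s390_release_faultin_array(kvm, f, rc != 0);
	return rc;

The first real user of this pattern is the shadow gmap code added later in this series (see gmap_protect_asce_top_level()).
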
Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/include/asm/kvm_host.h | 1 + arch/s390/kvm/Makefile | 2 +- arch/s390/kvm/faultin.c | 148 +++++++++++++++++++++++++++++++ arch/s390/kvm/faultin.h | 92 +++++++++++++++++++ arch/s390/kvm/kvm-s390.c | 2 +- arch/s390/kvm/kvm-s390.h | 2 + 6 files changed, 245 insertions(+), 2 deletions(-) create mode 100644 arch/s390/kvm/faultin.c create mode 100644 arch/s390/kvm/faultin.h diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 6ba99870fc32..816776a8a8e3 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -442,6 +442,7 @@ struct kvm_vcpu_arch { bool acrs_loaded; struct kvm_s390_pv_vcpu pv; union diag318_info diag318_info; + void *mc; /* Placeholder */ }; struct kvm_vm_stat { diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 21088265402c..1e2dcd3e2436 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -9,7 +9,7 @@ ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap-vsie.o -kvm-y += dat.o gmap.o +kvm-y += dat.o gmap.o faultin.o kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/faultin.c b/arch/s390/kvm/faultin.c new file mode 100644 index 000000000000..e37cd18200f5 --- /dev/null +++ b/arch/s390/kvm/faultin.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM guest fault handling. + * + * Copyright IBM Corp. 2025 + * Author(s): Claudio Imbrenda + */ +#include +#include + +#include "gmap.h" +#include "trace.h" +#include "faultin.h" + +bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu); + +/* + * kvm_s390_faultin_gfn() - handle a dat fault. + * @vcpu: The vCPU whose gmap is to be fixed up, or NULL if operating on the VM. + * @kvm: The VM whose gmap is to be fixed up, or NULL if operating on a vCPU. + * @f: The guest fault that needs to be resolved. + * + * Return: + * * 0 on success + * * < 0 in case of error + * * > 0 in case of guest exceptions + * + * Context: + * * The mm lock must not be held before calling + * * kvm->srcu must be held + * * may sleep + */ +int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fault *f) +{ + struct kvm_s390_mmu_cache *local_mc __free(kvm_s390_mmu_cache) = NULL; + struct kvm_s390_mmu_cache *mc = NULL; + struct kvm_memory_slot *slot; + unsigned long inv_seq; + int foll, rc = 0; + + foll = f->write_attempt ? FOLL_WRITE : 0; + foll |= f->attempt_pfault ? FOLL_NOWAIT : 0; + + if (vcpu) { + kvm = vcpu->kvm; + mc = vcpu->arch.mc; + } + + lockdep_assert_held(&kvm->srcu); + + scoped_guard(read_lock, &kvm->mmu_lock) { + if (gmap_try_fixup_minor(kvm->arch.gmap, f) == 0) + return 0; + } + + while (1) { + f->valid = false; + inv_seq = kvm->mmu_invalidate_seq; + /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ + smp_rmb(); + + if (vcpu) + slot = kvm_vcpu_gfn_to_memslot(vcpu, f->gfn); + else + slot = gfn_to_memslot(kvm, f->gfn); + f->pfn = __kvm_faultin_pfn(slot, f->gfn, foll, &f->writable, &f->page); + + /* Needs I/O, try to setup async pfault (only possible with FOLL_NOWAIT). */ + if (f->pfn == KVM_PFN_ERR_NEEDS_IO) { + if (unlikely(!f->attempt_pfault)) + return -EAGAIN; + if (unlikely(!vcpu)) + return -EINVAL; + trace_kvm_s390_major_guest_pfault(vcpu); + if (kvm_arch_setup_async_pf(vcpu)) + return 0; + vcpu->stat.pfault_sync++; + /* Could not setup async pfault, try again synchronously. 
*/ + foll &= ~FOLL_NOWAIT; + f->pfn = __kvm_faultin_pfn(slot, f->gfn, foll, &f->writable, &f->page); + } + + /* Access outside memory, addressing exception. */ + if (is_noslot_pfn(f->pfn)) + return PGM_ADDRESSING; + /* Signal pending: try again. */ + if (f->pfn == KVM_PFN_ERR_SIGPENDING) + return -EAGAIN; + /* Check if it's read-only memory; don't try to actually handle that case. */ + if (f->pfn == KVM_PFN_ERR_RO_FAULT) + return -EOPNOTSUPP; + /* Any other error. */ + if (is_error_pfn(f->pfn)) + return -EFAULT; + + if (!mc) { + local_mc = kvm_s390_new_mmu_cache(); + if (!local_mc) + return -ENOMEM; + mc = local_mc; + } + + /* Loop, will automatically release the faulted page. */ + if (mmu_invalidate_retry_gfn_unsafe(kvm, inv_seq, f->gfn)) { + kvm_release_faultin_page(kvm, f->page, true, false); + continue; + } + + scoped_guard(read_lock, &kvm->mmu_lock) { + if (!mmu_invalidate_retry_gfn(kvm, inv_seq, f->gfn)) { + f->valid = true; + rc = gmap_link(mc, kvm->arch.gmap, f); + kvm_release_faultin_page(kvm, f->page, !!rc, f->write_attempt); + f->page = NULL; + } + } + kvm_release_faultin_page(kvm, f->page, true, false); + + if (rc == -ENOMEM) { + rc = kvm_s390_mmu_cache_topup(mc); + if (rc) + return rc; + } else if (rc != -EAGAIN) { + return rc; + } + } +} + +int kvm_s390_get_guest_page(struct kvm *kvm, struct guest_fault *f, gfn_t gfn, bool w) +{ + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + int foll = w ? FOLL_WRITE : 0; + + f->write_attempt = w; + f->gfn = gfn; + f->pfn = __kvm_faultin_pfn(slot, gfn, foll, &f->writable, &f->page); + if (is_noslot_pfn(f->pfn)) + return PGM_ADDRESSING; + if (is_sigpending_pfn(f->pfn)) + return -EINTR; + if (f->pfn == KVM_PFN_ERR_NEEDS_IO) + return -EAGAIN; + if (is_error_pfn(f->pfn)) + return -EFAULT; + + f->valid = true; + return 0; +} diff --git a/arch/s390/kvm/faultin.h b/arch/s390/kvm/faultin.h new file mode 100644 index 000000000000..f86176d2769c --- /dev/null +++ b/arch/s390/kvm/faultin.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KVM guest fault handling. + * + * Copyright IBM Corp. 
2025 + * Author(s): Claudio Imbrenda + */ + +#ifndef __KVM_S390_FAULTIN_H +#define __KVM_S390_FAULTIN_H + +#include + +#include "dat.h" + +int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fault *f); +int kvm_s390_get_guest_page(struct kvm *kvm, struct guest_fault *f, gfn_t gfn, bool w); + +static inline int kvm_s390_faultin_gfn_simple(struct kvm_vcpu *vcpu, struct kvm *kvm, + gfn_t gfn, bool wr) +{ + struct guest_fault f = { .gfn = gfn, .write_attempt = wr, }; + + return kvm_s390_faultin_gfn(vcpu, kvm, &f); +} + +static inline int kvm_s390_get_guest_page_and_read_gpa(struct kvm *kvm, struct guest_fault *f, + gpa_t gaddr, unsigned long *val) +{ + int rc; + + rc = kvm_s390_get_guest_page(kvm, f, gpa_to_gfn(gaddr), false); + if (rc) + return rc; + + *val = *(unsigned long *)phys_to_virt(pfn_to_phys(f->pfn) | offset_in_page(gaddr)); + + return 0; +} + +static inline void kvm_s390_release_multiple(struct kvm *kvm, struct guest_fault *guest_faults, + int n, bool ignore) +{ + int i; + + for (i = 0; i < n; i++) { + kvm_release_faultin_page(kvm, guest_faults[i].page, ignore, + guest_faults[i].write_attempt); + guest_faults[i].page = NULL; + } +} + +static inline bool kvm_s390_multiple_faults_need_retry(struct kvm *kvm, unsigned long seq, + struct guest_fault *guest_faults, int n, + bool unsafe) +{ + int i; + + for (i = 0; i < n; i++) { + if (!guest_faults[i].valid) + continue; + if (unsafe && mmu_invalidate_retry_gfn_unsafe(kvm, seq, guest_faults[i].gfn)) + return true; + if (!unsafe && mmu_invalidate_retry_gfn(kvm, seq, guest_faults[i].gfn)) + return true; + } + return false; +} + +static inline int kvm_s390_get_guest_pages(struct kvm *kvm, struct guest_fault *guest_faults, + gfn_t start, int n_pages, bool write_attempt) +{ + int i, rc; + + for (i = 0; i < n_pages; i++) { + rc = kvm_s390_get_guest_page(kvm, guest_faults + i, start + i, write_attempt); + if (rc) + break; + } + return rc; +} + +#define kvm_s390_release_faultin_array(kvm, array, ignore) \ + kvm_s390_release_multiple(kvm, array, ARRAY_SIZE(array), ignore) + +#define kvm_s390_array_needs_retry_unsafe(kvm, seq, array) \ + kvm_s390_multiple_faults_need_retry(kvm, seq, array, ARRAY_SIZE(array), true) + +#define kvm_s390_array_needs_retry_safe(kvm, seq, array) \ + kvm_s390_multiple_faults_need_retry(kvm, seq, array, ARRAY_SIZE(array), false) + +#endif /* __KVM_S390_FAULTIN_H */ diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ec92e6361eab..2b5ecdc3814e 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -4637,7 +4637,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) return true; } -static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) +bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) { hva_t hva; struct kvm_arch_async_pf arch; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 65c950760993..9ce71c8433a1 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -470,6 +470,8 @@ static inline int kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gpa_t gaddr, return __kvm_s390_handle_dat_fault(vcpu, gpa_to_gfn(gaddr), gaddr, flags); } +bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu); + /* implemented in diag.c */ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); From e5f98a6899bd4bd0d99cb21777bd361fbb873838 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:50 +0100 Subject: [PATCH 209/267] KVM: s390: Add some helper functions needed for vSIE Implement gmap_protect_asce_top_level(), 
which was a stub. This function was a stub due to cross dependencies with other patches. Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/gmap.c | 74 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c index 5736145ef53a..fea1c66fcabe 100644 --- a/arch/s390/kvm/gmap.c +++ b/arch/s390/kvm/gmap.c @@ -22,6 +22,7 @@ #include "dat.h" #include "gmap.h" #include "kvm-s390.h" +#include "faultin.h" static inline bool kvm_s390_is_in_sie(struct kvm_vcpu *vcpu) { @@ -1091,10 +1092,79 @@ static struct gmap *gmap_find_shadow(struct gmap *parent, union asce asce, int e return NULL; } +#define CRST_TABLE_PAGES (_CRST_TABLE_SIZE / PAGE_SIZE) +struct gmap_protect_asce_top_level { + unsigned long seq; + struct guest_fault f[CRST_TABLE_PAGES]; +}; + +static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg, + struct gmap_protect_asce_top_level *context) +{ + int rc, i; + + guard(write_lock)(&sg->kvm->mmu_lock); + + if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f)) + return -EAGAIN; + + scoped_guard(spinlock, &sg->parent->children_lock) { + for (i = 0; i < CRST_TABLE_PAGES; i++) { + if (!context->f[i].valid) + continue; + rc = gmap_protect_rmap(mc, sg, context->f[i].gfn, 0, context->f[i].pfn, + TABLE_TYPE_REGION1 + 1, context->f[i].writable); + if (rc) + return rc; + } + gmap_add_child(sg->parent, sg); + } + + kvm_s390_release_faultin_array(sg->kvm, context->f, false); + return 0; +} + +static inline int _gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg, + struct gmap_protect_asce_top_level *context) +{ + int rc; + + if (kvm_s390_array_needs_retry_unsafe(sg->kvm, context->seq, context->f)) + return -EAGAIN; + do { + rc = kvm_s390_mmu_cache_topup(mc); + if (rc) + return rc; + rc = radix_tree_preload(GFP_KERNEL); + if (rc) + return rc; + rc = __gmap_protect_asce_top_level(mc, sg, context); + radix_tree_preload_end(); + } while (rc == -ENOMEM); + + return rc; +} + static int gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg) { - KVM_BUG_ON(1, sg->kvm); - return -EINVAL; + struct gmap_protect_asce_top_level context = {}; + union asce asce = sg->guest_asce; + int rc; + + KVM_BUG_ON(!is_shadow(sg), sg->kvm); + + context.seq = sg->kvm->mmu_invalidate_seq; + /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ + smp_rmb(); + + rc = kvm_s390_get_guest_pages(sg->kvm, context.f, asce.rsto, asce.dt + 1, false); + if (rc > 0) + rc = -EFAULT; + if (!rc) + rc = _gmap_protect_asce_top_level(mc, sg, &context); + if (rc) + kvm_s390_release_faultin_array(sg->kvm, context.f, true); + return rc; } /** From cc50f105fde083801d0644c8678bc984c6e83a3b Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:51 +0100 Subject: [PATCH 210/267] KVM: s390: Stop using CONFIG_PGSTE Switch to using IS_ENABLED(CONFIG_KVM) instead of CONFIG_PGSTE, since the latter will be removed soon. Many CONFIG_PGSTE are left behind, because they will be removed completely in upcoming patches. The ones replaced here are mostly the ones that will stay. 
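For reference, the substitution applied here follows one simple pattern (example adapted from the mm_forbids_zeropage() hunk below; IS_ENABLED() also covers a modular KVM, since it evaluates to true for both CONFIG_KVM=y and CONFIG_KVM=m):

	/* before */
	#ifdef CONFIG_PGSTE
		if (!mm->context.allow_cow_sharing)
			return 1;
	#endif

	/* after */
	#if IS_ENABLED(CONFIG_KVM)
		if (!mm->context.allow_cow_sharing)
			return 1;
	#endif
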
Reviewed-by: Steffen Eiden Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/include/asm/mmu_context.h | 2 +- arch/s390/include/asm/pgtable.h | 4 ++-- arch/s390/mm/fault.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index d9b8501bc93d..48e548c01daa 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -29,7 +29,7 @@ static inline int init_new_context(struct task_struct *tsk, atomic_set(&mm->context.protected_count, 0); mm->context.gmap_asce = 0; mm->context.flush_mm = 0; -#ifdef CONFIG_PGSTE +#if IS_ENABLED(CONFIG_KVM) mm->context.has_pgste = 0; mm->context.uses_skeys = 0; mm->context.uses_cmm = 0; diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 04335f5e7f47..cd4d135c4503 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -577,7 +577,7 @@ static inline int mm_has_pgste(struct mm_struct *mm) static inline int mm_is_protected(struct mm_struct *mm) { -#ifdef CONFIG_PGSTE +#if IS_ENABLED(CONFIG_KVM) if (unlikely(atomic_read(&mm->context.protected_count))) return 1; #endif @@ -632,7 +632,7 @@ static inline pud_t set_pud_bit(pud_t pud, pgprot_t prot) #define mm_forbids_zeropage mm_forbids_zeropage static inline int mm_forbids_zeropage(struct mm_struct *mm) { -#ifdef CONFIG_PGSTE +#if IS_ENABLED(CONFIG_KVM) if (!mm->context.allow_cow_sharing) return 1; #endif diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index e2e13778c36a..a52aa7a99b6b 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -403,7 +403,7 @@ void do_dat_exception(struct pt_regs *regs) } NOKPROBE_SYMBOL(do_dat_exception); -#if IS_ENABLED(CONFIG_PGSTE) +#if IS_ENABLED(CONFIG_KVM) void do_secure_storage_access(struct pt_regs *regs) { @@ -470,4 +470,4 @@ void do_secure_storage_access(struct pt_regs *regs) } NOKPROBE_SYMBOL(do_secure_storage_access); -#endif /* CONFIG_PGSTE */ +#endif /* CONFIG_KVM */ From d29a29a9e1a3d24cdb867ae2adc487f1a67e6225 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:52 +0100 Subject: [PATCH 211/267] KVM: s390: Storage key functions refactoring Refactor some storage key functions to improve readability. Introduce helper functions that will be used in the next patches. Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/gaccess.c | 38 +++++++++---------- arch/s390/kvm/gaccess.h | 4 +- arch/s390/kvm/kvm-s390.c | 82 +++++++++++++++------------------------- arch/s390/kvm/kvm-s390.h | 8 ++++ 4 files changed, 60 insertions(+), 72 deletions(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 9df868bddf9a..2649365bf054 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -961,7 +961,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * *@old_addr contains the value at @gpa before the attempt to * exchange the value. * @new: The value to place at @gpa. - * @access_key: The access key to use for the guest access. + * @acc: The access key to use for the guest access. * @success: output value indicating if an exchange occurred. * * Atomically exchange the value at @gpa by @new, if it contains *@old. 
@@ -974,9 +974,8 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * * -EAGAIN: transient failure (len 1 or 2) * * -EOPNOTSUPP: read-only memslot (should never occur) */ -int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, - __uint128_t *old_addr, __uint128_t new, - u8 access_key, bool *success) +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old_addr, + union kvm_s390_quad new, u8 acc, bool *success) { gfn_t gfn = gpa_to_gfn(gpa); struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); @@ -1008,41 +1007,42 @@ int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, case 1: { u8 old; - ret = cmpxchg_user_key((u8 __user *)hva, &old, *old_addr, new, access_key); - *success = !ret && old == *old_addr; - *old_addr = old; + ret = cmpxchg_user_key((u8 __user *)hva, &old, old_addr->one, new.one, acc); + *success = !ret && old == old_addr->one; + old_addr->one = old; break; } case 2: { u16 old; - ret = cmpxchg_user_key((u16 __user *)hva, &old, *old_addr, new, access_key); - *success = !ret && old == *old_addr; - *old_addr = old; + ret = cmpxchg_user_key((u16 __user *)hva, &old, old_addr->two, new.two, acc); + *success = !ret && old == old_addr->two; + old_addr->two = old; break; } case 4: { u32 old; - ret = cmpxchg_user_key((u32 __user *)hva, &old, *old_addr, new, access_key); - *success = !ret && old == *old_addr; - *old_addr = old; + ret = cmpxchg_user_key((u32 __user *)hva, &old, old_addr->four, new.four, acc); + *success = !ret && old == old_addr->four; + old_addr->four = old; break; } case 8: { u64 old; - ret = cmpxchg_user_key((u64 __user *)hva, &old, *old_addr, new, access_key); - *success = !ret && old == *old_addr; - *old_addr = old; + ret = cmpxchg_user_key((u64 __user *)hva, &old, old_addr->eight, new.eight, acc); + *success = !ret && old == old_addr->eight; + old_addr->eight = old; break; } case 16: { __uint128_t old; - ret = cmpxchg_user_key((__uint128_t __user *)hva, &old, *old_addr, new, access_key); - *success = !ret && old == *old_addr; - *old_addr = old; + ret = cmpxchg_user_key((__uint128_t __user *)hva, &old, old_addr->sixteen, + new.sixteen, acc); + *success = !ret && old == old_addr->sixteen; + old_addr->sixteen = old; break; } default: diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 3fde45a151f2..774cdf19998f 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -206,8 +206,8 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, unsigned long len, enum gacc_mode mode); -int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, __uint128_t *old, - __uint128_t new, u8 access_key, bool *success); +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old_addr, + union kvm_s390_quad new, u8 access_key, bool *success); /** * write_guest_with_key - copy data from kernel space to guest space diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 2b5ecdc3814e..f5411e093fb5 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2900,9 +2900,9 @@ static int mem_op_validate_common(struct kvm_s390_mem_op *mop, u64 supported_fla static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; + void *tmpbuf __free(kvfree) = NULL; enum gacc_mode acc_mode; - void *tmpbuf = NULL; - int r, srcu_idx; + int r; r = 
mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION | KVM_S390_MEMOP_F_CHECK_ONLY); @@ -2915,52 +2915,36 @@ static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop) return -ENOMEM; } - srcu_idx = srcu_read_lock(&kvm->srcu); - - if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) { - r = PGM_ADDRESSING; - goto out_unlock; - } - acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? GACC_FETCH : GACC_STORE; - if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { - r = check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key); - goto out_unlock; - } - if (acc_mode == GACC_FETCH) { + + scoped_guard(srcu, &kvm->srcu) { + if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) + return PGM_ADDRESSING; + + if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) + return check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key); + + if (acc_mode == GACC_STORE && copy_from_user(tmpbuf, uaddr, mop->size)) + return -EFAULT; r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, - mop->size, GACC_FETCH, mop->key); + mop->size, acc_mode, mop->key); if (r) - goto out_unlock; - if (copy_to_user(uaddr, tmpbuf, mop->size)) - r = -EFAULT; - } else { - if (copy_from_user(tmpbuf, uaddr, mop->size)) { - r = -EFAULT; - goto out_unlock; - } - r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, - mop->size, GACC_STORE, mop->key); + return r; + if (acc_mode != GACC_STORE && copy_to_user(uaddr, tmpbuf, mop->size)) + return -EFAULT; } -out_unlock: - srcu_read_unlock(&kvm->srcu, srcu_idx); - - vfree(tmpbuf); - return r; + return 0; } static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; void __user *old_addr = (void __user *)mop->old_addr; - union { - __uint128_t quad; - char raw[sizeof(__uint128_t)]; - } old = { .quad = 0}, new = { .quad = 0 }; - unsigned int off_in_quad = sizeof(new) - mop->size; - int r, srcu_idx; - bool success; + union kvm_s390_quad old = { .sixteen = 0 }; + union kvm_s390_quad new = { .sixteen = 0 }; + bool success = false; + int r; r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION); if (r) @@ -2972,25 +2956,21 @@ static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *m */ if (mop->size > sizeof(new)) return -EINVAL; - if (copy_from_user(&new.raw[off_in_quad], uaddr, mop->size)) + if (copy_from_user(&new, uaddr, mop->size)) return -EFAULT; - if (copy_from_user(&old.raw[off_in_quad], old_addr, mop->size)) + if (copy_from_user(&old, old_addr, mop->size)) return -EFAULT; - srcu_idx = srcu_read_lock(&kvm->srcu); + scoped_guard(srcu, &kvm->srcu) { + if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) + return PGM_ADDRESSING; - if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) { - r = PGM_ADDRESSING; - goto out_unlock; + r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old, new, + mop->key, &success); + + if (!success && copy_to_user(old_addr, &old, mop->size)) + return -EFAULT; } - - r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old.quad, - new.quad, mop->key, &success); - if (!success && copy_to_user(old_addr, &old.raw[off_in_quad], mop->size)) - r = -EFAULT; - -out_unlock: - srcu_read_unlock(&kvm->srcu, srcu_idx); return r; } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 9ce71c8433a1..c44c52266e26 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -22,6 +22,14 @@ #define KVM_S390_UCONTROL_MEMSLOT (KVM_USER_MEM_SLOTS + 0) +union kvm_s390_quad { + __uint128_t sixteen; + unsigned long eight; + unsigned int four; + 
unsigned short two; + unsigned char one; +}; + static inline void kvm_s390_fpu_store(struct kvm_run *run) { fpu_stfpc(&run->s.regs.fpc); From e38c884df92119d96f652d51f82661dd2fc0b885 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:53 +0100 Subject: [PATCH 212/267] KVM: s390: Switch to new gmap Switch KVM/s390 to use the new gmap code. Remove includes to and include "gmap.h" instead; fix all the existing users of the old gmap functions to use the new ones instead. Fix guest storage key access functions to work with the new gmap. Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/Kconfig | 2 +- arch/s390/include/asm/kvm_host.h | 5 +- arch/s390/include/asm/mmu_context.h | 4 - arch/s390/include/asm/tlb.h | 3 - arch/s390/include/asm/uaccess.h | 70 +-- arch/s390/include/asm/uv.h | 1 - arch/s390/kernel/uv.c | 114 +--- arch/s390/kvm/Makefile | 2 +- arch/s390/kvm/diag.c | 2 +- arch/s390/kvm/gaccess.c | 899 +++++++++++++++++----------- arch/s390/kvm/gaccess.h | 18 +- arch/s390/kvm/gmap-vsie.c | 141 ----- arch/s390/kvm/intercept.c | 15 +- arch/s390/kvm/interrupt.c | 6 +- arch/s390/kvm/kvm-s390.c | 787 +++++++----------------- arch/s390/kvm/kvm-s390.h | 19 +- arch/s390/kvm/priv.c | 213 +++---- arch/s390/kvm/pv.c | 174 ++++-- arch/s390/kvm/vsie.c | 168 +++--- arch/s390/lib/uaccess.c | 184 +----- arch/s390/mm/gmap_helpers.c | 38 +- 21 files changed, 1129 insertions(+), 1736 deletions(-) delete mode 100644 arch/s390/kvm/gmap-vsie.c diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 0e5fad5f06ca..8270754985e9 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -33,7 +33,7 @@ config GENERIC_LOCKBREAK def_bool y if PREEMPTION config PGSTE - def_bool y if KVM + def_bool n config AUDIT_ARCH def_bool y diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 816776a8a8e3..64a50f0862aa 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -442,7 +442,7 @@ struct kvm_vcpu_arch { bool acrs_loaded; struct kvm_s390_pv_vcpu pv; union diag318_info diag318_info; - void *mc; /* Placeholder */ + struct kvm_s390_mmu_cache *mc; }; struct kvm_vm_stat { @@ -636,6 +636,8 @@ struct kvm_s390_pv { struct mutex import_lock; }; +struct kvm_s390_mmu_cache; + struct kvm_arch { struct esca_block *sca; debug_info_t *dbf; @@ -675,6 +677,7 @@ struct kvm_arch { struct kvm_s390_pv pv; struct list_head kzdev_list; spinlock_t kzdev_list_lock; + struct kvm_s390_mmu_cache *mc; }; #define KVM_HVA_ERR_BAD (-1UL) diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 48e548c01daa..bd1ef5e2d2eb 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -30,11 +30,7 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.gmap_asce = 0; mm->context.flush_mm = 0; #if IS_ENABLED(CONFIG_KVM) - mm->context.has_pgste = 0; - mm->context.uses_skeys = 0; - mm->context.uses_cmm = 0; mm->context.allow_cow_sharing = 1; - mm->context.allow_gmap_hpage_1m = 0; #endif switch (mm->context.asce_limit) { default: diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 1e50f6f1ad9d..7354b42ee994 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -36,7 +36,6 @@ static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb, #include #include -#include /* * Release the page cache reference for a pte removed by @@ -85,8 +84,6 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, 
tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; tlb->cleared_pmds = 1; - if (mm_has_pgste(tlb->mm)) - gmap_unlink(tlb->mm, (unsigned long *)pte, address); tlb_remove_ptdesc(tlb, virt_to_ptdesc(pte)); } diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index c5e02addcd67..dff035372601 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -471,65 +471,15 @@ do { \ #define arch_get_kernel_nofault __mvc_kernel_nofault #define arch_put_kernel_nofault __mvc_kernel_nofault -void __cmpxchg_user_key_called_with_bad_pointer(void); - -int __cmpxchg_user_key1(unsigned long address, unsigned char *uval, - unsigned char old, unsigned char new, unsigned long key); -int __cmpxchg_user_key2(unsigned long address, unsigned short *uval, - unsigned short old, unsigned short new, unsigned long key); -int __cmpxchg_user_key4(unsigned long address, unsigned int *uval, - unsigned int old, unsigned int new, unsigned long key); -int __cmpxchg_user_key8(unsigned long address, unsigned long *uval, - unsigned long old, unsigned long new, unsigned long key); -int __cmpxchg_user_key16(unsigned long address, __uint128_t *uval, - __uint128_t old, __uint128_t new, unsigned long key); - -static __always_inline int _cmpxchg_user_key(unsigned long address, void *uval, - __uint128_t old, __uint128_t new, - unsigned long key, int size) -{ - switch (size) { - case 1: return __cmpxchg_user_key1(address, uval, old, new, key); - case 2: return __cmpxchg_user_key2(address, uval, old, new, key); - case 4: return __cmpxchg_user_key4(address, uval, old, new, key); - case 8: return __cmpxchg_user_key8(address, uval, old, new, key); - case 16: return __cmpxchg_user_key16(address, uval, old, new, key); - default: __cmpxchg_user_key_called_with_bad_pointer(); - } - return 0; -} - -/** - * cmpxchg_user_key() - cmpxchg with user space target, honoring storage keys - * @ptr: User space address of value to compare to @old and exchange with - * @new. Must be aligned to sizeof(*@ptr). - * @uval: Address where the old value of *@ptr is written to. - * @old: Old value. Compared to the content pointed to by @ptr in order to - * determine if the exchange occurs. The old value read from *@ptr is - * written to *@uval. - * @new: New value to place at *@ptr. - * @key: Access key to use for checking storage key protection. - * - * Perform a cmpxchg on a user space target, honoring storage key protection. - * @key alone determines how key checking is performed, neither - * storage-protection-override nor fetch-protection-override apply. - * The caller must compare *@uval and @old to determine if values have been - * exchanged. In case of an exception *@uval is set to zero. 
- * - * Return: 0: cmpxchg executed - * -EFAULT: an exception happened when trying to access *@ptr - * -EAGAIN: maxed out number of retries (byte and short only) - */ -#define cmpxchg_user_key(ptr, uval, old, new, key) \ -({ \ - __typeof__(ptr) __ptr = (ptr); \ - __typeof__(uval) __uval = (uval); \ - \ - BUILD_BUG_ON(sizeof(*(__ptr)) != sizeof(*(__uval))); \ - might_fault(); \ - __chk_user_ptr(__ptr); \ - _cmpxchg_user_key((unsigned long)(__ptr), (void *)(__uval), \ - (old), (new), (key), sizeof(*(__ptr))); \ -}) +int __cmpxchg_key1(void *address, unsigned char *uval, unsigned char old, + unsigned char new, unsigned long key); +int __cmpxchg_key2(void *address, unsigned short *uval, unsigned short old, + unsigned short new, unsigned long key); +int __cmpxchg_key4(void *address, unsigned int *uval, unsigned int old, + unsigned int new, unsigned long key); +int __cmpxchg_key8(void *address, unsigned long *uval, unsigned long old, + unsigned long new, unsigned long key); +int __cmpxchg_key16(void *address, __uint128_t *uval, __uint128_t old, + __uint128_t new, unsigned long key); #endif /* __S390_UACCESS_H */ diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index 0744874ca6df..d919e69662f5 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -631,7 +631,6 @@ int uv_pin_shared(unsigned long paddr); int uv_destroy_folio(struct folio *folio); int uv_destroy_pte(pte_t pte); int uv_convert_from_secure_pte(pte_t pte); -int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header *uvcb); int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio); int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb); int uv_convert_from_secure(unsigned long paddr); diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index cb4e8089fbca..a284f98d9716 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -209,39 +209,6 @@ int uv_convert_from_secure_pte(pte_t pte) return uv_convert_from_secure_folio(pfn_folio(pte_pfn(pte))); } -/** - * should_export_before_import - Determine whether an export is needed - * before an import-like operation - * @uvcb: the Ultravisor control block of the UVC to be performed - * @mm: the mm of the process - * - * Returns whether an export is needed before every import-like operation. - * This is needed for shared pages, which don't trigger a secure storage - * exception when accessed from a different guest. - * - * Although considered as one, the Unpin Page UVC is not an actual import, - * so it is not affected. - * - * No export is needed also when there is only one protected VM, because the - * page cannot belong to the wrong VM in that case (there is no "other VM" - * it can belong to). - * - * Return: true if an export is needed before every import, otherwise false. - */ -static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) -{ - /* - * The misc feature indicates, among other things, that importing a - * shared page from a different protected VM will automatically also - * transfer its ownership. - */ - if (uv_has_feature(BIT_UV_FEAT_MISC)) - return false; - if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) - return false; - return atomic_read(&mm->context.protected_count) > 1; -} - /* * Calculate the expected ref_count for a folio that would otherwise have no * further pins. 
This was cribbed from similar functions in other places in @@ -313,20 +280,6 @@ int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) } EXPORT_SYMBOL(__make_folio_secure); -static int make_folio_secure(struct mm_struct *mm, struct folio *folio, struct uv_cb_header *uvcb) -{ - int rc; - - if (!folio_trylock(folio)) - return -EAGAIN; - if (should_export_before_import(uvcb, mm)) - uv_convert_from_secure(folio_to_phys(folio)); - rc = __make_folio_secure(folio, uvcb); - folio_unlock(folio); - - return rc; -} - /** * s390_wiggle_split_folio() - try to drain extra references to a folio and * split the folio if it is large. @@ -414,56 +367,6 @@ int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio) } EXPORT_SYMBOL_GPL(s390_wiggle_split_folio); -int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header *uvcb) -{ - struct vm_area_struct *vma; - struct folio_walk fw; - struct folio *folio; - int rc; - - mmap_read_lock(mm); - vma = vma_lookup(mm, hva); - if (!vma) { - mmap_read_unlock(mm); - return -EFAULT; - } - folio = folio_walk_start(&fw, vma, hva, 0); - if (!folio) { - mmap_read_unlock(mm); - return -ENXIO; - } - - folio_get(folio); - /* - * Secure pages cannot be huge and userspace should not combine both. - * In case userspace does it anyway this will result in an -EFAULT for - * the unpack. The guest is thus never reaching secure mode. - * If userspace plays dirty tricks and decides to map huge pages at a - * later point in time, it will receive a segmentation fault or - * KVM_RUN will return -EFAULT. - */ - if (folio_test_hugetlb(folio)) - rc = -EFAULT; - else if (folio_test_large(folio)) - rc = -E2BIG; - else if (!pte_write(fw.pte) || (pte_val(fw.pte) & _PAGE_INVALID)) - rc = -ENXIO; - else - rc = make_folio_secure(mm, folio, uvcb); - folio_walk_end(&fw, vma); - mmap_read_unlock(mm); - - if (rc == -E2BIG || rc == -EBUSY) { - rc = s390_wiggle_split_folio(mm, folio); - if (!rc) - rc = -EAGAIN; - } - folio_put(folio); - - return rc; -} -EXPORT_SYMBOL_GPL(make_hva_secure); - /* * To be called with the folio locked or with an extra reference! This will * prevent kvm_s390_pv_make_secure() from touching the folio concurrently. @@ -474,21 +377,18 @@ int arch_make_folio_accessible(struct folio *folio) { int rc = 0; - /* Large folios cannot be secure */ - if (unlikely(folio_test_large(folio))) - return 0; - /* - * PG_arch_1 is used in 2 places: - * 1. for storage keys of hugetlb folios and KVM - * 2. As an indication that this small folio might be secure. This can - * overindicate, e.g. we set the bit before calling - * convert_to_secure. - * As secure pages are never large folios, both variants can co-exists. + * PG_arch_1 is used as an indication that this small folio might be + * secure. This can overindicate, e.g. we set the bit before calling + * convert_to_secure. */ if (!test_bit(PG_arch_1, &folio->flags.f)) return 0; + /* Large folios cannot be secure. 
*/ + if (WARN_ON_ONCE(folio_test_large(folio))) + return -EFAULT; + rc = uv_pin_shared(folio_to_phys(folio)); if (!rc) { clear_bit(PG_arch_1, &folio->flags.f); diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 1e2dcd3e2436..dac9d53b23d8 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -8,7 +8,7 @@ include $(srctree)/virt/kvm/Makefile.kvm ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap-vsie.o +kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o kvm-y += dat.o gmap.o faultin.o kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 53233dec8cad..d89d1c381522 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -10,13 +10,13 @@ #include #include -#include #include #include #include "kvm-s390.h" #include "trace.h" #include "trace-s390.h" #include "gaccess.h" +#include "gmap.h" static void do_discard_gfn_range(struct kvm_vcpu *vcpu, gfn_t gfn_start, gfn_t gfn_end) { diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 2649365bf054..67de47a81a87 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -11,15 +11,43 @@ #include #include #include +#include +#include +#include #include #include -#include #include #include "kvm-s390.h" +#include "dat.h" +#include "gmap.h" #include "gaccess.h" +#include "faultin.h" #define GMAP_SHADOW_FAKE_TABLE 1ULL +union dat_table_entry { + unsigned long val; + union region1_table_entry pgd; + union region2_table_entry p4d; + union region3_table_entry pud; + union segment_table_entry pmd; + union page_table_entry pte; +}; + +#define WALK_N_ENTRIES 7 +#define LEVEL_MEM -2 +struct pgtwalk { + struct guest_fault raw_entries[WALK_N_ENTRIES]; + gpa_t last_addr; + int level; + bool p; +}; + +static inline struct guest_fault *get_entries(struct pgtwalk *w) +{ + return w->raw_entries - LEVEL_MEM; +} + /* * raddress union which will contain the result (real or absolute address) * after a page table walk. The rfaa, sfaa and pfra members are used to @@ -81,6 +109,28 @@ struct aste { /* .. 
more fields there */ }; +union oac { + unsigned int val; + struct { + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac1; + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac2; + }; +}; + int ipte_lock_held(struct kvm *kvm) { if (sclp.has_siif) @@ -603,28 +653,16 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu, static int vm_check_access_key_gpa(struct kvm *kvm, u8 access_key, enum gacc_mode mode, gpa_t gpa) { - u8 storage_key, access_control; - bool fetch_protected; - unsigned long hva; + union skey storage_key; int r; - if (access_key == 0) - return 0; - - hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); - if (kvm_is_error_hva(hva)) - return PGM_ADDRESSING; - - mmap_read_lock(current->mm); - r = get_guest_storage_key(current->mm, hva, &storage_key); - mmap_read_unlock(current->mm); + scoped_guard(read_lock, &kvm->mmu_lock) + r = dat_get_storage_key(kvm->arch.gmap->asce, gpa_to_gfn(gpa), &storage_key); if (r) return r; - access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key); - if (access_control == access_key) + if (access_key == 0 || storage_key.acc == access_key) return 0; - fetch_protected = storage_key & _PAGE_FP_BIT; - if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !fetch_protected) + if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !storage_key.fp) return 0; return PGM_PROTECTION; } @@ -667,8 +705,7 @@ static int vcpu_check_access_key_gpa(struct kvm_vcpu *vcpu, u8 access_key, enum gacc_mode mode, union asce asce, gpa_t gpa, unsigned long ga, unsigned int len) { - u8 storage_key, access_control; - unsigned long hva; + union skey storage_key; int r; /* access key 0 matches any storage key -> allow */ @@ -678,26 +715,23 @@ static int vcpu_check_access_key_gpa(struct kvm_vcpu *vcpu, u8 access_key, * caller needs to ensure that gfn is accessible, so we can * assume that this cannot fail */ - hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gpa)); - mmap_read_lock(current->mm); - r = get_guest_storage_key(current->mm, hva, &storage_key); - mmap_read_unlock(current->mm); + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + r = dat_get_storage_key(vcpu->arch.gmap->asce, gpa_to_gfn(gpa), &storage_key); if (r) return r; - access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key); /* access key matches storage key -> allow */ - if (access_control == access_key) + if (storage_key.acc == access_key) return 0; if (mode == GACC_FETCH || mode == GACC_IFETCH) { /* it is a fetch and fetch protection is off -> allow */ - if (!(storage_key & _PAGE_FP_BIT)) + if (!storage_key.fp) return 0; if (fetch_prot_override_applicable(vcpu, mode, asce) && fetch_prot_override_applies(ga, len)) return 0; } if (storage_prot_override_applicable(vcpu) && - storage_prot_override_applies(access_control)) + storage_prot_override_applies(storage_key.acc)) return 0; return PGM_PROTECTION; } @@ -797,37 +831,79 @@ static int access_guest_page_gpa(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa return rc; } -static int access_guest_page_with_key_gpa(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, - void *data, unsigned int len, u8 access_key) +static int mvcos_key(void *to, const void *from, unsigned long size, u8 dst_key, u8 src_key) { - struct kvm_memory_slot *slot; - bool writable; - gfn_t gfn; - hva_t hva; + union oac spec = { + .oac1.key = dst_key, + .oac1.k = !!dst_key, + .oac2.key = src_key, + .oac2.k 
= !!src_key, + }; + int exception = PGM_PROTECTION; + + asm_inline volatile( + " lr %%r0,%[spec]\n" + "0: mvcos %[to],%[from],%[size]\n" + "1: lhi %[exc],0\n" + "2:\n" + EX_TABLE(0b, 2b) + EX_TABLE(1b, 2b) + : [size] "+d" (size), [to] "=Q" (*(char *)to), [exc] "+d" (exception) + : [spec] "d" (spec.val), [from] "Q" (*(const char *)from) + : "memory", "cc", "0"); + return exception; +} + +struct acc_page_key_context { + void *data; + int exception; + unsigned short offset; + unsigned short len; + bool store; + u8 access_key; +}; + +static void _access_guest_page_with_key_gpa(struct guest_fault *f) +{ + struct acc_page_key_context *context = f->priv; + void *ptr; + int r; + + ptr = __va(PFN_PHYS(f->pfn) | context->offset); + + if (context->store) + r = mvcos_key(ptr, context->data, context->len, context->access_key, 0); + else + r = mvcos_key(context->data, ptr, context->len, 0, context->access_key); + + context->exception = r; +} + +static int access_guest_page_with_key_gpa(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, + void *data, unsigned int len, u8 acc) +{ + struct acc_page_key_context context = { + .offset = offset_in_page(gpa), + .len = len, + .data = data, + .access_key = acc, + .store = mode == GACC_STORE, + }; + struct guest_fault fault = { + .gfn = gpa_to_gfn(gpa), + .priv = &context, + .write_attempt = mode == GACC_STORE, + .callback = _access_guest_page_with_key_gpa, + }; int rc; - gfn = gpa_to_gfn(gpa); - slot = gfn_to_memslot(kvm, gfn); - hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); + if (KVM_BUG_ON((len + context.offset) > PAGE_SIZE, kvm)) + return -EINVAL; - if (kvm_is_error_hva(hva)) - return PGM_ADDRESSING; - /* - * Check if it's a ro memslot, even tho that can't occur (they're unsupported). - * Don't try to actually handle that case. - */ - if (!writable && mode == GACC_STORE) - return -EOPNOTSUPP; - hva += offset_in_page(gpa); - if (mode == GACC_STORE) - rc = copy_to_user_key((void __user *)hva, data, len, access_key); - else - rc = copy_from_user_key(data, (void __user *)hva, len, access_key); + rc = kvm_s390_faultin_gfn(NULL, kvm, &fault); if (rc) - return PGM_PROTECTION; - if (mode == GACC_STORE) - mark_page_dirty_in_slot(kvm, slot, gfn); - return 0; + return rc; + return context.exception; } int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data, @@ -950,16 +1026,100 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, return rc; } +/** + * __cmpxchg_with_key() - Perform cmpxchg, honoring storage keys. + * @ptr: Address of value to compare to *@old and exchange with + * @new. Must be aligned to @size. + * @old: Old value. Compared to the content pointed to by @ptr in order to + * determine if the exchange occurs. The old value read from *@ptr is + * written here. + * @new: New value to place at *@ptr. + * @size: Size of the operation in bytes, may only be a power of two up to 16. + * @access_key: Access key to use for checking storage key protection. + * + * Perform a cmpxchg on guest memory, honoring storage key protection. + * @access_key alone determines how key checking is performed, neither + * storage-protection-override nor fetch-protection-override apply. + * In case of an exception *@uval is set to zero. 
+ * + * Return: + * * %0: cmpxchg executed successfully + * * %1: cmpxchg executed unsuccessfully + * * %PGM_PROTECTION: an exception happened when trying to access *@ptr + * * %-EAGAIN: maxed out number of retries (byte and short only) + * * %-EINVAL: invalid value for @size + */ +static int __cmpxchg_with_key(union kvm_s390_quad *ptr, union kvm_s390_quad *old, + union kvm_s390_quad new, int size, u8 access_key) +{ + union kvm_s390_quad tmp = { .sixteen = 0 }; + int rc; + + /* + * The cmpxchg_key macro depends on the type of "old", so we need + * a case for each valid length and get some code duplication as long + * as we don't introduce a new macro. + */ + switch (size) { + case 1: + rc = __cmpxchg_key1(&ptr->one, &tmp.one, old->one, new.one, access_key); + break; + case 2: + rc = __cmpxchg_key2(&ptr->two, &tmp.two, old->two, new.two, access_key); + break; + case 4: + rc = __cmpxchg_key4(&ptr->four, &tmp.four, old->four, new.four, access_key); + break; + case 8: + rc = __cmpxchg_key8(&ptr->eight, &tmp.eight, old->eight, new.eight, access_key); + break; + case 16: + rc = __cmpxchg_key16(&ptr->sixteen, &tmp.sixteen, old->sixteen, new.sixteen, + access_key); + break; + default: + return -EINVAL; + } + if (!rc && memcmp(&tmp, old, size)) + rc = 1; + *old = tmp; + /* + * Assume that the fault is caused by protection, either key protection + * or user page write protection. + */ + if (rc == -EFAULT) + rc = PGM_PROTECTION; + return rc; +} + +struct cmpxchg_key_context { + union kvm_s390_quad new; + union kvm_s390_quad *old; + int exception; + unsigned short offset; + u8 access_key; + u8 len; +}; + +static void _cmpxchg_guest_abs_with_key(struct guest_fault *f) +{ + struct cmpxchg_key_context *context = f->priv; + + context->exception = __cmpxchg_with_key(__va(PFN_PHYS(f->pfn) | context->offset), + context->old, context->new, context->len, + context->access_key); +} + /** * cmpxchg_guest_abs_with_key() - Perform cmpxchg on guest absolute address. * @kvm: Virtual machine instance. * @gpa: Absolute guest address of the location to be changed. * @len: Operand length of the cmpxchg, required: 1 <= len <= 16. Providing a * non power of two will result in failure. - * @old_addr: Pointer to old value. If the location at @gpa contains this value, - * the exchange will succeed. After calling cmpxchg_guest_abs_with_key() - * *@old_addr contains the value at @gpa before the attempt to - * exchange the value. + * @old: Pointer to old value. If the location at @gpa contains this value, + * the exchange will succeed. After calling cmpxchg_guest_abs_with_key() + * *@old contains the value at @gpa before the attempt to + * exchange the value. * @new: The value to place at @gpa. * @acc: The access key to use for the guest access. * @success: output value indicating if an exchange occurred. 
@@ -974,89 +1134,36 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * * -EAGAIN: transient failure (len 1 or 2) * * -EOPNOTSUPP: read-only memslot (should never occur) */ -int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old_addr, +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old, union kvm_s390_quad new, u8 acc, bool *success) { - gfn_t gfn = gpa_to_gfn(gpa); - struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); - bool writable; - hva_t hva; - int ret; + struct cmpxchg_key_context context = { + .old = old, + .new = new, + .offset = offset_in_page(gpa), + .len = len, + .access_key = acc, + }; + struct guest_fault fault = { + .gfn = gpa_to_gfn(gpa), + .priv = &context, + .write_attempt = true, + .callback = _cmpxchg_guest_abs_with_key, + }; + int rc; - if (!IS_ALIGNED(gpa, len)) + lockdep_assert_held(&kvm->srcu); + + if (len > 16 || !IS_ALIGNED(gpa, len)) return -EINVAL; - hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); - if (kvm_is_error_hva(hva)) - return PGM_ADDRESSING; - /* - * Check if it's a read-only memslot, even though that cannot occur - * since those are unsupported. - * Don't try to actually handle that case. - */ - if (!writable) - return -EOPNOTSUPP; - - hva += offset_in_page(gpa); - /* - * The cmpxchg_user_key macro depends on the type of "old", so we need - * a case for each valid length and get some code duplication as long - * as we don't introduce a new macro. - */ - switch (len) { - case 1: { - u8 old; - - ret = cmpxchg_user_key((u8 __user *)hva, &old, old_addr->one, new.one, acc); - *success = !ret && old == old_addr->one; - old_addr->one = old; - break; - } - case 2: { - u16 old; - - ret = cmpxchg_user_key((u16 __user *)hva, &old, old_addr->two, new.two, acc); - *success = !ret && old == old_addr->two; - old_addr->two = old; - break; - } - case 4: { - u32 old; - - ret = cmpxchg_user_key((u32 __user *)hva, &old, old_addr->four, new.four, acc); - *success = !ret && old == old_addr->four; - old_addr->four = old; - break; - } - case 8: { - u64 old; - - ret = cmpxchg_user_key((u64 __user *)hva, &old, old_addr->eight, new.eight, acc); - *success = !ret && old == old_addr->eight; - old_addr->eight = old; - break; - } - case 16: { - __uint128_t old; - - ret = cmpxchg_user_key((__uint128_t __user *)hva, &old, old_addr->sixteen, - new.sixteen, acc); - *success = !ret && old == old_addr->sixteen; - old_addr->sixteen = old; - break; - } - default: - return -EINVAL; - } - if (*success) - mark_page_dirty_in_slot(kvm, slot, gfn); - /* - * Assume that the fault is caused by protection, either key protection - * or user page write protection. 
- */ - if (ret == -EFAULT) - ret = PGM_PROTECTION; - return ret; + rc = kvm_s390_faultin_gfn(NULL, kvm, &fault); + if (rc) + return rc; + *success = !context.exception; + if (context.exception == 1) + return 0; + return context.exception; } /** @@ -1158,304 +1265,372 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra) } /** - * kvm_s390_shadow_tables - walk the guest page table and create shadow tables - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @pgt: pointer to the beginning of the page table for the given address if - * successful (return value 0), or to the first invalid DAT entry in - * case of exceptions (return value > 0) - * @dat_protection: referenced memory is write protected - * @fake: pgt references contiguous guest memory block, not a pgtable + * walk_guest_tables() - Walk the guest page table and pin the dat tables. + * @sg: Pointer to the shadow guest address space structure. + * @saddr: Faulting address in the shadow gmap. + * @w: Will be filled with information on the pinned pages. + * @wr: Wndicates a write access if true. + * + * Return: + * * %0 in case of success, + * * a PIC code > 0 in case the address translation fails + * * an error code < 0 if other errors happen in the host */ -static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, - unsigned long *pgt, int *dat_protection, - int *fake) +static int walk_guest_tables(struct gmap *sg, unsigned long saddr, struct pgtwalk *w, bool wr) { - struct kvm *kvm; - struct gmap *parent; - union asce asce; + struct gmap *parent = sg->parent; + struct guest_fault *entries; + union dat_table_entry table; union vaddress vaddr; unsigned long ptr; + struct kvm *kvm; + union asce asce; int rc; - *fake = 0; - *dat_protection = 0; - kvm = sg->private; - parent = sg->parent; + kvm = parent->kvm; + asce = sg->guest_asce; + entries = get_entries(w); + + w->level = LEVEL_MEM; + w->last_addr = saddr; + if (asce.r) + return kvm_s390_get_guest_page(kvm, entries + LEVEL_MEM, gpa_to_gfn(saddr), false); + vaddr.addr = saddr; - asce.val = sg->orig_asce; ptr = asce.rsto * PAGE_SIZE; - if (asce.r) { - *fake = 1; - ptr = 0; - asce.dt = ASCE_TYPE_REGION1; - } + + if (!asce_contains_gfn(asce, gpa_to_gfn(saddr))) + return PGM_ASCE_TYPE; switch (asce.dt) { case ASCE_TYPE_REGION1: - if (vaddr.rfx01 > asce.tl && !*fake) + if (vaddr.rfx01 > asce.tl) return PGM_REGION_FIRST_TRANS; break; case ASCE_TYPE_REGION2: - if (vaddr.rfx) - return PGM_ASCE_TYPE; if (vaddr.rsx01 > asce.tl) return PGM_REGION_SECOND_TRANS; break; case ASCE_TYPE_REGION3: - if (vaddr.rfx || vaddr.rsx) - return PGM_ASCE_TYPE; if (vaddr.rtx01 > asce.tl) return PGM_REGION_THIRD_TRANS; break; case ASCE_TYPE_SEGMENT: - if (vaddr.rfx || vaddr.rsx || vaddr.rtx) - return PGM_ASCE_TYPE; if (vaddr.sx01 > asce.tl) return PGM_SEGMENT_TRANSLATION; break; } + w->level = asce.dt; switch (asce.dt) { - case ASCE_TYPE_REGION1: { - union region1_table_entry rfte; - - if (*fake) { - ptr += vaddr.rfx * _REGION1_SIZE; - rfte.val = ptr; - goto shadow_r2t; - } - *pgt = ptr + vaddr.rfx * 8; - rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val); + case ASCE_TYPE_REGION1: + w->last_addr = ptr + vaddr.rfx * 8; + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, + w->last_addr, &table.val); if (rc) return rc; - if (rfte.i) + if (table.pgd.i) return PGM_REGION_FIRST_TRANS; - if (rfte.tt != TABLE_TYPE_REGION1) + if (table.pgd.tt != TABLE_TYPE_REGION1) return PGM_TRANSLATION_SPEC; 
- if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl) + if (vaddr.rsx01 < table.pgd.tf || vaddr.rsx01 > table.pgd.tl) return PGM_REGION_SECOND_TRANS; if (sg->edat_level >= 1) - *dat_protection |= rfte.p; - ptr = rfte.rto * PAGE_SIZE; -shadow_r2t: - rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); - if (rc) - return rc; - kvm->stat.gmap_shadow_r1_entry++; - } + w->p |= table.pgd.p; + ptr = table.pgd.rto * PAGE_SIZE; + w->level--; fallthrough; - case ASCE_TYPE_REGION2: { - union region2_table_entry rste; - - if (*fake) { - ptr += vaddr.rsx * _REGION2_SIZE; - rste.val = ptr; - goto shadow_r3t; - } - *pgt = ptr + vaddr.rsx * 8; - rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val); + case ASCE_TYPE_REGION2: + w->last_addr = ptr + vaddr.rsx * 8; + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, + w->last_addr, &table.val); if (rc) return rc; - if (rste.i) + if (table.p4d.i) return PGM_REGION_SECOND_TRANS; - if (rste.tt != TABLE_TYPE_REGION2) + if (table.p4d.tt != TABLE_TYPE_REGION2) return PGM_TRANSLATION_SPEC; - if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl) + if (vaddr.rtx01 < table.p4d.tf || vaddr.rtx01 > table.p4d.tl) return PGM_REGION_THIRD_TRANS; if (sg->edat_level >= 1) - *dat_protection |= rste.p; - ptr = rste.rto * PAGE_SIZE; -shadow_r3t: - rste.p |= *dat_protection; - rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); - if (rc) - return rc; - kvm->stat.gmap_shadow_r2_entry++; - } + w->p |= table.p4d.p; + ptr = table.p4d.rto * PAGE_SIZE; + w->level--; fallthrough; - case ASCE_TYPE_REGION3: { - union region3_table_entry rtte; - - if (*fake) { - ptr += vaddr.rtx * _REGION3_SIZE; - rtte.val = ptr; - goto shadow_sgt; - } - *pgt = ptr + vaddr.rtx * 8; - rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val); + case ASCE_TYPE_REGION3: + w->last_addr = ptr + vaddr.rtx * 8; + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, + w->last_addr, &table.val); if (rc) return rc; - if (rtte.i) + if (table.pud.i) return PGM_REGION_THIRD_TRANS; - if (rtte.tt != TABLE_TYPE_REGION3) + if (table.pud.tt != TABLE_TYPE_REGION3) return PGM_TRANSLATION_SPEC; - if (rtte.cr && asce.p && sg->edat_level >= 2) + if (table.pud.cr && asce.p && sg->edat_level >= 2) return PGM_TRANSLATION_SPEC; - if (rtte.fc && sg->edat_level >= 2) { - *dat_protection |= rtte.fc0.p; - *fake = 1; - ptr = rtte.fc1.rfaa * _REGION3_SIZE; - rtte.val = ptr; - goto shadow_sgt; - } - if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl) - return PGM_SEGMENT_TRANSLATION; if (sg->edat_level >= 1) - *dat_protection |= rtte.fc0.p; - ptr = rtte.fc0.sto * PAGE_SIZE; -shadow_sgt: - rtte.fc0.p |= *dat_protection; - rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); - if (rc) - return rc; - kvm->stat.gmap_shadow_r3_entry++; - } - fallthrough; - case ASCE_TYPE_SEGMENT: { - union segment_table_entry ste; - - if (*fake) { - ptr += vaddr.sx * _SEGMENT_SIZE; - ste.val = ptr; - goto shadow_pgt; + w->p |= table.pud.p; + if (table.pud.fc && sg->edat_level >= 2) { + table.val = u64_replace_bits(table.val, saddr, ~_REGION3_MASK); + goto edat_applies; } - *pgt = ptr + vaddr.sx * 8; - rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val); - if (rc) - return rc; - if (ste.i) + if (vaddr.sx01 < table.pud.fc0.tf || vaddr.sx01 > table.pud.fc0.tl) return PGM_SEGMENT_TRANSLATION; - if (ste.tt != TABLE_TYPE_SEGMENT) - return PGM_TRANSLATION_SPEC; - if (ste.cs && asce.p) - return PGM_TRANSLATION_SPEC; - *dat_protection |= ste.fc0.p; - if (ste.fc && sg->edat_level >= 1) { - *fake = 1; - ptr = ste.fc1.sfaa * 
_SEGMENT_SIZE; - ste.val = ptr; - goto shadow_pgt; - } - ptr = ste.fc0.pto * (PAGE_SIZE / 2); -shadow_pgt: - ste.fc0.p |= *dat_protection; - rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake); + ptr = table.pud.fc0.sto * PAGE_SIZE; + w->level--; + fallthrough; + case ASCE_TYPE_SEGMENT: + w->last_addr = ptr + vaddr.sx * 8; + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, + w->last_addr, &table.val); if (rc) return rc; - kvm->stat.gmap_shadow_sg_entry++; + if (table.pmd.i) + return PGM_SEGMENT_TRANSLATION; + if (table.pmd.tt != TABLE_TYPE_SEGMENT) + return PGM_TRANSLATION_SPEC; + if (table.pmd.cs && asce.p) + return PGM_TRANSLATION_SPEC; + w->p |= table.pmd.p; + if (table.pmd.fc && sg->edat_level >= 1) { + table.val = u64_replace_bits(table.val, saddr, ~_SEGMENT_MASK); + goto edat_applies; + } + ptr = table.pmd.fc0.pto * (PAGE_SIZE / 2); + w->level--; } - } - /* Return the parent address of the page table */ - *pgt = ptr; + w->last_addr = ptr + vaddr.px * 8; + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, + w->last_addr, &table.val); + if (rc) + return rc; + if (table.pte.i) + return PGM_PAGE_TRANSLATION; + if (table.pte.z) + return PGM_TRANSLATION_SPEC; + w->p |= table.pte.p; +edat_applies: + if (wr && w->p) + return PGM_PROTECTION; + + return kvm_s390_get_guest_page(kvm, entries + LEVEL_MEM, table.pte.pfra, wr); +} + +static int _do_shadow_pte(struct gmap *sg, gpa_t raddr, union pte *ptep_h, union pte *ptep, + struct guest_fault *f, bool p) +{ + union pgste pgste; + union pte newpte; + int rc; + + lockdep_assert_held(&sg->kvm->mmu_lock); + lockdep_assert_held(&sg->parent->children_lock); + + scoped_guard(spinlock, &sg->host_to_rmap_lock) + rc = gmap_insert_rmap(sg, f->gfn, gpa_to_gfn(raddr), TABLE_TYPE_PAGE_TABLE); + if (rc) + return rc; + + pgste = pgste_get_lock(ptep_h); + newpte = _pte(f->pfn, f->writable, !p, 0); + newpte.s.d |= ptep->s.d; + newpte.s.sd |= ptep->s.sd; + newpte.h.p &= ptep->h.p; + pgste = _gmap_ptep_xchg(sg->parent, ptep_h, newpte, pgste, f->gfn, false); + pgste.vsie_notif = 1; + pgste_set_unlock(ptep_h, pgste); + + newpte = _pte(f->pfn, 0, !p, 0); + pgste = pgste_get_lock(ptep); + pgste = __dat_ptep_xchg(ptep, pgste, newpte, gpa_to_gfn(raddr), sg->asce, uses_skeys(sg)); + pgste_set_unlock(ptep, pgste); + return 0; } -/** - * shadow_pgt_lookup() - find a shadow page table - * @sg: pointer to the shadow guest address space structure - * @saddr: the address in the shadow aguest address space - * @pgt: parent gmap address of the page table to get shadowed - * @dat_protection: if the pgtable is marked as protected by dat - * @fake: pgt references contiguous guest memory block, not a pgtable - * - * Returns 0 if the shadow page table was found and -EAGAIN if the page - * table was not found. - * - * Called with sg->mm->mmap_lock in read. 
- */
-static int shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, unsigned long *pgt,
- int *dat_protection, int *fake)
+static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, union crste *table,
+ struct guest_fault *f, bool p)
 {
- unsigned long pt_index;
- unsigned long *table;
- struct page *page;
+ union crste newcrste;
+ gfn_t gfn;
 int rc;
- spin_lock(&sg->guest_table_lock);
- table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
- if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
- /* Shadow page tables are full pages (pte+pgste) */
- page = pfn_to_page(*table >> PAGE_SHIFT);
- pt_index = gmap_pgste_get_pgt_addr(page_to_virt(page));
- *pgt = pt_index & ~GMAP_SHADOW_FAKE_TABLE;
- *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
- *fake = !!(pt_index & GMAP_SHADOW_FAKE_TABLE);
- rc = 0;
- } else {
- rc = -EAGAIN;
+ lockdep_assert_held(&sg->kvm->mmu_lock);
+ lockdep_assert_held(&sg->parent->children_lock);
+
+ gfn = f->gfn & gpa_to_gfn(is_pmd(*table) ? _SEGMENT_MASK : _REGION3_MASK);
+ scoped_guard(spinlock, &sg->host_to_rmap_lock)
+ rc = gmap_insert_rmap(sg, gfn, gpa_to_gfn(raddr), host->h.tt);
+ if (rc)
+ return rc;
+
+ newcrste = _crste_fc1(f->pfn, host->h.tt, f->writable, !p);
+ newcrste.s.fc1.d |= host->s.fc1.d;
+ newcrste.s.fc1.sd |= host->s.fc1.sd;
+ newcrste.h.p &= host->h.p;
+ newcrste.s.fc1.vsie_notif = 1;
+ newcrste.s.fc1.prefix_notif = host->s.fc1.prefix_notif;
+ _gmap_crstep_xchg(sg->parent, host, newcrste, f->gfn, false);
+
+ newcrste = _crste_fc1(f->pfn, host->h.tt, 0, !p);
+ dat_crstep_xchg(table, newcrste, gpa_to_gfn(raddr), sg->asce);
+ return 0;
+}
+
+static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
+ unsigned long saddr, struct pgtwalk *w)
+{
+ struct guest_fault *entries;
+ int flags, i, hl, gl, l, rc;
+ union crste *table, *host;
+ union pte *ptep, *ptep_h;
+
+ lockdep_assert_held(&sg->kvm->mmu_lock);
+ lockdep_assert_held(&sg->parent->children_lock);
+
+ entries = get_entries(w);
+ ptep_h = NULL;
+ ptep = NULL;
+
+ rc = dat_entry_walk(NULL, gpa_to_gfn(saddr), sg->asce, DAT_WALK_ANY, TABLE_TYPE_PAGE_TABLE,
+ &table, &ptep);
+ if (rc)
+ return rc;
+
+ /* A race occurred. The shadow mapping is already valid, nothing to do */
+ if ((ptep && !ptep->h.i) || (!ptep && crste_leaf(*table)))
+ return 0;
+
+ gl = get_level(table, ptep);
+
+ /*
+ * Skip levels that are already protected. For each level, protect
+ * only the page containing the entry, not the whole table.
+ */
+ for (i = gl ; i >= w->level; i--) {
+ rc = gmap_protect_rmap(mc, sg, entries[i - 1].gfn, gpa_to_gfn(saddr),
+ entries[i - 1].pfn, i, entries[i - 1].writable);
+ if (rc)
+ return rc;
+ }
+
+ rc = dat_entry_walk(NULL, entries[LEVEL_MEM].gfn, sg->parent->asce, DAT_WALK_LEAF,
+ TABLE_TYPE_PAGE_TABLE, &host, &ptep_h);
+ if (rc)
+ return rc;
+
+ hl = get_level(host, ptep_h);
+ /* Get the smallest granularity */
+ l = min3(gl, hl, w->level);
+
+ flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ?
DAT_WALK_USES_SKEYS : 0); + /* If necessary, create the shadow mapping */ + if (l < gl) { + rc = dat_entry_walk(mc, gpa_to_gfn(saddr), sg->asce, flags, l, &table, &ptep); + if (rc) + return rc; + } + if (l < hl) { + rc = dat_entry_walk(mc, entries[LEVEL_MEM].gfn, sg->parent->asce, + flags, l, &host, &ptep_h); + if (rc) + return rc; + } + + if (KVM_BUG_ON(l > TABLE_TYPE_REGION3, sg->kvm)) + return -EFAULT; + if (l == TABLE_TYPE_PAGE_TABLE) + return _do_shadow_pte(sg, saddr, ptep_h, ptep, entries + LEVEL_MEM, w->p); + return _do_shadow_crste(sg, saddr, host, table, entries + LEVEL_MEM, w->p); +} + +static inline int _gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr, + unsigned long seq, struct pgtwalk *walk) +{ + struct gmap *parent; + int rc; + + if (kvm_s390_array_needs_retry_unsafe(vcpu->kvm, seq, walk->raw_entries)) + return -EAGAIN; +again: + rc = kvm_s390_mmu_cache_topup(vcpu->arch.mc); + if (rc) + return rc; + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) { + if (kvm_s390_array_needs_retry_safe(vcpu->kvm, seq, walk->raw_entries)) + return -EAGAIN; + parent = READ_ONCE(sg->parent); + if (!parent) + return -EAGAIN; + scoped_guard(spinlock, &parent->children_lock) { + if (READ_ONCE(sg->parent) != parent) + return -EAGAIN; + rc = _gaccess_do_shadow(vcpu->arch.mc, sg, saddr, walk); + } + if (rc == -ENOMEM) + goto again; + if (!rc) + kvm_s390_release_faultin_array(vcpu->kvm, walk->raw_entries, false); } - spin_unlock(&sg->guest_table_lock); return rc; } /** - * kvm_s390_shadow_fault - handle fault on a shadow page table - * @vcpu: virtual cpu - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @datptr: will contain the address of the faulting DAT table entry, or of - * the valid leaf, plus some flags + * __gaccess_shadow_fault() - Handle fault on a shadow page table. + * @vcpu: Virtual cpu that triggered the action. + * @sg: The shadow guest address space structure. + * @saddr: Faulting address in the shadow gmap. + * @datptr: Will contain the address of the faulting DAT table entry, or of + * the valid leaf, plus some flags. + * @wr: Whether this is a write access. * - * Returns: - 0 if the shadow fault was successfully resolved - * - > 0 (pgm exception code) on exceptions while faulting - * - -EAGAIN if the caller can retry immediately - * - -EFAULT when accessing invalid guest addresses - * - -ENOMEM if out of memory + * Return: + * * %0 if the shadow fault was successfully resolved + * * > 0 (pgm exception code) on exceptions while faulting + * * %-EAGAIN if the caller can retry immediately + * * %-EFAULT when accessing invalid guest addresses + * * %-ENOMEM if out of memory */ -int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, - unsigned long saddr, unsigned long *datptr) +static int __gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr, + union mvpg_pei *datptr, bool wr) { - union vaddress vaddr; - union page_table_entry pte; - unsigned long pgt = 0; - int dat_protection, fake; + struct pgtwalk walk = { .p = false, }; + unsigned long seq; int rc; - if (KVM_BUG_ON(!gmap_is_shadow(sg), vcpu->kvm)) + seq = vcpu->kvm->mmu_invalidate_seq; + /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). 
*/ + smp_rmb(); + + rc = walk_guest_tables(sg, saddr, &walk, wr); + if (datptr) { + datptr->val = walk.last_addr; + datptr->dat_prot = wr && walk.p; + datptr->not_pte = walk.level > TABLE_TYPE_PAGE_TABLE; + datptr->real = sg->guest_asce.r; + } + if (!rc) + rc = _gaccess_shadow_fault(vcpu, sg, saddr, seq, &walk); + if (rc) + kvm_s390_release_faultin_array(vcpu->kvm, walk.raw_entries, true); + return rc; +} + +int gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr, + union mvpg_pei *datptr, bool wr) +{ + int rc; + + if (KVM_BUG_ON(!test_bit(GMAP_FLAG_SHADOW, &sg->flags), vcpu->kvm)) return -EFAULT; - mmap_read_lock(sg->mm); - /* - * We don't want any guest-2 tables to change - so the parent - * tables/pointers we read stay valid - unshadowing is however - * always possible - only guest_table_lock protects us. - */ - ipte_lock(vcpu->kvm); - - rc = shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); + rc = kvm_s390_mmu_cache_topup(vcpu->arch.mc); if (rc) - rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection, - &fake); + return rc; - vaddr.addr = saddr; - if (fake) { - pte.val = pgt + vaddr.px * PAGE_SIZE; - goto shadow_page; - } - - switch (rc) { - case PGM_SEGMENT_TRANSLATION: - case PGM_REGION_THIRD_TRANS: - case PGM_REGION_SECOND_TRANS: - case PGM_REGION_FIRST_TRANS: - pgt |= PEI_NOT_PTE; - break; - case 0: - pgt += vaddr.px * 8; - rc = gmap_read_table(sg->parent, pgt, &pte.val); - } - if (datptr) - *datptr = pgt | dat_protection * PEI_DAT_PROT; - if (!rc && pte.i) - rc = PGM_PAGE_TRANSLATION; - if (!rc && pte.z) - rc = PGM_TRANSLATION_SPEC; -shadow_page: - pte.p |= dat_protection; - if (!rc) - rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); - vcpu->kvm->stat.gmap_shadow_pg_entry++; + ipte_lock(vcpu->kvm); + rc = __gaccess_shadow_fault(vcpu, sg, saddr, datptr, wr || sg->guest_asce.r); ipte_unlock(vcpu->kvm); - mmap_read_unlock(sg->mm); + return rc; } diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 774cdf19998f..b5385cec60f4 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -206,7 +206,7 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, unsigned long len, enum gacc_mode mode); -int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old_addr, +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old, union kvm_s390_quad new, u8 access_key, bool *success); /** @@ -450,11 +450,17 @@ void ipte_unlock(struct kvm *kvm); int ipte_lock_held(struct kvm *kvm); int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra); -/* MVPG PEI indication bits */ -#define PEI_DAT_PROT 2 -#define PEI_NOT_PTE 4 +union mvpg_pei { + unsigned long val; + struct { + unsigned long addr : 61; + unsigned long not_pte : 1; + unsigned long dat_prot: 1; + unsigned long real : 1; + }; +}; -int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow, - unsigned long saddr, unsigned long *datptr); +int gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr, + union mvpg_pei *datptr, bool wr); #endif /* __KVM_S390_GACCESS_H */ diff --git a/arch/s390/kvm/gmap-vsie.c b/arch/s390/kvm/gmap-vsie.c deleted file mode 100644 index 56ef153eb8fe..000000000000 --- a/arch/s390/kvm/gmap-vsie.c +++ /dev/null @@ -1,141 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Guest memory management for KVM/s390 nested VMs. - * - * Copyright IBM Corp. 
2008, 2020, 2024 - * - * Author(s): Claudio Imbrenda - * Martin Schwidefsky - * David Hildenbrand - * Janosch Frank - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "kvm-s390.h" - -/** - * gmap_find_shadow - find a specific asce in the list of shadow tables - * @parent: pointer to the parent gmap - * @asce: ASCE for which the shadow table is created - * @edat_level: edat level to be used for the shadow translation - * - * Returns the pointer to a gmap if a shadow table with the given asce is - * already available, ERR_PTR(-EAGAIN) if another one is just being created, - * otherwise NULL - * - * Context: Called with parent->shadow_lock held - */ -static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, int edat_level) -{ - struct gmap *sg; - - lockdep_assert_held(&parent->shadow_lock); - list_for_each_entry(sg, &parent->children, list) { - if (!gmap_shadow_valid(sg, asce, edat_level)) - continue; - if (!sg->initialized) - return ERR_PTR(-EAGAIN); - refcount_inc(&sg->ref_count); - return sg; - } - return NULL; -} - -/** - * gmap_shadow - create/find a shadow guest address space - * @parent: pointer to the parent gmap - * @asce: ASCE for which the shadow table is created - * @edat_level: edat level to be used for the shadow translation - * - * The pages of the top level page table referred by the asce parameter - * will be set to read-only and marked in the PGSTEs of the kvm process. - * The shadow table will be removed automatically on any change to the - * PTE mapping for the source table. - * - * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, - * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the - * parent gmap table could not be protected. 
- */ -struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level) -{ - struct gmap *sg, *new; - unsigned long limit; - int rc; - - if (KVM_BUG_ON(parent->mm->context.allow_gmap_hpage_1m, (struct kvm *)parent->private) || - KVM_BUG_ON(gmap_is_shadow(parent), (struct kvm *)parent->private)) - return ERR_PTR(-EFAULT); - spin_lock(&parent->shadow_lock); - sg = gmap_find_shadow(parent, asce, edat_level); - spin_unlock(&parent->shadow_lock); - if (sg) - return sg; - /* Create a new shadow gmap */ - limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); - if (asce & _ASCE_REAL_SPACE) - limit = -1UL; - new = gmap_alloc(limit); - if (!new) - return ERR_PTR(-ENOMEM); - new->mm = parent->mm; - new->parent = gmap_get(parent); - new->private = parent->private; - new->orig_asce = asce; - new->edat_level = edat_level; - new->initialized = false; - spin_lock(&parent->shadow_lock); - /* Recheck if another CPU created the same shadow */ - sg = gmap_find_shadow(parent, asce, edat_level); - if (sg) { - spin_unlock(&parent->shadow_lock); - gmap_free(new); - return sg; - } - if (asce & _ASCE_REAL_SPACE) { - /* only allow one real-space gmap shadow */ - list_for_each_entry(sg, &parent->children, list) { - if (sg->orig_asce & _ASCE_REAL_SPACE) { - spin_lock(&sg->guest_table_lock); - gmap_unshadow(sg); - spin_unlock(&sg->guest_table_lock); - list_del(&sg->list); - gmap_put(sg); - break; - } - } - } - refcount_set(&new->ref_count, 2); - list_add(&new->list, &parent->children); - if (asce & _ASCE_REAL_SPACE) { - /* nothing to protect, return right away */ - new->initialized = true; - spin_unlock(&parent->shadow_lock); - return new; - } - spin_unlock(&parent->shadow_lock); - /* protect after insertion, so it will get properly invalidated */ - mmap_read_lock(parent->mm); - rc = __kvm_s390_mprotect_many(parent, asce & _ASCE_ORIGIN, - ((asce & _ASCE_TABLE_LENGTH) + 1), - PROT_READ, GMAP_NOTIFY_SHADOW); - mmap_read_unlock(parent->mm); - spin_lock(&parent->shadow_lock); - new->initialized = true; - if (rc) { - list_del(&new->list); - gmap_free(new); - new = ERR_PTR(rc); - } - spin_unlock(&parent->shadow_lock); - return new; -} diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 420ae62977e2..39aff324203e 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -21,6 +21,7 @@ #include "gaccess.h" #include "trace.h" #include "trace-s390.h" +#include "faultin.h" u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu) { @@ -367,8 +368,11 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu) reg2, &srcaddr, GACC_FETCH, 0); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - rc = kvm_s390_handle_dat_fault(vcpu, srcaddr, 0); - if (rc != 0) + + do { + rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(srcaddr), false); + } while (rc == -EAGAIN); + if (rc) return rc; /* Ensure that the source is paged-in, no actual access -> no key checking */ @@ -376,8 +380,11 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu) reg1, &dstaddr, GACC_STORE, 0); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - rc = kvm_s390_handle_dat_fault(vcpu, dstaddr, FOLL_WRITE); - if (rc != 0) + + do { + rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(dstaddr), true); + } while (rc == -EAGAIN); + if (rc) return rc; kvm_s390_retry_instr(vcpu); diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 249cdc822ec5..f55eca9aa638 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include 
#include @@ -34,6 +33,7 @@ #include "gaccess.h" #include "trace-s390.h" #include "pci.h" +#include "gmap.h" #define PFAULT_INIT 0x0600 #define PFAULT_DONE 0x0680 @@ -2632,12 +2632,12 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) case KVM_DEV_FLIC_APF_ENABLE: if (kvm_is_ucontrol(dev->kvm)) return -EINVAL; - dev->kvm->arch.gmap->pfault_enabled = 1; + set_bit(GMAP_FLAG_PFAULT_ENABLED, &dev->kvm->arch.gmap->flags); break; case KVM_DEV_FLIC_APF_DISABLE_WAIT: if (kvm_is_ucontrol(dev->kvm)) return -EINVAL; - dev->kvm->arch.gmap->pfault_enabled = 0; + clear_bit(GMAP_FLAG_PFAULT_ENABLED, &dev->kvm->arch.gmap->flags); /* * Make sure no async faults are in transition when * clearing the queues. So we don't need to worry diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index f5411e093fb5..bde55761bf8a 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include @@ -53,6 +52,8 @@ #include #include "kvm-s390.h" #include "gaccess.h" +#include "gmap.h" +#include "faultin.h" #include "pci.h" #define CREATE_TRACE_POINTS @@ -264,16 +265,11 @@ static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS) /* available subfunctions indicated via query / "test bit" */ static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc; -static struct gmap_notifier gmap_notifier; -static struct gmap_notifier vsie_gmap_notifier; debug_info_t *kvm_s390_dbf; debug_info_t *kvm_s390_dbf_uv; /* Section: not file related */ /* forward declarations */ -static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, - unsigned long end); - static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta) { u8 delta_idx = 0; @@ -529,10 +525,6 @@ static int __init __kvm_s390_init(void) if (rc) goto err_gib; - gmap_notifier.notifier_call = kvm_gmap_notifier; - gmap_register_pte_notifier(&gmap_notifier); - vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier; - gmap_register_pte_notifier(&vsie_gmap_notifier); atomic_notifier_chain_register(&s390_epoch_delta_notifier, &kvm_clock_notifier); @@ -552,8 +544,6 @@ err_kvm_uv: static void __kvm_s390_exit(void) { - gmap_unregister_pte_notifier(&gmap_notifier); - gmap_unregister_pte_notifier(&vsie_gmap_notifier); atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, &kvm_clock_notifier); @@ -569,7 +559,7 @@ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { if (ioctl == KVM_S390_ENABLE_SIE) - return s390_enable_sie(); + return 0; return -EINVAL; } @@ -698,32 +688,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { - int i; - gfn_t cur_gfn, last_gfn; - unsigned long gaddr, vmaddr; - struct gmap *gmap = kvm->arch.gmap; - DECLARE_BITMAP(bitmap, _PAGE_ENTRIES); + gfn_t last_gfn = memslot->base_gfn + memslot->npages; - /* Loop over all guest segments */ - cur_gfn = memslot->base_gfn; - last_gfn = memslot->base_gfn + memslot->npages; - for (; cur_gfn <= last_gfn; cur_gfn += _PAGE_ENTRIES) { - gaddr = gfn_to_gpa(cur_gfn); - vmaddr = gfn_to_hva_memslot(memslot, cur_gfn); - if (kvm_is_error_hva(vmaddr)) - continue; - - bitmap_zero(bitmap, _PAGE_ENTRIES); - gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr, vmaddr); - for (i = 0; i < _PAGE_ENTRIES; i++) { - if (test_bit(i, bitmap)) - mark_page_dirty(kvm, cur_gfn + i); - } - - if (fatal_signal_pending(current)) - return; - cond_resched(); - 
} + scoped_guard(read_lock, &kvm->mmu_lock) + gmap_sync_dirty_log(kvm->arch.gmap, memslot->base_gfn, last_gfn); } /* Section: vm related */ @@ -883,9 +851,6 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) r = -EINVAL; else { r = 0; - mmap_write_lock(kvm->mm); - kvm->mm->context.allow_gmap_hpage_1m = 1; - mmap_write_unlock(kvm->mm); /* * We might have to create fake 4k page * tables. To avoid that the hardware works on @@ -958,7 +923,7 @@ static int kvm_s390_get_mem_control(struct kvm *kvm, struct kvm_device_attr *att static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *attr) { int ret; - unsigned int idx; + switch (attr->attr) { case KVM_S390_VM_MEM_ENABLE_CMMA: ret = -ENXIO; @@ -969,8 +934,6 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att mutex_lock(&kvm->lock); if (kvm->created_vcpus) ret = -EBUSY; - else if (kvm->mm->context.allow_gmap_hpage_1m) - ret = -EINVAL; else { kvm->arch.use_cmma = 1; /* Not compatible with cmma. */ @@ -979,7 +942,9 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att } mutex_unlock(&kvm->lock); break; - case KVM_S390_VM_MEM_CLR_CMMA: + case KVM_S390_VM_MEM_CLR_CMMA: { + gfn_t start_gfn = 0; + ret = -ENXIO; if (!sclp.has_cmma) break; @@ -988,13 +953,13 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att break; VM_EVENT(kvm, 3, "%s", "RESET: CMMA states"); - mutex_lock(&kvm->lock); - idx = srcu_read_lock(&kvm->srcu); - s390_reset_cmma(kvm->arch.gmap->mm); - srcu_read_unlock(&kvm->srcu, idx); - mutex_unlock(&kvm->lock); + do { + start_gfn = dat_reset_cmma(kvm->arch.gmap->asce, start_gfn); + cond_resched(); + } while (start_gfn); ret = 0; break; + } case KVM_S390_VM_MEM_LIMIT_SIZE: { unsigned long new_limit; @@ -1011,29 +976,12 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att if (!new_limit) return -EINVAL; - /* gmap_create takes last usable address */ - if (new_limit != KVM_S390_NO_MEM_LIMIT) - new_limit -= 1; - ret = -EBUSY; - mutex_lock(&kvm->lock); - if (!kvm->created_vcpus) { - /* gmap_create will round the limit up */ - struct gmap *new = gmap_create(current->mm, new_limit); - - if (!new) { - ret = -ENOMEM; - } else { - gmap_remove(kvm->arch.gmap); - new->private = kvm; - kvm->arch.gmap = new; - ret = 0; - } - } - mutex_unlock(&kvm->lock); + if (!kvm->created_vcpus) + ret = gmap_set_limit(kvm->arch.gmap, gpa_to_gfn(new_limit)); VM_EVENT(kvm, 3, "SET: max guest address: %lu", new_limit); VM_EVENT(kvm, 3, "New guest asce: 0x%p", - (void *) kvm->arch.gmap->asce); + (void *)kvm->arch.gmap->asce.val); break; } default: @@ -1198,19 +1146,13 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm) kvm->arch.migration_mode = 1; return 0; } - /* mark all the pages in active slots as dirty */ kvm_for_each_memslot(ms, bkt, slots) { if (!ms->dirty_bitmap) return -EINVAL; - /* - * The second half of the bitmap is only used on x86, - * and would be wasted otherwise, so we put it to good - * use here to keep track of the state of the storage - * attributes. 
- */ - memset(kvm_second_dirty_bitmap(ms), 0xff, kvm_dirty_bitmap_bytes(ms)); ram_pages += ms->npages; } + /* mark all the pages as dirty */ + gmap_set_cmma_all_dirty(kvm->arch.gmap); atomic64_set(&kvm->arch.cmma_dirty_pages, ram_pages); kvm->arch.migration_mode = 1; kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION); @@ -2116,40 +2058,32 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { - uint8_t *keys; - uint64_t hva; - int srcu_idx, i, r = 0; + union skey *keys; + int i, r = 0; if (args->flags != 0) return -EINVAL; /* Is this guest using storage keys? */ - if (!mm_uses_skeys(current->mm)) + if (!uses_skeys(kvm->arch.gmap)) return KVM_S390_GET_SKEYS_NONE; /* Enforce sane limit on memory allocation */ if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX) return -EINVAL; - keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT); + keys = kvmalloc_array(args->count, sizeof(*keys), GFP_KERNEL_ACCOUNT); if (!keys) return -ENOMEM; - mmap_read_lock(current->mm); - srcu_idx = srcu_read_lock(&kvm->srcu); - for (i = 0; i < args->count; i++) { - hva = gfn_to_hva(kvm, args->start_gfn + i); - if (kvm_is_error_hva(hva)) { - r = -EFAULT; - break; + scoped_guard(read_lock, &kvm->mmu_lock) { + for (i = 0; i < args->count; i++) { + r = dat_get_storage_key(kvm->arch.gmap->asce, + args->start_gfn + i, keys + i); + if (r) + break; } - - r = get_guest_storage_key(current->mm, hva, &keys[i]); - if (r) - break; } - srcu_read_unlock(&kvm->srcu, srcu_idx); - mmap_read_unlock(current->mm); if (!r) { r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys, @@ -2164,10 +2098,9 @@ static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { - uint8_t *keys; - uint64_t hva; - int srcu_idx, i, r = 0; - bool unlocked; + struct kvm_s390_mmu_cache *mc; + union skey *keys; + int i, r = 0; if (args->flags != 0) return -EINVAL; @@ -2176,7 +2109,7 @@ static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX) return -EINVAL; - keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT); + keys = kvmalloc_array(args->count, sizeof(*keys), GFP_KERNEL_ACCOUNT); if (!keys) return -ENOMEM; @@ -2188,161 +2121,43 @@ static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) } /* Enable storage key handling for the guest */ - r = s390_enable_skey(); + r = gmap_enable_skeys(kvm->arch.gmap); if (r) goto out; - i = 0; - mmap_read_lock(current->mm); - srcu_idx = srcu_read_lock(&kvm->srcu); - while (i < args->count) { - unlocked = false; - hva = gfn_to_hva(kvm, args->start_gfn + i); - if (kvm_is_error_hva(hva)) { - r = -EFAULT; - break; - } - + r = -EINVAL; + for (i = 0; i < args->count; i++) { /* Lowest order bit is reserved */ - if (keys[i] & 0x01) { - r = -EINVAL; - break; - } - - r = set_guest_storage_key(current->mm, hva, keys[i], 0); - if (r) { - r = fixup_user_fault(current->mm, hva, - FAULT_FLAG_WRITE, &unlocked); - if (r) - break; - } - if (!r) - i++; + if (keys[i].zero) + goto out; } - srcu_read_unlock(&kvm->srcu, srcu_idx); - mmap_read_unlock(current->mm); + + mc = kvm_s390_new_mmu_cache(); + if (!mc) { + r = -ENOMEM; + goto out; + } + + r = 0; + do { + r = kvm_s390_mmu_cache_topup(mc); + if (r == -ENOMEM) + break; + scoped_guard(read_lock, &kvm->mmu_lock) { + for (i = 0 ; i < args->count; i++) 
{ + r = dat_set_storage_key(mc, kvm->arch.gmap->asce, + args->start_gfn + i, keys[i], 0); + if (r) + break; + } + } + } while (r == -ENOMEM); + kvm_s390_free_mmu_cache(mc); out: kvfree(keys); return r; } -/* - * Base address and length must be sent at the start of each block, therefore - * it's cheaper to send some clean data, as long as it's less than the size of - * two longs. - */ -#define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *)) -/* for consistency */ -#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX) - -static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, - u8 *res, unsigned long bufsize) -{ - unsigned long pgstev, hva, cur_gfn = args->start_gfn; - - args->count = 0; - while (args->count < bufsize) { - hva = gfn_to_hva(kvm, cur_gfn); - /* - * We return an error if the first value was invalid, but we - * return successfully if at least one value was copied. - */ - if (kvm_is_error_hva(hva)) - return args->count ? 0 : -EFAULT; - if (get_pgste(kvm->mm, hva, &pgstev) < 0) - pgstev = 0; - res[args->count++] = (pgstev >> 24) & 0x43; - cur_gfn++; - } - - return 0; -} - -static struct kvm_memory_slot *gfn_to_memslot_approx(struct kvm_memslots *slots, - gfn_t gfn) -{ - return ____gfn_to_memslot(slots, gfn, true); -} - -static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots, - unsigned long cur_gfn) -{ - struct kvm_memory_slot *ms = gfn_to_memslot_approx(slots, cur_gfn); - unsigned long ofs = cur_gfn - ms->base_gfn; - struct rb_node *mnode = &ms->gfn_node[slots->node_idx]; - - if (ms->base_gfn + ms->npages <= cur_gfn) { - mnode = rb_next(mnode); - /* If we are above the highest slot, wrap around */ - if (!mnode) - mnode = rb_first(&slots->gfn_tree); - - ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]); - ofs = 0; - } - - if (cur_gfn < ms->base_gfn) - ofs = 0; - - ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs); - while (ofs >= ms->npages && (mnode = rb_next(mnode))) { - ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]); - ofs = find_first_bit(kvm_second_dirty_bitmap(ms), ms->npages); - } - return ms->base_gfn + ofs; -} - -static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, - u8 *res, unsigned long bufsize) -{ - unsigned long mem_end, cur_gfn, next_gfn, hva, pgstev; - struct kvm_memslots *slots = kvm_memslots(kvm); - struct kvm_memory_slot *ms; - - if (unlikely(kvm_memslots_empty(slots))) - return 0; - - cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn); - ms = gfn_to_memslot(kvm, cur_gfn); - args->count = 0; - args->start_gfn = cur_gfn; - if (!ms) - return 0; - next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1); - mem_end = kvm_s390_get_gfn_end(slots); - - while (args->count < bufsize) { - hva = gfn_to_hva(kvm, cur_gfn); - if (kvm_is_error_hva(hva)) - return 0; - /* Decrement only if we actually flipped the bit to 0 */ - if (test_and_clear_bit(cur_gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms))) - atomic64_dec(&kvm->arch.cmma_dirty_pages); - if (get_pgste(kvm->mm, hva, &pgstev) < 0) - pgstev = 0; - /* Save the value */ - res[args->count++] = (pgstev >> 24) & 0x43; - /* If the next bit is too far away, stop. 
*/ - if (next_gfn > cur_gfn + KVM_S390_MAX_BIT_DISTANCE) - return 0; - /* If we reached the previous "next", find the next one */ - if (cur_gfn == next_gfn) - next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1); - /* Reached the end of memory or of the buffer, stop */ - if ((next_gfn >= mem_end) || - (next_gfn - args->start_gfn >= bufsize)) - return 0; - cur_gfn++; - /* Reached the end of the current memslot, take the next one. */ - if (cur_gfn - ms->base_gfn >= ms->npages) { - ms = gfn_to_memslot(kvm, cur_gfn); - if (!ms) - return 0; - } - } - return 0; -} - /* * This function searches for the next page with dirty CMMA attributes, and * saves the attributes in the buffer up to either the end of the buffer or @@ -2354,8 +2169,7 @@ static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, static int kvm_s390_get_cmma_bits(struct kvm *kvm, struct kvm_s390_cmma_log *args) { - unsigned long bufsize; - int srcu_idx, peek, ret; + int peek, ret; u8 *values; if (!kvm->arch.use_cmma) @@ -2368,8 +2182,8 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm, if (!peek && !kvm->arch.migration_mode) return -EINVAL; /* CMMA is disabled or was not used, or the buffer has length zero */ - bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX); - if (!bufsize || !kvm->mm->context.uses_cmm) { + args->count = min(args->count, KVM_S390_CMMA_SIZE_MAX); + if (!args->count || !uses_cmm(kvm->arch.gmap)) { memset(args, 0, sizeof(*args)); return 0; } @@ -2379,18 +2193,18 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm, return 0; } - values = vmalloc(bufsize); + values = vmalloc(args->count); if (!values) return -ENOMEM; - mmap_read_lock(kvm->mm); - srcu_idx = srcu_read_lock(&kvm->srcu); - if (peek) - ret = kvm_s390_peek_cmma(kvm, args, values, bufsize); - else - ret = kvm_s390_get_cmma(kvm, args, values, bufsize); - srcu_read_unlock(&kvm->srcu, srcu_idx); - mmap_read_unlock(kvm->mm); + scoped_guard(read_lock, &kvm->mmu_lock) { + if (peek) + ret = dat_peek_cmma(args->start_gfn, kvm->arch.gmap->asce, &args->count, + values); + else + ret = dat_get_cmma(kvm->arch.gmap->asce, &args->start_gfn, &args->count, + values, &kvm->arch.cmma_dirty_pages); + } if (kvm->arch.migration_mode) args->remaining = atomic64_read(&kvm->arch.cmma_dirty_pages); @@ -2412,11 +2226,9 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm, static int kvm_s390_set_cmma_bits(struct kvm *kvm, const struct kvm_s390_cmma_log *args) { - unsigned long hva, mask, pgstev, i; - uint8_t *bits; - int srcu_idx, r = 0; - - mask = args->mask; + struct kvm_s390_mmu_cache *mc; + u8 *bits = NULL; + int r = 0; if (!kvm->arch.use_cmma) return -ENXIO; @@ -2430,9 +2242,12 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm, if (args->count == 0) return 0; + mc = kvm_s390_new_mmu_cache(); + if (!mc) + return -ENOMEM; bits = vmalloc(array_size(sizeof(*bits), args->count)); if (!bits) - return -ENOMEM; + goto out; r = copy_from_user(bits, (void __user *)args->values, args->count); if (r) { @@ -2440,29 +2255,19 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm, goto out; } - mmap_read_lock(kvm->mm); - srcu_idx = srcu_read_lock(&kvm->srcu); - for (i = 0; i < args->count; i++) { - hva = gfn_to_hva(kvm, args->start_gfn + i); - if (kvm_is_error_hva(hva)) { - r = -EFAULT; + do { + r = kvm_s390_mmu_cache_topup(mc); + if (r) break; + scoped_guard(read_lock, &kvm->mmu_lock) { + r = dat_set_cmma_bits(mc, kvm->arch.gmap->asce, args->start_gfn, + args->count, args->mask, bits); } + } while (r == -ENOMEM); - pgstev = bits[i]; - pgstev = pgstev << 24; - 
mask &= _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT; - set_pgste_bits(kvm->mm, hva, mask, pgstev); - } - srcu_read_unlock(&kvm->srcu, srcu_idx); - mmap_read_unlock(kvm->mm); - - if (!kvm->mm->context.uses_cmm) { - mmap_write_lock(kvm->mm); - kvm->mm->context.uses_cmm = 1; - mmap_write_unlock(kvm->mm); - } + set_bit(GMAP_FLAG_USES_CMM, &kvm->arch.gmap->flags); out: + kvm_s390_free_mmu_cache(mc); vfree(bits); return r; } @@ -2671,6 +2476,13 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) break; mmap_write_lock(kvm->mm); + /* + * Disable creation of new THPs. Existing THPs can stay, they + * will be split when any part of them gets imported. + */ + mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, kvm->mm); + mm_flags_set(MMF_DISABLE_THP_COMPLETELY, kvm->mm); + set_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &kvm->arch.gmap->flags); r = gmap_helper_disable_cow_sharing(); mmap_write_unlock(kvm->mm); if (r) @@ -2918,9 +2730,6 @@ static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop) acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? GACC_FETCH : GACC_STORE; scoped_guard(srcu, &kvm->srcu) { - if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) - return PGM_ADDRESSING; - if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) return check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key); @@ -2933,7 +2742,6 @@ static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop) if (acc_mode != GACC_STORE && copy_to_user(uaddr, tmpbuf, mop->size)) return -EFAULT; } - return 0; } @@ -2962,9 +2770,6 @@ static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *m return -EFAULT; scoped_guard(srcu, &kvm->srcu) { - if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) - return PGM_ADDRESSING; - r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old, new, mop->key, &success); @@ -3322,11 +3127,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (type) goto out_err; #endif - - rc = s390_enable_sie(); - if (rc) - goto out_err; - rc = -ENOMEM; if (!sclp.has_64bscao) @@ -3400,6 +3200,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) debug_register_view(kvm->arch.dbf, &debug_sprintf_view); VM_EVENT(kvm, 3, "vm created with type %lu", type); + kvm->arch.mem_limit = type & KVM_VM_S390_UCONTROL ? 
KVM_S390_NO_MEM_LIMIT : sclp.hamax + 1; + kvm->arch.gmap = gmap_new(kvm, gpa_to_gfn(kvm->arch.mem_limit)); + if (!kvm->arch.gmap) + goto out_err; + clear_bit(GMAP_FLAG_PFAULT_ENABLED, &kvm->arch.gmap->flags); + if (type & KVM_VM_S390_UCONTROL) { struct kvm_userspace_memory_region2 fake_memslot = { .slot = KVM_S390_UCONTROL_MEMSLOT, @@ -3409,23 +3215,15 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) .flags = 0, }; - kvm->arch.gmap = NULL; - kvm->arch.mem_limit = KVM_S390_NO_MEM_LIMIT; /* one flat fake memslot covering the whole address-space */ mutex_lock(&kvm->slots_lock); KVM_BUG_ON(kvm_set_internal_memslot(kvm, &fake_memslot), kvm); mutex_unlock(&kvm->slots_lock); + set_bit(GMAP_FLAG_IS_UCONTROL, &kvm->arch.gmap->flags); } else { - if (sclp.hamax == U64_MAX) - kvm->arch.mem_limit = TASK_SIZE_MAX; - else - kvm->arch.mem_limit = min_t(unsigned long, TASK_SIZE_MAX, - sclp.hamax + 1); - kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1); - if (!kvm->arch.gmap) - goto out_err; - kvm->arch.gmap->private = kvm; - kvm->arch.gmap->pfault_enabled = 0; + struct crst_table *table = dereference_asce(kvm->arch.gmap->asce); + + crst_table_init((void *)table, _CRSTE_HOLE(table->crstes[0].h.tt).val); } kvm->arch.use_pfmfi = sclp.has_pfmfi; @@ -3459,8 +3257,11 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) sca_del_vcpu(vcpu); kvm_s390_update_topology_change_report(vcpu->kvm, 1); - if (kvm_is_ucontrol(vcpu->kvm)) - gmap_remove(vcpu->arch.gmap); + if (kvm_is_ucontrol(vcpu->kvm)) { + scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) + gmap_remove_child(vcpu->arch.gmap); + vcpu->arch.gmap = gmap_put(vcpu->arch.gmap); + } if (vcpu->kvm->arch.use_cmma) kvm_s390_vcpu_unsetup_cmma(vcpu); @@ -3468,6 +3269,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) if (kvm_s390_pv_cpu_get_handle(vcpu)) kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc); free_page((unsigned long)(vcpu->arch.sie_block)); + kvm_s390_free_mmu_cache(vcpu->arch.mc); } void kvm_arch_destroy_vm(struct kvm *kvm) @@ -3494,25 +3296,14 @@ void kvm_arch_destroy_vm(struct kvm *kvm) debug_unregister(kvm->arch.dbf); free_page((unsigned long)kvm->arch.sie_page2); - if (!kvm_is_ucontrol(kvm)) - gmap_remove(kvm->arch.gmap); kvm_s390_destroy_adapters(kvm); kvm_s390_clear_float_irqs(kvm); kvm_s390_vsie_destroy(kvm); + kvm->arch.gmap = gmap_put(kvm->arch.gmap); KVM_EVENT(3, "vm 0x%p destroyed", kvm); } /* Section: vcpu related */ -static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu) -{ - vcpu->arch.gmap = gmap_create(current->mm, -1UL); - if (!vcpu->arch.gmap) - return -ENOMEM; - vcpu->arch.gmap->private = vcpu->kvm; - - return 0; -} - static void sca_del_vcpu(struct kvm_vcpu *vcpu) { struct esca_block *sca = vcpu->kvm->arch.sca; @@ -3853,9 +3644,15 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) int rc; BUILD_BUG_ON(sizeof(struct sie_page) != 4096); - sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!sie_page) + vcpu->arch.mc = kvm_s390_new_mmu_cache(); + if (!vcpu->arch.mc) return -ENOMEM; + sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!sie_page) { + kvm_s390_free_mmu_cache(vcpu->arch.mc); + vcpu->arch.mc = NULL; + return -ENOMEM; + } vcpu->arch.sie_block = &sie_page->sie_block; vcpu->arch.sie_block->itdba = virt_to_phys(&sie_page->itdb); @@ -3897,8 +3694,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS; if (kvm_is_ucontrol(vcpu->kvm)) { - rc = __kvm_ucontrol_vcpu_init(vcpu); - if (rc) + rc = -ENOMEM; + 
vcpu->arch.gmap = gmap_new_child(vcpu->kvm->arch.gmap, -1UL); + if (!vcpu->arch.gmap) goto out_free_sie_block; } @@ -3914,8 +3712,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) return 0; out_ucontrol_uninit: - if (kvm_is_ucontrol(vcpu->kvm)) - gmap_remove(vcpu->arch.gmap); + if (kvm_is_ucontrol(vcpu->kvm)) { + gmap_remove_child(vcpu->arch.gmap); + vcpu->arch.gmap = gmap_put(vcpu->arch.gmap); + } out_free_sie_block: free_page((unsigned long)(vcpu->arch.sie_block)); return rc; @@ -3979,32 +3779,6 @@ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu) kvm_s390_vcpu_request(vcpu); } -static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, - unsigned long end) -{ - struct kvm *kvm = gmap->private; - struct kvm_vcpu *vcpu; - unsigned long prefix; - unsigned long i; - - trace_kvm_s390_gmap_notifier(start, end, gmap_is_shadow(gmap)); - - if (gmap_is_shadow(gmap)) - return; - if (start >= 1UL << 31) - /* We are only interested in prefix pages */ - return; - kvm_for_each_vcpu(i, vcpu, kvm) { - /* match against both prefix pages */ - prefix = kvm_s390_get_prefix(vcpu); - if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) { - VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx", - start, end); - kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); - } - } -} - bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) { /* do not poll with more than halt_poll_max_steal percent of steal time */ @@ -4386,72 +4160,41 @@ static bool ibs_enabled(struct kvm_vcpu *vcpu) return kvm_s390_test_cpuflags(vcpu, CPUSTAT_IBS); } -static int __kvm_s390_fixup_fault_sync(struct gmap *gmap, gpa_t gaddr, unsigned int flags) +static int vcpu_ucontrol_translate(struct kvm_vcpu *vcpu, gpa_t *gaddr) { - struct kvm *kvm = gmap->private; - gfn_t gfn = gpa_to_gfn(gaddr); - bool unlocked; - hva_t vmaddr; - gpa_t tmp; int rc; - if (kvm_is_ucontrol(kvm)) { - tmp = __gmap_translate(gmap, gaddr); - gfn = gpa_to_gfn(tmp); - } - - vmaddr = gfn_to_hva(kvm, gfn); - rc = fixup_user_fault(gmap->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); - if (!rc) - rc = __gmap_link(gmap, gaddr, vmaddr); - return rc; -} - -/** - * __kvm_s390_mprotect_many() - Apply specified protection to guest pages - * @gmap: the gmap of the guest - * @gpa: the starting guest address - * @npages: how many pages to protect - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * @bits: pgste notification bits to set - * - * Returns: 0 in case of success, < 0 in case of error - see gmap_protect_one() - * - * Context: kvm->srcu and gmap->mm need to be held in read mode - */ -int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot, - unsigned long bits) -{ - unsigned int fault_flag = (prot & PROT_WRITE) ? 
FAULT_FLAG_WRITE : 0; - gpa_t end = gpa + npages * PAGE_SIZE; - int rc; - - for (; gpa < end; gpa = ALIGN(gpa + 1, rc)) { - rc = gmap_protect_one(gmap, gpa, prot, bits); - if (rc == -EAGAIN) { - __kvm_s390_fixup_fault_sync(gmap, gpa, fault_flag); - rc = gmap_protect_one(gmap, gpa, prot, bits); + if (kvm_is_ucontrol(vcpu->kvm)) { + rc = gmap_ucas_translate(vcpu->arch.mc, vcpu->arch.gmap, gaddr); + if (rc == -EREMOTE) { + vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL; + vcpu->run->s390_ucontrol.trans_exc_code = *gaddr; + vcpu->run->s390_ucontrol.pgm_code = PGM_SEGMENT_TRANSLATION; } - if (rc < 0) - return rc; + return rc; } - return 0; } -static int kvm_s390_mprotect_notify_prefix(struct kvm_vcpu *vcpu) +static int kvm_s390_fixup_prefix(struct kvm_vcpu *vcpu) { gpa_t gaddr = kvm_s390_get_prefix(vcpu); - int idx, rc; + gfn_t gfn; + int rc; - idx = srcu_read_lock(&vcpu->kvm->srcu); - mmap_read_lock(vcpu->arch.gmap->mm); + if (vcpu_ucontrol_translate(vcpu, &gaddr)) + return -EREMOTE; + gfn = gpa_to_gfn(gaddr); - rc = __kvm_s390_mprotect_many(vcpu->arch.gmap, gaddr, 2, PROT_WRITE, GMAP_NOTIFY_MPROT); - - mmap_read_unlock(vcpu->arch.gmap->mm); - srcu_read_unlock(&vcpu->kvm->srcu, idx); + rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gfn, true); + if (rc) + return rc; + rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gfn + 1, true); + if (rc) + return rc; + scoped_guard(write_lock, &vcpu->kvm->mmu_lock) + rc = dat_set_prefix_notif_bit(vcpu->kvm->arch.gmap->asce, gfn); return rc; } @@ -4471,7 +4214,7 @@ retry: if (kvm_check_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu)) { int rc; - rc = kvm_s390_mprotect_notify_prefix(vcpu); + rc = kvm_s390_fixup_prefix(vcpu); if (rc) { kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); return rc; @@ -4520,8 +4263,7 @@ retry: * Re-enable CMM virtualization if CMMA is available and * CMM has been used. */ - if ((vcpu->kvm->arch.use_cmma) && - (vcpu->kvm->mm->context.uses_cmm)) + if (vcpu->kvm->arch.use_cmma && uses_cmm(vcpu->arch.gmap)) vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; goto retry; } @@ -4633,7 +4375,7 @@ bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) return false; if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) return false; - if (!vcpu->arch.gmap->pfault_enabled) + if (!pfault_enabled(vcpu->arch.gmap)) return false; hva = gfn_to_hva(vcpu->kvm, current->thread.gmap_teid.addr); @@ -4726,98 +4468,25 @@ static void kvm_s390_assert_primary_as(struct kvm_vcpu *vcpu) current->thread.gmap_int_code, current->thread.gmap_teid.val); } -/* - * __kvm_s390_handle_dat_fault() - handle a dat fault for the gmap of a vcpu - * @vcpu: the vCPU whose gmap is to be fixed up - * @gfn: the guest frame number used for memslots (including fake memslots) - * @gaddr: the gmap address, does not have to match @gfn for ucontrol gmaps - * @foll: FOLL_* flags - * - * Return: 0 on success, < 0 in case of error. - * Context: The mm lock must not be held before calling. May sleep. 
- */ -int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int foll) +static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, gpa_t gaddr, bool wr) { - struct kvm_memory_slot *slot; - unsigned int fault_flags; - bool writable, unlocked; - unsigned long vmaddr; - struct page *page; - kvm_pfn_t pfn; + struct guest_fault f = { + .write_attempt = wr, + .attempt_pfault = pfault_enabled(vcpu->arch.gmap), + }; int rc; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + if (vcpu_ucontrol_translate(vcpu, &gaddr)) + return -EREMOTE; + f.gfn = gpa_to_gfn(gaddr); + + rc = kvm_s390_faultin_gfn(vcpu, NULL, &f); + if (rc <= 0) + return rc; + if (rc == PGM_ADDRESSING) return vcpu_post_run_addressing_exception(vcpu); - - fault_flags = foll & FOLL_WRITE ? FAULT_FLAG_WRITE : 0; - if (vcpu->arch.gmap->pfault_enabled) - foll |= FOLL_NOWAIT; - vmaddr = __gfn_to_hva_memslot(slot, gfn); - -try_again: - pfn = __kvm_faultin_pfn(slot, gfn, foll, &writable, &page); - - /* Access outside memory, inject addressing exception */ - if (is_noslot_pfn(pfn)) - return vcpu_post_run_addressing_exception(vcpu); - /* Signal pending: try again */ - if (pfn == KVM_PFN_ERR_SIGPENDING) - return -EAGAIN; - - /* Needs I/O, try to setup async pfault (only possible with FOLL_NOWAIT) */ - if (pfn == KVM_PFN_ERR_NEEDS_IO) { - trace_kvm_s390_major_guest_pfault(vcpu); - if (kvm_arch_setup_async_pf(vcpu)) - return 0; - vcpu->stat.pfault_sync++; - /* Could not setup async pfault, try again synchronously */ - foll &= ~FOLL_NOWAIT; - goto try_again; - } - /* Any other error */ - if (is_error_pfn(pfn)) - return -EFAULT; - - /* Success */ - mmap_read_lock(vcpu->arch.gmap->mm); - /* Mark the userspace PTEs as young and/or dirty, to avoid page fault loops */ - rc = fixup_user_fault(vcpu->arch.gmap->mm, vmaddr, fault_flags, &unlocked); - if (!rc) - rc = __gmap_link(vcpu->arch.gmap, gaddr, vmaddr); - scoped_guard(read_lock, &vcpu->kvm->mmu_lock) { - kvm_release_faultin_page(vcpu->kvm, page, false, writable); - } - mmap_read_unlock(vcpu->arch.gmap->mm); - return rc; -} - -static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int foll) -{ - unsigned long gaddr_tmp; - gfn_t gfn; - - gfn = gpa_to_gfn(gaddr); - if (kvm_is_ucontrol(vcpu->kvm)) { - /* - * This translates the per-vCPU guest address into a - * fake guest address, which can then be used with the - * fake memslots that are identity mapping userspace. - * This allows ucontrol VMs to use the normal fault - * resolution path, like normal VMs. 
- */ - mmap_read_lock(vcpu->arch.gmap->mm); - gaddr_tmp = __gmap_translate(vcpu->arch.gmap, gaddr); - mmap_read_unlock(vcpu->arch.gmap->mm); - if (gaddr_tmp == -EFAULT) { - vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL; - vcpu->run->s390_ucontrol.trans_exc_code = gaddr; - vcpu->run->s390_ucontrol.pgm_code = PGM_SEGMENT_TRANSLATION; - return -EREMOTE; - } - gfn = gpa_to_gfn(gaddr_tmp); - } - return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, foll); + KVM_BUG_ON(rc, vcpu->kvm); + return -EINVAL; } static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) @@ -4994,7 +4663,7 @@ xfer_to_guest_mode_check: exit_reason = kvm_s390_enter_exit_sie(vcpu->arch.sie_block, vcpu->run->s.regs.gprs, - vcpu->arch.gmap->asce); + vcpu->arch.gmap->asce.val); __enable_cpu_timer_accounting(vcpu); guest_timing_exit_irqoff(); @@ -5529,8 +5198,8 @@ static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; + void *tmpbuf __free(kvfree) = NULL; enum gacc_mode acc_mode; - void *tmpbuf = NULL; int r; r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_INJECT_EXCEPTION | @@ -5552,32 +5221,21 @@ static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu, if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, acc_mode, mop->key); - goto out_inject; - } - if (acc_mode == GACC_FETCH) { + } else if (acc_mode == GACC_FETCH) { r = read_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size, mop->key); - if (r) - goto out_inject; - if (copy_to_user(uaddr, tmpbuf, mop->size)) { - r = -EFAULT; - goto out_free; - } + if (!r && copy_to_user(uaddr, tmpbuf, mop->size)) + return -EFAULT; } else { - if (copy_from_user(tmpbuf, uaddr, mop->size)) { - r = -EFAULT; - goto out_free; - } + if (copy_from_user(tmpbuf, uaddr, mop->size)) + return -EFAULT; r = write_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size, mop->key); } -out_inject: if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0) kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); -out_free: - vfree(tmpbuf); return r; } @@ -5767,37 +5425,39 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } #ifdef CONFIG_KVM_S390_UCONTROL case KVM_S390_UCAS_MAP: { - struct kvm_s390_ucas_mapping ucasmap; + struct kvm_s390_ucas_mapping ucas; - if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) { - r = -EFAULT; + r = -EFAULT; + if (copy_from_user(&ucas, argp, sizeof(ucas))) break; - } - if (!kvm_is_ucontrol(vcpu->kvm)) { - r = -EINVAL; + r = -EINVAL; + if (!kvm_is_ucontrol(vcpu->kvm)) + break; + if (!IS_ALIGNED(ucas.user_addr | ucas.vcpu_addr | ucas.length, _SEGMENT_SIZE)) break; - } - r = gmap_map_segment(vcpu->arch.gmap, ucasmap.user_addr, - ucasmap.vcpu_addr, ucasmap.length); + r = gmap_ucas_map(vcpu->arch.gmap, gpa_to_gfn(ucas.user_addr), + gpa_to_gfn(ucas.vcpu_addr), + ucas.length >> _SEGMENT_SHIFT); break; } case KVM_S390_UCAS_UNMAP: { - struct kvm_s390_ucas_mapping ucasmap; + struct kvm_s390_ucas_mapping ucas; - if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) { - r = -EFAULT; + r = -EFAULT; + if (copy_from_user(&ucas, argp, sizeof(ucas))) break; - } - if (!kvm_is_ucontrol(vcpu->kvm)) { - r = -EINVAL; + r = -EINVAL; + if (!kvm_is_ucontrol(vcpu->kvm)) + break; + if (!IS_ALIGNED(ucas.vcpu_addr | ucas.length, _SEGMENT_SIZE)) break; - } - r = gmap_unmap_segment(vcpu->arch.gmap, ucasmap.vcpu_addr, - ucasmap.length); + gmap_ucas_unmap(vcpu->arch.gmap, gpa_to_gfn(ucas.vcpu_addr), + ucas.length >> _SEGMENT_SHIFT); + r = 0; break; } #endif @@ 
-5970,34 +5630,41 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *new, enum kvm_mr_change change) { + struct kvm_s390_mmu_cache *mc = NULL; int rc = 0; - if (kvm_is_ucontrol(kvm)) + if (change == KVM_MR_FLAGS_ONLY) return; - switch (change) { - case KVM_MR_DELETE: - rc = gmap_unmap_segment(kvm->arch.gmap, old->base_gfn * PAGE_SIZE, - old->npages * PAGE_SIZE); - break; - case KVM_MR_MOVE: - rc = gmap_unmap_segment(kvm->arch.gmap, old->base_gfn * PAGE_SIZE, - old->npages * PAGE_SIZE); - if (rc) - break; - fallthrough; - case KVM_MR_CREATE: - rc = gmap_map_segment(kvm->arch.gmap, new->userspace_addr, - new->base_gfn * PAGE_SIZE, - new->npages * PAGE_SIZE); - break; - case KVM_MR_FLAGS_ONLY: - break; - default: - WARN(1, "Unknown KVM MR CHANGE: %d\n", change); + mc = kvm_s390_new_mmu_cache(); + if (!mc) { + rc = -ENOMEM; + goto out; } + + scoped_guard(write_lock, &kvm->mmu_lock) { + switch (change) { + case KVM_MR_DELETE: + rc = dat_delete_slot(mc, kvm->arch.gmap->asce, old->base_gfn, old->npages); + break; + case KVM_MR_MOVE: + rc = dat_delete_slot(mc, kvm->arch.gmap->asce, old->base_gfn, old->npages); + if (rc) + break; + fallthrough; + case KVM_MR_CREATE: + rc = dat_create_slot(mc, kvm->arch.gmap->asce, new->base_gfn, new->npages); + break; + case KVM_MR_FLAGS_ONLY: + break; + default: + WARN(1, "Unknown KVM MR CHANGE: %d\n", change); + } + } +out: if (rc) pr_warn("failed to commit memory region\n"); + kvm_s390_free_mmu_cache(mc); return; } @@ -6011,7 +5678,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, */ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - return false; + scoped_guard(read_lock, &kvm->mmu_lock) + return dat_test_age_gfn(kvm->arch.gmap->asce, range->start, range->end); } /** @@ -6024,7 +5692,8 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) */ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - return false; + scoped_guard(read_lock, &kvm->mmu_lock) + return gmap_age_gfn(kvm->arch.gmap, range->start, range->end); } /** @@ -6041,7 +5710,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) */ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - return false; + return gmap_unmap_gfn_range(kvm->arch.gmap, range->slot, range->start, range->end); } static inline unsigned long nonhyp_mask(int i) diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index c44c52266e26..bf1d7798c1af 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -19,6 +19,8 @@ #include #include #include +#include "dat.h" +#include "gmap.h" #define KVM_S390_UCONTROL_MEMSLOT (KVM_USER_MEM_SLOTS + 0) @@ -114,9 +116,7 @@ static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) static inline int kvm_is_ucontrol(struct kvm *kvm) { #ifdef CONFIG_KVM_S390_UCONTROL - if (kvm->arch.gmap) - return 0; - return 1; + return test_bit(GMAP_FLAG_IS_UCONTROL, &kvm->arch.gmap->flags); #else return 0; #endif @@ -440,14 +440,9 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu); /* implemented in vsie.c */ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu); void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu); -void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, - unsigned long end); +void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, gpa_t start, gpa_t end); void kvm_s390_vsie_init(struct kvm *kvm); void kvm_s390_vsie_destroy(struct kvm *kvm); -int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level); - -/* implemented in 
gmap-vsie.c */ -struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level); /* implemented in sigp.c */ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); @@ -469,15 +464,9 @@ void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu); void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm); __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu); int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc); -int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags); int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot, unsigned long bits); -static inline int kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gpa_t gaddr, unsigned int flags) -{ - return __kvm_s390_handle_dat_fault(vcpu, gpa_to_gfn(gaddr), gaddr, flags); -} - bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu); /* implemented in diag.c */ diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 0b14d894f38a..a3250ad83a8e 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -21,13 +21,14 @@ #include #include #include -#include #include #include #include +#include #include "gaccess.h" #include "kvm-s390.h" #include "trace.h" +#include "gmap.h" static int handle_ri(struct kvm_vcpu *vcpu) { @@ -222,7 +223,7 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu) if (vcpu->arch.skey_enabled) return 0; - rc = s390_enable_skey(); + rc = gmap_enable_skeys(vcpu->arch.gmap); VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc); if (rc) return rc; @@ -255,10 +256,9 @@ static int try_handle_skey(struct kvm_vcpu *vcpu) static int handle_iske(struct kvm_vcpu *vcpu) { - unsigned long gaddr, vmaddr; - unsigned char key; + unsigned long gaddr; int reg1, reg2; - bool unlocked; + union skey key; int rc; vcpu->stat.instruction_iske++; @@ -275,37 +275,21 @@ static int handle_iske(struct kvm_vcpu *vcpu) gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; gaddr = kvm_s390_logical_to_effective(vcpu, gaddr); gaddr = kvm_s390_real_to_abs(vcpu, gaddr); - vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr)); - if (kvm_is_error_hva(vmaddr)) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); -retry: - unlocked = false; - mmap_read_lock(current->mm); - rc = get_guest_storage_key(current->mm, vmaddr, &key); - - if (rc) { - rc = fixup_user_fault(current->mm, vmaddr, - FAULT_FLAG_WRITE, &unlocked); - if (!rc) { - mmap_read_unlock(current->mm); - goto retry; - } - } - mmap_read_unlock(current->mm); - if (rc == -EFAULT) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + rc = dat_get_storage_key(vcpu->arch.gmap->asce, gpa_to_gfn(gaddr), &key); + if (rc > 0) + return kvm_s390_inject_program_int(vcpu, rc); if (rc < 0) return rc; vcpu->run->s.regs.gprs[reg1] &= ~0xff; - vcpu->run->s.regs.gprs[reg1] |= key; + vcpu->run->s.regs.gprs[reg1] |= key.skey; return 0; } static int handle_rrbe(struct kvm_vcpu *vcpu) { - unsigned long vmaddr, gaddr; + unsigned long gaddr; int reg1, reg2; - bool unlocked; int rc; vcpu->stat.instruction_rrbe++; @@ -322,24 +306,10 @@ static int handle_rrbe(struct kvm_vcpu *vcpu) gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; gaddr = kvm_s390_logical_to_effective(vcpu, gaddr); gaddr = kvm_s390_real_to_abs(vcpu, gaddr); - vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr)); - if (kvm_is_error_hva(vmaddr)) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); -retry: - unlocked = false; - mmap_read_lock(current->mm); - rc = 
reset_guest_reference_bit(current->mm, vmaddr); - if (rc < 0) { - rc = fixup_user_fault(current->mm, vmaddr, - FAULT_FLAG_WRITE, &unlocked); - if (!rc) { - mmap_read_unlock(current->mm); - goto retry; - } - } - mmap_read_unlock(current->mm); - if (rc == -EFAULT) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + rc = dat_reset_reference_bit(vcpu->arch.gmap->asce, gpa_to_gfn(gaddr)); + if (rc > 0) + return kvm_s390_inject_program_int(vcpu, rc); if (rc < 0) return rc; kvm_s390_set_psw_cc(vcpu, rc); @@ -354,9 +324,8 @@ static int handle_sske(struct kvm_vcpu *vcpu) { unsigned char m3 = vcpu->arch.sie_block->ipb >> 28; unsigned long start, end; - unsigned char key, oldkey; + union skey key, oldkey; int reg1, reg2; - bool unlocked; int rc; vcpu->stat.instruction_sske++; @@ -377,7 +346,7 @@ static int handle_sske(struct kvm_vcpu *vcpu) kvm_s390_get_regs_rre(vcpu, ®1, ®2); - key = vcpu->run->s.regs.gprs[reg1] & 0xfe; + key.skey = vcpu->run->s.regs.gprs[reg1] & 0xfe; start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; start = kvm_s390_logical_to_effective(vcpu, start); if (m3 & SSKE_MB) { @@ -389,27 +358,17 @@ static int handle_sske(struct kvm_vcpu *vcpu) } while (start != end) { - unsigned long vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start)); - unlocked = false; - - if (kvm_is_error_hva(vmaddr)) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - - mmap_read_lock(current->mm); - rc = cond_set_guest_storage_key(current->mm, vmaddr, key, &oldkey, - m3 & SSKE_NQ, m3 & SSKE_MR, - m3 & SSKE_MC); - - if (rc < 0) { - rc = fixup_user_fault(current->mm, vmaddr, - FAULT_FLAG_WRITE, &unlocked); - rc = !rc ? -EAGAIN : rc; + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) { + rc = dat_cond_set_storage_key(vcpu->arch.mc, vcpu->arch.gmap->asce, + gpa_to_gfn(start), key, &oldkey, + m3 & SSKE_NQ, m3 & SSKE_MR, m3 & SSKE_MC); } - mmap_read_unlock(current->mm); - if (rc == -EFAULT) + if (rc > 1) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - if (rc == -EAGAIN) + if (rc == -ENOMEM) { + kvm_s390_mmu_cache_topup(vcpu->arch.mc); continue; + } if (rc < 0) return rc; start += PAGE_SIZE; @@ -422,7 +381,7 @@ static int handle_sske(struct kvm_vcpu *vcpu) } else { kvm_s390_set_psw_cc(vcpu, rc); vcpu->run->s.regs.gprs[reg1] &= ~0xff00UL; - vcpu->run->s.regs.gprs[reg1] |= (u64) oldkey << 8; + vcpu->run->s.regs.gprs[reg1] |= (u64)oldkey.skey << 8; } } if (m3 & SSKE_MB) { @@ -1082,7 +1041,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) bool mr = false, mc = false, nq; int reg1, reg2; unsigned long start, end; - unsigned char key; + union skey key; vcpu->stat.instruction_pfmf++; @@ -1110,7 +1069,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) } nq = vcpu->run->s.regs.gprs[reg1] & PFMF_NQ; - key = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY; + key.skey = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY; start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; start = kvm_s390_logical_to_effective(vcpu, start); @@ -1141,14 +1100,6 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) } while (start != end) { - unsigned long vmaddr; - bool unlocked = false; - - /* Translate guest address to host address */ - vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start)); - if (kvm_is_error_hva(vmaddr)) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) { if (kvm_clear_guest(vcpu->kvm, start, PAGE_SIZE)) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); @@ -1159,19 +1110,17 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) if 
(rc) return rc; - mmap_read_lock(current->mm); - rc = cond_set_guest_storage_key(current->mm, vmaddr, - key, NULL, nq, mr, mc); - if (rc < 0) { - rc = fixup_user_fault(current->mm, vmaddr, - FAULT_FLAG_WRITE, &unlocked); - rc = !rc ? -EAGAIN : rc; + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) { + rc = dat_cond_set_storage_key(vcpu->arch.mc, vcpu->arch.gmap->asce, + gpa_to_gfn(start), key, + NULL, nq, mr, mc); } - mmap_read_unlock(current->mm); - if (rc == -EFAULT) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - if (rc == -EAGAIN) + if (rc > 1) + return kvm_s390_inject_program_int(vcpu, rc); + if (rc == -ENOMEM) { + kvm_s390_mmu_cache_topup(vcpu->arch.mc); continue; + } if (rc < 0) return rc; } @@ -1195,8 +1144,10 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc) { int r1, r2, nappended, entries; - unsigned long gfn, hva, res, pgstev, ptev; + union essa_state state; unsigned long *cbrlo; + unsigned long gfn; + bool dirtied; /* * We don't need to set SD.FPF.SK to 1 here, because if we have a @@ -1205,33 +1156,12 @@ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc) kvm_s390_get_regs_rre(vcpu, &r1, &r2); gfn = vcpu->run->s.regs.gprs[r2] >> PAGE_SHIFT; - hva = gfn_to_hva(vcpu->kvm, gfn); entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; - if (kvm_is_error_hva(hva)) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - - nappended = pgste_perform_essa(vcpu->kvm->mm, hva, orc, &ptev, &pgstev); - if (nappended < 0) { - res = orc ? 0x10 : 0; - vcpu->run->s.regs.gprs[r1] = res; /* Exception Indication */ + nappended = dat_perform_essa(vcpu->arch.gmap->asce, gfn, orc, &state, &dirtied); + vcpu->run->s.regs.gprs[r1] = state.val; + if (nappended < 0) return 0; - } - res = (pgstev & _PGSTE_GPS_USAGE_MASK) >> 22; - /* - * Set the block-content state part of the result. 0 means resident, so - * nothing to do if the page is valid. 2 is for preserved pages - * (non-present and non-zero), and 3 for zero pages (non-present and - * zero). - */ - if (ptev & _PAGE_INVALID) { - res |= 2; - if (pgstev & _PGSTE_GPS_ZERO) - res |= 1; - } - if (pgstev & _PGSTE_GPS_NODAT) - res |= 0x20; - vcpu->run->s.regs.gprs[r1] = res; /* * It is possible that all the normal 511 slots were full, in which case * we will now write in the 512th slot, which is reserved for host use. 
@@ -1243,17 +1173,34 @@ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc) cbrlo[entries] = gfn << PAGE_SHIFT; } - if (orc) { - struct kvm_memory_slot *ms = gfn_to_memslot(vcpu->kvm, gfn); - - /* Increment only if we are really flipping the bit */ - if (ms && !test_and_set_bit(gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms))) - atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages); - } + if (dirtied) + atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages); return nappended; } +static void _essa_clear_cbrl(struct kvm_vcpu *vcpu, unsigned long *cbrl, int len) +{ + union crste *crstep; + union pgste pgste; + union pte *ptep; + int i; + + lockdep_assert_held(&vcpu->kvm->mmu_lock); + + for (i = 0; i < len; i++) { + if (dat_entry_walk(NULL, gpa_to_gfn(cbrl[i]), vcpu->arch.gmap->asce, + 0, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep)) + continue; + if (!ptep || ptep->s.pr) + continue; + pgste = pgste_get_lock(ptep); + if (pgste.usage == PGSTE_GPS_USAGE_UNUSED || pgste.zero) + gmap_helper_zap_one_page(vcpu->kvm->mm, cbrl[i]); + pgste_set_unlock(ptep, pgste); + } +} + static int handle_essa(struct kvm_vcpu *vcpu) { lockdep_assert_held(&vcpu->kvm->srcu); @@ -1261,11 +1208,9 @@ static int handle_essa(struct kvm_vcpu *vcpu) /* entries expected to be 1FF */ int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; unsigned long *cbrlo; - struct gmap *gmap; int i, orc; VCPU_EVENT(vcpu, 4, "ESSA: release %d pages", entries); - gmap = vcpu->arch.gmap; vcpu->stat.instruction_essa++; if (!vcpu->kvm->arch.use_cmma) return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); @@ -1289,11 +1234,7 @@ static int handle_essa(struct kvm_vcpu *vcpu) * value really needs to be written to; if the value is * already correct, we do nothing and avoid the lock. */ - if (vcpu->kvm->mm->context.uses_cmm == 0) { - mmap_write_lock(vcpu->kvm->mm); - vcpu->kvm->mm->context.uses_cmm = 1; - mmap_write_unlock(vcpu->kvm->mm); - } + set_bit(GMAP_FLAG_USES_CMM, &vcpu->arch.gmap->flags); /* * If we are here, we are supposed to have CMMA enabled in * the SIE block. 
Enabling CMMA works on a per-CPU basis, @@ -1307,20 +1248,22 @@ static int handle_essa(struct kvm_vcpu *vcpu) /* Retry the ESSA instruction */ kvm_s390_retry_instr(vcpu); } else { - mmap_read_lock(vcpu->kvm->mm); - i = __do_essa(vcpu, orc); - mmap_read_unlock(vcpu->kvm->mm); + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + i = __do_essa(vcpu, orc); if (i < 0) return i; /* Account for the possible extra cbrl entry */ entries += i; } - vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */ + /* reset nceo */ + vcpu->arch.sie_block->cbrlo &= PAGE_MASK; cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo); - mmap_read_lock(gmap->mm); - for (i = 0; i < entries; ++i) - __gmap_zap(gmap, cbrlo[i]); - mmap_read_unlock(gmap->mm); + + mmap_read_lock(vcpu->kvm->mm); + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + _essa_clear_cbrl(vcpu, cbrlo, entries); + mmap_read_unlock(vcpu->kvm->mm); + return 0; } diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index 6ba5a0305e25..a48a8afd40df 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -12,13 +12,16 @@ #include #include #include -#include #include #include #include #include #include #include "kvm-s390.h" +#include "dat.h" +#include "gaccess.h" +#include "gmap.h" +#include "faultin.h" bool kvm_s390_pv_is_protected(struct kvm *kvm) { @@ -34,6 +37,85 @@ bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected); +/** + * should_export_before_import() - Determine whether an export is needed + * before an import-like operation. + * @uvcb: The Ultravisor control block of the UVC to be performed. + * @mm: The mm of the process. + * + * Returns whether an export is needed before every import-like operation. + * This is needed for shared pages, which don't trigger a secure storage + * exception when accessed from a different guest. + * + * Although considered as one, the Unpin Page UVC is not an actual import, + * so it is not affected. + * + * No export is needed also when there is only one protected VM, because the + * page cannot belong to the wrong VM in that case (there is no "other VM" + * it can belong to). + * + * Return: %true if an export is needed before every import, otherwise %false. + */ +static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) +{ + /* + * The misc feature indicates, among other things, that importing a + * shared page from a different protected VM will automatically also + * transfer its ownership. 
+ */ + if (uv_has_feature(BIT_UV_FEAT_MISC)) + return false; + if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) + return false; + return atomic_read(&mm->context.protected_count) > 1; +} + +struct pv_make_secure { + void *uvcb; + struct folio *folio; + int rc; + bool needs_export; +}; + +static int __kvm_s390_pv_make_secure(struct guest_fault *f, struct folio *folio) +{ + struct pv_make_secure *priv = f->priv; + int rc; + + if (priv->needs_export) + uv_convert_from_secure(folio_to_phys(folio)); + + if (folio_test_hugetlb(folio)) + return -EFAULT; + if (folio_test_large(folio)) + return -E2BIG; + + if (!f->page) + folio_get(folio); + rc = __make_folio_secure(folio, priv->uvcb); + if (!f->page) + folio_put(folio); + + return rc; +} + +static void _kvm_s390_pv_make_secure(struct guest_fault *f) +{ + struct pv_make_secure *priv = f->priv; + struct folio *folio; + + folio = pfn_folio(f->pfn); + priv->rc = -EAGAIN; + if (folio_trylock(folio)) { + priv->rc = __kvm_s390_pv_make_secure(f, folio); + if (priv->rc == -E2BIG || priv->rc == -EBUSY) { + priv->folio = folio; + folio_get(folio); + } + folio_unlock(folio); + } +} + /** * kvm_s390_pv_make_secure() - make one guest page secure * @kvm: the guest @@ -45,14 +127,34 @@ EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected); */ int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb) { - unsigned long vmaddr; + struct pv_make_secure priv = { .uvcb = uvcb }; + struct guest_fault f = { + .write_attempt = true, + .gfn = gpa_to_gfn(gaddr), + .callback = _kvm_s390_pv_make_secure, + .priv = &priv, + }; + int rc; lockdep_assert_held(&kvm->srcu); - vmaddr = gfn_to_hva(kvm, gpa_to_gfn(gaddr)); - if (kvm_is_error_hva(vmaddr)) - return -EFAULT; - return make_hva_secure(kvm->mm, vmaddr, uvcb); + priv.needs_export = should_export_before_import(uvcb, kvm->mm); + + scoped_guard(mutex, &kvm->arch.pv.import_lock) { + rc = kvm_s390_faultin_gfn(NULL, kvm, &f); + + if (!rc) { + rc = priv.rc; + if (priv.folio) { + rc = s390_wiggle_split_folio(kvm->mm, priv.folio); + if (!rc) + rc = -EAGAIN; + } + } + } + if (priv.folio) + folio_put(priv.folio); + return rc; } int kvm_s390_pv_convert_to_secure(struct kvm *kvm, unsigned long gaddr) @@ -299,35 +401,6 @@ done_fast: return 0; } -/** - * kvm_s390_destroy_lower_2g - Destroy the first 2GB of protected guest memory. - * @kvm: the VM whose memory is to be cleared. - * - * Destroy the first 2GB of guest memory, to avoid prefix issues after reboot. - * The CPUs of the protected VM need to be destroyed beforehand. 
- */ -static void kvm_s390_destroy_lower_2g(struct kvm *kvm) -{ - const unsigned long pages_2g = SZ_2G / PAGE_SIZE; - struct kvm_memory_slot *slot; - unsigned long len; - int srcu_idx; - - srcu_idx = srcu_read_lock(&kvm->srcu); - - /* Take the memslot containing guest absolute address 0 */ - slot = gfn_to_memslot(kvm, 0); - /* Clear all slots or parts thereof that are below 2GB */ - while (slot && slot->base_gfn < pages_2g) { - len = min_t(u64, slot->npages, pages_2g - slot->base_gfn) * PAGE_SIZE; - s390_uv_destroy_range(kvm->mm, slot->userspace_addr, slot->userspace_addr + len); - /* Take the next memslot */ - slot = gfn_to_memslot(kvm, slot->base_gfn + slot->npages); - } - - srcu_read_unlock(&kvm->srcu, srcu_idx); -} - static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc) { struct uv_cb_destroy_fast uvcb = { @@ -342,7 +415,6 @@ static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc) *rc = uvcb.header.rc; if (rrc) *rrc = uvcb.header.rrc; - WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM FAST: rc %x rrc %x", uvcb.header.rc, uvcb.header.rrc); WARN_ONCE(cc && uvcb.header.rc != 0x104, @@ -391,7 +463,7 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) return -EINVAL; /* Guest with segment type ASCE, refuse to destroy asynchronously */ - if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) + if (kvm->arch.gmap->asce.dt == TABLE_TYPE_SEGMENT) return -EINVAL; priv = kzalloc(sizeof(*priv), GFP_KERNEL); @@ -404,8 +476,7 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) priv->stor_var = kvm->arch.pv.stor_var; priv->stor_base = kvm->arch.pv.stor_base; priv->handle = kvm_s390_pv_get_handle(kvm); - priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table; - WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + priv->old_gmap_table = (unsigned long)dereference_asce(kvm->arch.gmap->asce); if (s390_replace_asce(kvm->arch.gmap)) res = -ENOMEM; } @@ -415,7 +486,7 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) return res; } - kvm_s390_destroy_lower_2g(kvm); + gmap_pv_destroy_range(kvm->arch.gmap, 0, gpa_to_gfn(SZ_2G), false); kvm_s390_clear_pv_state(kvm); kvm->arch.pv.set_aside = priv; @@ -449,7 +520,6 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), UVC_CMD_DESTROY_SEC_CONF, rc, rrc); - WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); if (!cc) { atomic_dec(&kvm->mm->context.protected_count); kvm_s390_pv_dealloc_vm(kvm); @@ -532,7 +602,7 @@ int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc) * cleanup has been performed. 
*/ if (need_zap && mmget_not_zero(kvm->mm)) { - s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE); + gmap_pv_destroy_range(kvm->arch.gmap, 0, asce_end(kvm->arch.gmap->asce), false); mmput(kvm->mm); } @@ -570,7 +640,7 @@ int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc) return -EINVAL; /* When a fatal signal is received, stop immediately */ - if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX)) + if (gmap_pv_destroy_range(kvm->arch.gmap, 0, asce_end(kvm->arch.gmap->asce), true)) goto done; if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc)) ret = -EIO; @@ -609,6 +679,7 @@ static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription, r = kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); if (!r && is_destroy_fast_available() && kvm_s390_pv_get_handle(kvm)) kvm_s390_pv_deinit_vm_fast(kvm, &dummy, &dummy); + set_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &kvm->arch.gmap->flags); } static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = { @@ -642,7 +713,7 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) /* Inputs */ uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */ uvcb.guest_stor_len = kvm->arch.pv.guest_len; - uvcb.guest_asce = kvm->arch.gmap->asce; + uvcb.guest_asce = kvm->arch.gmap->asce.val; uvcb.guest_sca = virt_to_phys(kvm->arch.sca); uvcb.conf_base_stor_origin = virt_to_phys((void *)kvm->arch.pv.stor_base); @@ -669,7 +740,6 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) } return -EIO; } - kvm->arch.gmap->guest_handle = uvcb.guest_handle; return 0; } @@ -704,26 +774,14 @@ static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak, .tweak[1] = offset, }; int ret = kvm_s390_pv_make_secure(kvm, addr, &uvcb); - unsigned long vmaddr; - bool unlocked; *rc = uvcb.header.rc; *rrc = uvcb.header.rrc; if (ret == -ENXIO) { - mmap_read_lock(kvm->mm); - vmaddr = gfn_to_hva(kvm, gpa_to_gfn(addr)); - if (kvm_is_error_hva(vmaddr)) { - ret = -EFAULT; - } else { - ret = fixup_user_fault(kvm->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); - if (!ret) - ret = __gmap_link(kvm->arch.gmap, addr, vmaddr); - } - mmap_read_unlock(kvm->mm); + ret = kvm_s390_faultin_gfn_simple(NULL, kvm, gpa_to_gfn(addr), true); if (!ret) return -EAGAIN; - return ret; } if (ret && ret != -EAGAIN) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 1da1d0f460e1..f950ebb32812 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -15,7 +15,6 @@ #include #include -#include #include #include #include @@ -23,6 +22,7 @@ #include #include "kvm-s390.h" #include "gaccess.h" +#include "gmap.h" enum vsie_page_flags { VSIE_PAGE_IN_USE = 0, @@ -41,8 +41,11 @@ struct vsie_page { * are reused conditionally, should be accessed via READ_ONCE. */ struct kvm_s390_sie_block *scb_o; /* 0x0218 */ - /* the shadow gmap in use by the vsie_page */ - struct gmap *gmap; /* 0x0220 */ + /* + * Flags: must be set/cleared atomically after the vsie page can be + * looked up by other CPUs. + */ + unsigned long flags; /* 0x0220 */ /* address of the last reported fault to guest2 */ unsigned long fault_addr; /* 0x0228 */ /* calculated guest addresses of satellite control blocks */ @@ -57,33 +60,14 @@ struct vsie_page { * radix tree. */ gpa_t scb_gpa; /* 0x0258 */ - /* - * Flags: must be set/cleared atomically after the vsie page can be - * looked up by other CPUs. 
- */ - unsigned long flags; /* 0x0260 */ - __u8 reserved[0x0700 - 0x0268]; /* 0x0268 */ + /* the shadow gmap in use by the vsie_page */ + struct gmap_cache gmap_cache; /* 0x0260 */ + __u8 reserved[0x0700 - 0x0278]; /* 0x0278 */ struct kvm_s390_crypto_cb crycb; /* 0x0700 */ __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ }; -/** - * gmap_shadow_valid() - check if a shadow guest address space matches the - * given properties and is still valid - * @sg: pointer to the shadow guest address space structure - * @asce: ASCE for which the shadow table is requested - * @edat_level: edat level to be used for the shadow translation - * - * Returns 1 if the gmap shadow is still valid and matches the given - * properties, the caller can continue using it. Returns 0 otherwise; the - * caller has to request a new shadow gmap in this case. - */ -int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) -{ - if (sg->removed) - return 0; - return sg->orig_asce == asce && sg->edat_level == edat_level; -} +static_assert(sizeof(struct vsie_page) == PAGE_SIZE); /* trigger a validity icpt for the given scb */ static int set_validity_icpt(struct kvm_s390_sie_block *scb, @@ -612,26 +596,17 @@ out: return rc; } -void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, - unsigned long end) +void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, gpa_t start, gpa_t end) { - struct kvm *kvm = gmap->private; - struct vsie_page *cur; + struct vsie_page *cur, *next; unsigned long prefix; - int i; - if (!gmap_is_shadow(gmap)) - return; + KVM_BUG_ON(!test_bit(GMAP_FLAG_SHADOW, &gmap->flags), gmap->kvm); /* * Only new shadow blocks are added to the list during runtime, * therefore we can safely reference them all the time. */ - for (i = 0; i < kvm->arch.vsie.page_count; i++) { - cur = READ_ONCE(kvm->arch.vsie.pages[i]); - if (!cur) - continue; - if (READ_ONCE(cur->gmap) != gmap) - continue; + list_for_each_entry_safe(cur, next, &gmap->scb_users, gmap_cache.list) { prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT; /* with mso/msl, the prefix lies at an offset */ prefix += cur->scb_s.mso; @@ -667,9 +642,9 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct /* with mso/msl, the prefix lies at offset *mso* */ prefix += scb_s->mso; - rc = kvm_s390_shadow_fault(vcpu, sg, prefix, NULL); + rc = gaccess_shadow_fault(vcpu, sg, prefix, NULL, true); if (!rc && (scb_s->ecb & ECB_TE)) - rc = kvm_s390_shadow_fault(vcpu, sg, prefix + PAGE_SIZE, NULL); + rc = gaccess_shadow_fault(vcpu, sg, prefix + PAGE_SIZE, NULL, true); /* * We don't have to mprotect, we will be called for all unshadows. * SIE will detect if protection applies and trigger a validity. 
@@ -952,6 +927,7 @@ static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr, */ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg) { + bool wr = kvm_s390_cur_gmap_fault_is_write(); int rc; if ((current->thread.gmap_int_code & PGM_INT_CODE_MASK) == PGM_PROTECTION) @@ -959,11 +935,10 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, stru return inject_fault(vcpu, PGM_PROTECTION, current->thread.gmap_teid.addr * PAGE_SIZE, 1); - rc = kvm_s390_shadow_fault(vcpu, sg, current->thread.gmap_teid.addr * PAGE_SIZE, NULL); + rc = gaccess_shadow_fault(vcpu, sg, current->thread.gmap_teid.addr * PAGE_SIZE, NULL, wr); if (rc > 0) { rc = inject_fault(vcpu, rc, - current->thread.gmap_teid.addr * PAGE_SIZE, - kvm_s390_cur_gmap_fault_is_write()); + current->thread.gmap_teid.addr * PAGE_SIZE, wr); if (rc >= 0) vsie_page->fault_addr = current->thread.gmap_teid.addr * PAGE_SIZE; } @@ -979,7 +954,7 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, stru static void handle_last_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg) { if (vsie_page->fault_addr) - kvm_s390_shadow_fault(vcpu, sg, vsie_page->fault_addr, NULL); + gaccess_shadow_fault(vcpu, sg, vsie_page->fault_addr, NULL, true); vsie_page->fault_addr = 0; } @@ -1064,8 +1039,9 @@ static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; - unsigned long pei_dest, pei_src, src, dest, mask, prefix; + unsigned long src, dest, mask, prefix; u64 *pei_block = &vsie_page->scb_o->mcic; + union mvpg_pei pei_dest, pei_src; int edat, rc_dest, rc_src; union ctlreg0 cr0; @@ -1079,8 +1055,8 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask; src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso; - rc_dest = kvm_s390_shadow_fault(vcpu, sg, dest, &pei_dest); - rc_src = kvm_s390_shadow_fault(vcpu, sg, src, &pei_src); + rc_dest = gaccess_shadow_fault(vcpu, sg, dest, &pei_dest, true); + rc_src = gaccess_shadow_fault(vcpu, sg, src, &pei_src, false); /* * Either everything went well, or something non-critical went wrong * e.g. because of a race. In either case, simply retry. @@ -1115,8 +1091,8 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, rc_src = rc_src != PGM_PAGE_TRANSLATION ? 
rc_src : 0; } if (!rc_dest && !rc_src) { - pei_block[0] = pei_dest; - pei_block[1] = pei_src; + pei_block[0] = pei_dest.val; + pei_block[1] = pei_src.val; return 1; } @@ -1187,7 +1163,7 @@ xfer_to_guest_mode_check: goto xfer_to_guest_mode_check; } guest_timing_enter_irqoff(); - rc = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, sg->asce); + rc = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, sg->asce.val); guest_timing_exit_irqoff(); local_irq_enable(); } @@ -1237,43 +1213,63 @@ skip_sie: static void release_gmap_shadow(struct vsie_page *vsie_page) { - if (vsie_page->gmap) - gmap_put(vsie_page->gmap); - WRITE_ONCE(vsie_page->gmap, NULL); + struct gmap *gmap = vsie_page->gmap_cache.gmap; + + lockdep_assert_held(&gmap->kvm->arch.gmap->children_lock); + + list_del(&vsie_page->gmap_cache.list); + vsie_page->gmap_cache.gmap = NULL; prefix_unmapped(vsie_page); + + if (list_empty(&gmap->scb_users)) { + gmap_remove_child(gmap); + gmap_put(gmap); + } } -static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, - struct vsie_page *vsie_page) +static struct gmap *acquire_gmap_shadow(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { - unsigned long asce; union ctlreg0 cr0; struct gmap *gmap; + union asce asce; int edat; - asce = vcpu->arch.sie_block->gcr[1]; + asce.val = vcpu->arch.sie_block->gcr[1]; cr0.val = vcpu->arch.sie_block->gcr[0]; edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8); edat += edat && test_kvm_facility(vcpu->kvm, 78); - /* - * ASCE or EDAT could have changed since last icpt, or the gmap - * we're holding has been unshadowed. If the gmap is still valid, - * we can safely reuse it. - */ - if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) { - vcpu->kvm->stat.gmap_shadow_reuse++; - return 0; + scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) { + gmap = vsie_page->gmap_cache.gmap; + if (gmap) { + /* + * ASCE or EDAT could have changed since last icpt, or the gmap + * we're holding has been unshadowed. If the gmap is still valid, + * we can safely reuse it. 
+ */ + if (gmap_is_shadow_valid(gmap, asce, edat)) { + vcpu->kvm->stat.gmap_shadow_reuse++; + gmap_get(gmap); + return gmap; + } + /* release the old shadow and mark the prefix as unmapped */ + release_gmap_shadow(vsie_page); + } } - - /* release the old shadow - if any, and mark the prefix as unmapped */ - release_gmap_shadow(vsie_page); - gmap = gmap_shadow(vcpu->arch.gmap, asce, edat); + gmap = gmap_create_shadow(vcpu->arch.mc, vcpu->kvm->arch.gmap, asce, edat); if (IS_ERR(gmap)) - return PTR_ERR(gmap); - vcpu->kvm->stat.gmap_shadow_create++; - WRITE_ONCE(vsie_page->gmap, gmap); - return 0; + return gmap; + scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) { + /* unlikely race condition, remove the previous shadow */ + if (vsie_page->gmap_cache.gmap) + release_gmap_shadow(vsie_page); + vcpu->kvm->stat.gmap_shadow_create++; + list_add(&vsie_page->gmap_cache.list, &gmap->scb_users); + vsie_page->gmap_cache.gmap = gmap; + prefix_unmapped(vsie_page); + gmap_get(gmap); + } + return gmap; } /* @@ -1330,8 +1326,11 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) int rc = 0; while (1) { - rc = acquire_gmap_shadow(vcpu, vsie_page); - sg = vsie_page->gmap; + sg = acquire_gmap_shadow(vcpu, vsie_page); + if (IS_ERR(sg)) { + rc = PTR_ERR(sg); + sg = NULL; + } if (!rc) rc = map_prefix(vcpu, vsie_page, sg); if (!rc) { @@ -1359,6 +1358,9 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) kvm_s390_rewind_psw(vcpu, 4); break; } + if (sg) + sg = gmap_put(sg); + cond_resched(); } if (rc == -EFAULT) { @@ -1455,8 +1457,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) vsie_page->scb_gpa = ULONG_MAX; /* Double use of the same address or allocation failure. */ - if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, - vsie_page)) { + if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, vsie_page)) { put_vsie_page(vsie_page); mutex_unlock(&kvm->arch.vsie.mutex); return NULL; @@ -1465,7 +1466,12 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) mutex_unlock(&kvm->arch.vsie.mutex); memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block)); - release_gmap_shadow(vsie_page); + if (vsie_page->gmap_cache.gmap) { + scoped_guard(spinlock, &kvm->arch.gmap->children_lock) + if (vsie_page->gmap_cache.gmap) + release_gmap_shadow(vsie_page); + } + prefix_unmapped(vsie_page); vsie_page->fault_addr = 0; vsie_page->scb_s.ihcpu = 0xffffU; return vsie_page; @@ -1543,8 +1549,10 @@ void kvm_s390_vsie_destroy(struct kvm *kvm) mutex_lock(&kvm->arch.vsie.mutex); for (i = 0; i < kvm->arch.vsie.page_count; i++) { vsie_page = kvm->arch.vsie.pages[i]; + scoped_guard(spinlock, &kvm->arch.gmap->children_lock) + if (vsie_page->gmap_cache.gmap) + release_gmap_shadow(vsie_page); kvm->arch.vsie.pages[i] = NULL; - release_gmap_shadow(vsie_page); /* free the radix tree entry */ if (vsie_page->scb_gpa != ULONG_MAX) radix_tree_delete(&kvm->arch.vsie.addr_to_page, diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index 1a6ba105e071..0ac2f3998b14 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -34,136 +34,19 @@ void debug_user_asce(int exit) } #endif /*CONFIG_DEBUG_ENTRY */ -union oac { - unsigned int val; - struct { - struct { - unsigned short key : 4; - unsigned short : 4; - unsigned short as : 2; - unsigned short : 4; - unsigned short k : 1; - unsigned short a : 1; - } oac1; - struct { - unsigned short key : 4; - unsigned short : 4; - unsigned short as : 2; - unsigned 
short : 4; - unsigned short k : 1; - unsigned short a : 1; - } oac2; - }; -}; - -static uaccess_kmsan_or_inline __must_check unsigned long -raw_copy_from_user_key(void *to, const void __user *from, unsigned long size, unsigned long key) -{ - unsigned long osize; - union oac spec = { - .oac2.key = key, - .oac2.as = PSW_BITS_AS_SECONDARY, - .oac2.k = 1, - .oac2.a = 1, - }; - int cc; - - while (1) { - osize = size; - asm_inline volatile( - " lr %%r0,%[spec]\n" - "0: mvcos %[to],%[from],%[size]\n" - "1: nopr %%r7\n" - CC_IPM(cc) - EX_TABLE_UA_MVCOS_FROM(0b, 0b) - EX_TABLE_UA_MVCOS_FROM(1b, 0b) - : CC_OUT(cc, cc), [size] "+d" (size), [to] "=Q" (*(char *)to) - : [spec] "d" (spec.val), [from] "Q" (*(const char __user *)from) - : CC_CLOBBER_LIST("memory", "0")); - if (CC_TRANSFORM(cc) == 0) - return osize - size; - size -= 4096; - to += 4096; - from += 4096; - } -} - -unsigned long _copy_from_user_key(void *to, const void __user *from, - unsigned long n, unsigned long key) -{ - unsigned long res = n; - - might_fault(); - if (!should_fail_usercopy()) { - instrument_copy_from_user_before(to, from, n); - res = raw_copy_from_user_key(to, from, n, key); - instrument_copy_from_user_after(to, from, n, res); - } - if (unlikely(res)) - memset(to + (n - res), 0, res); - return res; -} -EXPORT_SYMBOL(_copy_from_user_key); - -static uaccess_kmsan_or_inline __must_check unsigned long -raw_copy_to_user_key(void __user *to, const void *from, unsigned long size, unsigned long key) -{ - unsigned long osize; - union oac spec = { - .oac1.key = key, - .oac1.as = PSW_BITS_AS_SECONDARY, - .oac1.k = 1, - .oac1.a = 1, - }; - int cc; - - while (1) { - osize = size; - asm_inline volatile( - " lr %%r0,%[spec]\n" - "0: mvcos %[to],%[from],%[size]\n" - "1: nopr %%r7\n" - CC_IPM(cc) - EX_TABLE_UA_MVCOS_TO(0b, 0b) - EX_TABLE_UA_MVCOS_TO(1b, 0b) - : CC_OUT(cc, cc), [size] "+d" (size), [to] "=Q" (*(char __user *)to) - : [spec] "d" (spec.val), [from] "Q" (*(const char *)from) - : CC_CLOBBER_LIST("memory", "0")); - if (CC_TRANSFORM(cc) == 0) - return osize - size; - size -= 4096; - to += 4096; - from += 4096; - } -} - -unsigned long _copy_to_user_key(void __user *to, const void *from, - unsigned long n, unsigned long key) -{ - might_fault(); - if (should_fail_usercopy()) - return n; - instrument_copy_to_user(to, from, n); - return raw_copy_to_user_key(to, from, n, key); -} -EXPORT_SYMBOL(_copy_to_user_key); - #define CMPXCHG_USER_KEY_MAX_LOOPS 128 -static nokprobe_inline int __cmpxchg_user_key_small(unsigned long address, unsigned int *uval, - unsigned int old, unsigned int new, - unsigned int mask, unsigned long key) +static nokprobe_inline int __cmpxchg_key_small(void *address, unsigned int *uval, + unsigned int old, unsigned int new, + unsigned int mask, unsigned long key) { unsigned long count; unsigned int prev; - bool sacf_flag; int rc = 0; skey_regions_initialize(); - sacf_flag = enable_sacf_uaccess(); asm_inline volatile( "20: spka 0(%[key])\n" - " sacf 256\n" " llill %[count],%[max_loops]\n" "0: l %[prev],%[address]\n" "1: nr %[prev],%[mask]\n" @@ -178,8 +61,7 @@ static nokprobe_inline int __cmpxchg_user_key_small(unsigned long address, unsig " nr %[tmp],%[mask]\n" " jnz 5f\n" " brct %[count],2b\n" - "5: sacf 768\n" - " spka %[default_key]\n" + "5: spka %[default_key]\n" "21:\n" EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev]) EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev]) @@ -197,16 +79,16 @@ static nokprobe_inline int __cmpxchg_user_key_small(unsigned long address, unsig [default_key] "J" (PAGE_DEFAULT_KEY), [max_loops] "J" 
(CMPXCHG_USER_KEY_MAX_LOOPS) : "memory", "cc"); - disable_sacf_uaccess(sacf_flag); *uval = prev; if (!count) rc = -EAGAIN; return rc; } -int __kprobes __cmpxchg_user_key1(unsigned long address, unsigned char *uval, - unsigned char old, unsigned char new, unsigned long key) +int __kprobes __cmpxchg_key1(void *addr, unsigned char *uval, unsigned char old, + unsigned char new, unsigned long key) { + unsigned long address = (unsigned long)addr; unsigned int prev, shift, mask, _old, _new; int rc; @@ -215,15 +97,16 @@ int __kprobes __cmpxchg_user_key1(unsigned long address, unsigned char *uval, _old = (unsigned int)old << shift; _new = (unsigned int)new << shift; mask = ~(0xff << shift); - rc = __cmpxchg_user_key_small(address, &prev, _old, _new, mask, key); + rc = __cmpxchg_key_small((void *)address, &prev, _old, _new, mask, key); *uval = prev >> shift; return rc; } -EXPORT_SYMBOL(__cmpxchg_user_key1); +EXPORT_SYMBOL(__cmpxchg_key1); -int __kprobes __cmpxchg_user_key2(unsigned long address, unsigned short *uval, - unsigned short old, unsigned short new, unsigned long key) +int __kprobes __cmpxchg_key2(void *addr, unsigned short *uval, unsigned short old, + unsigned short new, unsigned long key) { + unsigned long address = (unsigned long)addr; unsigned int prev, shift, mask, _old, _new; int rc; @@ -232,27 +115,23 @@ int __kprobes __cmpxchg_user_key2(unsigned long address, unsigned short *uval, _old = (unsigned int)old << shift; _new = (unsigned int)new << shift; mask = ~(0xffff << shift); - rc = __cmpxchg_user_key_small(address, &prev, _old, _new, mask, key); + rc = __cmpxchg_key_small((void *)address, &prev, _old, _new, mask, key); *uval = prev >> shift; return rc; } -EXPORT_SYMBOL(__cmpxchg_user_key2); +EXPORT_SYMBOL(__cmpxchg_key2); -int __kprobes __cmpxchg_user_key4(unsigned long address, unsigned int *uval, - unsigned int old, unsigned int new, unsigned long key) +int __kprobes __cmpxchg_key4(void *address, unsigned int *uval, unsigned int old, + unsigned int new, unsigned long key) { unsigned int prev = old; - bool sacf_flag; int rc = 0; skey_regions_initialize(); - sacf_flag = enable_sacf_uaccess(); asm_inline volatile( "20: spka 0(%[key])\n" - " sacf 256\n" "0: cs %[prev],%[new],%[address]\n" - "1: sacf 768\n" - " spka %[default_key]\n" + "1: spka %[default_key]\n" "21:\n" EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev]) EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev]) @@ -264,27 +143,22 @@ int __kprobes __cmpxchg_user_key4(unsigned long address, unsigned int *uval, [key] "a" (key << 4), [default_key] "J" (PAGE_DEFAULT_KEY) : "memory", "cc"); - disable_sacf_uaccess(sacf_flag); *uval = prev; return rc; } -EXPORT_SYMBOL(__cmpxchg_user_key4); +EXPORT_SYMBOL(__cmpxchg_key4); -int __kprobes __cmpxchg_user_key8(unsigned long address, unsigned long *uval, - unsigned long old, unsigned long new, unsigned long key) +int __kprobes __cmpxchg_key8(void *address, unsigned long *uval, unsigned long old, + unsigned long new, unsigned long key) { unsigned long prev = old; - bool sacf_flag; int rc = 0; skey_regions_initialize(); - sacf_flag = enable_sacf_uaccess(); asm_inline volatile( "20: spka 0(%[key])\n" - " sacf 256\n" "0: csg %[prev],%[new],%[address]\n" - "1: sacf 768\n" - " spka %[default_key]\n" + "1: spka %[default_key]\n" "21:\n" EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev]) EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev]) @@ -296,27 +170,22 @@ int __kprobes __cmpxchg_user_key8(unsigned long address, unsigned long *uval, [key] "a" (key << 4), [default_key] "J" (PAGE_DEFAULT_KEY) : "memory", "cc"); - 
disable_sacf_uaccess(sacf_flag); *uval = prev; return rc; } -EXPORT_SYMBOL(__cmpxchg_user_key8); +EXPORT_SYMBOL(__cmpxchg_key8); -int __kprobes __cmpxchg_user_key16(unsigned long address, __uint128_t *uval, - __uint128_t old, __uint128_t new, unsigned long key) +int __kprobes __cmpxchg_key16(void *address, __uint128_t *uval, __uint128_t old, + __uint128_t new, unsigned long key) { __uint128_t prev = old; - bool sacf_flag; int rc = 0; skey_regions_initialize(); - sacf_flag = enable_sacf_uaccess(); asm_inline volatile( "20: spka 0(%[key])\n" - " sacf 256\n" "0: cdsg %[prev],%[new],%[address]\n" - "1: sacf 768\n" - " spka %[default_key]\n" + "1: spka %[default_key]\n" "21:\n" EX_TABLE_UA_LOAD_REGPAIR(0b, 1b, %[rc], %[prev]) EX_TABLE_UA_LOAD_REGPAIR(1b, 1b, %[rc], %[prev]) @@ -328,8 +197,7 @@ int __kprobes __cmpxchg_user_key16(unsigned long address, __uint128_t *uval, [key] "a" (key << 4), [default_key] "J" (PAGE_DEFAULT_KEY) : "memory", "cc"); - disable_sacf_uaccess(sacf_flag); *uval = prev; return rc; } -EXPORT_SYMBOL(__cmpxchg_user_key16); +EXPORT_SYMBOL(__cmpxchg_key16); diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c index 4864cb35fc25..d653c64b869a 100644 --- a/arch/s390/mm/gmap_helpers.c +++ b/arch/s390/mm/gmap_helpers.c @@ -34,28 +34,6 @@ static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) free_swap_and_cache(entry); } -static inline pgste_t pgste_get_lock(pte_t *ptep) -{ - unsigned long value = 0; -#ifdef CONFIG_PGSTE - unsigned long *ptr = (unsigned long *)(ptep + PTRS_PER_PTE); - - do { - value = __atomic64_or_barrier(PGSTE_PCL_BIT, ptr); - } while (value & PGSTE_PCL_BIT); - value |= PGSTE_PCL_BIT; -#endif - return __pgste(value); -} - -static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) -{ -#ifdef CONFIG_PGSTE - barrier(); - WRITE_ONCE(*(unsigned long *)(ptep + PTRS_PER_PTE), pgste_val(pgste) & ~PGSTE_PCL_BIT); -#endif -} - /** * gmap_helper_zap_one_page() - discard a page if it was swapped. * @mm: the mm @@ -68,9 +46,7 @@ static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) { struct vm_area_struct *vma; - unsigned long pgstev; spinlock_t *ptl; - pgste_t pgste; pte_t *ptep; mmap_assert_locked(mm); @@ -85,18 +61,8 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) if (unlikely(!ptep)) return; if (pte_swap(*ptep)) { - preempt_disable(); - pgste = pgste_get_lock(ptep); - pgstev = pgste_val(pgste); - - if ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED || - (pgstev & _PGSTE_GPS_ZERO)) { - ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep)); - pte_clear(mm, vmaddr, ptep); - } - - pgste_set_unlock(ptep, pgste); - preempt_enable(); + ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep)); + pte_clear(mm, vmaddr, ptep); } pte_unmap_unlock(ptep, ptl); } From cec38587a9e395576fb54ee9e077b037a50d9335 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:54 +0100 Subject: [PATCH 213/267] KVM: s390: Remove gmap from s390/mm Remove the now unused include/asm/gmap.h and mm/gmap.c files. 
Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- MAINTAINERS | 2 - arch/s390/include/asm/gmap.h | 174 --- arch/s390/include/asm/pgtable.h | 8 - arch/s390/mm/Makefile | 1 - arch/s390/mm/gmap.c | 2436 ------------------------------- arch/s390/mm/pgtable.c | 8 - 6 files changed, 2629 deletions(-) delete mode 100644 arch/s390/include/asm/gmap.h delete mode 100644 arch/s390/mm/gmap.c diff --git a/MAINTAINERS b/MAINTAINERS index dc731d37c8fe..95448b485fd2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13899,14 +13899,12 @@ L: kvm@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git F: Documentation/virt/kvm/s390* -F: arch/s390/include/asm/gmap.h F: arch/s390/include/asm/gmap_helpers.h F: arch/s390/include/asm/kvm* F: arch/s390/include/uapi/asm/kvm* F: arch/s390/include/uapi/asm/uvdevice.h F: arch/s390/kernel/uv.c F: arch/s390/kvm/ -F: arch/s390/mm/gmap.c F: arch/s390/mm/gmap_helpers.c F: drivers/s390/char/uvdevice.c F: tools/testing/selftests/drivers/s390x/uvdevice/ diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h deleted file mode 100644 index 66c5808fd011..000000000000 --- a/arch/s390/include/asm/gmap.h +++ /dev/null @@ -1,174 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * KVM guest address space mapping code - * - * Copyright IBM Corp. 2007, 2016 - * Author(s): Martin Schwidefsky - */ - -#ifndef _ASM_S390_GMAP_H -#define _ASM_S390_GMAP_H - -#include -#include - -/* Generic bits for GMAP notification on DAT table entry changes. */ -#define GMAP_NOTIFY_SHADOW 0x2 -#define GMAP_NOTIFY_MPROT 0x1 - -/* Status bits only for huge segment entries */ -#define _SEGMENT_ENTRY_GMAP_IN 0x0800 /* invalidation notify bit */ -#define _SEGMENT_ENTRY_GMAP_UC 0x0002 /* dirty (migration) */ - -/** - * struct gmap_struct - guest address space - * @list: list head for the mm->context gmap list - * @mm: pointer to the parent mm_struct - * @guest_to_host: radix tree with guest to host address translation - * @host_to_guest: radix tree with pointer to segment table entries - * @guest_table_lock: spinlock to protect all entries in the guest page table - * @ref_count: reference counter for the gmap structure - * @table: pointer to the page directory - * @asce: address space control element for gmap page table - * @pfault_enabled: defines if pfaults are applicable for the guest - * @guest_handle: protected virtual machine handle for the ultravisor - * @host_to_rmap: radix tree with gmap_rmap lists - * @children: list of shadow gmap structures - * @shadow_lock: spinlock to protect the shadow gmap list - * @parent: pointer to the parent gmap for shadow guest address spaces - * @orig_asce: ASCE for which the shadow page table has been created - * @edat_level: edat level to be used for the shadow translation - * @removed: flag to indicate if a shadow guest address space has been removed - * @initialized: flag to indicate if a shadow guest address space can be used - */ -struct gmap { - struct list_head list; - struct mm_struct *mm; - struct radix_tree_root guest_to_host; - struct radix_tree_root host_to_guest; - spinlock_t guest_table_lock; - refcount_t ref_count; - unsigned long *table; - unsigned long asce; - unsigned long asce_end; - void *private; - bool pfault_enabled; - /* only set for protected virtual machines */ - unsigned long guest_handle; - /* Additional data for shadow guest address spaces */ - struct radix_tree_root host_to_rmap; - struct list_head children; - spinlock_t shadow_lock; - struct gmap *parent; - unsigned 
long orig_asce; - int edat_level; - bool removed; - bool initialized; -}; - -/** - * struct gmap_rmap - reverse mapping for shadow page table entries - * @next: pointer to next rmap in the list - * @raddr: virtual rmap address in the shadow guest address space - */ -struct gmap_rmap { - struct gmap_rmap *next; - unsigned long raddr; -}; - -#define gmap_for_each_rmap(pos, head) \ - for (pos = (head); pos; pos = pos->next) - -#define gmap_for_each_rmap_safe(pos, n, head) \ - for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n) - -/** - * struct gmap_notifier - notify function block for page invalidation - * @notifier_call: address of callback function - */ -struct gmap_notifier { - struct list_head list; - struct rcu_head rcu; - void (*notifier_call)(struct gmap *gmap, unsigned long start, - unsigned long end); -}; - -static inline int gmap_is_shadow(struct gmap *gmap) -{ - return !!gmap->parent; -} - -struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit); -void gmap_remove(struct gmap *gmap); -struct gmap *gmap_get(struct gmap *gmap); -void gmap_put(struct gmap *gmap); -void gmap_free(struct gmap *gmap); -struct gmap *gmap_alloc(unsigned long limit); - -int gmap_map_segment(struct gmap *gmap, unsigned long from, - unsigned long to, unsigned long len); -int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len); -unsigned long __gmap_translate(struct gmap *, unsigned long gaddr); -int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr); -void __gmap_zap(struct gmap *, unsigned long gaddr); -void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr); - -int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val); - -void gmap_unshadow(struct gmap *sg); -int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, - int fake); -int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, - int fake); -int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, - int fake); -int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, - int fake); -int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte); - -void gmap_register_pte_notifier(struct gmap_notifier *); -void gmap_unregister_pte_notifier(struct gmap_notifier *); - -int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits); - -void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4], - unsigned long gaddr, unsigned long vmaddr); -int s390_replace_asce(struct gmap *gmap); -void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns); -int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, - unsigned long end, bool interruptible); -unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level); - -/** - * s390_uv_destroy_range - Destroy a range of pages in the given mm. - * @mm: the mm on which to operate on - * @start: the start of the range - * @end: the end of the range - * - * This function will call cond_sched, so it should not generate stalls, but - * it will otherwise only return when it completed. - */ -static inline void s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ - (void)__s390_uv_destroy_range(mm, start, end, false); -} - -/** - * s390_uv_destroy_range_interruptible - Destroy a range of pages in the - * given mm, but stop when a fatal signal is received. 
- * @mm: the mm on which to operate on - * @start: the start of the range - * @end: the end of the range - * - * This function will call cond_sched, so it should not generate stalls. If - * a fatal signal is received, it will return with -EINTR immediately, - * without finishing destroying the whole range. Upon successful - * completion, 0 is returned. - */ -static inline int s390_uv_destroy_range_interruptible(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ - return __s390_uv_destroy_range(mm, start, end, true); -} -#endif /* _ASM_S390_GMAP_H */ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index cd4d135c4503..45f13697cf9e 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1369,8 +1369,6 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t entry); void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void ptep_notify(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, unsigned long bits); int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr, pte_t *ptep, int prot, unsigned long bit); void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, @@ -1396,9 +1394,6 @@ int set_pgste_bits(struct mm_struct *mm, unsigned long addr, int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep); int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, unsigned long *oldpte, unsigned long *oldpgste); -void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr); -void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr); -void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr); #define pgprot_writecombine pgprot_writecombine pgprot_t pgprot_writecombine(pgprot_t prot); @@ -2023,9 +2018,6 @@ extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t p extern int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot); extern void vmem_unmap_4k_page(unsigned long addr); extern pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc); -extern int s390_enable_sie(void); -extern int s390_enable_skey(void); -extern void s390_reset_cmma(struct mm_struct *mm); /* s390 has a private copy of get unmapped area to deal with cache synonyms */ #define HAVE_ARCH_UNMAPPED_AREA diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index bd0401cc7ca5..193899c39ca7 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -10,7 +10,6 @@ obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PTDUMP) += dump_pagetables.o -obj-$(CONFIG_PGSTE) += gmap.o obj-$(CONFIG_PFAULT) += pfault.o obj-$(subst m,y,$(CONFIG_KVM)) += gmap_helpers.o diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c deleted file mode 100644 index dd85bcca817d..000000000000 --- a/arch/s390/mm/gmap.c +++ /dev/null @@ -1,2436 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * KVM guest address space mapping code - * - * Copyright IBM Corp. 
2007, 2020 - * Author(s): Martin Schwidefsky - * David Hildenbrand - * Janosch Frank - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * The address is saved in a radix tree directly; NULL would be ambiguous, - * since 0 is a valid address, and NULL is returned when nothing was found. - * The lower bits are ignored by all users of the macro, so it can be used - * to distinguish a valid address 0 from a NULL. - */ -#define VALID_GADDR_FLAG 1 -#define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG) -#define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG) - -#define GMAP_SHADOW_FAKE_TABLE 1ULL - -static struct page *gmap_alloc_crst(void) -{ - struct page *page; - - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); - if (!page) - return NULL; - __arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER); - return page; -} - -/** - * gmap_alloc - allocate and initialize a guest address space - * @limit: maximum address of the gmap address space - * - * Returns a guest address space structure. - */ -struct gmap *gmap_alloc(unsigned long limit) -{ - struct gmap *gmap; - struct page *page; - unsigned long *table; - unsigned long etype, atype; - - if (limit < _REGION3_SIZE) { - limit = _REGION3_SIZE - 1; - atype = _ASCE_TYPE_SEGMENT; - etype = _SEGMENT_ENTRY_EMPTY; - } else if (limit < _REGION2_SIZE) { - limit = _REGION2_SIZE - 1; - atype = _ASCE_TYPE_REGION3; - etype = _REGION3_ENTRY_EMPTY; - } else if (limit < _REGION1_SIZE) { - limit = _REGION1_SIZE - 1; - atype = _ASCE_TYPE_REGION2; - etype = _REGION2_ENTRY_EMPTY; - } else { - limit = -1UL; - atype = _ASCE_TYPE_REGION1; - etype = _REGION1_ENTRY_EMPTY; - } - gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT); - if (!gmap) - goto out; - INIT_LIST_HEAD(&gmap->children); - INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT); - INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT); - INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT); - spin_lock_init(&gmap->guest_table_lock); - spin_lock_init(&gmap->shadow_lock); - refcount_set(&gmap->ref_count, 1); - page = gmap_alloc_crst(); - if (!page) - goto out_free; - table = page_to_virt(page); - crst_table_init(table, etype); - gmap->table = table; - gmap->asce = atype | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | __pa(table); - gmap->asce_end = limit; - return gmap; - -out_free: - kfree(gmap); -out: - return NULL; -} -EXPORT_SYMBOL_GPL(gmap_alloc); - -/** - * gmap_create - create a guest address space - * @mm: pointer to the parent mm_struct - * @limit: maximum size of the gmap address space - * - * Returns a guest address space structure. 
- */ -struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit) -{ - struct gmap *gmap; - unsigned long gmap_asce; - - gmap = gmap_alloc(limit); - if (!gmap) - return NULL; - gmap->mm = mm; - spin_lock(&mm->context.lock); - list_add_rcu(&gmap->list, &mm->context.gmap_list); - if (list_is_singular(&mm->context.gmap_list)) - gmap_asce = gmap->asce; - else - gmap_asce = -1UL; - WRITE_ONCE(mm->context.gmap_asce, gmap_asce); - spin_unlock(&mm->context.lock); - return gmap; -} -EXPORT_SYMBOL_GPL(gmap_create); - -static void gmap_flush_tlb(struct gmap *gmap) -{ - __tlb_flush_idte(gmap->asce); -} - -static void gmap_radix_tree_free(struct radix_tree_root *root) -{ - struct radix_tree_iter iter; - unsigned long indices[16]; - unsigned long index; - void __rcu **slot; - int i, nr; - - /* A radix tree is freed by deleting all of its entries */ - index = 0; - do { - nr = 0; - radix_tree_for_each_slot(slot, root, &iter, index) { - indices[nr] = iter.index; - if (++nr == 16) - break; - } - for (i = 0; i < nr; i++) { - index = indices[i]; - radix_tree_delete(root, index); - } - } while (nr > 0); -} - -static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) -{ - struct gmap_rmap *rmap, *rnext, *head; - struct radix_tree_iter iter; - unsigned long indices[16]; - unsigned long index; - void __rcu **slot; - int i, nr; - - /* A radix tree is freed by deleting all of its entries */ - index = 0; - do { - nr = 0; - radix_tree_for_each_slot(slot, root, &iter, index) { - indices[nr] = iter.index; - if (++nr == 16) - break; - } - for (i = 0; i < nr; i++) { - index = indices[i]; - head = radix_tree_delete(root, index); - gmap_for_each_rmap_safe(rmap, rnext, head) - kfree(rmap); - } - } while (nr > 0); -} - -static void gmap_free_crst(unsigned long *table, bool free_ptes) -{ - bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0; - int i; - - if (is_segment) { - if (!free_ptes) - goto out; - for (i = 0; i < _CRST_ENTRIES; i++) - if (!(table[i] & _SEGMENT_ENTRY_INVALID)) - page_table_free_pgste(page_ptdesc(phys_to_page(table[i]))); - } else { - for (i = 0; i < _CRST_ENTRIES; i++) - if (!(table[i] & _REGION_ENTRY_INVALID)) - gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes); - } - -out: - free_pages((unsigned long)table, CRST_ALLOC_ORDER); -} - -/** - * gmap_free - free a guest address space - * @gmap: pointer to the guest address space structure - * - * No locks required. There are no references to this gmap anymore. - */ -void gmap_free(struct gmap *gmap) -{ - /* Flush tlb of all gmaps (if not already done for shadows) */ - if (!(gmap_is_shadow(gmap) && gmap->removed)) - gmap_flush_tlb(gmap); - /* Free all segment & region tables. 
*/ - gmap_free_crst(gmap->table, gmap_is_shadow(gmap)); - - gmap_radix_tree_free(&gmap->guest_to_host); - gmap_radix_tree_free(&gmap->host_to_guest); - - /* Free additional data for a shadow gmap */ - if (gmap_is_shadow(gmap)) { - gmap_rmap_radix_tree_free(&gmap->host_to_rmap); - /* Release reference to the parent */ - gmap_put(gmap->parent); - } - - kfree(gmap); -} -EXPORT_SYMBOL_GPL(gmap_free); - -/** - * gmap_get - increase reference counter for guest address space - * @gmap: pointer to the guest address space structure - * - * Returns the gmap pointer - */ -struct gmap *gmap_get(struct gmap *gmap) -{ - refcount_inc(&gmap->ref_count); - return gmap; -} -EXPORT_SYMBOL_GPL(gmap_get); - -/** - * gmap_put - decrease reference counter for guest address space - * @gmap: pointer to the guest address space structure - * - * If the reference counter reaches zero the guest address space is freed. - */ -void gmap_put(struct gmap *gmap) -{ - if (refcount_dec_and_test(&gmap->ref_count)) - gmap_free(gmap); -} -EXPORT_SYMBOL_GPL(gmap_put); - -/** - * gmap_remove - remove a guest address space but do not free it yet - * @gmap: pointer to the guest address space structure - */ -void gmap_remove(struct gmap *gmap) -{ - struct gmap *sg, *next; - unsigned long gmap_asce; - - /* Remove all shadow gmaps linked to this gmap */ - if (!list_empty(&gmap->children)) { - spin_lock(&gmap->shadow_lock); - list_for_each_entry_safe(sg, next, &gmap->children, list) { - list_del(&sg->list); - gmap_put(sg); - } - spin_unlock(&gmap->shadow_lock); - } - /* Remove gmap from the per-mm list */ - spin_lock(&gmap->mm->context.lock); - list_del_rcu(&gmap->list); - if (list_empty(&gmap->mm->context.gmap_list)) - gmap_asce = 0; - else if (list_is_singular(&gmap->mm->context.gmap_list)) - gmap_asce = list_first_entry(&gmap->mm->context.gmap_list, - struct gmap, list)->asce; - else - gmap_asce = -1UL; - WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce); - spin_unlock(&gmap->mm->context.lock); - synchronize_rcu(); - /* Put reference */ - gmap_put(gmap); -} -EXPORT_SYMBOL_GPL(gmap_remove); - -/* - * gmap_alloc_table is assumed to be called with mmap_lock held - */ -static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, - unsigned long init, unsigned long gaddr) -{ - struct page *page; - unsigned long *new; - - /* since we don't free the gmap table until gmap_free we can unlock */ - page = gmap_alloc_crst(); - if (!page) - return -ENOMEM; - new = page_to_virt(page); - crst_table_init(new, init); - spin_lock(&gmap->guest_table_lock); - if (*table & _REGION_ENTRY_INVALID) { - *table = __pa(new) | _REGION_ENTRY_LENGTH | - (*table & _REGION_ENTRY_TYPE_MASK); - page = NULL; - } - spin_unlock(&gmap->guest_table_lock); - if (page) - __free_pages(page, CRST_ALLOC_ORDER); - return 0; -} - -static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr) -{ - return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); -} - -static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr) -{ - return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); -} - -static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr, - unsigned long *gaddr) -{ - *gaddr = host_to_guest_delete(gmap, vmaddr); - if (IS_GADDR_VALID(*gaddr)) - return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1); - return NULL; -} - -/** - * __gmap_unlink_by_vmaddr - unlink a single segment via a host address - * @gmap: pointer to the guest address space structure - *
@vmaddr: address in the host process address space - * - * Returns 1 if a TLB flush is required - */ -static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) -{ - unsigned long gaddr; - int flush = 0; - pmd_t *pmdp; - - BUG_ON(gmap_is_shadow(gmap)); - spin_lock(&gmap->guest_table_lock); - - pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); - if (pmdp) { - flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY); - *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); - } - - spin_unlock(&gmap->guest_table_lock); - return flush; -} - -/** - * __gmap_unmap_by_gaddr - unmap a single segment via a guest address - * @gmap: pointer to the guest address space structure - * @gaddr: address in the guest address space - * - * Returns 1 if a TLB flush is required - */ -static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr) -{ - unsigned long vmaddr; - - vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host, - gaddr >> PMD_SHIFT); - return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0; -} - -/** - * gmap_unmap_segment - unmap segment from the guest address space - * @gmap: pointer to the guest address space structure - * @to: address in the guest address space - * @len: length of the memory area to unmap - * - * Returns 0 if the unmap succeeded, -EINVAL if not. - */ -int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) -{ - unsigned long off; - int flush; - - BUG_ON(gmap_is_shadow(gmap)); - if ((to | len) & (PMD_SIZE - 1)) - return -EINVAL; - if (len == 0 || to + len < to) - return -EINVAL; - - flush = 0; - mmap_write_lock(gmap->mm); - for (off = 0; off < len; off += PMD_SIZE) - flush |= __gmap_unmap_by_gaddr(gmap, to + off); - mmap_write_unlock(gmap->mm); - if (flush) - gmap_flush_tlb(gmap); - return 0; -} -EXPORT_SYMBOL_GPL(gmap_unmap_segment); - -/** - * gmap_map_segment - map a segment to the guest address space - * @gmap: pointer to the guest address space structure - * @from: source address in the parent address space - * @to: target address in the guest address space - * @len: length of the memory area to map - * - * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not. - */ -int gmap_map_segment(struct gmap *gmap, unsigned long from, - unsigned long to, unsigned long len) -{ - unsigned long off; - int flush; - - BUG_ON(gmap_is_shadow(gmap)); - if ((from | to | len) & (PMD_SIZE - 1)) - return -EINVAL; - if (len == 0 || from + len < from || to + len < to || - from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end) - return -EINVAL; - - flush = 0; - mmap_write_lock(gmap->mm); - for (off = 0; off < len; off += PMD_SIZE) { - /* Remove old translation */ - flush |= __gmap_unmap_by_gaddr(gmap, to + off); - /* Store new translation */ - if (radix_tree_insert(&gmap->guest_to_host, - (to + off) >> PMD_SHIFT, - (void *) from + off)) - break; - } - mmap_write_unlock(gmap->mm); - if (flush) - gmap_flush_tlb(gmap); - if (off >= len) - return 0; - gmap_unmap_segment(gmap, to, len); - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(gmap_map_segment); - -/** - * __gmap_translate - translate a guest address to a user space address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * - * Returns user space address which corresponds to the guest address or - * -EFAULT if no such mapping exists. - * This function does not establish potentially missing page table entries. - * The mmap_lock of the mm that belongs to the address space must be held - * when this function gets called. 
- * - * Note: Can also be called for shadow gmaps. - */ -unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) -{ - unsigned long vmaddr; - - vmaddr = (unsigned long) - radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); - /* Note: guest_to_host is empty for a shadow gmap */ - return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT; -} -EXPORT_SYMBOL_GPL(__gmap_translate); - -/** - * gmap_unlink - disconnect a page table from the gmap shadow tables - * @mm: pointer to the parent mm_struct - * @table: pointer to the host page table - * @vmaddr: vm address associated with the host page table - */ -void gmap_unlink(struct mm_struct *mm, unsigned long *table, - unsigned long vmaddr) -{ - struct gmap *gmap; - int flush; - - rcu_read_lock(); - list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { - flush = __gmap_unlink_by_vmaddr(gmap, vmaddr); - if (flush) - gmap_flush_tlb(gmap); - } - rcu_read_unlock(); -} - -static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new, - unsigned long gaddr); - -/** - * __gmap_link - set up shadow page tables to connect a host to a guest address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * @vmaddr: vm address - * - * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT - * if the vm address is already mapped to a different guest segment. - * The mmap_lock of the mm that belongs to the address space must be held - * when this function gets called. - */ -int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) -{ - struct mm_struct *mm; - unsigned long *table; - spinlock_t *ptl; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - u64 unprot; - int rc; - - BUG_ON(gmap_is_shadow(gmap)); - /* Create higher level tables in the gmap page table */ - table = gmap->table; - if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { - table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; - if ((*table & _REGION_ENTRY_INVALID) && - gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, - gaddr & _REGION1_MASK)) - return -ENOMEM; - table = __va(*table & _REGION_ENTRY_ORIGIN); - } - if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { - table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; - if ((*table & _REGION_ENTRY_INVALID) && - gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, - gaddr & _REGION2_MASK)) - return -ENOMEM; - table = __va(*table & _REGION_ENTRY_ORIGIN); - } - if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { - table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; - if ((*table & _REGION_ENTRY_INVALID) && - gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, - gaddr & _REGION3_MASK)) - return -ENOMEM; - table = __va(*table & _REGION_ENTRY_ORIGIN); - } - table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; - /* Walk the parent mm page table */ - mm = gmap->mm; - pgd = pgd_offset(mm, vmaddr); - VM_BUG_ON(pgd_none(*pgd)); - p4d = p4d_offset(pgd, vmaddr); - VM_BUG_ON(p4d_none(*p4d)); - pud = pud_offset(p4d, vmaddr); - VM_BUG_ON(pud_none(*pud)); - /* large puds cannot yet be handled */ - if (pud_leaf(*pud)) - return -EFAULT; - pmd = pmd_offset(pud, vmaddr); - VM_BUG_ON(pmd_none(*pmd)); - /* Are we allowed to use huge pages? */ - if (pmd_leaf(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) - return -EFAULT; - /* Link gmap segment table entry location to page table. 
*/ - rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); - if (rc) - return rc; - ptl = pmd_lock(mm, pmd); - spin_lock(&gmap->guest_table_lock); - if (*table == _SEGMENT_ENTRY_EMPTY) { - rc = radix_tree_insert(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT, - (void *)MAKE_VALID_GADDR(gaddr)); - if (!rc) { - if (pmd_leaf(*pmd)) { - *table = (pmd_val(*pmd) & - _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) - | _SEGMENT_ENTRY_GMAP_UC - | _SEGMENT_ENTRY; - } else - *table = (pmd_val(*pmd) & - _SEGMENT_ENTRY_HARDWARE_BITS) - | _SEGMENT_ENTRY; - } - } else if (*table & _SEGMENT_ENTRY_PROTECT && - !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) { - unprot = (u64)*table; - unprot &= ~_SEGMENT_ENTRY_PROTECT; - unprot |= _SEGMENT_ENTRY_GMAP_UC; - gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr); - } - spin_unlock(&gmap->guest_table_lock); - spin_unlock(ptl); - radix_tree_preload_end(); - return rc; -} -EXPORT_SYMBOL(__gmap_link); - -/* - * this function is assumed to be called with mmap_lock held - */ -void __gmap_zap(struct gmap *gmap, unsigned long gaddr) -{ - unsigned long vmaddr; - - mmap_assert_locked(gmap->mm); - - /* Find the vm address for the guest address */ - vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, - gaddr >> PMD_SHIFT); - if (vmaddr) { - vmaddr |= gaddr & ~PMD_MASK; - gmap_helper_zap_one_page(gmap->mm, vmaddr); - } -} -EXPORT_SYMBOL_GPL(__gmap_zap); - -static LIST_HEAD(gmap_notifier_list); -static DEFINE_SPINLOCK(gmap_notifier_lock); - -/** - * gmap_register_pte_notifier - register a pte invalidation callback - * @nb: pointer to the gmap notifier block - */ -void gmap_register_pte_notifier(struct gmap_notifier *nb) -{ - spin_lock(&gmap_notifier_lock); - list_add_rcu(&nb->list, &gmap_notifier_list); - spin_unlock(&gmap_notifier_lock); -} -EXPORT_SYMBOL_GPL(gmap_register_pte_notifier); - -/** - * gmap_unregister_pte_notifier - remove a pte invalidation callback - * @nb: pointer to the gmap notifier block - */ -void gmap_unregister_pte_notifier(struct gmap_notifier *nb) -{ - spin_lock(&gmap_notifier_lock); - list_del_rcu(&nb->list); - spin_unlock(&gmap_notifier_lock); - synchronize_rcu(); -} -EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier); - -/** - * gmap_call_notifier - call all registered invalidation callbacks - * @gmap: pointer to guest mapping meta data structure - * @start: start virtual address in the guest address space - * @end: end virtual address in the guest address space - */ -static void gmap_call_notifier(struct gmap *gmap, unsigned long start, - unsigned long end) -{ - struct gmap_notifier *nb; - - list_for_each_entry(nb, &gmap_notifier_list, list) - nb->notifier_call(gmap, start, end); -} - -/** - * gmap_table_walk - walk the gmap page tables - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @level: page table level to stop at - * - * Returns a table entry pointer for the given guest address and @level - * @level=0 : returns a pointer to a page table table entry (or NULL) - * @level=1 : returns a pointer to a segment table entry (or NULL) - * @level=2 : returns a pointer to a region-3 table entry (or NULL) - * @level=3 : returns a pointer to a region-2 table entry (or NULL) - * @level=4 : returns a pointer to a region-1 table entry (or NULL) - * - * Returns NULL if the gmap page tables could not be walked to the - * requested level. - * - * Note: Can also be called for shadow gmaps. 
- */ -unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level) -{ - const int asce_type = gmap->asce & _ASCE_TYPE_MASK; - unsigned long *table = gmap->table; - - if (gmap_is_shadow(gmap) && gmap->removed) - return NULL; - - if (WARN_ON_ONCE(level > (asce_type >> 2) + 1)) - return NULL; - - if (asce_type != _ASCE_TYPE_REGION1 && - gaddr & (-1UL << (31 + (asce_type >> 2) * 11))) - return NULL; - - switch (asce_type) { - case _ASCE_TYPE_REGION1: - table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; - if (level == 4) - break; - if (*table & _REGION_ENTRY_INVALID) - return NULL; - table = __va(*table & _REGION_ENTRY_ORIGIN); - fallthrough; - case _ASCE_TYPE_REGION2: - table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; - if (level == 3) - break; - if (*table & _REGION_ENTRY_INVALID) - return NULL; - table = __va(*table & _REGION_ENTRY_ORIGIN); - fallthrough; - case _ASCE_TYPE_REGION3: - table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; - if (level == 2) - break; - if (*table & _REGION_ENTRY_INVALID) - return NULL; - table = __va(*table & _REGION_ENTRY_ORIGIN); - fallthrough; - case _ASCE_TYPE_SEGMENT: - table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; - if (level == 1) - break; - if (*table & _REGION_ENTRY_INVALID) - return NULL; - table = __va(*table & _SEGMENT_ENTRY_ORIGIN); - table += (gaddr & _PAGE_INDEX) >> PAGE_SHIFT; - } - return table; -} -EXPORT_SYMBOL(gmap_table_walk); - -/** - * gmap_pte_op_walk - walk the gmap page table, get the page table lock - * and return the pte pointer - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @ptl: pointer to the spinlock pointer - * - * Returns a pointer to the locked pte for a guest address, or NULL - */ -static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr, - spinlock_t **ptl) -{ - unsigned long *table; - - BUG_ON(gmap_is_shadow(gmap)); - /* Walk the gmap page table, lock and get pte pointer */ - table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */ - if (!table || *table & _SEGMENT_ENTRY_INVALID) - return NULL; - return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl); -} - -/** - * gmap_pte_op_fixup - force a page in and connect the gmap page table - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @vmaddr: address in the host process address space - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * - * Returns 0 if the caller can retry __gmap_translate (might fail again), - * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing - * up or connecting the gmap page table. - */ -static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, - unsigned long vmaddr, int prot) -{ - struct mm_struct *mm = gmap->mm; - unsigned int fault_flags; - bool unlocked = false; - - BUG_ON(gmap_is_shadow(gmap)); - fault_flags = (prot == PROT_WRITE) ? 
FAULT_FLAG_WRITE : 0; - if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked)) - return -EFAULT; - if (unlocked) - /* lost mmap_lock, caller has to retry __gmap_translate */ - return 0; - /* Connect the page tables */ - return __gmap_link(gmap, gaddr, vmaddr); -} - -/** - * gmap_pte_op_end - release the page table lock - * @ptep: pointer to the locked pte - * @ptl: pointer to the page table spinlock - */ -static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl) -{ - pte_unmap_unlock(ptep, ptl); -} - -/** - * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock - * and return the pmd pointer - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * - * Returns a pointer to the pmd for a guest address, or NULL - */ -static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) -{ - pmd_t *pmdp; - - BUG_ON(gmap_is_shadow(gmap)); - pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1); - if (!pmdp) - return NULL; - - /* without huge pages, there is no need to take the table lock */ - if (!gmap->mm->context.allow_gmap_hpage_1m) - return pmd_none(*pmdp) ? NULL : pmdp; - - spin_lock(&gmap->guest_table_lock); - if (pmd_none(*pmdp)) { - spin_unlock(&gmap->guest_table_lock); - return NULL; - } - - /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */ - if (!pmd_leaf(*pmdp)) - spin_unlock(&gmap->guest_table_lock); - return pmdp; -} - -/** - * gmap_pmd_op_end - release the guest_table_lock if needed - * @gmap: pointer to the guest mapping meta data structure - * @pmdp: pointer to the pmd - */ -static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) -{ - if (pmd_leaf(*pmdp)) - spin_unlock(&gmap->guest_table_lock); -} - -/* - * gmap_protect_pmd - remove access rights to memory and set pmd notification bits - * @pmdp: pointer to the pmd to be protected - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * @bits: notification bits to set - * - * Returns: - * 0 if successfully protected - * -EAGAIN if a fixup is needed - * -EINVAL if unsupported notifier bits have been specified - * - * Expected to be called with sg->mm->mmap_lock in read and - * guest_table_lock held. 
- */ -static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, - pmd_t *pmdp, int prot, unsigned long bits) -{ - int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID; - int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT; - pmd_t new = *pmdp; - - /* Fixup needed */ - if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE))) - return -EAGAIN; - - if (prot == PROT_NONE && !pmd_i) { - new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); - gmap_pmdp_xchg(gmap, pmdp, new, gaddr); - } - - if (prot == PROT_READ && !pmd_p) { - new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); - new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT)); - gmap_pmdp_xchg(gmap, pmdp, new, gaddr); - } - - if (bits & GMAP_NOTIFY_MPROT) - set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); - - /* Shadow GMAP protection needs split PMDs */ - if (bits & GMAP_NOTIFY_SHADOW) - return -EINVAL; - - return 0; -} - -/* - * gmap_protect_pte - remove access rights to memory and set pgste bits - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @pmdp: pointer to the pmd associated with the pte - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * @bits: notification bits to set - * - * Returns 0 if successfully protected, -ENOMEM if out of memory and - * -EAGAIN if a fixup is needed. - * - * Expected to be called with sg->mm->mmap_lock in read - */ -static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, - pmd_t *pmdp, int prot, unsigned long bits) -{ - int rc; - pte_t *ptep; - spinlock_t *ptl; - unsigned long pbits = 0; - - if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) - return -EAGAIN; - - ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl); - if (!ptep) - return -ENOMEM; - - pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0; - pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0; - /* Protect and unlock. */ - rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits); - gmap_pte_op_end(ptep, ptl); - return rc; -} - -/* - * gmap_protect_one - remove access rights to memory and set pgste bits - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * @bits: pgste notification bits to set - * - * Returns: - * PAGE_SIZE if a small page was successfully protected; - * HPAGE_SIZE if a large page was successfully protected; - * -ENOMEM if out of memory; - * -EFAULT if gaddr is invalid (or mapping for shadows is missing); - * -EAGAIN if the guest mapping is missing and should be fixed by the caller. - * - * Context: Called with sg->mm->mmap_lock in read. - */ -int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits) -{ - pmd_t *pmdp; - int rc = 0; - - BUG_ON(gmap_is_shadow(gmap)); - - pmdp = gmap_pmd_op_walk(gmap, gaddr); - if (!pmdp) - return -EAGAIN; - - if (!pmd_leaf(*pmdp)) { - rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits); - if (!rc) - rc = PAGE_SIZE; - } else { - rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits); - if (!rc) - rc = HPAGE_SIZE; - } - gmap_pmd_op_end(gmap, pmdp); - - return rc; -} -EXPORT_SYMBOL_GPL(gmap_protect_one); - -/** - * gmap_read_table - get an unsigned long value from a guest page table using - * absolute addressing, without marking the page referenced.
- * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @val: pointer to the unsigned long value to return - * - * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT - * if reading using the virtual address failed. -EINVAL if called on a gmap - * shadow. - * - * Called with gmap->mm->mmap_lock in read. - */ -int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) -{ - unsigned long address, vmaddr; - spinlock_t *ptl; - pte_t *ptep, pte; - int rc; - - if (gmap_is_shadow(gmap)) - return -EINVAL; - - while (1) { - rc = -EAGAIN; - ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); - if (ptep) { - pte = *ptep; - if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { - address = pte_val(pte) & PAGE_MASK; - address += gaddr & ~PAGE_MASK; - *val = *(unsigned long *)__va(address); - set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG))); - /* Do *NOT* clear the _PAGE_INVALID bit! */ - rc = 0; - } - gmap_pte_op_end(ptep, ptl); - } - if (!rc) - break; - vmaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(vmaddr)) { - rc = vmaddr; - break; - } - rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ); - if (rc) - break; - } - return rc; -} -EXPORT_SYMBOL_GPL(gmap_read_table); - -/** - * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree - * @sg: pointer to the shadow guest address space structure - * @vmaddr: vm address associated with the rmap - * @rmap: pointer to the rmap structure - * - * Called with the sg->guest_table_lock - */ -static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, - struct gmap_rmap *rmap) -{ - struct gmap_rmap *temp; - void __rcu **slot; - - BUG_ON(!gmap_is_shadow(sg)); - slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); - if (slot) { - rmap->next = radix_tree_deref_slot_protected(slot, - &sg->guest_table_lock); - for (temp = rmap->next; temp; temp = temp->next) { - if (temp->raddr == rmap->raddr) { - kfree(rmap); - return; - } - } - radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); - } else { - rmap->next = NULL; - radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, - rmap); - } -} - -/** - * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow gmap - * @paddr: address in the parent guest address space - * @len: length of the memory area to protect - * - * Returns 0 if successfully protected and the rmap was created, -ENOMEM - * if out of memory and -EFAULT if paddr is invalid. 
- */ -static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, - unsigned long paddr, unsigned long len) -{ - struct gmap *parent; - struct gmap_rmap *rmap; - unsigned long vmaddr; - spinlock_t *ptl; - pte_t *ptep; - int rc; - - BUG_ON(!gmap_is_shadow(sg)); - parent = sg->parent; - while (len) { - vmaddr = __gmap_translate(parent, paddr); - if (IS_ERR_VALUE(vmaddr)) - return vmaddr; - rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); - if (!rmap) - return -ENOMEM; - rmap->raddr = raddr; - rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); - if (rc) { - kfree(rmap); - return rc; - } - rc = -EAGAIN; - ptep = gmap_pte_op_walk(parent, paddr, &ptl); - if (ptep) { - spin_lock(&sg->guest_table_lock); - rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ, - PGSTE_VSIE_BIT); - if (!rc) - gmap_insert_rmap(sg, vmaddr, rmap); - spin_unlock(&sg->guest_table_lock); - gmap_pte_op_end(ptep, ptl); - } - radix_tree_preload_end(); - if (rc) { - kfree(rmap); - rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ); - if (rc) - return rc; - continue; - } - paddr += PAGE_SIZE; - len -= PAGE_SIZE; - } - return 0; -} - -#define _SHADOW_RMAP_MASK 0x7 -#define _SHADOW_RMAP_REGION1 0x5 -#define _SHADOW_RMAP_REGION2 0x4 -#define _SHADOW_RMAP_REGION3 0x3 -#define _SHADOW_RMAP_SEGMENT 0x2 -#define _SHADOW_RMAP_PGTABLE 0x1 - -/** - * gmap_idte_one - invalidate a single region or segment table entry - * @asce: region or segment table *origin* + table-type bits - * @vaddr: virtual address to identify the table entry to flush - * - * The invalid bit of a single region or segment table entry is set - * and the associated TLB entries depending on the entry are flushed. - * The table-type of the @asce identifies the portion of the @vaddr - * that is used as the invalidation index. 
- */ -static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr) -{ - asm volatile( - " idte %0,0,%1" - : : "a" (asce), "a" (vaddr) : "cc", "memory"); -} - -/** - * gmap_unshadow_page - remove a page from a shadow page table - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * - * Called with the sg->guest_table_lock - */ -static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr) -{ - unsigned long *table; - - BUG_ON(!gmap_is_shadow(sg)); - table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */ - if (!table || *table & _PAGE_INVALID) - return; - gmap_call_notifier(sg, raddr, raddr + PAGE_SIZE - 1); - ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table); -} - -/** - * __gmap_unshadow_pgt - remove all entries from a shadow page table - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * @pgt: pointer to the start of a shadow page table - * - * Called with the sg->guest_table_lock - */ -static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, - unsigned long *pgt) -{ - int i; - - BUG_ON(!gmap_is_shadow(sg)); - for (i = 0; i < _PAGE_ENTRIES; i++, raddr += PAGE_SIZE) - pgt[i] = _PAGE_INVALID; -} - -/** - * gmap_unshadow_pgt - remove a shadow page table from a segment entry - * @sg: pointer to the shadow guest address space structure - * @raddr: address in the shadow guest address space - * - * Called with the sg->guest_table_lock - */ -static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) -{ - unsigned long *ste; - phys_addr_t sto, pgt; - struct ptdesc *ptdesc; - - BUG_ON(!gmap_is_shadow(sg)); - ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ - if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) - return; - gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1); - sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); - gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); - pgt = *ste & _SEGMENT_ENTRY_ORIGIN; - *ste = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, __va(pgt)); - /* Free page table */ - ptdesc = page_ptdesc(phys_to_page(pgt)); - page_table_free_pgste(ptdesc); -} - -/** - * __gmap_unshadow_sgt - remove all entries from a shadow segment table - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * @sgt: pointer to the start of a shadow segment table - * - * Called with the sg->guest_table_lock - */ -static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, - unsigned long *sgt) -{ - struct ptdesc *ptdesc; - phys_addr_t pgt; - int i; - - BUG_ON(!gmap_is_shadow(sg)); - for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { - if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) - continue; - pgt = sgt[i] & _REGION_ENTRY_ORIGIN; - sgt[i] = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, __va(pgt)); - /* Free page table */ - ptdesc = page_ptdesc(phys_to_page(pgt)); - page_table_free_pgste(ptdesc); - } -} - -/** - * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * - * Called with the shadow->guest_table_lock - */ -static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) -{ - unsigned long r3o, *r3e; - phys_addr_t sgt; - struct page *page; - - BUG_ON(!gmap_is_shadow(sg)); - r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer 
*/ - if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN)) - return; - gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1); - r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT)); - gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr); - sgt = *r3e & _REGION_ENTRY_ORIGIN; - *r3e = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, __va(sgt)); - /* Free segment table */ - page = phys_to_page(sgt); - __free_pages(page, CRST_ALLOC_ORDER); -} - -/** - * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table - * @sg: pointer to the shadow guest address space structure - * @raddr: address in the shadow guest address space - * @r3t: pointer to the start of a shadow region-3 table - * - * Called with the sg->guest_table_lock - */ -static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, - unsigned long *r3t) -{ - struct page *page; - phys_addr_t sgt; - int i; - - BUG_ON(!gmap_is_shadow(sg)); - for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { - if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) - continue; - sgt = r3t[i] & _REGION_ENTRY_ORIGIN; - r3t[i] = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, __va(sgt)); - /* Free segment table */ - page = phys_to_page(sgt); - __free_pages(page, CRST_ALLOC_ORDER); - } -} - -/** - * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * - * Called with the sg->guest_table_lock - */ -static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) -{ - unsigned long r2o, *r2e; - phys_addr_t r3t; - struct page *page; - - BUG_ON(!gmap_is_shadow(sg)); - r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */ - if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN)) - return; - gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1); - r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT)); - gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr); - r3t = *r2e & _REGION_ENTRY_ORIGIN; - *r2e = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, __va(r3t)); - /* Free region 3 table */ - page = phys_to_page(r3t); - __free_pages(page, CRST_ALLOC_ORDER); -} - -/** - * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * @r2t: pointer to the start of a shadow region-2 table - * - * Called with the sg->guest_table_lock - */ -static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, - unsigned long *r2t) -{ - phys_addr_t r3t; - struct page *page; - int i; - - BUG_ON(!gmap_is_shadow(sg)); - for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { - if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) - continue; - r3t = r2t[i] & _REGION_ENTRY_ORIGIN; - r2t[i] = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, __va(r3t)); - /* Free region 3 table */ - page = phys_to_page(r3t); - __free_pages(page, CRST_ALLOC_ORDER); - } -} - -/** - * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * - * Called with the sg->guest_table_lock - */ -static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) -{ - unsigned long r1o, *r1e; - struct page *page; - phys_addr_t r2t; - - BUG_ON(!gmap_is_shadow(sg)); - r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ - if 
(!r1e || !(*r1e & _REGION_ENTRY_ORIGIN)) - return; - gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1); - r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT)); - gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr); - r2t = *r1e & _REGION_ENTRY_ORIGIN; - *r1e = _REGION1_ENTRY_EMPTY; - __gmap_unshadow_r2t(sg, raddr, __va(r2t)); - /* Free region 2 table */ - page = phys_to_page(r2t); - __free_pages(page, CRST_ALLOC_ORDER); -} - -/** - * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * @r1t: pointer to the start of a shadow region-1 table - * - * Called with the shadow->guest_table_lock - */ -static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, - unsigned long *r1t) -{ - unsigned long asce; - struct page *page; - phys_addr_t r2t; - int i; - - BUG_ON(!gmap_is_shadow(sg)); - asce = __pa(r1t) | _ASCE_TYPE_REGION1; - for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) { - if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) - continue; - r2t = r1t[i] & _REGION_ENTRY_ORIGIN; - __gmap_unshadow_r2t(sg, raddr, __va(r2t)); - /* Clear entry and flush translation r1t -> r2t */ - gmap_idte_one(asce, raddr); - r1t[i] = _REGION1_ENTRY_EMPTY; - /* Free region 2 table */ - page = phys_to_page(r2t); - __free_pages(page, CRST_ALLOC_ORDER); - } -} - -/** - * gmap_unshadow - remove a shadow page table completely - * @sg: pointer to the shadow guest address space structure - * - * Called with sg->guest_table_lock - */ -void gmap_unshadow(struct gmap *sg) -{ - unsigned long *table; - - BUG_ON(!gmap_is_shadow(sg)); - if (sg->removed) - return; - sg->removed = 1; - gmap_call_notifier(sg, 0, -1UL); - gmap_flush_tlb(sg); - table = __va(sg->asce & _ASCE_ORIGIN); - switch (sg->asce & _ASCE_TYPE_MASK) { - case _ASCE_TYPE_REGION1: - __gmap_unshadow_r1t(sg, 0, table); - break; - case _ASCE_TYPE_REGION2: - __gmap_unshadow_r2t(sg, 0, table); - break; - case _ASCE_TYPE_REGION3: - __gmap_unshadow_r3t(sg, 0, table); - break; - case _ASCE_TYPE_SEGMENT: - __gmap_unshadow_sgt(sg, 0, table); - break; - } -} -EXPORT_SYMBOL(gmap_unshadow); - -/** - * gmap_shadow_r2t - create an empty shadow region 2 table - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @r2t: parent gmap address of the region 2 table to get shadowed - * @fake: r2t references contiguous guest memory block, not a r2t - * - * The r2t parameter specifies the address of the source table. The - * four pages of the source table are made read-only in the parent gmap - * address space. A write to the source table area @r2t will automatically - * remove the shadow r2 table and all of its descendants. - * - * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the - * shadow table structure is incomplete, -ENOMEM if out of memory and - * -EFAULT if an address in the parent gmap could not be resolved. - * - * Called with sg->mm->mmap_lock in read. 
- */ -int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, - int fake) -{ - unsigned long raddr, origin, offset, len; - unsigned long *table; - phys_addr_t s_r2t; - struct page *page; - int rc; - - BUG_ON(!gmap_is_shadow(sg)); - /* Allocate a shadow region second table */ - page = gmap_alloc_crst(); - if (!page) - return -ENOMEM; - s_r2t = page_to_phys(page); - /* Install shadow region second table */ - spin_lock(&sg->guest_table_lock); - table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ - if (!table) { - rc = -EAGAIN; /* Race with unshadow */ - goto out_free; - } - if (!(*table & _REGION_ENTRY_INVALID)) { - rc = 0; /* Already established */ - goto out_free; - } else if (*table & _REGION_ENTRY_ORIGIN) { - rc = -EAGAIN; /* Race with shadow */ - goto out_free; - } - crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY); - /* mark as invalid as long as the parent table is not protected */ - *table = s_r2t | _REGION_ENTRY_LENGTH | - _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; - if (sg->edat_level >= 1) - *table |= (r2t & _REGION_ENTRY_PROTECT); - if (fake) { - /* nothing to protect for fake tables */ - *table &= ~_REGION_ENTRY_INVALID; - spin_unlock(&sg->guest_table_lock); - return 0; - } - spin_unlock(&sg->guest_table_lock); - /* Make r2t read-only in parent gmap page table */ - raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1; - origin = r2t & _REGION_ENTRY_ORIGIN; - offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; - len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; - rc = gmap_protect_rmap(sg, raddr, origin + offset, len); - spin_lock(&sg->guest_table_lock); - if (!rc) { - table = gmap_table_walk(sg, saddr, 4); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t) - rc = -EAGAIN; /* Race with unshadow */ - else - *table &= ~_REGION_ENTRY_INVALID; - } else { - gmap_unshadow_r2t(sg, raddr); - } - spin_unlock(&sg->guest_table_lock); - return rc; -out_free: - spin_unlock(&sg->guest_table_lock); - __free_pages(page, CRST_ALLOC_ORDER); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_shadow_r2t); - -/** - * gmap_shadow_r3t - create a shadow region 3 table - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @r3t: parent gmap address of the region 3 table to get shadowed - * @fake: r3t references contiguous guest memory block, not a r3t - * - * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the - * shadow table structure is incomplete, -ENOMEM if out of memory and - * -EFAULT if an address in the parent gmap could not be resolved. - * - * Called with sg->mm->mmap_lock in read. 
- */ -int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, - int fake) -{ - unsigned long raddr, origin, offset, len; - unsigned long *table; - phys_addr_t s_r3t; - struct page *page; - int rc; - - BUG_ON(!gmap_is_shadow(sg)); - /* Allocate a shadow region second table */ - page = gmap_alloc_crst(); - if (!page) - return -ENOMEM; - s_r3t = page_to_phys(page); - /* Install shadow region second table */ - spin_lock(&sg->guest_table_lock); - table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ - if (!table) { - rc = -EAGAIN; /* Race with unshadow */ - goto out_free; - } - if (!(*table & _REGION_ENTRY_INVALID)) { - rc = 0; /* Already established */ - goto out_free; - } else if (*table & _REGION_ENTRY_ORIGIN) { - rc = -EAGAIN; /* Race with shadow */ - goto out_free; - } - crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY); - /* mark as invalid as long as the parent table is not protected */ - *table = s_r3t | _REGION_ENTRY_LENGTH | - _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; - if (sg->edat_level >= 1) - *table |= (r3t & _REGION_ENTRY_PROTECT); - if (fake) { - /* nothing to protect for fake tables */ - *table &= ~_REGION_ENTRY_INVALID; - spin_unlock(&sg->guest_table_lock); - return 0; - } - spin_unlock(&sg->guest_table_lock); - /* Make r3t read-only in parent gmap page table */ - raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2; - origin = r3t & _REGION_ENTRY_ORIGIN; - offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; - len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; - rc = gmap_protect_rmap(sg, raddr, origin + offset, len); - spin_lock(&sg->guest_table_lock); - if (!rc) { - table = gmap_table_walk(sg, saddr, 3); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t) - rc = -EAGAIN; /* Race with unshadow */ - else - *table &= ~_REGION_ENTRY_INVALID; - } else { - gmap_unshadow_r3t(sg, raddr); - } - spin_unlock(&sg->guest_table_lock); - return rc; -out_free: - spin_unlock(&sg->guest_table_lock); - __free_pages(page, CRST_ALLOC_ORDER); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_shadow_r3t); - -/** - * gmap_shadow_sgt - create a shadow segment table - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @sgt: parent gmap address of the segment table to get shadowed - * @fake: sgt references contiguous guest memory block, not a sgt - * - * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the - * shadow table structure is incomplete, -ENOMEM if out of memory and - * -EFAULT if an address in the parent gmap could not be resolved. - * - * Called with sg->mm->mmap_lock in read. 
- */ -int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, - int fake) -{ - unsigned long raddr, origin, offset, len; - unsigned long *table; - phys_addr_t s_sgt; - struct page *page; - int rc; - - BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); - /* Allocate a shadow segment table */ - page = gmap_alloc_crst(); - if (!page) - return -ENOMEM; - s_sgt = page_to_phys(page); - /* Install shadow region second table */ - spin_lock(&sg->guest_table_lock); - table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */ - if (!table) { - rc = -EAGAIN; /* Race with unshadow */ - goto out_free; - } - if (!(*table & _REGION_ENTRY_INVALID)) { - rc = 0; /* Already established */ - goto out_free; - } else if (*table & _REGION_ENTRY_ORIGIN) { - rc = -EAGAIN; /* Race with shadow */ - goto out_free; - } - crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY); - /* mark as invalid as long as the parent table is not protected */ - *table = s_sgt | _REGION_ENTRY_LENGTH | - _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; - if (sg->edat_level >= 1) - *table |= sgt & _REGION_ENTRY_PROTECT; - if (fake) { - /* nothing to protect for fake tables */ - *table &= ~_REGION_ENTRY_INVALID; - spin_unlock(&sg->guest_table_lock); - return 0; - } - spin_unlock(&sg->guest_table_lock); - /* Make sgt read-only in parent gmap page table */ - raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3; - origin = sgt & _REGION_ENTRY_ORIGIN; - offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; - len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; - rc = gmap_protect_rmap(sg, raddr, origin + offset, len); - spin_lock(&sg->guest_table_lock); - if (!rc) { - table = gmap_table_walk(sg, saddr, 2); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt) - rc = -EAGAIN; /* Race with unshadow */ - else - *table &= ~_REGION_ENTRY_INVALID; - } else { - gmap_unshadow_sgt(sg, raddr); - } - spin_unlock(&sg->guest_table_lock); - return rc; -out_free: - spin_unlock(&sg->guest_table_lock); - __free_pages(page, CRST_ALLOC_ORDER); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_shadow_sgt); - -static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr) -{ - unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc)); - - pgstes += _PAGE_ENTRIES; - - pgstes[0] &= ~PGSTE_ST2_MASK; - pgstes[1] &= ~PGSTE_ST2_MASK; - pgstes[2] &= ~PGSTE_ST2_MASK; - pgstes[3] &= ~PGSTE_ST2_MASK; - - pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK; - pgstes[1] |= pgt_addr & PGSTE_ST2_MASK; - pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK; - pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK; -} - -/** - * gmap_shadow_pgt - instantiate a shadow page table - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @pgt: parent gmap address of the page table to get shadowed - * @fake: pgt references contiguous guest memory block, not a pgtable - * - * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the - * shadow table structure is incomplete, -ENOMEM if out of memory, - * -EFAULT if an address in the parent gmap could not be resolved and - * - * Called with gmap->mm->mmap_lock in read - */ -int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, - int fake) -{ - unsigned long raddr, origin; - unsigned long *table; - struct ptdesc *ptdesc; - phys_addr_t s_pgt; - int rc; - - BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); - /* Allocate a shadow page table */ - ptdesc = page_table_alloc_pgste(sg->mm); - if 
(!ptdesc) - return -ENOMEM; - origin = pgt & _SEGMENT_ENTRY_ORIGIN; - if (fake) - origin |= GMAP_SHADOW_FAKE_TABLE; - gmap_pgste_set_pgt_addr(ptdesc, origin); - s_pgt = page_to_phys(ptdesc_page(ptdesc)); - /* Install shadow page table */ - spin_lock(&sg->guest_table_lock); - table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ - if (!table) { - rc = -EAGAIN; /* Race with unshadow */ - goto out_free; - } - if (!(*table & _SEGMENT_ENTRY_INVALID)) { - rc = 0; /* Already established */ - goto out_free; - } else if (*table & _SEGMENT_ENTRY_ORIGIN) { - rc = -EAGAIN; /* Race with shadow */ - goto out_free; - } - /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | - (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; - if (fake) { - /* nothing to protect for fake tables */ - *table &= ~_SEGMENT_ENTRY_INVALID; - spin_unlock(&sg->guest_table_lock); - return 0; - } - spin_unlock(&sg->guest_table_lock); - /* Make pgt read-only in parent gmap page table (not the pgste) */ - raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT; - origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK; - rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE); - spin_lock(&sg->guest_table_lock); - if (!rc) { - table = gmap_table_walk(sg, saddr, 1); - if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt) - rc = -EAGAIN; /* Race with unshadow */ - else - *table &= ~_SEGMENT_ENTRY_INVALID; - } else { - gmap_unshadow_pgt(sg, raddr); - } - spin_unlock(&sg->guest_table_lock); - return rc; -out_free: - spin_unlock(&sg->guest_table_lock); - page_table_free_pgste(ptdesc); - return rc; - -} -EXPORT_SYMBOL_GPL(gmap_shadow_pgt); - -/** - * gmap_shadow_page - create a shadow page mapping - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @pte: pte in parent gmap address space to get shadowed - * - * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the - * shadow table structure is incomplete, -ENOMEM if out of memory and - * -EFAULT if an address in the parent gmap could not be resolved. - * - * Called with sg->mm->mmap_lock in read. - */ -int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) -{ - struct gmap *parent; - struct gmap_rmap *rmap; - unsigned long vmaddr, paddr; - spinlock_t *ptl; - pte_t *sptep, *tptep; - int prot; - int rc; - - BUG_ON(!gmap_is_shadow(sg)); - parent = sg->parent; - prot = (pte_val(pte) & _PAGE_PROTECT) ? 
PROT_READ : PROT_WRITE; - - rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); - if (!rmap) - return -ENOMEM; - rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE; - - while (1) { - paddr = pte_val(pte) & PAGE_MASK; - vmaddr = __gmap_translate(parent, paddr); - if (IS_ERR_VALUE(vmaddr)) { - rc = vmaddr; - break; - } - rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); - if (rc) - break; - rc = -EAGAIN; - sptep = gmap_pte_op_walk(parent, paddr, &ptl); - if (sptep) { - spin_lock(&sg->guest_table_lock); - /* Get page table pointer */ - tptep = (pte_t *) gmap_table_walk(sg, saddr, 0); - if (!tptep) { - spin_unlock(&sg->guest_table_lock); - gmap_pte_op_end(sptep, ptl); - radix_tree_preload_end(); - break; - } - rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte); - if (rc > 0) { - /* Success and a new mapping */ - gmap_insert_rmap(sg, vmaddr, rmap); - rmap = NULL; - rc = 0; - } - gmap_pte_op_end(sptep, ptl); - spin_unlock(&sg->guest_table_lock); - } - radix_tree_preload_end(); - if (!rc) - break; - rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); - if (rc) - break; - } - kfree(rmap); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_shadow_page); - -/* - * gmap_shadow_notify - handle notifications for shadow gmap - * - * Called with sg->parent->shadow_lock. - */ -static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, - unsigned long gaddr) -{ - struct gmap_rmap *rmap, *rnext, *head; - unsigned long start, end, bits, raddr; - - BUG_ON(!gmap_is_shadow(sg)); - - spin_lock(&sg->guest_table_lock); - if (sg->removed) { - spin_unlock(&sg->guest_table_lock); - return; - } - /* Check for top level table */ - start = sg->orig_asce & _ASCE_ORIGIN; - end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE; - if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start && - gaddr < end) { - /* The complete shadow table has to go */ - gmap_unshadow(sg); - spin_unlock(&sg->guest_table_lock); - list_del(&sg->list); - gmap_put(sg); - return; - } - /* Remove the page table tree from one specific entry */ - head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); - gmap_for_each_rmap_safe(rmap, rnext, head) { - bits = rmap->raddr & _SHADOW_RMAP_MASK; - raddr = rmap->raddr ^ bits; - switch (bits) { - case _SHADOW_RMAP_REGION1: - gmap_unshadow_r2t(sg, raddr); - break; - case _SHADOW_RMAP_REGION2: - gmap_unshadow_r3t(sg, raddr); - break; - case _SHADOW_RMAP_REGION3: - gmap_unshadow_sgt(sg, raddr); - break; - case _SHADOW_RMAP_SEGMENT: - gmap_unshadow_pgt(sg, raddr); - break; - case _SHADOW_RMAP_PGTABLE: - gmap_unshadow_page(sg, raddr); - break; - } - kfree(rmap); - } - spin_unlock(&sg->guest_table_lock); -} - -/** - * ptep_notify - call all invalidation callbacks for a specific pte. - * @mm: pointer to the process mm_struct - * @vmaddr: virtual address in the process address space - * @pte: pointer to the page table entry - * @bits: bits from the pgste that caused the notify call - * - * This function is assumed to be called with the page table lock held - * for the pte to notify.
- */ -void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, - pte_t *pte, unsigned long bits) -{ - unsigned long offset, gaddr = 0; - struct gmap *gmap, *sg, *next; - - offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); - offset = offset * (PAGE_SIZE / sizeof(pte_t)); - rcu_read_lock(); - list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { - spin_lock(&gmap->guest_table_lock); - gaddr = host_to_guest_lookup(gmap, vmaddr) + offset; - spin_unlock(&gmap->guest_table_lock); - if (!IS_GADDR_VALID(gaddr)) - continue; - - if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { - spin_lock(&gmap->shadow_lock); - list_for_each_entry_safe(sg, next, - &gmap->children, list) - gmap_shadow_notify(sg, vmaddr, gaddr); - spin_unlock(&gmap->shadow_lock); - } - if (bits & PGSTE_IN_BIT) - gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1); - } - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(ptep_notify); - -static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp, - unsigned long gaddr) -{ - set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); - gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1); -} - -/** - * gmap_pmdp_xchg - exchange a gmap pmd with another - * @gmap: pointer to the guest address space structure - * @pmdp: pointer to the pmd entry - * @new: replacement entry - * @gaddr: the affected guest address - * - * This function is assumed to be called with the guest_table_lock - * held. - */ -static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, - unsigned long gaddr) -{ - gaddr &= HPAGE_MASK; - pmdp_notify_gmap(gmap, pmdp, gaddr); - new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN)); - if (machine_has_tlb_guest()) - __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, - IDTE_GLOBAL); - else - __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); - set_pmd(pmdp, new); -} - -static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, - int purge) -{ - pmd_t *pmdp; - struct gmap *gmap; - unsigned long gaddr; - - rcu_read_lock(); - list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { - spin_lock(&gmap->guest_table_lock); - pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); - if (pmdp) { - pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC | - _SEGMENT_ENTRY)); - if (purge) - __pmdp_cspg(pmdp); - set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); - } - spin_unlock(&gmap->guest_table_lock); - } - rcu_read_unlock(); -} - -/** - * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without - * flushing - * @mm: pointer to the process mm_struct - * @vmaddr: virtual address in the process address space - */ -void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr) -{ - gmap_pmdp_clear(mm, vmaddr, 0); -} -EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate); - -/** - * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry - * @mm: pointer to the process mm_struct - * @vmaddr: virtual address in the process address space - */ -void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) -{ - unsigned long gaddr; - struct gmap *gmap; - pmd_t *pmdp; - - rcu_read_lock(); - list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { - spin_lock(&gmap->guest_table_lock); - pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); - if (pmdp) { - pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC | - _SEGMENT_ENTRY)); - if 
(machine_has_tlb_guest()) - __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, - gmap->asce, IDTE_LOCAL); - else - __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL); - *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); - } - spin_unlock(&gmap->guest_table_lock); - } - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local); - -/** - * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry - * @mm: pointer to the process mm_struct - * @vmaddr: virtual address in the process address space - */ -void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) -{ - unsigned long gaddr; - struct gmap *gmap; - pmd_t *pmdp; - - rcu_read_lock(); - list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { - spin_lock(&gmap->guest_table_lock); - pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); - if (pmdp) { - pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC | - _SEGMENT_ENTRY)); - if (machine_has_tlb_guest()) - __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, - gmap->asce, IDTE_GLOBAL); - else - __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); - *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); - } - spin_unlock(&gmap->guest_table_lock); - } - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global); - -/** - * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status - * @gmap: pointer to guest address space - * @pmdp: pointer to the pmd to be tested - * @gaddr: virtual address in the guest address space - * - * This function is assumed to be called with the guest_table_lock - * held. - */ -static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp, - unsigned long gaddr) -{ - if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) - return false; - - /* Already protected memory, which did not change is clean */ - if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT && - !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC)) - return false; - - /* Clear UC indication and reset protection */ - set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC))); - gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0); - return true; -} - -/** - * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment - * @gmap: pointer to guest address space - * @bitmap: dirty bitmap for this pmd - * @gaddr: virtual address in the guest address space - * @vmaddr: virtual address in the host address space - * - * This function is assumed to be called with the guest_table_lock - * held. 
- */ -void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], - unsigned long gaddr, unsigned long vmaddr) -{ - int i; - pmd_t *pmdp; - pte_t *ptep; - spinlock_t *ptl; - - pmdp = gmap_pmd_op_walk(gmap, gaddr); - if (!pmdp) - return; - - if (pmd_leaf(*pmdp)) { - if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr)) - bitmap_fill(bitmap, _PAGE_ENTRIES); - } else { - for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) { - ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl); - if (!ptep) - continue; - if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep)) - set_bit(i, bitmap); - pte_unmap_unlock(ptep, ptl); - } - } - gmap_pmd_op_end(gmap, pmdp); -} -EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd); - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr, - unsigned long end, struct mm_walk *walk) -{ - struct vm_area_struct *vma = walk->vma; - - split_huge_pmd(vma, pmd, addr); - return 0; -} - -static const struct mm_walk_ops thp_split_walk_ops = { - .pmd_entry = thp_split_walk_pmd_entry, - .walk_lock = PGWALK_WRLOCK_VERIFY, -}; - -static inline void thp_split_mm(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, mm, 0); - - for_each_vma(vmi, vma) { - vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE); - walk_page_vma(vma, &thp_split_walk_ops, NULL); - } - mm->def_flags |= VM_NOHUGEPAGE; -} -#else -static inline void thp_split_mm(struct mm_struct *mm) -{ -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -/* - * switch on pgstes for its userspace process (for kvm) - */ -int s390_enable_sie(void) -{ - struct mm_struct *mm = current->mm; - - /* Do we have pgstes? if yes, we are done */ - if (mm_has_pgste(mm)) - return 0; - mmap_write_lock(mm); - mm->context.has_pgste = 1; - /* split thp mappings and disable thp for future mappings */ - thp_split_mm(mm); - mmap_write_unlock(mm); - return 0; -} -EXPORT_SYMBOL_GPL(s390_enable_sie); - -/* - * Enable storage key handling from now on and initialize the storage - * keys with the default key. - */ -static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - /* Clear storage key */ - ptep_zap_key(walk->mm, addr, pte); - return 0; -} - -/* - * Give a chance to schedule after setting a key to 256 pages. - * We only hold the mm lock, which is a rwsem and the kvm srcu. - * Both can sleep. - */ -static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - cond_resched(); - return 0; -} - -static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, - unsigned long hmask, unsigned long next, - struct mm_walk *walk) -{ - pmd_t *pmd = (pmd_t *)pte; - unsigned long start, end; - struct folio *folio = page_folio(pmd_page(*pmd)); - - /* - * The write check makes sure we do not set a key on shared - * memory. This is needed as the walker does not differentiate - * between actual guest memory and the process executable or - * shared libraries. 
- */ - if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID || - !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE)) - return 0; - - start = pmd_val(*pmd) & HPAGE_MASK; - end = start + HPAGE_SIZE; - __storage_key_init_range(start, end); - set_bit(PG_arch_1, &folio->flags.f); - cond_resched(); - return 0; -} - -static const struct mm_walk_ops enable_skey_walk_ops = { - .hugetlb_entry = __s390_enable_skey_hugetlb, - .pte_entry = __s390_enable_skey_pte, - .pmd_entry = __s390_enable_skey_pmd, - .walk_lock = PGWALK_WRLOCK, -}; - -int s390_enable_skey(void) -{ - struct mm_struct *mm = current->mm; - int rc = 0; - - mmap_write_lock(mm); - if (mm_uses_skeys(mm)) - goto out_up; - - mm->context.uses_skeys = 1; - rc = gmap_helper_disable_cow_sharing(); - if (rc) { - mm->context.uses_skeys = 0; - goto out_up; - } - walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL); - -out_up: - mmap_write_unlock(mm); - return rc; -} -EXPORT_SYMBOL_GPL(s390_enable_skey); - -/* - * Reset CMMA state, make all pages stable again. - */ -static int __s390_reset_cmma(pte_t *pte, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - ptep_zap_unused(walk->mm, addr, pte, 1); - return 0; -} - -static const struct mm_walk_ops reset_cmma_walk_ops = { - .pte_entry = __s390_reset_cmma, - .walk_lock = PGWALK_WRLOCK, -}; - -void s390_reset_cmma(struct mm_struct *mm) -{ - mmap_write_lock(mm); - walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL); - mmap_write_unlock(mm); -} -EXPORT_SYMBOL_GPL(s390_reset_cmma); - -#define GATHER_GET_PAGES 32 - -struct reset_walk_state { - unsigned long next; - unsigned long count; - unsigned long pfns[GATHER_GET_PAGES]; -}; - -static int s390_gather_pages(pte_t *ptep, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - struct reset_walk_state *p = walk->private; - pte_t pte = READ_ONCE(*ptep); - - if (pte_present(pte)) { - /* we have a reference from the mapping, take an extra one */ - get_page(phys_to_page(pte_val(pte))); - p->pfns[p->count] = phys_to_pfn(pte_val(pte)); - p->next = next; - p->count++; - } - return p->count >= GATHER_GET_PAGES; -} - -static const struct mm_walk_ops gather_pages_ops = { - .pte_entry = s390_gather_pages, - .walk_lock = PGWALK_RDLOCK, -}; - -/* - * Call the Destroy secure page UVC on each page in the given array of PFNs. - * Each page needs to have an extra reference, which will be released here. - */ -void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) -{ - struct folio *folio; - unsigned long i; - - for (i = 0; i < count; i++) { - folio = pfn_folio(pfns[i]); - /* we always have an extra reference */ - uv_destroy_folio(folio); - /* get rid of the extra reference */ - folio_put(folio); - cond_resched(); - } -} -EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns); - -/** - * __s390_uv_destroy_range - Call the destroy secure page UVC on each page - * in the given range of the given address space. - * @mm: the mm to operate on - * @start: the start of the range - * @end: the end of the range - * @interruptible: if not 0, stop when a fatal signal is received - * - * Walk the given range of the given address space and call the destroy - * secure page UVC on each page. Optionally exit early if a fatal signal is - * pending. 
- * - * Return: 0 on success, -EINTR if the function stopped before completing - */ -int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, - unsigned long end, bool interruptible) -{ - struct reset_walk_state state = { .next = start }; - int r = 1; - - while (r > 0) { - state.count = 0; - mmap_read_lock(mm); - r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state); - mmap_read_unlock(mm); - cond_resched(); - s390_uv_destroy_pfns(state.count, state.pfns); - if (interruptible && fatal_signal_pending(current)) - return -EINTR; - } - return 0; -} -EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); - -/** - * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy - * @gmap: the gmap whose ASCE needs to be replaced - * - * If the ASCE is a SEGMENT type then this function will return -EINVAL, - * otherwise the pointers in the host_to_guest radix tree will keep pointing - * to the wrong pages, causing use-after-free and memory corruption. - * If the allocation of the new top level page table fails, the ASCE is not - * replaced. - * In any case, the old ASCE is always removed from the gmap CRST list. - * Therefore the caller has to make sure to save a pointer to it - * beforehand, unless a leak is actually intended. - */ -int s390_replace_asce(struct gmap *gmap) -{ - unsigned long asce; - struct page *page; - void *table; - - /* Replacing segment type ASCEs would cause serious issues */ - if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) - return -EINVAL; - - page = gmap_alloc_crst(); - if (!page) - return -ENOMEM; - table = page_to_virt(page); - memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); - - /* Set new table origin while preserving existing ASCE control bits */ - asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); - WRITE_ONCE(gmap->asce, asce); - WRITE_ONCE(gmap->mm->context.gmap_asce, asce); - WRITE_ONCE(gmap->table, table); - - return 0; -} -EXPORT_SYMBOL_GPL(s390_replace_asce); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 08743c1dac2f..eced1dc5214f 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -369,8 +369,6 @@ static inline void pmdp_idte_local(struct mm_struct *mm, mm->context.asce, IDTE_LOCAL); else __pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL); - if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) - gmap_pmdp_idte_local(mm, addr); } static inline void pmdp_idte_global(struct mm_struct *mm, @@ -379,12 +377,8 @@ static inline void pmdp_idte_global(struct mm_struct *mm, if (machine_has_tlb_guest()) { __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_GLOBAL); - if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) - gmap_pmdp_idte_global(mm, addr); } else { __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); - if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) - gmap_pmdp_idte_global(mm, addr); } } @@ -419,8 +413,6 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, cpumask_of(smp_processor_id()))) { set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID))); mm->context.flush_mm = 1; - if (mm_has_pgste(mm)) - gmap_pmdp_invalidate(mm, addr); } else { pmdp_idte_global(mm, addr, pmdp); } From 728b0e21b473ad8097185fb85ce2b9ab1ddf4ef7 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:55 +0100 Subject: [PATCH 214/267] KVM: S390: Remove PGSTE code from linux/s390 mm Remove the PGSTE config option. Remove all code from linux/s390 mm that involves PGSTEs. 
Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/Kconfig | 3 - arch/s390/include/asm/hugetlb.h | 6 - arch/s390/include/asm/mmu.h | 13 - arch/s390/include/asm/page.h | 4 - arch/s390/include/asm/pgalloc.h | 4 - arch/s390/include/asm/pgtable.h | 121 +---- arch/s390/kvm/dat.h | 1 + arch/s390/mm/hugetlbpage.c | 24 - arch/s390/mm/pgalloc.c | 24 - arch/s390/mm/pgtable.c | 827 +------------------------------- mm/khugepaged.c | 9 - 11 files changed, 15 insertions(+), 1021 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 8270754985e9..961cbf023c1b 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -32,9 +32,6 @@ config GENERIC_BUG_RELATIVE_POINTERS config GENERIC_LOCKBREAK def_bool y if PREEMPTION -config PGSTE - def_bool n - config AUDIT_ARCH def_bool y diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 69131736daaa..6983e52eaf81 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -37,12 +37,6 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, return __huge_ptep_get_and_clear(mm, addr, ptep); } -static inline void arch_clear_hugetlb_flags(struct folio *folio) -{ - clear_bit(PG_arch_1, &folio->flags.f); -} -#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags - #define __HAVE_ARCH_HUGE_PTE_CLEAR static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index f07e49b419ab..d4fd7bf3692e 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -18,24 +18,11 @@ typedef struct { unsigned long vdso_base; /* The mmu context belongs to a secure guest. */ atomic_t protected_count; - /* - * The following bitfields need a down_write on the mm - * semaphore when they are written to. As they are only - * written once, they can be read without a lock. - */ - /* The mmu context uses extended page tables. */ - unsigned int has_pgste:1; - /* The mmu context uses storage keys. */ - unsigned int uses_skeys:1; - /* The mmu context uses CMM. */ - unsigned int uses_cmm:1; /* * The mmu context allows COW-sharing of memory pages (KSM, zeropage). * Note that COW-sharing during fork() is currently always allowed. */ unsigned int allow_cow_sharing:1; - /* The gmaps associated with this context are allowed to use huge pages. 
*/ - unsigned int allow_gmap_hpage_1m:1; } mm_context_t; #define INIT_MM_CONTEXT(name) \ diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index c1d63b613bf9..6de2f4d25b63 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -78,7 +78,6 @@ static inline void copy_page(void *to, void *from) #ifdef STRICT_MM_TYPECHECKS typedef struct { unsigned long pgprot; } pgprot_t; -typedef struct { unsigned long pgste; } pgste_t; typedef struct { unsigned long pte; } pte_t; typedef struct { unsigned long pmd; } pmd_t; typedef struct { unsigned long pud; } pud_t; @@ -94,7 +93,6 @@ static __always_inline unsigned long name ## _val(name ## _t name) \ #else /* STRICT_MM_TYPECHECKS */ typedef unsigned long pgprot_t; -typedef unsigned long pgste_t; typedef unsigned long pte_t; typedef unsigned long pmd_t; typedef unsigned long pud_t; @@ -110,7 +108,6 @@ static __always_inline unsigned long name ## _val(name ## _t name) \ #endif /* STRICT_MM_TYPECHECKS */ DEFINE_PGVAL_FUNC(pgprot) -DEFINE_PGVAL_FUNC(pgste) DEFINE_PGVAL_FUNC(pte) DEFINE_PGVAL_FUNC(pmd) DEFINE_PGVAL_FUNC(pud) @@ -120,7 +117,6 @@ DEFINE_PGVAL_FUNC(pgd) typedef pte_t *pgtable_t; #define __pgprot(x) ((pgprot_t) { (x) } ) -#define __pgste(x) ((pgste_t) { (x) } ) #define __pte(x) ((pte_t) { (x) } ) #define __pmd(x) ((pmd_t) { (x) } ) #define __pud(x) ((pud_t) { (x) } ) diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index a16e65072371..a5de9e61ea9e 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -27,10 +27,6 @@ unsigned long *page_table_alloc_noprof(struct mm_struct *); #define page_table_alloc(...) alloc_hooks(page_table_alloc_noprof(__VA_ARGS__)) void page_table_free(struct mm_struct *, unsigned long *); -struct ptdesc *page_table_alloc_pgste_noprof(struct mm_struct *mm); -#define page_table_alloc_pgste(...) 
alloc_hooks(page_table_alloc_pgste_noprof(__VA_ARGS__)) -void page_table_free_pgste(struct ptdesc *ptdesc); - static inline void crst_table_init(unsigned long *crst, unsigned long entry) { memset64((u64 *)crst, entry, _CRST_ENTRIES); diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 45f13697cf9e..1c3c3be93be9 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -413,28 +413,6 @@ void setup_protection_map(void); * SW-bits: y young, d dirty, r read, w write */ -/* Page status table bits for virtualization */ -#define PGSTE_ACC_BITS 0xf000000000000000UL -#define PGSTE_FP_BIT 0x0800000000000000UL -#define PGSTE_PCL_BIT 0x0080000000000000UL -#define PGSTE_HR_BIT 0x0040000000000000UL -#define PGSTE_HC_BIT 0x0020000000000000UL -#define PGSTE_GR_BIT 0x0004000000000000UL -#define PGSTE_GC_BIT 0x0002000000000000UL -#define PGSTE_ST2_MASK 0x0000ffff00000000UL -#define PGSTE_UC_BIT 0x0000000000008000UL /* user dirty (migration) */ -#define PGSTE_IN_BIT 0x0000000000004000UL /* IPTE notify bit */ -#define PGSTE_VSIE_BIT 0x0000000000002000UL /* ref'd in a shadow table */ - -/* Guest Page State used for virtualization */ -#define _PGSTE_GPS_ZERO 0x0000000080000000UL -#define _PGSTE_GPS_NODAT 0x0000000040000000UL -#define _PGSTE_GPS_USAGE_MASK 0x0000000003000000UL -#define _PGSTE_GPS_USAGE_STABLE 0x0000000000000000UL -#define _PGSTE_GPS_USAGE_UNUSED 0x0000000001000000UL -#define _PGSTE_GPS_USAGE_POT_VOLATILE 0x0000000002000000UL -#define _PGSTE_GPS_USAGE_VOLATILE _PGSTE_GPS_USAGE_MASK - /* * A user page table pointer has the space-switch-event bit, the * private-space-control bit and the storage-alteration-event-control @@ -566,15 +544,6 @@ static inline bool mm_pmd_folded(struct mm_struct *mm) } #define mm_pmd_folded(mm) mm_pmd_folded(mm) -static inline int mm_has_pgste(struct mm_struct *mm) -{ -#ifdef CONFIG_PGSTE - if (unlikely(mm->context.has_pgste)) - return 1; -#endif - return 0; -} - static inline int mm_is_protected(struct mm_struct *mm) { #if IS_ENABLED(CONFIG_KVM) @@ -584,16 +553,6 @@ static inline int mm_is_protected(struct mm_struct *mm) return 0; } -static inline pgste_t clear_pgste_bit(pgste_t pgste, unsigned long mask) -{ - return __pgste(pgste_val(pgste) & ~mask); -} - -static inline pgste_t set_pgste_bit(pgste_t pgste, unsigned long mask) -{ - return __pgste(pgste_val(pgste) | mask); -} - static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot) { return __pte(pte_val(pte) & ~pgprot_val(prot)); @@ -639,15 +598,6 @@ static inline int mm_forbids_zeropage(struct mm_struct *mm) return 0; } -static inline int mm_uses_skeys(struct mm_struct *mm) -{ -#ifdef CONFIG_PGSTE - if (mm->context.uses_skeys) - return 1; -#endif - return 0; -} - /** * cspg() - Compare and Swap and Purge (CSPG) * @ptr: Pointer to the value to be exchanged @@ -1356,45 +1306,13 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, { if (pte_same(*ptep, entry)) return 0; - if (cpu_has_rdp() && !mm_has_pgste(vma->vm_mm) && pte_allow_rdp(*ptep, entry)) + if (cpu_has_rdp() && pte_allow_rdp(*ptep, entry)) ptep_reset_dat_prot(vma->vm_mm, addr, ptep, entry); else ptep_xchg_direct(vma->vm_mm, addr, ptep, entry); return 1; } -/* - * Additional functions to handle KVM guest page tables - */ -void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t entry); -void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr, - pte_t *ptep, int prot, 
unsigned long bit); -void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, - pte_t *ptep , int reset); -void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, - pte_t *sptep, pte_t *tptep, pte_t pte); -void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep); - -bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long address, - pte_t *ptep); -int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned char key, bool nq); -int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned char key, unsigned char *oldkey, - bool nq, bool mr, bool mc); -int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr); -int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned char *key); - -int set_pgste_bits(struct mm_struct *mm, unsigned long addr, - unsigned long bits, unsigned long value); -int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep); -int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, - unsigned long *oldpte, unsigned long *oldpgste); - #define pgprot_writecombine pgprot_writecombine pgprot_t pgprot_writecombine(pgprot_t prot); @@ -1409,23 +1327,12 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, { if (pte_present(entry)) entry = clear_pte_bit(entry, __pgprot(_PAGE_UNUSED)); - if (mm_has_pgste(mm)) { - for (;;) { - ptep_set_pte_at(mm, addr, ptep, entry); - if (--nr == 0) - break; - ptep++; - entry = __pte(pte_val(entry) + PAGE_SIZE); - addr += PAGE_SIZE; - } - } else { - for (;;) { - set_pte(ptep, entry); - if (--nr == 0) - break; - ptep++; - entry = __pte(pte_val(entry) + PAGE_SIZE); - } + for (;;) { + set_pte(ptep, entry); + if (--nr == 0) + break; + ptep++; + entry = __pte(pte_val(entry) + PAGE_SIZE); } } #define set_ptes set_ptes @@ -2026,18 +1933,4 @@ extern pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc); #define pmd_pgtable(pmd) \ ((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE)) -static inline unsigned long gmap_pgste_get_pgt_addr(unsigned long *pgt) -{ - unsigned long *pgstes, res; - - pgstes = pgt + _PAGE_ENTRIES; - - res = (pgstes[0] & PGSTE_ST2_MASK) << 16; - res |= pgstes[1] & PGSTE_ST2_MASK; - res |= (pgstes[2] & PGSTE_ST2_MASK) >> 16; - res |= (pgstes[3] & PGSTE_ST2_MASK) >> 32; - - return res; -} - #endif /* _S390_PAGE_H */ diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h index 358b756ca8c9..8c7ae07dcc28 100644 --- a/arch/s390/kvm/dat.h +++ b/arch/s390/kvm/dat.h @@ -108,6 +108,7 @@ union pte { #define _PAGE_SD 0x002 /* Needed as macro to perform atomic operations */ +#define PGSTE_PCL_BIT 0x0080000000000000UL /* PCL lock, HW bit */ #define PGSTE_CMMA_D_BIT 0x0000000000008000UL /* CMMA dirty soft-bit */ enum pgste_gps_usage { diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index d42e61c7594e..35a898e15b1c 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -135,29 +135,6 @@ static inline pte_t __rste_to_pte(unsigned long rste) return __pte(pteval); } -static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) -{ - struct folio *folio; - unsigned long size, paddr; - - if (!mm_uses_skeys(mm) || - rste & _SEGMENT_ENTRY_INVALID) - return; - - if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { - folio = page_folio(pud_page(__pud(rste))); - size = PUD_SIZE; - paddr = rste & PUD_MASK; - } else { - folio = 
page_folio(pmd_page(__pmd(rste))); - size = PMD_SIZE; - paddr = rste & PMD_MASK; - } - - if (!test_and_set_bit(PG_arch_1, &folio->flags.f)) - __storage_key_init_range(paddr, paddr + size); -} - void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { @@ -173,7 +150,6 @@ void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, } else if (likely(pte_present(pte))) rste |= _SEGMENT_ENTRY_LARGE; - clear_huge_pte_skeys(mm, rste); set_pte(ptep, __pte(rste)); } diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 7df23528c01b..7ac44543e051 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -114,30 +114,6 @@ err_p4d: return -ENOMEM; } -#ifdef CONFIG_PGSTE - -struct ptdesc *page_table_alloc_pgste_noprof(struct mm_struct *mm) -{ - struct ptdesc *ptdesc; - u64 *table; - - ptdesc = pagetable_alloc_noprof(GFP_KERNEL_ACCOUNT, 0); - if (ptdesc) { - table = (u64 *)ptdesc_address(ptdesc); - __arch_set_page_dat(table, 1); - memset64(table, _PAGE_INVALID, PTRS_PER_PTE); - memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); - } - return ptdesc; -} - -void page_table_free_pgste(struct ptdesc *ptdesc) -{ - pagetable_free(ptdesc); -} - -#endif /* CONFIG_PGSTE */ - unsigned long *page_table_alloc_noprof(struct mm_struct *mm) { gfp_t gfp = GFP_KERNEL_ACCOUNT; diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index eced1dc5214f..4acd8b140c4b 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -115,171 +115,14 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, return old; } -static inline pgste_t pgste_get_lock(pte_t *ptep) -{ - unsigned long value = 0; -#ifdef CONFIG_PGSTE - unsigned long *ptr = (unsigned long *)(ptep + PTRS_PER_PTE); - - do { - value = __atomic64_or_barrier(PGSTE_PCL_BIT, ptr); - } while (value & PGSTE_PCL_BIT); - value |= PGSTE_PCL_BIT; -#endif - return __pgste(value); -} - -static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) -{ -#ifdef CONFIG_PGSTE - barrier(); - WRITE_ONCE(*(unsigned long *)(ptep + PTRS_PER_PTE), pgste_val(pgste) & ~PGSTE_PCL_BIT); -#endif -} - -static inline pgste_t pgste_get(pte_t *ptep) -{ - unsigned long pgste = 0; -#ifdef CONFIG_PGSTE - pgste = *(unsigned long *)(ptep + PTRS_PER_PTE); -#endif - return __pgste(pgste); -} - -static inline void pgste_set(pte_t *ptep, pgste_t pgste) -{ -#ifdef CONFIG_PGSTE - *(pgste_t *)(ptep + PTRS_PER_PTE) = pgste; -#endif -} - -static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste, - struct mm_struct *mm) -{ -#ifdef CONFIG_PGSTE - unsigned long address, bits, skey; - - if (!mm_uses_skeys(mm) || pte_val(pte) & _PAGE_INVALID) - return pgste; - address = pte_val(pte) & PAGE_MASK; - skey = (unsigned long) page_get_storage_key(address); - bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); - /* Transfer page changed & referenced bit to guest bits in pgste */ - pgste = set_pgste_bit(pgste, bits << 48); /* GR bit & GC bit */ - /* Copy page access key and fetch protection bit to pgste */ - pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT); - pgste = set_pgste_bit(pgste, (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56); -#endif - return pgste; - -} - -static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry, - struct mm_struct *mm) -{ -#ifdef CONFIG_PGSTE - unsigned long address; - unsigned long nkey; - - if (!mm_uses_skeys(mm) || pte_val(entry) & _PAGE_INVALID) - return; - VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID)); - address = pte_val(entry) & PAGE_MASK; - /* - * Set page access key and fetch 
protection bit from pgste. - * The guest C/R information is still in the PGSTE, set real - * key C/R to 0. - */ - nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; - nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; - page_set_storage_key(address, nkey, 0); -#endif -} - -static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) -{ -#ifdef CONFIG_PGSTE - if ((pte_val(entry) & _PAGE_PRESENT) && - (pte_val(entry) & _PAGE_WRITE) && - !(pte_val(entry) & _PAGE_INVALID)) { - if (!machine_has_esop()) { - /* - * Without enhanced suppression-on-protection force - * the dirty bit on for all writable ptes. - */ - entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY)); - entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT)); - } - if (!(pte_val(entry) & _PAGE_PROTECT)) - /* This pte allows write access, set user-dirty */ - pgste = set_pgste_bit(pgste, PGSTE_UC_BIT); - } -#endif - set_pte(ptep, entry); - return pgste; -} - -static inline pgste_t pgste_pte_notify(struct mm_struct *mm, - unsigned long addr, - pte_t *ptep, pgste_t pgste) -{ -#ifdef CONFIG_PGSTE - unsigned long bits; - - bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT); - if (bits) { - pgste = __pgste(pgste_val(pgste) ^ bits); - ptep_notify(mm, addr, ptep, bits); - } -#endif - return pgste; -} - -static inline pgste_t ptep_xchg_start(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - pgste_t pgste = __pgste(0); - - if (mm_has_pgste(mm)) { - pgste = pgste_get_lock(ptep); - pgste = pgste_pte_notify(mm, addr, ptep, pgste); - } - return pgste; -} - -static inline pte_t ptep_xchg_commit(struct mm_struct *mm, - unsigned long addr, pte_t *ptep, - pgste_t pgste, pte_t old, pte_t new) -{ - if (mm_has_pgste(mm)) { - if (pte_val(old) & _PAGE_INVALID) - pgste_set_key(ptep, pgste, new, mm); - if (pte_val(new) & _PAGE_INVALID) { - pgste = pgste_update_all(old, pgste, mm); - if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) == - _PGSTE_GPS_USAGE_UNUSED) - old = set_pte_bit(old, __pgprot(_PAGE_UNUSED)); - } - pgste = pgste_set_pte(ptep, pgste, new); - pgste_set_unlock(ptep, pgste); - } else { - set_pte(ptep, new); - } - return old; -} - pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t new) { - pgste_t pgste; pte_t old; - int nodat; preempt_disable(); - pgste = ptep_xchg_start(mm, addr, ptep); - nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); - old = ptep_flush_direct(mm, addr, ptep, nodat); - old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); + old = ptep_flush_direct(mm, addr, ptep, 1); + set_pte(ptep, new); preempt_enable(); return old; } @@ -313,15 +156,11 @@ EXPORT_SYMBOL(ptep_reset_dat_prot); pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t new) { - pgste_t pgste; pte_t old; - int nodat; preempt_disable(); - pgste = ptep_xchg_start(mm, addr, ptep); - nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); - old = ptep_flush_lazy(mm, addr, ptep, nodat); - old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); + old = ptep_flush_lazy(mm, addr, ptep, 1); + set_pte(ptep, new); preempt_enable(); return old; } @@ -330,43 +169,20 @@ EXPORT_SYMBOL(ptep_xchg_lazy); pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { - pgste_t pgste; - pte_t old; - int nodat; - struct mm_struct *mm = vma->vm_mm; - - pgste = ptep_xchg_start(mm, addr, ptep); - nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); - old = ptep_flush_lazy(mm, addr, ptep, nodat); - if (mm_has_pgste(mm)) { - pgste = 
pgste_update_all(old, pgste, mm); - pgste_set(ptep, pgste); - } - return old; + return ptep_flush_lazy(vma->vm_mm, addr, ptep, 1); } void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { - pgste_t pgste; - struct mm_struct *mm = vma->vm_mm; - - if (mm_has_pgste(mm)) { - pgste = pgste_get(ptep); - pgste_set_key(ptep, pgste, pte, mm); - pgste = pgste_set_pte(ptep, pgste, pte); - pgste_set_unlock(ptep, pgste); - } else { - set_pte(ptep, pte); - } + set_pte(ptep, pte); } static inline void pmdp_idte_local(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { if (machine_has_tlb_guest()) - __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, - mm->context.asce, IDTE_LOCAL); + __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_LOCAL); else __pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL); } @@ -420,40 +236,6 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, return old; } -#ifdef CONFIG_PGSTE -static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp) -{ - struct vm_area_struct *vma; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - - /* We need a valid VMA, otherwise this is clearly a fault. */ - vma = vma_lookup(mm, addr); - if (!vma) - return -EFAULT; - - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) - return -ENOENT; - - p4d = p4d_offset(pgd, addr); - if (!p4d_present(*p4d)) - return -ENOENT; - - pud = pud_offset(p4d, addr); - if (!pud_present(*pud)) - return -ENOENT; - - /* Large PUDs are not supported yet. */ - if (pud_leaf(*pud)) - return -EFAULT; - - *pmdp = pmd_offset(pud, addr); - return 0; -} -#endif - pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t new) { @@ -571,598 +353,3 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) return pgtable; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -#ifdef CONFIG_PGSTE -void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t entry) -{ - pgste_t pgste; - - /* the mm_has_pgste() check is done in set_pte_at() */ - preempt_disable(); - pgste = pgste_get_lock(ptep); - pgste = clear_pgste_bit(pgste, _PGSTE_GPS_ZERO); - pgste_set_key(ptep, pgste, entry, mm); - pgste = pgste_set_pte(ptep, pgste, entry); - pgste_set_unlock(ptep, pgste); - preempt_enable(); -} - -void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - pgste_t pgste; - - preempt_disable(); - pgste = pgste_get_lock(ptep); - pgste = set_pgste_bit(pgste, PGSTE_IN_BIT); - pgste_set_unlock(ptep, pgste); - preempt_enable(); -} - -/** - * ptep_force_prot - change access rights of a locked pte - * @mm: pointer to the process mm_struct - * @addr: virtual address in the guest address space - * @ptep: pointer to the page table entry - * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE - * @bit: pgste bit to set (e.g. for notification) - * - * Returns 0 if the access rights were changed and -EAGAIN if the current - * and requested access rights are incompatible. 
- */ -int ptep_force_prot(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, int prot, unsigned long bit) -{ - pte_t entry; - pgste_t pgste; - int pte_i, pte_p, nodat; - - pgste = pgste_get_lock(ptep); - entry = *ptep; - /* Check pte entry after all locks have been acquired */ - pte_i = pte_val(entry) & _PAGE_INVALID; - pte_p = pte_val(entry) & _PAGE_PROTECT; - if ((pte_i && (prot != PROT_NONE)) || - (pte_p && (prot & PROT_WRITE))) { - pgste_set_unlock(ptep, pgste); - return -EAGAIN; - } - /* Change access rights and set pgste bit */ - nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); - if (prot == PROT_NONE && !pte_i) { - ptep_flush_direct(mm, addr, ptep, nodat); - pgste = pgste_update_all(entry, pgste, mm); - entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID)); - } - if (prot == PROT_READ && !pte_p) { - ptep_flush_direct(mm, addr, ptep, nodat); - entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID)); - entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT)); - } - pgste = set_pgste_bit(pgste, bit); - pgste = pgste_set_pte(ptep, pgste, entry); - pgste_set_unlock(ptep, pgste); - return 0; -} - -int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, - pte_t *sptep, pte_t *tptep, pte_t pte) -{ - pgste_t spgste, tpgste; - pte_t spte, tpte; - int rc = -EAGAIN; - - if (!(pte_val(*tptep) & _PAGE_INVALID)) - return 0; /* already shadowed */ - spgste = pgste_get_lock(sptep); - spte = *sptep; - if (!(pte_val(spte) & _PAGE_INVALID) && - !((pte_val(spte) & _PAGE_PROTECT) && - !(pte_val(pte) & _PAGE_PROTECT))) { - spgste = set_pgste_bit(spgste, PGSTE_VSIE_BIT); - tpgste = pgste_get_lock(tptep); - tpte = __pte((pte_val(spte) & PAGE_MASK) | - (pte_val(pte) & _PAGE_PROTECT)); - /* don't touch the storage key - it belongs to parent pgste */ - tpgste = pgste_set_pte(tptep, tpgste, tpte); - pgste_set_unlock(tptep, tpgste); - rc = 1; - } - pgste_set_unlock(sptep, spgste); - return rc; -} - -void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep) -{ - pgste_t pgste; - int nodat; - - pgste = pgste_get_lock(ptep); - /* notifier is called by the caller */ - nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); - ptep_flush_direct(mm, saddr, ptep, nodat); - /* don't touch the storage key - it belongs to parent pgste */ - pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID)); - pgste_set_unlock(ptep, pgste); -} - -static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) -{ - if (softleaf_is_swap(entry)) - dec_mm_counter(mm, MM_SWAPENTS); - else if (softleaf_is_migration(entry)) { - struct folio *folio = softleaf_to_folio(entry); - - dec_mm_counter(mm, mm_counter(folio)); - } - free_swap_and_cache(entry); -} - -void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, int reset) -{ - unsigned long pgstev; - pgste_t pgste; - pte_t pte; - - /* Zap unused and logically-zero pages */ - preempt_disable(); - pgste = pgste_get_lock(ptep); - pgstev = pgste_val(pgste); - pte = *ptep; - if (!reset && pte_swap(pte) && - ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED || - (pgstev & _PGSTE_GPS_ZERO))) { - ptep_zap_softleaf_entry(mm, softleaf_from_pte(pte)); - pte_clear(mm, addr, ptep); - } - if (reset) - pgste = clear_pgste_bit(pgste, _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); - pgste_set_unlock(ptep, pgste); - preempt_enable(); -} - -void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - unsigned long ptev; - pgste_t pgste; - - /* Clear storage key ACC and F, but set R/C */ - preempt_disable(); - pgste = 
pgste_get_lock(ptep); - pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT); - pgste = set_pgste_bit(pgste, PGSTE_GR_BIT | PGSTE_GC_BIT); - ptev = pte_val(*ptep); - if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE)) - page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0); - pgste_set_unlock(ptep, pgste); - preempt_enable(); -} - -/* - * Test and reset if a guest page is dirty - */ -bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - pgste_t pgste; - pte_t pte; - bool dirty; - int nodat; - - pgste = pgste_get_lock(ptep); - dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT); - pgste = clear_pgste_bit(pgste, PGSTE_UC_BIT); - pte = *ptep; - if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { - pgste = pgste_pte_notify(mm, addr, ptep, pgste); - nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); - ptep_ipte_global(mm, addr, ptep, nodat); - if (machine_has_esop() || !(pte_val(pte) & _PAGE_WRITE)) - pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT)); - else - pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID)); - set_pte(ptep, pte); - } - pgste_set_unlock(ptep, pgste); - return dirty; -} -EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc); - -int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned char key, bool nq) -{ - unsigned long keyul, paddr; - spinlock_t *ptl; - pgste_t old, new; - pmd_t *pmdp; - pte_t *ptep; - - /* - * If we don't have a PTE table and if there is no huge page mapped, - * we can ignore attempts to set the key to 0, because it already is 0. - */ - switch (pmd_lookup(mm, addr, &pmdp)) { - case -ENOENT: - return key ? -EFAULT : 0; - case 0: - break; - default: - return -EFAULT; - } -again: - ptl = pmd_lock(mm, pmdp); - if (!pmd_present(*pmdp)) { - spin_unlock(ptl); - return key ? -EFAULT : 0; - } - - if (pmd_leaf(*pmdp)) { - paddr = pmd_val(*pmdp) & HPAGE_MASK; - paddr |= addr & ~HPAGE_MASK; - /* - * Huge pmds need quiescing operations, they are - * always mapped. - */ - page_set_storage_key(paddr, key, 1); - spin_unlock(ptl); - return 0; - } - spin_unlock(ptl); - - ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); - if (!ptep) - goto again; - new = old = pgste_get_lock(ptep); - new = clear_pgste_bit(new, PGSTE_GR_BIT | PGSTE_GC_BIT | - PGSTE_ACC_BITS | PGSTE_FP_BIT); - keyul = (unsigned long) key; - new = set_pgste_bit(new, (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48); - new = set_pgste_bit(new, (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56); - if (!(pte_val(*ptep) & _PAGE_INVALID)) { - unsigned long bits, skey; - - paddr = pte_val(*ptep) & PAGE_MASK; - skey = (unsigned long) page_get_storage_key(paddr); - bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); - skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT); - /* Set storage key ACC and FP */ - page_set_storage_key(paddr, skey, !nq); - /* Merge host changed & referenced into pgste */ - new = set_pgste_bit(new, bits << 52); - } - /* changing the guest storage key is considered a change of the page */ - if ((pgste_val(new) ^ pgste_val(old)) & - (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT)) - new = set_pgste_bit(new, PGSTE_UC_BIT); - - pgste_set_unlock(ptep, new); - pte_unmap_unlock(ptep, ptl); - return 0; -} -EXPORT_SYMBOL(set_guest_storage_key); - -/* - * Conditionally set a guest storage key (handling csske). - * oldkey will be updated when either mr or mc is set and a pointer is given. - * - * Returns 0 if a guests storage key update wasn't necessary, 1 if the guest - * storage key was updated and -EFAULT on access errors. 
- */ -int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned char key, unsigned char *oldkey, - bool nq, bool mr, bool mc) -{ - unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT; - int rc; - - /* we can drop the pgste lock between getting and setting the key */ - if (mr | mc) { - rc = get_guest_storage_key(current->mm, addr, &tmp); - if (rc) - return rc; - if (oldkey) - *oldkey = tmp; - if (!mr) - mask |= _PAGE_REFERENCED; - if (!mc) - mask |= _PAGE_CHANGED; - if (!((tmp ^ key) & mask)) - return 0; - } - rc = set_guest_storage_key(current->mm, addr, key, nq); - return rc < 0 ? rc : 1; -} -EXPORT_SYMBOL(cond_set_guest_storage_key); - -/* - * Reset a guest reference bit (rrbe), returning the reference and changed bit. - * - * Returns < 0 in case of error, otherwise the cc to be reported to the guest. - */ -int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) -{ - spinlock_t *ptl; - unsigned long paddr; - pgste_t old, new; - pmd_t *pmdp; - pte_t *ptep; - int cc = 0; - - /* - * If we don't have a PTE table and if there is no huge page mapped, - * the storage key is 0 and there is nothing for us to do. - */ - switch (pmd_lookup(mm, addr, &pmdp)) { - case -ENOENT: - return 0; - case 0: - break; - default: - return -EFAULT; - } -again: - ptl = pmd_lock(mm, pmdp); - if (!pmd_present(*pmdp)) { - spin_unlock(ptl); - return 0; - } - - if (pmd_leaf(*pmdp)) { - paddr = pmd_val(*pmdp) & HPAGE_MASK; - paddr |= addr & ~HPAGE_MASK; - cc = page_reset_referenced(paddr); - spin_unlock(ptl); - return cc; - } - spin_unlock(ptl); - - ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); - if (!ptep) - goto again; - new = old = pgste_get_lock(ptep); - /* Reset guest reference bit only */ - new = clear_pgste_bit(new, PGSTE_GR_BIT); - - if (!(pte_val(*ptep) & _PAGE_INVALID)) { - paddr = pte_val(*ptep) & PAGE_MASK; - cc = page_reset_referenced(paddr); - /* Merge real referenced bit into host-set */ - new = set_pgste_bit(new, ((unsigned long)cc << 53) & PGSTE_HR_BIT); - } - /* Reflect guest's logical view, not physical */ - cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49; - /* Changing the guest storage key is considered a change of the page */ - if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT) - new = set_pgste_bit(new, PGSTE_UC_BIT); - - pgste_set_unlock(ptep, new); - pte_unmap_unlock(ptep, ptl); - return cc; -} -EXPORT_SYMBOL(reset_guest_reference_bit); - -int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned char *key) -{ - unsigned long paddr; - spinlock_t *ptl; - pgste_t pgste; - pmd_t *pmdp; - pte_t *ptep; - - /* - * If we don't have a PTE table and if there is no huge page mapped, - * the storage key is 0. 
- */ - *key = 0; - - switch (pmd_lookup(mm, addr, &pmdp)) { - case -ENOENT: - return 0; - case 0: - break; - default: - return -EFAULT; - } -again: - ptl = pmd_lock(mm, pmdp); - if (!pmd_present(*pmdp)) { - spin_unlock(ptl); - return 0; - } - - if (pmd_leaf(*pmdp)) { - paddr = pmd_val(*pmdp) & HPAGE_MASK; - paddr |= addr & ~HPAGE_MASK; - *key = page_get_storage_key(paddr); - spin_unlock(ptl); - return 0; - } - spin_unlock(ptl); - - ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); - if (!ptep) - goto again; - pgste = pgste_get_lock(ptep); - *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; - paddr = pte_val(*ptep) & PAGE_MASK; - if (!(pte_val(*ptep) & _PAGE_INVALID)) - *key = page_get_storage_key(paddr); - /* Reflect guest's logical view, not physical */ - *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; - pgste_set_unlock(ptep, pgste); - pte_unmap_unlock(ptep, ptl); - return 0; -} -EXPORT_SYMBOL(get_guest_storage_key); - -/** - * pgste_perform_essa - perform ESSA actions on the PGSTE. - * @mm: the memory context. It must have PGSTEs, no check is performed here! - * @hva: the host virtual address of the page whose PGSTE is to be processed - * @orc: the specific action to perform, see the ESSA_SET_* macros. - * @oldpte: the PTE will be saved there if the pointer is not NULL. - * @oldpgste: the old PGSTE will be saved there if the pointer is not NULL. - * - * Return: 1 if the page is to be added to the CBRL, otherwise 0, - * or < 0 in case of error. -EINVAL is returned for invalid values - * of orc, -EFAULT for invalid addresses. - */ -int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, - unsigned long *oldpte, unsigned long *oldpgste) -{ - struct vm_area_struct *vma; - unsigned long pgstev; - spinlock_t *ptl; - pgste_t pgste; - pte_t *ptep; - int res = 0; - - WARN_ON_ONCE(orc > ESSA_MAX); - if (unlikely(orc > ESSA_MAX)) - return -EINVAL; - - vma = vma_lookup(mm, hva); - if (!vma || is_vm_hugetlb_page(vma)) - return -EFAULT; - ptep = get_locked_pte(mm, hva, &ptl); - if (unlikely(!ptep)) - return -EFAULT; - pgste = pgste_get_lock(ptep); - pgstev = pgste_val(pgste); - if (oldpte) - *oldpte = pte_val(*ptep); - if (oldpgste) - *oldpgste = pgstev; - - switch (orc) { - case ESSA_GET_STATE: - break; - case ESSA_SET_STABLE: - pgstev &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); - pgstev |= _PGSTE_GPS_USAGE_STABLE; - break; - case ESSA_SET_UNUSED: - pgstev &= ~_PGSTE_GPS_USAGE_MASK; - pgstev |= _PGSTE_GPS_USAGE_UNUSED; - if (pte_val(*ptep) & _PAGE_INVALID) - res = 1; - break; - case ESSA_SET_VOLATILE: - pgstev &= ~_PGSTE_GPS_USAGE_MASK; - pgstev |= _PGSTE_GPS_USAGE_VOLATILE; - if (pte_val(*ptep) & _PAGE_INVALID) - res = 1; - break; - case ESSA_SET_POT_VOLATILE: - pgstev &= ~_PGSTE_GPS_USAGE_MASK; - if (!(pte_val(*ptep) & _PAGE_INVALID)) { - pgstev |= _PGSTE_GPS_USAGE_POT_VOLATILE; - break; - } - if (pgstev & _PGSTE_GPS_ZERO) { - pgstev |= _PGSTE_GPS_USAGE_VOLATILE; - break; - } - if (!(pgstev & PGSTE_GC_BIT)) { - pgstev |= _PGSTE_GPS_USAGE_VOLATILE; - res = 1; - break; - } - break; - case ESSA_SET_STABLE_RESIDENT: - pgstev &= ~_PGSTE_GPS_USAGE_MASK; - pgstev |= _PGSTE_GPS_USAGE_STABLE; - /* - * Since the resident state can go away any time after this - * call, we will not make this page resident. We can revisit - * this decision if a guest will ever start using this. 
- */ - break; - case ESSA_SET_STABLE_IF_RESIDENT: - if (!(pte_val(*ptep) & _PAGE_INVALID)) { - pgstev &= ~_PGSTE_GPS_USAGE_MASK; - pgstev |= _PGSTE_GPS_USAGE_STABLE; - } - break; - case ESSA_SET_STABLE_NODAT: - pgstev &= ~_PGSTE_GPS_USAGE_MASK; - pgstev |= _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT; - break; - default: - /* we should never get here! */ - break; - } - /* If we are discarding a page, set it to logical zero */ - if (res) - pgstev |= _PGSTE_GPS_ZERO; - - pgste = __pgste(pgstev); - pgste_set_unlock(ptep, pgste); - pte_unmap_unlock(ptep, ptl); - return res; -} -EXPORT_SYMBOL(pgste_perform_essa); - -/** - * set_pgste_bits - set specific PGSTE bits. - * @mm: the memory context. It must have PGSTEs, no check is performed here! - * @hva: the host virtual address of the page whose PGSTE is to be processed - * @bits: a bitmask representing the bits that will be touched - * @value: the values of the bits to be written. Only the bits in the mask - * will be written. - * - * Return: 0 on success, < 0 in case of error. - */ -int set_pgste_bits(struct mm_struct *mm, unsigned long hva, - unsigned long bits, unsigned long value) -{ - struct vm_area_struct *vma; - spinlock_t *ptl; - pgste_t new; - pte_t *ptep; - - vma = vma_lookup(mm, hva); - if (!vma || is_vm_hugetlb_page(vma)) - return -EFAULT; - ptep = get_locked_pte(mm, hva, &ptl); - if (unlikely(!ptep)) - return -EFAULT; - new = pgste_get_lock(ptep); - - new = clear_pgste_bit(new, bits); - new = set_pgste_bit(new, value & bits); - - pgste_set_unlock(ptep, new); - pte_unmap_unlock(ptep, ptl); - return 0; -} -EXPORT_SYMBOL(set_pgste_bits); - -/** - * get_pgste - get the current PGSTE for the given address. - * @mm: the memory context. It must have PGSTEs, no check is performed here! - * @hva: the host virtual address of the page whose PGSTE is to be processed - * @pgstep: will be written with the current PGSTE for the given address. - * - * Return: 0 on success, < 0 in case of error. - */ -int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep) -{ - struct vm_area_struct *vma; - spinlock_t *ptl; - pte_t *ptep; - - vma = vma_lookup(mm, hva); - if (!vma || is_vm_hugetlb_page(vma)) - return -EFAULT; - ptep = get_locked_pte(mm, hva, &ptl); - if (unlikely(!ptep)) - return -EFAULT; - *pgstep = pgste_val(pgste_get(ptep)); - pte_unmap_unlock(ptep, ptl); - return 0; -} -EXPORT_SYMBOL(get_pgste); -#endif diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 97d1b2824386..be3a2a603fb1 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -343,15 +343,6 @@ int hugepage_madvise(struct vm_area_struct *vma, { switch (advice) { case MADV_HUGEPAGE: -#ifdef CONFIG_S390 - /* - * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390 - * can't handle this properly after s390_enable_sie, so we simply - * ignore the madvise to prevent qemu from causing a SIGSEGV. - */ - if (mm_has_pgste(vma->vm_mm)) - return 0; -#endif *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; /* From 0fdd5c18a99cc89e829243f9f8561b37b7ebc4fc Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:56 +0100 Subject: [PATCH 215/267] KVM: s390: Enable 1M pages for gmap While userspace is allowed to have pages of any size, the new gmap would always use 4k pages to back the guest. Enable 1M pages for gmap. This allows 1M pages to be used to back a guest when userspace is using 1M pages for the corresponding addresses (e.g. THP or hugetlbfs). Remove the limitation that disallowed having nested guests and hugepages at the same time. 
Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/gmap.c | 2 +- arch/s390/kvm/kvm-s390.c | 6 +----- arch/s390/kvm/pv.c | 3 +++ 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c index fea1c66fcabe..da222962ef6d 100644 --- a/arch/s390/kvm/gmap.c +++ b/arch/s390/kvm/gmap.c @@ -620,7 +620,7 @@ static inline bool gmap_2g_allowed(struct gmap *gmap, gfn_t gfn) static inline bool gmap_1m_allowed(struct gmap *gmap, gfn_t gfn) { - return false; + return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags); } int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index bde55761bf8a..ac7b5f56f0b5 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -851,6 +851,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) r = -EINVAL; else { r = 0; + set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &kvm->arch.gmap->flags); /* * We might have to create fake 4k page * tables. To avoid that the hardware works on @@ -5729,11 +5730,6 @@ static int __init kvm_s390_init(void) return -ENODEV; } - if (nested && hpage) { - pr_info("A KVM host that supports nesting cannot back its KVM guests with huge pages\n"); - return -EINVAL; - } - for (i = 0; i < 16; i++) kvm_s390_fac_base[i] |= stfle_fac_list[i] & nonhyp_mask(i); diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index a48a8afd40df..461b413c76a3 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -721,6 +721,9 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) uvcb.flags.ap_allow_instr = kvm->arch.model.uv_feat_guest.ap; uvcb.flags.ap_instr_intr = kvm->arch.model.uv_feat_guest.ap_intr; + clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &kvm->arch.gmap->flags); + gmap_split_huge_pages(kvm->arch.gmap); + cc = uv_call_sched(0, (u64)&uvcb); *rc = uvcb.header.rc; *rrc = uvcb.header.rrc; From 0ee4ddc1647b8b3b9e7a94d798a1774a764428c1 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:57 +0100 Subject: [PATCH 216/267] KVM: s390: Storage key manipulation IOCTL Add a new IOCTL to allow userspace to manipulate storage keys directly. This will make it easier to write selftests related to storage keys. Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- Documentation/virt/kvm/api.rst | 42 ++++++++++++++++++++++++ arch/s390/kvm/kvm-s390.c | 58 ++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 11 +++++++ 3 files changed, 111 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 01a3abef8abb..72e04dedb068 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6517,6 +6517,40 @@ the capability to be present. `flags` must currently be zero. +4.144 KVM_S390_KEYOP +-------------------- + +:Capability: KVM_CAP_S390_KEYOP +:Architectures: s390 +:Type: vm ioctl +:Parameters: struct kvm_s390_keyop (in/out) +:Returns: 0 in case of success, < 0 on error + +The specified key operation is performed on the given guest address. The +previous storage key (or the relevant part thereof) will be returned in +`key`. + +:: + + struct kvm_s390_keyop { + __u64 guest_addr; + __u8 key; + __u8 operation; + }; + +Currently supported values for ``operation``: + +KVM_S390_KEYOP_ISKE + Returns the storage key for the guest address ``guest_addr`` in ``key``. 
+ +KVM_S390_KEYOP_RRBE + Resets the reference bit for the guest address ``guest_addr``, returning the + R and C bits of the old storage key in ``key``; the remaining fields of + the storage key will be set to 0. + +KVM_S390_KEYOP_SSKE + Sets the storage key for the guest address ``guest_addr`` to the key + specified in ``key``, returning the previous value in ``key``. .. _kvm_run: @@ -9287,6 +9321,14 @@ The presence of this capability indicates that KVM_RUN will update the KVM_RUN_X86_GUEST_MODE bit in kvm_run.flags to indicate whether the vCPU was executing nested guest code when it exited. +8.46 KVM_CAP_S390_KEYOP +----------------------- + +:Architectures: s390 + +The presence of this capability indicates that the KVM_S390_KEYOP ioctl is +available. + KVM exits with the register state of either the L1 or L2 guest depending on which executed at the time of an exit. Userspace must take care to differentiate between these cases. diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ac7b5f56f0b5..9f24252775dd 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -554,6 +554,37 @@ static void __kvm_s390_exit(void) debug_unregister(kvm_s390_dbf_uv); } +static int kvm_s390_keyop(struct kvm_s390_mmu_cache *mc, struct kvm *kvm, int op, + unsigned long addr, union skey skey) +{ + union asce asce = kvm->arch.gmap->asce; + gfn_t gfn = gpa_to_gfn(addr); + int r; + + guard(read_lock)(&kvm->mmu_lock); + + switch (op) { + case KVM_S390_KEYOP_SSKE: + r = dat_cond_set_storage_key(mc, asce, gfn, skey, &skey, 0, 0, 0); + if (r >= 0) + return skey.skey; + break; + case KVM_S390_KEYOP_ISKE: + r = dat_get_storage_key(asce, gfn, &skey); + if (!r) + return skey.skey; + break; + case KVM_S390_KEYOP_RRBE: + r = dat_reset_reference_bit(asce, gfn); + if (r > 0) + return r << 1; + break; + default: + return -EINVAL; + } + return r; +} + /* Section: device related */ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) @@ -598,6 +629,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_DIAG318: case KVM_CAP_IRQFD_RESAMPLE: case KVM_CAP_S390_USER_OPEREXEC: + case KVM_CAP_S390_KEYOP: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: @@ -2931,6 +2963,32 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) r = -EFAULT; break; } + case KVM_S390_KEYOP: { + struct kvm_s390_mmu_cache *mc; + struct kvm_s390_keyop kop; + union skey skey; + + if (copy_from_user(&kop, argp, sizeof(kop))) { + r = -EFAULT; + break; + } + skey.skey = kop.key; + + mc = kvm_s390_new_mmu_cache(); + if (!mc) + return -ENOMEM; + + r = kvm_s390_keyop(mc, kvm, kop.operation, kop.guest_addr, skey); + kvm_s390_free_mmu_cache(mc); + if (r < 0) + break; + + kop.key = r; + r = 0; + if (copy_to_user(argp, &kop, sizeof(kop))) + r = -EFAULT; + break; + } case KVM_S390_ZPCI_OP: { struct kvm_s390_zpci_op args; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index dddb781b0507..ab3d3d96e75f 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -974,6 +974,7 @@ struct kvm_enable_cap { #define KVM_CAP_GUEST_MEMFD_FLAGS 244 #define KVM_CAP_ARM_SEA_TO_USER 245 #define KVM_CAP_S390_USER_OPEREXEC 246 +#define KVM_CAP_S390_KEYOP 247 struct kvm_irq_routing_irqchip { __u32 irqchip; @@ -1219,6 +1220,15 @@ struct kvm_vfio_spapr_tce { __s32 tablefd; }; +#define KVM_S390_KEYOP_ISKE 0x01 +#define KVM_S390_KEYOP_RRBE 0x02 +#define KVM_S390_KEYOP_SSKE 0x03 +struct kvm_s390_keyop { + __u64 guest_addr; + __u8 key; + __u8 operation; 
+}; + /* * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns * a vcpu fd. @@ -1238,6 +1248,7 @@ struct kvm_vfio_spapr_tce { #define KVM_S390_UCAS_MAP _IOW(KVMIO, 0x50, struct kvm_s390_ucas_mapping) #define KVM_S390_UCAS_UNMAP _IOW(KVMIO, 0x51, struct kvm_s390_ucas_mapping) #define KVM_S390_VCPU_FAULT _IOW(KVMIO, 0x52, unsigned long) +#define KVM_S390_KEYOP _IOWR(KVMIO, 0x53, struct kvm_s390_keyop) /* Device model IOC */ #define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) From 52940a34a85bc8a17a095f6fae80c33a18c1f7ec Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:58 +0100 Subject: [PATCH 217/267] KVM: s390: selftests: Add selftest for the KVM_S390_KEYOP ioctl This test allows to test the various storage key handling functions. Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- tools/testing/selftests/kvm/Makefile.kvm | 1 + tools/testing/selftests/kvm/s390/keyop.c | 299 +++++++++++++++++++++++ 2 files changed, 300 insertions(+) create mode 100644 tools/testing/selftests/kvm/s390/keyop.c diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index ba5c2b643efa..2e4774666723 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -199,6 +199,7 @@ TEST_GEN_PROGS_s390 += s390/cpumodel_subfuncs_test TEST_GEN_PROGS_s390 += s390/shared_zeropage_test TEST_GEN_PROGS_s390 += s390/ucontrol_test TEST_GEN_PROGS_s390 += s390/user_operexec +TEST_GEN_PROGS_s390 += s390/keyop TEST_GEN_PROGS_s390 += rseq_test TEST_GEN_PROGS_riscv = $(TEST_GEN_PROGS_COMMON) diff --git a/tools/testing/selftests/kvm/s390/keyop.c b/tools/testing/selftests/kvm/s390/keyop.c new file mode 100644 index 000000000000..c7805e87d12c --- /dev/null +++ b/tools/testing/selftests/kvm/s390/keyop.c @@ -0,0 +1,299 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test for s390x KVM_S390_KEYOP + * + * Copyright IBM Corp. 
2026 + * + * Authors: + * Claudio Imbrenda + */ +#include +#include +#include +#include + +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "kselftest.h" +#include "processor.h" + +#define BUF_PAGES 128UL +#define GUEST_PAGES 256UL + +#define BUF_START_GFN (GUEST_PAGES - BUF_PAGES) +#define BUF_START_ADDR (BUF_START_GFN << PAGE_SHIFT) + +#define KEY_BITS_ACC 0xf0 +#define KEY_BIT_F 0x08 +#define KEY_BIT_R 0x04 +#define KEY_BIT_C 0x02 + +#define KEY_BITS_RC (KEY_BIT_R | KEY_BIT_C) +#define KEY_BITS_ALL (KEY_BITS_ACC | KEY_BIT_F | KEY_BITS_RC) + +static unsigned char tmp[BUF_PAGES]; +static unsigned char old[BUF_PAGES]; +static unsigned char expected[BUF_PAGES]; + +static int _get_skeys(struct kvm_vcpu *vcpu, unsigned char skeys[]) +{ + struct kvm_s390_skeys skeys_ioctl = { + .start_gfn = BUF_START_GFN, + .count = BUF_PAGES, + .skeydata_addr = (unsigned long)skeys, + }; + + return __vm_ioctl(vcpu->vm, KVM_S390_GET_SKEYS, &skeys_ioctl); +} + +static void get_skeys(struct kvm_vcpu *vcpu, unsigned char skeys[]) +{ + int r = _get_skeys(vcpu, skeys); + + TEST_ASSERT(!r, "Failed to get storage keys, r=%d", r); +} + +static void set_skeys(struct kvm_vcpu *vcpu, unsigned char skeys[]) +{ + struct kvm_s390_skeys skeys_ioctl = { + .start_gfn = BUF_START_GFN, + .count = BUF_PAGES, + .skeydata_addr = (unsigned long)skeys, + }; + int r; + + r = __vm_ioctl(vcpu->vm, KVM_S390_SET_SKEYS, &skeys_ioctl); + TEST_ASSERT(!r, "Failed to set storage keys, r=%d", r); +} + +static int do_keyop(struct kvm_vcpu *vcpu, int op, unsigned long page_idx, unsigned char skey) +{ + struct kvm_s390_keyop keyop = { + .guest_addr = BUF_START_ADDR + page_idx * PAGE_SIZE, + .key = skey, + .operation = op, + }; + int r; + + r = __vm_ioctl(vcpu->vm, KVM_S390_KEYOP, &keyop); + TEST_ASSERT(!r, "Failed to perform keyop, r=%d", r); + TEST_ASSERT((keyop.key & 1) == 0, + "Last bit of key is 1, should be 0! page %lu, new key=%#x, old key=%#x", + page_idx, skey, keyop.key); + + return keyop.key; +} + +static void fault_in_buffer(struct kvm_vcpu *vcpu, int where, int cur_loc) +{ + unsigned long i; + int r; + + if (where != cur_loc) + return; + + for (i = 0; i < BUF_PAGES; i++) { + r = ioctl(vcpu->fd, KVM_S390_VCPU_FAULT, BUF_START_ADDR + i * PAGE_SIZE); + TEST_ASSERT(!r, "Faulting in buffer page %lu, r=%d", i, r); + } +} + +static inline void set_pattern(unsigned char skeys[]) +{ + int i; + + for (i = 0; i < BUF_PAGES; i++) + skeys[i] = i << 1; +} + +static void dump_sk(const unsigned char skeys[], const char *descr) +{ + int i, j; + + fprintf(stderr, "# %s:\n", descr); + for (i = 0; i < BUF_PAGES; i += 32) { + fprintf(stderr, "# %3d: ", i); + for (j = 0; j < 32; j++) + fprintf(stderr, "%02x ", skeys[i + j]); + fprintf(stderr, "\n"); + } +} + +static inline void compare(const unsigned char what[], const unsigned char expected[], + const char *descr, int fault_in_loc) +{ + int i; + + for (i = 0; i < BUF_PAGES; i++) { + if (expected[i] != what[i]) { + dump_sk(expected, "Expected"); + dump_sk(what, "Got"); + } + TEST_ASSERT(expected[i] == what[i], + "%s! 
fault-in location %d, page %d, expected %#x, got %#x", + descr, fault_in_loc, i, expected[i], what[i]); + } +} + +static inline void clear_all(void) +{ + memset(tmp, 0, BUF_PAGES); + memset(old, 0, BUF_PAGES); + memset(expected, 0, BUF_PAGES); +} + +static void test_init(struct kvm_vcpu *vcpu, int fault_in) +{ + /* Set all storage keys to zero */ + fault_in_buffer(vcpu, fault_in, 1); + set_skeys(vcpu, expected); + + fault_in_buffer(vcpu, fault_in, 2); + get_skeys(vcpu, tmp); + compare(tmp, expected, "Setting keys not zero", fault_in); + + /* Set storage keys to a sequential pattern */ + fault_in_buffer(vcpu, fault_in, 3); + set_pattern(expected); + set_skeys(vcpu, expected); + + fault_in_buffer(vcpu, fault_in, 4); + get_skeys(vcpu, tmp); + compare(tmp, expected, "Setting storage keys failed", fault_in); +} + +static void test_rrbe(struct kvm_vcpu *vcpu, int fault_in) +{ + unsigned char k; + int i; + + /* Set storage keys to a sequential pattern */ + fault_in_buffer(vcpu, fault_in, 1); + set_pattern(expected); + set_skeys(vcpu, expected); + + /* Call the RRBE KEYOP ioctl on each page and verify the result */ + fault_in_buffer(vcpu, fault_in, 2); + for (i = 0; i < BUF_PAGES; i++) { + k = do_keyop(vcpu, KVM_S390_KEYOP_RRBE, i, 0xff); + TEST_ASSERT((expected[i] & KEY_BITS_RC) == k, + "Old R or C value mismatch! expected: %#x, got %#x", + expected[i] & KEY_BITS_RC, k); + if (i == BUF_PAGES / 2) + fault_in_buffer(vcpu, fault_in, 3); + } + + for (i = 0; i < BUF_PAGES; i++) + expected[i] &= ~KEY_BIT_R; + + /* Verify that only the R bit has been cleared */ + fault_in_buffer(vcpu, fault_in, 4); + get_skeys(vcpu, tmp); + compare(tmp, expected, "New value mismatch", fault_in); +} + +static void test_iske(struct kvm_vcpu *vcpu, int fault_in) +{ + int i; + + /* Set storage keys to a sequential pattern */ + fault_in_buffer(vcpu, fault_in, 1); + set_pattern(expected); + set_skeys(vcpu, expected); + + /* Call the ISKE KEYOP ioctl on each page and verify the result */ + fault_in_buffer(vcpu, fault_in, 2); + for (i = 0; i < BUF_PAGES; i++) { + tmp[i] = do_keyop(vcpu, KVM_S390_KEYOP_ISKE, i, 0xff); + if (i == BUF_PAGES / 2) + fault_in_buffer(vcpu, fault_in, 3); + } + compare(tmp, expected, "Old value mismatch", fault_in); + + /* Check storage keys have not changed */ + fault_in_buffer(vcpu, fault_in, 4); + get_skeys(vcpu, tmp); + compare(tmp, expected, "Storage keys values changed", fault_in); +} + +static void test_sske(struct kvm_vcpu *vcpu, int fault_in) +{ + int i; + + /* Set storage keys to a sequential pattern */ + fault_in_buffer(vcpu, fault_in, 1); + set_pattern(tmp); + set_skeys(vcpu, tmp); + + /* Call the SSKE KEYOP ioctl on each page and verify the result */ + fault_in_buffer(vcpu, fault_in, 2); + for (i = 0; i < BUF_PAGES; i++) { + expected[i] = ~tmp[i] & KEY_BITS_ALL; + /* Set the new storage keys to be the bit-inversion of the previous ones */ + old[i] = do_keyop(vcpu, KVM_S390_KEYOP_SSKE, i, expected[i] | 1); + if (i == BUF_PAGES / 2) + fault_in_buffer(vcpu, fault_in, 3); + } + compare(old, tmp, "Old value mismatch", fault_in); + + /* Verify that the storage keys have been set correctly */ + fault_in_buffer(vcpu, fault_in, 4); + get_skeys(vcpu, tmp); + compare(tmp, expected, "New value mismatch", fault_in); +} + +static struct testdef { + const char *name; + void (*test)(struct kvm_vcpu *vcpu, int fault_in_location); + int n_fault_in_locations; +} testplan[] = { + { "Initialization", test_init, 5 }, + { "RRBE", test_rrbe, 5 }, + { "ISKE", test_iske, 5 }, + { "SSKE", test_sske, 5 }, +}; + 
+static void run_test(void (*the_test)(struct kvm_vcpu *, int), int fault_in_location) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int r; + + vm = vm_create_barebones(); + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, GUEST_PAGES, 0); + vcpu = __vm_vcpu_add(vm, 0); + + r = _get_skeys(vcpu, tmp); + TEST_ASSERT(r == KVM_S390_GET_SKEYS_NONE, + "Storage keys are not disabled initially, r=%d", r); + + clear_all(); + + the_test(vcpu, fault_in_location); + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + int i, f; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_KEYOP)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_UCONTROL)); + + ksft_print_header(); + for (i = 0, f = 0; i < ARRAY_SIZE(testplan); i++) + f += testplan[i].n_fault_in_locations; + ksft_set_plan(f); + + for (i = 0; i < ARRAY_SIZE(testplan); i++) { + for (f = 0; f < testplan[i].n_fault_in_locations; f++) { + run_test(testplan[i].test, f); + ksft_test_result_pass("%s (fault-in location %d)\n", testplan[i].name, f); + } + } + + ksft_finished(); /* Print results and exit() accordingly */ +} From 0a35bd285f43c26ccec33872fc6bb679069eaea8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Feb 2026 18:43:10 +0000 Subject: [PATCH 218/267] arm64: Convert SCTLR_EL2 to sysreg infrastructure Convert SCTLR_EL2 to the sysreg infrastructure, as per the 2025-12_rel revision of the Registers.json file. Note that we slightly deviate from the above, as we stick to the ARM ARM M.a definition of SCTLR_EL2[9], which is RES0, in order to avoid dragging the POE2 definitions... Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260202184329.2724080-2-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/sysreg.h | 7 --- arch/arm64/tools/sysreg | 69 +++++++++++++++++++++++++++ tools/arch/arm64/include/asm/sysreg.h | 6 --- 3 files changed, 69 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 939f9c5bbae6..30f0409b1c80 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -504,7 +504,6 @@ #define SYS_VPIDR_EL2 sys_reg(3, 4, 0, 0, 0) #define SYS_VMPIDR_EL2 sys_reg(3, 4, 0, 0, 5) -#define SYS_SCTLR_EL2 sys_reg(3, 4, 1, 0, 0) #define SYS_ACTLR_EL2 sys_reg(3, 4, 1, 0, 1) #define SYS_SCTLR2_EL2 sys_reg(3, 4, 1, 0, 3) #define SYS_HCR_EL2 sys_reg(3, 4, 1, 1, 0) @@ -837,12 +836,6 @@ #define SCTLR_ELx_A (BIT(1)) #define SCTLR_ELx_M (BIT(0)) -/* SCTLR_EL2 specific flags. 
*/ -#define SCTLR_EL2_RES1 ((BIT(4)) | (BIT(5)) | (BIT(11)) | (BIT(16)) | \ - (BIT(18)) | (BIT(22)) | (BIT(23)) | (BIT(28)) | \ - (BIT(29))) - -#define SCTLR_EL2_BT (BIT(36)) #ifdef CONFIG_CPU_BIG_ENDIAN #define ENDIAN_SET_EL2 SCTLR_ELx_EE #else diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index a0f6249bd4f9..969a75615d61 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -3749,6 +3749,75 @@ UnsignedEnum 2:0 F8S1 EndEnum EndSysreg +Sysreg SCTLR_EL2 3 4 1 0 0 +Field 63 TIDCP +Field 62 SPINTMASK +Field 61 NMI +Field 60 EnTP2 +Field 59 TCSO +Field 58 TCSO0 +Field 57 EPAN +Field 56 EnALS +Field 55 EnAS0 +Field 54 EnASR +Res0 53:50 +Field 49:46 TWEDEL +Field 45 TWEDEn +Field 44 DSSBS +Field 43 ATA +Field 42 ATA0 +Enum 41:40 TCF + 0b00 NONE + 0b01 SYNC + 0b10 ASYNC + 0b11 ASYMM +EndEnum +Enum 39:38 TCF0 + 0b00 NONE + 0b01 SYNC + 0b10 ASYNC + 0b11 ASYMM +EndEnum +Field 37 ITFSB +Field 36 BT +Field 35 BT0 +Field 34 EnFPM +Field 33 MSCEn +Field 32 CMOW +Field 31 EnIA +Field 30 EnIB +Field 29 LSMAOE +Field 28 nTLSMD +Field 27 EnDA +Field 26 UCI +Field 25 EE +Field 24 E0E +Field 23 SPAN +Field 22 EIS +Field 21 IESB +Field 20 TSCXT +Field 19 WXN +Field 18 nTWE +Res0 17 +Field 16 nTWI +Field 15 UCT +Field 14 DZE +Field 13 EnDB +Field 12 I +Field 11 EOS +Field 10 EnRCTX +Res0 9 +Field 8 SED +Field 7 ITD +Field 6 nAA +Field 5 CP15BEN +Field 4 SA0 +Field 3 SA +Field 2 C +Field 1 A +Field 0 M +EndSysreg + Sysreg HCR_EL2 3 4 1 1 0 Field 63:60 TWEDEL Field 59 TWEDEn diff --git a/tools/arch/arm64/include/asm/sysreg.h b/tools/arch/arm64/include/asm/sysreg.h index 178b7322bf04..f75efe98e9df 100644 --- a/tools/arch/arm64/include/asm/sysreg.h +++ b/tools/arch/arm64/include/asm/sysreg.h @@ -847,12 +847,6 @@ #define SCTLR_ELx_A (BIT(1)) #define SCTLR_ELx_M (BIT(0)) -/* SCTLR_EL2 specific flags. */ -#define SCTLR_EL2_RES1 ((BIT(4)) | (BIT(5)) | (BIT(11)) | (BIT(16)) | \ - (BIT(18)) | (BIT(22)) | (BIT(23)) | (BIT(28)) | \ - (BIT(29))) - -#define SCTLR_EL2_BT (BIT(36)) #ifdef CONFIG_CPU_BIG_ENDIAN #define ENDIAN_SET_EL2 SCTLR_ELx_EE #else From 4faf52106de73cdd1f34752afd60b256721e97b5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Feb 2026 18:43:11 +0000 Subject: [PATCH 219/267] KVM: arm64: Remove duplicate configuration for SCTLR_EL1.{EE,E0E} We already have specific constraints for SCTLR_EL1.{EE,E0E}, and making them depend on FEAT_AA64EL1 is just buggy. Fixes: 6bd4a274b026e ("KVM: arm64: Convert SCTLR_EL1 to config-driven sanitisation") Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260202184329.2724080-3-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/config.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c index 9c04f895d376..0bcdb3988573 100644 --- a/arch/arm64/kvm/config.c +++ b/arch/arm64/kvm/config.c @@ -1140,8 +1140,6 @@ static const struct reg_bits_to_feat_map sctlr_el1_feat_map[] = { SCTLR_EL1_TWEDEn, FEAT_TWED), NEEDS_FEAT(SCTLR_EL1_UCI | - SCTLR_EL1_EE | - SCTLR_EL1_E0E | SCTLR_EL1_WXN | SCTLR_EL1_nTWE | SCTLR_EL1_nTWI | From a3c92001812358c35c55c844c1e0d9de8caf13fe Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Feb 2026 18:43:12 +0000 Subject: [PATCH 220/267] KVM: arm64: Introduce standalone FGU computing primitive Computing the FGU bits is made oddly complicated, as we use the RES0 helper instead of using a specific abstraction. Introduce such an abstraction, which is going to make things significantly simpler in the future. 
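As a rough illustration (not part of the patch itself), the point of the split is that the FGU mask only aggregates the feature-gated bits, while the per-register RES0 computation additionally folds in architectural RES0 bits and the existence of the register. A caller-side sketch, using the helpers as they exist at this point in the series:

    /* Sketch only: FGU bits vs. sanitisation bits for one FGT register */
    u64 fgu  = compute_fgu_bits(kvm, &hfgrtr_desc);            /* trap-and-UNDEF bits  */
    u64 res0 = compute_reg_res0_bits(kvm, &hfgrtr_desc, 0, 0); /* RES0 for sanitisation */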
Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260202184329.2724080-4-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/config.c | 57 ++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 32 deletions(-) diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c index 0bcdb3988573..2122599f7cbb 100644 --- a/arch/arm64/kvm/config.c +++ b/arch/arm64/kvm/config.c @@ -1335,26 +1335,30 @@ static u64 compute_res0_bits(struct kvm *kvm, static u64 compute_reg_res0_bits(struct kvm *kvm, const struct reg_feat_map_desc *r, unsigned long require, unsigned long exclude) - { u64 res0; res0 = compute_res0_bits(kvm, r->bit_feat_map, r->bit_feat_map_sz, require, exclude); - /* - * If computing FGUs, don't take RES0 or register existence - * into account -- we're not computing bits for the register - * itself. - */ - if (!(exclude & NEVER_FGU)) { - res0 |= compute_res0_bits(kvm, &r->feat_map, 1, require, exclude); - res0 |= ~reg_feat_map_bits(&r->feat_map); - } + res0 |= compute_res0_bits(kvm, &r->feat_map, 1, require, exclude); + res0 |= ~reg_feat_map_bits(&r->feat_map); return res0; } +static u64 compute_fgu_bits(struct kvm *kvm, const struct reg_feat_map_desc *r) +{ + /* + * If computing FGUs, we collect the unsupported feature bits as + * RES0 bits, but don't take the actual RES0 bits or register + * existence into account -- we're not computing bits for the + * register itself. + */ + return compute_res0_bits(kvm, r->bit_feat_map, r->bit_feat_map_sz, + 0, NEVER_FGU); +} + static u64 compute_reg_fixed_bits(struct kvm *kvm, const struct reg_feat_map_desc *r, u64 *fixed_bits, unsigned long require, @@ -1370,40 +1374,29 @@ void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt) switch (fgt) { case HFGRTR_GROUP: - val |= compute_reg_res0_bits(kvm, &hfgrtr_desc, - 0, NEVER_FGU); - val |= compute_reg_res0_bits(kvm, &hfgwtr_desc, - 0, NEVER_FGU); + val |= compute_fgu_bits(kvm, &hfgrtr_desc); + val |= compute_fgu_bits(kvm, &hfgwtr_desc); break; case HFGITR_GROUP: - val |= compute_reg_res0_bits(kvm, &hfgitr_desc, - 0, NEVER_FGU); + val |= compute_fgu_bits(kvm, &hfgitr_desc); break; case HDFGRTR_GROUP: - val |= compute_reg_res0_bits(kvm, &hdfgrtr_desc, - 0, NEVER_FGU); - val |= compute_reg_res0_bits(kvm, &hdfgwtr_desc, - 0, NEVER_FGU); + val |= compute_fgu_bits(kvm, &hdfgrtr_desc); + val |= compute_fgu_bits(kvm, &hdfgwtr_desc); break; case HAFGRTR_GROUP: - val |= compute_reg_res0_bits(kvm, &hafgrtr_desc, - 0, NEVER_FGU); + val |= compute_fgu_bits(kvm, &hafgrtr_desc); break; case HFGRTR2_GROUP: - val |= compute_reg_res0_bits(kvm, &hfgrtr2_desc, - 0, NEVER_FGU); - val |= compute_reg_res0_bits(kvm, &hfgwtr2_desc, - 0, NEVER_FGU); + val |= compute_fgu_bits(kvm, &hfgrtr2_desc); + val |= compute_fgu_bits(kvm, &hfgwtr2_desc); break; case HFGITR2_GROUP: - val |= compute_reg_res0_bits(kvm, &hfgitr2_desc, - 0, NEVER_FGU); + val |= compute_fgu_bits(kvm, &hfgitr2_desc); break; case HDFGRTR2_GROUP: - val |= compute_reg_res0_bits(kvm, &hdfgrtr2_desc, - 0, NEVER_FGU); - val |= compute_reg_res0_bits(kvm, &hdfgwtr2_desc, - 0, NEVER_FGU); + val |= compute_fgu_bits(kvm, &hdfgrtr2_desc); + val |= compute_fgu_bits(kvm, &hdfgwtr2_desc); break; default: BUG(); From 0879478913dd671b0aed1e3960c4b35fb8546ab4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Feb 2026 18:43:13 +0000 Subject: [PATCH 221/267] KVM: arm64: Introduce data structure tracking both RES0 and RES1 bits We have so far mostly tracked RES0 bits, but only made a few attempts at being just as 
strict for RES1 bits (probably because they are both rarer and harder to handle). Start scratching the surface by introducing a data structure tracking RES0 and RES1 bits at the same time. Note that contrary to the usual idiom, this structure is mostly passed around by value -- the ABI handles it nicely, and the resulting code is much nicer. Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260202184329.2724080-5-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 23 +++-- arch/arm64/kvm/config.c | 148 ++++++++++++++++-------------- arch/arm64/kvm/nested.c | 129 +++++++++++++------------- 3 files changed, 161 insertions(+), 139 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index b552a1e03848..799f494a1349 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -626,13 +626,24 @@ enum vcpu_sysreg { NR_SYS_REGS /* Nothing after this line! */ }; -struct kvm_sysreg_masks { - struct { - u64 res0; - u64 res1; - } mask[NR_SYS_REGS - __SANITISED_REG_START__]; +struct resx { + u64 res0; + u64 res1; }; +struct kvm_sysreg_masks { + struct resx mask[NR_SYS_REGS - __SANITISED_REG_START__]; +}; + +static inline void __kvm_set_sysreg_resx(struct kvm_arch *arch, + enum vcpu_sysreg sr, struct resx resx) +{ + arch->sysreg_masks->mask[sr - __SANITISED_REG_START__] = resx; +} + +#define kvm_set_sysreg_resx(k, sr, resx) \ + __kvm_set_sysreg_resx(&(k)->arch, (sr), (resx)) + struct fgt_masks { const char *str; u64 mask; @@ -1607,7 +1618,7 @@ static inline bool kvm_arch_has_irq_bypass(void) } void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt); -void get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg, u64 *res0, u64 *res1); +struct resx get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg); void check_feature_map(void); void kvm_vcpu_load_fgt(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c index 2122599f7cbb..2214c06902f8 100644 --- a/arch/arm64/kvm/config.c +++ b/arch/arm64/kvm/config.c @@ -1290,14 +1290,14 @@ static bool idreg_feat_match(struct kvm *kvm, const struct reg_bits_to_feat_map } } -static u64 __compute_fixed_bits(struct kvm *kvm, - const struct reg_bits_to_feat_map *map, - int map_size, - u64 *fixed_bits, - unsigned long require, - unsigned long exclude) +static struct resx __compute_fixed_bits(struct kvm *kvm, + const struct reg_bits_to_feat_map *map, + int map_size, + u64 *fixed_bits, + unsigned long require, + unsigned long exclude) { - u64 val = 0; + struct resx resx = {}; for (int i = 0; i < map_size; i++) { bool match; @@ -1316,53 +1316,62 @@ static u64 __compute_fixed_bits(struct kvm *kvm, match = idreg_feat_match(kvm, &map[i]); if (!match || (map[i].flags & FIXED_VALUE)) - val |= reg_feat_map_bits(&map[i]); + resx.res0 |= reg_feat_map_bits(&map[i]); } - return val; + return resx; } -static u64 compute_res0_bits(struct kvm *kvm, - const struct reg_bits_to_feat_map *map, - int map_size, - unsigned long require, - unsigned long exclude) +static struct resx compute_resx_bits(struct kvm *kvm, + const struct reg_bits_to_feat_map *map, + int map_size, + unsigned long require, + unsigned long exclude) { return __compute_fixed_bits(kvm, map, map_size, NULL, require, exclude | FIXED_VALUE); } -static u64 compute_reg_res0_bits(struct kvm *kvm, - const struct reg_feat_map_desc *r, - unsigned long require, unsigned long exclude) +static struct resx compute_reg_resx_bits(struct kvm *kvm, + const struct 
reg_feat_map_desc *r, + unsigned long require, + unsigned long exclude) { - u64 res0; + struct resx resx, tmp; - res0 = compute_res0_bits(kvm, r->bit_feat_map, r->bit_feat_map_sz, + resx = compute_resx_bits(kvm, r->bit_feat_map, r->bit_feat_map_sz, require, exclude); - res0 |= compute_res0_bits(kvm, &r->feat_map, 1, require, exclude); - res0 |= ~reg_feat_map_bits(&r->feat_map); + tmp = compute_resx_bits(kvm, &r->feat_map, 1, require, exclude); - return res0; + resx.res0 |= tmp.res0; + resx.res0 |= ~reg_feat_map_bits(&r->feat_map); + resx.res1 |= tmp.res1; + + return resx; } static u64 compute_fgu_bits(struct kvm *kvm, const struct reg_feat_map_desc *r) { + struct resx resx; + /* * If computing FGUs, we collect the unsupported feature bits as - * RES0 bits, but don't take the actual RES0 bits or register + * RESx bits, but don't take the actual RESx bits or register * existence into account -- we're not computing bits for the * register itself. */ - return compute_res0_bits(kvm, r->bit_feat_map, r->bit_feat_map_sz, + resx = compute_resx_bits(kvm, r->bit_feat_map, r->bit_feat_map_sz, 0, NEVER_FGU); + + return resx.res0 | resx.res1; } -static u64 compute_reg_fixed_bits(struct kvm *kvm, - const struct reg_feat_map_desc *r, - u64 *fixed_bits, unsigned long require, - unsigned long exclude) +static struct resx compute_reg_fixed_bits(struct kvm *kvm, + const struct reg_feat_map_desc *r, + u64 *fixed_bits, + unsigned long require, + unsigned long exclude) { return __compute_fixed_bits(kvm, r->bit_feat_map, r->bit_feat_map_sz, fixed_bits, require | FIXED_VALUE, exclude); @@ -1405,91 +1414,94 @@ void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt) kvm->arch.fgu[fgt] = val; } -void get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg, u64 *res0, u64 *res1) +struct resx get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg) { u64 fixed = 0, mask; + struct resx resx; switch (reg) { case HFGRTR_EL2: - *res0 = compute_reg_res0_bits(kvm, &hfgrtr_desc, 0, 0); - *res1 = HFGRTR_EL2_RES1; + resx = compute_reg_resx_bits(kvm, &hfgrtr_desc, 0, 0); + resx.res1 |= HFGRTR_EL2_RES1; break; case HFGWTR_EL2: - *res0 = compute_reg_res0_bits(kvm, &hfgwtr_desc, 0, 0); - *res1 = HFGWTR_EL2_RES1; + resx = compute_reg_resx_bits(kvm, &hfgwtr_desc, 0, 0); + resx.res1 |= HFGWTR_EL2_RES1; break; case HFGITR_EL2: - *res0 = compute_reg_res0_bits(kvm, &hfgitr_desc, 0, 0); - *res1 = HFGITR_EL2_RES1; + resx = compute_reg_resx_bits(kvm, &hfgitr_desc, 0, 0); + resx.res1 |= HFGITR_EL2_RES1; break; case HDFGRTR_EL2: - *res0 = compute_reg_res0_bits(kvm, &hdfgrtr_desc, 0, 0); - *res1 = HDFGRTR_EL2_RES1; + resx = compute_reg_resx_bits(kvm, &hdfgrtr_desc, 0, 0); + resx.res1 |= HDFGRTR_EL2_RES1; break; case HDFGWTR_EL2: - *res0 = compute_reg_res0_bits(kvm, &hdfgwtr_desc, 0, 0); - *res1 = HDFGWTR_EL2_RES1; + resx = compute_reg_resx_bits(kvm, &hdfgwtr_desc, 0, 0); + resx.res1 |= HDFGWTR_EL2_RES1; break; case HAFGRTR_EL2: - *res0 = compute_reg_res0_bits(kvm, &hafgrtr_desc, 0, 0); - *res1 = HAFGRTR_EL2_RES1; + resx = compute_reg_resx_bits(kvm, &hafgrtr_desc, 0, 0); + resx.res1 |= HAFGRTR_EL2_RES1; break; case HFGRTR2_EL2: - *res0 = compute_reg_res0_bits(kvm, &hfgrtr2_desc, 0, 0); - *res1 = HFGRTR2_EL2_RES1; + resx = compute_reg_resx_bits(kvm, &hfgrtr2_desc, 0, 0); + resx.res1 |= HFGRTR2_EL2_RES1; break; case HFGWTR2_EL2: - *res0 = compute_reg_res0_bits(kvm, &hfgwtr2_desc, 0, 0); - *res1 = HFGWTR2_EL2_RES1; + resx = compute_reg_resx_bits(kvm, &hfgwtr2_desc, 0, 0); + resx.res1 |= HFGWTR2_EL2_RES1; break; case HFGITR2_EL2: - *res0 = 
compute_reg_res0_bits(kvm, &hfgitr2_desc, 0, 0); - *res1 = HFGITR2_EL2_RES1; + resx = compute_reg_resx_bits(kvm, &hfgitr2_desc, 0, 0); + resx.res1 |= HFGITR2_EL2_RES1; break; case HDFGRTR2_EL2: - *res0 = compute_reg_res0_bits(kvm, &hdfgrtr2_desc, 0, 0); - *res1 = HDFGRTR2_EL2_RES1; + resx = compute_reg_resx_bits(kvm, &hdfgrtr2_desc, 0, 0); + resx.res1 |= HDFGRTR2_EL2_RES1; break; case HDFGWTR2_EL2: - *res0 = compute_reg_res0_bits(kvm, &hdfgwtr2_desc, 0, 0); - *res1 = HDFGWTR2_EL2_RES1; + resx = compute_reg_resx_bits(kvm, &hdfgwtr2_desc, 0, 0); + resx.res1 |= HDFGWTR2_EL2_RES1; break; case HCRX_EL2: - *res0 = compute_reg_res0_bits(kvm, &hcrx_desc, 0, 0); - *res1 = __HCRX_EL2_RES1; + resx = compute_reg_resx_bits(kvm, &hcrx_desc, 0, 0); + resx.res1 |= __HCRX_EL2_RES1; break; case HCR_EL2: - mask = compute_reg_fixed_bits(kvm, &hcr_desc, &fixed, 0, 0); - *res0 = compute_reg_res0_bits(kvm, &hcr_desc, 0, 0); - *res0 |= (mask & ~fixed); - *res1 = HCR_EL2_RES1 | (mask & fixed); + mask = compute_reg_fixed_bits(kvm, &hcr_desc, &fixed, 0, 0).res0; + resx = compute_reg_resx_bits(kvm, &hcr_desc, 0, 0); + resx.res0 |= (mask & ~fixed); + resx.res1 |= HCR_EL2_RES1 | (mask & fixed); break; case SCTLR2_EL1: case SCTLR2_EL2: - *res0 = compute_reg_res0_bits(kvm, &sctlr2_desc, 0, 0); - *res1 = SCTLR2_EL1_RES1; + resx = compute_reg_resx_bits(kvm, &sctlr2_desc, 0, 0); + resx.res1 |= SCTLR2_EL1_RES1; break; case TCR2_EL2: - *res0 = compute_reg_res0_bits(kvm, &tcr2_el2_desc, 0, 0); - *res1 = TCR2_EL2_RES1; + resx = compute_reg_resx_bits(kvm, &tcr2_el2_desc, 0, 0); + resx.res1 |= TCR2_EL2_RES1; break; case SCTLR_EL1: - *res0 = compute_reg_res0_bits(kvm, &sctlr_el1_desc, 0, 0); - *res1 = SCTLR_EL1_RES1; + resx = compute_reg_resx_bits(kvm, &sctlr_el1_desc, 0, 0); + resx.res1 |= SCTLR_EL1_RES1; break; case MDCR_EL2: - *res0 = compute_reg_res0_bits(kvm, &mdcr_el2_desc, 0, 0); - *res1 = MDCR_EL2_RES1; + resx = compute_reg_resx_bits(kvm, &mdcr_el2_desc, 0, 0); + resx.res1 |= MDCR_EL2_RES1; break; case VTCR_EL2: - *res0 = compute_reg_res0_bits(kvm, &vtcr_el2_desc, 0, 0); - *res1 = VTCR_EL2_RES1; + resx = compute_reg_resx_bits(kvm, &vtcr_el2_desc, 0, 0); + resx.res1 |= VTCR_EL2_RES1; break; default: WARN_ON_ONCE(1); - *res0 = *res1 = 0; + resx = (typeof(resx)){}; break; } + + return resx; } static __always_inline struct fgt_masks *__fgt_reg_to_masks(enum vcpu_sysreg reg) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 486eba72bb02..c5a45bc62153 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1683,22 +1683,19 @@ u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *vcpu, return v; } -static __always_inline void set_sysreg_masks(struct kvm *kvm, int sr, u64 res0, u64 res1) +static __always_inline void set_sysreg_masks(struct kvm *kvm, int sr, struct resx resx) { - int i = sr - __SANITISED_REG_START__; - BUILD_BUG_ON(!__builtin_constant_p(sr)); BUILD_BUG_ON(sr < __SANITISED_REG_START__); BUILD_BUG_ON(sr >= NR_SYS_REGS); - kvm->arch.sysreg_masks->mask[i].res0 = res0; - kvm->arch.sysreg_masks->mask[i].res1 = res1; + kvm_set_sysreg_resx(kvm, sr, resx); } int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; - u64 res0, res1; + struct resx resx; lockdep_assert_held(&kvm->arch.config_lock); @@ -1711,110 +1708,112 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu) return -ENOMEM; /* VTTBR_EL2 */ - res0 = res1 = 0; + resx = (typeof(resx)){}; if (!kvm_has_feat_enum(kvm, ID_AA64MMFR1_EL1, VMIDBits, 16)) - res0 |= GENMASK(63, 56); + resx.res0 |= GENMASK(63, 56); if 
(!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, CnP, IMP)) - res0 |= VTTBR_CNP_BIT; - set_sysreg_masks(kvm, VTTBR_EL2, res0, res1); + resx.res0 |= VTTBR_CNP_BIT; + set_sysreg_masks(kvm, VTTBR_EL2, resx); /* VTCR_EL2 */ - get_reg_fixed_bits(kvm, VTCR_EL2, &res0, &res1); - set_sysreg_masks(kvm, VTCR_EL2, res0, res1); + resx = get_reg_fixed_bits(kvm, VTCR_EL2); + set_sysreg_masks(kvm, VTCR_EL2, resx); /* VMPIDR_EL2 */ - res0 = GENMASK(63, 40) | GENMASK(30, 24); - res1 = BIT(31); - set_sysreg_masks(kvm, VMPIDR_EL2, res0, res1); + resx.res0 = GENMASK(63, 40) | GENMASK(30, 24); + resx.res1 = BIT(31); + set_sysreg_masks(kvm, VMPIDR_EL2, resx); /* HCR_EL2 */ - get_reg_fixed_bits(kvm, HCR_EL2, &res0, &res1); - set_sysreg_masks(kvm, HCR_EL2, res0, res1); + resx = get_reg_fixed_bits(kvm, HCR_EL2); + set_sysreg_masks(kvm, HCR_EL2, resx); /* HCRX_EL2 */ - get_reg_fixed_bits(kvm, HCRX_EL2, &res0, &res1); - set_sysreg_masks(kvm, HCRX_EL2, res0, res1); + resx = get_reg_fixed_bits(kvm, HCRX_EL2); + set_sysreg_masks(kvm, HCRX_EL2, resx); /* HFG[RW]TR_EL2 */ - get_reg_fixed_bits(kvm, HFGRTR_EL2, &res0, &res1); - set_sysreg_masks(kvm, HFGRTR_EL2, res0, res1); - get_reg_fixed_bits(kvm, HFGWTR_EL2, &res0, &res1); - set_sysreg_masks(kvm, HFGWTR_EL2, res0, res1); + resx = get_reg_fixed_bits(kvm, HFGRTR_EL2); + set_sysreg_masks(kvm, HFGRTR_EL2, resx); + resx = get_reg_fixed_bits(kvm, HFGWTR_EL2); + set_sysreg_masks(kvm, HFGWTR_EL2, resx); /* HDFG[RW]TR_EL2 */ - get_reg_fixed_bits(kvm, HDFGRTR_EL2, &res0, &res1); - set_sysreg_masks(kvm, HDFGRTR_EL2, res0, res1); - get_reg_fixed_bits(kvm, HDFGWTR_EL2, &res0, &res1); - set_sysreg_masks(kvm, HDFGWTR_EL2, res0, res1); + resx = get_reg_fixed_bits(kvm, HDFGRTR_EL2); + set_sysreg_masks(kvm, HDFGRTR_EL2, resx); + resx = get_reg_fixed_bits(kvm, HDFGWTR_EL2); + set_sysreg_masks(kvm, HDFGWTR_EL2, resx); /* HFGITR_EL2 */ - get_reg_fixed_bits(kvm, HFGITR_EL2, &res0, &res1); - set_sysreg_masks(kvm, HFGITR_EL2, res0, res1); + resx = get_reg_fixed_bits(kvm, HFGITR_EL2); + set_sysreg_masks(kvm, HFGITR_EL2, resx); /* HAFGRTR_EL2 - not a lot to see here */ - get_reg_fixed_bits(kvm, HAFGRTR_EL2, &res0, &res1); - set_sysreg_masks(kvm, HAFGRTR_EL2, res0, res1); + resx = get_reg_fixed_bits(kvm, HAFGRTR_EL2); + set_sysreg_masks(kvm, HAFGRTR_EL2, resx); /* HFG[RW]TR2_EL2 */ - get_reg_fixed_bits(kvm, HFGRTR2_EL2, &res0, &res1); - set_sysreg_masks(kvm, HFGRTR2_EL2, res0, res1); - get_reg_fixed_bits(kvm, HFGWTR2_EL2, &res0, &res1); - set_sysreg_masks(kvm, HFGWTR2_EL2, res0, res1); + resx = get_reg_fixed_bits(kvm, HFGRTR2_EL2); + set_sysreg_masks(kvm, HFGRTR2_EL2, resx); + resx = get_reg_fixed_bits(kvm, HFGWTR2_EL2); + set_sysreg_masks(kvm, HFGWTR2_EL2, resx); /* HDFG[RW]TR2_EL2 */ - get_reg_fixed_bits(kvm, HDFGRTR2_EL2, &res0, &res1); - set_sysreg_masks(kvm, HDFGRTR2_EL2, res0, res1); - get_reg_fixed_bits(kvm, HDFGWTR2_EL2, &res0, &res1); - set_sysreg_masks(kvm, HDFGWTR2_EL2, res0, res1); + resx = get_reg_fixed_bits(kvm, HDFGRTR2_EL2); + set_sysreg_masks(kvm, HDFGRTR2_EL2, resx); + resx = get_reg_fixed_bits(kvm, HDFGWTR2_EL2); + set_sysreg_masks(kvm, HDFGWTR2_EL2, resx); /* HFGITR2_EL2 */ - get_reg_fixed_bits(kvm, HFGITR2_EL2, &res0, &res1); - set_sysreg_masks(kvm, HFGITR2_EL2, res0, res1); + resx = get_reg_fixed_bits(kvm, HFGITR2_EL2); + set_sysreg_masks(kvm, HFGITR2_EL2, resx); /* TCR2_EL2 */ - get_reg_fixed_bits(kvm, TCR2_EL2, &res0, &res1); - set_sysreg_masks(kvm, TCR2_EL2, res0, res1); + resx = get_reg_fixed_bits(kvm, TCR2_EL2); + set_sysreg_masks(kvm, TCR2_EL2, resx); /* SCTLR_EL1 */ - 
get_reg_fixed_bits(kvm, SCTLR_EL1, &res0, &res1); - set_sysreg_masks(kvm, SCTLR_EL1, res0, res1); + resx = get_reg_fixed_bits(kvm, SCTLR_EL1); + set_sysreg_masks(kvm, SCTLR_EL1, resx); /* SCTLR2_ELx */ - get_reg_fixed_bits(kvm, SCTLR2_EL1, &res0, &res1); - set_sysreg_masks(kvm, SCTLR2_EL1, res0, res1); - get_reg_fixed_bits(kvm, SCTLR2_EL2, &res0, &res1); - set_sysreg_masks(kvm, SCTLR2_EL2, res0, res1); + resx = get_reg_fixed_bits(kvm, SCTLR2_EL1); + set_sysreg_masks(kvm, SCTLR2_EL1, resx); + resx = get_reg_fixed_bits(kvm, SCTLR2_EL2); + set_sysreg_masks(kvm, SCTLR2_EL2, resx); /* MDCR_EL2 */ - get_reg_fixed_bits(kvm, MDCR_EL2, &res0, &res1); - set_sysreg_masks(kvm, MDCR_EL2, res0, res1); + resx = get_reg_fixed_bits(kvm, MDCR_EL2); + set_sysreg_masks(kvm, MDCR_EL2, resx); /* CNTHCTL_EL2 */ - res0 = GENMASK(63, 20); - res1 = 0; + resx.res0 = GENMASK(63, 20); + resx.res1 = 0; if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RME, IMP)) - res0 |= CNTHCTL_CNTPMASK | CNTHCTL_CNTVMASK; + resx.res0 |= CNTHCTL_CNTPMASK | CNTHCTL_CNTVMASK; if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, CNTPOFF)) { - res0 |= CNTHCTL_ECV; + resx.res0 |= CNTHCTL_ECV; if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, IMP)) - res0 |= (CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT | - CNTHCTL_EL1NVPCT | CNTHCTL_EL1NVVCT); + resx.res0 |= (CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT | + CNTHCTL_EL1NVPCT | CNTHCTL_EL1NVVCT); } if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP)) - res0 |= GENMASK(11, 8); - set_sysreg_masks(kvm, CNTHCTL_EL2, res0, res1); + resx.res0 |= GENMASK(11, 8); + set_sysreg_masks(kvm, CNTHCTL_EL2, resx); /* ICH_HCR_EL2 */ - res0 = ICH_HCR_EL2_RES0; - res1 = ICH_HCR_EL2_RES1; + resx.res0 = ICH_HCR_EL2_RES0; + resx.res1 = ICH_HCR_EL2_RES1; if (!(kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_TDS)) - res0 |= ICH_HCR_EL2_TDIR; + resx.res0 |= ICH_HCR_EL2_TDIR; /* No GICv4 is presented to the guest */ - res0 |= ICH_HCR_EL2_DVIM | ICH_HCR_EL2_vSGIEOICount; - set_sysreg_masks(kvm, ICH_HCR_EL2, res0, res1); + resx.res0 |= ICH_HCR_EL2_DVIM | ICH_HCR_EL2_vSGIEOICount; + set_sysreg_masks(kvm, ICH_HCR_EL2, resx); /* VNCR_EL2 */ - set_sysreg_masks(kvm, VNCR_EL2, VNCR_EL2_RES0, VNCR_EL2_RES1); + resx.res0 = VNCR_EL2_RES0; + resx.res1 = VNCR_EL2_RES1; + set_sysreg_masks(kvm, VNCR_EL2, resx); out: for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++) From f9d58956423844237e18a758dc0f1b2cf6480042 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Feb 2026 18:43:14 +0000 Subject: [PATCH 222/267] KVM: arm64: Extend unified RESx handling to runtime sanitisation Add a new helper to retrieve the RESx values for a given system register, and use it for the runtime sanitisation. This results in slightly better code generation for a fairly hot path in the hypervisor, and additionally covers all sanitised registers in all conditions, not just the VNCR-based ones. 
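For reference, the sanitisation applied with these masks stays a simple mask-and-set; a minimal sketch of the effect, assuming the kvm_get_sysreg_resx() helper introduced below:

    /* Sketch only: mirrors what kvm_vcpu_apply_reg_masks() does */
    struct resx m = kvm_get_sysreg_resx(vcpu->kvm, HCR_EL2);
    u64 v = __vcpu_sys_reg(vcpu, HCR_EL2);

    v &= ~m.res0;   /* clear anything RES0 for this VM */
    v |= m.res1;    /* force anything RES1 */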
Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260202184329.2724080-6-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 15 +++++++++++++++ arch/arm64/kvm/emulate-nested.c | 10 +--------- arch/arm64/kvm/nested.c | 13 ++++--------- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 799f494a1349..20ebc1610ac8 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -635,6 +635,21 @@ struct kvm_sysreg_masks { struct resx mask[NR_SYS_REGS - __SANITISED_REG_START__]; }; +static inline struct resx __kvm_get_sysreg_resx(struct kvm_arch *arch, + enum vcpu_sysreg sr) +{ + struct kvm_sysreg_masks *masks; + + masks = arch->sysreg_masks; + if (likely(masks && + sr >= __SANITISED_REG_START__ && sr < NR_SYS_REGS)) + return masks->mask[sr - __SANITISED_REG_START__]; + + return (struct resx){}; +} + +#define kvm_get_sysreg_resx(k, sr) __kvm_get_sysreg_resx(&(k)->arch, (sr)) + static inline void __kvm_set_sysreg_resx(struct kvm_arch *arch, enum vcpu_sysreg sr, struct resx resx) { diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 774cfbf5b43b..43334cd2db9e 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -2427,15 +2427,7 @@ static enum trap_behaviour compute_trap_behaviour(struct kvm_vcpu *vcpu, static u64 kvm_get_sysreg_res0(struct kvm *kvm, enum vcpu_sysreg sr) { - struct kvm_sysreg_masks *masks; - - /* Only handle the VNCR-backed regs for now */ - if (sr < __VNCR_START__) - return 0; - - masks = kvm->arch.sysreg_masks; - - return masks->mask[sr - __SANITISED_REG_START__].res0; + return kvm_get_sysreg_resx(kvm, sr).res0; } static bool check_fgt_bit(struct kvm_vcpu *vcpu, enum vcpu_sysreg sr, diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index c5a45bc62153..75a23f1c56d1 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1669,16 +1669,11 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *vcpu, enum vcpu_sysreg sr, u64 v) { - struct kvm_sysreg_masks *masks; + struct resx resx; - masks = vcpu->kvm->arch.sysreg_masks; - - if (masks) { - sr -= __SANITISED_REG_START__; - - v &= ~masks->mask[sr].res0; - v |= masks->mask[sr].res1; - } + resx = kvm_get_sysreg_resx(vcpu->kvm, sr); + v &= ~resx.res0; + v |= resx.res1; return v; } From bbea27636e660df907ebf0d36e3dfca5c77cfbc0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Feb 2026 18:43:15 +0000 Subject: [PATCH 223/267] KVM: arm64: Inherit RESx bits from FGT register descriptors The FGT registers have their computed RESx bits stashed in specific descriptors, which we can easily use when computing the masks used for the guest. This removes a bit of boilerplate code. 
Reviewed-by: Joey Gouly Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260202184329.2724080-7-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/config.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c index 2214c06902f8..9ad7eb5f4b98 100644 --- a/arch/arm64/kvm/config.c +++ b/arch/arm64/kvm/config.c @@ -1342,6 +1342,11 @@ static struct resx compute_reg_resx_bits(struct kvm *kvm, resx = compute_resx_bits(kvm, r->bit_feat_map, r->bit_feat_map_sz, require, exclude); + if (r->feat_map.flags & MASKS_POINTER) { + resx.res0 |= r->feat_map.masks->res0; + resx.res1 |= r->feat_map.masks->res1; + } + tmp = compute_resx_bits(kvm, &r->feat_map, 1, require, exclude); resx.res0 |= tmp.res0; @@ -1422,47 +1427,36 @@ struct resx get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg) switch (reg) { case HFGRTR_EL2: resx = compute_reg_resx_bits(kvm, &hfgrtr_desc, 0, 0); - resx.res1 |= HFGRTR_EL2_RES1; break; case HFGWTR_EL2: resx = compute_reg_resx_bits(kvm, &hfgwtr_desc, 0, 0); - resx.res1 |= HFGWTR_EL2_RES1; break; case HFGITR_EL2: resx = compute_reg_resx_bits(kvm, &hfgitr_desc, 0, 0); - resx.res1 |= HFGITR_EL2_RES1; break; case HDFGRTR_EL2: resx = compute_reg_resx_bits(kvm, &hdfgrtr_desc, 0, 0); - resx.res1 |= HDFGRTR_EL2_RES1; break; case HDFGWTR_EL2: resx = compute_reg_resx_bits(kvm, &hdfgwtr_desc, 0, 0); - resx.res1 |= HDFGWTR_EL2_RES1; break; case HAFGRTR_EL2: resx = compute_reg_resx_bits(kvm, &hafgrtr_desc, 0, 0); - resx.res1 |= HAFGRTR_EL2_RES1; break; case HFGRTR2_EL2: resx = compute_reg_resx_bits(kvm, &hfgrtr2_desc, 0, 0); - resx.res1 |= HFGRTR2_EL2_RES1; break; case HFGWTR2_EL2: resx = compute_reg_resx_bits(kvm, &hfgwtr2_desc, 0, 0); - resx.res1 |= HFGWTR2_EL2_RES1; break; case HFGITR2_EL2: resx = compute_reg_resx_bits(kvm, &hfgitr2_desc, 0, 0); - resx.res1 |= HFGITR2_EL2_RES1; break; case HDFGRTR2_EL2: resx = compute_reg_resx_bits(kvm, &hdfgrtr2_desc, 0, 0); - resx.res1 |= HDFGRTR2_EL2_RES1; break; case HDFGWTR2_EL2: resx = compute_reg_resx_bits(kvm, &hdfgwtr2_desc, 0, 0); - resx.res1 |= HDFGWTR2_EL2_RES1; break; case HCRX_EL2: resx = compute_reg_resx_bits(kvm, &hcrx_desc, 0, 0); From 459fc4e77e1ac932e47cb4a6d1a01b3be79fd41c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Feb 2026 18:43:16 +0000 Subject: [PATCH 224/267] KVM: arm64: Allow RES1 bits to be inferred from configuration So far, when a bit field is tied to an unsupported feature, we set it as RES0. This is almost correct, but there are a few exceptions where the bits become RES1. Add an AS_RES1 qualifier that instructs the RESx computing code to simply do that.
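As a usage sketch (the following patch adds the real instances for SCTLR_EL1), a feature-gated field can now be declared so that it collapses to RES1 rather than RES0 when the feature is absent:

    /* SPAN reads as 1 when FEAT_PAN is not exposed to the guest */
    NEEDS_FEAT_FLAG(SCTLR_EL1_SPAN, AS_RES1, FEAT_PAN),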
Reviewed-by: Joey Gouly Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260202184329.2724080-8-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/config.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c index 9ad7eb5f4b98..72c6dd7656ba 100644 --- a/arch/arm64/kvm/config.c +++ b/arch/arm64/kvm/config.c @@ -24,6 +24,7 @@ struct reg_bits_to_feat_map { #define CALL_FUNC BIT(1) /* Needs to evaluate tons of crap */ #define FIXED_VALUE BIT(2) /* RAZ/WI or RAO/WI in KVM */ #define MASKS_POINTER BIT(3) /* Pointer to fgt_masks struct instead of bits */ +#define AS_RES1 BIT(4) /* RES1 when not supported */ unsigned long flags; @@ -1315,8 +1316,12 @@ static struct resx __compute_fixed_bits(struct kvm *kvm, else match = idreg_feat_match(kvm, &map[i]); - if (!match || (map[i].flags & FIXED_VALUE)) - resx.res0 |= reg_feat_map_bits(&map[i]); + if (!match || (map[i].flags & FIXED_VALUE)) { + if (map[i].flags & AS_RES1) + resx.res1 |= reg_feat_map_bits(&map[i]); + else + resx.res0 |= reg_feat_map_bits(&map[i]); + } } return resx; From c27b8b7aabefb8ef226c3dfbc86ef7df6a9958c2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Feb 2026 18:43:17 +0000 Subject: [PATCH 225/267] KVM: arm64: Correctly handle SCTLR_EL1 RES1 bits for unsupported features A bunch of SCTLR_EL1 bits must be set to RES1 when the controlling feature is not present. Add the AS_RES1 qualifier where needed. Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260202184329.2724080-9-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/config.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c index 72c6dd7656ba..25ef1873a6c3 100644 --- a/arch/arm64/kvm/config.c +++ b/arch/arm64/kvm/config.c @@ -1085,27 +1085,28 @@ static const DECLARE_FEAT_MAP(tcr2_el2_desc, TCR2_EL2, tcr2_el2_feat_map, FEAT_TCR2); static const struct reg_bits_to_feat_map sctlr_el1_feat_map[] = { - NEEDS_FEAT(SCTLR_EL1_CP15BEN | - SCTLR_EL1_ITD | - SCTLR_EL1_SED, - FEAT_AA32EL0), + NEEDS_FEAT(SCTLR_EL1_CP15BEN, FEAT_AA32EL0), + NEEDS_FEAT_FLAG(SCTLR_EL1_ITD | + SCTLR_EL1_SED, + AS_RES1, FEAT_AA32EL0), NEEDS_FEAT(SCTLR_EL1_BT0 | SCTLR_EL1_BT1, FEAT_BTI), NEEDS_FEAT(SCTLR_EL1_CMOW, FEAT_CMOW), - NEEDS_FEAT(SCTLR_EL1_TSCXT, feat_csv2_2_csv2_1p2), - NEEDS_FEAT(SCTLR_EL1_EIS | - SCTLR_EL1_EOS, - FEAT_ExS), + NEEDS_FEAT_FLAG(SCTLR_EL1_TSCXT, + AS_RES1, feat_csv2_2_csv2_1p2), + NEEDS_FEAT_FLAG(SCTLR_EL1_EIS | + SCTLR_EL1_EOS, + AS_RES1, FEAT_ExS), NEEDS_FEAT(SCTLR_EL1_EnFPM, FEAT_FPMR), NEEDS_FEAT(SCTLR_EL1_IESB, FEAT_IESB), NEEDS_FEAT(SCTLR_EL1_EnALS, FEAT_LS64), NEEDS_FEAT(SCTLR_EL1_EnAS0, FEAT_LS64_ACCDATA), NEEDS_FEAT(SCTLR_EL1_EnASR, FEAT_LS64_V), NEEDS_FEAT(SCTLR_EL1_nAA, FEAT_LSE2), - NEEDS_FEAT(SCTLR_EL1_LSMAOE | - SCTLR_EL1_nTLSMD, - FEAT_LSMAOC), + NEEDS_FEAT_FLAG(SCTLR_EL1_LSMAOE | + SCTLR_EL1_nTLSMD, + AS_RES1, FEAT_LSMAOC), NEEDS_FEAT(SCTLR_EL1_EE, FEAT_MixedEnd), NEEDS_FEAT(SCTLR_EL1_E0E, feat_mixedendel0), NEEDS_FEAT(SCTLR_EL1_MSCEn, FEAT_MOPS), @@ -1121,7 +1122,8 @@ static const struct reg_bits_to_feat_map sctlr_el1_feat_map[] = { NEEDS_FEAT(SCTLR_EL1_NMI | SCTLR_EL1_SPINTMASK, FEAT_NMI), - NEEDS_FEAT(SCTLR_EL1_SPAN, FEAT_PAN), + NEEDS_FEAT_FLAG(SCTLR_EL1_SPAN, + AS_RES1, FEAT_PAN), NEEDS_FEAT(SCTLR_EL1_EPAN, FEAT_PAN3), NEEDS_FEAT(SCTLR_EL1_EnDA | SCTLR_EL1_EnDB | From fb86207bdc5bf4d5743eab78d6babbecda6c2609 Mon 
Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Feb 2026 18:43:18 +0000 Subject: [PATCH 226/267] KVM: arm64: Convert HCR_EL2.RW to AS_RES1 Now that we have the AS_RES1 constraint, it becomes trivial to express the HCR_EL2.RW behaviour. Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260202184329.2724080-10-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/config.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c index 25ef1873a6c3..70aa5587d99b 100644 --- a/arch/arm64/kvm/config.c +++ b/arch/arm64/kvm/config.c @@ -389,19 +389,6 @@ static bool feat_vmid16(struct kvm *kvm) return kvm_has_feat_enum(kvm, ID_AA64MMFR1_EL1, VMIDBits, 16); } -static bool compute_hcr_rw(struct kvm *kvm, u64 *bits) -{ - /* This is purely academic: AArch32 and NV are mutually exclusive */ - if (bits) { - if (kvm_has_feat(kvm, FEAT_AA32EL1)) - *bits &= ~HCR_EL2_RW; - else - *bits |= HCR_EL2_RW; - } - - return true; -} - static bool compute_hcr_e2h(struct kvm *kvm, u64 *bits) { if (bits) { @@ -967,7 +954,7 @@ static const struct reg_bits_to_feat_map hcr_feat_map[] = { NEEDS_FEAT(HCR_EL2_TID0, FEAT_AA32EL0), - NEEDS_FEAT_FIXED(HCR_EL2_RW, compute_hcr_rw), + NEEDS_FEAT_FLAG(HCR_EL2_RW, AS_RES1, FEAT_AA32EL1), NEEDS_FEAT(HCR_EL2_HCD, not_feat_aa64el3), NEEDS_FEAT(HCR_EL2_AMO | HCR_EL2_BSU | From 8d94458263bb2d44d8ba461327a1e18c05cfc453 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Feb 2026 18:43:19 +0000 Subject: [PATCH 227/267] KVM: arm64: Simplify FIXED_VALUE handling The FIXED_VALUE qualifier (mostly used for HCR_EL2) is pointlessly complicated, as it tries to piggy-back on the previous RES0 handling while being done in a different phase, on different data. Instead, make it an integral part of the RESx computation, and allow it to directly set RESx bits. This is much easier to understand. It also paves the way for some additional changes that will allow the full removal of the FIXED_VALUE handling.
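Concretely, a fixed-value callback now fills in the RESx structure directly rather than returning a bit pattern through a pointer; the sketch below mirrors the HCR_EL2.E2H handling in the diff that follows:

    static bool compute_hcr_e2h(struct kvm *kvm, struct resx *bits)
    {
        if (kvm_has_feat(kvm, FEAT_E2H0))
            bits->res0 |= HCR_EL2_E2H;  /* E2H stuck at 0 */
        else
            bits->res1 |= HCR_EL2_E2H;  /* E2H stuck at 1 */

        return true;
    }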
Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260202184329.2724080-11-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/config.c | 66 ++++++++++++++--------------------------- 1 file changed, 22 insertions(+), 44 deletions(-) diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c index 70aa5587d99b..d3467bac40fc 100644 --- a/arch/arm64/kvm/config.c +++ b/arch/arm64/kvm/config.c @@ -37,7 +37,7 @@ struct reg_bits_to_feat_map { s8 lo_lim; }; bool (*match)(struct kvm *); - bool (*fval)(struct kvm *, u64 *); + bool (*fval)(struct kvm *, struct resx *); }; }; @@ -389,14 +389,12 @@ static bool feat_vmid16(struct kvm *kvm) return kvm_has_feat_enum(kvm, ID_AA64MMFR1_EL1, VMIDBits, 16); } -static bool compute_hcr_e2h(struct kvm *kvm, u64 *bits) +static bool compute_hcr_e2h(struct kvm *kvm, struct resx *bits) { - if (bits) { - if (kvm_has_feat(kvm, FEAT_E2H0)) - *bits &= ~HCR_EL2_E2H; - else - *bits |= HCR_EL2_E2H; - } + if (kvm_has_feat(kvm, FEAT_E2H0)) + bits->res0 |= HCR_EL2_E2H; + else + bits->res1 |= HCR_EL2_E2H; return true; } @@ -1280,12 +1278,11 @@ static bool idreg_feat_match(struct kvm *kvm, const struct reg_bits_to_feat_map } } -static struct resx __compute_fixed_bits(struct kvm *kvm, - const struct reg_bits_to_feat_map *map, - int map_size, - u64 *fixed_bits, - unsigned long require, - unsigned long exclude) +static struct resx compute_resx_bits(struct kvm *kvm, + const struct reg_bits_to_feat_map *map, + int map_size, + unsigned long require, + unsigned long exclude) { struct resx resx = {}; @@ -1298,14 +1295,18 @@ static struct resx __compute_fixed_bits(struct kvm *kvm, if (map[i].flags & exclude) continue; - if (map[i].flags & CALL_FUNC) - match = (map[i].flags & FIXED_VALUE) ? - map[i].fval(kvm, fixed_bits) : - map[i].match(kvm); - else + switch (map[i].flags & (CALL_FUNC | FIXED_VALUE)) { + case CALL_FUNC | FIXED_VALUE: + map[i].fval(kvm, &resx); + continue; + case CALL_FUNC: + match = map[i].match(kvm); + break; + default: match = idreg_feat_match(kvm, &map[i]); + } - if (!match || (map[i].flags & FIXED_VALUE)) { + if (!match) { if (map[i].flags & AS_RES1) resx.res1 |= reg_feat_map_bits(&map[i]); else @@ -1316,16 +1317,6 @@ static struct resx __compute_fixed_bits(struct kvm *kvm, return resx; } -static struct resx compute_resx_bits(struct kvm *kvm, - const struct reg_bits_to_feat_map *map, - int map_size, - unsigned long require, - unsigned long exclude) -{ - return __compute_fixed_bits(kvm, map, map_size, NULL, - require, exclude | FIXED_VALUE); -} - static struct resx compute_reg_resx_bits(struct kvm *kvm, const struct reg_feat_map_desc *r, unsigned long require, @@ -1366,16 +1357,6 @@ static u64 compute_fgu_bits(struct kvm *kvm, const struct reg_feat_map_desc *r) return resx.res0 | resx.res1; } -static struct resx compute_reg_fixed_bits(struct kvm *kvm, - const struct reg_feat_map_desc *r, - u64 *fixed_bits, - unsigned long require, - unsigned long exclude) -{ - return __compute_fixed_bits(kvm, r->bit_feat_map, r->bit_feat_map_sz, - fixed_bits, require | FIXED_VALUE, exclude); -} - void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt) { u64 val = 0; @@ -1415,7 +1396,6 @@ void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt) struct resx get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg) { - u64 fixed = 0, mask; struct resx resx; switch (reg) { @@ -1457,10 +1437,8 @@ struct resx get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg) resx.res1 |= __HCRX_EL2_RES1; break; case HCR_EL2: - mask = 
compute_reg_fixed_bits(kvm, &hcr_desc, &fixed, 0, 0).res0; resx = compute_reg_resx_bits(kvm, &hcr_desc, 0, 0); - resx.res0 |= (mask & ~fixed); - resx.res1 |= HCR_EL2_RES1 | (mask & fixed); + resx.res1 |= HCR_EL2_RES1; break; case SCTLR2_EL1: case SCTLR2_EL2: From ad90512f12fef5506d1f72cdfbd720eb701eab8c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Feb 2026 18:43:20 +0000 Subject: [PATCH 228/267] KVM: arm64: Add REQUIRES_E2H1 constraint as configuration flags A bunch of EL2 configurations are very similar to their EL1 counterparts, with the added constraint of HCR_EL2.E2H being 1. For us, this means HCR_EL2.E2H being RES1, which is something we can statically evaluate. Add a REQUIRES_E2H1 constraint, which allows us to express conditions in a much simpler way (without extra code). Existing occurrences are converted, before we add a lot more. Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260202184329.2724080-12-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/config.c | 38 ++++++++++++++------------------------ 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c index d3467bac40fc..84c577672e97 100644 --- a/arch/arm64/kvm/config.c +++ b/arch/arm64/kvm/config.c @@ -25,6 +25,7 @@ struct reg_bits_to_feat_map { #define NEVER_FGU BIT(0) /* Can trap, but never UNDEF */ #define CALL_FUNC BIT(1) /* Needs to evaluate tons of crap */ #define FIXED_VALUE BIT(2) /* RAZ/WI or RAO/WI in KVM */ #define MASKS_POINTER BIT(3) /* Pointer to fgt_masks struct instead of bits */ #define AS_RES1 BIT(4) /* RES1 when not supported */ +#define REQUIRES_E2H1 BIT(5) /* Add HCR_EL2.E2H RES1 as a pre-condition */ unsigned long flags; @@ -311,21 +312,6 @@ static bool feat_trbe_mpam(struct kvm *kvm) (read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_MPAM)); } -static bool feat_asid2_e2h1(struct kvm *kvm) -{ - return kvm_has_feat(kvm, FEAT_ASID2) && !kvm_has_feat(kvm, FEAT_E2H0); -} - -static bool feat_d128_e2h1(struct kvm *kvm) -{ - return kvm_has_feat(kvm, FEAT_D128) && !kvm_has_feat(kvm, FEAT_E2H0); -} - -static bool feat_mec_e2h1(struct kvm *kvm) -{ - return kvm_has_feat(kvm, FEAT_MEC) && !kvm_has_feat(kvm, FEAT_E2H0); -} - static bool feat_ebep_pmuv3_ss(struct kvm *kvm) { return kvm_has_feat(kvm, FEAT_EBEP) || kvm_has_feat(kvm, FEAT_PMUv3_SS); @@ -1045,15 +1031,15 @@ static const DECLARE_FEAT_MAP(sctlr2_desc, SCTLR2_EL1, sctlr2_feat_map, FEAT_SCTLR2); static const struct reg_bits_to_feat_map tcr2_el2_feat_map[] = { - NEEDS_FEAT(TCR2_EL2_FNG1 | - TCR2_EL2_FNG0 | - TCR2_EL2_A2, - feat_asid2_e2h1), - NEEDS_FEAT(TCR2_EL2_DisCH1 | - TCR2_EL2_DisCH0 | - TCR2_EL2_D128, - feat_d128_e2h1), - NEEDS_FEAT(TCR2_EL2_AMEC1, feat_mec_e2h1), + NEEDS_FEAT_FLAG(TCR2_EL2_FNG1 | + TCR2_EL2_FNG0 | + TCR2_EL2_A2, + REQUIRES_E2H1, FEAT_ASID2), + NEEDS_FEAT_FLAG(TCR2_EL2_DisCH1 | + TCR2_EL2_DisCH0 | + TCR2_EL2_D128, + REQUIRES_E2H1, FEAT_D128), + NEEDS_FEAT_FLAG(TCR2_EL2_AMEC1, REQUIRES_E2H1, FEAT_MEC), NEEDS_FEAT(TCR2_EL2_AMEC0, FEAT_MEC), NEEDS_FEAT(TCR2_EL2_HAFT, FEAT_HAFT), NEEDS_FEAT(TCR2_EL2_PTTWI | @@ -1284,6 +1270,7 @@ static struct resx compute_resx_bits(struct kvm *kvm, unsigned long require, unsigned long exclude) { + bool e2h0 = kvm_has_feat(kvm, FEAT_E2H0); struct resx resx = {}; for (int i = 0; i < map_size; i++) { @@ -1306,6 +1293,9 @@ static struct resx compute_resx_bits(struct kvm *kvm, match = idreg_feat_match(kvm, &map[i]); } + if (map[i].flags & REQUIRES_E2H1) + match &= !e2h0; + if (!match) { if (map[i].flags & AS_RES1) resx.res1 |= reg_feat_map_bits(&map[i]); From d406fcb2030e3efe2c5a7f043028cb3727f522d8 Mon Sep 17 00:00:00 2001
From: Marc Zyngier Date: Mon, 2 Feb 2026 18:43:21 +0000 Subject: [PATCH 229/267] KVM: arm64: Add RES1_WHEN_E2Hx constraints as configuration flags "Thanks" to VHE, SCTLR_EL2 radically changes shape depending on the value of HCR_EL2.E2H, as a lot of the bits that didn't have much meaning with E2H=0 start impacting EL0 with E2H=1. This has a direct impact on the RESx behaviour of these bits, and we need a way to express them. For this purpose, introduce two new constraints that, when the controlling feature is not present, force the field to RES1 depending on the value of E2H. Note that RES0 is still implicit. This allows diverging RESx values depending on the value of E2H, something that is required by a bunch of SCTLR_EL2 bits. Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260202184329.2724080-13-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/config.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c index 84c577672e97..7e8e42c1cee4 100644 --- a/arch/arm64/kvm/config.c +++ b/arch/arm64/kvm/config.c @@ -26,6 +26,8 @@ struct reg_bits_to_feat_map { #define MASKS_POINTER BIT(3) /* Pointer to fgt_masks struct instead of bits */ #define AS_RES1 BIT(4) /* RES1 when not supported */ #define REQUIRES_E2H1 BIT(5) /* Add HCR_EL2.E2H RES1 as a pre-condition */ +#define RES1_WHEN_E2H0 BIT(6) /* RES1 when E2H=0 and not supported */ +#define RES1_WHEN_E2H1 BIT(7) /* RES1 when E2H=1 and not supported */ unsigned long flags; @@ -1297,10 +1299,14 @@ static struct resx compute_resx_bits(struct kvm *kvm, match &= !e2h0; if (!match) { - if (map[i].flags & AS_RES1) - resx.res1 |= reg_feat_map_bits(&map[i]); + u64 bits = reg_feat_map_bits(&map[i]); + + if ((map[i].flags & AS_RES1) || + (e2h0 && (map[i].flags & RES1_WHEN_E2H0)) || + (!e2h0 && (map[i].flags & RES1_WHEN_E2H1))) + resx.res1 |= bits; else - resx.res0 |= reg_feat_map_bits(&map[i]); + resx.res0 |= bits; } } From d2f629aa75bef1c346f17ca195271582dafc6f3b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Feb 2026 18:43:22 +0000 Subject: [PATCH 230/267] KVM: arm64: Move RESx into individual register descriptors Instead of hacking the RES1 bits at runtime, move them into the register descriptors. This makes it significantly nicer.
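In practice each register's bit map now ends with its architectural RESx entries, so nothing has to be patched in after the fact; a sketch of the resulting idiom (the actual entries are in the diff below):

    /* Architectural RESx bits now live in the map itself */
    FORCE_RES0(HCR_EL2_RES0),
    FORCE_RES1(HCR_EL2_RES1),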
NEEDS_FEAT_FLAG(m, 0, __VA_ARGS__) +/* Declare fixed RESx bits */ +#define FORCE_RES0(m) NEEDS_FEAT_FLAG(m, FORCE_RESx) +#define FORCE_RES1(m) NEEDS_FEAT_FLAG(m, FORCE_RESx | AS_RES1) + /* - * Declare the dependency between a non-FGT register, a set of - * feature, and the set of individual bits it contains. This generates - * a struct reg_feat_map_desc. + * Declare the dependency between a non-FGT register, a set of features, + * and the set of individual bits it contains. This generates a struct + * reg_feat_map_desc. */ #define DECLARE_FEAT_MAP(n, r, m, f) \ struct reg_feat_map_desc n = { \ @@ -1007,6 +1018,8 @@ static const struct reg_bits_to_feat_map hcr_feat_map[] = { HCR_EL2_TWEDEn, FEAT_TWED), NEEDS_FEAT_FIXED(HCR_EL2_E2H, compute_hcr_e2h), + FORCE_RES0(HCR_EL2_RES0), + FORCE_RES1(HCR_EL2_RES1), }; static const DECLARE_FEAT_MAP(hcr_desc, HCR_EL2, @@ -1027,6 +1040,8 @@ static const struct reg_bits_to_feat_map sctlr2_feat_map[] = { SCTLR2_EL1_CPTM | SCTLR2_EL1_CPTM0, FEAT_CPA2), + FORCE_RES0(SCTLR2_EL1_RES0), + FORCE_RES1(SCTLR2_EL1_RES1), }; static const DECLARE_FEAT_MAP(sctlr2_desc, SCTLR2_EL1, @@ -1052,6 +1067,8 @@ static const struct reg_bits_to_feat_map tcr2_el2_feat_map[] = { TCR2_EL2_E0POE, FEAT_S1POE), NEEDS_FEAT(TCR2_EL2_PIE, FEAT_S1PIE), + FORCE_RES0(TCR2_EL2_RES0), + FORCE_RES1(TCR2_EL2_RES1), }; static const DECLARE_FEAT_MAP(tcr2_el2_desc, TCR2_EL2, @@ -1129,6 +1146,8 @@ static const struct reg_bits_to_feat_map sctlr_el1_feat_map[] = { SCTLR_EL1_A | SCTLR_EL1_M, FEAT_AA64EL1), + FORCE_RES0(SCTLR_EL1_RES0), + FORCE_RES1(SCTLR_EL1_RES1), }; static const DECLARE_FEAT_MAP(sctlr_el1_desc, SCTLR_EL1, @@ -1163,6 +1182,8 @@ static const struct reg_bits_to_feat_map mdcr_el2_feat_map[] = { MDCR_EL2_TDE | MDCR_EL2_TDRA, FEAT_AA64EL1), + FORCE_RES0(MDCR_EL2_RES0), + FORCE_RES1(MDCR_EL2_RES1), }; static const DECLARE_FEAT_MAP(mdcr_el2_desc, MDCR_EL2, @@ -1201,6 +1222,8 @@ static const struct reg_bits_to_feat_map vtcr_el2_feat_map[] = { VTCR_EL2_SL0 | VTCR_EL2_T0SZ, FEAT_AA64EL1), + FORCE_RES0(VTCR_EL2_RES0), + FORCE_RES1(VTCR_EL2_RES1), }; static const DECLARE_FEAT_MAP(vtcr_el2_desc, VTCR_EL2, @@ -1211,8 +1234,14 @@ static void __init check_feat_map(const struct reg_bits_to_feat_map *map, { u64 mask = 0; + /* + * Don't account for FORCE_RESx that are architectural, and + * therefore part of the resx parameter. Other FORCE_RESx bits + * are implementation choices, and therefore accounted for. 
+ */ for (int i = 0; i < map_size; i++) - mask |= map[i].bits; + if (!((map[i].flags & FORCE_RESx) && (map[i].bits & resx))) + mask |= map[i].bits; if (mask != ~resx) kvm_err("Undefined %s behaviour, bits %016llx\n", @@ -1284,13 +1313,16 @@ static struct resx compute_resx_bits(struct kvm *kvm, if (map[i].flags & exclude) continue; - switch (map[i].flags & (CALL_FUNC | FIXED_VALUE)) { + switch (map[i].flags & (FORCE_RESx | CALL_FUNC | FIXED_VALUE)) { case CALL_FUNC | FIXED_VALUE: map[i].fval(kvm, &resx); continue; case CALL_FUNC: match = map[i].match(kvm); break; + case FORCE_RESx: + match = false; + break; default: match = idreg_feat_match(kvm, &map[i]); } @@ -1434,28 +1466,22 @@ struct resx get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg) break; case HCR_EL2: resx = compute_reg_resx_bits(kvm, &hcr_desc, 0, 0); - resx.res1 |= HCR_EL2_RES1; break; case SCTLR2_EL1: case SCTLR2_EL2: resx = compute_reg_resx_bits(kvm, &sctlr2_desc, 0, 0); - resx.res1 |= SCTLR2_EL1_RES1; break; case TCR2_EL2: resx = compute_reg_resx_bits(kvm, &tcr2_el2_desc, 0, 0); - resx.res1 |= TCR2_EL2_RES1; break; case SCTLR_EL1: resx = compute_reg_resx_bits(kvm, &sctlr_el1_desc, 0, 0); - resx.res1 |= SCTLR_EL1_RES1; break; case MDCR_EL2: resx = compute_reg_resx_bits(kvm, &mdcr_el2_desc, 0, 0); - resx.res1 |= MDCR_EL2_RES1; break; case VTCR_EL2: resx = compute_reg_resx_bits(kvm, &vtcr_el2_desc, 0, 0); - resx.res1 |= VTCR_EL2_RES1; break; default: WARN_ON_ONCE(1); From f01e3429cf0e4b1ab20c9d51ebfa0d8514d8fe4d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Feb 2026 18:43:23 +0000 Subject: [PATCH 231/267] KVM: arm64: Simplify handling of HCR_EL2.E2H RESx Now that we can link the RESx behaviour with the value of HCR_EL2.E2H, we can trivially express the tautological constraint that makes E2H a reserved value at all times. Fun, isn't it? Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260202184329.2724080-15-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/config.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c index 474d5c8038c2..ae72f3b8e50b 100644 --- a/arch/arm64/kvm/config.c +++ b/arch/arm64/kvm/config.c @@ -388,16 +388,6 @@ static bool feat_vmid16(struct kvm *kvm) return kvm_has_feat_enum(kvm, ID_AA64MMFR1_EL1, VMIDBits, 16); } -static bool compute_hcr_e2h(struct kvm *kvm, struct resx *bits) -{ - if (kvm_has_feat(kvm, FEAT_E2H0)) - bits->res0 |= HCR_EL2_E2H; - else - bits->res1 |= HCR_EL2_E2H; - - return true; -} - static const struct reg_bits_to_feat_map hfgrtr_feat_map[] = { NEEDS_FEAT(HFGRTR_EL2_nAMAIR2_EL1 | HFGRTR_EL2_nMAIR2_EL1, @@ -1017,7 +1007,7 @@ static const struct reg_bits_to_feat_map hcr_feat_map[] = { NEEDS_FEAT(HCR_EL2_TWEDEL | HCR_EL2_TWEDEn, FEAT_TWED), - NEEDS_FEAT_FIXED(HCR_EL2_E2H, compute_hcr_e2h), + NEEDS_FEAT_FLAG(HCR_EL2_E2H, RES1_WHEN_E2H1 | FORCE_RESx), FORCE_RES0(HCR_EL2_RES0), FORCE_RES1(HCR_EL2_RES1), }; From ab1f377b4c930e8d117cc461bd64d5866fc6aab8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Feb 2026 18:43:24 +0000 Subject: [PATCH 232/267] KVM: arm64: Get rid of FIXED_VALUE altogether We have now killed every occurrences of FIXED_VALUE, and we can therefore drop the whole infrastructure. Good riddance. 
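With FIXED_VALUE and the fval() callback gone, the per-bit evaluation in compute_resx_bits() reduces to a plain flag check; the resulting logic is simply:

    if (map[i].flags & FORCE_RESx)
        match = false;
    else if (map[i].flags & CALL_FUNC)
        match = map[i].match(kvm);
    else
        match = idreg_feat_match(kvm, &map[i]);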
Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260202184329.2724080-16-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/config.c | 29 +++++------------------------ 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c index ae72f3b8e50b..274ae049c4b3 100644 --- a/arch/arm64/kvm/config.c +++ b/arch/arm64/kvm/config.c @@ -22,13 +22,12 @@ struct reg_bits_to_feat_map { #define NEVER_FGU BIT(0) /* Can trap, but never UNDEF */ #define CALL_FUNC BIT(1) /* Needs to evaluate tons of crap */ -#define FIXED_VALUE BIT(2) /* RAZ/WI or RAO/WI in KVM */ +#define FORCE_RESx BIT(2) /* Unconditional RESx */ #define MASKS_POINTER BIT(3) /* Pointer to fgt_masks struct instead of bits */ #define AS_RES1 BIT(4) /* RES1 when not supported */ #define REQUIRES_E2H1 BIT(5) /* Add HCR_EL2.E2H RES1 as a pre-condition */ #define RES1_WHEN_E2H0 BIT(6) /* RES1 when E2H=0 and not supported */ #define RES1_WHEN_E2H1 BIT(7) /* RES1 when E2H=1 and not supported */ -#define FORCE_RESx BIT(8) /* Unconditional RESx */ unsigned long flags; @@ -41,7 +40,6 @@ struct reg_bits_to_feat_map { s8 lo_lim; }; bool (*match)(struct kvm *); - bool (*fval)(struct kvm *, struct resx *); }; }; @@ -74,13 +72,6 @@ struct reg_feat_map_desc { .lo_lim = id ##_## fld ##_## lim \ } -#define __NEEDS_FEAT_2(m, f, w, fun, dummy) \ - { \ - .w = (m), \ - .flags = (f) | CALL_FUNC, \ - .fval = (fun), \ - } - #define __NEEDS_FEAT_1(m, f, w, fun) \ { \ .w = (m), \ @@ -100,9 +91,6 @@ struct reg_feat_map_desc { #define NEEDS_FEAT_FLAG(m, f, ...) \ __NEEDS_FEAT_FLAG(m, f, bits, __VA_ARGS__) -#define NEEDS_FEAT_FIXED(m, ...) \ - __NEEDS_FEAT_FLAG(m, FIXED_VALUE, bits, __VA_ARGS__, 0) - #define NEEDS_FEAT_MASKS(p, ...) \ __NEEDS_FEAT_FLAG(p, MASKS_POINTER, masks, __VA_ARGS__) @@ -1303,19 +1291,12 @@ static struct resx compute_resx_bits(struct kvm *kvm, if (map[i].flags & exclude) continue; - switch (map[i].flags & (FORCE_RESx | CALL_FUNC | FIXED_VALUE)) { - case CALL_FUNC | FIXED_VALUE: - map[i].fval(kvm, &resx); - continue; - case CALL_FUNC: - match = map[i].match(kvm); - break; - case FORCE_RESx: + if (map[i].flags & FORCE_RESx) match = false; - break; - default: + else if (map[i].flags & CALL_FUNC) + match = map[i].match(kvm); + else match = idreg_feat_match(kvm, &map[i]); - } if (map[i].flags & REQUIRES_E2H1) match &= !e2h0; From d784cfe697abdfd53f332d39d0cffcf03cbcafaa Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Feb 2026 18:43:25 +0000 Subject: [PATCH 233/267] KVM: arm64: Simplify handling of full register invalid constraint Now that we embed the RESx bits in the register description, it becomes easier to deal with registers that are simply not valid, as their existence is not satisfied by the configuration (SCTLR2_ELx without FEAT_SCTLR2, for example). Such registers essentially become RES0 for any bit that wasn't already advertised as RESx. 
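In compute_reg_resx_bits(), this boils down to folding the register-level feature check into RES0 and then clearing any (unlikely) overlap with RES1:

    resx.res0 |= compute_resx_bits(kvm, &r->feat_map, 1, require, exclude).res0;
    resx.res1 &= ~resx.res0;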
Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260202184329.2724080-17-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/config.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c index 274ae049c4b3..b37b40744db9 100644 --- a/arch/arm64/kvm/config.c +++ b/arch/arm64/kvm/config.c @@ -1321,7 +1321,7 @@ static struct resx compute_reg_resx_bits(struct kvm *kvm, unsigned long require, unsigned long exclude) { - struct resx resx, tmp; + struct resx resx; resx = compute_resx_bits(kvm, r->bit_feat_map, r->bit_feat_map_sz, require, exclude); @@ -1331,11 +1331,14 @@ static struct resx compute_reg_resx_bits(struct kvm *kvm, resx.res1 |= r->feat_map.masks->res1; } - tmp = compute_resx_bits(kvm, &r->feat_map, 1, require, exclude); - - resx.res0 |= tmp.res0; - resx.res0 |= ~reg_feat_map_bits(&r->feat_map); - resx.res1 |= tmp.res1; + /* + * If the register itself was not valid, all the non-RESx bits are + * now considered RES0 (this matches the behaviour of registers such + * as SCTLR2 and TCR2). Weed out any potential (though unlikely) + * overlap with RES1 bits coming from the previous computation. + */ + resx.res0 |= compute_resx_bits(kvm, &r->feat_map, 1, require, exclude).res0; + resx.res1 &= ~resx.res0; return resx; } From d65bf6e317e7bb13612bd94e01c5a11b6fc67e9d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Feb 2026 18:43:26 +0000 Subject: [PATCH 234/267] KVM: arm64: Remove all traces of FEAT_TME FEAT_TME has been dropped from the architecture. Retrospectively. I'm sure someone is crying somewhere, but most of us won't. Clean-up time. Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260202184329.2724080-18-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/config.c | 7 ------- arch/arm64/kvm/nested.c | 5 ----- arch/arm64/tools/sysreg | 12 +++--------- tools/perf/Documentation/perf-arm-spe.txt | 1 - tools/testing/selftests/kvm/arm64/set_id_regs.c | 1 - 5 files changed, 3 insertions(+), 23 deletions(-) diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c index b37b40744db9..c1b76a76a5e4 100644 --- a/arch/arm64/kvm/config.c +++ b/arch/arm64/kvm/config.c @@ -187,7 +187,6 @@ struct reg_feat_map_desc { #define FEAT_RME ID_AA64PFR0_EL1, RME, IMP #define FEAT_MPAM ID_AA64PFR0_EL1, MPAM, 1 #define FEAT_S2FWB ID_AA64MMFR2_EL1, FWB, IMP -#define FEAT_TME ID_AA64ISAR0_EL1, TME, IMP #define FEAT_TWED ID_AA64MMFR1_EL1, TWED, IMP #define FEAT_E2H0 ID_AA64MMFR4_EL1, E2H0, IMP #define FEAT_SRMASK ID_AA64MMFR4_EL1, SRMASK, IMP @@ -991,7 +990,6 @@ static const struct reg_bits_to_feat_map hcr_feat_map[] = { NEEDS_FEAT(HCR_EL2_FIEN, feat_rasv1p1), NEEDS_FEAT(HCR_EL2_GPF, FEAT_RME), NEEDS_FEAT(HCR_EL2_FWB, FEAT_S2FWB), - NEEDS_FEAT(HCR_EL2_TME, FEAT_TME), NEEDS_FEAT(HCR_EL2_TWEDEL | HCR_EL2_TWEDEn, FEAT_TWED), @@ -1102,11 +1100,6 @@ static const struct reg_bits_to_feat_map sctlr_el1_feat_map[] = { NEEDS_FEAT(SCTLR_EL1_EnRCTX, FEAT_SPECRES), NEEDS_FEAT(SCTLR_EL1_DSSBS, FEAT_SSBS), NEEDS_FEAT(SCTLR_EL1_TIDCP, FEAT_TIDCP1), - NEEDS_FEAT(SCTLR_EL1_TME0 | - SCTLR_EL1_TME | - SCTLR_EL1_TMT0 | - SCTLR_EL1_TMT, - FEAT_TME), NEEDS_FEAT(SCTLR_EL1_TWEDEL | SCTLR_EL1_TWEDEn, FEAT_TWED), diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 75a23f1c56d1..96e899dbd919 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1505,11 +1505,6 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) u64 orig_val = val; 
switch (reg) { - case SYS_ID_AA64ISAR0_EL1: - /* Support everything but TME */ - val &= ~ID_AA64ISAR0_EL1_TME; - break; - case SYS_ID_AA64ISAR1_EL1: /* Support everything but LS64 and Spec Invalidation */ val &= ~(ID_AA64ISAR1_EL1_LS64 | diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 969a75615d61..650d7d477087 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1856,10 +1856,7 @@ UnsignedEnum 31:28 RDM 0b0000 NI 0b0001 IMP EndEnum -UnsignedEnum 27:24 TME - 0b0000 NI - 0b0001 IMP -EndEnum +Res0 27:24 UnsignedEnum 23:20 ATOMIC 0b0000 NI 0b0010 IMP @@ -2432,10 +2429,7 @@ Field 57 EPAN Field 56 EnALS Field 55 EnAS0 Field 54 EnASR -Field 53 TME -Field 52 TME0 -Field 51 TMT -Field 50 TMT0 +Res0 53:50 Field 49:46 TWEDEL Field 45 TWEDEn Field 44 DSSBS @@ -3840,7 +3834,7 @@ Field 43 NV1 Field 42 NV Field 41 API Field 40 APK -Field 39 TME +Res0 39 Field 38 MIOCNCE Field 37 TEA Field 36 TERR diff --git a/tools/perf/Documentation/perf-arm-spe.txt b/tools/perf/Documentation/perf-arm-spe.txt index 8b02e5b983fa..201a82bec0de 100644 --- a/tools/perf/Documentation/perf-arm-spe.txt +++ b/tools/perf/Documentation/perf-arm-spe.txt @@ -176,7 +176,6 @@ and inv_event_filter are: bit 10 - Remote access (FEAT_SPEv1p4) bit 11 - Misaligned access (FEAT_SPEv1p1) bit 12-15 - IMPLEMENTATION DEFINED events (when implemented) - bit 16 - Transaction (FEAT_TME) bit 17 - Partial or empty SME or SVE predicate (FEAT_SPEv1p1) bit 18 - Empty SME or SVE predicate (FEAT_SPEv1p1) bit 19 - L2D access (FEAT_SPEv1p4) diff --git a/tools/testing/selftests/kvm/arm64/set_id_regs.c b/tools/testing/selftests/kvm/arm64/set_id_regs.c index c4815d365816..73de5be58bab 100644 --- a/tools/testing/selftests/kvm/arm64/set_id_regs.c +++ b/tools/testing/selftests/kvm/arm64/set_id_regs.c @@ -91,7 +91,6 @@ static const struct reg_ftr_bits ftr_id_aa64isar0_el1[] = { REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, SM3, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, SHA3, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, RDM, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, TME, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, ATOMIC, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, CRC32, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, SHA2, 0), From fb40cb15e8ad1e7511966a953de0f409aaae4398 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Feb 2026 18:43:27 +0000 Subject: [PATCH 235/267] KVM: arm64: Remove all traces of HCR_EL2.MIOCNCE MIOCNCE had the potential to eat your data, and also was never implemented by anyone. It's been retrospectively removed from the architecture, and we're happy to follow that lead. 
Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260202184329.2724080-19-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/config.c | 1 - arch/arm64/tools/sysreg | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c index c1b76a76a5e4..8640f9c9b2e0 100644 --- a/arch/arm64/kvm/config.c +++ b/arch/arm64/kvm/config.c @@ -938,7 +938,6 @@ static const struct reg_bits_to_feat_map hcr_feat_map[] = { HCR_EL2_FMO | HCR_EL2_ID | HCR_EL2_IMO | - HCR_EL2_MIOCNCE | HCR_EL2_PTW | HCR_EL2_SWIO | HCR_EL2_TACR | diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 650d7d477087..724e6ad966c2 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -3834,8 +3834,7 @@ Field 43 NV1 Field 42 NV Field 41 API Field 40 APK -Res0 39 -Field 38 MIOCNCE +Res0 39:38 Field 37 TEA Field 36 TERR Field 35 TLOR From e8ef27900c6a8c0727dcf62d4112d08c5046d33e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Feb 2026 18:43:28 +0000 Subject: [PATCH 236/267] KVM: arm64: Add sanitisation to SCTLR_EL2 Sanitise SCTLR_EL2 the usual way. The most important aspect of this is that we benefit from SCTLR_EL2.SPAN being RES1 when HCR_EL2.E2H==0. Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260202184329.2724080-20-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 2 +- arch/arm64/kvm/config.c | 82 +++++++++++++++++++++++++++++++ arch/arm64/kvm/nested.c | 4 ++ 3 files changed, 87 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 20ebc1610ac8..771ef1b61f9a 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -495,7 +495,6 @@ enum vcpu_sysreg { DBGVCR32_EL2, /* Debug Vector Catch Register */ /* EL2 registers */ - SCTLR_EL2, /* System Control Register (EL2) */ ACTLR_EL2, /* Auxiliary Control Register (EL2) */ CPTR_EL2, /* Architectural Feature Trap Register (EL2) */ HACR_EL2, /* Hypervisor Auxiliary Control Register */ @@ -526,6 +525,7 @@ enum vcpu_sysreg { /* Anything from this can be RES0/RES1 sanitised */ MARKER(__SANITISED_REG_START__), + SCTLR_EL2, /* System Control Register (EL2) */ TCR2_EL2, /* Extended Translation Control Register (EL2) */ SCTLR2_EL2, /* System Control Register 2 (EL2) */ MDCR_EL2, /* Monitor Debug Configuration Register (EL2) */ diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c index 8640f9c9b2e0..d9f553cbf9df 100644 --- a/arch/arm64/kvm/config.c +++ b/arch/arm64/kvm/config.c @@ -1123,6 +1123,84 @@ static const struct reg_bits_to_feat_map sctlr_el1_feat_map[] = { static const DECLARE_FEAT_MAP(sctlr_el1_desc, SCTLR_EL1, sctlr_el1_feat_map, FEAT_AA64EL1); +static const struct reg_bits_to_feat_map sctlr_el2_feat_map[] = { + NEEDS_FEAT_FLAG(SCTLR_EL2_CP15BEN, + RES1_WHEN_E2H0 | REQUIRES_E2H1, + FEAT_AA32EL0), + NEEDS_FEAT_FLAG(SCTLR_EL2_ITD | + SCTLR_EL2_SED, + RES1_WHEN_E2H1 | REQUIRES_E2H1, + FEAT_AA32EL0), + NEEDS_FEAT_FLAG(SCTLR_EL2_BT0, REQUIRES_E2H1, FEAT_BTI), + NEEDS_FEAT(SCTLR_EL2_BT, FEAT_BTI), + NEEDS_FEAT_FLAG(SCTLR_EL2_CMOW, REQUIRES_E2H1, FEAT_CMOW), + NEEDS_FEAT_FLAG(SCTLR_EL2_TSCXT, + RES1_WHEN_E2H1 | REQUIRES_E2H1, + feat_csv2_2_csv2_1p2), + NEEDS_FEAT_FLAG(SCTLR_EL2_EIS | + SCTLR_EL2_EOS, + AS_RES1, FEAT_ExS), + NEEDS_FEAT(SCTLR_EL2_EnFPM, FEAT_FPMR), + NEEDS_FEAT(SCTLR_EL2_IESB, FEAT_IESB), + NEEDS_FEAT_FLAG(SCTLR_EL2_EnALS, REQUIRES_E2H1, FEAT_LS64), + 
NEEDS_FEAT_FLAG(SCTLR_EL2_EnAS0, REQUIRES_E2H1, FEAT_LS64_ACCDATA), + NEEDS_FEAT_FLAG(SCTLR_EL2_EnASR, REQUIRES_E2H1, FEAT_LS64_V), + NEEDS_FEAT(SCTLR_EL2_nAA, FEAT_LSE2), + NEEDS_FEAT_FLAG(SCTLR_EL2_LSMAOE | + SCTLR_EL2_nTLSMD, + AS_RES1 | REQUIRES_E2H1, FEAT_LSMAOC), + NEEDS_FEAT(SCTLR_EL2_EE, FEAT_MixedEnd), + NEEDS_FEAT_FLAG(SCTLR_EL2_E0E, REQUIRES_E2H1, feat_mixedendel0), + NEEDS_FEAT_FLAG(SCTLR_EL2_MSCEn, REQUIRES_E2H1, FEAT_MOPS), + NEEDS_FEAT_FLAG(SCTLR_EL2_ATA0 | + SCTLR_EL2_TCF0, + REQUIRES_E2H1, FEAT_MTE2), + NEEDS_FEAT(SCTLR_EL2_ATA | + SCTLR_EL2_TCF, + FEAT_MTE2), + NEEDS_FEAT(SCTLR_EL2_ITFSB, feat_mte_async), + NEEDS_FEAT_FLAG(SCTLR_EL2_TCSO0, REQUIRES_E2H1, FEAT_MTE_STORE_ONLY), + NEEDS_FEAT(SCTLR_EL2_TCSO, + FEAT_MTE_STORE_ONLY), + NEEDS_FEAT(SCTLR_EL2_NMI | + SCTLR_EL2_SPINTMASK, + FEAT_NMI), + NEEDS_FEAT_FLAG(SCTLR_EL2_SPAN, AS_RES1 | REQUIRES_E2H1, FEAT_PAN), + NEEDS_FEAT_FLAG(SCTLR_EL2_EPAN, REQUIRES_E2H1, FEAT_PAN3), + NEEDS_FEAT(SCTLR_EL2_EnDA | + SCTLR_EL2_EnDB | + SCTLR_EL2_EnIA | + SCTLR_EL2_EnIB, + feat_pauth), + NEEDS_FEAT_FLAG(SCTLR_EL2_EnTP2, REQUIRES_E2H1, FEAT_SME), + NEEDS_FEAT(SCTLR_EL2_EnRCTX, FEAT_SPECRES), + NEEDS_FEAT(SCTLR_EL2_DSSBS, FEAT_SSBS), + NEEDS_FEAT_FLAG(SCTLR_EL2_TIDCP, REQUIRES_E2H1, FEAT_TIDCP1), + NEEDS_FEAT_FLAG(SCTLR_EL2_TWEDEL | + SCTLR_EL2_TWEDEn, + REQUIRES_E2H1, FEAT_TWED), + NEEDS_FEAT_FLAG(SCTLR_EL2_nTWE | + SCTLR_EL2_nTWI, + AS_RES1 | REQUIRES_E2H1, FEAT_AA64EL2), + NEEDS_FEAT_FLAG(SCTLR_EL2_UCI | + SCTLR_EL2_UCT | + SCTLR_EL2_DZE | + SCTLR_EL2_SA0, + REQUIRES_E2H1, FEAT_AA64EL2), + NEEDS_FEAT(SCTLR_EL2_WXN | + SCTLR_EL2_I | + SCTLR_EL2_SA | + SCTLR_EL2_C | + SCTLR_EL2_A | + SCTLR_EL2_M, + FEAT_AA64EL2), + FORCE_RES0(SCTLR_EL2_RES0), + FORCE_RES1(SCTLR_EL2_RES1), +}; + +static const DECLARE_FEAT_MAP(sctlr_el2_desc, SCTLR_EL2, + sctlr_el2_feat_map, FEAT_AA64EL2); + static const struct reg_bits_to_feat_map mdcr_el2_feat_map[] = { NEEDS_FEAT(MDCR_EL2_EBWE, FEAT_Debugv8p9), NEEDS_FEAT(MDCR_EL2_TDOSA, FEAT_DoubleLock), @@ -1247,6 +1325,7 @@ void __init check_feature_map(void) check_reg_desc(&sctlr2_desc); check_reg_desc(&tcr2_el2_desc); check_reg_desc(&sctlr_el1_desc); + check_reg_desc(&sctlr_el2_desc); check_reg_desc(&mdcr_el2_desc); check_reg_desc(&vtcr_el2_desc); } @@ -1443,6 +1522,9 @@ struct resx get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg) case SCTLR_EL1: resx = compute_reg_resx_bits(kvm, &sctlr_el1_desc, 0, 0); break; + case SCTLR_EL2: + resx = compute_reg_resx_bits(kvm, &sctlr_el2_desc, 0, 0); + break; case MDCR_EL2: resx = compute_reg_resx_bits(kvm, &mdcr_el2_desc, 0, 0); break; diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 96e899dbd919..ed710228484f 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1766,6 +1766,10 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu) resx = get_reg_fixed_bits(kvm, SCTLR_EL1); set_sysreg_masks(kvm, SCTLR_EL1, resx); + /* SCTLR_EL2 */ + resx = get_reg_fixed_bits(kvm, SCTLR_EL2); + set_sysreg_masks(kvm, SCTLR_EL2, resx); + /* SCTLR2_ELx */ resx = get_reg_fixed_bits(kvm, SCTLR2_EL1); set_sysreg_masks(kvm, SCTLR2_EL1, resx); From edba407843340c4b66134fce6c54a007c1ac83a2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Feb 2026 18:43:29 +0000 Subject: [PATCH 237/267] KVM: arm64: Add debugfs file dumping computed RESx values Computing RESx values is hard. Verifying that they are correct is harder. Add a debugfs file called "resx" that will dump all the RESx values for a given VM. I found it useful, maybe you will too. 
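Each register at or beyond __SANITISED_REG_START__ gets one line, formatted by sr_resx_show() as "%20s:\tRES0:%016llx\tRES1:%016llx". Reading the file from the per-VM debugfs directory looks roughly like this (the masks below are placeholders, the actual values depend on the VM configuration):

    # cat /sys/kernel/debug/kvm/<pid>-<vm-fd>/resx
               SCTLR_EL2:    RES0:<res0 mask>    RES1:<res1 mask>
                TCR2_EL2:    RES0:<res0 mask>    RES1:<res1 mask>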
Co-developed-by: Fuad Tabba Signed-off-by: Fuad Tabba Tested-by: Fuad Tabba Link: https://patch.msgid.link/20260202184329.2724080-21-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 68 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 88a57ca36d96..d33c39ea8fad 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -5090,12 +5090,80 @@ static const struct seq_operations idregs_debug_sops = { DEFINE_SEQ_ATTRIBUTE(idregs_debug); +static const struct sys_reg_desc *sr_resx_find(struct kvm *kvm, loff_t pos) +{ + unsigned long i, sr_idx = 0; + + for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) { + const struct sys_reg_desc *r = &sys_reg_descs[i]; + + if (r->reg < __SANITISED_REG_START__) + continue; + + if (sr_idx++ == pos) + return r; + } + + return NULL; +} + +static void *sr_resx_start(struct seq_file *s, loff_t *pos) +{ + struct kvm *kvm = s->private; + + if (!kvm->arch.sysreg_masks) + return NULL; + + return (void *)sr_resx_find(kvm, *pos); +} + +static void *sr_resx_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct kvm *kvm = s->private; + + (*pos)++; + + return (void *)sr_resx_find(kvm, *pos); +} + +static void sr_resx_stop(struct seq_file *s, void *v) +{ +} + +static int sr_resx_show(struct seq_file *s, void *v) +{ + const struct sys_reg_desc *desc = v; + struct kvm *kvm = s->private; + struct resx resx; + + if (!desc) + return 0; + + resx = kvm_get_sysreg_resx(kvm, desc->reg); + + seq_printf(s, "%20s:\tRES0:%016llx\tRES1:%016llx\n", + desc->name, resx.res0, resx.res1); + + return 0; +} + +static const struct seq_operations sr_resx_sops = { + .start = sr_resx_start, + .next = sr_resx_next, + .stop = sr_resx_stop, + .show = sr_resx_show, +}; + +DEFINE_SEQ_ATTRIBUTE(sr_resx); + void kvm_sys_regs_create_debugfs(struct kvm *kvm) { kvm->arch.idreg_debugfs_iter = ~0; debugfs_create_file("idregs", 0444, kvm->debugfs_dentry, kvm, &idregs_debug_fops); + debugfs_create_file("resx", 0444, kvm->debugfs_dentry, kvm, + &sr_resx_fops); } static void reset_vm_ftr_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *reg) From de0c51370b7dca71825c5e3e4099ebc353a0df1a Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 6 Feb 2026 09:27:46 +0800 Subject: [PATCH 238/267] LoongArch: KVM: Add more CPUCFG mask bits With new CPU cores there are more features supported which are indicated in CPUCFG2 bits 24:30 and CPUCFG3 bits 17:23. The KVM hypervisor cannot enable or disable (most of) these features and there is no KVM exception when instructions of these features are executed in guest mode. Here add more CPUCFG mask support with LA664 CPU type. 
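For CPUCFG2 this simply mirrors whatever the host reports for the new feature bits into the writable mask:

    config = read_cpucfg(LOONGARCH_CPUCFG2);
    *v |= config & (CPUCFG2_FRECIPE | CPUCFG2_DIV32 | CPUCFG2_LAM_BH);
    *v |= config & (CPUCFG2_LAMCAS | CPUCFG2_LLACQ_SCREL | CPUCFG2_SCQ);

while the CPUCFG3 mask is widened to bits 23:0 but keeps the host-only SFB and memory-ordering capability bits out of guest control.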
Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/vcpu.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 656b954c1134..2ef7bb0ad047 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -652,6 +652,8 @@ static int _kvm_setcsr(struct kvm_vcpu *vcpu, unsigned int id, u64 val) static int _kvm_get_cpucfg_mask(int id, u64 *v) { + unsigned int config; + if (id < 0 || id >= KVM_MAX_CPUCFG_REGS) return -EINVAL; @@ -684,9 +686,17 @@ static int _kvm_get_cpucfg_mask(int id, u64 *v) if (cpu_has_ptw) *v |= CPUCFG2_PTW; + config = read_cpucfg(LOONGARCH_CPUCFG2); + *v |= config & (CPUCFG2_FRECIPE | CPUCFG2_DIV32 | CPUCFG2_LAM_BH); + *v |= config & (CPUCFG2_LAMCAS | CPUCFG2_LLACQ_SCREL | CPUCFG2_SCQ); return 0; case LOONGARCH_CPUCFG3: - *v = GENMASK(16, 0); + *v = GENMASK(23, 0); + + /* VM does not support memory order and SFB setting */ + config = read_cpucfg(LOONGARCH_CPUCFG3); + *v &= config & ~(CPUCFG3_SFB); + *v &= config & ~(CPUCFG3_ALDORDER_CAP | CPUCFG3_ASTORDER_CAP | CPUCFG3_SLDORDER_CAP); return 0; case LOONGARCH_CPUCFG4: case LOONGARCH_CPUCFG5: @@ -717,6 +727,7 @@ static int _kvm_get_cpucfg_mask(int id, u64 *v) static int kvm_check_cpucfg(int id, u64 val) { int ret; + u32 host; u64 mask = 0; ret = _kvm_get_cpucfg_mask(id, &mask); @@ -746,9 +757,16 @@ static int kvm_check_cpucfg(int id, u64 val) /* LASX architecturally implies LSX and FP but val does not satisfy that */ return -EINVAL; return 0; + case LOONGARCH_CPUCFG3: + host = read_cpucfg(LOONGARCH_CPUCFG3); + if ((val & CPUCFG3_RVAMAX) > (host & CPUCFG3_RVAMAX)) + return -EINVAL; + if ((val & CPUCFG3_SPW_LVL) > (host & CPUCFG3_SPW_LVL)) + return -EINVAL; + return 0; case LOONGARCH_CPUCFG6: if (val & CPUCFG6_PMP) { - u32 host = read_cpucfg(LOONGARCH_CPUCFG6); + host = read_cpucfg(LOONGARCH_CPUCFG6); if ((val & CPUCFG6_PMBITS) != (host & CPUCFG6_PMBITS)) return -EINVAL; if ((val & CPUCFG6_PMNUM) > (host & CPUCFG6_PMNUM)) From 82db90bf461b0cfbb9457a2e721a781c14512e37 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 6 Feb 2026 09:27:46 +0800 Subject: [PATCH 239/267] LoongArch: KVM: Move feature detection in kvm_vm_init_features() VM feature detection is sparsed in function kvm_vm_init_features() and kvm_vm_feature_has_attr(). Here move all the features detection in function kvm_vm_init_features(), and there is only feature checking in function kvm_vm_feature_has_attr(). 
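With all detection done once at VM creation, a supported feature is just a bit in kvm->arch.kvm_features, e.g.:

    if (cpu_has_msgint)
        kvm->arch.kvm_features |= BIT(KVM_LOONGARCH_VM_FEAT_MSGINT);

and kvm_vm_feature_has_attr() only has to test that bitmap via kvm_vm_support() for every attribute.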
Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/vm.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/arch/loongarch/kvm/vm.c b/arch/loongarch/kvm/vm.c index 194ccbcdc3b3..d3ff6d6966f8 100644 --- a/arch/loongarch/kvm/vm.c +++ b/arch/loongarch/kvm/vm.c @@ -29,6 +29,21 @@ static void kvm_vm_init_features(struct kvm *kvm) { unsigned long val; + if (cpu_has_lsx) + kvm->arch.kvm_features |= BIT(KVM_LOONGARCH_VM_FEAT_LSX); + if (cpu_has_lasx) + kvm->arch.kvm_features |= BIT(KVM_LOONGARCH_VM_FEAT_LASX); + if (cpu_has_lbt_x86) + kvm->arch.kvm_features |= BIT(KVM_LOONGARCH_VM_FEAT_X86BT); + if (cpu_has_lbt_arm) + kvm->arch.kvm_features |= BIT(KVM_LOONGARCH_VM_FEAT_ARMBT); + if (cpu_has_lbt_mips) + kvm->arch.kvm_features |= BIT(KVM_LOONGARCH_VM_FEAT_MIPSBT); + if (cpu_has_ptw) + kvm->arch.kvm_features |= BIT(KVM_LOONGARCH_VM_FEAT_PTW); + if (cpu_has_msgint) + kvm->arch.kvm_features |= BIT(KVM_LOONGARCH_VM_FEAT_MSGINT); + val = read_csr_gcfg(); if (val & CSR_GCFG_GPMP) kvm->arch.kvm_features |= BIT(KVM_LOONGARCH_VM_FEAT_PMU); @@ -131,33 +146,12 @@ static int kvm_vm_feature_has_attr(struct kvm *kvm, struct kvm_device_attr *attr { switch (attr->attr) { case KVM_LOONGARCH_VM_FEAT_LSX: - if (cpu_has_lsx) - return 0; - return -ENXIO; case KVM_LOONGARCH_VM_FEAT_LASX: - if (cpu_has_lasx) - return 0; - return -ENXIO; case KVM_LOONGARCH_VM_FEAT_X86BT: - if (cpu_has_lbt_x86) - return 0; - return -ENXIO; case KVM_LOONGARCH_VM_FEAT_ARMBT: - if (cpu_has_lbt_arm) - return 0; - return -ENXIO; case KVM_LOONGARCH_VM_FEAT_MIPSBT: - if (cpu_has_lbt_mips) - return 0; - return -ENXIO; case KVM_LOONGARCH_VM_FEAT_PTW: - if (cpu_has_ptw) - return 0; - return -ENXIO; case KVM_LOONGARCH_VM_FEAT_MSGINT: - if (cpu_has_msgint) - return 0; - return -ENXIO; case KVM_LOONGARCH_VM_FEAT_PMU: case KVM_LOONGARCH_VM_FEAT_PV_IPI: case KVM_LOONGARCH_VM_FEAT_PV_STEALTIME: From 31966edb9a5d70b6a842da3ac7e5602df10bb4c2 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 6 Feb 2026 09:27:46 +0800 Subject: [PATCH 240/267] LoongArch: KVM: Add msgint registers in kvm_init_gcsr_flag() Add flag HW_GCSR with msgint registers in function kvm_init_gcsr_flag(). Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/main.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c index 80ea63d465b8..f3211fc447fd 100644 --- a/arch/loongarch/kvm/main.c +++ b/arch/loongarch/kvm/main.c @@ -192,6 +192,13 @@ static void kvm_init_gcsr_flag(void) set_gcsr_sw_flag(LOONGARCH_CSR_PERFCNTR2); set_gcsr_sw_flag(LOONGARCH_CSR_PERFCTRL3); set_gcsr_sw_flag(LOONGARCH_CSR_PERFCNTR3); + + if (cpu_has_msgint) { + set_gcsr_hw_flag(LOONGARCH_CSR_ISR0); + set_gcsr_hw_flag(LOONGARCH_CSR_ISR1); + set_gcsr_hw_flag(LOONGARCH_CSR_ISR2); + set_gcsr_hw_flag(LOONGARCH_CSR_ISR3); + } } static void kvm_update_vpid(struct kvm_vcpu *vcpu, int cpu) From c2f94dafe197961f266fef8946d39df66a9750f4 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 6 Feb 2026 09:27:46 +0800 Subject: [PATCH 241/267] LoongArch: KVM: Check VM msgint feature during interrupt handling During message interrupt handling and relative CSR registers saving and restore, it is better to check VM msgint feature rather than host msgint feature, because VM may disable this feature even if host supports this. 
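The guest-side capability is derived from the vCPU's CPUCFG1, via the new helper in kvm_host.h:

    static inline bool kvm_guest_has_msgint(struct kvm_vcpu_arch *arch)
    {
        return arch->cpucfg[1] & CPUCFG1_MSGINT;
    }

so interrupt delivery and CSR save/restore follow what the VM was actually given, not what the host happens to support.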
Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/kvm_host.h | 5 +++++ arch/loongarch/kvm/interrupt.c | 4 ++-- arch/loongarch/kvm/vcpu.c | 6 ++++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h index e4fe5b8e8149..bced2d607849 100644 --- a/arch/loongarch/include/asm/kvm_host.h +++ b/arch/loongarch/include/asm/kvm_host.h @@ -265,6 +265,11 @@ static inline void writel_sw_gcsr(struct loongarch_csrs *csr, int reg, unsigned csr->csrs[reg] = val; } +static inline bool kvm_guest_has_msgint(struct kvm_vcpu_arch *arch) +{ + return arch->cpucfg[1] & CPUCFG1_MSGINT; +} + static inline bool kvm_guest_has_fpu(struct kvm_vcpu_arch *arch) { return arch->cpucfg[2] & CPUCFG2_FP; diff --git a/arch/loongarch/kvm/interrupt.c b/arch/loongarch/kvm/interrupt.c index a6d42d399a59..fb704f4c8ac5 100644 --- a/arch/loongarch/kvm/interrupt.c +++ b/arch/loongarch/kvm/interrupt.c @@ -32,7 +32,7 @@ static int kvm_irq_deliver(struct kvm_vcpu *vcpu, unsigned int priority) if (priority < EXCCODE_INT_NUM) irq = priority_to_irq[priority]; - if (cpu_has_msgint && (priority == INT_AVEC)) { + if (kvm_guest_has_msgint(&vcpu->arch) && (priority == INT_AVEC)) { set_gcsr_estat(irq); return 1; } @@ -64,7 +64,7 @@ static int kvm_irq_clear(struct kvm_vcpu *vcpu, unsigned int priority) if (priority < EXCCODE_INT_NUM) irq = priority_to_irq[priority]; - if (cpu_has_msgint && (priority == INT_AVEC)) { + if (kvm_guest_has_msgint(&vcpu->arch) && (priority == INT_AVEC)) { clear_gcsr_estat(irq); return 1; } diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 2ef7bb0ad047..948a3d809aac 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -1679,7 +1679,8 @@ static int _kvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_DMWIN2); kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_DMWIN3); kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_LLBCTL); - if (cpu_has_msgint) { + + if (kvm_guest_has_msgint(&vcpu->arch)) { kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ISR0); kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ISR1); kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ISR2); @@ -1774,7 +1775,8 @@ static int _kvm_vcpu_put(struct kvm_vcpu *vcpu, int cpu) kvm_save_hw_gcsr(csr, LOONGARCH_CSR_DMWIN1); kvm_save_hw_gcsr(csr, LOONGARCH_CSR_DMWIN2); kvm_save_hw_gcsr(csr, LOONGARCH_CSR_DMWIN3); - if (cpu_has_msgint) { + + if (kvm_guest_has_msgint(&vcpu->arch)) { kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ISR0); kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ISR1); kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ISR2); From c5cb12b81a0bddbff0f963662f18747b6d633592 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 6 Feb 2026 09:27:46 +0800 Subject: [PATCH 242/267] LoongArch: KVM: Handle LOONGARCH_CSR_IPR during vCPU context switch Register LOONGARCH_CSR_IPR is interrupt priority setting for nested interrupt handling. Though LoongArch Linux AVEC driver does not use this register, KVM hypervisor needs to save and restore this it during vCPU context switch. Because Linux AVEC driver may use this register in future, or other OS may use it. 
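The register is saved and restored next to the existing ISR0..ISR3 handling, under the same per-VM msgint check:

    if (kvm_guest_has_msgint(&vcpu->arch)) {
        kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_IPR);
        kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ISR0);
        kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ISR1);
        kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ISR2);
        kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ISR3);
    }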
Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/loongarch.h | 1 + arch/loongarch/kvm/main.c | 1 + arch/loongarch/kvm/vcpu.c | 2 ++ 3 files changed, 4 insertions(+) diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h index 553c4dc7a156..2a6bc99177d8 100644 --- a/arch/loongarch/include/asm/loongarch.h +++ b/arch/loongarch/include/asm/loongarch.h @@ -690,6 +690,7 @@ #define LOONGARCH_CSR_ISR3 0xa3 #define LOONGARCH_CSR_IRR 0xa4 +#define LOONGARCH_CSR_IPR 0xa5 #define LOONGARCH_CSR_PRID 0xc0 diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c index f3211fc447fd..d1c5156e02d8 100644 --- a/arch/loongarch/kvm/main.c +++ b/arch/loongarch/kvm/main.c @@ -194,6 +194,7 @@ static void kvm_init_gcsr_flag(void) set_gcsr_sw_flag(LOONGARCH_CSR_PERFCNTR3); if (cpu_has_msgint) { + set_gcsr_hw_flag(LOONGARCH_CSR_IPR); set_gcsr_hw_flag(LOONGARCH_CSR_ISR0); set_gcsr_hw_flag(LOONGARCH_CSR_ISR1); set_gcsr_hw_flag(LOONGARCH_CSR_ISR2); diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 948a3d809aac..fd3a2e60c670 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -1681,6 +1681,7 @@ static int _kvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_LLBCTL); if (kvm_guest_has_msgint(&vcpu->arch)) { + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_IPR); kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ISR0); kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ISR1); kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ISR2); @@ -1777,6 +1778,7 @@ static int _kvm_vcpu_put(struct kvm_vcpu *vcpu, int cpu) kvm_save_hw_gcsr(csr, LOONGARCH_CSR_DMWIN3); if (kvm_guest_has_msgint(&vcpu->arch)) { + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_IPR); kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ISR0); kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ISR1); kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ISR2); From 89b5dc53971328934fcb6a68bf75e4b76fc59ef0 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 6 Feb 2026 09:27:47 +0800 Subject: [PATCH 243/267] LoongArch: KVM: Move LSX capability check in exception handler Like FPU exception handler, check LSX capability in the LSX exception handler rather than function kvm_own_lsx(). Since LSX capability in the function kvm_guest_has_lsx() implies FPU capability, only checking kvm_guest_has_lsx() is OK here. 
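The handler then mirrors the FPU one, queueing an INE exception when the guest lacks the feature and taking ownership otherwise:

    static int kvm_handle_lsx_disabled(struct kvm_vcpu *vcpu, int ecode)
    {
        if (!kvm_guest_has_lsx(&vcpu->arch))
            kvm_queue_exception(vcpu, EXCCODE_INE, 0);
        else
            kvm_own_lsx(vcpu);

        return RESUME_GUEST;
    }

The two follow-up patches apply the same pattern to LASX and LBT.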
Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/exit.c | 4 +++- arch/loongarch/kvm/vcpu.c | 3 --- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index cb493980d874..76eec3f24953 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -792,8 +792,10 @@ static long kvm_save_notify(struct kvm_vcpu *vcpu) */ static int kvm_handle_lsx_disabled(struct kvm_vcpu *vcpu, int ecode) { - if (kvm_own_lsx(vcpu)) + if (!kvm_guest_has_lsx(&vcpu->arch)) kvm_queue_exception(vcpu, EXCCODE_INE, 0); + else + kvm_own_lsx(vcpu); return RESUME_GUEST; } diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index fd3a2e60c670..0fa3981ec7ed 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -1376,9 +1376,6 @@ void kvm_own_fpu(struct kvm_vcpu *vcpu) /* Enable LSX and restore context */ int kvm_own_lsx(struct kvm_vcpu *vcpu) { - if (!kvm_guest_has_fpu(&vcpu->arch) || !kvm_guest_has_lsx(&vcpu->arch)) - return -EINVAL; - preempt_disable(); /* Enable LSX for guest */ From 37da26e0e8396def86ff4ca70ece0a5f5619dbd4 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 6 Feb 2026 09:27:47 +0800 Subject: [PATCH 244/267] LoongArch: KVM: Move LASX capability check in exception handler Like FPU exception handler, check LASX capability in the LASX exception handler rather than function kvm_own_lasx(). Since LASX capability in the function kvm_guest_has_lasx() implies FPU and LSX capability, only checking kvm_guest_has_lasx() is OK here. Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/exit.c | 4 +++- arch/loongarch/kvm/vcpu.c | 3 --- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index 76eec3f24953..74b427287e96 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -810,8 +810,10 @@ static int kvm_handle_lsx_disabled(struct kvm_vcpu *vcpu, int ecode) */ static int kvm_handle_lasx_disabled(struct kvm_vcpu *vcpu, int ecode) { - if (kvm_own_lasx(vcpu)) + if (!kvm_guest_has_lasx(&vcpu->arch)) kvm_queue_exception(vcpu, EXCCODE_INE, 0); + else + kvm_own_lasx(vcpu); return RESUME_GUEST; } diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 0fa3981ec7ed..53d720a15617 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -1409,9 +1409,6 @@ int kvm_own_lsx(struct kvm_vcpu *vcpu) /* Enable LASX and restore context */ int kvm_own_lasx(struct kvm_vcpu *vcpu) { - if (!kvm_guest_has_fpu(&vcpu->arch) || !kvm_guest_has_lsx(&vcpu->arch) || !kvm_guest_has_lasx(&vcpu->arch)) - return -EINVAL; - preempt_disable(); kvm_check_fcsr(vcpu, vcpu->arch.fpu.fcsr); From b1388a9598fbe77b193cbb8368eed46e982212c5 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 6 Feb 2026 09:27:47 +0800 Subject: [PATCH 245/267] LoongArch: KVM: Move LBT capability check in exception handler Like FPU exception handler, check LBT capability in the LBT exception handler rather than function kvm_own_lbt(). 
Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/exit.c | 4 +++- arch/loongarch/kvm/vcpu.c | 3 --- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index 74b427287e96..65ec10a7245a 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -820,8 +820,10 @@ static int kvm_handle_lasx_disabled(struct kvm_vcpu *vcpu, int ecode) static int kvm_handle_lbt_disabled(struct kvm_vcpu *vcpu, int ecode) { - if (kvm_own_lbt(vcpu)) + if (!kvm_guest_has_lbt(&vcpu->arch)) kvm_queue_exception(vcpu, EXCCODE_INE, 0); + else + kvm_own_lbt(vcpu); return RESUME_GUEST; } diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 53d720a15617..07c427a5e156 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -1304,9 +1304,6 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) #ifdef CONFIG_CPU_HAS_LBT int kvm_own_lbt(struct kvm_vcpu *vcpu) { - if (!kvm_guest_has_lbt(&vcpu->arch)) - return -EINVAL; - preempt_disable(); if (!(vcpu->arch.aux_inuse & KVM_LARCH_LBT)) { set_csr_euen(CSR_EUEN_LBTEN); From 382c38c9ec94fe87ffd6c9b456fa8ce67943cf1b Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 6 Feb 2026 09:28:00 +0800 Subject: [PATCH 246/267] LoongArch: KVM: Add FPU/LBT delay load support FPU/LBT are lazy enabled with KVM hypervisor. After FPU/LBT enabled and loaded, vCPU can be preempted and FPU/LBT will be lost again, there will be unnecessary FPU/LBT exceptions, load and store stuff. Here delay the FPU/LBT load until the guest entry. Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/kvm_host.h | 2 ++ arch/loongarch/kvm/exit.c | 21 +++++++++++------ arch/loongarch/kvm/vcpu.c | 33 +++++++++++++++++---------- 3 files changed, 37 insertions(+), 19 deletions(-) diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h index bced2d607849..4a7e816fae84 100644 --- a/arch/loongarch/include/asm/kvm_host.h +++ b/arch/loongarch/include/asm/kvm_host.h @@ -37,6 +37,7 @@ #define KVM_REQ_TLB_FLUSH_GPA KVM_ARCH_REQ(0) #define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(1) #define KVM_REQ_PMU KVM_ARCH_REQ(2) +#define KVM_REQ_AUX_LOAD KVM_ARCH_REQ(3) #define KVM_GUESTDBG_SW_BP_MASK \ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP) @@ -200,6 +201,7 @@ struct kvm_vcpu_arch { /* Which auxiliary state is loaded (KVM_LARCH_*) */ unsigned int aux_inuse; + unsigned int aux_ldtype; /* FPU state */ struct loongarch_fpu fpu FPU_ALIGN; diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index 65ec10a7245a..da0ad89f2eb7 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -754,7 +754,8 @@ static int kvm_handle_fpu_disabled(struct kvm_vcpu *vcpu, int ecode) return RESUME_HOST; } - kvm_own_fpu(vcpu); + vcpu->arch.aux_ldtype = KVM_LARCH_FPU; + kvm_make_request(KVM_REQ_AUX_LOAD, vcpu); return RESUME_GUEST; } @@ -794,8 +795,10 @@ static int kvm_handle_lsx_disabled(struct kvm_vcpu *vcpu, int ecode) { if (!kvm_guest_has_lsx(&vcpu->arch)) kvm_queue_exception(vcpu, EXCCODE_INE, 0); - else - kvm_own_lsx(vcpu); + else { + vcpu->arch.aux_ldtype = KVM_LARCH_LSX; + kvm_make_request(KVM_REQ_AUX_LOAD, vcpu); + } return RESUME_GUEST; } @@ -812,8 +815,10 @@ static int kvm_handle_lasx_disabled(struct kvm_vcpu *vcpu, int ecode) { if (!kvm_guest_has_lasx(&vcpu->arch)) kvm_queue_exception(vcpu, EXCCODE_INE, 0); - else - kvm_own_lasx(vcpu); + else { + vcpu->arch.aux_ldtype = KVM_LARCH_LASX; + 
kvm_make_request(KVM_REQ_AUX_LOAD, vcpu); + } return RESUME_GUEST; } @@ -822,8 +827,10 @@ static int kvm_handle_lbt_disabled(struct kvm_vcpu *vcpu, int ecode) { if (!kvm_guest_has_lbt(&vcpu->arch)) kvm_queue_exception(vcpu, EXCCODE_INE, 0); - else - kvm_own_lbt(vcpu); + else { + vcpu->arch.aux_ldtype = KVM_LARCH_LBT; + kvm_make_request(KVM_REQ_AUX_LOAD, vcpu); + } return RESUME_GUEST; } diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 07c427a5e156..4f0d10f52b99 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -232,6 +232,27 @@ static void kvm_late_check_requests(struct kvm_vcpu *vcpu) kvm_flush_tlb_gpa(vcpu, vcpu->arch.flush_gpa); vcpu->arch.flush_gpa = INVALID_GPA; } + + if (kvm_check_request(KVM_REQ_AUX_LOAD, vcpu)) { + switch (vcpu->arch.aux_ldtype) { + case KVM_LARCH_FPU: + kvm_own_fpu(vcpu); + break; + case KVM_LARCH_LSX: + kvm_own_lsx(vcpu); + break; + case KVM_LARCH_LASX: + kvm_own_lasx(vcpu); + break; + case KVM_LARCH_LBT: + kvm_own_lbt(vcpu); + break; + default: + break; + } + + vcpu->arch.aux_ldtype = 0; + } } /* @@ -1304,13 +1325,11 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) #ifdef CONFIG_CPU_HAS_LBT int kvm_own_lbt(struct kvm_vcpu *vcpu) { - preempt_disable(); if (!(vcpu->arch.aux_inuse & KVM_LARCH_LBT)) { set_csr_euen(CSR_EUEN_LBTEN); _restore_lbt(&vcpu->arch.lbt); vcpu->arch.aux_inuse |= KVM_LARCH_LBT; } - preempt_enable(); return 0; } @@ -1353,8 +1372,6 @@ static inline void kvm_check_fcsr_alive(struct kvm_vcpu *vcpu) { } /* Enable FPU and restore context */ void kvm_own_fpu(struct kvm_vcpu *vcpu) { - preempt_disable(); - /* * Enable FPU for guest * Set FR and FRE according to guest context @@ -1365,16 +1382,12 @@ void kvm_own_fpu(struct kvm_vcpu *vcpu) kvm_restore_fpu(&vcpu->arch.fpu); vcpu->arch.aux_inuse |= KVM_LARCH_FPU; trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_FPU); - - preempt_enable(); } #ifdef CONFIG_CPU_HAS_LSX /* Enable LSX and restore context */ int kvm_own_lsx(struct kvm_vcpu *vcpu) { - preempt_disable(); - /* Enable LSX for guest */ kvm_check_fcsr(vcpu, vcpu->arch.fpu.fcsr); set_csr_euen(CSR_EUEN_LSXEN | CSR_EUEN_FPEN); @@ -1396,7 +1409,6 @@ int kvm_own_lsx(struct kvm_vcpu *vcpu) trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_LSX); vcpu->arch.aux_inuse |= KVM_LARCH_LSX | KVM_LARCH_FPU; - preempt_enable(); return 0; } @@ -1406,8 +1418,6 @@ int kvm_own_lsx(struct kvm_vcpu *vcpu) /* Enable LASX and restore context */ int kvm_own_lasx(struct kvm_vcpu *vcpu) { - preempt_disable(); - kvm_check_fcsr(vcpu, vcpu->arch.fpu.fcsr); set_csr_euen(CSR_EUEN_FPEN | CSR_EUEN_LSXEN | CSR_EUEN_LASXEN); switch (vcpu->arch.aux_inuse & (KVM_LARCH_FPU | KVM_LARCH_LSX)) { @@ -1429,7 +1439,6 @@ int kvm_own_lasx(struct kvm_vcpu *vcpu) trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_LASX); vcpu->arch.aux_inuse |= KVM_LARCH_LASX | KVM_LARCH_LSX | KVM_LARCH_FPU; - preempt_enable(); return 0; } From 2faec60a4858bd7fe1bc1419963c6bf030e85706 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 6 Feb 2026 09:28:00 +0800 Subject: [PATCH 247/267] LoongArch: KVM: Set default return value in KVM IO bus ops When in-kernel irqchip is enabled, its register area is registered in the KVM IO bus list with API kvm_io_bus_register_dev(). In MMIO/IOCSR register access emulation, kvm_io_bus_read()/kvm_io_bus_write() is called firstly. If it returns 0, it means that the in-kernel irqchip handles the emulation already, else it returns to user-mode VMM and lets VMM emulate the register access. 
Once in-kernel irqchip is enabled, it should return 0 if the address is within range of the registered KVM IO bus. It should not return to user-mode VMM since VMM does not know how to handle it, and irqchip is handled in kernel already. Here set default return value with 0 in KVM IO bus operations. Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/eiointc.c | 43 ++++++++++++------------------- arch/loongarch/kvm/intc/ipi.c | 26 ++++++------------- arch/loongarch/kvm/intc/pch_pic.c | 31 ++++++++++------------ 3 files changed, 39 insertions(+), 61 deletions(-) diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c index dfaf6ccfdd8b..e498a3f1e136 100644 --- a/arch/loongarch/kvm/intc/eiointc.c +++ b/arch/loongarch/kvm/intc/eiointc.c @@ -119,7 +119,7 @@ void eiointc_set_irq(struct loongarch_eiointc *s, int irq, int level) static int loongarch_eiointc_read(struct kvm_vcpu *vcpu, struct loongarch_eiointc *s, gpa_t addr, unsigned long *val) { - int index, ret = 0; + int index; u64 data = 0; gpa_t offset; @@ -150,40 +150,36 @@ static int loongarch_eiointc_read(struct kvm_vcpu *vcpu, struct loongarch_eioint data = s->coremap[index]; break; default: - ret = -EINVAL; break; } *val = data; - return ret; + return 0; } static int kvm_eiointc_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, void *val) { - int ret = -EINVAL; unsigned long flags, data, offset; struct loongarch_eiointc *eiointc = vcpu->kvm->arch.eiointc; if (!eiointc) { kvm_err("%s: eiointc irqchip not valid!\n", __func__); - return -EINVAL; + return 0; } if (addr & (len - 1)) { kvm_err("%s: eiointc not aligned addr %llx len %d\n", __func__, addr, len); - return -EINVAL; + return 0; } offset = addr & 0x7; addr -= offset; vcpu->stat.eiointc_read_exits++; spin_lock_irqsave(&eiointc->lock, flags); - ret = loongarch_eiointc_read(vcpu, eiointc, addr, &data); + loongarch_eiointc_read(vcpu, eiointc, addr, &data); spin_unlock_irqrestore(&eiointc->lock, flags); - if (ret) - return ret; data = data >> (offset * 8); switch (len) { @@ -208,7 +204,7 @@ static int loongarch_eiointc_write(struct kvm_vcpu *vcpu, struct loongarch_eiointc *s, gpa_t addr, u64 value, u64 field_mask) { - int index, irq, ret = 0; + int index, irq; u8 cpu; u64 data, old, mask; gpa_t offset; @@ -287,29 +283,27 @@ static int loongarch_eiointc_write(struct kvm_vcpu *vcpu, eiointc_update_sw_coremap(s, index * 8, data, sizeof(data), true); break; default: - ret = -EINVAL; break; } - return ret; + return 0; } static int kvm_eiointc_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, const void *val) { - int ret = -EINVAL; unsigned long flags, value; struct loongarch_eiointc *eiointc = vcpu->kvm->arch.eiointc; if (!eiointc) { kvm_err("%s: eiointc irqchip not valid!\n", __func__); - return -EINVAL; + return 0; } if (addr & (len - 1)) { kvm_err("%s: eiointc not aligned addr %llx len %d\n", __func__, addr, len); - return -EINVAL; + return 0; } vcpu->stat.eiointc_write_exits++; @@ -317,24 +311,24 @@ static int kvm_eiointc_write(struct kvm_vcpu *vcpu, switch (len) { case 1: value = *(unsigned char *)val; - ret = loongarch_eiointc_write(vcpu, eiointc, addr, value, 0xFF); + loongarch_eiointc_write(vcpu, eiointc, addr, value, 0xFF); break; case 2: value = *(unsigned short *)val; - ret = loongarch_eiointc_write(vcpu, eiointc, addr, value, USHRT_MAX); + loongarch_eiointc_write(vcpu, eiointc, addr, value, USHRT_MAX); break; case 4: value = *(unsigned int *)val; - ret = 
loongarch_eiointc_write(vcpu, eiointc, addr, value, UINT_MAX); + loongarch_eiointc_write(vcpu, eiointc, addr, value, UINT_MAX); break; default: value = *(unsigned long *)val; - ret = loongarch_eiointc_write(vcpu, eiointc, addr, value, ULONG_MAX); + loongarch_eiointc_write(vcpu, eiointc, addr, value, ULONG_MAX); break; } spin_unlock_irqrestore(&eiointc->lock, flags); - return ret; + return 0; } static const struct kvm_io_device_ops kvm_eiointc_ops = { @@ -352,7 +346,7 @@ static int kvm_eiointc_virt_read(struct kvm_vcpu *vcpu, if (!eiointc) { kvm_err("%s: eiointc irqchip not valid!\n", __func__); - return -EINVAL; + return 0; } addr -= EIOINTC_VIRT_BASE; @@ -376,28 +370,25 @@ static int kvm_eiointc_virt_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, const void *val) { - int ret = 0; unsigned long flags; u32 value = *(u32 *)val; struct loongarch_eiointc *eiointc = vcpu->kvm->arch.eiointc; if (!eiointc) { kvm_err("%s: eiointc irqchip not valid!\n", __func__); - return -EINVAL; + return 0; } addr -= EIOINTC_VIRT_BASE; spin_lock_irqsave(&eiointc->lock, flags); switch (addr) { case EIOINTC_VIRT_FEATURES: - ret = -EPERM; break; case EIOINTC_VIRT_CONFIG: /* * eiointc features can only be set at disabled status */ if ((eiointc->status & BIT(EIOINTC_ENABLE)) && value) { - ret = -EPERM; break; } eiointc->status = value & eiointc->features; @@ -407,7 +398,7 @@ static int kvm_eiointc_virt_write(struct kvm_vcpu *vcpu, } spin_unlock_irqrestore(&eiointc->lock, flags); - return ret; + return 0; } static const struct kvm_io_device_ops kvm_eiointc_virt_ops = { diff --git a/arch/loongarch/kvm/intc/ipi.c b/arch/loongarch/kvm/intc/ipi.c index 1058c13dba7f..6a044a74c095 100644 --- a/arch/loongarch/kvm/intc/ipi.c +++ b/arch/loongarch/kvm/intc/ipi.c @@ -111,7 +111,7 @@ static int mail_send(struct kvm *kvm, uint64_t data) vcpu = kvm_get_vcpu_by_cpuid(kvm, cpu); if (unlikely(vcpu == NULL)) { kvm_err("%s: invalid target cpu: %d\n", __func__, cpu); - return -EINVAL; + return 0; } mailbox = ((data & 0xffffffff) >> 2) & 0x7; offset = IOCSR_IPI_BUF_20 + mailbox * 4; @@ -145,7 +145,7 @@ static int send_ipi_data(struct kvm_vcpu *vcpu, gpa_t addr, uint64_t data) srcu_read_unlock(&vcpu->kvm->srcu, idx); if (unlikely(ret)) { kvm_err("%s: : read data from addr %llx failed\n", __func__, addr); - return ret; + return 0; } /* Construct the mask by scanning the bit 27-30 */ for (i = 0; i < 4; i++) { @@ -162,7 +162,7 @@ static int send_ipi_data(struct kvm_vcpu *vcpu, gpa_t addr, uint64_t data) if (unlikely(ret)) kvm_err("%s: : write data to addr %llx failed\n", __func__, addr); - return ret; + return 0; } static int any_send(struct kvm *kvm, uint64_t data) @@ -174,7 +174,7 @@ static int any_send(struct kvm *kvm, uint64_t data) vcpu = kvm_get_vcpu_by_cpuid(kvm, cpu); if (unlikely(vcpu == NULL)) { kvm_err("%s: invalid target cpu: %d\n", __func__, cpu); - return -EINVAL; + return 0; } offset = data & 0xffff; @@ -183,7 +183,6 @@ static int any_send(struct kvm *kvm, uint64_t data) static int loongarch_ipi_readl(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *val) { - int ret = 0; uint32_t offset; uint64_t res = 0; @@ -202,33 +201,27 @@ static int loongarch_ipi_readl(struct kvm_vcpu *vcpu, gpa_t addr, int len, void spin_unlock(&vcpu->arch.ipi_state.lock); break; case IOCSR_IPI_SET: - res = 0; - break; case IOCSR_IPI_CLEAR: - res = 0; break; case IOCSR_IPI_BUF_20 ... 
IOCSR_IPI_BUF_38 + 7: if (offset + len > IOCSR_IPI_BUF_38 + 8) { kvm_err("%s: invalid offset or len: offset = %d, len = %d\n", __func__, offset, len); - ret = -EINVAL; break; } res = read_mailbox(vcpu, offset, len); break; default: kvm_err("%s: unknown addr: %llx\n", __func__, addr); - ret = -EINVAL; break; } *(uint64_t *)val = res; - return ret; + return 0; } static int loongarch_ipi_writel(struct kvm_vcpu *vcpu, gpa_t addr, int len, const void *val) { - int ret = 0; uint64_t data; uint32_t offset; @@ -239,7 +232,6 @@ static int loongarch_ipi_writel(struct kvm_vcpu *vcpu, gpa_t addr, int len, cons switch (offset) { case IOCSR_IPI_STATUS: - ret = -EINVAL; break; case IOCSR_IPI_EN: spin_lock(&vcpu->arch.ipi_state.lock); @@ -257,7 +249,6 @@ static int loongarch_ipi_writel(struct kvm_vcpu *vcpu, gpa_t addr, int len, cons if (offset + len > IOCSR_IPI_BUF_38 + 8) { kvm_err("%s: invalid offset or len: offset = %d, len = %d\n", __func__, offset, len); - ret = -EINVAL; break; } write_mailbox(vcpu, offset, data, len); @@ -266,18 +257,17 @@ static int loongarch_ipi_writel(struct kvm_vcpu *vcpu, gpa_t addr, int len, cons ipi_send(vcpu->kvm, data); break; case IOCSR_MAIL_SEND: - ret = mail_send(vcpu->kvm, data); + mail_send(vcpu->kvm, data); break; case IOCSR_ANY_SEND: - ret = any_send(vcpu->kvm, data); + any_send(vcpu->kvm, data); break; default: kvm_err("%s: unknown addr: %llx\n", __func__, addr); - ret = -EINVAL; break; } - return ret; + return 0; } static int kvm_ipi_read(struct kvm_vcpu *vcpu, diff --git a/arch/loongarch/kvm/intc/pch_pic.c b/arch/loongarch/kvm/intc/pch_pic.c index 4addb34bf432..a175f52fcf7f 100644 --- a/arch/loongarch/kvm/intc/pch_pic.c +++ b/arch/loongarch/kvm/intc/pch_pic.c @@ -74,7 +74,7 @@ void pch_msi_set_irq(struct kvm *kvm, int irq, int level) static int loongarch_pch_pic_read(struct loongarch_pch_pic *s, gpa_t addr, int len, void *val) { - int ret = 0, offset; + int offset; u64 data = 0; void *ptemp; @@ -121,34 +121,32 @@ static int loongarch_pch_pic_read(struct loongarch_pch_pic *s, gpa_t addr, int l data = s->isr; break; default: - ret = -EINVAL; + break; } spin_unlock(&s->lock); - if (ret == 0) { - offset = (addr - s->pch_pic_base) & 7; - data = data >> (offset * 8); - memcpy(val, &data, len); - } + offset = (addr - s->pch_pic_base) & 7; + data = data >> (offset * 8); + memcpy(val, &data, len); - return ret; + return 0; } static int kvm_pch_pic_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, void *val) { - int ret; + int ret = 0; struct loongarch_pch_pic *s = vcpu->kvm->arch.pch_pic; if (!s) { kvm_err("%s: pch pic irqchip not valid!\n", __func__); - return -EINVAL; + return ret; } if (addr & (len - 1)) { kvm_err("%s: pch pic not aligned addr %llx len %d\n", __func__, addr, len); - return -EINVAL; + return ret; } /* statistics of pch pic reading */ @@ -161,7 +159,7 @@ static int kvm_pch_pic_read(struct kvm_vcpu *vcpu, static int loongarch_pch_pic_write(struct loongarch_pch_pic *s, gpa_t addr, int len, const void *val) { - int ret = 0, offset; + int offset; u64 old, data, mask; void *ptemp; @@ -226,29 +224,28 @@ static int loongarch_pch_pic_write(struct loongarch_pch_pic *s, gpa_t addr, case PCH_PIC_ROUTE_ENTRY_START ... 
PCH_PIC_ROUTE_ENTRY_END: break; default: - ret = -EINVAL; break; } spin_unlock(&s->lock); - return ret; + return 0; } static int kvm_pch_pic_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, const void *val) { - int ret; + int ret = 0; struct loongarch_pch_pic *s = vcpu->kvm->arch.pch_pic; if (!s) { kvm_err("%s: pch pic irqchip not valid!\n", __func__); - return -EINVAL; + return ret; } if (addr & (len - 1)) { kvm_err("%s: pch pic not aligned addr %llx len %d\n", __func__, addr, len); - return -EINVAL; + return ret; } /* statistics of pch pic writing */ From 9b486cdd032a90032c6b567ea723595205ca2626 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 6 Feb 2026 09:28:01 +0800 Subject: [PATCH 248/267] LoongArch: KVM: Add paravirt preempt feature in hypervisor side Feature KVM_FEATURE_PREEMPT is added to show whether vCPU is preempted or not. It is to help guest OS scheduling or lock checking etc. Here add KVM_FEATURE_PREEMPT feature and use one byte as preempted flag in the steal time structure. Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/kvm_host.h | 2 + arch/loongarch/include/asm/kvm_para.h | 4 +- arch/loongarch/include/uapi/asm/kvm.h | 1 + arch/loongarch/include/uapi/asm/kvm_para.h | 1 + arch/loongarch/kvm/vcpu.c | 53 +++++++++++++++++++++- arch/loongarch/kvm/vm.c | 3 ++ 6 files changed, 62 insertions(+), 2 deletions(-) diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h index 4a7e816fae84..19eb5e5c3984 100644 --- a/arch/loongarch/include/asm/kvm_host.h +++ b/arch/loongarch/include/asm/kvm_host.h @@ -165,6 +165,7 @@ enum emulation_result { #define LOONGARCH_PV_FEAT_UPDATED BIT_ULL(63) #define LOONGARCH_PV_FEAT_MASK (BIT(KVM_FEATURE_IPI) | \ + BIT(KVM_FEATURE_PREEMPT) | \ BIT(KVM_FEATURE_STEAL_TIME) | \ BIT(KVM_FEATURE_USER_HCALL) | \ BIT(KVM_FEATURE_VIRT_EXTIOI)) @@ -254,6 +255,7 @@ struct kvm_vcpu_arch { u64 guest_addr; u64 last_steal; struct gfn_to_hva_cache cache; + u8 preempted; } st; }; diff --git a/arch/loongarch/include/asm/kvm_para.h b/arch/loongarch/include/asm/kvm_para.h index 3e4b397f423f..fb17ba0fa101 100644 --- a/arch/loongarch/include/asm/kvm_para.h +++ b/arch/loongarch/include/asm/kvm_para.h @@ -37,8 +37,10 @@ struct kvm_steal_time { __u64 steal; __u32 version; __u32 flags; - __u32 pad[12]; + __u8 preempted; + __u8 pad[47]; }; +#define KVM_VCPU_PREEMPTED (1 << 0) /* * Hypercall interface for KVM hypervisor diff --git a/arch/loongarch/include/uapi/asm/kvm.h b/arch/loongarch/include/uapi/asm/kvm.h index de6c3f18e40a..419647aacdf3 100644 --- a/arch/loongarch/include/uapi/asm/kvm.h +++ b/arch/loongarch/include/uapi/asm/kvm.h @@ -105,6 +105,7 @@ struct kvm_fpu { #define KVM_LOONGARCH_VM_FEAT_PV_STEALTIME 7 #define KVM_LOONGARCH_VM_FEAT_PTW 8 #define KVM_LOONGARCH_VM_FEAT_MSGINT 9 +#define KVM_LOONGARCH_VM_FEAT_PV_PREEMPT 10 /* Device Control API on vcpu fd */ #define KVM_LOONGARCH_VCPU_CPUCFG 0 diff --git a/arch/loongarch/include/uapi/asm/kvm_para.h b/arch/loongarch/include/uapi/asm/kvm_para.h index 76d802ef01ce..d28cbcadd276 100644 --- a/arch/loongarch/include/uapi/asm/kvm_para.h +++ b/arch/loongarch/include/uapi/asm/kvm_para.h @@ -15,6 +15,7 @@ #define CPUCFG_KVM_FEATURE (CPUCFG_KVM_BASE + 4) #define KVM_FEATURE_IPI 1 #define KVM_FEATURE_STEAL_TIME 2 +#define KVM_FEATURE_PREEMPT 3 /* BIT 24 - 31 are features configurable by user space vmm */ #define KVM_FEATURE_VIRT_EXTIOI 24 #define KVM_FEATURE_USER_HCALL 25 diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c 
index 4f0d10f52b99..550c0d05666a 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -181,6 +181,11 @@ static void kvm_update_stolen_time(struct kvm_vcpu *vcpu) } st = (struct kvm_steal_time __user *)ghc->hva; + if (kvm_guest_has_pv_feature(vcpu, KVM_FEATURE_PREEMPT)) { + unsafe_put_user(0, &st->preempted, out); + vcpu->arch.st.preempted = 0; + } + unsafe_get_user(version, &st->version, out); if (version & 1) version += 1; /* first time write, random junk */ @@ -1795,11 +1800,57 @@ out: return 0; } +static void kvm_vcpu_set_pv_preempted(struct kvm_vcpu *vcpu) +{ + gpa_t gpa; + struct gfn_to_hva_cache *ghc; + struct kvm_memslots *slots; + struct kvm_steal_time __user *st; + + gpa = vcpu->arch.st.guest_addr; + if (!(gpa & KVM_STEAL_PHYS_VALID)) + return; + + /* vCPU may be preempted for many times */ + if (vcpu->arch.st.preempted) + return; + + /* This happens on process exit */ + if (unlikely(current->mm != vcpu->kvm->mm)) + return; + + gpa &= KVM_STEAL_PHYS_MASK; + ghc = &vcpu->arch.st.cache; + slots = kvm_memslots(vcpu->kvm); + if (slots->generation != ghc->generation || gpa != ghc->gpa) { + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st))) { + ghc->gpa = INVALID_GPA; + return; + } + } + + st = (struct kvm_steal_time __user *)ghc->hva; + unsafe_put_user(KVM_VCPU_PREEMPTED, &st->preempted, out); + vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; +out: + mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); +} + void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - int cpu; + int cpu, idx; unsigned long flags; + if (vcpu->preempted && kvm_guest_has_pv_feature(vcpu, KVM_FEATURE_PREEMPT)) { + /* + * Take the srcu lock as memslots will be accessed to check + * the gfn cache generation against the memslots generation. + */ + idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_set_pv_preempted(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + } + local_irq_save(flags); cpu = smp_processor_id(); vcpu->arch.last_sched_cpu = cpu; diff --git a/arch/loongarch/kvm/vm.c b/arch/loongarch/kvm/vm.c index d3ff6d6966f8..9681ade890c6 100644 --- a/arch/loongarch/kvm/vm.c +++ b/arch/loongarch/kvm/vm.c @@ -52,7 +52,9 @@ static void kvm_vm_init_features(struct kvm *kvm) kvm->arch.pv_features = BIT(KVM_FEATURE_IPI); kvm->arch.kvm_features = BIT(KVM_LOONGARCH_VM_FEAT_PV_IPI); if (kvm_pvtime_supported()) { + kvm->arch.pv_features |= BIT(KVM_FEATURE_PREEMPT); kvm->arch.pv_features |= BIT(KVM_FEATURE_STEAL_TIME); + kvm->arch.kvm_features |= BIT(KVM_LOONGARCH_VM_FEAT_PV_PREEMPT); kvm->arch.kvm_features |= BIT(KVM_LOONGARCH_VM_FEAT_PV_STEALTIME); } } @@ -154,6 +156,7 @@ static int kvm_vm_feature_has_attr(struct kvm *kvm, struct kvm_device_attr *attr case KVM_LOONGARCH_VM_FEAT_MSGINT: case KVM_LOONGARCH_VM_FEAT_PMU: case KVM_LOONGARCH_VM_FEAT_PV_IPI: + case KVM_LOONGARCH_VM_FEAT_PV_PREEMPT: case KVM_LOONGARCH_VM_FEAT_PV_STEALTIME: if (kvm_vm_support(&kvm->arch, attr->attr)) return 0; From 872d277a1e2f8a8b83dc9f21151af9cb64e8f1ce Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 6 Feb 2026 09:28:01 +0800 Subject: [PATCH 249/267] LoongArch: KVM: Add paravirt vcpu_is_preempted() support in guest side Function vcpu_is_preempted() is used to check whether vCPU is preempted or not. Here add the implementation with vcpu_is_preempted() when option CONFIG_PARAVIRT is enabled. 
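For context, the typical consumer of this hint is a lock's optimistic-spin path: spinning on a lock owner only pays off while the owner's vCPU is actually running. The sketch below is illustrative only and is not part of this series; spin_on_owner() and the task fields it touches mirror existing generic kernel patterns, and the names are assumptions rather than LoongArch code.

	/*
	 * Minimal sketch (not from this series): how an optimistic-spin loop
	 * can consume vcpu_is_preempted() so it stops burning cycles on a
	 * lock owner whose vCPU the host has preempted.
	 */
	static bool spin_on_owner(struct task_struct *owner)
	{
		while (READ_ONCE(owner->on_cpu)) {
			/* Stop spinning if we should reschedule ourselves. */
			if (need_resched())
				return false;

			/* A preempted owner cannot make progress; give up. */
			if (vcpu_is_preempted(task_cpu(owner)))
				return false;

			cpu_relax();
		}

		return true;
	}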
Acked-by: Juergen Gross Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/qspinlock.h | 4 ++++ arch/loongarch/kernel/paravirt.c | 21 ++++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/arch/loongarch/include/asm/qspinlock.h b/arch/loongarch/include/asm/qspinlock.h index e76d3aa1e1eb..66244801db67 100644 --- a/arch/loongarch/include/asm/qspinlock.h +++ b/arch/loongarch/include/asm/qspinlock.h @@ -34,6 +34,10 @@ __retry: return true; } +#define vcpu_is_preempted vcpu_is_preempted + +bool vcpu_is_preempted(int cpu); + #endif /* CONFIG_PARAVIRT */ #include diff --git a/arch/loongarch/kernel/paravirt.c b/arch/loongarch/kernel/paravirt.c index b1b51f920b23..7f1561906e10 100644 --- a/arch/loongarch/kernel/paravirt.c +++ b/arch/loongarch/kernel/paravirt.c @@ -12,6 +12,7 @@ static int has_steal_clock; struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); +static DEFINE_STATIC_KEY_FALSE(virt_preempt_key); DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key); static u64 native_steal_clock(int cpu) @@ -267,6 +268,18 @@ static int pv_time_cpu_down_prepare(unsigned int cpu) return 0; } + +bool vcpu_is_preempted(int cpu) +{ + struct kvm_steal_time *src; + + if (!static_branch_unlikely(&virt_preempt_key)) + return false; + + src = &per_cpu(steal_time, cpu); + return !!(src->preempted & KVM_VCPU_PREEMPTED); +} +EXPORT_SYMBOL(vcpu_is_preempted); #endif static void pv_cpu_reboot(void *unused) @@ -308,6 +321,9 @@ int __init pv_time_init(void) pr_err("Failed to install cpu hotplug callbacks\n"); return r; } + + if (kvm_para_has_feature(KVM_FEATURE_PREEMPT)) + static_branch_enable(&virt_preempt_key); #endif static_call_update(pv_steal_clock, paravt_steal_clock); @@ -318,7 +334,10 @@ int __init pv_time_init(void) static_key_slow_inc(¶virt_steal_rq_enabled); #endif - pr_info("Using paravirt steal-time\n"); + if (static_key_enabled(&virt_preempt_key)) + pr_info("Using paravirt steal-time with preempt enabled\n"); + else + pr_info("Using paravirt steal-time with preempt disabled\n"); return 0; } From 2d94a3f7088b69ae25e27fb98d7f1ef572c843f9 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Fri, 6 Feb 2026 09:28:01 +0800 Subject: [PATCH 250/267] KVM: LoongArch: selftests: Add steal time test case LoongArch KVM supports steal time accounting now, here add steal time test case on LoongArch. 
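The version field in struct kvm_steal_time acts as an even/odd sequence counter: the hypervisor makes it odd while the record is being updated and even again once the update is complete, which is why the test asserts the guest never observes an odd value. As a rough illustration (not code from this test), a real guest consumer samples the record along these lines, retrying until it sees a stable even version:

	/*
	 * Illustrative sketch only: sampling a steal-time record published
	 * with an even/odd version counter. An odd version means an update
	 * is in flight, so the reader retries.
	 */
	static u64 read_steal_time(volatile struct kvm_steal_time *st)
	{
		u32 version;
		u64 steal;

		do {
			version = st->version;
			/* Read the version before the payload. */
			smp_rmb();
			steal = st->steal;
			/* Read the payload before the version re-check. */
			smp_rmb();
		} while ((version & 1) || version != st->version);

		return steal;
	}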
Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- tools/testing/selftests/kvm/Makefile.kvm | 1 + tools/testing/selftests/kvm/steal_time.c | 96 ++++++++++++++++++++++++ 2 files changed, 97 insertions(+) diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index ba5c2b643efa..a18c00f1a4fa 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -228,6 +228,7 @@ TEST_GEN_PROGS_loongarch += kvm_page_table_test TEST_GEN_PROGS_loongarch += memslot_modification_stress_test TEST_GEN_PROGS_loongarch += memslot_perf_test TEST_GEN_PROGS_loongarch += set_memory_region_test +TEST_GEN_PROGS_loongarch += steal_time SPLIT_TESTS += arch_timer SPLIT_TESTS += get-reg-list diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index 8edc1fca345b..7be8adfe5dd3 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -301,6 +301,102 @@ static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) pr_info("\n"); } +#elif defined(__loongarch__) + +/* steal_time must have 64-byte alignment */ +#define STEAL_TIME_SIZE ((sizeof(struct kvm_steal_time) + 63) & ~63) +#define KVM_STEAL_PHYS_VALID BIT_ULL(0) + +struct kvm_steal_time { + __u64 steal; + __u32 version; + __u32 flags; + __u8 preempted; + __u8 pad[47]; +}; + +static void check_status(struct kvm_steal_time *st) +{ + GUEST_ASSERT(!(READ_ONCE(st->version) & 1)); + GUEST_ASSERT_EQ(READ_ONCE(st->flags), 0); + GUEST_ASSERT_EQ(READ_ONCE(st->preempted), 0); +} + +static void guest_code(int cpu) +{ + uint32_t version; + struct kvm_steal_time *st = st_gva[cpu]; + + memset(st, 0, sizeof(*st)); + GUEST_SYNC(0); + + check_status(st); + WRITE_ONCE(guest_stolen_time[cpu], st->steal); + version = READ_ONCE(st->version); + check_status(st); + GUEST_SYNC(1); + + check_status(st); + GUEST_ASSERT(version < READ_ONCE(st->version)); + WRITE_ONCE(guest_stolen_time[cpu], st->steal); + check_status(st); + GUEST_DONE(); +} + +static bool is_steal_time_supported(struct kvm_vcpu *vcpu) +{ + int err; + uint64_t val; + struct kvm_device_attr attr = { + .group = KVM_LOONGARCH_VCPU_CPUCFG, + .attr = CPUCFG_KVM_FEATURE, + .addr = (uint64_t)&val, + }; + + err = __vcpu_ioctl(vcpu, KVM_HAS_DEVICE_ATTR, &attr); + if (err) + return false; + + err = __vcpu_ioctl(vcpu, KVM_GET_DEVICE_ATTR, &attr); + if (err) + return false; + + return val & BIT(KVM_FEATURE_STEAL_TIME); +} + +static void steal_time_init(struct kvm_vcpu *vcpu, uint32_t i) +{ + int err; + uint64_t st_gpa; + struct kvm_vm *vm = vcpu->vm; + struct kvm_device_attr attr = { + .group = KVM_LOONGARCH_VCPU_PVTIME_CTRL, + .attr = KVM_LOONGARCH_VCPU_PVTIME_GPA, + .addr = (uint64_t)&st_gpa, + }; + + /* ST_GPA_BASE is identity mapped */ + st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE); + sync_global_to_guest(vm, st_gva[i]); + + err = __vcpu_ioctl(vcpu, KVM_HAS_DEVICE_ATTR, &attr); + TEST_ASSERT(err == 0, "No PV stealtime Feature"); + + st_gpa = (unsigned long)st_gva[i] | KVM_STEAL_PHYS_VALID; + err = __vcpu_ioctl(vcpu, KVM_SET_DEVICE_ATTR, &attr); + TEST_ASSERT(err == 0, "Fail to set PV stealtime GPA"); +} + +static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) +{ + struct kvm_steal_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpu_idx]); + + ksft_print_msg("VCPU%d:\n", vcpu_idx); + ksft_print_msg(" steal: %lld\n", st->steal); + ksft_print_msg(" flags: %d\n", st->flags); + ksft_print_msg(" version: %d\n", st->version); + ksft_print_msg(" 
preempted: %d\n", st->preempted); +} #endif static void *do_steal_time(void *arg) From f7ab71f178d56447e5efb55b65436feb68662f8f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 6 Feb 2026 10:17:30 +0100 Subject: [PATCH 251/267] KVM: s390: Add explicit padding to struct kvm_s390_keyop The newly added structure causes a warning about implied padding: ./usr/include/linux/kvm.h:1247:1: error: padding struct size to alignment boundary with 6 bytes [-Werror=padded] The padding can lead to leaking kernel data and ABI incompatibilities when used on x86. Neither of these is a problem in this specific patch, but it's best to avoid it and use explicit padding fields in general. Fixes: 0ee4ddc1647b ("KVM: s390: Storage key manipulation IOCTL") Signed-off-by: Arnd Bergmann Reviewed-by: Claudio Imbrenda Signed-off-by: Claudio Imbrenda --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ab3d3d96e75f..d4250ab662fe 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1227,6 +1227,7 @@ struct kvm_s390_keyop { __u64 guest_addr; __u8 key; __u8 operation; + __u8 pad[6]; }; /* From 1c0180cd42839fe7a60a61412b7c28cd43553c6b Mon Sep 17 00:00:00 2001 From: Qiang Ma Date: Mon, 29 Dec 2025 15:25:30 +0800 Subject: [PATCH 252/267] RISC-V: KVM: Remove unnecessary 'ret' assignment If execution reaches the "ret = 0" assignment in kvm_riscv_vcpu_pmu_event_info(), then kvm_vcpu_write_guest() returned 0, hence ret is already zero and does not need to be assigned 0. Fixes: e309fd113b9f ("RISC-V: KVM: Implement get event info function") Signed-off-by: Qiang Ma Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20251229072530.3075496-1-maqianga@uniontech.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_pmu.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c index a2fae70ee174..4d8d5e9aa53d 100644 --- a/arch/riscv/kvm/vcpu_pmu.c +++ b/arch/riscv/kvm/vcpu_pmu.c @@ -494,12 +494,9 @@ int kvm_riscv_vcpu_pmu_event_info(struct kvm_vcpu *vcpu, unsigned long saddr_low } ret = kvm_vcpu_write_guest(vcpu, shmem, einfo, shmem_size); - if (ret) { + if (ret) ret = SBI_ERR_INVALID_ADDRESS; - goto free_mem; - } - ret = 0; free_mem: kfree(einfo); out: From 11366ead4f1412befea660777576c89a1bac0c1e Mon Sep 17 00:00:00 2001 From: Jiakai Xu Date: Sun, 25 Jan 2026 14:33:44 +0000 Subject: [PATCH 253/267] RISC-V: KVM: Fix null pointer dereference in kvm_riscv_aia_imsic_has_attr() Add a null pointer check for imsic_state before dereferencing it in kvm_riscv_aia_imsic_has_attr(). While the function checks that the vcpu exists, it doesn't verify that the vcpu's imsic_state has been initialized, leading to a null pointer dereference when accessed. This issue was discovered during fuzzing of RISC-V KVM code. The crash occurs when userspace calls KVM_HAS_DEVICE_ATTR ioctl on an AIA IMSIC device before the IMSIC state has been fully initialized for a vcpu. The crash manifests as: Unable to handle kernel paging request at virtual address dfffffff00000001 ... epc : kvm_riscv_aia_imsic_has_attr+0x464/0x50e arch/riscv/kvm/aia_imsic.c:998 ... kvm_riscv_aia_imsic_has_attr+0x464/0x50e arch/riscv/kvm/aia_imsic.c:998 aia_has_attr+0x128/0x2bc arch/riscv/kvm/aia_device.c:471 kvm_device_ioctl_attr virt/kvm/kvm_main.c:4722 [inline] kvm_device_ioctl+0x296/0x374 virt/kvm/kvm_main.c:4739 ...
The fix adds a check to return -ENODEV if imsic_state is NULL, which is consistent with other error handling in the function and prevents the null pointer dereference. Fixes: 5463091a51cf ("RISC-V: KVM: Expose IMSIC registers as attributes of AIA irqchip") Signed-off-by: Jiakai Xu Signed-off-by: Jiakai Xu Reviewed-by: Nutty Liu Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20260125143344.2515451-1-xujiakai2025@iscas.ac.cn Signed-off-by: Anup Patel --- arch/riscv/kvm/aia_imsic.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c index e597e86491c3..cd070d83663a 100644 --- a/arch/riscv/kvm/aia_imsic.c +++ b/arch/riscv/kvm/aia_imsic.c @@ -993,8 +993,11 @@ int kvm_riscv_aia_imsic_has_attr(struct kvm *kvm, unsigned long type) if (!vcpu) return -ENODEV; - isel = KVM_DEV_RISCV_AIA_IMSIC_GET_ISEL(type); imsic = vcpu->arch.aia_context.imsic_state; + if (!imsic) + return -ENODEV; + + isel = KVM_DEV_RISCV_AIA_IMSIC_GET_ISEL(type); return imsic_mrif_isel_check(imsic->nr_eix, isel); } From aeb1d17d1af5924f7357d7204a293bd8fc06ea13 Mon Sep 17 00:00:00 2001 From: Jiakai Xu Date: Tue, 27 Jan 2026 07:22:19 +0000 Subject: [PATCH 254/267] RISC-V: KVM: Fix null pointer dereference in kvm_riscv_aia_imsic_rw_attr() Add a null pointer check for imsic_state before dereferencing it in kvm_riscv_aia_imsic_rw_attr(). While the function checks that the vcpu exists, it doesn't verify that the vcpu's imsic_state has been initialized, leading to a null pointer dereference when accessed. The crash manifests as: Unable to handle kernel paging request at virtual address dfffffff00000006 ... kvm_riscv_aia_imsic_rw_attr+0x2d8/0x854 arch/riscv/kvm/aia_imsic.c:958 aia_set_attr+0x2ee/0x1726 arch/riscv/kvm/aia_device.c:354 kvm_device_ioctl_attr virt/kvm/kvm_main.c:4744 [inline] kvm_device_ioctl+0x296/0x374 virt/kvm/kvm_main.c:4761 vfs_ioctl fs/ioctl.c:51 [inline] ... The fix adds a check to return -ENODEV if imsic_state is NULL and moves isel assignment after imsic_state NULL check. Fixes: 5463091a51cfaa ("RISC-V: KVM: Expose IMSIC registers as attributes of AIA irqchip") Signed-off-by: Jiakai Xu Signed-off-by: Jiakai Xu Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20260127072219.3366607-1-xujiakai2025@iscas.ac.cn Signed-off-by: Anup Patel --- arch/riscv/kvm/aia_imsic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c index cd070d83663a..7edfd129242f 100644 --- a/arch/riscv/kvm/aia_imsic.c +++ b/arch/riscv/kvm/aia_imsic.c @@ -952,8 +952,10 @@ int kvm_riscv_aia_imsic_rw_attr(struct kvm *kvm, unsigned long type, if (!vcpu) return -ENODEV; - isel = KVM_DEV_RISCV_AIA_IMSIC_GET_ISEL(type); imsic = vcpu->arch.aia_context.imsic_state; + if (!imsic) + return -ENODEV; + isel = KVM_DEV_RISCV_AIA_IMSIC_GET_ISEL(type); read_lock_irqsave(&imsic->vsfile_lock, flags); From 003b9dae53ae795930c54945e0b0a41bbd527b44 Mon Sep 17 00:00:00 2001 From: Jiakai Xu Date: Tue, 27 Jan 2026 08:43:13 +0000 Subject: [PATCH 255/267] RISC-V: KVM: Skip IMSIC update if vCPU IMSIC state is not initialized kvm_riscv_vcpu_aia_imsic_update() assumes that the vCPU IMSIC state has already been initialized and unconditionally accesses imsic->vsfile_lock. However, in fuzzed ioctl sequences, the AIA device may be initialized at the VM level while the per-vCPU IMSIC state is still NULL. This leads to invalid access when entering the vCPU run loop before IMSIC initialization has completed. 
The crash manifests as: Unable to handle kernel paging request at virtual address dfffffff00000006 ... kvm_riscv_vcpu_aia_imsic_update arch/riscv/kvm/aia_imsic.c:801 kvm_riscv_vcpu_aia_update arch/riscv/kvm/aia_device.c:493 kvm_arch_vcpu_ioctl_run arch/riscv/kvm/vcpu.c:927 ... Add a guard to skip the IMSIC update path when imsic_state is NULL. This allows the vCPU run loop to continue safely. This issue was discovered during fuzzing of RISC-V KVM code. Fixes: db8b7e97d6137a ("RISC-V: KVM: Add in-kernel virtualization of AIA IMSIC") Signed-off-by: Jiakai Xu Signed-off-by: Jiakai Xu Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20260127084313.3496485-1-xujiakai2025@iscas.ac.cn Signed-off-by: Anup Patel --- arch/riscv/kvm/aia_imsic.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c index 7edfd129242f..60da5fa155a6 100644 --- a/arch/riscv/kvm/aia_imsic.c +++ b/arch/riscv/kvm/aia_imsic.c @@ -797,6 +797,10 @@ int kvm_riscv_vcpu_aia_imsic_update(struct kvm_vcpu *vcpu) if (kvm->arch.aia.mode == KVM_DEV_RISCV_AIA_MODE_EMUL) return 1; + /* IMSIC vCPU state may not be initialized yet */ + if (!imsic) + return 1; + /* Read old IMSIC VS-file details */ read_lock_irqsave(&imsic->vsfile_lock, flags); old_vsfile_hgei = imsic->vsfile_hgei; From f326e846ff89c82083c09fa495457442c252983e Mon Sep 17 00:00:00 2001 From: Pincheng Wang Date: Wed, 27 Aug 2025 00:29:38 +0800 Subject: [PATCH 256/267] riscv: KVM: allow Zilsd and Zclsd extensions for Guest/VM Extend the KVM ISA extension ONE_REG interface to allow KVM user space to detect and enable Zilsd and Zclsd extensions for Guest/VM. Signed-off-by: Pincheng Wang Reviewed-by: Nutty Liu Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20250826162939.1494021-5-pincheng.plct@isrc.iscas.ac.cn Signed-off-by: Anup Patel --- arch/riscv/include/uapi/asm/kvm.h | 2 ++ arch/riscv/kvm/vcpu_onereg.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index 54f3ad7ed2e4..6c536b2823c3 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -192,6 +192,8 @@ enum KVM_RISCV_ISA_EXT_ID { KVM_RISCV_ISA_EXT_ZFBFMIN, KVM_RISCV_ISA_EXT_ZVFBFMIN, KVM_RISCV_ISA_EXT_ZVFBFWMA, + KVM_RISCV_ISA_EXT_ZCLSD, + KVM_RISCV_ISA_EXT_ZILSD, KVM_RISCV_ISA_EXT_MAX, }; diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c index 865dae903aa0..347c0a902d28 100644 --- a/arch/riscv/kvm/vcpu_onereg.c +++ b/arch/riscv/kvm/vcpu_onereg.c @@ -63,6 +63,7 @@ static const unsigned long kvm_isa_ext_arr[] = { KVM_ISA_EXT_ARR(ZCB), KVM_ISA_EXT_ARR(ZCD), KVM_ISA_EXT_ARR(ZCF), + KVM_ISA_EXT_ARR(ZCLSD), KVM_ISA_EXT_ARR(ZCMOP), KVM_ISA_EXT_ARR(ZFA), KVM_ISA_EXT_ARR(ZFBFMIN), @@ -79,6 +80,7 @@ static const unsigned long kvm_isa_ext_arr[] = { KVM_ISA_EXT_ARR(ZIHINTNTL), KVM_ISA_EXT_ARR(ZIHINTPAUSE), KVM_ISA_EXT_ARR(ZIHPM), + KVM_ISA_EXT_ARR(ZILSD), KVM_ISA_EXT_ARR(ZIMOP), KVM_ISA_EXT_ARR(ZKND), KVM_ISA_EXT_ARR(ZKNE), From ab2a7b7b6b8831348646688345c3209cdaee5d46 Mon Sep 17 00:00:00 2001 From: Pincheng Wang Date: Wed, 27 Aug 2025 00:29:39 +0800 Subject: [PATCH 257/267] KVM: riscv: selftests: add Zilsd and Zclsd extension to get-reg-list test The KVM RISC-V allows Zilsd and Zclsd extensions for Guest/VM so add this extension to get-reg-list test. 
Signed-off-by: Pincheng Wang Reviewed-by: Nutty Liu Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20250826162939.1494021-6-pincheng.plct@isrc.iscas.ac.cn Signed-off-by: Anup Patel --- tools/testing/selftests/kvm/riscv/get-reg-list.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c index cb54a56990a0..20cc7d9b65ed 100644 --- a/tools/testing/selftests/kvm/riscv/get-reg-list.c +++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c @@ -78,6 +78,7 @@ bool filter_reg(__u64 reg) case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZCB: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZCD: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZCF: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZCLSD: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZCMOP: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZFA: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZFBFMIN: @@ -94,6 +95,7 @@ bool filter_reg(__u64 reg) case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZIHINTNTL: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZIHINTPAUSE: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZIHPM: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZILSD: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZIMOP: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZKND: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZKNE: @@ -538,6 +540,7 @@ static const char *isa_ext_single_id_to_str(__u64 reg_off) KVM_ISA_EXT_ARR(ZCB), KVM_ISA_EXT_ARR(ZCD), KVM_ISA_EXT_ARR(ZCF), + KVM_ISA_EXT_ARR(ZCLSD), KVM_ISA_EXT_ARR(ZCMOP), KVM_ISA_EXT_ARR(ZFA), KVM_ISA_EXT_ARR(ZFBFMIN), @@ -554,6 +557,7 @@ static const char *isa_ext_single_id_to_str(__u64 reg_off) KVM_ISA_EXT_ARR(ZIHINTNTL), KVM_ISA_EXT_ARR(ZIHINTPAUSE), KVM_ISA_EXT_ARR(ZIHPM), + KVM_ISA_EXT_ARR(ZILSD), KVM_ISA_EXT_ARR(ZIMOP), KVM_ISA_EXT_ARR(ZKND), KVM_ISA_EXT_ARR(ZKNE), @@ -1179,6 +1183,7 @@ KVM_ISA_EXT_SIMPLE_CONFIG(zca, ZCA); KVM_ISA_EXT_SIMPLE_CONFIG(zcb, ZCB); KVM_ISA_EXT_SIMPLE_CONFIG(zcd, ZCD); KVM_ISA_EXT_SIMPLE_CONFIG(zcf, ZCF); +KVM_ISA_EXT_SIMPLE_CONFIG(zclsd, ZCLSD); KVM_ISA_EXT_SIMPLE_CONFIG(zcmop, ZCMOP); KVM_ISA_EXT_SIMPLE_CONFIG(zfa, ZFA); KVM_ISA_EXT_SIMPLE_CONFIG(zfbfmin, ZFBFMIN); @@ -1195,6 +1200,7 @@ KVM_ISA_EXT_SIMPLE_CONFIG(zifencei, ZIFENCEI); KVM_ISA_EXT_SIMPLE_CONFIG(zihintntl, ZIHINTNTL); KVM_ISA_EXT_SIMPLE_CONFIG(zihintpause, ZIHINTPAUSE); KVM_ISA_EXT_SIMPLE_CONFIG(zihpm, ZIHPM); +KVM_ISA_EXT_SIMPLE_CONFIG(zilsd, ZILSD); KVM_ISA_EXT_SIMPLE_CONFIG(zimop, ZIMOP); KVM_ISA_EXT_SIMPLE_CONFIG(zknd, ZKND); KVM_ISA_EXT_SIMPLE_CONFIG(zkne, ZKNE); @@ -1259,6 +1265,7 @@ struct vcpu_reg_list *vcpu_configs[] = { &config_zcb, &config_zcd, &config_zcf, + &config_zclsd, &config_zcmop, &config_zfa, &config_zfbfmin, @@ -1275,6 +1282,7 @@ struct vcpu_reg_list *vcpu_configs[] = { &config_zihintntl, &config_zihintpause, &config_zihpm, + &config_zilsd, &config_zimop, &config_zknd, &config_zkne, From 39ad809dd2579d9b7400bbc50a5b95d84527b75e Mon Sep 17 00:00:00 2001 From: Wu Fei Date: Wed, 5 Nov 2025 23:14:26 +0800 Subject: [PATCH 258/267] KVM: riscv: selftests: Add riscv vm satp modes Current vm modes cannot 
represent riscv guest modes precisely, here add all 9 combinations of P(56,40,41) x V(57,48,39). Also the default vm mode is detected on runtime instead of hardcoded one, which might not be supported on specific machine. Signed-off-by: Wu Fei Reviewed-by: Andrew Jones Reviewed-by: Nutty Liu Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20251105151442.28767-1-wu.fei9@sanechips.com.cn Signed-off-by: Anup Patel --- .../testing/selftests/kvm/include/kvm_util.h | 17 ++++- .../selftests/kvm/include/riscv/processor.h | 2 + tools/testing/selftests/kvm/lib/guest_modes.c | 41 +++++++++--- tools/testing/selftests/kvm/lib/kvm_util.c | 33 ++++++++++ .../selftests/kvm/lib/riscv/processor.c | 63 +++++++++++++++++-- 5 files changed, 142 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 81f4355ff28a..fe1bd661b1c1 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -186,6 +186,17 @@ enum vm_guest_mode { VM_MODE_P36V48_64K, VM_MODE_P47V47_16K, VM_MODE_P36V47_16K, + + VM_MODE_P56V57_4K, /* For riscv64 */ + VM_MODE_P56V48_4K, + VM_MODE_P56V39_4K, + VM_MODE_P50V57_4K, + VM_MODE_P50V48_4K, + VM_MODE_P50V39_4K, + VM_MODE_P41V57_4K, + VM_MODE_P41V48_4K, + VM_MODE_P41V39_4K, + NUM_VM_MODES, }; @@ -210,10 +221,10 @@ kvm_static_assert(sizeof(struct vm_shape) == sizeof(uint64_t)); shape; \ }) -#if defined(__aarch64__) - extern enum vm_guest_mode vm_mode_default; +#if defined(__aarch64__) + #define VM_MODE_DEFAULT vm_mode_default #define MIN_PAGE_SHIFT 12U #define ptes_per_page(page_size) ((page_size) / 8) @@ -236,7 +247,7 @@ extern enum vm_guest_mode vm_mode_default; #error "RISC-V 32-bit kvm selftests not supported" #endif -#define VM_MODE_DEFAULT VM_MODE_P40V48_4K +#define VM_MODE_DEFAULT vm_mode_default #define MIN_PAGE_SHIFT 12U #define ptes_per_page(page_size) ((page_size) / 8) diff --git a/tools/testing/selftests/kvm/include/riscv/processor.h b/tools/testing/selftests/kvm/include/riscv/processor.h index e58282488beb..4dade8c4d18e 100644 --- a/tools/testing/selftests/kvm/include/riscv/processor.h +++ b/tools/testing/selftests/kvm/include/riscv/processor.h @@ -192,4 +192,6 @@ static inline void local_irq_disable(void) csr_clear(CSR_SSTATUS, SR_SIE); } +unsigned long riscv64_get_satp_mode(void); + #endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/lib/guest_modes.c b/tools/testing/selftests/kvm/lib/guest_modes.c index b04901e55138..ce3099630397 100644 --- a/tools/testing/selftests/kvm/lib/guest_modes.c +++ b/tools/testing/selftests/kvm/lib/guest_modes.c @@ -4,7 +4,7 @@ */ #include "guest_modes.h" -#ifdef __aarch64__ +#if defined(__aarch64__) || defined(__riscv) #include "processor.h" enum vm_guest_mode vm_mode_default; #endif @@ -13,9 +13,11 @@ struct guest_mode guest_modes[NUM_VM_MODES]; void guest_modes_append_default(void) { -#ifndef __aarch64__ +#if !defined(__aarch64__) && !defined(__riscv) guest_mode_append(VM_MODE_DEFAULT, true); -#else +#endif + +#ifdef __aarch64__ { unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE); uint32_t ipa4k, ipa16k, ipa64k; @@ -74,11 +76,36 @@ void guest_modes_append_default(void) #ifdef __riscv { unsigned int sz = kvm_check_cap(KVM_CAP_VM_GPA_BITS); + unsigned long satp_mode = riscv64_get_satp_mode() << SATP_MODE_SHIFT; + int i; - if (sz >= 52) - guest_mode_append(VM_MODE_P52V48_4K, true); - if (sz >= 48) - guest_mode_append(VM_MODE_P48V48_4K, true); + switch (sz) { + case 59: 
+ guest_mode_append(VM_MODE_P56V57_4K, satp_mode >= SATP_MODE_57); + guest_mode_append(VM_MODE_P56V48_4K, satp_mode >= SATP_MODE_48); + guest_mode_append(VM_MODE_P56V39_4K, satp_mode >= SATP_MODE_39); + break; + case 50: + guest_mode_append(VM_MODE_P50V57_4K, satp_mode >= SATP_MODE_57); + guest_mode_append(VM_MODE_P50V48_4K, satp_mode >= SATP_MODE_48); + guest_mode_append(VM_MODE_P50V39_4K, satp_mode >= SATP_MODE_39); + break; + case 41: + guest_mode_append(VM_MODE_P41V57_4K, satp_mode >= SATP_MODE_57); + guest_mode_append(VM_MODE_P41V48_4K, satp_mode >= SATP_MODE_48); + guest_mode_append(VM_MODE_P41V39_4K, satp_mode >= SATP_MODE_39); + break; + default: + break; + } + + /* set the first supported mode as default */ + vm_mode_default = NUM_VM_MODES; + for (i = 0; vm_mode_default == NUM_VM_MODES && i < NUM_VM_MODES; i++) { + if (guest_modes[i].supported && guest_modes[i].enabled) + vm_mode_default = i; + } + TEST_ASSERT(vm_mode_default != NUM_VM_MODES, "No supported mode!"); } #endif } diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 8279b6ced8d2..174490e6cd10 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -209,6 +209,15 @@ const char *vm_guest_mode_string(uint32_t i) [VM_MODE_P36V48_64K] = "PA-bits:36, VA-bits:48, 64K pages", [VM_MODE_P47V47_16K] = "PA-bits:47, VA-bits:47, 16K pages", [VM_MODE_P36V47_16K] = "PA-bits:36, VA-bits:47, 16K pages", + [VM_MODE_P56V57_4K] = "PA-bits:56, VA-bits:57, 4K pages", + [VM_MODE_P56V48_4K] = "PA-bits:56, VA-bits:48, 4K pages", + [VM_MODE_P56V39_4K] = "PA-bits:56, VA-bits:39, 4K pages", + [VM_MODE_P50V57_4K] = "PA-bits:50, VA-bits:57, 4K pages", + [VM_MODE_P50V48_4K] = "PA-bits:50, VA-bits:48, 4K pages", + [VM_MODE_P50V39_4K] = "PA-bits:50, VA-bits:39, 4K pages", + [VM_MODE_P41V57_4K] = "PA-bits:41, VA-bits:57, 4K pages", + [VM_MODE_P41V48_4K] = "PA-bits:41, VA-bits:48, 4K pages", + [VM_MODE_P41V39_4K] = "PA-bits:41, VA-bits:39, 4K pages", }; _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES, "Missing new mode strings?"); @@ -236,6 +245,15 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = { [VM_MODE_P36V48_64K] = { 36, 48, 0x10000, 16 }, [VM_MODE_P47V47_16K] = { 47, 47, 0x4000, 14 }, [VM_MODE_P36V47_16K] = { 36, 47, 0x4000, 14 }, + [VM_MODE_P56V57_4K] = { 56, 57, 0x1000, 12 }, + [VM_MODE_P56V48_4K] = { 56, 48, 0x1000, 12 }, + [VM_MODE_P56V39_4K] = { 56, 39, 0x1000, 12 }, + [VM_MODE_P50V57_4K] = { 50, 57, 0x1000, 12 }, + [VM_MODE_P50V48_4K] = { 50, 48, 0x1000, 12 }, + [VM_MODE_P50V39_4K] = { 50, 39, 0x1000, 12 }, + [VM_MODE_P41V57_4K] = { 41, 57, 0x1000, 12 }, + [VM_MODE_P41V48_4K] = { 41, 48, 0x1000, 12 }, + [VM_MODE_P41V39_4K] = { 41, 39, 0x1000, 12 }, }; _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES, "Missing new mode params?"); @@ -338,6 +356,21 @@ struct kvm_vm *____vm_create(struct vm_shape shape) case VM_MODE_P44V64_4K: vm->pgtable_levels = 5; break; + case VM_MODE_P56V57_4K: + case VM_MODE_P50V57_4K: + case VM_MODE_P41V57_4K: + vm->pgtable_levels = 5; + break; + case VM_MODE_P56V48_4K: + case VM_MODE_P50V48_4K: + case VM_MODE_P41V48_4K: + vm->pgtable_levels = 4; + break; + case VM_MODE_P56V39_4K: + case VM_MODE_P50V39_4K: + case VM_MODE_P41V39_4K: + vm->pgtable_levels = 3; + break; default: TEST_FAIL("Unknown guest mode: 0x%x", vm->mode); } diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index 
2eac7d4b59e9..003693576225 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -8,6 +8,7 @@ #include #include +#include "guest_modes.h" #include "kvm_util.h" #include "processor.h" #include "ucall_common.h" @@ -197,22 +198,41 @@ void riscv_vcpu_mmu_setup(struct kvm_vcpu *vcpu) { struct kvm_vm *vm = vcpu->vm; unsigned long satp; + unsigned long satp_mode; + unsigned long max_satp_mode; /* * The RISC-V Sv48 MMU mode supports 56-bit physical address * for 48-bit virtual address with 4KB last level page size. */ switch (vm->mode) { - case VM_MODE_P52V48_4K: - case VM_MODE_P48V48_4K: - case VM_MODE_P40V48_4K: + case VM_MODE_P56V57_4K: + case VM_MODE_P50V57_4K: + case VM_MODE_P41V57_4K: + satp_mode = SATP_MODE_57; + break; + case VM_MODE_P56V48_4K: + case VM_MODE_P50V48_4K: + case VM_MODE_P41V48_4K: + satp_mode = SATP_MODE_48; + break; + case VM_MODE_P56V39_4K: + case VM_MODE_P50V39_4K: + case VM_MODE_P41V39_4K: + satp_mode = SATP_MODE_39; break; default: TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); } + max_satp_mode = vcpu_get_reg(vcpu, RISCV_CONFIG_REG(satp_mode)); + + if ((satp_mode >> SATP_MODE_SHIFT) > max_satp_mode) + TEST_FAIL("Unable to set satp mode 0x%lx, max mode 0x%lx\n", + satp_mode >> SATP_MODE_SHIFT, max_satp_mode); + satp = (vm->pgd >> PGTBL_PAGE_SIZE_SHIFT) & SATP_PPN; - satp |= SATP_MODE_48; + satp |= satp_mode; vcpu_set_reg(vcpu, RISCV_GENERAL_CSR_REG(satp), satp); } @@ -515,3 +535,38 @@ unsigned long get_host_sbi_spec_version(void) return ret.value; } + +void kvm_selftest_arch_init(void) +{ + /* + * riscv64 doesn't have a true default mode, so start by detecting the + * supported vm mode. + */ + guest_modes_append_default(); +} + +unsigned long riscv64_get_satp_mode(void) +{ + int kvm_fd, vm_fd, vcpu_fd, err; + uint64_t val; + struct kvm_one_reg reg = { + .id = RISCV_CONFIG_REG(satp_mode), + .addr = (uint64_t)&val, + }; + + kvm_fd = open_kvm_dev_path_or_exit(); + vm_fd = __kvm_ioctl(kvm_fd, KVM_CREATE_VM, NULL); + TEST_ASSERT(vm_fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, vm_fd)); + + vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0); + TEST_ASSERT(vcpu_fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VCPU, vcpu_fd)); + + err = ioctl(vcpu_fd, KVM_GET_ONE_REG, ®); + TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_GET_ONE_REG, vcpu_fd)); + + close(vcpu_fd); + close(vm_fd); + close(kvm_fd); + + return val; +} From 655d330c058f4e16de46d5c9b203008c630b59c8 Mon Sep 17 00:00:00 2001 From: Xu Lu Date: Mon, 20 Oct 2025 12:24:56 +0800 Subject: [PATCH 259/267] RISC-V: KVM: Allow Zalasr extensions for Guest/VM Extend the KVM ISA extension ONE_REG interface to allow KVM user space to detect and enable Zalasr extensions for Guest/VM. 
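As a rough userspace illustration (not taken from this patch), detecting and enabling a single ISA extension goes through the usual ONE_REG register id. The id composition below follows the KVM_REG_RISCV_ISA_EXT / KVM_REG_RISCV_ISA_SINGLE layout used elsewhere in this series, but treat the helper as a sketch rather than a drop-in snippet:

	/* Hedged sketch: probe an ISA extension, then ask KVM to enable it. */
	static int riscv_enable_isa_ext(int vcpu_fd, __u64 ext_id)
	{
		unsigned long val = 0;
		struct kvm_one_reg reg = {
			.id = KVM_REG_RISCV | KVM_REG_SIZE_ULONG |
			      KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE |
			      ext_id, /* e.g. KVM_RISCV_ISA_EXT_ZALASR */
			.addr = (unsigned long)&val,
		};

		/* GET first: an unknown id means the host does not support it. */
		if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg))
			return -1;

		val = 1;	/* request that the extension be enabled */
		return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
	}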
Signed-off-by: Xu Lu Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20251020042457.30915-5-luxu.kernel@bytedance.com Signed-off-by: Anup Patel --- arch/riscv/include/uapi/asm/kvm.h | 1 + arch/riscv/kvm/vcpu_onereg.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index 6c536b2823c3..6a89c1d00a72 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -194,6 +194,7 @@ enum KVM_RISCV_ISA_EXT_ID { KVM_RISCV_ISA_EXT_ZVFBFWMA, KVM_RISCV_ISA_EXT_ZCLSD, KVM_RISCV_ISA_EXT_ZILSD, + KVM_RISCV_ISA_EXT_ZALASR, KVM_RISCV_ISA_EXT_MAX, }; diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c index 347c0a902d28..e7ab6cb00646 100644 --- a/arch/riscv/kvm/vcpu_onereg.c +++ b/arch/riscv/kvm/vcpu_onereg.c @@ -50,6 +50,7 @@ static const unsigned long kvm_isa_ext_arr[] = { KVM_ISA_EXT_ARR(ZAAMO), KVM_ISA_EXT_ARR(ZABHA), KVM_ISA_EXT_ARR(ZACAS), + KVM_ISA_EXT_ARR(ZALASR), KVM_ISA_EXT_ARR(ZALRSC), KVM_ISA_EXT_ARR(ZAWRS), KVM_ISA_EXT_ARR(ZBA), @@ -189,6 +190,7 @@ static bool kvm_riscv_vcpu_isa_disable_allowed(unsigned long ext) case KVM_RISCV_ISA_EXT_ZAAMO: case KVM_RISCV_ISA_EXT_ZABHA: case KVM_RISCV_ISA_EXT_ZACAS: + case KVM_RISCV_ISA_EXT_ZALASR: case KVM_RISCV_ISA_EXT_ZALRSC: case KVM_RISCV_ISA_EXT_ZAWRS: case KVM_RISCV_ISA_EXT_ZBA: From 671995ff4c308fc1adf01727df670c83434ffb5a Mon Sep 17 00:00:00 2001 From: Xu Lu Date: Mon, 20 Oct 2025 12:29:04 +0800 Subject: [PATCH 260/267] RISC-V: KVM: selftests: Add Zalasr extensions to get-reg-list test The KVM RISC-V allows Zalasr extensions for Guest/VM so add this extension to get-reg-list test. Signed-off-by: Xu Lu Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20251020042904.32096-1-luxu.kernel@bytedance.com Signed-off-by: Anup Patel --- tools/testing/selftests/kvm/riscv/get-reg-list.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c index 20cc7d9b65ed..8d6b951434eb 100644 --- a/tools/testing/selftests/kvm/riscv/get-reg-list.c +++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c @@ -65,6 +65,7 @@ bool filter_reg(__u64 reg) case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZAAMO: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZABHA: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZACAS: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZALASR: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZALRSC: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZAWRS: case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZBA: @@ -527,6 +528,7 @@ static const char *isa_ext_single_id_to_str(__u64 reg_off) KVM_ISA_EXT_ARR(ZAAMO), KVM_ISA_EXT_ARR(ZABHA), KVM_ISA_EXT_ARR(ZACAS), + KVM_ISA_EXT_ARR(ZALASR), KVM_ISA_EXT_ARR(ZALRSC), KVM_ISA_EXT_ARR(ZAWRS), KVM_ISA_EXT_ARR(ZBA), @@ -1170,6 +1172,7 @@ KVM_ISA_EXT_SIMPLE_CONFIG(svvptc, SVVPTC); KVM_ISA_EXT_SIMPLE_CONFIG(zaamo, ZAAMO); KVM_ISA_EXT_SIMPLE_CONFIG(zabha, ZABHA); KVM_ISA_EXT_SIMPLE_CONFIG(zacas, ZACAS); +KVM_ISA_EXT_SIMPLE_CONFIG(zalasr, ZALASR); KVM_ISA_EXT_SIMPLE_CONFIG(zalrsc, ZALRSC); KVM_ISA_EXT_SIMPLE_CONFIG(zawrs, ZAWRS); KVM_ISA_EXT_SIMPLE_CONFIG(zba, ZBA); @@ -1253,6 +1256,7 @@ struct vcpu_reg_list *vcpu_configs[] = { &config_zabha, &config_zacas, &config_zalrsc, + &config_zalasr, &config_zawrs, 
&config_zba, &config_zbb, From ed7ae7a34bea06f81270866dfbe619a22518666b Mon Sep 17 00:00:00 2001 From: Jessica Liu Date: Thu, 27 Nov 2025 16:51:37 +0800 Subject: [PATCH 261/267] RISC-V: KVM: Transparent huge page support Use block mapping if backed by a THP, as implemented in architectures like ARM and x86_64. Signed-off-by: Jessica Liu Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20251127165137780QbUOVPKPAfWSGAFl5qtRy@zte.com.cn Signed-off-by: Anup Patel --- arch/riscv/kvm/mmu.c | 140 ++++++++++++++++++++++++++++++++++++++++ arch/riscv/mm/pgtable.c | 2 + 2 files changed, 142 insertions(+) diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 4ab06697bfc0..0b75eb2a1820 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -305,6 +305,142 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) return pte_young(ptep_get(ptep)); } +static bool fault_supports_gstage_huge_mapping(struct kvm_memory_slot *memslot, + unsigned long hva) +{ + hva_t uaddr_start, uaddr_end; + gpa_t gpa_start; + size_t size; + + size = memslot->npages * PAGE_SIZE; + uaddr_start = memslot->userspace_addr; + uaddr_end = uaddr_start + size; + + gpa_start = memslot->base_gfn << PAGE_SHIFT; + + /* + * Pages belonging to memslots that don't have the same alignment + * within a PMD for userspace and GPA cannot be mapped with g-stage + * PMD entries, because we'll end up mapping the wrong pages. + * + * Consider a layout like the following: + * + * memslot->userspace_addr: + * +-----+--------------------+--------------------+---+ + * |abcde|fgh vs-stage block | vs-stage block tv|xyz| + * +-----+--------------------+--------------------+---+ + * + * memslot->base_gfn << PAGE_SHIFT: + * +---+--------------------+--------------------+-----+ + * |abc|def g-stage block | g-stage block |tvxyz| + * +---+--------------------+--------------------+-----+ + * + * If we create those g-stage blocks, we'll end up with this incorrect + * mapping: + * d -> f + * e -> g + * f -> h + */ + if ((gpa_start & (PMD_SIZE - 1)) != (uaddr_start & (PMD_SIZE - 1))) + return false; + + /* + * Next, let's make sure we're not trying to map anything not covered + * by the memslot. This means we have to prohibit block size mappings + * for the beginning and end of a non-block aligned and non-block sized + * memory slot (illustrated by the head and tail parts of the + * userspace view above containing pages 'abcde' and 'xyz', + * respectively). + * + * Note that it doesn't matter if we do the check using the + * userspace_addr or the base_gfn, as both are equally aligned (per + * the check above) and equally sized. + */ + return (hva >= ALIGN(uaddr_start, PMD_SIZE)) && (hva < ALIGN_DOWN(uaddr_end, PMD_SIZE)); +} + +static int get_hva_mapping_size(struct kvm *kvm, + unsigned long hva) +{ + int size = PAGE_SIZE; + unsigned long flags; + pgd_t pgd; + p4d_t p4d; + pud_t pud; + pmd_t pmd; + + /* + * Disable IRQs to prevent concurrent tear down of host page tables, + * e.g. if the primary MMU promotes a P*D to a huge page and then frees + * the original page table. + */ + local_irq_save(flags); + + /* + * Read each entry once. As above, a non-leaf entry can be promoted to + * a huge page _during_ this walk. Re-reading the entry could send the + * walk into the weeks, e.g. p*d_leaf() returns false (sees the old + * value) and then p*d_offset() walks into the target huge page instead + * of the old page table (sees the new value). 
+ */ + pgd = pgdp_get(pgd_offset(kvm->mm, hva)); + if (pgd_none(pgd)) + goto out; + + p4d = p4dp_get(p4d_offset(&pgd, hva)); + if (p4d_none(p4d) || !p4d_present(p4d)) + goto out; + + pud = pudp_get(pud_offset(&p4d, hva)); + if (pud_none(pud) || !pud_present(pud)) + goto out; + + if (pud_leaf(pud)) { + size = PUD_SIZE; + goto out; + } + + pmd = pmdp_get(pmd_offset(&pud, hva)); + if (pmd_none(pmd) || !pmd_present(pmd)) + goto out; + + if (pmd_leaf(pmd)) + size = PMD_SIZE; + +out: + local_irq_restore(flags); + return size; +} + +static unsigned long transparent_hugepage_adjust(struct kvm *kvm, + struct kvm_memory_slot *memslot, + unsigned long hva, + kvm_pfn_t *hfnp, gpa_t *gpa) +{ + kvm_pfn_t hfn = *hfnp; + + /* + * Make sure the adjustment is done only for THP pages. Also make + * sure that the HVA and GPA are sufficiently aligned and that the + * block map is contained within the memslot. + */ + if (fault_supports_gstage_huge_mapping(memslot, hva)) { + int sz; + + sz = get_hva_mapping_size(kvm, hva); + if (sz < PMD_SIZE) + return sz; + + *gpa &= PMD_MASK; + hfn &= ~(PTRS_PER_PMD - 1); + *hfnp = hfn; + + return PMD_SIZE; + } + + return PAGE_SIZE; +} + int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, gpa_t gpa, unsigned long hva, bool is_write, struct kvm_gstage_mapping *out_map) @@ -398,6 +534,10 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, if (mmu_invalidate_retry(kvm, mmu_seq)) goto out_unlock; + /* Check if we are backed by a THP and thus use block mapping if possible */ + if (vma_pagesize == PAGE_SIZE) + vma_pagesize = transparent_hugepage_adjust(kvm, memslot, hva, &hfn, &gpa); + if (writable) { mark_page_dirty_in_slot(kvm, memslot, gfn); ret = kvm_riscv_gstage_map_page(&gstage, pcache, gpa, hfn << PAGE_SHIFT, diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c index 807c0a0de182..b02bf71db33d 100644 --- a/arch/riscv/mm/pgtable.c +++ b/arch/riscv/mm/pgtable.c @@ -47,6 +47,7 @@ pud_t *pud_offset(p4d_t *p4d, unsigned long address) return (pud_t *)p4d; } +EXPORT_SYMBOL_GPL(pud_offset); p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) { @@ -55,6 +56,7 @@ p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) return (p4d_t *)pgd; } +EXPORT_SYMBOL_GPL(p4d_offset); #endif #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP From 376e2f8cca2816c489a9196e65cc904d1a907fd2 Mon Sep 17 00:00:00 2001 From: Xu Lu Date: Sun, 4 Jan 2026 21:34:57 +0800 Subject: [PATCH 262/267] irqchip/riscv-imsic: Adjust the number of available guest irq files Currently, KVM assumes the minimum of implemented HGEIE bits and "BIT(gc->guest_index_bits) - 1" as the number of guest files available across all CPUs. This will not work when CPUs have different number of guest files because KVM may incorrectly allocate a guest file on a CPU with fewer guest files. To address above, during initialization, calculate the number of available guest interrupt files according to MMIO resources and constrain the number of guest interrupt files that can be allocated by KVM. 
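As a worked example with made-up numbers: with guest_index_bits = 3 the index space alone would allow BIT(3) - 1 = 7 guest files per hart, but if one hart's IMSIC MMIO resource only covers six IMSIC pages starting at that hart's supervisor file, that hart contributes (6 * IMSIC_MMIO_PAGE_SZ) / IMSIC_MMIO_PAGE_SZ - 1 = 5 guest files, so global->nr_guest_files ends up as min(7, 5) = 5 and KVM never tries to allocate guest file 6 or 7 on any CPU.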
Signed-off-by: Xu Lu Reviewed-by: Nutty Liu Reviewed-by: Anup Patel Acked-by: Thomas Gleixner Link: https://lore.kernel.org/r/20260104133457.57742-1-luxu.kernel@bytedance.com Signed-off-by: Anup Patel --- arch/riscv/kvm/aia.c | 2 +- drivers/irqchip/irq-riscv-imsic-state.c | 12 +++++++++++- include/linux/irqchip/riscv-imsic.h | 3 +++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kvm/aia.c b/arch/riscv/kvm/aia.c index dad318185660..cac3c2b51d72 100644 --- a/arch/riscv/kvm/aia.c +++ b/arch/riscv/kvm/aia.c @@ -630,7 +630,7 @@ int kvm_riscv_aia_init(void) */ if (gc) kvm_riscv_aia_nr_hgei = min((ulong)kvm_riscv_aia_nr_hgei, - BIT(gc->guest_index_bits) - 1); + gc->nr_guest_files); else kvm_riscv_aia_nr_hgei = 0; diff --git a/drivers/irqchip/irq-riscv-imsic-state.c b/drivers/irqchip/irq-riscv-imsic-state.c index b6cebfee9461..7566c8aa2d48 100644 --- a/drivers/irqchip/irq-riscv-imsic-state.c +++ b/drivers/irqchip/irq-riscv-imsic-state.c @@ -784,7 +784,7 @@ static int __init imsic_parse_fwnode(struct fwnode_handle *fwnode, int __init imsic_setup_state(struct fwnode_handle *fwnode, void *opaque) { - u32 i, j, index, nr_parent_irqs, nr_mmios, nr_handlers = 0; + u32 i, j, index, nr_parent_irqs, nr_mmios, nr_guest_files, nr_handlers = 0; struct imsic_global_config *global; struct imsic_local_config *local; void __iomem **mmios_va = NULL; @@ -878,6 +878,7 @@ int __init imsic_setup_state(struct fwnode_handle *fwnode, void *opaque) } /* Configure handlers for target CPUs */ + global->nr_guest_files = BIT(global->guest_index_bits) - 1; for (i = 0; i < nr_parent_irqs; i++) { rc = imsic_get_parent_hartid(fwnode, i, &hartid); if (rc) { @@ -918,6 +919,15 @@ int __init imsic_setup_state(struct fwnode_handle *fwnode, void *opaque) local->msi_pa = mmios[index].start + reloff; local->msi_va = mmios_va[index] + reloff; + /* + * KVM uses global->nr_guest_files to determine the available guest + * interrupt files on each CPU. Take the minimum number of guest + * interrupt files across all CPUs to avoid KVM incorrectly allocating + * an unexisted or unmapped guest interrupt file on some CPUs. + */ + nr_guest_files = (resource_size(&mmios[index]) - reloff) / IMSIC_MMIO_PAGE_SZ - 1; + global->nr_guest_files = min(global->nr_guest_files, nr_guest_files); + nr_handlers++; } diff --git a/include/linux/irqchip/riscv-imsic.h b/include/linux/irqchip/riscv-imsic.h index 7f3ff5c5ea53..4b348836de7a 100644 --- a/include/linux/irqchip/riscv-imsic.h +++ b/include/linux/irqchip/riscv-imsic.h @@ -68,6 +68,9 @@ struct imsic_global_config { /* Number of guest interrupt identities */ u32 nr_guest_ids; + /* Number of guest interrupt files per core */ + u32 nr_guest_files; + /* Per-CPU IMSIC addresses */ struct imsic_local_config __percpu *local; }; From 898885477e0fa23d2e42b65bcb7c250215ecac37 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Fri, 6 Feb 2026 15:35:51 +0100 Subject: [PATCH 263/267] KVM: s390: Use guest address to mark guest page dirty Stop using the userspace address to mark the guest page dirty. mark_page_dirty() expects a guest frame number, but was being passed a host virtual frame number. When slot == NULL, mark_page_dirty_in_slot() does nothing and does not complain. This means that in some circumstances the dirtiness of the guest page might have been lost. Fix by adding two fields in struct kvm_s390_adapter_int to keep the guest addressses, and use those for mark_page_dirty(). 
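To make the failure mode concrete: mark_page_dirty() looks the memslot up by guest frame number, so the value shifted right by PAGE_SHIFT has to be a guest address; shifting a host virtual address finds no memslot and the dirty bit is dropped without any diagnostic. A simplified sketch of the generic helper (not the exact virt/kvm/kvm_main.c code) shows why the bug was silent:

	/*
	 * Simplified for illustration: when the gfn does not fall into any
	 * memslot, mark_page_dirty_in_slot() just returns, so an hva-derived
	 * "gfn" loses the dirty bit with no warning.
	 */
	void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
	{
		struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);

		mark_page_dirty_in_slot(kvm, memslot, gfn); /* no-op if memslot == NULL */
	}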
Fixes: f65470661f36 ("KVM: s390/interrupt: do not pin adapter interrupt pages") Reviewed-by: Steffen Eiden Reviewed-by: Janosch Frank Reviewed-by: Christoph Schlameuss Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/interrupt.c | 6 ++++-- include/linux/kvm_host.h | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index f55eca9aa638..1c2bb5cd7e12 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2768,13 +2768,13 @@ static int adapter_indicators_set(struct kvm *kvm, bit = get_ind_bit(adapter_int->ind_addr, adapter_int->ind_offset, adapter->swap); set_bit(bit, map); - mark_page_dirty(kvm, adapter_int->ind_addr >> PAGE_SHIFT); + mark_page_dirty(kvm, adapter_int->ind_gaddr >> PAGE_SHIFT); set_page_dirty_lock(ind_page); map = page_address(summary_page); bit = get_ind_bit(adapter_int->summary_addr, adapter_int->summary_offset, adapter->swap); summary_set = test_and_set_bit(bit, map); - mark_page_dirty(kvm, adapter_int->summary_addr >> PAGE_SHIFT); + mark_page_dirty(kvm, adapter_int->summary_gaddr >> PAGE_SHIFT); set_page_dirty_lock(summary_page); srcu_read_unlock(&kvm->srcu, idx); @@ -2870,7 +2870,9 @@ int kvm_set_routing_entry(struct kvm *kvm, if (kvm_is_error_hva(uaddr_s) || kvm_is_error_hva(uaddr_i)) return -EFAULT; e->adapter.summary_addr = uaddr_s; + e->adapter.summary_gaddr = ue->u.adapter.summary_addr; e->adapter.ind_addr = uaddr_i; + e->adapter.ind_gaddr = ue->u.adapter.ind_addr; e->adapter.summary_offset = ue->u.adapter.summary_offset; e->adapter.ind_offset = ue->u.adapter.ind_offset; e->adapter.adapter_id = ue->u.adapter.adapter_id; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d93f75b05ae2..deb36007480d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -645,7 +645,9 @@ static inline unsigned long *kvm_second_dirty_bitmap(struct kvm_memory_slot *mem struct kvm_s390_adapter_int { u64 ind_addr; + u64 ind_gaddr; u64 summary_addr; + u64 summary_gaddr; u64 ind_offset; u32 summary_offset; u32 adapter_id; From b6ab71a27c50942cfc10d12ca3f3c0cfb1634d19 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Fri, 6 Feb 2026 15:35:52 +0100 Subject: [PATCH 264/267] KVM: s390: vsie: Fix race in walk_guest_tables() It is possible that walk_guest_tables() is called on a shadow gmap that has been removed already, in which case its parent will be NULL. In such case, return -EAGAIN and let the callers deal with it. Fixes: e38c884df921 ("KVM: s390: Switch to new gmap") Acked-by: Janosch Frank Reviewed-by: Christoph Schlameuss Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/gaccess.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 67de47a81a87..4630b2a067ea 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -1287,7 +1287,10 @@ static int walk_guest_tables(struct gmap *sg, unsigned long saddr, struct pgtwal union asce asce; int rc; + if (!parent) + return -EAGAIN; kvm = parent->kvm; + WARN_ON(!kvm); asce = sg->guest_asce; entries = get_entries(w); From f8f296ea1c61ce98a03dd9ede370adb864c4cde3 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Fri, 6 Feb 2026 15:35:53 +0100 Subject: [PATCH 265/267] KVM: s390: vsie: Fix race in acquire_gmap_shadow() The shadow gmap returned by gmap_create_shadow() could get dropped before taking the gmap->children_lock. This meant that the shadow gmap was sometimes being used while its reference count was 0. 
Fix this by taking the additional reference inside gmap_create_shadow() while still holding gmap->children_lock, instead of afterwards. Fixes: e38c884df921 ("KVM: s390: Switch to new gmap") Reviewed-by: Christoph Schlameuss Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/gmap.c | 15 ++++++++++++--- arch/s390/kvm/vsie.c | 6 +++++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c index da222962ef6d..26cd2b208b6f 100644 --- a/arch/s390/kvm/gmap.c +++ b/arch/s390/kvm/gmap.c @@ -1179,6 +1179,8 @@ static int gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gma * The shadow table will be removed automatically on any change to the * PTE mapping for the source table. * + * The returned shadow gmap will be returned with one extra reference. + * * Return: A guest address space structure, ERR_PTR(-ENOMEM) if out of memory, * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the * parent gmap table could not be protected. @@ -1189,10 +1191,13 @@ struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *pare struct gmap *sg, *new; int rc; - scoped_guard(spinlock, &parent->children_lock) + scoped_guard(spinlock, &parent->children_lock) { sg = gmap_find_shadow(parent, asce, edat_level); - if (sg) - return sg; + if (sg) { + gmap_get(sg); + return sg; + } + } /* Create a new shadow gmap. */ new = gmap_new(parent->kvm, asce.r ? 1UL << (64 - PAGE_SHIFT) : asce_end(asce)); if (!new) @@ -1206,6 +1211,7 @@ struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *pare sg = gmap_find_shadow(parent, asce, edat_level); if (sg) { gmap_put(new); + gmap_get(sg); return sg; } if (asce.r) { @@ -1219,16 +1225,19 @@ struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *pare } gmap_add_child(parent, new); /* Nothing to protect, return right away. */ + gmap_get(new); return new; } } + gmap_get(new); new->parent = parent; /* Protect while inserting, protects against invalidation races. */ rc = gmap_protect_asce_top_level(mc, new); if (rc) { new->parent = NULL; gmap_put(new); + gmap_put(new); return ERR_PTR(rc); } return new; diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index f950ebb32812..d249b10044eb 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -1256,6 +1256,7 @@ static struct gmap *acquire_gmap_shadow(struct kvm_vcpu *vcpu, struct vsie_page release_gmap_shadow(vsie_page); } } +again: gmap = gmap_create_shadow(vcpu->arch.mc, vcpu->kvm->arch.gmap, asce, edat); if (IS_ERR(gmap)) return gmap; @@ -1263,11 +1264,14 @@ static struct gmap *acquire_gmap_shadow(struct kvm_vcpu *vcpu, struct vsie_page /* unlikely race condition, remove the previous shadow */ if (vsie_page->gmap_cache.gmap) release_gmap_shadow(vsie_page); + if (!gmap->parent) { + gmap_put(gmap); + goto again; + } vcpu->kvm->stat.gmap_shadow_create++; list_add(&vsie_page->gmap_cache.list, &gmap->scb_users); vsie_page->gmap_cache.gmap = gmap; prefix_unmapped(vsie_page); - gmap_get(gmap); } return gmap; } From 46aa186a08d87ce834c7bb68e46dc84bac1995c1 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Mon, 2 Feb 2026 15:45:57 +0100 Subject: [PATCH 266/267] MAINTAINERS: Replace backup for s390 vfio-pci Farhan has been doing a masterful job coming on in the s390 PCI space, and my own attention has been lacking. Let's make MAINTAINERS reflect reality. 
Signed-off-by: Eric Farman Acked-by: Alex Williamson Acked-by: Farhan Ali Acked-by: Matthew Rosato Acked-by: Christian Borntraeger Signed-off-by: Claudio Imbrenda --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 95448b485fd2..51063f30c577 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23072,7 +23072,8 @@ F: include/uapi/linux/vfio_ccw.h S390 VFIO-PCI DRIVER M: Matthew Rosato -M: Eric Farman +M: Farhan Ali +R: Eric Farman L: linux-s390@vger.kernel.org L: kvm@vger.kernel.org S: Supported From e3372ffb5f9e2dda3da259b768aab6271672b90d Mon Sep 17 00:00:00 2001 From: Steffen Eiden Date: Mon, 9 Feb 2026 16:29:25 +0100 Subject: [PATCH 267/267] KVM: s390: Increase permitted SE header size to 1 MiB Relax the maximum allowed Secure Execution (SE) header size from 8 KiB to 1 MiB. This allows individual secure guest images to run on a wider range of physical machines. Signed-off-by: Steffen Eiden Reviewed-by: Janosch Frank Reviewed-by: Claudio Imbrenda Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/kvm-s390.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 9f24252775dd..de645025db0f 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2589,9 +2589,9 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) if (copy_from_user(&parms, argp, sizeof(parms))) break; - /* Currently restricted to 8KB */ + /* Currently restricted to 1MiB */ r = -EINVAL; - if (parms.length > PAGE_SIZE * 2) + if (parms.length > SZ_1M) break; r = -ENOMEM;