From 6d4b8d052ff22742c3980fa45f26b0969a9b6163 Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Mon, 15 Dec 2025 10:25:18 -0800 Subject: [PATCH 01/55] perf/x86/intel/cstate: Add Wildcat Lake support Wildcat Lake (WCL) is a low-power variant of Panther Lake. From a C-state profiling perspective, it supports the same residency counters: CC1/CC6/CC7 and PC2/PC6/PC10. Signed-off-by: Zide Chen Signed-off-by: Ingo Molnar Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251215182520.115822-1-zide.chen@intel.com --- arch/x86/events/intel/cstate.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index fa67fda6e45b..b719b0a68a2a 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -41,7 +41,7 @@ * MSR_CORE_C1_RES: CORE C1 Residency Counter * perf code: 0x00 * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL - * MTL,SRF,GRR,ARL,LNL,PTL + * MTL,SRF,GRR,ARL,LNL,PTL,WCL * Scope: Core (each processor core has a MSR) * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter * perf code: 0x01 @@ -53,19 +53,19 @@ * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF, - * GRR,ARL,LNL,PTL + * GRR,ARL,LNL,PTL,WCL * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 * Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML, * ICL,TGL,RKL,ADL,RPL,MTL,ARL,LNL, - * PTL + * PTL,WCL * Scope: Core * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. * perf code: 0x00 * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, * KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL, - * RPL,SPR,MTL,ARL,LNL,SRF,PTL + * RPL,SPR,MTL,ARL,LNL,SRF,PTL,WCL * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. 
* perf code: 0x01 @@ -78,7 +78,7 @@ * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF, - * ARL,LNL,PTL + * ARL,LNL,PTL,WCL * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 @@ -97,7 +97,8 @@ * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. * perf code: 0x06 * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL, - * TNT,RKL,ADL,RPL,MTL,ARL,LNL,PTL + * TNT,RKL,ADL,RPL,MTL,ARL,LNL,PTL, + * WCL * Scope: Package (physical package) * MSR_MODULE_C6_RES_MS: Module C6 Residency Counter. * perf code: 0x00 @@ -654,6 +655,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_VFM(INTEL_ARROWLAKE_U, &adl_cstates), X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_cstates), X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &lnl_cstates), + X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &lnl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); From 7e760ac4617b63628edd55a96be2fc85b7eaa435 Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Mon, 15 Dec 2025 10:25:19 -0800 Subject: [PATCH 02/55] perf/x86/intel/cstate: Add Nova Lake support Similar to Lunar Lake and Panther Lake, Nova Lake supports CC1/CC6/CC7 and PC2/PC6/PC10 residency counters; it also adds support for MC6. 
Signed-off-by: Zide Chen Signed-off-by: Ingo Molnar Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251215182520.115822-2-zide.chen@intel.com --- arch/x86/events/intel/cstate.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index b719b0a68a2a..008f8ea59315 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -41,7 +41,7 @@ * MSR_CORE_C1_RES: CORE C1 Residency Counter * perf code: 0x00 * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL - * MTL,SRF,GRR,ARL,LNL,PTL,WCL + * MTL,SRF,GRR,ARL,LNL,PTL,WCL,NVL * Scope: Core (each processor core has a MSR) * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter * perf code: 0x01 @@ -53,19 +53,20 @@ * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF, - * GRR,ARL,LNL,PTL,WCL + * GRR,ARL,LNL,PTL,WCL,NVL * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 * Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML, * ICL,TGL,RKL,ADL,RPL,MTL,ARL,LNL, - * PTL,WCL + * PTL,WCL,NVL * Scope: Core * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. * perf code: 0x00 * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, * KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL, - * RPL,SPR,MTL,ARL,LNL,SRF,PTL,WCL + * RPL,SPR,MTL,ARL,LNL,SRF,PTL,WCL, + * NVL * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. * perf code: 0x01 @@ -78,7 +79,7 @@ * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF, - * ARL,LNL,PTL,WCL + * ARL,LNL,PTL,WCL,NVL * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. 
* perf code: 0x03 @@ -98,11 +99,11 @@ * perf code: 0x06 * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL, * TNT,RKL,ADL,RPL,MTL,ARL,LNL,PTL, - * WCL + * WCL,NVL * Scope: Package (physical package) * MSR_MODULE_C6_RES_MS: Module C6 Residency Counter. * perf code: 0x00 - * Available model: SRF,GRR + * Available model: SRF,GRR,NVL * Scope: A cluster of cores shared L2 cache * */ @@ -528,6 +529,18 @@ static const struct cstate_model lnl_cstates __initconst = { BIT(PERF_CSTATE_PKG_C10_RES), }; +static const struct cstate_model nvl_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | + BIT(PERF_CSTATE_CORE_C6_RES) | + BIT(PERF_CSTATE_CORE_C7_RES), + + .module_events = BIT(PERF_CSTATE_MODULE_C6_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + BIT(PERF_CSTATE_PKG_C6_RES) | + BIT(PERF_CSTATE_PKG_C10_RES), +}; + static const struct cstate_model slm_cstates __initconst = { .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | BIT(PERF_CSTATE_CORE_C6_RES), @@ -656,6 +669,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_cstates), X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &lnl_cstates), X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &lnl_cstates), + X86_MATCH_VFM(INTEL_NOVALAKE, &nvl_cstates), + X86_MATCH_VFM(INTEL_NOVALAKE_L, &nvl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); From 7ac422cf7b16ec524bcd8e017459e328a4103f63 Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Mon, 15 Dec 2025 10:25:20 -0800 Subject: [PATCH 03/55] perf/x86/intel/cstate: Add Diamond Rapids support From a C-state residency profiling perspective, Diamond Rapids is similar to SRF and GNR, supporting core C1/C6, module C6, and package C2/C6 residency counters. Similar to CWF, the C1E residency can be accessed via PMT only. 
Signed-off-by: Zide Chen Signed-off-by: Ingo Molnar Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251215182520.115822-3-zide.chen@intel.com --- arch/x86/events/intel/cstate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 008f8ea59315..1e2658b60d91 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -652,6 +652,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &icx_cstates), X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &icx_cstates), X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, &icx_cstates), + X86_MATCH_VFM(INTEL_DIAMONDRAPIDS_X, &srf_cstates), X86_MATCH_VFM(INTEL_TIGERLAKE_L, &icl_cstates), X86_MATCH_VFM(INTEL_TIGERLAKE, &icl_cstates), From b825444b6179eb071e66ca3da5ac12d4dbd808d5 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 5 Dec 2025 16:16:37 -0800 Subject: [PATCH 04/55] perf: Skip pmu_ctx based on event_type To optimize the cgroup context switch, the perf_event_pmu_context iteration skips the PMUs without cgroup events. A bool cgroup was introduced to indicate the case. It can work, but this way is hard to extend for other cases, e.g. skipping non-mediated PMUs. It doesn't make sense to keep adding bool variables. Pass the event_type instead of the specific bool variable. Check both the event_type and related pmu_ctx variables to decide whether to skip a PMU. Event flags, e.g., EVENT_CGROUP, should be cleared in the ctx->is_active. Add EVENT_FLAGS to indicate such event flags. No functional change. 
Signed-off-by: Kan Liang Signed-off-by: Mingwei Zhang Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Yongwei Ma Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-2-seanjc@google.com --- kernel/events/core.c | 74 ++++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index dad0d3d2e85f..406371ce45f2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -165,7 +165,7 @@ enum event_type_t { /* see ctx_resched() for details */ EVENT_CPU = 0x10, EVENT_CGROUP = 0x20, - + EVENT_FLAGS = EVENT_CGROUP, /* compound helpers */ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN, @@ -779,27 +779,37 @@ do { \ ___p; \ }) -#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \ +static bool perf_skip_pmu_ctx(struct perf_event_pmu_context *pmu_ctx, + enum event_type_t event_type) +{ + if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups) + return true; + return false; +} + +#define for_each_epc(_epc, _ctx, _pmu, _event_type) \ list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \ - if (_cgroup && !_epc->nr_cgroups) \ + if (perf_skip_pmu_ctx(_epc, _event_type)) \ continue; \ else if (_pmu && _epc->pmu != _pmu) \ continue; \ else -static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) +static void perf_ctx_disable(struct perf_event_context *ctx, + enum event_type_t event_type) { struct perf_event_pmu_context *pmu_ctx; - for_each_epc(pmu_ctx, ctx, NULL, cgroup) + for_each_epc(pmu_ctx, ctx, NULL, event_type) perf_pmu_disable(pmu_ctx->pmu); } -static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) +static void perf_ctx_enable(struct perf_event_context *ctx, + enum event_type_t event_type) { struct perf_event_pmu_context *pmu_ctx; - for_each_epc(pmu_ctx, ctx, NULL, cgroup) + for_each_epc(pmu_ctx, ctx, NULL, event_type) 
perf_pmu_enable(pmu_ctx->pmu); } @@ -964,8 +974,7 @@ static void perf_cgroup_switch(struct task_struct *task) return; WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); - - perf_ctx_disable(&cpuctx->ctx, true); + perf_ctx_disable(&cpuctx->ctx, EVENT_CGROUP); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); /* @@ -981,7 +990,7 @@ static void perf_cgroup_switch(struct task_struct *task) */ ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); - perf_ctx_enable(&cpuctx->ctx, true); + perf_ctx_enable(&cpuctx->ctx, EVENT_CGROUP); } static int perf_cgroup_ensure_storage(struct perf_event *event, @@ -2902,11 +2911,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, event_type &= EVENT_ALL; - for_each_epc(epc, &cpuctx->ctx, pmu, false) + for_each_epc(epc, &cpuctx->ctx, pmu, 0) perf_pmu_disable(epc->pmu); if (task_ctx) { - for_each_epc(epc, task_ctx, pmu, false) + for_each_epc(epc, task_ctx, pmu, 0) perf_pmu_disable(epc->pmu); task_ctx_sched_out(task_ctx, pmu, event_type); @@ -2926,11 +2935,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, perf_event_sched_in(cpuctx, task_ctx, pmu); - for_each_epc(epc, &cpuctx->ctx, pmu, false) + for_each_epc(epc, &cpuctx->ctx, pmu, 0) perf_pmu_enable(epc->pmu); if (task_ctx) { - for_each_epc(epc, task_ctx, pmu, false) + for_each_epc(epc, task_ctx, pmu, 0) perf_pmu_enable(epc->pmu); } } @@ -3479,11 +3488,10 @@ static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + enum event_type_t active_type = event_type & ~EVENT_FLAGS; struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; - bool cgroup = event_type & EVENT_CGROUP; - event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); @@ -3514,7 +3522,7 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t * see __load_acquire() in perf_event_time_now() */ barrier(); - ctx->is_active &= ~event_type; 
+ ctx->is_active &= ~active_type; if (!(ctx->is_active & EVENT_ALL)) { /* @@ -3535,7 +3543,7 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t is_active ^= ctx->is_active; /* changed bits */ - for_each_epc(pmu_ctx, ctx, pmu, cgroup) + for_each_epc(pmu_ctx, ctx, pmu, event_type) __pmu_ctx_sched_out(pmu_ctx, is_active); } @@ -3691,7 +3699,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */ if (local_read(&ctx->nr_no_switch_fast) || @@ -3715,7 +3723,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) perf_ctx_sched_task_cb(ctx, task, false); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); /* * RCU_INIT_POINTER here is safe because we've not @@ -3739,13 +3747,13 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); inside_switch: perf_ctx_sched_task_cb(ctx, task, false); task_ctx_sched_out(ctx, NULL, EVENT_ALL); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); raw_spin_unlock(&ctx->lock); } } @@ -4054,11 +4062,9 @@ static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + enum event_type_t active_type = event_type & ~EVENT_FLAGS; struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; - bool cgroup = event_type & EVENT_CGROUP; - - event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); @@ -4076,7 +4082,7 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t barrier(); } - ctx->is_active |= (event_type | EVENT_TIME); + ctx->is_active |= active_type | EVENT_TIME; if (ctx->task) { if (!(is_active & EVENT_ALL)) 
cpuctx->task_ctx = ctx; @@ -4091,13 +4097,13 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t * in order to give them the best chance of going on. */ if (is_active & EVENT_PINNED) { - for_each_epc(pmu_ctx, ctx, pmu, cgroup) + for_each_epc(pmu_ctx, ctx, pmu, event_type) __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED); } /* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) { - for_each_epc(pmu_ctx, ctx, pmu, cgroup) + for_each_epc(pmu_ctx, ctx, pmu, event_type) __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE); } } @@ -4114,11 +4120,11 @@ static void perf_event_context_sched_in(struct task_struct *task) if (cpuctx->task_ctx == ctx) { perf_ctx_lock(cpuctx, ctx); - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); perf_ctx_sched_task_cb(ctx, task, true); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); perf_ctx_unlock(cpuctx, ctx); goto rcu_unlock; } @@ -4131,7 +4137,7 @@ static void perf_event_context_sched_in(struct task_struct *task) if (!ctx->nr_events) goto unlock; - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -4141,7 +4147,7 @@ static void perf_event_context_sched_in(struct task_struct *task) * events, no need to flip the cpuctx's events around. 
*/ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { - perf_ctx_disable(&cpuctx->ctx, false); + perf_ctx_disable(&cpuctx->ctx, 0); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE); } @@ -4150,9 +4156,9 @@ static void perf_event_context_sched_in(struct task_struct *task) perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true); if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) - perf_ctx_enable(&cpuctx->ctx, false); + perf_ctx_enable(&cpuctx->ctx, 0); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); unlock: perf_ctx_unlock(cpuctx, ctx); From b9e52b11d2e5e403afaf69a7f8d6b29f8380ed38 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 5 Dec 2025 16:16:38 -0800 Subject: [PATCH 05/55] perf: Add generic exclude_guest support Only KVM knows the exact time when a guest is entering/exiting. Expose two interfaces to KVM to switch the ownership of the PMU resources. All the pinned events must be scheduled in first. Extend the perf_event_sched_in() helper to support extra flag, e.g., EVENT_GUEST. Signed-off-by: Kan Liang Signed-off-by: Mingwei Zhang Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-3-seanjc@google.com --- kernel/events/core.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 406371ce45f2..fab358daa42e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2870,14 +2870,15 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, - struct pmu *pmu) + struct pmu *pmu, + enum event_type_t event_type) { - ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED | event_type); if (ctx) - ctx_sched_in(ctx, pmu, EVENT_PINNED); - ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); + ctx_sched_in(ctx, pmu, EVENT_PINNED | event_type); + 
ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE | event_type); if (ctx) - ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE); + ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE | event_type); } /* @@ -2933,7 +2934,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, else if (event_type & EVENT_PINNED) ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, task_ctx, pmu); + perf_event_sched_in(cpuctx, task_ctx, pmu, 0); for_each_epc(epc, &cpuctx->ctx, pmu, 0) perf_pmu_enable(epc->pmu); @@ -4151,7 +4152,7 @@ static void perf_event_context_sched_in(struct task_struct *task) ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE); } - perf_event_sched_in(cpuctx, ctx, NULL); + perf_event_sched_in(cpuctx, ctx, NULL, 0); perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true); From 991bdf7e9d6cc74c1de215d1a05c23ff61076bf0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:16:39 -0800 Subject: [PATCH 06/55] perf: Move security_perf_event_free() call to __free_event() Move the freeing of any security state associated with a perf event from _free_event() to __free_event(), i.e. invoke security_perf_event_free() in the error paths for perf_event_alloc(). This will allow adding potential error paths in perf_event_alloc() that can occur after allocating security state. Note, kfree() and thus security_perf_event_free() is a nop if event->security is NULL, i.e. calling security_perf_event_free() even if security_perf_event_alloc() fails or is never reached is functionally ok. 
Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-4-seanjc@google.com --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index fab358daa42e..6973483d0dfa 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5601,6 +5601,8 @@ static void __free_event(struct perf_event *event) { struct pmu *pmu = event->pmu; + security_perf_event_free(event); + if (event->attach_state & PERF_ATTACH_CALLCHAIN) put_callchain_buffers(); @@ -5664,8 +5666,6 @@ static void _free_event(struct perf_event *event) unaccount_event(event); - security_perf_event_free(event); - if (event->rb) { /* * Can happen when we close an event with re-directed output. From eff95e170275d9e80b968f335cd03d0ac250d2d1 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 5 Dec 2025 16:16:40 -0800 Subject: [PATCH 07/55] perf: Add APIs to create/release mediated guest vPMUs Currently, exposing PMU capabilities to a KVM guest is done by emulating guest PMCs via host perf events, i.e. by having KVM be "just" another user of perf. As a result, the guest and host are effectively competing for resources, and emulating guest accesses to vPMU resources requires expensive actions (expensive relative to the native instruction). The overhead and resource competition results in degraded guest performance and ultimately very poor vPMU accuracy. To address the issues with the perf-emulated vPMU, introduce a "mediated vPMU", where the data plane (PMCs and enable/disable knobs) is exposed directly to the guest, but the control plane (event selectors and access to fixed counters) is managed by KVM (via MSR interceptions). To allow host perf usage of the PMU to (partially) co-exist with KVM/guest usage of the PMU, KVM and perf will coordinate to a world switch between host perf context and guest vPMU context near VM-Enter/VM-Exit. 
Add two exported APIs, perf_{create,release}_mediated_pmu(), to allow KVM to create and release a mediated PMU instance (per VM). Because host perf context will be deactivated while the guest is running, mediated PMU usage will be mutually exclusive with perf analysis of the guest, i.e. perf events that do NOT exclude the guest will not behave as expected. To avoid silent failure of !exclude_guest perf events, disallow creating a mediated PMU if there are active !exclude_guest events, and on the perf side, disallow creating new !exclude_guest perf events while there is at least one active mediated PMU. Exempt PMU resources that do not support mediated PMU usage, i.e. that are outside the scope/view of KVM's vPMU and will not be swapped out while the guest is running. Guard mediated PMU with a new kconfig to help readers identify code paths that are unique to mediated PMU support, and to allow for adding arch-specific hooks without stubs. KVM x86 is expected to be the only KVM architecture to support a mediated PMU in the near future (e.g. arm64 is trending toward a partitioned PMU implementation), and KVM x86 will select PERF_GUEST_MEDIATED_PMU unconditionally, i.e. won't need stubs. Immediately select PERF_GUEST_MEDIATED_PMU when KVM x86 is enabled so that all paths are compile tested. Full KVM support is on its way... 
[sean: add kconfig and WARNing, rewrite changelog, swizzle patch ordering] Suggested-by: Sean Christopherson Signed-off-by: Kan Liang Signed-off-by: Mingwei Zhang Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-5-seanjc@google.com --- arch/x86/kvm/Kconfig | 1 + include/linux/perf_event.h | 6 +++ init/Kconfig | 4 ++ kernel/events/core.c | 82 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 93 insertions(+) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 278f08194ec8..d916bd766c94 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -37,6 +37,7 @@ config KVM_X86 select SCHED_INFO select PERF_EVENTS select GUEST_PERF_EVENTS + select PERF_GUEST_MEDIATED_PMU select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_NO_POLL diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9870d768db4c..31929da6e711 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -305,6 +305,7 @@ struct perf_event_pmu_context; #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 #define PERF_PMU_CAP_AUX_PAUSE 0x0200 #define PERF_PMU_CAP_AUX_PREFER_LARGE 0x0400 +#define PERF_PMU_CAP_MEDIATED_VPMU 0x0800 /** * pmu::scope @@ -1914,6 +1915,11 @@ extern int perf_event_account_interrupt(struct perf_event *event); extern int perf_event_period(struct perf_event *event, u64 value); extern u64 perf_event_pause(struct perf_event *event, bool reset); +#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU +int perf_create_mediated_pmu(void); +void perf_release_mediated_pmu(void); +#endif + #else /* !CONFIG_PERF_EVENTS: */ static inline void * diff --git a/init/Kconfig b/init/Kconfig index fa79feb8fe57..6628ff295cb8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2061,6 +2061,10 @@ config GUEST_PERF_EVENTS bool depends on HAVE_PERF_EVENTS +config PERF_GUEST_MEDIATED_PMU + bool + depends on GUEST_PERF_EVENTS + config PERF_USE_VMALLOC bool help 
diff --git a/kernel/events/core.c b/kernel/events/core.c index 6973483d0dfa..5a2166ba6138 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5656,6 +5656,8 @@ static void __free_event(struct perf_event *event) call_rcu(&event->rcu_head, free_event_rcu); } +static void mediated_pmu_unaccount_event(struct perf_event *event); + DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T)) /* vs perf_event_alloc() success */ @@ -5665,6 +5667,7 @@ static void _free_event(struct perf_event *event) irq_work_sync(&event->pending_disable_irq); unaccount_event(event); + mediated_pmu_unaccount_event(event); if (event->rb) { /* @@ -6187,6 +6190,81 @@ u64 perf_event_pause(struct perf_event *event, bool reset) } EXPORT_SYMBOL_GPL(perf_event_pause); +#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU +static atomic_t nr_include_guest_events __read_mostly; + +static atomic_t nr_mediated_pmu_vms __read_mostly; +static DEFINE_MUTEX(perf_mediated_pmu_mutex); + +/* !exclude_guest event of PMU with PERF_PMU_CAP_MEDIATED_VPMU */ +static inline bool is_include_guest_event(struct perf_event *event) +{ + if ((event->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU) && + !event->attr.exclude_guest) + return true; + + return false; +} + +static int mediated_pmu_account_event(struct perf_event *event) +{ + if (!is_include_guest_event(event)) + return 0; + + guard(mutex)(&perf_mediated_pmu_mutex); + + if (atomic_read(&nr_mediated_pmu_vms)) + return -EOPNOTSUPP; + + atomic_inc(&nr_include_guest_events); + return 0; +} + +static void mediated_pmu_unaccount_event(struct perf_event *event) +{ + if (!is_include_guest_event(event)) + return; + + atomic_dec(&nr_include_guest_events); +} + +/* + * Currently invoked at VM creation to + * - Check whether there are existing !exclude_guest events of PMU with + * PERF_PMU_CAP_MEDIATED_VPMU + * - Set nr_mediated_pmu_vms to prevent !exclude_guest event creation on + * PMUs with PERF_PMU_CAP_MEDIATED_VPMU + * + * No impact for the PMU without 
PERF_PMU_CAP_MEDIATED_VPMU. The perf + * still owns all the PMU resources. + */ +int perf_create_mediated_pmu(void) +{ + guard(mutex)(&perf_mediated_pmu_mutex); + if (atomic_inc_not_zero(&nr_mediated_pmu_vms)) + return 0; + + if (atomic_read(&nr_include_guest_events)) + return -EBUSY; + + atomic_inc(&nr_mediated_pmu_vms); + return 0; +} +EXPORT_SYMBOL_GPL(perf_create_mediated_pmu); + +void perf_release_mediated_pmu(void) +{ + if (WARN_ON_ONCE(!atomic_read(&nr_mediated_pmu_vms))) + return; + + atomic_dec(&nr_mediated_pmu_vms); +} +EXPORT_SYMBOL_GPL(perf_release_mediated_pmu); +#else +static int mediated_pmu_account_event(struct perf_event *event) { return 0; } +static void mediated_pmu_unaccount_event(struct perf_event *event) {} +#endif + /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block @@ -13147,6 +13225,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (err) return ERR_PTR(err); + err = mediated_pmu_account_event(event); + if (err) + return ERR_PTR(err); + /* symmetric to unaccount_event() in _free_event() */ account_event(event); From f5c7de8f84a152d559256aa4d0fc953118b73ca4 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 5 Dec 2025 16:16:41 -0800 Subject: [PATCH 08/55] perf: Clean up perf ctx time The current perf tracks two timestamps for the normal ctx and cgroup. The same type of variables and similar codes are used to track the timestamps. In the following patch, the third timestamp to track the guest time will be introduced. To avoid the code duplication, add a new struct perf_time_ctx and factor out a generic function update_perf_time_ctx(). No functional change. 
Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Mingwei Zhang Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-6-seanjc@google.com --- include/linux/perf_event.h | 13 +++---- kernel/events/core.c | 70 +++++++++++++++++--------------------- 2 files changed, 39 insertions(+), 44 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 31929da6e711..d5aa1bc3f088 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -999,6 +999,11 @@ struct perf_event_groups { u64 index; }; +struct perf_time_ctx { + u64 time; + u64 stamp; + u64 offset; +}; /** * struct perf_event_context - event context structure @@ -1037,9 +1042,7 @@ struct perf_event_context { /* * Context clock, runs when context enabled. */ - u64 time; - u64 timestamp; - u64 timeoffset; + struct perf_time_ctx time; /* * These fields let us detect when two contexts have both @@ -1172,9 +1175,7 @@ struct bpf_perf_event_data_kern { * This is a per-cpu dynamically allocated data structure. 
*/ struct perf_cgroup_info { - u64 time; - u64 timestamp; - u64 timeoffset; + struct perf_time_ctx time; int active; }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 5a2166ba6138..95f118230ff5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -816,6 +816,24 @@ static void perf_ctx_enable(struct perf_event_context *ctx, static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); +static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, bool adv) +{ + if (adv) + time->time += now - time->stamp; + time->stamp = now; + + /* + * The above: time' = time + (now - timestamp), can be re-arranged + * into: time` = now + (time - timestamp), which gives a single value + * offset to compute future time without locks on. + * + * See perf_event_time_now(), which can be used from NMI context where + * it's (obviously) not possible to acquire ctx->lock in order to read + * both the above values in a consistent manner. 
+ */ + WRITE_ONCE(time->offset, time->time - time->stamp); +} + #ifdef CONFIG_CGROUP_PERF static inline bool @@ -857,7 +875,7 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event) struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); - return t->time; + return t->time.time; } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) @@ -866,22 +884,11 @@ static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) t = per_cpu_ptr(event->cgrp->info, event->cpu); if (!__load_acquire(&t->active)) - return t->time; - now += READ_ONCE(t->timeoffset); + return t->time.time; + now += READ_ONCE(t->time.offset); return now; } -static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) -{ - if (adv) - info->time += now - info->timestamp; - info->timestamp = now; - /* - * see update_context_time() - */ - WRITE_ONCE(info->timeoffset, info->time - info->timestamp); -} - static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { struct perf_cgroup *cgrp = cpuctx->cgrp; @@ -895,7 +902,7 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - __update_cgrp_time(info, now, true); + update_perf_time_ctx(&info->time, now, true); if (final) __store_release(&info->active, 0); } @@ -918,7 +925,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) * Do not update time when cgroup is not active */ if (info->active) - __update_cgrp_time(info, perf_clock(), true); + update_perf_time_ctx(&info->time, perf_clock(), true); } static inline void @@ -942,7 +949,7 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - __update_cgrp_time(info, ctx->timestamp, false); + 
update_perf_time_ctx(&info->time, ctx->time.stamp, false); __store_release(&info->active, 1); } } @@ -1563,20 +1570,7 @@ static void __update_context_time(struct perf_event_context *ctx, bool adv) lockdep_assert_held(&ctx->lock); - if (adv) - ctx->time += now - ctx->timestamp; - ctx->timestamp = now; - - /* - * The above: time' = time + (now - timestamp), can be re-arranged - * into: time` = now + (time - timestamp), which gives a single value - * offset to compute future time without locks on. - * - * See perf_event_time_now(), which can be used from NMI context where - * it's (obviously) not possible to acquire ctx->lock in order to read - * both the above values in a consistent manner. - */ - WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); + update_perf_time_ctx(&ctx->time, now, adv); } static void update_context_time(struct perf_event_context *ctx) @@ -1594,7 +1588,7 @@ static u64 perf_event_time(struct perf_event *event) if (is_cgroup_event(event)) return perf_cgroup_event_time(event); - return ctx->time; + return ctx->time.time; } static u64 perf_event_time_now(struct perf_event *event, u64 now) @@ -1608,9 +1602,9 @@ static u64 perf_event_time_now(struct perf_event *event, u64 now) return perf_cgroup_event_time_now(event, now); if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) - return ctx->time; + return ctx->time.time; - now += READ_ONCE(ctx->timeoffset); + now += READ_ONCE(ctx->time.offset); return now; } @@ -12113,7 +12107,7 @@ static void task_clock_event_update(struct perf_event *event, u64 now) static void task_clock_event_start(struct perf_event *event, int flags) { event->hw.state = 0; - local64_set(&event->hw.prev_count, event->ctx->time); + local64_set(&event->hw.prev_count, event->ctx->time.time); perf_swevent_start_hrtimer(event); } @@ -12122,7 +12116,7 @@ static void task_clock_event_stop(struct perf_event *event, int flags) event->hw.state = PERF_HES_STOPPED; perf_swevent_cancel_hrtimer(event); if (flags & PERF_EF_UPDATE) - 
task_clock_event_update(event, event->ctx->time); + task_clock_event_update(event, event->ctx->time.time); } static int task_clock_event_add(struct perf_event *event, int flags) @@ -12142,8 +12136,8 @@ static void task_clock_event_del(struct perf_event *event, int flags) static void task_clock_event_read(struct perf_event *event) { u64 now = perf_clock(); - u64 delta = now - event->ctx->timestamp; - u64 time = event->ctx->time + delta; + u64 delta = now - event->ctx->time.stamp; + u64 time = event->ctx->time.time + delta; task_clock_event_update(event, time); } From 4593b4b6e218a0f21afbacc8124cf469d2d04094 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 5 Dec 2025 16:16:42 -0800 Subject: [PATCH 09/55] perf: Add a EVENT_GUEST flag Current perf doesn't explicitly schedule out all exclude_guest events while the guest is running. There is no problem with the current emulated vPMU, because perf owns all the PMU counters. It can mask the counter which is assigned to an exclude_guest event when a guest is running (Intel way), or set the corresponding HOSTONLY bit in eventsel (AMD way). The counter doesn't count when a guest is running. However, either way doesn't work with the introduced mediated vPMU. A guest owns all the PMU counters when it's running. The host should not mask any counters. The counter may be used by the guest. The eventsel may be overwritten. Perf should explicitly schedule out all exclude_guest events to release the PMU resources when entering a guest, and resume the counting when exiting the guest. It's possible that an exclude_guest event is created when a guest is running. The new event should not be scheduled in either. The ctx time is shared among different PMUs. The time cannot be stopped when a guest is running. It is required to calculate the time for events from other PMUs, e.g., uncore events. Add timeguest to track the guest run time. For an exclude_guest event, the elapsed time equals the ctx time - guest time.
Cgroup has dedicated times. Use the same method to deduct the guest time from the cgroup time as well. [sean: massage comments] Co-developed-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Mingwei Zhang Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-7-seanjc@google.com --- include/linux/perf_event.h | 6 + kernel/events/core.c | 234 ++++++++++++++++++++++++++++--------- 2 files changed, 187 insertions(+), 53 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d5aa1bc3f088..d9988e3fd557 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1044,6 +1044,11 @@ struct perf_event_context { */ struct perf_time_ctx time; + /* + * Context clock, runs when in the guest mode. + */ + struct perf_time_ctx timeguest; + /* * These fields let us detect when two contexts have both * been cloned (inherited) from a common ancestor. @@ -1176,6 +1181,7 @@ struct bpf_perf_event_data_kern { */ struct perf_cgroup_info { struct perf_time_ctx time; + struct perf_time_ctx timeguest; int active; }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 95f118230ff5..6781d39f3158 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -165,7 +165,19 @@ enum event_type_t { /* see ctx_resched() for details */ EVENT_CPU = 0x10, EVENT_CGROUP = 0x20, - EVENT_FLAGS = EVENT_CGROUP, + + /* + * EVENT_GUEST is set when scheduling in/out events between the host + * and a guest with a mediated vPMU. Among other things, EVENT_GUEST + * is used: + * + * - In for_each_epc() to skip PMUs that don't support events in a + * MEDIATED_VPMU guest, i.e. don't need to be context switched. + * - To indicate the start/end point of the events in a guest. Guest + * running time is deducted for host-only (exclude_guest) events. 
+ */ + EVENT_GUEST = 0x40, + EVENT_FLAGS = EVENT_CGROUP | EVENT_GUEST, /* compound helpers */ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN, @@ -458,6 +470,11 @@ static cpumask_var_t perf_online_pkg_mask; static cpumask_var_t perf_online_sys_mask; static struct kmem_cache *perf_event_cache; +static __always_inline bool is_guest_mediated_pmu_loaded(void) +{ + return false; +} + /* * perf event paranoia level: * -1 - not paranoid at all @@ -784,6 +801,9 @@ static bool perf_skip_pmu_ctx(struct perf_event_pmu_context *pmu_ctx, { if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups) return true; + if ((event_type & EVENT_GUEST) && + !(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU)) + return true; return false; } @@ -834,6 +854,39 @@ static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, boo WRITE_ONCE(time->offset, time->time - time->stamp); } +static_assert(offsetof(struct perf_event_context, timeguest) - + offsetof(struct perf_event_context, time) == + sizeof(struct perf_time_ctx)); + +#define T_TOTAL 0 +#define T_GUEST 1 + +static inline u64 __perf_event_time_ctx(struct perf_event *event, + struct perf_time_ctx *times) +{ + u64 time = times[T_TOTAL].time; + + if (event->attr.exclude_guest) + time -= times[T_GUEST].time; + + return time; +} + +static inline u64 __perf_event_time_ctx_now(struct perf_event *event, + struct perf_time_ctx *times, + u64 now) +{ + if (is_guest_mediated_pmu_loaded() && event->attr.exclude_guest) { + /* + * (now + times[total].offset) - (now + times[guest].offset) := + * times[total].offset - times[guest].offset + */ + return READ_ONCE(times[T_TOTAL].offset) - READ_ONCE(times[T_GUEST].offset); + } + + return now + READ_ONCE(times[T_TOTAL].offset); +} + #ifdef CONFIG_CGROUP_PERF static inline bool @@ -870,12 +923,16 @@ static inline int is_cgroup_event(struct perf_event *event) return event->cgrp != NULL; } +static_assert(offsetof(struct perf_cgroup_info, timeguest) 
- + offsetof(struct perf_cgroup_info, time) == + sizeof(struct perf_time_ctx)); + static inline u64 perf_cgroup_event_time(struct perf_event *event) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); - return t->time.time; + return __perf_event_time_ctx(event, &t->time); } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) @@ -884,9 +941,21 @@ static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) t = per_cpu_ptr(event->cgrp->info, event->cpu); if (!__load_acquire(&t->active)) - return t->time.time; - now += READ_ONCE(t->time.offset); - return now; + return __perf_event_time_ctx(event, &t->time); + + return __perf_event_time_ctx_now(event, &t->time, now); +} + +static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv) +{ + update_perf_time_ctx(&info->timeguest, now, adv); +} + +static inline void update_cgrp_time(struct perf_cgroup_info *info, u64 now) +{ + update_perf_time_ctx(&info->time, now, true); + if (is_guest_mediated_pmu_loaded()) + __update_cgrp_guest_time(info, now, true); } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) @@ -902,7 +971,7 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - update_perf_time_ctx(&info->time, now, true); + update_cgrp_time(info, now); if (final) __store_release(&info->active, 0); } @@ -925,11 +994,11 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) * Do not update time when cgroup is not active */ if (info->active) - update_perf_time_ctx(&info->time, perf_clock(), true); + update_cgrp_time(info, perf_clock()); } static inline void -perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) { struct perf_event_context *ctx = &cpuctx->ctx; struct 
perf_cgroup *cgrp = cpuctx->cgrp; @@ -949,8 +1018,12 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - update_perf_time_ctx(&info->time, ctx->time.stamp, false); - __store_release(&info->active, 1); + if (guest) { + __update_cgrp_guest_time(info, ctx->time.stamp, false); + } else { + update_perf_time_ctx(&info->time, ctx->time.stamp, false); + __store_release(&info->active, 1); + } } } @@ -1154,7 +1227,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, } static inline void -perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) { } @@ -1566,16 +1639,24 @@ static void perf_unpin_context(struct perf_event_context *ctx) */ static void __update_context_time(struct perf_event_context *ctx, bool adv) { - u64 now = perf_clock(); - lockdep_assert_held(&ctx->lock); - update_perf_time_ctx(&ctx->time, now, adv); + update_perf_time_ctx(&ctx->time, perf_clock(), adv); +} + +static void __update_context_guest_time(struct perf_event_context *ctx, bool adv) +{ + lockdep_assert_held(&ctx->lock); + + /* must be called after __update_context_time(); */ + update_perf_time_ctx(&ctx->timeguest, ctx->time.stamp, adv); } static void update_context_time(struct perf_event_context *ctx) { __update_context_time(ctx, true); + if (is_guest_mediated_pmu_loaded()) + __update_context_guest_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) @@ -1588,7 +1669,7 @@ static u64 perf_event_time(struct perf_event *event) if (is_cgroup_event(event)) return perf_cgroup_event_time(event); - return ctx->time.time; + return __perf_event_time_ctx(event, &ctx->time); } static u64 perf_event_time_now(struct perf_event *event, u64 now) @@ -1602,10 +1683,9 @@ static u64 perf_event_time_now(struct perf_event *event, u64 now) return 
perf_cgroup_event_time_now(event, now); if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) - return ctx->time.time; + return __perf_event_time_ctx(event, &ctx->time); - now += READ_ONCE(ctx->time.offset); - return now; + return __perf_event_time_ctx_now(event, &ctx->time, now); } static enum event_type_t get_event_type(struct perf_event *event) @@ -2425,20 +2505,23 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) } static inline void -__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final) +__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, + bool final, enum event_type_t event_type) { if (ctx->is_active & EVENT_TIME) { if (ctx->is_active & EVENT_FROZEN) return; + update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx, final); + /* vPMU should not stop time */ + update_cgrp_time_from_cpuctx(cpuctx, !(event_type & EVENT_GUEST) && final); } } static inline void ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - __ctx_time_update(cpuctx, ctx, false); + __ctx_time_update(cpuctx, ctx, false, 0); } /* @@ -3510,7 +3593,7 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t * * would only update time for the pinned events. */ - __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx); + __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx, event_type); /* * CPU-release for the below ->is_active store, @@ -3536,7 +3619,18 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t cpuctx->task_ctx = NULL; } - is_active ^= ctx->is_active; /* changed bits */ + if (event_type & EVENT_GUEST) { + /* + * Schedule out all exclude_guest events of PMU + * with PERF_PMU_CAP_MEDIATED_VPMU. 
+ */ + is_active = EVENT_ALL; + __update_context_guest_time(ctx, false); + perf_cgroup_set_timestamp(cpuctx, true); + barrier(); + } else { + is_active ^= ctx->is_active; /* changed bits */ + } for_each_epc(pmu_ctx, ctx, pmu, event_type) __pmu_ctx_sched_out(pmu_ctx, is_active); @@ -3995,10 +4089,15 @@ static inline void group_update_userpage(struct perf_event *group_event) event_update_userpage(event); } +struct merge_sched_data { + int can_add_hw; + enum event_type_t event_type; +}; + static int merge_sched_in(struct perf_event *event, void *data) { struct perf_event_context *ctx = event->ctx; - int *can_add_hw = data; + struct merge_sched_data *msd = data; if (event->state <= PERF_EVENT_STATE_OFF) return 0; @@ -4006,13 +4105,22 @@ static int merge_sched_in(struct perf_event *event, void *data) if (!event_filter_match(event)) return 0; - if (group_can_go_on(event, *can_add_hw)) { + /* + * Don't schedule in any host events from PMU with + * PERF_PMU_CAP_MEDIATED_VPMU, while a guest is running. 
+ */ + if (is_guest_mediated_pmu_loaded() && + event->pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU && + !(msd->event_type & EVENT_GUEST)) + return 0; + + if (group_can_go_on(event, msd->can_add_hw)) { if (!group_sched_in(event, ctx)) list_add_tail(&event->active_list, get_event_list(event)); } if (event->state == PERF_EVENT_STATE_INACTIVE) { - *can_add_hw = 0; + msd->can_add_hw = 0; if (event->attr.pinned) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); @@ -4035,11 +4143,15 @@ static int merge_sched_in(struct perf_event *event, void *data) static void pmu_groups_sched_in(struct perf_event_context *ctx, struct perf_event_groups *groups, - struct pmu *pmu) + struct pmu *pmu, + enum event_type_t event_type) { - int can_add_hw = 1; + struct merge_sched_data msd = { + .can_add_hw = 1, + .event_type = event_type, + }; visit_groups_merge(ctx, groups, smp_processor_id(), pmu, - merge_sched_in, &can_add_hw); + merge_sched_in, &msd); } static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, @@ -4048,9 +4160,9 @@ static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, struct perf_event_context *ctx = pmu_ctx->ctx; if (event_type & EVENT_PINNED) - pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu); + pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu, event_type); if (event_type & EVENT_FLEXIBLE) - pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu); + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu, event_type); } static void @@ -4067,9 +4179,11 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t return; if (!(is_active & EVENT_TIME)) { + /* EVENT_TIME should be active while the guest runs */ + WARN_ON_ONCE(event_type & EVENT_GUEST); /* start ctx time */ __update_context_time(ctx, false); - perf_cgroup_set_timestamp(cpuctx); + perf_cgroup_set_timestamp(cpuctx, false); /* * CPU-release for the below ->is_active store, * 
see __load_acquire() in perf_event_time_now() @@ -4085,7 +4199,23 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t WARN_ON_ONCE(cpuctx->task_ctx != ctx); } - is_active ^= ctx->is_active; /* changed bits */ + if (event_type & EVENT_GUEST) { + /* + * Schedule in the required exclude_guest events of PMU + * with PERF_PMU_CAP_MEDIATED_VPMU. + */ + is_active = event_type & EVENT_ALL; + + /* + * Update ctx time to set the new start time for + * the exclude_guest events. + */ + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx, false); + barrier(); + } else { + is_active ^= ctx->is_active; /* changed bits */ + } /* * First go through the list and put on any pinned groups @@ -4093,13 +4223,13 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t */ if (is_active & EVENT_PINNED) { for_each_epc(pmu_ctx, ctx, pmu, event_type) - __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED); + __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED | (event_type & EVENT_GUEST)); } /* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) { for_each_epc(pmu_ctx, ctx, pmu, event_type) - __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE); + __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE | (event_type & EVENT_GUEST)); } } @@ -6626,23 +6756,23 @@ void perf_event_update_userpage(struct perf_event *event) if (!rb) goto unlock; - /* - * compute total_time_enabled, total_time_running - * based on snapshot values taken when the event - * was last scheduled in. - * - * we cannot simply called update_context_time() - * because of locking issue as we can be called in - * NMI context - */ - calc_timer_values(event, &now, &enabled, &running); - - userpg = rb->user_page; /* * Disable preemption to guarantee consistent time stamps are stored to * the user page. */ preempt_disable(); + + /* + * Compute total_time_enabled, total_time_running based on snapshot + * values taken when the event was last scheduled in. 
+ * + * We cannot simply call update_context_time() because doing so would + * lead to deadlock when called from NMI context. + */ + calc_timer_values(event, &now, &enabled, &running); + + userpg = rb->user_page; + ++userpg->lock; barrier(); userpg->index = perf_event_index(event); @@ -7939,13 +8069,11 @@ static void perf_output_read(struct perf_output_handle *handle, u64 read_format = event->attr.read_format; /* - * compute total_time_enabled, total_time_running - * based on snapshot values taken when the event - * was last scheduled in. + * Compute total_time_enabled, total_time_running based on snapshot + * values taken when the event was last scheduled in. * - * we cannot simply called update_context_time() - * because of locking issue as we are called in - * NMI context + * We cannot simply call update_context_time() because doing so would + * lead to deadlock when called from NMI context. */ if (read_format & PERF_FORMAT_TOTAL_TIMES) calc_timer_values(event, &now, &enabled, &running); From 42457a7fb6cacca83be4deaf202ac3e45830daf2 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 5 Dec 2025 16:16:43 -0800 Subject: [PATCH 10/55] perf: Add APIs to load/put guest mediated PMU context Add exported APIs to load/put a guest mediated PMU context. KVM will load the guest PMU shortly before VM-Enter, and put the guest PMU shortly after VM-Exit. On the perf side of things, schedule out all exclude_guest events when the guest context is loaded, and schedule them back in when the guest context is put. I.e. yield the hardware PMU resources to the guest, by way of KVM. Note, perf is only responsible for managing host context. KVM is responsible for loading/storing guest state to/from hardware. 
[sean: shuffle patches around, write changelog] Suggested-by: Sean Christopherson Signed-off-by: Kan Liang Signed-off-by: Mingwei Zhang Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-8-seanjc@google.com --- include/linux/perf_event.h | 2 ++ kernel/events/core.c | 61 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d9988e3fd557..322cfa9f3d48 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1925,6 +1925,8 @@ extern u64 perf_event_pause(struct perf_event *event, bool reset); #ifdef CONFIG_PERF_GUEST_MEDIATED_PMU int perf_create_mediated_pmu(void); void perf_release_mediated_pmu(void); +void perf_load_guest_context(void); +void perf_put_guest_context(void); #endif #else /* !CONFIG_PERF_EVENTS: */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 6781d39f3158..bbb81a4a3196 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -470,10 +470,19 @@ static cpumask_var_t perf_online_pkg_mask; static cpumask_var_t perf_online_sys_mask; static struct kmem_cache *perf_event_cache; +#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU +static DEFINE_PER_CPU(bool, guest_ctx_loaded); + +static __always_inline bool is_guest_mediated_pmu_loaded(void) +{ + return __this_cpu_read(guest_ctx_loaded); +} +#else static __always_inline bool is_guest_mediated_pmu_loaded(void) { return false; } +#endif /* * perf event paranoia level: @@ -6384,6 +6393,58 @@ void perf_release_mediated_pmu(void) atomic_dec(&nr_mediated_pmu_vms); } EXPORT_SYMBOL_GPL(perf_release_mediated_pmu); + +/* When loading a guest's mediated PMU, schedule out all exclude_guest events. 
*/ +void perf_load_guest_context(void) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + + lockdep_assert_irqs_disabled(); + + guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); + + if (WARN_ON_ONCE(__this_cpu_read(guest_ctx_loaded))) + return; + + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST); + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST); + if (cpuctx->task_ctx) { + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST); + task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST); + } + + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST); + if (cpuctx->task_ctx) + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST); + + __this_cpu_write(guest_ctx_loaded, true); +} +EXPORT_SYMBOL_GPL(perf_load_guest_context); + +void perf_put_guest_context(void) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + + lockdep_assert_irqs_disabled(); + + guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); + + if (WARN_ON_ONCE(!__this_cpu_read(guest_ctx_loaded))) + return; + + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST); + if (cpuctx->task_ctx) + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST); + + perf_event_sched_in(cpuctx, cpuctx->task_ctx, NULL, EVENT_GUEST); + + if (cpuctx->task_ctx) + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST); + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST); + + __this_cpu_write(guest_ctx_loaded, false); +} +EXPORT_SYMBOL_GPL(perf_put_guest_context); #else static int mediated_pmu_account_event(struct perf_event *event) { return 0; } static void mediated_pmu_unaccount_event(struct perf_event *event) {} From a05385d84b2af64600fc84b027bea481e8f6261d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:16:44 -0800 Subject: [PATCH 11/55] perf/x86/core: Register a new vector for handling mediated guest PMIs Wire up system vector 0xf5 for handling PMIs (i.e. interrupts delivered through the LVTPC) while running KVM guests with a mediated PMU. Perf currently delivers all PMIs as NMIs, e.g. 
so that events that trigger while IRQs are disabled aren't delayed and generate useless records, but due to the multiplexing of NMIs throughout the system, correctly identifying NMIs for a mediated PMU is practically infeasible. To (greatly) simplify identifying guest mediated PMU PMIs, perf will switch the CPU's LVTPC between PERF_GUEST_MEDIATED_PMI_VECTOR and NMI when guest PMU context is loaded/put. I.e. PMIs that are generated by the CPU while the guest is active will be identified purely based on the IRQ vector. Route the vector through perf, e.g. as opposed to letting KVM attach a handler directly a la posted interrupt notification vectors, as perf owns the LVTPC and thus is the rightful owner of PERF_GUEST_MEDIATED_PMI_VECTOR. Functionally, having KVM directly own the vector would be fine (both KVM and perf will be completely aware of when a mediated PMU is active), but would lead to an undesirable split in ownership: perf would be responsible for installing the vector, but not handling the resulting IRQs. Add a new perf_guest_info_callbacks hook (and static call) to allow KVM to register its handler with perf when running guests with mediated PMUs. Note, because KVM always runs guests with host IRQs enabled, there is no danger of a PMI being delayed from the guest's perspective due to using a regular IRQ instead of an NMI. 
Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-9-seanjc@google.com --- arch/x86/entry/entry_fred.c | 1 + arch/x86/include/asm/hardirq.h | 3 +++ arch/x86/include/asm/idtentry.h | 6 ++++++ arch/x86/include/asm/irq_vectors.h | 4 +++- arch/x86/kernel/idt.c | 3 +++ arch/x86/kernel/irq.c | 19 +++++++++++++++++++ include/linux/perf_event.h | 8 ++++++++ kernel/events/core.c | 9 +++++++-- .../beauty/arch/x86/include/asm/irq_vectors.h | 3 ++- virt/kvm/kvm_main.c | 3 +++ 10 files changed, 55 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index 94e626cc6a07..a9b72997103d 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -114,6 +114,7 @@ static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = { SYSVEC(IRQ_WORK_VECTOR, irq_work), + SYSVEC(PERF_GUEST_MEDIATED_PMI_VECTOR, perf_guest_mediated_pmi_handler), SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 6b6d472baa0b..9314642ae93c 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -18,6 +18,9 @@ typedef struct { unsigned int kvm_posted_intr_ipis; unsigned int kvm_posted_intr_wakeup_ipis; unsigned int kvm_posted_intr_nested_ipis; +#endif +#ifdef CONFIG_GUEST_PERF_EVENTS + unsigned int perf_guest_mediated_pmis; #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 3218770670d3..42bf6a58ec36 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -746,6 +746,12 @@ DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested # define 
fred_sysvec_kvm_posted_intr_nested_ipi NULL #endif +# ifdef CONFIG_GUEST_PERF_EVENTS +DECLARE_IDTENTRY_SYSVEC(PERF_GUEST_MEDIATED_PMI_VECTOR, sysvec_perf_guest_mediated_pmi_handler); +#else +# define fred_sysvec_perf_guest_mediated_pmi_handler NULL +#endif + # ifdef CONFIG_X86_POSTED_MSI DECLARE_IDTENTRY_SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, sysvec_posted_msi_notification); #else diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 47051871b436..85253fc8e384 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -77,7 +77,9 @@ */ #define IRQ_WORK_VECTOR 0xf6 -/* 0xf5 - unused, was UV_BAU_MESSAGE */ +/* IRQ vector for PMIs when running a guest with a mediated PMU. */ +#define PERF_GUEST_MEDIATED_PMI_VECTOR 0xf5 + #define DEFERRED_ERROR_VECTOR 0xf4 /* Vector on which hypervisor callbacks will be delivered */ diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index f445bec516a0..260456588756 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -158,6 +158,9 @@ static const __initconst struct idt_data apic_idts[] = { INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi), INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi), # endif +#ifdef CONFIG_GUEST_PERF_EVENTS + INTG(PERF_GUEST_MEDIATED_PMI_VECTOR, asm_sysvec_perf_guest_mediated_pmi_handler), +#endif # ifdef CONFIG_IRQ_WORK INTG(IRQ_WORK_VECTOR, asm_sysvec_irq_work), # endif diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 86f4e574de02..d56185b49a0e 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -192,6 +192,13 @@ int arch_show_interrupts(struct seq_file *p, int prec) irq_stats(j)->kvm_posted_intr_wakeup_ipis); seq_puts(p, " Posted-interrupt wakeup event\n"); #endif +#ifdef CONFIG_GUEST_PERF_EVENTS + seq_printf(p, "%*s: ", prec, "VPMI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->perf_guest_mediated_pmis); + seq_puts(p, " Perf 
Guest Mediated PMI\n"); +#endif #ifdef CONFIG_X86_POSTED_MSI seq_printf(p, "%*s: ", prec, "PMN"); for_each_online_cpu(j) @@ -349,6 +356,18 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi) } #endif +#ifdef CONFIG_GUEST_PERF_EVENTS +/* + * Handler for PERF_GUEST_MEDIATED_PMI_VECTOR. + */ +DEFINE_IDTENTRY_SYSVEC(sysvec_perf_guest_mediated_pmi_handler) +{ + apic_eoi(); + inc_irq_stat(perf_guest_mediated_pmis); + perf_guest_handle_mediated_pmi(); +} +#endif + #if IS_ENABLED(CONFIG_KVM) static void dummy_handler(void) {} static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 322cfa9f3d48..82e617fad165 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1677,6 +1677,8 @@ struct perf_guest_info_callbacks { unsigned int (*state)(void); unsigned long (*get_ip)(void); unsigned int (*handle_intel_pt_intr)(void); + + void (*handle_mediated_pmi)(void); }; #ifdef CONFIG_GUEST_PERF_EVENTS @@ -1686,6 +1688,7 @@ extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state); DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); +DECLARE_STATIC_CALL(__perf_guest_handle_mediated_pmi, *perf_guest_cbs->handle_mediated_pmi); static inline unsigned int perf_guest_state(void) { @@ -1702,6 +1705,11 @@ static inline unsigned int perf_guest_handle_intel_pt_intr(void) return static_call(__perf_guest_handle_intel_pt_intr)(); } +static inline void perf_guest_handle_mediated_pmi(void) +{ + static_call(__perf_guest_handle_mediated_pmi)(); +} + extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); diff --git a/kernel/events/core.c b/kernel/events/core.c index bbb81a4a3196..dd842a4ca789 100644 --- 
a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7644,6 +7644,7 @@ struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state); DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); +DEFINE_STATIC_CALL_RET0(__perf_guest_handle_mediated_pmi, *perf_guest_cbs->handle_mediated_pmi); void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { @@ -7658,6 +7659,10 @@ void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) if (cbs->handle_intel_pt_intr) static_call_update(__perf_guest_handle_intel_pt_intr, cbs->handle_intel_pt_intr); + + if (cbs->handle_mediated_pmi) + static_call_update(__perf_guest_handle_mediated_pmi, + cbs->handle_mediated_pmi); } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); @@ -7669,8 +7674,8 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) rcu_assign_pointer(perf_guest_cbs, NULL); static_call_update(__perf_guest_state, (void *)&__static_call_return0); static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0); - static_call_update(__perf_guest_handle_intel_pt_intr, - (void *)&__static_call_return0); + static_call_update(__perf_guest_handle_intel_pt_intr, (void *)&__static_call_return0); + static_call_update(__perf_guest_handle_mediated_pmi, (void *)&__static_call_return0); synchronize_rcu(); } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); diff --git a/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h b/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h index 47051871b436..6e1d5b955aae 100644 --- a/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h +++ b/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h @@ -77,7 +77,8 @@ */ #define IRQ_WORK_VECTOR 0xf6 -/* 0xf5 - unused, was UV_BAU_MESSAGE */ +#define 
PERF_GUEST_MEDIATED_PMI_VECTOR 0xf5 + #define DEFERRED_ERROR_VECTOR 0xf4 /* Vector on which hypervisor callbacks will be delivered */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5fcd401a5897..21a0d226d63f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -6467,11 +6467,14 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { .state = kvm_guest_state, .get_ip = kvm_guest_get_ip, .handle_intel_pt_intr = NULL, + .handle_mediated_pmi = NULL, }; void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)) { kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler; + kvm_guest_cbs.handle_mediated_pmi = NULL; + perf_register_guest_info_callbacks(&kvm_guest_cbs); } void kvm_unregister_perf_callbacks(void) From 560ac136f25da2da44a8b68d581adfdc8230b7e2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:16:45 -0800 Subject: [PATCH 12/55] perf/x86/core: Add APIs to switch to/from mediated PMI vector (for KVM) Add APIs (exported only for KVM) to switch PMIs to the dedicated mediated PMU IRQ vector when loading guest context, and back to perf's standard NMI when the guest context is put. I.e. route PMIs to PERF_GUEST_MEDIATED_PMI_VECTOR when the guest context is active, and to NMIs while the host context is active. While running with guest context loaded, ignore all NMIs (in perf). Any NMI that arrives while the LVTPC points at the mediated PMU IRQ vector can't possibly be due to a host perf event. 
Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251206001720.468579-10-seanjc@google.com --- arch/x86/events/core.c | 32 +++++++++++++++++++++++++++++++ arch/x86/include/asm/perf_event.h | 5 +++++ 2 files changed, 37 insertions(+) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 0c38a31d5fc7..3ad5c658e286 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -56,6 +56,8 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .pmu = &pmu, }; +static DEFINE_PER_CPU(bool, guest_lvtpc_loaded); + DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key); DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key); DEFINE_STATIC_KEY_FALSE(perf_is_hybrid); @@ -1760,6 +1762,25 @@ void perf_events_lapic_init(void) apic_write(APIC_LVTPC, APIC_DM_NMI); } +#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU +void perf_load_guest_lvtpc(u32 guest_lvtpc) +{ + u32 masked = guest_lvtpc & APIC_LVT_MASKED; + + apic_write(APIC_LVTPC, + APIC_DM_FIXED | PERF_GUEST_MEDIATED_PMI_VECTOR | masked); + this_cpu_write(guest_lvtpc_loaded, true); +} +EXPORT_SYMBOL_FOR_MODULES(perf_load_guest_lvtpc, "kvm"); + +void perf_put_guest_lvtpc(void) +{ + this_cpu_write(guest_lvtpc_loaded, false); + apic_write(APIC_LVTPC, APIC_DM_NMI); +} +EXPORT_SYMBOL_FOR_MODULES(perf_put_guest_lvtpc, "kvm"); +#endif /* CONFIG_PERF_GUEST_MEDIATED_PMU */ + static int perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) { @@ -1767,6 +1788,17 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) u64 finish_clock; int ret; + /* + * Ignore all NMIs when the CPU's LVTPC is configured to route PMIs to + * PERF_GUEST_MEDIATED_PMI_VECTOR, i.e. when an NMI time can't be due + * to a PMI. Attempting to handle a PMI while the guest's context is + * loaded will generate false positives and clobber guest state. Note, + * the LVTPC is switched to/from the dedicated mediated PMI IRQ vector + * while host events are quiesced. 
+ */ + if (this_cpu_read(guest_lvtpc_loaded)) + return NMI_DONE; + /* * All PMUs/events that share this PMI handler should make sure to * increment active_events for their events. diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 7276ba70c88a..fb7b261357bf 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -759,6 +759,11 @@ static inline void perf_events_lapic_init(void) { } static inline void perf_check_microcode(void) { } #endif +#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU +extern void perf_load_guest_lvtpc(u32 guest_lvtpc); +extern void perf_put_guest_lvtpc(void); +#endif + #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data); extern void x86_perf_get_lbr(struct x86_pmu_lbr *lbr); From b456a6ba5756b6fb7e651775343e713bd08418e7 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 5 Dec 2025 16:16:46 -0800 Subject: [PATCH 13/55] perf/x86/core: Do not set bit width for unavailable counters Not all x86 processors have fixed counters. It may also be the case that a processor has only fixed counters and no general-purpose counters. Set the bit widths corresponding to each counter type only if such counters are available. 
Fixes: b3d9468a8bd2 ("perf, x86: Expose perf capability to other modules") Signed-off-by: Sandipan Das Co-developed-by: Dapeng Mi Signed-off-by: Dapeng Mi Signed-off-by: Mingwei Zhang Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-11-seanjc@google.com --- arch/x86/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 3ad5c658e286..3f7838810cc5 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -3105,8 +3105,8 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) cap->version = x86_pmu.version; cap->num_counters_gp = x86_pmu_num_counters(NULL); cap->num_counters_fixed = x86_pmu_num_counters_fixed(NULL); - cap->bit_width_gp = x86_pmu.cntval_bits; - cap->bit_width_fixed = x86_pmu.cntval_bits; + cap->bit_width_gp = cap->num_counters_gp ? x86_pmu.cntval_bits : 0; + cap->bit_width_fixed = cap->num_counters_fixed ? x86_pmu.cntval_bits : 0; cap->events_mask = (unsigned int)x86_pmu.events_maskl; cap->events_mask_len = x86_pmu.events_mask_len; cap->pebs_ept = x86_pmu.pebs_ept; From c8824a95d9673763a0a9d642f8c79b2162296923 Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Fri, 5 Dec 2025 16:16:47 -0800 Subject: [PATCH 14/55] perf/x86/core: Plumb mediated PMU capability from x86_pmu to x86_pmu_cap Plumb mediated PMU capability to x86_pmu_cap in order to let any kernel entity such as KVM know that the host PMU supports mediated PMU mode and has the implementation.
Signed-off-by: Mingwei Zhang Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-12-seanjc@google.com --- arch/x86/events/core.c | 1 + arch/x86/include/asm/perf_event.h | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 3f7838810cc5..df7a32be9914 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -3110,6 +3110,7 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) cap->events_mask = (unsigned int)x86_pmu.events_maskl; cap->events_mask_len = x86_pmu.events_mask_len; cap->pebs_ept = x86_pmu.pebs_ept; + cap->mediated = !!(pmu.capabilities & PERF_PMU_CAP_MEDIATED_VPMU); } EXPORT_SYMBOL_FOR_KVM(perf_get_x86_pmu_capability); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index fb7b261357bf..0d9af4135e0a 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -301,6 +301,7 @@ struct x86_pmu_capability { unsigned int events_mask; int events_mask_len; unsigned int pebs_ept :1; + unsigned int mediated :1; }; /* From 4280d79587a3fd4bf9415705536fe385467c5f44 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 5 Dec 2025 16:16:48 -0800 Subject: [PATCH 15/55] perf/x86/intel: Support PERF_PMU_CAP_MEDIATED_VPMU Apply the PERF_PMU_CAP_MEDIATED_VPMU flag to the Intel core PMU. It only indicates that the perf side of core PMU is ready to support the mediated vPMU. Besides the capability, the hypervisor, a.k.a. KVM, still needs to check the PMU version and other PMU features/capabilities to decide whether to enable support for mediated vPMUs.
[sean: massage changelog] Signed-off-by: Kan Liang Signed-off-by: Mingwei Zhang Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-13-seanjc@google.com --- arch/x86/events/intel/core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index bdf3f0d0fe21..0553c1160f15 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5695,6 +5695,8 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) else pmu->intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS; + pmu->pmu.capabilities |= PERF_PMU_CAP_MEDIATED_VPMU; + intel_pmu_check_event_constraints_all(&pmu->pmu); intel_pmu_check_extra_regs(pmu->extra_regs); @@ -7314,6 +7316,9 @@ __init int intel_pmu_init(void) pr_cont(" AnyThread deprecated, "); } + /* The perf side of core PMU is ready to support the mediated vPMU. */ + x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_MEDIATED_VPMU; + /* * Many features on and after V6 require dynamic constraint, * e.g., Arch PEBS, ACR. From 65eb3a9a8a34fa9188e0ab5e657d84ce4fa242a7 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 5 Dec 2025 16:16:49 -0800 Subject: [PATCH 16/55] perf/x86/amd: Support PERF_PMU_CAP_MEDIATED_VPMU for AMD host Apply the PERF_PMU_CAP_MEDIATED_VPMU flag for version 2 and later implementations of the core PMU. Aside from having Global Control and Status registers, virtualizing the PMU using the mediated model requires an interface to set or clear the overflow bits in the Global Status MSRs while restoring or saving the PMU context of a vCPU. PerfMonV2-capable hardware has additional MSRs for this purpose, namely PerfCntrGlobalStatusSet and PerfCntrGlobalStatusClr, thereby making it suitable for use with mediated vPMU. 
Signed-off-by: Sandipan Das Signed-off-by: Mingwei Zhang Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Xudong Hao Link: https://patch.msgid.link/20251206001720.468579-14-seanjc@google.com --- arch/x86/events/amd/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 44656d2fb555..0c92ed5f464b 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1439,6 +1439,8 @@ static int __init amd_core_pmu_init(void) amd_pmu_global_cntr_mask = x86_pmu.cntr_mask64; + x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_MEDIATED_VPMU; + /* Update PMC handling functions */ x86_pmu.enable_all = amd_pmu_v2_enable_all; x86_pmu.disable_all = amd_pmu_v2_disable_all; From 2d6ad925fb2386f3ee1d26f5022f7ea71bbc1541 Mon Sep 17 00:00:00 2001 From: Jens Remus Date: Mon, 8 Dec 2025 17:03:49 +0100 Subject: [PATCH 17/55] unwind_user: Enhance comments on get CFA, FP, and RA Move the comment "Get the Canonical Frame Address (CFA)" to the top of the sequence of statements that actually get the CFA. Reword the comment "Find the Return Address (RA)" to "Get ...", as the statements actually get the RA. Add a respective comment to the statements that get the FP. This will be useful once future commits extend the logic to get the RA and FP. While at it align the comment on the "stack going in wrong direction" check to the following one on the "address is word aligned" check. 
Signed-off-by: Jens Remus Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251208160352.1363040-2-jremus@linux.ibm.com --- kernel/unwind/user.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c index 39e270789444..0ca434f86e73 100644 --- a/kernel/unwind/user.c +++ b/kernel/unwind/user.c @@ -31,6 +31,7 @@ static int unwind_user_next_common(struct unwind_user_state *state, { unsigned long cfa, fp, ra; + /* Get the Canonical Frame Address (CFA) */ if (frame->use_fp) { if (state->fp < state->sp) return -EINVAL; @@ -38,11 +39,9 @@ static int unwind_user_next_common(struct unwind_user_state *state, } else { cfa = state->sp; } - - /* Get the Canonical Frame Address (CFA) */ cfa += frame->cfa_off; - /* stack going in wrong direction? */ + /* Make sure that stack is not going in wrong direction */ if (cfa <= state->sp) return -EINVAL; @@ -50,10 +49,11 @@ static int unwind_user_next_common(struct unwind_user_state *state, if (cfa & (state->ws - 1)) return -EINVAL; - /* Find the Return Address (RA) */ + /* Get the Return Address (RA) */ if (get_user_word(&ra, cfa, frame->ra_off, state->ws)) return -EINVAL; + /* Get the Frame Pointer (FP) */ if (frame->fp_off && get_user_word(&fp, cfa, frame->fp_off, state->ws)) return -EINVAL; From 2652f9a4b019e34fbbde8dcd1396f1f00ec4844f Mon Sep 17 00:00:00 2001 From: Jens Remus Date: Mon, 8 Dec 2025 17:03:50 +0100 Subject: [PATCH 18/55] unwind_user/fp: Use dummies instead of ifdef This simplifies the code. unwind_user_next_fp() does not need to return -EINVAL if config option HAVE_UNWIND_USER_FP is disabled, as unwind_user_start() will then not select this unwind method and unwind_user_next() will therefore not call it. 
Provide (1) a dummy definition of ARCH_INIT_USER_FP_FRAME, if the unwind user method HAVE_UNWIND_USER_FP is not enabled, (2) a common fallback definition of unwind_user_at_function_start() which returns false, and (3) a common dummy definition of ARCH_INIT_USER_FP_ENTRY_FRAME. Note that enabling the config option HAVE_UNWIND_USER_FP without defining ARCH_INIT_USER_FP_FRAME triggers a compile error, which is helpful when implementing support for this unwind user method in an architecture. Enabling the config option when providing an arch- specific unwind_user_at_function_start() definition makes it necessary to also provide an arch-specific ARCH_INIT_USER_FP_ENTRY_FRAME definition. Signed-off-by: Jens Remus Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251208160352.1363040-3-jremus@linux.ibm.com --- arch/x86/include/asm/unwind_user.h | 1 + include/linux/unwind_user.h | 18 ++++++++++++++++-- kernel/unwind/user.c | 4 ---- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h index 12064284bc4e..971ffe937d50 100644 --- a/arch/x86/include/asm/unwind_user.h +++ b/arch/x86/include/asm/unwind_user.h @@ -35,6 +35,7 @@ static inline bool unwind_user_at_function_start(struct pt_regs *regs) { return is_uprobe_at_func_entry(regs); } +#define unwind_user_at_function_start unwind_user_at_function_start #endif /* CONFIG_HAVE_UNWIND_USER_FP */ diff --git a/include/linux/unwind_user.h b/include/linux/unwind_user.h index 7f7282516bf5..64618618febd 100644 --- a/include/linux/unwind_user.h +++ b/include/linux/unwind_user.h @@ -5,8 +5,22 @@ #include #include -#ifndef ARCH_INIT_USER_FP_FRAME - #define ARCH_INIT_USER_FP_FRAME +#ifndef CONFIG_HAVE_UNWIND_USER_FP + +#define ARCH_INIT_USER_FP_FRAME(ws) + +#endif + +#ifndef ARCH_INIT_USER_FP_ENTRY_FRAME +#define ARCH_INIT_USER_FP_ENTRY_FRAME(ws) +#endif + +#ifndef unwind_user_at_function_start +static inline bool 
unwind_user_at_function_start(struct pt_regs *regs) +{ + return false; +} +#define unwind_user_at_function_start unwind_user_at_function_start #endif int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries); diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c index 0ca434f86e73..90ab3c1a205e 100644 --- a/kernel/unwind/user.c +++ b/kernel/unwind/user.c @@ -67,7 +67,6 @@ static int unwind_user_next_common(struct unwind_user_state *state, static int unwind_user_next_fp(struct unwind_user_state *state) { -#ifdef CONFIG_HAVE_UNWIND_USER_FP struct pt_regs *regs = task_pt_regs(current); if (state->topmost && unwind_user_at_function_start(regs)) { @@ -81,9 +80,6 @@ static int unwind_user_next_fp(struct unwind_user_state *state) ARCH_INIT_USER_FP_FRAME(state->ws) }; return unwind_user_next_common(state, &fp_frame); -#else - return -EINVAL; -#endif } static int unwind_user_next(struct unwind_user_state *state) From aa6047ef7204ea1faa346b9123439abed0546f7e Mon Sep 17 00:00:00 2001 From: Jens Remus Date: Mon, 8 Dec 2025 17:03:51 +0100 Subject: [PATCH 19/55] x86/unwind_user: Guard unwind_user_word_size() by UNWIND_USER The unwind user framework in general requires an architecture-specific implementation of unwind_user_word_size() to be present for any unwind method, whether that is fp or a future other method, such as potentially sframe. Guard unwind_user_word_size() by the availability of the UNWIND_USER framework instead of the specific HAVE_UNWIND_USER_FP method. This makes it possible to selectively disable HAVE_UNWIND_USER_FP on x86 (e.g. for test purposes) once a new unwind method is added to unwind user.
Signed-off-by: Jens Remus Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251208160352.1363040-4-jremus@linux.ibm.com --- arch/x86/include/asm/unwind_user.h | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h index 971ffe937d50..7f1229b33d06 100644 --- a/arch/x86/include/asm/unwind_user.h +++ b/arch/x86/include/asm/unwind_user.h @@ -2,23 +2,11 @@ #ifndef _ASM_X86_UNWIND_USER_H #define _ASM_X86_UNWIND_USER_H -#ifdef CONFIG_HAVE_UNWIND_USER_FP +#ifdef CONFIG_UNWIND_USER #include #include -#define ARCH_INIT_USER_FP_FRAME(ws) \ - .cfa_off = 2*(ws), \ - .ra_off = -1*(ws), \ - .fp_off = -2*(ws), \ - .use_fp = true, - -#define ARCH_INIT_USER_FP_ENTRY_FRAME(ws) \ - .cfa_off = 1*(ws), \ - .ra_off = -1*(ws), \ - .fp_off = 0, \ - .use_fp = false, - static inline int unwind_user_word_size(struct pt_regs *regs) { /* We can't unwind VM86 stacks */ @@ -31,6 +19,22 @@ static inline int unwind_user_word_size(struct pt_regs *regs) return sizeof(long); } +#endif /* CONFIG_UNWIND_USER */ + +#ifdef CONFIG_HAVE_UNWIND_USER_FP + +#define ARCH_INIT_USER_FP_FRAME(ws) \ + .cfa_off = 2*(ws), \ + .ra_off = -1*(ws), \ + .fp_off = -2*(ws), \ + .use_fp = true, + +#define ARCH_INIT_USER_FP_ENTRY_FRAME(ws) \ + .cfa_off = 1*(ws), \ + .ra_off = -1*(ws), \ + .fp_off = 0, \ + .use_fp = false, + static inline bool unwind_user_at_function_start(struct pt_regs *regs) { return is_uprobe_at_func_entry(regs); From 3c48808408af11d6f173c65eee9bd5ca4c53667c Mon Sep 17 00:00:00 2001 From: Jens Remus Date: Mon, 8 Dec 2025 17:03:52 +0100 Subject: [PATCH 20/55] x86/unwind_user: Simplify unwind_user_word_size() Get rid of superfluous ifdef and return explicit word size depending on 32-bit or 64-bit mode. 
Suggested-by: Linus Torvalds Signed-off-by: Jens Remus Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251208160352.1363040-5-jremus@linux.ibm.com --- arch/x86/include/asm/unwind_user.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h index 7f1229b33d06..6e469044e4de 100644 --- a/arch/x86/include/asm/unwind_user.h +++ b/arch/x86/include/asm/unwind_user.h @@ -12,11 +12,7 @@ static inline int unwind_user_word_size(struct pt_regs *regs) /* We can't unwind VM86 stacks */ if (regs->flags & X86_VM_MASK) return 0; -#ifdef CONFIG_X86_64 - if (!user_64bit_mode(regs)) - return sizeof(int); -#endif - return sizeof(long); + return user_64bit_mode(regs) ? 8 : 4; } #endif /* CONFIG_UNWIND_USER */ From 63dbadcafc1f4d1da796a8e2c0aea1e561f79ece Mon Sep 17 00:00:00 2001 From: Martin Schiller Date: Mon, 24 Nov 2025 08:48:44 +0100 Subject: [PATCH 21/55] perf/x86/msr: Add Airmont NP Like Airmont, the Airmont NP (aka Intel / MaxLinear Lightning Mountain) supports SMI_COUNT MSR. 
Signed-off-by: Martin Schiller Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251124074846.9653-2-ms@dev.tdt.de --- arch/x86/events/msr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 7f5007a4752a..8052596b8503 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -78,6 +78,7 @@ static bool test_intel(int idx, void *data) case INTEL_ATOM_SILVERMONT: case INTEL_ATOM_SILVERMONT_D: case INTEL_ATOM_AIRMONT: + case INTEL_ATOM_AIRMONT_NP: case INTEL_ATOM_GOLDMONT: case INTEL_ATOM_GOLDMONT_D: From a08340fd291671c54d379d285b2325490ce90ddd Mon Sep 17 00:00:00 2001 From: Martin Schiller Date: Mon, 24 Nov 2025 08:48:45 +0100 Subject: [PATCH 22/55] perf/x86/intel: Add Airmont NP The Intel / MaxLinear Airmont NP (aka Lightning Mountain) supports the same architectural and non-architectural events as Airmont. Signed-off-by: Martin Schiller Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251124074846.9653-3-ms@dev.tdt.de --- arch/x86/events/intel/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 0553c1160f15..1840ca1918d1 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -7410,6 +7410,7 @@ __init int intel_pmu_init(void) case INTEL_ATOM_SILVERMONT_D: case INTEL_ATOM_SILVERMONT_MID: case INTEL_ATOM_AIRMONT: + case INTEL_ATOM_AIRMONT_NP: case INTEL_ATOM_SILVERMONT_MID2: memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); From 3006911f284d769b0f66c12b39da130325ef1440 Mon Sep 17 00:00:00 2001 From: Martin Schiller Date: Mon, 24 Nov 2025 08:48:46 +0100 Subject: [PATCH 23/55] perf/x86/cstate: Add Airmont NP From the perspective of Intel cstate residency counters, the Airmont NP (aka Lightning Mountain) is identical to the Airmont.
Signed-off-by: Martin Schiller Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://patch.msgid.link/20251124074846.9653-4-ms@dev.tdt.de --- arch/x86/events/intel/cstate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 1e2658b60d91..f3d5ee07f8f2 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -613,6 +613,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &slm_cstates), X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_D, &slm_cstates), X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &slm_cstates), + X86_MATCH_VFM(INTEL_ATOM_AIRMONT_NP, &slm_cstates), X86_MATCH_VFM(INTEL_BROADWELL, &snb_cstates), X86_MATCH_VFM(INTEL_BROADWELL_D, &snb_cstates), From 3cb3c2f6886f9489df13de8efe7a1e803a3f21ea Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 17 Dec 2025 12:08:01 +0100 Subject: [PATCH 24/55] perf: Clean up mediated vPMU accounting The mediated_pmu_account_event() and perf_create_mediated_pmu() functions implement the exclusion between '!exclude_guest' counters and mediated vPMUs. Their implementation is basically identical, except mirrored in what they count/check. Make sure the actual implementations reflect this similarity. Notably: - while perf_release_mediated_pmu() has an underflow check; mediated_pmu_unaccount_event() did not. - while perf_create_mediated_pmu() has an inc_not_zero() path; mediated_pmu_account_event() did not. Also, the inc_not_zero() path can be outside of perf_mediated_pmu_mutex. The mutex must guard the 0->1 (of either nr_include_guest_events or nr_mediated_pmu_vms) transition, but once a counter is already non-zero, it can safely be incremented further.
Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251208115156.GE3707891@noisy.programming.kicks-ass.net --- kernel/events/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index dd842a4ca789..e6a4b1e34f84 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6344,8 +6344,10 @@ static int mediated_pmu_account_event(struct perf_event *event) if (!is_include_guest_event(event)) return 0; - guard(mutex)(&perf_mediated_pmu_mutex); + if (atomic_inc_not_zero(&nr_include_guest_events)) + return 0; + guard(mutex)(&perf_mediated_pmu_mutex); if (atomic_read(&nr_mediated_pmu_vms)) return -EOPNOTSUPP; @@ -6358,6 +6360,9 @@ static void mediated_pmu_unaccount_event(struct perf_event *event) if (!is_include_guest_event(event)) return; + if (WARN_ON_ONCE(!atomic_read(&nr_include_guest_events))) + return; + atomic_dec(&nr_include_guest_events); } @@ -6373,10 +6378,10 @@ static void mediated_pmu_unaccount_event(struct perf_event *event) */ int perf_create_mediated_pmu(void) { - guard(mutex)(&perf_mediated_pmu_mutex); if (atomic_inc_not_zero(&nr_mediated_pmu_vms)) return 0; + guard(mutex)(&perf_mediated_pmu_mutex); if (atomic_read(&nr_include_guest_events)) return -EBUSY; From 01122b89361e565b3c88b9fbebe92dc5c7420cb7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 17 Dec 2025 13:23:59 +0100 Subject: [PATCH 25/55] perf: Use EXPORT_SYMBOL_FOR_KVM() for the mediated APIs Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251208115156.GE3707891@noisy.programming.kicks-ass.net --- arch/x86/events/core.c | 5 +++-- include/asm-generic/Kbuild | 1 + kernel/events/core.c | 5 +++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index df7a32be9914..0ecac9495d74 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ 
-1771,14 +1772,14 @@ void perf_load_guest_lvtpc(u32 guest_lvtpc) APIC_DM_FIXED | PERF_GUEST_MEDIATED_PMI_VECTOR | masked); this_cpu_write(guest_lvtpc_loaded, true); } -EXPORT_SYMBOL_FOR_MODULES(perf_load_guest_lvtpc, "kvm"); +EXPORT_SYMBOL_FOR_KVM(perf_load_guest_lvtpc); void perf_put_guest_lvtpc(void) { this_cpu_write(guest_lvtpc_loaded, false); apic_write(APIC_LVTPC, APIC_DM_NMI); } -EXPORT_SYMBOL_FOR_MODULES(perf_put_guest_lvtpc, "kvm"); +EXPORT_SYMBOL_FOR_KVM(perf_put_guest_lvtpc); #endif /* CONFIG_PERF_GUEST_MEDIATED_PMU */ static int diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index 295c94a3ccc1..9aff61e7b8f2 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -32,6 +32,7 @@ mandatory-y += irq_work.h mandatory-y += kdebug.h mandatory-y += kmap_size.h mandatory-y += kprobes.h +mandatory-y += kvm_types.h mandatory-y += linkage.h mandatory-y += local.h mandatory-y += local64.h diff --git a/kernel/events/core.c b/kernel/events/core.c index e6a4b1e34f84..376fb07d869b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -57,6 +57,7 @@ #include #include #include +#include #include "internal.h" @@ -6388,7 +6389,7 @@ int perf_create_mediated_pmu(void) atomic_inc(&nr_mediated_pmu_vms); return 0; } -EXPORT_SYMBOL_GPL(perf_create_mediated_pmu); +EXPORT_SYMBOL_FOR_KVM(perf_create_mediated_pmu); void perf_release_mediated_pmu(void) { @@ -6397,7 +6398,7 @@ void perf_release_mediated_pmu(void) atomic_dec(&nr_mediated_pmu_vms); } -EXPORT_SYMBOL_GPL(perf_release_mediated_pmu); +EXPORT_SYMBOL_FOR_KVM(perf_release_mediated_pmu); /* When loading a guest's mediated PMU, schedule out all exclude_guest events. 
*/ void perf_load_guest_context(void) From 4b24910c056995c0c0fa7c1b142696443b05fd8e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:16:50 -0800 Subject: [PATCH 26/55] KVM: Add a simplified wrapper for registering perf callbacks Add a parameter-less API for registering perf callbacks in anticipation of introducing another x86-only parameter for handling mediated PMU PMIs. No functional change intended. Acked-by: Anup Patel Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-15-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/arm64/kvm/arm.c | 2 +- arch/loongarch/kvm/main.c | 2 +- arch/riscv/kvm/main.c | 2 +- arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 11 +++++++++-- virt/kvm/kvm_main.c | 5 +++-- 6 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 4f80da0c0d1d..3e6f184d6d04 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2357,7 +2357,7 @@ static int __init init_subsystems(void) if (err) goto out; - kvm_register_perf_callbacks(NULL); + kvm_register_perf_callbacks(); out: if (err) diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c index 80ea63d465b8..f62326fe29fa 100644 --- a/arch/loongarch/kvm/main.c +++ b/arch/loongarch/kvm/main.c @@ -394,7 +394,7 @@ static int kvm_loongarch_env_init(void) } kvm_init_gcsr_flag(); - kvm_register_perf_callbacks(NULL); + kvm_register_perf_callbacks(); /* Register LoongArch IPI interrupt controller interface. 
*/ ret = kvm_loongarch_register_ipi_device(); diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index 45536af521f0..0f3fe3986fc0 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -174,7 +174,7 @@ static int __init riscv_kvm_init(void) kvm_riscv_setup_vendor_features(); - kvm_register_perf_callbacks(NULL); + kvm_register_perf_callbacks(); rc = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); if (rc) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0c6d899d53dd..1b2827cecf38 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10107,7 +10107,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) set_hv_tscchange_cb(kvm_hyperv_tsc_notifier); #endif - kvm_register_perf_callbacks(ops->handle_intel_pt_intr); + __kvm_register_perf_callbacks(ops->handle_intel_pt_intr, NULL); if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled) kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d93f75b05ae2..8e410d1a63df 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1749,10 +1749,17 @@ static inline bool kvm_arch_intc_initialized(struct kvm *kvm) #ifdef CONFIG_GUEST_PERF_EVENTS unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu); -void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)); +void __kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void), + void (*mediated_pmi_handler)(void)); + +static inline void kvm_register_perf_callbacks(void) +{ + __kvm_register_perf_callbacks(NULL, NULL); +} + void kvm_unregister_perf_callbacks(void); #else -static inline void kvm_register_perf_callbacks(void *ign) {} +static inline void kvm_register_perf_callbacks(void) {} static inline void kvm_unregister_perf_callbacks(void) {} #endif /* CONFIG_GUEST_PERF_EVENTS */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 21a0d226d63f..d59cb53af76a 100644 --- a/virt/kvm/kvm_main.c +++ 
b/virt/kvm/kvm_main.c @@ -6470,10 +6470,11 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { .handle_mediated_pmi = NULL, }; -void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)) +void __kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void), + void (*mediated_pmi_handler)(void)) { kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler; - kvm_guest_cbs.handle_mediated_pmi = NULL; + kvm_guest_cbs.handle_mediated_pmi = mediated_pmi_handler; perf_register_guest_info_callbacks(&kvm_guest_cbs); } From 3e51822b2fdf695bda50c2c3f88d6ab022a9e6f3 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 5 Dec 2025 16:16:52 -0800 Subject: [PATCH 27/55] KVM: x86/pmu: Start stubbing in mediated PMU support Introduce enable_mediated_pmu as a global variable, with the intent of exposing it to userspace as a vendor module parameter, to control and reflect mediated vPMU support. Wire up the perf plumbing to create+release a mediated PMU, but defer exposing the parameter to userspace until KVM support for mediated PMUs is fully landed. To (a) minimize compatibility issues, (b) to give userspace a chance to opt out of the restrictive side-effects of perf_create_mediated_pmu(), and (c) to avoid adding new dependencies between enabling an in-kernel irqchip and a mediated vPMU, defer "creating" a mediated PMU in perf until the first vCPU is created. Regarding userspace compatibility, an alternative solution would be to make the mediated PMU fully opt-in, e.g. to avoid unexpected failure due to perf_create_mediated_pmu() failing. Ironically, that approach creates an even bigger compatibility issue, as turning on enable_mediated_pmu would silently break VMMs that don't utilize KVM_CAP_PMU_CAPABILITY (well, silently until the guest tried to access PMU assets).
Regarding an in-kernel irqchip, create a mediated PMU if and only if the VM has an in-kernel local APIC, as the mediated PMU will take a hard dependency on forwarding PMIs to the guest without bouncing through host userspace. Silently "drop" the PMU instead of rejecting KVM_CREATE_VCPU, as KVM's existing vPMU support doesn't function correctly if the local APIC is emulated by userspace, e.g. PMIs will never be delivered. I.e. it's far, far more likely that rejecting KVM_CREATE_VCPU would cause problems, e.g. for tests or userspace daemons that just want to probe basic KVM functionality. Note! Deliberately make mediated PMU creation "sticky", i.e. don't unwind it on failure to create a vCPU. Practically speaking, there's no harm to having a VM with a mediated PMU and no vCPUs. To avoid an "impossible" VM setup, reject KVM_CAP_PMU_CAPABILITY if a mediated PMU has been created, i.e. don't let userspace disable PMU support after failed vCPU creation (with PMU support enabled). Defer vendor specific requirements and constraints to the future. 
Suggested-by: Sean Christopherson Signed-off-by: Dapeng Mi Co-developed-by: Mingwei Zhang Signed-off-by: Mingwei Zhang Tested-by: Xudong Hao Co-developed-by: Sean Christopherson Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-17-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/pmu.c | 4 ++++ arch/x86/kvm/pmu.h | 7 +++++++ arch/x86/kvm/x86.c | 37 +++++++++++++++++++++++++++++++-- arch/x86/kvm/x86.h | 1 + 5 files changed, 48 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5a3bfa293e8b..defd979003be 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1484,6 +1484,7 @@ struct kvm_arch { bool bus_lock_detection_enabled; bool enable_pmu; + bool created_mediated_pmu; u32 notify_window; u32 notify_vmexit_flags; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 487ad19a236e..131e24246b09 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -135,6 +135,10 @@ void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) enable_pmu = false; } + if (!enable_pmu || !enable_mediated_pmu || !kvm_host_pmu.mediated || + !pmu_ops->is_mediated_pmu_supported(&kvm_host_pmu)) + enable_mediated_pmu = false; + if (!enable_pmu) { memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); return; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 5c3939e91f1d..a5c7c026b919 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -37,6 +37,8 @@ struct kvm_pmu_ops { void (*deliver_pmi)(struct kvm_vcpu *vcpu); void (*cleanup)(struct kvm_vcpu *vcpu); + bool (*is_mediated_pmu_supported)(struct x86_pmu_capability *host_pmu); + const u64 EVENTSEL_EVENT; const int MAX_NR_GP_COUNTERS; const int MIN_NR_GP_COUNTERS; @@ -58,6 +60,11 @@ static inline bool kvm_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) return pmu->version > 1; } +static inline bool kvm_vcpu_has_mediated_pmu(struct kvm_vcpu *vcpu) +{ 
+ return enable_mediated_pmu && vcpu_to_pmu(vcpu)->version; +} + /* * KVM tracks all counters in 64-bit bitmaps, with general purpose counters * mapped to bits 31:0 and fixed counters mapped to 63:32, e.g. fixed counter 0 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1b2827cecf38..fb3a5e861553 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -183,6 +183,10 @@ bool __read_mostly enable_pmu = true; EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_pmu); module_param(enable_pmu, bool, 0444); +/* Enable/disabled mediated PMU virtualization. */ +bool __read_mostly enable_mediated_pmu; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_mediated_pmu); + bool __read_mostly eager_page_split = true; module_param(eager_page_split, bool, 0644); @@ -6854,7 +6858,7 @@ disable_exits_unlock: break; mutex_lock(&kvm->lock); - if (!kvm->created_vcpus) { + if (!kvm->created_vcpus && !kvm->arch.created_mediated_pmu) { kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE); r = 0; } @@ -12641,8 +12645,13 @@ static int sync_regs(struct kvm_vcpu *vcpu) return 0; } +#define PERF_MEDIATED_PMU_MSG \ + "Failed to enable mediated vPMU, try disabling system wide perf events and nmi_watchdog.\n" + int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { + int r; + if (kvm_check_tsc_unstable() && kvm->created_vcpus) pr_warn_once("SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); @@ -12653,7 +12662,29 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) if (id >= kvm->arch.max_vcpu_ids) return -EINVAL; - return kvm_x86_call(vcpu_precreate)(kvm); + /* + * Note, any actions done by .vcpu_create() must be idempotent with + * respect to creating multiple vCPUs, and therefore are not undone if + * creating a vCPU fails (including failure during pre-create). 
+ */ + r = kvm_x86_call(vcpu_precreate)(kvm); + if (r) + return r; + + if (enable_mediated_pmu && kvm->arch.enable_pmu && + !kvm->arch.created_mediated_pmu) { + if (irqchip_in_kernel(kvm)) { + r = perf_create_mediated_pmu(); + if (r) { + pr_warn_ratelimited(PERF_MEDIATED_PMU_MSG); + return r; + } + kvm->arch.created_mediated_pmu = true; + } else { + kvm->arch.enable_pmu = false; + } + } + return 0; } int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) @@ -13319,6 +13350,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); mutex_unlock(&kvm->slots_lock); } + if (kvm->arch.created_mediated_pmu) + perf_release_mediated_pmu(); kvm_destroy_vcpus(kvm); kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); #ifdef CONFIG_KVM_IOAPIC diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index fdab0ad49098..6e1fb1680c0a 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -470,6 +470,7 @@ extern struct kvm_caps kvm_caps; extern struct kvm_host_values kvm_host; extern bool enable_pmu; +extern bool enable_mediated_pmu; /* * Get a filtered version of KVM's supported XCR0 that strips out dynamic From bfee4f07d88038f7e662652718e21c60b62ef3a1 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 5 Dec 2025 16:16:53 -0800 Subject: [PATCH 28/55] KVM: x86/pmu: Implement Intel mediated PMU requirements and constraints Implement Intel PMU requirements and constraints for mediated PMU support. Require host PMU version 4+ so that PERF_GLOBAL_STATUS_SET can be used to precisely load the guest's status value into hardware, and require full- width writes so that KVM can precisely load guest counter values. Disable PEBS and LBRs if mediated PMU support is enabled, as they won't be supported in the initial implementation. 
Signed-off-by: Dapeng Mi Co-developed-by: Mingwei Zhang Signed-off-by: Mingwei Zhang [sean: split to separate patch, add full-width writes dependency] Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-18-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/capabilities.h | 3 ++- arch/x86/kvm/vmx/pmu_intel.c | 17 +++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 3 ++- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 02aadb9d730e..26302fd6dd9c 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -395,7 +395,8 @@ static inline bool vmx_pt_mode_is_host_guest(void) static inline bool vmx_pebs_supported(void) { - return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept; + return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept && + !enable_mediated_pmu; } static inline bool cpu_has_notify_vmexit(void) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index de1d9785c01f..050c21298213 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -767,6 +767,20 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) } } +static bool intel_pmu_is_mediated_pmu_supported(struct x86_pmu_capability *host_pmu) +{ + u64 host_perf_cap = 0; + + if (boot_cpu_has(X86_FEATURE_PDCM)) + rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); + + /* + * Require v4+ for MSR_CORE_PERF_GLOBAL_STATUS_SET, and full-width + * writes so that KVM can precisely load guest counter values. 
+ */ + return host_pmu->version >= 4 && host_perf_cap & PERF_CAP_FW_WRITES; +} + struct kvm_pmu_ops intel_pmu_ops __initdata = { .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = intel_msr_idx_to_pmc, @@ -778,6 +792,9 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = { .reset = intel_pmu_reset, .deliver_pmi = intel_pmu_deliver_pmi, .cleanup = intel_pmu_cleanup, + + .is_mediated_pmu_supported = intel_pmu_is_mediated_pmu_supported, + .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_MAX_NR_INTEL_GP_COUNTERS, .MIN_NR_GP_COUNTERS = 1, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4cbe8c84b636..fdd18ad1ede3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7958,7 +7958,8 @@ static __init u64 vmx_get_perf_capabilities(void) if (boot_cpu_has(X86_FEATURE_PDCM)) rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); - if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { + if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR) && + !enable_mediated_pmu) { x86_perf_get_lbr(&vmx_lbr_caps); /* From 9ba0bb4ae76a8ee037257499165a4370306c0eac Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:16:54 -0800 Subject: [PATCH 29/55] KVM: x86/pmu: Implement AMD mediated PMU requirements Require host PMU version 2+ for AMD mediated PMU support, as PERF_GLOBAL_CTRL and friends are hard requirements for the mediated PMU. 
Signed-off-by: Dapeng Mi Co-developed-by: Mingwei Zhang Signed-off-by: Mingwei Zhang [sean: extract to separate patch, write changelog] Reviewed-by: Sandipan Das Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-19-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/pmu.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index bc062285fbf5..16c88b2a2eb8 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -227,6 +227,11 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu) } } +static bool amd_pmu_is_mediated_pmu_supported(struct x86_pmu_capability *host_pmu) +{ + return host_pmu->version >= 2; +} + struct kvm_pmu_ops amd_pmu_ops __initdata = { .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = amd_msr_idx_to_pmc, @@ -236,6 +241,9 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .set_msr = amd_pmu_set_msr, .refresh = amd_pmu_refresh, .init = amd_pmu_init, + + .is_mediated_pmu_supported = amd_pmu_is_mediated_pmu_supported, + .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_MAX_NR_AMD_GP_COUNTERS, .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS, From 1c4ba7286afba9842f295fc7a3dbe74acc6a92af Mon Sep 17 00:00:00 2001 From: Xiong Zhang Date: Fri, 5 Dec 2025 16:16:55 -0800 Subject: [PATCH 30/55] KVM: x86/pmu: Register PMI handler for mediated vPMU Register a dedicated PMI handler with perf's callback when mediated PMU support is enabled. Perf routes PMIs that arrive while guest context is loaded to the provided callback, by modifying the CPU's LVTPC to point at a dedicated mediated PMI IRQ vector. WARN upon receipt of a mediated PMI if there is no active vCPU, or if the vCPU doesn't have a mediated PMU. Even if a PMI manages to skid past VM-Exit, it should never be delayed all the way beyond unloading the vCPU. 
And while running vCPUs without a mediated PMU, the LVTPC should never be wired up to the mediated PMI IRQ vector, i.e. should always be routed through perf's NMI handler. Signed-off-by: Xiong Zhang Signed-off-by: Mingwei Zhang Tested-by: Xudong Hao Co-developed-by: Sean Christopherson Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-20-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 10 ++++++++++ arch/x86/kvm/pmu.h | 2 ++ arch/x86/kvm/x86.c | 3 ++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 131e24246b09..0b67920fa069 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -157,6 +157,16 @@ void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); } +void kvm_handle_guest_mediated_pmi(void) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + + if (WARN_ON_ONCE(!vcpu || !kvm_vcpu_has_mediated_pmu(vcpu))) + return; + + kvm_make_request(KVM_REQ_PMI, vcpu); +} + static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index a5c7c026b919..9849c2bb720d 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -46,6 +46,8 @@ struct kvm_pmu_ops { void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); +void kvm_handle_guest_mediated_pmi(void); + static inline bool kvm_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) { /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fb3a5e861553..1623afddff3b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10111,7 +10111,8 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) set_hv_tscchange_cb(kvm_hyperv_tsc_notifier); #endif - __kvm_register_perf_callbacks(ops->handle_intel_pt_intr, NULL); + __kvm_register_perf_callbacks(ops->handle_intel_pt_intr, + enable_mediated_pmu ? 
kvm_handle_guest_mediated_pmi : NULL); if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled) kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM); From 80624272129eacc10cecb30f004cfa611be04770 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 5 Dec 2025 16:16:56 -0800 Subject: [PATCH 31/55] KVM: x86/pmu: Disable RDPMC interception for compatible mediated vPMU Disable RDPMC interception for vCPUs with a mediated vPMU that is compatible with the host PMU, i.e. that doesn't require KVM emulation of RDPMC to honor the guest's vCPU model. With a mediated vPMU, all guest state accessible via RDPMC is loaded into hardware while the guest is running. Adjust RDPMC interception only for non-TDX guests, as the TDX module is responsible for managing RDPMC intercepts based on the TD configuration. Co-developed-by: Mingwei Zhang Signed-off-by: Mingwei Zhang Co-developed-by: Sandipan Das Signed-off-by: Sandipan Das Signed-off-by: Dapeng Mi Tested-by: Xudong Hao Co-developed-by: Sean Christopherson Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-21-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 26 ++++++++++++++++++++++++++ arch/x86/kvm/pmu.h | 1 + arch/x86/kvm/svm/svm.c | 5 +++++ arch/x86/kvm/vmx/vmx.c | 7 +++++++ arch/x86/kvm/x86.c | 1 + 5 files changed, 40 insertions(+) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 0b67920fa069..fcecb4c21599 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -714,6 +714,32 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) return 0; } +bool kvm_need_rdpmc_intercept(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + if (!kvm_vcpu_has_mediated_pmu(vcpu)) + return true; + + /* + * VMware allows access to these Pseduo-PMCs even when read via RDPMC + * in Ring3 when CR4.PCE=0. + */ + if (enable_vmware_backdoor) + return true; + + /* + * Note! 
Check *host* PMU capabilities, not KVM's PMU capabilities, as + * KVM's capabilities are constrained based on KVM support, i.e. KVM's + * capabilities themselves may be a subset of hardware capabilities. + */ + return pmu->nr_arch_gp_counters != kvm_host_pmu.num_counters_gp || + pmu->nr_arch_fixed_counters != kvm_host_pmu.num_counters_fixed || + pmu->counter_bitmask[KVM_PMC_GP] != (BIT_ULL(kvm_host_pmu.bit_width_gp) - 1) || + pmu->counter_bitmask[KVM_PMC_FIXED] != (BIT_ULL(kvm_host_pmu.bit_width_fixed) - 1); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_need_rdpmc_intercept); + void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) { if (lapic_in_kernel(vcpu)) { diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 9849c2bb720d..506c203587ea 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -238,6 +238,7 @@ void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu); void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu); bool is_vmware_backdoor_pmc(u32 pmc_idx); +bool kvm_need_rdpmc_intercept(struct kvm_vcpu *vcpu); extern struct kvm_pmu_ops intel_pmu_ops; extern struct kvm_pmu_ops amd_pmu_ops; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f56c2d895011..ef43360b9282 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1011,6 +1011,11 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; } } + + if (kvm_need_rdpmc_intercept(vcpu)) + svm_set_intercept(svm, INTERCEPT_RDPMC); + else + svm_clr_intercept(svm, INTERCEPT_RDPMC); } static void svm_recalc_intercepts(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index fdd18ad1ede3..9f71ba99cf70 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4300,8 +4300,15 @@ static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) */ } +static void vmx_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) +{ + exec_controls_changebit(to_vmx(vcpu), 
CPU_BASED_RDPMC_EXITING, + kvm_need_rdpmc_intercept(vcpu)); +} + void vmx_recalc_intercepts(struct kvm_vcpu *vcpu) { + vmx_recalc_instruction_intercepts(vcpu); vmx_recalc_msr_intercepts(vcpu); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1623afddff3b..76e86eb358df 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3945,6 +3945,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.perf_capabilities = data; kvm_pmu_refresh(vcpu); + kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu); break; case MSR_IA32_PRED_CMD: { u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB); From d3ba32d1ff2a206621475325c009ab5b51882de1 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 5 Dec 2025 16:16:57 -0800 Subject: [PATCH 32/55] KVM: x86/pmu: Load/save GLOBAL_CTRL via entry/exit fields for mediated PMU When running a guest with a mediated PMU, context switch PERF_GLOBAL_CTRL via the dedicated VMCS fields for both host and guest. For the host, always zero GLOBAL_CTRL on exit as the guest's state will still be loaded in hardware (KVM will context switch the bulk of PMU state outside of the inner run loop). For the guest, use the dedicated fields to atomically load and save PERF_GLOBAL_CTRL on all entry/exits. For now, require VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL support (introduced by Sapphire Rapids). KVM can support such CPUs by saving PERF_GLOBAL_CTRL via the MSR save list, a.k.a. the MSR auto-store list, but defer that support as it adds a small amount of complexity and is somewhat unique. To minimize VM-Entry latency, propagate IA32_PERF_GLOBAL_CTRL to the VMCS on-demand. But to minimize complexity, read IA32_PERF_GLOBAL_CTRL out of the VMCS on all non-failing VM-Exits. I.e. partially cache the MSR. KVM could track GLOBAL_CTRL as an EXREG and defer all reads, but writes are rare, i.e. 
the dirty tracking for an EXREG is unnecessary, and it's not obvious that shaving ~15-20 cycles per exit is meaningful given the total overhead associated with mediated PMU context switches. Suggested-by: Sean Christopherson Signed-off-by: Dapeng Mi Co-developed-by: Mingwei Zhang Signed-off-by: Mingwei Zhang Tested-by: Xudong Hao Co-developed-by: Sean Christopherson Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-22-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm-x86-pmu-ops.h | 2 ++ arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/pmu.c | 13 +++++++++-- arch/x86/kvm/pmu.h | 3 ++- arch/x86/kvm/vmx/capabilities.h | 6 +++++ arch/x86/kvm/vmx/pmu_intel.c | 25 ++++++++++++++++++++- arch/x86/kvm/vmx/vmx.c | 31 +++++++++++++++++++++++++- arch/x86/kvm/vmx/vmx.h | 3 ++- 8 files changed, 78 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index 9159bf1a4730..ad2cc82abf79 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -23,5 +23,7 @@ KVM_X86_PMU_OP_OPTIONAL(reset) KVM_X86_PMU_OP_OPTIONAL(deliver_pmi) KVM_X86_PMU_OP_OPTIONAL(cleanup) +KVM_X86_PMU_OP_OPTIONAL(write_global_ctrl) + #undef KVM_X86_PMU_OP #undef KVM_X86_PMU_OP_OPTIONAL diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index c85c50019523..b92ff87e3560 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -107,6 +107,7 @@ #define VM_EXIT_PT_CONCEAL_PIP 0x01000000 #define VM_EXIT_CLEAR_IA32_RTIT_CTL 0x02000000 #define VM_EXIT_LOAD_CET_STATE 0x10000000 +#define VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL 0x40000000 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index fcecb4c21599..4b896cbb3d53 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -103,7 +103,7 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) #undef 
__KVM_X86_PMU_OP } -void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) +void kvm_init_pmu_capability(struct kvm_pmu_ops *pmu_ops) { bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; @@ -139,6 +139,9 @@ void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) !pmu_ops->is_mediated_pmu_supported(&kvm_host_pmu)) enable_mediated_pmu = false; + if (!enable_mediated_pmu) + pmu_ops->write_global_ctrl = NULL; + if (!enable_pmu) { memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); return; @@ -834,6 +837,9 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) diff = pmu->global_ctrl ^ data; pmu->global_ctrl = data; reprogram_counters(pmu, diff); + + if (kvm_vcpu_has_mediated_pmu(vcpu)) + kvm_pmu_call(write_global_ctrl)(data); } break; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: @@ -928,8 +934,11 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) * in the global controls). Emulate that behavior when refreshing the * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL. 
*/ - if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters) + if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters) { pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0); + if (kvm_vcpu_has_mediated_pmu(vcpu)) + kvm_pmu_call(write_global_ctrl)(pmu->global_ctrl); + } bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); bitmap_set(pmu->all_valid_pmc_idx, KVM_FIXED_PMC_BASE_IDX, diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 506c203587ea..2ff469334c1a 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -38,6 +38,7 @@ struct kvm_pmu_ops { void (*cleanup)(struct kvm_vcpu *vcpu); bool (*is_mediated_pmu_supported)(struct x86_pmu_capability *host_pmu); + void (*write_global_ctrl)(u64 global_ctrl); const u64 EVENTSEL_EVENT; const int MAX_NR_GP_COUNTERS; @@ -183,7 +184,7 @@ static inline bool pmc_is_locally_enabled(struct kvm_pmc *pmc) extern struct x86_pmu_capability kvm_pmu_cap; -void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops); +void kvm_init_pmu_capability(struct kvm_pmu_ops *pmu_ops); void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc); diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 26302fd6dd9c..4e371c93ae16 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -109,6 +109,12 @@ static inline bool cpu_has_load_cet_ctrl(void) { return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_CET_STATE); } + +static inline bool cpu_has_save_perf_global_ctrl(void) +{ + return vmcs_config.vmexit_ctrl & VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL; +} + static inline bool cpu_has_vmx_mpx(void) { return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 050c21298213..dbab7cca7a62 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -778,7 +778,29 @@ static bool intel_pmu_is_mediated_pmu_supported(struct x86_pmu_capability 
*host_ * Require v4+ for MSR_CORE_PERF_GLOBAL_STATUS_SET, and full-width * writes so that KVM can precisely load guest counter values. */ - return host_pmu->version >= 4 && host_perf_cap & PERF_CAP_FW_WRITES; + if (host_pmu->version < 4 || !(host_perf_cap & PERF_CAP_FW_WRITES)) + return false; + + /* + * All CPUs that support a mediated PMU are expected to support loading + * PERF_GLOBAL_CTRL via dedicated VMCS fields. + */ + if (WARN_ON_ONCE(!cpu_has_load_perf_global_ctrl())) + return false; + + /* + * KVM doesn't yet support mediated PMU on CPUs without support for + * saving PERF_GLOBAL_CTRL via a dedicated VMCS field. + */ + if (!cpu_has_save_perf_global_ctrl()) + return false; + + return true; +} + +static void intel_pmu_write_global_ctrl(u64 global_ctrl) +{ + vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, global_ctrl); } struct kvm_pmu_ops intel_pmu_ops __initdata = { @@ -794,6 +816,7 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = { .cleanup = intel_pmu_cleanup, .is_mediated_pmu_supported = intel_pmu_is_mediated_pmu_supported, + .write_global_ctrl = intel_pmu_write_global_ctrl, .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_MAX_NR_INTEL_GP_COUNTERS, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9f71ba99cf70..72b92cea9d72 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4294,6 +4294,18 @@ static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept); } + if (enable_mediated_pmu) { + bool is_mediated_pmu = kvm_vcpu_has_mediated_pmu(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + + vm_entry_controls_changebit(vmx, + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, is_mediated_pmu); + + vm_exit_controls_changebit(vmx, + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | + VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL, is_mediated_pmu); + } + /* * x2APIC and LBR MSR intercepts are modified on-demand and cannot be * filtered by userspace. 
@@ -4476,6 +4488,16 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) vmcs_writel(HOST_SSP, 0); vmcs_writel(HOST_INTR_SSP_TABLE, 0); } + + /* + * When running a guest with a mediated PMU, guest state is resident in + * hardware after VM-Exit. Zero PERF_GLOBAL_CTRL on exit so that host + * activity doesn't bleed into the guest counters. When running with + * an emulated PMU, PERF_GLOBAL_CTRL is dynamically computed on every + * entry/exit to merge guest and host PMU usage. + */ + if (enable_mediated_pmu) + vmcs_write64(HOST_IA32_PERF_GLOBAL_CTRL, 0); } void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) @@ -4543,7 +4565,8 @@ static u32 vmx_get_initial_vmexit_ctrl(void) VM_EXIT_CLEAR_IA32_RTIT_CTL); /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ return vmexit_ctrl & - ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); + ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER | + VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL); } void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) @@ -7270,6 +7293,9 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) struct perf_guest_switch_msr *msrs; struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); + if (kvm_vcpu_has_mediated_pmu(&vmx->vcpu)) + return; + pmu->host_cross_mapped_mask = 0; if (pmu->pebs_enable & pmu->global_ctrl) intel_pmu_cross_mapped_check(pmu); @@ -7572,6 +7598,9 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) vmx->loaded_vmcs->launched = 1; + if (!msr_write_intercepted(vmx, MSR_CORE_PERF_GLOBAL_CTRL)) + vcpu_to_pmu(vcpu)->global_ctrl = vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL); + vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index bc3ed3145d7e..d7a96c84371f 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -510,7 +510,8 @@ static inline u8 vmx_get_rvi(void) VM_EXIT_CLEAR_BNDCFGS | \ VM_EXIT_PT_CONCEAL_PIP | \ VM_EXIT_CLEAR_IA32_RTIT_CTL | \ - 
VM_EXIT_LOAD_CET_STATE) + VM_EXIT_LOAD_CET_STATE | \ + VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL) #define KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL \ (PIN_BASED_EXT_INTR_MASK | \ From 2904df6692f429853cf99b4b47c8592b2f49edaa Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 5 Dec 2025 16:16:58 -0800 Subject: [PATCH 33/55] KVM: x86/pmu: Disable interception of select PMU MSRs for mediated vPMUs For vCPUs with a mediated vPMU, disable interception of counter MSRs for PMCs that are exposed to the guest, and for GLOBAL_CTRL and related MSRs if they are fully supported according to the vCPU model, i.e. if the MSRs and all bits supported by hardware exist from the guest's point of view. Do NOT passthrough event selector or fixed counter control MSRs, so that KVM can enforce userspace-defined event filters, e.g. to prevent use of AnyThread events (which is unfortunately a setting in the fixed counter control MSR). Defer support for nested passthrough of mediated PMU MSRs to the future, as the logic for nested MSR interception is unfortunately vendor specific. 
Suggested-by: Sean Christopherson Co-developed-by: Mingwei Zhang Signed-off-by: Mingwei Zhang Co-developed-by: Sandipan Das Signed-off-by: Sandipan Das Signed-off-by: Dapeng Mi [sean: squash patches, massage changelog, refresh VMX MSRs on filter change] Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-23-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 41 +++++++++++++++++-------- arch/x86/kvm/pmu.h | 1 + arch/x86/kvm/svm/svm.c | 36 ++++++++++++++++++++++ arch/x86/kvm/vmx/pmu_intel.c | 13 -------- arch/x86/kvm/vmx/pmu_intel.h | 15 +++++++++ arch/x86/kvm/vmx/vmx.c | 59 +++++++++++++++++++++++++++++------- 6 files changed, 128 insertions(+), 37 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 4b896cbb3d53..3e048c170b97 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -717,27 +717,41 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) return 0; } -bool kvm_need_rdpmc_intercept(struct kvm_vcpu *vcpu) +static bool kvm_need_any_pmc_intercept(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); if (!kvm_vcpu_has_mediated_pmu(vcpu)) return true; - /* - * VMware allows access to these Pseduo-PMCs even when read via RDPMC - * in Ring3 when CR4.PCE=0. - */ - if (enable_vmware_backdoor) - return true; - /* * Note! Check *host* PMU capabilities, not KVM's PMU capabilities, as * KVM's capabilities are constrained based on KVM support, i.e. KVM's * capabilities themselves may be a subset of hardware capabilities. 
*/ return pmu->nr_arch_gp_counters != kvm_host_pmu.num_counters_gp || - pmu->nr_arch_fixed_counters != kvm_host_pmu.num_counters_fixed || + pmu->nr_arch_fixed_counters != kvm_host_pmu.num_counters_fixed; +} + +bool kvm_need_perf_global_ctrl_intercept(struct kvm_vcpu *vcpu) +{ + return kvm_need_any_pmc_intercept(vcpu) || + !kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_need_perf_global_ctrl_intercept); + +bool kvm_need_rdpmc_intercept(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + /* + * VMware allows access to these Pseduo-PMCs even when read via RDPMC + * in Ring3 when CR4.PCE=0. + */ + if (enable_vmware_backdoor) + return true; + + return kvm_need_any_pmc_intercept(vcpu) || pmu->counter_bitmask[KVM_PMC_GP] != (BIT_ULL(kvm_host_pmu.bit_width_gp) - 1) || pmu->counter_bitmask[KVM_PMC_FIXED] != (BIT_ULL(kvm_host_pmu.bit_width_fixed) - 1); } @@ -934,11 +948,12 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) * in the global controls). Emulate that behavior when refreshing the * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL. 
*/ - if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters) { + if (pmu->nr_arch_gp_counters && + (kvm_pmu_has_perf_global_ctrl(pmu) || kvm_vcpu_has_mediated_pmu(vcpu))) pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0); - if (kvm_vcpu_has_mediated_pmu(vcpu)) - kvm_pmu_call(write_global_ctrl)(pmu->global_ctrl); - } + + if (kvm_vcpu_has_mediated_pmu(vcpu)) + kvm_pmu_call(write_global_ctrl)(pmu->global_ctrl); bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); bitmap_set(pmu->all_valid_pmc_idx, KVM_FIXED_PMC_BASE_IDX, diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 2ff469334c1a..356b08e92bc9 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -239,6 +239,7 @@ void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu); void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu); bool is_vmware_backdoor_pmc(u32 pmc_idx); +bool kvm_need_perf_global_ctrl_intercept(struct kvm_vcpu *vcpu); bool kvm_need_rdpmc_intercept(struct kvm_vcpu *vcpu); extern struct kvm_pmu_ops intel_pmu_ops; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ef43360b9282..dca45f5151f9 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -730,6 +730,40 @@ void svm_vcpu_free_msrpm(void *msrpm) __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE)); } +static void svm_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu) +{ + bool intercept = !kvm_vcpu_has_mediated_pmu(vcpu); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + int i; + + if (!enable_mediated_pmu) + return; + + /* Legacy counters are always available for AMD CPUs with a PMU. 
*/ + for (i = 0; i < min(pmu->nr_arch_gp_counters, AMD64_NUM_COUNTERS); i++) + svm_set_intercept_for_msr(vcpu, MSR_K7_PERFCTR0 + i, + MSR_TYPE_RW, intercept); + + intercept |= !guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE); + for (i = 0; i < pmu->nr_arch_gp_counters; i++) + svm_set_intercept_for_msr(vcpu, MSR_F15H_PERF_CTR + 2 * i, + MSR_TYPE_RW, intercept); + + for ( ; i < kvm_pmu_cap.num_counters_gp; i++) + svm_enable_intercept_for_msr(vcpu, MSR_F15H_PERF_CTR + 2 * i, + MSR_TYPE_RW); + + intercept = kvm_need_perf_global_ctrl_intercept(vcpu); + svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL, + MSR_TYPE_RW, intercept); + svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, + MSR_TYPE_RW, intercept); + svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, + MSR_TYPE_RW, intercept); + svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, + MSR_TYPE_RW, intercept); +} + static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -798,6 +832,8 @@ static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu) if (sev_es_guest(vcpu->kvm)) sev_es_recalc_msr_intercepts(vcpu); + svm_recalc_pmu_msr_intercepts(vcpu); + /* * x2APIC intercepts are modified on-demand and cannot be filtered by * userspace. 
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index dbab7cca7a62..820da47454d7 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -128,19 +128,6 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, return &counters[array_index_nospec(idx, num_counters)]; } -static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu) -{ - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) - return 0; - - return vcpu->arch.perf_capabilities; -} - -static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu) -{ - return (vcpu_get_perf_capabilities(vcpu) & PERF_CAP_FW_WRITES) != 0; -} - static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) { if (!fw_writes_is_enabled(pmu_to_vcpu(pmu))) diff --git a/arch/x86/kvm/vmx/pmu_intel.h b/arch/x86/kvm/vmx/pmu_intel.h index 5620d0882cdc..5d9357640aa1 100644 --- a/arch/x86/kvm/vmx/pmu_intel.h +++ b/arch/x86/kvm/vmx/pmu_intel.h @@ -4,6 +4,21 @@ #include +#include "cpuid.h" + +static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu) +{ + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) + return 0; + + return vcpu->arch.perf_capabilities; +} + +static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu) +{ + return (vcpu_get_perf_capabilities(vcpu) & PERF_CAP_FW_WRITES) != 0; +} + bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu); int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 72b92cea9d72..f0a20ff2a941 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4228,6 +4228,53 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) } } +static void vmx_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu) +{ + bool has_mediated_pmu = kvm_vcpu_has_mediated_pmu(vcpu); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool intercept = !has_mediated_pmu; + int i; + + if (!enable_mediated_pmu) + return; + + 
vm_entry_controls_changebit(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, + has_mediated_pmu); + + vm_exit_controls_changebit(vmx, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | + VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL, + has_mediated_pmu); + + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PERFCTR0 + i, + MSR_TYPE_RW, intercept); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PMC0 + i, MSR_TYPE_RW, + intercept || !fw_writes_is_enabled(vcpu)); + } + for ( ; i < kvm_pmu_cap.num_counters_gp; i++) { + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PERFCTR0 + i, + MSR_TYPE_RW, true); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PMC0 + i, + MSR_TYPE_RW, true); + } + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) + vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_FIXED_CTR0 + i, + MSR_TYPE_RW, intercept); + for ( ; i < kvm_pmu_cap.num_counters_fixed; i++) + vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_FIXED_CTR0 + i, + MSR_TYPE_RW, true); + + intercept = kvm_need_perf_global_ctrl_intercept(vcpu); + vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_STATUS, + MSR_TYPE_RW, intercept); + vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + MSR_TYPE_RW, intercept); + vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, + MSR_TYPE_RW, intercept); +} + static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) { bool intercept; @@ -4294,17 +4341,7 @@ static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept); } - if (enable_mediated_pmu) { - bool is_mediated_pmu = kvm_vcpu_has_mediated_pmu(vcpu); - struct vcpu_vmx *vmx = to_vmx(vcpu); - - vm_entry_controls_changebit(vmx, - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, is_mediated_pmu); - - vm_exit_controls_changebit(vmx, - VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | - VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL, is_mediated_pmu); - } + vmx_recalc_pmu_msr_intercepts(vcpu); /* * x2APIC and LBR MSR intercepts are modified on-demand and cannot 
be From 0ea0d6314870493ac723afefb6257be71f4c636f Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 5 Dec 2025 16:16:59 -0800 Subject: [PATCH 34/55] KVM: x86/pmu: Bypass perf checks when emulating mediated PMU counter accesses When emulating a PMC counter read or write for a mediated PMU, bypass the perf checks and emulated_counter logic as the counters aren't proxied through perf, i.e. pmc->counter always holds the guest's up-to-date value, and thus there's no need to defer emulated overflow checks. Suggested-by: Sean Christopherson Signed-off-by: Dapeng Mi Co-developed-by: Mingwei Zhang Signed-off-by: Mingwei Zhang [sean: split from event filtering change, write shortlog+changelog] Reviewed-by: Sandipan Das Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-24-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 5 +++++ arch/x86/kvm/pmu.h | 3 +++ 2 files changed, 8 insertions(+) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 3e048c170b97..3a901587ca6b 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -379,6 +379,11 @@ static void pmc_update_sample_period(struct kvm_pmc *pmc) void pmc_write_counter(struct kvm_pmc *pmc, u64 val) { + if (kvm_vcpu_has_mediated_pmu(pmc->vcpu)) { + pmc->counter = val & pmc_bitmask(pmc); + return; + } + /* * Drop any unconsumed accumulated counts, the WRMSR is a write, not a * read-modify-write. 
Adjust the counter value so that its value is diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 356b08e92bc9..9a199109d672 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -111,6 +111,9 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) { u64 counter, enabled, running; + if (kvm_vcpu_has_mediated_pmu(pmc->vcpu)) + return pmc->counter & pmc_bitmask(pmc); + counter = pmc->counter + pmc->emulated_counter; if (pmc->perf_event && !pmc->is_paused) From 02918f0077925994b04be147875b6de8b63ca249 Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Fri, 5 Dec 2025 16:17:00 -0800 Subject: [PATCH 35/55] KVM: x86/pmu: Introduce eventsel_hw to prepare for pmu event filtering Introduce eventsel_hw and fixed_ctr_ctrl_hw to store the actual HW value in PMU event selector MSRs. The mediated PMU checks events before allowing the event values to be written to the PMU MSRs. However, to match the HW behavior, when a PMU event check fails, KVM should still allow the guest to read the value back. This essentially requires an extra variable to separate the guest-requested value from the actual PMU MSR value. Note this only applies to event selectors.
Signed-off-by: Mingwei Zhang Co-developed-by: Dapeng Mi Signed-off-by: Dapeng Mi Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-25-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/pmu.c | 7 +++++-- arch/x86/kvm/svm/pmu.c | 1 + arch/x86/kvm/vmx/pmu_intel.c | 2 ++ 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index defd979003be..e72357f64b19 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -529,6 +529,7 @@ struct kvm_pmc { */ u64 emulated_counter; u64 eventsel; + u64 eventsel_hw; struct perf_event *perf_event; struct kvm_vcpu *vcpu; /* @@ -557,6 +558,7 @@ struct kvm_pmu { unsigned nr_arch_fixed_counters; unsigned available_event_types; u64 fixed_ctr_ctrl; + u64 fixed_ctr_ctrl_hw; u64 fixed_ctr_ctrl_rsvd; u64 global_ctrl; u64 global_status; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 3a901587ca6b..a05366e4eef2 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -900,11 +900,14 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu) pmc->counter = 0; pmc->emulated_counter = 0; - if (pmc_is_gp(pmc)) + if (pmc_is_gp(pmc)) { pmc->eventsel = 0; + pmc->eventsel_hw = 0; + } } - pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; + pmu->fixed_ctr_ctrl = pmu->fixed_ctr_ctrl_hw = 0; + pmu->global_ctrl = pmu->global_status = 0; kvm_pmu_call(reset)(vcpu); } diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 16c88b2a2eb8..c1ec1962314e 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -166,6 +166,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) data &= ~pmu->reserved_bits; if (data != pmc->eventsel) { pmc->eventsel = data; + pmc->eventsel_hw = data; kvm_pmu_request_counter_reprogram(pmc); } return 0; diff --git a/arch/x86/kvm/vmx/pmu_intel.c 
b/arch/x86/kvm/vmx/pmu_intel.c index 820da47454d7..855240678300 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -61,6 +61,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) int i; pmu->fixed_ctr_ctrl = data; + pmu->fixed_ctr_ctrl_hw = data; for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { u8 new_ctrl = fixed_ctrl_field(data, i); u8 old_ctrl = fixed_ctrl_field(old_fixed_ctr_ctrl, i); @@ -430,6 +431,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data != pmc->eventsel) { pmc->eventsel = data; + pmc->eventsel_hw = data; kvm_pmu_request_counter_reprogram(pmc); } break; From 3db871fe185baca66e78b56a230e236af40f1027 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 5 Dec 2025 16:17:01 -0800 Subject: [PATCH 36/55] KVM: x86/pmu: Reprogram mediated PMU event selectors on event filter updates Refresh the event selectors that are programmed into hardware when a PMC is "reprogrammed" for a mediated PMU, i.e. if userspace changes the PMU event filters. Note, KVM doesn't utilize the reprogramming infrastructure to handle counter overflow for mediated PMUs, as there's no need to reprogram a non-existent perf event.
Suggested-by: Sean Christopherson Signed-off-by: Dapeng Mi Co-developed-by: Mingwei Zhang Signed-off-by: Mingwei Zhang [sean: add a helper to document behavior, split patch and rewrite changelog] Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-26-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index a05366e4eef2..24f5c14715ef 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -520,6 +520,25 @@ static bool pmc_is_event_allowed(struct kvm_pmc *pmc) return is_fixed_event_allowed(filter, pmc->idx); } +static void kvm_mediated_pmu_refresh_event_filter(struct kvm_pmc *pmc) +{ + bool allowed = pmc_is_event_allowed(pmc); + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + if (pmc_is_gp(pmc)) { + pmc->eventsel_hw &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + if (allowed) + pmc->eventsel_hw |= pmc->eventsel & + ARCH_PERFMON_EVENTSEL_ENABLE; + } else { + u64 mask = intel_fixed_bits_by_idx(pmc->idx - KVM_FIXED_PMC_BASE_IDX, 0xf); + + pmu->fixed_ctr_ctrl_hw &= ~mask; + if (allowed) + pmu->fixed_ctr_ctrl_hw |= pmu->fixed_ctr_ctrl & mask; + } +} + static int reprogram_counter(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -528,6 +547,11 @@ static int reprogram_counter(struct kvm_pmc *pmc) bool emulate_overflow; u8 fixed_ctr_ctrl; + if (kvm_vcpu_has_mediated_pmu(pmu_to_vcpu(pmu))) { + kvm_mediated_pmu_refresh_event_filter(pmc); + return 0; + } + emulate_overflow = pmc_pause_counter(pmc); if (!pmc_is_globally_enabled(pmc) || !pmc_is_locally_enabled(pmc) || From a2f4ba534cc5d681a2d017c82e282bb32d8447df Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 5 Dec 2025 16:17:02 -0800 Subject: [PATCH 37/55] KVM: x86/pmu: Always stuff GuestOnly=1,HostOnly=0 for mediated PMCs on AMD On AMD platforms, there is no way to restore PerfCntrGlobalCtl at VM-Entry or clear it at VM-Exit. 
Since the register states will be restored before entering and saved after exiting guest context, the counters can keep ticking and even overflow, leading to chaos while still in host context. To avoid this, intercept event selectors, which is already done by the mediated PMU. In addition, always set the GuestOnly bit and clear the HostOnly bit for PMU selectors on AMD. Doing so allows the counters to run only in guest context even if their enable bits are still set after VM exit and before host/guest PMU context switch. Signed-off-by: Sandipan Das Signed-off-by: Mingwei Zhang [sean: massage shortlog] Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-27-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/pmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index c1ec1962314e..6d5f791126b1 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -166,7 +166,8 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) data &= ~pmu->reserved_bits; if (data != pmc->eventsel) { pmc->eventsel = data; - pmc->eventsel_hw = data; + pmc->eventsel_hw = (data & ~AMD64_EVENTSEL_HOSTONLY) | + AMD64_EVENTSEL_GUESTONLY; kvm_pmu_request_counter_reprogram(pmc); } return 0; From 56bb2736975068cc03648718bb8e50a456ce7173 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 5 Dec 2025 16:17:03 -0800 Subject: [PATCH 38/55] KVM: x86/pmu: Load/put mediated PMU context when entering/exiting guest Implement the PMU "world switch" between host perf and guest mediated PMU. When loading guest state, call into perf to switch from host to guest, and then load guest state into hardware, and then reverse those actions when putting guest state.
On the KVM side, when loading guest state, zero PERF_GLOBAL_CTRL to ensure all counters are disabled, then load selectors and counters, and finally call into vendor code to load control/status information. While VMX and SVM use different mechanisms to avoid counting host activity while guest controls are loaded, both implementations require PERF_GLOBAL_CTRL to be zeroed when the event selectors are in flux. When putting guest state, reverse the order, and save and zero controls and status prior to saving+zeroing selectors and counters. Defer clearing PERF_GLOBAL_CTRL to vendor code, as only SVM needs to manually clear the MSR; VMX configures PERF_GLOBAL_CTRL to be atomically cleared by the CPU on VM-Exit. Handle the difference in MSR layouts between Intel and AMD by communicating the bases and stride via kvm_pmu_ops. Because KVM requires Intel v4 (and full-width writes) and AMD v2, the MSRs to load/save are constant for a given vendor, i.e. do not vary based on the guest PMU, and do not vary based on host PMU (because KVM will simply disable mediated PMU support if the necessary MSRs are unsupported). Except for retrieving the guest's PERF_GLOBAL_CTRL, which needs to be read before invoking any fastpath handler (spoiler alert), perform the context switch around KVM's inner run loop. State only needs to be synchronized from hardware before KVM can access the software "caches". Note, VMX already grabs the guest's PERF_GLOBAL_CTRL immediately after VM-Exit, as hardware saves the value into the VMCS.
Co-developed-by: Mingwei Zhang Signed-off-by: Mingwei Zhang Co-developed-by: Sandipan Das Signed-off-by: Sandipan Das Signed-off-by: Dapeng Mi Tested-by: Xudong Hao Co-developed-by: Sean Christopherson Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-28-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm-x86-pmu-ops.h | 2 + arch/x86/include/asm/msr-index.h | 1 + arch/x86/kvm/pmu.c | 130 ++++++++++++++++++++++++- arch/x86/kvm/pmu.h | 10 ++ arch/x86/kvm/svm/pmu.c | 34 +++++++ arch/x86/kvm/svm/svm.c | 3 + arch/x86/kvm/vmx/pmu_intel.c | 44 +++++++++ arch/x86/kvm/x86.c | 4 + 8 files changed, 225 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index ad2cc82abf79..f0aa6996811f 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -24,6 +24,8 @@ KVM_X86_PMU_OP_OPTIONAL(deliver_pmi) KVM_X86_PMU_OP_OPTIONAL(cleanup) KVM_X86_PMU_OP_OPTIONAL(write_global_ctrl) +KVM_X86_PMU_OP(mediated_load) +KVM_X86_PMU_OP(mediated_put) #undef KVM_X86_PMU_OP #undef KVM_X86_PMU_OP_OPTIONAL diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3d0a0950d20a..4d3566bb1a93 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1219,6 +1219,7 @@ #define MSR_CORE_PERF_GLOBAL_STATUS 0x0000038e #define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f #define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390 +#define MSR_CORE_PERF_GLOBAL_STATUS_SET 0x00000391 #define MSR_PERF_METRICS 0x00000329 diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 24f5c14715ef..f6387c67b25c 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -880,10 +880,13 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) diff = pmu->global_ctrl ^ data; pmu->global_ctrl = data; reprogram_counters(pmu, diff); - - if (kvm_vcpu_has_mediated_pmu(vcpu)) - 
kvm_pmu_call(write_global_ctrl)(data); } + /* + * Unconditionally forward writes to vendor code, i.e. to the + * VMC{B,S}, as pmu->global_ctrl is per-VCPU, not per-VMC{B,S}. + */ + if (kvm_vcpu_has_mediated_pmu(vcpu)) + kvm_pmu_call(write_global_ctrl)(data); break; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: /* @@ -1244,3 +1247,124 @@ cleanup: kfree(filter); return r; } + +static __always_inline u32 fixed_counter_msr(u32 idx) +{ + return kvm_pmu_ops.FIXED_COUNTER_BASE + idx * kvm_pmu_ops.MSR_STRIDE; +} + +static __always_inline u32 gp_counter_msr(u32 idx) +{ + return kvm_pmu_ops.GP_COUNTER_BASE + idx * kvm_pmu_ops.MSR_STRIDE; +} + +static __always_inline u32 gp_eventsel_msr(u32 idx) +{ + return kvm_pmu_ops.GP_EVENTSEL_BASE + idx * kvm_pmu_ops.MSR_STRIDE; +} + +static void kvm_pmu_load_guest_pmcs(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + u32 i; + + /* + * No need to zero out unexposed GP/fixed counters/selectors since RDPMC + * is intercepted if hardware has counters that aren't visible to the + * guest (KVM will inject #GP as appropriate). + */ + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { + pmc = &pmu->gp_counters[i]; + + wrmsrl(gp_counter_msr(i), pmc->counter); + wrmsrl(gp_eventsel_msr(i), pmc->eventsel_hw); + } + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmc = &pmu->fixed_counters[i]; + + wrmsrl(fixed_counter_msr(i), pmc->counter); + } +} + +void kvm_mediated_pmu_load(struct kvm_vcpu *vcpu) +{ + if (!kvm_vcpu_has_mediated_pmu(vcpu) || + KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm)) + return; + + lockdep_assert_irqs_disabled(); + + perf_load_guest_context(); + + /* + * Explicitly clear PERF_GLOBAL_CTRL, as "loading" the guest's context + * disables all individual counters (if any were enabled), but doesn't + * globally disable the entire PMU. Loading event selectors and PMCs + * with guest values while PERF_GLOBAL_CTRL is non-zero will generate + * unexpected events and PMIs. 
+ * + * VMX will enable/disable counters at VM-Enter/VM-Exit by atomically + * loading PERF_GLOBAL_CONTROL. SVM effectively performs the switch by + * configuring all events to be GUEST_ONLY. Clear PERF_GLOBAL_CONTROL + * even for SVM to minimize the damage if a perf event is left enabled, + * and to ensure a consistent starting state. + */ + wrmsrq(kvm_pmu_ops.PERF_GLOBAL_CTRL, 0); + + perf_load_guest_lvtpc(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTPC)); + + kvm_pmu_load_guest_pmcs(vcpu); + + kvm_pmu_call(mediated_load)(vcpu); +} + +static void kvm_pmu_put_guest_pmcs(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + u32 i; + + /* + * Clear selectors and counters to ensure hardware doesn't count using + * guest controls when the host (perf) restores its state. + */ + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { + pmc = &pmu->gp_counters[i]; + + pmc->counter = rdpmc(i); + if (pmc->counter) + wrmsrq(gp_counter_msr(i), 0); + if (pmc->eventsel_hw) + wrmsrq(gp_eventsel_msr(i), 0); + } + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmc = &pmu->fixed_counters[i]; + + pmc->counter = rdpmc(INTEL_PMC_FIXED_RDPMC_BASE | i); + if (pmc->counter) + wrmsrq(fixed_counter_msr(i), 0); + } +} + +void kvm_mediated_pmu_put(struct kvm_vcpu *vcpu) +{ + if (!kvm_vcpu_has_mediated_pmu(vcpu) || + KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm)) + return; + + lockdep_assert_irqs_disabled(); + + /* + * Defer handling of PERF_GLOBAL_CTRL to vendor code. On Intel, it's + * atomically cleared on VM-Exit, i.e. doesn't need to be clear here. 
+ */ + kvm_pmu_call(mediated_put)(vcpu); + + kvm_pmu_put_guest_pmcs(vcpu); + + perf_put_guest_lvtpc(); + + perf_put_guest_context(); +} diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 9a199109d672..25b583da9ee2 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -38,11 +38,19 @@ struct kvm_pmu_ops { void (*cleanup)(struct kvm_vcpu *vcpu); bool (*is_mediated_pmu_supported)(struct x86_pmu_capability *host_pmu); + void (*mediated_load)(struct kvm_vcpu *vcpu); + void (*mediated_put)(struct kvm_vcpu *vcpu); void (*write_global_ctrl)(u64 global_ctrl); const u64 EVENTSEL_EVENT; const int MAX_NR_GP_COUNTERS; const int MIN_NR_GP_COUNTERS; + + const u32 PERF_GLOBAL_CTRL; + const u32 GP_EVENTSEL_BASE; + const u32 GP_COUNTER_BASE; + const u32 FIXED_COUNTER_BASE; + const u32 MSR_STRIDE; }; void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); @@ -240,6 +248,8 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu); void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu); +void kvm_mediated_pmu_load(struct kvm_vcpu *vcpu); +void kvm_mediated_pmu_put(struct kvm_vcpu *vcpu); bool is_vmware_backdoor_pmc(u32 pmc_idx); bool kvm_need_perf_global_ctrl_intercept(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 6d5f791126b1..7aa298eeb072 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -234,6 +234,32 @@ static bool amd_pmu_is_mediated_pmu_supported(struct x86_pmu_capability *host_pm return host_pmu->version >= 2; } +static void amd_mediated_pmu_load(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u64 global_status; + + rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, global_status); + /* Clear host global_status MSR if non-zero. 
*/ + if (global_status) + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, global_status); + + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, pmu->global_status); + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, pmu->global_ctrl); +} + +static void amd_mediated_pmu_put(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); + rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, pmu->global_status); + + /* Clear global status bits if non-zero */ + if (pmu->global_status) + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, pmu->global_status); +} + struct kvm_pmu_ops amd_pmu_ops __initdata = { .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = amd_msr_idx_to_pmc, @@ -245,8 +271,16 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .init = amd_pmu_init, .is_mediated_pmu_supported = amd_pmu_is_mediated_pmu_supported, + .mediated_load = amd_mediated_pmu_load, + .mediated_put = amd_mediated_pmu_put, .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_MAX_NR_AMD_GP_COUNTERS, .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS, + + .PERF_GLOBAL_CTRL = MSR_AMD64_PERF_CNTR_GLOBAL_CTL, + .GP_EVENTSEL_BASE = MSR_F15H_PERF_CTL0, + .GP_COUNTER_BASE = MSR_F15H_PERF_CTR0, + .FIXED_COUNTER_BASE = 0, + .MSR_STRIDE = 2, }; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index dca45f5151f9..1a616eb3ff1c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4367,6 +4367,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET; + if (!msr_write_intercepted(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL)) + rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, vcpu_to_pmu(vcpu)->global_ctrl); + trace_kvm_exit(vcpu, KVM_ISA_SVM); svm_complete_interrupts(vcpu); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 855240678300..55249fa4db95 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -792,6 +792,42 @@ static 
void intel_pmu_write_global_ctrl(u64 global_ctrl) vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, global_ctrl); } + +static void intel_mediated_pmu_load(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u64 global_status, toggle; + + rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, global_status); + toggle = pmu->global_status ^ global_status; + if (global_status & toggle) + wrmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status & toggle); + if (pmu->global_status & toggle) + wrmsrq(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status & toggle); + + wrmsrq(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl_hw); +} + +static void intel_mediated_pmu_put(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + /* MSR_CORE_PERF_GLOBAL_CTRL is already saved at VM-exit. */ + rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status); + + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */ + if (pmu->global_status) + wrmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status); + + /* + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and + * also to avoid accidentally enabling fixed counters (based on guest + * state) while running in the host, e.g. when setting global ctrl. 
+ */ + if (pmu->fixed_ctr_ctrl_hw) + wrmsrq(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); +} + struct kvm_pmu_ops intel_pmu_ops __initdata = { .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = intel_msr_idx_to_pmc, @@ -805,9 +841,17 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = { .cleanup = intel_pmu_cleanup, .is_mediated_pmu_supported = intel_pmu_is_mediated_pmu_supported, + .mediated_load = intel_mediated_pmu_load, + .mediated_put = intel_mediated_pmu_put, .write_global_ctrl = intel_pmu_write_global_ctrl, .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_MAX_NR_INTEL_GP_COUNTERS, .MIN_NR_GP_COUNTERS = 1, + + .PERF_GLOBAL_CTRL = MSR_CORE_PERF_GLOBAL_CTRL, + .GP_EVENTSEL_BASE = MSR_P6_EVNTSEL0, + .GP_COUNTER_BASE = MSR_IA32_PMC0, + .FIXED_COUNTER_BASE = MSR_CORE_PERF_FIXED_CTR0, + .MSR_STRIDE = 1, }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 76e86eb358df..589a309259f4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11334,6 +11334,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) run_flags |= KVM_RUN_LOAD_DEBUGCTL; vcpu->arch.host_debugctl = debug_ctl; + kvm_mediated_pmu_load(vcpu); + guest_timing_enter_irqoff(); /* @@ -11372,6 +11374,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_load_host_pkru(vcpu); + kvm_mediated_pmu_put(vcpu); + /* * Do this here before restoring debug registers on the host. And * since we do this before handling the vmexit, a DR access vmexit From f7a65e58d64340c3c0e390ea4e1c4857cd451f1f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:04 -0800 Subject: [PATCH 39/55] KVM: x86/pmu: Disallow emulation in the fastpath if mediated PMCs are active Don't handle exits in the fastpath if emulation is required, i.e. if an instruction needs to be skipped, the mediated PMU is enabled, and one or more PMCs is counting instructions. 
With the mediated PMU, KVM's cache of PMU state is inconsistent with respect to hardware until KVM exits the inner run loop (when the mediated PMU is "put"). Reviewed-by: Sandipan Das Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-29-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.h | 10 ++++++++++ arch/x86/kvm/x86.c | 9 +++++++++ 2 files changed, 19 insertions(+) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 25b583da9ee2..0925246731cb 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -234,6 +234,16 @@ static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc) return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); } +static inline bool kvm_pmu_is_fastpath_emulation_allowed(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + return !kvm_vcpu_has_mediated_pmu(vcpu) || + !bitmap_intersects(pmu->pmc_counting_instructions, + (unsigned long *)&pmu->global_ctrl, + X86_PMC_IDX_MAX); +} + void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 589a309259f4..4683df775b0a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2215,6 +2215,9 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_invd); fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu) { + if (!kvm_pmu_is_fastpath_emulation_allowed(vcpu)) + return EXIT_FASTPATH_NONE; + if (!kvm_emulate_invd(vcpu)) return EXIT_FASTPATH_EXIT_USERSPACE; @@ -2271,6 +2274,9 @@ static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) static fastpath_t __handle_fastpath_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { + if (!kvm_pmu_is_fastpath_emulation_allowed(vcpu)) + return EXIT_FASTPATH_NONE; + switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic) || @@ 
-11714,6 +11720,9 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_halt); fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu) { + if (!kvm_pmu_is_fastpath_emulation_allowed(vcpu)) + return EXIT_FASTPATH_NONE; + if (!kvm_emulate_halt(vcpu)) return EXIT_FASTPATH_EXIT_USERSPACE; From 283a5aa57b2223abf2f73afcc714c4d4553660f2 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 5 Dec 2025 16:17:05 -0800 Subject: [PATCH 40/55] KVM: x86/pmu: Handle emulated instruction for mediated vPMU Mediated vPMU needs to accumulate the emulated instructions into counter and load the counter into HW at vm-entry. Moreover, if the accumulation leads to counter overflow, KVM needs to update GLOBAL_STATUS and inject PMI into guest as well. Suggested-by: Sean Christopherson Signed-off-by: Dapeng Mi Signed-off-by: Mingwei Zhang Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-30-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index f6387c67b25c..b78ad897886d 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -1031,10 +1031,45 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) kvm_pmu_reset(vcpu); } +static bool pmc_is_pmi_enabled(struct kvm_pmc *pmc) +{ + u8 fixed_ctr_ctrl; + + if (pmc_is_gp(pmc)) + return pmc->eventsel & ARCH_PERFMON_EVENTSEL_INT; + + fixed_ctr_ctrl = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, + pmc->idx - KVM_FIXED_PMC_BASE_IDX); + return fixed_ctr_ctrl & INTEL_FIXED_0_ENABLE_PMI; +} + static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) { - pmc->emulated_counter++; - kvm_pmu_request_counter_reprogram(pmc); + struct kvm_vcpu *vcpu = pmc->vcpu; + + /* + * For perf-based PMUs, accumulate software-emulated events separately + * from pmc->counter, as pmc->counter is offset by the count of the + * associated perf event. 
Request reprogramming, which will consult + * both emulated and hardware-generated events to detect overflow. + */ + if (!kvm_vcpu_has_mediated_pmu(vcpu)) { + pmc->emulated_counter++; + kvm_pmu_request_counter_reprogram(pmc); + return; + } + + /* + * For mediated PMUs, pmc->counter is updated when the vCPU's PMU is + * put, and will be loaded into hardware when the PMU is loaded. Simply + * increment the counter and signal overflow if it wraps to zero. + */ + pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); + if (!pmc->counter) { + pmc_to_pmu(pmc)->global_status |= BIT_ULL(pmc->idx); + if (pmc_is_pmi_enabled(pmc)) + kvm_make_request(KVM_REQ_PMI, vcpu); + } } static inline bool cpl_is_matched(struct kvm_pmc *pmc) From cb58327c4c8ad9e81d3a2f17adaf3ab57066f369 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 5 Dec 2025 16:17:06 -0800 Subject: [PATCH 41/55] KVM: nVMX: Add macros to simplify nested MSR interception setting Add macros nested_vmx_merge_msr_bitmaps_xxx() to simplify nested MSR interception setting. No function change intended. 
Suggested-by: Sean Christopherson Signed-off-by: Dapeng Mi Signed-off-by: Mingwei Zhang Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-31-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 40777278eabb..b56ed2b1ac67 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -617,6 +617,19 @@ static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, msr_bitmap_l0, msr); } +#define nested_vmx_merge_msr_bitmaps(msr, type) \ + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, \ + msr_bitmap_l0, msr, type) + +#define nested_vmx_merge_msr_bitmaps_read(msr) \ + nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_R) + +#define nested_vmx_merge_msr_bitmaps_write(msr) \ + nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_W) + +#define nested_vmx_merge_msr_bitmaps_rw(msr) \ + nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_RW) + /* * Merge L0's and L1's MSR bitmap, return false to indicate that * we do not use the hardware. @@ -700,23 +713,13 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. 
*/ #ifdef CONFIG_X86_64 - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, - MSR_FS_BASE, MSR_TYPE_RW); - - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, - MSR_GS_BASE, MSR_TYPE_RW); - - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, - MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + nested_vmx_merge_msr_bitmaps_rw(MSR_FS_BASE); + nested_vmx_merge_msr_bitmaps_rw(MSR_GS_BASE); + nested_vmx_merge_msr_bitmaps_rw(MSR_KERNEL_GS_BASE); #endif - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, - MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); - - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, - MSR_IA32_PRED_CMD, MSR_TYPE_W); - - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, - MSR_IA32_FLUSH_CMD, MSR_TYPE_W); + nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_SPEC_CTRL); + nested_vmx_merge_msr_bitmaps_write(MSR_IA32_PRED_CMD); + nested_vmx_merge_msr_bitmaps_write(MSR_IA32_FLUSH_CMD); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_APERF, MSR_TYPE_R); From 88ebc2a3199cb5f16aff20673ed97b63a4295989 Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Fri, 5 Dec 2025 16:17:07 -0800 Subject: [PATCH 42/55] KVM: nVMX: Disable PMU MSR interception as appropriate while running L2 Merge KVM's PMU MSR interception bitmaps with those of L1, i.e. merge the bitmaps of vmcs01 and vmcs12, e.g. so that KVM doesn't interpose on MSR accesses unnecessarily if L1 exposes a mediated PMU (or equivalent) to L2. 
Signed-off-by: Mingwei Zhang Co-developed-by: Dapeng Mi Signed-off-by: Dapeng Mi [sean: rewrite changelog and comment, omit MSRs that are always intercepted] Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-32-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b56ed2b1ac67..729cc1f05ac8 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -630,6 +630,34 @@ static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, #define nested_vmx_merge_msr_bitmaps_rw(msr) \ nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_RW) +static void nested_vmx_merge_pmu_msr_bitmaps(struct kvm_vcpu *vcpu, + unsigned long *msr_bitmap_l1, + unsigned long *msr_bitmap_l0) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + int i; + + /* + * Skip the merges if the vCPU doesn't have a mediated PMU MSR, i.e. if + * none of the MSRs can possibly be passed through to L1. + */ + if (!kvm_vcpu_has_mediated_pmu(vcpu)) + return; + + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { + nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_PERFCTR0 + i); + nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_PMC0 + i); + } + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) + nested_vmx_merge_msr_bitmaps_rw(MSR_CORE_PERF_FIXED_CTR0 + i); + + nested_vmx_merge_msr_bitmaps_rw(MSR_CORE_PERF_GLOBAL_CTRL); + nested_vmx_merge_msr_bitmaps_read(MSR_CORE_PERF_GLOBAL_STATUS); + nested_vmx_merge_msr_bitmaps_write(MSR_CORE_PERF_GLOBAL_OVF_CTRL); +} + /* * Merge L0's and L1's MSR bitmap, return false to indicate that * we do not use the hardware. 
@@ -745,6 +773,8 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_PL3_SSP, MSR_TYPE_RW); + nested_vmx_merge_pmu_msr_bitmaps(vcpu, msr_bitmap_l1, msr_bitmap_l0); + kvm_vcpu_unmap(vcpu, &map); vmx->nested.force_msr_bitmap_recalc = false; From 3b36160d9406863812883c96c1efc8bc5c04e2cc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:08 -0800 Subject: [PATCH 43/55] KVM: nSVM: Disable PMU MSR interception as appropriate while running L2 Add MSRs that might be passed through to L1 when running with a mediated PMU to the nested SVM's set of to-be-merged MSR indices, i.e. disable interception of PMU MSRs when running L2 if both KVM (L0) and L1 disable interception. There is no need for KVM to interpose on such MSR accesses, e.g. if L1 exposes a mediated PMU (or equivalent) to L2. Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-33-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index c81005b24522..9ca8dad9a7f3 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -194,7 +194,7 @@ void recalc_intercepts(struct vcpu_svm *svm) * Hardcode the capacity of the array based on the maximum number of _offsets_. * MSRs are batched together, so there are fewer offsets than MSRs. 
*/ -static int nested_svm_msrpm_merge_offsets[7] __ro_after_init; +static int nested_svm_msrpm_merge_offsets[10] __ro_after_init; static int nested_svm_nr_msrpm_merge_offsets __ro_after_init; typedef unsigned long nsvm_msrpm_merge_t; @@ -222,6 +222,22 @@ int __init nested_svm_init_msrpm_merge_offsets(void) MSR_IA32_LASTBRANCHTOIP, MSR_IA32_LASTINTFROMIP, MSR_IA32_LASTINTTOIP, + + MSR_K7_PERFCTR0, + MSR_K7_PERFCTR1, + MSR_K7_PERFCTR2, + MSR_K7_PERFCTR3, + MSR_F15H_PERF_CTR0, + MSR_F15H_PERF_CTR1, + MSR_F15H_PERF_CTR2, + MSR_F15H_PERF_CTR3, + MSR_F15H_PERF_CTR4, + MSR_F15H_PERF_CTR5, + + MSR_AMD64_PERF_CNTR_GLOBAL_CTL, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, }; int i, j; From 860bcb1021f5234820592853d56ca12f69e9c81f Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Fri, 5 Dec 2025 16:17:09 -0800 Subject: [PATCH 44/55] KVM: x86/pmu: Expose enable_mediated_pmu parameter to user space Expose enable_mediated_pmu parameter to user space, i.e. allow userspace to enable/disable mediated vPMU support. Document the mediated versus perf-based behavior as part of the kernel-parameters.txt entry, and opportunistically add an entry for the core enable_pmu param as well. Signed-off-by: Dapeng Mi Signed-off-by: Mingwei Zhang Tested-by: Xudong Hao Co-developed-by: Sean Christopherson Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-34-seanjc@google.com Signed-off-by: Sean Christopherson --- .../admin-guide/kernel-parameters.txt | 49 +++++++++++++++++++ arch/x86/kvm/svm/svm.c | 2 + arch/x86/kvm/vmx/vmx.c | 2 + 3 files changed, 53 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a8d0afde7f85..c13a8877f5b3 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3044,6 +3044,26 @@ Kernel parameters Default is Y (on). 
+ kvm.enable_pmu=[KVM,X86] + If enabled, KVM will virtualize PMU functionality based + on the virtual CPU model defined by userspace. This + can be overridden on a per-VM basis via + KVM_CAP_PMU_CAPABILITY. + + If disabled, KVM will not virtualize PMU functionality, + e.g. MSRs, PMCs, PMIs, etc., even if userspace defines + a virtual CPU model that contains PMU assets. + + Note, KVM's vPMU support implicitly requires running + with an in-kernel local APIC, e.g. to deliver PMIs to + the guest. Running without an in-kernel local APIC is + not supported, though KVM will allow such a combination + (with severely degraded functionality). + + See also enable_mediated_pmu. + + Default is Y (on). + kvm.enable_virt_at_load=[KVM,ARM64,LOONGARCH,MIPS,RISCV,X86] If enabled, KVM will enable virtualization in hardware when KVM is loaded, and disable virtualization when KVM @@ -3090,6 +3110,35 @@ Kernel parameters If the value is 0 (the default), KVM will pick a period based on the ratio, such that a page is zapped after 1 hour on average. + kvm-{amd,intel}.enable_mediated_pmu=[KVM,AMD,INTEL] + If enabled, KVM will provide a mediated virtual PMU, + instead of the default perf-based virtual PMU (if + kvm.enable_pmu is true and PMU is enumerated via the + virtual CPU model). + + With a perf-based vPMU, KVM operates as a user of perf, + i.e. emulates guest PMU counters using perf events. + KVM-created perf events are managed by perf as regular + (guest-only) events, e.g. are scheduled in/out, contend + for hardware resources, etc. Using a perf-based vPMU + allows guest and host usage of the PMU to co-exist, but + incurs non-trivial overhead and can result in silently + dropped guest events (due to resource contention). + + With a mediated vPMU, hardware PMU state is context + switched around the world switch to/from the guest. 
+ KVM mediates which events the guest can utilize, but + gives the guest direct access to all other PMU assets + when possible (KVM may intercept some accesses if the + virtual CPU model provides a subset of hardware PMU + functionality). Using a mediated vPMU significantly + reduces PMU virtualization overhead and eliminates lost + guest events, but is mutually exclusive with using perf + to profile KVM guests and adds latency to most VM-Exits + (to context switch PMU state). + + Default is N (off). + kvm-amd.nested= [KVM,AMD] Control nested virtualization feature in KVM/SVM. Default is 1 (enabled). diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 1a616eb3ff1c..5910088fe22a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -170,6 +170,8 @@ module_param(intercept_smi, bool, 0444); bool vnmi = true; module_param(vnmi, bool, 0444); +module_param(enable_mediated_pmu, bool, 0444); + static bool svm_gp_erratum_intercept = true; static u8 rsm_ins_bytes[] = "\x0f\xaa"; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f0a20ff2a941..62ba2a2b9e98 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -150,6 +150,8 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); extern bool __read_mostly allow_smaller_maxphyaddr; module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); +module_param(enable_mediated_pmu, bool, 0444); + #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE #define KVM_VM_CR0_ALWAYS_ON \ From b0b6a8d3be16ea742bf835407e9968378c0c753c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:10 -0800 Subject: [PATCH 45/55] KVM: x86/pmu: Elide WRMSRs when loading guest PMCs if values already match When loading a mediated PMU state, elide the WRMSRs to load PMCs with the guest's value if the value in hardware already matches the guest's value. 
For the relatively common case where neither the guest nor the host is actively using the PMU, i.e. when all/many counters are '0', eliding the WRMSRs reduces the latency of handling VM-Exit by a measurable amount (WRMSR is significantly more expensive than RDPMC). As measured by KVM-Unit-Tests' CPUID VM-Exit testcase, this provides a ~25% reduction in latency (4k => 3k cycles) on Intel Emerald Rapids, and a ~13% reduction (6.2k => 5.3k cycles) on AMD Turin. Cc: Manali Shukla Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-35-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b78ad897886d..954622f8f817 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -1312,13 +1312,15 @@ static void kvm_pmu_load_guest_pmcs(struct kvm_vcpu *vcpu) for (i = 0; i < pmu->nr_arch_gp_counters; i++) { pmc = &pmu->gp_counters[i]; - wrmsrl(gp_counter_msr(i), pmc->counter); + if (pmc->counter != rdpmc(i)) + wrmsrl(gp_counter_msr(i), pmc->counter); wrmsrl(gp_eventsel_msr(i), pmc->eventsel_hw); } for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { pmc = &pmu->fixed_counters[i]; - wrmsrl(fixed_counter_msr(i), pmc->counter); + if (pmc->counter != rdpmc(INTEL_PMC_FIXED_RDPMC_BASE | i)) + wrmsrl(fixed_counter_msr(i), pmc->counter); } } From 462f092dc55c0eb97da02dd0c773a4394850dd1b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:11 -0800 Subject: [PATCH 46/55] KVM: VMX: Drop intermediate "guest" field from msr_autostore Drop the intermediate "guest" field from vcpu_vmx.msr_autostore as the value saved on VM-Exit isn't guaranteed to be the guest's value, it's purely whatever is in hardware at the time of VM-Exit. E.g.
KVM's only use of the store list at the moment is to snapshot TSC at VM-Exit, and the value saved is always the raw TSC even if TSC-offsetting and/or TSC-scaling is enabled for the guest. And unlike msr_autoload, there is no need to differentiate between "on-entry" and "on-exit". No functional change intended. Cc: Jim Mattson Reviewed-by: Dapeng Mi Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-36-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 10 +++++----- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/vmx/vmx.h | 4 +--- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 729cc1f05ac8..486789dac515 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1076,11 +1076,11 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, * VM-exit in L0, use the more accurate value. */ if (msr_index == MSR_IA32_TSC) { - int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest, + int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore, MSR_IA32_TSC); if (i >= 0) { - u64 val = vmx->msr_autostore.guest.val[i].value; + u64 val = vmx->msr_autostore.val[i].value; *data = kvm_read_l1_tsc(vcpu, val); return true; @@ -1167,7 +1167,7 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, u32 msr_index) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmx_msrs *autostore = &vmx->msr_autostore.guest; + struct vmx_msrs *autostore = &vmx->msr_autostore; bool in_vmcs12_store_list; int msr_autostore_slot; bool in_autostore_list; @@ -2366,7 +2366,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) * addresses are constant (for vmcs02), the counts can change based * on L2's behavior, e.g. switching to/from long mode.
*/ - vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); + vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.val)); vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); @@ -2704,7 +2704,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) */ prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 62ba2a2b9e98..23c92c41fd83 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6567,7 +6567,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu) if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0) vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest); if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) - vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest); + vmx_dump_msrs("autostore", &vmx->msr_autostore); if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE) pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n", diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index d7a96c84371f..4ce653d729ca 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -245,9 +245,7 @@ struct vcpu_vmx { struct vmx_msrs host; } msr_autoload; - struct msr_autostore { - struct vmx_msrs guest; - } msr_autostore; + struct vmx_msrs msr_autostore; struct { int vm86_active; From 58f21a01417f273b4246c885558c252e345681b3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:12 -0800 Subject: [PATCH 47/55] KVM: nVMX: Don't update msr_autostore count when saving TSC for vmcs12 Rework nVMX's use of the MSR auto-store list to snapshot TSC to sneak MSR_IA32_TSC into the 
list _without_ updating KVM's software tracking, and drop the generic functionality so that future usage of the store list for nested specific logic needs to consider the implications of modifying the list. Updating the list only for vmcs02 and only on nested VM-Enter is a disaster waiting to happen, as it means vmcs01 is stale relative to the software tracking, and KVM could unintentionally leave an MSR in the store list in perpetuity while running L1, e.g. if KVM addressed the first issue and updated vmcs01 on nested VM-Exit without removing TSC from the list. Furthermore, mixing KVM's desire to save an MSR with L1's desire to save an MSR results in KVM clobbering/ignoring the needs of vmcs01 or vmcs02. E.g. if KVM added MSR_IA32_TSC to the store list for its own purposes, and then _removed_ MSR_IA32_TSC from the list after emulating nested VM-Enter, then KVM would remove MSR_IA32_TSC from the list even though saving TSC on VM-Exit from L2 is still desirable (to provide L1 with an accurate TSC). Similarly, removing an MSR from the list based on vmcs12's settings could drop an MSR that KVM wants to save for its own purposes. In practice, the issues are currently benign, because KVM doesn't use the store list for vmcs01. But that will change with upcoming mediated PMU support. Alternatively, a "full" solution would be to track MSR list entries for vmcs12 separately from KVM's standard lists, but MSR_IA32_TSC is likely the only MSR that KVM would ever want to save on _every_ VM-Exit purely based on vmcs12. I.e. the added complexity isn't remotely justified at this time. Opportunistically escalate from a pr_warn_ratelimited() to a full WARN as KVM reserves eight entries in each MSR list, and as above KVM uses at most one entry. Opportunistically make vmx_find_loadstore_msr_slot() local to vmx.c as using it directly from nested code is unsafe due to the potential for mixing vmcs01 and vmcs02 state (see above).
Cc: Jim Mattson Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-37-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 71 ++++++++++++--------------------------- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/vmx/vmx.h | 2 +- 3 files changed, 24 insertions(+), 51 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 486789dac515..614b789ecf16 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1075,16 +1075,12 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, * does not include the time taken for emulation of the L2->L1 * VM-exit in L0, use the more accurate value. */ - if (msr_index == MSR_IA32_TSC) { - int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore, - MSR_IA32_TSC); + if (msr_index == MSR_IA32_TSC && vmx->nested.tsc_autostore_slot >= 0) { + int slot = vmx->nested.tsc_autostore_slot; + u64 host_tsc = vmx->msr_autostore.val[slot].value; - if (i >= 0) { - u64 val = vmx->msr_autostore.val[i].value; - - *data = kvm_read_l1_tsc(vcpu, val); - return true; - } + *data = kvm_read_l1_tsc(vcpu, host_tsc); + return true; } if (kvm_emulate_msr_read(vcpu, msr_index, data)) { @@ -1163,42 +1159,6 @@ static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) return false; } -static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, - u32 msr_index) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmx_msrs *autostore = &vmx->msr_autostore; - bool in_vmcs12_store_list; - int msr_autostore_slot; - bool in_autostore_list; - int last; - - msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index); - in_autostore_list = msr_autostore_slot >= 0; - in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); - - if (in_vmcs12_store_list && !in_autostore_list) { - if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { - /* - * Emulated VMEntry does not fail here. 
Instead a less - * accurate value will be returned by - * nested_vmx_get_vmexit_msr_value() by reading KVM's - * internal MSR state instead of reading the value from - * the vmcs02 VMExit MSR-store area. - */ - pr_warn_ratelimited( - "Not enough msr entries in msr_autostore. Can't add msr %x\n", - msr_index); - return; - } - last = autostore->nr++; - autostore->val[last].index = msr_index; - } else if (!in_vmcs12_store_list && in_autostore_list) { - last = --autostore->nr; - autostore->val[msr_autostore_slot] = autostore->val[last]; - } -} - /* * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are * emulating VM-Entry into a guest with EPT enabled. On failure, the expected @@ -2699,12 +2659,25 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) } /* - * Make sure the msr_autostore list is up to date before we set the - * count in the vmcs02. + * If vmcs12 is configured to save TSC on exit via the auto-store list, + * append the MSR to vmcs02's auto-store list so that KVM effectively + * reads TSC at the time of VM-Exit from L2. The saved value will be + * propagated to vmcs12's list on nested VM-Exit. + * + * Don't increment the number of MSRs in the vCPU structure, as saving + * TSC is specific to this particular incarnation of vmcb02, i.e. must + * not bleed into vmcs01. 
*/ - prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); + if (nested_msr_store_list_has_msr(&vmx->vcpu, MSR_IA32_TSC) && + !WARN_ON_ONCE(vmx->msr_autostore.nr >= ARRAY_SIZE(vmx->msr_autostore.val))) { + vmx->nested.tsc_autostore_slot = vmx->msr_autostore.nr; + vmx->msr_autostore.val[vmx->msr_autostore.nr].index = MSR_IA32_TSC; - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr); + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr + 1); + } else { + vmx->nested.tsc_autostore_slot = -1; + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr); + } vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 23c92c41fd83..52bcb817cc15 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1029,7 +1029,7 @@ static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx vm_exit_controls_clearbit(vmx, exit); } -int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) +static int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) { unsigned int i; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 4ce653d729ca..3175fedb5a4d 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -191,6 +191,7 @@ struct nested_vmx { u16 vpid02; u16 last_vpid; + int tsc_autostore_slot; struct nested_vmx_msrs msrs; /* SMM related state */ @@ -383,7 +384,6 @@ void vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, unsigned int flags); unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx); bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, unsigned int flags); -int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set); From 0bd29379114b9c669cdabf7d6c08c0c1ea41861c Mon Sep 17 00:00:00 2001 From: Sean 
Christopherson Date: Fri, 5 Dec 2025 16:17:13 -0800 Subject: [PATCH 48/55] KVM: VMX: Dedup code for removing MSR from VMCS's auto-load list Add a helper to remove an MSR from an auto-{load,store} list to dedup the msr_autoload code, and in anticipation of adding similar functionality for msr_autostore. No functional change intended. Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-38-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 52bcb817cc15..a51f66d1b201 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1040,9 +1040,22 @@ static int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) return -ENOENT; } -static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) +static void vmx_remove_auto_msr(struct vmx_msrs *m, u32 msr, + unsigned long vmcs_count_field) { int i; + + i = vmx_find_loadstore_msr_slot(m, msr); + if (i < 0) + return; + + --m->nr; + m->val[i] = m->val[m->nr]; + vmcs_write32(vmcs_count_field, m->nr); +} + +static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) +{ struct msr_autoload *m = &vmx->msr_autoload; switch (msr) { @@ -1063,21 +1076,9 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) } break; } - i = vmx_find_loadstore_msr_slot(&m->guest, msr); - if (i < 0) - goto skip_guest; - --m->guest.nr; - m->guest.val[i] = m->guest.val[m->guest.nr]; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); -skip_guest: - i = vmx_find_loadstore_msr_slot(&m->host, msr); - if (i < 0) - return; - - --m->host.nr; - m->host.val[i] = m->host.val[m->host.nr]; - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); + vmx_remove_auto_msr(&m->guest, msr, VM_ENTRY_MSR_LOAD_COUNT); + vmx_remove_auto_msr(&m->host, msr, VM_EXIT_MSR_LOAD_COUNT); } static __always_inline void 
add_atomic_switch_msr_special(struct vcpu_vmx *vmx, From 84ac00042a28642cc974a0a250fab7df050a5dd5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:14 -0800 Subject: [PATCH 49/55] KVM: VMX: Drop unused @entry_only param from add_atomic_switch_msr() Drop the "on VM-Enter only" parameter from add_atomic_switch_msr() as it is no longer used, and for all intents and purposes was never used. The functionality was added, under embargo, by commit 989e3992d2ec ("x86/KVM/VMX: Extend add_atomic_switch_msr() to allow VMENTER only MSRs"), and then ripped out by commit 2f055947ae5e ("x86/kvm: Drop L1TF MSR list approach") just a few commits later. 2f055947ae5e x86/kvm: Drop L1TF MSR list approach 72c6d2db64fa x86/litf: Introduce vmx status variable 215af5499d9e cpu/hotplug: Online siblings when SMT control is turned on 390d975e0c4e x86/KVM/VMX: Use MSR save list for IA32_FLUSH_CMD if required 989e3992d2ec x86/KVM/VMX: Extend add_atomic_switch_msr() to allow VMENTER only MSRs Furthermore, it's extremely unlikely KVM will ever _need_ to load an MSR value via the auto-load lists only on VM-Enter. MSR writes via the lists aren't optimized in any way, and so the only reason to use the lists instead of a WRMSR is for cases where the MSR _must_ be loaded atomically with respect to VM-Enter (and/or VM-Exit). While one could argue that command MSRs, e.g. IA32_FLUSH_CMD, "need" to be done exactly at VM-Enter, in practice doing such flushes within a few instructions of VM-Enter is more than sufficient. Note, the shortlog and changelog for commit 390d975e0c4e ("x86/KVM/VMX: Use MSR save list for IA32_FLUSH_CMD if required") are misleading and wrong. That commit added MSR_IA32_FLUSH_CMD to the VM-Enter _load_ list, not the VM-Enter save list (which doesn't exist, only VM-Exit has a store/save list).
Reviewed-by: Dapeng Mi Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-39-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a51f66d1b201..38491962b2c1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1094,7 +1094,7 @@ static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, } static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, - u64 guest_val, u64 host_val, bool entry_only) + u64 guest_val, u64 host_val) { int i, j = 0; struct msr_autoload *m = &vmx->msr_autoload; @@ -1132,8 +1132,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, } i = vmx_find_loadstore_msr_slot(&m->guest, msr); - if (!entry_only) - j = vmx_find_loadstore_msr_slot(&m->host, msr); + j = vmx_find_loadstore_msr_slot(&m->host, msr); if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) || (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) { @@ -1148,9 +1147,6 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, m->guest.val[i].index = msr; m->guest.val[i].value = guest_val; - if (entry_only) - return; - if (j < 0) { j = m->host.nr++; vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); @@ -1190,8 +1186,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx) if (!(guest_efer & EFER_LMA)) guest_efer &= ~EFER_LME; if (guest_efer != kvm_host.efer) - add_atomic_switch_msr(vmx, MSR_EFER, - guest_efer, kvm_host.efer, false); + add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, kvm_host.efer); else clear_atomic_switch_msr(vmx, MSR_EFER); return false; @@ -7350,7 +7345,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) clear_atomic_switch_msr(vmx, msrs[i].msr); else add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, - msrs[i].host, false); + msrs[i].host); } static void vmx_update_hv_timer(struct 
kvm_vcpu *vcpu, bool force_immediate_exit) From 2ed57bb8997610b33cb92c26ccb9a91b2966fff8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:15 -0800 Subject: [PATCH 50/55] KVM: VMX: Bug the VM if either MSR auto-load list is full WARN and bug the VM if either MSR auto-load list is full when adding an MSR to the lists, as the set of MSRs that KVM loads via the lists is finite and entirely KVM controlled, i.e. overflowing the lists shouldn't be possible in a fully released version of KVM. Terminate the VM as the core KVM infrastructure has no insight as to _why_ an MSR is being added to the list, and failure to load an MSR on VM-Enter and/or VM-Exit could be fatal to the host. E.g. running the host with a guest-controlled PEBS MSR could generate unexpected writes to the DS buffer and crash the host. Reviewed-by: Dapeng Mi Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-40-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 38491962b2c1..2c50ebf4ff1b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1098,6 +1098,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, { int i, j = 0; struct msr_autoload *m = &vmx->msr_autoload; + struct kvm *kvm = vmx->vcpu.kvm; switch (msr) { case MSR_EFER: @@ -1134,12 +1135,10 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, i = vmx_find_loadstore_msr_slot(&m->guest, msr); j = vmx_find_loadstore_msr_slot(&m->host, msr); - if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) || - (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) { - printk_once(KERN_WARNING "Not enough msr switch entries. 
" - "Can't add msr %x\n", msr); + if (KVM_BUG_ON(i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS, kvm) || + KVM_BUG_ON(j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS, kvm)) return; - } + if (i < 0) { i = m->guest.nr++; vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); From 0c4ff0866fc1b0bf8c1d8d5f27fedc6dd9c51183 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:16 -0800 Subject: [PATCH 51/55] KVM: VMX: Set MSR index auto-load entry if and only if entry is "new" When adding an MSR to the auto-load lists, update the MSR index in the list entry if and only if a new entry is being inserted, as 'i' can only be non-negative if vmx_find_loadstore_msr_slot() found an entry with the MSR's index. Unnecessarily setting the index is benign, but it makes it harder to see that updating the value is necessary even when an existing entry for the MSR was found. No functional change intended. Reviewed-by: Dapeng Mi Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-41-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2c50ebf4ff1b..be2a2580e8f1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1141,16 +1141,16 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, if (i < 0) { i = m->guest.nr++; + m->guest.val[i].index = msr; vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); } - m->guest.val[i].index = msr; m->guest.val[i].value = guest_val; if (j < 0) { j = m->host.nr++; + m->host.val[j].index = msr; vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); } - m->host.val[j].index = msr; m->host.val[j].value = host_val; } From 2239d137a71d77c7610434473b0c8cfde90d4116 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:17 -0800 Subject: [PATCH 52/55] KVM: VMX: Compartmentalize adding MSRs to host vs. 
guest auto-load list Undo the bundling of the "host" and "guest" MSR auto-load list logic so that the code can be deduplicated by factoring out the logic to a separate helper. Now that "list full" situations are treated as fatal to the VM, there is no need to pre-check both lists. For all intents and purposes, this reverts the add_atomic_switch_msr() changes made by commit 3190709335dd ("x86/KVM/VMX: Separate the VMX AUTOLOAD guest/host number accounting"). Reviewed-by: Dapeng Mi Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-42-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index be2a2580e8f1..018e01daab68 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1096,9 +1096,9 @@ static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, u64 guest_val, u64 host_val) { - int i, j = 0; struct msr_autoload *m = &vmx->msr_autoload; struct kvm *kvm = vmx->vcpu.kvm; + int i; switch (msr) { case MSR_EFER: @@ -1133,25 +1133,26 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, } i = vmx_find_loadstore_msr_slot(&m->guest, msr); - j = vmx_find_loadstore_msr_slot(&m->host, msr); - - if (KVM_BUG_ON(i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS, kvm) || - KVM_BUG_ON(j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS, kvm)) - return; - if (i < 0) { + if (KVM_BUG_ON(m->guest.nr == MAX_NR_LOADSTORE_MSRS, kvm)) + return; + i = m->guest.nr++; m->guest.val[i].index = msr; vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); } m->guest.val[i].value = guest_val; - if (j < 0) { - j = m->host.nr++; - m->host.val[j].index = msr; + i = vmx_find_loadstore_msr_slot(&m->host, msr); + if (i < 0) { + if (KVM_BUG_ON(m->host.nr == MAX_NR_LOADSTORE_MSRS, kvm)) + return; + + i = 
m->host.nr++; + m->host.val[i].index = msr; vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); } - m->host.val[j].value = host_val; + m->host.val[i].value = host_val; } static bool update_transition_efer(struct vcpu_vmx *vmx) From c3d6a7210a4de909683a36779f5b8567f79a3688 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:18 -0800 Subject: [PATCH 53/55] KVM: VMX: Dedup code for adding MSR to VMCS's auto list Add a helper to add an MSR to a VMCS's "auto" list to deduplicate the code in add_atomic_switch_msr(), and so that the functionality can be used in the future for managing the MSR auto-store list. No functional change intended. Reviewed-by: Dapeng Mi Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-43-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 018e01daab68..3f64d4b1b19c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1093,12 +1093,28 @@ static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, vm_exit_controls_setbit(vmx, exit); } +static void vmx_add_auto_msr(struct vmx_msrs *m, u32 msr, u64 value, + unsigned long vmcs_count_field, struct kvm *kvm) +{ + int i; + + i = vmx_find_loadstore_msr_slot(m, msr); + if (i < 0) { + if (KVM_BUG_ON(m->nr == MAX_NR_LOADSTORE_MSRS, kvm)) + return; + + i = m->nr++; + m->val[i].index = msr; + vmcs_write32(vmcs_count_field, m->nr); + } + m->val[i].value = value; +} + static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, u64 guest_val, u64 host_val) { struct msr_autoload *m = &vmx->msr_autoload; struct kvm *kvm = vmx->vcpu.kvm; - int i; switch (msr) { case MSR_EFER: @@ -1132,27 +1148,8 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, wrmsrq(MSR_IA32_PEBS_ENABLE, 0); } - i = 
vmx_find_loadstore_msr_slot(&m->guest, msr); - if (i < 0) { - if (KVM_BUG_ON(m->guest.nr == MAX_NR_LOADSTORE_MSRS, kvm)) - return; - - i = m->guest.nr++; - m->guest.val[i].index = msr; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); - } - m->guest.val[i].value = guest_val; - - i = vmx_find_loadstore_msr_slot(&m->host, msr); - if (i < 0) { - if (KVM_BUG_ON(m->host.nr == MAX_NR_LOADSTORE_MSRS, kvm)) - return; - - i = m->host.nr++; - m->host.val[i].index = msr; - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); - } - m->host.val[i].value = host_val; + vmx_add_auto_msr(&m->guest, msr, guest_val, VM_ENTRY_MSR_LOAD_COUNT, kvm); + vmx_add_auto_msr(&m->host, msr, host_val, VM_EXIT_MSR_LOAD_COUNT, kvm); } static bool update_transition_efer(struct vcpu_vmx *vmx) From 9757a5aebcd6ca808d5b80831649438a017478ad Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:19 -0800 Subject: [PATCH 54/55] KVM: VMX: Initialize vmcs01.VM_EXIT_MSR_STORE_ADDR with list address Initialize vmcs01.VM_EXIT_MSR_STORE_ADDR to point at the vCPU's msr_autostore list in anticipation of utilizing the auto-store functionality, and to harden KVM against stray reads to pfn 0 (or, in theory, a random pfn if the underlying CPU uses a complex scheme for encoding VMCS data). The MSR auto lists are supposed to be ignored if the associated COUNT VMCS field is '0', but leaving the ADDR field zero-initialized in memory is an unnecessary risk (albeit a minuscule risk) given that the cost is a single VMWRITE during vCPU creation.
Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-44-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3f64d4b1b19c..6a17cb90eaf4 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4933,6 +4933,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write64(VM_FUNCTION_CONTROL, 0); vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.val)); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); From d374b89edbb9a8d552e03348f59287ff779b4c9d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:17:20 -0800 Subject: [PATCH 55/55] KVM: VMX: Add mediated PMU support for CPUs without "save perf global ctrl" Extend mediated PMU support for Intel CPUs without support for saving PERF_GLOBAL_CONTROL into the guest VMCS field on VM-Exit, e.g. for Skylake and its derivatives, as well as Icelake. While supporting CPUs without VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL isn't completely trivial, it's not that complex either. And not supporting such CPUs would mean not supporting 7+ years of Intel CPUs released in the past 10 years. On VM-Exit, immediately propagate the saved PERF_GLOBAL_CTRL to the VMCS as well as KVM's software cache so that KVM doesn't need to add full EXREG tracking of PERF_GLOBAL_CTRL. In practice, the vast majority of VM-Exits won't trigger software writes to guest PERF_GLOBAL_CTRL, so deferring the VMWRITE to the next VM-Enter would only delay the inevitable without batching/avoiding VMWRITEs. Note! Take care to refresh VM_EXIT_MSR_STORE_COUNT on nested VM-Exit, as it's unfortunately possible that KVM could recalculate MSR intercepts while L2 is active, e.g. 
if userspace loads nested state and _then_ sets PERF_CAPABILITIES. Eating the VMWRITE on every nested VM-Exit is unfortunate, but that's a pre-existing problem and can/should be solved separately, e.g. modifying the number of auto-load entries while L2 is active is also uncommon on modern CPUs. Reviewed-by: Dapeng Mi Tested-by: Dapeng Mi Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-45-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 6 ++++- arch/x86/kvm/vmx/pmu_intel.c | 7 ----- arch/x86/kvm/vmx/vmx.c | 52 ++++++++++++++++++++++++++++++++---- 3 files changed, 52 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 614b789ecf16..1ee1edc8419d 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5142,7 +5142,11 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, kvm_nested_vmexit_handle_ibrs(vcpu); - /* Update any VMCS fields that might have changed while L2 ran */ + /* + * Update any VMCS fields that might have changed while vmcs02 was the + * active VMCS. The tracking is per-vCPU, not per-VMCS. + */ + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 55249fa4db95..27eb76e6b6a0 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -777,13 +777,6 @@ static bool intel_pmu_is_mediated_pmu_supported(struct x86_pmu_capability *host_ if (WARN_ON_ONCE(!cpu_has_load_perf_global_ctrl())) return false; - /* - * KVM doesn't yet support mediated PMU on CPUs without support for - * saving PERF_GLOBAL_CTRL via a dedicated VMCS field. 
- */ - if (!cpu_has_save_perf_global_ctrl()) - return false; - return true; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6a17cb90eaf4..ba1262c3e3ff 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1204,6 +1204,17 @@ static bool update_transition_efer(struct vcpu_vmx *vmx) return true; } +static void vmx_add_autostore_msr(struct vcpu_vmx *vmx, u32 msr) +{ + vmx_add_auto_msr(&vmx->msr_autostore, msr, 0, VM_EXIT_MSR_STORE_COUNT, + vmx->vcpu.kvm); +} + +static void vmx_remove_autostore_msr(struct vcpu_vmx *vmx, u32 msr) +{ + vmx_remove_auto_msr(&vmx->msr_autostore, msr, VM_EXIT_MSR_STORE_COUNT); +} + #ifdef CONFIG_X86_32 /* * On 32-bit kernels, VM exits still load the FS and GS bases from the @@ -4225,6 +4236,8 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) static void vmx_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu) { + u64 vm_exit_controls_bits = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | + VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL; bool has_mediated_pmu = kvm_vcpu_has_mediated_pmu(vcpu); struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4234,12 +4247,19 @@ static void vmx_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu) if (!enable_mediated_pmu) return; + if (!cpu_has_save_perf_global_ctrl()) { + vm_exit_controls_bits &= ~VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL; + + if (has_mediated_pmu) + vmx_add_autostore_msr(vmx, MSR_CORE_PERF_GLOBAL_CTRL); + else + vmx_remove_autostore_msr(vmx, MSR_CORE_PERF_GLOBAL_CTRL); + } + vm_entry_controls_changebit(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, has_mediated_pmu); - vm_exit_controls_changebit(vmx, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | - VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL, - has_mediated_pmu); + vm_exit_controls_changebit(vmx, vm_exit_controls_bits, has_mediated_pmu); for (i = 0; i < pmu->nr_arch_gp_counters; i++) { vmx_set_intercept_for_msr(vcpu, MSR_IA32_PERFCTR0 + i, @@ -7346,6 +7366,29 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) 
msrs[i].host); } +static void vmx_refresh_guest_perf_global_control(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (msr_write_intercepted(vmx, MSR_CORE_PERF_GLOBAL_CTRL)) + return; + + if (!cpu_has_save_perf_global_ctrl()) { + int slot = vmx_find_loadstore_msr_slot(&vmx->msr_autostore, + MSR_CORE_PERF_GLOBAL_CTRL); + + if (WARN_ON_ONCE(slot < 0)) + return; + + pmu->global_ctrl = vmx->msr_autostore.val[slot].value; + vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, pmu->global_ctrl); + return; + } + + pmu->global_ctrl = vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL); +} + static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7631,8 +7674,7 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) vmx->loaded_vmcs->launched = 1; - if (!msr_write_intercepted(vmx, MSR_CORE_PERF_GLOBAL_CTRL)) - vcpu_to_pmu(vcpu)->global_ctrl = vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL); + vmx_refresh_guest_perf_global_control(vcpu); vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx);