From 90fb0a25f6543d2403dcd82e6f0bece92fa7f54c Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Sun, 11 Jan 2026 09:00:34 -0800 Subject: [PATCH 01/34] PCI: hv: Remove unused field pci_bus in struct hv_pcibus_device Field pci_bus in struct hv_pcibus_device is unused since commit 418cb6c8e051 ("PCI: hv: Generify PCI probing"). Remove it. No functional change. Signed-off-by: Michael Kelley Reviewed-by: Easwar Hariharan Reviewed-by: Prasanna Kumar T S M Reviewed-by: Srivatsa S. Bhat (Microsoft) Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 1e237d3538f9..7fcba05cec30 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -501,7 +501,6 @@ struct hv_pcibus_device { struct resource *low_mmio_res; struct resource *high_mmio_res; struct completion *survey_event; - struct pci_bus *pci_bus; spinlock_t config_lock; /* Avoid two threads writing index page */ spinlock_t device_list_lock; /* Protect lists below */ void __iomem *cfg_addr; From 754cf84504ea7dad1e9439b93f8729409ef2c2f2 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Sun, 18 Jan 2026 09:02:45 -0800 Subject: [PATCH 02/34] mshv: Fix compiler warning about cast converting incompatible function type In mshv_vtl_sint_ioctl_pause_msg_stream(), the reference to function mshv_vtl_synic_mask_vmbus_sint() is cast to type smp_call_func_t. The cast generates a compiler warning because the function signature of mshv_vtl_synic_mask_vmbus_sint() doesn't match smp_call_func_t. There's no actual bug here because the mis-matched function signatures are compatible at runtime. Nonetheless, eliminate the compiler warning by changing the function signature of mshv_vtl_synic_mask_vmbus_sint() to match what on_each_cpu() expects. Remove the cast because it is then no longer necessary. No functional change. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202601170352.qbh3EKH5-lkp@intel.com/ Signed-off-by: Michael Kelley Reviewed-by: Naman Jain Signed-off-by: Wei Liu --- drivers/hv/mshv_vtl_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c index 2cebe9de5a5a..7bbbce009732 100644 --- a/drivers/hv/mshv_vtl_main.c +++ b/drivers/hv/mshv_vtl_main.c @@ -845,9 +845,10 @@ static const struct file_operations mshv_vtl_fops = { .mmap = mshv_vtl_mmap, }; -static void mshv_vtl_synic_mask_vmbus_sint(const u8 *mask) +static void mshv_vtl_synic_mask_vmbus_sint(void *info) { union hv_synic_sint sint; + const u8 *mask = info; sint.as_uint64 = 0; sint.vector = HYPERVISOR_CALLBACK_VECTOR; @@ -999,7 +1000,7 @@ static int mshv_vtl_sint_ioctl_pause_msg_stream(struct mshv_sint_mask __user *ar if (copy_from_user(&mask, arg, sizeof(mask))) return -EFAULT; guard(mutex)(&vtl2_vmbus_sint_mask_mutex); - on_each_cpu((smp_call_func_t)mshv_vtl_synic_mask_vmbus_sint, &mask.mask, 1); + on_each_cpu(mshv_vtl_synic_mask_vmbus_sint, &mask.mask, 1); WRITE_ONCE(vtl_synic_mask_vmbus_sint_masked, mask.mask != 0); if (mask.mask) wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN); From 956efd32f9e5d258a82c086e5c18a4f5abdd10c4 Mon Sep 17 00:00:00 2001 From: Mukesh R Date: Wed, 4 Feb 2026 06:11:38 +0000 Subject: [PATCH 03/34] x86/hyperv: fix a compiler warning in hv_crash.c Fix a compiler warning that status is defined but not used. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202512301641.FC6OAbGM-lkp@intel.com/ Signed-off-by: Mukesh R Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_crash.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/hyperv/hv_crash.c b/arch/x86/hyperv/hv_crash.c index c0e22921ace1..3f99894a2942 100644 --- a/arch/x86/hyperv/hv_crash.c +++ b/arch/x86/hyperv/hv_crash.c @@ -279,7 +279,6 @@ static void hv_notify_prepare_hyp(void) static noinline __noclone void crash_nmi_callback(struct pt_regs *regs) { struct hv_input_disable_hyp_ex *input; - u64 status; int msecs = 1000, ccpu = smp_processor_id(); if (ccpu == 0) { @@ -313,7 +312,7 @@ static noinline __noclone void crash_nmi_callback(struct pt_regs *regs) input->rip = trampoline_pa; input->arg = devirt_arg; - status = hv_do_hypercall(HVCALL_DISABLE_HYP_EX, input, NULL); + (void)hv_do_hypercall(HVCALL_DISABLE_HYP_EX, input, NULL); hv_panic_timeout_reboot(); } From 8d1294d4b54464a6b826709c871878c80e3c219e Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Thu, 29 Jan 2026 07:51:54 -0800 Subject: [PATCH 04/34] mshv: Use EPOLLIN and EPOLLHUP instead of POLLIN and POLLHUP mshv code currently uses the POLLIN and POLLHUP flags. Starting with commit a9a08845e9acb ("vfs: do bulk POLL* -> EPOLL* replacement") the intent is to use the EPOLL* versions throughout the kernel. The comment at the top of mshv_eventfd.c describes it as being inspired by the KVM implementation, which was changed by the above mentioned commit in 2018 to use EPOLL*. mshv_eventfd.c is much newer than 2018 and there's no statement as to why it must use the POLL* versions. So change it to use the EPOLL* versions. This change also resolves a 'sparse' warning. No functional change, and the generated code is the same. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202601220948.MUTO60W4-lkp@intel.com/ Signed-off-by: Michael Kelley Reviewed-by: Stanislav Kinsburskii Signed-off-by: Wei Liu --- drivers/hv/mshv_eventfd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c index 0b75ff1edb73..dfc8b1092c02 100644 --- a/drivers/hv/mshv_eventfd.c +++ b/drivers/hv/mshv_eventfd.c @@ -295,13 +295,13 @@ static int mshv_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, { struct mshv_irqfd *irqfd = container_of(wait, struct mshv_irqfd, irqfd_wait); - unsigned long flags = (unsigned long)key; + __poll_t flags = key_to_poll(key); int idx; unsigned int seq; struct mshv_partition *pt = irqfd->irqfd_partn; int ret = 0; - if (flags & POLLIN) { + if (flags & EPOLLIN) { u64 cnt; eventfd_ctx_do_read(irqfd->irqfd_eventfd_ctx, &cnt); @@ -320,7 +320,7 @@ static int mshv_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, ret = 1; } - if (flags & POLLHUP) { + if (flags & EPOLLHUP) { /* The eventfd is closing, detach from the partition */ unsigned long flags; @@ -506,7 +506,7 @@ static int mshv_irqfd_assign(struct mshv_partition *pt, */ events = vfs_poll(fd_file(f), &irqfd->irqfd_polltbl); - if (events & POLLIN) + if (events & EPOLLIN) mshv_assert_irq_slow(irqfd); srcu_read_unlock(&pt->pt_irq_srcu, idx); From 7538b80e5a4b473b73428d13b3a47ceaad9a8a7c Mon Sep 17 00:00:00 2001 From: Purna Pavan Chandra Aekkaladevi Date: Wed, 28 Jan 2026 10:11:40 -0800 Subject: [PATCH 05/34] mshv: Ignore second stats page map result failure Older versions of the hypervisor do not have a concept of separate SELF and PARENT stats areas. In this case, mapping the HV_STATS_AREA_SELF page is sufficient - it's the only page and it contains all available stats. Mapping HV_STATS_AREA_PARENT returns HV_STATUS_INVALID_PARAMETER which currently causes module init to fail on older hypervisor versions. 
Detect this case and gracefully fall back to populating stats_pages[HV_STATS_AREA_PARENT] with the already-mapped SELF page. Add comments to clarify the behavior, including a clarification of why this isn't needed for hv_call_map_stats_page2() which always supports PARENT and SELF areas. Signed-off-by: Purna Pavan Chandra Aekkaladevi Signed-off-by: Nuno Das Neves Reviewed-by: Stanislav Kinsburskii Acked-by: Stanislav Kinsburskii Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/mshv_root_hv_call.c | 52 +++++++++++++++++++++++++++++++--- drivers/hv/mshv_root_main.c | 3 ++ 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c index 598eaff4ff29..1f93b94d7580 100644 --- a/drivers/hv/mshv_root_hv_call.c +++ b/drivers/hv/mshv_root_hv_call.c @@ -813,6 +813,13 @@ hv_call_notify_port_ring_empty(u32 sint_index) return hv_result_to_errno(status); } +/* + * Equivalent of hv_call_map_stats_page() for cases when the caller provides + * the map location. + * + * NOTE: This is a newer hypercall that always supports SELF and PARENT stats + * areas, unlike hv_call_map_stats_page(). + */ static int hv_call_map_stats_page2(enum hv_stats_object_type type, const union hv_stats_object_identity *identity, u64 map_location) @@ -855,6 +862,34 @@ static int hv_call_map_stats_page2(enum hv_stats_object_type type, return ret; } +static int +hv_stats_get_area_type(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity) +{ + switch (type) { + case HV_STATS_OBJECT_HYPERVISOR: + return identity->hv.stats_area_type; + case HV_STATS_OBJECT_LOGICAL_PROCESSOR: + return identity->lp.stats_area_type; + case HV_STATS_OBJECT_PARTITION: + return identity->partition.stats_area_type; + case HV_STATS_OBJECT_VP: + return identity->vp.stats_area_type; + } + + return -EINVAL; +} + +/* + * Map a stats page, where the page location is provided by the hypervisor. 
+ * + * NOTE: The concept of separate SELF and PARENT stats areas does not exist on + * older hypervisor versions. All the available stats information can be found + * on the SELF page. When attempting to map the PARENT area on a hypervisor + * that doesn't support it, return "success" but with a NULL address. The + * caller should check for this case and instead fallback to the SELF area + * alone. + */ static int hv_call_map_stats_page(enum hv_stats_object_type type, const union hv_stats_object_identity *identity, void **addr) @@ -863,7 +898,7 @@ static int hv_call_map_stats_page(enum hv_stats_object_type type, struct hv_input_map_stats_page *input; struct hv_output_map_stats_page *output; u64 status, pfn; - int ret = 0; + int hv_status, ret = 0; do { local_irq_save(flags); @@ -878,11 +913,20 @@ static int hv_call_map_stats_page(enum hv_stats_object_type type, pfn = output->map_location; local_irq_restore(flags); - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { - ret = hv_result_to_errno(status); + + hv_status = hv_result(status); + if (hv_status != HV_STATUS_INSUFFICIENT_MEMORY) { if (hv_result_success(status)) break; - return ret; + + if (hv_stats_get_area_type(type, identity) == HV_STATS_AREA_PARENT && + hv_status == HV_STATUS_INVALID_PARAMETER) { + *addr = NULL; + return 0; + } + + hv_status_debug(status, "\n"); + return hv_result_to_errno(status); } ret = hv_call_deposit_pages(NUMA_NO_NODE, diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index 681b58154d5e..d3e8a66443ad 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -993,6 +993,9 @@ static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, if (err) goto unmap_self; + if (!stats_pages[HV_STATS_AREA_PARENT]) + stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF]; + return 0; unmap_self: From 2de4516aa8f726946eadb9831c610b7aeb5bc682 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsburskii Date: Wed, 28 Jan 2026 10:11:41 -0800 
Subject: [PATCH 06/34] mshv: Use typed hv_stats_page pointers Refactor all relevant functions to use struct hv_stats_page pointers instead of void pointers for stats page mapping and unmapping thus improving type safety and code clarity across the Hyper-V stats mapping APIs. Signed-off-by: Stanislav Kinsburskii Signed-off-by: Nuno Das Neves Acked-by: Stanislav Kinsburskii Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/mshv_root.h | 5 +++-- drivers/hv/mshv_root_hv_call.c | 12 +++++++----- drivers/hv/mshv_root_main.c | 8 ++++---- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h index 3c1d88b36741..05ba1f716f9e 100644 --- a/drivers/hv/mshv_root.h +++ b/drivers/hv/mshv_root.h @@ -307,8 +307,9 @@ int hv_call_disconnect_port(u64 connection_partition_id, int hv_call_notify_port_ring_empty(u32 sint_index); int hv_map_stats_page(enum hv_stats_object_type type, const union hv_stats_object_identity *identity, - void **addr); -int hv_unmap_stats_page(enum hv_stats_object_type type, void *page_addr, + struct hv_stats_page **addr); +int hv_unmap_stats_page(enum hv_stats_object_type type, + struct hv_stats_page *page_addr, const union hv_stats_object_identity *identity); int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages, u64 page_struct_count, u32 host_access, diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c index 1f93b94d7580..daee036e48bc 100644 --- a/drivers/hv/mshv_root_hv_call.c +++ b/drivers/hv/mshv_root_hv_call.c @@ -890,9 +890,10 @@ hv_stats_get_area_type(enum hv_stats_object_type type, * caller should check for this case and instead fallback to the SELF area * alone. 
*/ -static int hv_call_map_stats_page(enum hv_stats_object_type type, - const union hv_stats_object_identity *identity, - void **addr) +static int +hv_call_map_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + struct hv_stats_page **addr) { unsigned long flags; struct hv_input_map_stats_page *input; @@ -942,7 +943,7 @@ static int hv_call_map_stats_page(enum hv_stats_object_type type, int hv_map_stats_page(enum hv_stats_object_type type, const union hv_stats_object_identity *identity, - void **addr) + struct hv_stats_page **addr) { int ret; struct page *allocated_page = NULL; @@ -990,7 +991,8 @@ static int hv_call_unmap_stats_page(enum hv_stats_object_type type, return hv_result_to_errno(status); } -int hv_unmap_stats_page(enum hv_stats_object_type type, void *page_addr, +int hv_unmap_stats_page(enum hv_stats_object_type type, + struct hv_stats_page *page_addr, const union hv_stats_object_identity *identity) { int ret; diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index d3e8a66443ad..f2cd48101d2b 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -958,7 +958,7 @@ mshv_vp_release(struct inode *inode, struct file *filp) } static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index, - void *stats_pages[]) + struct hv_stats_page *stats_pages[]) { union hv_stats_object_identity identity = { .vp.partition_id = partition_id, @@ -973,7 +973,7 @@ static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index, } static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, - void *stats_pages[]) + struct hv_stats_page *stats_pages[]) { union hv_stats_object_identity identity = { .vp.partition_id = partition_id, @@ -1011,7 +1011,7 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition, struct mshv_create_vp args; struct mshv_vp *vp; struct page *intercept_msg_page, *register_page, *ghcb_page; - void *stats_pages[2]; + struct hv_stats_page *stats_pages[2]; long ret; if 
(copy_from_user(&args, arg, sizeof(args))) @@ -1730,7 +1730,7 @@ static void destroy_partition(struct mshv_partition *partition) if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) mshv_vp_stats_unmap(partition->pt_id, vp->vp_index, - (void **)vp->vp_stats_pages); + vp->vp_stats_pages); if (vp->vp_register_page) { (void)hv_unmap_vp_state_page(partition->pt_id, From 1ba923d8cd837ec6ee33525f60f84daaaa26d4e9 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsburskii Date: Wed, 28 Jan 2026 10:11:42 -0800 Subject: [PATCH 07/34] mshv: Improve mshv_vp_stats_map/unmap(), add them to mshv_root.h These functions are currently only used to map child partition VP stats, on root partition. However, they will soon be used on L1VH, and also used for mapping the host's own VP stats. Introduce a helper is_l1vh_parent() to determine whether we are mapping our own VP stats. In this case, do not attempt to map the PARENT area. Note this is a different case than mapping PARENT on an older hypervisor where it is not available at all, so must be handled separately. On unmap, pass the stats pages since on L1VH the kernel allocates them and they must be freed in hv_unmap_stats_page(). 
Signed-off-by: Stanislav Kinsburskii Signed-off-by: Nuno Das Neves Acked-by: Stanislav Kinsburskii Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/mshv_root.h | 10 +++++++ drivers/hv/mshv_root_main.c | 59 +++++++++++++++++++++++++++---------- 2 files changed, 53 insertions(+), 16 deletions(-) diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h index 05ba1f716f9e..e4912b0618fa 100644 --- a/drivers/hv/mshv_root.h +++ b/drivers/hv/mshv_root.h @@ -254,6 +254,16 @@ struct mshv_partition *mshv_partition_get(struct mshv_partition *partition); void mshv_partition_put(struct mshv_partition *partition); struct mshv_partition *mshv_partition_find(u64 partition_id) __must_hold(RCU); +static inline bool is_l1vh_parent(u64 partition_id) +{ + return hv_l1vh_partition() && (partition_id == HV_PARTITION_ID_SELF); +} + +int mshv_vp_stats_map(u64 partition_id, u32 vp_index, + struct hv_stats_page **stats_pages); +void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index, + struct hv_stats_page **stats_pages); + /* hypercalls */ int hv_call_withdraw_memory(u64 count, int node, u64 partition_id); diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index f2cd48101d2b..781e49721539 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -957,23 +957,36 @@ mshv_vp_release(struct inode *inode, struct file *filp) return 0; } -static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index, - struct hv_stats_page *stats_pages[]) +void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index, + struct hv_stats_page *stats_pages[]) { union hv_stats_object_identity identity = { .vp.partition_id = partition_id, .vp.vp_index = vp_index, }; + int err; identity.vp.stats_area_type = HV_STATS_AREA_SELF; - hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity); + err = hv_unmap_stats_page(HV_STATS_OBJECT_VP, + stats_pages[HV_STATS_AREA_SELF], + &identity); + if (err) + pr_err("%s: failed to unmap partition %llu vp %u self stats, err: %d\n", 
+ __func__, partition_id, vp_index, err); - identity.vp.stats_area_type = HV_STATS_AREA_PARENT; - hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity); + if (stats_pages[HV_STATS_AREA_PARENT] != stats_pages[HV_STATS_AREA_SELF]) { + identity.vp.stats_area_type = HV_STATS_AREA_PARENT; + err = hv_unmap_stats_page(HV_STATS_OBJECT_VP, + stats_pages[HV_STATS_AREA_PARENT], + &identity); + if (err) + pr_err("%s: failed to unmap partition %llu vp %u parent stats, err: %d\n", + __func__, partition_id, vp_index, err); + } } -static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, - struct hv_stats_page *stats_pages[]) +int mshv_vp_stats_map(u64 partition_id, u32 vp_index, + struct hv_stats_page *stats_pages[]) { union hv_stats_object_identity identity = { .vp.partition_id = partition_id, @@ -984,23 +997,37 @@ static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, identity.vp.stats_area_type = HV_STATS_AREA_SELF; err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity, &stats_pages[HV_STATS_AREA_SELF]); - if (err) + if (err) { + pr_err("%s: failed to map partition %llu vp %u self stats, err: %d\n", + __func__, partition_id, vp_index, err); return err; + } - identity.vp.stats_area_type = HV_STATS_AREA_PARENT; - err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity, - &stats_pages[HV_STATS_AREA_PARENT]); - if (err) - goto unmap_self; - - if (!stats_pages[HV_STATS_AREA_PARENT]) + /* + * L1VH partition cannot access its vp stats in parent area. 
+ */ + if (is_l1vh_parent(partition_id)) { stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF]; + } else { + identity.vp.stats_area_type = HV_STATS_AREA_PARENT; + err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity, + &stats_pages[HV_STATS_AREA_PARENT]); + if (err) { + pr_err("%s: failed to map partition %llu vp %u parent stats, err: %d\n", + __func__, partition_id, vp_index, err); + goto unmap_self; + } + if (!stats_pages[HV_STATS_AREA_PARENT]) + stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF]; + } return 0; unmap_self: identity.vp.stats_area_type = HV_STATS_AREA_SELF; - hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity); + hv_unmap_stats_page(HV_STATS_OBJECT_VP, + stats_pages[HV_STATS_AREA_SELF], + &identity); return err; } From c527c7aee28f266423afff872df7bff4fad3e084 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsburskii Date: Wed, 28 Jan 2026 10:11:43 -0800 Subject: [PATCH 08/34] mshv: Always map child vp stats pages regardless of scheduler type Currently vp->vp_stats_pages is only used by the root scheduler for fast interrupt injection. Soon, vp_stats_pages will also be needed for exposing child VP stats to userspace via debugfs. Mapping the pages a second time to a different address causes an error on L1VH. Remove the scheduler requirement and always map the vp stats pages. 
Signed-off-by: Stanislav Kinsburskii Signed-off-by: Nuno Das Neves Acked-by: Stanislav Kinsburskii Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/mshv_root_main.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index 781e49721539..e679cebadfd0 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -1078,16 +1078,10 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition, goto unmap_register_page; } - /* - * This mapping of the stats page is for detecting if dispatch thread - * is blocked - only relevant for root scheduler - */ - if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) { - ret = mshv_vp_stats_map(partition->pt_id, args.vp_index, - stats_pages); - if (ret) - goto unmap_ghcb_page; - } + ret = mshv_vp_stats_map(partition->pt_id, args.vp_index, + stats_pages); + if (ret) + goto unmap_ghcb_page; vp = kzalloc(sizeof(*vp), GFP_KERNEL); if (!vp) @@ -1111,8 +1105,7 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition, if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) vp->vp_ghcb_page = page_to_virt(ghcb_page); - if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) - memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages)); + memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages)); /* * Keep anon_inode_getfd last: it installs fd in the file struct and @@ -1134,8 +1127,7 @@ put_partition: free_vp: kfree(vp); unmap_stats_pages: - if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) - mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages); + mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages); unmap_ghcb_page: if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) hv_unmap_vp_state_page(partition->pt_id, args.vp_index, @@ -1755,9 +1747,8 @@ static void destroy_partition(struct mshv_partition *partition) if (!vp) continue; - if 
(hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) - mshv_vp_stats_unmap(partition->pt_id, vp->vp_index, - vp->vp_stats_pages); + mshv_vp_stats_unmap(partition->pt_id, vp->vp_index, + vp->vp_stats_pages); if (vp->vp_register_page) { (void)hv_unmap_vp_state_page(partition->pt_id, From df40f32c87be64c96ee974573968592e147a4ded Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Wed, 28 Jan 2026 10:11:44 -0800 Subject: [PATCH 09/34] mshv: Update hv_stats_page definitions hv_stats_page belongs in hvhdk.h, move it there. It does not require a union to access the data for different counters, just use a single u64 array for simplicity and to match the Windows definitions. While at it, correct the ARM64 value for VpRootDispatchThreadBlocked. Signed-off-by: Nuno Das Neves Acked-by: Stanislav Kinsburskii Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/mshv_root_main.c | 27 ++++++++------------------- include/hyperv/hvhdk.h | 7 +++++++ 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index e679cebadfd0..8803cc71a542 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -39,22 +39,12 @@ MODULE_AUTHOR("Microsoft"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv"); -/* TODO move this to another file when debugfs code is added */ -enum hv_stats_vp_counters { /* HV_THREAD_COUNTER */ -#if defined(CONFIG_X86) - VpRootDispatchThreadBlocked = 202, +/* HV_THREAD_COUNTER */ +#if defined(CONFIG_X86_64) +#define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 202 #elif defined(CONFIG_ARM64) - VpRootDispatchThreadBlocked = 94, +#define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 95 #endif - VpStatsMaxCounter -}; - -struct hv_stats_page { - union { - u64 vp_cntrs[VpStatsMaxCounter]; /* VP counters */ - u8 data[HV_HYP_PAGE_SIZE]; - }; -} __packed; struct mshv_root mshv_root; @@ -485,12 +475,11 @@ static u64 
mshv_vp_interrupt_pending(struct mshv_vp *vp) static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp) { struct hv_stats_page **stats = vp->vp_stats_pages; - u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->vp_cntrs; - u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->vp_cntrs; + u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->data; + u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->data; - if (self_vp_cntrs[VpRootDispatchThreadBlocked]) - return self_vp_cntrs[VpRootDispatchThreadBlocked]; - return parent_vp_cntrs[VpRootDispatchThreadBlocked]; + return parent_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED] || + self_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED]; } static int diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h index 08965970c17d..79d1f16a850a 100644 --- a/include/hyperv/hvhdk.h +++ b/include/hyperv/hvhdk.h @@ -10,6 +10,13 @@ #include "hvhdk_mini.h" #include "hvgdk.h" +/* + * Hypervisor statistics page format + */ +struct hv_stats_page { + u64 data[HV_HYP_PAGE_SIZE / sizeof(u64)]; +} __packed; + /* Bits for dirty mask of hv_vp_register_page */ #define HV_X64_REGISTER_CLASS_GENERAL 0 #define HV_X64_REGISTER_CLASS_IP 1 From c23271b636db45156933d0c55c49109766f12f5a Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Wed, 28 Jan 2026 10:11:45 -0800 Subject: [PATCH 10/34] mshv: Add data for printing stats page counters Introduce mshv_debugfs_counters.c, containing static data corresponding to HV_*_COUNTER enums in the hypervisor source. Defining the enum members as an array instead makes more sense, since it will be iterated over to print counter information to debugfs. Include hypervisor, logical processor, partition, and virtual processor counters. 
Signed-off-by: Nuno Das Neves Acked-by: Stanislav Kinsburskii Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/mshv_debugfs_counters.c | 490 +++++++++++++++++++++++++++++ 1 file changed, 490 insertions(+) create mode 100644 drivers/hv/mshv_debugfs_counters.c diff --git a/drivers/hv/mshv_debugfs_counters.c b/drivers/hv/mshv_debugfs_counters.c new file mode 100644 index 000000000000..978536ba691f --- /dev/null +++ b/drivers/hv/mshv_debugfs_counters.c @@ -0,0 +1,490 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2026, Microsoft Corporation. + * + * Data for printing stats page counters via debugfs. + * + * Authors: Microsoft Linux virtualization team + */ + +/* + * For simplicity, this file is included directly in mshv_debugfs.c. + * If these are ever needed elsewhere they should be compiled separately. + * Ensure this file is not used twice by accident. + */ +#ifndef MSHV_DEBUGFS_C +#error "This file should only be included in mshv_debugfs.c" +#endif + +/* HV_HYPERVISOR_COUNTER */ +static char *hv_hypervisor_counters[] = { + [1] = "HvLogicalProcessors", + [2] = "HvPartitions", + [3] = "HvTotalPages", + [4] = "HvVirtualProcessors", + [5] = "HvMonitoredNotifications", + [6] = "HvModernStandbyEntries", + [7] = "HvPlatformIdleTransitions", + [8] = "HvHypervisorStartupCost", + + [10] = "HvIOSpacePages", + [11] = "HvNonEssentialPagesForDump", + [12] = "HvSubsumedPages", +}; + +/* HV_CPU_COUNTER */ +static char *hv_lp_counters[] = { + [1] = "LpGlobalTime", + [2] = "LpTotalRunTime", + [3] = "LpHypervisorRunTime", + [4] = "LpHardwareInterrupts", + [5] = "LpContextSwitches", + [6] = "LpInterProcessorInterrupts", + [7] = "LpSchedulerInterrupts", + [8] = "LpTimerInterrupts", + [9] = "LpInterProcessorInterruptsSent", + [10] = "LpProcessorHalts", + [11] = "LpMonitorTransitionCost", + [12] = "LpContextSwitchTime", + [13] = "LpC1TransitionsCount", + [14] = "LpC1RunTime", + [15] = "LpC2TransitionsCount", + [16] = "LpC2RunTime", + [17] = 
"LpC3TransitionsCount", + [18] = "LpC3RunTime", + [19] = "LpRootVpIndex", + [20] = "LpIdleSequenceNumber", + [21] = "LpGlobalTscCount", + [22] = "LpActiveTscCount", + [23] = "LpIdleAccumulation", + [24] = "LpReferenceCycleCount0", + [25] = "LpActualCycleCount0", + [26] = "LpReferenceCycleCount1", + [27] = "LpActualCycleCount1", + [28] = "LpProximityDomainId", + [29] = "LpPostedInterruptNotifications", + [30] = "LpBranchPredictorFlushes", +#if IS_ENABLED(CONFIG_X86_64) + [31] = "LpL1DataCacheFlushes", + [32] = "LpImmediateL1DataCacheFlushes", + [33] = "LpMbFlushes", + [34] = "LpCounterRefreshSequenceNumber", + [35] = "LpCounterRefreshReferenceTime", + [36] = "LpIdleAccumulationSnapshot", + [37] = "LpActiveTscCountSnapshot", + [38] = "LpHwpRequestContextSwitches", + [39] = "LpPlaceholder1", + [40] = "LpPlaceholder2", + [41] = "LpPlaceholder3", + [42] = "LpPlaceholder4", + [43] = "LpPlaceholder5", + [44] = "LpPlaceholder6", + [45] = "LpPlaceholder7", + [46] = "LpPlaceholder8", + [47] = "LpPlaceholder9", + [48] = "LpSchLocalRunListSize", + [49] = "LpReserveGroupId", + [50] = "LpRunningPriority", + [51] = "LpPerfmonInterruptCount", +#elif IS_ENABLED(CONFIG_ARM64) + [31] = "LpCounterRefreshSequenceNumber", + [32] = "LpCounterRefreshReferenceTime", + [33] = "LpIdleAccumulationSnapshot", + [34] = "LpActiveTscCountSnapshot", + [35] = "LpHwpRequestContextSwitches", + [36] = "LpPlaceholder2", + [37] = "LpPlaceholder3", + [38] = "LpPlaceholder4", + [39] = "LpPlaceholder5", + [40] = "LpPlaceholder6", + [41] = "LpPlaceholder7", + [42] = "LpPlaceholder8", + [43] = "LpPlaceholder9", + [44] = "LpSchLocalRunListSize", + [45] = "LpReserveGroupId", + [46] = "LpRunningPriority", +#endif +}; + +/* HV_PROCESS_COUNTER */ +static char *hv_partition_counters[] = { + [1] = "PtVirtualProcessors", + + [3] = "PtTlbSize", + [4] = "PtAddressSpaces", + [5] = "PtDepositedPages", + [6] = "PtGpaPages", + [7] = "PtGpaSpaceModifications", + [8] = "PtVirtualTlbFlushEntires", + [9] = 
"PtRecommendedTlbSize", + [10] = "PtGpaPages4K", + [11] = "PtGpaPages2M", + [12] = "PtGpaPages1G", + [13] = "PtGpaPages512G", + [14] = "PtDevicePages4K", + [15] = "PtDevicePages2M", + [16] = "PtDevicePages1G", + [17] = "PtDevicePages512G", + [18] = "PtAttachedDevices", + [19] = "PtDeviceInterruptMappings", + [20] = "PtIoTlbFlushes", + [21] = "PtIoTlbFlushCost", + [22] = "PtDeviceInterruptErrors", + [23] = "PtDeviceDmaErrors", + [24] = "PtDeviceInterruptThrottleEvents", + [25] = "PtSkippedTimerTicks", + [26] = "PtPartitionId", +#if IS_ENABLED(CONFIG_X86_64) + [27] = "PtNestedTlbSize", + [28] = "PtRecommendedNestedTlbSize", + [29] = "PtNestedTlbFreeListSize", + [30] = "PtNestedTlbTrimmedPages", + [31] = "PtPagesShattered", + [32] = "PtPagesRecombined", + [33] = "PtHwpRequestValue", + [34] = "PtAutoSuspendEnableTime", + [35] = "PtAutoSuspendTriggerTime", + [36] = "PtAutoSuspendDisableTime", + [37] = "PtPlaceholder1", + [38] = "PtPlaceholder2", + [39] = "PtPlaceholder3", + [40] = "PtPlaceholder4", + [41] = "PtPlaceholder5", + [42] = "PtPlaceholder6", + [43] = "PtPlaceholder7", + [44] = "PtPlaceholder8", + [45] = "PtHypervisorStateTransferGeneration", + [46] = "PtNumberofActiveChildPartitions", +#elif IS_ENABLED(CONFIG_ARM64) + [27] = "PtHwpRequestValue", + [28] = "PtAutoSuspendEnableTime", + [29] = "PtAutoSuspendTriggerTime", + [30] = "PtAutoSuspendDisableTime", + [31] = "PtPlaceholder1", + [32] = "PtPlaceholder2", + [33] = "PtPlaceholder3", + [34] = "PtPlaceholder4", + [35] = "PtPlaceholder5", + [36] = "PtPlaceholder6", + [37] = "PtPlaceholder7", + [38] = "PtPlaceholder8", + [39] = "PtHypervisorStateTransferGeneration", + [40] = "PtNumberofActiveChildPartitions", +#endif +}; + +/* HV_THREAD_COUNTER */ +static char *hv_vp_counters[] = { + [1] = "VpTotalRunTime", + [2] = "VpHypervisorRunTime", + [3] = "VpRemoteNodeRunTime", + [4] = "VpNormalizedRunTime", + [5] = "VpIdealCpu", + + [7] = "VpHypercallsCount", + [8] = "VpHypercallsTime", +#if IS_ENABLED(CONFIG_X86_64) + [9] 
= "VpPageInvalidationsCount", + [10] = "VpPageInvalidationsTime", + [11] = "VpControlRegisterAccessesCount", + [12] = "VpControlRegisterAccessesTime", + [13] = "VpIoInstructionsCount", + [14] = "VpIoInstructionsTime", + [15] = "VpHltInstructionsCount", + [16] = "VpHltInstructionsTime", + [17] = "VpMwaitInstructionsCount", + [18] = "VpMwaitInstructionsTime", + [19] = "VpCpuidInstructionsCount", + [20] = "VpCpuidInstructionsTime", + [21] = "VpMsrAccessesCount", + [22] = "VpMsrAccessesTime", + [23] = "VpOtherInterceptsCount", + [24] = "VpOtherInterceptsTime", + [25] = "VpExternalInterruptsCount", + [26] = "VpExternalInterruptsTime", + [27] = "VpPendingInterruptsCount", + [28] = "VpPendingInterruptsTime", + [29] = "VpEmulatedInstructionsCount", + [30] = "VpEmulatedInstructionsTime", + [31] = "VpDebugRegisterAccessesCount", + [32] = "VpDebugRegisterAccessesTime", + [33] = "VpPageFaultInterceptsCount", + [34] = "VpPageFaultInterceptsTime", + [35] = "VpGuestPageTableMaps", + [36] = "VpLargePageTlbFills", + [37] = "VpSmallPageTlbFills", + [38] = "VpReflectedGuestPageFaults", + [39] = "VpApicMmioAccesses", + [40] = "VpIoInterceptMessages", + [41] = "VpMemoryInterceptMessages", + [42] = "VpApicEoiAccesses", + [43] = "VpOtherMessages", + [44] = "VpPageTableAllocations", + [45] = "VpLogicalProcessorMigrations", + [46] = "VpAddressSpaceEvictions", + [47] = "VpAddressSpaceSwitches", + [48] = "VpAddressDomainFlushes", + [49] = "VpAddressSpaceFlushes", + [50] = "VpGlobalGvaRangeFlushes", + [51] = "VpLocalGvaRangeFlushes", + [52] = "VpPageTableEvictions", + [53] = "VpPageTableReclamations", + [54] = "VpPageTableResets", + [55] = "VpPageTableValidations", + [56] = "VpApicTprAccesses", + [57] = "VpPageTableWriteIntercepts", + [58] = "VpSyntheticInterrupts", + [59] = "VpVirtualInterrupts", + [60] = "VpApicIpisSent", + [61] = "VpApicSelfIpisSent", + [62] = "VpGpaSpaceHypercalls", + [63] = "VpLogicalProcessorHypercalls", + [64] = "VpLongSpinWaitHypercalls", + [65] = "VpOtherHypercalls", 
+ [66] = "VpSyntheticInterruptHypercalls", + [67] = "VpVirtualInterruptHypercalls", + [68] = "VpVirtualMmuHypercalls", + [69] = "VpVirtualProcessorHypercalls", + [70] = "VpHardwareInterrupts", + [71] = "VpNestedPageFaultInterceptsCount", + [72] = "VpNestedPageFaultInterceptsTime", + [73] = "VpPageScans", + [74] = "VpLogicalProcessorDispatches", + [75] = "VpWaitingForCpuTime", + [76] = "VpExtendedHypercalls", + [77] = "VpExtendedHypercallInterceptMessages", + [78] = "VpMbecNestedPageTableSwitches", + [79] = "VpOtherReflectedGuestExceptions", + [80] = "VpGlobalIoTlbFlushes", + [81] = "VpGlobalIoTlbFlushCost", + [82] = "VpLocalIoTlbFlushes", + [83] = "VpLocalIoTlbFlushCost", + [84] = "VpHypercallsForwardedCount", + [85] = "VpHypercallsForwardingTime", + [86] = "VpPageInvalidationsForwardedCount", + [87] = "VpPageInvalidationsForwardingTime", + [88] = "VpControlRegisterAccessesForwardedCount", + [89] = "VpControlRegisterAccessesForwardingTime", + [90] = "VpIoInstructionsForwardedCount", + [91] = "VpIoInstructionsForwardingTime", + [92] = "VpHltInstructionsForwardedCount", + [93] = "VpHltInstructionsForwardingTime", + [94] = "VpMwaitInstructionsForwardedCount", + [95] = "VpMwaitInstructionsForwardingTime", + [96] = "VpCpuidInstructionsForwardedCount", + [97] = "VpCpuidInstructionsForwardingTime", + [98] = "VpMsrAccessesForwardedCount", + [99] = "VpMsrAccessesForwardingTime", + [100] = "VpOtherInterceptsForwardedCount", + [101] = "VpOtherInterceptsForwardingTime", + [102] = "VpExternalInterruptsForwardedCount", + [103] = "VpExternalInterruptsForwardingTime", + [104] = "VpPendingInterruptsForwardedCount", + [105] = "VpPendingInterruptsForwardingTime", + [106] = "VpEmulatedInstructionsForwardedCount", + [107] = "VpEmulatedInstructionsForwardingTime", + [108] = "VpDebugRegisterAccessesForwardedCount", + [109] = "VpDebugRegisterAccessesForwardingTime", + [110] = "VpPageFaultInterceptsForwardedCount", + [111] = "VpPageFaultInterceptsForwardingTime", + [112] = 
"VpVmclearEmulationCount", + [113] = "VpVmclearEmulationTime", + [114] = "VpVmptrldEmulationCount", + [115] = "VpVmptrldEmulationTime", + [116] = "VpVmptrstEmulationCount", + [117] = "VpVmptrstEmulationTime", + [118] = "VpVmreadEmulationCount", + [119] = "VpVmreadEmulationTime", + [120] = "VpVmwriteEmulationCount", + [121] = "VpVmwriteEmulationTime", + [122] = "VpVmxoffEmulationCount", + [123] = "VpVmxoffEmulationTime", + [124] = "VpVmxonEmulationCount", + [125] = "VpVmxonEmulationTime", + [126] = "VpNestedVMEntriesCount", + [127] = "VpNestedVMEntriesTime", + [128] = "VpNestedSLATSoftPageFaultsCount", + [129] = "VpNestedSLATSoftPageFaultsTime", + [130] = "VpNestedSLATHardPageFaultsCount", + [131] = "VpNestedSLATHardPageFaultsTime", + [132] = "VpInvEptAllContextEmulationCount", + [133] = "VpInvEptAllContextEmulationTime", + [134] = "VpInvEptSingleContextEmulationCount", + [135] = "VpInvEptSingleContextEmulationTime", + [136] = "VpInvVpidAllContextEmulationCount", + [137] = "VpInvVpidAllContextEmulationTime", + [138] = "VpInvVpidSingleContextEmulationCount", + [139] = "VpInvVpidSingleContextEmulationTime", + [140] = "VpInvVpidSingleAddressEmulationCount", + [141] = "VpInvVpidSingleAddressEmulationTime", + [142] = "VpNestedTlbPageTableReclamations", + [143] = "VpNestedTlbPageTableEvictions", + [144] = "VpFlushGuestPhysicalAddressSpaceHypercalls", + [145] = "VpFlushGuestPhysicalAddressListHypercalls", + [146] = "VpPostedInterruptNotifications", + [147] = "VpPostedInterruptScans", + [148] = "VpTotalCoreRunTime", + [149] = "VpMaximumRunTime", + [150] = "VpHwpRequestContextSwitches", + [151] = "VpWaitingForCpuTimeBucket0", + [152] = "VpWaitingForCpuTimeBucket1", + [153] = "VpWaitingForCpuTimeBucket2", + [154] = "VpWaitingForCpuTimeBucket3", + [155] = "VpWaitingForCpuTimeBucket4", + [156] = "VpWaitingForCpuTimeBucket5", + [157] = "VpWaitingForCpuTimeBucket6", + [158] = "VpVmloadEmulationCount", + [159] = "VpVmloadEmulationTime", + [160] = "VpVmsaveEmulationCount", + [161] 
= "VpVmsaveEmulationTime", + [162] = "VpGifInstructionEmulationCount", + [163] = "VpGifInstructionEmulationTime", + [164] = "VpEmulatedErrataSvmInstructions", + [165] = "VpPlaceholder1", + [166] = "VpPlaceholder2", + [167] = "VpPlaceholder3", + [168] = "VpPlaceholder4", + [169] = "VpPlaceholder5", + [170] = "VpPlaceholder6", + [171] = "VpPlaceholder7", + [172] = "VpPlaceholder8", + [173] = "VpContentionTime", + [174] = "VpWakeUpTime", + [175] = "VpSchedulingPriority", + [176] = "VpRdpmcInstructionsCount", + [177] = "VpRdpmcInstructionsTime", + [178] = "VpPerfmonPmuMsrAccessesCount", + [179] = "VpPerfmonLbrMsrAccessesCount", + [180] = "VpPerfmonIptMsrAccessesCount", + [181] = "VpPerfmonInterruptCount", + [182] = "VpVtl1DispatchCount", + [183] = "VpVtl2DispatchCount", + [184] = "VpVtl2DispatchBucket0", + [185] = "VpVtl2DispatchBucket1", + [186] = "VpVtl2DispatchBucket2", + [187] = "VpVtl2DispatchBucket3", + [188] = "VpVtl2DispatchBucket4", + [189] = "VpVtl2DispatchBucket5", + [190] = "VpVtl2DispatchBucket6", + [191] = "VpVtl1RunTime", + [192] = "VpVtl2RunTime", + [193] = "VpIommuHypercalls", + [194] = "VpCpuGroupHypercalls", + [195] = "VpVsmHypercalls", + [196] = "VpEventLogHypercalls", + [197] = "VpDeviceDomainHypercalls", + [198] = "VpDepositHypercalls", + [199] = "VpSvmHypercalls", + [200] = "VpBusLockAcquisitionCount", + [201] = "VpLoadAvg", + [202] = "VpRootDispatchThreadBlocked", + [203] = "VpIdleCpuTime", + [204] = "VpWaitingForCpuTimeBucket7", + [205] = "VpWaitingForCpuTimeBucket8", + [206] = "VpWaitingForCpuTimeBucket9", + [207] = "VpWaitingForCpuTimeBucket10", + [208] = "VpWaitingForCpuTimeBucket11", + [209] = "VpWaitingForCpuTimeBucket12", + [210] = "VpHierarchicalSuspendTime", + [211] = "VpExpressSchedulingAttempts", + [212] = "VpExpressSchedulingCount", +#elif IS_ENABLED(CONFIG_ARM64) + [9] = "VpSysRegAccessesCount", + [10] = "VpSysRegAccessesTime", + [11] = "VpSmcInstructionsCount", + [12] = "VpSmcInstructionsTime", + [13] = "VpOtherInterceptsCount", + 
[14] = "VpOtherInterceptsTime", + [15] = "VpExternalInterruptsCount", + [16] = "VpExternalInterruptsTime", + [17] = "VpPendingInterruptsCount", + [18] = "VpPendingInterruptsTime", + [19] = "VpGuestPageTableMaps", + [20] = "VpLargePageTlbFills", + [21] = "VpSmallPageTlbFills", + [22] = "VpReflectedGuestPageFaults", + [23] = "VpMemoryInterceptMessages", + [24] = "VpOtherMessages", + [25] = "VpLogicalProcessorMigrations", + [26] = "VpAddressDomainFlushes", + [27] = "VpAddressSpaceFlushes", + [28] = "VpSyntheticInterrupts", + [29] = "VpVirtualInterrupts", + [30] = "VpApicSelfIpisSent", + [31] = "VpGpaSpaceHypercalls", + [32] = "VpLogicalProcessorHypercalls", + [33] = "VpLongSpinWaitHypercalls", + [34] = "VpOtherHypercalls", + [35] = "VpSyntheticInterruptHypercalls", + [36] = "VpVirtualInterruptHypercalls", + [37] = "VpVirtualMmuHypercalls", + [38] = "VpVirtualProcessorHypercalls", + [39] = "VpHardwareInterrupts", + [40] = "VpNestedPageFaultInterceptsCount", + [41] = "VpNestedPageFaultInterceptsTime", + [42] = "VpLogicalProcessorDispatches", + [43] = "VpWaitingForCpuTime", + [44] = "VpExtendedHypercalls", + [45] = "VpExtendedHypercallInterceptMessages", + [46] = "VpMbecNestedPageTableSwitches", + [47] = "VpOtherReflectedGuestExceptions", + [48] = "VpGlobalIoTlbFlushes", + [49] = "VpGlobalIoTlbFlushCost", + [50] = "VpLocalIoTlbFlushes", + [51] = "VpLocalIoTlbFlushCost", + [52] = "VpFlushGuestPhysicalAddressSpaceHypercalls", + [53] = "VpFlushGuestPhysicalAddressListHypercalls", + [54] = "VpPostedInterruptNotifications", + [55] = "VpPostedInterruptScans", + [56] = "VpTotalCoreRunTime", + [57] = "VpMaximumRunTime", + [58] = "VpWaitingForCpuTimeBucket0", + [59] = "VpWaitingForCpuTimeBucket1", + [60] = "VpWaitingForCpuTimeBucket2", + [61] = "VpWaitingForCpuTimeBucket3", + [62] = "VpWaitingForCpuTimeBucket4", + [63] = "VpWaitingForCpuTimeBucket5", + [64] = "VpWaitingForCpuTimeBucket6", + [65] = "VpHwpRequestContextSwitches", + [66] = "VpPlaceholder2", + [67] = 
"VpPlaceholder3", + [68] = "VpPlaceholder4", + [69] = "VpPlaceholder5", + [70] = "VpPlaceholder6", + [71] = "VpPlaceholder7", + [72] = "VpPlaceholder8", + [73] = "VpContentionTime", + [74] = "VpWakeUpTime", + [75] = "VpSchedulingPriority", + [76] = "VpVtl1DispatchCount", + [77] = "VpVtl2DispatchCount", + [78] = "VpVtl2DispatchBucket0", + [79] = "VpVtl2DispatchBucket1", + [80] = "VpVtl2DispatchBucket2", + [81] = "VpVtl2DispatchBucket3", + [82] = "VpVtl2DispatchBucket4", + [83] = "VpVtl2DispatchBucket5", + [84] = "VpVtl2DispatchBucket6", + [85] = "VpVtl1RunTime", + [86] = "VpVtl2RunTime", + [87] = "VpIommuHypercalls", + [88] = "VpCpuGroupHypercalls", + [89] = "VpVsmHypercalls", + [90] = "VpEventLogHypercalls", + [91] = "VpDeviceDomainHypercalls", + [92] = "VpDepositHypercalls", + [93] = "VpSvmHypercalls", + [94] = "VpLoadAvg", + [95] = "VpRootDispatchThreadBlocked", + [96] = "VpIdleCpuTime", + [97] = "VpWaitingForCpuTimeBucket7", + [98] = "VpWaitingForCpuTimeBucket8", + [99] = "VpWaitingForCpuTimeBucket9", + [100] = "VpWaitingForCpuTimeBucket10", + [101] = "VpWaitingForCpuTimeBucket11", + [102] = "VpWaitingForCpuTimeBucket12", + [103] = "VpHierarchicalSuspendTime", + [104] = "VpExpressSchedulingAttempts", + [105] = "VpExpressSchedulingCount", +#endif +}; From ff225ba9ad71c4c5f900b9aa1b757adafcfb449d Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Wed, 28 Jan 2026 10:11:46 -0800 Subject: [PATCH 11/34] mshv: Add debugfs to view hypervisor statistics Introduce a debugfs interface to expose root and child partition stats when running with mshv_root. Create a debugfs directory "mshv" containing 'stats' files organized by type and id. A stats file contains a number of counters depending on its type. e.g. 
an excerpt from a VP stats file: TotalRunTime : 1997602722 HypervisorRunTime : 649671371 RemoteNodeRunTime : 0 NormalizedRunTime : 1997602721 IdealCpu : 0 HypercallsCount : 1708169 HypercallsTime : 111914774 PageInvalidationsCount : 0 PageInvalidationsTime : 0 On a root partition with some active child partitions, the entire directory structure may look like: mshv/ stats # hypervisor stats lp/ # logical processors 0/ # LP id stats # LP 0 stats 1/ 2/ 3/ partition/ # partition stats 1/ # root partition id stats # root partition stats vp/ # root virtual processors 0/ # root VP id stats # root VP 0 stats 1/ 2/ 3/ 42/ # child partition id stats # child partition stats vp/ # child VPs 0/ # child VP id stats # child VP 0 stats 1/ 43/ 55/ On L1VH, some stats are not present as it does not own the hardware like the root partition does: - The hypervisor and lp stats are not present - L1VH's partition directory is named "self" because it can't get its own id - Some of L1VH's partition and VP stats fields are not populated, because it can't map its own HV_STATS_AREA_PARENT page. 
Co-developed-by: Stanislav Kinsburskii Signed-off-by: Stanislav Kinsburskii Co-developed-by: Praveen K Paladugu Signed-off-by: Praveen K Paladugu Co-developed-by: Mukesh Rathor Signed-off-by: Mukesh Rathor Co-developed-by: Purna Pavan Chandra Aekkaladevi Signed-off-by: Purna Pavan Chandra Aekkaladevi Co-developed-by: Jinank Jain Signed-off-by: Jinank Jain Signed-off-by: Nuno Das Neves Reviewed-by: Stanislav Kinsburskii Acked-by: Stanislav Kinsburskii Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/Makefile | 1 + drivers/hv/mshv_debugfs.c | 726 ++++++++++++++++++++++++++++++++++++ drivers/hv/mshv_root.h | 34 ++ drivers/hv/mshv_root_main.c | 26 +- 4 files changed, 785 insertions(+), 2 deletions(-) create mode 100644 drivers/hv/mshv_debugfs.c diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index a49f93c2d245..2593711c3628 100644 --- a/drivers/hv/Makefile +++ b/drivers/hv/Makefile @@ -15,6 +15,7 @@ hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \ mshv_root_hv_call.o mshv_portid_table.o mshv_regions.o +mshv_root-$(CONFIG_DEBUG_FS) += mshv_debugfs.o mshv_vtl-y := mshv_vtl_main.o # Code that must be built-in diff --git a/drivers/hv/mshv_debugfs.c b/drivers/hv/mshv_debugfs.c new file mode 100644 index 000000000000..ebf2549eb44d --- /dev/null +++ b/drivers/hv/mshv_debugfs.c @@ -0,0 +1,726 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2026, Microsoft Corporation. + * + * The /sys/kernel/debug/mshv directory contents. + * Contains various statistics data, provided by the hypervisor. 
+ * + * Authors: Microsoft Linux virtualization team + */ + +#include +#include +#include +#include + +#include "mshv.h" +#include "mshv_root.h" + +/* Ensure this file is not used elsewhere by accident */ +#define MSHV_DEBUGFS_C +#include "mshv_debugfs_counters.c" + +#define U32_BUF_SZ 11 +#define U64_BUF_SZ 21 +/* Only support SELF and PARENT areas */ +#define NUM_STATS_AREAS 2 +static_assert(HV_STATS_AREA_SELF == 0 && HV_STATS_AREA_PARENT == 1, + "SELF and PARENT areas must be usable as indices into an array of size NUM_STATS_AREAS"); +/* HV_HYPERVISOR_COUNTER */ +#define HV_HYPERVISOR_COUNTER_LOGICAL_PROCESSORS 1 + +static struct dentry *mshv_debugfs; +static struct dentry *mshv_debugfs_partition; +static struct dentry *mshv_debugfs_lp; +static struct dentry **parent_vp_stats; +static struct dentry *parent_partition_stats; + +static u64 mshv_lps_count; +static struct hv_stats_page **mshv_lps_stats; + +static int lp_stats_show(struct seq_file *m, void *v) +{ + const struct hv_stats_page *stats = m->private; + int idx; + + for (idx = 0; idx < ARRAY_SIZE(hv_lp_counters); idx++) { + char *name = hv_lp_counters[idx]; + + if (!name) + continue; + seq_printf(m, "%-32s: %llu\n", name, stats->data[idx]); + } + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(lp_stats); + +static void mshv_lp_stats_unmap(u32 lp_index) +{ + union hv_stats_object_identity identity = { + .lp.lp_index = lp_index, + .lp.stats_area_type = HV_STATS_AREA_SELF, + }; + int err; + + err = hv_unmap_stats_page(HV_STATS_OBJECT_LOGICAL_PROCESSOR, + mshv_lps_stats[lp_index], &identity); + if (err) + pr_err("%s: failed to unmap logical processor %u stats, err: %d\n", + __func__, lp_index, err); + + mshv_lps_stats[lp_index] = NULL; +} + +static struct hv_stats_page * __init mshv_lp_stats_map(u32 lp_index) +{ + union hv_stats_object_identity identity = { + .lp.lp_index = lp_index, + .lp.stats_area_type = HV_STATS_AREA_SELF, + }; + struct hv_stats_page *stats; + int err; + + err = 
hv_map_stats_page(HV_STATS_OBJECT_LOGICAL_PROCESSOR, &identity, + &stats); + if (err) { + pr_err("%s: failed to map logical processor %u stats, err: %d\n", + __func__, lp_index, err); + return ERR_PTR(err); + } + mshv_lps_stats[lp_index] = stats; + + return stats; +} + +static struct hv_stats_page * __init lp_debugfs_stats_create(u32 lp_index, + struct dentry *parent) +{ + struct dentry *dentry; + struct hv_stats_page *stats; + + stats = mshv_lp_stats_map(lp_index); + if (IS_ERR(stats)) + return stats; + + dentry = debugfs_create_file("stats", 0400, parent, + stats, &lp_stats_fops); + if (IS_ERR(dentry)) { + mshv_lp_stats_unmap(lp_index); + return ERR_CAST(dentry); + } + return stats; +} + +static int __init lp_debugfs_create(u32 lp_index, struct dentry *parent) +{ + struct dentry *idx; + char lp_idx_str[U32_BUF_SZ]; + struct hv_stats_page *stats; + int err; + + sprintf(lp_idx_str, "%u", lp_index); + + idx = debugfs_create_dir(lp_idx_str, parent); + if (IS_ERR(idx)) + return PTR_ERR(idx); + + stats = lp_debugfs_stats_create(lp_index, idx); + if (IS_ERR(stats)) { + err = PTR_ERR(stats); + goto remove_debugfs_lp_idx; + } + + return 0; + +remove_debugfs_lp_idx: + debugfs_remove_recursive(idx); + return err; +} + +static void mshv_debugfs_lp_remove(void) +{ + int lp_index; + + debugfs_remove_recursive(mshv_debugfs_lp); + + for (lp_index = 0; lp_index < mshv_lps_count; lp_index++) + mshv_lp_stats_unmap(lp_index); + + kfree(mshv_lps_stats); + mshv_lps_stats = NULL; +} + +static int __init mshv_debugfs_lp_create(struct dentry *parent) +{ + struct dentry *lp_dir; + int err, lp_index; + + mshv_lps_stats = kcalloc(mshv_lps_count, + sizeof(*mshv_lps_stats), + GFP_KERNEL_ACCOUNT); + + if (!mshv_lps_stats) + return -ENOMEM; + + lp_dir = debugfs_create_dir("lp", parent); + if (IS_ERR(lp_dir)) { + err = PTR_ERR(lp_dir); + goto free_lp_stats; + } + + for (lp_index = 0; lp_index < mshv_lps_count; lp_index++) { + err = lp_debugfs_create(lp_index, lp_dir); + if (err) + goto 
remove_debugfs_lps; + } + + mshv_debugfs_lp = lp_dir; + + return 0; + +remove_debugfs_lps: + for (lp_index -= 1; lp_index >= 0; lp_index--) + mshv_lp_stats_unmap(lp_index); + debugfs_remove_recursive(lp_dir); +free_lp_stats: + kfree(mshv_lps_stats); + mshv_lps_stats = NULL; + + return err; +} + +static int vp_stats_show(struct seq_file *m, void *v) +{ + const struct hv_stats_page **pstats = m->private; + u64 parent_val, self_val; + int idx; + + /* + * For VP and partition stats, there may be two stats areas mapped, + * SELF and PARENT. These refer to the privilege level of the data in + * each page. Some fields may be 0 in SELF and nonzero in PARENT, or + * vice versa. + * + * Hence, prioritize printing from the PARENT page (more privileged + * data), but use the value from the SELF page if the PARENT value is + * 0. + */ + + for (idx = 0; idx < ARRAY_SIZE(hv_vp_counters); idx++) { + char *name = hv_vp_counters[idx]; + + if (!name) + continue; + + parent_val = pstats[HV_STATS_AREA_PARENT]->data[idx]; + self_val = pstats[HV_STATS_AREA_SELF]->data[idx]; + seq_printf(m, "%-43s: %llu\n", name, + parent_val ? 
parent_val : self_val); + } + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(vp_stats); + +static void vp_debugfs_remove(struct dentry *vp_stats) +{ + debugfs_remove_recursive(vp_stats->d_parent); +} + +static int vp_debugfs_create(u64 partition_id, u32 vp_index, + struct hv_stats_page **pstats, + struct dentry **vp_stats_ptr, + struct dentry *parent) +{ + struct dentry *vp_idx_dir, *d; + char vp_idx_str[U32_BUF_SZ]; + int err; + + sprintf(vp_idx_str, "%u", vp_index); + + vp_idx_dir = debugfs_create_dir(vp_idx_str, parent); + if (IS_ERR(vp_idx_dir)) + return PTR_ERR(vp_idx_dir); + + d = debugfs_create_file("stats", 0400, vp_idx_dir, + pstats, &vp_stats_fops); + if (IS_ERR(d)) { + err = PTR_ERR(d); + goto remove_debugfs_vp_idx; + } + + *vp_stats_ptr = d; + + return 0; + +remove_debugfs_vp_idx: + debugfs_remove_recursive(vp_idx_dir); + return err; +} + +static int partition_stats_show(struct seq_file *m, void *v) +{ + const struct hv_stats_page **pstats = m->private; + u64 parent_val, self_val; + int idx; + + for (idx = 0; idx < ARRAY_SIZE(hv_partition_counters); idx++) { + char *name = hv_partition_counters[idx]; + + if (!name) + continue; + + parent_val = pstats[HV_STATS_AREA_PARENT]->data[idx]; + self_val = pstats[HV_STATS_AREA_SELF]->data[idx]; + seq_printf(m, "%-37s: %llu\n", name, + parent_val ? parent_val : self_val); + } + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(partition_stats); + +static void mshv_partition_stats_unmap(u64 partition_id, + struct hv_stats_page *stats_page, + enum hv_stats_area_type stats_area_type) +{ + union hv_stats_object_identity identity = { + .partition.partition_id = partition_id, + .partition.stats_area_type = stats_area_type, + }; + int err; + + err = hv_unmap_stats_page(HV_STATS_OBJECT_PARTITION, stats_page, + &identity); + if (err) + pr_err("%s: failed to unmap partition %lld %s stats, err: %d\n", + __func__, partition_id, + (stats_area_type == HV_STATS_AREA_SELF) ? 
"self" : "parent", + err); +} + +static struct hv_stats_page *mshv_partition_stats_map(u64 partition_id, + enum hv_stats_area_type stats_area_type) +{ + union hv_stats_object_identity identity = { + .partition.partition_id = partition_id, + .partition.stats_area_type = stats_area_type, + }; + struct hv_stats_page *stats; + int err; + + err = hv_map_stats_page(HV_STATS_OBJECT_PARTITION, &identity, &stats); + if (err) { + pr_err("%s: failed to map partition %lld %s stats, err: %d\n", + __func__, partition_id, + (stats_area_type == HV_STATS_AREA_SELF) ? "self" : "parent", + err); + return ERR_PTR(err); + } + return stats; +} + +static int mshv_debugfs_partition_stats_create(u64 partition_id, + struct dentry **partition_stats_ptr, + struct dentry *parent) +{ + struct dentry *dentry; + struct hv_stats_page **pstats; + int err; + + pstats = kcalloc(NUM_STATS_AREAS, sizeof(struct hv_stats_page *), + GFP_KERNEL_ACCOUNT); + if (!pstats) + return -ENOMEM; + + pstats[HV_STATS_AREA_SELF] = mshv_partition_stats_map(partition_id, + HV_STATS_AREA_SELF); + if (IS_ERR(pstats[HV_STATS_AREA_SELF])) { + err = PTR_ERR(pstats[HV_STATS_AREA_SELF]); + goto cleanup; + } + + /* + * L1VH partition cannot access its partition stats in parent area. 
+ */ + if (is_l1vh_parent(partition_id)) { + pstats[HV_STATS_AREA_PARENT] = pstats[HV_STATS_AREA_SELF]; + } else { + pstats[HV_STATS_AREA_PARENT] = mshv_partition_stats_map(partition_id, + HV_STATS_AREA_PARENT); + if (IS_ERR(pstats[HV_STATS_AREA_PARENT])) { + err = PTR_ERR(pstats[HV_STATS_AREA_PARENT]); + goto unmap_self; + } + if (!pstats[HV_STATS_AREA_PARENT]) + pstats[HV_STATS_AREA_PARENT] = pstats[HV_STATS_AREA_SELF]; + } + + dentry = debugfs_create_file("stats", 0400, parent, + pstats, &partition_stats_fops); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto unmap_partition_stats; + } + + *partition_stats_ptr = dentry; + return 0; + +unmap_partition_stats: + if (pstats[HV_STATS_AREA_PARENT] != pstats[HV_STATS_AREA_SELF]) + mshv_partition_stats_unmap(partition_id, pstats[HV_STATS_AREA_PARENT], + HV_STATS_AREA_PARENT); +unmap_self: + mshv_partition_stats_unmap(partition_id, pstats[HV_STATS_AREA_SELF], + HV_STATS_AREA_SELF); +cleanup: + kfree(pstats); + return err; +} + +static void partition_debugfs_remove(u64 partition_id, struct dentry *dentry) +{ + struct hv_stats_page **pstats = NULL; + + pstats = dentry->d_inode->i_private; + + debugfs_remove_recursive(dentry->d_parent); + + if (pstats[HV_STATS_AREA_PARENT] != pstats[HV_STATS_AREA_SELF]) { + mshv_partition_stats_unmap(partition_id, + pstats[HV_STATS_AREA_PARENT], + HV_STATS_AREA_PARENT); + } + + mshv_partition_stats_unmap(partition_id, + pstats[HV_STATS_AREA_SELF], + HV_STATS_AREA_SELF); + + kfree(pstats); +} + +static int partition_debugfs_create(u64 partition_id, + struct dentry **vp_dir_ptr, + struct dentry **partition_stats_ptr, + struct dentry *parent) +{ + char part_id_str[U64_BUF_SZ]; + struct dentry *part_id_dir, *vp_dir; + int err; + + if (is_l1vh_parent(partition_id)) + sprintf(part_id_str, "self"); + else + sprintf(part_id_str, "%llu", partition_id); + + part_id_dir = debugfs_create_dir(part_id_str, parent); + if (IS_ERR(part_id_dir)) + return PTR_ERR(part_id_dir); + + vp_dir = 
debugfs_create_dir("vp", part_id_dir); + if (IS_ERR(vp_dir)) { + err = PTR_ERR(vp_dir); + goto remove_debugfs_partition_id; + } + + err = mshv_debugfs_partition_stats_create(partition_id, + partition_stats_ptr, + part_id_dir); + if (err) + goto remove_debugfs_partition_id; + + *vp_dir_ptr = vp_dir; + + return 0; + +remove_debugfs_partition_id: + debugfs_remove_recursive(part_id_dir); + return err; +} + +static void parent_vp_debugfs_remove(u32 vp_index, + struct dentry *vp_stats_ptr) +{ + struct hv_stats_page **pstats; + + pstats = vp_stats_ptr->d_inode->i_private; + vp_debugfs_remove(vp_stats_ptr); + mshv_vp_stats_unmap(hv_current_partition_id, vp_index, pstats); + kfree(pstats); +} + +static void mshv_debugfs_parent_partition_remove(void) +{ + int idx; + + for_each_online_cpu(idx) + parent_vp_debugfs_remove(hv_vp_index[idx], + parent_vp_stats[idx]); + + partition_debugfs_remove(hv_current_partition_id, + parent_partition_stats); + kfree(parent_vp_stats); + parent_vp_stats = NULL; + parent_partition_stats = NULL; +} + +static int __init parent_vp_debugfs_create(u32 vp_index, + struct dentry **vp_stats_ptr, + struct dentry *parent) +{ + struct hv_stats_page **pstats; + int err; + + pstats = kcalloc(NUM_STATS_AREAS, sizeof(struct hv_stats_page *), + GFP_KERNEL_ACCOUNT); + if (!pstats) + return -ENOMEM; + + err = mshv_vp_stats_map(hv_current_partition_id, vp_index, pstats); + if (err) + goto cleanup; + + err = vp_debugfs_create(hv_current_partition_id, vp_index, pstats, + vp_stats_ptr, parent); + if (err) + goto unmap_vp_stats; + + return 0; + +unmap_vp_stats: + mshv_vp_stats_unmap(hv_current_partition_id, vp_index, pstats); +cleanup: + kfree(pstats); + return err; +} + +static int __init mshv_debugfs_parent_partition_create(void) +{ + struct dentry *vp_dir; + int err, idx, i; + + mshv_debugfs_partition = debugfs_create_dir("partition", + mshv_debugfs); + if (IS_ERR(mshv_debugfs_partition)) + return PTR_ERR(mshv_debugfs_partition); + + err = 
partition_debugfs_create(hv_current_partition_id, + &vp_dir, + &parent_partition_stats, + mshv_debugfs_partition); + if (err) + goto remove_debugfs_partition; + + parent_vp_stats = kcalloc(nr_cpu_ids, sizeof(*parent_vp_stats), + GFP_KERNEL); + if (!parent_vp_stats) { + err = -ENOMEM; + goto remove_partition_stats; + } + + for_each_online_cpu(idx) { + err = parent_vp_debugfs_create(hv_vp_index[idx], + &parent_vp_stats[idx], + vp_dir); + if (err) + goto remove_debugfs_partition_vp; + } + + return 0; + +remove_debugfs_partition_vp: + for_each_online_cpu(i) { + if (i >= idx) + break; + parent_vp_debugfs_remove(hv_vp_index[i], parent_vp_stats[i]); + } + kfree(parent_vp_stats); + parent_vp_stats = NULL; + /* Also reached when allocating parent_vp_stats fails: unmap partition stats */ +remove_partition_stats: + partition_debugfs_remove(hv_current_partition_id, + parent_partition_stats); + parent_partition_stats = NULL; +remove_debugfs_partition: + debugfs_remove_recursive(mshv_debugfs_partition); + mshv_debugfs_partition = NULL; + return err; +} + +static int hv_stats_show(struct seq_file *m, void *v) +{ + const struct hv_stats_page *stats = m->private; + int idx; + + for (idx = 0; idx < ARRAY_SIZE(hv_hypervisor_counters); idx++) { + char *name = hv_hypervisor_counters[idx]; + + if (!name) + continue; + seq_printf(m, "%-27s: %llu\n", name, stats->data[idx]); + } + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(hv_stats); + +static void mshv_hv_stats_unmap(void) +{ + union hv_stats_object_identity identity = { + .hv.stats_area_type = HV_STATS_AREA_SELF, + }; + int err; + + err = hv_unmap_stats_page(HV_STATS_OBJECT_HYPERVISOR, NULL, &identity); + if (err) + pr_err("%s: failed to unmap hypervisor stats: %d\n", + __func__, err); +} + +static void * __init mshv_hv_stats_map(void) +{ + union hv_stats_object_identity identity = { + .hv.stats_area_type = HV_STATS_AREA_SELF, + }; + struct hv_stats_page *stats; + int err; + + err = hv_map_stats_page(HV_STATS_OBJECT_HYPERVISOR, &identity, &stats); + if (err) { + pr_err("%s: failed to map hypervisor stats: %d\n", + __func__, err); + return ERR_PTR(err); 
+ } + return stats; +} + +static int __init mshv_debugfs_hv_stats_create(struct dentry *parent) +{ + struct dentry *dentry; + u64 *stats; + int err; + + stats = mshv_hv_stats_map(); + if (IS_ERR(stats)) + return PTR_ERR(stats); + + dentry = debugfs_create_file("stats", 0400, parent, + stats, &hv_stats_fops); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + pr_err("%s: failed to create hypervisor stats dentry: %d\n", + __func__, err); + goto unmap_hv_stats; + } + + mshv_lps_count = stats[HV_HYPERVISOR_COUNTER_LOGICAL_PROCESSORS]; + + return 0; + +unmap_hv_stats: + mshv_hv_stats_unmap(); + return err; +} + +int mshv_debugfs_vp_create(struct mshv_vp *vp) +{ + struct mshv_partition *p = vp->vp_partition; + + if (!mshv_debugfs) + return 0; + + return vp_debugfs_create(p->pt_id, vp->vp_index, + vp->vp_stats_pages, + &vp->vp_stats_dentry, + p->pt_vp_dentry); +} + +void mshv_debugfs_vp_remove(struct mshv_vp *vp) +{ + if (!mshv_debugfs) + return; + + vp_debugfs_remove(vp->vp_stats_dentry); +} + +int mshv_debugfs_partition_create(struct mshv_partition *partition) +{ + int err; + + if (!mshv_debugfs) + return 0; + + err = partition_debugfs_create(partition->pt_id, + &partition->pt_vp_dentry, + &partition->pt_stats_dentry, + mshv_debugfs_partition); + if (err) + return err; + + return 0; +} + +void mshv_debugfs_partition_remove(struct mshv_partition *partition) +{ + if (!mshv_debugfs) + return; + + partition_debugfs_remove(partition->pt_id, + partition->pt_stats_dentry); +} + +int __init mshv_debugfs_init(void) +{ + int err; + + mshv_debugfs = debugfs_create_dir("mshv", NULL); + if (IS_ERR(mshv_debugfs)) { + pr_err("%s: failed to create debugfs directory\n", __func__); + return PTR_ERR(mshv_debugfs); + } + + if (hv_root_partition()) { + err = mshv_debugfs_hv_stats_create(mshv_debugfs); + if (err) + goto remove_mshv_dir; + + err = mshv_debugfs_lp_create(mshv_debugfs); + if (err) + goto unmap_hv_stats; + } + + err = mshv_debugfs_parent_partition_create(); + if (err) + goto 
unmap_lp_stats; + + return 0; + +unmap_lp_stats: + if (hv_root_partition()) { + mshv_debugfs_lp_remove(); + mshv_debugfs_lp = NULL; + } +unmap_hv_stats: + if (hv_root_partition()) + mshv_hv_stats_unmap(); +remove_mshv_dir: + debugfs_remove_recursive(mshv_debugfs); + mshv_debugfs = NULL; + return err; +} + +void mshv_debugfs_exit(void) +{ + mshv_debugfs_parent_partition_remove(); + + if (hv_root_partition()) { + mshv_debugfs_lp_remove(); + mshv_debugfs_lp = NULL; + mshv_hv_stats_unmap(); + } + + debugfs_remove_recursive(mshv_debugfs); + mshv_debugfs = NULL; + mshv_debugfs_partition = NULL; +} diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h index e4912b0618fa..7332d9af8373 100644 --- a/drivers/hv/mshv_root.h +++ b/drivers/hv/mshv_root.h @@ -52,6 +52,9 @@ struct mshv_vp { unsigned int kicked_by_hv; wait_queue_head_t vp_suspend_queue; } run; +#if IS_ENABLED(CONFIG_DEBUG_FS) + struct dentry *vp_stats_dentry; +#endif }; #define vp_fmt(fmt) "p%lluvp%u: " fmt @@ -136,6 +139,10 @@ struct mshv_partition { u64 isolation_type; bool import_completed; bool pt_initialized; +#if IS_ENABLED(CONFIG_DEBUG_FS) + struct dentry *pt_stats_dentry; + struct dentry *pt_vp_dentry; +#endif }; #define pt_fmt(fmt) "p%llu: " fmt @@ -327,6 +334,33 @@ int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages, int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code, u64 arg, void *property_value, size_t property_value_sz); +#if IS_ENABLED(CONFIG_DEBUG_FS) +int __init mshv_debugfs_init(void); +void mshv_debugfs_exit(void); + +int mshv_debugfs_partition_create(struct mshv_partition *partition); +void mshv_debugfs_partition_remove(struct mshv_partition *partition); +int mshv_debugfs_vp_create(struct mshv_vp *vp); +void mshv_debugfs_vp_remove(struct mshv_vp *vp); +#else +static inline int __init mshv_debugfs_init(void) +{ + return 0; +} +static inline void mshv_debugfs_exit(void) { } + +static inline int mshv_debugfs_partition_create(struct mshv_partition 
*partition) +{ + return 0; +} +static inline void mshv_debugfs_partition_remove(struct mshv_partition *partition) { } +static inline int mshv_debugfs_vp_create(struct mshv_vp *vp) +{ + return 0; +} +static inline void mshv_debugfs_vp_remove(struct mshv_vp *vp) { } +#endif + extern struct mshv_root mshv_root; extern enum hv_scheduler_type hv_scheduler_type; extern u8 * __percpu *hv_synic_eventring_tail; diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index 8803cc71a542..c633014ceb96 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -1096,6 +1096,10 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition, memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages)); + ret = mshv_debugfs_vp_create(vp); + if (ret) + goto put_partition; + /* * Keep anon_inode_getfd last: it installs fd in the file struct and * thus makes the state accessible in user space. @@ -1103,7 +1107,7 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition, ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp, O_RDWR | O_CLOEXEC); if (ret < 0) - goto put_partition; + goto remove_debugfs_vp; /* already exclusive with the partition mutex for all ioctls */ partition->pt_vp_count++; @@ -1111,6 +1115,8 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition, return ret; +remove_debugfs_vp: + mshv_debugfs_vp_remove(vp); put_partition: mshv_partition_put(partition); free_vp: @@ -1553,10 +1559,16 @@ mshv_partition_ioctl_initialize(struct mshv_partition *partition) if (ret) goto withdraw_mem; + ret = mshv_debugfs_partition_create(partition); + if (ret) + goto finalize_partition; + partition->pt_initialized = true; return 0; +finalize_partition: + hv_call_finalize_partition(partition->pt_id); withdraw_mem: hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id); @@ -1736,6 +1748,7 @@ static void destroy_partition(struct mshv_partition *partition) if (!vp) continue; + mshv_debugfs_vp_remove(vp); 
mshv_vp_stats_unmap(partition->pt_id, vp->vp_index, vp->vp_stats_pages); @@ -1769,6 +1782,8 @@ static void destroy_partition(struct mshv_partition *partition) partition->pt_vp_array[i] = NULL; } + mshv_debugfs_partition_remove(partition); + /* Deallocates and unmaps everything including vcpus, GPA mappings etc */ hv_call_finalize_partition(partition->pt_id); @@ -2314,10 +2329,14 @@ static int __init mshv_parent_partition_init(void) mshv_init_vmm_caps(dev); - ret = mshv_irqfd_wq_init(); + ret = mshv_debugfs_init(); if (ret) goto exit_partition; + ret = mshv_irqfd_wq_init(); + if (ret) + goto exit_debugfs; + spin_lock_init(&mshv_root.pt_ht_lock); hash_init(mshv_root.pt_htable); @@ -2325,6 +2344,8 @@ static int __init mshv_parent_partition_init(void) return 0; +exit_debugfs: + mshv_debugfs_exit(); exit_partition: if (hv_root_partition()) mshv_root_partition_exit(); @@ -2341,6 +2362,7 @@ static void __exit mshv_parent_partition_exit(void) { hv_setup_mshv_handler(NULL); mshv_port_table_fini(); + mshv_debugfs_exit(); misc_deregister(&mshv_dev); mshv_irqfd_wq_cleanup(); if (hv_root_partition()) From 93f039fe7a775007d7602c34d51b570f4a382bd7 Mon Sep 17 00:00:00 2001 From: Ethan Nelson-Moore Date: Fri, 30 Jan 2026 18:00:17 -0800 Subject: [PATCH 12/34] PCI: hv: remove unnecessary module_init/exit functions The pci-hyperv-intf driver has unnecessary empty module_init and module_exit functions. Remove them. Note that if a module_init function exists, a module_exit function must also exist; otherwise, the module cannot be unloaded. 
Signed-off-by: Ethan Nelson-Moore Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv-intf.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv-intf.c b/drivers/pci/controller/pci-hyperv-intf.c index 28b3e93d31c0..18acbda867f0 100644 --- a/drivers/pci/controller/pci-hyperv-intf.c +++ b/drivers/pci/controller/pci-hyperv-intf.c @@ -52,17 +52,5 @@ int hyperv_reg_block_invalidate(struct pci_dev *dev, void *context, } EXPORT_SYMBOL_GPL(hyperv_reg_block_invalidate); -static void __exit exit_hv_pci_intf(void) -{ -} - -static int __init init_hv_pci_intf(void) -{ - return 0; -} - -module_init(init_hv_pci_intf); -module_exit(exit_hv_pci_intf); - MODULE_DESCRIPTION("Hyper-V PCI Interface"); MODULE_LICENSE("GPL v2"); From c3a6ae7ea2d3f507cbddb5818ccc65b9d84d6dc7 Mon Sep 17 00:00:00 2001 From: Mukesh R Date: Tue, 3 Feb 2026 17:58:00 -0800 Subject: [PATCH 13/34] x86/hyperv: Move hv crash init after hypercall pg setup hv_root_crash_init() is not setting up the hypervisor crash collection for baremetal cases because when it's called, hypervisor page is not setup. Fix is simple, just move the crash init call after the hypercall page setup. 
Signed-off-by: Mukesh Rathor Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_init.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 14de43f4bc6c..7f3301bd081e 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -558,7 +558,6 @@ void __init hyperv_init(void) memunmap(src); hv_remap_tsc_clocksource(); - hv_root_crash_init(); hv_sleep_notifiers_register(); } else { hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); @@ -567,6 +566,9 @@ void __init hyperv_init(void) hv_set_hypercall_pg(hv_hypercall_pg); + if (hv_root_partition()) /* after set hypercall pg */ + hv_root_crash_init(); + skip_hypercall_pg_init: /* * hyperv_init() is called before LAPIC is initialized: see From 0236e75df4d0802a23e3c8d794dbce329cd34a60 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Sun, 18 Jan 2026 19:34:35 -0800 Subject: [PATCH 14/34] Drivers: hv: Use memremap()/memunmap() instead of ioremap_cache()/iounmap() When running with a paravisor or in the root partition, the SynIC event and message pages are provided by the paravisor or hypervisor respectively, instead of being allocated by Linux. The provided pages are normal memory, but are outside of the physical address space seen by Linux. As such they cannot be accessed via the kernel's direct map, and must be explicitly mapped to a kernel virtual address. Current code uses ioremap_cache() and iounmap() to map and unmap the pages. These functions are for use on I/O address space that may not behave as normal memory, so they generate or expect addresses with the __iomem attribute. For normal memory, the preferred functions are memremap() and memunmap(), which operate similarly but without __iomem. At the time of the original work on CoCo VMs on Hyper-V, memremap() did not support creating a decrypted mapping, so ioremap_cache() was used instead, since I/O address space is always mapped decrypted. 
memremap() has since been enhanced to allow decrypted mappings, so replace ioremap_cache() with memremap() when mapping the event and message pages. Similarly, replace iounmap() with memunmap(). As a side benefit, the replacement cleans up 'sparse' warnings about __iomem mismatches. The replacement is done to use the correct functions as long-term goodness and to clean up the sparse warnings. No runtime bugs are fixed. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202601170445.JtZQwndW-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202512150359.fMdmbddk-lkp@intel.com/ Signed-off-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/hv.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index c100f04b3581..ea6835638505 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -287,11 +287,11 @@ void hv_hyp_synic_enable_regs(unsigned int cpu) simp.simp_enabled = 1; if (ms_hyperv.paravisor_present || hv_root_partition()) { - /* Mask out vTOM bit. ioremap_cache() maps decrypted */ + /* Mask out vTOM bit and map as decrypted */ u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) & ~ms_hyperv.shared_gpa_boundary; hv_cpu->hyp_synic_message_page = - (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); + memremap(base, HV_HYP_PAGE_SIZE, MEMREMAP_WB | MEMREMAP_DEC); if (!hv_cpu->hyp_synic_message_page) pr_err("Fail to map synic message page.\n"); } else { @@ -306,11 +306,11 @@ void hv_hyp_synic_enable_regs(unsigned int cpu) siefp.siefp_enabled = 1; if (ms_hyperv.paravisor_present || hv_root_partition()) { - /* Mask out vTOM bit. 
ioremap_cache() maps decrypted */ + /* Mask out vTOM bit and map as decrypted */ u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) & ~ms_hyperv.shared_gpa_boundary; hv_cpu->hyp_synic_event_page = - (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); + memremap(base, HV_HYP_PAGE_SIZE, MEMREMAP_WB | MEMREMAP_DEC); if (!hv_cpu->hyp_synic_event_page) pr_err("Fail to map synic event page.\n"); } else { @@ -429,7 +429,7 @@ void hv_hyp_synic_disable_regs(unsigned int cpu) simp.simp_enabled = 0; if (ms_hyperv.paravisor_present || hv_root_partition()) { if (hv_cpu->hyp_synic_message_page) { - iounmap(hv_cpu->hyp_synic_message_page); + memunmap(hv_cpu->hyp_synic_message_page); hv_cpu->hyp_synic_message_page = NULL; } } else { @@ -443,7 +443,7 @@ void hv_hyp_synic_disable_regs(unsigned int cpu) if (ms_hyperv.paravisor_present || hv_root_partition()) { if (hv_cpu->hyp_synic_event_page) { - iounmap(hv_cpu->hyp_synic_event_page); + memunmap(hv_cpu->hyp_synic_event_page); hv_cpu->hyp_synic_event_page = NULL; } } else { From 5ed8cbcaccc0e0ed90e03dac153d01d5b13f6724 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 19 Jan 2026 07:59:37 -0800 Subject: [PATCH 15/34] x86/hyperv: Use memremap()/memunmap() instead of ioremap_cache()/iounmap() When running with a paravisor and SEV-SNP, the GHCB page is provided by the paravisor instead of being allocated by Linux. The provided page is normal memory, but is outside of the physical address space seen by Linux. As such it cannot be accessed via the kernel's direct map, and must be explicitly mapped to a kernel virtual address. Current code uses ioremap_cache() and iounmap() to map and unmap the page. These functions are for use on I/O address space that may not behave as normal memory, so they generate or expect addresses with the __iomem attribute. For normal memory, the preferred functions are memremap() and memunmap(), which operate similarly but without __iomem. 
At the time of the original work on CoCo VMs on Hyper-V, memremap() did not support creating a decrypted mapping, so ioremap_cache() was used instead, since I/O address space is always mapped decrypted. memremap() has since been enhanced to allow decrypted mappings, so replace ioremap_cache() with memremap() when mapping the GHCB page. Similarly, replace iounmap() with memunmap(). As a side benefit, the replacement cleans up 'sparse' warnings about __iomem mismatches. The replacement is done to use the correct functions as long-term goodness and to clean up the sparse warnings. No runtime bugs are fixed. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311111925.iPGGJik4-lkp@intel.com/ Signed-off-by: Michael Kelley Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 7f3301bd081e..6b00842ddda8 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -103,9 +103,9 @@ static int hyperv_init_ghcb(void) */ rdmsrq(MSR_AMD64_SEV_ES_GHCB, ghcb_gpa); - /* Mask out vTOM bit. 
ioremap_cache() maps decrypted */ + /* Mask out vTOM bit and map as decrypted */ ghcb_gpa &= ~ms_hyperv.shared_gpa_boundary; - ghcb_va = (void *)ioremap_cache(ghcb_gpa, HV_HYP_PAGE_SIZE); + ghcb_va = memremap(ghcb_gpa, HV_HYP_PAGE_SIZE, MEMREMAP_WB | MEMREMAP_DEC); if (!ghcb_va) return -ENOMEM; @@ -277,7 +277,7 @@ static int hv_cpu_die(unsigned int cpu) if (hv_ghcb_pg) { ghcb_va = (void **)this_cpu_ptr(hv_ghcb_pg); if (*ghcb_va) - iounmap(*ghcb_va); + memunmap(*ghcb_va); *ghcb_va = NULL; } From 2b4246153e2184e3a3b4edc8cc35337d7a2455a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20L=C3=B3pez?= Date: Thu, 22 Jan 2026 12:41:31 +0100 Subject: [PATCH 16/34] mshv: clear eventfd counter on irqfd shutdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While unhooking from the irqfd waitqueue, clear the internal eventfd counter by using eventfd_ctx_remove_wait_queue() instead of remove_wait_queue(), preventing potential spurious interrupts. This removes the need to store a pointer into the workqueue, as the eventfd already keeps track of it. This mimics what other similar subsystems do on their equivalent paths with their irqfds (KVM, Xen, ACRN support, etc). Signed-off-by: Carlos López Signed-off-by: Wei Liu --- drivers/hv/mshv_eventfd.c | 5 ++--- drivers/hv/mshv_eventfd.h | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c index dfc8b1092c02..525b64a5acc2 100644 --- a/drivers/hv/mshv_eventfd.c +++ b/drivers/hv/mshv_eventfd.c @@ -247,12 +247,13 @@ static void mshv_irqfd_shutdown(struct work_struct *work) { struct mshv_irqfd *irqfd = container_of(work, struct mshv_irqfd, irqfd_shutdown); + u64 cnt; /* * Synchronize with the wait-queue and unhook ourselves to prevent * further events.
*/ - remove_wait_queue(irqfd->irqfd_wqh, &irqfd->irqfd_wait); + eventfd_ctx_remove_wait_queue(irqfd->irqfd_eventfd_ctx, &irqfd->irqfd_wait, &cnt); if (irqfd->irqfd_resampler) { mshv_irqfd_resampler_shutdown(irqfd); @@ -371,8 +372,6 @@ static void mshv_irqfd_queue_proc(struct file *file, wait_queue_head_t *wqh, struct mshv_irqfd *irqfd = container_of(polltbl, struct mshv_irqfd, irqfd_polltbl); - irqfd->irqfd_wqh = wqh; - /* * TODO: Ensure there isn't already an exclusive, priority waiter, e.g. * that the irqfd isn't already bound to another partition. Only the diff --git a/drivers/hv/mshv_eventfd.h b/drivers/hv/mshv_eventfd.h index 332e7670a344..464c6b81ab33 100644 --- a/drivers/hv/mshv_eventfd.h +++ b/drivers/hv/mshv_eventfd.h @@ -32,7 +32,6 @@ struct mshv_irqfd { struct mshv_lapic_irq irqfd_lapic_irq; struct hlist_node irqfd_hnode; poll_table irqfd_polltbl; - wait_queue_head_t *irqfd_wqh; wait_queue_entry_t irqfd_wait; struct work_struct irqfd_shutdown; struct mshv_irqfd_resampler *irqfd_resampler; From afefdb2bc94571f0f9297dc129b2069942a70f4b Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 2 Feb 2026 08:48:39 -0800 Subject: [PATCH 17/34] x86/hyperv: Update comment in hyperv_cleanup() The comment in hyperv_cleanup() became out-of-date as a result of commit c8ed0812646e ("x86/hyperv: Use direct call to hypercall-page"). Update the comment. No code or functional change. 
Signed-off-by: Michael Kelley Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_init.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 6b00842ddda8..5dbe9bd67891 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -635,9 +635,13 @@ void hyperv_cleanup(void) hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); /* - * Reset hypercall page reference before reset the page, - * let hypercall operations fail safely rather than - * panic the kernel for using invalid hypercall page + * Reset hv_hypercall_pg before resetting it in the hypervisor. + * hv_set_hypercall_pg(NULL) is not used because at this point in the + * panic path other CPUs have been stopped, causing static_call_update() + * to hang. So resetting hv_hypercall_pg to cause hypercalls to fail + * cleanly is only operative on 32-bit builds. But this is OK as it is + * just a preventative measure to ease detecting a hypercall being made + * after this point, which shouldn't be happening anyway. */ hv_hypercall_pg = NULL; From 51515bfc29ed5971d4f0a98243bdc1c93fadb102 Mon Sep 17 00:00:00 2001 From: Mukesh R Date: Wed, 4 Feb 2026 12:23:28 -0800 Subject: [PATCH 18/34] mshv: make field names descriptive in a header struct When struct fields use very common names like "pages" or "type", it makes it difficult to find uses of these fields with tools like grep, cscope, etc when the struct is in a header file included in many places. Add prefix mreg_ to some fields in struct mshv_mem_region to make it easier to find them. There is no functional change. 
Signed-off-by: Mukesh R Signed-off-by: Wei Liu --- drivers/hv/mshv_regions.c | 60 ++++++++++++++++++------------------- drivers/hv/mshv_root.h | 10 +++---- drivers/hv/mshv_root_main.c | 10 +++---- 3 files changed, 40 insertions(+), 40 deletions(-) diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c index adba3564d9f1..c28aac0726de 100644 --- a/drivers/hv/mshv_regions.c +++ b/drivers/hv/mshv_regions.c @@ -88,7 +88,7 @@ static long mshv_region_process_chunk(struct mshv_mem_region *region, struct page *page; int stride, ret; - page = region->pages[page_offset]; + page = region->mreg_pages[page_offset]; if (!page) return -EINVAL; @@ -98,7 +98,7 @@ static long mshv_region_process_chunk(struct mshv_mem_region *region, /* Start at stride since the first stride is validated */ for (count = stride; count < page_count; count += stride) { - page = region->pages[page_offset + count]; + page = region->mreg_pages[page_offset + count]; /* Break if current page is not present */ if (!page) @@ -152,7 +152,7 @@ static int mshv_region_process_range(struct mshv_mem_region *region, while (page_count) { /* Skip non-present pages */ - if (!region->pages[page_offset]) { + if (!region->mreg_pages[page_offset]) { page_offset++; page_count--; continue; @@ -190,7 +190,7 @@ struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages, if (flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE)) region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE; - kref_init(&region->refcount); + kref_init(&region->mreg_refcount); return region; } @@ -204,7 +204,7 @@ static int mshv_region_chunk_share(struct mshv_mem_region *region, flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; return hv_call_modify_spa_host_access(region->partition->pt_id, - region->pages + page_offset, + region->mreg_pages + page_offset, page_count, HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE, @@ -229,7 +229,7 @@ static int mshv_region_chunk_unshare(struct mshv_mem_region *region, flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; return
hv_call_modify_spa_host_access(region->partition->pt_id, - region->pages + page_offset, + region->mreg_pages + page_offset, page_count, 0, flags, false); } @@ -254,7 +254,7 @@ static int mshv_region_chunk_remap(struct mshv_mem_region *region, return hv_call_map_gpa_pages(region->partition->pt_id, region->start_gfn + page_offset, page_count, flags, - region->pages + page_offset); + region->mreg_pages + page_offset); } static int mshv_region_remap_pages(struct mshv_mem_region *region, @@ -277,10 +277,10 @@ int mshv_region_map(struct mshv_mem_region *region) static void mshv_region_invalidate_pages(struct mshv_mem_region *region, u64 page_offset, u64 page_count) { - if (region->type == MSHV_REGION_TYPE_MEM_PINNED) - unpin_user_pages(region->pages + page_offset, page_count); + if (region->mreg_type == MSHV_REGION_TYPE_MEM_PINNED) + unpin_user_pages(region->mreg_pages + page_offset, page_count); - memset(region->pages + page_offset, 0, + memset(region->mreg_pages + page_offset, 0, page_count * sizeof(struct page *)); } @@ -297,7 +297,7 @@ int mshv_region_pin(struct mshv_mem_region *region) int ret; for (done_count = 0; done_count < region->nr_pages; done_count += ret) { - pages = region->pages + done_count; + pages = region->mreg_pages + done_count; userspace_addr = region->start_uaddr + done_count * HV_HYP_PAGE_SIZE; nr_pages = min(region->nr_pages - done_count, @@ -348,11 +348,11 @@ static int mshv_region_unmap(struct mshv_mem_region *region) static void mshv_region_destroy(struct kref *ref) { struct mshv_mem_region *region = - container_of(ref, struct mshv_mem_region, refcount); + container_of(ref, struct mshv_mem_region, mreg_refcount); struct mshv_partition *partition = region->partition; int ret; - if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE) + if (region->mreg_type == MSHV_REGION_TYPE_MEM_MOVABLE) mshv_region_movable_fini(region); if (mshv_partition_encrypted(partition)) { @@ -374,12 +374,12 @@ static void mshv_region_destroy(struct kref *ref) void 
mshv_region_put(struct mshv_mem_region *region) { - kref_put(&region->refcount, mshv_region_destroy); + kref_put(&region->mreg_refcount, mshv_region_destroy); } int mshv_region_get(struct mshv_mem_region *region) { - return kref_get_unless_zero(&region->refcount); + return kref_get_unless_zero(&region->mreg_refcount); } /** @@ -405,16 +405,16 @@ static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region, int ret; range->notifier_seq = mmu_interval_read_begin(range->notifier); - mmap_read_lock(region->mni.mm); + mmap_read_lock(region->mreg_mni.mm); ret = hmm_range_fault(range); - mmap_read_unlock(region->mni.mm); + mmap_read_unlock(region->mreg_mni.mm); if (ret) return ret; - mutex_lock(&region->mutex); + mutex_lock(&region->mreg_mutex); if (mmu_interval_read_retry(range->notifier, range->notifier_seq)) { - mutex_unlock(&region->mutex); + mutex_unlock(&region->mreg_mutex); cond_resched(); return -EBUSY; } @@ -438,7 +438,7 @@ static int mshv_region_range_fault(struct mshv_mem_region *region, u64 page_offset, u64 page_count) { struct hmm_range range = { - .notifier = &region->mni, + .notifier = &region->mreg_mni, .default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE, }; unsigned long *pfns; @@ -461,12 +461,12 @@ static int mshv_region_range_fault(struct mshv_mem_region *region, goto out; for (i = 0; i < page_count; i++) - region->pages[page_offset + i] = hmm_pfn_to_page(pfns[i]); + region->mreg_pages[page_offset + i] = hmm_pfn_to_page(pfns[i]); ret = mshv_region_remap_pages(region, region->hv_map_flags, page_offset, page_count); - mutex_unlock(&region->mutex); + mutex_unlock(&region->mreg_mutex); out: kfree(pfns); return ret; @@ -520,7 +520,7 @@ static bool mshv_region_interval_invalidate(struct mmu_interval_notifier *mni, { struct mshv_mem_region *region = container_of(mni, struct mshv_mem_region, - mni); + mreg_mni); u64 page_offset, page_count; unsigned long mstart, mend; int ret = -EPERM; @@ -533,8 +533,8 @@ static bool mshv_region_interval_invalidate(struct mmu_interval_notifier
*mni, page_count = HVPFN_DOWN(mend - mstart); if (mmu_notifier_range_blockable(range)) - mutex_lock(&region->mutex); - else if (!mutex_trylock(&region->mutex)) + mutex_lock(&region->mreg_mutex); + else if (!mutex_trylock(&region->mreg_mutex)) goto out_fail; mmu_interval_set_seq(mni, cur_seq); @@ -546,12 +546,12 @@ static bool mshv_region_interval_invalidate(struct mmu_interval_notifier *mni, mshv_region_invalidate_pages(region, page_offset, page_count); - mutex_unlock(&region->mutex); + mutex_unlock(&region->mreg_mutex); return true; out_unlock: - mutex_unlock(&region->mutex); + mutex_unlock(&region->mreg_mutex); out_fail: WARN_ONCE(ret, "Failed to invalidate region %#llx-%#llx (range %#lx-%#lx, event: %u, pages %#llx-%#llx, mm: %#llx): %d\n", @@ -568,21 +568,21 @@ static const struct mmu_interval_notifier_ops mshv_region_mni_ops = { void mshv_region_movable_fini(struct mshv_mem_region *region) { - mmu_interval_notifier_remove(&region->mni); + mmu_interval_notifier_remove(&region->mreg_mni); } bool mshv_region_movable_init(struct mshv_mem_region *region) { int ret; - ret = mmu_interval_notifier_insert(&region->mni, current->mm, + ret = mmu_interval_notifier_insert(&region->mreg_mni, current->mm, region->start_uaddr, region->nr_pages << HV_HYP_PAGE_SHIFT, &mshv_region_mni_ops); if (ret) return false; - mutex_init(&region->mutex); + mutex_init(&region->mreg_mutex); return true; } diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h index 7332d9af8373..04c2a1910a8a 100644 --- a/drivers/hv/mshv_root.h +++ b/drivers/hv/mshv_root.h @@ -82,16 +82,16 @@ enum mshv_region_type { struct mshv_mem_region { struct hlist_node hnode; - struct kref refcount; + struct kref mreg_refcount; u64 nr_pages; u64 start_gfn; u64 start_uaddr; u32 hv_map_flags; struct mshv_partition *partition; - enum mshv_region_type type; - struct mmu_interval_notifier mni; - struct mutex mutex; /* protects region pages remapping */ - struct page *pages[]; + enum mshv_region_type mreg_type; + struct mmu_interval_notifier mreg_mni; + struct
mutex mreg_mutex; /* protects region pages remapping */ + struct page *mreg_pages[]; }; struct mshv_irq_ack_notifier { diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index c633014ceb96..431aebf95bc7 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -650,7 +650,7 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) return false; /* Only movable memory ranges are supported for GPA intercepts */ - if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE) + if (region->mreg_type == MSHV_REGION_TYPE_MEM_MOVABLE) ret = mshv_region_handle_gfn_fault(region, gfn); else ret = false; @@ -1193,12 +1193,12 @@ static int mshv_partition_create_region(struct mshv_partition *partition, return PTR_ERR(rg); if (is_mmio) - rg->type = MSHV_REGION_TYPE_MMIO; + rg->mreg_type = MSHV_REGION_TYPE_MMIO; else if (mshv_partition_encrypted(partition) || !mshv_region_movable_init(rg)) - rg->type = MSHV_REGION_TYPE_MEM_PINNED; + rg->mreg_type = MSHV_REGION_TYPE_MEM_PINNED; else - rg->type = MSHV_REGION_TYPE_MEM_MOVABLE; + rg->mreg_type = MSHV_REGION_TYPE_MEM_MOVABLE; rg->partition = partition; @@ -1315,7 +1315,7 @@ mshv_map_user_memory(struct mshv_partition *partition, if (ret) return ret; - switch (region->type) { + switch (region->mreg_type) { case MSHV_REGION_TYPE_MEM_PINNED: ret = mshv_prepare_pinned_region(region); break; From 2e7577cd5ddc1f86d1b6c48caf3cfa87dbb14e34 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 5 Feb 2026 04:40:10 -0500 Subject: [PATCH 19/34] mshv: fix SRCU protection in irqfd resampler ack handler Replace hlist_for_each_entry_rcu() with hlist_for_each_entry_srcu() in mshv_irqfd_resampler_ack() to correctly handle SRCU-protected linked list traversal. The function uses SRCU (sleepable RCU) synchronization via partition->pt_irq_srcu, but was incorrectly using the RCU variant for list iteration. This could lead to race conditions when the list is modified concurrently. 
Also add srcu_read_lock_held() assertion as required by hlist_for_each_entry_srcu() to ensure we're in the proper read-side critical section. Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs") Signed-off-by: Li RongQing Reviewed-by: Anirudh Rayabharam (Microsoft) Acked-by: Stanislav Kinsburskii Signed-off-by: Wei Liu --- drivers/hv/mshv_eventfd.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c index 525b64a5acc2..5e0b10aeeaa2 100644 --- a/drivers/hv/mshv_eventfd.c +++ b/drivers/hv/mshv_eventfd.c @@ -87,8 +87,9 @@ static void mshv_irqfd_resampler_ack(struct mshv_irq_ack_notifier *mian) idx = srcu_read_lock(&partition->pt_irq_srcu); - hlist_for_each_entry_rcu(irqfd, &resampler->rsmplr_irqfd_list, - irqfd_resampler_hnode) { + hlist_for_each_entry_srcu(irqfd, &resampler->rsmplr_irqfd_list, + irqfd_resampler_hnode, + srcu_read_lock_held(&partition->pt_irq_srcu)) { if (hv_should_clear_interrupt(irqfd->irqfd_lapic_irq.lapic_control.interrupt_type)) hv_call_clear_virtual_interrupt(partition->pt_id); From 834ef6aa0996121184728279c4b81a3b70ee649b Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 21 Nov 2025 15:14:10 +0100 Subject: [PATCH 20/34] x86/hyperv: Use savesegment() instead of inline asm() to save segment registers Use standard savesegment() utility macro to save segment registers. Signed-off-by: Uros Bizjak Acked-by: Wei Liu Tested-by: Michael Kelley Cc: K. Y. Srinivasan Cc: Haiyang Zhang Cc: Dexuan Cui Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: H. 
Peter Anvin Signed-off-by: Wei Liu --- arch/x86/hyperv/ivm.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c index 651771534cae..7365d8f43181 100644 --- a/arch/x86/hyperv/ivm.c +++ b/arch/x86/hyperv/ivm.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -315,16 +316,16 @@ int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip, unsigned int cpu) vmsa->gdtr.base = gdtr.address; vmsa->gdtr.limit = gdtr.size; - asm volatile("movl %%es, %%eax;" : "=a" (vmsa->es.selector)); + savesegment(es, vmsa->es.selector); hv_populate_vmcb_seg(vmsa->es, vmsa->gdtr.base); - asm volatile("movl %%cs, %%eax;" : "=a" (vmsa->cs.selector)); + savesegment(cs, vmsa->cs.selector); hv_populate_vmcb_seg(vmsa->cs, vmsa->gdtr.base); - asm volatile("movl %%ss, %%eax;" : "=a" (vmsa->ss.selector)); + savesegment(ss, vmsa->ss.selector); hv_populate_vmcb_seg(vmsa->ss, vmsa->gdtr.base); - asm volatile("movl %%ds, %%eax;" : "=a" (vmsa->ds.selector)); + savesegment(ds, vmsa->ds.selector); hv_populate_vmcb_seg(vmsa->ds, vmsa->gdtr.base); vmsa->efer = native_read_msr(MSR_EFER); From 885e78d71f772dd4c83f83530814870062115f85 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 21 Nov 2025 15:14:11 +0100 Subject: [PATCH 21/34] x86/hyperv: Remove ASM_CALL_CONSTRAINT with VMMCALL insn Unlike CALL instruction, VMMCALL does not push to the stack, so it's OK to allow the compiler to insert it before the frame pointer gets set up by the containing function. ASM_CALL_CONSTRAINT is for CALLs that must be inserted after the frame pointer is set up, so it is over-constraining here and can be removed. Signed-off-by: Uros Bizjak Tested-by: Michael Kelley Cc: K. Y. Srinivasan Cc: Haiyang Zhang Cc: Wei Liu Cc: Dexuan Cui Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: H. 
Peter Anvin Signed-off-by: Wei Liu --- arch/x86/hyperv/ivm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c index 7365d8f43181..be7fad43a88d 100644 --- a/arch/x86/hyperv/ivm.c +++ b/arch/x86/hyperv/ivm.c @@ -392,7 +392,7 @@ u64 hv_snp_hypercall(u64 control, u64 param1, u64 param2) register u64 __r8 asm("r8") = param2; asm volatile("vmmcall" - : "=a" (hv_status), ASM_CALL_CONSTRAINT, + : "=a" (hv_status), "+c" (control), "+d" (param1), "+r" (__r8) : : "cc", "memory", "r9", "r10", "r11"); From f8e6343b7a89c7c649db5a9e309ba7aa20401813 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 16 Feb 2026 17:24:56 +0100 Subject: [PATCH 22/34] Drivers: hv: vmbus: Use kthread for vmbus interrupts on PREEMPT_RT Resolves the following lockdep report when booting PREEMPT_RT on Hyper-V with related guest support enabled: [ 1.127941] hv_vmbus: registering driver hyperv_drm [ 1.132518] ============================= [ 1.132519] [ BUG: Invalid wait context ] [ 1.132521] 6.19.0-rc8+ #9 Not tainted [ 1.132524] ----------------------------- [ 1.132525] swapper/0/0 is trying to lock: [ 1.132526] ffff8b9381bb3c90 (&channel->sched_lock){....}-{3:3}, at: vmbus_chan_sched+0xc4/0x2b0 [ 1.132543] other info that might help us debug this: [ 1.132544] context-{2:2} [ 1.132545] 1 lock held by swapper/0/0: [ 1.132547] #0: ffffffffa010c4c0 (rcu_read_lock){....}-{1:3}, at: vmbus_chan_sched+0x31/0x2b0 [ 1.132557] stack backtrace: [ 1.132560] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.19.0-rc8+ #9 PREEMPT_{RT,(lazy)} [ 1.132565] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 09/25/2025 [ 1.132567] Call Trace: [ 1.132570] [ 1.132573] dump_stack_lvl+0x6e/0xa0 [ 1.132581] __lock_acquire+0xee0/0x21b0 [ 1.132592] lock_acquire+0xd5/0x2d0 [ 1.132598] ? vmbus_chan_sched+0xc4/0x2b0 [ 1.132606] ? lock_acquire+0xd5/0x2d0 [ 1.132613] ? 
vmbus_chan_sched+0x31/0x2b0 [ 1.132619] rt_spin_lock+0x3f/0x1f0 [ 1.132623] ? vmbus_chan_sched+0xc4/0x2b0 [ 1.132629] ? vmbus_chan_sched+0x31/0x2b0 [ 1.132634] vmbus_chan_sched+0xc4/0x2b0 [ 1.132641] vmbus_isr+0x2c/0x150 [ 1.132648] __sysvec_hyperv_callback+0x5f/0xa0 [ 1.132654] sysvec_hyperv_callback+0x88/0xb0 [ 1.132658] [ 1.132659] [ 1.132660] asm_sysvec_hyperv_callback+0x1a/0x20 As code paths that handle vmbus IRQs use sleepy locks under PREEMPT_RT, the vmbus_isr execution needs to be moved into thread context. Open- coding this allows to skip the IPI that irq_work would additionally bring and which we do not need, being an IRQ, never an NMI. This affects both x86 and arm64, therefore hook into the common driver logic. Signed-off-by: Jan Kiszka Reviewed-by: Florian Bezdeka Tested-by: Florian Bezdeka Reviewed-by: Michael Kelley Tested-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 66 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index a53af6fe81a6..1d5cba142828 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -1350,7 +1351,7 @@ static void vmbus_message_sched(struct hv_per_cpu_context *hv_cpu, void *message } } -void vmbus_isr(void) +static void __vmbus_isr(void) { struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context); @@ -1363,6 +1364,53 @@ void vmbus_isr(void) add_interrupt_randomness(vmbus_interrupt); } + +static DEFINE_PER_CPU(bool, vmbus_irq_pending); +static DEFINE_PER_CPU(struct task_struct *, vmbus_irqd); + +static void vmbus_irqd_wake(void) +{ + struct task_struct *tsk = __this_cpu_read(vmbus_irqd); + + __this_cpu_write(vmbus_irq_pending, true); + wake_up_process(tsk); +} + +static void vmbus_irqd_setup(unsigned int cpu) +{ + sched_set_fifo(current); +} + +static int vmbus_irqd_should_run(unsigned int cpu) +{ + 
return __this_cpu_read(vmbus_irq_pending); +} + +static void run_vmbus_irqd(unsigned int cpu) +{ + __this_cpu_write(vmbus_irq_pending, false); + __vmbus_isr(); +} + +static bool vmbus_irq_initialized; + +static struct smp_hotplug_thread vmbus_irq_threads = { + .store = &vmbus_irqd, + .setup = vmbus_irqd_setup, + .thread_should_run = vmbus_irqd_should_run, + .thread_fn = run_vmbus_irqd, + .thread_comm = "vmbus_irq/%u", +}; + +void vmbus_isr(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + vmbus_irqd_wake(); + } else { + lockdep_hardirq_threaded(); + __vmbus_isr(); + } +} EXPORT_SYMBOL_FOR_MODULES(vmbus_isr, "mshv_vtl"); static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id) @@ -1462,6 +1510,13 @@ static int vmbus_bus_init(void) * the VMbus interrupt handler. */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && !vmbus_irq_initialized) { + ret = smpboot_register_percpu_thread(&vmbus_irq_threads); + if (ret) + goto err_kthread; + vmbus_irq_initialized = true; + } + if (vmbus_irq == -1) { hv_setup_vmbus_handler(vmbus_isr); } else { @@ -1507,6 +1562,11 @@ err_connect: free_percpu(vmbus_evt); } err_setup: + if (IS_ENABLED(CONFIG_PREEMPT_RT) && vmbus_irq_initialized) { + smpboot_unregister_percpu_thread(&vmbus_irq_threads); + vmbus_irq_initialized = false; + } +err_kthread: bus_unregister(&hv_bus); return ret; } @@ -2976,6 +3036,10 @@ static void __exit vmbus_exit(void) free_percpu_irq(vmbus_irq, vmbus_evt); free_percpu(vmbus_evt); } + if (IS_ENABLED(CONFIG_PREEMPT_RT) && vmbus_irq_initialized) { + smpboot_unregister_percpu_thread(&vmbus_irq_threads); + vmbus_irq_initialized = false; + } for_each_online_cpu(cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); From fe9f15983c4823a8473e289b4a302946a4864ef5 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Tue, 17 Feb 2026 15:11:58 -0800 Subject: [PATCH 23/34] x86/hyperv: Reserve 3 interrupt vectors used exclusively by MSHV MSVC compiler, used to compile the Microsoft Hypervisor, currently has an 
assert intrinsic that uses interrupt vector 0x29 to create an exception. This causes the hypervisor to crash and collect a core dump. As such, if this interrupt number is assigned to a device by Linux and the device generates it, the hypervisor will crash. There are two other such vectors hard coded in the hypervisor, 0x2C and 0x2D, which are used for debug purposes. Fortunately, the three vectors are part of the kernel driver space, which makes it feasible to reserve them early so they are not assigned later. Signed-off-by: Mukesh Rathor Signed-off-by: Wei Liu --- arch/x86/kernel/cpu/mshyperv.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 579fb2c64cfd..89a2eb8a0722 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -478,6 +478,28 @@ int hv_get_hypervisor_version(union hv_hypervisor_version_info *info) } EXPORT_SYMBOL_GPL(hv_get_hypervisor_version); +/* + * Reserved vectors hard coded in the hypervisor. If used outside, the hypervisor + * will either crash or hang or attempt to break into debugger. 
+ */ +static void hv_reserve_irq_vectors(void) +{ + #define HYPERV_DBG_FASTFAIL_VECTOR 0x29 + #define HYPERV_DBG_ASSERT_VECTOR 0x2C + #define HYPERV_DBG_SERVICE_VECTOR 0x2D + + if (cpu_feature_enabled(X86_FEATURE_FRED)) + return; + + if (test_and_set_bit(HYPERV_DBG_ASSERT_VECTOR, system_vectors) || + test_and_set_bit(HYPERV_DBG_SERVICE_VECTOR, system_vectors) || + test_and_set_bit(HYPERV_DBG_FASTFAIL_VECTOR, system_vectors)) + BUG(); + + pr_info("Hyper-V: reserve vectors: %d %d %d\n", HYPERV_DBG_ASSERT_VECTOR, + HYPERV_DBG_SERVICE_VECTOR, HYPERV_DBG_FASTFAIL_VECTOR); +} + static void __init ms_hyperv_init_platform(void) { int hv_max_functions_eax, eax; @@ -510,6 +532,9 @@ static void __init ms_hyperv_init_platform(void) hv_identify_partition_type(); + if (hv_root_partition()) + hv_reserve_irq_vectors(); + if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) ms_hyperv.hints |= HV_DEPRECATING_AEOI_RECOMMENDED; From 705d01c8d78121ee1634bfc602ac4b0ad1438fab Mon Sep 17 00:00:00 2001 From: Ethan Tidmore Date: Wed, 18 Feb 2026 13:09:03 -0600 Subject: [PATCH 24/34] x86/hyperv: Fix error pointer dereference The function idle_thread_get() can return an error pointer and is not checked for it. Add check for error pointer. 
Detected by Smatch: arch/x86/hyperv/hv_vtl.c:126 hv_vtl_bringup_vcpu() error: 'idle' dereferencing possible ERR_PTR() Fixes: 2b4b90e053a29 ("x86/hyperv: Use per cpu initial stack for vtl context") Signed-off-by: Ethan Tidmore Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_vtl.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c index c0edaed0efb3..9b6a9bc4ab76 100644 --- a/arch/x86/hyperv/hv_vtl.c +++ b/arch/x86/hyperv/hv_vtl.c @@ -110,7 +110,7 @@ static void hv_vtl_ap_entry(void) static int hv_vtl_bringup_vcpu(u32 target_vp_index, int cpu, u64 eip_ignored) { - u64 status; + u64 status, rsp, rip; int ret = 0; struct hv_enable_vp_vtl *input; unsigned long irq_flags; @@ -123,9 +123,11 @@ static int hv_vtl_bringup_vcpu(u32 target_vp_index, int cpu, u64 eip_ignored) struct desc_struct *gdt; struct task_struct *idle = idle_thread_get(cpu); - u64 rsp = (unsigned long)idle->thread.sp; + if (IS_ERR(idle)) + return PTR_ERR(idle); - u64 rip = (u64)&hv_vtl_ap_entry; + rsp = (unsigned long)idle->thread.sp; + rip = (u64)&hv_vtl_ap_entry; native_store_gdt(&gdt_ptr); store_idt(&idt_ptr); From 0597696017fe6c172bce7827be32f4bbd02542ab Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 18 Feb 2026 12:00:18 +0100 Subject: [PATCH 25/34] mshv: Use try_cmpxchg() instead of cmpxchg() Use !try_cmpxchg() instead of cmpxchg (*ptr, old, new) != old. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after CMPXCHG. 
The generated assembly code improves from e.g.: 415: 48 8b 44 24 30 mov 0x30(%rsp),%rax 41a: 48 8b 54 24 38 mov 0x38(%rsp),%rdx 41f: f0 49 0f b1 91 a8 02 lock cmpxchg %rdx,0x2a8(%r9) 426: 00 00 428: 48 3b 44 24 30 cmp 0x30(%rsp),%rax 42d: 0f 84 09 ff ff ff je 33c <...> to: 415: 48 8b 44 24 30 mov 0x30(%rsp),%rax 41a: 48 8b 54 24 38 mov 0x38(%rsp),%rdx 41f: f0 49 0f b1 91 a8 02 lock cmpxchg %rdx,0x2a8(%r9) 426: 00 00 428: 0f 84 0e ff ff ff je 33c <...> No functional change intended. Signed-off-by: Uros Bizjak Cc: K. Y. Srinivasan Cc: Haiyang Zhang Cc: Wei Liu Cc: Dexuan Cui Cc: Long Li Signed-off-by: Wei Liu --- drivers/hv/hyperv_vmbus.h | 4 ++-- drivers/hv/mshv_eventfd.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index cdbc5f5c3215..7bd8f8486e85 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -370,8 +370,8 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) * CHANNELMSG_UNLOAD_RESPONSE and we don't care about other messages * on crash. 
*/ - if (cmpxchg(&msg->header.message_type, old_msg_type, - HVMSG_NONE) != old_msg_type) + if (!try_cmpxchg(&msg->header.message_type, + &old_msg_type, HVMSG_NONE)) return; /* diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c index 5e0b10aeeaa2..492c6258045c 100644 --- a/drivers/hv/mshv_eventfd.c +++ b/drivers/hv/mshv_eventfd.c @@ -129,8 +129,8 @@ static int mshv_vp_irq_try_set_vector(struct mshv_vp *vp, u32 vector) new_iv.vector[new_iv.vector_count++] = vector; - if (cmpxchg(&vp->vp_register_page->interrupt_vectors.as_uint64, - iv.as_uint64, new_iv.as_uint64) != iv.as_uint64) + if (!try_cmpxchg(&vp->vp_register_page->interrupt_vectors.as_uint64, + &iv.as_uint64, new_iv.as_uint64)) return -EAGAIN; return 0; From 4bef6b28bab8697b4f9255c375da2b6b6943a969 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsburskii Date: Wed, 18 Feb 2026 19:11:40 +0000 Subject: [PATCH 26/34] mshv: Add support for integrated scheduler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Query the hypervisor for integrated scheduler support and use it if configured. Microsoft Hypervisor originally provided two schedulers: root and core. The root scheduler allows the root partition to schedule guest vCPUs across physical cores, supporting both time slicing and CPU affinity (e.g., via cgroups). In contrast, the core scheduler delegates vCPU-to-physical-core scheduling entirely to the hypervisor. Direct virtualization introduces a new privileged guest partition type - L1 Virtual Host (L1VH) — which can create child partitions from its own resources. These child partitions are effectively siblings, scheduled by the hypervisor's core scheduler. This prevents the L1VH parent from setting affinity or time slicing for its own processes or guest VPs. 
While cgroups, CFS, and cpuset controllers can still be used, their effectiveness is unpredictable, as the core scheduler swaps vCPUs according to its own logic (typically round-robin across all allocated physical CPUs). As a result, the system may appear to "steal" time from the L1VH and its children. To address this, Microsoft Hypervisor introduces the integrated scheduler. This allows an L1VH partition to schedule its own vCPUs and those of its guests across its "physical" cores, effectively emulating root scheduler behavior within the L1VH, while retaining core scheduler behavior for the rest of the system. The integrated scheduler is controlled by the root partition and gated by the vmm_enable_integrated_scheduler capability bit. If set, the hypervisor supports the integrated scheduler. The L1VH partition must then check if it is enabled by querying the corresponding extended partition property. If this property is true, the L1VH partition must use the root scheduler logic; otherwise, it must use the core scheduler. As a consequence, reading the VMM capabilities becomes mandatory for an L1VH partition as well. 
Signed-off-by: Andreea Pintilie Signed-off-by: Stanislav Kinsburskii Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/mshv_root_main.c | 82 ++++++++++++++++++++++--------------- include/hyperv/hvhdk_mini.h | 7 +++- 2 files changed, 56 insertions(+), 33 deletions(-) diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index 431aebf95bc7..c6ec88884728 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -2079,6 +2079,29 @@ static const char *scheduler_type_to_string(enum hv_scheduler_type type) }; } +static int __init l1vh_retrieve_scheduler_type(enum hv_scheduler_type *out) +{ + u64 integrated_sched_enabled; + int ret; + + *out = HV_SCHEDULER_TYPE_CORE_SMT; + + if (!mshv_root.vmm_caps.vmm_enable_integrated_scheduler) + return 0; + + ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF, + HV_PARTITION_PROPERTY_INTEGRATED_SCHEDULER_ENABLED, + 0, &integrated_sched_enabled, + sizeof(integrated_sched_enabled)); + if (ret) + return ret; + + if (integrated_sched_enabled) + *out = HV_SCHEDULER_TYPE_ROOT; + + return 0; +} + /* TODO move this to hv_common.c when needed outside */ static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out) { @@ -2111,13 +2134,12 @@ static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out) /* Retrieve and stash the supported scheduler type */ static int __init mshv_retrieve_scheduler_type(struct device *dev) { - int ret = 0; + int ret; if (hv_l1vh_partition()) - hv_scheduler_type = HV_SCHEDULER_TYPE_CORE_SMT; + ret = l1vh_retrieve_scheduler_type(&hv_scheduler_type); else ret = hv_retrieve_scheduler_type(&hv_scheduler_type); - if (ret) return ret; @@ -2237,42 +2259,29 @@ struct notifier_block mshv_reboot_nb = { static void mshv_root_partition_exit(void) { unregister_reboot_notifier(&mshv_reboot_nb); - root_scheduler_deinit(); } static int __init mshv_root_partition_init(struct device *dev) { - int err; - - err = root_scheduler_init(dev); - if (err) 
- return err; - - err = register_reboot_notifier(&mshv_reboot_nb); - if (err) - goto root_sched_deinit; - - return 0; - -root_sched_deinit: - root_scheduler_deinit(); - return err; + return register_reboot_notifier(&mshv_reboot_nb); } -static void mshv_init_vmm_caps(struct device *dev) +static int __init mshv_init_vmm_caps(struct device *dev) { - /* - * This can only fail here if HVCALL_GET_PARTITION_PROPERTY_EX or - * HV_PARTITION_PROPERTY_VMM_CAPABILITIES are not supported. In that - * case it's valid to proceed as if all vmm_caps are disabled (zero). - */ - if (hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF, - HV_PARTITION_PROPERTY_VMM_CAPABILITIES, - 0, &mshv_root.vmm_caps, - sizeof(mshv_root.vmm_caps))) - dev_warn(dev, "Unable to get VMM capabilities\n"); + int ret; + + ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF, + HV_PARTITION_PROPERTY_VMM_CAPABILITIES, + 0, &mshv_root.vmm_caps, + sizeof(mshv_root.vmm_caps)); + if (ret && hv_l1vh_partition()) { + dev_err(dev, "Failed to get VMM capabilities: %d\n", ret); + return ret; + } dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]); + + return 0; } static int __init mshv_parent_partition_init(void) @@ -2318,6 +2327,10 @@ static int __init mshv_parent_partition_init(void) mshv_cpuhp_online = ret; + ret = mshv_init_vmm_caps(dev); + if (ret) + goto remove_cpu_state; + ret = mshv_retrieve_scheduler_type(dev); if (ret) goto remove_cpu_state; @@ -2327,11 +2340,13 @@ static int __init mshv_parent_partition_init(void) if (ret) goto remove_cpu_state; - mshv_init_vmm_caps(dev); + ret = root_scheduler_init(dev); + if (ret) + goto exit_partition; ret = mshv_debugfs_init(); if (ret) - goto exit_partition; + goto deinit_root_scheduler; ret = mshv_irqfd_wq_init(); if (ret) @@ -2346,6 +2361,8 @@ static int __init mshv_parent_partition_init(void) exit_debugfs: mshv_debugfs_exit(); +deinit_root_scheduler: + root_scheduler_deinit(); exit_partition: if (hv_root_partition()) 
mshv_root_partition_exit(); @@ -2365,6 +2382,7 @@ static void __exit mshv_parent_partition_exit(void) mshv_debugfs_exit(); misc_deregister(&mshv_dev); mshv_irqfd_wq_cleanup(); + root_scheduler_deinit(); if (hv_root_partition()) mshv_root_partition_exit(); cpuhp_remove_state(mshv_cpuhp_online); diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h index 41a29bf8ec14..c0300910808b 100644 --- a/include/hyperv/hvhdk_mini.h +++ b/include/hyperv/hvhdk_mini.h @@ -87,6 +87,9 @@ enum hv_partition_property_code { HV_PARTITION_PROPERTY_PRIVILEGE_FLAGS = 0x00010000, HV_PARTITION_PROPERTY_SYNTHETIC_PROC_FEATURES = 0x00010001, + /* Integrated scheduling properties */ + HV_PARTITION_PROPERTY_INTEGRATED_SCHEDULER_ENABLED = 0x00020005, + /* Resource properties */ HV_PARTITION_PROPERTY_GPA_PAGE_ACCESS_TRACKING = 0x00050005, HV_PARTITION_PROPERTY_UNIMPLEMENTED_MSR_ACTION = 0x00050017, @@ -102,7 +105,7 @@ enum hv_partition_property_code { }; #define HV_PARTITION_VMM_CAPABILITIES_BANK_COUNT 1 -#define HV_PARTITION_VMM_CAPABILITIES_RESERVED_BITFIELD_COUNT 59 +#define HV_PARTITION_VMM_CAPABILITIES_RESERVED_BITFIELD_COUNT 57 struct hv_partition_property_vmm_capabilities { u16 bank_count; @@ -119,6 +122,8 @@ struct hv_partition_property_vmm_capabilities { u64 reservedbit3: 1; #endif u64 assignable_synthetic_proc_features: 1; + u64 reservedbit5: 1; + u64 vmm_enable_integrated_scheduler : 1; u64 reserved0: HV_PARTITION_VMM_CAPABILITIES_RESERVED_BITFIELD_COUNT; } __packed; }; From 36d6cbb62133fc6eea28f380409e0fb190f3dfbe Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Wed, 18 Feb 2026 23:32:17 +0000 Subject: [PATCH 27/34] mshv: expose the scrub partition hypercall This hypercall needs to be exposed for VMMs to soft-reboot guests. It will reset APIC and synthetic interrupt controller state, among others. 
Signed-off-by: Magnus Kulke Signed-off-by: Wei Liu --- drivers/hv/mshv_root_main.c | 1 + include/hyperv/hvgdk_mini.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index c6ec88884728..e5d94398528e 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -120,6 +120,7 @@ static u16 mshv_passthru_hvcalls[] = { HVCALL_SET_VP_REGISTERS, HVCALL_TRANSLATE_VIRTUAL_ADDRESS, HVCALL_CLEAR_VIRTUAL_INTERRUPT, + HVCALL_SCRUB_PARTITION, HVCALL_REGISTER_INTERCEPT_RESULT, HVCALL_ASSERT_VIRTUAL_INTERRUPT, HVCALL_GET_GPA_PAGES_ACCESS_STATES, diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index 30fbbde81c5c..d9aa5afb0a27 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -474,6 +474,7 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ #define HVCALL_NOTIFY_PARTITION_EVENT 0x0087 #define HVCALL_ENTER_SLEEP_STATE 0x0084 #define HVCALL_NOTIFY_PORT_RING_EMPTY 0x008b +#define HVCALL_SCRUB_PARTITION 0x008d #define HVCALL_REGISTER_INTERCEPT_RESULT 0x0091 #define HVCALL_ASSERT_VIRTUAL_INTERRUPT 0x0094 #define HVCALL_CREATE_PORT 0x0095 From 30d25a8fc04cf1806c09362616e861d6fd339f98 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 18 Feb 2026 09:01:21 -0800 Subject: [PATCH 28/34] Drivers: hv: vmbus: Simplify allocation of vmbus_evt The per-cpu variable vmbus_evt is currently dynamically allocated. It's only 8 bytes, so just allocate it statically to simplify and save a few lines of code. 
Signed-off-by: Michael Kelley Reviewed-by: Long Li Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 1d5cba142828..771792da6a4f 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -52,7 +52,7 @@ static struct device *vmbus_root_device; static int hyperv_cpuhp_online; -static long __percpu *vmbus_evt; +static DEFINE_PER_CPU(long, vmbus_evt); /* Values parsed from ACPI DSDT */ int vmbus_irq; @@ -1520,13 +1520,11 @@ static int vmbus_bus_init(void) if (vmbus_irq == -1) { hv_setup_vmbus_handler(vmbus_isr); } else { - vmbus_evt = alloc_percpu(long); ret = request_percpu_irq(vmbus_irq, vmbus_percpu_isr, - "Hyper-V VMbus", vmbus_evt); + "Hyper-V VMbus", &vmbus_evt); if (ret) { pr_err("Can't request Hyper-V VMbus IRQ %d, Err %d", vmbus_irq, ret); - free_percpu(vmbus_evt); goto err_setup; } } @@ -1555,12 +1553,10 @@ static int vmbus_bus_init(void) return 0; err_connect: - if (vmbus_irq == -1) { + if (vmbus_irq == -1) hv_remove_vmbus_handler(); - } else { - free_percpu_irq(vmbus_irq, vmbus_evt); - free_percpu(vmbus_evt); - } + else + free_percpu_irq(vmbus_irq, &vmbus_evt); err_setup: if (IS_ENABLED(CONFIG_PREEMPT_RT) && vmbus_irq_initialized) { smpboot_unregister_percpu_thread(&vmbus_irq_threads); @@ -3030,12 +3026,10 @@ static void __exit vmbus_exit(void) vmbus_connection.conn_state = DISCONNECTED; hv_stimer_global_cleanup(); vmbus_disconnect(); - if (vmbus_irq == -1) { + if (vmbus_irq == -1) hv_remove_vmbus_handler(); - } else { - free_percpu_irq(vmbus_irq, vmbus_evt); - free_percpu(vmbus_evt); - } + else + free_percpu_irq(vmbus_irq, &vmbus_evt); if (IS_ENABLED(CONFIG_PREEMPT_RT) && vmbus_irq_initialized) { smpboot_unregister_percpu_thread(&vmbus_irq_threads); vmbus_irq_initialized = false; From a284dbc96a47891a7a595a1c81b1e2da4d309cf6 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Wed, 18 Feb 2026 14:47:59 +0000 
Subject: [PATCH 29/34] mshv: Add nested virtualization creation flag Introduce HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE to indicate support for nested virtualization during partition creation. This enables clearer configuration and capability checks for nested virtualization scenarios. Signed-off-by: Stanislav Kinsburskii Signed-off-by: Muminul Islam Signed-off-by: Wei Liu --- drivers/hv/mshv_root_main.c | 2 ++ include/hyperv/hvhdk.h | 1 + include/uapi/linux/mshv.h | 1 + 3 files changed, 4 insertions(+) diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index e5d94398528e..e490f8e5a8a5 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -1947,6 +1947,8 @@ static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags, *pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE; if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES)) *pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; + if (args.pt_flags & BIT(MSHV_PT_BIT_NESTED_VIRTUALIZATION)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE; isol_props->as_uint64 = 0; diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h index 79d1f16a850a..f139c7c5bb2d 100644 --- a/include/hyperv/hvhdk.h +++ b/include/hyperv/hvhdk.h @@ -335,6 +335,7 @@ union hv_partition_isolation_properties { #define HV_PARTITION_ISOLATION_HOST_TYPE_RESERVED 0x2 /* Note: Exo partition is enabled by default */ +#define HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE BIT(1) #define HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED BIT(4) #define HV_PARTITION_CREATION_FLAG_EXO_PARTITION BIT(8) #define HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED BIT(13) diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h index dee3ece28ce5..7ef5dd67a232 100644 --- a/include/uapi/linux/mshv.h +++ b/include/uapi/linux/mshv.h @@ -27,6 +27,7 @@ enum { MSHV_PT_BIT_X2APIC, MSHV_PT_BIT_GPA_SUPER_PAGES, 
MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES, + MSHV_PT_BIT_NESTED_VIRTUALIZATION, MSHV_PT_BIT_COUNT, }; From 8927a108a7662eb83eb667bc0c5a0633397122b1 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 18 Feb 2026 14:48:02 +0000 Subject: [PATCH 30/34] mshv: Add SMT_ENABLED_GUEST partition creation flag Add support for HV_PARTITION_CREATION_FLAG_SMT_ENABLED_GUEST to allow userspace VMMs to enable SMT for guest partitions. Expose this via new MSHV_PT_BIT_SMT_ENABLED_GUEST flag in the UAPI. Without this flag, the hypervisor schedules guest VPs incorrectly, making SMT unusable. Signed-off-by: Anatol Belski Signed-off-by: Wei Liu --- drivers/hv/mshv_root_main.c | 2 ++ include/hyperv/hvhdk.h | 1 + include/uapi/linux/mshv.h | 1 + 3 files changed, 4 insertions(+) diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index e490f8e5a8a5..192467a25f66 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -1949,6 +1949,8 @@ static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags, *pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; if (args.pt_flags & BIT(MSHV_PT_BIT_NESTED_VIRTUALIZATION)) *pt_flags |= HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE; + if (args.pt_flags & BIT(MSHV_PT_BIT_SMT_ENABLED_GUEST)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_SMT_ENABLED_GUEST; isol_props->as_uint64 = 0; diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h index f139c7c5bb2d..245f3db53bf1 100644 --- a/include/hyperv/hvhdk.h +++ b/include/hyperv/hvhdk.h @@ -335,6 +335,7 @@ union hv_partition_isolation_properties { #define HV_PARTITION_ISOLATION_HOST_TYPE_RESERVED 0x2 /* Note: Exo partition is enabled by default */ +#define HV_PARTITION_CREATION_FLAG_SMT_ENABLED_GUEST BIT(0) #define HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE BIT(1) #define HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED BIT(4) #define HV_PARTITION_CREATION_FLAG_EXO_PARTITION BIT(8) diff --git a/include/uapi/linux/mshv.h 
b/include/uapi/linux/mshv.h index 7ef5dd67a232..e0645a34b55b 100644 --- a/include/uapi/linux/mshv.h +++ b/include/uapi/linux/mshv.h @@ -28,6 +28,7 @@ enum { MSHV_PT_BIT_GPA_SUPER_PAGES, MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES, MSHV_PT_BIT_NESTED_VIRTUALIZATION, + MSHV_PT_BIT_SMT_ENABLED_GUEST, MSHV_PT_BIT_COUNT, }; From 7db44aa173de03c170d4dfa5864ae126678a5ad5 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsburskii Date: Thu, 5 Feb 2026 18:42:10 +0000 Subject: [PATCH 31/34] mshv: Introduce hv_result_needs_memory() helper function Replace direct comparisons of hv_result(status) against HV_STATUS_INSUFFICIENT_MEMORY with a new hv_result_needs_memory() helper function. This improves code readability and provides a consistent and extendable interface for checking out-of-memory conditions in hypercall results. No functional changes intended. Signed-off-by: Stanislav Kinsburskii Reviewed-by: Anirudh Rayabharam (Microsoft) Reviewed-by: Mukesh R Signed-off-by: Wei Liu --- drivers/hv/hv_proc.c | 14 ++++++++++++-- drivers/hv/mshv_root_hv_call.c | 25 ++++++++++++------------- drivers/hv/mshv_root_main.c | 2 +- include/asm-generic/mshyperv.h | 3 +++ 4 files changed, 28 insertions(+), 16 deletions(-) diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c index fbb4eb3901bb..e53204b9e05d 100644 --- a/drivers/hv/hv_proc.c +++ b/drivers/hv/hv_proc.c @@ -110,6 +110,16 @@ free_buf: } EXPORT_SYMBOL_GPL(hv_call_deposit_pages); +bool hv_result_needs_memory(u64 status) +{ + switch (hv_result(status)) { + case HV_STATUS_INSUFFICIENT_MEMORY: + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(hv_result_needs_memory); + int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) { struct hv_input_add_logical_processor *input; @@ -137,7 +147,7 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) input, output); local_irq_restore(flags); - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (!hv_result_needs_memory(status)) { if (!hv_result_success(status)) { 
hv_status_err(status, "cpu %u apic ID: %u\n", lp_index, apic_id); @@ -179,7 +189,7 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL); local_irq_restore(irq_flags); - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (!hv_result_needs_memory(status)) { if (!hv_result_success(status)) { hv_status_err(status, "vcpu: %u, lp: %u\n", vp_index, flags); diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c index daee036e48bc..1c4a2dbf49c0 100644 --- a/drivers/hv/mshv_root_hv_call.c +++ b/drivers/hv/mshv_root_hv_call.c @@ -115,7 +115,7 @@ int hv_call_create_partition(u64 flags, status = hv_do_hypercall(HVCALL_CREATE_PARTITION, input, output); - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (!hv_result_needs_memory(status)) { if (hv_result_success(status)) *partition_id = output->partition_id; local_irq_restore(irq_flags); @@ -147,7 +147,7 @@ int hv_call_initialize_partition(u64 partition_id) status = hv_do_fast_hypercall8(HVCALL_INITIALIZE_PARTITION, *(u64 *)&input); - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (!hv_result_needs_memory(status)) { ret = hv_result_to_errno(status); break; } @@ -239,7 +239,7 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count, completed = hv_repcomp(status); - if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) { + if (hv_result_needs_memory(status)) { ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, HV_MAP_GPA_DEPOSIT_PAGES); if (ret) @@ -455,7 +455,7 @@ int hv_call_get_vp_state(u32 vp_index, u64 partition_id, status = hv_do_hypercall(control, input, output); - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (!hv_result_needs_memory(status)) { if (hv_result_success(status) && ret_output) memcpy(ret_output, output, sizeof(*output)); @@ -518,7 +518,7 @@ int hv_call_set_vp_state(u32 vp_index, u64 partition_id, status = 
hv_do_hypercall(control, input, NULL); - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (!hv_result_needs_memory(status)) { local_irq_restore(flags); ret = hv_result_to_errno(status); break; @@ -563,7 +563,7 @@ static int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, status = hv_do_hypercall(HVCALL_MAP_VP_STATE_PAGE, input, output); - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (!hv_result_needs_memory(status)) { if (hv_result_success(status)) *state_page = pfn_to_page(output->map_location); local_irq_restore(flags); @@ -718,7 +718,7 @@ hv_call_create_port(u64 port_partition_id, union hv_port_id port_id, if (hv_result_success(status)) break; - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (!hv_result_needs_memory(status)) { ret = hv_result_to_errno(status); break; } @@ -772,7 +772,7 @@ hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id, if (hv_result_success(status)) break; - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (!hv_result_needs_memory(status)) { ret = hv_result_to_errno(status); break; } @@ -850,7 +850,7 @@ static int hv_call_map_stats_page2(enum hv_stats_object_type type, if (!ret) break; - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (!hv_result_needs_memory(status)) { hv_status_debug(status, "\n"); break; } @@ -899,7 +899,7 @@ hv_call_map_stats_page(enum hv_stats_object_type type, struct hv_input_map_stats_page *input; struct hv_output_map_stats_page *output; u64 status, pfn; - int hv_status, ret = 0; + int ret = 0; do { local_irq_save(flags); @@ -915,13 +915,12 @@ hv_call_map_stats_page(enum hv_stats_object_type type, local_irq_restore(flags); - hv_status = hv_result(status); - if (hv_status != HV_STATUS_INSUFFICIENT_MEMORY) { + if (!hv_result_needs_memory(status)) { if (hv_result_success(status)) break; if (hv_stats_get_area_type(type, identity) == HV_STATS_AREA_PARENT && - hv_status == HV_STATUS_INVALID_PARAMETER) { + 
hv_result(status) == HV_STATUS_INVALID_PARAMETER) { *addr = NULL; return 0; } diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index 192467a25f66..17546f6f4e85 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -252,7 +252,7 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, if (hv_result_success(status)) break; - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) + if (!hv_result_needs_memory(status)) ret = hv_result_to_errno(status); else ret = hv_call_deposit_pages(NUMA_NO_NODE, diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index ecedab554c80..452426d5b2ab 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -342,6 +342,8 @@ static inline bool hv_parent_partition(void) { return hv_root_partition() || hv_l1vh_partition(); } + +bool hv_result_needs_memory(u64 status); int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages); int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id); int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags); @@ -350,6 +352,7 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags); static inline bool hv_root_partition(void) { return false; } static inline bool hv_l1vh_partition(void) { return false; } static inline bool hv_parent_partition(void) { return false; } +static inline bool hv_result_needs_memory(u64 status) { return false; } static inline int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages) { return -EOPNOTSUPP; From ede54383e646821b499873c1caf2dd97551da8eb Mon Sep 17 00:00:00 2001 From: Stanislav Kinsburskii Date: Thu, 5 Feb 2026 18:42:15 +0000 Subject: [PATCH 32/34] mshv: Introduce hv_deposit_memory helper functions Introduce hv_deposit_memory_node() and hv_deposit_memory() helper functions to handle memory deposit with proper error handling. 
The new hv_deposit_memory_node() function takes the hypervisor status as a parameter and validates it before depositing pages. It checks for HV_STATUS_INSUFFICIENT_MEMORY specifically and returns an error for unexpected status codes. This is a precursor patch to new out-of-memory error codes support. No functional changes intended. Signed-off-by: Stanislav Kinsburskii Reviewed-by: Anirudh Rayabharam (Microsoft) Reviewed-by: Mukesh R Signed-off-by: Wei Liu --- drivers/hv/hv_proc.c | 21 +++++++++++++++++++-- drivers/hv/mshv_root_hv_call.c | 25 +++++++++---------------- drivers/hv/mshv_root_main.c | 3 +-- include/asm-generic/mshyperv.h | 10 ++++++++++ 4 files changed, 39 insertions(+), 20 deletions(-) diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c index e53204b9e05d..53622e5886b8 100644 --- a/drivers/hv/hv_proc.c +++ b/drivers/hv/hv_proc.c @@ -110,6 +110,22 @@ free_buf: } EXPORT_SYMBOL_GPL(hv_call_deposit_pages); +int hv_deposit_memory_node(int node, u64 partition_id, + u64 hv_status) +{ + u32 num_pages = 1; + + switch (hv_result(hv_status)) { + case HV_STATUS_INSUFFICIENT_MEMORY: + break; + default: + hv_status_err(hv_status, "Unexpected!\n"); + return -ENOMEM; + } + return hv_call_deposit_pages(node, partition_id, num_pages); +} +EXPORT_SYMBOL_GPL(hv_deposit_memory_node); + bool hv_result_needs_memory(u64 status) { switch (hv_result(status)) { @@ -155,7 +171,8 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) } break; } - ret = hv_call_deposit_pages(node, hv_current_partition_id, 1); + ret = hv_deposit_memory_node(node, hv_current_partition_id, + status); } while (!ret); return ret; @@ -197,7 +214,7 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) } break; } - ret = hv_call_deposit_pages(node, partition_id, 1); + ret = hv_deposit_memory_node(node, partition_id, status); } while (!ret); diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c index 1c4a2dbf49c0..7f91096f95a8 100644 --- 
a/drivers/hv/mshv_root_hv_call.c +++ b/drivers/hv/mshv_root_hv_call.c @@ -123,8 +123,7 @@ int hv_call_create_partition(u64 flags, break; } local_irq_restore(irq_flags); - ret = hv_call_deposit_pages(NUMA_NO_NODE, - hv_current_partition_id, 1); + ret = hv_deposit_memory(hv_current_partition_id, status); } while (!ret); return ret; @@ -151,7 +150,7 @@ int hv_call_initialize_partition(u64 partition_id) ret = hv_result_to_errno(status); break; } - ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1); + ret = hv_deposit_memory(partition_id, status); } while (!ret); return ret; @@ -465,8 +464,7 @@ int hv_call_get_vp_state(u32 vp_index, u64 partition_id, } local_irq_restore(flags); - ret = hv_call_deposit_pages(NUMA_NO_NODE, - partition_id, 1); + ret = hv_deposit_memory(partition_id, status); } while (!ret); return ret; @@ -525,8 +523,7 @@ int hv_call_set_vp_state(u32 vp_index, u64 partition_id, } local_irq_restore(flags); - ret = hv_call_deposit_pages(NUMA_NO_NODE, - partition_id, 1); + ret = hv_deposit_memory(partition_id, status); } while (!ret); return ret; @@ -573,7 +570,7 @@ static int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, local_irq_restore(flags); - ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1); + ret = hv_deposit_memory(partition_id, status); } while (!ret); return ret; @@ -722,8 +719,7 @@ hv_call_create_port(u64 port_partition_id, union hv_port_id port_id, ret = hv_result_to_errno(status); break; } - ret = hv_call_deposit_pages(NUMA_NO_NODE, port_partition_id, 1); - + ret = hv_deposit_memory(port_partition_id, status); } while (!ret); return ret; @@ -776,8 +772,7 @@ hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id, ret = hv_result_to_errno(status); break; } - ret = hv_call_deposit_pages(NUMA_NO_NODE, - connection_partition_id, 1); + ret = hv_deposit_memory(connection_partition_id, status); } while (!ret); return ret; @@ -855,8 +850,7 @@ static int hv_call_map_stats_page2(enum 
hv_stats_object_type type, break; } - ret = hv_call_deposit_pages(NUMA_NO_NODE, - hv_current_partition_id, 1); + ret = hv_deposit_memory(hv_current_partition_id, status); } while (!ret); return ret; @@ -929,8 +923,7 @@ hv_call_map_stats_page(enum hv_stats_object_type type, return hv_result_to_errno(status); } - ret = hv_call_deposit_pages(NUMA_NO_NODE, - hv_current_partition_id, 1); + ret = hv_deposit_memory(hv_current_partition_id, status); if (ret) return ret; } while (!ret); diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index 17546f6f4e85..e6509c980763 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -255,8 +255,7 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, if (!hv_result_needs_memory(status)) ret = hv_result_to_errno(status); else - ret = hv_call_deposit_pages(NUMA_NO_NODE, - pt_id, 1); + ret = hv_deposit_memory(pt_id, status); } while (!ret); args.status = hv_result(status); diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 452426d5b2ab..d37b68238c97 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -344,6 +344,7 @@ static inline bool hv_parent_partition(void) } bool hv_result_needs_memory(u64 status); +int hv_deposit_memory_node(int node, u64 partition_id, u64 status); int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages); int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id); int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags); @@ -353,6 +354,10 @@ static inline bool hv_root_partition(void) { return false; } static inline bool hv_l1vh_partition(void) { return false; } static inline bool hv_parent_partition(void) { return false; } static inline bool hv_result_needs_memory(u64 status) { return false; } +static inline int hv_deposit_memory_node(int node, u64 partition_id, u64 status) +{ + return -EOPNOTSUPP; +} static inline int hv_call_deposit_pages(int node, 
u64 partition_id, u32 num_pages) { return -EOPNOTSUPP; @@ -367,6 +372,11 @@ static inline int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u3 } #endif /* CONFIG_MSHV_ROOT */ +static inline int hv_deposit_memory(u64 partition_id, u64 status) +{ + return hv_deposit_memory_node(NUMA_NO_NODE, partition_id, status); +} + #if IS_ENABLED(CONFIG_HYPERV_VTL_MODE) u8 __init get_vtl(void); #else From cf82dd5ea95815e6c0612b61118d2358ef5c05b0 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsburskii Date: Thu, 5 Feb 2026 18:42:21 +0000 Subject: [PATCH 33/34] mshv: Handle insufficient contiguous memory hypervisor status The HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY status indicates that the hypervisor lacks sufficient contiguous memory for its internal allocations. When this status is encountered, allocate and deposit HV_MAX_CONTIGUOUS_ALLOCATION_PAGES contiguous pages to the hypervisor. HV_MAX_CONTIGUOUS_ALLOCATION_PAGES is defined in the hypervisor headers; a deposit of this size will always satisfy the hypervisor's requirements.
Signed-off-by: Stanislav Kinsburskii Reviewed-by: Anirudh Rayabharam (Microsoft) Reviewed-by: Mukesh R Signed-off-by: Wei Liu --- drivers/hv/hv_common.c | 1 + drivers/hv/hv_proc.c | 4 ++++ include/hyperv/hvgdk_mini.h | 1 + include/hyperv/hvhdk_mini.h | 2 ++ 4 files changed, 8 insertions(+) diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index f1c17fb60dc1..f20596276662 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -793,6 +793,7 @@ static const struct hv_status_info hv_status_infos[] = { _STATUS_INFO(HV_STATUS_UNKNOWN_PROPERTY, -EIO), _STATUS_INFO(HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE, -EIO), _STATUS_INFO(HV_STATUS_INSUFFICIENT_MEMORY, -ENOMEM), + _STATUS_INFO(HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY, -ENOMEM), _STATUS_INFO(HV_STATUS_INVALID_PARTITION_ID, -EINVAL), _STATUS_INFO(HV_STATUS_INVALID_VP_INDEX, -EINVAL), _STATUS_INFO(HV_STATUS_NOT_FOUND, -EIO), diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c index 53622e5886b8..181f6d02bce3 100644 --- a/drivers/hv/hv_proc.c +++ b/drivers/hv/hv_proc.c @@ -118,6 +118,9 @@ int hv_deposit_memory_node(int node, u64 partition_id, switch (hv_result(hv_status)) { case HV_STATUS_INSUFFICIENT_MEMORY: break; + case HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY: + num_pages = HV_MAX_CONTIGUOUS_ALLOCATION_PAGES; + break; default: hv_status_err(hv_status, "Unexpected!\n"); return -ENOMEM; @@ -130,6 +133,7 @@ bool hv_result_needs_memory(u64 status) { switch (hv_result(status)) { case HV_STATUS_INSUFFICIENT_MEMORY: + case HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY: return true; } return false; diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index d9aa5afb0a27..fa2fb91a6470 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -38,6 +38,7 @@ struct hv_u128 { #define HV_STATUS_INVALID_LP_INDEX 0x41 #define HV_STATUS_INVALID_REGISTER_VALUE 0x50 #define HV_STATUS_OPERATION_FAILED 0x71 +#define HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY 0x75 #define 
HV_STATUS_TIME_OUT 0x78 #define HV_STATUS_CALL_PENDING 0x79 #define HV_STATUS_VTL_ALREADY_ENABLED 0x86 diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h index c0300910808b..091c03e26046 100644 --- a/include/hyperv/hvhdk_mini.h +++ b/include/hyperv/hvhdk_mini.h @@ -7,6 +7,8 @@ #include "hvgdk_mini.h" +#define HV_MAX_CONTIGUOUS_ALLOCATION_PAGES 8 + /* * Doorbell connection_info flags. */ From 158ebb578cd5f7881fdc7c4ecebddcf9463f91fd Mon Sep 17 00:00:00 2001 From: Stanislav Kinsburskii Date: Thu, 5 Feb 2026 18:42:27 +0000 Subject: [PATCH 34/34] mshv: Handle insufficient root memory hypervisor statuses When creating guest partition objects, the hypervisor may fail to allocate root partition pages and return an insufficient memory status. In this case, deposit memory using the root partition ID instead. Signed-off-by: Stanislav Kinsburskii Reviewed-by: Anirudh Rayabharam (Microsoft) Reviewed-by: Mukesh R Signed-off-by: Wei Liu --- drivers/hv/hv_common.c | 2 ++ drivers/hv/hv_proc.c | 14 +++++++++ include/hyperv/hvgdk_mini.h | 58 +++++++++++++++++++------------------ 3 files changed, 46 insertions(+), 28 deletions(-) diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index f20596276662..6b67ac616789 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -794,6 +794,8 @@ static const struct hv_status_info hv_status_infos[] = { _STATUS_INFO(HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE, -EIO), _STATUS_INFO(HV_STATUS_INSUFFICIENT_MEMORY, -ENOMEM), _STATUS_INFO(HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY, -ENOMEM), + _STATUS_INFO(HV_STATUS_INSUFFICIENT_ROOT_MEMORY, -ENOMEM), + _STATUS_INFO(HV_STATUS_INSUFFICIENT_CONTIGUOUS_ROOT_MEMORY, -ENOMEM), _STATUS_INFO(HV_STATUS_INVALID_PARTITION_ID, -EINVAL), _STATUS_INFO(HV_STATUS_INVALID_VP_INDEX, -EINVAL), _STATUS_INFO(HV_STATUS_NOT_FOUND, -EIO), diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c index 181f6d02bce3..5f4fd9c3231c 100644 --- a/drivers/hv/hv_proc.c +++ b/drivers/hv/hv_proc.c @@ 
-121,6 +121,18 @@ int hv_deposit_memory_node(int node, u64 partition_id, case HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY: num_pages = HV_MAX_CONTIGUOUS_ALLOCATION_PAGES; break; + + case HV_STATUS_INSUFFICIENT_CONTIGUOUS_ROOT_MEMORY: + num_pages = HV_MAX_CONTIGUOUS_ALLOCATION_PAGES; + fallthrough; + case HV_STATUS_INSUFFICIENT_ROOT_MEMORY: + if (!hv_root_partition()) { + hv_status_err(hv_status, "Unexpected root memory deposit\n"); + return -ENOMEM; + } + partition_id = HV_PARTITION_ID_SELF; + break; + default: hv_status_err(hv_status, "Unexpected!\n"); return -ENOMEM; @@ -134,6 +146,8 @@ bool hv_result_needs_memory(u64 status) switch (hv_result(status)) { case HV_STATUS_INSUFFICIENT_MEMORY: case HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY: + case HV_STATUS_INSUFFICIENT_ROOT_MEMORY: + case HV_STATUS_INSUFFICIENT_CONTIGUOUS_ROOT_MEMORY: return true; } return false; diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index fa2fb91a6470..056ef7b6b360 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -14,34 +14,36 @@ struct hv_u128 { } __packed; /* NOTE: when adding below, update hv_result_to_string() */ -#define HV_STATUS_SUCCESS 0x0 -#define HV_STATUS_INVALID_HYPERCALL_CODE 0x2 -#define HV_STATUS_INVALID_HYPERCALL_INPUT 0x3 -#define HV_STATUS_INVALID_ALIGNMENT 0x4 -#define HV_STATUS_INVALID_PARAMETER 0x5 -#define HV_STATUS_ACCESS_DENIED 0x6 -#define HV_STATUS_INVALID_PARTITION_STATE 0x7 -#define HV_STATUS_OPERATION_DENIED 0x8 -#define HV_STATUS_UNKNOWN_PROPERTY 0x9 -#define HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE 0xA -#define HV_STATUS_INSUFFICIENT_MEMORY 0xB -#define HV_STATUS_INVALID_PARTITION_ID 0xD -#define HV_STATUS_INVALID_VP_INDEX 0xE -#define HV_STATUS_NOT_FOUND 0x10 -#define HV_STATUS_INVALID_PORT_ID 0x11 -#define HV_STATUS_INVALID_CONNECTION_ID 0x12 -#define HV_STATUS_INSUFFICIENT_BUFFERS 0x13 -#define HV_STATUS_NOT_ACKNOWLEDGED 0x14 -#define HV_STATUS_INVALID_VP_STATE 0x15 -#define HV_STATUS_NO_RESOURCES 0x1D -#define 
HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED 0x20 -#define HV_STATUS_INVALID_LP_INDEX 0x41 -#define HV_STATUS_INVALID_REGISTER_VALUE 0x50 -#define HV_STATUS_OPERATION_FAILED 0x71 -#define HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY 0x75 -#define HV_STATUS_TIME_OUT 0x78 -#define HV_STATUS_CALL_PENDING 0x79 -#define HV_STATUS_VTL_ALREADY_ENABLED 0x86 +#define HV_STATUS_SUCCESS 0x0 +#define HV_STATUS_INVALID_HYPERCALL_CODE 0x2 +#define HV_STATUS_INVALID_HYPERCALL_INPUT 0x3 +#define HV_STATUS_INVALID_ALIGNMENT 0x4 +#define HV_STATUS_INVALID_PARAMETER 0x5 +#define HV_STATUS_ACCESS_DENIED 0x6 +#define HV_STATUS_INVALID_PARTITION_STATE 0x7 +#define HV_STATUS_OPERATION_DENIED 0x8 +#define HV_STATUS_UNKNOWN_PROPERTY 0x9 +#define HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE 0xA +#define HV_STATUS_INSUFFICIENT_MEMORY 0xB +#define HV_STATUS_INVALID_PARTITION_ID 0xD +#define HV_STATUS_INVALID_VP_INDEX 0xE +#define HV_STATUS_NOT_FOUND 0x10 +#define HV_STATUS_INVALID_PORT_ID 0x11 +#define HV_STATUS_INVALID_CONNECTION_ID 0x12 +#define HV_STATUS_INSUFFICIENT_BUFFERS 0x13 +#define HV_STATUS_NOT_ACKNOWLEDGED 0x14 +#define HV_STATUS_INVALID_VP_STATE 0x15 +#define HV_STATUS_NO_RESOURCES 0x1D +#define HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED 0x20 +#define HV_STATUS_INVALID_LP_INDEX 0x41 +#define HV_STATUS_INVALID_REGISTER_VALUE 0x50 +#define HV_STATUS_OPERATION_FAILED 0x71 +#define HV_STATUS_INSUFFICIENT_ROOT_MEMORY 0x73 +#define HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY 0x75 +#define HV_STATUS_TIME_OUT 0x78 +#define HV_STATUS_CALL_PENDING 0x79 +#define HV_STATUS_INSUFFICIENT_CONTIGUOUS_ROOT_MEMORY 0x83 +#define HV_STATUS_VTL_ALREADY_ENABLED 0x86 /* * The Hyper-V TimeRefCount register and the TSC