From cd4178d19420359554e3da6fd77ecfd0f58067ce Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 Jun 2025 16:51:47 -0700 Subject: [PATCH 001/101] KVM: arm64: WARN if unmapping a vLPI fails in any path When unmapping a vLPI, WARN if nullifying vCPU affinity fails, not just if failure occurs when freeing an ITE. If undoing vCPU affinity fails, then odds are very good that vLPI state tracking has gotten out of whack, i.e. that KVM and the GIC disagree on the state of an IRQ/vLPI. At best, inconsistent state means there is a lurking bug/flaw somewhere. At worst, the inconsistency could eventually be fatal to the host, e.g. if an ITS command fails because KVM's view of things doesn't match reality/hardware. Note, only the call from kvm_arch_irq_bypass_del_producer() by way of kvm_vgic_v4_unset_forwarding() doesn't already WARN. Common KVM's kvm_irq_routing_update() WARNs if kvm_arch_update_irqfd_routing() fails. For that path, if its_unmap_vlpi() fails in kvm_vgic_v4_unset_forwarding(), the only possible causes are that the GIC doesn't have a v4 ITS (from its_irq_set_vcpu_affinity()): /* Need a v4 ITS */ if (!is_v4(its_dev->its)) return -EINVAL; guard(raw_spinlock)(&its_dev->event_map.vlpi_lock); /* Unmap request? */ if (!info) return its_vlpi_unmap(d); or that KVM has gotten out of sync with the GIC/ITS (from its_vlpi_unmap()): if (!its_dev->event_map.vm || !irqd_is_forwarded_to_vcpu(d)) return -EINVAL; All of the above failure scenarios are warnable offences, as they should never occur absent a kernel/KVM bug. Acked-by: Oliver Upton Link: https://lore.kernel.org/all/aFWY2LTVIxz5rfhh@google.com Signed-off-by: Sean Christopherson --- arch/arm64/kvm/vgic/vgic-its.c | 2 +- arch/arm64/kvm/vgic/vgic-v4.c | 4 ++-- drivers/irqchip/irq-gic-v4.c | 4 ++-- include/linux/irqchip/arm-gic-v4.h | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 534049c7c94b..98630dae910d 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -758,7 +758,7 @@ static void its_free_ite(struct kvm *kvm, struct its_ite *ite) if (irq) { scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) { if (irq->hw) - WARN_ON(its_unmap_vlpi(ite->irq->host_irq)); + its_unmap_vlpi(ite->irq->host_irq); irq->hw = false; } diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index 193946108192..911170d4a9c8 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -545,10 +545,10 @@ int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq) if (irq->hw) { atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count); irq->hw = false; - ret = its_unmap_vlpi(host_irq); + its_unmap_vlpi(host_irq); } raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(kvm, irq); - return ret; + return 0; } diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c index 58c28895f8c4..8455b4a5fbb0 100644 --- a/drivers/irqchip/irq-gic-v4.c +++ b/drivers/irqchip/irq-gic-v4.c @@ -342,10 +342,10 @@ int its_get_vlpi(int irq, struct its_vlpi_map *map) return irq_set_vcpu_affinity(irq, &info); } -int its_unmap_vlpi(int irq) +void its_unmap_vlpi(int irq) { irq_clear_status_flags(irq, IRQ_DISABLE_UNLAZY); - return irq_set_vcpu_affinity(irq, NULL); + WARN_ON_ONCE(irq_set_vcpu_affinity(irq, NULL)); } int its_prop_update_vlpi(int irq, u8 config, bool inv) diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index
7f1f11a5e4e4..0b0887099fd7 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -146,7 +146,7 @@ int its_commit_vpe(struct its_vpe *vpe); int its_invall_vpe(struct its_vpe *vpe); int its_map_vlpi(int irq, struct its_vlpi_map *map); int its_get_vlpi(int irq, struct its_vlpi_map *map); -int its_unmap_vlpi(int irq); +void its_unmap_vlpi(int irq); int its_prop_update_vlpi(int irq, u8 config, bool inv); int its_prop_update_vsgi(int irq, u8 priority, bool group); From fa079a0616edbcdad538128306abbc19b68a9863 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:27 -0700 Subject: [PATCH 002/101] irqbypass: Drop pointless and misleading THIS_MODULE get/put Drop irqbypass.ko's superfluous and misleading get/put calls on THIS_MODULE. A module taking a reference to itself is useless; no amount of checks will prevent doom and destruction if the caller hasn't already guaranteed the liveliness of the module (this goes for any module). E.g. if try_module_get() fails because irqbypass.ko is being unloaded, then the kernel has already hit a use-after-free by virtue of executing code whose lifecycle is tied to irqbypass.ko. Reviewed-by: Kevin Tian Acked-by: Michael S. Tsirkin Reviewed-by: Alex Williamson Link: https://lore.kernel.org/r/20250516230734.2564775-2-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/lib/irqbypass.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index 28fda42e471b..080c706f3b01 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -92,9 +92,6 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer) might_sleep(); - if (!try_module_get(THIS_MODULE)) - return -ENODEV; - mutex_lock(&lock); list_for_each_entry(tmp, &producers, node) { @@ -120,7 +117,6 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer) return 0; out_err: mutex_unlock(&lock); - module_put(THIS_MODULE); return ret; } EXPORT_SYMBOL_GPL(irq_bypass_register_producer); @@ -142,9 +138,6 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) might_sleep(); - if (!try_module_get(THIS_MODULE)) - return; /* nothing in the list anyway */ - mutex_lock(&lock); list_for_each_entry(tmp, &producers, node) { @@ -159,13 +152,10 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) } list_del(&producer->node); - module_put(THIS_MODULE); break; } mutex_unlock(&lock); - - module_put(THIS_MODULE); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); @@ -188,9 +178,6 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) might_sleep(); - if (!try_module_get(THIS_MODULE)) - return -ENODEV; - mutex_lock(&lock); list_for_each_entry(tmp, &consumers, node) { @@ -216,7 +203,6 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) return 0; out_err: mutex_unlock(&lock); - module_put(THIS_MODULE); return ret; } EXPORT_SYMBOL_GPL(irq_bypass_register_consumer); @@ -238,9 +224,6 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) might_sleep(); - if (!try_module_get(THIS_MODULE)) - return; /* nothing in the list anyway */ - mutex_lock(&lock); list_for_each_entry(tmp, &consumers, node) { @@ -255,12 +238,9 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) } list_del(&consumer->node); - module_put(THIS_MODULE); break; } mutex_unlock(&lock); - - module_put(THIS_MODULE); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer); From 
07fbc83c01520c62c89f6495f2f0bea2f4ac6684 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:28 -0700 Subject: [PATCH 003/101] irqbypass: Drop superfluous might_sleep() annotations Drop superfluous might_sleep() annotations from irqbypass, mutex_lock() provides all of the necessary tracking. Reviewed-by: Kevin Tian Acked-by: Michael S. Tsirkin Reviewed-by: Alex Williamson Link: https://lore.kernel.org/r/20250516230734.2564775-3-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/lib/irqbypass.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index 080c706f3b01..28a4d933569a 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -90,8 +90,6 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer) if (!producer->token) return -EINVAL; - might_sleep(); - mutex_lock(&lock); list_for_each_entry(tmp, &producers, node) { @@ -136,8 +134,6 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) if (!producer->token) return; - might_sleep(); - mutex_lock(&lock); list_for_each_entry(tmp, &producers, node) { @@ -176,8 +172,6 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) !consumer->add_producer || !consumer->del_producer) return -EINVAL; - might_sleep(); - mutex_lock(&lock); list_for_each_entry(tmp, &consumers, node) { @@ -222,8 +216,6 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) if (!consumer->token) return; - might_sleep(); - mutex_lock(&lock); list_for_each_entry(tmp, &consumers, node) { From 2b521d86ee80a436a92445b8206d38d75aeb39ea Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:29 -0700 Subject: [PATCH 004/101] irqbypass: Take ownership of producer/consumer token tracking Move ownership of IRQ bypass token tracking into irqbypass.ko, and explicitly require callers to pass an eventfd_ctx structure instead of a completely opaque token. Relying on producers and consumers to set the token appropriately is error prone, and hiding the fact that the token must be an eventfd_ctx pointer (for all intents and purposes) unnecessarily obfuscates the code and makes it more brittle. Reviewed-by: Kevin Tian Acked-by: Michael S. 
Tsirkin Reviewed-by: Alex Williamson Link: https://lore.kernel.org/r/20250516230734.2564775-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 4 +-- drivers/vfio/pci/vfio_pci_intrs.c | 9 +++---- drivers/vhost/vdpa.c | 8 +++--- include/linux/irqbypass.h | 35 +++++++++++++----------- virt/kvm/eventfd.c | 7 +++-- virt/lib/irqbypass.c | 44 ++++++++++++++++++++----------- 6 files changed, 58 insertions(+), 49 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b58a74c1722d..3dc93e2d4777 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13669,8 +13669,8 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); if (ret) - printk(KERN_INFO "irq bypass consumer (token %p) unregistration" - " fails: %d\n", irqfd->consumer.token, ret); + printk(KERN_INFO "irq bypass consumer (eventfd %p) unregistration" + " fails: %d\n", irqfd->consumer.eventfd, ret); spin_unlock_irq(&kvm->irqfds.lock); diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 565966351dfa..d87fe116762a 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -505,15 +505,12 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, if (ret) goto out_put_eventfd_ctx; - ctx->producer.token = trigger; ctx->producer.irq = irq; - ret = irq_bypass_register_producer(&ctx->producer); + ret = irq_bypass_register_producer(&ctx->producer, trigger); if (unlikely(ret)) { dev_info(&pdev->dev, - "irq bypass producer (token %p) registration fails: %d\n", - ctx->producer.token, ret); - - ctx->producer.token = NULL; + "irq bypass producer (eventfd %p) registration fails: %d\n", + trigger, ret); } ctx->trigger = trigger; diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 5a49b5a6d496..7b265ffda697 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -213,10 +213,10 @@ static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, u16 qid) return; vq->call_ctx.producer.irq = irq; - ret = irq_bypass_register_producer(&vq->call_ctx.producer); + ret = irq_bypass_register_producer(&vq->call_ctx.producer, vq->call_ctx.ctx); if (unlikely(ret)) - dev_info(&v->dev, "vq %u, irq bypass producer (token %p) registration fails, ret = %d\n", - qid, vq->call_ctx.producer.token, ret); + dev_info(&v->dev, "vq %u, irq bypass producer (eventfd %p) registration fails, ret = %d\n", + qid, vq->call_ctx.ctx, ret); } static void vhost_vdpa_unsetup_vq_irq(struct vhost_vdpa *v, u16 qid) @@ -712,7 +712,6 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, if (ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK) vhost_vdpa_unsetup_vq_irq(v, idx); - vq->call_ctx.producer.token = NULL; } break; } @@ -753,7 +752,6 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, cb.callback = vhost_vdpa_virtqueue_cb; cb.private = vq; cb.trigger = vq->call_ctx.ctx; - vq->call_ctx.producer.token = vq->call_ctx.ctx; if (ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK) vhost_vdpa_setup_vq_irq(v, idx); diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h index 9bdb2a781841..1b57d15ac4cf 100644 --- a/include/linux/irqbypass.h +++ b/include/linux/irqbypass.h @@ -10,6 +10,7 @@ #include +struct eventfd_ctx; struct irq_bypass_consumer; /* @@ -18,20 +19,20 @@ struct irq_bypass_consumer; * The IRQ bypass manager is a simple set of lists and callbacks that allows * IRQ producers (ex. 
physical interrupt sources) to be matched to IRQ * consumers (ex. virtualization hardware that allows IRQ bypass or offload) - * via a shared token (ex. eventfd_ctx). Producers and consumers register - * independently. When a token match is found, the optional @stop callback - * will be called for each participant. The pair will then be connected via - * the @add_* callbacks, and finally the optional @start callback will allow - * any final coordination. When either participant is unregistered, the - * process is repeated using the @del_* callbacks in place of the @add_* - * callbacks. Match tokens must be unique per producer/consumer, 1:N pairings - * are not supported. + * via a shared eventfd_ctx. Producers and consumers register independently. + * When a producer and consumer are paired, i.e. an eventfd match is found, the + * optional @stop callback will be called for each participant. The pair will + * then be connected via the @add_* callbacks, and finally the optional @start + * callback will allow any final coordination. When either participant is + * unregistered, the process is repeated using the @del_* callbacks in place of + * the @add_* callbacks. eventfds must be unique per producer/consumer, 1:N + * pairings are not supported. */ /** * struct irq_bypass_producer - IRQ bypass producer definition * @node: IRQ bypass manager private list management - * @token: opaque token to match between producer and consumer (non-NULL) + * @eventfd: eventfd context used to match producers and consumers * @irq: Linux IRQ number for the producer device * @add_consumer: Connect the IRQ producer to an IRQ consumer (optional) * @del_consumer: Disconnect the IRQ producer from an IRQ consumer (optional) @@ -44,7 +45,7 @@ struct irq_bypass_consumer; */ struct irq_bypass_producer { struct list_head node; - void *token; + struct eventfd_ctx *eventfd; int irq; int (*add_consumer)(struct irq_bypass_producer *, struct irq_bypass_consumer *); @@ -57,7 +58,7 @@ struct irq_bypass_producer { /** * struct irq_bypass_consumer - IRQ bypass consumer definition * @node: IRQ bypass manager private list management - * @token: opaque token to match between producer and consumer (non-NULL) + * @eventfd: eventfd context used to match producers and consumers * @add_producer: Connect the IRQ consumer to an IRQ producer * @del_producer: Disconnect the IRQ consumer from an IRQ producer * @stop: Perform any quiesce operations necessary prior to add/del (optional) @@ -70,7 +71,7 @@ struct irq_bypass_producer { */ struct irq_bypass_consumer { struct list_head node; - void *token; + struct eventfd_ctx *eventfd; int (*add_producer)(struct irq_bypass_consumer *, struct irq_bypass_producer *); void (*del_producer)(struct irq_bypass_consumer *, @@ -79,9 +80,11 @@ struct irq_bypass_consumer { void (*start)(struct irq_bypass_consumer *); }; -int irq_bypass_register_producer(struct irq_bypass_producer *); -void irq_bypass_unregister_producer(struct irq_bypass_producer *); -int irq_bypass_register_consumer(struct irq_bypass_consumer *); -void irq_bypass_unregister_consumer(struct irq_bypass_consumer *); +int irq_bypass_register_producer(struct irq_bypass_producer *producer, + struct eventfd_ctx *eventfd); +void irq_bypass_unregister_producer(struct irq_bypass_producer *producer); +int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer, + struct eventfd_ctx *eventfd); +void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer); #endif /* IRQBYPASS_H */ diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c 
index 11e5d1e3f12e..5bc6abe30748 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -426,15 +426,14 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) if (kvm_arch_has_irq_bypass()) { - irqfd->consumer.token = (void *)irqfd->eventfd; irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer; irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer; irqfd->consumer.stop = kvm_arch_irq_bypass_stop; irqfd->consumer.start = kvm_arch_irq_bypass_start; - ret = irq_bypass_register_consumer(&irqfd->consumer); + ret = irq_bypass_register_consumer(&irqfd->consumer, irqfd->eventfd); if (ret) - pr_info("irq bypass consumer (token %p) registration fails: %d\n", - irqfd->consumer.token, ret); + pr_info("irq bypass consumer (eventfd %p) registration fails: %d\n", + irqfd->eventfd, ret); } #endif diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index 28a4d933569a..e8d7c420db52 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -77,30 +77,32 @@ static void __disconnect(struct irq_bypass_producer *prod, /** * irq_bypass_register_producer - register IRQ bypass producer * @producer: pointer to producer structure + * @eventfd: pointer to the eventfd context associated with the producer * * Add the provided IRQ producer to the list of producers and connect - * with any matching token found on the IRQ consumers list. + * with any matching eventfd found on the IRQ consumers list. */ -int irq_bypass_register_producer(struct irq_bypass_producer *producer) +int irq_bypass_register_producer(struct irq_bypass_producer *producer, + struct eventfd_ctx *eventfd) { struct irq_bypass_producer *tmp; struct irq_bypass_consumer *consumer; int ret; - if (!producer->token) + if (WARN_ON_ONCE(producer->eventfd)) return -EINVAL; mutex_lock(&lock); list_for_each_entry(tmp, &producers, node) { - if (tmp->token == producer->token) { + if (tmp->eventfd == eventfd) { ret = -EBUSY; goto out_err; } } list_for_each_entry(consumer, &consumers, node) { - if (consumer->token == producer->token) { + if (consumer->eventfd == eventfd) { ret = __connect(producer, consumer); if (ret) goto out_err; @@ -108,6 +110,7 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer) } } + producer->eventfd = eventfd; list_add(&producer->node, &producers); mutex_unlock(&lock); @@ -131,26 +134,28 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) struct irq_bypass_producer *tmp; struct irq_bypass_consumer *consumer; - if (!producer->token) + if (!producer->eventfd) return; mutex_lock(&lock); list_for_each_entry(tmp, &producers, node) { - if (tmp->token != producer->token) + if (tmp->eventfd != producer->eventfd) continue; list_for_each_entry(consumer, &consumers, node) { - if (consumer->token == producer->token) { + if (consumer->eventfd == producer->eventfd) { __disconnect(producer, consumer); break; } } + producer->eventfd = NULL; list_del(&producer->node); break; } + WARN_ON_ONCE(producer->eventfd); mutex_unlock(&lock); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); @@ -158,31 +163,35 @@ EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); /** * irq_bypass_register_consumer - register IRQ bypass consumer * @consumer: pointer to consumer structure + * @eventfd: pointer to the eventfd context associated with the consumer * * Add the provided IRQ consumer to the list of consumers and connect - * with any matching token found on the IRQ producer list. + * with any matching eventfd found on the IRQ producer list. 
*/ -int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) +int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer, + struct eventfd_ctx *eventfd) { struct irq_bypass_consumer *tmp; struct irq_bypass_producer *producer; int ret; - if (!consumer->token || - !consumer->add_producer || !consumer->del_producer) + if (WARN_ON_ONCE(consumer->eventfd)) + return -EINVAL; + + if (!consumer->add_producer || !consumer->del_producer) return -EINVAL; mutex_lock(&lock); list_for_each_entry(tmp, &consumers, node) { - if (tmp->token == consumer->token || tmp == consumer) { + if (tmp->eventfd == eventfd) { ret = -EBUSY; goto out_err; } } list_for_each_entry(producer, &producers, node) { - if (producer->token == consumer->token) { + if (producer->eventfd == eventfd) { ret = __connect(producer, consumer); if (ret) goto out_err; @@ -190,6 +199,7 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) } } + consumer->eventfd = eventfd; list_add(&consumer->node, &consumers); mutex_unlock(&lock); @@ -213,7 +223,7 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) struct irq_bypass_consumer *tmp; struct irq_bypass_producer *producer; - if (!consumer->token) + if (!consumer->eventfd) return; mutex_lock(&lock); @@ -223,16 +233,18 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) continue; list_for_each_entry(producer, &producers, node) { - if (producer->token == consumer->token) { + if (producer->eventfd == consumer->eventfd) { __disconnect(producer, consumer); break; } } + consumer->eventfd = NULL; list_del(&consumer->node); break; } + WARN_ON_ONCE(consumer->eventfd); mutex_unlock(&lock); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer); From add57f493e0893ac0fb4acbdc441918d3e800f10 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:30 -0700 Subject: [PATCH 005/101] irqbypass: Explicitly track producer and consumer bindings Explicitly track IRQ bypass producer:consumer bindings. This will allow making removal an O(1) operation; searching through the list to find information that is trivially tracked (and useful for debug) is wasteful. Reviewed-by: Kevin Tian Acked-by: Michael S. Tsirkin Reviewed-by: Alex Williamson Link: https://lore.kernel.org/r/20250516230734.2564775-5-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/irqbypass.h | 7 +++++++ virt/lib/irqbypass.c | 9 +++++++++ 2 files changed, 16 insertions(+) diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h index 1b57d15ac4cf..b28197c87483 100644 --- a/include/linux/irqbypass.h +++ b/include/linux/irqbypass.h @@ -29,10 +29,13 @@ struct irq_bypass_consumer; * pairings are not supported. 
*/ +struct irq_bypass_consumer; + /** * struct irq_bypass_producer - IRQ bypass producer definition * @node: IRQ bypass manager private list management * @eventfd: eventfd context used to match producers and consumers + * @consumer: The connected consumer (NULL if no connection) * @irq: Linux IRQ number for the producer device * @add_consumer: Connect the IRQ producer to an IRQ consumer (optional) * @del_consumer: Disconnect the IRQ producer from an IRQ consumer (optional) @@ -46,6 +49,7 @@ struct irq_bypass_consumer; struct irq_bypass_producer { struct list_head node; struct eventfd_ctx *eventfd; + struct irq_bypass_consumer *consumer; int irq; int (*add_consumer)(struct irq_bypass_producer *, struct irq_bypass_consumer *); @@ -59,6 +63,7 @@ struct irq_bypass_producer { * struct irq_bypass_consumer - IRQ bypass consumer definition * @node: IRQ bypass manager private list management * @eventfd: eventfd context used to match producers and consumers + * @producer: The connected producer (NULL if no connection) * @add_producer: Connect the IRQ consumer to an IRQ producer * @del_producer: Disconnect the IRQ consumer from an IRQ producer * @stop: Perform any quiesce operations necessary prior to add/del (optional) @@ -72,6 +77,8 @@ struct irq_bypass_producer { struct irq_bypass_consumer { struct list_head node; struct eventfd_ctx *eventfd; + struct irq_bypass_producer *producer; + int (*add_producer)(struct irq_bypass_consumer *, struct irq_bypass_producer *); void (*del_producer)(struct irq_bypass_consumer *, diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index e8d7c420db52..fdbf7ecc0c21 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -51,6 +51,10 @@ static int __connect(struct irq_bypass_producer *prod, if (prod->start) prod->start(prod); + if (!ret) { + prod->consumer = cons; + cons->producer = prod; + } return ret; } @@ -72,6 +76,9 @@ static void __disconnect(struct irq_bypass_producer *prod, cons->start(cons); if (prod->start) prod->start(prod); + + prod->consumer = NULL; + cons->producer = NULL; } /** @@ -145,6 +152,7 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) list_for_each_entry(consumer, &consumers, node) { if (consumer->eventfd == producer->eventfd) { + WARN_ON_ONCE(producer->consumer != consumer); __disconnect(producer, consumer); break; } @@ -234,6 +242,7 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) list_for_each_entry(producer, &producers, node) { if (producer->eventfd == consumer->eventfd) { + WARN_ON_ONCE(consumer->producer != producer); __disconnect(producer, consumer); break; } From 5d7dbdce388b43cf3a9bc50c4132493de26aeba4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:31 -0700 Subject: [PATCH 006/101] irqbypass: Use paired consumer/producer to disconnect during unregister Use the paired consumer/producer information to disconnect IRQ bypass producers/consumers in O(1) time (ignoring the cost of __disconnect()). Reviewed-by: Kevin Tian Acked-by: Michael S. 
Tsirkin Reviewed-by: Alex Williamson Link: https://lore.kernel.org/r/20250516230734.2564775-6-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/lib/irqbypass.c | 48 ++++++++------------------------------------ 1 file changed, 8 insertions(+), 40 deletions(-) diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index fdbf7ecc0c21..6a183459dc44 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -138,32 +138,16 @@ EXPORT_SYMBOL_GPL(irq_bypass_register_producer); */ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) { - struct irq_bypass_producer *tmp; - struct irq_bypass_consumer *consumer; - if (!producer->eventfd) return; mutex_lock(&lock); - list_for_each_entry(tmp, &producers, node) { - if (tmp->eventfd != producer->eventfd) - continue; + if (producer->consumer) + __disconnect(producer, producer->consumer); - list_for_each_entry(consumer, &consumers, node) { - if (consumer->eventfd == producer->eventfd) { - WARN_ON_ONCE(producer->consumer != consumer); - __disconnect(producer, consumer); - break; - } - } - - producer->eventfd = NULL; - list_del(&producer->node); - break; - } - - WARN_ON_ONCE(producer->eventfd); + producer->eventfd = NULL; + list_del(&producer->node); mutex_unlock(&lock); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); @@ -228,32 +212,16 @@ EXPORT_SYMBOL_GPL(irq_bypass_register_consumer); */ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) { - struct irq_bypass_consumer *tmp; - struct irq_bypass_producer *producer; - if (!consumer->eventfd) return; mutex_lock(&lock); - list_for_each_entry(tmp, &consumers, node) { - if (tmp != consumer) - continue; + if (consumer->producer) + __disconnect(consumer->producer, consumer); - list_for_each_entry(producer, &producers, node) { - if (producer->eventfd == consumer->eventfd) { - WARN_ON_ONCE(consumer->producer != producer); - __disconnect(producer, consumer); - break; - } - } - - consumer->eventfd = NULL; - list_del(&consumer->node); - break; - } - - WARN_ON_ONCE(consumer->eventfd); + consumer->eventfd = NULL; + list_del(&consumer->node); mutex_unlock(&lock); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer); From 46a4bfd0ae480cabbacc56fe0d8f91cbe229c7ce Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:32 -0700 Subject: [PATCH 007/101] irqbypass: Use guard(mutex) in lieu of manual lock+unlock Use guard(mutex) to clean up irqbypass's error handling. Reviewed-by: Kevin Tian Acked-by: Michael S. 
Tsirkin Reviewed-by: Alex Williamson Link: https://lore.kernel.org/r/20250516230734.2564775-7-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/lib/irqbypass.c | 38 ++++++++++---------------------- 1 file changed, 10 insertions(+), 28 deletions(-) diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index 6a183459dc44..828556c081f5 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -99,33 +99,25 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer, if (WARN_ON_ONCE(producer->eventfd)) return -EINVAL; - mutex_lock(&lock); + guard(mutex)(&lock); list_for_each_entry(tmp, &producers, node) { - if (tmp->eventfd == eventfd) { - ret = -EBUSY; - goto out_err; - } + if (tmp->eventfd == eventfd) + return -EBUSY; } list_for_each_entry(consumer, &consumers, node) { if (consumer->eventfd == eventfd) { ret = __connect(producer, consumer); if (ret) - goto out_err; + return ret; break; } } producer->eventfd = eventfd; list_add(&producer->node, &producers); - - mutex_unlock(&lock); - return 0; -out_err: - mutex_unlock(&lock); - return ret; } EXPORT_SYMBOL_GPL(irq_bypass_register_producer); @@ -141,14 +133,13 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) if (!producer->eventfd) return; - mutex_lock(&lock); + guard(mutex)(&lock); if (producer->consumer) __disconnect(producer, producer->consumer); producer->eventfd = NULL; list_del(&producer->node); - mutex_unlock(&lock); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); @@ -173,33 +164,25 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer, if (!consumer->add_producer || !consumer->del_producer) return -EINVAL; - mutex_lock(&lock); + guard(mutex)(&lock); list_for_each_entry(tmp, &consumers, node) { - if (tmp->eventfd == eventfd) { - ret = -EBUSY; - goto out_err; - } + if (tmp->eventfd == eventfd) + return -EBUSY; } list_for_each_entry(producer, &producers, node) { if (producer->eventfd == eventfd) { ret = __connect(producer, consumer); if (ret) - goto out_err; + return ret; break; } } consumer->eventfd = eventfd; list_add(&consumer->node, &consumers); - - mutex_unlock(&lock); - return 0; -out_err: - mutex_unlock(&lock); - return ret; } EXPORT_SYMBOL_GPL(irq_bypass_register_consumer); @@ -215,13 +198,12 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) if (!consumer->eventfd) return; - mutex_lock(&lock); + guard(mutex)(&lock); if (consumer->producer) __disconnect(consumer->producer, consumer); consumer->eventfd = NULL; list_del(&consumer->node); - mutex_unlock(&lock); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer); From 8394b32faecd9c63b3c436e78e62519e9548e530 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:33 -0700 Subject: [PATCH 008/101] irqbypass: Use xarray to track producers and consumers Track IRQ bypass producers and consumers using an xarray to avoid the O(2n) insertion time associated with walking a list to check for duplicate entries, and to search for a partner. At low (tens or few hundreds) total producer/consumer counts, using a list is faster due to the need to allocate backing storage for xarray. But as count creeps into the thousands, xarray wins easily, and can provide several orders of magnitude better latency at high counts. E.g. hundreds of nanoseconds vs. hundreds of milliseconds.
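For illustration, the core of the conversion is a pair of xarrays keyed on the eventfd_ctx pointer, so registration becomes one duplicate-detecting insert plus a single partner lookup. A minimal sketch mirroring the producer-side diff below (not a drop-in implementation):

	static DEFINE_XARRAY(producers);
	static DEFINE_XARRAY(consumers);

	unsigned long index = (unsigned long)eventfd;

	/* xa_insert() fails with -EBUSY if the eventfd is already in use. */
	ret = xa_insert(&producers, index, producer, GFP_KERNEL);
	if (ret)
		return ret;

	/* A single xa_load() replaces walking the entire consumer list. */
	consumer = xa_load(&consumers, index);
	if (consumer)
		ret = __connect(producer, consumer);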
Cc: Oliver Upton Cc: David Matlack Cc: Like Xu Cc: Binbin Wu Reported-by: Yong He Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217379 Link: https://lore.kernel.org/all/20230801115646.33990-1-likexu@tencent.com Reviewed-by: Kevin Tian Acked-by: Michael S. Tsirkin Reviewed-by: Alex Williamson Link: https://lore.kernel.org/r/20250516230734.2564775-8-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/irqbypass.h | 4 --- virt/lib/irqbypass.c | 74 ++++++++++++++++++++------------------- 2 files changed, 38 insertions(+), 40 deletions(-) diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h index b28197c87483..cd64fcaa88fe 100644 --- a/include/linux/irqbypass.h +++ b/include/linux/irqbypass.h @@ -33,7 +33,6 @@ struct irq_bypass_consumer; /** * struct irq_bypass_producer - IRQ bypass producer definition - * @node: IRQ bypass manager private list management * @eventfd: eventfd context used to match producers and consumers * @consumer: The connected consumer (NULL if no connection) * @irq: Linux IRQ number for the producer device @@ -47,7 +46,6 @@ struct irq_bypass_consumer; * for a physical device assigned to a VM. */ struct irq_bypass_producer { - struct list_head node; struct eventfd_ctx *eventfd; struct irq_bypass_consumer *consumer; int irq; @@ -61,7 +59,6 @@ struct irq_bypass_producer { /** * struct irq_bypass_consumer - IRQ bypass consumer definition - * @node: IRQ bypass manager private list management * @eventfd: eventfd context used to match producers and consumers * @producer: The connected producer (NULL if no connection) * @add_producer: Connect the IRQ consumer to an IRQ producer @@ -75,7 +72,6 @@ struct irq_bypass_producer { * portions of the interrupt handling to the VM. */ struct irq_bypass_consumer { - struct list_head node; struct eventfd_ctx *eventfd; struct irq_bypass_producer *producer; diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index 828556c081f5..ea888b9203d2 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -22,8 +22,8 @@ MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("IRQ bypass manager utility module"); -static LIST_HEAD(producers); -static LIST_HEAD(consumers); +static DEFINE_XARRAY(producers); +static DEFINE_XARRAY(consumers); static DEFINE_MUTEX(lock); /* @lock must be held when calling connect */ @@ -86,13 +86,13 @@ static void __disconnect(struct irq_bypass_producer *prod, * @producer: pointer to producer structure * @eventfd: pointer to the eventfd context associated with the producer * - * Add the provided IRQ producer to the list of producers and connect - * with any matching eventfd found on the IRQ consumers list. + * Add the provided IRQ producer to the set of producers and connect with the + * consumer with a matching eventfd, if one exists. 
*/ int irq_bypass_register_producer(struct irq_bypass_producer *producer, struct eventfd_ctx *eventfd) { - struct irq_bypass_producer *tmp; + unsigned long index = (unsigned long)eventfd; struct irq_bypass_consumer *consumer; int ret; @@ -101,22 +101,20 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer, guard(mutex)(&lock); - list_for_each_entry(tmp, &producers, node) { - if (tmp->eventfd == eventfd) - return -EBUSY; - } + ret = xa_insert(&producers, index, producer, GFP_KERNEL); + if (ret) + return ret; - list_for_each_entry(consumer, &consumers, node) { - if (consumer->eventfd == eventfd) { - ret = __connect(producer, consumer); - if (ret) - return ret; - break; + consumer = xa_load(&consumers, index); + if (consumer) { + ret = __connect(producer, consumer); + if (ret) { + WARN_ON_ONCE(xa_erase(&producers, index) != producer); + return ret; } } producer->eventfd = eventfd; - list_add(&producer->node, &producers); return 0; } EXPORT_SYMBOL_GPL(irq_bypass_register_producer); @@ -125,11 +123,14 @@ EXPORT_SYMBOL_GPL(irq_bypass_register_producer); * irq_bypass_unregister_producer - unregister IRQ bypass producer * @producer: pointer to producer structure * - * Remove a previously registered IRQ producer from the list of producers - * and disconnect it from any connected IRQ consumer. + * Remove a previously registered IRQ producer (note, it's safe to call this + * even if registration was unsuccessful). Disconnect from the associated + * consumer, if one exists. */ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) { + unsigned long index = (unsigned long)producer->eventfd; + if (!producer->eventfd) return; @@ -138,8 +139,8 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) if (producer->consumer) __disconnect(producer, producer->consumer); + WARN_ON_ONCE(xa_erase(&producers, index) != producer); producer->eventfd = NULL; - list_del(&producer->node); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); @@ -148,13 +149,13 @@ EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); * @consumer: pointer to consumer structure * @eventfd: pointer to the eventfd context associated with the consumer * - * Add the provided IRQ consumer to the list of consumers and connect - * with any matching eventfd found on the IRQ producer list. + * Add the provided IRQ consumer to the set of consumers and connect with the + * producer with a matching eventfd, if one exists. 
*/ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer, struct eventfd_ctx *eventfd) { - struct irq_bypass_consumer *tmp; + unsigned long index = (unsigned long)eventfd; struct irq_bypass_producer *producer; int ret; @@ -166,22 +167,20 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer, guard(mutex)(&lock); - list_for_each_entry(tmp, &consumers, node) { - if (tmp->eventfd == eventfd) - return -EBUSY; - } + ret = xa_insert(&consumers, index, consumer, GFP_KERNEL); + if (ret) + return ret; - list_for_each_entry(producer, &producers, node) { - if (producer->eventfd == eventfd) { - ret = __connect(producer, consumer); - if (ret) - return ret; - break; + producer = xa_load(&producers, index); + if (producer) { + ret = __connect(producer, consumer); + if (ret) { + WARN_ON_ONCE(xa_erase(&consumers, index) != consumer); + return ret; } } consumer->eventfd = eventfd; - list_add(&consumer->node, &consumers); return 0; } EXPORT_SYMBOL_GPL(irq_bypass_register_consumer); @@ -190,11 +189,14 @@ EXPORT_SYMBOL_GPL(irq_bypass_register_consumer); * irq_bypass_unregister_consumer - unregister IRQ bypass consumer * @consumer: pointer to consumer structure * - * Remove a previously registered IRQ consumer from the list of consumers - * and disconnect it from any connected IRQ producer. + * Remove a previously registered IRQ consumer (note, it's safe to call this + * even if registration was unsuccessful). Disconnect from the associated + * producer, if one exists. */ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) { + unsigned long index = (unsigned long)consumer->eventfd; + if (!consumer->eventfd) return; @@ -203,7 +205,7 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) if (consumer->producer) __disconnect(consumer->producer, consumer); + WARN_ON_ONCE(xa_erase(&consumers, index) != consumer); consumer->eventfd = NULL; - list_del(&consumer->node); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer); From 23b54381cee2928e8b5622e654ca4516f30d2f1a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:34 -0700 Subject: [PATCH 009/101] irqbypass: Require producers to pass in Linux IRQ number during registration Pass in the Linux IRQ associated with an IRQ bypass producer instead of relying on the caller to set the field prior to registration, as there's no benefit to relying on callers to do the right thing. Take care to set producer->irq before __connect(), as KVM expects the IRQ to be valid as soon as a connection is possible. Reviewed-by: Kevin Tian Reviewed-by: Alex Williamson Acked-by: Michael S. 
Tsirkin Link: https://lore.kernel.org/r/20250516230734.2564775-9-seanjc@google.com Signed-off-by: Sean Christopherson --- drivers/vfio/pci/vfio_pci_intrs.c | 3 +-- drivers/vhost/vdpa.c | 4 ++-- include/linux/irqbypass.h | 2 +- virt/lib/irqbypass.c | 5 ++++- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index d87fe116762a..123298a4dc8f 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -505,8 +505,7 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, if (ret) goto out_put_eventfd_ctx; - ctx->producer.irq = irq; - ret = irq_bypass_register_producer(&ctx->producer, trigger); + ret = irq_bypass_register_producer(&ctx->producer, trigger, irq); if (unlikely(ret)) { dev_info(&pdev->dev, "irq bypass producer (eventfd %p) registration fails: %d\n", diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 7b265ffda697..af1e1fdfd9ed 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -212,8 +212,8 @@ static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, u16 qid) if (!vq->call_ctx.ctx) return; - vq->call_ctx.producer.irq = irq; - ret = irq_bypass_register_producer(&vq->call_ctx.producer, vq->call_ctx.ctx); + ret = irq_bypass_register_producer(&vq->call_ctx.producer, + vq->call_ctx.ctx, irq); if (unlikely(ret)) dev_info(&v->dev, "vq %u, irq bypass producer (eventfd %p) registration fails, ret = %d\n", qid, vq->call_ctx.ctx, ret); diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h index cd64fcaa88fe..ede1fa938152 100644 --- a/include/linux/irqbypass.h +++ b/include/linux/irqbypass.h @@ -84,7 +84,7 @@ struct irq_bypass_consumer { }; int irq_bypass_register_producer(struct irq_bypass_producer *producer, - struct eventfd_ctx *eventfd); + struct eventfd_ctx *eventfd, int irq); void irq_bypass_unregister_producer(struct irq_bypass_producer *producer); int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer, struct eventfd_ctx *eventfd); diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index ea888b9203d2..62c160200be9 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -85,12 +85,13 @@ static void __disconnect(struct irq_bypass_producer *prod, * irq_bypass_register_producer - register IRQ bypass producer * @producer: pointer to producer structure * @eventfd: pointer to the eventfd context associated with the producer + * @irq: Linux IRQ number of the underlying producer device * * Add the provided IRQ producer to the set of producers and connect with the * consumer with a matching eventfd, if one exists. */ int irq_bypass_register_producer(struct irq_bypass_producer *producer, - struct eventfd_ctx *eventfd) + struct eventfd_ctx *eventfd, int irq) { unsigned long index = (unsigned long)eventfd; struct irq_bypass_consumer *consumer; @@ -99,6 +100,8 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer, if (WARN_ON_ONCE(producer->eventfd)) return -EINVAL; + producer->irq = irq; + guard(mutex)(&lock); ret = xa_insert(&producers, index, producer, GFP_KERNEL); From e295d2e7fbe69ddec772c951c466dfbfc1c96818 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:40 -0700 Subject: [PATCH 010/101] KVM: x86: Trigger I/O APIC route rescan in kvm_arch_irq_routing_update() Trigger the I/O APIC route rescan that's performed for a split IRQ chip after userspace updates IRQ routes in kvm_arch_irq_routing_update(), i.e. before dropping kvm->irq_lock. 
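For reference while reading the ordering argument that follows, the request machinery pairs its barriers roughly as shown below; this is a trimmed sketch based on the stock __kvm_make_request()/kvm_check_request() helpers in include/linux/kvm_host.h, not part of this patch:

	/* Route-update side: publish the new table, then raise the request. */
	rcu_assign_pointer(kvm->irq_routing, new);
	smp_wmb();				/* in __kvm_make_request() */
	set_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests);

	/* vCPU side: consume the request, then (re)read the routing table. */
	if (kvm_test_request(req, vcpu)) {
		kvm_clear_request(req, vcpu);
		smp_mb__after_atomic();		/* in kvm_check_request() */
		/* kvm_scan_ioapic_routes() now observes the new table. */
	}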
Calling kvm_make_all_cpus_request() under a mutex is perfectly safe, and the smp_wmb()+smp_mb__after_atomic() pair in __kvm_make_request()+kvm_check_request() ensures the new routing is visible to vCPUs prior to the request being visible to vCPUs. In all likelihood, commit b053b2aef25d ("KVM: x86: Add EOI exit bitmap inference") somewhat arbitrarily made the request outside of irq_lock to avoid holding irq_lock any longer than is strictly necessary. And then commit abdb080f7ac8 ("kvm/irqchip: kvm_arch_irq_routing_update renaming split") took the easy route of adding another arch hook instead of risking a functional change. Note, the call to synchronize_srcu_expedited() does NOT provide ordering guarantees with respect to vCPUs scanning the new routing; as above, the request infrastructure provides the necessary ordering. I.e. there's no need to wait for kvm_scan_ioapic_routes() to complete if it's actively running, because regardless of whether it grabs the old or new table, the vCPU will have another KVM_REQ_SCAN_IOAPIC pending, i.e. will rescan again and see the new mappings. Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/irq_comm.c | 10 +++------- include/linux/kvm_host.h | 4 ---- virt/kvm/irqchip.c | 2 -- 3 files changed, 3 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index d6d792b5d1bd..e2ae62ff9cc2 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -395,13 +395,6 @@ int kvm_setup_default_irq_routing(struct kvm *kvm) ARRAY_SIZE(default_routing), 0); } -void kvm_arch_post_irq_routing_update(struct kvm *kvm) -{ - if (!irqchip_split(kvm)) - return; - kvm_make_scan_ioapic_request(kvm); -} - void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode, u8 vector, unsigned long *ioapic_handled_vectors) { @@ -466,4 +459,7 @@ void kvm_arch_irq_routing_update(struct kvm *kvm) #ifdef CONFIG_KVM_HYPERV kvm_hv_irq_routing_update(kvm); #endif + + if (irqchip_split(kvm)) + kvm_make_scan_ioapic_request(kvm); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3bde4fb5c6aa..9461517b4e62 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1024,14 +1024,10 @@ void vcpu_put(struct kvm_vcpu *vcpu); #ifdef __KVM_HAVE_IOAPIC void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm); -void kvm_arch_post_irq_routing_update(struct kvm *kvm); #else static inline void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) { } -static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm) -{ -} #endif #ifdef CONFIG_HAVE_KVM_IRQCHIP diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index 162d8ed889f2..6ccabfd32287 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -222,8 +222,6 @@ int kvm_set_irq_routing(struct kvm *kvm, kvm_arch_irq_routing_update(kvm); mutex_unlock(&kvm->irq_lock); - kvm_arch_post_irq_routing_update(kvm); - synchronize_srcu_expedited(&kvm->irq_srcu); new = old; From 8a33b1f246ceeffe5f74b10078a04696060ac537 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:41 -0700 Subject: [PATCH 011/101] KVM: x86: Drop superfluous kvm_set_pic_irq() => kvm_pic_set_irq() wrapper Drop the superfluous and confusing kvm_set_pic_irq() => kvm_pic_set_irq() wrapper, and instead wire up ->set() directly to its final destination. 
Opportunistically move the declaration kvm_pic_set_irq() to irq.h to start gathering more of the in-kernel APIC/IO-APIC logic in irq.{c,h}. No functional change intended. Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/i8259.c | 5 ++++- arch/x86/kvm/irq.h | 2 ++ arch/x86/kvm/irq_comm.c | 10 +--------- 4 files changed, 7 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b4a391929cdb..bf4459d637cc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2209,7 +2209,6 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state, return !!(*irq_state); } -int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); void kvm_inject_nmi(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index a8fb19940975..0150aec4f523 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -185,8 +185,11 @@ void kvm_pic_update_irq(struct kvm_pic *s) pic_unlock(s); } -int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) +int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, + int irq_source_id, int level, bool line_status) { + struct kvm_pic *s = kvm->arch.vpic; + int irq = e->irqchip.pin; int ret, irq_level; BUG_ON(irq < 0 || irq >= PIC_NUM_PINS); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 76d46b2f41dd..33dd5666b656 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -63,6 +63,8 @@ int kvm_pic_init(struct kvm *kvm); void kvm_pic_destroy(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); +int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, + int irq_source_id, int level, bool line_status); static inline int irqchip_split(struct kvm *kvm) { diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index e2ae62ff9cc2..64f352e7bcb0 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -27,14 +27,6 @@ #include "x86.h" #include "xen.h" -static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level, - bool line_status) -{ - struct kvm_pic *pic = kvm->arch.vpic; - return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); -} - static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) @@ -296,7 +288,7 @@ int kvm_set_routing_entry(struct kvm *kvm, case KVM_IRQCHIP_PIC_MASTER: if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2) return -EINVAL; - e->set = kvm_set_pic_irq; + e->set = kvm_pic_set_irq; break; case KVM_IRQCHIP_IOAPIC: if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS) From 05dc9eab3f005a5af5ad03138f81b7541c5db85b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:42 -0700 Subject: [PATCH 012/101] KVM: x86: Drop superfluous kvm_set_ioapic_irq() => kvm_ioapic_set_irq() wrapper Drop the superfluous and confusing kvm_set_ioapic_irq() and instead wire up ->set() directly to its final destination. No functional change intended. 
Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/ioapic.c | 6 ++++-- arch/x86/kvm/ioapic.h | 5 +++-- arch/x86/kvm/irq_comm.c | 11 +---------- 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 45dae2d5d2f1..8c8a8062eb19 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -479,9 +479,11 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) return ret; } -int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, - int level, bool line_status) +int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, + int irq_source_id, int level, bool line_status) { + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + int irq = e->irqchip.pin; int ret, irq_level; BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index aa8cb4ac0479..a86f59bbea44 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -111,8 +111,9 @@ void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode); int kvm_ioapic_init(struct kvm *kvm); void kvm_ioapic_destroy(struct kvm *kvm); -int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, - int level, bool line_status); +int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, + int irq_source_id, int level, bool line_status); + void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 64f352e7bcb0..8dcb6a555902 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -27,15 +27,6 @@ #include "x86.h" #include "xen.h" -static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level, - bool line_status) -{ - struct kvm_ioapic *ioapic = kvm->arch.vioapic; - return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level, - line_status); -} - int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, struct dest_map *dest_map) { @@ -293,7 +284,7 @@ int kvm_set_routing_entry(struct kvm *kvm, case KVM_IRQCHIP_IOAPIC: if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS) return -EINVAL; - e->set = kvm_set_ioapic_irq; + e->set = kvm_ioapic_set_irq; break; default: return -EINVAL; From 20218e69e85b0418f444227488c732e2c0b753b2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:43 -0700 Subject: [PATCH 013/101] KVM: x86: Drop superfluous kvm_hv_set_sint() => kvm_hv_synic_set_irq() wrapper Drop the superfluous kvm_hv_set_sint() and instead wire up ->set() directly to its final destination, kvm_hv_synic_set_irq(). Keep hv_synic_set_irq() instead of kvm_hv_set_sint() to provide some amount of consistency in the ->set() helpers, e.g. to match kvm_pic_set_irq() and kvm_ioapic_set_irq(). kvm_set_msi() is arguably the oddball, e.g. kvm_set_msi_irq() should be something like kvm_msi_to_lapic_irq() so that kvm_set_msi() can instead be kvm_set_msi_irq(), but that's a future problem to solve. No functional change intended. 
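To spell out the pattern that this and the two preceding patches converge on: kvm_pic_set_irq(), kvm_ioapic_set_irq(), and kvm_hv_synic_set_irq() all now match the routing entry's ->set() callback directly, whose shape (copied from the signatures in the surrounding diffs, shown here purely as a summary) is:

	int (*set)(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
		   int irq_source_id, int level, bool line_status);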
Cc: Vitaly Kuznetsov Cc: Kai Huang Reviewed-by: Vitaly Kuznetsov Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/hyperv.c | 10 +++++++--- arch/x86/kvm/hyperv.h | 3 ++- arch/x86/kvm/irq_comm.c | 18 +++--------------- 3 files changed, 12 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 24f0318c50d7..f316e11383aa 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -497,15 +497,19 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) return ret; } -int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint) +int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, + int irq_source_id, int level, bool line_status) { struct kvm_vcpu_hv_synic *synic; - synic = synic_get(kvm, vpidx); + if (!level) + return -1; + + synic = synic_get(kvm, e->hv_sint.vcpu); if (!synic) return -EINVAL; - return synic_set_irq(synic, sint); + return synic_set_irq(synic, e->hv_sint.sint); } void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 913bfc96959c..6ce160ffa678 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -103,7 +103,8 @@ static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu) int kvm_hv_hypercall(struct kvm_vcpu *vcpu); void kvm_hv_irq_routing_update(struct kvm *kvm); -int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint); +int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, + int irq_source_id, int level, bool line_status); void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector); int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 8dcb6a555902..28a8555ab58b 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -127,18 +127,6 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL); } -#ifdef CONFIG_KVM_HYPERV -static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level, - bool line_status) -{ - if (!level) - return -1; - - return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint); -} -#endif - int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) @@ -149,8 +137,8 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, switch (e->type) { #ifdef CONFIG_KVM_HYPERV case KVM_IRQ_ROUTING_HV_SINT: - return kvm_hv_set_sint(e, kvm, irq_source_id, level, - line_status); + return kvm_hv_synic_set_irq(e, kvm, irq_source_id, level, + line_status); #endif case KVM_IRQ_ROUTING_MSI: @@ -302,7 +290,7 @@ int kvm_set_routing_entry(struct kvm *kvm, break; #ifdef CONFIG_KVM_HYPERV case KVM_IRQ_ROUTING_HV_SINT: - e->set = kvm_hv_set_sint; + e->set = kvm_hv_synic_set_irq; e->hv_sint.vcpu = ue->u.hv_sint.vcpu; e->hv_sint.sint = ue->u.hv_sint.sint; break; From 00b5ebf8db7c382c7ea0074fdd79bdb9c65db236 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:44 -0700 Subject: [PATCH 014/101] KVM: x86: Move PIT ioctl helpers to i8254.c Move the PIT ioctl helpers to i8254.c, i.e. to the file that implements PIT emulation. 
Eliminating PIT code in x86.c will allow adding a Kconfig to control support for in-kernel I/O APIC, PIC, and PIT emulation with minimal #ifdefs. Opportunistically make kvm_pit_set_reinject() and kvm_pit_load_count() local to i8254.c as they were only publicly visible to make them available to the ioctl helpers. No functional change intended. Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/i8254.c | 79 ++++++++++++++++++++++++++++++++++++++++++-- arch/x86/kvm/i8254.h | 12 ++++--- arch/x86/kvm/x86.c | 74 ----------------------------------------- 3 files changed, 84 insertions(+), 81 deletions(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 739aa6c0d0c3..59f956f35f4c 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -288,7 +288,7 @@ static inline void kvm_pit_reset_reinject(struct kvm_pit *pit) atomic_set(&pit->pit_state.irq_ack, 1); } -void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject) +static void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject) { struct kvm_kpit_state *ps = &pit->pit_state; struct kvm *kvm = pit->kvm; @@ -400,8 +400,8 @@ static void pit_load_count(struct kvm_pit *pit, int channel, u32 val) } } -void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, - int hpet_legacy_start) +static void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, + int hpet_legacy_start) { u8 saved_mode; @@ -649,6 +649,79 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) kvm_pit_reset_reinject(pit); } +int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) +{ + struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state; + + BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels)); + + mutex_lock(&kps->lock); + memcpy(ps, &kps->channels, sizeof(*ps)); + mutex_unlock(&kps->lock); + return 0; +} + +int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) +{ + int i; + struct kvm_pit *pit = kvm->arch.vpit; + + mutex_lock(&pit->pit_state.lock); + memcpy(&pit->pit_state.channels, ps, sizeof(*ps)); + for (i = 0; i < 3; i++) + kvm_pit_load_count(pit, i, ps->channels[i].count, 0); + mutex_unlock(&pit->pit_state.lock); + return 0; +} + +int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) +{ + mutex_lock(&kvm->arch.vpit->pit_state.lock); + memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, + sizeof(ps->channels)); + ps->flags = kvm->arch.vpit->pit_state.flags; + mutex_unlock(&kvm->arch.vpit->pit_state.lock); + memset(&ps->reserved, 0, sizeof(ps->reserved)); + return 0; +} + +int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) +{ + int start = 0; + int i; + u32 prev_legacy, cur_legacy; + struct kvm_pit *pit = kvm->arch.vpit; + + mutex_lock(&pit->pit_state.lock); + prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; + cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; + if (!prev_legacy && cur_legacy) + start = 1; + memcpy(&pit->pit_state.channels, &ps->channels, + sizeof(pit->pit_state.channels)); + pit->pit_state.flags = ps->flags; + for (i = 0; i < 3; i++) + kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count, + start && i == 0); + mutex_unlock(&pit->pit_state.lock); + return 0; +} + +int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control) +{ + struct kvm_pit *pit = kvm->arch.vpit; + + /* pit->pit_state.lock was overloaded to prevent userspace from getting + * an inconsistent state after running multiple 
KVM_REINJECT_CONTROL + * ioctls in parallel. Use a separate lock if that ioctl isn't rare. + */ + mutex_lock(&pit->pit_state.lock); + kvm_pit_set_reinject(pit, control->pit_reinject); + mutex_unlock(&pit->pit_state.lock); + + return 0; +} + static const struct kvm_io_device_ops pit_dev_ops = { .read = pit_ioport_read, .write = pit_ioport_write, diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index a768212ba821..338095829ec8 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -6,6 +6,8 @@ #include +#include + struct kvm_kpit_channel_state { u32 count; /* can be 65536 */ u16 latched_count; @@ -55,11 +57,13 @@ struct kvm_pit { #define KVM_MAX_PIT_INTR_INTERVAL HZ / 100 #define KVM_PIT_CHANNEL_MASK 0x3 +int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps); +int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps); +int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps); +int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps); +int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control); + struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); void kvm_free_pit(struct kvm *kvm); -void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, - int hpet_legacy_start); -void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject); - #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3dc93e2d4777..0ee1f7d311e5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6450,80 +6450,6 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) return r; } -static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) -{ - struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state; - - BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels)); - - mutex_lock(&kps->lock); - memcpy(ps, &kps->channels, sizeof(*ps)); - mutex_unlock(&kps->lock); - return 0; -} - -static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) -{ - int i; - struct kvm_pit *pit = kvm->arch.vpit; - - mutex_lock(&pit->pit_state.lock); - memcpy(&pit->pit_state.channels, ps, sizeof(*ps)); - for (i = 0; i < 3; i++) - kvm_pit_load_count(pit, i, ps->channels[i].count, 0); - mutex_unlock(&pit->pit_state.lock); - return 0; -} - -static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) -{ - mutex_lock(&kvm->arch.vpit->pit_state.lock); - memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, - sizeof(ps->channels)); - ps->flags = kvm->arch.vpit->pit_state.flags; - mutex_unlock(&kvm->arch.vpit->pit_state.lock); - memset(&ps->reserved, 0, sizeof(ps->reserved)); - return 0; -} - -static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) -{ - int start = 0; - int i; - u32 prev_legacy, cur_legacy; - struct kvm_pit *pit = kvm->arch.vpit; - - mutex_lock(&pit->pit_state.lock); - prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; - cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; - if (!prev_legacy && cur_legacy) - start = 1; - memcpy(&pit->pit_state.channels, &ps->channels, - sizeof(pit->pit_state.channels)); - pit->pit_state.flags = ps->flags; - for (i = 0; i < 3; i++) - kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count, - start && i == 0); - mutex_unlock(&pit->pit_state.lock); - return 0; -} - -static int kvm_vm_ioctl_reinject(struct kvm *kvm, - struct kvm_reinject_control *control) -{ - struct kvm_pit *pit = kvm->arch.vpit; - - /* pit->pit_state.lock was overloaded to prevent userspace from getting - * an 
inconsistent state after running multiple KVM_REINJECT_CONTROL - * ioctls in parallel. Use a separate lock if that ioctl isn't rare. - */ - mutex_lock(&pit->pit_state.lock); - kvm_pit_set_reinject(pit, control->pit_reinject); - mutex_unlock(&pit->pit_state.lock); - - return 0; -} - void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { From b771b1616ff89d7441e5889b84e444c48a7676f0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:45 -0700 Subject: [PATCH 015/101] KVM: x86: Move KVM_{GET,SET}_IRQCHIP ioctl helpers to irq.c Move the ioctl helpers for getting/setting fully in-kernel IRQ chip state to irq.c, partly to trim down x86.c, but mostly in preparation for adding a Kconfig to control support for in-kernel I/O APIC, PIC, and PIT emulation. No functional change intended. Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/irq.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/irq.h | 3 +++ arch/x86/kvm/x86.c | 55 --------------------------------------------- 3 files changed, 59 insertions(+), 55 deletions(-) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 97d68d837929..da47e2165389 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -12,6 +12,7 @@ #include #include +#include "ioapic.h" #include "irq.h" #include "i8254.h" #include "x86.h" @@ -178,3 +179,58 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) { return irqchip_in_kernel(kvm); } + +int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) +{ + struct kvm_pic *pic = kvm->arch.vpic; + int r; + + r = 0; + switch (chip->chip_id) { + case KVM_IRQCHIP_PIC_MASTER: + memcpy(&chip->chip.pic, &pic->pics[0], + sizeof(struct kvm_pic_state)); + break; + case KVM_IRQCHIP_PIC_SLAVE: + memcpy(&chip->chip.pic, &pic->pics[1], + sizeof(struct kvm_pic_state)); + break; + case KVM_IRQCHIP_IOAPIC: + kvm_get_ioapic(kvm, &chip->chip.ioapic); + break; + default: + r = -EINVAL; + break; + } + return r; +} + +int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) +{ + struct kvm_pic *pic = kvm->arch.vpic; + int r; + + r = 0; + switch (chip->chip_id) { + case KVM_IRQCHIP_PIC_MASTER: + spin_lock(&pic->lock); + memcpy(&pic->pics[0], &chip->chip.pic, + sizeof(struct kvm_pic_state)); + spin_unlock(&pic->lock); + break; + case KVM_IRQCHIP_PIC_SLAVE: + spin_lock(&pic->lock); + memcpy(&pic->pics[1], &chip->chip.pic, + sizeof(struct kvm_pic_state)); + spin_unlock(&pic->lock); + break; + case KVM_IRQCHIP_IOAPIC: + kvm_set_ioapic(kvm, &chip->chip.ioapic); + break; + default: + r = -EINVAL; + break; + } + kvm_pic_update_irq(pic); + return r; +} diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 33dd5666b656..aa77a6b2828c 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -66,6 +66,9 @@ void kvm_pic_update_irq(struct kvm_pic *s); int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status); +int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip); +int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip); + static inline int irqchip_split(struct kvm *kvm) { int mode = kvm->arch.irqchip_mode; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0ee1f7d311e5..787fc22dab54 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6395,61 +6395,6 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, return 0; } -static int 
kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) -{ - struct kvm_pic *pic = kvm->arch.vpic; - int r; - - r = 0; - switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: - memcpy(&chip->chip.pic, &pic->pics[0], - sizeof(struct kvm_pic_state)); - break; - case KVM_IRQCHIP_PIC_SLAVE: - memcpy(&chip->chip.pic, &pic->pics[1], - sizeof(struct kvm_pic_state)); - break; - case KVM_IRQCHIP_IOAPIC: - kvm_get_ioapic(kvm, &chip->chip.ioapic); - break; - default: - r = -EINVAL; - break; - } - return r; -} - -static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) -{ - struct kvm_pic *pic = kvm->arch.vpic; - int r; - - r = 0; - switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: - spin_lock(&pic->lock); - memcpy(&pic->pics[0], &chip->chip.pic, - sizeof(struct kvm_pic_state)); - spin_unlock(&pic->lock); - break; - case KVM_IRQCHIP_PIC_SLAVE: - spin_lock(&pic->lock); - memcpy(&pic->pics[1], &chip->chip.pic, - sizeof(struct kvm_pic_state)); - spin_unlock(&pic->lock); - break; - case KVM_IRQCHIP_IOAPIC: - kvm_set_ioapic(kvm, &chip->chip.ioapic); - break; - default: - r = -EINVAL; - break; - } - kvm_pic_update_irq(pic); - return r; -} - void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { From c5a701955e2de084fdd0ad5edfed578389eae5ac Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:46 -0700 Subject: [PATCH 016/101] KVM: x86: Rename irqchip_kernel() to irqchip_full() Rename irqchip_kernel() to irqchip_full(), as "kernel" is very ambiguous due to the existence of split IRQ chip support, where only some of the "irqchip" is emulated by the kernel/KVM. E.g. irqchip_kernel() often gets confused with irqchip_in_kernel(). Opportunistically hoist the definition up in irq.h so that it's co-located with other "full" irqchip code in anticipation of wrapping it all with a Kconfig/#ifdef. No functional change intended. Suggested-by: Paolo Bonzini Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/ioapic.h | 2 +- arch/x86/kvm/irq.c | 2 +- arch/x86/kvm/irq.h | 22 +++++++++++----------- arch/x86/kvm/irq_comm.c | 2 +- arch/x86/kvm/x86.c | 4 ++-- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index a86f59bbea44..289cca3aec69 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -103,7 +103,7 @@ do { \ static inline int ioapic_in_kernel(struct kvm *kvm) { - return irqchip_kernel(kvm); + return irqchip_full(kvm); } void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index da47e2165389..97e1617ce24d 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -172,7 +172,7 @@ bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args) { bool resample = args->flags & KVM_IRQFD_FLAG_RESAMPLE; - return resample ?
irqchip_full(kvm) : irqchip_in_kernel(kvm); } bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index aa77a6b2828c..4ac346102350 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -69,16 +69,7 @@ int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip); int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip); -static inline int irqchip_split(struct kvm *kvm) -{ - int mode = kvm->arch.irqchip_mode; - - /* Matches smp_wmb() when setting irqchip_mode */ - smp_rmb(); - return mode == KVM_IRQCHIP_SPLIT; -} - -static inline int irqchip_kernel(struct kvm *kvm) +static inline int irqchip_full(struct kvm *kvm) { int mode = kvm->arch.irqchip_mode; @@ -89,7 +80,16 @@ static inline int irqchip_kernel(struct kvm *kvm) static inline int pic_in_kernel(struct kvm *kvm) { - return irqchip_kernel(kvm); + return irqchip_full(kvm); +} + +static inline int irqchip_split(struct kvm *kvm) +{ + int mode = kvm->arch.irqchip_mode; + + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); + return mode == KVM_IRQCHIP_SPLIT; } static inline int irqchip_in_kernel(struct kvm *kvm) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 28a8555ab58b..bcf2f1e4a005 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -200,7 +200,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) goto unlock; } clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); - if (!irqchip_kernel(kvm)) + if (!irqchip_full(kvm)) goto unlock; kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 787fc22dab54..7d3583285847 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7039,7 +7039,7 @@ set_identity_unlock: } r = -ENXIO; - if (!irqchip_kernel(kvm)) + if (!irqchip_full(kvm)) goto get_irqchip_out; r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) @@ -7063,7 +7063,7 @@ set_identity_unlock: } r = -ENXIO; - if (!irqchip_kernel(kvm)) + if (!irqchip_full(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); set_irqchip_out: From df35135680fae0b13f843b6f4c6c310c567d7900 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:47 -0700 Subject: [PATCH 017/101] KVM: x86: Move kvm_setup_default_irq_routing() into irq.c Move the default IRQ routing table used for in-kernel I/O APIC and PIC routing to irq.c, and tweak the name to make it explicitly clear what routing is being initialized. In addition to making it more obvious that the so-called "default" routing only applies to an in-kernel I/O APIC, getting it out of irq_comm.c will allow removing irq_comm.c entirely. And placing the function alongside other I/O APIC and PIC code will allow for guarding KVM's in-kernel I/O APIC and PIC emulation with a Kconfig with minimal #ifdefs. No functional change intended.
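For reference, each ROUTING_ENTRY2(n) in the table being moved expands to a pair of routing entries for GSI n, one targeting the I/O APIC and one targeting the PIC (master for GSIs 0-7, slave for 8-15). E.g. ROUTING_ENTRY2(0) expands to (illustrative expansion, not part of the diff):

	{ .gsi = 0, .type = KVM_IRQ_ROUTING_IRQCHIP,
	  .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = 0 } },
	{ .gsi = 0, .type = KVM_IRQ_ROUTING_IRQCHIP,
	  .u.irqchip = { .irqchip = KVM_IRQCHIP_PIC_MASTER, .pin = 0 } },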
Cc: Kai Huang Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/irq.c | 32 ++++++++++++++++++++++++++++++++ arch/x86/kvm/irq.h | 3 ++- arch/x86/kvm/irq_comm.c | 32 -------------------------------- arch/x86/kvm/x86.c | 2 +- 4 files changed, 35 insertions(+), 34 deletions(-) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 97e1617ce24d..b696161ec078 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -180,6 +180,38 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) return irqchip_in_kernel(kvm); } +#define IOAPIC_ROUTING_ENTRY(irq) \ + { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ + .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } } +#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq) + +#define PIC_ROUTING_ENTRY(irq) \ + { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ + .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } } +#define ROUTING_ENTRY2(irq) \ + IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq) + +static const struct kvm_irq_routing_entry default_routing[] = { + ROUTING_ENTRY2(0), ROUTING_ENTRY2(1), + ROUTING_ENTRY2(2), ROUTING_ENTRY2(3), + ROUTING_ENTRY2(4), ROUTING_ENTRY2(5), + ROUTING_ENTRY2(6), ROUTING_ENTRY2(7), + ROUTING_ENTRY2(8), ROUTING_ENTRY2(9), + ROUTING_ENTRY2(10), ROUTING_ENTRY2(11), + ROUTING_ENTRY2(12), ROUTING_ENTRY2(13), + ROUTING_ENTRY2(14), ROUTING_ENTRY2(15), + ROUTING_ENTRY1(16), ROUTING_ENTRY1(17), + ROUTING_ENTRY1(18), ROUTING_ENTRY1(19), + ROUTING_ENTRY1(20), ROUTING_ENTRY1(21), + ROUTING_ENTRY1(22), ROUTING_ENTRY1(23), +}; + +int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm) +{ + return kvm_set_irq_routing(kvm, default_routing, + ARRAY_SIZE(default_routing), 0); +} + int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { struct kvm_pic *pic = kvm->arch.vpic; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 4ac346102350..7b8b54462f95 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -66,6 +66,8 @@ void kvm_pic_update_irq(struct kvm_pic *s); int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status); +int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm); + int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip); int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip); @@ -110,7 +112,6 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu); int apic_has_pending_timer(struct kvm_vcpu *vcpu); -int kvm_setup_default_irq_routing(struct kvm *kvm); int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, struct dest_map *dest_map); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index bcf2f1e4a005..99c521bd9db5 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -334,38 +334,6 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, } EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu); -#define IOAPIC_ROUTING_ENTRY(irq) \ - { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ - .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } } -#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq) - -#define PIC_ROUTING_ENTRY(irq) \ - { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ - .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } } -#define ROUTING_ENTRY2(irq) \ - IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq) - -static const struct kvm_irq_routing_entry default_routing[] = { - 
ROUTING_ENTRY2(0), ROUTING_ENTRY2(1), - ROUTING_ENTRY2(2), ROUTING_ENTRY2(3), - ROUTING_ENTRY2(4), ROUTING_ENTRY2(5), - ROUTING_ENTRY2(6), ROUTING_ENTRY2(7), - ROUTING_ENTRY2(8), ROUTING_ENTRY2(9), - ROUTING_ENTRY2(10), ROUTING_ENTRY2(11), - ROUTING_ENTRY2(12), ROUTING_ENTRY2(13), - ROUTING_ENTRY2(14), ROUTING_ENTRY2(15), - ROUTING_ENTRY1(16), ROUTING_ENTRY1(17), - ROUTING_ENTRY1(18), ROUTING_ENTRY1(19), - ROUTING_ENTRY1(20), ROUTING_ENTRY1(21), - ROUTING_ENTRY1(22), ROUTING_ENTRY1(23), -}; - -int kvm_setup_default_irq_routing(struct kvm *kvm) -{ - return kvm_set_irq_routing(kvm, default_routing, - ARRAY_SIZE(default_routing), 0); -} - void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode, u8 vector, unsigned long *ioapic_handled_vectors) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7d3583285847..53f2ce2d40de 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6991,7 +6991,7 @@ set_identity_unlock: goto create_irqchip_unlock; } - r = kvm_setup_default_irq_routing(kvm); + r = kvm_setup_default_ioapic_and_pic_routing(kvm); if (r) { kvm_ioapic_destroy(kvm); kvm_pic_destroy(kvm); From 77a74b8ff41ae620f5a5d727d596b670b7b9994e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:48 -0700 Subject: [PATCH 018/101] KVM: x86: Move kvm_{request,free}_irq_source_id() to i8254.c (PIT) Move kvm_{request,free}_irq_source_id() to i8254.c, i.e. the dedicated PIT emulation file, in anticipation of removing them entirely in favor of hardcoding the PIT's "requested" source ID (the source ID can only ever be '2', and the request can never fail). No functional change intended. Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-10-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/i8254.c | 44 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/irq_comm.c | 44 ---------------------------------------- include/linux/kvm_host.h | 2 -- 3 files changed, 44 insertions(+), 46 deletions(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 59f956f35f4c..2bb223bf0dac 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -641,6 +641,50 @@ static void kvm_pit_reset(struct kvm_pit *pit) kvm_pit_reset_reinject(pit); } +static int kvm_request_irq_source_id(struct kvm *kvm) +{ + unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; + int irq_source_id; + + mutex_lock(&kvm->irq_lock); + irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG); + + if (irq_source_id >= BITS_PER_LONG) { + pr_warn("exhausted allocatable IRQ sources!\n"); + irq_source_id = -EFAULT; + goto unlock; + } + + ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); + ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); + set_bit(irq_source_id, bitmap); +unlock: + mutex_unlock(&kvm->irq_lock); + + return irq_source_id; +} + +static void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) +{ + ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); + ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); + + mutex_lock(&kvm->irq_lock); + if (irq_source_id < 0 || + irq_source_id >= BITS_PER_LONG) { + pr_err("IRQ source ID out of range!\n"); + goto unlock; + } + clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); + if (!irqchip_full(kvm)) + goto unlock; + + kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); + kvm_pic_clear_all(kvm->arch.vpic, irq_source_id); +unlock: + mutex_unlock(&kvm->irq_lock); +} + static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) { struct kvm_pit 
*pit = container_of(kimn, struct kvm_pit, mask_notifier); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 99c521bd9db5..138c675dc24b 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -165,50 +165,6 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, return -EWOULDBLOCK; } -int kvm_request_irq_source_id(struct kvm *kvm) -{ - unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; - int irq_source_id; - - mutex_lock(&kvm->irq_lock); - irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG); - - if (irq_source_id >= BITS_PER_LONG) { - pr_warn("exhausted allocatable IRQ sources!\n"); - irq_source_id = -EFAULT; - goto unlock; - } - - ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); - ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); - set_bit(irq_source_id, bitmap); -unlock: - mutex_unlock(&kvm->irq_lock); - - return irq_source_id; -} - -void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) -{ - ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); - ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); - - mutex_lock(&kvm->irq_lock); - if (irq_source_id < 0 || - irq_source_id >= BITS_PER_LONG) { - pr_err("IRQ source ID out of range!\n"); - goto unlock; - } - clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); - if (!irqchip_full(kvm)) - goto unlock; - - kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); - kvm_pic_clear_all(kvm->arch.vpic, irq_source_id); -unlock: - mutex_unlock(&kvm->irq_lock); -} - void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, struct kvm_irq_mask_notifier *kimn) { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9461517b4e62..cba8fc4529e8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1784,8 +1784,6 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); void kvm_unregister_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); -int kvm_request_irq_source_id(struct kvm *kvm); -void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args); /* From 61423c413a746fd5fe5b0d865ea722e11b01105e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:49 -0700 Subject: [PATCH 019/101] KVM: x86: Hardcode the PIT IRQ source ID to '2' Hardcode the PIT's IRQ source ID to '2' instead of "finding" that bit 2 is always the first available bit in irq_sources_bitmap. Bits 0 and 1 are set/reserved by kvm_arch_init_vm(), i.e. long before kvm_create_pit() can be invoked, and KVM allows at most one in-kernel PIT instance, i.e. it's impossible for the PIT to find a different free bit (there are no other users of kvm_request_irq_source_id()). Delete the now-defunct irq_sources_bitmap and all its associated code.
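Concretely, with the source ID reservations visible in the diff below, the first free bit can only ever be 2 (sketch, quoting the #defines from kvm_host.h):

	#define KVM_USERSPACE_IRQ_SOURCE_ID		0	/* reserved at VM creation */
	#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1	/* reserved at VM creation */
	#define KVM_PIT_IRQ_SOURCE_ID			2	/* the only bit the PIT can get */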
Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-11-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/i8254.c | 55 +++++---------------------------- arch/x86/kvm/i8254.h | 1 - arch/x86/kvm/x86.c | 6 ---- include/linux/kvm_host.h | 1 + 5 files changed, 8 insertions(+), 56 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bf4459d637cc..4d68680f7051 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1397,7 +1397,6 @@ struct kvm_arch { bool pause_in_guest; bool cstate_in_guest; - unsigned long irq_sources_bitmap; s64 kvmclock_offset; /* diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 2bb223bf0dac..fa8187608cfc 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -248,8 +248,8 @@ static void pit_do_work(struct kthread_work *work) if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0)) return; - kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false); - kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false); + kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 1, false); + kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 0, false); /* * Provides NMI watchdog support via Virtual Wire mode. @@ -641,47 +641,11 @@ static void kvm_pit_reset(struct kvm_pit *pit) kvm_pit_reset_reinject(pit); } -static int kvm_request_irq_source_id(struct kvm *kvm) +static void kvm_pit_clear_all(struct kvm *kvm) { - unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; - int irq_source_id; - mutex_lock(&kvm->irq_lock); - irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG); - - if (irq_source_id >= BITS_PER_LONG) { - pr_warn("exhausted allocatable IRQ sources!\n"); - irq_source_id = -EFAULT; - goto unlock; - } - - ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); - ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); - set_bit(irq_source_id, bitmap); -unlock: - mutex_unlock(&kvm->irq_lock); - - return irq_source_id; -} - -static void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) -{ - ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); - ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); - - mutex_lock(&kvm->irq_lock); - if (irq_source_id < 0 || - irq_source_id >= BITS_PER_LONG) { - pr_err("IRQ source ID out of range!\n"); - goto unlock; - } - clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); - if (!irqchip_full(kvm)) - goto unlock; - - kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); - kvm_pic_clear_all(kvm->arch.vpic, irq_source_id); -unlock: + kvm_ioapic_clear_all(kvm->arch.vioapic, KVM_PIT_IRQ_SOURCE_ID); + kvm_pic_clear_all(kvm->arch.vpic, KVM_PIT_IRQ_SOURCE_ID); mutex_unlock(&kvm->irq_lock); } @@ -788,10 +752,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) if (!pit) return NULL; - pit->irq_source_id = kvm_request_irq_source_id(kvm); - if (pit->irq_source_id < 0) - goto fail_request; - mutex_init(&pit->pit_state.lock); pid = get_pid(task_tgid(current)); @@ -843,8 +803,7 @@ fail_register_pit: kvm_pit_set_reinject(pit, false); kthread_destroy_worker(pit->worker); fail_kthread: - kvm_free_irq_source_id(kvm, pit->irq_source_id); -fail_request: + kvm_pit_clear_all(kvm); kfree(pit); return NULL; } @@ -861,7 +820,7 @@ void kvm_free_pit(struct kvm *kvm) kvm_pit_set_reinject(pit, false); hrtimer_cancel(&pit->pit_state.timer); kthread_destroy_worker(pit->worker); - kvm_free_irq_source_id(kvm, pit->irq_source_id); + kvm_pit_clear_all(kvm); kfree(pit); } } diff --git 
a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 338095829ec8..b9c1feb379a7 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -44,7 +44,6 @@ struct kvm_pit { struct kvm_io_device speaker_dev; struct kvm *kvm; struct kvm_kpit_state pit_state; - int irq_source_id; struct kvm_irq_mask_notifier mask_notifier; struct kthread_worker *worker; struct kthread_work expired; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 53f2ce2d40de..1d744730985e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12669,12 +12669,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); atomic_set(&kvm->arch.noncoherent_dma_count, 0); - /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ - set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); - /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */ - set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, - &kvm->arch.irq_sources_bitmap); - raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index cba8fc4529e8..4ff5ea29e343 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -190,6 +190,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 +#define KVM_PIT_IRQ_SOURCE_ID 2 extern struct mutex kvm_lock; extern struct list_head vm_list; From 2c31aa747d789dc895a62efeea13452519184487 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:50 -0700 Subject: [PATCH 020/101] KVM: x86: Don't clear PIT's IRQ line status when destroying PIT Don't bother clearing the PIT's IRQ line status when destroying the PIT, as userspace can't possibly rely on KVM to lower the IRQ line in any sane use case, and it's not at all obvious that clearing the PIT's IRQ line is correct/desirable in kvm_create_pit()'s error path. When called from kvm_arch_pre_destroy_vm(), the entire VM is being torn down and thus {kvm_pic,kvm_ioapic}.irq_states are unreachable. As for the error path in kvm_create_pit(), the only way the PIT's bit in irq_states can be set is if userspace raises the associated IRQ before KVM_CREATE_PIT{2} completes. Forcefully clearing the bit would clobber userspace's input, nonsensical though that input may be. Not to mention that no known VMM will continue on if PIT creation fails. 
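For context, each PIC/I/O APIC pin tracks its asserting sources in a per-pin bitmap of source IDs, and a pin reads as asserted so long as any source bit remains set, i.e. __kvm_irq_line_state() (visible in the diff below) boils down to:

	return !!(*irq_state);	/* line level == OR of all asserting sources */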
Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-12-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 -- arch/x86/kvm/i8254.c | 10 ---------- arch/x86/kvm/i8259.c | 10 ---------- arch/x86/kvm/ioapic.c | 10 ---------- arch/x86/kvm/ioapic.h | 1 - 5 files changed, 33 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4d68680f7051..a4649a234f05 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2208,8 +2208,6 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state, return !!(*irq_state); } -void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); - void kvm_inject_nmi(struct kvm_vcpu *vcpu); int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index fa8187608cfc..d1b79b418c05 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -641,14 +641,6 @@ static void kvm_pit_reset(struct kvm_pit *pit) kvm_pit_reset_reinject(pit); } -static void kvm_pit_clear_all(struct kvm *kvm) -{ - mutex_lock(&kvm->irq_lock); - kvm_ioapic_clear_all(kvm->arch.vioapic, KVM_PIT_IRQ_SOURCE_ID); - kvm_pic_clear_all(kvm->arch.vpic, KVM_PIT_IRQ_SOURCE_ID); - mutex_unlock(&kvm->irq_lock); -} - static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) { struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier); @@ -803,7 +795,6 @@ fail_register_pit: kvm_pit_set_reinject(pit, false); kthread_destroy_worker(pit->worker); fail_kthread: - kvm_pit_clear_all(kvm); kfree(pit); return NULL; } @@ -820,7 +811,6 @@ void kvm_free_pit(struct kvm *kvm) kvm_pit_set_reinject(pit, false); hrtimer_cancel(&pit->pit_state.timer); kthread_destroy_worker(pit->worker); - kvm_pit_clear_all(kvm); kfree(pit); } } diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 0150aec4f523..4de055efc4ee 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -206,16 +206,6 @@ int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, return ret; } -void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id) -{ - int i; - - pic_lock(s); - for (i = 0; i < PIC_NUM_PINS; i++) - __clear_bit(irq_source_id, &s->irq_states[i]); - pic_unlock(s); -} - /* * acknowledge interrupt 'irq' */ diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 8c8a8062eb19..65626da1407f 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -498,16 +498,6 @@ int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, return ret; } -void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id) -{ - int i; - - spin_lock(&ioapic->lock); - for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) - __clear_bit(irq_source_id, &ioapic->irq_states[i]); - spin_unlock(&ioapic->lock); -} - static void kvm_ioapic_eoi_inject_work(struct work_struct *work) { int i; diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 289cca3aec69..dc92bd7c37bc 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -114,7 +114,6 @@ void kvm_ioapic_destroy(struct kvm *kvm); int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status); -void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, From 
cd9140ad8312234ec296d566a9d9d0b2b437ee7c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:51 -0700 Subject: [PATCH 021/101] KVM: x86: Explicitly check for in-kernel PIC when getting ExtINT Explicitly check for an in-kernel PIC when checking for a pending ExtINT in the PIC. Effectively swapping the split vs. full irqchip logic will allow guarding the in-kernel I/O APIC (and PIC) emulation with a Kconfig, and also makes it more obvious that kvm_pic_read_irq() won't result in a NULL pointer dereference. Opportunistically add WARNs in the fallthrough path, mostly to document that the userspace ExtINT logic is only relevant to split IRQ chips. Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-13-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/irq.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index b696161ec078..fb3bad0f4965 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -42,6 +42,14 @@ static int pending_userspace_extint(struct kvm_vcpu *v) return v->arch.pending_external_vector != -1; } +static int get_userspace_extint(struct kvm_vcpu *vcpu) +{ + int vector = vcpu->arch.pending_external_vector; + + vcpu->arch.pending_external_vector = -1; + return vector; +} + /* * check if there is pending interrupt from * non-APIC source without intack. @@ -68,10 +76,11 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v) if (!kvm_apic_accept_pic_intr(v)) return 0; - if (irqchip_split(v->kvm)) - return pending_userspace_extint(v); - else + if (pic_in_kernel(v->kvm)) return v->kvm->arch.vpic->output; + + WARN_ON_ONCE(!irqchip_split(v->kvm)); + return pending_userspace_extint(v); } /* @@ -127,13 +136,11 @@ int kvm_cpu_get_extint(struct kvm_vcpu *v) return v->kvm->arch.xen.upcall_vector; #endif - if (irqchip_split(v->kvm)) { - int vector = v->arch.pending_external_vector; - - v->arch.pending_external_vector = -1; - return vector; - } else + if (pic_in_kernel(v->kvm)) return kvm_pic_read_irq(v->kvm); /* PIC */ + + WARN_ON_ONCE(!irqchip_split(v->kvm)); + return get_userspace_extint(v); } EXPORT_SYMBOL_GPL(kvm_cpu_get_extint); From 2c938850d9d18cbd6484a66588fac95d74d951fd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:52 -0700 Subject: [PATCH 022/101] KVM: Move x86-only tracepoints to x86's trace.h Move the I/O APIC tracepoints and trace_kvm_msi_set_irq() to x86, as __KVM_HAVE_IOAPIC is just code for "x86", and trace_kvm_msi_set_irq() isn't unique to I/O APIC emulation. Opportunistically clean up the absurdly messy #includes in ioapic.c. No functional change intended. 
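As an aside, for anyone reading the moved kvm_msi_set_irq format: the destination is decoded from bits 19:12 of the MSI address, plus KVM's extended destination bits above bit 32. A worked example with illustrative values:

	__u64 address = 0xfee01000;	/* typical MSI address */
	unsigned int dst = (u8)(address >> 12) |
			   ((address >> 32) & 0xffffff00);
	/* dst == 0x01, i.e. (x)APIC destination ID 1 */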
Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-14-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/ioapic.c | 2 +- arch/x86/kvm/irq_comm.c | 12 ++---- arch/x86/kvm/trace.h | 78 ++++++++++++++++++++++++++++++++++++++ include/trace/events/kvm.h | 77 ------------------------------------- 4 files changed, 83 insertions(+), 86 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 65626da1407f..fa7481814bc6 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -41,11 +41,11 @@ #include #include #include -#include #include "ioapic.h" #include "lapic.h" #include "irq.h" +#include "trace.h" static int ioapic_service(struct kvm_ioapic *vioapic, int irq, bool line_status); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 138c675dc24b..13d84c25e503 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -15,15 +15,11 @@ #include #include -#include - -#include "irq.h" - -#include "ioapic.h" - -#include "lapic.h" - #include "hyperv.h" +#include "ioapic.h" +#include "irq.h" +#include "lapic.h" +#include "trace.h" #include "x86.h" #include "xen.h" diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index ba736cbb0587..4ef17990574d 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -260,6 +260,84 @@ TRACE_EVENT(kvm_cpuid, __entry->used_max_basic ? ", used max basic" : "") ); +#define kvm_deliver_mode \ + {0x0, "Fixed"}, \ + {0x1, "LowPrio"}, \ + {0x2, "SMI"}, \ + {0x3, "Res3"}, \ + {0x4, "NMI"}, \ + {0x5, "INIT"}, \ + {0x6, "SIPI"}, \ + {0x7, "ExtINT"} + +TRACE_EVENT(kvm_ioapic_set_irq, + TP_PROTO(__u64 e, int pin, bool coalesced), + TP_ARGS(e, pin, coalesced), + + TP_STRUCT__entry( + __field( __u64, e ) + __field( int, pin ) + __field( bool, coalesced ) + ), + + TP_fast_assign( + __entry->e = e; + __entry->pin = pin; + __entry->coalesced = coalesced; + ), + + TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s", + __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e, + __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), + (__entry->e & (1<<11)) ? "logical" : "physical", + (__entry->e & (1<<15)) ? "level" : "edge", + (__entry->e & (1<<16)) ? "|masked" : "", + __entry->coalesced ? " (coalesced)" : "") +); + +TRACE_EVENT(kvm_ioapic_delayed_eoi_inj, + TP_PROTO(__u64 e), + TP_ARGS(e), + + TP_STRUCT__entry( + __field( __u64, e ) + ), + + TP_fast_assign( + __entry->e = e; + ), + + TP_printk("dst %x vec %u (%s|%s|%s%s)", + (u8)(__entry->e >> 56), (u8)__entry->e, + __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), + (__entry->e & (1<<11)) ? "logical" : "physical", + (__entry->e & (1<<15)) ? "level" : "edge", + (__entry->e & (1<<16)) ? "|masked" : "") +); + +TRACE_EVENT(kvm_msi_set_irq, + TP_PROTO(__u64 address, __u64 data), + TP_ARGS(address, data), + + TP_STRUCT__entry( + __field( __u64, address ) + __field( __u64, data ) + ), + + TP_fast_assign( + __entry->address = address; + __entry->data = data; + ), + + TP_printk("dst %llx vec %u (%s|%s|%s%s)", + (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00), + (u8)__entry->data, + __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode), + (__entry->address & (1<<2)) ? "logical" : "physical", + (__entry->data & (1<<15)) ? "level" : "edge", + (__entry->address & (1<<3)) ? 
"|rh" : "") +); + #define AREG(x) { APIC_##x, "APIC_" #x } #define kvm_trace_symbol_apic \ diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index fc7d0f8ff078..96e581900c8e 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -85,83 +85,6 @@ TRACE_EVENT(kvm_set_irq, #endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */ #if defined(__KVM_HAVE_IOAPIC) -#define kvm_deliver_mode \ - {0x0, "Fixed"}, \ - {0x1, "LowPrio"}, \ - {0x2, "SMI"}, \ - {0x3, "Res3"}, \ - {0x4, "NMI"}, \ - {0x5, "INIT"}, \ - {0x6, "SIPI"}, \ - {0x7, "ExtINT"} - -TRACE_EVENT(kvm_ioapic_set_irq, - TP_PROTO(__u64 e, int pin, bool coalesced), - TP_ARGS(e, pin, coalesced), - - TP_STRUCT__entry( - __field( __u64, e ) - __field( int, pin ) - __field( bool, coalesced ) - ), - - TP_fast_assign( - __entry->e = e; - __entry->pin = pin; - __entry->coalesced = coalesced; - ), - - TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s", - __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e, - __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), - (__entry->e & (1<<11)) ? "logical" : "physical", - (__entry->e & (1<<15)) ? "level" : "edge", - (__entry->e & (1<<16)) ? "|masked" : "", - __entry->coalesced ? " (coalesced)" : "") -); - -TRACE_EVENT(kvm_ioapic_delayed_eoi_inj, - TP_PROTO(__u64 e), - TP_ARGS(e), - - TP_STRUCT__entry( - __field( __u64, e ) - ), - - TP_fast_assign( - __entry->e = e; - ), - - TP_printk("dst %x vec %u (%s|%s|%s%s)", - (u8)(__entry->e >> 56), (u8)__entry->e, - __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), - (__entry->e & (1<<11)) ? "logical" : "physical", - (__entry->e & (1<<15)) ? "level" : "edge", - (__entry->e & (1<<16)) ? "|masked" : "") -); - -TRACE_EVENT(kvm_msi_set_irq, - TP_PROTO(__u64 address, __u64 data), - TP_ARGS(address, data), - - TP_STRUCT__entry( - __field( __u64, address ) - __field( __u64, data ) - ), - - TP_fast_assign( - __entry->address = address; - __entry->data = data; - ), - - TP_printk("dst %llx vec %u (%s|%s|%s%s)", - (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00), - (u8)__entry->data, - __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode), - (__entry->address & (1<<2)) ? "logical" : "physical", - (__entry->data & (1<<15)) ? "level" : "edge", - (__entry->address & (1<<3)) ? "|rh" : "") -); #define kvm_irqchips \ {KVM_IRQCHIP_PIC_MASTER, "PIC master"}, \ From 628a27731e3f36de7ddce226f7e09ee70e40ed66 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:53 -0700 Subject: [PATCH 023/101] KVM: x86: Add CONFIG_KVM_IOAPIC to allow disabling in-kernel I/O APIC Add a Kconfig to allow building KVM without support for emulating a I/O APIC, PIC, and PIT, which is desirable for deployments that effectively don't support a fully in-kernel IRQ chip, i.e. never expect any VMM to create an in-kernel I/O APIC. E.g. compiling out support eliminates a few thousand lines of guest-facing code and gives security folks warm fuzzies. As a bonus, wrapping relevant paths with CONFIG_KVM_IOAPIC #ifdefs makes it much easier for readers to understand which bits and pieces exist specifically for fully in-kernel IRQ chips. Opportunistically convert all two in-kernel uses of __KVM_HAVE_IOAPIC to CONFIG_KVM_IOAPIC, e.g. rather than add a second #ifdef to generate a stub for kvm_arch_post_irq_routing_update(). 
Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-15-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/Kconfig | 10 ++++++++++ arch/x86/kvm/Makefile | 5 +++-- arch/x86/kvm/i8254.h | 2 ++ arch/x86/kvm/irq.c | 8 ++++++++ arch/x86/kvm/irq.h | 9 +++++++++ arch/x86/kvm/irq_comm.c | 2 ++ arch/x86/kvm/lapic.c | 7 ++++++- arch/x86/kvm/trace.h | 2 ++ arch/x86/kvm/x86.c | 22 ++++++++++++++++++---- include/linux/kvm_host.h | 2 +- include/trace/events/kvm.h | 4 ++-- 12 files changed, 65 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a4649a234f05..8d511eb03933 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1375,9 +1375,11 @@ struct kvm_arch { atomic_t noncoherent_dma_count; #define __KVM_HAVE_ARCH_ASSIGNED_DEVICE atomic_t assigned_device_count; +#ifdef CONFIG_KVM_IOAPIC struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; +#endif atomic_t vapics_in_nmi_mode; struct mutex apic_map_lock; struct kvm_apic_map __rcu *apic_map; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 2eeffcec5382..2c86673155c9 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -166,6 +166,16 @@ config KVM_AMD_SEV Encrypted State (SEV-ES), and Secure Encrypted Virtualization with Secure Nested Paging (SEV-SNP) technologies on AMD processors. +config KVM_IOAPIC + bool "I/O APIC, PIC, and PIT emulation" + default y + depends on KVM + help + Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e. + for full in-kernel APIC emulation. + + If unsure, say Y. + config KVM_SMM bool "System Management Mode emulation" default y diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index a5d362c7b504..92c737257789 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -5,12 +5,13 @@ ccflags-$(CONFIG_KVM_WERROR) += -Werror include $(srctree)/virt/kvm/Makefile.kvm -kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ - i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ +kvm-y += x86.o emulate.o irq.o lapic.o \ + irq_comm.o cpuid.o pmu.o mtrr.o \ debugfs.o mmu/mmu.o mmu/page_track.o \ mmu/spte.o kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o +kvm-$(CONFIG_KVM_IOAPIC) += i8259.o i8254.o ioapic.o kvm-$(CONFIG_KVM_HYPERV) += hyperv.o kvm-$(CONFIG_KVM_XEN) += xen.o kvm-$(CONFIG_KVM_SMM) += smm.o diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index b9c1feb379a7..e8bd59ad8a7c 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -8,6 +8,7 @@ #include +#ifdef CONFIG_KVM_IOAPIC struct kvm_kpit_channel_state { u32 count; /* can be 65536 */ u16 latched_count; @@ -64,5 +65,6 @@ int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control) struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); void kvm_free_pit(struct kvm *kvm); +#endif /* CONFIG_KVM_IOAPIC */ #endif diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index fb3bad0f4965..4c219e9f52b0 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -76,8 +76,10 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v) if (!kvm_apic_accept_pic_intr(v)) return 0; +#ifdef CONFIG_KVM_IOAPIC if (pic_in_kernel(v->kvm)) return v->kvm->arch.vpic->output; +#endif WARN_ON_ONCE(!irqchip_split(v->kvm)); return pending_userspace_extint(v); @@ -136,8 +138,10 @@ int kvm_cpu_get_extint(struct kvm_vcpu *v) return v->kvm->arch.xen.upcall_vector; #endif +#ifdef CONFIG_KVM_IOAPIC if (pic_in_kernel(v->kvm)) return 
kvm_pic_read_irq(v->kvm); /* PIC */ +#endif WARN_ON_ONCE(!irqchip_split(v->kvm)); return get_userspace_extint(v); @@ -171,7 +175,9 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) void __kvm_migrate_timers(struct kvm_vcpu *vcpu) { __kvm_migrate_apic_timer(vcpu); +#ifdef CONFIG_KVM_IOAPIC __kvm_migrate_pit_timer(vcpu); +#endif kvm_x86_call(migrate_timers)(vcpu); } @@ -187,6 +193,7 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) return irqchip_in_kernel(kvm); } +#ifdef CONFIG_KVM_IOAPIC #define IOAPIC_ROUTING_ENTRY(irq) \ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } } @@ -273,3 +280,4 @@ int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) kvm_pic_update_irq(pic); return r; } +#endif diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 7b8b54462f95..5e62c1f79ce6 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -18,6 +18,8 @@ #include #include "lapic.h" +#ifdef CONFIG_KVM_IOAPIC + #define PIC_NUM_PINS 16 #define SELECT_PIC(irq) \ ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE) @@ -79,12 +81,19 @@ static inline int irqchip_full(struct kvm *kvm) smp_rmb(); return mode == KVM_IRQCHIP_KERNEL; } +#else /* CONFIG_KVM_IOAPIC */ +static __always_inline int irqchip_full(struct kvm *kvm) +{ + return false; +} +#endif static inline int pic_in_kernel(struct kvm *kvm) { return irqchip_full(kvm); } + static inline int irqchip_split(struct kvm *kvm) { int mode = kvm->arch.irqchip_mode; diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 13d84c25e503..14fc8db0206c 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -208,6 +208,7 @@ int kvm_set_routing_entry(struct kvm *kvm, * check kvm_arch_can_set_irq_routing() before calling this function. */ switch (ue->type) { +#ifdef CONFIG_KVM_IOAPIC case KVM_IRQ_ROUTING_IRQCHIP: if (irqchip_split(kvm)) return -EINVAL; @@ -231,6 +232,7 @@ int kvm_set_routing_entry(struct kvm *kvm, } e->irqchip.irqchip = ue->u.irqchip.irqchip; break; +#endif case KVM_IRQ_ROUTING_MSI: e->set = kvm_set_msi; e->msi.address_lo = ue->u.msi.address_lo; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 73418dc0ebb2..4cf8c1f753d3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1455,7 +1455,7 @@ static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector) static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) { - int trigger_mode; + int __maybe_unused trigger_mode; /* Eoi the ioapic only if the ioapic doesn't own the vector. 
*/ if (!kvm_ioapic_handles_vector(apic, vector)) @@ -1476,12 +1476,14 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) return; } +#ifdef CONFIG_KVM_IOAPIC if (apic_test_vector(vector, apic->regs + APIC_TMR)) trigger_mode = IOAPIC_LEVEL_TRIG; else trigger_mode = IOAPIC_EDGE_TRIG; kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); +#endif } static int apic_set_eoi(struct kvm_lapic *apic) @@ -3146,8 +3148,11 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); + +#ifdef CONFIG_KVM_IOAPIC if (ioapic_in_kernel(vcpu->kvm)) kvm_rtc_eoi_tracking_restore_one(vcpu); +#endif vcpu->arch.apic_arb_prio = 0; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 4ef17990574d..ababdba2c186 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -270,6 +270,7 @@ TRACE_EVENT(kvm_cpuid, {0x6, "SIPI"}, \ {0x7, "ExtINT"} +#ifdef CONFIG_KVM_IOAPIC TRACE_EVENT(kvm_ioapic_set_irq, TP_PROTO(__u64 e, int pin, bool coalesced), TP_ARGS(e, pin, coalesced), @@ -314,6 +315,7 @@ TRACE_EVENT(kvm_ioapic_delayed_eoi_inj, (__entry->e & (1<<15)) ? "level" : "edge", (__entry->e & (1<<16)) ? "|masked" : "") ); +#endif TRACE_EVENT(kvm_msi_set_irq, TP_PROTO(__u64 address, __u64 data), diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1d744730985e..78dfa6c1cb01 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4632,17 +4632,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_EXT_CPUID: case KVM_CAP_EXT_EMUL_CPUID: case KVM_CAP_CLOCKSOURCE: +#ifdef CONFIG_KVM_IOAPIC case KVM_CAP_PIT: + case KVM_CAP_PIT2: + case KVM_CAP_PIT_STATE2: + case KVM_CAP_REINJECT_CONTROL: +#endif case KVM_CAP_NOP_IO_DELAY: case KVM_CAP_MP_STATE: case KVM_CAP_SYNC_MMU: case KVM_CAP_USER_NMI: - case KVM_CAP_REINJECT_CONTROL: case KVM_CAP_IRQ_INJECT_STATUS: case KVM_CAP_IOEVENTFD: case KVM_CAP_IOEVENTFD_NO_LENGTH: - case KVM_CAP_PIT2: - case KVM_CAP_PIT_STATE2: + case KVM_CAP_SET_IDENTITY_MAP_ADDR: case KVM_CAP_VCPU_EVENTS: #ifdef CONFIG_KVM_HYPERV @@ -6937,9 +6940,11 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; int r = -ENOTTY; + +#ifdef CONFIG_KVM_IOAPIC /* * This union makes it completely explicit to gcc-3.x - * that these two variables' stack usage should be + * that these three variables' stack usage should be * combined, not added together. 
*/ union { @@ -6947,6 +6952,7 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) struct kvm_pit_state2 ps2; struct kvm_pit_config pit_config; } u; +#endif switch (ioctl) { case KVM_SET_TSS_ADDR: @@ -6970,6 +6976,7 @@ set_identity_unlock: case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); break; +#ifdef CONFIG_KVM_IOAPIC case KVM_CREATE_IRQCHIP: { mutex_lock(&kvm->lock); @@ -7136,6 +7143,7 @@ set_pit2_out: r = kvm_vm_ioctl_reinject(kvm, &control); break; } +#endif case KVM_SET_BOOT_CPU_ID: r = 0; mutex_lock(&kvm->lock); @@ -10595,8 +10603,10 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); +#ifdef CONFIG_KVM_IOAPIC else if (ioapic_in_kernel(vcpu->kvm)) kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); +#endif if (is_guest_mode(vcpu)) vcpu->arch.load_eoi_exitmap_pending = true; @@ -12799,7 +12809,9 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm) cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); +#ifdef CONFIG_KVM_IOAPIC kvm_free_pit(kvm); +#endif kvm_mmu_pre_destroy_vm(kvm); static_call_cond(kvm_x86_vm_pre_destroy)(kvm); @@ -12823,8 +12835,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm) } kvm_destroy_vcpus(kvm); kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); +#ifdef CONFIG_KVM_IOAPIC kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); +#endif kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); kvm_mmu_uninit_vm(kvm); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4ff5ea29e343..3b5575d0b574 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1023,7 +1023,7 @@ void kvm_unlock_all_vcpus(struct kvm *kvm); void vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); -#ifdef __KVM_HAVE_IOAPIC +#ifdef CONFIG_KVM_IOAPIC void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm); #else static inline void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 96e581900c8e..1065a81ca57f 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -84,14 +84,14 @@ TRACE_EVENT(kvm_set_irq, ); #endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */ -#if defined(__KVM_HAVE_IOAPIC) +#ifdef CONFIG_KVM_IOAPIC #define kvm_irqchips \ {KVM_IRQCHIP_PIC_MASTER, "PIC master"}, \ {KVM_IRQCHIP_PIC_SLAVE, "PIC slave"}, \ {KVM_IRQCHIP_IOAPIC, "IOAPIC"} -#endif /* defined(__KVM_HAVE_IOAPIC) */ +#endif /* CONFIG_KVM_IOAPIC */ #if defined(CONFIG_HAVE_KVM_IRQCHIP) From 141db6cd79e2d71fce3049347177f98a233d8eb2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:54 -0700 Subject: [PATCH 024/101] KVM: Squash two CONFIG_HAVE_KVM_IRQCHIP #ifdefs into one Squash two #ifdef CONFIG_HAVE_KVM_IRQCHIP regions in KVM's trace events, as the only code outside of the #ifdefs depends on CONFIG_KVM_IOAPIC, and that Kconfig only exists for x86, which unconditionally selects HAVE_KVM_IRQCHIP. No functional change intended.
Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-16-seanjc@google.com Signed-off-by: Sean Christopherson --- include/trace/events/kvm.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 1065a81ca57f..0b6b79b1a1bc 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -82,7 +82,6 @@ TRACE_EVENT(kvm_set_irq, TP_printk("gsi %u level %d source %d", __entry->gsi, __entry->level, __entry->irq_source_id) ); -#endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */ #ifdef CONFIG_KVM_IOAPIC @@ -93,8 +92,6 @@ TRACE_EVENT(kvm_set_irq, #endif /* CONFIG_KVM_IOAPIC */ -#if defined(CONFIG_HAVE_KVM_IRQCHIP) - #ifdef kvm_irqchips #define kvm_ack_irq_string "irqchip %s pin %u" #define kvm_ack_irq_parm __print_symbolic(__entry->irqchip, kvm_irqchips), __entry->pin From 8fd2a6d43a10184f8edcc7adc0547e1871df6705 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:55 -0700 Subject: [PATCH 025/101] KVM: selftests: Fall back to split IRQ chip if full in-kernel chip is unsupported Now that KVM x86 allows compiling out support for in-kernel I/O APIC (and PIC and PIT) emulation, i.e. allows disabling KVM_CREATE_IRQCHIP for all intents and purposes, fall back to a split IRQ chip for x86 if creating the full in-kernel version fails with ENOTTY. Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-17-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/kvm_util.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index a055343a7bf7..93a921deeb99 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1716,7 +1716,18 @@ void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa) /* Create an interrupt controller chip for the specified VM. */ void vm_create_irqchip(struct kvm_vm *vm) { - vm_ioctl(vm, KVM_CREATE_IRQCHIP, NULL); + int r; + + /* + * Allocate a fully in-kernel IRQ chip by default, but fall back to a + * split model (x86 only) if that fails (KVM x86 allows compiling out + * support for KVM_CREATE_IRQCHIP). + */ + r = __vm_ioctl(vm, KVM_CREATE_IRQCHIP, NULL); + if (r && errno == ENOTTY && kvm_has_cap(KVM_CAP_SPLIT_IRQCHIP)) + vm_enable_cap(vm, KVM_CAP_SPLIT_IRQCHIP, 24); + else + TEST_ASSERT_VM_VCPU_IOCTL(!r, KVM_CREATE_IRQCHIP, r, vm); vm->has_irqchip = true; } From 37b1761fe89529a1531b971d4d869b4e6d345704 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:56 -0700 Subject: [PATCH 026/101] KVM: x86: Move IRQ mask notifier infrastructure to I/O APIC emulation Move the IRQ mask logic to ioapic.c as KVM's only user is its in-kernel I/O APIC emulation. In addition to encapsulating more I/O APIC specific code, trimming down irq_comm.c helps pave the way for removing it entirely. 
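For reference, the (unchanged) consumer model for this infrastructure: embed a kvm_irq_mask_notifier, register it for a GSI, and the in-kernel I/O APIC/PIC invoke the callback whenever that line is masked or unmasked. A minimal sketch (the callback name and GSI are illustrative; the sole in-tree user is the PIT's mask_notifier):

	static void example_mask_cb(struct kvm_irq_mask_notifier *kimn,
				    bool masked)
	{
		/* react to the GSI being masked/unmasked */
	}

	struct kvm_irq_mask_notifier kimn = { .func = example_mask_cb };

	kvm_register_irq_mask_notifier(kvm, 0, &kimn);
	/* later, on teardown: */
	kvm_unregister_irq_mask_notifier(kvm, 0, &kimn);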
Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-18-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 16 -------------- arch/x86/kvm/i8254.h | 2 ++ arch/x86/kvm/i8259.c | 2 ++ arch/x86/kvm/ioapic.c | 37 +++++++++++++++++++++++++++++++++ arch/x86/kvm/ioapic.h | 16 ++++++++++++++ arch/x86/kvm/irq_comm.c | 33 ----------------------------- arch/x86/kvm/x86.c | 1 - 7 files changed, 57 insertions(+), 50 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8d511eb03933..44fd9ccc0624 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1427,9 +1427,6 @@ struct kvm_arch { struct delayed_work kvmclock_update_work; struct delayed_work kvmclock_sync_work; - /* reads protected by irq_srcu, writes by irq_lock */ - struct hlist_head mask_notifier_list; - #ifdef CONFIG_KVM_HYPERV struct kvm_hv hyperv; #endif @@ -2039,19 +2036,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); -struct kvm_irq_mask_notifier { - void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked); - int irq; - struct hlist_node link; -}; - -void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, - struct kvm_irq_mask_notifier *kimn); -void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, - struct kvm_irq_mask_notifier *kimn); -void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, - bool mask); - extern bool tdp_enabled; u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index e8bd59ad8a7c..60fa499d2f8a 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -8,6 +8,8 @@ #include +#include "ioapic.h" + #ifdef CONFIG_KVM_IOAPIC struct kvm_kpit_channel_state { u32 count; /* can be 65536 */ diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 4de055efc4ee..2ac7f1678c46 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -31,6 +31,8 @@ #include #include #include + +#include "ioapic.h" #include "irq.h" #include diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index fa7481814bc6..2b5d389bca5f 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -310,6 +310,42 @@ void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) kvm_make_scan_ioapic_request(kvm); } +void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + + mutex_lock(&kvm->irq_lock); + kimn->irq = irq; + hlist_add_head_rcu(&kimn->link, &ioapic->mask_notifier_list); + mutex_unlock(&kvm->irq_lock); +} + +void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn) +{ + mutex_lock(&kvm->irq_lock); + hlist_del_rcu(&kimn->link); + mutex_unlock(&kvm->irq_lock); + synchronize_srcu(&kvm->irq_srcu); +} + +void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, + bool mask) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + struct kvm_irq_mask_notifier *kimn; + int idx, gsi; + + idx = srcu_read_lock(&kvm->irq_srcu); + gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); + if (gsi != -1) + hlist_for_each_entry_rcu(kimn, &ioapic->mask_notifier_list, link) + if (kimn->irq == gsi) + kimn->func(kimn, mask); + srcu_read_unlock(&kvm->irq_srcu, idx); +} + static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) { unsigned index; @@ -710,6 +746,7 
@@ int kvm_ioapic_init(struct kvm *kvm) return -ENOMEM; spin_lock_init(&ioapic->lock); INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work); + INIT_HLIST_HEAD(&ioapic->mask_notifier_list); kvm->arch.vioapic = ioapic; kvm_ioapic_reset(ioapic); kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index dc92bd7c37bc..bf28dbc11ff6 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -86,8 +86,24 @@ struct kvm_ioapic { struct delayed_work eoi_inject; u32 irq_eoi[IOAPIC_NUM_PINS]; u32 irr_delivered; + + /* reads protected by irq_srcu, writes by irq_lock */ + struct hlist_head mask_notifier_list; }; +struct kvm_irq_mask_notifier { + void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked); + int irq; + struct hlist_node link; +}; + +void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn); +void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn); +void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, + bool mask); + #ifdef DEBUG #define ASSERT(x) \ do { \ diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 14fc8db0206c..76d1c85a1011 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -161,39 +161,6 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, return -EWOULDBLOCK; } -void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, - struct kvm_irq_mask_notifier *kimn) -{ - mutex_lock(&kvm->irq_lock); - kimn->irq = irq; - hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list); - mutex_unlock(&kvm->irq_lock); -} - -void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, - struct kvm_irq_mask_notifier *kimn) -{ - mutex_lock(&kvm->irq_lock); - hlist_del_rcu(&kimn->link); - mutex_unlock(&kvm->irq_lock); - synchronize_srcu(&kvm->irq_srcu); -} - -void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, - bool mask) -{ - struct kvm_irq_mask_notifier *kimn; - int idx, gsi; - - idx = srcu_read_lock(&kvm->irq_srcu); - gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); - if (gsi != -1) - hlist_for_each_entry_rcu(kimn, &kvm->arch.mask_notifier_list, link) - if (kimn->irq == gsi) - kimn->func(kimn, mask); - srcu_read_unlock(&kvm->irq_srcu, idx); -} - bool kvm_arch_can_set_irq_routing(struct kvm *kvm) { return irqchip_in_kernel(kvm); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 78dfa6c1cb01..02237dbb7f32 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12676,7 +12676,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (ret) goto out_uninit_mmu; - INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); atomic_set(&kvm->arch.noncoherent_dma_count, 0); raw_spin_lock_init(&kvm->arch.tsc_write_lock); From e76c274513f24d92074e36e40f799b44aeb7a0fb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:57 -0700 Subject: [PATCH 027/101] KVM: x86: Fold irq_comm.c into irq.c Drop irq_comm.c, a.k.a. common IRQ APIs, as there has been no non-x86 user since commit 003f7de62589 ("KVM: ia64: remove") (at the time, irq_comm.c lived in virt/kvm, not arch/x86/kvm). 
Suggested-by: Paolo Bonzini Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-19-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/Makefile | 6 +- arch/x86/kvm/irq.c | 304 ++++++++++++++++++++++++++++++++++++- arch/x86/kvm/irq_comm.c | 325 ---------------------------------------- 3 files changed, 305 insertions(+), 330 deletions(-) delete mode 100644 arch/x86/kvm/irq_comm.c diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 92c737257789..c4b8950c7abe 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -5,10 +5,8 @@ ccflags-$(CONFIG_KVM_WERROR) += -Werror include $(srctree)/virt/kvm/Makefile.kvm -kvm-y += x86.o emulate.o irq.o lapic.o \ - irq_comm.o cpuid.o pmu.o mtrr.o \ - debugfs.o mmu/mmu.o mmu/page_track.o \ - mmu/spte.o +kvm-y += x86.o emulate.o irq.o lapic.o cpuid.o pmu.o mtrr.o \ + debugfs.o mmu/mmu.o mmu/page_track.o mmu/spte.o kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o kvm-$(CONFIG_KVM_IOAPIC) += i8259.o i8254.o ioapic.o diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 4c219e9f52b0..a0b1499baf6e 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -12,9 +12,10 @@ #include #include +#include "hyperv.h" #include "ioapic.h" #include "irq.h" -#include "i8254.h" +#include "trace.h" #include "x86.h" #include "xen.h" @@ -193,6 +194,307 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) return irqchip_in_kernel(kvm); } +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, struct dest_map *dest_map) +{ + int r = -1; + struct kvm_vcpu *vcpu, *lowest = NULL; + unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; + unsigned int dest_vcpus = 0; + + if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) + return r; + + if (irq->dest_mode == APIC_DEST_PHYSICAL && + irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { + pr_info("apic: phys broadcast and lowest prio\n"); + irq->delivery_mode = APIC_DM_FIXED; + } + + memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + + if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, + irq->dest_id, irq->dest_mode)) + continue; + + if (!kvm_lowest_prio_delivery(irq)) { + if (r < 0) + r = 0; + r += kvm_apic_set_irq(vcpu, irq, dest_map); + } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) { + if (!kvm_vector_hashing_enabled()) { + if (!lowest) + lowest = vcpu; + else if (kvm_apic_compare_prio(vcpu, lowest) < 0) + lowest = vcpu; + } else { + __set_bit(i, dest_vcpu_bitmap); + dest_vcpus++; + } + } + } + + if (dest_vcpus != 0) { + int idx = kvm_vector_to_index(irq->vector, dest_vcpus, + dest_vcpu_bitmap, KVM_MAX_VCPUS); + + lowest = kvm_get_vcpu(kvm, idx); + } + + if (lowest) + r = kvm_apic_set_irq(lowest, irq, dest_map); + + return r; +} + +void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, + struct kvm_lapic_irq *irq) +{ + struct msi_msg msg = { .address_lo = e->msi.address_lo, + .address_hi = e->msi.address_hi, + .data = e->msi.data }; + + trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ? 
+ (u64)msg.address_hi << 32 : 0), msg.data); + + irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format); + irq->vector = msg.arch_data.vector; + irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical); + irq->trig_mode = msg.arch_data.is_level; + irq->delivery_mode = msg.arch_data.delivery_mode << 8; + irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint; + irq->level = 1; + irq->shorthand = APIC_DEST_NOSHORT; +} +EXPORT_SYMBOL_GPL(kvm_set_msi_irq); + +static inline bool kvm_msi_route_invalid(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e) +{ + return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff); +} + +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, bool line_status) +{ + struct kvm_lapic_irq irq; + + if (kvm_msi_route_invalid(kvm, e)) + return -EINVAL; + + if (!level) + return -1; + + kvm_set_msi_irq(kvm, e, &irq); + + return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL); +} + +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + struct kvm_lapic_irq irq; + int r; + + switch (e->type) { +#ifdef CONFIG_KVM_HYPERV + case KVM_IRQ_ROUTING_HV_SINT: + return kvm_hv_synic_set_irq(e, kvm, irq_source_id, level, + line_status); +#endif + + case KVM_IRQ_ROUTING_MSI: + if (kvm_msi_route_invalid(kvm, e)) + return -EINVAL; + + kvm_set_msi_irq(kvm, e, &irq); + + if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) + return r; + break; + +#ifdef CONFIG_KVM_XEN + case KVM_IRQ_ROUTING_XEN_EVTCHN: + if (!level) + return -1; + + return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm); +#endif + default: + break; + } + + return -EWOULDBLOCK; +} + +bool kvm_arch_can_set_irq_routing(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm); +} + +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) +{ + /* We can't check irqchip_in_kernel() here as some callers are + * currently initializing the irqchip. Other callers should therefore + * check kvm_arch_can_set_irq_routing() before calling this function. 
+ */ + switch (ue->type) { +#ifdef CONFIG_KVM_IOAPIC + case KVM_IRQ_ROUTING_IRQCHIP: + if (irqchip_split(kvm)) + return -EINVAL; + e->irqchip.pin = ue->u.irqchip.pin; + switch (ue->u.irqchip.irqchip) { + case KVM_IRQCHIP_PIC_SLAVE: + e->irqchip.pin += PIC_NUM_PINS / 2; + fallthrough; + case KVM_IRQCHIP_PIC_MASTER: + if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2) + return -EINVAL; + e->set = kvm_pic_set_irq; + break; + case KVM_IRQCHIP_IOAPIC: + if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS) + return -EINVAL; + e->set = kvm_ioapic_set_irq; + break; + default: + return -EINVAL; + } + e->irqchip.irqchip = ue->u.irqchip.irqchip; + break; +#endif + case KVM_IRQ_ROUTING_MSI: + e->set = kvm_set_msi; + e->msi.address_lo = ue->u.msi.address_lo; + e->msi.address_hi = ue->u.msi.address_hi; + e->msi.data = ue->u.msi.data; + + if (kvm_msi_route_invalid(kvm, e)) + return -EINVAL; + break; +#ifdef CONFIG_KVM_HYPERV + case KVM_IRQ_ROUTING_HV_SINT: + e->set = kvm_hv_synic_set_irq; + e->hv_sint.vcpu = ue->u.hv_sint.vcpu; + e->hv_sint.sint = ue->u.hv_sint.sint; + break; +#endif +#ifdef CONFIG_KVM_XEN + case KVM_IRQ_ROUTING_XEN_EVTCHN: + return kvm_xen_setup_evtchn(kvm, e, ue); +#endif + default: + return -EINVAL; + } + + return 0; +} + +bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu) +{ + int r = 0; + unsigned long i; + struct kvm_vcpu *vcpu; + + if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu)) + return true; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + + if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand, + irq->dest_id, irq->dest_mode)) + continue; + + if (++r == 2) + return false; + + *dest_vcpu = vcpu; + } + + return r == 1; +} +EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu); + +void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode, + u8 vector, unsigned long *ioapic_handled_vectors) +{ + /* + * Intercept EOI if the vCPU is the target of the new IRQ routing, or + * the vCPU has a pending IRQ from the old routing, i.e. if the vCPU + * may receive a level-triggered IRQ in the future, or already received + * level-triggered IRQ. The EOI needs to be intercepted and forwarded + * to I/O APIC emulation so that the IRQ can be de-asserted. + */ + if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) { + __set_bit(vector, ioapic_handled_vectors); + } else if (kvm_apic_pending_eoi(vcpu, vector)) { + __set_bit(vector, ioapic_handled_vectors); + + /* + * Track the highest pending EOI for which the vCPU is NOT the + * target in the new routing. Only the EOI for the IRQ that is + * in-flight (for the old routing) needs to be intercepted, any + * future IRQs that arrive on this vCPU will be coincidental to + * the level-triggered routing and don't need to be intercepted. 
+ */ + if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi) + vcpu->arch.highest_stale_pending_ioapic_eoi = vector; + } +} + +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, + ulong *ioapic_handled_vectors) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_kernel_irq_routing_entry *entry; + struct kvm_irq_routing_table *table; + u32 i, nr_ioapic_pins; + int idx; + + idx = srcu_read_lock(&kvm->irq_srcu); + table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + nr_ioapic_pins = min_t(u32, table->nr_rt_entries, + kvm->arch.nr_reserved_ioapic_pins); + for (i = 0; i < nr_ioapic_pins; ++i) { + hlist_for_each_entry(entry, &table->map[i], link) { + struct kvm_lapic_irq irq; + + if (entry->type != KVM_IRQ_ROUTING_MSI) + continue; + + kvm_set_msi_irq(vcpu->kvm, entry, &irq); + + if (!irq.trig_mode) + continue; + + kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode, + irq.vector, ioapic_handled_vectors); + } + } + srcu_read_unlock(&kvm->irq_srcu, idx); +} + +void kvm_arch_irq_routing_update(struct kvm *kvm) +{ +#ifdef CONFIG_KVM_HYPERV + kvm_hv_irq_routing_update(kvm); +#endif + + if (irqchip_split(kvm)) + kvm_make_scan_ioapic_request(kvm); +} + #ifdef CONFIG_KVM_IOAPIC #define IOAPIC_ROUTING_ENTRY(irq) \ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c deleted file mode 100644 index 76d1c85a1011..000000000000 --- a/arch/x86/kvm/irq_comm.c +++ /dev/null @@ -1,325 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * irq_comm.c: Common API for in kernel interrupt controller - * Copyright (c) 2007, Intel Corporation. - * - * Authors: - * Yaozu (Eddie) Dong - * - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include - -#include "hyperv.h" -#include "ioapic.h" -#include "irq.h" -#include "lapic.h" -#include "trace.h" -#include "x86.h" -#include "xen.h" - -int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, struct dest_map *dest_map) -{ - int r = -1; - struct kvm_vcpu *vcpu, *lowest = NULL; - unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; - unsigned int dest_vcpus = 0; - - if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) - return r; - - if (irq->dest_mode == APIC_DEST_PHYSICAL && - irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { - pr_info("apic: phys broadcast and lowest prio\n"); - irq->delivery_mode = APIC_DM_FIXED; - } - - memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!kvm_apic_present(vcpu)) - continue; - - if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, - irq->dest_id, irq->dest_mode)) - continue; - - if (!kvm_lowest_prio_delivery(irq)) { - if (r < 0) - r = 0; - r += kvm_apic_set_irq(vcpu, irq, dest_map); - } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) { - if (!kvm_vector_hashing_enabled()) { - if (!lowest) - lowest = vcpu; - else if (kvm_apic_compare_prio(vcpu, lowest) < 0) - lowest = vcpu; - } else { - __set_bit(i, dest_vcpu_bitmap); - dest_vcpus++; - } - } - } - - if (dest_vcpus != 0) { - int idx = kvm_vector_to_index(irq->vector, dest_vcpus, - dest_vcpu_bitmap, KVM_MAX_VCPUS); - - lowest = kvm_get_vcpu(kvm, idx); - } - - if (lowest) - r = kvm_apic_set_irq(lowest, irq, dest_map); - - return r; -} - -void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, - struct kvm_lapic_irq *irq) -{ - struct msi_msg msg = { .address_lo = e->msi.address_lo, - .address_hi 
= e->msi.address_hi, - .data = e->msi.data }; - - trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ? - (u64)msg.address_hi << 32 : 0), msg.data); - - irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format); - irq->vector = msg.arch_data.vector; - irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical); - irq->trig_mode = msg.arch_data.is_level; - irq->delivery_mode = msg.arch_data.delivery_mode << 8; - irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint; - irq->level = 1; - irq->shorthand = APIC_DEST_NOSHORT; -} -EXPORT_SYMBOL_GPL(kvm_set_msi_irq); - -static inline bool kvm_msi_route_invalid(struct kvm *kvm, - struct kvm_kernel_irq_routing_entry *e) -{ - return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff); -} - -int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level, bool line_status) -{ - struct kvm_lapic_irq irq; - - if (kvm_msi_route_invalid(kvm, e)) - return -EINVAL; - - if (!level) - return -1; - - kvm_set_msi_irq(kvm, e, &irq); - - return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL); -} - -int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level, - bool line_status) -{ - struct kvm_lapic_irq irq; - int r; - - switch (e->type) { -#ifdef CONFIG_KVM_HYPERV - case KVM_IRQ_ROUTING_HV_SINT: - return kvm_hv_synic_set_irq(e, kvm, irq_source_id, level, - line_status); -#endif - - case KVM_IRQ_ROUTING_MSI: - if (kvm_msi_route_invalid(kvm, e)) - return -EINVAL; - - kvm_set_msi_irq(kvm, e, &irq); - - if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) - return r; - break; - -#ifdef CONFIG_KVM_XEN - case KVM_IRQ_ROUTING_XEN_EVTCHN: - if (!level) - return -1; - - return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm); -#endif - default: - break; - } - - return -EWOULDBLOCK; -} - -bool kvm_arch_can_set_irq_routing(struct kvm *kvm) -{ - return irqchip_in_kernel(kvm); -} - -int kvm_set_routing_entry(struct kvm *kvm, - struct kvm_kernel_irq_routing_entry *e, - const struct kvm_irq_routing_entry *ue) -{ - /* We can't check irqchip_in_kernel() here as some callers are - * currently initializing the irqchip. Other callers should therefore - * check kvm_arch_can_set_irq_routing() before calling this function. 
- */ - switch (ue->type) { -#ifdef CONFIG_KVM_IOAPIC - case KVM_IRQ_ROUTING_IRQCHIP: - if (irqchip_split(kvm)) - return -EINVAL; - e->irqchip.pin = ue->u.irqchip.pin; - switch (ue->u.irqchip.irqchip) { - case KVM_IRQCHIP_PIC_SLAVE: - e->irqchip.pin += PIC_NUM_PINS / 2; - fallthrough; - case KVM_IRQCHIP_PIC_MASTER: - if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2) - return -EINVAL; - e->set = kvm_pic_set_irq; - break; - case KVM_IRQCHIP_IOAPIC: - if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS) - return -EINVAL; - e->set = kvm_ioapic_set_irq; - break; - default: - return -EINVAL; - } - e->irqchip.irqchip = ue->u.irqchip.irqchip; - break; -#endif - case KVM_IRQ_ROUTING_MSI: - e->set = kvm_set_msi; - e->msi.address_lo = ue->u.msi.address_lo; - e->msi.address_hi = ue->u.msi.address_hi; - e->msi.data = ue->u.msi.data; - - if (kvm_msi_route_invalid(kvm, e)) - return -EINVAL; - break; -#ifdef CONFIG_KVM_HYPERV - case KVM_IRQ_ROUTING_HV_SINT: - e->set = kvm_hv_synic_set_irq; - e->hv_sint.vcpu = ue->u.hv_sint.vcpu; - e->hv_sint.sint = ue->u.hv_sint.sint; - break; -#endif -#ifdef CONFIG_KVM_XEN - case KVM_IRQ_ROUTING_XEN_EVTCHN: - return kvm_xen_setup_evtchn(kvm, e, ue); -#endif - default: - return -EINVAL; - } - - return 0; -} - -bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, - struct kvm_vcpu **dest_vcpu) -{ - int r = 0; - unsigned long i; - struct kvm_vcpu *vcpu; - - if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu)) - return true; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!kvm_apic_present(vcpu)) - continue; - - if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand, - irq->dest_id, irq->dest_mode)) - continue; - - if (++r == 2) - return false; - - *dest_vcpu = vcpu; - } - - return r == 1; -} -EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu); - -void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode, - u8 vector, unsigned long *ioapic_handled_vectors) -{ - /* - * Intercept EOI if the vCPU is the target of the new IRQ routing, or - * the vCPU has a pending IRQ from the old routing, i.e. if the vCPU - * may receive a level-triggered IRQ in the future, or already received - * level-triggered IRQ. The EOI needs to be intercepted and forwarded - * to I/O APIC emulation so that the IRQ can be de-asserted. - */ - if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) { - __set_bit(vector, ioapic_handled_vectors); - } else if (kvm_apic_pending_eoi(vcpu, vector)) { - __set_bit(vector, ioapic_handled_vectors); - - /* - * Track the highest pending EOI for which the vCPU is NOT the - * target in the new routing. Only the EOI for the IRQ that is - * in-flight (for the old routing) needs to be intercepted, any - * future IRQs that arrive on this vCPU will be coincidental to - * the level-triggered routing and don't need to be intercepted. 
- */ - if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi) - vcpu->arch.highest_stale_pending_ioapic_eoi = vector; - } -} - -void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, - ulong *ioapic_handled_vectors) -{ - struct kvm *kvm = vcpu->kvm; - struct kvm_kernel_irq_routing_entry *entry; - struct kvm_irq_routing_table *table; - u32 i, nr_ioapic_pins; - int idx; - - idx = srcu_read_lock(&kvm->irq_srcu); - table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - nr_ioapic_pins = min_t(u32, table->nr_rt_entries, - kvm->arch.nr_reserved_ioapic_pins); - for (i = 0; i < nr_ioapic_pins; ++i) { - hlist_for_each_entry(entry, &table->map[i], link) { - struct kvm_lapic_irq irq; - - if (entry->type != KVM_IRQ_ROUTING_MSI) - continue; - - kvm_set_msi_irq(vcpu->kvm, entry, &irq); - - if (!irq.trig_mode) - continue; - - kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode, - irq.vector, ioapic_handled_vectors); - } - } - srcu_read_unlock(&kvm->irq_srcu, idx); -} - -void kvm_arch_irq_routing_update(struct kvm *kvm) -{ -#ifdef CONFIG_KVM_HYPERV - kvm_hv_irq_routing_update(kvm); -#endif - - if (irqchip_split(kvm)) - kvm_make_scan_ioapic_request(kvm); -} From cb210737675ef4c1ad88721e84558eeb2f199312 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:06 -0700 Subject: [PATCH 028/101] KVM: Pass new routing entries and irqfd when updating IRTEs When updating IRTEs in response to a GSI routing or IRQ bypass change, pass the new/current routing information along with the associated irqfd. This will allow KVM x86 to harden, simplify, and deduplicate its code. Since adding/removing a bypass producer is now conveniently protected with irqfds.lock, i.e. can't run concurrently with kvm_irq_routing_update(), use the routing information cached in the irqfd instead of looking up the information in the current GSI routing tables. Opportunistically convert an existing printk() to pr_info() and put its string onto a single line (old code that strictly adhered to 80 chars). Link: https://lore.kernel.org/r/20250611224604.313496-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/arm64/kvm/arm.c | 7 ++++--- arch/x86/include/asm/kvm_host.h | 6 ++++-- arch/x86/kvm/svm/avic.c | 18 +++++++---------- arch/x86/kvm/svm/svm.h | 5 +++-- arch/x86/kvm/vmx/posted_intr.c | 19 ++++++++--------- arch/x86/kvm/vmx/posted_intr.h | 8 ++++++-- arch/x86/kvm/x86.c | 36 ++++++++++++++++++--------------- include/linux/kvm_host.h | 7 +++++-- virt/kvm/eventfd.c | 11 +++++----- 9 files changed, 62 insertions(+), 55 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 38a91bb5d4c7..a9a39e0375f7 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2771,8 +2771,9 @@ bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, return memcmp(&old->msi, &new->msi, sizeof(new->msi)); } -int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) +int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) { /* * Remapping the vLPI requires taking the its_lock mutex to resolve @@ -2781,7 +2782,7 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, * * Unmap the vLPI and fall back to software LPI injection. 
*/ - return kvm_vgic_v4_unset_forwarding(kvm, host_irq); + return kvm_vgic_v4_unset_forwarding(irqfd->kvm, irqfd->producer->irq); } void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 44fd9ccc0624..2a14564dd08a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -297,6 +297,7 @@ enum x86_intercept_stage; */ #define KVM_APIC_PV_EOI_PENDING 1 +struct kvm_kernel_irqfd; struct kvm_kernel_irq_routing_entry; /* @@ -1845,8 +1846,9 @@ struct kvm_x86_ops { void (*vcpu_blocking)(struct kvm_vcpu *vcpu); void (*vcpu_unblocking)(struct kvm_vcpu *vcpu); - int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set); + int (*pi_update_irte)(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, + unsigned int host_irq, uint32_t guest_irq, + struct kvm_kernel_irq_routing_entry *new); void (*pi_start_assignment)(struct kvm *kvm); void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 067f8e3f5a0d..49b73907de92 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -886,21 +887,14 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, return 0; } -/* - * avic_pi_update_irte - set IRTE for Posted-Interrupts - * - * @kvm: kvm - * @host_irq: host irq of the interrupt - * @guest_irq: gsi of the interrupt - * @set: set or unset PI - * returns 0 on success, < 0 on failure - */ -int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) +int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, + unsigned int host_irq, uint32_t guest_irq, + struct kvm_kernel_irq_routing_entry *new) { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; bool enable_remapped_mode = true; + bool set = !!new; int idx, ret = 0; if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass()) @@ -926,6 +920,8 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, if (e->type != KVM_IRQ_ROUTING_MSI) continue; + WARN_ON_ONCE(new && memcmp(e, new, sizeof(*new))); + /** * Here, we setup with legacy mode in the following cases: * 1. When cannot target interrupt to a specific vcpu. 
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index e6f3c6a153a0..b35fce30d923 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -736,8 +736,9 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void avic_vcpu_put(struct kvm_vcpu *vcpu); void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu); void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); -int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set); +int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, + unsigned int host_irq, uint32_t guest_irq, + struct kvm_kernel_irq_routing_entry *new); void avic_vcpu_blocking(struct kvm_vcpu *vcpu); void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); void avic_ring_doorbell(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 5c615e5845bf..110fb19848ab 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -2,6 +2,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include @@ -294,17 +295,9 @@ void vmx_pi_start_assignment(struct kvm *kvm) kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK); } -/* - * vmx_pi_update_irte - set IRTE for Posted-Interrupts - * - * @kvm: kvm - * @host_irq: host irq of the interrupt - * @guest_irq: gsi of the interrupt - * @set: set or unset PI - * returns 0 on success, < 0 on failure - */ -int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) +int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, + unsigned int host_irq, uint32_t guest_irq, + struct kvm_kernel_irq_routing_entry *new) { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; @@ -312,6 +305,7 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, struct kvm_lapic_irq irq; struct kvm_vcpu *vcpu; struct vcpu_data vcpu_info; + bool set = !!new; int idx, ret = 0; if (!vmx_can_use_vtd_pi(kvm)) @@ -329,6 +323,9 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { if (e->type != KVM_IRQ_ROUTING_MSI) continue; + + WARN_ON_ONCE(new && memcmp(e, new, sizeof(*new))); + /* * VT-d PI cannot support posting multicast/broadcast * interrupts to a vCPU, we still use interrupt remapping diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 80499ea0e674..a94afcb55f7f 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -3,6 +3,9 @@ #define __KVM_X86_VMX_POSTED_INTR_H #include +#include +#include + #include void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); @@ -11,8 +14,9 @@ void pi_wakeup_handler(void); void __init pi_init_cpu(int cpu); void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); -int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set); +int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, + unsigned int host_irq, uint32_t guest_irq, + struct kvm_kernel_irq_routing_entry *new); void vmx_pi_start_assignment(struct kvm *kvm); static inline int pi_find_highest_vector(struct pi_desc *pi_desc) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 02237dbb7f32..e1304b002062 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13507,31 +13507,31 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, struct kvm_kernel_irqfd *irqfd = container_of(cons, 
struct kvm_kernel_irqfd, consumer); struct kvm *kvm = irqfd->kvm; - int ret; + int ret = 0; kvm_arch_start_assignment(irqfd->kvm); spin_lock_irq(&kvm->irqfds.lock); irqfd->producer = prod; - ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, - prod->irq, irqfd->gsi, 1); - if (ret) - kvm_arch_end_assignment(irqfd->kvm); - + if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) { + ret = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, prod->irq, + irqfd->gsi, &irqfd->irq_entry); + if (ret) + kvm_arch_end_assignment(irqfd->kvm); + } spin_unlock_irq(&kvm->irqfds.lock); - return ret; } void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, struct irq_bypass_producer *prod) { - int ret; struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); struct kvm *kvm = irqfd->kvm; + int ret; WARN_ON(irqfd->producer != prod); @@ -13544,11 +13544,13 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, spin_lock_irq(&kvm->irqfds.lock); irqfd->producer = NULL; - ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, - prod->irq, irqfd->gsi, 0); - if (ret) - printk(KERN_INFO "irq bypass consumer (eventfd %p) unregistration" - " fails: %d\n", irqfd->consumer.eventfd, ret); + if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) { + ret = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, prod->irq, + irqfd->gsi, NULL); + if (ret) + pr_info("irq bypass consumer (eventfd %p) unregistration fails: %d\n", + irqfd->consumer.eventfd, ret); + } spin_unlock_irq(&kvm->irqfds.lock); @@ -13556,10 +13558,12 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, kvm_arch_end_assignment(irqfd->kvm); } -int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) +int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) { - return kvm_x86_call(pi_update_irte)(kvm, host_irq, guest_irq, set); + return kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, irqfd->producer->irq, + irqfd->gsi, new); } bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3b5575d0b574..a4160c1c0c6b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2401,6 +2401,8 @@ struct kvm_vcpu *kvm_get_running_vcpu(void); struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) +struct kvm_kernel_irqfd; + bool kvm_arch_has_irq_bypass(void); int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *, struct irq_bypass_producer *); @@ -2408,8 +2410,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *, struct irq_bypass_producer *); void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *); void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *); -int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set); +int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new); bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *, struct kvm_kernel_irq_routing_entry *); #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */ diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 5bc6abe30748..bd1766da6895 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -285,9 +285,9 @@ void __attribute__((weak)) kvm_arch_irq_bypass_start( { } -int 
__attribute__((weak)) kvm_arch_update_irqfd_routing( - struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) +int __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) { return 0; } @@ -618,9 +618,8 @@ void kvm_irq_routing_update(struct kvm *kvm) #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) if (irqfd->producer && kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) { - int ret = kvm_arch_update_irqfd_routing( - irqfd->kvm, irqfd->producer->irq, - irqfd->gsi, 1); + int ret = kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry); + WARN_ON(ret); } #endif From 05c5e23657e1d61c271c2f4a3a21d4d630b18a9b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:07 -0700 Subject: [PATCH 029/101] KVM: SVM: Track per-vCPU IRTEs using kvm_kernel_irqfd structure Track the IRTEs that are posting to an SVM vCPU via the associated irqfd structure and GSI routing instead of dynamically allocating a separate data structure. In addition to eliminating an atomic allocation, this will allow hoisting much of the IRTE update logic to common x86. Cc: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 71 +++++++++++++++------------------------ arch/x86/kvm/svm/svm.h | 10 +++--- include/linux/kvm_irqfd.h | 3 ++ 3 files changed, 36 insertions(+), 48 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 49b73907de92..accc36958a75 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -76,14 +76,6 @@ static bool next_vm_id_wrapped = 0; static DEFINE_SPINLOCK(svm_vm_data_hash_lock); bool x2avic_enabled; -/* - * This is a wrapper of struct amd_iommu_ir_data. 
- */ -struct amd_svm_iommu_ir { - struct list_head node; /* Used by SVM for per-vcpu ir_list */ - void *data; /* Storing pointer to struct amd_ir_data */ -}; - static void avic_activate_vmcb(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb01.ptr; @@ -747,8 +739,8 @@ static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) { int ret = 0; unsigned long flags; - struct amd_svm_iommu_ir *ir; struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_kernel_irqfd *irqfd; if (!kvm_arch_has_assigned_device(vcpu->kvm)) return 0; @@ -762,11 +754,11 @@ static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) if (list_empty(&svm->ir_list)) goto out; - list_for_each_entry(ir, &svm->ir_list, node) { + list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) { if (activate) - ret = amd_iommu_activate_guest_mode(ir->data); + ret = amd_iommu_activate_guest_mode(irqfd->irq_bypass_data); else - ret = amd_iommu_deactivate_guest_mode(ir->data); + ret = amd_iommu_deactivate_guest_mode(irqfd->irq_bypass_data); if (ret) break; } @@ -775,27 +767,30 @@ out: return ret; } -static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) +static void svm_ir_list_del(struct vcpu_svm *svm, + struct kvm_kernel_irqfd *irqfd, + struct amd_iommu_pi_data *pi) { unsigned long flags; - struct amd_svm_iommu_ir *cur; + struct kvm_kernel_irqfd *cur; spin_lock_irqsave(&svm->ir_list_lock, flags); - list_for_each_entry(cur, &svm->ir_list, node) { - if (cur->data != pi->ir_data) + list_for_each_entry(cur, &svm->ir_list, vcpu_list) { + if (cur->irq_bypass_data != pi->ir_data) continue; - list_del(&cur->node); - kfree(cur); + if (WARN_ON_ONCE(cur != irqfd)) + continue; + list_del(&irqfd->vcpu_list); break; } spin_unlock_irqrestore(&svm->ir_list_lock, flags); } -static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) +static int svm_ir_list_add(struct vcpu_svm *svm, + struct kvm_kernel_irqfd *irqfd, + struct amd_iommu_pi_data *pi) { - int ret = 0; unsigned long flags; - struct amd_svm_iommu_ir *ir; u64 entry; if (WARN_ON_ONCE(!pi->ir_data)) @@ -812,25 +807,14 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); struct vcpu_svm *prev_svm; - if (!prev_vcpu) { - ret = -EINVAL; - goto out; - } + if (!prev_vcpu) + return -EINVAL; prev_svm = to_svm(prev_vcpu); - svm_ir_list_del(prev_svm, pi); + svm_ir_list_del(prev_svm, irqfd, pi); } - /** - * Allocating new amd_iommu_pi_data, which will get - * add to the per-vcpu ir_list. - */ - ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT); - if (!ir) { - ret = -ENOMEM; - goto out; - } - ir->data = pi->ir_data; + irqfd->irq_bypass_data = pi->ir_data; spin_lock_irqsave(&svm->ir_list_lock, flags); @@ -845,10 +829,9 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK, true, pi->ir_data); - list_add(&ir->node, &svm->ir_list); + list_add(&irqfd->vcpu_list, &svm->ir_list); spin_unlock_irqrestore(&svm->ir_list_lock, flags); -out: - return ret; + return 0; } /* @@ -952,7 +935,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, * scheduling information in IOMMU irte. 
*/ if (!ret && pi.is_guest_mode) - svm_ir_list_add(svm, &pi); + svm_ir_list_add(svm, irqfd, &pi); } if (!ret && svm) { @@ -993,7 +976,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, vcpu = kvm_get_vcpu_by_id(kvm, id); if (vcpu) - svm_ir_list_del(to_svm(vcpu), &pi); + svm_ir_list_del(to_svm(vcpu), irqfd, &pi); } } out: @@ -1005,8 +988,8 @@ static inline int avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) { int ret = 0; - struct amd_svm_iommu_ir *ir; struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_kernel_irqfd *irqfd; lockdep_assert_held(&svm->ir_list_lock); @@ -1020,8 +1003,8 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) if (list_empty(&svm->ir_list)) return 0; - list_for_each_entry(ir, &svm->ir_list, node) { - ret = amd_iommu_update_ga(cpu, r, ir->data); + list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) { + ret = amd_iommu_update_ga(cpu, r, irqfd->irq_bypass_data); if (ret) return ret; } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index b35fce30d923..cc27877d69ae 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -310,10 +310,12 @@ struct vcpu_svm { u64 *avic_physical_id_cache; /* - * Per-vcpu list of struct amd_svm_iommu_ir: - * This is used mainly to store interrupt remapping information used - * when update the vcpu affinity. This avoids the need to scan for - * IRTE and try to match ga_tag in the IOMMU driver. + * Per-vCPU list of irqfds that are eligible to post IRQs directly to + * the vCPU (a.k.a. device posted IRQs, a.k.a. IRQ bypass). The list + * is used to reconfigure IRTEs when the vCPU is loaded/put (to set the + * target pCPU), when AVIC is toggled on/off (to (de)activate bypass), + * and if the irqfd becomes ineligible for posting (to put the IRTE + * back into remapped mode). */ struct list_head ir_list; spinlock_t ir_list_lock; diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h index 8ad43692e3bb..6510a48e62aa 100644 --- a/include/linux/kvm_irqfd.h +++ b/include/linux/kvm_irqfd.h @@ -59,6 +59,9 @@ struct kvm_kernel_irqfd { struct work_struct shutdown; struct irq_bypass_consumer consumer; struct irq_bypass_producer *producer; + + struct list_head vcpu_list; + void *irq_bypass_data; }; #endif /* __LINUX_KVM_IRQFD_H */ From 0a917e9d4b7070fafcbf7a8ec32d2aa444b4e757 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:08 -0700 Subject: [PATCH 030/101] KVM: SVM: Delete IRTE link from previous vCPU before setting new IRTE Delete the previous per-vCPU IRTE link prior to modifying the IRTE. If forcing the IRTE back to remapped mode fails, the IRQ is already broken; keeping stale metadata won't change that, and the IOMMU should be sufficiently paranoid to sanitize the IRTE when the IRQ is freed and reallocated. This will allow hoisting the vCPU tracking to common x86, which in turn will allow most of the IRTE update code to be deduplicated. 
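Conceptually, the update path after this patch orders operations as follows (a rough sketch of avic_pi_update_irte(), not the literal diff):

	svm_ir_list_del(irqfd);				/* unlink from the previous vCPU */
	ret = irq_set_vcpu_affinity(host_irq, &pi);	/* reconfigure the IRTE */
	if (!ret && pi.is_guest_mode)
		svm_ir_list_add(svm, irqfd, &pi);	/* link to the new vCPU */

i.e. the stale link is always dropped first, and a new link is established only if the IRTE was successfully put into guest (posting) mode.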
Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 60 +++++++++------------------------------ include/linux/kvm_irqfd.h | 1 + 2 files changed, 14 insertions(+), 47 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index accc36958a75..7f7af8ff627d 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -767,23 +767,19 @@ out: return ret; } -static void svm_ir_list_del(struct vcpu_svm *svm, - struct kvm_kernel_irqfd *irqfd, - struct amd_iommu_pi_data *pi) +static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd) { + struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu; unsigned long flags; - struct kvm_kernel_irqfd *cur; - spin_lock_irqsave(&svm->ir_list_lock, flags); - list_for_each_entry(cur, &svm->ir_list, vcpu_list) { - if (cur->irq_bypass_data != pi->ir_data) - continue; - if (WARN_ON_ONCE(cur != irqfd)) - continue; - list_del(&irqfd->vcpu_list); - break; - } - spin_unlock_irqrestore(&svm->ir_list_lock, flags); + if (!vcpu) + return; + + spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags); + list_del(&irqfd->vcpu_list); + spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags); + + irqfd->irq_bypass_vcpu = NULL; } static int svm_ir_list_add(struct vcpu_svm *svm, @@ -796,24 +792,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, if (WARN_ON_ONCE(!pi->ir_data)) return -EINVAL; - /** - * In some cases, the existing irte is updated and re-set, - * so we need to check here if it's already been * added - * to the ir_list. - */ - if (pi->prev_ga_tag) { - struct kvm *kvm = svm->vcpu.kvm; - u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); - struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); - struct vcpu_svm *prev_svm; - - if (!prev_vcpu) - return -EINVAL; - - prev_svm = to_svm(prev_vcpu); - svm_ir_list_del(prev_svm, irqfd, pi); - } - + irqfd->irq_bypass_vcpu = &svm->vcpu; irqfd->irq_bypass_data = pi->ir_data; spin_lock_irqsave(&svm->ir_list_lock, flags); @@ -905,6 +884,8 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, WARN_ON_ONCE(new && memcmp(e, new, sizeof(*new))); + svm_ir_list_del(irqfd); + /** * Here, we setup with legacy mode in the following cases: * 1. When cannot target interrupt to a specific vcpu. @@ -963,21 +944,6 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, pi.prev_ga_tag = 0; pi.is_guest_mode = false; ret = irq_set_vcpu_affinity(host_irq, &pi); - - /** - * Check if the posted interrupt was previously - * setup with the guest_mode by checking if the ga_tag - * was cached. If so, we need to clean up the per-vcpu - * ir_list. 
- */ - if (!ret && pi.prev_ga_tag) { - int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); - struct kvm_vcpu *vcpu; - - vcpu = kvm_get_vcpu_by_id(kvm, id); - if (vcpu) - svm_ir_list_del(to_svm(vcpu), irqfd, &pi); - } } out: srcu_read_unlock(&kvm->irq_srcu, idx); diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h index 6510a48e62aa..361c07f4466d 100644 --- a/include/linux/kvm_irqfd.h +++ b/include/linux/kvm_irqfd.h @@ -60,6 +60,7 @@ struct kvm_kernel_irqfd { struct irq_bypass_consumer consumer; struct irq_bypass_producer *producer; + struct kvm_vcpu *irq_bypass_vcpu; struct list_head vcpu_list; void *irq_bypass_data; }; From 1da19c5ce0533796179d9e1b55e64bf78478c4c1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:09 -0700 Subject: [PATCH 031/101] iommu/amd: KVM: SVM: Delete now-unused cached/previous GA tag fields Delete the amd_ir_data.prev_ga_tag field now that all usage is superfluous. Reviewed-by: Vasant Hegde Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 2 -- drivers/iommu/amd/amd_iommu_types.h | 1 - drivers/iommu/amd/iommu.c | 10 ---------- include/linux/amd-iommu.h | 2 +- 4 files changed, 1 insertion(+), 14 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 7f7af8ff627d..38cdfb052a3a 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -939,9 +939,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, /** * Here, pi is used to: * - Tell IOMMU to use legacy mode for this interrupt. - * - Retrieve ga_tag of prior interrupt remapping data. */ - pi.prev_ga_tag = 0; pi.is_guest_mode = false; ret = irq_set_vcpu_affinity(host_irq, &pi); } diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index ccbab3a4811a..053a0f05768c 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -1054,7 +1054,6 @@ struct irq_2_irte { }; struct amd_ir_data { - u32 cached_ga_tag; struct amd_iommu *iommu; struct irq_2_irte irq_2_irte; struct msi_msg msi_entry; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 3117d99cf83d..ed96050d4933 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3887,23 +3887,13 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info) ir_data->cfg = irqd_cfg(data); pi_data->ir_data = ir_data; - pi_data->prev_ga_tag = ir_data->cached_ga_tag; if (pi_data->is_guest_mode) { ir_data->ga_root_ptr = (pi_data->base >> 12); ir_data->ga_vector = vcpu_pi_info->vector; ir_data->ga_tag = pi_data->ga_tag; ret = amd_iommu_activate_guest_mode(ir_data); - if (!ret) - ir_data->cached_ga_tag = pi_data->ga_tag; } else { ret = amd_iommu_deactivate_guest_mode(ir_data); - - /* - * This communicates the ga_tag back to the caller - * so that it can do all the necessary clean up. 
- */ - if (!ret) - ir_data->cached_ga_tag = 0; } return ret; diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 062fbd4c9b77..1f9b13d803c5 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -19,8 +19,8 @@ struct amd_iommu; */ struct amd_iommu_pi_data { u32 ga_tag; - u32 prev_ga_tag; u64 base; + bool is_guest_mode; struct vcpu_data *vcpu_data; void *ir_data; From a0ca34bb1aad0c72e3fe7b9377c7ac87fa3b7098 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:10 -0700 Subject: [PATCH 032/101] KVM: SVM: Delete IRTE link from previous vCPU irrespective of new routing Delete the IRTE link from the previous vCPU irrespective of the new routing state, i.e. even if the IRTE won't be configured to post IRQs to a vCPU. Whether or not the new route is postable has no bearing on the *old* route. Failure to delete the link can result in KVM incorrectly updating the IRTE, e.g. if the "old" vCPU is scheduled in/out. Fixes: 411b44ba80ab ("svm: Implements update_pi_irte hook to setup posted interrupt") Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 38cdfb052a3a..d7478a62de63 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -862,6 +862,12 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass()) return 0; + /* + * If the IRQ was affined to a different vCPU, remove the IRTE metadata + * from the *previous* vCPU's list. + */ + svm_ir_list_del(irqfd); + pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", __func__, host_irq, guest_irq, set); @@ -884,8 +890,6 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, WARN_ON_ONCE(new && memcmp(e, new, sizeof(*new))); - svm_ir_list_del(irqfd); - /** * Here, we setup with legacy mode in the following cases: * 1. When cannot target interrupt to a specific vcpu. From 430579577892f56d25d4bc97f34212acc00e55fa Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:11 -0700 Subject: [PATCH 033/101] KVM: SVM: Drop pointless masking of default APIC base when setting V_APIC_BAR Drop VMCB_AVIC_APIC_BAR_MASK, it's just a regurgitation of the maximum theoretical 4KiB-aligned physical address, i.e. is not novel in any way, and its only usage is to mask the default APIC base, which is 4KiB aligned and (obviously) a legal physical address. No functional change intended.
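To spell out why the masking is a nop (mask value taken from the diff below; the APIC base is the architectural default of 0xfee00000):

	VMCB_AVIC_APIC_BAR_MASK = 0xFFFFFFFFFF000ULL	/* bits 51:12 */
	APIC_DEFAULT_PHYS_BASE  = 0xfee00000		/* 4KiB aligned */

	0xfee00000 & 0xFFFFFFFFFF000ULL == 0xfee00000	/* unchanged */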
Tested-by: Sairaj Kodilkar Reviewed-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/20250611224604.313496-10-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/svm.h | 2 -- arch/x86/kvm/svm/avic.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index ad954a1a6656..89a666952b01 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -260,8 +260,6 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define AVIC_DOORBELL_PHYSICAL_ID_MASK GENMASK_ULL(11, 0) -#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL - #define AVIC_UNACCEL_ACCESS_WRITE_MASK 1 #define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 #define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index d7478a62de63..be139337c6d7 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -245,7 +245,7 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; - vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK; + vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE; if (kvm_apicv_activated(svm->vcpu.kvm)) avic_activate_vmcb(svm); From 2e002ddc896684f668427113ec027f3025fe6c41 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:12 -0700 Subject: [PATCH 034/101] KVM: SVM: Drop pointless masking of kernel page pa's with AVIC HPA masks Drop AVIC_HPA_MASK and all its users, the mask is just the 4KiB-aligned maximum theoretical physical address for x86-64 CPUs, as x86-64 is currently defined (going beyond PA52 would require an entirely new paging mode, which would arguably create a new, different architecture). All usage in KVM masks the result of page_to_phys(), which on x86-64 is guaranteed to be 4KiB aligned and a legal physical address; if either of those requirements doesn't hold true, KVM has far bigger problems. Drop masking the avic_backing_page with AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK for all the same reasons, but keep the macro even though it's unused in functional code. It's a distinct architectural define, and having the definition in software helps visualize the layout of an entry. And to be hyper-paranoid about MAXPA going beyond 52, add a compile-time assert to ensure the kernel's maximum supported physical address stays in bounds. The unnecessary masking in avic_init_vmcb() also incorrectly assumes that SME's C-bit resides between bits 51:11; that holds true for current CPUs, but isn't required by AMD's architecture: In some implementations, the bit used may be a physical address bit Key word being "may". Opportunistically use the GENMASK_ULL() version for AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK, which is far more readable than a set of repeating Fs. 
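For reference, the dropped mask and the GENMASK_ULL() spelling denote the same bits:

	AVIC_HPA_MASK = ~((0xFFFULL << 52) | 0xFFF)	/* == bits 51:12 */
	AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK = GENMASK_ULL(51, 12)

so any 4KiB-aligned address below 2^52, e.g. the result of page_to_phys() on x86-64, passes through both unmodified.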
Tested-by: Sairaj Kodilkar Reviewed-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/20250611224604.313496-11-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/svm.h | 4 +--- arch/x86/kvm/svm/avic.c | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 89a666952b01..36f67c69ea66 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -253,7 +253,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) #define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0) -#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12) +#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK GENMASK_ULL(51, 12) #define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) #define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) #define AVIC_PHYSICAL_ID_TABLE_SIZE_MASK (0xFFULL) @@ -288,8 +288,6 @@ enum avic_ipi_failure_cause { static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_PHYSICAL_ID); static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID); -#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) - #define SVM_SEV_FEAT_SNP_ACTIVE BIT(0) #define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3) #define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index be139337c6d7..61ea794a4dce 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -242,9 +242,9 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page)); phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page)); - vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; - vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; - vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; + vmcb->control.avic_backing_page = bpa; + vmcb->control.avic_logical_id = lpa; + vmcb->control.avic_physical_id = ppa; vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE; if (kvm_apicv_activated(svm->vcpu.kvm)) @@ -302,9 +302,12 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) if (!entry) return -EINVAL; - new_entry = __sme_set((page_to_phys(svm->avic_backing_page) & - AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | - AVIC_PHYSICAL_ID_ENTRY_VALID_MASK); + /* Note, fls64() returns the bit position, +1. 
*/ + BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT > + fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK)); + + new_entry = __sme_set(page_to_phys(svm->avic_backing_page)) | + AVIC_PHYSICAL_ID_ENTRY_VALID_MASK; WRITE_ONCE(*entry, new_entry); svm->avic_physical_id_cache = entry; @@ -904,8 +907,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, enable_remapped_mode = false; /* Try to enable guest_mode in IRTE */ - pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & - AVIC_HPA_MASK); + pi.base = __sme_set(page_to_phys(svm->avic_backing_page)); pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, svm->vcpu.vcpu_id); pi.is_guest_mode = true; From 3338c639da15a87cf57edc79de8409c98b704d87 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:13 -0700 Subject: [PATCH 035/101] KVM: SVM: Add helper to deduplicate code for getting AVIC backing page Add a helper to get the physical address of the AVIC backing page, both to deduplicate code and to prepare for getting the address directly from apic->regs, at which point it won't be all that obvious that the address in question is what SVM calls the AVIC backing page. No functional change intended. Reviewed-by: Maxim Levitsky Tested-by: Sairaj Kodilkar Reviewed-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/20250611224604.313496-12-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 61ea794a4dce..75317c189409 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -235,14 +235,18 @@ free_avic: return err; } +static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm) +{ + return __sme_set(page_to_phys(svm->avic_backing_page)); +} + void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) { struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm); - phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page)); phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page)); phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page)); - vmcb->control.avic_backing_page = bpa; + vmcb->control.avic_backing_page = avic_get_backing_page_address(svm); vmcb->control.avic_logical_id = lpa; vmcb->control.avic_physical_id = ppa; vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE; @@ -306,7 +310,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT > fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK)); - new_entry = __sme_set(page_to_phys(svm->avic_backing_page)) | + new_entry = avic_get_backing_page_address(svm) | AVIC_PHYSICAL_ID_ENTRY_VALID_MASK; WRITE_ONCE(*entry, new_entry); @@ -846,7 +850,7 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, irq.vector); *svm = to_svm(vcpu); - vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page)); + vcpu_info->pi_desc_addr = avic_get_backing_page_address(*svm); vcpu_info->vector = irq.vector; return 0; @@ -907,7 +911,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, enable_remapped_mode = false; /* Try to enable guest_mode in IRTE */ - pi.base = __sme_set(page_to_phys(svm->avic_backing_page)); + pi.base = avic_get_backing_page_address(svm); pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, svm->vcpu.vcpu_id); pi.is_guest_mode = true; From 
d8527f133c0a810b2e803a03aa186cfef721fbf8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:14 -0700 Subject: [PATCH 036/101] KVM: SVM: Drop vcpu_svm's pointless avic_backing_page field Drop vcpu_svm's avic_backing_page pointer and instead grab the physical address of KVM's vAPIC page directly from the source. Getting a physical address from a kernel virtual address is not an expensive operation, and getting the physical address from a struct page is *more* expensive for CONFIG_SPARSEMEM=y kernels. Regardless, none of the paths that consume the address are hot paths, i.e. shaving cycles is not a priority. Eliminating the "cache" means KVM doesn't have to worry about the cache being invalid, which will simplify a future fix when dealing with vCPU IDs that are too big. WARN if KVM attempts to allocate a vCPU's AVIC backing page without an in-kernel local APIC. avic_init_vcpu() bails early if the APIC is not in-kernel, and KVM disallows enabling an in-kernel APIC after vCPUs have been created, i.e. it should be impossible to reach avic_init_backing_page() without the vAPIC being allocated. Tested-by: Sairaj Kodilkar Reviewed-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/20250611224604.313496-13-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 6 ++---- arch/x86/kvm/svm/svm.h | 1 - 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 75317c189409..f0401c1e355c 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -237,7 +237,7 @@ free_avic: static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm) { - return __sme_set(page_to_phys(svm->avic_backing_page)); + return __sme_set(__pa(svm->vcpu.arch.apic->regs)); } void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) @@ -282,7 +282,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) (id > X2AVIC_MAX_PHYSICAL_ID)) return -EINVAL; - if (!vcpu->arch.apic->regs) + if (WARN_ON_ONCE(!vcpu->arch.apic->regs)) return -EINVAL; if (kvm_apicv_activated(vcpu->kvm)) { @@ -299,8 +299,6 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return ret; } - svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs); - /* Setting AVIC backing page address in the phy APIC ID table */ entry = avic_get_physical_id_entry(vcpu, id); if (!entry) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index cc27877d69ae..1585288200f4 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -306,7 +306,6 @@ struct vcpu_svm { u32 ldr_reg; u32 dfr_reg; - struct page *avic_backing_page; u64 *avic_physical_id_cache; /* From 1aa6e256e46f0b72be6d6e0f890c11e0a1805f53 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:15 -0700 Subject: [PATCH 037/101] KVM: SVM: Inhibit AVIC if ID is too big instead of rejecting vCPU creation Inhibit AVIC with a new "ID too big" flag if userspace creates a vCPU with an ID that is too big, but otherwise allow vCPU creation to succeed. Rejecting KVM_CREATE_VCPU with EINVAL violates KVM's ABI as KVM advertises that the max vCPU ID is 4095, but disallows creating vCPUs with IDs bigger than 254 (AVIC) or 511 (x2AVIC). Alternatively, KVM could advertise an accurate value depending on which AVIC mode is in use, but that wouldn't really solve the underlying problem, e.g. would be a breaking change if KVM were to ever try and enable AVIC or x2AVIC by default. 
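For illustration, the ID check that now decides between "inhibit AVIC" and "proceed" can be condensed into a hypothetical helper (not part of the series; x2avic_enabled and the *_MAX_PHYSICAL_ID defines are the ones used in the diffs below):

/* Equivalent form of the check in avic_init_backing_page(): a vCPU ID is
 * AVIC-addressable iff it fits in the active mode's Physical ID table. */
static bool avic_vcpu_id_is_addressable(u32 id)
{
	return id <= (x2avic_enabled ? X2AVIC_MAX_PHYSICAL_ID
				     : AVIC_MAX_PHYSICAL_ID);
}

With this patch, a false result sets APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG instead of failing vCPU creation with -EINVAL.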
Cc: Maxim Levitsky Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-14-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 9 ++++++++- arch/x86/kvm/svm/avic.c | 14 ++++++++++++-- arch/x86/kvm/svm/svm.h | 3 ++- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2a14564dd08a..949577b52f64 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1315,6 +1315,12 @@ enum kvm_apicv_inhibit { */ APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED, + /* + * AVIC is disabled because the vCPU's APIC ID is beyond the max + * supported by AVIC/x2AVIC, i.e. the vCPU is unaddressable. + */ + APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG, + NR_APICV_INHIBIT_REASONS, }; @@ -1333,7 +1339,8 @@ enum kvm_apicv_inhibit { __APICV_INHIBIT_REASON(IRQWIN), \ __APICV_INHIBIT_REASON(PIT_REINJ), \ __APICV_INHIBIT_REASON(SEV), \ - __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED) + __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED), \ + __APICV_INHIBIT_REASON(PHYSICAL_ID_TOO_BIG) struct kvm_arch { unsigned long n_used_mmu_pages; diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index f0401c1e355c..38b2f5c441e0 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -278,9 +278,19 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) int id = vcpu->vcpu_id; struct vcpu_svm *svm = to_svm(vcpu); + /* + * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC + * hardware. Immediately clear apicv_active, i.e. don't wait until the + * KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as + * avic_vcpu_load() expects to be called if and only if the vCPU has + * fully initialized AVIC. + */ if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) || - (id > X2AVIC_MAX_PHYSICAL_ID)) - return -EINVAL; + (id > X2AVIC_MAX_PHYSICAL_ID)) { + kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG); + vcpu->arch.apic->apicv_active = false; + return 0; + } if (WARN_ON_ONCE(!vcpu->arch.apic->regs)) return -EINVAL; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 1585288200f4..71e3c003580e 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -722,7 +722,8 @@ extern struct kvm_x86_nested_ops svm_nested_ops; BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) | \ - BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) \ + BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) | \ + BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG) \ ) bool avic_hardware_setup(void); From c24ed209c474eae6a0a74beb63faa4184ada64ee Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:16 -0700 Subject: [PATCH 038/101] KVM: SVM: Drop redundant check in AVIC code on ID during vCPU creation Drop avic_get_physical_id_entry()'s compatibility check on the incoming ID, as its sole caller, avic_init_backing_page(), performs the exact same check. Drop avic_get_physical_id_entry() entirely as the only remaining functionality is getting the address of the Physical ID table, and accessing the array without an immediate bounds check is kludgy. Opportunistically add a compile-time assertion to ensure the vcpu_id can't result in a bounds overflow, e.g. if KVM (really) messed up a maximum physical ID #define, as well as run-time assertions so that a NULL pointer dereference is morphed into a safer WARN(). 
No functional change intended. Tested-by: Sairaj Kodilkar Reviewed-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/20250611224604.313496-15-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 38b2f5c441e0..0797b0b9b831 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -257,26 +257,12 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) avic_deactivate_vmcb(svm); } -static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, - unsigned int index) -{ - u64 *avic_physical_id_table; - struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); - - if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) || - (index > X2AVIC_MAX_PHYSICAL_ID)) - return NULL; - - avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page); - - return &avic_physical_id_table[index]; -} - static int avic_init_backing_page(struct kvm_vcpu *vcpu) { - u64 *entry, new_entry; - int id = vcpu->vcpu_id; + struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); struct vcpu_svm *svm = to_svm(vcpu); + u32 id = vcpu->vcpu_id; + u64 *table, new_entry; /* * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC @@ -292,6 +278,9 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return 0; } + BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(*table) > PAGE_SIZE || + (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(*table) > PAGE_SIZE); + if (WARN_ON_ONCE(!vcpu->arch.apic->regs)) return -EINVAL; @@ -310,9 +299,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) } /* Setting AVIC backing page address in the phy APIC ID table */ - entry = avic_get_physical_id_entry(vcpu, id); - if (!entry) - return -EINVAL; + table = page_address(kvm_svm->avic_physical_id_table_page); /* Note, fls64() returns the bit position, +1. */ BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT > @@ -320,9 +307,9 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) new_entry = avic_get_backing_page_address(svm) | AVIC_PHYSICAL_ID_ENTRY_VALID_MASK; - WRITE_ONCE(*entry, new_entry); + WRITE_ONCE(table[id], new_entry); - svm->avic_physical_id_cache = entry; + svm->avic_physical_id_cache = &table[id]; return 0; } @@ -1005,6 +992,9 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) return; + if (WARN_ON_ONCE(!svm->avic_physical_id_cache)) + return; + /* * No need to update anything if the vCPU is blocking, i.e. if the vCPU * is being scheduled in after being preempted. The CPU entries in the @@ -1045,6 +1035,9 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) lockdep_assert_preemption_disabled(); + if (WARN_ON_ONCE(!svm->avic_physical_id_cache)) + return; + /* * Note, reading the Physical ID entry outside of ir_list_lock is safe * as only the pCPU that has loaded (or is loading) the vCPU is allowed From 26baab4eea4c135c2a2d2b5529a7a0467fba5bef Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:17 -0700 Subject: [PATCH 039/101] KVM: SVM: Track AVIC tables as natively sized pointers, not "struct pages" Allocate and track AVIC's logical and physical tables as u32 and u64 pointers respectively, as managing the pages as "struct page" pointers adds an almost absurd amount of boilerplate and complexity. E.g. 
with page_address() out of the way, svm->avic_physical_id_cache becomes completely superfluous, and will be removed in a future cleanup. No functional change intended. Tested-by: Sairaj Kodilkar Acked-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/20250611224604.313496-16-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 49 ++++++++++++++--------------------------- arch/x86/kvm/svm/svm.h | 4 ++-- 2 files changed, 18 insertions(+), 35 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 0797b0b9b831..a65f61c5fc0c 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -173,10 +173,8 @@ void avic_vm_destroy(struct kvm *kvm) if (!enable_apicv) return; - if (kvm_svm->avic_logical_id_table_page) - __free_page(kvm_svm->avic_logical_id_table_page); - if (kvm_svm->avic_physical_id_table_page) - __free_page(kvm_svm->avic_physical_id_table_page); + free_page((unsigned long)kvm_svm->avic_logical_id_table); + free_page((unsigned long)kvm_svm->avic_physical_id_table); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); hash_del(&kvm_svm->hnode); @@ -189,27 +187,19 @@ int avic_vm_init(struct kvm *kvm) int err = -ENOMEM; struct kvm_svm *kvm_svm = to_kvm_svm(kvm); struct kvm_svm *k2; - struct page *p_page; - struct page *l_page; u32 vm_id; if (!enable_apicv) return 0; - /* Allocating physical APIC ID table (4KB) */ - p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!p_page) + kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!kvm_svm->avic_physical_id_table) goto free_avic; - kvm_svm->avic_physical_id_table_page = p_page; - - /* Allocating logical APIC ID table (4KB) */ - l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!l_page) + kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!kvm_svm->avic_logical_id_table) goto free_avic; - kvm_svm->avic_logical_id_table_page = l_page; - spin_lock_irqsave(&svm_vm_data_hash_lock, flags); again: vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK; @@ -243,12 +233,10 @@ static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm) void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) { struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm); - phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page)); - phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page)); vmcb->control.avic_backing_page = avic_get_backing_page_address(svm); - vmcb->control.avic_logical_id = lpa; - vmcb->control.avic_physical_id = ppa; + vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table)); + vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table)); vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE; if (kvm_apicv_activated(svm->vcpu.kvm)) @@ -262,7 +250,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); struct vcpu_svm *svm = to_svm(vcpu); u32 id = vcpu->vcpu_id; - u64 *table, new_entry; + u64 new_entry; /* * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC @@ -278,8 +266,8 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return 0; } - BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(*table) > PAGE_SIZE || - (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(*table) > PAGE_SIZE); + BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE || + (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE); if 
(WARN_ON_ONCE(!vcpu->arch.apic->regs)) return -EINVAL; @@ -298,18 +286,16 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return ret; } - /* Setting AVIC backing page address in the phy APIC ID table */ - table = page_address(kvm_svm->avic_physical_id_table_page); - /* Note, fls64() returns the bit position, +1. */ BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT > fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK)); + /* Setting AVIC backing page address in the phy APIC ID table */ new_entry = avic_get_backing_page_address(svm) | AVIC_PHYSICAL_ID_ENTRY_VALID_MASK; - WRITE_ONCE(table[id], new_entry); + WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry); - svm->avic_physical_id_cache = &table[id]; + svm->avic_physical_id_cache = &kvm_svm->avic_physical_id_table[id]; return 0; } @@ -443,7 +429,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source if (apic_x2apic_mode(source)) avic_logical_id_table = NULL; else - avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page); + avic_logical_id_table = kvm_svm->avic_logical_id_table; /* * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical @@ -545,7 +531,6 @@ unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu) static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) { struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); - u32 *logical_apic_id_table; u32 cluster, index; ldr = GET_APIC_LOGICAL_ID(ldr); @@ -566,9 +551,7 @@ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) return NULL; index += (cluster << 2); - logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page); - - return &logical_apic_id_table[index]; + return &kvm_svm->avic_logical_id_table[index]; } static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 71e3c003580e..ec5d77d42a49 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -123,8 +123,8 @@ struct kvm_svm { /* Struct members for AVIC */ u32 avic_vm_id; - struct page *avic_logical_id_table_page; - struct page *avic_physical_id_table_page; + u32 *avic_logical_id_table; + u64 *avic_physical_id_table; struct hlist_node hnode; struct kvm_sev_info sev_info; From d29433336a7b4783546cdca3096c3aee36610e70 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:18 -0700 Subject: [PATCH 040/101] KVM: SVM: Drop superfluous "cache" of AVIC Physical ID entry pointer Drop the vCPU's pointer to its AVIC Physical ID entry, and simply index the table directly. Caching a pointer address is completely unnecessary for performance, and while the field technically caches the result of the pointer calculation, it's all too easy to misinterpret the name and think that the field somehow caches the _data_ in the table. No functional change intended. 
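As a sanity check on the arithmetic behind the run-time assertions added below (a sketch, not from the series): a 4KiB Physical ID table of 8-byte entries holds exactly 512 entries, so rejecting vcpu_id * sizeof(u64) >= PAGE_SIZE rejects exactly the IDs above X2AVIC_MAX_PHYSICAL_ID (0x1FF):

#include <linux/build_bug.h>
#include <asm/page.h>

/* 4096 / 8 == 512 entries, i.e. valid indices are 0..511. */
static_assert(PAGE_SIZE / sizeof(u64) == 512);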
Suggested-by: Maxim Levitsky Tested-by: Sairaj Kodilkar Reviewed-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/20250611224604.313496-17-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 27 +++++++++++++++------------ arch/x86/kvm/svm/svm.h | 1 - 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index a65f61c5fc0c..baee6d412e78 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -295,8 +295,6 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) AVIC_PHYSICAL_ID_ENTRY_VALID_MASK; WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry); - svm->avic_physical_id_cache = &kvm_svm->avic_physical_id_table[id]; - return 0; } @@ -771,13 +769,16 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct kvm_kernel_irqfd *irqfd, struct amd_iommu_pi_data *pi) { + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm *kvm = vcpu->kvm; + struct kvm_svm *kvm_svm = to_kvm_svm(kvm); unsigned long flags; u64 entry; if (WARN_ON_ONCE(!pi->ir_data)) return -EINVAL; - irqfd->irq_bypass_vcpu = &svm->vcpu; + irqfd->irq_bypass_vcpu = vcpu; irqfd->irq_bypass_data = pi->ir_data; spin_lock_irqsave(&svm->ir_list_lock, flags); @@ -788,7 +789,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, * will update the pCPU info when the vCPU awkened and/or scheduled in. * See also avic_vcpu_load(). */ - entry = READ_ONCE(*(svm->avic_physical_id_cache)); + entry = READ_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id]); if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK, true, pi->ir_data); @@ -965,17 +966,18 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - u64 entry; + struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); unsigned long flags; + u64 entry; lockdep_assert_preemption_disabled(); if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) return; - if (WARN_ON_ONCE(!svm->avic_physical_id_cache)) + if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE)) return; /* @@ -997,14 +999,14 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) */ spin_lock_irqsave(&svm->ir_list_lock, flags); - entry = READ_ONCE(*(svm->avic_physical_id_cache)); + entry = READ_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id]); WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - WRITE_ONCE(*(svm->avic_physical_id_cache), entry); + WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry); avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true); spin_unlock_irqrestore(&svm->ir_list_lock, flags); @@ -1012,13 +1014,14 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void avic_vcpu_put(struct kvm_vcpu *vcpu) { - u64 entry; + struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); struct vcpu_svm *svm = to_svm(vcpu); unsigned long flags; + u64 entry; lockdep_assert_preemption_disabled(); - if (WARN_ON_ONCE(!svm->avic_physical_id_cache)) + if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE)) return; /* @@ -1028,7 +1031,7 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) * can't be scheduled out and thus avic_vcpu_{put,load}() can't run * recursively. 
*/ - entry = READ_ONCE(*(svm->avic_physical_id_cache)); + entry = READ_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id]); /* Nothing to do if IsRunning == '0' due to vCPU blocking. */ if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) @@ -1047,7 +1050,7 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) avic_update_iommu_vcpu_affinity(vcpu, -1, 0); entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - WRITE_ONCE(*(svm->avic_physical_id_cache), entry); + WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry); spin_unlock_irqrestore(&svm->ir_list_lock, flags); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index ec5d77d42a49..f225d0bed152 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -306,7 +306,6 @@ struct vcpu_svm { u32 ldr_reg; u32 dfr_reg; - u64 *avic_physical_id_cache; /* * Per-vCPU list of irqfds that are eligible to post IRQs directly to From bafddc70001d1834b2f2e490e108bbb8812b4bed Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:19 -0700 Subject: [PATCH 041/101] KVM: VMX: Move enable_ipiv knob to common x86 Move enable_ipiv to common x86 so that it can be reused by SVM to control IPI virtualization when AVIC is enabled. SVM doesn't actually provide a way to truly disable IPI virtualization, but KVM can get close enough by skipping the necessary table programming. Link: https://lore.kernel.org/r/20250611224604.313496-18-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/capabilities.h | 1 - arch/x86/kvm/vmx/vmx.c | 2 -- arch/x86/kvm/x86.c | 3 +++ 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 949577b52f64..30b3ecc01684 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1951,6 +1951,7 @@ struct kvm_arch_async_pf { extern u32 __read_mostly kvm_nr_uret_msrs; extern bool __read_mostly allow_smaller_maxphyaddr; extern bool __read_mostly enable_apicv; +extern bool __read_mostly enable_ipiv; extern bool __read_mostly enable_device_posted_irqs; extern struct kvm_x86_ops kvm_x86_ops; diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index cb6588238f46..5316c27f6099 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -15,7 +15,6 @@ extern bool __read_mostly enable_ept; extern bool __read_mostly enable_unrestricted_guest; extern bool __read_mostly enable_ept_ad_bits; extern bool __read_mostly enable_pml; -extern bool __read_mostly enable_ipiv; extern int __read_mostly pt_mode; #define PT_MODE_SYSTEM 0 diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4953846cb30d..09f70603fd22 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -113,8 +113,6 @@ static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, 0444); module_param(enable_apicv, bool, 0444); - -bool __read_mostly enable_ipiv = true; module_param(enable_ipiv, bool, 0444); module_param(enable_device_posted_irqs, bool, 0444); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e1304b002062..c176ff44bb9a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -226,6 +226,9 @@ EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); bool __read_mostly enable_apicv = true; EXPORT_SYMBOL_GPL(enable_apicv); +bool __read_mostly enable_ipiv = true; +EXPORT_SYMBOL_GPL(enable_ipiv); + bool __read_mostly enable_device_posted_irqs = true; EXPORT_SYMBOL_GPL(enable_device_posted_irqs); From 
d921665e01ba86212bdace238bdff123bceffd46 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Wed, 11 Jun 2025 15:45:20 -0700 Subject: [PATCH 042/101] KVM: SVM: Add enable_ipiv param, never set IsRunning if disabled Let userspace "disable" IPI virtualization for AVIC via the enable_ipiv module param, by never setting IsRunning. SVM doesn't provide a way to disable IPI virtualization in hardware, but by ensuring CPUs never see IsRunning=1, every IPI in the guest (except for self-IPIs) will generate a VM-Exit. To avoid setting the real IsRunning bit, while still allowing KVM to use each vCPU's entry to update GA log entries, simply maintain a shadow of the entry, without propagating IsRunning updates to the real table when IPI virtualization is disabled. Providing a way to effectively disable IPI virtualization will allow KVM to safely enable AVIC on hardware that is susceptible to erratum #1235, which causes hardware to sometimes fail to detect that the IsRunning bit has been cleared by software. Note, the table _must_ be fully populated, as broadcast IPIs skip invalid entries, i.e. won't generate VM-Exit if every entry is invalid, and so simply pointing the VMCB at a common dummy table won't work. Alternatively, KVM could allocate a shadow of the entire table, but that'd be a waste of 4KiB since the per-vCPU entry doesn't actually consume an additional 8 bytes of memory (vCPU structures are large enough that they are backed by order-N pages). Signed-off-by: Maxim Levitsky [sean: keep "entry" variables, reuse enable_ipiv, split from erratum] Link: https://lore.kernel.org/r/20250611224604.313496-19-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 32 ++++++++++++++++++++++++++------ arch/x86/kvm/svm/svm.c | 2 ++ arch/x86/kvm/svm/svm.h | 8 ++++++++ 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index baee6d412e78..d0b845ab66fe 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -293,6 +293,13 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) /* Setting AVIC backing page address in the phy APIC ID table */ new_entry = avic_get_backing_page_address(svm) | AVIC_PHYSICAL_ID_ENTRY_VALID_MASK; + svm->avic_physical_id_entry = new_entry; + + /* + * Initialize the real table, as vCPUs must have a valid entry in order + * for broadcast IPIs to function correctly (broadcast IPIs ignore + * invalid entries, i.e. aren't guaranteed to generate a VM-Exit). + */ WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry); return 0; @@ -770,8 +777,6 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) { struct kvm_vcpu *vcpu = &svm->vcpu; - struct kvm *kvm = vcpu->kvm; - struct kvm_svm *kvm_svm = to_kvm_svm(kvm); unsigned long flags; u64 entry; @@ -789,7 +794,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, * will update the pCPU info when the vCPU awkened and/or scheduled in. * See also avic_vcpu_load(). 
*/ - entry = READ_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id]); + entry = svm->avic_physical_id_entry; if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK, true, pi->ir_data); @@ -999,14 +1004,26 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) */ spin_lock_irqsave(&svm->ir_list_lock, flags); - entry = READ_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id]); + entry = svm->avic_physical_id_entry; WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; + svm->avic_physical_id_entry = entry; + + /* + * If IPI virtualization is disabled, clear IsRunning when updating the + * actual Physical ID table, so that the CPU never sees IsRunning=1. + * Keep the APIC ID up-to-date in the entry to minimize the chances of + * things going sideways if hardware peeks at the ID. + */ + if (!enable_ipiv) + entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; + WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry); + avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true); spin_unlock_irqrestore(&svm->ir_list_lock, flags); @@ -1031,7 +1048,7 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) * can't be scheduled out and thus avic_vcpu_{put,load}() can't run * recursively. */ - entry = READ_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id]); + entry = svm->avic_physical_id_entry; /* Nothing to do if IsRunning == '0' due to vCPU blocking. */ if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) @@ -1050,7 +1067,10 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) avic_update_iommu_vcpu_affinity(vcpu, -1, 0); entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry); + svm->avic_physical_id_entry = entry; + + if (enable_ipiv) + WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry); spin_unlock_irqrestore(&svm->ir_list_lock, flags); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ab9b947dbf4f..68b6a1922078 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -232,6 +232,7 @@ module_param(tsc_scaling, int, 0444); */ static bool avic; module_param(avic, bool, 0444); +module_param(enable_ipiv, bool, 0444); module_param(enable_device_posted_irqs, bool, 0444); @@ -5581,6 +5582,7 @@ static __init int svm_hardware_setup(void) enable_apicv = avic = avic && avic_hardware_setup(); if (!enable_apicv) { + enable_ipiv = false; svm_x86_ops.vcpu_blocking = NULL; svm_x86_ops.vcpu_unblocking = NULL; svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index f225d0bed152..939ff0e35a2b 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -307,6 +307,14 @@ struct vcpu_svm { u32 ldr_reg; u32 dfr_reg; + /* This is essentially a shadow of the vCPU's actual entry in the + * Physical ID table that is programmed into the VMCB, i.e. that is + * seen by the CPU. If IPI virtualization is disabled, IsRunning is + * only ever set in the shadow, i.e. is never propagated to the "real" + * table, so that hardware never sees IsRunning=1. + */ + u64 avic_physical_id_entry; + /* * Per-vCPU list of irqfds that are eligible to post IRQs directly to * the vCPU (a.k.a. device posted IRQs, a.k.a. IRQ bypass). 
The list From 8de4a1c8164e5b2e40d1df764840a31de983f40b Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Wed, 11 Jun 2025 15:45:21 -0700 Subject: [PATCH 043/101] KVM: SVM: Disable (x2)AVIC IPI virtualization if CPU has erratum #1235 Disable IPI virtualization on AMD Family 17h CPUs (Zen2 and Zen1), as hardware doesn't reliably detect changes to the 'IsRunning' bit during ICR write emulation, and might fail to VM-Exit on the sending vCPU if IsRunning was recently cleared. The absence of the VM-Exit leads to KVM not waking (or triggering nested VM-Exit of) the target vCPU(s) of the IPI, which can lead to hung vCPUs, unbounded delays in L2 execution, etc. To work around the erratum, simply disable IPI virtualization, which prevents KVM from setting IsRunning and thus eliminates the race where hardware sees a stale IsRunning=1. As a result, all ICR writes (except when "Self" shorthand is used) will VM-Exit and therefore be correctly emulated by KVM. Disabling IPI virtualization does carry a performance penalty, but benchmarking shows that enabling AVIC without IPI virtualization is still much better than not using AVIC at all, because AVIC still accelerates posted interrupts and the receiving end of the IPIs. Note, when virtualizing Self-IPIs, the CPU skips reading the physical ID table and updates the vIRR directly (because the vCPU is by definition actively running), i.e. Self-IPI isn't susceptible to the erratum *and* is still accelerated by hardware. Signed-off-by: Maxim Levitsky [sean: rebase, massage changelog, disallow user override] Acked-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/20250611224604.313496-20-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index d0b845ab66fe..72b8cab2fbce 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -1188,6 +1188,14 @@ bool avic_hardware_setup(void) if (x2avic_enabled) pr_info("x2AVIC enabled\n"); + /* + * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2) + * due to erratum 1235, which results in missed VM-Exits on the sender + * and thus missed wake events for blocking vCPUs due to the CPU + * failing to see a software update to clear IsRunning. + */ + enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17; + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); return true; From 6737557442e5d21fef5c613c3079d26a4a4b0744 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:22 -0700 Subject: [PATCH 044/101] KVM: VMX: Suppress PI notifications whenever the vCPU is put Suppress posted interrupt notifications (set PID.SN=1) whenever the vCPU is put, i.e. unloaded, not just when the vCPU is preempted, as KVM doesn't do anything in response to a notification IRQ that arrives in the host, nor does KVM rely on the Outstanding Notification (PID.ON) flag when the vCPU is unloaded. And, the cost of scanning the PIR to manually set PID.ON when loading the vCPU is quite small, especially relative to the cost of loading (and unloading) a vCPU. On the flip side, leaving SN clear means a notification for the vCPU will result in a spurious IRQ for the pCPU, even if the vCPU task is scheduled out, running in userspace, etc.
Even worse, if the pCPU is running a different vCPU, the spurious IRQ could trigger posted interrupt processing for the wrong vCPU, which is technically a violation of the architecture, as bits set in the PIR aren't supposed to be propagated to the vIRR until a notification IRQ is received. The saving grace of the current behavior is that hardware sends notification interrupts if and only if PID.ON=0, i.e. only the first posted interrupt for a vCPU will trigger a spurious IRQ (for each window where the vCPU is unloaded). Ideally, KVM would suppress notifications before enabling IRQs in the VM-Exit path, but KVM relies on PID.ON as an indicator that there is a posted interrupt pending in the PIR, e.g. in vmx_sync_pir_to_irr(), and sadly there is no way to ask hardware to set PID.ON, but not generate an interrupt. That could be solved by using pi_has_pending_interrupt() instead of checking only PID.ON, but it's not at all clear that would be a performance win, as KVM would end up scanning the entire PIR whenever an interrupt isn't pending. And long term, the spurious IRQ window, i.e. where a vCPU is loaded with IRQs enabled, can effectively be made smaller for hot paths by moving performance-critical VM-Exit handlers into the fastpath, i.e. by never enabling IRQs for hot path VM-Exits. Link: https://lore.kernel.org/r/20250611224604.313496-21-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/posted_intr.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 110fb19848ab..d4826a6b674f 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -73,13 +73,10 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) /* * If the vCPU wasn't on the wakeup list and wasn't migrated, then the * full update can be skipped as neither the vector nor the destination - * needs to be changed. + * needs to be changed. Clear SN even if there is no assigned device, + * again for simplicity. */ if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) { - /* - * Clear SN if it was set due to being preempted. Again, do - * this even if there is no assigned device for simplicity. - */ if (pi_test_and_clear_sn(pi_desc)) goto after_clear_sn; return; @@ -225,17 +222,23 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) if (!vmx_needs_pi_wakeup(vcpu)) return; - if (kvm_vcpu_is_blocking(vcpu) && + /* + * If the vCPU is blocking with IRQs enabled and ISN'T being preempted, + * enable the wakeup handler so that notification IRQ wakes the vCPU as + * expected. There is no need to enable the wakeup handler if the vCPU + * is preempted between setting its wait state and manually scheduling + * out, as the task is still runnable, i.e. doesn't need a wake event + * from KVM to be scheduled in. + * + * If the wakeup handler isn't being enabled, Suppress Notifications as + * the cost of propagating PIR.IRR to PID.ON is negligible compared to + * the cost of a spurious IRQ, and vCPU put/load is a slow path. + */ + if (!vcpu->preempted && kvm_vcpu_is_blocking(vcpu) && ((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) || (!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu)))) pi_enable_wakeup_handler(vcpu); - - /* - * Set SN when the vCPU is preempted. Note, the vCPU can both be seen - * as blocking and preempted, e.g. if it's preempted between setting - * its wait state and manually scheduling out.
- */ - if (vcpu->preempted) + else pi_set_sn(pi_desc); } From 52d826c9e54cc6b4f0107720dd1f254944f6e727 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:23 -0700 Subject: [PATCH 045/101] KVM: SVM: Add a comment to explain why avic_vcpu_blocking() ignores IRQ blocking Add a comment to explain why KVM clears IsRunning when putting a vCPU, even though leaving IsRunning=1 would be ok from a functional perspective. Per Maxim's experiments, a misbehaving VM could spam the AVIC doorbell so fast as to induce a 50%+ loss in performance. Link: https://lore.kernel.org/all/8d7e0d0391df4efc7cb28557297eb2ec9904f1e5.camel@redhat.com Cc: Maxim Levitsky Acked-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/20250611224604.313496-22-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 72b8cab2fbce..375a29022000 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -1122,19 +1122,24 @@ void avic_vcpu_blocking(struct kvm_vcpu *vcpu) if (!kvm_vcpu_apicv_active(vcpu)) return; - /* - * Unload the AVIC when the vCPU is about to block, _before_ - * the vCPU actually blocks. - * - * Any IRQs that arrive before IsRunning=0 will not cause an - * incomplete IPI vmexit on the source, therefore vIRR will also - * be checked by kvm_vcpu_check_block() before blocking. The - * memory barrier implicit in set_current_state orders writing - * IsRunning=0 before reading the vIRR. The processor needs a - * matching memory barrier on interrupt delivery between writing - * IRR and reading IsRunning; the lack of this barrier might be - * the cause of errata #1235). - */ + /* + * Unload the AVIC when the vCPU is about to block, _before_ the vCPU + * actually blocks. + * + * Note, any IRQs that arrive before IsRunning=0 will not cause an + * incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles + * this by checking vIRR one last time before blocking. The memory + * barrier implicit in set_current_state orders writing IsRunning=0 + * before reading the vIRR. The processor needs a matching memory + * barrier on interrupt delivery between writing IRR and reading + * IsRunning; the lack of this barrier might be the cause of errata #1235). + * + * Clear IsRunning=0 even if guest IRQs are disabled, i.e. even if KVM + * doesn't need to detect events for scheduling purposes. The doorbell + * used to signal running vCPUs cannot be blocked, i.e. will perturb the + * CPU and cause noisy neighbor problems if the VM is sending interrupts + * to the vCPU while it's scheduled out. + */ avic_vcpu_put(vcpu); } From c4cdbaf9d81c8387da8b9b91567d4b0eb1b8a549 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:24 -0700 Subject: [PATCH 046/101] iommu/amd: KVM: SVM: Use pi_desc_addr to derive ga_root_ptr Use vcpu_data.pi_desc_addr instead of amd_iommu_pi_data.base to get the GA root pointer. KVM is the only source of amd_iommu_pi_data.base, and KVM's one and only path for writing amd_iommu_pi_data.base computes the exact same value for vcpu_data.pi_desc_addr and amd_iommu_pi_data.base, and fills amd_iommu_pi_data.base if and only if vcpu_data.pi_desc_addr is valid, i.e. amd_iommu_pi_data.base is fully redundant. 
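A minimal sketch of the redundancy (hypothetical helper; struct vcpu_data's pi_desc_addr field and the >> 12 shift are taken from the diff below): once KVM guarantees pi_desc_addr is valid whenever guest mode is requested, the IOMMU can derive the GA root pointer without amd_iommu_pi_data.base:

#include <asm/irq_remapping.h>	/* struct vcpu_data */

/* GA root pointer is the 4KiB-aligned PFN of the vAPIC backing page. */
static u64 ga_root_from_vcpu_info(const struct vcpu_data *vcpu_info)
{
	return vcpu_info->pi_desc_addr >> 12;
}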
Cc: Maxim Levitsky Reviewed-by: Joao Martins Reviewed-by: Vasant Hegde Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-23-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 7 +++++-- drivers/iommu/amd/iommu.c | 2 +- include/linux/amd-iommu.h | 2 -- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 375a29022000..d95742faae11 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -894,8 +894,11 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, enable_remapped_mode = false; - /* Try to enable guest_mode in IRTE */ - pi.base = avic_get_backing_page_address(svm); + /* + * Try to enable guest_mode in IRTE. Note, the address + * of the vCPU's AVIC backing page is passed to the + * IOMMU via vcpu_info->pi_desc_addr. + */ pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, svm->vcpu.vcpu_id); pi.is_guest_mode = true; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index ed96050d4933..bdfb57af7111 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3888,7 +3888,7 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info) pi_data->ir_data = ir_data; if (pi_data->is_guest_mode) { - ir_data->ga_root_ptr = (pi_data->base >> 12); + ir_data->ga_root_ptr = (vcpu_pi_info->pi_desc_addr >> 12); ir_data->ga_vector = vcpu_pi_info->vector; ir_data->ga_tag = pi_data->ga_tag; ret = amd_iommu_activate_guest_mode(ir_data); diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 1f9b13d803c5..deeefc92a5cf 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -19,8 +19,6 @@ struct amd_iommu; */ struct amd_iommu_pi_data { u32 ga_tag; - u64 base; - bool is_guest_mode; struct vcpu_data *vcpu_data; void *ir_data; From 95d50ebe6df80b791d599b1f7434f3a8cac8a84f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:25 -0700 Subject: [PATCH 047/101] iommu/amd: KVM: SVM: Pass NULL @vcpu_info to indicate "not guest mode" Pass NULL to amd_ir_set_vcpu_affinity() to communicate "don't post to a vCPU" now that there's no need to communicate information back to KVM about the previous vCPU (KVM does its own tracking). Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-24-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 16 ++++------------ drivers/iommu/amd/iommu.c | 10 +++++++--- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index d95742faae11..e09722f5b5fa 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -928,18 +928,10 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, } } - ret = 0; - if (enable_remapped_mode) { - /* Use legacy mode in IRTE */ - struct amd_iommu_pi_data pi; - - /** - * Here, pi is used to: - * - Tell IOMMU to use legacy mode for this interrupt. 
- */ - pi.is_guest_mode = false; - ret = irq_set_vcpu_affinity(host_irq, &pi); - } + if (enable_remapped_mode) + ret = irq_set_vcpu_affinity(host_irq, NULL); + else + ret = 0; out: srcu_read_unlock(&kvm->irq_srcu, idx); return ret; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index bdfb57af7111..e8e259c07265 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3864,7 +3864,6 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info) { int ret; struct amd_iommu_pi_data *pi_data = vcpu_info; - struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data; struct amd_ir_data *ir_data = data->chip_data; struct irq_2_irte *irte_info = &ir_data->irq_2_irte; struct iommu_dev_data *dev_data; @@ -3885,9 +3884,14 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info) return -EINVAL; ir_data->cfg = irqd_cfg(data); - pi_data->ir_data = ir_data; - if (pi_data->is_guest_mode) { + if (pi_data) { + struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data; + + pi_data->ir_data = ir_data; + + WARN_ON_ONCE(!pi_data->is_guest_mode); + ir_data->ga_root_ptr = (vcpu_pi_info->pi_desc_addr >> 12); ir_data->ga_vector = vcpu_pi_info->vector; ir_data->ga_tag = pi_data->ga_tag; From 1e663ed239923447a0217d78a96bcdc8729022e0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:26 -0700 Subject: [PATCH 048/101] KVM: SVM: Stop walking list of routing table entries when updating IRTE Now that KVM explicitly passes the new/current GSI routing to pi_update_irte(), simply use the provided routing entry and stop walking the routing table to find that entry. KVM, via setup_routing_entry() and sanity checked by kvm_get_msi_route(), disallows having a GSI configured to trigger multiple MSIs. I.e. this is subtly a glorified nop, as KVM allows at most one MSI per GSI, the for-loop can only ever process one entry, and that entry is the new/current entry (see the WARN_ON_ONCE() added by "KVM: x86: Pass new routing entries and irqfd when updating IRTEs" to ensure @new matches the entry found in the routing table). Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-25-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 105 ++++++++++++++++------------------------ 1 file changed, 42 insertions(+), 63 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index e09722f5b5fa..a1a5b86a37fe 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -844,11 +844,10 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, struct kvm_kernel_irq_routing_entry *new) { - struct kvm_kernel_irq_routing_entry *e; - struct kvm_irq_routing_table *irq_rt; bool enable_remapped_mode = true; - bool set = !!new; - int idx, ret = 0; + struct vcpu_data vcpu_info; + struct vcpu_svm *svm = NULL; + int ret = 0; if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass()) return 0; @@ -860,72 +859,53 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, svm_ir_list_del(irqfd); pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", - __func__, host_irq, guest_irq, set); + __func__, host_irq, guest_irq, !!new); - idx = srcu_read_lock(&kvm->irq_srcu); - irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + /** + * Here, we setup with legacy mode in the following cases: + * 1. When cannot target interrupt to a specific vcpu. + * 2. Unsetting posted interrupt. + * 3. 
APIC virtualization is disabled for the vcpu. + * 4. IRQ has incompatible delivery mode (SMI, INIT, etc) + */ + if (new && new->type == KVM_IRQ_ROUTING_MSI && + !get_pi_vcpu_info(kvm, new, &vcpu_info, &svm) && + kvm_vcpu_apicv_active(&svm->vcpu)) { + struct amd_iommu_pi_data pi; - if (guest_irq >= irq_rt->nr_rt_entries || - hlist_empty(&irq_rt->map[guest_irq])) { - pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", - guest_irq, irq_rt->nr_rt_entries); - goto out; - } + enable_remapped_mode = false; - hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { - struct vcpu_data vcpu_info; - struct vcpu_svm *svm = NULL; - - if (e->type != KVM_IRQ_ROUTING_MSI) - continue; - - WARN_ON_ONCE(new && memcmp(e, new, sizeof(*new))); + /* + * Try to enable guest_mode in IRTE. Note, the address + * of the vCPU's AVIC backing page is passed to the + * IOMMU via vcpu_info->pi_desc_addr. + */ + pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, + svm->vcpu.vcpu_id); + pi.is_guest_mode = true; + pi.vcpu_data = &vcpu_info; + ret = irq_set_vcpu_affinity(host_irq, &pi); /** - * Here, we setup with legacy mode in the following cases: - * 1. When cannot target interrupt to a specific vcpu. - * 2. Unsetting posted interrupt. - * 3. APIC virtualization is disabled for the vcpu. - * 4. IRQ has incompatible delivery mode (SMI, INIT, etc) + * Here, we successfully setting up vcpu affinity in + * IOMMU guest mode. Now, we need to store the posted + * interrupt information in a per-vcpu ir_list so that + * we can reference to them directly when we update vcpu + * scheduling information in IOMMU irte. */ - if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && - kvm_vcpu_apicv_active(&svm->vcpu)) { - struct amd_iommu_pi_data pi; + if (!ret) + ret = svm_ir_list_add(svm, irqfd, &pi); + } - enable_remapped_mode = false; + if (!ret && svm) { + trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id, + guest_irq, vcpu_info.vector, + vcpu_info.pi_desc_addr, !!new); + } - /* - * Try to enable guest_mode in IRTE. Note, the address - * of the vCPU's AVIC backing page is passed to the - * IOMMU via vcpu_info->pi_desc_addr. - */ - pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, - svm->vcpu.vcpu_id); - pi.is_guest_mode = true; - pi.vcpu_data = &vcpu_info; - ret = irq_set_vcpu_affinity(host_irq, &pi); - - /** - * Here, we successfully setting up vcpu affinity in - * IOMMU guest mode. Now, we need to store the posted - * interrupt information in a per-vcpu ir_list so that - * we can reference to them directly when we update vcpu - * scheduling information in IOMMU irte. - */ - if (!ret && pi.is_guest_mode) - svm_ir_list_add(svm, irqfd, &pi); - } - - if (!ret && svm) { - trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id, - e->gsi, vcpu_info.vector, - vcpu_info.pi_desc_addr, set); - } - - if (ret < 0) { - pr_err("%s: failed to update PI IRTE\n", __func__); - goto out; - } + if (ret < 0) { + pr_err("%s: failed to update PI IRTE\n", __func__); + goto out; } if (enable_remapped_mode) @@ -933,7 +913,6 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, else ret = 0; out: - srcu_read_unlock(&kvm->irq_srcu, idx); return ret; } From 23ca102e6fb284f3c1d08d8bbbddceff2353427f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:27 -0700 Subject: [PATCH 049/101] KVM: VMX: Stop walking list of routing table entries when updating IRTE Now that KVM provides the to-be-updated routing entry, stop walking the routing table to find that entry. 
KVM, via setup_routing_entry() and sanity checked by kvm_get_msi_route(), disallows having a GSI configured to trigger multiple MSIs, i.e. the for-loop can only process one entry. Link: https://lore.kernel.org/r/20250611224604.313496-26-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/posted_intr.c | 90 +++++++++++----------------------- 1 file changed, 28 insertions(+), 62 deletions(-) diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index d4826a6b674f..e59eae11f476 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -302,78 +302,44 @@ int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, struct kvm_kernel_irq_routing_entry *new) { - struct kvm_kernel_irq_routing_entry *e; - struct kvm_irq_routing_table *irq_rt; - bool enable_remapped_mode = true; struct kvm_lapic_irq irq; struct kvm_vcpu *vcpu; struct vcpu_data vcpu_info; - bool set = !!new; - int idx, ret = 0; if (!vmx_can_use_vtd_pi(kvm)) return 0; - idx = srcu_read_lock(&kvm->irq_srcu); - irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - if (guest_irq >= irq_rt->nr_rt_entries || - hlist_empty(&irq_rt->map[guest_irq])) { - pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", - guest_irq, irq_rt->nr_rt_entries); - goto out; - } + /* + * VT-d PI cannot support posting multicast/broadcast + * interrupts to a vCPU, we still use interrupt remapping + * for these kind of interrupts. + * + * For lowest-priority interrupts, we only support + * those with single CPU as the destination, e.g. user + * configures the interrupts via /proc/irq or uses + * irqbalance to make the interrupts single-CPU. + * + * We will support full lowest-priority interrupt later. + * + * In addition, we can only inject generic interrupts using + * the PI mechanism, refuse to route others through it. + */ + if (!new || new->type != KVM_IRQ_ROUTING_MSI) + goto do_remapping; - hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { - if (e->type != KVM_IRQ_ROUTING_MSI) - continue; + kvm_set_msi_irq(kvm, new, &irq); - WARN_ON_ONCE(new && memcmp(e, new, sizeof(*new))); + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || + !kvm_irq_is_postable(&irq)) + goto do_remapping; - /* - * VT-d PI cannot support posting multicast/broadcast - * interrupts to a vCPU, we still use interrupt remapping - * for these kind of interrupts. - * - * For lowest-priority interrupts, we only support - * those with single CPU as the destination, e.g. user - * configures the interrupts via /proc/irq or uses - * irqbalance to make the interrupts single-CPU. - * - * We will support full lowest-priority interrupt later. - * - * In addition, we can only inject generic interrupts using - * the PI mechanism, refuse to route others through it. 
- */ + vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); + vcpu_info.vector = irq.vector; - kvm_set_msi_irq(kvm, e, &irq); - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || - !kvm_irq_is_postable(&irq)) - continue; + trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, guest_irq, + vcpu_info.vector, vcpu_info.pi_desc_addr, true); - vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); - vcpu_info.vector = irq.vector; - - trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, - vcpu_info.vector, vcpu_info.pi_desc_addr, set); - - if (!set) - continue; - - enable_remapped_mode = false; - - ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - if (ret < 0) { - printk(KERN_INFO "%s: failed to update PI IRTE\n", - __func__); - goto out; - } - } - - if (enable_remapped_mode) - ret = irq_set_vcpu_affinity(host_irq, NULL); - - ret = 0; -out: - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; + return irq_set_vcpu_affinity(host_irq, &vcpu_info); +do_remapping: + return irq_set_vcpu_affinity(host_irq, NULL); } From 0a64c447f6f81f79b69294c16b28680be59a3649 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:28 -0700 Subject: [PATCH 050/101] KVM: SVM: Extract SVM specific code out of get_pi_vcpu_info() Genericize SVM's get_pi_vcpu_info() so that it can be shared with VMX. The only SVM specific information it provides is the AVIC back page, and that can be trivially retrieved by its sole caller. No functional change intended. Cc: Francesco Lavra Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-27-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index a1a5b86a37fe..86d2dc7c1940 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -817,14 +817,14 @@ static int svm_ir_list_add(struct vcpu_svm *svm, */ static int get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, - struct vcpu_data *vcpu_info, struct vcpu_svm **svm) + struct vcpu_data *vcpu_info, struct kvm_vcpu **vcpu) { struct kvm_lapic_irq irq; - struct kvm_vcpu *vcpu = NULL; + *vcpu = NULL; kvm_set_msi_irq(kvm, e, &irq); - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || + if (!kvm_intr_is_single_vcpu(kvm, &irq, vcpu) || !kvm_irq_is_postable(&irq)) { pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n", __func__, irq.vector); @@ -833,8 +833,6 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, irq.vector); - *svm = to_svm(vcpu); - vcpu_info->pi_desc_addr = avic_get_backing_page_address(*svm); vcpu_info->vector = irq.vector; return 0; @@ -846,7 +844,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, { bool enable_remapped_mode = true; struct vcpu_data vcpu_info; - struct vcpu_svm *svm = NULL; + struct kvm_vcpu *vcpu = NULL; int ret = 0; if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass()) @@ -869,19 +867,20 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, * 4. 
IRQ has incompatible delivery mode (SMI, INIT, etc) */ if (new && new->type == KVM_IRQ_ROUTING_MSI && - !get_pi_vcpu_info(kvm, new, &vcpu_info, &svm) && - kvm_vcpu_apicv_active(&svm->vcpu)) { + !get_pi_vcpu_info(kvm, new, &vcpu_info, &vcpu) && + kvm_vcpu_apicv_active(vcpu)) { struct amd_iommu_pi_data pi; enable_remapped_mode = false; + vcpu_info.pi_desc_addr = avic_get_backing_page_address(to_svm(vcpu)); + /* * Try to enable guest_mode in IRTE. Note, the address * of the vCPU's AVIC backing page is passed to the * IOMMU via vcpu_info->pi_desc_addr. */ - pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, - svm->vcpu.vcpu_id); + pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, vcpu->vcpu_id); pi.is_guest_mode = true; pi.vcpu_data = &vcpu_info; ret = irq_set_vcpu_affinity(host_irq, &pi); @@ -894,11 +893,11 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, * scheduling information in IOMMU irte. */ if (!ret) - ret = svm_ir_list_add(svm, irqfd, &pi); + ret = svm_ir_list_add(to_svm(vcpu), irqfd, &pi); } - if (!ret && svm) { - trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id, + if (!ret && vcpu) { + trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, guest_irq, vcpu_info.vector, vcpu_info.pi_desc_addr, !!new); } From f5369619f7f8add44fc5e3d05364d2ed293995fa Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:29 -0700 Subject: [PATCH 051/101] KVM: x86: Move IRQ routing/delivery APIs from x86.c => irq.c Move a bunch of IRQ routing and delivery APIs from x86.c to irq.c. x86.c has grown quite fat, and irq.c is the perfect landing spot. Opportunistically rewrite kvm_arch_irq_bypass_del_producer()'s comment, as the existing comment has several typos and is rather confusing. Suggested-by: Paolo Bonzini Link: https://lore.kernel.org/r/20250611224604.313496-28-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/irq.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 87 --------------------------------------------- 2 files changed, 88 insertions(+), 87 deletions(-) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index a0b1499baf6e..f34c2deedf78 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -11,6 +11,7 @@ #include #include +#include #include "hyperv.h" #include "ioapic.h" @@ -332,6 +333,18 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, return -EWOULDBLOCK; } +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, + bool line_status) +{ + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + + irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event->irq, irq_event->level, + line_status); + return 0; +} + bool kvm_arch_can_set_irq_routing(struct kvm *kvm) { return irqchip_in_kernel(kvm); @@ -495,6 +508,81 @@ void kvm_arch_irq_routing_update(struct kvm *kvm) kvm_make_scan_ioapic_request(kvm); } +int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; + int ret = 0; + + kvm_arch_start_assignment(irqfd->kvm); + + spin_lock_irq(&kvm->irqfds.lock); + irqfd->producer = prod; + + if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) { + ret = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, prod->irq, + irqfd->gsi, &irqfd->irq_entry); + if (ret) + kvm_arch_end_assignment(irqfd->kvm); + } + spin_unlock_irq(&kvm->irqfds.lock); + + return ret; +} + +void 
kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; + int ret; + + WARN_ON(irqfd->producer != prod); + + /* + * If the producer of an IRQ that is currently being posted to a vCPU + * is unregistered, change the associated IRTE back to remapped mode as + * the IRQ has been released (or repurposed) by the device driver, i.e. + * KVM must relinquish control of the IRTE. + */ + spin_lock_irq(&kvm->irqfds.lock); + irqfd->producer = NULL; + + if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) { + ret = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, prod->irq, + irqfd->gsi, NULL); + if (ret) + pr_info("irq bypass consumer (eventfd %p) unregistration fails: %d\n", + irqfd->consumer.eventfd, ret); + } + + spin_unlock_irq(&kvm->irqfds.lock); + + + kvm_arch_end_assignment(irqfd->kvm); +} + +int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) +{ + return kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, irqfd->producer->irq, + irqfd->gsi, new); +} + +bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) +{ + if (old->type != KVM_IRQ_ROUTING_MSI || + new->type != KVM_IRQ_ROUTING_MSI) + return true; + + return !!memcmp(&old->msi, &new->msi, sizeof(new->msi)); +} + #ifdef CONFIG_KVM_IOAPIC #define IOAPIC_ROUTING_ENTRY(irq) \ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c176ff44bb9a..7091d7d7eebd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6420,18 +6420,6 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) kvm_vcpu_kick(vcpu); } -int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, - bool line_status) -{ - if (!irqchip_in_kernel(kvm)) - return -ENXIO; - - irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, - irq_event->irq, irq_event->level, - line_status); - return 0; -} - int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { @@ -13504,81 +13492,6 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); -int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, - struct irq_bypass_producer *prod) -{ - struct kvm_kernel_irqfd *irqfd = - container_of(cons, struct kvm_kernel_irqfd, consumer); - struct kvm *kvm = irqfd->kvm; - int ret = 0; - - kvm_arch_start_assignment(irqfd->kvm); - - spin_lock_irq(&kvm->irqfds.lock); - irqfd->producer = prod; - - if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) { - ret = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, prod->irq, - irqfd->gsi, &irqfd->irq_entry); - if (ret) - kvm_arch_end_assignment(irqfd->kvm); - } - spin_unlock_irq(&kvm->irqfds.lock); - - return ret; -} - -void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, - struct irq_bypass_producer *prod) -{ - struct kvm_kernel_irqfd *irqfd = - container_of(cons, struct kvm_kernel_irqfd, consumer); - struct kvm *kvm = irqfd->kvm; - int ret; - - WARN_ON(irqfd->producer != prod); - - /* - * When producer of consumer is unregistered, we change back to - * remapped mode, so we can re-use the current implementation - * when the irq is masked/disabled or the consumer side (KVM - * int this case doesn't want to receive the interrupts. 
- */ - spin_lock_irq(&kvm->irqfds.lock); - irqfd->producer = NULL; - - if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) { - ret = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, prod->irq, - irqfd->gsi, NULL); - if (ret) - pr_info("irq bypass consumer (eventfd %p) unregistration fails: %d\n", - irqfd->consumer.eventfd, ret); - } - - spin_unlock_irq(&kvm->irqfds.lock); - - - kvm_arch_end_assignment(irqfd->kvm); -} - -int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, - struct kvm_kernel_irq_routing_entry *old, - struct kvm_kernel_irq_routing_entry *new) -{ - return kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, irqfd->producer->irq, - irqfd->gsi, new); -} - -bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, - struct kvm_kernel_irq_routing_entry *new) -{ - if (old->type != KVM_IRQ_ROUTING_MSI || - new->type != KVM_IRQ_ROUTING_MSI) - return true; - - return !!memcmp(&old->msi, &new->msi, sizeof(new->msi)); -} - bool kvm_vector_hashing_enabled(void) { return vector_hashing; From 9517aedecd0e73716b766f5c603518fd6e383454 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:30 -0700 Subject: [PATCH 052/101] KVM: x86: Nullify irqfd->producer after updating IRTEs Nullify irqfd->producer (when it's going away) _after_ updating IRTEs so that the producer can be queried during the update. Link: https://lore.kernel.org/r/20250611224604.313496-29-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/irq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index f34c2deedf78..b3ae9e43a254 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -549,7 +549,6 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, * KVM must relinquish control of the IRTE. */ spin_lock_irq(&kvm->irqfds.lock); - irqfd->producer = NULL; if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) { ret = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, prod->irq, @@ -558,10 +557,10 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, pr_info("irq bypass consumer (eventfd %p) unregistration fails: %d\n", irqfd->consumer.eventfd, ret); } + irqfd->producer = NULL; spin_unlock_irq(&kvm->irqfds.lock); - kvm_arch_end_assignment(irqfd->kvm); } From cf04ec393ed08b3d0566f0735675639732dca23a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:31 -0700 Subject: [PATCH 053/101] KVM: x86: Dedup AVIC vs. PI code for identifying target vCPU Hoist the logic for identifying the target vCPU for a posted interrupt into common x86. The code is functionally identical between Intel and AMD. 
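Below is a rough standalone sketch of the now-shared selection logic, with stub helpers standing in for kvm_set_msi_irq(), kvm_intr_is_single_vcpu(), and kvm_irq_is_postable() (toy types, not the kernel code):

#include <stdbool.h>
#include <stdio.h>

struct vcpu { int id; };

/* Stubs modeling the three checks the common helper performs. */
static bool route_is_msi(int type)	{ return type == 1; }
static bool single_vcpu_dest(struct vcpu **v)
{
	static struct vcpu cpu0 = { .id = 0 };

	*v = &cpu0;
	return true;
}
static bool irq_is_postable(void)	{ return true; }

/* Model of the hoisted logic: resolve the route to a single postable
 * target vCPU, or NULL to force remapped (host-controlled) mode. */
static struct vcpu *pi_target_vcpu(int route_type)
{
	struct vcpu *vcpu = NULL;

	if (!route_is_msi(route_type))
		return NULL;
	if (!single_vcpu_dest(&vcpu) || !irq_is_postable())
		return NULL;
	return vcpu;
}

int main(void)
{
	printf("MSI route:  %s\n", pi_target_vcpu(1) ? "post to vCPU" : "remap");
	printf("!MSI route: %s\n", pi_target_vcpu(0) ? "post to vCPU" : "remap");
	return 0;
}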
Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-30-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/irq.c | 45 +++++++++++++++--- arch/x86/kvm/svm/avic.c | 82 ++++++++------------------------- arch/x86/kvm/svm/svm.h | 2 +- arch/x86/kvm/vmx/posted_intr.c | 51 +++++--------------- arch/x86/kvm/vmx/posted_intr.h | 2 +- 6 files changed, 73 insertions(+), 111 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 30b3ecc01684..f91791473f0b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1855,7 +1855,7 @@ struct kvm_x86_ops { int (*pi_update_irte)(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, - struct kvm_kernel_irq_routing_entry *new); + struct kvm_vcpu *vcpu, u32 vector); void (*pi_start_assignment)(struct kvm *kvm); void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index b3ae9e43a254..6a407a958af7 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -508,6 +508,42 @@ void kvm_arch_irq_routing_update(struct kvm *kvm) kvm_make_scan_ioapic_request(kvm); } +static int kvm_pi_update_irte(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *entry) +{ + struct kvm *kvm = irqfd->kvm; + struct kvm_vcpu *vcpu = NULL; + struct kvm_lapic_irq irq; + + if (!irqchip_in_kernel(kvm) || + !kvm_arch_has_irq_bypass() || + !kvm_arch_has_assigned_device(kvm)) + return 0; + + if (entry && entry->type == KVM_IRQ_ROUTING_MSI) { + kvm_set_msi_irq(kvm, entry, &irq); + + /* + * Force remapped mode if hardware doesn't support posting the + * virtual interrupt to a vCPU. Only IRQs are postable (NMIs, + * SMIs, etc. are not), and neither AMD nor Intel IOMMUs support + * posting multicast/broadcast IRQs. If the interrupt can't be + * posted, the device MSI needs to be routed to the host so that + * the guest's desired interrupt can be synthesized by KVM. + * + * This means that KVM can only post lowest-priority interrupts + * if they have a single CPU as the destination, e.g. only if + * the guest has affined the interrupt to a single vCPU. 
+ */ + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || + !kvm_irq_is_postable(&irq)) + vcpu = NULL; + } + + return kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, irqfd->producer->irq, + irqfd->gsi, vcpu, irq.vector); +} + int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, struct irq_bypass_producer *prod) { @@ -522,8 +558,7 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, irqfd->producer = prod; if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) { - ret = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, prod->irq, - irqfd->gsi, &irqfd->irq_entry); + ret = kvm_pi_update_irte(irqfd, &irqfd->irq_entry); if (ret) kvm_arch_end_assignment(irqfd->kvm); } @@ -551,8 +586,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, spin_lock_irq(&kvm->irqfds.lock); if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) { - ret = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, prod->irq, - irqfd->gsi, NULL); + ret = kvm_pi_update_irte(irqfd, NULL); if (ret) pr_info("irq bypass consumer (eventfd %p) unregistration fails: %d\n", irqfd->consumer.eventfd, ret); @@ -568,8 +602,7 @@ int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new) { - return kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, irqfd->producer->irq, - irqfd->gsi, new); + return kvm_pi_update_irte(irqfd, new); } bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 86d2dc7c1940..153d8a84b8ef 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -804,52 +804,12 @@ static int svm_ir_list_add(struct vcpu_svm *svm, return 0; } -/* - * Note: - * The HW cannot support posting multicast/broadcast - * interrupts to a vCPU. So, we still use legacy interrupt - * remapping for these kind of interrupts. - * - * For lowest-priority interrupts, we only support - * those with single CPU as the destination, e.g. user - * configures the interrupts via /proc/irq or uses - * irqbalance to make the interrupts single-CPU. - */ -static int -get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, - struct vcpu_data *vcpu_info, struct kvm_vcpu **vcpu) -{ - struct kvm_lapic_irq irq; - *vcpu = NULL; - - kvm_set_msi_irq(kvm, e, &irq); - - if (!kvm_intr_is_single_vcpu(kvm, &irq, vcpu) || - !kvm_irq_is_postable(&irq)) { - pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n", - __func__, irq.vector); - return -1; - } - - pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, - irq.vector); - vcpu_info->vector = irq.vector; - - return 0; -} - int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, - struct kvm_kernel_irq_routing_entry *new) + struct kvm_vcpu *vcpu, u32 vector) { - bool enable_remapped_mode = true; - struct vcpu_data vcpu_info; - struct kvm_vcpu *vcpu = NULL; int ret = 0; - if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass()) - return 0; - /* * If the IRQ was affined to a different vCPU, remove the IRTE metadata * from the *previous* vCPU's list. 
@@ -857,7 +817,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, svm_ir_list_del(irqfd); pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", - __func__, host_irq, guest_irq, !!new); + __func__, host_irq, guest_irq, !!vcpu); /** * Here, we setup with legacy mode in the following cases: @@ -866,23 +826,23 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, * 3. APIC virtualization is disabled for the vcpu. * 4. IRQ has incompatible delivery mode (SMI, INIT, etc) */ - if (new && new->type == KVM_IRQ_ROUTING_MSI && - !get_pi_vcpu_info(kvm, new, &vcpu_info, &vcpu) && - kvm_vcpu_apicv_active(vcpu)) { - struct amd_iommu_pi_data pi; - - enable_remapped_mode = false; - - vcpu_info.pi_desc_addr = avic_get_backing_page_address(to_svm(vcpu)); - + if (vcpu && kvm_vcpu_apicv_active(vcpu)) { /* * Try to enable guest_mode in IRTE. Note, the address * of the vCPU's AVIC backing page is passed to the * IOMMU via vcpu_info->pi_desc_addr. */ - pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, vcpu->vcpu_id); - pi.is_guest_mode = true; - pi.vcpu_data = &vcpu_info; + struct vcpu_data vcpu_info = { + .pi_desc_addr = avic_get_backing_page_address(to_svm(vcpu)), + .vector = vector, + }; + + struct amd_iommu_pi_data pi = { + .ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, vcpu->vcpu_id), + .is_guest_mode = true, + .vcpu_data = &vcpu_info, + }; + ret = irq_set_vcpu_affinity(host_irq, &pi); /** @@ -894,12 +854,11 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, */ if (!ret) ret = svm_ir_list_add(to_svm(vcpu), irqfd, &pi); - } - if (!ret && vcpu) { - trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, - guest_irq, vcpu_info.vector, - vcpu_info.pi_desc_addr, !!new); + trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, guest_irq, + vector, vcpu_info.pi_desc_addr, true); + } else { + ret = irq_set_vcpu_affinity(host_irq, NULL); } if (ret < 0) { @@ -907,10 +866,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, goto out; } - if (enable_remapped_mode) - ret = irq_set_vcpu_affinity(host_irq, NULL); - else - ret = 0; + ret = 0; out: return ret; } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 939ff0e35a2b..b5cd1927b009 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -747,7 +747,7 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu); void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, - struct kvm_kernel_irq_routing_entry *new); + struct kvm_vcpu *vcpu, u32 vector); void avic_vcpu_blocking(struct kvm_vcpu *vcpu); void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); void avic_ring_doorbell(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index e59eae11f476..3de767c5d6b2 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -300,46 +300,19 @@ void vmx_pi_start_assignment(struct kvm *kvm) int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, - struct kvm_kernel_irq_routing_entry *new) + struct kvm_vcpu *vcpu, u32 vector) { - struct kvm_lapic_irq irq; - struct kvm_vcpu *vcpu; - struct vcpu_data vcpu_info; + if (vcpu) { + struct vcpu_data vcpu_info = { + .pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)), + .vector = vector, + }; - if (!vmx_can_use_vtd_pi(kvm)) - return 0; + trace_kvm_pi_irte_update(host_irq, 
vcpu->vcpu_id, guest_irq, + vcpu_info.vector, vcpu_info.pi_desc_addr, true); - /* - * VT-d PI cannot support posting multicast/broadcast - * interrupts to a vCPU, we still use interrupt remapping - * for these kind of interrupts. - * - * For lowest-priority interrupts, we only support - * those with single CPU as the destination, e.g. user - * configures the interrupts via /proc/irq or uses - * irqbalance to make the interrupts single-CPU. - * - * We will support full lowest-priority interrupt later. - * - * In addition, we can only inject generic interrupts using - * the PI mechanism, refuse to route others through it. - */ - if (!new || new->type != KVM_IRQ_ROUTING_MSI) - goto do_remapping; - - kvm_set_msi_irq(kvm, new, &irq); - - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || - !kvm_irq_is_postable(&irq)) - goto do_remapping; - - vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); - vcpu_info.vector = irq.vector; - - trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, guest_irq, - vcpu_info.vector, vcpu_info.pi_desc_addr, true); - - return irq_set_vcpu_affinity(host_irq, &vcpu_info); -do_remapping: - return irq_set_vcpu_affinity(host_irq, NULL); + return irq_set_vcpu_affinity(host_irq, &vcpu_info); + } else { + return irq_set_vcpu_affinity(host_irq, NULL); + } } diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index a94afcb55f7f..94ed66ea6249 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -16,7 +16,7 @@ void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, - struct kvm_kernel_irq_routing_entry *new); + struct kvm_vcpu *vcpu, u32 vector); void vmx_pi_start_assignment(struct kvm *kvm); static inline int pi_find_highest_vector(struct pi_desc *pi_desc) From c5af31698d719acf35ec3e3a3610a7c92dac0e6e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:32 -0700 Subject: [PATCH 054/101] KVM: x86: Move posted interrupt tracepoint to common code Move the pi_irte_update tracepoint to common x86, and call it whenever the IRTE is modified. Tracing only the modifications that result in an IRQ being posted to a vCPU makes the tracepoint useless for debugging. Drop the vendor specific address; plumbing that into common code isn't worth the trouble, as the address is meaningless without a whole pile of other information that isn't provided in any tracepoint. 
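A minimal standalone model of the resulting trace semantics, with printf() standing in for the tracepoint itself (an illustration, not the actual trace machinery):

#include <stdio.h>

struct vcpu { int vcpu_id; };

/* The common tracepoint fires on every IRTE update and encodes "no target
 * vCPU", i.e. reverting to remapped mode, as vcpu_id == -1. */
static void trace_pi_irte_update(unsigned int host_irq, struct vcpu *vcpu,
				 unsigned int gsi, unsigned int gvec)
{
	printf("PI is %s for irq %u, vcpu %d, gsi: 0x%x, gvec: 0x%x\n",
	       vcpu ? "enabled and being updated" : "disabled",
	       host_irq, vcpu ? vcpu->vcpu_id : -1, gsi, gvec);
}

int main(void)
{
	struct vcpu v = { .vcpu_id = 3 };

	trace_pi_irte_update(42, &v, 0x18, 0x31);	/* posted to vCPU 3 */
	trace_pi_irte_update(42, NULL, 0x18, 0x31);	/* back to remapped */
	return 0;
}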
Link: https://lore.kernel.org/r/20250611224604.313496-31-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/irq.c | 12 ++++++++++-- arch/x86/kvm/svm/avic.c | 6 ------ arch/x86/kvm/trace.h | 19 +++++++------------ arch/x86/kvm/vmx/posted_intr.c | 3 --- arch/x86/kvm/x86.c | 1 - 5 files changed, 17 insertions(+), 24 deletions(-) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 6a407a958af7..cd9b56c6a5c3 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -511,9 +511,11 @@ void kvm_arch_irq_routing_update(struct kvm *kvm) static int kvm_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm_kernel_irq_routing_entry *entry) { + unsigned int host_irq = irqfd->producer->irq; struct kvm *kvm = irqfd->kvm; struct kvm_vcpu *vcpu = NULL; struct kvm_lapic_irq irq; + int r; if (!irqchip_in_kernel(kvm) || !kvm_arch_has_irq_bypass() || @@ -540,8 +542,13 @@ static int kvm_pi_update_irte(struct kvm_kernel_irqfd *irqfd, vcpu = NULL; } - return kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, irqfd->producer->irq, - irqfd->gsi, vcpu, irq.vector); + r = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, host_irq, irqfd->gsi, + vcpu, irq.vector); + if (r) + return r; + + trace_kvm_pi_irte_update(host_irq, vcpu, irqfd->gsi, irq.vector, !!vcpu); + return 0; } int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, @@ -595,6 +602,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, spin_unlock_irq(&kvm->irqfds.lock); + kvm_arch_end_assignment(irqfd->kvm); } diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 153d8a84b8ef..b0f41ae631bb 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -816,9 +816,6 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, */ svm_ir_list_del(irqfd); - pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", - __func__, host_irq, guest_irq, !!vcpu); - /** * Here, we setup with legacy mode in the following cases: * 1. When cannot target interrupt to a specific vcpu. @@ -854,9 +851,6 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, */ if (!ret) ret = svm_ir_list_add(to_svm(vcpu), irqfd, &pi); - - trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, guest_irq, - vector, vcpu_info.pi_desc_addr, true); } else { ret = irq_set_vcpu_affinity(host_irq, NULL); } diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index ababdba2c186..57d79fd31df0 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1176,37 +1176,32 @@ TRACE_EVENT(kvm_smm_transition, * Tracepoint for VT-d posted-interrupts and AMD-Vi Guest Virtual APIC. */ TRACE_EVENT(kvm_pi_irte_update, - TP_PROTO(unsigned int host_irq, unsigned int vcpu_id, - unsigned int gsi, unsigned int gvec, - u64 pi_desc_addr, bool set), - TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set), + TP_PROTO(unsigned int host_irq, struct kvm_vcpu *vcpu, + unsigned int gsi, unsigned int gvec, bool set), + TP_ARGS(host_irq, vcpu, gsi, gvec, set), TP_STRUCT__entry( __field( unsigned int, host_irq ) - __field( unsigned int, vcpu_id ) + __field( int, vcpu_id ) __field( unsigned int, gsi ) __field( unsigned int, gvec ) - __field( u64, pi_desc_addr ) __field( bool, set ) ), TP_fast_assign( __entry->host_irq = host_irq; - __entry->vcpu_id = vcpu_id; + __entry->vcpu_id = vcpu ? 
vcpu->vcpu_id : -1; __entry->gsi = gsi; __entry->gvec = gvec; - __entry->pi_desc_addr = pi_desc_addr; __entry->set = set; ), - TP_printk("PI is %s for irq %u, vcpu %u, gsi: 0x%x, " - "gvec: 0x%x, pi_desc_addr: 0x%llx", + TP_printk("PI is %s for irq %u, vcpu %d, gsi: 0x%x, gvec: 0x%x", __entry->set ? "enabled and being updated" : "disabled", __entry->host_irq, __entry->vcpu_id, __entry->gsi, - __entry->gvec, - __entry->pi_desc_addr) + __entry->gvec) ); /* diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 3de767c5d6b2..687ffde3b61c 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -308,9 +308,6 @@ int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, .vector = vector, }; - trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, guest_irq, - vcpu_info.vector, vcpu_info.pi_desc_addr, true); - return irq_set_vcpu_affinity(host_irq, &vcpu_info); } else { return irq_set_vcpu_affinity(host_irq, NULL); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7091d7d7eebd..5ebe5a384fd9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13891,7 +13891,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log); From 803928483669b41e84c27086ffcf28438c1a8cca Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:33 -0700 Subject: [PATCH 055/101] KVM: SVM: Clean up return handling in avic_pi_update_irte() Clean up the return paths for avic_pi_update_irte() now that the refactoring dust has settled. Opportunistically drop the pr_err() on IRTE update failures. Logging that a failure occurred without _any_ context is quite useless. Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-32-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index b0f41ae631bb..eed5c58ac07f 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -808,8 +808,6 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, struct kvm_vcpu *vcpu, u32 vector) { - int ret = 0; - /* * If the IRQ was affined to a different vCPU, remove the IRTE metadata * from the *previous* vCPU's list. @@ -839,8 +837,11 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, .is_guest_mode = true, .vcpu_data = &vcpu_info, }; + int ret; ret = irq_set_vcpu_affinity(host_irq, &pi); + if (ret) + return ret; /** * Here, we successfully setting up vcpu affinity in @@ -849,20 +850,9 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, * we can reference to them directly when we update vcpu * scheduling information in IOMMU irte. 
*/ - if (!ret) - ret = svm_ir_list_add(to_svm(vcpu), irqfd, &pi); - } else { - ret = irq_set_vcpu_affinity(host_irq, NULL); + return svm_ir_list_add(to_svm(vcpu), irqfd, &pi); } - - if (ret < 0) { - pr_err("%s: failed to update PI IRTE\n", __func__); - goto out; - } - - ret = 0; -out: - return ret; + return irq_set_vcpu_affinity(host_irq, NULL); } static inline int From 53527ea1b70224d16d29edbd5c850456469f00ec Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:34 -0700 Subject: [PATCH 056/101] iommu: KVM: Split "struct vcpu_data" into separate AMD vs. Intel structs Split the vcpu_data structure that serves as a handoff from KVM to IOMMU drivers into vendor specific structures. Overloading a single structure makes the code hard to read and maintain, is *very* misleading as it suggests that mixing vendors is actually supported, and bastardizing Intel's posted interrupt descriptor address when AMD's IOMMU already has its own structure is quite unnecessary. Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-33-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/irq_remapping.h | 15 ++++++++++++++- arch/x86/kvm/svm/avic.c | 21 ++++++++------------- arch/x86/kvm/vmx/posted_intr.c | 4 ++-- drivers/iommu/amd/iommu.c | 12 ++++-------- drivers/iommu/intel/irq_remapping.c | 10 +++++----- include/linux/amd-iommu.h | 12 ------------ 6 files changed, 33 insertions(+), 41 deletions(-) diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 5036f13ab69f..2dbc9cb61c2f 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -26,7 +26,20 @@ enum { IRQ_REMAP_X2APIC_MODE, }; -struct vcpu_data { +/* + * This is mainly used to communicate information back-and-forth + * between SVM and IOMMU for setting up and tearing down posted + * interrupt + */ +struct amd_iommu_pi_data { + u64 vapic_addr; /* Physical address of the vCPU's vAPIC. */ + u32 ga_tag; + u32 vector; /* Guest vector of the interrupt */ + bool is_guest_mode; + void *ir_data; +}; + +struct intel_iommu_pi_data { u64 pi_desc_addr; /* Physical address of PI Descriptor */ u32 vector; /* Guest vector of the interrupt */ }; diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index eed5c58ac07f..dc1526fef18d 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -823,23 +823,18 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, */ if (vcpu && kvm_vcpu_apicv_active(vcpu)) { /* - * Try to enable guest_mode in IRTE. Note, the address - * of the vCPU's AVIC backing page is passed to the - * IOMMU via vcpu_info->pi_desc_addr. + * Try to enable guest_mode in IRTE. */ - struct vcpu_data vcpu_info = { - .pi_desc_addr = avic_get_backing_page_address(to_svm(vcpu)), - .vector = vector, - }; - - struct amd_iommu_pi_data pi = { - .ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, vcpu->vcpu_id), + struct amd_iommu_pi_data pi_data = { + .ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, + vcpu->vcpu_id), .is_guest_mode = true, - .vcpu_data = &vcpu_info, + .vapic_addr = avic_get_backing_page_address(to_svm(vcpu)), + .vector = vector, }; int ret; - ret = irq_set_vcpu_affinity(host_irq, &pi); + ret = irq_set_vcpu_affinity(host_irq, &pi_data); if (ret) return ret; @@ -850,7 +845,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, * we can reference to them directly when we update vcpu * scheduling information in IOMMU irte. 
*/ - return svm_ir_list_add(to_svm(vcpu), irqfd, &pi); + return svm_ir_list_add(to_svm(vcpu), irqfd, &pi_data); } return irq_set_vcpu_affinity(host_irq, NULL); } diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 687ffde3b61c..3a23c30f73cb 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -303,12 +303,12 @@ int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, struct kvm_vcpu *vcpu, u32 vector) { if (vcpu) { - struct vcpu_data vcpu_info = { + struct intel_iommu_pi_data pi_data = { .pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)), .vector = vector, }; - return irq_set_vcpu_affinity(host_irq, &vcpu_info); + return irq_set_vcpu_affinity(host_irq, &pi_data); } else { return irq_set_vcpu_affinity(host_irq, NULL); } diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index e8e259c07265..8366d32252cd 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3860,10 +3860,10 @@ int amd_iommu_deactivate_guest_mode(void *data) } EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode); -static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info) +static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *info) { int ret; - struct amd_iommu_pi_data *pi_data = vcpu_info; + struct amd_iommu_pi_data *pi_data = info; struct amd_ir_data *ir_data = data->chip_data; struct irq_2_irte *irte_info = &ir_data->irq_2_irte; struct iommu_dev_data *dev_data; @@ -3886,14 +3886,10 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info) ir_data->cfg = irqd_cfg(data); if (pi_data) { - struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data; - pi_data->ir_data = ir_data; - WARN_ON_ONCE(!pi_data->is_guest_mode); - - ir_data->ga_root_ptr = (vcpu_pi_info->pi_desc_addr >> 12); - ir_data->ga_vector = vcpu_pi_info->vector; + ir_data->ga_root_ptr = (pi_data->vapic_addr >> 12); + ir_data->ga_vector = pi_data->vector; ir_data->ga_tag = pi_data->ga_tag; ret = amd_iommu_activate_guest_mode(ir_data); } else { diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index cf7b6882ec75..2fc451253dc3 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1244,10 +1244,10 @@ static void intel_ir_compose_msi_msg(struct irq_data *irq_data, static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info) { struct intel_ir_data *ir_data = data->chip_data; - struct vcpu_data *vcpu_pi_info = info; + struct intel_iommu_pi_data *pi_data = info; /* stop posting interrupts, back to the default mode */ - if (!vcpu_pi_info) { + if (!pi_data) { __intel_ir_reconfigure_irte(data, true); } else { struct irte irte_pi; @@ -1265,10 +1265,10 @@ static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info) /* Update the posted mode fields */ irte_pi.p_pst = 1; irte_pi.p_urgent = 0; - irte_pi.p_vector = vcpu_pi_info->vector; - irte_pi.pda_l = (vcpu_pi_info->pi_desc_addr >> + irte_pi.p_vector = pi_data->vector; + irte_pi.pda_l = (pi_data->pi_desc_addr >> (32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT); - irte_pi.pda_h = (vcpu_pi_info->pi_desc_addr >> 32) & + irte_pi.pda_h = (pi_data->pi_desc_addr >> 32) & ~(-1UL << PDA_HIGH_BIT); ir_data->irq_2_iommu.posted_vcpu = true; diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index deeefc92a5cf..99b4fa9a0296 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -12,18 +12,6 @@ struct amd_iommu; -/* - * This is mainly used to communicate information 
back-and-forth - * between SVM and IOMMU for setting up and tearing down posted - * interrupt - */ -struct amd_iommu_pi_data { - u32 ga_tag; - bool is_guest_mode; - struct vcpu_data *vcpu_data; - void *ir_data; -}; - #ifdef CONFIG_AMD_IOMMU struct task_struct; From b33252b9d17238a4a9fa5a29af6e6a2922a6c2b0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:35 -0700 Subject: [PATCH 057/101] KVM: Don't WARN if updating IRQ bypass route fails Don't bother WARNing if updating an IRTE route fails now that vendor code provides much more precise WARNs. The generic WARN doesn't provide enough information to actually debug the problem, and has obviously done nothing to surface the myriad bugs in KVM x86's implementation. Drop all of the associated return code plumbing that existed just so that common KVM could WARN. Link: https://lore.kernel.org/r/20250611224604.313496-34-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/arm64/kvm/arm.c | 6 +++--- arch/arm64/kvm/vgic/vgic-v4.c | 8 +++----- arch/x86/kvm/irq.c | 8 ++++---- include/kvm/arm_vgic.h | 2 +- include/linux/kvm_host.h | 6 +++--- virt/kvm/eventfd.c | 15 ++++++--------- 6 files changed, 20 insertions(+), 25 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a9a39e0375f7..94fb2f096a20 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2771,9 +2771,9 @@ bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, return memcmp(&old->msi, &new->msi, sizeof(new->msi)); } -int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, - struct kvm_kernel_irq_routing_entry *old, - struct kvm_kernel_irq_routing_entry *new) +void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) { /* * Remapping the vLPI requires taking the its_lock mutex to resolve diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index 911170d4a9c8..ef3481963122 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -527,18 +527,17 @@ static struct vgic_irq *__vgic_host_irq_get_vlpi(struct kvm *kvm, int host_irq) return NULL; } -int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq) +void kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq) { struct vgic_irq *irq; unsigned long flags; - int ret = 0; if (!vgic_supports_direct_msis(kvm)) - return 0; + return; irq = __vgic_host_irq_get_vlpi(kvm, host_irq); if (!irq) - return 0; + return; raw_spin_lock_irqsave(&irq->irq_lock, flags); WARN_ON(irq->hw && irq->host_irq != host_irq); @@ -550,5 +549,4 @@ int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq) raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(kvm, irq); - return 0; } diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index cd9b56c6a5c3..b7b0fbc218b8 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -606,11 +606,11 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, kvm_arch_end_assignment(irqfd->kvm); } -int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, - struct kvm_kernel_irq_routing_entry *old, - struct kvm_kernel_irq_routing_entry *new) +void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) { - return kvm_pi_update_irte(irqfd, new); + kvm_pi_update_irte(irqfd, new); } bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, 
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 4a34f7f0a864..b2a04481de1a 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -434,7 +434,7 @@ struct kvm_kernel_irq_routing_entry; int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int irq, struct kvm_kernel_irq_routing_entry *irq_entry); -int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq); +void kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq); int vgic_v4_load(struct kvm_vcpu *vcpu); void vgic_v4_commit(struct kvm_vcpu *vcpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a4160c1c0c6b..8e74ac0f90b1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2410,9 +2410,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *, struct irq_bypass_producer *); void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *); void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *); -int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, - struct kvm_kernel_irq_routing_entry *old, - struct kvm_kernel_irq_routing_entry *new); +void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new); bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *, struct kvm_kernel_irq_routing_entry *); #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */ diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index bd1766da6895..719b242fc935 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -285,11 +285,11 @@ void __attribute__((weak)) kvm_arch_irq_bypass_start( { } -int __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, - struct kvm_kernel_irq_routing_entry *old, - struct kvm_kernel_irq_routing_entry *new) +void __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) { - return 0; + } bool __attribute__((weak)) kvm_arch_irqfd_route_changed( @@ -617,11 +617,8 @@ void kvm_irq_routing_update(struct kvm *kvm) #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) if (irqfd->producer && - kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) { - int ret = kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry); - - WARN_ON(ret); - } + kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) + kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry); #endif } From 77bb184ab880171a1cedfbed9ab05977e6ae2258 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:36 -0700 Subject: [PATCH 058/101] KVM: Fold kvm_arch_irqfd_route_changed() into kvm_arch_update_irqfd_routing() Fold kvm_arch_irqfd_route_changed() into kvm_arch_update_irqfd_routing(). Calling arch code to know whether or not to call arch code is absurd. 
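A self-contained sketch of the folded check under simplified types (the real code compares the MSI payload of kvm_kernel_irq_routing_entry; everything here is a stand-in):

#include <string.h>
#include <stdio.h>

#define ROUTING_MSI 1

struct route { int type; struct { unsigned int addr, data; } msi; };

/* Model: the arch hook now decides for itself whether anything changed,
 * instead of asking a second arch hook before making the call. */
static void update_irqfd_routing(const struct route *old, const struct route *new)
{
	if (old->type == ROUTING_MSI && new->type == ROUTING_MSI &&
	    !memcmp(&old->msi, &new->msi, sizeof(new->msi)))
		return;	/* identical MSI routing: nothing to do */

	printf("updating IRTE\n");
}

int main(void)
{
	struct route a = { ROUTING_MSI, { 0xfee00000, 0x31 } };
	struct route b = a;

	update_irqfd_routing(&a, &b);	/* no-op, routes identical */
	b.msi.data = 0x32;
	update_irqfd_routing(&a, &b);	/* prints */
	return 0;
}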
Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250611224604.313496-35-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/arm64/kvm/arm.c | 15 +++++---------- arch/x86/kvm/irq.c | 15 +++++---------- include/linux/kvm_host.h | 2 -- virt/kvm/eventfd.c | 10 +--------- 4 files changed, 11 insertions(+), 31 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 94fb2f096a20..04f2116927b1 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2761,20 +2761,15 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq); } -bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, - struct kvm_kernel_irq_routing_entry *new) -{ - if (old->type != KVM_IRQ_ROUTING_MSI || - new->type != KVM_IRQ_ROUTING_MSI) - return true; - - return memcmp(&old->msi, &new->msi, sizeof(new->msi)); -} - void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new) { + if (old->type == KVM_IRQ_ROUTING_MSI && + new->type == KVM_IRQ_ROUTING_MSI && + !memcmp(&old->msi, &new->msi, sizeof(new->msi))) + return; + /* * Remapping the vLPI requires taking the its_lock mutex to resolve * the new translation. We're in spinlock land at this point, so no diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index b7b0fbc218b8..23e0acc07cb6 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -610,19 +610,14 @@ void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new) { + if (old->type == KVM_IRQ_ROUTING_MSI && + new->type == KVM_IRQ_ROUTING_MSI && + !memcmp(&old->msi, &new->msi, sizeof(new->msi))) + return; + kvm_pi_update_irte(irqfd, new); } -bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, - struct kvm_kernel_irq_routing_entry *new) -{ - if (old->type != KVM_IRQ_ROUTING_MSI || - new->type != KVM_IRQ_ROUTING_MSI) - return true; - - return !!memcmp(&old->msi, &new->msi, sizeof(new->msi)); -} - #ifdef CONFIG_KVM_IOAPIC #define IOAPIC_ROUTING_ENTRY(irq) \ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8e74ac0f90b1..fb9ec06aa807 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2413,8 +2413,6 @@ void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *); void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new); -bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *, - struct kvm_kernel_irq_routing_entry *); #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */ #ifdef CONFIG_HAVE_KVM_INVALID_WAKEUPS diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 719b242fc935..59b1e64697f1 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -291,13 +291,6 @@ void __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, { } - -bool __attribute__((weak)) kvm_arch_irqfd_route_changed( - struct kvm_kernel_irq_routing_entry *old, - struct kvm_kernel_irq_routing_entry *new) -{ - return true; -} #endif static int @@ -616,8 +609,7 @@ void kvm_irq_routing_update(struct kvm *kvm) irqfd_update(kvm, irqfd); #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) - if (irqfd->producer && - kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) + if (irqfd->producer) kvm_arch_update_irqfd_routing(irqfd, &old, 
&irqfd->irq_entry); #endif } From 511754bc548b59e136fcf433da23ba39c74684cf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:37 -0700 Subject: [PATCH 059/101] KVM: x86: Track irq_bypass_vcpu in common x86 code Track the vCPU that is being targeted for IRQ bypass, a.k.a. for a posted IRQ, in common x86 code. This will allow for additional consolidation of the SVM and VMX code. Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-36-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/irq.c | 7 ++++++- arch/x86/kvm/svm/avic.c | 4 ---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 23e0acc07cb6..5b3f721808d5 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -544,8 +544,13 @@ static int kvm_pi_update_irte(struct kvm_kernel_irqfd *irqfd, r = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, host_irq, irqfd->gsi, vcpu, irq.vector); - if (r) + if (r) { + WARN_ON_ONCE(irqfd->irq_bypass_vcpu && !vcpu); + irqfd->irq_bypass_vcpu = NULL; return r; + } + + irqfd->irq_bypass_vcpu = vcpu; trace_kvm_pi_irte_update(host_irq, vcpu, irqfd->gsi, irq.vector, !!vcpu); return 0; diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index dc1526fef18d..21a822d64867 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -768,22 +768,18 @@ static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd) spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags); list_del(&irqfd->vcpu_list); spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags); - - irqfd->irq_bypass_vcpu = NULL; } static int svm_ir_list_add(struct vcpu_svm *svm, struct kvm_kernel_irqfd *irqfd, struct amd_iommu_pi_data *pi) { - struct kvm_vcpu *vcpu = &svm->vcpu; unsigned long flags; u64 entry; if (WARN_ON_ONCE(!pi->ir_data)) return -EINVAL; - irqfd->irq_bypass_vcpu = vcpu; irqfd->irq_bypass_data = pi->ir_data; spin_lock_irqsave(&svm->ir_list_lock, flags); From dc6adb13046abe199194f788e5d0dc41c67db5bc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:38 -0700 Subject: [PATCH 060/101] KVM: x86: Skip IOMMU IRTE updates if there's no old or new vCPU being targeted Don't "reconfigure" an IRTE into host controlled mode when it's already in that state, i.e. if KVM's GSI routing changes but the IRQ wasn't and still isn't being posted to a vCPU. Link: https://lore.kernel.org/r/20250611224604.313496-37-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/irq.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 5b3f721808d5..ecebd150c2f7 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -542,6 +542,9 @@ static int kvm_pi_update_irte(struct kvm_kernel_irqfd *irqfd, vcpu = NULL; } + if (!irqfd->irq_bypass_vcpu && !vcpu) + return 0; + r = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, host_irq, irqfd->gsi, vcpu, irq.vector); From cc8b13105eac969bb721c52cf95fd35818b5c54c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:39 -0700 Subject: [PATCH 061/101] KVM: x86: Don't update IRTE entries when old and new routes were !MSI Skip the entirety of IRTE updates on a GSI routing change if neither the old nor the new routing is for an MSI, i.e. if neither routing setup allows for posting to a vCPU. If the IRTE isn't already host controlled, KVM has bigger problems.
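As a rough standalone model of the two skip conditions added by this patch and the preceding one (toy types; not the kernel code):

#include <stdbool.h>
#include <stdio.h>

#define ROUTING_MSI 1

struct route { int type; };
struct irqfd { void *irq_bypass_vcpu; };

/* Model of PATCH 060: if the IRQ wasn't and won't be posted to a vCPU,
 * the IRTE is already host controlled, so skip the IOMMU update. */
static bool skip_no_vcpu(const struct irqfd *irqfd, void *new_vcpu)
{
	return !irqfd->irq_bypass_vcpu && !new_vcpu;
}

/* Model of PATCH 061: if neither the old nor the new route is an MSI,
 * neither setup could have posted to a vCPU in the first place. */
static bool skip_not_msi(const struct route *old, const struct route *new)
{
	return old->type != ROUTING_MSI && new->type != ROUTING_MSI;
}

int main(void)
{
	struct route old = { 0 }, new = { 0 };
	struct irqfd fd = { NULL };

	printf("skip (!MSI): %d, skip (no vCPU): %d\n",
	       skip_not_msi(&old, &new), skip_no_vcpu(&fd, NULL));
	return 0;
}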
Link: https://lore.kernel.org/r/20250611224604.313496-38-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/irq.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index ecebd150c2f7..bc9ebd893566 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -618,6 +618,10 @@ void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new) { + if (new->type != KVM_IRQ_ROUTING_MSI && + old->type != KVM_IRQ_ROUTING_MSI) + return; + if (old->type == KVM_IRQ_ROUTING_MSI && new->type == KVM_IRQ_ROUTING_MSI && !memcmp(&old->msi, &new->msi, sizeof(new->msi))) From 71d6b3b8e69d8a212c53df1150e559b101d6768b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:40 -0700 Subject: [PATCH 062/101] KVM: SVM: Revert IRTE to legacy mode if IOMMU doesn't provide IR metadata Revert the IRTE back to remapping mode if the AMD IOMMU driver mucks up and doesn't provide the necessary metadata. Returning an error up the stack without actually handling the error is useless and confusing. Link: https://lore.kernel.org/r/20250611224604.313496-39-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 21a822d64867..031f374a0b41 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -770,16 +770,13 @@ static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd) spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags); } -static int svm_ir_list_add(struct vcpu_svm *svm, - struct kvm_kernel_irqfd *irqfd, - struct amd_iommu_pi_data *pi) +static void svm_ir_list_add(struct vcpu_svm *svm, + struct kvm_kernel_irqfd *irqfd, + struct amd_iommu_pi_data *pi) { unsigned long flags; u64 entry; - if (WARN_ON_ONCE(!pi->ir_data)) - return -EINVAL; - irqfd->irq_bypass_data = pi->ir_data; spin_lock_irqsave(&svm->ir_list_lock, flags); @@ -797,7 +794,6 @@ static int svm_ir_list_add(struct vcpu_svm *svm, list_add(&irqfd->vcpu_list, &svm->ir_list); spin_unlock_irqrestore(&svm->ir_list_lock, flags); - return 0; } int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, @@ -834,6 +830,16 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, if (ret) return ret; + /* + * Revert to legacy mode if the IOMMU didn't provide metadata + * for the IRTE, which KVM needs to keep the IRTE up-to-date, + * e.g. if the vCPU is migrated or AVIC is disabled. + */ + if (WARN_ON_ONCE(!pi_data.ir_data)) { + irq_set_vcpu_affinity(host_irq, NULL); + return -EIO; + } + /** * Here, we successfully setting up vcpu affinity in * IOMMU guest mode. Now, we need to store the posted @@ -841,7 +847,8 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, * we can reference to them directly when we update vcpu * scheduling information in IOMMU irte. 
*/ - return svm_ir_list_add(to_svm(vcpu), irqfd, &pi_data); + svm_ir_list_add(to_svm(vcpu), irqfd, &pi_data); + return 0; } return irq_set_vcpu_affinity(host_irq, NULL); } From c3d591c91f9c99a35f8cf80d28ea689214a1acf3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:41 -0700 Subject: [PATCH 063/101] KVM: SVM: Take and hold ir_list_lock across IRTE updates in IOMMU Now that svm_ir_list_add() isn't overloaded with all manner of weird things, fold it into avic_pi_update_irte(), and more importantly take ir_list_lock across the irq_set_vcpu_affinity() calls to ensure the info that's shoved into the IRTE is fresh. While preemption (and IRQs) is disabled on the task performing the IRTE update, thanks to irqfds.lock, that task doesn't hold the vCPU's mutex, i.e. preemption being disabled is irrelevant. Link: https://lore.kernel.org/r/20250611224604.313496-40-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 55 +++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 031f374a0b41..42fd1868c32f 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -770,32 +770,6 @@ static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd) spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags); } -static void svm_ir_list_add(struct vcpu_svm *svm, - struct kvm_kernel_irqfd *irqfd, - struct amd_iommu_pi_data *pi) -{ - unsigned long flags; - u64 entry; - - irqfd->irq_bypass_data = pi->ir_data; - - spin_lock_irqsave(&svm->ir_list_lock, flags); - - /* - * Update the target pCPU for IOMMU doorbells if the vCPU is running. - * If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM - * will update the pCPU info when the vCPU awkened and/or scheduled in. - * See also avic_vcpu_load(). - */ - entry = svm->avic_physical_id_entry; - if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) - amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK, - true, pi->ir_data); - - list_add(&irqfd->vcpu_list, &svm->ir_list); - spin_unlock_irqrestore(&svm->ir_list_lock, flags); -} - int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, struct kvm_vcpu *vcpu, u32 vector) @@ -824,8 +798,18 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, .vapic_addr = avic_get_backing_page_address(to_svm(vcpu)), .vector = vector, }; + struct vcpu_svm *svm = to_svm(vcpu); + u64 entry; int ret; + /* + * Prevent the vCPU from being scheduled out or migrated until + * the IRTE is updated and its metadata has been added to the + * list of IRQs being posted to the vCPU, to ensure the IRTE + * isn't programmed with stale pCPU/IsRunning information. + */ + guard(spinlock_irqsave)(&svm->ir_list_lock); + ret = irq_set_vcpu_affinity(host_irq, &pi_data); if (ret) return ret; @@ -840,14 +824,19 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, return -EIO; } - /** - * Here, we successfully setting up vcpu affinity in - * IOMMU guest mode. Now, we need to store the posted - * interrupt information in a per-vcpu ir_list so that - * we can reference to them directly when we update vcpu - * scheduling information in IOMMU irte. + /* + * Update the target pCPU for IOMMU doorbells if the vCPU is + * running. If the vCPU is NOT running, i.e. is blocking or + * scheduled out, KVM will update the pCPU info when the vCPU + * is awakened and/or scheduled in. 
See also avic_vcpu_load(). */ - svm_ir_list_add(to_svm(vcpu), irqfd, &pi_data); + entry = svm->avic_physical_id_entry; + if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) + amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK, + true, pi_data.ir_data); + + irqfd->irq_bypass_data = pi_data.ir_data; + list_add(&irqfd->vcpu_list, &svm->ir_list); return 0; } return irq_set_vcpu_affinity(host_irq, NULL); From 3be405e89f3daea23ddfcf6b6526ae44ce38dd9a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:42 -0700 Subject: [PATCH 064/101] iommu/amd: Document which IRTE fields amd_iommu_update_ga() can modify Add a comment to amd_iommu_update_ga() to document what fields it can safely modify without issuing an invalidation of the IRTE, and to explain its role in keeping GA IRTEs up-to-date. Per page 93 of the IOMMU spec dated Feb 2025: When virtual interrupts are enabled by setting MMIO Offset 0018h[GAEn] and IRTE[GuestMode=1], IRTE[IsRun], IRTE[Destination], and if present IRTE[GATag], are not cached by the IOMMU. Modifications to these fields do not require an invalidation of the Interrupt Remapping Table. Link: https://lore.kernel.org/all/9b7ceea3-8c47-4383-ad9c-1a9bbdc9044a@oracle.com Cc: Joao Martins Link: https://lore.kernel.org/r/20250611224604.313496-41-seanjc@google.com Signed-off-by: Sean Christopherson --- drivers/iommu/amd/iommu.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 8366d32252cd..0a0d73483195 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3986,6 +3986,18 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu) return 0; } +/* + * Update the pCPU information for an IRTE that is configured to post IRQs to + * a vCPU, without issuing an IOMMU invalidation for the IRTE. + * + * This API is intended to be used when a vCPU is scheduled in/out (or stops + * running for any reason), to do a fast update of IsRun and (conditionally) + * Destination. + * + * Per the IOMMU spec, the Destination, IsRun, and GATag fields are not cached + * and thus don't require an invalidation to ensure the IOMMU consumes fresh + * information. + */ int amd_iommu_update_ga(int cpu, bool is_run, void *data) { struct amd_ir_data *ir_data = (struct amd_ir_data *)data; From 08d9ccdd1a5c75d7aca7ac3af56f723d780dd6ac Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:43 -0700 Subject: [PATCH 065/101] iommu/amd: KVM: SVM: Infer IsRun from validity of pCPU destination Infer whether or not a vCPU should be marked running from the validity of the pCPU on which it is running. amd_iommu_update_ga() already skips the IRTE update if the pCPU is invalid, i.e. passing %true for is_run with an invalid pCPU would be a blatant and egregious KVM bug.
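To make the resulting calling convention concrete, here is a minimal, compilable user-space sketch of the idea; the struct and its fields are illustrative stand-ins for the real irte_ga, not the actual layout:

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative stand-in for the IRTE fields touched by the update. */
    struct fake_irte {
            unsigned int destination;
            bool is_run;
    };

    /* cpu >= 0 means the vCPU has a valid pCPU: set Destination and IsRun.
     * cpu < 0 means the vCPU isn't running: only clear IsRun. */
    static void update_ga(struct fake_irte *e, int cpu)
    {
            if (cpu >= 0) {
                    e->destination = (unsigned int)cpu;
                    e->is_run = true;
            } else {
                    e->is_run = false;
            }
    }

    int main(void)
    {
            struct fake_irte e = { 0 };

            update_ga(&e, 3);       /* vCPU scheduled in on pCPU 3 */
            printf("dest=%u is_run=%d\n", e.destination, e.is_run);
            update_ga(&e, -1);      /* vCPU scheduled out */
            printf("dest=%u is_run=%d\n", e.destination, e.is_run);
            return 0;
    }

Collapsing the (cpu, is_run) pair into a single signed parameter removes the possibility of passing a contradictory combination in the first place.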
Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-42-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 11 +++++------ drivers/iommu/amd/iommu.c | 14 +++++++++----- include/linux/amd-iommu.h | 6 ++---- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 42fd1868c32f..1960bb06c4b9 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -833,7 +833,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, entry = svm->avic_physical_id_entry; if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK, - true, pi_data.ir_data); + pi_data.ir_data); irqfd->irq_bypass_data = pi_data.ir_data; list_add(&irqfd->vcpu_list, &svm->ir_list); @@ -842,8 +842,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, return irq_set_vcpu_affinity(host_irq, NULL); } -static inline int -avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) +static inline int avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu) { int ret = 0; struct vcpu_svm *svm = to_svm(vcpu); @@ -862,7 +861,7 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) return 0; list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) { - ret = amd_iommu_update_ga(cpu, r, irqfd->irq_bypass_data); + ret = amd_iommu_update_ga(cpu, irqfd->irq_bypass_data); if (ret) return ret; } @@ -924,7 +923,7 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry); - avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true); + avic_update_iommu_vcpu_affinity(vcpu, h_physical_id); spin_unlock_irqrestore(&svm->ir_list_lock, flags); } @@ -964,7 +963,7 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) */ spin_lock_irqsave(&svm->ir_list_lock, flags); - avic_update_iommu_vcpu_affinity(vcpu, -1, 0); + avic_update_iommu_vcpu_affinity(vcpu, -1); entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; svm->avic_physical_id_entry = entry; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 0a0d73483195..3c4c81eb201b 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3990,15 +3990,17 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu) * Update the pCPU information for an IRTE that is configured to post IRQs to * a vCPU, without issuing an IOMMU invalidation for the IRTE. * - * This API is intended to be used when a vCPU is scheduled in/out (or stops - * running for any reason), to do a fast update of IsRun and (conditionally) - * Destination. + * If the vCPU is associated with a pCPU (@cpu >= 0), configure the Destination + * with the pCPU's APIC ID and set IsRun, else clear IsRun. I.e. treat vCPUs + * that are associated with a pCPU as running. This API is intended to be used + * when a vCPU is scheduled in/out (or stops running for any reason), to do a + * fast update of IsRun and (conditionally) Destination. * * Per the IOMMU spec, the Destination, IsRun, and GATag fields are not cached * and thus don't require an invalidation to ensure the IOMMU consumes fresh * information. 
*/ -int amd_iommu_update_ga(int cpu, bool is_run, void *data) +int amd_iommu_update_ga(int cpu, void *data) { struct amd_ir_data *ir_data = (struct amd_ir_data *)data; struct irte_ga *entry = (struct irte_ga *) ir_data->entry; @@ -4015,8 +4017,10 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data) APICID_TO_IRTE_DEST_LO(cpu); entry->hi.fields.destination = APICID_TO_IRTE_DEST_HI(cpu); + entry->lo.fields_vapic.is_run = true; + } else { + entry->lo.fields_vapic.is_run = false; } - entry->lo.fields_vapic.is_run = is_run; return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, ir_data->irq_2_irte.index, entry); diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 99b4fa9a0296..fe0e16ffe0e5 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -30,8 +30,7 @@ static inline void amd_iommu_detect(void) { } /* IOMMU AVIC Function */ extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32)); -extern int -amd_iommu_update_ga(int cpu, bool is_run, void *data); +extern int amd_iommu_update_ga(int cpu, void *data); extern int amd_iommu_activate_guest_mode(void *data); extern int amd_iommu_deactivate_guest_mode(void *data); @@ -44,8 +43,7 @@ amd_iommu_register_ga_log_notifier(int (*notifier)(u32)) return 0; } -static inline int -amd_iommu_update_ga(int cpu, bool is_run, void *data) +static inline int amd_iommu_update_ga(int cpu, void *data) { return 0; } From 0b2b541fa3cd85bb06fdd9353764673aa2bfd54a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:44 -0700 Subject: [PATCH 066/101] iommu/amd: Factor out helper for manipulating IRTE GA/CPU info Split the guts of amd_iommu_update_ga() to a dedicated helper so that the logic can be shared with flows that put the IRTE into posted mode. Opportunistically move amd_iommu_update_ga() and its new helper above amd_iommu_activate_guest_mode() so that it's all co-located. Link: https://lore.kernel.org/r/20250611224604.313496-43-seanjc@google.com Signed-off-by: Sean Christopherson --- drivers/iommu/amd/iommu.c | 87 +++++++++++++++++++++------------------ 1 file changed, 46 insertions(+), 41 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 3c4c81eb201b..2eaba64be68a 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3804,6 +3804,52 @@ static const struct irq_domain_ops amd_ir_domain_ops = { .deactivate = irq_remapping_deactivate, }; +static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu) +{ + if (cpu >= 0) { + entry->lo.fields_vapic.destination = + APICID_TO_IRTE_DEST_LO(cpu); + entry->hi.fields.destination = + APICID_TO_IRTE_DEST_HI(cpu); + entry->lo.fields_vapic.is_run = true; + } else { + entry->lo.fields_vapic.is_run = false; + } +} + +/* + * Update the pCPU information for an IRTE that is configured to post IRQs to + * a vCPU, without issuing an IOMMU invalidation for the IRTE. + * + * If the vCPU is associated with a pCPU (@cpu >= 0), configure the Destination + * with the pCPU's APIC ID and set IsRun, else clear IsRun. I.e. treat vCPUs + * that are associated with a pCPU as running. This API is intended to be used + * when a vCPU is scheduled in/out (or stops running for any reason), to do a + * fast update of IsRun and (conditionally) Destination. + * + * Per the IOMMU spec, the Destination, IsRun, and GATag fields are not cached + * and thus don't require an invalidation to ensure the IOMMU consumes fresh + * information. 
+ */ +int amd_iommu_update_ga(int cpu, void *data) +{ + struct amd_ir_data *ir_data = (struct amd_ir_data *)data; + struct irte_ga *entry = (struct irte_ga *) ir_data->entry; + + if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || + !entry || !entry->lo.fields_vapic.guest_mode) + return 0; + + if (!ir_data->iommu) + return -ENODEV; + + __amd_iommu_update_ga(entry, cpu); + + return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, + ir_data->irq_2_irte.index, entry); +} +EXPORT_SYMBOL(amd_iommu_update_ga); + int amd_iommu_activate_guest_mode(void *data) { struct amd_ir_data *ir_data = (struct amd_ir_data *)data; @@ -3985,45 +4031,4 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu) return 0; } - -/* - * Update the pCPU information for an IRTE that is configured to post IRQs to - * a vCPU, without issuing an IOMMU invalidation for the IRTE. - * - * If the vCPU is associated with a pCPU (@cpu >= 0), configure the Destination - * with the pCPU's APIC ID and set IsRun, else clear IsRun. I.e. treat vCPUs - * that are associated with a pCPU as running. This API is intended to be used - * when a vCPU is scheduled in/out (or stops running for any reason), to do a - * fast update of IsRun and (conditionally) Destination. - * - * Per the IOMMU spec, the Destination, IsRun, and GATag fields are not cached - * and thus don't require an invalidation to ensure the IOMMU consumes fresh - * information. - */ -int amd_iommu_update_ga(int cpu, void *data) -{ - struct amd_ir_data *ir_data = (struct amd_ir_data *)data; - struct irte_ga *entry = (struct irte_ga *) ir_data->entry; - - if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || - !entry || !entry->lo.fields_vapic.guest_mode) - return 0; - - if (!ir_data->iommu) - return -ENODEV; - - if (cpu >= 0) { - entry->lo.fields_vapic.destination = - APICID_TO_IRTE_DEST_LO(cpu); - entry->hi.fields.destination = - APICID_TO_IRTE_DEST_HI(cpu); - entry->lo.fields_vapic.is_run = true; - } else { - entry->lo.fields_vapic.is_run = false; - } - - return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, - ir_data->irq_2_irte.index, entry); -} -EXPORT_SYMBOL(amd_iommu_update_ga); #endif From f965255dc5033387ac7858787c6c792fd789ac34 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:45 -0700 Subject: [PATCH 067/101] iommu/amd: KVM: SVM: Set pCPU info in IRTE when setting vCPU affinity Now that setting vCPU affinity is guarded with ir_list_lock, i.e. now that avic_physical_id_entry can be safely accessed, set the pCPU info straight-away when setting vCPU affinity. Putting the IRTE into posted mode, and then immediately updating the IRTE a second time if the target vCPU is running is wasteful and confusing. This also fixes a flaw where a posted IRQ that arrives between putting the IRTE into guest_mode and setting the correct destination could cause the IOMMU to ring the doorbell on the wrong pCPU. 
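A condensed sketch of the selection logic, with the AVIC masks replaced by illustrative constants (the real bit definitions live in KVM's AVIC headers, and the helper below is a stand-in, not the actual avic_pi_update_irte()):

    #include <stdint.h>

    #define IS_RUNNING_MASK         (1ull << 62)    /* illustrative positions */
    #define HOST_PHYS_ID_MASK       0xffull

    /*
     * Pick the pCPU to program into the IRTE while holding ir_list_lock,
     * so the IRTE is written once, with complete and fresh information.
     */
    static int pick_irte_cpu(uint64_t physical_id_entry)
    {
            if (physical_id_entry & IS_RUNNING_MASK)
                    return (int)(physical_id_entry & HOST_PHYS_ID_MASK);

            return -1;      /* not running: IsRun must stay clear */
    }

    /* e.g. pick_irte_cpu(IS_RUNNING_MASK | 3) == 3, pick_irte_cpu(0) == -1 */

Programming Destination and IsRun in the same IRTE write is what closes the window described above: a posted IRQ can no longer observe GuestMode=1 paired with a stale destination.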
Link: https://lore.kernel.org/r/20250611224604.313496-44-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/irq_remapping.h | 1 + arch/x86/kvm/svm/avic.c | 26 ++++++++++++++------------ drivers/iommu/amd/iommu.c | 6 ++++-- include/linux/amd-iommu.h | 4 ++-- 4 files changed, 21 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 2dbc9cb61c2f..4c75a17632f6 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -35,6 +35,7 @@ struct amd_iommu_pi_data { u64 vapic_addr; /* Physical address of the vCPU's vAPIC. */ u32 ga_tag; u32 vector; /* Guest vector of the interrupt */ + int cpu; bool is_guest_mode; void *ir_data; }; diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 1960bb06c4b9..35c5fb831c28 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -727,6 +727,7 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu) static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) { + int apic_id = kvm_cpu_get_apicid(vcpu->cpu); int ret = 0; unsigned long flags; struct vcpu_svm *svm = to_svm(vcpu); @@ -746,7 +747,7 @@ static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) { if (activate) - ret = amd_iommu_activate_guest_mode(irqfd->irq_bypass_data); + ret = amd_iommu_activate_guest_mode(irqfd->irq_bypass_data, apic_id); else ret = amd_iommu_deactivate_guest_mode(irqfd->irq_bypass_data); if (ret) @@ -810,6 +811,18 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, */ guard(spinlock_irqsave)(&svm->ir_list_lock); + /* + * Update the target pCPU for IOMMU doorbells if the vCPU is + * running. If the vCPU is NOT running, i.e. is blocking or + * scheduled out, KVM will update the pCPU info when the vCPU + * is awakened and/or scheduled in. See also avic_vcpu_load(). + */ + entry = svm->avic_physical_id_entry; + if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) + pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; + else + pi_data.cpu = -1; + ret = irq_set_vcpu_affinity(host_irq, &pi_data); if (ret) return ret; @@ -824,17 +837,6 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, return -EIO; } - /* - * Update the target pCPU for IOMMU doorbells if the vCPU is - * running. If the vCPU is NOT running, i.e. is blocking or - * scheduled out, KVM will update the pCPU info when the vCPU - * is awakened and/or scheduled in. See also avic_vcpu_load(). 
- */ - entry = svm->avic_physical_id_entry; - if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) - amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK, - pi_data.ir_data); - irqfd->irq_bypass_data = pi_data.ir_data; list_add(&irqfd->vcpu_list, &svm->ir_list); return 0; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 2eaba64be68a..6352930ad011 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3850,7 +3850,7 @@ int amd_iommu_update_ga(int cpu, void *data) } EXPORT_SYMBOL(amd_iommu_update_ga); -int amd_iommu_activate_guest_mode(void *data) +int amd_iommu_activate_guest_mode(void *data, int cpu) { struct amd_ir_data *ir_data = (struct amd_ir_data *)data; struct irte_ga *entry = (struct irte_ga *) ir_data->entry; @@ -3871,6 +3871,8 @@ int amd_iommu_activate_guest_mode(void *data) entry->hi.fields.vector = ir_data->ga_vector; entry->lo.fields_vapic.ga_tag = ir_data->ga_tag; + __amd_iommu_update_ga(entry, cpu); + return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, ir_data->irq_2_irte.index, entry); } @@ -3937,7 +3939,7 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *info) ir_data->ga_root_ptr = (pi_data->vapic_addr >> 12); ir_data->ga_vector = pi_data->vector; ir_data->ga_tag = pi_data->ga_tag; - ret = amd_iommu_activate_guest_mode(ir_data); + ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu); } else { ret = amd_iommu_deactivate_guest_mode(ir_data); } diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index fe0e16ffe0e5..c9f2df0c4596 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -32,7 +32,7 @@ extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32)); extern int amd_iommu_update_ga(int cpu, void *data); -extern int amd_iommu_activate_guest_mode(void *data); +extern int amd_iommu_activate_guest_mode(void *data, int cpu); extern int amd_iommu_deactivate_guest_mode(void *data); #else /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */ @@ -48,7 +48,7 @@ static inline int amd_iommu_update_ga(int cpu, void *data) return 0; } -static inline int amd_iommu_activate_guest_mode(void *data) +static inline int amd_iommu_activate_guest_mode(void *data, int cpu) { return 0; } From 6df262f915abc0ec988b203c696845b76a7bc7b4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:46 -0700 Subject: [PATCH 068/101] iommu/amd: KVM: SVM: Add IRTE metadata to affined vCPU's list if AVIC is inhibited If an IRQ can be posted to a vCPU, but AVIC is currently inhibited on the vCPU, go through the dance of "affining" the IRTE to the vCPU, but leave the actual IRTE in remapped mode. KVM already handles the case where AVIC is inhibited => uninhibited with posted IRQs (see avic_set_pi_irte_mode()), but doesn't handle the scenario where a postable IRQ comes along while AVIC is inhibited. Link: https://lore.kernel.org/r/20250611224604.313496-45-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 16 ++++++---------- drivers/iommu/amd/iommu.c | 5 ++++- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 35c5fb831c28..cb6317fcaddf 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -781,21 +781,17 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, */ svm_ir_list_del(irqfd); - /** - * Here, we setup with legacy mode in the following cases: - * 1. When cannot target interrupt to a specific vcpu. 
- * 2. Unsetting posted interrupt. - * 3. APIC virtualization is disabled for the vcpu. - * 4. IRQ has incompatible delivery mode (SMI, INIT, etc) - */ - if (vcpu && kvm_vcpu_apicv_active(vcpu)) { + if (vcpu) { /* - * Try to enable guest_mode in IRTE. + * Try to enable guest_mode in IRTE, unless AVIC is inhibited, + * in which case configure the IRTE for legacy mode, but track + * the IRTE metadata so that it can be converted to guest mode + * if AVIC is enabled/uninhibited in the future. */ struct amd_iommu_pi_data pi_data = { .ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, vcpu->vcpu_id), - .is_guest_mode = true, + .is_guest_mode = kvm_vcpu_apicv_active(vcpu), .vapic_addr = avic_get_backing_page_address(to_svm(vcpu)), .vector = vector, }; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 6352930ad011..6149aece0eac 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3939,7 +3939,10 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *info) ir_data->ga_root_ptr = (pi_data->vapic_addr >> 12); ir_data->ga_vector = pi_data->vector; ir_data->ga_tag = pi_data->ga_tag; - ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu); + if (pi_data->is_guest_mode) + ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu); + else + ret = amd_iommu_deactivate_guest_mode(ir_data); } else { ret = amd_iommu_deactivate_guest_mode(ir_data); } From f5998661ff73b5e76974fd7913c1ccc3ff0a07a0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:47 -0700 Subject: [PATCH 069/101] KVM: SVM: Don't check for assigned device(s) when updating affinity Don't bother checking if a VM has an assigned device when updating AVIC vCPU affinity, querying ir_list is just as cheap and nothing prevents racing with changes in device assignment. Link: https://lore.kernel.org/r/20250611224604.313496-46-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index cb6317fcaddf..e1fa5c73bc98 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -848,9 +848,6 @@ static inline int avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu lockdep_assert_held(&svm->ir_list_lock); - if (!kvm_arch_has_assigned_device(vcpu->kvm)) - return 0; - /* * Here, we go through the per-vcpu ir_list to update all existing * interrupt remapping table entry targeting this vcpu. From fe0213923dd991cbe6a1e84b9bf600aaf172b530 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:48 -0700 Subject: [PATCH 070/101] KVM: SVM: Don't check for assigned device(s) when activating AVIC Don't short-circuit IRTE updating when (de)activating AVIC based on the VM having assigned devices, as nothing prevents AVIC (de)activation from racing with device (de)assignment. And from a performance perspective, bailing early when there is no assigned device doesn't add much, as ir_list_lock will never be contended if there's no assigned device. 
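The performance claim is easy to verify in isolation: walking an empty list degenerates to a single pointer check, so the early-out buys essentially nothing. A stand-alone illustration (not KVM code):

    #include <stdio.h>

    struct node { struct node *next; };

    int main(void)
    {
            struct node *ir_list = NULL;    /* analogue of an empty ir_list */
            int visits = 0;

            for (struct node *n = ir_list; n; n = n->next)
                    visits++;

            printf("entries visited: %d\n", visits);        /* prints 0 */
            return 0;
    }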
Link: https://lore.kernel.org/r/20250611224604.313496-47-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index e1fa5c73bc98..f3bfd7cb0bca 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -733,9 +733,6 @@ static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) struct vcpu_svm *svm = to_svm(vcpu); struct kvm_kernel_irqfd *irqfd; - if (!kvm_arch_has_assigned_device(vcpu->kvm)) - return 0; - /* * Here, we go through the per-vcpu ir_list to update all existing * interrupt remapping table entry targeting this vcpu. From 16562766f171199acf0ee5d6e6bbe4b4586293ce Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:49 -0700 Subject: [PATCH 071/101] KVM: SVM: WARN if (de)activating guest mode in IOMMU fails WARN if (de)activating "guest mode" for an IRTE entry fails as modifying an IRTE should only fail if KVM is buggy, e.g. has stale metadata. Link: https://lore.kernel.org/r/20250611224604.313496-48-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index f3bfd7cb0bca..31bf202e0026 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -725,10 +725,9 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } -static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) +static void avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) { int apic_id = kvm_cpu_get_apicid(vcpu->cpu); - int ret = 0; unsigned long flags; struct vcpu_svm *svm = to_svm(vcpu); struct kvm_kernel_irqfd *irqfd; @@ -743,16 +742,15 @@ static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) goto out; list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) { + void *data = irqfd->irq_bypass_data; + if (activate) - ret = amd_iommu_activate_guest_mode(irqfd->irq_bypass_data, apic_id); + WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, apic_id)); else - ret = amd_iommu_deactivate_guest_mode(irqfd->irq_bypass_data); - if (ret) - break; + WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data)); } out: spin_unlock_irqrestore(&svm->ir_list_lock, flags); - return ret; } static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd) From 48f79c6c86b38f9b6506eeef0d9ca2e03b6be6fe Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:50 -0700 Subject: [PATCH 072/101] KVM: SVM: Process all IRTEs on affinity change even if one update fails When updating IRTE GA fields, keep processing all other IRTEs if an update fails, as not updating later entries risks making a bad situation worse. 
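The pattern being adopted can be modeled outside the kernel; the macro below is a user-space stand-in for WARN_ON_ONCE() (using a GNU C statement expression), reporting a should-never-happen condition exactly once instead of propagating an error the caller can't act on:

    #include <stdio.h>

    #define warn_on_once(cond) ({                                   \
            static int __warned;                                    \
            int __c = !!(cond);                                     \
            if (__c && !__warned) {                                 \
                    __warned = 1;                                   \
                    fprintf(stderr, "WARN: %s\n", #cond);           \
            }                                                       \
            __c;                                                    \
    })

    static int broken_update(void) { return -22; /* e.g. -EINVAL */ }

    int main(void)
    {
            warn_on_once(broken_update());  /* reports once */
            warn_on_once(broken_update());  /* silent on repeat */
            return 0;
    }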
Link: https://lore.kernel.org/r/20250611224604.313496-49-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 31bf202e0026..6ecbd7984975 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -850,12 +850,10 @@ static inline int avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu if (list_empty(&svm->ir_list)) return 0; - list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) { + list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) ret = amd_iommu_update_ga(cpu, irqfd->irq_bypass_data); - if (ret) - return ret; - } - return 0; + + return ret; } void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) From cd86240fea2645abd3cdd4d87f27f3a4a0595da7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:51 -0700 Subject: [PATCH 073/101] KVM: SVM: WARN if updating IRTE GA fields in IOMMU fails WARN if updating GA information for an IRTE entry fails as modifying an IRTE should only fail if KVM is buggy, e.g. has stale metadata, and because returning an error that is always ignored is pointless. Link: https://lore.kernel.org/r/20250611224604.313496-50-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 6ecbd7984975..eaf793a28127 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -835,9 +835,8 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, return irq_set_vcpu_affinity(host_irq, NULL); } -static inline int avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu) +static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu) { - int ret = 0; struct vcpu_svm *svm = to_svm(vcpu); struct kvm_kernel_irqfd *irqfd; @@ -848,12 +847,10 @@ static inline int avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu * interrupt remapping table entry targeting this vcpu. */ if (list_empty(&svm->ir_list)) - return 0; + return; list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) - ret = amd_iommu_update_ga(cpu, irqfd->irq_bypass_data); - - return ret; + WARN_ON_ONCE(amd_iommu_update_ga(cpu, irqfd->irq_bypass_data)); } void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) From 04c4ca0ae47989d8f1263ffa6b77ab05d6160854 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:52 -0700 Subject: [PATCH 074/101] KVM: x86: Drop superfluous "has assigned device" check in kvm_pi_update_irte() Don't bother checking if the VM has an assigned device when updating IRTE entries. kvm_arch_irq_bypass_add_producer() explicitly increments the assigned device count, kvm_arch_irq_bypass_del_producer() explicitly decrements the count before invoking kvm_pi_update_irte(), and kvm_irq_routing_update() only updates IRTE entries if there's an active IRQ bypass producer. 
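One way to keep processing while still surfacing that something went wrong is to remember a failure instead of returning on it; a self-contained sketch with hypothetical names, not the actual KVM loop:

    struct entry { int broken; };

    static int update_one(struct entry *e)
    {
            return e->broken ? -5 /* e.g. -EIO */ : 0;
    }

    /* Update every entry even if one fails, so a single bad IRTE doesn't
     * strand later entries with stale pCPU information. */
    static int update_all(struct entry *entries, int n)
    {
            int ret = 0;

            for (int i = 0; i < n; i++) {
                    int err = update_one(&entries[i]);

                    if (err)
                            ret = err;      /* note it, keep going */
            }
            return ret;
    }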
Link: https://lore.kernel.org/r/20250611224604.313496-51-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/irq.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index bc9ebd893566..b3b6cf5b828a 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -517,9 +517,7 @@ static int kvm_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm_lapic_irq irq; int r; - if (!irqchip_in_kernel(kvm) || - !kvm_arch_has_irq_bypass() || - !kvm_arch_has_assigned_device(kvm)) + if (!irqchip_in_kernel(kvm) || !kvm_arch_has_irq_bypass()) return 0; if (entry && entry->type == KVM_IRQ_ROUTING_MSI) { From d1bccaa1793d8f823983824635933e4bdc752b64 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:53 -0700 Subject: [PATCH 075/101] KVM: x86: WARN if IRQ bypass isn't supported in kvm_pi_update_irte() WARN if kvm_pi_update_irte() is reached without IRQ bypass support, as the code is only reachable if the VM already has an IRQ bypass producer (see kvm_irq_routing_update()), or from kvm_arch_irq_bypass_{add,del}_producer(), which, stating the obvious, are called if and only if KVM enables its IRQ bypass hooks. Cc: David Matlack Link: https://lore.kernel.org/r/20250611224604.313496-52-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index b3b6cf5b828a..81a19d6afe44 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -517,7 +517,7 @@ static int kvm_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm_lapic_irq irq; int r; - if (!irqchip_in_kernel(kvm) || !kvm_arch_has_irq_bypass()) + if (!irqchip_in_kernel(kvm) || WARN_ON_ONCE(!kvm_arch_has_irq_bypass())) return 0; if (entry && entry->type == KVM_IRQ_ROUTING_MSI) { From 25ef059e8bc51219f18fb7444aff1907960a3a53 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:54 -0700 Subject: [PATCH 076/101] KVM: x86: WARN if IRQ bypass routing is updated without in-kernel local APIC Yell if kvm_pi_update_irte() is reached without an in-kernel local APIC, as kvm_arch_irqfd_allowed() should prevent attaching an irqfd and thus any and all postable IRQs to an APIC-less VM. Link: https://lore.kernel.org/r/20250611224604.313496-53-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/irq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 81a19d6afe44..f87f63a53ede 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -517,8 +517,8 @@ static int kvm_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm_lapic_irq irq; int r; - if (!irqchip_in_kernel(kvm) || WARN_ON_ONCE(!kvm_arch_has_irq_bypass())) - return 0; + if (WARN_ON_ONCE(!irqchip_in_kernel(kvm) || !kvm_arch_has_irq_bypass())) + return -EINVAL; if (entry && entry->type == KVM_IRQ_ROUTING_MSI) { kvm_set_msi_irq(kvm, entry, &irq); From 99836eb9c5dcae1f0e64da2cb6f7e0bb9151f95d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:55 -0700 Subject: [PATCH 077/101] KVM: SVM: WARN if ir_list is non-empty at vCPU free Now that AVIC IRTE tracking is in a mostly sane state, WARN if a vCPU is freed with ir_list entries, i.e. if KVM leaves a dangling IRTE. 
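The ordering that makes the check redundant can be summarized with a stand-in counter; function names follow the commit message, bodies are reduced to the relevant steps:

    static int assigned_device_count;

    static void update_irte(void) { /* IRTE update, contents irrelevant here */ }

    static void add_producer(void)
    {
            assigned_device_count++;        /* kvm_arch_start_assignment() */
            update_irte();                  /* count >= 1 by construction */
    }

    static void del_producer(void)
    {
            assigned_device_count--;        /* kvm_arch_end_assignment() */
            update_irte();                  /* count may already be 0, so
                                               gating on it here would wrongly
                                               skip the teardown update */
    }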
Initialize the per-vCPU interrupt remapping list and its lock even if AVIC is disabled so that the WARN doesn't hit false positives (and so that KVM doesn't need to call into AVIC code for a simple sanity check). Link: https://lore.kernel.org/r/20250611224604.313496-54-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 5 +++-- arch/x86/kvm/svm/svm.c | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index eaf793a28127..4b359a4a7e40 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -705,6 +705,9 @@ int avic_init_vcpu(struct vcpu_svm *svm) int ret; struct kvm_vcpu *vcpu = &svm->vcpu; + INIT_LIST_HEAD(&svm->ir_list); + spin_lock_init(&svm->ir_list_lock); + if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm)) return 0; @@ -712,8 +715,6 @@ int avic_init_vcpu(struct vcpu_svm *svm) if (ret) return ret; - INIT_LIST_HEAD(&svm->ir_list); - spin_lock_init(&svm->ir_list_lock); svm->dfr_reg = APIC_DFR_FLAT; return ret; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 68b6a1922078..7f0df9dbc232 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1491,6 +1491,8 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + WARN_ON_ONCE(!list_empty(&svm->ir_list)); + svm_leave_nested(vcpu); svm_free_nested(svm); From 77e1b8332d1d7aa786f7515e9bd4055def6a1e06 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:56 -0700 Subject: [PATCH 078/101] KVM: x86: Decouple device assignment from IRQ bypass Use a dedicated counter to track the number of IRQs that can utilize IRQ bypass instead of piggybacking the assigned device count. As evidenced by commit 2edd9cb79fb3 ("kvm: detect assigned device via irqbypass manager"), it's possible for a device to be able to post IRQs to a vCPU without said device being assigned to a VM. Leave the calls to kvm_arch_{start,end}_assignment() alone for the moment to avoid regressing the MMIO stale data mitigation. KVM is abusing the assigned device count when applying mmio_stale_data_clear, and it's not at all clear if vDPA devices rely on this behavior. This will hopefully be cleaned up in the future, as the number of assigned devices is a terrible heuristic for detecting if a VM has access to host MMIO. 
Link: https://lore.kernel.org/r/20250611224604.313496-55-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm-x86-ops.h | 2 +- arch/x86/include/asm/kvm_host.h | 4 +++- arch/x86/kvm/irq.c | 9 ++++++++- arch/x86/kvm/vmx/main.c | 2 +- arch/x86/kvm/vmx/posted_intr.c | 16 ++++++++++------ arch/x86/kvm/vmx/posted_intr.h | 2 +- arch/x86/kvm/x86.c | 3 +-- 7 files changed, 25 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 8d50e3e0a19b..8897f509860c 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -112,7 +112,7 @@ KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging) KVM_X86_OP_OPTIONAL(vcpu_blocking) KVM_X86_OP_OPTIONAL(vcpu_unblocking) KVM_X86_OP_OPTIONAL(pi_update_irte) -KVM_X86_OP_OPTIONAL(pi_start_assignment) +KVM_X86_OP_OPTIONAL(pi_start_bypass) KVM_X86_OP_OPTIONAL(apicv_pre_state_restore) KVM_X86_OP_OPTIONAL(apicv_post_state_restore) KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f91791473f0b..9a5d8e35f431 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1383,6 +1383,8 @@ struct kvm_arch { atomic_t noncoherent_dma_count; #define __KVM_HAVE_ARCH_ASSIGNED_DEVICE atomic_t assigned_device_count; + unsigned long nr_possible_bypass_irqs; + #ifdef CONFIG_KVM_IOAPIC struct kvm_pic *vpic; struct kvm_ioapic *vioapic; @@ -1856,7 +1858,7 @@ struct kvm_x86_ops { int (*pi_update_irte)(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, struct kvm_vcpu *vcpu, u32 vector); - void (*pi_start_assignment)(struct kvm *kvm); + void (*pi_start_bypass)(struct kvm *kvm); void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index f87f63a53ede..3e8ee23b16c2 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -570,10 +570,15 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, spin_lock_irq(&kvm->irqfds.lock); irqfd->producer = prod; + if (!kvm->arch.nr_possible_bypass_irqs++) + kvm_x86_call(pi_start_bypass)(kvm); + if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) { ret = kvm_pi_update_irte(irqfd, &irqfd->irq_entry); - if (ret) + if (ret) { + kvm->arch.nr_possible_bypass_irqs--; kvm_arch_end_assignment(irqfd->kvm); + } } spin_unlock_irq(&kvm->irqfds.lock); @@ -606,6 +611,8 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, } irqfd->producer = NULL; + kvm->arch.nr_possible_bypass_irqs--; + spin_unlock_irq(&kvm->irqfds.lock); diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index d1e02e567b57..a986fc45145e 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -1014,7 +1014,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .nested_ops = &vmx_nested_ops, .pi_update_irte = vmx_pi_update_irte, - .pi_start_assignment = vmx_pi_start_assignment, + .pi_start_bypass = vmx_pi_start_bypass, #ifdef CONFIG_X86_64 .set_hv_timer = vt_op(set_hv_timer), diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 3a23c30f73cb..5671d59a6b6d 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -146,8 +146,13 @@ after_clear_sn: static bool vmx_can_use_vtd_pi(struct kvm *kvm) { + /* + * Note, reading the number of possible 
bypass IRQs can race with a + * bypass IRQ being attached to the VM. vmx_pi_start_bypass() ensures + * blocking vCPUs will see an elevated count or get KVM_REQ_UNBLOCK. */ return irqchip_in_kernel(kvm) && kvm_arch_has_irq_bypass() && - kvm_arch_has_assigned_device(kvm); + READ_ONCE(kvm->arch.nr_possible_bypass_irqs); } /* @@ -285,12 +290,11 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) /* - * Bail out of the block loop if the VM has an assigned - * device, but the blocking vCPU didn't reconfigure the - * PI.NV to the wakeup vector, i.e. the assigned device - * came along after the initial check in vmx_vcpu_pi_put(). + * Kick all vCPUs when the first possible bypass IRQ is attached to a VM, as + * blocking vCPUs may be scheduled out without reconfiguring PID.NV to the wakeup + * vector, i.e. if the bypass IRQ came along after vmx_vcpu_pi_put(). */ -void vmx_pi_start_assignment(struct kvm *kvm) +void vmx_pi_start_bypass(struct kvm *kvm) { if (!kvm_arch_has_irq_bypass()) return; diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 94ed66ea6249..a4af39948cf0 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -17,7 +17,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, struct kvm_vcpu *vcpu, u32 vector); -void vmx_pi_start_assignment(struct kvm *kvm); +void vmx_pi_start_bypass(struct kvm *kvm); static inline int pi_find_highest_vector(struct pi_desc *pi_desc) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5ebe5a384fd9..1508b77622b9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13440,8 +13440,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) void kvm_arch_start_assignment(struct kvm *kvm) { - if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1) - kvm_x86_call(pi_start_assignment)(kvm); + atomic_inc(&kvm->arch.assigned_device_count); } EXPORT_SYMBOL_GPL(kvm_arch_start_assignment); From ce9d54f41be03555f1b57cf9dc2a50c45f9f712e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:57 -0700 Subject: [PATCH 079/101] KVM: VMX: WARN if VT-d Posted IRQs aren't possible when starting IRQ bypass WARN if KVM attempts to "start" IRQ bypass when VT-d Posted IRQs are disabled, to make it obvious that the logic is a sanity check, and so that a bug related to nr_possible_bypass_irqs is more likely to cause noisy failures, e.g. so that KVM doesn't silently fail to wake blocking vCPUs.
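The count-vs-kick handshake the WARN is guarding can be modeled loosely in user space, with C11 atomics standing in for irqfds.lock, READ_ONCE() and KVM_REQ_UNBLOCK (illustrative only, not the kernel's actual primitives):

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_ulong nr_possible_bypass_irqs;
    static atomic_bool kicked;      /* models a pending KVM_REQ_UNBLOCK */

    static void attach_bypass_irq(void)             /* writer side */
    {
            if (atomic_fetch_add(&nr_possible_bypass_irqs, 1) == 0)
                    atomic_store(&kicked, true);    /* kick all vCPUs */
    }

    static bool blocking_vcpu_notices(void)         /* lockless reader */
    {
            /* Sees the elevated count, or the kick, never neither. */
            return atomic_load(&nr_possible_bypass_irqs) ||
                   atomic_load(&kicked);
    }

Reaching vmx_pi_start_bypass() with posting impossible implies the counter bookkeeping is broken; warning loudly beats silently skipping the kick half of the handshake and leaving an already-blocking vCPU asleep.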
Link: https://lore.kernel.org/r/20250611224604.313496-56-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/posted_intr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 5671d59a6b6d..4a6d9a17da23 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -296,7 +296,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) */ void vmx_pi_start_bypass(struct kvm *kvm) { - if (!kvm_arch_has_irq_bypass()) + if (WARN_ON_ONCE(!vmx_can_use_vtd_pi(kvm))) return; kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK); From 11a60455d4c9d50f07a36efefd5e487989b99b8b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:58 -0700 Subject: [PATCH 080/101] KVM: SVM: Use vcpu_idx, not vcpu_id, for GA log tag/metadata Use a vCPU's index, not its ID, for the GA log tag/metadata that's used to find and kick vCPUs when a device posted interrupt serves as a wake event. Lookups on a vCPU index are O(fast) (not sure what xa_load() actually provides), whereas a vCPU ID lookup is O(n) if a vCPU's ID doesn't match its index. Unlike the Physical APIC Table, which is accessed by hardware when virtualizing IPIs, hardware doesn't consume the GA tag, i.e. KVM _must_ use APIC IDs to fill the Physical APIC Table, but KVM has free rein over the format/meaning of the GA tag. Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-57-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 4b359a4a7e40..808700ce70c3 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -30,36 +30,39 @@ #include "svm.h" /* - * Encode the arbitrary VM ID and the vCPU's default APIC ID, i.e the vCPU ID, - * into the GATag so that KVM can retrieve the correct vCPU from a GALog entry - * if an interrupt can't be delivered, e.g. because the vCPU isn't running. + * Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that + * KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't + * be delivered, e.g. because the vCPU isn't running. Use the vCPU's index + * instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast + * lookup on the index, whereas vCPUs whose index doesn't match their ID need + * to walk the entire xarray of vCPUs in the worst case scenario. * - * For the vCPU ID, use however many bits are currently allowed for the max + * For the vCPU index, use however many bits are currently allowed for the max * guest physical APIC ID (limited by the size of the physical ID table), and * use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the * size of the GATag is defined by hardware (32 bits), but is an opaque value * as far as hardware is concerned.
*/ -#define AVIC_VCPU_ID_MASK AVIC_PHYSICAL_MAX_INDEX_MASK +#define AVIC_VCPU_IDX_MASK AVIC_PHYSICAL_MAX_INDEX_MASK #define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK) #define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT) #define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK) -#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) +#define AVIC_GATAG_TO_VCPUIDX(x) (x & AVIC_VCPU_IDX_MASK) -#define __AVIC_GATAG(vm_id, vcpu_id) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \ - ((vcpu_id) & AVIC_VCPU_ID_MASK)) -#define AVIC_GATAG(vm_id, vcpu_id) \ +#define __AVIC_GATAG(vm_id, vcpu_idx) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \ + ((vcpu_idx) & AVIC_VCPU_IDX_MASK)) +#define AVIC_GATAG(vm_id, vcpu_idx) \ ({ \ - u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_id); \ + u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx); \ \ - WARN_ON_ONCE(AVIC_GATAG_TO_VCPUID(ga_tag) != (vcpu_id)); \ + WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx)); \ WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id)); \ ga_tag; \ }) -static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u); +static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u); static bool force_avic; module_param_unsafe(force_avic, bool, 0444); @@ -140,16 +143,16 @@ int avic_ga_log_notifier(u32 ga_tag) struct kvm_svm *kvm_svm; struct kvm_vcpu *vcpu = NULL; u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag); - u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag); + u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag); - pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id); - trace_kvm_avic_ga_log(vm_id, vcpu_id); + pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx); + trace_kvm_avic_ga_log(vm_id, vcpu_idx); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) { if (kvm_svm->avic_vm_id != vm_id) continue; - vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id); + vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx); break; } spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); @@ -786,7 +789,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, */ struct amd_iommu_pi_data pi_data = { .ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, - vcpu->vcpu_id), + vcpu->vcpu_idx), .is_guest_mode = kvm_vcpu_apicv_active(vcpu), .vapic_addr = avic_get_backing_page_address(to_svm(vcpu)), .vector = vector, From a23480fe21de60b3e191faa5b0a5ddd24c5e38d6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:59 -0700 Subject: [PATCH 081/101] iommu/amd: WARN if KVM calls GA IRTE helpers without virtual APIC support WARN if KVM attempts to update IRTE entries when virtual APIC isn't fully supported, as KVM should guard all such calls on IRQ posting being enabled. 
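The adopted guard structure separates caller bugs from benign no-ops; a condensed, compilable model follows, with illustrative names and -22 standing in for -EINVAL:

    #include <stdbool.h>
    #include <stdio.h>

    static int guarded_update(bool vapic_enabled, bool have_guest_mode_entry)
    {
            /* Calling without vAPIC support is a caller bug: scream. */
            if (!vapic_enabled) {
                    fprintf(stderr, "WARN: GA update without vAPIC\n");
                    return -22;
            }

            /* No entry, or not in guest mode: legitimate nothing-to-do. */
            if (!have_guest_mode_entry)
                    return 0;

            return 0;       /* ...perform the actual IRTE update... */
    }

    int main(void)
    {
            guarded_update(false, true);            /* warns, returns -22 */
            return guarded_update(true, false);     /* quiet success */
    }

Keeping the two conditions separate is what lets the helpers return -EINVAL for the former while preserving the quiet-success behavior of the latter.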
Link: https://lore.kernel.org/r/20250611224604.313496-58-seanjc@google.com Signed-off-by: Sean Christopherson --- drivers/iommu/amd/iommu.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 6149aece0eac..4b0cc249771f 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3836,8 +3836,10 @@ int amd_iommu_update_ga(int cpu, void *data) struct amd_ir_data *ir_data = (struct amd_ir_data *)data; struct irte_ga *entry = (struct irte_ga *) ir_data->entry; - if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || - !entry || !entry->lo.fields_vapic.guest_mode) + if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) + return -EINVAL; + + if (!entry || !entry->lo.fields_vapic.guest_mode) return 0; if (!ir_data->iommu) @@ -3856,7 +3858,10 @@ int amd_iommu_activate_guest_mode(void *data, int cpu) struct irte_ga *entry = (struct irte_ga *) ir_data->entry; u64 valid; - if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || !entry) + if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) + return -EINVAL; + + if (!entry) return 0; valid = entry->lo.fields_vapic.valid; @@ -3885,8 +3890,10 @@ int amd_iommu_deactivate_guest_mode(void *data) struct irq_cfg *cfg = ir_data->cfg; u64 valid; - if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || - !entry || !entry->lo.fields_vapic.guest_mode) + if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) + return -EINVAL; + + if (!entry || !entry->lo.fields_vapic.guest_mode) return 0; valid = entry->lo.fields_remap.valid; From f2bc961d383bbc26c72f77e3f452da2f9f44dc0d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:46:00 -0700 Subject: [PATCH 082/101] KVM: SVM: Fold avic_set_pi_irte_mode() into its sole caller Fold avic_set_pi_irte_mode() into avic_refresh_apicv_exec_ctrl() in anticipation of moving the __avic_vcpu_{load,put}() calls into the critical section, and because having a one-off helper with a name that's easily confused with avic_pi_update_irte() is unnecessary. No functional change intended. Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-59-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 52 ++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 808700ce70c3..3a59c5e77a4f 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -729,34 +729,6 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } -static void avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) -{ - int apic_id = kvm_cpu_get_apicid(vcpu->cpu); - unsigned long flags; - struct vcpu_svm *svm = to_svm(vcpu); - struct kvm_kernel_irqfd *irqfd; - - /* - * Here, we go through the per-vcpu ir_list to update all existing - * interrupt remapping table entry targeting this vcpu. 
- */ - spin_lock_irqsave(&svm->ir_list_lock, flags); - - if (list_empty(&svm->ir_list)) - goto out; - - list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) { - void *data = irqfd->irq_bypass_data; - - if (activate) - WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, apic_id)); - else - WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data)); - } -out: - spin_unlock_irqrestore(&svm->ir_list_lock, flags); -} - static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd) { struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu; @@ -991,6 +963,10 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu) void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { bool activated = kvm_vcpu_apicv_active(vcpu); + int apic_id = kvm_cpu_get_apicid(vcpu->cpu); + struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_kernel_irqfd *irqfd; + unsigned long flags; if (!enable_apicv) return; @@ -1002,7 +978,25 @@ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) else avic_vcpu_put(vcpu); - avic_set_pi_irte_mode(vcpu, activated); + /* + * Here, we go through the per-vcpu ir_list to update all existing + * interrupt remapping table entry targeting this vcpu. + */ + spin_lock_irqsave(&svm->ir_list_lock, flags); + + if (list_empty(&svm->ir_list)) + goto out; + + list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) { + void *data = irqfd->irq_bypass_data; + + if (activated) + WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, apic_id)); + else + WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data)); + } +out: + spin_unlock_irqrestore(&svm->ir_list_lock, flags); } void avic_vcpu_blocking(struct kvm_vcpu *vcpu) From 6eab2340f339cabb63079c94e5dbaea4d90007df Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:46:01 -0700 Subject: [PATCH 083/101] KVM: SVM: Don't check vCPU's blocking status when toggling AVIC on/off Don't query a vCPU's blocking status when toggling AVIC on/off; barring KVM bugs, the vCPU can't be blocking when refreshing AVIC controls. And if there are KVM bugs, ensuring the vCPU and its associated IRTEs are in the correct state is desirable, i.e. well worth any overhead in a buggy scenario. Isolating the "real" load/put flows will allow moving the IOMMU IRTE (de)activation logic from avic_refresh_apicv_exec_ctrl() to avic_update_iommu_vcpu_affinity(), i.e. will allow updating the vCPU's physical ID entry and its IRTEs in a common path, under a single critical section of ir_list_lock. Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-60-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 65 +++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 3a59c5e77a4f..1a9862c7106e 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -829,7 +829,7 @@ static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu) WARN_ON_ONCE(amd_iommu_update_ga(cpu, irqfd->irq_bypass_data)); } -void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); int h_physical_id = kvm_cpu_get_apicid(cpu); @@ -845,16 +845,6 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE)) return; - /* - * No need to update anything if the vCPU is blocking, i.e. if the vCPU - * is being scheduled in after being preempted. 
The CPU entries in the - * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'. - * If the vCPU was migrated, its new CPU value will be stuffed when the - * vCPU unblocks. - */ - if (kvm_vcpu_is_blocking(vcpu)) - return; - /* * Grab the per-vCPU interrupt remapping lock even if the VM doesn't * _currently_ have assigned devices, as that can change. Holding @@ -889,31 +879,33 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) spin_unlock_irqrestore(&svm->ir_list_lock, flags); } -void avic_vcpu_put(struct kvm_vcpu *vcpu) +void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + /* + * No need to update anything if the vCPU is blocking, i.e. if the vCPU + * is being scheduled in after being preempted. The CPU entries in the + * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'. + * If the vCPU was migrated, its new CPU value will be stuffed when the + * vCPU unblocks. + */ + if (kvm_vcpu_is_blocking(vcpu)) + return; + + __avic_vcpu_load(vcpu, cpu); +} + +static void __avic_vcpu_put(struct kvm_vcpu *vcpu) { struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); struct vcpu_svm *svm = to_svm(vcpu); unsigned long flags; - u64 entry; + u64 entry = svm->avic_physical_id_entry; lockdep_assert_preemption_disabled(); if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE)) return; - /* - * Note, reading the Physical ID entry outside of ir_list_lock is safe - * as only the pCPU that has loaded (or is loading) the vCPU is allowed - * to modify the entry, and preemption is disabled. I.e. the vCPU - * can't be scheduled out and thus avic_vcpu_{put,load}() can't run - * recursively. - */ - entry = svm->avic_physical_id_entry; - - /* Nothing to do if IsRunning == '0' due to vCPU blocking. */ - if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) - return; - /* * Take and hold the per-vCPU interrupt remapping lock while updating * the Physical ID entry even though the lock doesn't protect against @@ -933,7 +925,24 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry); spin_unlock_irqrestore(&svm->ir_list_lock, flags); +} +void avic_vcpu_put(struct kvm_vcpu *vcpu) +{ + /* + * Note, reading the Physical ID entry outside of ir_list_lock is safe + * as only the pCPU that has loaded (or is loading) the vCPU is allowed + * to modify the entry, and preemption is disabled. I.e. the vCPU + * can't be scheduled out and thus avic_vcpu_{put,load}() can't run + * recursively. + */ + u64 entry = to_svm(vcpu)->avic_physical_id_entry; + + /* Nothing to do if IsRunning == '0' due to vCPU blocking. */ + if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) + return; + + __avic_vcpu_put(vcpu); } void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu) @@ -974,9 +983,9 @@ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) avic_refresh_virtual_apic_mode(vcpu); if (activated) - avic_vcpu_load(vcpu, vcpu->cpu); + __avic_vcpu_load(vcpu, vcpu->cpu); else - avic_vcpu_put(vcpu); + __avic_vcpu_put(vcpu); /* * Here, we go through the per-vcpu ir_list to update all existing From 5f3d06b1648e0878f04e3d4819f2ef16c1b40bbf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:46:02 -0700 Subject: [PATCH 084/101] KVM: SVM: Consolidate IRTE update when toggling AVIC on/off Fold the IRTE modification logic in avic_refresh_apicv_exec_ctrl() into __avic_vcpu_{load,put}(), and add a param to the helpers to communicate whether or not AVIC is being toggled, i.e. 
if IRTE needs a "full" update, or just a quick update to set the CPU and IsRun. Link: https://lore.kernel.org/r/20250611224604.313496-61-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 85 ++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 39 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 1a9862c7106e..5803f778999f 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -811,7 +811,28 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, return irq_set_vcpu_affinity(host_irq, NULL); } -static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu) +enum avic_vcpu_action { + /* + * There is no need to differentiate between activate and deactivate, + * as KVM only refreshes AVIC state when the vCPU is scheduled in and + * isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is + * being (de)activated. + */ + AVIC_TOGGLE_ON_OFF = BIT(0), + AVIC_ACTIVATE = AVIC_TOGGLE_ON_OFF, + AVIC_DEACTIVATE = AVIC_TOGGLE_ON_OFF, + + /* + * No unique action is required to deal with a vCPU that stops/starts + * running, as IRTEs are configured to generate GALog interrupts at all + * times. + */ + AVIC_START_RUNNING = 0, + AVIC_STOP_RUNNING = 0, +}; + +static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, + enum avic_vcpu_action action) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_kernel_irqfd *irqfd; @@ -825,11 +846,20 @@ static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu) if (list_empty(&svm->ir_list)) return; - list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) - WARN_ON_ONCE(amd_iommu_update_ga(cpu, irqfd->irq_bypass_data)); + list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) { + void *data = irqfd->irq_bypass_data; + + if (!(action & AVIC_TOGGLE_ON_OFF)) + WARN_ON_ONCE(amd_iommu_update_ga(cpu, data)); + else if (cpu >= 0) + WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu)); + else + WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data)); + } } -static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu, + enum avic_vcpu_action action) { struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); int h_physical_id = kvm_cpu_get_apicid(cpu); @@ -874,7 +904,7 @@ static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry); - avic_update_iommu_vcpu_affinity(vcpu, h_physical_id); + avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action); spin_unlock_irqrestore(&svm->ir_list_lock, flags); } @@ -891,10 +921,10 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (kvm_vcpu_is_blocking(vcpu)) return; - __avic_vcpu_load(vcpu, cpu); + __avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING); } -static void __avic_vcpu_put(struct kvm_vcpu *vcpu) +static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action) { struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); struct vcpu_svm *svm = to_svm(vcpu); @@ -916,7 +946,7 @@ static void __avic_vcpu_put(struct kvm_vcpu *vcpu) */ spin_lock_irqsave(&svm->ir_list_lock, flags); - avic_update_iommu_vcpu_affinity(vcpu, -1); + avic_update_iommu_vcpu_affinity(vcpu, -1, action); entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; svm->avic_physical_id_entry = entry; @@ -942,7 +972,7 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) return; - __avic_vcpu_put(vcpu); + __avic_vcpu_put(vcpu, 
AVIC_STOP_RUNNING); } void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu) @@ -971,41 +1001,18 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu) void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { - bool activated = kvm_vcpu_apicv_active(vcpu); - int apic_id = kvm_cpu_get_apicid(vcpu->cpu); - struct vcpu_svm *svm = to_svm(vcpu); - struct kvm_kernel_irqfd *irqfd; - unsigned long flags; - if (!enable_apicv) return; + /* APICv should only be toggled on/off while the vCPU is running. */ + WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu)); + avic_refresh_virtual_apic_mode(vcpu); - if (activated) - __avic_vcpu_load(vcpu, vcpu->cpu); + if (kvm_vcpu_apicv_active(vcpu)) + __avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE); else - __avic_vcpu_put(vcpu); - - /* - * Here, we go through the per-vcpu ir_list to update all existing - * interrupt remapping table entry targeting this vcpu. - */ - spin_lock_irqsave(&svm->ir_list_lock, flags); - - if (list_empty(&svm->ir_list)) - goto out; - - list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) { - void *data = irqfd->irq_bypass_data; - - if (activated) - WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, apic_id)); - else - WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data)); - } -out: - spin_unlock_irqrestore(&svm->ir_list_lock, flags); + __avic_vcpu_put(vcpu, AVIC_DEACTIVATE); } void avic_vcpu_blocking(struct kvm_vcpu *vcpu) @@ -1031,7 +1038,7 @@ void avic_vcpu_blocking(struct kvm_vcpu *vcpu) * CPU and cause noisy neighbor problems if the VM is sending interrupts * to the vCPU while it's scheduled out. */ - avic_vcpu_put(vcpu); + __avic_vcpu_put(vcpu, AVIC_STOP_RUNNING); } void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) From b9e53f9ff4a88f01b22524878c9a381a6c5f65ff Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:46:03 -0700 Subject: [PATCH 085/101] iommu/amd: KVM: SVM: Allow KVM to control need for GA log interrupts Add plumbing to the AMD IOMMU driver to allow KVM to control whether or not an IRTE is configured to generate GA log interrupts. KVM only needs a notification if the target vCPU is blocking, so the vCPU can be awakened. If a vCPU is preempted or exits to userspace, KVM clears is_run, but will set the vCPU back to running when userspace does KVM_RUN and/or the vCPU task is scheduled back in, i.e. KVM doesn't need a notification. Unconditionally pass "true" in all KVM paths to isolate the IOMMU changes from the KVM changes insofar as possible. Opportunistically swap the ordering of parameters for amd_iommu_update_ga() so that they match amd_iommu_activate_guest_mode(). Note, as of this writing, the AMD IOMMU manual doesn't list GALogIntr as a non-cached field, but per AMD hardware architects, it's not cached and can be safely updated without an invalidation.
Link: https://lore.kernel.org/all/b29b8c22-2fd4-4b5e-b755-9198874157c7@amd.com Cc: Vasant Hegde Cc: Joao Martins Link: https://lore.kernel.org/r/20250611224604.313496-62-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/irq_remapping.h | 1 + arch/x86/kvm/svm/avic.c | 10 ++++++---- drivers/iommu/amd/iommu.c | 28 +++++++++++++++++----------- include/linux/amd-iommu.h | 9 ++++----- 4 files changed, 28 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 4c75a17632f6..5a0d42464d44 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -36,6 +36,7 @@ struct amd_iommu_pi_data { u32 ga_tag; u32 vector; /* Guest vector of the interrupt */ int cpu; + bool ga_log_intr; bool is_guest_mode; void *ir_data; }; diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 5803f778999f..02f266901bfe 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -785,10 +785,12 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, * is awakened and/or scheduled in. See also avic_vcpu_load(). */ entry = svm->avic_physical_id_entry; - if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) + if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) { pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; - else + } else { pi_data.cpu = -1; + pi_data.ga_log_intr = true; + } ret = irq_set_vcpu_affinity(host_irq, &pi_data); if (ret) @@ -850,9 +852,9 @@ static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, void *data = irqfd->irq_bypass_data; if (!(action & AVIC_TOGGLE_ON_OFF)) - WARN_ON_ONCE(amd_iommu_update_ga(cpu, data)); + WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, true)); else if (cpu >= 0) - WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu)); + WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, true)); else WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data)); } diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 4b0cc249771f..c50d4a8a51be 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3804,7 +3804,8 @@ static const struct irq_domain_ops amd_ir_domain_ops = { .deactivate = irq_remapping_deactivate, }; -static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu) +static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu, + bool ga_log_intr) { if (cpu >= 0) { entry->lo.fields_vapic.destination = @@ -3812,8 +3813,10 @@ static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu) entry->hi.fields.destination = APICID_TO_IRTE_DEST_HI(cpu); entry->lo.fields_vapic.is_run = true; + entry->lo.fields_vapic.ga_log_intr = false; } else { entry->lo.fields_vapic.is_run = false; + entry->lo.fields_vapic.ga_log_intr = ga_log_intr; } } @@ -3822,16 +3825,19 @@ static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu) * a vCPU, without issuing an IOMMU invalidation for the IRTE. * * If the vCPU is associated with a pCPU (@cpu >= 0), configure the Destination - * with the pCPU's APIC ID and set IsRun, else clear IsRun. I.e. treat vCPUs - * that are associated with a pCPU as running. This API is intended to be used - * when a vCPU is scheduled in/out (or stops running for any reason), to do a - * fast update of IsRun and (conditionally) Destination. + * with the pCPU's APIC ID, set IsRun, and clear GALogIntr. If the vCPU isn't + * associated with a pCPU (@cpu < 0), clear IsRun and set/clear GALogIntr based + * on input from the caller (e.g. 
KVM only requests GALogIntr when the vCPU is + * blocking and requires a notification wake event). I.e. treat vCPUs that are + * associated with a pCPU as running. This API is intended to be used when a + * vCPU is scheduled in/out (or stops running for any reason), to do a fast + * update of IsRun, GALogIntr, and (conditionally) Destination. * * Per the IOMMU spec, the Destination, IsRun, and GATag fields are not cached * and thus don't require an invalidation to ensure the IOMMU consumes fresh * information. */ -int amd_iommu_update_ga(int cpu, void *data) +int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr) { struct amd_ir_data *ir_data = (struct amd_ir_data *)data; struct irte_ga *entry = (struct irte_ga *) ir_data->entry; @@ -3845,14 +3851,14 @@ int amd_iommu_update_ga(int cpu, void *data) if (!ir_data->iommu) return -ENODEV; - __amd_iommu_update_ga(entry, cpu); + __amd_iommu_update_ga(entry, cpu, ga_log_intr); return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, ir_data->irq_2_irte.index, entry); } EXPORT_SYMBOL(amd_iommu_update_ga); -int amd_iommu_activate_guest_mode(void *data, int cpu) +int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr) { struct amd_ir_data *ir_data = (struct amd_ir_data *)data; struct irte_ga *entry = (struct irte_ga *) ir_data->entry; @@ -3871,12 +3877,11 @@ int amd_iommu_activate_guest_mode(void *data, int cpu) entry->lo.fields_vapic.valid = valid; entry->lo.fields_vapic.guest_mode = 1; - entry->lo.fields_vapic.ga_log_intr = 1; entry->hi.fields.ga_root_ptr = ir_data->ga_root_ptr; entry->hi.fields.vector = ir_data->ga_vector; entry->lo.fields_vapic.ga_tag = ir_data->ga_tag; - __amd_iommu_update_ga(entry, cpu); + __amd_iommu_update_ga(entry, cpu, ga_log_intr); return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, ir_data->irq_2_irte.index, entry); @@ -3947,7 +3952,8 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *info) ir_data->ga_vector = pi_data->vector; ir_data->ga_tag = pi_data->ga_tag; if (pi_data->is_guest_mode) - ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu); + ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu, + pi_data->ga_log_intr); else ret = amd_iommu_deactivate_guest_mode(ir_data); } else { diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index c9f2df0c4596..8cced632ecd0 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -30,9 +30,8 @@ static inline void amd_iommu_detect(void) { } /* IOMMU AVIC Function */ extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32)); -extern int amd_iommu_update_ga(int cpu, void *data); - -extern int amd_iommu_activate_guest_mode(void *data, int cpu); +extern int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr); +extern int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr); extern int amd_iommu_deactivate_guest_mode(void *data); #else /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */ @@ -43,12 +42,12 @@ amd_iommu_register_ga_log_notifier(int (*notifier)(u32)) return 0; } -static inline int amd_iommu_update_ga(int cpu, void *data) +static inline int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr) { return 0; } -static inline int amd_iommu_activate_guest_mode(void *data, int cpu) +static inline int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr) { return 0; } From b03500f03ea09f5fa31a408135c95c665542ff15 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:46:04 -0700 Subject: 
[PATCH 086/101] KVM: SVM: Generate GA log IRQs only if the associated vCPU is blocking Configure IRTEs to generate GA log interrupts for device posted IRQs that hit non-running vCPUs if and only if the target vCPU is blocking, i.e. actually needs a wake event. If the vCPU has exited to userspace or was preempted, generating GA log entries and interrupts is wasteful and unnecessary, as the vCPU will be re-loaded and/or scheduled back in irrespective of the GA log notification (avic_ga_log_notifier() is just a fancy wrapper for kvm_vcpu_wake_up()). Use a should-be-zero bit in the vCPU's Physical APIC ID Table Entry to track whether or not the vCPU's associated IRTEs are configured to generate GA logs, but only set the synthetic bit in KVM's "cache", i.e. never set the should-be-zero bit in tables that are used by hardware. Use a synthetic bit instead of a dedicated boolean to minimize the odds of messing up the locking, i.e. so that all the existing rules that apply to avic_physical_id_entry for IS_RUNNING are reused verbatim for GA_LOG_INTR. Note, because KVM (by design) "puts" AVIC state in a "pre-blocking" phase, using kvm_vcpu_is_blocking() to track the need for notifications isn't a viable option. Link: https://lore.kernel.org/r/20250611224604.313496-63-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/svm.h | 7 +++++ arch/x86/kvm/svm/avic.c | 63 ++++++++++++++++++++++++++++++-------- 2 files changed, 58 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 36f67c69ea66..ffc27f676243 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -252,6 +252,13 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31 #define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) +/* + * GA_LOG_INTR is a synthetic flag that's never propagated to hardware-visible + * tables. GA_LOG_INTR is set if the vCPU needs device posted IRQs to generate + * GA log interrupts to wake the vCPU (because it's blocking or about to block). + */ +#define AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR BIT_ULL(61) + #define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0) #define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK GENMASK_ULL(51, 12) #define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 02f266901bfe..a34c5c3b164e 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -789,7 +789,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; } else { pi_data.cpu = -1; - pi_data.ga_log_intr = true; + pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR; } ret = irq_set_vcpu_affinity(host_irq, &pi_data); @@ -826,16 +826,25 @@ enum avic_vcpu_action { /* * No unique action is required to deal with a vCPU that stops/starts - * running, as IRTEs are configured to generate GALog interrupts at all - * times. + * running. A vCPU that starts running by definition stops blocking as + * well, and a vCPU that stops running can't have been blocking, i.e. + * doesn't need to toggle GALogIntr. */ AVIC_START_RUNNING = 0, AVIC_STOP_RUNNING = 0, + + /* + * When a vCPU starts blocking, KVM needs to set the GALogIntr flag + * in all associated IRTEs so that KVM can wake the vCPU if an IRQ is + * sent to the vCPU.
+ */ + AVIC_START_BLOCKING = BIT(1), }; static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, enum avic_vcpu_action action) { + bool ga_log_intr = (action & AVIC_START_BLOCKING); struct vcpu_svm *svm = to_svm(vcpu); struct kvm_kernel_irqfd *irqfd; @@ -852,9 +861,9 @@ static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, void *data = irqfd->irq_bypass_data; if (!(action & AVIC_TOGGLE_ON_OFF)) - WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, true)); + WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr)); else if (cpu >= 0) - WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, true)); + WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr)); else WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data)); } @@ -889,7 +898,8 @@ static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu, entry = svm->avic_physical_id_entry; WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); - entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; + entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK | + AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR); entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; @@ -950,12 +960,26 @@ static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action) avic_update_iommu_vcpu_affinity(vcpu, -1, action); + WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR); + + /* + * Keep the previous APIC ID in the entry so that a rogue doorbell from + * hardware is at least restricted to a CPU associated with the vCPU. + */ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - svm->avic_physical_id_entry = entry; if (enable_ipiv) WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry); + /* + * Note! Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as + * it's a synthetic flag that usurps an unused should-be-zero bit. + */ + if (action & AVIC_START_BLOCKING) + entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR; + + svm->avic_physical_id_entry = entry; + spin_unlock_irqrestore(&svm->ir_list_lock, flags); } @@ -970,11 +994,26 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) */ u64 entry = to_svm(vcpu)->avic_physical_id_entry; - /* Nothing to do if IsRunning == '0' due to vCPU blocking. */ - if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) - return; + /* + * Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the + * vCPU is preempted while it's in the process of blocking. WARN if the + * vCPU wasn't running and isn't blocking, as KVM shouldn't attempt to + * put the AVIC if it wasn't previously loaded. + */ + if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) { + if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu))) + return; - __avic_vcpu_put(vcpu, AVIC_STOP_RUNNING); + /* + * The vCPU was preempted while blocking; ensure its IRTEs are + * configured to generate GA Log Interrupts. + */ + if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR)))) + return; + } + + __avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? AVIC_START_BLOCKING : AVIC_STOP_RUNNING); } void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu) @@ -1040,7 +1079,7 @@ void avic_vcpu_blocking(struct kvm_vcpu *vcpu) * CPU and cause noisy neighbor problems if the VM is sending interrupts * to the vCPU while it's scheduled out.
*/ - __avic_vcpu_put(vcpu, AVIC_STOP_RUNNING); + __avic_vcpu_put(vcpu, AVIC_START_BLOCKING); } void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) From 6f343724837b9a7d70ba8c83dcdc37f4768d4e55 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:46:05 -0700 Subject: [PATCH 087/101] KVM: x86: Rename kvm_set_msi_irq() => kvm_msi_to_lapic_irq() Rename kvm_set_msi_irq() to kvm_msi_to_lapic_irq() to better capture what it actually does, e.g. it's _really_ easy to conflate kvm_set_msi_irq() with kvm_set_msi(). Opportunistically delete the public declaration and export, as they are no longer used/needed. No functional change intended. Link: https://lore.kernel.org/r/20250611224604.313496-64-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 3 --- arch/x86/kvm/irq.c | 14 +++++++------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9a5d8e35f431..8a8efe9f23fb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2382,9 +2382,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); -void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, - struct kvm_lapic_irq *irq); - static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) { /* We can only post Fixed and LowPrio IRQs */ diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 3e8ee23b16c2..4c3f80e8b13f 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -252,8 +252,9 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, return r; } -void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, - struct kvm_lapic_irq *irq) +static void kvm_msi_to_lapic_irq(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, + struct kvm_lapic_irq *irq) { struct msi_msg msg = { .address_lo = e->msi.address_lo, .address_hi = e->msi.address_hi, @@ -271,7 +272,6 @@ void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, irq->level = 1; irq->shorthand = APIC_DEST_NOSHORT; } -EXPORT_SYMBOL_GPL(kvm_set_msi_irq); static inline bool kvm_msi_route_invalid(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e) @@ -290,7 +290,7 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, if (!level) return -1; - kvm_set_msi_irq(kvm, e, &irq); + kvm_msi_to_lapic_irq(kvm, e, &irq); return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL); } @@ -313,7 +313,7 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, if (kvm_msi_route_invalid(kvm, e)) return -EINVAL; - kvm_set_msi_irq(kvm, e, &irq); + kvm_msi_to_lapic_irq(kvm, e, &irq); if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) return r; @@ -486,7 +486,7 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, if (entry->type != KVM_IRQ_ROUTING_MSI) continue; - kvm_set_msi_irq(vcpu->kvm, entry, &irq); + kvm_msi_to_lapic_irq(vcpu->kvm, entry, &irq); if (!irq.trig_mode) continue; @@ -521,7 +521,7 @@ static int kvm_pi_update_irte(struct kvm_kernel_irqfd *irqfd, return -EINVAL; if (entry && entry->type == KVM_IRQ_ROUTING_MSI) { - kvm_set_msi_irq(kvm, entry, &irq); + kvm_msi_to_lapic_irq(kvm, entry, &irq); /* * Force remapped mode if hardware doesn't support posting the From 283ed5001d6852f85c09ed2522331b2b197ba937 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:11 -0700 Subject: [PATCH 088/101] KVM: 
Use a local struct to do the initial vfs_poll() on an irqfd Use a function-local struct for the poll_table passed to vfs_poll(), as nothing in the vfs_poll() callchain grabs a long-term reference to the structure, i.e. its lifetime doesn't need to be tied to the irqfd. Using a local structure will also allow propagating failures out of the polling callback without further polluting kvm_kernel_irqfd. Opportunistically rename irqfd_ptable_queue_proc() to kvm_irqfd_register() to capture what it actually does. Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-2-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_irqfd.h | 1 - virt/kvm/eventfd.c | 26 +++++++++++++++++--------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h index 361c07f4466d..ef8c134ded8a 100644 --- a/include/linux/kvm_irqfd.h +++ b/include/linux/kvm_irqfd.h @@ -55,7 +55,6 @@ struct kvm_kernel_irqfd { /* Used for setup/shutdown */ struct eventfd_ctx *eventfd; struct list_head list; - poll_table pt; struct work_struct shutdown; struct irq_bypass_consumer consumer; struct irq_bypass_producer *producer; diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 59b1e64697f1..0b655376734e 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -245,12 +245,17 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) return ret; } -static void -irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, - poll_table *pt) +struct kvm_irqfd_pt { + struct kvm_kernel_irqfd *irqfd; + poll_table pt; +}; + +static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, + poll_table *pt) { - struct kvm_kernel_irqfd *irqfd = - container_of(pt, struct kvm_kernel_irqfd, pt); + struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt); + struct kvm_kernel_irqfd *irqfd = p->irqfd; + add_wait_queue_priority(wqh, &irqfd->wait); } @@ -298,6 +303,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) { struct kvm_kernel_irqfd *irqfd, *tmp; struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; + struct kvm_irqfd_pt irqfd_pt; int ret; __poll_t events; int idx; @@ -387,7 +393,6 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) * a callback whenever someone signals the underlying eventfd */ init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); - init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc); spin_lock_irq(&kvm->irqfds.lock); @@ -409,11 +414,14 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) spin_unlock_irq(&kvm->irqfds.lock); /* - * Check if there was an event already pending on the eventfd - * before we registered, and trigger it as if we didn't miss it. + * Register the irqfd with the eventfd by polling on the eventfd. If + * there was an event pending on the eventfd prior to registering, + * manually trigger IRQ injection.
*/ - events = vfs_poll(fd_file(f), &irqfd->pt); + irqfd_pt.irqfd = irqfd; + init_poll_funcptr(&irqfd_pt.pt, kvm_irqfd_register); + events = vfs_poll(fd_file(f), &irqfd_pt.pt); if (events & EPOLLIN) schedule_work(&irqfd->inject); From 140768a7bf03df2a746cdbd4b6dc938d80caad8d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:12 -0700 Subject: [PATCH 089/101] KVM: Acquire SRCU lock outside of irqfds.lock during assignment Acquire SRCU outside of irqfds.lock so that the locking is symmetrical, and add a comment explaining why on earth KVM holds SRCU for so long. Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-3-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 0b655376734e..614f81cd37c1 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -394,6 +394,18 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) */ init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); + /* + * Set the irqfd routing and add it to KVM's list before registering + * the irqfd with the eventfd, so that the routing information is valid + * and stays valid, e.g. if there are GSI routing changes, prior to + * making the irqfd visible, i.e. before it might be signaled. + * + * Note, holding SRCU ensures a stable read of routing information, and + * also prevents irqfd_shutdown() from freeing the irqfd before it's + * fully initialized. + */ + idx = srcu_read_lock(&kvm->irq_srcu); + spin_lock_irq(&kvm->irqfds.lock); ret = 0; @@ -402,11 +414,9 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) continue; /* This fd is used for another irq already. */ ret = -EBUSY; - spin_unlock_irq(&kvm->irqfds.lock); - goto fail; + goto fail_duplicate; } - idx = srcu_read_lock(&kvm->irq_srcu); irqfd_update(kvm, irqfd); list_add_tail(&irqfd->list, &kvm->irqfds.items); @@ -441,6 +451,9 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) srcu_read_unlock(&kvm->irq_srcu, idx); return 0; +fail_duplicate: + spin_unlock_irq(&kvm->irqfds.lock); + srcu_read_unlock(&kvm->irq_srcu, idx); fail: if (irqfd->resampler) irqfd_resampler_shutdown(irqfd); From b5c543518ae9df8e99c63cd08a8b573f0141b31a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:13 -0700 Subject: [PATCH 090/101] KVM: Initialize irqfd waitqueue callback when adding to the queue Initialize the irqfd waitqueue callback immediately prior to inserting the irqfd into the eventfd's waitqueue. Pre-initializing the state in a completely different context is all kinds of confusing, and incorrectly suggests that the waitqueue function needs to be initialized prior to vfs_poll().
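To illustrate why the queueing callback is the natural initialization point: vfs_poll() invokes the poll_table's queueing function synchronously, so state initialized inside that function is guaranteed to be set up before the wait entry is linked into the waitqueue, i.e. before any wakeup can possibly fire. A minimal sketch of the pattern ("my_ctx", "my_register", and "my_wakeup" are illustrative stand-ins, not KVM code):

struct my_ctx {
	wait_queue_entry_t wait;
	poll_table pt;
};

static int my_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	/* Runs whenever the polled file signals readiness. */
	return 0;
}

static void my_register(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
{
	struct my_ctx *ctx = container_of(pt, struct my_ctx, pt);

	/* Invoked synchronously from vfs_poll(), before any possible wakeup. */
	init_waitqueue_func_entry(&ctx->wait, my_wakeup);
	add_wait_queue(wqh, &ctx->wait);
}

The caller then does init_poll_funcptr(&ctx->pt, my_register) followed by vfs_poll(file, &ctx->pt).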
Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-4-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 614f81cd37c1..7b7c5269cf18 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -256,6 +256,13 @@ static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt); struct kvm_kernel_irqfd *irqfd = p->irqfd; + /* + * Add the irqfd as a priority waiter on the eventfd, with a custom + * wake-up handler, so that KVM *and only KVM* is notified whenever the + * underlying eventfd is signaled. + */ + init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); + add_wait_queue_priority(wqh, &irqfd->wait); } @@ -388,12 +395,6 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) mutex_unlock(&kvm->irqfds.resampler_lock); } - /* - * Install our own custom wake-up handling so we are notified via - * a callback whenever someone signals the underlying eventfd - */ - init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); - /* * Set the irqfd routing and add it to KVM's list before registering * the irqfd with the eventfd, so that the routing information is valid From 5f8ca05ea99183ab2b69c7fd9617961211d194e7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:14 -0700 Subject: [PATCH 091/101] KVM: Add irqfd to KVM's list via the vfs_poll() callback Add the irqfd structure to KVM's list of irqfds in kvm_irqfd_register(), i.e. via the vfs_poll() callback. This will allow taking irqfds.lock across the entire registration sequence (add to waitqueue, add to list), and more importantly will allow inserting into KVM's list if and only if adding to the waitqueue succeeds (spoiler alert), without needing to juggle return codes in weird ways. Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-5-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 102 +++++++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 45 deletions(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 7b7c5269cf18..170a67aad983 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -245,34 +245,14 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) return ret; } -struct kvm_irqfd_pt { - struct kvm_kernel_irqfd *irqfd; - poll_table pt; -}; - -static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, - poll_table *pt) -{ - struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt); - struct kvm_kernel_irqfd *irqfd = p->irqfd; - - /* - * Add the irqfd as a priority waiter on the eventfd, with a custom - * wake-up handler, so that KVM *and only KVM* is notified whenever the - * underlying eventfd is signaled. 
- */ - init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); - - add_wait_queue_priority(wqh, &irqfd->wait); -} - -/* Must be called under irqfds.lock */ static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd) { struct kvm_kernel_irq_routing_entry *e; struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; int n_entries; + lockdep_assert_held(&kvm->irqfds.lock); + n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi); write_seqcount_begin(&irqfd->irq_entry_sc); @@ -286,6 +266,49 @@ static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd) write_seqcount_end(&irqfd->irq_entry_sc); } +struct kvm_irqfd_pt { + struct kvm_kernel_irqfd *irqfd; + struct kvm *kvm; + poll_table pt; + int ret; +}; + +static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, + poll_table *pt) +{ + struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt); + struct kvm_kernel_irqfd *irqfd = p->irqfd; + struct kvm_kernel_irqfd *tmp; + struct kvm *kvm = p->kvm; + + spin_lock_irq(&kvm->irqfds.lock); + + list_for_each_entry(tmp, &kvm->irqfds.items, list) { + if (irqfd->eventfd != tmp->eventfd) + continue; + /* This fd is used for another irq already. */ + p->ret = -EBUSY; + spin_unlock_irq(&kvm->irqfds.lock); + return; + } + + irqfd_update(kvm, irqfd); + + list_add_tail(&irqfd->list, &kvm->irqfds.items); + + spin_unlock_irq(&kvm->irqfds.lock); + + /* + * Add the irqfd as a priority waiter on the eventfd, with a custom + * wake-up handler, so that KVM *and only KVM* is notified whenever the + * underlying eventfd is signaled. + */ + init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); + + add_wait_queue_priority(wqh, &irqfd->wait); + p->ret = 0; +} + #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) void __attribute__((weak)) kvm_arch_irq_bypass_stop( struct irq_bypass_consumer *cons) @@ -308,7 +331,7 @@ void __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, static int kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) { - struct kvm_kernel_irqfd *irqfd, *tmp; + struct kvm_kernel_irqfd *irqfd; struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; struct kvm_irqfd_pt irqfd_pt; int ret; @@ -407,32 +430,22 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) */ idx = srcu_read_lock(&kvm->irq_srcu); - spin_lock_irq(&kvm->irqfds.lock); - - ret = 0; - list_for_each_entry(tmp, &kvm->irqfds.items, list) { - if (irqfd->eventfd != tmp->eventfd) - continue; - /* This fd is used for another irq already. */ - ret = -EBUSY; - goto fail_duplicate; - } - - irqfd_update(kvm, irqfd); - - list_add_tail(&irqfd->list, &kvm->irqfds.items); - - spin_unlock_irq(&kvm->irqfds.lock); - /* - * Register the irqfd with the eventfd by polling on the eventfd. If - * there was an event pending on the eventfd prior to registering, - * manually trigger IRQ injection. + * Register the irqfd with the eventfd by polling on the eventfd, and + * simultaneously add the irqfd to KVM's list. If there was an event + * pending on the eventfd prior to registering, manually trigger IRQ
*/ irqfd_pt.irqfd = irqfd; + irqfd_pt.kvm = kvm; init_poll_funcptr(&irqfd_pt.pt, kvm_irqfd_register); events = vfs_poll(fd_file(f), &irqfd_pt.pt); + + ret = irqfd_pt.ret; + if (ret) + goto fail_poll; + if (events & EPOLLIN) schedule_work(&irqfd->inject); @@ -452,8 +465,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) srcu_read_unlock(&kvm->irq_srcu, idx); return 0; -fail_duplicate: - spin_unlock_irq(&kvm->irqfds.lock); +fail_poll: srcu_read_unlock(&kvm->irq_srcu, idx); fail: if (irqfd->resampler) From 86e00cd162a727c0b847def89bbd787c20eb8f5d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:15 -0700 Subject: [PATCH 092/101] KVM: Add irqfd to eventfd's waitqueue while holding irqfds.lock Add an irqfd to its target eventfd's waitqueue while holding irqfds.lock, which is mildly terrifying but functionally safe. irqfds.lock is taken inside the waitqueue's lock, but if and only if the eventfd is being released, i.e. that path is mutually exclusive with registration as KVM holds a reference to the eventfd (and obviously must do so to avoid UAF). This will allow using the eventfd's waitqueue to enforce KVM's requirement that eventfd is assigned to at most one irqfd, without introducing races. Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-6-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 170a67aad983..a9a13f919de8 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -204,6 +204,11 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) int ret = 0; if (flags & EPOLLIN) { + /* + * WARNING: Do NOT take irqfds.lock in any path except EPOLLHUP, + * as KVM holds irqfds.lock when registering the irqfd with the + * eventfd. + */ u64 cnt; eventfd_ctx_do_read(irqfd->eventfd, &cnt); @@ -225,6 +230,11 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) /* The eventfd is closing, detach from KVM */ unsigned long iflags; + /* + * Taking irqfds.lock is safe here, as KVM holds a reference to + * the eventfd when registering the irqfd, i.e. this path can't + * be reached while kvm_irqfd_add() is running. + */ spin_lock_irqsave(&kvm->irqfds.lock, iflags); /* @@ -296,16 +306,21 @@ static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, list_add_tail(&irqfd->list, &kvm->irqfds.items); - spin_unlock_irq(&kvm->irqfds.lock); - /* * Add the irqfd as a priority waiter on the eventfd, with a custom * wake-up handler, so that KVM *and only KVM* is notified whenever the - * underlying eventfd is signaled. + * underlying eventfd is signaled. Temporarily lie to lockdep about + * holding irqfds.lock to avoid a false positive regarding potential + * deadlock with irqfd_wakeup() (see irqfd_wakeup() for details). 
*/ init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); + spin_release(&kvm->irqfds.lock.dep_map, _RET_IP_); add_wait_queue_priority(wqh, &irqfd->wait); + spin_acquire(&kvm->irqfds.lock.dep_map, 0, 0, _RET_IP_); + + spin_unlock_irq(&kvm->irqfds.lock); + p->ret = 0; } From 867347bb21e18551b48eb147c0b2d8635909c8b5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:16 -0700 Subject: [PATCH 093/101] sched/wait: Drop WQ_FLAG_EXCLUSIVE from add_wait_queue_priority() Drop the setting of WQ_FLAG_EXCLUSIVE from add_wait_queue_priority() and instead have callers manually add the flag prior to adding their structure to the queue. Blindly setting WQ_FLAG_EXCLUSIVE is flawed, as the nature of exclusive, priority waiters means that only the first waiter added will ever receive notifications. Pushing the flawed behavior to callers will allow fixing the problem one hypervisor at a time (KVM added the flawed API, and then KVM's code was copy+pasted nearly verbatim by Xen and Hyper-V), and will also allow for adding an API that provides true exclusivity, i.e. that guarantees at most one priority waiter is in the queue. Opportunistically add a comment in Hyper-V to call out the mess. Xen privcmd's irqfd_wakeup() doesn't actually operate in exclusive mode, i.e. can be "fixed" simply by dropping WQ_FLAG_EXCLUSIVE. And KVM is primed to switch to the aforementioned fully exclusive API, i.e. won't be carrying the flawed code for long. No functional change intended. Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-7-seanjc@google.com Signed-off-by: Sean Christopherson --- drivers/hv/mshv_eventfd.c | 8 ++++++++ drivers/xen/privcmd.c | 1 + kernel/sched/wait.c | 4 ++-- virt/kvm/eventfd.c | 1 + 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c index 8dd22be2ca0b..b348928871c2 100644 --- a/drivers/hv/mshv_eventfd.c +++ b/drivers/hv/mshv_eventfd.c @@ -368,6 +368,14 @@ static void mshv_irqfd_queue_proc(struct file *file, wait_queue_head_t *wqh, container_of(polltbl, struct mshv_irqfd, irqfd_polltbl); irqfd->irqfd_wqh = wqh; + + /* + * TODO: Ensure there isn't already an exclusive, priority waiter, e.g. + * that the irqfd isn't already bound to another partition. Only the + * first exclusive waiter encountered will be notified, and + * add_wait_queue_priority() doesn't enforce exclusivity.
+ */ + irqfd->irqfd_wait.flags |= WQ_FLAG_EXCLUSIVE; add_wait_queue_priority(wqh, &irqfd->irqfd_wait); } diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 13a10f3294a8..c08ec8a7d27c 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -957,6 +957,7 @@ irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt) struct privcmd_kernel_irqfd *kirqfd = container_of(pt, struct privcmd_kernel_irqfd, pt); + kirqfd->wait.flags |= WQ_FLAG_EXCLUSIVE; add_wait_queue_priority(wqh, &kirqfd->wait); } diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 51e38f5f4701..4ab3ab195277 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -40,7 +40,7 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_ { unsigned long flags; - wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY; + wq_entry->flags |= WQ_FLAG_PRIORITY; spin_lock_irqsave(&wq_head->lock, flags); __add_wait_queue(wq_head, wq_entry); spin_unlock_irqrestore(&wq_head->lock, flags); @@ -64,7 +64,7 @@ EXPORT_SYMBOL(remove_wait_queue); * the non-exclusive tasks. Normally, exclusive tasks will be at the end of * the list and any non-exclusive tasks will be woken first. A priority task * may be at the head of the list, and can consume the event without any other - * tasks being woken. + * tasks being woken if it's also an exclusive task. * * There are circumstances in which we can try to wake a task which has already * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index a9a13f919de8..f8c2486f95d5 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -316,6 +316,7 @@ static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); spin_release(&kvm->irqfds.lock.dep_map, _RET_IP_); + irqfd->wait.flags |= WQ_FLAG_EXCLUSIVE; add_wait_queue_priority(wqh, &irqfd->wait); spin_acquire(&kvm->irqfds.lock.dep_map, 0, 0, _RET_IP_); From a52664134a24f3b0c425781082ce993617fc2797 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:17 -0700 Subject: [PATCH 094/101] xen: privcmd: Don't mark eventfd waiter as EXCLUSIVE Don't set WQ_FLAG_EXCLUSIVE when adding an irqfd to a wait queue, as irqfd_wakeup() unconditionally returns '0', i.e. doesn't actually operate in exclusive mode. Note, the use of WQ_FLAG_PRIORITY is also dubious, but that's a problem for another day. Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-8-seanjc@google.com Signed-off-by: Sean Christopherson --- drivers/xen/privcmd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index c08ec8a7d27c..13a10f3294a8 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -957,7 +957,6 @@ irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt) struct privcmd_kernel_irqfd *kirqfd = container_of(pt, struct privcmd_kernel_irqfd, pt); - kirqfd->wait.flags |= WQ_FLAG_EXCLUSIVE; add_wait_queue_priority(wqh, &kirqfd->wait); } From 0d09582b3a607436fd91d6ce813048a048ecbf10 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:18 -0700 Subject: [PATCH 095/101] sched/wait: Add a waitqueue helper for fully exclusive priority waiters Add a waitqueue helper to add a priority waiter that requires exclusive wakeups, i.e. that requires that it be the _only_ priority waiter. 
The API will be used by KVM to ensure that at most one of KVM's irqfds is bound to a single eventfd (across the entire kernel). Open code the helper instead of using __add_wait_queue() so that the common path doesn't need to "handle" impossible failures. Cc: K Prateek Nayak Reviewed-by: K Prateek Nayak Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-9-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/wait.h | 2 ++ kernel/sched/wait.c | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/include/linux/wait.h b/include/linux/wait.h index 965a19809c7e..09855d819418 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -164,6 +164,8 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head) extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); +extern int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head, + struct wait_queue_entry *wq_entry); extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 4ab3ab195277..15632c89c4f2 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -47,6 +47,24 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_ } EXPORT_SYMBOL_GPL(add_wait_queue_priority); +int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head, + struct wait_queue_entry *wq_entry) +{ + struct list_head *head = &wq_head->head; + + wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY; + + guard(spinlock_irqsave)(&wq_head->lock); + + if (!list_empty(head) && + (list_first_entry(head, typeof(*wq_entry), entry)->flags & WQ_FLAG_PRIORITY)) + return -EBUSY; + + list_add(&wq_entry->entry, head); + return 0; +} +EXPORT_SYMBOL_GPL(add_wait_queue_priority_exclusive); + void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; From 2cdd64cbf9906f9d2d52ef96e1471992ff6c27ec Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:19 -0700 Subject: [PATCH 096/101] KVM: Disallow binding multiple irqfds to an eventfd with a priority waiter Disallow binding an irqfd to an eventfd that already has a priority waiter, i.e. to an eventfd that already has an attached irqfd. KVM always operates in exclusive mode for EPOLL_IN (unconditionally returns '1'), i.e. only the first waiter will be notified. KVM already disallows binding multiple irqfds to an eventfd in a single VM, but doesn't guard against multiple VMs binding to an eventfd. Adding the extra protection reduces the pain of a userspace VMM bug, e.g. if userspace fails to de-assign before re-assigning when transferring state for intra-host migration, then the migration will explicitly fail as opposed to dropping IRQs on the destination VM. Temporarily keep KVM's manual check on irqfds.items, but add a WARN, e.g. to allow sanity checking the waitqueue enforcement. 
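For illustration, the userspace-visible effect of the new check is roughly the following (a sketch only; VM creation, routing setup, and error handling are elided, and the GSI values are arbitrary):

	int fd = eventfd(0, 0);
	struct kvm_irqfd irqfd = { .fd = fd, .gsi = 32 };

	ioctl(vm1_fd, KVM_IRQFD, &irqfd);	/* succeeds, the eventfd gains a priority waiter */

	irqfd.gsi = 33;
	ioctl(vm2_fd, KVM_IRQFD, &irqfd);	/* fails with EBUSY, even though vm2 is a different VM */

Previously, the second assignment would "succeed", but the second VM would never be notified, as only the first priority waiter on the eventfd ever receives wakeups.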
Cc: Oliver Upton Cc: David Matlack Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-10-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 55 +++++++++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index f8c2486f95d5..ab9e3cfb81bf 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -291,38 +291,57 @@ static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, struct kvm_kernel_irqfd *tmp; struct kvm *kvm = p->kvm; + /* + * Note, irqfds.lock protects the irqfd's irq_entry, i.e. its routing, + * and irqfds.items. It does NOT protect registering with the eventfd. + */ spin_lock_irq(&kvm->irqfds.lock); - list_for_each_entry(tmp, &kvm->irqfds.items, list) { - if (irqfd->eventfd != tmp->eventfd) - continue; - /* This fd is used for another irq already. */ - p->ret = -EBUSY; - spin_unlock_irq(&kvm->irqfds.lock); - return; - } - + /* + * Initialize the routing information prior to adding the irqfd to the + * eventfd's waitqueue, as irqfd_wakeup() can be invoked as soon as the + * irqfd is registered. + */ irqfd_update(kvm, irqfd); - list_add_tail(&irqfd->list, &kvm->irqfds.items); - /* * Add the irqfd as a priority waiter on the eventfd, with a custom * wake-up handler, so that KVM *and only KVM* is notified whenever the - * underlying eventfd is signaled. Temporarily lie to lockdep about - * holding irqfds.lock to avoid a false positive regarding potential - * deadlock with irqfd_wakeup() (see irqfd_wakeup() for details). + * underlying eventfd is signaled. */ init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); + /* + * Temporarily lie to lockdep about holding irqfds.lock to avoid a + * false positive regarding potential deadlock with irqfd_wakeup() + * (see irqfd_wakeup() for details). + * + * Adding to the wait queue will fail if there is already a priority + * waiter, i.e. if the eventfd is associated with another irqfd (in any + * VM). Note, kvm_irqfd_deassign() waits for all in-flight shutdown + * jobs to complete, i.e. ensures the irqfd has been removed from the + * eventfd's waitqueue before returning to userspace. + */ spin_release(&kvm->irqfds.lock.dep_map, _RET_IP_); - irqfd->wait.flags |= WQ_FLAG_EXCLUSIVE; - add_wait_queue_priority(wqh, &irqfd->wait); + p->ret = add_wait_queue_priority_exclusive(wqh, &irqfd->wait); spin_acquire(&kvm->irqfds.lock.dep_map, 0, 0, _RET_IP_); + if (p->ret) + goto out; + list_for_each_entry(tmp, &kvm->irqfds.items, list) { + if (irqfd->eventfd != tmp->eventfd) + continue; + + WARN_ON_ONCE(1); + /* This fd is used for another irq already. */ + p->ret = -EBUSY; + goto out; + } + + list_add_tail(&irqfd->list, &kvm->irqfds.items); + +out: spin_unlock_irq(&kvm->irqfds.lock); - - p->ret = 0; } #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) From b599d44a71f1aa1acff54b05e4c0e60ea82ed90c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:20 -0700 Subject: [PATCH 097/101] KVM: Drop sanity check that per-VM list of irqfds is unique Now that the eventfd's waitqueue ensures it has at most one priority waiter, i.e. prevents KVM from binding multiple irqfds to one eventfd, drop KVM's sanity check that eventfds are unique for a single VM. 
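Conceptually, the uniqueness guarantee now comes entirely from the waitqueue helper added earlier in the series; a sketch of the contract kvm_irqfd_register() relies on:

	/* First irqfd: the eventfd has no priority waiter yet, insertion succeeds. */
	ret = add_wait_queue_priority_exclusive(wqh, &irqfd1->wait);	/* ret == 0 */

	/* Any later irqfd, from any VM: a priority waiter already heads the queue. */
	ret = add_wait_queue_priority_exclusive(wqh, &irqfd2->wait);	/* ret == -EBUSY */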
Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-11-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index ab9e3cfb81bf..6b1133a6617f 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -288,7 +288,6 @@ static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, { struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt); struct kvm_kernel_irqfd *irqfd = p->irqfd; - struct kvm_kernel_irqfd *tmp; struct kvm *kvm = p->kvm; /* @@ -328,16 +327,6 @@ static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, if (p->ret) goto out; - list_for_each_entry(tmp, &kvm->irqfds.items, list) { - if (irqfd->eventfd != tmp->eventfd) - continue; - - WARN_ON_ONCE(1); - /* This fd is used for another irq already. */ - p->ret = -EBUSY; - goto out; - } - list_add_tail(&irqfd->list, &kvm->irqfds.items); out: From 033b76bc7f066b3e2e024e5ced0de0768d7d57b5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:21 -0700 Subject: [PATCH 098/101] KVM: selftests: Assert that eventfd() succeeds in Xen shinfo test Assert that eventfd() succeeds in the Xen shinfo test instead of skipping the associated testcase. While eventfd() is outside the scope of KVM, KVM unconditionally selects EVENTFD, i.e. the syscall should always succeed. Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-12-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/xen_shinfo_test.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/kvm/x86/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86/xen_shinfo_test.c index 287829f850f7..34d180cf4eed 100644 --- a/tools/testing/selftests/kvm/x86/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86/xen_shinfo_test.c @@ -548,14 +548,11 @@ int main(int argc, char *argv[]) if (do_eventfd_tests) { irq_fd[0] = eventfd(0, 0); + TEST_ASSERT(irq_fd[0] >= 0, __KVM_SYSCALL_ERROR("eventfd()", irq_fd[0])); + irq_fd[1] = eventfd(0, 0); + TEST_ASSERT(irq_fd[1] >= 0, __KVM_SYSCALL_ERROR("eventfd()", irq_fd[1])); - /* Unexpected, but not a KVM failure */ - if (irq_fd[0] == -1 || irq_fd[1] == -1) - do_evtchn_tests = do_eventfd_tests = false; - } - - if (do_eventfd_tests) { irq_routes.info.nr = 2; irq_routes.entries[0].gsi = 32; From 74e5e3fb0dd71790a1974d63420d43bc5743a56b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:22 -0700 Subject: [PATCH 099/101] KVM: selftests: Add utilities to create eventfds and do KVM_IRQFD Add helpers to create eventfds and to (de)assign eventfds via KVM_IRQFD. 
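With the helpers in place, a typical assign/de-assign sequence in a test collapses to the following (sketch; the GSI value is arbitrary and "vm" is assumed to have an in-kernel IRQ chip):

	int fd = kvm_new_eventfd();		/* asserts that eventfd() succeeds */

	kvm_assign_irqfd(vm, 32, fd);		/* KVM_IRQFD, asserts success */
	kvm_deassign_irqfd(vm, 32, fd);		/* KVM_IRQFD with KVM_IRQFD_FLAG_DEASSIGN */
	close(fd);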
Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-13-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/arm64/vgic_irq.c | 12 ++---- .../testing/selftests/kvm/include/kvm_util.h | 40 +++++++++++++++++++ .../selftests/kvm/x86/xen_shinfo_test.c | 18 ++------- 3 files changed, 47 insertions(+), 23 deletions(-) diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c index f4ac28d53747..a09dd423c2d7 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_irq.c +++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c @@ -620,18 +620,12 @@ static void kvm_routing_and_irqfd_check(struct kvm_vm *vm, * that no actual interrupt was injected for those cases. */ - for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) { - fd[f] = eventfd(0, 0); - TEST_ASSERT(fd[f] != -1, __KVM_SYSCALL_ERROR("eventfd()", fd[f])); - } + for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) + fd[f] = kvm_new_eventfd(); for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) { - struct kvm_irqfd irqfd = { - .fd = fd[f], - .gsi = i - MIN_SPI, - }; assert(i <= (uint64_t)UINT_MAX); - vm_ioctl(vm, KVM_IRQFD, &irqfd); + kvm_assign_irqfd(vm, i - MIN_SPI, fd[f]); } for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) { diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index bee65ca08721..1ddf85eebd1d 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -18,6 +18,7 @@ #include #include +#include #include #include "kvm_util_arch.h" @@ -502,6 +503,45 @@ static inline int vm_get_stats_fd(struct kvm_vm *vm) return fd; } +static inline int __kvm_irqfd(struct kvm_vm *vm, uint32_t gsi, int eventfd, + uint32_t flags) +{ + struct kvm_irqfd irqfd = { + .fd = eventfd, + .gsi = gsi, + .flags = flags, + .resamplefd = -1, + }; + + return __vm_ioctl(vm, KVM_IRQFD, &irqfd); +} + +static inline void kvm_irqfd(struct kvm_vm *vm, uint32_t gsi, int eventfd, + uint32_t flags) +{ + int ret = __kvm_irqfd(vm, gsi, eventfd, flags); + + TEST_ASSERT_VM_VCPU_IOCTL(!ret, KVM_IRQFD, ret, vm); +} + +static inline void kvm_assign_irqfd(struct kvm_vm *vm, uint32_t gsi, int eventfd) +{ + kvm_irqfd(vm, gsi, eventfd, 0); +} + +static inline void kvm_deassign_irqfd(struct kvm_vm *vm, uint32_t gsi, int eventfd) +{ + kvm_irqfd(vm, gsi, eventfd, KVM_IRQFD_FLAG_DEASSIGN); +} + +static inline int kvm_new_eventfd(void) +{ + int fd = eventfd(0, 0); + + TEST_ASSERT(fd >= 0, __KVM_SYSCALL_ERROR("eventfd()", fd)); + return fd; +} + static inline void read_stats_header(int stats_fd, struct kvm_stats_header *header) { ssize_t ret; diff --git a/tools/testing/selftests/kvm/x86/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86/xen_shinfo_test.c index 34d180cf4eed..23909b501ac2 100644 --- a/tools/testing/selftests/kvm/x86/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86/xen_shinfo_test.c @@ -547,11 +547,8 @@ int main(int argc, char *argv[]) int irq_fd[2] = { -1, -1 }; if (do_eventfd_tests) { - irq_fd[0] = eventfd(0, 0); - TEST_ASSERT(irq_fd[0] >= 0, __KVM_SYSCALL_ERROR("eventfd()", irq_fd[0])); - - irq_fd[1] = eventfd(0, 0); - TEST_ASSERT(irq_fd[1] >= 0, __KVM_SYSCALL_ERROR("eventfd()", irq_fd[1])); + irq_fd[0] = kvm_new_eventfd(); + irq_fd[1] = kvm_new_eventfd(); irq_routes.info.nr = 2; @@ -569,15 +566,8 @@ int main(int argc, char *argv[]) vm_ioctl(vm, KVM_SET_GSI_ROUTING, &irq_routes.info); - 
struct kvm_irqfd ifd = { }; - - ifd.fd = irq_fd[0]; - ifd.gsi = 32; - vm_ioctl(vm, KVM_IRQFD, &ifd); - - ifd.fd = irq_fd[1]; - ifd.gsi = 33; - vm_ioctl(vm, KVM_IRQFD, &ifd); + kvm_assign_irqfd(vm, 32, irq_fd[0]); + kvm_assign_irqfd(vm, 33, irq_fd[1]); struct sigaction sa = { }; sa.sa_handler = handle_alrm; From 7e9b231c402a297251b3e6e0f5cc16cef7dd3ce5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:23 -0700 Subject: [PATCH 100/101] KVM: selftests: Add a KVM_IRQFD test to verify uniqueness requirements Add a selftest to verify that eventfd+irqfd bindings are globally unique, i.e. that KVM doesn't allow multiple irqfds to bind to a single eventfd, even across VMs. Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-14-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 1 + tools/testing/selftests/kvm/irqfd_test.c | 135 +++++++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 tools/testing/selftests/kvm/irqfd_test.c diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 38b95998e1e6..028456f1aae1 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -59,6 +59,7 @@ TEST_PROGS_x86 += x86/nx_huge_pages_test.sh TEST_GEN_PROGS_COMMON = demand_paging_test TEST_GEN_PROGS_COMMON += dirty_log_test TEST_GEN_PROGS_COMMON += guest_print_test +TEST_GEN_PROGS_COMMON += irqfd_test TEST_GEN_PROGS_COMMON += kvm_binary_stats_test TEST_GEN_PROGS_COMMON += kvm_create_max_vcpus TEST_GEN_PROGS_COMMON += kvm_page_table_test diff --git a/tools/testing/selftests/kvm/irqfd_test.c b/tools/testing/selftests/kvm/irqfd_test.c new file mode 100644 index 000000000000..7c301b4c7005 --- /dev/null +++ b/tools/testing/selftests/kvm/irqfd_test.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kvm_util.h" + +static struct kvm_vm *vm1; +static struct kvm_vm *vm2; +static int __eventfd; +static bool done; + +/* + * KVM de-assigns based on eventfd *and* GSI, but requires unique eventfds when + * assigning (the API isn't symmetrical). Abuse the oddity and use a per-task + * GSI base to avoid false failures due to cross-task de-assign, i.e. so that + * the secondary doesn't de-assign the primary's eventfd and cause assign to + * unexpectedly succeed on the primary. + */ +#define GSI_BASE_PRIMARY 0x20 +#define GSI_BASE_SECONDARY 0x30 +static void juggle_eventfd_secondary(struct kvm_vm *vm, int eventfd) +{ + int r, i; + + /* + * The secondary task can encounter EBADF since the primary can close + * the eventfd at any time. And because the primary can recreate the + * eventfd, at the same fd in the file table, the secondary can also + * encounter "unexpected" success, e.g. if the close+recreate happens + * between the first and second assignments. The secondary's role is + * mostly to antagonize KVM, not to detect bugs. + */ + for (i = 0; i < 2; i++) { + r = __kvm_irqfd(vm, GSI_BASE_SECONDARY, eventfd, 0); + TEST_ASSERT(!r || errno == EBUSY || errno == EBADF, + "Wanted success, EBUSY, or EBADF, r = %d, errno = %d", + r, errno); + + /* De-assign should succeed unless the eventfd was closed.
From 7e9b231c402a297251b3e6e0f5cc16cef7dd3ce5 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 22 May 2025 16:52:23 -0700
Subject: [PATCH 100/101] KVM: selftests: Add a KVM_IRQFD test to verify uniqueness requirements

Add a selftest to verify that eventfd+irqfd bindings are globally unique,
i.e. that KVM doesn't allow multiple irqfds to bind to a single eventfd,
even across VMs.

Tested-by: K Prateek Nayak
Acked-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/r/20250522235223.3178519-14-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 tools/testing/selftests/kvm/Makefile.kvm |   1 +
 tools/testing/selftests/kvm/irqfd_test.c | 135 +++++++++++++++++++++++
 2 files changed, 136 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/irqfd_test.c

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 38b95998e1e6..028456f1aae1 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -59,6 +59,7 @@ TEST_PROGS_x86 += x86/nx_huge_pages_test.sh
 TEST_GEN_PROGS_COMMON = demand_paging_test
 TEST_GEN_PROGS_COMMON += dirty_log_test
 TEST_GEN_PROGS_COMMON += guest_print_test
+TEST_GEN_PROGS_COMMON += irqfd_test
 TEST_GEN_PROGS_COMMON += kvm_binary_stats_test
 TEST_GEN_PROGS_COMMON += kvm_create_max_vcpus
 TEST_GEN_PROGS_COMMON += kvm_page_table_test
diff --git a/tools/testing/selftests/kvm/irqfd_test.c b/tools/testing/selftests/kvm/irqfd_test.c
new file mode 100644
index 000000000000..7c301b4c7005
--- /dev/null
+++ b/tools/testing/selftests/kvm/irqfd_test.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "kvm_util.h"
+
+static struct kvm_vm *vm1;
+static struct kvm_vm *vm2;
+static int __eventfd;
+static bool done;
+
+/*
+ * KVM de-assigns based on eventfd *and* GSI, but requires unique eventfds when
+ * assigning (the API isn't symmetrical).  Abuse the oddity and use a per-task
+ * GSI base to avoid false failures due to cross-task de-assign, i.e. so that
+ * the secondary doesn't de-assign the primary's eventfd and cause assign to
+ * unexpectedly succeed on the primary.
+ */
+#define GSI_BASE_PRIMARY	0x20
+#define GSI_BASE_SECONDARY	0x30
+
+static void juggle_eventfd_secondary(struct kvm_vm *vm, int eventfd)
+{
+	int r, i;
+
+	/*
+	 * The secondary task can encounter EBADF since the primary can close
+	 * the eventfd at any time.  And because the primary can recreate the
+	 * eventfd, at the same fd in the file table, the secondary can also
+	 * encounter "unexpected" success, e.g. if the close+recreate happens
+	 * between the first and second assignments.  The secondary's role is
+	 * mostly to antagonize KVM, not to detect bugs.
+	 */
+	for (i = 0; i < 2; i++) {
+		r = __kvm_irqfd(vm, GSI_BASE_SECONDARY + i, eventfd, 0);
+		TEST_ASSERT(!r || errno == EBUSY || errno == EBADF,
+			    "Wanted success, EBUSY, or EBADF, r = %d, errno = %d",
+			    r, errno);
+
+		/* De-assign should succeed unless the eventfd was closed. */
+		r = __kvm_irqfd(vm, GSI_BASE_SECONDARY + i, eventfd, KVM_IRQFD_FLAG_DEASSIGN);
+		TEST_ASSERT(!r || errno == EBADF,
+			    "De-assign should succeed unless the fd was closed");
+	}
+}
+
+static void *secondary_irqfd_juggler(void *ign)
+{
+	while (!READ_ONCE(done)) {
+		juggle_eventfd_secondary(vm1, READ_ONCE(__eventfd));
+		juggle_eventfd_secondary(vm2, READ_ONCE(__eventfd));
+	}
+
+	return NULL;
+}
+
+static void juggle_eventfd_primary(struct kvm_vm *vm, int eventfd)
+{
+	int r1, r2;
+
+	/*
+	 * At least one of the assigns should fail.  KVM disallows assigning a
+	 * single eventfd to multiple GSIs (or VMs), so it's possible that both
+	 * assignments can fail, too.
+	 */
+	r1 = __kvm_irqfd(vm, GSI_BASE_PRIMARY, eventfd, 0);
+	TEST_ASSERT(!r1 || errno == EBUSY,
+		    "Wanted success or EBUSY, r = %d, errno = %d", r1, errno);
+
+	r2 = __kvm_irqfd(vm, GSI_BASE_PRIMARY + 1, eventfd, 0);
+	TEST_ASSERT(r1 || (r2 && errno == EBUSY),
+		    "Wanted failure (EBUSY), r1 = %d, r2 = %d, errno = %d",
+		    r1, r2, errno);
+
+	/*
+	 * De-assign should always succeed, even if the corresponding assign
+	 * failed.
+	 */
+	kvm_irqfd(vm, GSI_BASE_PRIMARY, eventfd, KVM_IRQFD_FLAG_DEASSIGN);
+	kvm_irqfd(vm, GSI_BASE_PRIMARY + 1, eventfd, KVM_IRQFD_FLAG_DEASSIGN);
+}
+
+int main(int argc, char *argv[])
+{
+	pthread_t racing_thread;
+	int r, i;
+
+	/* Create "full" VMs, as KVM_IRQFD requires an in-kernel IRQ chip. */
+	vm1 = vm_create(1);
+	vm2 = vm_create(1);
+
+	WRITE_ONCE(__eventfd, kvm_new_eventfd());
+
+	kvm_irqfd(vm1, 10, __eventfd, 0);
+
+	r = __kvm_irqfd(vm1, 11, __eventfd, 0);
+	TEST_ASSERT(r && errno == EBUSY,
+		    "Wanted EBUSY, r = %d, errno = %d", r, errno);
+
+	r = __kvm_irqfd(vm2, 12, __eventfd, 0);
+	TEST_ASSERT(r && errno == EBUSY,
+		    "Wanted EBUSY, r = %d, errno = %d", r, errno);
+
+	/*
+	 * De-assign all eventfds, along with multiple eventfds that were never
+	 * assigned.  KVM's ABI is that de-assign is allowed so long as the
+	 * eventfd itself is valid.
+	 */
+	kvm_irqfd(vm1, 11, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN);
+	kvm_irqfd(vm1, 12, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN);
+	kvm_irqfd(vm1, 13, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN);
+	kvm_irqfd(vm1, 14, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN);
+	kvm_irqfd(vm1, 10, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN);
+
+	close(__eventfd);
+
+	pthread_create(&racing_thread, NULL, secondary_irqfd_juggler, vm2);
+
+	for (i = 0; i < 10000; i++) {
+		WRITE_ONCE(__eventfd, kvm_new_eventfd());
+
+		juggle_eventfd_primary(vm1, __eventfd);
+		juggle_eventfd_primary(vm2, __eventfd);
+		close(__eventfd);
+	}
+
+	WRITE_ONCE(done, true);
+	pthread_join(racing_thread, NULL);
+}
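Distilled, the uniqueness contract the test above hammers on fits in a few
lines. A sketch using the helpers from the previous patch (vm1, vm2, and the
GSI numbers are illustrative):

	int fd = kvm_new_eventfd();

	/* The first binding of an eventfd succeeds... */
	kvm_irqfd(vm1, 10, fd, 0);

	/*
	 * ...but any further assign of the same eventfd fails with EBUSY,
	 * whether it targets a different GSI or a different VM.
	 */
	TEST_ASSERT(__kvm_irqfd(vm1, 11, fd, 0) && errno == EBUSY, "Wanted EBUSY");
	TEST_ASSERT(__kvm_irqfd(vm2, 12, fd, 0) && errno == EBUSY, "Wanted EBUSY");

	/*
	 * De-assign is keyed on eventfd+GSI; once it runs, the eventfd is
	 * free to be bound again.
	 */
	kvm_irqfd(vm1, 10, fd, KVM_IRQFD_FLAG_DEASSIGN);
	kvm_irqfd(vm1, 10, fd, 0);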
From 81bf24f1ac77029bf858c0da081088eb62b1b230 Mon Sep 17 00:00:00 2001
From: Mark Brown
Date: Thu, 10 Jul 2025 12:12:20 +0100
Subject: [PATCH 101/101] KVM: selftests: Add CONFIG_EVENTFD for irqfd selftest

Commit 7e9b231c402a ("KVM: selftests: Add a KVM_IRQFD test to verify
uniqueness requirements") added a test for the newly added irqfd support,
but since the feature is built on eventfds, the test won't work unless the
kernel has been built with eventfd support.

Add CONFIG_EVENTFD to the list of required options for the KVM selftests.

Signed-off-by: Mark Brown
Link: https://lore.kernel.org/r/20250710-kvm-selftests-eventfd-config-v1-1-78c276e4b80f@kernel.org
Signed-off-by: Sean Christopherson
---
 tools/testing/selftests/kvm/config | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/kvm/config b/tools/testing/selftests/kvm/config
index 8835fed09e9f..96d874b239eb 100644
--- a/tools/testing/selftests/kvm/config
+++ b/tools/testing/selftests/kvm/config
@@ -1,5 +1,6 @@
 CONFIG_KVM=y
 CONFIG_KVM_INTEL=y
 CONFIG_KVM_AMD=y
+CONFIG_EVENTFD=y
 CONFIG_USERFAULTFD=y
 CONFIG_IDLE_PAGE_TRACKING=y
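Where requiring CONFIG_EVENTFD in the selftest config isn't enough, e.g. when
running against an arbitrary prebuilt kernel, a runtime guard could complement
it. A sketch (not part of the patch) that would sit early in the test's main(),
assuming the selftest harness's TEST_REQUIRE() skip macro:

	/* Skip, rather than fail, if the kernel lacks eventfd() entirely. */
	int fd = eventfd(0, 0);

	TEST_REQUIRE(fd >= 0 || errno != ENOSYS);
	if (fd >= 0)
		close(fd);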