From 3c4629b68dbe18e454cce4b864c530268cffbeed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 22 Dec 2025 09:00:33 +0100 Subject: [PATCH 01/59] virtio: uapi: avoid usage of libc types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using libc types and headers from the UAPI headers is problematic as it introduces a dependency on a full C toolchain. On Linux 'unsigned long' works as a replacement for 'uintptr_t' and does not depend on libc. Signed-off-by: Thomas Weißschuh Acked-by: Arnd Bergmann Signed-off-by: Michael S. Tsirkin Message-Id: <20251222-uapi-virtio-v1-1-29390f87bcad@linutronix.de> --- include/uapi/linux/virtio_ring.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h index f8c20d3de8da..3c478582a3c2 100644 --- a/include/uapi/linux/virtio_ring.h +++ b/include/uapi/linux/virtio_ring.h @@ -31,9 +31,6 @@ * SUCH DAMAGE. * * Copyright Rusty Russell IBM Corporation 2007. */ -#ifndef __KERNEL__ -#include <stdint.h> -#endif #include <linux/types.h> #include <linux/virtio_types.h> @@ -202,7 +199,7 @@ static inline void vring_init(struct vring *vr, unsigned int num, void *p, vr->num = num; vr->desc = p; vr->avail = (struct vring_avail *)((char *)p + num * sizeof(struct vring_desc)); - vr->used = (void *)(((uintptr_t)&vr->avail->ring[num] + sizeof(__virtio16) + vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + sizeof(__virtio16) + align-1) & ~(align - 1)); } From 4b7bf8d5503287ed3bd661207b9d061999ac494e Mon Sep 17 00:00:00 2001 From: "zhangdongchuan@eswincomputing.com" Date: Wed, 26 Nov 2025 11:40:16 +0800 Subject: [PATCH 02/59] virtio_ring: code cleanup in detach_buf_split Since the return value of vring_unmap_one_split() is exactly vq->split.desc_extra[i].next, 'i = vq->split.desc_extra[i].next' is redundant. Assign the return value of vring_unmap_one_split() to i instead. Since vq->split.desc_extra is assigned to extra, use extra[i].next instead of vq->split.desc_extra[i].next to improve readability. No change in functionality. Signed-off-by: zhangdongchuan Acked-by: Jason Wang Signed-off-by: Michael S. Tsirkin Message-Id: <202511261140162936986@eswincomputing.com> --- drivers/virtio/virtio_ring.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index ddab68959671..560e132f1f19 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -764,13 +764,12 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, i = head; while (vq->split.vring.desc[i].flags & nextflag) { - vring_unmap_one_split(vq, &extra[i]); - i = vq->split.desc_extra[i].next; + i = vring_unmap_one_split(vq, &extra[i]); vq->vq.num_free++; } vring_unmap_one_split(vq, &extra[i]); - vq->split.desc_extra[i].next = vq->free_head; + extra[i].next = vq->free_head; vq->free_head = head; /* Plus final descriptor */ From 3b34d6324d1f82a4d35ce461add457e185dc98ac Mon Sep 17 00:00:00 2001 From: Jon Kohler Date: Wed, 12 Nov 2025 17:55:28 -0700 Subject: [PATCH 03/59] vhost: use "checked" versions of get_user() and put_user() vhost_get_user and vhost_put_user leverage __get_user and __put_user, respectively, which were both added in 2016 by commit 6b1e6cc7855b ("vhost: new device IOTLB API"). In a heavy UDP transmit workload on a vhost-net backed tap device, these functions showed up as ~11.6% of samples in a flamegraph of the underlying vhost worker thread.
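For context, the difference between the two uaccess families can be shown with a minimal kernel-style sketch (illustrative only; the wrapper functions here are hypothetical, while get_user()/__get_user() are the real APIs):

    #include <linux/uaccess.h>
    #include <linux/types.h>

    /* Checked variant: get_user() validates the user pointer against
     * the user address-space limit, and that check doubles as the
     * speculation barrier, so no LFENCE is needed. Returns 0 on
     * success or -EFAULT. */
    static int read_idx_checked(__u16 __user *ptr, __u16 *val)
    {
            return get_user(*val, ptr);
    }

    /* Unchecked variant: __get_user() skips the range check but must
     * still be speculation-safe, which on modern x86 means a
     * serializing LFENCE, the cost described below. */
    static int read_idx_unchecked(__u16 __user *ptr, __u16 *val)
    {
            return __get_user(*val, ptr);
    }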
Quoting Linus from [1]: Anyway, every single __get_user() call I looked at looked like historical garbage. [...] End result: I get the feeling that we should just do a global search-and-replace of the __get_user/ __put_user users, replace them with plain get_user/put_user instead, and then fix up any fallout (eg the coco code). Switch to plain get_user/put_user in vhost, which results in a slight throughput speedup. get_user is now about ~8.4% of samples in the flamegraph. Basic iperf3 test on an Intel 5416S CPU with Ubuntu 25.10 guest: TX: taskset -c 2 iperf3 -c <server-ip> -t 60 -p 5200 -b 0 -u -i 5 RX: taskset -c 2 iperf3 -s -p 5200 -D Before: 6.08 Gbits/sec After: 6.32 Gbits/sec As to what drives the speedup, Sean's patch [2] explains: Use the normal, checked versions for get_user() and put_user() instead of the double-underscore versions that omit range checks, as the checked versions are actually measurably faster on modern CPUs (12%+ on Intel, 25%+ on AMD). The performance hit on the unchecked versions is almost entirely due to the added LFENCE on CPUs where LFENCE is serializing (which is effectively all modern CPUs), which was added by commit 304ec1b05031 ("x86/uaccess: Use __uaccess_begin_nospec() and uaccess_try_nospec"). The small optimizations done by commit b19b74bc99b1 ("x86/mm: Rework address range check in get_user() and put_user()") likely shave a few cycles off, but the bulk of the extra latency comes from the LFENCE. [1] https://lore.kernel.org/all/CAHk-=wiJiDSPZJTV7z3Q-u4DfLgQTNWqUqqrwSBHp0+Dh016FA@mail.gmail.com/ [2] https://lore.kernel.org/all/20251106210206.221558-1-seanjc@google.com/ Suggested-by: Linus Torvalds Cc: Borislav Petkov Cc: Sean Christopherson Signed-off-by: Jon Kohler Message-Id: <20251113005529.2494066-1-jon@nutanix.com> Acked-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vhost.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index bccdc9eab267..fcf7f10adbbf 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1444,13 +1444,13 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq, ({ \ int ret; \ if (!vq->iotlb) { \ - ret = __put_user(x, ptr); \ + ret = put_user(x, ptr); \ } else { \ __typeof__(ptr) to = \ (__typeof__(ptr)) __vhost_get_user(vq, ptr, \ sizeof(*ptr), VHOST_ADDR_USED); \ if (to != NULL) \ - ret = __put_user(x, to); \ + ret = put_user(x, to); \ else \ ret = -EFAULT; \ } \ @@ -1489,14 +1489,14 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq) ({ \ int ret; \ if (!vq->iotlb) { \ - ret = __get_user(x, ptr); \ + ret = get_user(x, ptr); \ } else { \ __typeof__(ptr) from = \ (__typeof__(ptr)) __vhost_get_user(vq, ptr, \ sizeof(*ptr), \ type); \ if (from != NULL) \ - ret = __get_user(x, from); \ + ret = get_user(x, from); \ else \ ret = -EFAULT; \ } \ From 8ce8e3e5582e85f6533b5013806299a8efba67f0 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 30 Dec 2025 14:46:31 +0800 Subject: [PATCH 04/59] virtio_ring: rename virtqueue_reinit_xxx to virtqueue_reset_xxx() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To be consistent with virtqueue_reset(). Acked-by: Eugenio Pérez Reviewed-by: Xuan Zhuo Signed-off-by: Jason Wang Signed-off-by: Michael S.
Tsirkin Message-Id: <20251230064649.55597-2-jasowang@redhat.com> --- drivers/virtio/virtio_ring.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 560e132f1f19..bcac7500c874 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -1008,7 +1008,7 @@ static void virtqueue_vring_init_split(struct vring_virtqueue_split *vring_split } } -static void virtqueue_reinit_split(struct vring_virtqueue *vq) +static void virtqueue_reset_split(struct vring_virtqueue *vq) { int num; @@ -1252,7 +1252,7 @@ static int virtqueue_resize_split(struct virtqueue *_vq, u32 num) err_state_extra: vring_free_split(&vring_split, vdev, vq->map); err: - virtqueue_reinit_split(vq); + virtqueue_reset_split(vq); return -ENOMEM; } @@ -2090,7 +2090,7 @@ static void virtqueue_vring_attach_packed(struct vring_virtqueue *vq, vq->free_head = 0; } -static void virtqueue_reinit_packed(struct vring_virtqueue *vq) +static void virtqueue_reset_packed(struct vring_virtqueue *vq) { memset(vq->packed.vring.device, 0, vq->packed.event_size_in_bytes); memset(vq->packed.vring.driver, 0, vq->packed.event_size_in_bytes); @@ -2217,7 +2217,7 @@ static int virtqueue_resize_packed(struct virtqueue *_vq, u32 num) err_state_extra: vring_free_packed(&vring_packed, vdev, vq->map); err_ring: - virtqueue_reinit_packed(vq); + virtqueue_reset_packed(vq); return -ENOMEM; } @@ -2859,9 +2859,9 @@ int virtqueue_reset(struct virtqueue *_vq, recycle_done(_vq); if (vq->packed_ring) - virtqueue_reinit_packed(vq); + virtqueue_reset_packed(vq); else - virtqueue_reinit_split(vq); + virtqueue_reset_split(vq); return virtqueue_enable_after_reset(_vq); } From 79f6d682937dd91c5ed3a1050fa99cb4369dd720 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 30 Dec 2025 14:46:32 +0800 Subject: [PATCH 05/59] virtio_ring: switch to use vring_virtqueue in virtqueue_poll variants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Those variants are used internally so let's switch to use vring_virtqueue as parameter to be consistent with other internal virtqueue helpers. Acked-by: Eugenio Pérez Reviewed-by: Xuan Zhuo Signed-off-by: Jason Wang Tested-by: Lei Yang Reviewed-by: Eugenio Pérez Signed-off-by: Michael S. 
Tsirkin Message-Id: <20251230064649.55597-3-jasowang@redhat.com> --- drivers/virtio/virtio_ring.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index bcac7500c874..c92b371d9a56 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -918,11 +918,10 @@ static unsigned int virtqueue_enable_cb_prepare_split(struct virtqueue *_vq) return last_used_idx; } -static bool virtqueue_poll_split(struct virtqueue *_vq, unsigned int last_used_idx) +static bool virtqueue_poll_split(struct vring_virtqueue *vq, + unsigned int last_used_idx) { - struct vring_virtqueue *vq = to_vvq(_vq); - - return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, + return (u16)last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->split.vring.used->idx); } @@ -1843,9 +1842,8 @@ static unsigned int virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq) return vq->last_used_idx; } -static bool virtqueue_poll_packed(struct virtqueue *_vq, u16 off_wrap) +static bool virtqueue_poll_packed(struct vring_virtqueue *vq, u16 off_wrap) { - struct vring_virtqueue *vq = to_vvq(_vq); bool wrap_counter; u16 used_idx; @@ -2610,8 +2608,8 @@ bool virtqueue_poll(struct virtqueue *_vq, unsigned int last_used_idx) return false; virtio_mb(vq->weak_barriers); - return vq->packed_ring ? virtqueue_poll_packed(_vq, last_used_idx) : - virtqueue_poll_split(_vq, last_used_idx); + return vq->packed_ring ? virtqueue_poll_packed(vq, last_used_idx) : + virtqueue_poll_split(vq, last_used_idx); } EXPORT_SYMBOL_GPL(virtqueue_poll); From 40da006f137dbbd16b657da37f6ea4fb8ad13671 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 30 Dec 2025 14:46:33 +0800 Subject: [PATCH 06/59] virtio_ring: unify logic of virtqueue_poll() and more_used() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch unifies the logic of virtqueue_poll() and more_used() for better code reuse and to ease the future in-order implementation. Acked-by: Eugenio Pérez Reviewed-by: Xuan Zhuo Signed-off-by: Jason Wang Tested-by: Lei Yang Reviewed-by: Eugenio Pérez Signed-off-by: Michael S.
Tsirkin Message-Id: <20251230064649.55597-4-jasowang@redhat.com> --- drivers/virtio/virtio_ring.c | 48 +++++++++++++++--------------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index c92b371d9a56..b25bb2f1e22c 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -805,10 +805,16 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, } } +static bool virtqueue_poll_split(const struct vring_virtqueue *vq, + unsigned int last_used_idx) +{ + return (u16)last_used_idx != virtio16_to_cpu(vq->vq.vdev, + vq->split.vring.used->idx); +} + static bool more_used_split(const struct vring_virtqueue *vq) { - return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, - vq->split.vring.used->idx); + return virtqueue_poll_split(vq, vq->last_used_idx); } static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, @@ -918,13 +924,6 @@ static unsigned int virtqueue_enable_cb_prepare_split(struct virtqueue *_vq) return last_used_idx; } -static bool virtqueue_poll_split(struct vring_virtqueue *vq, - unsigned int last_used_idx) -{ - return (u16)last_used_idx != virtio16_to_cpu(vq->vq.vdev, - vq->split.vring.used->idx); -} - static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); @@ -1709,16 +1708,20 @@ static inline bool is_used_desc_packed(const struct vring_virtqueue *vq, return avail == used && used == used_wrap_counter; } +static bool virtqueue_poll_packed(const struct vring_virtqueue *vq, u16 off_wrap) +{ + bool wrap_counter; + u16 used_idx; + + wrap_counter = off_wrap >> VRING_PACKED_EVENT_F_WRAP_CTR; + used_idx = off_wrap & ~(1 << VRING_PACKED_EVENT_F_WRAP_CTR); + + return is_used_desc_packed(vq, used_idx, wrap_counter); +} + static bool more_used_packed(const struct vring_virtqueue *vq) { - u16 last_used; - u16 last_used_idx; - bool used_wrap_counter; - - last_used_idx = READ_ONCE(vq->last_used_idx); - last_used = packed_last_used(last_used_idx); - used_wrap_counter = packed_used_wrap_counter(last_used_idx); - return is_used_desc_packed(vq, last_used, used_wrap_counter); + return virtqueue_poll_packed(vq, READ_ONCE(vq->last_used_idx)); } static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq, @@ -1842,17 +1845,6 @@ static unsigned int virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq) return vq->last_used_idx; } -static bool virtqueue_poll_packed(struct vring_virtqueue *vq, u16 off_wrap) -{ - bool wrap_counter; - u16 used_idx; - - wrap_counter = off_wrap >> VRING_PACKED_EVENT_F_WRAP_CTR; - used_idx = off_wrap & ~(1 << VRING_PACKED_EVENT_F_WRAP_CTR); - - return is_used_desc_packed(vq, used_idx, wrap_counter); -} - static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); From 9552bc05815447e04cc540ea034bb8632392c678 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 30 Dec 2025 14:46:34 +0800 Subject: [PATCH 07/59] virtio_ring: switch to use vring_virtqueue for virtqueue resize variants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Those variants are used internally so let's switch to use vring_virtqueue as parameter to be consistent with other internal virtqueue helpers. Acked-by: Eugenio Pérez Reviewed-by: Xuan Zhuo Signed-off-by: Jason Wang Signed-off-by: Michael S. 
Tsirkin Message-Id: <20251230064649.55597-5-jasowang@redhat.com> --- drivers/virtio/virtio_ring.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index b25bb2f1e22c..6c444ff009e2 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -1220,11 +1220,10 @@ static struct virtqueue *vring_create_virtqueue_split( return vq; } -static int virtqueue_resize_split(struct virtqueue *_vq, u32 num) +static int virtqueue_resize_split(struct vring_virtqueue *vq, u32 num) { struct vring_virtqueue_split vring_split = {}; - struct vring_virtqueue *vq = to_vvq(_vq); - struct virtio_device *vdev = _vq->vdev; + struct virtio_device *vdev = vq->vq.vdev; int err; err = vring_alloc_queue_split(&vring_split, vdev, num, @@ -2181,11 +2180,10 @@ static struct virtqueue *vring_create_virtqueue_packed( return vq; } -static int virtqueue_resize_packed(struct virtqueue *_vq, u32 num) +static int virtqueue_resize_packed(struct vring_virtqueue *vq, u32 num) { struct vring_virtqueue_packed vring_packed = {}; - struct vring_virtqueue *vq = to_vvq(_vq); - struct virtio_device *vdev = _vq->vdev; + struct virtio_device *vdev = vq->vq.vdev; int err; if (vring_alloc_queue_packed(&vring_packed, vdev, num, vq->map)) @@ -2808,9 +2806,9 @@ int virtqueue_resize(struct virtqueue *_vq, u32 num, recycle_done(_vq); if (vq->packed_ring) - err = virtqueue_resize_packed(_vq, num); + err = virtqueue_resize_packed(vq, num); else - err = virtqueue_resize_split(_vq, num); + err = virtqueue_resize_split(vq, num); err_reset = virtqueue_enable_after_reset(_vq); if (err_reset) From 8b8590b70894f5934249f5735e164ee2121d6549 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 30 Dec 2025 14:46:35 +0800 Subject: [PATCH 08/59] virtio_ring: switch to use vring_virtqueue for virtqueue_kick_prepare variants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Those variants are used internally so let's switch to use vring_virtqueue as parameter to be consistent with other internal virtqueue helpers. Acked-by: Eugenio Pérez Reviewed-by: Xuan Zhuo Signed-off-by: Jason Wang Tested-by: Lei Yang Reviewed-by: Eugenio Pérez Signed-off-by: Michael S. 
Tsirkin Message-Id: <20251230064649.55597-6-jasowang@redhat.com> --- drivers/virtio/virtio_ring.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 6c444ff009e2..b209f456b07a 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -717,9 +717,8 @@ unmap_release: return -ENOMEM; } -static bool virtqueue_kick_prepare_split(struct virtqueue *_vq) +static bool virtqueue_kick_prepare_split(struct vring_virtqueue *vq) { - struct vring_virtqueue *vq = to_vvq(_vq); u16 new, old; bool needs_kick; @@ -736,12 +735,12 @@ static bool virtqueue_kick_prepare_split(struct virtqueue *_vq) LAST_ADD_TIME_INVALID(vq); if (vq->event) { - needs_kick = vring_need_event(virtio16_to_cpu(_vq->vdev, + needs_kick = vring_need_event(virtio16_to_cpu(vq->vq.vdev, vring_avail_event(&vq->split.vring)), new, old); } else { needs_kick = !(vq->split.vring.used->flags & - cpu_to_virtio16(_vq->vdev, + cpu_to_virtio16(vq->vq.vdev, VRING_USED_F_NO_NOTIFY)); } END_USE(vq); @@ -1595,9 +1594,8 @@ unmap_release: return -EIO; } -static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq) +static bool virtqueue_kick_prepare_packed(struct vring_virtqueue *vq) { - struct vring_virtqueue *vq = to_vvq(_vq); u16 new, old, off_wrap, flags, wrap_counter, event_idx; bool needs_kick; union { @@ -2456,8 +2454,8 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); - return vq->packed_ring ? virtqueue_kick_prepare_packed(_vq) : - virtqueue_kick_prepare_split(_vq); + return vq->packed_ring ? virtqueue_kick_prepare_packed(vq) : + virtqueue_kick_prepare_split(vq); } EXPORT_SYMBOL_GPL(virtqueue_kick_prepare); From 4a0fa90b10a2b11522bcb808d90022f489b2ab27 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 30 Dec 2025 14:46:36 +0800 Subject: [PATCH 09/59] virtio_ring: switch to use vring_virtqueue for virtqueue_add variants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Those variants are used internally so let's switch to use vring_virtqueue as parameter to be consistent with other internal virtqueue helpers. Acked-by: Eugenio Pérez Reviewed-by: Xuan Zhuo Signed-off-by: Jason Wang Signed-off-by: Michael S. 
Tsirkin Message-Id: <20251230064649.55597-7-jasowang@redhat.com> --- drivers/virtio/virtio_ring.c | 39 ++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index b209f456b07a..5787cb428652 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -476,7 +476,7 @@ out: return extra->next; } -static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq, +static struct vring_desc *alloc_indirect_split(struct vring_virtqueue *vq, unsigned int total_sg, gfp_t gfp) { @@ -505,7 +505,7 @@ static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq, return desc; } -static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq, +static inline unsigned int virtqueue_add_desc_split(struct vring_virtqueue *vq, struct vring_desc *desc, struct vring_desc_extra *extra, unsigned int i, @@ -513,11 +513,12 @@ static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq, unsigned int len, u16 flags, bool premapped) { + struct virtio_device *vdev = vq->vq.vdev; u16 next; - desc[i].flags = cpu_to_virtio16(vq->vdev, flags); - desc[i].addr = cpu_to_virtio64(vq->vdev, addr); - desc[i].len = cpu_to_virtio32(vq->vdev, len); + desc[i].flags = cpu_to_virtio16(vdev, flags); + desc[i].addr = cpu_to_virtio64(vdev, addr); + desc[i].len = cpu_to_virtio32(vdev, len); extra[i].addr = premapped ? DMA_MAPPING_ERROR : addr; extra[i].len = len; @@ -525,12 +526,12 @@ static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq, next = extra[i].next; - desc[i].next = cpu_to_virtio16(vq->vdev, next); + desc[i].next = cpu_to_virtio16(vdev, next); return next; } -static inline int virtqueue_add_split(struct virtqueue *_vq, +static inline int virtqueue_add_split(struct vring_virtqueue *vq, struct scatterlist *sgs[], unsigned int total_sg, unsigned int out_sgs, @@ -540,7 +541,6 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, bool premapped, gfp_t gfp) { - struct vring_virtqueue *vq = to_vvq(_vq); struct vring_desc_extra *extra; struct scatterlist *sg; struct vring_desc *desc; @@ -565,7 +565,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, head = vq->free_head; if (virtqueue_use_indirect(vq, total_sg)) - desc = alloc_indirect_split(_vq, total_sg, gfp); + desc = alloc_indirect_split(vq, total_sg, gfp); else { desc = NULL; WARN_ON_ONCE(total_sg > vq->split.vring.num && !vq->indirect); @@ -612,7 +612,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, /* Note that we trust indirect descriptor * table since it use stream DMA mapping. */ - i = virtqueue_add_desc_split(_vq, desc, extra, i, addr, len, + i = virtqueue_add_desc_split(vq, desc, extra, i, addr, len, VRING_DESC_F_NEXT, premapped); } @@ -629,14 +629,14 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, /* Note that we trust indirect descriptor * table since it use stream DMA mapping. */ - i = virtqueue_add_desc_split(_vq, desc, extra, i, addr, len, + i = virtqueue_add_desc_split(vq, desc, extra, i, addr, len, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE, premapped); } } /* Last one doesn't continue. 
*/ - desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT); + desc[prev].flags &= cpu_to_virtio16(vq->vq.vdev, ~VRING_DESC_F_NEXT); if (!indirect && vring_need_unmap_buffer(vq, &extra[prev])) vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags &= ~VRING_DESC_F_NEXT; @@ -649,7 +649,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, if (vring_mapping_error(vq, addr)) goto unmap_release; - virtqueue_add_desc_split(_vq, vq->split.vring.desc, + virtqueue_add_desc_split(vq, vq->split.vring.desc, vq->split.desc_extra, head, addr, total_sg * sizeof(struct vring_desc), @@ -675,13 +675,13 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, /* Put entry in available array (but don't update avail->idx until they * do sync). */ avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); - vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); + vq->split.vring.avail->ring[avail] = cpu_to_virtio16(vq->vq.vdev, head); /* Descriptors and available array need to be set before we expose the * new available array entries. */ virtio_wmb(vq->weak_barriers); vq->split.avail_idx_shadow++; - vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, + vq->split.vring.avail->idx = cpu_to_virtio16(vq->vq.vdev, vq->split.avail_idx_shadow); vq->num_added++; @@ -691,7 +691,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, /* This is very unlikely, but theoretically possible. Kick * just in case. */ if (unlikely(vq->num_added == (1 << 16) - 1)) - virtqueue_kick(_vq); + virtqueue_kick(&vq->vq); return 0; @@ -1439,7 +1439,7 @@ unmap_release: return -ENOMEM; } -static inline int virtqueue_add_packed(struct virtqueue *_vq, +static inline int virtqueue_add_packed(struct vring_virtqueue *vq, struct scatterlist *sgs[], unsigned int total_sg, unsigned int out_sgs, @@ -1449,7 +1449,6 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, bool premapped, gfp_t gfp) { - struct vring_virtqueue *vq = to_vvq(_vq); struct vring_packed_desc *desc; struct scatterlist *sg; unsigned int i, n, c, descs_used, err_idx, len; @@ -2261,9 +2260,9 @@ static inline int virtqueue_add(struct virtqueue *_vq, { struct vring_virtqueue *vq = to_vvq(_vq); - return vq->packed_ring ? virtqueue_add_packed(_vq, sgs, total_sg, + return vq->packed_ring ? virtqueue_add_packed(vq, sgs, total_sg, out_sgs, in_sgs, data, ctx, premapped, gfp) : - virtqueue_add_split(_vq, sgs, total_sg, + virtqueue_add_split(vq, sgs, total_sg, out_sgs, in_sgs, data, ctx, premapped, gfp); } From ceea1cd0aef23e44c994127d62f51519ae3566fa Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 30 Dec 2025 14:46:37 +0800 Subject: [PATCH 10/59] virtio: switch to use vring_virtqueue for virtqueue_get variants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Those variants are used internally so let's switch to use vring_virtqueue as parameter to be consistent with other internal virtqueue helpers. Acked-by: Eugenio Pérez Reviewed-by: Xuan Zhuo Signed-off-by: Jason Wang Tested-by: Lei Yang Reviewed-by: Eugenio Pérez Signed-off-by: Michael S. 
Tsirkin Message-Id: <20251230064649.55597-8-jasowang@redhat.com> --- drivers/virtio/virtio_ring.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 5787cb428652..6225ecac3c19 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -816,11 +816,10 @@ static bool more_used_split(const struct vring_virtqueue *vq) return virtqueue_poll_split(vq, vq->last_used_idx); } -static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, +static void *virtqueue_get_buf_ctx_split(struct vring_virtqueue *vq, unsigned int *len, void **ctx) { - struct vring_virtqueue *vq = to_vvq(_vq); void *ret; unsigned int i; u16 last_used; @@ -842,9 +841,9 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, virtio_rmb(vq->weak_barriers); last_used = (vq->last_used_idx & (vq->split.vring.num - 1)); - i = virtio32_to_cpu(_vq->vdev, + i = virtio32_to_cpu(vq->vq.vdev, vq->split.vring.used->ring[last_used].id); - *len = virtio32_to_cpu(_vq->vdev, + *len = virtio32_to_cpu(vq->vq.vdev, vq->split.vring.used->ring[last_used].len); if (unlikely(i >= vq->split.vring.num)) { @@ -866,7 +865,7 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) virtio_store_mb(vq->weak_barriers, &vring_used_event(&vq->split.vring), - cpu_to_virtio16(_vq->vdev, vq->last_used_idx)); + cpu_to_virtio16(vq->vq.vdev, vq->last_used_idx)); LAST_ADD_TIME_INVALID(vq); @@ -1720,11 +1719,10 @@ static bool more_used_packed(const struct vring_virtqueue *vq) return virtqueue_poll_packed(vq, READ_ONCE(vq->last_used_idx)); } -static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq, +static void *virtqueue_get_buf_ctx_packed(struct vring_virtqueue *vq, unsigned int *len, void **ctx) { - struct vring_virtqueue *vq = to_vvq(_vq); u16 last_used, id, last_used_idx; bool used_wrap_counter; void *ret; @@ -2524,8 +2522,8 @@ void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len, { struct vring_virtqueue *vq = to_vvq(_vq); - return vq->packed_ring ? virtqueue_get_buf_ctx_packed(_vq, len, ctx) : - virtqueue_get_buf_ctx_split(_vq, len, ctx); + return vq->packed_ring ? virtqueue_get_buf_ctx_packed(vq, len, ctx) : + virtqueue_get_buf_ctx_split(vq, len, ctx); } EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx); From 74847cb5731760b22ace8e2fe97a330aa0162d1e Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 30 Dec 2025 14:46:38 +0800 Subject: [PATCH 11/59] virtio_ring: switch to use vring_virtqueue for enable_cb_prepare variants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Those variants are used internally so let's switch to use vring_virtqueue as parameter to be consistent with other internal virtqueue helpers. Acked-by: Eugenio Pérez Reviewed-by: Xuan Zhuo Signed-off-by: Jason Wang Tested-by: Lei Yang Reviewed-by: Eugenio Pérez Signed-off-by: Michael S. 
Tsirkin Message-Id: <20251230064649.55597-9-jasowang@redhat.com> --- drivers/virtio/virtio_ring.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 6225ecac3c19..435121b1403c 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -897,9 +897,8 @@ static void virtqueue_disable_cb_split(struct virtqueue *_vq) } } -static unsigned int virtqueue_enable_cb_prepare_split(struct virtqueue *_vq) +static unsigned int virtqueue_enable_cb_prepare_split(struct vring_virtqueue *vq) { - struct vring_virtqueue *vq = to_vvq(_vq); u16 last_used_idx; START_USE(vq); @@ -913,10 +912,10 @@ static unsigned int virtqueue_enable_cb_prepare_split(struct virtqueue *_vq) vq->split.avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT; if (!vq->event) vq->split.vring.avail->flags = - cpu_to_virtio16(_vq->vdev, + cpu_to_virtio16(vq->vq.vdev, vq->split.avail_flags_shadow); } - vring_used_event(&vq->split.vring) = cpu_to_virtio16(_vq->vdev, + vring_used_event(&vq->split.vring) = cpu_to_virtio16(vq->vq.vdev, last_used_idx = vq->last_used_idx); END_USE(vq); return last_used_idx; @@ -1806,10 +1805,8 @@ static void virtqueue_disable_cb_packed(struct virtqueue *_vq) } } -static unsigned int virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq) +static unsigned int virtqueue_enable_cb_prepare_packed(struct vring_virtqueue *vq) { - struct vring_virtqueue *vq = to_vvq(_vq); - START_USE(vq); /* @@ -2571,8 +2568,8 @@ unsigned int virtqueue_enable_cb_prepare(struct virtqueue *_vq) if (vq->event_triggered) vq->event_triggered = false; - return vq->packed_ring ? virtqueue_enable_cb_prepare_packed(_vq) : - virtqueue_enable_cb_prepare_split(_vq); + return vq->packed_ring ? virtqueue_enable_cb_prepare_packed(vq) : + virtqueue_enable_cb_prepare_split(vq); } EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare); From 62fa22cdab7bc07f82e3f5080d7bf35f5f1bf676 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 30 Dec 2025 14:46:39 +0800 Subject: [PATCH 12/59] virtio_ring: use vring_virtqueue for enable_cb_delayed variants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Those variants are used internally so let's switch to use vring_virtqueue as parameter to be consistent with other internal virtqueue helpers. Acked-by: Eugenio Pérez Reviewed-by: Xuan Zhuo Signed-off-by: Jason Wang Reviewed-by: Eugenio Pérez Signed-off-by: Michael S. 
Tsirkin Message-Id: <20251230064649.55597-10-jasowang@redhat.com> --- drivers/virtio/virtio_ring.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 435121b1403c..84a7a59813f5 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -921,9 +921,8 @@ static unsigned int virtqueue_enable_cb_prepare_split(struct vring_virtqueue *vq return last_used_idx; } -static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq) +static bool virtqueue_enable_cb_delayed_split(struct vring_virtqueue *vq) { - struct vring_virtqueue *vq = to_vvq(_vq); u16 bufs; START_USE(vq); @@ -937,7 +936,7 @@ static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq) vq->split.avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT; if (!vq->event) vq->split.vring.avail->flags = - cpu_to_virtio16(_vq->vdev, + cpu_to_virtio16(vq->vq.vdev, vq->split.avail_flags_shadow); } /* TODO: tune this threshold */ @@ -945,9 +944,9 @@ static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq) virtio_store_mb(vq->weak_barriers, &vring_used_event(&vq->split.vring), - cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs)); + cpu_to_virtio16(vq->vq.vdev, vq->last_used_idx + bufs)); - if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->split.vring.used->idx) + if (unlikely((u16)(virtio16_to_cpu(vq->vq.vdev, vq->split.vring.used->idx) - vq->last_used_idx) > bufs)) { END_USE(vq); return false; @@ -1836,9 +1835,8 @@ static unsigned int virtqueue_enable_cb_prepare_packed(struct vring_virtqueue *v return vq->last_used_idx; } -static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq) +static bool virtqueue_enable_cb_delayed_packed(struct vring_virtqueue *vq) { - struct vring_virtqueue *vq = to_vvq(_vq); u16 used_idx, wrap_counter, last_used_idx; u16 bufs; @@ -2634,8 +2632,8 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) if (vq->event_triggered) data_race(vq->event_triggered = false); - return vq->packed_ring ? virtqueue_enable_cb_delayed_packed(_vq) : - virtqueue_enable_cb_delayed_split(_vq); + return vq->packed_ring ? virtqueue_enable_cb_delayed_packed(vq) : + virtqueue_enable_cb_delayed_split(vq); } EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed); From 7e81017673fefa3726b60ca0a9999e621e99ff27 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 30 Dec 2025 14:46:40 +0800 Subject: [PATCH 13/59] virtio_ring: switch to use vring_virtqueue for disable_cb variants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Those variants are used internally so let's switch to use vring_virtqueue as parameter to be consistent with other internal virtqueue helpers. Acked-by: Eugenio Pérez Reviewed-by: Xuan Zhuo Signed-off-by: Jason Wang Tested-by: Lei Yang Reviewed-by: Eugenio Pérez Signed-off-by: Michael S. 
Tsirkin Message-Id: <20251230064649.55597-11-jasowang@redhat.com> --- drivers/virtio/virtio_ring.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 84a7a59813f5..113f640e42b4 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -873,10 +873,8 @@ static void *virtqueue_get_buf_ctx_split(struct vring_virtqueue *vq, return ret; } -static void virtqueue_disable_cb_split(struct virtqueue *_vq) +static void virtqueue_disable_cb_split(struct vring_virtqueue *vq) { - struct vring_virtqueue *vq = to_vvq(_vq); - if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) { vq->split.avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT; @@ -892,7 +890,7 @@ static void virtqueue_disable_cb_split(struct virtqueue *_vq) vring_used_event(&vq->split.vring) = 0x0; else vq->split.vring.avail->flags = - cpu_to_virtio16(_vq->vdev, + cpu_to_virtio16(vq->vq.vdev, vq->split.avail_flags_shadow); } } @@ -1785,10 +1783,8 @@ static void *virtqueue_get_buf_ctx_packed(struct vring_virtqueue *vq, return ret; } -static void virtqueue_disable_cb_packed(struct virtqueue *_vq) +static void virtqueue_disable_cb_packed(struct vring_virtqueue *vq) { - struct vring_virtqueue *vq = to_vvq(_vq); - if (vq->packed.event_flags_shadow != VRING_PACKED_EVENT_FLAG_DISABLE) { vq->packed.event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE; @@ -2541,9 +2537,9 @@ void virtqueue_disable_cb(struct virtqueue *_vq) struct vring_virtqueue *vq = to_vvq(_vq); if (vq->packed_ring) - virtqueue_disable_cb_packed(_vq); + virtqueue_disable_cb_packed(vq); else - virtqueue_disable_cb_split(_vq); + virtqueue_disable_cb_split(vq); } EXPORT_SYMBOL_GPL(virtqueue_disable_cb); From f2ad9d6b4eed59f880b1fcaf28e2ddaeb292b2df Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 30 Dec 2025 14:46:41 +0800 Subject: [PATCH 14/59] virtio_ring: switch to use vring_virtqueue for detach_unused_buf variants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Those variants are used internally so let's switch to use vring_virtqueue as parameter to be consistent with other internal virtqueue helpers. Acked-by: Eugenio Pérez Reviewed-by: Xuan Zhuo Signed-off-by: Jason Wang Tested-by: Lei Yang Reviewed-by: Eugenio Pérez Signed-off-by: Michael S. 
Tsirkin Message-Id: <20251230064649.55597-12-jasowang@redhat.com> --- drivers/virtio/virtio_ring.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 113f640e42b4..09ebcb4e17e2 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -954,9 +954,8 @@ static bool virtqueue_enable_cb_delayed_split(struct vring_virtqueue *vq) return true; } -static void *virtqueue_detach_unused_buf_split(struct virtqueue *_vq) +static void *virtqueue_detach_unused_buf_split(struct vring_virtqueue *vq) { - struct vring_virtqueue *vq = to_vvq(_vq); unsigned int i; void *buf; @@ -969,7 +968,7 @@ buf = vq->split.desc_state[i].data; detach_buf_split(vq, i, NULL); vq->split.avail_idx_shadow--; - vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, + vq->split.vring.avail->idx = cpu_to_virtio16(vq->vq.vdev, vq->split.avail_idx_shadow); END_USE(vq); return buf; @@ -1891,9 +1890,8 @@ static bool virtqueue_enable_cb_delayed_packed(struct vring_virtqueue *vq) return true; } -static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq) +static void *virtqueue_detach_unused_buf_packed(struct vring_virtqueue *vq) { - struct vring_virtqueue *vq = to_vvq(_vq); unsigned int i; void *buf; @@ -2645,8 +2643,8 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); - return vq->packed_ring ? virtqueue_detach_unused_buf_packed(_vq) : - virtqueue_detach_unused_buf_split(_vq); + return vq->packed_ring ? virtqueue_detach_unused_buf_packed(vq) : + virtqueue_detach_unused_buf_split(vq); } EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf); From eff8b47d2832150f96ab706562cef5a754a0d625 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 30 Dec 2025 14:46:42 +0800 Subject: [PATCH 15/59] virtio_ring: switch to use unsigned int for virtqueue_poll_packed() Switch to use unsigned int for virtqueue_poll_packed() to match virtqueue_poll() and virtqueue_poll_split() and to ease the abstraction of the virtqueue ops. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin Message-Id: <20251230064649.55597-13-jasowang@redhat.com> --- drivers/virtio/virtio_ring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 09ebcb4e17e2..1832ea7982a6 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -1698,7 +1698,7 @@ static inline bool is_used_desc_packed(const struct vring_virtqueue *vq, return avail == used && used == used_wrap_counter; } -static bool virtqueue_poll_packed(const struct vring_virtqueue *vq, u16 off_wrap) +static bool virtqueue_poll_packed(const struct vring_virtqueue *vq, + unsigned int off_wrap) { bool wrap_counter; u16 used_idx; From 1208473f9b5eb273e787bb1b07a4b2a323692a10 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 30 Dec 2025 14:46:43 +0800 Subject: [PATCH 16/59] virtio_ring: introduce virtqueue ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces virtqueue ops, a set of callbacks that are called depending on the queue layout or features. This helps to avoid branches for split/packed and will ease future implementations such as in-order. Note that in order to eliminate the indirect calls, this patch uses a global array of const ops, allowing the compiler to avoid indirect branches.
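The pattern can be sketched in standalone C (illustrative names only, not the driver's actual types) to show why this avoids indirect branches: each ops table is a const global and the switch enumerates every layout, mirroring the VIRTQUEUE_CALL macro added below, so the compiler can resolve each callback to a direct call:

    struct ops {
            int (*work)(int x);
    };

    static int work_split(int x)  { return x + 1; }
    static int work_packed(int x) { return x * 2; }

    static const struct ops split_ops  = { .work = work_split };
    static const struct ops packed_ops = { .work = work_packed };

    enum layout { LAYOUT_SPLIT, LAYOUT_PACKED };

    static int dispatch(enum layout layout, int x)
    {
            switch (layout) {
            case LAYOUT_SPLIT:
                    return split_ops.work(x);  /* folds to a direct call */
            case LAYOUT_PACKED:
                    return packed_ops.work(x); /* folds to a direct call */
            }
            return -1;
    }

With retpolines enabled, those direct calls bypass the indirect-branch thunk entirely.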
Tested with CONFIG_MITIGATION_RETPOLINE, no performance differences were noticed. Acked-by: Eugenio Pérez Suggested-by: Michael S. Tsirkin Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin Message-Id: <20251230064649.55597-14-jasowang@redhat.com> --- drivers/virtio/virtio_ring.c | 171 ++++++++++++++++++++++++++--------- 1 file changed, 127 insertions(+), 44 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 1832ea7982a6..d0904ac0aa93 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -67,6 +67,11 @@ #define LAST_ADD_TIME_INVALID(vq) #endif +enum vq_layout { + VQ_LAYOUT_SPLIT = 0, + VQ_LAYOUT_PACKED, +}; + struct vring_desc_state_split { void *data; /* Data for callback. */ @@ -159,12 +164,29 @@ struct vring_virtqueue_packed { size_t event_size_in_bytes; }; +struct vring_virtqueue; + +struct virtqueue_ops { + int (*add)(struct vring_virtqueue *vq, struct scatterlist *sgs[], + unsigned int total_sg, unsigned int out_sgs, + unsigned int in_sgs, void *data, + void *ctx, bool premapped, gfp_t gfp); + void *(*get)(struct vring_virtqueue *vq, unsigned int *len, void **ctx); + bool (*kick_prepare)(struct vring_virtqueue *vq); + void (*disable_cb)(struct vring_virtqueue *vq); + bool (*enable_cb_delayed)(struct vring_virtqueue *vq); + unsigned int (*enable_cb_prepare)(struct vring_virtqueue *vq); + bool (*poll)(const struct vring_virtqueue *vq, + unsigned int last_used_idx); + void *(*detach_unused_buf)(struct vring_virtqueue *vq); + bool (*more_used)(const struct vring_virtqueue *vq); + int (*resize)(struct vring_virtqueue *vq, u32 num); + void (*reset)(struct vring_virtqueue *vq); +}; + struct vring_virtqueue { struct virtqueue vq; - /* Is this a packed ring? */ - bool packed_ring; - /* Is DMA API used? */ bool use_map_api; @@ -180,6 +202,8 @@ struct vring_virtqueue { /* Host publishes avail event idx */ bool event; + enum vq_layout layout; + /* Head of free buffer list. */ unsigned int free_head; /* Number we've added since last sync. */ @@ -231,6 +255,12 @@ static void vring_free(struct virtqueue *_vq); #define to_vvq(_vq) container_of_const(_vq, struct vring_virtqueue, vq) + +static inline bool virtqueue_is_packed(const struct vring_virtqueue *vq) +{ + return vq->layout == VQ_LAYOUT_PACKED; +} + static bool virtqueue_use_indirect(const struct vring_virtqueue *vq, unsigned int total_sg) { @@ -433,7 +463,7 @@ static void virtqueue_init(struct vring_virtqueue *vq, u32 num) { vq->vq.num_free = num; - if (vq->packed_ring) + if (virtqueue_is_packed(vq)) vq->last_used_idx = 0 | (1 << VRING_PACKED_EVENT_F_WRAP_CTR); else vq->last_used_idx = 0; @@ -1121,6 +1151,8 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split, return 0; } +static const struct virtqueue_ops split_ops; + static struct virtqueue *__vring_new_virtqueue_split(unsigned int index, struct vring_virtqueue_split *vring_split, struct virtio_device *vdev, @@ -1138,7 +1170,7 @@ static struct virtqueue *__vring_new_virtqueue_split(unsigned int index, if (!vq) return NULL; - vq->packed_ring = false; + vq->layout = VQ_LAYOUT_SPLIT; vq->vq.callback = callback; vq->vq.vdev = vdev; vq->vq.name = name; @@ -2071,11 +2103,12 @@ static void virtqueue_reset_packed(struct vring_virtqueue *vq) /* we need to reset the desc.flags. 
For more, see is_used_desc_packed() */ memset(vq->packed.vring.desc, 0, vq->packed.ring_size_in_bytes); - virtqueue_init(vq, vq->packed.vring.num); virtqueue_vring_init_packed(&vq->packed, !!vq->vq.callback); } +static const struct virtqueue_ops packed_ops; + static struct virtqueue *__vring_new_virtqueue_packed(unsigned int index, struct vring_virtqueue_packed *vring_packed, struct virtio_device *vdev, @@ -2106,7 +2139,7 @@ static struct virtqueue *__vring_new_virtqueue_packed(unsigned int index, #else vq->broken = false; #endif - vq->packed_ring = true; + vq->layout = VQ_LAYOUT_PACKED; vq->map = map; vq->use_map_api = vring_use_map_api(vdev); @@ -2194,6 +2227,34 @@ err_ring: return -ENOMEM; } +static const struct virtqueue_ops split_ops = { + .add = virtqueue_add_split, + .get = virtqueue_get_buf_ctx_split, + .kick_prepare = virtqueue_kick_prepare_split, + .disable_cb = virtqueue_disable_cb_split, + .enable_cb_delayed = virtqueue_enable_cb_delayed_split, + .enable_cb_prepare = virtqueue_enable_cb_prepare_split, + .poll = virtqueue_poll_split, + .detach_unused_buf = virtqueue_detach_unused_buf_split, + .more_used = more_used_split, + .resize = virtqueue_resize_split, + .reset = virtqueue_reset_split, +}; + +static const struct virtqueue_ops packed_ops = { + .add = virtqueue_add_packed, + .get = virtqueue_get_buf_ctx_packed, + .kick_prepare = virtqueue_kick_prepare_packed, + .disable_cb = virtqueue_disable_cb_packed, + .enable_cb_delayed = virtqueue_enable_cb_delayed_packed, + .enable_cb_prepare = virtqueue_enable_cb_prepare_packed, + .poll = virtqueue_poll_packed, + .detach_unused_buf = virtqueue_detach_unused_buf_packed, + .more_used = more_used_packed, + .resize = virtqueue_resize_packed, + .reset = virtqueue_reset_packed, +}; + static int virtqueue_disable_and_recycle(struct virtqueue *_vq, void (*recycle)(struct virtqueue *vq, void *buf)) { @@ -2236,6 +2297,42 @@ static int virtqueue_enable_after_reset(struct virtqueue *_vq) * Generic functions and exported symbols. */ +#define VIRTQUEUE_CALL(vq, op, ...) \ + ({ \ + typeof(vq) __VIRTQUEUE_CALL_vq = (vq); \ + typeof(split_ops.op(__VIRTQUEUE_CALL_vq, ##__VA_ARGS__)) ret; \ + \ + switch (__VIRTQUEUE_CALL_vq->layout) { \ + case VQ_LAYOUT_SPLIT: \ + ret = split_ops.op(__VIRTQUEUE_CALL_vq, ##__VA_ARGS__); \ + break; \ + case VQ_LAYOUT_PACKED: \ + ret = packed_ops.op(__VIRTQUEUE_CALL_vq, ##__VA_ARGS__);\ + break; \ + default: \ + BUG(); \ + break; \ + } \ + ret; \ +}) + +#define VOID_VIRTQUEUE_CALL(vq, op, ...) \ + ({ \ + typeof(vq) __VIRTQUEUE_CALL_vq = (vq); \ + \ + switch (__VIRTQUEUE_CALL_vq->layout) { \ + case VQ_LAYOUT_SPLIT: \ + split_ops.op(__VIRTQUEUE_CALL_vq, ##__VA_ARGS__); \ + break; \ + case VQ_LAYOUT_PACKED: \ + packed_ops.op(__VIRTQUEUE_CALL_vq, ##__VA_ARGS__); \ + break; \ + default: \ + BUG(); \ + break; \ + } \ +}) + static inline int virtqueue_add(struct virtqueue *_vq, struct scatterlist *sgs[], unsigned int total_sg, @@ -2248,10 +2345,9 @@ static inline int virtqueue_add(struct virtqueue *_vq, { struct vring_virtqueue *vq = to_vvq(_vq); - return vq->packed_ring ? virtqueue_add_packed(vq, sgs, total_sg, - out_sgs, in_sgs, data, ctx, premapped, gfp) : - virtqueue_add_split(vq, sgs, total_sg, - out_sgs, in_sgs, data, ctx, premapped, gfp); + return VIRTQUEUE_CALL(vq, add, sgs, total_sg, + out_sgs, in_sgs, data, + ctx, premapped, gfp); } /** @@ -2441,8 +2537,7 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); - return vq->packed_ring ? 
virtqueue_kick_prepare_packed(vq) : - virtqueue_kick_prepare_split(vq); + return VIRTQUEUE_CALL(vq, kick_prepare); } EXPORT_SYMBOL_GPL(virtqueue_kick_prepare); @@ -2512,8 +2607,7 @@ void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len, { struct vring_virtqueue *vq = to_vvq(_vq); - return vq->packed_ring ? virtqueue_get_buf_ctx_packed(vq, len, ctx) : - virtqueue_get_buf_ctx_split(vq, len, ctx); + return VIRTQUEUE_CALL(vq, get, len, ctx); } EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx); @@ -2535,10 +2629,7 @@ void virtqueue_disable_cb(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); - if (vq->packed_ring) - virtqueue_disable_cb_packed(vq); - else - virtqueue_disable_cb_split(vq); + VOID_VIRTQUEUE_CALL(vq, disable_cb); } EXPORT_SYMBOL_GPL(virtqueue_disable_cb); @@ -2561,8 +2652,7 @@ unsigned int virtqueue_enable_cb_prepare(struct virtqueue *_vq) if (vq->event_triggered) vq->event_triggered = false; - return vq->packed_ring ? virtqueue_enable_cb_prepare_packed(vq) : - virtqueue_enable_cb_prepare_split(vq); + return VIRTQUEUE_CALL(vq, enable_cb_prepare); } EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare); @@ -2583,8 +2673,8 @@ bool virtqueue_poll(struct virtqueue *_vq, unsigned int last_used_idx) return false; virtio_mb(vq->weak_barriers); - return vq->packed_ring ? virtqueue_poll_packed(vq, last_used_idx) : - virtqueue_poll_split(vq, last_used_idx); + + return VIRTQUEUE_CALL(vq, poll, last_used_idx); } EXPORT_SYMBOL_GPL(virtqueue_poll); @@ -2627,8 +2717,7 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) if (vq->event_triggered) data_race(vq->event_triggered = false); - return vq->packed_ring ? virtqueue_enable_cb_delayed_packed(vq) : - virtqueue_enable_cb_delayed_split(vq); + return VIRTQUEUE_CALL(vq, enable_cb_delayed); } EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed); @@ -2644,14 +2733,13 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); - return vq->packed_ring ? virtqueue_detach_unused_buf_packed(vq) : - virtqueue_detach_unused_buf_split(vq); + return VIRTQUEUE_CALL(vq, detach_unused_buf); } EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf); static inline bool more_used(const struct vring_virtqueue *vq) { - return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq); + return VIRTQUEUE_CALL(vq, more_used); } /** @@ -2781,7 +2869,7 @@ int virtqueue_resize(struct virtqueue *_vq, u32 num, if (!num) return -EINVAL; - if ((vq->packed_ring ? 
vq->packed.vring.num : vq->split.vring.num) == num) + if (virtqueue_get_vring_size(_vq) == num) return 0; err = virtqueue_disable_and_recycle(_vq, recycle); @@ -2790,10 +2878,7 @@ int virtqueue_resize(struct virtqueue *_vq, u32 num, if (recycle_done) recycle_done(_vq); - if (vq->packed_ring) - err = virtqueue_resize_packed(vq, num); - else - err = virtqueue_resize_split(vq, num); + err = VIRTQUEUE_CALL(vq, resize, num); err_reset = virtqueue_enable_after_reset(_vq); if (err_reset) @@ -2831,10 +2916,7 @@ int virtqueue_reset(struct virtqueue *_vq, if (recycle_done) recycle_done(_vq); - if (vq->packed_ring) - virtqueue_reset_packed(vq); - else - virtqueue_reset_split(vq); + VOID_VIRTQUEUE_CALL(vq, reset); return virtqueue_enable_after_reset(_vq); } @@ -2877,7 +2959,7 @@ static void vring_free(struct virtqueue *_vq) struct vring_virtqueue *vq = to_vvq(_vq); if (vq->we_own_ring) { - if (vq->packed_ring) { + if (virtqueue_is_packed(vq)) { vring_free_queue(vq->vq.vdev, vq->packed.ring_size_in_bytes, vq->packed.vring.desc, @@ -2906,7 +2988,7 @@ static void vring_free(struct virtqueue *_vq) vq->map); } } - if (!vq->packed_ring) { + if (!virtqueue_is_packed(vq)) { kfree(vq->split.desc_state); kfree(vq->split.desc_extra); } @@ -2931,7 +3013,7 @@ u32 vring_notification_data(struct virtqueue *_vq) struct vring_virtqueue *vq = to_vvq(_vq); u16 next; - if (vq->packed_ring) + if (virtqueue_is_packed(vq)) next = (vq->packed.next_avail_idx & ~(-(1 << VRING_PACKED_EVENT_F_WRAP_CTR))) | vq->packed.avail_wrap_counter << @@ -2984,7 +3066,8 @@ unsigned int virtqueue_get_vring_size(const struct virtqueue *_vq) const struct vring_virtqueue *vq = to_vvq(_vq); - return vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num; + return virtqueue_is_packed(vq) ? vq->packed.vring.num : + vq->split.vring.num; } EXPORT_SYMBOL_GPL(virtqueue_get_vring_size); @@ -3067,7 +3150,7 @@ dma_addr_t virtqueue_get_desc_addr(const struct virtqueue *_vq) BUG_ON(!vq->we_own_ring); - if (vq->packed_ring) + if (virtqueue_is_packed(vq)) return vq->packed.ring_dma_addr; return vq->split.queue_dma_addr; @@ -3080,7 +3163,7 @@ dma_addr_t virtqueue_get_avail_addr(const struct virtqueue *_vq) BUG_ON(!vq->we_own_ring); - if (vq->packed_ring) + if (virtqueue_is_packed(vq)) return vq->packed.driver_event_dma_addr; return vq->split.queue_dma_addr + @@ -3094,7 +3177,7 @@ dma_addr_t virtqueue_get_used_addr(const struct virtqueue *_vq) BUG_ON(!vq->we_own_ring); - if (vq->packed_ring) + if (virtqueue_is_packed(vq)) return vq->packed.device_event_dma_addr; return vq->split.queue_dma_addr + From 03f05c4eeb7bc5019deb25f7415a7af8dc3fdd3f Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 30 Dec 2025 14:46:44 +0800 Subject: [PATCH 17/59] virtio_ring: determine descriptor flags at one time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's determine the last descriptor by counting the number of sg. This would be consistent with packed virtqueue implementation and ease the future in-order implementation. Acked-by: Eugenio Pérez Reviewed-by: Xuan Zhuo Signed-off-by: Jason Wang Signed-off-by: Michael S. 
Tsirkin Message-Id: <20251230064649.55597-15-jasowang@redhat.com> --- drivers/virtio/virtio_ring.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index d0904ac0aa93..e55b26a03037 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -574,7 +574,7 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq, struct vring_desc_extra *extra; struct scatterlist *sg; struct vring_desc *desc; - unsigned int i, n, avail, descs_used, prev, err_idx; + unsigned int i, n, avail, descs_used, err_idx, sg_count = 0; int head; bool indirect; @@ -634,42 +634,40 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq, for (sg = sgs[n]; sg; sg = sg_next(sg)) { dma_addr_t addr; u32 len; + u16 flags = 0; + + if (++sg_count != total_sg) + flags |= VRING_DESC_F_NEXT; if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr, &len, premapped)) goto unmap_release; - prev = i; /* Note that we trust indirect descriptor * table since it use stream DMA mapping. */ - i = virtqueue_add_desc_split(vq, desc, extra, i, addr, len, - VRING_DESC_F_NEXT, - premapped); + i = virtqueue_add_desc_split(vq, desc, extra, i, addr, + len, flags, premapped); } } for (; n < (out_sgs + in_sgs); n++) { for (sg = sgs[n]; sg; sg = sg_next(sg)) { dma_addr_t addr; u32 len; + u16 flags = VRING_DESC_F_WRITE; + + if (++sg_count != total_sg) + flags |= VRING_DESC_F_NEXT; if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr, &len, premapped)) goto unmap_release; - prev = i; /* Note that we trust indirect descriptor * table since it use stream DMA mapping. */ - i = virtqueue_add_desc_split(vq, desc, extra, i, addr, len, - VRING_DESC_F_NEXT | - VRING_DESC_F_WRITE, - premapped); + i = virtqueue_add_desc_split(vq, desc, extra, i, addr, + len, flags, premapped); } } - /* Last one doesn't continue. */ - desc[prev].flags &= cpu_to_virtio16(vq->vq.vdev, ~VRING_DESC_F_NEXT); - if (!indirect && vring_need_unmap_buffer(vq, &extra[prev])) - vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags &= - ~VRING_DESC_F_NEXT; if (indirect) { /* Now that the indirect table is filled in, map it. */ From c623106c79c811816614dcb687ed5d08b25d5fe5 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 30 Dec 2025 14:46:45 +0800 Subject: [PATCH 18/59] virtio_ring: factor out core logic of buffer detaching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Factor out core logic of buffer detaching and leave the free list management to the caller so in_order can just call the core logic. Acked-by: Eugenio Pérez Reviewed-by: Xuan Zhuo Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin Message-Id: <20251230064649.55597-16-jasowang@redhat.com> --- drivers/virtio/virtio_ring.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index e55b26a03037..27f69859ccf3 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -1665,8 +1665,8 @@ out: return needs_kick; } -static void detach_buf_packed(struct vring_virtqueue *vq, - unsigned int id, void **ctx) +static void detach_buf_packed_in_order(struct vring_virtqueue *vq, + unsigned int id, void **ctx) { struct vring_desc_state_packed *state = NULL; struct vring_packed_desc *desc; @@ -1677,8 +1677,6 @@ static void detach_buf_packed(struct vring_virtqueue *vq, /* Clear data ptr. 
*/ state->data = NULL; - vq->packed.desc_extra[state->last].next = vq->free_head; - vq->free_head = id; vq->vq.num_free += state->num; if (unlikely(vq->use_map_api)) { @@ -1715,6 +1713,17 @@ static void detach_buf_packed(struct vring_virtqueue *vq, } } +static void detach_buf_packed(struct vring_virtqueue *vq, + unsigned int id, void **ctx) +{ + struct vring_desc_state_packed *state = &vq->packed.desc_state[id]; + + vq->packed.desc_extra[state->last].next = vq->free_head; + vq->free_head = id; + + detach_buf_packed_in_order(vq, id, ctx); +} + static inline bool is_used_desc_packed(const struct vring_virtqueue *vq, u16 idx, bool used_wrap_counter) { From fa56d17b9241394aaa77ee622b72a1b765a48d6e Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 30 Dec 2025 14:46:46 +0800 Subject: [PATCH 19/59] virtio_ring: factor out core logic for updating last_used_idx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Factor out the core logic for updating last_used_idx to be reused by the packed in order implementation. Acked-by: Eugenio Pérez Reviewed-by: Xuan Zhuo Signed-off-by: Jason Wang Tested-by: Lei Yang Reviewed-by: Eugenio Pérez Signed-off-by: Michael S. Tsirkin Message-Id: <20251230064649.55597-17-jasowang@redhat.com> --- drivers/virtio/virtio_ring.c | 43 +++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 27f69859ccf3..3389aad6f5a8 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -1754,6 +1754,30 @@ static bool more_used_packed(const struct vring_virtqueue *vq) return virtqueue_poll_packed(vq, READ_ONCE(vq->last_used_idx)); } +static void update_last_used_idx_packed(struct vring_virtqueue *vq, + u16 id, u16 last_used, + u16 used_wrap_counter) +{ + last_used += vq->packed.desc_state[id].num; + if (unlikely(last_used >= vq->packed.vring.num)) { + last_used -= vq->packed.vring.num; + used_wrap_counter ^= 1; + } + + last_used = (last_used | (used_wrap_counter << VRING_PACKED_EVENT_F_WRAP_CTR)); + WRITE_ONCE(vq->last_used_idx, last_used); + + /* + * If we expect an interrupt for the next entry, tell host + * by writing event index and flush out the write before + * the read in the next get_buf call. + */ + if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DESC) + virtio_store_mb(vq->weak_barriers, + &vq->packed.vring.driver->off_wrap, + cpu_to_le16(vq->last_used_idx)); +} + static void *virtqueue_get_buf_ctx_packed(struct vring_virtqueue *vq, unsigned int *len, void **ctx) @@ -1797,24 +1821,7 @@ static void *virtqueue_get_buf_ctx_packed(struct vring_virtqueue *vq, ret = vq->packed.desc_state[id].data; detach_buf_packed(vq, id, ctx); - last_used += vq->packed.desc_state[id].num; - if (unlikely(last_used >= vq->packed.vring.num)) { - last_used -= vq->packed.vring.num; - used_wrap_counter ^= 1; - } - - last_used = (last_used | (used_wrap_counter << VRING_PACKED_EVENT_F_WRAP_CTR)); - WRITE_ONCE(vq->last_used_idx, last_used); - - /* - * If we expect an interrupt for the next entry, tell host - * by writing event index and flush out the write before - * the read in the next get_buf call. 
- */ - if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DESC) - virtio_store_mb(vq->weak_barriers, - &vq->packed.vring.driver->off_wrap, - cpu_to_le16(vq->last_used_idx)); + update_last_used_idx_packed(vq, id, last_used, used_wrap_counter); LAST_ADD_TIME_INVALID(vq); From 9dc6b944f16c0904331903ba0ec36e558e1a3537 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 30 Dec 2025 14:46:47 +0800 Subject: [PATCH 20/59] virtio_ring: factor out split indirect detaching logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Factor out the split indirect descriptor detaching logic in order to allow it to be reused by the in order support. Acked-by: Eugenio Pérez Reviewed-by: Xuan Zhuo Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin Message-Id: <20251230064649.55597-18-jasowang@redhat.com> --- drivers/virtio/virtio_ring.c | 62 ++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 3389aad6f5a8..c0eaa907c67b 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -775,11 +775,41 @@ static bool virtqueue_kick_prepare_split(struct vring_virtqueue *vq) return needs_kick; } +static void detach_indirect_split(struct vring_virtqueue *vq, + unsigned int head) +{ + struct vring_desc_extra *extra = vq->split.desc_extra; + struct vring_desc *indir_desc = vq->split.desc_state[head].indir_desc; + unsigned int j; + u32 len, num; + + /* Free the indirect table, if any, now that it's unmapped. */ + if (!indir_desc) + return; + len = vq->split.desc_extra[head].len; + + BUG_ON(!(vq->split.desc_extra[head].flags & + VRING_DESC_F_INDIRECT)); + BUG_ON(len == 0 || len % sizeof(struct vring_desc)); + + num = len / sizeof(struct vring_desc); + + extra = (struct vring_desc_extra *)&indir_desc[num]; + + if (vq->use_map_api) { + for (j = 0; j < num; j++) + vring_unmap_one_split(vq, &extra[j]); + } + + kfree(indir_desc); + vq->split.desc_state[head].indir_desc = NULL; +} + static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, void **ctx) { struct vring_desc_extra *extra; - unsigned int i, j; + unsigned int i; __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT); /* Clear data ptr. */ @@ -802,34 +832,10 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, /* Plus final descriptor */ vq->vq.num_free++; - if (vq->indirect) { - struct vring_desc *indir_desc = - vq->split.desc_state[head].indir_desc; - u32 len, num; - - /* Free the indirect table, if any, now that it's unmapped. 
*/ - if (!indir_desc) - return; - len = vq->split.desc_extra[head].len; - - BUG_ON(!(vq->split.desc_extra[head].flags & - VRING_DESC_F_INDIRECT)); - BUG_ON(len == 0 || len % sizeof(struct vring_desc)); - - num = len / sizeof(struct vring_desc); - - extra = (struct vring_desc_extra *)&indir_desc[num]; - - if (vq->use_map_api) { - for (j = 0; j < num; j++) - vring_unmap_one_split(vq, &extra[j]); - } - - kfree(indir_desc); - vq->split.desc_state[head].indir_desc = NULL; - } else if (ctx) { + if (vq->indirect) + detach_indirect_split(vq, head); + else if (ctx) *ctx = vq->split.desc_state[head].indir_desc; - } } static bool virtqueue_poll_split(const struct vring_virtqueue *vq, From 519b206e30a37f16cfa88a2f6a508642f7d8fd0c Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 30 Dec 2025 14:46:48 +0800 Subject: [PATCH 21/59] virtio_ring: factor out split detaching logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch factors out the split core detaching logic that could be reused by in order feature into a dedicated function. Acked-by: Eugenio Pérez Reviewed-by: Xuan Zhuo Signed-off-by: Jason Wang Reviewed-by: Eugenio Pérez Signed-off-by: Michael S. Tsirkin Message-Id: <20251230064649.55597-19-jasowang@redhat.com> --- drivers/virtio/virtio_ring.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index c0eaa907c67b..a0fa81620333 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -805,8 +805,9 @@ static void detach_indirect_split(struct vring_virtqueue *vq, vq->split.desc_state[head].indir_desc = NULL; } -static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, - void **ctx) +static unsigned detach_buf_split_in_order(struct vring_virtqueue *vq, + unsigned int head, + void **ctx) { struct vring_desc_extra *extra; unsigned int i; @@ -826,8 +827,6 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, } vring_unmap_one_split(vq, &extra[i]); - extra[i].next = vq->free_head; - vq->free_head = head; /* Plus final descriptor */ vq->vq.num_free++; @@ -836,6 +835,17 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, detach_indirect_split(vq, head); else if (ctx) *ctx = vq->split.desc_state[head].indir_desc; + + return i; +} + +static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, + void **ctx) +{ + unsigned int i = detach_buf_split_in_order(vq, head, ctx); + + vq->split.desc_extra[i].next = vq->free_head; + vq->free_head = head; } static bool virtqueue_poll_split(const struct vring_virtqueue *vq, From f6a15d85498614baf121f7e207e6c55524f175a4 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 30 Dec 2025 14:46:49 +0800 Subject: [PATCH 22/59] virtio_ring: add in order support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch implements in order support for both split virtqueue and packed virtqueue. 
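In short, VIRTIO_F_IN_ORDER lets the driver assume the device uses buffers in the order they were made available, so a single used entry can complete a whole batch of buffers and the driver no longer has to read one used entry per buffer. Below is a minimal, self-contained sketch of that batching logic (illustrative only, not kernel code: sketch_vq, in_len and INVALID_ID are hypothetical names, and each buffer is assumed to occupy a single descriptor; the real patch additionally tracks per-buffer descriptor counts and ring wrap-around):

#include <stdint.h>

#define QSIZE      256u          /* ring size, a power of two */
#define INVALID_ID UINT32_MAX    /* mirrors batch_last.id == UINT_MAX below */

struct sketch_state {
	uint32_t in_len;         /* device-writable length, saved at add time */
	void *data;              /* driver token */
};

struct sketch_vq {
	struct sketch_state state[QSIZE];
	uint32_t last_used;      /* next head expected back, in ring order */
	uint32_t batch_id;       /* id that ends the current used batch */
	uint32_t batch_len;      /* written length reported for that id */
};

/*
 * Complete one buffer. used_id/used_len mimic a single used-ring entry;
 * it is only consulted when a new batch starts, which is the point of
 * in-order: one used entry acknowledges every outstanding buffer up to
 * and including used_id.
 */
static void *sketch_get_buf(struct sketch_vq *vq, uint32_t used_id,
			    uint32_t used_len, uint32_t *out_len)
{
	void *token = vq->state[vq->last_used].data;

	if (vq->batch_id == INVALID_ID) {        /* start a new batch */
		vq->batch_id = used_id;
		vq->batch_len = used_len;
	}

	if (vq->last_used == vq->batch_id) {
		/* Batch tail: the device reported the written length. */
		*out_len = vq->batch_len;
		vq->batch_id = INVALID_ID;
	} else {
		/* Skipped entry: use the length saved at add time. */
		*out_len = vq->state[vq->last_used].in_len;
	}

	vq->state[vq->last_used].data = NULL;
	vq->last_used = (vq->last_used + 1) & (QSIZE - 1);
	return token;
}

The gains reported below are consistent with saving these per-buffer used-ring accesses.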
Performance could be gained for the device where the memory access could be expensive (e.g. vhost-net or a real PCI device):

Benchmark with KVM guest:

Vhost-net on the host: (pktgen + XDP_DROP):

       in_order=off | in_order=on | +%
  TX:  4.51Mpps     | 5.30Mpps    | +17%
  RX:  3.47Mpps     | 3.61Mpps    | + 4%

Vhost-user(testpmd) on the host: (pktgen/XDP_DROP):

For split virtqueue:

       in_order=off | in_order=on | +%
  TX:  5.60Mpps     | 5.60Mpps    | +0.0%
  RX:  9.16Mpps     | 9.61Mpps    | +4.9%

For packed virtqueue:

       in_order=off | in_order=on | +%
  TX:  5.60Mpps     | 5.70Mpps    | +1.7%
  RX:  10.6Mpps     | 10.8Mpps    | +1.8%

Benchmark also shows no performance impact for in_order=off for queue sizes of 256 and 1024.

Reviewed-by: Eugenio Pérez
Signed-off-by: Jason Wang
Signed-off-by: Michael S. Tsirkin
Message-Id: <20251230064649.55597-20-jasowang@redhat.com>
---
 drivers/virtio/virtio_ring.c | 448 +++++++++++++++++++++++++++++++++--
 1 file changed, 430 insertions(+), 18 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index a0fa81620333..95e320b23624 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -70,6 +70,8 @@ enum vq_layout { VQ_LAYOUT_SPLIT = 0, VQ_LAYOUT_PACKED, + VQ_LAYOUT_SPLIT_IN_ORDER, + VQ_LAYOUT_PACKED_IN_ORDER, }; struct vring_desc_state_split { @@ -79,6 +81,7 @@ struct vring_desc_state_split { * allocated together. So we won't stress more to the memory allocator. */ struct vring_desc *indir_desc; + u32 total_in_len; }; struct vring_desc_state_packed { @@ -90,6 +93,7 @@ struct vring_desc_state_packed { struct vring_packed_desc *indir_desc; u16 num; /* Descriptor list length. */ u16 last; /* The last desc state in a list. */ + u32 total_in_len; /* In length for the skipped buffer. */ }; struct vring_desc_extra { @@ -204,8 +208,24 @@ struct vring_virtqueue { enum vq_layout layout; - /* Head of free buffer list. */ + /* + * Without IN_ORDER it's the head of free buffer list. With + * IN_ORDER and SPLIT, it's the next available buffer + * index. With IN_ORDER and PACKED, it's unused. + */ unsigned int free_head; + + /* + * With IN_ORDER, once we see an in-order batch, this stores + * this last entry, and until we return the last buffer. + * After this, id is set to UINT_MAX to mark it invalid. + * Unused without IN_ORDER. + */ + struct used_entry { + u32 id; + u32 len; + } batch_last; + /* Number we've added since last sync. */ unsigned int num_added; @@ -217,6 +237,11 @@ */ u16 last_used_idx; + /* With IN_ORDER and SPLIT, last descriptor id we used to + detach buffer. + */ + u16 last_used; + /* Hint for event idx: already triggered no need to disable.
*/ bool event_triggered; @@ -258,7 +283,14 @@ static void vring_free(struct virtqueue *_vq); static inline bool virtqueue_is_packed(const struct vring_virtqueue *vq) { - return vq->layout == VQ_LAYOUT_PACKED; + return vq->layout == VQ_LAYOUT_PACKED || + vq->layout == VQ_LAYOUT_PACKED_IN_ORDER; +} + +static inline bool virtqueue_is_in_order(const struct vring_virtqueue *vq) +{ + return vq->layout == VQ_LAYOUT_SPLIT_IN_ORDER || + vq->layout == VQ_LAYOUT_PACKED_IN_ORDER; } static bool virtqueue_use_indirect(const struct vring_virtqueue *vq, @@ -468,6 +500,8 @@ static void virtqueue_init(struct vring_virtqueue *vq, u32 num) else vq->last_used_idx = 0; + vq->last_used = 0; + vq->event_triggered = false; vq->num_added = 0; @@ -575,6 +609,8 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq, struct scatterlist *sg; struct vring_desc *desc; unsigned int i, n, avail, descs_used, err_idx, sg_count = 0; + /* Total length for in-order */ + unsigned int total_in_len = 0; int head; bool indirect; @@ -666,6 +702,7 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq, */ i = virtqueue_add_desc_split(vq, desc, extra, i, addr, len, flags, premapped); + total_in_len += len; } } @@ -688,7 +725,12 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq, vq->vq.num_free -= descs_used; /* Update free pointer */ - if (indirect) + if (virtqueue_is_in_order(vq)) { + vq->free_head += descs_used; + if (vq->free_head >= vq->split.vring.num) + vq->free_head -= vq->split.vring.num; + vq->split.desc_state[head].total_in_len = total_in_len; + } else if (indirect) vq->free_head = vq->split.desc_extra[head].next; else vq->free_head = i; @@ -860,6 +902,14 @@ static bool more_used_split(const struct vring_virtqueue *vq) return virtqueue_poll_split(vq, vq->last_used_idx); } +static bool more_used_split_in_order(const struct vring_virtqueue *vq) +{ + if (vq->batch_last.id != UINT_MAX) + return true; + + return virtqueue_poll_split(vq, vq->last_used_idx); +} + static void *virtqueue_get_buf_ctx_split(struct vring_virtqueue *vq, unsigned int *len, void **ctx) @@ -917,6 +967,76 @@ static void *virtqueue_get_buf_ctx_split(struct vring_virtqueue *vq, return ret; } +static void *virtqueue_get_buf_ctx_split_in_order(struct vring_virtqueue *vq, + unsigned int *len, + void **ctx) +{ + void *ret; + unsigned int num = vq->split.vring.num; + unsigned int num_free = vq->vq.num_free; + u16 last_used, last_used_idx; + + START_USE(vq); + + if (unlikely(vq->broken)) { + END_USE(vq); + return NULL; + } + + last_used = vq->last_used & (num - 1); + last_used_idx = vq->last_used_idx & (num - 1); + + if (vq->batch_last.id == UINT_MAX) { + if (!more_used_split_in_order(vq)) { + pr_debug("No more buffers in queue\n"); + END_USE(vq); + return NULL; + } + + /* + * Only get used array entries after they have been + * exposed by host. + */ + virtio_rmb(vq->weak_barriers); + + vq->batch_last.id = virtio32_to_cpu(vq->vq.vdev, + vq->split.vring.used->ring[last_used_idx].id); + vq->batch_last.len = virtio32_to_cpu(vq->vq.vdev, + vq->split.vring.used->ring[last_used_idx].len); + } + + if (vq->batch_last.id == last_used) { + vq->batch_last.id = UINT_MAX; + *len = vq->batch_last.len; + } else { + *len = vq->split.desc_state[last_used].total_in_len; + } + + if (unlikely(!vq->split.desc_state[last_used].data)) { + BAD_RING(vq, "id %u is not a head!\n", last_used); + return NULL; + } + + /* detach_buf_split clears data, so grab it now. 
*/ + ret = vq->split.desc_state[last_used].data; + detach_buf_split_in_order(vq, last_used, ctx); + + vq->last_used_idx++; + vq->last_used += (vq->vq.num_free - num_free); + /* If we expect an interrupt for the next entry, tell host + * by writing event index and flush out the write before + * the read in the next get_buf call. */ + if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) + virtio_store_mb(vq->weak_barriers, + &vring_used_event(&vq->split.vring), + cpu_to_virtio16(vq->vq.vdev, vq->last_used_idx)); + + LAST_ADD_TIME_INVALID(vq); + + END_USE(vq); + return ret; +} + static void virtqueue_disable_cb_split(struct vring_virtqueue *vq) { if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) { @@ -1010,7 +1130,10 @@ static void *virtqueue_detach_unused_buf_split(struct vring_virtqueue *vq) continue; /* detach_buf_split clears data, so grab it now. */ buf = vq->split.desc_state[i].data; - detach_buf_split(vq, i, NULL); + if (virtqueue_is_in_order(vq)) + detach_buf_split_in_order(vq, i, NULL); + else + detach_buf_split(vq, i, NULL); vq->split.avail_idx_shadow--; vq->split.vring.avail->idx = cpu_to_virtio16(vq->vq.vdev, vq->split.avail_idx_shadow); @@ -1073,6 +1196,7 @@ static void virtqueue_vring_attach_split(struct vring_virtqueue *vq, /* Put everything in free lists. */ vq->free_head = 0; + vq->batch_last.id = UINT_MAX; } static int vring_alloc_state_extra_split(struct vring_virtqueue_split *vring_split) @@ -1184,7 +1308,6 @@ static struct virtqueue *__vring_new_virtqueue_split(unsigned int index, if (!vq) return NULL; - vq->layout = VQ_LAYOUT_SPLIT; vq->vq.callback = callback; vq->vq.vdev = vdev; vq->vq.name = name; @@ -1204,6 +1327,8 @@ static struct virtqueue *__vring_new_virtqueue_split(unsigned int index, vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && !context; vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX); + vq->layout = virtio_has_feature(vdev, VIRTIO_F_IN_ORDER) ? + VQ_LAYOUT_SPLIT_IN_ORDER : VQ_LAYOUT_SPLIT; if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) vq->weak_barriers = false; @@ -1361,13 +1486,14 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, unsigned int in_sgs, void *data, bool premapped, - gfp_t gfp) + gfp_t gfp, + u16 id) { struct vring_desc_extra *extra; struct vring_packed_desc *desc; struct scatterlist *sg; - unsigned int i, n, err_idx, len; - u16 head, id; + unsigned int i, n, err_idx, len, total_in_len = 0; + u16 head; dma_addr_t addr; head = vq->packed.next_avail_idx; @@ -1385,8 +1511,6 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, } i = 0; - id = vq->free_head; - BUG_ON(id == vq->packed.vring.num); for (n = 0; n < out_sgs + in_sgs; n++) { for (sg = sgs[n]; sg; sg = sg_next(sg)) { @@ -1406,6 +1530,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, extra[i].flags = n < out_sgs ? 0 : VRING_DESC_F_WRITE; } + if (n >= out_sgs) + total_in_len += len; i++; } } @@ -1452,13 +1578,15 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, 1 << VRING_PACKED_DESC_F_USED; } vq->packed.next_avail_idx = n; - vq->free_head = vq->packed.desc_extra[id].next; + if (!virtqueue_is_in_order(vq)) + vq->free_head = vq->packed.desc_extra[id].next; /* Store token and indirect buffer state. 
*/ vq->packed.desc_state[id].num = 1; vq->packed.desc_state[id].data = data; vq->packed.desc_state[id].indir_desc = desc; vq->packed.desc_state[id].last = id; + vq->packed.desc_state[id].total_in_len = total_in_len; vq->num_added += 1; @@ -1511,8 +1639,11 @@ static inline int virtqueue_add_packed(struct vring_virtqueue *vq, BUG_ON(total_sg == 0); if (virtqueue_use_indirect(vq, total_sg)) { + id = vq->free_head; + BUG_ON(id == vq->packed.vring.num); err = virtqueue_add_indirect_packed(vq, sgs, total_sg, out_sgs, - in_sgs, data, premapped, gfp); + in_sgs, data, premapped, + gfp, id); if (err != -ENOMEM) { END_USE(vq); return err; @@ -1633,6 +1764,160 @@ unmap_release: return -EIO; } +static inline int virtqueue_add_packed_in_order(struct vring_virtqueue *vq, + struct scatterlist *sgs[], + unsigned int total_sg, + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + void *ctx, + bool premapped, + gfp_t gfp) +{ + struct vring_packed_desc *desc; + struct scatterlist *sg; + unsigned int i, n, sg_count, err_idx, total_in_len = 0; + __le16 head_flags, flags; + u16 head, avail_used_flags; + bool avail_wrap_counter; + int err; + + START_USE(vq); + + BUG_ON(data == NULL); + BUG_ON(ctx && vq->indirect); + + if (unlikely(vq->broken)) { + END_USE(vq); + return -EIO; + } + + LAST_ADD_TIME_UPDATE(vq); + + BUG_ON(total_sg == 0); + + if (virtqueue_use_indirect(vq, total_sg)) { + err = virtqueue_add_indirect_packed(vq, sgs, total_sg, out_sgs, + in_sgs, data, premapped, gfp, + vq->packed.next_avail_idx); + if (err != -ENOMEM) { + END_USE(vq); + return err; + } + + /* fall back on direct */ + } + + head = vq->packed.next_avail_idx; + avail_used_flags = vq->packed.avail_used_flags; + avail_wrap_counter = vq->packed.avail_wrap_counter; + + WARN_ON_ONCE(total_sg > vq->packed.vring.num && !vq->indirect); + + desc = vq->packed.vring.desc; + i = head; + + if (unlikely(vq->vq.num_free < total_sg)) { + pr_debug("Can't add buf len %i - avail = %i\n", + total_sg, vq->vq.num_free); + END_USE(vq); + return -ENOSPC; + } + + sg_count = 0; + for (n = 0; n < out_sgs + in_sgs; n++) { + for (sg = sgs[n]; sg; sg = sg_next(sg)) { + dma_addr_t addr; + u32 len; + + flags = 0; + if (++sg_count != total_sg) + flags |= cpu_to_le16(VRING_DESC_F_NEXT); + if (n >= out_sgs) + flags |= cpu_to_le16(VRING_DESC_F_WRITE); + + if (vring_map_one_sg(vq, sg, n < out_sgs ? + DMA_TO_DEVICE : DMA_FROM_DEVICE, + &addr, &len, premapped)) + goto unmap_release; + + flags |= cpu_to_le16(vq->packed.avail_used_flags); + + if (i == head) + head_flags = flags; + else + desc[i].flags = flags; + + desc[i].addr = cpu_to_le64(addr); + desc[i].len = cpu_to_le32(len); + desc[i].id = cpu_to_le16(head); + + if (unlikely(vq->use_map_api)) { + vq->packed.desc_extra[i].addr = premapped ? + DMA_MAPPING_ERROR : addr; + vq->packed.desc_extra[i].len = len; + vq->packed.desc_extra[i].flags = + le16_to_cpu(flags); + } + + if ((unlikely(++i >= vq->packed.vring.num))) { + i = 0; + vq->packed.avail_used_flags ^= + 1 << VRING_PACKED_DESC_F_AVAIL | + 1 << VRING_PACKED_DESC_F_USED; + vq->packed.avail_wrap_counter ^= 1; + } + + if (n >= out_sgs) + total_in_len += len; + } + } + + /* We're using some buffers from the free list. */ + vq->vq.num_free -= total_sg; + + /* Update free pointer */ + vq->packed.next_avail_idx = i; + + /* Store token. 
*/ + vq->packed.desc_state[head].num = total_sg; + vq->packed.desc_state[head].data = data; + vq->packed.desc_state[head].indir_desc = ctx; + vq->packed.desc_state[head].total_in_len = total_in_len; + + /* + * A driver MUST NOT make the first descriptor in the list + * available before all subsequent descriptors comprising + * the list are made available. + */ + virtio_wmb(vq->weak_barriers); + vq->packed.vring.desc[head].flags = head_flags; + vq->num_added += total_sg; + + pr_debug("Added buffer head %i to %p\n", head, vq); + END_USE(vq); + + return 0; + +unmap_release: + err_idx = i; + i = head; + vq->packed.avail_used_flags = avail_used_flags; + vq->packed.avail_wrap_counter = avail_wrap_counter; + + for (n = 0; n < total_sg; n++) { + if (i == err_idx) + break; + vring_unmap_extra_packed(vq, &vq->packed.desc_extra[i]); + i++; + if (i >= vq->packed.vring.num) + i = 0; + } + + END_USE(vq); + return -EIO; +} + static bool virtqueue_kick_prepare_packed(struct vring_virtqueue *vq) { u16 new, old, off_wrap, flags, wrap_counter, event_idx; @@ -1794,10 +2079,82 @@ static void update_last_used_idx_packed(struct vring_virtqueue *vq, cpu_to_le16(vq->last_used_idx)); } +static bool more_used_packed_in_order(const struct vring_virtqueue *vq) +{ + if (vq->batch_last.id != UINT_MAX) + return true; + + return virtqueue_poll_packed(vq, READ_ONCE(vq->last_used_idx)); +} + +static void *virtqueue_get_buf_ctx_packed_in_order(struct vring_virtqueue *vq, + unsigned int *len, + void **ctx) +{ + unsigned int num = vq->packed.vring.num; + u16 last_used, last_used_idx; + bool used_wrap_counter; + void *ret; + + START_USE(vq); + + if (unlikely(vq->broken)) { + END_USE(vq); + return NULL; + } + + last_used_idx = vq->last_used_idx; + used_wrap_counter = packed_used_wrap_counter(last_used_idx); + last_used = packed_last_used(last_used_idx); + + if (vq->batch_last.id == UINT_MAX) { + if (!more_used_packed_in_order(vq)) { + pr_debug("No more buffers in queue\n"); + END_USE(vq); + return NULL; + } + /* Only get used elements after they have been exposed by host. */ + virtio_rmb(vq->weak_barriers); + vq->batch_last.id = + le16_to_cpu(vq->packed.vring.desc[last_used].id); + vq->batch_last.len = + le32_to_cpu(vq->packed.vring.desc[last_used].len); + } + + if (vq->batch_last.id == last_used) { + vq->batch_last.id = UINT_MAX; + *len = vq->batch_last.len; + } else { + *len = vq->packed.desc_state[last_used].total_in_len; + } + + if (unlikely(last_used >= num)) { + BAD_RING(vq, "id %u out of range\n", last_used); + return NULL; + } + if (unlikely(!vq->packed.desc_state[last_used].data)) { + BAD_RING(vq, "id %u is not a head!\n", last_used); + return NULL; + } + + /* detach_buf_packed clears data, so grab it now. 
*/ + ret = vq->packed.desc_state[last_used].data; + detach_buf_packed_in_order(vq, last_used, ctx); + + update_last_used_idx_packed(vq, last_used, last_used, + used_wrap_counter); + + LAST_ADD_TIME_INVALID(vq); + + END_USE(vq); + return ret; +} + static void *virtqueue_get_buf_ctx_packed(struct vring_virtqueue *vq, unsigned int *len, void **ctx) { + unsigned int num = vq->packed.vring.num; u16 last_used, id, last_used_idx; bool used_wrap_counter; void *ret; @@ -1824,7 +2181,7 @@ static void *virtqueue_get_buf_ctx_packed(struct vring_virtqueue *vq, id = le16_to_cpu(vq->packed.vring.desc[last_used].id); *len = le32_to_cpu(vq->packed.vring.desc[last_used].len); - if (unlikely(id >= vq->packed.vring.num)) { + if (unlikely(id >= num)) { BAD_RING(vq, "id %u out of range\n", id); return NULL; } @@ -1965,7 +2322,10 @@ static void *virtqueue_detach_unused_buf_packed(struct vring_virtqueue *vq) continue; /* detach_buf clears data, so grab it now. */ buf = vq->packed.desc_state[i].data; - detach_buf_packed(vq, i, NULL); + if (virtqueue_is_in_order(vq)) + detach_buf_packed_in_order(vq, i, NULL); + else + detach_buf_packed(vq, i, NULL); END_USE(vq); return buf; } @@ -1991,6 +2351,8 @@ static struct vring_desc_extra *vring_alloc_desc_extra(unsigned int num) for (i = 0; i < num - 1; i++) desc_extra[i].next = i + 1; + desc_extra[num - 1].next = 0; + return desc_extra; } @@ -2122,10 +2484,17 @@ static void virtqueue_vring_attach_packed(struct vring_virtqueue *vq, { vq->packed = *vring_packed; - /* Put everything in free lists. */ - vq->free_head = 0; + if (virtqueue_is_in_order(vq)) { + vq->batch_last.id = UINT_MAX; + } else { + /* + * Put everything in free lists. Note that + * next_avail_idx is sufficient with IN_ORDER so + * free_head is unused. + */ + vq->free_head = 0; + } } - static void virtqueue_reset_packed(struct vring_virtqueue *vq) { memset(vq->packed.vring.device, 0, vq->packed.event_size_in_bytes); @@ -2169,13 +2538,14 @@ static struct virtqueue *__vring_new_virtqueue_packed(unsigned int index, #else vq->broken = false; #endif - vq->layout = VQ_LAYOUT_PACKED; vq->map = map; vq->use_map_api = vring_use_map_api(vdev); vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && !context; vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX); + vq->layout = virtio_has_feature(vdev, VIRTIO_F_IN_ORDER) ? 
+ VQ_LAYOUT_PACKED_IN_ORDER : VQ_LAYOUT_PACKED; if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) vq->weak_barriers = false; @@ -2285,6 +2655,34 @@ static const struct virtqueue_ops packed_ops = { .reset = virtqueue_reset_packed, }; +static const struct virtqueue_ops split_in_order_ops = { + .add = virtqueue_add_split, + .get = virtqueue_get_buf_ctx_split_in_order, + .kick_prepare = virtqueue_kick_prepare_split, + .disable_cb = virtqueue_disable_cb_split, + .enable_cb_delayed = virtqueue_enable_cb_delayed_split, + .enable_cb_prepare = virtqueue_enable_cb_prepare_split, + .poll = virtqueue_poll_split, + .detach_unused_buf = virtqueue_detach_unused_buf_split, + .more_used = more_used_split_in_order, + .resize = virtqueue_resize_split, + .reset = virtqueue_reset_split, +}; + +static const struct virtqueue_ops packed_in_order_ops = { + .add = virtqueue_add_packed_in_order, + .get = virtqueue_get_buf_ctx_packed_in_order, + .kick_prepare = virtqueue_kick_prepare_packed, + .disable_cb = virtqueue_disable_cb_packed, + .enable_cb_delayed = virtqueue_enable_cb_delayed_packed, + .enable_cb_prepare = virtqueue_enable_cb_prepare_packed, + .poll = virtqueue_poll_packed, + .detach_unused_buf = virtqueue_detach_unused_buf_packed, + .more_used = more_used_packed_in_order, + .resize = virtqueue_resize_packed, + .reset = virtqueue_reset_packed, +}; + static int virtqueue_disable_and_recycle(struct virtqueue *_vq, void (*recycle)(struct virtqueue *vq, void *buf)) { @@ -2339,6 +2737,12 @@ static int virtqueue_enable_after_reset(struct virtqueue *_vq) case VQ_LAYOUT_PACKED: \ ret = packed_ops.op(__VIRTQUEUE_CALL_vq, ##__VA_ARGS__);\ break; \ + case VQ_LAYOUT_SPLIT_IN_ORDER: \ + ret = split_in_order_ops.op(vq, ##__VA_ARGS__); \ + break; \ + case VQ_LAYOUT_PACKED_IN_ORDER: \ + ret = packed_in_order_ops.op(vq, ##__VA_ARGS__); \ + break; \ default: \ BUG(); \ break; \ @@ -2357,6 +2761,12 @@ static int virtqueue_enable_after_reset(struct virtqueue *_vq) case VQ_LAYOUT_PACKED: \ packed_ops.op(__VIRTQUEUE_CALL_vq, ##__VA_ARGS__); \ break; \ + case VQ_LAYOUT_SPLIT_IN_ORDER: \ + split_in_order_ops.op(vq, ##__VA_ARGS__); \ + break; \ + case VQ_LAYOUT_PACKED_IN_ORDER: \ + packed_in_order_ops.op(vq, ##__VA_ARGS__); \ + break; \ default: \ BUG(); \ break; \ @@ -3076,6 +3486,8 @@ void vring_transport_features(struct virtio_device *vdev) break; case VIRTIO_F_NOTIFICATION_DATA: break; + case VIRTIO_F_IN_ORDER: + break; default: /* We don't understand this bit. */ __virtio_clear_bit(vdev, i); From ca085faabb42c31ee204235facc5a430cb9e78a9 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 29 Dec 2025 03:53:39 -0500 Subject: [PATCH 23/59] dma-mapping: add __dma_from_device_group_begin()/end() When a structure contains a buffer that DMA writes to alongside fields that the CPU writes to, cache line sharing between the DMA buffer and CPU-written fields can cause data corruption on non-cache-coherent platforms. Add __dma_from_device_group_begin()/end() annotations to ensure proper alignment to prevent this: struct my_device { spinlock_t lock1; __dma_from_device_group_begin(); char dma_buffer1[16]; char dma_buffer2[16]; __dma_from_device_group_end(); spinlock_t lock2; }; Message-ID: <19163086d5e4704c316f18f6da06bc1c72968904.1767601130.git.mst@redhat.com> Acked-by: Marek Szyprowski Reviewed-by: Petr Tesarik Signed-off-by: Michael S. 
Tsirkin --- include/linux/dma-mapping.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index aa36a0d1d9df..29ad2ce700f0 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -7,6 +7,7 @@ #include #include #include +#include /** * List of possible attributes associated with a DMA mapping. The semantics @@ -703,6 +704,18 @@ static inline int dma_get_cache_alignment(void) } #endif +#ifdef ARCH_HAS_DMA_MINALIGN +#define ____dma_from_device_aligned __aligned(ARCH_DMA_MINALIGN) +#else +#define ____dma_from_device_aligned +#endif +/* Mark start of DMA buffer */ +#define __dma_from_device_group_begin(GROUP) \ + __cacheline_group_begin(GROUP) ____dma_from_device_aligned +/* Mark end of DMA buffer */ +#define __dma_from_device_group_end(GROUP) \ + __cacheline_group_end(GROUP) ____dma_from_device_aligned + static inline void *dmam_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { From 1e8b5d855525e0863198797a67a69774f426e142 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 29 Dec 2025 04:01:21 -0500 Subject: [PATCH 24/59] docs: dma-api: document __dma_from_device_group_begin()/end() Document the __dma_from_device_group_begin()/end() annotations. Message-ID: <01ea88055ded4d70cac70ba557680fd5fa7d9ff5.1767601130.git.mst@redhat.com> Acked-by: Marek Szyprowski Reviewed-by: Petr Tesarik Signed-off-by: Michael S. Tsirkin --- Documentation/core-api/dma-api-howto.rst | 52 ++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/Documentation/core-api/dma-api-howto.rst b/Documentation/core-api/dma-api-howto.rst index 96fce2a9aa90..e97743ab0f26 100644 --- a/Documentation/core-api/dma-api-howto.rst +++ b/Documentation/core-api/dma-api-howto.rst @@ -146,6 +146,58 @@ What about block I/O and networking buffers? The block I/O and networking subsystems make sure that the buffers they use are valid for you to DMA from/to. +__dma_from_device_group_begin/end annotations +============================================= + +As explained previously, when a structure contains a DMA_FROM_DEVICE / +DMA_BIDIRECTIONAL buffer (device writes to memory) alongside fields that the +CPU writes to, cache line sharing between the DMA buffer and CPU-written fields +can cause data corruption on CPUs with DMA-incoherent caches. + +The ``__dma_from_device_group_begin(GROUP)/__dma_from_device_group_end(GROUP)`` +macros ensure proper alignment to prevent this:: + + struct my_device { + spinlock_t lock1; + __dma_from_device_group_begin(); + char dma_buffer1[16]; + char dma_buffer2[16]; + __dma_from_device_group_end(); + spinlock_t lock2; + }; + +To isolate a DMA buffer from adjacent fields, use +``__dma_from_device_group_begin(GROUP)`` before the first DMA buffer +field and ``__dma_from_device_group_end(GROUP)`` after the last DMA +buffer field (with the same GROUP name). This protects both the head +and tail of the buffer from cache line sharing. + +The GROUP parameter is an optional identifier that names the DMA buffer group +(in case you have several in the same structure):: + + struct my_device { + spinlock_t lock1; + __dma_from_device_group_begin(buffer1); + char dma_buffer1[16]; + __dma_from_device_group_end(buffer1); + spinlock_t lock2; + __dma_from_device_group_begin(buffer2); + char dma_buffer2[16]; + __dma_from_device_group_end(buffer2); + }; + +On cache-coherent platforms these macros expand to zero-length array markers. 
+On non-coherent platforms, they also ensure the minimal DMA alignment, which +can be as large as 128 bytes. + +.. note:: + + It is allowed (though somewhat fragile) to include extra fields, not + intended for DMA from the device, within the group (in order to pack the + structure tightly) - but only as long as the CPU does not write these + fields while any fields in the group are mapped for DMA_FROM_DEVICE or + DMA_BIDIRECTIONAL. + DMA addressing capabilities =========================== From 61868dc55a119a5e4b912d458fc2c48ba80a35fe Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 29 Dec 2025 07:28:43 -0500 Subject: [PATCH 25/59] dma-mapping: add DMA_ATTR_CPU_CACHE_CLEAN When multiple small DMA_FROM_DEVICE or DMA_BIDIRECTIONAL buffers share a cacheline, and DMA_API_DEBUG is enabled, we get this warning: cacheline tracking EEXIST, overlapping mappings aren't supported. This is because when one of the mappings is removed, while another one is active, CPU might write into the buffer. Add an attribute for the driver to promise not to do this, making the overlapping safe, and suppressing the warning. Message-ID: <2d5d091f9d84b68ea96abd545b365dd1d00bbf48.1767601130.git.mst@redhat.com> Reviewed-by: Petr Tesarik Acked-by: Marek Szyprowski Signed-off-by: Michael S. Tsirkin --- include/linux/dma-mapping.h | 7 +++++++ kernel/dma/debug.c | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 29ad2ce700f0..29973baa0581 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -79,6 +79,13 @@ */ #define DMA_ATTR_MMIO (1UL << 10) +/* + * DMA_ATTR_CPU_CACHE_CLEAN: Indicates the CPU will not dirty any cacheline + * overlapping this buffer while it is mapped for DMA. All mappings sharing + * a cacheline must have this attribute for this to be considered safe. + */ +#define DMA_ATTR_CPU_CACHE_CLEAN (1UL << 11) + /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. It is specific to a diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 138ede653de4..7e66d863d573 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -595,7 +595,8 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs) if (rc == -ENOMEM) { pr_err_once("cacheline tracking ENOMEM, dma-debug disabled\n"); global_disable = true; - } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + } else if (rc == -EEXIST && + !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_CPU_CACHE_CLEAN)) && !(IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && is_swiotlb_active(entry->dev))) { err_printk(entry->dev, entry, From e21dd666e4af829c6a26d830cca8bf4839878297 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 29 Dec 2025 08:11:41 -0500 Subject: [PATCH 26/59] docs: dma-api: document DMA_ATTR_CPU_CACHE_CLEAN Document DMA_ATTR_CPU_CACHE_CLEAN as implemented in the previous patch. Message-ID: <0720b4be31c1b7a38edca67fd0c97983d2a56936.1767601130.git.mst@redhat.com> Reviewed-by: Petr Tesarik Acked-by: Marek Szyprowski Signed-off-by: Michael S. 
Tsirkin --- Documentation/core-api/dma-attributes.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst index 0bdc2be65e57..1d7bfad73b1c 100644 --- a/Documentation/core-api/dma-attributes.rst +++ b/Documentation/core-api/dma-attributes.rst @@ -148,3 +148,12 @@ DMA_ATTR_MMIO is appropriate. For architectures that require cache flushing for DMA coherence DMA_ATTR_MMIO will not perform any cache flushing. The address provided must never be mapped cacheable into the CPU. + +DMA_ATTR_CPU_CACHE_CLEAN +------------------------ + +This attribute indicates the CPU will not dirty any cacheline overlapping this +DMA_FROM_DEVICE/DMA_BIDIRECTIONAL buffer while it is mapped. This allows +multiple small buffers to safely share a cacheline without risk of data +corruption, suppressing DMA debug warnings about overlapping mappings. +All mappings sharing a cacheline should have this attribute. From d5d846513128c1a3bc2f2d371f6e903177dea443 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 29 Dec 2025 14:38:31 -0500 Subject: [PATCH 27/59] dma-debug: track cache clean flag in entries If a driver is buggy and has 2 overlapping mappings but only sets cache clean flag on the 1st one of them, we warn. But if it only does it for the 2nd one, we don't. Fix by tracking cache clean flag in the entry. Message-ID: <0ffb3513d18614539c108b4548cdfbc64274a7d1.1767601130.git.mst@redhat.com> Reviewed-by: Petr Tesarik Signed-off-by: Michael S. Tsirkin --- kernel/dma/debug.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 7e66d863d573..43d6a996d7a7 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -63,6 +63,7 @@ enum map_err_types { * @sg_mapped_ents: 'mapped_ents' from dma_map_sg * @paddr: physical start address of the mapping * @map_err_type: track whether dma_mapping_error() was checked + * @is_cache_clean: driver promises not to write to buffer while mapped * @stack_len: number of backtrace entries in @stack_entries * @stack_entries: stack of backtrace history */ @@ -76,7 +77,8 @@ struct dma_debug_entry { int sg_call_ents; int sg_mapped_ents; phys_addr_t paddr; - enum map_err_types map_err_type; + enum map_err_types map_err_type; + bool is_cache_clean; #ifdef CONFIG_STACKTRACE unsigned int stack_len; unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; @@ -472,12 +474,15 @@ static int active_cacheline_dec_overlap(phys_addr_t cln) return active_cacheline_set_overlap(cln, --overlap); } -static int active_cacheline_insert(struct dma_debug_entry *entry) +static int active_cacheline_insert(struct dma_debug_entry *entry, + bool *overlap_cache_clean) { phys_addr_t cln = to_cacheline_number(entry); unsigned long flags; int rc; + *overlap_cache_clean = false; + /* If the device is not writing memory then we don't have any * concerns about the cpu consuming stale data. This mitigates * legitimate usages of overlapping mappings. @@ -487,8 +492,16 @@ static int active_cacheline_insert(struct dma_debug_entry *entry) spin_lock_irqsave(&radix_lock, flags); rc = radix_tree_insert(&dma_active_cacheline, cln, entry); - if (rc == -EEXIST) + if (rc == -EEXIST) { + struct dma_debug_entry *existing; + active_cacheline_inc_overlap(cln); + existing = radix_tree_lookup(&dma_active_cacheline, cln); + /* A lookup failure here after we got -EEXIST is unexpected. 
*/ + WARN_ON(!existing); + if (existing) + *overlap_cache_clean = existing->is_cache_clean; + } spin_unlock_irqrestore(&radix_lock, flags); return rc; @@ -583,20 +596,24 @@ DEFINE_SHOW_ATTRIBUTE(dump); */ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs) { + bool overlap_cache_clean; struct hash_bucket *bucket; unsigned long flags; int rc; + entry->is_cache_clean = !!(attrs & DMA_ATTR_CPU_CACHE_CLEAN); + bucket = get_hash_bucket(entry, &flags); hash_bucket_add(bucket, entry); put_hash_bucket(bucket, flags); - rc = active_cacheline_insert(entry); + rc = active_cacheline_insert(entry, &overlap_cache_clean); if (rc == -ENOMEM) { pr_err_once("cacheline tracking ENOMEM, dma-debug disabled\n"); global_disable = true; } else if (rc == -EEXIST && - !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_CPU_CACHE_CLEAN)) && + !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + !(entry->is_cache_clean && overlap_cache_clean) && !(IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && is_swiotlb_active(entry->dev))) { err_printk(entry->dev, entry, From 5fc6dd158e97d317aeb85ea930613f8db172603b Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 29 Dec 2025 13:25:23 -0500 Subject: [PATCH 28/59] virtio: add virtqueue_add_inbuf_cache_clean API Add virtqueue_add_inbuf_cache_clean() for passing DMA_ATTR_CPU_CACHE_CLEAN to virtqueue operations. This suppresses DMA debug cacheline overlap warnings for buffers where proper cache management is ensured by the caller. Message-ID: Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_ring.c | 83 ++++++++++++++++++++++++++---------- include/linux/virtio.h | 5 +++ 2 files changed, 65 insertions(+), 23 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 95e320b23624..4fe0f78df5ec 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -174,7 +174,8 @@ struct virtqueue_ops { int (*add)(struct vring_virtqueue *vq, struct scatterlist *sgs[], unsigned int total_sg, unsigned int out_sgs, unsigned int in_sgs, void *data, - void *ctx, bool premapped, gfp_t gfp); + void *ctx, bool premapped, gfp_t gfp, + unsigned long attr); void *(*get)(struct vring_virtqueue *vq, unsigned int *len, void **ctx); bool (*kick_prepare)(struct vring_virtqueue *vq); void (*disable_cb)(struct vring_virtqueue *vq); @@ -444,7 +445,7 @@ static int vring_mapping_error(const struct vring_virtqueue *vq, /* Map one sg entry. 
*/ static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *sg, enum dma_data_direction direction, dma_addr_t *addr, - u32 *len, bool premapped) + u32 *len, bool premapped, unsigned long attr) { if (premapped) { *addr = sg_dma_address(sg); @@ -472,7 +473,7 @@ static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist */ *addr = virtqueue_map_page_attrs(&vq->vq, sg_page(sg), sg->offset, sg->length, - direction, 0); + direction, attr); if (vring_mapping_error(vq, *addr)) return -ENOMEM; @@ -603,7 +604,8 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq, void *data, void *ctx, bool premapped, - gfp_t gfp) + gfp_t gfp, + unsigned long attr) { struct vring_desc_extra *extra; struct scatterlist *sg; @@ -675,7 +677,8 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq, if (++sg_count != total_sg) flags |= VRING_DESC_F_NEXT; - if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr, &len, premapped)) + if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr, &len, + premapped, attr)) goto unmap_release; /* Note that we trust indirect descriptor @@ -694,7 +697,8 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq, if (++sg_count != total_sg) flags |= VRING_DESC_F_NEXT; - if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr, &len, premapped)) + if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr, &len, + premapped, attr)) goto unmap_release; /* Note that we trust indirect descriptor @@ -1487,7 +1491,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, void *data, bool premapped, gfp_t gfp, - u16 id) + u16 id, + unsigned long attr) { struct vring_desc_extra *extra; struct vring_packed_desc *desc; @@ -1516,7 +1521,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, for (sg = sgs[n]; sg; sg = sg_next(sg)) { if (vring_map_one_sg(vq, sg, n < out_sgs ? DMA_TO_DEVICE : DMA_FROM_DEVICE, - &addr, &len, premapped)) + &addr, &len, premapped, attr)) goto unmap_release; desc[i].flags = cpu_to_le16(n < out_sgs ? @@ -1615,7 +1620,8 @@ static inline int virtqueue_add_packed(struct vring_virtqueue *vq, void *data, void *ctx, bool premapped, - gfp_t gfp) + gfp_t gfp, + unsigned long attr) { struct vring_packed_desc *desc; struct scatterlist *sg; @@ -1642,8 +1648,8 @@ static inline int virtqueue_add_packed(struct vring_virtqueue *vq, id = vq->free_head; BUG_ON(id == vq->packed.vring.num); err = virtqueue_add_indirect_packed(vq, sgs, total_sg, out_sgs, - in_sgs, data, premapped, - gfp, id); + in_sgs, data, premapped, gfp, + id, attr); if (err != -ENOMEM) { END_USE(vq); return err; @@ -1679,7 +1685,7 @@ static inline int virtqueue_add_packed(struct vring_virtqueue *vq, if (vring_map_one_sg(vq, sg, n < out_sgs ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE, - &addr, &len, premapped)) + &addr, &len, premapped, attr)) goto unmap_release; flags = cpu_to_le16(vq->packed.avail_used_flags | @@ -1772,7 +1778,8 @@ static inline int virtqueue_add_packed_in_order(struct vring_virtqueue *vq, void *data, void *ctx, bool premapped, - gfp_t gfp) + gfp_t gfp, + unsigned long attr) { struct vring_packed_desc *desc; struct scatterlist *sg; @@ -1799,7 +1806,8 @@ static inline int virtqueue_add_packed_in_order(struct vring_virtqueue *vq, if (virtqueue_use_indirect(vq, total_sg)) { err = virtqueue_add_indirect_packed(vq, sgs, total_sg, out_sgs, in_sgs, data, premapped, gfp, - vq->packed.next_avail_idx); + vq->packed.next_avail_idx, + attr); if (err != -ENOMEM) { END_USE(vq); return err; @@ -1838,7 +1846,7 @@ static inline int virtqueue_add_packed_in_order(struct vring_virtqueue *vq, if (vring_map_one_sg(vq, sg, n < out_sgs ? DMA_TO_DEVICE : DMA_FROM_DEVICE, - &addr, &len, premapped)) + &addr, &len, premapped, attr)) goto unmap_release; flags |= cpu_to_le16(vq->packed.avail_used_flags); @@ -2781,13 +2789,14 @@ static inline int virtqueue_add(struct virtqueue *_vq, void *data, void *ctx, bool premapped, - gfp_t gfp) + gfp_t gfp, + unsigned long attr) { struct vring_virtqueue *vq = to_vvq(_vq); return VIRTQUEUE_CALL(vq, add, sgs, total_sg, out_sgs, in_sgs, data, - ctx, premapped, gfp); + ctx, premapped, gfp, attr); } /** @@ -2825,7 +2834,7 @@ int virtqueue_add_sgs(struct virtqueue *_vq, total_sg++; } return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs, - data, NULL, false, gfp); + data, NULL, false, gfp, 0); } EXPORT_SYMBOL_GPL(virtqueue_add_sgs); @@ -2847,7 +2856,7 @@ int virtqueue_add_outbuf(struct virtqueue *vq, void *data, gfp_t gfp) { - return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, false, gfp); + return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, false, gfp, 0); } EXPORT_SYMBOL_GPL(virtqueue_add_outbuf); @@ -2870,7 +2879,7 @@ int virtqueue_add_outbuf_premapped(struct virtqueue *vq, void *data, gfp_t gfp) { - return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, true, gfp); + return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, true, gfp, 0); } EXPORT_SYMBOL_GPL(virtqueue_add_outbuf_premapped); @@ -2892,10 +2901,38 @@ int virtqueue_add_inbuf(struct virtqueue *vq, void *data, gfp_t gfp) { - return virtqueue_add(vq, &sg, num, 0, 1, data, NULL, false, gfp); + return virtqueue_add(vq, &sg, num, 0, 1, data, NULL, false, gfp, 0); } EXPORT_SYMBOL_GPL(virtqueue_add_inbuf); +/** + * virtqueue_add_inbuf_cache_clean - expose input buffers with cache clean + * @vq: the struct virtqueue we're talking about. + * @sg: scatterlist (must be well-formed and terminated!) + * @num: the number of entries in @sg writable by other side + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Same as virtqueue_add_inbuf but passes DMA_ATTR_CPU_CACHE_CLEAN to indicate + * that the CPU will not dirty any cacheline overlapping this buffer while it + * is available, and to suppress overlapping cacheline warnings in DMA debug + * builds. + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). 
+ */ +int virtqueue_add_inbuf_cache_clean(struct virtqueue *vq, + struct scatterlist *sg, unsigned int num, + void *data, + gfp_t gfp) +{ + return virtqueue_add(vq, &sg, num, 0, 1, data, NULL, false, gfp, + DMA_ATTR_CPU_CACHE_CLEAN); +} +EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_cache_clean); + /** * virtqueue_add_inbuf_ctx - expose input buffers to other end * @vq: the struct virtqueue we're talking about. @@ -2916,7 +2953,7 @@ int virtqueue_add_inbuf_ctx(struct virtqueue *vq, void *ctx, gfp_t gfp) { - return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, false, gfp); + return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, false, gfp, 0); } EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx); @@ -2941,7 +2978,7 @@ int virtqueue_add_inbuf_premapped(struct virtqueue *vq, void *ctx, gfp_t gfp) { - return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, true, gfp); + return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, true, gfp, 0); } EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_premapped); diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 3626eb694728..63bb05ece8c5 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -62,6 +62,11 @@ int virtqueue_add_inbuf(struct virtqueue *vq, void *data, gfp_t gfp); +int virtqueue_add_inbuf_cache_clean(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + gfp_t gfp); + int virtqueue_add_inbuf_ctx(struct virtqueue *vq, struct scatterlist sg[], unsigned int num, void *data, From 63dfad0517f0418a5192024fad934be79c2f5902 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 29 Dec 2025 18:23:53 -0500 Subject: [PATCH 29/59] vsock/virtio: fix DMA alignment for event_list On non-cache-coherent platforms, when a structure contains a buffer used for DMA alongside fields that the CPU writes to, cacheline sharing can cause data corruption. The event_list array is used for DMA_FROM_DEVICE operations via virtqueue_add_inbuf(). The adjacent event_run and guest_cid fields are written by the CPU while the buffer is available, so mapped for the device. If these share cachelines with event_list, CPU writes can corrupt DMA data. Add __dma_from_device_group_begin()/end() annotations to ensure event_list is isolated in its own cachelines. Message-ID: Acked-by: Stefano Garzarella Signed-off-by: Michael S. Tsirkin --- net/vmw_vsock/virtio_transport.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 8c867023a2e5..3de2b2d49b51 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -59,7 +60,9 @@ struct virtio_vsock { */ struct mutex event_lock; bool event_run; + __dma_from_device_group_begin(); struct virtio_vsock_event event_list[8]; + __dma_from_device_group_end(); u32 guest_cid; bool seqpacket_allow; From db191ba0c8564ff84877e5b1c9553e991feca239 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 29 Dec 2025 18:27:54 -0500 Subject: [PATCH 30/59] vsock/virtio: use virtqueue_add_inbuf_cache_clean for events The event_list array contains 8 small (4-byte) events that share cachelines with each other. When CONFIG_DMA_API_DEBUG is enabled, this can trigger warnings about overlapping DMA mappings within the same cacheline. The previous patch isolated event_list in its own cache lines so the warnings are spurious. Use virtqueue_add_inbuf_cache_clean() to indicate that the CPU does not write into these fields, suppressing the warnings. 
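For reference, the driver-side pattern that this patch and the previous one establish looks roughly like the following sketch (hedged, with hypothetical names my_dev, my_event and my_dev_fill_one; __dma_from_device_group_begin()/end() and virtqueue_add_inbuf_cache_clean() are the APIs added earlier in this series):

#include <linux/dma-mapping.h>
#include <linux/gfp.h>
#include <linux/scatterlist.h>
#include <linux/types.h>
#include <linux/virtio.h>

struct my_event {
	__le32 id;			/* small, device-written event */
};

struct my_dev {
	struct virtqueue *event_vq;
	bool event_run;			/* CPU-written at runtime */
	/* Isolate the DMA_FROM_DEVICE array in its own cachelines. */
	__dma_from_device_group_begin();
	struct my_event events[8];
	__dma_from_device_group_end();
};

static int my_dev_fill_one(struct my_dev *dev, struct my_event *ev)
{
	struct scatterlist sg;

	sg_init_one(&sg, ev, sizeof(*ev));
	/*
	 * The events share cachelines with each other, but the CPU never
	 * dirties them while mapped, so the cache-clean variant applies.
	 */
	return virtqueue_add_inbuf_cache_clean(dev->event_vq, &sg, 1, ev,
					       GFP_KERNEL);
}

The group annotation removes the real corruption risk; the cache-clean variant only silences the now-spurious DMA debug report about the intra-array cacheline sharing.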
Reported-by: Cong Wang Message-ID: <4b5bf63a7ebb782d87f643466b3669df567c9fe1.1767601130.git.mst@redhat.com> Acked-by: Stefano Garzarella Signed-off-by: Michael S. Tsirkin --- net/vmw_vsock/virtio_transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 3de2b2d49b51..999a0839726a 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -393,7 +393,7 @@ static int virtio_vsock_event_fill_one(struct virtio_vsock *vsock, sg_init_one(&sg, event, sizeof(*event)); - return virtqueue_add_inbuf(vq, &sg, 1, event, GFP_KERNEL); + return virtqueue_add_inbuf_cache_clean(vq, &sg, 1, event, GFP_KERNEL); } /* event_lock must be held */ From 95c7b0ad6c69d1c0608ff0bbd358a546856beaf3 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 29 Dec 2025 18:24:36 -0500 Subject: [PATCH 31/59] virtio_input: fix DMA alignment for evts On non-cache-coherent platforms, when a structure contains a buffer used for DMA alongside fields that the CPU writes to, cacheline sharing can cause data corruption. The evts array is used for DMA_FROM_DEVICE operations via virtqueue_add_inbuf(). The adjacent lock and ready fields are written by the CPU during normal operation. If these share cachelines with evts, CPU writes can corrupt DMA data. Add __dma_from_device_group_begin()/end() annotations to ensure evts is isolated in its own cachelines. Message-ID: Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_input.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/virtio/virtio_input.c b/drivers/virtio/virtio_input.c index d0728285b6ce..9f13de1f1d77 100644 --- a/drivers/virtio/virtio_input.c +++ b/drivers/virtio/virtio_input.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -16,7 +17,9 @@ struct virtio_input { char serial[64]; char phys[64]; struct virtqueue *evt, *sts; + __dma_from_device_group_begin(); struct virtio_input_event evts[64]; + __dma_from_device_group_end(); spinlock_t lock; bool ready; }; From 2678369e8efe0c5ac71adb49fbc2c240a222e44d Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 29 Dec 2025 18:25:16 -0500 Subject: [PATCH 32/59] virtio_scsi: fix DMA cacheline issues for events Current struct virtio_scsi_event_node layout has two problems: The event (DMA_FROM_DEVICE) and work (CPU-written via INIT_WORK/queue_work) fields share a cacheline. On non-cache-coherent platforms, CPU writes to work can corrupt device-written event data. If ARCH_DMA_MINALIGN is large enough, the 8 events in event_list share cachelines, triggering CONFIG_DMA_API_DEBUG warnings. Fix the corruption by moving event buffers to a separate array and aligning using __dma_from_device_group_begin()/end(). Suppress the (now spurious) DMA debug warnings using virtqueue_add_inbuf_cache_clean(). Message-ID: <8801aeef7576a155299f19b6887682dd3a272aba.1767601130.git.mst@redhat.com> Reviewed-by: Stefan Hajnoczi Signed-off-by: Michael S. 
Tsirkin --- drivers/scsi/virtio_scsi.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 96a69edddbe5..6ff53fc8adb0 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "sd.h" @@ -61,7 +62,7 @@ struct virtio_scsi_cmd { struct virtio_scsi_event_node { struct virtio_scsi *vscsi; - struct virtio_scsi_event event; + struct virtio_scsi_event *event; struct work_struct work; }; @@ -89,6 +90,11 @@ struct virtio_scsi { struct virtio_scsi_vq ctrl_vq; struct virtio_scsi_vq event_vq; + + __dma_from_device_group_begin(); + struct virtio_scsi_event events[VIRTIO_SCSI_EVENT_LEN]; + __dma_from_device_group_end(); + struct virtio_scsi_vq req_vqs[]; }; @@ -237,12 +243,12 @@ static int virtscsi_kick_event(struct virtio_scsi *vscsi, unsigned long flags; INIT_WORK(&event_node->work, virtscsi_handle_event); - sg_init_one(&sg, &event_node->event, sizeof(struct virtio_scsi_event)); + sg_init_one(&sg, event_node->event, sizeof(struct virtio_scsi_event)); spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags); - err = virtqueue_add_inbuf(vscsi->event_vq.vq, &sg, 1, event_node, - GFP_ATOMIC); + err = virtqueue_add_inbuf_cache_clean(vscsi->event_vq.vq, &sg, 1, event_node, + GFP_ATOMIC); if (!err) virtqueue_kick(vscsi->event_vq.vq); @@ -257,6 +263,7 @@ static int virtscsi_kick_event_all(struct virtio_scsi *vscsi) for (i = 0; i < VIRTIO_SCSI_EVENT_LEN; i++) { vscsi->event_list[i].vscsi = vscsi; + vscsi->event_list[i].event = &vscsi->events[i]; virtscsi_kick_event(vscsi, &vscsi->event_list[i]); } @@ -380,7 +387,7 @@ static void virtscsi_handle_event(struct work_struct *work) struct virtio_scsi_event_node *event_node = container_of(work, struct virtio_scsi_event_node, work); struct virtio_scsi *vscsi = event_node->vscsi; - struct virtio_scsi_event *event = &event_node->event; + struct virtio_scsi_event *event = event_node->event; if (event->event & cpu_to_virtio32(vscsi->vdev, VIRTIO_SCSI_T_EVENTS_MISSED)) { From bd2b617c49820a38cefcf512c6d56d30deb59aa9 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 29 Dec 2025 18:27:21 -0500 Subject: [PATCH 33/59] virtio-rng: fix DMA alignment for data buffer The data buffer in struct virtrng_info is used for DMA_FROM_DEVICE via virtqueue_add_inbuf() and shares cachelines with the adjacent CPU-written fields (data_avail, data_idx). The device writing to the DMA buffer and the CPU writing to adjacent fields could corrupt each other's data on non-cache-coherent platforms. Add __dma_from_device_group_begin()/end() annotations to place these in distinct cache lines. Message-ID: <157a63b6324d1f1307ddd4faa3b62a8b90a79423.1767601130.git.mst@redhat.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/char/hw_random/virtio-rng.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index dd998f4fe4f2..eb80a031c7be 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -28,11 +29,13 @@ struct virtrng_info { unsigned int data_avail; unsigned int data_idx; /* minimal size returned by rng_buffer_size() */ + __dma_from_device_group_begin(); #if SMP_CACHE_BYTES < 32 u8 data[32]; #else u8 data[SMP_CACHE_BYTES]; #endif + __dma_from_device_group_end(); }; static void random_recv_done(struct virtqueue *vq) From d08fda2cf2e68b4e0865f1bf0b49010db74da079 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 29 Dec 2025 18:28:28 -0500 Subject: [PATCH 34/59] virtio_input: use virtqueue_add_inbuf_cache_clean for events The evts array contains 64 small (8-byte) input events that share cachelines with each other. When CONFIG_DMA_API_DEBUG is enabled, this can trigger warnings about overlapping DMA mappings within the same cacheline. Previous patch isolated the array in its own cachelines, so the warnings are now spurious. Use virtqueue_add_inbuf_cache_clean() to indicate that the CPU does not write into these cache lines, suppressing these warnings. Message-ID: <4c885b4046323f68cf5cadc7fbfb00216b11dd20.1767601130.git.mst@redhat.com> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_input.c b/drivers/virtio/virtio_input.c index 9f13de1f1d77..74df16677da8 100644 --- a/drivers/virtio/virtio_input.c +++ b/drivers/virtio/virtio_input.c @@ -30,7 +30,7 @@ static void virtinput_queue_evtbuf(struct virtio_input *vi, struct scatterlist sg[1]; sg_init_one(sg, evtbuf, sizeof(*evtbuf)); - virtqueue_add_inbuf(vi->evt, sg, 1, evtbuf, GFP_ATOMIC); + virtqueue_add_inbuf_cache_clean(vi->evt, sg, 1, evtbuf, GFP_ATOMIC); } static void virtinput_recv_events(struct virtqueue *vq) From f9108dee782fe45318a2c9f007fb72ab370d476d Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 29 Dec 2025 18:58:05 -0500 Subject: [PATCH 35/59] vsock/virtio: reorder fields to reduce padding Reorder struct virtio_vsock fields to place the DMA buffer (event_list) last. This eliminates the padding from aligning the struct size on ARCH_DMA_MINALIGN. Message-ID: Signed-off-by: Michael S. Tsirkin --- net/vmw_vsock/virtio_transport.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 999a0839726a..b333a7591b26 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -55,15 +55,6 @@ struct virtio_vsock { int rx_buf_nr; int rx_buf_max_nr; - /* The following fields are protected by event_lock. - * vqs[VSOCK_VQ_EVENT] must be accessed with event_lock held. - */ - struct mutex event_lock; - bool event_run; - __dma_from_device_group_begin(); - struct virtio_vsock_event event_list[8]; - __dma_from_device_group_end(); - u32 guest_cid; bool seqpacket_allow; @@ -77,6 +68,15 @@ struct virtio_vsock { */ struct scatterlist *out_sgs[MAX_SKB_FRAGS + 1]; struct scatterlist out_bufs[MAX_SKB_FRAGS + 1]; + + /* The following fields are protected by event_lock. + * vqs[VSOCK_VQ_EVENT] must be accessed with event_lock held. 
+ */ + struct mutex event_lock; + bool event_run; + __dma_from_device_group_begin(); + struct virtio_vsock_event event_list[8]; + __dma_from_device_group_end(); }; static u32 virtio_transport_get_local_cid(void) From 29615fe3fb5015a96a14cfa43bd168034719ddeb Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 30 Dec 2025 08:04:15 -0500 Subject: [PATCH 36/59] gpio: virtio: fix DMA alignment The res and ires buffers in struct virtio_gpio_line and struct vgpio_irq_line respectively are used for DMA_FROM_DEVICE via virtqueue_add_sgs(). However, within these structs, even though these elements are tagged as ____cacheline_aligned, adjacent struct elements can share DMA cachelines on platforms where ARCH_DMA_MINALIGN > L1_CACHE_BYTES (e.g., arm64 with 128-byte DMA alignment but 64-byte cache lines). The existing ____cacheline_aligned annotation aligns to L1_CACHE_BYTES which is not always sufficient for DMA alignment. For example, with L1_CACHE_BYTES = 32 and ARCH_DMA_MINALIGN = 128 - irq_lines[0].ires at offset 128 - irq_lines[1].type at offset 192 both in same 128-byte DMA cacheline [128-256) When the device writes to irq_lines[0].ires and the CPU concurrently modifies one of irq_lines[1].type/disabled/masked/queued flags, corruption can occur on non-cache-coherent platforms. Fix by using __dma_from_device_group_begin()/end() annotations on the DMA buffers. Drop ____cacheline_aligned - it's not required to isolate request and response, and keeping them would increase the memory cost. Acked-by: Viresh Kumar Message-ID: Acked-by: Bartosz Golaszewski Signed-off-by: Michael S. Tsirkin --- drivers/gpio/gpio-virtio.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/gpio/gpio-virtio.c b/drivers/gpio/gpio-virtio.c index 17e040991e46..b70294626770 100644 --- a/drivers/gpio/gpio-virtio.c +++ b/drivers/gpio/gpio-virtio.c @@ -10,6 +10,7 @@ */ #include +#include #include #include #include @@ -24,8 +25,11 @@ struct virtio_gpio_line { struct mutex lock; /* Protects line operation */ struct completion completion; - struct virtio_gpio_request req ____cacheline_aligned; - struct virtio_gpio_response res ____cacheline_aligned; + + __dma_from_device_group_begin(); + struct virtio_gpio_request req; + struct virtio_gpio_response res; + __dma_from_device_group_end(); unsigned int rxlen; }; @@ -37,8 +41,10 @@ struct vgpio_irq_line { bool update_pending; bool queue_pending; - struct virtio_gpio_irq_request ireq ____cacheline_aligned; - struct virtio_gpio_irq_response ires ____cacheline_aligned; + __dma_from_device_group_begin(); + struct virtio_gpio_irq_request ireq; + struct virtio_gpio_irq_response ires; + __dma_from_device_group_end(); }; struct virtio_gpio { From 1a266b6d9cfa42997f31942d1754ddf220ba7a1c Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 29 Dec 2025 18:58:05 -0500 Subject: [PATCH 37/59] gpio: virtio: reorder fields to reduce struct padding Reorder struct virtio_gpio_line fields to place the DMA buffers (req/res) last. This eliminates the padding from aligning struct size on ARCH_DMA_MINALIGN. Acked-by: Viresh Kumar Message-ID: Acked-by: Bartosz Golaszewski Signed-off-by: Michael S. 
Tsirkin --- drivers/gpio/gpio-virtio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-virtio.c b/drivers/gpio/gpio-virtio.c index b70294626770..ed6e0e90fa8a 100644 --- a/drivers/gpio/gpio-virtio.c +++ b/drivers/gpio/gpio-virtio.c @@ -26,11 +26,12 @@ struct virtio_gpio_line { struct mutex lock; /* Protects line operation */ struct completion completion; + unsigned int rxlen; + __dma_from_device_group_begin(); struct virtio_gpio_request req; struct virtio_gpio_response res; __dma_from_device_group_end(); - unsigned int rxlen; }; struct vgpio_irq_line { From 74bc5f69bd3b7fa099fca67268f10532e3dae916 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 5 Jan 2026 16:05:42 -0500 Subject: [PATCH 38/59] checkpatch: special-case cacheline group macros Currently, cacheline group macros trigger checkpatch warnings. For example: $ ./scripts/checkpatch.pl -g ba7e025a6c84aed012421468d83639e5dae982b0 WARNING: Missing a blank line after declarations #58: FILE: drivers/gpio/gpio-virtio.c:32: + struct virtio_gpio_response res; + __dma_from_device_group_end(); $ ./scripts/checkpatch.pl -g 5d4cc87414c5d11345c4b11d61377d351b5c28a2 WARNING: Missing a blank line after declarations #267: FILE: include/net/sock.h:431: + int sk_rcvlowat; + __cacheline_group_end(sock_read_rx); But these are not actually statements - the following macros all expand to zero-length fields: __cacheline_group_begin() __cacheline_group_end() __cacheline_group_begin_aligned() __cacheline_group_end_aligned() __dma_from_device_group_begin() __dma_from_device_group_end() Add them to $declaration_macros so checkpatch recognizes this fact. Message-ID: Signed-off-by: Michael S. Tsirkin --- scripts/checkpatch.pl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index c0250244cf7a..f71dd9cbddfb 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1100,7 +1100,9 @@ our $declaration_macros = qr{(?x: (?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,6}\s*\(| (?:$Storage\s+)?[HLP]?LIST_HEAD\s*\(| (?:SKCIPHER_REQUEST|SHASH_DESC|AHASH_REQUEST)_ON_STACK\s*\(| - (?:$Storage\s+)?(?:XA_STATE|XA_STATE_ORDER)\s*\( + (?:$Storage\s+)?(?:XA_STATE|XA_STATE_ORDER)\s*\(| + __cacheline_group_(?:begin|end)(?:_aligned)?\s*\(| + __dma_from_device_group_(?:begin|end)\s*\( )}; our %allow_repeated_words = ( From cd025c1e876b4e262e71398236a1550486a73ede Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Mon, 19 Jan 2026 15:32:54 +0100 Subject: [PATCH 39/59] vhost: move vdpa group bound check to vhost_vdpa MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove duplication by consolidating these checks here. This reduces the possibility of a parent driver missing them. While we're at it, fix a bug in vdpa_sim where a valid ASID can be assigned to a group equal to ngroups, causing an out-of-bounds write. Cc: stable@vger.kernel.org Fixes: bda324fd037a ("vdpasim: control virtqueue support") Acked-by: Jason Wang Signed-off-by: Eugenio Pérez Signed-off-by: Michael S.
Tsirkin Message-Id: <20260119143306.1818855-2-eperezma@redhat.com> --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 3 --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 6 ------ drivers/vhost/vdpa.c | 2 +- 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index ddaa1366704b..44062e9d68f0 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -3640,9 +3640,6 @@ static int mlx5_set_group_asid(struct vdpa_device *vdev, u32 group, struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); int err = 0; - if (group >= MLX5_VDPA_NUMVQ_GROUPS) - return -EINVAL; - mvdev->mres.group2asid[group] = asid; mutex_lock(&mvdev->mres.lock); diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index c1c6431950e1..df9c7ddc5d78 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -606,12 +606,6 @@ static int vdpasim_set_group_asid(struct vdpa_device *vdpa, unsigned int group, struct vhost_iotlb *iommu; int i; - if (group > vdpasim->dev_attr.ngroups) - return -EINVAL; - - if (asid >= vdpasim->dev_attr.nas) - return -EINVAL; - iommu = &vdpasim->iommu[asid]; mutex_lock(&vdpasim->mutex); diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 05a481e4c385..9d25b735b43d 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -680,7 +680,7 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, case VHOST_VDPA_SET_GROUP_ASID: if (copy_from_user(&s, argp, sizeof(s))) return -EFAULT; - if (s.num >= vdpa->nas) + if (idx >= vdpa->ngroups || s.num >= vdpa->nas) return -EINVAL; if (!ops->set_group_asid) return -EOPNOTSUPP; return ops->set_group_asid(vdpa, idx, s.num); From a006ed4ecd4905b69402980ad7d4e5f31bf44953 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Mon, 19 Jan 2026 15:32:55 +0100 Subject: [PATCH 40/59] vduse: add v1 API definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows the kernel to detect whether the userspace VDUSE device supports the VQ group and ASID features. VDUSE devices that don't set the V1 API will not receive the new messages, and the vdpa device will be created with only one vq group and ASID. The next patches implement the new feature incrementally, only enabling the VDUSE device to set the V1 API version by the end of the series. Acked-by: Jason Wang Reviewed-by: Xie Yongji Signed-off-by: Eugenio Pérez Signed-off-by: Michael S. Tsirkin Message-Id: <20260119143306.1818855-3-eperezma@redhat.com> --- include/uapi/linux/vduse.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h index 10ad71aa00d6..ccb92a1efce0 100644 --- a/include/uapi/linux/vduse.h +++ b/include/uapi/linux/vduse.h @@ -10,6 +10,10 @@ #define VDUSE_API_VERSION 0 +/* VQ groups and ASID support */ + +#define VDUSE_API_VERSION_1 1 + /* * Get the version of VDUSE API that kernel supported (VDUSE_API_VERSION). * This is used for future extension. From 9350a09afd086771b0612c7b7c9583e8a1568135 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Mon, 19 Jan 2026 15:32:56 +0100 Subject: [PATCH 41/59] vduse: add vq group support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows separating the different virtqueues into groups that share the same address space. The VDUSE device is asked for the group of each vq up front, as the groups are needed for the DMA API.
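For illustration only, here is a minimal userspace C model of the indirection this patch starts to build: every vq belongs to one group, and each group resolves to one ASID. The table names and sizes below are invented for the sketch; VHOST_VDPA_SET_GROUP_ASID is the real uAPI a VMM uses to retarget a group, as seen in the previous patch.

#include <stdio.h>

#define NVQS    3
#define NGROUPS 3

/* Invented tables for the sketch: per-vq group, per-group ASID. */
static unsigned int vq_group[NVQS]      = { 0, 0, 1 };
static unsigned int group_asid[NGROUPS] = { 0, 0, 0 };

int main(void)
{
	/* What a VMM does via VHOST_VDPA_SET_GROUP_ASID: move one
	 * group (say, the CVQ group) into its own address space. */
	group_asid[1] = 1;

	for (unsigned int vq = 0; vq < NVQS; vq++)
		printf("vq %u: group %u -> asid %u\n",
		       vq, vq_group[vq], group_asid[vq_group[vq]]);
	return 0;
}

The point of the two-level lookup is that retargeting one group moves all of its vqs to a new address space at once, without touching the per-vq state.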
Allocating 3 vq groups as net is the device that needs the most groups: * Dataplane (guest passthrough) * CVQ * Shadowed vrings. Future versions of the series can include dynamic allocation of the groups array so VDUSE can declare more groups. Acked-by: Jason Wang Reviewed-by: Xie Yongji Signed-off-by: Eugenio Pérez Signed-off-by: Michael S. Tsirkin Message-Id: <20260119143306.1818855-4-eperezma@redhat.com> --- drivers/vdpa/vdpa_user/vduse_dev.c | 47 ++++++++++++++++++++++++++---- include/uapi/linux/vduse.h | 12 ++++++-- 2 files changed, 51 insertions(+), 8 deletions(-) diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index ae357d014564..5bffc25a266e 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -39,6 +39,7 @@ #define DRV_LICENSE "GPL v2" #define VDUSE_DEV_MAX (1U << MINORBITS) +#define VDUSE_DEV_MAX_GROUPS 0xffff #define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024) #define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024) #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024) @@ -58,6 +59,7 @@ struct vduse_virtqueue { struct vdpa_vq_state state; bool ready; bool kicked; + u32 group; spinlock_t kick_lock; spinlock_t irq_lock; struct eventfd_ctx *kickfd; @@ -114,6 +116,7 @@ struct vduse_dev { u8 status; u32 vq_num; u32 vq_align; + u32 ngroups; struct vduse_umem *umem; struct mutex mem_lock; unsigned int bounce_size; @@ -592,6 +595,16 @@ static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx, return 0; } +static u32 vduse_get_vq_group(struct vdpa_device *vdpa, u16 idx) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + if (dev->api_version < VDUSE_API_VERSION_1) + return 0; + + return dev->vqs[idx]->group; +} + static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx, struct vdpa_vq_state *state) { @@ -789,6 +802,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = { .set_vq_cb = vduse_vdpa_set_vq_cb, .set_vq_num = vduse_vdpa_set_vq_num, .get_vq_size = vduse_vdpa_get_vq_size, + .get_vq_group = vduse_get_vq_group, .set_vq_ready = vduse_vdpa_set_vq_ready, .get_vq_ready = vduse_vdpa_get_vq_ready, .set_vq_state = vduse_vdpa_set_vq_state, @@ -1252,12 +1266,24 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, if (config.index >= dev->vq_num) break; - if (!is_mem_zero((const char *)config.reserved, - sizeof(config.reserved))) + if (dev->api_version < VDUSE_API_VERSION_1) { + if (config.group) + break; + } else { + if (config.group >= dev->ngroups) + break; + if (dev->status & VIRTIO_CONFIG_S_DRIVER_OK) + break; + } + + if (config.reserved1 || + !is_mem_zero((const char *)config.reserved2, + sizeof(config.reserved2))) break; index = array_index_nospec(config.index, dev->vq_num); dev->vqs[index]->num_max = config.max_size; + dev->vqs[index]->group = config.group; ret = 0; break; } @@ -1737,12 +1763,20 @@ static bool features_is_valid(struct vduse_dev_config *config) return true; } -static bool vduse_validate_config(struct vduse_dev_config *config) +static bool vduse_validate_config(struct vduse_dev_config *config, + u64 api_version) { if (!is_mem_zero((const char *)config->reserved, sizeof(config->reserved))) return false; + if (api_version < VDUSE_API_VERSION_1 && config->ngroups) + return false; + + if (api_version >= VDUSE_API_VERSION_1 && + (!config->ngroups || config->ngroups > VDUSE_DEV_MAX_GROUPS)) + return false; + if (config->vq_align > PAGE_SIZE) return false; @@ -1858,6 +1892,9 @@ static int vduse_create_dev(struct vduse_dev_config *config, dev->device_features =
config->features; dev->device_id = config->device_id; dev->vendor_id = config->vendor_id; + dev->ngroups = (dev->api_version < VDUSE_API_VERSION_1) + ? 1 + : config->ngroups; dev->name = kstrdup(config->name, GFP_KERNEL); if (!dev->name) goto err_str; @@ -1936,7 +1973,7 @@ static long vduse_ioctl(struct file *file, unsigned int cmd, break; ret = -EINVAL; - if (vduse_validate_config(&config) == false) + if (!vduse_validate_config(&config, control->api_version)) break; buf = vmemdup_user(argp + size, config.config_size); @@ -2017,7 +2054,7 @@ static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name) vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev, &vduse_vdpa_config_ops, &vduse_map_ops, - 1, 1, name, true); + dev->ngroups, 1, name, true); if (IS_ERR(vdev)) return PTR_ERR(vdev); diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h index ccb92a1efce0..a3d51cf6df3a 100644 --- a/include/uapi/linux/vduse.h +++ b/include/uapi/linux/vduse.h @@ -31,6 +31,7 @@ * @features: virtio features * @vq_num: the number of virtqueues * @vq_align: the allocation alignment of virtqueue's metadata + * @ngroups: number of vq groups that VDUSE device declares * @reserved: for future use, needs to be initialized to zero * @config_size: the size of the configuration space * @config: the buffer of the configuration space @@ -45,7 +46,8 @@ struct vduse_dev_config { __u64 features; __u32 vq_num; __u32 vq_align; - __u32 reserved[13]; + __u32 ngroups; /* if VDUSE_API_VERSION >= 1 */ + __u32 reserved[12]; __u32 config_size; __u8 config[]; }; @@ -122,14 +124,18 @@ struct vduse_config_data { * struct vduse_vq_config - basic configuration of a virtqueue * @index: virtqueue index * @max_size: the max size of virtqueue - * @reserved: for future use, needs to be initialized to zero + * @reserved1: for future use, needs to be initialized to zero + * @group: virtqueue group + * @reserved2: for future use, needs to be initialized to zero * * Structure used by VDUSE_VQ_SETUP ioctl to setup a virtqueue. */ struct vduse_vq_config { __u32 index; __u16 max_size; - __u16 reserved[13]; + __u16 reserved1; + __u32 group; + __u16 reserved2[10]; }; /* From 02e3f7ffe2906033da73b7c7ea8180b131d0cdbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Mon, 19 Jan 2026 15:32:57 +0100 Subject: [PATCH 42/59] vduse: return internal vq group struct as map token MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Return the internal struct that represents the vq group as the virtqueue map token, instead of the device. This allows the map functions to access the information per group. At this moment all the virtqueues share the same vq group, which can only point to ASID 0. This change prepares the infrastructure for actual per-group address space handling. Acked-by: Jason Wang Signed-off-by: Eugenio Pérez Signed-off-by: Michael S.
Tsirkin Message-Id: <20260119143306.1818855-5-eperezma@redhat.com> --- drivers/vdpa/vdpa_user/vduse_dev.c | 100 ++++++++++++++++++++++++++--- include/linux/virtio.h | 6 +- 2 files changed, 94 insertions(+), 12 deletions(-) diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 5bffc25a266e..68290c3d9d8f 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -85,6 +86,10 @@ struct vduse_umem { struct mm_struct *mm; }; +struct vduse_vq_group { + struct vduse_dev *dev; +}; + struct vduse_dev { struct vduse_vdpa *vdev; struct device *dev; @@ -118,6 +123,7 @@ struct vduse_dev { u32 vq_align; u32 ngroups; struct vduse_umem *umem; + struct vduse_vq_group *groups; struct mutex mem_lock; unsigned int bounce_size; struct mutex domain_lock; @@ -605,6 +611,17 @@ static u32 vduse_get_vq_group(struct vdpa_device *vdpa, u16 idx) return dev->vqs[idx]->group; } +static union virtio_map vduse_get_vq_map(struct vdpa_device *vdpa, u16 idx) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + u32 vq_group = vduse_get_vq_group(vdpa, idx); + union virtio_map ret = { + .group = &dev->groups[vq_group], + }; + + return ret; +} + static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx, struct vdpa_vq_state *state) { @@ -825,6 +842,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = { .get_vq_affinity = vduse_vdpa_get_vq_affinity, .reset = vduse_vdpa_reset, .set_map = vduse_vdpa_set_map, + .get_vq_map = vduse_get_vq_map, .free = vduse_vdpa_free, }; @@ -832,7 +850,14 @@ static void vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir) { - struct vduse_iova_domain *domain = token.iova_domain; + struct vduse_dev *vdev; + struct vduse_iova_domain *domain; + + if (!token.group) + return; + + vdev = token.group->dev; + domain = vdev->domain; vduse_domain_sync_single_for_device(domain, dma_addr, size, dir); } @@ -841,7 +866,14 @@ static void vduse_dev_sync_single_for_cpu(union virtio_map token, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir) { - struct vduse_iova_domain *domain = token.iova_domain; + struct vduse_dev *vdev; + struct vduse_iova_domain *domain; + + if (!token.group) + return; + + vdev = token.group->dev; + domain = vdev->domain; vduse_domain_sync_single_for_cpu(domain, dma_addr, size, dir); } @@ -851,7 +883,14 @@ static dma_addr_t vduse_dev_map_page(union virtio_map token, struct page *page, enum dma_data_direction dir, unsigned long attrs) { - struct vduse_iova_domain *domain = token.iova_domain; + struct vduse_dev *vdev; + struct vduse_iova_domain *domain; + + if (!token.group) + return DMA_MAPPING_ERROR; + + vdev = token.group->dev; + domain = vdev->domain; return vduse_domain_map_page(domain, page, offset, size, dir, attrs); } @@ -860,7 +899,14 @@ static void vduse_dev_unmap_page(union virtio_map token, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - struct vduse_iova_domain *domain = token.iova_domain; + struct vduse_dev *vdev; + struct vduse_iova_domain *domain; + + if (!token.group) + return; + + vdev = token.group->dev; + domain = vdev->domain; return vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs); } @@ -868,11 +914,17 @@ static void vduse_dev_unmap_page(union virtio_map token, dma_addr_t dma_addr, static void *vduse_dev_alloc_coherent(union virtio_map token, size_t size, dma_addr_t *dma_addr, 
gfp_t flag) { - struct vduse_iova_domain *domain = token.iova_domain; + struct vduse_dev *vdev; + struct vduse_iova_domain *domain; unsigned long iova; void *addr; *dma_addr = DMA_MAPPING_ERROR; + if (!token.group) + return NULL; + + vdev = token.group->dev; + domain = vdev->domain; addr = vduse_domain_alloc_coherent(domain, size, (dma_addr_t *)&iova, flag); if (!addr) @@ -887,14 +939,28 @@ static void vduse_dev_free_coherent(union virtio_map token, size_t size, void *vaddr, dma_addr_t dma_addr, unsigned long attrs) { - struct vduse_iova_domain *domain = token.iova_domain; + struct vduse_dev *vdev; + struct vduse_iova_domain *domain; + + if (!token.group) + return; + + vdev = token.group->dev; + domain = vdev->domain; vduse_domain_free_coherent(domain, size, vaddr, dma_addr, attrs); } static bool vduse_dev_need_sync(union virtio_map token, dma_addr_t dma_addr) { - struct vduse_iova_domain *domain = token.iova_domain; + struct vduse_dev *vdev; + struct vduse_iova_domain *domain; + + if (!token.group) + return false; + + vdev = token.group->dev; + domain = vdev->domain; return dma_addr < domain->bounce_size; } @@ -908,7 +974,14 @@ static int vduse_dev_mapping_error(union virtio_map token, dma_addr_t dma_addr) static size_t vduse_dev_max_mapping_size(union virtio_map token) { - struct vduse_iova_domain *domain = token.iova_domain; + struct vduse_dev *vdev; + struct vduse_iova_domain *domain; + + if (!token.group) + return 0; + + vdev = token.group->dev; + domain = vdev->domain; return domain->bounce_size; } @@ -1726,6 +1799,7 @@ static int vduse_destroy_dev(char *name) if (dev->domain) vduse_domain_destroy(dev->domain); kfree(dev->name); + kfree(dev->groups); vduse_dev_destroy(dev); module_put(THIS_MODULE); @@ -1895,6 +1969,13 @@ static int vduse_create_dev(struct vduse_dev_config *config, dev->ngroups = (dev->api_version < VDUSE_API_VERSION_1) ? 1 : config->ngroups; + dev->groups = kcalloc(dev->ngroups, sizeof(dev->groups[0]), + GFP_KERNEL); + if (!dev->groups) + goto err_vq_groups; + for (u32 i = 0; i < dev->ngroups; ++i) + dev->groups[i].dev = dev; + dev->name = kstrdup(config->name, GFP_KERNEL); if (!dev->name) goto err_str; @@ -1931,6 +2012,8 @@ err_dev: err_idr: kfree(dev->name); err_str: + kfree(dev->groups); +err_vq_groups: vduse_dev_destroy(dev); err: return ret; @@ -2092,7 +2175,6 @@ static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, return -ENOMEM; } - dev->vdev->vdpa.vmap.iova_domain = dev->domain; ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num); if (ret) { put_device(&dev->vdev->vdpa.dev); diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 63bb05ece8c5..3bbc4cb6a672 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -43,13 +43,13 @@ struct virtqueue { void *priv; }; -struct vduse_iova_domain; +struct vduse_vq_group; union virtio_map { /* Device that performs DMA */ struct device *dma_dev; - /* VDUSE specific mapping data */ - struct vduse_iova_domain *iova_domain; + /* VDUSE specific virtqueue group for doing map */ + struct vduse_vq_group *group; }; int virtqueue_add_outbuf(struct virtqueue *vq, From 0d215afdc8199ef9702567778bbc781449f48e50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Mon, 19 Jan 2026 15:32:58 +0100 Subject: [PATCH 43/59] vdpa: document set_group_asid thread safety MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document that the function races with the check of DRIVER_OK. 
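For illustration, the documented contract roughly amounts to the compilable toy model below: the DRIVER_OK check and the ASID change must happen under the same lock that also serializes set_status(). All names here are invented stand-ins (the next patch in the series implements this very check in vhost_vdpa's ioctl path).

#include <pthread.h>

#define DRIVER_OK 0x4	/* value of VIRTIO_CONFIG_S_DRIVER_OK */

static pthread_mutex_t status_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned char status;		/* written by set_status()   */
static unsigned int group_asid[4];	/* written by set_group_asid */

int set_group_asid_checked(unsigned int group, unsigned int asid)
{
	int ret = 0;

	pthread_mutex_lock(&status_lock);	/* also held by set_status() */
	if (status & DRIVER_OK)
		ret = -1;	/* too late: the device is already live */
	else
		group_asid[group] = asid;
	pthread_mutex_unlock(&status_lock);
	return ret;
}

Without the shared lock, the device could transition to DRIVER_OK between the check and the assignment, which is exactly the race this documentation warns about.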
Acked-by: Jason Wang Signed-off-by: Eugenio Pérez Signed-off-by: Michael S. Tsirkin Message-Id: <20260119143306.1818855-6-eperezma@redhat.com> --- include/linux/vdpa.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 4cf21d6e9cfd..2bfe3baa63f4 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -312,7 +312,9 @@ struct vdpa_map_file { * @idx: virtqueue index * Returns the affinity mask * @set_group_asid: Set address space identifier for a - * virtqueue group (optional) + * virtqueue group (optional). Caller must + * prevent this from being executed concurrently + * with set_status. * @vdev: vdpa device * @group: virtqueue group * @asid: address space id for this group From 3543b04a4ea3de78bdc420350d21c538efd6116c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Mon, 19 Jan 2026 15:32:59 +0100 Subject: [PATCH 44/59] vhost: forbid change vq groups ASID if DRIVER_OK is set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only vdpa_sim supports it. Forbid this behaviour as there is no use for it right now; we can always enable it in the future with a feature flag. Acked-by: Jason Wang Signed-off-by: Eugenio Pérez Signed-off-by: Michael S. Tsirkin Message-Id: <20260119143306.1818855-7-eperezma@redhat.com> --- drivers/vhost/vdpa.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 9d25b735b43d..3f0184d42075 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -682,6 +682,8 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, return -EFAULT; if (idx >= vdpa->ngroups || s.num >= vdpa->nas) return -EINVAL; + if (ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK) + return -EBUSY; if (!ops->set_group_asid) return -EOPNOTSUPP; return ops->set_group_asid(vdpa, idx, s.num); From 3e2ddda6f4cb9e25e2e0a24033e13e347d6ce952 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Mon, 19 Jan 2026 15:33:00 +0100 Subject: [PATCH 45/59] vduse: refactor vdpa_dev_add for goto err handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The next patches introduce more error paths in this function. Refactor it so they can be accommodated through gotos. Acked-by: Jason Wang Reviewed-by: Xie Yongji Signed-off-by: Eugenio Pérez Signed-off-by: Michael S.
Tsirkin Message-Id: <20260119143306.1818855-8-eperezma@redhat.com> --- drivers/vdpa/vdpa_user/vduse_dev.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 68290c3d9d8f..43851b0711ac 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -2171,21 +2171,27 @@ static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, dev->bounce_size); mutex_unlock(&dev->domain_lock); if (!dev->domain) { - put_device(&dev->vdev->vdpa.dev); - return -ENOMEM; + ret = -ENOMEM; + goto domain_err; } ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num); if (ret) { - put_device(&dev->vdev->vdpa.dev); - mutex_lock(&dev->domain_lock); - vduse_domain_destroy(dev->domain); - dev->domain = NULL; - mutex_unlock(&dev->domain_lock); - return ret; + goto register_err; } return 0; + +register_err: + mutex_lock(&dev->domain_lock); + vduse_domain_destroy(dev->domain); + dev->domain = NULL; + mutex_unlock(&dev->domain_lock); + +domain_err: + put_device(&dev->vdev->vdpa.dev); + + return ret; } static void vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev) From 766e1749c0ef6a09651be9b8a8283d508c322b58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Mon, 19 Jan 2026 15:33:01 +0100 Subject: [PATCH 46/59] vduse: remove unused vaddr parameter of vduse_domain_free_coherent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We will modify the function in the next patches, so let's clean it up first. Signed-off-by: Eugenio Pérez Signed-off-by: Michael S. Tsirkin Message-Id: <20260119143306.1818855-9-eperezma@redhat.com> --- drivers/vdpa/vdpa_user/iova_domain.c | 3 +-- drivers/vdpa/vdpa_user/iova_domain.h | 3 +-- drivers/vdpa/vdpa_user/vduse_dev.c | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c index 4352b5cf74f0..309cd5a039d1 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.c +++ b/drivers/vdpa/vdpa_user/iova_domain.c @@ -528,8 +528,7 @@ err: } void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size, - void *vaddr, dma_addr_t dma_addr, - unsigned long attrs) + dma_addr_t dma_addr, unsigned long attrs) { struct iova_domain *iovad = &domain->consistent_iovad; struct vhost_iotlb_map *map; diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h index a923971a64f5..081f06c52cdc 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.h +++ b/drivers/vdpa/vdpa_user/iova_domain.h @@ -70,8 +70,7 @@ void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain, gfp_t flag); void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size, - void *vaddr, dma_addr_t dma_addr, - unsigned long attrs); + dma_addr_t dma_addr, unsigned long attrs); void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain); diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 43851b0711ac..0e3cf5128ad0 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -948,7 +948,7 @@ static void vduse_dev_free_coherent(union virtio_map token, size_t size, vdev = token.group->dev; domain = vdev->domain; - vduse_domain_free_coherent(domain, size, vaddr, dma_addr, attrs); + vduse_domain_free_coherent(domain, size, dma_addr, attrs); } static bool vduse_dev_need_sync(union virtio_map token, dma_addr_t
dma_addr) From 489d76520612abf9a4ede4344349105406c91a73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Mon, 19 Jan 2026 15:33:02 +0100 Subject: [PATCH 47/59] vduse: take out allocations from vduse_dev_alloc_coherent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function vduse_dev_alloc_coherent will be called under a rwlock in the next patches. Move the page allocation out of the lock to avoid increasing its failure rate. Acked-by: Jason Wang Signed-off-by: Eugenio Pérez Signed-off-by: Michael S. Tsirkin Message-Id: <20260119143306.1818855-10-eperezma@redhat.com> --- drivers/vdpa/vdpa_user/iova_domain.c | 24 +++++++----------------- drivers/vdpa/vdpa_user/iova_domain.h | 5 ++--- drivers/vdpa/vdpa_user/vduse_dev.c | 17 +++++++++++------ 3 files changed, 20 insertions(+), 26 deletions(-) diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c index 309cd5a039d1..0a9f668467a8 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.c +++ b/drivers/vdpa/vdpa_user/iova_domain.c @@ -493,17 +493,15 @@ void vduse_domain_unmap_page(struct vduse_iova_domain *domain, vduse_domain_free_iova(iovad, dma_addr, size); } -void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain, - size_t size, dma_addr_t *dma_addr, - gfp_t flag) +dma_addr_t vduse_domain_alloc_coherent(struct vduse_iova_domain *domain, + size_t size, void *orig) { struct iova_domain *iovad = &domain->consistent_iovad; unsigned long limit = domain->iova_limit; dma_addr_t iova = vduse_domain_alloc_iova(iovad, size, limit); - void *orig = alloc_pages_exact(size, flag); - if (!iova || !orig) - goto err; + if (!iova) + return DMA_MAPPING_ERROR; spin_lock(&domain->iotlb_lock); if (vduse_iotlb_add_range(domain, (u64)iova, (u64)iova + size - 1, @@ -514,17 +512,12 @@ void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain, } spin_unlock(&domain->iotlb_lock); - *dma_addr = iova; + return iova; - return orig; err: - *dma_addr = DMA_MAPPING_ERROR; - if (orig) - free_pages_exact(orig, size); - if (iova) - vduse_domain_free_iova(iovad, iova, size); + vduse_domain_free_iova(iovad, iova, size); - return NULL; + return DMA_MAPPING_ERROR; } void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size, @@ -533,7 +526,6 @@ void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size, struct iova_domain *iovad = &domain->consistent_iovad; struct vhost_iotlb_map *map; struct vdpa_map_file *map_file; - phys_addr_t pa; spin_lock(&domain->iotlb_lock); map = vhost_iotlb_itree_first(domain->iotlb, (u64)dma_addr, @@ -545,12 +537,10 @@ void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size, map_file = (struct vdpa_map_file *)map->opaque; fput(map_file->file); kfree(map_file); - pa = map->addr; vhost_iotlb_map_free(domain->iotlb, map); spin_unlock(&domain->iotlb_lock); vduse_domain_free_iova(iovad, dma_addr, size); - free_pages_exact(phys_to_virt(pa), size); } static vm_fault_t vduse_domain_mmap_fault(struct vm_fault *vmf) diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h index 081f06c52cdc..e50e55d1396f 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.h +++ b/drivers/vdpa/vdpa_user/iova_domain.h @@ -65,9 +65,8 @@ void vduse_domain_unmap_page(struct vduse_iova_domain *domain, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, unsigned long attrs); -void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain, - size_t size, dma_addr_t *dma_addr, - gfp_t flag);
+dma_addr_t vduse_domain_alloc_coherent(struct vduse_iova_domain *domain, + size_t size, void *orig); void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size, dma_addr_t dma_addr, unsigned long attrs); diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 0e3cf5128ad0..6dba1f3224d9 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -916,23 +916,27 @@ static void *vduse_dev_alloc_coherent(union virtio_map token, size_t size, { struct vduse_dev *vdev; struct vduse_iova_domain *domain; - unsigned long iova; void *addr; *dma_addr = DMA_MAPPING_ERROR; if (!token.group) return NULL; - vdev = token.group->dev; - domain = vdev->domain; - addr = vduse_domain_alloc_coherent(domain, size, - (dma_addr_t *)&iova, flag); + addr = alloc_pages_exact(size, flag); if (!addr) return NULL; - *dma_addr = (dma_addr_t)iova; + vdev = token.group->dev; + domain = vdev->domain; + *dma_addr = vduse_domain_alloc_coherent(domain, size, addr); + if (*dma_addr == DMA_MAPPING_ERROR) + goto err; return addr; + +err: + free_pages_exact(addr, size); + return NULL; } static void vduse_dev_free_coherent(union virtio_map token, size_t size, @@ -949,6 +953,7 @@ static void vduse_dev_free_coherent(union virtio_map token, size_t size, domain = vdev->domain; vduse_domain_free_coherent(domain, size, dma_addr, attrs); + free_pages_exact(vaddr, size); } static bool vduse_dev_need_sync(union virtio_map token, dma_addr_t dma_addr) From f3dc3a8a3ea71d4758b0f63affceb18398cf79b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Mon, 19 Jan 2026 15:33:03 +0100 Subject: [PATCH 48/59] vduse: merge tree search logic of IOTLB_GET_FD and IOTLB_GET_INFO ioctls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The next patch adds a new ioctl with an ASID member per entry. Abstract these two so it can be built on top easily. Acked-by: Jason Wang Signed-off-by: Eugenio Pérez Signed-off-by: Michael S.
Tsirkin Message-Id: <20260119143306.1818855-11-eperezma@redhat.com> --- drivers/vdpa/vdpa_user/vduse_dev.c | 102 ++++++++++++++++------------- 1 file changed, 56 insertions(+), 46 deletions(-) diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 6dba1f3224d9..d658f3e1cebf 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -1243,6 +1243,51 @@ static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq) vq->irq_effective_cpu = curr_cpu; } +static int vduse_dev_iotlb_entry(struct vduse_dev *dev, + struct vduse_iotlb_entry *entry, + struct file **f, uint64_t *capability) +{ + int r = -EINVAL; + struct vhost_iotlb_map *map; + + if (entry->start > entry->last) + return -EINVAL; + + mutex_lock(&dev->domain_lock); + if (!dev->domain) + goto out; + + spin_lock(&dev->domain->iotlb_lock); + map = vhost_iotlb_itree_first(dev->domain->iotlb, entry->start, + entry->last); + if (map) { + if (f) { + const struct vdpa_map_file *map_file; + + map_file = (struct vdpa_map_file *)map->opaque; + entry->offset = map_file->offset; + *f = get_file(map_file->file); + } + entry->start = map->start; + entry->last = map->last; + entry->perm = map->perm; + if (capability) { + *capability = 0; + + if (dev->domain->bounce_map && map->start == 0 && + map->last == dev->domain->bounce_size - 1) + *capability |= VDUSE_IOVA_CAP_UMEM; + } + + r = 0; + } + spin_unlock(&dev->domain->iotlb_lock); + +out: + mutex_unlock(&dev->domain_lock); + return r; +} + static long vduse_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -1256,36 +1301,16 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case VDUSE_IOTLB_GET_FD: { struct vduse_iotlb_entry entry; - struct vhost_iotlb_map *map; - struct vdpa_map_file *map_file; struct file *f = NULL; ret = -EFAULT; if (copy_from_user(&entry, argp, sizeof(entry))) break; - ret = -EINVAL; - if (entry.start > entry.last) + ret = vduse_dev_iotlb_entry(dev, &entry, &f, NULL); + if (ret) break; - mutex_lock(&dev->domain_lock); - if (!dev->domain) { - mutex_unlock(&dev->domain_lock); - break; - } - spin_lock(&dev->domain->iotlb_lock); - map = vhost_iotlb_itree_first(dev->domain->iotlb, - entry.start, entry.last); - if (map) { - map_file = (struct vdpa_map_file *)map->opaque; - f = get_file(map_file->file); - entry.offset = map_file->offset; - entry.start = map->start; - entry.last = map->last; - entry.perm = map->perm; - } - spin_unlock(&dev->domain->iotlb_lock); - mutex_unlock(&dev->domain_lock); ret = -EINVAL; if (!f) break; @@ -1475,41 +1500,26 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, } case VDUSE_IOTLB_GET_INFO: { struct vduse_iova_info info; - struct vhost_iotlb_map *map; + struct vduse_iotlb_entry entry; ret = -EFAULT; if (copy_from_user(&info, argp, sizeof(info))) break; - ret = -EINVAL; - if (info.start > info.last) - break; - if (!is_mem_zero((const char *)info.reserved, sizeof(info.reserved))) break; - mutex_lock(&dev->domain_lock); - if (!dev->domain) { - mutex_unlock(&dev->domain_lock); - break; - } - spin_lock(&dev->domain->iotlb_lock); - map = vhost_iotlb_itree_first(dev->domain->iotlb, - info.start, info.last); - if (map) { - info.start = map->start; - info.last = map->last; - info.capability = 0; - if (dev->domain->bounce_map && map->start == 0 && - map->last == dev->domain->bounce_size - 1) - info.capability |= VDUSE_IOVA_CAP_UMEM; - } - spin_unlock(&dev->domain->iotlb_lock); - mutex_unlock(&dev->domain_lock); - if (!map) 
+ entry.start = info.start; + entry.last = info.last; ret = vduse_dev_iotlb_entry(dev, &entry, NULL, + &info.capability); if (ret < 0) break; + info.start = entry.start; + info.last = entry.last; + ret = -EFAULT; if (copy_to_user(argp, &info, sizeof(info))) break; From 079212f6877e5d07308c8998a8fbc7539ca3f8f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Mon, 19 Jan 2026 15:33:04 +0100 Subject: [PATCH 49/59] vduse: add vq group asid support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for assigning Address Space Identifiers (ASIDs) to each VQ group. This enables mapping each group into a distinct memory space. The vq group to ASID association is now protected by a rwlock, but the mutex domain_lock keeps protecting the domains of all ASIDs, as some operations, like the ones related to the bounce buffer size, still require locking all the ASIDs. Acked-by: Jason Wang Signed-off-by: Eugenio Pérez Signed-off-by: Michael S. Tsirkin Message-Id: <20260119143306.1818855-12-eperezma@redhat.com> --- drivers/vdpa/vdpa_user/vduse_dev.c | 385 +++++++++++++++++++---------- include/uapi/linux/vduse.h | 66 ++++- 2 files changed, 315 insertions(+), 136 deletions(-) diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index d658f3e1cebf..2727c0c26003 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -9,6 +9,7 @@ */ #include "linux/virtio_net.h" +#include #include #include #include @@ -41,6 +42,7 @@ #define VDUSE_DEV_MAX (1U << MINORBITS) #define VDUSE_DEV_MAX_GROUPS 0xffff +#define VDUSE_DEV_MAX_AS 0xffff #define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024) #define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024) #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024) @@ -86,7 +88,15 @@ struct vduse_umem { struct mm_struct *mm; }; +struct vduse_as { + struct vduse_iova_domain *domain; + struct vduse_umem *umem; + struct mutex mem_lock; +}; + struct vduse_vq_group { + rwlock_t as_lock; + struct vduse_as *as; /* Protected by as_lock */ struct vduse_dev *dev; }; @@ -94,7 +104,7 @@ struct vduse_dev { struct vduse_vdpa *vdev; struct device *dev; struct vduse_virtqueue **vqs; - struct vduse_iova_domain *domain; + struct vduse_as *as; char *name; struct mutex lock; spinlock_t msg_lock; @@ -122,9 +132,8 @@ struct vduse_dev { u32 vq_num; u32 vq_align; u32 ngroups; - struct vduse_umem *umem; + u32 nas; struct vduse_vq_group *groups; - struct mutex mem_lock; unsigned int bounce_size; struct mutex domain_lock; }; @@ -314,7 +323,7 @@ static int vduse_dev_set_status(struct vduse_dev *dev, u8 status) return vduse_dev_msg_sync(dev, &msg); } -static int vduse_dev_update_iotlb(struct vduse_dev *dev, +static int vduse_dev_update_iotlb(struct vduse_dev *dev, u32 asid, u64 start, u64 last) { struct vduse_dev_msg msg = { 0 }; @@ -323,8 +332,14 @@ static int vduse_dev_update_iotlb(struct vduse_dev *dev, return -EINVAL; msg.req.type = VDUSE_UPDATE_IOTLB; - msg.req.iova.start = start; - msg.req.iova.last = last; + if (dev->api_version < VDUSE_API_VERSION_1) { + msg.req.iova.start = start; + msg.req.iova.last = last; + } else { + msg.req.iova_v2.start = start; + msg.req.iova_v2.last = last; + msg.req.iova_v2.asid = asid; + } return vduse_dev_msg_sync(dev, &msg); } @@ -439,11 +454,14 @@ static __poll_t vduse_dev_poll(struct file *file, poll_table *wait) static void vduse_dev_reset(struct vduse_dev *dev) { int i; - struct vduse_iova_domain *domain = dev->domain; /* The coherent mappings are handled in
vduse_dev_free_coherent() */ - if (domain && domain->bounce_map) - vduse_domain_reset_bounce_map(domain); + for (i = 0; i < dev->nas; i++) { + struct vduse_iova_domain *domain = dev->as[i].domain; + + if (domain && domain->bounce_map) + vduse_domain_reset_bounce_map(domain); + } down_write(&dev->rwsem); @@ -622,6 +640,42 @@ static union virtio_map vduse_get_vq_map(struct vdpa_device *vdpa, u16 idx) return ret; } +DEFINE_GUARD(vq_group_as_read_lock, struct vduse_vq_group *, + if (_T->dev->nas > 1) + read_lock(&_T->as_lock), + if (_T->dev->nas > 1) + read_unlock(&_T->as_lock)) + +DEFINE_GUARD(vq_group_as_write_lock, struct vduse_vq_group *, + if (_T->dev->nas > 1) + write_lock(&_T->as_lock), + if (_T->dev->nas > 1) + write_unlock(&_T->as_lock)) + +static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group, + unsigned int asid) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_dev_msg msg = { 0 }; + int r; + + if (dev->api_version < VDUSE_API_VERSION_1) + return -EINVAL; + + msg.req.type = VDUSE_SET_VQ_GROUP_ASID; + msg.req.vq_group_asid.group = group; + msg.req.vq_group_asid.asid = asid; + + r = vduse_dev_msg_sync(dev, &msg); + if (r < 0) + return r; + + guard(vq_group_as_write_lock)(&dev->groups[group]); + dev->groups[group].as = &dev->as[asid]; + + return 0; +} + static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx, struct vdpa_vq_state *state) { @@ -793,13 +847,13 @@ static int vduse_vdpa_set_map(struct vdpa_device *vdpa, struct vduse_dev *dev = vdpa_to_vduse(vdpa); int ret; - ret = vduse_domain_set_map(dev->domain, iotlb); + ret = vduse_domain_set_map(dev->as[asid].domain, iotlb); if (ret) return ret; - ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX); + ret = vduse_dev_update_iotlb(dev, asid, 0ULL, ULLONG_MAX); if (ret) { - vduse_domain_clear_map(dev->domain, iotlb); + vduse_domain_clear_map(dev->as[asid].domain, iotlb); return ret; } @@ -842,6 +896,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = { .get_vq_affinity = vduse_vdpa_get_vq_affinity, .reset = vduse_vdpa_reset, .set_map = vduse_vdpa_set_map, + .set_group_asid = vduse_set_group_asid, .get_vq_map = vduse_get_vq_map, .free = vduse_vdpa_free, }; @@ -850,15 +905,13 @@ static void vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir) { - struct vduse_dev *vdev; struct vduse_iova_domain *domain; if (!token.group) return; - vdev = token.group->dev; - domain = vdev->domain; - + guard(vq_group_as_read_lock)(token.group); + domain = token.group->as->domain; vduse_domain_sync_single_for_device(domain, dma_addr, size, dir); } @@ -866,15 +919,13 @@ static void vduse_dev_sync_single_for_cpu(union virtio_map token, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir) { - struct vduse_dev *vdev; struct vduse_iova_domain *domain; if (!token.group) return; - vdev = token.group->dev; - domain = vdev->domain; - + guard(vq_group_as_read_lock)(token.group); + domain = token.group->as->domain; vduse_domain_sync_single_for_cpu(domain, dma_addr, size, dir); } @@ -883,15 +934,13 @@ static dma_addr_t vduse_dev_map_page(union virtio_map token, struct page *page, enum dma_data_direction dir, unsigned long attrs) { - struct vduse_dev *vdev; struct vduse_iova_domain *domain; if (!token.group) return DMA_MAPPING_ERROR; - vdev = token.group->dev; - domain = vdev->domain; - + guard(vq_group_as_read_lock)(token.group); + domain = token.group->as->domain; return vduse_domain_map_page(domain, page, offset, size, dir, 
attrs); } @@ -899,23 +948,19 @@ static void vduse_dev_unmap_page(union virtio_map token, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - struct vduse_dev *vdev; struct vduse_iova_domain *domain; if (!token.group) return; - vdev = token.group->dev; - domain = vdev->domain; - - return vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs); + guard(vq_group_as_read_lock)(token.group); + domain = token.group->as->domain; + vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs); } static void *vduse_dev_alloc_coherent(union virtio_map token, size_t size, dma_addr_t *dma_addr, gfp_t flag) { - struct vduse_dev *vdev; - struct vduse_iova_domain *domain; void *addr; *dma_addr = DMA_MAPPING_ERROR; @@ -926,11 +971,15 @@ static void *vduse_dev_alloc_coherent(union virtio_map token, size_t size, if (!addr) return NULL; - vdev = token.group->dev; - domain = vdev->domain; - *dma_addr = vduse_domain_alloc_coherent(domain, size, addr); - if (*dma_addr == DMA_MAPPING_ERROR) - goto err; + { + struct vduse_iova_domain *domain; + + guard(vq_group_as_read_lock)(token.group); + domain = token.group->as->domain; + *dma_addr = vduse_domain_alloc_coherent(domain, size, addr); + if (*dma_addr == DMA_MAPPING_ERROR) + goto err; + } return addr; @@ -943,31 +992,27 @@ static void vduse_dev_free_coherent(union virtio_map token, size_t size, void *vaddr, dma_addr_t dma_addr, unsigned long attrs) { - struct vduse_dev *vdev; - struct vduse_iova_domain *domain; - if (!token.group) return; - vdev = token.group->dev; - domain = vdev->domain; + { + struct vduse_iova_domain *domain; + + guard(vq_group_as_read_lock)(token.group); + domain = token.group->as->domain; + vduse_domain_free_coherent(domain, size, dma_addr, attrs); + } - vduse_domain_free_coherent(domain, size, dma_addr, attrs); free_pages_exact(vaddr, size); } static bool vduse_dev_need_sync(union virtio_map token, dma_addr_t dma_addr) { - struct vduse_dev *vdev; - struct vduse_iova_domain *domain; - if (!token.group) return false; - vdev = token.group->dev; - domain = vdev->domain; - - return dma_addr < domain->bounce_size; + guard(vq_group_as_read_lock)(token.group); + return dma_addr < token.group->as->domain->bounce_size; } static int vduse_dev_mapping_error(union virtio_map token, dma_addr_t dma_addr) @@ -979,16 +1024,11 @@ static int vduse_dev_mapping_error(union virtio_map token, dma_addr_t dma_addr) static size_t vduse_dev_max_mapping_size(union virtio_map token) { - struct vduse_dev *vdev; - struct vduse_iova_domain *domain; - if (!token.group) return 0; - vdev = token.group->dev; - domain = vdev->domain; - - return domain->bounce_size; + guard(vq_group_as_read_lock)(token.group); + return token.group->as->domain->bounce_size; } static const struct virtio_map_ops vduse_map_ops = { @@ -1128,39 +1168,40 @@ unlock: return ret; } -static int vduse_dev_dereg_umem(struct vduse_dev *dev, +static int vduse_dev_dereg_umem(struct vduse_dev *dev, u32 asid, u64 iova, u64 size) { int ret; - mutex_lock(&dev->mem_lock); + mutex_lock(&dev->as[asid].mem_lock); ret = -ENOENT; - if (!dev->umem) + if (!dev->as[asid].umem) goto unlock; ret = -EINVAL; - if (!dev->domain) + if (!dev->as[asid].domain) goto unlock; - if (dev->umem->iova != iova || size != dev->domain->bounce_size) + if (dev->as[asid].umem->iova != iova || + size != dev->as[asid].domain->bounce_size) goto unlock; - vduse_domain_remove_user_bounce_pages(dev->domain); - unpin_user_pages_dirty_lock(dev->umem->pages, - dev->umem->npages, true); - 
atomic64_sub(dev->umem->npages, &dev->umem->mm->pinned_vm); - mmdrop(dev->umem->mm); - vfree(dev->umem->pages); - kfree(dev->umem); - dev->umem = NULL; + vduse_domain_remove_user_bounce_pages(dev->as[asid].domain); + unpin_user_pages_dirty_lock(dev->as[asid].umem->pages, + dev->as[asid].umem->npages, true); + atomic64_sub(dev->as[asid].umem->npages, &dev->as[asid].umem->mm->pinned_vm); + mmdrop(dev->as[asid].umem->mm); + vfree(dev->as[asid].umem->pages); + kfree(dev->as[asid].umem); + dev->as[asid].umem = NULL; ret = 0; unlock: - mutex_unlock(&dev->mem_lock); + mutex_unlock(&dev->as[asid].mem_lock); return ret; } static int vduse_dev_reg_umem(struct vduse_dev *dev, - u64 iova, u64 uaddr, u64 size) + u32 asid, u64 iova, u64 uaddr, u64 size) { struct page **page_list = NULL; struct vduse_umem *umem = NULL; @@ -1168,14 +1209,14 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev, unsigned long npages, lock_limit; int ret; - if (!dev->domain || !dev->domain->bounce_map || - size != dev->domain->bounce_size || + if (!dev->as[asid].domain || !dev->as[asid].domain->bounce_map || + size != dev->as[asid].domain->bounce_size || iova != 0 || uaddr & ~PAGE_MASK) return -EINVAL; - mutex_lock(&dev->mem_lock); + mutex_lock(&dev->as[asid].mem_lock); ret = -EEXIST; - if (dev->umem) + if (dev->as[asid].umem) goto unlock; ret = -ENOMEM; @@ -1199,7 +1240,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev, goto out; } - ret = vduse_domain_add_user_bounce_pages(dev->domain, + ret = vduse_domain_add_user_bounce_pages(dev->as[asid].domain, page_list, pinned); if (ret) goto out; @@ -1212,7 +1253,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev, umem->mm = current->mm; mmgrab(current->mm); - dev->umem = umem; + dev->as[asid].umem = umem; out: if (ret && pinned > 0) unpin_user_pages(page_list, pinned); @@ -1223,7 +1264,7 @@ unlock: vfree(page_list); kfree(umem); } - mutex_unlock(&dev->mem_lock); + mutex_unlock(&dev->as[asid].mem_lock); return ret; } @@ -1244,44 +1285,47 @@ static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq) } static int vduse_dev_iotlb_entry(struct vduse_dev *dev, - struct vduse_iotlb_entry *entry, + struct vduse_iotlb_entry_v2 *entry, struct file **f, uint64_t *capability) { + u32 asid; int r = -EINVAL; struct vhost_iotlb_map *map; - if (entry->start > entry->last) + if (entry->v1.start > entry->v1.last || entry->asid >= dev->nas) return -EINVAL; + asid = array_index_nospec(entry->asid, dev->nas); mutex_lock(&dev->domain_lock); - if (!dev->domain) + + if (!dev->as[asid].domain) goto out; - spin_lock(&dev->domain->iotlb_lock); - map = vhost_iotlb_itree_first(dev->domain->iotlb, entry->start, - entry->last); + spin_lock(&dev->as[asid].domain->iotlb_lock); + map = vhost_iotlb_itree_first(dev->as[asid].domain->iotlb, + entry->v1.start, entry->v1.last); if (map) { if (f) { const struct vdpa_map_file *map_file; map_file = (struct vdpa_map_file *)map->opaque; - entry->offset = map_file->offset; + entry->v1.offset = map_file->offset; *f = get_file(map_file->file); } - entry->start = map->start; - entry->last = map->last; - entry->perm = map->perm; + entry->v1.start = map->start; + entry->v1.last = map->last; + entry->v1.perm = map->perm; if (capability) { *capability = 0; - if (dev->domain->bounce_map && map->start == 0 && - map->last == dev->domain->bounce_size - 1) + if (dev->as[asid].domain->bounce_map && map->start == 0 && + map->last == dev->as[asid].domain->bounce_size - 1) *capability |= VDUSE_IOVA_CAP_UMEM; } r = 0; } - spin_unlock(&dev->domain->iotlb_lock); + 
spin_unlock(&dev->as[asid].domain->iotlb_lock); out: mutex_unlock(&dev->domain_lock); @@ -1299,12 +1343,29 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, return -EPERM; switch (cmd) { - case VDUSE_IOTLB_GET_FD: { - struct vduse_iotlb_entry entry; + case VDUSE_IOTLB_GET_FD: + case VDUSE_IOTLB_GET_FD2: { + struct vduse_iotlb_entry_v2 entry = {0}; struct file *f = NULL; + ret = -ENOIOCTLCMD; + if (dev->api_version < VDUSE_API_VERSION_1 && + cmd == VDUSE_IOTLB_GET_FD2) + break; + ret = -EFAULT; - if (copy_from_user(&entry, argp, sizeof(entry))) + if (cmd == VDUSE_IOTLB_GET_FD2) { + if (copy_from_user(&entry, argp, sizeof(entry))) + break; + } else { + if (copy_from_user(&entry.v1, argp, + sizeof(entry.v1))) + break; + } + + ret = -EINVAL; + if (!is_mem_zero((const char *)entry.reserved, + sizeof(entry.reserved))) break; ret = vduse_dev_iotlb_entry(dev, &entry, &f, NULL); @@ -1315,12 +1376,19 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, if (!f) break; - ret = -EFAULT; - if (copy_to_user(argp, &entry, sizeof(entry))) { + if (cmd == VDUSE_IOTLB_GET_FD2) + ret = copy_to_user(argp, &entry, + sizeof(entry)); + else + ret = copy_to_user(argp, &entry.v1, + sizeof(entry.v1)); + + if (ret) { + ret = -EFAULT; fput(f); break; } - ret = receive_fd(f, NULL, perm_to_file_flags(entry.perm)); + ret = receive_fd(f, NULL, perm_to_file_flags(entry.v1.perm)); fput(f); break; } @@ -1465,6 +1533,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, } case VDUSE_IOTLB_REG_UMEM: { struct vduse_iova_umem umem; + u32 asid; ret = -EFAULT; if (copy_from_user(&umem, argp, sizeof(umem))) @@ -1472,17 +1541,21 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, ret = -EINVAL; if (!is_mem_zero((const char *)umem.reserved, - sizeof(umem.reserved))) + sizeof(umem.reserved)) || + (dev->api_version < VDUSE_API_VERSION_1 && + umem.asid != 0) || umem.asid >= dev->nas) break; mutex_lock(&dev->domain_lock); - ret = vduse_dev_reg_umem(dev, umem.iova, + asid = array_index_nospec(umem.asid, dev->nas); + ret = vduse_dev_reg_umem(dev, asid, umem.iova, umem.uaddr, umem.size); mutex_unlock(&dev->domain_lock); break; } case VDUSE_IOTLB_DEREG_UMEM: { struct vduse_iova_umem umem; + u32 asid; ret = -EFAULT; if (copy_from_user(&umem, argp, sizeof(umem))) @@ -1490,17 +1563,22 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, ret = -EINVAL; if (!is_mem_zero((const char *)umem.reserved, - sizeof(umem.reserved))) + sizeof(umem.reserved)) || + (dev->api_version < VDUSE_API_VERSION_1 && + umem.asid != 0) || + umem.asid >= dev->nas) break; + mutex_lock(&dev->domain_lock); - ret = vduse_dev_dereg_umem(dev, umem.iova, + asid = array_index_nospec(umem.asid, dev->nas); + ret = vduse_dev_dereg_umem(dev, asid, umem.iova, umem.size); mutex_unlock(&dev->domain_lock); break; } case VDUSE_IOTLB_GET_INFO: { struct vduse_iova_info info; - struct vduse_iotlb_entry entry; + struct vduse_iotlb_entry_v2 entry; ret = -EFAULT; if (copy_from_user(&info, argp, sizeof(info))) @@ -1510,15 +1588,23 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, sizeof(info.reserved))) break; - entry.start = info.start; - entry.last = info.last; + if (dev->api_version < VDUSE_API_VERSION_1) { + if (info.asid) + break; + } else if (info.asid >= dev->nas) + break; + + entry.v1.start = info.start; + entry.v1.last = info.last; + entry.asid = info.asid; ret = vduse_dev_iotlb_entry(dev, &entry, NULL, &info.capability); if (ret < 0) break; - info.start = entry.start; - info.last = 
entry.last; + info.start = entry.v1.start; + info.last = entry.v1.last; + info.asid = entry.asid; ret = -EFAULT; if (copy_to_user(argp, &info, sizeof(info))) @@ -1540,8 +1626,10 @@ static int vduse_dev_release(struct inode *inode, struct file *file) struct vduse_dev *dev = file->private_data; mutex_lock(&dev->domain_lock); - if (dev->domain) - vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size); + for (int i = 0; i < dev->nas; i++) + if (dev->as[i].domain) + vduse_dev_dereg_umem(dev, i, 0, + dev->as[i].domain->bounce_size); mutex_unlock(&dev->domain_lock); spin_lock(&dev->msg_lock); /* Make sure the inflight messages can processed after reconncection */ @@ -1760,7 +1848,6 @@ static struct vduse_dev *vduse_dev_create(void) return NULL; mutex_init(&dev->lock); - mutex_init(&dev->mem_lock); mutex_init(&dev->domain_lock); spin_lock_init(&dev->msg_lock); INIT_LIST_HEAD(&dev->send_list); @@ -1811,8 +1898,11 @@ static int vduse_destroy_dev(char *name) idr_remove(&vduse_idr, dev->minor); kvfree(dev->config); vduse_dev_deinit_vqs(dev); - if (dev->domain) - vduse_domain_destroy(dev->domain); + for (int i = 0; i < dev->nas; i++) { + if (dev->as[i].domain) + vduse_domain_destroy(dev->as[i].domain); + } + kfree(dev->as); kfree(dev->name); kfree(dev->groups); vduse_dev_destroy(dev); @@ -1859,12 +1949,17 @@ static bool vduse_validate_config(struct vduse_dev_config *config, sizeof(config->reserved))) return false; - if (api_version < VDUSE_API_VERSION_1 && config->ngroups) + if (api_version < VDUSE_API_VERSION_1 && + (config->ngroups || config->nas)) return false; - if (api_version >= VDUSE_API_VERSION_1 && - (!config->ngroups || config->ngroups > VDUSE_DEV_MAX_GROUPS)) - return false; + if (api_version >= VDUSE_API_VERSION_1) { + if (!config->ngroups || config->ngroups > VDUSE_DEV_MAX_GROUPS) + return false; + + if (!config->nas || config->nas > VDUSE_DEV_MAX_AS) + return false; + } if (config->vq_align > PAGE_SIZE) return false; @@ -1929,7 +2024,8 @@ static ssize_t bounce_size_store(struct device *device, ret = -EPERM; mutex_lock(&dev->domain_lock); - if (dev->domain) + /* Assuming that if the first domain is allocated, all are allocated */ + if (dev->as[0].domain) goto unlock; ret = kstrtouint(buf, 10, &bounce_size); @@ -1981,6 +2077,14 @@ static int vduse_create_dev(struct vduse_dev_config *config, dev->device_features = config->features; dev->device_id = config->device_id; dev->vendor_id = config->vendor_id; + + dev->nas = (dev->api_version < VDUSE_API_VERSION_1) ? 1 : config->nas; + dev->as = kcalloc(dev->nas, sizeof(dev->as[0]), GFP_KERNEL); + if (!dev->as) + goto err_as; + for (int i = 0; i < dev->nas; i++) + mutex_init(&dev->as[i].mem_lock); + dev->ngroups = (dev->api_version < VDUSE_API_VERSION_1) ? 
1 : config->ngroups; @@ -1988,8 +2092,11 @@ static int vduse_create_dev(struct vduse_dev_config *config, GFP_KERNEL); if (!dev->groups) goto err_vq_groups; - for (u32 i = 0; i < dev->ngroups; ++i) + for (u32 i = 0; i < dev->ngroups; ++i) { dev->groups[i].dev = dev; + rwlock_init(&dev->groups[i].as_lock); + dev->groups[i].as = &dev->as[0]; + } dev->name = kstrdup(config->name, GFP_KERNEL); if (!dev->name) @@ -2029,6 +2136,8 @@ err_idr: err_str: kfree(dev->groups); err_vq_groups: + kfree(dev->as); +err_as: vduse_dev_destroy(dev); err: return ret; @@ -2152,7 +2261,7 @@ static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name) vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev, &vduse_vdpa_config_ops, &vduse_map_ops, - dev->ngroups, 1, name, true); + dev->ngroups, dev->nas, name, true); if (IS_ERR(vdev)) return PTR_ERR(vdev); @@ -2167,7 +2276,8 @@ static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, const struct vdpa_dev_set_config *config) { struct vduse_dev *dev; - int ret; + size_t domain_bounce_size; + int ret, i; mutex_lock(&vduse_lock); dev = vduse_find_dev(name); @@ -2181,29 +2291,38 @@ static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, return ret; mutex_lock(&dev->domain_lock); - if (!dev->domain) - dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1, - dev->bounce_size); - mutex_unlock(&dev->domain_lock); - if (!dev->domain) { - ret = -ENOMEM; - goto domain_err; + ret = 0; + + domain_bounce_size = dev->bounce_size / dev->nas; + for (i = 0; i < dev->nas; ++i) { + dev->as[i].domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1, + domain_bounce_size); + if (!dev->as[i].domain) { + ret = -ENOMEM; + goto err; + } } + mutex_unlock(&dev->domain_lock); + ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num); - if (ret) { - goto register_err; - } + if (ret) + goto err_register; return 0; -register_err: +err_register: mutex_lock(&dev->domain_lock); - vduse_domain_destroy(dev->domain); - dev->domain = NULL; + +err: + for (int j = 0; j < i; j++) { + if (dev->as[j].domain) { + vduse_domain_destroy(dev->as[j].domain); + dev->as[j].domain = NULL; + } + } mutex_unlock(&dev->domain_lock); -domain_err: put_device(&dev->vdev->vdpa.dev); return ret; diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h index a3d51cf6df3a..68b4287f9fac 100644 --- a/include/uapi/linux/vduse.h +++ b/include/uapi/linux/vduse.h @@ -32,6 +32,7 @@ * @vq_num: the number of virtqueues * @vq_align: the allocation alignment of virtqueue's metadata * @ngroups: number of vq groups that VDUSE device declares + * @nas: number of address spaces that VDUSE device declares * @reserved: for future use, needs to be initialized to zero * @config_size: the size of the configuration space * @config: the buffer of the configuration space @@ -47,7 +48,8 @@ struct vduse_dev_config { __u32 vq_num; __u32 vq_align; __u32 ngroups; /* if VDUSE_API_VERSION >= 1 */ - __u32 reserved[12]; + __u32 nas; /* if VDUSE_API_VERSION >= 1 */ + __u32 reserved[11]; __u32 config_size; __u8 config[]; }; @@ -166,6 +168,16 @@ struct vduse_vq_state_packed { __u16 last_used_idx; }; +/** + * struct vduse_vq_group_asid - virtqueue group ASID + * @group: Index of the virtqueue group + * @asid: Address space ID of the group + */ +struct vduse_vq_group_asid { + __u32 group; + __u32 asid; +}; + /** * struct vduse_vq_info - information of a virtqueue * @index: virtqueue index @@ -225,6 +237,7 @@ struct vduse_vq_eventfd { * @uaddr: start address of userspace memory, it must be aligned to page size * 
@iova: start of the IOVA region * @size: size of the IOVA region + * @asid: Address space ID of the IOVA region * @reserved: for future use, needs to be initialized to zero * * Structure used by VDUSE_IOTLB_REG_UMEM and VDUSE_IOTLB_DEREG_UMEM @@ -234,7 +247,8 @@ struct vduse_iova_umem { __u64 uaddr; __u64 iova; __u64 size; - __u64 reserved[3]; + __u32 asid; + __u32 reserved[5]; }; /* Register userspace memory for IOVA regions */ @@ -248,6 +262,7 @@ struct vduse_iova_umem { * @start: start of the IOVA region * @last: last of the IOVA region * @capability: capability of the IOVA region + * @asid: Address space ID of the IOVA region, only if device API version >= 1 * @reserved: for future use, needs to be initialized to zero * * Structure used by VDUSE_IOTLB_GET_INFO ioctl to get information of @@ -258,7 +273,8 @@ struct vduse_iova_info { __u64 last; #define VDUSE_IOVA_CAP_UMEM (1 << 0) __u64 capability; - __u64 reserved[3]; + __u32 asid; /* Only if device API version >= 1 */ + __u32 reserved[5]; }; /* @@ -267,6 +283,28 @@ struct vduse_iova_info { */ #define VDUSE_IOTLB_GET_INFO _IOWR(VDUSE_BASE, 0x1a, struct vduse_iova_info) +/** + * struct vduse_iotlb_entry_v2 - entry of IOTLB to describe one IOVA region + * + * @v1: the original vduse_iotlb_entry + * @asid: address space ID of the IOVA region + * @reserved: for future use, needs to be initialized to zero + * + * Structure used by VDUSE_IOTLB_GET_FD2 ioctl to find an overlapped IOVA region. + */ +struct vduse_iotlb_entry_v2 { + struct vduse_iotlb_entry v1; + __u32 asid; + __u32 reserved[12]; +}; + +/* + * Same as VDUSE_IOTLB_GET_FD but with a vduse_iotlb_entry_v2 argument that + * supports extra fields. + */ +#define VDUSE_IOTLB_GET_FD2 _IOWR(VDUSE_BASE, 0x1b, struct vduse_iotlb_entry_v2) + + /* The control messages definition for read(2)/write(2) on /dev/vduse/$NAME */ /** @@ -275,11 +313,14 @@ struct vduse_iova_info { * @VDUSE_SET_STATUS: set the device status * @VDUSE_UPDATE_IOTLB: Notify userspace to update the memory mapping for * specified IOVA range via VDUSE_IOTLB_GET_FD ioctl + * @VDUSE_SET_VQ_GROUP_ASID: Notify userspace to update the address space of a + * virtqueue group. */ enum vduse_req_type { VDUSE_GET_VQ_STATE, VDUSE_SET_STATUS, VDUSE_UPDATE_IOTLB, + VDUSE_SET_VQ_GROUP_ASID, }; /** @@ -314,6 +355,18 @@ struct vduse_iova_range { __u64 last; }; +/** + * struct vduse_iova_range_v2 - IOVA range [start, last] if API_VERSION >= 1 + * @start: start of the IOVA range + * @last: last of the IOVA range + * @asid: address space ID of the IOVA range + */ +struct vduse_iova_range_v2 { + __u64 start; + __u64 last; + __u32 asid; +}; + /** * struct vduse_dev_request - control request * @type: request type * @reserved: for future use * @vq_state: virtqueue state, only index field is available * @s: device status * @iova: IOVA range for updating + * @iova_v2: IOVA range for updating if API_VERSION >= 1 + * @vq_group_asid: ASID of a virtqueue group * @padding: padding * * Structure used by read(2) on /dev/vduse/$NAME. 
@@ -334,6 +389,11 @@ struct vduse_dev_request { struct vduse_vq_state vq_state; struct vduse_dev_status s; struct vduse_iova_range iova; + /* Following members except padding exist only if the vduse API + * version >= 1 + */ + struct vduse_iova_range_v2 iova_v2; + struct vduse_vq_group_asid vq_group_asid; __u32 padding[32]; }; }; From 12e0043d335f6c8badfe98f1d8f5e1910d430cf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Mon, 19 Jan 2026 15:33:05 +0100 Subject: [PATCH 50/59] vduse: bump version number MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Finalize the series by advertising VDUSE API v1 support to userspace. Now that all required infrastructure for v1 (ASIDs, VQ groups, update_iotlb_v2) is in place, VDUSE devices can opt in to the new features. Assume API version 0 if the VDUSE instance does not call VDUSE_GET_API_VERSION to maintain compatibility. Acked-by: Jason Wang Signed-off-by: Eugenio Pérez Signed-off-by: Michael S. Tsirkin Message-Id: <20260119143306.1818855-13-eperezma@redhat.com> --- drivers/vdpa/vdpa_user/vduse_dev.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 2727c0c26003..73d1d517dc6c 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -52,6 +52,15 @@ #define IRQ_UNBOUND -1 +/* + * The VDUSE instance has not asked for the vduse API version, so assume 0. + * + * Old devices may not ask for the device version and assume it is 0. Keep + * this value for these. From the moment the VDUSE instance asks for the + * version, convert to the latest supported one and continue the regular flow. + */ +#define VDUSE_API_VERSION_NOT_ASKED U64_MAX + struct vduse_virtqueue { u16 index; u16 num_max; @@ -2153,6 +2162,8 @@ static long vduse_ioctl(struct file *file, unsigned int cmd, mutex_lock(&vduse_lock); switch (cmd) { case VDUSE_GET_API_VERSION: + if (control->api_version == VDUSE_API_VERSION_NOT_ASKED) + control->api_version = VDUSE_API_VERSION_1; ret = put_user(control->api_version, (u64 __user *)argp); break; case VDUSE_SET_API_VERSION: { @@ -2163,7 +2174,7 @@ static long vduse_ioctl(struct file *file, unsigned int cmd, break; ret = -EINVAL; - if (api_version > VDUSE_API_VERSION) + if (api_version > VDUSE_API_VERSION_1) break; ret = 0; @@ -2180,6 +2191,8 @@ static long vduse_ioctl(struct file *file, unsigned int cmd, break; ret = -EINVAL; + if (control->api_version == VDUSE_API_VERSION_NOT_ASKED) + control->api_version = VDUSE_API_VERSION; if (!vduse_validate_config(&config, control->api_version)) break; @@ -2230,7 +2243,7 @@ static int vduse_open(struct inode *inode, struct file *file) if (!control) return -ENOMEM; - control->api_version = VDUSE_API_VERSION; + control->api_version = VDUSE_API_VERSION_NOT_ASKED; file->private_data = control; return 0; From 7a9dc249e750975fc5bdb44439eaed57243b709d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Mon, 19 Jan 2026 15:33:06 +0100 Subject: [PATCH 51/59] Documentation: Add documentation for VDUSE Address Space IDs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address Space IDs allow the VDUSE framework to support devices that can expose different virtqueues to different parts of the driver. 
For example, QEMU can handle the net device control virtqueue, so it always knows device state such as the MAC address or the number of enabled queues, while leaving the dataplane passthrough to the guest intact. This enables live migration. Expand the VDUSE documentation to explain how to use the new ioctls and the new struct members of old ioctls. Acked-by: Jason Wang Signed-off-by: Eugenio Pérez Signed-off-by: Michael S. Tsirkin Message-Id: <20260119143306.1818855-14-eperezma@redhat.com> --- Documentation/userspace-api/vduse.rst | 53 +++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/Documentation/userspace-api/vduse.rst b/Documentation/userspace-api/vduse.rst index bdb880e01132..81479d47c8b9 100644 --- a/Documentation/userspace-api/vduse.rst +++ b/Documentation/userspace-api/vduse.rst @@ -230,4 +230,57 @@ able to start the dataplane processing as follows: 5. Inject an interrupt for specific virtqueue with the VDUSE_INJECT_VQ_IRQ ioctl after the used ring is filled. +Enabling ASID (API version 1) +------------------------------ + +VDUSE supports address space identifiers (ASIDs) starting with API +version 1. Set it up with ioctl(VDUSE_SET_API_VERSION) on `/dev/vduse/control` +and pass `VDUSE_API_VERSION_1` before creating a new VDUSE instance with +ioctl(VDUSE_CREATE_DEV). + +Afterwards, you can use the asid member of the ioctl(VDUSE_VQ_SETUP) argument to +select the address space of the IOTLB you are querying. The driver can +change the address space of any virtqueue group by using the +VDUSE_SET_VQ_GROUP_ASID VDUSE message type, and the VDUSE instance needs to +reply with VDUSE_REQ_RESULT_OK if the change succeeded. + +Similarly, you can use ioctl(VDUSE_IOTLB_GET_FD2) to obtain the file descriptor +describing an IOVA region of a specific ASID. Example usage: + +.. code-block:: c + + static void *iova_to_va(int dev_fd, uint32_t asid, uint64_t iova, + uint64_t *len) { int fd; void *addr; size_t size; struct vduse_iotlb_entry_v2 entry = { 0 }; + + entry.v1.start = iova; + entry.v1.last = iova; + entry.asid = asid; + + fd = ioctl(dev_fd, VDUSE_IOTLB_GET_FD2, &entry); + if (fd < 0) + return NULL; + + size = entry.v1.last - entry.v1.start + 1; + *len = entry.v1.last - iova + 1; + addr = mmap(0, size, perm_to_prot(entry.v1.perm), MAP_SHARED, + fd, entry.v1.offset); + close(fd); + if (addr == MAP_FAILED) + return NULL; + + /* + * Use some data structure, such as a linked list, to cache + * the iotlb mapping. munmap(2) should be called for the + * cached mapping when the corresponding VDUSE_UPDATE_IOTLB + * message is received or the device is reset. + */ + + return addr + iova - entry.v1.start; + } + For more details on the uAPI, please see include/uapi/linux/vduse.h. From af9a17d29ce9060664f56264bcc64b976fddd2b5 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 13 Jan 2026 11:05:54 +0800 Subject: [PATCH 52/59] crypto: virtio: Add spinlock protection with virtqueue notification When a VM boots with one virtio-crypto PCI device and the builtin backend, and an openssl benchmark is run with multiple processes, such as openssl speed -evp aes-128-cbc -engine afalg -seconds 10 -multi 32 the openssl processes hang and an error like the following is reported: virtio_crypto virtio0: dataq.0:id 3 is not a head! It seems that the data virtqueue needs protection while the virtio done notification is handled. With spinlock protection added in virtcrypto_done_task(), the openssl benchmark with multiple processes works well. 
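To make the race concrete, here is a minimal sketch of the submission side (illustrative only, not taken from this patch; it loosely follows __virtio_crypto_skcipher_do_req(), assuming the driver's struct data_queue, which pairs the virtqueue with a spinlock):

#include <linux/gfp.h>
#include <linux/scatterlist.h>
#include <linux/spinlock.h>
#include <linux/virtio.h>
#include "virtio_crypto_common.h"	/* struct data_queue: ->vq and ->lock */

/* Hypothetical helper: submit one request with the per-queue lock held. */
static int virtcrypto_submit_sketch(struct data_queue *data_vq,
				    struct scatterlist *sgs[],
				    unsigned int out_sgs, unsigned int in_sgs,
				    void *vc_req)
{
	unsigned long flags;
	int err;

	spin_lock_irqsave(&data_vq->lock, flags);
	/* The vring is only modified while data_vq->lock is held. */
	err = virtqueue_add_sgs(data_vq->vq, sgs, out_sgs, in_sgs,
				vc_req, GFP_ATOMIC);
	if (!err)
		virtqueue_kick(data_vq->vq);
	spin_unlock_irqrestore(&data_vq->lock, flags);
	return err;
}

Because submission only touches the vring under data_vq->lock, a completion path that calls virtqueue_get_buf() without that lock can observe the ring mid-update and fetch an id that is not a chain head, which is exactly the error above. The fix below takes the same lock in the tasklet and drops it around the algorithm callback, since the callback may submit new requests on the same queue.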
Fixes: fed93fb62e05 ("crypto: virtio - Handle dataq logic with tasklet") Cc: stable@vger.kernel.org Signed-off-by: Bibo Mao Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Message-Id: <20260113030556.3522533-2-maobibo@loongson.cn> --- drivers/crypto/virtio/virtio_crypto_core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/crypto/virtio/virtio_crypto_core.c b/drivers/crypto/virtio/virtio_crypto_core.c index 3d241446099c..ccc6b5c1b24b 100644 --- a/drivers/crypto/virtio/virtio_crypto_core.c +++ b/drivers/crypto/virtio/virtio_crypto_core.c @@ -75,15 +75,20 @@ static void virtcrypto_done_task(unsigned long data) struct data_queue *data_vq = (struct data_queue *)data; struct virtqueue *vq = data_vq->vq; struct virtio_crypto_request *vc_req; + unsigned long flags; unsigned int len; + spin_lock_irqsave(&data_vq->lock, flags); do { virtqueue_disable_cb(vq); while ((vc_req = virtqueue_get_buf(vq, &len)) != NULL) { + spin_unlock_irqrestore(&data_vq->lock, flags); if (vc_req->alg_cb) vc_req->alg_cb(vc_req, len); + spin_lock_irqsave(&data_vq->lock, flags); } } while (!virtqueue_enable_cb(vq)); + spin_unlock_irqrestore(&data_vq->lock, flags); } static void virtcrypto_dataq_callback(struct virtqueue *vq) From a389d431053935366b88a8fbf271f1a564b9a44e Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 13 Jan 2026 11:05:55 +0800 Subject: [PATCH 53/59] crypto: virtio: Remove duplicated virtqueue_kick in virtio_crypto_skcipher_crypt_req In virtio_crypto_skcipher_crypt_req(), there is already a virtqueue_kick() call, made with the spinlock held, in __virtio_crypto_skcipher_do_req(). Remove the duplicated virtqueue_kick() call here. Fixes: d79b5d0bbf2e ("crypto: virtio - support crypto engine framework") Cc: stable@vger.kernel.org Signed-off-by: Bibo Mao Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Message-Id: <20260113030556.3522533-3-maobibo@loongson.cn> --- drivers/crypto/virtio/virtio_crypto_skcipher_algs.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/crypto/virtio/virtio_crypto_skcipher_algs.c b/drivers/crypto/virtio/virtio_crypto_skcipher_algs.c index 1b3fb21a2a7d..11053d1786d4 100644 --- a/drivers/crypto/virtio/virtio_crypto_skcipher_algs.c +++ b/drivers/crypto/virtio/virtio_crypto_skcipher_algs.c @@ -541,8 +541,6 @@ int virtio_crypto_skcipher_crypt_req( if (ret < 0) return ret; - virtqueue_kick(data_vq->vq); - return 0; } From 72ecf75c58116c0fe07e34ba4fff5020e55c9097 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 13 Jan 2026 11:05:56 +0800 Subject: [PATCH 54/59] crypto: virtio: Replace package id with numa node id With multiple virtio crypto devices on different NUMA nodes, creating a crypto session searches for a virtio crypto device on the same NUMA node as the current CPU. Replace the topology_physical_package_id() API with cpu_to_node(), since the package ID is a physical concept and one package can span multiple NUMA nodes. Signed-off-by: Bibo Mao Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin Message-Id: <20260113030556.3522533-4-maobibo@loongson.cn> --- drivers/crypto/virtio/virtio_crypto_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/virtio/virtio_crypto_common.h b/drivers/crypto/virtio/virtio_crypto_common.h index 19c934af3df6..e559bdadf4f9 100644 --- a/drivers/crypto/virtio/virtio_crypto_common.h +++ b/drivers/crypto/virtio/virtio_crypto_common.h @@ -135,7 +135,7 @@ static inline int virtio_crypto_get_current_node(void) int cpu, node; cpu = get_cpu(); - node = topology_physical_package_id(cpu); + node = cpu_to_node(cpu); put_cpu(); return node; From 719d959274da85eb0e1eb448dd807b1c435c19a1 Mon Sep 17 00:00:00 2001 From: Cindy Lu Date: Mon, 26 Jan 2026 17:45:36 +0800 Subject: [PATCH 55/59] vdpa/mlx5: update mlx_features with driver state check Add logic in mlx5_vdpa_set_attr() to ensure the VIRTIO_NET_F_MAC feature bit is properly set only when the device is not yet in the DRIVER_OK (running) state. This makes the MAC address visible in the output of: vdpa dev config show -jp when the device is created without an initial MAC address. Signed-off-by: Cindy Lu Reviewed-by: Dragos Tatulea Signed-off-by: Michael S. Tsirkin Message-Id: <20260126094848.9601-2-lulu@redhat.com> --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 44062e9d68f0..a02f34d8f0fe 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -4046,7 +4046,7 @@ static int mlx5_vdpa_set_attr(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device * struct mlx5_vdpa_dev *mvdev; struct mlx5_vdpa_net *ndev; struct mlx5_core_dev *mdev; - int err = -EOPNOTSUPP; + int err = 0; mvdev = to_mvdev(dev); ndev = to_mlx5_vdpa_ndev(mvdev); @@ -4054,13 +4054,22 @@ static int mlx5_vdpa_set_attr(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device * config = &ndev->config; down_write(&ndev->reslock); - if (add_config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR)) { + + if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR)) { + if (!(ndev->mvdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) { + ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_MAC); + } else { + mlx5_vdpa_warn(mvdev, "device running, skip updating MAC\n"); + err = -EBUSY; + goto out; + } pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev)); err = mlx5_mpfs_add_mac(pfmdev, config->mac); if (!err) ether_addr_copy(config->mac, add_config->net.mac); } +out: up_write(&ndev->reslock); return err; } From 2f61e6eda7a793bca4df6efea95815375e122f3a Mon Sep 17 00:00:00 2001 From: Cindy Lu Date: Mon, 26 Jan 2026 17:45:37 +0800 Subject: [PATCH 56/59] vdpa/mlx5: reuse common function for MAC address updates Factor out MAC address update logic and reuse it from handle_ctrl_mac(). This ensures that old MAC entries are removed from the MPFS table before adding a new one and that the forwarding rules are updated accordingly. If updating the flow table fails, the original MAC and rules are restored as much as possible to keep the software and hardware state consistent. Signed-off-by: Cindy Lu Reviewed-by: Dragos Tatulea Signed-off-by: Michael S. 
Tsirkin Message-Id: <20260126094848.9601-3-lulu@redhat.com> --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 135 ++++++++++++++++-------------- 1 file changed, 73 insertions(+), 62 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index a02f34d8f0fe..b7974f451e62 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -2125,6 +2125,74 @@ static void teardown_steering(struct mlx5_vdpa_net *ndev) mlx5_destroy_flow_table(ndev->rxft); } +static int mlx5_vdpa_change_mac(struct mlx5_vdpa_net *ndev, + struct mlx5_core_dev *pfmdev, + const u8 *new_mac) +{ + struct mlx5_vdpa_dev *mvdev = &ndev->mvdev; + u8 old_mac[ETH_ALEN]; + + if (is_zero_ether_addr(new_mac)) + return -EINVAL; + + if (!is_zero_ether_addr(ndev->config.mac)) { + if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) { + mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n", + ndev->config.mac); + return -EIO; + } + } + + if (mlx5_mpfs_add_mac(pfmdev, (u8 *)new_mac)) { + mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n", + new_mac); + return -EIO; + } + + /* Back up the original MAC address so that we can restore it if + * adding the forward rules fails. + */ + ether_addr_copy(old_mac, ndev->config.mac); + + ether_addr_copy(ndev->config.mac, new_mac); + + /* Need to recreate the flow table entry, so that packets can be + * forwarded back. + */ + mac_vlan_del(ndev, old_mac, 0, false); + + if (mac_vlan_add(ndev, ndev->config.mac, 0, false)) { + mlx5_vdpa_warn(mvdev, "failed to insert forward rules, try to restore\n"); + + /* Although this path is rarely taken, we still need to double check */ + if (is_zero_ether_addr(old_mac)) { + mlx5_vdpa_warn(mvdev, "restore mac failed: Original MAC is zero\n"); + return -EIO; + } + + /* Try to restore the original MAC address to the MPFS table, and try + * to restore the forward rule entry. + */ + if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) { + mlx5_vdpa_warn(mvdev, "restore mac failed: delete MAC %pM from MPFS table failed\n", + ndev->config.mac); + } + + if (mlx5_mpfs_add_mac(pfmdev, old_mac)) { + mlx5_vdpa_warn(mvdev, "restore mac failed: insert old MAC %pM into MPFS table failed\n", + old_mac); + } + + ether_addr_copy(ndev->config.mac, old_mac); + + if (mac_vlan_add(ndev, ndev->config.mac, 0, false)) + mlx5_vdpa_warn(mvdev, "restore forward rules failed: insert forward rules failed\n"); + + return -EIO; + } + + return 0; +} + static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd) { struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); @@ -2132,12 +2200,13 @@ static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd) virtio_net_ctrl_ack status = VIRTIO_NET_ERR; struct mlx5_core_dev *pfmdev; size_t read; - u8 mac[ETH_ALEN], mac_back[ETH_ALEN]; + u8 mac[ETH_ALEN]; pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev)); switch (cmd) { case VIRTIO_NET_CTRL_MAC_ADDR_SET: - read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)mac, ETH_ALEN); + read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, + (void *)mac, ETH_ALEN); if (read != ETH_ALEN) break; @@ -2145,66 +2214,8 @@ static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd) status = VIRTIO_NET_OK; break; } - - if (is_zero_ether_addr(mac)) - break; - - if (!is_zero_ether_addr(ndev->config.mac)) { - if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) { - mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n", - ndev->config.mac); - break; - } - } - - if (mlx5_mpfs_add_mac(pfmdev, mac)) { - mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n", - mac); - break; - } - - /* backup the original mac address so that if failed to add the forward rules - * we could restore it - */ - memcpy(mac_back, ndev->config.mac, ETH_ALEN); - - memcpy(ndev->config.mac, mac, ETH_ALEN); - - /* Need recreate the flow table entry, so that the packet could forward back - */ - mac_vlan_del(ndev, mac_back, 0, false); - - if (mac_vlan_add(ndev, ndev->config.mac, 0, false)) { - mlx5_vdpa_warn(mvdev, "failed to insert forward rules, try to restore\n"); - - /* Although it hardly run here, we still need double check */ - if (is_zero_ether_addr(mac_back)) { - mlx5_vdpa_warn(mvdev, "restore mac failed: Original MAC is zero\n"); - break; - } - - /* Try to restore original mac address to MFPS table, and try to restore - * the forward rule entry. - */ - if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) { - mlx5_vdpa_warn(mvdev, "restore mac failed: delete MAC %pM from MPFS table failed\n", - ndev->config.mac); - } - - if (mlx5_mpfs_add_mac(pfmdev, mac_back)) { - mlx5_vdpa_warn(mvdev, "restore mac failed: insert old MAC %pM into MPFS table failed\n", - mac_back); - } - - memcpy(ndev->config.mac, mac_back, ETH_ALEN); - - if (mac_vlan_add(ndev, ndev->config.mac, 0, false)) - mlx5_vdpa_warn(mvdev, "restore forward rules failed: insert forward rules failed\n"); - - break; - } - - status = VIRTIO_NET_OK; + status = mlx5_vdpa_change_mac(ndev, pfmdev, mac) ? 
VIRTIO_NET_ERR : + VIRTIO_NET_OK; break; default: From 503ef41e88080fb2d2399173e34d26e59567fb5e Mon Sep 17 00:00:00 2001 From: Cindy Lu Date: Mon, 26 Jan 2026 17:45:38 +0800 Subject: [PATCH 57/59] vdpa/mlx5: update MAC address handling in mlx5_vdpa_set_attr() Improve MAC address handling in mlx5_vdpa_set_attr() to ensure that old MAC entries are properly removed from the MPFS table before adding a new one. The new MAC address is then added to both the MPFS and VLAN tables. This change fixes an issue where the updated MAC address would not take effect until QEMU was restarted. Signed-off-by: Cindy Lu Reviewed-by: Dragos Tatulea Signed-off-by: Michael S. Tsirkin Message-Id: <20260126094848.9601-4-lulu@redhat.com> --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index b7974f451e62..b7e46338815f 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -4052,17 +4052,15 @@ static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device * static int mlx5_vdpa_set_attr(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev, const struct vdpa_dev_set_config *add_config) { - struct virtio_net_config *config; struct mlx5_core_dev *pfmdev; struct mlx5_vdpa_dev *mvdev; struct mlx5_vdpa_net *ndev; struct mlx5_core_dev *mdev; - int err = 0; + int err = -EOPNOTSUPP; mvdev = to_mvdev(dev); ndev = to_mlx5_vdpa_ndev(mvdev); mdev = mvdev->mdev; - config = &ndev->config; down_write(&ndev->reslock); @@ -4075,9 +4073,8 @@ static int mlx5_vdpa_set_attr(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device * goto out; } pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev)); - err = mlx5_mpfs_add_mac(pfmdev, config->mac); - if (!err) - ether_addr_copy(config->mac, add_config->net.mac); + err = mlx5_vdpa_change_mac(ndev, pfmdev, + (u8 *)add_config->net.mac); } out: From 5145b277309f3818e2db507f525d19ac3b910922 Mon Sep 17 00:00:00 2001 From: Kommula Shiva Shankar Date: Fri, 2 Jan 2026 12:27:03 +0530 Subject: [PATCH 58/59] vhost: fix caching attributes of MMIO regions by setting them explicitly Explicitly set non-cached caching attributes for MMIO regions. The default write-back mode can cause the CPU to cache device memory, causing invalid reads and unpredictable behavior. Invalid read and write issues were observed on ARM64 when mapping the notification area to userspace via mmap. Signed-off-by: Kommula Shiva Shankar Acked-by: Jason Wang Reviewed-by: Jason Gunthorpe Signed-off-by: Michael S. 
Tsirkin Message-Id: <20260102065703.656255-1-kshankar@marvell.com> --- drivers/vhost/vdpa.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 3f0184d42075..cdee8f320dca 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -1529,6 +1529,7 @@ static int vhost_vdpa_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_end - vma->vm_start != notify.size) return -ENOTSUPP; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &vhost_vdpa_vm_ops; return 0; From ebcff9dacaf2c1418f8bc927388186d7d3674603 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 2 Feb 2026 23:48:07 +0100 Subject: [PATCH 59/59] vduse: avoid adding implicit padding The vduse_iova_range_v2 and vduse_iotlb_entry_v2 structures are both defined in a way that adds implicit padding and is incompatible between i386 and x86_64 userspace because of the different structure alignment requirements. Building the header with -Wpadded shows these new warnings: vduse.h:305:1: error: padding struct size to alignment boundary with 4 bytes [-Werror=padded] vduse.h:374:1: error: padding struct size to alignment boundary with 4 bytes [-Werror=padded] Change the amount of padding in these two structures to align them to 64-bit words and avoid those problems. Since the v1 vduse_iotlb_entry already has an inconsistent size, do not attempt to reuse the structure but rather list the members individually, with a fixed amount of padding. Fixes: 079212f6877e ("vduse: add vq group asid support") Signed-off-by: Arnd Bergmann Signed-off-by: Michael S. Tsirkin Message-Id: <20260202224835.559538-1-arnd@kernel.org> --- drivers/vdpa/vdpa_user/vduse_dev.c | 40 +++++++++++------------------ include/uapi/linux/vduse.h | 9 +++++-- 2 files changed, 21 insertions(+), 28 deletions(-) diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 73d1d517dc6c..405d59610f76 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -1301,7 +1301,7 @@ static int vduse_dev_iotlb_entry(struct vduse_dev *dev, int r = -EINVAL; struct vhost_iotlb_map *map; - if (entry->v1.start > entry->v1.last || entry->asid >= dev->nas) + if (entry->start > entry->last || entry->asid >= dev->nas) return -EINVAL; asid = array_index_nospec(entry->asid, dev->nas); @@ -1312,18 +1312,18 @@ static int vduse_dev_iotlb_entry(struct vduse_dev *dev, spin_lock(&dev->as[asid].domain->iotlb_lock); map = vhost_iotlb_itree_first(dev->as[asid].domain->iotlb, - entry->v1.start, entry->v1.last); + entry->start, entry->last); if (map) { if (f) { const struct vdpa_map_file *map_file; map_file = (struct vdpa_map_file *)map->opaque; - entry->v1.offset = map_file->offset; + entry->offset = map_file->offset; *f = get_file(map_file->file); } - entry->v1.start = map->start; - entry->v1.last = map->last; - entry->v1.perm = map->perm; + entry->start = map->start; + entry->last = map->last; + entry->perm = map->perm; if (capability) { *capability = 0; @@ -1363,14 +1363,8 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, break; ret = -EFAULT; - if (cmd == VDUSE_IOTLB_GET_FD2) { - if (copy_from_user(&entry, argp, sizeof(entry))) - break; - } else { - if (copy_from_user(&entry.v1, argp, - sizeof(entry.v1))) - break; - } + if (copy_from_user(&entry, argp, _IOC_SIZE(cmd))) + break; ret = -EINVAL; if (!is_mem_zero((const char *)entry.reserved, @@ -1385,19 +1379,13 @@ static long 
vduse_dev_ioctl(struct file *file, unsigned int cmd, if (!f) break; - if (cmd == VDUSE_IOTLB_GET_FD2) - ret = copy_to_user(argp, &entry, - sizeof(entry)); - else - ret = copy_to_user(argp, &entry.v1, - sizeof(entry.v1)); - + ret = copy_to_user(argp, &entry, _IOC_SIZE(cmd)); if (ret) { ret = -EFAULT; fput(f); break; } - ret = receive_fd(f, NULL, perm_to_file_flags(entry.v1.perm)); + ret = receive_fd(f, NULL, perm_to_file_flags(entry.perm)); fput(f); break; } @@ -1603,16 +1591,16 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, } else if (info.asid >= dev->nas) break; - entry.v1.start = info.start; - entry.v1.last = info.last; + entry.start = info.start; + entry.last = info.last; entry.asid = info.asid; ret = vduse_dev_iotlb_entry(dev, &entry, NULL, &info.capability); if (ret < 0) break; - info.start = entry.v1.start; - info.last = entry.v1.last; + info.start = entry.start; + info.last = entry.last; info.asid = entry.asid; ret = -EFAULT; diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h index 68b4287f9fac..361eea511c21 100644 --- a/include/uapi/linux/vduse.h +++ b/include/uapi/linux/vduse.h @@ -293,9 +293,13 @@ struct vduse_iova_info { * Structure used by VDUSE_IOTLB_GET_FD2 ioctl to find an overlapped IOVA region. */ struct vduse_iotlb_entry_v2 { - struct vduse_iotlb_entry v1; + __u64 offset; + __u64 start; + __u64 last; + __u8 perm; + __u8 padding[7]; __u32 asid; - __u32 reserved[12]; + __u32 reserved[11]; }; /* @@ -365,6 +369,7 @@ struct vduse_iova_range_v2 { __u64 start; __u64 last; __u32 asid; + __u32 padding; }; /**
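As an illustrative follow-up (not part of the patch): with the explicit padding above, both structures are exact multiples of 8 bytes with no implicit tail padding, so 32-bit and 64-bit userspace agree on their size. A userspace translation unit could assert this at compile time; the sizes below are derived from the member lists in this patch:

#include <linux/vduse.h>

/* 3 * __u64 + __u8 perm + 7-byte pad + __u32 asid + 11 * __u32 = 80 bytes */
_Static_assert(sizeof(struct vduse_iotlb_entry_v2) == 80,
	       "vduse_iotlb_entry_v2 layout must match on 32/64-bit");
/* 2 * __u64 + __u32 asid + __u32 padding = 24 bytes */
_Static_assert(sizeof(struct vduse_iova_range_v2) == 24,
	       "vduse_iova_range_v2 layout must match on 32/64-bit");

Before this change, the v2 entry embedded struct vduse_iotlb_entry, whose trailing __u8 perm gives it a different size on i386 (4-byte __u64 alignment) and on x86_64 (8-byte alignment), which is what -Wpadded flagged.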