From 3c7d76d6128a0fef68e6540754bf85a44a29bb59 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 11 Dec 2025 03:25:41 -0700 Subject: [PATCH 01/30] io_uring: IOPOLL polling improvements io_uring manages issued and pending IOPOLL read/write requests in a singly linked list. One downside of that is that individual items cannot easily be removed from that list, and as a result, io_uring will only complete a completed request N in that list if 0..N-1 are also complete. For homogenous IO this isn't necessarily an issue, but if different devices are involved in polling in the same ring, or if disparate IO from the same device is being polled for, this can defer completion of some requests unnecessarily. Move to a doubly linked list for iopoll completions instead, making it possible to easily complete whatever requests that were polled done successfully. Co-developed-by: Fengnan Chang Link: https://lore.kernel.org/io-uring/20251210085501.84261-1-changfengnan@bytedance.com/ Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 13 +++++++++++-- io_uring/cancel.c | 2 +- io_uring/io_uring.c | 27 ++++++++------------------- io_uring/rw.c | 28 ++++++++-------------------- io_uring/slist.h | 12 ------------ io_uring/sqpoll.c | 8 ++++---- 6 files changed, 32 insertions(+), 58 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index e1adb0d20a0a..54fd30abf2b8 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -316,7 +316,7 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. */ bool poll_multi_queue; - struct io_wq_work_list iopoll_list; + struct list_head iopoll_list; struct io_file_table file_table; struct io_rsrc_data buf_table; @@ -708,7 +708,16 @@ struct io_kiocb { atomic_t refs; bool cancel_seq_set; - struct io_task_work io_task_work; + + /* + * IOPOLL doesn't use task_work, so use the ->iopoll_node list + * entry to manage pending iopoll requests. 
+ */ + union { + struct io_task_work io_task_work; + struct list_head iopoll_node; + }; + union { /* * for polled requests, i.e. IORING_OP_POLL_ADD and async armed diff --git a/io_uring/cancel.c b/io_uring/cancel.c index ca12ac10c0ae..4136bf04464a 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -534,7 +534,7 @@ __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, /* SQPOLL thread does its own polling */ if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || is_sqpoll_thread) { - while (!wq_list_empty(&ctx->iopoll_list)) { + while (!list_empty(&ctx->iopoll_list)) { io_iopoll_try_reap_events(ctx); ret = true; cond_resched(); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6cb24cdf8e68..05a660c97316 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -334,7 +334,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->poll_wq); spin_lock_init(&ctx->completion_lock); raw_spin_lock_init(&ctx->timeout_lock); - INIT_WQ_LIST(&ctx->iopoll_list); + INIT_LIST_HEAD(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); @@ -1561,7 +1561,7 @@ __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) return; mutex_lock(&ctx->uring_lock); - while (!wq_list_empty(&ctx->iopoll_list)) { + while (!list_empty(&ctx->iopoll_list)) { /* let it sleep and repeat later if can't complete a request */ if (io_do_iopoll(ctx, true) == 0) break; @@ -1626,21 +1626,18 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events) * forever, while the workqueue is stuck trying to acquire the * very same mutex. 
*/ - if (wq_list_empty(&ctx->iopoll_list) || - io_task_work_pending(ctx)) { + if (list_empty(&ctx->iopoll_list) || io_task_work_pending(ctx)) { u32 tail = ctx->cached_cq_tail; (void) io_run_local_work_locked(ctx, min_events); - if (task_work_pending(current) || - wq_list_empty(&ctx->iopoll_list)) { + if (task_work_pending(current) || list_empty(&ctx->iopoll_list)) { mutex_unlock(&ctx->uring_lock); io_run_task_work(); mutex_lock(&ctx->uring_lock); } /* some requests don't go through iopoll_list */ - if (tail != ctx->cached_cq_tail || - wq_list_empty(&ctx->iopoll_list)) + if (tail != ctx->cached_cq_tail || list_empty(&ctx->iopoll_list)) break; } ret = io_do_iopoll(ctx, !min_events); @@ -1683,25 +1680,17 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) * how we do polling eventually, not spinning if we're on potentially * different devices. */ - if (wq_list_empty(&ctx->iopoll_list)) { + if (list_empty(&ctx->iopoll_list)) { ctx->poll_multi_queue = false; } else if (!ctx->poll_multi_queue) { struct io_kiocb *list_req; - list_req = container_of(ctx->iopoll_list.first, struct io_kiocb, - comp_list); + list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb, iopoll_node); if (list_req->file != req->file) ctx->poll_multi_queue = true; } - /* - * For fast devices, IO may have already completed. If it has, add - * it to the front so we find it first. 
- */ - if (READ_ONCE(req->iopoll_completed)) - wq_list_add_head(&req->comp_list, &ctx->iopoll_list); - else - wq_list_add_tail(&req->comp_list, &ctx->iopoll_list); + list_add_tail(&req->iopoll_node, &ctx->iopoll_list); if (unlikely(needs_lock)) { /* diff --git a/io_uring/rw.c b/io_uring/rw.c index 70ca88cc1f54..307f1f39d9f3 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -1315,9 +1315,9 @@ static int io_uring_hybrid_poll(struct io_kiocb *req, int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) { - struct io_wq_work_node *pos, *start, *prev; unsigned int poll_flags = 0; DEFINE_IO_COMP_BATCH(iob); + struct io_kiocb *req, *tmp; int nr_events = 0; /* @@ -1327,8 +1327,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (ctx->poll_multi_queue || force_nonspin) poll_flags |= BLK_POLL_ONESHOT; - wq_list_for_each(pos, start, &ctx->iopoll_list) { - struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); + list_for_each_entry(req, &ctx->iopoll_list, iopoll_node) { int ret; /* @@ -1357,31 +1356,20 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (!rq_list_empty(&iob.req_list)) iob.complete(&iob); - else if (!pos) - return 0; - - prev = start; - wq_list_for_each_resume(pos, prev) { - struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); + list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, iopoll_node) { /* order with io_complete_rw_iopoll(), e.g. ->result updates */ if (!smp_load_acquire(&req->iopoll_completed)) - break; + continue; + list_del(&req->iopoll_node); + wq_list_add_tail(&req->comp_list, &ctx->submit_state.compl_reqs); nr_events++; req->cqe.flags = io_put_kbuf(req, req->cqe.res, NULL); if (req->opcode != IORING_OP_URING_CMD) io_req_rw_cleanup(req, 0); } - if (unlikely(!nr_events)) - return 0; - - pos = start ? 
start->next : ctx->iopoll_list.first; - wq_list_cut(&ctx->iopoll_list, prev, start); - - if (WARN_ON_ONCE(!wq_list_empty(&ctx->submit_state.compl_reqs))) - return 0; - ctx->submit_state.compl_reqs.first = pos; - __io_submit_flush_completions(ctx); + if (nr_events) + __io_submit_flush_completions(ctx); return nr_events; } diff --git a/io_uring/slist.h b/io_uring/slist.h index 7ef747442754..0aec51076bad 100644 --- a/io_uring/slist.h +++ b/io_uring/slist.h @@ -9,9 +9,6 @@ #define wq_list_for_each(pos, prv, head) \ for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) -#define wq_list_for_each_resume(pos, prv) \ - for (; pos; prv = pos, pos = (pos)->next) - #define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) #define INIT_WQ_LIST(list) do { \ @@ -43,15 +40,6 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, } } -static inline void wq_list_add_head(struct io_wq_work_node *node, - struct io_wq_work_list *list) -{ - node->next = list->first; - if (!node->next) - list->last = node; - WRITE_ONCE(list->first, node); -} - static inline void wq_list_cut(struct io_wq_work_list *list, struct io_wq_work_node *last, struct io_wq_work_node *prev) diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 74c1a130cd87..becdfdd323a9 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -212,7 +212,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, struct io_sq_data *sqd, if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; - if (to_submit || !wq_list_empty(&ctx->iopoll_list)) { + if (to_submit || !list_empty(&ctx->iopoll_list)) { const struct cred *creds = NULL; io_sq_start_worktime(ist); @@ -221,7 +221,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, struct io_sq_data *sqd, creds = override_creds(ctx->sq_creds); mutex_lock(&ctx->uring_lock); - if (!wq_list_empty(&ctx->iopoll_list)) + if (!list_empty(&ctx->iopoll_list)) io_do_iopoll(ctx, true); /* @@ -344,7 +344,7 @@ 
static int io_sq_thread(void *data) list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { int ret = __io_sq_thread(ctx, sqd, cap_entries, &ist); - if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) + if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) sqt_spin = true; } if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE)) @@ -379,7 +379,7 @@ static int io_sq_thread(void *data) atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); if ((ctx->flags & IORING_SETUP_IOPOLL) && - !wq_list_empty(&ctx->iopoll_list)) { + !list_empty(&ctx->iopoll_list)) { needs_sched = false; break; } From 48ed70131e4f3057f819c848d92fe84ba696e2a9 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 5 Jan 2026 18:09:32 -0500 Subject: [PATCH 02/30] io_uring: Trim out unused includes Clean up some left overs of refactoring io_uring into multiple files. Compile tested with a few configurations. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/alloc_cache.h | 1 + io_uring/cancel.c | 2 -- io_uring/filetable.h | 1 - io_uring/io_uring.c | 12 ------------ io_uring/io_uring.h | 1 - 5 files changed, 1 insertion(+), 16 deletions(-) diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index d33ce159ef33..bb2f21a7bfd6 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -2,6 +2,7 @@ #define IOU_ALLOC_CACHE_H #include +#include /* * Don't allow the cache to grow beyond this size. 
diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 4136bf04464a..76c657a28fe7 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -2,10 +2,8 @@ #include #include #include -#include #include #include -#include #include #include diff --git a/io_uring/filetable.h b/io_uring/filetable.h index 7717ea9efd0e..c348233a3411 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -2,7 +2,6 @@ #ifndef IOU_FILE_TABLE_H #define IOU_FILE_TABLE_H -#include #include #include "rsrc.h" diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 05a660c97316..1aebdba425e8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -40,37 +40,25 @@ * Copyright (c) 2018-2019 Christoph Hellwig */ #include -#include #include #include -#include #include -#include #include #include #include -#include #include -#include #include #include -#include -#include -#include #include -#include #include #include -#include -#include #include #include #include #include #include #include -#include #define CREATE_TRACE_POINTS #include diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index a790c16854d3..c5bbb43b5842 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include From 7a8737e1132ff07ca225aa7a4008f87319b5b1ca Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Mon, 5 Jan 2026 14:05:40 -0700 Subject: [PATCH 03/30] io_uring: use release-acquire ordering for IORING_SETUP_R_DISABLED io_uring_enter(), __io_msg_ring_data(), and io_msg_send_fd() read ctx->flags and ctx->submitter_task without holding the ctx's uring_lock. This means they may race with the assignment to ctx->submitter_task and the clearing of IORING_SETUP_R_DISABLED from ctx->flags in io_register_enable_rings(). Ensure the correct ordering of the ctx->flags and ctx->submitter_task memory accesses by storing to ctx->flags using release ordering and loading it using acquire ordering. 
Signed-off-by: Caleb Sander Mateos Fixes: 4add705e4eeb ("io_uring: remove io_register_submitter") Reviewed-by: Joanne Koong Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 +++++- io_uring/msg_ring.c | 12 ++++++++++-- io_uring/register.c | 3 ++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 1aebdba425e8..559932b851ca 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3228,7 +3228,11 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, ctx = file->private_data; ret = -EBADFD; - if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED)) + /* + * Keep IORING_SETUP_R_DISABLED check before submitter_task load + * in io_uring_add_tctx_node() -> __io_uring_add_tctx_node_from_submit() + */ + if (unlikely(smp_load_acquire(&ctx->flags) & IORING_SETUP_R_DISABLED)) goto out; /* diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 7063ea7964e7..87b4d306cf1b 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -125,7 +125,11 @@ static int __io_msg_ring_data(struct io_ring_ctx *target_ctx, return -EINVAL; if (!(msg->flags & IORING_MSG_RING_FLAGS_PASS) && msg->dst_fd) return -EINVAL; - if (target_ctx->flags & IORING_SETUP_R_DISABLED) + /* + * Keep IORING_SETUP_R_DISABLED check before submitter_task load + * in io_msg_data_remote() -> io_msg_remote_post() + */ + if (smp_load_acquire(&target_ctx->flags) & IORING_SETUP_R_DISABLED) return -EBADFD; if (io_msg_need_remote(target_ctx)) @@ -245,7 +249,11 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) return -EINVAL; if (target_ctx == ctx) return -EINVAL; - if (target_ctx->flags & IORING_SETUP_R_DISABLED) + /* + * Keep IORING_SETUP_R_DISABLED check before submitter_task load + * in io_msg_fd_remote() + */ + if (smp_load_acquire(&target_ctx->flags) & IORING_SETUP_R_DISABLED) return -EBADFD; if (!msg->src_file) { int ret = io_msg_grab_file(req, issue_flags); diff --git 
a/io_uring/register.c b/io_uring/register.c index 62d39b3ff317..5c2574496aa9 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -193,7 +193,8 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) if (ctx->restrictions.registered) ctx->restricted = 1; - ctx->flags &= ~IORING_SETUP_R_DISABLED; + /* Keep submitter_task store before clearing IORING_SETUP_R_DISABLED */ + smp_store_release(&ctx->flags, ctx->flags & ~IORING_SETUP_R_DISABLED); if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) wake_up(&ctx->sq_data->wait); return 0; From bcd4c95737d15fa1a85152b8862dec146b11c214 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Mon, 5 Jan 2026 14:05:41 -0700 Subject: [PATCH 04/30] io_uring/msg_ring: drop unnecessary submitter_task checks __io_msg_ring_data() checks that the target_ctx isn't IORING_SETUP_R_DISABLED before calling io_msg_data_remote(), which calls io_msg_remote_post(). So submitter_task can't be modified concurrently with the read in io_msg_remote_post(). Additionally, submitter_task must exist, as io_msg_data_remote() is only called for io_msg_need_remote(), i.e. task_complete is set, which requires IORING_SETUP_DEFER_TASKRUN, which in turn requires IORING_SETUP_SINGLE_ISSUER. And submitter_task is assigned in io_uring_create() or io_register_enable_rings() before enabling any IORING_SETUP_SINGLE_ISSUER io_ring_ctx. Similarly, io_msg_send_fd() checks IORING_SETUP_R_DISABLED and io_msg_need_remote() before calling io_msg_fd_remote(). submitter_task therefore can't be modified concurrently with the read in io_msg_fd_remote() and must be non-null. io_register_enable_rings() can't run concurrently because it's called from io_uring_register() -> __io_uring_register() with uring_lock held. Thus, replace the READ_ONCE() and WRITE_ONCE() of submitter_task with plain loads and stores. And remove the NULL checks of submitter_task in io_msg_remote_post() and io_msg_fd_remote(). 
Signed-off-by: Caleb Sander Mateos Reviewed-by: Joanne Koong Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 7 +------ io_uring/msg_ring.c | 18 +++++------------- io_uring/register.c | 2 +- 3 files changed, 7 insertions(+), 20 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 559932b851ca..a1dd1540ab79 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3637,13 +3637,8 @@ static __cold int io_uring_create(struct io_ctx_config *config) } if (ctx->flags & IORING_SETUP_SINGLE_ISSUER - && !(ctx->flags & IORING_SETUP_R_DISABLED)) { - /* - * Unlike io_register_enable_rings(), don't need WRITE_ONCE() - * since ctx isn't yet accessible from other tasks - */ + && !(ctx->flags & IORING_SETUP_R_DISABLED)) ctx->submitter_task = get_task_struct(current); - } file = io_uring_get_file(ctx); if (IS_ERR(file)) { diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 87b4d306cf1b..57ad0085869a 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -80,13 +80,9 @@ static void io_msg_tw_complete(struct io_tw_req tw_req, io_tw_token_t tw) percpu_ref_put(&ctx->refs); } -static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, +static void io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, int res, u32 cflags, u64 user_data) { - if (!READ_ONCE(ctx->submitter_task)) { - kfree_rcu(req, rcu_head); - return -EOWNERDEAD; - } req->opcode = IORING_OP_NOP; req->cqe.user_data = user_data; io_req_set_res(req, res, cflags); @@ -95,7 +91,6 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, req->tctx = NULL; req->io_task_work.func = io_msg_tw_complete; io_req_task_work_add_remote(req, IOU_F_TWQ_LAZY_WAKE); - return 0; } static int io_msg_data_remote(struct io_ring_ctx *target_ctx, @@ -111,8 +106,8 @@ static int io_msg_data_remote(struct io_ring_ctx *target_ctx, if (msg->flags & IORING_MSG_RING_FLAGS_PASS) flags = msg->cqe_flags; - return 
io_msg_remote_post(target_ctx, target, msg->len, flags, - msg->user_data); + io_msg_remote_post(target_ctx, target, msg->len, flags, msg->user_data); + return 0; } static int __io_msg_ring_data(struct io_ring_ctx *target_ctx, @@ -127,7 +122,7 @@ static int __io_msg_ring_data(struct io_ring_ctx *target_ctx, return -EINVAL; /* * Keep IORING_SETUP_R_DISABLED check before submitter_task load - * in io_msg_data_remote() -> io_msg_remote_post() + * in io_msg_data_remote() -> io_req_task_work_add_remote() */ if (smp_load_acquire(&target_ctx->flags) & IORING_SETUP_R_DISABLED) return -EBADFD; @@ -227,10 +222,7 @@ static int io_msg_fd_remote(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->file->private_data; struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); - struct task_struct *task = READ_ONCE(ctx->submitter_task); - - if (unlikely(!task)) - return -EOWNERDEAD; + struct task_struct *task = ctx->submitter_task; init_task_work(&msg->tw, io_msg_tw_fd_complete); if (task_work_add(task, &msg->tw, TWA_SIGNAL)) diff --git a/io_uring/register.c b/io_uring/register.c index 5c2574496aa9..29393f93a414 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -181,7 +181,7 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) return -EBADFD; if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) { - WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); + ctx->submitter_task = get_task_struct(current); /* * Lazy activation attempts would fail if it was polled before * submitter_task is set. From 130a82760718997806a618490f5b7ab06932bd9c Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Mon, 5 Jan 2026 14:05:42 -0700 Subject: [PATCH 05/30] io_uring/register: drop io_register_enable_rings() submitter_task check io_register_enable_rings() checks that the io_ring_ctx is IORING_SETUP_R_DISABLED, which ensures submitter_task hasn't been assigned by io_uring_create() or a previous io_register_enable_rings() call. 
So drop the redundant check that submitter_task is NULL. Signed-off-by: Caleb Sander Mateos Reviewed-by: Joanne Koong Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/register.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/register.c b/io_uring/register.c index 29393f93a414..a3fef649272b 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -180,7 +180,7 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) if (!(ctx->flags & IORING_SETUP_R_DISABLED)) return -EBADFD; - if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) { + if (ctx->flags & IORING_SETUP_SINGLE_ISSUER) { ctx->submitter_task = get_task_struct(current); /* * Lazy activation attempts would fail if it was polled before From 51fff55a66d89d76fcaeaa277d53bdf5b19efa0e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 12 Jan 2026 08:14:41 -0700 Subject: [PATCH 06/30] io_uring/register: have io_parse_restrictions() return number of ops Rather than return 0 on success, return >= 0 for success, where the return value is that number of parsed entries. As before, any < 0 return is an error. Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/register.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/io_uring/register.c b/io_uring/register.c index a3fef649272b..4d4e7420e7c6 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -103,6 +103,10 @@ static int io_register_personality(struct io_ring_ctx *ctx) return id; } +/* + * Returns number of restrictions parsed and added on success, or < 0 for + * an error. 
+ */ static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args, struct io_restriction *restrictions) { @@ -145,9 +149,7 @@ static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args, goto err; } } - - ret = 0; - + ret = nr_args; err: kfree(res); return ret; @@ -168,11 +170,12 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx, ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions); /* Reset all restrictions if an error happened */ - if (ret != 0) + if (ret < 0) { memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); - else - ctx->restrictions.registered = true; - return ret; + return ret; + } + ctx->restrictions.registered = true; + return 0; } static int io_register_enable_rings(struct io_ring_ctx *ctx) From e6ed0f051d557be5a782a42f74dddbc8ed5309ec Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 12 Jan 2026 08:14:42 -0700 Subject: [PATCH 07/30] io_uring/register: have io_parse_restrictions() set restrictions enabled Rather than leave this to the caller, have io_parse_restrictions() set ->registered = true if restrictions have been enabled. This is in preparation for having finer grained restrictions. 
Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/register.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/register.c b/io_uring/register.c index 4d4e7420e7c6..f5bd57c5bc38 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -150,6 +150,7 @@ static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args, } } ret = nr_args; + restrictions->registered = true; err: kfree(res); return ret; @@ -174,7 +175,6 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx, memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); return ret; } - ctx->restrictions.registered = true; return 0; } From 09bd84421defa0a9dcebdcdaf8b7deb1870855d0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 12 Jan 2026 08:14:43 -0700 Subject: [PATCH 08/30] io_uring/register: set ctx->restricted when restrictions are parsed Rather than defer this until the rings are enabled, just set it upfront when the restrictions are parsed and enabled anyway. There's no reason to defer this setting until the rings are enabled. 
Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/register.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/io_uring/register.c b/io_uring/register.c index f5bd57c5bc38..54ccf164be38 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -175,6 +175,8 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx, memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); return ret; } + if (ctx->restrictions.registered) + ctx->restricted = 1; return 0; } @@ -193,9 +195,6 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) io_activate_pollwq(ctx); } - if (ctx->restrictions.registered) - ctx->restricted = 1; - /* Keep submitter_task store before clearing IORING_SETUP_R_DISABLED */ smp_store_release(&ctx->flags, ctx->flags & ~IORING_SETUP_R_DISABLED); if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) @@ -627,7 +626,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (ctx->submitter_task && ctx->submitter_task != current) return -EEXIST; - if (ctx->restricted) { + if (ctx->restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) { opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); if (!test_bit(opcode, ctx->restrictions.register_op)) return -EACCES; From 991fb85a1d43f0d0237a405d5535024f78a873e5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 12 Jan 2026 08:14:44 -0700 Subject: [PATCH 09/30] io_uring: move ctx->restricted check into io_check_restriction() Just a cleanup, makes the code easier to read without too many dependent nested checks. 
Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a1dd1540ab79..92ed92a44023 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2056,6 +2056,8 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags) { + if (!ctx->restricted) + return true; if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) return false; @@ -2158,7 +2160,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, } } if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { - if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags)) + if (!io_check_restriction(ctx, req, sqe_flags)) return io_init_fail_req(req, -EACCES); /* knock it to the slow queue path, will be drained there */ if (ctx->drain_active) From d6406c45f14842019cfaaba19fe2a76ef9fa831c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 12 Jan 2026 08:14:45 -0700 Subject: [PATCH 10/30] io_uring: track restrictions separately for IORING_OP and IORING_REGISTER It's quite likely that only register opcode restrictions exists, in which case we'd never need to check the normal opcodes. Split ctx->restricted into two separate fields, one for I/O opcodes, and one for register opcodes. 
Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 8 ++++++-- io_uring/io_uring.c | 4 ++-- io_uring/register.c | 19 ++++++++++++++----- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 54fd30abf2b8..e4c804f99c30 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -224,7 +224,10 @@ struct io_restriction { DECLARE_BITMAP(sqe_op, IORING_OP_LAST); u8 sqe_flags_allowed; u8 sqe_flags_required; - bool registered; + /* IORING_OP_* restrictions exist */ + bool op_registered; + /* IORING_REGISTER_* restrictions exist */ + bool reg_registered; }; struct io_submit_link { @@ -259,7 +262,8 @@ struct io_ring_ctx { struct { unsigned int flags; unsigned int drain_next: 1; - unsigned int restricted: 1; + unsigned int op_restricted: 1; + unsigned int reg_restricted: 1; unsigned int off_timeout_used: 1; unsigned int drain_active: 1; unsigned int has_evfd: 1; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 92ed92a44023..2cde22af78a3 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2056,7 +2056,7 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags) { - if (!ctx->restricted) + if (!ctx->op_restricted) return true; if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) return false; @@ -2159,7 +2159,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, io_init_drain(ctx); } } - if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { + if (unlikely(ctx->op_restricted || ctx->drain_active || ctx->drain_next)) { if (!io_check_restriction(ctx, req, sqe_flags)) return io_init_fail_req(req, -EACCES); /* knock it to the slow queue path, will be drained there */ diff --git a/io_uring/register.c b/io_uring/register.c index 54ccf164be38..8551f13920dc 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ 
-133,24 +133,31 @@ static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args, if (res[i].register_op >= IORING_REGISTER_LAST) goto err; __set_bit(res[i].register_op, restrictions->register_op); + restrictions->reg_registered = true; break; case IORING_RESTRICTION_SQE_OP: if (res[i].sqe_op >= IORING_OP_LAST) goto err; __set_bit(res[i].sqe_op, restrictions->sqe_op); + restrictions->op_registered = true; break; case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: restrictions->sqe_flags_allowed = res[i].sqe_flags; + restrictions->op_registered = true; break; case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: restrictions->sqe_flags_required = res[i].sqe_flags; + restrictions->op_registered = true; break; default: goto err; } } ret = nr_args; - restrictions->registered = true; + if (!nr_args) { + restrictions->op_registered = true; + restrictions->reg_registered = true; + } err: kfree(res); return ret; @@ -166,7 +173,7 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx, return -EBADFD; /* We allow only a single restrictions registration */ - if (ctx->restrictions.registered) + if (ctx->restrictions.op_registered || ctx->restrictions.reg_registered) return -EBUSY; ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions); @@ -175,8 +182,10 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx, memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); return ret; } - if (ctx->restrictions.registered) - ctx->restricted = 1; + if (ctx->restrictions.op_registered) + ctx->op_restricted = 1; + if (ctx->restrictions.reg_registered) + ctx->reg_restricted = 1; return 0; } @@ -626,7 +635,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (ctx->submitter_task && ctx->submitter_task != current) return -EEXIST; - if (ctx->restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) { + if (ctx->reg_restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) { opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); if 
(!test_bit(opcode, ctx->restrictions.register_op)) return -EACCES; From 697a5284ad9697609324739e38e341612cd342a6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 14 Jan 2026 07:59:08 -0700 Subject: [PATCH 11/30] io_uring: fix IOPOLL with passthrough I/O A previous commit improving IOPOLL made an incorrect assumption that task_work isn't used with IOPOLL. This can cause crashes when doing passthrough I/O on nvme, where queueing the completion task_work will trample on the same memory that holds the completed list of requests. Fix it up by shuffling the members around, so we're not sharing any parts that end up getting used in this path. Fixes: 3c7d76d6128a ("io_uring: IOPOLL polling improvements") Reported-by: Yi Zhang Link: https://lore.kernel.org/linux-block/CAHj4cs_SLPj9v9w5MgfzHKy+983enPx3ZQY2kMuMJ1202DBefw@mail.gmail.com/ Tested-by: Yi Zhang Cc: Ming Lei Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 11 ++++------- io_uring/rw.c | 5 +++-- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index e4c804f99c30..211686ad89fd 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -713,13 +713,10 @@ struct io_kiocb { atomic_t refs; bool cancel_seq_set; - /* - * IOPOLL doesn't use task_work, so use the ->iopoll_node list - * entry to manage pending iopoll requests. 
- */ union { struct io_task_work io_task_work; - struct list_head iopoll_node; + /* For IOPOLL setup queues, with hybrid polling */ + u64 iopoll_start; }; union { @@ -728,8 +725,8 @@ struct io_kiocb { * poll */ struct hlist_node hash_node; - /* For IOPOLL setup queues, with hybrid polling */ - u64 iopoll_start; + /* IOPOLL completion handling */ + struct list_head iopoll_node; /* for private io_kiocb freeing */ struct rcu_head rcu_head; }; diff --git a/io_uring/rw.c b/io_uring/rw.c index 307f1f39d9f3..c33c533a267e 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -1296,12 +1296,13 @@ static int io_uring_hybrid_poll(struct io_kiocb *req, struct io_comp_batch *iob, unsigned int poll_flags) { struct io_ring_ctx *ctx = req->ctx; - u64 runtime, sleep_time; + u64 runtime, sleep_time, iopoll_start; int ret; + iopoll_start = READ_ONCE(req->iopoll_start); sleep_time = io_hybrid_iopoll_delay(ctx, req); ret = io_uring_classic_poll(req, iob, poll_flags); - runtime = ktime_get_ns() - req->iopoll_start - sleep_time; + runtime = ktime_get_ns() - iopoll_start - sleep_time; /* * Use minimum sleep time if we're polling devices with different From 8661d0b142bccfa19bb542ee21ec45a5423899ea Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 14 Jan 2026 08:51:02 -0700 Subject: [PATCH 12/30] io_uring/uring_cmd: explicitly disallow cancelations for IOPOLL This currently isn't supported, and due to a recent commit, it also cannot easily be supported by io_uring due to hash_node and IOPOLL completion data overlapping. This can be revisited if we ever do support cancelations of requests that have gone to the block stack. 
Suggested-by: Ming Lei Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 197474911f04..ee7b49f47cb5 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -104,6 +104,15 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, struct io_kiocb *req = cmd_to_io_kiocb(cmd); struct io_ring_ctx *ctx = req->ctx; + /* + * Doing cancelations on IOPOLL requests is not supported. Both + * because they can't get canceled in the block stack, but also + * because iopoll completion data overlaps with the hash_node used + * for tracking. + */ + if (ctx->flags & IORING_SETUP_IOPOLL) + return; + if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) { cmd->flags |= IORING_URING_CMD_CANCELABLE; io_ring_submit_lock(ctx, issue_flags); From 42b12cb5fd4554679bac06bbdd05dc8b643bcc42 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 20 Jan 2026 09:53:43 -0700 Subject: [PATCH 13/30] io_uring/timeout: annotate data race in io_flush_timeouts() syzbot correctly reports this as a KCSAN race, as ctx->cached_cq_tail should be read under ->uring_lock. This isn't immediately feasible in io_flush_timeouts(), but as long as we read a stable value, that should be good enough. If two io-wq threads compete on this value, then they will both end up calling io_flush_timeouts() and at least one of them will see the correct value. 
Reported-by: syzbot+6c48db7d94402407301e@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- io_uring/timeout.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index d8fbbaf31cf3..84dda24f3eb2 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -130,7 +130,7 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) u32 seq; raw_spin_lock_irq(&ctx->timeout_lock); - seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); + seq = READ_ONCE(ctx->cached_cq_tail) - atomic_read(&ctx->cq_timeouts); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); From 07f3c3a1cd56c2048a92dad0c11f15e4ac3888c1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Jan 2026 11:21:32 -0700 Subject: [PATCH 14/30] io_uring/eventfd: remove unused ctx->evfd_last_cq_tail member A previous commit got rid of any use of this member, but forgot to remove it. Kill it. Fixes: f4bb2f65bb81 ("io_uring/eventfd: move ctx->evfd_last_cq_tail into io_ev_fd") Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 211686ad89fd..dc6bd6940a0d 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -442,6 +442,9 @@ struct io_ring_ctx { struct list_head defer_list; unsigned nr_drained; + /* protected by ->completion_lock */ + unsigned nr_req_allocated; + #ifdef CONFIG_NET_RX_BUSY_POLL struct list_head napi_list; /* track busy poll napi_id */ spinlock_t napi_lock; /* napi_list lock */ @@ -454,10 +457,6 @@ struct io_ring_ctx { DECLARE_HASHTABLE(napi_ht, 4); #endif - /* protected by ->completion_lock */ - unsigned evfd_last_cq_tail; - unsigned nr_req_allocated; - /* * Protection for resize vs mmap races - both the mmap and resize * side will need to grab this lock, to prevent either side from From 
649dd18f559891bdafc5532d737c7dfb56060a6d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Jan 2026 11:48:56 -0700 Subject: [PATCH 15/30] io_uring/sync: validate passed in offset Check if the passed in offset is negative once cast to sync->off. This ensures that -EINVAL is returned for that case, like it would be for sync_file_range(2). Fixes: c992fe2925d7 ("io_uring: add fsync support") Signed-off-by: Jens Axboe --- io_uring/sync.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/sync.c b/io_uring/sync.c index cea2d381ffd2..ab7fa1cd7dd6 100644 --- a/io_uring/sync.c +++ b/io_uring/sync.c @@ -62,6 +62,8 @@ int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; sync->off = READ_ONCE(sqe->off); + if (sync->off < 0) + return -EINVAL; sync->len = READ_ONCE(sqe->len); req->flags |= REQ_F_FORCE_ASYNC; return 0; From dd120bddc43af583e3bc9afe75f58bb9280947b0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Jan 2026 16:06:41 -0700 Subject: [PATCH 16/30] io_uring: add IO_URING_EXIT_WAIT_MAX definition Add the timeout we normally wait before complaining about things being stuck waiting for cancelations to complete as a define, and use it in io_ring_exit_work(). 
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- io_uring/io_uring.h | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2cde22af78a3..6c9003a3efae 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2962,7 +2962,7 @@ static __cold void io_tctx_exit_cb(struct callback_head *cb) static __cold void io_ring_exit_work(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); - unsigned long timeout = jiffies + HZ * 60 * 5; + unsigned long timeout = jiffies + IO_URING_EXIT_WAIT_MAX; unsigned long interval = HZ / 20; struct io_tctx_exit exit; struct io_tctx_node *node; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index c5bbb43b5842..2df861200b4f 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -88,6 +88,12 @@ struct io_ctx_config { IOSQE_BUFFER_SELECT |\ IOSQE_CQE_SKIP_SUCCESS) +/* + * Complaint timeout for io_uring cancelation exits, and for io-wq exit + * worker waiting. + */ +#define IO_URING_EXIT_WAIT_MAX (HZ * 60 * 5) + enum { IOU_COMPLETE = 0, From 1f293098a31368a5d5cfab215a56f4cbed3f657a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Jan 2026 10:40:33 -0700 Subject: [PATCH 17/30] io_uring/io-wq: don't trigger hung task for syzbot craziness Use the same trick that blk_io_schedule() does to avoid triggering the hung task warning (and potential reboot/panic, depending on system settings), and only wait for half the hung task timeout at the time. If we exceed the default IO_URING_EXIT_WAIT_MAX period where we expect things to certainly have finished unless there's a bug, then throw a WARN_ON_ONCE() for that case. 
Reported-by: syzbot+4eb282331cab6d5b6588@syzkaller.appspotmail.com Tested-by: syzbot+4eb282331cab6d5b6588@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index cd13d8aac3d2..aaf1dfc21763 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "io-wq.h" @@ -1316,6 +1317,8 @@ static void io_wq_cancel_tw_create(struct io_wq *wq) static void io_wq_exit_workers(struct io_wq *wq) { + unsigned long timeout, warn_timeout; + if (!wq->task) return; @@ -1325,7 +1328,24 @@ static void io_wq_exit_workers(struct io_wq *wq) io_wq_for_each_worker(wq, io_wq_worker_wake, NULL); rcu_read_unlock(); io_worker_ref_put(wq); - wait_for_completion(&wq->worker_done); + + /* + * Shut up hung task complaint, see for example + * + * https://lore.kernel.org/all/696fc9e7.a70a0220.111c58.0006.GAE@google.com/ + * + * where completely overloading the system with tons of long running + * io-wq items can easily trigger the hung task timeout. Only sleep + * uninterruptibly for half that time, and warn if we end + * up waiting more than IO_URING_EXIT_WAIT_MAX. + */ + timeout = sysctl_hung_task_timeout_secs * HZ / 2; + warn_timeout = jiffies + IO_URING_EXIT_WAIT_MAX; + do { + if (wait_for_completion_timeout(&wq->worker_done, timeout)) + break; + WARN_ON_ONCE(time_after(jiffies, warn_timeout)); + } while (1); spin_lock_irq(&wq->hash->wait.lock); list_del_init(&wq->wait.entry); From 7642e668606009fcd3fe1fc161a79ef90403d507 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Jan 2026 08:55:45 -0700 Subject: [PATCH 18/30] io_uring: split out task work code into tw.c Move the task work handling code out of io_uring.c into a new tw.c file. This includes the local work, normal work, and fallback work handling infrastructure. 
The associated tw.h header contains io_should_terminate_tw() as a static inline helper, along with the necessary function declarations. Signed-off-by: Jens Axboe --- io_uring/Makefile | 3 +- io_uring/io_uring.c | 371 -------------------------------------------- io_uring/io_uring.h | 79 +--------- io_uring/tw.c | 354 ++++++++++++++++++++++++++++++++++++++++++ io_uring/tw.h | 124 +++++++++++++++ 5 files changed, 483 insertions(+), 448 deletions(-) create mode 100644 io_uring/tw.c create mode 100644 io_uring/tw.h diff --git a/io_uring/Makefile b/io_uring/Makefile index bc4e4a3fa0a5..b7ea66a9fcfc 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -8,12 +8,13 @@ endif obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ tctx.o filetable.o rw.o poll.o \ - eventfd.o uring_cmd.o openclose.o \ + tw.o eventfd.o uring_cmd.o openclose.o \ sqpoll.o xattr.o nop.o fs.o splice.o \ sync.o msg_ring.o advise.o openclose.o \ statx.o timeout.o cancel.o \ waitid.o register.o truncate.o \ memmap.o alloc_cache.o query.o + obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6c9003a3efae..f9b716c819d1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -110,19 +110,10 @@ #define IO_COMPL_BATCH 32 #define IO_REQ_ALLOC_BATCH 8 -#define IO_LOCAL_TW_DEFAULT_MAX 20 /* requests with any of those set should undergo io_disarm_next() */ #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) -/* - * No waiters. It's larger than any valid value of the tw counter - * so that tests against ->cq_wait_nr would fail and skip wake_up(). 
- */ -#define IO_CQ_WAKE_INIT (-1U) -/* Forced wake up if there is a waiter regardless of ->cq_wait_nr */ -#define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) - static void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags); static void __io_req_caches_free(struct io_ring_ctx *ctx); @@ -205,38 +196,6 @@ static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) complete(&ctx->ref_comp); } -/* - * Terminate the request if either of these conditions are true: - * - * 1) It's being executed by the original task, but that task is marked - * with PF_EXITING as it's exiting. - * 2) PF_KTHREAD is set, in which case the invoker of the task_work is - * our fallback task_work. - * 3) The ring has been closed and is going away. - */ -static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) -{ - return (current->flags & (PF_EXITING | PF_KTHREAD)) || percpu_ref_is_dying(&ctx->refs); -} - -static __cold void io_fallback_req_func(struct work_struct *work) -{ - struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, - fallback_work.work); - struct llist_node *node = llist_del_all(&ctx->fallback_llist); - struct io_kiocb *req, *tmp; - struct io_tw_state ts = {}; - - percpu_ref_get(&ctx->refs); - mutex_lock(&ctx->uring_lock); - ts.cancel = io_should_terminate_tw(ctx); - llist_for_each_entry_safe(req, tmp, node, io_task_work.node) - req->io_task_work.func((struct io_tw_req){req}, ts); - io_submit_flush_completions(ctx); - mutex_unlock(&ctx->uring_lock); - percpu_ref_put(&ctx->refs); -} - static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) { unsigned int hash_buckets; @@ -1070,336 +1029,6 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) return nxt; } -static void ctx_flush_and_put(struct io_ring_ctx *ctx, io_tw_token_t tw) -{ - if (!ctx) - return; - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - - io_submit_flush_completions(ctx); - 
mutex_unlock(&ctx->uring_lock); - percpu_ref_put(&ctx->refs); -} - -/* - * Run queued task_work, returning the number of entries processed in *count. - * If more entries than max_entries are available, stop processing once this - * is reached and return the rest of the list. - */ -struct llist_node *io_handle_tw_list(struct llist_node *node, - unsigned int *count, - unsigned int max_entries) -{ - struct io_ring_ctx *ctx = NULL; - struct io_tw_state ts = { }; - - do { - struct llist_node *next = node->next; - struct io_kiocb *req = container_of(node, struct io_kiocb, - io_task_work.node); - - if (req->ctx != ctx) { - ctx_flush_and_put(ctx, ts); - ctx = req->ctx; - mutex_lock(&ctx->uring_lock); - percpu_ref_get(&ctx->refs); - ts.cancel = io_should_terminate_tw(ctx); - } - INDIRECT_CALL_2(req->io_task_work.func, - io_poll_task_func, io_req_rw_complete, - (struct io_tw_req){req}, ts); - node = next; - (*count)++; - if (unlikely(need_resched())) { - ctx_flush_and_put(ctx, ts); - ctx = NULL; - cond_resched(); - } - } while (node && *count < max_entries); - - ctx_flush_and_put(ctx, ts); - return node; -} - -static __cold void __io_fallback_tw(struct llist_node *node, bool sync) -{ - struct io_ring_ctx *last_ctx = NULL; - struct io_kiocb *req; - - while (node) { - req = container_of(node, struct io_kiocb, io_task_work.node); - node = node->next; - if (last_ctx != req->ctx) { - if (last_ctx) { - if (sync) - flush_delayed_work(&last_ctx->fallback_work); - percpu_ref_put(&last_ctx->refs); - } - last_ctx = req->ctx; - percpu_ref_get(&last_ctx->refs); - } - if (llist_add(&req->io_task_work.node, &last_ctx->fallback_llist)) - schedule_delayed_work(&last_ctx->fallback_work, 1); - } - - if (last_ctx) { - if (sync) - flush_delayed_work(&last_ctx->fallback_work); - percpu_ref_put(&last_ctx->refs); - } -} - -static void io_fallback_tw(struct io_uring_task *tctx, bool sync) -{ - struct llist_node *node = llist_del_all(&tctx->task_list); - - __io_fallback_tw(node, sync); -} - -struct 
llist_node *tctx_task_work_run(struct io_uring_task *tctx, - unsigned int max_entries, - unsigned int *count) -{ - struct llist_node *node; - - node = llist_del_all(&tctx->task_list); - if (node) { - node = llist_reverse_order(node); - node = io_handle_tw_list(node, count, max_entries); - } - - /* relaxed read is enough as only the task itself sets ->in_cancel */ - if (unlikely(atomic_read(&tctx->in_cancel))) - io_uring_drop_tctx_refs(current); - - trace_io_uring_task_work_run(tctx, *count); - return node; -} - -void tctx_task_work(struct callback_head *cb) -{ - struct io_uring_task *tctx; - struct llist_node *ret; - unsigned int count = 0; - - tctx = container_of(cb, struct io_uring_task, task_work); - ret = tctx_task_work_run(tctx, UINT_MAX, &count); - /* can't happen */ - WARN_ON_ONCE(ret); -} - -static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) -{ - struct io_ring_ctx *ctx = req->ctx; - unsigned nr_wait, nr_tw, nr_tw_prev; - struct llist_node *head; - - /* See comment above IO_CQ_WAKE_INIT */ - BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES); - - /* - * We don't know how many requests there are in the link and whether - * they can even be queued lazily, fall back to non-lazy. - */ - if (req->flags & IO_REQ_LINK_FLAGS) - flags &= ~IOU_F_TWQ_LAZY_WAKE; - - guard(rcu)(); - - head = READ_ONCE(ctx->work_llist.first); - do { - nr_tw_prev = 0; - if (head) { - struct io_kiocb *first_req = container_of(head, - struct io_kiocb, - io_task_work.node); - /* - * Might be executed at any moment, rely on - * SLAB_TYPESAFE_BY_RCU to keep it alive. - */ - nr_tw_prev = READ_ONCE(first_req->nr_tw); - } - - /* - * Theoretically, it can overflow, but that's fine as one of - * previous adds should've tried to wake the task. 
- */ - nr_tw = nr_tw_prev + 1; - if (!(flags & IOU_F_TWQ_LAZY_WAKE)) - nr_tw = IO_CQ_WAKE_FORCE; - - req->nr_tw = nr_tw; - req->io_task_work.node.next = head; - } while (!try_cmpxchg(&ctx->work_llist.first, &head, - &req->io_task_work.node)); - - /* - * cmpxchg implies a full barrier, which pairs with the barrier - * in set_current_state() on the io_cqring_wait() side. It's used - * to ensure that either we see updated ->cq_wait_nr, or waiters - * going to sleep will observe the work added to the list, which - * is similar to the wait/wawke task state sync. - */ - - if (!head) { - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (ctx->has_evfd) - io_eventfd_signal(ctx, false); - } - - nr_wait = atomic_read(&ctx->cq_wait_nr); - /* not enough or no one is waiting */ - if (nr_tw < nr_wait) - return; - /* the previous add has already woken it up */ - if (nr_tw_prev >= nr_wait) - return; - wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); -} - -static void io_req_normal_work_add(struct io_kiocb *req) -{ - struct io_uring_task *tctx = req->tctx; - struct io_ring_ctx *ctx = req->ctx; - - /* task_work already pending, we're done */ - if (!llist_add(&req->io_task_work.node, &tctx->task_list)) - return; - - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - - /* SQPOLL doesn't need the task_work added, it'll run it itself */ - if (ctx->flags & IORING_SETUP_SQPOLL) { - __set_notify_signal(tctx->task); - return; - } - - if (likely(!task_work_add(tctx->task, &tctx->task_work, ctx->notify_method))) - return; - - io_fallback_tw(tctx, false); -} - -void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) -{ - if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) - io_req_local_work_add(req, flags); - else - io_req_normal_work_add(req); -} - -void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags) -{ - if (WARN_ON_ONCE(!(req->ctx->flags & 
IORING_SETUP_DEFER_TASKRUN))) - return; - __io_req_task_work_add(req, flags); -} - -static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) -{ - struct llist_node *node = llist_del_all(&ctx->work_llist); - - __io_fallback_tw(node, false); - node = llist_del_all(&ctx->retry_llist); - __io_fallback_tw(node, false); -} - -static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, - int min_events) -{ - if (!io_local_work_pending(ctx)) - return false; - if (events < min_events) - return true; - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - return false; -} - -static int __io_run_local_work_loop(struct llist_node **node, - io_tw_token_t tw, - int events) -{ - int ret = 0; - - while (*node) { - struct llist_node *next = (*node)->next; - struct io_kiocb *req = container_of(*node, struct io_kiocb, - io_task_work.node); - INDIRECT_CALL_2(req->io_task_work.func, - io_poll_task_func, io_req_rw_complete, - (struct io_tw_req){req}, tw); - *node = next; - if (++ret >= events) - break; - } - - return ret; -} - -static int __io_run_local_work(struct io_ring_ctx *ctx, io_tw_token_t tw, - int min_events, int max_events) -{ - struct llist_node *node; - unsigned int loops = 0; - int ret = 0; - - if (WARN_ON_ONCE(ctx->submitter_task != current)) - return -EEXIST; - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); -again: - tw.cancel = io_should_terminate_tw(ctx); - min_events -= ret; - ret = __io_run_local_work_loop(&ctx->retry_llist.first, tw, max_events); - if (ctx->retry_llist.first) - goto retry_done; - - /* - * llists are in reverse order, flip it back the right way before - * running the pending items. 
- */ - node = llist_reverse_order(llist_del_all(&ctx->work_llist)); - ret += __io_run_local_work_loop(&node, tw, max_events - ret); - ctx->retry_llist.first = node; - loops++; - - if (io_run_local_work_continue(ctx, ret, min_events)) - goto again; -retry_done: - io_submit_flush_completions(ctx); - if (io_run_local_work_continue(ctx, ret, min_events)) - goto again; - - trace_io_uring_local_work_run(ctx, ret, loops); - return ret; -} - -static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, - int min_events) -{ - struct io_tw_state ts = {}; - - if (!io_local_work_pending(ctx)) - return 0; - return __io_run_local_work(ctx, ts, min_events, - max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); -} - -int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events) -{ - struct io_tw_state ts = {}; - int ret; - - mutex_lock(&ctx->uring_lock); - ret = __io_run_local_work(ctx, ts, min_events, max_events); - mutex_unlock(&ctx->uring_lock); - return ret; -} - static void io_req_task_cancel(struct io_tw_req tw_req, io_tw_token_t tw) { struct io_kiocb *req = tw_req.req; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 2df861200b4f..29b8f90fdabf 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -10,6 +10,7 @@ #include "alloc_cache.h" #include "io-wq.h" #include "slist.h" +#include "tw.h" #include "opdef.h" #ifndef CREATE_TRACE_POINTS @@ -88,6 +89,8 @@ struct io_ctx_config { IOSQE_BUFFER_SELECT |\ IOSQE_CQE_SKIP_SUCCESS) +#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) + /* * Complaint timeout for io_uring cancelation exits, and for io-wq exit * worker waiting. 
@@ -156,8 +159,6 @@ static inline bool io_should_wake(struct io_wait_queue *iowq) int io_prepare_config(struct io_ctx_config *config); bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32); -int io_run_task_work_sig(struct io_ring_ctx *ctx); -int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events); void io_req_defer_failed(struct io_kiocb *req, s32 res); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); @@ -171,15 +172,10 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); -void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); -void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags); void io_req_task_queue(struct io_kiocb *req); void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw); void io_req_task_queue_fail(struct io_kiocb *req, int ret); void io_req_task_submit(struct io_tw_req tw_req, io_tw_token_t tw); -struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); -struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); -void tctx_task_work(struct callback_head *cb); __cold void io_uring_drop_tctx_refs(struct task_struct *task); int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, @@ -232,11 +228,6 @@ static inline bool io_is_compat(struct io_ring_ctx *ctx) return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->compat); } -static inline void io_req_task_work_add(struct io_kiocb *req) -{ - __io_req_task_work_add(req, 0); -} - static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) { if (!wq_list_empty(&ctx->submit_state.compl_reqs) || @@ -461,59 +452,6 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) 
return min(entries, ctx->sq_entries); } -static inline int io_run_task_work(void) -{ - bool ret = false; - - /* - * Always check-and-clear the task_work notification signal. With how - * signaling works for task_work, we can find it set with nothing to - * run. We need to clear it for that case, like get_signal() does. - */ - if (test_thread_flag(TIF_NOTIFY_SIGNAL)) - clear_notify_signal(); - /* - * PF_IO_WORKER never returns to userspace, so check here if we have - * notify work that needs processing. - */ - if (current->flags & PF_IO_WORKER) { - if (test_thread_flag(TIF_NOTIFY_RESUME)) { - __set_current_state(TASK_RUNNING); - resume_user_mode_work(NULL); - } - if (current->io_uring) { - unsigned int count = 0; - - __set_current_state(TASK_RUNNING); - tctx_task_work_run(current->io_uring, UINT_MAX, &count); - if (count) - ret = true; - } - } - if (task_work_pending(current)) { - __set_current_state(TASK_RUNNING); - task_work_run(); - ret = true; - } - - return ret; -} - -static inline bool io_local_work_pending(struct io_ring_ctx *ctx) -{ - return !llist_empty(&ctx->work_llist) || !llist_empty(&ctx->retry_llist); -} - -static inline bool io_task_work_pending(struct io_ring_ctx *ctx) -{ - return task_work_pending(current) || io_local_work_pending(ctx); -} - -static inline void io_tw_lock(struct io_ring_ctx *ctx, io_tw_token_t tw) -{ - lockdep_assert_held(&ctx->uring_lock); -} - /* * Don't complete immediately but use deferred completion infrastructure. 
* Protected by ->uring_lock and can only be used either with @@ -571,17 +509,6 @@ static inline bool io_alloc_req(struct io_ring_ctx *ctx, struct io_kiocb **req) return true; } -static inline bool io_allowed_defer_tw_run(struct io_ring_ctx *ctx) -{ - return likely(ctx->submitter_task == current); -} - -static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) -{ - return likely(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN) || - ctx->submitter_task == current); -} - static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res) { io_req_set_res(req, res, 0); diff --git a/io_uring/tw.c b/io_uring/tw.c new file mode 100644 index 000000000000..f20ffc529040 --- /dev/null +++ b/io_uring/tw.c @@ -0,0 +1,354 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Task work handling for io_uring + */ +#include +#include +#include +#include +#include + +#include "io_uring.h" +#include "tctx.h" +#include "poll.h" +#include "rw.h" +#include "eventfd.h" + +void io_fallback_req_func(struct work_struct *work) +{ + struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, + fallback_work.work); + struct llist_node *node = llist_del_all(&ctx->fallback_llist); + struct io_kiocb *req, *tmp; + struct io_tw_state ts = {}; + + percpu_ref_get(&ctx->refs); + mutex_lock(&ctx->uring_lock); + ts.cancel = io_should_terminate_tw(ctx); + llist_for_each_entry_safe(req, tmp, node, io_task_work.node) + req->io_task_work.func((struct io_tw_req){req}, ts); + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); + percpu_ref_put(&ctx->refs); +} + +static void ctx_flush_and_put(struct io_ring_ctx *ctx, io_tw_token_t tw) +{ + if (!ctx) + return; + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); + percpu_ref_put(&ctx->refs); +} + +/* + * Run queued task_work, returning the number of entries processed in *count. 
+ * If more entries than max_entries are available, stop processing once this + * is reached and return the rest of the list. + */ +struct llist_node *io_handle_tw_list(struct llist_node *node, + unsigned int *count, + unsigned int max_entries) +{ + struct io_ring_ctx *ctx = NULL; + struct io_tw_state ts = { }; + + do { + struct llist_node *next = node->next; + struct io_kiocb *req = container_of(node, struct io_kiocb, + io_task_work.node); + + if (req->ctx != ctx) { + ctx_flush_and_put(ctx, ts); + ctx = req->ctx; + mutex_lock(&ctx->uring_lock); + percpu_ref_get(&ctx->refs); + ts.cancel = io_should_terminate_tw(ctx); + } + INDIRECT_CALL_2(req->io_task_work.func, + io_poll_task_func, io_req_rw_complete, + (struct io_tw_req){req}, ts); + node = next; + (*count)++; + if (unlikely(need_resched())) { + ctx_flush_and_put(ctx, ts); + ctx = NULL; + cond_resched(); + } + } while (node && *count < max_entries); + + ctx_flush_and_put(ctx, ts); + return node; +} + +static __cold void __io_fallback_tw(struct llist_node *node, bool sync) +{ + struct io_ring_ctx *last_ctx = NULL; + struct io_kiocb *req; + + while (node) { + req = container_of(node, struct io_kiocb, io_task_work.node); + node = node->next; + if (last_ctx != req->ctx) { + if (last_ctx) { + if (sync) + flush_delayed_work(&last_ctx->fallback_work); + percpu_ref_put(&last_ctx->refs); + } + last_ctx = req->ctx; + percpu_ref_get(&last_ctx->refs); + } + if (llist_add(&req->io_task_work.node, &last_ctx->fallback_llist)) + schedule_delayed_work(&last_ctx->fallback_work, 1); + } + + if (last_ctx) { + if (sync) + flush_delayed_work(&last_ctx->fallback_work); + percpu_ref_put(&last_ctx->refs); + } +} + +static void io_fallback_tw(struct io_uring_task *tctx, bool sync) +{ + struct llist_node *node = llist_del_all(&tctx->task_list); + + __io_fallback_tw(node, sync); +} + +struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, + unsigned int max_entries, + unsigned int *count) +{ + struct llist_node *node; + + node = 
llist_del_all(&tctx->task_list); + if (node) { + node = llist_reverse_order(node); + node = io_handle_tw_list(node, count, max_entries); + } + + /* relaxed read is enough as only the task itself sets ->in_cancel */ + if (unlikely(atomic_read(&tctx->in_cancel))) + io_uring_drop_tctx_refs(current); + + trace_io_uring_task_work_run(tctx, *count); + return node; +} + +void tctx_task_work(struct callback_head *cb) +{ + struct io_uring_task *tctx; + struct llist_node *ret; + unsigned int count = 0; + + tctx = container_of(cb, struct io_uring_task, task_work); + ret = tctx_task_work_run(tctx, UINT_MAX, &count); + /* can't happen */ + WARN_ON_ONCE(ret); +} + +void io_req_local_work_add(struct io_kiocb *req, unsigned flags) +{ + struct io_ring_ctx *ctx = req->ctx; + unsigned nr_wait, nr_tw, nr_tw_prev; + struct llist_node *head; + + /* See comment above IO_CQ_WAKE_INIT */ + BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES); + + /* + * We don't know how many requests there are in the link and whether + * they can even be queued lazily, fall back to non-lazy. + */ + if (req->flags & IO_REQ_LINK_FLAGS) + flags &= ~IOU_F_TWQ_LAZY_WAKE; + + guard(rcu)(); + + head = READ_ONCE(ctx->work_llist.first); + do { + nr_tw_prev = 0; + if (head) { + struct io_kiocb *first_req = container_of(head, + struct io_kiocb, + io_task_work.node); + /* + * Might be executed at any moment, rely on + * SLAB_TYPESAFE_BY_RCU to keep it alive. + */ + nr_tw_prev = READ_ONCE(first_req->nr_tw); + } + + /* + * Theoretically, it can overflow, but that's fine as one of + * previous adds should've tried to wake the task. + */ + nr_tw = nr_tw_prev + 1; + if (!(flags & IOU_F_TWQ_LAZY_WAKE)) + nr_tw = IO_CQ_WAKE_FORCE; + + req->nr_tw = nr_tw; + req->io_task_work.node.next = head; + } while (!try_cmpxchg(&ctx->work_llist.first, &head, + &req->io_task_work.node)); + + /* + * cmpxchg implies a full barrier, which pairs with the barrier + * in set_current_state() on the io_cqring_wait() side. 
It's used + * to ensure that either we see updated ->cq_wait_nr, or waiters + * going to sleep will observe the work added to the list, which + * is similar to the wait/wake task state sync. + */ + + if (!head) { + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + if (ctx->has_evfd) + io_eventfd_signal(ctx, false); + } + + nr_wait = atomic_read(&ctx->cq_wait_nr); + /* not enough or no one is waiting */ + if (nr_tw < nr_wait) + return; + /* the previous add has already woken it up */ + if (nr_tw_prev >= nr_wait) + return; + wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); +} + +void io_req_normal_work_add(struct io_kiocb *req) +{ + struct io_uring_task *tctx = req->tctx; + struct io_ring_ctx *ctx = req->ctx; + + /* task_work already pending, we're done */ + if (!llist_add(&req->io_task_work.node, &tctx->task_list)) + return; + + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + + /* SQPOLL doesn't need the task_work added, it'll run it itself */ + if (ctx->flags & IORING_SETUP_SQPOLL) { + __set_notify_signal(tctx->task); + return; + } + + if (likely(!task_work_add(tctx->task, &tctx->task_work, ctx->notify_method))) + return; + + io_fallback_tw(tctx, false); +} + +void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags) +{ + if (WARN_ON_ONCE(!(req->ctx->flags & IORING_SETUP_DEFER_TASKRUN))) + return; + __io_req_task_work_add(req, flags); +} + +void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) +{ + struct llist_node *node = llist_del_all(&ctx->work_llist); + + __io_fallback_tw(node, false); + node = llist_del_all(&ctx->retry_llist); + __io_fallback_tw(node, false); +} + +static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, + int min_events) +{ + if (!io_local_work_pending(ctx)) + return false; + if (events < min_events) + return true; + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + 
atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + return false; +} + +static int __io_run_local_work_loop(struct llist_node **node, + io_tw_token_t tw, + int events) +{ + int ret = 0; + + while (*node) { + struct llist_node *next = (*node)->next; + struct io_kiocb *req = container_of(*node, struct io_kiocb, + io_task_work.node); + INDIRECT_CALL_2(req->io_task_work.func, + io_poll_task_func, io_req_rw_complete, + (struct io_tw_req){req}, tw); + *node = next; + if (++ret >= events) + break; + } + + return ret; +} + +static int __io_run_local_work(struct io_ring_ctx *ctx, io_tw_token_t tw, + int min_events, int max_events) +{ + struct llist_node *node; + unsigned int loops = 0; + int ret = 0; + + if (WARN_ON_ONCE(ctx->submitter_task != current)) + return -EEXIST; + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); +again: + tw.cancel = io_should_terminate_tw(ctx); + min_events -= ret; + ret = __io_run_local_work_loop(&ctx->retry_llist.first, tw, max_events); + if (ctx->retry_llist.first) + goto retry_done; + + /* + * llists are in reverse order, flip it back the right way before + * running the pending items. 
+ */ + node = llist_reverse_order(llist_del_all(&ctx->work_llist)); + ret += __io_run_local_work_loop(&node, tw, max_events - ret); + ctx->retry_llist.first = node; + loops++; + + if (io_run_local_work_continue(ctx, ret, min_events)) + goto again; +retry_done: + io_submit_flush_completions(ctx); + if (io_run_local_work_continue(ctx, ret, min_events)) + goto again; + + trace_io_uring_local_work_run(ctx, ret, loops); + return ret; +} + +int io_run_local_work_locked(struct io_ring_ctx *ctx, int min_events) +{ + struct io_tw_state ts = {}; + + if (!io_local_work_pending(ctx)) + return 0; + return __io_run_local_work(ctx, ts, min_events, + max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); +} + +int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events) +{ + struct io_tw_state ts = {}; + int ret; + + mutex_lock(&ctx->uring_lock); + ret = __io_run_local_work(ctx, ts, min_events, max_events); + mutex_unlock(&ctx->uring_lock); + return ret; +} diff --git a/io_uring/tw.h b/io_uring/tw.h new file mode 100644 index 000000000000..8683efca58ef --- /dev/null +++ b/io_uring/tw.h @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_TW_H +#define IOU_TW_H + +#include +#include +#include + +#define IO_LOCAL_TW_DEFAULT_MAX 20 + +/* + * No waiters. It's larger than any valid value of the tw counter + * so that tests against ->cq_wait_nr would fail and skip wake_up(). + */ +#define IO_CQ_WAKE_INIT (-1U) +/* Forced wake up if there is a waiter regardless of ->cq_wait_nr */ +#define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) + +/* + * Terminate the request if either of these conditions are true: + * + * 1) It's being executed by the original task, but that task is marked + * with PF_EXITING as it's exiting. + * 2) PF_KTHREAD is set, in which case the invoker of the task_work is + * our fallback task_work. + * 3) The ring has been closed and is going away. 
+ */ +static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) +{ + return (current->flags & (PF_EXITING | PF_KTHREAD)) || percpu_ref_is_dying(&ctx->refs); +} + +void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags); +struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); +void tctx_task_work(struct callback_head *cb); +int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events); +int io_run_task_work_sig(struct io_ring_ctx *ctx); + +__cold void io_fallback_req_func(struct work_struct *work); +__cold void io_move_task_work_from_local(struct io_ring_ctx *ctx); +int io_run_local_work_locked(struct io_ring_ctx *ctx, int min_events); + +void io_req_local_work_add(struct io_kiocb *req, unsigned flags); +void io_req_normal_work_add(struct io_kiocb *req); +struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); + +static inline void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) +{ + if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) + io_req_local_work_add(req, flags); + else + io_req_normal_work_add(req); +} + +static inline void io_req_task_work_add(struct io_kiocb *req) +{ + __io_req_task_work_add(req, 0); +} + +static inline int io_run_task_work(void) +{ + bool ret = false; + + /* + * Always check-and-clear the task_work notification signal. With how + * signaling works for task_work, we can find it set with nothing to + * run. We need to clear it for that case, like get_signal() does. + */ + if (test_thread_flag(TIF_NOTIFY_SIGNAL)) + clear_notify_signal(); + /* + * PF_IO_WORKER never returns to userspace, so check here if we have + * notify work that needs processing. 
+ */ + if (current->flags & PF_IO_WORKER) { + if (test_thread_flag(TIF_NOTIFY_RESUME)) { + __set_current_state(TASK_RUNNING); + resume_user_mode_work(NULL); + } + if (current->io_uring) { + unsigned int count = 0; + + __set_current_state(TASK_RUNNING); + tctx_task_work_run(current->io_uring, UINT_MAX, &count); + if (count) + ret = true; + } + } + if (task_work_pending(current)) { + __set_current_state(TASK_RUNNING); + task_work_run(); + ret = true; + } + + return ret; +} + +static inline bool io_local_work_pending(struct io_ring_ctx *ctx) +{ + return !llist_empty(&ctx->work_llist) || !llist_empty(&ctx->retry_llist); +} + +static inline bool io_task_work_pending(struct io_ring_ctx *ctx) +{ + return task_work_pending(current) || io_local_work_pending(ctx); +} + +static inline void io_tw_lock(struct io_ring_ctx *ctx, io_tw_token_t tw) +{ + lockdep_assert_held(&ctx->uring_lock); +} + +static inline bool io_allowed_defer_tw_run(struct io_ring_ctx *ctx) +{ + return likely(ctx->submitter_task == current); +} + +static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) +{ + return likely(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN) || + ctx->submitter_task == current); +} + +#endif From 0105b0562a5ed6374f06e5cd4246a3f1311a65a0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Jan 2026 08:40:19 -0700 Subject: [PATCH 19/30] io_uring: split out CQ waiting code into wait.c Move the completion queue waiting and scheduling code out of io_uring.c into a dedicated wait.c file. This further removes code out of the main io_uring C and header file, and into a topical new file. 
Signed-off-by: Jens Axboe --- io_uring/Makefile | 13 +- io_uring/cancel.c | 1 + io_uring/io_uring.c | 322 +------------------------------------------- io_uring/tw.c | 1 + io_uring/tw.h | 8 -- io_uring/wait.c | 308 ++++++++++++++++++++++++++++++++++++++++++ io_uring/wait.h | 49 +++++++ 7 files changed, 368 insertions(+), 334 deletions(-) create mode 100644 io_uring/wait.c create mode 100644 io_uring/wait.h diff --git a/io_uring/Makefile b/io_uring/Makefile index b7ea66a9fcfc..bf9eff88427a 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -8,12 +8,13 @@ endif obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ tctx.o filetable.o rw.o poll.o \ - tw.o eventfd.o uring_cmd.o openclose.o \ - sqpoll.o xattr.o nop.o fs.o splice.o \ - sync.o msg_ring.o advise.o openclose.o \ - statx.o timeout.o cancel.o \ - waitid.o register.o truncate.o \ - memmap.o alloc_cache.o query.o + tw.o wait.o eventfd.o uring_cmd.o \ + openclose.o sqpoll.o xattr.o nop.o \ + fs.o splice.o sync.o msg_ring.o \ + advise.o openclose.o statx.o timeout.o \ + cancel.o waitid.o register.o \ + truncate.o memmap.o alloc_cache.o \ + query.o obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 76c657a28fe7..653be6152a6f 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -19,6 +19,7 @@ #include "waitid.h" #include "futex.h" #include "cancel.h" +#include "wait.h" struct io_cancel { struct file *file; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f9b716c819d1..a50459238bee 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -93,6 +93,7 @@ #include "rw.h" #include "alloc_cache.h" #include "eventfd.h" +#include "wait.h" #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) @@ -166,16 +167,6 @@ static void io_poison_req(struct io_kiocb *req) req->link = IO_URING_PTR_POISON; } -static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) -{ 
- return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); -} - -static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx) -{ - return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head); -} - static inline void req_fail_link_node(struct io_kiocb *req, int res) { req_set_fail(req); @@ -589,7 +580,7 @@ static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) __io_cqring_overflow_flush(ctx, true); } -static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) +void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); __io_cqring_overflow_flush(ctx, false); @@ -1161,13 +1152,6 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) ctx->submit_state.cq_flush = false; } -static unsigned io_cqring_events(struct io_ring_ctx *ctx) -{ - /* See comment at the top of this file */ - smp_rmb(); - return __io_cqring_events(ctx); -} - /* * We can't just wait for polled events to come to us, we have to actively * find and complete them. @@ -2060,308 +2044,6 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) return ret; } -static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, - int wake_flags, void *key) -{ - struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq); - - /* - * Cannot safely flush overflowed CQEs from here, ensure we wake up - * the task, and the next invocation will do it. 
- */ - if (io_should_wake(iowq) || io_has_work(iowq->ctx)) - return autoremove_wake_function(curr, mode, wake_flags, key); - return -1; -} - -int io_run_task_work_sig(struct io_ring_ctx *ctx) -{ - if (io_local_work_pending(ctx)) { - __set_current_state(TASK_RUNNING); - if (io_run_local_work(ctx, INT_MAX, IO_LOCAL_TW_DEFAULT_MAX) > 0) - return 0; - } - if (io_run_task_work() > 0) - return 0; - if (task_sigpending(current)) - return -EINTR; - return 0; -} - -static bool current_pending_io(void) -{ - struct io_uring_task *tctx = current->io_uring; - - if (!tctx) - return false; - return percpu_counter_read_positive(&tctx->inflight); -} - -static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) -{ - struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); - - WRITE_ONCE(iowq->hit_timeout, 1); - iowq->min_timeout = 0; - wake_up_process(iowq->wq.private); - return HRTIMER_NORESTART; -} - -/* - * Doing min_timeout portion. If we saw any timeouts, events, or have work, - * wake up. If not, and we have a normal timeout, switch to that and keep - * sleeping. 
- */ -static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer) -{ - struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); - struct io_ring_ctx *ctx = iowq->ctx; - - /* no general timeout, or shorter (or equal), we are done */ - if (iowq->timeout == KTIME_MAX || - ktime_compare(iowq->min_timeout, iowq->timeout) >= 0) - goto out_wake; - /* work we may need to run, wake function will see if we need to wake */ - if (io_has_work(ctx)) - goto out_wake; - /* got events since we started waiting, min timeout is done */ - if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail)) - goto out_wake; - /* if we have any events and min timeout expired, we're done */ - if (io_cqring_events(ctx)) - goto out_wake; - - /* - * If using deferred task_work running and application is waiting on - * more than one request, ensure we reset it now where we are switching - * to normal sleeps. Any request completion post min_wait should wake - * the task and return. - */ - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - atomic_set(&ctx->cq_wait_nr, 1); - smp_mb(); - if (!llist_empty(&ctx->work_llist)) - goto out_wake; - } - - /* any generated CQE posted past this time should wake us up */ - iowq->cq_tail = iowq->cq_min_tail; - - hrtimer_update_function(&iowq->t, io_cqring_timer_wakeup); - hrtimer_set_expires(timer, iowq->timeout); - return HRTIMER_RESTART; -out_wake: - return io_cqring_timer_wakeup(timer); -} - -static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, - clockid_t clock_id, ktime_t start_time) -{ - ktime_t timeout; - - if (iowq->min_timeout) { - timeout = ktime_add_ns(iowq->min_timeout, start_time); - hrtimer_setup_on_stack(&iowq->t, io_cqring_min_timer_wakeup, clock_id, - HRTIMER_MODE_ABS); - } else { - timeout = iowq->timeout; - hrtimer_setup_on_stack(&iowq->t, io_cqring_timer_wakeup, clock_id, - HRTIMER_MODE_ABS); - } - - hrtimer_set_expires_range_ns(&iowq->t, timeout, 0); - hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS); 
- - if (!READ_ONCE(iowq->hit_timeout)) - schedule(); - - hrtimer_cancel(&iowq->t); - destroy_hrtimer_on_stack(&iowq->t); - __set_current_state(TASK_RUNNING); - - return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0; -} - -struct ext_arg { - size_t argsz; - struct timespec64 ts; - const sigset_t __user *sig; - ktime_t min_time; - bool ts_set; - bool iowait; -}; - -static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq, - struct ext_arg *ext_arg, - ktime_t start_time) -{ - int ret = 0; - - /* - * Mark us as being in io_wait if we have pending requests, so cpufreq - * can take into account that the task is waiting for IO - turns out - * to be important for low QD IO. - */ - if (ext_arg->iowait && current_pending_io()) - current->in_iowait = 1; - if (iowq->timeout != KTIME_MAX || iowq->min_timeout) - ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); - else - schedule(); - current->in_iowait = 0; - return ret; -} - -/* If this returns > 0, the caller should retry */ -static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq, - struct ext_arg *ext_arg, - ktime_t start_time) -{ - if (unlikely(READ_ONCE(ctx->check_cq))) - return 1; - if (unlikely(io_local_work_pending(ctx))) - return 1; - if (unlikely(task_work_pending(current))) - return 1; - if (unlikely(task_sigpending(current))) - return -EINTR; - if (unlikely(io_should_wake(iowq))) - return 0; - - return __io_cqring_wait_schedule(ctx, iowq, ext_arg, start_time); -} - -/* - * Wait until events become available, if we don't already have some. The - * application must reap them itself, as they reside on the shared cq ring. 
- */ -static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, - struct ext_arg *ext_arg) -{ - struct io_wait_queue iowq; - struct io_rings *rings = ctx->rings; - ktime_t start_time; - int ret; - - min_events = min_t(int, min_events, ctx->cq_entries); - - if (!io_allowed_run_tw(ctx)) - return -EEXIST; - if (io_local_work_pending(ctx)) - io_run_local_work(ctx, min_events, - max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); - io_run_task_work(); - - if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))) - io_cqring_do_overflow_flush(ctx); - if (__io_cqring_events_user(ctx) >= min_events) - return 0; - - init_waitqueue_func_entry(&iowq.wq, io_wake_function); - iowq.wq.private = current; - INIT_LIST_HEAD(&iowq.wq.entry); - iowq.ctx = ctx; - iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; - iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail); - iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); - iowq.hit_timeout = 0; - iowq.min_timeout = ext_arg->min_time; - iowq.timeout = KTIME_MAX; - start_time = io_get_time(ctx); - - if (ext_arg->ts_set) { - iowq.timeout = timespec64_to_ktime(ext_arg->ts); - if (!(flags & IORING_ENTER_ABS_TIMER)) - iowq.timeout = ktime_add(iowq.timeout, start_time); - } - - if (ext_arg->sig) { -#ifdef CONFIG_COMPAT - if (in_compat_syscall()) - ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig, - ext_arg->argsz); - else -#endif - ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz); - - if (ret) - return ret; - } - - io_napi_busy_loop(ctx, &iowq); - - trace_io_uring_cqring_wait(ctx, min_events); - do { - unsigned long check_cq; - int nr_wait; - - /* if min timeout has been hit, don't reset wait count */ - if (!iowq.hit_timeout) - nr_wait = (int) iowq.cq_tail - - READ_ONCE(ctx->rings->cq.tail); - else - nr_wait = 1; - - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - atomic_set(&ctx->cq_wait_nr, nr_wait); - set_current_state(TASK_INTERRUPTIBLE); - } else { - 
prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, - TASK_INTERRUPTIBLE); - } - - ret = io_cqring_wait_schedule(ctx, &iowq, ext_arg, start_time); - __set_current_state(TASK_RUNNING); - atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); - - /* - * Run task_work after scheduling and before io_should_wake(). - * If we got woken because of task_work being processed, run it - * now rather than let the caller do another wait loop. - */ - if (io_local_work_pending(ctx)) - io_run_local_work(ctx, nr_wait, nr_wait); - io_run_task_work(); - - /* - * Non-local task_work will be run on exit to userspace, but - * if we're using DEFER_TASKRUN, then we could have waited - * with a timeout for a number of requests. If the timeout - * hits, we could have some requests ready to process. Ensure - * this break is _after_ we have run task_work, to avoid - * deferring running potentially pending requests until the - * next time we wait for events. - */ - if (ret < 0) - break; - - check_cq = READ_ONCE(ctx->check_cq); - if (unlikely(check_cq)) { - /* let the caller flush overflows, retry */ - if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) - io_cqring_do_overflow_flush(ctx); - if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) { - ret = -EBADR; - break; - } - } - - if (io_should_wake(&iowq)) { - ret = 0; - break; - } - cond_resched(); - } while (1); - - if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) - finish_wait(&ctx->cq_wait, &iowq.wq); - restore_saved_sigmask_unless(ret == -EINTR); - - return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; -} - static void io_rings_free(struct io_ring_ctx *ctx) { io_free_region(ctx->user, &ctx->sq_region); diff --git a/io_uring/tw.c b/io_uring/tw.c index f20ffc529040..1ee2b8ab07c8 100644 --- a/io_uring/tw.c +++ b/io_uring/tw.c @@ -13,6 +13,7 @@ #include "poll.h" #include "rw.h" #include "eventfd.h" +#include "wait.h" void io_fallback_req_func(struct work_struct *work) { diff --git a/io_uring/tw.h b/io_uring/tw.h index 8683efca58ef..415e330fabde 100644 --- a/io_uring/tw.h +++ b/io_uring/tw.h @@ -8,14 +8,6 @@ #define IO_LOCAL_TW_DEFAULT_MAX 20 -/* - * No waiters. It's larger than any valid value of the tw counter - * so that tests against ->cq_wait_nr would fail and skip wake_up(). - */ -#define IO_CQ_WAKE_INIT (-1U) -/* Forced wake up if there is a waiter regardless of ->cq_wait_nr */ -#define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) - /* * Terminate the request if either of these conditions are true: * diff --git a/io_uring/wait.c b/io_uring/wait.c new file mode 100644 index 000000000000..0581cadf20ee --- /dev/null +++ b/io_uring/wait.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Waiting for completion events + */ +#include +#include +#include + +#include + +#include + +#include "io_uring.h" +#include "napi.h" +#include "wait.h" + +static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, + int wake_flags, void *key) +{ + struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq); + + /* + * Cannot safely flush overflowed CQEs from here, ensure we wake up + * the task, and the next invocation will do it. 
+ */ + if (io_should_wake(iowq) || io_has_work(iowq->ctx)) + return autoremove_wake_function(curr, mode, wake_flags, key); + return -1; +} + +int io_run_task_work_sig(struct io_ring_ctx *ctx) +{ + if (io_local_work_pending(ctx)) { + __set_current_state(TASK_RUNNING); + if (io_run_local_work(ctx, INT_MAX, IO_LOCAL_TW_DEFAULT_MAX) > 0) + return 0; + } + if (io_run_task_work() > 0) + return 0; + if (task_sigpending(current)) + return -EINTR; + return 0; +} + +static bool current_pending_io(void) +{ + struct io_uring_task *tctx = current->io_uring; + + if (!tctx) + return false; + return percpu_counter_read_positive(&tctx->inflight); +} + +static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) +{ + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); + + WRITE_ONCE(iowq->hit_timeout, 1); + iowq->min_timeout = 0; + wake_up_process(iowq->wq.private); + return HRTIMER_NORESTART; +} + +/* + * Doing min_timeout portion. If we saw any timeouts, events, or have work, + * wake up. If not, and we have a normal timeout, switch to that and keep + * sleeping. 
+ */ +static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer) +{ + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); + struct io_ring_ctx *ctx = iowq->ctx; + + /* no general timeout, or shorter (or equal), we are done */ + if (iowq->timeout == KTIME_MAX || + ktime_compare(iowq->min_timeout, iowq->timeout) >= 0) + goto out_wake; + /* work we may need to run, wake function will see if we need to wake */ + if (io_has_work(ctx)) + goto out_wake; + /* got events since we started waiting, min timeout is done */ + if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail)) + goto out_wake; + /* if we have any events and min timeout expired, we're done */ + if (io_cqring_events(ctx)) + goto out_wake; + + /* + * If using deferred task_work running and application is waiting on + * more than one request, ensure we reset it now where we are switching + * to normal sleeps. Any request completion post min_wait should wake + * the task and return. + */ + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, 1); + smp_mb(); + if (!llist_empty(&ctx->work_llist)) + goto out_wake; + } + + /* any generated CQE posted past this time should wake us up */ + iowq->cq_tail = iowq->cq_min_tail; + + hrtimer_update_function(&iowq->t, io_cqring_timer_wakeup); + hrtimer_set_expires(timer, iowq->timeout); + return HRTIMER_RESTART; +out_wake: + return io_cqring_timer_wakeup(timer); +} + +static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, + clockid_t clock_id, ktime_t start_time) +{ + ktime_t timeout; + + if (iowq->min_timeout) { + timeout = ktime_add_ns(iowq->min_timeout, start_time); + hrtimer_setup_on_stack(&iowq->t, io_cqring_min_timer_wakeup, clock_id, + HRTIMER_MODE_ABS); + } else { + timeout = iowq->timeout; + hrtimer_setup_on_stack(&iowq->t, io_cqring_timer_wakeup, clock_id, + HRTIMER_MODE_ABS); + } + + hrtimer_set_expires_range_ns(&iowq->t, timeout, 0); + hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS); 
+ + if (!READ_ONCE(iowq->hit_timeout)) + schedule(); + + hrtimer_cancel(&iowq->t); + destroy_hrtimer_on_stack(&iowq->t); + __set_current_state(TASK_RUNNING); + + return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0; +} + +static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq, + struct ext_arg *ext_arg, + ktime_t start_time) +{ + int ret = 0; + + /* + * Mark us as being in io_wait if we have pending requests, so cpufreq + * can take into account that the task is waiting for IO - turns out + * to be important for low QD IO. + */ + if (ext_arg->iowait && current_pending_io()) + current->in_iowait = 1; + if (iowq->timeout != KTIME_MAX || iowq->min_timeout) + ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); + else + schedule(); + current->in_iowait = 0; + return ret; +} + +/* If this returns > 0, the caller should retry */ +static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq, + struct ext_arg *ext_arg, + ktime_t start_time) +{ + if (unlikely(READ_ONCE(ctx->check_cq))) + return 1; + if (unlikely(io_local_work_pending(ctx))) + return 1; + if (unlikely(task_work_pending(current))) + return 1; + if (unlikely(task_sigpending(current))) + return -EINTR; + if (unlikely(io_should_wake(iowq))) + return 0; + + return __io_cqring_wait_schedule(ctx, iowq, ext_arg, start_time); +} + +/* + * Wait until events become available, if we don't already have some. The + * application must reap them itself, as they reside on the shared cq ring. 
+ */ +int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, + struct ext_arg *ext_arg) +{ + struct io_wait_queue iowq; + struct io_rings *rings = ctx->rings; + ktime_t start_time; + int ret; + + min_events = min_t(int, min_events, ctx->cq_entries); + + if (!io_allowed_run_tw(ctx)) + return -EEXIST; + if (io_local_work_pending(ctx)) + io_run_local_work(ctx, min_events, + max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); + io_run_task_work(); + + if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))) + io_cqring_do_overflow_flush(ctx); + if (__io_cqring_events_user(ctx) >= min_events) + return 0; + + init_waitqueue_func_entry(&iowq.wq, io_wake_function); + iowq.wq.private = current; + INIT_LIST_HEAD(&iowq.wq.entry); + iowq.ctx = ctx; + iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; + iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail); + iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); + iowq.hit_timeout = 0; + iowq.min_timeout = ext_arg->min_time; + iowq.timeout = KTIME_MAX; + start_time = io_get_time(ctx); + + if (ext_arg->ts_set) { + iowq.timeout = timespec64_to_ktime(ext_arg->ts); + if (!(flags & IORING_ENTER_ABS_TIMER)) + iowq.timeout = ktime_add(iowq.timeout, start_time); + } + + if (ext_arg->sig) { +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig, + ext_arg->argsz); + else +#endif + ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz); + + if (ret) + return ret; + } + + io_napi_busy_loop(ctx, &iowq); + + trace_io_uring_cqring_wait(ctx, min_events); + do { + unsigned long check_cq; + int nr_wait; + + /* if min timeout has been hit, don't reset wait count */ + if (!iowq.hit_timeout) + nr_wait = (int) iowq.cq_tail - + READ_ONCE(ctx->rings->cq.tail); + else + nr_wait = 1; + + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, nr_wait); + set_current_state(TASK_INTERRUPTIBLE); + } else { + prepare_to_wait_exclusive(&ctx->cq_wait, 
&iowq.wq, + TASK_INTERRUPTIBLE); + } + + ret = io_cqring_wait_schedule(ctx, &iowq, ext_arg, start_time); + __set_current_state(TASK_RUNNING); + atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); + + /* + * Run task_work after scheduling and before io_should_wake(). + * If we got woken because of task_work being processed, run it + * now rather than let the caller do another wait loop. + */ + if (io_local_work_pending(ctx)) + io_run_local_work(ctx, nr_wait, nr_wait); + io_run_task_work(); + + /* + * Non-local task_work will be run on exit to userspace, but + * if we're using DEFER_TASKRUN, then we could have waited + * with a timeout for a number of requests. If the timeout + * hits, we could have some requests ready to process. Ensure + * this break is _after_ we have run task_work, to avoid + * deferring running potentially pending requests until the + * next time we wait for events. + */ + if (ret < 0) + break; + + check_cq = READ_ONCE(ctx->check_cq); + if (unlikely(check_cq)) { + /* let the caller flush overflows, retry */ + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) + io_cqring_do_overflow_flush(ctx); + if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) { + ret = -EBADR; + break; + } + } + + if (io_should_wake(&iowq)) { + ret = 0; + break; + } + cond_resched(); + } while (1); + + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + finish_wait(&ctx->cq_wait, &iowq.wq); + restore_saved_sigmask_unless(ret == -EINTR); + + return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; +} diff --git a/io_uring/wait.h b/io_uring/wait.h new file mode 100644 index 000000000000..5e236f74e1af --- /dev/null +++ b/io_uring/wait.h @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_WAIT_H +#define IOU_WAIT_H + +#include + +/* + * No waiters. It's larger than any valid value of the tw counter + * so that tests against ->cq_wait_nr would fail and skip wake_up(). 
+ */ +#define IO_CQ_WAKE_INIT (-1U) +/* Forced wake up if there is a waiter regardless of ->cq_wait_nr */ +#define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) + +struct ext_arg { + size_t argsz; + struct timespec64 ts; + const sigset_t __user *sig; + ktime_t min_time; + bool ts_set; + bool iowait; +}; + +int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, + struct ext_arg *ext_arg); +int io_run_task_work_sig(struct io_ring_ctx *ctx); +void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx); + +static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) +{ + return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); +} + +static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx) +{ + return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head); +} + +/* + * Reads the tail/head of the CQ ring while providing an acquire ordering, + * see comment at top of io_uring.c. + */ +static inline unsigned io_cqring_events(struct io_ring_ctx *ctx) +{ + smp_rmb(); + return __io_cqring_events(ctx); +} + +#endif From 5247c034a67f5a93cc1faa15e9867eec5b22f38a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 20 Jan 2026 20:47:40 +0000 Subject: [PATCH 20/30] io_uring: introduce non-circular SQ Outside of SQPOLL, normally SQ entries are consumed by the time the submission syscall returns. For those cases we don't need a circular buffer and the head/tail tracking, instead the kernel can assume that entries always start from the beginning of the SQ at index 0. This patch introduces a setup flag doing exactly that. It's simpler and helps to keep SQEs hot in cache. The feature is optional and enabled by setting IORING_SETUP_SQ_REWIND. The flag is rejected if passed together with SQPOLL as it'd require waiting for SQ before each submission. It also requires IORING_SETUP_NO_SQARRAY, which can be supported but it's unlikely there will be users, so leave more space for future optimisations. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 12 ++++++++++++ io_uring/io_uring.c | 29 ++++++++++++++++++++++------- io_uring/io_uring.h | 3 ++- 3 files changed, 36 insertions(+), 8 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b5b23c0d5283..475094c7a668 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -237,6 +237,18 @@ enum io_uring_sqe_flags_bit { */ #define IORING_SETUP_SQE_MIXED (1U << 19) +/* + * When set, io_uring ignores SQ head and tail and fetches SQEs to submit + * starting from index 0 instead from the index stored in the head pointer. + * IOW, the user should place all SQE at the beginning of the SQ memory + * before issuing a submission syscall. + * + * It requires IORING_SETUP_NO_SQARRAY and is incompatible with + * IORING_SETUP_SQPOLL. The user must also never change the SQ head and tail + * values and keep it set to 0. Any other value is undefined behaviour. + */ +#define IORING_SETUP_SQ_REWIND (1U << 20) + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a50459238bee..0f88ec74e55d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1945,12 +1945,16 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; - /* - * Ensure any loads from the SQEs are done at this point, - * since once we write the new head, the application could - * write new data to them. - */ - smp_store_release(&rings->sq.head, ctx->cached_sq_head); + if (ctx->flags & IORING_SETUP_SQ_REWIND) { + ctx->cached_sq_head = 0; + } else { + /* + * Ensure any loads from the SQEs are done at this point, + * since once we write the new head, the application could + * write new data to them. 
+ */ + smp_store_release(&rings->sq.head, ctx->cached_sq_head); + } } /* @@ -1996,10 +2000,15 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) __must_hold(&ctx->uring_lock) { - unsigned int entries = io_sqring_entries(ctx); + unsigned int entries; unsigned int left; int ret; + if (ctx->flags & IORING_SETUP_SQ_REWIND) + entries = ctx->sq_entries; + else + entries = io_sqring_entries(ctx); + entries = min(nr, entries); if (unlikely(!entries)) return 0; @@ -2728,6 +2737,12 @@ static int io_uring_sanitise_params(struct io_uring_params *p) if (flags & ~IORING_SETUP_FLAGS) return -EINVAL; + if (flags & IORING_SETUP_SQ_REWIND) { + if ((flags & IORING_SETUP_SQPOLL) || + !(flags & IORING_SETUP_NO_SQARRAY)) + return -EINVAL; + } + /* There is no way to mmap rings without a real fd */ if ((flags & IORING_SETUP_REGISTERED_FD_ONLY) && !(flags & IORING_SETUP_NO_MMAP)) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 29b8f90fdabf..acdc39b9f8d6 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -69,7 +69,8 @@ struct io_ctx_config { IORING_SETUP_NO_SQARRAY |\ IORING_SETUP_HYBRID_IOPOLL |\ IORING_SETUP_CQE_MIXED |\ - IORING_SETUP_SQE_MIXED) + IORING_SETUP_SQE_MIXED |\ + IORING_SETUP_SQ_REWIND) #define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\ IORING_ENTER_SQ_WAKEUP |\ From 82dadc8a494758093e775336390cb31033c6f9a3 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 22 Jan 2026 14:45:04 -0700 Subject: [PATCH 21/30] io_uring/rsrc: take unsigned index in io_rsrc_node_lookup() io_rsrc_node_lookup() takes a signed int index as input and compares it to an unsigned length. Since the signed int is implicitly cast to an unsigned int for the comparison and the length is bounded by IORING_MAX_FIXED_FILES/IORING_MAX_REG_BUFFERS, negative indices are already rejected on architectures where int is at least 32 bits. 
Make this a bit clearer and avoid compiler warnings for comparisons of signed and unsigned values by taking an unsigned int index instead. Signed-off-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- io_uring/rsrc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index d603f6a47f5e..4a5db2ad1af2 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -90,7 +90,7 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, struct io_imu_folio_data *data); static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data, - int index) + unsigned int index) { if (index < data->nr) return data->nodes[array_index_nospec(index, data->nr)]; From 1edf0891d0f4a6c186721e41323c9a3b86ceceda Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 23 Jan 2026 05:09:08 -0700 Subject: [PATCH 22/30] io_uring: fix bad indentation for setup flags if statement smatch complains about this: smatch warnings: io_uring/io_uring.c:2741 io_uring_sanitise_params() warn: if statement not indented hence fix it up. 
Link: https://lore.kernel.org/all/202601231651.HeTmPS8C-lkp@intel.com/ Fixes: 5247c034a67f ("io_uring: introduce non-circular SQ") Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202601231651.HeTmPS8C-lkp@intel.com/ Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0f88ec74e55d..5c503a3f6ecc 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2740,7 +2740,7 @@ static int io_uring_sanitise_params(struct io_uring_params *p) if (flags & IORING_SETUP_SQ_REWIND) { if ((flags & IORING_SETUP_SQPOLL) || !(flags & IORING_SETUP_NO_SQARRAY)) - return -EINVAL; + return -EINVAL; } /* There is no way to mmap rings without a real fd */ From 816095894c0f44aaba4372d92c874121af687596 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 23 Jan 2026 13:58:03 -0700 Subject: [PATCH 23/30] io_uring/io-wq: handle !sysctl_hung_task_timeout_secs If the hung_task_timeout sysctl is set to 0, then we'll end up busy looping inside io_wq_exit_workers() after an earlier commit switched to using wait_for_completion_timeout(). Use the maximum schedule timeout value for that case. Fixes: 1f293098a313 ("io_uring/io-wq: don't trigger hung task for syzbot craziness") Reported-by: Chris Mason Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index aaf1dfc21763..3b55feb620d9 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -1340,6 +1340,8 @@ static void io_wq_exit_workers(struct io_wq *wq) * up waiting more than IO_URING_EXIT_WAIT_MAX. 
*/ timeout = sysctl_hung_task_timeout_secs * HZ / 2; + if (!timeout) + timeout = MAX_SCHEDULE_TIMEOUT; warn_timeout = jiffies + IO_URING_EXIT_WAIT_MAX; do { if (wait_for_completion_timeout(&wq->worker_done, timeout)) From 6e0d71c288fdcf5866f5d0c6cde850a091cc3c55 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 25 Jan 2026 10:07:09 -0700 Subject: [PATCH 24/30] io_uring/futex: use GFP_KERNEL_ACCOUNT for futex data allocation Be a bit nicer and ensure it plays nice with memcg accounting. Signed-off-by: Jens Axboe --- io_uring/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/futex.c b/io_uring/futex.c index 11bfff5a80df..1dabcfd503b8 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -186,7 +186,7 @@ int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; ifd = kzalloc(struct_size_t(struct io_futexv_data, futexv, iof->futex_nr), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!ifd) return -ENOMEM; From e26f51f6f68749fb5ee6b3b8f7a5a91bf2633e39 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 25 Jan 2026 10:07:35 -0700 Subject: [PATCH 25/30] io_uring/rsrc: use GFP_KERNEL_ACCOUNT consistently For potential long term allocations, ensure that we play nicer with memcg and use the accounting variant of the GFP_KERNEL allocation. 
Signed-off-by: Jens Axboe --- io_uring/memmap.c | 2 +- io_uring/rsrc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 18e574776ef6..81b527dc96f7 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -56,7 +56,7 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) if (WARN_ON_ONCE(nr_pages > INT_MAX)) return ERR_PTR(-EOVERFLOW); - pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT); if (!pages) return ERR_PTR(-ENOMEM); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 41c89f5c616d..73b38888a304 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1329,7 +1329,7 @@ void io_vec_free(struct iou_vec *iv) int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries) { - gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_NOWARN; struct iovec *iov; iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp); From 806ae939c41e5da1d94a1e2b31f5702e96b6c3e3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 27 Jan 2026 21:01:41 -0700 Subject: [PATCH 26/30] io_uring/net: don't continue send bundle if poll was required for retry If a send bundle has picked a bunch of buffers, then it needs to send all of those to be complete. This may require poll arming, if the send buffer ends up being full. Once a send bundle has been poll armed, no further bundles should be attempted. This allows a current bundle to complete even though it needs to go through polling to do so, but it will not allow another bundle to be started once that has happened. Ideally we would abort a bundle if it was only partially sent, but as some parts of it already went out on the wire, this obviously isn't feasible. Not continuing more bundle attempts post encountering a full socket buffer is the second best thing. 
Cc: stable@vger.kernel.org Fixes: a05d1f625c7a ("io_uring/net: support bundles for send") Signed-off-by: Jens Axboe --- io_uring/net.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/io_uring/net.c b/io_uring/net.c index 519ea055b761..d9a4b83804a2 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -515,7 +515,11 @@ static inline bool io_send_finish(struct io_kiocb *req, cflags = io_put_kbufs(req, sel->val, sel->buf_list, io_bundle_nbufs(kmsg, sel->val)); - if (bundle_finished || req->flags & REQ_F_BL_EMPTY) + /* + * Don't start new bundles if the buffer list is empty, or if the + * current operation needed to go through polling to complete. + */ + if (bundle_finished || req->flags & (REQ_F_BL_EMPTY | REQ_F_POLLED)) goto finish; /* From 38aa434ab9335ce2d178b7538cdf01d60b2014c3 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Mon, 2 Feb 2026 22:37:53 +0800 Subject: [PATCH 27/30] io_uring/io-wq: add exit-on-idle state io-wq uses an idle timeout to shrink the pool, but keeps the last worker around indefinitely to avoid churn. For tasks that used io_uring for file I/O and then stop using io_uring, this can leave an iou-wrk-* thread behind even after all io_uring instances are gone. This is unnecessary overhead and also gets in the way of process checkpoint/restore. Add an exit-on-idle state that makes all io-wq workers exit as soon as they become idle, and provide io_wq_set_exit_on_idle() to toggle it. 
Signed-off-by: Li Chen Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 27 +++++++++++++++++++++++++-- io_uring/io-wq.h | 1 + 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 3b55feb620d9..f0cec1a6d9c4 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -35,6 +35,7 @@ enum { enum { IO_WQ_BIT_EXIT = 0, /* wq exiting */ + IO_WQ_BIT_EXIT_ON_IDLE = 1, /* allow all workers to exit on idle */ }; enum { @@ -707,9 +708,13 @@ static int io_wq_worker(void *data) raw_spin_lock(&acct->workers_lock); /* * Last sleep timed out. Exit if we're not the last worker, - * or if someone modified our affinity. + * or if someone modified our affinity. If wq is marked + * idle-exit, drop the worker as well. This is used to avoid + * keeping io-wq workers around for tasks that no longer have + * any active io_uring instances. */ - if (last_timeout && (exit_mask || acct->nr_workers > 1)) { + if ((last_timeout && (exit_mask || acct->nr_workers > 1)) || + test_bit(IO_WQ_BIT_EXIT_ON_IDLE, &wq->state)) { acct->nr_workers--; raw_spin_unlock(&acct->workers_lock); __set_current_state(TASK_RUNNING); @@ -967,6 +972,24 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data) return false; } +void io_wq_set_exit_on_idle(struct io_wq *wq, bool enable) +{ + if (!wq->task) + return; + + if (!enable) { + clear_bit(IO_WQ_BIT_EXIT_ON_IDLE, &wq->state); + return; + } + + if (test_and_set_bit(IO_WQ_BIT_EXIT_ON_IDLE, &wq->state)) + return; + + rcu_read_lock(); + io_wq_for_each_worker(wq, io_wq_worker_wake, NULL); + rcu_read_unlock(); +} + static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) { do { diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index 774abab54732..94b14742b703 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -41,6 +41,7 @@ struct io_wq_data { struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); void io_wq_exit_start(struct io_wq *wq); void io_wq_put_and_exit(struct io_wq *wq); 
+void io_wq_set_exit_on_idle(struct io_wq *wq, bool enable); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_hash_work(struct io_wq_work *work, void *val); From 91214661489467f8452d34edbf257488d85176e4 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Mon, 2 Feb 2026 22:37:54 +0800 Subject: [PATCH 28/30] io_uring: allow io-wq workers to exit when unused io_uring keeps a per-task io-wq around, even when the task no longer has any io_uring instances. If the task previously used io_uring for file I/O, this can leave an unrelated iou-wrk-* worker thread behind after the last io_uring instance is gone. When the last io_uring ctx is removed from the task context, mark the io-wq exit-on-idle so workers can go away. Clear the flag on subsequent io_uring usage. Signed-off-by: Li Chen Signed-off-by: Jens Axboe --- io_uring/tctx.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 5b66755579c0..03c278aa5812 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -122,6 +122,14 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) return ret; } } + + /* + * Re-activate io-wq keepalive on any new io_uring usage. The wq may have + * been marked for idle-exit when the task temporarily had no active + * io_uring instances. 
+ */ + if (tctx->io_wq) + io_wq_set_exit_on_idle(tctx->io_wq, false); if (!xa_load(&tctx->xa, (unsigned long)ctx)) { node = kmalloc(sizeof(*node), GFP_KERNEL); if (!node) @@ -183,6 +191,9 @@ __cold void io_uring_del_tctx_node(unsigned long index) if (tctx->last == node->ctx) tctx->last = NULL; kfree(node); + + if (xa_empty(&tctx->xa) && tctx->io_wq) + io_wq_set_exit_on_idle(tctx->io_wq, true); } __cold void io_uring_clean_tctx(struct io_uring_task *tctx) From ccd18ce290726053faff75b6fc3e541301ac99f9 Mon Sep 17 00:00:00 2001 From: Tim Bird Date: Tue, 3 Feb 2026 16:46:24 -0700 Subject: [PATCH 29/30] io_uring: Add SPDX id lines to remaining source files Some io_uring files are missing SPDX-License-Identifier lines. Add lines with GPL-2.0 license IDs to these files. Signed-off-by: Tim Bird Signed-off-by: Jens Axboe --- io_uring/alloc_cache.h | 1 + io_uring/cmd_net.c | 1 + io_uring/eventfd.h | 1 + io_uring/io-wq.h | 1 + io_uring/io_uring.h | 1 + io_uring/memmap.h | 1 + io_uring/mock_file.c | 1 + io_uring/notif.c | 1 + io_uring/refs.h | 1 + io_uring/slist.h | 1 + 10 files changed, 10 insertions(+) diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index bb2f21a7bfd6..45fcd8b3b824 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef IOU_ALLOC_CACHE_H #define IOU_ALLOC_CACHE_H diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c index 19d3ce2bd20a..cb2775936fb8 100644 --- a/io_uring/cmd_net.c +++ b/io_uring/cmd_net.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include #include diff --git a/io_uring/eventfd.h b/io_uring/eventfd.h index e2f1985c2cf9..400eda4a4165 100644 --- a/io_uring/eventfd.h +++ b/io_uring/eventfd.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ struct io_ring_ctx; int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index 94b14742b703..42f00a47a9c9 100644 --- a/io_uring/io-wq.h 
+++ b/io_uring/io-wq.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef INTERNAL_IO_WQ_H #define INTERNAL_IO_WQ_H diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index acdc39b9f8d6..f12e597adc9e 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef IOU_CORE_H #define IOU_CORE_H diff --git a/io_uring/memmap.h b/io_uring/memmap.h index a39d9e518905..f4cfbb6b9a1f 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef IO_URING_MEMMAP_H #define IO_URING_MEMMAP_H diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c index 3ffac8f72974..80c96ad2061f 100644 --- a/io_uring/mock_file.c +++ b/io_uring/mock_file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include #include diff --git a/io_uring/notif.c b/io_uring/notif.c index f476775ba44b..efce8ae12eaa 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include #include diff --git a/io_uring/refs.h b/io_uring/refs.h index 0d928d87c4ed..0fe16b67c308 100644 --- a/io_uring/refs.h +++ b/io_uring/refs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef IOU_REQ_REF_H #define IOU_REQ_REF_H diff --git a/io_uring/slist.h b/io_uring/slist.h index 0aec51076bad..45c21a8ad685 100644 --- a/io_uring/slist.h +++ b/io_uring/slist.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef INTERNAL_IO_SLIST_H #define INTERNAL_IO_SLIST_H From 442ae406603a94f1a263654494f425302ceb0445 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 5 Feb 2026 08:38:20 -0700 Subject: [PATCH 30/30] io_uring/kbuf: fix memory leak if io_buffer_add_list fails io_register_pbuf_ring() ignores the return value of io_buffer_add_list(), which can fail if xa_store() returns an error (e.g., -ENOMEM). 
When this happens, the function returns 0 (success) to the caller, but the io_buffer_list structure is neither added to the xarray nor freed. In practice this requires failure injection to hit, hence not a real issue. But it should get fixed up none the less. Fixes: c7fb19428d67 ("io_uring: add support for ring mapped supplied buffers") Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 796d131107dd..67d4fe576473 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -669,8 +669,9 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) bl->buf_ring = br; if (reg.flags & IOU_PBUF_RING_INC) bl->flags |= IOBL_INC; - io_buffer_add_list(ctx, bl, reg.bgid); - return 0; + ret = io_buffer_add_list(ctx, bl, reg.bgid); + if (!ret) + return 0; fail: io_free_region(ctx->user, &bl->region); kfree(bl);