From cd3c877d04683b44a4d50dcdfad54b356e65d158 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 24 Feb 2026 07:46:37 -0800 Subject: [PATCH 1/9] iomap: don't report direct-io retries to fserror iomap's directio implementation has two magic errno codes that it uses to signal callers -- ENOTBLK tells the filesystem that it should retry a write with the pagecache; and EAGAIN tells the caller that pagecache flushing or invalidation failed and that it should try again. Neither of these indicate data loss, so let's not report them. Fixes: a9d573ee88af98 ("iomap: report file I/O errors to the VFS") Signed-off-by: Darrick J. Wong Link: https://patch.msgid.link/20260224154637.GD2390381@frogsfrogsfrogs Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/iomap/direct-io.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 95254aa1b654..e911daedff65 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -87,6 +87,19 @@ static inline enum fserror_type iomap_dio_err_type(const struct iomap_dio *dio) return FSERR_DIRECTIO_READ; } +static inline bool should_report_dio_fserror(const struct iomap_dio *dio) +{ + switch (dio->error) { + case 0: + case -EAGAIN: + case -ENOTBLK: + /* don't send fsnotify for success or magic retry codes */ + return false; + default: + return true; + } +} + ssize_t iomap_dio_complete(struct iomap_dio *dio) { const struct iomap_dio_ops *dops = dio->dops; @@ -96,7 +109,7 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) if (dops && dops->end_io) ret = dops->end_io(iocb, dio->size, ret, dio->flags); - if (dio->error) + if (should_report_dio_fserror(dio)) fserror_report_io(file_inode(iocb->ki_filp), iomap_dio_err_type(dio), offset, dio->size, dio->error, GFP_NOFS); From 28aaa9c39945b7925a1cc1d513c8f21ed38f5e4f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 10:43:55 +0100 Subject: [PATCH 2/9] kthread: consolidate kthread exit paths to prevent use-after-free Guillaume reported crashes via corrupted RCU callback function pointers during KUnit testing. The crash was traced back to the pidfs rhashtable conversion which replaced the 24-byte rb_node with an 8-byte rhash_head in struct pid, shrinking it from 160 to 144 bytes. struct kthread (without CONFIG_BLK_CGROUP) is also 144 bytes. With CONFIG_SLAB_MERGE_DEFAULT and SLAB_HWCACHE_ALIGN both round up to 192 bytes and share the same slab cache. struct pid.rcu.func and struct kthread.affinity_node both sit at offset 0x78. When a kthread exits via make_task_dead() it bypasses kthread_exit() and misses the affinity_node cleanup. free_kthread_struct() frees the memory while the node is still linked into the global kthread_affinity_list. A subsequent list_del() by another kthread writes through dangling list pointers into the freed and reused memory, corrupting the pid's rcu.func pointer. Instead of patching free_kthread_struct() to handle the missed cleanup, consolidate all kthread exit paths. Turn kthread_exit() into a macro that calls do_exit() and add kthread_do_exit() which is called from do_exit() for any task with PF_KTHREAD set. This guarantees that kthread-specific cleanup always happens regardless of the exit path - make_task_dead(), direct do_exit(), or kthread_exit(). Replace __to_kthread() with a new tsk_is_kthread() accessor in the public header. Export do_exit() since module code using the kthread_exit() macro now needs it directly. Reported-by: Guillaume Tucker Tested-by: Guillaume Tucker Tested-by: Mark Brown Tested-by: David Gow Cc: Link: https://lore.kernel.org/all/20260224-mittlerweile-besessen-2738831ae7f6@brauner Co-developed-by: Linus Torvalds Fixes: 4d13f4304fa4 ("kthread: Implement preferred affinity") Signed-off-by: Linus Torvalds Signed-off-by: Christian Brauner --- include/linux/kthread.h | 21 ++++++++++++++++++++- kernel/exit.c | 6 ++++++ kernel/kthread.c | 41 +++++------------------------------------ 3 files changed, 31 insertions(+), 37 deletions(-) diff --git a/include/linux/kthread.h b/include/linux/kthread.h index c92c1149ee6e..a01a474719a7 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -7,6 +7,24 @@ struct mm_struct; +/* opaque kthread data */ +struct kthread; + +/* + * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will + * always remain a kthread. For kthreads p->worker_private always + * points to a struct kthread. For tasks that are not kthreads + * p->worker_private is used to point to other things. + * + * Return NULL for any task that is not a kthread. + */ +static inline struct kthread *tsk_is_kthread(struct task_struct *p) +{ + if (p->flags & PF_KTHREAD) + return p->worker_private; + return NULL; +} + __printf(4, 5) struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, @@ -98,9 +116,10 @@ void *kthread_probe_data(struct task_struct *k); int kthread_park(struct task_struct *k); void kthread_unpark(struct task_struct *k); void kthread_parkme(void); -void kthread_exit(long result) __noreturn; +#define kthread_exit(result) do_exit(result) void kthread_complete_and_exit(struct completion *, long) __noreturn; int kthreads_update_housekeeping(void); +void kthread_do_exit(struct kthread *, long); int kthreadd(void *unused); extern struct task_struct *kthreadd_task; diff --git a/kernel/exit.c b/kernel/exit.c index 8a87021211ae..ede3117fa7d4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -896,11 +896,16 @@ static void synchronize_group_exit(struct task_struct *tsk, long code) void __noreturn do_exit(long code) { struct task_struct *tsk = current; + struct kthread *kthread; int group_dead; WARN_ON(irqs_disabled()); WARN_ON(tsk->plug); + kthread = tsk_is_kthread(tsk); + if (unlikely(kthread)) + kthread_do_exit(kthread, code); + kcov_task_exit(tsk); kmsan_task_exit(tsk); @@ -1013,6 +1018,7 @@ void __noreturn do_exit(long code) lockdep_free_task(tsk); do_task_dead(); } +EXPORT_SYMBOL(do_exit); void __noreturn make_task_dead(int signr) { diff --git a/kernel/kthread.c b/kernel/kthread.c index 20451b624b67..791210daf8b4 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -85,24 +85,6 @@ static inline struct kthread *to_kthread(struct task_struct *k) return k->worker_private; } -/* - * Variant of to_kthread() that doesn't assume @p is a kthread. - * - * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will - * always remain a kthread. For kthreads p->worker_private always - * points to a struct kthread. For tasks that are not kthreads - * p->worker_private is used to point to other things. - * - * Return NULL for any task that is not a kthread. - */ -static inline struct kthread *__to_kthread(struct task_struct *p) -{ - void *kthread = p->worker_private; - if (kthread && !(p->flags & PF_KTHREAD)) - kthread = NULL; - return kthread; -} - void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) { struct kthread *kthread = to_kthread(tsk); @@ -193,7 +175,7 @@ EXPORT_SYMBOL_GPL(kthread_should_park); bool kthread_should_stop_or_park(void) { - struct kthread *kthread = __to_kthread(current); + struct kthread *kthread = tsk_is_kthread(current); if (!kthread) return false; @@ -234,7 +216,7 @@ EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); */ void *kthread_func(struct task_struct *task) { - struct kthread *kthread = __to_kthread(task); + struct kthread *kthread = tsk_is_kthread(task); if (kthread) return kthread->threadfn; return NULL; @@ -266,7 +248,7 @@ EXPORT_SYMBOL_GPL(kthread_data); */ void *kthread_probe_data(struct task_struct *task) { - struct kthread *kthread = __to_kthread(task); + struct kthread *kthread = tsk_is_kthread(task); void *data = NULL; if (kthread) @@ -309,19 +291,8 @@ void kthread_parkme(void) } EXPORT_SYMBOL_GPL(kthread_parkme); -/** - * kthread_exit - Cause the current kthread return @result to kthread_stop(). - * @result: The integer value to return to kthread_stop(). - * - * While kthread_exit can be called directly, it exists so that - * functions which do some additional work in non-modular code such as - * module_put_and_kthread_exit can be implemented. - * - * Does not return. - */ -void __noreturn kthread_exit(long result) +void kthread_do_exit(struct kthread *kthread, long result) { - struct kthread *kthread = to_kthread(current); kthread->result = result; if (!list_empty(&kthread->affinity_node)) { mutex_lock(&kthread_affinity_lock); @@ -333,9 +304,7 @@ void __noreturn kthread_exit(long result) kthread->preferred_affinity = NULL; } } - do_exit(0); } -EXPORT_SYMBOL(kthread_exit); /** * kthread_complete_and_exit - Exit the current kthread. @@ -683,7 +652,7 @@ void kthread_set_per_cpu(struct task_struct *k, int cpu) bool kthread_is_per_cpu(struct task_struct *p) { - struct kthread *kthread = __to_kthread(p); + struct kthread *kthread = tsk_is_kthread(p); if (!kthread) return false; From a0b4c7a49137ed21279f354eb59f49ddae8dffc2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 26 Feb 2026 13:32:33 +0000 Subject: [PATCH 3/9] netfs: Fix unbuffered/DIO writes to dispatch subrequests in strict sequence Fix netfslib such that when it's making an unbuffered or DIO write, to make sure that it sends each subrequest strictly sequentially, waiting till the previous one is 'committed' before sending the next so that we don't have pieces landing out of order and potentially leaving a hole if an error occurs (ENOSPC for example). This is done by copying in just those bits of issuing, collecting and retrying subrequests that are necessary to do one subrequest at a time. Retrying, in particular, is simpler because if the current subrequest needs retrying, the source iterator can just be copied again and the subrequest prepped and issued again without needing to be concerned about whether it needs merging with the previous or next in the sequence. Note that the issuing loop waits for a subrequest to complete right after issuing it, but this wait could be moved elsewhere allowing preparatory steps to be performed whilst the subrequest is in progress. In particular, once content encryption is available in netfslib, that could be done whilst waiting, as could cleanup of buffers that have been completed. Fixes: 153a9961b551 ("netfs: Implement unbuffered/DIO write support") Signed-off-by: David Howells Link: https://patch.msgid.link/58526.1772112753@warthog.procyon.org.uk Tested-by: Steve French Reviewed-by: Paulo Alcantara (Red Hat) cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/direct_write.c | 228 ++++++++++++++++++++++++++++++++--- fs/netfs/internal.h | 4 +- fs/netfs/write_collect.c | 21 ---- fs/netfs/write_issue.c | 41 +------ include/trace/events/netfs.h | 4 +- 5 files changed, 221 insertions(+), 77 deletions(-) diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index a9d1c3b2c084..dd1451bf7543 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -9,6 +9,202 @@ #include #include "internal.h" +/* + * Perform the cleanup rituals after an unbuffered write is complete. + */ +static void netfs_unbuffered_write_done(struct netfs_io_request *wreq) +{ + struct netfs_inode *ictx = netfs_inode(wreq->inode); + + _enter("R=%x", wreq->debug_id); + + /* Okay, declare that all I/O is complete. */ + trace_netfs_rreq(wreq, netfs_rreq_trace_write_done); + + if (!wreq->error) + netfs_update_i_size(ictx, &ictx->inode, wreq->start, wreq->transferred); + + if (wreq->origin == NETFS_DIO_WRITE && + wreq->mapping->nrpages) { + /* mmap may have got underfoot and we may now have folios + * locally covering the region we just wrote. Attempt to + * discard the folios, but leave in place any modified locally. + * ->write_iter() is prevented from interfering by the DIO + * counter. + */ + pgoff_t first = wreq->start >> PAGE_SHIFT; + pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT; + + invalidate_inode_pages2_range(wreq->mapping, first, last); + } + + if (wreq->origin == NETFS_DIO_WRITE) + inode_dio_end(wreq->inode); + + _debug("finished"); + netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip); + /* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */ + + if (wreq->iocb) { + size_t written = umin(wreq->transferred, wreq->len); + + wreq->iocb->ki_pos += written; + if (wreq->iocb->ki_complete) { + trace_netfs_rreq(wreq, netfs_rreq_trace_ki_complete); + wreq->iocb->ki_complete(wreq->iocb, wreq->error ?: written); + } + wreq->iocb = VFS_PTR_POISON; + } + + netfs_clear_subrequests(wreq); +} + +/* + * Collect the subrequest results of unbuffered write subrequests. + */ +static void netfs_unbuffered_write_collect(struct netfs_io_request *wreq, + struct netfs_io_stream *stream, + struct netfs_io_subrequest *subreq) +{ + trace_netfs_collect_sreq(wreq, subreq); + + spin_lock(&wreq->lock); + list_del_init(&subreq->rreq_link); + spin_unlock(&wreq->lock); + + wreq->transferred += subreq->transferred; + iov_iter_advance(&wreq->buffer.iter, subreq->transferred); + + stream->collected_to = subreq->start + subreq->transferred; + wreq->collected_to = stream->collected_to; + netfs_put_subrequest(subreq, netfs_sreq_trace_put_done); + + trace_netfs_collect_stream(wreq, stream); + trace_netfs_collect_state(wreq, wreq->collected_to, 0); +} + +/* + * Write data to the server without going through the pagecache and without + * writing it to the local cache. We dispatch the subrequests serially and + * wait for each to complete before dispatching the next, lest we leave a gap + * in the data written due to a failure such as ENOSPC. We could, however + * attempt to do preparation such as content encryption for the next subreq + * whilst the current is in progress. + */ +static int netfs_unbuffered_write(struct netfs_io_request *wreq) +{ + struct netfs_io_subrequest *subreq = NULL; + struct netfs_io_stream *stream = &wreq->io_streams[0]; + int ret; + + _enter("%llx", wreq->len); + + if (wreq->origin == NETFS_DIO_WRITE) + inode_dio_begin(wreq->inode); + + stream->collected_to = wreq->start; + + for (;;) { + bool retry = false; + + if (!subreq) { + netfs_prepare_write(wreq, stream, wreq->start + wreq->transferred); + subreq = stream->construct; + stream->construct = NULL; + stream->front = NULL; + } + + /* Check if (re-)preparation failed. */ + if (unlikely(test_bit(NETFS_SREQ_FAILED, &subreq->flags))) { + netfs_write_subrequest_terminated(subreq, subreq->error); + wreq->error = subreq->error; + break; + } + + iov_iter_truncate(&subreq->io_iter, wreq->len - wreq->transferred); + if (!iov_iter_count(&subreq->io_iter)) + break; + + subreq->len = netfs_limit_iter(&subreq->io_iter, 0, + stream->sreq_max_len, + stream->sreq_max_segs); + iov_iter_truncate(&subreq->io_iter, subreq->len); + stream->submit_extendable_to = subreq->len; + + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + stream->issue_write(subreq); + + /* Async, need to wait. */ + netfs_wait_for_in_progress_stream(wreq, stream); + + if (test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { + retry = true; + } else if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) { + ret = subreq->error; + wreq->error = ret; + netfs_see_subrequest(subreq, netfs_sreq_trace_see_failed); + subreq = NULL; + break; + } + ret = 0; + + if (!retry) { + netfs_unbuffered_write_collect(wreq, stream, subreq); + subreq = NULL; + if (wreq->transferred >= wreq->len) + break; + if (!wreq->iocb && signal_pending(current)) { + ret = wreq->transferred ? -EINTR : -ERESTARTSYS; + trace_netfs_rreq(wreq, netfs_rreq_trace_intr); + break; + } + continue; + } + + /* We need to retry the last subrequest, so first reset the + * iterator, taking into account what, if anything, we managed + * to transfer. + */ + subreq->error = -EAGAIN; + trace_netfs_sreq(subreq, netfs_sreq_trace_retry); + if (subreq->transferred > 0) + iov_iter_advance(&wreq->buffer.iter, subreq->transferred); + + if (stream->source == NETFS_UPLOAD_TO_SERVER && + wreq->netfs_ops->retry_request) + wreq->netfs_ops->retry_request(wreq, stream); + + __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + __clear_bit(NETFS_SREQ_BOUNDARY, &subreq->flags); + __clear_bit(NETFS_SREQ_FAILED, &subreq->flags); + subreq->io_iter = wreq->buffer.iter; + subreq->start = wreq->start + wreq->transferred; + subreq->len = wreq->len - wreq->transferred; + subreq->transferred = 0; + subreq->retry_count += 1; + stream->sreq_max_len = UINT_MAX; + stream->sreq_max_segs = INT_MAX; + + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + stream->prepare_write(subreq); + + __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + netfs_stat(&netfs_n_wh_retry_write_subreq); + } + + netfs_unbuffered_write_done(wreq); + _leave(" = %d", ret); + return ret; +} + +static void netfs_unbuffered_write_async(struct work_struct *work) +{ + struct netfs_io_request *wreq = container_of(work, struct netfs_io_request, work); + + netfs_unbuffered_write(wreq); + netfs_put_request(wreq, netfs_rreq_trace_put_complete); +} + /* * Perform an unbuffered write where we may have to do an RMW operation on an * encrypted file. This can also be used for direct I/O writes. @@ -70,35 +266,35 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * */ wreq->buffer.iter = *iter; } + + wreq->len = iov_iter_count(&wreq->buffer.iter); } __set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags); - if (async) - __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags); /* Copy the data into the bounce buffer and encrypt it. */ // TODO /* Dispatch the write. */ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); - if (async) + + if (async) { + INIT_WORK(&wreq->work, netfs_unbuffered_write_async); wreq->iocb = iocb; - wreq->len = iov_iter_count(&wreq->buffer.iter); - ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len); - if (ret < 0) { - _debug("begin = %zd", ret); - goto out; - } - - if (!async) { - ret = netfs_wait_for_write(wreq); - if (ret > 0) - iocb->ki_pos += ret; - } else { + queue_work(system_dfl_wq, &wreq->work); ret = -EIOCBQUEUED; + } else { + ret = netfs_unbuffered_write(wreq); + if (ret < 0) { + _debug("begin = %zd", ret); + } else { + iocb->ki_pos += wreq->transferred; + ret = wreq->transferred ?: wreq->error; + } + + netfs_put_request(wreq, netfs_rreq_trace_put_complete); } -out: netfs_put_request(wreq, netfs_rreq_trace_put_return); return ret; diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 4319611f5354..d436e20d3418 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -198,6 +198,9 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, struct file *file, loff_t start, enum netfs_io_origin origin); +void netfs_prepare_write(struct netfs_io_request *wreq, + struct netfs_io_stream *stream, + loff_t start); void netfs_reissue_write(struct netfs_io_stream *stream, struct netfs_io_subrequest *subreq, struct iov_iter *source); @@ -212,7 +215,6 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c struct folio **writethrough_cache); ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, struct folio *writethrough_cache); -int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len); /* * write_retry.c diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 61eab34ea67e..83eb3dc1adf8 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -399,27 +399,6 @@ bool netfs_write_collection(struct netfs_io_request *wreq) ictx->ops->invalidate_cache(wreq); } - if ((wreq->origin == NETFS_UNBUFFERED_WRITE || - wreq->origin == NETFS_DIO_WRITE) && - !wreq->error) - netfs_update_i_size(ictx, &ictx->inode, wreq->start, wreq->transferred); - - if (wreq->origin == NETFS_DIO_WRITE && - wreq->mapping->nrpages) { - /* mmap may have got underfoot and we may now have folios - * locally covering the region we just wrote. Attempt to - * discard the folios, but leave in place any modified locally. - * ->write_iter() is prevented from interfering by the DIO - * counter. - */ - pgoff_t first = wreq->start >> PAGE_SHIFT; - pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT; - invalidate_inode_pages2_range(wreq->mapping, first, last); - } - - if (wreq->origin == NETFS_DIO_WRITE) - inode_dio_end(wreq->inode); - _debug("finished"); netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip); /* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */ diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 34894da5a23e..437268f65640 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -154,9 +154,9 @@ EXPORT_SYMBOL(netfs_prepare_write_failed); * Prepare a write subrequest. We need to allocate a new subrequest * if we don't have one. */ -static void netfs_prepare_write(struct netfs_io_request *wreq, - struct netfs_io_stream *stream, - loff_t start) +void netfs_prepare_write(struct netfs_io_request *wreq, + struct netfs_io_stream *stream, + loff_t start) { struct netfs_io_subrequest *subreq; struct iov_iter *wreq_iter = &wreq->buffer.iter; @@ -698,41 +698,6 @@ ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_c return ret; } -/* - * Write data to the server without going through the pagecache and without - * writing it to the local cache. - */ -int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len) -{ - struct netfs_io_stream *upload = &wreq->io_streams[0]; - ssize_t part; - loff_t start = wreq->start; - int error = 0; - - _enter("%zx", len); - - if (wreq->origin == NETFS_DIO_WRITE) - inode_dio_begin(wreq->inode); - - while (len) { - // TODO: Prepare content encryption - - _debug("unbuffered %zx", len); - part = netfs_advance_write(wreq, upload, start, len, false); - start += part; - len -= part; - rolling_buffer_advance(&wreq->buffer, part); - if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) - netfs_wait_for_paused_write(wreq); - if (test_bit(NETFS_RREQ_FAILED, &wreq->flags)) - break; - } - - netfs_end_issue_write(wreq); - _leave(" = %d", error); - return error; -} - /* * Write some of a pending folio data back to the server and/or the cache. */ diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 64a382fbc31a..2d366be46a1c 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -57,6 +57,7 @@ EM(netfs_rreq_trace_done, "DONE ") \ EM(netfs_rreq_trace_end_copy_to_cache, "END-C2C") \ EM(netfs_rreq_trace_free, "FREE ") \ + EM(netfs_rreq_trace_intr, "INTR ") \ EM(netfs_rreq_trace_ki_complete, "KI-CMPL") \ EM(netfs_rreq_trace_recollect, "RECLLCT") \ EM(netfs_rreq_trace_redirty, "REDIRTY") \ @@ -169,7 +170,8 @@ EM(netfs_sreq_trace_put_oom, "PUT OOM ") \ EM(netfs_sreq_trace_put_wip, "PUT WIP ") \ EM(netfs_sreq_trace_put_work, "PUT WORK ") \ - E_(netfs_sreq_trace_put_terminated, "PUT TERM ") + EM(netfs_sreq_trace_put_terminated, "PUT TERM ") \ + E_(netfs_sreq_trace_see_failed, "SEE FAILED ") #define netfs_folio_traces \ EM(netfs_folio_is_uptodate, "mod-uptodate") \ From e6b899f08066e744f89df16ceb782e06868bd148 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:50:09 +0100 Subject: [PATCH 4/9] nsfs: tighten permission checks for ns iteration ioctls Even privileged services should not necessarily be able to see other privileged service's namespaces so they can't leak information to each other. Use may_see_all_namespaces() helper that centralizes this policy until the nstree adapts. Link: https://patch.msgid.link/20260226-work-visibility-fixes-v1-1-d2c2853313bd@kernel.org Fixes: a1d220d9dafa ("nsfs: iterate through mount namespaces") Reviewed-by: Jeff Layton Cc: stable@kernel.org # v6.12+ Signed-off-by: Christian Brauner --- fs/nsfs.c | 13 +++++++++++++ include/linux/ns_common.h | 2 ++ kernel/nscommon.c | 6 ++++++ 3 files changed, 21 insertions(+) diff --git a/fs/nsfs.c b/fs/nsfs.c index db91de208645..be36c10c38cf 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -199,6 +199,17 @@ static bool nsfs_ioctl_valid(unsigned int cmd) return false; } +static bool may_use_nsfs_ioctl(unsigned int cmd) +{ + switch (_IOC_NR(cmd)) { + case _IOC_NR(NS_MNT_GET_NEXT): + fallthrough; + case _IOC_NR(NS_MNT_GET_PREV): + return may_see_all_namespaces(); + } + return true; +} + static long ns_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -214,6 +225,8 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, if (!nsfs_ioctl_valid(ioctl)) return -ENOIOCTLCMD; + if (!may_use_nsfs_ioctl(ioctl)) + return -EPERM; ns = get_proc_ns(file_inode(filp)); switch (ioctl) { diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 825f5865bfc5..c8e227a3f9e2 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -55,6 +55,8 @@ static __always_inline bool is_ns_init_id(const struct ns_common *ns) #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) +bool may_see_all_namespaces(void); + static __always_inline __must_check int __ns_ref_active_read(const struct ns_common *ns) { return atomic_read(&ns->__ns_ref_active); diff --git a/kernel/nscommon.c b/kernel/nscommon.c index bdc3c86231d3..3166c1fd844a 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -309,3 +309,9 @@ void __ns_ref_active_get(struct ns_common *ns) return; } } + +bool may_see_all_namespaces(void) +{ + return (task_active_pid_ns(current) == &init_pid_ns) && + ns_capable_noaudit(init_pid_ns.user_ns, CAP_SYS_ADMIN); +} From d2324a9317f00013facb0ba00b00440e19d2af5e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:50:10 +0100 Subject: [PATCH 5/9] nsfs: tighten permission checks for handle opening Even privileged services should not necessarily be able to see other privileged service's namespaces so they can't leak information to each other. Use may_see_all_namespaces() helper that centralizes this policy until the nstree adapts. Link: https://patch.msgid.link/20260226-work-visibility-fixes-v1-2-d2c2853313bd@kernel.org Fixes: 5222470b2fbb ("nsfs: support file handles") Reviewed-by: Jeff Layton Cc: stable@kernel.org # v6.18+ Signed-off-by: Christian Brauner --- fs/nsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nsfs.c b/fs/nsfs.c index be36c10c38cf..c215878d55e8 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -627,7 +627,7 @@ static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, return ERR_PTR(-EOPNOTSUPP); } - if (owning_ns && !ns_capable(owning_ns, CAP_SYS_ADMIN)) { + if (owning_ns && !may_see_all_namespaces()) { ns->ops->put(ns); return ERR_PTR(-EPERM); } From 8d76afe84fa2babf604b3c173730d4d2b067e361 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:50:11 +0100 Subject: [PATCH 6/9] nstree: tighten permission checks for listing Even privileged services should not necessarily be able to see other privileged service's namespaces so they can't leak information to each other. Use may_see_all_namespaces() helper that centralizes this policy until the nstree adapts. Link: https://patch.msgid.link/20260226-work-visibility-fixes-v1-3-d2c2853313bd@kernel.org Fixes: 76b6f5dfb3fd ("nstree: add listns()") Reviewed-by: Jeff Layton Cc: stable@kernel.org # v6.19+ Signed-off-by: Christian Brauner --- kernel/nstree.c | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/kernel/nstree.c b/kernel/nstree.c index f36c59e6951d..6d12e5900ac0 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -515,32 +515,11 @@ static inline bool __must_check ns_requested(const struct klistns *kls, static inline bool __must_check may_list_ns(const struct klistns *kls, struct ns_common *ns) { - if (kls->user_ns) { - if (kls->userns_capable) - return true; - } else { - struct ns_common *owner; - struct user_namespace *user_ns; - - owner = ns_owner(ns); - if (owner) - user_ns = to_user_ns(owner); - else - user_ns = &init_user_ns; - if (ns_capable_noaudit(user_ns, CAP_SYS_ADMIN)) - return true; - } - + if (kls->user_ns && kls->userns_capable) + return true; if (is_current_namespace(ns)) return true; - - if (ns->ns_type != CLONE_NEWUSER) - return false; - - if (ns_capable_noaudit(to_user_ns(ns), CAP_SYS_ADMIN)) - return true; - - return false; + return may_see_all_namespaces(); } static inline void ns_put(struct ns_common *ns) @@ -600,7 +579,7 @@ static ssize_t do_listns_userns(struct klistns *kls) ret = 0; head = &to_ns_common(kls->user_ns)->ns_owner_root.ns_list_head; - kls->userns_capable = ns_capable_noaudit(kls->user_ns, CAP_SYS_ADMIN); + kls->userns_capable = may_see_all_namespaces(); rcu_read_lock(); From 4c7b2ec23cc5d880e3ffe35e8c2aad686b67723a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:50:12 +0100 Subject: [PATCH 7/9] selftests: fix mntns iteration selftests Now that we changed permission checking make sure that we reflect that in the selftests. Link: https://patch.msgid.link/20260226-work-visibility-fixes-v1-4-d2c2853313bd@kernel.org Fixes: 9d87b1067382 ("selftests: add tests for mntns iteration") Reviewed-by: Jeff Layton Cc: stable@kernel.org # v6.14+ Signed-off-by: Christian Brauner --- .../filesystems/nsfs/iterate_mntns.c | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c b/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c index 61e55dfbf121..e19ff8168baf 100644 --- a/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c +++ b/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c @@ -37,17 +37,20 @@ FIXTURE(iterate_mount_namespaces) { __u64 mnt_ns_id[MNT_NS_COUNT]; }; +static inline bool mntns_in_list(__u64 *mnt_ns_id, struct mnt_ns_info *info) +{ + for (int i = 0; i < MNT_NS_COUNT; i++) { + if (mnt_ns_id[i] == info->mnt_ns_id) + return true; + } + return false; +} + FIXTURE_SETUP(iterate_mount_namespaces) { for (int i = 0; i < MNT_NS_COUNT; i++) self->fd_mnt_ns[i] = -EBADF; - /* - * Creating a new user namespace let's us guarantee that we only see - * mount namespaces that we did actually create. - */ - ASSERT_EQ(unshare(CLONE_NEWUSER), 0); - for (int i = 0; i < MNT_NS_COUNT; i++) { struct mnt_ns_info info = {}; @@ -75,13 +78,15 @@ TEST_F(iterate_mount_namespaces, iterate_all_forward) fd_mnt_ns_cur = fcntl(self->fd_mnt_ns[0], F_DUPFD_CLOEXEC); ASSERT_GE(fd_mnt_ns_cur, 0); - for (;; count++) { + for (;;) { struct mnt_ns_info info = {}; int fd_mnt_ns_next; fd_mnt_ns_next = ioctl(fd_mnt_ns_cur, NS_MNT_GET_NEXT, &info); if (fd_mnt_ns_next < 0 && errno == ENOENT) break; + if (mntns_in_list(self->mnt_ns_id, &info)) + count++; ASSERT_GE(fd_mnt_ns_next, 0); ASSERT_EQ(close(fd_mnt_ns_cur), 0); fd_mnt_ns_cur = fd_mnt_ns_next; @@ -96,13 +101,15 @@ TEST_F(iterate_mount_namespaces, iterate_all_backwards) fd_mnt_ns_cur = fcntl(self->fd_mnt_ns[MNT_NS_LAST_INDEX], F_DUPFD_CLOEXEC); ASSERT_GE(fd_mnt_ns_cur, 0); - for (;; count++) { + for (;;) { struct mnt_ns_info info = {}; int fd_mnt_ns_prev; fd_mnt_ns_prev = ioctl(fd_mnt_ns_cur, NS_MNT_GET_PREV, &info); if (fd_mnt_ns_prev < 0 && errno == ENOENT) break; + if (mntns_in_list(self->mnt_ns_id, &info)) + count++; ASSERT_GE(fd_mnt_ns_prev, 0); ASSERT_EQ(close(fd_mnt_ns_cur), 0); fd_mnt_ns_cur = fd_mnt_ns_prev; @@ -125,7 +132,6 @@ TEST_F(iterate_mount_namespaces, iterate_forward) ASSERT_GE(fd_mnt_ns_next, 0); ASSERT_EQ(close(fd_mnt_ns_cur), 0); fd_mnt_ns_cur = fd_mnt_ns_next; - ASSERT_EQ(info.mnt_ns_id, self->mnt_ns_id[i]); } } @@ -144,7 +150,6 @@ TEST_F(iterate_mount_namespaces, iterate_backward) ASSERT_GE(fd_mnt_ns_prev, 0); ASSERT_EQ(close(fd_mnt_ns_cur), 0); fd_mnt_ns_cur = fd_mnt_ns_prev; - ASSERT_EQ(info.mnt_ns_id, self->mnt_ns_id[i]); } } From debc1a492b2695d05973994fb0f796dbd9ceaae6 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 3 Mar 2026 15:34:20 -0800 Subject: [PATCH 8/9] iomap: don't mark folio uptodate if read IO has bytes pending If a folio has ifs metadata attached to it and the folio is partially read in through an async IO helper with the rest of it then being read in through post-EOF zeroing or as inline data, and the helper successfully finishes the read first, then post-EOF zeroing / reading inline will mark the folio as uptodate in iomap_set_range_uptodate(). This is a problem because when the read completion path later calls iomap_read_end(), it will call folio_end_read(), which sets the uptodate bit using XOR semantics. Calling folio_end_read() on a folio that was already marked uptodate clears the uptodate bit. Fix this by not marking the folio as uptodate if the read IO has bytes pending. The folio uptodate state will be set in the read completion path through iomap_end_read() -> folio_end_read(). Reported-by: Wei Gao Suggested-by: Sasha Levin Tested-by: Wei Gao Reviewed-by: Darrick J. Wong Cc: stable@vger.kernel.org # v6.19 Link: https://lore.kernel.org/linux-fsdevel/aYbmy8JdgXwsGaPP@autotest-wegao.qe.prg2.suse.org/ Fixes: b2f35ac4146d ("iomap: add caller-provided callbacks for read and readahead") Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20260303233420.874231-2-joannelkoong@gmail.com Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index bc82083e420a..00f0efaf12b2 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -80,18 +80,27 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off, { struct iomap_folio_state *ifs = folio->private; unsigned long flags; - bool uptodate = true; + bool mark_uptodate = true; if (folio_test_uptodate(folio)) return; if (ifs) { spin_lock_irqsave(&ifs->state_lock, flags); - uptodate = ifs_set_range_uptodate(folio, ifs, off, len); + /* + * If a read with bytes pending is in progress, we must not call + * folio_mark_uptodate(). The read completion path + * (iomap_read_end()) will call folio_end_read(), which uses XOR + * semantics to set the uptodate bit. If we set it here, the XOR + * in folio_end_read() will clear it, leaving the folio not + * uptodate. + */ + mark_uptodate = ifs_set_range_uptodate(folio, ifs, off, len) && + !ifs->read_bytes_pending; spin_unlock_irqrestore(&ifs->state_lock, flags); } - if (uptodate) + if (mark_uptodate) folio_mark_uptodate(folio); } From d320f160aa5ff36cdf83c645cca52b615e866e32 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 2 Mar 2026 09:30:02 -0800 Subject: [PATCH 9/9] iomap: reject delalloc mappings during writeback Filesystems should never provide a delayed allocation mapping to writeback; they're supposed to allocate the space before replying. This can lead to weird IO errors and crashes in the block layer if the filesystem is being malicious, or if it hadn't set iomap->dev because it's a delalloc mapping. Fix this by failing writeback on delalloc mappings. Currently no filesystems actually misbehave in this manner, but we ought to be stricter about things like that. Cc: stable@vger.kernel.org # v5.5 Fixes: 598ecfbaa742ac ("iomap: lift the xfs writeback code to iomap") Signed-off-by: Darrick J. Wong Link: https://patch.msgid.link/20260302173002.GL13829@frogsfrogsfrogs Reviewed-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Signed-off-by: Christian Brauner --- fs/iomap/ioend.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index 4d1ef8a2cee9..60546fa14dfe 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -215,17 +215,18 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio, WARN_ON_ONCE(!folio->private && map_len < dirty_len); switch (wpc->iomap.type) { - case IOMAP_INLINE: - WARN_ON_ONCE(1); - return -EIO; + case IOMAP_UNWRITTEN: + ioend_flags |= IOMAP_IOEND_UNWRITTEN; + break; + case IOMAP_MAPPED: + break; case IOMAP_HOLE: return map_len; default: - break; + WARN_ON_ONCE(1); + return -EIO; } - if (wpc->iomap.type == IOMAP_UNWRITTEN) - ioend_flags |= IOMAP_IOEND_UNWRITTEN; if (wpc->iomap.flags & IOMAP_F_SHARED) ioend_flags |= IOMAP_IOEND_SHARED; if (folio_test_dropbehind(folio))