From 0778ac7df5137d5041783fadfc201f8fd55a1d9b Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Mon, 13 Oct 2025 19:41:51 +0800 Subject: [PATCH 01/17] fs: Fix uninitialized 'offp' in statmount_string() In statmount_string(), most flags assign an output offset pointer (offp) which is later updated with the string offset. However, the STATMOUNT_MNT_UIDMAP and STATMOUNT_MNT_GIDMAP cases directly set the struct fields instead of using offp. This leaves offp uninitialized, leading to a possible uninitialized dereference when *offp is updated. Fix it by assigning offp for UIDMAP and GIDMAP as well, keeping the code path consistent. Fixes: 37c4a9590e1e ("statmount: allow to retrieve idmappings") Fixes: e52e97f09fb6 ("statmount: let unset strings be empty") Cc: stable@vger.kernel.org Signed-off-by: Zhen Ni Link: https://patch.msgid.link/20251013114151.664341-1-zhen.ni@easystack.cn Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index d82910f33dc4..5b5ab2ae238b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5454,11 +5454,11 @@ static int statmount_string(struct kstatmount *s, u64 flag) ret = statmount_sb_source(s, seq); break; case STATMOUNT_MNT_UIDMAP: - sm->mnt_uidmap = start; + offp = &sm->mnt_uidmap; ret = statmount_mnt_uidmap(s, seq); break; case STATMOUNT_MNT_GIDMAP: - sm->mnt_gidmap = start; + offp = &sm->mnt_gidmap; ret = statmount_mnt_gidmap(s, seq); break; default: From 2c2b67af5f5f77fc68261a137ad65dcfb8e52506 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Sat, 11 Oct 2025 09:22:35 +0000 Subject: [PATCH 02/17] hostfs: Fix only passing host root in boot stage with new mount In the old mount proceedure, hostfs could only pass root directory during boot. This is because it constructed the root directory using the @root_ino event without any mount options. However, when using it with the new mount API, this step is no longer triggered. As a result, if users mounts without specifying any mount options, the @host_root_path remains uninitialized. To prevent this issue, the @host_root_path should be initialized at the time of allocation. Reported-by: Geoffrey Thorpe Closes: https://lore.kernel.org/all/643333a0-f434-42fb-82ac-d25a0b56f3b7@geoffthorpe.net/ Fixes: cd140ce9f611 ("hostfs: convert hostfs to use the new mount API") Signed-off-by: Hongbo Li Link: https://patch.msgid.link/20251011092235.29880-1-lihongbo22@huawei.com Signed-off-by: Christian Brauner --- fs/hostfs/hostfs_kern.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 1e1acf5775ab..86455eebbf6c 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -979,7 +979,7 @@ static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct hostfs_fs_info *fsi = fc->s_fs_info; struct fs_parse_result result; - char *host_root; + char *host_root, *tmp_root; int opt; opt = fs_parse(fc, hostfs_param_specs, param, &result); @@ -990,11 +990,13 @@ static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_hostfs: host_root = param->string; if (!*host_root) - host_root = ""; - fsi->host_root_path = - kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root); - if (fsi->host_root_path == NULL) + break; + tmp_root = kasprintf(GFP_KERNEL, "%s%s", + fsi->host_root_path, host_root); + if (!tmp_root) return -ENOMEM; + kfree(fsi->host_root_path); + fsi->host_root_path = tmp_root; break; } @@ -1004,17 +1006,17 @@ static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param) static int hostfs_parse_monolithic(struct fs_context *fc, void *data) { struct hostfs_fs_info *fsi = fc->s_fs_info; - char *host_root = (char *)data; + char *tmp_root, *host_root = (char *)data; /* NULL is printed as '(null)' by printf(): avoid that. */ if (host_root == NULL) - host_root = ""; + return 0; - fsi->host_root_path = - kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root); - if (fsi->host_root_path == NULL) + tmp_root = kasprintf(GFP_KERNEL, "%s%s", fsi->host_root_path, host_root); + if (!tmp_root) return -ENOMEM; - + kfree(fsi->host_root_path); + fsi->host_root_path = tmp_root; return 0; } @@ -1049,6 +1051,11 @@ static int hostfs_init_fs_context(struct fs_context *fc) if (!fsi) return -ENOMEM; + fsi->host_root_path = kasprintf(GFP_KERNEL, "%s/", root_ino); + if (!fsi->host_root_path) { + kfree(fsi); + return -ENOMEM; + } fc->s_fs_info = fsi; fc->ops = &hostfs_context_ops; return 0; From 330e2c514823008b22e6afd2055715bc46dd8d55 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 22 Oct 2025 19:48:32 +0100 Subject: [PATCH 03/17] afs: Fix dynamic lookup to fail on cell lookup failure When a process tries to access an entry in /afs, normally what happens is that an automount dentry is created by ->lookup() and then triggered, which jumps through the ->d_automount() op. Currently, afs_dynroot_lookup() does not do cell DNS lookup, leaving that to afs_d_automount() to perform - however, it is possible to use access() or stat() on the automount point, which will always return successfully, have briefly created an afs_cell record if one did not already exist. This means that something like: test -d "/afs/.west" && echo Directory exists will print "Directory exists" even though no such cell is configured. This breaks the "west" python module available on PIP as it expects this access to fail. Now, it could be possible to make afs_dynroot_lookup() perform the DNS[*] lookup, but that would make "ls --color /afs" do this for each cell in /afs that is listed but not yet probed. kafs-client, probably wrongly, preloads the entire cell database and all the known cells are then listed in /afs - and doing ls /afs would be very, very slow, especially if any cell supplied addresses but was wholly inaccessible. [*] When I say "DNS", actually read getaddrinfo(), which could use any one of a host of mechanisms. Could also use static configuration. To fix this, make the following changes: (1) Create an enum to specify the origination point of a call to afs_lookup_cell() and pass this value into that function in place of the "excl" parameter (which can be derived from it). There are six points of origination: - Cell preload through /proc/net/afs/cells - Root cell config through /proc/net/afs/rootcell - Lookup in dynamic root - Automount trigger - Direct mount with mount() syscall - Alias check where YFS tells us the cell name is different (2) Add an extra state into the afs_cell state machine to indicate a cell that's been initialised, but not yet looked up. This is separate from one that can be considered active and has been looked up at least once. (3) Make afs_lookup_cell() vary its behaviour more, depending on where it was called from: If called from preload or root cell config, DNS lookup will not happen until we definitely want to use the cell (dynroot mount, automount, direct mount or alias check). The cell will appear in /afs but stat() won't trigger DNS lookup. If the cell already exists, dynroot will not wait for the DNS lookup to complete. If the cell did not already exist, dynroot will wait. If called from automount, direct mount or alias check, it will wait for the DNS lookup to complete. (4) Make afs_lookup_cell() return an error if lookup failed in one way or another. We try to return -ENOENT if the DNS says the cell does not exist and -EDESTADDRREQ if we couldn't access the DNS. Reported-by: Markus Suvanto Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220685 Signed-off-by: David Howells Link: https://patch.msgid.link/1784747.1761158912@warthog.procyon.org.uk Fixes: 1d0b929fc070 ("afs: Change dynroot to create contents on demand") Tested-by: Markus Suvanto cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/cell.c | 78 +++++++++++++++++++++++++++++++++++++++-------- fs/afs/dynroot.c | 3 +- fs/afs/internal.h | 12 +++++++- fs/afs/mntpt.c | 3 +- fs/afs/proc.c | 3 +- fs/afs/super.c | 2 +- fs/afs/vl_alias.c | 3 +- 7 files changed, 86 insertions(+), 18 deletions(-) diff --git a/fs/afs/cell.c b/fs/afs/cell.c index f31359922e98..d9b6fa1088b7 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -229,7 +229,7 @@ error: * @name: The name of the cell. * @namesz: The strlen of the cell name. * @vllist: A colon/comma separated list of numeric IP addresses or NULL. - * @excl: T if an error should be given if the cell name already exists. + * @reason: The reason we're doing the lookup * @trace: The reason to be logged if the lookup is successful. * * Look up a cell record by name and query the DNS for VL server addresses if @@ -239,7 +239,8 @@ error: */ struct afs_cell *afs_lookup_cell(struct afs_net *net, const char *name, unsigned int namesz, - const char *vllist, bool excl, + const char *vllist, + enum afs_lookup_cell_for reason, enum afs_cell_trace trace) { struct afs_cell *cell, *candidate, *cursor; @@ -247,12 +248,18 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, enum afs_cell_state state; int ret, n; - _enter("%s,%s", name, vllist); + _enter("%s,%s,%u", name, vllist, reason); - if (!excl) { + if (reason != AFS_LOOKUP_CELL_PRELOAD) { cell = afs_find_cell(net, name, namesz, trace); - if (!IS_ERR(cell)) + if (!IS_ERR(cell)) { + if (reason == AFS_LOOKUP_CELL_DYNROOT) + goto no_wait; + if (cell->state == AFS_CELL_SETTING_UP || + cell->state == AFS_CELL_UNLOOKED) + goto lookup_cell; goto wait_for_cell; + } } /* Assume we're probably going to create a cell and preallocate and @@ -298,26 +305,69 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, rb_insert_color(&cell->net_node, &net->cells); up_write(&net->cells_lock); - afs_queue_cell(cell, afs_cell_trace_queue_new); +lookup_cell: + if (reason != AFS_LOOKUP_CELL_PRELOAD && + reason != AFS_LOOKUP_CELL_ROOTCELL) { + set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags); + afs_queue_cell(cell, afs_cell_trace_queue_new); + } wait_for_cell: - _debug("wait_for_cell"); state = smp_load_acquire(&cell->state); /* vs error */ - if (state != AFS_CELL_ACTIVE && - state != AFS_CELL_DEAD) { + switch (state) { + case AFS_CELL_ACTIVE: + case AFS_CELL_DEAD: + break; + case AFS_CELL_UNLOOKED: + default: + if (reason == AFS_LOOKUP_CELL_PRELOAD || + reason == AFS_LOOKUP_CELL_ROOTCELL) + break; + _debug("wait_for_cell"); afs_see_cell(cell, afs_cell_trace_wait); wait_var_event(&cell->state, ({ state = smp_load_acquire(&cell->state); /* vs error */ state == AFS_CELL_ACTIVE || state == AFS_CELL_DEAD; })); + _debug("waited_for_cell %d %d", cell->state, cell->error); } +no_wait: /* Check the state obtained from the wait check. */ + state = smp_load_acquire(&cell->state); /* vs error */ if (state == AFS_CELL_DEAD) { ret = cell->error; goto error; } + if (state == AFS_CELL_ACTIVE) { + switch (cell->dns_status) { + case DNS_LOOKUP_NOT_DONE: + if (cell->dns_source == DNS_RECORD_FROM_CONFIG) { + ret = 0; + break; + } + fallthrough; + default: + ret = -EIO; + goto error; + case DNS_LOOKUP_GOOD: + case DNS_LOOKUP_GOOD_WITH_BAD: + ret = 0; + break; + case DNS_LOOKUP_GOT_NOT_FOUND: + ret = -ENOENT; + goto error; + case DNS_LOOKUP_BAD: + ret = -EREMOTEIO; + goto error; + case DNS_LOOKUP_GOT_LOCAL_FAILURE: + case DNS_LOOKUP_GOT_TEMP_FAILURE: + case DNS_LOOKUP_GOT_NS_FAILURE: + ret = -EDESTADDRREQ; + goto error; + } + } _leave(" = %p [cell]", cell); return cell; @@ -325,7 +375,7 @@ wait_for_cell: cell_already_exists: _debug("cell exists"); cell = cursor; - if (excl) { + if (reason == AFS_LOOKUP_CELL_PRELOAD) { ret = -EEXIST; } else { afs_use_cell(cursor, trace); @@ -384,7 +434,8 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) return -EINVAL; /* allocate a cell record for the root/workstation cell */ - new_root = afs_lookup_cell(net, rootcell, len, vllist, false, + new_root = afs_lookup_cell(net, rootcell, len, vllist, + AFS_LOOKUP_CELL_ROOTCELL, afs_cell_trace_use_lookup_ws); if (IS_ERR(new_root)) { _leave(" = %ld", PTR_ERR(new_root)); @@ -777,6 +828,7 @@ static bool afs_manage_cell(struct afs_cell *cell) switch (cell->state) { case AFS_CELL_SETTING_UP: goto set_up_cell; + case AFS_CELL_UNLOOKED: case AFS_CELL_ACTIVE: goto cell_is_active; case AFS_CELL_REMOVING: @@ -797,7 +849,7 @@ set_up_cell: goto remove_cell; } - afs_set_cell_state(cell, AFS_CELL_ACTIVE); + afs_set_cell_state(cell, AFS_CELL_UNLOOKED); cell_is_active: if (afs_has_cell_expired(cell, &next_manage)) @@ -807,6 +859,8 @@ cell_is_active: ret = afs_update_cell(cell); if (ret < 0) cell->error = ret; + if (cell->state == AFS_CELL_UNLOOKED) + afs_set_cell_state(cell, AFS_CELL_ACTIVE); } if (next_manage < TIME64_MAX && cell->net->live) { diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 8c6130789fde..dc9d29e3739e 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -108,7 +108,8 @@ static struct dentry *afs_dynroot_lookup_cell(struct inode *dir, struct dentry * dotted = true; } - cell = afs_lookup_cell(net, name, len, NULL, false, + cell = afs_lookup_cell(net, name, len, NULL, + AFS_LOOKUP_CELL_DYNROOT, afs_cell_trace_use_lookup_dynroot); if (IS_ERR(cell)) { ret = PTR_ERR(cell); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a45ae5c2ef8a..b92f96f56767 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -343,6 +343,7 @@ extern const char afs_init_sysname[]; enum afs_cell_state { AFS_CELL_SETTING_UP, + AFS_CELL_UNLOOKED, AFS_CELL_ACTIVE, AFS_CELL_REMOVING, AFS_CELL_DEAD, @@ -1049,9 +1050,18 @@ static inline bool afs_cb_is_broken(unsigned int cb_break, extern int afs_cell_init(struct afs_net *, const char *); extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned, enum afs_cell_trace); +enum afs_lookup_cell_for { + AFS_LOOKUP_CELL_DYNROOT, + AFS_LOOKUP_CELL_MOUNTPOINT, + AFS_LOOKUP_CELL_DIRECT_MOUNT, + AFS_LOOKUP_CELL_PRELOAD, + AFS_LOOKUP_CELL_ROOTCELL, + AFS_LOOKUP_CELL_ALIAS_CHECK, +}; struct afs_cell *afs_lookup_cell(struct afs_net *net, const char *name, unsigned int namesz, - const char *vllist, bool excl, + const char *vllist, + enum afs_lookup_cell_for reason, enum afs_cell_trace trace); extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace); void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 1ad048e6e164..57c204a3c04e 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -107,7 +107,8 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) if (size > AFS_MAXCELLNAME) return -ENAMETOOLONG; - cell = afs_lookup_cell(ctx->net, p, size, NULL, false, + cell = afs_lookup_cell(ctx->net, p, size, NULL, + AFS_LOOKUP_CELL_MOUNTPOINT, afs_cell_trace_use_lookup_mntpt); if (IS_ERR(cell)) { pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt); diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 40e879c8ca77..44520549b509 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -122,7 +122,8 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size) if (strcmp(buf, "add") == 0) { struct afs_cell *cell; - cell = afs_lookup_cell(net, name, strlen(name), args, true, + cell = afs_lookup_cell(net, name, strlen(name), args, + AFS_LOOKUP_CELL_PRELOAD, afs_cell_trace_use_lookup_add); if (IS_ERR(cell)) { ret = PTR_ERR(cell); diff --git a/fs/afs/super.c b/fs/afs/super.c index da407f2d6f0d..d672b7ab57ae 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -290,7 +290,7 @@ static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param) /* lookup the cell record */ if (cellname) { cell = afs_lookup_cell(ctx->net, cellname, cellnamesz, - NULL, false, + NULL, AFS_LOOKUP_CELL_DIRECT_MOUNT, afs_cell_trace_use_lookup_mount); if (IS_ERR(cell)) { pr_err("kAFS: unable to lookup cell '%*.*s'\n", diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c index 709b4cdb723e..fc9676abd252 100644 --- a/fs/afs/vl_alias.c +++ b/fs/afs/vl_alias.c @@ -269,7 +269,8 @@ static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key) if (!name_len || name_len > AFS_MAXCELLNAME) master = ERR_PTR(-EOPNOTSUPP); else - master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false, + master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, + AFS_LOOKUP_CELL_ALIAS_CHECK, afs_cell_trace_use_lookup_canonical); kfree(cell_name); if (IS_ERR(master)) From 34ab4c75588c07cca12884f2bf6b0347c7a13872 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 23 Oct 2025 22:25:49 +0900 Subject: [PATCH 04/17] bfs: Reconstruct file type when loading from disk syzbot is reporting that S_IFMT bits of inode->i_mode can become bogus when the S_IFMT bits of the 32bits "mode" field loaded from disk are corrupted or when the 32bits "attributes" field loaded from disk are corrupted. A documentation says that BFS uses only lower 9 bits of the "mode" field. But I can't find an explicit explanation that the unused upper 23 bits (especially, the S_IFMT bits) are initialized with 0. Therefore, ignore the S_IFMT bits of the "mode" field loaded from disk. Also, verify that the value of the "attributes" field loaded from disk is either BFS_VREG or BFS_VDIR (because BFS supports only regular files and the root directory). Reported-by: syzbot+895c23f6917da440ed0d@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=895c23f6917da440ed0d Signed-off-by: Tetsuo Handa Link: https://patch.msgid.link/fabce673-d5b9-4038-8287-0fd65d80203b@I-love.SAKURA.ne.jp Reviewed-by: Tigran Aivazian Signed-off-by: Christian Brauner --- fs/bfs/inode.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 1d41ce477df5..984b365df046 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -61,7 +61,19 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; di = (struct bfs_inode *)bh->b_data + off; - inode->i_mode = 0x0000FFFF & le32_to_cpu(di->i_mode); + /* + * https://martin.hinner.info/fs/bfs/bfs-structure.html explains that + * BFS in SCO UnixWare environment used only lower 9 bits of di->i_mode + * value. This means that, although bfs_write_inode() saves whole + * inode->i_mode bits (which include S_IFMT bits and S_IS{UID,GID,VTX} + * bits), middle 7 bits of di->i_mode value can be garbage when these + * bits were not saved by bfs_write_inode(). + * Since we can't tell whether middle 7 bits are garbage, use only + * lower 12 bits (i.e. tolerate S_IS{UID,GID,VTX} bits possibly being + * garbage) and reconstruct S_IFMT bits for Linux environment from + * di->i_vtype value. + */ + inode->i_mode = 0x00000FFF & le32_to_cpu(di->i_mode); if (le32_to_cpu(di->i_vtype) == BFS_VDIR) { inode->i_mode |= S_IFDIR; inode->i_op = &bfs_dir_inops; @@ -71,6 +83,11 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) inode->i_op = &bfs_file_inops; inode->i_fop = &bfs_file_operations; inode->i_mapping->a_ops = &bfs_aops; + } else { + brelse(bh); + printf("Unknown vtype=%u %s:%08lx\n", + le32_to_cpu(di->i_vtype), inode->i_sb->s_id, ino); + goto error; } BFS_I(inode)->i_sblock = le32_to_cpu(di->i_sblock); From 9db8d46712d274a27d1d22c38e70211f20d508c2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 24 Oct 2025 15:23:36 +0200 Subject: [PATCH 05/17] mnt: Remove dead code which might prevent from building Clang, in particular, is not happy about dead code: fs/namespace.c:135:37: error: unused function 'node_to_mnt_ns' [-Werror,-Wunused-function] 135 | static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) | ^~~~~~~~~~~~~~ 1 error generated. Remove a leftover from the previous cleanup. Fixes: 7d7d16498958 ("mnt: support ns lookup") Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20251024132336.1666382-1-andriy.shevchenko@linux.intel.com Signed-off-by: Christian Brauner --- fs/namespace.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 5b5ab2ae238b..cc6e00e72437 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -132,16 +132,6 @@ EXPORT_SYMBOL_GPL(fs_kobj); */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); -static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) -{ - struct ns_common *ns; - - if (!node) - return NULL; - ns = rb_entry(node, struct ns_common, ns_tree_node); - return container_of(ns, struct mnt_namespace, ns); -} - static void mnt_ns_release(struct mnt_namespace *ns) { /* keep alive for {list,stat}mount() */ From 63b5aa01da0f38cdbd97d021477258e511631497 Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Tue, 4 Nov 2025 20:50:06 +0800 Subject: [PATCH 06/17] vfat: fix missing sb_min_blocksize() return value checks When emulating an nvme device on qemu with both logical_block_size and physical_block_size set to 8 KiB, but without format, a kernel panic was triggered during the early boot stage while attempting to mount a vfat filesystem. [95553.682035] EXT4-fs (nvme0n1): unable to set blocksize [95553.684326] EXT4-fs (nvme0n1): unable to set blocksize [95553.686501] EXT4-fs (nvme0n1): unable to set blocksize [95553.696448] ISOFS: unsupported/invalid hardware sector size 8192 [95553.697117] ------------[ cut here ]------------ [95553.697567] kernel BUG at fs/buffer.c:1582! [95553.697984] Oops: invalid opcode: 0000 [#1] SMP NOPTI [95553.698602] CPU: 0 UID: 0 PID: 7212 Comm: mount Kdump: loaded Not tainted 6.18.0-rc2+ #38 PREEMPT(voluntary) [95553.699511] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [95553.700534] RIP: 0010:folio_alloc_buffers+0x1bb/0x1c0 [95553.701018] Code: 48 8b 15 e8 93 18 02 65 48 89 35 e0 93 18 02 48 83 c4 10 5b 41 5c 41 5d 41 5e 41 5f 5d 31 d2 31 c9 31 f6 31 ff c3 cc cc cc cc <0f> 0b 90 66 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f [95553.702648] RSP: 0018:ffffd1b0c676f990 EFLAGS: 00010246 [95553.703132] RAX: ffff8cfc4176d820 RBX: 0000000000508c48 RCX: 0000000000000001 [95553.703805] RDX: 0000000000002000 RSI: 0000000000000000 RDI: 0000000000000000 [95553.704481] RBP: ffffd1b0c676f9c8 R08: 0000000000000000 R09: 0000000000000000 [95553.705148] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001 [95553.705816] R13: 0000000000002000 R14: fffff8bc8257e800 R15: 0000000000000000 [95553.706483] FS: 000072ee77315840(0000) GS:ffff8cfdd2c8d000(0000) knlGS:0000000000000000 [95553.707248] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [95553.707782] CR2: 00007d8f2a9e5a20 CR3: 0000000039d0c006 CR4: 0000000000772ef0 [95553.708439] PKRU: 55555554 [95553.708734] Call Trace: [95553.709015] [95553.709266] __getblk_slow+0xd2/0x230 [95553.709641] ? find_get_block_common+0x8b/0x530 [95553.710084] bdev_getblk+0x77/0xa0 [95553.710449] __bread_gfp+0x22/0x140 [95553.710810] fat_fill_super+0x23a/0xfc0 [95553.711216] ? __pfx_setup+0x10/0x10 [95553.711580] ? __pfx_vfat_fill_super+0x10/0x10 [95553.712014] vfat_fill_super+0x15/0x30 [95553.712401] get_tree_bdev_flags+0x141/0x1e0 [95553.712817] get_tree_bdev+0x10/0x20 [95553.713177] vfat_get_tree+0x15/0x20 [95553.713550] vfs_get_tree+0x2a/0x100 [95553.713910] vfs_cmd_create+0x62/0xf0 [95553.714273] __do_sys_fsconfig+0x4e7/0x660 [95553.714669] __x64_sys_fsconfig+0x20/0x40 [95553.715062] x64_sys_call+0x21ee/0x26a0 [95553.715453] do_syscall_64+0x80/0x670 [95553.715816] ? __fs_parse+0x65/0x1e0 [95553.716172] ? fat_parse_param+0x103/0x4b0 [95553.716587] ? vfs_parse_fs_param_source+0x21/0xa0 [95553.717034] ? __do_sys_fsconfig+0x3d9/0x660 [95553.717548] ? __x64_sys_fsconfig+0x20/0x40 [95553.717957] ? x64_sys_call+0x21ee/0x26a0 [95553.718360] ? do_syscall_64+0xb8/0x670 [95553.718734] ? __x64_sys_fsconfig+0x20/0x40 [95553.719141] ? x64_sys_call+0x21ee/0x26a0 [95553.719545] ? do_syscall_64+0xb8/0x670 [95553.719922] ? x64_sys_call+0x1405/0x26a0 [95553.720317] ? do_syscall_64+0xb8/0x670 [95553.720702] ? __x64_sys_close+0x3e/0x90 [95553.721080] ? x64_sys_call+0x1b5e/0x26a0 [95553.721478] ? do_syscall_64+0xb8/0x670 [95553.721841] ? irqentry_exit+0x43/0x50 [95553.722211] ? exc_page_fault+0x90/0x1b0 [95553.722681] entry_SYSCALL_64_after_hwframe+0x76/0x7e [95553.723166] RIP: 0033:0x72ee774f3afe [95553.723562] Code: 73 01 c3 48 8b 0d 0a 33 0f 00 f7 d8 64 89 01 48 83 c8 ff c3 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 49 89 ca b8 af 01 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d da 32 0f 00 f7 d8 64 89 01 48 [95553.725188] RSP: 002b:00007ffe97148978 EFLAGS: 00000246 ORIG_RAX: 00000000000001af [95553.725892] RAX: ffffffffffffffda RBX: 00005dcfe53d0080 RCX: 000072ee774f3afe [95553.726526] RDX: 0000000000000000 RSI: 0000000000000006 RDI: 0000000000000003 [95553.727176] RBP: 00007ffe97148ac0 R08: 0000000000000000 R09: 000072ee775e7ac0 [95553.727818] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 [95553.728459] R13: 00005dcfe53d04b0 R14: 000072ee77670b00 R15: 00005dcfe53d1a28 [95553.729086] The panic occurs as follows: 1. logical_block_size is 8KiB, causing {struct super_block *sb}->s_blocksize is initialized to 0. vfat_fill_super - fat_fill_super - sb_min_blocksize - sb_set_blocksize //return 0 when size is 8KiB. 2. __bread_gfp is called with size == 0, causing folio_alloc_buffers() to compute an offset equal to folio_size(folio), which triggers a BUG_ON. fat_fill_super - sb_bread - __bread_gfp // size == {struct super_block *sb}->s_blocksize == 0 - bdev_getblk - __getblk_slow - grow_buffers - grow_dev_folio - folio_alloc_buffers // size == 0 - folio_set_bh //offset == folio_size(folio) and panic To fix this issue, add proper return value checks for sb_min_blocksize(). Cc: stable@vger.kernel.org # v6.15 Fixes: a64e5a596067bd ("bdev: add back PAGE_SIZE block size validation for sb_set_blocksize()") Reviewed-by: Matthew Wilcox Reviewed-by: Darrick J. Wong Reviewed-by: Jan Kara Reviewed-by: OGAWA Hirofumi Reviewed-by: Christoph Hellwig Signed-off-by: Yongpeng Yang Link: https://patch.msgid.link/20251104125009.2111925-2-yangyongpeng.storage@gmail.com Acked-by: OGAWA Hirofumi Signed-off-by: Christian Brauner --- fs/fat/inode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 9648ed097816..9cfe20a3daaf 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1595,8 +1595,12 @@ int fat_fill_super(struct super_block *sb, struct fs_context *fc, setup(sb); /* flavour-specific stuff that needs options */ + error = -EINVAL; + if (!sb_min_blocksize(sb, 512)) { + fat_msg(sb, KERN_ERR, "unable to set blocksize"); + goto out_fail; + } error = -EIO; - sb_min_blocksize(sb, 512); bh = sb_bread(sb, 0); if (bh == NULL) { fat_msg(sb, KERN_ERR, "unable to read boot sector"); From f2c1f631630e01821fe4c3fdf6077bc7a8284f82 Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Tue, 4 Nov 2025 20:50:07 +0800 Subject: [PATCH 07/17] exfat: check return value of sb_min_blocksize in exfat_read_boot_sector sb_min_blocksize() may return 0. Check its return value to avoid accessing the filesystem super block when sb->s_blocksize is 0. Cc: stable@vger.kernel.org # v6.15 Fixes: 719c1e1829166d ("exfat: add super block operations") Reviewed-by: Christoph Hellwig Signed-off-by: Yongpeng Yang Link: https://patch.msgid.link/20251104125009.2111925-3-yangyongpeng.storage@gmail.com Signed-off-by: Christian Brauner --- fs/exfat/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 7f9592856bf7..74d451f732c7 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -433,7 +433,10 @@ static int exfat_read_boot_sector(struct super_block *sb) struct exfat_sb_info *sbi = EXFAT_SB(sb); /* set block size to read super block */ - sb_min_blocksize(sb, 512); + if (!sb_min_blocksize(sb, 512)) { + exfat_err(sb, "unable to set blocksize"); + return -EINVAL; + } /* read boot sector */ sbi->boot_bh = sb_bread(sb, 0); From e106e269c5cb38315eb0a0e7e38f71e9b20c8c66 Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Tue, 4 Nov 2025 20:50:08 +0800 Subject: [PATCH 08/17] isofs: check the return value of sb_min_blocksize() in isofs_fill_super sb_min_blocksize() may return 0. Check its return value to avoid opt->blocksize and sb->s_blocksize is 0. Cc: stable@vger.kernel.org # v6.15 Fixes: 1b17a46c9243e9 ("isofs: convert isofs to use the new mount API") Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Yongpeng Yang Link: https://patch.msgid.link/20251104125009.2111925-4-yangyongpeng.storage@gmail.com Signed-off-by: Christian Brauner --- fs/isofs/inode.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 6f0e6b19383c..ad3143d4066b 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -610,6 +610,11 @@ static int isofs_fill_super(struct super_block *s, struct fs_context *fc) goto out_freesbi; } opt->blocksize = sb_min_blocksize(s, opt->blocksize); + if (!opt->blocksize) { + printk(KERN_ERR + "ISOFS: unable to set blocksize\n"); + goto out_freesbi; + } sbi->s_high_sierra = 0; /* default is iso9660 */ sbi->s_session = opt->session; From 124af0868ec6929ba838fb76d25f00c06ba8fc0d Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Tue, 4 Nov 2025 20:50:09 +0800 Subject: [PATCH 09/17] xfs: check the return value of sb_min_blocksize() in xfs_fs_fill_super sb_min_blocksize() may return 0. Check its return value to avoid the filesystem super block when sb->s_blocksize is 0. Cc: stable@vger.kernel.org # v6.15 Fixes: a64e5a596067bd ("bdev: add back PAGE_SIZE block size validation for sb_set_blocksize()") Reviewed-by: Christoph Hellwig Signed-off-by: Yongpeng Yang Link: https://patch.msgid.link/20251104125009.2111925-5-yangyongpeng.storage@gmail.com Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/xfs/xfs_super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index e85a156dc17d..fbb8009f1c0f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1662,7 +1662,10 @@ xfs_fs_fill_super( if (error) return error; - sb_min_blocksize(sb, BBSIZE); + if (!sb_min_blocksize(sb, BBSIZE)) { + xfs_err(mp, "unable to set blocksize"); + return -EINVAL; + } sb->s_xattr = xfs_xattr_handlers; sb->s_export_op = &xfs_export_operations; #ifdef CONFIG_XFS_QUOTA From c014021253d77cd89b2d8788ce522283d83fbd40 Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Mon, 27 Oct 2025 03:46:47 -0700 Subject: [PATCH 10/17] virtio-fs: fix incorrect check for fsvq->kobj In virtio_fs_add_queues_sysfs(), the code incorrectly checks fs->mqs_kobj after calling kobject_create_and_add(). Change the check to fsvq->kobj (fs->mqs_kobj -> fsvq->kobj) to ensure the per-queue kobject is successfully created. Fixes: 87cbdc396a31 ("virtio_fs: add sysfs entries for queue information") Signed-off-by: Alok Tiwari Link: https://patch.msgid.link/20251027104658.1668537-1-alok.a.tiwari@oracle.com Reviewed-by: Stefan Hajnoczi Signed-off-by: Christian Brauner --- fs/fuse/virtio_fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 6bc7c97b017d..b2f6486fe1d5 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -373,7 +373,7 @@ static int virtio_fs_add_queues_sysfs(struct virtio_fs *fs) sprintf(buff, "%d", i); fsvq->kobj = kobject_create_and_add(buff, fs->mqs_kobj); - if (!fs->mqs_kobj) { + if (!fsvq->kobj) { ret = -ENOMEM; goto out_del; } From 8637fa89e678422995301ddb20b74190dffcccee Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Tue, 4 Nov 2025 20:50:10 +0800 Subject: [PATCH 11/17] block: add __must_check attribute to sb_min_blocksize() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When sb_min_blocksize() returns 0 and the return value is not checked, it may lead to a situation where sb->s_blocksize is 0 when accessing the filesystem super block. After commit a64e5a596067bd ("bdev: add back PAGE_SIZE block size validation for sb_set_blocksize()"), this becomes more likely to happen when the block device’s logical_block_size is larger than PAGE_SIZE and the filesystem is unformatted. Add the __must_check attribute to ensure callers always check the return value. Cc: stable@vger.kernel.org # v6.15 Suggested-by: Matthew Wilcox Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Yongpeng Yang Link: https://patch.msgid.link/20251104125009.2111925-6-yangyongpeng.storage@gmail.com Signed-off-by: Christian Brauner --- block/bdev.c | 2 +- include/linux/fs.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 810707cca970..638f0cd458ae 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -231,7 +231,7 @@ int sb_set_blocksize(struct super_block *sb, int size) EXPORT_SYMBOL(sb_set_blocksize); -int sb_min_blocksize(struct super_block *sb, int size) +int __must_check sb_min_blocksize(struct super_block *sb, int size) { int minsize = bdev_logical_block_size(sb->s_bdev); if (size < minsize) diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..3ea98c6cce81 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3423,8 +3423,8 @@ static inline void remove_inode_hash(struct inode *inode) extern void inode_sb_list_add(struct inode *inode); extern void inode_add_lru(struct inode *inode); -extern int sb_set_blocksize(struct super_block *, int); -extern int sb_min_blocksize(struct super_block *, int); +int sb_set_blocksize(struct super_block *sb, int size); +int __must_check sb_min_blocksize(struct super_block *sb, int size); int generic_file_mmap(struct file *, struct vm_area_struct *); int generic_file_mmap_prepare(struct vm_area_desc *desc); From 90f601b497d76f40fa66795c3ecf625b6aced9fd Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Wed, 5 Nov 2025 02:29:23 +0000 Subject: [PATCH 12/17] binfmt_misc: restore write access before closing files opened by open_exec() bm_register_write() opens an executable file using open_exec(), which internally calls do_open_execat() and denies write access on the file to avoid modification while it is being executed. However, when an error occurs, bm_register_write() closes the file using filp_close() directly. This does not restore the write permission, which may cause subsequent write operations on the same file to fail. Fix this by calling exe_file_allow_write_access() before filp_close() to restore the write permission properly. Fixes: e7850f4d844e ("binfmt_misc: fix possible deadlock in bm_register_write") Signed-off-by: Zilin Guan Link: https://patch.msgid.link/20251105022923.1813587-1-zilin@seu.edu.cn Signed-off-by: Christian Brauner --- fs/binfmt_misc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index a839f960cd4a..a8b1d79e4af0 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -837,8 +837,10 @@ out: inode_unlock(d_inode(root)); if (err) { - if (f) + if (f) { + exe_file_allow_write_access(f); filp_close(f, NULL); + } kfree(e); return err; } From a3f8f8662771285511ae26c4c8d3ba1cd22159b9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 5 Nov 2025 14:39:45 +0100 Subject: [PATCH 13/17] power: always freeze efivarfs The efivarfs filesystems must always be frozen and thawed to resync variable state. Make it so. Link: https://patch.msgid.link/20251105-vorbild-zutreffen-fe00d1dd98db@brauner Signed-off-by: Christian Brauner --- fs/efivarfs/super.c | 1 + fs/super.c | 13 ++++++++++--- include/linux/fs.h | 3 ++- kernel/power/hibernate.c | 9 +++------ kernel/power/suspend.c | 3 +-- 5 files changed, 17 insertions(+), 12 deletions(-) diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 1f4d8ce56667..6de97565d5f7 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -533,6 +533,7 @@ static struct file_system_type efivarfs_type = { .init_fs_context = efivarfs_init_fs_context, .kill_sb = efivarfs_kill_sb, .parameters = efivarfs_parameters, + .fs_flags = FS_POWER_FREEZE, }; static __init int efivarfs_init(void) diff --git a/fs/super.c b/fs/super.c index 5bab94fb7e03..277b84e5c279 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1183,11 +1183,14 @@ static inline bool get_active_super(struct super_block *sb) static const char *filesystems_freeze_ptr = "filesystems_freeze"; -static void filesystems_freeze_callback(struct super_block *sb, void *unused) +static void filesystems_freeze_callback(struct super_block *sb, void *freeze_all_ptr) { if (!sb->s_op->freeze_fs && !sb->s_op->freeze_super) return; + if (freeze_all_ptr && !(sb->s_type->fs_flags & FS_POWER_FREEZE)) + return; + if (!get_active_super(sb)) return; @@ -1201,9 +1204,13 @@ static void filesystems_freeze_callback(struct super_block *sb, void *unused) deactivate_super(sb); } -void filesystems_freeze(void) +void filesystems_freeze(bool freeze_all) { - __iterate_supers(filesystems_freeze_callback, NULL, + void *freeze_all_ptr = NULL; + + if (freeze_all) + freeze_all_ptr = &freeze_all; + __iterate_supers(filesystems_freeze_callback, freeze_all_ptr, SUPER_ITER_UNLOCKED | SUPER_ITER_REVERSE); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 3ea98c6cce81..249a1da8440e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2689,6 +2689,7 @@ struct file_system_type { #define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */ #define FS_MGTIME 64 /* FS uses multigrain timestamps */ #define FS_LBS 128 /* FS supports LBS */ +#define FS_POWER_FREEZE 256 /* Always freeze on suspend/hibernate */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; @@ -3606,7 +3607,7 @@ extern void drop_super_exclusive(struct super_block *sb); extern void iterate_supers(void (*f)(struct super_block *, void *), void *arg); extern void iterate_supers_type(struct file_system_type *, void (*)(struct super_block *, void *), void *); -void filesystems_freeze(void); +void filesystems_freeze(bool freeze_all); void filesystems_thaw(void); extern int dcache_dir_open(struct inode *, struct file *); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 14e85ff23551..1f250ce036a0 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -825,8 +825,7 @@ int hibernate(void) goto Restore; ksys_sync_helper(); - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); error = freeze_processes(); if (error) @@ -932,8 +931,7 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data) if (error) goto restore; - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); error = freeze_processes(); if (error) @@ -1083,8 +1081,7 @@ static int software_resume(void) if (error) goto Restore; - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); pm_pr_dbg("Preparing processes for hibernation restore.\n"); error = freeze_processes(); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4bb4686c1c08..c933a63a9718 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -375,8 +375,7 @@ static int suspend_prepare(suspend_state_t state) if (error) goto Restore; - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); trace_suspend_resume(TPS("freeze_processes"), 0, false); From 78f0e33cd6c939a555aa80dbed2fec6b333a7660 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 11 Nov 2025 06:28:15 +0000 Subject: [PATCH 14/17] fs/namespace: correctly handle errors returned by grab_requested_mnt_ns grab_requested_mnt_ns was changed to return error codes on failure, but its callers were not updated to check for error pointers, still checking only for a NULL return value. This commit updates the callers to use IS_ERR() or IS_ERR_OR_NULL() and PTR_ERR() to correctly check for and propagate errors. This also makes sure that the logic actually works and mount namespace file descriptors can be used to refere to mounts. Christian Brauner says: Rework the patch to be more ergonomic and in line with our overall error handling patterns. Fixes: 7b9d14af8777 ("fs: allow mount namespace fd") Cc: Christian Brauner Signed-off-by: Andrei Vagin Link: https://patch.msgid.link/20251111062815.2546189-1-avagin@google.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 32 ++++++++++++++++---------------- include/uapi/linux/mount.h | 2 +- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index cc6e00e72437..2bad25709b2c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -141,7 +141,8 @@ static void mnt_ns_release(struct mnt_namespace *ns) kfree(ns); } } -DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T)) +DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, + if (!IS_ERR(_T)) mnt_ns_release(_T)) static void mnt_ns_release_rcu(struct rcu_head *rcu) { @@ -5726,7 +5727,7 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req, ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); if (ret) return ret; - if (kreq->spare != 0) + if (kreq->mnt_ns_fd != 0 && kreq->mnt_ns_id) return -EINVAL; /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET) @@ -5743,16 +5744,12 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq { struct mnt_namespace *mnt_ns; - if (kreq->mnt_ns_id && kreq->spare) - return ERR_PTR(-EINVAL); - - if (kreq->mnt_ns_id) - return lookup_mnt_ns(kreq->mnt_ns_id); - - if (kreq->spare) { + if (kreq->mnt_ns_id) { + mnt_ns = lookup_mnt_ns(kreq->mnt_ns_id); + } else if (kreq->mnt_ns_fd) { struct ns_common *ns; - CLASS(fd, f)(kreq->spare); + CLASS(fd, f)(kreq->mnt_ns_fd); if (fd_empty(f)) return ERR_PTR(-EBADF); @@ -5767,6 +5764,8 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq } else { mnt_ns = current->nsproxy->mnt_ns; } + if (!mnt_ns) + return ERR_PTR(-ENOENT); refcount_inc(&mnt_ns->passive); return mnt_ns; @@ -5791,8 +5790,8 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, return ret; ns = grab_requested_mnt_ns(&kreq); - if (!ns) - return -ENOENT; + if (IS_ERR(ns)) + return PTR_ERR(ns); if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) @@ -5902,8 +5901,8 @@ static void __free_klistmount_free(const struct klistmount *kls) static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq, size_t nr_mnt_ids) { - u64 last_mnt_id = kreq->param; + struct mnt_namespace *ns; /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET) @@ -5917,9 +5916,10 @@ static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req * if (!kls->kmnt_ids) return -ENOMEM; - kls->ns = grab_requested_mnt_ns(kreq); - if (!kls->ns) - return -ENOENT; + ns = grab_requested_mnt_ns(kreq); + if (IS_ERR(ns)) + return PTR_ERR(ns); + kls->ns = ns; kls->mnt_parent_id = kreq->mnt_id; return 0; diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 7fa67c2031a5..5d3f8c9e3a62 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -197,7 +197,7 @@ struct statmount { */ struct mnt_id_req { __u32 size; - __u32 spare; + __u32 mnt_ns_fd; __u64 mnt_id; __u64 param; __u64 mnt_ns_id; From 3cd1548a278c7d6a9bdef1f1866e7cf66bfd3518 Mon Sep 17 00:00:00 2001 From: Mike Yuan Date: Sat, 8 Nov 2025 19:09:47 +0000 Subject: [PATCH 15/17] shmem: fix tmpfs reconfiguration (remount) when noswap is set In systemd we're trying to switch the internal credentials setup logic to new mount API [1], and I noticed fsconfig(FSCONFIG_CMD_RECONFIGURE) consistently fails on tmpfs with noswap option. This can be trivially reproduced with the following: ``` int fs_fd = fsopen("tmpfs", 0); fsconfig(fs_fd, FSCONFIG_SET_FLAG, "noswap", NULL, 0); fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); fsmount(fs_fd, 0, 0); fsconfig(fs_fd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0); <------ EINVAL ``` After some digging the culprit is shmem_reconfigure() rejecting !(ctx->seen & SHMEM_SEEN_NOSWAP) && sbinfo->noswap, which is bogus as ctx->seen serves as a mask for whether certain options are touched at all. On top of that, noswap option doesn't use fsparam_flag_no, hence it's not really possible to "reenable" swap to begin with. Drop the check and redundant SHMEM_SEEN_NOSWAP flag. [1] https://github.com/systemd/systemd/pull/39637 Fixes: 2c6efe9cf2d7 ("shmem: add support to ignore swap") Signed-off-by: Mike Yuan Link: https://patch.msgid.link/20251108190930.440685-1-me@yhndnzj.com Cc: Luis Chamberlain Cc: Christian Brauner Cc: Hugh Dickins Cc: stable@vger.kernel.org Signed-off-by: Christian Brauner --- mm/shmem.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index b9081b817d28..1b976414d6fa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -131,8 +131,7 @@ struct shmem_options { #define SHMEM_SEEN_INODES 2 #define SHMEM_SEEN_HUGE 4 #define SHMEM_SEEN_INUMS 8 -#define SHMEM_SEEN_NOSWAP 16 -#define SHMEM_SEEN_QUOTA 32 +#define SHMEM_SEEN_QUOTA 16 }; #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -4677,7 +4676,6 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) "Turning off swap in unprivileged tmpfs mounts unsupported"); } ctx->noswap = true; - ctx->seen |= SHMEM_SEEN_NOSWAP; break; case Opt_quota: if (fc->user_ns != &init_user_ns) @@ -4827,14 +4825,15 @@ static int shmem_reconfigure(struct fs_context *fc) err = "Current inum too high to switch to 32-bit inums"; goto out; } - if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) { + + /* + * "noswap" doesn't use fsparam_flag_no, i.e. there's no "swap" + * counterpart for (re-)enabling swap. + */ + if (ctx->noswap && !sbinfo->noswap) { err = "Cannot disable swap on remount"; goto out; } - if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) { - err = "Cannot enable swap on remount if it was disabled on first mount"; - goto out; - } if (ctx->seen & SHMEM_SEEN_QUOTA && !sb_any_quota_loaded(fc->root->d_sb)) { From 12741624645e098b2234a5ae341045a97473caf1 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 5 Nov 2025 22:20:24 +0100 Subject: [PATCH 16/17] fs: add iput_not_last() Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251105212025.807549-1-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/inode.c | 12 ++++++++++++ include/linux/fs.h | 1 + 2 files changed, 13 insertions(+) diff --git a/fs/inode.c b/fs/inode.c index ec9339024ac3..cff1d3af0d57 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1967,6 +1967,18 @@ retry: } EXPORT_SYMBOL(iput); +/** + * iput_not_last - put an inode assuming this is not the last reference + * @inode: inode to put + */ +void iput_not_last(struct inode *inode) +{ + VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 2, inode); + + WARN_ON(atomic_sub_return(1, &inode->i_count) == 0); +} +EXPORT_SYMBOL(iput_not_last); + #ifdef CONFIG_BLOCK /** * bmap - find a block number in a file diff --git a/include/linux/fs.h b/include/linux/fs.h index 249a1da8440e..dd3b57cfadee 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2824,6 +2824,7 @@ extern int current_umask(void); extern void ihold(struct inode * inode); extern void iput(struct inode *); +void iput_not_last(struct inode *); int inode_update_timestamps(struct inode *inode, int flags); int generic_update_time(struct inode *, int); From 56325e8c68c0724d626f665773a5005dcf44e329 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 5 Nov 2025 22:20:25 +0100 Subject: [PATCH 17/17] landlock: fix splats from iput() after it started calling might_sleep() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At this point it is guaranteed this is not the last reference. However, a recent addition of might_sleep() at top of iput() started generating false-positives as it was executing for all values. Remedy the problem by using the newly introduced iput_not_last(). Reported-by: syzbot+12479ae15958fc3f54ec@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/68d32659.a70a0220.4f78.0012.GAE@google.com/ Fixes: 2ef435a872ab ("fs: add might_sleep() annotation to iput() and more") Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251105212025.807549-2-mjguzik@gmail.com Reviewed-by: Mickaël Salaün Signed-off-by: Christian Brauner --- security/landlock/fs.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/security/landlock/fs.c b/security/landlock/fs.c index 0bade2c5aa1d..d9c12b993fa7 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -1335,11 +1335,10 @@ static void hook_sb_delete(struct super_block *const sb) * At this point, we own the ihold() reference that was * originally set up by get_inode_object() and the * __iget() reference that we just set in this loop - * walk. Therefore the following call to iput() will - * not sleep nor drop the inode because there is now at - * least two references to it. + * walk. Therefore there are at least two references + * on the inode. */ - iput(inode); + iput_not_last(inode); } else { spin_unlock(&object->lock); rcu_read_unlock();