diff --git a/Documentation/filesystems/nfs/exporting.rst b/Documentation/filesystems/nfs/exporting.rst index de64d2d002a2..a01d9b9b5bc3 100644 --- a/Documentation/filesystems/nfs/exporting.rst +++ b/Documentation/filesystems/nfs/exporting.rst @@ -119,43 +119,11 @@ For a filesystem to be exportable it must: A file system implementation declares that instances of the filesystem are exportable by setting the s_export_op field in the struct -super_block. This field must point to a "struct export_operations" -struct which has the following members: +super_block. This field must point to a struct export_operations +which has the following members: - encode_fh (mandatory) - Takes a dentry and creates a filehandle fragment which may later be used - to find or create a dentry for the same object. - - fh_to_dentry (mandatory) - Given a filehandle fragment, this should find the implied object and - create a dentry for it (possibly with d_obtain_alias). - - fh_to_parent (optional but strongly recommended) - Given a filehandle fragment, this should find the parent of the - implied object and create a dentry for it (possibly with - d_obtain_alias). May fail if the filehandle fragment is too small. - - get_parent (optional but strongly recommended) - When given a dentry for a directory, this should return a dentry for - the parent. Quite possibly the parent dentry will have been allocated - by d_alloc_anon. The default get_parent function just returns an error - so any filehandle lookup that requires finding a parent will fail. - ->lookup("..") is *not* used as a default as it can leave ".." entries - in the dcache which are too messy to work with. - - get_name (optional) - When given a parent dentry and a child dentry, this should find a name - in the directory identified by the parent dentry, which leads to the - object identified by the child dentry. 
If no get_name function is - supplied, a default implementation is provided which uses vfs_readdir - to find potential names, and matches inode numbers to find the correct - match. - - flags - Some filesystems may need to be handled differently than others. The - export_operations struct also includes a flags field that allows the - filesystem to communicate such information to nfsd. See the Export - Operations Flags section below for more explanation. +.. kernel-doc:: include/linux/exportfs.h + :identifiers: struct export_operations A filehandle fragment consists of an array of 1 or more 4byte words, together with a one byte "type". diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 633da5e37299..ae7e7cf7523a 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -167,17 +167,11 @@ int v9fs_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, if (retval) goto err_out; - size = posix_acl_xattr_size(acl->a_count); - - value = kzalloc(size, GFP_NOFS); + value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_NOFS); if (!value) { retval = -ENOMEM; goto err_out; } - - retval = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (retval < 0) - goto err_out; } /* @@ -257,13 +251,10 @@ static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl) return 0; /* Set a setxattr request to server */ - size = posix_acl_xattr_size(acl->a_count); - buffer = kmalloc(size, GFP_KERNEL); + buffer = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_KERNEL); if (!buffer) return -ENOMEM; - retval = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - if (retval < 0) - goto err_free_out; + switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; @@ -275,7 +266,6 @@ static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl) BUG(); } retval = v9fs_fid_xattr_set(fid, name, buffer, size, 0); -err_free_out: kfree(buffer); return retval; } diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index c336e2ab7f8a..e55b686fe1ab 100644 --- 
a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -57,7 +57,8 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu) int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct posix_acl *acl, int type) { - int ret, size = 0; + int ret; + size_t size = 0; const char *name; char AUTO_KFREE(value); @@ -77,20 +78,15 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, if (acl) { unsigned int nofs_flag; - size = posix_acl_xattr_size(acl->a_count); /* * We're holding a transaction handle, so use a NOFS memory * allocation context to avoid deadlock if reclaim happens. */ nofs_flag = memalloc_nofs_save(); - value = kmalloc(size, GFP_KERNEL); + value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_KERNEL); memalloc_nofs_restore(nofs_flag); if (!value) return -ENOMEM; - - ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (ret < 0) - return ret; } if (trans) diff --git a/fs/buffer.c b/fs/buffer.c index 838c0c571022..fd53b806ab7e 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2354,7 +2354,7 @@ bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count) if (!head) return false; blocksize = head->b_size; - to = min_t(unsigned, folio_size(folio) - from, count); + to = min(folio_size(folio) - from, count); to = from + to; if (from < blocksize && to > folio_size(folio) - blocksize) return false; @@ -2948,6 +2948,10 @@ bool try_to_free_buffers(struct folio *folio) if (folio_test_writeback(folio)) return false; + /* Misconfigured folio check */ + if (WARN_ON_ONCE(!folio_buffers(folio))) + return true; + if (mapping == NULL) { /* can this still happen? 
*/ ret = drop_buffers(folio, &buffers_to_free); goto out; diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 1564eacc253d..85d3dd48b167 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -90,7 +90,8 @@ retry: int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { - int ret = 0, size = 0; + int ret = 0; + size_t size = 0; const char *name = NULL; char *value = NULL; struct iattr newattrs; @@ -126,16 +127,11 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, } if (acl) { - size = posix_acl_xattr_size(acl->a_count); - value = kmalloc(size, GFP_NOFS); + value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_NOFS); if (!value) { ret = -ENOMEM; goto out; } - - ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (ret < 0) - goto out_free; } if (new_mode != old_mode) { @@ -172,7 +168,7 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, struct posix_acl *acl, *default_acl; size_t val_size1 = 0, val_size2 = 0; struct ceph_pagelist *pagelist = NULL; - void *tmp_buf = NULL; + void *tmp_buf1 = NULL, *tmp_buf2 = NULL; int err; err = posix_acl_create(dir, mode, &default_acl, &acl); @@ -192,15 +188,7 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, if (!default_acl && !acl) return 0; - if (acl) - val_size1 = posix_acl_xattr_size(acl->a_count); - if (default_acl) - val_size2 = posix_acl_xattr_size(default_acl->a_count); - err = -ENOMEM; - tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL); - if (!tmp_buf) - goto out_err; pagelist = ceph_pagelist_alloc(GFP_KERNEL); if (!pagelist) goto out_err; @@ -213,34 +201,39 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, if (acl) { size_t len = strlen(XATTR_NAME_POSIX_ACL_ACCESS); + + err = -ENOMEM; + tmp_buf1 = posix_acl_to_xattr(&init_user_ns, acl, + &val_size1, GFP_KERNEL); + if (!tmp_buf1) + goto out_err; err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8); if (err) goto out_err; ceph_pagelist_encode_string(pagelist, 
XATTR_NAME_POSIX_ACL_ACCESS, len); - err = posix_acl_to_xattr(&init_user_ns, acl, - tmp_buf, val_size1); - if (err < 0) - goto out_err; ceph_pagelist_encode_32(pagelist, val_size1); - ceph_pagelist_append(pagelist, tmp_buf, val_size1); + ceph_pagelist_append(pagelist, tmp_buf1, val_size1); } if (default_acl) { size_t len = strlen(XATTR_NAME_POSIX_ACL_DEFAULT); + + err = -ENOMEM; + tmp_buf2 = posix_acl_to_xattr(&init_user_ns, default_acl, + &val_size2, GFP_KERNEL); + if (!tmp_buf2) + goto out_err; err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8); if (err) goto out_err; ceph_pagelist_encode_string(pagelist, XATTR_NAME_POSIX_ACL_DEFAULT, len); - err = posix_acl_to_xattr(&init_user_ns, default_acl, - tmp_buf, val_size2); - if (err < 0) - goto out_err; ceph_pagelist_encode_32(pagelist, val_size2); - ceph_pagelist_append(pagelist, tmp_buf, val_size2); + ceph_pagelist_append(pagelist, tmp_buf2, val_size2); } - kfree(tmp_buf); + kfree(tmp_buf1); + kfree(tmp_buf2); as_ctx->acl = acl; as_ctx->default_acl = default_acl; @@ -250,7 +243,8 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, out_err: posix_acl_release(acl); posix_acl_release(default_acl); - kfree(tmp_buf); + kfree(tmp_buf1); + kfree(tmp_buf2); if (pagelist) ceph_pagelist_release(pagelist); return err; diff --git a/fs/char_dev.c b/fs/char_dev.c index c2ddb998f3c9..bf7b32650e54 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -97,7 +98,8 @@ static struct char_device_struct * __register_chrdev_region(unsigned int major, unsigned int baseminor, int minorct, const char *name) { - struct char_device_struct *cd, *curr, *prev = NULL; + struct char_device_struct *cd __free(kfree) = NULL; + struct char_device_struct *curr, *prev = NULL; int ret; int i; @@ -117,14 +119,14 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, if (cd == NULL) return ERR_PTR(-ENOMEM); - mutex_lock(&chrdevs_lock); + 
guard(mutex)(&chrdevs_lock); if (major == 0) { ret = find_dynamic_major(); if (ret < 0) { pr_err("CHRDEV \"%s\" dynamic allocation region is full\n", name); - goto out; + return ERR_PTR(ret); } major = ret; } @@ -144,7 +146,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, if (curr->baseminor >= baseminor + minorct) break; - goto out; + return ERR_PTR(ret); } cd->major = major; @@ -160,12 +162,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, prev->next = cd; } - mutex_unlock(&chrdevs_lock); - return cd; -out: - mutex_unlock(&chrdevs_lock); - kfree(cd); - return ERR_PTR(ret); + return_ptr(cd); } static struct char_device_struct * @@ -343,7 +340,7 @@ void __unregister_chrdev(unsigned int major, unsigned int baseminor, kfree(cd); } -static DEFINE_SPINLOCK(cdev_lock); +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(cdev_lock); static struct kobject *cdev_get(struct cdev *p) { diff --git a/fs/dcache.c b/fs/dcache.c index 66dd1bb830d1..7088df2d042c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3237,10 +3237,7 @@ EXPORT_SYMBOL(d_parent_ino); static __initdata unsigned long dhash_entries; static int __init set_dhash_entries(char *str) { - if (!str) - return 0; - dhash_entries = simple_strtoul(str, &str, 0); - return 1; + return kstrtoul(str, 0, &dhash_entries) == 0; } __setup("dhash_entries=", set_dhash_entries); diff --git a/fs/exec.c b/fs/exec.c index 9d5ebc9d15b0..d0606e53376f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -555,7 +555,7 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) return -E2BIG; while (len > 0) { - unsigned int bytes_to_copy = min_t(unsigned int, len, + unsigned int bytes_to_copy = min(len, min_not_zero(offset_in_page(pos), PAGE_SIZE)); struct page *page; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 56d50fd3310b..e817a758801d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4276,8 +4276,7 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, * 
get the corresponding group metadata to work with. * For this we have goto again loop. */ - thisgrp_len = min_t(unsigned int, (unsigned int)len, - EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff)); + thisgrp_len = min(len, EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff)); clen = EXT4_NUM_B2C(sbi, thisgrp_len); if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) { diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 050f26168d97..76842f0957b5 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1479,7 +1479,7 @@ static void ext4_update_super(struct super_block *sb, /* Update the global fs size fields */ sbi->s_groups_count += flex_gd->count; - sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + sbi->s_blockfile_groups = min(sbi->s_groups_count, (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); /* Update the reserved block counts only once the new group is diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a6241ffb8639..a8d2460b527a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4837,7 +4837,7 @@ static int ext4_check_geometry(struct super_block *sb, return -EINVAL; } sbi->s_groups_count = blocks_count; - sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + sbi->s_blockfile_groups = min(sbi->s_groups_count, (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != le32_to_cpu(es->s_inodes_count)) { diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 0e97ef6c2327..07d95f1442c8 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -1355,7 +1355,7 @@ found: /* Fill the long name slots. */ for (i = 0; i < long_bhs; i++) { - int copy = min_t(int, sb->s_blocksize - offset, size); + int copy = umin(sb->s_blocksize - offset, size); memcpy(bhs[i]->b_data + offset, slots, copy); mark_buffer_dirty_inode(bhs[i], dir); offset = 0; @@ -1366,7 +1366,7 @@ found: err = fat_sync_bhs(bhs, long_bhs); if (!err && i < nr_bhs) { /* Fill the short name slot. 
*/ - int copy = min_t(int, sb->s_blocksize - offset, size); + int copy = umin(sb->s_blocksize - offset, size); memcpy(bhs[i]->b_data + offset, slots, copy); mark_buffer_dirty_inode(bhs[i], dir); if (IS_DIRSYNC(dir)) diff --git a/fs/fat/file.c b/fs/fat/file.c index afc0e3ad6536..124d9c5431c8 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -141,8 +141,7 @@ static int fat_ioctl_fitrim(struct inode *inode, unsigned long arg) if (copy_from_user(&range, user_range, sizeof(range))) return -EFAULT; - range.minlen = max_t(unsigned int, range.minlen, - bdev_discard_granularity(sb->s_bdev)); + range.minlen = max(range.minlen, bdev_discard_granularity(sb->s_bdev)); err = fat_trim_fs(inode, &range); if (err < 0) diff --git a/fs/file_table.c b/fs/file_table.c index cd4a3db4659a..34244fccf2ed 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -176,6 +176,11 @@ static int init_file(struct file *f, int flags, const struct cred *cred) f->f_flags = flags; f->f_mode = OPEN_FMODE(flags); + /* + * Disable permission and pre-content events for all files by default. + * They may be enabled later by fsnotify_open_perm_and_set_mode(). + */ + file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM); f->f_op = NULL; f->f_mapping = NULL; @@ -197,11 +202,6 @@ static int init_file(struct file *f, int flags, const struct cred *cred) * refcount bumps we should reinitialize the reused file first. */ file_ref_init(&f->f_ref, 1); - /* - * Disable permission and pre-content events for all files by default. - * They may be enabled later by fsnotify_open_perm_and_set_mode(). 
- */ - file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM); return 0; } diff --git a/fs/fs_struct.c b/fs/fs_struct.c index b8c46c5a38a0..394875d06fd6 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "internal.h" /* diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 8f484b105f13..cbde6ac1add3 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -122,20 +122,16 @@ int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, * them to be refreshed the next time they are used, * and it also updates i_ctime. */ - size_t size = posix_acl_xattr_size(acl->a_count); + size_t size; void *value; - if (size > PAGE_SIZE) - return -E2BIG; - - value = kmalloc(size, GFP_KERNEL); + value = posix_acl_to_xattr(fc->user_ns, acl, &size, GFP_KERNEL); if (!value) return -ENOMEM; - ret = posix_acl_to_xattr(fc->user_ns, acl, value, size); - if (ret < 0) { + if (size > PAGE_SIZE) { kfree(value); - return ret; + return -E2BIG; } /* diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6d59cbc877c6..a30c8b57d478 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1813,7 +1813,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, goto out_iput; folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset; - nr_bytes = min_t(unsigned, num, folio_size(folio) - folio_offset); + nr_bytes = min(num, folio_size(folio) - folio_offset); nr_pages = (offset + nr_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT; err = fuse_copy_folio(cs, &folio, folio_offset, nr_bytes, 0); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 78fa46cfc636..dffd454e30e2 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1323,10 +1323,8 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, static inline unsigned int fuse_wr_pages(loff_t pos, size_t len, unsigned int max_pages) { - return min_t(unsigned int, - ((pos + len - 1) >> PAGE_SHIFT) - - (pos >> PAGE_SHIFT) + 1, - max_pages); + return min(((pos + len - 1) >> PAGE_SHIFT) - (pos >> PAGE_SHIFT) + 1, 
+ max_pages); } static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) @@ -1607,7 +1605,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, struct folio *folio = page_folio(pages[i]); unsigned int offset = start + (folio_page_idx(folio, pages[i]) << PAGE_SHIFT); - unsigned int len = min_t(unsigned int, ret, PAGE_SIZE - start); + unsigned int len = umin(ret, PAGE_SIZE - start); ap->descs[ap->num_folios].offset = offset; ap->descs[ap->num_folios].length = len; diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 443640e6fb9c..a5b60778b91c 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -83,21 +83,14 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu) int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int error; - size_t len; - char *data; + size_t len = 0; + char *data = NULL; const char *name = gfs2_acl_name(type); if (acl) { - len = posix_acl_xattr_size(acl->a_count); - data = kmalloc(len, GFP_NOFS); + data = posix_acl_to_xattr(&init_user_ns, acl, &len, GFP_NOFS); if (data == NULL) return -ENOMEM; - error = posix_acl_to_xattr(&init_user_ns, acl, data, len); - if (error < 0) - goto out; - } else { - data = NULL; - len = 0; } error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); diff --git a/fs/inode.c b/fs/inode.c index 1d0474745e77..dae43a8de7e0 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1028,19 +1028,20 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) return freed; } -static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked); +static void __wait_on_freeing_inode(struct inode *inode, bool hash_locked, bool rcu_locked); + /* * Called with the inode lock held. 
*/ static struct inode *find_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), - void *data, bool is_inode_hash_locked, + void *data, bool hash_locked, bool *isnew) { struct inode *inode = NULL; - if (is_inode_hash_locked) + if (hash_locked) lockdep_assert_held(&inode_hash_lock); else lockdep_assert_not_held(&inode_hash_lock); @@ -1054,7 +1055,7 @@ repeat: continue; spin_lock(&inode->i_lock); if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) { - __wait_on_freeing_inode(inode, is_inode_hash_locked); + __wait_on_freeing_inode(inode, hash_locked, true); goto repeat; } if (unlikely(inode_state_read(inode) & I_CREATING)) { @@ -1078,11 +1079,11 @@ repeat: */ static struct inode *find_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino, - bool is_inode_hash_locked, bool *isnew) + bool hash_locked, bool *isnew) { struct inode *inode = NULL; - if (is_inode_hash_locked) + if (hash_locked) lockdep_assert_held(&inode_hash_lock); else lockdep_assert_not_held(&inode_hash_lock); @@ -1096,7 +1097,7 @@ repeat: continue; spin_lock(&inode->i_lock); if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) { - __wait_on_freeing_inode(inode, is_inode_hash_locked); + __wait_on_freeing_inode(inode, hash_locked, true); goto repeat; } if (unlikely(inode_state_read(inode) & I_CREATING)) { @@ -1832,16 +1833,13 @@ int insert_inode_locked(struct inode *inode) while (1) { struct inode *old = NULL; spin_lock(&inode_hash_lock); +repeat: hlist_for_each_entry(old, head, i_hash) { if (old->i_ino != ino) continue; if (old->i_sb != sb) continue; spin_lock(&old->i_lock); - if (inode_state_read(old) & (I_FREEING | I_WILL_FREE)) { - spin_unlock(&old->i_lock); - continue; - } break; } if (likely(!old)) { @@ -1852,6 +1850,11 @@ int insert_inode_locked(struct inode *inode) spin_unlock(&inode_hash_lock); return 0; } + if (inode_state_read(old) & (I_FREEING | I_WILL_FREE)) { + __wait_on_freeing_inode(old, true, false); + old = NULL; + 
goto repeat; + } if (unlikely(inode_state_read(old) & I_CREATING)) { spin_unlock(&old->i_lock); spin_unlock(&inode_hash_lock); @@ -2522,16 +2525,18 @@ EXPORT_SYMBOL(inode_needs_sync); * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list * will DTRT. */ -static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked) +static void __wait_on_freeing_inode(struct inode *inode, bool hash_locked, bool rcu_locked) { struct wait_bit_queue_entry wqe; struct wait_queue_head *wq_head; + VFS_BUG_ON(!hash_locked && !rcu_locked); + /* * Handle racing against evict(), see that routine for more details. */ if (unlikely(inode_unhashed(inode))) { - WARN_ON(is_inode_hash_locked); + WARN_ON(hash_locked); spin_unlock(&inode->i_lock); return; } @@ -2539,23 +2544,22 @@ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_lock wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW); prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); - rcu_read_unlock(); - if (is_inode_hash_locked) + if (rcu_locked) + rcu_read_unlock(); + if (hash_locked) spin_unlock(&inode_hash_lock); schedule(); finish_wait(wq_head, &wqe.wq_entry); - if (is_inode_hash_locked) + if (hash_locked) spin_lock(&inode_hash_lock); - rcu_read_lock(); + if (rcu_locked) + rcu_read_lock(); } static __initdata unsigned long ihash_entries; static int __init set_ihash_entries(char *str) { - if (!str) - return 0; - ihash_entries = simple_strtoul(str, &str, 0); - return 1; + return kstrtoul(str, 0, &ihash_entries) == 0; } __setup("ihash_entries=", set_ihash_entries); @@ -3005,24 +3009,45 @@ umode_t mode_strip_sgid(struct mnt_idmap *idmap, EXPORT_SYMBOL(mode_strip_sgid); #ifdef CONFIG_DEBUG_VFS -/* - * Dump an inode. +/** + * dump_inode - dump an inode. + * @inode: inode to dump + * @reason: reason for dumping * - * TODO: add a proper inode dumping routine, this is a stub to get debug off the - * ground. 
- * - * TODO: handle getting to fs type with get_kernel_nofault()? - * See dump_mapping() above. + * If inode is an invalid pointer, we don't want to crash accessing it, + * so probe everything depending on it carefully with get_kernel_nofault(). */ void dump_inode(struct inode *inode, const char *reason) { - struct super_block *sb = inode->i_sb; + struct super_block *sb; + struct file_system_type *s_type; + const char *fs_name_ptr; + char fs_name[32] = {}; + umode_t mode; + unsigned short opflags; + unsigned int flags; + unsigned int state; + int count; - pr_warn("%s encountered for inode %px\n" - "fs %s mode %ho opflags 0x%hx flags 0x%x state 0x%x count %d\n", - reason, inode, sb->s_type->name, inode->i_mode, inode->i_opflags, - inode->i_flags, inode_state_read_once(inode), atomic_read(&inode->i_count)); + if (get_kernel_nofault(sb, &inode->i_sb) || + get_kernel_nofault(mode, &inode->i_mode) || + get_kernel_nofault(opflags, &inode->i_opflags) || + get_kernel_nofault(flags, &inode->i_flags)) { + pr_warn("%s: unreadable inode:%px\n", reason, inode); + return; + } + + state = inode_state_read_once(inode); + count = atomic_read(&inode->i_count); + + if (!sb || + get_kernel_nofault(s_type, &sb->s_type) || !s_type || + get_kernel_nofault(fs_name_ptr, &s_type->name) || !fs_name_ptr || + strncpy_from_kernel_nofault(fs_name, fs_name_ptr, sizeof(fs_name) - 1) < 0) + strscpy(fs_name, ""); + + pr_warn("%s: inode:%px fs:%s mode:%ho opflags:%#x flags:%#x state:%#x count:%d\n", + reason, inode, fs_name, mode, opflags, flags, state, count); } - EXPORT_SYMBOL(dump_inode); #endif diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 1de3602c98de..16b71a23ff1e 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -61,7 +61,7 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, { char *ea_name; int rc; - int size = 0; + size_t size = 0; char *value = NULL; switch (type) { @@ -76,16 +76,11 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, } if (acl) { - size = 
posix_acl_xattr_size(acl->a_count); - value = kmalloc(size, GFP_KERNEL); + value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_KERNEL); if (!value) return -ENOMEM; - rc = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (rc < 0) - goto out; } rc = __jfs_setxattr(tid, inode, ea_name, value, size, 0); -out: kfree(value); if (!rc) diff --git a/fs/locks.c b/fs/locks.c index cf1968b01bcb..3ea25d3a780f 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -178,7 +178,6 @@ locks_get_lock_context(struct inode *inode, int type) { struct file_lock_context *ctx; - /* paired with cmpxchg() below */ ctx = locks_inode_context(inode); if (likely(ctx) || type == F_UNLCK) goto out; @@ -196,7 +195,18 @@ locks_get_lock_context(struct inode *inode, int type) * Assign the pointer if it's not already assigned. If it is, then * free the context we just allocated. */ - if (cmpxchg(&inode->i_flctx, NULL, ctx)) { + spin_lock(&inode->i_lock); + if (!(inode->i_opflags & IOP_FLCTX)) { + VFS_BUG_ON_INODE(inode->i_flctx, inode); + WRITE_ONCE(inode->i_flctx, ctx); + /* + * Paired with locks_inode_context(). 
+ */ + smp_store_release(&inode->i_opflags, inode->i_opflags | IOP_FLCTX); + spin_unlock(&inode->i_lock); + } else { + VFS_BUG_ON_INODE(!inode->i_flctx, inode); + spin_unlock(&inode->i_lock); kmem_cache_free(flctx_cache, ctx); ctx = locks_inode_context(inode); } diff --git a/fs/namei.c b/fs/namei.c index 76bc569ace8e..b28ecb699f32 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -879,7 +879,7 @@ static bool try_to_unlazy(struct nameidata *nd) { struct dentry *parent = nd->path.dentry; - BUG_ON(!(nd->flags & LOOKUP_RCU)); + VFS_BUG_ON(!(nd->flags & LOOKUP_RCU)); if (unlikely(nd->flags & LOOKUP_CACHED)) { drop_links(nd); @@ -919,7 +919,8 @@ out: static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry) { int res; - BUG_ON(!(nd->flags & LOOKUP_RCU)); + + VFS_BUG_ON(!(nd->flags & LOOKUP_RCU)); if (unlikely(nd->flags & LOOKUP_CACHED)) { drop_links(nd); @@ -1631,9 +1632,6 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path) struct dentry *dentry = path->dentry; unsigned int flags = dentry->d_flags; - if (likely(!(flags & DCACHE_MANAGED_DENTRY))) - return true; - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return false; diff --git a/fs/namespace.c b/fs/namespace.c index 1d2089ffb6ab..0cc8c2757500 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -49,20 +49,14 @@ static unsigned int mp_hash_shift __ro_after_init; static __initdata unsigned long mhash_entries; static int __init set_mhash_entries(char *str) { - if (!str) - return 0; - mhash_entries = simple_strtoul(str, &str, 0); - return 1; + return kstrtoul(str, 0, &mhash_entries) == 0; } __setup("mhash_entries=", set_mhash_entries); static __initdata unsigned long mphash_entries; static int __init set_mphash_entries(char *str) { - if (!str) - return 0; - mphash_entries = simple_strtoul(str, &str, 0); - return 1; + return kstrtoul(str, 0, &mphash_entries) == 0; } __setup("mphash_entries=", set_mphash_entries); diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 
c93df55e98d0..37a69a75ce68 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -641,13 +641,9 @@ static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap, value = NULL; flags = XATTR_REPLACE; } else { - size = posix_acl_xattr_size(acl->a_count); - value = kmalloc(size, GFP_NOFS); + value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_NOFS); if (!value) return -ENOMEM; - err = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (err < 0) - goto out; flags = 0; } diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 5aefb705bcc8..a01ef0c1b1bf 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -90,14 +90,9 @@ int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) type); if (acl) { - size = posix_acl_xattr_size(acl->a_count); - value = kmalloc(size, GFP_KERNEL); + value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_KERNEL); if (!value) return -ENOMEM; - - error = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (error < 0) - goto out; } gossip_debug(GOSSIP_ACL_DEBUG, @@ -111,7 +106,6 @@ int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) */ error = orangefs_inode_setxattr(inode, name, value, size, 0); -out: kfree(value); if (!error) set_cached_acl(inode, type, acl); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 768f027c1428..4ef6f9d2b8d6 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -829,19 +829,19 @@ EXPORT_SYMBOL (posix_acl_from_xattr); /* * Convert from in-memory to extended attribute representation. 
*/ -int +void * posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, - void *buffer, size_t size) + size_t *sizep, gfp_t gfp) { - struct posix_acl_xattr_header *ext_acl = buffer; + struct posix_acl_xattr_header *ext_acl; struct posix_acl_xattr_entry *ext_entry; - int real_size, n; + size_t size; + int n; - real_size = posix_acl_xattr_size(acl->a_count); - if (!buffer) - return real_size; - if (real_size > size) - return -ERANGE; + size = posix_acl_xattr_size(acl->a_count); + ext_acl = kmalloc(size, gfp); + if (!ext_acl) + return NULL; ext_entry = (void *)(ext_acl + 1); ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); @@ -864,7 +864,8 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, break; } } - return real_size; + *sizep = size; + return ext_acl; } EXPORT_SYMBOL (posix_acl_to_xattr); diff --git a/fs/select.c b/fs/select.c index 65019b8ba3f7..78a1508c84d3 100644 --- a/fs/select.c +++ b/fs/select.c @@ -1038,14 +1038,11 @@ static long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; - struct timespec64 *to = NULL, end_time; + struct timespec64 *to = NULL; int ret; - if (restart_block->poll.has_timeout) { - end_time.tv_sec = restart_block->poll.tv_sec; - end_time.tv_nsec = restart_block->poll.tv_nsec; - to = &end_time; - } + if (restart_block->poll.has_timeout) + to = &restart_block->poll.end_time; ret = do_sys_poll(ufds, nfds, to); @@ -1077,8 +1074,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, restart_block->poll.nfds = nfds; if (timeout_msecs >= 0) { - restart_block->poll.tv_sec = end_time.tv_sec; - restart_block->poll.tv_nsec = end_time.tv_nsec; + restart_block->poll.end_time = end_time; restart_block->poll.has_timeout = 1; } else restart_block->poll.has_timeout = 0; diff --git a/fs/splice.c b/fs/splice.c index d338fe56b50b..5fb07c01936f 100644 --- a/fs/splice.c +++ 
b/fs/splice.c @@ -1467,7 +1467,7 @@ static ssize_t iter_to_pipe(struct iov_iter *from, n = DIV_ROUND_UP(left + start, PAGE_SIZE); for (i = 0; i < n; i++) { - int size = min_t(int, left, PAGE_SIZE - start); + int size = umin(left, PAGE_SIZE - start); buf.page = pages[i]; buf.offset = start; diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h index 0864773a57e8..822085bc2d20 100644 --- a/include/linux/device_cgroup.h +++ b/include/linux/device_cgroup.h @@ -21,7 +21,7 @@ static inline int devcgroup_inode_permission(struct inode *inode, int mask) if (likely(!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))) return 0; - if (likely(!inode->i_rdev)) + if (!inode->i_rdev) return 0; if (S_ISBLK(inode->i_mode)) diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index f0cf2714ec52..262e24d83313 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -201,9 +201,9 @@ struct handle_to_path_ctx { * @commit_metadata: commit metadata changes to stable storage * * See Documentation/filesystems/nfs/exporting.rst for details on how to use - * this interface correctly. + * this interface correctly and the definition of the flags. * - * encode_fh: + * @encode_fh: * @encode_fh should store in the file handle fragment @fh (using at most * @max_len bytes) information that can be used by @decode_fh to recover the * file referred to by the &struct dentry @de. If @flag has CONNECTABLE bit @@ -215,7 +215,7 @@ struct handle_to_path_ctx { * greater than @max_len*4 bytes). On error @max_len contains the minimum * size(in 4 byte unit) needed to encode the file handle. * - * fh_to_dentry: + * @fh_to_dentry: * @fh_to_dentry is given a &struct super_block (@sb) and a file handle * fragment (@fh, @fh_len). It should return a &struct dentry which refers * to the same file that the file handle fragment refers to. If it cannot, @@ -227,31 +227,44 @@ struct handle_to_path_ctx { * created with d_alloc_root. 
The caller can then find any other extant * dentries by following the d_alias links. * - * fh_to_parent: + * @fh_to_parent: * Same as @fh_to_dentry, except that it returns a pointer to the parent * dentry if it was encoded into the filehandle fragment by @encode_fh. * - * get_name: + * @get_name: * @get_name should find a name for the given @child in the given @parent * directory. The name should be stored in the @name (with the - * understanding that it is already pointing to a %NAME_MAX+1 sized + * understanding that it is already pointing to a %NAME_MAX + 1 sized * buffer. get_name() should return %0 on success, a negative error code * or error. @get_name will be called without @parent->i_rwsem held. * - * get_parent: + * @get_parent: * @get_parent should find the parent directory for the given @child which * is also a directory. In the event that it cannot be found, or storage * space cannot be allocated, a %ERR_PTR should be returned. * - * permission: + * @permission: * Allow filesystems to specify a custom permission function. * - * open: + * @open: * Allow filesystems to specify a custom open function. * - * commit_metadata: + * @commit_metadata: * @commit_metadata should commit metadata changes to stable storage. * + * @get_uuid: + * Get a filesystem unique signature exposed to clients. + * + * @map_blocks: + * Map and, if necessary, allocate blocks for a layout. + * + * @commit_blocks: + * Commit blocks in a layout once the client is done with them. + * + * @flags: + * Allows the filesystem to communicate to nfsd that it may want to do things + * differently when dealing with it. 
+ * * Locking rules: * get_parent is called with child->d_inode->i_rwsem down * get_name is not (which is possibly inconsistent) diff --git a/include/linux/filelock.h b/include/linux/filelock.h index 2f5e5588ee07..d2c9740e26a8 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -242,7 +242,14 @@ bool locks_owner_has_blockers(struct file_lock_context *flctx, static inline struct file_lock_context * locks_inode_context(const struct inode *inode) { - return smp_load_acquire(&inode->i_flctx); + /* + * Paired with smp_store_release in locks_get_lock_context(). + * + * Ensures ->i_flctx will be visible if we spotted the flag. + */ + if (likely(!(smp_load_acquire(&inode->i_opflags) & IOP_FLCTX))) + return NULL; + return READ_ONCE(inode->i_flctx); } #else /* !CONFIG_FILE_LOCKING */ @@ -469,7 +476,7 @@ static inline int break_lease(struct inode *inode, unsigned int mode) * could end up racing with tasks trying to set a new lease on this * file. */ - flctx = READ_ONCE(inode->i_flctx); + flctx = locks_inode_context(inode); if (!flctx) return 0; smp_mb(); @@ -488,7 +495,7 @@ static inline int break_deleg(struct inode *inode, unsigned int flags) * could end up racing with tasks trying to set a new lease on this * file. 
*/ - flctx = READ_ONCE(inode->i_flctx); + flctx = locks_inode_context(inode); if (!flctx) return 0; smp_mb(); @@ -533,8 +540,11 @@ static inline int break_deleg_wait(struct delegated_inode *di) static inline int break_layout(struct inode *inode, bool wait) { + struct file_lock_context *flctx; + smp_mb(); - if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) { + flctx = locks_inode_context(inode); + if (flctx && !list_empty_careful(&flctx->flc_lease)) { unsigned int flags = LEASE_BREAK_LAYOUT; if (!wait) diff --git a/include/linux/fs.h b/include/linux/fs.h index ca31bc9308a3..73911f961c7e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -631,6 +631,7 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_MGTIME 0x0020 #define IOP_CACHED_LINK 0x0040 #define IOP_FASTPERM_MAY_EXEC 0x0080 +#define IOP_FLCTX 0x0100 /* * Inode state bits. Protected by inode->i_lock diff --git a/include/linux/ns/ns_common_types.h b/include/linux/ns/ns_common_types.h index b332b019b29c..0014fbc1c626 100644 --- a/include/linux/ns/ns_common_types.h +++ b/include/linux/ns/ns_common_types.h @@ -108,11 +108,13 @@ extern const struct proc_ns_operations utsns_operations; * @ns_tree: namespace tree nodes and active reference count */ struct ns_common { + struct { + refcount_t __ns_ref; /* do not use directly */ + } ____cacheline_aligned_in_smp; u32 ns_type; struct dentry *stashed; const struct proc_ns_operations *ops; unsigned int inum; - refcount_t __ns_ref; /* do not use directly */ union { struct ns_tree; struct rcu_head ns_rcu; diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index e86f3b731da2..9e1892525eac 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -44,8 +44,9 @@ posix_acl_from_xattr(struct user_namespace *user_ns, const void *value, } #endif -int posix_acl_to_xattr(struct user_namespace *user_ns, - const struct posix_acl *acl, void *buffer, size_t size); +extern void 
*posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, + size_t *sizep, gfp_t gfp); + static inline const char *posix_acl_xattr_name(int type) { switch (type) { diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index 67d2bf579942..9b262109726d 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -6,6 +6,7 @@ #define __LINUX_RESTART_BLOCK_H #include <linux/compiler.h> +#include <linux/time64.h> #include <linux/types.h> struct __kernel_timespec; @@ -50,8 +51,7 @@ struct restart_block { struct pollfd __user *ufds; int nfds; int has_timeout; - unsigned long tv_sec; - unsigned long tv_nsec; + struct timespec64 end_time; } poll; }; }; diff --git a/init/Kconfig b/init/Kconfig index fa79feb8fe57..160c1c4ef253 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -624,8 +624,9 @@ config SCHED_HW_PRESSURE arch_update_hw_pressure() and arch_scale_thermal_pressure(). config BSD_PROCESS_ACCT - bool "BSD Process Accounting" + bool "BSD Process Accounting (DEPRECATED)" depends on MULTIUSER + default n help If you say Y here, a user level program will be able to instruct the kernel (via a special system call) to write process accounting @@ -635,7 +636,9 @@ config BSD_PROCESS_ACCT command name, memory usage, controlling terminal etc. (the complete list is in the struct acct in <linux/acct.h>). It is up to the user level program to do useful things with this - information. This is generally a good idea, so say Y. + information. This mechanism is antiquated and has significant + scalability issues. You probably want to use eBPF instead. Say + N unless you really need this.
config BSD_PROCESS_ACCT_V3 bool "BSD Process Accounting version 3 file format" diff --git a/init/initramfs_test.c b/init/initramfs_test.c index 5d2db455e60c..beb6e3cf7808 100644 --- a/init/initramfs_test.c +++ b/init/initramfs_test.c @@ -447,6 +447,53 @@ out: kfree(tbufs); } +static void __init initramfs_test_fname_path_max(struct kunit *test) +{ + char *err; + size_t len; + struct kstat st0, st1; + char fdata[] = "this file data will not be unpacked"; + struct test_fname_path_max { + char fname_oversize[PATH_MAX + 1]; + char fname_ok[PATH_MAX]; + char cpio_src[(CPIO_HDRLEN + PATH_MAX + 3 + sizeof(fdata)) * 2]; + } *tbufs = kzalloc(sizeof(struct test_fname_path_max), GFP_KERNEL); + struct initramfs_test_cpio c[] = { { + .magic = "070701", + .ino = 1, + .mode = S_IFDIR | 0777, + .nlink = 1, + .namesize = sizeof(tbufs->fname_oversize), + .fname = tbufs->fname_oversize, + .filesize = sizeof(fdata), + .data = fdata, + }, { + .magic = "070701", + .ino = 2, + .mode = S_IFDIR | 0777, + .nlink = 1, + .namesize = sizeof(tbufs->fname_ok), + .fname = tbufs->fname_ok, + } }; + + memset(tbufs->fname_oversize, '/', sizeof(tbufs->fname_oversize) - 1); + memset(tbufs->fname_ok, '/', sizeof(tbufs->fname_ok) - 1); + memcpy(tbufs->fname_oversize, "fname_oversize", + sizeof("fname_oversize") - 1); + memcpy(tbufs->fname_ok, "fname_ok", sizeof("fname_ok") - 1); + len = fill_cpio(c, ARRAY_SIZE(c), tbufs->cpio_src); + + /* unpack skips over fname_oversize instead of returning an error */ + err = unpack_to_rootfs(tbufs->cpio_src, len); + KUNIT_EXPECT_NULL(test, err); + + KUNIT_EXPECT_EQ(test, init_stat("fname_oversize", &st0, 0), -ENOENT); + KUNIT_EXPECT_EQ(test, init_stat("fname_ok", &st1, 0), 0); + KUNIT_EXPECT_EQ(test, init_rmdir("fname_ok"), 0); + + kfree(tbufs); +} + /* * The kunit_case/_suite struct cannot be marked as __initdata as this will be * used in debugfs to retrieve results after test has run. 
@@ -459,6 +506,7 @@ static struct kunit_case __refdata initramfs_test_cases[] = { KUNIT_CASE(initramfs_test_hardlink), KUNIT_CASE(initramfs_test_many), KUNIT_CASE(initramfs_test_fname_pad), + KUNIT_CASE(initramfs_test_fname_path_max), {}, }; diff --git a/kernel/pid.c b/kernel/pid.c index a31771bc89c1..f45ae56db7da 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -159,58 +159,86 @@ void free_pids(struct pid **pids) free_pid(pids[tmp]); } -struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, - size_t set_tid_size) +struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid, + size_t arg_set_tid_size) { + int set_tid[MAX_PID_NS_LEVEL + 1] = {}; + int pid_max[MAX_PID_NS_LEVEL + 1] = {}; struct pid *pid; enum pid_type type; int i, nr; struct pid_namespace *tmp; struct upid *upid; int retval = -ENOMEM; + bool retried_preload; /* - * set_tid_size contains the size of the set_tid array. Starting at + * arg_set_tid_size contains the size of the arg_set_tid array. Starting at * the most nested currently active PID namespace it tells alloc_pid() * which PID to set for a process in that most nested PID namespace - * up to set_tid_size PID namespaces. It does not have to set the PID - * for a process in all nested PID namespaces but set_tid_size must + * up to arg_set_tid_size PID namespaces. It does not have to set the PID + * for a process in all nested PID namespaces but arg_set_tid_size must * never be greater than the current ns->level + 1. */ - if (set_tid_size > ns->level + 1) + if (arg_set_tid_size > ns->level + 1) return ERR_PTR(-EINVAL); + /* + * Prep before we take locks: + * + * 1. 
allocate and fill in pid struct + */ pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); if (!pid) return ERR_PTR(retval); - tmp = ns; + get_pid_ns(ns); pid->level = ns->level; + refcount_set(&pid->count, 1); + spin_lock_init(&pid->lock); + for (type = 0; type < PIDTYPE_MAX; ++type) + INIT_HLIST_HEAD(&pid->tasks[type]); + init_waitqueue_head(&pid->wait_pidfd); + INIT_HLIST_HEAD(&pid->inodes); - for (i = ns->level; i >= 0; i--) { - int tid = 0; - int pid_max = READ_ONCE(tmp->pid_max); + /* + * 2. perm check checkpoint_restore_ns_capable() + * + * This stores found pid_max to make sure the used value is the same should + * later code need it. + */ + for (tmp = ns, i = ns->level; i >= 0; i--) { + pid_max[ns->level - i] = READ_ONCE(tmp->pid_max); - if (set_tid_size) { - tid = set_tid[ns->level - i]; + if (arg_set_tid_size) { + int tid = set_tid[ns->level - i] = arg_set_tid[ns->level - i]; retval = -EINVAL; - if (tid < 1 || tid >= pid_max) - goto out_free; + if (tid < 1 || tid >= pid_max[ns->level - i]) + goto out_abort; /* * Also fail if a PID != 1 is requested and * no PID 1 exists. */ if (tid != 1 && !tmp->child_reaper) - goto out_free; + goto out_abort; retval = -EPERM; if (!checkpoint_restore_ns_capable(tmp->user_ns)) - goto out_free; - set_tid_size--; + goto out_abort; + arg_set_tid_size--; } - idr_preload(GFP_KERNEL); - spin_lock(&pidmap_lock); + tmp = tmp->parent; + } + + /* + * Prep is done, id allocation goes here: + */ + retried_preload = false; + idr_preload(GFP_KERNEL); + spin_lock(&pidmap_lock); + for (tmp = ns, i = ns->level; i >= 0;) { + int tid = set_tid[ns->level - i]; if (tid) { nr = idr_alloc(&tmp->idr, NULL, tid, @@ -220,6 +248,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, * alreay in use. Return EEXIST in that case. */ if (nr == -ENOSPC) + nr = -EEXIST; } else { int pid_min = 1; @@ -235,19 +264,42 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, * a partially initialized PID (see below). 
*/ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, - pid_max, GFP_ATOMIC); + pid_max[ns->level - i], GFP_ATOMIC); + if (nr == -ENOSPC) + nr = -EAGAIN; } - spin_unlock(&pidmap_lock); - idr_preload_end(); - if (nr < 0) { - retval = (nr == -ENOSPC) ? -EAGAIN : nr; + if (unlikely(nr < 0)) { + /* + * Preload more memory if idr_alloc{,cyclic} failed with -ENOMEM. + * + * The IDR API only allows us to preload memory for one call, while we may end + * up doing several under pidmap_lock with GFP_ATOMIC. The situation may be + * salvageable with GFP_KERNEL. But make sure to not loop indefinitely if preload + * did not help (the routine unfortunately returns void, so we have no idea + * if it got anywhere). + * + * The lock can be safely dropped and picked up as historically pid allocation + * for different namespaces was *not* atomic -- we try to hold on to it the + * entire time only for performance reasons. + */ + if (nr == -ENOMEM && !retried_preload) { + spin_unlock(&pidmap_lock); + idr_preload_end(); + retried_preload = true; + idr_preload(GFP_KERNEL); + spin_lock(&pidmap_lock); + continue; + } + retval = nr; goto out_free; } pid->numbers[i].nr = nr; pid->numbers[i].ns = tmp; tmp = tmp->parent; + i--; + retried_preload = false; } /* @@ -257,25 +309,15 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, * is what we have exposed to userspace for a long time and it is * documented behavior for pid namespaces. So we can't easily * change it even if there were an error code better suited. + * + * This can't be done earlier because we need to preserve other + * error conditions. 
*/ retval = -ENOMEM; - - get_pid_ns(ns); - refcount_set(&pid->count, 1); - spin_lock_init(&pid->lock); - for (type = 0; type < PIDTYPE_MAX; ++type) - INIT_HLIST_HEAD(&pid->tasks[type]); - - init_waitqueue_head(&pid->wait_pidfd); - INIT_HLIST_HEAD(&pid->inodes); - - upid = pid->numbers + ns->level; - idr_preload(GFP_KERNEL); - spin_lock(&pidmap_lock); - if (!(ns->pid_allocated & PIDNS_ADDING)) - goto out_unlock; + if (unlikely(!(ns->pid_allocated & PIDNS_ADDING))) + goto out_free; pidfs_add_pid(pid); - for ( ; upid >= pid->numbers; --upid) { + for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->pid_allocated++; @@ -286,13 +328,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, return pid; -out_unlock: - spin_unlock(&pidmap_lock); - idr_preload_end(); - put_pid_ns(ns); - out_free: - spin_lock(&pidmap_lock); while (++i <= ns->level) { upid = pid->numbers + i; idr_remove(&upid->ns->idr, upid->nr); @@ -303,7 +339,10 @@ out_free: idr_set_cursor(&ns->idr, 0); spin_unlock(&pidmap_lock); + idr_preload_end(); +out_abort: + put_pid_ns(ns); kmem_cache_free(ns->pid_cachep, pid); return ERR_PTR(retval); }