mirror of
https://github.com/torvalds/linux.git
synced 2026-03-08 01:04:41 +01:00
kernel-7.0-rc1.misc
Please consider pulling these changes from the signed kernel-7.0-rc1.misc tag.

Thanks!
Christian

-----BEGIN PGP SIGNATURE-----

iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaZL+JwAKCRCRxhvAZXjc
ovU/AP4xgVxEegnNYrXZ+TpdCXbCtQZ54JqowFX73MBtaBHY1QD/YkDaIzl6K70v
d9P2Fe8Y6wOnIHxcjE4MIdMansphjAM=
=TN3q
-----END PGP SIGNATURE-----

Merge tag 'kernel-7.0-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull pidfs updates from Christian Brauner:

- pid: introduce task_ppid_vnr() helper

- pidfs: convert rb-tree to rhashtable

  Mateusz reported performance penalties during task creation because pidfs uses pidmap_lock to add elements into the rbtree. Switch to an rhashtable to have separate fine-grained locking and to decouple from pidmap_lock, moving all heavy manipulations outside of it.

  Also move inode allocation outside of pidmap_lock. With this there's nothing happening for pidfs under pidmap_lock.

- pid: reorder fields in pid_namespace to reduce false sharing

- Revert "pid: make __task_pid_nr_ns(ns => NULL) safe for zombie callers"

- ipc: Add SPDX license id to mqueue.c

* tag 'kernel-7.0-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  pid: introduce task_ppid_vnr() helper
  pidfs: implement ino allocation without the pidmap lock
  Revert "pid: make __task_pid_nr_ns(ns => NULL) safe for zombie callers"
  pid: reorder fields in pid_namespace to reduce false sharing
  pidfs: convert rb-tree to rhashtable
  ipc: Add SPDX license id to mqueue.c
This commit is contained in:
commit
543b9b6339
6 changed files with 120 additions and 96 deletions
172
fs/pidfs.c
172
fs/pidfs.c
|
|
@ -21,7 +21,9 @@
|
|||
#include <linux/utsname.h>
|
||||
#include <net/net_namespace.h>
|
||||
#include <linux/coredump.h>
|
||||
#include <linux/rhashtable.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/cookie.h>
|
||||
|
||||
#include "internal.h"
|
||||
#include "mount.h"
|
||||
|
|
@ -55,9 +57,48 @@ struct pidfs_attr {
|
|||
__u32 coredump_signal;
|
||||
};
|
||||
|
||||
static struct rb_root pidfs_ino_tree = RB_ROOT;
|
||||
static struct rhashtable pidfs_ino_ht;
|
||||
|
||||
static const struct rhashtable_params pidfs_ino_ht_params = {
|
||||
.key_offset = offsetof(struct pid, ino),
|
||||
.key_len = sizeof(u64),
|
||||
.head_offset = offsetof(struct pid, pidfs_hash),
|
||||
.automatic_shrinking = true,
|
||||
};
|
||||
|
||||
/*
|
||||
* inode number handling
|
||||
*
|
||||
* On 64 bit nothing special happens. The 64bit number assigned
|
||||
* to struct pid is the inode number.
|
||||
*
|
||||
* On 32 bit the 64 bit number assigned to struct pid is split
|
||||
* into two 32 bit numbers. The lower 32 bits are used as the
|
||||
* inode number and the upper 32 bits are used as the inode
|
||||
* generation number.
|
||||
*
|
||||
* On 32 bit pidfs_ino() will return the lower 32 bits. When
|
||||
* pidfs_ino() returns zero a wrap around happened. When a
|
||||
* wraparound happens the 64 bit number will be incremented by 1
|
||||
* so inode numbering starts at 1 again.
|
||||
*
|
||||
* On 64 bit comparing two pidfds is as simple as comparing
|
||||
* inode numbers.
|
||||
*
|
||||
* When a wraparound happens on 32 bit multiple pidfds with the
|
||||
* same inode number are likely to exist (This isn't a problem
|
||||
* since before pidfs pidfds used the anonymous inode meaning
|
||||
* all pidfds had the same inode number.). Userspace can
|
||||
* reconstruct the 64 bit identifier by retrieving both the
|
||||
* inode number and the inode generation number to compare or
|
||||
* use file handles.
|
||||
*/
|
||||
|
||||
#if BITS_PER_LONG == 32
|
||||
|
||||
DEFINE_SPINLOCK(pidfs_ino_lock);
|
||||
static u64 pidfs_ino_nr = 1;
|
||||
|
||||
static inline unsigned long pidfs_ino(u64 ino)
|
||||
{
|
||||
return lower_32_bits(ino);
|
||||
|
|
@ -69,6 +110,18 @@ static inline u32 pidfs_gen(u64 ino)
|
|||
return upper_32_bits(ino);
|
||||
}
|
||||
|
||||
static inline u64 pidfs_alloc_ino(void)
|
||||
{
|
||||
u64 ino;
|
||||
|
||||
spin_lock(&pidfs_ino_lock);
|
||||
if (pidfs_ino(pidfs_ino_nr) == 0)
|
||||
pidfs_ino_nr++;
|
||||
ino = pidfs_ino_nr++;
|
||||
spin_unlock(&pidfs_ino_lock);
|
||||
return ino;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* On 64 bit simply return ino. */
|
||||
|
|
@ -82,69 +135,47 @@ static inline u32 pidfs_gen(u64 ino)
|
|||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b)
|
||||
DEFINE_COOKIE(pidfs_ino_cookie);
|
||||
|
||||
static u64 pidfs_alloc_ino(void)
|
||||
{
|
||||
struct pid *pid_a = rb_entry(a, struct pid, pidfs_node);
|
||||
struct pid *pid_b = rb_entry(b, struct pid, pidfs_node);
|
||||
u64 pid_ino_a = pid_a->ino;
|
||||
u64 pid_ino_b = pid_b->ino;
|
||||
u64 ino;
|
||||
|
||||
if (pid_ino_a < pid_ino_b)
|
||||
return -1;
|
||||
if (pid_ino_a > pid_ino_b)
|
||||
return 1;
|
||||
return 0;
|
||||
preempt_disable();
|
||||
ino = gen_cookie_next(&pidfs_ino_cookie);
|
||||
preempt_enable();
|
||||
|
||||
VFS_WARN_ON_ONCE(ino < 1);
|
||||
return ino;
|
||||
}
|
||||
|
||||
void pidfs_add_pid(struct pid *pid)
|
||||
#endif
|
||||
|
||||
void pidfs_prepare_pid(struct pid *pid)
|
||||
{
|
||||
static u64 pidfs_ino_nr = 2;
|
||||
|
||||
/*
|
||||
* On 64 bit nothing special happens. The 64bit number assigned
|
||||
* to struct pid is the inode number.
|
||||
*
|
||||
* On 32 bit the 64 bit number assigned to struct pid is split
|
||||
* into two 32 bit numbers. The lower 32 bits are used as the
|
||||
* inode number and the upper 32 bits are used as the inode
|
||||
* generation number.
|
||||
*
|
||||
* On 32 bit pidfs_ino() will return the lower 32 bit. When
|
||||
* pidfs_ino() returns zero a wrap around happened. When a
|
||||
* wraparound happens the 64 bit number will be incremented by 2
|
||||
* so inode numbering starts at 2 again.
|
||||
*
|
||||
* On 64 bit comparing two pidfds is as simple as comparing
|
||||
* inode numbers.
|
||||
*
|
||||
* When a wraparound happens on 32 bit multiple pidfds with the
|
||||
* same inode number are likely to exist (This isn't a problem
|
||||
* since before pidfs pidfds used the anonymous inode meaning
|
||||
* all pidfds had the same inode number.). Userspace can
|
||||
* reconstruct the 64 bit identifier by retrieving both the
|
||||
* inode number and the inode generation number to compare or
|
||||
* use file handles.
|
||||
*/
|
||||
if (pidfs_ino(pidfs_ino_nr) == 0)
|
||||
pidfs_ino_nr += 2;
|
||||
|
||||
pid->ino = pidfs_ino_nr;
|
||||
pid->stashed = NULL;
|
||||
pid->attr = NULL;
|
||||
pidfs_ino_nr++;
|
||||
pid->ino = 0;
|
||||
}
|
||||
|
||||
write_seqcount_begin(&pidmap_lock_seq);
|
||||
rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp);
|
||||
write_seqcount_end(&pidmap_lock_seq);
|
||||
int pidfs_add_pid(struct pid *pid)
|
||||
{
|
||||
int ret;
|
||||
|
||||
pid->ino = pidfs_alloc_ino();
|
||||
ret = rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
|
||||
pidfs_ino_ht_params);
|
||||
if (unlikely(ret))
|
||||
pid->ino = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void pidfs_remove_pid(struct pid *pid)
|
||||
{
|
||||
write_seqcount_begin(&pidmap_lock_seq);
|
||||
rb_erase(&pid->pidfs_node, &pidfs_ino_tree);
|
||||
write_seqcount_end(&pidmap_lock_seq);
|
||||
if (likely(pid->ino))
|
||||
rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
|
||||
pidfs_ino_ht_params);
|
||||
}
|
||||
|
||||
void pidfs_free_pid(struct pid *pid)
|
||||
|
|
@ -415,7 +446,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
|
|||
* the fields are set correctly, or return ESRCH to avoid providing
|
||||
* incomplete information. */
|
||||
|
||||
kinfo.ppid = task_ppid_nr_ns(task, NULL);
|
||||
kinfo.ppid = task_ppid_vnr(task);
|
||||
kinfo.tgid = task_tgid_vnr(task);
|
||||
kinfo.pid = task_pid_vnr(task);
|
||||
kinfo.mask |= PIDFD_INFO_PID;
|
||||
|
|
@ -791,42 +822,24 @@ static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
|
|||
return FILEID_KERNFS;
|
||||
}
|
||||
|
||||
static int pidfs_ino_find(const void *key, const struct rb_node *node)
|
||||
{
|
||||
const u64 pid_ino = *(u64 *)key;
|
||||
const struct pid *pid = rb_entry(node, struct pid, pidfs_node);
|
||||
|
||||
if (pid_ino < pid->ino)
|
||||
return -1;
|
||||
if (pid_ino > pid->ino)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Find a struct pid based on the inode number. */
|
||||
static struct pid *pidfs_ino_get_pid(u64 ino)
|
||||
{
|
||||
struct pid *pid;
|
||||
struct rb_node *node;
|
||||
unsigned int seq;
|
||||
struct pidfs_attr *attr;
|
||||
|
||||
guard(rcu)();
|
||||
do {
|
||||
seq = read_seqcount_begin(&pidmap_lock_seq);
|
||||
node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find);
|
||||
if (node)
|
||||
break;
|
||||
} while (read_seqcount_retry(&pidmap_lock_seq, seq));
|
||||
|
||||
if (!node)
|
||||
pid = rhashtable_lookup(&pidfs_ino_ht, &ino, pidfs_ino_ht_params);
|
||||
if (!pid)
|
||||
return NULL;
|
||||
attr = READ_ONCE(pid->attr);
|
||||
if (IS_ERR_OR_NULL(attr))
|
||||
return NULL;
|
||||
if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask))
|
||||
return NULL;
|
||||
|
||||
pid = rb_entry(node, struct pid, pidfs_node);
|
||||
|
||||
/* Within our pid namespace hierarchy? */
|
||||
if (pid_vnr(pid) == 0)
|
||||
return NULL;
|
||||
|
||||
return get_pid(pid);
|
||||
}
|
||||
|
||||
|
|
@ -1104,6 +1117,9 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
|
|||
|
||||
void __init pidfs_init(void)
|
||||
{
|
||||
if (rhashtable_init(&pidfs_ino_ht, &pidfs_ino_ht_params))
|
||||
panic("Failed to initialize pidfs hashtable");
|
||||
|
||||
pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
|
||||
(SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
|
||||
SLAB_ACCOUNT | SLAB_PANIC), NULL);
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@
|
|||
#include <linux/rculist.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/refcount.h>
|
||||
#include <linux/rhashtable-types.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/wait.h>
|
||||
|
||||
|
|
@ -60,7 +61,7 @@ struct pid {
|
|||
spinlock_t lock;
|
||||
struct {
|
||||
u64 ino;
|
||||
struct rb_node pidfs_node;
|
||||
struct rhash_head pidfs_hash;
|
||||
struct dentry *stashed;
|
||||
struct pidfs_attr *attr;
|
||||
};
|
||||
|
|
@ -73,7 +74,6 @@ struct pid {
|
|||
struct upid numbers[];
|
||||
};
|
||||
|
||||
extern seqcount_spinlock_t pidmap_lock_seq;
|
||||
extern struct pid init_struct_pid;
|
||||
|
||||
struct file;
|
||||
|
|
@ -310,6 +310,11 @@ static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_na
|
|||
return pid;
|
||||
}
|
||||
|
||||
static inline pid_t task_ppid_vnr(const struct task_struct *tsk)
|
||||
{
|
||||
return task_ppid_nr_ns(tsk, NULL);
|
||||
}
|
||||
|
||||
static inline pid_t task_ppid_nr(const struct task_struct *tsk)
|
||||
{
|
||||
return task_ppid_nr_ns(tsk, &init_pid_ns);
|
||||
|
|
|
|||
|
|
@ -27,6 +27,13 @@ struct pid_namespace {
|
|||
struct idr idr;
|
||||
struct rcu_head rcu;
|
||||
unsigned int pid_allocated;
|
||||
#ifdef CONFIG_SYSCTL
|
||||
#if defined(CONFIG_MEMFD_CREATE)
|
||||
int memfd_noexec_scope;
|
||||
#endif
|
||||
struct ctl_table_set set;
|
||||
struct ctl_table_header *sysctls;
|
||||
#endif
|
||||
struct task_struct *child_reaper;
|
||||
struct kmem_cache *pid_cachep;
|
||||
unsigned int level;
|
||||
|
|
@ -40,13 +47,6 @@ struct pid_namespace {
|
|||
int reboot; /* group exit code if this pidns was rebooted */
|
||||
struct ns_common ns;
|
||||
struct work_struct work;
|
||||
#ifdef CONFIG_SYSCTL
|
||||
struct ctl_table_set set;
|
||||
struct ctl_table_header *sysctls;
|
||||
#if defined(CONFIG_MEMFD_CREATE)
|
||||
int memfd_noexec_scope;
|
||||
#endif
|
||||
#endif
|
||||
} __randomize_layout;
|
||||
|
||||
extern struct pid_namespace init_pid_ns;
|
||||
|
|
|
|||
|
|
@ -6,7 +6,8 @@ struct coredump_params;
|
|||
|
||||
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
|
||||
void __init pidfs_init(void);
|
||||
void pidfs_add_pid(struct pid *pid);
|
||||
void pidfs_prepare_pid(struct pid *pid);
|
||||
int pidfs_add_pid(struct pid *pid);
|
||||
void pidfs_remove_pid(struct pid *pid);
|
||||
void pidfs_exit(struct task_struct *tsk);
|
||||
#ifdef CONFIG_COREDUMP
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* POSIX message queues filesystem for Linux.
|
||||
*
|
||||
|
|
@ -9,8 +10,6 @@
|
|||
* Manfred Spraul (manfred@colorfullife.com)
|
||||
*
|
||||
* Audit: George Wilson (ltcgcw@us.ibm.com)
|
||||
*
|
||||
* This file is released under the GPL.
|
||||
*/
|
||||
|
||||
#include <linux/capability.h>
|
||||
|
|
|
|||
15
kernel/pid.c
15
kernel/pid.c
|
|
@ -43,7 +43,6 @@
|
|||
#include <linux/sched/task.h>
|
||||
#include <linux/idr.h>
|
||||
#include <linux/pidfs.h>
|
||||
#include <linux/seqlock.h>
|
||||
#include <net/sock.h>
|
||||
#include <uapi/linux/pidfd.h>
|
||||
|
||||
|
|
@ -85,7 +84,6 @@ struct pid_namespace init_pid_ns = {
|
|||
EXPORT_SYMBOL_GPL(init_pid_ns);
|
||||
|
||||
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
|
||||
seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
|
||||
|
||||
void put_pid(struct pid *pid)
|
||||
{
|
||||
|
|
@ -141,9 +139,9 @@ void free_pid(struct pid *pid)
|
|||
|
||||
idr_remove(&ns->idr, upid->nr);
|
||||
}
|
||||
pidfs_remove_pid(pid);
|
||||
spin_unlock(&pidmap_lock);
|
||||
|
||||
pidfs_remove_pid(pid);
|
||||
call_rcu(&pid->rcu, delayed_put_pid);
|
||||
}
|
||||
|
||||
|
|
@ -200,6 +198,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
|
|||
INIT_HLIST_HEAD(&pid->tasks[type]);
|
||||
init_waitqueue_head(&pid->wait_pidfd);
|
||||
INIT_HLIST_HEAD(&pid->inodes);
|
||||
pidfs_prepare_pid(pid);
|
||||
|
||||
/*
|
||||
* 2. perm check checkpoint_restore_ns_capable()
|
||||
|
|
@ -316,7 +315,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
|
|||
retval = -ENOMEM;
|
||||
if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
|
||||
goto out_free;
|
||||
pidfs_add_pid(pid);
|
||||
for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
|
||||
/* Make the PID visible to find_pid_ns. */
|
||||
idr_replace(&upid->ns->idr, pid, upid->nr);
|
||||
|
|
@ -326,6 +324,12 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
|
|||
idr_preload_end();
|
||||
ns_ref_active_get(ns);
|
||||
|
||||
retval = pidfs_add_pid(pid);
|
||||
if (unlikely(retval)) {
|
||||
free_pid(pid);
|
||||
pid = ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
return pid;
|
||||
|
||||
out_free:
|
||||
|
|
@ -554,8 +558,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
|
|||
rcu_read_lock();
|
||||
if (!ns)
|
||||
ns = task_active_pid_ns(current);
|
||||
if (ns)
|
||||
nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
|
||||
nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
|
||||
rcu_read_unlock();
|
||||
|
||||
return nr;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue