cgroup/cpuset: Clarify exclusion rules for cpuset internal variables

Clarify the locking rules associated with file level internal variables
inside the cpuset code. There is no functional change.

Reviewed-by: Chen Ridong <chenridong@huaweicloud.com>
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
This commit is contained in:
Waiman Long 2026-02-21 13:54:13 -05:00 committed by Tejun Heo
parent 68230aac8b
commit 17b1860034

View file

@@ -61,6 +61,58 @@ static const char * const perr_strings[] = {
[PERR_REMOTE] = "Have remote partition underneath",
};
/*
* CPUSET Locking Convention
* -------------------------
*
* Below are the three global locks guarding cpuset structures in lock
* acquisition order:
* - cpu_hotplug_lock (cpus_read_lock/cpus_write_lock)
* - cpuset_mutex
* - callback_lock (raw spinlock)
*
* A task must hold all three locks to modify externally visible or
* used fields of cpusets, though some of the internally used cpuset fields
* and internal variables can be modified without holding callback_lock. If
* only reliable read access to the externally used fields is needed, a task
* can hold either cpuset_mutex or callback_lock, which are exposed to other
* external subsystems.
*
* If a task holds cpu_hotplug_lock and cpuset_mutex, it blocks others,
* ensuring that it is the only task able to also acquire callback_lock and
* be able to modify cpusets. It can perform various checks on the cpuset
* structure first, knowing nothing will change. It can also allocate memory
* without holding callback_lock. While it is performing these checks, various
* callback routines can briefly acquire callback_lock to query cpusets. Once
* it is ready to make the changes, it takes callback_lock, blocking everyone
* else.
*
* Calls to the kernel memory allocator cannot be made while holding
* callback_lock which is a spinlock, as the memory allocator may sleep or
* call back into cpuset code and acquire callback_lock.
*
* Now, the task_struct fields mems_allowed and mempolicy may be changed
* by other task, we use alloc_lock in the task_struct fields to protect
* them.
*
* The cpuset_common_seq_show() handlers only hold callback_lock across
* small pieces of code, such as when reading out possibly multi-word
* cpumasks and nodemasks.
*/
static DEFINE_MUTEX(cpuset_mutex);
/*
* File level internal variables below follow one of the following exclusion
* rules.
*
* RWCS: Read/write-able by holding either cpus_write_lock (and optionally
* cpuset_mutex) or both cpus_read_lock and cpuset_mutex.
*
* CSCB: Readable by holding either cpuset_mutex or callback_lock. Writable
* by holding both cpuset_mutex and callback_lock.
*/
/*
* For local partitions, update to subpartitions_cpus & isolated_cpus is done
* in update_parent_effective_cpumask(). For remote partitions, it is done in
@@ -70,19 +122,18 @@ static const char * const perr_strings[] = {
* Exclusive CPUs distributed out to local or remote sub-partitions of
* top_cpuset
*/
static cpumask_var_t subpartitions_cpus;
static cpumask_var_t subpartitions_cpus; /* RWCS */
/*
* Exclusive CPUs in isolated partitions
* Exclusive CPUs in isolated partitions (shown in cpuset.cpus.isolated)
*/
static cpumask_var_t isolated_cpus;
static cpumask_var_t isolated_cpus; /* CSCB */
/*
* isolated_cpus updating flag (protected by cpuset_mutex)
* Set if isolated_cpus is going to be updated in the current
* cpuset_mutex crtical section.
* Set if isolated_cpus is being updated in the current cpuset_mutex
* critical section.
*/
static bool isolated_cpus_updating;
static bool isolated_cpus_updating; /* RWCS */
/*
* A flag to force sched domain rebuild at the end of an operation.
@@ -98,7 +149,7 @@ static bool isolated_cpus_updating;
* Note that update_relax_domain_level() in cpuset-v1.c can still call
* rebuild_sched_domains_locked() directly without using this flag.
*/
static bool force_sd_rebuild;
static bool force_sd_rebuild; /* RWCS */
/*
* Partition root states:
@@ -218,42 +269,6 @@ struct cpuset top_cpuset = {
.partition_root_state = PRS_ROOT,
};
/*
* There are two global locks guarding cpuset structures - cpuset_mutex and
* callback_lock. The cpuset code uses only cpuset_mutex. Other kernel
* subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
* structures. Note that cpuset_mutex needs to be a mutex as it is used in
* paths that rely on priority inheritance (e.g. scheduler - on RT) for
* correctness.
*
* A task must hold both locks to modify cpusets. If a task holds
* cpuset_mutex, it blocks others, ensuring that it is the only task able to
* also acquire callback_lock and be able to modify cpusets. It can perform
* various checks on the cpuset structure first, knowing nothing will change.
* It can also allocate memory while just holding cpuset_mutex. While it is
* performing these checks, various callback routines can briefly acquire
* callback_lock to query cpusets. Once it is ready to make the changes, it
* takes callback_lock, blocking everyone else.
*
* Calls to the kernel memory allocator can not be made while holding
* callback_lock, as that would risk double tripping on callback_lock
* from one of the callbacks into the cpuset code from within
* __alloc_pages().
*
* If a task is only holding callback_lock, then it has read-only
* access to cpusets.
*
* Now, the task_struct fields mems_allowed and mempolicy may be changed
* by other task, we use alloc_lock in the task_struct fields to protect
* them.
*
* The cpuset_common_seq_show() handlers only hold callback_lock across
* small pieces of code, such as when reading out possibly multi-word
* cpumasks and nodemasks.
*/
static DEFINE_MUTEX(cpuset_mutex);
/**
* cpuset_lock - Acquire the global cpuset mutex
*
@@ -1163,6 +1178,8 @@ static void reset_partition_data(struct cpuset *cs)
static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus)
{
WARN_ON_ONCE(old_prs == new_prs);
lockdep_assert_held(&callback_lock);
lockdep_assert_held(&cpuset_mutex);
if (new_prs == PRS_ISOLATED)
cpumask_or(isolated_cpus, isolated_cpus, xcpus);
else