sched_ext: Short-circuit sched_class operations on dead tasks
7900aa699c("sched_ext: Fix cgroup exit ordering by moving sched_ext_free() to finish_task_switch()") moved sched_ext_free() to finish_task_switch() and renamed it to sched_ext_dead() to fix cgroup exit ordering issues. However, this created a race window where certain sched_class ops may be invoked on dead tasks leading to failures - e.g. sched_setscheduler() may try to switch a task which finished sched_ext_dead() back into SCX triggering invalid SCX task state transitions. Add task_dead_and_done() which tests whether a task is TASK_DEAD and has completed its final context switch, and use it to short-circuit sched_class operations which may be called on dead tasks. Fixes:7900aa699c("sched_ext: Fix cgroup exit ordering by moving sched_ext_free() to finish_task_switch()") Reported-by: Andrea Righi <arighi@nvidia.com> Link: http://lkml.kernel.org/r/20260202151341.796959-1-arighi@nvidia.com Reviewed-by: Andrea Righi <arighi@nvidia.com> Signed-off-by: Tejun Heo <tj@kernel.org>
parent c9894e6f01
commit 0eca95cba2

1 changed file with 48 additions and 0 deletions
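Before the diff, a minimal standalone sketch (plain userspace C) of the short-circuit pattern this patch introduces. struct task, its fields, and reweight_task() below are simplified stand-ins invented for this illustration, not the kernel's definitions; the real task_dead_and_done() additionally asserts that the task's rq lock is held and reads p->__state with READ_ONCE().

/*
 * Simplified illustration of the dead-task short-circuit; all types and
 * helpers here are stand-ins for this sketch, not the kernel's code.
 */
#include <stdbool.h>
#include <stdio.h>

enum task_state { TASK_RUNNING_X, TASK_DEAD_X };	/* illustrative states */

struct task {
	enum task_state	state;	/* stand-in for p->__state */
	bool		on_cpu;	/* stand-in for task_on_cpu(rq, p) */
	int		weight;
};

/*
 * A task that is dead and no longer on a CPU has completed its final
 * context switch and will never run again; the only remaining step is
 * sched_ext_dead(). Any sched_class op arriving after that point must
 * be a no-op.
 */
static bool task_dead_and_done(const struct task *p)
{
	return p->state == TASK_DEAD_X && !p->on_cpu;
}

/* An op guarded the way the patch guards reweight_task_scx() et al. */
static void reweight_task(struct task *p, int weight)
{
	if (task_dead_and_done(p))
		return;		/* short-circuit: task is gone from SCX's POV */
	p->weight = weight;
}

int main(void)
{
	struct task live = { .state = TASK_RUNNING_X, .on_cpu = true };
	struct task dead = { .state = TASK_DEAD_X, .on_cpu = false };

	reweight_task(&live, 100);	/* proceeds */
	reweight_task(&dead, 100);	/* short-circuited */
	printf("live weight: %d, dead weight: %d\n", live.weight, dead.weight);
	return 0;
}

The check is stable because, per the comment in the patch, do_task_dead() sets %TASK_DEAD with preemption disabled before the final __schedule(); once a task is observed both TASK_DEAD and off-CPU, it can never run again, so the guard cannot race with the task becoming live.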
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -194,6 +194,7 @@ MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microsecond
 #include <trace/events/sched_ext.h>
 
 static void process_ddsp_deferred_locals(struct rq *rq);
+static bool task_dead_and_done(struct task_struct *p);
 static u32 reenq_local(struct rq *rq);
 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
 static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
@@ -2618,6 +2619,9 @@ static void set_cpus_allowed_scx(struct task_struct *p,
 
 	set_cpus_allowed_common(p, ac);
 
+	if (task_dead_and_done(p))
+		return;
+
 	/*
 	 * The effective cpumask is stored in @p->cpus_ptr which may temporarily
 	 * differ from the configured one in @p->cpus_mask. Always tell the bpf
@@ -3033,10 +3037,45 @@ void scx_cancel_fork(struct task_struct *p)
 	percpu_up_read(&scx_fork_rwsem);
 }
 
+/**
+ * task_dead_and_done - Is a task dead and done running?
+ * @p: target task
+ *
+ * Once sched_ext_dead() removes the dead task from scx_tasks and exits it, the
+ * task no longer exists from SCX's POV. However, certain sched_class ops may be
+ * invoked on these dead tasks leading to failures - e.g. sched_setscheduler()
+ * may try to switch a task which finished sched_ext_dead() back into SCX
+ * triggering invalid SCX task state transitions and worse.
+ *
+ * Once a task has finished the final switch, sched_ext_dead() is the only thing
+ * that needs to happen on the task. Use this test to short-circuit sched_class
+ * operations which may be called on dead tasks.
+ */
+static bool task_dead_and_done(struct task_struct *p)
+{
+	struct rq *rq = task_rq(p);
+
+	lockdep_assert_rq_held(rq);
+
+	/*
+	 * In do_task_dead(), a dying task sets %TASK_DEAD with preemption
+	 * disabled and __schedule(). If @p has %TASK_DEAD set and off CPU, @p
+	 * won't ever run again.
+	 */
+	return unlikely(READ_ONCE(p->__state) == TASK_DEAD) &&
+		!task_on_cpu(rq, p);
+}
+
 void sched_ext_dead(struct task_struct *p)
 {
 	unsigned long flags;
 
+	/*
+	 * By the time control reaches here, @p has %TASK_DEAD set, switched out
+	 * for the last time and then dropped the rq lock - task_dead_and_done()
+	 * should be returning %true nullifying the straggling sched_class ops.
+	 * Remove from scx_tasks and exit @p.
+	 */
 	raw_spin_lock_irqsave(&scx_tasks_lock, flags);
 	list_del_init(&p->scx.tasks_node);
 	raw_spin_unlock_irqrestore(&scx_tasks_lock, flags);
@@ -3062,6 +3101,9 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p,
 
 	lockdep_assert_rq_held(task_rq(p));
 
+	if (task_dead_and_done(p))
+		return;
+
 	p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
 	if (SCX_HAS_OP(sch, set_weight))
 		SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
@@ -3076,6 +3118,9 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
 {
 	struct scx_sched *sch = scx_root;
 
+	if (task_dead_and_done(p))
+		return;
+
 	scx_enable_task(p);
 
 	/*
@@ -3089,6 +3134,9 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
 
 static void switched_from_scx(struct rq *rq, struct task_struct *p)
 {
+	if (task_dead_and_done(p))
+		return;
+
 	scx_disable_task(p);
 }
 