mirror of
https://github.com/torvalds/linux.git
synced 2026-03-08 03:44:45 +01:00
workqueue: Changes for v6.20
- Rework the rescuer to process work items one-by-one instead of slurping all pending work items in a single pass. As there is only one rescuer per workqueue, a single long-blocking work item could cause high latency for all tasks queued behind it, even after memory pressure is relieved and regular kworkers become available to service them. - Add CONFIG_BOOTPARAM_WQ_STALL_PANIC build-time option and workqueue.panic_on_stall_time parameter for time-based stall panic, giving systems more control over workqueue stall handling. - Replace BUG_ON() with panic() in the stall panic path for clearer intent and more informative output. -----BEGIN PGP SIGNATURE----- iIQEABYKACwWIQTfIjM1kS57o3GsC/uxYfJx3gVYGQUCaYov/A4cdGpAa2VybmVs Lm9yZwAKCRCxYfJx3gVYGWXnAQCfyELl+evz3RdFhyTiVCM1TiOnC1TsBjgkm3SJ orMhwgEAkgg40jino34wgeZRfdIAThxQ1O6bsvTpooWKjlCYcQY= =zZCF -----END PGP SIGNATURE----- Merge tag 'wq-for-6.20' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq Pull workqueue updates from Tejun Heo: - Rework the rescuer to process work items one-by-one instead of slurping all pending work items in a single pass. As there is only one rescuer per workqueue, a single long-blocking work item could cause high latency for all tasks queued behind it, even after memory pressure is relieved and regular kworkers become available to service them. - Add CONFIG_BOOTPARAM_WQ_STALL_PANIC build-time option and workqueue.panic_on_stall_time parameter for time-based stall panic, giving systems more control over workqueue stall handling. - Replace BUG_ON() with panic() in the stall panic path for clearer intent and more informative output. 
* tag 'wq-for-6.20' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq: workqueue: replace BUG_ON with panic in panic_on_wq_watchdog workqueue: add time-based panic for stalls workqueue: add CONFIG_BOOTPARAM_WQ_STALL_PANIC option workqueue: Process extra works in rescuer on memory pressure workqueue: Process rescuer work items one-by-one using a cursor workqueue: Make send_mayday() take a PWQ argument directly
This commit is contained in:
commit
9bdc64892d
3 changed files with 149 additions and 36 deletions
|
|
@ -8376,7 +8376,16 @@ Kernel parameters
|
|||
CONFIG_WQ_WATCHDOG. It sets the number of stalls
|
||||
required to trigger a panic.
|
||||
|
||||
The default is 0, which disables the panic on stall.
|
||||
The default is set by CONFIG_BOOTPARAM_WQ_STALL_PANIC,
|
||||
which is 0 (disabled) if not configured.
|
||||
|
||||
workqueue.panic_on_stall_time=<uint>
|
||||
Panic when a workqueue stall has been continuous for
|
||||
the specified number of seconds. Unlike panic_on_stall
|
||||
which counts accumulated stall events, this triggers
|
||||
based on the duration of a single continuous stall.
|
||||
|
||||
The default is 0, which disables the time-based panic.
|
||||
|
||||
workqueue.cpu_intensive_thresh_us=
|
||||
Per-cpu work items which run for longer than this
|
||||
|
|
|
|||
|
|
@ -117,6 +117,8 @@ enum wq_internal_consts {
|
|||
MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
|
||||
CREATE_COOLDOWN = HZ, /* time to breathe after fail */
|
||||
|
||||
RESCUER_BATCH = 16, /* process items per turn */
|
||||
|
||||
/*
|
||||
* Rescue workers are used only on emergencies and shared by
|
||||
* all cpus. Give MIN_NICE.
|
||||
|
|
@ -286,6 +288,7 @@ struct pool_workqueue {
|
|||
struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */
|
||||
struct list_head pwqs_node; /* WR: node on wq->pwqs */
|
||||
struct list_head mayday_node; /* MD: node on wq->maydays */
|
||||
struct work_struct mayday_cursor; /* L: cursor on pool->worklist */
|
||||
|
||||
u64 stats[PWQ_NR_STATS];
|
||||
|
||||
|
|
@ -1120,6 +1123,12 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool,
|
|||
return NULL;
|
||||
}
|
||||
|
||||
static void mayday_cursor_func(struct work_struct *work)
|
||||
{
|
||||
/* should not be processed, only for marking position */
|
||||
BUG();
|
||||
}
|
||||
|
||||
/**
|
||||
* move_linked_works - move linked works to a list
|
||||
* @work: start of series of works to be scheduled
|
||||
|
|
@ -1182,6 +1191,16 @@ static bool assign_work(struct work_struct *work, struct worker *worker,
|
|||
|
||||
lockdep_assert_held(&pool->lock);
|
||||
|
||||
/* The cursor work should not be processed */
|
||||
if (unlikely(work->func == mayday_cursor_func)) {
|
||||
/* only worker_thread() can possibly take this branch */
|
||||
WARN_ON_ONCE(worker->rescue_wq);
|
||||
if (nextp)
|
||||
*nextp = list_next_entry(work, entry);
|
||||
list_del_init(&work->entry);
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* A single work shouldn't be executed concurrently by multiple workers.
|
||||
* __queue_work() ensures that @work doesn't jump to a different pool
|
||||
|
|
@ -2976,9 +2995,8 @@ static void idle_cull_fn(struct work_struct *work)
|
|||
reap_dying_workers(&cull_list);
|
||||
}
|
||||
|
||||
static void send_mayday(struct work_struct *work)
|
||||
static void send_mayday(struct pool_workqueue *pwq)
|
||||
{
|
||||
struct pool_workqueue *pwq = get_work_pwq(work);
|
||||
struct workqueue_struct *wq = pwq->wq;
|
||||
|
||||
lockdep_assert_held(&wq_mayday_lock);
|
||||
|
|
@ -3016,7 +3034,7 @@ static void pool_mayday_timeout(struct timer_list *t)
|
|||
* rescuers.
|
||||
*/
|
||||
list_for_each_entry(work, &pool->worklist, entry)
|
||||
send_mayday(work);
|
||||
send_mayday(get_work_pwq(work));
|
||||
}
|
||||
|
||||
raw_spin_unlock(&wq_mayday_lock);
|
||||
|
|
@ -3440,22 +3458,57 @@ sleep:
|
|||
static bool assign_rescuer_work(struct pool_workqueue *pwq, struct worker *rescuer)
|
||||
{
|
||||
struct worker_pool *pool = pwq->pool;
|
||||
struct work_struct *cursor = &pwq->mayday_cursor;
|
||||
struct work_struct *work, *n;
|
||||
|
||||
/* need rescue? */
|
||||
if (!pwq->nr_active || !need_to_create_worker(pool))
|
||||
/* have work items to rescue? */
|
||||
if (!pwq->nr_active)
|
||||
return false;
|
||||
|
||||
/*
|
||||
* Slurp in all works issued via this workqueue and
|
||||
* process'em.
|
||||
*/
|
||||
list_for_each_entry_safe(work, n, &pool->worklist, entry) {
|
||||
if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n))
|
||||
pwq->stats[PWQ_STAT_RESCUED]++;
|
||||
/* need rescue? */
|
||||
if (!need_to_create_worker(pool)) {
|
||||
/*
|
||||
* The pool has idle workers and doesn't need the rescuer, so it
|
||||
* could simply return false here.
|
||||
*
|
||||
* However, the memory pressure might not be fully relieved.
|
||||
* In PERCPU pool with concurrency enabled, having idle workers
|
||||
* does not necessarily mean memory pressure is gone; it may
|
||||
* simply mean regular workers have woken up, completed their
|
||||
* work, and gone idle again due to concurrency limits.
|
||||
*
|
||||
* In this case, those working workers may later sleep again,
|
||||
* the pool may run out of idle workers, and it will have to
|
||||
* allocate new ones and wait for the timer to send mayday,
|
||||
* causing unnecessary delay - especially if memory pressure
|
||||
* was never resolved throughout.
|
||||
*
|
||||
* Do more work if memory pressure is still on to reduce
|
||||
* relapse, using (pool->flags & POOL_MANAGER_ACTIVE), though
|
||||
* not precisely, unless there are other PWQs needing help.
|
||||
*/
|
||||
if (!(pool->flags & POOL_MANAGER_ACTIVE) ||
|
||||
!list_empty(&pwq->wq->maydays))
|
||||
return false;
|
||||
}
|
||||
|
||||
return !list_empty(&rescuer->scheduled);
|
||||
/* search from the start or cursor if available */
|
||||
if (list_empty(&cursor->entry))
|
||||
work = list_first_entry(&pool->worklist, struct work_struct, entry);
|
||||
else
|
||||
work = list_next_entry(cursor, entry);
|
||||
|
||||
/* find the next work item to rescue */
|
||||
list_for_each_entry_safe_from(work, n, &pool->worklist, entry) {
|
||||
if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n)) {
|
||||
pwq->stats[PWQ_STAT_RESCUED]++;
|
||||
/* put the cursor for next search */
|
||||
list_move_tail(&cursor->entry, &n->entry);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -3512,6 +3565,7 @@ repeat:
|
|||
struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
|
||||
struct pool_workqueue, mayday_node);
|
||||
struct worker_pool *pool = pwq->pool;
|
||||
unsigned int count = 0;
|
||||
|
||||
__set_current_state(TASK_RUNNING);
|
||||
list_del_init(&pwq->mayday_node);
|
||||
|
|
@ -3524,31 +3578,27 @@ repeat:
|
|||
|
||||
WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
|
||||
|
||||
if (assign_rescuer_work(pwq, rescuer)) {
|
||||
while (assign_rescuer_work(pwq, rescuer)) {
|
||||
process_scheduled_works(rescuer);
|
||||
|
||||
/*
|
||||
* The above execution of rescued work items could
|
||||
* have created more to rescue through
|
||||
* pwq_activate_first_inactive() or chained
|
||||
* queueing. Let's put @pwq back on mayday list so
|
||||
* that such back-to-back work items, which may be
|
||||
* being used to relieve memory pressure, don't
|
||||
* incur MAYDAY_INTERVAL delay inbetween.
|
||||
* If the per-turn work item limit is reached and other
|
||||
* PWQs are in mayday, requeue mayday for this PWQ and
|
||||
* let the rescuer handle the other PWQs first.
|
||||
*/
|
||||
if (pwq->nr_active && need_to_create_worker(pool)) {
|
||||
if (++count > RESCUER_BATCH && !list_empty(&pwq->wq->maydays) &&
|
||||
pwq->nr_active && need_to_create_worker(pool)) {
|
||||
raw_spin_lock(&wq_mayday_lock);
|
||||
/*
|
||||
* Queue iff somebody else hasn't queued it already.
|
||||
*/
|
||||
if (list_empty(&pwq->mayday_node)) {
|
||||
get_pwq(pwq);
|
||||
list_add_tail(&pwq->mayday_node, &wq->maydays);
|
||||
}
|
||||
send_mayday(pwq);
|
||||
raw_spin_unlock(&wq_mayday_lock);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* The cursor cannot be left behind without the rescuer watching it. */
|
||||
if (!list_empty(&pwq->mayday_cursor.entry) && list_empty(&pwq->mayday_node))
|
||||
list_del_init(&pwq->mayday_cursor.entry);
|
||||
|
||||
/*
|
||||
* Leave this pool. Notify regular workers; otherwise, we end up
|
||||
* with 0 concurrency and stalling the execution.
|
||||
|
|
@ -5167,6 +5217,19 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
|
|||
INIT_LIST_HEAD(&pwq->pwqs_node);
|
||||
INIT_LIST_HEAD(&pwq->mayday_node);
|
||||
kthread_init_work(&pwq->release_work, pwq_release_workfn);
|
||||
|
||||
/*
|
||||
* Set the dummy cursor work with valid function and get_work_pwq().
|
||||
*
|
||||
* The cursor work should only be in the pwq->pool->worklist, and
|
||||
* should not be treated as a processable work item.
|
||||
*
|
||||
* WORK_STRUCT_PENDING and WORK_STRUCT_INACTIVE are set only to make it less
|
||||
* surprising for kernel debugging tools and reviewers.
|
||||
*/
|
||||
INIT_WORK(&pwq->mayday_cursor, mayday_cursor_func);
|
||||
atomic_long_set(&pwq->mayday_cursor.data, (unsigned long)pwq |
|
||||
WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | WORK_STRUCT_INACTIVE);
|
||||
}
|
||||
|
||||
/* sync @pwq with the current state of its associated wq and link it */
|
||||
|
|
@ -7508,9 +7571,13 @@ static struct timer_list wq_watchdog_timer;
|
|||
static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
|
||||
static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
|
||||
|
||||
static unsigned int wq_panic_on_stall;
|
||||
static unsigned int wq_panic_on_stall = CONFIG_BOOTPARAM_WQ_STALL_PANIC;
|
||||
module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644);
|
||||
|
||||
static unsigned int wq_panic_on_stall_time;
|
||||
module_param_named(panic_on_stall_time, wq_panic_on_stall_time, uint, 0644);
|
||||
MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds (0=disabled)");
|
||||
|
||||
/*
|
||||
* Show workers that might prevent the processing of pending work items.
|
||||
* The only candidates are CPU-bound workers in the running state.
|
||||
|
|
@ -7562,14 +7629,25 @@ static void show_cpu_pools_hogs(void)
|
|||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
static void panic_on_wq_watchdog(void)
|
||||
/*
|
||||
* Trigger a panic in two scenarios: when the total number of stalls
|
||||
* reaches wq_panic_on_stall, or when a single stall has lasted at least
|
||||
* wq_panic_on_stall_time seconds.
|
||||
*/
|
||||
static void panic_on_wq_watchdog(unsigned int stall_time_sec)
|
||||
{
|
||||
static unsigned int wq_stall;
|
||||
|
||||
if (wq_panic_on_stall) {
|
||||
wq_stall++;
|
||||
BUG_ON(wq_stall >= wq_panic_on_stall);
|
||||
if (wq_stall >= wq_panic_on_stall)
|
||||
panic("workqueue: %u stall(s) exceeded threshold %u\n",
|
||||
wq_stall, wq_panic_on_stall);
|
||||
}
|
||||
|
||||
if (wq_panic_on_stall_time && stall_time_sec >= wq_panic_on_stall_time)
|
||||
panic("workqueue: stall lasted %us, exceeding threshold %us\n",
|
||||
stall_time_sec, wq_panic_on_stall_time);
|
||||
}
|
||||
|
||||
static void wq_watchdog_reset_touched(void)
|
||||
|
|
@ -7584,10 +7662,12 @@ static void wq_watchdog_reset_touched(void)
|
|||
static void wq_watchdog_timer_fn(struct timer_list *unused)
|
||||
{
|
||||
unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
|
||||
unsigned int max_stall_time = 0;
|
||||
bool lockup_detected = false;
|
||||
bool cpu_pool_stall = false;
|
||||
unsigned long now = jiffies;
|
||||
struct worker_pool *pool;
|
||||
unsigned int stall_time;
|
||||
int pi;
|
||||
|
||||
if (!thresh)
|
||||
|
|
@ -7621,14 +7701,15 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
|
|||
/* did we stall? */
|
||||
if (time_after(now, ts + thresh)) {
|
||||
lockup_detected = true;
|
||||
stall_time = jiffies_to_msecs(now - pool_ts) / 1000;
|
||||
max_stall_time = max(max_stall_time, stall_time);
|
||||
if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) {
|
||||
pool->cpu_stall = true;
|
||||
cpu_pool_stall = true;
|
||||
}
|
||||
pr_emerg("BUG: workqueue lockup - pool");
|
||||
pr_cont_pool_info(pool);
|
||||
pr_cont(" stuck for %us!\n",
|
||||
jiffies_to_msecs(now - pool_ts) / 1000);
|
||||
pr_cont(" stuck for %us!\n", stall_time);
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -7641,7 +7722,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
|
|||
show_cpu_pools_hogs();
|
||||
|
||||
if (lockup_detected)
|
||||
panic_on_wq_watchdog();
|
||||
panic_on_wq_watchdog(max_stall_time);
|
||||
|
||||
wq_watchdog_reset_touched();
|
||||
mod_timer(&wq_watchdog_timer, jiffies + thresh);
|
||||
|
|
|
|||
|
|
@ -1322,6 +1322,29 @@ config WQ_WATCHDOG
|
|||
state. This can be configured through kernel parameter
|
||||
"workqueue.watchdog_thresh" and its sysfs counterpart.
|
||||
|
||||
config BOOTPARAM_WQ_STALL_PANIC
|
||||
int "Panic on Nth workqueue stall"
|
||||
default 0
|
||||
range 0 100
|
||||
depends on WQ_WATCHDOG
|
||||
help
|
||||
Set the number of workqueue stalls to trigger a kernel panic.
|
||||
A workqueue stall occurs when a worker pool doesn't make forward
|
||||
progress on a pending work item for over 30 seconds (configurable
|
||||
using the workqueue.watchdog_thresh parameter).
|
||||
|
||||
If n = 0, the kernel will not panic on stall. If n > 0, the kernel
|
||||
will panic after n stall warnings.
|
||||
|
||||
The panic can be used in combination with panic_timeout,
|
||||
to cause the system to reboot automatically after a
|
||||
stall has been detected. This feature is useful for
|
||||
high-availability systems that have uptime guarantees and
|
||||
where a stall must be resolved ASAP.
|
||||
|
||||
This setting can be overridden at runtime via the
|
||||
workqueue.panic_on_stall kernel parameter.
|
||||
|
||||
config WQ_CPU_INTENSIVE_REPORT
|
||||
bool "Report per-cpu work items which hog CPU for too long"
|
||||
depends on DEBUG_KERNEL
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue