linux/kernel/sched/stats.h
Wang Yaxin 503efe850c delayacct: add timestamp of delay max
Problem
=======
Commit 658eb5ab91 ("delayacct: add delay max to record delay peak")
introduced the delay max for getdelays, which records abnormal latency
peaks and helps us understand the magnitude of such delays.  However, the
peak latency value alone is insufficient for effective root cause
analysis.  Without the precise timestamp of when the peak occurred, we
still lack the critical context needed to correlate it with other system
events.

Solution
========
To address this, we need to additionally record a precise timestamp when
the maximum latency occurs.  By correlating this timestamp with system
logs and monitoring metrics, we can identify processes with abnormal
resource usage at the same moment, which can help us to pinpoint root
causes.

Use Case
========
bash-4.4# ./getdelays -d -t 227
print delayacct stats ON
TGID    227
CPU         count     real total  virtual total    delay total  delay average      delay max      delay min      delay max timestamp
               46      188000000      192348334        4098012          0.089ms     0.429260ms     0.051205ms    2026-01-15T15:06:58
IO          count    delay total  delay average      delay max      delay min      delay max timestamp
                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
SWAP        count    delay total  delay average      delay max      delay min      delay max timestamp
                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
RECLAIM     count    delay total  delay average      delay max      delay min      delay max timestamp
                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
THRAS HING   count    delay total  delay average      delay max      delay min      delay max timestamp
                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
COMPACT     count    delay total  delay average      delay max      delay min      delay max timestamp
                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
WPCOPY      count    delay total  delay average      delay max      delay min      delay max timestamp
              182       19413338          0.107ms     0.547353ms     0.022462ms    2026-01-15T15:05:24
IRQ         count    delay total  delay average      delay max      delay min      delay max timestamp
                0              0          0.000ms     0.000000ms     0.000000ms                    N/A

Link: https://lkml.kernel.org/r/20260119100241520gWubW8-5QfhSf9gjqcc_E@zte.com.cn
Signed-off-by: Wang Yaxin <wang.yaxin@zte.com.cn>
Cc: Fan Yu <fan.yu9@zte.com.cn>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yang Yang <yang.yang29@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2026-01-31 16:16:06 -08:00

348 lines
11 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _KERNEL_STATS_H
#define _KERNEL_STATS_H
#ifdef CONFIG_SCHEDSTATS
extern struct static_key_false sched_schedstats;
/*
* Expects runqueue lock to be held for atomicity of update
*/
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
{
if (rq) {
rq->rq_sched_info.run_delay += delta;
rq->rq_sched_info.pcount++;
}
}
/*
* Expects runqueue lock to be held for atomicity of update
*/
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{
if (rq)
rq->rq_cpu_time += delta;
}
static inline void
rq_sched_info_dequeue(struct rq *rq, unsigned long long delta)
{
if (rq)
rq->rq_sched_info.run_delay += delta;
}
#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
#define __schedstat_inc(var) do { var++; } while (0)
#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
#define __schedstat_add(var, amt) do { var += (amt); } while (0)
#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
#define __schedstat_set(var, val) do { var = (val); } while (0)
#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
#define schedstat_val(var) (var)
#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
void __update_stats_wait_start(struct rq *rq, struct task_struct *p,
struct sched_statistics *stats);
void __update_stats_wait_end(struct rq *rq, struct task_struct *p,
struct sched_statistics *stats);
void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
struct sched_statistics *stats);
static inline void
check_schedstat_required(void)
{
if (schedstat_enabled())
return;
/* Force schedstat enabled if a dependent tracepoint is active */
if (trace_sched_stat_wait_enabled() ||
trace_sched_stat_sleep_enabled() ||
trace_sched_stat_iowait_enabled() ||
trace_sched_stat_blocked_enabled() ||
trace_sched_stat_runtime_enabled())
printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, stat_blocked and stat_runtime require the kernel parameter schedstats=enable or kernel.sched_schedstats=1\n");
}
#else /* !CONFIG_SCHEDSTATS: */
static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { }
static inline void rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) { }
static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { }
# define schedstat_enabled() 0
# define __schedstat_inc(var) do { } while (0)
# define schedstat_inc(var) do { } while (0)
# define __schedstat_add(var, amt) do { } while (0)
# define schedstat_add(var, amt) do { } while (0)
# define __schedstat_set(var, val) do { } while (0)
# define schedstat_set(var, val) do { } while (0)
# define schedstat_val(var) 0
# define schedstat_val_or_zero(var) 0
# define __update_stats_wait_start(rq, p, stats) do { } while (0)
# define __update_stats_wait_end(rq, p, stats) do { } while (0)
# define __update_stats_enqueue_sleeper(rq, p, stats) do { } while (0)
# define check_schedstat_required() do { } while (0)
#endif /* CONFIG_SCHEDSTATS */
#ifdef CONFIG_FAIR_GROUP_SCHED
struct sched_entity_stats {
struct sched_entity se;
struct sched_statistics stats;
} __no_randomize_layout;
#endif
static inline struct sched_statistics *
__schedstats_from_se(struct sched_entity *se)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
if (!entity_is_task(se))
return &container_of(se, struct sched_entity_stats, se)->stats;
#endif
return &task_of(se)->stats;
}
#ifdef CONFIG_PSI
void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep);
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
struct task_struct *prev) {}
#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
/*
* PSI tracks state that persists across sleeps, such as iowaits and
* memory stalls. As a result, it has to distinguish between sleeps,
* where a task's runnable state changes, and migrations, where a task
* and its runnable state are being moved between CPUs and runqueues.
*
* A notable case is a task whose dequeue is delayed. PSI considers
* those sleeping, but because they are still on the runqueue they can
* go through migration requeues. In this case, *sleeping* states need
* to be transferred.
*/
static inline void psi_enqueue(struct task_struct *p, int flags)
{
int clear = 0, set = 0;
if (static_branch_likely(&psi_disabled))
return;
/* Same runqueue, nothing changed for psi */
if (flags & ENQUEUE_RESTORE)
return;
/* psi_sched_switch() will handle the flags */
if (task_on_cpu(task_rq(p), p))
return;
if (p->se.sched_delayed) {
/* CPU migration of "sleeping" task */
WARN_ON_ONCE(!(flags & ENQUEUE_MIGRATED));
if (p->in_memstall)
set |= TSK_MEMSTALL;
if (p->in_iowait)
set |= TSK_IOWAIT;
} else if (flags & ENQUEUE_MIGRATED) {
/* CPU migration of runnable task */
set = TSK_RUNNING;
if (p->in_memstall)
set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING;
} else {
/* Wakeup of new or sleeping task */
if (p->in_iowait)
clear |= TSK_IOWAIT;
set = TSK_RUNNING;
if (p->in_memstall)
set |= TSK_MEMSTALL_RUNNING;
}
psi_task_change(p, clear, set);
}
static inline void psi_dequeue(struct task_struct *p, int flags)
{
if (static_branch_likely(&psi_disabled))
return;
/* Same runqueue, nothing changed for psi */
if (flags & DEQUEUE_SAVE)
return;
/*
* A voluntary sleep is a dequeue followed by a task switch. To
* avoid walking all ancestors twice, psi_task_switch() handles
* TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
* Do nothing here.
*
* In the SCHED_PROXY_EXECUTION case we may do sleeping
* dequeues that are not followed by a task switch, so check
* TSK_ONCPU is set to ensure the task switch is imminent.
* Otherwise clear the flags as usual.
*/
if ((flags & DEQUEUE_SLEEP) && (p->psi_flags & TSK_ONCPU))
return;
/*
* When migrating a task to another CPU, clear all psi
* state. The enqueue callback above will work it out.
*/
psi_task_change(p, p->psi_flags, 0);
}
static inline void psi_ttwu_dequeue(struct task_struct *p)
{
if (static_branch_likely(&psi_disabled))
return;
/*
* Is the task being migrated during a wakeup? Make sure to
* deregister its sleep-persistent psi states from the old
* queue, and let psi_enqueue() know it has to requeue.
*/
if (unlikely(p->psi_flags)) {
struct rq_flags rf;
struct rq *rq;
rq = __task_rq_lock(p, &rf);
psi_task_change(p, p->psi_flags, 0);
__task_rq_unlock(rq, p, &rf);
}
}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep)
{
if (static_branch_likely(&psi_disabled))
return;
psi_task_switch(prev, next, sleep);
}
#else /* !CONFIG_PSI: */
static inline void psi_enqueue(struct task_struct *p, bool migrate) {}
static inline void psi_dequeue(struct task_struct *p, bool migrate) {}
static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
struct task_struct *prev) {}
#endif /* !CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
/*
* We are interested in knowing how long it was from the *first* time a
* task was queued to the time that it finally hit a CPU, we call this routine
* from dequeue_task() to account for possible rq->clock skew across CPUs. The
* delta taken on each CPU would annul the skew.
*/
static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
{
unsigned long long delta = 0;
if (!t->sched_info.last_queued)
return;
delta = rq_clock(rq) - t->sched_info.last_queued;
t->sched_info.last_queued = 0;
t->sched_info.run_delay += delta;
if (delta > t->sched_info.max_run_delay) {
t->sched_info.max_run_delay = delta;
ktime_get_real_ts64(&t->sched_info.max_run_delay_ts);
}
if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
t->sched_info.min_run_delay = delta;
rq_sched_info_dequeue(rq, delta);
}
/*
* Called when a task finally hits the CPU. We can now calculate how
* long it was waiting to run. We also note when it began so that we
* can keep stats on how long its time-slice is.
*/
static void sched_info_arrive(struct rq *rq, struct task_struct *t)
{
unsigned long long now, delta = 0;
if (!t->sched_info.last_queued)
return;
now = rq_clock(rq);
delta = now - t->sched_info.last_queued;
t->sched_info.last_queued = 0;
t->sched_info.run_delay += delta;
t->sched_info.last_arrival = now;
t->sched_info.pcount++;
if (delta > t->sched_info.max_run_delay) {
t->sched_info.max_run_delay = delta;
ktime_get_real_ts64(&t->sched_info.max_run_delay_ts);
}
if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
t->sched_info.min_run_delay = delta;
rq_sched_info_arrive(rq, delta);
}
/*
* This function is only called from enqueue_task(), but also only updates
* the timestamp if it is already not set. It's assumed that
* sched_info_dequeue() will clear that stamp when appropriate.
*/
static inline void sched_info_enqueue(struct rq *rq, struct task_struct *t)
{
if (!t->sched_info.last_queued)
t->sched_info.last_queued = rq_clock(rq);
}
/*
* Called when a process ceases being the active-running process involuntarily
* due, typically, to expiring its time slice (this may also be called when
* switching to the idle task). Now we can calculate how long we ran.
* Also, if the process is still in the TASK_RUNNING state, call
* sched_info_enqueue() to mark that it has now again started waiting on
* the runqueue.
*/
static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
{
unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival;
rq_sched_info_depart(rq, delta);
if (task_is_running(t))
sched_info_enqueue(rq, t);
}
/*
* Called when tasks are switched involuntarily due, typically, to expiring
* their time slice. (This may also be called when switching to or from
* the idle task.) We are only called when prev != next.
*/
static inline void
sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
/*
* prev now departs the CPU. It's not interesting to record
* stats about how efficient we were at scheduling the idle
* process, however.
*/
if (prev != rq->idle)
sched_info_depart(rq, prev);
if (next != rq->idle)
sched_info_arrive(rq, next);
}
#else /* !CONFIG_SCHED_INFO: */
# define sched_info_enqueue(rq, t) do { } while (0)
# define sched_info_dequeue(rq, t) do { } while (0)
# define sched_info_switch(rq, t, next) do { } while (0)
#endif /* !CONFIG_SCHED_INFO */
#endif /* _KERNEL_STATS_H */