sched_ext: Fixes for v7.0

- Various bug fixes for the example schedulers and selftests.
 -----BEGIN PGP SIGNATURE-----
 
 iIQEABYKACwWIQTfIjM1kS57o3GsC/uxYfJx3gVYGQUCaZkltg4cdGpAa2VybmVs
 Lm9yZwAKCRCxYfJx3gVYGR+3AQC2h+P8tZXpQ3tzNRgQ10KvekO+uetee4d3vb4O
 db5FDwEAh5Binq/tRSzLm7XEb/YmmP2XmHmhmnbRQL69dzMt5gM=
 =agf0
 -----END PGP SIGNATURE-----

Merge tag 'sched_ext-for-7.0-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext fixes from Tejun Heo:

 - Various bug fixes for the example schedulers and selftests

* tag 'sched_ext-for-7.0-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
  tools/sched_ext: fix getopt not re-parsed on restart
  tools/sched_ext: scx_userland: fix data races on shared counters
  tools/sched_ext: scx_pair: fix stride == 0 crash on single-CPU systems
  tools/sched_ext: scx_central: fix CPU_SET and skeleton leak on early exit
  tools/sched_ext: scx_userland: fix stale data on restart
  tools/sched_ext: scx_flatcg: fix potential stack overflow from VLA in fcg_read_stats
  selftests/sched_ext: Fix rt_stall flaky failure
  tools/sched_ext: scx_userland: fix restart and stats thread lifecycle bugs
  tools/sched_ext: scx_central: fix sched_setaffinity() call with the set size
  tools/sched_ext: scx_flatcg: zero-initialize stats counter array
This commit is contained in:
Linus Torvalds 2026-02-21 09:38:59 -08:00
commit 4cf4465788
8 changed files with 96 additions and 18 deletions

View file

@@ -50,11 +50,13 @@ int main(int argc, char **argv)
__u64 seq = 0, ecode;
__s32 opt;
cpu_set_t *cpuset;
size_t cpuset_size;
libbpf_set_print(libbpf_print_fn);
signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler);
restart:
optind = 1;
skel = SCX_OPS_OPEN(central_ops, scx_central);
skel->rodata->central_cpu = 0;
@@ -73,6 +75,7 @@ restart:
u32 central_cpu = strtoul(optarg, NULL, 0);
if (central_cpu >= skel->rodata->nr_cpu_ids) {
fprintf(stderr, "invalid central CPU id value, %u given (%u max)\n", central_cpu, skel->rodata->nr_cpu_ids);
scx_central__destroy(skel);
return -1;
}
skel->rodata->central_cpu = (s32)central_cpu;
@@ -106,9 +109,10 @@ restart:
*/
cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids);
SCX_BUG_ON(!cpuset, "Failed to allocate cpuset");
CPU_ZERO_S(CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset);
CPU_SET(skel->rodata->central_cpu, cpuset);
SCX_BUG_ON(sched_setaffinity(0, sizeof(*cpuset), cpuset),
cpuset_size = CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids);
CPU_ZERO_S(cpuset_size, cpuset);
CPU_SET_S(skel->rodata->central_cpu, cpuset_size, cpuset);
SCX_BUG_ON(sched_setaffinity(0, cpuset_size, cpuset),
"Failed to affinitize to central CPU %d (max %d)",
skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1);
CPU_FREE(cpuset);

View file

@@ -69,6 +69,7 @@ int main(int argc, char **argv)
signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler);
restart:
optind = 1;
skel = SCX_OPS_OPEN(cpu0_ops, scx_cpu0);
skel->rodata->nr_cpus = libbpf_num_possible_cpus();

View file

@@ -102,21 +102,27 @@ static float read_cpu_util(__u64 *last_sum, __u64 *last_idle)
static void fcg_read_stats(struct scx_flatcg *skel, __u64 *stats)
{
__u64 cnts[FCG_NR_STATS][skel->rodata->nr_cpus];
__u64 *cnts;
__u32 idx;
cnts = calloc(skel->rodata->nr_cpus, sizeof(__u64));
if (!cnts)
return;
memset(stats, 0, sizeof(stats[0]) * FCG_NR_STATS);
for (idx = 0; idx < FCG_NR_STATS; idx++) {
int ret, cpu;
ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
&idx, cnts[idx]);
&idx, cnts);
if (ret < 0)
continue;
for (cpu = 0; cpu < skel->rodata->nr_cpus; cpu++)
stats[idx] += cnts[idx][cpu];
stats[idx] += cnts[cpu];
}
free(cnts);
}
int main(int argc, char **argv)
@@ -135,6 +141,7 @@ int main(int argc, char **argv)
signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler);
restart:
optind = 1;
skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg);
skel->rodata->nr_cpus = libbpf_num_possible_cpus();

View file

@@ -53,10 +53,10 @@ int main(int argc, char **argv)
signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler);
restart:
optind = 1;
skel = SCX_OPS_OPEN(pair_ops, scx_pair);
skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus();
assert(skel->rodata->nr_cpu_ids > 0);
skel->rodata->pair_batch_dur_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
/* pair up the earlier half to the latter by default, override with -s */
@@ -76,6 +76,12 @@ restart:
}
}
/* Stride must be positive to pair distinct CPUs. */
if (stride <= 0) {
fprintf(stderr, "Invalid stride %d, must be positive\n", stride);
scx_pair__destroy(skel);
return -1;
}
bpf_map__set_max_entries(skel->maps.pair_ctx, skel->rodata->nr_cpu_ids / 2);
/* Resize arrays so their element count is equal to cpu count. */

View file

@@ -51,6 +51,7 @@ int main(int argc, char **argv)
signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler);
restart:
optind = 1;
skel = SCX_OPS_OPEN(sdt_ops, scx_sdt);
while ((opt = getopt(argc, argv, "fvh")) != -1) {

View file

@@ -71,6 +71,7 @@ int main(int argc, char **argv)
signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler);
restart:
optind = 1;
skel = SCX_OPS_OPEN(simple_ops, scx_simple);
while ((opt = getopt(argc, argv, "fvh")) != -1) {

View file

@@ -54,6 +54,7 @@ static bool verbose;
static volatile int exit_req;
static int enqueued_fd, dispatched_fd;
static pthread_t stats_printer;
static struct scx_userland *skel;
static struct bpf_link *ops_link;
@@ -156,9 +157,9 @@ static int dispatch_task(__s32 pid)
err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0);
if (err) {
nr_vruntime_failed++;
__atomic_add_fetch(&nr_vruntime_failed, 1, __ATOMIC_RELAXED);
} else {
nr_vruntime_dispatches++;
__atomic_add_fetch(&nr_vruntime_dispatches, 1, __ATOMIC_RELAXED);
}
return err;
@@ -201,8 +202,8 @@ static int vruntime_enqueue(const struct scx_userland_enqueued_task *bpf_task)
return ENOENT;
update_enqueued(curr, bpf_task);
nr_vruntime_enqueues++;
nr_curr_enqueued++;
__atomic_add_fetch(&nr_vruntime_enqueues, 1, __ATOMIC_RELAXED);
__atomic_add_fetch(&nr_curr_enqueued, 1, __ATOMIC_RELAXED);
/*
* Enqueue the task in a vruntime-sorted list. A more optimal data
@@ -278,9 +279,9 @@ static void dispatch_batch(void)
LIST_INSERT_HEAD(&vruntime_head, task, entries);
break;
}
nr_curr_enqueued--;
__atomic_sub_fetch(&nr_curr_enqueued, 1, __ATOMIC_RELAXED);
}
skel->bss->nr_scheduled = nr_curr_enqueued;
skel->bss->nr_scheduled = __atomic_load_n(&nr_curr_enqueued, __ATOMIC_RELAXED);
}
static void *run_stats_printer(void *arg)
@@ -305,9 +306,9 @@ static void *run_stats_printer(void *arg)
printf("|-----------------------|\n");
printf("| VRUNTIME / USER |\n");
printf("|-----------------------|\n");
printf("| enq: %10llu |\n", nr_vruntime_enqueues);
printf("| disp: %10llu |\n", nr_vruntime_dispatches);
printf("| failed: %10llu |\n", nr_vruntime_failed);
printf("| enq: %10llu |\n", __atomic_load_n(&nr_vruntime_enqueues, __ATOMIC_RELAXED));
printf("| disp: %10llu |\n", __atomic_load_n(&nr_vruntime_dispatches, __ATOMIC_RELAXED));
printf("| failed: %10llu |\n", __atomic_load_n(&nr_vruntime_failed, __ATOMIC_RELAXED));
printf("o-----------------------o\n");
printf("\n\n");
fflush(stdout);
@@ -319,8 +320,6 @@ static void *run_stats_printer(void *arg)
static int spawn_stats_thread(void)
{
pthread_t stats_printer;
return pthread_create(&stats_printer, NULL, run_stats_printer, NULL);
}
@@ -375,6 +374,15 @@ static void pre_bootstrap(int argc, char **argv)
static void bootstrap(char *comm)
{
exit_req = 0;
min_vruntime = 0.0;
__atomic_store_n(&nr_vruntime_enqueues, 0, __ATOMIC_RELAXED);
__atomic_store_n(&nr_vruntime_dispatches, 0, __ATOMIC_RELAXED);
__atomic_store_n(&nr_vruntime_failed, 0, __ATOMIC_RELAXED);
__atomic_store_n(&nr_curr_enqueued, 0, __ATOMIC_RELAXED);
memset(tasks, 0, pid_max * sizeof(*tasks));
LIST_INIT(&vruntime_head);
skel = SCX_OPS_OPEN(userland_ops, scx_userland);
skel->rodata->num_possible_cpus = libbpf_num_possible_cpus();
@@ -428,6 +436,7 @@ restart:
exit_req = 1;
bpf_link__destroy(ops_link);
pthread_join(stats_printer, NULL);
ecode = UEI_REPORT(skel, uei);
scx_userland__destroy(skel);

View file

@@ -23,6 +23,30 @@
#define CORE_ID 0 /* CPU to pin tasks to */
#define RUN_TIME 5 /* How long to run the test in seconds */
/*
 * Tell the waiting parent that this child has finished its setup.
 *
 * A single byte is pushed through the write end of a ready pipe; the
 * descriptor is closed afterwards so the parent's read end sees EOF if
 * anything else goes wrong later. On a short or failed write the test
 * cannot proceed meaningfully, so bail out hard.
 */
static void signal_ready(int fd)
{
	char token = 1;

	if (write(fd, &token, sizeof(token)) != sizeof(token)) {
		perror("write to ready pipe");
		exit(EXIT_FAILURE);
	}
	close(fd);
}
/*
 * Block until a child announces readiness on its ready pipe.
 *
 * Reads the single byte written by signal_ready(); a failed or empty
 * read means the child died before completing setup, which makes the
 * measurement invalid, so exit with failure. The read end is closed
 * once the handshake completes.
 */
static void wait_ready(int fd)
{
	char token;

	if (read(fd, &token, sizeof(token)) != sizeof(token)) {
		perror("read from ready pipe");
		exit(EXIT_FAILURE);
	}
	close(fd);
}
/* Simple busy-wait function for test tasks */
static void process_func(void)
{
@@ -122,14 +146,24 @@ static bool sched_stress_test(bool is_ext)
float ext_runtime, rt_runtime, actual_ratio;
int ext_pid, rt_pid;
int ext_ready[2], rt_ready[2];
ksft_print_header();
ksft_set_plan(1);
if (pipe(ext_ready) || pipe(rt_ready)) {
perror("pipe");
ksft_exit_fail();
}
/* Create and set up a EXT task */
ext_pid = fork();
if (ext_pid == 0) {
close(ext_ready[0]);
close(rt_ready[0]);
close(rt_ready[1]);
set_affinity(CORE_ID);
signal_ready(ext_ready[1]);
process_func();
exit(0);
} else if (ext_pid < 0) {
@@ -140,8 +174,12 @@ static bool sched_stress_test(bool is_ext)
/* Create an RT task */
rt_pid = fork();
if (rt_pid == 0) {
close(ext_ready[0]);
close(ext_ready[1]);
close(rt_ready[0]);
set_affinity(CORE_ID);
set_sched(SCHED_FIFO, 50);
signal_ready(rt_ready[1]);
process_func();
exit(0);
} else if (rt_pid < 0) {
@@ -149,6 +187,17 @@ static bool sched_stress_test(bool is_ext)
ksft_exit_fail();
}
/*
* Wait for both children to complete their setup (affinity and
* scheduling policy) before starting the measurement window.
* This prevents flaky failures caused by the RT child's setup
* time eating into the measurement period.
*/
close(ext_ready[1]);
close(rt_ready[1]);
wait_ready(ext_ready[0]);
wait_ready(rt_ready[0]);
/* Let the processes run for the specified time */
sleep(RUN_TIME);