mirror of
https://github.com/torvalds/linux.git
synced 2026-03-09 04:16:35 +01:00
tools/sched_ext: add arena based scheduler
Add a scheduler that uses BPF arenas to manage task context data. Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com> Signed-off-by: Tejun Heo <tj@kernel.org>
This commit is contained in:
parent
f0262b102c
commit
36929ebd17
4 changed files with 925 additions and 1 deletions
|
|
@ -189,7 +189,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP
|
|||
|
||||
SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR)
|
||||
|
||||
c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland scx_pair
|
||||
c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland scx_pair scx_sdt
|
||||
|
||||
$(addprefix $(BINDIR)/,$(c-sched-targets)): \
|
||||
$(BINDIR)/%: \
|
||||
|
|
|
|||
710
tools/sched_ext/scx_sdt.bpf.c
Normal file
710
tools/sched_ext/scx_sdt.bpf.c
Normal file
|
|
@ -0,0 +1,710 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* Arena-based task data scheduler. This is a variation of scx_simple
|
||||
* that uses a combined allocator and indexing structure to organize
|
||||
* task data. Task context allocation is done when a task enters the
|
||||
* scheduler, while freeing is done when it exits. Task contexts are
|
||||
* retrieved from task-local storage, pointing to the allocated memory.
|
||||
*
|
||||
 * The main purpose of this scheduler is to demonstrate arena memory
|
||||
* management.
|
||||
*
|
||||
* Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates.
|
||||
* Copyright (c) 2024-2025 Emil Tsalapatis <etsal@meta.com>
|
||||
* Copyright (c) 2024-2025 Tejun Heo <tj@kernel.org>
|
||||
*
|
||||
*/
|
||||
#include <scx/common.bpf.h>
|
||||
#include <scx/bpf_arena_common.bpf.h>
|
||||
|
||||
#include "scx_sdt.h"
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
|
||||
UEI_DEFINE(uei);
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_ARENA);
|
||||
__uint(map_flags, BPF_F_MMAPABLE);
|
||||
#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
|
||||
__uint(max_entries, 1 << 16); /* number of pages */
|
||||
__ulong(map_extra, (1ull << 32)); /* start of mmap() region */
|
||||
#else
|
||||
__uint(max_entries, 1 << 20); /* number of pages */
|
||||
__ulong(map_extra, (1ull << 44)); /* start of mmap() region */
|
||||
#endif
|
||||
} arena __weak SEC(".maps");
|
||||
|
||||
#define SHARED_DSQ 0
|
||||
|
||||
#define DEFINE_SDT_STAT(metric) \
|
||||
static inline void \
|
||||
stat_inc_##metric(struct scx_stats __arena *stats) \
|
||||
{ \
|
||||
cast_kern(stats); \
|
||||
stats->metric += 1; \
|
||||
} \
|
||||
__u64 stat_##metric; \
|
||||
|
||||
DEFINE_SDT_STAT(enqueue);
|
||||
DEFINE_SDT_STAT(init);
|
||||
DEFINE_SDT_STAT(exit);
|
||||
DEFINE_SDT_STAT(select_idle_cpu);
|
||||
DEFINE_SDT_STAT(select_busy_cpu);
|
||||
|
||||
/*
|
||||
* Necessary for cond_break/can_loop's semantics. According to kernel commit
|
||||
* 011832b, the loop counter variable must be seen as imprecise and bounded
|
||||
* by the verifier. Initializing it from a constant (e.g., i = 0;), then,
|
||||
* makes it precise and prevents may_goto from helping with converging the
|
||||
* loop. For these loops we must initialize the loop counter from a variable
|
||||
* whose value the verifier cannot reason about when checking the program, so
|
||||
* that the loop counter's value is imprecise.
|
||||
*/
|
||||
static __u64 zero = 0;
|
||||
|
||||
/*
|
||||
* XXX Hack to get the verifier to find the arena for sdt_exit_task.
|
||||
* As of 6.12-rc5, The verifier associates arenas with programs by
|
||||
* checking LD.IMM instruction operands for an arena and populating
|
||||
* the program state with the first instance it finds. This requires
|
||||
* accessing our global arena variable, but scx methods do not necessarily
|
||||
* do so while still using pointers from that arena. Insert a bpf_printk
|
||||
* statement that triggers at most once to generate an LD.IMM instruction
|
||||
* to access the arena and help the verifier.
|
||||
*/
|
||||
static volatile bool scx_arena_verify_once;
|
||||
|
||||
__hidden void scx_arena_subprog_init(void)
|
||||
{
|
||||
if (scx_arena_verify_once)
|
||||
return;
|
||||
|
||||
bpf_printk("%s: arena pointer %p", __func__, &arena);
|
||||
scx_arena_verify_once = true;
|
||||
}
|
||||
|
||||
|
||||
private(LOCK) struct bpf_spin_lock alloc_lock;
|
||||
private(POOL_LOCK) struct bpf_spin_lock alloc_pool_lock;
|
||||
|
||||
/* allocation pools */
|
||||
struct sdt_pool desc_pool;
|
||||
struct sdt_pool chunk_pool;
|
||||
|
||||
/* Protected by alloc_lock. */
|
||||
struct scx_alloc_stats alloc_stats;
|
||||
|
||||
|
||||
/* Allocate element from the pool. Must be called with a then pool lock held. */
|
||||
static
|
||||
void __arena *scx_alloc_from_pool(struct sdt_pool *pool)
|
||||
{
|
||||
__u64 elem_size, max_elems;
|
||||
void __arena *slab;
|
||||
void __arena *ptr;
|
||||
|
||||
elem_size = pool->elem_size;
|
||||
max_elems = pool->max_elems;
|
||||
|
||||
/* If the chunk is spent, get a new one. */
|
||||
if (pool->idx >= max_elems) {
|
||||
slab = bpf_arena_alloc_pages(&arena, NULL,
|
||||
div_round_up(max_elems * elem_size, PAGE_SIZE), NUMA_NO_NODE, 0);
|
||||
if (!slab)
|
||||
return NULL;
|
||||
|
||||
pool->slab = slab;
|
||||
pool->idx = 0;
|
||||
}
|
||||
|
||||
ptr = (void __arena *)((__u64) pool->slab + elem_size * pool->idx);
|
||||
pool->idx += 1;
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/* Alloc desc and associated chunk. Called with the allocator spinlock held. */
|
||||
static sdt_desc_t *scx_alloc_chunk(void)
|
||||
{
|
||||
struct sdt_chunk __arena *chunk;
|
||||
sdt_desc_t *desc;
|
||||
sdt_desc_t *out;
|
||||
|
||||
chunk = scx_alloc_from_pool(&chunk_pool);
|
||||
if (!chunk)
|
||||
return NULL;
|
||||
|
||||
desc = scx_alloc_from_pool(&desc_pool);
|
||||
if (!desc) {
|
||||
/*
|
||||
* Effectively frees the previous chunk allocation.
|
||||
* Index cannot be 0, so decrementing is always
|
||||
* valid.
|
||||
*/
|
||||
chunk_pool.idx -= 1;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
out = desc;
|
||||
|
||||
desc->nr_free = SDT_TASK_ENTS_PER_CHUNK;
|
||||
desc->chunk = chunk;
|
||||
|
||||
alloc_stats.chunk_allocs += 1;
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
/*
 * Configure @pool to hand out @data_size-byte elements carved from slabs
 * of @nr_pages pages. Element size must be nonzero and word-aligned.
 * Returns 0 on success, -EINVAL on invalid parameters.
 */
static int pool_set_size(struct sdt_pool *pool, __u64 data_size, __u64 nr_pages)
{
	/* A zero size would pass the alignment check and divide by zero below. */
	if (unlikely(data_size == 0 || data_size % 8))
		return -EINVAL;

	if (unlikely(nr_pages == 0))
		return -EINVAL;

	pool->elem_size = data_size;
	pool->max_elems = (PAGE_SIZE * nr_pages) / pool->elem_size;
	/* Start exhausted so the first allocation populates the slab. */
	pool->idx = pool->max_elems;

	return 0;
}
|
||||
|
||||
/* Initialize both the base pool allocators and the root chunk of the index. */
|
||||
__hidden int
|
||||
scx_alloc_init(struct scx_allocator *alloc, __u64 data_size)
|
||||
{
|
||||
size_t min_chunk_size;
|
||||
int ret;
|
||||
|
||||
_Static_assert(sizeof(struct sdt_chunk) <= PAGE_SIZE,
|
||||
"chunk size must fit into a page");
|
||||
|
||||
ret = pool_set_size(&chunk_pool, sizeof(struct sdt_chunk), 1);
|
||||
if (ret != 0)
|
||||
return ret;
|
||||
|
||||
ret = pool_set_size(&desc_pool, sizeof(struct sdt_desc), 1);
|
||||
if (ret != 0)
|
||||
return ret;
|
||||
|
||||
/* Wrap data into a descriptor and word align. */
|
||||
data_size += sizeof(struct sdt_data);
|
||||
data_size = round_up(data_size, 8);
|
||||
|
||||
/*
|
||||
* Ensure we allocate large enough chunks from the arena to avoid excessive
|
||||
* internal fragmentation when turning chunks it into structs.
|
||||
*/
|
||||
min_chunk_size = div_round_up(SDT_TASK_MIN_ELEM_PER_ALLOC * data_size, PAGE_SIZE);
|
||||
ret = pool_set_size(&alloc->pool, data_size, min_chunk_size);
|
||||
if (ret != 0)
|
||||
return ret;
|
||||
|
||||
bpf_spin_lock(&alloc_lock);
|
||||
alloc->root = scx_alloc_chunk();
|
||||
bpf_spin_unlock(&alloc_lock);
|
||||
if (!alloc->root)
|
||||
return -ENOMEM;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static
|
||||
int set_idx_state(sdt_desc_t *desc, __u64 pos, bool state)
|
||||
{
|
||||
__u64 __arena *allocated = desc->allocated;
|
||||
__u64 bit;
|
||||
|
||||
if (unlikely(pos >= SDT_TASK_ENTS_PER_CHUNK))
|
||||
return -EINVAL;
|
||||
|
||||
bit = (__u64)1 << (pos % 64);
|
||||
|
||||
if (state)
|
||||
allocated[pos / 64] |= bit;
|
||||
else
|
||||
allocated[pos / 64] &= ~bit;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static __noinline
|
||||
int mark_nodes_avail(sdt_desc_t *lv_desc[SDT_TASK_LEVELS], __u64 lv_pos[SDT_TASK_LEVELS])
|
||||
{
|
||||
sdt_desc_t *desc;
|
||||
__u64 u, level;
|
||||
int ret;
|
||||
|
||||
for (u = zero; u < SDT_TASK_LEVELS && can_loop; u++) {
|
||||
level = SDT_TASK_LEVELS - 1 - u;
|
||||
|
||||
/* Only propagate upwards if we are the parent's only free chunk. */
|
||||
desc = lv_desc[level];
|
||||
|
||||
ret = set_idx_state(desc, lv_pos[level], false);
|
||||
if (unlikely(ret != 0))
|
||||
return ret;
|
||||
|
||||
desc->nr_free += 1;
|
||||
if (desc->nr_free > 1)
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Free the allocated struct with the given index. Called with the
|
||||
* allocator lock taken.
|
||||
*/
|
||||
__hidden
|
||||
int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx)
|
||||
{
|
||||
const __u64 mask = (1 << SDT_TASK_ENTS_PER_PAGE_SHIFT) - 1;
|
||||
sdt_desc_t *lv_desc[SDT_TASK_LEVELS];
|
||||
sdt_desc_t * __arena *desc_children;
|
||||
struct sdt_chunk __arena *chunk;
|
||||
sdt_desc_t *desc;
|
||||
struct sdt_data __arena *data;
|
||||
__u64 level, shift, pos;
|
||||
__u64 lv_pos[SDT_TASK_LEVELS];
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
if (!alloc)
|
||||
return 0;
|
||||
|
||||
desc = alloc->root;
|
||||
if (unlikely(!desc))
|
||||
return -EINVAL;
|
||||
|
||||
/* To appease the verifier. */
|
||||
for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
|
||||
lv_desc[level] = NULL;
|
||||
lv_pos[level] = 0;
|
||||
}
|
||||
|
||||
/* Find the leaf node containing the index. */
|
||||
for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
|
||||
shift = (SDT_TASK_LEVELS - 1 - level) * SDT_TASK_ENTS_PER_PAGE_SHIFT;
|
||||
pos = (idx >> shift) & mask;
|
||||
|
||||
lv_desc[level] = desc;
|
||||
lv_pos[level] = pos;
|
||||
|
||||
if (level == SDT_TASK_LEVELS - 1)
|
||||
break;
|
||||
|
||||
chunk = desc->chunk;
|
||||
|
||||
desc_children = (sdt_desc_t * __arena *)chunk->descs;
|
||||
desc = desc_children[pos];
|
||||
|
||||
if (unlikely(!desc))
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
chunk = desc->chunk;
|
||||
|
||||
pos = idx & mask;
|
||||
data = chunk->data[pos];
|
||||
if (likely(data)) {
|
||||
data[pos] = (struct sdt_data) {
|
||||
.tid.genn = data->tid.genn + 1,
|
||||
};
|
||||
|
||||
/* Zero out one word at a time. */
|
||||
for (i = zero; i < alloc->pool.elem_size / 8 && can_loop; i++) {
|
||||
data->payload[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
ret = mark_nodes_avail(lv_desc, lv_pos);
|
||||
if (unlikely(ret != 0))
|
||||
return ret;
|
||||
|
||||
alloc_stats.active_allocs -= 1;
|
||||
alloc_stats.free_ops += 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline
|
||||
int ffs(__u64 word)
|
||||
{
|
||||
unsigned int num = 0;
|
||||
|
||||
if ((word & 0xffffffff) == 0) {
|
||||
num += 32;
|
||||
word >>= 32;
|
||||
}
|
||||
|
||||
if ((word & 0xffff) == 0) {
|
||||
num += 16;
|
||||
word >>= 16;
|
||||
}
|
||||
|
||||
if ((word & 0xff) == 0) {
|
||||
num += 8;
|
||||
word >>= 8;
|
||||
}
|
||||
|
||||
if ((word & 0xf) == 0) {
|
||||
num += 4;
|
||||
word >>= 4;
|
||||
}
|
||||
|
||||
if ((word & 0x3) == 0) {
|
||||
num += 2;
|
||||
word >>= 2;
|
||||
}
|
||||
|
||||
if ((word & 0x1) == 0) {
|
||||
num += 1;
|
||||
word >>= 1;
|
||||
}
|
||||
|
||||
return num;
|
||||
}
|
||||
|
||||
|
||||
/* find the first empty slot */
|
||||
__hidden
|
||||
__u64 chunk_find_empty(sdt_desc_t __arg_arena *desc)
|
||||
{
|
||||
__u64 freeslots;
|
||||
__u64 i;
|
||||
|
||||
for (i = 0; i < SDT_TASK_CHUNK_BITMAP_U64S; i++) {
|
||||
freeslots = ~desc->allocated[i];
|
||||
if (freeslots == (__u64)0)
|
||||
continue;
|
||||
|
||||
return (i * 64) + ffs(freeslots);
|
||||
}
|
||||
|
||||
return SDT_TASK_ENTS_PER_CHUNK;
|
||||
}
|
||||
|
||||
/*
 * Find and return an available idx on the allocator.
 * Called with the task spinlock held.
 *
 * Walks from the root towards the leaves, picking the first free slot at
 * each level and allocating internal chunks on demand. On success returns
 * the leaf descriptor and stores the composed index in *idxp; returns
 * NULL when the index is full or an allocation fails.
 */
static sdt_desc_t * desc_find_empty(sdt_desc_t *desc, __u64 *idxp)
{
	sdt_desc_t *lv_desc[SDT_TASK_LEVELS];
	sdt_desc_t * __arena *desc_children;
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *tmp;
	__u64 lv_pos[SDT_TASK_LEVELS];
	__u64 u, pos, level;
	__u64 idx = 0;
	int ret;

	/* Loop counter initialized from `zero` to stay verifier-imprecise. */
	for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
		pos = chunk_find_empty(desc);

		/* If we error out, something has gone very wrong. */
		if (unlikely(pos > SDT_TASK_ENTS_PER_CHUNK))
			return NULL;

		/* Chunk is full: nothing to hand out at this level. */
		if (pos == SDT_TASK_ENTS_PER_CHUNK)
			return NULL;

		/* Compose the final index, one level's worth of bits at a time. */
		idx <<= SDT_TASK_ENTS_PER_PAGE_SHIFT;
		idx |= pos;

		/* Log the levels to complete allocation. */
		lv_desc[level] = desc;
		lv_pos[level] = pos;

		/* The rest of the loop is for internal node traversal. */
		if (level == SDT_TASK_LEVELS - 1)
			break;

		/* Allocate an internal node if necessary. */
		chunk = desc->chunk;
		desc_children = (sdt_desc_t * __arena *)chunk->descs;

		desc = desc_children[pos];
		if (!desc) {
			desc = scx_alloc_chunk();
			if (!desc)
				return NULL;

			desc_children[pos] = desc;
		}
	}

	/*
	 * Finding the descriptor along with any internal node
	 * allocations was successful. Update all levels with
	 * the new allocation.
	 */
	bpf_for(u, 0, SDT_TASK_LEVELS) {
		level = SDT_TASK_LEVELS - 1 - u;
		tmp = lv_desc[level];

		ret = set_idx_state(tmp, lv_pos[level], true);
		if (ret != 0)
			break;

		/* Stop propagating once a level still has free slots. */
		tmp->nr_free -= 1;
		if (tmp->nr_free > 0)
			break;
	}

	*idxp = idx;

	return desc;
}
|
||||
|
||||
/*
 * Allocate an element from @alloc and return an arena pointer to it, or
 * NULL on failure. Holds the allocator spinlock across the index search
 * and leaf population; every error path must unlock before returning.
 */
__hidden
void __arena *scx_alloc(struct scx_allocator *alloc)
{
	struct sdt_data __arena *data = NULL;
	struct sdt_chunk __arena *chunk;
	sdt_desc_t *desc;
	__u64 idx, pos;

	if (!alloc)
		return NULL;

	bpf_spin_lock(&alloc_lock);

	/* We unlock if we encounter an error in the function. */
	desc = desc_find_empty(alloc->root, &idx);
	if (unlikely(desc == NULL)) {
		bpf_spin_unlock(&alloc_lock);
		return NULL;
	}

	chunk = desc->chunk;

	/* Populate the leaf node if necessary. */
	pos = idx & (SDT_TASK_ENTS_PER_CHUNK - 1);
	data = chunk->data[pos];
	if (!data) {
		data = scx_alloc_from_pool(&alloc->pool);
		if (!data) {
			/* Roll back the index reservation made above. */
			scx_alloc_free_idx(alloc, idx);
			bpf_spin_unlock(&alloc_lock);
			return NULL;
		}
	}

	chunk->data[pos] = data;

	/* The data counts as a chunk */
	alloc_stats.data_allocs += 1;
	alloc_stats.alloc_ops += 1;
	alloc_stats.active_allocs += 1;

	data->tid.idx = idx;

	bpf_spin_unlock(&alloc_lock);

	return data;
}
|
||||
|
||||
/*
 * Task BPF map entry recording the task's assigned ID and pointing to the data
 * area allocated in arena.
 */
struct scx_task_map_val {
	union sdt_id		tid;	/* allocator id: index + generation */
	__u64			tptr;	/* task_struct address it was created for */
	struct sdt_data __arena	*data;	/* arena-resident per-task data */
};

/* Task-local storage mapping each task to its scx_task_map_val. */
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct scx_task_map_val);
} scx_task_map SEC(".maps");
|
||||
|
||||
static struct scx_allocator scx_task_allocator;
|
||||
|
||||
__hidden
|
||||
void __arena *scx_task_alloc(struct task_struct *p)
|
||||
{
|
||||
struct sdt_data __arena *data = NULL;
|
||||
struct scx_task_map_val *mval;
|
||||
|
||||
mval = bpf_task_storage_get(&scx_task_map, p, 0,
|
||||
BPF_LOCAL_STORAGE_GET_F_CREATE);
|
||||
if (!mval)
|
||||
return NULL;
|
||||
|
||||
data = scx_alloc(&scx_task_allocator);
|
||||
if (unlikely(!data))
|
||||
return NULL;
|
||||
|
||||
mval->tid = data->tid;
|
||||
mval->tptr = (__u64) p;
|
||||
mval->data = data;
|
||||
|
||||
return (void __arena *)data->payload;
|
||||
}
|
||||
|
||||
/* Set up the task allocator with @data_size bytes of per-task payload. */
__hidden
int scx_task_init(__u64 data_size)
{
	return scx_alloc_init(&scx_task_allocator, data_size);
}
|
||||
|
||||
__hidden
|
||||
void __arena *scx_task_data(struct task_struct *p)
|
||||
{
|
||||
struct sdt_data __arena *data;
|
||||
struct scx_task_map_val *mval;
|
||||
|
||||
scx_arena_subprog_init();
|
||||
|
||||
mval = bpf_task_storage_get(&scx_task_map, p, 0, 0);
|
||||
if (!mval)
|
||||
return NULL;
|
||||
|
||||
data = mval->data;
|
||||
|
||||
return (void __arena *)data->payload;
|
||||
}
|
||||
|
||||
__hidden
|
||||
void scx_task_free(struct task_struct *p)
|
||||
{
|
||||
struct scx_task_map_val *mval;
|
||||
|
||||
scx_arena_subprog_init();
|
||||
|
||||
mval = bpf_task_storage_get(&scx_task_map, p, 0, 0);
|
||||
if (!mval)
|
||||
return;
|
||||
|
||||
bpf_spin_lock(&alloc_lock);
|
||||
scx_alloc_free_idx(&scx_task_allocator, mval->tid.idx);
|
||||
bpf_spin_unlock(&alloc_lock);
|
||||
|
||||
bpf_task_storage_delete(&scx_task_map, p);
|
||||
}
|
||||
|
||||
/* Fold a task's arena-resident counters into the global stat totals. */
static inline void
scx_stat_global_update(struct scx_stats __arena *stats)
{
	cast_kern(stats);
	__sync_fetch_and_add(&stat_enqueue, stats->enqueue);
	__sync_fetch_and_add(&stat_init, stats->init);
	__sync_fetch_and_add(&stat_exit, stats->exit);
	__sync_fetch_and_add(&stat_select_idle_cpu, stats->select_idle_cpu);
	__sync_fetch_and_add(&stat_select_busy_cpu, stats->select_busy_cpu);
}
|
||||
|
||||
/* Pick a CPU for a waking task, counting idle vs. busy outcomes. */
s32 BPF_STRUCT_OPS(sdt_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	struct scx_stats __arena *stats;
	bool found_idle = false;
	s32 cpu;

	stats = scx_task_data(p);
	if (!stats) {
		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
		return 0;
	}

	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &found_idle);
	if (!found_idle) {
		stat_inc_select_busy_cpu(stats);
		return cpu;
	}

	/* Idle CPU found: dispatch straight to its local DSQ. */
	stat_inc_select_idle_cpu(stats);
	scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

	return cpu;
}
|
||||
|
||||
/* Count the enqueue and push the task onto the shared DSQ. */
void BPF_STRUCT_OPS(sdt_enqueue, struct task_struct *p, u64 enq_flags)
{
	struct scx_stats __arena *stats = scx_task_data(p);

	if (!stats) {
		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
		return;
	}

	stat_inc_enqueue(stats);
	scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
}
|
||||
|
||||
/* Refill the local DSQ of @cpu from the single shared DSQ. */
void BPF_STRUCT_OPS(sdt_dispatch, s32 cpu, struct task_struct *prev)
{
	scx_bpf_dsq_move_to_local(SHARED_DSQ);
}
|
||||
|
||||
/* Allocate per-task arena stats when a task enters the scheduler. */
s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init_task, struct task_struct *p,
			     struct scx_init_task_args *args)
{
	struct scx_stats __arena *stats = scx_task_alloc(p);

	if (!stats) {
		scx_bpf_error("arena allocator out of memory");
		return -ENOMEM;
	}

	stats->pid = p->pid;
	stat_inc_init(stats);

	return 0;
}
|
||||
|
||||
/* Fold the task's counters into the global totals and free its data. */
void BPF_STRUCT_OPS(sdt_exit_task, struct task_struct *p,
		    struct scx_exit_task_args *args)
{
	struct scx_stats __arena *stats = scx_task_data(p);

	if (!stats) {
		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
		return;
	}

	stat_inc_exit(stats);
	scx_stat_global_update(stats);

	scx_task_free(p);
}
|
||||
|
||||
s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = scx_task_init(sizeof(struct scx_stats));
|
||||
if (ret < 0) {
|
||||
scx_bpf_error("%s: failed with %d", __func__, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
return scx_bpf_create_dsq(SHARED_DSQ, -1);
|
||||
}
|
||||
|
||||
/* Record exit info so userspace can report it and decide on restart. */
void BPF_STRUCT_OPS(sdt_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}
|
||||
|
||||
/* Wire the callbacks above into the sched_ext ops table. */
SCX_OPS_DEFINE(sdt_ops,
	       .select_cpu		= (void *)sdt_select_cpu,
	       .enqueue			= (void *)sdt_enqueue,
	       .dispatch		= (void *)sdt_dispatch,
	       .init_task		= (void *)sdt_init_task,
	       .exit_task		= (void *)sdt_exit_task,
	       .init			= (void *)sdt_init,
	       .exit			= (void *)sdt_exit,
	       .name			= "sdt");
|
||||
101
tools/sched_ext/scx_sdt.c
Normal file
101
tools/sched_ext/scx_sdt.c
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
|
||||
* Copyright (c) 2024 Emil Tsalapatis <etsal@meta.com>
|
||||
* Copyright (c) 2024 Tejun Heo <tj@kernel.org>
|
||||
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <signal.h>
|
||||
#include <libgen.h>
|
||||
#include <bpf/bpf.h>
|
||||
#include <scx/common.h>
|
||||
|
||||
#include "scx_sdt.h"
|
||||
#include "scx_sdt.bpf.skel.h"
|
||||
|
||||
const char help_fmt[] =
|
||||
"A simple arena-based sched_ext scheduler.\n"
|
||||
"\n"
|
||||
"Modified version of scx_simple that demonstrates arena-based data structures.\n"
|
||||
"\n"
|
||||
"Usage: %s [-f] [-v]\n"
|
||||
"\n"
|
||||
" -v Print libbpf debug messages\n"
|
||||
" -h Display this help and exit\n";
|
||||
|
||||
static bool verbose;
|
||||
static volatile int exit_req;
|
||||
|
||||
/* libbpf log sink: forward to stderr, dropping debug noise unless -v. */
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	if (level == LIBBPF_DEBUG && !verbose)
		return 0;
	return vfprintf(stderr, format, args);
}
|
||||
|
||||
/* Request a clean shutdown of the main stats loop. */
static void sigint_handler(int sig)
{
	exit_req = 1;
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
struct scx_sdt *skel;
|
||||
struct bpf_link *link;
|
||||
__u32 opt;
|
||||
__u64 ecode;
|
||||
|
||||
libbpf_set_print(libbpf_print_fn);
|
||||
signal(SIGINT, sigint_handler);
|
||||
signal(SIGTERM, sigint_handler);
|
||||
restart:
|
||||
skel = SCX_OPS_OPEN(sdt_ops, scx_sdt);
|
||||
|
||||
while ((opt = getopt(argc, argv, "fvh")) != -1) {
|
||||
switch (opt) {
|
||||
case 'v':
|
||||
verbose = true;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, help_fmt, basename(argv[0]));
|
||||
return opt != 'h';
|
||||
}
|
||||
}
|
||||
|
||||
SCX_OPS_LOAD(skel, sdt_ops, scx_sdt, uei);
|
||||
link = SCX_OPS_ATTACH(skel, sdt_ops, scx_sdt);
|
||||
|
||||
while (!exit_req && !UEI_EXITED(skel, uei)) {
|
||||
printf("====SCHEDULING STATS====\n");
|
||||
printf("enqueues=%llu\t", skel->bss->stat_enqueue);
|
||||
printf("inits=%llu\t", skel->bss->stat_init);
|
||||
printf("exits=%llu\t", skel->bss->stat_exit);
|
||||
printf("\n");
|
||||
|
||||
printf("select_idle_cpu=%llu\t", skel->bss->stat_select_idle_cpu);
|
||||
printf("select_busy_cpu=%llu\t", skel->bss->stat_select_busy_cpu);
|
||||
printf("\n");
|
||||
|
||||
printf("====ALLOCATION STATS====\n");
|
||||
printf("chunk allocs=%llu\t", skel->bss->alloc_stats.chunk_allocs);
|
||||
printf("data_allocs=%llu\n", skel->bss->alloc_stats.data_allocs);
|
||||
printf("alloc_ops=%llu\t", skel->bss->alloc_stats.alloc_ops);
|
||||
printf("free_ops=%llu\t", skel->bss->alloc_stats.free_ops);
|
||||
printf("active_allocs=%llu\t", skel->bss->alloc_stats.active_allocs);
|
||||
printf("arena_pages_used=%llu\t", skel->bss->alloc_stats.arena_pages_used);
|
||||
printf("\n\n");
|
||||
|
||||
fflush(stdout);
|
||||
sleep(1);
|
||||
}
|
||||
|
||||
bpf_link__destroy(link);
|
||||
ecode = UEI_REPORT(skel, uei);
|
||||
scx_sdt__destroy(skel);
|
||||
|
||||
if (UEI_ECODE_RESTART(ecode))
|
||||
goto restart;
|
||||
return 0;
|
||||
}
|
||||
113
tools/sched_ext/scx_sdt.h
Normal file
113
tools/sched_ext/scx_sdt.h
Normal file
|
|
@ -0,0 +1,113 @@
|
|||
/*
|
||||
* SPDX-License-Identifier: GPL-2.0
|
||||
* Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
|
||||
* Copyright (c) 2025 Tejun Heo <tj@kernel.org>
|
||||
* Copyright (c) 2025 Emil Tsalapatis <etsal@meta.com>
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#ifndef __BPF__
|
||||
#define __arena
|
||||
#endif /* __BPF__ */
|
||||
|
||||
/* Allocator activity counters, exported to userspace via the skeleton. */
struct scx_alloc_stats {
	__u64 chunk_allocs;
	__u64 data_allocs;
	__u64 alloc_ops;
	__u64 free_ops;
	__u64 active_allocs;
	/* NOTE(review): never updated in the visible BPF code — confirm. */
	__u64 arena_pages_used;
};

/* Bump allocator carving fixed-size elements out of arena slabs. */
struct sdt_pool {
	void __arena *slab;	/* current backing slab */
	__u64 elem_size;	/* element size in bytes, multiple of 8 */
	__u64 max_elems;	/* elements per slab */
	__u64 idx;		/* next unused element in the slab */
};

#ifndef div_round_up
#define div_round_up(a, b) (((a) + (b) - 1) / (b))
#endif

#ifndef round_up
#define round_up(a, b) (div_round_up((a), (b)) * (b))
#endif

typedef struct sdt_desc __arena sdt_desc_t;

enum sdt_consts {
	SDT_TASK_ENTS_PER_PAGE_SHIFT	= 9,	/* 512 entries per chunk */
	SDT_TASK_LEVELS			= 3,	/* radix tree depth */
	SDT_TASK_ENTS_PER_CHUNK		= 1 << SDT_TASK_ENTS_PER_PAGE_SHIFT,
	SDT_TASK_CHUNK_BITMAP_U64S	= div_round_up(SDT_TASK_ENTS_PER_CHUNK, 64),
	SDT_TASK_MIN_ELEM_PER_ALLOC	= 8,	/* min elements per data slab */
};
|
||||
|
||||
/* Allocation identity: radix-tree index plus a recycle generation. */
union sdt_id {
	__s64 val;
	struct {
		__s32 idx;	/* index in the radix tree */
		__s32 genn;	/* ++'d on recycle so that it forms unique'ish 64bit ID */
	};
};

struct sdt_chunk;

/*
 * Each index page is described by the following descriptor which carries the
 * bitmap. This way the actual index can host power-of-two numbers of entries
 * which makes indexing cheaper.
 */
struct sdt_desc {
	__u64 allocated[SDT_TASK_CHUNK_BITMAP_U64S];	/* one bit per slot */
	__u64 nr_free;					/* free slots below */
	struct sdt_chunk __arena *chunk;		/* backing chunk */
};

/*
 * Leaf node containing per-task data.
 */
struct sdt_data {
	union sdt_id tid;
	__u64 payload[];	/* caller-defined payload, word-aligned */
};

/*
 * Intermediate node pointing to another intermediate node or leaf node.
 * Interior chunks use descs, leaf-level chunks use data.
 */
struct sdt_chunk {
	union {
		sdt_desc_t * descs[SDT_TASK_ENTS_PER_CHUNK];
		struct sdt_data __arena *data[SDT_TASK_ENTS_PER_CHUNK];
	};
};
|
||||
|
||||
/* A data pool plus the root descriptor of its index radix tree. */
struct scx_allocator {
	struct sdt_pool pool;
	sdt_desc_t *root;
};

/* Per-task payload kept in the arena; one instance per scheduled task. */
struct scx_stats {
	/* NOTE(review): seq is never written in the visible BPF code — confirm. */
	int seq;
	pid_t pid;
	__u64 enqueue;
	__u64 exit;
	__u64 init;
	__u64 select_busy_cpu;
	__u64 select_idle_cpu;
};

#ifdef __BPF__

void __arena *scx_task_data(struct task_struct *p);
int scx_task_init(__u64 data_size);
void __arena *scx_task_alloc(struct task_struct *p);
void scx_task_free(struct task_struct *p);
void scx_arena_subprog_init(void);

int scx_alloc_init(struct scx_allocator *alloc, __u64 data_size);
/* NOTE(review): no scx_alloc_internal definition is visible; the BPF side
 * defines scx_alloc() returning void __arena * — verify this prototype. */
u64 scx_alloc_internal(struct scx_allocator *alloc);
int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx);

#endif /* __BPF__ */
|
||||
Loading…
Add table
Add a link
Reference in a new issue