Add ops.cpu_online/offline() which are invoked when CPUs come online and offline respectively. As the enqueue path already automatically bypasses tasks to the local dsq on a deactivated CPU, BPF schedulers are guaranteed to see tasks only on CPUs which are between online() and offline(). If the BPF scheduler doesn't implement ops.cpu_online/offline(), the scheduler is automatically exited with SCX_ECODE_RESTART | SCX_ECODE_RSN_HOTPLUG. Userspace can implement CPU hotpplug support trivially by simply reinitializing and reloading the scheduler. scx_qmap is updated to print out online CPUs on hotplug events. Other schedulers are updated to restart based on ecode. v2: - To accommodate lock ordering change between scx_cgroup_rwsem and cpus_read_lock(), CPU hotplug operations are put into its own SCX_OPI block and enabled eariler during scx_ope_enable() so that cpus_read_lock() can be dropped before acquiring scx_cgroup_rwsem. - Auto exit with ECODE added. Signed-off-by: Tejun Heo <tj@xxxxxxxxxx> Reviewed-by: David Vernet <dvernet@xxxxxxxx> Acked-by: Josh Don <joshdon@xxxxxxxxxx> Acked-by: Hao Luo <haoluo@xxxxxxxxxx> Acked-by: Barret Rhoden <brho@xxxxxxxxxx> --- kernel/sched/ext.c | 135 ++++++++++++++++++- tools/sched_ext/include/scx/compat.h | 29 ++++ tools/sched_ext/include/scx/user_exit_info.h | 28 ++++ tools/sched_ext/scx_central.c | 9 +- tools/sched_ext/scx_flatcg.c | 8 +- tools/sched_ext/scx_qmap.bpf.c | 60 +++++++++ tools/sched_ext/scx_qmap.c | 4 + tools/sched_ext/scx_simple.c | 8 +- 8 files changed, 271 insertions(+), 10 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 9bc03533cf5e..ed2452d42862 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -30,6 +30,29 @@ enum scx_exit_kind { SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ }; +/* + * An exit code can be specified when exiting with scx_bpf_exit() or + * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN + * respectively. The codes are 64bit of the format: + * + * Bits: [63 .. 48 47 .. 32 31 .. 0] + * [ SYS ACT ] [ SYS RSN ] [ USR ] + * + * SYS ACT: System-defined exit actions + * SYS RSN: System-defined exit reasons + * USR : User-defined exit codes and reasons + * + * Using the above, users may communicate intention and context by ORing system + * actions and/or system reasons with a user-defined exit code. + */ +enum scx_exit_code { + /* Reasons */ + SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, + + /* Actions */ + SCX_ECODE_ACT_RESTART = 1LLU << 48, +}; + /* * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is * being disabled. @@ -504,7 +527,29 @@ struct sched_ext_ops { #endif /* CONFIG_CGROUPS */ /* - * All online ops must come before ops.init(). + * All online ops must come before ops.cpu_online(). + */ + + /** + * cpu_online - A CPU became online + * @cpu: CPU which just came up + * + * @cpu just came online. @cpu doesn't call ops.enqueue() or run tasks + * associated with other CPUs beforehand. + */ + void (*cpu_online)(s32 cpu); + + /** + * cpu_offline - A CPU is going offline + * @cpu: CPU which is going offline + * + * @cpu is going offline. @cpu doesn't call ops.enqueue() or run tasks + * associated with other CPUs afterwards. + */ + void (*cpu_offline)(s32 cpu); + + /* + * All CPU hotplug ops must come before ops.init(). */ /** @@ -543,6 +588,15 @@ struct sched_ext_ops { */ u32 exit_dump_len; + /** + * hotplug_seq - A sequence number that may be set by the scheduler to + * detect when a hotplug event has occurred during the loading process. + * If 0, no detection occurs. Otherwise, the scheduler will fail to + * load if the sequence number does not match @scx_hotplug_seq on the + * enable path. + */ + u64 hotplug_seq; + /** * name - BPF scheduler's name * @@ -556,7 +610,9 @@ struct sched_ext_ops { enum scx_opi { SCX_OPI_BEGIN = 0, SCX_OPI_NORMAL_BEGIN = 0, - SCX_OPI_NORMAL_END = SCX_OP_IDX(init), + SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), SCX_OPI_END = SCX_OP_IDX(init), }; @@ -746,6 +802,7 @@ static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); static struct scx_exit_info *scx_exit_info; static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); +static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); /* * The maximum amount of time in jiffies that a task may be runnable without @@ -2172,7 +2229,8 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, * emitted in scx_next_task_picked(). */ if (SCX_HAS_OP(cpu_acquire)) - SCX_CALL_OP(0, cpu_acquire, cpu_of(rq), NULL); + SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_acquire, cpu_of(rq), + NULL); rq->scx.cpu_released = false; } @@ -2700,6 +2758,34 @@ void __scx_update_idle(struct rq *rq, bool idle) #endif } +static void handle_hotplug(struct rq *rq, bool online) +{ + int cpu = cpu_of(rq); + + atomic_long_inc(&scx_hotplug_seq); + + if (online && SCX_HAS_OP(cpu_online)) + SCX_CALL_OP(SCX_KF_REST, cpu_online, cpu); + else if (!online && SCX_HAS_OP(cpu_offline)) + SCX_CALL_OP(SCX_KF_REST, cpu_offline, cpu); + else + scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "cpu %d going %s, exiting scheduler", cpu, + online ? "online" : "offline"); +} + +static void rq_online_scx(struct rq *rq, enum rq_onoff_reason reason) +{ + if (reason == RQ_ONOFF_HOTPLUG) + handle_hotplug(rq, true); +} + +static void rq_offline_scx(struct rq *rq, enum rq_onoff_reason reason) +{ + if (reason == RQ_ONOFF_HOTPLUG) + handle_hotplug(rq, false); +} + #else /* CONFIG_SMP */ static bool test_and_clear_cpu_idle(int cpu) { return false; } @@ -3328,6 +3414,9 @@ DEFINE_SCHED_CLASS(ext) = { .balance = balance_scx, .select_task_rq = select_task_rq_scx, .set_cpus_allowed = set_cpus_allowed_scx, + + .rq_online = rq_online_scx, + .rq_offline = rq_offline_scx, #endif .task_tick = task_tick_scx, @@ -3584,10 +3673,18 @@ static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, } SCX_ATTR(nr_rejected); +static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq)); +} +SCX_ATTR(hotplug_seq); + static struct attribute *scx_global_attrs[] = { &scx_attr_state.attr, &scx_attr_switch_all.attr, &scx_attr_nr_rejected.attr, + &scx_attr_hotplug_seq.attr, NULL, }; @@ -4110,6 +4207,25 @@ static struct kthread_worker *scx_create_rt_helper(const char *name) return helper; } +static void check_hotplug_seq(const struct sched_ext_ops *ops) +{ + unsigned long long global_hotplug_seq; + + /* + * If a hotplug event has occurred between when a scheduler was + * initialized, and when we were able to attach, exit and notify user + * space about it. + */ + if (ops->hotplug_seq) { + global_hotplug_seq = atomic_long_read(&scx_hotplug_seq); + if (ops->hotplug_seq != global_hotplug_seq) { + scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "expected hotplug seq %llu did not match actual %llu", + ops->hotplug_seq, global_hotplug_seq); + } + } +} + static int validate_ops(const struct sched_ext_ops *ops) { /* @@ -4192,6 +4308,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops) } } + for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) + if (((void (**)(void))ops)[i]) + static_branch_enable_cpuslocked(&scx_has_op[i]); + cpus_read_unlock(); ret = validate_ops(ops); @@ -4239,6 +4359,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops) cpus_read_lock(); scx_cgroup_lock(); + check_hotplug_seq(ops); + for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i]) static_branch_enable_cpuslocked(&scx_has_op[i]); @@ -4563,6 +4685,9 @@ static int bpf_scx_init_member(const struct btf_type *t, ops->exit_dump_len = *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; return 1; + case offsetof(struct sched_ext_ops, hotplug_seq): + ops->hotplug_seq = *(u64 *)(udata + moff); + return 1; } return 0; @@ -4659,6 +4784,8 @@ static void cgroup_move_stub(struct task_struct *p, struct cgroup *from, struct static void cgroup_cancel_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} static void cgroup_set_weight_stub(struct cgroup *cgrp, u32 weight) {} #endif +static void cpu_online_stub(s32 cpu) {} +static void cpu_offline_stub(s32 cpu) {} static s32 init_stub(void) { return -EINVAL; } static void exit_stub(struct scx_exit_info *info) {} @@ -4689,6 +4816,8 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .cgroup_cancel_move = cgroup_cancel_move_stub, .cgroup_set_weight = cgroup_set_weight_stub, #endif + .cpu_online = cpu_online_stub, + .cpu_offline = cpu_offline_stub, .init = init_stub, .exit = exit_stub, }; diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h index 2be79bd88a25..7155b69150ff 100644 --- a/tools/sched_ext/include/scx/compat.h +++ b/tools/sched_ext/include/scx/compat.h @@ -8,6 +8,9 @@ #define __SCX_COMPAT_H #include <bpf/btf.h> +#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> struct btf *__COMPAT_vmlinux_btf __attribute__((weak)); @@ -120,6 +123,28 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field #define __COMPAT_HAS_CPUMASKS \ __COMPAT_has_ksym("scx_bpf_nr_cpu_ids") +static inline long scx_hotplug_seq(void) +{ + int fd; + char buf[32]; + ssize_t len; + long val; + + fd = open("/sys/kernel/sched_ext/hotplug_seq", O_RDONLY); + if (fd < 0) + return -ENOENT; + + len = read(fd, buf, sizeof(buf) - 1); + SCX_BUG_ON(len <= 0, "read failed (%ld)", len); + buf[len] = 0; + close(fd); + + val = strtoul(buf, NULL, 10); + SCX_BUG_ON(val < 0, "invalid num hotplug events: %lu", val); + + return val; +} + /* * struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE() * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load @@ -128,12 +153,16 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field * * - ops.tick(): Ignored on older kernels with a warning. * - ops.exit_dump_len: Cleared to zero on older kernels with a warning. + * - ops.hotplug_seq: Ignored on older kernels. */ #define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \ struct __scx_name *__skel; \ \ __skel = __scx_name##__open(); \ SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ + \ + if (__COMPAT_struct_has_field("sched_ext_ops", "hotplug_seq")) \ + __skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \ __skel; \ }) diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h index cf4293cb250e..2d86d01a9575 100644 --- a/tools/sched_ext/include/scx/user_exit_info.h +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -77,7 +77,35 @@ struct user_exit_info { if (__uei->msg[0] != '\0') \ fprintf(stderr, " (%s)", __uei->msg); \ fputs("\n", stderr); \ + __uei->exit_code; \ }) +/* + * We can't import vmlinux.h while compiling user C code. Let's duplicate + * scx_exit_code definition. + */ +enum scx_exit_code { + /* Reasons */ + SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, + + /* Actions */ + SCX_ECODE_ACT_RESTART = 1LLU << 48, +}; + +enum uei_ecode_mask { + UEI_ECODE_USER_MASK = ((1LLU << 32) - 1), + UEI_ECODE_SYS_RSN_MASK = ((1LLU << 16) - 1) << 32, + UEI_ECODE_SYS_ACT_MASK = ((1LLU << 16) - 1) << 48, +}; + +/* + * These macro interpret the ecode returned from UEI_REPORT(). + */ +#define UEI_ECODE_USER(__ecode) ((__ecode) & UEI_ECODE_USER_MASK) +#define UEI_ECODE_SYS_RSN(__ecode) ((__ecode) & UEI_ECODE_SYS_RSN_MASK) +#define UEI_ECODE_SYS_ACT(__ecode) ((__ecode) & UEI_ECODE_SYS_ACT_MASK) + +#define UEI_ECODE_RESTART(__ecode) (UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART) + #endif /* __bpf__ */ #endif /* __USER_EXIT_INFO_H */ diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 2908add16880..df692dc0ccb1 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -46,14 +46,14 @@ int main(int argc, char **argv) { struct scx_central *skel; struct bpf_link *link; - __u64 seq = 0; + __u64 seq = 0, ecode; __s32 opt; cpu_set_t *cpuset; libbpf_set_print(libbpf_print_fn); signal(SIGINT, sigint_handler); signal(SIGTERM, sigint_handler); - +restart: skel = SCX_OPS_OPEN(central_ops, scx_central); skel->rodata->central_cpu = 0; @@ -126,7 +126,10 @@ int main(int argc, char **argv) } bpf_link__destroy(link); - UEI_REPORT(skel, uei); + ecode = UEI_REPORT(skel, uei); scx_central__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; return 0; } diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c index bb0a832a0cfd..20c5a132b610 100644 --- a/tools/sched_ext/scx_flatcg.c +++ b/tools/sched_ext/scx_flatcg.c @@ -127,11 +127,12 @@ int main(int argc, char **argv) __u64 last_stats[FCG_NR_STATS] = {}; unsigned long seq = 0; __s32 opt; + __u64 ecode; libbpf_set_print(libbpf_print_fn); signal(SIGINT, sigint_handler); signal(SIGTERM, sigint_handler); - +restart: skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg); skel->rodata->nr_cpus = libbpf_num_possible_cpus(); @@ -219,7 +220,10 @@ int main(int argc, char **argv) } bpf_link__destroy(link); - UEI_REPORT(skel, uei); + ecode = UEI_REPORT(skel, uei); scx_flatcg__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; return 0; } diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 7c3b0dcae1e0..c2edc080d7e5 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -308,11 +308,69 @@ s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, return -ENOMEM; } +/* + * Print out the online and possible CPU map using bpf_printk() as a + * demonstration of using the cpumask kfuncs and ops.cpu_on/offline(). + */ +static void print_cpus(void) +{ + const struct cpumask *possible, *online; + s32 cpu; + char buf[128] = "", *p; + int idx; + + if (!__COMPAT_HAS_CPUMASKS) + return; + + possible = scx_bpf_get_possible_cpumask(); + online = scx_bpf_get_online_cpumask(); + + idx = 0; + bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) { + if (!(p = MEMBER_VPTR(buf, [idx++]))) + break; + if (bpf_cpumask_test_cpu(cpu, online)) + *p++ = 'O'; + else if (bpf_cpumask_test_cpu(cpu, possible)) + *p++ = 'X'; + else + *p++ = ' '; + + if ((cpu & 7) == 7) { + if (!(p = MEMBER_VPTR(buf, [idx++]))) + break; + *p++ = '|'; + } + } + buf[sizeof(buf) - 1] = '\0'; + + scx_bpf_put_cpumask(online); + scx_bpf_put_cpumask(possible); + + bpf_printk("CPUS: |%s", buf); +} + +void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu) +{ + bpf_printk("CPU %d coming online", cpu); + /* @cpu is already online at this point */ + print_cpus(); +} + +void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu) +{ + bpf_printk("CPU %d going offline", cpu); + /* @cpu is still online at this point */ + print_cpus(); +} + s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) { if (!switch_partial) __COMPAT_scx_bpf_switch_all(); + print_cpus(); + return scx_bpf_create_dsq(SHARED_DSQ, -1); } @@ -328,6 +386,8 @@ SCX_OPS_DEFINE(qmap_ops, .dispatch = (void *)qmap_dispatch, .cpu_release = (void *)qmap_cpu_release, .init_task = (void *)qmap_init_task, + .cpu_online = (void *)qmap_cpu_online, + .cpu_offline = (void *)qmap_cpu_offline, .init = (void *)qmap_init, .exit = (void *)qmap_exit, .timeout_ms = 5000U, diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index 048b31eed17d..e82f58b5c131 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -119,5 +119,9 @@ int main(int argc, char **argv) bpf_link__destroy(link); UEI_REPORT(skel, uei); scx_qmap__destroy(skel); + /* + * scx_qmap implements ops.cpu_on/offline() and doesn't need to restart + * on CPU hotplug events. + */ return 0; } diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c index 9ffa8d084228..acee683a3ec9 100644 --- a/tools/sched_ext/scx_simple.c +++ b/tools/sched_ext/scx_simple.c @@ -62,11 +62,12 @@ int main(int argc, char **argv) struct scx_simple *skel; struct bpf_link *link; __u32 opt; + __u64 ecode; libbpf_set_print(libbpf_print_fn); signal(SIGINT, sigint_handler); signal(SIGTERM, sigint_handler); - +restart: skel = SCX_OPS_OPEN(simple_ops, scx_simple); while ((opt = getopt(argc, argv, "vh")) != -1) { @@ -93,7 +94,10 @@ int main(int argc, char **argv) } bpf_link__destroy(link); - UEI_REPORT(skel, uei); + ecode = UEI_REPORT(skel, uei); scx_simple__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; return 0; } -- 2.44.0