This patch adds a new hook bpf_oom_evaluate_task in oom_evaluate_task. It takes oc and the task currently being iterated as parameters and returns a result indicating which one should be selected. We can use it to bypass the current logic of oom_evaluate_task and implement customized OOM policies in the attached BPF programs. Suggested-by: Michal Hocko <mhocko@xxxxxxxx> Signed-off-by: Chuyi Zhou <zhouchuyi@xxxxxxxxxxxxx> --- mm/oom_kill.c | 59 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 612b5597d3af..255c9ef1d808 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -18,6 +18,7 @@ * kernel subsystems and hints as to where to find out what things do. */ +#include <linux/bpf.h> #include <linux/oom.h> #include <linux/mm.h> #include <linux/err.h> @@ -305,6 +306,27 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) return CONSTRAINT_NONE; } +enum { + NO_BPF_POLICY, + BPF_EVAL_ABORT, + BPF_EVAL_NEXT, + BPF_EVAL_SELECT, +}; + +__weak noinline int bpf_oom_evaluate_task(struct task_struct *task, struct oom_control *oc) +{ + return NO_BPF_POLICY; +} + +BTF_SET8_START(oom_bpf_fmodret_ids) +BTF_ID_FLAGS(func, bpf_oom_evaluate_task) +BTF_SET8_END(oom_bpf_fmodret_ids) + +static const struct btf_kfunc_id_set oom_bpf_fmodret_set = { + .owner = THIS_MODULE, + .set = &oom_bpf_fmodret_ids, +}; + static int oom_evaluate_task(struct task_struct *task, void *arg) { struct oom_control *oc = arg; @@ -317,6 +339,26 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc)) goto next; + /* + * If task is allocating a lot of memory and has been marked to be + * killed first if it triggers an oom, then select it. 
+ */ + if (oom_task_origin(task)) { + points = LONG_MAX; + goto select; + } + + switch (bpf_oom_evaluate_task(task, oc)) { + case BPF_EVAL_ABORT: + goto abort; /* abort search process */ + case BPF_EVAL_NEXT: + goto next; /* ignore the task */ + case BPF_EVAL_SELECT: + goto select; /* select the task */ + default: + break; /* No BPF policy */ + } + /* * This task already has access to memory reserves and is being killed. * Don't allow any other task to have access to the reserves unless @@ -329,15 +371,6 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) goto abort; } - /* - * If task is allocating a lot of memory and has been marked to be - * killed first if it triggers an oom, then select it. - */ - if (oom_task_origin(task)) { - points = LONG_MAX; - goto select; - } - points = oom_badness(task, oc->totalpages); if (points == LONG_MIN || points < oc->chosen_points) goto next; @@ -732,10 +765,18 @@ static struct ctl_table vm_oom_kill_table[] = { static int __init oom_init(void) { + int err; oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); #ifdef CONFIG_SYSCTL register_sysctl_init("vm", vm_oom_kill_table); #endif + +#ifdef CONFIG_BPF_SYSCALL + err = register_btf_fmodret_id_set(&oom_bpf_fmodret_set); + if (err) + pr_warn("error while registering oom fmodret entrypoints: %d", err); +#endif + return 0; } subsys_initcall(oom_init) -- 2.20.1