[RFC PATCH 1/5] bpf: Introduce BPF_PROG_TYPE_OOM_POLICY

Chuyi Zhou <zhouchuyi@xxxxxxxxxxxxx> · Thu, 27 Jul 2023 15:36:28 +0800

This patch introduces a BPF_PROG_TYPE_OOM_POLICY program type. This
prog will be used to select a leaf memcg as victim from the memcg tree
when global oom is invoked.

The program takes two sibling cgroup's id as parameters and return a
comparison result indicating which one should be chosen as the victim.

Suggested-by: Abel Wu <wuyun.abel@xxxxxxxxxxxxx>
Signed-off-by: Chuyi Zhou <zhouchuyi@xxxxxxxxxxxxx>
---
 include/linux/bpf_oom.h   |  22 +++++
 include/linux/bpf_types.h |   2 +
 include/uapi/linux/bpf.h  |  14 ++++
 kernel/bpf/syscall.c      |  10 +++
 mm/oom_kill.c             | 168 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 216 insertions(+)
 create mode 100644 include/linux/bpf_oom.h

diff --git a/include/linux/bpf_oom.h b/include/linux/bpf_oom.h
new file mode 100644
index 000000000000..f4235a83d3bb
--- /dev/null
+++ b/include/linux/bpf_oom.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _BPF_OOM_H
+#define _BPF_OOM_H
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <uapi/linux/bpf.h>
+
+struct bpf_oom_policy {
+	struct bpf_prog_array __rcu	*progs;
+};
+
+int oom_policy_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog);
+int oom_policy_prog_detach(const union bpf_attr *attr);
+int oom_policy_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr);
+
+int __bpf_run_oom_policy(u64 cg_id_1, u64 cg_id_2);
+
+bool bpf_oom_policy_enabled(void);
+
+#endif
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index fc0d6f32c687..8ab6009b7dd9 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -83,6 +83,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall,
 BPF_PROG_TYPE(BPF_PROG_TYPE_NETFILTER, netfilter,
 	      struct bpf_nf_ctx, struct bpf_nf_ctx)
 #endif
+BPF_PROG_TYPE(BPF_PROG_TYPE_OOM_POLICY, oom_policy,
+		  struct bpf_oom_ctx, struct bpf_oom_ctx)
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 60a9d59beeab..9da0d61cf703 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -987,6 +987,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
+	BPF_PROG_TYPE_OOM_POLICY,
 };
 
 enum bpf_attach_type {
@@ -1036,6 +1037,7 @@ enum bpf_attach_type {
 	BPF_LSM_CGROUP,
 	BPF_STRUCT_OPS,
 	BPF_NETFILTER,
+	BPF_OOM_POLICY,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -6825,6 +6827,18 @@ struct bpf_cgroup_dev_ctx {
 	__u32 minor;
 };
 
+enum {
+	BPF_OOM_CMP_EQUAL   = (1ULL << 0),
+	BPF_OOM_CMP_GREATER = (1ULL << 1),
+	BPF_OOM_CMP_LESS    = (1ULL << 2),
+};
+
+struct bpf_oom_ctx {
+	__u64 cg_id_1;
+	__u64 cg_id_2;
+	__u8  cmp_ret;
+};
+
 struct bpf_raw_tracepoint_args {
 	__u64 args[0];
 };
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a2aef900519c..fb6fb6294eba 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5,6 +5,7 @@
 #include <linux/bpf-cgroup.h>
 #include <linux/bpf_trace.h>
 #include <linux/bpf_lirc.h>
+#include <linux/bpf_oom.h>
 #include <linux/bpf_verifier.h>
 #include <linux/bsearch.h>
 #include <linux/btf.h>
@@ -3588,6 +3589,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
 		return BPF_PROG_TYPE_XDP;
 	case BPF_LSM_CGROUP:
 		return BPF_PROG_TYPE_LSM;
+	case BPF_OOM_POLICY:
+		return BPF_PROG_TYPE_OOM_POLICY;
 	default:
 		return BPF_PROG_TYPE_UNSPEC;
 	}
@@ -3634,6 +3637,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_PROG_TYPE_FLOW_DISSECTOR:
 		ret = netns_bpf_prog_attach(attr, prog);
 		break;
+	case BPF_PROG_TYPE_OOM_POLICY:
+		ret = oom_policy_prog_attach(attr, prog);
+		break;
 	case BPF_PROG_TYPE_CGROUP_DEVICE:
 	case BPF_PROG_TYPE_CGROUP_SKB:
 	case BPF_PROG_TYPE_CGROUP_SOCK:
@@ -3676,6 +3682,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 		return lirc_prog_detach(attr);
 	case BPF_PROG_TYPE_FLOW_DISSECTOR:
 		return netns_bpf_prog_detach(attr, ptype);
+	case BPF_PROG_TYPE_OOM_POLICY:
+		return oom_policy_prog_detach(attr);
 	case BPF_PROG_TYPE_CGROUP_DEVICE:
 	case BPF_PROG_TYPE_CGROUP_SKB:
 	case BPF_PROG_TYPE_CGROUP_SOCK:
@@ -3733,6 +3741,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_FLOW_DISSECTOR:
 	case BPF_SK_LOOKUP:
 		return netns_bpf_prog_query(attr, uattr);
+	case BPF_OOM_POLICY:
+		return oom_policy_prog_query(attr, uattr);
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
 	case BPF_SK_MSG_VERDICT:
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 612b5597d3af..01af8adaa16c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -19,6 +19,7 @@
  */
 
 #include <linux/oom.h>
+#include <linux/bpf_oom.h>
 #include <linux/mm.h>
 #include <linux/err.h>
 #include <linux/gfp.h>
@@ -73,6 +74,9 @@ static inline bool is_memcg_oom(struct oom_control *oc)
 	return oc->memcg != NULL;
 }
 
+DEFINE_MUTEX(oom_policy_lock);
+static struct bpf_oom_policy global_oom_policy;
+
 #ifdef CONFIG_NUMA
 /**
  * oom_cpuset_eligible() - check task eligibility for kill
@@ -1258,3 +1262,167 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
 	return -ENOSYS;
 #endif /* CONFIG_MMU */
 }
+
+const struct bpf_prog_ops oom_policy_prog_ops = {
+};
+
+static const struct bpf_func_proto *
+oom_policy_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	return bpf_base_func_proto(func_id);
+}
+
+static bool oom_policy_is_valid_access(int off, int size,
+						enum bpf_access_type type,
+						const struct bpf_prog *prog,
+						struct bpf_insn_access_aux *info)
+{
+	if (off < 0 || off + size > sizeof(struct bpf_oom_ctx) || off % size)
+		return false;
+
+	switch (off) {
+	case bpf_ctx_range(struct bpf_oom_ctx, cg_id_1):
+	case bpf_ctx_range(struct bpf_oom_ctx, cg_id_2):
+		if (type != BPF_READ)
+			return false;
+		bpf_ctx_record_field_size(info, sizeof(__u64));
+		return bpf_ctx_narrow_access_ok(off, size, sizeof(__u64));
+	case bpf_ctx_range(struct bpf_oom_ctx, cmp_ret):
+		if (type == BPF_READ) {
+			bpf_ctx_record_field_size(info, sizeof(__u8));
+			return bpf_ctx_narrow_access_ok(off, size, sizeof(__u8));
+		} else {
+			return size == sizeof(__u8);
+		}
+	default:
+		return false;
+	}
+}
+
+const struct bpf_verifier_ops oom_policy_verifier_ops = {
+	.get_func_proto		= oom_policy_func_proto,
+	.is_valid_access	= oom_policy_is_valid_access,
+};
+
+#define BPF_MAX_PROGS 10
+
+int oom_policy_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	struct bpf_prog_array *old_array;
+	struct bpf_prog_array *new_array;
+	int ret;
+
+	mutex_lock(&oom_policy_lock);
+	old_array = rcu_dereference(global_oom_policy.progs);
+	if (old_array && bpf_prog_array_length(old_array) >= BPF_MAX_PROGS) {
+		ret = -E2BIG;
+		goto unlock;
+	}
+	ret = bpf_prog_array_copy(old_array, NULL, prog, 0, &new_array);
+	if (ret < 0)
+		goto unlock;
+
+	rcu_assign_pointer(global_oom_policy.progs, new_array);
+	bpf_prog_array_free(old_array);
+
+unlock:
+	mutex_unlock(&oom_policy_lock);
+	return ret;
+}
+
+static int detach_prog(struct bpf_prog *prog)
+{
+	struct bpf_prog_array *old_array;
+	struct bpf_prog_array *new_array;
+	int ret;
+
+	mutex_lock(&oom_policy_lock);
+	old_array = rcu_dereference(global_oom_policy.progs);
+	ret = bpf_prog_array_copy(old_array, prog, NULL, 0, &new_array);
+
+	if (ret)
+		goto unlock;
+
+	rcu_assign_pointer(global_oom_policy.progs, new_array);
+	bpf_prog_array_free(old_array);
+	bpf_prog_put(prog);
+unlock:
+	mutex_unlock(&oom_policy_lock);
+	return ret;
+}
+
+int oom_policy_prog_detach(const union bpf_attr *attr)
+{
+	struct bpf_prog *prog;
+	int ret;
+
+	if (attr->attach_flags)
+		return -EINVAL;
+
+	prog = bpf_prog_get_type(attr->attach_bpf_fd,
+					BPF_PROG_TYPE_OOM_POLICY);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	ret = detach_prog(prog);
+	bpf_prog_put(prog);
+
+	return ret;
+}
+
+int oom_policy_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
+{
+	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+	struct bpf_prog_array *progs;
+	u32 cnt, flags;
+	int ret = 0;
+
+	if (attr->query.query_flags)
+		return -EINVAL;
+
+	mutex_lock(&oom_policy_lock);
+	progs = rcu_dereference(global_oom_policy.progs);
+	cnt = progs ? bpf_prog_array_length(progs) : 0;
+	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) {
+		ret = -EFAULT;
+		goto unlock;
+	}
+	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) {
+		ret = -EFAULT;
+		goto unlock;
+	}
+	if (attr->query.prog_cnt != 0 && prog_ids && cnt)
+		ret = bpf_prog_array_copy_to_user(progs, prog_ids,
+						  attr->query.prog_cnt);
+
+unlock:
+	mutex_unlock(&oom_policy_lock);
+	return ret;
+}
+
+int __bpf_run_oom_policy(u64 cg_id_1, u64 cg_id_2)
+{
+	struct bpf_oom_ctx ctx = {
+		.cg_id_1 = cg_id_1,
+		.cg_id_2 = cg_id_2,
+		.cmp_ret = BPF_OOM_CMP_EQUAL,
+	};
+	rcu_read_lock();
+	bpf_prog_run_array(rcu_dereference(global_oom_policy.progs),
+						&ctx, bpf_prog_run);
+	rcu_read_unlock();
+	return ctx.cmp_ret;
+}
+
+bool bpf_oom_policy_enabled(void)
+{
+	struct bpf_prog_array *prog_array;
+	bool empty = true;
+
+	rcu_read_lock();
+	prog_array = rcu_dereference(global_oom_policy.progs);
+	if (prog_array)
+		empty = bpf_prog_array_is_empty(prog_array);
+	rcu_read_unlock();
+	return !empty;
+}
-- 
2.20.1