[PATCH bpf-next v1 9/9] selftests/bpf: Tests using sleepable tracepoints to monitor cgroup events

Hao Luo <haoluo@xxxxxxxxxx> · Fri, 25 Feb 2022 15:43:39 -0800

Tests the functionalities of sleepable tracing prog, sleepable tracepoints
(i.e. cgroup_mkdir_s and cgroup_rmdir_s) and cgroup iter prog all together.

The added selftest resembles a real-world application, where bpf is used
to export cgroup-level performance stats. There are two sets of progs in
the test: cgroup_monitor and cgroup_sched_lat

- Cgroup_monitor monitors cgroup creation and deletion using sleepable
  tracing; for each cgroup created, creates a directory in bpffs; creates
  a cgroup iter link and pins it in that directory.

- Cgroup_sched_lat is the program that collects cgroup's scheduling
  latencies and store them in hash map. Cgroup_sched_lat implements a
  cgroup iter prog, which reads the stats from the map and seq_prints
  them. This cgroup iter prog is the prog pinned by cgroup_monitor in
  each bpffs directory.

The cgroup_sched_lat in this test can be adapted for exporting similar
cgroup-level performance stats.

Signed-off-by: Hao Luo <haoluo@xxxxxxxxxx>
---
 .../bpf/prog_tests/test_cgroup_stats.c        | 187 ++++++++++++++
 tools/testing/selftests/bpf/progs/bpf_iter.h  |   7 +
 .../selftests/bpf/progs/cgroup_monitor.c      |  78 ++++++
 .../selftests/bpf/progs/cgroup_sched_lat.c    | 232 ++++++++++++++++++
 4 files changed, 504 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/test_cgroup_stats.c
 create mode 100644 tools/testing/selftests/bpf/progs/cgroup_monitor.c
 create mode 100644 tools/testing/selftests/bpf/progs/cgroup_sched_lat.c

diff --git a/tools/testing/selftests/bpf/prog_tests/test_cgroup_stats.c b/tools/testing/selftests/bpf/prog_tests/test_cgroup_stats.c
new file mode 100644
index 000000000000..b6607ac074bc
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_cgroup_stats.c
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Google */
+#define _GNU_SOURCE
+#include <sys/stat.h>	/* mkdir */
+#include <fcntl.h>	/* name_to_handle_at */
+#include <stdlib.h>
+#include <test_progs.h>
+#include "cgroup_monitor.skel.h"
+#include "cgroup_sched_lat.skel.h"
+
+static char mkdir_prog_path[64];
+static char rmdir_prog_path[64];
+static char dump_prog_path[64];
+
+/* Get cgroup id from a full path to cgroup */
+static int get_cgroup_id(const char *cgroup)
+{
+	int mount_id = 0;
+	struct {
+		struct file_handle fh;
+		__u64 cgid;
+	} fh = {};
+
+	fh.fh.handle_bytes = sizeof(fh.cgid);
+	if (name_to_handle_at(AT_FDCWD, cgroup, &fh.fh, &mount_id, 0))
+		return -1;
+
+	return fh.cgid;
+}
+
+static void spin_on_cpu(int seconds)
+{
+	time_t start, now;
+
+	start = time(NULL);
+	do {
+		now = time(NULL);
+	} while (now - start < seconds);
+}
+
+static void do_work(const char *cgroup)
+{
+	int i, cpu = 0, pid;
+	char cmd[128];
+
+	/* make cgroup threaded */
+	snprintf(cmd, 128, "echo threaded > %s/cgroup.type", cgroup);
+	system(cmd);
+
+	/* try to enable cpu controller. this may fail if cpu controller is not
+	 * available in cgroup.controllers or there is a cgroup v1 already
+	 * mounted in the system.
+	 */
+	snprintf(cmd, 128, "echo \"+cpu\" > %s/cgroup.subtree_control", cgroup);
+	system(cmd);
+
+	/* launch two children, both running in child cgroup */
+	for (i = 0; i < 2; ++i) {
+		pid = fork();
+		if (pid == 0) {
+			/* attach to cgroup */
+			snprintf(cmd, 128, "echo %d > %s/cgroup.procs", getpid(), cgroup);
+			system(cmd);
+
+			/* pin process to target cpu */
+			snprintf(cmd, 128, "taskset -pc %d %d", cpu, getpid());
+			system(cmd);
+
+			spin_on_cpu(3); /* spin on cpu for 3 seconds */
+			exit(0);
+		}
+	}
+
+	/* pin parent process to target cpu */
+	snprintf(cmd, 128, "taskset -pc %d %d", cpu, getpid());
+	system(cmd);
+
+	spin_on_cpu(3); /* spin on cpu for 3 seconds */
+	wait(NULL);
+}
+
+/* Check reading cgroup stats from auto pinned objects
+ * @root: root directory in bpffs set up for this test
+ * @cgroup: cgroup path
+ */
+static void check_cgroup_stats(const char *root, const char *cgroup)
+{
+	unsigned long queue_self, queue_other;
+	char buf[64], path[64];
+	int id, cgroup_id;
+	FILE *file;
+
+	id = get_cgroup_id(cgroup);
+	if (!ASSERT_GE(id, 0, "get_cgroup_id"))
+		return;
+
+	snprintf(path, sizeof(path), "%s/%d/stats", root, id);
+	file = fopen(path, "r");
+	if (!ASSERT_OK_PTR(file, "open"))
+		return;
+
+	ASSERT_OK_PTR(fgets(buf, sizeof(buf), file), "cat");
+	ASSERT_EQ(sscanf(buf, "cgroup_id: %8d", &cgroup_id), 1, "output");
+	ASSERT_EQ(id, cgroup_id, "cgroup_id");
+
+	ASSERT_OK_PTR(fgets(buf, sizeof(buf), file), "cat");
+	ASSERT_EQ(sscanf(buf, "queue_self: %8lu", &queue_self), 1, "output");
+
+	ASSERT_OK_PTR(fgets(buf, sizeof(buf), file), "cat");
+	ASSERT_EQ(sscanf(buf, "queue_other: %8lu", &queue_other), 1, "output");
+	fclose(file);
+}
+
+/* Set up bpf progs for monitoring cgroup activities. */
+static void setup_cgroup_monitor(const char *root)
+{
+	struct cgroup_monitor *skel = NULL;
+
+	skel = cgroup_monitor__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "cgroup_monitor_skel_load"))
+		return;
+
+	cgroup_monitor__attach(skel);
+
+	snprintf(skel->bss->root, sizeof(skel->bss->root), "%s", root);
+
+	snprintf(mkdir_prog_path, 64, "%s/mkdir_prog", root);
+	bpf_obj_pin(bpf_link__fd(skel->links.mkdir_prog), mkdir_prog_path);
+
+	snprintf(rmdir_prog_path, 64, "%s/rmdir_prog", root);
+	bpf_obj_pin(bpf_link__fd(skel->links.rmdir_prog), rmdir_prog_path);
+
+	cgroup_monitor__destroy(skel);
+}
+
+void test_cgroup_stats(void)
+{
+	char bpf_tmpl[] = "/sys/fs/bpf/XXXXXX";
+	char cgrp_tmpl[] = "/sys/fs/cgroup/XXXXXX";
+	struct cgroup_sched_lat *skel = NULL;
+	char *root, *cgroup;
+
+	/* prepare test directories */
+	system("mount -t cgroup2 none /sys/fs/cgroup");
+	system("mount -t bpf bpffs /sys/fs/bpf");
+	root = mkdtemp(bpf_tmpl);
+	chmod(root, 0777);
+
+	/* set up progs for monitoring cgroup events */
+	setup_cgroup_monitor(root);
+
+	/* set up progs for profiling cgroup stats */
+	skel = cgroup_sched_lat__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "cgroup_sched_lat_skel_load"))
+		goto cleanup_root;
+
+	snprintf(dump_prog_path, 64, "%s/prog", root);
+	bpf_obj_pin(bpf_program__fd(skel->progs.dump_cgroup), dump_prog_path);
+	chmod(dump_prog_path, 0644);
+
+	cgroup_sched_lat__attach(skel);
+
+	/* thanks to cgroup monitoring progs, a directory corresponding to the
+	 * cgroup is created in bpffs.
+	 */
+	cgroup = mkdtemp(cgrp_tmpl);
+
+	/* collect some cgroup-level stats and check reading them from bpffs */
+	do_work(cgroup);
+	check_cgroup_stats(root, cgroup);
+
+	/* thanks to cgroup monitoring progs, removing cgroups also removes
+	 * the created directory in bpffs.
+	 */
+	rmdir(cgroup);
+
+	/* clean up cgroup monitoring progs */
+	cgroup_sched_lat__detach(skel);
+	cgroup_sched_lat__destroy(skel);
+	unlink(dump_prog_path);
+cleanup_root:
+	/* remove test directories in bpffs */
+	unlink(mkdir_prog_path);
+	unlink(rmdir_prog_path);
+	rmdir(root);
+	return;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter.h b/tools/testing/selftests/bpf/progs/bpf_iter.h
index 8cfaeba1ddbf..0d1bf954e831 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter.h
+++ b/tools/testing/selftests/bpf/progs/bpf_iter.h
@@ -16,6 +16,7 @@
 #define bpf_iter__bpf_map_elem bpf_iter__bpf_map_elem___not_used
 #define bpf_iter__bpf_sk_storage_map bpf_iter__bpf_sk_storage_map___not_used
 #define bpf_iter__sockmap bpf_iter__sockmap___not_used
+#define bpf_iter__cgroup bpf_iter__cgroup__not_used
 #define btf_ptr btf_ptr___not_used
 #define BTF_F_COMPACT BTF_F_COMPACT___not_used
 #define BTF_F_NONAME BTF_F_NONAME___not_used
@@ -37,6 +38,7 @@
 #undef bpf_iter__bpf_map_elem
 #undef bpf_iter__bpf_sk_storage_map
 #undef bpf_iter__sockmap
+#undef bpf_iter__cgroup
 #undef btf_ptr
 #undef BTF_F_COMPACT
 #undef BTF_F_NONAME
@@ -132,6 +134,11 @@ struct bpf_iter__sockmap {
 	struct sock *sk;
 };
 
+struct bpf_iter__cgroup {
+	struct bpf_iter_meta *meta;
+	struct cgroup *cgroup;
+} __attribute__((preserve_access_index));
+
 struct btf_ptr {
 	void *ptr;
 	__u32 type_id;
diff --git a/tools/testing/selftests/bpf/progs/cgroup_monitor.c b/tools/testing/selftests/bpf/progs/cgroup_monitor.c
new file mode 100644
index 000000000000..fa5debe1e15a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/cgroup_monitor.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Google */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+/* root is the directory path. */
+char root[64];
+
+SEC("tp_btf.s/cgroup_mkdir_s")
+int BPF_PROG(mkdir_prog, struct cgroup *cgrp)
+{
+	static char dirname[64];
+	static char prog_path[64];
+	static char iter_path[64];
+	static union bpf_iter_link_info info;
+	static union bpf_attr get_attr;
+	static union bpf_attr link_attr;
+	static union bpf_attr pin_attr;
+	int link_fd, prog_fd, ret;
+	__u64 id;
+
+	/* create directory in bpffs named by cgroup's id. */
+	id = cgrp->kn->id;
+	BPF_SNPRINTF(dirname, sizeof(dirname), "%s/%lu", root, id);
+	ret = bpf_mkdir(dirname, sizeof(dirname), 0755);
+	if (ret)
+		return ret;
+
+	/* get cgroup iter prog pinned by test progs. */
+	BPF_SNPRINTF(prog_path, sizeof(prog_path), "%s/prog", root);
+	get_attr.bpf_fd = 0;
+	get_attr.pathname = (__u64)prog_path;
+	get_attr.file_flags = BPF_F_RDONLY;
+	prog_fd = bpf_sys_bpf(BPF_OBJ_GET, &get_attr, sizeof(get_attr));
+	if (prog_fd < 0)
+		return prog_fd;
+
+	/* create a link, parameterized by cgroup id. */
+	info.cgroup.cgroup_id = id;
+	link_attr.link_create.prog_fd = prog_fd;
+	link_attr.link_create.attach_type = BPF_TRACE_ITER;
+	link_attr.link_create.target_fd = 0;
+	link_attr.link_create.flags = 0;
+	link_attr.link_create.iter_info = (__u64)&info;
+	link_attr.link_create.iter_info_len = sizeof(info);
+	ret = bpf_sys_bpf(BPF_LINK_CREATE, &link_attr, sizeof(link_attr));
+	if (ret < 0) {
+		bpf_sys_close(prog_fd);
+		return ret;
+	}
+	link_fd = ret;
+
+	/* pin the link in the created directory */
+	BPF_SNPRINTF(iter_path, sizeof(iter_path), "%s/stats", dirname);
+	pin_attr.pathname = (__u64)iter_path;
+	pin_attr.bpf_fd = link_fd;
+	pin_attr.file_flags = 0;
+	ret = bpf_sys_bpf(BPF_OBJ_PIN, &pin_attr, sizeof(pin_attr));
+
+	bpf_sys_close(prog_fd);
+	bpf_sys_close(link_fd);
+	return ret;
+}
+
+SEC("tp_btf.s/cgroup_rmdir_s")
+int BPF_PROG(rmdir_prog, struct cgroup *cgrp)
+{
+	static char dirname[64];
+	static char path[64];
+
+	BPF_SNPRINTF(dirname, sizeof(dirname), "%s/%lu", root, cgrp->kn->id);
+	BPF_SNPRINTF(path, sizeof(path), "%s/stats", dirname);
+	bpf_unlink(path, sizeof(path));
+	return bpf_rmdir(dirname, sizeof(dirname));
+}
diff --git a/tools/testing/selftests/bpf/progs/cgroup_sched_lat.c b/tools/testing/selftests/bpf/progs/cgroup_sched_lat.c
new file mode 100644
index 000000000000..90fe709377e1
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/cgroup_sched_lat.c
@@ -0,0 +1,232 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Google */
+#include "bpf_iter.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+char _license[] SEC("license") = "GPL";
+
+#define TASK_RUNNING 0
+#define BPF_F_CURRENT_CPU 0xffffffffULL
+
+extern void fair_sched_class __ksym;
+extern bool CONFIG_FAIR_GROUP_SCHED __kconfig;
+extern bool CONFIG_CGROUP_SCHED __kconfig;
+
+struct wait_lat {
+	/* Queue_self stands for the latency a task experiences while waiting
+	 * behind the tasks that are from the same cgroup.
+	 *
+	 * Queue_other stands for the latency a task experiences while waiting
+	 * behind the tasks that are from other cgroups.
+	 *
+	 * For example, if there are three tasks: A, B and C. Suppose A and B
+	 * are in the same cgroup and C is in another cgroup and we see A has
+	 * a queueing latency X milliseconds. Let's say during the X milliseconds,
+	 * B has run for Y milliseconds. We can break down X to two parts: time
+	 * when B is on cpu, that is Y; the time when C is on cpu, that is X - Y.
+	 *
+	 * Queue_self is the former (Y) while queue_other is the latter (X - Y).
+	 *
+	 * large value in queue_self is an indication of contention within a
+	 * cgroup; while large value in queue_other is an indication of
+	 * contention from multiple cgroups.
+	 */
+	u64 queue_self;
+	u64 queue_other;
+};
+
+struct timestamp {
+	/* timestamp when last queued */
+	u64 tsp;
+
+	/* cgroup exec_clock when last queued */
+	u64 exec_clock;
+};
+
+/* Map to store per-cgroup wait latency */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, u64);
+	__type(value, struct wait_lat);
+	__uint(max_entries, 65532);
+} cgroup_lat SEC(".maps");
+
+/* Map to store per-task queue timestamp */
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct timestamp);
+} start SEC(".maps");
+
+/* adapt from task_cfs_rq in kernel/sched/sched.h */
+__always_inline
+struct cfs_rq *task_cfs_rq(struct task_struct *t)
+{
+	if (!CONFIG_FAIR_GROUP_SCHED)
+		return NULL;
+
+	return BPF_CORE_READ(&t->se, cfs_rq);
+}
+
+/* record enqueue timestamp */
+__always_inline
+static int trace_enqueue(struct task_struct *t)
+{
+	u32 pid = t->pid;
+	struct timestamp *ptr;
+	struct cfs_rq *cfs_rq;
+
+	if (!pid)
+		return 0;
+
+	/* only measure for CFS tasks */
+	if (t->sched_class != &fair_sched_class)
+		return 0;
+
+	ptr = bpf_task_storage_get(&start, t, 0,
+				   BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (!ptr)
+		return 0;
+
+	/* CONFIG_FAIR_GROUP_SCHED may not be enabled */
+	cfs_rq = task_cfs_rq(t);
+	if (!cfs_rq)
+		return 0;
+
+	ptr->tsp = bpf_ktime_get_ns();
+	ptr->exec_clock = BPF_CORE_READ(cfs_rq, exec_clock);
+	return 0;
+}
+
+SEC("tp_btf/sched_wakeup")
+int handle__sched_wakeup(u64 *ctx)
+{
+	/* TP_PROTO(struct task_struct *p) */
+	struct task_struct *p = (void *)ctx[0];
+
+	return trace_enqueue(p);
+}
+
+SEC("tp_btf/sched_wakeup_new")
+int handle__sched_wakeup_new(u64 *ctx)
+{
+	/* TP_PROTO(struct task_struct *p) */
+	struct task_struct *p = (void *)ctx[0];
+
+	return trace_enqueue(p);
+}
+
+/* task_group() from kernel/sched/sched.h */
+__always_inline
+struct task_group *task_group(struct task_struct *p)
+{
+	if (!CONFIG_CGROUP_SCHED)
+		return NULL;
+
+	return BPF_CORE_READ(p, sched_task_group);
+}
+
+__always_inline
+struct cgroup *task_cgroup(struct task_struct *p)
+{
+	struct task_group *tg;
+
+	tg = task_group(p);
+	if (!tg)
+		return NULL;
+
+	return BPF_CORE_READ(tg, css).cgroup;
+}
+
+__always_inline
+u64 max(u64 x, u64 y)
+{
+	return x > y ? x : y;
+}
+
+SEC("tp_btf/sched_switch")
+int handle__sched_switch(u64 *ctx)
+{
+	/* TP_PROTO(bool preempt, struct task_struct *prev,
+	 *	    struct task_struct *next)
+	 */
+	struct task_struct *prev = (struct task_struct *)ctx[1];
+	struct task_struct *next = (struct task_struct *)ctx[2];
+	u64 delta, delta_self, delta_other, id;
+	struct cfs_rq *cfs_rq;
+	struct timestamp *tsp;
+	struct wait_lat *lat;
+	struct cgroup *cgroup;
+
+	/* ivcsw: treat like an enqueue event and store timestamp */
+	if (prev->__state == TASK_RUNNING)
+		trace_enqueue(prev);
+
+	/* only measure for CFS tasks */
+	if (next->sched_class != &fair_sched_class)
+		return 0;
+
+	/* fetch timestamp and calculate delta */
+	tsp = bpf_task_storage_get(&start, next, 0, 0);
+	if (!tsp)
+		return 0;   /* missed enqueue */
+
+	/* CONFIG_FAIR_GROUP_SCHED may not be enabled */
+	cfs_rq = task_cfs_rq(next);
+	if (!cfs_rq)
+		return 0;
+
+	/* cpu controller may not be enabled */
+	cgroup = task_cgroup(next);
+	if (!cgroup)
+		return 0;
+
+	/* calculate self delay and other delay */
+	delta = bpf_ktime_get_ns() - tsp->tsp;
+	delta_self = BPF_CORE_READ(cfs_rq, exec_clock) - tsp->exec_clock;
+	if (delta_self > delta)
+		delta_self = delta;
+	delta_other = delta - delta_self;
+
+	/* insert into cgroup_lat map */
+	id = BPF_CORE_READ(cgroup, kn, id);
+	lat = bpf_map_lookup_elem(&cgroup_lat, &id);
+	if (!lat) {
+		struct wait_lat w = {
+			.queue_self = delta_self,
+			.queue_other = delta_other,
+		};
+
+		bpf_map_update_elem(&cgroup_lat, &id, &w, BPF_ANY);
+	} else {
+		lat->queue_self = max(delta_self, lat->queue_self);
+		lat->queue_other = max(delta_other, lat->queue_other);
+	}
+
+	bpf_task_storage_delete(&start, next);
+	return 0;
+}
+
+SEC("iter/cgroup")
+int dump_cgroup(struct bpf_iter__cgroup *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	struct cgroup *cgroup = ctx->cgroup;
+	struct wait_lat *lat;
+	u64 id = cgroup->kn->id;
+
+	BPF_SEQ_PRINTF(seq, "cgroup_id: %8lu\n", id);
+	lat = bpf_map_lookup_elem(&cgroup_lat, &id);
+	if (lat) {
+		BPF_SEQ_PRINTF(seq, "queue_self: %8lu\n", lat->queue_self);
+		BPF_SEQ_PRINTF(seq, "queue_other: %8lu\n", lat->queue_other);
+	} else {
+		/* print anyway for universal parsing logic in userspace. */
+		BPF_SEQ_PRINTF(seq, "queue_self: %8d\n", 0);
+		BPF_SEQ_PRINTF(seq, "queue_other: %8d\n", 0);
+	}
+	return 0;
+}
-- 
2.35.1.574.g5d30c73bfb-goog