[PATCH v2] perf trace: Add --syscall-sample option

Namhyung Kim <namhyung@xxxxxxxxxx> · Thu, 19 Dec 2024 10:34:00 -0800

This option is to implement the event sampling for system calls in BPF.
When it's used, it picks up a syscall in the given sampling period and
discard others.  The period is in msec as it shows the time in msec.

  # perf trace -C 0 --syscall-sample 100 sleep 1
           ? (         ): fleetspeakd/1828559  ... [continued]: futex())                                            = -1 ETIMEDOUT (Connection timed out)
       0.050 (100.247 ms): gnome-shell/572531 recvmsg(fd: 10<socket:[3355761]>, msg: 0x7ffef8b39d20)                = 40
     100.357 (100.149 ms): pipewire-pulse/572245 read(fd: 5<anon_inode:[eventfd]>, buf: 0x7ffc0b9dc8f0, count: 8)      = 8
     200.553 (100.268 ms): NetworkManager/3424 epoll_wait(epfd: 19<anon_inode:[eventpoll]>, events: 0x5607b85bb880, maxevents: 6) = 0
     300.876 (         ): mon/4932 poll(ufds: 0x7fa392784df0, nfds: 1, timeout_msecs: 100)            ...
     400.901 ( 0.025 ms): TaskCon~ller #/620145 futex(uaddr: 0x7f3fc596fa00, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
     300.876 (100.123 ms): mon/4932  ... [continued]: poll())                                             = 0 (Timeout)
     500.901 ( 0.012 ms): evdefer/2/2335122 futex(uaddr: 0x5640baac5198, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
     602.701 ( 0.017 ms): Compositor/1992200 futex(uaddr: 0x7f1a51dfdd40, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
     705.589 ( 0.017 ms): JS Watchdog/947933 futex(uaddr: 0x7f4cac1d4240, op: WAKE|PRIVATE_FLAG, val: 1)           = 0
     812.667 ( 0.027 ms): fix/1985151 futex(uaddr: 0xc0008f7148, op: WAKE|PRIVATE_FLAG, val: 1)             = 1
     912.807 ( 0.017 ms): Xorg/572315 setitimer(value: 0x7ffc375d6ba0)                                      = 0

The timestamp is kept in a per-cpu array and the allowed task is saved
in a BPF hash map.  For non-BPF use cases, it won't work so an error
message would be displayed.

  # perf trace --syscall-sample 100 sleep 1
  ERROR: --syscall-sample works only for BPF

Signed-off-by: Namhyung Kim <namhyung@xxxxxxxxxx>
---
v2 changes)
* rename to --syscall-sample and update the description
* print error when BPF is not available  (Arnaldo)
* rename to sample_period_ns  (Ian)

 tools/perf/Documentation/perf-trace.txt       |  6 ++
 tools/perf/builtin-trace.c                    | 11 +++
 .../bpf_skel/augmented_raw_syscalls.bpf.c     | 67 ++++++++++++++++++-
 3 files changed, 82 insertions(+), 2 deletions(-)

diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
index 6e0cc50bbc13fc7f..e8a38ecc5eddab1c 100644
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -241,6 +241,12 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
 	printing using the existing 'perf trace' syscall arg beautifiers to map integer
 	arguments to strings (pid to comm, syscall id to syscall name, etc).
 
+--syscall-sample=<period>::
+	Enable sampling of system calls with a given period in msec.
+	The sampling frequency would be 1 / period, in other words,
+	it will trace a system call only after the given period of
+	time is passed.  The sampling period is tracked per CPU.
+
 
 PAGEFAULTS
 ----------
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 6a1a128fe645014d..e70e634fbfaf33f5 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -185,6 +185,7 @@ struct trace {
 	} stats;
 	unsigned int		max_stack;
 	unsigned int		min_stack;
+	unsigned long		sample_period_ms;
 	int			raw_augmented_syscalls_args_size;
 	bool			raw_augmented_syscalls;
 	bool			fd_path_disabled;
@@ -5186,6 +5187,7 @@ int cmd_trace(int argc, const char **argv)
 		     "start"),
 	OPT_BOOLEAN(0, "force-btf", &trace.force_btf, "Prefer btf_dump general pretty printer"
 		       "to customized ones"),
+	OPT_ULONG(0, "syscall-sample", &trace.sample_period_ms, "syscall sampling period in ms"),
 	OPTS_EVSWITCH(&trace.evswitch),
 	OPT_END()
 	};
@@ -5293,6 +5295,9 @@ int cmd_trace(int argc, const char **argv)
 				bpf_program__set_autoattach(prog, /*autoattach=*/false);
 		}
 
+		if (trace.sample_period_ms)
+			trace.skel->rodata->sample_period_ns = trace.sample_period_ms * NSEC_PER_MSEC;
+
 		err = augmented_raw_syscalls_bpf__load(trace.skel);
 
 		if (err < 0) {
@@ -5313,6 +5318,12 @@ int cmd_trace(int argc, const char **argv)
 	trace.syscalls.events.bpf_output = evlist__last(trace.evlist);
 	assert(evsel__name_is(trace.syscalls.events.bpf_output, "__augmented_syscalls__"));
 skip_augmentation:
+#else
+	if (trace.sample_period_ms) {
+		pr_err("ERROR: --syscall-sample works only for BPF\n");
+		err = -EINVAL;
+		goto out;
+	}
 #endif
 	err = -1;
 
diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
index 4a62ed593e84edf8..7027bec55298191d 100644
--- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
+++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
@@ -113,6 +113,22 @@ struct pids_filtered {
 	__uint(max_entries, 64);
 } pids_filtered SEC(".maps");
 
+struct sample_timestamp {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__type(key, int);
+	__type(value, __u64);
+	__uint(max_entries, 1);
+} sample_timestamp SEC(".maps");
+
+struct sample_filtered {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, pid_t);
+	__type(value, bool);
+	__uint(max_entries, MAX_CPUS);
+} sample_filtered SEC(".maps");
+
+const volatile __u64 sample_period_ns;
+
 struct augmented_args_payload {
 	struct syscall_enter_args args;
 	struct augmented_arg arg, arg2; // We have to reserve space for two arguments (rename, etc)
@@ -428,6 +444,44 @@ static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
 	return bpf_map_lookup_elem(pids, &pid) != NULL;
 }
 
+static bool sample_filter__allow_enter(__u64 timestamp, pid_t pid)
+{
+	int idx = 0;
+	__u64 *prev_ts;
+	bool ok = true;
+
+	/* default behavior */
+	if (sample_period_ns == 0)
+		return true;
+
+	prev_ts = bpf_map_lookup_elem(&sample_timestamp, &idx);
+
+	if (prev_ts) {
+		if ((*prev_ts + sample_period_ns) > timestamp)
+			return false;
+		*prev_ts = timestamp;
+	} else {
+		bpf_map_update_elem(&sample_timestamp, &idx, &timestamp, BPF_ANY);
+	}
+
+	bpf_map_update_elem(&sample_filtered, &pid, &ok, BPF_ANY);
+
+	return true;
+}
+
+static bool sample_filter__allow_exit(pid_t pid)
+{
+	/* default behavior */
+	if (sample_period_ns == 0)
+		return true;
+
+	if (!bpf_map_lookup_elem(&sample_filtered, &pid))
+		return false;
+
+	bpf_map_delete_elem(&sample_filtered, &pid);
+	return true;
+}
+
 static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
 {
 	bool augmented, do_output = false;
@@ -526,7 +580,9 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
 SEC("tp/raw_syscalls/sys_enter")
 int sys_enter(struct syscall_enter_args *args)
 {
+	pid_t pid = getpid();
 	struct augmented_args_payload *augmented_args;
+
 	/*
 	 * We start len, the amount of data that will be in the perf ring
 	 * buffer, if this is not filtered out by one of pid_filter__has(),
@@ -537,7 +593,10 @@ int sys_enter(struct syscall_enter_args *args)
 	 * initial, non-augmented raw_syscalls:sys_enter payload.
 	 */
 
-	if (pid_filter__has(&pids_filtered, getpid()))
+	if (pid_filter__has(&pids_filtered, pid))
+		return 0;
+
+	if (!sample_filter__allow_enter(bpf_ktime_get_ns(), pid))
 		return 0;
 
 	augmented_args = augmented_args_payload();
@@ -561,9 +620,13 @@ int sys_enter(struct syscall_enter_args *args)
 SEC("tp/raw_syscalls/sys_exit")
 int sys_exit(struct syscall_exit_args *args)
 {
+	pid_t pid = getpid();
 	struct syscall_exit_args exit_args;
 
-	if (pid_filter__has(&pids_filtered, getpid()))
+	if (pid_filter__has(&pids_filtered, pid))
+		return 0;
+
+	if (!sample_filter__allow_exit(pid))
 		return 0;
 
 	bpf_probe_read_kernel(&exit_args, sizeof(exit_args), args);
-- 
2.47.1.613.gc27f4b7a9f-goog