Re: [PATCH v2 bpf-next 1/3] bpf: Introduce bpf_timer

Yonghong Song <yhs@xxxxxx> · Fri, 11 Jun 2021 15:12:05 -0700

On 6/10/21 9:24 PM, Alexei Starovoitov wrote:
From: Alexei Starovoitov <ast@xxxxxxxxxx>

Introduce 'struct bpf_timer { __u64 :64; __u64 :64; };' that can be embedded
in hash/array/lru maps as regular field and helpers to operate on it:

// Initialize the timer to call 'callback_fn' static function
// First 4 bits of 'flags' specify clockid.
// Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed.
long bpf_timer_init(struct bpf_timer *timer, void *callback_fn, int flags);

// Start the timer and set its expiration 'nsec' nanoseconds from the current time.
long bpf_timer_start(struct bpf_timer *timer, u64 nsec);

// Cancel the timer and wait for callback_fn to finish if it was running.
long bpf_timer_cancel(struct bpf_timer *timer);

Here is how BPF program might look like:
struct map_elem {
     int counter;
     struct bpf_timer timer;
};

struct {
     __uint(type, BPF_MAP_TYPE_HASH);
     __uint(max_entries, 1000);
     __type(key, int);
     __type(value, struct map_elem);
} hmap SEC(".maps");

static int timer_cb(void *map, int *key, struct map_elem *val);
/* val points to particular map element that contains bpf_timer. */

SEC("fentry/bpf_fentry_test1")
int BPF_PROG(test1, int a)
{
     struct map_elem *val;
     int key = 0;

     val = bpf_map_lookup_elem(&hmap, &key);
     if (val) {
         bpf_timer_init(&val->timer, timer_cb, CLOCK_REALTIME);
         bpf_timer_start(&val->timer, 1000 /* call timer_cb2 in 1 usec */);
     }
}

This patch adds helper implementations that rely on hrtimers
to call bpf functions as timers expire.
The following patch adds necessary safety checks.

Only programs with CAP_BPF are allowed to use bpf_timer.

The amount of timers used by the program is constrained by
the memcg recorded at map creation time.

The bpf_timer_init() helper is receiving hidden 'map' and 'prog' arguments
supplied by the verifier. The prog pointer is needed to do refcnting of bpf
program to make sure that program doesn't get freed while timer is armed.

The bpf_map_delete_elem() and bpf_map_update_elem() operations cancel
and free the timer if given map element had it allocated.
"bpftool map update" command can be used to cancel timers.

Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxx>
---
  include/linux/bpf.h            |   2 +
  include/uapi/linux/bpf.h       |  40 ++++++
  kernel/bpf/helpers.c           | 227 +++++++++++++++++++++++++++++++++
  kernel/bpf/verifier.c          | 109 ++++++++++++++++
  kernel/trace/bpf_trace.c       |   2 +-
  scripts/bpf_doc.py             |   2 +
  tools/include/uapi/linux/bpf.h |  40 ++++++
  7 files changed, 421 insertions(+), 1 deletion(-)

[...]

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2c1ba70abbf1..d25bbcdad8e6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4778,6 +4778,38 @@ union bpf_attr {
   * 		Execute close syscall for given FD.
   * 	Return
   * 		A syscall result.
+ *
+ * long bpf_timer_init(struct bpf_timer *timer, void *callback_fn, int flags)
+ *	Description
+ *		Initialize the timer to call *callback_fn* static function.
+ *		First 4 bits of *flags* specify clockid. Only CLOCK_MONOTONIC,
+ *		CLOCK_REALTIME, CLOCK_BOOTTIME are allowed.
+ *		All other bits of *flags* are reserved.
+ *	Return
+ *		0 on success.
+ *		**-EBUSY** if *timer* is already initialized.
+ *		**-EINVAL** if invalid *flags* are passed.
+ *
+ * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs)
+ *	Description
+ *		Start the timer and set its expiration N nanoseconds from the
+ *		current time. The timer callback_fn will be invoked in soft irq
+ *		context on some cpu and will not repeat unless another
+ *		bpf_timer_start() is made. In such case the next invocation can
+ *		migrate to a different cpu.
+ *	Return
+ *		0 on success.
+ *		**-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
+ *
+ * long bpf_timer_cancel(struct bpf_timer *timer)
+ *	Description
+ *		Cancel the timer and wait for callback_fn to finish if it was running.
+ *	Return
+ *		0 if the timer was not active.
+ *		1 if the timer was active.
+ *		**-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
+ *		**-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its own timer
+ *		which would have led to a deadlock otherwise.
   */
  #define __BPF_FUNC_MAPPER(FN)		\
  	FN(unspec),			\
@@ -4949,6 +4981,9 @@ union bpf_attr {
  	FN(sys_bpf),			\
  	FN(btf_find_by_name_kind),	\
  	FN(sys_close),			\
+	FN(timer_init),			\
+	FN(timer_start),		\
+	FN(timer_cancel),		\
  	/* */
  
  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -6061,6 +6096,11 @@ struct bpf_spin_lock {
  	__u32	val;
  };
  
+struct bpf_timer {
+	__u64 :64;
+	__u64 :64;
+};
+
  struct bpf_sysctl {
  	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
  				 * Allows 1,2,4-byte read, but no write.
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 544773970dbc..3a693d451ca3 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -985,6 +985,227 @@ const struct bpf_func_proto bpf_snprintf_proto = {
  	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
  };
  
+struct bpf_hrtimer {
+	struct hrtimer timer;
+	struct bpf_map *map;
+	struct bpf_prog *prog;
+	void *callback_fn;
+	void *value;
+};
+
+/* the actual struct hidden inside uapi struct bpf_timer */
+struct bpf_timer_kern {
+	struct bpf_hrtimer *timer;
+	struct bpf_spin_lock lock;
+};

Looks like in 32bit system, sizeof(struct bpf_timer_kern) is 64
and sizeof(struct bpf_timer) is 128.

struct bpf_spin_lock {
        __u32   val;
};

struct bpf_timer {
	__u64 :64;
	__u64 :64;
};

Checking the code, we may not have issues as structure
"bpf_timer" is only used to reserve spaces and
map copy value routine handles that properly.

Maybe we can still make it consistent with
two fields in bpf_timer_kern mapping to
two fields in bpf_timer?

struct bpf_timer_kern {
	__bpf_md_ptr(struct bpf_hrtimer *, timer);
	struct bpf_spin_lock lock;
};

+
+static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
+
+static enum hrtimer_restart bpf_timer_cb(struct hrtimer *timer)
+{
+	struct bpf_hrtimer *t = container_of(timer, struct bpf_hrtimer, timer);
+	struct bpf_prog *prog = t->prog;
+	struct bpf_map *map = t->map;
+	void *key;
+	u32 idx;
+	int ret;
+
+	/* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and
+	 * cannot be preempted by another bpf_timer_cb() on the same cpu.
+	 * Remember the timer this callback is servicing to prevent
+	 * deadlock if callback_fn() calls bpf_timer_cancel() on the same timer.
+	 */
+	this_cpu_write(hrtimer_running, t);
+	if (map->map_type == BPF_MAP_TYPE_ARRAY) {
+		struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+		/* compute the key */
+		idx = ((char *)t->value - array->value) / array->elem_size;
+		key = &idx;
+	} else { /* hash or lru */
+		key = t->value - round_up(map->key_size, 8);
+	}
+
+	ret = BPF_CAST_CALL(t->callback_fn)((u64)(long)map,
+					    (u64)(long)key,
+					    (u64)(long)t->value, 0, 0);
+	WARN_ON(ret != 0); /* Next patch disallows 1 in the verifier */
+
+	/* The bpf function finished executed. Drop the prog refcnt.
+	 * It could reach zero here and trigger free of bpf_prog
+	 * and subsequent free of the maps that were holding timers.
+	 * If callback_fn called bpf_timer_start on this timer
+	 * the prog refcnt will be > 0.
+	 *
+	 * If callback_fn deleted map element the 't' could have been freed,
+	 * hence t->prog deref is done earlier.
+	 */
+	bpf_prog_put(prog);
+	this_cpu_write(hrtimer_running, NULL);
+	return HRTIMER_NORESTART;
+}
+
+BPF_CALL_5(bpf_timer_init, struct bpf_timer_kern *, timer, void *, cb, int, flags,
+	   struct bpf_map *, map, struct bpf_prog *, prog)
+{
[...]