Re: [PATCH v3 bpf-next 1/8] bpf: Introduce bpf timers.

On 6/23/21 7:25 PM, Alexei Starovoitov wrote:
From: Alexei Starovoitov <ast@xxxxxxxxxx>

Introduce 'struct bpf_timer { __u64 :64; __u64 :64; };' that can be embedded
in hash/array/lru maps as a regular field and helpers to operate on it:

// Initialize the timer.
// First 4 bits of 'flags' specify clockid.
// Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed.
long bpf_timer_init(struct bpf_timer *timer, int flags);

// Arm the timer to call the static function callback_fn and set its
// expiration 'nsec' nanoseconds from the current time.
long bpf_timer_start(struct bpf_timer *timer, void *callback_fn, u64 nsec);

// Cancel the timer and wait for callback_fn to finish if it was running.
long bpf_timer_cancel(struct bpf_timer *timer);

Here is how a BPF program might look:
struct map_elem {
     int counter;
     struct bpf_timer timer;
};

struct {
     __uint(type, BPF_MAP_TYPE_HASH);
     __uint(max_entries, 1000);
     __type(key, int);
     __type(value, struct map_elem);
} hmap SEC(".maps");

static int timer_cb(void *map, int *key, struct map_elem *val);
/* val points to the particular map element that contains bpf_timer. */

SEC("fentry/bpf_fentry_test1")
int BPF_PROG(test1, int a)
{
     struct map_elem *val;
     int key = 0;

     val = bpf_map_lookup_elem(&hmap, &key);
     if (val) {
         bpf_timer_init(&val->timer, CLOCK_REALTIME);
         bpf_timer_start(&val->timer, timer_cb, 1000 /* call timer_cb in 1 usec */);
     }
     return 0;
}
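
For completeness, a minimal sketch of what the forward-declared timer_cb()
might look like (hypothetical body; the commit message only shows the
declaration):

static int timer_cb(void *map, int *key, struct map_elem *val)
{
     /* val points to the map element whose embedded bpf_timer fired */
     val->counter++;
     /* optionally re-arm the same timer for another 1 usec */
     bpf_timer_start(&val->timer, timer_cb, 1000);
     return 0;
}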

This patch adds helper implementations that rely on hrtimers
to call bpf functions as timers expire.
The following patches add necessary safety checks.

Only programs with CAP_BPF are allowed to use bpf_timer.

The number of timers used by the program is constrained by
the memcg recorded at map creation time.

The bpf_timer_init() helper receives a hidden 'map' argument and
bpf_timer_start() receives a hidden 'prog' argument supplied by the verifier.
The prog pointer is needed to do refcnting of bpf program to make sure that
program doesn't get freed while the timer is armed. This apporach relies on

apporach -> approach

"user refcnt" scheme used in prog_array that stores bpf programs for
bpf_tail_call. bpf_timer_start() increments the prog refcnt, which is
paired with bpf_timer_cancel() dropping it. The
ops->map_release_uref callback is responsible for cancelling the timers and
dropping the prog refcnt when the user space reference to a map reaches zero.
This uref approach makes sure that Ctrl-C of a user space process will
not leave timers running forever, unless user space explicitly pinned a map
containing timers in bpffs.

The bpf_map_delete_elem() and bpf_map_update_elem() operations cancel
and free the timer if the given map element had one allocated.
"bpftool map update" command can be used to cancel timers.

The 'struct bpf_timer' is explicitly __attribute__((aligned(8))) because
'__u64 :64' is 8 bytes of padding with only 1 byte alignment.
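
Concretely, the uapi definition presumably ends up as:

struct bpf_timer {
     __u64 :64;
     __u64 :64;
} __attribute__((aligned(8)));

On ABIs where unnamed bit-fields do not contribute to struct alignment
(e.g. x86-64 SysV), the two anonymous members reserve 16 bytes of storage
but would leave the struct 1-byte aligned without the explicit attribute.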

Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxx>
---
  include/linux/bpf.h            |   3 +
  include/uapi/linux/bpf.h       |  55 +++++++
  kernel/bpf/helpers.c           | 281 +++++++++++++++++++++++++++++++++
  kernel/bpf/verifier.c          | 138 ++++++++++++++++
  kernel/trace/bpf_trace.c       |   2 +-
  scripts/bpf_doc.py             |   2 +
  tools/include/uapi/linux/bpf.h |  55 +++++++
  7 files changed, 535 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f309fc1509f2..72da9d4d070c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -168,6 +168,7 @@ struct bpf_map {
  	u32 max_entries;
  	u32 map_flags;
  	int spin_lock_off; /* >=0 valid offset, <0 error */
+	int timer_off; /* >=0 valid offset, <0 error */
  	u32 id;
  	int numa_node;
  	u32 btf_key_type_id;
@@ -221,6 +222,7 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
  }
  void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
  			   bool lock_src);
+void bpf_timer_cancel_and_free(void *timer);
  int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size);
struct bpf_offload_dev;
@@ -314,6 +316,7 @@ enum bpf_arg_type {
  	ARG_PTR_TO_FUNC,	/* pointer to a bpf program function */
  	ARG_PTR_TO_STACK_OR_NULL,	/* pointer to stack or NULL */
  	ARG_PTR_TO_CONST_STR,	/* pointer to a null terminated read-only string */
+	ARG_PTR_TO_TIMER,	/* pointer to bpf_timer */
  	__BPF_ARG_TYPE_MAX,
  };
[...]
+
+static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
+
+static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
+{
+	struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
+	struct bpf_map *map = t->map;
+	void *value = t->value;
+	struct bpf_timer_kern *timer = value + map->timer_off;
+	struct bpf_prog *prog;
+	void *callback_fn;
+	void *key;
+	u32 idx;
+	int ret;
+
+	____bpf_spin_lock(&timer->lock);

I think we may still have some issues.
Case 1:
  1. one bpf program is running in process context; it calls
     bpf_timer_start() and timer->lock is taken
  2. the timer softirq is triggered on the same cpu and this callback is
     called

Case 2:
  1. this callback is called and timer->lock is taken
  2. an nmi happens and some bpf program is called (kprobe, tracepoint,
     fentry/fexit or perf_event, etc.) and that program calls
     bpf_timer_start()

So we could have a deadlock in both of the above cases?
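
To spell out Case 1 on a single cpu, a sketch of the interleaving
(assuming ____bpf_spin_lock() does not disable softirqs):

/* process context                     hrtimer softirq, same cpu
 * ---------------                     -------------------------
 * bpf_timer_start()
 *   ____bpf_spin_lock(&timer->lock)
 *                                     bpf_timer_cb()
 *                                       ____bpf_spin_lock(&timer->lock)
 *                                       -> spins forever; the lock owner
 *                                          cannot run again until the
 *                                          softirq returns
 */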

+	/* callback_fn and prog need to match. They're updated together
+	 * and have to be read under lock.
+	 */
+	prog = t->prog;
+	callback_fn = t->callback_fn;
+
+	/* wrap bpf subprog invocation with prog->refcnt++ and -- to make
+	 * sure that refcnt doesn't become zero when subprog is executing.
+	 * Do it under lock to make sure that bpf_timer_start doesn't drop
+	 * prev prog refcnt to zero before timer_cb has a chance to bump it.
+	 */
+	bpf_prog_inc(prog);
+	____bpf_spin_unlock(&timer->lock);
+
+	/* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and
+	 * cannot be preempted by another bpf_timer_cb() on the same cpu.
+	 * Remember the timer this callback is servicing to prevent
+	 * deadlock if callback_fn() calls bpf_timer_cancel() on the same timer.
+	 */
+	this_cpu_write(hrtimer_running, t);

This write is not protected by the spinlock. In bpf_timer_cancel() and
bpf_timer_cancel_and_free() the read is spinlock protected, so are there
potential race conditions if the callback function and
bpf_timer_cancel()/bpf_timer_cancel_and_free() run in different contexts?
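
To make the window concrete, one problematic ordering on a single cpu
(a sketch, assuming an nmi-attached bpf program can fire between the
unlock and the write):

/* softirq: bpf_timer_cb()
 *   bpf_prog_inc(prog);
 *   ____bpf_spin_unlock(&timer->lock);
 *     <- nmi fires here; an attached bpf program calls
 *        bpf_timer_cancel(), which takes timer->lock, sees
 *        hrtimer_running == NULL and proceeds to hrtimer_cancel(),
 *        waiting for this very callback to finish
 *   this_cpu_write(hrtimer_running, t);
 */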

+	if (map->map_type == BPF_MAP_TYPE_ARRAY) {
+		struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+		/* compute the key */
+		idx = ((char *)value - array->value) / array->elem_size;
+		key = &idx;
+	} else { /* hash or lru */
+		key = value - round_up(map->key_size, 8);
+	}
+
+	ret = BPF_CAST_CALL(callback_fn)((u64)(long)map,
+					 (u64)(long)key,
+					 (u64)(long)value, 0, 0);
+	WARN_ON(ret != 0); /* Next patch moves this check into the verifier */
+	bpf_prog_put(prog);
+
+	this_cpu_write(hrtimer_running, NULL);
+	return HRTIMER_NORESTART;
+}
+
+BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, u64, flags,
+	   struct bpf_map *, map)
+{
+	clockid_t clockid = flags & (MAX_CLOCKS - 1);
+	struct bpf_hrtimer *t;
+	int ret = 0;
+
+	BUILD_BUG_ON(MAX_CLOCKS != 16);
+	BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer));
+	BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer));
+
+	if (flags >= MAX_CLOCKS ||
+	    /* similar to timerfd except _ALARM variants are not supported */
+	    (clockid != CLOCK_MONOTONIC &&
+	     clockid != CLOCK_REALTIME &&
+	     clockid != CLOCK_BOOTTIME))
+		return -EINVAL;
+	____bpf_spin_lock(&timer->lock);
+	t = timer->timer;
+	if (t) {
+		ret = -EBUSY;
+		goto out;
+	}
+	/* allocate hrtimer via map_kmalloc to use memcg accounting */
+	t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, NUMA_NO_NODE);
+	if (!t) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	t->value = (void *)timer - map->timer_off;
+	t->map = map;
+	t->prog = NULL;
+	t->callback_fn = NULL;
+	hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
+	t->timer.function = bpf_timer_cb;
+	timer->timer = t;
+out:
+	____bpf_spin_unlock(&timer->lock);
+	return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_init_proto = {
+	.func		= bpf_timer_init,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_TIMER,
+	.arg2_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_timer_start, struct bpf_timer_kern *, timer, void *, callback_fn,
+	   u64, nsecs, struct bpf_prog *, prog)
+{
+	struct bpf_hrtimer *t;
+	struct bpf_prog *prev;
+	int ret = 0;
+
+	____bpf_spin_lock(&timer->lock);
+	t = timer->timer;
+	if (!t) {
+		ret = -EINVAL;
+		goto out;
+	}
+	prev = t->prog;
+	if (prev != prog) {
+		if (prev)
+			/* Drop pref prog refcnt when swapping with new prog */

pref -> prev

+			bpf_prog_put(prev);

Maybe we want to wrap the above two lines in {}?

+		/* Bump prog refcnt once.
+		 * Every bpf_timer_start() can pick different callback_fn-s
+		 * within the same prog.
+		 */
+		bpf_prog_inc(prog);
+		t->prog = prog;
+	}
+	t->callback_fn = callback_fn;
+	hrtimer_start(&t->timer, ns_to_ktime(nsecs), HRTIMER_MODE_REL_SOFT);
+out:
+	____bpf_spin_unlock(&timer->lock);
+	return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_start_proto = {
+	.func		= bpf_timer_start,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_TIMER,
+	.arg2_type	= ARG_PTR_TO_FUNC,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+static void drop_prog_refcnt(struct bpf_hrtimer *t)
+{
+	struct bpf_prog *prog = t->prog;
+
+	if (prog) {
+		/* If timer was armed with bpf_timer_start()
+		 * drop prog refcnt.
+		 */
+		bpf_prog_put(prog);
+		t->prog = NULL;
+		t->callback_fn = NULL;
+	}
+}
+
+BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
+{
+	struct bpf_hrtimer *t;
+	int ret = 0;
+
+	____bpf_spin_lock(&timer->lock);
+	t = timer->timer;
+	if (!t) {
+		ret = -EINVAL;
+		goto out;
+	}
+	if (this_cpu_read(hrtimer_running) == t) {
+		/* If bpf callback_fn is trying to bpf_timer_cancel()
+		 * its own timer the hrtimer_cancel() will deadlock
+		 * since it waits for callback_fn to finish
+		 */
+		ret = -EDEADLK;
+		goto out;
+	}
+	/* Cancel the timer and wait for associated callback to finish
+	 * if it was running.
+	 */
+	ret = hrtimer_cancel(&t->timer);
+	drop_prog_refcnt(t);
+out:
+	____bpf_spin_unlock(&timer->lock);
+	return ret;
+}
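
For illustration, a bpf-side callback hitting the -EDEADLK branch might
look like this (hypothetical sketch; names are made up):

static int self_cancel_cb(void *map, int *key, struct map_elem *val)
{
	/* hrtimer_running == this timer, so instead of deadlocking
	 * inside hrtimer_cancel() the helper returns -EDEADLK
	 */
	long err = bpf_timer_cancel(&val->timer);

	/* err == -EDEADLK here */
	return 0;
}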
+
+static const struct bpf_func_proto bpf_timer_cancel_proto = {
+	.func		= bpf_timer_cancel,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_TIMER,
+};
+
+/* This function is called by map_delete/update_elem for individual element.
+ * By ops->map_release_uref when the user space reference to a map reaches zero
+ * and by ops->map_free when the kernel reference reaches zero.
+ */
+void bpf_timer_cancel_and_free(void *val)
+{
+	struct bpf_timer_kern *timer = val;
+	struct bpf_hrtimer *t;
+
+	/* Performance optimization: read timer->timer without lock first. */
+	if (!READ_ONCE(timer->timer))
+		return;
+
+	____bpf_spin_lock(&timer->lock);
+	/* re-read it under lock */
+	t = timer->timer;
+	if (!t)
+		goto out;
+	/* Cancel the timer and wait for callback to complete if it was running.
+	 * Check that bpf_map_delete/update_elem() wasn't called from timer callback_fn.
+	 * In such case don't call hrtimer_cancel() (since it will deadlock)
+	 * and don't call hrtimer_try_to_cancel() (since it will just return -1).
+	 * Instead free the timer and set timer->timer = NULL.
+	 * The subsequent bpf_timer_start/cancel() helpers won't be able to use it,
+	 * since it won't be initialized.
+	 * In preallocated maps it's safe to do timer->timer = NULL.
+	 * The memory could be reused for another map element while current
+	 * callback_fn can do bpf_timer_init() on it.
+	 * In non-preallocated maps bpf_timer_cancel_and_free and
+	 * timer->timer = NULL will happen after callback_fn completes, since
+	 * program execution is an RCU critical section.
+	 */
+	if (this_cpu_read(hrtimer_running) != t)
+		hrtimer_cancel(&t->timer);

We could still have race conditions here when bpf_timer_cancel_and_free()
runs in process context and the callback runs in softirq context. I guess
we might be okay.

But if bpf_timer_cancel_and_free() runs in nmi context, I am not 100% sure
whether we have issues or not.

+	drop_prog_refcnt(t);
+	kfree(t);
+	timer->timer = NULL;
+out:
+	____bpf_spin_unlock(&timer->lock);
+}
+
  const struct bpf_func_proto bpf_get_current_task_proto __weak;
  const struct bpf_func_proto bpf_probe_read_user_proto __weak;
  const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
@@ -1055,6 +1330,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
[...]


