Re: [PATCH v2 bpf-next 1/3] bpf: Introduce bpf_timer

Yonghong Song <yhs@xxxxxx> · Mon, 14 Jun 2021 09:51:37 -0700

On 6/10/21 9:24 PM, Alexei Starovoitov wrote:
From: Alexei Starovoitov <ast@xxxxxxxxxx>

Introduce 'struct bpf_timer { __u64 :64; __u64 :64; };' that can be embedded
in hash/array/lru maps as regular field and helpers to operate on it:

// Initialize the timer to call 'callback_fn' static function
// First 4 bits of 'flags' specify clockid.
// Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed.
long bpf_timer_init(struct bpf_timer *timer, void *callback_fn, int flags);

// Start the timer and set its expiration 'nsec' nanoseconds from the current time.
long bpf_timer_start(struct bpf_timer *timer, u64 nsec);

// Cancel the timer and wait for callback_fn to finish if it was running.
long bpf_timer_cancel(struct bpf_timer *timer);

Here is how BPF program might look like:
struct map_elem {
     int counter;
     struct bpf_timer timer;
};

struct {
     __uint(type, BPF_MAP_TYPE_HASH);
     __uint(max_entries, 1000);
     __type(key, int);
     __type(value, struct map_elem);
} hmap SEC(".maps");

static int timer_cb(void *map, int *key, struct map_elem *val);
/* val points to particular map element that contains bpf_timer. */

SEC("fentry/bpf_fentry_test1")
int BPF_PROG(test1, int a)
{
     struct map_elem *val;
     int key = 0;

     val = bpf_map_lookup_elem(&hmap, &key);
     if (val) {
         bpf_timer_init(&val->timer, timer_cb, CLOCK_REALTIME);
         bpf_timer_start(&val->timer, 1000 /* call timer_cb2 in 1 usec */);
     }
}

This patch adds helper implementations that rely on hrtimers
to call bpf functions as timers expire.
The following patch adds necessary safety checks.

Only programs with CAP_BPF are allowed to use bpf_timer.

The amount of timers used by the program is constrained by
the memcg recorded at map creation time.

The bpf_timer_init() helper is receiving hidden 'map' and 'prog' arguments
supplied by the verifier. The prog pointer is needed to do refcnting of bpf
program to make sure that program doesn't get freed while timer is armed.

The bpf_map_delete_elem() and bpf_map_update_elem() operations cancel
and free the timer if given map element had it allocated.
"bpftool map update" command can be used to cancel timers.

Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxx>
---
  include/linux/bpf.h            |   2 +
  include/uapi/linux/bpf.h       |  40 ++++++
  kernel/bpf/helpers.c           | 227 +++++++++++++++++++++++++++++++++
  kernel/bpf/verifier.c          | 109 ++++++++++++++++
  kernel/trace/bpf_trace.c       |   2 +-
  scripts/bpf_doc.py             |   2 +
  tools/include/uapi/linux/bpf.h |  40 ++++++
  7 files changed, 421 insertions(+), 1 deletion(-)

[...]
+
+static enum hrtimer_restart bpf_timer_cb(struct hrtimer *timer)
+{
+	struct bpf_hrtimer *t = container_of(timer, struct bpf_hrtimer, timer);
+	struct bpf_prog *prog = t->prog;
+	struct bpf_map *map = t->map;
+	void *key;
+	u32 idx;
+	int ret;
+
+	/* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and
+	 * cannot be preempted by another bpf_timer_cb() on the same cpu.
+	 * Remember the timer this callback is servicing to prevent
+	 * deadlock if callback_fn() calls bpf_timer_cancel() on the same timer.
+	 */
+	this_cpu_write(hrtimer_running, t);
+	if (map->map_type == BPF_MAP_TYPE_ARRAY) {
+		struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+		/* compute the key */
+		idx = ((char *)t->value - array->value) / array->elem_size;
+		key = &idx;
+	} else { /* hash or lru */
+		key = t->value - round_up(map->key_size, 8);
+	}
+
+	ret = BPF_CAST_CALL(t->callback_fn)((u64)(long)map,
+					    (u64)(long)key,
+					    (u64)(long)t->value, 0, 0);
+	WARN_ON(ret != 0); /* Next patch disallows 1 in the verifier */

I didn't find that next patch disallows callback return value 1 in the 
verifier. If we indeed disallows return value 1 in the verifier. We
don't need WARN_ON here. Did I miss anything?

+
+	/* The bpf function finished executed. Drop the prog refcnt.
+	 * It could reach zero here and trigger free of bpf_prog
+	 * and subsequent free of the maps that were holding timers.
+	 * If callback_fn called bpf_timer_start on this timer
+	 * the prog refcnt will be > 0.
+	 *
+	 * If callback_fn deleted map element the 't' could have been freed,
+	 * hence t->prog deref is done earlier.
+	 */
+	bpf_prog_put(prog);
+	this_cpu_write(hrtimer_running, NULL);
+	return HRTIMER_NORESTART;
+}
+
+BPF_CALL_5(bpf_timer_init, struct bpf_timer_kern *, timer, void *, cb, int, flags,
+	   struct bpf_map *, map, struct bpf_prog *, prog)
+{
+	clockid_t clockid = flags & (MAX_CLOCKS - 1);
+	struct bpf_hrtimer *t;
+	int ret = 0;
+
+	BUILD_BUG_ON(MAX_CLOCKS != 16);
+	if (flags >= MAX_CLOCKS ||
+	    /* similar to timerfd except _ALARM variants are not supported */
+            (clockid != CLOCK_MONOTONIC &&
+             clockid != CLOCK_REALTIME &&
+             clockid != CLOCK_BOOTTIME))
+		return -EINVAL;
+	____bpf_spin_lock(&timer->lock);
+	t = timer->timer;
+	if (t) {
+		ret = -EBUSY;
+		goto out;
+	}
+	/* allocate hrtimer via map_kmalloc to use memcg accounting */
+	t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, NUMA_NO_NODE);
+	if (!t) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	t->callback_fn = cb;
+	t->value = (void *)timer /* - offset of bpf_timer inside elem */;
+	t->map = map;
+	t->prog = prog;
+	hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
+	t->timer.function = bpf_timer_cb;
+	timer->timer = t;
+out:
+	____bpf_spin_unlock(&timer->lock);
+	return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_init_proto = {
+	.func		= bpf_timer_init,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_TIMER,
+	.arg2_type	= ARG_PTR_TO_FUNC,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs)
+{
+	struct bpf_hrtimer *t;
+	int ret = 0;
+
+	____bpf_spin_lock(&timer->lock);
+	t = timer->timer;
+	if (!t) {
+		ret = -EINVAL;
+		goto out;
+	}
+	if (!hrtimer_active(&t->timer) || hrtimer_callback_running(&t->timer))
+		/* If the timer wasn't active or callback already executing
+		 * bump the prog refcnt to keep it alive until
+		 * callback is invoked (again).
+		 */
+		bpf_prog_inc(t->prog);

I am not 100% sure. But could we have race condition here?
   cpu 1: running bpf_timer_start() helper call
   cpu 2: doing hrtimer work (calling callback etc.)

Is it possible that
  !hrtimer_active(&t->timer) || hrtimer_callback_running(&t->timer)
may be true and then right before bpf_prog_inc(t->prog), it becomes 
true? If hrtimer_callback_running() is called, it is possible that
callback function could have dropped the reference count for t->prog,
so we could already go into the body of the function
__bpf_prog_put()?

static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
        if (atomic64_dec_and_test(&prog->aux->refcnt)) {
                perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
                bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
                /* bpf_prog_free_id() must be called first */
                bpf_prog_free_id(prog, do_idr_lock);
                __bpf_prog_put_noref(prog, true);
        }
}


+	hrtimer_start(&t->timer, ns_to_ktime(nsecs), HRTIMER_MODE_REL_SOFT);
+out:
+	____bpf_spin_unlock(&timer->lock);
+	return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_start_proto = {
+	.func		= bpf_timer_start,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_TIMER,
+	.arg2_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
+{
+	struct bpf_hrtimer *t;
+	int ret = 0;
+
+	____bpf_spin_lock(&timer->lock);
+	t = timer->timer;
+	if (!t) {
+		ret = -EINVAL;
+		goto out;
+	}
+	if (this_cpu_read(hrtimer_running) == t) {
+		/* If bpf callback_fn is trying to bpf_timer_cancel()
+		 * its own timer the hrtimer_cancel() will deadlock
+		 * since it waits for callback_fn to finish
+		 */
+		ret = -EDEADLK;
+		goto out;
+	}
+	/* Cancel the timer and wait for associated callback to finish
+	 * if it was running.
+	 */
+	if (hrtimer_cancel(&t->timer) == 1) {

Again, could we have race here between bpf program and hrtimer_cancel()?

+		/* If the timer was active then drop the prog refcnt,
+		 * since callback will not be invoked.
+		 */
+		bpf_prog_put(t->prog);
+		ret = 1;
+	}
+out:
+	____bpf_spin_unlock(&timer->lock);
+	return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_cancel_proto = {
+	.func		= bpf_timer_cancel,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_TIMER,
+};
+
+/* This function is called by delete_element in htab and lru maps
+ * and by map_free for array, lru, htab maps.
+ */
+void bpf_timer_cancel_and_free(void *val)
+{
+	struct bpf_timer_kern *timer = val;
+	struct bpf_hrtimer *t;
+
+	____bpf_spin_lock(&timer->lock);
+	t = timer->timer;
+	if (!t)
+		goto out;
+	/* Cancel the timer and wait for callback to complete if it was
+	 * running. Only individual delete_element in htab or lru maps can
+	 * return 1 from hrtimer_cancel.
+	 * The whole map is destroyed when its refcnt reaches zero.
+	 * That happens after bpf prog refcnt reaches zero.
+	 * bpf prog refcnt will not reach zero until all timers are executed.
+	 * So when maps are destroyed hrtimer_cancel will surely return 0.
+	 * In such case t->prog is a pointer to freed memory.
+	 *
+	 * When htab or lru is deleting individual element check that
+	 * bpf_map_delete_elem() isn't trying to delete elem with running timer.
+	 * In such case don't call hrtimer_cancel() (since it will deadlock)
+	 * and don't call hrtimer_try_to_cancel() (since it will just return -1).
+	 * Instead free the timer and set timer->timer = NULL.
+	 * The subsequent bpf_timer_start/cancel() helpers won't be able to use it.
+	 * In preallocated maps it's safe to do timer->timer = NULL.
+	 * The memory could be reused for another element while current timer
+	 * callback can still do bpf_timer_init() on it.
+	 * In non-preallocated maps timer->timer = NULL will happen after
+	 * callback completes, since prog execution is an RCU critical section.
+	 */
+	if (this_cpu_read(hrtimer_running) != t &&
+	    hrtimer_cancel(&t->timer) == 1)
+		bpf_prog_put(t->prog);
+	kfree(t);
+	timer->timer = NULL;
+out:
+	____bpf_spin_unlock(&timer->lock);
+}
+
  const struct bpf_func_proto bpf_get_current_task_proto __weak;
  const struct bpf_func_proto bpf_probe_read_user_proto __weak;
  const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
@@ -1051,6 +1272,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
  		return &bpf_per_cpu_ptr_proto;
  	case BPF_FUNC_this_cpu_ptr:
  		return &bpf_this_cpu_ptr_proto;
+	case BPF_FUNC_timer_init:
+		return &bpf_timer_init_proto;
+	case BPF_FUNC_timer_start:
+		return &bpf_timer_start_proto;
+	case BPF_FUNC_timer_cancel:
+		return &bpf_timer_cancel_proto;
  	default:
  		break;
  	}

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1de4b8c6ee42..44ec9760b562 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4656,6 +4656,35 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
  	return 0;
  }
  
+static int process_timer_func(struct bpf_verifier_env *env, int regno,
+			      struct bpf_call_arg_meta *meta)
+{
+	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+	bool is_const = tnum_is_const(reg->var_off);
+	struct bpf_map *map = reg->map_ptr;
+	u64 val = reg->var_off.value;
+
+	if (!is_const) {
+		verbose(env,
+			"R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n",
+			regno);
+		return -EINVAL;
+	}
+	if (!map->btf) {
+		verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n",
+			map->name);
+		return -EINVAL;
+	}
+	if (val) {
+		/* This restriction will be removed in the next patch */
+		verbose(env, "bpf_timer field can only be first in the map value element\n");
+		return -EINVAL;
+	}
+	WARN_ON(meta->map_ptr);

Could you explain when this could happen?

+	meta->map_ptr = map;
+	return 0;
+}
+
  static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
  {
  	return type == ARG_PTR_TO_MEM ||
@@ -4788,6 +4817,7 @@ static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PER
  static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
  static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
  static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
  
[...]