On 6/30/20 2:49 PM, Lorenzo Bianconi wrote:
[...]
+static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
+				     void **frames, int n,
+				     struct xdp_cpumap_stats *stats)
+{
+	struct xdp_rxq_info rxq;
+	struct bpf_prog *prog;
+	struct xdp_buff xdp;
+	int i, nframes = 0;
+
+	if (!rcpu->prog)
+		return n;
+
+	rcu_read_lock();
+
+	xdp_set_return_frame_no_direct();
+	xdp.rxq = &rxq;
+
+	prog = READ_ONCE(rcpu->prog);
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdpf = frames[i];
+		u32 act;
+		int err;
+
+		rxq.dev = xdpf->dev_rx;
+		rxq.mem = xdpf->mem;
+		/* TODO: report queue_index to xdp_rxq_info */
+
+		xdp_convert_frame_to_buff(xdpf, &xdp);
+
+		act = bpf_prog_run_xdp(prog, &xdp);
+		switch (act) {
+		case XDP_PASS:
+			err = xdp_update_frame_from_buff(&xdp, xdpf);
+			if (err < 0) {
+				xdp_return_frame(xdpf);
+				stats->drop++;
+			} else {
+				frames[nframes++] = xdpf;
+				stats->pass++;
+			}
+			break;
+		default:
+			bpf_warn_invalid_xdp_action(act);
+			/* fallthrough */
+		case XDP_DROP:
+			xdp_return_frame(xdpf);
+			stats->drop++;
+			break;
+		}
+	}
+
+	xdp_clear_return_frame_no_direct();
+
+	rcu_read_unlock();
+
+	return nframes;
+}
+
 #define CPUMAP_BATCH 8
 
 static int cpu_map_kthread_run(void *data)
@@ -235,11 +297,12 @@ static int cpu_map_kthread_run(void *data)
 	 * kthread_stop signal until queue is empty.
 	 */
 	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
+		struct xdp_cpumap_stats stats = {}; /* zero stats */
+		gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
 		unsigned int drops = 0, sched = 0;
 		void *frames[CPUMAP_BATCH];
 		void *skbs[CPUMAP_BATCH];
-		gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
-		int i, n, m;
+		int i, n, m, nframes;
 
 		/* Release CPU reschedule checks */
 		if (__ptr_ring_empty(rcpu->queue)) {
@@ -260,8 +323,8 @@ static int cpu_map_kthread_run(void *data)
 		 * kthread CPU pinned. Lockless access to ptr_ring
 		 * consume side valid as no-resize allowed of queue.
 		 */
-		n = __ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH);
-
+		n = __ptr_ring_consume_batched(rcpu->queue, frames,
+					       CPUMAP_BATCH);
 		for (i = 0; i < n; i++) {
 			void *f = frames[i];
 			struct page *page = virt_to_page(f);
@@ -273,15 +336,19 @@ static int cpu_map_kthread_run(void *data)
 			prefetchw(page);
 		}
 
-		m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs);
-		if (unlikely(m == 0)) {
-			for (i = 0; i < n; i++)
-				skbs[i] = NULL; /* effect: xdp_return_frame */
-			drops = n;
+		/* Support running another XDP prog on this CPU */
+		nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, n, &stats);
+		if (nframes) {
+			m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs);
+			if (unlikely(m == 0)) {
+				for (i = 0; i < nframes; i++)
+					skbs[i] = NULL; /* effect: xdp_return_frame */
+				drops += nframes;
+			}
 		}
 
 		local_bh_disable();
-		for (i = 0; i < n; i++) {
+		for (i = 0; i < nframes; i++) {
 			struct xdp_frame *xdpf = frames[i];
 			struct sk_buff *skb = skbs[i];
 			int ret;
@@ -298,7 +365,7 @@ static int cpu_map_kthread_run(void *data)
 				drops++;
 		}
 		/* Feedback loop via tracepoint */
-		trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched);
+		trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched, &stats);
 
 		local_bh_enable(); /* resched point, may call do_softirq() */
 	}
@@ -308,13 +375,38 @@ static int cpu_map_kthread_run(void *data)
return 0;
}
[...]
@@ -415,6 +510,8 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
 
 	old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu);
 	if (old_rcpu) {
+		if (old_rcpu->prog)
+			bpf_prog_put(old_rcpu->prog);
 		call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
 		INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
 		schedule_work(&old_rcpu->kthread_stop_wq);
Hm, not quite sure I follow the logic here. Why is the bpf_prog_put() not placed inside
__cpu_map_entry_free(), for example? Wouldn't this at least leave a small potential race
window for a UAF, given the rest of the entry is still live? If we already piggy-back on the
RCU side for the rcpu entry, why not have the put in __cpu_map_entry_free()?
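
To make that concrete, a rough, untested sketch of what I have in mind, assuming the rest of
__cpu_map_entry_free() stays as it is today (container_of() on the embedded rcu head,
free_percpu() of the bulk queue, and the final put):

static void __cpu_map_entry_free(struct rcu_head *rcu)
{
	struct bpf_cpu_map_entry *rcpu;

	rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);

	/* Entry has been disconnected from the map and a grace period
	 * has elapsed, so release the prog reference together with the
	 * other per-entry resources instead of in __cpu_map_entry_replace().
	 */
	if (rcpu->prog)
		bpf_prog_put(rcpu->prog);

	free_percpu(rcpu->bulkq);
	/* Cannot kthread_stop() here, last put frees rcpu resources */
	put_cpu_map_entry(rcpu);
}

And if even that is considered too early, given the kthread is only stopped later via the
kthread_stop_wq work, then dropping the reference on the last refcount in put_cpu_map_entry()
might be the safer spot.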
Thanks,
Daniel