On 6/30/20 2:49 PM, Lorenzo Bianconi wrote:
[...]
+static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
+				     void **frames, int n,
+				     struct xdp_cpumap_stats *stats)
+{
+	struct xdp_rxq_info rxq;
+	struct bpf_prog *prog;
+	struct xdp_buff xdp;
+	int i, nframes = 0;
+
+	if (!rcpu->prog)
+		return n;
+
+	rcu_read_lock();
+
+	xdp_set_return_frame_no_direct();
+	xdp.rxq = &rxq;
+
+	prog = READ_ONCE(rcpu->prog);
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdpf = frames[i];
+		u32 act;
+		int err;
+
+		rxq.dev = xdpf->dev_rx;
+		rxq.mem = xdpf->mem;
+		/* TODO: report queue_index to xdp_rxq_info */
+
+		xdp_convert_frame_to_buff(xdpf, &xdp);
+
+		act = bpf_prog_run_xdp(prog, &xdp);
+		switch (act) {
+		case XDP_PASS:
+			err = xdp_update_frame_from_buff(&xdp, xdpf);
+			if (err < 0) {
+				xdp_return_frame(xdpf);
+				stats->drop++;
+			} else {
+				frames[nframes++] = xdpf;
+				stats->pass++;
+			}
+			break;
+		default:
+			bpf_warn_invalid_xdp_action(act);
+			/* fallthrough */
+		case XDP_DROP:
+			xdp_return_frame(xdpf);
+			stats->drop++;
+			break;
+		}
+	}
+
+	xdp_clear_return_frame_no_direct();
+
+	rcu_read_unlock();
+
+	return nframes;
+}
+
 #define CPUMAP_BATCH 8
 
 static int cpu_map_kthread_run(void *data)
@@ -235,11 +297,12 @@ static int cpu_map_kthread_run(void *data)
 	 * kthread_stop signal until queue is empty.
 	 */
 	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
+		struct xdp_cpumap_stats stats = {}; /* zero stats */
+		gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
 		unsigned int drops = 0, sched = 0;
 		void *frames[CPUMAP_BATCH];
 		void *skbs[CPUMAP_BATCH];
-		gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
-		int i, n, m;
+		int i, n, m, nframes;
 
 		/* Release CPU reschedule checks */
 		if (__ptr_ring_empty(rcpu->queue)) {
@@ -260,8 +323,8 @@ static int cpu_map_kthread_run(void *data)
 		 * kthread CPU pinned. Lockless access to ptr_ring
 		 * consume side valid as no-resize allowed of queue.
 		 */
-		n = __ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH);
-
+		n = __ptr_ring_consume_batched(rcpu->queue, frames,
+					       CPUMAP_BATCH);
 		for (i = 0; i < n; i++) {
 			void *f = frames[i];
 			struct page *page = virt_to_page(f);
@@ -273,15 +336,19 @@ static int cpu_map_kthread_run(void *data)
 			prefetchw(page);
 		}
 
-		m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs);
-		if (unlikely(m == 0)) {
-			for (i = 0; i < n; i++)
-				skbs[i] = NULL; /* effect: xdp_return_frame */
-			drops = n;
+		/* Support running another XDP prog on this CPU */
+		nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, n, &stats);
+		if (nframes) {
+			m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs);
+			if (unlikely(m == 0)) {
+				for (i = 0; i < nframes; i++)
+					skbs[i] = NULL; /* effect: xdp_return_frame */
+				drops += nframes;
+			}
 		}
 
 		local_bh_disable();
-		for (i = 0; i < n; i++) {
+		for (i = 0; i < nframes; i++) {
 			struct xdp_frame *xdpf = frames[i];
 			struct sk_buff *skb = skbs[i];
 			int ret;
@@ -298,7 +365,7 @@ static int cpu_map_kthread_run(void *data)
 				drops++;
 		}
 		/* Feedback loop via tracepoint */
-		trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched);
+		trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched, &stats);
 
 		local_bh_enable(); /* resched point, may call do_softirq() */
 	}
@@ -308,13 +375,38 @@ static int cpu_map_kthread_run(void *data)
return 0;
}
[...]
@@ -415,6 +510,8 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
 
 	old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu);
 	if (old_rcpu) {
+		if (old_rcpu->prog)
+			bpf_prog_put(old_rcpu->prog);
 		call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
 		INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
 		schedule_work(&old_rcpu->kthread_stop_wq);
Hm, not quite sure I follow the logic here. Why is the bpf_prog_put() not placed inside
__cpu_map_entry_free(), for example? Wouldn't this at least leave a small potential race
window for a UAF, given the rest of the entry is still live? If we already piggy-back on the
RCU side for the rcpu entry, why not have the put in __cpu_map_entry_free()?
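
To make that concrete, a rough, untested sketch of what I have in mind, assuming the rest of
__cpu_map_entry_free() stays as it is today (container_of() on the embedded rcu head,
free_percpu() of the bulk queue, and the final put):

static void __cpu_map_entry_free(struct rcu_head *rcu)
{
	struct bpf_cpu_map_entry *rcpu;

	rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);

	/* Entry has been disconnected from the map and a grace period
	 * has elapsed, so release the prog reference together with the
	 * other per-entry resources instead of in __cpu_map_entry_replace().
	 */
	if (rcpu->prog)
		bpf_prog_put(rcpu->prog);

	free_percpu(rcpu->bulkq);
	/* Cannot kthread_stop() here, last put frees rcpu resources */
	put_cpu_map_entry(rcpu);
}

And if even that is considered too early, given the kthread is only stopped later via the
kthread_stop_wq work, then dropping the reference on the last refcount in put_cpu_map_entry()
might be the safer spot.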
Thanks,
Daniel