From: Hou Tao <houtao1@xxxxxxxxxx> Besides REUSE_AFTER_RCU_GP, also introduce FREE_AFTER_RCU_GP to solve the immediate reuse problem as well. Compared with REUSE_AFTER_RCU_GP, the implementation of FREE_AFTER_RCU_GP is much simpler. It doesn't try to reuse these freed elements after one RCU GP is passed, instead it just directly frees these elements back to the slab subsystem after one RCU GP. The shortcoming of FREE_AFTER_RCU_GP is that sleepable programs must access these elements by using bpf_rcu_read_{lock,unlock}, otherwise there will be a use-after-free problem. To simplify the implementation, FREE_AFTER_RCU_GP uses a global per-cpu free list to temporarily keep these freed elements and uses a per-cpu kworker to dynamically allocate an RCU callback to free these freed elements when the number of freed elements is above the threshold. Signed-off-by: Hou Tao <houtao1@xxxxxxxxxx> --- include/linux/bpf_mem_alloc.h | 1 + kernel/bpf/memalloc.c | 139 ++++++++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+) diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index e7f68432713b..61e8556208a2 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -19,6 +19,7 @@ struct bpf_mem_alloc { enum { BPF_MA_PERCPU = 1U << 0, BPF_MA_REUSE_AFTER_RCU_GP = 1U << 1, + BPF_MA_FREE_AFTER_RCU_GP = 1U << 2, }; /* 'size != 0' is for bpf_mem_alloc which manages fixed-size objects. 
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 262100f89610..5f6a4f2cfd37 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -63,7 +63,26 @@ static u8 size_index[24] __ro_after_init = { 2 /* 192 */ }; +#define BPF_MA_FREE_TYPE_NR 2 + +struct bpf_ma_free_ctx { + raw_spinlock_t lock; + int cpu; + local_t active; + /* For both no per-cpu and per-cpu */ + struct llist_head to_free[BPF_MA_FREE_TYPE_NR]; + unsigned int to_free_cnt[BPF_MA_FREE_TYPE_NR]; + struct llist_head to_free_extra[BPF_MA_FREE_TYPE_NR]; + struct delayed_work dwork; +}; + +struct bpf_free_batch { + struct rcu_head rcu; + struct llist_node *to_free[BPF_MA_FREE_TYPE_NR]; +}; + static struct workqueue_struct *bpf_ma_wq; +static DEFINE_PER_CPU(struct bpf_ma_free_ctx, percpu_free_ctx); static void bpf_ma_prepare_reuse_work(struct work_struct *work); @@ -910,6 +929,112 @@ static void notrace immediate_reuse_free(struct bpf_mem_cache *c, struct llist_n irq_work_raise(c); } +static void bpf_ma_batch_free_cb(struct rcu_head *rcu) +{ + struct bpf_free_batch *batch = container_of(rcu, struct bpf_free_batch, rcu); + + free_all(batch->to_free[0], false); + free_all(batch->to_free[1], true); + kfree(batch); +} + +static void bpf_ma_schedule_free_dwork(struct bpf_ma_free_ctx *ctx) +{ + long delay, left; + u64 to_free_cnt; + + /* TODO: More reasonable threshold ? */ + to_free_cnt = ctx->to_free_cnt[0] + ctx->to_free_cnt[1]; + delay = to_free_cnt >= 256 ? 
0 : HZ; + if (delayed_work_pending(&ctx->dwork)) { + left = ctx->dwork.timer.expires - jiffies; + if (delay < left) + mod_delayed_work(bpf_ma_wq, &ctx->dwork, delay); + return; + } + queue_delayed_work(bpf_ma_wq, &ctx->dwork, delay); +} + +static void splice_llist(struct llist_head *llist, struct llist_node **head) +{ + struct llist_node *first, *last; + + first = llist_del_all(llist); + if (!first) + return; + + last = first; + while (last->next) + last = last->next; + last->next = *head; + *head = first; +} + +static void bpf_ma_splice_to_free_list(struct bpf_ma_free_ctx *ctx, struct llist_node **to_free) +{ + unsigned long flags; + + local_irq_save(flags); + /* Might be interrupted by a NMI which invokes unit_free() */ + if (ctx->cpu == smp_processor_id()) + WARN_ON_ONCE(local_inc_return(&ctx->active) != 1); + raw_spin_lock(&ctx->lock); + to_free[0] = __llist_del_all(&ctx->to_free[0]); + to_free[1] = __llist_del_all(&ctx->to_free[1]); + ctx->to_free_cnt[0] = 0; + ctx->to_free_cnt[1] = 0; + raw_spin_unlock(&ctx->lock); + if (ctx->cpu == smp_processor_id()) + local_dec(&ctx->active); + local_irq_restore(flags); + + splice_llist(&ctx->to_free_extra[0], &to_free[0]); + splice_llist(&ctx->to_free_extra[1], &to_free[1]); +} + +static void bpf_ma_free_dwork(struct work_struct *work) +{ + struct bpf_ma_free_ctx *ctx = container_of(to_delayed_work(work), + struct bpf_ma_free_ctx, dwork); + struct llist_node *to_free[BPF_MA_FREE_TYPE_NR]; + struct bpf_free_batch *batch; + + bpf_ma_splice_to_free_list(ctx, to_free); + + batch = kmalloc(sizeof(*batch), GFP_KERNEL); + if (!batch) { + synchronize_rcu_expedited(); + free_all(to_free[0], false); + free_all(to_free[1], true); + return; + } + + batch->to_free[0] = to_free[0]; + batch->to_free[1] = to_free[1]; + call_rcu(&batch->rcu, bpf_ma_batch_free_cb); +} + +static void notrace wait_gp_direct_free(struct bpf_mem_cache *c, struct llist_node *llnode) +{ + bool percpu = !!c->percpu_size; + struct bpf_ma_free_ctx *ctx; + unsigned 
long flags; + + local_irq_save(flags); + ctx = this_cpu_ptr(&percpu_free_ctx); + if (local_inc_return(&ctx->active) == 1) { + raw_spin_lock(&ctx->lock); + __llist_add(llnode, &ctx->to_free[percpu]); + ctx->to_free_cnt[percpu] += 1; + bpf_ma_schedule_free_dwork(ctx); + raw_spin_unlock(&ctx->lock); + } else { + llist_add(llnode, &ctx->to_free_extra[percpu]); + } + local_dec(&ctx->active); + local_irq_restore(flags); +} + static inline void notrace unit_free(struct bpf_mem_cache *c, void *ptr) { struct llist_node *llnode = ptr - LLIST_NODE_SZ; @@ -918,6 +1043,8 @@ static inline void notrace unit_free(struct bpf_mem_cache *c, void *ptr) if (c->flags & BPF_MA_REUSE_AFTER_RCU_GP) wait_gp_reuse_free(c, llnode); + else if (c->flags & BPF_MA_FREE_AFTER_RCU_GP) + wait_gp_direct_free(c, llnode); else immediate_reuse_free(c, llnode); } @@ -1016,8 +1143,20 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) static int __init bpf_ma_init(void) { + int cpu; + bpf_ma_wq = alloc_workqueue("bpf_ma", WQ_MEM_RECLAIM, 0); BUG_ON(!bpf_ma_wq); + + for_each_possible_cpu(cpu) { + struct bpf_ma_free_ctx *ctx; + + ctx = per_cpu_ptr(&percpu_free_ctx, cpu); + raw_spin_lock_init(&ctx->lock); + ctx->cpu = cpu; + INIT_DELAYED_WORK(&ctx->dwork, bpf_ma_free_dwork); + } + return 0; } late_initcall(bpf_ma_init); -- 2.29.2