[RFC PATCH bpf-next 4/6] bpf: Introduce BPF_MA_NO_REUSE for bpf memory allocator

From: Hou Tao <houtao1@xxxxxxxxxx>

Currently a freed element in the bpf memory allocator may be reused
immediately by a new allocation, and this reuse can lead to two
problems. First, the lookup procedure may return an incorrect result
if the element it found has been freed and then reused. Second, the
lookup procedure may still be using the special fields in a map value
or allocated object while a new allocation reinitializes those fields.
The latter problem can be mitigated by using a ctor in the bpf memory
allocator, but that only works when all elements have the same type.

So introduce BPF_MA_NO_REUSE to disable the immediate reuse of freed
elements. These freed elements are moved into a global per-cpu free
list instead. Once the number of freed elements reaches a threshold,
they are moved into a dynamically allocated object and freed by a
global per-cpu worker through call_rcu_tasks_trace().
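
For example, a map that must not see freed elements reused before a
grace period would request the new behaviour when initializing its
allocator. The snippet below is only a sketch: elem_size is a
hypothetical size, and the trailing NULL ctor argument is an
assumption about the final bpf_mem_alloc_init() signature in this
series.

	/* Sketch only: opt this allocator out of immediate reuse.
	 * elem_size is hypothetical; the NULL ctor argument is an
	 * assumption about the final bpf_mem_alloc_init() signature.
	 */
	struct bpf_mem_alloc ma;
	int err;

	err = bpf_mem_alloc_init(&ma, elem_size, BPF_MA_NO_REUSE, NULL);
	if (err)
		return err;

	/* Elements freed with bpf_mem_cache_free() are now parked on
	 * the per-cpu to_free lists and released only after a grace
	 * period, instead of being handed back to the next allocation.
	 */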

Signed-off-by: Hou Tao <houtao1@xxxxxxxxxx>
---
 include/linux/bpf_mem_alloc.h |   2 +
 kernel/bpf/memalloc.c         | 175 +++++++++++++++++++++++++++++++++-
 2 files changed, 173 insertions(+), 4 deletions(-)
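
As a reviewer aid, here is a simplified, single-threaded userspace
model of the deferred-free path added by this patch. Every name in it
is illustrative only; the kernel code uses per-cpu llists, a delayed
workqueue and call_rcu_tasks_trace() rather than the direct free shown
here.

#include <stdio.h>
#include <stdlib.h>

#define HZ 100	/* assumption for this model only */

struct node {
	struct node *next;
};

struct free_ctx {
	struct node *to_free;		/* models ctx->to_free[type] */
	unsigned int to_free_cnt;	/* models ctx->to_free_cnt[type] */
};

/* Mirrors the heuristic in bpf_ma_schedule_free_dwork(): flush almost
 * immediately once 256 elements are pending, otherwise wait up to HZ
 * jiffies so frees can be batched.
 */
static long flush_delay(const struct free_ctx *ctx)
{
	return ctx->to_free_cnt >= 256 ? 1 : HZ;
}

/* Models direct_free(): park the element instead of reusing it. */
static void defer_free(struct free_ctx *ctx, struct node *n)
{
	n->next = ctx->to_free;
	ctx->to_free = n;
	ctx->to_free_cnt++;
}

/* Models bpf_ma_free_dwork() + batch_free_rcu(): splice the pending
 * list and free it; the RCU tasks trace grace period is skipped here.
 */
static void flush(struct free_ctx *ctx)
{
	struct node *n = ctx->to_free, *next;

	ctx->to_free = NULL;
	ctx->to_free_cnt = 0;
	while (n) {
		next = n->next;
		free(n);
		n = next;
	}
}

int main(void)
{
	struct free_ctx ctx = { 0 };
	int i;

	for (i = 0; i < 300; i++) {
		struct node *n = malloc(sizeof(*n));

		if (n)
			defer_free(&ctx, n);
	}
	printf("pending %u, flush delay %ld jiffies\n",
	       ctx.to_free_cnt, flush_delay(&ctx));
	flush(&ctx);
	return 0;
}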

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index b9f6b9155fa5..2a10b721832d 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -19,6 +19,8 @@ struct bpf_mem_alloc {
 /* flags for bpf_mem_alloc_init() */
 enum {
 	BPF_MA_PERCPU = 1,
+	/* Don't reuse freed elements during allocation */
+	BPF_MA_NO_REUSE = 2,
 };
 
 int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, unsigned int flags,
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 454c86596111..e5eaf765624b 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -35,6 +35,23 @@
  */
 #define LLIST_NODE_SZ sizeof(struct llist_node)
 
+#define BPF_MA_FREE_TYPE_NR 2
+
+struct bpf_ma_free_context {
+	raw_spinlock_t lock;
+	local_t active;
+	/* For both non-per-cpu and per-cpu allocations */
+	struct llist_head to_free[BPF_MA_FREE_TYPE_NR];
+	unsigned int to_free_cnt[BPF_MA_FREE_TYPE_NR];
+	struct llist_head to_free_extra[BPF_MA_FREE_TYPE_NR];
+	struct delayed_work dwork;
+};
+
+struct bpf_ma_free_batch {
+	struct rcu_head rcu;
+	struct llist_node *to_free[BPF_MA_FREE_TYPE_NR];
+};
+
 /* similar to kmalloc, but sizeof == 8 bucket is gone */
 static u8 size_index[24] __ro_after_init = {
 	3,	/* 8 */
@@ -63,6 +80,9 @@ static u8 size_index[24] __ro_after_init = {
 	2	/* 192 */
 };
 
+static DEFINE_PER_CPU(struct bpf_ma_free_context, percpu_free_ctx);
+static struct workqueue_struct *bpf_ma_free_wq;
+
 static int bpf_mem_cache_idx(size_t size)
 {
 	if (!size || size > 4096)
@@ -609,14 +629,11 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c)
  * add it to the free_llist of the current cpu.
  * Let kfree() logic deal with it when it's later called from irq_work.
  */
-static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
+static void notrace reuse_free(struct bpf_mem_cache *c, struct llist_node *llnode)
 {
-	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
 	unsigned long flags;
 	int cnt = 0;
 
-	BUILD_BUG_ON(LLIST_NODE_SZ > 8);
-
 	local_irq_save(flags);
 	if (local_inc_return(&c->active) == 1) {
 		__llist_add(llnode, &c->free_llist);
@@ -638,6 +655,137 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
 		irq_work_raise(c);
 }
 
+static void batch_free_rcu(struct rcu_head *rcu)
+{
+	struct bpf_ma_free_batch *batch = container_of(rcu, struct bpf_ma_free_batch, rcu);
+
+	free_llist(batch->to_free[0], false);
+	free_llist(batch->to_free[1], true);
+	kfree(batch);
+}
+
+static void batch_free_rcu_tasks_trace(struct rcu_head *rcu)
+{
+	if (rcu_trace_implies_rcu_gp())
+		batch_free_rcu(rcu);
+	else
+		call_rcu(rcu, batch_free_rcu);
+}
+
+static void bpf_ma_schedule_free_dwork(struct bpf_ma_free_context *ctx)
+{
+	long delay, left;
+	u64 to_free_cnt;
+
+	to_free_cnt = ctx->to_free_cnt[0] + ctx->to_free_cnt[1];
+	delay = to_free_cnt >= 256 ? 1 : HZ;
+	if (delayed_work_pending(&ctx->dwork)) {
+		left = ctx->dwork.timer.expires - jiffies;
+		if (delay < left)
+			mod_delayed_work(bpf_ma_free_wq, &ctx->dwork, delay);
+		return;
+	}
+	queue_delayed_work(bpf_ma_free_wq, &ctx->dwork, delay);
+}
+
+static void bpf_ma_splice_to_free_list(struct bpf_ma_free_context *ctx, struct llist_node **to_free)
+{
+	struct llist_node *tmp[BPF_MA_FREE_TYPE_NR];
+	unsigned long flags;
+	unsigned int i;
+
+	raw_spin_lock_irqsave(&ctx->lock, flags);
+	for (i = 0; i < ARRAY_SIZE(tmp); i++) {
+		tmp[i] = __llist_del_all(&ctx->to_free[i]);
+		ctx->to_free_cnt[i] = 0;
+	}
+	raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+	for (i = 0; i < ARRAY_SIZE(tmp); i++) {
+		struct llist_node *first, *last;
+
+		first = llist_del_all(&ctx->to_free_extra[i]);
+		if (!first) {
+			to_free[i] = tmp[i];
+			continue;
+		}
+		last = first;
+		while (last->next)
+			last = last->next;
+		to_free[i] = first;
+		last->next = tmp[i];
+	}
+}
+
+static inline bool bpf_ma_has_to_free(const struct bpf_ma_free_context *ctx)
+{
+	return !llist_empty(&ctx->to_free[0]) || !llist_empty(&ctx->to_free[1]) ||
+	       !llist_empty(&ctx->to_free_extra[0]) || !llist_empty(&ctx->to_free_extra[1]);
+}
+
+static void bpf_ma_free_dwork(struct work_struct *work)
+{
+	struct bpf_ma_free_context *ctx = container_of(to_delayed_work(work),
+						       struct bpf_ma_free_context, dwork);
+	struct llist_node *to_free[BPF_MA_FREE_TYPE_NR];
+	struct bpf_ma_free_batch *batch;
+	unsigned long flags;
+
+	bpf_ma_splice_to_free_list(ctx, to_free);
+
+	batch = kmalloc(sizeof(*batch), GFP_NOWAIT | __GFP_NOWARN);
+	if (!batch) {
+		/* TODO: handle ENOMEM case better ? */
+		rcu_barrier_tasks_trace();
+		rcu_barrier();
+		free_llist(to_free[0], false);
+		free_llist(to_free[1], true);
+		goto check;
+	}
+
+	batch->to_free[0] = to_free[0];
+	batch->to_free[1] = to_free[1];
+	call_rcu_tasks_trace(&batch->rcu, batch_free_rcu_tasks_trace);
+check:
+	raw_spin_lock_irqsave(&ctx->lock, flags);
+	if (bpf_ma_has_to_free(ctx))
+		bpf_ma_schedule_free_dwork(ctx);
+	raw_spin_unlock_irqrestore(&ctx->lock, flags);
+}
+
+static void notrace direct_free(struct bpf_mem_cache *c, struct llist_node *llnode)
+{
+	struct bpf_ma_free_context *ctx;
+	bool percpu = !!c->percpu_size;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	ctx = this_cpu_ptr(&percpu_free_ctx);
+	if (local_inc_return(&ctx->active) == 1) {
+		raw_spin_lock(&ctx->lock);
+		__llist_add(llnode, &ctx->to_free[percpu]);
+		ctx->to_free_cnt[percpu] += 1;
+		bpf_ma_schedule_free_dwork(ctx);
+		raw_spin_unlock(&ctx->lock);
+	} else {
+		llist_add(llnode, &ctx->to_free_extra[percpu]);
+	}
+	local_dec(&ctx->active);
+	local_irq_restore(flags);
+}
+
+static inline void unit_free(struct bpf_mem_cache *c, void *ptr)
+{
+	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+
+	BUILD_BUG_ON(LLIST_NODE_SZ > 8);
+
+	if (c->ma->flags & BPF_MA_NO_REUSE)
+		direct_free(c, llnode);
+	else
+		reuse_free(c, llnode);
+}
+
 /* Called from BPF program or from sys_bpf syscall.
  * In both cases migration is disabled.
  */
@@ -686,3 +834,22 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
 
 	unit_free(this_cpu_ptr(ma->cache), ptr);
 }
+
+static int __init bpf_ma_init(void)
+{
+	int cpu;
+
+	bpf_ma_free_wq = alloc_workqueue("bpf_ma_free", WQ_MEM_RECLAIM, 0);
+	BUG_ON(!bpf_ma_free_wq);
+
+	for_each_possible_cpu(cpu) {
+		struct bpf_ma_free_context *ctx;
+
+		ctx = per_cpu_ptr(&percpu_free_ctx, cpu);
+		raw_spin_lock_init(&ctx->lock);
+		INIT_DELAYED_WORK(&ctx->dwork, bpf_ma_free_dwork);
+	}
+
+	return 0;
+}
+fs_initcall(bpf_ma_init);
-- 
2.29.2
