From: Hou Tao <houtao1@xxxxxxxxxx> After doing reuse-after-RCU-GP in the bpf memory allocator, if there are intensive memory allocations and frees in the bpf memory allocator and the RCU GP is slow, the peak memory usage of the bpf memory allocator will be high. To reduce the memory usage of the bpf memory allocator, call rcu_momentary_dyntick_idle() in task work periodically to accelerate the expiration of the RCU grace period. The following benchmark results show that the memory usage is reduced significantly after applying the patch: Before: overwrite per-prod-op 49.11 ± 1.30k/s, avg mem 313.09 ± 80.36MiB, peak mem 509.09MiB batch_add_batch_del per-prod-op 76.06 ± 2.38k/s, avg mem 287.97 ± 63.59MiB, peak mem 496.81MiB add_del_on_diff_cpu per-prod-op 18.75 ± 0.09k/s, avg mem 27.71 ± 4.92MiB, peak mem 44.54MiB After: overwrite per-prod-op 51.17 ± 0.30k/s, avg mem 105.09 ± 7.74MiB, peak mem 143.60MiB batch_add_batch_del per-prod-op 86.43 ± 0.90k/s, avg mem 85.82 ± 11.81MiB, peak mem 118.93MiB add_del_on_diff_cpu per-prod-op 18.71 ± 0.08k/s, avg mem 26.92 ± 5.50MiB, peak mem 43.18MiB Signed-off-by: Hou Tao <houtao1@xxxxxxxxxx> --- kernel/bpf/memalloc.c | 46 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 9b31c53fd285..c4b4cae04400 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -6,6 +6,7 @@ #include <linux/irq_work.h> #include <linux/bpf_mem_alloc.h> #include <linux/memcontrol.h> +#include <linux/task_work.h> #include <asm/local.h> /* Any context (including NMI) BPF specific memory allocator. 
@@ -123,6 +124,16 @@ struct bpf_reuse_batch {
 	struct rcu_head rcu;
 };
 
+#define BPF_GP_ACC_RUNNING 0 /* bit in ->flags: task work is queued or running */
+
+struct bpf_rcu_gp_acc_ctx { /* per-CPU state for the RCU GP acceleration work */
+	unsigned long flags; /* holds the BPF_GP_ACC_RUNNING bit */
+	unsigned long next_run; /* jiffies value before which no new work is queued */
+	struct callback_head work; /* item handed to task_work_add() */
+};
+
+static DEFINE_PER_CPU(struct bpf_rcu_gp_acc_ctx, bpf_acc_ctx);
+
 static struct llist_node notrace *__llist_del_first(struct llist_head *head)
 {
 	struct llist_node *entry, *next;
@@ -347,12 +358,47 @@ static void dyn_reuse_rcu(struct rcu_head *rcu)
 	kfree(batch);
 }
 
+static void bpf_rcu_gp_acc_work(struct callback_head *head)
+{ /* task work callback queued by bpf_mem_rcu_gp_acc() */
+	struct bpf_rcu_gp_acc_ctx *ctx = container_of(head, struct bpf_rcu_gp_acc_ctx, work);
+
+	local_irq_disable(); /* the RCU helper below is called with IRQs off */
+	rcu_momentary_dyntick_idle();
+	local_irq_enable();
+
+	/* The interval between rcu_momentary_dyntick_idle() calls is
+	 * at least 10ms.
+	 */
+	WRITE_ONCE(ctx->next_run, jiffies + msecs_to_jiffies(10));
+	clear_bit(BPF_GP_ACC_RUNNING, &ctx->flags); /* allow re-queueing; NOTE(review): clear_bit() is unordered - confirm vs. the next_run store */
+}
+
+static void bpf_mem_rcu_gp_acc(struct bpf_mem_cache *c)
+{ /* queue the GP acceleration task work on current, rate-limited per CPU */
+	struct bpf_rcu_gp_acc_ctx *ctx = this_cpu_ptr(&bpf_acc_ctx);
+
+	if (atomic_read(&c->dyn_reuse_rcu_cnt) < 128 || /* presumably pending reuse RCU callbacks - confirm */
+	    time_before(jiffies, READ_ONCE(ctx->next_run))) /* still inside the 10ms interval */
+		return;
+
+	if ((current->flags & PF_KTHREAD) || /* NOTE(review): presumably kthreads never run TWA_RESUME work - confirm */
+	    test_and_set_bit(BPF_GP_ACC_RUNNING, &ctx->flags)) /* work already queued or running */
+		return;
+
+	init_task_work(&ctx->work, bpf_rcu_gp_acc_work);
+	/* Task is exiting ? */
+	if (task_work_add(current, &ctx->work, TWA_RESUME))
+		clear_bit(BPF_GP_ACC_RUNNING, &ctx->flags); /* queueing failed: drop the claim */
+}
+
 static void reuse_bulk(struct bpf_mem_cache *c)
 {
 	struct llist_node *head, *tail;
 	struct bpf_reuse_batch *batch;
 	unsigned long flags;
 
+	bpf_mem_rcu_gp_acc(c); /* opportunistically queue the GP acceleration work */
+
 	head = llist_del_all(&c->free_llist_extra);
 	tail = head;
 	while (tail && tail->next)
-- 
2.29.2