On Tue, Jan 07, 2020 at 02:30:35PM +0100, Björn Töpel wrote: > On 2020-01-07 14:05, Jiri Olsa wrote: > > On Mon, Jan 06, 2020 at 03:46:40PM -0800, Alexei Starovoitov wrote: > > > On Sun, Dec 29, 2019 at 03:37:40PM +0100, Jiri Olsa wrote: > > > > When unwinding the stack we need to identify each > > > > address to successfully continue. Adding latch tree > > > > to keep trampolines for quick lookup during the > > > > unwind. > > > > > > > > Signed-off-by: Jiri Olsa <jolsa@xxxxxxxxxx> > > > ... > > > > +bool is_bpf_trampoline(void *addr) > > > > +{ > > > > + return latch_tree_find(addr, &tree, &tree_ops) != NULL; > > > > +} > > > > + > > > > struct bpf_trampoline *bpf_trampoline_lookup(u64 key) > > > > { > > > > struct bpf_trampoline *tr; > > > > @@ -65,6 +98,7 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) > > > > for (i = 0; i < BPF_TRAMP_MAX; i++) > > > > INIT_HLIST_HEAD(&tr->progs_hlist[i]); > > > > tr->image = image; > > > > + latch_tree_insert(&tr->tnode, &tree, &tree_ops); > > > > > > Thanks for the fix. I was thinking to apply it, but then realized that bpf > > > dispatcher logic has the same issue. > > > Could you generalize the fix for both? > > > May be bpf_jit_alloc_exec_page() can do latch_tree_insert() ? > > > and new version of bpf_jit_free_exec() is needed that will do latch_tree_erase(). > > > Wdyt? > > > > I need to check the dispatcher code, but seems ok.. will check > > > > Thanks Jiri! The trampoline and dispatcher share the image allocation, so > putting it there would make sense. > > It's annoying that the dispatcher doesn't show up correctly in perf, and > it's been on my list to fix that. Hopefully you beat me to it! :-D hi, attached patch seems to work for me (trampoline usecase), but I don't know how to test it for dispatcher.. also I need to check if we need to decrease BPF_TRAMP_MAX or BPF_DISPATCHER_MAX, it might take more time ;-) jirka --- diff --git a/include/linux/bpf.h b/include/linux/bpf.h index aed2bc39d72b..e0ca8780dc7a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -510,7 +510,6 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key); int bpf_trampoline_link_prog(struct bpf_prog *prog); int bpf_trampoline_unlink_prog(struct bpf_prog *prog); void bpf_trampoline_put(struct bpf_trampoline *tr); -void *bpf_jit_alloc_exec_page(void); #define BPF_DISPATCHER_INIT(name) { \ .mutex = __MUTEX_INITIALIZER(name.mutex), \ .func = &name##func, \ @@ -542,6 +541,13 @@ void *bpf_jit_alloc_exec_page(void); #define BPF_DISPATCHER_PTR(name) (&name) void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to); +struct bpf_image { + struct latch_tree_node tnode; + unsigned char data[]; +}; +#define BPF_IMAGE_SIZE (PAGE_SIZE - sizeof(struct bpf_image)) +bool is_bpf_image(void *addr); +void *bpf_image_alloc(void); #else static inline struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { @@ -563,6 +569,10 @@ static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {} static inline void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to) {} +static inline bool is_bpf_image(void *addr) +{ + return false; +} #endif struct bpf_func_info_aux { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 29d47aae0dd1..53dc3adf6077 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -704,6 +704,8 @@ bool is_bpf_text_address(unsigned long addr) rcu_read_lock(); ret = bpf_prog_kallsyms_find(addr) != NULL; + if (!ret) + ret = is_bpf_image((void*) addr); rcu_read_unlock(); return ret; diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index 204ee61a3904..b3e5b214fed8 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -113,7 +113,7 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) noff = 0; } else { old = d->image + d->image_off; - noff = d->image_off ^ (PAGE_SIZE / 2); + noff = d->image_off ^ (BPF_IMAGE_SIZE / 2); } new = d->num_progs ? d->image + noff : NULL; @@ -140,7 +140,7 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, mutex_lock(&d->mutex); if (!d->image) { - d->image = bpf_jit_alloc_exec_page(); + d->image = bpf_image_alloc(); if (!d->image) goto out; } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 79a04417050d..3ea56f89c68a 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -4,6 +4,7 @@ #include <linux/bpf.h> #include <linux/filter.h> #include <linux/ftrace.h> +#include <linux/rbtree_latch.h> /* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */ #define TRAMPOLINE_HASH_BITS 10 @@ -14,7 +15,12 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; /* serializes access to trampoline_table */ static DEFINE_MUTEX(trampoline_mutex); -void *bpf_jit_alloc_exec_page(void) +static struct latch_tree_root image_tree __cacheline_aligned; + +/* serializes access to image_tree */ +static DEFINE_MUTEX(image_mutex); + +static void *bpf_jit_alloc_exec_page(void) { void *image; @@ -30,6 +36,62 @@ void *bpf_jit_alloc_exec_page(void) return image; } +static __always_inline bool image_tree_less(struct latch_tree_node *a, + struct latch_tree_node *b) +{ + struct bpf_image *ia = container_of(a, struct bpf_image, tnode); + struct bpf_image *ib = container_of(b, struct bpf_image, tnode); + + return ia < ib; +} + +static __always_inline int image_tree_comp(void *addr, struct latch_tree_node *n) +{ + void *image = container_of(n, struct bpf_image, tnode); + + if (addr < image) + return -1; + if (addr >= image + PAGE_SIZE) + return 1; + + return 0; +} + +static const struct latch_tree_ops image_tree_ops = { + .less = image_tree_less, + .comp = image_tree_comp, +}; + +void *bpf_image_alloc(void) +{ + struct bpf_image *image; + + image = bpf_jit_alloc_exec_page(); + if (!image) + return NULL; + + mutex_lock(&image_mutex); + latch_tree_insert(&image->tnode, &image_tree, &image_tree_ops); + mutex_unlock(&image_mutex); + return image->data; +} + +void bpf_image_delete(void *ptr) +{ + struct bpf_image *image = container_of(ptr, struct bpf_image, data); + + mutex_lock(&image_mutex); + latch_tree_erase(&image->tnode, &image_tree, &image_tree_ops); + synchronize_rcu(); + bpf_jit_free_exec(image); + mutex_unlock(&image_mutex); +} + +bool is_bpf_image(void *addr) +{ + return latch_tree_find(addr, &image_tree, &image_tree_ops) != NULL; +} + struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { struct bpf_trampoline *tr; @@ -50,7 +112,7 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) goto out; /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */ - image = bpf_jit_alloc_exec_page(); + image = bpf_image_alloc(); if (!image) { kfree(tr); tr = NULL; @@ -125,14 +187,14 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) } /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 - * bytes on x86. Pick a number to fit into PAGE_SIZE / 2 + * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2 */ #define BPF_MAX_TRAMP_PROGS 40 static int bpf_trampoline_update(struct bpf_trampoline *tr) { - void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2; - void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2; + void *old_image = tr->image + ((tr->selector + 1) & 1) * BPF_IMAGE_SIZE/2; + void *new_image = tr->image + (tr->selector & 1) * BPF_IMAGE_SIZE/2; struct bpf_prog *progs_to_run[BPF_MAX_TRAMP_PROGS]; int fentry_cnt = tr->progs_cnt[BPF_TRAMP_FENTRY]; int fexit_cnt = tr->progs_cnt[BPF_TRAMP_FEXIT]; @@ -160,7 +222,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) if (fexit_cnt) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; - err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, + err = arch_prepare_bpf_trampoline(new_image, new_image + BPF_IMAGE_SIZE / 2, &tr->func.model, flags, fentry, fentry_cnt, fexit, fexit_cnt, @@ -251,7 +313,7 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) goto out; if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) goto out; - bpf_jit_free_exec(tr->image); + bpf_image_delete(tr->image); hlist_del(&tr->hlist); kfree(tr); out: