On 2/16/24 10:28 AM, thinker.li@xxxxxxxxx wrote:
From: Kui-Feng Lee <thinker.li@xxxxxxxxx>
The BPF struct_ops previously allowed only one page to be used for the
trampolines of all links in a map. However, we have recently run out of
space due to the large number of BPF program links. By allocating
additional pages when an existing page is exhausted, we can accommodate
more links in a single map.

The variable st_map->image has been renamed to st_map->image_pages, and
its type changed to an array of pointers to PAGE_SIZE buffers. The array
is dynamically resized, and additional pages are allocated when all
existing pages are exhausted.
Signed-off-by: Kui-Feng Lee <thinker.li@xxxxxxxxx>
---
kernel/bpf/bpf_struct_ops.c | 99 ++++++++++++++++++++++++++++++-------
1 file changed, 80 insertions(+), 19 deletions(-)
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 0d7be97a2411..bb7ae665006a 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -30,12 +30,11 @@ struct bpf_struct_ops_map {
*/
struct bpf_link **links;
u32 links_cnt;
- /* image is a page that has all the trampolines
+ u32 image_pages_cnt;
+ /* image_pages is an array of pages that have all the trampolines
* that stores the func args before calling the bpf_prog.
- * A PAGE_SIZE "image" is enough to store all trampoline for
- * "links[]".
*/
- void *image;
+ void **image_pages;
/* The owner moduler's btf. */
struct btf *btf;
/* uvalue->data stores the kernel struct
@@ -535,7 +534,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
void *udata, *kdata;
int prog_fd, err;
void *image, *image_end;
- u32 i;
+ void **image_pages;
+ u32 i, next_page = 0;
if (flags)
return -EINVAL;
@@ -573,8 +573,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
udata = &uvalue->data;
kdata = &kvalue->data;
- image = st_map->image;
- image_end = st_map->image + PAGE_SIZE;
+ image = st_map->image_pages[next_page++];
+ image_end = image + PAGE_SIZE;
module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
for_each_member(i, t, member) {
@@ -657,6 +657,43 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
&st_ops->func_models[i],
*(void **)(st_ops->cfi_stubs + moff),
image, image_end);
+ if (err == -E2BIG) {
+ /* Use an additional page to try again.
+ *
+ * It may reuse pages allocated for the previous
+ * failed calls.
+ */
+ if (next_page >= st_map->image_pages_cnt) {
This check (more on this later) ...
+ /* Allocate an additional page */
+ image_pages = krealloc(st_map->image_pages,
+ (st_map->image_pages_cnt + 1) * sizeof(void *),
+ GFP_KERNEL);
From the patch 2 test, one page is enough for at least 20 ops. How about
keeping it simple and allowing a max of 8 pages, which should be much more
than enough for sane usage? (i.e. add "void *images[MAX_STRUCT_OPS_PAGES];"
to "struct bpf_struct_ops_map").
+ if (!image_pages) {
+ err = -ENOMEM;
+ goto reset_unlock;
+ }
+ st_map->image_pages = image_pages;
+
+ err = bpf_jit_charge_modmem(PAGE_SIZE);
+ if (err)
+ goto reset_unlock;
+
+ image = arch_alloc_bpf_trampoline(PAGE_SIZE);
+ if (!image) {
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
+ err = -ENOMEM;
+ goto reset_unlock;
+ }
+ st_map->image_pages[st_map->image_pages_cnt++] = image;
+ }
+ image = st_map->image_pages[next_page++];
+ image_end = image + PAGE_SIZE;
+
+ err = bpf_struct_ops_prepare_trampoline(tlinks, link,
+ &st_ops->func_models[i],
+ *(void **)(st_ops->cfi_stubs + moff),
+ image, image_end);
+ }
if (err < 0)
goto reset_unlock;
@@ -667,6 +704,18 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
*(unsigned long *)(udata + moff) = prog->aux->id;
}
+ while (next_page < st_map->image_pages_cnt) {
This check and the above "if (next_page >= st_map->image_pages_cnt)" should
not happen in the common case, right?

Together with the new comment after the above "if (err == -E2BIG)", is it
trying to optimize by reusing the pages allocated in a previous errored-out
map_update_elem() call?

How about keeping it simple for the common case and always freeing all pages
when map_update_elem() fails?
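e.g. with a small helper (untested; the name is made up) that this error
path and __bpf_struct_ops_map_free() could share:

	static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map)
	{
		u32 i;

		/* Free and uncharge all trampoline pages of this map */
		for (i = 0; i < st_map->image_pages_cnt; i++)
			arch_free_bpf_trampoline(st_map->image_pages[i],
						 PAGE_SIZE);
		bpf_jit_uncharge_modmem(PAGE_SIZE * st_map->image_pages_cnt);
		st_map->image_pages_cnt = 0;
	}

Then the reset_unlock path does not need to track which pages are
reusable at all.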
Also, after this patch, the same calls are used in different places.
arch_alloc_bpf_trampoline() is done in two different places, one in
map_alloc() and one in map_update_elem(). How about doing all the page
allocation in map_update_elem()?

bpf_struct_ops_prepare_trampoline() is also called in two different places
within the same map_update_elem(). Looking inside
bpf_struct_ops_prepare_trampoline(), it already calls
arch_bpf_trampoline_size() to learn the required size first, so
bpf_struct_ops_prepare_trampoline() should be a better place to call
arch_alloc_bpf_trampoline() when needed. Then there is no need to retry
bpf_struct_ops_prepare_trampoline() in map_update_elem().
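Roughly (untested; assumes bpf_struct_ops_prepare_trampoline() takes the
current page and an offset cursor from the caller as "void **image" and
"u32 *image_off", plus a made-up st_map_image_grow() helper that does the
bpf_jit_charge_modmem() + arch_alloc_bpf_trampoline() bookkeeping):

	/* inside bpf_struct_ops_prepare_trampoline(), after the existing
	 * tlinks/flags setup
	 */
	size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
	if (size < 0)
		return size;

	/* Not enough room left in the current page? Allocate a new one
	 * here instead of returning -E2BIG and retrying in the caller.
	 */
	if (!*image || *image_off + size > PAGE_SIZE) {
		*image = st_map_image_grow(st_map);	/* made-up helper */
		if (IS_ERR(*image))
			return PTR_ERR(*image);
		*image_off = 0;
	}

	return arch_prepare_bpf_trampoline(NULL, *image + *image_off,
					   *image + PAGE_SIZE, model, flags,
					   tlinks, stub_func);

map_update_elem() would then only remember *image + *image_off as the
trampoline address before the call and advance the cursor by the
(positive) return value on success.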
+ /* Free unused pages
+ *
+ * The value cannot be updated anymore once it has been accepted
+ * by st_ops->validate() or st_ops->reg(). So,
+ * there is no reason to keep the unused pages.
+ */
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
+ image = st_map->image_pages[--st_map->image_pages_cnt];
+ arch_free_bpf_trampoline(image, PAGE_SIZE);
+ }
+
if (st_map->map.map_flags & BPF_F_LINK) {
err = 0;
if (st_ops->validate) {
@@ -674,7 +723,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
if (err)
goto reset_unlock;
}
- arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
+ for (i = 0; i < next_page; i++)
+ arch_protect_bpf_trampoline(st_map->image_pages[i],
+ PAGE_SIZE);
arch_protect_bpf_trampoline() is called here for BPF_F_LINK.
/* Let bpf_link handle registration & unregistration.
*
* Pair with smp_load_acquire() during lookup_elem().
@@ -683,7 +734,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
goto unlock;
}
- arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
+ for (i = 0; i < next_page; i++)
+ arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);
arch_protect_bpf_trampoline() is also called here, in the same function,
for the non-BPF_F_LINK case.

Can this be cleaned up a bit? For example, "st_ops->validate(kdata);" should
not be specific to BPF_F_LINK, which was brought up earlier when making
"->validate" optional. It is a good time to clean this up as well; see the
sketch below.
----
pw-bot: cr
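For the cleanup, something like this (untested) hoists validate() and the
page protection out of the BPF_F_LINK branch:

	if (st_ops->validate) {
		err = st_ops->validate(kdata);
		if (err)
			goto reset_unlock;
	}

	for (i = 0; i < next_page; i++)
		arch_protect_bpf_trampoline(st_map->image_pages[i],
					    PAGE_SIZE);

	if (st_map->map.map_flags & BPF_F_LINK) {
		err = 0;
		/* Let bpf_link handle registration & unregistration.
		 *
		 * Pair with smp_load_acquire() during lookup_elem().
		 */
		smp_store_release(&kvalue->common.state,
				  BPF_STRUCT_OPS_STATE_READY);
		goto unlock;
	}

	err = st_ops->reg(kdata);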
err = st_ops->reg(kdata);
if (likely(!err)) {
/* This refcnt increment on the map here after
@@ -706,7 +758,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
* there was a race in registering the struct_ops (under the same name) to
* a sub-system through different struct_ops's maps.
*/
- arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE);
+ for (i = 0; i < next_page; i++)
+ arch_unprotect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);
reset_unlock:
bpf_struct_ops_map_put_progs(st_map);
@@ -771,14 +824,15 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
static void __bpf_struct_ops_map_free(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+ int i;
if (st_map->links)
bpf_struct_ops_map_put_progs(st_map);
bpf_map_area_free(st_map->links);
- if (st_map->image) {
- arch_free_bpf_trampoline(st_map->image, PAGE_SIZE);
- bpf_jit_uncharge_modmem(PAGE_SIZE);
- }
+ for (i = 0; i < st_map->image_pages_cnt; i++)
+ arch_free_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);
+ bpf_jit_uncharge_modmem(PAGE_SIZE * st_map->image_pages_cnt);
+ kfree(st_map->image_pages);
bpf_map_area_free(st_map->uvalue);
bpf_map_area_free(st_map);
}
@@ -888,20 +942,27 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
st_map->st_ops_desc = st_ops_desc;
map = &st_map->map;
+ st_map->image_pages = kcalloc(1, sizeof(void *), GFP_KERNEL);
+ if (!st_map->image_pages) {
+ ret = -ENOMEM;
+ goto errout_free;
+ }
+
ret = bpf_jit_charge_modmem(PAGE_SIZE);
if (ret)
goto errout_free;
- st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE);
- if (!st_map->image) {
- /* __bpf_struct_ops_map_free() uses st_map->image as flag
- * for "charged or not". In this case, we need to unchange
- * here.
+ st_map->image_pages[0] = arch_alloc_bpf_trampoline(PAGE_SIZE);
+ if (!st_map->image_pages[0]) {
+ /* __bpf_struct_ops_map_free() uses st_map->image_pages_cnt
+ * to figure out the number of pages to uncharge. In this case, we
+ * need to uncharge here.
*/
bpf_jit_uncharge_modmem(PAGE_SIZE);
ret = -ENOMEM;
goto errout_free;
}
+ st_map->image_pages_cnt = 1;
st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
st_map->links_cnt = btf_type_vlen(t);
st_map->links =