The changes in this patch deserve closer look, so it has been split into its own independent patch. While earlier we just had to skip two objects at most while copying in and out of map, now we have potentially many objects (at most 8 + 2 = 10, due to the BPF_MAP_VALUE_OFF_MAX limit). Hence, divide the copy_map_value function into an inlined fast path and function call to slowpath. The slowpath handles the case of > 3 offsets, while we handle the most common cases (0, 1, 2, or 3 offsets) in the inline function itself. In copy_map_value_slow, we use 11 offsets, just to make the for loop that copies the value free of edge cases for the last offset, by using map->value_size as final offset to subtract remaining area to copy from. Signed-off-by: Kumar Kartikeya Dwivedi <memxor@xxxxxxxxx> --- include/linux/bpf.h | 43 +++++++++++++++++++++++++++++++--- kernel/bpf/syscall.c | 55 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ae599aaf8d4c..5d845ca02eba 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -253,12 +253,22 @@ static inline void check_and_init_map_value(struct bpf_map *map, void *dst) memset(dst + map->spin_lock_off, 0, sizeof(struct bpf_spin_lock)); if (unlikely(map_value_has_timer(map))) memset(dst + map->timer_off, 0, sizeof(struct bpf_timer)); + if (unlikely(map_value_has_ptr_to_btf_id(map))) { + struct bpf_map_value_off *tab = map->ptr_off_tab; + int i; + + for (i = 0; i < tab->nr_off; i++) + *(u64 *)(dst + tab->off[i].offset) = 0; + } } +void copy_map_value_slow(struct bpf_map *map, void *dst, void *src, u32 s_off, + u32 s_sz, u32 t_off, u32 t_sz); + /* copy everything but bpf_spin_lock and bpf_timer. There could be one of each. */ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) { - u32 s_off = 0, s_sz = 0, t_off = 0, t_sz = 0; + u32 s_off = 0, s_sz = 0, t_off = 0, t_sz = 0, p_off = 0, p_sz = 0; if (unlikely(map_value_has_spin_lock(map))) { s_off = map->spin_lock_off; @@ -268,13 +278,40 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) t_off = map->timer_off; t_sz = sizeof(struct bpf_timer); } + /* Multiple offset case is slow, offload to function */ + if (unlikely(map_value_has_ptr_to_btf_id(map))) { + struct bpf_map_value_off *tab = map->ptr_off_tab; + + /* Inline the likely common case */ + if (likely(tab->nr_off == 1)) { + p_off = tab->off[0].offset; + p_sz = sizeof(u64); + } else { + copy_map_value_slow(map, dst, src, s_off, s_sz, t_off, t_sz); + return; + } + } + + if (unlikely(s_sz || t_sz || p_sz)) { + /* The order is p_off, t_off, s_off, use insertion sort */ - if (unlikely(s_sz || t_sz)) { + if (t_off < p_off || !t_sz) { + swap(t_off, p_off); + swap(t_sz, p_sz); + } if (s_off < t_off || !s_sz) { swap(s_off, t_off); swap(s_sz, t_sz); + if (t_off < p_off || !t_sz) { + swap(t_off, p_off); + swap(t_sz, p_sz); + } } - memcpy(dst, src, t_off); + + memcpy(dst, src, p_off); + memcpy(dst + p_off + p_sz, + src + p_off + p_sz, + t_off - p_off - p_sz); memcpy(dst + t_off + t_sz, src + t_off + t_sz, s_off - t_off - t_sz); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index beb96866f34d..83d71d6912f5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -30,6 +30,7 @@ #include <linux/pgtable.h> #include <linux/bpf_lsm.h> #include <linux/poll.h> +#include <linux/sort.h> #include <linux/bpf-netns.h> #include <linux/rcupdate_trace.h> #include <linux/memcontrol.h> @@ -230,6 +231,60 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, return err; } +static int copy_map_value_cmp(const void *_a, const void *_b) +{ + const u32 a = *(const u32 *)_a; + const u32 b = *(const u32 *)_b; + + /* We only need to sort based on offset */ + if (a < b) + return -1; + else if (a > b) + return 1; + return 0; +} + +void copy_map_value_slow(struct bpf_map *map, void *dst, void *src, u32 s_off, + u32 s_sz, u32 t_off, u32 t_sz) +{ + struct bpf_map_value_off *tab = map->ptr_off_tab; /* already set to non-NULL */ + /* 3 = 2 for bpf_timer, bpf_spin_lock, 1 for map->value_size sentinel */ + struct { + u32 off; + u32 sz; + } off_arr[BPF_MAP_VALUE_OFF_MAX + 3]; + int i, cnt = 0; + + /* Reconsider stack usage when bumping BPF_MAP_VALUE_OFF_MAX */ + BUILD_BUG_ON(sizeof(off_arr) != 88); + + for (i = 0; i < tab->nr_off; i++) { + off_arr[cnt].off = tab->off[i].offset; + off_arr[cnt++].sz = sizeof(u64); + } + if (s_sz) { + off_arr[cnt].off = s_off; + off_arr[cnt++].sz = s_sz; + } + if (t_sz) { + off_arr[cnt].off = t_off; + off_arr[cnt++].sz = t_sz; + } + off_arr[cnt].off = map->value_size; + + sort(off_arr, cnt, sizeof(off_arr[0]), copy_map_value_cmp, NULL); + + /* There is always at least one element */ + memcpy(dst, src, off_arr[0].off); + /* Copy the rest, while skipping other regions */ + for (i = 1; i < cnt; i++) { + u32 curr_off = off_arr[i - 1].off + off_arr[i - 1].sz; + u32 next_off = off_arr[i].off; + + memcpy(dst + curr_off, src + curr_off, next_off - curr_off); + } +} + static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, __u64 flags) { -- 2.35.1