From: Hou Tao <houtao1@xxxxxxxxxx> A userspace application uses the bpf syscall to lookup or update a bpf map. It passes a pointer to a fixed-size buffer to the kernel to represent the map key. To support maps with variable-length keys, introduce bpf_dynptr_user to allow userspace to pass a pointer to a bpf_dynptr_user which specifies the address and the length of the key buffer. In order to represent a dynptr from userspace, add a new dynptr type: BPF_DYNPTR_TYPE_USER. Because a BPF_DYNPTR_TYPE_USER-typed dynptr is not available to bpf programs, no verifier update is needed. Add dynptr_key_off in bpf_map to distinguish a map with a fixed-size key from a map with a variable-length key. dynptr_key_off is less than zero for a fixed-size key and can only be zero for a dynptr key. For a dynptr-key map, the key btf type is bpf_dynptr and the key size is 16, so use the lower 32 bits of map_extra to specify the maximum size of the dynptr key. Signed-off-by: Hou Tao <houtao1@xxxxxxxxxx> --- include/linux/bpf.h | 8 +++ include/uapi/linux/bpf.h | 6 ++ kernel/bpf/map_in_map.c | 3 + kernel/bpf/syscall.c | 121 +++++++++++++++++++++++++++------ tools/include/uapi/linux/bpf.h | 6 ++ 5 files changed, 125 insertions(+), 19 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 66a18dc67b46..44bef4110179 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -216,6 +216,7 @@ struct bpf_map { int spin_lock_off; /* >=0 valid offset, <0 error */ struct bpf_map_value_off *kptr_off_tab; int timer_off; /* >=0 valid offset, <0 error */ + int dynptr_key_off; /* >=0 valid offset, <0 error */ u32 id; int numa_node; u32 btf_key_type_id; @@ -265,6 +266,11 @@ static inline bool map_value_has_kptrs(const struct bpf_map *map) return !IS_ERR_OR_NULL(map->kptr_off_tab); } +static inline bool map_key_has_dynptr(const struct bpf_map *map) +{ + return map->dynptr_key_off >= 0; +} + static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { if (unlikely(map_value_has_spin_lock(map))) @@ -2654,6 +2660,8 @@ 
enum bpf_dynptr_type { BPF_DYNPTR_TYPE_LOCAL, /* Underlying data is a kernel-produced ringbuf record */ BPF_DYNPTR_TYPE_RINGBUF, + /* Points to memory copied from/to userspace */ + BPF_DYNPTR_TYPE_USER, }; void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ead35f39f185..3466bcc9aeca 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6814,6 +6814,12 @@ struct bpf_timer { __u64 :64; } __attribute__((aligned(8))); +struct bpf_dynptr_user { + __u64 data; + __u32 size; + __u32 :32; +} __attribute__((aligned(8))); + struct bpf_dynptr { __u64 :64; __u64 :64; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 135205d0d560..8ba96337893b 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -52,6 +52,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->max_entries = inner_map->max_entries; inner_map_meta->spin_lock_off = inner_map->spin_lock_off; inner_map_meta->timer_off = inner_map->timer_off; + inner_map_meta->dynptr_key_off = inner_map->dynptr_key_off; inner_map_meta->kptr_off_tab = bpf_map_copy_kptr_off_tab(inner_map); if (inner_map->btf) { btf_get(inner_map->btf); @@ -85,7 +86,9 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, meta0->key_size == meta1->key_size && meta0->value_size == meta1->value_size && meta0->timer_off == meta1->timer_off && + meta0->dynptr_key_off == meta1->dynptr_key_off && meta0->map_flags == meta1->map_flags && + meta0->map_extra == meta1->map_extra && bpf_map_equal_kptr_off_tab(meta0, meta1); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 372fad5ef3d3..70919155c4ed 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -996,6 +996,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, key_type = btf_type_id_size(btf, &btf_key_id, &key_size); if (!key_type || key_size != map->key_size) return -EINVAL; + map->dynptr_key_off = 
btf_find_dynptr(btf, key_type); } else { key_type = btf_type_by_id(btf, 0); if (!map->ops->map_check_btf) @@ -1089,10 +1090,6 @@ static int map_create(union bpf_attr *attr) return -EINVAL; } - if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && - attr->map_extra != 0) - return -EINVAL; - f_flags = bpf_get_file_flag(attr->map_flags); if (f_flags < 0) return f_flags; @@ -1119,6 +1116,7 @@ static int map_create(union bpf_attr *attr) map->spin_lock_off = -EINVAL; map->timer_off = -EINVAL; + map->dynptr_key_off = -EINVAL; if (attr->btf_key_type_id || attr->btf_value_type_id || /* Even the map's value is a kernel's struct, * the bpf_prog.o must have BTF to begin with @@ -1154,6 +1152,20 @@ static int map_create(union bpf_attr *attr) attr->btf_vmlinux_value_type_id; } + if (map_key_has_dynptr(map)) { + /* The lower 32-bits of map_extra specifies the maximum size + * of bpf_dynptr-typed key + */ + if (!attr->map_extra || (attr->map_extra >> 32) || + bpf_dynptr_check_size((u32)attr->map_extra)) { + err = -EINVAL; + goto free_map; + } + } else if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && attr->map_extra != 0) { + err = -EINVAL; + goto free_map; + } + err = bpf_map_alloc_off_arr(map); if (err) goto free_map; @@ -1280,10 +1292,41 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) return -ENOTSUPP; } -static void *__bpf_copy_key(void __user *ukey, u64 key_size) +static void *bpf_copy_from_dynptr_ukey(bpfptr_t ukey) +{ + struct bpf_dynptr_kern *kptr; + struct bpf_dynptr_user uptr; + bpfptr_t data; + + if (copy_from_bpfptr(&uptr, ukey, sizeof(uptr))) + return ERR_PTR(-EFAULT); + + if (!uptr.size || bpf_dynptr_check_size(uptr.size)) + return ERR_PTR(-EINVAL); + + /* Allocate and free bpf_dynptr_kern and its data together */ + kptr = kvmalloc(sizeof(*kptr) + uptr.size, GFP_USER | __GFP_NOWARN); + if (!kptr) + return ERR_PTR(-ENOMEM); + + data = make_bpfptr(uptr.data, bpfptr_is_kernel(ukey)); + if (copy_from_bpfptr(&kptr[1], data, uptr.size)) { + 
kvfree(kptr); + return ERR_PTR(-EFAULT); + } + + bpf_dynptr_init(kptr, &kptr[1], BPF_DYNPTR_TYPE_USER, 0, uptr.size); + + return kptr; +} + +static void *__bpf_copy_key(const struct bpf_map *map, void __user *ukey) { - if (key_size) - return vmemdup_user(ukey, key_size); + if (map_key_has_dynptr(map)) + return bpf_copy_from_dynptr_ukey(USER_BPFPTR(ukey)); + + if (map->key_size) + return vmemdup_user(ukey, map->key_size); if (ukey) return ERR_PTR(-EINVAL); @@ -1291,10 +1334,13 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size) return NULL; } -static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size) +static void *___bpf_copy_key(const struct bpf_map *map, bpfptr_t ukey) { - if (key_size) - return kvmemdup_bpfptr(ukey, key_size); + if (map_key_has_dynptr(map)) + return bpf_copy_from_dynptr_ukey(ukey); + + if (map->key_size) + return kvmemdup_bpfptr(ukey, map->key_size); if (!bpfptr_is_null(ukey)) return ERR_PTR(-EINVAL); @@ -1302,6 +1348,38 @@ static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size) return NULL; } +static void *bpf_new_dynptr_key(u32 key_size) +{ + struct bpf_dynptr_kern *kptr; + + kptr = kvmalloc(sizeof(*kptr) + key_size, GFP_USER | __GFP_NOWARN); + if (kptr) + bpf_dynptr_init(kptr, &kptr[1], BPF_DYNPTR_TYPE_USER, 0, key_size); + return kptr; +} + +static int bpf_copy_to_dynptr_ukey(struct bpf_dynptr_user __user *uptr, + struct bpf_dynptr_kern *kptr) +{ + struct { + unsigned int size; + unsigned int zero; + } tuple; + u64 udata; + + if (copy_from_user(&udata, &uptr->data, sizeof(udata))) + return -EFAULT; + + /* Also zeroing the reserved field in uptr */ + tuple.size = bpf_dynptr_get_size(kptr); + tuple.zero = 0; + if (copy_to_user(u64_to_user_ptr(udata), kptr->data + kptr->offset, tuple.size) || + copy_to_user(&uptr->size, &tuple, sizeof(tuple))) + return -EFAULT; + + return 0; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags @@ -1337,7 +1415,7 @@ static int 
map_lookup_elem(union bpf_attr *attr) goto err_put; } - key = __bpf_copy_key(ukey, map->key_size); + key = __bpf_copy_key(map, ukey); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -1377,7 +1455,6 @@ static int map_lookup_elem(union bpf_attr *attr) return err; } - #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) @@ -1410,7 +1487,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } - key = ___bpf_copy_key(ukey, map->key_size); + key = ___bpf_copy_key(map, ukey); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -1458,7 +1535,7 @@ static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } - key = ___bpf_copy_key(ukey, map->key_size); + key = ___bpf_copy_key(map, ukey); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -1514,7 +1591,7 @@ static int map_get_next_key(union bpf_attr *attr) } if (ukey) { - key = __bpf_copy_key(ukey, map->key_size); + key = __bpf_copy_key(map, ukey); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -1524,7 +1601,10 @@ static int map_get_next_key(union bpf_attr *attr) } err = -ENOMEM; - next_key = kvmalloc(map->key_size, GFP_USER); + if (map_key_has_dynptr(map)) + next_key = bpf_new_dynptr_key(map->map_extra); + else + next_key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!next_key) goto free_key; @@ -1540,8 +1620,11 @@ static int map_get_next_key(union bpf_attr *attr) if (err) goto free_next_key; - err = -EFAULT; - if (copy_to_user(unext_key, next_key, map->key_size) != 0) + if (map_key_has_dynptr(map)) + err = bpf_copy_to_dynptr_ukey(unext_key, next_key); + else + err = copy_to_user(unext_key, next_key, map->key_size) != 0 ? 
-EFAULT : 0; + if (err) goto free_next_key; err = 0; @@ -1815,7 +1898,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) goto err_put; } - key = __bpf_copy_key(ukey, map->key_size); + key = __bpf_copy_key(map, ukey); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ead35f39f185..3466bcc9aeca 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6814,6 +6814,12 @@ struct bpf_timer { __u64 :64; } __attribute__((aligned(8))); +struct bpf_dynptr_user { + __u64 data; + __u32 size; + __u32 :32; +} __attribute__((aligned(8))); + struct bpf_dynptr { __u64 :64; __u64 :64; -- 2.29.2