The recent commit fbee97feed9b ("bpf: Add support to attach bpf program to a devmap entry"), introduced ability to attach (and run) a separate XDP bpf_prog for each devmap entry. A bpf_prog is added via a file-descriptor, thus not using the feature requires using value minus-1. The UAPI is extended via tail-extending struct bpf_devmap_val and using map->value_size to determine the feature set. There is a specific problem with dev_map_can_have_prog() check, which is called from net/core/dev.c in generic_xdp_install() to refuse usage of devmap's from generic-XDP that support these bpf_prog's. The check is size based. This means that all newer features will be blocked from being use by generic-XDP. This patch allows userspace to skip handling of 'bpf_prog' on map-inserts. The feature can be skipped, via not including the member 'bpf_prog' in the map-value struct, which is propagated/described via BTF. Fixes: fbee97feed9b ("bpf: Add support to attach bpf program to a devmap entry") Signed-off-by: Jesper Dangaard Brouer <brouer@xxxxxxxxxx --- kernel/bpf/devmap.c | 295 ++++++++++++++++---- .../bpf/progs/test_xdp_with_devmap_helpers.c | 4 2 files changed, 235 insertions(+), 64 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 854b09beb16b..8a223557123c 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -48,6 +48,7 @@ #include <net/xdp.h> #include <linux/filter.h> #include <trace/events/xdp.h> +#include <linux/btf.h> #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) @@ -60,10 +61,19 @@ struct xdp_dev_bulk_queue { unsigned int count; }; -/* DEVMAP values */ +/* DEVMAP map-value layout. + * + * The struct data-layout of map-value is a configuration interface. + * BPF-prog side have read-only access to this memory. + * + * The layout might be different than below, because some struct members are + * optional. This is made dynamic by requiring userspace provides an BTF + * description of the struct layout, when creating the BPF-map. Struct names + * are important and part of API, as BTF use these names to identify members. + */ struct bpf_devmap_val { u32 ifindex; /* device index */ - union { + union bpf_prog_union { int fd; /* prog fd on map write */ u32 id; /* prog id on map read */ } bpf_prog; @@ -76,13 +86,21 @@ struct bpf_dtab_netdev { struct bpf_prog *xdp_prog; struct rcu_head rcu; unsigned int idx; - struct bpf_devmap_val val; + struct bpf_devmap_val val; /* actual layout defined by userspace BTF */ +}; + +struct bpf_devmap_val_cfg { + struct { + int ifindex; + int bpf_prog; + } btf_offset; }; struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */ struct list_head list; + struct bpf_devmap_val_cfg cfg; /* these are only used for DEVMAP_HASH type maps */ struct hlist_head *dev_index_head; @@ -116,20 +134,27 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { - u32 valsize = attr->value_size; u64 cost = 0; int err; - /* check sanity of attributes. 2 value sizes supported: - * 4 bytes: ifindex - * 8 bytes: ifindex + prog fd - */ + /* Value contents validated in dev_map_check_btf */ if (attr->max_entries == 0 || attr->key_size != 4 || - (valsize != offsetofend(struct bpf_devmap_val, ifindex) && - valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || + attr->value_size > sizeof(struct bpf_devmap_val) || attr->map_flags & ~DEV_CREATE_FLAG_MASK) return -EINVAL; + /* Enforce BTF for userspace, unless dealing with legacy kABI */ + if (attr->value_size != 4 && + (!attr->btf_key_type_id || !attr->btf_value_type_id)) + return -EOPNOTSUPP; + + /* Mark BTF offset's as invalid */ + dtab->cfg.btf_offset.ifindex = -1; + dtab->cfg.btf_offset.bpf_prog = -1; + + if (attr->value_size == 4 && !attr->btf_value_type_id) + dtab->cfg.btf_offset.ifindex = 0; + /* Lookup returns a pointer straight to dev->ifindex, so make sure the * verifier prevents writes from the BPF side */ @@ -199,6 +224,118 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) return &dtab->map; } +struct expect { + u8 btf_kind; + bool mandatory; + int bit_offset; + int size; + const char *name; +}; + +/* Note: refactor and move to kernel/bpf/btf.c when other users appear */ +static int btf_find_expect_layout_offset(const struct btf *btf, + const struct btf_type *value_type, + const struct expect *layout) +{ + const struct btf_member *member; + u32 i, off = -ENOENT; + + for_each_member(i, value_type, member) { + const struct btf_type *member_type; + const char *member_name; + + member_type = btf_type_skip_modifiers(btf, member->type, NULL); + if (BTF_INFO_KIND(member_type->info) != layout->btf_kind) { + continue; + } + + member_name = btf_name_by_offset(btf, member->name_off); + if (!member_name) + return -EINVAL; + + if (strcmp(layout->name, member_name)) + continue; + + if (layout->size > 0 && // btf_type_has_size(member_type) && + member_type->size != layout->size) + continue; + + off = btf_member_bit_offset(value_type, member); + if (layout->bit_offset > 0 && + layout->bit_offset != off) { + off = -ENOENT; + continue; + } + + if (off % 8) /* valid C code cannot generate such BTF */ + return -EINVAL; + + return off; + } + return off; +} + +/* Expected BTF layout that match struct bpf_devmap_val */ +static const struct expect layout[] = { + {BTF_KIND_INT, true, -1, 4, "ifindex"}, + {BTF_KIND_UNION, false, -1, 4, "bpf_prog"} +}; + +static int dev_map_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + u32 found_members_cnt = 0; + u32 int_data; + int off; + u32 i; + + /* Validate KEY type and size */ + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + return -EOPNOTSUPP; + + int_data = *(u32 *)(key_type + 1); + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0) + return -EOPNOTSUPP; + + if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT) + return -EOPNOTSUPP; + + /* Struct/union members in BTF must not exceed (max) expected members */ + if (btf_type_vlen(value_type) > ARRAY_SIZE(layout)) + return -E2BIG; + + for (i = 0; i < ARRAY_SIZE(layout); i++) { + off = btf_find_expect_layout_offset(btf, value_type, &layout[i]); + + if (off < 0 && layout[i].mandatory) + return -EUCLEAN; + + if (off >= 0) + found_members_cnt++; + + /* Transfer layout config to map */ + switch (i) { + case 0: + dtab->cfg.btf_offset.ifindex = off; + break; + case 1: + dtab->cfg.btf_offset.bpf_prog = off; + break; + default: + break; + } + } + + /* Detect if BTF/vlen have members that were not found */ + if (btf_type_vlen(value_type) > found_members_cnt) + return -E2BIG; + + return 0; +} + static void dev_map_free(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); @@ -338,16 +475,6 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } -bool dev_map_can_have_prog(struct bpf_map *map) -{ - if ((map->map_type == BPF_MAP_TYPE_DEVMAP || - map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) && - map->value_size != offsetofend(struct bpf_devmap_val, ifindex)) - return true; - - return false; -} - static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; @@ -601,42 +728,101 @@ static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) return ret; } +static inline bool map_value_has_bpf_prog(const struct bpf_dtab *dtab) +{ + return dtab->cfg.btf_offset.bpf_prog >= 0; +} + +bool dev_map_can_have_prog(struct bpf_map *map) +{ + struct bpf_dtab *dtab; + + if (map->map_type != BPF_MAP_TYPE_DEVMAP && + map->map_type != BPF_MAP_TYPE_DEVMAP_HASH) + return false; + + dtab = container_of(map, struct bpf_dtab, map); + if (map_value_has_bpf_prog(dtab)) + return true; + + return false; +} + +static union bpf_prog_union *map_value_ptr_bpf_prog(const struct bpf_dtab *dtab, + void* val) +{ + u32 off = dtab->cfg.btf_offset.bpf_prog / 8; + union bpf_prog_union *bpf_prog = NULL; + + bpf_prog = (union bpf_prog_union *)(val + off); + return bpf_prog; +} + +static int map_value_ifindex(const struct bpf_dtab *dtab, void* val) +{ + int ifindex = 0; + u32 off; + + if (dtab->cfg.btf_offset.ifindex < 0) + goto out; + + off = dtab->cfg.btf_offset.ifindex / 8; + ifindex = *(int *)(val + off); +out: + return ifindex; +} + static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, - struct bpf_dtab *dtab, - struct bpf_devmap_val *val, + struct bpf_map *map, + void *value, unsigned int idx) { + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; + int ifindex; - dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, + dev = kzalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); - dev->dev = dev_get_by_index(net, val->ifindex); + /* Already verified value_size <= sizeof bpf_devmap_val */ + memcpy(&dev->val, value, map->value_size); + + /* Member: ifindex is mandatory, both BTF and kABI */ + ifindex = map_value_ifindex(dtab, &dev->val); + if (!ifindex) + return ERR_PTR(-EINVAL); + + dev->dev = dev_get_by_index(net, ifindex); if (!dev->dev) goto err_out; - if (val->bpf_prog.fd >= 0) { - prog = bpf_prog_get_type_dev(val->bpf_prog.fd, - BPF_PROG_TYPE_XDP, false); - if (IS_ERR(prog)) - goto err_put_dev; - if (prog->expected_attach_type != BPF_XDP_DEVMAP) - goto err_put_prog; - } + /* Member: bpf_prog union is optional */ + if (map_value_has_bpf_prog(dtab)) { + union bpf_prog_union *bpf_prog; + bpf_prog = map_value_ptr_bpf_prog(dtab, &dev->val); + + if (bpf_prog->fd >= 0) { + prog = bpf_prog_get_type_dev(bpf_prog->fd, + BPF_PROG_TYPE_XDP, false); + if (IS_ERR(prog)) + goto err_put_dev; + if (prog->expected_attach_type != BPF_XDP_DEVMAP) + goto err_put_prog; + } + if (prog) { + dev->xdp_prog = prog; + bpf_prog->id = prog->aux->id; + } else { + dev->xdp_prog = NULL; + bpf_prog->id = 0; + } + } dev->idx = idx; dev->dtab = dtab; - if (prog) { - dev->xdp_prog = prog; - dev->val.bpf_prog.id = prog->aux->id; - } else { - dev->xdp_prog = NULL; - dev->val.bpf_prog.id = 0; - } - dev->val.ifindex = val->ifindex; return dev; err_put_prog: @@ -652,7 +838,6 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; u32 i = *(u32 *)key; @@ -663,19 +848,9 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; - /* already verified value_size <= sizeof val */ - memcpy(&val, value, map->value_size); - - if (!val.ifindex) { - dev = NULL; - /* can not specify fd if ifindex is 0 */ - if (val.bpf_prog.fd != -1) - return -EINVAL; - } else { - dev = __dev_map_alloc_node(net, dtab, &val, i); - if (IS_ERR(dev)) - return PTR_ERR(dev); - } + dev = __dev_map_alloc_node(net, map, value, i); + if (IS_ERR(dev)) + return PTR_ERR(dev); /* Use call_rcu() here to ensure rcu critical sections have completed * Remembering the driver side flush operation will happen before the @@ -699,16 +874,12 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; - /* already verified value_size <= sizeof val */ - memcpy(&val, value, map->value_size); - - if (unlikely(map_flags > BPF_EXIST || !val.ifindex)) + if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; spin_lock_irqsave(&dtab->index_lock, flags); @@ -717,7 +888,7 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, if (old_dev && (map_flags & BPF_NOEXIST)) goto out_err; - dev = __dev_map_alloc_node(net, dtab, &val, idx); + dev = __dev_map_alloc_node(net, map, value, idx); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out_err; @@ -762,7 +933,7 @@ const struct bpf_map_ops dev_map_ops = { .map_lookup_elem = dev_map_lookup_elem, .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, - .map_check_btf = map_check_no_btf, + .map_check_btf = dev_map_check_btf, }; const struct bpf_map_ops dev_map_hash_ops = { @@ -772,7 +943,7 @@ const struct bpf_map_ops dev_map_hash_ops = { .map_lookup_elem = dev_map_hash_lookup_elem, .map_update_elem = dev_map_hash_update_elem, .map_delete_elem = dev_map_hash_delete_elem, - .map_check_btf = map_check_no_btf, + .map_check_btf = dev_map_check_btf, }; static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, diff --git a/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c index deef0e050863..d2481ec10764 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c @@ -5,8 +5,8 @@ struct { __uint(type, BPF_MAP_TYPE_DEVMAP); - __uint(key_size, sizeof(__u32)); - __uint(value_size, sizeof(struct bpf_devmap_val)); + __type(key, u32); + __type(value, struct bpf_devmap_val); __uint(max_entries, 4); } dm_ports SEC(".maps");