On Mon, Mar 11, 2019 at 2:51 PM Daniel Borkmann <daniel@xxxxxxxxxxxxx> wrote: > > This work adds two new map creation flags BPF_F_RDONLY_PROG > and BPF_F_WRONLY_PROG in order to allow for read-only or > write-only BPF maps from a BPF program side. > > Today we have BPF_F_RDONLY and BPF_F_WRONLY, but this only > applies to system call side, meaning the BPF program has full > read/write access to the map as usual while bpf(2) calls with > map fd can either only read or write into the map depending > on the flags. BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG allows > for the exact opposite such that verifier is going to reject > program loads if write into a read-only map or a read into a > write-only map is detected. For read-only map case also some > helpers are forbidden for programs that would alter the map > state such as map deletion, update, etc. > > We've enabled this generic map extension to various non-special > maps holding normal user data: array, hash, lru, lpm, local > storage, queue and stack. Further map types could be followed > up in future depending on use-case. Main use case here is to > forbid writes into .rodata map values from verifier side. > > Signed-off-by: Daniel Borkmann <daniel@xxxxxxxxxxxxx> > --- > include/linux/bpf.h | 24 ++++++++++++++++++ > include/uapi/linux/bpf.h | 10 +++++++- > kernel/bpf/arraymap.c | 3 ++- > kernel/bpf/hashtab.c | 6 ++--- > kernel/bpf/local_storage.c | 6 ++--- > kernel/bpf/lpm_trie.c | 3 ++- > kernel/bpf/queue_stack_maps.c | 6 ++--- > kernel/bpf/syscall.c | 2 ++ > kernel/bpf/verifier.c | 46 +++++++++++++++++++++++++++++++++-- > 9 files changed, 92 insertions(+), 14 deletions(-) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 85b6b5dc883f..bb80c78924b0 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -427,6 +427,30 @@ struct bpf_array { > }; > }; > > +#define BPF_MAP_CAN_READ BIT(0) > +#define BPF_MAP_CAN_WRITE BIT(1) > + > +static inline u32 bpf_map_flags_to_cap(struct bpf_map *map) > +{ > + u32 access_flags = map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); > + > + /* Combination of BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG is > + * not possible. > + */ > + if (access_flags & BPF_F_RDONLY_PROG) > + return BPF_MAP_CAN_READ; > + else if (access_flags & BPF_F_WRONLY_PROG) > + return BPF_MAP_CAN_WRITE; > + else > + return BPF_MAP_CAN_READ | BPF_MAP_CAN_WRITE; > +} > + > +static inline bool bpf_map_flags_access_ok(u32 access_flags) > +{ > + return (access_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) != > + (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); > +} > + > #define MAX_TAIL_CALL_CNT 32 > > struct bpf_event_entry { > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index d0b80fce0fc9..e64fd9862e68 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -294,7 +294,7 @@ enum bpf_attach_type { > > #define BPF_OBJ_NAME_LEN 16U > > -/* Flags for accessing BPF object */ > +/* Flags for accessing BPF object from syscall side. */ > #define BPF_F_RDONLY (1U << 3) > #define BPF_F_WRONLY (1U << 4) > > @@ -304,6 +304,14 @@ enum bpf_attach_type { > /* Zero-initialize hash function seed. This should only be used for testing. */ > #define BPF_F_ZERO_SEED (1U << 6) > > +/* Flags for accessing BPF object from program side. */ > +#define BPF_F_RDONLY_PROG (1U << 7) > +#define BPF_F_WRONLY_PROG (1U << 8) > +#define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \ > + BPF_F_RDONLY_PROG | \ > + BPF_F_WRONLY | \ > + BPF_F_WRONLY_PROG) > + > /* flags for BPF_PROG_QUERY */ > #define BPF_F_QUERY_EFFECTIVE (1U << 0) > > diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c > index 862d20422ad1..6d2ce06485ae 100644 > --- a/kernel/bpf/arraymap.c > +++ b/kernel/bpf/arraymap.c > @@ -22,7 +22,7 @@ > #include "map_in_map.h" > > #define ARRAY_CREATE_FLAG_MASK \ > - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) > + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) > > static void bpf_array_free_percpu(struct bpf_array *array) > { > @@ -63,6 +63,7 @@ int array_map_alloc_check(union bpf_attr *attr) > if (attr->max_entries == 0 || attr->key_size != 4 || > attr->value_size == 0 || > attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || > + !bpf_map_flags_access_ok(attr->map_flags) || > (percpu && numa_node != NUMA_NO_NODE)) > return -EINVAL; > > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c > index fed15cf94dca..192d32e77db3 100644 > --- a/kernel/bpf/hashtab.c > +++ b/kernel/bpf/hashtab.c > @@ -23,7 +23,7 @@ > > #define HTAB_CREATE_FLAG_MASK \ > (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ > - BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ZERO_SEED) > + BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) > > struct bucket { > struct hlist_nulls_head head; > @@ -262,8 +262,8 @@ static int htab_map_alloc_check(union bpf_attr *attr) > /* Guard against local DoS, and discourage production use. */ > return -EPERM; > > - if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) > - /* reserved bits should not be used */ > + if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK || > + !bpf_map_flags_access_ok(attr->map_flags)) > return -EINVAL; > > if (!lru && percpu_lru) > diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c > index 6b572e2de7fb..980e8f1f6cb5 100644 > --- a/kernel/bpf/local_storage.c > +++ b/kernel/bpf/local_storage.c > @@ -14,7 +14,7 @@ DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STO > #ifdef CONFIG_CGROUP_BPF > > #define LOCAL_STORAGE_CREATE_FLAG_MASK \ > - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) > + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) > > struct bpf_cgroup_storage_map { > struct bpf_map map; > @@ -282,8 +282,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) > if (attr->value_size > PAGE_SIZE) > return ERR_PTR(-E2BIG); > > - if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK) > - /* reserved bits should not be used */ > + if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK || > + !bpf_map_flags_access_ok(attr->map_flags)) > return ERR_PTR(-EINVAL); > > if (attr->max_entries) > diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c > index 93a5cbbde421..e61630c2e50b 100644 > --- a/kernel/bpf/lpm_trie.c > +++ b/kernel/bpf/lpm_trie.c > @@ -538,7 +538,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) > #define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) > > #define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \ > - BPF_F_RDONLY | BPF_F_WRONLY) > + BPF_F_ACCESS_MASK) > > static struct bpf_map *trie_alloc(union bpf_attr *attr) > { > @@ -553,6 +553,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) > if (attr->max_entries == 0 || > !(attr->map_flags & BPF_F_NO_PREALLOC) || > attr->map_flags & ~LPM_CREATE_FLAG_MASK || > + !bpf_map_flags_access_ok(attr->map_flags) || > attr->key_size < LPM_KEY_SIZE_MIN || > attr->key_size > LPM_KEY_SIZE_MAX || > attr->value_size < LPM_VAL_SIZE_MIN || > diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c > index b384ea9f3254..0b140d236889 100644 > --- a/kernel/bpf/queue_stack_maps.c > +++ b/kernel/bpf/queue_stack_maps.c > @@ -11,8 +11,7 @@ > #include "percpu_freelist.h" > > #define QUEUE_STACK_CREATE_FLAG_MASK \ > - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) > - > + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) > > struct bpf_queue_stack { > struct bpf_map map; > @@ -52,7 +51,8 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr) > /* check sanity of attributes */ > if (attr->max_entries == 0 || attr->key_size != 0 || > attr->value_size == 0 || > - attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) > + attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK || > + !bpf_map_flags_access_ok(attr->map_flags)) > return -EINVAL; > > if (attr->value_size > KMALLOC_MAX_SIZE) > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index b0c7a6485c49..ba2fe4cfad09 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -481,6 +481,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, > map->spin_lock_off = btf_find_spin_lock(btf, value_type); > > if (map_value_has_spin_lock(map)) { > + if (map->map_flags & BPF_F_RDONLY_PROG) > + return -EACCES; Do we need to enforce this restriction? This would make sense if we were enforcing that any element that has spinlock inside has to have a lock taken, do we do that in verifier? > if (map->map_type != BPF_MAP_TYPE_HASH && > map->map_type != BPF_MAP_TYPE_ARRAY && > map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c > index 57678cef9a2c..af3cddb18efb 100644 > --- a/kernel/bpf/verifier.c > +++ b/kernel/bpf/verifier.c > @@ -1429,6 +1429,28 @@ static int check_stack_access(struct bpf_verifier_env *env, > return 0; > } > > +static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, > + int off, int size, enum bpf_access_type type) > +{ > + struct bpf_reg_state *regs = cur_regs(env); > + struct bpf_map *map = regs[regno].map_ptr; > + u32 cap = bpf_map_flags_to_cap(map); > + > + if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) { > + verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n", > + map->value_size, off, size); > + return -EACCES; > + } > + > + if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) { > + verbose(env, "read into map forbidden, value_size=%d off=%d size=%d\n", typo: "read from"? > + map->value_size, off, size); > + return -EACCES; > + } > + > + return 0; > +} > + > /* check read/write into map element returned by bpf_map_lookup_elem() */ > static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, > int size, bool zero_size_allowed) > @@ -2014,7 +2036,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn > verbose(env, "R%d leaks addr into map\n", value_regno); > return -EACCES; > } > - > + err = check_map_access_type(env, regno, off, size, t); > + if (err) > + return err; > err = check_map_access(env, regno, off, size, false); > if (!err && t == BPF_READ && value_regno >= 0) > mark_reg_unknown(env, regs, value_regno); > @@ -2250,6 +2274,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, > return check_packet_access(env, regno, reg->off, access_size, > zero_size_allowed); > case PTR_TO_MAP_VALUE: > + if (check_map_access_type(env, regno, reg->off, access_size, > + meta && meta->raw_mode ? BPF_WRITE : > + BPF_READ)) > + return -EACCES; > return check_map_access(env, regno, reg->off, access_size, > zero_size_allowed); > default: /* scalar_value|ptr_to_stack or invalid ptr */ > @@ -2971,6 +2999,7 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, > int func_id, int insn_idx) > { > struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; > + struct bpf_map *map = meta->map_ptr; > > if (func_id != BPF_FUNC_tail_call && > func_id != BPF_FUNC_map_lookup_elem && > @@ -2981,11 +3010,24 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, > func_id != BPF_FUNC_map_peek_elem) > return 0; > > - if (meta->map_ptr == NULL) { > + if (map == NULL) { > verbose(env, "kernel subsystem misconfigured verifier\n"); > return -EINVAL; > } > > + /* In case of read-only, some additional restrictions > + * need to be applied in order to prevent altering the > + * state of the map from program side. > + */ > + if ((map->map_flags & BPF_F_RDONLY_PROG) && > + (func_id == BPF_FUNC_map_delete_elem || > + func_id == BPF_FUNC_map_update_elem || > + func_id == BPF_FUNC_map_push_elem || > + func_id == BPF_FUNC_map_pop_elem)) { Curious, what about tail_calls? Is it considered a read? Is this checked as well? > + verbose(env, "write into map forbidden\n"); > + return -EACCES; > + } > + > if (!BPF_MAP_PTR(aux->map_state)) > bpf_map_ptr_store(aux, meta->map_ptr, > meta->map_ptr->unpriv_array); > -- > 2.17.1 >