From: Joanne Koong <joannelkoong@xxxxxxxxx>

Currently, local storage memory can only be allocated atomically
(GFP_ATOMIC). This restriction is too strict for sleepable bpf
programs.

In this patch, the verifier detects whether the program is sleepable,
and passes the corresponding GFP_KERNEL or GFP_ATOMIC flag as a
5th argument to bpf_task/sk/inode_storage_get. This flag will
propagate down to the local storage functions that allocate memory.

A few things to note:
* bpf_task/sk/inode_storage_update_elem functions are invoked by
userspace applications through syscalls. Preemption is disabled before
bpf_task/sk/inode_storage_update_elem is called, which means they will
always have to allocate memory atomically.
* In bpf_local_storage_update, we have to do the memory charging and
allocation outside the spinlock. There are some cases where it is
permissible for the memory charging and/or allocation to fail, so in
these failure cases, we continue on to determine at a later time
whether the failure is relevant.

Signed-off-by: Joanne Koong <joannelkoong@xxxxxxxxx>
---
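Note for reviewers (kept below the "---" so it stays out of the commit
message): a minimal sketch of the case this patch enables. The map
name, program name, and attach hook below are illustrative, not taken
from this series. A sleepable LSM program calling
bpf_task_storage_get() with BPF_LOCAL_STORAGE_GET_F_CREATE previously
had its allocation forced to GFP_ATOMIC; with this patch, the fixup in
do_misc_fixups() loads GFP_KERNEL into R5 in front of the call:

/* Illustrative sketch only: assumes a task storage map named
 * "task_map" and the inode_getattr LSM hook; any sleepable attach
 * point behaves the same way.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, __u64);
} task_map SEC(".maps");

SEC("lsm.s/inode_getattr")	/* ".s" marks the program sleepable */
int BPF_PROG(count_getattr, struct path *path)
{
	__u64 *val;

	/* The verifier patches in a hidden 5th argument to this call:
	 * GFP_KERNEL for this sleepable program, GFP_ATOMIC if the same
	 * call sat in a non-sleepable "lsm/" program.
	 */
	val = bpf_task_storage_get(&task_map, bpf_get_current_task_btf(),
				   0, BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (val)
		(*val)++;
	return 0;
}

char _license[] SEC("license") = "GPL";

Since the rewrite happens once at verifier fixup time, a non-sleepable
program pays nothing beyond the single extra BPF_MOV64_IMM before the
call.
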
 include/linux/bpf_local_storage.h |  7 +--
 kernel/bpf/bpf_inode_storage.c    |  9 ++--
 kernel/bpf/bpf_local_storage.c    | 77 +++++++++++++++++++++----------
 kernel/bpf/bpf_task_storage.c     | 10 ++--
 kernel/bpf/verifier.c             | 20 ++++++++
 net/core/bpf_sk_storage.c         | 21 +++++----
 6 files changed, 99 insertions(+), 45 deletions(-)

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 37b3906af8b1..d6905e85399d 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -154,16 +154,17 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem);
 
 struct bpf_local_storage_elem *
 bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
-		bool charge_mem);
+		gfp_t mem_flags);
 
 int
 bpf_local_storage_alloc(void *owner,
 			struct bpf_local_storage_map *smap,
-			struct bpf_local_storage_elem *first_selem);
+			struct bpf_local_storage_elem *first_selem,
+			gfp_t mem_flags);
 
 struct bpf_local_storage_data *
 bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
-			 void *value, u64 map_flags);
+			 void *value, u64 map_flags, gfp_t mem_flags);
 
 void bpf_local_storage_free_rcu(struct rcu_head *rcu);
 
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index e29d9e3d853e..41b0bec1e7e9 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -136,7 +136,7 @@ static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
 
 	sdata = bpf_local_storage_update(f->f_inode,
 					 (struct bpf_local_storage_map *)map,
-					 value, map_flags);
+					 value, map_flags, GFP_ATOMIC);
 	fput(f);
 	return PTR_ERR_OR_ZERO(sdata);
 }
@@ -169,8 +169,9 @@ static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
 	return err;
 }
 
-BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
-	   void *, value, u64, flags)
+/* *mem_flags* is set by the bpf verifier */
+BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
+	   void *, value, u64, flags, gfp_t, mem_flags)
 {
 	struct bpf_local_storage_data *sdata;
 
@@ -196,7 +197,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
 	if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
 		sdata = bpf_local_storage_update(
 			inode, (struct bpf_local_storage_map *)map, value,
-			BPF_NOEXIST);
+			BPF_NOEXIST, mem_flags);
 		return IS_ERR(sdata) ? (unsigned long)NULL :
 			(unsigned long)sdata->data;
 	}
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 092a1ac772d7..ca402f9cf1a5 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -63,23 +63,22 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
 
 struct bpf_local_storage_elem *
 bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
-		void *value, bool charge_mem)
+		void *value, gfp_t mem_flags)
 {
 	struct bpf_local_storage_elem *selem;
 
-	if (charge_mem && mem_charge(smap, owner, smap->elem_size))
+	if (mem_charge(smap, owner, smap->elem_size))
 		return NULL;
 
 	selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
-				GFP_ATOMIC | __GFP_NOWARN);
+				mem_flags | __GFP_NOWARN);
 	if (selem) {
 		if (value)
 			memcpy(SDATA(selem)->data, value, smap->map.value_size);
 		return selem;
 	}
 
-	if (charge_mem)
-		mem_uncharge(smap, owner, smap->elem_size);
+	mem_uncharge(smap, owner, smap->elem_size);
 
 	return NULL;
 }
@@ -282,7 +281,8 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata,
 
 int bpf_local_storage_alloc(void *owner,
 			    struct bpf_local_storage_map *smap,
-			    struct bpf_local_storage_elem *first_selem)
+			    struct bpf_local_storage_elem *first_selem,
+			    gfp_t mem_flags)
 {
 	struct bpf_local_storage *prev_storage, *storage;
 	struct bpf_local_storage **owner_storage_ptr;
@@ -293,7 +293,7 @@ int bpf_local_storage_alloc(void *owner,
 		return err;
 
 	storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
-				  GFP_ATOMIC | __GFP_NOWARN);
+				  mem_flags | __GFP_NOWARN);
 	if (!storage) {
 		err = -ENOMEM;
 		goto uncharge;
@@ -350,13 +350,13 @@ int bpf_local_storage_alloc(void *owner,
  */
 struct bpf_local_storage_data *
 bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
-			 void *value, u64 map_flags)
+			 void *value, u64 map_flags, gfp_t mem_flags)
 {
 	struct bpf_local_storage_data *old_sdata = NULL;
 	struct bpf_local_storage_elem *selem;
 	struct bpf_local_storage *local_storage;
 	unsigned long flags;
-	int err;
+	int err, charge_err;
 
 	/* BPF_EXIST and BPF_NOEXIST cannot be both set */
 	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
@@ -373,11 +373,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		if (err)
 			return ERR_PTR(err);
 
-		selem = bpf_selem_alloc(smap, owner, value, true);
+		selem = bpf_selem_alloc(smap, owner, value, mem_flags);
 		if (!selem)
 			return ERR_PTR(-ENOMEM);
 
-		err = bpf_local_storage_alloc(owner, smap, selem);
+		err = bpf_local_storage_alloc(owner, smap, selem, mem_flags);
 		if (err) {
 			kfree(selem);
 			mem_uncharge(smap, owner, smap->elem_size);
@@ -404,6 +404,19 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		}
 	}
 
+	/* Since mem_flags can be non-atomic, we need to do the memory
+	 * allocation outside the spinlock.
+	 *
+	 * There are a few cases where it is permissible for the memory charge
+	 * and allocation to fail (eg if BPF_F_LOCK is set and a local storage
+	 * value already exists, we can swap values without needing an
+	 * allocation), so in the case of a failure here, continue on and see
+	 * if the failure is relevant.
+	 */
+	charge_err = mem_charge(smap, owner, smap->elem_size);
+	selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
+				mem_flags | __GFP_NOWARN);
+
 	raw_spin_lock_irqsave(&local_storage->lock, flags);
 
 	/* Recheck local_storage->list under local_storage->lock */
@@ -425,25 +438,37 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 	if (old_sdata && (map_flags & BPF_F_LOCK)) {
 		copy_map_value_locked(&smap->map, old_sdata->data, value,
 				      false);
-		selem = SELEM(old_sdata);
-		goto unlock;
+
+		raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+
+		if (!charge_err)
+			mem_uncharge(smap, owner, smap->elem_size);
+		kfree(selem);
+
+		return old_sdata;
+	}
+
+	if (!old_sdata && charge_err) {
+		/* If there is no existing local storage value, then this means
+		 * we needed the charge to succeed. We must make sure this did not
+		 * return an error.
+		 *
+		 * Please note that if an existing local storage value exists, then
+		 * it doesn't matter if the charge failed because we can just
+		 * "reuse" the charge from the existing local storage element.
+		 */
+		err = charge_err;
+		goto unlock_err;
 	}
 
-	/* local_storage->lock is held. Hence, we are sure
-	 * we can unlink and uncharge the old_sdata successfully
-	 * later. Hence, instead of charging the new selem now
-	 * and then uncharge the old selem later (which may cause
-	 * a potential but unnecessary charge failure), avoid taking
-	 * a charge at all here (the "!old_sdata" check) and the
-	 * old_sdata will not be uncharged later during
-	 * bpf_selem_unlink_storage_nolock().
-	 */
-	selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
 	if (!selem) {
 		err = -ENOMEM;
 		goto unlock_err;
 	}
 
+	if (value)
+		memcpy(SDATA(selem)->data, value, smap->map.value_size);
+
 	/* First, link the new selem to the map */
 	bpf_selem_link_map(smap, selem);
 
@@ -454,15 +479,17 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 	if (old_sdata) {
 		bpf_selem_unlink_map(SELEM(old_sdata));
 		bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
-						false);
+						!charge_err);
 	}
 
-unlock:
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 	return SDATA(selem);
 
 unlock_err:
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+	if (!charge_err)
+		mem_uncharge(smap, owner, smap->elem_size);
+	kfree(selem);
 	return ERR_PTR(err);
 }
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 5da7bed0f5f6..bb9e22bad42b 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -174,7 +174,8 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
 
 	bpf_task_storage_lock();
 	sdata = bpf_local_storage_update(
-		task, (struct bpf_local_storage_map *)map, value, map_flags);
+		task, (struct bpf_local_storage_map *)map, value, map_flags,
+		GFP_ATOMIC);
 	bpf_task_storage_unlock();
 
 	err = PTR_ERR_OR_ZERO(sdata);
@@ -226,8 +227,9 @@ static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
 	return err;
 }
 
-BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
-	   task, void *, value, u64, flags)
+/* *mem_flags* is set by the bpf verifier */
+BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
+	   task, void *, value, u64, flags, gfp_t, mem_flags)
 {
 	struct bpf_local_storage_data *sdata;
 
@@ -250,7 +252,7 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
 	    (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
 		sdata = bpf_local_storage_update(
 			task, (struct bpf_local_storage_map *)map, value,
-			BPF_NOEXIST);
+			BPF_NOEXIST, mem_flags);
 
 unlock:
 	bpf_task_storage_unlock();
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0db6cd8dcb35..392fdaabedbd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13491,6 +13491,26 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			goto patch_call_imm;
 		}
 
+		if (insn->imm == BPF_FUNC_task_storage_get ||
+		    insn->imm == BPF_FUNC_sk_storage_get ||
+		    insn->imm == BPF_FUNC_inode_storage_get) {
+			if (env->prog->aux->sleepable)
+				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, GFP_KERNEL);
+			else
+				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, GFP_ATOMIC);
+			insn_buf[1] = *insn;
+			cnt = 2;
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta += cnt - 1;
+			env->prog = prog = new_prog;
+			insn = new_prog->insnsi + i + delta;
+			goto patch_call_imm;
+		}
+
 		/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
 		 * and other inlining handlers are currently limited to 64 bit
 		 * only.
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index d9c37fd10809..8790a3791d39 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -141,7 +141,7 @@ static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
 	if (sock) {
 		sdata = bpf_local_storage_update(
 			sock->sk, (struct bpf_local_storage_map *)map, value,
-			map_flags);
+			map_flags, GFP_ATOMIC);
 		sockfd_put(sock);
 		return PTR_ERR_OR_ZERO(sdata);
 	}
@@ -172,7 +172,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk,
 {
 	struct bpf_local_storage_elem *copy_selem;
 
-	copy_selem = bpf_selem_alloc(smap, newsk, NULL, true);
+	copy_selem = bpf_selem_alloc(smap, newsk, NULL, GFP_ATOMIC);
 	if (!copy_selem)
 		return NULL;
 
@@ -230,7 +230,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
 			bpf_selem_link_map(smap, copy_selem);
 			bpf_selem_link_storage_nolock(new_sk_storage, copy_selem);
 		} else {
-			ret = bpf_local_storage_alloc(newsk, smap, copy_selem);
+			ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC);
 			if (ret) {
 				kfree(copy_selem);
 				atomic_sub(smap->elem_size,
@@ -255,8 +255,9 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
 	return ret;
 }
 
-BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
-	   void *, value, u64, flags)
+/* *mem_flags* is set by the bpf verifier */
+BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
+	   void *, value, u64, flags, gfp_t, mem_flags)
 {
 	struct bpf_local_storage_data *sdata;
 
@@ -277,7 +278,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
 	    refcount_inc_not_zero(&sk->sk_refcnt)) {
 		sdata = bpf_local_storage_update(
 			sk, (struct bpf_local_storage_map *)map, value,
-			BPF_NOEXIST);
+			BPF_NOEXIST, mem_flags);
 		/* sk must be a fullsock (guaranteed by verifier),
 		 * so sock_gen_put() is unnecessary.
 		 */
@@ -417,14 +418,16 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
 	return false;
 }
 
-BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
-	   void *, value, u64, flags)
+/* *mem_flags* is set by the bpf verifier */
+BPF_CALL_5(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
+	   void *, value, u64, flags, gfp_t, mem_flags)
 {
 	WARN_ON_ONCE(!bpf_rcu_lock_held());
 	if (in_hardirq() || in_nmi())
 		return (unsigned long)NULL;
 
-	return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags);
+	return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags,
+						     mem_flags);
 }
 
 BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map,
-- 
2.30.2