On Sat, Mar 12, 2022 at 2:17 AM Joanne Koong <joannekoong@xxxxxx> wrote:
>
> From: Joanne Koong <joannelkoong@xxxxxxxxx>
>
> Currently, local storage memory can only be allocated atomically
> (GFP_ATOMIC). This restriction is too strict for sleepable bpf
> programs.
>
> In this patch, the verifier detects whether the program is sleepable,
> and passes the corresponding GFP_KERNEL or GFP_ATOMIC flag as a
> 5th argument to bpf_task/sk/inode_storage_get. This flag will propagate
> down to the local storage functions that allocate memory.
>
> A few things to note:
> * bpf_task/sk/inode_storage_update_elem functions are invoked by
> userspace applications through syscalls. Preemption is disabled before
> bpf_task/sk/inode_storage_update_elem is called, which means they will
> always have to allocate memory atomically.
>
> * In bpf_local_storage_update, we have to do the memory charging and
> allocation outside the spinlock. There are some cases where it is
> permissible for the memory charging and/or allocation to fail, so in
> these failure cases, we continue on to determine at a later time whether
> the failure is relevant.
>
> Signed-off-by: Joanne Koong <joannelkoong@xxxxxxxxx>
> ---
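
Just to confirm my reading of the mechanism described above: with the
do_misc_fixups() change further down, every call site of these three
helpers is effectively rewritten into the two-insn sequence below
(this is only my paraphrase of the insn_buf in the verifier hunk;
GFP_ATOMIC is used instead of GFP_KERNEL for non-sleepable programs):

	BPF_MOV64_IMM(BPF_REG_5, GFP_KERNEL)	/* prepended by the verifier */
	*insn					/* the original helper call */

so R5 is always overwritten before the helper runs, regardless of what
the program put there.
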
>  include/linux/bpf_local_storage.h |  7 +--
>  kernel/bpf/bpf_inode_storage.c    |  9 ++--
>  kernel/bpf/bpf_local_storage.c    | 77 +++++++++++++++++++++----------
>  kernel/bpf/bpf_task_storage.c     | 10 ++--
>  kernel/bpf/verifier.c             | 20 ++++++++
>  net/core/bpf_sk_storage.c         | 21 +++++----
>  6 files changed, 99 insertions(+), 45 deletions(-)
>
> diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
> index 37b3906af8b1..d6905e85399d 100644
> --- a/include/linux/bpf_local_storage.h
> +++ b/include/linux/bpf_local_storage.h
> @@ -154,16 +154,17 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem);
>
>  struct bpf_local_storage_elem *
>  bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
> -		bool charge_mem);
> +		gfp_t mem_flags);
>
>  int
>  bpf_local_storage_alloc(void *owner,
>  			struct bpf_local_storage_map *smap,
> -			struct bpf_local_storage_elem *first_selem);
> +			struct bpf_local_storage_elem *first_selem,
> +			gfp_t mem_flags);
>
>  struct bpf_local_storage_data *
>  bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
> -			 void *value, u64 map_flags);
> +			 void *value, u64 map_flags, gfp_t mem_flags);
>
>  void bpf_local_storage_free_rcu(struct rcu_head *rcu);
>
> diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
> index e29d9e3d853e..41b0bec1e7e9 100644
> --- a/kernel/bpf/bpf_inode_storage.c
> +++ b/kernel/bpf/bpf_inode_storage.c
> @@ -136,7 +136,7 @@ static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
>
>  	sdata = bpf_local_storage_update(f->f_inode,
>  					 (struct bpf_local_storage_map *)map,
> -					 value, map_flags);
> +					 value, map_flags, GFP_ATOMIC);
>  	fput(f);
>  	return PTR_ERR_OR_ZERO(sdata);
>  }
> @@ -169,8 +169,9 @@ static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
>  	return err;
>  }
>
> -BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
> -	   void *, value, u64, flags)
> +/* *mem_flags* is set by the bpf verifier */
> +BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
> +	   void *, value, u64, flags, gfp_t, mem_flags)
>  {
>  	struct bpf_local_storage_data *sdata;
>
> @@ -196,7 +197,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
>  	if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
>  		sdata = bpf_local_storage_update(
>  			inode, (struct bpf_local_storage_map *)map, value,
> -			BPF_NOEXIST);
> +			BPF_NOEXIST, mem_flags);
>  		return IS_ERR(sdata) ? (unsigned long)NULL :
>  			(unsigned long)sdata->data;
>  	}
> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> index 092a1ac772d7..ca402f9cf1a5 100644
> --- a/kernel/bpf/bpf_local_storage.c
> +++ b/kernel/bpf/bpf_local_storage.c
> @@ -63,23 +63,22 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
>
>  struct bpf_local_storage_elem *
>  bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
> -		void *value, bool charge_mem)
> +		void *value, gfp_t mem_flags)
>  {
>  	struct bpf_local_storage_elem *selem;
>
> -	if (charge_mem && mem_charge(smap, owner, smap->elem_size))
> +	if (mem_charge(smap, owner, smap->elem_size))
>  		return NULL;
>
>  	selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
> -				GFP_ATOMIC | __GFP_NOWARN);
> +				mem_flags | __GFP_NOWARN);
>  	if (selem) {
>  		if (value)
>  			memcpy(SDATA(selem)->data, value, smap->map.value_size);
>  		return selem;
>  	}
>
> -	if (charge_mem)
> -		mem_uncharge(smap, owner, smap->elem_size);
> +	mem_uncharge(smap, owner, smap->elem_size);
>
>  	return NULL;
>  }
> @@ -282,7 +281,8 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata,
>
>  int bpf_local_storage_alloc(void *owner,
>  			    struct bpf_local_storage_map *smap,
> -			    struct bpf_local_storage_elem *first_selem)
> +			    struct bpf_local_storage_elem *first_selem,
> +			    gfp_t mem_flags)
>  {
>  	struct bpf_local_storage *prev_storage, *storage;
>  	struct bpf_local_storage **owner_storage_ptr;
> @@ -293,7 +293,7 @@ int bpf_local_storage_alloc(void *owner,
>  		return err;
>
>  	storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
> -				  GFP_ATOMIC | __GFP_NOWARN);
> +				  mem_flags | __GFP_NOWARN);
>  	if (!storage) {
>  		err = -ENOMEM;
>  		goto uncharge;
> @@ -350,13 +350,13 @@ int bpf_local_storage_alloc(void *owner,
>   */
>  struct bpf_local_storage_data *
>  bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
> -			 void *value, u64 map_flags)
> +			 void *value, u64 map_flags, gfp_t mem_flags)
>  {
>  	struct bpf_local_storage_data *old_sdata = NULL;
>  	struct bpf_local_storage_elem *selem;
>  	struct bpf_local_storage *local_storage;
>  	unsigned long flags;
> -	int err;
> +	int err, charge_err;
>
>  	/* BPF_EXIST and BPF_NOEXIST cannot be both set */
>  	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
> @@ -373,11 +373,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
>  	if (err)
>  		return ERR_PTR(err);
>
> -	selem = bpf_selem_alloc(smap, owner, value, true);
> +	selem = bpf_selem_alloc(smap, owner, value, mem_flags);
>  	if (!selem)
>  		return ERR_PTR(-ENOMEM);
>
> -	err = bpf_local_storage_alloc(owner, smap, selem);
> +	err = bpf_local_storage_alloc(owner, smap, selem, mem_flags);
>  	if (err) {
>  		kfree(selem);
>  		mem_uncharge(smap, owner, smap->elem_size);
> @@ -404,6 +404,19 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
>  		}
>  	}
>
> +	/* Since mem_flags can be non-atomic, we need to do the memory
> +	 * allocation outside the spinlock.
> +	 *
> +	 * There are a few cases where it is permissible for the memory charge
> +	 * and allocation to fail (eg if BPF_F_LOCK is set and a local storage
> +	 * value already exists, we can swap values without needing an
> +	 * allocation), so in the case of a failure here, continue on and see
> +	 * if the failure is relevant.
> +	 */
> +	charge_err = mem_charge(smap, owner, smap->elem_size);
> +	selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
> +				mem_flags | __GFP_NOWARN);
> +
>  	raw_spin_lock_irqsave(&local_storage->lock, flags);
>
>  	/* Recheck local_storage->list under local_storage->lock */
> @@ -425,25 +438,37 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
>  	if (old_sdata && (map_flags & BPF_F_LOCK)) {
>  		copy_map_value_locked(&smap->map, old_sdata->data, value,
>  				      false);
> -		selem = SELEM(old_sdata);
> -		goto unlock;
> +
> +		raw_spin_unlock_irqrestore(&local_storage->lock, flags);
> +
> +		if (!charge_err)
> +			mem_uncharge(smap, owner, smap->elem_size);
> +		kfree(selem);
> +
> +		return old_sdata;
> +	}
> +
> +	if (!old_sdata && charge_err) {
> +		/* If there is no existing local storage value, then this means
> +		 * we needed the charge to succeed. We must make sure this did not
> +		 * return an error.
> +		 *
> +		 * Please note that if an existing local storage value exists, then
> +		 * it doesn't matter if the charge failed because we can just
> +		 * "reuse" the charge from the existing local storage element.
> +		 */

But we did allocate a new element which was unaccounted for, even if
only temporarily [for the short period until we freed the old element].

Martin, is this something we are okay with?

> +		err = charge_err;
> +		goto unlock_err;
>  	}
>
> -	/* local_storage->lock is held. Hence, we are sure
> -	 * we can unlink and uncharge the old_sdata successfully
> -	 * later. Hence, instead of charging the new selem now
> -	 * and then uncharge the old selem later (which may cause
> -	 * a potential but unnecessary charge failure), avoid taking
> -	 * a charge at all here (the "!old_sdata" check) and the
> -	 * old_sdata will not be uncharged later during
> -	 * bpf_selem_unlink_storage_nolock().
> -	 */
> -	selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
>  	if (!selem) {
>  		err = -ENOMEM;
>  		goto unlock_err;
>  	}
>
> +	if (value)
> +		memcpy(SDATA(selem)->data, value, smap->map.value_size);
> +
>  	/* First, link the new selem to the map */
>  	bpf_selem_link_map(smap, selem);
>
> @@ -454,15 +479,17 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
>  	if (old_sdata) {
>  		bpf_selem_unlink_map(SELEM(old_sdata));
>  		bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
> -						false);
> +						!charge_err);
>  	}
>
> -unlock:
>  	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
>  	return SDATA(selem);
>
>  unlock_err:
>  	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
> +	if (!charge_err)
> +		mem_uncharge(smap, owner, smap->elem_size);
> +	kfree(selem);
>  	return ERR_PTR(err);
>  }
>
> diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
> index 5da7bed0f5f6..bb9e22bad42b 100644
> --- a/kernel/bpf/bpf_task_storage.c
> +++ b/kernel/bpf/bpf_task_storage.c
> @@ -174,7 +174,8 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
>
>  	bpf_task_storage_lock();
>  	sdata = bpf_local_storage_update(
> -		task, (struct bpf_local_storage_map *)map, value, map_flags);
> +		task, (struct bpf_local_storage_map *)map, value, map_flags,
> +		GFP_ATOMIC);
>  	bpf_task_storage_unlock();
>
>  	err = PTR_ERR_OR_ZERO(sdata);
> @@ -226,8 +227,9 @@ static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
>  	return err;
>  }
>
> -BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
> -	   task, void *, value, u64, flags)
> +/* *mem_flags* is set by the bpf verifier */

Is there a precedent for this with any other helpers? You may want to
add here that "any value, even if set by uapi, will be ignored".

Can we go even beyond this and ensure that the verifier understands
that this is an "internal only arg", perhaps via something in
check_helper_call?

> +BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
> +	   task, void *, value, u64, flags, gfp_t, mem_flags)
>  {
>  	struct bpf_local_storage_data *sdata;
>
> @@ -250,7 +252,7 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
>  	    (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
>  		sdata = bpf_local_storage_update(
>  			task, (struct bpf_local_storage_map *)map, value,
> -			BPF_NOEXIST);
> +			BPF_NOEXIST, mem_flags);
>
>  unlock:
>  	bpf_task_storage_unlock();
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 0db6cd8dcb35..392fdaabedbd 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -13491,6 +13491,26 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>  			goto patch_call_imm;
>  		}
>
> +		if (insn->imm == BPF_FUNC_task_storage_get ||
> +		    insn->imm == BPF_FUNC_sk_storage_get ||
> +		    insn->imm == BPF_FUNC_inode_storage_get) {
> +			if (env->prog->aux->sleepable)
> +				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, GFP_KERNEL);
> +			else
> +				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, GFP_ATOMIC);
> +			insn_buf[1] = *insn;
> +			cnt = 2;
> +
> +			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
> +			if (!new_prog)
> +				return -ENOMEM;
> +
> +			delta += cnt - 1;
> +			env->prog = prog = new_prog;
> +			insn = new_prog->insnsi + i + delta;
> +			goto patch_call_imm;
> +		}
> +
>  		/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
>  		 * and other inlining handlers are currently limited to 64 bit
>  		 * only.
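
Regarding the "internal only arg" question above: I was picturing
something roughly like the following in check_helper_call(). This is a
completely untested sketch; ARG_INTERNAL is a made-up arg type, and the
proto changes it would need are elided:

	/* Hypothetical: a helper proto could declare its last arg as
	 * verifier-internal (e.g. .arg5_type = ARG_INTERNAL), and
	 * check_helper_call() could then reject anything the program
	 * itself put in R5 instead of silently clobbering it.
	 */
	if (fn->arg5_type == ARG_INTERNAL) {
		struct bpf_reg_state *regs = cur_regs(env);

		if (regs[BPF_REG_5].type != NOT_INIT) {
			verbose(env, "R5 is set internally by the verifier\n");
			return -EINVAL;
		}
	}

Not sure the churn is worth it, but it would make the contract explicit
rather than relying on the helpers declaring only four args.
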
> diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
> index d9c37fd10809..8790a3791d39 100644
> --- a/net/core/bpf_sk_storage.c
> +++ b/net/core/bpf_sk_storage.c
> @@ -141,7 +141,7 @@ static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
>  	if (sock) {
>  		sdata = bpf_local_storage_update(
>  			sock->sk, (struct bpf_local_storage_map *)map, value,
> -			map_flags);
> +			map_flags, GFP_ATOMIC);
>  		sockfd_put(sock);
>  		return PTR_ERR_OR_ZERO(sdata);
>  	}
> @@ -172,7 +172,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk,
>  {
>  	struct bpf_local_storage_elem *copy_selem;
>
> -	copy_selem = bpf_selem_alloc(smap, newsk, NULL, true);
> +	copy_selem = bpf_selem_alloc(smap, newsk, NULL, GFP_ATOMIC);
>  	if (!copy_selem)
>  		return NULL;
>
> @@ -230,7 +230,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
>  		bpf_selem_link_map(smap, copy_selem);
>  		bpf_selem_link_storage_nolock(new_sk_storage, copy_selem);
>  	} else {
> -		ret = bpf_local_storage_alloc(newsk, smap, copy_selem);
> +		ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC);
>  		if (ret) {
>  			kfree(copy_selem);
>  			atomic_sub(smap->elem_size,
> @@ -255,8 +255,9 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
>  	return ret;
>  }
>
> -BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> -	   void *, value, u64, flags)
> +/* *mem_flags* is set by the bpf verifier */
> +BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> +	   void *, value, u64, flags, gfp_t, mem_flags)
>  {
>  	struct bpf_local_storage_data *sdata;
>
> @@ -277,7 +278,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
>  	    refcount_inc_not_zero(&sk->sk_refcnt)) {
>  		sdata = bpf_local_storage_update(
>  			sk, (struct bpf_local_storage_map *)map, value,
> -			BPF_NOEXIST);
> +			BPF_NOEXIST, mem_flags);
>  		/* sk must be a fullsock (guaranteed by verifier),
>  		 * so sock_gen_put() is unnecessary.
>  		 */
> @@ -417,14 +418,16 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
>  	return false;
>  }
>
> -BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
> -	   void *, value, u64, flags)
> +/* *mem_flags* is set by the bpf verifier */
> +BPF_CALL_5(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
> +	   void *, value, u64, flags, gfp_t, mem_flags)
>  {
>  	WARN_ON_ONCE(!bpf_rcu_lock_held());
>  	if (in_hardirq() || in_nmi())
>  		return (unsigned long)NULL;
>
> -	return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags);
> +	return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags,
> +						     mem_flags);
>  }
>
>  BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map,
> --
> 2.30.2
>
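
And for completeness, nothing changes from the BPF program side: the
helper is still called with four arguments, and the gfp flag stays
invisible to the program. Below is a minimal sketch of the kind of
sleepable program this patch enables (the map, hook, and value layout
are just for illustration):

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	char _license[] SEC("license") = "GPL";

	struct {
		__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
		__uint(map_flags, BPF_F_NO_PREALLOC);
		__type(key, int);
		__type(value, __u64);
	} task_map SEC(".maps");

	SEC("lsm.s/file_open")	/* sleepable LSM hook */
	int BPF_PROG(file_open_count, struct file *file)
	{
		__u64 *val;

		/* Same 4-arg call as before. Because this program is
		 * sleepable, the verifier patches GFP_KERNEL into the
		 * hidden 5th arg, so the allocation below may sleep.
		 */
		val = bpf_task_storage_get(&task_map,
					   bpf_get_current_task_btf(), 0,
					   BPF_LOCAL_STORAGE_GET_F_CREATE);
		if (val)
			__sync_fetch_and_add(val, 1);

		return 0;
	}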