On Thu, Mar 23, 2023 at 1:02 PM Aditi Ghag <aditi.ghag@xxxxxxxxxxxxx> wrote:
>
>
>
> > On Mar 21, 2023, at 2:02 PM, Stanislav Fomichev <sdf@xxxxxxxxxx> wrote:
> >
> > On 03/21, Aditi Ghag wrote:
> >> The socket destroy kfunc is used to forcefully terminate sockets from
> >> certain BPF contexts. We plan to use the capability in Cilium to force
> >> client sockets to reconnect when their remote load-balancing backends are
> >> deleted. The other use case is on-the-fly policy enforcement where existing
> >> socket connections prevented by policies need to be forcefully terminated.
> >> The helper allows terminating sockets that may or may not be actively
> >> sending traffic.
> >
> >> The helper is currently exposed to certain BPF iterators where users can
> >> filter and terminate selected sockets. Additionally, the helper can only
> >> be called from BPF contexts that ensure socket locking in order to
> >> allow synchronous execution of destroy helpers that also acquire socket
> >> locks. The previous commit that batches UDP sockets during iteration
> >> facilitated a synchronous invocation of the destroy helper from BPF context
> >> by skipping taking socket locks in the destroy handler. TCP iterators
> >> already supported batching.
> >
> >> The helper takes a `sock_common` type argument, even though it expects, and
> >> casts it to, a `sock` pointer. This enables the verifier to allow the
> >> sock_destroy kfunc to be called for TCP with `sock_common` and UDP with
> >> `sock` structs. As a comparison, BPF helpers enable this behavior with the
> >> `ARG_PTR_TO_BTF_ID_SOCK_COMMON` argument type. However, there is no such
> >> option available with the verifier logic that handles kfuncs where BTF
> >> types are inferred. Furthermore, as `sock_common` only has a subset of
> >> certain fields of `sock`, casting a pointer to the latter type might not
> >> always be safe. Hence, the BPF kfunc converts the argument to a full sock
> >> before casting.
> >
> >> Signed-off-by: Aditi Ghag <aditi.ghag@xxxxxxxxxxxxx>
> >> ---
> >>  include/net/udp.h |  1 +
> >>  net/core/filter.c | 54 ++++++++++++++++++++++++++++++++++++++++++
> >>  net/ipv4/tcp.c    | 16 +++++++++----
> >>  net/ipv4/udp.c    | 60 +++++++++++++++++++++++++++++++++++++----------
> >>  4 files changed, 114 insertions(+), 17 deletions(-)
> >
> >> diff --git a/include/net/udp.h b/include/net/udp.h
> >> index de4b528522bb..d2999447d3f2 100644
> >> --- a/include/net/udp.h
> >> +++ b/include/net/udp.h
> >> @@ -437,6 +437,7 @@ struct udp_seq_afinfo {
> >>  struct udp_iter_state {
> >>  	struct seq_net_private	p;
> >>  	int			bucket;
> >> +	int			offset;
> >>  	struct udp_seq_afinfo	*bpf_seq_afinfo;
> >>  };
> >
> >> diff --git a/net/core/filter.c b/net/core/filter.c
> >> index 1d6f165923bf..ba3e0dac119c 100644
> >> --- a/net/core/filter.c
> >> +++ b/net/core/filter.c
> >> @@ -11621,3 +11621,57 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id)
> >
> >>  	return func;
> >>  }
> >> +
> >> +/* Disables missing prototype warnings */
> >> +__diag_push();
> >> +__diag_ignore_all("-Wmissing-prototypes",
> >> +		  "Global functions as their definitions will be in vmlinux BTF");
> >> +
> >> +/* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code.
> >> + *
> >> + * The helper expects a non-NULL pointer to a socket. It invokes the
> >> + * protocol specific socket destroy handlers.
> >> + *
> >> + * The helper can only be called from BPF contexts that have acquired the socket
> >> + * locks.
> >> + *
> >> + * Parameters:
> >> + * @sock: Pointer to socket to be destroyed
> >> + *
> >> + * Return:
> >> + * On error, may return EPROTONOSUPPORT, EINVAL.
> >> + * EPROTONOSUPPORT if protocol specific destroy handler is not implemented.
> >> + * 0 otherwise
> >> + */
> >> +__bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
> >> +{
> >> +	struct sock *sk = (struct sock *)sock;
> >> +
> >> +	if (!sk)
> >> +		return -EINVAL;
> >> +
> >> +	/* The locking semantics that allow for synchronous execution of the
> >> +	 * destroy handlers are only supported for TCP and UDP.
> >> +	 */
> >> +	if (!sk->sk_prot->diag_destroy || sk->sk_protocol == IPPROTO_RAW)
> >> +		return -EOPNOTSUPP;
> >
> > What makes IPPROTO_RAW special? Looks like it locks the socket as well?
>
> I haven't looked at the locking semantics for IPPROTO_RAW. These can be
> handled in a follow-up patch. Wdyt?

Fine with me. Maybe make it more opt-in? (vs current "opt ipproto_raw
out")

if (sk->sk_prot->diag_destroy != udp_abort &&
    sk->sk_prot->diag_destroy != tcp_abort)
	return -EOPNOTSUPP;

Is it more robust? Or does it look uglier? )

But maybe fine as is, I'm just thinking out loud..
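(For anyone following along: a minimal sketch of how a BPF program
might call this kfunc from a TCP iterator. The program name and the
port filter below are illustrative only, not taken from this series:

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_endian.h>

	/* kfunc added by this patch; resolved as a kernel symbol. */
	int bpf_sock_destroy(struct sock_common *sk) __ksym;

	SEC("iter/tcp")
	int destroy_tcp_by_port(struct bpf_iter__tcp *ctx)
	{
		struct sock_common *sk_common = ctx->sk_common;

		if (!sk_common)
			return 0;

		/* Hypothetical filter: abort connections to port 4321. */
		if (sk_common->skc_dport == bpf_htons(4321))
			bpf_sock_destroy(sk_common);

		return 0;
	}

	char _license[] SEC("license") = "GPL";

Note that the TCP iterator hands the program a sock_common pointer,
which is exactly why the kfunc takes sock_common rather than sock.)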
> >
> >> +
> >> +	return sk->sk_prot->diag_destroy(sk, ECONNABORTED);
> >> +}
> >> +
> >> +__diag_pop()
> >> +
> >> +BTF_SET8_START(sock_destroy_kfunc_set)
> >> +BTF_ID_FLAGS(func, bpf_sock_destroy)
> >> +BTF_SET8_END(sock_destroy_kfunc_set)
> >> +
> >> +static const struct btf_kfunc_id_set bpf_sock_destroy_kfunc_set = {
> >> +	.owner = THIS_MODULE,
> >> +	.set = &sock_destroy_kfunc_set,
> >> +};
> >> +
> >> +static int init_subsystem(void)
> >> +{
> >> +	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sock_destroy_kfunc_set);
> >> +}
> >> +late_initcall(init_subsystem);
> >> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> >> index 33f559f491c8..59a833c0c872 100644
> >> --- a/net/ipv4/tcp.c
> >> +++ b/net/ipv4/tcp.c
> >> @@ -4678,8 +4678,10 @@ int tcp_abort(struct sock *sk, int err)
> >>  		return 0;
> >>  	}
> >
> >> -	/* Don't race with userspace socket closes such as tcp_close. */
> >> -	lock_sock(sk);
> >> +	/* BPF context ensures sock locking. */
> >> +	if (!has_current_bpf_ctx())
> >> +		/* Don't race with userspace socket closes such as tcp_close. */
> >> +		lock_sock(sk);
> >
> >>  	if (sk->sk_state == TCP_LISTEN) {
> >>  		tcp_set_state(sk, TCP_CLOSE);
> >> @@ -4688,7 +4690,8 @@ int tcp_abort(struct sock *sk, int err)
> >
> >>  	/* Don't race with BH socket closes such as inet_csk_listen_stop. */
> >>  	local_bh_disable();
> >
> > [..]
> >
> >> -	bh_lock_sock(sk);
> >> +	if (!has_current_bpf_ctx())
> >> +		bh_lock_sock(sk);
> >
> > These are spinlocks and should probably be grabbed in the bpf context as
> > well?
>
> Fixed in the next revision. Thanks!
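(Side note for readers of the thread: has_current_bpf_ctx() only says
"this task is currently running a BPF program", i.e. that the iterator
side has already taken the sock lock. Its definition in
include/linux/bpf.h is, roughly:

	static inline bool has_current_bpf_ctx(void)
	{
		return !!current->bpf_ctx;
	}

It implies nothing about the BH spinlock, hence the point above that
bh_lock_sock()/bh_unlock_sock() should stay unconditional.)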
>
> >
> >>  	if (!sock_flag(sk, SOCK_DEAD)) {
> >>  		sk->sk_err = err;
> >> @@ -4700,10 +4703,13 @@ int tcp_abort(struct sock *sk, int err)
> >>  		tcp_done(sk);
> >>  	}
> >
> >> -	bh_unlock_sock(sk);
> >> +	if (!has_current_bpf_ctx())
> >> +		bh_unlock_sock(sk);
> >> +
> >>  	local_bh_enable();
> >>  	tcp_write_queue_purge(sk);
> >> -	release_sock(sk);
> >> +	if (!has_current_bpf_ctx())
> >> +		release_sock(sk);
> >>  	return 0;
> >>  }
> >>  EXPORT_SYMBOL_GPL(tcp_abort);
> >> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> >> index 545e56329355..02d357713838 100644
> >> --- a/net/ipv4/udp.c
> >> +++ b/net/ipv4/udp.c
> >> @@ -2925,7 +2925,9 @@ EXPORT_SYMBOL(udp_poll);
> >
> >>  int udp_abort(struct sock *sk, int err)
> >>  {
> >> -	lock_sock(sk);
> >> +	/* BPF context ensures sock locking. */
> >> +	if (!has_current_bpf_ctx())
> >> +		lock_sock(sk);
> >
> >>  	/* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing
> >>  	 * with close()
> >> @@ -2938,7 +2940,8 @@ int udp_abort(struct sock *sk, int err)
> >>  	__udp_disconnect(sk, 0);
> >
> >>  out:
> >> -	release_sock(sk);
> >> +	if (!has_current_bpf_ctx())
> >> +		release_sock(sk);
> >
> >>  	return 0;
> >>  }
> >> @@ -3184,15 +3187,23 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
> >>  	struct sock *first_sk = NULL;
> >>  	struct sock *sk;
> >>  	unsigned int bucket_sks = 0;
> >> -	bool first;
> >>  	bool resized = false;
> >> +	int offset = 0;
> >> +	int new_offset;
> >
> >>  	/* The current batch is done, so advance the bucket. */
> >> -	if (iter->st_bucket_done)
> >> +	if (iter->st_bucket_done) {
> >>  		state->bucket++;
> >> +		state->offset = 0;
> >> +	}
> >
> >>  	udptable = udp_get_table_afinfo(afinfo, net);
> >
> >> +	if (state->bucket > udptable->mask) {
> >> +		state->bucket = 0;
> >> +		state->offset = 0;
> >> +	}
> >> +
> >>  again:
> >>  	/* New batch for the next bucket.
> >>  	 * Iterate over the hash table to find a bucket with sockets matching
> >> @@ -3204,43 +3215,60 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
> >>  	iter->cur_sk = 0;
> >>  	iter->end_sk = 0;
> >>  	iter->st_bucket_done = false;
> >> -	first = true;
> >> +	first_sk = NULL;
> >> +	bucket_sks = 0;
> >> +	offset = state->offset;
> >> +	new_offset = offset;
> >
> >>  	for (; state->bucket <= udptable->mask; state->bucket++) {
> >>  		struct udp_hslot *hslot = &udptable->hash[state->bucket];
> >
> >> -		if (hlist_empty(&hslot->head))
> >> +		if (hlist_empty(&hslot->head)) {
> >> +			offset = 0;
> >>  			continue;
> >> +		}
> >
> >>  		spin_lock_bh(&hslot->lock);
> >> +		/* Resume from the last saved position in a bucket before
> >> +		 * iterator was stopped.
> >> +		 */
> >> +		while (offset-- > 0) {
> >> +			sk_for_each(sk, &hslot->head)
> >> +				continue;
> >> +		}
> >>  		sk_for_each(sk, &hslot->head) {
> >>  			if (seq_sk_match(seq, sk)) {
> >> -				if (first) {
> >> +				if (!first_sk)
> >>  					first_sk = sk;
> >> -					first = false;
> >> -				}
> >>  				if (iter->end_sk < iter->max_sk) {
> >>  					sock_hold(sk);
> >>  					iter->batch[iter->end_sk++] = sk;
> >>  				}
> >>  				bucket_sks++;
> >>  			}
> >> +			new_offset++;
> >>  		}
> >>  		spin_unlock_bh(&hslot->lock);
> >> +
> >>  		if (first_sk)
> >>  			break;
> >> +
> >> +		/* Reset the current bucket's offset before moving to the next bucket. */
> >> +		offset = 0;
> >> +		new_offset = 0;
> >> +
> >>  	}
> >
> >>  	/* All done: no batch made. */
> >>  	if (!first_sk)
> >> -		return NULL;
> >> +		goto ret;
> >
> >>  	if (iter->end_sk == bucket_sks) {
> >>  		/* Batching is done for the current bucket; return the first
> >>  		 * socket to be iterated from the batch.
> >>  		 */
> >>  		iter->st_bucket_done = true;
> >> -		return first_sk;
> >> +		goto ret;
> >>  	}
> >>  	if (!resized && !bpf_iter_udp_realloc_batch(iter, bucket_sks * 3 / 2)) {
> >>  		resized = true;
> >> @@ -3248,19 +3276,24 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
> >>  		state->bucket--;
> >>  		goto again;
> >>  	}
> >> +ret:
> >> +	state->offset = new_offset;
> >>  	return first_sk;
> >>  }
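(To make the saved-offset logic above concrete: the iterator is driven
by userspace read()s, and a small read buffer can make the seq_file
stop mid-bucket and resume later. A libbpf sketch, assuming a UDP
counterpart of the earlier hypothetical program, destroy_udp_by_port,
in a generated skeleton "skel":

	struct bpf_link *link;
	char buf[64];
	int iter_fd;

	link = bpf_program__attach_iter(skel->progs.destroy_udp_by_port, NULL);
	iter_fd = bpf_iter_create(bpf_link__fd(link));

	/* Each short read() may stop the iterator partway through a
	 * bucket; state->offset is what lets the next read() resume
	 * from the same position. */
	while (read(iter_fd, buf, sizeof(buf)) > 0)
		;

	close(iter_fd);
	bpf_link__destroy(link);

Error handling elided for brevity.)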
> >
> >>  static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> >>  {
> >>  	struct bpf_udp_iter_state *iter = seq->private;
> >> +	struct udp_iter_state *state = &iter->state;
> >>  	struct sock *sk;
> >
> >>  	/* Whenever seq_next() is called, the iter->cur_sk is
> >>  	 * done with seq_show(), so unref the iter->cur_sk.
> >>  	 */
> >> -	if (iter->cur_sk < iter->end_sk)
> >> +	if (iter->cur_sk < iter->end_sk) {
> >>  		sock_put(iter->batch[iter->cur_sk++]);
> >> +		++state->offset;
> >> +	}
> >
> >>  	/* After updating iter->cur_sk, check if there are more sockets
> >>  	 * available in the current bucket batch.
> >> @@ -3630,6 +3663,9 @@ static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
> >>  	}
> >>  	iter->cur_sk = 0;
> >>  	iter->end_sk = 0;
> >> +	iter->st_bucket_done = false;
> >> +	st->bucket = 0;
> >> +	st->offset = 0;
> >
> >>  	return ret;
> >>  }
> >> --
> >> 2.34.1
>
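(Last aside: once this lands, the iterator is also easy to poke at
from a shell with bpftool; the object name and pin path below are
made up:

	# pin the iterator program, then read it to run one pass
	bpftool iter pin ./sock_destroy_prog.o /sys/fs/bpf/sock_destroy
	cat /sys/fs/bpf/sock_destroy

Reading the pinned file is what drives the batching and destroy logic
discussed above.)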