> On Dec 20, 2022, at 2:26 AM, Alan Maguire <alan.maguire@xxxxxxxxxx> wrote: > > On 17/12/2022 01:57, Aditi Ghag wrote: >> The socket destroy helper is used to >> forcefully terminate sockets from certain >> BPF contexts. We plan to use the capability >> in Cilium to force client sockets to reconnect >> when their remote load-balancing backends are >> deleted. The other use case is on-the-fly >> policy enforcement where existing socket >> connections prevented by policies need to >> be terminated. >> >> The helper is currently exposed to iterator >> type BPF programs where users can filter, >> and terminate a set of sockets. >> >> Sockets are destroyed asynchronously using >> the work queue infrastructure. This allows >> for current the locking semantics within >> socket destroy handlers, as BPF iterators >> invoking the helper acquire *sock* locks. >> This also allows the helper to be invoked >> from non-sleepable contexts. >> The other approach to skip acquiring locks >> by passing an argument to the `diag_destroy` >> handler didn't work out well for UDP, as >> the UDP abort function internally invokes >> another function that ends up acquiring >> *sock* lock. >> While there are sleepable BPF iterators, >> these are limited to only certain map types. >> Furthermore, it's limiting in the sense that >> it wouldn't allow us to extend the helper >> to other non-sleepable BPF programs. >> >> The work queue infrastructure processes work >> items from per-cpu structures. As the sock >> destroy work items are executed asynchronously, >> we need to ref count sockets before they are >> added to the work queue. The 'work_pending' >> check prevents duplicate ref counting of sockets >> in case users invoke the destroy helper for a >> socket multiple times. The `{READ,WRITE}_ONCE` >> macros ensure that the socket pointer stored >> in a work queue item isn't clobbered while >> the item is being processed. As BPF programs >> are non-preemptible, we can expect that once >> a socket is ref counted, no other socket can >> sneak in before the ref counted socket is >> added to the work queue for asynchronous destroy. >> Finally, users are expected to retry when the >> helper fails to queue a work item for a socket >> to be destroyed in case there is another destroy >> operation is in progress. >> >> Signed-off-by: Aditi Ghag <aditi.ghag@xxxxxxxxxxxxx> >> --- >> include/linux/bpf.h | 1 + >> include/uapi/linux/bpf.h | 17 +++++++++ >> kernel/bpf/core.c | 1 + >> kernel/trace/bpf_trace.c | 2 + >> net/core/filter.c | 70 ++++++++++++++++++++++++++++++++++ >> tools/include/uapi/linux/bpf.h | 17 +++++++++ >> 6 files changed, 108 insertions(+) >> >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h >> index 3de24cfb7a3d..60eaa05dfab3 100644 >> --- a/include/linux/bpf.h >> +++ b/include/linux/bpf.h >> @@ -2676,6 +2676,7 @@ extern const struct bpf_func_proto bpf_get_retval_proto; >> extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto; >> extern const struct bpf_func_proto bpf_cgrp_storage_get_proto; >> extern const struct bpf_func_proto bpf_cgrp_storage_delete_proto; >> +extern const struct bpf_func_proto bpf_sock_destroy_proto; >> >> const struct bpf_func_proto *tracing_prog_func_proto( >> enum bpf_func_id func_id, const struct bpf_prog *prog); >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >> index 464ca3f01fe7..789ac7c59fdf 100644 >> --- a/include/uapi/linux/bpf.h >> +++ b/include/uapi/linux/bpf.h >> @@ -5484,6 +5484,22 @@ union bpf_attr { >> * 0 on success. >> * >> * **-ENOENT** if the bpf_local_storage cannot be found. >> + * >> + * int bpf_sock_destroy(struct sock *sk) >> + * Description >> + * Destroy the given socket with **ECONNABORTED** error code. >> + * >> + * *sk* must be a non-**NULL** pointer to a socket. >> + * >> + * Return >> + * The socket is destroyed asynchronosuly, so 0 return value may >> + * not suggest indicate that the socket was successfully destroyed. >> + * >> + * On error, may return **EPROTONOSUPPORT**, **EBUSY**, **EINVAL**. >> + * >> + * **-EPROTONOSUPPORT** if protocol specific destroy handler is not implemented. >> + * >> + * **-EBUSY** if another socket destroy operation is in progress. >> */ >> #define ___BPF_FUNC_MAPPER(FN, ctx...) \ >> FN(unspec, 0, ##ctx) \ >> @@ -5698,6 +5714,7 @@ union bpf_attr { >> FN(user_ringbuf_drain, 209, ##ctx) \ >> FN(cgrp_storage_get, 210, ##ctx) \ >> FN(cgrp_storage_delete, 211, ##ctx) \ >> + FN(sock_destroy, 212, ##ctx) \ >> /* */ >> >> /* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't >> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c >> index 7f98dec6e90f..c59bef9805e5 100644 >> --- a/kernel/bpf/core.c >> +++ b/kernel/bpf/core.c >> @@ -2651,6 +2651,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto __weak; >> const struct bpf_func_proto bpf_seq_printf_btf_proto __weak; >> const struct bpf_func_proto bpf_set_retval_proto __weak; >> const struct bpf_func_proto bpf_get_retval_proto __weak; >> +const struct bpf_func_proto bpf_sock_destroy_proto __weak; >> >> const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) >> { >> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c >> index 3bbd3f0c810c..016dbee6b5e4 100644 >> --- a/kernel/trace/bpf_trace.c >> +++ b/kernel/trace/bpf_trace.c >> @@ -1930,6 +1930,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) >> return &bpf_get_socket_ptr_cookie_proto; >> case BPF_FUNC_xdp_get_buff_len: >> return &bpf_xdp_get_buff_len_trace_proto; >> + case BPF_FUNC_sock_destroy: >> + return &bpf_sock_destroy_proto; >> #endif >> case BPF_FUNC_seq_printf: >> return prog->expected_attach_type == BPF_TRACE_ITER ? >> diff --git a/net/core/filter.c b/net/core/filter.c >> index 929358677183..9753606ecc26 100644 >> --- a/net/core/filter.c >> +++ b/net/core/filter.c >> @@ -11569,6 +11569,8 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id) >> break; >> case BPF_FUNC_ktime_get_coarse_ns: >> return &bpf_ktime_get_coarse_ns_proto; >> + case BPF_FUNC_sock_destroy: >> + return &bpf_sock_destroy_proto; >> default: > This is a really neat feature! One question though; above it seems to > suggest that the helper is only exposed to BPF iterators, but by The v2 patch only exposes the capability to BPF iterators. The patch series has details involving locking semantics due to which the kfunc is only available for BPF iterators at the moment. > adding it to bpf_sk_base_func_proto() won't it be available to > other BPF programs like sock_addr, sk_filter etc? If I've got that > right, we'd definitely want to exclude potentially unprivileged > programs from having access to this helper. And to be clear, I'd > definitely see value in having it accessible to other BPF program > types too like sockops if possible, but just wanted to check. > >> return bpf_base_func_proto(func_id); >> } >> @@ -11578,3 +11580,71 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id) >> >> return func; >> } >> + >> +struct sock_destroy_work { >> + struct sock *sk; >> + struct work_struct destroy; >> +}; >> + >> +static DEFINE_PER_CPU(struct sock_destroy_work, sock_destroy_workqueue); >> + >> +static void bpf_sock_destroy_fn(struct work_struct *work) >> +{ >> + struct sock_destroy_work *sd_work = container_of(work, >> + struct sock_destroy_work, destroy); >> + struct sock *sk = READ_ONCE(sd_work->sk); >> + >> + sk->sk_prot->diag_destroy(sk, ECONNABORTED); >> + sock_put(sk); >> +} >> + >> +static int __init bpf_sock_destroy_workqueue_init(void) >> +{ >> + int cpu; >> + struct sock_destroy_work *work; >> + >> + for_each_possible_cpu(cpu) { >> + work = per_cpu_ptr(&sock_destroy_workqueue, cpu); >> + INIT_WORK(&work->destroy, bpf_sock_destroy_fn); >> + } >> + >> + return 0; >> +} >> +subsys_initcall(bpf_sock_destroy_workqueue_init); >> + >> +BPF_CALL_1(bpf_sock_destroy, struct sock *, sk) >> +{ >> + struct sock_destroy_work *sd_work; >> + >> + if (!sk->sk_prot->diag_destroy) > > risk of a NULL sk here? > > Thanks! > > Alan