> On Dec 19, 2022, at 10:22 AM, sdf@xxxxxxxxxx wrote:
> 
> On 12/17, Aditi Ghag wrote:
>> The socket destroy helper is used to
>> forcefully terminate sockets from certain
>> BPF contexts. We plan to use the capability
>> in Cilium to force client sockets to reconnect
>> when their remote load-balancing backends are
>> deleted. The other use case is on-the-fly
>> policy enforcement where existing socket
>> connections prevented by policies need to
>> be terminated.
> 
>> The helper is currently exposed to iterator
>> type BPF programs where users can filter,
>> and terminate a set of sockets.
> 
>> Sockets are destroyed asynchronously using
>> the work queue infrastructure. This allows
>> for current the locking semantics within
> 
> s/current the/the current/ ?
> 
>> socket destroy handlers, as BPF iterators
>> invoking the helper acquire *sock* locks.
>> This also allows the helper to be invoked
>> from non-sleepable contexts.
>> The other approach to skip acquiring locks
>> by passing an argument to the `diag_destroy`
>> handler didn't work out well for UDP, as
>> the UDP abort function internally invokes
>> another function that ends up acquiring
>> *sock* lock.
>> While there are sleepable BPF iterators,
>> these are limited to only certain map types.
>> Furthermore, it's limiting in the sense that
>> it wouldn't allow us to extend the helper
>> to other non-sleepable BPF programs.
> 
>> The work queue infrastructure processes work
>> items from per-cpu structures. As the sock
>> destroy work items are executed asynchronously,
>> we need to ref count sockets before they are
>> added to the work queue. The 'work_pending'
>> check prevents duplicate ref counting of sockets
>> in case users invoke the destroy helper for a
>> socket multiple times. The `{READ,WRITE}_ONCE`
>> macros ensure that the socket pointer stored
>> in a work queue item isn't clobbered while
>> the item is being processed. As BPF programs
>> are non-preemptible, we can expect that once
>> a socket is ref counted, no other socket can
>> sneak in before the ref counted socket is
>> added to the work queue for asynchronous destroy.
>> Finally, users are expected to retry when the
>> helper fails to queue a work item for a socket
>> to be destroyed in case there is another destroy
>> operation is in progress.
> 
> nit: maybe reformat to fit into 80 characters per line? A bit hard to
> read with this narrow formatting..
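To make the intended iterator usage concrete, here is a rough, untested sketch
of the kind of TCP iterator program we have in mind on the Cilium side. The
section name, the destination-port filter, and the loader-set constant are
illustrative only and are not part of this series; it also assumes the
bpf_sock_destroy() declaration generated from the UAPI comment in this patch:

    /* Illustrative only: abort established TCP sockets connected to a
     * hypothetical stale backend port. Assumes a vmlinux.h that provides
     * struct bpf_iter__tcp.
     */
    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_endian.h>

    char _license[] SEC("license") = "GPL";

    /* Hypothetical knob set by the loader before attach. */
    const volatile __u16 stale_backend_port;

    SEC("iter/tcp")
    int destroy_stale_tcp(struct bpf_iter__tcp *ctx)
    {
    	struct sock_common *sk_common = ctx->sk_common;

    	if (!sk_common)
    		return 0;

    	/* skc_dport is stored in network byte order. */
    	if (sk_common->skc_dport != bpf_htons(stale_backend_port))
    		return 0;

    	/* In this v1, this queues an asynchronous ECONNABORTED teardown. */
    	bpf_sock_destroy((struct sock *)sk_common);
    	return 0;
    }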
> 
> 
>> Signed-off-by: Aditi Ghag <aditi.ghag@xxxxxxxxxxxxx>
>> ---
>> include/linux/bpf.h | 1 +
>> include/uapi/linux/bpf.h | 17 +++++++++
>> kernel/bpf/core.c | 1 +
>> kernel/trace/bpf_trace.c | 2 +
>> net/core/filter.c | 70 ++++++++++++++++++++++++++++++++++
>> tools/include/uapi/linux/bpf.h | 17 +++++++++
>> 6 files changed, 108 insertions(+)
> 
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 3de24cfb7a3d..60eaa05dfab3 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -2676,6 +2676,7 @@ extern const struct bpf_func_proto bpf_get_retval_proto;
>> extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto;
>> extern const struct bpf_func_proto bpf_cgrp_storage_get_proto;
>> extern const struct bpf_func_proto bpf_cgrp_storage_delete_proto;
>> +extern const struct bpf_func_proto bpf_sock_destroy_proto;
> 
>> const struct bpf_func_proto *tracing_prog_func_proto(
>> enum bpf_func_id func_id, const struct bpf_prog *prog);
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 464ca3f01fe7..789ac7c59fdf 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -5484,6 +5484,22 @@ union bpf_attr {
>> * 0 on success.
>> *
>> * **-ENOENT** if the bpf_local_storage cannot be found.
>> + *
>> + * int bpf_sock_destroy(struct sock *sk)
>> + * Description
>> + * Destroy the given socket with **ECONNABORTED** error code.
>> + *
>> + * *sk* must be a non-**NULL** pointer to a socket.
>> + *
>> + * Return
>> + * The socket is destroyed asynchronosuly, so 0 return value may
>> + * not suggest indicate that the socket was successfully destroyed.
> 
> s/suggest indicate/ with either suggest or indicate?
> 
>> + *
>> + * On error, may return **EPROTONOSUPPORT**, **EBUSY**, **EINVAL**.
>> + *
>> + * **-EPROTONOSUPPORT** if protocol specific destroy handler is not implemented.
>> + *
>> + * **-EBUSY** if another socket destroy operation is in progress.
>> */
>> #define ___BPF_FUNC_MAPPER(FN, ctx...) \
>> FN(unspec, 0, ##ctx) \
>> @@ -5698,6 +5714,7 @@ union bpf_attr {
>> FN(user_ringbuf_drain, 209, ##ctx) \
>> FN(cgrp_storage_get, 210, ##ctx) \
>> FN(cgrp_storage_delete, 211, ##ctx) \
>> + FN(sock_destroy, 212, ##ctx) \
>> /* */
> 
>> /* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
>> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
>> index 7f98dec6e90f..c59bef9805e5 100644
>> --- a/kernel/bpf/core.c
>> +++ b/kernel/bpf/core.c
>> @@ -2651,6 +2651,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto __weak;
>> const struct bpf_func_proto bpf_seq_printf_btf_proto __weak;
>> const struct bpf_func_proto bpf_set_retval_proto __weak;
>> const struct bpf_func_proto bpf_get_retval_proto __weak;
>> +const struct bpf_func_proto bpf_sock_destroy_proto __weak;
> 
>> const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
>> {
>> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
>> index 3bbd3f0c810c..016dbee6b5e4 100644
>> --- a/kernel/trace/bpf_trace.c
>> +++ b/kernel/trace/bpf_trace.c
>> @@ -1930,6 +1930,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>> return &bpf_get_socket_ptr_cookie_proto;
>> case BPF_FUNC_xdp_get_buff_len:
>> return &bpf_xdp_get_buff_len_trace_proto;
>> + case BPF_FUNC_sock_destroy:
>> + return &bpf_sock_destroy_proto;
>> #endif
>> case BPF_FUNC_seq_printf:
>> return prog->expected_attach_type == BPF_TRACE_ITER ?
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index 929358677183..9753606ecc26 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -11569,6 +11569,8 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id)
>> break;
>> case BPF_FUNC_ktime_get_coarse_ns:
>> return &bpf_ktime_get_coarse_ns_proto;
>> + case BPF_FUNC_sock_destroy:
>> + return &bpf_sock_destroy_proto;
>> default:
>> return bpf_base_func_proto(func_id);
>> }
>> @@ -11578,3 +11580,71 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id)
> 
>> return func;
>> }
>> +
>> +struct sock_destroy_work {
>> + struct sock *sk;
>> + struct work_struct destroy;
>> +};
>> +
>> +static DEFINE_PER_CPU(struct sock_destroy_work, sock_destroy_workqueue);
>> +
>> +static void bpf_sock_destroy_fn(struct work_struct *work)
>> +{
>> + struct sock_destroy_work *sd_work = container_of(work,
>> + struct sock_destroy_work, destroy);
>> + struct sock *sk = READ_ONCE(sd_work->sk);
>> +
>> + sk->sk_prot->diag_destroy(sk, ECONNABORTED);
>> + sock_put(sk);
>> +}
>> +
>> +static int __init bpf_sock_destroy_workqueue_init(void)
>> +{
>> + int cpu;
>> + struct sock_destroy_work *work;
>> +
>> + for_each_possible_cpu(cpu) {
>> + work = per_cpu_ptr(&sock_destroy_workqueue, cpu);
>> + INIT_WORK(&work->destroy, bpf_sock_destroy_fn);
>> + }
>> +
>> + return 0;
>> +}
>> +subsys_initcall(bpf_sock_destroy_workqueue_init);
>> +
>> +BPF_CALL_1(bpf_sock_destroy, struct sock *, sk)
>> +{
>> + struct sock_destroy_work *sd_work;
>> +
>> + if (!sk->sk_prot->diag_destroy)
>> + return -EOPNOTSUPP;
>> +
>> + sd_work = this_cpu_ptr(&sock_destroy_workqueue);
> 
> [..]
> 
>> + /* This check prevents duplicate ref counting
>> + * of sockets, in case the handler is invoked
>> + * multiple times for the same socket.
>> + */
> 
> This means this helper can also be called for a single socket during
> invocation; is it an ok compromise?
> 
> I'm also assuming it's still possible that this helper gets called for
> the same socket on different cpus?

Thanks for the review. The v2 patch series refactored this code path
significantly, and it executes the destroy handlers synchronously.

> 
>> + if (work_pending(&sd_work->destroy))
>> + return -EBUSY;
>> +
>> + /* Ref counting ensures that the socket
>> + * isn't deleted from underneath us before
>> + * the work queue item is processed.
>> + */
>> + if (!refcount_inc_not_zero(&sk->sk_refcnt))
>> + return -EINVAL;
>> +
>> + WRITE_ONCE(sd_work->sk, sk);
>> + if (!queue_work(system_wq, &sd_work->destroy)) {
>> + sock_put(sk);
>> + return -EBUSY;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +const struct bpf_func_proto bpf_sock_destroy_proto = {
>> + .func = bpf_sock_destroy,
>> + .ret_type = RET_INTEGER,
>> + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
>> +};
>> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
>> index 464ca3f01fe7..07154a4d92f9 100644
>> --- a/tools/include/uapi/linux/bpf.h
>> +++ b/tools/include/uapi/linux/bpf.h
>> @@ -5484,6 +5484,22 @@ union bpf_attr {
>> * 0 on success.
>> *
>> * **-ENOENT** if the bpf_local_storage cannot be found.
>> + *
>> + * int bpf_sock_destroy(void *sk)
>> + * Description
>> + * Destroy the given socket with **ECONNABORTED** error code.
>> + *
>> + * *sk* must be a non-**NULL** pointer to a socket.
>> + *
>> + * Return
>> + * The socket is destroyed asynchronosuly, so 0 return value may
>> + * not indicate that the socket was successfully destroyed.
>> + *
>> + * On error, may return **EPROTONOSUPPORT**, **EBUSY**, **EINVAL**.
>> + *
>> + * **-EPROTONOSUPPORT** if protocol specific destroy handler is not implemented.
>> + *
>> + * **-EBUSY** if another socket destroy operation is in progress.
>> */
>> #define ___BPF_FUNC_MAPPER(FN, ctx...) \
>> FN(unspec, 0, ##ctx) \
>> @@ -5698,6 +5714,7 @@ union bpf_attr {
>> FN(user_ringbuf_drain, 209, ##ctx) \
>> FN(cgrp_storage_get, 210, ##ctx) \
>> FN(cgrp_storage_delete, 211, ##ctx) \
>> + FN(sock_destroy, 212, ##ctx) \
>> /* */
> 
>> /* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
>> --
>> 2.34.1
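For completeness, the iterator sketch earlier in this mail would be driven from
userspace roughly as below. This is an illustrative libbpf loader fragment, not
part of this series, with error handling trimmed; reading the iterator fd is
what actually walks the TCP sockets and invokes the helper per entry:

    /* Attach the "iter/tcp" program and drain its fd once so the BPF
     * program runs for every TCP socket.
     */
    #include <unistd.h>
    #include <bpf/libbpf.h>

    static int run_destroy_iter(struct bpf_program *prog)
    {
    	struct bpf_link *link;
    	char buf[64];
    	int iter_fd;

    	link = bpf_program__attach_iter(prog, NULL);
    	if (!link)
    		return -1;

    	iter_fd = bpf_iter_create(bpf_link__fd(link));
    	if (iter_fd < 0) {
    		bpf_link__destroy(link);
    		return -1;
    	}

    	/* Draining the iterator triggers the BPF program per socket. */
    	while (read(iter_fd, buf, sizeof(buf)) > 0)
    		;

    	close(iter_fd);
    	bpf_link__destroy(link);
    	return 0;
    }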