The socket destroy helper is used to forcefully terminate sockets from certain BPF contexts. We plan to use the capability in Cilium to force client sockets to reconnect when their remote load-balancing backends are deleted. The other use case is on-the-fly policy enforcement where existing socket connections prevented by policies need to be terminated. The helper is currently exposed to iterator type BPF programs where users can filter, and terminate a set of sockets. Sockets are destroyed asynchronously using the work queue infrastructure. This allows for current the locking semantics within socket destroy handlers, as BPF iterators invoking the helper acquire *sock* locks. This also allows the helper to be invoked from non-sleepable contexts. The other approach to skip acquiring locks by passing an argument to the `diag_destroy` handler didn't work out well for UDP, as the UDP abort function internally invokes another function that ends up acquiring *sock* lock. While there are sleepable BPF iterators, these are limited to only certain map types. Furthermore, it's limiting in the sense that it wouldn't allow us to extend the helper to other non-sleepable BPF programs. The work queue infrastructure processes work items from per-cpu structures. As the sock destroy work items are executed asynchronously, we need to ref count sockets before they are added to the work queue. The 'work_pending' check prevents duplicate ref counting of sockets in case users invoke the destroy helper for a socket multiple times. The `{READ,WRITE}_ONCE` macros ensure that the socket pointer stored in a work queue item isn't clobbered while the item is being processed. As BPF programs are non-preemptible, we can expect that once a socket is ref counted, no other socket can sneak in before the ref counted socket is added to the work queue for asynchronous destroy. Finally, users are expected to retry when the helper fails to queue a work item for a socket to be destroyed in case there is another destroy operation is in progress. Signed-off-by: Aditi Ghag <aditi.ghag@xxxxxxxxxxxxx> --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 17 +++++++++ kernel/bpf/core.c | 1 + kernel/trace/bpf_trace.c | 2 + net/core/filter.c | 70 ++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 17 +++++++++ 6 files changed, 108 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3de24cfb7a3d..60eaa05dfab3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2676,6 +2676,7 @@ extern const struct bpf_func_proto bpf_get_retval_proto; extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto; extern const struct bpf_func_proto bpf_cgrp_storage_get_proto; extern const struct bpf_func_proto bpf_cgrp_storage_delete_proto; +extern const struct bpf_func_proto bpf_sock_destroy_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 464ca3f01fe7..789ac7c59fdf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5484,6 +5484,22 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf_local_storage cannot be found. + * + * int bpf_sock_destroy(struct sock *sk) + * Description + * Destroy the given socket with **ECONNABORTED** error code. + * + * *sk* must be a non-**NULL** pointer to a socket. + * + * Return + * The socket is destroyed asynchronosuly, so 0 return value may + * not suggest indicate that the socket was successfully destroyed. + * + * On error, may return **EPROTONOSUPPORT**, **EBUSY**, **EINVAL**. + * + * **-EPROTONOSUPPORT** if protocol specific destroy handler is not implemented. + * + * **-EBUSY** if another socket destroy operation is in progress. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5698,6 +5714,7 @@ union bpf_attr { FN(user_ringbuf_drain, 209, ##ctx) \ FN(cgrp_storage_get, 210, ##ctx) \ FN(cgrp_storage_delete, 211, ##ctx) \ + FN(sock_destroy, 212, ##ctx) \ /* */ /* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7f98dec6e90f..c59bef9805e5 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2651,6 +2651,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto __weak; const struct bpf_func_proto bpf_seq_printf_btf_proto __weak; const struct bpf_func_proto bpf_set_retval_proto __weak; const struct bpf_func_proto bpf_get_retval_proto __weak; +const struct bpf_func_proto bpf_sock_destroy_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3bbd3f0c810c..016dbee6b5e4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1930,6 +1930,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_ptr_cookie_proto; case BPF_FUNC_xdp_get_buff_len: return &bpf_xdp_get_buff_len_trace_proto; + case BPF_FUNC_sock_destroy: + return &bpf_sock_destroy_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? diff --git a/net/core/filter.c b/net/core/filter.c index 929358677183..9753606ecc26 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -11569,6 +11569,8 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id) break; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; + case BPF_FUNC_sock_destroy: + return &bpf_sock_destroy_proto; default: return bpf_base_func_proto(func_id); } @@ -11578,3 +11580,71 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id) return func; } + +struct sock_destroy_work { + struct sock *sk; + struct work_struct destroy; +}; + +static DEFINE_PER_CPU(struct sock_destroy_work, sock_destroy_workqueue); + +static void bpf_sock_destroy_fn(struct work_struct *work) +{ + struct sock_destroy_work *sd_work = container_of(work, + struct sock_destroy_work, destroy); + struct sock *sk = READ_ONCE(sd_work->sk); + + sk->sk_prot->diag_destroy(sk, ECONNABORTED); + sock_put(sk); +} + +static int __init bpf_sock_destroy_workqueue_init(void) +{ + int cpu; + struct sock_destroy_work *work; + + for_each_possible_cpu(cpu) { + work = per_cpu_ptr(&sock_destroy_workqueue, cpu); + INIT_WORK(&work->destroy, bpf_sock_destroy_fn); + } + + return 0; +} +subsys_initcall(bpf_sock_destroy_workqueue_init); + +BPF_CALL_1(bpf_sock_destroy, struct sock *, sk) +{ + struct sock_destroy_work *sd_work; + + if (!sk->sk_prot->diag_destroy) + return -EOPNOTSUPP; + + sd_work = this_cpu_ptr(&sock_destroy_workqueue); + /* This check prevents duplicate ref counting + * of sockets, in case the handler is invoked + * multiple times for the same socket. + */ + if (work_pending(&sd_work->destroy)) + return -EBUSY; + + /* Ref counting ensures that the socket + * isn't deleted from underneath us before + * the work queue item is processed. + */ + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + return -EINVAL; + + WRITE_ONCE(sd_work->sk, sk); + if (!queue_work(system_wq, &sd_work->destroy)) { + sock_put(sk); + return -EBUSY; + } + + return 0; +} + +const struct bpf_func_proto bpf_sock_destroy_proto = { + .func = bpf_sock_destroy, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, +}; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 464ca3f01fe7..07154a4d92f9 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5484,6 +5484,22 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf_local_storage cannot be found. + * + * int bpf_sock_destroy(void *sk) + * Description + * Destroy the given socket with **ECONNABORTED** error code. + * + * *sk* must be a non-**NULL** pointer to a socket. + * + * Return + * The socket is destroyed asynchronosuly, so 0 return value may + * not indicate that the socket was successfully destroyed. + * + * On error, may return **EPROTONOSUPPORT**, **EBUSY**, **EINVAL**. + * + * **-EPROTONOSUPPORT** if protocol specific destroy handler is not implemented. + * + * **-EBUSY** if another socket destroy operation is in progress. */ #define ___BPF_FUNC_MAPPER(FN, ctx...) \ FN(unspec, 0, ##ctx) \ @@ -5698,6 +5714,7 @@ union bpf_attr { FN(user_ringbuf_drain, 209, ##ctx) \ FN(cgrp_storage_get, 210, ##ctx) \ FN(cgrp_storage_delete, 211, ##ctx) \ + FN(sock_destroy, 212, ##ctx) \ /* */ /* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't -- 2.34.1