Re: [PATCH 1/2] bpf: Add socket destroy capability

On 12/17, Aditi Ghag wrote:
The socket destroy helper is used to
forcefully terminate sockets from certain
BPF contexts. We plan to use the capability
in Cilium to force client sockets to reconnect
when their remote load-balancing backends are
deleted. The other use case is
on-the-fly policy enforcement, where
existing socket connections disallowed
by policies need to be terminated.

The helper is currently exposed to iterator
type BPF programs, where users can
filter and terminate a set of sockets.

Sockets are destroyed asynchronously using
the work queue infrastructure. This allows
for current the locking semantics within

s/current the/the current/ ?

socket destroy handlers, as BPF iterators
invoking the helper acquire *sock* locks.
This also allows the helper to be invoked
from non-sleepable contexts.
An alternative approach of skipping
lock acquisition by passing an argument
to the `diag_destroy` handler didn't
work out well for UDP, as the UDP abort
function internally invokes another
function that acquires the *sock* lock.
While sleepable BPF iterators exist,
they are limited to only certain map
types. Relying on them would also
prevent extending the helper to other
non-sleepable BPF programs.

The work queue infrastructure processes work
items from per-cpu structures. As the sock
destroy work items are executed asynchronously,
we need to ref count sockets before they are
added to the work queue. The 'work_pending'
check prevents duplicate ref counting of sockets
in case users invoke the destroy helper for a
socket multiple times. The `{READ,WRITE}_ONCE`
macros ensure that the socket pointer stored
in a work queue item isn't clobbered while
the item is being processed. As BPF programs
are non-preemptible, we can expect that once
a socket is ref counted, no other socket can
sneak in before the ref counted socket is
added to the work queue for asynchronous destroy.
Finally, users are expected to retry
when the helper fails to queue a work
item for a socket to be destroyed, in
case another destroy operation is in
progress.
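
From user space, that retry contract could look roughly like the
sketch below. The skeleton header, the destroy_busy counter, and the
convention that the iterator program increments it whenever
bpf_sock_destroy() returns -EBUSY are all assumptions for
illustration; bpf_iter_create() and bpf_link__fd() are existing
libbpf APIs:

/* Illustrative sketch only: re-run iterator passes until a pass
 * completes without the BPF program observing -EBUSY. Assumes the
 * program bumps a global __u64 counter, destroy_busy, on each -EBUSY
 * return from bpf_sock_destroy().
 */
#include <unistd.h>
#include <bpf/libbpf.h>
#include "destroy_socks.skel.h" /* hypothetical skeleton */

static int destroy_until_done(struct destroy_socks *skel,
			      struct bpf_link *link)
{
	char buf[128];
	__u64 busy_before;
	int iter_fd;

	do {
		busy_before = skel->bss->destroy_busy;

		iter_fd = bpf_iter_create(bpf_link__fd(link));
		if (iter_fd < 0)
			return iter_fd;
		/* Reading the iterator fd drives the iteration; each
		 * visited socket may queue an asynchronous destroy.
		 */
		while (read(iter_fd, buf, sizeof(buf)) > 0)
			;
		close(iter_fd);
	} while (skel->bss->destroy_busy != busy_before);

	return 0;
}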

nit: maybe reformat to fit into 80 characters per line? A bit hard to
read with this narrow formatting..


Signed-off-by: Aditi Ghag <aditi.ghag@xxxxxxxxxxxxx>
---
  include/linux/bpf.h            |  1 +
  include/uapi/linux/bpf.h       | 17 +++++++++
  kernel/bpf/core.c              |  1 +
  kernel/trace/bpf_trace.c       |  2 +
  net/core/filter.c              | 70 ++++++++++++++++++++++++++++++++++
  tools/include/uapi/linux/bpf.h | 17 +++++++++
  6 files changed, 108 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3de24cfb7a3d..60eaa05dfab3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2676,6 +2676,7 @@ extern const struct bpf_func_proto bpf_get_retval_proto;
  extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto;
  extern const struct bpf_func_proto bpf_cgrp_storage_get_proto;
  extern const struct bpf_func_proto bpf_cgrp_storage_delete_proto;
+extern const struct bpf_func_proto bpf_sock_destroy_proto;

  const struct bpf_func_proto *tracing_prog_func_proto(
    enum bpf_func_id func_id, const struct bpf_prog *prog);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 464ca3f01fe7..789ac7c59fdf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5484,6 +5484,22 @@ union bpf_attr {
   *		0 on success.
   *
   *		**-ENOENT** if the bpf_local_storage cannot be found.
+ *
+ * int bpf_sock_destroy(void *sk)
+ *	Description
+ *		Destroy the given socket with **ECONNABORTED** error code.
+ *
+ *		*sk* must be a non-**NULL** pointer to a socket.
+ *
+ *	Return
+ *		The socket is destroyed asynchronously, so a 0 return value may
+ *		not suggest indicate that the socket was successfully destroyed.

s/suggest indicate/ with either suggest or indicate?

+ *
+ *		On error, may return **EOPNOTSUPP**, **EBUSY**, or **EINVAL**.
+ *
+ *		**-EOPNOTSUPP** if the protocol-specific destroy handler is not implemented.
+ *
+ *		**-EBUSY** if another socket destroy operation is in progress.
   */
  #define ___BPF_FUNC_MAPPER(FN, ctx...)			\
  	FN(unspec, 0, ##ctx)				\
@@ -5698,6 +5714,7 @@ union bpf_attr {
  	FN(user_ringbuf_drain, 209, ##ctx)		\
  	FN(cgrp_storage_get, 210, ##ctx)		\
  	FN(cgrp_storage_delete, 211, ##ctx)		\
+	FN(sock_destroy, 212, ##ctx)			\
  	/* */

/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7f98dec6e90f..c59bef9805e5 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2651,6 +2651,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto __weak;
  const struct bpf_func_proto bpf_seq_printf_btf_proto __weak;
  const struct bpf_func_proto bpf_set_retval_proto __weak;
  const struct bpf_func_proto bpf_get_retval_proto __weak;
+const struct bpf_func_proto bpf_sock_destroy_proto __weak;

  const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
  {
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3bbd3f0c810c..016dbee6b5e4 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1930,6 +1930,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
  		return &bpf_get_socket_ptr_cookie_proto;
  	case BPF_FUNC_xdp_get_buff_len:
  		return &bpf_xdp_get_buff_len_trace_proto;
+	case BPF_FUNC_sock_destroy:
+		return &bpf_sock_destroy_proto;
  #endif
  	case BPF_FUNC_seq_printf:
  		return prog->expected_attach_type == BPF_TRACE_ITER ?
diff --git a/net/core/filter.c b/net/core/filter.c
index 929358677183..9753606ecc26 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -11569,6 +11569,8 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id)
  		break;
  	case BPF_FUNC_ktime_get_coarse_ns:
  		return &bpf_ktime_get_coarse_ns_proto;
+	case BPF_FUNC_sock_destroy:
+		return &bpf_sock_destroy_proto;
  	default:
  		return bpf_base_func_proto(func_id);
  	}
@@ -11578,3 +11580,71 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id)

  	return func;
  }
+
+struct sock_destroy_work {
+	struct sock *sk;
+	struct work_struct destroy;
+};
+
+static DEFINE_PER_CPU(struct sock_destroy_work, sock_destroy_workqueue);
+
+static void bpf_sock_destroy_fn(struct work_struct *work)
+{
+	struct sock_destroy_work *sd_work = container_of(work,
+			struct sock_destroy_work, destroy);
+	struct sock *sk = READ_ONCE(sd_work->sk);
+
+	sk->sk_prot->diag_destroy(sk, ECONNABORTED);
+	sock_put(sk);
+}
+
+static int __init bpf_sock_destroy_workqueue_init(void)
+{
+	int cpu;
+	struct sock_destroy_work *work;
+
+	for_each_possible_cpu(cpu) {
+		work = per_cpu_ptr(&sock_destroy_workqueue, cpu);
+		INIT_WORK(&work->destroy, bpf_sock_destroy_fn);
+	}
+
+	return 0;
+}
+subsys_initcall(bpf_sock_destroy_workqueue_init);
+
+BPF_CALL_1(bpf_sock_destroy, struct sock *, sk)
+{
+	struct sock_destroy_work *sd_work;
+
+	if (!sk->sk_prot->diag_destroy)
+		return -EOPNOTSUPP;
+
+	sd_work = this_cpu_ptr(&sock_destroy_workqueue);

[..]

+	/* This check prevents duplicate ref counting
+	 * of sockets, in case the handler is invoked
+	 * multiple times for the same socket.
+	 */

This means the helper can effectively destroy only a single socket per
invocation, until the queued work item runs; is that an ok compromise?

I'm also assuming it's still possible that this helper gets called for
the same socket on different CPUs?

+	if (work_pending(&sd_work->destroy))
+		return -EBUSY;
+
+	/* Ref counting ensures that the socket
+	 * isn't deleted from underneath us before
+	 * the work queue item is processed.
+	 */
+	if (!refcount_inc_not_zero(&sk->sk_refcnt))
+		return -EINVAL;
+
+	WRITE_ONCE(sd_work->sk, sk);
+	if (!queue_work(system_wq, &sd_work->destroy)) {
+		sock_put(sk);
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+const struct bpf_func_proto bpf_sock_destroy_proto = {
+	.func		= bpf_sock_destroy,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+};
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 464ca3f01fe7..07154a4d92f9 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5484,6 +5484,22 @@ union bpf_attr {
   *		0 on success.
   *
   *		**-ENOENT** if the bpf_local_storage cannot be found.
+ *
+ * int bpf_sock_destroy(void *sk)
+ *	Description
+ *		Destroy the given socket with **ECONNABORTED** error code.
+ *
+ *		*sk* must be a non-**NULL** pointer to a socket.
+ *
+ *	Return
+ *		The socket is destroyed asynchronously, so a 0 return value may
+ *		not indicate that the socket was successfully destroyed.
+ *
+ *		On error, may return **EOPNOTSUPP**, **EBUSY**, or **EINVAL**.
+ *
+ *		**-EOPNOTSUPP** if the protocol-specific destroy handler is not implemented.
+ *
+ *		**-EBUSY** if another socket destroy operation is in progress.
   */
  #define ___BPF_FUNC_MAPPER(FN, ctx...)			\
  	FN(unspec, 0, ##ctx)				\
@@ -5698,6 +5714,7 @@ union bpf_attr {
  	FN(user_ringbuf_drain, 209, ##ctx)		\
  	FN(cgrp_storage_get, 210, ##ctx)		\
  	FN(cgrp_storage_delete, 211, ##ctx)		\
+	FN(sock_destroy, 212, ##ctx)			\
  	/* */

/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
--
2.34.1



