On Thu, Jan 07, 2021 at 10:43:05AM -0800, Stanislav Fomichev wrote:
> Add custom implementation of getsockopt hook for TCP_ZEROCOPY_RECEIVE.
> We skip generic hooks for TCP_ZEROCOPY_RECEIVE and have a custom
> call in do_tcp_getsockopt using the on-stack data. This removes
> 2% overhead for locking/unlocking the socket.
>
> Also:
> - Removed BUILD_BUG_ON (zerocopy doesn't depend on the buf size anymore)
> - Separated on-stack buffer into bpf_sockopt_buf and downsized to 32 bytes
>   (let's keep it to help with the other options)
>
> (I can probably split this patch into two: add new features and rework
> bpf_sockopt_buf; can follow up if the approach in general sounds
> good).
>
> Without this patch:
>     1.87%     0.06%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt
>
> With the patch applied:
>     0.52%     0.12%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt_kern
>
> Signed-off-by: Stanislav Fomichev <sdf@xxxxxxxxxx>
> Cc: Martin KaFai Lau <kafai@xxxxxx>
> Cc: Song Liu <songliubraving@xxxxxx>
> Cc: Eric Dumazet <edumazet@xxxxxxxxxx>
> ---
>  include/linux/bpf-cgroup.h                    | 25 ++++-
>  include/linux/filter.h                        |  6 +-
>  include/net/sock.h                            |  2 +
>  include/net/tcp.h                             |  1 +
>  kernel/bpf/cgroup.c                           | 93 +++++++++++++------
>  net/ipv4/tcp.c                                | 14 +++
>  net/ipv4/tcp_ipv4.c                           |  1 +
>  net/ipv6/tcp_ipv6.c                           |  1 +
>  .../selftests/bpf/prog_tests/sockopt_sk.c     | 22 +++++
>  .../testing/selftests/bpf/progs/sockopt_sk.c  | 15 +++
>  10 files changed, 147 insertions(+), 33 deletions(-)
>
[ ... ]

> @@ -454,6 +469,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
>  #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; })
>  #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \
>  				       optlen, max_optlen, retval) ({ retval; })
> +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \
> +					    optlen, retval) ({ retval; })
>  #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
>  				       kernel_optval) ({ 0; })
>
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 54a4225f36d8..8739f1d4cac4 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -1281,7 +1281,10 @@ struct bpf_sysctl_kern {
>  	u64 tmp_reg;
>  };
>
> -#define BPF_SOCKOPT_KERN_BUF_SIZE	64
> +#define BPF_SOCKOPT_KERN_BUF_SIZE	32
It is reduced from patch 1 because there is no longer a need to use the
buf (and copy from/to the buf) for TCP_ZEROCOPY_RECEIVE?

Patch 1 is still desired (and kept in this set) because it may still
benefit other optnames?

> +struct bpf_sockopt_buf {
> +	u8 data[BPF_SOCKOPT_KERN_BUF_SIZE];
> +};
>
>  struct bpf_sockopt_kern {
>  	struct sock	*sk;
> @@ -1291,7 +1294,6 @@ struct bpf_sockopt_kern {
>  	s32		optname;
>  	s32		optlen;
>  	s32		retval;
> -	u8		buf[BPF_SOCKOPT_KERN_BUF_SIZE];
It is better to pick one way to do things to avoid code churn like this
within the same series.
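
btw, since the net/ipv4/tcp.c and tcp_ipv4.c/tcp_ipv6.c hunks are not
quoted in this reply: from the commit message and the diffstat, the
bypass side presumably reduces to a one-line level/optname check plus a
proto registration, roughly like this sketch (my reconstruction, not
the quoted patch):

	bool tcp_bpf_bypass_getsockopt(int level, int optname)
	{
		/* do_tcp_getsockopt() runs the getsockopt hook itself
		 * for TCP_ZEROCOPY_RECEIVE, so the generic path (which
		 * has to lock the socket and may kzalloc) can be
		 * skipped for this optname.
		 */
		if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE)
			return true;

		return false;
	}

with tcp_prot/tcpv6_prot presumably gaining a matching line:

	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
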
>  };
>
>  int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len);
> diff --git a/include/net/sock.h b/include/net/sock.h
> index bdc4323ce53c..ebf44d724845 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -1174,6 +1174,8 @@ struct proto {
>
>  	int			(*backlog_rcv) (struct sock *sk,
>  						struct sk_buff *skb);
> +	bool			(*bpf_bypass_getsockopt)(int level,
> +						 int optname);
>
>  	void		(*release_cb)(struct sock *sk);
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 78d13c88720f..4bb42fb19711 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -403,6 +403,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock,
>  		  struct poll_table_struct *wait);
>  int tcp_getsockopt(struct sock *sk, int level, int optname,
>  		   char __user *optval, int __user *optlen);
> +bool tcp_bpf_bypass_getsockopt(int level, int optname);
>  int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
>  		   unsigned int optlen);
>  void tcp_set_keepalive(struct sock *sk, int val);
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index adbecdcaa370..e82df63aedc7 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -16,7 +16,6 @@
>  #include <linux/bpf-cgroup.h>
>  #include <net/sock.h>
>  #include <net/bpf_sk_storage.h>
> -#include <uapi/linux/tcp.h> /* sizeof(struct tcp_zerocopy_receive) */
Can the patches be re-ordered a little to avoid code churn like this in
the same series?

It feels like this patch 3 should be the first patch instead.  The
current patch 1 should then become the second patch; it can still use
the tcp_mmap benchmark to show the potential benefit for other optnames.

>
>  #include "../cgroup/cgroup-internal.h"
>
> @@ -1299,7 +1298,8 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
>  	return empty;
>  }
>
> -static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
> +static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
> +			     struct bpf_sockopt_buf *buf)
>  {
>  	if (unlikely(max_optlen < 0))
>  		return -EINVAL;
> @@ -1311,18 +1311,11 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
>  		max_optlen = PAGE_SIZE;
>  	}
>
> -	if (max_optlen <= sizeof(ctx->buf)) {
> +	if (max_optlen <= sizeof(buf->data)) {
>  		/* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
>  		 * bytes avoid the cost of kzalloc.
> -		 *
> -		 * In order to remove extra allocations from the TCP
> -		 * fast zero-copy path ensure that buffer covers
> -		 * the size of struct tcp_zerocopy_receive.
>  		 */
> -		BUILD_BUG_ON(sizeof(struct tcp_zerocopy_receive) >
> -			     BPF_SOCKOPT_KERN_BUF_SIZE);
> -
> -		ctx->optval = ctx->buf;
> +		ctx->optval = buf->data;
>  		ctx->optval_end = ctx->optval + max_optlen;
>  		return max_optlen;
>  	}
>
> @@ -1336,16 +1329,18 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
>  	return max_optlen;
>  }
>
> -static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
> +static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
> +			     struct bpf_sockopt_buf *buf)
>  {
> -	if (ctx->optval == ctx->buf)
> +	if (ctx->optval == buf->data)
>  		return;
>  	kfree(ctx->optval);
>  }
>
> -static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx)
> +static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
> +				  struct bpf_sockopt_buf *buf)
>  {
> -	return ctx->optval != ctx->buf;
> +	return ctx->optval != buf->data;
>  }
>
>  int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
> @@ -1353,6 +1348,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
>  				       int *optlen, char **kernel_optval)
>  {
>  	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
> +	struct bpf_sockopt_buf buf = {};
>  	struct bpf_sockopt_kern ctx = {
>  		.sk = sk,
>  		.level = *level,
> @@ -1373,7 +1369,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
>  	 */
>  	max_optlen = max_t(int, 16, *optlen);
>
> -	max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
> +	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
>  	if (max_optlen < 0)
>  		return max_optlen;
>
> @@ -1419,7 +1415,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
>  	 * No way to export on-stack buf, have to allocate a
>  	 * new buffer.
>  	 */
> -	if (!sockopt_buf_allocated(&ctx)) {
> +	if (!sockopt_buf_allocated(&ctx, &buf)) {
>  		void *p = kzalloc(ctx.optlen, GFP_USER);
>
>  		if (!p) {
> @@ -1436,7 +1432,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
>
>  out:
>  	if (ret)
> -		sockopt_free_buf(&ctx);
> +		sockopt_free_buf(&ctx, &buf);
>  	return ret;
>  }
>
> @@ -1445,15 +1441,20 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
>  				       int __user *optlen, int max_optlen,
>  				       int retval)
>  {
> -	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
> -	struct bpf_sockopt_kern ctx = {
> -		.sk = sk,
> -		.level = level,
> -		.optname = optname,
> -		.retval = retval,
> -	};
This change looks unnecessary?  The designated initializer already
zero-fills the rest of ctx, and buf could use "= {}" like the
setsockopt path above.

> +	struct bpf_sockopt_kern ctx;
> +	struct bpf_sockopt_buf buf;
> +	struct cgroup *cgrp;
>  	int ret;
>
> +	memset(&buf, 0, sizeof(buf));
> +	memset(&ctx, 0, sizeof(ctx));
> +
> +	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
> +	ctx.sk = sk;
> +	ctx.level = level;
> +	ctx.optname = optname;
> +	ctx.retval = retval;
> +
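
One more thought on the mechanics, since the net/ipv4/tcp.c hunk is
also not quoted here: the new _KERN hook is presumably invoked from the
TCP_ZEROCOPY_RECEIVE branch of do_tcp_getsockopt(), directly on the
on-stack struct tcp_zerocopy_receive and under the socket lock that is
already held there.  A sketch of what I expect that call site to look
like (matching the stub macro quoted above, not the actual hunk):

	case TCP_ZEROCOPY_RECEIVE: {
		struct tcp_zerocopy_receive zc;
		int err;

		if (get_user(len, optlen))
			return -EFAULT;
		...
		lock_sock(sk);
		err = tcp_zerocopy_receive(sk, &zc);
		/* Run the cgroup BPF programs directly on the on-stack
		 * zc: no kzalloc and no second lock/unlock round trip.
		 */
		err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
							  &zc, &len, err);
		release_sock(sk);
		...
	}

That would explain both the ~2% locking win quoted in the commit
message and why the 64-byte buf (sized to cover struct
tcp_zerocopy_receive) is no longer needed.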