Re: [PATCH net-next v3 10/14] net-timestamp: add basic support with tskey offset

Willem de Bruijn <willemdebruijn.kernel@xxxxxxxxx> · Mon, 28 Oct 2024 21:24:23 -0400

Jason Xing wrote:
> From: Jason Xing <kernelxing@xxxxxxxxxxx>
> 
> Use the offset to record the delta value between current socket key
> and bpf socket key.
> 
> 1. If there is only bpf feature running, the socket key is bpf socket
> key and the offset is zero;
> 2. If there is only traditional feature running, and then bpf feature
> is turned on, the socket key is still used by the former while the offset
> is the delta between them;
> 3. if there is only bpf feature running, and then application uses it,
> the socket key would be re-init for application and the offset is the
> delta.

We need to also figure out the rare conflict when one user sets
OPT_ID | OPT_ID_TCP while the other only uses OPT_ID.

It is so obscure, that perhaps we can punt and say that the BPF
program just has to follow the application preference and be aware of
the subtle difference.

> Signed-off-by: Jason Xing <kernelxing@xxxxxxxxxxx>
> ---
>  include/net/sock.h |  1 +
>  net/core/skbuff.c  | 15 ++++++++---
>  net/core/sock.c    | 66 ++++++++++++++++++++++++++++++++++++++--------
>  3 files changed, 68 insertions(+), 14 deletions(-)
> 
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 91398b20a4a3..41c6c6f78e55 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -469,6 +469,7 @@ struct sock {
>  	unsigned long		sk_pacing_rate; /* bytes per second */
>  	atomic_t		sk_zckey;
>  	atomic_t		sk_tskey;
> +	u32			sk_tskey_bpf_offset;
>  	__cacheline_group_end(sock_write_tx);
>  
>  	__cacheline_group_begin(sock_read_tx);
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 0b571306f7ea..d1739317b97d 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -5641,9 +5641,10 @@ void timestamp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
>  }
>  
>  static void skb_tstamp_tx_output_bpf(struct sock *sk, int tstype,
> +				     struct sk_buff *skb,
>  				     struct skb_shared_hwtstamps *hwtstamps)
>  {
> -	u32 args[2] = {0, 0};
> +	u32 args[3] = {0, 0, 0};
>  	u32 tsflags, cb_flag;
>  
>  	tsflags = READ_ONCE(sk->sk_tsflags_bpf);
> @@ -5672,7 +5673,15 @@ static void skb_tstamp_tx_output_bpf(struct sock *sk, int tstype,
>  		args[1] = ts.tv_nsec;
>  	}
>  
> -	timestamp_call_bpf(sk, cb_flag, 2, args);
> +	if (tsflags & SOF_TIMESTAMPING_OPT_ID) {
> +		args[2] = skb_shinfo(skb)->tskey;
> +		if (sk_is_tcp(sk))
> +			args[2] -= atomic_read(&sk->sk_tskey);
> +		if (sk->sk_tskey_bpf_offset)
> +			args[2] += sk->sk_tskey_bpf_offset;
> +	}
> +
> +	timestamp_call_bpf(sk, cb_flag, 3, args);

So the BPF interface is effectively OPT_TSONLY: the packet data is
never shared.

Then OPT_ID should be mandatory, because it without it the data is
not actionable: which byte in the bytestream or packet in the case
of datagram sockets does a callback refer to.

> +/* Used to track the tskey for bpf extension
> + *
> + * @sk_tskey: bpf extension can use it only when no application uses.
> + *            Application can use it directly regardless of bpf extension.
> + *
> + * There are three strategies:
> + * 1) If we've already set through setsockopt() and here we're going to set
> + *    OPT_ID for bpf use, we will not re-initialize the @sk_tskey and will
> + *    keep the record of delta between the current "key" and previous key.
> + * 2) If we've already set through bpf_setsockopt() and here we're going to
> + *    set for application use, we will record the delta first and then
> + *    override/initialize the @sk_tskey.
> + * 3) other cases, which means only either of them takes effect, so initialize
> + *    everything simplely.
> + */

Please explain in the commit message that these gymnastics are needed
because there can only be one tskey in skb_shared_info.

> +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type)
> +{
> +	u32 tskey;
> +
> +	if (sk_is_tcp(sk)) {
> +		if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
> +			return -EINVAL;
> +
> +		if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
> +			tskey = tcp_sk(sk)->write_seq;
> +		else
> +			tskey = tcp_sk(sk)->snd_una;
> +	} else {
> +		tskey = 0;
> +	}
> +
> +	if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
> +		sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey);
> +		return 0;
> +	} else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) {
> +		sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey;
> +	} else {
> +		sk->sk_tskey_bpf_offset = 0;
> +	}
> +
> +	return tskey;
> +}
> +
>  int sock_set_tskey(struct sock *sk, int val, int bpf_type)
>  {
>  	u32 tsflags = bpf_type ? sk->sk_tsflags_bpf : sk->sk_tsflags;
> @@ -901,17 +944,13 @@ int sock_set_tskey(struct sock *sk, int val, int bpf_type)
>  
>  	if (val & SOF_TIMESTAMPING_OPT_ID &&
>  	    !(tsflags & SOF_TIMESTAMPING_OPT_ID)) {
> -		if (sk_is_tcp(sk)) {
> -			if ((1 << sk->sk_state) &
> -			    (TCPF_CLOSE | TCPF_LISTEN))
> -				return -EINVAL;
> -			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
> -				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
> -			else
> -				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
> -		} else {
> -			atomic_set(&sk->sk_tskey, 0);
> -		}
> +		long int ret;
> +
> +		ret = sock_calculate_tskey_offset(sk, val, bpf_type);
> +		if (ret <= 0)
> +			return ret;
> +
> +		atomic_set(&sk->sk_tskey, ret);
>  	}
>  
>  	return 0;
> @@ -956,10 +995,15 @@ static int sock_set_timestamping_bpf(struct sock *sk,
>  				     struct so_timestamping timestamping)
>  {
>  	u32 flags = timestamping.flags;
> +	int ret;
>  
>  	if (flags & ~SOF_TIMESTAMPING_BPF_SUPPPORTED_MASK)
>  		return -EINVAL;
>  
> +	ret = sock_set_tskey(sk, flags, 1);
> +	if (ret)
> +		return ret;
> +
>  	WRITE_ONCE(sk->sk_tsflags_bpf, flags);
>  
>  	return 0;

I'm a bit hazy on when this can be called. We can assume that this new
BPF operation cannot race with the existing setsockopt nor with the
datapath that might touch the atomic fields, right?