27.05.2020, 19:43, "Eric Dumazet" <eric.dumazet@xxxxxxxxx>: > On 5/27/20 8:05 AM, Dmitry Yakunin wrote: >> This patch adds support of SO_KEEPALIVE flag and TCP related options >> to bpf_setsockopt() routine. This is helpful if we want to enable or tune >> TCP keepalive for applications which don't do it in the userspace code. >> In order to avoid copy-paste, common code from classic setsockopt was moved >> to auxiliary functions in the headers. > > Please split this in two patches : > - one adding the helpers, a pure TCP patch. > - one for BPF additions. Thanks for your comment. I've split this into three patches (sock, tcp and bpf) and resent. >> Signed-off-by: Dmitry Yakunin <zeil@xxxxxxxxxxxxxx> >> --- >> include/net/sock.h | 9 +++++++++ >> include/net/tcp.h | 18 ++++++++++++++++++ >> net/core/filter.c | 39 ++++++++++++++++++++++++++++++++++++++- >> net/core/sock.c | 9 --------- >> net/ipv4/tcp.c | 15 ++------------- >> 5 files changed, 67 insertions(+), 23 deletions(-) >> >> diff --git a/include/net/sock.h b/include/net/sock.h >> index 3e8c6d4..ee35dea 100644 >> --- a/include/net/sock.h >> +++ b/include/net/sock.h >> @@ -879,6 +879,15 @@ static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag) >> __clear_bit(flag, &sk->sk_flags); >> } >> >> +static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit, >> + int valbool) >> +{ >> + if (valbool) >> + sock_set_flag(sk, bit); >> + else >> + sock_reset_flag(sk, bit); >> +} >> + >> static inline bool sock_flag(const struct sock *sk, enum sock_flags flag) >> { >> return test_bit(flag, &sk->sk_flags); >> diff --git a/include/net/tcp.h b/include/net/tcp.h >> index b681338..ae6a495 100644 >> --- a/include/net/tcp.h >> +++ b/include/net/tcp.h >> @@ -1465,6 +1465,24 @@ static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) >> tcp_jiffies32 - tp->rcv_tstamp); >> } >> >> +/* val must be validated at the top level function */ >> +static inline void keepalive_time_set(struct tcp_sock *tp, int val) >> +{ >> + struct sock *sk = (struct sock *)tp; > > We prefer the other way to avoid a cast unless really needed : > > static inline tcp_keepalive_time_set(struct sock *sk, int val) > { > stuct tcp_sock *tp = tcp_sk(sk); > >> + >> + tp->keepalive_time = val * HZ; >> + if (sock_flag(sk, SOCK_KEEPOPEN) && >> + !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { >> + u32 elapsed = keepalive_time_elapsed(tp); >> + >> + if (tp->keepalive_time > elapsed) >> + elapsed = tp->keepalive_time - elapsed; >> + else >> + elapsed = 0; >> + inet_csk_reset_keepalive_timer(sk, elapsed); >> + } >> +} >> + >> static inline int tcp_fin_time(const struct sock *sk) >> { >> int fin_timeout = tcp_sk(sk)->linger2 ? : sock_net(sk)->ipv4.sysctl_tcp_fin_timeout; >> diff --git a/net/core/filter.c b/net/core/filter.c >> index a6fc234..1035e43 100644 >> --- a/net/core/filter.c >> +++ b/net/core/filter.c >> @@ -4248,8 +4248,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { >> static int _bpf_setsockopt(struct sock *sk, int level, int optname, >> char *optval, int optlen, u32 flags) >> { >> + int val, valbool; >> int ret = 0; >> - int val; >> >> if (!sk_fullsock(sk)) >> return -EINVAL; >> @@ -4260,6 +4260,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, >> if (optlen != sizeof(int)) >> return -EINVAL; >> val = *((int *)optval); >> + valbool = val ? 1 : 0; >> >> /* Only some socketops are supported */ >> switch (optname) { >> @@ -4298,6 +4299,11 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, >> sk_dst_reset(sk); >> } >> break; >> + case SO_KEEPALIVE: >> + if (sk->sk_prot->keepalive) >> + sk->sk_prot->keepalive(sk, valbool); >> + sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); >> + break; >> default: >> ret = -EINVAL; >> } >> @@ -4358,6 +4364,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, >> ret = tcp_set_congestion_control(sk, name, false, >> reinit, true); >> } else { >> + struct inet_connection_sock *icsk = inet_csk(sk); >> struct tcp_sock *tp = tcp_sk(sk); >> >> if (optlen != sizeof(int)) >> @@ -4386,6 +4393,36 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, >> else >> tp->save_syn = val; >> break; >> + case TCP_KEEPIDLE: >> + if (val < 1 || val > MAX_TCP_KEEPIDLE) >> + ret = -EINVAL; >> + else >> + keepalive_time_set(tp, val); >> + break; >> + case TCP_KEEPINTVL: >> + if (val < 1 || val > MAX_TCP_KEEPINTVL) >> + ret = -EINVAL; >> + else >> + tp->keepalive_intvl = val * HZ; >> + break; >> + case TCP_KEEPCNT: >> + if (val < 1 || val > MAX_TCP_KEEPCNT) >> + ret = -EINVAL; >> + else >> + tp->keepalive_probes = val; >> + break; >> + case TCP_SYNCNT: >> + if (val < 1 || val > MAX_TCP_SYNCNT) >> + ret = -EINVAL; >> + else >> + icsk->icsk_syn_retries = val; >> + break; >> + case TCP_USER_TIMEOUT: >> + if (val < 0) >> + ret = -EINVAL; >> + else >> + icsk->icsk_user_timeout = val; >> + break; >> default: >> ret = -EINVAL; >> } >> diff --git a/net/core/sock.c b/net/core/sock.c >> index fd85e65..9836b01 100644 >> --- a/net/core/sock.c >> +++ b/net/core/sock.c >> @@ -684,15 +684,6 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval, >> return ret; >> } >> >> -static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit, >> - int valbool) >> -{ >> - if (valbool) >> - sock_set_flag(sk, bit); >> - else >> - sock_reset_flag(sk, bit); >> -} >> - >> bool sk_mc_loop(struct sock *sk) >> { >> if (dev_recursion_level()) >> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c >> index 9700649..7b239e8 100644 >> --- a/net/ipv4/tcp.c >> +++ b/net/ipv4/tcp.c >> @@ -3003,19 +3003,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, >> case TCP_KEEPIDLE: >> if (val < 1 || val > MAX_TCP_KEEPIDLE) >> err = -EINVAL; >> - else { >> - tp->keepalive_time = val * HZ; >> - if (sock_flag(sk, SOCK_KEEPOPEN) && >> - !((1 << sk->sk_state) & >> - (TCPF_CLOSE | TCPF_LISTEN))) { >> - u32 elapsed = keepalive_time_elapsed(tp); >> - if (tp->keepalive_time > elapsed) >> - elapsed = tp->keepalive_time - elapsed; >> - else >> - elapsed = 0; >> - inet_csk_reset_keepalive_timer(sk, elapsed); >> - } >> - } >> + else >> + keepalive_time_set(tp, val); >> break; >> case TCP_KEEPINTVL: >> if (val < 1 || val > MAX_TCP_KEEPINTVL)