From: Cong Wang <cong.wang@xxxxxxxxxxxxx> Currently sockmap calls into each protocol to update the struct proto and replace it. This certainly won't work when the protocol is implemented as a module, for example, AF_UNIX. Introduce a new ops sk->sk_prot->update_proto(), so each protocol can implement its own way to replace the struct proto. Cc: John Fastabend <john.fastabend@xxxxxxxxx> Cc: Daniel Borkmann <daniel@xxxxxxxxxxxxx> Cc: Jakub Sitnicki <jakub@xxxxxxxxxxxxxx> Cc: Lorenz Bauer <lmb@xxxxxxxxxxxxxx> Signed-off-by: Cong Wang <cong.wang@xxxxxxxxxxxxx> --- include/linux/skmsg.h | 18 +++--------------- include/net/sock.h | 3 +++ include/net/tcp.h | 2 +- include/net/udp.h | 2 +- net/core/sock_map.c | 22 +++------------------- net/ipv4/tcp_bpf.c | 20 +++++++++++++++++--- net/ipv4/tcp_ipv4.c | 3 +++ net/ipv4/udp.c | 3 +++ net/ipv4/udp_bpf.c | 14 ++++++++++++-- net/ipv6/tcp_ipv6.c | 3 +++ net/ipv6/udp.c | 3 +++ 11 files changed, 52 insertions(+), 41 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index cb79b1afa556..cb94d0f89c08 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -97,6 +97,7 @@ struct sk_psock { void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); + int (*saved_update_proto)(struct sock *sk, bool restore); struct proto *sk_proto; struct sk_psock_work_state work_state; struct work_struct work; @@ -335,25 +336,12 @@ static inline void sk_psock_cork_free(struct sk_psock *psock) } } -static inline void sk_psock_update_proto(struct sock *sk, - struct sk_psock *psock, - struct proto *ops) -{ - /* Pairs with lockless read in sk_clone_lock() */ - WRITE_ONCE(sk->sk_prot, ops); -} - static inline void sk_psock_restore_proto(struct sock *sk, struct sk_psock *psock) { sk->sk_prot->unhash = psock->saved_unhash; - if (inet_csk_has_ulp(sk)) { - tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); - } else { - sk->sk_write_space = psock->saved_write_space; - /* Pairs with lockless read in sk_clone_lock() */ - WRITE_ONCE(sk->sk_prot, psock->sk_proto); - } + if (psock->saved_update_proto) + psock->saved_update_proto(sk, true); } static inline void sk_psock_set_state(struct sk_psock *psock, diff --git a/include/net/sock.h b/include/net/sock.h index 7644ea64a376..e474a9202be8 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1184,6 +1184,9 @@ struct proto { void (*unhash)(struct sock *sk); void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); +#ifdef CONFIG_BPF_SOCK_MAP + int (*update_proto)(struct sock *sk, bool restore); +#endif /* Keeping track of sockets in use */ #ifdef CONFIG_PROC_FS diff --git a/include/net/tcp.h b/include/net/tcp.h index f7591768525d..c2fff35859b6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2183,7 +2183,7 @@ struct sk_msg; struct sk_psock; #ifdef CONFIG_BPF_SOCK_MAP -struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); +int tcp_bpf_update_proto(struct sock *sk, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); #else static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) diff --git a/include/net/udp.h b/include/net/udp.h index 0ff921e6b866..e3e5dfc8e0f0 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -513,7 +513,7 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, #ifdef CONFIG_BPF_SOCK_MAP struct sk_psock; -struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); +int udp_bpf_update_proto(struct sock *sk, bool restore); #endif /* CONFIG_BPF_SOCK_MAP */ #endif /* _UDP_H */ diff --git a/net/core/sock_map.c b/net/core/sock_map.c index f827f1ecefcc..255067e5c73a 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -181,26 +181,10 @@ static void sock_map_unref(struct sock *sk, void *link_raw) static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) { - struct proto *prot; - - switch (sk->sk_type) { - case SOCK_STREAM: - prot = tcp_bpf_get_proto(sk, psock); - break; - - case SOCK_DGRAM: - prot = udp_bpf_get_proto(sk, psock); - break; - - default: + if (!sk->sk_prot->update_proto) return -EINVAL; - } - - if (IS_ERR(prot)) - return PTR_ERR(prot); - - sk_psock_update_proto(sk, psock, prot); - return 0; + psock->saved_update_proto = sk->sk_prot->update_proto; + return sk->sk_prot->update_proto(sk, false); } static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 2252f1d90676..16e00802ccba 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -601,19 +601,33 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; } -struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) +int tcp_bpf_update_proto(struct sock *sk, bool restore) { + struct sk_psock *psock = sk_psock(sk); int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; + if (restore) { + if (inet_csk_has_ulp(sk)) { + tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); + } else { + sk->sk_write_space = psock->saved_write_space; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + } + return 0; + } + if (sk->sk_family == AF_INET6) { if (tcp_bpf_assert_proto_ops(psock->sk_proto)) - return ERR_PTR(-EINVAL); + return -EINVAL; tcp_bpf_check_v6_needs_rebuild(psock->sk_proto); } - return &tcp_bpf_prots[family][config]; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]); + return 0; } /* If a child got cloned from a listening socket that had tcp_bpf diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 62b6fd385a47..d7c30b762cc3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2803,6 +2803,9 @@ struct proto tcp_prot = { .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, +#ifdef CONFIG_BPF_SOCK_MAP + .update_proto = tcp_bpf_update_proto, +#endif .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c67e483fce41..84ab4f2e874a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2843,6 +2843,9 @@ struct proto udp_prot = { .unhash = udp_lib_unhash, .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, +#ifdef CONFIG_BPF_SOCK_MAP + .update_proto = udp_bpf_update_proto, +#endif .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 7a94791efc1a..595836088e85 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -41,12 +41,22 @@ static int __init udp_bpf_v4_build_proto(void) } core_initcall(udp_bpf_v4_build_proto); -struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) +int udp_bpf_update_proto(struct sock *sk, bool restore) { int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6; + struct sk_psock *psock = sk_psock(sk); + + if (restore) { + sk->sk_write_space = psock->saved_write_space; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + return 0; + } if (sk->sk_family == AF_INET6) udp_bpf_check_v6_needs_rebuild(psock->sk_proto); - return &udp_bpf_prots[family]; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, &udp_bpf_prots[family]); + return 0; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 8539715ff035..77b11799a3fe 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2131,6 +2131,9 @@ struct proto tcpv6_prot = { .hash = inet6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, +#ifdef CONFIG_BPF_SOCK_MAP + .update_proto = tcp_bpf_update_proto, +#endif .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index a02ac875a923..66ebdfc83c95 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1711,6 +1711,9 @@ struct proto udpv6_prot = { .unhash = udp_lib_unhash, .rehash = udp_v6_rehash, .get_port = udp_v6_get_port, +#ifdef CONFIG_BPF_SOCK_MAP + .update_proto = udp_bpf_update_proto, +#endif .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), -- 2.25.1