2024-10-29, 11:47:25 +0100, Antonio Quartulli wrote:
> +static void ovpn_socket_release_work(struct work_struct *work)
> +{
> +        struct ovpn_socket *sock = container_of(work, struct ovpn_socket, work);
> +
> +        ovpn_socket_detach(sock->sock);
> +        kfree_rcu(sock, rcu);
> +}
> +
> +static void ovpn_socket_schedule_release(struct ovpn_socket *sock)
> +{
> +        INIT_WORK(&sock->work, ovpn_socket_release_work);
> +        schedule_work(&sock->work);

How does module unloading know that it has to wait for this work to
complete? Will ovpn_cleanup get stuck until some refcount gets released
by this work?

[...]

> +static void ovpn_tcp_rcv(struct strparser *strp, struct sk_buff *skb)
> +{
> +        struct ovpn_peer *peer = container_of(strp, struct ovpn_peer, tcp.strp);
> +        struct strp_msg *msg = strp_msg(skb);
> +        size_t pkt_len = msg->full_len - 2;
> +        size_t off = msg->offset + 2;
> +
> +        /* ensure skb->data points to the beginning of the openvpn packet */
> +        if (!pskb_pull(skb, off)) {
> +                net_warn_ratelimited("%s: packet too small\n",
> +                                     peer->ovpn->dev->name);
> +                goto err;
> +        }
> +
> +        /* strparser does not trim the skb for us, therefore we do it now */
> +        if (pskb_trim(skb, pkt_len) != 0) {
> +                net_warn_ratelimited("%s: trimming skb failed\n",
> +                                     peer->ovpn->dev->name);
> +                goto err;
> +        }
> +
> +        /* we need the first byte of data to be accessible
> +         * to extract the opcode and the key ID later on
> +         */
> +        if (!pskb_may_pull(skb, 1)) {
> +                net_warn_ratelimited("%s: packet too small to fetch opcode\n",
> +                                     peer->ovpn->dev->name);
> +                goto err;
> +        }
> +
> +        /* DATA_V2 packets are handled in kernel, the rest goes to user space */
> +        if (likely(ovpn_opcode_from_skb(skb, 0) == OVPN_DATA_V2)) {
> +                /* hold reference to peer as required by ovpn_recv().
> +                 *
> +                 * NOTE: in this context we should already be holding a
> +                 * reference to this peer, therefore ovpn_peer_hold() is
> +                 * not expected to fail
> +                 */
> +                if (WARN_ON(!ovpn_peer_hold(peer)))
> +                        goto err;
> +
> +                ovpn_recv(peer, skb);
> +        } else {
> +                /* The packet size header must be there when sending the packet
> +                 * to userspace, therefore we put it back
> +                 */
> +                skb_push(skb, 2);
> +                ovpn_tcp_to_userspace(peer, strp->sk, skb);
> +        }
> +
> +        return;
> +err:
> +        netdev_err(peer->ovpn->dev,
> +                   "cannot process incoming TCP data for peer %u\n", peer->id);

This should also be ratelimited, and maybe just combined with the
net_warn_ratelimited just before each goto.

> +        dev_core_stats_rx_dropped_inc(peer->ovpn->dev);
> +        kfree_skb(skb);
> +        ovpn_peer_del(peer, OVPN_DEL_PEER_REASON_TRANSPORT_ERROR);
> +}

[...]

> +void ovpn_tcp_socket_detach(struct socket *sock)
> +{
[...]
> +        /* restore CBs that were saved in ovpn_sock_set_tcp_cb() */
> +        sock->sk->sk_data_ready = peer->tcp.sk_cb.sk_data_ready;
> +        sock->sk->sk_write_space = peer->tcp.sk_cb.sk_write_space;
> +        sock->sk->sk_prot = peer->tcp.sk_cb.prot;
> +        sock->sk->sk_socket->ops = peer->tcp.sk_cb.ops;
> +        rcu_assign_sk_user_data(sock->sk, NULL);
> +
> +        rcu_read_unlock();
> +
> +        /* cancel any ongoing work. Done after removing the CBs so that these
> +         * workers cannot be re-armed
> +         */

I'm not sure whether a barrier is needed to prevent compiler/CPU
reordering here (see the sketch below the quoted block).

> +        cancel_work_sync(&peer->tcp.tx_work);
> +        strp_done(&peer->tcp.strp);
> +}
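If it turns out ordering does matter here, one option could be to
publish the restored callbacks under sk_callback_lock, like other
in-kernel users that save and restore sk callbacks do. Untested
sketch, only to illustrate the idea (the unlock doubles as a release
barrier for the stores):

        write_lock_bh(&sock->sk->sk_callback_lock);
        /* restore CBs that were saved in ovpn_sock_set_tcp_cb();
         * write_unlock_bh() below is a release barrier, so neither
         * the compiler nor the CPU can reorder these stores past it
         */
        sock->sk->sk_data_ready = peer->tcp.sk_cb.sk_data_ready;
        sock->sk->sk_write_space = peer->tcp.sk_cb.sk_write_space;
        sock->sk->sk_prot = peer->tcp.sk_cb.prot;
        sock->sk->sk_socket->ops = peer->tcp.sk_cb.ops;
        rcu_assign_sk_user_data(sock->sk, NULL);
        write_unlock_bh(&sock->sk->sk_callback_lock);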
> +static void ovpn_tcp_send_sock(struct ovpn_peer *peer)
> +{
> +        struct sk_buff *skb = peer->tcp.out_msg.skb;
> +
> +        if (!skb)
> +                return;
> +
> +        if (peer->tcp.tx_in_progress)
> +                return;
> +
> +        peer->tcp.tx_in_progress = true;

Sorry, I never answered your question about my concerns in a previous
review here.

We can reach ovpn_tcp_send_sock in two different contexts:
 - lock_sock (either from ovpn_tcp_sendmsg or ovpn_tcp_tx_work)
 - bh_lock_sock (from ovpn_tcp_send_skb, i.e. the "data path")

These are not fully mutually exclusive. lock_sock grabs bh_lock_sock (a
spinlock) for a brief period to mark the (sleeping/mutex) lock as taken,
and then releases it. So when bh_lock_sock is held, it's not possible to
grab lock_sock. But when lock_sock is taken, it's still possible to grab
bh_lock_sock.

The buggy scenario would be:

    (data path encrypt)                     (sendmsg)
    ovpn_tcp_send_skb                       lock_sock
                                              bh_lock_sock + owned=1 + bh_unlock_sock
      bh_lock_sock
      ovpn_tcp_send_sock_skb                ovpn_tcp_send_sock_skb
        !peer->tcp.out_msg.skb                !peer->tcp.out_msg.skb
        peer->tcp.out_msg.skb = ...           peer->tcp.out_msg.skb = ...
        ovpn_tcp_send_sock                    ovpn_tcp_send_sock
          !peer->tcp.tx_in_progress             !peer->tcp.tx_in_progress
          peer->tcp.tx_in_progress = true       peer->tcp.tx_in_progress = true
          // proceed                            // proceed

That's two similar races, one on out_msg.skb and one on tx_in_progress.
It's a bit unlikely (but not impossible) that we'll have two CPUs trying
to call skb_send_sock_locked at the same time, but if they just
overwrite each other's skb/len it's already pretty bad.

The end of ovpn_tcp_send_sock might also reset peer->tcp.out_msg.* just
as ovpn_tcp_send_skb -> ovpn_tcp_send_sock_skb starts setting it up
(peer->tcp.out_msg.skb gets cleared, ovpn_tcp_send_sock_skb proceeds and
sets skb+len, then maybe len gets reset to 0 by ovpn_tcp_send_sock).

To avoid this problem, esp_output_tcp_finish (net/ipv4/esp4.c) does:

        bh_lock_sock(sk);
        if (sock_owned_by_user(sk))
                err = espintcp_queue_out(sk, skb);
        else
                err = espintcp_push_skb(sk, skb);
        bh_unlock_sock(sk);

(espintcp_push_skb is roughly equivalent to ovpn_tcp_send_sock_skb)

> +        do {
> +                int ret = skb_send_sock_locked(peer->sock->sock->sk, skb,
> +                                               peer->tcp.out_msg.offset,
> +                                               peer->tcp.out_msg.len);
> +                if (unlikely(ret < 0)) {
> +                        if (ret == -EAGAIN)
> +                                goto out;
> +
> +                        net_warn_ratelimited("%s: TCP error to peer %u: %d\n",
> +                                             peer->ovpn->dev->name, peer->id,
> +                                             ret);
> +
> +                        /* in case of TCP error we can't recover the VPN
> +                         * stream therefore we abort the connection
> +                         */
> +                        ovpn_peer_del(peer,
> +                                      OVPN_DEL_PEER_REASON_TRANSPORT_ERROR);
> +                        break;
> +                }
> +
> +                peer->tcp.out_msg.len -= ret;
> +                peer->tcp.out_msg.offset += ret;
> +        } while (peer->tcp.out_msg.len > 0);
> +
> +        if (!peer->tcp.out_msg.len)
> +                dev_sw_netstats_tx_add(peer->ovpn->dev, 1, skb->len);
> +
> +        kfree_skb(peer->tcp.out_msg.skb);
> +        peer->tcp.out_msg.skb = NULL;
> +        peer->tcp.out_msg.len = 0;
> +        peer->tcp.out_msg.offset = 0;
> +
> +out:
> +        peer->tcp.tx_in_progress = false;
> +}
> +
> +static void ovpn_tcp_tx_work(struct work_struct *work)
> +{
> +        struct ovpn_peer *peer;
> +
> +        peer = container_of(work, struct ovpn_peer, tcp.tx_work);
> +
> +        lock_sock(peer->sock->sock->sk);
> +        ovpn_tcp_send_sock(peer);
> +        release_sock(peer->sock->sock->sk);
> +}
> +
> +void ovpn_tcp_send_sock_skb(struct ovpn_peer *peer, struct sk_buff *skb)
> +{
> +        if (peer->tcp.out_msg.skb)
> +                return;

That's leaking the skb? (and not counting the drop)

> +
> +        peer->tcp.out_msg.skb = skb;
> +        peer->tcp.out_msg.len = skb->len;
> +        peer->tcp.out_msg.offset = 0;
> +
> +        ovpn_tcp_send_sock(peer);
> +}
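Something like this could cover both points, mirroring what the rx
error path above already does (untested):

        if (peer->tcp.out_msg.skb) {
                /* a previous message is still queued or in flight:
                 * count the drop and free the skb instead of leaking it
                 */
                dev_core_stats_tx_dropped_inc(peer->ovpn->dev);
                kfree_skb(skb);
                return;
        }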
> +
> +static int ovpn_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
> +{
[...]
> +        ret = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
> +        if (ret) {
> +                kfree_skb(skb);
> +                net_err_ratelimited("%s: skb copy from iter failed: %d\n",
> +                                    sock->peer->ovpn->dev->name, ret);
> +                goto unlock;
> +        }
> +
> +        ovpn_tcp_send_sock_skb(sock->peer, skb);

If we didn't send the packet (because one was already queued/in
progress), we should either stash it, or tell userspace that it wasn't
sent and it should retry later.

> +        ret = size;
> +unlock:
> +        release_sock(sk);
> +peer_free:
> +        ovpn_peer_put(peer);
> +        return ret;
> +}
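For the "tell userspace to retry" option, checking the queue before
copying the data in might be enough, since lock_sock should already be
held at this point (the beginning of the function is elided above, so
this is an untested sketch reusing the existing unlock label; the
"stash" option would instead need a queue plus a flush from the tx
work, similar to espintcp_queue_out):

        if (sock->peer->tcp.out_msg.skb) {
                /* the previous packet hasn't been fully pushed to the
                 * TCP socket yet: don't overwrite it, let userspace
                 * retry this send later
                 */
                ret = -EAGAIN;
                goto unlock;
        }

-- 
Sabrina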