This patch defines a new function to migrate ESTABLISHED/SYN_RECV sockets.

Listening sockets hold incoming connections as a linked list of struct
request_sock in the accept queue, and each request holds a reference to its
full socket and its listener. In inet_csk_reqsk_queue_migrate(), we only
unlink the requests from the closing listener's queue and relink them to the
head of the new listener's queue. We do not process each request and its
reference to the listener, so the migration completes in O(1) time
complexity.

Moreover, if TFO requests caused an RST before the 3WHS has completed, they
are held in the listener's TFO queue to prevent DDoS attacks. Thus, we also
migrate the requests in the TFO queue in the same way.

After the 3WHS has completed, there are three access patterns to incoming
sockets:

  (1) access to the full socket instead of request_sock
  (2) access to request_sock from the accept queue
  (3) access to request_sock from the TFO queue

In the first case, the full socket does not have a reference to its request
socket and listener, so we do not need the correct listener set in the
request socket. In the second case, we always have the correct listener and
currently do not use req->rsk_listener. However, in the third case of
TCP_SYN_RECV sockets, we take special care in the next commit.
Reviewed-by: Benjamin Herrenschmidt <benh@xxxxxxxxxx>
Signed-off-by: Kuniyuki Iwashima <kuniyu@xxxxxxxxxxxx>
---
 include/net/inet_connection_sock.h |  1 +
 net/ipv4/inet_connection_sock.c    | 79 ++++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+)

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 7338b3865a2a..2ea2d743f8fc 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -260,6 +260,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
 struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
 				      struct request_sock *req,
 				      struct sock *child);
+void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk);
 void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
 				   unsigned long timeout);
 struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 1451aa9712b0..5da38a756e4c 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -992,6 +992,85 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
 }
 EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
 
+/**
+ * inet_csk_reqsk_queue_migrate - hand queued requests over to another listener
+ * @sk: closing listener whose queues are emptied
+ * @nsk: listener that receives the requests
+ *
+ * Splice the accept queue and the TFO RST queue of @sk onto the head of the
+ * matching queues of @nsk.  Individual requests are not visited, so their
+ * references to the listener are left untouched.
+ */
+void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk)
+{
+	struct request_sock_queue *old_accept_queue, *new_accept_queue;
+	struct fastopen_queue *old_fastopenq, *new_fastopenq;
+	spinlock_t *l1, *l2, *l3, *l4;
+
+	old_accept_queue = &inet_csk(sk)->icsk_accept_queue;
+	new_accept_queue = &inet_csk(nsk)->icsk_accept_queue;
+	old_fastopenq = &old_accept_queue->fastopenq;
+	new_fastopenq = &new_accept_queue->fastopenq;
+
+	l1 = &old_accept_queue->rskq_lock;
+	l2 = &new_accept_queue->rskq_lock;
+	l3 = &old_fastopenq->lock;
+	l4 = &new_fastopenq->lock;
+
+	/* sk is never selected as the new listener from reuse->socks[],
+	 * so inversion deadlock does not happen here,
+	 * but change the order to avoid the warning of lockdep.
+	 */
+	if (sk < nsk) {
+		swap(l1, l2);
+		swap(l3, l4);
+	}
+
+	spin_lock(l1);
+	spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+
+	/* Prepend the old accept queue; an empty new queue inherits its tail. */
+	if (old_accept_queue->rskq_accept_head) {
+		if (new_accept_queue->rskq_accept_head)
+			old_accept_queue->rskq_accept_tail->dl_next =
+				new_accept_queue->rskq_accept_head;
+		else
+			new_accept_queue->rskq_accept_tail = old_accept_queue->rskq_accept_tail;
+
+		new_accept_queue->rskq_accept_head = old_accept_queue->rskq_accept_head;
+		old_accept_queue->rskq_accept_head = NULL;
+		old_accept_queue->rskq_accept_tail = NULL;
+
+		WRITE_ONCE(nsk->sk_ack_backlog, nsk->sk_ack_backlog + sk->sk_ack_backlog);
+		WRITE_ONCE(sk->sk_ack_backlog, 0);
+	}
+
+	spin_unlock(l2);
+	spin_unlock(l1);
+
+	spin_lock_bh(l3);
+	spin_lock_bh_nested(l4, SINGLE_DEPTH_NESTING);
+
+	/* Same splice for the TFO RST queue, plus its length counter. */
+	new_fastopenq->qlen += old_fastopenq->qlen;
+	old_fastopenq->qlen = 0;
+
+	if (old_fastopenq->rskq_rst_head) {
+		if (new_fastopenq->rskq_rst_head)
+			old_fastopenq->rskq_rst_tail->dl_next = new_fastopenq->rskq_rst_head;
+		else
+			new_fastopenq->rskq_rst_tail = old_fastopenq->rskq_rst_tail;
+
+		new_fastopenq->rskq_rst_head = old_fastopenq->rskq_rst_head;
+		old_fastopenq->rskq_rst_head = NULL;
+		old_fastopenq->rskq_rst_tail = NULL;
+	}
+
+	spin_unlock_bh(l4);
+	spin_unlock_bh(l3);
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_migrate);
+
 struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
 					 struct request_sock *req, bool own_req)
 {
-- 
2.17.2 (Apple Git-113)