On 21.03.23 08:19, Kai Shen wrote:
SMC-R performs not so well on fallback situations right now,
especially on short link server fallback occasions. We are planning
to make SMC-R widely used and handling this fallback performance
issue is really crucial to us. Here we introduce a shadow socket
method to try to relief this problem.
Could you please elaborate the problem?
Basicly, we use two more accept queues to hold incoming connections,
one for fallback connections and the other for smc-r connections.
We implement this method by using two more 'shadow' sockets and
make the connection path of fallback connections almost the same as
normal tcp connections.
Now the SMC-R accept path is like:
1. incoming connection
2. schedule work to smc sock alloc, tcp accept and push to smc
acceptq
3. wake up user to accept
When fallback happens on servers, the accepting path is the same
which costs more than normal tcp accept path. In fallback
situations, the step 2 above is not necessary and the smc sock is
also not needed. So we use two more shadow sockets when one smc
socket start listening. When new connection comes, we pop the req
to the fallback socket acceptq or the non-fallback socket acceptq
according to its syn_smc flag. As a result, when fallback happen we
can graft the user socket with a normal tcp sock instead of a smc
sock and get rid of the cost generated by step 2 and smc sock
releasing.
+-----> non-fallback socket acceptq
|
incoming req --+
|
+-----> fallback socket acceptq
With the help of shadow socket, we gain similar performance as tcp
connections on short link nginx server fallback occasions as what
is illustrated below.
Cases are like "./wrk http://x.x.x.x:x/
-H 'Connection: Close' -c 1600 -t 32 -d 20 --latency"
TCP:
Requests/sec: 145438.65
Transfer/sec: 21.64MB
Server fallback occasions on original SMC-R:
Requests/sec: 114192.82
Transfer/sec: 16.99MB
Server fallback occasions on SMC-R with shadow sockets:
Requests/sec: 143528.11
Transfer/sec: 21.35MB
Generally, I don't have a good feeling about the two non-listenning
sockets, and I can not see why it is necessary to introduce the socket
actsock instead of using the clcsock itself. Maybe you can convince me
with a good reason.
On the other hand, as a result of using another accept queue, the
fastopenq lock is not the right lock to access when accepting. So
we need to find the right fastopenq lock in inet_csk_accept.
Signed-off-by: Kai Shen <KaiShen@xxxxxxxxxxxxxxxxx>
---
net/ipv4/inet_connection_sock.c | 13 ++-
net/smc/af_smc.c | 143 ++++++++++++++++++++++++++++++--
net/smc/smc.h | 2 +
3 files changed, 150 insertions(+), 8 deletions(-)
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 65ad4251f6fd..ba2ec5ad4c04 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -658,6 +658,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+ spinlock_t *fastopenq_lock = &queue->fastopenq.lock;
struct request_sock *req;
struct sock *newsk;
int error;
@@ -689,7 +690,15 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
if (sk->sk_protocol == IPPROTO_TCP &&
tcp_rsk(req)->tfo_listener) {
- spin_lock_bh(&queue->fastopenq.lock);
+#if IS_ENABLED(CONFIG_SMC)
+ if (tcp_sk(sk)->syn_smc) {
+ struct request_sock_queue *orig_queue;
+
+ orig_queue = &inet_csk(req->rsk_listener)->icsk_accept_queue;
+ fastopenq_lock = &orig_queue->fastopenq.lock;
+ }
+#endif
+ spin_lock_bh(fastopenq_lock);
if (tcp_rsk(req)->tfo_listener) {
/* We are still waiting for the final ACK from 3WHS
* so can't free req now. Instead, we set req->sk to
@@ -700,7 +709,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
req->sk = NULL;
req = NULL;
}
- spin_unlock_bh(&queue->fastopenq.lock);
+ spin_unlock_bh(fastopenq_lock);
}
out:
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index a4cccdfdc00a..ad6c3b9ec9a6 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -126,7 +126,9 @@ static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk,
smc = smc_clcsock_user_data(sk);
- if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) >
+ if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs)
+ + READ_ONCE(smc->actsock->sk->sk_ack_backlog)
+ + READ_ONCE(smc->fbsock->sk->sk_ack_backlog) >
sk->sk_max_ack_backlog)
goto drop;
@@ -286,6 +288,10 @@ static int __smc_release(struct smc_sock *smc)
/* wake up clcsock accept */
rc = kernel_sock_shutdown(smc->clcsock,
SHUT_RDWR);
+ if (smc->fbsock)
+ sock_release(smc->fbsock);
+ if (smc->actsock)
+ sock_release(smc->actsock);
}
sk->sk_state = SMC_CLOSED;
sk->sk_state_change(sk);
@@ -1681,7 +1687,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
mutex_lock(&lsmc->clcsock_release_lock);
if (lsmc->clcsock)
- rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK);
+ rc = kernel_accept(lsmc->actsock, &new_clcsock, SOCK_NONBLOCK);
mutex_unlock(&lsmc->clcsock_release_lock);
lock_sock(lsk);
if (rc < 0 && rc != -EAGAIN)
@@ -2486,9 +2492,46 @@ static void smc_tcp_listen_work(struct work_struct *work)
sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */
}
+#define SMC_LINK 1
+#define FALLBACK_LINK 2
+static inline int smc_sock_pop_to_another_acceptq(struct smc_sock *lsmc)
+{
+ struct sock *lsk = lsmc->clcsock->sk;
+ struct inet_connection_sock *icsk = inet_csk(lsk);
+ struct inet_connection_sock *dest_icsk;
+ struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+ struct request_sock_queue *dest_queue;
+ struct request_sock *req;
+ struct sock *dst_sock;
+ int ret;
+
+ req = reqsk_queue_remove(queue, lsk);
+ if (!req)
+ return -EINVAL;
+
+ if (tcp_sk(req->sk)->syn_smc || lsmc->sockopt_defer_accept) {
+ dst_sock = lsmc->actsock->sk;
+ ret = SMC_LINK;
+ } else {
+ dst_sock = lsmc->fbsock->sk;
+ ret = FALLBACK_LINK;
+ }
+
+ dest_icsk = inet_csk(dst_sock);
+ dest_queue = &dest_icsk->icsk_accept_queue;
+
+ spin_lock_bh(&dest_queue->rskq_lock);
+ WRITE_ONCE(req->dl_next, dest_queue->rskq_accept_head);
+ sk_acceptq_added(dst_sock);
+ dest_queue->rskq_accept_head = req;
+ spin_unlock_bh(&dest_queue->rskq_lock);
+ return ret;
+}
+
static void smc_clcsock_data_ready(struct sock *listen_clcsock)
{
struct smc_sock *lsmc;
+ int ret;
read_lock_bh(&listen_clcsock->sk_callback_lock);
lsmc = smc_clcsock_user_data(listen_clcsock);
@@ -2496,14 +2539,41 @@ static void smc_clcsock_data_ready(struct sock *listen_clcsock)
goto out;
lsmc->clcsk_data_ready(listen_clcsock);
if (lsmc->sk.sk_state == SMC_LISTEN) {
- sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */
- if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work)) > - sock_put(&lsmc->sk);
+ ret = smc_sock_pop_to_another_acceptq(lsmc);
+ if (ret == SMC_LINK) {
+ sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */
+ if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work))
+ sock_put(&lsmc->sk);
+ } else if (ret == FALLBACK_LINK) {
+ lsmc->sk.sk_data_ready(&lsmc->sk);
+ }
}
out:
read_unlock_bh(&listen_clcsock->sk_callback_lock);
}
+static void smc_shadow_socket_init(struct socket *sock)
+{
+ struct inet_connection_sock *icsk = inet_csk(sock->sk);
+ struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+
+ tcp_set_state(sock->sk, TCP_LISTEN);
+ sock->sk->sk_ack_backlog = 0;
+
+ inet_csk_delack_init(sock->sk);
+
+ spin_lock_init(&queue->rskq_lock);
+
+ spin_lock_init(&queue->fastopenq.lock);
+ queue->fastopenq.rskq_rst_head = NULL;
+ queue->fastopenq.rskq_rst_tail = NULL;
+ queue->fastopenq.qlen = 0;
+
+ queue->rskq_accept_head = NULL;
+
+ tcp_sk(sock->sk)->syn_smc = 1;
+}
+
static int smc_listen(struct socket *sock, int backlog)
{
struct sock *sk = sock->sk;
@@ -2551,6 +2621,18 @@ static int smc_listen(struct socket *sock, int backlog)
if (smc->limit_smc_hs)
tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested;
+ rc = sock_create_kern(sock_net(sk), PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &smc->fbsock);
+ if (rc)
+ goto out;
+ smc_shadow_socket_init(smc->fbsock);
+
+ rc = sock_create_kern(sock_net(sk), PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &smc->actsock);
+ if (rc)
+ goto out;
+ smc_shadow_socket_init(smc->actsock);
+
rc = kernel_listen(smc->clcsock, backlog);
if (rc) {
write_lock_bh(&smc->clcsock->sk->sk_callback_lock);
@@ -2569,6 +2651,30 @@ static int smc_listen(struct socket *sock, int backlog)
return rc;
}
+static inline bool tcp_reqsk_queue_empty(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+
+ return reqsk_queue_empty(queue);
+}
+
Since this is only used by smc, I'd like to suggest to use
smc_tcp_reqsk_queue_empty instead of tcp_reqsk_queue_empty.
+static inline void
+smc_restore_fbsock_protocol_family(struct socket *new_sock, struct socket *sock)
+{
+ struct smc_sock *lsmc = smc_sk(sock->sk);
+
+ new_sock->sk->sk_data_ready = lsmc->fbsock->sk->sk_data_ready;
+ new_sock->ops = lsmc->fbsock->ops;
+ new_sock->type = lsmc->fbsock->type;
+
+ module_put(sock->ops->owner);
+ __module_get(new_sock->ops->owner);
+
+ if (tcp_sk(new_sock->sk)->syn_smc)
+ pr_err("new sock is not fallback.\n");
+}
+
static int smc_accept(struct socket *sock, struct socket *new_sock,
int flags, bool kern)
{
@@ -2579,6 +2685,18 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
int rc = 0;
lsmc = smc_sk(sk);
+ /* There is a lock in inet_csk_accept, so to make a fast path we do not lock_sock here */
+ if (lsmc->sk.sk_state == SMC_LISTEN && !tcp_reqsk_queue_empty(lsmc->fbsock->sk)) {
+ rc = lsmc->clcsock->ops->accept(lsmc->fbsock, new_sock, O_NONBLOCK, true);
+ if (rc == -EAGAIN)
+ goto normal_path;
+ if (rc < 0)
+ return rc;
+ smc_restore_fbsock_protocol_family(new_sock, sock);
+ return rc;
+ }
+
+normal_path:
sock_hold(sk); /* sock_put below */
lock_sock(sk);
@@ -2593,6 +2711,18 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
add_wait_queue_exclusive(sk_sleep(sk), &wait);
while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
set_current_state(TASK_INTERRUPTIBLE);
+ if (!tcp_reqsk_queue_empty(lsmc->fbsock->sk)) {
+ rc = lsmc->clcsock->ops->accept(lsmc->fbsock, new_sock, O_NONBLOCK, true);
+ if (rc == -EAGAIN)
+ goto next_round;
+ if (rc < 0)
+ break;
+
+ smc_restore_fbsock_protocol_family(new_sock, sock);
+ nsk = new_sock->sk;
+ break;
+ }
+next_round:
if (!timeo) {
rc = -EAGAIN;
break;
@@ -2731,7 +2861,8 @@ static __poll_t smc_accept_poll(struct sock *parent)
__poll_t mask = 0;
spin_lock(&isk->accept_q_lock);
- if (!list_empty(&isk->accept_q))
+ if (!list_empty(&isk->accept_q) ||
+ !reqsk_queue_empty(&inet_csk(isk->fbsock->sk)->icsk_accept_queue))
mask = EPOLLIN | EPOLLRDNORM;
spin_unlock(&isk->accept_q_lock);
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 5ed765ea0c73..9a62c8f37e26 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -241,6 +241,8 @@ struct smc_connection {
struct smc_sock { /* smc sock container */
struct sock sk;
struct socket *clcsock; /* internal tcp socket */
+ struct socket *fbsock; /* socket for fallback connection */
+ struct socket *actsock; /* socket for non-fallback conneciotn */
void (*clcsk_state_change)(struct sock *sk);
/* original stat_change fct. */
void (*clcsk_data_ready)(struct sock *sk);