Hello Wen Gu and community,

our group performed more experiments with SMC-R. The experiments discussed below were performed on two Mellanox-powered (mlx5, ConnectX-5) PCs with the following configuration:

  Kernel 6.5.0-25-generic
  MTU 9000
  net.smc.wmem = $((256*1024))
  net.smc.rmem = $((256*1024))
  net.smc.autocorking_size = 65536
  net.smc.smcr_buf_type = 1
  Bandwidth ~ 3.2GB/s (25.0 Gbit/s)

We modified your server.c (consumer) and client.c (producer) to estimate the throughput and observed that the "msgsize" of the consumer seems to be mainly responsible for the throughput drops, as shown below.

Good cases (server/consumer msgsize <= RMBE/2):
-----------------------------------------------
server: smc_run ./server -p 12345 -m $((128*1024))
client: smc_run ./client -i 192.168.0.2 -p 12345 -m $((128*1024)) -c 1000
Sent 261881856 bytes in 82224.819000 us [3.184939 GB/s]

server: smc_run ./server -p 12345 -m $((128*1024))
client: smc_run ./client -i 192.168.0.2 -p 12345 -m $((256*1024)) -c 1000
Sent 261881856 bytes in 82097.127000 us [3.189892 GB/s]

Bad cases (server/consumer msgsize > RMBE/2):
-----------------------------------------------
server: smc_run ./server -p 12345 -m $((256*1024))
client: smc_run ./client -i 192.168.0.2 -p 12345 -m $((128*1024)) -c 1000
Sent 261881856 bytes in 130970.306000 us [1.999545 GB/s]

server: smc_run ./server -p 12345 -m $((256*1024))
client: smc_run ./client -i 192.168.0.2 -p 12345 -m $((256*1024)) -c 1000
Sent 130940928 bytes in 88172.887000 us [1.485037 GB/s]

Our explanation is that in the "bad cases", producer and consumer act synchronously in the following sense: The producer is sending messages (e.g., msgsize = RMBE on the producer side), and at some point it must wait until the consumer processes some of its RMBE and answers with a CDC message. During this time, the producer is blocked (since the consumer's RMBE is full).
If the consumer processes the entire RMBE (i.e., msgsize = RMBE on the consumer side), it is then also blocked, as there is nothing left to be processed - i.e., it must wait for the producer. We suspect that this (unintended) synchronization leads to the throughput drops.

To force the consumer to process smaller messages, reply faster to the producer (CDC), and still be able to process some remaining data (i.e., to avoid being blocked), we cap the value of len to RMBE/2 in smc_rx_recvmsg:

--- a/net/smc/smc_rx.c 2024-03-25 12:31:32.264614422 +0100
+++ b/net/smc/smc_rx.c 2024-03-25 12:22:31.989913322 +0100
@@ -344,7 +344,7 @@
 int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
                    struct pipe_inode_info *pipe, size_t len, int flags)
 {
-        size_t copylen, read_done = 0, read_remaining = len;
+        size_t copylen, read_remaining, read_done = 0;
         size_t chunk_len, chunk_off, chunk_len_sum;
         struct smc_connection *conn = &smc->conn;
         int (*func)(struct smc_connection *conn);
@@ -363,6 +363,10 @@
         sk = &smc->sk;
         if (sk->sk_state == SMC_LISTEN)
                 return -ENOTCONN;
+
+        len = min_t(size_t, len, conn->rmb_desc->len / 2);
+        read_remaining = len;
+
         if (flags & MSG_OOB)
                 return smc_rx_recv_urg(smc, msg, len, flags);
         timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

We ran qperf experiments (as before) on the standard SMC-R module [std] (kernel 6.5.0-25-generic), Wen Gu’s proposal [wengu] (i.e., setting force = true), and our proposal [our] (i.e., capping len to RMBE/2). The measured throughput is shown in subplots (a) of the attached figures.

Additionally, we traced

  tracepoint:smc:smc_tx_sendmsg{ @tx_ret = lhist(args->len,0,262144,16384); }
  tracepoint:smc:smc_rx_recvmsg{ @rx_ret = lhist(args->len,0,262144,16384); }

and calculated the percentage of rx_ret and tx_ret being greater than RMBE/2 - shown in subplots (b) and (c), respectively. As can be observed, there seems to be a correlation between a drop in throughput and rx_ret being greater than RMBE/2.
This is avoided in our proposal, and full throughput is achieved.

We hope that our analysis and interpretation can help to solve the issue with the throughput drops in SMC-R.

P.S. I would like to acknowledge all individuals from our team who contributed to the analysis of SMC-R (sorted by last name):

  Soumyadeep Debnath
  Andreas Görlitz
  Costin Iordache
  Alexandros Nikolaou
  Maik Riestock
  Ievgen Tatolov

Mit freundlichen Grüßen / Best regards

Andreas Goerlitz (SO/PAF1-Mb)
Bosch Service Solutions Magdeburg GmbH | Otto-von-Guericke-Str. 13 | 39104 Magdeburg | GERMANY | www.boschservicesolutions.com
Andreas.Goerlitz@xxxxxxxxxxxx

Sitz: Magdeburg, Registergericht: Amtsgericht Stendal, HRB 24039
Geschäftsführung: Robert Mulatz, Georg Wessels
#include <stdio.h> #include <string.h> #include <stdlib.h> #include <unistd.h> #include <arpa/inet.h> #include <sys/socket.h> #include <netinet/in.h> #include <stdbool.h> #include <errno.h> #include <netinet/tcp.h> #include <time.h> #ifndef AF_SMC #define AF_SMC 43 #endif #define NET_PROTOCAL AF_INET #define SERV_IP "11.213.5.33" #define SERV_PORT 10012 #define BUF_SIZE (5 * 128 * 1024) int stream_send(int fd, char *buf, int msgsize) { int n = msgsize; while (n) { int i = write(fd, buf, n); if (i < 0) return i; buf += i; n -= i; if (i == 0) break; } return msgsize-n; } int net_clnt(char *ip, int port, int msgsize, int msgcnt) { struct timespec start, end; double elapsed_us = 0.0; double elapsed_us_sum = 0.0; int sent; long num_bytes = 0; double gb_s; if (!ip) ip = SERV_IP; if (!port) port = SERV_PORT; int sock = socket(NET_PROTOCAL, SOCK_STREAM, 0); struct sockaddr_in s_addr; memset(&s_addr, 0, sizeof(s_addr)); s_addr.sin_family = NET_PROTOCAL; s_addr.sin_addr.s_addr = inet_addr(ip); s_addr.sin_port = htons(port); if (connect(sock, (struct sockaddr*)&s_addr, sizeof(s_addr))){ printf("connect fail\n"); return 0; } char *buf = (char *)malloc(sizeof(char) * BUF_SIZE); while (--msgcnt) { if (msgsize > BUF_SIZE) break; printf("Send msgsize: %d\n", msgsize); clock_gettime(CLOCK_MONOTONIC, &start); sent = stream_send(sock, buf, msgsize); clock_gettime(CLOCK_MONOTONIC, &end); if (send <= 0) { printf("Error send %d\n", sent); break; } elapsed_us = (end.tv_sec - start.tv_sec)*1000000.0; elapsed_us += (end.tv_nsec - start.tv_nsec) / 1000.0; elapsed_us_sum += elapsed_us; num_bytes += sent; } close(sock); gb_s = (num_bytes/1000) / elapsed_us_sum; printf("Sent %ld bytes in %f us [%f GB/s]\n", num_bytes, elapsed_us_sum, gb_s); return 0; } int main(int argc, char **argv){ int msgsize = BUF_SIZE, msgcnt = 10; char *ip = NULL; bool wrong_param = false; int port = 0; int c; while(!wrong_param && (-1 != (c = getopt(argc, argv, "i:p:m:c:")))) { switch (c) { case 'i': ip = optarg; 
break; case 'p': port = atoi(optarg); break; case 'm': msgsize = atoi(optarg); break; case 'c': msgcnt = atoi(optarg); break; case '?': printf("usage: ./client -i <ip> -p <port> -m <msgsize> -c <cnt>\n"); wrong_param = true; break; } } if (!wrong_param) net_clnt(ip, port, msgsize, msgcnt); return 0; }
Attachment:
results_our.png
Description: results_our.png
Attachment:
results_std.png
Description: results_std.png
Attachment:
results_wengu.png
Description: results_wengu.png
#include <stdio.h> #include <string.h> #include <stdlib.h> #include <unistd.h> #include <arpa/inet.h> #include <sys/socket.h> #include <netinet/in.h> #include <errno.h> #include <stdbool.h> #include <netinet/tcp.h> #include <pthread.h> #ifndef AF_SMC #define AF_SMC 43 #endif #define NET_PROTOCAL AF_INET #define SERV_IP INADDR_ANY #define SERV_PORT 10012 #define BUF_SIZE (5 * 128 * 1024) int stream_recv(int fd, char *buf, int msgsize) { int n = msgsize; while (n) { int i = read(fd, buf, n); if (i < 0) return i; buf += i; n -= i; if (i == 0) break; //printf("Successfully recv %d B message\n", i); } return msgsize-n; } int net_serv(int port, int msgsize) { int recv; if (!port) port = SERV_PORT; int l_sock = socket(NET_PROTOCAL, SOCK_STREAM, 0); struct sockaddr_in s_addr; memset(&s_addr, 0, sizeof(struct sockaddr_in)); s_addr.sin_family = NET_PROTOCAL; s_addr.sin_addr.s_addr = SERV_IP; s_addr.sin_port = htons(port); // bind listen socket if (bind(l_sock, (struct sockaddr*)&s_addr, sizeof(s_addr))) { printf("bind listen socket error %d\n", errno); return 0; } // listen if (listen(l_sock, 20)) { printf("listen error\n"); return 0; } struct sockaddr_in c_addr; socklen_t c_addr_len = sizeof(c_addr); int s_sock = accept(l_sock, (struct sockaddr*)&c_addr, &c_addr_len); if (s_sock < 0) { printf("accept fail\n"); return 0; } else { char ip[16] = { 0 }; inet_ntop(NET_PROTOCAL, &(c_addr.sin_addr), ip, INET_ADDRSTRLEN); printf("accept connection: ip %s port %d\n", ip, c_addr.sin_port); } char *buf = (char *)malloc(sizeof(char) * BUF_SIZE); while (1) { if (msgsize > BUF_SIZE) break; //printf("Recv msgsize: %d\n", msgsize); recv = stream_recv(s_sock, buf, msgsize); if (recv <= 0) { if (recv) printf("Error recv %d\n", recv); break; } } printf("done\n"); close(s_sock); close(l_sock); return 0; } int main(int argc, char **argv) { bool wrong_param = false; int msgsize = BUF_SIZE; int port = 0; int c; while(!wrong_param && (-1 != (c = getopt(argc, argv, "p:m:")))) { switch (c) { case 
'p': port = atoi(optarg); break; case 'm': msgsize = atoi(optarg); break; case '?': printf("usage: ./server -p <port> -m <msgsize>\n"); wrong_param = true; break; } } if (!wrong_param) net_serv(port, msgsize); return 0; }