I'm seeing peculiar TCP throughput behavior in Linux 2.4 that I can't explain. I wonder if someone can help me with the missing insight. I have a throughput benchmark that measures the time to send data from one process to another over TCP. I have applied it to a pair of 600MHz PIIIs running 2.4.0-test11. They are connected by 100Mb/s switched ethernet. Ordinarily, when I send 8MB in 16 byte block sizes, I get about 52 Mb/s. However, if I exchange one 4 byte block over the connection (in each direction) before I start timing the 8MB transfer, it goes up to about 57 Mb/s. (By "block size", I mean the size of the buffer passed to write.) Does anyone have any idea how the initial small transfer can have such a large influence on overall throughput? Below is a copy of `tp', the benchmark program (apologies for its length), and instructions for using it to reproduce my results. Thanks, Vic Zandy Use tp to measure throughput of 8MB in 16 byte blocks: 1. On host A, start tp as a server (-s) with timing enabled (-t). It will print the port number on which it is listening. A% tp -s -t Server listening on port 1033 2. On host B, start tp as a client. Specify the server host (-h) and port (-p), the total data transfer size (-n), the block size (-z), and enable timing (-t). B% tp -h A -p 1033 -n 8M -z 16 -t After the tp client transfers its entire load, each program will print the observed throughput on its end of the connection. The client will exit and the server will wait for a new client. 
/*
 * Sample session from the original posting (the client needs -t to print
 * its timing line):
 *
 *   A% tp -s -t
 *   Server listening on port 1033
 *    52.0 Mbits/sec 986 bytes/read 8388608 bytes total
 *
 *   B% tp -h A -p 1033 -n 8M -z 16 -t
 *   16 bytes/write: 1230620 usec, 52.0 Mb/sec
 *
 * To re-run the measurement with the initial 4 byte exchange, add (-f) to
 * the client and server tp invocations:
 *
 *   A% tp -s -t -f
 *   Server listening on port 1034
 *    56.9 Mbits/sec 1105 bytes/read 8388608 bytes total
 *
 *   B% tp -h A -p 1034 -n 8M -z 16 -f -t
 *   16 bytes/write: 1124185 usec, 56.9 Mb/sec
 */

/* tp.c: gcc -Wall -O2 -o tp tp.c */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/socket.h>
#include <sys/select.h>
#include <netdb.h>
#include <sys/time.h>
#include <assert.h>
#include <errno.h>

/* Non-zero enables progress dots and EOF notices on stdout. */
static int DEBUG = 0;

#define DEFAULT_BUFFER_SIZE (8*1024*1024)

/* Shared I/O buffer; allocated by main and used by the read/write loops. */
static unsigned char *buffer;

#define MIN(x,y) ((x) < (y) ? (x) : (y))

/*
 * Convert BYTES transferred in the interval TV into Mbits/sec, where one
 * Mbit is 1024*1024 bits.  The arithmetic is done in double so that
 * BYTES * 8 cannot wrap and long intervals cannot overflow.  Returns 0.0
 * for a zero (or negative) interval instead of dividing by zero.
 */
static double mbps(unsigned bytes, const struct timeval *tv)
{
    double usec = (double)tv->tv_sec * 1000000.0 + (double)tv->tv_usec;
    double bits = (double)bytes * 8.0;

    if (usec <= 0.0)
        return 0.0;

    return bits / usec              /* bits/usec */
           * 1000000.0              /* bits/sec */
           / (1024.0 * 1024.0);     /* Mbits/sec */
}

/*
 * Resolve HOSTNAME to an IPv4 address in network byte order.
 * Prints a message and exits with status 1 if the lookup fails.
 */
static struct in_addr hostname_to_addr(const char *hostname)
{
    struct hostent *h;
    struct in_addr addr;

    h = gethostbyname(hostname);
    if (!h || h->h_addrtype != AF_INET
        || h->h_length != (int)sizeof(addr) || !h->h_addr_list[0]) {
        fprintf(stdout, "Host lookup failed: %s\n", hostname);
        exit(1);
    }
    /* memcpy avoids assuming the resolver's buffer is suitably aligned. */
    memcpy(&addr, h->h_addr_list[0], sizeof(addr));
    return addr;
}

/*
 * Read exactly LEN bytes from SD into BUF, retrying after EINTR and
 * short reads.  Returns LEN on success, or -1 on error or if EOF is
 * reached before LEN bytes have arrived.
 */
static int xread(int sd, void *buf, size_t len)
{
    char *p = (char *)buf;
    size_t nrecv = 0;
    ssize_t rv;

    while (nrecv < len) {
        rv = read(sd, p, len - nrecv);
        if (0 > rv && errno == EINTR)
            continue;
        if (0 >= rv)
            return -1;
        nrecv += (size_t)rv;
        p += rv;
    }
    return (int)nrecv;
}

/*
 * Write exactly LEN bytes from BUF to SD, retrying after EINTR and
 * short writes.  Returns LEN on success, or -1 on error.
 */
static int xwrite(int sd, const void *buf, size_t len)
{
    const char *p = (const char *)buf;
    size_t nsent = 0;
    ssize_t rv;

    while (nsent < len) {
        rv = write(sd, p, len - nsent);
        if (0 > rv && errno == EINTR)
            continue;
        if (0 > rv)
            return -1;
        nsent += (size_t)rv;
        p += rv;
    }
    return (int)nsent;
}

/* c = a - b, normalised so that 0 <= c->tv_usec < 1000000. */
static void tv_diff(const struct timeval *a,
                    const struct timeval *b,
                    struct timeval *c)
{
    c->tv_sec = a->tv_sec - b->tv_sec;
    c->tv_usec = a->tv_usec - b->tv_usec;
    if (c->tv_usec < 0) {
        c->tv_sec -= 1;
        c->tv_usec += 1000000;
    }
}

/*
 * Read and discard everything that arrives on SOCK until EOF, using
 * reads of at most BUFLEN bytes into the shared buffer.
 *
 * If TV is non-null, timing starts once select() reports the first data
 * as readable; at EOF the elapsed time is stored in TV and a throughput
 * line is printed (only when at least one byte was received).
 *
 * Returns -1 if select() fails, otherwise 0.
 * NOTE(review): a read() error also returns 0, so the caller's server
 * loop moves on to the next client; confirm that this is intended.
 */
static int do_discard(int sock, int buflen, struct timeval *tv)
{
    ssize_t rv;
    unsigned nread = 0;
    unsigned loops = 0;
    struct timeval start = { 0, 0 };
    struct timeval end;

    if (tv) {
        fd_set fds;

        /* Wait for data to be ready */
        FD_ZERO(&fds);
        FD_SET(sock, &fds);
        if (0 > select(sock + 1, &fds, NULL, NULL, NULL)) {
            perror("select");
            return -1;
        }
        gettimeofday(&start, NULL);
    }

    for (;;) {
        rv = read(sock, buffer, buflen);
        if (0 > rv) {
            if (errno == EINTR)
                continue;   /* interrupted, not a real failure */
            perror("read");
            return 0;
        }
        if (0 == rv) {
            if (DEBUG) {
                fprintf(stdout, "EOF\n");
                fflush(stdout);
            }
            break;
        }
        nread += (unsigned)rv;
        loops++;
        if (DEBUG) {
            fprintf(stdout, ".");
            fflush(stdout);
        }
    }

    if (tv && nread > 0) {
        gettimeofday(&end, NULL);
        tv_diff(&end, &start, tv);
        printf(" %6.1f Mbits/sec %6.0f bytes/read %u bytes total\n",
               mbps(nread, tv), (double)nread / loops, nread);
        fflush(stdout);
    }
    return 0;
}

/*
 * Connect to HOST:PORT and return the new socket, or -1 on failure.
 * PORT is in host byte order.  If TV is non-null, return in TV the wall
 * time for establishing the connection.  The socket is closed again if
 * connect() fails.
 */
static int do_connect(const char *host, short port, struct timeval *tv)
{
    int sd;
    struct sockaddr_in addr;
    struct timeval s = { 0, 0 };
    struct timeval e;
    int rv, ern;

    sd = socket(AF_INET, SOCK_STREAM, 0);
    if (0 > sd) {
        perror("socket");
        return -1;
    }

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr = hostname_to_addr(host);
    addr.sin_port = htons((unsigned short)port);

    if (tv)
        gettimeofday(&s, NULL);
    rv = connect(sd, (struct sockaddr *)&addr, sizeof(addr));
    ern = errno;    /* gettimeofday/close below may clobber errno */
    if (tv) {
        gettimeofday(&e, NULL);
        tv_diff(&e, &s, tv);
    }
    if (0 > rv) {
        close(sd);
        errno = ern;
        perror("connect");
        return -1;
    }
    return sd;
}

/*
 * Create a TCP socket bound to PORT on any local address and start
 * listening on it.  PORT is in network order; 0 lets the system choose.
 * On success the bound address (including the actual port) is stored in
 * ADDR, the port is announced on stderr, and the listening socket is
 * returned.  Returns -1 on failure, closing the socket first.
 */
static int do_lb(short port, struct sockaddr_in *addr)
{
    int sd;
    socklen_t len;

    sd = socket(AF_INET, SOCK_STREAM, 0);
    if (0 > sd) {
        perror("socket");
        return -1;
    }

    memset(addr, 0, sizeof(*addr));
    addr->sin_family = AF_INET;
    addr->sin_addr.s_addr = htonl(INADDR_ANY);
    addr->sin_port = port;

    if (0 > bind(sd, (struct sockaddr *)addr, sizeof(*addr))) {
        perror("bind");
        close(sd);
        return -1;
    }

    /* Size of the structure, not of the pointer to it. */
    len = sizeof(*addr);
    if (0 > getsockname(sd, (struct sockaddr *)addr, &len)) {
        perror("getsockname");
        close(sd);
        return -1;
    }

    if (0 > listen(sd, 1)) {
        perror("listen");
        close(sd);
        return -1;
    }

    fprintf(stderr, "Server listening on port %d\n", ntohs(addr->sin_port));
    return sd;
}

/*
 * Accept one connection on the listening socket SERV.
 * Returns the connected socket, or -1 on failure.
 */
static int do_accept(int serv)
{
    int sock;
    struct sockaddr_in addr;
    socklen_t len = sizeof(addr);

    sock = accept(serv, (struct sockaddr *)&addr, &len);
    if (0 > sock) {
        perror("accept");
        return -1;
    }
    return sock;
}

/*
 * Write N bytes from the shared buffer to SOCK in blocks of at most
 * BUFLEN bytes.  If TV is non-null, store the elapsed wall time in TV
 * and print a throughput line.  Returns 0 on success, -1 on a write
 * error or if BUFLEN is not positive (which could never make progress).
 */
static int do_one_way_write(int sock, const int buflen, int n,
                            struct timeval *tv)
{
    struct timeval s = { 0, 0 };
    struct timeval e;
    int todo;

    if (buflen <= 0) {
        fprintf(stderr, "write: invalid block size %d\n", buflen);
        return -1;
    }

    if (tv)
        gettimeofday(&s, NULL);

    todo = n;
    while (todo > 0) {
        int chunk = MIN(buflen, todo);

        if (0 > xwrite(sock, buffer, (size_t)chunk)) {
            perror("write");
            return -1;
        }
        if (DEBUG) {
            fprintf(stdout, ".");
            fflush(stdout);
        }
        todo -= chunk;
    }

    if (tv) {
        gettimeofday(&e, NULL);
        tv_diff(&e, &s, tv);
        printf("%10d bytes/write: %10ld usec, %6.1f Mb/sec\n",
               buflen,
               (long)tv->tv_sec * 1000000 + (long)tv->tv_usec,
               mbps((unsigned)n, tv));
    }
    return 0;
}
static void usage_and_exit(int e) { fprintf(stdout, "Usage: sock [switches] [mode]\n"); fprintf(stdout, " Switches:\n"); fprintf(stdout, " -s Run as a server\n"); fprintf(stdout, " -h <host> (Clients) Server host (default localhost)\n"); fprintf(stdout, " -p <port> (Clients) Server port\n"); fprintf(stdout, " -d Print debugging information\n"); fprintf(stdout, " -f Futz with the connection\n"); fprintf(stdout, " -t Print timing statistics\n"); fprintf(stdout, " -n <bytes> Number of bytes to transfer\n"); fprintf(stdout, " -z <bytes> Set network I/O buffer size\n"); fprintf(stdout, " <bytes> may include suffix of `K' or `M'\n"); exit(e); } static unsigned long parsebytes(char *p) { unsigned long l, mul; char *q; /* 100, 100K, 100k, 100M, 100m */ l = strtoul(p, &q, 10); if (!strlen(q)) return l; if (strlen(q) > 1) goto err; if (*q == 'k' || *q == 'K') mul = 1024; else if (*q == 'm' || *q == 'M') mul = 1024 * 1024; else goto err; return l * mul; err: fprintf(stderr, "Bad byte count specification %s, using %ld\n", p, l); return l; } static int do_futz(int sock) { unsigned long a; int futz_factor = 1; int i; for (i = 0; i < futz_factor; i++) { if (0 > xwrite(sock, &a, sizeof(a))) { perror("do_futz"); exit(1); } } for (i = 0; i < futz_factor; i++) { if (0 > xread(sock, &a, sizeof(a))) { perror("do_futz"); exit(1); } } return 0; } int main(int argc, char *argv[]) { int c; int sock, serv; struct timeval tv; struct sockaddr_in addr; char *host = "localhost"; /* Host for client connections */ int port = 0; /* Port for client connections */ int n = 64 * 1024 * 1024; /* Bytes to xfer */ int sz = DEFAULT_BUFFER_SIZE; /* Buffer size */ int server = 0; /* Server or client? */ int timing = 0; /* Collect timing statistics? 
*/ int futz = 0; /* Drop 8 bytes into the stream before timing */ opterr = 1; optind = 0; while (EOF != (c = getopt(argc, argv, "fsp:h:dtz:n:"))) switch (c) { case 'f': futz = 1; break; case 's': server = 1; break; case 'p': port = atoi(optarg); break; case 'h': host = optarg; break; case 'd': DEBUG++; break; case 't': timing = 1; break; case 'z': sz = atoi(optarg); break; case 'n': n = parsebytes(optarg); break; case '?': usage_and_exit(1); break; } if (!server && (!host || !port)) usage_and_exit(1); buffer = (unsigned char*) malloc(sz); if (!buffer) { fprintf(stdout, "Out of memory.\n"); exit(1); } bzero(buffer, sz); if (server) { serv = do_lb(htons(0), &addr); if (0 > serv) { fprintf(stdout, "Failed to create listener\n"); exit(1); } while (1) { sock = do_accept(serv); if (0 > sock) { fprintf(stdout, "Failed to accept client\n"); exit(1); } if (futz) do_futz(sock); if (0 > do_discard(sock, sz, timing ? &tv : NULL)) { fprintf(stdout, "Server loop failed\n"); exit(1); } close(sock); } } else { sock = do_connect(host, port, timing ? &tv : NULL); if (0 > sock) { fprintf(stdout, "Client connect failed\n"); exit(1); } if (futz) do_futz(sock); sleep(1); /* Give server time to catch up before timing */ if (0 > do_one_way_write(sock, sz, n, timing ? &tv : NULL)) { fprintf(stdout, "Client loop failed\n"); exit(1); } close(sock); } free(buffer); return 0; } /* End of tp.c */ - : send the line "unsubscribe linux-net" in the body of a message to majordomo@vger.kernel.org