On 12/21/21 15:35, Pavel Begunkov wrote:
Update on io_uring zerocopy tx, still RFC. For v1 and design notes see
[...]
Benchmarks for dummy netdev, UDP/IPv4, payload size=4096:
-n<N> is how many requests we submit per syscall. From io_uring perspective -n1
is wasteful and far from optimal, but included for comparison.
-z0 disables zerocopy, just normal io_uring send requests
-f makes to flush "buffer free" notifications for every request
[...]> https://github.com/isilence/linux/tree/zc_v2
https://github.com/isilence/liburing/tree/zc_v2
The Benchmark is <liburing>/test/send-zc,
send-zc [-f] [-n<N>] [-z0] -s<payload size> -D<dst ip> (-6|-4) [-t<sec>] udp
Attaching a standalone send-zc for convenience.
gcc -luring -O2 -o send-zc ./send-zc.c
--
Pavel Begunkov
/* SPDX-License-Identifier: MIT */
/* based on linux-kernel/tools/testing/selftests/net/msg_zerocopy.c */
/* gcc -luring -O2 -o send-zc ./send-zc.c */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <errno.h>
#include <error.h>
#include <limits.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdbool.h>
#include <string.h>
#include <arpa/inet.h>
#include <linux/errqueue.h>
#include <linux/if_packet.h>
#include <linux/ipv6.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/ip.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/un.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/syscall.h>
#include <liburing.h>
enum {
__IORING_OP_SENDZC = 40,
__IORING_SENDZC_FLUSH = (1U << 0),
__IORING_SENDZC_FIXED_BUF = (1U << 1),
__IORING_REGISTER_TX_CTX = 20,
__IORING_UNREGISTER_TX_CTX = 21,
};
struct __io_uring_tx_ctx_register {
__u64 tag;
};
#ifndef SO_ZEROCOPY
#define SO_ZEROCOPY 60
#endif
#define ZC_TAG 0xfffffffULL
static bool fixed_files;
static bool zc;
static bool flush;
static int nr_reqs;
static bool fixed_buf;
static int cfg_family = PF_UNSPEC;
static int cfg_payload_len;
static int cfg_port = 8000;
static int cfg_runtime_ms = 4200;
static socklen_t cfg_alen;
static struct sockaddr_storage cfg_dst_addr;
static char payload[IP_MAXPACKET] __attribute__((aligned(4096)));
static inline int ____sys_io_uring_register(int fd, unsigned opcode,
const void *arg, unsigned nr_args)
{
int ret;
ret = syscall(__NR_io_uring_register, fd, opcode, arg, nr_args);
return (ret < 0) ? -errno : ret;
}
static unsigned long gettimeofday_ms(void)
{
struct timeval tv;
gettimeofday(&tv, NULL);
return (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
}
static void do_setsockopt(int fd, int level, int optname, int val)
{
if (setsockopt(fd, level, optname, &val, sizeof(val)))
error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
}
static void setup_sockaddr(int domain, const char *str_addr,
struct sockaddr_storage *sockaddr)
{
struct sockaddr_in6 *addr6 = (void *) sockaddr;
struct sockaddr_in *addr4 = (void *) sockaddr;
switch (domain) {
case PF_INET:
memset(addr4, 0, sizeof(*addr4));
addr4->sin_family = AF_INET;
addr4->sin_port = htons(cfg_port);
if (str_addr &&
inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1)
error(1, 0, "ipv4 parse error: %s", str_addr);
break;
case PF_INET6:
memset(addr6, 0, sizeof(*addr6));
addr6->sin6_family = AF_INET6;
addr6->sin6_port = htons(cfg_port);
if (str_addr &&
inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1)
error(1, 0, "ipv6 parse error: %s", str_addr);
break;
default:
error(1, 0, "illegal domain");
}
}
static int do_setup_tx(int domain, int type, int protocol)
{
int fd;
fd = socket(domain, type, protocol);
if (fd == -1)
error(1, errno, "socket t");
do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21);
do_setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, 1);
if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
error(1, errno, "connect");
return fd;
}
static inline struct io_uring_cqe *wait_cqe_fast(struct io_uring *ring)
{
struct io_uring_cqe *cqe;
unsigned head;
int ret;
io_uring_for_each_cqe(ring, head, cqe)
return cqe;
ret = io_uring_wait_cqe(ring, &cqe);
if (ret)
error(1, ret, "wait cqe");
return cqe;
}
static void do_tx(int domain, int type, int protocol)
{
unsigned long packets = 0, bytes = 0;
struct io_uring ring;
struct iovec iov;
uint64_t tstop;
int i, fd, ret;
int compl_cqes = 0;
fd = do_setup_tx(domain, type, protocol);
ret = io_uring_queue_init(512, &ring, 0);
if (ret)
error(1, ret, "io_uring: queue init");
struct __io_uring_tx_ctx_register r = { .tag = ZC_TAG, };
ret = ____sys_io_uring_register(ring.ring_fd, __IORING_REGISTER_TX_CTX, (void *)&r, 1);
if (ret)
error(1, ret, "io_uring: tx ctx registration");
ret = io_uring_register_files(&ring, &fd, 1);
if (ret < 0)
error(1, ret, "io_uring: files registration");
iov.iov_base = payload;
iov.iov_len = cfg_payload_len;
ret = io_uring_register_buffers(&ring, &iov, 1);
if (ret < 0)
error(1, ret, "io_uring: buffer registration");
tstop = gettimeofday_ms() + cfg_runtime_ms;
do {
struct io_uring_sqe *sqe;
struct io_uring_cqe *cqe;
compl_cqes += flush ? nr_reqs : 0;
for (i = 0; i < nr_reqs; i++) {
sqe = io_uring_get_sqe(&ring);
io_uring_prep_send(sqe, fd, payload, cfg_payload_len, 0);
sqe->user_data = 1;
if (fixed_files) {
sqe->fd = 0;
sqe->flags = IOSQE_FIXED_FILE;
}
if (zc) {
sqe->opcode = __IORING_OP_SENDZC;
sqe->file_index = 0; // sqe->tx_ctx_idx = 0;
sqe->ioprio = 0;
sqe->off = 0;
sqe->__pad2[0] = 0;
if (flush)
sqe->ioprio |= __IORING_SENDZC_FLUSH;
if (fixed_buf) {
sqe->ioprio |= __IORING_SENDZC_FIXED_BUF;
sqe->buf_index = 0;
}
}
}
ret = io_uring_submit(&ring);
if (ret != nr_reqs)
error(1, ret, "submit");
for (i = 0; i < nr_reqs; i++) {
cqe = wait_cqe_fast(&ring);
if (cqe->user_data == ZC_TAG) {
compl_cqes--;
i--;
} else if (cqe->user_data == 1) {
if (cqe->res <= 0)
error(1, cqe->res, "send failed");
packets++;
bytes += cqe->res;
} else {
error(1, cqe->user_data, "invalid user_data");
}
io_uring_cqe_seen(&ring, cqe);
}
} while (gettimeofday_ms() < tstop);
if (close(fd))
error(1, errno, "close");
fprintf(stderr, "tx=%lu (MB=%lu), tx/s=%lu (MB/s=%lu)\n",
packets, bytes >> 20,
packets / (cfg_runtime_ms / 1000),
(bytes >> 20) / (cfg_runtime_ms / 1000));
while (compl_cqes) {
struct io_uring_cqe *cqe = wait_cqe_fast(&ring);
io_uring_cqe_seen(&ring, cqe);
compl_cqes--;
}
ret = ____sys_io_uring_register(ring.ring_fd, __IORING_UNREGISTER_TX_CTX,
NULL, 0);
if (ret)
error(1, ret, "io_uring: tx ctx unregistration");
io_uring_queue_exit(&ring);
}
static void do_test(int domain, int type, int protocol)
{
int i;
for (i = 0; i < IP_MAXPACKET; i++)
payload[i] = 'a' + (i % 26);
do_tx(domain, type, protocol);
}
static void usage(const char *filepath)
{
error(1, 0, "Usage: %s [-f] [-n<N>] [-z0] [-s<payload size>] "
"(-4|-6) [-t<time s>] -D<dst_ip> udp", filepath);
}
static void parse_opts(int argc, char **argv)
{
const int max_payload_len = sizeof(payload) -
sizeof(struct ipv6hdr) -
sizeof(struct tcphdr) -
40 /* max tcp options */;
int c;
char *daddr = NULL;
if (argc <= 1)
usage(argv[0]);
cfg_payload_len = max_payload_len;
fixed_files = 1;
zc = 1;
flush = 0 && zc;
nr_reqs = 8;
fixed_buf = 1 && zc;
while ((c = getopt(argc, argv, "46D:i:p:s:t:n:r:fz:h")) != -1) {
switch (c) {
case '4':
if (cfg_family != PF_UNSPEC)
error(1, 0, "Pass one of -4 or -6");
cfg_family = PF_INET;
cfg_alen = sizeof(struct sockaddr_in);
break;
case '6':
if (cfg_family != PF_UNSPEC)
error(1, 0, "Pass one of -4 or -6");
cfg_family = PF_INET6;
cfg_alen = sizeof(struct sockaddr_in6);
break;
case 'D':
daddr = optarg;
break;
case 'p':
cfg_port = strtoul(optarg, NULL, 0);
break;
case 's':
cfg_payload_len = strtoul(optarg, NULL, 0);
break;
case 't':
cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000;
break;
case 'n':
case 'r':
nr_reqs = strtoul(optarg, NULL, 0);
break;
case 'f':
flush = 1;
break;
case 'z':
zc = strtoul(optarg, NULL, 0);
break;
}
}
if (flush && !zc)
error(1, 0, "Flush should be used with zc only");
setup_sockaddr(cfg_family, daddr, &cfg_dst_addr);
if (cfg_payload_len > max_payload_len)
error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
if (optind != argc - 1)
usage(argv[0]);
}
int main(int argc, char **argv)
{
const char *cfg_test;
parse_opts(argc, argv);
cfg_test = argv[argc - 1];
if (!strcmp(cfg_test, "tcp"))
do_test(cfg_family, SOCK_STREAM, 0);
else if (!strcmp(cfg_test, "udp"))
do_test(cfg_family, SOCK_DGRAM, 0);
else
error(1, 0, "unknown cfg_test %s", cfg_test);
return 0;
}