On Thu, Jan 30, 2025 at 3:05 PM Stanislav Fomichev <stfomichev@xxxxxxxxx> wrote: > > On 01/30, Mina Almasry wrote: > > Add support for devmem TX in ncdevmem. > > > > This is a combination of the ncdevmem from the devmem TCP series RFCv1 > > which included the TX path, and work by Stan to include the netlink API > > and refactored on top of his generic memory_provider support. > > > > Signed-off-by: Mina Almasry <almasrymina@xxxxxxxxxx> > > Signed-off-by: Stanislav Fomichev <sdf@xxxxxxxxxxx> > > > > --- > > > > v2: > > - make errors a static variable so that we catch instances where there > > are less than 20 errors across different buffers. > > - Fix the issue where the seed is reset to 0 instead of its starting > > value 1. > > - Use 1000ULL instead of 1000 to guard against overflow (Willem). > > - Do not set POLLERR (Willem). > > - Update the test to use the new interface where iov_base is the > > dmabuf_offset. > > - Update the test to send 2 iov instead of 1, so we get some test > > coverage over sending multiple iovs at once. > > - Print the ifindex the test is using, useful for debugging issues where > > maybe the test may fail because the ifindex of the socket is different > > from the dmabuf binding. > > --- > > .../selftests/drivers/net/hw/ncdevmem.c | 276 +++++++++++++++++- > > 1 file changed, 272 insertions(+), 4 deletions(-) > > > > diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c > > index 19a6969643f4..8455f19ecd1a 100644 > > --- a/tools/testing/selftests/drivers/net/hw/ncdevmem.c > > +++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c > > @@ -40,15 +40,18 @@ > > #include <fcntl.h> > > #include <malloc.h> > > #include <error.h> > > +#include <poll.h> > > > > #include <arpa/inet.h> > > #include <sys/socket.h> > > #include <sys/mman.h> > > #include <sys/ioctl.h> > > #include <sys/syscall.h> > > +#include <sys/time.h> > > > > #include <linux/memfd.h> > > #include <linux/dma-buf.h> > > +#include <linux/errqueue.h> > > #include <linux/udmabuf.h> > > #include <libmnl/libmnl.h> > > #include <linux/types.h> > > @@ -80,6 +83,8 @@ static int num_queues = -1; > > static char *ifname; > > static unsigned int ifindex; > > static unsigned int dmabuf_id; > > +static uint32_t tx_dmabuf_id; > > +static int waittime_ms = 500; > > > > struct memory_buffer { > > int fd; > > @@ -93,6 +98,8 @@ struct memory_buffer { > > struct memory_provider { > > struct memory_buffer *(*alloc)(size_t size); > > void (*free)(struct memory_buffer *ctx); > > + void (*memcpy_to_device)(struct memory_buffer *dst, size_t off, > > + void *src, int n); > > void (*memcpy_from_device)(void *dst, struct memory_buffer *src, > > size_t off, int n); > > }; > > @@ -153,6 +160,20 @@ static void udmabuf_free(struct memory_buffer *ctx) > > free(ctx); > > } > > > > +static void udmabuf_memcpy_to_device(struct memory_buffer *dst, size_t off, > > + void *src, int n) > > +{ > > + struct dma_buf_sync sync = {}; > > + > > + sync.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE; > > + ioctl(dst->fd, DMA_BUF_IOCTL_SYNC, &sync); > > + > > + memcpy(dst->buf_mem + off, src, n); > > + > > + sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE; > > + ioctl(dst->fd, DMA_BUF_IOCTL_SYNC, &sync); > > +} > > + > > static void udmabuf_memcpy_from_device(void *dst, struct memory_buffer *src, > > size_t off, int n) > > { > > @@ -170,6 +191,7 @@ static void udmabuf_memcpy_from_device(void *dst, struct memory_buffer *src, > > static struct memory_provider udmabuf_memory_provider = { > > .alloc = udmabuf_alloc, > > .free = udmabuf_free, > > + .memcpy_to_device = udmabuf_memcpy_to_device, > > .memcpy_from_device = udmabuf_memcpy_from_device, > > }; > > > > @@ -188,7 +210,7 @@ void validate_buffer(void *line, size_t size) > > { > > static unsigned char seed = 1; > > unsigned char *ptr = line; > > - int errors = 0; > > + static int errors; > > size_t i; > > > > for (i = 0; i < size; i++) { > > @@ -202,7 +224,7 @@ void validate_buffer(void *line, size_t size) > > } > > seed++; > > if (seed == do_validation) > > - seed = 0; > > + seed = 1; > > } > > > > fprintf(stdout, "Validated buffer\n"); > > @@ -394,6 +416,49 @@ static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd, > > return -1; > > } > > > > +static int bind_tx_queue(unsigned int ifindex, unsigned int dmabuf_fd, > > + struct ynl_sock **ys) > > +{ > > + struct netdev_bind_tx_req *req = NULL; > > + struct netdev_bind_tx_rsp *rsp = NULL; > > + struct ynl_error yerr; > > + > > + *ys = ynl_sock_create(&ynl_netdev_family, &yerr); > > + if (!*ys) { > > + fprintf(stderr, "YNL: %s\n", yerr.msg); > > + return -1; > > + } > > + > > + req = netdev_bind_tx_req_alloc(); > > + netdev_bind_tx_req_set_ifindex(req, ifindex); > > + netdev_bind_tx_req_set_fd(req, dmabuf_fd); > > + > > + rsp = netdev_bind_tx(*ys, req); > > + if (!rsp) { > > + perror("netdev_bind_tx"); > > + goto err_close; > > + } > > + > > + if (!rsp->_present.id) { > > + perror("id not present"); > > + goto err_close; > > + } > > + > > + fprintf(stderr, "got tx dmabuf id=%d\n", rsp->id); > > + tx_dmabuf_id = rsp->id; > > + > > + netdev_bind_tx_req_free(req); > > + netdev_bind_tx_rsp_free(rsp); > > + > > + return 0; > > + > > +err_close: > > + fprintf(stderr, "YNL failed: %s\n", (*ys)->err.msg); > > + netdev_bind_tx_req_free(req); > > + ynl_sock_destroy(*ys); > > + return -1; > > +} > > + > > static void enable_reuseaddr(int fd) > > { > > int opt = 1; > > @@ -432,7 +497,7 @@ static int parse_address(const char *str, int port, struct sockaddr_in6 *sin6) > > return 0; > > } > > > > -int do_server(struct memory_buffer *mem) > > +static int do_server(struct memory_buffer *mem) > > { > > char ctrl_data[sizeof(int) * 20000]; > > struct netdev_queue_id *queues; > > @@ -686,6 +751,207 @@ void run_devmem_tests(void) > > provider->free(mem); > > } > > > > +static uint64_t gettimeofday_ms(void) > > +{ > > + struct timeval tv; > > + > > + gettimeofday(&tv, NULL); > > + return (tv.tv_sec * 1000ULL) + (tv.tv_usec / 1000ULL); > > +} > > + > > +static int do_poll(int fd) > > +{ > > + struct pollfd pfd; > > + int ret; > > + > > + pfd.revents = 0; > > + pfd.fd = fd; > > + > > + ret = poll(&pfd, 1, waittime_ms); > > + if (ret == -1) > > + error(1, errno, "poll"); > > + > > + return ret && (pfd.revents & POLLERR); > > +} > > + > > +static void wait_compl(int fd) > > +{ > > + int64_t tstop = gettimeofday_ms() + waittime_ms; > > + char control[CMSG_SPACE(100)] = {}; > > + struct sock_extended_err *serr; > > + struct msghdr msg = {}; > > + struct cmsghdr *cm; > > + int retries = 10; > > + __u32 hi, lo; > > + int ret; > > + > > + msg.msg_control = control; > > + msg.msg_controllen = sizeof(control); > > + > > + while (gettimeofday_ms() < tstop) { > > + if (!do_poll(fd)) > > + continue; > > + > > + ret = recvmsg(fd, &msg, MSG_ERRQUEUE); > > + if (ret < 0) { > > + if (errno == EAGAIN) > > + continue; > > + error(1, ret, "recvmsg(MSG_ERRQUEUE)"); > > + return; > > + } > > + if (msg.msg_flags & MSG_CTRUNC) > > + error(1, 0, "MSG_CTRUNC\n"); > > + > > + for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) { > > + if (cm->cmsg_level != SOL_IP && > > + cm->cmsg_level != SOL_IPV6) > > + continue; > > + if (cm->cmsg_level == SOL_IP && > > + cm->cmsg_type != IP_RECVERR) > > + continue; > > + if (cm->cmsg_level == SOL_IPV6 && > > + cm->cmsg_type != IPV6_RECVERR) > > + continue; > > + > > + serr = (void *)CMSG_DATA(cm); > > + if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) > > + error(1, 0, "wrong origin %u", serr->ee_origin); > > + if (serr->ee_errno != 0) > > + error(1, 0, "wrong errno %d", serr->ee_errno); > > + > > + hi = serr->ee_data; > > + lo = serr->ee_info; > > + > > + fprintf(stderr, "tx complete [%d,%d]\n", lo, hi); > > + return; > > + } > > + } > > + > > + error(1, 0, "did not receive tx completion"); > > +} > > + > > +static int do_client(struct memory_buffer *mem) > > +{ > > + char ctrl_data[CMSG_SPACE(sizeof(struct dmabuf_tx_cmsg))]; > > + struct sockaddr_in6 server_sin; > > + struct sockaddr_in6 client_sin; > > + struct dmabuf_tx_cmsg ddmabuf; > > + struct ynl_sock *ys = NULL; > > + struct msghdr msg = {}; > > + ssize_t line_size = 0; > > + struct cmsghdr *cmsg; > > + struct iovec iov[2]; > > + uint64_t off = 100; > > + char *line = NULL; > > + size_t len = 0; > > + int socket_fd; > > + int ret, mid; > > + int opt = 1; > > + > > + ret = parse_address(server_ip, atoi(port), &server_sin); > > + if (ret < 0) > > + error(1, 0, "parse server address"); > > + > > + socket_fd = socket(AF_INET6, SOCK_STREAM, 0); > > + if (socket_fd < 0) > > + error(1, socket_fd, "create socket"); > > + > > + enable_reuseaddr(socket_fd); > > + > > + ret = setsockopt(socket_fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, > > + strlen(ifname) + 1); > > + if (ret) > > + error(1, ret, "bindtodevice"); > > + > > + if (bind_tx_queue(ifindex, mem->fd, &ys)) > > + error(1, 0, "Failed to bind\n"); > > + > > + ret = parse_address(client_ip, atoi(port), &client_sin); > > + if (ret < 0) > > + error(1, 0, "parse client address"); > > + > > + ret = bind(socket_fd, &client_sin, sizeof(client_sin)); > > + if (ret) > > + error(1, ret, "bind"); > > + > > + ret = setsockopt(socket_fd, SOL_SOCKET, SO_ZEROCOPY, &opt, sizeof(opt)); > > + if (ret) > > + error(1, ret, "set sock opt"); > > + > > + fprintf(stderr, "Connect to %s %d (via %s)\n", server_ip, > > + ntohs(server_sin.sin6_port), ifname); > > + > > + ret = connect(socket_fd, &server_sin, sizeof(server_sin)); > > + if (ret) > > + error(1, ret, "connect"); > > + > > + while (1) { > > + free(line); > > + line = NULL; > > + /* Subtract 1 from line_size to remove trailing newlines that > > + * get_line are surely to parse... > > + */ > > + line_size = getline(&line, &len, stdin) - 1; > > Why not send the '\n' as well? If we skip the '\n', it's not keeping > netcat-like behavior :-( > Ah, this is to make the validation on the RX side work. The validation expects a repeating pattern: 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, .... With no newlines. But it does become weird that TX doesn't match netcat. Let me think on this a bit. Maybe I can resolve this in a way where the validation works but also the tx side behaves like netcat. Maybe the RX validation can skip newlines or something. Maybe I can massage how I invoke the test. This can become a rabbit hole because I do want to invoke multiple sendmsg() in one iteration of the test as well, and not overcomplicate the series. > > + > > + if (line_size < 0) > > + break; > > [..] > > > + mid = (line_size / 2) + 1; > > + > > + iov[0].iov_base = (void *)100; > > + iov[0].iov_len = mid; > > + iov[1].iov_base = (void *)2000; > > + iov[1].iov_len = line_size - mid; > > This seems a bit hard-coded. We should at least test that mid is < 2000? > Yep, I should do at least do that. FWIW (although missed it in this iteration), at the top of the file I put docs that list exactly what I run for others to repro the results (and nipa should eventually run similar, I have that on my todo list), but the test should be more flexible to at least catch instances where mid is too large. > But ideally we should have two modes for tx with a flag (and run them > both from the selftest): > - pass one big iov, this will test the sendmsg path which creates > multiple skbs internally > - break 'line' into N sections (as you do here), but maybe have more > control over the number of sections? > > Maybe let's have a new --max-iov-size flag? Then we can call ncdevmem > with --max-iov-size <some prime number close to 4k> to exercise all > sorts of weird offsets? > > (seems ok to also follow up on that separately) Yeah, improvements to tests are always possible, lets have some reasonable tests in the first iteration and expand. -- Thanks, Mina