> -----Original Message----- > From: David Howells <dhowells@xxxxxxxxxx> > Sent: Tuesday, 20 June 2023 16:53 > To: netdev@xxxxxxxxxxxxxxx > Cc: David Howells <dhowells@xxxxxxxxxx>; Alexander Duyck > <alexander.duyck@xxxxxxxxx>; David S. Miller <davem@xxxxxxxxxxxxx>; Eric > Dumazet <edumazet@xxxxxxxxxx>; Jakub Kicinski <kuba@xxxxxxxxxx>; Paolo > Abeni <pabeni@xxxxxxxxxx>; Willem de Bruijn > <willemdebruijn.kernel@xxxxxxxxx>; David Ahern <dsahern@xxxxxxxxxx>; > Matthew Wilcox <willy@xxxxxxxxxxxxx>; Jens Axboe <axboe@xxxxxxxxx>; linux- > mm@xxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx; Bernard Metzler > <BMT@xxxxxxxxxxxxxx>; Tom Talpey <tom@xxxxxxxxxx>; Jason Gunthorpe > <jgg@xxxxxxxx>; Leon Romanovsky <leon@xxxxxxxxxx>; linux- > rdma@xxxxxxxxxxxxxxx > Subject: [PATCH net-next v3 04/18] siw: Use > sendmsg(MSG_SPLICE_PAGES) rather than sendpage to transmit > > When transmitting data, call down into TCP using a single sendmsg with > MSG_SPLICE_PAGES to indicate that content should be spliced rather than > performing several sendmsg and sendpage calls to transmit header, data > pages and trailer. > > To make this work, the data is assembled in a bio_vec array and attached to > a BVEC-type iterator. The header and trailer (if present) are copied into > page fragments that can be freed with put_page(). > > Signed-off-by: David Howells <dhowells@xxxxxxxxxx> > cc: Bernard Metzler <bmt@xxxxxxxxxxxxxx> > cc: Tom Talpey <tom@xxxxxxxxxx> > cc: Jason Gunthorpe <jgg@xxxxxxxx> > cc: Leon Romanovsky <leon@xxxxxxxxxx> > cc: "David S. Miller" <davem@xxxxxxxxxxxxx> > cc: Eric Dumazet <edumazet@xxxxxxxxxx> > cc: Jakub Kicinski <kuba@xxxxxxxxxx> > cc: Paolo Abeni <pabeni@xxxxxxxxxx> > cc: Jens Axboe <axboe@xxxxxxxxx> > cc: Matthew Wilcox <willy@xxxxxxxxxxxxx> > cc: linux-rdma@xxxxxxxxxxxxxxx > cc: netdev@xxxxxxxxxxxxxxx > --- > > Notes: > ver #2) > - Wrap lines at 80. 
> > drivers/infiniband/sw/siw/siw_qp_tx.c | 231 ++++---------------------- > 1 file changed, 36 insertions(+), 195 deletions(-) > > diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c > b/drivers/infiniband/sw/siw/siw_qp_tx.c > index ffb16beb6c30..2584f9da0dd8 100644 > --- a/drivers/infiniband/sw/siw/siw_qp_tx.c > +++ b/drivers/infiniband/sw/siw/siw_qp_tx.c > @@ -311,114 +311,8 @@ static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, > struct socket *s, > return rv; > } > > -/* > - * 0copy TCP transmit interface: Use MSG_SPLICE_PAGES. > - * > - * Using sendpage to push page by page appears to be less efficient > - * than using sendmsg, even if data are copied. > - * > - * A general performance limitation might be the extra four bytes > - * trailer checksum segment to be pushed after user data. > - */ > -static int siw_tcp_sendpages(struct socket *s, struct page **page, int > offset, > - size_t size) > -{ > - struct bio_vec bvec; > - struct msghdr msg = { > - .msg_flags = (MSG_MORE | MSG_DONTWAIT | MSG_SENDPAGE_NOTLAST | > - MSG_SPLICE_PAGES), > - }; > - struct sock *sk = s->sk; > - int i = 0, rv = 0, sent = 0; > - > - while (size) { > - size_t bytes = min_t(size_t, PAGE_SIZE - offset, size); > - > - if (size + offset <= PAGE_SIZE) > - msg.msg_flags &= ~MSG_SENDPAGE_NOTLAST; > - > - tcp_rate_check_app_limited(sk); > - bvec_set_page(&bvec, page[i], bytes, offset); > - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); > - > -try_page_again: > - lock_sock(sk); > - rv = tcp_sendmsg_locked(sk, &msg, size); > - release_sock(sk); > - > - if (rv > 0) { > - size -= rv; > - sent += rv; > - if (rv != bytes) { > - offset += rv; > - bytes -= rv; > - goto try_page_again; > - } > - offset = 0; > - } else { > - if (rv == -EAGAIN || rv == 0) > - break; > - return rv; > - } > - i++; > - } > - return sent; > -} > - > -/* > - * siw_0copy_tx() > - * > - * Pushes list of pages to TCP socket. 
> If pages from multiple > - * SGE's, all referenced pages of each SGE are pushed in one > - * shot. > - */ > -static int siw_0copy_tx(struct socket *s, struct page **page, > - struct siw_sge *sge, unsigned int offset, > - unsigned int size) > -{ > - int i = 0, sent = 0, rv; > - int sge_bytes = min(sge->length - offset, size); > - > - offset = (sge->laddr + offset) & ~PAGE_MASK; > - > - while (sent != size) { > - rv = siw_tcp_sendpages(s, &page[i], offset, sge_bytes); > - if (rv >= 0) { > - sent += rv; > - if (size == sent || sge_bytes > rv) > - break; > - > - i += PAGE_ALIGN(sge_bytes + offset) >> PAGE_SHIFT; > - sge++; > - sge_bytes = min(sge->length, size - sent); > - offset = sge->laddr & ~PAGE_MASK; > - } else { > - sent = rv; > - break; > - } > - } > - return sent; > -} > > #define MAX_TRAILER (MPA_CRC_SIZE + 4) > > -static void siw_unmap_pages(struct kvec *iov, unsigned long kmap_mask, int > len) > -{ > - int i; > - > - /* > - * Work backwards through the array to honor the kmap_local_page() > - * ordering requirements. > - */ > - for (i = (len-1); i >= 0; i--) { > - if (kmap_mask & BIT(i)) { > - unsigned long addr = (unsigned long)iov[i].iov_base; > - > - kunmap_local((void *)(addr & PAGE_MASK)); > - } > - } > -} > - > /* > * siw_tx_hdt() tries to push a complete packet to TCP where all > * packet fragments are referenced by the elements of one iovec. Just a nit - maybe change 'iovec' -> 'bio_vec' in this comment? I successfully tested with an nvmeof client and with ib_read_bw/ib_write_bw. Looks good; I very much appreciate the resulting code simplifications in the tx path! Reviewed-by: Bernard Metzler <bmt@xxxxxxxxxxxxxx>