On 11/1/24 11:05 AM, Eric Dumazet wrote: > > On 10/24/24 7:00 AM, Christoph Hellwig wrote: >> From: Ming Lei <ming.lei@xxxxxxxxxx> >> >> The iov_iter_extract_pages interface allows to return physically >> discontiguous pages, as long as all but the first and last page >> in the array are page aligned and page size. Rewrite >> iov_iter_extract_bvec_pages to take advantage of that instead of only >> returning ranges of physically contiguous pages. >> >> Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx> >> [hch: minor cleanups, new commit log] >> Signed-off-by: Christoph Hellwig <hch@xxxxxx> >> --- >> lib/iov_iter.c | 67 +++++++++++++++++++++++++++++++++----------------- >> 1 file changed, 45 insertions(+), 22 deletions(-) >> >> diff --git a/lib/iov_iter.c b/lib/iov_iter.c >> index 1abb32c0da50..9fc06f5fb748 100644 >> --- a/lib/iov_iter.c >> +++ b/lib/iov_iter.c >> @@ -1677,8 +1677,8 @@ static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i, >> } >> /* >> - * Extract a list of contiguous pages from an ITER_BVEC iterator. This does >> - * not get references on the pages, nor does it get a pin on them. >> + * Extract a list of virtually contiguous pages from an ITER_BVEC iterator. >> + * This does not get references on the pages, nor does it get a pin on them. 
>> */ >> static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, >> struct page ***pages, size_t maxsize, >> @@ -1686,35 +1686,58 @@ static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, >> iov_iter_extraction_t extraction_flags, >> size_t *offset0) >> { >> - struct page **p, *page; >> - size_t skip = i->iov_offset, offset, size; >> - int k; >> + size_t skip = i->iov_offset, size = 0; >> + struct bvec_iter bi; >> + int k = 0; >> - for (;;) { >> - if (i->nr_segs == 0) >> - return 0; >> - size = min(maxsize, i->bvec->bv_len - skip); >> - if (size) >> - break; >> + if (i->nr_segs == 0) >> + return 0; >> + >> + if (i->iov_offset == i->bvec->bv_len) { >> i->iov_offset = 0; >> i->nr_segs--; >> i->bvec++; >> skip = 0; >> } >> + bi.bi_size = maxsize + skip; >> + bi.bi_bvec_done = skip; >> + >> + maxpages = want_pages_array(pages, maxsize, skip, maxpages); >> + >> + while (bi.bi_size && bi.bi_idx < i->nr_segs) { >> + struct bio_vec bv = bvec_iter_bvec(i->bvec, bi); >> + >> + /* >> + * The iov_iter_extract_pages interface only allows an offset >> + * into the first page. Break out of the loop if we see an >> + * offset into subsequent pages, the caller will have to call >> + * iov_iter_extract_pages again for the remainder. >> + */ >> + if (k) { >> + if (bv.bv_offset) >> + break; >> + } else { >> + *offset0 = bv.bv_offset; >> + } >> - skip += i->bvec->bv_offset; >> - page = i->bvec->bv_page + skip / PAGE_SIZE; >> - offset = skip % PAGE_SIZE; >> - *offset0 = offset; >> + (*pages)[k++] = bv.bv_page; >> + size += bv.bv_len; >> - maxpages = want_pages_array(pages, size, offset, maxpages); >> - if (!maxpages) >> - return -ENOMEM; >> - p = *pages; >> - for (k = 0; k < maxpages; k++) >> - p[k] = page + k; >> + if (k >= maxpages) >> + break; >> + >> + /* >> + * We are done when the end of the bvec doesn't align to a page >> + * boundary as that would create a hole in the returned space. 
>> + * The caller will handle this with another call to >> + * iov_iter_extract_pages. >> + */ >> + if (bv.bv_offset + bv.bv_len != PAGE_SIZE) >> + break; >> + >> + bvec_iter_advance_single(i->bvec, &bi, bv.bv_len); >> + } >> - size = min_t(size_t, size, maxpages * PAGE_SIZE - offset); >> iov_iter_advance(i, size); >> return size; >> } > > > This is causing a major network regression in UDP sendfile, found by syzbot. > > I will release the syzbot report and this fix: > > diff --git a/lib/iov_iter.c b/lib/iov_iter.c > index 65ec660c2960..e19aab1fccca 100644 > --- a/lib/iov_iter.c > +++ b/lib/iov_iter.c > @@ -1728,6 +1728,10 @@ static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, > (*pages)[k++] = bv.bv_page; > size += bv.bv_len; > > + if (size > maxsize) { > + size = maxsize; > + break; > + } > if (k >= maxpages) > break; Thanks Eric, I've applied your patch. -- Jens Axboe