On Tue, Aug 23, 2022 at 03:12:41PM +0100, David Howells wrote: > static void > cifs_writev_requeue(struct cifs_writedata *wdata) > { > - int i, rc = 0; > + int rc = 0; > struct inode *inode = d_inode(wdata->cfile->dentry); > struct TCP_Server_Info *server; > - unsigned int rest_len; > + unsigned int rest_len = wdata->bytes; Umm... Can that by different from iov_iter_count(&wdata->iter)? > + loff_t fpos = wdata->offset; > > server = tlink_tcon(wdata->cfile->tlink)->ses->server; > - i = 0; > - rest_len = wdata->bytes; > do { > struct cifs_writedata *wdata2; > - unsigned int j, nr_pages, wsize, tailsz, cur_len; > + unsigned int wsize, cur_len; > > wsize = server->ops->wp_retry_size(inode); > if (wsize < rest_len) { > - nr_pages = wsize / PAGE_SIZE; > - if (!nr_pages) { > - rc = -EOPNOTSUPP; > + if (wsize < PAGE_SIZE) { > + rc = -ENOTSUPP; > break; > } > - cur_len = nr_pages * PAGE_SIZE; > - tailsz = PAGE_SIZE; > + cur_len = min(round_down(wsize, PAGE_SIZE), rest_len); > } else { > - nr_pages = DIV_ROUND_UP(rest_len, PAGE_SIZE); > cur_len = rest_len; > - tailsz = rest_len - (nr_pages - 1) * PAGE_SIZE; > } > > - wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete); > + wdata2 = cifs_writedata_alloc(cifs_writev_complete); > if (!wdata2) { > rc = -ENOMEM; > break; > } > > - for (j = 0; j < nr_pages; j++) { > - wdata2->pages[j] = wdata->pages[i + j]; > - lock_page(wdata2->pages[j]); > - clear_page_dirty_for_io(wdata2->pages[j]); > - } > - > wdata2->sync_mode = wdata->sync_mode; > - wdata2->nr_pages = nr_pages; > - wdata2->offset = page_offset(wdata2->pages[0]); > - wdata2->pagesz = PAGE_SIZE; > - wdata2->tailsz = tailsz; > - wdata2->bytes = cur_len; > + wdata2->offset = fpos; > + wdata2->bytes = cur_len; > + wdata2->iter = wdata->iter; > + > + iov_iter_advance(&wdata2->iter, fpos - wdata->offset); Am I right assuming that wdata->iter won't be looked at after we return? If so, why not advance wdata->iter instead? At the point where you increment fpos, that is. And instead of rest_len just use iov_iter_count(&wdata->iter)... > - /* cleanup remaining pages from the original wdata */ > - for (; i < wdata->nr_pages; i++) { > - SetPageError(wdata->pages[i]); > - end_page_writeback(wdata->pages[i]); > - put_page(wdata->pages[i]); > - } > + /* Clean up remaining pages from the original wdata */ > + if (iov_iter_is_xarray(&wdata->iter)) > + cifs_pages_write_failed(inode, fpos, rest_len); > > if (rc != 0 && !is_retryable_error(rc)) > mapping_set_error(inode->i_mapping, rc); > @@ -2491,7 +2503,6 @@ cifs_writev_complete(struct work_struct *work) > struct cifs_writedata *wdata = container_of(work, > struct cifs_writedata, work); > struct inode *inode = d_inode(wdata->cfile->dentry); > - int i = 0; > > if (wdata->result == 0) { > spin_lock(&inode->i_lock); > +/* > + * Select span of a bvec iterator we're going to use. Limit it by both maximum > + * size and maximum number of segments. > + */ > +static size_t cifs_limit_bvec_subset(const struct iov_iter *iter, size_t offset, > + size_t max_size, size_t max_segs, unsigned int *_nsegs) > +{ > + const struct bio_vec *bvecs = iter->bvec; > + unsigned int nbv = iter->nr_segs, ix = 0, nsegs = 0; > + size_t len, span = 0, n = iter->count; > + size_t skip = iter->iov_offset + offset; > + > + if (WARN_ON(!iov_iter_is_bvec(iter)) || WARN_ON(offset > n) || n == 0) > + return 0; > + > + while (n && ix < nbv && skip) { > + len = bvecs[ix].bv_len; > + if (skip < len) > + break; > + skip -= len; > + n -= len; > + ix++; > + } Umm... Are you guaranteed that iter->count points to an end of some bvec? IOW, what's to stop your n from wrapping around? > static int > -cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, > +cifs_write_from_iter(loff_t fpos, size_t len, const struct iov_iter *from, > struct cifsFileInfo *open_file, > struct cifs_sb_info *cifs_sb, struct list_head *wdata_list, > struct cifs_aio_ctx *ctx) > { > int rc = 0; > - size_t cur_len; > - unsigned long nr_pages, num_pages, i; > + size_t cur_len, max_len; > struct cifs_writedata *wdata; > - struct iov_iter saved_from = *from; > - loff_t saved_offset = offset; > + size_t skip = 0; > pid_t pid; > struct TCP_Server_Info *server; > - struct page **pagevec; > - size_t start; > - unsigned int xid; > + unsigned int xid, max_segs = INT_MAX; > > if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) > pid = open_file->pid; > @@ -3341,10 +3741,20 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, > server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses); > xid = get_xid(); > > +#ifdef CONFIG_CIFS_SMB_DIRECT > + if (server->smbd_conn) > + max_segs = server->smbd_conn->max_frmr_depth; > +#endif > + > do { > - unsigned int wsize; > struct cifs_credits credits_on_stack; > struct cifs_credits *credits = &credits_on_stack; > + unsigned int wsize, nsegs = 0; > + > + if (signal_pending(current)) { > + rc = -EINTR; > + break; > + } > > if (open_file->invalidHandle) { > rc = cifs_reopen_file(open_file, false); > @@ -3359,96 +3769,43 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, > if (rc) > break; > > - cur_len = min_t(const size_t, len, wsize); > - > - if (ctx->direct_io) { > - ssize_t result; > - > - result = iov_iter_get_pages_alloc2( > - from, &pagevec, cur_len, &start); > - if (result < 0) { > - cifs_dbg(VFS, > - "direct_writev couldn't get user pages (rc=%zd) iter type %d iov_offset %zd count %zd\n", > - result, iov_iter_type(from), > - from->iov_offset, from->count); > - dump_stack(); > - > - rc = result; > - add_credits_and_wake_if(server, credits, 0); > - break; > - } > - cur_len = (size_t)result; > - > - nr_pages = > - (cur_len + start + PAGE_SIZE - 1) / PAGE_SIZE; > - > - wdata = cifs_writedata_direct_alloc(pagevec, > - cifs_uncached_writev_complete); > - if (!wdata) { > - rc = -ENOMEM; > - add_credits_and_wake_if(server, credits, 0); > - break; > - } > - > - > - wdata->page_offset = start; > - wdata->tailsz = > - nr_pages > 1 ? > - cur_len - (PAGE_SIZE - start) - > - (nr_pages - 2) * PAGE_SIZE : > - cur_len; > - } else { > - nr_pages = get_numpages(wsize, len, &cur_len); > - wdata = cifs_writedata_alloc(nr_pages, > - cifs_uncached_writev_complete); > - if (!wdata) { > - rc = -ENOMEM; > - add_credits_and_wake_if(server, credits, 0); > - break; > - } > - > - rc = cifs_write_allocate_pages(wdata->pages, nr_pages); > - if (rc) { > - kvfree(wdata->pages); > - kfree(wdata); > - add_credits_and_wake_if(server, credits, 0); > - break; > - } > - > - num_pages = nr_pages; > - rc = wdata_fill_from_iovec( > - wdata, from, &cur_len, &num_pages); > - if (rc) { > - for (i = 0; i < nr_pages; i++) > - put_page(wdata->pages[i]); > - kvfree(wdata->pages); > - kfree(wdata); > - add_credits_and_wake_if(server, credits, 0); > - break; > - } > + max_len = min_t(const size_t, len, wsize); > + if (!max_len) { > + rc = -EAGAIN; > + add_credits_and_wake_if(server, credits, 0); > + break; > + } > > - /* > - * Bring nr_pages down to the number of pages we > - * actually used, and free any pages that we didn't use. > - */ > - for ( ; nr_pages > num_pages; nr_pages--) > - put_page(wdata->pages[nr_pages - 1]); > + cur_len = cifs_limit_bvec_subset(from, skip, max_len, max_segs, &nsegs); > + cifs_dbg(FYI, "write_from_iter len=%zx/%zx nsegs=%u/%lu/%u\n", > + cur_len, max_len, nsegs, from->nr_segs, max_segs); > + if (cur_len == 0) { > + rc = -EIO; > + add_credits_and_wake_if(server, credits, 0); > + break; > + } > > - wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE); > + wdata = cifs_writedata_alloc(cifs_uncached_writev_complete); > + if (!wdata) { > + rc = -ENOMEM; > + add_credits_and_wake_if(server, credits, 0); > + break; > } > > wdata->sync_mode = WB_SYNC_ALL; > - wdata->nr_pages = nr_pages; > - wdata->offset = (__u64)offset; > - wdata->cfile = cifsFileInfo_get(open_file); > - wdata->server = server; > - wdata->pid = pid; > - wdata->bytes = cur_len; > - wdata->pagesz = PAGE_SIZE; > - wdata->credits = credits_on_stack; > - wdata->ctx = ctx; > + wdata->offset = (__u64)fpos; > + wdata->cfile = cifsFileInfo_get(open_file); > + wdata->server = server; > + wdata->pid = pid; > + wdata->bytes = cur_len; > + wdata->credits = credits_on_stack; > + wdata->iter = *from; > + wdata->ctx = ctx; > kref_get(&ctx->refcount); > > + iov_iter_advance(&wdata->iter, skip); > + iov_iter_truncate(&wdata->iter, cur_len); > + > rc = adjust_credits(server, &wdata->credits, wdata->bytes); > > if (!rc) { > @@ -3463,16 +3820,14 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, > add_credits_and_wake_if(server, &wdata->credits, 0); > kref_put(&wdata->refcount, > cifs_uncached_writedata_release); > - if (rc == -EAGAIN) { > - *from = saved_from; > - iov_iter_advance(from, offset - saved_offset); > + if (rc == -EAGAIN) > continue; > - } > break; > } > > list_add_tail(&wdata->list, wdata_list); > - offset += cur_len; > + skip += cur_len; > + fpos += cur_len; IDGI. Why not simply iov_iter_advance(from, cur_len) here, and to hell with both the iov_iter_advance() above *and* with skip argument of cifs_limit_bvec_subset()? If you want to keep from const - copy into local variable and be done with that...