Make cifs use iov_iter_extract_pages() rather than iov_iter_get_pages2*() so that pages get pins taken on them rather than refs if they're potentially going to have RDMA read into them. Note that this is only a test and not complete as I think other refs may get taken on the pages in other places. Better, I think, to pass iterators down as far as possible and avoid extracting the pages at all if we can manage it. Two things to note: (1) The code has to decide whether put, unpin or abandon the page dependent the mode (which is stored in rdata and wdata). This probably isn't necessary in all the cases I've made it check the mode. Further, I'm very likely not handling correctly the places where wdata or rdata records are split. (2) There are a couple of places after calling iov_iter_get_pages2*() in the upstream kernel where I think the list of pages and all the pages it points to are leaked on error. I've added some comments at those points. --- fs/cifs/cifsglob.h | 2 + fs/cifs/file.c | 93 +++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 68 insertions(+), 27 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 1420acf987f0..86792b15197f 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1446,6 +1446,7 @@ struct cifs_readdata { __u64 offset; unsigned int bytes; unsigned int got_bytes; + unsigned int pages_cleanup_mode; /* How to clean up pages[] */ pid_t pid; int result; struct work_struct work; @@ -1479,6 +1480,7 @@ struct cifs_writedata { struct cifs_aio_ctx *ctx; __u64 offset; pid_t pid; + unsigned int pages_cleanup_mode; /* How to clean up pages[] */ unsigned int bytes; int result; struct TCP_Server_Info *server; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 21d41b3c1882..385599c08572 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2366,11 +2366,15 @@ cifs_writev_requeue(struct cifs_writedata *wdata) } for (j = 0; j < nr_pages; j++) { - unlock_page(wdata2->pages[j]); + struct page *page = wdata2->pages[j]; + unlock_page(page); if (rc != 0 && !is_retryable_error(rc)) { - SetPageError(wdata2->pages[j]); - end_page_writeback(wdata2->pages[j]); - put_page(wdata2->pages[j]); + SetPageError(page); + end_page_writeback(page); + if (wdata->pages_cleanup_mode & FOLL_PIN) + unpin_user_page(page); + if (wdata->pages_cleanup_mode & FOLL_GET) + put_page(page); } } @@ -2388,9 +2392,13 @@ cifs_writev_requeue(struct cifs_writedata *wdata) /* cleanup remaining pages from the original wdata */ for (; i < wdata->nr_pages; i++) { - SetPageError(wdata->pages[i]); - end_page_writeback(wdata->pages[i]); - put_page(wdata->pages[i]); + struct page *page = wdata->pages[i]; + SetPageError(page); + end_page_writeback(page); + if (wdata->pages_cleanup_mode & FOLL_PIN) + unpin_user_page(page); + if (wdata->pages_cleanup_mode & FOLL_GET) + put_page(page); } if (rc != 0 && !is_retryable_error(rc)) @@ -2424,7 +2432,10 @@ cifs_writev_complete(struct work_struct *work) SetPageError(page); end_page_writeback(page); cifs_readpage_to_fscache(inode, page); - put_page(page); + if (wdata->pages_cleanup_mode & FOLL_PIN) + unpin_user_page(page); + if (wdata->pages_cleanup_mode & FOLL_GET) + put_page(page); } if (wdata->result != -EAGAIN) mapping_set_error(inode->i_mapping, wdata->result); @@ -2529,6 +2540,7 @@ wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping, if (!wdata) return NULL; + wdata->pages_cleanup_mode = FOLL_GET; *found_pages = find_get_pages_range_tag(mapping, index, end, PAGECACHE_TAG_DIRTY, tofind, wdata->pages); return wdata; @@ -3089,8 +3101,12 @@ cifs_uncached_writedata_release(struct kref *refcount) struct cifs_writedata, refcount); kref_put(&wdata->ctx->refcount, cifs_aio_ctx_release); - for (i = 0; i < wdata->nr_pages; i++) - put_page(wdata->pages[i]); + if (wdata->pages_cleanup_mode & FOLL_PIN) + for (i = 0; i < wdata->nr_pages; i++) + unpin_user_page(wdata->pages[i]); + if (wdata->pages_cleanup_mode & FOLL_GET) + for (i = 0; i < wdata->nr_pages; i++) + put_page(wdata->pages[i]); cifs_writedata_release(refcount); } @@ -3277,11 +3293,12 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, if (ctx->direct_io) { ssize_t result; - result = iov_iter_get_pages_alloc2( - from, &pagevec, cur_len, &start); + pagevec = NULL; + result = iov_iter_extract_pages( + from, &pagevec, cur_len, INT_MAX, &start); if (result < 0) { cifs_dbg(VFS, - "direct_writev couldn't get user pages (rc=%zd) iter type %d iov_offset %zd count %zd\n", + "direct_writev couldn't extract user pages (rc=%zd) iter type %d iov_offset %zd count %zd\n", result, iov_iter_type(from), from->iov_offset, from->count); dump_stack(); @@ -3298,12 +3315,14 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, wdata = cifs_writedata_direct_alloc(pagevec, cifs_uncached_writev_complete); if (!wdata) { + // Just leak pagevec? rc = -ENOMEM; add_credits_and_wake_if(server, credits, 0); break; } + wdata->pages_cleanup_mode = iov_iter_extract_mode(from); wdata->page_offset = start; wdata->tailsz = nr_pages > 1 ? @@ -3328,6 +3347,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, break; } + wdata->pages_cleanup_mode = FOLL_GET; num_pages = nr_pages; rc = wdata_fill_from_iovec( wdata, from, &cur_len, &num_pages); @@ -3489,9 +3509,9 @@ static ssize_t __cifs_writev( int rc; /* - * iov_iter_get_pages_alloc doesn't work with ITER_KVEC. - * In this case, fall back to non-direct write function. - * this could be improved by getting pages directly in ITER_KVEC + * iov_iter_extract_pages_alloc can't work with ITER_KVEC as kvec pages + * must not be got. In this case, fall back to non-direct write + * function. */ if (direct && iov_iter_is_kvec(from)) { cifs_dbg(FYI, "use non-direct cifs_writev for kvec I/O\n"); @@ -3749,7 +3769,12 @@ cifs_read_allocate_pages(struct cifs_readdata *rdata, unsigned int nr_pages) unsigned int nr_page_failed = i; for (i = 0; i < nr_page_failed; i++) { - put_page(rdata->pages[i]); + struct page *page = rdata->pages[i]; + + if (rdata->pages_cleanup_mode & FOLL_PIN) + unpin_user_page(page); + if (rdata->pages_cleanup_mode & FOLL_GET) + put_page(page); rdata->pages[i] = NULL; } } @@ -3765,7 +3790,12 @@ cifs_uncached_readdata_release(struct kref *refcount) kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release); for (i = 0; i < rdata->nr_pages; i++) { - put_page(rdata->pages[i]); + struct page *page = rdata->pages[i]; + + if (rdata->pages_cleanup_mode & FOLL_PIN) + unpin_user_page(page); + if (rdata->pages_cleanup_mode & FOLL_GET) + put_page(page); } cifs_readdata_release(refcount); } @@ -3845,7 +3875,10 @@ uncached_fill_pages(struct TCP_Server_Info *server, /* no need to hold page hostage */ rdata->pages[i] = NULL; rdata->nr_pages--; - put_page(page); + if (rdata->pages_cleanup_mode & FOLL_PIN) + unpin_user_page(page); + if (rdata->pages_cleanup_mode & FOLL_GET) + put_page(page); continue; } @@ -4015,12 +4048,12 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, if (ctx->direct_io) { ssize_t result; - result = iov_iter_get_pages_alloc2( - &direct_iov, &pagevec, - cur_len, &start); + pagevec = NULL; + result = iov_iter_extract_pages(&direct_iov, &pagevec, + cur_len, INT_MAX, &start); if (result < 0) { cifs_dbg(VFS, - "Couldn't get user pages (rc=%zd) iter type %d iov_offset %zd count %zd\n", + "Couldn't extract user pages (rc=%zd) iter type %d iov_offset %zd count %zd\n", result, iov_iter_type(&direct_iov), direct_iov.iov_offset, direct_iov.count); @@ -4035,6 +4068,7 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, rdata = cifs_readdata_direct_alloc( pagevec, cifs_uncached_readv_complete); if (!rdata) { + // Leaks pagevec? add_credits_and_wake_if(server, credits, 0); rc = -ENOMEM; break; @@ -4045,6 +4079,8 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, rdata->tailsz = npages > 1 ? cur_len-(PAGE_SIZE-start)-(npages-2)*PAGE_SIZE : cur_len; + rdata->pages_cleanup_mode = + iov_iter_extract_mode(&direct_iov); } else { @@ -4228,9 +4264,9 @@ static ssize_t __cifs_readv( struct cifs_aio_ctx *ctx; /* - * iov_iter_get_pages_alloc() doesn't work with ITER_KVEC, - * fall back to data copy read path - * this could be improved by getting pages directly in ITER_KVEC + * iov_iter_extract_pages() doesn't work with ITER_KVEC as kvec pages + * can't have refs or pins taken on them - so fall back to data copy + * read path */ if (direct && iov_iter_is_kvec(to)) { cifs_dbg(FYI, "use non-direct cifs_user_readv for kvec I/O\n"); @@ -4563,7 +4599,10 @@ cifs_readv_complete(struct work_struct *work) got_bytes -= min_t(unsigned int, PAGE_SIZE, got_bytes); - put_page(page); + if (rdata->pages_cleanup_mode & FOLL_PIN) + unpin_user_page(page); + if (rdata->pages_cleanup_mode & FOLL_GET) + put_page(page); rdata->pages[i] = NULL; } kref_put(&rdata->refcount, cifs_readdata_release);