From: Long Li <longli@xxxxxxxxxxxxx> Implement the main filesystem interface for doing read and write. These functions don't copy the user data into a kernel buffer for data transfer. Pages are directly pinned and passed to the RDMA transport. Signed-off-by: Long Li <longli@xxxxxxxxxxxxx> --- fs/cifs/cifsfs.c | 19 ++++ fs/cifs/cifsfs.h | 3 + fs/cifs/file.c | 322 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 329 insertions(+), 15 deletions(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index f715609..ba19fed 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1118,6 +1118,25 @@ const struct file_operations cifs_file_direct_ops = { .fallocate = cifs_fallocate, }; +const struct file_operations cifs_file_direct_rdma_ops = { + .read_iter = cifs_direct_readv, + .write_iter = cifs_direct_writev, + .open = cifs_open, + .release = cifs_close, + .lock = cifs_lock, + .fsync = cifs_fsync, + .flush = cifs_flush, + .mmap = cifs_file_mmap, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .unlocked_ioctl = cifs_ioctl, + .copy_file_range = cifs_copy_file_range, + .clone_file_range = cifs_clone_file_range, + .llseek = cifs_llseek, + .setlease = cifs_setlease, + .fallocate = cifs_fallocate, +}; + const struct file_operations cifs_file_nobrl_ops = { .read_iter = cifs_loose_read_iter, .write_iter = cifs_file_write_iter, diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 013ba2a..223cca8 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -94,6 +94,7 @@ extern const struct inode_operations cifs_dfs_referral_inode_operations; /* Functions related to files and directories */ extern const struct file_operations cifs_file_ops; extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */ +extern const struct file_operations cifs_file_direct_rdma_ops; /* if directio mnt */ extern const struct file_operations cifs_file_strict_ops; /* if strictio mnt */ extern const struct file_operations 
cifs_file_nobrl_ops; /* no brlocks */ extern const struct file_operations cifs_file_direct_nobrl_ops; @@ -102,8 +103,10 @@ extern int cifs_open(struct inode *inode, struct file *file); extern int cifs_close(struct inode *inode, struct file *file); extern int cifs_closedir(struct inode *inode, struct file *file); extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to); +extern ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to); extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to); extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from); +extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from); extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from); extern int cifs_lock(struct file *, int, struct file_lock *); extern int cifs_fsync(struct file *, loff_t, loff_t, int); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index e240c7c..0b394db 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2452,15 +2452,46 @@ cifs_uncached_writedata_release(struct kref *refcount) int i; struct cifs_writedata *wdata = container_of(refcount, struct cifs_writedata, refcount); + struct page **pages = wdata->direct_pages ? 
wdata->direct_pages : wdata->pages; kref_put(&wdata->ctx->refcount, cifs_aio_ctx_release); for (i = 0; i < wdata->nr_pages; i++) - put_page(wdata->pages[i]); + put_page(pages[i]); cifs_writedata_release(refcount); } static void collect_uncached_write_data(struct cifs_aio_ctx *ctx); +static void cifs_direct_writedata_release(struct kref *refcount) +{ + int i; + struct cifs_writedata *wdata = container_of(refcount, + struct cifs_writedata, refcount); + + for (i = 0; i < wdata->nr_pages; i++) + put_page(wdata->direct_pages[i]); + kvfree(wdata->direct_pages); + + cifs_writedata_release(refcount); +} + +static void cifs_direct_writev_complete(struct work_struct *work) +{ + struct cifs_writedata *wdata = container_of(work, + struct cifs_writedata, work); + struct inode *inode = d_inode(wdata->cfile->dentry); + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + spin_lock(&inode->i_lock); + cifs_update_eof(cifsi, wdata->offset, wdata->bytes); + if (cifsi->server_eof > inode->i_size) + i_size_write(inode, cifsi->server_eof); + spin_unlock(&inode->i_lock); + + complete(&wdata->done); + kref_put(&wdata->refcount, cifs_direct_writedata_release); +} + static void cifs_uncached_writev_complete(struct work_struct *work) { @@ -2703,6 +2734,125 @@ static void collect_uncached_write_data(struct cifs_aio_ctx *ctx) complete(&ctx->done); } +ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + ssize_t total_written = 0; + struct cifsFileInfo *cfile; + struct cifs_tcon *tcon; + struct cifs_sb_info *cifs_sb; + struct TCP_Server_Info *server; + pid_t pid; + unsigned long nr_pages; + loff_t offset = iocb->ki_pos; + size_t len = iov_iter_count(from); + int rc; + struct cifs_writedata *wdata; + + rc = generic_write_checks(iocb, from); + if (rc <= 0) + return rc; + + cifs_sb = CIFS_FILE_SB(file); + cfile = file->private_data; + tcon = tlink_tcon(cfile->tlink); + server = tcon->ses->server; + + if (!server->ops->async_writev) + return 
-ENOSYS; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + pid = cfile->pid; + else + pid = current->tgid; + + do { + unsigned int wsize, credits; + struct page **pagevec; + size_t start; + ssize_t cur_len; + + rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize, + &wsize, &credits); + if (rc) + break; + + cur_len = iov_iter_get_pages_alloc(from, &pagevec, wsize, &start); + if (cur_len < 0) { + cifs_dbg(VFS, "direct_writev couldn't get user pages (rc=%zd) iter type %d iov_offset %lu count %lu\n", cur_len, from->type, from->iov_offset, from->count); + dump_stack(); + break; + } + if (cur_len < 0) + break; + + nr_pages = (cur_len + start + PAGE_SIZE -1) / PAGE_SIZE; + + wdata = cifs_writedata_alloc(nr_pages, pagevec, + cifs_direct_writev_complete); + if (!wdata) { + rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); + break; + } + + wdata->nr_pages = nr_pages; + wdata->page_offset = start; + wdata->pagesz = PAGE_SIZE; + wdata->tailsz = + nr_pages > 1 ? + cur_len - (PAGE_SIZE-start) - (nr_pages - 2)*PAGE_SIZE : + cur_len; + + wdata->sync_mode = WB_SYNC_ALL; + wdata->offset = (__u64)offset; + wdata->cfile = cifsFileInfo_get(cfile); + wdata->pid = pid; + wdata->bytes = cur_len; + wdata->credits = credits; + + kref_get(&wdata->refcount); + + if (!wdata->cfile->invalidHandle || + !(rc = cifs_reopen_file(wdata->cfile, false))) + rc = server->ops->async_writev(wdata, + cifs_direct_writedata_release); + if (rc) { + add_credits_and_wake_if(server, wdata->credits, 0); + kref_put(&wdata->refcount, + cifs_writedata_release); + if (rc == -EAGAIN) + continue; + break; + } else + wait_for_completion(&wdata->done); + + if (wdata->result) { + rc = wdata->result; + kref_put(&wdata->refcount, cifs_direct_writedata_release); + if (rc == -EAGAIN) + continue; + break; + } + + kref_put(&wdata->refcount, cifs_direct_writedata_release); + + iov_iter_advance(from, cur_len); + total_written += cur_len; + offset += cur_len; + len -= cur_len; + } while (len); + + if 
(unlikely(!total_written)) { + printk(KERN_ERR "%s: total_written=%ld rc=%d\n", __func__, total_written, rc); + return rc; + } + + iocb->ki_pos += total_written; + return total_written; + +} + ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; @@ -2942,18 +3092,30 @@ cifs_read_allocate_pages(struct cifs_readdata *rdata, unsigned int nr_pages) return rc; } +static void cifs_direct_readdata_release(struct kref *refcount) +{ + struct cifs_readdata *rdata = container_of(refcount, + struct cifs_readdata, refcount); + unsigned int i; + for (i = 0; i < rdata->nr_pages; i++) { + put_page(rdata->direct_pages[i]); + } + kvfree(rdata->direct_pages); + + cifs_readdata_release(refcount); +} + static void cifs_uncached_readdata_release(struct kref *refcount) { struct cifs_readdata *rdata = container_of(refcount, struct cifs_readdata, refcount); unsigned int i; + struct page **pages = rdata->direct_pages ? rdata->direct_pages : rdata->pages; kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release); - for (i = 0; i < rdata->nr_pages; i++) { - put_page(rdata->pages[i]); - rdata->pages[i] = NULL; - } + for (i = 0; i < rdata->nr_pages; i++) + put_page(pages[i]); cifs_readdata_release(refcount); } @@ -3013,30 +3175,32 @@ uncached_fill_pages(struct TCP_Server_Info *server, int result = 0; unsigned int i; unsigned int nr_pages = rdata->nr_pages; + unsigned int page_offset = rdata->page_offset; rdata->got_bytes = 0; rdata->tailsz = PAGE_SIZE; for (i = 0; i < nr_pages; i++) { - struct page *page = rdata->pages[i]; + struct page *page = rdata->direct_pages ? 
rdata->direct_pages[i] : rdata->pages[i]; size_t n; + unsigned int segment_size = rdata->pagesz; + + if (i == 0) + segment_size -= page_offset; + else + page_offset = 0; + if (len <= 0) { /* no need to hold page hostage */ - rdata->pages[i] = NULL; rdata->nr_pages--; put_page(page); continue; } n = len; - if (len >= PAGE_SIZE) { + if (len >= segment_size) /* enough data to fill the page */ - n = PAGE_SIZE; - len -= n; - } else { - zero_user(page, len, PAGE_SIZE - len); - rdata->tailsz = len; - len = 0; - } + n = segment_size; + len -= n; if (iter) result = copy_page_from_iter(page, 0, n, iter); #ifdef CONFIG_CIFS_SMB_DIRECT @@ -3243,6 +3407,134 @@ collect_uncached_read_data(struct cifs_aio_ctx *ctx) complete(&ctx->done); } +static void cifs_direct_readv_complete(struct work_struct *work) +{ + struct cifs_readdata *rdata = container_of(work, struct cifs_readdata, work); + int i = 0; + unsigned int bytes = 0; + + // Set them dirty? + while (bytes < rdata->got_bytes + rdata->page_offset) { + set_page_dirty(rdata->direct_pages[i++]); + bytes += rdata->pagesz; + } + + complete(&rdata->done); + kref_put(&rdata->refcount, cifs_direct_readdata_release); +} + +ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to) +{ + size_t len, cur_len, start; + unsigned int npages, rsize, credits; + struct file *file; + struct cifs_sb_info *cifs_sb; + struct cifsFileInfo *cfile; + struct cifs_tcon *tcon; + struct page **pagevec; + ssize_t rc, total_read = 0; + struct TCP_Server_Info *server; + loff_t offset = iocb->ki_pos; + pid_t pid; + struct cifs_readdata *rdata; + char *buf = to->iov->iov_base; + + len = iov_iter_count(to); + if (!len) + return 0; + + file = iocb->ki_filp; + cifs_sb = CIFS_FILE_SB(file); + cfile = file->private_data; + tcon = tlink_tcon(cfile->tlink); + server = tcon->ses->server; + + if (!server->ops->async_readv) + return -ENOSYS; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) + pid = cfile->pid; + else + pid = current->tgid; + + if 
((file->f_flags & O_ACCMODE) == O_WRONLY) + cifs_dbg(FYI, "attempting read on write only file instance\n"); + + do { + rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize, + &rsize, &credits); + if (rc) + break; + + cur_len = min_t(const size_t, len, rsize); + + rc = iov_iter_get_pages_alloc(to, &pagevec, cur_len, &start); + if (rc < 0) { + cifs_dbg(VFS, "couldn't get user pages (rc=%zd) iter type %d iov_offset %lu count %lu\n", rc, to->type, to->iov_offset, to->count); + dump_stack(); + break; + } + + rdata = cifs_readdata_alloc(0, pagevec, cifs_direct_readv_complete); + if (!rdata) { + add_credits_and_wake_if(server, credits, 0); + rc = -ENOMEM; + break; + } + + npages = (rc + start + PAGE_SIZE-1) / PAGE_SIZE; + rdata->nr_pages = npages; + rdata->page_offset = start; + rdata->pagesz = PAGE_SIZE; + rdata->tailsz = npages > 1 ? + rc-(PAGE_SIZE-start)-(npages-2)*PAGE_SIZE : + rc; + cur_len = rc; + + rdata->cfile = cfile; + rdata->offset = offset; + rdata->bytes = rc; + rdata->pid = pid; + rdata->read_into_pages = cifs_uncached_read_into_pages; + rdata->copy_into_pages = cifs_uncached_copy_into_pages; + rdata->credits = credits; + + kref_get(&rdata->refcount); + + if (!rdata->cfile->invalidHandle || + !(rc = cifs_reopen_file(rdata->cfile, true))) + rc = server->ops->async_readv(rdata); + + if (rc) { + add_credits_and_wake_if(server, rdata->credits, 0); + kref_put(&rdata->refcount, + cifs_direct_readdata_release); + if (rc == -EAGAIN) + continue; + } else + wait_for_completion(&rdata->done); + + rc = rdata->result; + if (rc) { + kref_put(&rdata->refcount, cifs_direct_readdata_release); + if (rc == -EAGAIN) + continue; + break; + } + + total_read += rdata->got_bytes; + kref_put(&rdata->refcount, cifs_direct_readdata_release); + + iov_iter_advance(to, cur_len); + len -= cur_len; + offset += cur_len; + } while (len); + + iocb->ki_pos += total_read; + + return total_read; +} + ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) { struct file *file = 
iocb->ki_filp; -- 2.7.4 -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html