Hi,

We recently ran into a serious performance issue with the NFS client. It turned out to be due to the lack of readv/writev support in the NFS (O_DIRECT) client.

Here is our use case: in our cloud environment, our storage is over NFS. Files on NFS are passed as block devices to the guest (using O_DIRECT). When the guest does IO on these block devices, it ends up as O_DIRECT writes to NFS (on the KVM host). QEMU (on the host) gets a vector from the virtio ring and submits it. Old versions of QEMU linearized the vector they got from KVM (copied it into a single buffer) and submitted that buffer, so the NFS client always received a single buffer. Later versions of QEMU eliminated this copy and submit the vector directly using preadv()/pwritev().

The NFS client loops through the vector and submits each element as a separate request whenever its size is < wsize. In our case (negotiated wsize = 1MB), a 256K IO arrives as 64 vectors of 4K each, so we end up submitting 64 4K FILE_SYNC IOs, and the server has to complete each 4K write synchronously. This causes serious performance degradation. We are trying to see whether performance improves if we convert the IOs to ASYNC, but our initial results don't look good.

Full readv/writev support in the NFS client for all possible cases is hard. However, if all vectors are page-aligned and the IO sizes are page-multiples, it fits the current code easily. Luckily, the QEMU use case meets these requirements. Here is the patch to add this support.

Comments?

Thanks,
Badari
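For reference, here is a minimal user-space sketch (not part of the patch) of the IO pattern described above: 64 page-aligned 4K iovecs submitted as a single pwritev() on an O_DIRECT descriptor, roughly what newer QEMU generates for a 256K guest request. The mount path, file name, and buffer contents are made up for illustration.

/*
 * Illustration only (not part of the patch): the IO pattern the fast
 * path targets.  64 page-aligned 4K buffers, one pwritev() call on an
 * O_DIRECT fd.  The path below is hypothetical.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#define NR_SEGS		64
#define SEG_SIZE	4096	/* one page per segment */

int main(void)
{
	struct iovec iov[NR_SEGS];
	int fd, i;

	fd = open("/mnt/nfs/disk.img", O_WRONLY | O_DIRECT);
	if (fd < 0)
		return 1;

	for (i = 0; i < NR_SEGS; i++) {
		/* page-aligned base, page-multiple length */
		if (posix_memalign(&iov[i].iov_base, 4096, SEG_SIZE))
			return 1;
		memset(iov[i].iov_base, 0, SEG_SIZE);
		iov[i].iov_len = SEG_SIZE;
	}

	/* 256K total; with wsize >= 256K this can go out as one WRITE */
	if (pwritev(fd, iov, NR_SEGS, 0) < 0)
		return 1;

	close(fd);
	return 0;
}

Without the patch, this single pwritev() becomes 64 separate 4K FILE_SYNC WRITEs on the wire; with it, the client can map all 64 pages together and issue one 256K WRITE.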
Special readv/writev support for NFS O_DIRECT.

Currently the NFS client O_DIRECT read/write path iterates over the individual elements in the vector, submits an IO for each, and waits for them to finish. If the individual IO size is < r/wsize, each IO will be FILE_SYNC, and the server has to complete each of these smaller IOs synchronously. This causes serious performance degradation.

This patch adds a special case that submits one larger IO from the client, but only if all the IO buffers are page-aligned, each individual IO size is a page-multiple, and the total IO size is within r/wsize. This is the common case for QEMU usage.

Signed-off-by: Badari Pulavarty <pbadari@xxxxxxxxxx>
---
 fs/nfs/direct.c |  259 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 259 insertions(+)

Index: linux-2.6.38.2/fs/nfs/direct.c
===================================================================
--- linux-2.6.38.2.orig/fs/nfs/direct.c	2011-04-12 10:56:29.374266292 -0400
+++ linux-2.6.38.2/fs/nfs/direct.c	2011-04-12 11:01:21.883266310 -0400
@@ -271,6 +271,52 @@ static const struct rpc_call_ops nfs_rea
 	.rpc_release = nfs_direct_read_release,
 };
 
+static int nfs_direct_check_single_io(struct nfs_direct_req *dreq,
+			const struct iovec *iov,
+			unsigned long nr_segs, int rw)
+{
+	unsigned long seg;
+	int pages = 0;
+	struct nfs_open_context *ctx = dreq->ctx;
+	struct inode *inode = ctx->path.dentry->d_inode;
+	size_t size;
+
+
+	/* If its a single vector - use old code */
+	if (nr_segs == 1)
+		return pages;
+	/*
+	 * Check to see if all IO buffers are page-aligned and
+	 * individual IO sizes are page-multiple.
+	 * If so, we can submit them in a single IO.
+	 * Otherwise, use old code
+	 */
+	for (seg = 0; seg < nr_segs; seg++) {
+		unsigned long user_addr = (unsigned long)iov[seg].iov_base;
+
+		if ((user_addr & (PAGE_SIZE - 1)) ||
+			(iov[seg].iov_len & (PAGE_SIZE - 1))) {
+			pages = 0;
+			break;
+		}
+		pages += (iov[seg].iov_len >> PAGE_SHIFT);
+	}
+
+	if (rw)
+		size = NFS_SERVER(inode)->wsize;
+	else
+		size = NFS_SERVER(inode)->rsize;
+
+	/*
+	 * If IOsize > wsize/rsize, we need to split IOs into r/wsize anyway
+	 * - fall back to old code.
+	 */
+	if (pages > (size >> PAGE_SHIFT))
+		pages = 0;
+
+	return pages;
+}
+
 /*
  * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
  * operation.  If nfs_readdata_alloc() or get_user_pages() fails,
@@ -385,6 +431,101 @@ static ssize_t nfs_direct_read_schedule_
 	return result < 0 ? (ssize_t) result : -EFAULT;
 }
 
+/*
+ * Special Case: Efficient readv() support - only if all the IO buffers
+ * in the vector are page-aligned and IO sizes are page-multiple
+ * + total IO size is < rsize. We can map all of the vectors together
+ * and submit them in a single IO instead of operating on single entry
+ * in the vector.
+ */
+static ssize_t nfs_direct_read_schedule_single_io(struct nfs_direct_req *dreq,
+						const struct iovec *iov,
+						unsigned long nr_segs,
+						int pages,
+						loff_t pos)
+{
+	struct nfs_open_context *ctx = dreq->ctx;
+	struct inode *inode = ctx->path.dentry->d_inode;
+	unsigned long user_addr = (unsigned long)iov->iov_base;
+	size_t bytes = pages << PAGE_SHIFT;
+	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_cred = ctx->cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = NFS_CLIENT(inode),
+		.rpc_message = &msg,
+		.callback_ops = &nfs_read_direct_ops,
+		.workqueue = nfsiod_workqueue,
+		.flags = RPC_TASK_ASYNC,
+	};
+	unsigned int pgbase;
+	int result;
+	int seg;
+	int mapped = 0;
+	int nr_pages;
+	ssize_t started = 0;
+	struct nfs_read_data *data;
+
+	result = -ENOMEM;
+	data = nfs_readdata_alloc(pages);
+	if (unlikely(!data))
+		return result;
+
+	pgbase = user_addr & ~PAGE_MASK;
+
+	for (seg = 0; seg < nr_segs; seg++) {
+		user_addr = (unsigned long)iov[seg].iov_base;
+		nr_pages = iov[seg].iov_len >> PAGE_SHIFT;
+
+		down_read(&current->mm->mmap_sem);
+		result = get_user_pages(current, current->mm, user_addr,
+					nr_pages, 1, 0, &data->pagevec[mapped],
+					NULL);
+		up_read(&current->mm->mmap_sem);
+		if (result < 0) {
+			/* unmap what is done so far */
+			nfs_direct_release_pages(data->pagevec, mapped);
+			nfs_readdata_free(data);
+			return result;
+		}
+		mapped += result;
+	}
+
+	get_dreq(dreq);
+
+	data->req = (struct nfs_page *) dreq;
+	data->inode = inode;
+	data->cred = msg.rpc_cred;
+	data->args.fh = NFS_FH(inode);
+	data->args.context = ctx;
+	data->args.offset = pos;
+	data->args.pgbase = pgbase;
+	data->args.pages = data->pagevec;
+	data->args.count = bytes;
+	data->res.fattr = &data->fattr;
+	data->res.eof = 0;
+	data->res.count = bytes;
+	nfs_fattr_init(&data->fattr);
+
+	task_setup_data.task = &data->task;
+	task_setup_data.callback_data = data;
+	msg.rpc_argp = &data->args;
+	msg.rpc_resp = &data->res;
+	NFS_PROTO(inode)->read_setup(data, &msg);
+
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		goto out;
+	rpc_put_task(task);
+
+	started += bytes;
+out:
+	if (started)
+		return started;
+	return result < 0 ? (ssize_t) result : -EFAULT;
+}
+
 static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 					      const struct iovec *iov,
 					      unsigned long nr_segs,
@@ -396,6 +537,16 @@ static ssize_t nfs_direct_read_schedule_
 
 	get_dreq(dreq);
 
+	result = nfs_direct_check_single_io(dreq, iov, nr_segs, 0);
+	if (result) {
+		result = nfs_direct_read_schedule_single_io(dreq, iov, nr_segs,
+							result, pos);
+		if (result < 0)
+			goto out;
+		requested_bytes += result;
+		pos += result;
+		goto out;
+	}
 	for (seg = 0; seg < nr_segs; seg++) {
 		const struct iovec *vec = &iov[seg];
 		result = nfs_direct_read_schedule_segment(dreq, vec, pos);
@@ -416,6 +567,7 @@ static ssize_t nfs_direct_read_schedule_
 		return result < 0 ? result : -EIO;
 	}
 
+out:
 	if (put_dreq(dreq))
 		nfs_direct_complete(dreq);
 	return 0;
@@ -821,6 +973,102 @@ static ssize_t nfs_direct_write_schedule
 	return result < 0 ? (ssize_t) result : -EFAULT;
 }
 
+/*
+ * Special Case: Efficient writev() support - only if all the IO buffers
+ * in the vector are page-aligned and IO sizes are page-multiple
+ * + total IO size is < wsize. We can map all of the vectors together
+ * and submit them in a single IO instead of operating on single entry
+ * in the vector.
+ */
+static ssize_t nfs_direct_write_schedule_single_io(struct nfs_direct_req *dreq,
+						const struct iovec *iov,
+						unsigned long nr_segs,
+						int pages,
+						loff_t pos, int sync)
+{
+	struct nfs_open_context *ctx = dreq->ctx;
+	struct inode *inode = ctx->path.dentry->d_inode;
+	unsigned long user_addr = (unsigned long)iov->iov_base;
+	size_t bytes = pages << PAGE_SHIFT;
+	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_cred = ctx->cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = NFS_CLIENT(inode),
+		.rpc_message = &msg,
+		.callback_ops = &nfs_write_direct_ops,
+		.workqueue = nfsiod_workqueue,
+		.flags = RPC_TASK_ASYNC,
+	};
+	unsigned int pgbase;
+	int result;
+	int seg;
+	int mapped = 0;
+	int nr_pages;
+	ssize_t started = 0;
+	struct nfs_write_data *data;
+
+	result = -ENOMEM;
+	data = nfs_writedata_alloc(pages);
+	if (unlikely(!data))
+		return result;
+
+	pgbase = user_addr & ~PAGE_MASK;
+
+	for (seg = 0; seg < nr_segs; seg++) {
+		user_addr = (unsigned long)iov[seg].iov_base;
+		nr_pages = iov[seg].iov_len >> PAGE_SHIFT;
+
+		down_read(&current->mm->mmap_sem);
+		result = get_user_pages(current, current->mm, user_addr,
+				nr_pages, 0, 0, &data->pagevec[mapped], NULL);
+		up_read(&current->mm->mmap_sem);
+		if (result < 0) {
+			/* unmap what is done so far */
+			nfs_direct_release_pages(data->pagevec, mapped);
+			nfs_writedata_free(data);
+			return result;
+		}
+		mapped += result;
+	}
+
+	get_dreq(dreq);
+	list_move_tail(&data->pages, &dreq->rewrite_list);
+
+	data->req = (struct nfs_page *) dreq;
+	data->inode = inode;
+	data->cred = msg.rpc_cred;
+	data->args.fh = NFS_FH(inode);
+	data->args.context = ctx;
+	data->args.offset = pos;
+	data->args.pgbase = pgbase;
+	data->args.pages = data->pagevec;
+	data->args.count = bytes;
+	data->args.stable = sync;
+	data->res.fattr = &data->fattr;
+	data->res.count = bytes;
+	data->res.verf = &data->verf;
+	nfs_fattr_init(&data->fattr);
+
+	task_setup_data.task = &data->task;
+	task_setup_data.callback_data = data;
+	msg.rpc_argp = &data->args;
+	msg.rpc_resp = &data->res;
+	NFS_PROTO(inode)->write_setup(data, &msg);
+
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		goto out;
+	rpc_put_task(task);
+
+	started += bytes;
+out:
+	if (started)
+		return started;
+	return result < 0 ? (ssize_t) result : -EFAULT;
+}
+
 static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 					       const struct iovec *iov,
 					       unsigned long nr_segs,
@@ -832,6 +1080,16 @@ static ssize_t nfs_direct_write_schedule
 
 	get_dreq(dreq);
 
+	result = nfs_direct_check_single_io(dreq, iov, nr_segs, 1);
+	if (result) {
+		result = nfs_direct_write_schedule_single_io(dreq, iov, nr_segs,
+							result, pos, sync);
+		if (result < 0)
+			goto out;
+		requested_bytes += result;
+		pos += result;
+		goto out;
+	}
 	for (seg = 0; seg < nr_segs; seg++) {
 		const struct iovec *vec = &iov[seg];
 		result = nfs_direct_write_schedule_segment(dreq, vec,
@@ -853,6 +1111,7 @@ static ssize_t nfs_direct_write_schedule
 		return result < 0 ? result : -EIO;
 	}
 
+out:
 	if (put_dreq(dreq))
 		nfs_direct_write_complete(dreq, dreq->inode);
 	return 0;
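To make the fast-path condition concrete, here is a small worked example of the arithmetic that nfs_direct_check_single_io() boils down to, using the numbers quoted in the cover letter (wsize = 1MB, 4K pages, 64 x 4K iovecs). This is illustration only, not kernel code.

/* Standalone illustration of the fast-path check; mirrors the
 * pages/limit comparison in nfs_direct_check_single_io(). */
#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 12;			/* 4K pages */
	unsigned long wsize = 1024 * 1024;		/* negotiated wsize */
	unsigned long nr_segs = 64, seg_len = 4096;	/* QEMU's 256K request */

	unsigned long pages = nr_segs * (seg_len >> page_shift);	/* 64 */
	unsigned long limit = wsize >> page_shift;			/* 256 */

	if (pages <= limit)
		printf("fast path: one %luK WRITE\n",
		       (pages << page_shift) / 1024);
	else
		printf("fall back: %lu per-segment IOs\n", nr_segs);
	return 0;
}

With these numbers, 64 pages is well under the 256-page limit, so the whole vector goes out as one 256K WRITE; a vectored request larger than wsize (more than 256 pages here) would still fall back to the existing per-segment path.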