On Sat, Oct 29, 2016 at 10:47:58AM -0700, Linus Torvalds wrote: > On Sat, Oct 29, 2016 at 8:20 AM, Christoph Hellwig <hch@xxxxxx> wrote: > > > > We can't as that would not fix the use after free (at least for the lockdep > > case - otherwise the call is a no-op). Once iter_op returns aio_complete > > might have dropped our reference to the file, and another thread might > > have closed the fd so that the fput from aio_complete was the last one. > > I don't concpetually mind the patch per se, but the repeated > > if (rw == WRITE) { > .. > } > > if (rw == WRITE) { > .. > } > > is just insane and makes the code less legible than it should be. I agree. And I actually prepared a major refactor of the code to get rid of it, but didn't dare to post it this late in the cycle. Patch below, although this recently rebased version is not actually tested, so be aware: --- commit 8bc9e04f941645a610ca46804a95170bdd854450 Author: Christoph Hellwig <hch@xxxxxx> Date: Sun Oct 30 07:27:16 2016 +0100 fs: refactor aio_run_iocb Signed-off-by: Christoph Hellwig <hch@xxxxxx> diff --git a/fs/aio.c b/fs/aio.c index bf315cd..bc5541e 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1403,13 +1403,16 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) return -EINVAL; } -typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *); - -static int aio_setup_vectored_rw(int rw, char __user *buf, size_t len, - struct iovec **iovec, - bool compat, - struct iov_iter *iter) +static int aio_setup_rw(int rw, char __user *buf, size_t len, + struct iovec **iovec, bool vectored, bool compat, + struct iov_iter *iter) { + if (!vectored) { + ssize_t ret = import_single_range(rw, buf, len, *iovec, iter); + iovec = NULL; + return ret; + } + #ifdef CONFIG_COMPAT if (compat) return compat_import_iovec(rw, @@ -1420,110 +1423,100 @@ static int aio_setup_vectored_rw(int rw, char __user *buf, size_t len, len, UIO_FASTIOV, iovec, iter); } -/* - * aio_run_iocb: - * Performs the initial checks and io submission. - */ -static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, - char __user *buf, size_t len, bool compat) +static ssize_t aio_read(struct kiocb *req, char __user *buf, size_t len, + bool vectored, bool compat) { struct file *file = req->ki_filp; - ssize_t ret; - int rw; - fmode_t mode; - rw_iter_op *iter_op; struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; + ssize_t ret; - switch (opcode) { - case IOCB_CMD_PREAD: - case IOCB_CMD_PREADV: - mode = FMODE_READ; - rw = READ; - iter_op = file->f_op->read_iter; - goto rw_common; + if (unlikely(!(file->f_mode & FMODE_READ))) + return -EBADF; + if (unlikely(!file->f_op->read_iter)) + return -EINVAL; - case IOCB_CMD_PWRITE: - case IOCB_CMD_PWRITEV: - mode = FMODE_WRITE; - rw = WRITE; - iter_op = file->f_op->write_iter; - goto rw_common; -rw_common: - if (unlikely(!(file->f_mode & mode))) - return -EBADF; - - if (!iter_op) - return -EINVAL; + ret = aio_setup_rw(READ, buf, len, &iovec, vectored, compat, &iter); + if (ret) + goto out_free_iovec; - if (opcode == IOCB_CMD_PREADV || opcode == IOCB_CMD_PWRITEV) - ret = aio_setup_vectored_rw(rw, buf, len, - &iovec, compat, &iter); - else { - ret = import_single_range(rw, buf, len, iovec, &iter); - iovec = NULL; - } - if (!ret) - ret = rw_verify_area(rw, file, &req->ki_pos, - iov_iter_count(&iter)); - if (ret < 0) { - kfree(iovec); - return ret; - } + ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); + if (ret < 0) + goto out_free_iovec; - if (rw == WRITE) { - file_start_write(file); - req->ki_flags |= IOCB_WRITE; - } + ret = file->f_op->read_iter(req, &iter); +out_free_iovec: + kfree(iovec); + return ret; +} - if (rw == WRITE) { - /* - * We release freeze protection in aio_complete(). Fool - * lockdep by telling it the lock got released so that - * it doesn't complain about held lock when we return - * to userspace. - */ - __sb_writers_release(file_inode(file)->i_sb, - SB_FREEZE_WRITE); - } +static ssize_t aio_write(struct kiocb *req, char __user *buf, size_t len, + bool vectored, bool compat) +{ + struct file *file = req->ki_filp; + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct iov_iter iter; + ssize_t ret; - ret = iter_op(req, &iter); + if (unlikely(!(file->f_mode & FMODE_WRITE))) + return -EBADF; + if (unlikely(!file->f_op->write_iter)) + return -EINVAL; - kfree(iovec); - break; + ret = aio_setup_rw(WRITE, buf, len, &iovec, vectored, compat, &iter); + if (ret) + goto out_free_iovec; - case IOCB_CMD_FDSYNC: - if (!file->f_op->aio_fsync) - return -EINVAL; + ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); + if (ret < 0) + goto out_free_iovec; - ret = file->f_op->aio_fsync(req, 1); - break; + file_start_write(file); + req->ki_flags |= IOCB_WRITE; + ret = file->f_op->write_iter(req, &iter); - case IOCB_CMD_FSYNC: - if (!file->f_op->aio_fsync) - return -EINVAL; + /* + * We release freeze protection in aio_complete(). Fool + * lockdep by telling it the lock got released so that + * it doesn't complain about held lock when we return + * to userspace. + */ + __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); - ret = file->f_op->aio_fsync(req, 0); - break; +out_free_iovec: + kfree(iovec); + return ret; +} +static ssize_t aio_fsync(struct kiocb *req, bool datasync) +{ + struct file *file = req->ki_filp; + + if (!file->f_op->aio_fsync) + return -EINVAL; + return file->f_op->aio_fsync(req, datasync); +} + +static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, + char __user *buf, size_t len, bool compat) +{ + switch (opcode) { + case IOCB_CMD_PREAD: + return aio_read(req, buf, len, false, compat); + case IOCB_CMD_PREADV: + return aio_read(req, buf, len, true, compat); + case IOCB_CMD_PWRITE: + return aio_write(req, buf, len, false, compat); + case IOCB_CMD_PWRITEV: + return aio_write(req, buf, len, true, compat); + case IOCB_CMD_FDSYNC: + return aio_fsync(req, true); + case IOCB_CMD_FSYNC: + return aio_fsync(req, false); default: pr_debug("EINVAL: no operation provided\n"); return -EINVAL; } - - if (ret != -EIOCBQUEUED) { - /* - * There's no easy way to restart the syscall since other AIO's - * may be already running. Just fail this IO with EINTR. - */ - if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || - ret == -ERESTARTNOHAND || - ret == -ERESTART_RESTARTBLOCK)) - ret = -EINTR; - aio_complete(req, ret, 0); - } - - return 0; } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, @@ -1591,8 +1584,17 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, (char __user *)(unsigned long)iocb->aio_buf, iocb->aio_nbytes, compat); - if (ret) - goto out_put_req; + if (ret != -EIOCBQUEUED) { + /* + * There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. + */ + if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || + ret == -ERESTARTNOHAND || + ret == -ERESTART_RESTARTBLOCK)) + ret = -EINTR; + aio_complete(&req->common, ret, 0); + } return 0; out_put_req: -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html