On 1/29/19 4:08 PM, Jann Horn wrote:
> On Wed, Jan 30, 2019 at 12:06 AM Jens Axboe <axboe@xxxxxxxxx> wrote:
>> On 1/29/19 4:03 PM, Jann Horn wrote:
>>> On Tue, Jan 29, 2019 at 11:56 PM Jens Axboe <axboe@xxxxxxxxx> wrote:
>>>> On 1/29/19 3:44 PM, Jann Horn wrote:
>>>>> On Tue, Jan 29, 2019 at 8:27 PM Jens Axboe <axboe@xxxxxxxxx> wrote:
>>>>>> If we have fixed user buffers, we can map them into the kernel when we
>>>>>> setup the io_context. That avoids the need to do get_user_pages() for
>>>>>> each and every IO.
>>>>>>
>>>>>> To utilize this feature, the application must call io_uring_register()
>>>>>> after having setup an io_uring context, passing in
>>>>>> IORING_REGISTER_BUFFERS as the opcode. The argument must be a pointer
>>>>>> to an iovec array, and the nr_args should contain how many iovecs the
>>>>>> application wishes to map.
>>>>>>
>>>>>> If successful, these buffers are now mapped into the kernel, eligible
>>>>>> for IO. To use these fixed buffers, the application must use the
>>>>>> IORING_OP_READ_FIXED and IORING_OP_WRITE_FIXED opcodes, and then
>>>>>> set sqe->index to the desired buffer index. sqe->addr..sqe->addr+sqe->len
>>>>>> must point to somewhere inside the indexed buffer.
>>>>>>
>>>>>> The application may register buffers throughout the lifetime of the
>>>>>> io_uring context. It can call io_uring_register() with
>>>>>> IORING_UNREGISTER_BUFFERS as the opcode to unregister the current set of
>>>>>> buffers, and then register a new set. The application need not
>>>>>> unregister buffers explicitly before shutting down the io_uring context.
>>> [...]
>>>>>> +        imu = &ctx->user_bufs[index];
>>>>>> +        buf_addr = READ_ONCE(sqe->addr);
>>>>>> +        if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
>>>>>
>>>>> This can wrap around if `buf_addr` or `len` is very big, right? Then
>>>>> you e.g. get past the first check because `buf_addr` is sufficiently
>>>>> big, and get past the second check because `buf_addr + len` wraps
>>>>> around and becomes small.
>>>>
>>>> Good point. I wonder if we have a verification helper for something like
>>>> this?
>>>
>>> check_add_overflow() exists, I guess that might help a bit. I don't
>>> think I've seen a more specific helper for this situation.
>>
>> Hmm, not super appropriate. How about something ala:
>>
>> if (buf_addr + len < buf_addr)
>>         ... overflow ...
>>
>> ?
>
> Sure, sounds good.

Just folded in this incremental, which should fix all the issues outlined
in your email.
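For reference, here is a minimal standalone sketch of the wrap-around check
settled on above. The helper name and the userspace types are illustrative
only and not part of the series; it just shows why the extra test is needed
before the existing range comparison:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/*
 * Hypothetical helper mirroring the bounds check in io_import_fixed().
 * Returns true if [buf_addr, buf_addr + len) lies entirely inside the
 * registered buffer [ubuf, ubuf + ubuf_len).
 */
bool fixed_buf_in_range(uint64_t buf_addr, size_t len,
                        uint64_t ubuf, size_t ubuf_len)
{
        /*
         * Without this, a huge len can wrap buf_addr + len around to a
         * small value and slip past the upper-bound test below.
         */
        if (buf_addr + len < buf_addr)
                return false;

        return buf_addr >= ubuf && buf_addr + len <= ubuf + ubuf_len;
}

The incremental below adds the same idiom directly to io_import_fixed().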
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7364feebafed..d42541357969 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -751,7 +751,7 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 {
         size_t len = READ_ONCE(sqe->len);
         struct io_mapped_ubuf *imu;
-        int buf_index, index;
+        unsigned index, buf_index;
         size_t offset;
         u64 buf_addr;
 
@@ -763,9 +763,12 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
         if (unlikely(buf_index >= ctx->nr_user_bufs))
                 return -EFAULT;
 
-        index = array_index_nospec(buf_index, ctx->sq_entries);
+        index = array_index_nospec(buf_index, ctx->nr_user_bufs);
         imu = &ctx->user_bufs[index];
         buf_addr = READ_ONCE(sqe->addr);
+
+        if (buf_addr + len < buf_addr)
+                return -EFAULT;
         if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
                 return -EFAULT;
 
@@ -1602,6 +1605,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
 
 static int io_sq_thread(void *data)
 {
+        struct io_uring_sqe lsqe[IO_IOPOLL_BATCH];
         struct sqe_submit sqes[IO_IOPOLL_BATCH];
         struct io_ring_ctx *ctx = data;
         struct mm_struct *cur_mm = NULL;
@@ -1701,6 +1705,14 @@ static int io_sq_thread(void *data)
                 i = 0;
                 all_fixed = true;
                 do {
+                        /*
+                         * Ensure sqe is stable between checking if we need
+                         * user access, and actually importing the iovec
+                         * further down the stack.
+                         */
+                        memcpy(&lsqe[i], sqes[i].sqe, sizeof(lsqe[i]));
+                        sqes[i].sqe = &lsqe[i];
+
                         if (all_fixed && io_sqe_needs_user(sqes[i].sqe))
                                 all_fixed = false;
 
@@ -2081,7 +2093,7 @@ static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
         struct iovec __user *src;
 
 #ifdef CONFIG_COMPAT
-        if (in_compat_syscall()) {
+        if (ctx->compat) {
                 struct compat_iovec __user *ciovs;
                 struct compat_iovec ciov;
 
@@ -2103,7 +2115,6 @@ static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
                                   unsigned nr_args)
 {
-        struct vm_area_struct **vmas = NULL;
         struct page **pages = NULL;
         int i, j, got_pages = 0;
         int ret = -EINVAL;
@@ -2138,7 +2149,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
                  * submitted if they are wrong.
                  */
                 ret = -EFAULT;
-                if (!iov.iov_base)
+                if (!iov.iov_base || !iov.iov_len)
                         goto err;
 
                 /* arbitrary limit, but we need something */
@@ -2155,14 +2166,10 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
                         goto err;
 
                 if (!pages || nr_pages > got_pages) {
-                        kfree(vmas);
                         kfree(pages);
                         pages = kmalloc_array(nr_pages, sizeof(struct page *),
                                                 GFP_KERNEL);
-                        vmas = kmalloc_array(nr_pages,
-                                        sizeof(struct vma_area_struct *),
-                                        GFP_KERNEL);
-                        if (!pages || !vmas) {
+                        if (!pages) {
                                 io_unaccount_mem(ctx, nr_pages);
                                 goto err;
                         }
@@ -2176,32 +2183,18 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
                         goto err;
                 }
 
-                down_write(&current->mm->mmap_sem);
-                pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE,
-                                                pages, vmas);
-                if (pret == nr_pages) {
-                        /* don't support file backed memory */
-                        for (j = 0; j < nr_pages; j++) {
-                                struct vm_area_struct *vma = vmas[j];
+                down_read(&current->mm->mmap_sem);
+                pret = get_user_pages_longterm(ubuf, nr_pages,
+                                                FOLL_WRITE | FOLL_ANON, pages,
+                                                NULL);
+                up_read(&current->mm->mmap_sem);
 
-                                if (vma->vm_file) {
-                                        ret = -EOPNOTSUPP;
-                                        break;
-                                }
-                        }
-                } else {
-                        ret = pret < 0 ? pret : -EFAULT;
-                }
-                up_write(&current->mm->mmap_sem);
-                if (ret) {
-                        /*
-                         * if we did partial map, or found file backed vmas,
-                         * release any pages we did get
-                         */
+                if (pret != nr_pages) {
                         if (pret > 0) {
                                 for (j = 0; j < pret; j++)
                                         put_page(pages[j]);
                         }
+                        ret = pret < 0 ? pret : -EFAULT;
                         io_unaccount_mem(ctx, nr_pages);
                         goto err;
                 }
@@ -2224,12 +2217,10 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
                 imu->nr_bvecs = nr_pages;
         }
         kfree(pages);
-        kfree(vmas);
         ctx->nr_user_bufs = nr_args;
         return 0;
 err:
         kfree(pages);
-        kfree(vmas);
         io_sqe_buffer_unregister(ctx);
         return ret;
 }

-- 
Jens Axboe