Thanks Pavel. On 30/10/24 00:59, Pavel Begunkov wrote: > > > On 10/22/24 15:50, Mark Harmstone wrote: > ... >> +static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, >> + unsigned int issue_flags) >> +{ >> + struct btrfs_uring_priv *priv = >> + *io_uring_cmd_to_pdu(cmd, struct btrfs_uring_priv *); >> + struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); >> + struct extent_io_tree *io_tree = &inode->io_tree; >> + unsigned long i; >> + u64 cur; >> + size_t page_offset; >> + ssize_t ret; >> + >> + if (priv->err) { >> + ret = priv->err; >> + goto out; >> + } >> + >> + if (priv->compressed) { >> + i = 0; >> + page_offset = 0; >> + } else { >> + i = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT; >> + page_offset = offset_in_page(priv->iocb.ki_pos - priv->start); >> + } >> + cur = 0; >> + while (cur < priv->count) { >> + size_t bytes = min_t(size_t, priv->count - cur, >> + PAGE_SIZE - page_offset); >> + >> + if (copy_page_to_iter(priv->pages[i], page_offset, bytes, >> + &priv->iter) != bytes) { > > If that's an iovec backed iter that might fail, you'd need to > steal this patch > > https://lore.kernel.org/all/20241016-fuse-uring-for-6-10-rfc4-v4-12-9739c753666e@xxxxxxx/ > > and fail if "issue_flags & IO_URING_F_TASK_DEAD", see > > https://lore.kernel.org/all/20241016-fuse-uring-for-6-10-rfc4-v4-13-9739c753666e@xxxxxxx/ Thanks, I've sent a patchset including your patch. Does it make a difference, though? If the task has died, presumably copy_page_to_iter can't copy to another process' memory...? >> + ret = -EFAULT; >> + goto out; >> + } >> + >> + i++; >> + cur += bytes; >> + page_offset = 0; >> + } >> + ret = priv->count; >> + >> +out: >> + unlock_extent(io_tree, priv->start, priv->lockend, >> &priv->cached_state); >> + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); > > When called via io_uring_cmd_complete_in_task() this function might > not get run in any reasonable amount of time. 
Even worse, a > misbehaving user can block it until the task dies. > > I don't remember if rwsem allows unlock being executed in a different > task than the pairing lock, but blocking it for that long could be a > problem. I might not remember it right but I think Boris mentioned > that the O_DIRECT path drops the inode lock right after submission > without waiting for bios to complete. Is that right? Can we do it > here as well? We can't release the inode lock until we've released the extent lock. I do intend to look into reducing the amount of time we hold the extent lock, if we can, but it's not trivial to do this in a safe manner. We could perhaps move the unlocking to btrfs_uring_read_extent_endio instead, but it looks like unlocking an rwsem in a different context might cause problems with PREEMPT_RT(?). >> + >> + io_uring_cmd_done(cmd, ret, 0, issue_flags); >> + add_rchar(current, ret); >> + >> + for (unsigned long index = 0; index < priv->nr_pages; index++) >> + __free_page(priv->pages[index]); >> + >> + kfree(priv->pages); >> + kfree(priv->iov); >> + kfree(priv); >> +} >> + >> +void btrfs_uring_read_extent_endio(void *ctx, int err) >> +{ >> + struct btrfs_uring_priv *priv = ctx; >> + >> + priv->err = err; >> + >> + *io_uring_cmd_to_pdu(priv->cmd, struct btrfs_uring_priv *) = priv; > > a nit, I'd suggest to create a temp var, should be easier to > read. It'd even be nicer if you wrap it into a structure > as suggested last time. > > struct io_btrfs_cmd { > struct btrfs_uring_priv *priv; > }; > > struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); > bc->priv = priv; No problem, I've sent a patch for this. 
>> + io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished); >> +} >> + >> +static int btrfs_uring_read_extent(struct kiocb *iocb, struct >> iov_iter *iter, >> + u64 start, u64 lockend, >> + struct extent_state *cached_state, >> + u64 disk_bytenr, u64 disk_io_size, >> + size_t count, bool compressed, >> + struct iovec *iov, >> + struct io_uring_cmd *cmd) >> +{ >> + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); >> + struct extent_io_tree *io_tree = &inode->io_tree; >> + struct page **pages; >> + struct btrfs_uring_priv *priv = NULL; >> + unsigned long nr_pages; >> + int ret; >> + >> + nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); >> + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); >> + if (!pages) >> + return -ENOMEM; >> + ret = btrfs_alloc_page_array(nr_pages, pages, 0); >> + if (ret) { >> + ret = -ENOMEM; >> + goto fail; >> + } >> + >> + priv = kmalloc(sizeof(*priv), GFP_NOFS); >> + if (!priv) { >> + ret = -ENOMEM; >> + goto fail; >> + } >> + >> + priv->iocb = *iocb; >> + priv->iov = iov; >> + priv->iter = *iter; >> + priv->count = count; >> + priv->cmd = cmd; >> + priv->cached_state = cached_state; >> + priv->compressed = compressed; >> + priv->nr_pages = nr_pages; >> + priv->pages = pages; >> + priv->start = start; >> + priv->lockend = lockend; >> + priv->err = 0; >> + >> + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, >> + disk_io_size, pages, >> + priv); >> + if (ret && ret != -EIOCBQUEUED) >> + goto fail; > > Turning both into return EIOCBQUEUED is a bit suspicious, but > I lack context to say. Might make sense to return ret and let > the caller handle it. btrfs_encoded_read_regular_fill_pages returns 0 if the bio completes before the function can finish, -EIOCBQUEUED otherwise. In either case the behaviour of the calling function will be the same. 
>> + >> + /* >> + * If we return -EIOCBQUEUED, we're deferring the cleanup to >> + * btrfs_uring_read_finished, which will handle unlocking the extent >> + * and inode and freeing the allocations. >> + */ >> + >> + return -EIOCBQUEUED; >> + >> +fail: >> + unlock_extent(io_tree, start, lockend, &cached_state); >> + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); >> + kfree(priv); >> + return ret; >> +} >> + >> +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, >> + unsigned int issue_flags) >> +{ >> + size_t copy_end_kernel = offsetofend(struct >> btrfs_ioctl_encoded_io_args, >> + flags); >> + size_t copy_end; >> + struct btrfs_ioctl_encoded_io_args args = { 0 }; >> + int ret; >> + u64 disk_bytenr, disk_io_size; >> + struct file *file = cmd->file; >> + struct btrfs_inode *inode = BTRFS_I(file->f_inode); >> + struct btrfs_fs_info *fs_info = inode->root->fs_info; >> + struct extent_io_tree *io_tree = &inode->io_tree; >> + struct iovec iovstack[UIO_FASTIOV]; >> + struct iovec *iov = iovstack; >> + struct iov_iter iter; >> + loff_t pos; >> + struct kiocb kiocb; >> + struct extent_state *cached_state = NULL; >> + u64 start, lockend; >> + void __user *sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); > > Let's rename it, I was taken aback for a brief second why > you're copy_from_user() from an SQE / the ring, which turns > out to be a user pointer to a btrfs structure. sqe_addr being the addr field in the SQE, not the address of the SQE. I can see how it might be misleading, though. > ... 
>> + ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state, >> + &disk_bytenr, &disk_io_size); >> + if (ret < 0 && ret != -EIOCBQUEUED) >> + goto out_free; >> + >> + file_accessed(file); >> + >> + if (copy_to_user(sqe_addr + copy_end, (char *)&args + >> copy_end_kernel, >> + sizeof(args) - copy_end_kernel)) { >> + if (ret == -EIOCBQUEUED) { >> + unlock_extent(io_tree, start, lockend, &cached_state); >> + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); >> + } >> + ret = -EFAULT; >> + goto out_free; > > It seems we're saving iov in the priv structure, who can access the iovec > after the request is submitted? -EIOCBQUEUED in general means that the > request is submitted and will get completed async, e.g. via callback, and > if the bio callback can use the iov maybe via the iter, this goto will be > a use after free. > > Also, you're returning -EFAULT back to io_uring, which will kill the > io_uring request / cmd while there might still be in flight bios that > can try to access it. > > Can you inject errors into the copy and test please? The bio hasn't been submitted at this point, that happens in btrfs_uring_read_extent. So far all we've done is read the metadata from the page cache. The copy_to_user here is copying the metadata info to the userspace structure. > >> + } >> + >> + if (ret == -EIOCBQUEUED) { >> + u64 count; >> + >> + /* >> + * If we've optimized things by storing the iovecs on the stack, >> + * undo this. 
>> + */ >> + if (!iov) { >> + iov = kmalloc(sizeof(struct iovec) * args.iovcnt, >> + GFP_NOFS); >> + if (!iov) { >> + unlock_extent(io_tree, start, lockend, >> + &cached_state); >> + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); >> + ret = -ENOMEM; >> + goto out_acct; >> + } >> + >> + memcpy(iov, iovstack, >> + sizeof(struct iovec) * args.iovcnt); >> + } >> + >> + count = min_t(u64, iov_iter_count(&iter), disk_io_size); >> + >> + /* Match ioctl by not returning past EOF if uncompressed */ >> + if (!args.compression) >> + count = min_t(u64, count, args.len); >> + >> + ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend, >> + cached_state, disk_bytenr, >> + disk_io_size, count, >> + args.compression, iov, cmd); >> + >> + goto out_acct; >> + } >> + >> +out_free: >> + kfree(iov); >> + >> +out_acct: >> + if (ret > 0) >> + add_rchar(current, ret); >> + inc_syscr(current); >> + >> + return ret; >> +} >