Thanks Pavel. On 30/10/24 00:59, Pavel Begunkov wrote: > > > On 10/22/24 15:50, Mark Harmstone wrote: > ... >> +static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, >> + unsigned int issue_flags) >> +{ >> + struct btrfs_uring_priv *priv = >> + *io_uring_cmd_to_pdu(cmd, struct btrfs_uring_priv *); >> + struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); >> + struct extent_io_tree *io_tree = &inode->io_tree; >> + unsigned long i; >> + u64 cur; >> + size_t page_offset; >> + ssize_t ret; >> + >> + if (priv->err) { >> + ret = priv->err; >> + goto out; >> + } >> + >> + if (priv->compressed) { >> + i = 0; >> + page_offset = 0; >> + } else { >> + i = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT; >> + page_offset = offset_in_page(priv->iocb.ki_pos - priv->start); >> + } >> + cur = 0; >> + while (cur < priv->count) { >> + size_t bytes = min_t(size_t, priv->count - cur, >> + PAGE_SIZE - page_offset); >> + >> + if (copy_page_to_iter(priv->pages[i], page_offset, bytes, >> + &priv->iter) != bytes) { > > If that's an iovec backed iter that might fail, you'd need to > steal this patch > > https://lore.kernel.org/all/20241016-fuse-uring-for-6-10-rfc4-v4-12-9739c753666e@xxxxxxx/ > > and fail if "issue_flags & IO_URING_F_TASK_DEAD", see > > https://lore.kernel.org/all/20241016-fuse-uring-for-6-10-rfc4-v4-13-9739c753666e@xxxxxxx/ Thanks, I've sent a patchset including your patch. Does it make a difference, though? If the task has died, presumably copy_page_to_iter can't copy to another process' memory...? >> + ret = -EFAULT; >> + goto out; >> + } >> + >> + i++; >> + cur += bytes; >> + page_offset = 0; >> + } >> + ret = priv->count; >> + >> +out: >> + unlock_extent(io_tree, priv->start, priv->lockend, >> &priv->cached_state); >> + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); > > When called via io_uring_cmd_complete_in_task() this function might > not get run in any reasonable amount of time. 
Even worse, a > misbehaving user can block it until the task dies. > > I don't remember if rwsem allows unlock being executed in a different > task than the pairing lock, but blocking it for that long could be a > problem. I might not remember it right but I think Boris mentioned > that the O_DIRECT path drops the inode lock right after submission > without waiting for bios to complete. Is that right? Can we do it > here as well? We can't release the inode lock until we've released the extent lock. I do intend to look into reducing the amount of time we hold the extent lock, if we can, but it's not trivial to do this in a safe manner. We could perhaps move the unlocking to btrfs_uring_read_extent_endio instead, but it looks like unlocking an rwsem in a different context might cause problems with PREEMPT_RT(?). >> + >> + io_uring_cmd_done(cmd, ret, 0, issue_flags); >> + add_rchar(current, ret); >> + >> + for (unsigned long index = 0; index < priv->nr_pages; index++) >> + __free_page(priv->pages[index]); >> + >> + kfree(priv->pages); >> + kfree(priv->iov); >> + kfree(priv); >> +} >> + >> +void btrfs_uring_read_extent_endio(void *ctx, int err) >> +{ >> + struct btrfs_uring_priv *priv = ctx; >> + >> + priv->err = err; >> + >> + *io_uring_cmd_to_pdu(priv->cmd, struct btrfs_uring_priv *) = priv; > > a nit, I'd suggest to create a temp var, should be easier to > read. It'd even be nicer if you wrap it into a structure > as suggested last time. > > struct io_btrfs_cmd { > struct btrfs_uring_priv *priv; > }; > > struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); > bc->priv = priv; No problem, I've sent a patch for this. 
>> + io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished); >> +} >> + >> +static int btrfs_uring_read_extent(struct kiocb *iocb, struct >> iov_iter *iter, >> + u64 start, u64 lockend, >> + struct extent_state *cached_state, >> + u64 disk_bytenr, u64 disk_io_size, >> + size_t count, bool compressed, >> + struct iovec *iov, >> + struct io_uring_cmd *cmd) >> +{ >> + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); >> + struct extent_io_tree *io_tree = &inode->io_tree; >> + struct page **pages; >> + struct btrfs_uring_priv *priv = NULL; >> + unsigned long nr_pages; >> + int ret; >> + >> + nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); >> + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); >> + if (!pages) >> + return -ENOMEM; >> + ret = btrfs_alloc_page_array(nr_pages, pages, 0); >> + if (ret) { >> + ret = -ENOMEM; >> + goto fail; >> + } >> + >> + priv = kmalloc(sizeof(*priv), GFP_NOFS); >> + if (!priv) { >> + ret = -ENOMEM; >> + goto fail; >> + } >> + >> + priv->iocb = *iocb; >> + priv->iov = iov; >> + priv->iter = *iter; >> + priv->count = count; >> + priv->cmd = cmd; >> + priv->cached_state = cached_state; >> + priv->compressed = compressed; >> + priv->nr_pages = nr_pages; >> + priv->pages = pages; >> + priv->start = start; >> + priv->lockend = lockend; >> + priv->err = 0; >> + >> + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, >> + disk_io_size, pages, >> + priv); >> + if (ret && ret != -EIOCBQUEUED) >> + goto fail; > > Turning both into return EIOCBQUEUED is a bit suspicious, but > I lack context to say. Might make sense to return ret and let > the caller handle it. btrfs_encoded_read_regular_fill_pages returns 0 if the bio completes before the function can finish, -EIOCBQUEUED otherwise. In either case the behaviour of the calling function will be the same. 
>> + >> + /* >> + * If we return -EIOCBQUEUED, we're deferring the cleanup to >> + * btrfs_uring_read_finished, which will handle unlocking the extent >> + * and inode and freeing the allocations. >> + */ >> + >> + return -EIOCBQUEUED; >> + >> +fail: >> + unlock_extent(io_tree, start, lockend, &cached_state); >> + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); >> + kfree(priv); >> + return ret; >> +} >> + >> +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, >> + unsigned int issue_flags) >> +{ >> + size_t copy_end_kernel = offsetofend(struct >> btrfs_ioctl_encoded_io_args, >> + flags); >> + size_t copy_end; >> + struct btrfs_ioctl_encoded_io_args args = { 0 }; >> + int ret; >> + u64 disk_bytenr, disk_io_size; >> + struct file *file = cmd->file; >> + struct btrfs_inode *inode = BTRFS_I(file->f_inode); >> + struct btrfs_fs_info *fs_info = inode->root->fs_info; >> + struct extent_io_tree *io_tree = &inode->io_tree; >> + struct iovec iovstack[UIO_FASTIOV]; >> + struct iovec *iov = iovstack; >> + struct iov_iter iter; >> + loff_t pos; >> + struct kiocb kiocb; >> + struct extent_state *cached_state = NULL; >> + u64 start, lockend; >> + void __user *sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); > > Let's rename it, I was taken aback for a brief second why > you're copy_from_user() from an SQE / the ring, which turns > out to be a user pointer to a btrfs structure. sqe_addr being the addr field in the SQE, not the address of the SQE. I can see how it might be misleading, though. > ... 
>> + ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state, >> + &disk_bytenr, &disk_io_size); >> + if (ret < 0 && ret != -EIOCBQUEUED) >> + goto out_free; >> + >> + file_accessed(file); >> + >> + if (copy_to_user(sqe_addr + copy_end, (char *)&args + >> copy_end_kernel, >> + sizeof(args) - copy_end_kernel)) { >> + if (ret == -EIOCBQUEUED) { >> + unlock_extent(io_tree, start, lockend, &cached_state); >> + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); >> + } >> + ret = -EFAULT; >> + goto out_free; > > It seems we're saving iov in the priv structure, who can access the iovec > after the request is submitted? -EIOCBQUEUED in general means that the > request is submitted and will get completed async, e.g. via callback, and > if the bio callback can use the iov maybe via the iter, this goto will be > a use after free. > > Also, you're returning -EFAULT back to io_uring, which will kill the > io_uring request / cmd while there might still be in flight bios that > can try to access it. > > Can you inject errors into the copy and test please? The bio hasn't been submitted at this point, that happens in btrfs_uring_read_extent. So far all we've done is read the metadata from the page cache. The copy_to_user here is copying the metadata info to the userspace structure. > >> + } >> + >> + if (ret == -EIOCBQUEUED) { >> + u64 count; >> + >> + /* >> + * If we've optimized things by storing the iovecs on the stack, >> + * undo this. 
>> + */ >> + if (!iov) { >> + iov = kmalloc(sizeof(struct iovec) * args.iovcnt, >> + GFP_NOFS); >> + if (!iov) { >> + unlock_extent(io_tree, start, lockend, >> + &cached_state); >> + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); >> + ret = -ENOMEM; >> + goto out_acct; >> + } >> + >> + memcpy(iov, iovstack, >> + sizeof(struct iovec) * args.iovcnt); >> + } >> + >> + count = min_t(u64, iov_iter_count(&iter), disk_io_size); >> + >> + /* Match ioctl by not returning past EOF if uncompressed */ >> + if (!args.compression) >> + count = min_t(u64, count, args.len); >> + >> + ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend, >> + cached_state, disk_bytenr, >> + disk_io_size, count, >> + args.compression, iov, cmd); >> + >> + goto out_acct; >> + } >> + >> +out_free: >> + kfree(iov); >> + >> +out_acct: >> + if (ret > 0) >> + add_rchar(current, ret); >> + inc_syscr(current); >> + >> + return ret; >> +} >