On Tue, Feb 06, 2024 at 03:47:18PM -0800, Bart Van Assche wrote:
> Calling kiocb_set_cancel_fn() without knowing whether the caller
> submitted a struct kiocb or a struct aio_kiocb is unsafe. Fix this by
> introducing the cancel_kiocb() method in struct file_operations. The
> following call trace illustrates that without this patch an
> out-of-bounds write happens if I/O is submitted by io_uring (from a
> phone with an ARM CPU and kernel 6.1):
>
> WARNING: CPU: 3 PID: 368 at fs/aio.c:598 kiocb_set_cancel_fn+0x9c/0xa8
> Call trace:
>  kiocb_set_cancel_fn+0x9c/0xa8
>  ffs_epfile_read_iter+0x144/0x1d0
>  io_read+0x19c/0x498
>  io_issue_sqe+0x118/0x27c
>  io_submit_sqes+0x25c/0x5fc
>  __arm64_sys_io_uring_enter+0x104/0xab0
>  invoke_syscall+0x58/0x11c
>  el0_svc_common+0xb4/0xf4
>  do_el0_svc+0x2c/0xb0
>  el0_svc+0x2c/0xa4
>  el0t_64_sync_handler+0x68/0xb4
>  el0t_64_sync+0x1a4/0x1a8
>
> The following patch has been used as the basis of this patch: Christoph
> Hellwig, "[PATCH 08/32] aio: replace kiocb_set_cancel_fn with a
> cancel_kiocb file operation", May 2018
> (https://lore.kernel.org/all/20180515194833.6906-9-hch@xxxxxx/).

What's changed that voids Al's objections on that patch from 2018?
Specifically

https://lore.kernel.org/all/20180406021553.GS30522@xxxxxxxxxxxxxxxxxx
https://lore.kernel.org/all/20180520052720.GY30522@xxxxxxxxxxxxxxxxxx
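
For anyone else following along, my reading of the out-of-bounds write:
the removed kiocb_set_cancel_fn() below does
container_of(iocb, struct aio_kiocb, rw), which is only valid when the
kiocb really is embedded in a struct aio_kiocb. io_uring embeds its
kiocb in a different container, so the subsequent stores to ki_list and
ki_cancel land on whatever happens to follow the kiocb in io_uring's
request. A userspace toy that shows the pointer arithmetic going wrong;
the struct layouts and the io_rw_like name are illustrative only, not
the real kernel ones:

        #include <stddef.h>
        #include <stdio.h>

        #define container_of(ptr, type, member) \
                ((type *)((char *)(ptr) - offsetof(type, member)))

        struct kiocb { int ki_flags; };

        /* What fs/aio.c assumes the kiocb is embedded in. */
        struct aio_kiocb {
                struct kiocb rw;
                void *ki_ctx;           /* stand-in for struct kioctx * */
        };

        /* What an io_uring-style submitter embeds it in instead. */
        struct io_rw_like {
                long other_state[4];
                struct kiocb kiocb;     /* not at the aio_kiocb offset */
        };

        int main(void)
        {
                struct io_rw_like req = { { 0 } };
                struct kiocb *iocb = &req.kiocb;

                /* The cast the removed helper performed: */
                struct aio_kiocb *bogus =
                        container_of(iocb, struct aio_kiocb, rw);

                /* 'bogus' points into the middle of 'req', not at an
                 * aio_kiocb; stores through its members overwrite
                 * unrelated io_uring request state. */
                printf("iocb=%p bogus=%p\n", (void *)iocb, (void *)bogus);
                return 0;
        }
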
>
> Cc: Christoph Hellwig <hch@xxxxxx>
> Cc: Avi Kivity <avi@xxxxxxxxxxxx>
> Cc: Sandeep Dhavale <dhavale@xxxxxxxxxx>
> Cc: Jens Axboe <axboe@xxxxxxxxx>
> Cc: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>
> Signed-off-by: Bart Van Assche <bvanassche@xxxxxxx>
> ---
>  drivers/usb/gadget/function/f_fs.c | 10 +---
>  drivers/usb/gadget/legacy/inode.c  |  5 +-
>  fs/aio.c                           | 93 +++++++++++++++++-------------
>  include/linux/aio.h                |  7 ---
>  include/linux/fs.h                 |  1 +
>  include/linux/mm.h                 |  5 ++
>  include/linux/mm_inline.h          |  5 ++
>  7 files changed, 68 insertions(+), 58 deletions(-)
>
> diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
> index 6bff6cb93789..adc00689fe3b 100644
> --- a/drivers/usb/gadget/function/f_fs.c
> +++ b/drivers/usb/gadget/function/f_fs.c
> @@ -31,7 +31,6 @@
>  #include <linux/usb/composite.h>
>  #include <linux/usb/functionfs.h>
>
> -#include <linux/aio.h>
>  #include <linux/kthread.h>
>  #include <linux/poll.h>
>  #include <linux/eventfd.h>
> @@ -1157,7 +1156,7 @@ ffs_epfile_open(struct inode *inode, struct file *file)
>  	return stream_open(inode, file);
>  }
>
> -static int ffs_aio_cancel(struct kiocb *kiocb)
> +static int ffs_epfile_cancel_kiocb(struct kiocb *kiocb)
>  {
>  	struct ffs_io_data *io_data = kiocb->private;
>  	struct ffs_epfile *epfile = kiocb->ki_filp->private_data;
> @@ -1198,9 +1197,6 @@ static ssize_t ffs_epfile_write_iter(struct kiocb *kiocb, struct iov_iter *from)
>
>  	kiocb->private = p;
>
> -	if (p->aio)
> -		kiocb_set_cancel_fn(kiocb, ffs_aio_cancel);
> -
>  	res = ffs_epfile_io(kiocb->ki_filp, p);
>  	if (res == -EIOCBQUEUED)
>  		return res;
> @@ -1242,9 +1238,6 @@ static ssize_t ffs_epfile_read_iter(struct kiocb *kiocb, struct iov_iter *to)
>
>  	kiocb->private = p;
>
> -	if (p->aio)
> -		kiocb_set_cancel_fn(kiocb, ffs_aio_cancel);
> -
>  	res = ffs_epfile_io(kiocb->ki_filp, p);
>  	if (res == -EIOCBQUEUED)
>  		return res;
> @@ -1356,6 +1349,7 @@ static const struct file_operations ffs_epfile_operations = {
>  	.release = ffs_epfile_release,
>  	.unlocked_ioctl = ffs_epfile_ioctl,
>  	.compat_ioctl = compat_ptr_ioctl,
> +	.cancel_kiocb = ffs_epfile_cancel_kiocb,
>  };
>
>
> diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
> index 03179b1880fd..a4c03cfb3baa 100644
> --- a/drivers/usb/gadget/legacy/inode.c
> +++ b/drivers/usb/gadget/legacy/inode.c
> @@ -22,7 +22,6 @@
>  #include <linux/slab.h>
>  #include <linux/poll.h>
>  #include <linux/kthread.h>
> -#include <linux/aio.h>
>  #include <linux/uio.h>
>  #include <linux/refcount.h>
>  #include <linux/delay.h>
> @@ -446,7 +445,7 @@ struct kiocb_priv {
>  	unsigned actual;
>  };
>
> -static int ep_aio_cancel(struct kiocb *iocb)
> +static int ep_cancel_kiocb(struct kiocb *iocb)
>  {
>  	struct kiocb_priv *priv = iocb->private;
>  	struct ep_data *epdata;
> @@ -537,7 +536,6 @@ static ssize_t ep_aio(struct kiocb *iocb,
>  	iocb->private = priv;
>  	priv->iocb = iocb;
>
> -	kiocb_set_cancel_fn(iocb, ep_aio_cancel);
>  	get_ep(epdata);
>  	priv->epdata = epdata;
>  	priv->actual = 0;
> @@ -709,6 +707,7 @@ static const struct file_operations ep_io_operations = {
>  	.unlocked_ioctl = ep_ioctl,
>  	.read_iter = ep_read_iter,
>  	.write_iter = ep_write_iter,
> +	.cancel_kiocb = ep_cancel_kiocb,
>  };
>
>  /* ENDPOINT INITIALIZATION
> diff --git a/fs/aio.c b/fs/aio.c
> index bb2ff48991f3..0b13d6be773d 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -203,7 +203,6 @@ struct aio_kiocb {
>  	};
>
>  	struct kioctx *ki_ctx;
> -	kiocb_cancel_fn *ki_cancel;
>
>  	struct io_event ki_res;
>
> @@ -587,22 +586,6 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
>  #define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
>  #define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
>
> -void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
> -{
> -	struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);
> -	struct kioctx *ctx = req->ki_ctx;
> -	unsigned long flags;
> -
> -	if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
> -		return;
> -
> -	spin_lock_irqsave(&ctx->ctx_lock, flags);
> -	list_add_tail(&req->ki_list, &ctx->active_reqs);
> -	req->ki_cancel = cancel;
> -	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
> -}
> -EXPORT_SYMBOL(kiocb_set_cancel_fn);
> -
>  /*
>   * free_ioctx() should be RCU delayed to synchronize against the RCU
>   * protected lookup_ioctx() and also needs process context to call
> @@ -634,6 +617,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
>  	queue_rcu_work(system_wq, &ctx->free_rwork);
>  }
>
> +static void aio_cancel_and_del(struct aio_kiocb *req);
> +
>  /*
>   * When this function runs, the kioctx has been removed from the "hash table"
>   * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
> @@ -649,8 +634,7 @@ static void free_ioctx_users(struct percpu_ref *ref)
>  	while (!list_empty(&ctx->active_reqs)) {
>  		req = list_first_entry(&ctx->active_reqs,
>  				       struct aio_kiocb, ki_list);
> -		req->ki_cancel(&req->rw);
> -		list_del_init(&req->ki_list);
> +		aio_cancel_and_del(req);
>  	}
>
>  	spin_unlock_irq(&ctx->ctx_lock);
> @@ -1556,6 +1540,20 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
>  {
>  	switch (ret) {
>  	case -EIOCBQUEUED:
> +		/*
> +		 * If the .cancel_kiocb() callback has been set, add the request
> +		 * to the list of active requests.
> +		 */
> +		if (req->ki_filp->f_op->cancel_kiocb) {
> +			struct aio_kiocb *iocb =
> +				container_of(req, struct aio_kiocb, rw);
> +			struct kioctx *ctx = iocb->ki_ctx;
> +			unsigned long flags;
> +
> +			spin_lock_irqsave(&ctx->ctx_lock, flags);
> +			list_add_tail(&iocb->ki_list, &ctx->active_reqs);
> +			spin_unlock_irqrestore(&ctx->ctx_lock, flags);
> +		}
>  		break;
>  	case -ERESTARTSYS:
>  	case -ERESTARTNOINTR:
> @@ -1715,6 +1713,41 @@ static void poll_iocb_unlock_wq(struct poll_iocb *req)
>  	rcu_read_unlock();
>  }
>
> +/* assumes we are called with irqs disabled */
> +static int aio_poll_cancel(struct aio_kiocb *aiocb)
> +{
> +	struct poll_iocb *req = &aiocb->poll;
> +	struct kioctx *ctx = aiocb->ki_ctx;
> +
> +	lockdep_assert_held(&ctx->ctx_lock);
> +
> +	if (!poll_iocb_lock_wq(req)) {
> +		/* Not a polled request or already cancelled. */
> +		return 0;
> +	}
> +
> +	WRITE_ONCE(req->cancelled, true);
> +	if (!req->work_scheduled) {
> +		schedule_work(&aiocb->poll.work);
> +		req->work_scheduled = true;
> +	}
> +	poll_iocb_unlock_wq(req);
> +
> +	return 0;
> +}
> +
> +static void aio_cancel_and_del(struct aio_kiocb *req)
> +{
> +	struct kioctx *ctx = req->ki_ctx;
> +
> +	lockdep_assert_held(&ctx->ctx_lock);
> +
> +	if (req->rw.ki_filp->f_op->cancel_kiocb)
> +		req->rw.ki_filp->f_op->cancel_kiocb(&req->rw);
> +	aio_poll_cancel(req);
> +	list_del_init(&req->ki_list);
> +}
> +
>  static void aio_poll_complete_work(struct work_struct *work)
>  {
>  	struct poll_iocb *req = container_of(work, struct poll_iocb, work);
> @@ -1760,24 +1793,6 @@ static void aio_poll_complete_work(struct work_struct *work)
>  	iocb_put(iocb);
>  }
>
> -/* assumes we are called with irqs disabled */
> -static int aio_poll_cancel(struct kiocb *iocb)
> -{
> -	struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
> -	struct poll_iocb *req = &aiocb->poll;
> -
> -	if (poll_iocb_lock_wq(req)) {
> -		WRITE_ONCE(req->cancelled, true);
> -		if (!req->work_scheduled) {
> -			schedule_work(&aiocb->poll.work);
> -			req->work_scheduled = true;
> -		}
> -		poll_iocb_unlock_wq(req);
> -	} /* else, the request was force-cancelled by POLLFREE already */
> -
> -	return 0;
> -}
> -
>  static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
>  			 void *key)
>  {
> @@ -1945,7 +1960,6 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
>  		 * active_reqs so that it can be cancelled if needed.
>  		 */
>  		list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
> -		aiocb->ki_cancel = aio_poll_cancel;
>  	}
>  	if (on_queue)
>  		poll_iocb_unlock_wq(req);
> @@ -2189,8 +2203,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
>  	/* TODO: use a hash or array, this sucks. */
>  	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
>  		if (kiocb->ki_res.obj == obj) {
> -			ret = kiocb->ki_cancel(&kiocb->rw);
> -			list_del_init(&kiocb->ki_list);
> +			aio_cancel_and_del(kiocb);
>  			break;
>  		}
>  	}
> diff --git a/include/linux/aio.h b/include/linux/aio.h
> index 86892a4fe7c8..2aa6d0be3171 100644
> --- a/include/linux/aio.h
> +++ b/include/linux/aio.h
> @@ -4,20 +4,13 @@
>
>  #include <linux/aio_abi.h>
>
> -struct kioctx;
> -struct kiocb;
>  struct mm_struct;
>
> -typedef int (kiocb_cancel_fn)(struct kiocb *);
> -
>  /* prototypes */
>  #ifdef CONFIG_AIO
>  extern void exit_aio(struct mm_struct *mm);
> -void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel);
>  #else
>  static inline void exit_aio(struct mm_struct *mm) { }
> -static inline void kiocb_set_cancel_fn(struct kiocb *req,
> -				       kiocb_cancel_fn *cancel) { }
>  #endif /* CONFIG_AIO */
>
>  #endif /* __LINUX__AIO_H */
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index ed5966a70495..36cd982c167d 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2021,6 +2021,7 @@ struct file_operations {
>  	int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
>  	int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
>  				unsigned int poll_flags);
> +	int (*cancel_kiocb)(struct kiocb *);
>  } __randomize_layout;
>
>  /* Wrap a directory iterator that needs exclusive inode access */
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index f5a97dec5169..7c05464c0ad0 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1092,6 +1092,11 @@ static inline unsigned int folio_order(struct folio *folio)
>  	return folio->_flags_1 & 0xff;
>  }
>
> +/*
> + * Include <linux/fs.h> to work around a circular dependency between
> + * <linux/fs.h> and <linux/huge_mm.h>.
> + */
> +#include <linux/fs.h>
>  #include <linux/huge_mm.h>
>
>  /*
> diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
> index f4fe593c1400..4eb96f08ec4f 100644
> --- a/include/linux/mm_inline.h
> +++ b/include/linux/mm_inline.h
> @@ -3,6 +3,11 @@
>  #define LINUX_MM_INLINE_H
>
>  #include <linux/atomic.h>
> +/*
> + * Include <linux/fs.h> to work around a circular dependency between
> + * <linux/fs.h> and <linux/huge_mm.h>.
> + */
> +#include <linux/fs.h>
>  #include <linux/huge_mm.h>
>  #include <linux/mm_types.h>
>  #include <linux/swap.h>
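
Interface-wise this does look simpler for drivers. If I read the f_fs.c
and inode.c conversions above correctly, a driver now only supplies the
method and fs/aio.c does the active_reqs bookkeeping under ctx_lock. A
minimal sketch of the driver side (the foo_* names are hypothetical,
condensed from the two gadget conversions; foo_abort_io() and the iter
methods are assumed to exist elsewhere in the driver):

        /* Abort the in-flight request stashed in kiocb->private.
         * Called by fs/aio.c with ctx->ctx_lock held; the aio core
         * unlinks the request from active_reqs afterwards. */
        static int foo_cancel_kiocb(struct kiocb *kiocb)
        {
                struct foo_io_data *io_data = kiocb->private;

                return foo_abort_io(io_data);
        }

        static const struct file_operations foo_fops = {
                .read_iter      = foo_read_iter,
                .write_iter     = foo_write_iter,
                .cancel_kiocb   = foo_cancel_kiocb,
        };

i.e. no kiocb_set_cancel_fn() call in the submission path, and hence no
way left to apply the aio_kiocb container_of() to an io_uring kiocb.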