> index 293733f61594..9ef9987b4192 100644
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -29,7 +29,7 @@ obj-$(CONFIG_SIGNALFD) += signalfd.o
>  obj-$(CONFIG_TIMERFD) += timerfd.o
>  obj-$(CONFIG_EVENTFD) += eventfd.o
>  obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
> -obj-$(CONFIG_AIO) += aio.o
> +obj-$(CONFIG_AIO) += aio.o io_uring.o

It is probably worth adding a new config symbol for the uring, as no
code is shared with aio.

> diff --git a/fs/io_uring.c b/fs/io_uring.c
> new file mode 100644
> index 000000000000..ae2b886282bb
> --- /dev/null
> +++ b/fs/io_uring.c
> @@ -0,0 +1,849 @@
> +/*
> + * Shared application/kernel submission and completion ring pairs, for
> + * supporting fast/efficient IO.
> + *
> + * Copyright (C) 2019 Jens Axboe
> + */

Add an SPDX header to all new files, please.

> +struct io_sq_ring {
> +	struct io_uring r;
> +	u32 ring_mask;
> +	u32 ring_entries;
> +	u32 dropped;
> +	u32 flags;
> +	u32 array[0];
> +};

field[0] is a legacy gcc extension, the proper C99+ way is field[].

> +
> +struct io_iocb_ring {
> +	struct io_sq_ring *ring;
> +	unsigned entries;
> +	unsigned ring_mask;
> +	struct io_uring_iocb *iocbs;
> +};
> +
> +struct io_event_ring {
> +	struct io_cq_ring *ring;
> +	unsigned entries;
> +	unsigned ring_mask;
> +};

Btw, do we really need these structures?  It would seem simpler to just
embed them into the containing structure as:

	struct io_sq_ring *sq_ring;
	unsigned sq_ring_entries;
	unsigned sq_ring_mask;
	struct io_uring_iocb *sq_ring_iocbs;

	struct io_cq_ring *cq_ring;
	unsigned cq_ring_entries;
	unsigned cq_ring_mask;

> +struct io_ring_ctx {
> +	struct percpu_ref refs;
> +
> +	unsigned int flags;
> +	unsigned int max_reqs;

max_reqs can probably go away in favour of the sq ring nr_entries
field.

> +	struct io_iocb_ring sq_ring;
> +	struct io_event_ring cq_ring;
> +
> +	struct work_struct work;
> +
> +	struct {
> +		struct mutex uring_lock;
> +	} ____cacheline_aligned_in_smp;
> +
> +	struct {
> +		struct mutex ring_lock;
> +		wait_queue_head_t wait;
> +	} ____cacheline_aligned_in_smp;
> +
> +	struct {
> +		spinlock_t completion_lock;
> +	} ____cacheline_aligned_in_smp;
> +};

Can you take a deep look if we need to keep all of ring_lock,
completion_lock and the later added poll locking?  From a quick look it
isn't entirely clear what the locking strategy on the completion side
is.  It needs to be documented and can hopefully be simplified.

> +struct fsync_iocb {
> +	struct work_struct work;
> +	struct file *file;
> +	bool datasync;
> +};

Do we actually need this?  Can't we just reuse the later thread offload
for fsync?  Maybe just add fsync support once everything else is done
to make that simpler.

> +static const struct file_operations io_scqring_fops;
> +
> +static void io_ring_ctx_free(struct work_struct *work);
> +static void io_ring_ctx_ref_free(struct percpu_ref *ref);

Can you try to avoid needing the forward declarations?  (except for the
fops, where we probably need it).

> 
> +
> +static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
> +{
> +	struct io_ring_ctx *ctx;
> +
> +	ctx = kmem_cache_zalloc(ioctx_cachep, GFP_KERNEL);
> +	if (!ctx)
> +		return NULL;

Do we really need an explicit slab for the contexts?
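Something like this (untested, and assuming nothing in the later
patches relies on having a dedicated cache) would seem to be enough:

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return NULL;

with the matching free then just being a plain kfree().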
> +static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx)

Maybe replace the req name with something matching the structure name?
(and more on the structure name later).

> +{
> +	struct io_kiocb *req;
> +
> +	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
> +	if (!req)
> +		return NULL;
> +
> +	percpu_ref_get(&ctx->refs);
> +	req->ki_ctx = ctx;
> +	INIT_LIST_HEAD(&req->ki_list);

We never do a list_empty check on ki_list, so there should be no need
to initialize it.

> +static void io_fill_event(struct io_uring_event *ev, struct io_kiocb *kiocb,
> +			  long res, unsigned flags)
> +{
> +	ev->index = kiocb->ki_index;
> +	ev->res = res;
> +	ev->flags = flags;
> +}

Probably no need for this helper.

> +static void io_complete_scqring(struct io_kiocb *iocb, long res, unsigned flags)
> +{
> +	io_cqring_fill_event(iocb, res, flags);
> +	io_complete_iocb(iocb->ki_ctx, iocb);
> +}

Probably no need for this helper either.

> +	ret = kiocb_set_rw_flags(req, iocb->rw_flags);
> +	if (unlikely(ret))
> +		goto out_fput;
> +
> +	/* no one is going to poll for this I/O */
> +	req->ki_flags &= ~IOCB_HIPRI;

Now that we don't have the aio legacy to deal with, should we just
reject IOCB_HIPRI on a non-polled context?

> +static int io_setup_rw(int rw, const struct io_uring_iocb *iocb,
> +		       struct iovec **iovec, struct iov_iter *iter)
> +{
> +	void __user *buf = (void __user *)(uintptr_t)iocb->addr;
> +	size_t ret;
> +
> +	ret = import_single_range(rw, buf, iocb->len, *iovec, iter);
> +	*iovec = NULL;
> +	return ret;
> +}

Is there any point in supporting non-vectored operations here?

> +	if (S_ISREG(file_inode(file)->i_mode)) {
> +		__sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true);
> +		__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
> +	}

Overly long lines.

> +static int __io_submit_one(struct io_ring_ctx *ctx,
> +			   const struct io_uring_iocb *iocb,
> +			   unsigned long ki_index)

Maybe call this io_ring_submit_one?  Or generally find a nice prefix
for all the functions in this file?

> +	f = fdget(fd);
> +	if (f.file) {
> +		struct io_ring_ctx *ctx;

Please just return early on failure instead of forcing another level
of indentation.

> +
> +	ctx->sq_ring.iocbs = io_mem_alloc(sizeof(struct io_uring_iocb) *
> +						p->sq_entries);

Use array_size().

> +/*
> + * sys_io_uring_setup:
> + *	Sets up an aio uring context, and returns the fd. Applications asks
> + *	for a ring size, we return the actual sq/cq ring sizes (among other
> + *	things) in the params structure passed in.
> + */

Can we drop this odd aio-style comment format?  In fact the syscall
documentation probably just belongs in the man page anyway.  Same for
the uring_enter syscall.

> +struct io_uring_iocb {

Should we just call this io_uring_sqe?

> +/*
> + * IO completion data structure
> + */
> +struct io_uring_event {
> +	__u64 index;	/* what iocb this event came from */
> +	__s32 res;	/* result code for this event */
> +	__u32 flags;
> +};

io_uring_cqe?
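Just to make that naming suggestion concrete, the completion entry
could then look something like this (same fields as quoted above, only
the names and comments adjusted):

	struct io_uring_cqe {
		__u64 index;	/* what sqe this completion came from */
		__s32 res;	/* result code for this completion */
		__u32 flags;
	};

and io_uring_iocb would become struct io_uring_sqe in the same way.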