On 1/9/19 5:10 AM, Christoph Hellwig wrote: >> index 293733f61594..9ef9987b4192 100644 >> --- a/fs/Makefile >> +++ b/fs/Makefile >> @@ -29,7 +29,7 @@ obj-$(CONFIG_SIGNALFD) += signalfd.o >> obj-$(CONFIG_TIMERFD) += timerfd.o >> obj-$(CONFIG_EVENTFD) += eventfd.o >> obj-$(CONFIG_USERFAULTFD) += userfaultfd.o >> -obj-$(CONFIG_AIO) += aio.o >> +obj-$(CONFIG_AIO) += aio.o io_uring.o > > It is probably worth adding a new config symbol for the uring as no > code is shared with aio. Agreed, done. >> diff --git a/fs/io_uring.c b/fs/io_uring.c >> new file mode 100644 >> index 000000000000..ae2b886282bb >> --- /dev/null >> +++ b/fs/io_uring.c >> @@ -0,0 +1,849 @@ >> +/* >> + * Shared application/kernel submission and completion ring pairs, for >> + * supporting fast/efficient IO. >> + * >> + * Copyright (C) 2019 Jens Axboe >> + */ > > Add an SPDX header to all new files, please. Done >> +struct io_sq_ring { >> + struct io_uring r; >> + u32 ring_mask; >> + u32 ring_entries; >> + u32 dropped; >> + u32 flags; >> + u32 array[0]; >> +}; > > field[0] is a legacy gcc extension, the proper C99+ way is field[]. Fixed >> +struct io_iocb_ring { >> + struct io_sq_ring *ring; >> + unsigned entries; >> + unsigned ring_mask; >> + struct io_uring_iocb *iocbs; >> +}; >> + >> +struct io_event_ring { >> + struct io_cq_ring *ring; >> + unsigned entries; >> + unsigned ring_mask; >> +}; > > Btw, do we really need these structures? It would seem simpler > to just embed them into the containing structure as: > > struct io_sq_ring *sq_ring; > unsigned sq_ring_entries; > unsigned sq_ring_mask; > struct io_uring_iocb *sq_ring_iocbs; > > struct io_cq_ring *cq_ring; > unsigned cq_ring_entries; > unsigned cq_ring_mask; Yeah, I guess we use it directly in so few places that we may as well just get rid of the structs for these. 
> > >> +struct io_ring_ctx { >> + struct percpu_ref refs; >> + >> + unsigned int flags; >> + unsigned int max_reqs; > > max_reqs can probably go away in favour of the sq ring nr_entries > field. Indeed, killed. >> + struct io_iocb_ring sq_ring; >> + struct io_event_ring cq_ring; >> + >> + struct work_struct work; >> + >> + struct { >> + struct mutex uring_lock; >> + } ____cacheline_aligned_in_smp; >> + >> + struct { >> + struct mutex ring_lock; >> + wait_queue_head_t wait; >> + } ____cacheline_aligned_in_smp; >> + >> + struct { >> + spinlock_t completion_lock; >> + } ____cacheline_aligned_in_smp; >> +}; > > Can you take a deep look if we need to keep all of ring_lock, > completion_lock and the later added poll locking? From a quick look > it isn't entirely clear what the locking strategy on the completion > side is. It needs to be documented and can hopefully be simplified. I think we just need to kill ring_lock, it's actually not even used. I'll take a closer look at the locking as well. > >> +struct fsync_iocb { >> + struct work_struct work; >> + struct file *file; >> + bool datasync; >> +}; > > Do we actually need this? Can't we just reuse the later thread > offload for fsync? Maybe just add fsync support once everything else > is done to make that simpler. We can just use the sq thread, but we don't always have that backing. I guess we could create it lazily if an fsync comes in. I'll take a look at adding that as a separate thing. >> +static const struct file_operations io_scqring_fops; >> + >> +static void io_ring_ctx_free(struct work_struct *work); >> +static void io_ring_ctx_ref_free(struct percpu_ref *ref); > > Can you try to avoid needing the forward declaration? (except for the > fops, where we probably need it). I got rid of one of them in my current tree already, I'll see if I can dump the other one. 
>> +static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) >> +{ >> + struct io_ring_ctx *ctx; >> + >> + ctx = kmem_cache_zalloc(ioctx_cachep, GFP_KERNEL); >> + if (!ctx) >> + return NULL; > > Do we really need an explicit slab for the contexts? Not sure, guess it depends on the frequency of them. But I suspect that it won't matter one bit, I'll kill this slab. > >> +static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx) > > Maybe replace the req name with something matching the structure > name? (and more on the structure name later). Makes sense. >> +{ >> + struct io_kiocb *req; >> + >> + req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); >> + if (!req) >> + return NULL; >> + >> + percpu_ref_get(&ctx->refs); >> + req->ki_ctx = ctx; >> + INIT_LIST_HEAD(&req->ki_list); > > We never do a list_empty check on ki_list, so there should be no need > to initialize it. Killed >> +static void io_fill_event(struct io_uring_event *ev, struct io_kiocb *kiocb, >> + long res, unsigned flags) >> +{ >> + ev->index = kiocb->ki_index; >> + ev->res = res; >> + ev->flags = flags; >> +} > > Probably no need for this helper. Killed. Also realized that we're missing a store ordering barrier after filling in 'ev', but before incrementing the ring. >> +static void io_complete_scqring(struct io_kiocb *iocb, long res, unsigned flags) >> +{ >> + io_cqring_fill_event(iocb, res, flags); >> + io_complete_iocb(iocb->ki_ctx, iocb); >> +} > > Probably no need for this helper either. Killed > >> + ret = kiocb_set_rw_flags(req, iocb->rw_flags); >> + if (unlikely(ret)) >> + goto out_fput; >> + >> + /* no one is going to poll for this I/O */ >> + req->ki_flags &= ~IOCB_HIPRI; > > Now that we don't have the aio legacy to deal with should we just > reject IOCB_HIPRI on a non-polled context? Yes I think so, we don't have any legacy behavior to adhere to. 
> >> +static int io_setup_rw(int rw, const struct io_uring_iocb *iocb, >> + struct iovec **iovec, struct iov_iter *iter) >> +{ >> + void __user *buf = (void __user *)(uintptr_t)iocb->addr; >> + size_t ret; >> + >> + ret = import_single_range(rw, buf, iocb->len, *iovec, iter); >> + *iovec = NULL; >> + return ret; >> +} > > Is there any point in supporting non-vectored operations here? Not sure I follow? >> + if (S_ISREG(file_inode(file)->i_mode)) { >> + __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true); >> + __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); >> + } > > Overly long lines. Fixed >> +static int __io_submit_one(struct io_ring_ctx *ctx, >> + const struct io_uring_iocb *iocb, >> + unsigned long ki_index) > > Maybe call this io_ring_submit_one? Or generally find a nice prefix > for all the functions in this file? Agree, some of this is leftover cruft from the aio side. I'll clean it up. >> + f = fdget(fd); >> + if (f.file) { >> + struct io_ring_ctx *ctx; > > Please just return early on failure instead of forcing another level > of indentation. Sure, done. > >> + >> + ctx->sq_ring.iocbs = io_mem_alloc(sizeof(struct io_uring_iocb) * >> + p->sq_entries); > > Use array_size(). Done >> +/* >> + * sys_io_uring_setup: >> + * Sets up an aio uring context, and returns the fd. Applications asks >> + * for a ring size, we return the actual sq/cq ring sizes (among other >> + * things) in the params structure passed in. >> + */ > > Can we drop this odd aio-style comment format? In fact the syscall > documentation probably just belongs into the man page only anyway. > > Same for the uring_enter syscall. Sure, not a big deal to me, dropped. >> +struct io_uring_iocb { > > Should we just call this io_uring_sqe? > >> +/* >> + * IO completion data structure >> + */ >> +struct io_uring_event { >> + __u64 index; /* what iocb this event came from */ >> + __s32 res; /* result code for this event */ >> + __u32 flags; >> +}; > > io_uring_cqe? 
I'm fine with that, I like the symmetry of the names. -- Jens Axboe