A simple webserver shows about a 5% loss compared to linux-aio.
I suspect the loss is due to an optimization that io_uring lacks:
inline completion rather than workqueue completion.
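
For reference, here is a minimal sketch of the userspace side that
drives this path. This is my own illustration, not the benchmark: it
submits an IOCB_CMD_POLL request through the raw aio syscalls (so it
assumes a kernel new enough to have IOCB_CMD_POLL, 4.18+), which is
what arms the wakeup handler quoted below.

#include <linux/aio_abi.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

/* Raw syscall wrappers; glibc does not expose the native aio calls. */
static long io_setup(unsigned nr_events, aio_context_t *ctxp)
{
	return syscall(SYS_io_setup, nr_events, ctxp);
}

static long io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp)
{
	return syscall(SYS_io_submit, ctx, nr, iocbpp);
}

static long io_getevents(aio_context_t ctx, long min_nr, long nr,
			 struct io_event *events, struct timespec *timeout)
{
	return syscall(SYS_io_getevents, ctx, min_nr, nr, events, timeout);
}

int main(void)
{
	aio_context_t ctx = 0;
	struct iocb cb;
	struct iocb *cbs[1] = { &cb };
	struct io_event ev;

	if (io_setup(128, &ctx) < 0) {
		perror("io_setup");
		return 1;
	}

	/* One-shot poll on stdin; when the fd's waitqueue fires, the
	 * kernel runs aio_poll_wake() and, ideally, completes inline. */
	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = STDIN_FILENO;
	cb.aio_lio_opcode = IOCB_CMD_POLL;
	cb.aio_buf = POLLIN;	/* requested event mask */

	if (io_submit(ctx, 1, cbs) != 1) {
		perror("io_submit");
		return 1;
	}
	if (io_getevents(ctx, 1, 1, &ev, NULL) == 1)
		printf("revents: 0x%llx\n", (unsigned long long)ev.res);
	return 0;
}

The linux-aio wakeup handler, aio_poll_wake(), then completes the
request inline when it can: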
static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
		void *key)
{
	struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
	__poll_t mask = key_to_poll(key);
	unsigned long flags;

	/* for instances that support it check for an event match first: */
	if (mask && !(mask & req->events))
		return 0;

	/*
	 * Complete the request inline if possible.  This requires that three
	 * conditions be met:
	 *   1. An event mask must have been passed.  If a plain wakeup was done
	 *	instead, then mask == 0 and we have to call vfs_poll() to get
	 *	the events, so inline completion isn't possible.
	 *   2. The completion work must not have already been scheduled.
	 *   3. ctx_lock must not be busy.  We have to use trylock because we
	 *	already hold the waitqueue lock, so this inverts the normal
	 *	locking order.  Use irqsave/irqrestore because not all
	 *	filesystems (e.g. fuse) call this function with IRQs disabled,
	 *	yet IRQs have to be disabled before ctx_lock is obtained.
	 */
	if (mask && !req->work_scheduled &&
	    spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
		struct kioctx *ctx = iocb->ki_ctx;

		list_del_init(&req->wait.entry);
		list_del(&iocb->ki_list);
		iocb->ki_res.res = mangle_poll(mask);
		if (iocb->ki_eventfd && !eventfd_signal_allowed()) {
			iocb = NULL;
			INIT_WORK(&req->work, aio_poll_put_work);
			schedule_work(&req->work);
		}
		spin_unlock_irqrestore(&ctx->ctx_lock, flags);
		if (iocb)
			iocb_put(iocb);
	} else {