The doubling of the indirect calls caused too big a regression for some
benchmarks in our post-Spectre world.  With some of the nastiness in the
networking code moved out of the way, we can instead stick a pointer to a
waitqueue into struct file and avoid one of the indirect calls.  This
actually happens to simplify the code quite a bit as well.

Note that this removes the possibility of returning an error before
waiting in poll.  We could still do this with an ERR_PTR in f_poll_head
with a little bit of WRITE_ONCE/READ_ONCE magic, but I'd like to defer
that until actually required.

Signed-off-by: Christoph Hellwig <hch@xxxxxx>
---
 Documentation/filesystems/Locking |  4 +---
 Documentation/filesystems/vfs.txt | 15 +++++---------
 drivers/char/random.c             |  8 ++++----
 fs/aio.c                          | 22 ++++++---------------
 fs/eventfd.c                      | 33 +++++++++++++++++++------------
 fs/eventpoll.c                    |  9 +--------
 fs/pipe.c                         | 12 +++--------
 fs/select.c                       | 18 ++++-------------
 fs/timerfd.c                      | 31 +++++++++++++++++------------
 include/linux/fs.h                |  2 +-
 include/linux/poll.h              |  7 +------
 net/socket.c                      | 17 +++-------------
 12 files changed, 67 insertions(+), 111 deletions(-)

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 2c391338c675..4d183e8258b6 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -441,7 +441,6 @@ prototypes:
 	int (*iterate) (struct file *, struct dir_context *);
 	int (*iterate_shared) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
-	struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
 	__poll_t (*poll_mask) (struct file *, __poll_t);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
@@ -505,8 +504,7 @@ in sys_read() and friends.
 the lease within the individual filesystem to record the result of the
 operation
 
-->poll_mask can be called with or without the waitqueue lock for the waitqueue
-returned from ->get_poll_head.
+->poll_mask can be called with or without the waitqueue lock
 
 --------------------------- dquot_operations -------------------------------
 prototypes:
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 829a7b7857a4..2d2f07acafa8 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -857,7 +857,6 @@ struct file_operations {
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
 	int (*iterate) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
-	struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
 	__poll_t (*poll_mask) (struct file *, __poll_t);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
@@ -903,16 +902,12 @@ otherwise noted.
 	activity on this file and (optionally) go to sleep until there
 	is activity. Called by the select(2) and poll(2) system calls
 
-  get_poll_head: Returns the struct wait_queue_head that callers can
-	wait on. Callers need to check the returned events using ->poll_mask
-	once woken. Can return NULL to indicate polling is not supported,
-	or any error code using the ERR_PTR convention to indicate that a
-	grave error occured and ->poll_mask shall not be called.
-
   poll_mask: return the mask of EPOLL* values describing the file descriptor
-	state. Called either before going to sleep on the waitqueue returned by
-	get_poll_head, or after it has been woken. If ->get_poll_head and
-	->poll_mask are implemented ->poll does not need to be implement.
+	state. Called either before going to sleep on ->f_poll_head, or after it
+	has been woken. On files that support ->poll_mask the ->f_poll_head field
+	must point to a wait_queue_head that poll can wait on. The waitqueue must
+	have the same (or a longer) lifetime as the struct file itself.
+	If ->poll_mask is implemented, ->poll does not need to be implemented.
 
   unlocked_ioctl: called by the ioctl(2) system call.
 
diff --git a/drivers/char/random.c b/drivers/char/random.c
index a8fb0020ba5c..e6260a9fa3b9 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1875,10 +1875,10 @@ urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 	return ret;
 }
 
-static struct wait_queue_head *
-random_get_poll_head(struct file *file, __poll_t events)
+static int random_open(struct inode *inode, struct file *file)
 {
-	return &random_wait;
+	file->f_poll_head = &random_wait;
+	return 0;
 }
 
 static __poll_t
@@ -1990,9 +1990,9 @@ static int random_fasync(int fd, struct file *filp, int on)
 }
 
 const struct file_operations random_fops = {
+	.open  = random_open,
 	.read  = random_read,
 	.write = random_write,
-	.get_poll_head = random_get_poll_head,
 	.poll_mask = random_poll_mask,
 	.unlocked_ioctl = random_ioctl,
 	.fasync = random_fasync,
diff --git a/fs/aio.c b/fs/aio.c
index e1d20124ec0e..84f498df8afc 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -168,7 +168,6 @@ struct fsync_iocb {
 struct poll_iocb {
 	struct file		*file;
 	__poll_t		events;
-	struct wait_queue_head	*head;
 
 	union {
 		struct wait_queue_entry	wait;
@@ -1632,7 +1631,7 @@ static int aio_poll_cancel(struct kiocb *iocb)
 {
 	struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
 	struct poll_iocb *req = &aiocb->poll;
-	struct wait_queue_head *head = req->head;
+	struct wait_queue_head *head = req->file->f_poll_head;
 	bool found = false;
 
 	spin_lock(&head->lock);
@@ -1655,7 +1654,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 	struct file *file = req->file;
 	__poll_t mask = key_to_poll(key);
 
-	assert_spin_locked(&req->head->lock);
+	assert_spin_locked(&file->f_poll_head->lock);
 
 	/* for instances that support it check for an event match first: */
 	if (mask && !(mask & req->events))
@@ -1703,30 +1702,21 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
 	req->file = fget(iocb->aio_fildes);
 	if (unlikely(!req->file))
 		return -EBADF;
-	if (!file_has_poll_mask(req->file))
+	if (!req->file->f_poll_head)
 		goto out_fail;
 
-	req->head = req->file->f_op->get_poll_head(req->file, req->events);
-	if (!req->head)
-		goto out_fail;
-	if (IS_ERR(req->head)) {
-		mask = EPOLLERR;
-		goto done;
-	}
-
 	init_waitqueue_func_entry(&req->wait, aio_poll_wake);
 	aiocb->ki_cancel = aio_poll_cancel;
 
 	spin_lock_irq(&ctx->ctx_lock);
-	spin_lock(&req->head->lock);
+	spin_lock(&req->file->f_poll_head->lock);
 	mask = req->file->f_op->poll_mask(req->file, req->events) & req->events;
 	if (!mask) {
-		__add_wait_queue(req->head, &req->wait);
+		__add_wait_queue(req->file->f_poll_head, &req->wait);
 		list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
 	}
-	spin_unlock(&req->head->lock);
+	spin_unlock(&req->file->f_poll_head->lock);
 	spin_unlock_irq(&ctx->ctx_lock);
-done:
 	if (mask)
 		__aio_poll_complete(aiocb, mask);
 	return 0;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index ceb1031f1cac..8904b127577b 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -101,14 +101,6 @@ static int eventfd_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static struct wait_queue_head *
-eventfd_get_poll_head(struct file *file, __poll_t events)
-{
-	struct eventfd_ctx *ctx = file->private_data;
-
-	return &ctx->wqh;
-}
-
 static __poll_t eventfd_poll_mask(struct file *file, __poll_t eventmask)
 {
 	struct eventfd_ctx *ctx = file->private_data;
@@ -311,7 +303,6 @@ static const struct file_operations eventfd_fops = {
 	.show_fdinfo	= eventfd_show_fdinfo,
 #endif
 	.release	= eventfd_release,
-	.get_poll_head	= eventfd_get_poll_head,
 	.poll_mask	= eventfd_poll_mask,
 	.read		= eventfd_read,
 	.write		= eventfd_write,
@@ -390,7 +381,8 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
 static int do_eventfd(unsigned int count, int flags)
 {
 	struct eventfd_ctx *ctx;
-	int fd;
+	struct file *file;
+	int fd, error;
 
 	/* Check the EFD_* constants for consistency. */
 	BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
@@ -408,12 +400,27 @@ static int do_eventfd(unsigned int count, int flags)
 	ctx->count = count;
 	ctx->flags = flags;
 
-	fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
+	error = get_unused_fd_flags(O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
+	if (error < 0)
+		goto err_free_ctx;
+	fd = error;
+
+	file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,
 			      O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
-	if (fd < 0)
-		eventfd_free_ctx(ctx);
+	if (IS_ERR(file)) {
+		error = PTR_ERR(file);
+		goto err_put_unused_fd;
+	}
+	file->f_poll_head = &ctx->wqh;
+	fd_install(fd, file);
 
 	return fd;
+
+err_put_unused_fd:
+	put_unused_fd(fd);
+err_free_ctx:
+	eventfd_free_ctx(ctx);
+	return error;
 }
 
 SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index ea4436f409fb..a0d199be91db 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -922,13 +922,6 @@ static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head
 	return 0;
 }
 
-static struct wait_queue_head *ep_eventpoll_get_poll_head(struct file *file,
-		__poll_t eventmask)
-{
-	struct eventpoll *ep = file->private_data;
-	return &ep->poll_wait;
-}
-
 static __poll_t ep_eventpoll_poll_mask(struct file *file, __poll_t eventmask)
 {
 	struct eventpoll *ep = file->private_data;
@@ -972,7 +965,6 @@ static const struct file_operations eventpoll_fops = {
 	.show_fdinfo	= ep_show_fdinfo,
 #endif
 	.release	= ep_eventpoll_release,
-	.get_poll_head	= ep_eventpoll_get_poll_head,
 	.poll_mask	= ep_eventpoll_poll_mask,
 	.llseek		= noop_llseek,
 };
@@ -1973,6 +1965,7 @@ static int do_epoll_create(int flags)
 		goto out_free_fd;
 	}
 	ep->file = file;
+	file->f_poll_head = &ep->poll_wait;
 	fd_install(fd, file);
 	return fd;
 
diff --git a/fs/pipe.c b/fs/pipe.c
index bb0840e234f3..6817a60bebc9 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -509,14 +509,6 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	}
 }
 
-static struct wait_queue_head *
-pipe_get_poll_head(struct file *filp, __poll_t events)
-{
-	struct pipe_inode_info *pipe = filp->private_data;
-
-	return &pipe->wait;
-}
-
 /* No kernel lock held - fine */
 static __poll_t pipe_poll_mask(struct file *filp, __poll_t events)
 {
@@ -768,6 +760,7 @@ int create_pipe_files(struct file **res, int flags)
 
 	f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
 	f->private_data = inode->i_pipe;
+	f->f_poll_head = &inode->i_pipe->wait;
 
 	res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops);
 	if (IS_ERR(res[0])) {
@@ -778,6 +771,7 @@ int create_pipe_files(struct file **res, int flags)
 
 	path_get(&path);
 	res[0]->private_data = inode->i_pipe;
 	res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK);
+	res[0]->f_poll_head = &inode->i_pipe->wait;
 	res[1] = f;
 	return 0;
 
@@ -930,6 +924,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 
 	/* We can only do regular read/write on fifos */
 	filp->f_mode &= (FMODE_READ | FMODE_WRITE);
+	filp->f_poll_head = &pipe->wait;
 
 	switch (filp->f_mode) {
 	case FMODE_READ:
@@ -1023,7 +1018,6 @@ const struct file_operations pipefifo_fops = {
 	.llseek		= no_llseek,
 	.read_iter	= pipe_read,
 	.write_iter	= pipe_write,
-	.get_poll_head	= pipe_get_poll_head,
 	.poll_mask	= pipe_poll_mask,
 	.unlocked_ioctl	= pipe_ioctl,
 	.release	= pipe_release,
diff --git a/fs/select.c b/fs/select.c
index 317891ff8165..26e20063454a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -38,20 +38,10 @@ __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
 {
 	if (file->f_op->poll) {
 		return file->f_op->poll(file, pt);
-	} else if (file_has_poll_mask(file)) {
-		unsigned int events = poll_requested_events(pt);
-		struct wait_queue_head *head;
-
-		if (pt && pt->_qproc) {
-			head = file->f_op->get_poll_head(file, events);
-			if (!head)
-				return DEFAULT_POLLMASK;
-			if (IS_ERR(head))
-				return EPOLLERR;
-			pt->_qproc(file, head, pt);
-		}
-
-		return file->f_op->poll_mask(file, events);
+	} else if (file->f_poll_head) {
+		if (pt && pt->_qproc)
+			pt->_qproc(file, file->f_poll_head, pt);
+		return file->f_op->poll_mask(file, poll_requested_events(pt));
 	} else {
 		return DEFAULT_POLLMASK;
 	}
diff --git a/fs/timerfd.c b/fs/timerfd.c
index d84a2bee4f82..08e820166c4d 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -227,14 +227,6 @@ static int timerfd_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static struct wait_queue_head *timerfd_get_poll_head(struct file *file,
-		__poll_t eventmask)
-{
-	struct timerfd_ctx *ctx = file->private_data;
-
-	return &ctx->wqh;
-}
-
 static __poll_t timerfd_poll_mask(struct file *file, __poll_t eventmask)
 {
 	struct timerfd_ctx *ctx = file->private_data;
@@ -363,7 +355,6 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg
 
 static const struct file_operations timerfd_fops = {
 	.release	= timerfd_release,
-	.get_poll_head	= timerfd_get_poll_head,
 	.poll_mask	= timerfd_poll_mask,
 	.read		= timerfd_read,
 	.llseek		= noop_llseek,
@@ -386,8 +377,9 @@ static int timerfd_fget(int fd, struct fd *p)
 
 SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 {
-	int ufd;
+	int ufd, error;
 	struct timerfd_ctx *ctx;
+	struct file *file;
 
 	/* Check the TFD_* constants for consistency. */
 	BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC);
@@ -424,12 +416,26 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 
 	ctx->moffs = ktime_mono_to_real(0);
 
-	ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
+	error = get_unused_fd_flags(O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS));
+	if (error < 0)
+		goto out_free_ctx;
+	ufd = error;
+
+	file = anon_inode_getfile("[timerfd]", &timerfd_fops, ctx,
 			       O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS));
-	if (ufd < 0)
-		kfree(ctx);
+	if (IS_ERR(file)) {
+		error = PTR_ERR(file);
+		goto err_put_unused_fd;
+	}
+	file->f_poll_head = &ctx->wqh;
+	fd_install(ufd, file);
 
 	return ufd;
+err_put_unused_fd:
+	put_unused_fd(ufd);
+out_free_ctx:
+	kfree(ctx);
+	return error;
 }
 
 static int do_timerfd_settime(int ufd, int flags,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5c91108846db..cc0fb83d3743 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -878,6 +878,7 @@ struct file {
 	loff_t			f_pos;
 	struct fown_struct	f_owner;
 	const struct cred	*f_cred;
+	struct wait_queue_head	*f_poll_head;
 	struct file_ra_state	f_ra;
 
 	u64			f_version;
@@ -1720,7 +1721,6 @@ struct file_operations {
 	int (*iterate) (struct file *, struct dir_context *);
 	int (*iterate_shared) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
-	struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
 	__poll_t (*poll_mask) (struct file *, __poll_t);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
diff --git a/include/linux/poll.h b/include/linux/poll.h
index fdf86b4cbc71..e3dd9dfee20a 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -74,14 +74,9 @@ static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
 	pt->_key   = ~(__poll_t)0; /* all events enabled */
 }
 
-static inline bool file_has_poll_mask(struct file *file)
-{
-	return file->f_op->get_poll_head && file->f_op->poll_mask;
-}
-
 static inline bool file_can_poll(struct file *file)
 {
-	return file->f_op->poll || file_has_poll_mask(file);
+	return file->f_op->poll || file->f_poll_head;
 }
 
 __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt);
diff --git a/net/socket.c b/net/socket.c
index 4354afe8ad96..155d4ba38645 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -117,8 +117,6 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from);
 static int sock_mmap(struct file *file, struct vm_area_struct *vma);
 static int sock_close(struct inode *inode, struct file *file);
-static struct wait_queue_head *sock_get_poll_head(struct file *file,
-		__poll_t events);
 static __poll_t sock_poll_mask(struct file *file, __poll_t);
 static __poll_t sock_poll(struct file *file, struct poll_table_struct *wait);
 static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -143,7 +141,6 @@ static const struct file_operations socket_file_ops = {
 	.llseek =	no_llseek,
 	.read_iter =	sock_read_iter,
 	.write_iter =	sock_write_iter,
-	.get_poll_head = sock_get_poll_head,
 	.poll_mask =	sock_poll_mask,
 	.poll =		sock_poll,
 	.unlocked_ioctl = sock_ioctl,
@@ -422,6 +419,8 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
 
 	sock->file = file;
 	file->f_flags = O_RDWR | (flags & O_NONBLOCK);
+	if (sock->ops->poll_mask)
+		file->f_poll_head = &sock->wq->wait;
 	file->private_data = sock;
 	return file;
 }
@@ -1128,16 +1127,6 @@ int sock_create_lite(int family, int type, int protocol, struct socket **res)
 }
 EXPORT_SYMBOL(sock_create_lite);
 
-static struct wait_queue_head *sock_get_poll_head(struct file *file,
-		__poll_t events)
-{
-	struct socket *sock = file->private_data;
-
-	if (!sock->ops->poll_mask)
-		return NULL;
-	return &sock->wq->wait;
-}
-
 static __poll_t sock_poll_mask(struct file *file, __poll_t events)
 {
 	struct socket *sock = file->private_data;
@@ -1167,7 +1156,7 @@ static __poll_t sock_poll(struct file *file, poll_table *wait)
 	if (sock->ops->poll) {
 		mask = sock->ops->poll(file, sock, wait);
 	} else if (sock->ops->poll_mask) {
-		sock_poll_wait(file, sock_get_poll_head(file, events), wait);
+		sock_poll_wait(file, &sock->wq->wait, wait);
 		mask = sock->ops->poll_mask(sock, events);
 	}
 
-- 
2.17.1
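
For reference, here is a minimal sketch of the new convention from a driver's
point of view.  This is a hypothetical example, not part of the patch: the
foo_* names are invented, and the only interfaces assumed are the existing
->poll_mask file operation and the ->f_poll_head pointer added above.  The
driver points ->f_poll_head at a waitqueue that outlives the struct file (a
static one here) and only reports readiness from ->poll_mask; all sleeping and
waking is left to the poll/select/aio core and the driver's
wake_up_interruptible() call.

#include <linux/fs.h>
#include <linux/module.h>
#include <linux/poll.h>
#include <linux/wait.h>

/* One global waitqueue: it trivially outlives every struct file. */
static DECLARE_WAIT_QUEUE_HEAD(foo_wait);
static bool foo_data_ready;

static int foo_open(struct inode *inode, struct file *file)
{
	/* Publish the waitqueue the core will sleep on. */
	file->f_poll_head = &foo_wait;
	return 0;
}

static __poll_t foo_poll_mask(struct file *file, __poll_t events)
{
	/* Just report the current state; no waiting happens here. */
	return foo_data_ready ? EPOLLIN | EPOLLRDNORM : 0;
}

/* Called from the driver's receive path when new data shows up. */
void foo_data_arrived(void)
{
	foo_data_ready = true;
	wake_up_interruptible(&foo_wait);
}

static const struct file_operations foo_fops = {
	.owner		= THIS_MODULE,
	.open		= foo_open,
	.poll_mask	= foo_poll_mask,
};

Drivers with per-device or per-file state would instead embed the waitqueue in
their private context, the same way the eventfd and timerfd conversions above
use ctx->wqh.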