This patch implements an extension of eventfd to define file descriptors whose I/O events can be generated at user level. These file descriptors trigger notifications for [p]select/[p]poll/epoll. This feature is useful for user-level implementations of network stacks or virtual device drivers as libraries. Development and porting of code often requires to find the way to wait for I/O events both coming from file descriptors and generated by user-level code (e.g. user-implemented net stacks or drivers). While it is possible to provide a partial support (e.g. using pipes or socketpairs), a clean and complete solution is still missing (as far as I have seen); e.g. I have not seen any clean way to generate EPOLLPRI, EPOLLERR, etc. This proposal is based on a new tag for eventfd2(2): EFD_VPOLL. This statement: fd = eventfd(EPOLLOUT, EFD_VPOLL | EFD_CLOEXEC); creates a file descriptor for I/O event generation. In this case EPOLLOUT is initially true. Likewise all the other eventfs services, read(2) and write(2) use a 8-byte integer argument. read(2) returns the current state of the pending events. The argument of write(2) is an or-composition of a control command (EFD_VPOLL_ADDEVENTS, EFD_VPOLL_DELEVENTS or EFD_VPOLL_MODEVENTS) and the bitmap of events to be added, deleted to the current set of pending events. EFD_VPOLL_MODEVENTS completely redefines the set of pending events. e.g.: uint64_t request = EFD_VPOLL_ADDEVENTS | EPOLLIN | EPOLLPRI; write(fd, &request, sizeof(request); adds EPOLLIN and EPOLLPRI to the set of pending events. These are examples of messages asking for a feature like EFD_VPOLL: https://stackoverflow.com/questions/909189/simulating-file-descriptor-in-user-space https://stackoverflow.com/questions/1648147/running-a-simple-tcp-server-with-poll-how-do-i-trigger-events-artificially ... and I need it to write networking and device modules for vuos: https://github.com/virtualsquare/vuos (it is the new codebase of ViewOS, see www.virtualsquare.org). EXAMPLE: The following program creates an eventfd/EFD_VPOLL file descriptor and then forks a child process. While the parent waits for events using epoll_wait the child generates a sequence of events. When the parent receives an event (or a set of events) it prints it and disarm it. The following shell session shows a sample run of the program: timeout... timeout... GOT event 1 timeout... GOT event 1 timeout... GOT event 3 timeout... GOT event 2 timeout... GOT event 4 timeout... GOT event 10 Program source: #include <sys/eventfd.h> #include <sys/epoll.h> #include <unistd.h> #include <stdlib.h> #include <stdio.h> #include <stdint.h> /* Definition of uint64_t */ #ifndef EFD_VPOLL #define EFD_VPOLL (1 << 1) #define EFD_VPOLL_ADDEVENTS (1ULL << 32) #define EFD_VPOLL_DELEVENTS (2ULL << 32) #define EFD_VPOLL_MODEVENTS (3ULL << 32) #endif #define handle_error(msg) \ do { perror(msg); exit(EXIT_FAILURE); } while (0) static void vpoll_ctl(int fd, uint64_t request) { ssize_t s; s = write(fd, &request, sizeof(request)); if (s != sizeof(uint64_t)) handle_error("write"); } int main(int argc, char *argv[]) { int efd, epollfd; struct epoll_event ev; ev.events = EPOLLIN | EPOLLRDHUP | EPOLLERR | EPOLLOUT | EPOLLHUP | EPOLLPRI; ev.data.u64 = 0; efd = eventfd(0, EFD_VPOLL | EFD_CLOEXEC); if (efd == -1) handle_error("eventfd"); epollfd = epoll_create1(EPOLL_CLOEXEC); if (efd == -1) handle_error("epoll_create1"); if (epoll_ctl(epollfd, EPOLL_CTL_ADD, efd, &ev) == -1) handle_error("epoll_ctl"); switch (fork()) { case 0: sleep(3); vpoll_ctl(efd, EFD_VPOLL_ADDEVENTS | EPOLLIN); sleep(2); vpoll_ctl(efd, EFD_VPOLL_ADDEVENTS | EPOLLIN); sleep(2); vpoll_ctl(efd, EFD_VPOLL_ADDEVENTS | EPOLLIN | EPOLLPRI); sleep(2); vpoll_ctl(efd, EFD_VPOLL_ADDEVENTS | EPOLLPRI); sleep(2); vpoll_ctl(efd, EFD_VPOLL_ADDEVENTS | EPOLLOUT); sleep(2); vpoll_ctl(efd, EFD_VPOLL_ADDEVENTS | EPOLLHUP); exit(EXIT_SUCCESS); default: while (1) { int nfds; nfds = epoll_wait(epollfd, &ev, 1, 1000); if (nfds < 0) handle_error("epoll_wait"); else if (nfds == 0) printf("timeout...\n"); else { printf("GOT event %x\n", ev.events); vpoll_ctl(efd, EFD_VPOLL_DELEVENTS | ev.events); if (ev.events & EPOLLHUP) break; } } case -1: handle_error("fork"); } close(epollfd); close(efd); return 0; } Signed-off-by: Renzo Davoli <renzo@xxxxxxxxxxx> --- fs/eventfd.c | 115 +++++++++++++++++++++++++++++++-- include/linux/eventfd.h | 7 +- include/uapi/linux/eventpoll.h | 2 + 3 files changed, 116 insertions(+), 8 deletions(-) Changes in v2: - Fix size of EFD_VPOLL_*EVENTS constants for 32 bit architectures diff --git a/fs/eventfd.c b/fs/eventfd.c index 8aa0ea8c55e8..f83b7d02307e 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -3,6 +3,7 @@ * fs/eventfd.c * * Copyright (C) 2007 Davide Libenzi <davidel@xxxxxxxxxxxxxxx> + * EFD_VPOLL support: 2019 Renzo Davoli <renzo@xxxxxxxxxxx> * */ @@ -30,12 +31,24 @@ struct eventfd_ctx { struct kref kref; wait_queue_head_t wqh; /* - * Every time that a write(2) is performed on an eventfd, the - * value of the __u64 being written is added to "count" and a - * wakeup is performed on "wqh". A read(2) will return the "count" - * value to userspace, and will reset "count" to zero. The kernel - * side eventfd_signal() also, adds to the "count" counter and - * issue a wakeup. + * If the EFD_VPOLL flag was NOT set at eventfd creation: + * Every time that a write(2) is performed on an eventfd, the + * value of the __u64 being written is added to "count" and a + * wakeup is performed on "wqh". A read(2) will return the "count" + * value to userspace, and will reset "count" to zero (or decrement + * "count" by 1 if the flag EFD_SEMAPHORE has been set). The kernel + * side eventfd_signal() also, adds to the "count" counter and + * issue a wakeup. + * + * If the EFD_VPOLL flag was set at eventfd creation: + * count is the set of pending EPOLL events. + * read(2) returns the current value of count. + * The argument of write(2) is an 8-byte integer: + * it is an or-composition of a control command (EFD_VPOLL_ADDEVENTS, + * EFD_VPOLL_DELEVENTS or EFD_VPOLL_MODEVENTS) and the bitmap of + * events to be added, deleted to the current set of pending events. + * (i.e. which bits of "count" must be set or reset). + * EFD_VPOLL_MODEVENTS redefines the set of pending events. */ __u64 count; unsigned int flags; @@ -295,6 +308,78 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c return res; } +static __poll_t eventfd_vpoll_poll(struct file *file, poll_table *wait) +{ + struct eventfd_ctx *ctx = file->private_data; + __poll_t events = 0; + u64 count; + + poll_wait(file, &ctx->wqh, wait); + + count = READ_ONCE(ctx->count); + + events = (count & EPOLLALLMASK); + + return events; +} + +static ssize_t eventfd_vpoll_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct eventfd_ctx *ctx = file->private_data; + ssize_t res; + __u64 ucnt = 0; + + if (count < sizeof(ucnt)) + return -EINVAL; + res = sizeof(ucnt); + ucnt = READ_ONCE(ctx->count); + if (put_user(ucnt, (__u64 __user *)buf)) + return -EFAULT; + + return res; +} + +static ssize_t eventfd_vpoll_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct eventfd_ctx *ctx = file->private_data; + ssize_t res; + __u64 ucnt; + __u32 events; + + if (count < sizeof(ucnt)) + return -EINVAL; + if (copy_from_user(&ucnt, buf, sizeof(ucnt))) + return -EFAULT; + spin_lock_irq(&ctx->wqh.lock); + + events = ucnt & EPOLLALLMASK; + res = sizeof(ucnt); + switch (ucnt & ~((__u64)EPOLLALLMASK)) { + case EFD_VPOLL_ADDEVENTS: + ctx->count |= events; + break; + case EFD_VPOLL_DELEVENTS: + ctx->count &= ~(events); + break; + case EFD_VPOLL_MODEVENTS: + ctx->count = (ctx->count & ~EPOLLALLMASK) | events; + break; + default: + res = -EINVAL; + } + + /* wake up waiting threads */ + if (res >= 0 && waitqueue_active(&ctx->wqh)) + wake_up_locked_poll(&ctx->wqh, res); + + spin_unlock_irq(&ctx->wqh.lock); + + return res; + +} + #ifdef CONFIG_PROC_FS static void eventfd_show_fdinfo(struct seq_file *m, struct file *f) { @@ -319,6 +404,17 @@ static const struct file_operations eventfd_fops = { .llseek = noop_llseek, }; +static const struct file_operations eventfd_vpoll_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = eventfd_show_fdinfo, +#endif + .release = eventfd_release, + .poll = eventfd_vpoll_poll, + .read = eventfd_vpoll_read, + .write = eventfd_vpoll_write, + .llseek = noop_llseek, +}; + /** * eventfd_fget - Acquire a reference of an eventfd file descriptor. * @fd: [in] Eventfd file descriptor. @@ -391,6 +487,7 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_fileget); static int do_eventfd(unsigned int count, int flags) { struct eventfd_ctx *ctx; + const struct file_operations *fops = &eventfd_fops; int fd; /* Check the EFD_* constants for consistency. */ @@ -410,7 +507,11 @@ static int do_eventfd(unsigned int count, int flags) ctx->flags = flags; ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL); - fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, + if (flags & EFD_VPOLL) { + fops = &eventfd_vpoll_fops; + ctx->count &= EPOLLALLMASK; + } + fd = anon_inode_getfd("[eventfd]", fops, ctx, O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS)); if (fd < 0) eventfd_free_ctx(ctx); diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index ffcc7724ca21..5b1e6ef56651 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -21,11 +21,16 @@ * shared O_* flags. */ #define EFD_SEMAPHORE (1 << 0) +#define EFD_VPOLL (1 << 1) #define EFD_CLOEXEC O_CLOEXEC #define EFD_NONBLOCK O_NONBLOCK #define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) -#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE) +#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE | EFD_VPOLL) + +#define EFD_VPOLL_ADDEVENTS (1ULL << 32) +#define EFD_VPOLL_DELEVENTS (2ULL << 32) +#define EFD_VPOLL_MODEVENTS (3ULL << 32) struct eventfd_ctx; struct file; diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h index 8a3432d0f0dc..814de6d869c7 100644 --- a/include/uapi/linux/eventpoll.h +++ b/include/uapi/linux/eventpoll.h @@ -41,6 +41,8 @@ #define EPOLLMSG (__force __poll_t)0x00000400 #define EPOLLRDHUP (__force __poll_t)0x00002000 +#define EPOLLALLMASK ((__force __poll_t)0x0fffffff) + /* Set exclusive wakeup mode for the target file descriptor */ #define EPOLLEXCLUSIVE ((__force __poll_t)(1U << 28)) -- 2.20.1