From: Wen Yang <wenyang.linux@xxxxxxxxxxx>

For a non-semaphore eventfd, if its counter has a nonzero value, a
read(2) returns 8 bytes containing that value and the counter is reset
to zero. In the non-semaphore case, N eventfd_write() calls can
therefore be serviced by a single eventfd_read(). However, the current
implementation wakes up the reading thread immediately in
eventfd_write(), so CPU utilization increases unnecessarily.

By adding a configurable delay after eventfd_write(), these unnecessary
wakeups are avoided, thereby reducing CPU utilization.

We used the following test code:
https://github.com/w-simon/tests/blob/master/src/test.c

	./test_zmq > /dev/null

The CPU usage is as follows:

12:14:22     CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
12:14:24     all   55.46    0.00    4.78    0.00    0.00    0.96    0.00    0.00    0.00   38.80
12:14:26     all   56.29    0.00    4.70    0.00    0.00    1.24    0.00    0.00    0.00   37.76
12:14:28     all   54.97    0.00    5.25    0.00    0.00    0.97    0.00    0.00    0.00   38.81
12:14:30     all   56.02    0.00    5.26    0.00    0.00    1.24    0.00    0.00    0.00   37.48
12:14:32     all   55.31    0.00    5.03    0.00    0.00    1.40    0.00    0.00    0.00   38.27
12:14:34     all   55.46    0.00    5.26    0.00    0.00    1.24    0.00    0.00    0.00   38.04

Then set the new control parameter:

	echo 5 > /proc/sys/fs/eventfd_write_wake_delay_ms

The CPU usage was observed to decrease by more than 30%, as follows:

12:14:36     all   28.17    0.00    0.93    0.00    0.00    0.00    0.00    0.00    0.00   70.90
12:14:38     all   24.00    0.00    0.80    0.00    0.00    0.13    0.00    0.00    0.00   75.07
12:14:40     all   23.57    0.00    0.53    0.00    0.00    0.13    0.00    0.00    0.00   75.77
12:14:42     all   23.59    0.00    0.40    0.00    0.00    0.00    0.00    0.00    0.00   76.01
12:14:44     all   23.69    0.00    0.27    0.00    0.00    0.00    0.00    0.00    0.00   76.04
12:14:46     all   23.20    0.00    0.67    0.00    0.00    0.13    0.00    0.00    0.00   76.00
12:14:48     all   24.87    0.00    0.66    0.00    0.00    0.00    0.00    0.00    0.00   74.47
12:14:50     all   24.27    0.00    0.66    0.00    0.00    0.00    0.00    0.00    0.00   75.07

Signed-off-by: Wen Yang <wenyang.linux@xxxxxxxxxxx>
Cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxx>
Cc: Dylan Yudaken <dylany@xxxxxx>
Cc: Jens Axboe <axboe@xxxxxxxxx>
Cc: David Woodhouse <dwmw@xxxxxxxxxxxx>
Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
Cc: linux-fsdevel@xxxxxxxxxxxxxxx
Cc: linux-kernel@xxxxxxxxxxxxxxx
---
 fs/eventfd.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 78 insertions(+), 1 deletion(-)

diff --git a/fs/eventfd.c b/fs/eventfd.c
index c5bda3df4a28..e45436737f9d 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -41,6 +41,9 @@ struct eventfd_ctx {
 	__u64 count;
 	unsigned int flags;
 	int id;
+#ifdef CONFIG_SYSCTL
+	struct delayed_work dwork;
+#endif
 };
 
 __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask)
@@ -95,6 +98,9 @@ static void eventfd_free_ctx(struct eventfd_ctx *ctx)
 {
 	if (ctx->id >= 0)
 		ida_simple_remove(&eventfd_ida, ctx->id);
+#ifdef CONFIG_SYSCTL
+	flush_delayed_work(&ctx->dwork);
+#endif
 	kfree(ctx);
 }
 
@@ -256,6 +262,28 @@ static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
 	return sizeof(ucnt);
 }
 
+#ifdef CONFIG_SYSCTL
+
+static unsigned long sysctl_eventfd_write_wake_delay_ms;
+
+static void eventfd_delayed_workfn(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct eventfd_ctx *ctx = container_of(dwork, struct eventfd_ctx, dwork);
+
+	spin_lock_irq(&ctx->wqh.lock);
+	current->in_eventfd = 1;
+	if (ctx->count) {
+		/* waitqueue_active is safe because ctx->wqh.lock is being held here. */
+		if (waitqueue_active(&ctx->wqh))
+			wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+	}
+	current->in_eventfd = 0;
+	spin_unlock_irq(&ctx->wqh.lock);
+}
+
+#endif
+
 static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
 			     loff_t *ppos)
 {
@@ -282,8 +310,26 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
 	if (likely(res > 0)) {
 		ctx->count += ucnt;
 		current->in_eventfd = 1;
-		if (waitqueue_active(&ctx->wqh))
+
+		/* waitqueue_active is safe because ctx->wqh.lock is being held here. */
+		if (waitqueue_active(&ctx->wqh)) {
+#ifdef CONFIG_SYSCTL
+			if (ctx->flags & EFD_SEMAPHORE)
+				wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+			else {
+				unsigned long delay = sysctl_eventfd_write_wake_delay_ms;
+
+				if (delay) {
+					if (!delayed_work_pending(&ctx->dwork))
+						queue_delayed_work(system_unbound_wq,
+								   &ctx->dwork, delay);
+				} else
+					wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+			}
+#else
 			wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+#endif
+		}
 		current->in_eventfd = 0;
 	}
 	spin_unlock_irq(&ctx->wqh.lock);
@@ -406,6 +452,9 @@ static int do_eventfd(unsigned int count, int flags)
 	ctx->count = count;
 	ctx->flags = flags;
 	ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
+#ifdef CONFIG_SYSCTL
+	INIT_DELAYED_WORK(&ctx->dwork, eventfd_delayed_workfn);
+#endif
 
 	flags &= EFD_SHARED_FCNTL_FLAGS;
 	flags |= O_RDWR;
@@ -438,3 +487,31 @@ SYSCALL_DEFINE1(eventfd, unsigned int, count)
 	return do_eventfd(count, 0);
 }
 
+#ifdef CONFIG_SYSCTL
+
+static unsigned long min_wake_delay;
+
+static unsigned long max_wake_delay = HZ / 10;
+
+static struct ctl_table fs_eventfd_ctl[] = {
+	{
+		.procname	= "eventfd_write_wake_delay_ms",
+		.data		= &sysctl_eventfd_write_wake_delay_ms,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= (void *)&min_wake_delay,
+		.extra2		= (void *)&max_wake_delay,
+	},
+	{ }
+};
+
+static int __init init_fs_eventfd_sysctls(void)
+{
+	register_sysctl_init("fs", fs_eventfd_ctl);
+	return 0;
+}
+
+fs_initcall(init_fs_eventfd_sysctls);
+
+#endif /* CONFIG_SYSCTL */
-- 
2.37.2
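
As an aside, here is a minimal userspace sketch (illustrative only, not
part of the patch) of the non-semaphore coalescing behaviour the commit
message relies on: several write(2)s accumulate in the eventfd counter,
and a single read(2) then returns the accumulated value and resets the
counter to zero. A nonzero eventfd_write_wake_delay_ms defers the
wakeup so that this coalescing can happen before the reader runs.

/*
 * Illustrative only; relies on documented eventfd(2) semantics.
 * Without EFD_SEMAPHORE, read(2) returns the whole counter value and
 * resets it to zero, so N writes can be drained by a single read.
 */
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	uint64_t val;
	int i, fd = eventfd(0, 0);	/* non-semaphore mode */

	if (fd < 0) {
		perror("eventfd");
		return 1;
	}

	/* Three writes of 1 each: the counter coalesces them to 3. */
	for (i = 0; i < 3; i++) {
		val = 1;
		if (write(fd, &val, sizeof(val)) != sizeof(val))
			perror("write");
	}

	/* A single read returns 3 and resets the counter to 0. */
	if (read(fd, &val, sizeof(val)) == sizeof(val))
		printf("read %llu\n", (unsigned long long)val);

	close(fd);
	return 0;
}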