The patch titled
     Subject: fs/epoll: use a per-cpu counter for user's watches count
has been added to the -mm tree.  Its filename is
     fs-epoll-use-a-per-cpu-counter-for-users-watches-count.patch

This patch should soon appear at
    https://ozlabs.org/~akpm/mmots/broken-out/fs-epoll-use-a-per-cpu-counter-for-users-watches-count.patch
and later at
    https://ozlabs.org/~akpm/mmotm/broken-out/fs-epoll-use-a-per-cpu-counter-for-users-watches-count.patch

Before you just go and hit "reply", please:
   a) Consider who else should be cc'ed
   b) Prefer to cc a suitable mailing list as well
   c) Ideally: find the original patch on the mailing list and do a
      reply-to-all to that, adding suitable additional cc's

*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***

The -mm tree is included into linux-next and is updated
there every 3-4 working days

------------------------------------------------------
From: Nicholas Piggin <npiggin@xxxxxxxxx>
Subject: fs/epoll: use a per-cpu counter for user's watches count

This counter tracks the number of watches a user has, to compare against
the 'max_user_watches' limit.  This causes a scalability bottleneck on
SPECjbb2015 on large systems as there is only one user.  Changing to a
per-cpu counter increases throughput of the benchmark by about 30% on a
16-socket, > 1000 thread system.

Link: https://lkml.kernel.org/r/20210802032013.2751916-1-npiggin@xxxxxxxxx
Signed-off-by: Nicholas Piggin <npiggin@xxxxxxxxx>
Reported-by: Anton Blanchard <anton@xxxxxxxxxx>
Cc: Alexander Viro <viro@xxxxxxxxxxxxxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---

 fs/eventpoll.c             |   18 ++++++++++--------
 include/linux/sched/user.h |    3 ++-
 kernel/user.c              |    9 +++++++++
 3 files changed, 21 insertions(+), 9 deletions(-)

--- a/fs/eventpoll.c~fs-epoll-use-a-per-cpu-counter-for-users-watches-count
+++ a/fs/eventpoll.c
@@ -723,7 +723,7 @@ static int ep_remove(struct eventpoll *e
 	 */
 	call_rcu(&epi->rcu, epi_rcu_free);
 
-	atomic_long_dec(&ep->user->epoll_watches);
+	percpu_counter_dec(&ep->user->epoll_watches);
 
 	return 0;
 }
@@ -1439,7 +1439,6 @@ static int ep_insert(struct eventpoll *e
 {
 	int error, pwake = 0;
 	__poll_t revents;
-	long user_watches;
 	struct epitem *epi;
 	struct ep_pqueue epq;
 	struct eventpoll *tep = NULL;
@@ -1449,11 +1448,15 @@ static int ep_insert(struct eventpoll *e
 
 	lockdep_assert_irqs_enabled();
 
-	user_watches = atomic_long_read(&ep->user->epoll_watches);
-	if (unlikely(user_watches >= max_user_watches))
+	if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
+					    max_user_watches) >= 0))
 		return -ENOSPC;
-	if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL)))
+	percpu_counter_inc(&ep->user->epoll_watches);
+
+	if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
+		percpu_counter_dec(&ep->user->epoll_watches);
 		return -ENOMEM;
+	}
 
 	/* Item initialization follow here ... */
 	INIT_LIST_HEAD(&epi->rdllink);
@@ -1466,17 +1469,16 @@ static int ep_insert(struct eventpoll *e
 		mutex_lock_nested(&tep->mtx, 1);
 	/* Add the current item to the list of active epoll hook for this file */
 	if (unlikely(attach_epitem(tfile, epi) < 0)) {
-		kmem_cache_free(epi_cache, epi);
 		if (tep)
 			mutex_unlock(&tep->mtx);
+		kmem_cache_free(epi_cache, epi);
+		percpu_counter_dec(&ep->user->epoll_watches);
 		return -ENOMEM;
 	}
 
 	if (full_check && !tep)
 		list_file(tfile);
 
-	atomic_long_inc(&ep->user->epoll_watches);
-
 	/*
 	 * Add the current item to the RB tree.  All RB tree operations are
 	 * protected by "mtx", and ep_insert() is called with "mtx" held.
--- a/include/linux/sched/user.h~fs-epoll-use-a-per-cpu-counter-for-users-watches-count
+++ a/include/linux/sched/user.h
@@ -4,6 +4,7 @@
 
 #include <linux/uidgid.h>
 #include <linux/atomic.h>
+#include <linux/percpu_counter.h>
 #include <linux/refcount.h>
 #include <linux/ratelimit.h>
 
@@ -13,7 +14,7 @@ struct user_struct {
 	refcount_t __count;	/* reference count */
 
 #ifdef CONFIG_EPOLL
-	atomic_long_t epoll_watches;	/* The number of file descriptors currently watched */
+	struct percpu_counter epoll_watches; /* The number of file descriptors currently watched */
 #endif
 	unsigned long unix_inflight;	/* How many files in flight in unix sockets */
 	atomic_long_t pipe_bufs;	/* how many pages are allocated in pipe buffers */
--- a/kernel/user.c~fs-epoll-use-a-per-cpu-counter-for-users-watches-count
+++ a/kernel/user.c
@@ -138,6 +138,7 @@ static void free_user(struct user_struct
 {
 	uid_hash_remove(up);
 	spin_unlock_irqrestore(&uidhash_lock, flags);
+	percpu_counter_destroy(&up->epoll_watches);
 	kmem_cache_free(uid_cachep, up);
 }
 
@@ -185,6 +186,10 @@ struct user_struct *alloc_uid(kuid_t uid
 
 		new->uid = uid;
 		refcount_set(&new->__count, 1);
+		if (percpu_counter_init(&new->epoll_watches, 0, GFP_KERNEL)) {
+			kmem_cache_free(uid_cachep, new);
+			return NULL;
+		}
 		ratelimit_state_init(&new->ratelimit, HZ, 100);
 		ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE);
 
@@ -195,6 +200,7 @@ struct user_struct *alloc_uid(kuid_t uid
 		spin_lock_irq(&uidhash_lock);
 		up = uid_hash_find(uid, hashent);
 		if (up) {
+			percpu_counter_destroy(&new->epoll_watches);
 			kmem_cache_free(uid_cachep, new);
 		} else {
 			uid_hash_insert(new, hashent);
@@ -216,6 +222,9 @@ static int __init uid_cache_init(void)
 	for(n = 0; n < UIDHASH_SZ; ++n)
 		INIT_HLIST_HEAD(uidhash_table + n);
 
+	if (percpu_counter_init(&root_user.epoll_watches, 0, GFP_KERNEL))
+		panic("percpu counter alloc failed");
+
 	/* Insert the root user immediately (init already runs as root) */
 	spin_lock_irq(&uidhash_lock);
 	uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
_

Patches currently in -mm which might be from npiggin@xxxxxxxxx are

lazy-tlb-introduce-lazy-mm-refcount-helper-functions.patch
lazy-tlb-introduce-lazy-mm-refcount-helper-functions-fix.patch
lazy-tlb-allow-lazy-tlb-mm-refcounting-to-be-configurable.patch
lazy-tlb-allow-lazy-tlb-mm-refcounting-to-be-configurable-fix-2.patch
lazy-tlb-shoot-lazies-a-non-refcounting-lazy-tlb-option.patch
lazy-tlb-shoot-lazies-a-non-refcounting-lazy-tlb-option-fix.patch
powerpc-64s-enable-mmu_lazy_tlb_shootdown.patch
fs-epoll-use-a-per-cpu-counter-for-users-watches-count.patch
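For readers less familiar with the percpu_counter API that the patch above
switches to, here is a minimal sketch of the same charge/uncharge pattern.
It is illustrative only: struct foo_user, max_things and the foo_*
functions are made-up names for this sketch; only the percpu_counter_*
calls are the real kernel API.

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/percpu_counter.h>

/* Hypothetical per-user accounting, mirroring user_struct::epoll_watches */
struct foo_user {
	struct percpu_counter nr_things;
};

static long max_things = 1024;		/* stand-in for max_user_watches */

static int foo_user_init(struct foo_user *u)
{
	/* Allocates the per-cpu counters and can fail, so check the return */
	return percpu_counter_init(&u->nr_things, 0, GFP_KERNEL);
}

static int foo_charge_thing(struct foo_user *u)
{
	/*
	 * percpu_counter_compare() uses the cheap approximate value unless
	 * it is within the per-cpu batch error of the limit, in which case
	 * it falls back to a precise (slower) sum of the per-cpu deltas.
	 */
	if (percpu_counter_compare(&u->nr_things, max_things) >= 0)
		return -ENOSPC;

	/* Charge up front, uncharge on later failure, as ep_insert() now does */
	percpu_counter_inc(&u->nr_things);
	return 0;
}

static void foo_uncharge_thing(struct foo_user *u)
{
	percpu_counter_dec(&u->nr_things);
}

static void foo_user_destroy(struct foo_user *u)
{
	percpu_counter_destroy(&u->nr_things);	/* frees the per-cpu data */
}

The point of the conversion is that percpu_counter_inc()/_dec() normally
touch only a CPU-local counter, so many CPUs updating one user's watch
count no longer bounce a single cache line the way atomic_long_inc()/_dec()
on user_struct did.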