Hi, I posted a patch to completely remove the lock here from the ep_poll_safewake() patch here: http://lkml.iu.edu/hypermail/linux/kernel/1710.1/05834.html So these are going to conflict. The reason its safe to remove the lock is that there are loop and depth checks now that are performed during EPOLL_CTL_ADD. Specifically, ep_loop_check(). I would prefer to these checks once add add time as opposed to at each wakeup (even if they can be scaled better). I also have a pending patch to do something similar for poll_readywalk_ncalls, but I think that poll_safewake_ncalls is the most egregious one here? Thanks, -Jason On 10/13/2017 11:45 AM, Davidlohr Bueso wrote: > A customer reported massive contention on the ncalls->lock in which > the workload is designed around nested epolls (where the fd is already > an epoll). > > 83.49% [kernel] [k] __pv_queued_spin_lock_slowpath > 2.70% [kernel] [k] ep_call_nested.constprop.13 > 1.94% [kernel] [k] _raw_spin_lock_irqsave > 1.83% [kernel] [k] > __raw_callee_save___pv_queued_spin_unlock > 1.45% [kernel] [k] _raw_spin_unlock_irqrestore > 0.41% [kernel] [k] ep_scan_ready_list.isra.8 > 0.36% [kernel] [k] pvclock_clocksource_read > > The application running on kvm, is using a shared memory IPC communication > with a pipe wakeup mechanism, and a two level dispatcher both built around > 'epoll_wait'. There is one process per physical core and a full mesh of > pipes > between them, so on a 18 core system (the actual case), there are 18*18 > pipes > for the IPCs alone. > > This patch proposes replacing the nested calls global linked list with a > dlock > interface, for which we can benefit from pcpu lists when doing > ep_poll_safewake(), > and hashing for the current task, which provides two benefits: > > 1. Speeds up the process of loop and max-depth checks from O(N) lookups > to O(1) > (albeit possible collisions, which we have to iterate); and, > > 2. Provides smaller lock granularity. > > cpus before after diff > 1 1409370 1344804 -4.58% > 2 1015690 1337053 31.63% > 3 721009 1273254 76.59% > 4 380959 1128931 196.33% > 5 287603 1028362 257.56% > 6 221104 894943 304.76% > 7 173592 976395 462.46% > 8 145860 922584 532.51% > 9 127877 925900 624.05% > 10 112603 791456 602.87% > 11 97926 724296 639.63% > 12 80732 730485 804.82% > > With the exception of a single cpu, where the overhead of jhashing > influences), we > get some pretty good raw throughput increase. Similarly, the amount of > time spent > decreases immensely as well. > > Passes ltp related testcases. > > Signed-off-by: Davidlohr Bueso <dbueso@xxxxxxx> > --- > fs/eventpoll.c | 88 > +++++++++++++++++++++++++++++++++++----------------------- > 1 file changed, 53 insertions(+), 35 deletions(-) > > diff --git a/fs/eventpoll.c b/fs/eventpoll.c > index 2fabd19cdeea..675c97fdc5da 100644 > --- a/fs/eventpoll.c > +++ b/fs/eventpoll.c > @@ -22,7 +22,7 @@ > #include <linux/slab.h> > #include <linux/poll.h> > #include <linux/string.h> > -#include <linux/list.h> > +#include <linux/dlock-list.h> > #include <linux/hash.h> > #include <linux/spinlock.h> > #include <linux/syscalls.h> > @@ -119,7 +119,7 @@ struct epoll_filefd { > * and loop cycles. > */ > struct nested_call_node { > - struct list_head llink; > + struct dlock_list_node llink; > void *cookie; > void *ctx; > }; > @@ -129,8 +129,7 @@ struct nested_call_node { > * maximum recursion dept and loop cycles. > */ > struct nested_calls { > - struct list_head tasks_call_list; > - spinlock_t lock; > + struct dlock_list_heads tasks_call_list; > }; > > /* > @@ -371,10 +370,14 @@ static inline int ep_op_has_event(int op) > } > > /* Initialize the poll safe wake up structure */ > -static void ep_nested_calls_init(struct nested_calls *ncalls) > +static inline int ep_nested_calls_init(struct nested_calls *ncalls) > +{ > + return alloc_dlock_list_heads(&ncalls->tasks_call_list, true); > +} > + > +static inline void ep_nested_calls_free(struct nested_calls *ncalls) > { > - INIT_LIST_HEAD(&ncalls->tasks_call_list); > - spin_lock_init(&ncalls->lock); > + free_dlock_list_heads(&ncalls->tasks_call_list); > } > > /** > @@ -483,45 +486,50 @@ static int ep_call_nested(struct nested_calls > *ncalls, int max_nests, > { > int error, call_nests = 0; > unsigned long flags; > - struct list_head *lsthead = &ncalls->tasks_call_list; > - struct nested_call_node *tncur; > - struct nested_call_node tnode; > + struct dlock_list_head *head; > + struct nested_call_node *tncur, tnode; > > - spin_lock_irqsave(&ncalls->lock, flags); > + head = dlock_list_hash(&ncalls->tasks_call_list, ctx); > > + spin_lock_irqsave(&head->lock, flags); > /* > * Try to see if the current task is already inside this wakeup call. > - * We use a list here, since the population inside this set is always > - * very much limited. > + * > + * We use a chained hash table with linked lists here, and take the > + * lock to avoid racing when collisions (where ctx pointer is the > key). > + * Calls for which context is the cpu number, avoid hashing and act as > + * pcpu add/removal. > + * > + * Since the population inside this set is always very much limited, > + * list scanning should be short. > */ > - list_for_each_entry(tncur, lsthead, llink) { > - if (tncur->ctx == ctx && > - (tncur->cookie == cookie || ++call_nests > max_nests)) { > - /* > - * Ops ... loop detected or maximum nest level reached. > - * We abort this wake by breaking the cycle itself. > - */ > - error = -1; > - goto out_unlock; > - } > - } > + list_for_each_entry(tncur, &head->list, llink.list) { > + if (tncur->ctx == ctx && > + (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) { > + /* > + * Ops ... loop detected or maximum nest level reached. > + * We abort this wake by breaking the cycle itself. > + */ > + error = -1; > + spin_unlock_irqrestore(&head->lock, flags); > + goto done; > + } > + } > + > > /* Add the current task and cookie to the list */ > tnode.ctx = ctx; > tnode.cookie = cookie; > - list_add(&tnode.llink, lsthead); > - > - spin_unlock_irqrestore(&ncalls->lock, flags); > + tnode.llink.head = head; > + list_add(&tnode.llink.list, &head->list); > + spin_unlock_irqrestore(&head->lock, flags); > > /* Call the nested function */ > error = (*nproc)(priv, cookie, call_nests); > > /* Remove the current task from the list */ > - spin_lock_irqsave(&ncalls->lock, flags); > - list_del(&tnode.llink); > -out_unlock: > - spin_unlock_irqrestore(&ncalls->lock, flags); > - > + dlock_lists_del(&tnode.llink); > +done: > return error; > } > > @@ -2313,13 +2321,16 @@ static int __init eventpoll_init(void) > * Initialize the structure used to perform epoll file descriptor > * inclusion loops checks. > */ > - ep_nested_calls_init(&poll_loop_ncalls); > + if (ep_nested_calls_init(&poll_loop_ncalls)) > + goto err; > > /* Initialize the structure used to perform safe poll wait head wake > ups */ > - ep_nested_calls_init(&poll_safewake_ncalls); > + if (ep_nested_calls_init(&poll_safewake_ncalls)) > + goto err_free1; > > /* Initialize the structure used to perform file's f_op->poll() > calls */ > - ep_nested_calls_init(&poll_readywalk_ncalls); > + if (ep_nested_calls_init(&poll_readywalk_ncalls)) > + goto err_free0; > > /* > * We can have many thousands of epitems, so prevent this from > @@ -2336,5 +2347,12 @@ static int __init eventpoll_init(void) > sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL); > > return 0; > + > +err_free0: > + ep_nested_calls_free(&poll_safewake_ncalls); > +err_free1: > + ep_nested_calls_free(&poll_loop_ncalls); > +err: > + return -ENOMEM; > } > fs_initcall(eventpoll_init);