Re: [PATCH RESEND v4] fs/epoll: Remove unnecessary wakeups of nested epoll that in ET mode

Roman Penyaev <rpenyaev@xxxxxxx> · Mon, 30 Sep 2019 13:55:48 +0200

On 2019-09-28 04:29, Andrew Morton wrote:
On Wed, 25 Sep 2019 09:56:03 +0800 hev <r@xxxxxx> wrote:

From: Heiher <r@xxxxxx>

Take the case where we have:

        t0
         | (ew)
        e0
         | (et)
        e1
         | (lt)
        s0

t0: thread 0
e0: epoll fd 0
e1: epoll fd 1
s0: socket fd 0
ew: epoll_wait
et: edge-trigger
lt: level-trigger

We only need to wakeup nested epoll fds if something has been queued 
to the
overflow list, since the ep_poll() traverses the rdllist during 
recursive poll
and thus events on the overflow list may not be visible yet.

Test code:

Look sane to me.  Do you have any performance testing results which
show a benefit?

epoll maintainership isn't exactly a hive of activity nowadays :(
Roman, would you please have time to review this?

So here is my observation: current patch does not fix the described
problem (double wakeup) for the case, when new event comes exactly
to the ->ovflist.  According to the patch this is the desired intention:

   /*
    * We only need to wakeup nested epoll fds if something has been 
queued
    * to the overflow list, since the ep_poll() traverses the rdllist
    * during recursive poll and thus events on the overflow list may not 
be
    * visible yet.
    */
    if (nepi != NULL)
       pwake++;

    ....

    if (pwake == 2)
       ep_poll_safewake(&ep->poll_wait);


but this actually means that we repeat the same behavior (double wakeup)
but only for the case, when event comes to the ->ovflist.

How to reproduce? Can be easily done (ok, not so easy but it is possible
to try): to the given userspace test we need to add one more socket and
immediately fire the event:

    e.events = EPOLLIN;
    if (epoll_ctl(efd[1], EPOLL_CTL_ADD, s2fd[0], &e) < 0)
       goto out;

    /*
     * Signal any fd to let epoll_wait() to call ep_scan_ready_list()
     * in order to "catch" it there and add new event to ->ovflist.
     */
     if (write(s2fd[1], "w", 1) != 1)
        goto out;

That is done in order to let the following epoll_wait() call to invoke
ep_scan_ready_list(), where we can "catch" and insert new event exactly
to the ->ovflist. In order to insert event exactly in the correct list
I introduce artificial delay.

Modified test and kernel patch is below.  Here is the output of the
testing tool with some debug lines from kernel:

  # ~/devel/test/edge-bug
  [   59.263178] ### sleep 2
  >> write to sock
  [   61.318243] ### done sleep
  [   61.318991] !!!!!!!!!!! ep_poll_safewake(&ep->poll_wait); 
events_in_rdllist=1, events_in_ovflist=1
  [   61.321204] ### sleep 2
  [   63.398325] ### done sleep
  error: What?! Again?!

First epoll_wait() call (ep_scan_ready_list()) observes 2 events
(see "!!!!!!!!!!! ep_poll_safewake" output line), exactly what we
wanted to achieve, so eventually ep_poll_safewake() is called again
which leads to double wakeup.

In my opinion current patch as it is should be dropped, it does not
fix the described problem but just hides it.

--
Roman


######### USERSPACE ##########

#include <unistd.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <stdio.h>
#include <pthread.h>

static void *do_thread(void *arg)
{
	int s = *(int *)arg;

	sleep(1);
	printf(">> write to sock\n");
	write(s, "w", 1);
}

int main(int argc, char *argv[])
{
	int s1fd[2];
	int s2fd[2];
	int efd[2];
	struct epoll_event e;

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, s1fd) < 0)
		goto out;
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, s2fd) < 0)
		goto out;

	efd[0] = epoll_create(1);
	if (efd[0] < 0)
		goto out;

	efd[1] = epoll_create(1);
	if (efd[1] < 0)
		goto out;

	e.events = EPOLLIN;
	if (epoll_ctl(efd[1], EPOLL_CTL_ADD, s1fd[0], &e) < 0)
		goto out;

	e.events = EPOLLIN;
	if (epoll_ctl(efd[1], EPOLL_CTL_ADD, s2fd[0], &e) < 0)
		goto out;

	e.events = EPOLLIN | EPOLLET;
	if (epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[1], &e) < 0)
		goto out;

	/*
	 * Signal any fd to let epoll_wait() to call ep_scan_ready_list()
	 * in order to "catch" it there and add new event to ->ovflist.
	 */
	if (write(s2fd[1], "w", 1) != 1)
		goto out;

	pthread_t thr;
	pthread_create(&thr, NULL, do_thread, &s1fd[1]);
	if (epoll_wait(efd[0], &e, 1, 0) != 1) {
		goto out;
	}
	pthread_join(thr, NULL);

	if (epoll_wait(efd[0], &e, 1, 0) != 0) {
		printf("error: What?! Again?!\n");
		goto out;
	}

	return 0;

out:
	return -1;
}


######### KERNEL ##########

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8bc064630be0..edba7ab45083 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -39,6 +39,8 @@
 #include <linux/rculist.h>
 #include <net/busy_poll.h>

+static bool is_send_events_call;
+
 /*
  * LOCKING:
  * There are three level of locking required by epoll :
@@ -672,6 +674,8 @@ static __poll_t ep_scan_ready_list(struct eventpoll 
*ep,
 	__poll_t res;
 	int pwake = 0;
 	struct epitem *epi, *nepi;
+	unsigned events_in_rdllist = 0;
+	unsigned events_in_ovflist = 0;
 	LIST_HEAD(txlist);

 	lockdep_assert_irqs_enabled();
@@ -693,23 +697,52 @@ static __poll_t ep_scan_ready_list(struct 
eventpoll *ep,
 	 * in a lockless way.
 	 */
 	write_lock_irq(&ep->lock);
+
+	/* XXX Count events */
+	if (!strcmp("edge-bug", current->comm) && depth) {
+		struct list_head *l;
+		list_for_each(l, &ep->rdllist)
+			events_in_rdllist++;
+	}
 	list_splice_init(&ep->rdllist, &txlist);
 	WRITE_ONCE(ep->ovflist, NULL);
 	write_unlock_irq(&ep->lock);

+	if (!strcmp("edge-bug", current->comm) && depth && 
is_send_events_call) {
+		/*
+		 * XXX Introduce delay to let userspace fire event
+		 * XXX directly to ovflist.
+		 */
+		pr_err("### sleep 2\n");
+		msleep(2000);
+		pr_err("### done sleep\n");
+	}
+
+
 	/*
 	 * Now call the callback function.
 	 */
 	res = (*sproc)(ep, &txlist, priv);

 	write_lock_irq(&ep->lock);
+	nepi = READ_ONCE(ep->ovflist);
+	/*
+	 * We only need to wakeup nested epoll fds if something has been 
queued
+	 * to the overflow list, since the ep_poll() traverses the rdllist
+	 * during recursive poll and thus events on the overflow list may not 
be
+	 * visible yet.
+	 */
+	if (nepi != NULL)
+		pwake++;
 	/*
 	 * During the time we spent inside the "sproc" callback, some
 	 * other events might have been queued by the poll callback.
 	 * We re-insert them inside the main ready-list here.
 	 */
-	for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
+	for (; (epi = nepi) != NULL;
 	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
+		/* XXX Count events */
+		events_in_ovflist++;
 		/*
 		 * We need to check if the item is already in the list.
 		 * During the "sproc" callback execution time, items are
@@ -754,8 +787,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll 
*ep,
 		mutex_unlock(&ep->mtx);

 	/* We have to call this outside the lock */
-	if (pwake)
+	if (pwake == 2) {
+		pr_err("!!!!!!!!!!! ep_poll_safewake(&ep->poll_wait); 
events_in_rdllist=%d, events_in_ovflist=%d\n",
+		       events_in_rdllist, events_in_ovflist);
 		ep_poll_safewake(&ep->poll_wait);
+	}

 	return res;
 }
@@ -1925,9 +1961,16 @@ static int ep_poll(struct eventpoll *ep, struct 
epoll_event __user *events,
 	 * there's still timeout left over, we go trying again in search of
 	 * more luck.
 	 */
+
+	/* XXX Catch only ep_scan_ready_list() called from here */
+	if (!strcmp("edge-bug", current->comm))
+		is_send_events_call = 1;
 	if (!res && eavail &&
-	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
+	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out) {
+		is_send_events_call = 0;
 		goto fetch_events;
+	}
+	is_send_events_call = 0;

 	if (waiter) {
 		spin_lock_irq(&ep->wq.lock);