[RFC] eventpoll: Move a kmem_cache_alloc and kmem_cache_free

We noticed a scaling issue with the SPECjbb benchmark.  Running perf,
we found that it was spending a lot of time in SYS_epoll_ctl, in
particular while holding the epmutex.
This patch helps by moving the kmem_cache_alloc and kmem_cache_free calls
out from under that lock.  It improves throughput by around 15% on a
16-socket system.
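
Roughly, the before/after shape of the change is the following (a minimal
sketch with made-up names -- item_cache, item_lock, add_item(), del_item() --
not the actual eventpoll code): the slab alloc happens before the contended
lock is taken and the free happens after it is dropped, so the critical
section only covers the data-structure update.

	/* Illustrative sketch only, not fs/eventpoll.c. */
	#include <linux/slab.h>
	#include <linux/mutex.h>
	#include <linux/list.h>

	struct item {
		struct list_head link;
	};

	static struct kmem_cache *item_cache;
	static LIST_HEAD(item_list);
	static DEFINE_MUTEX(item_lock);

	static int add_item(void)
	{
		/* allocate before taking the contended lock */
		struct item *it = kmem_cache_alloc(item_cache, GFP_KERNEL);

		if (!it)
			return -ENOMEM;

		mutex_lock(&item_lock);
		list_add(&it->link, &item_list);	/* only the update under the lock */
		mutex_unlock(&item_lock);
		return 0;
	}

	static void del_item(struct item *it)
	{
		mutex_lock(&item_lock);
		list_del(&it->link);
		mutex_unlock(&item_lock);

		/* free only after the lock has been dropped */
		kmem_cache_free(item_cache, it);
	}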

While this patch should be fine as it is, there are probably more things
that could be done outside the lock, such as wakeup_source_unregister, but
I am not familiar with the area and I don't know of many tests.  I did find
the one posted by Jason Baron at https://lkml.org/lkml/2011/2/25/297.

Any thoughts?

Cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx>
Cc: Jason Baron <jbaron@xxxxxxxxxx>
Reported-by: Jerry Lohr <glohr@xxxxxxx>
Signed-off-by: Nathan Zimmer <nzimmer@xxxxxxx>
---
 fs/eventpoll.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 9ad17b15..752e5ff 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -707,7 +707,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	wakeup_source_unregister(ep_wakeup_source(epi));
 
 	/* At this point it is safe to free the eventpoll item */
-	kmem_cache_free(epi_cache, epi);
 
 	atomic_long_dec(&ep->user->epoll_watches);
 
@@ -754,6 +753,7 @@ static void ep_free(struct eventpoll *ep)
 	while ((rbp = rb_first(&ep->rbr)) != NULL) {
 		epi = rb_entry(rbp, struct epitem, rbn);
 		ep_remove(ep, epi);
+		kmem_cache_free(epi_cache, epi);
 	}
 	mutex_unlock(&ep->mtx);
 
@@ -1230,18 +1230,17 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi)
  * Must be called with "mtx" held.
  */
 static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
-		     struct file *tfile, int fd)
+		     struct file *tfile, int fd, struct epitem *epi)
 {
 	int error, revents, pwake = 0;
 	unsigned long flags;
 	long user_watches;
-	struct epitem *epi;
 	struct ep_pqueue epq;
 
 	user_watches = atomic_long_read(&ep->user->epoll_watches);
 	if (unlikely(user_watches >= max_user_watches))
 		return -ENOSPC;
-	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
+	if (!epi)
 		return -ENOMEM;
 
 	/* Item initialization follow here ... */
@@ -1349,7 +1348,6 @@ error_unregister:
 	wakeup_source_unregister(ep_wakeup_source(epi));
 
 error_create_wakeup_source:
-	kmem_cache_free(epi_cache, epi);
 
 	return error;
 }
@@ -1795,6 +1793,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	struct file *file, *tfile;
 	struct eventpoll *ep;
 	struct epitem *epi;
+	struct epitem *epi_prepped = NULL;
+	struct epitem *epi_dropped = NULL;
 	struct epoll_event epds;
 
 	error = -EFAULT;
@@ -1849,6 +1849,9 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	 * b/c we want to make sure we are looking at a coherent view of
 	 * epoll network.
 	 */
+	if (op == EPOLL_CTL_ADD)
+		epi_prepped = kmem_cache_alloc(epi_cache, GFP_KERNEL);
+
 	if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
 		mutex_lock(&epmutex);
 		did_lock_epmutex = 1;
@@ -1878,15 +1881,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	case EPOLL_CTL_ADD:
 		if (!epi) {
 			epds.events |= POLLERR | POLLHUP;
-			error = ep_insert(ep, &epds, tfile, fd);
-		} else
+			error = ep_insert(ep, &epds, tfile, fd, epi_prepped);
+			if (error)
+				epi_dropped = epi_prepped;
+		} else {
 			error = -EEXIST;
+		}
 		clear_tfile_check_list();
 		break;
 	case EPOLL_CTL_DEL:
-		if (epi)
+		if (epi) {
 			error = ep_remove(ep, epi);
-		else
+			epi_dropped = epi;
+		} else
 			error = -ENOENT;
 		break;
 	case EPOLL_CTL_MOD:
@@ -1902,6 +1909,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 error_tgt_fput:
 	if (did_lock_epmutex)
 		mutex_unlock(&epmutex);
+	if (epi_dropped)
+		kmem_cache_free(epi_cache, epi_dropped);
 
 	fput(tfile);
 error_fput:
-- 
1.8.2.1
