[RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi,

Currently obtaining a new file descriptor results in locking fdtable
twice - once in order to reserve a slot and second time to fill it.

Hack below gets rid of the second lock usage.

It gives me a ~30% speedup (~300k ops -> ~400k ops) in a microbenchmark
where 16 threads create a pipe (2 fds) and call 2 * close.

Results are fluctuating and even with the patch sometimes drop to around
~300k ops. However, without the patch they never get higher.

I can come up with a better benchmark if that's necessary.

Comments?

==============================================

Subject: [PATCH] fs: use a sequence counter instead of file_lock in fd_install

Because the lock is not held, it is possible that fdtable will be
reallocated as we fill it.

RCU is used to guarantee the old table is not freed just in case we
happen to write to it (which is harmless).

sequence counter is used to ensure we updated the right table.

Signed-off-by: Mateusz Guzik <mguzik@xxxxxxxxxx>
---
 fs/file.c               | 24 +++++++++++++++++++-----
 include/linux/fdtable.h |  5 +++++
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index 93c5f89..bd1ef4c 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -165,8 +165,10 @@ static int expand_fdtable(struct files_struct *files, int nr)
 	cur_fdt = files_fdtable(files);
 	if (nr >= cur_fdt->max_fds) {
 		/* Continue as planned */
+		write_seqcount_begin(&files->fdt_seqcount);
 		copy_fdtable(new_fdt, cur_fdt);
 		rcu_assign_pointer(files->fdt, new_fdt);
+		write_seqcount_end(&files->fdt_seqcount);
 		if (cur_fdt != &files->fdtab)
 			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
 	} else {
@@ -256,6 +258,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	atomic_set(&newf->count, 1);
 
 	spin_lock_init(&newf->file_lock);
+	seqcount_init(&newf->fdt_seqcount);
 	newf->next_fd = 0;
 	new_fdt = &newf->fdtab;
 	new_fdt->max_fds = NR_OPEN_DEFAULT;
@@ -429,6 +432,7 @@ void exit_files(struct task_struct *tsk)
 
 struct files_struct init_files = {
 	.count		= ATOMIC_INIT(1),
+	.fdt_seqcount	= SEQCNT_ZERO(fdt_seqcount),
 	.fdt		= &init_files.fdtab,
 	.fdtab		= {
 		.max_fds	= NR_OPEN_DEFAULT,
@@ -552,12 +556,22 @@ EXPORT_SYMBOL(put_unused_fd);
 void __fd_install(struct files_struct *files, unsigned int fd,
 		struct file *file)
 {
+	unsigned long seq;
 	struct fdtable *fdt;
-	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
-	BUG_ON(fdt->fd[fd] != NULL);
-	rcu_assign_pointer(fdt->fd[fd], file);
-	spin_unlock(&files->file_lock);
+
+	rcu_read_lock();
+	do {
+		seq = read_seqcount_begin(&files->fdt_seqcount);
+		fdt = files_fdtable_seq(files);
+		/*
+		 * Entry in the table can already be equal to file if we
+		 * had to restart and copy_fdtable picked up our update.
+		 */
+		BUG_ON(!(fdt->fd[fd] == NULL || fdt->fd[fd] == file));
+		rcu_assign_pointer(fdt->fd[fd], file);
+		smp_mb();
+	} while (__read_seqcount_retry(&files->fdt_seqcount, seq));
+	rcu_read_unlock();
 }
 
 void fd_install(unsigned int fd, struct file *file)
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 230f87b..9e41765 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -12,6 +12,7 @@
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/fs.h>
+#include <linux/seqlock.h>
 
 #include <linux/atomic.h>
 
@@ -47,6 +48,7 @@ struct files_struct {
    * read mostly part
    */
 	atomic_t count;
+	seqcount_t fdt_seqcount;
 	struct fdtable __rcu *fdt;
 	struct fdtable fdtab;
   /*
@@ -69,6 +71,9 @@ struct dentry;
 #define files_fdtable(files) \
 	rcu_dereference_check_fdtable((files), (files)->fdt)
 
+#define files_fdtable_seq(files) \
+	rcu_dereference((files)->fdt)
+
 /*
  * The caller must ensure that fd table isn't shared or hold rcu or file lock
  */
-- 
1.8.3.1
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux