Hi, I am looking at ways to solve the following problem:

Some cluster monitoring software we use at Google periodically accesses files such as /proc/<pid>/exe or /proc/<pid>/maps, which requires acquiring that pid's mmap_sem for read. When the machines get loaded enough, this acquisition can take a long time. Typically, thread A acquires mmap_sem for read in do_page_fault and blocks (either trying to allocate a page or trying to read from disk); thread B then tries to acquire mmap_sem for write and queues behind A; the monitoring software then tries to read /proc/<pid>/maps and queues behind B because of the rwsem's fair queueing behavior.

We have been using patches that address this by letting the /proc/<pid>/exe and /proc/<pid>/maps code paths acquire mmap_sem for read in an unfair way, so the monitoring software can bypass thread B and take mmap_sem concurrently with thread A. This was easy to implement with the generic rwsem, and it looks doable with the x86 rwsem implementation as well, in a way that would only involve changes to the rwsem spinlock-protected slow paths in lib/rwsem.c. We are still working on that code, but I thought we should ask first how the developer community feels about the general idea.
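To make the access pattern concrete, the monitoring side boils down to something like the sketch below (illustrative only: the target pid comes from argv and the 10-second polling interval is arbitrary). Each pass over /proc/<pid>/maps makes m_start() in fs/proc/task_mmu.c take that task's mmap_sem for read, which is where the reader ends up queued behind a waiting writer:

/*
 * Sketch of a periodic /proc/<pid>/maps reader (illustration only).
 */
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        char path[64], buf[4096];
        size_t n;
        FILE *f;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                return 1;
        }
        snprintf(path, sizeof(path), "/proc/%s/maps", argv[1]);

        for (;;) {
                f = fopen(path, "r");
                if (!f) {
                        perror(path);
                        return 1;
                }
                /* reading here takes the target's mmap_sem for read */
                while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
                        fwrite(buf, 1, n, stdout);
                fclose(f);
                sleep(10);      /* periodic poll */
        }
}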
For reference, here is one patch we have (against 2.6.33) that uses down_read_unfair() in this way (there is no x86-specific rwsem implementation yet):

Author: Ying Han <yinghan@xxxxxxxxxx>

Introduce down_read_unfair()

With down_read_unfair(), a reader does not wait for the (non-exclusive) lock even when a writer is already on the queue. Apply it to the maps and exe files in procfs, which the monitoring program reads frequently.

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 58324c2..d51bc55 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1367,7 +1367,7 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
 
         /* We need mmap_sem to protect against races with removal of
          * VM_EXECUTABLE vmas */
-        down_read(&mm->mmap_sem);
+        down_read_unfair(&mm->mmap_sem);
         exe_file = mm->exe_file;
         if (exe_file)
                 get_file(exe_file);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f277c4a..118e0cd 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -119,7 +119,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
         mm = mm_for_maps(priv->task);
         if (!mm)
                 return NULL;
-        down_read(&mm->mmap_sem);
+        down_read_unfair(&mm->mmap_sem);
 
         tail_vma = get_gate_vma(priv->task);
         priv->tail_vma = tail_vma;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 5d9fd64..2ab484b 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -193,7 +193,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
                 priv->task = NULL;
                 return NULL;
         }
-        down_read(&mm->mmap_sem);
+        down_read_unfair(&mm->mmap_sem);
 
         /* start from the Nth VMA */
         for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h
index bdfcc25..48199db 100644
--- a/include/linux/rwsem-spinlock.h
+++ b/include/linux/rwsem-spinlock.h
@@ -60,7 +60,8 @@ do {                                                        \
         __init_rwsem((sem), #sem, &__key);                  \
 } while (0)
 
-extern void __down_read(struct rw_semaphore *sem);
+extern void __down_read_fair(struct rw_semaphore *sem);
+extern void __down_read_unfair(struct rw_semaphore *sem);
 extern int __down_read_trylock(struct rw_semaphore *sem);
 extern void __down_write(struct rw_semaphore *sem);
 extern void __down_write_nested(struct rw_semaphore *sem, int subclass);
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index efd348f..65d666e 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -20,6 +20,7 @@ struct rw_semaphore;
 #include <linux/rwsem-spinlock.h> /* use a generic implementation */
 #else
 #include <asm/rwsem.h> /* use an arch-specific implementation */
+#error "Missing down_read_unfair support."
 #endif
 
 /*
@@ -28,6 +29,11 @@ struct rw_semaphore;
 extern void down_read(struct rw_semaphore *sem);
 
 /*
+ * lock for reading - skip waiting writers
+ */
+extern void down_read_unfair(struct rw_semaphore *sem);
+
+/*
  * trylock for reading -- returns 1 if successful, 0 if contention
  */
 extern int down_read_trylock(struct rw_semaphore *sem);
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index cae050b..24578c5 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -21,12 +21,25 @@ void __sched down_read(struct rw_semaphore *sem)
         might_sleep();
         rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
 
-        LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
+        LOCK_CONTENDED(sem, __down_read_trylock, __down_read_fair);
 }
 
 EXPORT_SYMBOL(down_read);
 
 /*
+ * lock for reading - skip waiting writers
+ */
+void __sched down_read_unfair(struct rw_semaphore *sem)
+{
+        might_sleep();
+        rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+
+        LOCK_CONTENDED(sem, __down_read_trylock, __down_read_unfair);
+}
+
+EXPORT_SYMBOL(down_read_unfair);
+
+/*
  * trylock for reading -- returns 1 if successful, 0 if contention
  */
 int down_read_trylock(struct rw_semaphore *sem)
@@ -112,7 +125,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
         might_sleep();
         rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
 
-        LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
+        LOCK_CONTENDED(sem, __down_read_trylock, __down_read_fair);
 }
 
 EXPORT_SYMBOL(down_read_nested);
@@ -121,7 +134,7 @@ void down_read_non_owner(struct rw_semaphore *sem)
 {
         might_sleep();
 
-        __down_read(sem);
+        __down_read_fair(sem);
 }
 
 EXPORT_SYMBOL(down_read_non_owner);
diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c
index ccf95bf..8c44c08 100644
--- a/lib/rwsem-spinlock.c
+++ b/lib/rwsem-spinlock.c
@@ -139,14 +139,14 @@ __rwsem_wake_one_writer(struct rw_semaphore *sem)
 /*
  * get a read lock on the semaphore
  */
-void __sched __down_read(struct rw_semaphore *sem)
+void __sched __down_read(struct rw_semaphore *sem, int unfair)
 {
         struct rwsem_waiter waiter;
         struct task_struct *tsk;
 
         spin_lock_irq(&sem->wait_lock);
 
-        if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
+        if (sem->activity >= 0 && (unfair || list_empty(&sem->wait_list))) {
                 /* granted */
                 sem->activity++;
                 spin_unlock_irq(&sem->wait_lock);
@@ -161,7 +161,11 @@ void __sched __down_read(struct rw_semaphore *sem)
         waiter.flags = RWSEM_WAITING_FOR_READ;
         get_task_struct(tsk);
 
-        list_add_tail(&waiter.list, &sem->wait_list);
+        if (unfair) {
+                list_add(&waiter.list, &sem->wait_list);
+        } else {
+                list_add_tail(&waiter.list, &sem->wait_list);
+        }
 
         /* we don't need to touch the semaphore struct anymore */
         spin_unlock_irq(&sem->wait_lock);
@@ -180,6 +184,22 @@ void __sched __down_read(struct rw_semaphore *sem)
 }
 
 /*
+ * wrapper for fair __down_read
+ */
+void __sched __down_read_fair(struct rw_semaphore *sem)
+{
+        __down_read(sem, 0);
+}
+
+/*
+ * wrapper for unfair __down_read
+ */
+void __sched __down_read_unfair(struct rw_semaphore *sem)
+{
+        __down_read(sem, 1);
+}
+
+/*
  * trylock for reading -- returns 1 if successful, 0 if contention
  */
 int __down_read_trylock(struct rw_semaphore *sem)

--
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.