The patch titled page_fault retry with NOPAGE_RETRY has been added to the -mm tree. Its filename is page_fault-retry-with-nopage_retry.patch Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/SubmitChecklist when testing your code *** See http://userweb.kernel.org/~akpm/stuff/added-to-mm.txt to find out what to do about this The current -mm tree may be found at http://userweb.kernel.org/~akpm/mmotm/ ------------------------------------------------------ Subject: page_fault retry with NOPAGE_RETRY From: Ying Han <yinghan@xxxxxxxxxx> Allow major faults to drop the mmap_sem read lock while waiting for synchronous disk read. This allows another thread which wishes to grab down_write(mmap_sem) to proceed while the current thread is waiting on the disk IO. The patch extends the 'write' flag of handle_mm_fault() to FAULT_FLAG_RETRY to identify that the caller can tolerate the retry in the filemap_fault call path. This patch helps a lot in cases where we have a writer which is waiting behind all readers, so it could execute much faster. I also made patches for all other arches. I am posting x86_64 here first and I will post the others by the time everyone feels comfortable with this patch. todo: - there is potentially a starvation hole with the retry. By the time the retry returns, the pages might be released. We can address this by holding a page reference as well as remembering what the page "was" (in case the file was truncated). Any suggestions here are welcome. 
Signed-off-by: Mike Waychison <mikew@xxxxxxxxxx> Signed-off-by: Ying Han <yinghan@xxxxxxxxxx> Tested-by: Török Edwin <edwintorok@xxxxxxxxx> Cc: Ingo Molnar <mingo@xxxxxxx> Cc: David Rientjes <rientjes@xxxxxxxxxx> Cc: Rohit Seth <rohitseth@xxxxxxxxxx> Cc: Hugh Dickins <hugh@xxxxxxxxxxx> Cc: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx> Cc: "H. Peter Anvin" <hpa@xxxxxxxxx> Cc: Lee Schermerhorn <lee.schermerhorn@xxxxxx> Cc: Nick Piggin <npiggin@xxxxxxx> Cc: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- arch/x86/mm/fault.c | 12 ++++++ include/linux/mm.h | 3 + mm/filemap.c | 84 ++++++++++++++++++++++++++++++++++++++++-- mm/memory.c | 8 ++++ 4 files changed, 102 insertions(+), 5 deletions(-) diff -puN arch/x86/mm/fault.c~page_fault-retry-with-nopage_retry arch/x86/mm/fault.c --- a/arch/x86/mm/fault.c~page_fault-retry-with-nopage_retry +++ a/arch/x86/mm/fault.c @@ -597,6 +597,7 @@ void __kprobes do_page_fault(struct pt_r unsigned long flags; int sig; #endif + unsigned int retry_flag = FAULT_FLAG_RETRY; tsk = current; mm = tsk->mm; @@ -705,6 +706,7 @@ void __kprobes do_page_fault(struct pt_r down_read(&mm->mmap_sem); } +retry: vma = find_vma(mm, address); if (!vma) goto bad_area; @@ -731,6 +733,7 @@ void __kprobes do_page_fault(struct pt_r good_area: si_code = SEGV_ACCERR; write = 0; + write |= retry_flag; switch (error_code & (PF_PROT|PF_WRITE)) { default: /* 3: write, present */ /* fall through */ @@ -759,6 +762,15 @@ good_area: goto do_sigbus; BUG(); } + + if (fault & VM_FAULT_RETRY) { + if (write & FAULT_FLAG_RETRY) { + retry_flag &= ~FAULT_FLAG_RETRY; + goto retry; + } + BUG(); + } + if (fault & VM_FAULT_MAJOR) tsk->maj_flt++; else diff -puN include/linux/mm.h~page_fault-retry-with-nopage_retry include/linux/mm.h --- a/include/linux/mm.h~page_fault-retry-with-nopage_retry +++ a/include/linux/mm.h @@ -144,7 +144,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access 
*/ #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ - +#define FAULT_FLAG_RETRY 0x04 /* Retry major fault */ /* * vm_fault is filled by the the pagefault handler and passed to the vma's @@ -694,6 +694,7 @@ static inline int page_mapped(struct pag #define VM_FAULT_SIGBUS 0x0002 #define VM_FAULT_MAJOR 0x0004 #define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ +#define VM_FAULT_RETRY 0x0010 #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ diff -puN mm/filemap.c~page_fault-retry-with-nopage_retry mm/filemap.c --- a/mm/filemap.c~page_fault-retry-with-nopage_retry +++ a/mm/filemap.c @@ -714,6 +714,56 @@ repeat: EXPORT_SYMBOL(find_lock_page); /** + * find_lock_page_retry - locate, pin and lock a pagecache page, if retry + * flag is on, and page is already locked by someone else, return a hint of + * retry. + * @mapping: the address_space to search + * @offset: the page index + * @vma: vma in which the fault was taken + * @page: zero if page not present, otherwise point to the page in + * pagecache. + * @retry: 1 indicates the caller can tolerate a retry. + * + * Return *page==NULL if page is not in pagecache. Otherwise return *page + * points to the page in the pagecache with ret=VM_FAULT_RETRY indicating a + * hint to caller for retry, or ret=0 which means page is successfully + * locked. 
+ */ +unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset, + struct vm_area_struct *vma, struct page **page, + int retry) +{ + unsigned int ret = 0; + +repeat: + *page = find_get_page(mapping, offset); + if (*page) { + if (!retry) + lock_page(*page); + else { + if (!trylock_page(*page)) { + struct mm_struct *mm = vma->vm_mm; + + up_read(&mm->mmap_sem); + wait_on_page_locked(*page); + down_read(&mm->mmap_sem); + + page_cache_release(*page); + return VM_FAULT_RETRY; + } + } + if (unlikely((*page)->mapping != mapping)) { + unlock_page(*page); + page_cache_release(*page); + goto repeat; + } + VM_BUG_ON((*page)->index != offset); + } + return ret; +} +EXPORT_SYMBOL(find_lock_page_retry); + +/** * find_or_create_page - locate or add a pagecache page * @mapping: the page's address_space * @index: the page's index into the mapping @@ -1444,6 +1494,8 @@ int filemap_fault(struct vm_area_struct pgoff_t size; int did_readaround = 0; int ret = 0; + int retry_flag = vmf->flags & FAULT_FLAG_RETRY; + int retry_ret; size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (vmf->pgoff >= size) @@ -1458,6 +1510,8 @@ int filemap_fault(struct vm_area_struct */ retry_find: page = find_lock_page(mapping, vmf->pgoff); + +retry_find_nopage: /* * For sequential accesses, we use the generic readahead logic. 
*/ @@ -1465,9 +1519,12 @@ retry_find: if (!page) { page_cache_sync_readahead(mapping, ra, file, vmf->pgoff, 1); - page = find_lock_page(mapping, vmf->pgoff); + retry_ret = find_lock_page_retry(mapping, vmf->pgoff, + vma, &page, retry_flag); if (!page) goto no_cached_page; + if (retry_ret == VM_FAULT_RETRY) + return retry_ret; } if (PageReadahead(page)) { page_cache_async_readahead(mapping, ra, file, page, @@ -1504,14 +1561,18 @@ retry_find: start = vmf->pgoff - ra_pages / 2; do_page_cache_readahead(mapping, file, start, ra_pages); } - page = find_lock_page(mapping, vmf->pgoff); + retry_ret = find_lock_page_retry(mapping, vmf->pgoff, + vma, &page, retry_flag); if (!page) goto no_cached_page; + if (retry_ret == VM_FAULT_RETRY) + return retry_ret; } if (!did_readaround) ra->mmap_miss--; +retry_page_update: /* * We have a locked page in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error. @@ -1546,8 +1607,23 @@ no_cached_page: * In the unlikely event that someone removed it in the * meantime, we'll just come back here and read it again. */ - if (error >= 0) - goto retry_find; + if (error >= 0) { + /* + * If caller cannot tolerate a retry in the ->fault path + * go back to check the page again. 
+ */ + if (!retry_flag) + goto retry_find; + + retry_ret = find_lock_page_retry(mapping, vmf->pgoff, + vma, &page, retry_flag); + if (!page) + goto retry_find_nopage; + else if (retry_ret == VM_FAULT_RETRY) + return retry_ret; + else + goto retry_page_update; + } /* * An error return from page_cache_read can result if the diff -puN mm/memory.c~page_fault-retry-with-nopage_retry mm/memory.c --- a/mm/memory.c~page_fault-retry-with-nopage_retry +++ a/mm/memory.c @@ -2543,6 +2543,13 @@ static int __do_fault(struct mm_struct * vmf.page = NULL; ret = vma->vm_ops->fault(vma, &vmf); + + /* page may be available, but we have to restart the process + * because mmap_sem was dropped during the ->fault + */ + if (ret == VM_FAULT_RETRY) + return ret; + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) return ret; @@ -2686,6 +2693,7 @@ static int do_linear_fault(struct mm_str - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); + flags |= (write_access & FAULT_FLAG_RETRY); pte_unmap(page_table); return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } _ Patches currently in -mm which might be from yinghan@xxxxxxxxxx are mm-make-get_user_pages-interruptible.patch mm-make-get_user_pages-interruptible-mmotm-ignore-sigkill-in-get_user_pages-during-munlock.patch page_fault-retry-with-nopage_retry.patch page_fault-retry-with-nopage_retry-fix.patch -- To unsubscribe from this list: send the line "unsubscribe mm-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html