[nacked] page_fault-retry-with-nopage_retry.patch removed from -mm tree

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



The patch titled
     page_fault retry with NOPAGE_RETRY
has been removed from the -mm tree.  Its filename was
     page_fault-retry-with-nopage_retry.patch

This patch was dropped because it was nacked

The current -mm tree may be found at http://userweb.kernel.org/~akpm/mmotm/

------------------------------------------------------
Subject: page_fault retry with NOPAGE_RETRY
From: Ying Han <yinghan@xxxxxxxxxx>

Allow major faults to drop the mmap_sem read lock while waiting for
synchronous disk read.  This allows another thread which wishes to grab
down_write(mmap_sem) to proceed while the current is waiting the disk IO.

The patch extends the 'write' flag of handle_mm_fault() to FAULT_FLAG_RETRY
to identify that the caller can tolerate a retry in the filemap_fault
call path.

This patch helps a lot in cases where we have a writer waiting behind
all readers, so it can execute much faster.

I also made patches for all other architectures. I am posting x86_64 here
first, and I will post the others once everyone feels comfortable with this
patch.

[akpm@xxxxxxxxxxxxxxxxxxxx: untangle the `write' boolean from the FAULT_FLAG_foo non-boolean field]
[akpm@xxxxxxxxxxxxxxxxxxxx: undo wrong spelling change]
Signed-off-by: Mike Waychison <mikew@xxxxxxxxxx>
Signed-off-by: Ying Han <yinghan@xxxxxxxxxx>
Tested-by: Török Edwin <edwintorok@xxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxx>
Cc: David Rientjes <rientjes@xxxxxxxxxx>
Cc: Rohit Seth <rohitseth@xxxxxxxxxx>
Cc: Hugh Dickins <hugh@xxxxxxxxxxx>
Cc: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
Cc: "H. Peter Anvin" <hpa@xxxxxxxxx>
Cc: Lee Schermerhorn <lee.schermerhorn@xxxxxx>
Cc: Nick Piggin <npiggin@xxxxxxx>
Cc: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---

 arch/x86/mm/fault.c |   16 +++++++
 include/linux/mm.h  |    2 
 mm/filemap.c        |   85 ++++++++++++++++++++++++++++++++++++++++--
 mm/memory.c         |   11 ++++-
 4 files changed, 109 insertions(+), 5 deletions(-)

diff -puN arch/x86/mm/fault.c~page_fault-retry-with-nopage_retry arch/x86/mm/fault.c
--- a/arch/x86/mm/fault.c~page_fault-retry-with-nopage_retry
+++ a/arch/x86/mm/fault.c
@@ -977,6 +977,7 @@ do_page_fault(struct pt_regs *regs, unsi
 	struct mm_struct *mm;
 	int write;
 	int fault;
+	int retry_flag = 1;
 
 	tsk = current;
 	mm = tsk->mm;
@@ -1085,6 +1086,7 @@ do_page_fault(struct pt_regs *regs, unsi
 		might_sleep();
 	}
 
+retry:
 	vma = find_vma(mm, address);
 	if (unlikely(!vma)) {
 		bad_area(regs, error_code, address);
@@ -1137,6 +1139,20 @@ good_area:
 		return;
 	}
 
+	/*
+	 * Here we retry fault once and switch to synchronous mode. The
+	 * main reason is to prevent us from the cases of starvation.
+	 * The retry logic open a starvation hole in which case pages might
+	 * be removed or changed after the retry.
+	 */
+	if (fault & VM_FAULT_RETRY) {
+		if (retry_flag) {
+			retry_flag = 0;
+			goto retry;
+		}
+		BUG();
+	}
+
 	if (fault & VM_FAULT_MAJOR)
 		tsk->maj_flt++;
 	else
diff -puN include/linux/mm.h~page_fault-retry-with-nopage_retry include/linux/mm.h
--- a/include/linux/mm.h~page_fault-retry-with-nopage_retry
+++ a/include/linux/mm.h
@@ -135,6 +135,7 @@ extern pgprot_t protection_map[16];
 
 #define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
 #define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
+#define FAULT_FLAG_RETRY	0x04	/* Retry major fault */
 
 /*
  * This interface is used by x86 PAT code to identify a pfn mapping that is
@@ -697,6 +698,7 @@ static inline int page_mapped(struct pag
 
 #define VM_FAULT_MINOR	0 /* For backwards compat. Remove me quickly. */
 
+#define VM_FAULT_RETRY	0x0010
 #define VM_FAULT_OOM	0x0001
 #define VM_FAULT_SIGBUS	0x0002
 #define VM_FAULT_MAJOR	0x0004
diff -puN mm/filemap.c~page_fault-retry-with-nopage_retry mm/filemap.c
--- a/mm/filemap.c~page_fault-retry-with-nopage_retry
+++ a/mm/filemap.c
@@ -714,6 +714,57 @@ repeat:
 EXPORT_SYMBOL(find_lock_page);
 
 /**
+ * find_lock_page_retry - locate, pin and lock a pagecache page
+ * @mapping: the address_space to search
+ * @offset: the page index
+ * @vma: vma in which the fault was taken
+ * @ppage: zero if page not present, otherwise point to the page in pagecache
+ * @retry: 1 indicate caller tolerate a retry.
+ *
+ * If retry flag is on, and page is already locked by someone else, return
+ * a hint of retry.
+ *
+ * Return *ppage==NULL if page is not in pagecache. Otherwise return *ppage
+ * points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
+ * hint to caller for retry, or ret=0 which means page is successfully locked.
+ */
+unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
+				struct vm_area_struct *vma, struct page **ppage,
+				int retry)
+{
+	unsigned int ret = 0;
+	struct page *page;
+
+repeat:
+	page = find_get_page(mapping, offset);
+	if (page) {
+		if (!retry)
+			lock_page(page);
+		else {
+			if (!trylock_page(page)) {
+				struct mm_struct *mm = vma->vm_mm;
+
+				up_read(&mm->mmap_sem);
+				wait_on_page_locked(page);
+				down_read(&mm->mmap_sem);
+
+				page_cache_release(page);
+				return VM_FAULT_RETRY;
+			}
+		}
+		if (unlikely(page->mapping != mapping)) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto repeat;
+		}
+		VM_BUG_ON(page->index != offset);
+	}
+	*ppage = page;
+	return ret;
+}
+EXPORT_SYMBOL(find_lock_page_retry);
+
+/**
  * find_or_create_page - locate or add a pagecache page
  * @mapping: the page's address_space
  * @index: the page's index into the mapping
@@ -1459,6 +1510,8 @@ int filemap_fault(struct vm_area_struct 
 	pgoff_t size;
 	int did_readaround = 0;
 	int ret = 0;
+	int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
+	int retry_ret;
 
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (vmf->pgoff >= size)
@@ -1473,6 +1526,8 @@ int filemap_fault(struct vm_area_struct 
 	 */
 retry_find:
 	page = find_lock_page(mapping, vmf->pgoff);
+
+retry_find_nopage:
 	/*
 	 * For sequential accesses, we use the generic readahead logic.
 	 */
@@ -1480,9 +1535,12 @@ retry_find:
 		if (!page) {
 			page_cache_sync_readahead(mapping, ra, file,
 							   vmf->pgoff, 1);
-			page = find_lock_page(mapping, vmf->pgoff);
+			retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
+						vma, &page, retry_flag);
 			if (!page)
 				goto no_cached_page;
+			if (retry_ret == VM_FAULT_RETRY)
+				return retry_ret;
 		}
 		if (PageReadahead(page)) {
 			page_cache_async_readahead(mapping, ra, file, page,
@@ -1519,14 +1577,18 @@ retry_find:
 				start = vmf->pgoff - ra_pages / 2;
 			do_page_cache_readahead(mapping, file, start, ra_pages);
 		}
-		page = find_lock_page(mapping, vmf->pgoff);
+		retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
+				vma, &page, retry_flag);
 		if (!page)
 			goto no_cached_page;
+		if (retry_ret == VM_FAULT_RETRY)
+			return retry_ret;
 	}
 
 	if (!did_readaround)
 		ra->mmap_miss--;
 
+retry_page_update:
 	/*
 	 * We have a locked page in the page cache, now we need to check
 	 * that it's up-to-date. If not, it is going to be due to an error.
@@ -1561,8 +1623,23 @@ no_cached_page:
 	 * In the unlikely event that someone removed it in the
 	 * meantime, we'll just come back here and read it again.
 	 */
-	if (error >= 0)
-		goto retry_find;
+	if (error >= 0) {
+		/*
+		 * If caller cannot tolerate a retry in the ->fault path
+		 * go back to check the page again.
+		 */
+		if (!retry_flag)
+			goto retry_find;
+
+		retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
+					vma, &page, retry_flag);
+		if (!page)
+			goto retry_find_nopage;
+		else if (retry_ret == VM_FAULT_RETRY)
+			return retry_ret;
+		else
+			goto retry_page_update;
+	}
 
 	/*
 	 * An error return from page_cache_read can result if the
diff -puN mm/memory.c~page_fault-retry-with-nopage_retry mm/memory.c
--- a/mm/memory.c~page_fault-retry-with-nopage_retry
+++ a/mm/memory.c
@@ -2595,6 +2595,13 @@ static int __do_fault(struct mm_struct *
 	vmf.page = NULL;
 
 	ret = vma->vm_ops->fault(vma, &vmf);
+
+	/* page may be available, but we have to restart the process
+	 * because mmap_sem was dropped during the ->fault
+	 */
+	if (ret & VM_FAULT_RETRY)
+		return ret;
+
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
 		return ret;
 
@@ -2736,8 +2743,10 @@ static int do_linear_fault(struct mm_str
 {
 	pgoff_t pgoff = (((address & PAGE_MASK)
 			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-	unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
+	int write = write_access & ~FAULT_FLAG_RETRY;
+	unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
 
+	flags |= (write_access & FAULT_FLAG_RETRY);
 	pte_unmap(page_table);
 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
_

Patches currently in -mm which might be from yinghan@xxxxxxxxxx are

page_fault-retry-with-nopage_retry.patch
proc-pid-maps-dont-show-pgoff-of-pure-anon-vmas.patch
proc-pid-maps-dont-show-pgoff-of-pure-anon-vmas-checkpatch-fixes.patch

--
To unsubscribe from this list: send the line "unsubscribe mm-commits" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Kernel Newbies FAQ]     [Kernel Archive]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [Bugtraq]     [Photo]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]

  Powered by Linux