On Mon, 5 Apr 2010, Linus Torvalds wrote:
> 
> THIS PATCH IS TOTALLY UNTESTED!

Ok, it was also crap. I tried to warn you.

We actually have that "split_page()" function that does the right thing,
I don't know why I didn't realize that. And the lock was uninitialized
for the optimistic case, because I had made that "clever optimization"
to let the caller do the unlocking in the common path, but when I did
that I didn't actually make sure that the caller had the right lock.

Whee. I'm a moron.

This is _still_ untested and probably horribly buggy, but at least it
isn't *quite* as rough as the previous patch was.

		Linus

---
 include/linux/gfp.h |    3 ++
 mm/memory.c         |   65 +++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/mempolicy.c      |    9 +++++++
 3 files changed, 77 insertions(+), 0 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 4c6d413..2b8f42b 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -84,6 +84,7 @@ struct vm_area_struct;
 #define GFP_HIGHUSER_MOVABLE	(__GFP_WAIT | __GFP_IO | __GFP_FS | \
 				 __GFP_HARDWALL | __GFP_HIGHMEM | \
 				 __GFP_MOVABLE)
+#define GFP_USER_ORDER	(GFP_HIGHUSER_MOVABLE | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
 #define GFP_IOFS	(__GFP_IO | __GFP_FS)
 
 #ifdef CONFIG_NUMA
@@ -306,10 +307,12 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
 }
 extern struct page *alloc_page_vma(gfp_t gfp_mask,
 			struct vm_area_struct *vma, unsigned long addr);
+extern struct page *alloc_page_user_order(struct vm_area_struct *, unsigned long, int);
 #else
 #define alloc_pages(gfp_mask, order) \
 		alloc_pages_node(numa_node_id(), gfp_mask, order)
 #define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0)
+#define alloc_page_user_order(vma, addr, order) alloc_pages(GFP_USER_ORDER, order)
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
 
diff --git a/mm/memory.c b/mm/memory.c
index 1d2ea39..4f1521e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2742,6 +2742,66 @@ out_release:
 }
 
 /*
+ * See if we can optimistically fill eight pages at a time
+ */
+static spinlock_t *optimistic_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+	unsigned long address, pte_t *page_table, pmd_t *pmd)
+{
+	int i;
+	spinlock_t *ptl;
+	struct page *bigpage;
+
+	/* Don't even bother if it's not writable */
+	if (!(vma->vm_flags & VM_WRITE))
+		return NULL;
+
+	/* Are we ok wrt the vma boundaries? */
+	if ((address & (PAGE_MASK << 3)) < vma->vm_start)
+		return NULL;
+	if ((address | ~(PAGE_MASK << 3)) > vma->vm_end)
+		return NULL;
+
+	/*
+	 * Round to a nice even 8-page boundary, and
+	 * optimistically (with no locking), check whether
+	 * it's all empty. Skip if we have it partly filled
+	 * in.
+	 *
+	 * 8 page table entries tends to be about a cacheline.
+	 */
+	page_table -= (address >> PAGE_SHIFT) & 7;
+	for (i = 0; i < 8; i++)
+		if (!pte_none(page_table[i]))
+			return NULL;
+
+	/* Allocate the eight pages in one go, no warning or retrying */
+	bigpage = alloc_page_user_order(vma, address, 3);
+	if (!bigpage)
+		return NULL;
+
+	split_page(bigpage, 3);
+
+	ptl = pte_lockptr(mm, pmd);
+	spin_lock(ptl);
+
+	for (i = 0; i < 8; i++) {
+		struct page *page = bigpage + i;
+
+		if (pte_none(page_table[i])) {
+			pte_t pte = mk_pte(page, vma->vm_page_prot);
+			pte = pte_mkwrite(pte_mkdirty(pte));
+			set_pte_at(mm, address, page_table+i, pte);
+		} else {
+			__free_page(page);
+		}
+	}
+
+	/* The caller will unlock */
+	return ptl;
+}
+
+
+/*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
@@ -2754,6 +2814,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	pte_t entry;
 
+	ptl = optimistic_fault(mm, vma, address, page_table, pmd);
+	if (ptl)
+		goto update;
+
 	if (!(flags & FAULT_FLAG_WRITE)) {
 		entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
 						vma->vm_page_prot));
@@ -2790,6 +2854,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 setpte:
 	set_pte_at(mm, address, page_table, entry);
 
+update:
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, address, page_table);
 unlock:
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 08f40a2..55a92bd 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1707,6 +1707,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
 }
 
+struct page *
+alloc_page_user_order(struct vm_area_struct *vma, unsigned long addr, int order)
+{
+	struct mempolicy *pol = get_vma_policy(current, vma, addr);
+	struct zonelist *zl = policy_zonelist(GFP_USER_ORDER, pol);
+
+	return __alloc_pages_nodemask(GFP_USER_ORDER, order, zl, policy_nodemask(GFP_USER_ORDER, pol));
+}
+
 /**
  * alloc_pages_current - Allocate pages.
  *
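
As a sanity check on the block arithmetic above: address & (PAGE_MASK << 3)
is the first byte of the enclosing 8-page block, address | ~(PAGE_MASK << 3)
is its last byte, and (address >> PAGE_SHIFT) & 7 is the pte slot inside that
block, which is what the vma checks and the page_table adjustment rely on.
The following is a user-space sketch, not kernel code; PAGE_SHIFT/PAGE_MASK
here are local stand-ins for the kernel macros and assume 4 KiB pages.

/* User-space sketch of the 8-page block arithmetic (assumes 4 KiB pages). */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long address = 0x7f3a12345678UL;	/* arbitrary example address */

	/* First and last byte of the 32 KiB (8-page) block around the address */
	unsigned long start = address & (PAGE_MASK << 3);
	unsigned long last  = address | ~(PAGE_MASK << 3);
	/* Which of the 8 pte slots the faulting address falls into */
	unsigned long index = (address >> PAGE_SHIFT) & 7;

	printf("block start %#lx, block last byte %#lx, pte index %lu\n",
	       start, last, index);
	/* optimistic_fault() bails out unless start >= vm_start and last <= vm_end */
	return 0;
}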
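
And a rough way to see whether the batched fill actually triggers, assuming a
kernel with this patch applied: write one byte into every page of a large
anonymous mapping and compare ru_minflt before and after. With the optimistic
path, sequential first-touch writes should fault roughly once per 8 pages
rather than once per page. This is only a user-space sketch and the numbers
are approximate, since the rest of the process takes minor faults too.

/* Count minor faults while first-touching an anonymous mapping. */
#include <stdio.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	size_t npages = 65536;			/* 65536 pages (256 MB with 4 KiB pages) */
	size_t len = (size_t)page * npages;
	struct rusage before, after;
	size_t i;

	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	getrusage(RUSAGE_SELF, &before);
	for (i = 0; i < npages; i++)
		p[i * page] = 1;		/* write fault on first touch of each page */
	getrusage(RUSAGE_SELF, &after);

	printf("%zu pages touched, %ld minor faults\n",
	       npages, after.ru_minflt - before.ru_minflt);
	munmap(p, len);
	return 0;
}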