[RFC][PATCH 1/6] mm: Dont assume page-table invariance during faults

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



One of the side effects of speculating on faults (without holding
mmap_sem) is that we can race with free_pgtables() and therefore we
cannot assume the page-tables will stick around.

Remove the relyance on the pte pointer.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
 mm/memory.c |   76 ++++++++++++++++--------------------------------------------
 1 file changed, 21 insertions(+), 55 deletions(-)

--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1933,31 +1933,6 @@ int apply_to_page_range(struct mm_struct
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
-/*
- * handle_pte_fault chooses page fault handler according to an entry
- * which was read non-atomically.  Before making any commitment, on
- * those architectures or configurations (e.g. i386 with PAE) which
- * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
- * must check under lock before unmapping the pte and proceeding
- * (but do_wp_page is only called after already making such a check;
- * and do_anonymous_page can safely check later on).
- */
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
-				pte_t *page_table, pte_t orig_pte)
-{
-	int same = 1;
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
-	if (sizeof(pte_t) > sizeof(unsigned long)) {
-		spinlock_t *ptl = pte_lockptr(mm, pmd);
-		spin_lock(ptl);
-		same = pte_same(*page_table, orig_pte);
-		spin_unlock(ptl);
-	}
-#endif
-	pte_unmap(page_table);
-	return same;
-}
-
 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
 {
 	debug_dma_assert_idle(src);
@@ -2407,21 +2382,18 @@ EXPORT_SYMBOL(unmap_mapping_range);
  * as does filemap_fault().
  */
 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		unsigned long address, pmd_t *pmd,
 		unsigned int flags, pte_t orig_pte)
 {
 	spinlock_t *ptl;
 	struct page *page, *swapcache;
 	struct mem_cgroup *memcg;
 	swp_entry_t entry;
-	pte_t pte;
+	pte_t *page_table, pte;
 	int locked;
 	int exclusive = 0;
 	int ret = 0;
 
-	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-		goto out;
-
 	entry = pte_to_swp_entry(orig_pte);
 	if (unlikely(non_swap_entry(entry))) {
 		if (is_migration_entry(entry)) {
@@ -2624,15 +2596,13 @@ static inline int check_stack_guard_page
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		unsigned long address, pmd_t *pmd,
 		unsigned int flags)
 {
 	struct mem_cgroup *memcg;
 	struct page *page;
 	spinlock_t *ptl;
-	pte_t entry;
-
-	pte_unmap(page_table);
+	pte_t entry, *page_table;
 
 	/* Check if we need to add a guard page to the stack */
 	if (check_stack_guard_page(vma, address) < 0)
@@ -3031,13 +3001,12 @@ static int do_shared_fault(struct mm_str
  * return value.  See filemap_fault() and __lock_page_or_retry().
  */
 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		unsigned long address, pmd_t *pmd,
 		unsigned int flags, pte_t orig_pte)
 {
 	pgoff_t pgoff = (((address & PAGE_MASK)
 			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 
-	pte_unmap(page_table);
 	if (!(flags & FAULT_FLAG_WRITE))
 		return do_read_fault(mm, vma, address, pmd, pgoff, flags,
 				orig_pte);
@@ -3059,16 +3028,13 @@ static int do_linear_fault(struct mm_str
  * return value.  See filemap_fault() and __lock_page_or_retry().
  */
 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		unsigned long address, pmd_t *pmd,
 		unsigned int flags, pte_t orig_pte)
 {
 	pgoff_t pgoff;
 
 	flags |= FAULT_FLAG_NONLINEAR;
 
-	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-		return 0;
-
 	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
 		/*
 		 * Page table corrupted: show pte and kill process.
@@ -3103,7 +3069,7 @@ static int numa_migrate_prep(struct page
 }
 
 static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		   unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
+		   unsigned long addr, pte_t pte, pmd_t *pmd)
 {
 	struct page *page = NULL;
 	spinlock_t *ptl;
@@ -3112,6 +3078,7 @@ static int do_numa_page(struct mm_struct
 	int target_nid;
 	bool migrated = false;
 	int flags = 0;
+	pte_t *ptep;
 
 	/*
 	* The "pte" at this point cannot be used safely without
@@ -3122,8 +3089,7 @@ static int do_numa_page(struct mm_struct
 	* the _PAGE_NUMA bit and it is not really expected that there
 	* would be concurrent hardware modifications to the PTE.
 	*/
-	ptl = pte_lockptr(mm, pmd);
-	spin_lock(ptl);
+	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	if (unlikely(!pte_same(*ptep, pte))) {
 		pte_unmap_unlock(ptep, ptl);
 		goto out;
@@ -3195,34 +3161,32 @@ static int do_numa_page(struct mm_struct
  */
 static int handle_pte_fault(struct mm_struct *mm,
 		     struct vm_area_struct *vma, unsigned long address,
-		     pte_t *pte, pmd_t *pmd, unsigned int flags)
+		     pte_t entry, pmd_t *pmd, unsigned int flags)
 {
-	pte_t entry;
 	spinlock_t *ptl;
+	pte_t *pte;
 
-	entry = ACCESS_ONCE(*pte);
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
 			if (vma->vm_ops) {
 				if (likely(vma->vm_ops->fault))
 					return do_linear_fault(mm, vma, address,
-						pte, pmd, flags, entry);
+						pmd, flags, entry);
 			}
 			return do_anonymous_page(mm, vma, address,
-						 pte, pmd, flags);
+						 pmd, flags);
 		}
 		if (pte_file(entry))
 			return do_nonlinear_fault(mm, vma, address,
-					pte, pmd, flags, entry);
+					pmd, flags, entry);
 		return do_swap_page(mm, vma, address,
-					pte, pmd, flags, entry);
+					pmd, flags, entry);
 	}
 
 	if (pte_numa(entry))
-		return do_numa_page(mm, vma, address, entry, pte, pmd);
+		return do_numa_page(mm, vma, address, entry, pmd);
 
-	ptl = pte_lockptr(mm, pmd);
-	spin_lock(ptl);
+	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (unlikely(!pte_same(*pte, entry)))
 		goto unlock;
 	if (flags & FAULT_FLAG_WRITE) {
@@ -3261,7 +3225,7 @@ static int __handle_mm_fault(struct mm_s
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
-	pte_t *pte;
+	pte_t *pte, entry;
 
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		return hugetlb_fault(mm, vma, address, flags);
@@ -3331,8 +3295,10 @@ static int __handle_mm_fault(struct mm_s
 	 * safe to run pte_offset_map().
 	 */
 	pte = pte_offset_map(pmd, address);
+	entry = ACCESS_ONCE(*pte);
+	pte_unmap(pte);
 
-	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+	return handle_pte_fault(mm, vma, address, entry, pmd, flags);
 }
 
 /*


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]