Re: [PATCHv4 02/10] mm: convert mm->nr_ptes to atomic_t

Johannes Weiner wrote:
> On Sat, Sep 28, 2013 at 01:24:51AM +0300, Kirill A. Shutemov wrote:
> > Cody P Schafer wrote:
> > > On 09/27/2013 06:16 AM, Kirill A. Shutemov wrote:
> > > > With split page table lock for PMD level, we can't hold
> > > > mm->page_table_lock while updating nr_ptes.
> > > >
> > > > Let's convert it to atomic_t to avoid races.
> > > >
> > > 
> > > > ---
> > > 
> > > > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > > > index 84e0c56e1e..99f19e850d 100644
> > > > --- a/include/linux/mm_types.h
> > > > +++ b/include/linux/mm_types.h
> > > > @@ -339,6 +339,7 @@ struct mm_struct {
> > > >   	pgd_t * pgd;
> > > >   	atomic_t mm_users;			/* How many users with user space? */
> > > >   	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
> > > > +	atomic_t nr_ptes;			/* Page table pages */
> > > >   	int map_count;				/* number of VMAs */
> > > >
> > > >   	spinlock_t page_table_lock;		/* Protects page tables and some counters */
> > > > @@ -360,7 +361,6 @@ struct mm_struct {
> > > >   	unsigned long exec_vm;		/* VM_EXEC & ~VM_WRITE */
> > > >   	unsigned long stack_vm;		/* VM_GROWSUP/DOWN */
> > > >   	unsigned long def_flags;
> > > > -	unsigned long nr_ptes;		/* Page table pages */
> > > >   	unsigned long start_code, end_code, start_data, end_data;
> > > >   	unsigned long start_brk, brk, start_stack;
> > > >   	unsigned long arg_start, arg_end, env_start, env_end;
> > > 
> > > Will 32 bits always be enough here? Should atomic_long_t be used instead?
> > 
> > Good question!
> > 
> > On x86_64, one page table covers 2M of virtual address space (512
> > entries of 4k each, i.e. 21 bits). The total virtual address space a
> > 31-bit (32 minus the sign bit) nr_ptes can cover is therefore 2^52
> > bytes (31 + 21 bits).
> > 
> > Currently, x86_64 with 4-level page tables can use at most 48 bits of
> > virtual address space (and only half of that is available to
> > userspace), so we're pretty safe here.
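For reference, here is a tiny userspace program that double-checks the
arithmetic above; it is illustrative only and not part of the patch:

/* One x86_64 page table maps 512 PTEs of 4k each, i.e. 2M of virtual
 * address space. A 31-bit nr_ptes can therefore account for at most
 * 2^(31+21) = 2^52 bytes. Illustrative only, not part of the patch. */
#include <stdio.h>

int main(void)
{
	unsigned long long table_span = 512ULL * 4096;    /* 2^21 bytes */
	unsigned long long max_tables = (1ULL << 31) - 1; /* atomic_t max */

	printf("one table covers %llu bytes\n", table_span);
	printf("31-bit nr_ptes covers %llu bytes (~2^52)\n",
	       max_tables * table_span);
	return 0;
}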
> > 
> > It could become a problem, though, if (when) x86_64 implements
> > 5-level page tables with 57 bits of virtual address space.
> > 
> > Any thoughts?
> 
> I'd just go with atomic_long_t to avoid having to worry about this in
> the first place.  It's been ulong forever and I'm not aware of struct
> mm_struct size being an urgent issue.  Cutting this type in half and
> adding overflow checks adds more problems than it solves.

Good point. Updated patch is below.
It will cause a few trivial conflicts in other patches. The whole patchset
can be found here:

git://git.kernel.org/pub/scm/linux/kernel/git/kas/linux.git thp/split_pmd_ptl/v5
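
The conversion itself is mechanical: every nr_ptes update that used to
happen under mm->page_table_lock becomes an atomic op, and plain reads
become atomic_long_read(). A minimal sketch of the pattern (the real
changes are in the diff below):

/* Before: nr_ptes was an unsigned long updated under the lock. */
spin_lock(&mm->page_table_lock);
mm->nr_ptes++;
spin_unlock(&mm->page_table_lock);

/* After: lock-free update, safe once the PMD-level lock is split. */
atomic_long_inc(&mm->nr_ptes);

/* Plain reads become atomic_long_read(&mm->nr_ptes). */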

From fa55c2f7be050d0aa8e529a867de850ab1d0a7fb Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@xxxxxxxxxxxxxxx>
Date: Thu, 12 Sep 2013 19:32:15 +0300
Subject: [PATCHv5] mm: convert mm->nr_ptes to atomic_long_t

With split page table lock for PMD level, we can't hold
mm->page_table_lock while updating nr_ptes.

Let's convert it to atomic_long_t to avoid races.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
Tested-by: Alex Thorlton <athorlton@xxxxxxx>
---
 fs/proc/task_mmu.c       |  3 ++-
 include/linux/mm_types.h |  2 +-
 kernel/fork.c            |  2 +-
 mm/huge_memory.c         | 10 +++++-----
 mm/memory.c              |  4 ++--
 mm/mmap.c                |  3 ++-
 mm/oom_kill.c            |  6 +++---
 7 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7366e9d63c..fda6c4f2bb 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -62,7 +62,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		total_rss << (PAGE_SHIFT-10),
 		data << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
-		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
+		(PTRS_PER_PTE * sizeof(pte_t) *
+		 atomic_long_read(&mm->nr_ptes)) >> 10,
 		swap << (PAGE_SHIFT-10));
 }
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 84e0c56e1e..149ad17ceb 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -339,6 +339,7 @@ struct mm_struct {
 	pgd_t * pgd;
 	atomic_t mm_users;			/* How many users with user space? */
 	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
+	atomic_long_t nr_ptes;			/* Page table pages */
 	int map_count;				/* number of VMAs */
 
 	spinlock_t page_table_lock;		/* Protects page tables and some counters */
@@ -360,7 +361,6 @@ struct mm_struct {
 	unsigned long exec_vm;		/* VM_EXEC & ~VM_WRITE */
 	unsigned long stack_vm;		/* VM_GROWSUP/DOWN */
 	unsigned long def_flags;
-	unsigned long nr_ptes;		/* Page table pages */
 	unsigned long start_code, end_code, start_data, end_data;
 	unsigned long start_brk, brk, start_stack;
 	unsigned long arg_start, arg_end, env_start, env_end;
diff --git a/kernel/fork.c b/kernel/fork.c
index 086fe73ad6..5c8a19482a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -532,7 +532,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 	mm->flags = (current->mm) ?
 		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
 	mm->core_state = NULL;
-	mm->nr_ptes = 0;
+	atomic_long_set(&mm->nr_ptes, 0);
 	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
 	spin_lock_init(&mm->page_table_lock);
 	mm_init_aio(mm);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7489884682..e31600f465 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -737,7 +737,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 		pgtable_trans_huge_deposit(mm, pmd, pgtable);
 		set_pmd_at(mm, haddr, pmd, entry);
 		add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
-		mm->nr_ptes++;
+		atomic_long_inc(&mm->nr_ptes);
 		spin_unlock(&mm->page_table_lock);
 	}
 
@@ -778,7 +778,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 	entry = pmd_mkhuge(entry);
 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, haddr, pmd, entry);
-	mm->nr_ptes++;
+	atomic_long_inc(&mm->nr_ptes);
 	return true;
 }
 
@@ -903,7 +903,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	pmd = pmd_mkold(pmd_wrprotect(pmd));
 	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
 	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
-	dst_mm->nr_ptes++;
+	atomic_long_inc(&dst_mm->nr_ptes);
 
 	ret = 0;
 out_unlock:
@@ -1358,7 +1358,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
 		pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
 		if (is_huge_zero_pmd(orig_pmd)) {
-			tlb->mm->nr_ptes--;
+			atomic_long_dec(&tlb->mm->nr_ptes);
 			spin_unlock(&tlb->mm->page_table_lock);
 			put_huge_zero_page();
 		} else {
@@ -1367,7 +1367,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			VM_BUG_ON(page_mapcount(page) < 0);
 			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
 			VM_BUG_ON(!PageHead(page));
-			tlb->mm->nr_ptes--;
+			atomic_long_dec(&tlb->mm->nr_ptes);
 			spin_unlock(&tlb->mm->page_table_lock);
 			tlb_remove_page(tlb, page);
 		}
diff --git a/mm/memory.c b/mm/memory.c
index ca00039471..975ab644f6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -382,7 +382,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 	pgtable_t token = pmd_pgtable(*pmd);
 	pmd_clear(pmd);
 	pte_free_tlb(tlb, token, addr);
-	tlb->mm->nr_ptes--;
+	atomic_long_dec(&tlb->mm->nr_ptes);
 }
 
 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -575,7 +575,7 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 	spin_lock(&mm->page_table_lock);
 	wait_split_huge_page = 0;
 	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
-		mm->nr_ptes++;
+		atomic_long_inc(&mm->nr_ptes);
 		pmd_populate(mm, pmd, new);
 		new = NULL;
 	} else if (unlikely(pmd_trans_splitting(*pmd)))
diff --git a/mm/mmap.c b/mm/mmap.c
index 9d548512ff..846dae00a5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2726,7 +2726,8 @@ void exit_mmap(struct mm_struct *mm)
 	}
 	vm_unacct_memory(nr_accounted);
 
-	WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
+	WARN_ON(atomic_long_read(&mm->nr_ptes) >
+			(FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
 }
 
 /* Insert vm structure into process list sorted by address
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 314e9d2743..51d9b680ae 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -161,7 +161,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * The baseline for the badness score is the proportion of RAM that each
 	 * task's rss, pagetable and swap space use.
 	 */
-	points = get_mm_rss(p->mm) + p->mm->nr_ptes +
+	points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) +
 		 get_mm_counter(p->mm, MM_SWAPENTS);
 	task_unlock(p);
 
@@ -364,10 +364,10 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
 			continue;
 		}
 
-		pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu         %5hd %s\n",
+		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu         %5hd %s\n",
 			task->pid, from_kuid(&init_user_ns, task_uid(task)),
 			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
-			task->mm->nr_ptes,
+			atomic_long_read(&task->mm->nr_ptes),
 			get_mm_counter(task->mm, MM_SWAPENTS),
 			task->signal->oom_score_adj, task->comm);
 		task_unlock(task);
-- 
 Kirill A. Shutemov
