The purpose of this series: Waiman Long reported 'pgd_lock' contention
on high CPU count systems, and proposed moving pgd_lock to a separate
cacheline to eliminate false sharing and to reduce some of the lock
bouncing overhead. I think we can do much better: this series
eliminates the pgd_list and makes pgd_alloc()/pgd_free() lockless.

This is the -v2 submission. It addresses all feedback received so far,
fixes bugs and cleans up details. There's a v1->v2 interdiff attached
further below: most of the changes relate to locking the task before
looking at tsk->mm.

The series is also available in -tip:master for easy testing:

  git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git master

It's not applied to any visible topic tree yet. (Its sha1 is
bc40a788fc63, if you want to test it without pending non-x86 bits
in -tip.)

Thanks,

	Ingo

====================>

Ingo Molnar (12):
  x86/mm/pat: Don't free PGD entries on memory unmap
  x86/mm/hotplug: Remove pgd_list use from the memory hotplug code
  x86/mm/hotplug: Don't remove PGD entries in remove_pagetable()
  x86/mm/hotplug: Simplify sync_global_pgds()
  mm: Introduce arch_pgd_init_late()
  x86/mm: Enable and use the arch_pgd_init_late() method
  x86/virt/guest/xen: Remove use of pgd_list from the Xen guest code
  x86/mm: Remove pgd_list use from vmalloc_sync_all()
  x86/mm/pat/32: Remove pgd_list use from the PAT code
  x86/mm: Make pgd_alloc()/pgd_free() lockless
  x86/mm: Remove pgd_list leftovers
  x86/mm: Simplify pgd_alloc()

 arch/Kconfig                      |   9 ++++
 arch/x86/Kconfig                  |   1 +
 arch/x86/include/asm/pgtable.h    |   3 --
 arch/x86/include/asm/pgtable_64.h |   3 +-
 arch/x86/mm/fault.c               |  31 +++++++++-----
 arch/x86/mm/init_64.c             |  80 ++++++++++++-----------------
 arch/x86/mm/pageattr.c            |  41 +++++++++---------
 arch/x86/mm/pgtable.c             | 131 ++++++++++++++++++++++++++++++----------------------------
 arch/x86/xen/mmu.c                |  51 +++++++++++++++++------
 fs/exec.c                         |   3 ++
 include/linux/mm.h                |   6 +++
 kernel/fork.c                     |  16 +++++++
 12 files changed, 209 insertions(+), 166 deletions(-)
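( To make the interdiff below easier to review, here is a minimal
  sketch of the per-task locking pattern that all the converted sites
  now follow. The walk_task_pgds()/visit() names are made up purely
  for illustration and pgd_lock is x86's; the real loops are
  open-coded in the hunks further down: )

	#include <linux/sched.h>	/* task_lock(), for_each_process_thread() */
	#include <linux/spinlock.h>

	/* Sketch only -- not part of the series: */
	static void walk_task_pgds(void (*visit)(struct mm_struct *mm))
	{
		struct task_struct *g, *p;

		/* Taking pgd_lock also implies rcu_read_lock() for the task list: */
		spin_lock(&pgd_lock);

		for_each_process_thread(g, p) {
			struct mm_struct *mm;

			/* task_lock() stabilizes p->mm against exit_mm()/exec_mmap(): */
			task_lock(p);
			mm = p->mm;
			if (mm)
				visit(mm);	/* mm->pgd cannot go away here */
			task_unlock(p);
		}

		spin_unlock(&pgd_lock);
	}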
-v1 => -v2 interdiff:

====
 arch/x86/mm/fault.c    | 17 +++++++++++------
 arch/x86/mm/init_64.c  | 23 ++++++++++++-----------
 arch/x86/mm/pageattr.c | 27 ++++++++++++++------------
 arch/x86/mm/pgtable.c  | 22 ++++++++++++----------
 arch/x86/xen/mmu.c     | 47 +++++++++++++++++++++++++---------------------
 fs/exec.c              |  2 +-
 include/linux/mm.h     |  4 ++--
 kernel/fork.c          |  4 ++--
 8 files changed, 79 insertions(+), 67 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index bebdd97f888b..14b39c3511fd 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -237,28 +237,33 @@ void vmalloc_sync_all(void)
 		struct task_struct *g, *p;
 
-		rcu_read_lock();
-		spin_lock(&pgd_lock);
+		spin_lock(&pgd_lock); /* Implies rcu_read_lock(): */
 
 		for_each_process_thread(g, p) {
+			struct mm_struct *mm;
 			spinlock_t *pgt_lock;
 			pmd_t *pmd_ret;
 
-			if (!p->mm)
+			task_lock(p);
+			mm = p->mm;
+			if (!mm) {
+				task_unlock(p);
 				continue;
+			}
 
 			/* The pgt_lock is only used on Xen: */
-			pgt_lock = &p->mm->page_table_lock;
+			pgt_lock = &mm->page_table_lock;
 
 			spin_lock(pgt_lock);
-			pmd_ret = vmalloc_sync_one(p->mm->pgd, address);
+			pmd_ret = vmalloc_sync_one(mm->pgd, address);
 			spin_unlock(pgt_lock);
 
+			task_unlock(p);
+
 			if (!pmd_ret)
 				break;
 		}
 
 		spin_unlock(&pgd_lock);
-		rcu_read_unlock();
 	}
 }
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 730560c4873e..dcb2f45caf0e 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -171,28 +171,28 @@ void sync_global_pgds(unsigned long start, unsigned long end)
 		const pgd_t *pgd_ref = pgd_offset_k(address);
 		struct task_struct *g, *p;
 
-		/*
-		 * When this function is called after memory hot remove,
-		 * pgd_none() already returns true, but only the reference
-		 * kernel PGD has been cleared, not the process PGDs.
-		 *
-		 * So clear the affected entries in every process PGD as well:
-		 */
+		/* Only sync (potentially) newly added PGD entries: */
 		if (pgd_none(*pgd_ref))
 			continue;
 
-		spin_lock(&pgd_lock);
+		spin_lock(&pgd_lock); /* Implies rcu_read_lock() for the task list iteration: */
 
 		for_each_process_thread(g, p) {
+			struct mm_struct *mm;
 			pgd_t *pgd;
 			spinlock_t *pgt_lock;
 
-			if (!p->mm)
+			task_lock(p);
+			mm = p->mm;
+			if (!mm) {
+				task_unlock(p);
 				continue;
-			pgd = p->mm->pgd;
+			}
+
+			pgd = mm->pgd;
 
 			/* The pgt_lock is only used by Xen: */
-			pgt_lock = &p->mm->page_table_lock;
+			pgt_lock = &mm->page_table_lock;
 
 			spin_lock(pgt_lock);
 
 			if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
@@ -202,6 +202,7 @@ void sync_global_pgds(unsigned long start, unsigned long end)
 				set_pgd(pgd, *pgd_ref);
 
 			spin_unlock(pgt_lock);
+			task_unlock(p);
 		}
 		spin_unlock(&pgd_lock);
 	}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 93c134fdb398..4ff6a1808f1d 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -440,29 +440,30 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 	if (!SHARED_KERNEL_PMD) {
 		struct task_struct *g, *p;
 
-		rcu_read_lock();
+		/* We are holding pgd_lock, which implies rcu_read_lock(): */
 
 		for_each_process_thread(g, p) {
+			struct mm_struct *mm;
 			spinlock_t *pgt_lock;
 			pgd_t *pgd;
 			pud_t *pud;
 			pmd_t *pmd;
 
-			if (!p->mm)
-				continue;
+			task_lock(p);
+			mm = p->mm;
+			if (mm) {
+				pgt_lock = &mm->page_table_lock;
+				spin_lock(pgt_lock);
 
-			pgt_lock = &p->mm->page_table_lock;
-			spin_lock(pgt_lock);
+				pgd = mm->pgd + pgd_index(address);
+				pud = pud_offset(pgd, address);
+				pmd = pmd_offset(pud, address);
+				set_pte_atomic((pte_t *)pmd, pte);
 
-			pgd = p->mm->pgd + pgd_index(address);
-			pud = pud_offset(pgd, address);
-			pmd = pmd_offset(pud, address);
-			set_pte_atomic((pte_t *)pmd, pte);
-
-			spin_unlock(pgt_lock);
+				spin_unlock(pgt_lock);
+			}
+			task_unlock(p);
 		}
-
-		rcu_read_unlock();
 	}
 #endif
 }
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index b1bd35f452ef..d7d341e57e33 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -348,15 +348,17 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
  * at the highest, PGD level. Thus any other task extending it
  * will first update the reference PGD, then modify the task PGDs.
  */
-void arch_pgd_init_late(struct mm_struct *mm, pgd_t *pgd)
+void arch_pgd_init_late(struct mm_struct *mm)
 {
 	/*
-	 * This is called after a new MM has been made visible
-	 * in fork() or exec().
+	 * This function is called after a new MM has been made visible
+	 * in fork() or exec() via:
+	 *
+	 *	tsk->mm = mm;
 	 *
 	 * This barrier makes sure the MM is visible to new RCU
-	 * walkers before we initialize it, so that we don't miss
-	 * updates:
+	 * walkers before we initialize the pagetables below, so that
+	 * we don't miss updates:
 	 */
 	smp_wmb();
 
@@ -365,12 +367,12 @@ void arch_pgd_init_late(struct mm_struct *mm, pgd_t *pgd)
 	 * ptes in non-PAE, or shared PMD in PAE), then just copy the
 	 * references from swapper_pg_dir:
 	 */
-	if (CONFIG_PGTABLE_LEVELS == 2 ||
+	if ( CONFIG_PGTABLE_LEVELS == 2 ||
 	    (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
-	    CONFIG_PGTABLE_LEVELS == 4) {
+	     CONFIG_PGTABLE_LEVELS == 4) {
 
 		pgd_t *pgd_src = swapper_pg_dir + KERNEL_PGD_BOUNDARY;
-		pgd_t *pgd_dst = pgd + KERNEL_PGD_BOUNDARY;
+		pgd_t *pgd_dst = mm->pgd + KERNEL_PGD_BOUNDARY;
 		int i;
 
 		for (i = 0; i < KERNEL_PGD_PTRS; i++, pgd_src++, pgd_dst++) {
@@ -387,8 +389,8 @@ void arch_pgd_init_late(struct mm_struct *mm, pgd_t *pgd)
 			 * it won't change from under us. Parallel updates (new
 			 * allocations) will modify our (already visible) PGD:
 			 */
-			if (pgd_val(*pgd_src))
-				WRITE_ONCE(*pgd_dst, *pgd_src);
+			if (!pgd_none(*pgd_src))
+				set_pgd(pgd_dst, *pgd_src);
 		}
 	}
 }
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 87a8354435f8..70a3df5b0b54 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -855,27 +855,28 @@ void xen_mm_pin_all(void)
 {
 	struct task_struct *g, *p;
 
-	rcu_read_lock();
-	spin_lock(&pgd_lock);
+	spin_lock(&pgd_lock); /* Implies rcu_read_lock() for the task list iteration: */
 
 	for_each_process_thread(g, p) {
+		struct mm_struct *mm;
 		struct page *page;
 		pgd_t *pgd;
 
-		if (!p->mm)
-			continue;
-
-		pgd = p->mm->pgd;
-		page = virt_to_page(pgd);
+		task_lock(p);
+		mm = p->mm;
+		if (mm) {
+			pgd = mm->pgd;
+			page = virt_to_page(pgd);
 
-		if (!PagePinned(page)) {
-			__xen_pgd_pin(&init_mm, pgd);
-			SetPageSavePinned(page);
+			if (!PagePinned(page)) {
+				__xen_pgd_pin(&init_mm, pgd);
+				SetPageSavePinned(page);
+			}
 		}
+		task_unlock(p);
 	}
 
 	spin_unlock(&pgd_lock);
-	rcu_read_unlock();
 }
 
 /*
@@ -980,24 +981,26 @@ void xen_mm_unpin_all(void)
 {
 	struct task_struct *g, *p;
 
-	rcu_read_lock();
-	spin_lock(&pgd_lock);
+	spin_lock(&pgd_lock); /* Implies rcu_read_lock() for the task list iteration: */
 
 	for_each_process_thread(g, p) {
+		struct mm_struct *mm;
 		struct page *page;
 		pgd_t *pgd;
 
-		if (!p->mm)
-			continue;
-
-		pgd = p->mm->pgd;
-		page = virt_to_page(pgd);
+		task_lock(p);
+		mm = p->mm;
+		if (mm) {
+			pgd = mm->pgd;
+			page = virt_to_page(pgd);
 
-		if (PageSavePinned(page)) {
-			BUG_ON(!PagePinned(page));
-			__xen_pgd_unpin(&init_mm, pgd);
-			ClearPageSavePinned(page);
+			if (PageSavePinned(page)) {
+				BUG_ON(!PagePinned(page));
+				__xen_pgd_unpin(&init_mm, pgd);
+				ClearPageSavePinned(page);
+			}
 		}
+		task_unlock(p);
 	}
 
 	spin_unlock(&pgd_lock);
diff --git a/fs/exec.c b/fs/exec.c
index c1d213c64fda..4ce1383d5bba 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -862,7 +862,7 @@ static int exec_mmap(struct mm_struct *mm)
 	active_mm = tsk->active_mm;
 	tsk->mm = mm;
-	arch_pgd_init_late(mm, mm->pgd);
+	arch_pgd_init_late(mm);
 	tsk->active_mm = mm;
 	activate_mm(active_mm, mm);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 35d887d2b038..a3edc839e431 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1135,9 +1135,9 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 			void *buf, int len, int write);
 
 #ifdef CONFIG_ARCH_HAS_PGD_INIT_LATE
-void arch_pgd_init_late(struct mm_struct *mm, pgd_t *pgd);
+void arch_pgd_init_late(struct mm_struct *mm);
 #else
-static inline void arch_pgd_init_late(struct mm_struct *mm, pgd_t *pgd) { }
+static inline void arch_pgd_init_late(struct mm_struct *mm) { }
 #endif
 
 static inline void unmap_shared_mapping_range(struct address_space *mapping,
diff --git a/kernel/fork.c b/kernel/fork.c
index 1f83ceca6c6c..cfa84971fb52 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1605,8 +1605,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 * NOTE: any user-space parts of the PGD are already initialized
 	 * and must not be clobbered.
 	 */
-	if (p->mm != current->mm)
-		arch_pgd_init_late(p->mm, p->mm->pgd);
+	if (!(clone_flags & CLONE_VM))
+		arch_pgd_init_late(p->mm);
 
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
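
( Finally, a condensed view of the ordering that the interdiff relies
  on. publish_mm_then_init() is an illustrative name only; the real
  call sites are exec_mmap() and copy_process(), as in the hunks
  above: )

	#include <linux/mm.h>		/* arch_pgd_init_late() */
	#include <linux/sched.h>

	/* Sketch only -- the writer side, condensed from fork()/exec(): */
	static void publish_mm_then_init(struct task_struct *tsk,
					 struct mm_struct *mm)
	{
		/* 1) Publish the new MM, so the pgd_lock+task_lock walkers see it: */
		tsk->mm = mm;

		/*
		 * 2) Sync the kernel part of the PGD from swapper_pg_dir.
		 * arch_pgd_init_late() begins with smp_wmb(), so a racing
		 * sync_global_pgds() either already sees this MM and updates
		 * its PGD, or the late init copies the already updated
		 * reference entry -- no kernel PGD update is missed either way.
		 */
		arch_pgd_init_late(mm);
	}

( In copy_process() the call is skipped for CLONE_VM children: a thread
  shares an already initialized MM, so only a genuinely new MM needs
  the late PGD init. )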