Commit b22d127a39dd ("mempolicy: fix a race in shared_policy_replace()")
switched the spinlock protecting a shared policy to a mutex so that
sp_alloc() may block.  As a result of commit 32f8516a8c73 ("mm, mempolicy:
fix printing stack contents in numa_maps"), that mutex can now be
inadvertently taken while task_lock(task), a spinlock, is held.

To fix this, grab a reference on every mempolicy returned by
get_vma_policy() so that it cannot be freed while being passed to
mpol_to_str() when reading /proc/pid/numa_maps.  This patch takes
task_lock() only while dereferencing task->mempolicy in get_vma_policy()
to increment its refcount, which guarantees the policy remains in memory
until the reference is dropped by __mpol_put() after mpol_to_str() is
called.  References on shared policies are taken by the vma's
->get_policy() function; all others are taken directly in
get_vma_policy().  Since every returned policy is now referenced, all
callers drop the reference unconditionally.

Tested-by: Dave Jones <davej@xxxxxxxxxx>
Signed-off-by: David Rientjes <rientjes@xxxxxxxxxx>
---
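Note for reviewers: a minimal sketch of the intended caller pattern after
this change (hypothetical caller; the buffer size and surrounding code are
illustrative only, not part of the patch):

	struct mempolicy *pol;
	char buffer[64];

	/*
	 * get_vma_policy() now always returns a referenced policy, so the
	 * lookup needs no task_lock() around it and no conditional
	 * mpol_cond_put() afterwards.
	 */
	pol = get_vma_policy(task, vma, vma->vm_start);
	mpol_to_str(buffer, sizeof(buffer), pol, 0);
	__mpol_put(pol);	/* unconditionally drop the reference */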
 fs/proc/task_mmu.c        |  4 +--
 include/linux/mempolicy.h | 12 +------
 mm/hugetlb.c              |  4 +--
 mm/mempolicy.c            | 79 +++++++++++++++++++--------------------
 4 files changed, 38 insertions(+), 61 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1178,11 +1178,9 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	walk.private = md;
 	walk.mm = mm;
 
-	task_lock(task);
 	pol = get_vma_policy(task, vma, vma->vm_start);
 	mpol_to_str(buffer, sizeof(buffer), pol, 0);
-	mpol_cond_put(pol);
-	task_unlock(task);
+	__mpol_put(pol);
 
 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
 
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -73,13 +73,7 @@ static inline void mpol_put(struct mempolicy *pol)
  */
 static inline int mpol_needs_cond_ref(struct mempolicy *pol)
 {
-	return (pol && (pol->flags & MPOL_F_SHARED));
-}
-
-static inline void mpol_cond_put(struct mempolicy *pol)
-{
-	if (mpol_needs_cond_ref(pol))
-		__mpol_put(pol);
+	return pol->flags & MPOL_F_SHARED;
 }
 
 extern struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
@@ -211,10 +205,6 @@ static inline void mpol_put(struct mempolicy *p)
 {
 }
 
-static inline void mpol_cond_put(struct mempolicy *pol)
-{
-}
-
 static inline struct mempolicy *mpol_cond_copy(struct mempolicy *to,
 					       struct mempolicy *from)
 {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -568,13 +568,13 @@ retry_cpuset:
 		}
 	}
 
-	mpol_cond_put(mpol);
+	__mpol_put(mpol);
 	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
 		goto retry_cpuset;
 	return page;
 
 err:
-	mpol_cond_put(mpol);
+	__mpol_put(mpol);
 	return NULL;
 }
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -906,7 +906,8 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	}
 
 out:
-	mpol_cond_put(pol);
+	if (mpol_needs_cond_ref(pol))
+		__mpol_put(pol);
 	if (vma)
 		up_read(&current->mm->mmap_sem);
 	return err;
@@ -1527,48 +1528,52 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 }
 #endif
 
-
-/*
- * get_vma_policy(@task, @vma, @addr)
- * @task - task for fallback if vma policy == default
- * @vma - virtual memory area whose policy is sought
- * @addr - address in @vma for shared policy lookup
+/**
+ * get_vma_policy() - return effective policy for a vma at specified address
+ * @task: task for fallback if vma policy == default_policy
+ * @vma: virtual memory area whose policy is sought
+ * @addr: address in @vma for shared policy lookup
  *
- * Returns effective policy for a VMA at specified address.
  * Falls back to @task or system default policy, as necessary.
- * Current or other task's task mempolicy and non-shared vma policies must be
- * protected by task_lock(task) by the caller.
- * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
- * count--added by the get_policy() vm_op, as appropriate--to protect against
- * freeing by another task.  It is the caller's responsibility to free the
- * extra reference for shared policies.
+ * Increments the reference count of the returned mempolicy; it is the
+ * caller's responsibility to decrement with __mpol_put().
+ * Requires vma->vm_mm->mmap_sem to be held for vma policies and takes
+ * task_lock(task) for task policy fallback.
  */
 struct mempolicy *get_vma_policy(struct task_struct *task,
 		struct vm_area_struct *vma, unsigned long addr)
 {
-	struct mempolicy *pol = task->mempolicy;
+	struct mempolicy *pol;
+
+	/*
+	 * Grab a reference before task has the potential to exit and free its
+	 * mempolicy.
+	 */
+	task_lock(task);
+	pol = task->mempolicy;
+	mpol_get(pol);
+	task_unlock(task);
 
 	if (vma) {
 		if (vma->vm_ops && vma->vm_ops->get_policy) {
 			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
 									addr);
-			if (vpol)
+			if (vpol) {
+				mpol_put(pol);
 				pol = vpol;
+				if (!mpol_needs_cond_ref(pol))
+					mpol_get(pol);
+			}
 		} else if (vma->vm_policy) {
+			mpol_put(pol);
 			pol = vma->vm_policy;
-
-			/*
-			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
-			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
-			 * count on these policies which will be dropped by
-			 * mpol_cond_put() later
-			 */
-			if (mpol_needs_cond_ref(pol))
-				mpol_get(pol);
+			mpol_get(pol);
 		}
 	}
-	if (!pol)
+	if (!pol) {
 		pol = &default_policy;
+		mpol_get(pol);
+	}
 	return pol;
 }
 
@@ -1919,30 +1924,14 @@ retry_cpuset:
 		unsigned nid;
 
 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
-		mpol_cond_put(pol);
 		page = alloc_page_interleave(gfp, order, nid);
-		if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
-			goto retry_cpuset;
-
-		return page;
+		goto out;
 	}
 	zl = policy_zonelist(gfp, pol, node);
-	if (unlikely(mpol_needs_cond_ref(pol))) {
-		/*
-		 * slow path: ref counted shared policy
-		 */
-		struct page *page = __alloc_pages_nodemask(gfp, order,
-						zl, policy_nodemask(gfp, pol));
-		__mpol_put(pol);
-		if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
-			goto retry_cpuset;
-		return page;
-	}
-	/*
-	 * fast path:  default or task policy
-	 */
 	page = __alloc_pages_nodemask(gfp, order, zl,
 				      policy_nodemask(gfp, pol));
+out:
+	__mpol_put(pol);
 	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
 		goto retry_cpuset;
 	return page;