vma_adjust() updates anon VMA information without taking any locks. In contrast, file-backed mappings use the i_mmap_lock. This lack of locking for anon VMAs can result in races with users of rmap_walk such as page migration. vma_address() can return -EFAULT for an address that will soon be valid. For migration, this potentially leaves a dangling migration PTE behind which can later cause a BUG_ON to trigger when the page is faulted in. With the recent anon_vma changes, there can be more than one anon_vma->lock to take in an anon_vma_chain, but a second lock cannot be spun on because that could deadlock. The rmap walker tries to take the locks of the different anon_vmas, but if an attempt fails, the locks are released and the operation is restarted. For vma_adjust(), the locking behaviour prior to the anon_vma changes is restored so that rmap_walk() can be sure of the integrity of the VMA information and lists when the anon_vma lock is held. With this patch, the vma->anon_vma->lock is taken if: a) there is any overlap with the next VMA due to the adjustment; b) a new VMA is being inserted into the address space; or c) the start of the VMA is being changed, so that the relationship between vm_start and vm_pgoff is preserved for vma_address(). Signed-off-by: Mel Gorman <mel@xxxxxxxxx> --- mm/ksm.c | 22 ++++++++++++++++++++-- mm/mmap.c | 9 +++++++++ mm/rmap.c | 28 +++++++++++++++++++++++----- 3 files changed, 52 insertions(+), 7 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 3666d43..0c09927 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1668,15 +1668,28 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, again: hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; + struct anon_vma *locked_vma; struct anon_vma_chain *vmac; struct vm_area_struct *vma; spin_lock(&anon_vma->lock); list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { vma = vmac->vma; + + /* See comment in mm/rmap.c#rmap_walk_anon on
locking */ + locked_vma = NULL; + if (anon_vma != vma->anon_vma) { + locked_vma = vma->anon_vma; + if (!spin_trylock(&locked_vma->lock)) { + spin_unlock(&anon_vma->lock); + goto again; + } + } + if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) - continue; + goto next_vma; + /* * Initially we examine only the vma which covers this * rmap_item; but later, if there is still work to do, @@ -1684,9 +1697,14 @@ again: * were forked from the original since ksmd passed. */ if ((rmap_item->mm == vma->vm_mm) == search_new_forks) - continue; + goto next_vma; ret = rmap_one(page, vma, rmap_item->address, arg); + +next_vma: + if (locked_vma) + spin_unlock(&locked_vma->lock); + if (ret != SWAP_AGAIN) { spin_unlock(&anon_vma->lock); goto out; diff --git a/mm/mmap.c b/mm/mmap.c index f90ea92..d635132 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -505,6 +505,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, struct vm_area_struct *next = vma->vm_next; struct vm_area_struct *importer = NULL; struct address_space *mapping = NULL; + struct anon_vma *anon_vma = NULL; struct prio_tree_root *root = NULL; struct file *file = vma->vm_file; long adjust_next = 0; @@ -578,6 +579,11 @@ again: remove_next = 1 + (end > next->vm_end); } } + if (vma->anon_vma && (insert || importer || start != vma->vm_start)) { + anon_vma = vma->anon_vma; + spin_lock(&anon_vma->lock); + } + if (root) { flush_dcache_mmap_lock(mapping); vma_prio_tree_remove(vma, root); @@ -620,6 +626,9 @@ again: remove_next = 1 + (end > next->vm_end); if (mapping) spin_unlock(&mapping->i_mmap_lock); + if (anon_vma) + spin_unlock(&anon_vma->lock); + if (remove_next) { if (file) { fput(file); diff --git a/mm/rmap.c b/mm/rmap.c index 85f203e..f7ed89f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1358,7 +1358,7 @@ int try_to_munlock(struct page *page) static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, struct vm_area_struct *, unsigned long, void *), void *arg) { - struct 
anon_vma *anon_vma; + struct anon_vma *anon_vma, *locked_vma; struct anon_vma_chain *avc; int ret = SWAP_AGAIN; @@ -1368,16 +1368,34 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, * are holding mmap_sem. Users without mmap_sem are required to * take a reference count to prevent the anon_vma disappearing */ +retry: anon_vma = page_anon_vma(page); if (!anon_vma) return ret; spin_lock(&anon_vma->lock); list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { struct vm_area_struct *vma = avc->vma; - unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; - ret = rmap_one(page, vma, address, arg); + unsigned long address; + + /* + * Guard against deadlocks by not spinning against + * vma->anon_vma->lock. On contention release and retry + */ + locked_vma = NULL; + if (anon_vma != vma->anon_vma) { + locked_vma = vma->anon_vma; + if (!spin_trylock(&locked_vma->lock)) { + spin_unlock(&anon_vma->lock); + goto retry; + } + } + address = vma_address(page, vma); + if (address != -EFAULT) + ret = rmap_one(page, vma, address, arg); + + if (locked_vma) + spin_unlock(&locked_vma->lock); + if (ret != SWAP_AGAIN) break; } -- 1.6.5 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxxx For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>