When setting SEAL_WRITE, we must make sure nobody holds a writable
reference to the pages (via GUP or similar). We currently check the
references and wait some time for them to be dropped. This, however,
might fail for several reasons, including:

 - the page is pinned for longer than we wait
 - while we wait, someone takes an already pinned page for read-access

Therefore, this patch introduces page-isolation. When sealing a file
with SEAL_WRITE, we copy all pages that have an elevated ref-count. The
new page is put in place atomically, and the old page is detached and
left alone. It will be reclaimed once the last external user has
dropped it.

Signed-off-by: David Herrmann <dh.herrmann@xxxxxxxxx>
---
 mm/shmem.c | 218 +++++++++++++++++++++++++++++--------------------------------
 1 file changed, 105 insertions(+), 113 deletions(-)
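
For reference, a minimal userspace sketch of the sealing flow this
patch affects; it assumes the memfd_create()/F_ADD_SEALS API from this
patch series and a libc wrapper for memfd_create() (otherwise use
syscall(__NR_memfd_create, ...)):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        /* sealing must be allowed explicitly at creation time */
        int fd = memfd_create("stable-blob", MFD_ALLOW_SEALING);
        if (fd < 0) {
                perror("memfd_create");
                return 1;
        }

        if (write(fd, "payload", 7) != 7) {
                perror("write");
                return 1;
        }

        /*
         * With page-isolation, sealing no longer has to wait for
         * elevated page refs to drain; pinned pages are copied and
         * replaced instead.
         */
        if (fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE) < 0) {
                perror("F_ADD_SEALS");
                return 1;
        }

        /* any further write now fails with EPERM */
        if (write(fd, "x", 1) < 0)
                perror("write after seal (expected)");

        close(fd);
        return 0;
}

The threshold used below, page_count() - page_mapcount() <= 2, accounts
for the page-cache reference plus the reference taken by the page-vector
lookup itself; a higher count indicates an external pin (e.g. via
get_user_pages()).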
diff --git a/mm/shmem.c b/mm/shmem.c
index ddc3998..34b14fb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1237,6 +1237,110 @@ unlock:
         return error;
 }
 
+static int shmem_isolate_page(struct inode *inode, struct page *oldpage)
+{
+        struct address_space *mapping = inode->i_mapping;
+        struct shmem_inode_info *info = SHMEM_I(inode);
+        struct page *newpage;
+        int error;
+
+        if (oldpage->mapping != mapping)
+                return 0;
+        if (page_count(oldpage) - page_mapcount(oldpage) <= 2)
+                return 0;
+
+        if (page_mapped(oldpage))
+                unmap_mapping_range(mapping,
+                                    (loff_t)oldpage->index << PAGE_CACHE_SHIFT,
+                                    PAGE_CACHE_SIZE, 0);
+
+        VM_BUG_ON_PAGE(PageWriteback(oldpage), oldpage);
+        VM_BUG_ON_PAGE(page_has_private(oldpage), oldpage);
+
+        newpage = shmem_alloc_page(mapping_gfp_mask(mapping), info,
+                                   oldpage->index);
+        if (!newpage)
+                return -ENOMEM;
+
+        __set_page_locked(newpage);
+        copy_highpage(newpage, oldpage);
+        flush_dcache_page(newpage);
+
+        page_cache_get(newpage);
+        SetPageUptodate(newpage);
+        SetPageSwapBacked(newpage);
+        newpage->mapping = mapping;
+        newpage->index = oldpage->index;
+
+        cancel_dirty_page(oldpage, PAGE_CACHE_SIZE);
+
+        spin_lock_irq(&mapping->tree_lock);
+        error = shmem_radix_tree_replace(mapping, oldpage->index,
+                                         oldpage, newpage);
+        if (!error) {
+                __inc_zone_page_state(newpage, NR_FILE_PAGES);
+                __dec_zone_page_state(oldpage, NR_FILE_PAGES);
+        }
+        spin_unlock_irq(&mapping->tree_lock);
+
+        if (error) {
+                newpage->mapping = NULL;
+                unlock_page(newpage);
+                page_cache_release(newpage);
+                page_cache_release(newpage);
+                return error;
+        }
+
+        mem_cgroup_replace_page_cache(oldpage, newpage);
+        lru_cache_add_anon(newpage);
+
+        oldpage->mapping = NULL;
+        page_cache_release(oldpage);
+        unlock_page(newpage);
+        page_cache_release(newpage);
+
+        return 1;
+}
+
+static int shmem_isolate_pins(struct inode *inode)
+{
+        struct address_space *mapping = inode->i_mapping;
+        struct pagevec pvec;
+        pgoff_t indices[PAGEVEC_SIZE];
+        pgoff_t index;
+        int i, ret, error;
+
+        pagevec_init(&pvec, 0);
+        index = 0;
+        error = 0;
+        while ((pvec.nr = find_get_entries(mapping, index, PAGEVEC_SIZE,
+                                           pvec.pages, indices))) {
+                for (i = 0; i < pagevec_count(&pvec); i++) {
+                        struct page *page = pvec.pages[i];
+
+                        index = indices[i];
+                        if (radix_tree_exceptional_entry(page))
+                                continue;
+                        if (page->mapping != mapping)
+                                continue;
+                        if (page_count(page) - page_mapcount(page) <= 2)
+                                continue;
+
+                        lock_page(page);
+                        ret = shmem_isolate_page(inode, page);
+                        if (ret < 0)
+                                error = ret;
+                        unlock_page(page);
+                }
+                pagevec_remove_exceptionals(&pvec);
+                pagevec_release(&pvec);
+                cond_resched();
+                index++;
+        }
+
+        return error;
+}
+
 static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
         struct inode *inode = file_inode(vma->vm_file);
@@ -1734,118 +1838,6 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
         return offset;
 }
 
-/*
- * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
- * so reuse a tag which we firmly believe is never set or cleared on shmem.
- */
-#define SHMEM_TAG_PINNED        PAGECACHE_TAG_TOWRITE
-#define LAST_SCAN               4       /* about 150ms max */
-
-static void shmem_tag_pins(struct address_space *mapping)
-{
-        struct radix_tree_iter iter;
-        void **slot;
-        pgoff_t start;
-        struct page *page;
-
-        start = 0;
-        rcu_read_lock();
-
-restart:
-        radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
-                page = radix_tree_deref_slot(slot);
-                if (!page || radix_tree_exception(page)) {
-                        if (radix_tree_deref_retry(page))
-                                goto restart;
-                } else if (page_count(page) - page_mapcount(page) > 1) {
-                        spin_lock_irq(&mapping->tree_lock);
-                        radix_tree_tag_set(&mapping->page_tree, iter.index,
-                                           SHMEM_TAG_PINNED);
-                        spin_unlock_irq(&mapping->tree_lock);
-                }
-
-                if (need_resched()) {
-                        cond_resched_rcu();
-                        start = iter.index + 1;
-                        goto restart;
-                }
-        }
-        rcu_read_unlock();
-}
-
-/*
- * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
- * via get_user_pages(), drivers might have some pending I/O without any active
- * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
- * and see whether it has an elevated ref-count. If so, we tag them and wait for
- * them to be dropped.
- * The caller must guarantee that no new user will acquire writable references
- * to those pages to avoid races.
- */
-static int shmem_wait_for_pins(struct address_space *mapping)
-{
-        struct radix_tree_iter iter;
-        void **slot;
-        pgoff_t start;
-        struct page *page;
-        int error, scan;
-
-        shmem_tag_pins(mapping);
-
-        error = 0;
-        for (scan = 0; scan <= LAST_SCAN; scan++) {
-                if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED))
-                        break;
-
-                if (!scan)
-                        lru_add_drain_all();
-                else if (schedule_timeout_killable((HZ << scan) / 200))
-                        scan = LAST_SCAN;
-
-                start = 0;
-                rcu_read_lock();
-restart:
-                radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
-                                           start, SHMEM_TAG_PINNED) {
-
-                        page = radix_tree_deref_slot(slot);
-                        if (radix_tree_exception(page)) {
-                                if (radix_tree_deref_retry(page))
-                                        goto restart;
-
-                                page = NULL;
-                        }
-
-                        if (page &&
-                            page_count(page) - page_mapcount(page) != 1) {
-                                if (scan < LAST_SCAN)
-                                        goto continue_resched;
-
-                                /*
-                                 * On the last scan, we clean up all those tags
-                                 * we inserted; but make a note that we still
-                                 * found pages pinned.
-                                 */
-                                error = -EBUSY;
-                        }
-
-                        spin_lock_irq(&mapping->tree_lock);
-                        radix_tree_tag_clear(&mapping->page_tree,
-                                             iter.index, SHMEM_TAG_PINNED);
-                        spin_unlock_irq(&mapping->tree_lock);
-continue_resched:
-                        if (need_resched()) {
-                                cond_resched_rcu();
-                                start = iter.index + 1;
-                                goto restart;
-                        }
-                }
-                rcu_read_unlock();
-        }
-
-        return error;
-}
-
 #define F_ALL_SEALS (F_SEAL_SEAL | \
                      F_SEAL_SHRINK | \
                      F_SEAL_GROW | \
@@ -1907,7 +1899,7 @@ int shmem_add_seals(struct file *file, unsigned int seals)
                 if (error)
                         goto unlock;
 
-                error = shmem_wait_for_pins(file->f_mapping);
+                error = shmem_isolate_pins(inode);
                 if (error) {
                         mapping_allow_writable(file->f_mapping);
                         goto unlock;
-- 
2.0.0