[RFC PATCH 1/2] placeholder pages

Introduce a placeholder page for the radix tree.  mm/filemap.c is
changed to wait for any placeholder at a given offset to go away before
adding a page to the page cache at that offset, and
truncate_inode_pages_range() is changed to wait for all placeholder
pages in the range to disappear before returning.

Placeholder pages may only be tested or examined with the mapping's
tree_lock held, and only page->flags can be trusted.  They cannot be
locked, and their reference counts must not be increased or decreased.
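
For reference, here is a rough caller-side sketch of the intended use.
The real user is in patch 2/2; the dummy page, the helper names and the
GFP_NOFS choice below are illustrative assumptions, not part of this
patch:

	#include <linux/mm.h>
	#include <linux/pagemap.h>

	/* dummy page: never allocated, never on the LRUs, flags only */
	static struct page range_placeholder;

	static int hold_slot(struct address_space *mapping, pgoff_t index)
	{
		struct page *page;

		SetPagePlaceHolder(&range_placeholder);
		page = find_or_insert_page(mapping, index, GFP_NOFS,
					   &range_placeholder);
		if (!page)
			return -ENOMEM;	/* radix tree allocation failed */
		if (page != &range_placeholder) {
			/* a real page was already cached at this index */
			unlock_page(page);
			page_cache_release(page);
			return -EEXIST;
		}
		return 0;	/* the slot is now held by the placeholder */
	}

	static void release_slot(struct address_space *mapping, pgoff_t index)
	{
		/* drop the slot, then wake waiters in add_to_page_cache() */
		remove_placeholder_page(mapping, &range_placeholder, index);
		ClearPagePlaceHolder(&range_placeholder);
		wake_up_placeholder_page(&range_placeholder);
	}

Anyone who races in through add_to_page_cache() or find_lock_page()
sleeps on the placeholder's wait queue until release_slot() deletes it
from the tree and issues the wake up.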

Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxx>

diff -r 18a9e9f5c707 include/linux/mm.h
--- a/include/linux/mm.h	Thu Oct 19 08:30:00 2006 +0700
+++ b/include/linux/mm.h	Fri Oct 20 12:38:24 2006 -0400
@@ -276,6 +276,7 @@ static inline void get_page(struct page 
 	if (unlikely(PageCompound(page)))
 		page = (struct page *)page_private(page);
 	VM_BUG_ON(atomic_read(&page->_count) == 0);
+	VM_BUG_ON(PagePlaceHolder(page));
 	atomic_inc(&page->_count);
 }
 
diff -r 18a9e9f5c707 include/linux/page-flags.h
--- a/include/linux/page-flags.h	Thu Oct 19 08:30:00 2006 +0700
+++ b/include/linux/page-flags.h	Fri Oct 20 12:46:03 2006 -0400
@@ -90,6 +90,7 @@
 #define PG_reclaim		17	/* To be reclaimed asap */
 #define PG_nosave_free		18	/* Used for system suspend/resume */
 #define PG_buddy		19	/* Page is free, on buddy lists */
+#define PG_placeholder		20	/* An invalid page holding a slot */
 
 
 #if (BITS_PER_LONG > 32)
@@ -251,6 +252,10 @@ static inline void SetPageUptodate(struc
 #define SetPageUncached(page)	set_bit(PG_uncached, &(page)->flags)
 #define ClearPageUncached(page)	clear_bit(PG_uncached, &(page)->flags)
 
+#define PagePlaceHolder(page)	   test_bit(PG_placeholder, &(page)->flags)
+#define SetPagePlaceHolder(page)   set_bit(PG_placeholder, &(page)->flags)
+#define ClearPagePlaceHolder(page) clear_bit(PG_placeholder, &(page)->flags)
+
 struct page;	/* forward declaration */
 
 int test_clear_page_dirty(struct page *page);
diff -r 18a9e9f5c707 include/linux/pagemap.h
--- a/include/linux/pagemap.h	Thu Oct 19 08:30:00 2006 +0700
+++ b/include/linux/pagemap.h	Fri Oct 20 12:38:24 2006 -0400
@@ -72,6 +72,9 @@ extern struct page * find_get_page(struc
 				unsigned long index);
 extern struct page * find_lock_page(struct address_space *mapping,
 				unsigned long index);
+extern struct page *find_or_insert_page(struct address_space *mapping,
+					unsigned long index, gfp_t gfp_mask,
+					struct page *insert);
 extern __deprecated_for_modules struct page * find_trylock_page(
 			struct address_space *mapping, unsigned long index);
 extern struct page * find_or_create_page(struct address_space *mapping,
@@ -82,6 +85,12 @@ unsigned find_get_pages_contig(struct ad
 			       unsigned int nr_pages, struct page **pages);
 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 			int tag, unsigned int nr_pages, struct page **pages);
+void remove_placeholder_page(struct address_space *mapping, struct page *expected,
+		       unsigned long off);
+void wake_up_placeholder_page(struct page *page);
+void wait_on_placeholder_pages_range(struct address_space *mapping, pgoff_t start,
+			       pgoff_t end);
+
 
 /*
  * Returns locked page at given index in given cache, creating it if needed.
diff -r 18a9e9f5c707 mm/filemap.c
--- a/mm/filemap.c	Thu Oct 19 08:30:00 2006 +0700
+++ b/mm/filemap.c	Fri Oct 20 13:46:29 2006 -0400
@@ -44,6 +44,9 @@ generic_file_direct_IO(int rw, struct ki
 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	loff_t offset, unsigned long nr_segs);
 
+static void wait_on_placeholder_page(struct address_space *mapping,
+			       struct page *page, unsigned long offset);
+
 /*
  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  * though.
@@ -437,12 +440,24 @@ int add_to_page_cache(struct page *page,
 int add_to_page_cache(struct page *page, struct address_space *mapping,
 		pgoff_t offset, gfp_t gfp_mask)
 {
-	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+	int error;
+again:
+	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 
 	if (error == 0) {
 		write_lock_irq(&mapping->tree_lock);
 		error = radix_tree_insert(&mapping->page_tree, offset, page);
-		if (!error) {
+		if (error == -EEXIST && (gfp_mask & __GFP_WAIT)) {
+			struct page *tmp;
+			tmp = radix_tree_lookup(&mapping->page_tree, offset);
+			if (tmp && PagePlaceHolder(tmp)) {
+				write_unlock_irq(&mapping->tree_lock);
+				radix_tree_preload_end();
+				wait_on_placeholder_page(mapping, tmp, offset);
+				goto again;
+			}
+		}
+		if (!error && !PagePlaceHolder(page)) {
 			page_cache_get(page);
 			SetPageLocked(page);
 			page->mapping = mapping;
@@ -526,6 +541,76 @@ void fastcall wait_on_page_bit(struct pa
 }
 EXPORT_SYMBOL(wait_on_page_bit);
 
+static void wait_on_placeholder_page(struct address_space *mapping,
+			       struct page *page, unsigned long offset)
+{
+	DEFINE_WAIT(wait);
+	wait_queue_head_t *wqh = page_waitqueue(page);
+	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+	read_lock_irq(&mapping->tree_lock);
+	page = radix_tree_lookup(&mapping->page_tree, offset);
+	if (page && PagePlaceHolder(page)) {
+		read_unlock_irq(&mapping->tree_lock);
+		io_schedule();
+	} else
+		read_unlock_irq(&mapping->tree_lock);
+	finish_wait(wqh, &wait);
+}
+
+void wake_up_placeholder_page(struct page *page)
+{
+	wake_up(page_waitqueue(page));
+}
+EXPORT_SYMBOL(wake_up_placeholder_page);
+
+/**
+ * wait_on_placeholder_pages_range - gang placeholder page waiter
+ * @mapping:	The address_space to search
+ * @start:	The starting page index
+ * @end:	The max page index
+ *
+ * wait_on_placeholder_pages_range() will search for and wait on any
+ * placeholder pages in the given range of the mapping.
+ *
+ * On return, the range has no placeholder pages sitting in it.
+ */
+void wait_on_placeholder_pages_range(struct address_space *mapping,
+			       pgoff_t start, pgoff_t end)
+{
+	unsigned int i;
+	unsigned int ret;
+	struct page *pages[8];
+	pgoff_t cur = start;
+	pgoff_t highest = start;
+	DEFINE_WAIT(wait);
+
+	/*
+	 * we expect a very small number of place holder pages, so
+	 * this code isn't trying to be very fast.
+	 */
+again:
+	read_lock_irq(&mapping->tree_lock);
+	ret = radix_tree_gang_lookup(&mapping->page_tree,
+				(void **)pages, cur, ARRAY_SIZE(pages));
+	for (i = 0; i < ret; i++) {
+		if (PagePlaceHolder(pages[i])) {
+			wait_queue_head_t *wqh = page_waitqueue(pages[i]);
+			prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+			read_unlock_irq(&mapping->tree_lock);
+			io_schedule();
+			finish_wait(wqh, &wait);
+			goto again;
+		} else if (pages[i]->index > highest)
+			highest = pages[i]->index;
+	}
+	read_unlock_irq(&mapping->tree_lock);
+	if (highest < end && ret == ARRAY_SIZE(pages)) {
+		cur = highest;
+		goto again;
+	}
+}
+EXPORT_SYMBOL(wait_on_placeholder_pages_range);
+
 /**
  * unlock_page - unlock a locked page
  * @page: the page
@@ -542,6 +627,7 @@ EXPORT_SYMBOL(wait_on_page_bit);
  */
 void fastcall unlock_page(struct page *page)
 {
+	BUG_ON(PagePlaceHolder(page));
 	smp_mb__before_clear_bit();
 	if (!TestClearPageLocked(page))
 		BUG();
@@ -578,6 +664,7 @@ void fastcall __lock_page(struct page *p
 {
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
+	BUG_ON(PagePlaceHolder(page));
 	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
 							TASK_UNINTERRUPTIBLE);
 }
@@ -590,6 +677,7 @@ void fastcall __lock_page_nosync(struct 
 void fastcall __lock_page_nosync(struct page *page)
 {
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+	BUG_ON(PagePlaceHolder(page));
 	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
 							TASK_UNINTERRUPTIBLE);
 }
@@ -608,12 +696,66 @@ struct page * find_get_page(struct addre
 
 	read_lock_irq(&mapping->tree_lock);
 	page = radix_tree_lookup(&mapping->page_tree, offset);
-	if (page)
-		page_cache_get(page);
+	if (page) {
+		if (PagePlaceHolder(page))
+			page = NULL;
+		else
+			page_cache_get(page);
+	}
 	read_unlock_irq(&mapping->tree_lock);
 	return page;
 }
 EXPORT_SYMBOL(find_get_page);
+
+/**
+ * find_or_insert_page - locate a pagecache page or insert one
+ * @mapping: the page's address_space
+ * @index: the page's index into the mapping
+ * @gfp_mask: page allocation mode
+ * @insert: the page to insert if none is found
+ *
+ * Locates a page in the pagecache.  If the page is not present,
+ * @insert is added instead.  @insert is not placed on the LRU lists.
+ * The returned page is locked and has its reference count
+ * incremented.
+ *
+ * find_or_insert_page() may sleep, even if @gfp_mask specifies an atomic
+ * allocation!
+ *
+ * find_or_insert_page() returns the desired page's address, or zero on
+ * memory exhaustion.
+ */
+struct page *find_or_insert_page(struct address_space *mapping,
+		unsigned long index, gfp_t gfp_mask, struct page *insert)
+{
+	struct page *page;
+	int err;
+repeat:
+	page = find_lock_page(mapping, index);
+	if (!page) {
+		err = add_to_page_cache(insert, mapping, index, gfp_mask);
+		if (!err) {
+			page = insert;
+		} else if (err == -EEXIST)
+			goto repeat;
+	}
+	return page;
+}
+EXPORT_SYMBOL(find_or_insert_page);
+
+void remove_placeholder_page(struct address_space *mapping,
+			     struct page *expected, unsigned long offset)
+{
+	struct page *page;
+	write_lock_irq(&mapping->tree_lock);
+	page = radix_tree_lookup(&mapping->page_tree, offset);
+	BUG_ON(!page);
+	BUG_ON(!PagePlaceHolder(page));
+	BUG_ON(page != expected);
+	radix_tree_delete(&mapping->page_tree, offset);
+	write_unlock_irq(&mapping->tree_lock);
+}
+EXPORT_SYMBOL(remove_placeholder_page);
 
 /**
  * find_trylock_page - find and lock a page
@@ -628,7 +770,7 @@ struct page *find_trylock_page(struct ad
 
 	read_lock_irq(&mapping->tree_lock);
 	page = radix_tree_lookup(&mapping->page_tree, offset);
-	if (page && TestSetPageLocked(page))
+	if (page && (PagePlaceHolder(page) || TestSetPageLocked(page)))
 		page = NULL;
 	read_unlock_irq(&mapping->tree_lock);
 	return page;
@@ -654,6 +796,12 @@ repeat:
 repeat:
 	page = radix_tree_lookup(&mapping->page_tree, offset);
 	if (page) {
+		if (PagePlaceHolder(page)) {
+			read_unlock_irq(&mapping->tree_lock);
+			wait_on_placeholder_page(mapping, page, offset);
+			read_lock_irq(&mapping->tree_lock);
+			goto repeat;
+		}
 		page_cache_get(page);
 		if (TestSetPageLocked(page)) {
 			read_unlock_irq(&mapping->tree_lock);
@@ -743,8 +891,17 @@ unsigned find_get_pages(struct address_s
 	read_lock_irq(&mapping->tree_lock);
 	ret = radix_tree_gang_lookup(&mapping->page_tree,
 				(void **)pages, start, nr_pages);
-	for (i = 0; i < ret; i++)
-		page_cache_get(pages[i]);
+	for (i = 0; i < ret; i++) {
+		if (PagePlaceHolder(pages[i])) {
+			/* we can't return a place holder, shift it away */
+			if (i + 1 < ret) {
+				memmove(pages + i, pages + i + 1,
+				        (ret - i - 1) * sizeof(struct page *));
+			}
+			ret--;
+		} else
+			page_cache_get(pages[i]);
+	}
 	read_unlock_irq(&mapping->tree_lock);
 	return ret;
 }
@@ -771,6 +928,8 @@ unsigned find_get_pages_contig(struct ad
 	ret = radix_tree_gang_lookup(&mapping->page_tree,
 				(void **)pages, index, nr_pages);
 	for (i = 0; i < ret; i++) {
+		if (PagePlaceHolder(pages[i]))
+			break;
 		if (pages[i]->mapping == NULL || pages[i]->index != index)
 			break;
 
@@ -801,8 +960,17 @@ unsigned find_get_pages_tag(struct addre
 	read_lock_irq(&mapping->tree_lock);
 	ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
 				(void **)pages, *index, nr_pages, tag);
-	for (i = 0; i < ret; i++)
-		page_cache_get(pages[i]);
+	for (i = 0; i < ret; i++) {
+		if (PagePlaceHolder(pages[i])) {
+			/* we can't return a place holder, shift it away */
+			if (i + 1 < ret) {
+				memmove(pages + i, pages + i + 1,
+				        (ret - i - 1) * sizeof(struct page *));
+			}
+			ret--;
+		} else
+			page_cache_get(pages[i]);
+	}
 	if (ret)
 		*index = pages[ret - 1]->index + 1;
 	read_unlock_irq(&mapping->tree_lock);
diff -r 18a9e9f5c707 mm/truncate.c
--- a/mm/truncate.c	Thu Oct 19 08:30:00 2006 +0700
+++ b/mm/truncate.c	Fri Oct 20 12:38:24 2006 -0400
@@ -207,6 +207,7 @@ void truncate_inode_pages_range(struct a
 		}
 		pagevec_release(&pvec);
 	}
+	wait_on_placeholder_pages_range(mapping, start, end);
 }
 EXPORT_SYMBOL(truncate_inode_pages_range);
 