[no subject]

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



>From chris.mason@xxxxxxxxxx Thu Dec 21 15:34:58 2006
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Subject: [PATCH 1 of 8] Introduce a place holder page for the pagecache
X-Mercurial-Node: 4cac7e560b5342c0e5e2c45b2e036a936adedc2e
Message-Id: <4cac7e560b5342c0e5e2.1166733297@xxxxxxxxxxxxxxxxxxx>
In-Reply-To: <patchbomb.1166733296@xxxxxxxxxxxxxxxxxxx>
Date: Thu, 21 Dec 2006 15:34:57 -0400
From: Chris Mason <chris.mason@xxxxxxxxxx>
To: linux-fsdevel@xxxxxxxxxxxxxxx, akpm@xxxxxxxx, zach.brown@xxxxxxxxxx

mm/filemap.c is changed to wait on place holder pages before adding a page
into the page cache, and truncates are changed to wait for all of the place
holder pages in the truncated range to disappear.

Place holder pages can only be examined with the mapping lock held.  They
cannot be locked, and cannot have references increased or decreased on them.

Placeholders can span a range bigger than one page.  The placeholder is
inserted into the radix slot for the end of the range, and the flags field in
the page struct is used to record the start of the range.

A bit is added for the radix root (PAGE_CACHE_TAG_EXTENTS).  When
mm/filemap.c finds that bit set, searches for an index in the pagecache
look forward in the tree to find any placeholder that index may intersect.

Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxx>

diff -r 511f067627ac -r 4cac7e560b53 drivers/mtd/devices/block2mtd.c
--- a/drivers/mtd/devices/block2mtd.c	Thu Dec 21 00:20:01 2006 -0800
+++ b/drivers/mtd/devices/block2mtd.c	Thu Dec 21 15:31:30 2006 -0500
@@ -66,7 +66,7 @@ static void cache_readahead(struct addre
 			INFO("Overrun end of disk in cache readahead\n");
 			break;
 		}
-		page = radix_tree_lookup(&mapping->page_tree, pagei);
+		page = radix_tree_lookup_extent(&mapping->page_tree, pagei);
 		if (page && (!i))
 			break;
 		if (page)
diff -r 511f067627ac -r 4cac7e560b53 include/linux/fs.h
--- a/include/linux/fs.h	Thu Dec 21 00:20:01 2006 -0800
+++ b/include/linux/fs.h	Thu Dec 21 15:31:30 2006 -0500
@@ -489,6 +489,11 @@ struct block_device {
  */
 #define PAGECACHE_TAG_DIRTY	0
 #define PAGECACHE_TAG_WRITEBACK	1
+
+/*
+ * This tag is only valid on the root of the radix tree
+ */
+#define PAGE_CACHE_TAG_EXTENTS 2
 
 int mapping_tagged(struct address_space *mapping, int tag);
 
diff -r 511f067627ac -r 4cac7e560b53 include/linux/page-flags.h
--- a/include/linux/page-flags.h	Thu Dec 21 00:20:01 2006 -0800
+++ b/include/linux/page-flags.h	Thu Dec 21 15:31:30 2006 -0500
@@ -267,4 +267,6 @@ static inline void set_page_writeback(st
 	test_set_page_writeback(page);
 }
 
+void set_page_placeholder(struct page *page, pgoff_t start, pgoff_t end);
+
 #endif	/* PAGE_FLAGS_H */
diff -r 511f067627ac -r 4cac7e560b53 include/linux/pagemap.h
--- a/include/linux/pagemap.h	Thu Dec 21 00:20:01 2006 -0800
+++ b/include/linux/pagemap.h	Thu Dec 21 15:31:30 2006 -0500
@@ -76,6 +76,11 @@ extern struct page * find_get_page(struc
 				unsigned long index);
 extern struct page * find_lock_page(struct address_space *mapping,
 				unsigned long index);
+int find_or_insert_placeholders(struct address_space *mapping,
+                                  struct page **pages, unsigned long start,
+                                  unsigned long end, unsigned long nr,
+                                  gfp_t gfp_mask,
+                                  int wait);
 extern __deprecated_for_modules struct page * find_trylock_page(
 			struct address_space *mapping, unsigned long index);
 extern struct page * find_or_create_page(struct address_space *mapping,
@@ -86,6 +91,15 @@ unsigned find_get_pages_contig(struct ad
 			       unsigned int nr_pages, struct page **pages);
 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 			int tag, unsigned int nr_pages, struct page **pages);
+void remove_placeholder_pages(struct address_space *mapping,
+                             struct page **pages,
+                             unsigned long offset,
+                             unsigned long end,
+                             unsigned long nr);
+void wake_up_placeholder_page(struct page *page);
+void wait_on_placeholder_pages_range(struct address_space *mapping, pgoff_t start,
+			       pgoff_t end);
+
 
 /*
  * Returns locked page at given index in given cache, creating it if needed.
@@ -116,6 +130,8 @@ int add_to_page_cache_lru(struct page *p
 				unsigned long index, gfp_t gfp_mask);
 extern void remove_from_page_cache(struct page *page);
 extern void __remove_from_page_cache(struct page *page);
+struct page *radix_tree_lookup_extent(struct radix_tree_root *root,
+					     unsigned long index);
 
 /*
  * Return byte-offset into filesystem object for page.
diff -r 511f067627ac -r 4cac7e560b53 include/linux/radix-tree.h
--- a/include/linux/radix-tree.h	Thu Dec 21 00:20:01 2006 -0800
+++ b/include/linux/radix-tree.h	Thu Dec 21 15:31:30 2006 -0500
@@ -53,6 +53,7 @@ static inline int radix_tree_is_direct_p
 /*** radix-tree API starts here ***/
 
 #define RADIX_TREE_MAX_TAGS 2
+#define RADIX_TREE_MAX_ROOT_TAGS 3
 
 /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
 struct radix_tree_root {
@@ -168,6 +169,7 @@ radix_tree_gang_lookup_tag(struct radix_
 		unsigned long first_index, unsigned int max_items,
 		unsigned int tag);
 int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
+void radix_tree_root_tag_set(struct radix_tree_root *root, unsigned int tag);
 
 static inline void radix_tree_preload_end(void)
 {
diff -r 511f067627ac -r 4cac7e560b53 lib/radix-tree.c
--- a/lib/radix-tree.c	Thu Dec 21 00:20:01 2006 -0800
+++ b/lib/radix-tree.c	Thu Dec 21 15:31:30 2006 -0500
@@ -468,6 +468,12 @@ void *radix_tree_tag_set(struct radix_tr
 	return slot;
 }
 EXPORT_SYMBOL(radix_tree_tag_set);
+
+void radix_tree_root_tag_set(struct radix_tree_root *root, unsigned int tag)
+{
+	root_tag_set(root, tag);	/* exported wrapper; takes no slot index */
+}
+EXPORT_SYMBOL(radix_tree_root_tag_set);
 
 /**
  *	radix_tree_tag_clear - clear a tag on a radix tree node
diff -r 511f067627ac -r 4cac7e560b53 mm/filemap.c
--- a/mm/filemap.c	Thu Dec 21 00:20:01 2006 -0800
+++ b/mm/filemap.c	Thu Dec 21 15:31:30 2006 -0500
@@ -44,6 +44,14 @@ generic_file_direct_IO(int rw, struct ki
 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	loff_t offset, unsigned long nr_segs);
 
+static wait_queue_head_t *page_waitqueue(struct page *page);
+static void wait_on_placeholder_page(struct address_space *mapping,
+				     struct page *page,
+				     int write_lock);
+
+static struct address_space placeholder_address_space;
+#define PagePlaceHolder(page) ((page)->mapping == &placeholder_address_space)
+
 /*
  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  * though.
@@ -421,6 +429,41 @@ int filemap_write_and_wait_range(struct 
 	return err;
 }
 
+/*
+ * Look up @index in a page cache radix tree that may hold placeholders.
+ * A placeholder lives in the slot for the end of its range, so when the
+ * root carries PAGE_CACHE_TAG_EXTENTS the first entry at or after @index
+ * has to be fetched and checked against @index.  When the tag is clear
+ * the faster radix_tree_lookup is used instead.
+ *
+ * Returns the page or placeholder that covers @index, or NULL.
+ */
+struct page *radix_tree_lookup_extent(struct radix_tree_root *root,
+					     unsigned long index)
+{
+	struct page *entry;
+	pgoff_t first;
+	pgoff_t last;
+
+	if (!radix_tree_tagged(root, PAGE_CACHE_TAG_EXTENTS))
+		return radix_tree_lookup(root, index);
+
+	/* first entry at or after index, if any */
+	if (!radix_tree_gang_lookup(root, (void **)(&entry), index, 1))
+		return NULL;
+
+	/*
+	 * a placeholder spans ->flags to ->index, a real page only ->index
+	 */
+	last = entry->index;
+	first = PagePlaceHolder(entry) ? entry->flags : last;
+
+	if (first <= index && index <= last)
+		return entry;
+	return NULL;
+}
+
+
 /**
  * add_to_page_cache - add newly allocated pagecache pages
  * @page:	page to add
@@ -437,12 +480,38 @@ int add_to_page_cache(struct page *page,
 int add_to_page_cache(struct page *page, struct address_space *mapping,
 		pgoff_t offset, gfp_t gfp_mask)
 {
-	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+	int error;
+again:
+	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 
 	if (error == 0) {
+		struct page *tmp;
 		write_lock_irq(&mapping->tree_lock);
+		/*
+		 * If extents are on for this radix tree, we have to do
+		 * the more expensive search for an overlapping extent
+		 * before we try to insert.
+		 */
+		if (radix_tree_tagged(&mapping->page_tree,
+				      PAGE_CACHE_TAG_EXTENTS)) {
+			tmp = radix_tree_lookup_extent(&mapping->page_tree,
+						       offset);
+			if (tmp && PagePlaceHolder(tmp))
+				goto exists;
+		}
 		error = radix_tree_insert(&mapping->page_tree, offset, page);
-		if (!error) {
+		if (error == -EEXIST && (gfp_mask & __GFP_WAIT)) {
+			tmp = radix_tree_lookup_extent(&mapping->page_tree,
+						       offset);
+			if (tmp && PagePlaceHolder(tmp)) {
+exists:
+				radix_tree_preload_end();
+				/* drops the lock */
+				wait_on_placeholder_page(mapping, tmp, 1);
+				goto again;
+			}
+		}
+		if (!error && !PagePlaceHolder(page)) {
 			page_cache_get(page);
 			SetPageLocked(page);
 			page->mapping = mapping;
@@ -516,6 +585,92 @@ void fastcall wait_on_page_bit(struct pa
 }
 EXPORT_SYMBOL(wait_on_page_bit);
 
+/*
+ * Sleep until the waitqueue of a placeholder page is woken.  Call with the
+ * mapping tree lock held: for writing if @write_lock is set, otherwise for
+ * reading.  placeholder pages can't be tested without the tree lock held.
+ *
+ * In order to wait for the placeholders without losing a wakeup from someone
+ * removing them, we have to prepare_to_wait before dropping the tree lock.
+ *
+ * The lock is dropped just before sleeping and is not retaken on return.
+ */
+static void wait_on_placeholder_page(struct address_space *mapping,
+				     struct page *page,
+				     int write_lock)
+{
+	DEFINE_WAIT(wait);
+	wait_queue_head_t *wqh = page_waitqueue(page);
+	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+	if (write_lock)
+		write_unlock_irq(&mapping->tree_lock);
+	else
+		read_unlock_irq(&mapping->tree_lock);
+	io_schedule();
+	finish_wait(wqh, &wait);
+}
+
+void wake_up_placeholder_page(struct page *page)
+{
+	/* wakes tasks queued on page_waitqueue(page) by the waiter above */
+	__wake_up_bit(page_waitqueue(page), &page->flags, PG_locked);
+}
+EXPORT_SYMBOL_GPL(wake_up_placeholder_page);
+
+/**
+ * wait_on_placeholder_pages_range - gang placeholder page waiter
+ * @mapping:	The address_space to search
+ * @start:	The starting page index
+ * @end:	The max page index (inclusive)
+ *
+ * wait_on_placeholder_pages_range() will search for and wait on any
+ * placeholder pages that intersect the range in the mapping
+ *
+ * On return, the range has no placeholder pages sitting in it.
+ */
+void wait_on_placeholder_pages_range(struct address_space *mapping,
+			       pgoff_t start, pgoff_t end)
+{
+	unsigned int i;
+	unsigned int ret;
+	struct page *pages[8];
+	pgoff_t cur = start;
+	pgoff_t highest = start;
+
+	/*
+	 * we expect a very small number of place holder pages, so
+	 * this code isn't trying to be very fast.
+	 */
+again:
+	read_lock_irq(&mapping->tree_lock);
+	ret = radix_tree_gang_lookup(&mapping->page_tree,
+				(void **)pages, cur, ARRAY_SIZE(pages));
+	for (i = 0; i < ret; i++) {
+		if (PagePlaceHolder(pages[i])) {
+			if (pages[i]->flags > end)
+				goto done;
+			/* drops the lock */
+			wait_on_placeholder_page(mapping, pages[i], 0);
+			goto again;
+		}
+		if (pages[i]->index > highest)
+			highest = pages[i]->index;
+		if (pages[i]->index > end)
+			goto done;
+	}
+	if (highest < end && ret == ARRAY_SIZE(pages)) {
+		cur = highest;
+		/* "again" retakes the tree lock: drop it unconditionally */
+		read_unlock_irq(&mapping->tree_lock);
+		/* the lock is gone and irqs are on, so we may reschedule */
+		cond_resched();
+		goto again;
+	}
+done:
+	read_unlock_irq(&mapping->tree_lock);
+}
+EXPORT_SYMBOL_GPL(wait_on_placeholder_pages_range);
+
 /**
  * unlock_page - unlock a locked page
  * @page: the page
@@ -532,6 +687,7 @@ EXPORT_SYMBOL(wait_on_page_bit);
  */
 void fastcall unlock_page(struct page *page)
 {
+	BUG_ON(PagePlaceHolder(page));
 	smp_mb__before_clear_bit();
 	if (!TestClearPageLocked(page))
 		BUG();
@@ -568,6 +724,7 @@ void fastcall __lock_page(struct page *p
 {
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
+	BUG_ON(PagePlaceHolder(page));
 	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
 							TASK_UNINTERRUPTIBLE);
 }
@@ -580,6 +737,7 @@ void fastcall __lock_page_nosync(struct 
 void fastcall __lock_page_nosync(struct page *page)
 {
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+	BUG_ON(PagePlaceHolder(page));
 	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
 							TASK_UNINTERRUPTIBLE);
 }
@@ -597,13 +755,281 @@ struct page * find_get_page(struct addre
 	struct page *page;
 
 	read_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
-	if (page)
-		page_cache_get(page);
+	page = radix_tree_lookup_extent(&mapping->page_tree, offset);
+	if (page) {
+		if (PagePlaceHolder(page))
+			page = NULL;
+		else
+			page_cache_get(page);
+	}
 	read_unlock_irq(&mapping->tree_lock);
 	return page;
 }
 EXPORT_SYMBOL(find_get_page);
+
+/**
+ * remove_placeholder_pages - remove a range of placeholder or locked pages
+ * @mapping: the page's address_space
+ * @pages: an array of page pointers to use for gang lookups
+ * @start: the search starting point
+ * @end: the search end point (offsets >= end are not touched)
+ * @nr: the size of the pages array.
+ *
+ * Any placeholder pages in the range specified are removed and freed,
+ * and their waiters are woken.  Any real pages are unlocked and released.
+ * The walk ends once the tree has no more entries below @end.
+ */
+void remove_placeholder_pages(struct address_space *mapping,
+			     struct page **pages,
+			     unsigned long start,
+			     unsigned long end,
+			     unsigned long nr)
+{
+	struct page *page;
+	int ret;
+	int i;
+	unsigned long num;
+
+	write_lock_irq(&mapping->tree_lock);
+	while (start < end) {
+		num = min(nr, end-start);
+		ret = radix_tree_gang_lookup(&mapping->page_tree,
+						(void **)pages, start, num);
+		for (i = 0; i < ret; i++) {
+			page = pages[i];
+			if (page->index >= end)
+				break;
+			start = page->index + 1;
+			if (PagePlaceHolder(page)) {
+				radix_tree_delete(&mapping->page_tree,
+						  page->index);
+				wake_up_placeholder_page(page);
+				kfree(page);
+			} else {
+				unlock_page(page);
+				page_cache_release(page);
+			}
+		}
+		/* nothing left below end: stop, don't spin on the same lookup */
+		if (ret == 0 || i < ret)
+			break;
+		if (need_resched()) {
+			write_unlock_irq(&mapping->tree_lock);
+			cond_resched();
+			write_lock_irq(&mapping->tree_lock);
+		}
+	}
+	write_unlock_irq(&mapping->tree_lock);
+}
+EXPORT_SYMBOL_GPL(remove_placeholder_pages);
+
+/*
+ * a helper function to insert one placeholder into the radix tree.  The
+ * placeholder covers the offsets insert->flags through insert->index and
+ * goes into the slot for insert->index.  The callers in this file hold
+ * the tree lock for writing and have preloaded the radix tree.
+ *
+ * returns zero on success or the error from radix_tree_insert().
+ */
+static int insert_placeholder(struct address_space *mapping,
+					 struct page *insert)
+{
+	unsigned int found;
+	struct page *debug_page;
+
+	/* sanity check, make sure other extents don't exist in this range */
+	found = radix_tree_gang_lookup(&mapping->page_tree,
+				    (void **)(&debug_page),
+				    insert->flags, 1);
+	/* ->flags is a range start only for placeholders, not real pages */
+	BUG_ON(found > 0 && (PagePlaceHolder(debug_page) ?
+	       debug_page->flags : debug_page->index) <= insert->index);
+	return radix_tree_insert(&mapping->page_tree, insert->index, insert);
+}
+
+
+static struct page *alloc_placeholder(gfp_t gfp_mask)
+{
+	struct page *p = kmalloc(sizeof(*p), gfp_mask);
+
+	if (!p)
+		return NULL;
+	memset(p, 0, sizeof(*p));
+	/* this ->mapping value is what PagePlaceHolder() tests for */
+	p->mapping = &placeholder_address_space;
+	return p;
+}
+
+/**
+ * find_or_insert_placeholders - lock down a range of pagecache offsets
+ * @mapping: the page's address_space
+ * @pages: an array of page pointers to use for gang lookups
+ * @start: the search starting point
+ * @end: the search end point (offsets >= end are not touched)
+ * @nr: the size of the pages array.
+ * @gfp_mask: page allocation mode
+ * @iowait: 1 if you want to wait for dirty or writeback pages.
+ *
+ * This locks down a range of offsets in the address space.  Any pages
+ * already present are locked and a placeholder page is inserted into
+ * the radix tree for any offsets without pages.  Returns 0 on success,
+ * or an error code after releasing the offsets locked down so far.
+ */
+int find_or_insert_placeholders(struct address_space *mapping,
+				  struct page **pages, unsigned long start,
+				  unsigned long end, unsigned long nr,
+				  gfp_t gfp_mask,
+				  int iowait)
+{
+	int err = 0;
+	int i, ret;
+	unsigned long cur = start;
+	struct page *page;
+	int restart;
+	struct page *insert = NULL;
+	/*
+	 * this gets complicated.  Placeholders and page locks need to
+	 * be taken in order.  We use gang lookup to cut down on the cpu
+	 * cost, but we need to keep track of holes in the results and
+	 * insert placeholders as appropriate.
+	 *
+	 * If a locked page or a placeholder is found, we wait for it and
+	 * pick up where we left off.  If a dirty or PG_writeback page is found
+	 * and iowait==1, we have to drop all of our locks, kick/wait for the
+	 * io and resume again.
+	 */
+repeat:
+	if (!insert) {
+		insert = alloc_placeholder(gfp_mask);
+		if (!insert) {
+			err = -ENOMEM;
+			goto fail;
+		}
+	}
+	if (cur != start)
+		cond_resched();
+	err = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+	if (err)
+		goto fail;
+	write_lock_irq(&mapping->tree_lock);
+
+	/* only set the extent tag if we are inserting placeholders for more
+	 * than one page worth of slots.  This way small random ios don't
+	 * suffer from slower lookups.
+	 */
+	if (cur == start && end - start > 1)
+		radix_tree_root_tag_set(&mapping->page_tree,
+					PAGE_CACHE_TAG_EXTENTS);
+repeat_lock:
+	ret = radix_tree_gang_lookup(&mapping->page_tree,
+					(void **)pages, cur,
+					min(nr, end-cur));
+	for (i = 0 ; i < ret ; i++) {
+		restart = 0;
+		page = pages[i];
+
+		if (PagePlaceHolder(page) && page->flags < end) {
+			radix_tree_preload_end();
+			/* drops the lock */
+			wait_on_placeholder_page(mapping, page, 1);
+			goto repeat;
+		}
+
+		if (page->index > cur) {
+			unsigned long top = min(end, page->index);
+			insert->index = top - 1;
+			insert->flags = cur;
+			err = insert_placeholder(mapping, insert);
+			write_unlock_irq(&mapping->tree_lock);
+			radix_tree_preload_end();
+			/* on failure we still own insert; fail: frees it */
+			if (err)
+				goto fail;
+			insert = NULL;
+			cur = top;
+			if (cur < end)
+				goto repeat;
+			else
+				goto done;
+		}
+		if (page->index >= end) {
+			ret = 0;
+			break;
+		}
+		page_cache_get(page);
+		BUG_ON(page->index != cur);
+		BUG_ON(PagePlaceHolder(page));
+		if (TestSetPageLocked(page)) {
+			unsigned long tmpoff = page->index;
+			/* the reference taken above pins the page */
+			write_unlock_irq(&mapping->tree_lock);
+			radix_tree_preload_end();
+			__lock_page(page);
+			/* Has the page been truncated while we slept? */
+			if (unlikely(page->mapping != mapping ||
+				     page->index != tmpoff)) {
+				unlock_page(page);
+				page_cache_release(page);
+				goto repeat;
+			} else {
+				/* we've locked the page, but  we need to
+				 *  check it for dirty/writeback
+				 */
+				restart = 1;
+			}
+		}
+		if (iowait && (PageDirty(page) || PageWriteback(page))) {
+			unlock_page(page);
+			page_cache_release(page);
+			if (!restart) {
+				write_unlock_irq(&mapping->tree_lock);
+				radix_tree_preload_end();
+			}
+			err = filemap_write_and_wait_range(mapping,
+						 cur << PAGE_CACHE_SHIFT,
+						 end << PAGE_CACHE_SHIFT);
+			if (err)
+				goto fail;
+			goto repeat;
+		}
+		cur++;
+		if (restart)
+			goto repeat;
+		if (cur >= end)
+			break;
+	}
+
+	/* we haven't yet filled the range */
+	if (cur < end) {
+		/* if the search filled our array, there is more to do. */
+		if (ret && ret == nr)
+			goto repeat_lock;
+
+		/* otherwise insert placeholders for the remaining offsets */
+		insert->index = end - 1;
+		insert->flags = cur;
+		err = insert_placeholder(mapping, insert);
+		write_unlock_irq(&mapping->tree_lock);
+		radix_tree_preload_end();
+		if (err)
+			goto fail;
+		insert = NULL;
+		cur = end;
+	} else {
+		write_unlock_irq(&mapping->tree_lock);
+		radix_tree_preload_end();
+	}
+done:
+	BUG_ON(cur < end);
+	BUG_ON(cur > end);
+	if (insert)
+		kfree(insert);
+	return err;
+fail:
+	remove_placeholder_pages(mapping, pages, start, cur, nr);
+	if (insert)
+		kfree(insert);
+	return err;
+}
+EXPORT_SYMBOL_GPL(find_or_insert_placeholders);
 
 /**
  * find_trylock_page - find and lock a page
@@ -617,8 +1043,8 @@ struct page *find_trylock_page(struct ad
 	struct page *page;
 
 	read_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
-	if (page && TestSetPageLocked(page))
+	page = radix_tree_lookup_extent(&mapping->page_tree, offset);
+	if (page && (PagePlaceHolder(page) || TestSetPageLocked(page)))
 		page = NULL;
 	read_unlock_irq(&mapping->tree_lock);
 	return page;
@@ -642,8 +1068,14 @@ struct page *find_lock_page(struct addre
 
 	read_lock_irq(&mapping->tree_lock);
 repeat:
-	page = radix_tree_lookup(&mapping->page_tree, offset);
+	page = radix_tree_lookup_extent(&mapping->page_tree, offset);
 	if (page) {
+		if (PagePlaceHolder(page)) {
+			/* drops the lock */
+			wait_on_placeholder_page(mapping, page, 0);
+			read_lock_irq(&mapping->tree_lock);
+			goto repeat;
+		}
 		page_cache_get(page);
 		if (TestSetPageLocked(page)) {
 			read_unlock_irq(&mapping->tree_lock);
@@ -727,14 +1159,25 @@ unsigned find_get_pages(struct address_s
 unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 			    unsigned int nr_pages, struct page **pages)
 {
-	unsigned int i;
+	unsigned int i = 0;
 	unsigned int ret;
 
 	read_lock_irq(&mapping->tree_lock);
 	ret = radix_tree_gang_lookup(&mapping->page_tree,
 				(void **)pages, start, nr_pages);
-	for (i = 0; i < ret; i++)
-		page_cache_get(pages[i]);
+	while (i < ret) {
+		if (PagePlaceHolder(pages[i])) {
+			/* drop the place holder; ranges overlap, so memmove */
+			if (i + 1 < ret) {
+				memmove(&pages[i], &pages[i+1],
+					(ret - i - 1) * sizeof(struct page *));
+			}
+			ret--;
+			continue;
+		} else
+			page_cache_get(pages[i]);
+		i++;
+	}
 	read_unlock_irq(&mapping->tree_lock);
 	return ret;
 }
@@ -761,6 +1204,8 @@ unsigned find_get_pages_contig(struct ad
 	ret = radix_tree_gang_lookup(&mapping->page_tree,
 				(void **)pages, index, nr_pages);
 	for (i = 0; i < ret; i++) {
+		if (PagePlaceHolder(pages[i]))
+			break;
 		if (pages[i]->mapping == NULL || pages[i]->index != index)
 			break;
 
@@ -785,14 +1230,25 @@ unsigned find_get_pages_tag(struct addre
 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 			int tag, unsigned int nr_pages, struct page **pages)
 {
-	unsigned int i;
+	unsigned int i = 0;
 	unsigned int ret;
 
 	read_lock_irq(&mapping->tree_lock);
 	ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
 				(void **)pages, *index, nr_pages, tag);
-	for (i = 0; i < ret; i++)
-		page_cache_get(pages[i]);
+	while (i < ret) {
+		if (PagePlaceHolder(pages[i])) {
+			/* drop the place holder; ranges overlap, so memmove */
+			if (i + 1 < ret) {
+				memmove(&pages[i], &pages[i+1],
+					(ret - i - 1) * sizeof(struct page *));
+			}
+			ret--;
+			continue;
+		} else
+			page_cache_get(pages[i]);
+		i++;
+	}
 	if (ret)
 		*index = pages[ret - 1]->index + 1;
 	read_unlock_irq(&mapping->tree_lock);
@@ -2406,18 +2862,15 @@ generic_file_direct_IO(int rw, struct ki
 			unmap_mapping_range(mapping, offset, write_len, 0);
 	}
 
-	retval = filemap_write_and_wait(mapping);
-	if (retval == 0) {
-		retval = mapping->a_ops->direct_IO(rw, iocb, iov,
-						offset, nr_segs);
-		if (rw == WRITE && mapping->nrpages) {
-			pgoff_t end = (offset + write_len - 1)
-						>> PAGE_CACHE_SHIFT;
-			int err = invalidate_inode_pages2_range(mapping,
-					offset >> PAGE_CACHE_SHIFT, end);
-			if (err)
-				retval = err;
-		}
+	retval = mapping->a_ops->direct_IO(rw, iocb, iov,
+					offset, nr_segs);
+	if (rw == WRITE && mapping->nrpages) {
+		pgoff_t end = (offset + write_len - 1)
+					>> PAGE_CACHE_SHIFT;
+		int err = invalidate_inode_pages2_range(mapping,
+				offset >> PAGE_CACHE_SHIFT, end);
+		if (err)
+			retval = err;
 	}
 	return retval;
 }
diff -r 511f067627ac -r 4cac7e560b53 mm/migrate.c
--- a/mm/migrate.c	Thu Dec 21 00:20:01 2006 -0800
+++ b/mm/migrate.c	Thu Dec 21 15:31:30 2006 -0500
@@ -305,8 +305,12 @@ static int migrate_page_move_mapping(str
 
 	write_lock_irq(&mapping->tree_lock);
 
+	/*
+	 * we don't need to worry about placeholders here,
+	 * the slot in the tree is verified
+	 */
 	pslot = radix_tree_lookup_slot(&mapping->page_tree,
- 					page_index(page));
+					page_index(page));
 
 	if (page_count(page) != 2 + !!PagePrivate(page) ||
 			(struct page *)radix_tree_deref_slot(pslot) != page) {
diff -r 511f067627ac -r 4cac7e560b53 mm/readahead.c
--- a/mm/readahead.c	Thu Dec 21 00:20:01 2006 -0800
+++ b/mm/readahead.c	Thu Dec 21 15:31:30 2006 -0500
@@ -288,7 +288,8 @@ __do_page_cache_readahead(struct address
 		if (page_offset > end_index)
 			break;
 
-		page = radix_tree_lookup(&mapping->page_tree, page_offset);
+		page = radix_tree_lookup_extent(&mapping->page_tree,
+						page_offset);
 		if (page)
 			continue;
 
diff -r 511f067627ac -r 4cac7e560b53 mm/truncate.c
--- a/mm/truncate.c	Thu Dec 21 00:20:01 2006 -0800
+++ b/mm/truncate.c	Thu Dec 21 15:31:30 2006 -0500
@@ -209,6 +209,7 @@ void truncate_inode_pages_range(struct a
 		}
 		pagevec_release(&pvec);
 	}
+	wait_on_placeholder_pages_range(mapping, start, end);
 }
 EXPORT_SYMBOL(truncate_inode_pages_range);
 


-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux