Introduce a placeholder page for the radix tree. mm/filemap.c is changed to wait on these before adding a page to the page cache, and truncate is changed to wait for all of the placeholder pages in the truncated range to disappear. Placeholder pages can only be tested or looked at with the mapping's tree_lock held, and only page->flags can be trusted. They cannot be locked, and their reference counts cannot be increased or decreased. Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxx> diff -r 18a9e9f5c707 include/linux/mm.h --- a/include/linux/mm.h Thu Oct 19 08:30:00 2006 +0700 +++ b/include/linux/mm.h Fri Oct 20 12:38:24 2006 -0400 @@ -276,6 +276,7 @@ static inline void get_page(struct page if (unlikely(PageCompound(page))) page = (struct page *)page_private(page); VM_BUG_ON(atomic_read(&page->_count) == 0); + VM_BUG_ON(PagePlaceHolder(page)); atomic_inc(&page->_count); } diff -r 18a9e9f5c707 include/linux/page-flags.h --- a/include/linux/page-flags.h Thu Oct 19 08:30:00 2006 +0700 +++ b/include/linux/page-flags.h Fri Oct 20 12:46:03 2006 -0400 @@ -90,6 +90,7 @@ #define PG_reclaim 17 /* To be reclaimed asap */ #define PG_nosave_free 18 /* Used for system suspend/resume */ #define PG_buddy 19 /* Page is free, on buddy lists */ +#define PG_placeholder 20 /* An invalid page holding a slot */ #if (BITS_PER_LONG > 32) @@ -251,6 +252,10 @@ static inline void SetPageUptodate(struc #define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags) #define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags) +#define PagePlaceHolder(page) test_bit(PG_placeholder, &(page)->flags) +#define SetPagePlaceHolder(page) set_bit(PG_placeholder, &(page)->flags) +#define ClearPagePlaceHolder(page) clear_bit(PG_placeholder, &(page)->flags) + struct page; /* forward declaration */ int test_clear_page_dirty(struct page *page); diff -r 18a9e9f5c707 include/linux/pagemap.h --- a/include/linux/pagemap.h Thu Oct 19 08:30:00 2006 +0700 +++ b/include/linux/pagemap.h Fri Oct 20 12:38:24 2006 -0400 @@ -72,6
+72,9 @@ extern struct page * find_get_page(struc unsigned long index); extern struct page * find_lock_page(struct address_space *mapping, unsigned long index); +extern struct page *find_or_insert_page(struct address_space *mapping, + unsigned long index, gfp_t gfp_mask, + struct page *insert); extern __deprecated_for_modules struct page * find_trylock_page( struct address_space *mapping, unsigned long index); extern struct page * find_or_create_page(struct address_space *mapping, @@ -82,6 +85,12 @@ unsigned find_get_pages_contig(struct ad unsigned int nr_pages, struct page **pages); unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, int tag, unsigned int nr_pages, struct page **pages); +void remove_placeholder_page(struct address_space *mapping, struct page *expected, + unsigned long off); +void wake_up_placeholder_page(struct page *page); +void wait_on_placeholder_pages_range(struct address_space *mapping, pgoff_t start, + pgoff_t end); + /* * Returns locked page at given index in given cache, creating it if needed. diff -r 18a9e9f5c707 mm/filemap.c --- a/mm/filemap.c Thu Oct 19 08:30:00 2006 +0700 +++ b/mm/filemap.c Fri Oct 20 13:46:29 2006 -0400 @@ -44,6 +44,9 @@ generic_file_direct_IO(int rw, struct ki generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs); +static void wait_on_placeholder_page(struct address_space *mapping, + struct page *page, unsigned long offset); + /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. 
@@ -437,12 +440,24 @@ int add_to_page_cache(struct page *page, int add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { - int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); + int error; +again: + error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { write_lock_irq(&mapping->tree_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); - if (!error) { + if (error == -EEXIST && (gfp_mask & __GFP_WAIT)) { + struct page *tmp; + tmp = radix_tree_lookup(&mapping->page_tree, offset); + if (tmp && PagePlaceHolder(tmp)) { + write_unlock_irq(&mapping->tree_lock); + radix_tree_preload_end(); + wait_on_placeholder_page(mapping, tmp, offset); + goto again; + } + } + if (!error && !PagePlaceHolder(page)) { page_cache_get(page); SetPageLocked(page); page->mapping = mapping; @@ -526,6 +541,76 @@ void fastcall wait_on_page_bit(struct pa } EXPORT_SYMBOL(wait_on_page_bit); +static void wait_on_placeholder_page(struct address_space *mapping, + struct page *page, unsigned long offset) +{ + DEFINE_WAIT(wait); + wait_queue_head_t *wqh = page_waitqueue(page); + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + read_lock_irq(&mapping->tree_lock); + page = radix_tree_lookup(&mapping->page_tree, offset); + if (page && PagePlaceHolder(page)) { + read_unlock_irq(&mapping->tree_lock); + io_schedule(); + } else + read_unlock_irq(&mapping->tree_lock); + finish_wait(wqh, &wait); +} + +void wake_up_placeholder_page(struct page *page) +{ + wake_up(page_waitqueue(page)); +} +EXPORT_SYMBOL(wake_up_placeholder_page); + +/** + * wait_on_placeholder_pages - gang placeholder page waiter + * @mapping: The address_space to search + * @start: The starting page index + * @end: The max page index + * + * wait_on_placeholder_pages() will search for and wait on a range of pages + * in the mapping + * + * On return, the range has no placeholder pages sitting in it. 
+ */ +void wait_on_placeholder_pages_range(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + unsigned int i; + unsigned int ret; + struct page *pages[8]; + pgoff_t cur = start; + pgoff_t highest = start; + DEFINE_WAIT(wait); + + /* + * we expect a very small number of place holder pages, so + * this code isn't trying to be very fast. + */ +again: + read_lock_irq(&mapping->tree_lock); + ret = radix_tree_gang_lookup(&mapping->page_tree, + (void **)pages, cur, ARRAY_SIZE(pages)); + for (i = 0; i < ret; i++) { + if (PagePlaceHolder(pages[i])) { + wait_queue_head_t *wqh = page_waitqueue(pages[i]); + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + read_unlock_irq(&mapping->tree_lock); + io_schedule(); + finish_wait(wqh, &wait); + goto again; + } else if (pages[i]->index > highest) + highest = pages[i]->index; + } + read_unlock_irq(&mapping->tree_lock); + if (highest < end && ret == ARRAY_SIZE(pages)) { + cur = highest; + goto again; + } +} +EXPORT_SYMBOL(wait_on_placeholder_pages_range); + /** * unlock_page - unlock a locked page * @page: the page @@ -542,6 +627,7 @@ EXPORT_SYMBOL(wait_on_page_bit); */ void fastcall unlock_page(struct page *page) { + BUG_ON(PagePlaceHolder(page)); smp_mb__before_clear_bit(); if (!TestClearPageLocked(page)) BUG(); @@ -578,6 +664,7 @@ void fastcall __lock_page(struct page *p { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + BUG_ON(PagePlaceHolder(page)); __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page, TASK_UNINTERRUPTIBLE); } @@ -590,6 +677,7 @@ void fastcall __lock_page_nosync(struct void fastcall __lock_page_nosync(struct page *page) { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + BUG_ON(PagePlaceHolder(page)); __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock, TASK_UNINTERRUPTIBLE); } @@ -608,12 +696,66 @@ struct page * find_get_page(struct addre read_lock_irq(&mapping->tree_lock); page = radix_tree_lookup(&mapping->page_tree, offset); - if (page) - page_cache_get(page); + if 
(page) { + if (PagePlaceHolder(page)) + page = NULL; + else + page_cache_get(page); + } read_unlock_irq(&mapping->tree_lock); return page; } EXPORT_SYMBOL(find_get_page); + +/** + * find_or_insert_page - locate a pagecache page or insert one + * @mapping: the page's address_space + * @index: the page's index into the mapping + * @gfp_mask: page allocation mode + * @insert: the page to insert if none is found + * + * Locates a page in the pagecache. If the page is not present, + * @insert is added instead. @insert is not placed on the lrus + * The returned page is locked and has its reference count + * incremented + * + * find_or_insert_page() may sleep, even if @gfp_flags specifies an atomic + * allocation! + * + * find_or_insert_page() returns the desired page's address, or zero on + * memory exhaustion. + */ +struct page *find_or_insert_page(struct address_space *mapping, + unsigned long index, gfp_t gfp_mask, struct page *insert) +{ + struct page *page; + int err; +repeat: + page = find_lock_page(mapping, index); + if (!page) { + err = add_to_page_cache(insert, mapping, index, gfp_mask); + if (!err) { + page = insert; + } else if (err == -EEXIST) + goto repeat; + } + return page; +} +EXPORT_SYMBOL(find_or_insert_page); + +void remove_placeholder_page(struct address_space *mapping, + struct page *expected, unsigned long offset) +{ + struct page *page; + write_lock_irq(&mapping->tree_lock); + page = radix_tree_lookup(&mapping->page_tree, offset); + BUG_ON(!page); + BUG_ON(!PagePlaceHolder(page)); + BUG_ON(page != expected); + radix_tree_delete(&mapping->page_tree, offset); + write_unlock_irq(&mapping->tree_lock); +} +EXPORT_SYMBOL(remove_placeholder_page); /** * find_trylock_page - find and lock a page @@ -628,7 +770,7 @@ struct page *find_trylock_page(struct ad read_lock_irq(&mapping->tree_lock); page = radix_tree_lookup(&mapping->page_tree, offset); - if (page && TestSetPageLocked(page)) + if (page && (PagePlaceHolder(page) || TestSetPageLocked(page))) page = 
NULL; read_unlock_irq(&mapping->tree_lock); return page; @@ -654,6 +796,12 @@ repeat: repeat: page = radix_tree_lookup(&mapping->page_tree, offset); if (page) { + if (PagePlaceHolder(page)) { + read_unlock_irq(&mapping->tree_lock); + wait_on_placeholder_page(mapping, page, offset); + read_lock_irq(&mapping->tree_lock); + goto repeat; + } page_cache_get(page); if (TestSetPageLocked(page)) { read_unlock_irq(&mapping->tree_lock); @@ -743,8 +891,17 @@ unsigned find_get_pages(struct address_s read_lock_irq(&mapping->tree_lock); ret = radix_tree_gang_lookup(&mapping->page_tree, (void **)pages, start, nr_pages); - for (i = 0; i < ret; i++) - page_cache_get(pages[i]); + for (i = 0; i < ret; i++) { + if (PagePlaceHolder(pages[i])) { + /* we can't return a place holder, shift it away */ + if (i + 1 < ret) { + memmove(pages + i, pages + i + 1, + (ret - i - 1) * sizeof(struct page *)); + } + ret--; + } else + page_cache_get(pages[i]); + } read_unlock_irq(&mapping->tree_lock); return ret; } @@ -771,6 +928,8 @@ unsigned find_get_pages_contig(struct ad ret = radix_tree_gang_lookup(&mapping->page_tree, (void **)pages, index, nr_pages); for (i = 0; i < ret; i++) { + if (PagePlaceHolder(pages[i])) + break; if (pages[i]->mapping == NULL || pages[i]->index != index) break; @@ -801,8 +960,17 @@ unsigned find_get_pages_tag(struct addre read_lock_irq(&mapping->tree_lock); ret = radix_tree_gang_lookup_tag(&mapping->page_tree, (void **)pages, *index, nr_pages, tag); - for (i = 0; i < ret; i++) - page_cache_get(pages[i]); + for (i = 0; i < ret; i++) { + if (PagePlaceHolder(pages[i])) { + /* we can't return a place holder, shift it away */ + if (i + 1 < ret) { + memmove(pages + i, pages + i + 1, + (ret - i - 1) * sizeof(struct page *)); + } + ret--; + } else + page_cache_get(pages[i]); + } if (ret) *index = pages[ret - 1]->index + 1; read_unlock_irq(&mapping->tree_lock); diff -r 18a9e9f5c707 mm/truncate.c --- a/mm/truncate.c Thu Oct 19 08:30:00 2006 +0700 +++ b/mm/truncate.c Fri Oct 20 
12:38:24 2006 -0400 @@ -207,6 +207,7 @@ void truncate_inode_pages_range(struct a } pagevec_release(&pvec); } + wait_on_placeholder_pages_range(mapping, start, end); } EXPORT_SYMBOL(truncate_inode_pages_range); - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html