Re: [patch 5/9] mm + fs: prepare for non-page entries in page cache radix trees

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Jan 10, 2014 at 01:10:39PM -0500, Johannes Weiner wrote:
> shmem mappings already contain exceptional entries where swap slot
> information is remembered.
> 
> To be able to store eviction information for regular page cache,
> prepare every site dealing with the radix trees directly to handle
> entries other than pages.
> 
> The common lookup functions will filter out non-page entries and
> return NULL for page cache holes, just as before.  But provide a raw
> version of the API which returns non-page entries as well, and switch
> shmem over to use it.
> 
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>

I know I'm really late in the game here and this isn't even a proper
review. It's just a glancing through my inbox trying to reduce the mess
before going offline.

> diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
> index 6aad98cb343f..c88316587900 100644
> --- a/fs/btrfs/compression.c
> +++ b/fs/btrfs/compression.c
> @@ -474,7 +474,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
>  		rcu_read_lock();
>  		page = radix_tree_lookup(&mapping->page_tree, pg_index);
>  		rcu_read_unlock();
> -		if (page) {
> +		if (page && !radix_tree_exceptional_entry(page)) {
>  			misses++;
>  			if (misses > 4)
>  				break;

For unrelated reasons this raises a red flag for me. The block of code
looks very similar to find_or_create_page except it uses different locking
for reasons that are not very obvious at first glance.  It's also a little
similar to normal readahead but different enough that they cannot easily
share code. The similarities are close enough to wonder why btrfs had
to be special here and it now looks worse when they have to deal with
exceptional entries.

It's something for the btrfs people I guess.

> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 8b6e55ee8855..c09ef3ae55bc 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -906,6 +906,14 @@ extern void show_free_areas(unsigned int flags);
>  extern bool skip_free_areas_node(unsigned int flags, int nid);
>  
>  int shmem_zero_setup(struct vm_area_struct *);
> +#ifdef CONFIG_SHMEM
> +bool shmem_mapping(struct address_space *mapping);
> +#else
> +static inline bool shmem_mapping(struct address_space *mapping)
> +{
> +	return false;
> +}
> +#endif
>  
>  extern int can_do_mlock(void);
>  extern int user_shm_lock(size_t, struct user_struct *);
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index c73130c607c4..b6854b7c58cb 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -248,12 +248,15 @@ pgoff_t page_cache_next_hole(struct address_space *mapping,
>  pgoff_t page_cache_prev_hole(struct address_space *mapping,
>  			     pgoff_t index, unsigned long max_scan);
>  
> -extern struct page * find_get_page(struct address_space *mapping,
> -				pgoff_t index);
> -extern struct page * find_lock_page(struct address_space *mapping,
> -				pgoff_t index);
> -extern struct page * find_or_create_page(struct address_space *mapping,
> -				pgoff_t index, gfp_t gfp_mask);
> +struct page *__find_get_page(struct address_space *mapping, pgoff_t offset);
> +struct page *find_get_page(struct address_space *mapping, pgoff_t offset);
> +struct page *__find_lock_page(struct address_space *mapping, pgoff_t offset);
> +struct page *find_lock_page(struct address_space *mapping, pgoff_t offset);
> +struct page *find_or_create_page(struct address_space *mapping, pgoff_t index,
> +				 gfp_t gfp_mask);
> +unsigned __find_get_pages(struct address_space *mapping, pgoff_t start,
> +			  unsigned int nr_pages, struct page **pages,
> +			  pgoff_t *indices);

When I see foo() and __foo() in a header, my first assumption is that
__foo() is a version of foo() that assumes the necesssary locks are
already held. If I see it within a C file, my second assumption will be
that it's an internal helper. Here __find_get_page is not returning just
a page. It's returning a page or a shadow entry if they exist and that may
cause some confusion. Consider renaming __find_get_page to find_get_entry()
to give a hint to the reader they should be looking out for either pages
or shadow entries. It still makes sense for find_lock_entry -- if it's a
page, then it'll return locked etc

>  unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
>  			unsigned int nr_pages, struct page **pages);
>  unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
> diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
> index e4dbfab37729..3c6b8b1e945b 100644
> --- a/include/linux/pagevec.h
> +++ b/include/linux/pagevec.h
> @@ -22,6 +22,9 @@ struct pagevec {
>  
>  void __pagevec_release(struct pagevec *pvec);
>  void __pagevec_lru_add(struct pagevec *pvec);
> +unsigned __pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
> +			  pgoff_t start, unsigned nr_pages, pgoff_t *indices);
> +void pagevec_remove_exceptionals(struct pagevec *pvec);
>  unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
>  		pgoff_t start, unsigned nr_pages);
>  unsigned pagevec_lookup_tag(struct pagevec *pvec,
> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> index 30aa0dc60d75..deb49609cd36 100644
> --- a/include/linux/shmem_fs.h
> +++ b/include/linux/shmem_fs.h
> @@ -49,6 +49,7 @@ extern struct file *shmem_file_setup(const char *name,
>  					loff_t size, unsigned long flags);
>  extern int shmem_zero_setup(struct vm_area_struct *);
>  extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
> +extern bool shmem_mapping(struct address_space *mapping);
>  extern void shmem_unlock_mapping(struct address_space *mapping);
>  extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
>  					pgoff_t index, gfp_t gfp_mask);
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 0746b7a4658f..23eb3be27205 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -446,6 +446,29 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
>  }
>  EXPORT_SYMBOL_GPL(replace_page_cache_page);
>  
> +static int page_cache_tree_insert(struct address_space *mapping,
> +				  struct page *page)
> +{

Nothing here on the locking rules for the function although the existing
docs here are poor. Everyone knows you need the mapping lock and page lock
here, right?

> +	void **slot;
> +	int error;
> +
> +	slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
> +	if (slot) {
> +		void *p;
> +
> +		p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
> +		if (!radix_tree_exceptional_entry(p))
> +			return -EEXIST;
> +		radix_tree_replace_slot(slot, page);
> +		mapping->nrpages++;
> +		return 0;
> +	}
> +	error = radix_tree_insert(&mapping->page_tree, page->index, page);
> +	if (!error)
> +		mapping->nrpages++;
> +	return error;
> +}
> +
>  /**
>   * add_to_page_cache_locked - add a locked page to the pagecache
>   * @page:	page to add
> @@ -480,11 +503,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
>  	page->index = offset;
>  
>  	spin_lock_irq(&mapping->tree_lock);
> -	error = radix_tree_insert(&mapping->page_tree, offset, page);
> +	error = page_cache_tree_insert(mapping, page);
>  	radix_tree_preload_end();
>  	if (unlikely(error))
>  		goto err_insert;
> -	mapping->nrpages++;
>  	__inc_zone_page_state(page, NR_FILE_PAGES);
>  	spin_unlock_irq(&mapping->tree_lock);
>  	trace_mm_filemap_add_to_page_cache(page);
> @@ -712,7 +734,10 @@ pgoff_t page_cache_next_hole(struct address_space *mapping,
>  	unsigned long i;
>  
>  	for (i = 0; i < max_scan; i++) {
> -		if (!radix_tree_lookup(&mapping->page_tree, index))
> +		struct page *page;
> +
> +		page = radix_tree_lookup(&mapping->page_tree, index);
> +		if (!page || radix_tree_exceptional_entry(page))
>  			break;
>  		index++;
>  		if (index == 0)
> @@ -750,7 +775,10 @@ pgoff_t page_cache_prev_hole(struct address_space *mapping,
>  	unsigned long i;
>  
>  	for (i = 0; i < max_scan; i++) {
> -		if (!radix_tree_lookup(&mapping->page_tree, index))
> +		struct page *page;
> +
> +		page = radix_tree_lookup(&mapping->page_tree, index);
> +		if (!page || radix_tree_exceptional_entry(page))
>  			break;
>  		index--;
>  		if (index == ULONG_MAX)
> @@ -762,14 +790,19 @@ pgoff_t page_cache_prev_hole(struct address_space *mapping,
>  EXPORT_SYMBOL(page_cache_prev_hole);
>  
>  /**
> - * find_get_page - find and get a page reference
> + * __find_get_page - find and get a page reference

This comment will be out of date when it could be returning shadow
entries

>   * @mapping: the address_space to search
>   * @offset: the page index
>   *
> - * Is there a pagecache struct page at the given (mapping, offset) tuple?
> - * If yes, increment its refcount and return it; if no, return NULL.
> + * Looks up the page cache slot at @mapping & @offset.  If there is a
> + * page cache page, it is returned with an increased refcount.
> + *
> + * If the slot holds a shadow entry of a previously evicted page, it
> + * is returned.
> + *

That's not true yet but who cares. Anyone doing a git blame of the history
will need to search around the area anyway.

> + * Otherwise, %NULL is returned.
>   */
> -struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
> +struct page *__find_get_page(struct address_space *mapping, pgoff_t offset)
>  {
>  	void **pagep;
>  	struct page *page;
> @@ -810,24 +843,49 @@ out:
>  
>  	return page;
>  }
> +EXPORT_SYMBOL(__find_get_page);
> +
> +/**
> + * find_get_page - find and get a page reference
> + * @mapping: the address_space to search
> + * @offset: the page index
> + *
> + * Looks up the page cache slot at @mapping & @offset.  If there is a
> + * page cache page, it is returned with an increased refcount.
> + *
> + * Otherwise, %NULL is returned.
> + */
> +struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
> +{
> +	struct page *page = __find_get_page(mapping, offset);
> +
> +	if (radix_tree_exceptional_entry(page))
> +		page = NULL;
> +	return page;
> +}
>  EXPORT_SYMBOL(find_get_page);
>  
>  /**
> - * find_lock_page - locate, pin and lock a pagecache page
> + * __find_lock_page - locate, pin and lock a pagecache page
>   * @mapping: the address_space to search
>   * @offset: the page index
>   *
> - * Locates the desired pagecache page, locks it, increments its reference
> - * count and returns its address.
> + * Looks up the page cache slot at @mapping & @offset.  If there is a
> + * page cache page, it is returned locked and with an increased
> + * refcount.
> + *
> + * If the slot holds a shadow entry of a previously evicted page, it
> + * is returned.
> + *
> + * Otherwise, %NULL is returned.
>   *
> - * Returns zero if the page was not present. find_lock_page() may sleep.
> + * __find_lock_page() may sleep.
>   */
> -struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
> +struct page *__find_lock_page(struct address_space *mapping, pgoff_t offset)
>  {
>  	struct page *page;
> -
>  repeat:

Unnecessary whitespace change.

> -	page = find_get_page(mapping, offset);
> +	page = __find_get_page(mapping, offset);
>  	if (page && !radix_tree_exception(page)) {
>  		lock_page(page);
>  		/* Has the page been truncated? */

Just as an example, if this was find_get_entry() it would be a lot
clearer that the return value may or may not be a page.

> @@ -840,6 +898,29 @@ repeat:
>  	}
>  	return page;
>  }
> +EXPORT_SYMBOL(__find_lock_page);
> +
> +/**
> + * find_lock_page - locate, pin and lock a pagecache page
> + * @mapping: the address_space to search
> + * @offset: the page index
> + *
> + * Looks up the page cache slot at @mapping & @offset.  If there is a
> + * page cache page, it is returned locked and with an increased
> + * refcount.
> + *
> + * Otherwise, %NULL is returned.
> + *
> + * find_lock_page() may sleep.
> + */
> +struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
> +{
> +	struct page *page = __find_lock_page(mapping, offset);
> +
> +	if (radix_tree_exceptional_entry(page))
> +		page = NULL;
> +	return page;
> +}
>  EXPORT_SYMBOL(find_lock_page);
>  
>  /**
> @@ -848,16 +929,18 @@ EXPORT_SYMBOL(find_lock_page);
>   * @index: the page's index into the mapping
>   * @gfp_mask: page allocation mode
>   *
> - * Locates a page in the pagecache.  If the page is not present, a new page
> - * is allocated using @gfp_mask and is added to the pagecache and to the VM's
> - * LRU list.  The returned page is locked and has its reference count
> - * incremented.
> + * Looks up the page cache slot at @mapping & @offset.  If there is a
> + * page cache page, it is returned locked and with an increased
> + * refcount.
>   *
> - * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic
> - * allocation!
> + * If the page is not present, a new page is allocated using @gfp_mask
> + * and added to the page cache and the VM's LRU list.  The page is
> + * returned locked and with an increased refcount.
>   *
> - * find_or_create_page() returns the desired page's address, or zero on
> - * memory exhaustion.
> + * On memory exhaustion, %NULL is returned.
> + *
> + * find_or_create_page() may sleep, even if @gfp_flags specifies an
> + * atomic allocation!
>   */
>  struct page *find_or_create_page(struct address_space *mapping,
>  		pgoff_t index, gfp_t gfp_mask)
> @@ -890,6 +973,73 @@ repeat:
>  EXPORT_SYMBOL(find_or_create_page);
>  
>  /**
> + * __find_get_pages - gang pagecache lookup
> + * @mapping:	The address_space to search
> + * @start:	The starting page index
> + * @nr_pages:	The maximum number of pages
> + * @pages:	Where the resulting pages are placed
> + *
> + * __find_get_pages() will search for and return a group of up to
> + * @nr_pages pages in the mapping.  The pages are placed at @pages.
> + * __find_get_pages() takes a reference against the returned pages.
> + *
> + * The search returns a group of mapping-contiguous pages with ascending
> + * indexes.  There may be holes in the indices due to not-present pages.
> + *
> + * Any shadow entries of evicted pages are included in the returned
> + * array.
> + *
> + * __find_get_pages() returns the number of pages and shadow entries
> + * which were found.
> + */
> +unsigned __find_get_pages(struct address_space *mapping,
> +			  pgoff_t start, unsigned int nr_pages,
> +			  struct page **pages, pgoff_t *indices)
> +{
> +	void **slot;
> +	unsigned int ret = 0;
> +	struct radix_tree_iter iter;
> +
> +	if (!nr_pages)
> +		return 0;
> +
> +	rcu_read_lock();
> +restart:
> +	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
> +		struct page *page;
> +repeat:
> +		page = radix_tree_deref_slot(slot);
> +		if (unlikely(!page))
> +			continue;
> +		if (radix_tree_exception(page)) {
> +			if (radix_tree_deref_retry(page))
> +				goto restart;
> +			/*
> +			 * Otherwise, we must be storing a swap entry
> +			 * here as an exceptional entry: so return it
> +			 * without attempting to raise page count.
> +			 */
> +			goto export;
> +		}

There is a non-obvious API hazard here that should be called out in
the function description. shmem was the previous gang lookup user and
it knew that there would be swap entries and removed them if necessary
with shmem_deswap_pagevec. It was internal to shmem.c so it could deal
with the complexity. Now that you are making it a generic function it
should clearly explain that exceptional entries can be returned and
pagevec_remove_exceptionals should be used to remove them if necessary
or else split the helper in two to return just pages or both pages and
exceptional entries.

> +		if (!page_cache_get_speculative(page))
> +			goto repeat;
> +
> +		/* Has the page moved? */
> +		if (unlikely(page != *slot)) {
> +			page_cache_release(page);
> +			goto repeat;
> +		}
> +export:
> +		indices[ret] = iter.index;
> +		pages[ret] = page;
> +		if (++ret == nr_pages)
> +			break;
> +	}
> +	rcu_read_unlock();
> +	return ret;
> +}
> +
> +/**
>   * find_get_pages - gang pagecache lookup
>   * @mapping:	The address_space to search
>   * @start:	The starting page index
> diff --git a/mm/mincore.c b/mm/mincore.c
> index da2be56a7b8f..ad411ec86a55 100644
> --- a/mm/mincore.c
> +++ b/mm/mincore.c
> @@ -70,13 +70,21 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
>  	 * any other file mapping (ie. marked !present and faulted in with
>  	 * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
>  	 */
> -	page = find_get_page(mapping, pgoff);
>  #ifdef CONFIG_SWAP
> -	/* shmem/tmpfs may return swap: account for swapcache page too. */
> -	if (radix_tree_exceptional_entry(page)) {
> -		swp_entry_t swap = radix_to_swp_entry(page);
> -		page = find_get_page(swap_address_space(swap), swap.val);
> -	}
> +	if (shmem_mapping(mapping)) {
> +		page = __find_get_page(mapping, pgoff);
> +		/*
> +		 * shmem/tmpfs may return swap: account for swapcache
> +		 * page too.
> +		 */
> +		if (radix_tree_exceptional_entry(page)) {
> +			swp_entry_t swp = radix_to_swp_entry(page);
> +			page = find_get_page(swap_address_space(swp), swp.val);
> +		}
> +	} else
> +		page = find_get_page(mapping, pgoff);
> +#else
> +	page = find_get_page(mapping, pgoff);
>  #endif
>  	if (page) {
>  		present = PageUptodate(page);
> diff --git a/mm/readahead.c b/mm/readahead.c
> index 9eeeeda4ac0e..912c00358112 100644
> --- a/mm/readahead.c
> +++ b/mm/readahead.c
> @@ -179,7 +179,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
>  		rcu_read_lock();
>  		page = radix_tree_lookup(&mapping->page_tree, page_offset);
>  		rcu_read_unlock();
> -		if (page)
> +		if (page && !radix_tree_exceptional_entry(page))
>  			continue;
>  
>  		page = page_cache_alloc_readahead(mapping);

Maybe just think hunk can be split out and shared with btrfs to avoid it
dealing with exceptional entries although I've no good suggestions on what
you'd call it.

> diff --git a/mm/shmem.c b/mm/shmem.c
> index 7c67249d6f28..1f4b65f7b831 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -329,56 +329,6 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
>  }
>  
>  /*
> - * Like find_get_pages, but collecting swap entries as well as pages.
> - */
> -static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
> -					pgoff_t start, unsigned int nr_pages,
> -					struct page **pages, pgoff_t *indices)
> -{
> -	void **slot;
> -	unsigned int ret = 0;
> -	struct radix_tree_iter iter;
> -
> -	if (!nr_pages)
> -		return 0;
> -
> -	rcu_read_lock();
> -restart:
> -	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
> -		struct page *page;
> -repeat:
> -		page = radix_tree_deref_slot(slot);
> -		if (unlikely(!page))
> -			continue;
> -		if (radix_tree_exception(page)) {
> -			if (radix_tree_deref_retry(page))
> -				goto restart;
> -			/*
> -			 * Otherwise, we must be storing a swap entry
> -			 * here as an exceptional entry: so return it
> -			 * without attempting to raise page count.
> -			 */
> -			goto export;
> -		}
> -		if (!page_cache_get_speculative(page))
> -			goto repeat;
> -
> -		/* Has the page moved? */
> -		if (unlikely(page != *slot)) {
> -			page_cache_release(page);
> -			goto repeat;
> -		}
> -export:
> -		indices[ret] = iter.index;
> -		pages[ret] = page;
> -		if (++ret == nr_pages)
> -			break;
> -	}
> -	rcu_read_unlock();
> -	return ret;
> -}
> -
> -/*
>   * Remove swap entry from radix tree, free the swap and its page cache.
>   */
>  static int shmem_free_swap(struct address_space *mapping,
> @@ -396,21 +346,6 @@ static int shmem_free_swap(struct address_space *mapping,
>  }
>  
>  /*
> - * Pagevec may contain swap entries, so shuffle up pages before releasing.
> - */
> -static void shmem_deswap_pagevec(struct pagevec *pvec)
> -{
> -	int i, j;
> -
> -	for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
> -		struct page *page = pvec->pages[i];
> -		if (!radix_tree_exceptional_entry(page))
> -			pvec->pages[j++] = page;
> -	}
> -	pvec->nr = j;
> -}
> -
> -/*
>   * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
>   */
>  void shmem_unlock_mapping(struct address_space *mapping)
> @@ -428,12 +363,12 @@ void shmem_unlock_mapping(struct address_space *mapping)
>  		 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
>  		 * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
>  		 */
> -		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
> +		pvec.nr = __find_get_pages(mapping, index,
>  					PAGEVEC_SIZE, pvec.pages, indices);
>  		if (!pvec.nr)
>  			break;
>  		index = indices[pvec.nr - 1] + 1;
> -		shmem_deswap_pagevec(&pvec);
> +		pagevec_remove_exceptionals(&pvec);
>  		check_move_unevictable_pages(pvec.pages, pvec.nr);
>  		pagevec_release(&pvec);
>  		cond_resched();
> @@ -465,9 +400,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
>  	pagevec_init(&pvec, 0);
>  	index = start;
>  	while (index < end) {
> -		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
> -				min(end - index, (pgoff_t)PAGEVEC_SIZE),
> -							pvec.pages, indices);
> +		pvec.nr = __find_get_pages(mapping, index,
> +			min(end - index, (pgoff_t)PAGEVEC_SIZE),
> +			pvec.pages, indices);
>  		if (!pvec.nr)
>  			break;
>  		mem_cgroup_uncharge_start();
> @@ -496,7 +431,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
>  			}
>  			unlock_page(page);
>  		}
> -		shmem_deswap_pagevec(&pvec);
> +		pagevec_remove_exceptionals(&pvec);
>  		pagevec_release(&pvec);
>  		mem_cgroup_uncharge_end();
>  		cond_resched();
> @@ -534,9 +469,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
>  	index = start;
>  	for ( ; ; ) {
>  		cond_resched();
> -		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
> +
> +		pvec.nr = __find_get_pages(mapping, index,
>  				min(end - index, (pgoff_t)PAGEVEC_SIZE),
> -							pvec.pages, indices);
> +				pvec.pages, indices);
>  		if (!pvec.nr) {
>  			if (index == start || unfalloc)
>  				break;
> @@ -544,7 +480,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
>  			continue;
>  		}
>  		if ((index == start || unfalloc) && indices[0] >= end) {
> -			shmem_deswap_pagevec(&pvec);
> +			pagevec_remove_exceptionals(&pvec);
>  			pagevec_release(&pvec);
>  			break;
>  		}
> @@ -573,7 +509,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
>  			}
>  			unlock_page(page);
>  		}
> -		shmem_deswap_pagevec(&pvec);
> +		pagevec_remove_exceptionals(&pvec);
>  		pagevec_release(&pvec);
>  		mem_cgroup_uncharge_end();
>  		index++;
> @@ -1081,7 +1017,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
>  		return -EFBIG;
>  repeat:
>  	swap.val = 0;
> -	page = find_lock_page(mapping, index);
> +	page = __find_lock_page(mapping, index);
>  	if (radix_tree_exceptional_entry(page)) {
>  		swap = radix_to_swp_entry(page);
>  		page = NULL;
> @@ -1418,6 +1354,11 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
>  	return inode;
>  }
>  
> +bool shmem_mapping(struct address_space *mapping)
> +{
> +	return mapping->backing_dev_info == &shmem_backing_dev_info;
> +}
> +
>  #ifdef CONFIG_TMPFS
>  static const struct inode_operations shmem_symlink_inode_operations;
>  static const struct inode_operations shmem_short_symlink_operations;
> @@ -1730,7 +1671,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
>  	pagevec_init(&pvec, 0);
>  	pvec.nr = 1;		/* start small: we may be there already */
>  	while (!done) {
> -		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
> +		pvec.nr = __find_get_pages(mapping, index,
>  					pvec.nr, pvec.pages, indices);
>  		if (!pvec.nr) {
>  			if (whence == SEEK_DATA)
> @@ -1757,7 +1698,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
>  				break;
>  			}
>  		}
> -		shmem_deswap_pagevec(&pvec);
> +		pagevec_remove_exceptionals(&pvec);
>  		pagevec_release(&pvec);
>  		pvec.nr = PAGEVEC_SIZE;
>  		cond_resched();
> diff --git a/mm/swap.c b/mm/swap.c
> index 759c3caf44bd..f624e5b4b724 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -894,6 +894,53 @@ EXPORT_SYMBOL(__pagevec_lru_add);
>  
>  /**
>   * pagevec_lookup - gang pagecache lookup
> + * @pvec:	Where the resulting entries are placed
> + * @mapping:	The address_space to search
> + * @start:	The starting entry index
> + * @nr_pages:	The maximum number of entries
> + *
> + * pagevec_lookup() will search for and return a group of up to
> + * @nr_pages pages and shadow entries in the mapping.  All entries are
> + * placed in @pvec.  pagevec_lookup() takes a reference against actual
> + * pages in @pvec.
> + *
> + * The search returns a group of mapping-contiguous entries with
> + * ascending indexes.  There may be holes in the indices due to
> + * not-present entries.
> + *
> + * pagevec_lookup() returns the number of entries which were found.
> + */
> +unsigned __pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
> +			  pgoff_t start, unsigned nr_pages, pgoff_t *indices)
> +{
> +	pvec->nr = __find_get_pages(mapping, start, nr_pages,
> +				    pvec->pages, indices);
> +	return pagevec_count(pvec);
> +}
> +
> +/**
> + * pagevec_remove_exceptionals - pagevec exceptionals pruning
> + * @pvec:	The pagevec to prune
> + *
> + * __pagevec_lookup() fills both pages and exceptional radix tree
> + * entries into the pagevec.  This function prunes all exceptionals
> + * from @pvec without leaving holes, so that it can be passed on to
> + * other pagevec operations.
> + */
> +void pagevec_remove_exceptionals(struct pagevec *pvec)
> +{
> +	int i, j;
> +
> +	for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
> +		struct page *page = pvec->pages[i];
> +		if (!radix_tree_exceptional_entry(page))
> +			pvec->pages[j++] = page;
> +	}
> +	pvec->nr = j;
> +}
> +
> +/**
> + * pagevec_lookup - gang pagecache lookup
>   * @pvec:	Where the resulting pages are placed
>   * @mapping:	The address_space to search
>   * @start:	The starting page index
> diff --git a/mm/truncate.c b/mm/truncate.c
> index 353b683afd6e..b0f4d4bee8ab 100644
> --- a/mm/truncate.c
> +++ b/mm/truncate.c
> @@ -22,6 +22,22 @@
>  #include <linux/cleancache.h>
>  #include "internal.h"
>  
> +static void clear_exceptional_entry(struct address_space *mapping,
> +				    pgoff_t index, void *entry)
> +{
> +	/* Handled by shmem itself */
> +	if (shmem_mapping(mapping))
> +		return;
> +
> +	spin_lock_irq(&mapping->tree_lock);
> +	/*
> +	 * Regular page slots are stabilized by the page lock even
> +	 * without the tree itself locked.  These unlocked entries
> +	 * need verification under the tree lock.
> +	 */
> +	radix_tree_delete_item(&mapping->page_tree, index, entry);
> +	spin_unlock_irq(&mapping->tree_lock);
> +}
>  
>  /**
>   * do_invalidatepage - invalidate part or all of a page
> @@ -208,6 +224,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
>  	unsigned int	partial_start;	/* inclusive */
>  	unsigned int	partial_end;	/* exclusive */
>  	struct pagevec	pvec;
> +	pgoff_t		indices[PAGEVEC_SIZE];
>  	pgoff_t		index;
>  	int		i;
>  
> @@ -238,17 +255,23 @@ void truncate_inode_pages_range(struct address_space *mapping,
>  
>  	pagevec_init(&pvec, 0);
>  	index = start;
> -	while (index < end && pagevec_lookup(&pvec, mapping, index,
> -			min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
> +	while (index < end && __pagevec_lookup(&pvec, mapping, index,
> +			min(end - index, (pgoff_t)PAGEVEC_SIZE),
> +			indices)) {
>  		mem_cgroup_uncharge_start();
>  		for (i = 0; i < pagevec_count(&pvec); i++) {
>  			struct page *page = pvec.pages[i];
>  
>  			/* We rely upon deletion not changing page->index */
> -			index = page->index;
> +			index = indices[i];
>  			if (index >= end)
>  				break;
>  
> +			if (radix_tree_exceptional_entry(page)) {
> +				clear_exceptional_entry(mapping, index, page);
> +				continue;
> +			}
> +
>  			if (!trylock_page(page))
>  				continue;
>  			WARN_ON(page->index != index);
> @@ -259,6 +282,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
>  			truncate_inode_page(mapping, page);
>  			unlock_page(page);
>  		}
> +		pagevec_remove_exceptionals(&pvec);
>  		pagevec_release(&pvec);
>  		mem_cgroup_uncharge_end();
>  		cond_resched();
> @@ -307,14 +331,15 @@ void truncate_inode_pages_range(struct address_space *mapping,
>  	index = start;
>  	for ( ; ; ) {
>  		cond_resched();
> -		if (!pagevec_lookup(&pvec, mapping, index,
> -			min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
> +		if (!__pagevec_lookup(&pvec, mapping, index,
> +			min(end - index, (pgoff_t)PAGEVEC_SIZE),
> +			indices)) {
>  			if (index == start)
>  				break;
>  			index = start;
>  			continue;
>  		}
> -		if (index == start && pvec.pages[0]->index >= end) {
> +		if (index == start && indices[0] >= end) {
>  			pagevec_release(&pvec);
>  			break;
>  		}
> @@ -323,16 +348,22 @@ void truncate_inode_pages_range(struct address_space *mapping,
>  			struct page *page = pvec.pages[i];
>  
>  			/* We rely upon deletion not changing page->index */
> -			index = page->index;
> +			index = indices[i];
>  			if (index >= end)
>  				break;
>  
> +			if (radix_tree_exceptional_entry(page)) {
> +				clear_exceptional_entry(mapping, index, page);
> +				continue;
> +			}
> +
>  			lock_page(page);
>  			WARN_ON(page->index != index);
>  			wait_on_page_writeback(page);
>  			truncate_inode_page(mapping, page);
>  			unlock_page(page);
>  		}
> +		pagevec_remove_exceptionals(&pvec);
>  		pagevec_release(&pvec);
>  		mem_cgroup_uncharge_end();
>  		index++;
> @@ -375,6 +406,7 @@ EXPORT_SYMBOL(truncate_inode_pages);
>  unsigned long invalidate_mapping_pages(struct address_space *mapping,
>  		pgoff_t start, pgoff_t end)
>  {
> +	pgoff_t indices[PAGEVEC_SIZE];
>  	struct pagevec pvec;
>  	pgoff_t index = start;
>  	unsigned long ret;
> @@ -390,17 +422,23 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
>  	 */
>  
>  	pagevec_init(&pvec, 0);
> -	while (index <= end && pagevec_lookup(&pvec, mapping, index,
> -			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
> +	while (index <= end && __pagevec_lookup(&pvec, mapping, index,
> +			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
> +			indices)) {
>  		mem_cgroup_uncharge_start();
>  		for (i = 0; i < pagevec_count(&pvec); i++) {
>  			struct page *page = pvec.pages[i];
>  
>  			/* We rely upon deletion not changing page->index */
> -			index = page->index;
> +			index = indices[i];
>  			if (index > end)
>  				break;
>  
> +			if (radix_tree_exceptional_entry(page)) {
> +				clear_exceptional_entry(mapping, index, page);
> +				continue;
> +			}
> +
>  			if (!trylock_page(page))
>  				continue;
>  			WARN_ON(page->index != index);
> @@ -414,6 +452,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
>  				deactivate_page(page);
>  			count += ret;
>  		}
> +		pagevec_remove_exceptionals(&pvec);
>  		pagevec_release(&pvec);
>  		mem_cgroup_uncharge_end();
>  		cond_resched();
> @@ -481,6 +520,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page)
>  int invalidate_inode_pages2_range(struct address_space *mapping,
>  				  pgoff_t start, pgoff_t end)
>  {
> +	pgoff_t indices[PAGEVEC_SIZE];
>  	struct pagevec pvec;
>  	pgoff_t index;
>  	int i;
> @@ -491,17 +531,23 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
>  	cleancache_invalidate_inode(mapping);
>  	pagevec_init(&pvec, 0);
>  	index = start;
> -	while (index <= end && pagevec_lookup(&pvec, mapping, index,
> -			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
> +	while (index <= end && __pagevec_lookup(&pvec, mapping, index,
> +			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
> +			indices)) {
>  		mem_cgroup_uncharge_start();
>  		for (i = 0; i < pagevec_count(&pvec); i++) {
>  			struct page *page = pvec.pages[i];
>  
>  			/* We rely upon deletion not changing page->index */
> -			index = page->index;
> +			index = indices[i];
>  			if (index > end)
>  				break;
>  
> +			if (radix_tree_exceptional_entry(page)) {
> +				clear_exceptional_entry(mapping, index, page);
> +				continue;
> +			}
> +
>  			lock_page(page);
>  			WARN_ON(page->index != index);
>  			if (page->mapping != mapping) {
> @@ -539,6 +585,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
>  				ret = ret2;
>  			unlock_page(page);
>  		}
> +		pagevec_remove_exceptionals(&pvec);
>  		pagevec_release(&pvec);
>  		mem_cgroup_uncharge_end();
>  		cond_resched();

-- 
Mel Gorman
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux