Re: [PATCH 8/9] mm: workingset: move shadow entry tracking to radix tree exceptional tracking

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu 17-11-16 14:32:11, Johannes Weiner wrote:
> Currently, we track the shadow entries in the page cache in the upper
> bits of the radix_tree_node->count, behind the back of the radix tree
> implementation. Because the radix tree code has no awareness of them,
> we rely on random subtleties throughout the implementation (such as
> the node->count != 1 check in the shrinking code, which is meant to
> exclude multi-entry nodes but also happens to skip nodes with only one
> shadow entry, as that's accounted in the upper bits). This is error
> prone and has, in fact, caused the bug fixed in d3798ae8c6f3 ("mm:
> filemap: don't plant shadow entries without radix tree node").
> 
> To remove these subtleties, this patch moves shadow entry tracking
> from the upper bits of node->count to the existing counter for
> exceptional entries. node->count goes back to being a simple counter
> of valid entries in the tree node and can be shrunk to a single byte.
> 
> This vastly simplifies the page cache code. All accounting happens
> natively inside the radix tree implementation, and maintaining the LRU
> linkage of shadow nodes is consolidated into a single function in the
> workingset code that is called for leaf nodes affected by a change in
> the page cache tree.
> 
> This also removes the last user of the __radix_delete_node() return
> value. Eliminate it.

Looks good. You can add:

Reviewed-by: Jan Kara <jack@xxxxxxx>

								Honza

> 
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
> ---
>  include/linux/radix-tree.h |  8 ++-----
>  include/linux/swap.h       | 34 +---------------------------
>  lib/radix-tree.c           | 25 +++++----------------
>  mm/filemap.c               | 54 +++++---------------------------------------
>  mm/truncate.c              | 21 +++--------------
>  mm/workingset.c            | 56 +++++++++++++++++++++++++++++++++++-----------
>  6 files changed, 60 insertions(+), 138 deletions(-)
> 
> diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
> index 15c972ea9510..744486057e9e 100644
> --- a/include/linux/radix-tree.h
> +++ b/include/linux/radix-tree.h
> @@ -80,14 +80,10 @@ static inline bool radix_tree_is_internal_node(void *ptr)
>  #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
>  					  RADIX_TREE_MAP_SHIFT))
>  
> -/* Internally used bits of node->count */
> -#define RADIX_TREE_COUNT_SHIFT	(RADIX_TREE_MAP_SHIFT + 1)
> -#define RADIX_TREE_COUNT_MASK	((1UL << RADIX_TREE_COUNT_SHIFT) - 1)
> -
>  struct radix_tree_node {
>  	unsigned char	shift;		/* Bits remaining in each slot */
>  	unsigned char	offset;		/* Slot offset in parent */
> -	unsigned int	count;		/* Total entry count */
> +	unsigned char	count;		/* Total entry count */
>  	unsigned char	exceptional;	/* Exceptional entry count */
>  	union {
>  		struct {
> @@ -270,7 +266,7 @@ void __radix_tree_replace(struct radix_tree_root *root,
>  			  radix_tree_update_node_t update_node, void *private);
>  void radix_tree_replace_slot(struct radix_tree_root *root,
>  			     void **slot, void *item);
> -bool __radix_tree_delete_node(struct radix_tree_root *root,
> +void __radix_tree_delete_node(struct radix_tree_root *root,
>  			      struct radix_tree_node *node);
>  void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
>  void *radix_tree_delete(struct radix_tree_root *, unsigned long);
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index a56523cefb9b..09b212d37f1d 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -246,39 +246,7 @@ struct swap_info_struct {
>  void *workingset_eviction(struct address_space *mapping, struct page *page);
>  bool workingset_refault(void *shadow);
>  void workingset_activation(struct page *page);
> -extern struct list_lru workingset_shadow_nodes;
> -
> -static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
> -{
> -	return node->count & RADIX_TREE_COUNT_MASK;
> -}
> -
> -static inline void workingset_node_pages_inc(struct radix_tree_node *node)
> -{
> -	node->count++;
> -}
> -
> -static inline void workingset_node_pages_dec(struct radix_tree_node *node)
> -{
> -	VM_WARN_ON_ONCE(!workingset_node_pages(node));
> -	node->count--;
> -}
> -
> -static inline unsigned int workingset_node_shadows(struct radix_tree_node *node)
> -{
> -	return node->count >> RADIX_TREE_COUNT_SHIFT;
> -}
> -
> -static inline void workingset_node_shadows_inc(struct radix_tree_node *node)
> -{
> -	node->count += 1U << RADIX_TREE_COUNT_SHIFT;
> -}
> -
> -static inline void workingset_node_shadows_dec(struct radix_tree_node *node)
> -{
> -	VM_WARN_ON_ONCE(!workingset_node_shadows(node));
> -	node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
> -}
> +void workingset_update_node(struct radix_tree_node *node, void *private);
>  
>  /* linux/mm/page_alloc.c */
>  extern unsigned long totalram_pages;
> diff --git a/lib/radix-tree.c b/lib/radix-tree.c
> index df4ff18dd63c..9dbfaac05e6c 100644
> --- a/lib/radix-tree.c
> +++ b/lib/radix-tree.c
> @@ -541,12 +541,10 @@ static int radix_tree_extend(struct radix_tree_root *root,
>   *	radix_tree_shrink    -    shrink radix tree to minimum height
>   *	@root		radix tree root
>   */
> -static inline bool radix_tree_shrink(struct radix_tree_root *root,
> +static inline void radix_tree_shrink(struct radix_tree_root *root,
>  				     radix_tree_update_node_t update_node,
>  				     void *private)
>  {
> -	bool shrunk = false;
> -
>  	for (;;) {
>  		struct radix_tree_node *node = root->rnode;
>  		struct radix_tree_node *child;
> @@ -606,26 +604,20 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root,
>  		}
>  
>  		radix_tree_node_free(node);
> -		shrunk = true;
>  	}
> -
> -	return shrunk;
>  }
>  
> -static bool delete_node(struct radix_tree_root *root,
> +static void delete_node(struct radix_tree_root *root,
>  			struct radix_tree_node *node,
>  			radix_tree_update_node_t update_node, void *private)
>  {
> -	bool deleted = false;
> -
>  	do {
>  		struct radix_tree_node *parent;
>  
>  		if (node->count) {
>  			if (node == entry_to_node(root->rnode))
> -				deleted |= radix_tree_shrink(root, update_node,
> -							     private);
> -			return deleted;
> +				radix_tree_shrink(root, update_node, private);
> +			return;
>  		}
>  
>  		parent = node->parent;
> @@ -638,12 +630,9 @@ static bool delete_node(struct radix_tree_root *root,
>  		}
>  
>  		radix_tree_node_free(node);
> -		deleted = true;
>  
>  		node = parent;
>  	} while (node);
> -
> -	return deleted;
>  }
>  
>  /**
> @@ -1595,13 +1584,11 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
>   *	After clearing the slot at @index in @node from radix tree
>   *	rooted at @root, call this function to attempt freeing the
>   *	node and shrinking the tree.
> - *
> - *	Returns %true if @node was freed, %false otherwise.
>   */
> -bool __radix_tree_delete_node(struct radix_tree_root *root,
> +void __radix_tree_delete_node(struct radix_tree_root *root,
>  			      struct radix_tree_node *node)
>  {
> -	return delete_node(root, node, NULL, NULL);
> +	delete_node(root, node, NULL, NULL);
>  }
>  
>  static inline void delete_sibling_entries(struct radix_tree_node *node,
> diff --git a/mm/filemap.c b/mm/filemap.c
> index eb463156f29a..7d92032277ff 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -132,37 +132,19 @@ static int page_cache_tree_insert(struct address_space *mapping,
>  		if (!dax_mapping(mapping)) {
>  			if (shadowp)
>  				*shadowp = p;
> -			if (node)
> -				workingset_node_shadows_dec(node);
>  		} else {
>  			/* DAX can replace empty locked entry with a hole */
>  			WARN_ON_ONCE(p !=
>  				(void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
>  					 RADIX_DAX_ENTRY_LOCK));
> -			/* DAX accounts exceptional entries as normal pages */
> -			if (node)
> -				workingset_node_pages_dec(node);
>  			/* Wakeup waiters for exceptional entry lock */
>  			dax_wake_mapping_entry_waiter(mapping, page->index,
>  						      false);
>  		}
>  	}
> -	radix_tree_replace_slot(&mapping->page_tree, slot, page);
> +	__radix_tree_replace(&mapping->page_tree, node, slot, page,
> +			     workingset_update_node, mapping);
>  	mapping->nrpages++;
> -	if (node) {
> -		workingset_node_pages_inc(node);
> -		/*
> -		 * Don't track node that contains actual pages.
> -		 *
> -		 * Avoid acquiring the list_lru lock if already
> -		 * untracked.  The list_empty() test is safe as
> -		 * node->private_list is protected by
> -		 * mapping->tree_lock.
> -		 */
> -		if (!list_empty(&node->private_list))
> -			list_lru_del(&workingset_shadow_nodes,
> -				     &node->private_list);
> -	}
>  	return 0;
>  }
>  
> @@ -182,8 +164,6 @@ static void page_cache_tree_delete(struct address_space *mapping,
>  		__radix_tree_lookup(&mapping->page_tree, page->index + i,
>  				    &node, &slot);
>  
> -		radix_tree_clear_tags(&mapping->page_tree, node, slot);
> -
>  		if (!node) {
>  			VM_BUG_ON_PAGE(nr != 1, page);
>  			/*
> @@ -193,33 +173,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
>  			shadow = NULL;
>  		}
>  
> -		radix_tree_replace_slot(&mapping->page_tree, slot, shadow);
> -
> -		if (!node)
> -			break;
> -
> -		workingset_node_pages_dec(node);
> -		if (shadow)
> -			workingset_node_shadows_inc(node);
> -		else
> -			if (__radix_tree_delete_node(&mapping->page_tree, node))
> -				continue;
> -
> -		/*
> -		 * Track node that only contains shadow entries. DAX mappings
> -		 * contain no shadow entries and may contain other exceptional
> -		 * entries so skip those.
> -		 *
> -		 * Avoid acquiring the list_lru lock if already tracked.
> -		 * The list_empty() test is safe as node->private_list is
> -		 * protected by mapping->tree_lock.
> -		 */
> -		if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
> -				list_empty(&node->private_list)) {
> -			node->private_data = mapping;
> -			list_lru_add(&workingset_shadow_nodes,
> -					&node->private_list);
> -		}
> +		radix_tree_clear_tags(&mapping->page_tree, node, slot);
> +		__radix_tree_replace(&mapping->page_tree, node, slot, shadow,
> +				     workingset_update_node, mapping);
>  	}
>  
>  	if (shadow) {
> diff --git a/mm/truncate.c b/mm/truncate.c
> index 6ae44571d4c7..7e5464a611db 100644
> --- a/mm/truncate.c
> +++ b/mm/truncate.c
> @@ -44,28 +44,13 @@ static void clear_exceptional_entry(struct address_space *mapping,
>  	 * without the tree itself locked.  These unlocked entries
>  	 * need verification under the tree lock.
>  	 */
> -	if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
> -				&slot))
> +	if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
>  		goto unlock;
>  	if (*slot != entry)
>  		goto unlock;
> -	radix_tree_replace_slot(&mapping->page_tree, slot, NULL);
> +	__radix_tree_replace(&mapping->page_tree, node, slot, NULL,
> +			     workingset_update_node, mapping);
>  	mapping->nrexceptional--;
> -	if (!node)
> -		goto unlock;
> -	workingset_node_shadows_dec(node);
> -	/*
> -	 * Don't track node without shadow entries.
> -	 *
> -	 * Avoid acquiring the list_lru lock if already untracked.
> -	 * The list_empty() test is safe as node->private_list is
> -	 * protected by mapping->tree_lock.
> -	 */
> -	if (!workingset_node_shadows(node) &&
> -	    !list_empty(&node->private_list))
> -		list_lru_del(&workingset_shadow_nodes,
> -				&node->private_list);
> -	__radix_tree_delete_node(&mapping->page_tree, node);
>  unlock:
>  	spin_unlock_irq(&mapping->tree_lock);
>  }
> diff --git a/mm/workingset.c b/mm/workingset.c
> index 3cfc61d84a52..111e06559892 100644
> --- a/mm/workingset.c
> +++ b/mm/workingset.c
> @@ -10,6 +10,7 @@
>  #include <linux/atomic.h>
>  #include <linux/module.h>
>  #include <linux/swap.h>
> +#include <linux/dax.h>
>  #include <linux/fs.h>
>  #include <linux/mm.h>
>  
> @@ -334,18 +335,45 @@ void workingset_activation(struct page *page)
>   * point where they would still be useful.
>   */
>  
> -struct list_lru workingset_shadow_nodes;
> +static struct list_lru shadow_nodes;
> +
> +void workingset_update_node(struct radix_tree_node *node, void *private)
> +{
> +	struct address_space *mapping = private;
> +
> +	/* Only regular page cache has shadow entries */
> +	if (dax_mapping(mapping) || shmem_mapping(mapping))
> +		return;
> +
> +	/*
> +	 * Track non-empty nodes that contain only shadow entries;
> +	 * unlink those that contain pages or are being freed.
> +	 *
> +	 * Avoid acquiring the list_lru lock when the nodes are
> +	 * already where they should be. The list_empty() test is safe
> +	 * as node->private_list is protected by &mapping->tree_lock.
> +	 */
> +	if (node->count && node->count == node->exceptional) {
> +		if (list_empty(&node->private_list)) {
> +			node->private_data = mapping;
> +			list_lru_add(&shadow_nodes, &node->private_list);
> +		}
> +	} else {
> +		if (!list_empty(&node->private_list))
> +			list_lru_del(&shadow_nodes, &node->private_list);
> +	}
> +}
>  
>  static unsigned long count_shadow_nodes(struct shrinker *shrinker,
>  					struct shrink_control *sc)
>  {
> -	unsigned long shadow_nodes;
>  	unsigned long max_nodes;
> +	unsigned long nodes;
>  	unsigned long pages;
>  
>  	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
>  	local_irq_disable();
> -	shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
> +	nodes = list_lru_shrink_count(&shadow_nodes, sc);
>  	local_irq_enable();
>  
>  	if (memcg_kmem_enabled()) {
> @@ -372,10 +400,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
>  	 */
>  	max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3);
>  
> -	if (shadow_nodes <= max_nodes)
> +	if (nodes <= max_nodes)
>  		return 0;
>  
> -	return shadow_nodes - max_nodes;
> +	return nodes - max_nodes;
>  }
>  
>  static enum lru_status shadow_lru_isolate(struct list_head *item,
> @@ -418,22 +446,25 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
>  	 * no pages, so we expect to be able to remove them all and
>  	 * delete and free the empty node afterwards.
>  	 */
> -	if (WARN_ON_ONCE(!workingset_node_shadows(node)))
> +	if (WARN_ON_ONCE(!node->exceptional))
>  		goto out_invalid;
> -	if (WARN_ON_ONCE(workingset_node_pages(node)))
> +	if (WARN_ON_ONCE(node->count != node->exceptional))
>  		goto out_invalid;
>  	for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
>  		if (node->slots[i]) {
>  			if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i])))
>  				goto out_invalid;
> +			if (WARN_ON_ONCE(!node->exceptional))
> +				goto out_invalid;
>  			if (WARN_ON_ONCE(!mapping->nrexceptional))
>  				goto out_invalid;
>  			node->slots[i] = NULL;
> -			workingset_node_shadows_dec(node);
> +			node->exceptional--;
> +			node->count--;
>  			mapping->nrexceptional--;
>  		}
>  	}
> -	if (WARN_ON_ONCE(workingset_node_shadows(node)))
> +	if (WARN_ON_ONCE(node->exceptional))
>  		goto out_invalid;
>  	inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
>  	__radix_tree_delete_node(&mapping->page_tree, node);
> @@ -456,8 +487,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
>  
>  	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
>  	local_irq_disable();
> -	ret =  list_lru_shrink_walk(&workingset_shadow_nodes, sc,
> -				    shadow_lru_isolate, NULL);
> +	ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL);
>  	local_irq_enable();
>  	return ret;
>  }
> @@ -496,7 +526,7 @@ static int __init workingset_init(void)
>  	pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
>  	       timestamp_bits, max_order, bucket_order);
>  
> -	ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
> +	ret = list_lru_init_key(&shadow_nodes, &shadow_nodes_key);
>  	if (ret)
>  		goto err;
>  	ret = register_shrinker(&workingset_shadow_shrinker);
> @@ -504,7 +534,7 @@ static int __init workingset_init(void)
>  		goto err_list_lru;
>  	return 0;
>  err_list_lru:
> -	list_lru_destroy(&workingset_shadow_nodes);
> +	list_lru_destroy(&shadow_nodes);
>  err:
>  	return ret;
>  }
> -- 
> 2.10.2
> 
-- 
Jan Kara <jack@xxxxxxxx>
SUSE Labs, CR

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>



[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]