On Fri, Jan 10, 2014 at 01:10:43PM -0500, Johannes Weiner wrote: > Previously, page cache radix tree nodes were freed after reclaim > emptied out their page pointers. But now reclaim stores shadow > entries in their place, which are only reclaimed when the inodes > themselves are reclaimed. This is problematic for bigger files that > are still in use after they have a significant amount of their cache > reclaimed, without any of those pages actually refaulting. The shadow > entries will just sit there and waste memory. In the worst case, the > shadow entries will accumulate until the machine runs out of memory. > > To get this under control, the VM will track radix tree nodes > exclusively containing shadow entries on a per-NUMA node list. > Per-NUMA rather than global because we expect the radix tree nodes > themselves to be allocated node-locally and we want to reduce > cross-node references of otherwise independent cache workloads. A > simple shrinker will then reclaim these nodes on memory pressure. > > A few things need to be stored in the radix tree node to implement the > shadow node LRU and allow tree deletions coming from the list: > > 1. There is no index available that would describe the reverse path > from the node up to the tree root, which is needed to perform a > deletion. To solve this, encode in each node its offset inside the > parent. This can be stored in the unused upper bits of the same > member that stores the node's height at no extra space cost. > > 2. The number of shadow entries needs to be counted in addition to the > regular entries, to quickly detect when the node is ready to go to > the shadow node LRU list. The current entry count is an unsigned > int but the maximum number of entries is 64, so a shadow counter > can easily be stored in the unused upper bits. > > 3. Tree modification needs tree lock and tree root, which are located > in the address space, so store an address_space backpointer in the > node. The parent pointer of the node is in a union with the 2-word > rcu_head, so the backpointer comes at no extra cost as well. > > 4. The node needs to be linked to an LRU list, which requires a list > head inside the node. This does increase the size of the node, but > it does not change the number of objects that fit into a slab page. > > Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx> > --- > include/linux/list_lru.h | 2 + > include/linux/mmzone.h | 1 + > include/linux/radix-tree.h | 32 +++++++++--- > include/linux/swap.h | 1 + > lib/radix-tree.c | 36 ++++++++------ > mm/filemap.c | 77 +++++++++++++++++++++++------ > mm/list_lru.c | 8 +++ > mm/truncate.c | 20 +++++++- > mm/vmstat.c | 1 + > mm/workingset.c | 121 +++++++++++++++++++++++++++++++++++++++++++++ > 10 files changed, 259 insertions(+), 40 deletions(-) > > diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h > index 3ce541753c88..b02fc233eadd 100644 > --- a/include/linux/list_lru.h > +++ b/include/linux/list_lru.h > @@ -13,6 +13,8 @@ > /* list_lru_walk_cb has to always return one of those */ > enum lru_status { > LRU_REMOVED, /* item removed from list */ > + LRU_REMOVED_RETRY, /* item removed, but lock has been > + dropped and reacquired */ > LRU_ROTATE, /* item referenced, give another pass */ > LRU_SKIP, /* item cannot be locked, skip */ > LRU_RETRY, /* item not freeable. May drop the lock > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index 118ba9f51e86..8cac5a7ef7a7 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -144,6 +144,7 @@ enum zone_stat_item { > #endif > WORKINGSET_REFAULT, > WORKINGSET_ACTIVATE, > + WORKINGSET_NODERECLAIM, > NR_ANON_TRANSPARENT_HUGEPAGES, > NR_FREE_CMA_PAGES, > NR_VM_ZONE_STAT_ITEMS }; > diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h > index 13636c40bc42..33170dbd9db4 100644 > --- a/include/linux/radix-tree.h > +++ b/include/linux/radix-tree.h > @@ -72,21 +72,37 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) > #define RADIX_TREE_TAG_LONGS \ > ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) > > +#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) > +#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ > + RADIX_TREE_MAP_SHIFT)) > + > +/* Height component in node->path */ > +#define RADIX_TREE_HEIGHT_SHIFT (RADIX_TREE_MAX_PATH + 1) > +#define RADIX_TREE_HEIGHT_MASK ((1UL << RADIX_TREE_HEIGHT_SHIFT) - 1) > + > +/* Internally used bits of node->count */ > +#define RADIX_TREE_COUNT_SHIFT (RADIX_TREE_MAP_SHIFT + 1) > +#define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1) > + > struct radix_tree_node { > - unsigned int height; /* Height from the bottom */ > + unsigned int path; /* Offset in parent & height from the bottom */ > unsigned int count; > union { > - struct radix_tree_node *parent; /* Used when ascending tree */ > - struct rcu_head rcu_head; /* Used when freeing node */ > + struct { > + /* Used when ascending tree */ > + struct radix_tree_node *parent; > + /* For tree user */ > + void *private_data; > + }; > + /* Used when freeing node */ > + struct rcu_head rcu_head; > }; > + /* For tree user */ > + struct list_head private_list; > void __rcu *slots[RADIX_TREE_MAP_SIZE]; > unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; > }; > > -#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) > -#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ > - RADIX_TREE_MAP_SHIFT)) > - > /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */ > struct radix_tree_root { > unsigned int height; > @@ -251,7 +267,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, > struct radix_tree_node **nodep, void ***slotp); > void *radix_tree_lookup(struct radix_tree_root *, unsigned long); > void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); > -bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index, > +bool __radix_tree_delete_node(struct radix_tree_root *root, > struct radix_tree_node *node); > void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); > void *radix_tree_delete(struct radix_tree_root *, unsigned long); > diff --git a/include/linux/swap.h b/include/linux/swap.h > index b83cf61403ed..102e37bc82d5 100644 > --- a/include/linux/swap.h > +++ b/include/linux/swap.h > @@ -264,6 +264,7 @@ struct swap_list_t { > void *workingset_eviction(struct address_space *mapping, struct page *page); > bool workingset_refault(void *shadow); > void workingset_activation(struct page *page); > +extern struct list_lru workingset_shadow_nodes; > > /* linux/mm/page_alloc.c */ > extern unsigned long totalram_pages; > diff --git a/lib/radix-tree.c b/lib/radix-tree.c > index e601c56a43d0..0a0895371447 100644 > --- a/lib/radix-tree.c > +++ b/lib/radix-tree.c > @@ -342,7 +342,8 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) > > /* Increase the height. */ > newheight = root->height+1; > - node->height = newheight; > + BUG_ON(newheight & ~RADIX_TREE_HEIGHT_MASK); > + node->path = newheight; Nitpick: It would be better to add some accessor for path and offset for readability and future enhance? > node->count = 1; > node->parent = NULL; > slot = root->rnode; > @@ -400,11 +401,12 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, > /* Have to add a child node. */ > if (!(slot = radix_tree_node_alloc(root))) > return -ENOMEM; > - slot->height = height; > + slot->path = height; > slot->parent = node; > if (node) { > rcu_assign_pointer(node->slots[offset], slot); > node->count++; > + slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT; > } else > rcu_assign_pointer(root->rnode, ptr_to_indirect(slot)); > } > @@ -496,7 +498,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, > } > node = indirect_to_ptr(node); > > - height = node->height; > + height = node->path & RADIX_TREE_HEIGHT_MASK; > if (index > radix_tree_maxindex(height)) > return NULL; > > @@ -702,7 +704,7 @@ int radix_tree_tag_get(struct radix_tree_root *root, > return (index == 0); > node = indirect_to_ptr(node); > > - height = node->height; > + height = node->path & RADIX_TREE_HEIGHT_MASK; > if (index > radix_tree_maxindex(height)) > return 0; > > @@ -739,7 +741,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, > { > unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK; > struct radix_tree_node *rnode, *node; > - unsigned long index, offset; > + unsigned long index, offset, height; > > if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag)) > return NULL; > @@ -770,7 +772,8 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, > return NULL; > > restart: > - shift = (rnode->height - 1) * RADIX_TREE_MAP_SHIFT; > + height = rnode->path & RADIX_TREE_HEIGHT_MASK; > + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; > offset = index >> shift; > > /* Index outside of the tree */ > @@ -1140,7 +1143,7 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item, > unsigned int shift, height; > unsigned long i; > > - height = slot->height; > + height = slot->path & RADIX_TREE_HEIGHT_MASK; > shift = (height-1) * RADIX_TREE_MAP_SHIFT; > > for ( ; height > 1; height--) { > @@ -1203,7 +1206,8 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) > } > > node = indirect_to_ptr(node); > - max_index = radix_tree_maxindex(node->height); > + max_index = radix_tree_maxindex(node->path & > + RADIX_TREE_HEIGHT_MASK); > if (cur_index > max_index) > break; > > @@ -1297,7 +1301,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) > * > * Returns %true if @node was freed, %false otherwise. > */ > -bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index, > +bool __radix_tree_delete_node(struct radix_tree_root *root, > struct radix_tree_node *node) > { > bool deleted = false; > @@ -1316,9 +1320,10 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index, > > parent = node->parent; > if (parent) { > - index >>= RADIX_TREE_MAP_SHIFT; > + unsigned int offset; > > - parent->slots[index & RADIX_TREE_MAP_MASK] = NULL; > + offset = node->path >> RADIX_TREE_HEIGHT_SHIFT; > + parent->slots[offset] = NULL; > parent->count--; > } else { > root_tag_clear_all(root); > @@ -1382,7 +1387,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root, > node->slots[offset] = NULL; > node->count--; > > - __radix_tree_delete_node(root, index, node); > + __radix_tree_delete_node(root, node); > > return entry; > } > @@ -1415,9 +1420,12 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag) > EXPORT_SYMBOL(radix_tree_tagged); > > static void > -radix_tree_node_ctor(void *node) > +radix_tree_node_ctor(void *arg) > { > - memset(node, 0, sizeof(struct radix_tree_node)); > + struct radix_tree_node *node = arg; > + > + memset(node, 0, sizeof(*node)); > + INIT_LIST_HEAD(&node->private_list); > } > > static __init unsigned long __maxindex(unsigned int height) > diff --git a/mm/filemap.c b/mm/filemap.c > index 65a374c0df4f..b93e223b59a9 100644 > --- a/mm/filemap.c > +++ b/mm/filemap.c > @@ -110,11 +110,17 @@ > static void page_cache_tree_delete(struct address_space *mapping, > struct page *page, void *shadow) > { > - if (shadow) { > - void **slot; > + struct radix_tree_node *node; > + unsigned long index; > + unsigned int offset; > + unsigned int tag; > + void **slot; > > - slot = radix_tree_lookup_slot(&mapping->page_tree, page->index); > - radix_tree_replace_slot(slot, shadow); > + VM_BUG_ON(!PageLocked(page)); > + > + __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot); > + > + if (shadow) { > mapping->nrshadows++; > /* > * Make sure the nrshadows update is committed before > @@ -123,9 +129,39 @@ static void page_cache_tree_delete(struct address_space *mapping, > * same time and miss a shadow entry. > */ > smp_wmb(); > - } else > - radix_tree_delete(&mapping->page_tree, page->index); > + } > mapping->nrpages--; > + > + if (!node) { > + /* Clear direct pointer tags in root node */ > + mapping->page_tree.gfp_mask &= __GFP_BITS_MASK; > + radix_tree_replace_slot(slot, shadow); > + return; > + } > + > + /* Clear tree tags for the removed page */ > + index = page->index; > + offset = index & RADIX_TREE_MAP_MASK; > + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { > + if (test_bit(offset, node->tags[tag])) > + radix_tree_tag_clear(&mapping->page_tree, index, tag); > + } > + > + /* Delete page, swap shadow entry */ > + radix_tree_replace_slot(slot, shadow); > + node->count--; > + if (shadow) > + node->count += 1U << RADIX_TREE_COUNT_SHIFT; Nitpick2: It should be a function of workingset.c rather than exposing RADIX_TREE_COUNT_SHIFT? IMO, It would be better to provide some accessor functions here, too. I didn't review locking part yet and will review it tomorrow with fresh brain. :) > + else > + if (__radix_tree_delete_node(&mapping->page_tree, node)) > + return; > + > + /* Only shadow entries in there, keep track of this node */ > + if (!(node->count & RADIX_TREE_COUNT_MASK) && > + list_empty(&node->private_list)) { > + node->private_data = mapping; > + list_lru_add(&workingset_shadow_nodes, &node->private_list); > + } > } > > /* > @@ -471,27 +507,36 @@ EXPORT_SYMBOL_GPL(replace_page_cache_page); > static int page_cache_tree_insert(struct address_space *mapping, > struct page *page, void **shadowp) > { > + struct radix_tree_node *node; > void **slot; > int error; > > - slot = radix_tree_lookup_slot(&mapping->page_tree, page->index); > - if (slot) { > + error = __radix_tree_create(&mapping->page_tree, page->index, > + &node, &slot); > + if (error) > + return error; > + if (*slot) { > void *p; > > p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); > if (!radix_tree_exceptional_entry(p)) > return -EEXIST; > - radix_tree_replace_slot(slot, page); > - mapping->nrshadows--; > - mapping->nrpages++; > if (shadowp) > *shadowp = p; > - return 0; > + mapping->nrshadows--; > + if (node) > + node->count -= 1U << RADIX_TREE_COUNT_SHIFT; > } > - error = radix_tree_insert(&mapping->page_tree, page->index, page); > - if (!error) > - mapping->nrpages++; > - return error; > + radix_tree_replace_slot(slot, page); > + mapping->nrpages++; > + if (node) { > + node->count++; > + /* Installed page, can't be shadow-only anymore */ > + if (!list_empty(&node->private_list)) > + list_lru_del(&workingset_shadow_nodes, > + &node->private_list); > + } > + return 0; > } > > static int __add_to_page_cache_locked(struct page *page, > diff --git a/mm/list_lru.c b/mm/list_lru.c > index 72f9decb0104..47a9faf4070b 100644 > --- a/mm/list_lru.c > +++ b/mm/list_lru.c > @@ -88,10 +88,18 @@ restart: > ret = isolate(item, &nlru->lock, cb_arg); > switch (ret) { > case LRU_REMOVED: > + case LRU_REMOVED_RETRY: > if (--nlru->nr_items == 0) > node_clear(nid, lru->active_nodes); > WARN_ON_ONCE(nlru->nr_items < 0); > isolated++; > + /* > + * If the lru lock has been dropped, our list > + * traversal is now invalid and so we have to > + * restart from scratch. > + */ > + if (ret == LRU_REMOVED_RETRY) > + goto restart; > break; > case LRU_ROTATE: > list_move_tail(item, &nlru->list); > diff --git a/mm/truncate.c b/mm/truncate.c > index 97606fa4c458..5c2615d7f4da 100644 > --- a/mm/truncate.c > +++ b/mm/truncate.c > @@ -25,6 +25,9 @@ > static void clear_exceptional_entry(struct address_space *mapping, > pgoff_t index, void *entry) > { > + struct radix_tree_node *node; > + void **slot; > + > /* Handled by shmem itself */ > if (shmem_mapping(mapping)) > return; > @@ -35,8 +38,21 @@ static void clear_exceptional_entry(struct address_space *mapping, > * without the tree itself locked. These unlocked entries > * need verification under the tree lock. > */ > - if (radix_tree_delete_item(&mapping->page_tree, index, entry) == entry) > - mapping->nrshadows--; > + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) > + goto unlock; > + if (*slot != entry) > + goto unlock; > + radix_tree_replace_slot(slot, NULL); > + mapping->nrshadows--; > + if (!node) > + goto unlock; > + node->count -= 1U << RADIX_TREE_COUNT_SHIFT; > + /* No more shadow entries, stop tracking the node */ > + if (!(node->count >> RADIX_TREE_COUNT_SHIFT) && > + !list_empty(&node->private_list)) > + list_lru_del(&workingset_shadow_nodes, &node->private_list); > + __radix_tree_delete_node(&mapping->page_tree, node); > +unlock: > spin_unlock_irq(&mapping->tree_lock); > } > > diff --git a/mm/vmstat.c b/mm/vmstat.c > index 3ac830d1b533..baa3ba586685 100644 > --- a/mm/vmstat.c > +++ b/mm/vmstat.c > @@ -772,6 +772,7 @@ const char * const vmstat_text[] = { > #endif > "workingset_refault", > "workingset_activate", > + "workingset_nodereclaim", > "nr_anon_transparent_hugepages", > "nr_free_cma", > "nr_dirty_threshold", > diff --git a/mm/workingset.c b/mm/workingset.c > index 8a6c7cff4923..7bb1a432c137 100644 > --- a/mm/workingset.c > +++ b/mm/workingset.c > @@ -251,3 +251,124 @@ void workingset_activation(struct page *page) > { > atomic_long_inc(&page_zone(page)->inactive_age); > } > + > +/* > + * Page cache radix tree nodes containing only shadow entries can grow > + * excessively on certain workloads. That's why they are tracked on > + * per-(NUMA)node lists and pushed back by a shrinker, but with a > + * slightly higher threshold than regular shrinkers so we don't > + * discard the entries too eagerly - after all, during light memory > + * pressure is exactly when we need them. > + */ > + > +struct list_lru workingset_shadow_nodes; > + > +static unsigned long count_shadow_nodes(struct shrinker *shrinker, > + struct shrink_control *sc) > +{ > + return list_lru_count_node(&workingset_shadow_nodes, sc->nid); > +} > + > +static enum lru_status shadow_lru_isolate(struct list_head *item, > + spinlock_t *lru_lock, > + void *arg) > +{ > + unsigned long *nr_reclaimed = arg; > + struct address_space *mapping; > + struct radix_tree_node *node; > + unsigned int i; > + int ret; > + > + /* > + * Page cache insertions and deletions synchroneously maintain > + * the shadow node LRU under the mapping->tree_lock and the > + * lru_lock. Because the page cache tree is emptied before > + * the inode can be destroyed, holding the lru_lock pins any > + * address_space that has radix tree nodes on the LRU. > + * > + * We can then safely transition to the mapping->tree_lock to > + * pin only the address_space of the particular node we want > + * to reclaim, take the node off-LRU, and drop the lru_lock. > + */ > + > + node = container_of(item, struct radix_tree_node, private_list); > + mapping = node->private_data; > + > + /* Coming from the list, invert the lock order */ > + if (!spin_trylock_irq(&mapping->tree_lock)) { > + spin_unlock(lru_lock); > + ret = LRU_RETRY; > + goto out; > + } > + > + list_del_init(item); > + spin_unlock(lru_lock); > + > + /* > + * The nodes should only contain one or more shadow entries, > + * no pages, so we expect to be able to remove them all and > + * delete and free the empty node afterwards. > + */ > + > + BUG_ON(!node->count); > + BUG_ON(node->count & RADIX_TREE_COUNT_MASK); > + > + for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { > + if (node->slots[i]) { > + BUG_ON(!radix_tree_exceptional_entry(node->slots[i])); > + node->slots[i] = NULL; > + BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT)); > + node->count -= 1U << RADIX_TREE_COUNT_SHIFT; > + BUG_ON(!mapping->nrshadows); > + mapping->nrshadows--; > + } > + } > + BUG_ON(node->count); > + inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM); > + if (!__radix_tree_delete_node(&mapping->page_tree, node)) > + BUG(); > + (*nr_reclaimed)++; > + > + spin_unlock_irq(&mapping->tree_lock); > + ret = LRU_REMOVED_RETRY; > +out: > + cond_resched(); > + spin_lock(lru_lock); > + return ret; > +} > + > +static unsigned long scan_shadow_nodes(struct shrinker *shrinker, > + struct shrink_control *sc) > +{ > + unsigned long nr_reclaimed = 0; > + > + list_lru_walk_node(&workingset_shadow_nodes, sc->nid, > + shadow_lru_isolate, &nr_reclaimed, &sc->nr_to_scan); > + > + return nr_reclaimed; > +} > + > +static struct shrinker workingset_shadow_shrinker = { > + .count_objects = count_shadow_nodes, > + .scan_objects = scan_shadow_nodes, > + .seeks = DEFAULT_SEEKS * 4, > + .flags = SHRINKER_NUMA_AWARE, > +}; > + > +static int __init workingset_init(void) > +{ > + int ret; > + > + ret = list_lru_init(&workingset_shadow_nodes); > + if (ret) > + goto err; > + ret = register_shrinker(&workingset_shadow_shrinker); > + if (ret) > + goto err_list_lru; > + return 0; > +err_list_lru: > + list_lru_destroy(&workingset_shadow_nodes); > +err: > + return ret; > +} > +module_init(workingset_init); > -- > 1.8.4.2 > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@xxxxxxxxx. For more info on Linux MM, > see: http://www.linux-mm.org/ . > Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a> -- Kind regards, Minchan Kim -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html