On Mon 07-11-16 14:07:39, Johannes Weiner wrote: > The bug in khugepaged (fixed in the first patch of this series) shows > that radix tree slot replacement is fragile; and it will become more > so when not only NULL<->!NULL transitions need to be accounted but > transitions from and to exceptional entries as well. We need checks. > > Re-implement radix_tree_replace_slot() on top of the sanity-checked > __radix_tree_replace(). This requires existing callers to also pass > the radix tree root, but it'll warn us when somebody replaces slots > with contents that need proper accounting (transitions between NULL > entries, real entries, exceptional entries) and where a replacement > through the slot pointer would corrupt the radix tree node counts. > > Suggested-by: Jan Kara <jack@xxxxxxx> > Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx> The patch looks good. You can add: Reviewed-by: Jan Kara <jack@xxxxxxx> Honza > --- > arch/s390/mm/gmap.c | 2 +- > drivers/sh/intc/virq.c | 2 +- > fs/dax.c | 4 +-- > include/linux/radix-tree.h | 16 ++------- > lib/radix-tree.c | 64 +++++++++++++++++++++++++++-------- > mm/filemap.c | 4 +-- > mm/khugepaged.c | 5 +-- > mm/migrate.c | 4 +-- > mm/truncate.c | 2 +- > tools/testing/radix-tree/multiorder.c | 2 +- > 10 files changed, 64 insertions(+), 41 deletions(-) > > diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c > index 3ba622702ce4..ec1f0dedb948 100644 > --- a/arch/s390/mm/gmap.c > +++ b/arch/s390/mm/gmap.c > @@ -1015,7 +1015,7 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, > if (slot) { > rmap->next = radix_tree_deref_slot_protected(slot, > &sg->guest_table_lock); > - radix_tree_replace_slot(slot, rmap); > + radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); > } else { > rmap->next = NULL; > radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, > diff --git a/drivers/sh/intc/virq.c b/drivers/sh/intc/virq.c > index e7899624aa0b..35bbe288ddb4 100644 > --- a/drivers/sh/intc/virq.c > +++ b/drivers/sh/intc/virq.c > @@ -254,7 +254,7 @@ static void __init intc_subgroup_map(struct intc_desc_int *d) > > radix_tree_tag_clear(&d->tree, entry->enum_id, > INTC_TAG_VIRQ_NEEDS_ALLOC); > - radix_tree_replace_slot((void **)entries[i], > + radix_tree_replace_slot(&d->tree, (void **)entries[i], > &intc_irq_xlate[irq]); > } > > diff --git a/fs/dax.c b/fs/dax.c > index db78bae0dc0f..85930c2a2749 100644 > --- a/fs/dax.c > +++ b/fs/dax.c > @@ -342,7 +342,7 @@ static inline void *lock_slot(struct address_space *mapping, void **slot) > radix_tree_deref_slot_protected(slot, &mapping->tree_lock); > > entry |= RADIX_DAX_ENTRY_LOCK; > - radix_tree_replace_slot(slot, (void *)entry); > + radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); > return (void *)entry; > } > > @@ -356,7 +356,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot) > radix_tree_deref_slot_protected(slot, &mapping->tree_lock); > > entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; > - radix_tree_replace_slot(slot, (void *)entry); > + radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); > return (void *)entry; > } > > diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h > index 7ced8a70cc8b..2d1b9b8be983 100644 > --- a/include/linux/radix-tree.h > +++ b/include/linux/radix-tree.h > @@ -249,20 +249,6 @@ static inline int radix_tree_exception(void *arg) > return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK); > } > > -/** > - * radix_tree_replace_slot - replace item in a slot > - * @pslot: pointer to slot, returned by radix_tree_lookup_slot > - * @item: new item to store in the slot. > - * > - * For use with radix_tree_lookup_slot(). Caller must hold tree write locked > - * across slot lookup and replacement. > - */ > -static inline void radix_tree_replace_slot(void **pslot, void *item) > -{ > - BUG_ON(radix_tree_is_internal_node(item)); > - rcu_assign_pointer(*pslot, item); > -} > - > int __radix_tree_create(struct radix_tree_root *root, unsigned long index, > unsigned order, struct radix_tree_node **nodep, > void ***slotp); > @@ -280,6 +266,8 @@ void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); > void __radix_tree_replace(struct radix_tree_root *root, > struct radix_tree_node *node, > void **slot, void *item); > +void radix_tree_replace_slot(struct radix_tree_root *root, > + void **slot, void *item); > bool __radix_tree_delete_node(struct radix_tree_root *root, > struct radix_tree_node *node); > void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); > diff --git a/lib/radix-tree.c b/lib/radix-tree.c > index ddc6facbf4da..5cbdd911931e 100644 > --- a/lib/radix-tree.c > +++ b/lib/radix-tree.c > @@ -753,19 +753,10 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) > } > EXPORT_SYMBOL(radix_tree_lookup); > > -/** > - * __radix_tree_replace - replace item in a slot > - * @root: radix tree root > - * @node: pointer to tree node > - * @slot: pointer to slot in @node > - * @item: new item to store in the slot. > - * > - * For use with __radix_tree_lookup(). Caller must hold tree write locked > - * across slot lookup and replacement. > - */ > -void __radix_tree_replace(struct radix_tree_root *root, > - struct radix_tree_node *node, > - void **slot, void *item) > +static void replace_slot(struct radix_tree_root *root, > + struct radix_tree_node *node, > + void **slot, void *item, > + bool warn_typeswitch) > { > void *old = rcu_dereference_raw(*slot); > int count, exceptional; > @@ -776,8 +767,7 @@ void __radix_tree_replace(struct radix_tree_root *root, > exceptional = !!radix_tree_exceptional_entry(item) - > !!radix_tree_exceptional_entry(old); > > - WARN_ONCE(!node && slot != (void **)&root->rnode && > - (count || exceptional), > + WARN_ONCE(warn_typeswitch && (count || exceptional), > "Unaccounted slot replacement: old=%p item=%p count=%d exceptional=%d\n", > old, item, count, exceptional); > > @@ -789,6 +779,50 @@ void __radix_tree_replace(struct radix_tree_root *root, > } > > /** > + * __radix_tree_replace - replace item in a slot > + * @root: radix tree root > + * @node: pointer to tree node > + * @slot: pointer to slot in @node > + * @item: new item to store in the slot. > + * > + * For use with __radix_tree_lookup(). Caller must hold tree write locked > + * across slot lookup and replacement. > + */ > +void __radix_tree_replace(struct radix_tree_root *root, > + struct radix_tree_node *node, > + void **slot, void *item) > +{ > + /* > + * This function supports replacing entries with different types > + * (NULL, regular entries, exceptional entries), but that needs > + * accounting against the node - unless the slot is root->rnode. > + */ > + replace_slot(root, node, slot, item, > + !node && slot != (void **)&root->rnode); > +} > + > +/** > + * radix_tree_replace_slot - replace item in a slot > + * @root: radix tree root > + * @slot: pointer to slot > + * @item: new item to store in the slot. > + * > + * For use with radix_tree_lookup_slot(), radix_tree_gang_lookup_slot(), > + * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked > + * across slot lookup and replacement. > + * > + * NOTE: This cannot be used to switch between non-entries (empty slots), > + * regular entries, and exceptional entries, as that requires accounting > + * inside the radix tree node. When switching from one type of entry to > + * another, use __radix_tree_lookup() and __radix_tree_replace(). > + */ > +void radix_tree_replace_slot(struct radix_tree_root *root, > + void **slot, void *item) > +{ > + replace_slot(root, NULL, slot, item, true); > +} > + > +/** > * radix_tree_tag_set - set a tag on a radix tree node > * @root: radix tree root > * @index: index key > diff --git a/mm/filemap.c b/mm/filemap.c > index c7fe2f16503f..eb463156f29a 100644 > --- a/mm/filemap.c > +++ b/mm/filemap.c > @@ -147,7 +147,7 @@ static int page_cache_tree_insert(struct address_space *mapping, > false); > } > } > - radix_tree_replace_slot(slot, page); > + radix_tree_replace_slot(&mapping->page_tree, slot, page); > mapping->nrpages++; > if (node) { > workingset_node_pages_inc(node); > @@ -193,7 +193,7 @@ static void page_cache_tree_delete(struct address_space *mapping, > shadow = NULL; > } > > - radix_tree_replace_slot(slot, shadow); > + radix_tree_replace_slot(&mapping->page_tree, slot, shadow); > > if (!node) > break; > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > index eac6f0580e26..fed8d5e96978 100644 > --- a/mm/khugepaged.c > +++ b/mm/khugepaged.c > @@ -1421,7 +1421,7 @@ static void collapse_shmem(struct mm_struct *mm, > list_add_tail(&page->lru, &pagelist); > > /* Finally, replace with the new page. */ > - radix_tree_replace_slot(slot, > + radix_tree_replace_slot(&mapping->page_tree, slot, > new_page + (index % HPAGE_PMD_NR)); > > index++; > @@ -1531,7 +1531,8 @@ static void collapse_shmem(struct mm_struct *mm, > /* Unfreeze the page. */ > list_del(&page->lru); > page_ref_unfreeze(page, 2); > - radix_tree_replace_slot(slot, page); > + radix_tree_replace_slot(&mapping->page_tree, > + slot, page); > spin_unlock_irq(&mapping->tree_lock); > putback_lru_page(page); > unlock_page(page); > diff --git a/mm/migrate.c b/mm/migrate.c > index 99250aee1ac1..9b88e4e37d0a 100644 > --- a/mm/migrate.c > +++ b/mm/migrate.c > @@ -482,7 +482,7 @@ int migrate_page_move_mapping(struct address_space *mapping, > SetPageDirty(newpage); > } > > - radix_tree_replace_slot(pslot, newpage); > + radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); > > /* > * Drop cache reference from old page by unfreezing > @@ -556,7 +556,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, > > get_page(newpage); > > - radix_tree_replace_slot(pslot, newpage); > + radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); > > page_ref_unfreeze(page, expected_count - 1); > > diff --git a/mm/truncate.c b/mm/truncate.c > index a01cce450a26..6ae44571d4c7 100644 > --- a/mm/truncate.c > +++ b/mm/truncate.c > @@ -49,7 +49,7 @@ static void clear_exceptional_entry(struct address_space *mapping, > goto unlock; > if (*slot != entry) > goto unlock; > - radix_tree_replace_slot(slot, NULL); > + radix_tree_replace_slot(&mapping->page_tree, slot, NULL); > mapping->nrexceptional--; > if (!node) > goto unlock; > diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c > index 05d7bc488971..d1be94667a30 100644 > --- a/tools/testing/radix-tree/multiorder.c > +++ b/tools/testing/radix-tree/multiorder.c > @@ -146,7 +146,7 @@ static void multiorder_check(unsigned long index, int order) > > slot = radix_tree_lookup_slot(&tree, index); > free(*slot); > - radix_tree_replace_slot(slot, item2); > + radix_tree_replace_slot(&tree, slot, item2); > for (i = min; i < max; i++) { > struct item *item = item_lookup(&tree, i); > assert(item != 0); > -- > 2.10.1 > -- Jan Kara <jack@xxxxxxxx> SUSE Labs, CR -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>