The patch titled Subject: maple_tree: change spanning store to work on larger trees has been added to the -mm mm-unstable branch. Its filename is maple-tree-add-new-data-structure-fix-8.patch This patch will shortly appear at https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/maple-tree-add-new-data-structure-fix-8.patch This patch will later appear in the mm-unstable branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next via the mm-everything branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm and is updated there every 2-3 working days ------------------------------------------------------ From: Liam Howlett <liam.howlett@xxxxxxxxxx> Subject: maple_tree: change spanning store to work on larger trees Date: Wed, 15 Jun 2022 14:19:39 +0000 Spanning store had an issue which could lead to double free during a large tree modification. Fix this by being more careful about how nodes are added to the to-be-freed and to-be-destroyed list on this operation. Link: https://lkml.kernel.org/r/20220615141921.417598-3-Liam.Howlett@xxxxxxxxxx Signed-off-by: Liam R. Howlett <Liam.Howlett@xxxxxxxxxx> Reported-by: Qian Cai <quic_qiancai@xxxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- lib/maple_tree.c | 325 ++++++++++++++++++++++++++++----------------- 1 file changed, 206 insertions(+), 119 deletions(-) --- a/lib/maple_tree.c~maple-tree-add-new-data-structure-fix-8 +++ a/lib/maple_tree.c @@ -1388,7 +1388,6 @@ static inline unsigned char ma_data_end( /* * mas_data_end() - Find the end of the data (slot). * @mas: the maple state - * @type: the type of maple node * * This method is optimized to check the metadata of a node if the node type * supports data end metadata. @@ -2273,6 +2272,31 @@ done: } /* + * mas_topiary_range() - Add a range of slots to the topiary. + * @mas: The maple state + * @destroy: The topiary to add the slots (usually destroy) + * @start: The starting slot inclusively + * @end: The end slot inclusively + */ +static inline void mas_topiary_range(struct ma_state *mas, + struct ma_topiary *destroy, unsigned char start, unsigned char end) +{ + void __rcu **slots; + unsigned offset; + + MT_BUG_ON(mas->tree, mte_is_leaf(mas->node)); + slots = ma_slots(mas_mn(mas), mte_node_type(mas->node)); + for (offset = start; offset <= end; offset++) { + struct maple_enode *enode = mas_slot_locked(mas, slots, offset); + + if (mte_dead_node(enode)) + continue; + + mat_add(destroy, enode); + } +} + +/* * mast_topiary() - Add the portions of the tree to the removal list; either to * be freed or discarded (destroy walk). * @mast: The maple_subtree_state. @@ -2280,48 +2304,62 @@ done: static inline void mast_topiary(struct maple_subtree_state *mast) { MA_WR_STATE(wr_mas, mast->orig_l, NULL); - unsigned char l_off, r_off, offset; - unsigned long l_index; - struct maple_enode *child; - void __rcu **slots; + unsigned char r_start, r_end; + unsigned char l_start, l_end; + void **l_slots, **r_slots; wr_mas.type = mte_node_type(mast->orig_l->node); - /* The left node is consumed, so add to the free list. */ - l_index = mast->orig_l->index; mast->orig_l->index = mast->orig_l->last; mas_wr_node_walk(&wr_mas); - mast->orig_l->index = l_index; - l_off = mast->orig_l->offset; - r_off = mast->orig_r->offset; - if (mast->orig_l->node == mast->orig_r->node) { - slots = ma_slots(mte_to_node(mast->orig_l->node), wr_mas.type); - for (offset = l_off + 1; offset < r_off; offset++) - mat_add(mast->destroy, mas_slot_locked(mast->orig_l, - slots, offset)); + l_start = mast->orig_l->offset + 1; + l_end = mas_data_end(mast->orig_l); + r_start = 0; + r_end = mast->orig_r->offset; + + if (r_end) + r_end--; + + l_slots = ma_slots(mas_mn(mast->orig_l), + mte_node_type(mast->orig_l->node)); + + r_slots = ma_slots(mas_mn(mast->orig_r), + mte_node_type(mast->orig_r->node)); + + if ((l_start < l_end) && + mte_dead_node(mas_slot_locked(mast->orig_l, l_slots, l_start))) { + l_start++; + } + if (mte_dead_node(mas_slot_locked(mast->orig_r, r_slots, r_end))) { + if (r_end) + r_end--; + } + + if ((l_start > r_end) && (mast->orig_l->node == mast->orig_r->node)) return; + + /* At the node where left and right sides meet, add the parts between */ + if (mast->orig_l->node == mast->orig_r->node) { + return mas_topiary_range(mast->orig_l, mast->destroy, + l_start, r_end); } /* mast->orig_r is different and consumed. */ if (mte_is_leaf(mast->orig_r->node)) return; - /* Now destroy l_off + 1 -> end and 0 -> r_off - 1 */ - offset = l_off + 1; - slots = ma_slots(mte_to_node(mast->orig_l->node), wr_mas.type); - while (offset < mt_slots[wr_mas.type]) { - child = mas_slot_locked(mast->orig_l, slots, offset++); - if (!child) - break; + if (mte_dead_node(mas_slot_locked(mast->orig_l, l_slots, l_end))) + l_end--; - mat_add(mast->destroy, child); - } - slots = ma_slots(mte_to_node(mast->orig_r->node), - mte_node_type(mast->orig_r->node)); - for (offset = 0; offset < r_off; offset++) - mat_add(mast->destroy, - mas_slot_locked(mast->orig_l, slots, offset)); + if (l_start <= l_end) + mas_topiary_range(mast->orig_l, mast->destroy, l_start, l_end); + + if (mte_dead_node(mas_slot_locked(mast->orig_r, r_slots, r_start))) + r_start++; + + if (r_start <= r_end) + mas_topiary_range(mast->orig_r, mast->destroy, 0, r_end); } /* @@ -2329,19 +2367,13 @@ static inline void mast_topiary(struct m * @mast: The maple subtree state * @old_r: The encoded maple node to the right (next node). */ -static inline void mast_rebalance_next(struct maple_subtree_state *mast, - struct maple_enode *old_r, bool free) +static inline void mast_rebalance_next(struct maple_subtree_state *mast) { unsigned char b_end = mast->bn->b_end; mas_mab_cp(mast->orig_r, 0, mt_slot_count(mast->orig_r->node), mast->bn, b_end); - if (free) - mat_add(mast->free, old_r); - mast->orig_r->last = mast->orig_r->max; - if (old_r == mast->orig_l->node) - mast->orig_l->node = mast->orig_r->node; } /* @@ -2349,17 +2381,13 @@ static inline void mast_rebalance_next(s * @mast: The maple subtree state * @old_l: The encoded maple node to the left (previous node) */ -static inline void mast_rebalance_prev(struct maple_subtree_state *mast, - struct maple_enode *old_l) +static inline void mast_rebalance_prev(struct maple_subtree_state *mast) { unsigned char end = mas_data_end(mast->orig_l) + 1; unsigned char b_end = mast->bn->b_end; mab_shift_right(mast->bn, end); mas_mab_cp(mast->orig_l, 0, end - 1, mast->bn, 0); - mat_add(mast->free, old_l); - if (mast->orig_r->node == old_l) - mast->orig_r->node = mast->orig_l->node; mast->l->min = mast->orig_l->min; mast->orig_l->index = mast->orig_l->min; mast->bn->b_end = end + b_end; @@ -2367,68 +2395,116 @@ static inline void mast_rebalance_prev(s } /* - * mast_sibling_rebalance_right() - Rebalance from nodes with the same parents. - * Check the right side, then the left. Data is copied into the @mast->bn. + * mast_spanning_rebalance() - Rebalance nodes with nearest neighbour favouring + * the node to the right. Checking the nodes to the right then the left at each + * level upwards until root is reached. Free and destroy as needed. + * Data is copied into the @mast->bn. * @mast: The maple_subtree_state. */ static inline -bool mast_sibling_rebalance_right(struct maple_subtree_state *mast, bool free) +bool mast_spanning_rebalance(struct maple_subtree_state *mast) { - struct maple_enode *old_r; - struct maple_enode *old_l; + struct ma_state r_tmp = *mast->orig_r; + struct ma_state l_tmp = *mast->orig_l; + struct maple_enode *ancestor = NULL; + unsigned char start, end; + unsigned char depth = 0; - old_r = mast->orig_r->node; - if (mas_next_sibling(mast->orig_r)) { - mast_rebalance_next(mast, old_r, free); - return true; - } + r_tmp = *mast->orig_r; + l_tmp = *mast->orig_l; + do { + mas_ascend(mast->orig_r); + mas_ascend(mast->orig_l); + depth++; + if (!ancestor && + (mast->orig_r->node == mast->orig_l->node)) { + ancestor = mast->orig_r->node; + end = mast->orig_r->offset - 1; + start = mast->orig_l->offset + 1; + } - old_l = mast->orig_l->node; - if (mas_prev_sibling(mast->orig_l)) { - mast->bn->type = mte_node_type(mast->orig_l->node); - mast_rebalance_prev(mast, old_l); - return true; - } + if (mast->orig_r->offset < mas_data_end(mast->orig_r)) { + if (!ancestor) { + ancestor = mast->orig_r->node; + start = 0; + } - return false; -} + mast->orig_r->offset++; + do { + mas_descend(mast->orig_r); + mast->orig_r->offset = 0; + depth--; + } while (depth); + + mast_rebalance_next(mast); + do { + unsigned char l_off = 0; + struct maple_enode *child = r_tmp.node; + + mas_ascend(&r_tmp); + if (ancestor == r_tmp.node) + l_off = start; + + if (r_tmp.offset) + r_tmp.offset--; + + if (l_off < r_tmp.offset) + mas_topiary_range(&r_tmp, mast->destroy, + l_off, r_tmp.offset); -static inline int mas_prev_node(struct ma_state *mas, unsigned long min); -static inline int mas_next_node(struct ma_state *mas, struct maple_node *node, - unsigned long max); -/* - * mast_cousin_rebalance_right() - Rebalance from nodes with different parents. - * Check the right side, then the left. Data is copied into the @mast->bn. - * @mast: The maple_subtree_state. - */ -static inline -bool mast_cousin_rebalance_right(struct maple_subtree_state *mast, bool free) -{ - struct maple_enode *old_l = mast->orig_l->node; - struct maple_enode *old_r = mast->orig_r->node; + if (l_tmp.node != child) + mat_add(mast->free, child); - MA_STATE(tmp, mast->orig_r->tree, mast->orig_r->index, mast->orig_r->last); + } while (r_tmp.node != ancestor); - tmp = *mast->orig_r; - mas_next_node(mast->orig_r, mas_mn(mast->orig_r), ULONG_MAX); - if (!mas_is_none(mast->orig_r)) { - mast_rebalance_next(mast, old_r, free); - return true; - } + *mast->orig_l = l_tmp; + return true; - *mast->orig_r = *mast->orig_l; - *mast->r = *mast->l; - mas_prev_node(mast->orig_l, 0); - if (mas_is_none(mast->orig_l)) { - /* Making a new root with the contents of mast->bn */ - *mast->orig_l = *mast->orig_r; - *mast->orig_r = tmp; - return false; - } + } else if (mast->orig_l->offset != 0) { + if (!ancestor) { + ancestor = mast->orig_l->node; + end = mas_data_end(mast->orig_l); + } - mast->orig_l->offset = 0; - mast_rebalance_prev(mast, old_l); - return true; + mast->orig_l->offset--; + do { + mas_descend(mast->orig_l); + mast->orig_l->offset = + mas_data_end(mast->orig_l); + depth--; + } while (depth); + + mast_rebalance_prev(mast); + do { + unsigned char r_off; + struct maple_enode *child = l_tmp.node; + + mas_ascend(&l_tmp); + if (ancestor == l_tmp.node) + r_off = end; + else + r_off = mas_data_end(&l_tmp); + + if (l_tmp.offset < r_off) + l_tmp.offset++; + + if (l_tmp.offset < r_off) + mas_topiary_range(&l_tmp, mast->destroy, + l_tmp.offset, r_off); + + if (r_tmp.node != child) + mat_add(mast->free, child); + + } while (l_tmp.node != ancestor); + + *mast->orig_r = r_tmp; + return true; + } + } while (!mte_is_root(mast->orig_r->node)); + + *mast->orig_r = r_tmp; + *mast->orig_l = l_tmp; + return false; } /* @@ -2462,18 +2538,16 @@ mast_ascend_free(struct maple_subtree_st * The node may not contain the value so set slot to ensure all * of the nodes contents are freed or destroyed. */ - if (mast->orig_r->max < mast->orig_r->last) - mast->orig_r->offset = mas_data_end(mast->orig_r) + 1; - else { - wr_mas.type = mte_node_type(mast->orig_r->node); - mas_wr_node_walk(&wr_mas); - } + wr_mas.type = mte_node_type(mast->orig_r->node); + mas_wr_node_walk(&wr_mas); /* Set up the left side of things */ mast->orig_l->offset = 0; mast->orig_l->index = mast->l->min; wr_mas.mas = mast->orig_l; wr_mas.type = mte_node_type(mast->orig_l->node); mas_wr_node_walk(&wr_mas); + + mast->bn->type = wr_mas.type; } /* @@ -2881,7 +2955,7 @@ static int mas_spanning_rebalance(struct struct maple_enode *left = NULL, *middle = NULL, *right = NULL; MA_STATE(l_mas, mas->tree, mas->index, mas->index); - MA_STATE(r_mas, mas->tree, mas->index, mas->index); + MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(m_mas, mas->tree, mas->index, mas->index); MA_TOPIARY(free, mas->tree); MA_TOPIARY(destroy, mas->tree); @@ -2897,14 +2971,9 @@ static int mas_spanning_rebalance(struct mast->destroy = &destroy; l_mas.node = r_mas.node = m_mas.node = MAS_NONE; if (!mas_is_root_limits(mast->orig_l) && - unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type])) { - /* - * Do not free the current node as it may be freed in a bulk - * free. - */ - if (!mast_sibling_rebalance_right(mast, false)) - mast_cousin_rebalance_right(mast, false); - } + unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type])) + mast_spanning_rebalance(mast); + mast->orig_l->depth = 0; /* @@ -2948,6 +3017,15 @@ static int mas_spanning_rebalance(struct /* Copy anything necessary out of the right node. */ mast_combine_cp_right(mast); + if (mte_dead_node(mast->orig_l->node) || + mte_dead_node(mast->orig_r->node)) { + printk("FUCKED. l %p is %s and r %p is %s\n", + mas_mn(mast->orig_l), + mte_dead_node(mast->orig_l->node) ? "dead" : "alive", + mas_mn(mast->orig_r), + mte_dead_node(mast->orig_r->node) ? "dead" : "alive"); + printk("Writing %lu-%lu\n", mas->index, mas->last); + } mast_topiary(mast); mast->orig_l->last = mast->orig_l->max; @@ -2961,15 +3039,14 @@ static int mas_spanning_rebalance(struct if (mas_is_root_limits(mast->orig_l)) break; - /* Try to get enough data for the next iteration. */ - if (!mast_sibling_rebalance_right(mast, true)) - if (!mast_cousin_rebalance_right(mast, true)) - break; + if (!mast_spanning_rebalance(mast)) + break; /* rebalancing from other nodes may require another loop. */ if (!count) count++; } + l_mas.node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), mte_node_type(mast->orig_l->node)); mast->orig_l->depth++; @@ -3042,6 +3119,7 @@ static inline int mas_rebalance(struct m mast.orig_l = &l_mas; mast.orig_r = &r_mas; mast.bn = b_node; + mast.bn->type = mte_node_type(mas->node); l_mas = r_mas = *mas; @@ -3855,7 +3933,7 @@ done: return 1; } /* - * mas_spanning_store() - Create a subtree with the store operation completed + * mas_wr_spanning_store() - Create a subtree with the store operation completed * and new nodes where necessary, then place the sub-tree in the actual tree. * Note that mas is expected to point to the node which caused the store to * span. @@ -3941,6 +4019,13 @@ static inline int mas_wr_spanning_store( mast.bn = &b_node; mast.orig_l = &l_mas; mast.orig_r = &r_mas; + if (mte_dead_node(mast.orig_l->node) || + mte_dead_node(mast.orig_r->node)) { + printk("FUCKED. l is %s and r is %s\n", + mte_dead_node(mast.orig_l->node) ? "dead" : "alive", + mte_dead_node(mast.orig_r->node) ? "dead" : "alive"); + printk("Writing %lu-%lu\n", mas->index, mas->last); + } /* Combine l_mas and r_mas and split them up evenly again. */ return mas_spanning_rebalance(mas, &mast, height + 1); } @@ -5387,6 +5472,9 @@ static inline void __rcu **mas_destroy_d node = mas_mn(mas); slots = ma_slots(node, mte_node_type(mas->node)); next = mas_slot_locked(mas, slots, 0); + if ((mte_dead_node(next))) + next = mas_slot_locked(mas, slots, 1); + mte_set_node_dead(mas->node); node->type = mte_node_type(mas->node); node->piv_parent = prev; @@ -5394,6 +5482,7 @@ static inline void __rcu **mas_destroy_d offset = 0; prev = mas->node; } while (!mte_is_leaf(next)); + return slots; } @@ -5427,17 +5516,15 @@ static void mt_destroy_walk(struct maple mas.node = node->piv_parent; if (mas_mn(&mas) == node) goto start_slots_free; + type = mte_node_type(mas.node); slots = ma_slots(mte_to_node(mas.node), type); - if ((offset < mt_slots[type])) { - struct maple_enode *next = slots[offset]; + if ((offset < mt_slots[type]) && mte_node_type(slots[offset]) && + mte_to_node(slots[offset])) { + struct maple_enode *parent = mas.node; - if (mte_node_type(next) && mte_to_node(next)) { - struct maple_enode *parent = mas.node; - - mas.node = mas_slot_locked(&mas, slots, offset); - slots = mas_destroy_descend(&mas, parent, offset); - } + mas.node = mas_slot_locked(&mas, slots, offset); + slots = mas_destroy_descend(&mas, parent, offset); } node = mas_mn(&mas); } while (start != mas.node); _ Patches currently in -mm which might be from liam.howlett@xxxxxxxxxx are maple-tree-add-new-data-structure-fix.patch maple-tree-add-new-data-structure-fix-2.patch maple-tree-add-new-data-structure-fix-3.patch maple-tree-add-new-data-structure-fix-4.patch maple-tree-add-new-data-structure-fix-7.patch maple-tree-add-new-data-structure-fix-8.patch maple-tree-add-new-data-structure-fix-9.patch lib-test_maple_tree-add-testing-for-maple-tree-fix.patch lib-test_maple_tree-add-testing-for-maple-tree-fix-2.patch mm-start-tracking-vmas-with-maple-tree-fix-2.patch mm-start-tracking-vmas-with-maple-tree-fix-3.patch mm-mmap-use-advanced-maple-tree-api-for-mmap_region-fix-2.patch mm-mmap-change-do_brk_munmap-to-use-do_mas_align_munmap-fix.patch mm-remove-the-vma-linked-list-fix.patch mm-mlock-drop-dead-code-in-count_mm_mlocked_page_nr.patch