Hello, this is an RFC. The initial discussion was here:
https://patchwork.kernel.org/patch/10244733/

Currently a new VA area is allocated by iterating over the busy list
until a suitable hole is found between two busy areas, therefore each
new allocation grows that list. Due to the long list and different
permissive parameters, an allocation can take a long time
(milliseconds) on embedded devices.

This patch organizes the vmalloc memory layout into free areas of the
VMALLOC_START-VMALLOC_END range. It uses a red-black tree that keeps
blocks sorted by their start addresses, paired with a linked list that
keeps the free areas in order of increasing addresses.

Allocation: to allocate a new block, the free list is scanned until a
block large enough to encompass the requested size is found. If the
block is bigger than the requested size, it is split.

De-allocation: the red-black tree makes it possible to efficiently find
the spot for a freed area, whereas the linked list allows fast merging
of de-allocated memory chunks with existing free blocks, creating large
coalesced areas (a simplified stand-alone sketch of this merging scheme
follows after the patch).

model name: QEMU Virtual CPU version 2.5+

test_1:
<measure this loop time>
for (n = 0; n < 1000000; n++) {
        void *ptr_1 = vmalloc(3 * PAGE_SIZE);
        *((__u8 *)ptr_1) = 0; /* Pretend we used the mem */
        vfree(ptr_1);
}
<measure this loop time>

938007(us) vs 939222(us) +0.129%
932760(us) vs 932565(us) -0.020%
929691(us) vs 935795(us) +0.652%
932767(us) vs 932683(us) -0.009%
937520(us) vs 935457(us) -0.220%

test_2:
for (n = 0; n < 15000; n++)
        ptr[n] = vmalloc(1 * PAGE_SIZE);

<measure this loop time>
for (n = 0; n < 1000000; n++) {
        void *ptr_1 = vmalloc(100 * PAGE_SIZE);
        void *ptr_2 = vmalloc(1 * PAGE_SIZE);
        *((__u8 *)ptr_1) = 0; /* Pretend we used the mem */
        *((__u8 *)ptr_2) = 1; /* Pretend we used the mem */
        vfree(ptr_1);
        vfree(ptr_2);
}
<measure this loop time>

33590880(us) vs 11027121(us) -67.172%
34503307(us) vs 11696023(us) -66.101%
44198667(us) vs 11849005(us) -73.191%
19377377(us) vs 12026349(us) -37.936%
29511186(us) vs 11757217(us) -60.160%

Signed-off-by: Uladzislau Rezki (Sony) <urezki@xxxxxxxxx>
---
 mm/vmalloc.c | 420 +++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 393 insertions(+), 27 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ebff729cc956..2ab7ec93b199 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -332,6 +332,29 @@ LIST_HEAD(vmap_area_list);
 static LLIST_HEAD(vmap_purge_list);
 static struct rb_root vmap_area_root = RB_ROOT;
 
+/*
+ * This linked list is used in pair with free_vmap_area_root.
+ * It provides fast access to the next/prev nodes needed
+ * to perform coalescing.
+ */
+static LIST_HEAD(free_vmap_area_list);
+
+/*
+ * This red-black tree is used for storing address-sorted
+ * vmap areas during free operation. Sorting is done using
+ * va_start address. We make use of it to merge a VA with
+ * its prev/next neighbors.
+ */
+static struct rb_root free_vmap_area_root = RB_ROOT;
+
+/*
+ * For vmalloc specific area allocation.
+ */
+static struct vmap_area *last_free_va_chunk;
+static unsigned long last_alloc_vstart;
+static unsigned long last_alloc_align;
+static unsigned long free_va_max_size;
+
 /* The vmap cache globals are protected by vmap_area_lock */
 static struct rb_node *free_vmap_cache;
 static unsigned long cached_hole_size;
@@ -359,27 +382,53 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
         return NULL;
 }
 
-static void __insert_vmap_area(struct vmap_area *va)
+static inline void __find_va_slot(struct vmap_area *va,
+        struct rb_root *root, struct rb_node **parent, struct rb_node ***link)
 {
-        struct rb_node **p = &vmap_area_root.rb_node;
-        struct rb_node *parent = NULL;
-        struct rb_node *tmp;
+        *link = &root->rb_node;
+        *parent = NULL;
 
-        while (*p) {
+        while (**link) {
                 struct vmap_area *tmp_va;
 
-                parent = *p;
-                tmp_va = rb_entry(parent, struct vmap_area, rb_node);
+                *parent = **link;
+                tmp_va = rb_entry(*parent, struct vmap_area, rb_node);
                 if (va->va_start < tmp_va->va_end)
-                        p = &(*p)->rb_left;
+                        *link = &(**link)->rb_left;
                 else if (va->va_end > tmp_va->va_start)
-                        p = &(*p)->rb_right;
+                        *link = &(**link)->rb_right;
                 else
                         BUG();
         }
+}
+
+static inline void __find_va_siblings(struct rb_node *p_rb_node,
+        struct rb_node **p_rb_link, struct list_head **next, struct list_head **prev)
+{
+        struct list_head *p_list_head;
+
+        if (likely(p_rb_node)) {
+                p_list_head = &rb_entry(p_rb_node, struct vmap_area, rb_node)->list;
+                if (&p_rb_node->rb_right == p_rb_link) {
+                        *next = p_list_head->next;
+                        *prev = p_list_head;
+                } else {
+                        *prev = p_list_head->prev;
+                        *next = p_list_head;
+                }
+        } else {
+                /* Not expected to happen, but handle it just in case. */
+                *next = *prev = &free_vmap_area_list;
+        }
+}
 
-        rb_link_node(&va->rb_node, parent, p);
-        rb_insert_color(&va->rb_node, &vmap_area_root);
+static inline void __link_va(struct vmap_area *va, struct rb_root *root,
+        struct rb_node *parent, struct rb_node **p_link, struct list_head *head)
+{
+        struct rb_node *tmp;
+
+        rb_link_node(&va->rb_node, parent, p_link);
+        rb_insert_color(&va->rb_node, root);
 
         /* address-sort this list */
         tmp = rb_prev(&va->rb_node);
@@ -388,13 +437,239 @@ static void __insert_vmap_area(struct vmap_area *va)
                 prev = rb_entry(tmp, struct vmap_area, rb_node);
                 list_add_rcu(&va->list, &prev->list);
         } else
-                list_add_rcu(&va->list, &vmap_area_list);
+                list_add_rcu(&va->list, head);
+}
+
+static void __insert_vmap_area(struct vmap_area *va,
+        struct rb_root *root, struct list_head *head)
+{
+        struct rb_node **p_link;
+        struct rb_node *parent;
+
+        __find_va_slot(va, root, &parent, &p_link);
+        __link_va(va, root, parent, p_link, head);
 }
 
 static void purge_vmap_area_lazy(void);
 
 static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
 
+static inline unsigned long
+__va_size(struct vmap_area *va)
+{
+        return va->va_end - va->va_start;
+}
+
+static inline void
+__remove_free_va_area(struct vmap_area *va)
+{
+        /*
+         * Remove the VA from the address-sorted tree/list.
+         * Check whether its rb_node is empty, since this
+         * function is used as a common interface to
+         * destroy a vmap_area.
+         */
+        if (!RB_EMPTY_NODE(&va->rb_node)) {
+                rb_erase(&va->rb_node, &free_vmap_area_root);
+                list_del_rcu(&va->list);
+        }
+
+        /*
+         * Lazy free.
+         */
+        kfree_rcu(va, rcu_head);
+}
+
+/*
+ * Merge a de-allocated chunk of VA memory with the previous
+ * and next free blocks. A pointer to the new merged area is
+ * returned if coalescing is done, otherwise the VA itself is
+ * returned after it has been inserted.
+ */
+static inline struct vmap_area *
+__merge_add_free_va_area(struct vmap_area *va,
+        struct rb_root *root, struct list_head *head)
+{
+        struct vmap_area *sibling;
+        struct list_head *next, *prev;
+        struct rb_node **p_link;
+        struct rb_node *parent;
+        bool merged = false;
+
+        /*
+         * Find a place in the tree where VA potentially will be
+         * inserted, unless it is merged with its sibling/siblings.
+         */
+        __find_va_slot(va, root, &parent, &p_link);
+
+        /*
+         * Get next/prev nodes of VA to check if merging can be done.
+         */
+        __find_va_siblings(parent, p_link, &next, &prev);
+
+        /*
+         * start            end
+         * |                |
+         * |<------VA------>|<-----Next----->|
+         *                  |                |
+         *                  start            end
+         */
+        if (next != head) {
+                sibling = list_entry(next, struct vmap_area, list);
+                if (sibling->va_start == va->va_end) {
+                        sibling->va_start = va->va_start;
+                        __remove_free_va_area(va);
+
+                        /* Point to the new merged area. */
+                        va = sibling;
+                        merged = true;
+                }
+        }
+
+        /*
+         *                  start            end
+         *                  |                |
+         * |<-----Prev----->|<------VA------>|
+         * |                |
+         * start            end
+         */
+        if (prev != head) {
+                sibling = list_entry(prev, struct vmap_area, list);
+                if (sibling->va_end == va->va_start) {
+                        sibling->va_end = va->va_end;
+                        __remove_free_va_area(va);
+
+                        /* Point to the new merged area. */
+                        va = sibling;
+                        merged = true;
+                }
+        }
+
+        if (!merged)
+                __link_va(va, root, parent, p_link, head);
+
+        return va;
+}
+
+static inline unsigned long
+alloc_vmalloc_area(unsigned long size, unsigned long align,
+        unsigned long vstart, unsigned long vend,
+        int node, gfp_t gfp_mask)
+{
+        struct vmap_area *b_fit = NULL;         /* best fit */
+        struct vmap_area *le_fit = NULL;        /* left-edge fit */
+        struct vmap_area *re_fit = NULL;        /* right-edge fit */
+        struct vmap_area *ne_fit = NULL;        /* no edge fit */
+        struct vmap_area *va = last_free_va_chunk;
+        unsigned long nva_start_addr;
+
+        if (!last_free_va_chunk || size <= free_va_max_size ||
+                        vstart < last_alloc_vstart || align < last_alloc_align) {
+                va = list_first_entry(&free_vmap_area_list, struct vmap_area, list);
+                free_va_max_size = 0;
+                last_free_va_chunk = NULL;
+        }
+
+        nva_start_addr = ALIGN(vstart, align);
+        list_for_each_entry_from(va, &free_vmap_area_list, list) {
+                if (va->va_start > vstart)
+                        nva_start_addr = ALIGN(va->va_start, align);
+
+                /* VA does not fit the requested parameters. */
+                if (nva_start_addr + size > va->va_end) {
+                        free_va_max_size = max(free_va_max_size, __va_size(va));
+                        continue;
+                }
+
+                /* Nothing has been found, give up. */
+                if (nva_start_addr + size > vend)
+                        break;
+
+                /* Classify what we have found. */
+                if (va->va_start == nva_start_addr) {
+                        if (va->va_end == nva_start_addr + size)
+                                b_fit = va;
+                        le_fit = va;
+                } else if (va->va_end == nva_start_addr + size) {
+                        re_fit = va;
+                } else {
+                        ne_fit = va;
+                }
+
+                last_free_va_chunk = va;
+                last_alloc_vstart = vstart;
+                last_alloc_align = align;
+                break;
+        }
+
+        if (b_fit) {
+                /*
+                 * No need to split VA, it fully fits.
+                 *
+                 * |               |
+                 * V      NVA      V
+                 * |---------------|
+                 */
+                if (b_fit->list.prev != &free_vmap_area_list)
+                        last_free_va_chunk = list_prev_entry(b_fit, list);
+                else
+                        last_free_va_chunk = NULL;
+
+                __remove_free_va_area(b_fit);
+        } else if (le_fit) {
+                /*
+                 * Split left edge fit VA.
+                 *
+                 * |       |
+                 * V  NVA  V
+                 * |-------|-------|
+                 */
+                le_fit->va_start += size;
+        } else if (re_fit) {
+                /*
+                 * Split right edge fit VA.
+                 *
+                 *         |       |
+                 *         V  NVA  V
+                 * |-------|-------|
+                 */
+                re_fit->va_end = nva_start_addr;
+        } else if (ne_fit) {
+                /*
+                 * Split no edge fit VA.
+                 *
+                 *     |       |
+                 *     V  NVA  V
+                 * |---|-------|---|
+                 */
+                va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
+                if (unlikely(!va))
+                        return VMALLOC_END;
+
+                /*
+                 * Build right area of VA.
+                 */
+                va->va_start = nva_start_addr + size;
+                va->va_end = ne_fit->va_end;
+
+                /*
+                 * Build left area of VA.
+                 */
+                ne_fit->va_end = nva_start_addr;
+
+                /*
+                 * Add newly built right area to the address sorted list.
+                 */
+                __insert_vmap_area(va,
+                        &free_vmap_area_root, &free_vmap_area_list);
+        } else {
+                /* Not found. */
+                nva_start_addr = VMALLOC_END;
+        }
+
+        return nva_start_addr;
+}
+
 /*
  * Allocate a region of KVA of the specified size and alignment, within the
  * vstart and vend.
@@ -409,6 +684,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
         unsigned long addr;
         int purged = 0;
         struct vmap_area *first;
+        bool is_vmalloc_allocation;
 
         BUG_ON(!size);
         BUG_ON(offset_in_page(size));
@@ -426,9 +702,22 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
          * to avoid false negatives.
          */
         kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
+        is_vmalloc_allocation = is_vmalloc_addr((void *)vstart);
 
 retry:
         spin_lock(&vmap_area_lock);
+        if (is_vmalloc_allocation) {
+                addr = alloc_vmalloc_area(size, align,
+                        vstart, vend, node, gfp_mask);
+
+                /*
+                 * If an allocation fails, the VMALLOC_END address is
+                 * returned. Therefore, the overflow path below will be
+                 * triggered.
+                 */
+                goto found;
+        }
+
         /*
          * Invalidate cache if we have more permissive parameters.
          * cached_hole_size notes the largest hole noticed _below_
@@ -504,8 +793,11 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
         va->va_start = addr;
         va->va_end = addr + size;
         va->flags = 0;
-        __insert_vmap_area(va);
-        free_vmap_cache = &va->rb_node;
+        __insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+
+        if (!is_vmalloc_allocation)
+                free_vmap_cache = &va->rb_node;
+
         spin_unlock(&vmap_area_lock);
 
         BUG_ON(!IS_ALIGNED(va->va_start, align));
@@ -552,9 +844,14 @@ EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
 
 static void __free_vmap_area(struct vmap_area *va)
 {
+        unsigned long last_free_va_start = 0;
+        bool is_vmalloc_area;
+
         BUG_ON(RB_EMPTY_NODE(&va->rb_node));
+        is_vmalloc_area = (va->va_end > VMALLOC_START &&
+                va->va_end <= VMALLOC_END);
 
-        if (free_vmap_cache) {
+        if (!is_vmalloc_area && free_vmap_cache) {
                 if (va->va_end < cached_vstart) {
                         free_vmap_cache = NULL;
                 } else {
@@ -573,16 +870,39 @@ static void __free_vmap_area(struct vmap_area *va)
         RB_CLEAR_NODE(&va->rb_node);
         list_del_rcu(&va->list);
 
-        /*
-         * Track the highest possible candidate for pcpu area
-         * allocation. Areas outside of vmalloc area can be returned
-         * here too, consider only end addresses which fall inside
-         * vmalloc area proper.
-         */
-        if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
+        if (is_vmalloc_area) {
+                /*
+                 * Track the highest possible candidate for pcpu area
+                 * allocation. Areas outside of vmalloc area can be returned
+                 * here too, consider only end addresses which fall inside
+                 * vmalloc area proper.
+                 */
                 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
 
-        kfree_rcu(va, rcu_head);
+                if (last_free_va_chunk)
+                        last_free_va_start = last_free_va_chunk->va_start;
+
+                /*
+                 * Merge VA with its neighbors, otherwise add it.
+                 */
+                va = __merge_add_free_va_area(va,
+                        &free_vmap_area_root, &free_vmap_area_list);
+
+                /*
+                 * Update the search criteria if merging/inserting was
+                 * done before the last_free_va_chunk va_start address.
+                 */
+                if (last_free_va_start) {
+                        if (va->va_start <= last_free_va_start) {
+                                if (va->list.prev != &free_vmap_area_list)
+                                        last_free_va_chunk = list_prev_entry(va, list);
+                                else
+                                        last_free_va_chunk = NULL;
+                        }
+                }
+        } else {
+                kfree_rcu(va, rcu_head);
+        }
 }
 
 /*
@@ -1253,7 +1573,7 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)
 
 void __init vmalloc_init(void)
 {
-        struct vmap_area *va;
+        struct vmap_area *va, *prev_va;
         struct vm_struct *tmp;
         int i;
 
@@ -1269,16 +1589,62 @@ void __init vmalloc_init(void)
                 INIT_WORK(&p->wq, free_work);
         }
 
+        /*
+         * Build free areas.
+         */
+        va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
+        va->va_start = (unsigned long) VMALLOC_START;
+
+        if (!vmlist)
+                va->va_end = (unsigned long) VMALLOC_END;
+        else
+                va->va_end = (unsigned long) vmlist->addr;
+
+        __insert_vmap_area(va,
+                &free_vmap_area_root, &free_vmap_area_list);
+
+        if (!vmlist)
+                goto build_free_area_done;
+
         /* Import existing vmlist entries. */
-        for (tmp = vmlist; tmp; tmp = tmp->next) {
+        for (tmp = vmlist, prev_va = NULL; tmp; tmp = tmp->next) {
+                struct vmap_area *free_area;
+
                 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
                 va->flags = VM_VM_AREA;
                 va->va_start = (unsigned long)tmp->addr;
                 va->va_end = va->va_start + tmp->size;
                 va->vm = tmp;
-                __insert_vmap_area(va);
+                __insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+
+                /*
+                 * Check if there is padding between the previous and current entries.
+                 */
+                if (prev_va && (va->va_start - prev_va->va_end) > 0) {
+                        free_area = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
+                        free_area->va_start = prev_va->va_end;
+                        free_area->va_end = va->va_start;
+
+                        __insert_vmap_area(free_area,
+                                &free_vmap_area_root, &free_vmap_area_list);
+                }
+
+                /*
+                 * Handle the last entry, building the remaining free space.
+                 */
+                if (!tmp->next) {
+                        free_area = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
+                        free_area->va_start = va->va_end;
+                        free_area->va_end = (unsigned long) VMALLOC_END;
+
+                        __insert_vmap_area(free_area,
+                                &free_vmap_area_root, &free_vmap_area_list);
+                }
+
+                prev_va = va;
         }
 
+build_free_area_done:
         vmap_area_pcpu_hole = VMALLOC_END;
 
         vmap_initialized = true;
@@ -2604,7 +2970,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
                 va->va_start = base + offsets[area];
                 va->va_end = va->va_start + sizes[area];
-                __insert_vmap_area(va);
+                __insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
         }
 
         vmap_area_pcpu_hole = base + offsets[last_area];
-- 
2.11.0
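For illustration only (not part of the patch): below is a minimal,
stand-alone user-space sketch of the de-allocation path described in the
changelog, i.e. an address-sorted list of free blocks where a freed range
is coalesced with its prev/next neighbors when they are adjacent. A linear
search stands in for the red-black tree lookup, and the names used here
(free_block, free_range) are made up for this sketch; the allocation side
would split a found block in the converse way.

/*
 * Simplified model of free-area coalescing: keep free blocks sorted by
 * address and merge a freed range with adjacent neighbors.
 */
#include <stdio.h>
#include <stdlib.h>

struct free_block {
        unsigned long start;
        unsigned long end;
        struct free_block *prev, *next;
};

static struct free_block *head;        /* address-sorted list of free blocks */

/* Return the range [start, end) to the free list, merging with neighbors. */
static void free_range(unsigned long start, unsigned long end)
{
        struct free_block *pos = head, *prev = NULL;
        struct free_block *nb;

        /* Find the first free block that starts at or after the freed range. */
        while (pos && pos->start < start) {
                prev = pos;
                pos = pos->next;
        }

        /* Merge with the previous block if it ends where the range starts. */
        if (prev && prev->end == start) {
                prev->end = end;

                /* The gap may now be closed: also merge with the next block. */
                if (pos && pos->start == prev->end) {
                        prev->end = pos->end;
                        prev->next = pos->next;
                        if (pos->next)
                                pos->next->prev = prev;
                        free(pos);
                }
                return;
        }

        /* Merge with the next block if it starts where the range ends. */
        if (pos && pos->start == end) {
                pos->start = start;
                return;
        }

        /* No adjacent neighbor: insert a new free block between prev and pos. */
        nb = malloc(sizeof(*nb));
        nb->start = start;
        nb->end = end;
        nb->prev = prev;
        nb->next = pos;
        if (prev)
                prev->next = nb;
        else
                head = nb;
        if (pos)
                pos->prev = nb;
}

int main(void)
{
        struct free_block *b;

        free_range(0x1000, 0x2000);
        free_range(0x3000, 0x4000);
        free_range(0x2000, 0x3000);        /* closes the gap between the two */

        /* Prints a single coalesced block: [0x1000 - 0x4000) */
        for (b = head; b; b = b->next)
                printf("free block: [0x%lx - 0x%lx)\n", b->start, b->end);

        return 0;
}

Built with a plain C compiler, the example ends up with one coalesced
free block after the three frees, which is the effect the patch relies on
to keep the free list short.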