On 2024/1/5 18:50, Uladzislau Rezki wrote:
> Hello, Wen Gu.

Hi Uladzislau Rezki,
<...>
>> Fortunately, thank you for this patch set, the global vmap_area_lock was
>> removed and a per-node lock, vn->busy.lock, is introduced. It is really
>> helpful:
>>
>> In a 48-CPU qemu environment, the Requests/sec increased by 5 times:
>> - nginx
>> - wrk -c 1000 -t 96 -d 30 http://127.0.0.1:80
>>
>>                vzalloced shmem    vzalloced shmem(with this patch set)
>> Requests/sec   113536.56          583729.93
>>
> Thank you for the confirmation that your workload is improved. The "nginx"
> is 5 times better!
Yes, thank you very much for the improvement!
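To make the comparison below concrete: the two configurations differ only in
how the shared-memory buffer is allocated. Roughly like this (a hypothetical
helper; the name is illustrative, not from the real code):

#include <linux/slab.h>
#include <linux/vmalloc.h>

/* Illustrative only: the benchmark's "vzalloced" vs "kzalloced" buffers. */
static void *shmem_buf_alloc(size_t size, bool virt_contig)
{
        if (virt_contig)
                return vzalloc(size);           /* "vzalloced shmem" */
        return kzalloc(size, GFP_KERNEL);       /* "kzalloced shmem" */
}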
>> But it also has some overhead, compared to using kzalloced shared memory
>> or unsetting CONFIG_HARDENED_USERCOPY, neither of which involves finding
>> the vmap area:
>>
>>                kzalloced shmem    vzalloced shmem(unset CONFIG_HARDENED_USERCOPY)
>> Requests/sec   831950.39          805164.78
>>
> The CONFIG_HARDENED_USERCOPY prevents copying "wrong" memory regions. That
> is why, if it is vmalloced memory, it wants to make sure that is really
> true; if not, the user-copy is aborted. So there is extra work that
> involves finding a VA associated with an address.
Yes, and lock contention in finding VA is likely to be a performance bottleneck, which is mitigated a lot by your work.
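For reference, here is my understanding of where that extra work happens: a
simplified sketch of the hardened-usercopy vmalloc check, based on my reading
of mm/usercopy.c (not verbatim kernel code; the helper name is hypothetical):

#include <linux/mm.h>
#include <linux/vmalloc.h>

static void check_vmalloc_object(const void *ptr, unsigned long n,
                                 bool to_user)
{
        unsigned long addr = (unsigned long)ptr;
        struct vmap_area *area;

        if (!is_vmalloc_addr(ptr))
                return;

        /*
         * Looks up the busy tree of the vmap node owning this address,
         * taking vn->busy.lock. This runs on every copy to/from a
         * vmalloced buffer, which is why the lock shows up so hot.
         */
        area = find_vmap_area(addr);
        if (!area)
                usercopy_abort("vmalloc", "no area", to_user, 0, n);

        /* Abort if the copy would run past the end of the mapping. */
        if (n > area->va_end - addr)
                usercopy_abort("vmalloc", NULL, to_user,
                               addr - area->va_start, n);
}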
>> So, as a newbie in Linux-mm, I would like to ask for some suggestions:
>>
>> Is it possible to further eliminate the overhead caused by lock contention
>> in find_vmap_area() in this scenario (maybe this is asking too much), or
>> is the only way out not setting CONFIG_HARDENED_USERCOPY or not using a
>> vzalloced buffer in situations where concurrent kernel/userspace copies
>> happen?
>>
> Could you please try the patch below and see if it improves this series
> further? Just in case:
Thank you! I tried the patch, and it seems that the wait for the rwlock_t
also exists, as much as with the spinlock_t. (The flamegraph is attached.
I am not sure why the read_lock waits so long, given that there is no
frequent write_lock competition.)

               vzalloced shmem(spinlock_t)   vzalloced shmem(rwlock_t)
Requests/sec   583729.93                     460007.44

So I guess the overhead in finding the vmap area is inevitable here, and the
original spin_lock is fine in this series.

Thanks again for your help!

Best regards,
Wen Gu
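P.S. One plausible explanation for the long read_lock waits (my assumption,
not verified): even an uncontended read_lock()/read_unlock() pair performs
an atomic read-modify-write on the shared lock word, so with ~100 threads
hitting the same lock the cacheline bounces between CPUs much as it does for
a spinlock. A minimal userspace sketch of that access pattern (hypothetical
pthreads microbenchmark, illustrative only, not kernel qrwlock code):

/* build: gcc -O2 -pthread rwbench.c; run: time ./a.out [rwlock] */
#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define NTHREADS 96
#define ITERS    1000000

static pthread_rwlock_t rwlock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_spinlock_t spinlock;
static int use_rwlock;
static volatile long shared;    /* stands in for the busy-tree lookup */

static void *worker(void *arg)
{
        long sum = 0;

        for (int i = 0; i < ITERS; i++) {
                if (use_rwlock) {
                        /* read side still does an atomic RMW on the
                         * shared lock word, bouncing its cacheline */
                        pthread_rwlock_rdlock(&rwlock);
                        sum += shared;
                        pthread_rwlock_unlock(&rwlock);
                } else {
                        pthread_spin_lock(&spinlock);
                        sum += shared;
                        pthread_spin_unlock(&spinlock);
                }
        }
        return (void *)sum;
}

int main(int argc, char **argv)
{
        pthread_t tid[NTHREADS];

        use_rwlock = argc > 1 && !strcmp(argv[1], "rwlock");
        pthread_spin_init(&spinlock, PTHREAD_PROCESS_PRIVATE);

        for (int i = 0; i < NTHREADS; i++)
                pthread_create(&tid[i], NULL, worker, NULL);
        for (int i = 0; i < NTHREADS; i++)
                pthread_join(tid[i], NULL);

        printf("%s done\n", use_rwlock ? "rwlock" : "spinlock");
        return 0;
}

Timing the two modes under enough threads should show the read-side cost;
again, only a sketch, not a claim about the kernel's rwlock internals.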
> <snip>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index e30dabf68263..40acf53cadfb 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -772,7 +772,7 @@ static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
>  struct rb_list {
>          struct rb_root root;
>          struct list_head head;
> -        spinlock_t lock;
> +        rwlock_t lock;
>  };
>
>  struct vmap_pool {
> @@ -947,19 +947,19 @@ find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
>          for (i = 0; i < nr_vmap_nodes; i++) {
>                  vn = &vmap_nodes[i];
>
> -                spin_lock(&vn->busy.lock);
> +                read_lock(&vn->busy.lock);
>                  va_lowest = __find_vmap_area_exceed_addr(addr, &vn->busy.root);
>                  if (va_lowest) {
>                          if (!va_node || va_lowest->va_start < (*va)->va_start) {
>                                  if (va_node)
> -                                        spin_unlock(&va_node->busy.lock);
> +                                        read_unlock(&va_node->busy.lock);
>
>                                  *va = va_lowest;
>                                  va_node = vn;
>                                  continue;
>                          }
>                  }
> -                spin_unlock(&vn->busy.lock);
> +                read_unlock(&vn->busy.lock);
>          }
>
>          return va_node;
> @@ -1695,9 +1695,9 @@ static void free_vmap_area(struct vmap_area *va)
>          /*
>           * Remove from the busy tree/list.
>           */
> -        spin_lock(&vn->busy.lock);
> +        write_lock(&vn->busy.lock);
>          unlink_va(va, &vn->busy.root);
> -        spin_unlock(&vn->busy.lock);
> +        write_unlock(&vn->busy.lock);
>
>          /*
>           * Insert/Merge it back to the free tree/list.
> @@ -1901,9 +1901,9 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
>
>          vn = addr_to_node(va->va_start);
>
> -        spin_lock(&vn->busy.lock);
> +        write_lock(&vn->busy.lock);
>          insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
> -        spin_unlock(&vn->busy.lock);
> +        write_unlock(&vn->busy.lock);
>
>          BUG_ON(!IS_ALIGNED(va->va_start, align));
>          BUG_ON(va->va_start < vstart);
> @@ -2123,10 +2123,10 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
>                  if (RB_EMPTY_ROOT(&vn->lazy.root))
>                          continue;
>
> -                spin_lock(&vn->lazy.lock);
> +                write_lock(&vn->lazy.lock);
>                  WRITE_ONCE(vn->lazy.root.rb_node, NULL);
>                  list_replace_init(&vn->lazy.head, &vn->purge_list);
> -                spin_unlock(&vn->lazy.lock);
> +                write_unlock(&vn->lazy.lock);
>
>                  start = min(start,
>                          list_first_entry(&vn->purge_list,
>                                  struct vmap_area, list)->va_start);
> @@ -2223,9 +2223,9 @@ static void free_vmap_area_noflush(struct vmap_area *va)
>          vn = is_vn_id_valid(vn_id) ?
>                  id_to_node(vn_id):addr_to_node(va->va_start);
>
> -        spin_lock(&vn->lazy.lock);
> +        write_lock(&vn->lazy.lock);
>          insert_vmap_area(va, &vn->lazy.root, &vn->lazy.head);
> -        spin_unlock(&vn->lazy.lock);
> +        write_unlock(&vn->lazy.lock);
>
>          trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);
> @@ -2272,9 +2272,9 @@ struct vmap_area *find_vmap_area(unsigned long addr)
>          do {
>                  vn = &vmap_nodes[i];
>
> -                spin_lock(&vn->busy.lock);
> +                read_lock(&vn->busy.lock);
>                  va = __find_vmap_area(addr, &vn->busy.root);
> -                spin_unlock(&vn->busy.lock);
> +                read_unlock(&vn->busy.lock);
>
>                  if (va)
>                          return va;
> @@ -2293,11 +2293,11 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
>          do {
>                  vn = &vmap_nodes[i];
>
> -                spin_lock(&vn->busy.lock);
> +                write_lock(&vn->busy.lock);
>                  va = __find_vmap_area(addr, &vn->busy.root);
>                  if (va)
>                          unlink_va(va, &vn->busy.root);
> -                spin_unlock(&vn->busy.lock);
> +                write_unlock(&vn->busy.lock);
>
>                  if (va)
>                          return va;
> @@ -2514,9 +2514,9 @@ static void free_vmap_block(struct vmap_block *vb)
>          BUG_ON(tmp != vb);
>
>          vn = addr_to_node(vb->va->va_start);
> -        spin_lock(&vn->busy.lock);
> +        write_lock(&vn->busy.lock);
>          unlink_va(vb->va, &vn->busy.root);
> -        spin_unlock(&vn->busy.lock);
> +        write_unlock(&vn->busy.lock);
>
>          free_vmap_area_noflush(vb->va);
>          kfree_rcu(vb, rcu_head);
> @@ -2942,9 +2942,9 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
>  {
>          struct vmap_node *vn = addr_to_node(va->va_start);
>
> -        spin_lock(&vn->busy.lock);
> +        read_lock(&vn->busy.lock);
>          setup_vmalloc_vm_locked(vm, va, flags, caller);
> -        spin_unlock(&vn->busy.lock);
> +        read_unlock(&vn->busy.lock);
>  }
>
>  static void clear_vm_uninitialized_flag(struct vm_struct *vm)
> @@ -4214,19 +4214,19 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
>
>          next_va:
>                  next = va->va_end;
> -                spin_unlock(&vn->busy.lock);
> +                read_unlock(&vn->busy.lock);
>          } while ((vn = find_vmap_area_exceed_addr_lock(next, &va)));
>
>  finished_zero:
>          if (vn)
> -                spin_unlock(&vn->busy.lock);
> +                read_unlock(&vn->busy.lock);
>
>          /* zero-fill memory holes */
>          return count - remains + zero_iter(iter, remains);
>  finished:
>          /* Nothing remains, or We couldn't copy/zero everything. */
>          if (vn)
> -                spin_unlock(&vn->busy.lock);
> +                read_unlock(&vn->busy.lock);
>
>          return count - remains;
>  }
> @@ -4563,11 +4563,11 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
>          for (area = 0; area < nr_vms; area++) {
>                  struct vmap_node *vn = addr_to_node(vas[area]->va_start);
>
> -                spin_lock(&vn->busy.lock);
> +                write_lock(&vn->busy.lock);
>                  insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
>                  setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
>                                          pcpu_get_vm_areas);
> -                spin_unlock(&vn->busy.lock);
> +                write_unlock(&vn->busy.lock);
>          }
>
>          /*
> @@ -4687,7 +4687,7 @@ bool vmalloc_dump_obj(void *object)
>
>          vn = addr_to_node((unsigned long)objp);
>
> -        if (spin_trylock(&vn->busy.lock)) {
> +        if (read_trylock(&vn->busy.lock)) {
>                  va = __find_vmap_area(addr, &vn->busy.root);
>
>                  if (va && va->vm) {
> @@ -4697,7 +4697,7 @@ bool vmalloc_dump_obj(void *object)
>                          success = true;
>                  }
>
> -                spin_unlock(&vn->busy.lock);
> +                read_unlock(&vn->busy.lock);
>          }
>
>          if (success)
> @@ -4742,13 +4742,13 @@ static void show_purge_info(struct seq_file *m)
>          for (i = 0; i < nr_vmap_nodes; i++) {
>                  vn = &vmap_nodes[i];
>
> -                spin_lock(&vn->lazy.lock);
> +                read_lock(&vn->lazy.lock);
>                  list_for_each_entry(va, &vn->lazy.head, list) {
>                          seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
>                                  (void *)va->va_start, (void *)va->va_end,
>                                  va->va_end - va->va_start);
>                  }
> -                spin_unlock(&vn->lazy.lock);
> +                read_unlock(&vn->lazy.lock);
>          }
>  }
> @@ -4762,7 +4762,7 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
>          for (i = 0; i < nr_vmap_nodes; i++) {
>                  vn = &vmap_nodes[i];
>
> -                spin_lock(&vn->busy.lock);
> +                read_lock(&vn->busy.lock);
>                  list_for_each_entry(va, &vn->busy.head, list) {
>                          if (!va->vm) {
>                                  if (va->flags & VMAP_RAM)
> @@ -4808,7 +4808,7 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
>                          show_numa_info(m, v);
>                          seq_putc(m, '\n');
>                  }
> -                spin_unlock(&vn->busy.lock);
> +                read_unlock(&vn->busy.lock);
>          }
>
>          /*
> @@ -4902,11 +4902,11 @@ static void vmap_init_nodes(void)
>                  vn = &vmap_nodes[n];
>                  vn->busy.root = RB_ROOT;
>                  INIT_LIST_HEAD(&vn->busy.head);
> -                spin_lock_init(&vn->busy.lock);
> +                rwlock_init(&vn->busy.lock);
>
>                  vn->lazy.root = RB_ROOT;
>                  INIT_LIST_HEAD(&vn->lazy.head);
> -                spin_lock_init(&vn->lazy.lock);
> +                rwlock_init(&vn->lazy.lock);
>
>                  for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
>                          INIT_LIST_HEAD(&vn->pool[i].head);
> <snip>
>
> Thank you!
>
> --
> Uladzislau Rezki
[Attachment: vzalloc_t96_improve_rwlock.svg (flamegraph, image/svg)]