On 2025-01-04 2:24 AM, Zi Yan wrote:
> Now that page copies are batched, multi-threaded page copy can be used
> to increase page copy throughput. Add copy_page_lists_mt() to copy
> pages in a multi-threaded manner. Empirical data show more than 32 base
> pages are needed to show the benefit of using multi-threaded page copy,
> so use 32 as the threshold.
>
> Signed-off-by: Zi Yan <ziy@xxxxxxxxxx>
> ---
>  include/linux/migrate.h |   3 +
>  mm/Makefile             |   2 +-
>  mm/copy_pages.c         | 186 ++++++++++++++++++++++++++++++++++++++++
>  mm/migrate.c            |  19 ++--
>  4 files changed, 199 insertions(+), 11 deletions(-)
>  create mode 100644 mm/copy_pages.c
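
A small changelog nit: it might help to spell out where the 32-page
threshold is meant to be enforced. I read it as the caller picking
between the two copy paths roughly like below (just my reading of the
intent; the 32 literal is from the changelog, and copy_page_lists_seq()
is a name I made up for the existing non-MT path, not from this
series):

	if (nr_pages >= 32)
		rc = copy_page_lists_mt(dst_folios, src_folios, nr_pages);
	else
		rc = copy_page_lists_seq(dst_folios, src_folios, nr_pages);
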
[...snip...]
> +++ b/mm/copy_pages.c
> @@ -0,0 +1,186 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Parallel page copy routine.
> + */
> +
> +#include <linux/sysctl.h>
> +#include <linux/highmem.h>
> +#include <linux/workqueue.h>
> +#include <linux/slab.h>
> +#include <linux/migrate.h>
> +
> +
> +unsigned int limit_mt_num = 4;
> +
> +struct copy_item {
> +	char *to;
> +	char *from;
> +	unsigned long chunk_size;
> +};
> +
> +struct copy_page_info {
> +	struct work_struct copy_page_work;
> +	unsigned long num_items;
> +	struct copy_item item_list[];
> +};
> +
> +static void copy_page_routine(char *vto, char *vfrom,
> +		unsigned long chunk_size)
> +{
> +	memcpy(vto, vfrom, chunk_size);
> +}
> +
> +static void copy_page_work_queue_thread(struct work_struct *work)
> +{
> +	struct copy_page_info *my_work = (struct copy_page_info *)work;
> +	int i;
> +
> +	for (i = 0; i < my_work->num_items; ++i)
> +		copy_page_routine(my_work->item_list[i].to,
> +				  my_work->item_list[i].from,
> +				  my_work->item_list[i].chunk_size);
> +}
> +
> +int copy_page_lists_mt(struct list_head *dst_folios,
> +		struct list_head *src_folios, int nr_items)
> +{
> +	int err = 0;
> +	unsigned int total_mt_num = limit_mt_num;
> +	int to_node = folio_nid(list_first_entry(dst_folios, struct folio, lru));
> +	int i;
> +	struct copy_page_info *work_items[32] = {0};
> +	const struct cpumask *per_node_cpumask = cpumask_of_node(to_node);
What happens here if to_node is a NUMA node without CPUs (e.g. a CXL
memory node)? And even with a NUMA node that has CPUs, I think
offloading copies to CPUs of either the "from" node or the "to" node
ends up with a CPU touching two pages in two different NUMA nodes
anyway, one page in the local node and the other in the remote node.
In that sense, I don't understand when push_0_pull_1 (introduced in
patch 5) should be 0 or 1. Am I missing something?
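
If the CPU-less case is not handled elsewhere, I would expect a guard
along these lines to be needed before picking CPUs (completely
untested, and returning -ENODEV is just a placeholder so the caller
could fall back to the regular copy path):

	/* to_node may be a memory-only node (e.g. CXL) with no local CPUs. */
	if (!node_state(to_node, N_CPU) || cpumask_empty(per_node_cpumask))
		return -ENODEV;
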
> +	int cpu_id_list[32] = {0};
> +	int cpu;
> +	int max_items_per_thread;
> +	int item_idx;
> +	struct folio *src, *src2, *dst, *dst2;
> +
> +	total_mt_num = min_t(unsigned int, total_mt_num,
> +			cpumask_weight(per_node_cpumask));
> +
> +	if (total_mt_num > 32)
> +		total_mt_num = 32;
> +
> +	/* Each threads get part of each page, if nr_items < totla_mt_num */
> +	if (nr_items < total_mt_num)
> +		max_items_per_thread = nr_items;
> +	else
> +		max_items_per_thread = (nr_items / total_mt_num) +
> +			((nr_items % total_mt_num) ? 1 : 0);
> +
> +
> +	for (cpu = 0; cpu < total_mt_num; ++cpu) {
> +		work_items[cpu] = kzalloc(sizeof(struct copy_page_info) +
> +			sizeof(struct copy_item) * max_items_per_thread,
> +			GFP_NOWAIT);
> +
> +		if (!work_items[cpu]) {
> +			err = -ENOMEM;
> +			goto free_work_items;
> +		}
> +	}
[...snip...]
> +
> +	/* Wait until it finishes */
> +	for (i = 0; i < total_mt_num; ++i)
> +		flush_work((struct work_struct *)work_items[i]);
> +
> +free_work_items:
> +	for (cpu = 0; cpu < total_mt_num; ++cpu)
> +		kfree(work_items[cpu]);
> +
> +	return err;
Should the kernel retry the migration without multi-threading if it
fails to allocate memory here?
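
I was imagining something like this on the migrate.c side (sketch only;
copy_page_lists_seq() is again a made-up name for whatever the existing
single-threaded copy loop becomes):

	rc = copy_page_lists_mt(dst_folios, src_folios, nr_items);
	if (rc) {
		/*
		 * -ENOMEM from the work item allocation (or a CPU-less
		 * target node): fall back to the plain copy path.
		 */
		rc = copy_page_lists_seq(dst_folios, src_folios, nr_items);
	}

---
Hyeonggon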