On 2025-01-04 2:24 AM, Zi Yan wrote:
> Now that page copies are batched, multi-threaded page copy can be used
> to increase page copy throughput. Add copy_page_lists_mt() to copy
> pages in a multi-threaded manner. Empirical data show more than 32 base
> pages are needed to show the benefit of using multi-threaded page copy,
> so use 32 as the threshold.
>
> Signed-off-by: Zi Yan <ziy@xxxxxxxxxx>
> ---
>  include/linux/migrate.h |   3 +
>  mm/Makefile             |   2 +-
>  mm/copy_pages.c         | 186 ++++++++++++++++++++++++++++++++++++++++
>  mm/migrate.c            |  19 ++--
>  4 files changed, 199 insertions(+), 11 deletions(-)
>  create mode 100644 mm/copy_pages.c
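
A small changelog nit: it might help to spell out where the 32-page
threshold is meant to be enforced. I read it as the caller picking
between the two copy paths roughly like below (just my reading of the
intent; the 32 literal is from the changelog, and copy_page_lists_seq()
is a name I made up for the existing non-MT path, not from this
series):

	if (nr_pages >= 32)
		rc = copy_page_lists_mt(dst_folios, src_folios, nr_pages);
	else
		rc = copy_page_lists_seq(dst_folios, src_folios, nr_pages);
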
[...snip...]
> +++ b/mm/copy_pages.c
> @@ -0,0 +1,186 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Parallel page copy routine.
> + */
> +
> +#include <linux/sysctl.h>
> +#include <linux/highmem.h>
> +#include <linux/workqueue.h>
> +#include <linux/slab.h>
> +#include <linux/migrate.h>
> +
> +
> +unsigned int limit_mt_num = 4;
> +
> +struct copy_item {
> +	char *to;
> +	char *from;
> +	unsigned long chunk_size;
> +};
> +
> +struct copy_page_info {
> +	struct work_struct copy_page_work;
> +	unsigned long num_items;
> +	struct copy_item item_list[];
> +};
> +
> +static void copy_page_routine(char *vto, char *vfrom,
> +		unsigned long chunk_size)
> +{
> +	memcpy(vto, vfrom, chunk_size);
> +}
> +
> +static void copy_page_work_queue_thread(struct work_struct *work)
> +{
> +	struct copy_page_info *my_work = (struct copy_page_info *)work;
> +	int i;
> +
> +	for (i = 0; i < my_work->num_items; ++i)
> +		copy_page_routine(my_work->item_list[i].to,
> +				  my_work->item_list[i].from,
> +				  my_work->item_list[i].chunk_size);
> +}
> +
> +int copy_page_lists_mt(struct list_head *dst_folios,
> +		struct list_head *src_folios, int nr_items)
> +{
> +	int err = 0;
> +	unsigned int total_mt_num = limit_mt_num;
> +	int to_node = folio_nid(list_first_entry(dst_folios, struct folio, lru));
> +	int i;
> +	struct copy_page_info *work_items[32] = {0};
> +	const struct cpumask *per_node_cpumask = cpumask_of_node(to_node);
What happens here if to_node is a NUMA node without CPUs (e.g. a CXL
memory node)? And even with a NUMA node that has CPUs, I think
offloading copies to CPUs of either the "from" node or the "to" node
ends up with a CPU touching two pages in two different NUMA nodes
anyway, one page in the local node and the other in the remote node.
In that sense, I don't understand when push_0_pull_1 (introduced in
patch 5) should be 0 or 1. Am I missing something?
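
If the CPU-less case is not handled elsewhere, I would expect a guard
along these lines to be needed before picking CPUs (completely
untested, and returning -ENODEV is just a placeholder so the caller
could fall back to the regular copy path):

	/* to_node may be a memory-only node (e.g. CXL) with no local CPUs. */
	if (!node_state(to_node, N_CPU) || cpumask_empty(per_node_cpumask))
		return -ENODEV;
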
> +	int cpu_id_list[32] = {0};
> +	int cpu;
> +	int max_items_per_thread;
> +	int item_idx;
> +	struct folio *src, *src2, *dst, *dst2;
> +
> +	total_mt_num = min_t(unsigned int, total_mt_num,
> +			cpumask_weight(per_node_cpumask));
> +
> +	if (total_mt_num > 32)
> +		total_mt_num = 32;
> +
> +	/* Each threads get part of each page, if nr_items < totla_mt_num */
> +	if (nr_items < total_mt_num)
> +		max_items_per_thread = nr_items;
> +	else
> +		max_items_per_thread = (nr_items / total_mt_num) +
> +			((nr_items % total_mt_num) ? 1 : 0);
> +
> +
> +	for (cpu = 0; cpu < total_mt_num; ++cpu) {
> +		work_items[cpu] = kzalloc(sizeof(struct copy_page_info) +
> +			sizeof(struct copy_item) * max_items_per_thread,
> +			GFP_NOWAIT);
> +
> +		if (!work_items[cpu]) {
> +			err = -ENOMEM;
> +			goto free_work_items;
> +		}
> +	}
[...snip...]
> +
> +	/* Wait until it finishes */
> +	for (i = 0; i < total_mt_num; ++i)
> +		flush_work((struct work_struct *)work_items[i]);
> +
> +free_work_items:
> +	for (cpu = 0; cpu < total_mt_num; ++cpu)
> +		kfree(work_items[cpu]);
> +
> +	return err;
Should the kernel retry the migration without multi-threading if it
fails to allocate memory here?
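
I was imagining something like this on the migrate.c side (sketch only;
copy_page_lists_seq() is again a made-up name for whatever the existing
single-threaded copy loop becomes):

	rc = copy_page_lists_mt(dst_folios, src_folios, nr_items);
	if (rc) {
		/*
		 * -ENOMEM from the work item allocation (or a CPU-less
		 * target node): fall back to the plain copy path.
		 */
		rc = copy_page_lists_seq(dst_folios, src_folios, nr_items);
	}

---
Hyeonggon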