[RFC PATCH 2/6] mm: make Selective KSM synchronous

Make Selective KSM synchronous by introducing the following sysfs file,
which carries out merging on the specified memory region in the caller's
context and eliminates the need for ksmd to run in the background.

echo "pid start_addr end_addr" > /sys/kernel/mm/ksm/trigger_merge

Signed-off-by: Sourav Panda <souravpanda@xxxxxxxxxx>
---
 mm/ksm.c | 317 +++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 271 insertions(+), 46 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index 8be2b144fefd..b2f184557ed9 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -290,16 +290,18 @@ static unsigned int zero_checksum __read_mostly;
 /* Whether to merge empty (zeroed) pages with actual zero pages */
 static bool ksm_use_zero_pages __read_mostly;
 
-/* Skip pages that couldn't be de-duplicated previously */
-/* Default to true at least temporarily, for testing */
-static bool ksm_smart_scan = true;
-
 /* The number of zero pages which is placed by KSM */
 atomic_long_t ksm_zero_pages = ATOMIC_LONG_INIT(0);
 
 /* The number of pages that have been skipped due to "smart scanning" */
 static unsigned long ksm_pages_skipped;
 
+#ifndef CONFIG_SELECTIVE_KSM /* advisor immaterial if there is no scanning */
+
+/* Skip pages that couldn't be de-duplicated previously */
+/* Default to true at least temporarily, for testing */
+static bool ksm_smart_scan = true;
+
 /* Don't scan more than max pages per batch. */
 static unsigned long ksm_advisor_max_pages_to_scan = 30000;
 
@@ -465,6 +467,7 @@ static void advisor_stop_scan(void)
 	if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
 		scan_time_advisor();
 }
+#endif /* !CONFIG_SELECTIVE_KSM */
 
 #ifdef CONFIG_NUMA
 /* Zeroed when merging across nodes is not allowed */
@@ -957,6 +960,25 @@ static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node,
 	return NULL;
 }
 
+static unsigned char get_rmap_item_age(struct ksm_rmap_item *rmap_item)
+{
+#ifdef CONFIG_SELECTIVE_KSM /* age is immaterial in selective ksm */
+	return 0;
+#else
+	unsigned char age;
+	/*
+	 * Usually ksmd can and must skip the rb_erase, because
+	 * root_unstable_tree was already reset to RB_ROOT.
+	 * But be careful when an mm is exiting: do the rb_erase
+	 * if this rmap_item was inserted by this scan, rather
+	 * than left over from before.
+	 */
+	age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
+	WARN_ON_ONCE(age > 1);
+	return age;
+#endif /* CONFIG_SELECTIVE_KSM */
+}
+
 /*
  * Removing rmap_item from stable or unstable tree.
  * This function will clean the information from the stable/unstable tree.
@@ -991,16 +1013,7 @@ static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item)
 		rmap_item->address &= PAGE_MASK;
 
 	} else if (rmap_item->address & UNSTABLE_FLAG) {
-		unsigned char age;
-		/*
-		 * Usually ksmd can and must skip the rb_erase, because
-		 * root_unstable_tree was already reset to RB_ROOT.
-		 * But be careful when an mm is exiting: do the rb_erase
-		 * if this rmap_item was inserted by this scan, rather
-		 * than left over from before.
-		 */
-		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
-		BUG_ON(age > 1);
+		unsigned char age = get_rmap_item_age(rmap_item);
 		if (!age)
 			rb_erase(&rmap_item->node,
 				 root_unstable_tree + NUMA(rmap_item->nid));
@@ -2203,6 +2216,37 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item,
 	rmap_item->mm->ksm_merging_pages++;
 }
 
+#ifdef CONFIG_SELECTIVE_KSM
+static int update_checksum(struct page *page, struct ksm_rmap_item *rmap_item)
+{
+	/*
+	 * Typically KSM would wait for a second round to even consider
+	 * the page for unstable tree insertion to ascertain its stability.
+	 * Avoid this when using selective ksm.
+	 */
+	rmap_item->oldchecksum = calc_checksum(page);
+	return 0;
+}
+#else
+static int update_checksum(struct page *page, struct ksm_rmap_item *rmap_item)
+{
+	unsigned int checksum;
+	remove_rmap_item_from_tree(rmap_item);
+	/*
+	 * If the hash value of the page has changed from the last time
+	 * we calculated it, this page is changing frequently: therefore we
+	 * don't want to insert it in the unstable tree, and we don't want
+	 * to waste our time searching for something identical to it there.
+	 */
+	checksum = calc_checksum(page);
+	if (rmap_item->oldchecksum != checksum) {
+		rmap_item->oldchecksum = checksum;
+		return -EINVAL;
+	}
+	return 0;
+}
+#endif
+
 /*
  * cmp_and_merge_page - first see if page can be merged into the stable tree;
  * if not, compare checksum to previous and if it's the same, see if page can
@@ -2218,7 +2262,6 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
 	struct page *tree_page = NULL;
 	struct ksm_stable_node *stable_node;
 	struct folio *kfolio;
-	unsigned int checksum;
 	int err;
 	bool max_page_sharing_bypass = false;
 
@@ -2241,20 +2284,8 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
 		if (!is_page_sharing_candidate(stable_node))
 			max_page_sharing_bypass = true;
 	} else {
-		remove_rmap_item_from_tree(rmap_item);
-
-		/*
-		 * If the hash value of the page has changed from the last time
-		 * we calculated it, this page is changing frequently: therefore we
-		 * don't want to insert it in the unstable tree, and we don't want
-		 * to waste our time searching for something identical to it there.
-		 */
-		checksum = calc_checksum(page);
-		if (rmap_item->oldchecksum != checksum) {
-			rmap_item->oldchecksum = checksum;
+		if (update_checksum(page, rmap_item))
 			return;
-		}
-
 		if (!try_to_merge_with_zero_page(rmap_item, page))
 			return;
 	}
@@ -2379,6 +2410,111 @@ static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot,
 	return rmap_item;
 }
 
+#ifdef CONFIG_SELECTIVE_KSM
+static struct ksm_rmap_item *retrieve_rmap_item(struct page **page,
+						struct mm_struct *mm,
+						unsigned long start,
+						unsigned long end)
+{
+	struct ksm_mm_slot *mm_slot;
+	struct mm_slot *slot;
+	struct vm_area_struct *vma;
+	struct ksm_rmap_item *rmap_item;
+	struct vma_iterator vmi;
+
+	lru_add_drain_all();
+
+	if (!ksm_merge_across_nodes) {
+		struct ksm_stable_node *stable_node, *next;
+		struct folio *folio;
+
+		list_for_each_entry_safe(stable_node, next,
+					 &migrate_nodes, list) {
+			folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_NOLOCK);
+			if (folio)
+				folio_put(folio);
+		}
+	}
+
+	spin_lock(&ksm_mmlist_lock);
+	slot = mm_slot_lookup(mm_slots_hash, mm);
+	spin_unlock(&ksm_mmlist_lock);
+
+	if (!slot)
+		return NULL;
+	mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
+
+	ksm_scan.address = 0;
+	ksm_scan.mm_slot = mm_slot;
+	ksm_scan.rmap_list = &mm_slot->rmap_list;
+
+	vma_iter_init(&vmi, mm, ksm_scan.address);
+
+	mmap_read_lock(mm);
+	for_each_vma(vmi, vma) {
+		if (!(vma->vm_flags & VM_MERGEABLE))
+			continue;
+		if (ksm_scan.address < vma->vm_start)
+			ksm_scan.address = vma->vm_start;
+		if (!vma->anon_vma)
+			ksm_scan.address = vma->vm_end;
+
+		while (ksm_scan.address < vma->vm_end) {
+			struct page *tmp_page = NULL;
+			struct folio_walk fw;
+			struct folio *folio;
+
+			if (ksm_scan.address < start || ksm_scan.address > end)
+				break;
+
+			folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
+			if (folio) {
+				if (!folio_is_zone_device(folio) &&
+				    folio_test_anon(folio)) {
+					folio_get(folio);
+					tmp_page = fw.page;
+				}
+				folio_walk_end(&fw, vma);
+			}
+
+			if (tmp_page) {
+				flush_anon_page(vma, tmp_page, ksm_scan.address);
+				flush_dcache_page(tmp_page);
+				rmap_item = get_next_rmap_item(mm_slot,
+							       ksm_scan.rmap_list,
+							       ksm_scan.address);
+				if (rmap_item) {
+					ksm_scan.rmap_list =
+							&rmap_item->rmap_list;
+					ksm_scan.address += PAGE_SIZE;
+					*page = tmp_page;
+				} else {
+					folio_put(folio);
+				}
+				mmap_read_unlock(mm);
+				return rmap_item;
+			}
+			ksm_scan.address += PAGE_SIZE;
+		}
+	}
+	mmap_read_unlock(mm);
+	return NULL;
+}
+
+static void ksm_sync_merge(struct mm_struct *mm,
+			   unsigned long start, unsigned long end)
+{
+	struct ksm_rmap_item *rmap_item;
+	struct page *page;
+
+	rmap_item = retrieve_rmap_item(&page, mm, start, end);
+	if (!rmap_item)
+		return;
+	cmp_and_merge_page(page, rmap_item);
+	put_page(page);
+}
+
+#else /* CONFIG_SELECTIVE_KSM */
 /*
  * Calculate skip age for the ksm page age. The age determines how often
  * de-duplicating has already been tried unsuccessfully. If the age is
@@ -2688,6 +2824,7 @@ static int ksm_scan_thread(void *nothing)
 	}
 	return 0;
 }
+#endif /* CONFIG_SELECTIVE_KSM */
 
 static void __ksm_add_vma(struct vm_area_struct *vma)
 {
@@ -3335,9 +3472,10 @@ static ssize_t pages_to_scan_store(struct kobject *kobj,
 	unsigned int nr_pages;
 	int err;
 
+#ifndef CONFIG_SELECTIVE_KSM
 	if (ksm_advisor != KSM_ADVISOR_NONE)
 		return -EINVAL;
-
+#endif
 	err = kstrtouint(buf, 10, &nr_pages);
 	if (err)
 		return -EINVAL;
@@ -3396,6 +3534,65 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
 }
 KSM_ATTR(run);
 
+static ssize_t trigger_merge_show(struct kobject *kobj,
+				  struct kobj_attribute *attr,
+				  char *buf)
+{
+	return -EINVAL;	/* Not yet implemented */
+}
+
+static ssize_t trigger_merge_store(struct kobject *kobj,
+				   struct kobj_attribute *attr,
+				   const char *buf, size_t count)
+{
+	unsigned long start, end;
+	pid_t pid;
+	char *input, *ptr;
+	int ret;
+	struct task_struct *task;
+	struct mm_struct *mm;
+
+	input = kstrdup(buf, GFP_KERNEL);
+	if (!input)
+		return -ENOMEM;
+
+	ptr = strim(input);
+	ret = sscanf(ptr, "%d %lx %lx", &pid, &start, &end);
+	kfree(input);
+
+	if (ret != 3)
+		return -EINVAL;
+
+	if (start >= end)
+		return -EINVAL;
+
+	/* Find the mm_struct */
+	rcu_read_lock();
+	task = find_task_by_vpid(pid);
+	if (!task) {
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+
+	get_task_struct(task);
+
+	rcu_read_unlock();
+	mm = get_task_mm(task);
+	put_task_struct(task);
+
+	if (!mm)
+		return -EINVAL;
+
+	mutex_lock(&ksm_thread_mutex);
+	wait_while_offlining();
+	ksm_sync_merge(mm, start, end);
+	mutex_unlock(&ksm_thread_mutex);
+
+	mmput(mm);
+	return count;
+}
+KSM_ATTR(trigger_merge);
+
 #ifdef CONFIG_NUMA
 static ssize_t merge_across_nodes_show(struct kobject *kobj,
 				       struct kobj_attribute *attr, char *buf)
@@ -3635,6 +3832,7 @@ static ssize_t full_scans_show(struct kobject *kobj,
 }
 KSM_ATTR_RO(full_scans);
 
+#ifndef CONFIG_SELECTIVE_KSM
 static ssize_t smart_scan_show(struct kobject *kobj,
 			       struct kobj_attribute *attr, char *buf)
 {
@@ -3780,11 +3978,13 @@ static ssize_t advisor_target_scan_time_store(struct kobject *kobj,
 	return count;
 }
 KSM_ATTR(advisor_target_scan_time);
+#endif /* !CONFIG_SELECTIVE_KSM */
 
 static struct attribute *ksm_attrs[] = {
 	&sleep_millisecs_attr.attr,
 	&pages_to_scan_attr.attr,
 	&run_attr.attr,
+	&trigger_merge_attr.attr,
 	&pages_scanned_attr.attr,
 	&pages_shared_attr.attr,
 	&pages_sharing_attr.attr,
@@ -3802,12 +4002,14 @@ static struct attribute *ksm_attrs[] = {
 	&stable_node_chains_prune_millisecs_attr.attr,
 	&use_zero_pages_attr.attr,
 	&general_profit_attr.attr,
+#ifndef CONFIG_SELECTIVE_KSM
 	&smart_scan_attr.attr,
 	&advisor_mode_attr.attr,
 	&advisor_max_cpu_attr.attr,
 	&advisor_min_pages_to_scan_attr.attr,
 	&advisor_max_pages_to_scan_attr.attr,
 	&advisor_target_scan_time_attr.attr,
+#endif
 	NULL,
 };
 
@@ -3815,40 +4017,63 @@ static const struct attribute_group ksm_attr_group = {
 	.attrs = ksm_attrs,
 	.name = "ksm",
 };
+
+static int __init ksm_sysfs_init(void)
+{
+	return sysfs_create_group(mm_kobj, &ksm_attr_group);
+}
+#else /* CONFIG_SYSFS */
+static int __init ksm_sysfs_init(void)
+{
+	ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */
+	return 0;
+}
 #endif /* CONFIG_SYSFS */
 
-static int __init ksm_init(void)
+#ifdef CONFIG_SELECTIVE_KSM
+static int __init ksm_thread_sysfs_init(void)
+{
+	return ksm_sysfs_init();
+}
+#else /* CONFIG_SELECTIVE_KSM */
+static int __init ksm_thread_sysfs_init(void)
 {
 	struct task_struct *ksm_thread;
 	int err;
 
-	/* The correct value depends on page size and endianness */
-	zero_checksum = calc_checksum(ZERO_PAGE(0));
-	/* Default to false for backwards compatibility */
-	ksm_use_zero_pages = false;
-
-	err = ksm_slab_init();
-	if (err)
-		goto out;
-
 	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
 	if (IS_ERR(ksm_thread)) {
 		pr_err("ksm: creating kthread failed\n");
 		err = PTR_ERR(ksm_thread);
-		goto out_free;
+		return err;
 	}
 
-#ifdef CONFIG_SYSFS
-	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
+	err = ksm_sysfs_init();
 	if (err) {
 		pr_err("ksm: register sysfs failed\n");
 		kthread_stop(ksm_thread);
-		goto out_free;
 	}
-#else
-	ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */
 
-#endif /* CONFIG_SYSFS */
+	return err;
+}
+#endif /* CONFIG_SELECTIVE_KSM */
+
+static int __init ksm_init(void)
+{
+	int err;
+
+	/* The correct value depends on page size and endianness */
+	zero_checksum = calc_checksum(ZERO_PAGE(0));
+	/* Default to false for backwards compatibility */
+	ksm_use_zero_pages = false;
+
+	err = ksm_slab_init();
+	if (err)
+		goto out;
+
+	err = ksm_thread_sysfs_init();
+	if (err)
+		goto out_free;
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
 	/* There is no significance to this priority 100 */
-- 
2.49.0.395.g12beb8f557-goog




