Re: [PATCH v12 12/14] mm: multi-gen LRU: debugfs interface

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 





On 2022/6/14 15:16, Yu Zhao wrote:
Add /sys/kernel/debug/lru_gen for working set estimation and proactive
reclaim. These techniques are commonly used to optimize job scheduling
(bin packing) in data centers [1][2].

Compared with the page table-based approach and the PFN-based
approach, this lruvec-based approach has the following advantages:
1. It offers better choices because it is aware of memcgs, NUMA nodes,
    shared mappings and unmapped page cache.
2. It is more scalable because it is O(nr_hot_pages), whereas the
    PFN-based approach is O(nr_total_pages).

Add /sys/kernel/debug/lru_gen_full for debugging.

[1] https://dl.acm.org/doi/10.1145/3297858.3304053
[2] https://dl.acm.org/doi/10.1145/3503222.3507731

Signed-off-by: Yu Zhao <yuzhao@xxxxxxxxxx>
Acked-by: Brian Geffon <bgeffon@xxxxxxxxxx>
Acked-by: Jan Alexander Steffens (heftig) <heftig@xxxxxxxxxxxxx>
Acked-by: Oleksandr Natalenko <oleksandr@xxxxxxxxxxxxxx>
Acked-by: Steven Barrett <steven@xxxxxxxxxxxx>
Acked-by: Suleiman Souhlal <suleiman@xxxxxxxxxx>
Tested-by: Daniel Byrne <djbyrne@xxxxxxx>
Tested-by: Donald Carr <d@xxxxxxxxxxxxxxx>
Tested-by: Holger Hoffstätte <holger@xxxxxxxxxxxxxxxxxxxxxx>
Tested-by: Konstantin Kharlamov <Hi-Angel@xxxxxxxxx>
Tested-by: Shuang Zhai <szhai2@xxxxxxxxxxxxxxxx>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@xxxxxxxxxxxxx>
---
  include/linux/nodemask.h |   1 +
  mm/vmscan.c              | 412 ++++++++++++++++++++++++++++++++++++++-
  2 files changed, 403 insertions(+), 10 deletions(-)


Hi Yu,

+static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
+				 size_t len, loff_t *pos)
+{
+	void *buf;
+	char *cur, *next;
+	unsigned int flags;
+	struct blk_plug plug;
+	int err = -EINVAL;
+	struct scan_control sc = {
+		.may_writepage = true,
+		.may_unmap = true,
+		.may_swap = true,
+		.reclaim_idx = MAX_NR_ZONES - 1,
+		.gfp_mask = GFP_KERNEL,
+	};
+
+	buf = kvmalloc(len + 1, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	if (copy_from_user(buf, src, len)) {
+		kvfree(buf);
+		return -EFAULT;
+	}
+
+	if (!set_mm_walk(NULL)) {

The current->reclaim_state will be dereferenced in set_mm_walk(), so
calling set_mm_walk() before set_task_reclaim_state(current,
&sc.reclaim_state) will cause panic:

[ 1861.154916] BUG: kernel NULL pointer dereference, address: 0000000000000008
[ 1861.155720] #PF: supervisor read access in kernel mode
[ 1861.156263] #PF: error_code(0x0000) - not-present page
[ 1861.156805] PGD 0 P4D 0
[ 1861.157107] Oops: 0000 [#1] PREEMPT SMP PTI
[ 1861.157560] CPU: 5 PID: 1017 Comm: bash Not tainted 5.19.0-rc2+ #244
[ 1861.158227] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g14
[ 1861.159419] RIP: 0010:set_mm_walk+0x15/0x60
[ 1861.159878] Code: e8 30 5f 01 00 48 c7 43 70 00 00 00 00 5b c3 31 f6 eb e2 66 90 0f 1f f
[ 1861.161806] RSP: 0018:ffffc90006dd3d58 EFLAGS: 00010246
[ 1861.162356] RAX: 0000000000000000 RBX: 00005582747a70b0 RCX: 0000000000000000 [ 1861.163109] RDX: ffff88810a198000 RSI: 00005582747a70c1 RDI: 0000000000000000 [ 1861.163855] RBP: ffff888104f4e400 R08: 0000000000000000 R09: ffff888100042400 [ 1861.164597] R10: 0000000000000000 R11: 0000000000000000 R12: ffff888685896fc0 [ 1861.165334] R13: 00005582747a70b0 R14: ffff888103ef2e40 R15: 0000000000000011 [ 1861.166083] FS: 00007f843df57740(0000) GS:ffff888666b40000(0000) knlGS:0000000000000000
[ 1861.166921] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1861.167527] CR2: 0000000000000008 CR3: 0000000684e7e000 CR4: 00000000000006e0 [ 1861.168272] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1861.169020] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 1861.169867] Call Trace:
[ 1861.170159]  <TASK>
[ 1861.170396]  lru_gen_seq_write+0xbf/0x600
[ 1861.170837]  ? _raw_spin_unlock+0x15/0x30
[ 1861.171272]  ? wp_page_reuse+0x5f/0x70
[ 1861.171678]  ? do_wp_page+0xda/0x3e0
[ 1861.172063]  ? __handle_mm_fault+0x92f/0xeb0
[ 1861.172529]  full_proxy_write+0x4d/0x70
[ 1861.172941]  vfs_write+0xb8/0x2a0
[ 1861.173302]  ksys_write+0x59/0xd0
[ 1861.173667]  do_syscall_64+0x34/0x80
[ 1861.174055]  entry_SYSCALL_64_after_hwframe+0x46/0xb0

+		kvfree(buf);
+		return -ENOMEM;
+	}
+
+	set_task_reclaim_state(current, &sc.reclaim_state);
+	flags = memalloc_noreclaim_save();
+	blk_start_plug(&plug);
+
+	next = buf;
+	next[len] = '\0';
+
+	while ((cur = strsep(&next, ",;\n"))) {
+		int n;
+		int end;
+		char cmd;
+		unsigned int memcg_id;
+		unsigned int nid;
+		unsigned long seq;
+		unsigned int swappiness = -1;
+		unsigned long opt = -1;
+
+		cur = skip_spaces(cur);
+		if (!*cur)
+			continue;
+
+		n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
+			   &seq, &end, &swappiness, &end, &opt, &end);
+		if (n < 4 || cur[end]) {
+			err = -EINVAL;
+			break;
+		}
+
+		err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
+		if (err)
+			break;
+	}
+
+	blk_finish_plug(&plug);
+	memalloc_noreclaim_restore(flags);
+	set_task_reclaim_state(current, NULL);
+
+	clear_mm_walk();

Ditto, we can't call clear_mm_walk() after set_task_reclaim_state(current, NULL).

Maybe it can be modified as follows:

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2422edc786eb..552e6ae5243e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -5569,12 +5569,12 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
                return -EFAULT;
        }

+       set_task_reclaim_state(current, &sc.reclaim_state);
        if (!set_mm_walk(NULL)) {
                kvfree(buf);
                return -ENOMEM;
        }

-       set_task_reclaim_state(current, &sc.reclaim_state);
        flags = memalloc_noreclaim_save();
        blk_start_plug(&plug);

@@ -5609,9 +5609,9 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,

        blk_finish_plug(&plug);
        memalloc_noreclaim_restore(flags);
+       clear_mm_walk();
        set_task_reclaim_state(current, NULL);

-       clear_mm_walk();
        kvfree(buf);

        return err ? : len;

Thanks,
Qi

+	kvfree(buf);
+
+	return err ? : len;
+}
+


--
Thanks,
Qi




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux