Cc: vdavydov.dev@xxxxxxxxx
Cc: Brendan Gregg <bgregg@xxxxxxxxxxx>
Cc: kernel-team@xxxxxxxxxxx
Signed-off-by: Joel Fernandes (Google) <joel@xxxxxxxxxxxxxxxxx>
---
Internal review -> v1:
Fixes from Suren.
Corrections to change log, docs (Florian, Sandeep)
fs/proc/base.c | 3 +
fs/proc/internal.h | 1 +
fs/proc/task_mmu.c | 57 +++++++
include/linux/page_idle.h | 4 +
mm/page_idle.c | 305 +++++++++++++++++++++++++++++++++-----
5 files changed, 330 insertions(+), 40 deletions(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 77eb628ecc7f..a58dd74606e9 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3021,6 +3021,9 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("smaps", S_IRUGO, proc_pid_smaps_operations),
REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
REG("pagemap", S_IRUSR, proc_pagemap_operations),
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+ REG("page_idle", S_IRUSR|S_IWUSR, proc_page_idle_operations),
+#endif
#endif
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index cd0c8d5ce9a1..bc9371880c63 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -293,6 +293,7 @@ extern const struct file_operations proc_pid_smaps_operations;
extern const struct file_operations proc_pid_smaps_rollup_operations;
extern const struct file_operations proc_clear_refs_operations;
extern const struct file_operations proc_pagemap_operations;
+extern const struct file_operations proc_page_idle_operations;
extern unsigned long task_vsize(struct mm_struct *);
extern unsigned long task_statm(struct mm_struct *,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4d2b860dbc3f..11ccc53da38e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1642,6 +1642,63 @@ const struct file_operations proc_pagemap_operations = {
.open = pagemap_open,
.release = pagemap_release,
};
+
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+static ssize_t proc_page_idle_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int ret;
+ struct task_struct *tsk = get_proc_task(file_inode(file));
+
+ if (!tsk)
+ return -EINVAL;
+ ret = page_idle_proc_read(file, buf, count, ppos, tsk);
+ put_task_struct(tsk);
+ return ret;
+}
+
+static ssize_t proc_page_idle_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int ret;
+ struct task_struct *tsk = get_proc_task(file_inode(file));
+
+ if (!tsk)
+ return -EINVAL;
+ ret = page_idle_proc_write(file, (char __user *)buf, count, ppos, tsk);
+ put_task_struct(tsk);
+ return ret;
+}
+
+static int proc_page_idle_open(struct inode *inode, struct file *file)
+{
+ struct mm_struct *mm;
+
+ mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
+ file->private_data = mm;
+ return 0;
+}
+
+static int proc_page_idle_release(struct inode *inode, struct file *file)
+{
+ struct mm_struct *mm = file->private_data;
+
+ if (mm)
+ mmdrop(mm);
+ return 0;
+}
+
+const struct file_operations proc_page_idle_operations = {
+ .llseek = mem_lseek, /* borrow this */
+ .read = proc_page_idle_read,
+ .write = proc_page_idle_write,
+ .open = proc_page_idle_open,
+ .release = proc_page_idle_release,
+};
+#endif /* CONFIG_IDLE_PAGE_TRACKING */
+
#endif /* CONFIG_PROC_PAGE_MONITOR */
#ifdef CONFIG_NUMA
diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
index 1e894d34bdce..f1bc2640d85e 100644
--- a/include/linux/page_idle.h
+++ b/include/linux/page_idle.h
@@ -106,6 +106,10 @@ static inline void clear_page_idle(struct page *page)
}
#endif /* CONFIG_64BIT */
+ssize_t page_idle_proc_write(struct file *file,
+ char __user *buf, size_t count, loff_t *ppos, struct task_struct *tsk);
+ssize_t page_idle_proc_read(struct file *file,
+ char __user *buf, size_t count, loff_t *ppos, struct task_struct *tsk);
#else /* !CONFIG_IDLE_PAGE_TRACKING */
static inline bool page_is_young(struct page *page)
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 295512465065..874a60c41fef 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -11,6 +11,7 @@
#include <linux/mmu_notifier.h>
#include <linux/page_ext.h>
#include <linux/page_idle.h>
+#include <linux/sched/mm.h>
#define BITMAP_CHUNK_SIZE sizeof(u64)
#define BITMAP_CHUNK_BITS (BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
@@ -28,15 +29,12 @@
*
* This function tries to get a user memory page by pfn as described above.
*/
-static struct page *page_idle_get_page(unsigned long pfn)
+static struct page *page_idle_get_page(struct page *page_in)
{
struct page *page;
pg_data_t *pgdat;
- if (!pfn_valid(pfn))
- return NULL;
-
- page = pfn_to_page(pfn);
+ page = page_in;
if (!page || !PageLRU(page) ||
!get_page_unless_zero(page))
return NULL;
@@ -51,6 +49,15 @@ static struct page *page_idle_get_page(unsigned long pfn)
return page;
}
+static struct page *page_idle_get_page_pfn(unsigned long pfn)
+{
+
+ if (!pfn_valid(pfn))
+ return NULL;
+
+ return page_idle_get_page(pfn_to_page(pfn));
+}
+
static bool page_idle_clear_pte_refs_one(struct page *page,
struct vm_area_struct *vma,
unsigned long addr, void *arg)
@@ -118,6 +125,47 @@ static void page_idle_clear_pte_refs(struct page *page)
unlock_page(page);
}
+/* Helper to get the start and end frame given a pos and count */
+static int page_idle_get_frames(loff_t pos, size_t count, struct mm_struct *mm,
+ unsigned long *start, unsigned long *end)
+{
+ unsigned long max_frame;
+
+ /* If an mm is not given, assume we want physical frames */
+ max_frame = mm ? (mm->task_size >> PAGE_SHIFT) : max_pfn;
+
+ if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
+ return -EINVAL;
+
+ *start = pos * BITS_PER_BYTE;
+ if (*start >= max_frame)
+ return -ENXIO;
+
+ *end = *start + count * BITS_PER_BYTE;
+ if (*end > max_frame)
+ *end = max_frame;
+ return 0;
+}
+
+static bool page_really_idle(struct page *page)
+{
+ if (!page)
+ return false;
+
+ if (page_is_idle(page)) {
+ /*
+ * The page might have been referenced via a
+ * pte, in which case it is not idle. Clear
+ * refs and recheck.
+ */
+ page_idle_clear_pte_refs(page);
+ if (page_is_idle(page))
+ return true;
+ }
+
+ return false;
+}
+
static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
struct bin_attribute *attr, char *buf,
loff_t pos, size_t count)
@@ -125,35 +173,21 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
u64 *out = (u64 *)buf;
struct page *page;
unsigned long pfn, end_pfn;
- int bit;
-
- if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
- return -EINVAL;
-
- pfn = pos * BITS_PER_BYTE;
- if (pfn >= max_pfn)
- return 0;
+ int bit, ret;
- end_pfn = pfn + count * BITS_PER_BYTE;
- if (end_pfn > max_pfn)
- end_pfn = max_pfn;
+ ret = page_idle_get_frames(pos, count, NULL, &pfn, &end_pfn);
+ if (ret == -ENXIO)
+ return 0; /* Reads beyond max_pfn do nothing */
+ else if (ret)
+ return ret;
for (; pfn < end_pfn; pfn++) {
bit = pfn % BITMAP_CHUNK_BITS;
if (!bit)
*out = 0ULL;
- page = page_idle_get_page(pfn);
- if (page) {
- if (page_is_idle(page)) {
- /*
- * The page might have been referenced via a
- * pte, in which case it is not idle. Clear
- * refs and recheck.
- */
- page_idle_clear_pte_refs(page);
- if (page_is_idle(page))
- *out |= 1ULL << bit;
- }
+ page = page_idle_get_page_pfn(pfn);
+ if (page && page_really_idle(page)) {
+ *out |= 1ULL << bit;
put_page(page);
}
if (bit == BITMAP_CHUNK_BITS - 1)
@@ -170,23 +204,16 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
const u64 *in = (u64 *)buf;
struct page *page;
unsigned long pfn, end_pfn;
- int bit;
+ int bit, ret;
- if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
- return -EINVAL;
-
- pfn = pos * BITS_PER_BYTE;
- if (pfn >= max_pfn)
- return -ENXIO;
-
- end_pfn = pfn + count * BITS_PER_BYTE;
- if (end_pfn > max_pfn)
- end_pfn = max_pfn;
+ ret = page_idle_get_frames(pos, count, NULL, &pfn, &end_pfn);
+ if (ret)
+ return ret;
for (; pfn < end_pfn; pfn++) {
bit = pfn % BITMAP_CHUNK_BITS;
if ((*in >> bit) & 1) {
- page = page_idle_get_page(pfn);
+ page = page_idle_get_page_pfn(pfn);
if (page) {
page_idle_clear_pte_refs(page);
set_page_idle(page);
@@ -224,10 +251,208 @@ struct page_ext_operations page_idle_ops = {
};
#endif
+/* page_idle tracking for /proc/<pid>/page_idle */
+
+static DEFINE_SPINLOCK(idle_page_list_lock);
+struct list_head idle_page_list;
+
+struct page_node {
+ struct page *page;
+ unsigned long addr;
+ struct list_head list;
+};
+
+struct page_idle_proc_priv {
+ unsigned long start_addr;
+ char *buffer;
+ int write;
+};
+
+static void add_page_idle_list(struct page *page,
+ unsigned long addr, struct mm_walk *walk)
+{
+ struct page *page_get;
+ struct page_node *pn;
+ int bit;
+ unsigned long frames;
+ struct page_idle_proc_priv *priv = walk->private;
+ u64 *chunk = (u64 *)priv->buffer;
+
+ if (priv->write) {
+ /* Find whether this page was asked to be marked */
+ frames = (addr - priv->start_addr) >> PAGE_SHIFT;
+ bit = frames % BITMAP_CHUNK_BITS;
+ chunk = &chunk[frames / BITMAP_CHUNK_BITS];
+ if (((*chunk >> bit) & 1) == 0)
+ return;
+ }
+
+ page_get = page_idle_get_page(page);
+ if (!page_get)
+ return;
+
+ pn = kmalloc(sizeof(*pn), GFP_ATOMIC);
+ if (!pn)
+ return;
+
+ pn->page = page_get;
+ pn->addr = addr;
+ list_add(&pn->list, &idle_page_list);
+}
+
+static int pte_page_idle_proc_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+ pte_t *pte;
+ spinlock_t *ptl;
+ struct page *page;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ if (pmd_present(*pmd)) {
+ page = follow_trans_huge_pmd(vma, addr, pmd,
+ FOLL_DUMP|FOLL_WRITE);
+ if (!IS_ERR_OR_NULL(page))
+ add_page_idle_list(page, addr, walk);
+ }
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
+ if (!pte_present(*pte))
+ continue;
+
+ page = vm_normal_page(vma, addr, *pte);
+ if (page)
+ add_page_idle_list(page, addr, walk);
+ }
+
+ pte_unmap_unlock(pte - 1, ptl);
+ return 0;
+}
+
+ssize_t page_idle_proc_generic(struct file *file, char __user *ubuff,
+ size_t count, loff_t *pos,
+ struct task_struct *tsk, int write)
+{
+ int ret;
+ char *buffer;
+ u64 *out;
+ unsigned long start_addr, end_addr, start_frame, end_frame;
+ struct mm_struct *mm = file->private_data;
+ struct mm_walk walk = { .pmd_entry = pte_page_idle_proc_range, };
+ struct page_node *cur, *next;
+ struct page_idle_proc_priv priv;
+ bool walk_error = false;
+
+ if (!mm || !mmget_not_zero(mm))
+ return -EINVAL;
+
+ if (count > PAGE_SIZE)
+ count = PAGE_SIZE;
+
+ buffer = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!buffer) {
+ ret = -ENOMEM;
+ goto out_mmput;
+ }
+ out = (u64 *)buffer;
+
+ if (write && copy_from_user(buffer, ubuff, count)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = page_idle_get_frames(*pos, count, mm, &start_frame, &end_frame);
+ if (ret)
+ goto out;
+
+ start_addr = (start_frame << PAGE_SHIFT);
+ end_addr = (end_frame << PAGE_SHIFT);
+ priv.buffer = buffer;
+ priv.start_addr = start_addr;
+ priv.write = write;
+ walk.private = &priv;
+ walk.mm = mm;
+
+ down_read(&mm->mmap_sem);
+
+ /*
+ * Protects the idle_page_list which is needed because
+ * walk_page_vma() holds ptlock which deadlocks with
+ * page_idle_clear_pte_refs(). So we have to collect all
+ * pages first, and then call page_idle_clear_pte_refs().
+ */
+ spin_lock(&idle_page_list_lock);
+ ret = walk_page_range(start_addr, end_addr, &walk);
+ if (ret)
+ walk_error = true;
+
+ list_for_each_entry_safe(cur, next, &idle_page_list, list) {
+ int bit, index;
+ unsigned long off;
+ struct page *page = cur->page;
+
+ if (unlikely(walk_error))
+ goto remove_page;
+
+ if (write) {
+ page_idle_clear_pte_refs(page);
+ set_page_idle(page);
+ } else {
+ if (page_really_idle(page)) {
+ off = ((cur->addr) >> PAGE_SHIFT) - start_frame;
+ bit = off % BITMAP_CHUNK_BITS;
+ index = off / BITMAP_CHUNK_BITS;
+ out[index] |= 1ULL << bit;
+ }
+ }
+remove_page:
+ put_page(page);
+ list_del(&cur->list);
+ kfree(cur);
+ }
+ spin_unlock(&idle_page_list_lock);
+
+ if (!write && !walk_error)
+ ret = copy_to_user(ubuff, buffer, count);
+
+ up_read(&mm->mmap_sem);
+out:
+ kfree(buffer);
+out_mmput:
+ mmput(mm);
+ if (!ret)
+ ret = count;
+ return ret;
+
+}
+
+ssize_t page_idle_proc_read(struct file *file, char __user *ubuff,
+ size_t count, loff_t *pos, struct task_struct *tsk)
+{
+ return page_idle_proc_generic(file, ubuff, count, pos, tsk, 0);
+}
+
+ssize_t page_idle_proc_write(struct file *file, char __user *ubuff,
+ size_t count, loff_t *pos, struct task_struct *tsk)
+{
+ return page_idle_proc_generic(file, ubuff, count, pos, tsk, 1);
+}
+
static int __init page_idle_init(void)
{
int err;
+ INIT_LIST_HEAD(&idle_page_list);
+
err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
if (err) {
pr_err("page_idle: register sysfs failed\n");