These days, there are many platforms available in the embedded market, and sometimes they have more hints about the working set than the kernel does, so they want to be involved in memory management more heavily — for example, Android's low-memory killer and ashmem, or a user daemon with a low-memory notifier. This patch adds a new method for userspace to manage memory efficiently via the knob "/proc/<pid>/reclaim", so the platform can reclaim from any process at any time. One useful use case is avoiding process killing to get free memory on Android, which was a really terrible experience: I lost the best game score I ever had after switching to a phone call while enjoying the game, and also suffered slow start-up from cold launching. We have used this in a real product. Quote from Sangwoo Park <sangwoo2.park@xxxxxxx> Thanks for the data, Sangwoo! " - Test scenario - platform: android - target: MSM8952, 2G DDR, 16G eMMC - scenario: retry app launch and Back Home with 16 apps and 16 turns (total app launch count is 256) - result: resume count | cold launching count ----------------------------------------------------------------- vanilla | 85 | 171 perproc reclaim | 184 | 72 " A higher resume count is better, because cold launching needs to load lots of resource data, which takes about 15~20 seconds for some games, while a successful resume takes just 1~5 seconds. With the per-process reclaim approach and the new management policy, we could reduce cold launches a lot (i.e., 171 -> 72), which reduces app start-up time a lot. Another useful function of this feature is to make swapout easy, which is useful for testing swapout stress and workloads. Interface: Reclaim file-backed pages only. echo 1 > /proc/<pid>/reclaim Reclaim anonymous pages only. echo 2 > /proc/<pid>/reclaim Reclaim all pages echo 3 > /proc/<pid>/reclaim bit 1 : file, bit 2 : anon, bit 1 & 2 : all Note: If a page is shared by other processes (i.e., page_mapcount(page) > 1), it cannot be reclaimed. 
Cc: Sangwoo Park <sangwoo2.park@xxxxxxx> Signed-off-by: Minchan Kim <minchan@xxxxxxxxxx> --- Documentation/filesystems/proc.txt | 15 ++++ fs/proc/base.c | 1 + fs/proc/internal.h | 1 + fs/proc/task_mmu.c | 149 +++++++++++++++++++++++++++++++++++++ include/linux/rmap.h | 4 + mm/vmscan.c | 40 ++++++++++ 6 files changed, 210 insertions(+) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 50fcf48f4d58..3b6adf370f3c 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -138,6 +138,7 @@ Table 1-1: Process specific entries in /proc maps Memory maps to executables and library files (2.4) mem Memory held by this process root Link to the root directory of this process + reclaim Reclaim pages in this process stat Process status statm Process memory status information status Process status in human readable form @@ -536,6 +537,20 @@ To reset the peak resident set size ("high water mark") to the process's Any other value written to /proc/PID/clear_refs will have no effect. +The file /proc/PID/reclaim is used to reclaim pages in this process. +bit 1: file, bit 2: anon, bit 1 & 2: all + +To reclaim file-backed pages, + > echo 1 > /proc/PID/reclaim + +To reclaim anonymous pages, + > echo 2 > /proc/PID/reclaim + +To reclaim all pages, + > echo 3 > /proc/PID/reclaim + +If a page is shared by several processes, it cannot be reclaimed. + The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags using /proc/kpageflags and number of times a page is mapped using /proc/kpagecount. For detailed explanation, see Documentation/vm/pagemap.txt. 
diff --git a/fs/proc/base.c b/fs/proc/base.c index 93e7754fd5b2..b957d929516d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2848,6 +2848,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("mounts", S_IRUGO, proc_mounts_operations), REG("mountinfo", S_IRUGO, proc_mountinfo_operations), REG("mountstats", S_IRUSR, proc_mountstats_operations), + REG("reclaim", S_IWUSR, proc_reclaim_operations), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_pid_smaps_operations), diff --git a/fs/proc/internal.h b/fs/proc/internal.h index aa2781095bd1..ef2b01533c97 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -209,6 +209,7 @@ struct pde_opener { extern const struct inode_operations proc_link_inode_operations; extern const struct inode_operations proc_pid_link_inode_operations; +extern const struct file_operations proc_reclaim_operations; extern void proc_init_inodecache(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 187d84ef9de9..31e4657f8fe9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -11,6 +11,7 @@ #include <linux/mempolicy.h> #include <linux/rmap.h> #include <linux/swap.h> +#include <linux/mm_inline.h> #include <linux/swapops.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> @@ -1465,6 +1466,154 @@ const struct file_operations proc_pagemap_operations = { }; #endif /* CONFIG_PROC_PAGE_MONITOR */ +static int reclaim_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct mm_struct *mm = walk->mm; + struct vm_area_struct *vma = walk->private; + pte_t *orig_pte, *pte, ptent; + spinlock_t *ptl; + struct page *page; + LIST_HEAD(page_list); + int isolated = 0; + + split_huge_pmd(vma, pmd, addr); + if (pmd_trans_unstable(pmd)) + return 0; + + orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + for (; addr != end; 
pte++, addr += PAGE_SIZE) { + ptent = *pte; + + if (!pte_present(ptent)) + continue; + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + if (page_mapcount(page) != 1) + continue; + + if (PageTransCompound(page)) { + get_page(page); + if (!trylock_page(page)) { + put_page(page); + goto out; + } + pte_unmap_unlock(orig_pte, ptl); + + if (split_huge_page(page)) { + unlock_page(page); + put_page(page); + orig_pte = pte_offset_map_lock(mm, pmd, + addr, &ptl); + goto out; + } + put_page(page); + unlock_page(page); + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte--; + addr -= PAGE_SIZE; + continue; + } + + VM_BUG_ON_PAGE(PageTransCompound(page), page); + + if (isolate_lru_page(page)) + continue; + + list_add(&page->lru, &page_list); + inc_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + isolated++; + if (isolated >= SWAP_CLUSTER_MAX) { + pte_unmap_unlock(orig_pte, ptl); + reclaim_pages_from_list(&page_list); + isolated = 0; + cond_resched(); + orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + } + } + +out: + pte_unmap_unlock(orig_pte, ptl); + reclaim_pages_from_list(&page_list); + + cond_resched(); + return 0; +} + +enum reclaim_type { + RECLAIM_FILE = 1, + RECLAIM_ANON, + RECLAIM_ALL, +}; + +static ssize_t reclaim_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + struct mm_struct *mm; + struct vm_area_struct *vma; + int itype; + int rv; + enum reclaim_type type; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + rv = kstrtoint(strstrip(buffer), 10, &itype); + if (rv < 0) + return rv; + type = (enum reclaim_type)itype; + if (type < RECLAIM_FILE || type > RECLAIM_ALL) + return -EINVAL; + + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) + return -ESRCH; + + mm = get_task_mm(task); + if (mm) { + 
struct mm_walk reclaim_walk = { + .pmd_entry = reclaim_pte_range, + .mm = mm, + }; + + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + reclaim_walk.private = vma; + + if (is_vm_hugetlb_page(vma)) + continue; + + if (!vma_is_anonymous(vma) && !(type & RECLAIM_FILE)) + continue; + + if (vma_is_anonymous(vma) && !(type & RECLAIM_ANON)) + continue; + + walk_page_range(vma->vm_start, vma->vm_end, + &reclaim_walk); + } + flush_tlb_mm(mm); + up_read(&mm->mmap_sem); + mmput(mm); + } + put_task_struct(task); + + return count; +} + +const struct file_operations proc_reclaim_operations = { + .write = reclaim_write, + .llseek = noop_llseek, +}; + #ifdef CONFIG_NUMA struct numa_maps { diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 5704f101b52e..e90a21b78da3 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -10,6 +10,10 @@ #include <linux/rwsem.h> #include <linux/memcontrol.h> +extern int isolate_lru_page(struct page *page); +extern void putback_lru_page(struct page *page); +extern unsigned long reclaim_pages_from_list(struct list_head *page_list); + /* * The anon_vma heads a list of private "related" vmas, to scan if * an anonymous page pointing to this anon_vma needs to be unmapped: diff --git a/mm/vmscan.c b/mm/vmscan.c index d20c9e863d35..442866f77251 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1212,6 +1212,13 @@ static unsigned long shrink_page_list(struct list_head *page_list, * appear not as the counts should be low */ list_add(&page->lru, &free_pages); + /* + * If pagelist are from multiple zones, we should decrease + * NR_ISOLATED_ANON + x on freed pages in here. 
+ */ + if (!zone) + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); continue; cull_mlocked: @@ -1280,6 +1287,39 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, return ret; } +unsigned long reclaim_pages_from_list(struct list_head *page_list) +{ + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, + .may_writepage = 1, + .may_unmap = 1, + .may_swap = 1, + .force_reclaim = 1, + }; + + unsigned long nr_reclaimed, dummy1, dummy2, dummy3, dummy4, dummy5; + struct page *page; + + list_for_each_entry(page, page_list, lru) + ClearPageActive(page); + + nr_reclaimed = shrink_page_list(page_list, &sc, + TTU_UNMAP|TTU_IGNORE_ACCESS, + &dummy1, &dummy2, &dummy3, + &dummy4, &dummy5); + + while (!list_empty(page_list)) { + page = lru_to_page(page_list); + list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + putback_lru_page(page); + } + + return nr_reclaimed; +} + /* * Attempt to remove the specified page from its LRU. Only take this page * if it is of the appropriate PageActive status. Pages which are being -- 1.9.1 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>