On Thu, 23 Jan 2020 11:20:07 +0100 Alexander Graf wrote: > > The big problem I see is that what I really want from a user's point of > view is a tuneable that says "Automatically free clean page cache pages > that were not accessed in the last X minutes". Below is a diff made on top of 1a4e58cce84e ("mm: introduce MADV_PAGEOUT"), untested in any form; it is assumed to be in line with the tunable above, except that the "X minutes" part is not taken into account. [BTW, please take a look at Content-Type: text/plain; charset="utf-8"; format="flowed" Content-Transfer-Encoding: base64 and ensure your messages are sent as plain text.] --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -69,6 +69,7 @@ #define MADV_COLD 20 /* deactivate these pages */ #define MADV_PAGEOUT 21 /* reclaim these pages */ +#define MADV_CCPC 22 /* reclaim cold & clean page cache pages */ /* compatibility flags */ #define MAP_FILE 0 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -35,6 +35,7 @@ struct madvise_walk_private { struct mmu_gather *tlb; bool pageout; + int behavior; }; /* @@ -50,6 +51,7 @@ static int madvise_need_mmap_write(int b case MADV_DONTNEED: case MADV_COLD: case MADV_PAGEOUT: + case MADV_CCPC: case MADV_FREE: return 0; default: @@ -304,6 +306,7 @@ static int madvise_cold_or_pageout_pte_r struct madvise_walk_private *private = walk->private; struct mmu_gather *tlb = private->tlb; bool pageout = private->pageout; + bool ccpc = private->behavior == MADV_CCPC; struct mm_struct *mm = tlb->mm; struct vm_area_struct *vma = walk->vma; pte_t *orig_pte, *pte, ptent; @@ -429,6 +432,8 @@ regular_page: VM_BUG_ON_PAGE(PageTransCompound(page), page); if (pte_young(ptent)) { + if (ccpc) + continue; ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); ptent = pte_mkold(ptent); @@ -436,6 +441,10 @@ regular_page: tlb_remove_tlb_entry(tlb, pte, addr); } + if (ccpc) + if (PageDirty(page)) + continue; + /* * We are deactivating a page for accelerating reclaiming. 
* VM couldn't reclaim the page unless we clear PG_young. @@ -502,12 +511,13 @@ static long madvise_cold(struct vm_area_ } static void madvise_pageout_page_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, + struct vm_area_struct *vma, int behavior, unsigned long addr, unsigned long end) { struct madvise_walk_private walk_private = { .pageout = true, .tlb = tlb, + .behavior = behavior, }; tlb_start_vma(tlb, vma); @@ -515,10 +525,10 @@ static void madvise_pageout_page_range(s tlb_end_vma(tlb, vma); } -static inline bool can_do_pageout(struct vm_area_struct *vma) +static inline bool can_do_pageout(struct vm_area_struct *vma, int behavior) { if (vma_is_anonymous(vma)) - return true; + return behavior != MADV_CCPC; if (!vma->vm_file) return false; /* @@ -531,7 +541,7 @@ static inline bool can_do_pageout(struct inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0; } -static long madvise_pageout(struct vm_area_struct *vma, +static long madvise_pageout(struct vm_area_struct *vma, int behavior, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) { @@ -542,12 +552,12 @@ static long madvise_pageout(struct vm_ar if (!can_madv_lru_vma(vma)) return -EINVAL; - if (!can_do_pageout(vma)) + if (!can_do_pageout(vma, behavior)) return 0; lru_add_drain(); tlb_gather_mmu(&tlb, mm, start_addr, end_addr); - madvise_pageout_page_range(&tlb, vma, start_addr, end_addr); + madvise_pageout_page_range(&tlb, vma, behavior, start_addr, end_addr); tlb_finish_mmu(&tlb, start_addr, end_addr); return 0; @@ -936,7 +946,8 @@ madvise_vma(struct vm_area_struct *vma, case MADV_COLD: return madvise_cold(vma, prev, start, end); case MADV_PAGEOUT: - return madvise_pageout(vma, prev, start, end); + case MADV_CCPC: + return madvise_pageout(vma, behavior, prev, start, end); case MADV_FREE: case MADV_DONTNEED: return madvise_dontneed_free(vma, prev, start, end, behavior); @@ -960,6 +971,7 @@ madvise_behavior_valid(int behavior) case MADV_FREE: case MADV_COLD: case 
MADV_PAGEOUT: + case MADV_CCPC: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: --