On Fri, Mar 14, 2014 at 03:37:47PM +0900, Minchan Kim wrote:
> Linux doesn't have the ability to free pages lazily, while other OSes
> have long supported this via madvise(MADV_FREE).
> 
> The gain is clear: under memory pressure, the kernel can evict freed
> pages rather than swap them out or trigger the OOM killer.
> 
> Without memory pressure, freed pages are reused by userspace without
> any additional overhead (e.g., page fault + page allocation + page
> zeroing).
> 
> The first heavy users would be general-purpose allocators (e.g.,
> jemalloc; I hope ptmalloc supports it, too), and jemalloc already
> supports the feature on other OSes (e.g., FreeBSD).
> 
> At the moment, this patch breaks the build on ARCHs other than x86
> that have their own TLB flush scheme, but if there is no objection to
> this direction, I will add patches handling the other ARCHs in the
> next iteration.
> 
> Signed-off-by: Minchan Kim <minchan@xxxxxxxxxx>
> ---
>  include/asm-generic/tlb.h              |  9 ++++++++
>  include/linux/mm.h                     | 35 ++++++++++++++++++++++++++++++-
>  include/linux/rmap.h                   |  1 +
>  include/linux/swap.h                   | 15 ++++++++++++++
>  include/uapi/asm-generic/mman-common.h |  1 +
>  mm/madvise.c                           | 17 +++++++++++++--
>  mm/memory.c                            | 12 ++++++++++-
>  mm/rmap.c                              | 21 +++++++++++++++++--
>  mm/swap_state.c                        | 38 +++++++++++++++++++++++++++++++++-
>  mm/vmscan.c                            | 22 +++++++++++++++++++-
>  10 files changed, 163 insertions(+), 8 deletions(-)
> 
> diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
> index 5672d7ea1fa0..b82ee729a065 100644
> --- a/include/asm-generic/tlb.h
> +++ b/include/asm-generic/tlb.h
> @@ -116,8 +116,17 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long
>  void tlb_flush_mmu(struct mmu_gather *tlb);
>  void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start,
>  			unsigned long end);
> +int __tlb_madvfree_page(struct mmu_gather *tlb, struct page *page);
>  int __tlb_remove_page(struct mmu_gather *tlb, struct page *page);
> 
> +static inline void tlb_madvfree_page(struct mmu_gather *tlb, struct page *page)
> +{
> +	/* Hold a reference so the page cannot be freed under us */
> +	get_page(page);
> +	if (!__tlb_remove_page(tlb, MarkLazyFree(page)))
> +		tlb_flush_mmu(tlb);
> +}
> +
>  /* tlb_remove_page
>   *	Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when
>   *	required.
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index c1b7414c7bef..9b048cabce27 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -933,10 +933,16 @@ void page_address_init(void);
>   * Please note that, confusingly, "page_mapping" refers to the inode
>   * address_space which maps the page from disk; whereas "page_mapped"
>   * refers to user virtual address space into which the page is mapped.
> + *
> + * The PAGE_MAPPING_LZFREE bit is set along with the PAGE_MAPPING_ANON
> + * bit, in which case page->mapping points to an anon_vma. This flag
> + * marks the page for lazy freeing instead of swap-out.
>   */
>  #define PAGE_MAPPING_ANON	1
>  #define PAGE_MAPPING_KSM	2
> -#define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
> +#define PAGE_MAPPING_LZFREE	4
> +#define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM | \
> +				 PAGE_MAPPING_LZFREE)
> 
>  extern struct address_space *page_mapping(struct page *page);
> 
> @@ -962,6 +968,32 @@ static inline int PageAnon(struct page *page)
>  	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
>  }
> 
> +static inline void SetPageLazyFree(struct page *page)
> +{
> +	BUG_ON(!PageAnon(page));
> +	BUG_ON(!PageLocked(page));
> +
> +	page->mapping = (void *)((unsigned long)page->mapping |
> +				 PAGE_MAPPING_LZFREE);
> +}
> +
> +static inline void ClearPageLazyFree(struct page *page)
> +{
> +	BUG_ON(!PageAnon(page));
> +	BUG_ON(!PageLocked(page));
> +
> +	page->mapping = (void *)((unsigned long)page->mapping &
> +				 ~PAGE_MAPPING_LZFREE);
> +}
> +
> +static inline int PageLazyFree(struct page *page)
> +{
> +	if (((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
> +	    (PAGE_MAPPING_ANON|PAGE_MAPPING_LZFREE))
> +		return 1;
> +	return 0;
> +}
> +
>  /*
>   * Return the pagecache index of the passed page.  Regular pagecache pages
>   * use ->index whereas swapcache pages use ->private
> @@ -1054,6 +1086,7 @@ struct zap_details {
>  	struct address_space *check_mapping;	/* Check page->mapping if set */
>  	pgoff_t	first_index;			/* Lowest page->index to unmap */
>  	pgoff_t last_index;			/* Highest page->index to unmap */
> +	int lazy_free;				/* do lazy free */
>  };
> 
>  struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
> 
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index 1da693d51255..19e74aebb3d5 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -75,6 +75,7 @@ enum ttu_flags {
>  	TTU_UNMAP = 0,			/* unmap mode */
>  	TTU_MIGRATION = 1,		/* migration mode */
>  	TTU_MUNLOCK = 2,		/* munlock mode */
> +	TTU_LAZYFREE = 3,		/* free lazyfree page */
>  	TTU_ACTION_MASK = 0xff,
> 
>  	TTU_IGNORE_MLOCK = (1 << 8),	/* ignore mlock */
> 
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 46ba0c6c219f..223909c14703 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -13,6 +13,21 @@
>  #include <linux/page-flags.h>
>  #include <asm/page.h>
> 
> +static inline struct page *MarkLazyFree(struct page *p)
> +{
> +	return (struct page *)((unsigned long)p | 0x1UL);
> +}
> +
> +static inline struct page *ClearLazyFree(struct page *p)
> +{
> +	return (struct page *)((unsigned long)p & ~0x1UL);
> +}
> +
> +static inline bool LazyFree(struct page *p)
> +{
> +	return ((unsigned long)p & 0x1UL) ? true : false;
> +}
> +
>  struct notifier_block;
> 
>  struct bio;
> 
> diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
> index 4164529a94f9..7e257e49be2e 100644
> --- a/include/uapi/asm-generic/mman-common.h
> +++ b/include/uapi/asm-generic/mman-common.h
> @@ -34,6 +34,7 @@
>  #define MADV_SEQUENTIAL	2		/* expect sequential page references */
>  #define MADV_WILLNEED	3		/* will need these pages */
>  #define MADV_DONTNEED	4		/* don't need these pages */
> +#define MADV_FREE	5		/* do lazy free */
> 
>  /* common parameters: try to keep these consistent across architectures */
>  #define MADV_REMOVE	9		/* remove these pages & resources */
> 
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 539eeb96b323..2e904289a2bb 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -31,6 +31,7 @@ static int madvise_need_mmap_write(int behavior)
>  	case MADV_REMOVE:
>  	case MADV_WILLNEED:
>  	case MADV_DONTNEED:
> +	case MADV_FREE:
>  		return 0;
>  	default:
>  		/* be safe, default to 1. list exceptions explicitly */
> @@ -272,7 +273,8 @@ static long madvise_willneed(struct vm_area_struct *vma,
>   */
>  static long madvise_dontneed(struct vm_area_struct *vma,
>  			     struct vm_area_struct **prev,
> -			     unsigned long start, unsigned long end)
> +			     unsigned long start, unsigned long end,
> +			     int behavior)
>  {
>  	*prev = vma;
>  	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
> @@ -284,8 +286,17 @@ static long madvise_dontneed(struct vm_area_struct *vma,
>  			.last_index = ULONG_MAX,
>  		};
>  		zap_page_range(vma, start, end - start, &details);
> +	} else if (behavior == MADV_FREE) {
> +		struct zap_details details = {
> +			.lazy_free = 1,
> +		};
> +
> +		if (vma->vm_file)
> +			return -EINVAL;
> +		zap_page_range(vma, start, end - start, &details);
>  	} else
>  		zap_page_range(vma, start, end - start, NULL);
> +
>  	return 0;
>  }
> 
> @@ -384,8 +395,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
>  		return madvise_remove(vma, prev, start, end);
>  	case MADV_WILLNEED:
>  		return madvise_willneed(vma, prev, start, end);
> +	case MADV_FREE:
>  	case MADV_DONTNEED:
> -		return madvise_dontneed(vma, prev, start, end);
> +		return madvise_dontneed(vma, prev, start, end, behavior);
>  	default:
>  		return madvise_behavior(vma, prev, start, end, behavior);
>  	}
> @@ -403,6 +415,7 @@ madvise_behavior_valid(int behavior)
>  	case MADV_REMOVE:
>  	case MADV_WILLNEED:
>  	case MADV_DONTNEED:
> +	case MADV_FREE:
>  #ifdef CONFIG_KSM
>  	case MADV_MERGEABLE:
>  	case MADV_UNMERGEABLE:
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index 22dfa617bddb..f1f0dc13e8d1 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1093,6 +1093,15 @@ again:
> 
>  		page = vm_normal_page(vma, addr, ptent);
>  		if (unlikely(details) && page) {
> +			if (details->lazy_free && PageAnon(page)) {
> +				ptent = pte_mkold(ptent);
> +				ptent = pte_mkclean(ptent);
> +				set_pte_at(mm, addr, pte, ptent);
> +				tlb_remove_tlb_entry(tlb, pte, addr);
> +				tlb_madvfree_page(tlb, page);
> +				continue;
> +			}
> +
>  			/*
>  			 * unmap_shared_mapping_pages() wants to
>  			 * invalidate cache without truncating:
> @@ -1276,7 +1285,8 @@ static void unmap_page_range(struct mmu_gather *tlb,
>  	pgd_t *pgd;
>  	unsigned long next;
> 
> -	if (details && !details->check_mapping && !details->nonlinear_vma)
> +	if (details && !details->check_mapping && !details->nonlinear_vma &&
> +	    !details->lazy_free)
>  		details = NULL;
> 
>  	BUG_ON(addr >= end);
> 
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 76069afa6b81..7712f39acfee 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -377,6 +377,15 @@ void __init anon_vma_init(void)
>  	anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
>  }
> 
> +static inline bool is_anon_vma(unsigned long mapping)
> +{
> +	unsigned long anon_mapping = mapping & PAGE_MAPPING_FLAGS;
> +
> +	if ((anon_mapping != PAGE_MAPPING_ANON) &&
> +	    (anon_mapping != (PAGE_MAPPING_ANON|PAGE_MAPPING_LZFREE)))
> +		return false;
> +	return true;
> +}
> +
>  /*
>   * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
>   *
> @@ -407,7 +416,7 @@ struct anon_vma *page_get_anon_vma(struct page *page)
> 
>  	rcu_read_lock();
>  	anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
> -	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
> +	if (!is_anon_vma(anon_mapping))
>  		goto out;
>  	if (!page_mapped(page))
>  		goto out;
> @@ -450,7 +459,7 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
> 
>  	rcu_read_lock();
>  	anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
> -	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
> +	if (!is_anon_vma(anon_mapping))
>  		goto out;
>  	if (!page_mapped(page))
>  		goto out;
> @@ -1165,6 +1174,14 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
>  		}
>  		set_pte_at(mm, address, pte,
>  			   swp_entry_to_pte(make_hwpoison_entry(page)));
> +	} else if ((flags & TTU_LAZYFREE) && PageLazyFree(page)) {
> +		BUG_ON(!PageAnon(page));
> +		if (unlikely(pte_dirty(pteval))) {
> +			set_pte_at(mm, address, pte, pteval);
> +			ret = SWAP_FAIL;
> +			goto out_unmap;
> +		}
> +		dec_mm_counter(mm, MM_ANONPAGES);
>  	} else if (PageAnon(page)) {
>  		swp_entry_t entry = { .val = page_private(page) };
>  		pte_t swp_pte;
> 
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index e76ace30d436..0718ecd166dc 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -18,6 +18,7 @@
>  #include <linux/pagevec.h>
>  #include <linux/migrate.h>
>  #include <linux/page_cgroup.h>
> +#include <linux/ksm.h>
> 
>  #include <asm/pgtable.h>
> 
> @@ -256,8 +257,36 @@ void free_page_and_swap_cache(struct page *page)
>  }
> 
>  /*
> + * Move @page to the tail of the inactive LRU so the VM can discard
> + * it rather than swap hot pages out when memory pressure happens.
> + */
> +static bool move_lazyfree(struct page *page)
> +{
> +	if (!trylock_page(page))
> +		return false;
> +
> +	if (PageKsm(page)) {
> +		unlock_page(page);
> +		return false;
> +	}
> +
> +	if (PageSwapCache(page) &&
> +	    try_to_free_swap(page))
> +		ClearPageDirty(page);
> +
> +	if (!PageLazyFree(page)) {
> +		SetPageLazyFree(page);
> +		deactivate_page(page);
> +	}
> +
> +	unlock_page(page);
> +	return true;
> +}
> +
> +/*
>   * Passed an array of pages, drop them all from swapcache and then release
>   * them.  They are removed from the LRU and freed if this is their last use.
> + * If the pages passed are lazyfree, deactivate them instead of freeing.
>   */
>  void free_pages_and_swap_cache(struct page **pages, int nr)
>  {
> @@ -269,7 +298,14 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
>  		int i;
> 
>  		for (i = 0; i < todo; i++)
> -			free_swap_cache(pagep[i]);
> +			if (LazyFree(pagep[i])) {
> +				pagep[i] = ClearLazyFree(pagep[i]);
> +				/* If we failed, just free */
> +				if (!move_lazyfree(pagep[i]))
> +					free_swap_cache(pagep[i]);
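Before the fix below, a quick aside for anyone who wants to poke at the
interface: here is a minimal userspace sketch (not part of the patch) of
how an allocator would consume the hint. It takes the MADV_FREE value
from the mman-common.h hunk above and defines it locally, since
pre-patch headers do not have it.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_FREE
#define MADV_FREE 5			/* value from the patch above */
#endif

int main(void)
{
	size_t len = 2 * 1024 * 1024;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	memset(p, 0xaa, len);		/* dirty the anonymous pages */

	/*
	 * Hint that the contents are dead: under memory pressure the
	 * kernel may now discard these pages instead of swapping them
	 * out; without pressure, the region is reused with no fault.
	 */
	if (madvise(p, len, MADV_FREE))
		perror("madvise(MADV_FREE)");

	munmap(p, len);
	return 0;
}

The interesting path for jemalloc-style allocators is the no-pressure
one: free() becomes a cheap hint, and reusing the region later costs
neither a page fault nor page zeroing, unlike MADV_DONTNEED.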
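One more illustration, since it is easy to miss in the swap.h hunk:
MarkLazyFree()/ClearLazyFree() smuggle the lazyfree flag through the
mmu_gather batch by tagging bit 0 of the struct page pointer, which is
safe because struct page is always at least word-aligned. A userspace
sketch of the trick (mark_lazyfree() and friends are hypothetical
stand-ins, not the kernel helpers):

#include <assert.h>
#include <stdint.h>

struct page { long dummy; };	/* stand-in; only the alignment matters */

static struct page *mark_lazyfree(struct page *p)
{
	return (struct page *)((uintptr_t)p | 0x1UL);	/* set tag bit */
}

static struct page *clear_lazyfree(struct page *p)
{
	return (struct page *)((uintptr_t)p & ~0x1UL);	/* strip tag bit */
}

static int lazyfree(struct page *p)
{
	return (uintptr_t)p & 0x1UL;			/* test tag bit */
}

int main(void)
{
	struct page pg;
	struct page *tagged = mark_lazyfree(&pg);

	assert(lazyfree(tagged));		/* tag is visible ... */
	assert(clear_lazyfree(tagged) == &pg);	/* ... and reversible */
	assert(!lazyfree(&pg));			/* untagged pointer is clean */
	return 0;
}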
Oops, the quoted hunk above got confused with an older version in my
git tree. Here is the fix:

diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0718ecd166dc..882f1c8e5bd2 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -300,9 +300,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
 		for (i = 0; i < todo; i++)
 			if (LazyFree(pagep[i])) {
 				pagep[i] = ClearLazyFree(pagep[i]);
-				/* If we failed, just free */
-				if (!move_lazyfree(pagep[i]))
-					free_swap_cache(pagep[i]);
+				move_lazyfree(pagep[i]);
 			} else {
 				free_swap_cache(pagep[i]);
 			}

-- 
Kind regards,
Minchan Kim