On Sun, 21 Feb 2010 15:10:45 +0100 aarcange@xxxxxxxxxx wrote: > From: Andrea Arcangeli <aarcange@xxxxxxxxxx> > > Add khugepaged to relocate fragmented pages into hugepages if new hugepages > become available. (this is indipendent of the defrag logic that will have to > make new hugepages available) > > Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx> > Acked-by: Rik van Riel <riel@xxxxxxxxxx> > --- > > --- > include/linux/huge_mm.h | 2 > include/linux/khugepaged.h | 32 + > include/linux/sched.h | 1 > kernel/fork.c | 5 > mm/huge_memory.c | 981 ++++++++++++++++++++++++++++++++++++++++++++- > 5 files changed, 1014 insertions(+), 7 deletions(-) > > --- a/include/linux/huge_mm.h > +++ b/include/linux/huge_mm.h > @@ -23,6 +23,8 @@ extern int zap_huge_pmd(struct mmu_gathe > enum transparent_hugepage_flag { > TRANSPARENT_HUGEPAGE_FLAG, > TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, > + TRANSPARENT_HUGEPAGE_KHUGEPAGED_FLAG, > + TRANSPARENT_HUGEPAGE_KHUGEPAGED_REQ_MADV_FLAG, > TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, > #ifdef CONFIG_DEBUG_VM > TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, > --- /dev/null > +++ b/include/linux/khugepaged.h > @@ -0,0 +1,32 @@ > +#ifndef _LINUX_KHUGEPAGED_H > +#define _LINUX_KHUGEPAGED_H > + > +#include <linux/sched.h> /* MMF_VM_HUGEPAGE */ > + > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > +extern int __khugepaged_enter(struct mm_struct *mm); > +extern void __khugepaged_exit(struct mm_struct *mm); > + > +static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) > +{ > + if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags)) > + return __khugepaged_enter(mm); > + return 0; > +} > + > +static inline void khugepaged_exit(struct mm_struct *mm) > +{ > + if (test_bit(MMF_VM_HUGEPAGE, &mm->flags)) > + __khugepaged_exit(mm); > +} > +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ > +static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) > +{ > + return 0; > +} > +static inline void khugepaged_exit(struct mm_struct *mm) > +{ > +} > +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ > + > +#endif /* _LINUX_KHUGEPAGED_H */ > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -432,6 +432,7 @@ extern int get_dumpable(struct mm_struct > #endif > /* leave room for more dump flags */ > #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ > +#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ > > #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) > > --- a/kernel/fork.c > +++ b/kernel/fork.c > @@ -65,6 +65,7 @@ > #include <linux/perf_event.h> > #include <linux/posix-timers.h> > #include <linux/user-return-notifier.h> > +#include <linux/khugepaged.h> > > #include <asm/pgtable.h> > #include <asm/pgalloc.h> > @@ -307,6 +308,9 @@ static int dup_mmap(struct mm_struct *mm > retval = ksm_fork(mm, oldmm); > if (retval) > goto out; > + retval = khugepaged_fork(mm, oldmm); > + if (retval) > + goto out; > > for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { > struct file *file; > @@ -519,6 +523,7 @@ void mmput(struct mm_struct *mm) > if (atomic_dec_and_test(&mm->mm_users)) { > exit_aio(mm); > ksm_exit(mm); > + khugepaged_exit(mm); /* must run before exit_mmap */ > exit_mmap(mm); > set_mm_exe_file(mm, NULL); > if (!list_empty(&mm->mmlist)) { > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -12,14 +12,121 @@ > #include <linux/mmu_notifier.h> > #include <linux/rmap.h> > #include <linux/swap.h> > +#include <linux/mm_inline.h> > +#include <linux/kthread.h> > +#include <linux/khugepaged.h> > #include <asm/tlb.h> > #include <asm/pgalloc.h> > 
#include "internal.h" > > unsigned long transparent_hugepage_flags __read_mostly = > - (1<<TRANSPARENT_HUGEPAGE_FLAG); > + (1<<TRANSPARENT_HUGEPAGE_FLAG)| > + (1<<TRANSPARENT_HUGEPAGE_KHUGEPAGED_FLAG); > + > +/* default scan 8*512 pte (or vmas) every 30 second */ > +static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; > +static unsigned int khugepaged_pages_collapsed; > +static unsigned int khugepaged_full_scans; > +static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; > +/* during fragmentation poll the hugepage allocator once every minute */ > +static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; > +static struct task_struct *khugepaged_thread __read_mostly; > +static DEFINE_MUTEX(khugepaged_mutex); > +static DEFINE_SPINLOCK(khugepaged_mm_lock); > +static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); > + > +static int khugepaged(void *none); > +static int mm_slots_hash_init(void); > +static int khugepaged_slab_init(void); > +static void khugepaged_slab_free(void); > + > +#define MM_SLOTS_HASH_HEADS 1024 > +static struct hlist_head *mm_slots_hash __read_mostly; > +static struct kmem_cache *mm_slot_cache __read_mostly; > + > +/** > + * struct mm_slot - hash lookup from mm to mm_slot > + * @hash: hash collision list > + * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head > + * @mm: the mm that this information is valid for > + */ > +struct mm_slot { > + struct hlist_node hash; > + struct list_head mm_node; > + struct mm_struct *mm; > +}; > + > +/** > + * struct khugepaged_scan - cursor for scanning > + * @mm_head: the head of the mm list to scan > + * @mm_slot: the current mm_slot we are scanning > + * @address: the next address inside that to be scanned > + * > + * There is only the one khugepaged_scan instance of this cursor structure. 
> + */ > +struct khugepaged_scan { > + struct list_head mm_head; > + struct mm_slot *mm_slot; > + unsigned long address; > +} khugepaged_scan = { > + .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), > +}; > + > +#define khugepaged_enabled() \ > + (transparent_hugepage_flags & \ > + ((1<<TRANSPARENT_HUGEPAGE_KHUGEPAGED_FLAG) | \ > + (1<<TRANSPARENT_HUGEPAGE_KHUGEPAGED_REQ_MADV_FLAG))) > +#define khugepaged_always() \ > + (transparent_hugepage_flags & \ > + (1<<TRANSPARENT_HUGEPAGE_KHUGEPAGED_FLAG)) > +#define khugepaged_req_madv() \ > + (transparent_hugepage_flags & \ > + (1<<TRANSPARENT_HUGEPAGE_KHUGEPAGED_REQ_MADV_FLAG)) > + > +static int start_khugepaged(void) > +{ > + int err = 0; > + if (khugepaged_enabled()) { > + int wakeup; > + if (unlikely(!mm_slot_cache || !mm_slots_hash)) { > + err = -ENOMEM; > + goto out; > + } > + mutex_lock(&khugepaged_mutex); > + if (!khugepaged_thread) > + khugepaged_thread = kthread_run(khugepaged, NULL, > + "khugepaged"); > + if (unlikely(IS_ERR(khugepaged_thread))) { > + clear_bit(TRANSPARENT_HUGEPAGE_KHUGEPAGED_FLAG, > + &transparent_hugepage_flags); > + clear_bit(TRANSPARENT_HUGEPAGE_KHUGEPAGED_REQ_MADV_FLAG, > + &transparent_hugepage_flags); > + printk(KERN_ERR > + "khugepaged: kthread_run(khugepaged) failed\n"); > + err = PTR_ERR(khugepaged_thread); > + khugepaged_thread = NULL; > + } > + wakeup = !list_empty(&khugepaged_scan.mm_head); > + mutex_unlock(&khugepaged_mutex); > + if (wakeup) > + wake_up_interruptible(&khugepaged_wait); > + } else > + /* wakeup to exit */ > + wake_up_interruptible(&khugepaged_wait); > +out: > + return err; > +} > > #ifdef CONFIG_SYSFS > + > +static void wakeup_khugepaged(void) > +{ > + mutex_lock(&khugepaged_mutex); > + if (khugepaged_thread) > + wake_up_process(khugepaged_thread); > + mutex_unlock(&khugepaged_mutex); > +} > + > static ssize_t double_flag_show(struct kobject *kobj, > struct kobj_attribute *attr, char *buf, > enum transparent_hugepage_flag enabled, > @@ -151,20 +258,188 @@ static struct attribute *hugepage_attr[] > > static struct attribute_group hugepage_attr_group = { > .attrs = hugepage_attr, > - .name = "transparent_hugepage", > +}; > + > +static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, > + struct kobj_attribute *attr, > + char *buf) > +{ > + return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); > +} > + > +static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, > + struct kobj_attribute *attr, > + const char *buf, size_t count) > +{ > + unsigned long msecs; > + int err; > + > + err = strict_strtoul(buf, 10, &msecs); > + if (err || msecs > UINT_MAX) > + return -EINVAL; > + > + khugepaged_scan_sleep_millisecs = msecs; > + wakeup_khugepaged(); > + > + return count; > +} > +static struct kobj_attribute scan_sleep_millisecs_attr = > + __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, > + scan_sleep_millisecs_store); > + > +static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, > + struct kobj_attribute *attr, > + char *buf) > +{ > + return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); > +} > + > +static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, > + struct kobj_attribute *attr, > + const char *buf, size_t count) > +{ > + unsigned long msecs; > + int err; > + > + err = strict_strtoul(buf, 10, &msecs); > + if (err || msecs > UINT_MAX) > + return -EINVAL; > + > + khugepaged_alloc_sleep_millisecs = msecs; > + wakeup_khugepaged(); > + > + return count; > +} > +static struct kobj_attribute alloc_sleep_millisecs_attr = > + 
__ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, > + alloc_sleep_millisecs_store); > + > +static ssize_t pages_to_scan_show(struct kobject *kobj, > + struct kobj_attribute *attr, > + char *buf) > +{ > + return sprintf(buf, "%u\n", khugepaged_pages_to_scan); > +} > +static ssize_t pages_to_scan_store(struct kobject *kobj, > + struct kobj_attribute *attr, > + const char *buf, size_t count) > +{ > + int err; > + unsigned long pages; > + > + err = strict_strtoul(buf, 10, &pages); > + if (err || !pages || pages > UINT_MAX) > + return -EINVAL; > + > + khugepaged_pages_to_scan = pages; > + > + return count; > +} > +static struct kobj_attribute pages_to_scan_attr = > + __ATTR(pages_to_scan, 0644, pages_to_scan_show, > + pages_to_scan_store); > + > +static ssize_t pages_collapsed_show(struct kobject *kobj, > + struct kobj_attribute *attr, > + char *buf) > +{ > + return sprintf(buf, "%u\n", khugepaged_pages_collapsed); > +} > +static struct kobj_attribute pages_collapsed_attr = > + __ATTR_RO(pages_collapsed); > + > +static ssize_t full_scans_show(struct kobject *kobj, > + struct kobj_attribute *attr, > + char *buf) > +{ > + return sprintf(buf, "%u\n", khugepaged_full_scans); > +} > +static struct kobj_attribute full_scans_attr = > + __ATTR_RO(full_scans); > + > +static ssize_t khugepaged_enabled_show(struct kobject *kobj, > + struct kobj_attribute *attr, char *buf) > +{ > + return double_flag_show(kobj, attr, buf, > + TRANSPARENT_HUGEPAGE_KHUGEPAGED_FLAG, > + TRANSPARENT_HUGEPAGE_KHUGEPAGED_REQ_MADV_FLAG); > +} > +static ssize_t khugepaged_enabled_store(struct kobject *kobj, > + struct kobj_attribute *attr, > + const char *buf, size_t count) > +{ > + ssize_t ret; > + > + ret = double_flag_store(kobj, attr, buf, count, > + TRANSPARENT_HUGEPAGE_KHUGEPAGED_FLAG, > + TRANSPARENT_HUGEPAGE_KHUGEPAGED_REQ_MADV_FLAG); > + if (ret > 0) { > + int err = start_khugepaged(); > + if (err) > + ret = err; > + } > + return ret; > +} > +static struct kobj_attribute khugepaged_enabled_attr = > + __ATTR(enabled, 0644, khugepaged_enabled_show, > + khugepaged_enabled_store); > + > +static struct attribute *khugepaged_attr[] = { > + &khugepaged_enabled_attr.attr, > + &pages_to_scan_attr.attr, > + &pages_collapsed_attr.attr, > + &full_scans_attr.attr, > + &scan_sleep_millisecs_attr.attr, > + &alloc_sleep_millisecs_attr.attr, > + NULL, > +}; > + > +static struct attribute_group khugepaged_attr_group = { > + .attrs = khugepaged_attr, > + .name = "khugepaged", > }; > #endif /* CONFIG_SYSFS */ > > static int __init hugepage_init(void) > { > -#ifdef CONFIG_SYSFS > int err; > +#ifdef CONFIG_SYSFS > + static struct kobject *hugepage_kobj; > > - err = sysfs_create_group(mm_kobj, &hugepage_attr_group); > - if (err) > - printk(KERN_ERR "hugepage: register sysfs failed\n"); > + err = -ENOMEM; > + hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); > + if (unlikely(!hugepage_kobj)) { > + printk(KERN_ERR "hugepage: failed kobject create\n"); > + goto out; > + } > + > + err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group); > + if (err) { > + printk(KERN_ERR "hugepage: failed register hugeage group\n"); > + goto out; > + } > + > + err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group); > + if (err) { > + printk(KERN_ERR "hugepage: failed register hugeage group\n"); > + goto out; > + } > #endif > - return 0; > + > + err = khugepaged_slab_init(); > + if (err) > + goto out; > + > + err = mm_slots_hash_init(); > + if (err) { > + khugepaged_slab_free(); > + goto out; > + } > + > + 
start_khugepaged(); > + > +out: > + return err; > } > module_init(hugepage_init) > > @@ -181,6 +456,15 @@ static int __init setup_transparent_huge > transparent_hugepage_flags); > transparent_hugepage_flags = 0; > } > + if (test_bit(TRANSPARENT_HUGEPAGE_KHUGEPAGED_FLAG, > + &transparent_hugepage_flags) && > + test_bit(TRANSPARENT_HUGEPAGE_KHUGEPAGED_REQ_MADV_FLAG, > + &transparent_hugepage_flags)) { > + printk(KERN_WARNING > + "transparent_hugepage=%lu invalid parameter, disabling", > + transparent_hugepage_flags); > + transparent_hugepage_flags = 0; > + } > return 1; > } > __setup("transparent_hugepage=", setup_transparent_hugepage); > @@ -276,6 +560,12 @@ int do_huge_pmd_anonymous_page(struct mm > if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { > if (unlikely(anon_vma_prepare(vma))) > return VM_FAULT_OOM; > + if (unlikely(!test_bit(MMF_VM_HUGEPAGE, &mm->flags))) > + if (khugepaged_always() || > + (khugepaged_req_madv() && > + vma->vm_flags & VM_HUGEPAGE)) > + if (__khugepaged_enter(mm)) > + return VM_FAULT_OOM; > page = alloc_hugepage(); > if (unlikely(!page)) > goto out; > @@ -894,3 +1184,680 @@ int hugepage_madvise(unsigned long *vm_f > > return 0; > } > + > +static int __init khugepaged_slab_init(void) > +{ > + mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", > + sizeof(struct mm_slot), > + __alignof__(struct mm_slot), 0, NULL); > + if (!mm_slot_cache) > + return -ENOMEM; > + > + return 0; > +} > + > +static void __init khugepaged_slab_free(void) > +{ > + kmem_cache_destroy(mm_slot_cache); > + mm_slot_cache = NULL; > +} > + > +static inline struct mm_slot *alloc_mm_slot(void) > +{ > + if (!mm_slot_cache) /* initialization failed */ > + return NULL; > + return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); > +} > + > +static inline void free_mm_slot(struct mm_slot *mm_slot) > +{ > + kmem_cache_free(mm_slot_cache, mm_slot); > +} > + > +static int __init mm_slots_hash_init(void) > +{ > + mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), > + GFP_KERNEL); > + if (!mm_slots_hash) > + return -ENOMEM; > + return 0; > +} > + > +#if 0 > +static void __init mm_slots_hash_free(void) > +{ > + kfree(mm_slots_hash); > + mm_slots_hash = NULL; > +} > +#endif > + > +static struct mm_slot *get_mm_slot(struct mm_struct *mm) > +{ > + struct mm_slot *mm_slot; > + struct hlist_head *bucket; > + struct hlist_node *node; > + > + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) > + % MM_SLOTS_HASH_HEADS]; > + hlist_for_each_entry(mm_slot, node, bucket, hash) { > + if (mm == mm_slot->mm) > + return mm_slot; > + } > + return NULL; > +} > + > +static void insert_to_mm_slots_hash(struct mm_struct *mm, > + struct mm_slot *mm_slot) > +{ > + struct hlist_head *bucket; > + > + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) > + % MM_SLOTS_HASH_HEADS]; > + mm_slot->mm = mm; > + hlist_add_head(&mm_slot->hash, bucket); > +} > + > +int __khugepaged_enter(struct mm_struct *mm) > +{ > + struct mm_slot *mm_slot; > + int wakeup; > + > + mm_slot = alloc_mm_slot(); > + if (!mm_slot) > + return -ENOMEM; > + > + spin_lock(&khugepaged_mm_lock); > + insert_to_mm_slots_hash(mm, mm_slot); > + /* > + * Insert just behind the scanning cursor, to let the area settle > + * down a little. 
> + */ > + wakeup = list_empty(&khugepaged_scan.mm_head); > + list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); > + set_bit(MMF_VM_HUGEPAGE, &mm->flags); > + spin_unlock(&khugepaged_mm_lock); > + > + atomic_inc(&mm->mm_count); > + if (wakeup) > + wake_up_interruptible(&khugepaged_wait); > + > + return 0; > +} > + > +void __khugepaged_exit(struct mm_struct *mm) > +{ > + struct mm_slot *mm_slot; > + int free = 0; > + > + spin_lock(&khugepaged_mm_lock); > + mm_slot = get_mm_slot(mm); > + if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { > + hlist_del(&mm_slot->hash); > + list_del(&mm_slot->mm_node); > + free = 1; > + } > + > + if (free) { > + clear_bit(MMF_VM_HUGEPAGE, &mm->flags); > + spin_unlock(&khugepaged_mm_lock); > + free_mm_slot(mm_slot); > + mmdrop(mm); > + } else if (mm_slot) { > + spin_unlock(&khugepaged_mm_lock); > + /* > + * This is required to serialize against > + * khugepaged_test_exit() (which is guaranteed to run > + * under mmap sem read mode). Stop here (after we > + * return all pagetables will be destroyed) until > + * khugepaged has finished working on the pagetables > + * under the mmap_sem. > + */ > + down_write(&mm->mmap_sem); > + up_write(&mm->mmap_sem); > + } else > + spin_unlock(&khugepaged_mm_lock); > +} > + > +static inline int khugepaged_test_exit(struct mm_struct *mm) > +{ > + return atomic_read(&mm->mm_users) == 0; > +} > + > +static void release_pte_page(struct page *page) > +{ > + /* 0 stands for page_is_file_cache(page) == false */ > + dec_zone_page_state(page, NR_ISOLATED_ANON + 0); > + unlock_page(page); > + putback_lru_page(page); > +} > + > +static void release_pte_pages(pte_t *pte, pte_t *_pte) > +{ > + while (--_pte >= pte) > + release_pte_page(pte_page(*_pte)); > +} > + > +static void release_all_pte_pages(pte_t *pte) > +{ > + release_pte_pages(pte, pte + HPAGE_PMD_NR); > +} > + > +static int __collapse_huge_page_isolate(struct vm_area_struct *vma, > + unsigned long address, > + pte_t *pte) > +{ > + struct page *page; > + pte_t *_pte; > + int referenced = 0, isolated = 0; > + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; > + _pte++, address += PAGE_SIZE) { > + pte_t pteval = *_pte; > + if (!pte_present(pteval) || !pte_write(pteval)) { > + release_pte_pages(pte, _pte); > + goto out; > + } > + /* If there is no mapped pte young don't collapse the page */ > + if (pte_young(pteval)) > + referenced = 1; > + page = vm_normal_page(vma, address, pteval); > + if (unlikely(!page)) { > + release_pte_pages(pte, _pte); > + goto out; > + } > + VM_BUG_ON(PageCompound(page)); > + BUG_ON(!PageAnon(page)); > + VM_BUG_ON(!PageSwapBacked(page)); > + > + /* cannot use mapcount: can't collapse if there's a gup pin */ > + if (page_count(page) != 1) { > + release_pte_pages(pte, _pte); > + goto out; > + } > + /* > + * We can do it before isolate_lru_page because the > + * page can't be freed from under us. NOTE: PG_lock > + * is needed to serialize against split_huge_page > + * when invoked from the VM. > + */ > + if (!trylock_page(page)) { > + release_pte_pages(pte, _pte); > + goto out; > + } > + /* > + * Isolate the page to avoid collapsing an hugepage > + * currently in use by the VM. 
> + */ > + if (isolate_lru_page(page)) { > + unlock_page(page); > + release_pte_pages(pte, _pte); > + goto out; > + } > + /* 0 stands for page_is_file_cache(page) == false */ > + inc_zone_page_state(page, NR_ISOLATED_ANON + 0); > + VM_BUG_ON(!PageLocked(page)); > + VM_BUG_ON(PageLRU(page)); > + } > + if (unlikely(!referenced)) > + release_all_pte_pages(pte); > + else > + isolated = 1; > +out: > + return isolated; > +} > + > +static void __collapse_huge_page_copy(pte_t *pte, struct page *page, > + struct vm_area_struct *vma, > + unsigned long address, > + spinlock_t *ptl) > +{ > + pte_t *_pte; > + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { > + struct page *src_page = pte_page(*_pte); > + copy_user_highpage(page, src_page, address, vma); > + VM_BUG_ON(page_mapcount(src_page) != 1); > + VM_BUG_ON(page_count(src_page) != 2); > + release_pte_page(src_page); > + /* > + * ptl mostly unnecessary, but preempt has to be disabled > + * to update the per-cpu stats inside page_remove_rmap(). > + */ > + spin_lock(ptl); > + /* paravirt calls inside pte_clear here are superfluous */ > + pte_clear(vma->vm_mm, address, _pte); > + page_remove_rmap(src_page); > + spin_unlock(ptl); > + free_page_and_swap_cache(src_page); > + > + address += PAGE_SIZE; > + page++; > + } > +} > + > +static void collapse_huge_page(struct mm_struct *mm, > + unsigned long address, > + struct page **hpage) > +{ > + struct vm_area_struct *vma; > + pgd_t *pgd; > + pud_t *pud; > + pmd_t *pmd, _pmd; > + pte_t *pte; > + pgtable_t pgtable; > + struct page *new_page; > + spinlock_t *ptl; > + int isolated; > + unsigned long hstart, hend; > + > + VM_BUG_ON(address & ~HPAGE_PMD_MASK); > + VM_BUG_ON(!*hpage); > + > + /* > + * Prevent all access to pagetables with the exception of > + * gup_fast later hanlded by the ptep_clear_flush and the VM > + * handled by the anon_vma lock + PG_lock. > + */ > + down_write(&mm->mmap_sem); > + if (unlikely(khugepaged_test_exit(mm))) > + goto out; > + > + vma = find_vma(mm, address); > + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; > + hend = vma->vm_end & HPAGE_PMD_MASK; > + if (address < hstart || address + HPAGE_PMD_SIZE > hend) > + goto out; > + > + if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) > + goto out; > + > + /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ > + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) > + goto out; > + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); > + > + pgd = pgd_offset(mm, address); > + if (!pgd_present(*pgd)) > + goto out; > + > + pud = pud_offset(pgd, address); > + if (!pud_present(*pud)) > + goto out; > + > + pmd = pmd_offset(pud, address); > + /* pmd can't go away or become huge under us */ > + if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) > + goto out; > + > + /* > + * Stop anon_vma rmap pagetable access. vma->anon_vma->lock is > + * enough for now (we don't need to check each anon_vma > + * pointed by each page->mapping) because collapse_huge_page > + * only works on not-shared anon pages (that are guaranteed to > + * belong to vma->anon_vma). 
> + */ > + spin_lock(&vma->anon_vma->lock); > + > + pte = pte_offset_map(pmd, address); > + ptl = pte_lockptr(mm, pmd); > + > + spin_lock(&mm->page_table_lock); /* probably unnecessary */ > + /* after this gup_fast can't run anymore */ > + _pmd = pmdp_clear_flush_notify(vma, address, pmd); > + spin_unlock(&mm->page_table_lock); > + > + spin_lock(ptl); > + isolated = __collapse_huge_page_isolate(vma, address, pte); > + spin_unlock(ptl); Is it safe to take zone->lock under the pte lock and anon_vma->lock? I think the usual way is: page = follow_page(FOLL_GET); isolate_lru_page(page); > + pte_unmap(pte); > + > + if (unlikely(!isolated)) { > + spin_lock(&mm->page_table_lock); > + BUG_ON(!pmd_none(*pmd)); > + set_pmd_at(mm, address, pmd, _pmd); > + spin_unlock(&mm->page_table_lock); > + spin_unlock(&vma->anon_vma->lock); > + goto out; > + } > + > + /* > + * All pages are isolated and locked so anon_vma rmap > + * can't run anymore. > + */ > + spin_unlock(&vma->anon_vma->lock); > + > + new_page = *hpage; I'm not sure, but where is this *hpage charged to the proper memcg? I think it would be good to charge this new page at the top of this function. (And cancel the charge on failure, of course.) Thanks, -Kame > + __collapse_huge_page_copy(pte, new_page, vma, address, ptl); > + __SetPageUptodate(new_page); > + pgtable = pmd_pgtable(_pmd); > + VM_BUG_ON(page_count(pgtable) != 1); > + VM_BUG_ON(page_mapcount(pgtable) != 0); > + > + _pmd = mk_pmd(new_page, vma->vm_page_prot); > + _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); > + _pmd = pmd_mkhuge(_pmd); > + > + /* > + * spin_lock() below is not the equivalent of smp_wmb(), so > + * this is needed to avoid the copy_huge_page writes to become > + * visible after the set_pmd_at() write. > + */ > + smp_wmb(); > + > + spin_lock(&mm->page_table_lock); > + BUG_ON(!pmd_none(*pmd)); > + page_add_new_anon_rmap(new_page, vma, address); > + set_pmd_at(mm, address, pmd, _pmd); > + update_mmu_cache(vma, address, entry); > + prepare_pmd_huge_pte(pgtable, mm); > + mm->nr_ptes--; > + spin_unlock(&mm->page_table_lock); > + > + *hpage = NULL; > + khugepaged_pages_collapsed++; > +out: > + up_write(&mm->mmap_sem); > +} > + > +static int khugepaged_scan_pmd(struct mm_struct *mm, > + struct vm_area_struct *vma, > + unsigned long address, > + struct page **hpage) > +{ > + pgd_t *pgd; > + pud_t *pud; > + pmd_t *pmd; > + pte_t *pte, *_pte; > + int ret = 0, referenced = 0; > + struct page *page; > + unsigned long _address; > + spinlock_t *ptl; > + > + VM_BUG_ON(address & ~HPAGE_PMD_MASK); > + > + pgd = pgd_offset(mm, address); > + if (!pgd_present(*pgd)) > + goto out; > + > + pud = pud_offset(pgd, address); > + if (!pud_present(*pud)) > + goto out; > + > + pmd = pmd_offset(pud, address); > + if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) > + goto out; > + > + pte = pte_offset_map_lock(mm, pmd, address, &ptl); > + for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; > + _pte++, _address += PAGE_SIZE) { > + pte_t pteval = *_pte; > + if (!pte_present(pteval) || !pte_write(pteval)) > + goto out_unmap; > + if (pte_young(pteval)) > + referenced = 1; > + page = vm_normal_page(vma, _address, pteval); > + if (unlikely(!page)) > + goto out_unmap; > + VM_BUG_ON(PageCompound(page)); > + if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) > + goto out_unmap; > + /* cannot use mapcount: can't collapse if there's a gup pin */ > + if (page_count(page) != 1) > + goto out_unmap; > + } > + if (referenced) > + ret = 1; > +out_unmap: > + pte_unmap_unlock(pte, ptl); > + if (ret) { > + up_read(&mm->mmap_sem);
> + collapse_huge_page(mm, address, hpage); > + } > +out: > + return ret; > +} > + > +static void collect_mm_slot(struct mm_slot *mm_slot) > +{ > + struct mm_struct *mm = mm_slot->mm; > + > + VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); > + > + if (khugepaged_test_exit(mm)) { > + /* free mm_slot */ > + hlist_del(&mm_slot->hash); > + list_del(&mm_slot->mm_node); > + clear_bit(MMF_VM_MERGEABLE, &mm->flags); > + free_mm_slot(mm_slot); > + mmdrop(mm); > + } > +} > + > +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, > + struct page **hpage) > +{ > + struct mm_slot *mm_slot; > + struct mm_struct *mm; > + struct vm_area_struct *vma; > + int progress = 0; > + > + VM_BUG_ON(!pages); > + VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); > + > + if (khugepaged_scan.mm_slot) > + mm_slot = khugepaged_scan.mm_slot; > + else { > + mm_slot = list_entry(khugepaged_scan.mm_head.next, > + struct mm_slot, mm_node); > + khugepaged_scan.address = 0; > + khugepaged_scan.mm_slot = mm_slot; > + } > + spin_unlock(&khugepaged_mm_lock); > + > + mm = mm_slot->mm; > + down_read(&mm->mmap_sem); > + if (unlikely(khugepaged_test_exit(mm))) > + vma = NULL; > + else > + vma = find_vma(mm, khugepaged_scan.address); > + > + progress++; > + for (; vma; vma = vma->vm_next) { > + unsigned long hstart, hend; > + > + cond_resched(); > + if (unlikely(khugepaged_test_exit(mm))) { > + progress++; > + break; > + } > + > + if (!(vma->vm_flags & VM_HUGEPAGE) && > + !khugepaged_always()) { > + progress++; > + continue; > + } > + > + /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ > + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) { > + khugepaged_scan.address = vma->vm_end; > + progress++; > + continue; > + } > + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); > + > + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; > + hend = vma->vm_end & HPAGE_PMD_MASK; > + if (hstart >= hend) { > + progress++; > + continue; > + } > + if (khugepaged_scan.address < hstart) > + khugepaged_scan.address = hstart; > + if (khugepaged_scan.address > hend) { > + khugepaged_scan.address = hend + HPAGE_PMD_SIZE; > + progress++; > + continue; > + } > + BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); > + > + while (khugepaged_scan.address < hend) { > + int ret; > + cond_resched(); > + if (unlikely(khugepaged_test_exit(mm))) > + goto breakouterloop; > + > + VM_BUG_ON(khugepaged_scan.address < hstart || > + khugepaged_scan.address + HPAGE_PMD_SIZE > > + hend); > + ret = khugepaged_scan_pmd(mm, vma, > + khugepaged_scan.address, > + hpage); > + /* move to next address */ > + khugepaged_scan.address += HPAGE_PMD_SIZE; > + progress += HPAGE_PMD_NR; > + if (ret) > + /* we released mmap_sem so break loop */ > + goto breakouterloop_mmap_sem; > + if (progress >= pages) > + goto breakouterloop; > + } > + } > +breakouterloop: > + up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ > +breakouterloop_mmap_sem: > + > + spin_lock(&khugepaged_mm_lock); > + BUG_ON(khugepaged_scan.mm_slot != mm_slot); > + /* > + * Release the current mm_slot if this mm is about to die, or > + * if we scanned all vmas of this mm. > + */ > + if (khugepaged_test_exit(mm) || !vma) { > + /* > + * Make sure that if mm_users is reaching zero while > + * khugepaged runs here, khugepaged_exit will find > + * mm_slot not pointing to the exiting mm. 
> + */ > + if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { > + khugepaged_scan.mm_slot = list_entry( > + mm_slot->mm_node.next, > + struct mm_slot, mm_node); > + khugepaged_scan.address = 0; > + } else { > + khugepaged_scan.mm_slot = NULL; > + khugepaged_full_scans++; > + } > + > + collect_mm_slot(mm_slot); > + } > + > + return progress; > +} > + > +static int khugepaged_has_work(void) > +{ > + return !list_empty(&khugepaged_scan.mm_head) && > + khugepaged_enabled(); > +} > + > +static int khugepaged_wait_event(void) > +{ > + return !list_empty(&khugepaged_scan.mm_head) || > + !khugepaged_enabled(); > +} > + > +static void khugepaged_do_scan(struct page **hpage) > +{ > + unsigned int progress = 0, pass_through_head = 0; > + unsigned int pages = khugepaged_pages_to_scan; > + > + barrier(); /* write khugepaged_pages_to_scan to local stack */ > + > + while (progress < pages) { > + cond_resched(); > + > + if (!*hpage) { > + *hpage = alloc_hugepage(); > + if (unlikely(!*hpage)) > + break; > + } > + > + spin_lock(&khugepaged_mm_lock); > + if (!khugepaged_scan.mm_slot) > + pass_through_head++; > + if (khugepaged_has_work() && > + pass_through_head < 2) > + progress += khugepaged_scan_mm_slot(pages - progress, > + hpage); > + else > + progress = pages; > + spin_unlock(&khugepaged_mm_lock); > + } > +} > + > +static struct page *khugepaged_alloc_hugepage(void) > +{ > + struct page *hpage; > + > + do { > + hpage = alloc_hugepage(); > + if (!hpage) > + schedule_timeout_interruptible( > + msecs_to_jiffies( > + khugepaged_alloc_sleep_millisecs)); > + } while (unlikely(!hpage) && > + likely(khugepaged_enabled())); > + return hpage; > +} > + > +static void khugepaged_loop(void) > +{ > + struct page *hpage; > + > + while (likely(khugepaged_enabled())) { > + hpage = khugepaged_alloc_hugepage(); > + if (unlikely(!hpage)) > + break; > + > + khugepaged_do_scan(&hpage); > + if (hpage) > + put_page(hpage); > + if (khugepaged_has_work()) { > + if (!khugepaged_scan_sleep_millisecs) > + continue; > + schedule_timeout_interruptible( > + msecs_to_jiffies( > + khugepaged_scan_sleep_millisecs)); > + } else if (khugepaged_enabled()) > + wait_event_interruptible(khugepaged_wait, > + khugepaged_wait_event()); > + } > +} > + > +static int khugepaged(void *none) > +{ > + struct mm_slot *mm_slot; > + > + set_user_nice(current, 19); > + > + for (;;) { > + BUG_ON(khugepaged_thread != current); > + khugepaged_loop(); > + BUG_ON(khugepaged_thread != current); > + > + mutex_lock(&khugepaged_mutex); > + if (!khugepaged_enabled()) > + break; > + mutex_unlock(&khugepaged_mutex); > + } > + > + spin_lock(&khugepaged_mm_lock); > + mm_slot = khugepaged_scan.mm_slot; > + khugepaged_scan.mm_slot = NULL; > + if (mm_slot) > + collect_mm_slot(mm_slot); > + spin_unlock(&khugepaged_mm_lock); > + > + khugepaged_thread = NULL; > + mutex_unlock(&khugepaged_mutex); > + > + return 0; > +} > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@xxxxxxxxxx For more info on Linux MM, > see: http://www.linux-mm.org/ . > Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a> > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxxx For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>
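To make the first review question above concrete (isolate_lru_page() takes zone->lock, and __collapse_huge_page_isolate() calls it with both the pte lock and anon_vma->lock already held), the ordering the reviewer points at would look roughly like the sketch below, written in the context of mm/huge_memory.c against the same 2.6.33-era APIs the patch already uses. The helper name collapse_isolate_range() is hypothetical, the caller is assumed to hold mmap_sem, and the pte_young/PageAnon/gup-pin checks as well as the putback of already-isolated pages on failure are omitted for brevity.

/*
 * Sketch only: pin each small page with follow_page(FOLL_GET) and pull
 * it off the LRU *before* the pte lock and anon_vma->lock are taken,
 * so zone->lock (taken inside isolate_lru_page()) never nests under them.
 */
static int collapse_isolate_range(struct vm_area_struct *vma,
				  unsigned long address)
{
	unsigned long addr;
	struct page *page;

	for (addr = address; addr < address + HPAGE_PMD_SIZE;
	     addr += PAGE_SIZE) {
		page = follow_page(vma, addr, FOLL_GET);
		if (!page)
			return 0;
		if (isolate_lru_page(page)) {
			/* page is busy elsewhere, give up on this range */
			put_page(page);
			return 0;
		}
		/* 0 stands for page_is_file_cache(page) == false */
		inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
		/* the pte still maps the page, so the gup pin can be dropped */
		put_page(page);
	}
	return 1;
}

Isolating this early would presumably still require re-validating the ptes under the pte lock once the pmd has been cleared, which is the part the patch currently gets for free by isolating inside __collapse_huge_page_isolate().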
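For the memcg question, the suggestion amounts to charging the preallocated hugepage before collapse_huge_page() does any real work and cancelling the charge on every failure path. The skeleton below only shows where such calls would sit, using the mem_cgroup_newpage_charge()/mem_cgroup_uncharge_page() pair from this kernel's memcg API (which would need an #include <linux/memcontrol.h> in mm/huge_memory.c). The function name is hypothetical, the actual isolation/copy/pmd-installation work is elided, and whether a single call can correctly account a compound hugepage, and how the charges on the small pages being freed get balanced, is left open.

/*
 * Sketch only: charge *hpage up front and undo the charge whenever the
 * collapse is aborted before the page is mapped.
 */
static void collapse_huge_page_charged(struct mm_struct *mm,
				       unsigned long address,
				       struct page **hpage)
{
	struct page *new_page = *hpage;

	/*
	 * Charge before taking mmap_sem so any reclaim this triggers
	 * happens without holding the semaphore for write.
	 */
	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
		return;

	down_write(&mm->mmap_sem);
	if (unlikely(khugepaged_test_exit(mm)))
		goto out_uncharge;

	/*
	 * ... vma checks, isolation, copy and pmd installation go here;
	 * every "goto out" in the existing function would become a
	 * "goto out_uncharge" ...
	 */

	*hpage = NULL;
	khugepaged_pages_collapsed++;
	up_write(&mm->mmap_sem);
	return;

out_uncharge:
	up_write(&mm->mmap_sem);
	mem_cgroup_uncharge_page(new_page);
}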
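Finally, when khugepaged's enabled knob is put into madvise mode (TRANSPARENT_HUGEPAGE_KHUGEPAGED_REQ_MADV_FLAG), a mapping only becomes a collapse candidate after madvise(MADV_HUGEPAGE) has set VM_HUGEPAGE on the vma, at which point the first huge page fault registers the mm via __khugepaged_enter(). A small, hypothetical userspace test of that path is sketched below; MADV_HUGEPAGE comes from the patched kernel headers (14 is assumed here, matching the mainline definition), and none of this is part of the patch itself.

#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14	/* assumed value, check the patched headers */
#endif

#define REGION_SIZE (16UL << 20)	/* a few pmd-sized (2MB) chunks */

int main(void)
{
	void *p;

	/* pmd-aligned so whole 2MB ranges fall inside the mapping */
	if (posix_memalign(&p, 2UL << 20, REGION_SIZE))
		return 1;
	if (madvise(p, REGION_SIZE, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");

	memset(p, 1, REGION_SIZE);	/* fault the region in */
	sleep(60);			/* give khugepaged time to scan and collapse */
	return 0;
}

Watching pages_collapsed and full_scans under /sys/kernel/mm/transparent_hugepage/khugepaged/ while this runs is one way to observe the collapse happening.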