Re: [PATCH 3/4] Convert khugepaged scan functions to work with task_work

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



I don't understand this patch, but...

On 10/23, Alex Thorlton wrote:
>
>  static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
>  {
> +	/* this will add task_pgcollapse_work to task_works */
>  	if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags))
> -		return __khugepaged_enter(mm);
> +		return __khugepaged_enter();

This looks certainly wrong or I am totally confused. __khugepaged_enter()
does task_work_add(current) but we want to kick the child, not the parent.

> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -30,6 +30,8 @@
>  #include <linux/mempolicy.h>
>  #include <linux/migrate.h>
>  #include <linux/task_work.h>
> +#include <linux/types.h>
> +#include <linux/khugepaged.h>
>  
>  #include <trace/events/sched.h>
>  
> @@ -2060,6 +2062,23 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
>  }
>  #endif /* CONFIG_NUMA_BALANCING */
>  
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +void task_pgcollapse_work(struct callback_head *work)
> +{
> +	WARN_ON_ONCE(current != container_of(work, struct task_struct, pgcollapse_work));
> +
> +	work->next = work; /* protect against double add */
> +
> +	pr_info("!!! debug - INFO: task: %s/%d in task_pgcollapse_work\n",
> +		current->comm, (int) current->pid);
> +	khugepaged_do_scan();
> +}
> +#else

Why do you add it into kernel/sched/ ??

> -int __khugepaged_enter(struct mm_struct *mm)
> +int __khugepaged_enter(void)
>  {
> -	struct mm_slot *mm_slot;
> -	int wakeup;
> +	unsigned long period = msecs_to_jiffies(current->pgcollapse_scan_sleep_millisecs);
>  
> -	mm_slot = alloc_mm_slot();
> -	if (!mm_slot)
> -		return -ENOMEM;
> +	pr_info("!!! debug - INFO: task: %s/%d jiffies: %lu period: %lu last_scan: %lu\n",
> +		current->comm, (int) current->pid, jiffies, period,
> +		current->pgcollapse_last_scan);
>  
> -	/* __khugepaged_exit() must not run from under us */
> -	VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
> -	if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
> -		free_mm_slot(mm_slot);
> -		return 0;
> -	}
> +	/* may want to move this up to where we actually do the scan... */
> +	if (time_after(jiffies, current->pgcollapse_last_scan + period)) {
> +		current->pgcollapse_last_scan = jiffies;
>  
> -	spin_lock(&khugepaged_mm_lock);
> -	insert_to_mm_slots_hash(mm, mm_slot);
> -	/*
> -	 * Insert just behind the scanning cursor, to let the area settle
> -	 * down a little.
> -	 */
> -	wakeup = list_empty(&khugepaged_scan.mm_head);
> -	list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
> -	spin_unlock(&khugepaged_mm_lock);
> +		pr_info("!!! debug - INFO: task: %s/%d adding pgcollapse work\n",
> +			current->comm, (int) current->pid);
>  
> -	atomic_inc(&mm->mm_count);
> -	if (wakeup)
> -		wake_up_interruptible(&khugepaged_wait);
> +		/* debug - actual new code */
> +		init_task_work(&current->pgcollapse_work, task_pgcollapse_work);
> +		task_work_add(current, &current->pgcollapse_work, true);
> +	} else {
> +		pr_info("!!! debug - INFO: task: %s/%d skipping pgcollapse_scan\n",
> +			current->comm, (int) current->pid);
> +	}
>  
>  	return 0;
>  }

so mm_slots_hash becomes unused?

Oleg.

> @@ -2069,6 +2063,8 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
>  	VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma);
>  	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
>  	hend = vma->vm_end & HPAGE_PMD_MASK;
> +
> +	/* this will add task_pgcollapse_work to task_works */
>  	if (hstart < hend)
>  		return khugepaged_enter(vma);
>  	return 0;
> @@ -2417,6 +2413,9 @@ static void collapse_huge_page(struct mm_struct *mm,
>  	if (!new_page)
>  		return;
>  
> +	pr_info("!!! debug - INFO: task: %s/%d pgcollapse alloc: %lx on node %d\n",
> +		current->comm, (int) current->pid, address, node);
> +
>  	if (unlikely(mem_cgroup_try_charge(new_page, mm,
>  					   GFP_TRANSHUGE, &memcg)))
>  		return;
> @@ -2514,7 +2513,7 @@ static void collapse_huge_page(struct mm_struct *mm,
>  
>  	*hpage = NULL;
>  
> -	khugepaged_pages_collapsed++;
> +	current->pgcollapse_pages_collapsed++;
>  out_up_write:
>  	up_write(&mm->mmap_sem);
>  	return;
> @@ -2616,44 +2615,34 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
>  }
>  
>  static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
> -					    struct page **hpage)
> -	__releases(&khugepaged_mm_lock)
> -	__acquires(&khugepaged_mm_lock)
> +					   struct page **hpage)
>  {
> -	struct mm_slot *mm_slot;
>  	struct mm_struct *mm;
>  	struct vm_area_struct *vma;
>  	int progress = 0;
> +	/*
> +	 * grab this pointer here to avoid dereferencing
> +	 * the task struct multiple times later
> +	 */
> +	unsigned long *scan_addr_p = &current->pgcollapse_scan_address;
>  
>  	VM_BUG_ON(!pages);
> -	VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
>  
> -	if (khugepaged_scan.mm_slot)
> -		mm_slot = khugepaged_scan.mm_slot;
> -	else {
> -		mm_slot = list_entry(khugepaged_scan.mm_head.next,
> -				     struct mm_slot, mm_node);
> -		khugepaged_scan.address = 0;
> -		khugepaged_scan.mm_slot = mm_slot;
> -	}
> -	spin_unlock(&khugepaged_mm_lock);
> +	pr_info("!!! debug - task: %s/%d starting scan at %lx\n",
> +		current->comm, (int) current->pid, *scan_addr_p);
>  
> -	mm = mm_slot->mm;
> +	mm = current->mm;
>  	down_read(&mm->mmap_sem);
>  	if (unlikely(khugepaged_test_exit(mm)))
>  		vma = NULL;
>  	else
> -		vma = find_vma(mm, khugepaged_scan.address);
> +		vma = find_vma(mm, *scan_addr_p);
>  
>  	progress++;
>  	for (; vma; vma = vma->vm_next) {
>  		unsigned long hstart, hend;
>  
>  		cond_resched();
> -		if (unlikely(khugepaged_test_exit(mm))) {
> -			progress++;
> -			break;
> -		}
>  		if (!hugepage_vma_check(vma)) {
>  skip:
>  			progress++;
> @@ -2663,26 +2652,24 @@ skip:
>  		hend = vma->vm_end & HPAGE_PMD_MASK;
>  		if (hstart >= hend)
>  			goto skip;
> -		if (khugepaged_scan.address > hend)
> +		if (*scan_addr_p > hend)
>  			goto skip;
> -		if (khugepaged_scan.address < hstart)
> -			khugepaged_scan.address = hstart;
> -		VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
> +		if (*scan_addr_p < hstart)
> +			*scan_addr_p = hstart;
> +		VM_BUG_ON(*scan_addr_p & ~HPAGE_PMD_MASK);
>  
> -		while (khugepaged_scan.address < hend) {
> +		while (*scan_addr_p < hend) {
>  			int ret;
>  			cond_resched();
>  			if (unlikely(khugepaged_test_exit(mm)))
>  				goto breakouterloop;
>  
> -			VM_BUG_ON(khugepaged_scan.address < hstart ||
> -				  khugepaged_scan.address + HPAGE_PMD_SIZE >
> -				  hend);
> -			ret = khugepaged_scan_pmd(mm, vma,
> -						  khugepaged_scan.address,
> +			VM_BUG_ON(*scan_addr_p < hstart ||
> +				  *scan_addr_p + HPAGE_PMD_SIZE > hend);
> +			ret = khugepaged_scan_pmd(mm, vma, *scan_addr_p,
>  						  hpage);
>  			/* move to next address */
> -			khugepaged_scan.address += HPAGE_PMD_SIZE;
> +			*scan_addr_p += HPAGE_PMD_SIZE;
>  			progress += HPAGE_PMD_NR;
>  			if (ret)
>  				/* we released mmap_sem so break loop */
> @@ -2694,30 +2681,14 @@ skip:
>  breakouterloop:
>  	up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
>  breakouterloop_mmap_sem:
> -
> -	spin_lock(&khugepaged_mm_lock);
> -	VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
>  	/*
>  	 * Release the current mm_slot if this mm is about to die, or
> -	 * if we scanned all vmas of this mm.
> +	 * if we scanned all vmas of this mm.  Don't think we need the
> +	 * khugepaged_test_exit for the task_work style scan...
>  	 */
>  	if (khugepaged_test_exit(mm) || !vma) {
> -		/*
> -		 * Make sure that if mm_users is reaching zero while
> -		 * khugepaged runs here, khugepaged_exit will find
> -		 * mm_slot not pointing to the exiting mm.
> -		 */
> -		if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
> -			khugepaged_scan.mm_slot = list_entry(
> -				mm_slot->mm_node.next,
> -				struct mm_slot, mm_node);
> -			khugepaged_scan.address = 0;
> -		} else {
> -			khugepaged_scan.mm_slot = NULL;
> -			khugepaged_full_scans++;
> -		}
> -
> -		collect_mm_slot(mm_slot);
> +		*scan_addr_p = 0;
> +		current->pgcollapse_full_scans++;
>  	}
>  
>  	return progress;
> @@ -2735,11 +2706,11 @@ static int khugepaged_wait_event(void)
>  		kthread_should_stop();
>  }
>  
> -static void khugepaged_do_scan(void)
> +void khugepaged_do_scan(void)
>  {
>  	struct page *hpage = NULL;
> -	unsigned int progress = 0, pass_through_head = 0;
> -	unsigned int pages = khugepaged_pages_to_scan;
> +	unsigned int progress = 0;
> +	unsigned int pages = current->pgcollapse_pages_to_scan;
>  	bool wait = true;
>  
>  	barrier(); /* write khugepaged_pages_to_scan to local stack */
> @@ -2750,19 +2721,12 @@ static void khugepaged_do_scan(void)
>  
>  		cond_resched();
>  
> -		if (unlikely(kthread_should_stop() || freezing(current)))
> +		if (unlikely(freezing(current)))
>  			break;
>  
> -		spin_lock(&khugepaged_mm_lock);
> -		if (!khugepaged_scan.mm_slot)
> -			pass_through_head++;
> -		if (khugepaged_has_work() &&
> -		    pass_through_head < 2)
> -			progress += khugepaged_scan_mm_slot(pages - progress,
> -							    &hpage);
> -		else
> -			progress = pages;
> -		spin_unlock(&khugepaged_mm_lock);
> +		progress += khugepaged_scan_mm_slot(pages - progress, &hpage);
> +		pr_info("!!! debug - INFO: task: %s/%d scan iteration progress %u\n",
> +			current->comm, (int) current->pid, progress);
>  	}
>  
>  	if (!IS_ERR_OR_NULL(hpage))
> -- 
> 1.7.12.4
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]