Re: [PATCH 19/33] autonuma: memory follows CPU algorithm and task/mm_autonuma stats collection

On Thu, Oct 04, 2012 at 01:51:01AM +0200, Andrea Arcangeli wrote:
> This implements the following parts of autonuma:
> 
> o knuma_scand: daemon for setting pte_numa and pmd_numa while
>   gathering NUMA mm stats
> 
> o NUMA hinting page fault handler: Migrate on Fault and gathers NUMA
>   task stats
> 
> o Migrate On Fault: in the context of the NUMA hinting page faults we
>   migrate memory from remote nodes to the local node
> 
> o The rest of autonuma core logic: false sharing detection, sysfs and
>   initialization routines
> 
> The AutoNUMA algorithm when knuma_scand is not running is fully
> bypassed and it will not alter the runtime of memory management or the
> scheduler.
> 
> The whole AutoNUMA logic is a chain reaction as a result of the
> actions of the knuma_scand. Various parts of the code can be described
> like different gears (gears as in glxgears).
> 
> knuma_scand is the first gear and it collects the mm_autonuma
> per-process statistics and at the same time it sets the ptes and pmds
> it scans respectively as pte_numa and pmd_numa.
> 
> The second gear are the numa hinting page faults. These are triggered
> by the pte_numa/pmd_numa pmd/ptes. They collect the task_autonuma
> per-thread statistics. They also implement the memory follow CPU logic
> where we track if pages are repeatedly accessed by remote nodes. The
> memory follow CPU logic can decide to migrate pages across different
> NUMA nodes using Migrate On Fault.
> 
> The third gear is Migrate On Fault. Pages pending for migration are
> migrated in the context of the NUMA hinting page faults. Each
> destination node has a migration rate limit configurable with sysfs.
> 

Ok, all that is understandable.

> The fourth gear is the NUMA scheduler balancing code. That computes
> the statistical information collected in mm->mm_autonuma and
> p->task_autonuma and evaluates the status of all CPUs to decide if
> tasks should be migrated to CPUs in remote nodes.
> 

I imagine this is where all the real complexity lies.

> The only "input" information of the AutoNUMA algorithm that isn't
> collected through NUMA hinting page faults are the per-process
> mm->mm_autonuma statistics. Those mm_autonuma statistics are collected
> by the knuma_scand pmd/pte scans that are also responsible for setting
> pte_numa/pmd_numa to activate the NUMA hinting page faults.
> 
> knuma_scand -> NUMA hinting page faults
>   |                       |
>  \|/                     \|/
> mm_autonuma  <->  task_autonuma (CPU follow memory, this is mm_autonuma too)
>                   page last_nid  (false thread sharing/thread shared memory detection )
>                   queue or cancel page migration (memory follow CPU)
> 
> The code includes some fixes from Hillf Danton <dhillf@xxxxxxxxx>.
> 
> Math documentation on autonuma_last_nid in the header of
> last_nid_set() reworked from sched-numa code by Peter Zijlstra
> <a.p.zijlstra@xxxxxxxxx>.
> 
> Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx>
> Signed-off-by: Hillf Danton <dhillf@xxxxxxxxx>
> ---
>  mm/autonuma.c    | 1365 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  mm/huge_memory.c |   34 ++
>  2 files changed, 1399 insertions(+), 0 deletions(-)
>  create mode 100644 mm/autonuma.c
> 
> diff --git a/mm/autonuma.c b/mm/autonuma.c
> new file mode 100644
> index 0000000..1b2530c
> --- /dev/null
> +++ b/mm/autonuma.c
> @@ -0,0 +1,1365 @@
> +/*
> + *  Copyright (C) 2012  Red Hat, Inc.
> + *
> + *  This work is licensed under the terms of the GNU GPL, version 2. See
> + *  the COPYING file in the top-level directory.
> + *
> + *  Boot with "numa=fake=2" to test on non NUMA systems.
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/rmap.h>
> +#include <linux/kthread.h>
> +#include <linux/mmu_notifier.h>
> +#include <linux/freezer.h>
> +#include <linux/mm_inline.h>
> +#include <linux/migrate.h>
> +#include <linux/swap.h>
> +#include <linux/autonuma.h>
> +#include <asm/tlbflush.h>
> +#include <asm/pgtable.h>
> +
> +unsigned long autonuma_flags __read_mostly =
> +	(1<<AUTONUMA_POSSIBLE_FLAG)
> +#ifdef CONFIG_AUTONUMA_DEFAULT_ENABLED
> +	|(1<<AUTONUMA_ENABLED_FLAG)
> +#endif
> +	|(1<<AUTONUMA_SCAN_PMD_FLAG);
> +
> +static DEFINE_MUTEX(knumad_mm_mutex);
> +

/* Protects the list of mm's being considered by knuma_scand */

> +/* knuma_scand */
> +static unsigned int scan_sleep_millisecs __read_mostly = 100;
> +static unsigned int scan_sleep_pass_millisecs __read_mostly = 10000;
> +static unsigned int pages_to_scan __read_mostly = 128*1024*1024/PAGE_SIZE;
> +static DECLARE_WAIT_QUEUE_HEAD(knuma_scand_wait);
> +static unsigned long full_scans;
> +static unsigned long pages_scanned;
> +
> +/* page migration rate limiting control */
> +static unsigned int migrate_sleep_millisecs __read_mostly = 100;
> +static unsigned int pages_to_migrate __read_mostly = 128*1024*1024/PAGE_SIZE;
> +static volatile unsigned long pages_migrated;
> +

Ok, so it's rate limited to a small value. Explain the values better:

/*
 * page migration rate limiting control
 * AutoNUMA will not migrate more than pages_to_migrate pages per
 * migrate_sleep_millisecs window
 */

If that's right, the names could be a bit more helpful:

/*
 * page migration rate limiting control
 * During a given time window of migrate_ratelimit_window_ms
 * milliseconds, no more than migrate_ratelimit_nr_pages will
 * be migrated. The default values will not migrate more than
 * 1.2G/sec
 */
migrate_ratelimit_window_ms = 100
migrate_ratelimit_nr_pages  = 128 << 20 >> PAGE_SHIFT

or something?

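Just to make it concrete, the rate limit check in autonuma_migrate_page()
would then read something like this (sketch only, with the suggested knob
names substituted in):

	autonuma_migrate_lock(dst_nid);
	/* start a new rate limit window if the previous one expired */
	if (time_after(jiffies, pgdat->autonuma_migrate_last_jiffies +
		       msecs_to_jiffies(migrate_ratelimit_window_ms))) {
		pgdat->autonuma_migrate_nr_pages = 0;
		pgdat->autonuma_migrate_last_jiffies = jiffies;
	}
	/* budget for this window is already spent, skip the migration */
	if (pgdat->autonuma_migrate_nr_pages >= migrate_ratelimit_nr_pages) {
		autonuma_migrate_unlock(dst_nid);
		goto out;
	}
	pgdat->autonuma_migrate_nr_pages += nr_pages;
	autonuma_migrate_unlock(dst_nid);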

> +static struct knuma_scand_data {
> +	struct list_head mm_head; /* entry: mm->mm_autonuma->mm_node */
> +	struct mm_struct *mm;
> +	unsigned long address;
> +	unsigned long *mm_numa_fault_tmp;
> +} knuma_scand_data = {
> +	.mm_head = LIST_HEAD_INIT(knuma_scand_data.mm_head),
> +};
> +
> +/* caller already holds the compound_lock */
> +void autonuma_migrate_split_huge_page(struct page *page,
> +				      struct page *page_tail)
> +{
> +	int last_nid;
> +
> +	last_nid = ACCESS_ONCE(page->autonuma_last_nid);
> +	if (last_nid >= 0)
> +		page_tail->autonuma_last_nid = last_nid;
> +}
> +
> +static int sync_isolate_migratepages(struct list_head *migratepages,
> +				     struct page *page,
> +				     struct pglist_data *pgdat,
> +				     bool *migrated)
> +{

So, why does this thing not do a

struct compact_control cc = {
        .nr_freepages = 0,
        .nr_migratepages = 0,
        .zone = zone,
        .sync = false,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);

start_pfn = page_to_pfn(page);
end_pfn = start_pfn + 1;

if (PageTransHuge(page)) {
	end_pfn = start_pfn + HPAGE_PMD_NR;
	VM_BUG_ON(!PageAnon(page));
	if (unlikely(split_huge_page(page))) {
		autonuma_printk("autonuma migrate THP free\n");
		goto out;
	}
}
isolate_migratepages_range(page_zone(page), &cc, start_pfn, end_pfn);


I know that this looks really clumsy and you will need to shortcut the
migrate_async_suitable() check in isolate_migratepages_range() but it
looks like something that could be a simple helper function in compaction.c
to create the compact_control and return the migratepages to you.

That way you would also get the lock contention detection and the like.
Otherwise you're going to end up implementing your own version of
compact_checklock_irqsave() when that LRU lock (taken here for single
pages, ouch) gets too hot. You might also be able to implement a batching
mechanism of some sort more easily (dunno what that would look like yet).
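
Roughly what I have in mind is a helper like the one below in compaction.c
(the name is invented and isolate_migratepages_range() would need a way
for this caller to skip the migrate_async_suitable() pageblock check):

/* mm/compaction.c: hypothetical helper, for illustration only */
unsigned long compaction_isolate_single_page(struct page *page,
					     struct list_head *migratepages)
{
	struct zone *zone = page_zone(page);
	struct compact_control cc = {
		.nr_freepages = 0,
		.nr_migratepages = 0,
		.zone = zone,
		.sync = false,
	};
	unsigned long start_pfn = page_to_pfn(page);
	unsigned long end_pfn = start_pfn + 1;

	INIT_LIST_HEAD(&cc.freepages);
	INIT_LIST_HEAD(&cc.migratepages);

	if (PageTransHuge(page)) {
		end_pfn = start_pfn + HPAGE_PMD_NR;
		VM_BUG_ON(!PageAnon(page));
		if (unlikely(split_huge_page(page)))
			return 0;
	}

	isolate_migratepages_range(zone, &cc, start_pfn, end_pfn);

	/* hand the isolated pages back to the caller for migrate_pages() */
	list_splice(&cc.migratepages, migratepages);
	return cc.nr_migratepages;
}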


> +	struct zone *zone;
> +	struct lruvec *lruvec;
> +	int nr_subpages;
> +	struct page *subpage;
> +	int ret = 0;
> +
> +	nr_subpages = 1;
> +	if (PageTransHuge(page)) {
> +		nr_subpages = HPAGE_PMD_NR;
> +		VM_BUG_ON(!PageAnon(page));
> +		/* FIXME: remove split_huge_page */
> +		if (unlikely(split_huge_page(page))) {
> +			autonuma_printk("autonuma migrate THP free\n");
> +			goto out;
> +		}
> +	}
> +
> +	/* All THP subpages are guaranteed to be in the same zone */
> +	zone = page_zone(page);
> +
> +	for (subpage = page; subpage < page+nr_subpages; subpage++) {
> +		spin_lock_irq(&zone->lru_lock);
> +
> +		/* Must run under the lru_lock and before page isolation */
> +		lruvec = mem_cgroup_page_lruvec(subpage, zone);
> +
> +		if (!__isolate_lru_page(subpage, ISOLATE_ASYNC_MIGRATE)) {
> +			VM_BUG_ON(PageTransCompound(subpage));
> +			del_page_from_lru_list(subpage, lruvec,
> +					       page_lru(subpage));
> +			inc_zone_state(zone, page_is_file_cache(subpage) ?
> +				       NR_ISOLATED_FILE : NR_ISOLATED_ANON);
> +			spin_unlock_irq(&zone->lru_lock);
> +
> +			list_add(&subpage->lru, migratepages);
> +			ret++;
> +		} else {
> +			/* losing page */
> +			spin_unlock_irq(&zone->lru_lock);
> +		}
> +	}
> +
> +	/*
> +	 * Pin the head subpage at least until the first
> +	 * __isolate_lru_page succeeds (__isolate_lru_page pins it
> +	 * again when it succeeds). If we unpin before
> +	 * __isolate_lru_page successd, the page could be freed and
> +	 * reallocated out from under us. Thus our previous checks on
> +	 * the page, and the split_huge_page, would be worthless.
> +	 *
> +	 * We really only need to do this if "ret > 0" but it doesn't
> +	 * hurt to do it unconditionally as nobody can reference
> +	 * "page" anymore after this and so we can avoid an "if (ret >
> +	 * 0)" branch here.
> +	 */
> +	put_page(page);
> +	/*
> +	 * Tell the caller we already released its pin, to avoid a
> +	 * double free.
> +	 */
> +	*migrated = true;
> +
> +out:
> +	return ret;
> +}
> +
> +static bool autonuma_balance_pgdat(struct pglist_data *pgdat,
> +				   int nr_migrate_pages)
> +{

This is actually a true/false check function, whereas page reclaim's
balance_pgdat() actually does the balancing. This made my brain skip a
little. Rename it to

should_autonuma_balance_pgdat()

Document what returning true means:

/* Returns true if the destination pgdat has a zone with enough free
 * pages above the high watermark to absorb nr_migrate_pages without
 * waking kswapd
 */
static bool should_autonuma_balance_pgdat(....)

> +	/* FIXME: this only check the wmarks, make it move
> +	 * "unused" memory or pagecache by queuing it to
> +	 * pgdat->autonuma_migrate_head[pgdat->node_id].
> +	 */
> +	int z;
> +	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
> +		struct zone *zone = pgdat->node_zones + z;
> +
> +		if (!populated_zone(zone))
> +			continue;
> +
> +		if (zone->all_unreclaimable)
> +			continue;
> +
> +		/*
> +		 * FIXME: in theory we're ok if we can obtain
> +		 * pages_to_migrate pages from all zones, it doesn't
> +		 * need to be all in a single zone. We care about the
> +		 * pgdat, not the zone.
> +		 */
> +
> +		/*
> +		 * Try not to wakeup kswapd by allocating
> +		 * pages_to_migrate pages.
> +		 */
> +		if (!zone_watermark_ok(zone, 0,
> +				       high_wmark_pages(zone) +
> +				       nr_migrate_pages,
> +				       0, 0))
> +			continue;
> +		return true;
> +	}
> +	return false;
> +}

prepare_kswapd_sleep() has a very similar check for whether all zones
are ok. I think it could be split out and mostly shared, minus the
all_unreclaimable bit?
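
The shared part could have roughly this shape (sketch only, name invented;
kswapd's caller would want the order and classzone_idx arguments back):

/* true if some zone in the pgdat has nr_pages free above the high wmark */
static bool pgdat_can_absorb_pages(struct pglist_data *pgdat, int nr_pages,
				   bool skip_unreclaimable)
{
	int z;

	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
		struct zone *zone = pgdat->node_zones + z;

		if (!populated_zone(zone))
			continue;
		if (skip_unreclaimable && zone->all_unreclaimable)
			continue;
		if (zone_watermark_ok(zone, 0,
				      high_wmark_pages(zone) + nr_pages,
				      0, 0))
			return true;
	}
	return false;
}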

> +
> +static struct page *alloc_migrate_dst_page(struct page *page,
> +					   unsigned long data,
> +					   int **result)
> +{
> +	int nid = (int) data;
> +	struct page *newpage;
> +	newpage = alloc_pages_exact_node(nid,
> +					 (GFP_HIGHUSER_MOVABLE | GFP_THISNODE |
> +					  __GFP_NOMEMALLOC | __GFP_NORETRY |
> +					  __GFP_NOWARN | __GFP_NO_KSWAPD) &

__GFP_NO_KSWAPD is scheduled for removal. When it goes, this allocation
will wake kswapd, which you probably don't want. Will you need to bring
the flag back or make this an atomic allocation?

> +					 ~GFP_IOFS, 0);
> +	if (newpage)
> +		newpage->autonuma_last_nid = page->autonuma_last_nid;
> +	return newpage;
> +}
> +
> +static inline void autonuma_migrate_lock(int nid)
> +{
> +	spin_lock(&NODE_DATA(nid)->autonuma_migrate_lock);
> +}
> +
> +static inline void autonuma_migrate_unlock(int nid)
> +{
> +	spin_unlock(&NODE_DATA(nid)->autonuma_migrate_lock);
> +}
> +
> +static bool autonuma_migrate_page(struct page *page, int dst_nid,
> +				  int page_nid, bool *migrated)
> +{
> +	int isolated = 0;
> +	LIST_HEAD(migratepages);
> +	struct pglist_data *pgdat = NODE_DATA(dst_nid);
> +	int nr_pages = hpage_nr_pages(page);
> +	unsigned long autonuma_migrate_nr_pages = 0;
> +
> +	autonuma_migrate_lock(dst_nid);

Why is this lock necessary?

It's necessary because multiple processes can be migrating on fault at
the same time and the counters must be protected.

/*
 * autonuma_migrate_lock protects against multiple processes performing
 * migrate-on-fault at the same time. The counter statistics must be
 * kept accurate.
 */

Severely doubt this is a hot lock.

> +	if (time_after(jiffies, pgdat->autonuma_migrate_last_jiffies +
> +		       msecs_to_jiffies(migrate_sleep_millisecs))) {
> +		autonuma_migrate_nr_pages = pgdat->autonuma_migrate_nr_pages;
> +		pgdat->autonuma_migrate_nr_pages = 0;
> +		pgdat->autonuma_migrate_last_jiffies = jiffies;
> +	}
> +	if (pgdat->autonuma_migrate_nr_pages >= pages_to_migrate) {
> +		autonuma_migrate_unlock(dst_nid);
> +		goto out;
> +	}
> +	pgdat->autonuma_migrate_nr_pages += nr_pages;
> +	autonuma_migrate_unlock(dst_nid);
> +
> +	if (autonuma_migrate_nr_pages)
> +		autonuma_printk("migrated %lu pages to node %d\n",
> +				autonuma_migrate_nr_pages, dst_nid);
> +
> +	if (autonuma_balance_pgdat(pgdat, nr_pages))
> +		isolated = sync_isolate_migratepages(&migratepages,
> +						     page, pgdat,
> +						     migrated);
> +

At some point you will want to batch that a bit - either 1 THP or
SWAP_CLUSTER_MAX.

We should have either tracepoints or vmstats available that allow us to
track how many pages are being scanned, isolated and migrated. The
vmstats would give a coarse view of the overall system. The tracepoints
may allow someone to figure out the rate at which data is moving between
nodes due to autonuma.
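
For example, something along these lines (the tracepoint name and fields
are invented here, purely to illustrate the kind of data I mean):

/* include/trace/events/autonuma.h: hypothetical, for illustration only */
TRACE_EVENT(autonuma_migrate_pages,

	TP_PROTO(int dst_nid, unsigned long nr_isolated,
		 unsigned long nr_migrated),

	TP_ARGS(dst_nid, nr_isolated, nr_migrated),

	TP_STRUCT__entry(
		__field(int,		dst_nid)
		__field(unsigned long,	nr_isolated)
		__field(unsigned long,	nr_migrated)
	),

	TP_fast_assign(
		__entry->dst_nid	= dst_nid;
		__entry->nr_isolated	= nr_isolated;
		__entry->nr_migrated	= nr_migrated;
	),

	TP_printk("dst_nid=%d nr_isolated=%lu nr_migrated=%lu",
		__entry->dst_nid, __entry->nr_isolated, __entry->nr_migrated)
);

Plus a few vmstat counters bumped alongside pages_migrated would give the
coarse view.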

> +	if (isolated) {
> +		int err;
> +		pages_migrated += isolated; /* FIXME: per node */
> +		err = migrate_pages(&migratepages, alloc_migrate_dst_page,
> +				    pgdat->node_id, false, MIGRATE_ASYNC);

Another option would be to push some of the stats down here. The current
stats are from compaction and they suck. compact_pages_moved and
compact_pagemigrate_failed should be replaced and pushed down into
migration. That alone will be insufficient to tell which migrations are
due to autonuma and which are due to compaction, but again tracepoints
could be used to answer that question if someone cared.

> +		if (err)
> +			putback_lru_pages(&migratepages);
> +	}
> +	BUG_ON(!list_empty(&migratepages));
> +out:
> +	return isolated;
> +}
> +
> +static void cpu_follow_memory_pass(struct task_struct *p,
> +				   struct task_autonuma *task_autonuma,
> +				   unsigned long *task_numa_fault)
> +{
> +	int nid;
> +	/* If a new pass started, degrade the stats by a factor of 2 */
> +	for_each_node(nid)
> +		task_numa_fault[nid] >>= 1;
> +	task_autonuma->task_numa_fault_tot >>= 1;
> +}
> +
> +static void numa_hinting_fault_cpu_follow_memory(struct task_struct *p,
> +						 int access_nid,
> +						 int numpages,
> +						 bool new_pass)
> +{
> +	struct task_autonuma *task_autonuma = p->task_autonuma;
> +	unsigned long *task_numa_fault = task_autonuma->task_numa_fault;
> +
> +	/* prevent sched_autonuma_balance() to run on top of us */
> +	local_bh_disable();
> +
> +	if (unlikely(new_pass))
> +		cpu_follow_memory_pass(p, task_autonuma, task_numa_fault);
> +	task_numa_fault[access_nid] += numpages;
> +	task_autonuma->task_numa_fault_tot += numpages;
> +
> +	local_bh_enable();
> +}
> +
> +/*
> + * In this function we build a temporal CPU_node<->page relation by
> + * using a two-stage autonuma_last_nid filter to remove short/unlikely
> + * relations.
> + *
> + * Using P(p) ~ n_p / n_t as per frequentest probability, we can
> + * equate a node's CPU usage of a particular page (n_p) per total
> + * usage of this page (n_t) (in a given time-span) to a probability.
> + *
> + * Our periodic faults will then sample this probability and getting
> + * the same result twice in a row, given these samples are fully
> + * independent, is then given by P(n)^2, provided our sample period
> + * is sufficiently short compared to the usage pattern.
> + *
> + * This quadric squishes small probabilities, making it less likely
> + * we act on an unlikely CPU_node<->page relation.
> + */
> +static inline bool last_nid_set(struct page *page, int this_nid)
> +{
> +	bool ret = true;
> +	int autonuma_last_nid = ACCESS_ONCE(page->autonuma_last_nid);
> +	VM_BUG_ON(this_nid < 0);
> +	VM_BUG_ON(this_nid >= MAX_NUMNODES);
> +	if (autonuma_last_nid != this_nid) {
> +		if (autonuma_last_nid >= 0)
> +			ret = false;
> +		ACCESS_ONCE(page->autonuma_last_nid) = this_nid;
> +	}
> +	return ret;
> +}
> +

Ok, you could include your blurb from another mail about how this is
functionally very similar to how pages are aged - two LRU scans for
page aging versus two knuma_scan passes for autonuma. It does mean that
small processes may have a tendency to move between nodes faster because
the knuma_scan pass completes faster. Not many workloads will notice though.
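
To put rough example numbers on the quadratic squishing described in the
comment above: a node doing 20% of the accesses to a page has P = 0.2 and
passes the two-stage filter with probability P^2 = 0.04, while a node
doing 80% passes with 0.64, so the weak relation is suppressed by a
factor of 16 rather than the factor of 4 a single sample would give.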

> +static int numa_hinting_fault_memory_follow_cpu(struct page *page,
> +						int this_nid, int page_nid,
> +						bool new_pass,
> +						bool *migrated)
> +{
> +	if (!last_nid_set(page, this_nid))
> +		goto out;
> +	if (!PageLRU(page))
> +		goto out;
> +	if (this_nid != page_nid) {
> +		if (autonuma_migrate_page(page, this_nid, page_nid,
> +					  migrated))
> +			return this_nid;
> +	}
> +out:
> +	return page_nid;
> +}
> +
> +bool numa_hinting_fault(struct page *page, int numpages)
> +{
> +	bool migrated = false;
> +
> +	/*
> +	 * "current->mm" could be different from the "mm" where the
> +	 * NUMA hinting page fault happened, if get_user_pages()
> +	 * triggered the fault on some other process "mm". That is ok,
> +	 * all we care about is to count the "page_nid" access on the
> +	 * current->task_autonuma, even if the page belongs to a
> +	 * different "mm".
> +	 */
> +	WARN_ON_ONCE(!current->mm);
> +	if (likely(current->mm && !current->mempolicy && autonuma_enabled())) {
> +		struct task_struct *p = current;
> +		int this_nid, page_nid, access_nid;
> +		bool new_pass;
> +
> +		/*
> +		 * new_pass is only true the first time the thread
> +		 * faults on this pass of knuma_scand.
> +		 */
> +		new_pass = p->task_autonuma->task_numa_fault_pass !=
> +			p->mm->mm_autonuma->mm_numa_fault_pass;
> +		page_nid = page_to_nid(page);
> +		this_nid = numa_node_id();
> +		VM_BUG_ON(this_nid < 0);
> +		VM_BUG_ON(this_nid >= MAX_NUMNODES);
> +		access_nid = numa_hinting_fault_memory_follow_cpu(page,
> +								  this_nid,
> +								  page_nid,
> +								  new_pass,
> +								  &migrated);
> +		/* "page" has been already freed if "migrated" is true */
> +		numa_hinting_fault_cpu_follow_memory(p, access_nid,
> +						     numpages, new_pass);
> +		if (unlikely(new_pass))
> +			/*
> +			 * Set the task's fault_pass equal to the new
> +			 * mm's fault_pass, so new_pass will be false
> +			 * on the next fault by this thread in this
> +			 * same pass.
> +			 */
> +			p->task_autonuma->task_numa_fault_pass =
> +				p->mm->mm_autonuma->mm_numa_fault_pass;
> +	}
> +
> +	return migrated;
> +}

ok.

> +
> +/* NUMA hinting page fault entry point for ptes */
> +int pte_numa_fixup(struct mm_struct *mm, struct vm_area_struct *vma,
> +		   unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
> +{

Naming. We're not "fixing" anything and nothing is broken, so the entry
point should not be named as a fixup. It should follow the same pattern
as the other fault handlers and be called do_numahint_page():

handle_pte_fault -> do_numahint_page
handle_pte_fault -> do_anonymous_page

etc.
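
i.e. the dispatch in handle_pte_fault() would then read something like
this (a sketch of the call site, not the actual hunk from this series):

	if (pte_numa(entry))
		return do_numahint_page(mm, vma, address, entry, pte, pmd);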

Functionally I did not spot a problem.

> +	struct page *page;
> +	spinlock_t *ptl;
> +	bool migrated;
> +
> +	/*
> +	 * The "pte" at this point cannot be used safely without
> +	 * validation through pte_unmap_same(). It's of NUMA type but
> +	 * the pfn may be screwed if the read is non atomic.
> +	 */
> +
> +	ptl = pte_lockptr(mm, pmd);
> +	spin_lock(ptl);
> +	if (unlikely(!pte_same(*ptep, pte)))
> +		goto out_unlock;
> +	pte = pte_mknonnuma(pte);
> +	set_pte_at(mm, addr, ptep, pte);
> +	page = vm_normal_page(vma, addr, pte);
> +	BUG_ON(!page);
> +	if (unlikely(page_mapcount(page) != 1))
> +		goto out_unlock;
> +	get_page(page);
> +	pte_unmap_unlock(ptep, ptl);
> +
> +	migrated = numa_hinting_fault(page, 1);
> +	if (!migrated)
> +		put_page(page);
> +out:
> +	return 0;
> +
> +out_unlock:
> +	pte_unmap_unlock(ptep, ptl);
> +	goto out;
> +}
> +
> +/* NUMA hinting page fault entry point for regular pmds */
> +int pmd_numa_fixup(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp)

do_numahint_pmd

> +{
> +	pmd_t pmd;
> +	pte_t *pte, *orig_pte;
> +	unsigned long _addr = addr & PMD_MASK;
> +	unsigned long offset;
> +	spinlock_t *ptl;
> +	bool numa = false;
> +	struct vm_area_struct *vma;
> +	bool migrated;
> +
> +	spin_lock(&mm->page_table_lock);
> +	pmd = *pmdp;
> +	if (pmd_numa(pmd)) {
> +		set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
> +		numa = true;
> +	}
> +	spin_unlock(&mm->page_table_lock);
> +
> +	if (!numa)
> +		return 0;
> +
> +	vma = find_vma(mm, _addr);
> +	/* we're in a page fault so some vma must be in the range */
> +	BUG_ON(!vma);
> +	BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
> +	offset = max(_addr, vma->vm_start) & ~PMD_MASK;
> +	VM_BUG_ON(offset >= PMD_SIZE);
> +	orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
> +	pte += offset >> PAGE_SHIFT;
> +	for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {

Pity you can't use walk_pmd_range() but it's unsuitable.

> +		pte_t pteval = *pte;
> +		struct page * page;
> +		if (!pte_present(pteval))
> +			continue;
> +		if (addr >= vma->vm_end) {
> +			vma = find_vma(mm, addr);
> +			/* there's a pte present so there must be a vma */
> +			BUG_ON(!vma);
> +			BUG_ON(addr < vma->vm_start);
> +		}
> +		if (pte_numa(pteval)) {
> +			pteval = pte_mknonnuma(pteval);
> +			set_pte_at(mm, addr, pte, pteval);
> +		}
> +		page = vm_normal_page(vma, addr, pteval);
> +		if (unlikely(!page))
> +			continue;
> +		/* only check non-shared pages */
> +		if (unlikely(page_mapcount(page) != 1))
> +			continue;
> +		get_page(page);
> +		pte_unmap_unlock(pte, ptl);
> +
> +		migrated = numa_hinting_fault(page, 1);
> +		if (!migrated)
> +			put_page(page);
> +
> +		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
> +	}
> +	pte_unmap_unlock(orig_pte, ptl);
> +	return 0;
> +}
> +
> +static inline int task_autonuma_size(void)
> +{
> +	return sizeof(struct task_autonuma) +
> +		nr_node_ids * sizeof(unsigned long);
> +}
> +
> +static inline int task_autonuma_reset_size(void)
> +{
> +	struct task_autonuma *task_autonuma = NULL;
> +	return task_autonuma_size() -
> +		(int)((char *)(&task_autonuma->task_numa_fault_pass) -
> +		      (char *)task_autonuma);
> +}
> +
> +static void __task_autonuma_reset(struct task_autonuma *task_autonuma)
> +{
> +	memset(&task_autonuma->task_numa_fault_pass, 0,
> +	       task_autonuma_reset_size());
> +}
> +
> +static void task_autonuma_reset(struct task_autonuma *task_autonuma)
> +{
> +	task_autonuma->task_selected_nid = -1;
> +	__task_autonuma_reset(task_autonuma);
> +}
> +
> +static inline int mm_autonuma_fault_size(void)
> +{
> +	return nr_node_ids * sizeof(unsigned long);
> +}
> +
> +static inline int mm_autonuma_size(void)
> +{
> +	return sizeof(struct mm_autonuma) + mm_autonuma_fault_size();
> +}
> +
> +static inline int mm_autonuma_reset_size(void)
> +{
> +	struct mm_autonuma *mm_autonuma = NULL;
> +	return mm_autonuma_size() -
> +		(int)((char *)(&mm_autonuma->mm_numa_fault_pass) -
> +		      (char *)mm_autonuma);
> +}
> +
> +static void mm_autonuma_reset(struct mm_autonuma *mm_autonuma)
> +{
> +	memset(&mm_autonuma->mm_numa_fault_pass, 0, mm_autonuma_reset_size());
> +}
> +
> +void autonuma_setup_new_exec(struct task_struct *p)
> +{
> +	if (p->task_autonuma)
> +		task_autonuma_reset(p->task_autonuma);
> +	if (p->mm && p->mm->mm_autonuma)
> +		mm_autonuma_reset(p->mm->mm_autonuma);
> +}
> +
> +static inline int knumad_test_exit(struct mm_struct *mm)
> +{
> +	return atomic_read(&mm->mm_users) == 0;
> +}
> +
> +/*
> + * Here we search for not shared page mappings (mapcount == 1) and we
> + * set up the pmd/pte_numa on those mappings so the very next access
> + * will fire a NUMA hinting page fault. We also collect the
> + * mm_autonuma statistics for this process mm at the same time.
> + */
> +static int knuma_scand_pmd(struct mm_struct *mm,
> +			   struct vm_area_struct *vma,
> +			   unsigned long address)
> +{
> +	pgd_t *pgd;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +	pte_t *pte, *_pte;
> +	struct page *page;
> +	unsigned long _address, end;
> +	spinlock_t *ptl;
> +	int ret = 0;
> +
> +	VM_BUG_ON(address & ~PAGE_MASK);
> +
> +	pgd = pgd_offset(mm, address);
> +	if (!pgd_present(*pgd))
> +		goto out;
> +
> +	pud = pud_offset(pgd, address);
> +	if (!pud_present(*pud))
> +		goto out;
> +
> +	pmd = pmd_offset(pud, address);
> +	if (pmd_none(*pmd))
> +		goto out;
> +
> +	if (pmd_trans_huge_lock(pmd, vma) == 1) {
> +		int page_nid;
> +		unsigned long *fault_tmp;
> +		ret = HPAGE_PMD_NR;
> +
> +		VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> +
> +		page = pmd_page(*pmd);
> +
> +		/* only check non-shared pages */
> +		if (page_mapcount(page) != 1) {
> +			spin_unlock(&mm->page_table_lock);
> +			goto out;
> +		}
> +
> +		page_nid = page_to_nid(page);
> +		fault_tmp = knuma_scand_data.mm_numa_fault_tmp;
> +		fault_tmp[page_nid] += ret;
> +
> +		if (pmd_numa(*pmd)) {
> +			spin_unlock(&mm->page_table_lock);
> +			goto out;
> +		}
> +
> +		set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd));
> +		/* defer TLB flush to lower the overhead */
> +		spin_unlock(&mm->page_table_lock);
> +		goto out;
> +	}
> +

Ok, so it's another area to watch out for. The competition on mmap_sem
means that knuma_scand can increase the latency of mmap() and basically
anything else that requires down_write(mmap_sem). If the lock is
contended, knuma_scand will spin on it whereas it probably should back
off, and this is a possibility as at this level you are dealing with
mm->page_table_lock and not the fine-grained ptl.

That said, these are probably the same concerns that exist for khugepaged
and so far that scanning has not been a major problem. Something to keep
an eye on, but not necessarily get into a twist over either. As with so
many other places, it's a contributor to System CPU usage that must be
watched and incrementally improved.

> +	if (pmd_trans_unstable(pmd))
> +		goto out;
> +	VM_BUG_ON(!pmd_present(*pmd));
> +
> +	end = min(vma->vm_end, (address + PMD_SIZE) & PMD_MASK);
> +	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
> +	for (_address = address, _pte = pte; _address < end;
> +	     _pte++, _address += PAGE_SIZE) {
> +		pte_t pteval = *_pte;
> +		unsigned long *fault_tmp;
> +		if (!pte_present(pteval))
> +			continue;
> +		page = vm_normal_page(vma, _address, pteval);
> +		if (unlikely(!page))
> +			continue;
> +		/* only check non-shared pages */
> +		if (page_mapcount(page) != 1)
> +			continue;
> +
> +		fault_tmp = knuma_scand_data.mm_numa_fault_tmp;
> +		fault_tmp[page_to_nid(page)]++;
> +
> +		if (pte_numa(pteval))
> +			continue;
> +
> +		if (!autonuma_scan_pmd())
> +			set_pte_at(mm, _address, _pte, pte_mknuma(pteval));
> +
> +		/* defer TLB flush to lower the overhead */
> +		ret++;
> +	}
> +	pte_unmap_unlock(pte, ptl);
> +
> +	if (ret && !pmd_numa(*pmd) && autonuma_scan_pmd()) {
> +		/*
> +		 * Mark the page table pmd as numa if "autonuma scan
> +		 * pmd" mode is enabled.
> +		 */
> +		spin_lock(&mm->page_table_lock);
> +		set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd));
> +		spin_unlock(&mm->page_table_lock);
> +		/* defer TLB flush to lower the overhead */
> +	}
> +
> +out:
> +	return ret;
> +}
> +
> +static void mm_numa_fault_tmp_flush(struct mm_struct *mm)
> +{
> +	int nid;
> +	struct mm_autonuma *mma = mm->mm_autonuma;
> +	unsigned long tot;
> +	unsigned long *fault_tmp = knuma_scand_data.mm_numa_fault_tmp;
> +
> +	/* FIXME: would be better protected with write_seqlock_bh() */
> +	local_bh_disable();
> +
> +	tot = 0;
> +	for_each_node(nid) {
> +		unsigned long faults = fault_tmp[nid];
> +		fault_tmp[nid] = 0;
> +		mma->mm_numa_fault[nid] = faults;
> +		tot += faults;
> +	}
> +	mma->mm_numa_fault_tot = tot;
> +
> +	local_bh_enable();
> +}
> +
> +static void mm_numa_fault_tmp_reset(void)
> +{
> +	memset(knuma_scand_data.mm_numa_fault_tmp, 0,
> +	       mm_autonuma_fault_size());
> +}
> +
> +static inline void validate_mm_numa_fault_tmp(unsigned long address)
> +{
> +#ifdef CONFIG_DEBUG_VM
> +	int nid;
> +	if (address)
> +		return;
> +	for_each_node(nid)
> +		BUG_ON(knuma_scand_data.mm_numa_fault_tmp[nid]);
> +#endif
> +}
> +
> +/*
> + * Scan the next part of the mm. Keep track of the progress made and
> + * return it.
> + */
> +static int knumad_do_scan(void)
> +{
> +	struct mm_struct *mm;
> +	struct mm_autonuma *mm_autonuma;
> +	unsigned long address;
> +	struct vm_area_struct *vma;
> +	int progress = 0;
> +
> +	mm = knuma_scand_data.mm;
> +	/*
> +	 * knuma_scand_data.mm is NULL after the end of each
> +	 * knuma_scand pass. So when it's NULL we've start from
> +	 * scratch from the very first mm in the list.
> +	 */
> +	if (!mm) {
> +		if (unlikely(list_empty(&knuma_scand_data.mm_head)))
> +			return pages_to_scan;
> +		mm_autonuma = list_entry(knuma_scand_data.mm_head.next,
> +					 struct mm_autonuma, mm_node);
> +		mm = mm_autonuma->mm;
> +		knuma_scand_data.address = 0;
> +		knuma_scand_data.mm = mm;
> +		atomic_inc(&mm->mm_count);
> +		mm_autonuma->mm_numa_fault_pass++;
> +	}
> +	address = knuma_scand_data.address;
> +
> +	validate_mm_numa_fault_tmp(address);
> +
> +	mutex_unlock(&knumad_mm_mutex);
> +
> +	down_read(&mm->mmap_sem);
> +	if (unlikely(knumad_test_exit(mm)))
> +		vma = NULL;
> +	else
> +		vma = find_vma(mm, address);
> +
> +	progress++;
> +	for (; vma && progress < pages_to_scan; vma = vma->vm_next) {
> +		unsigned long start_addr, end_addr;
> +		cond_resched();
> +		if (unlikely(knumad_test_exit(mm))) {
> +			progress++;
> +			break;
> +		}
> +
> +		if (!vma->anon_vma || vma_policy(vma)) {
> +			progress++;
> +			continue;
> +		}
> +		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) {
> +			progress++;
> +			continue;
> +		}
> +		/*
> +		 * Skip regions mprotected with PROT_NONE. It would be
> +		 * safe to scan them too, but it's worthless because
> +		 * NUMA hinting page faults can't run on those.
> +		 */
> +		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) {
> +			progress++;
> +			continue;
> +		}
> +		if (is_vma_temporary_stack(vma)) {
> +			progress++;
> +			continue;
> +		}
> +
> +		VM_BUG_ON(address & ~PAGE_MASK);
> +		if (address < vma->vm_start)
> +			address = vma->vm_start;
> +
> +		start_addr = address;
> +		while (address < vma->vm_end) {
> +			cond_resched();
> +			if (unlikely(knumad_test_exit(mm)))
> +				break;
> +
> +			VM_BUG_ON(address < vma->vm_start ||
> +				  address + PAGE_SIZE > vma->vm_end);
> +			progress += knuma_scand_pmd(mm, vma, address);
> +			/* move to next address */
> +			address = (address + PMD_SIZE) & PMD_MASK;
> +			if (progress >= pages_to_scan)
> +				break;
> +		}
> +		end_addr = min(address, vma->vm_end);
> +
> +		/*
> +		 * Flush the TLB for the mm to start the NUMA hinting
> +		 * page faults after we finish scanning this vma part.
> +		 */
> +		mmu_notifier_invalidate_range_start(vma->vm_mm, start_addr,
> +						    end_addr);
> +		flush_tlb_range(vma, start_addr, end_addr);

It's possible that the full range you checked was already set
pte_numa/pmd_numa and no updates were necessary, in which case the TLB
flush and MMU notifications could be avoided.
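
Something like this, for instance (sketch only; it assumes knuma_scand_pmd()
is extended to report, via a hypothetical nr_pte_updated counter, how many
ptes/pmds it actually switched to NUMA type in this vma chunk):

		/*
		 * Only flush when some ptes/pmds were actually switched
		 * to NUMA type; if nothing changed there is nothing
		 * stale in the TLB for this range.
		 */
		if (nr_pte_updated) {
			mmu_notifier_invalidate_range_start(vma->vm_mm,
							    start_addr,
							    end_addr);
			flush_tlb_range(vma, start_addr, end_addr);
			mmu_notifier_invalidate_range_end(vma->vm_mm,
							  start_addr,
							  end_addr);
		}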

> +		mmu_notifier_invalidate_range_end(vma->vm_mm, start_addr,
> +						  end_addr);
> +	}
> +	up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
> +
> +	mutex_lock(&knumad_mm_mutex);
> +	VM_BUG_ON(knuma_scand_data.mm != mm);
> +	knuma_scand_data.address = address;
> +	/*
> +	 * Change the current mm if this mm is about to die, or if we
> +	 * scanned all vmas of this mm.
> +	 */
> +	if (knumad_test_exit(mm) || !vma) {
> +		mm_autonuma = mm->mm_autonuma;
> +		if (mm_autonuma->mm_node.next != &knuma_scand_data.mm_head) {
> +			mm_autonuma = list_entry(mm_autonuma->mm_node.next,
> +						 struct mm_autonuma, mm_node);
> +			knuma_scand_data.mm = mm_autonuma->mm;
> +			atomic_inc(&knuma_scand_data.mm->mm_count);
> +			knuma_scand_data.address = 0;
> +			knuma_scand_data.mm->mm_autonuma->mm_numa_fault_pass++;
> +		} else
> +			knuma_scand_data.mm = NULL;
> +
> +		if (knumad_test_exit(mm)) {
> +			list_del(&mm->mm_autonuma->mm_node);
> +			/* tell autonuma_exit not to list_del */
> +			VM_BUG_ON(mm->mm_autonuma->mm != mm);
> +			mm->mm_autonuma->mm = NULL;
> +			mm_numa_fault_tmp_reset();
> +		} else
> +			mm_numa_fault_tmp_flush(mm);
> +
> +		mmdrop(mm);
> +	}
> +
> +	return progress;
> +}
> +
> +static void knuma_scand_disabled(void)
> +{
> +	if (!autonuma_enabled())
> +		wait_event_freezable(knuma_scand_wait,
> +				     autonuma_enabled() ||
> +				     kthread_should_stop());
> +}
> +
> +static int knuma_scand(void *none)
> +{
> +	struct mm_struct *mm = NULL;
> +	int progress = 0, _progress;
> +	unsigned long total_progress = 0;
> +
> +	set_freezable();
> +
> +	knuma_scand_disabled();
> +
> +	/*
> +	 * Serialize the knuma_scand_data against
> +	 * autonuma_enter/exit().
> +	 */
> +	mutex_lock(&knumad_mm_mutex);
> +
> +	for (;;) {
> +		if (unlikely(kthread_should_stop()))
> +			break;
> +
> +		/* Do one loop of scanning, keeping track of the progress */
> +		_progress = knumad_do_scan();
> +		progress += _progress;
> +		total_progress += _progress;
> +		mutex_unlock(&knumad_mm_mutex);
> +
> +		/* Check if we completed one full scan pass */
> +		if (unlikely(!knuma_scand_data.mm)) {
> +			autonuma_printk("knuma_scand %lu\n", total_progress);
> +			pages_scanned += total_progress;
> +			total_progress = 0;
> +			full_scans++;
> +
> +			wait_event_freezable_timeout(knuma_scand_wait,
> +						     kthread_should_stop(),
> +						     msecs_to_jiffies(
> +						     scan_sleep_pass_millisecs));
> +
> +			if (autonuma_debug()) {
> +				extern void sched_autonuma_dump_mm(void);
> +				sched_autonuma_dump_mm();
> +			}
> +
> +			/* wait while there is no pinned mm */
> +			knuma_scand_disabled();
> +		}
> +		if (progress > pages_to_scan) {
> +			progress = 0;
> +			wait_event_freezable_timeout(knuma_scand_wait,
> +						     kthread_should_stop(),
> +						     msecs_to_jiffies(
> +						     scan_sleep_millisecs));
> +		}
> +		cond_resched();
> +		mutex_lock(&knumad_mm_mutex);
> +	}
> +
> +	mm = knuma_scand_data.mm;
> +	knuma_scand_data.mm = NULL;
> +	if (mm && knumad_test_exit(mm)) {
> +		list_del(&mm->mm_autonuma->mm_node);
> +		/* tell autonuma_exit not to list_del */
> +		VM_BUG_ON(mm->mm_autonuma->mm != mm);
> +		mm->mm_autonuma->mm = NULL;
> +	}
> +	mutex_unlock(&knumad_mm_mutex);
> +
> +	if (mm)
> +		mmdrop(mm);
> +	mm_numa_fault_tmp_reset();
> +
> +	return 0;
> +}
> +
> +void autonuma_enter(struct mm_struct *mm)
> +{
> +	if (!autonuma_possible())
> +		return;
> +
> +	mutex_lock(&knumad_mm_mutex);
> +	list_add_tail(&mm->mm_autonuma->mm_node, &knuma_scand_data.mm_head);
> +	mutex_unlock(&knumad_mm_mutex);
> +}
> +
> +void autonuma_exit(struct mm_struct *mm)
> +{
> +	bool serialize;
> +
> +	if (!autonuma_possible())
> +		return;
> +
> +	serialize = false;
> +	mutex_lock(&knumad_mm_mutex);
> +	if (knuma_scand_data.mm == mm)
> +		serialize = true;
> +	else if (mm->mm_autonuma->mm) {
> +		VM_BUG_ON(mm->mm_autonuma->mm != mm);
> +		mm->mm_autonuma->mm = NULL; /* debug */
> +		list_del(&mm->mm_autonuma->mm_node);
> +	}
> +	mutex_unlock(&knumad_mm_mutex);
> +
> +	if (serialize) {
> +		/* prevent the mm to go away under knumad_do_scan main loop */
> +		down_write(&mm->mmap_sem);
> +		up_write(&mm->mmap_sem);
> +	}
> +}
> +
> +static int start_knuma_scand(void)
> +{
> +	int err = 0;
> +	struct task_struct *knumad_thread;
> +
> +	knuma_scand_data.mm_numa_fault_tmp = kzalloc(mm_autonuma_fault_size(),
> +						     GFP_KERNEL);
> +	if (!knuma_scand_data.mm_numa_fault_tmp)
> +		return -ENOMEM;
> +
> +	knumad_thread = kthread_run(knuma_scand, NULL, "knuma_scand");
> +	if (unlikely(IS_ERR(knumad_thread))) {
> +		autonuma_printk(KERN_ERR
> +				"knumad: kthread_run(knuma_scand) failed\n");
> +		err = PTR_ERR(knumad_thread);
> +	}
> +	return err;
> +}
> +

Ok, I ignored everything after this because it mostly looks like
boiler-plate code.

Overall, this is pretty heavy but there is no getting away from it either
for the autonuma concept. There are places where it could be improved but
that's ok because profiles will point to the areas to improve over time,
just as has happened with compaction. Ultimately it'll come down to
whether the ends justify the means -- I think in many cases they will
and the better NUMA placement will offset the amount of work this thing
has to do, but that is hard to detect. It's easy to see autonuma's costs
at runtime but harder to see the benefit without doing comparisons with
it on and off. I guess perf measuring NUMA traffic would do it though.
There will be corner cases -- for example, I suspect that small processes
will move too quickly because they can be scanned quickly, and will
bounce around a lot.

All that said, an administrator can just turn it off and on and decide
for themselves whether the system CPU overhead is worth it or not.

Eventually I think it will be necessary to reduce the scanning rate, as
autonuma identifies that the workload has converged or almost converged,
to cut the overhead and lock contention. I did not see something that
makes it go quiet when the system is fully idle, so it may be consuming
CPU scanning for no reason on idle machines. Maybe it does this already
and I'll spot it after thinking about it. We also will need to keep an
eye on lock contention as it does make a few locks a bit hotter.
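
A crude version of that backoff could sit in knuma_scand() where a full
pass completes, something like this (sketch only; pass_migrated and
scan_backoff are invented, not counters that exist in this series):

		if (!pass_migrated && scan_backoff < 10)
			/* nothing moved: looks converged or idle, back off */
			scan_backoff++;
		else if (pass_migrated)
			scan_backoff = 1;
		wait_event_freezable_timeout(knuma_scand_wait,
					     kthread_should_stop(),
					     msecs_to_jiffies(
					     scan_sleep_pass_millisecs *
					     scan_backoff));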

I did not see any fundamental problems! Users will decide for themselves
whether they would prefer autonuma to spend CPU figuring out the best
placement or whether they want to spend the tuning and development time
doing it by hand. Where possible (e.g. HPC) they will, but I suspect Java
workloads and anything virtualisation-based will prefer this and pay the
CPU cost.

> +
> +#ifdef CONFIG_SYSFS
> +
> +static ssize_t flag_show(struct kobject *kobj,
> +			 struct kobj_attribute *attr, char *buf,
> +			 enum autonuma_flag flag)
> +{
> +	return sprintf(buf, "%d\n",
> +		       !!test_bit(flag, &autonuma_flags));
> +}
> +static ssize_t flag_store(struct kobject *kobj,
> +			  struct kobj_attribute *attr,
> +			  const char *buf, size_t count,
> +			  enum autonuma_flag flag)
> +{
> +	unsigned long value;
> +	int ret;
> +
> +	ret = kstrtoul(buf, 10, &value);
> +	if (ret < 0)
> +		return ret;
> +	if (value > 1)
> +		return -EINVAL;
> +
> +	if (value)
> +		set_bit(flag, &autonuma_flags);
> +	else
> +		clear_bit(flag, &autonuma_flags);
> +
> +	return count;
> +}
> +
> +static ssize_t enabled_show(struct kobject *kobj,
> +			    struct kobj_attribute *attr, char *buf)
> +{
> +	return flag_show(kobj, attr, buf, AUTONUMA_ENABLED_FLAG);
> +}
> +static ssize_t enabled_store(struct kobject *kobj,
> +			     struct kobj_attribute *attr,
> +			     const char *buf, size_t count)
> +{
> +	ssize_t ret;
> +
> +	ret = flag_store(kobj, attr, buf, count, AUTONUMA_ENABLED_FLAG);
> +
> +	if (ret > 0 && autonuma_enabled())
> +		wake_up_interruptible(&knuma_scand_wait);
> +
> +	return ret;
> +}
> +static struct kobj_attribute enabled_attr =
> +	__ATTR(enabled, 0644, enabled_show, enabled_store);
> +
> +#define SYSFS_ENTRY(NAME, FLAG)						\
> +static ssize_t NAME ## _show(struct kobject *kobj,			\
> +			     struct kobj_attribute *attr, char *buf)	\
> +{									\
> +	return flag_show(kobj, attr, buf, FLAG);			\
> +}									\
> +									\
> +static ssize_t NAME ## _store(struct kobject *kobj,			\
> +			      struct kobj_attribute *attr,		\
> +			      const char *buf, size_t count)		\
> +{									\
> +	return flag_store(kobj, attr, buf, count, FLAG);		\
> +}									\
> +static struct kobj_attribute NAME ## _attr =				\
> +	__ATTR(NAME, 0644, NAME ## _show, NAME ## _store);
> +
> +SYSFS_ENTRY(scan_pmd, AUTONUMA_SCAN_PMD_FLAG);
> +SYSFS_ENTRY(debug, AUTONUMA_DEBUG_FLAG);
> +#ifdef CONFIG_DEBUG_VM
> +SYSFS_ENTRY(sched_load_balance_strict, AUTONUMA_SCHED_LOAD_BALANCE_STRICT_FLAG);
> +SYSFS_ENTRY(child_inheritance, AUTONUMA_CHILD_INHERITANCE_FLAG);
> +#endif /* CONFIG_DEBUG_VM */
> +
> +#undef SYSFS_ENTRY
> +
> +enum {
> +	SYSFS_SCAN_SLEEP_ENTRY,
> +	SYSFS_SCAN_PAGES_ENTRY,
> +	SYSFS_MIGRATE_SLEEP_ENTRY,
> +	SYSFS_MIGRATE_PAGES_ENTRY,
> +};
> +
> +#define SYSFS_ENTRY(NAME, SYSFS_TYPE)					\
> +	static ssize_t NAME ## _show(struct kobject *kobj,		\
> +				     struct kobj_attribute *attr,	\
> +				     char *buf)				\
> +	{								\
> +		return sprintf(buf, "%u\n", NAME);			\
> +	}								\
> +	static ssize_t NAME ## _store(struct kobject *kobj,		\
> +				      struct kobj_attribute *attr,	\
> +				      const char *buf, size_t count)	\
> +	{								\
> +		unsigned long val;					\
> +		int err;						\
> +									\
> +		err = strict_strtoul(buf, 10, &val);			\
> +		if (err || val > UINT_MAX)				\
> +			return -EINVAL;					\
> +		switch (SYSFS_TYPE) {					\
> +		case SYSFS_SCAN_PAGES_ENTRY:				\
> +		case SYSFS_MIGRATE_PAGES_ENTRY:				\
> +			if (!val)					\
> +				return -EINVAL;				\
> +			break;						\
> +		}							\
> +									\
> +		NAME = val;						\
> +		switch (SYSFS_TYPE) {					\
> +		case SYSFS_SCAN_SLEEP_ENTRY:				\
> +			wake_up_interruptible(&knuma_scand_wait);	\
> +			break;						\
> +		}							\
> +									\
> +		return count;						\
> +	}								\
> +	static struct kobj_attribute NAME ## _attr =			\
> +		__ATTR(NAME, 0644, NAME ## _show, NAME ## _store);
> +
> +SYSFS_ENTRY(scan_sleep_millisecs, SYSFS_SCAN_SLEEP_ENTRY);
> +SYSFS_ENTRY(scan_sleep_pass_millisecs, SYSFS_SCAN_SLEEP_ENTRY);
> +SYSFS_ENTRY(pages_to_scan, SYSFS_SCAN_PAGES_ENTRY);
> +
> +SYSFS_ENTRY(migrate_sleep_millisecs, SYSFS_MIGRATE_SLEEP_ENTRY);
> +SYSFS_ENTRY(pages_to_migrate, SYSFS_MIGRATE_PAGES_ENTRY);
> +
> +#undef SYSFS_ENTRY
> +
> +#define SYSFS_ENTRY(NAME)					\
> +static ssize_t NAME ## _show(struct kobject *kobj,		\
> +			     struct kobj_attribute *attr,	\
> +			     char *buf)				\
> +{								\
> +	return sprintf(buf, "%lu\n", NAME);			\
> +}								\
> +static struct kobj_attribute NAME ## _attr =			\
> +	__ATTR_RO(NAME);
> +
> +SYSFS_ENTRY(full_scans);
> +SYSFS_ENTRY(pages_scanned);
> +SYSFS_ENTRY(pages_migrated);
> +
> +#undef SYSFS_ENTRY
> +
> +static struct attribute *autonuma_attr[] = {
> +	&enabled_attr.attr,
> +
> +	&debug_attr.attr,
> +
> +	/* migrate start */
> +	&migrate_sleep_millisecs_attr.attr,
> +	&pages_to_migrate_attr.attr,
> +	&pages_migrated_attr.attr,
> +	/* migrate end */
> +
> +	/* scan start */
> +	&scan_sleep_millisecs_attr.attr,
> +	&scan_sleep_pass_millisecs_attr.attr,
> +	&pages_to_scan_attr.attr,
> +	&pages_scanned_attr.attr,
> +	&full_scans_attr.attr,
> +	&scan_pmd_attr.attr,
> +	/* scan end */
> +
> +#ifdef CONFIG_DEBUG_VM
> +	&sched_load_balance_strict_attr.attr,
> +	&child_inheritance_attr.attr,
> +#endif
> +
> +	NULL,
> +};
> +static struct attribute_group autonuma_attr_group = {
> +	.attrs = autonuma_attr,
> +};
> +
> +static int __init autonuma_init_sysfs(struct kobject **autonuma_kobj)
> +{
> +	int err;
> +
> +	*autonuma_kobj = kobject_create_and_add("autonuma", mm_kobj);
> +	if (unlikely(!*autonuma_kobj)) {
> +		printk(KERN_ERR "autonuma: failed kobject create\n");
> +		return -ENOMEM;
> +	}
> +
> +	err = sysfs_create_group(*autonuma_kobj, &autonuma_attr_group);
> +	if (err) {
> +		printk(KERN_ERR "autonuma: failed register autonuma group\n");
> +		goto delete_obj;
> +	}
> +
> +	return 0;
> +
> +delete_obj:
> +	kobject_put(*autonuma_kobj);
> +	return err;
> +}
> +
> +static void __init autonuma_exit_sysfs(struct kobject *autonuma_kobj)
> +{
> +	sysfs_remove_group(autonuma_kobj, &autonuma_attr_group);
> +	kobject_put(autonuma_kobj);
> +}
> +#else
> +static inline int autonuma_init_sysfs(struct kobject **autonuma_kobj)
> +{
> +	return 0;
> +}
> +
> +static inline void autonuma_exit_sysfs(struct kobject *autonuma_kobj)
> +{
> +}
> +#endif /* CONFIG_SYSFS */
> +
> +static int __init noautonuma_setup(char *str)
> +{
> +	if (autonuma_possible()) {
> +		printk("AutoNUMA permanently disabled\n");
> +		clear_bit(AUTONUMA_POSSIBLE_FLAG, &autonuma_flags);
> +		WARN_ON(autonuma_possible()); /* avoid early crash */
> +	}
> +	return 1;
> +}
> +__setup("noautonuma", noautonuma_setup);
> +
> +static bool autonuma_init_checks_failed(void)
> +{
> +	/* safety checks on nr_node_ids */
> +	int last_nid = find_last_bit(node_states[N_POSSIBLE].bits, MAX_NUMNODES);
> +	if (last_nid + 1 != nr_node_ids) {
> +		WARN_ON(1);
> +		return true;
> +	}
> +	if (num_possible_nodes() > nr_node_ids) {
> +		WARN_ON(1);
> +		return true;
> +	}
> +	return false;
> +}
> +
> +static int __init autonuma_init(void)
> +{
> +	int err;
> +	struct kobject *autonuma_kobj;
> +
> +	VM_BUG_ON(num_possible_nodes() < 1);
> +	if (num_possible_nodes() <= 1 || !autonuma_possible()) {
> +		clear_bit(AUTONUMA_POSSIBLE_FLAG, &autonuma_flags);
> +		return -EINVAL;
> +	} else if (autonuma_init_checks_failed()) {
> +		printk("autonuma disengaged: init checks failed\n");
> +		clear_bit(AUTONUMA_POSSIBLE_FLAG, &autonuma_flags);
> +		return -EINVAL;
> +	}
> +
> +	err = autonuma_init_sysfs(&autonuma_kobj);
> +	if (err)
> +		return err;
> +
> +	err = start_knuma_scand();
> +	if (err) {
> +		printk("failed to start knuma_scand\n");
> +		goto out;
> +	}
> +
> +	printk("AutoNUMA initialized successfully\n");
> +	return err;
> +
> +out:
> +	autonuma_exit_sysfs(autonuma_kobj);
> +	return err;
> +}
> +module_init(autonuma_init)
> +
> +static struct kmem_cache *task_autonuma_cachep;
> +
> +int alloc_task_autonuma(struct task_struct *tsk, struct task_struct *orig,
> +			 int node)
> +{
> +	int err = 1;
> +	struct task_autonuma *task_autonuma;
> +
> +	if (!autonuma_possible())
> +		goto no_numa;
> +	task_autonuma = kmem_cache_alloc_node(task_autonuma_cachep,
> +					      GFP_KERNEL, node);
> +	if (!task_autonuma)
> +		goto out;
> +	if (!autonuma_child_inheritance()) {
> +		/*
> +		 * Only reset the task NUMA stats, Always inherit the
> +		 * task_selected_nid. It's certainly better to start
> +		 * the child in the same NUMA node of the parent, if
> +		 * idle/load balancing permits. If they don't permit,
> +		 * task_selected_nid is a transient entity and it'll
> +		 * be updated accordingly.
> +		 */
> +		task_autonuma->task_selected_nid =
> +			orig->task_autonuma->task_selected_nid;
> +		__task_autonuma_reset(task_autonuma);
> +	} else
> +		memcpy(task_autonuma, orig->task_autonuma,
> +		       task_autonuma_size());
> +	VM_BUG_ON(task_autonuma->task_selected_nid < -1);
> +	VM_BUG_ON(task_autonuma->task_selected_nid >= nr_node_ids);
> +	tsk->task_autonuma = task_autonuma;
> +no_numa:
> +	err = 0;
> +out:
> +	return err;
> +}
> +
> +void free_task_autonuma(struct task_struct *tsk)
> +{
> +	if (!autonuma_possible()) {
> +		BUG_ON(tsk->task_autonuma);
> +		return;
> +	}
> +
> +	BUG_ON(!tsk->task_autonuma);
> +	kmem_cache_free(task_autonuma_cachep, tsk->task_autonuma);
> +	tsk->task_autonuma = NULL;
> +}
> +
> +void __init task_autonuma_init(void)
> +{
> +	struct task_autonuma *task_autonuma;
> +
> +	BUG_ON(current != &init_task);
> +
> +	if (!autonuma_possible())
> +		return;
> +
> +	task_autonuma_cachep =
> +		kmem_cache_create("task_autonuma",
> +				  task_autonuma_size(), 0,
> +				  SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL);
> +
> +	task_autonuma = kmem_cache_alloc_node(task_autonuma_cachep,
> +					      GFP_KERNEL, numa_node_id());
> +	BUG_ON(!task_autonuma);
> +	task_autonuma_reset(task_autonuma);
> +	BUG_ON(current->task_autonuma);
> +	current->task_autonuma = task_autonuma;
> +}
> +
> +static struct kmem_cache *mm_autonuma_cachep;
> +
> +int alloc_mm_autonuma(struct mm_struct *mm)
> +{
> +	int err = 1;
> +	struct mm_autonuma *mm_autonuma;
> +
> +	if (!autonuma_possible())
> +		goto no_numa;
> +	mm_autonuma = kmem_cache_alloc(mm_autonuma_cachep, GFP_KERNEL);
> +	if (!mm_autonuma)
> +		goto out;
> +	if (!autonuma_child_inheritance() || !mm->mm_autonuma)
> +		mm_autonuma_reset(mm_autonuma);
> +	else
> +		memcpy(mm_autonuma, mm->mm_autonuma, mm_autonuma_size());
> +
> +	/*
> +	 * We're not leaking memory here, if mm->mm_autonuma is not
> +	 * zero it's a not refcounted copy of the parent's
> +	 * mm->mm_autonuma pointer.
> +	 */
> +	mm->mm_autonuma = mm_autonuma;
> +	mm_autonuma->mm = mm;
> +no_numa:
> +	err = 0;
> +out:
> +	return err;
> +}
> +
> +void free_mm_autonuma(struct mm_struct *mm)
> +{
> +	if (!autonuma_possible()) {
> +		BUG_ON(mm->mm_autonuma);
> +		return;
> +	}
> +
> +	BUG_ON(!mm->mm_autonuma);
> +	kmem_cache_free(mm_autonuma_cachep, mm->mm_autonuma);
> +	mm->mm_autonuma = NULL;
> +}
> +
> +void __init mm_autonuma_init(void)
> +{
> +	BUG_ON(current != &init_task);
> +	BUG_ON(current->mm);
> +
> +	if (!autonuma_possible())
> +		return;
> +
> +	mm_autonuma_cachep =
> +		kmem_cache_create("mm_autonuma",
> +				  mm_autonuma_size(), 0,
> +				  SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL);
> +}
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 25e262a..edee54d 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1036,6 +1036,40 @@ out:
>  	return page;
>  }
>  
> +#ifdef CONFIG_AUTONUMA
> +/* NUMA hinting page fault entry point for trans huge pmds */
> +int huge_pmd_numa_fixup(struct mm_struct *mm, unsigned long addr,
> +			pmd_t pmd, pmd_t *pmdp)
> +{
> +	struct page *page;
> +	bool migrated;
> +
> +	spin_lock(&mm->page_table_lock);
> +	if (unlikely(!pmd_same(pmd, *pmdp)))
> +		goto out_unlock;
> +
> +	page = pmd_page(pmd);
> +	pmd = pmd_mknonnuma(pmd);
> +	set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmdp, pmd);
> +	VM_BUG_ON(pmd_numa(*pmdp));
> +	if (unlikely(page_mapcount(page) != 1))
> +		goto out_unlock;
> +	get_page(page);
> +	spin_unlock(&mm->page_table_lock);
> +
> +	migrated = numa_hinting_fault(page, HPAGE_PMD_NR);
> +	if (!migrated)
> +		put_page(page);
> +
> +out:
> +	return 0;
> +
> +out_unlock:
> +	spin_unlock(&mm->page_table_lock);
> +	goto out;
> +}
> +#endif
> +
>  int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>  		 pmd_t *pmd, unsigned long addr)
>  {
> 

-- 
Mel Gorman
SUSE Labs
