On Fri, Nov 01, 2019 at 03:57:25PM +0800, Huang, Ying wrote:

> index 8ec38b11b361..59e2151734ab 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -484,6 +484,11 @@ struct mm_struct {
>  
>  		/* numa_scan_seq prevents two threads setting pte_numa */
>  		int numa_scan_seq;
> +
> +#define NUMA_SCAN_NR_HIST	16
> +		int numa_scan_idx;
> +		unsigned long numa_scan_jiffies[NUMA_SCAN_NR_HIST];
> +		unsigned long numa_scan_starts[NUMA_SCAN_NR_HIST];

Why 16? This is 4 cachelines.

>  #endif
>  		/*
>  		 * An operation with batched TLB flushing is going on. Anything

> +static long numa_hint_fault_latency(struct task_struct *p, unsigned long addr)
> +{
> +	struct mm_struct *mm = p->mm;
> +	unsigned long now = jiffies;
> +	unsigned long start, end;
> +	int i, j;
> +	long latency = 0;
> +
> +	i = READ_ONCE(mm->numa_scan_idx);
> +	i = i ? i - 1 : NUMA_SCAN_NR_HIST - 1;
> +	/*
> +	 * Paired with smp_wmb() in task_numa_work() to check
> +	 * scan range buffer after get current index
> +	 */
> +	smp_rmb();

That wants to be:

	i = smp_load_acquire(&mm->numa_scan_idx);
	i = (i - 1) % NUMA_SCAN_NR_HIST;

(and because NUMA_SCAN_NR_HIST is a power of 2, the compiler will
conveniently make that a bitwise and operation)

And: "DEC %0; AND $15, %0" is so much faster than a branch.

> +	end = READ_ONCE(mm->numa_scan_offset);
> +	start = READ_ONCE(mm->numa_scan_starts[i]);
> +	if (start == end)
> +		end = start + MAX_SCAN_WINDOW * (1UL << 22);
> +	for (j = 0; j < NUMA_SCAN_NR_HIST; j++) {
> +		latency = now - READ_ONCE(mm->numa_scan_jiffies[i]);
> +		start = READ_ONCE(mm->numa_scan_starts[i]);
> +		/* Scan pass the end of address space */
> +		if (end < start)
> +			end = TASK_SIZE;
> +		if (addr >= start && addr < end)
> +			return latency;
> +		end = start;
> +		i = i ? i - 1 : NUMA_SCAN_NR_HIST - 1;

		i = (i - 1) % NUMA_SCAN_NR_HIST;

> +	}
> +	/*
> +	 * The tracking window isn't large enough, approximate to the
> +	 * max latency in the tracking window.
> +	 */
> +	return latency;
> +}

> @@ -2583,6 +2640,19 @@ void task_numa_work(struct callback_head *work)
>  		start = 0;
>  		vma = mm->mmap;
>  	}
> +	idx = mm->numa_scan_idx;
> +	WRITE_ONCE(mm->numa_scan_starts[idx], start);
> +	WRITE_ONCE(mm->numa_scan_jiffies[idx], jiffies);
> +	/*
> +	 * Paired with smp_rmb() in should_numa_migrate_memory() to
> +	 * update scan range buffer index after update the buffer
> +	 * contents.
> +	 */
> +	smp_wmb();
> +	if (idx + 1 >= NUMA_SCAN_NR_HIST)
> +		WRITE_ONCE(mm->numa_scan_idx, 0);
> +	else
> +		WRITE_ONCE(mm->numa_scan_idx, idx + 1);

	smp_store_release(&mm->numa_scan_idx, (idx + 1) % NUMA_SCAN_NR_HIST);
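
For context on the cacheline remark: assuming 64-bit unsigned long and
64-byte cachelines, the two proposed arrays alone account for the four
lines. A compile-time check (illustrative only, not part of the patch)
makes the arithmetic explicit:

	#include <stddef.h>

	#define NUMA_SCAN_NR_HIST	16
	#define CACHELINE_BYTES		64	/* assumed line size (x86-64) */

	/*
	 * numa_scan_jiffies[] + numa_scan_starts[]:
	 * 2 * 16 * sizeof(unsigned long) = 2 * 16 * 8 = 256 bytes,
	 * i.e. exactly 4 cachelines, before counting the int index.
	 */
	_Static_assert(2 * NUMA_SCAN_NR_HIST * sizeof(unsigned long) ==
		       4 * CACHELINE_BYTES,
		       "history arrays span 4 cachelines");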
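
To make the suggested pairing concrete, here is a minimal, untested
sketch of the release/acquire form of the ring-buffer update and lookup.
The struct and helper names are made up for illustration; only the
barrier usage mirrors what is being asked for above:

	#include <linux/compiler.h>	/* READ_ONCE()/WRITE_ONCE() */
	#include <asm/barrier.h>	/* smp_load_acquire()/smp_store_release() */

	#define NUMA_SCAN_NR_HIST	16	/* must stay a power of two */

	struct scan_hist {
		int		idx;			/* next slot to fill */
		unsigned long	jiffies[NUMA_SCAN_NR_HIST];
		unsigned long	starts[NUMA_SCAN_NR_HIST];
	};

	/* Writer side (task_numa_work()): fill the slot, then publish it. */
	static void scan_hist_record(struct scan_hist *h, unsigned long start,
				     unsigned long now)
	{
		int idx = h->idx;

		WRITE_ONCE(h->starts[idx], start);
		WRITE_ONCE(h->jiffies[idx], now);
		/*
		 * The release store orders the slot contents before the
		 * index update; no separate smp_wmb() needed.
		 */
		smp_store_release(&h->idx, (idx + 1) % NUMA_SCAN_NR_HIST);
	}

	/* Reader side (fault path): acquire the index, then read slots. */
	static unsigned long scan_hist_last_stamp(struct scan_hist *h)
	{
		/*
		 * The acquire load orders the index read before the slot
		 * reads; no separate smp_rmb() needed.
		 */
		unsigned int i = smp_load_acquire(&h->idx);

		/* Unsigned wrap + power-of-two size: this is just a mask. */
		i = (i - 1) % NUMA_SCAN_NR_HIST;
		return READ_ONCE(h->jiffies[i]);
	}

With this shape the reader can never observe a slot the writer has not
finished filling, which is what the smp_wmb()/smp_rmb() pair in the
patch is trying to guarantee.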