On Tue, Jun 14, 2016 at 07:01:12PM +0200, Lukasz Anaczkowski wrote: > From: Andi Kleen <ak@xxxxxxxxxxxxxxx> > > Knights Landing has an issue that a thread setting A or D bits > may not do so atomically against checking the present bit. > A thread which is going to page fault may still set those > bits, even though the present bit was already atomically cleared. > > This implies that when the kernel clears present atomically, > some time later the supposed to be zero entry could be corrupted > with stray A or D bits. > > Since the PTE could be already used for storing a swap index, > or a NUMA migration index, this cannot be tolerated. Most > of the time the kernel detects the problem, but in some > rare cases it may not. > > This patch enforces that the page unmap path in vmscan/direct reclaim > always flushes other CPUs after clearing each page, and also > clears the PTE again after the flush. > > For reclaim this brings the performance back to before Mel's > flushing changes, but for unmap it disables batching. > > This makes sure any leaked A/D bits are immediately cleared before the entry > is used for something else. > > Any parallel faults that check for entry is zero may loop, > but they should eventually recover after the entry is written. > > Also other users may spin in the page table lock until we > "fixed" the PTE. This is ensured by always taking the page table lock > even for the swap cache case. Previously this was only done > on architectures with non atomic PTE accesses (such as 32bit PTE), > but now it is also done when this bug workaround is active. > > I audited apply_pte_range and other users of arch_enter_lazy... > and they seem to all not clear the present bit. > > Right now the extra flush is done in the low level > architecture code, while the higher level code still > does batched TLB flush. This means there is always one extra > unnecessary TLB flush now. 
As a follow-on optimization > this could be avoided by telling the callers that > the flush already happened. > > v2 (Lukasz Anaczkowski): > () added call to smp_mb__after_atomic() to synchronize with > switch_mm, based on Nadav's comment > () fixed compilation breakage > > Signed-off-by: Andi Kleen <ak@xxxxxxxxxxxxxxx> > Signed-off-by: Lukasz Anaczkowski <lukasz.anaczkowski@xxxxxxxxx> > --- > arch/x86/include/asm/cpufeatures.h | 1 + > arch/x86/include/asm/hugetlb.h | 9 ++++++++- > arch/x86/include/asm/pgtable.h | 5 +++++ > arch/x86/include/asm/pgtable_64.h | 6 ++++++ > arch/x86/kernel/cpu/intel.c | 10 ++++++++++ > arch/x86/mm/tlb.c | 22 ++++++++++++++++++++++ > include/linux/mm.h | 4 ++++ > mm/memory.c | 3 ++- > 8 files changed, 58 insertions(+), 2 deletions(-) > > diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h > index 4a41348..2c48011 100644 > --- a/arch/x86/include/asm/cpufeatures.h > +++ b/arch/x86/include/asm/cpufeatures.h > @@ -303,6 +303,7 @@ > #define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ > #define X86_BUG_NULL_SEG X86_BUG(9) /* Nulling a selector preserves the base */ > #define X86_BUG_SWAPGS_FENCE X86_BUG(10) /* SWAPGS without input dep on GS */ > +#define X86_BUG_PTE_LEAK X86_BUG(11) /* PTE may leak A/D bits after clear */ > > > #ifdef CONFIG_X86_32 > diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h > index 3a10616..58e1ca9 100644 > --- a/arch/x86/include/asm/hugetlb.h > +++ b/arch/x86/include/asm/hugetlb.h > @@ -41,10 +41,17 @@ static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, > set_pte_at(mm, addr, ptep, pte); > } > > +extern void fix_pte_leak(struct mm_struct *mm, unsigned long addr, > + pte_t *ptep); > + > static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, > unsigned long addr, pte_t *ptep) > { > - return ptep_get_and_clear(mm, addr, ptep); > + pte_t pte = ptep_get_and_clear(mm, addr, ptep); > + > + if 
(boot_cpu_has_bug(X86_BUG_PTE_LEAK)) static_cpu_has_bug() > + fix_pte_leak(mm, addr, ptep); > + return pte; > } > > static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, > diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h > index 1a27396..9769355 100644 > --- a/arch/x86/include/asm/pgtable.h > +++ b/arch/x86/include/asm/pgtable.h > @@ -794,11 +794,16 @@ extern int ptep_test_and_clear_young(struct vm_area_struct *vma, > extern int ptep_clear_flush_young(struct vm_area_struct *vma, > unsigned long address, pte_t *ptep); > > +extern void fix_pte_leak(struct mm_struct *mm, unsigned long addr, > + pte_t *ptep); > + > #define __HAVE_ARCH_PTEP_GET_AND_CLEAR > static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, > pte_t *ptep) > { > pte_t pte = native_ptep_get_and_clear(ptep); > + if (boot_cpu_has_bug(X86_BUG_PTE_LEAK)) static_cpu_has_bug() > + fix_pte_leak(mm, addr, ptep); > pte_update(mm, addr, ptep); > return pte; > } > diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h > index 2ee7811..6fa4079 100644 > --- a/arch/x86/include/asm/pgtable_64.h > +++ b/arch/x86/include/asm/pgtable_64.h > @@ -178,6 +178,12 @@ extern void cleanup_highmap(void); > extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); > extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); > > +#define ARCH_HAS_NEEDS_SWAP_PTL 1 > +static inline bool arch_needs_swap_ptl(void) > +{ > + return boot_cpu_has_bug(X86_BUG_PTE_LEAK); static_cpu_has_bug() > +} > + > #endif /* !__ASSEMBLY__ */ > > #endif /* _ASM_X86_PGTABLE_64_H */ > diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c > index 6e2ffbe..f499513 100644 > --- a/arch/x86/kernel/cpu/intel.c > +++ b/arch/x86/kernel/cpu/intel.c > @@ -181,6 +181,16 @@ static void early_init_intel(struct cpuinfo_x86 *c) > } > } > > + if (c->x86_model == 87) { That should be INTEL_FAM6_XEON_PHI_KNL, AFAICT. 
> + static bool printed; > + > + if (!printed) { > + pr_info("Enabling PTE leaking workaround\n"); > + printed = true; > + } pr_info_once > + set_cpu_bug(c, X86_BUG_PTE_LEAK); > + } > + > /* > * Intel Quark Core DevMan_001.pdf section 6.4.11 > * "The operating system also is required to invalidate (i.e., flush) -- Regards/Gruss, Boris. ECO tip #101: Trim your mails when you reply. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>