On Tue, Feb 12, 2019 at 10:56:14AM +0800, Peter Xu wrote: > From: Andrea Arcangeli <aarcange@xxxxxxxxxx> > > Accurate userfaultfd WP tracking is possible by tracking exactly which > virtual memory ranges were writeprotected by userland. We can't rely > only on the RW bit of the mapped pagetable because that information is > destroyed by fork() or KSM or swap. If we were to rely on that, we'd > need to stay on the safe side and generate false positive wp faults > for every swapped out page. > > Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx> > Signed-off-by: Peter Xu <peterx@xxxxxxxxxx> Reviewed-by: Mike Rapoport <rppt@xxxxxxxxxxxxx> > --- > arch/x86/Kconfig | 1 + > arch/x86/include/asm/pgtable.h | 52 ++++++++++++++++++++++++++++ > arch/x86/include/asm/pgtable_64.h | 8 ++++- > arch/x86/include/asm/pgtable_types.h | 9 +++++ > include/asm-generic/pgtable.h | 1 + > include/asm-generic/pgtable_uffd.h | 51 +++++++++++++++++++++++++++ > init/Kconfig | 5 +++ > 7 files changed, 126 insertions(+), 1 deletion(-) > create mode 100644 include/asm-generic/pgtable_uffd.h > > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig > index 68261430fe6e..cb43bc008675 100644 > --- a/arch/x86/Kconfig > +++ b/arch/x86/Kconfig > @@ -209,6 +209,7 @@ config X86 > select USER_STACKTRACE_SUPPORT > select VIRT_TO_BUS > select X86_FEATURE_NAMES if PROC_FS > + select HAVE_ARCH_USERFAULTFD_WP if USERFAULTFD > > config INSTRUCTION_DECODER > def_bool y > diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h > index 2779ace16d23..6863236e8484 100644 > --- a/arch/x86/include/asm/pgtable.h > +++ b/arch/x86/include/asm/pgtable.h > @@ -23,6 +23,7 @@ > > #ifndef __ASSEMBLY__ > #include <asm/x86_init.h> > +#include <asm-generic/pgtable_uffd.h> > > extern pgd_t early_top_pgt[PTRS_PER_PGD]; > int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); > @@ -293,6 +294,23 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) > return native_make_pte(v & 
~clear); > } > > +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP > +static inline int pte_uffd_wp(pte_t pte) > +{ > + return pte_flags(pte) & _PAGE_UFFD_WP; > +} > + > +static inline pte_t pte_mkuffd_wp(pte_t pte) > +{ > + return pte_set_flags(pte, _PAGE_UFFD_WP); > +} > + > +static inline pte_t pte_clear_uffd_wp(pte_t pte) > +{ > + return pte_clear_flags(pte, _PAGE_UFFD_WP); > +} > +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ > + > static inline pte_t pte_mkclean(pte_t pte) > { > return pte_clear_flags(pte, _PAGE_DIRTY); > @@ -372,6 +390,23 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) > return native_make_pmd(v & ~clear); > } > > +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP > +static inline int pmd_uffd_wp(pmd_t pmd) > +{ > + return pmd_flags(pmd) & _PAGE_UFFD_WP; > +} > + > +static inline pmd_t pmd_mkuffd_wp(pmd_t pmd) > +{ > + return pmd_set_flags(pmd, _PAGE_UFFD_WP); > +} > + > +static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd) > +{ > + return pmd_clear_flags(pmd, _PAGE_UFFD_WP); > +} > +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ > + > static inline pmd_t pmd_mkold(pmd_t pmd) > { > return pmd_clear_flags(pmd, _PAGE_ACCESSED); > @@ -1351,6 +1386,23 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) > #endif > #endif > > +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP > +static inline pte_t pte_swp_mkuffd_wp(pte_t pte) > +{ > + return pte_set_flags(pte, _PAGE_SWP_UFFD_WP); > +} > + > +static inline int pte_swp_uffd_wp(pte_t pte) > +{ > + return pte_flags(pte) & _PAGE_SWP_UFFD_WP; > +} > + > +static inline pte_t pte_swp_clear_uffd_wp(pte_t pte) > +{ > + return pte_clear_flags(pte, _PAGE_SWP_UFFD_WP); > +} > +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ > + > #define PKRU_AD_BIT 0x1 > #define PKRU_WD_BIT 0x2 > #define PKRU_BITS_PER_PKEY 2 > diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h > index 9c85b54bf03c..e0c5d29b8685 100644 > --- a/arch/x86/include/asm/pgtable_64.h > +++ b/arch/x86/include/asm/pgtable_64.h > @@ 
-189,7 +189,7 @@ extern void sync_global_pgds(unsigned long start, unsigned long end); > * > * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number > * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names > - * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|X|SD|0| <- swp entry > + * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|F|SD|0| <- swp entry > * > * G (8) is aliased and used as a PROT_NONE indicator for > * !present ptes. We need to start storing swap entries above > @@ -197,9 +197,15 @@ extern void sync_global_pgds(unsigned long start, unsigned long end); > * erratum where they can be incorrectly set by hardware on > * non-present PTEs. > * > + * Bits 1-4 are not used in non-present format and available for > + * special use described below: > + * > * SD (1) in swp entry is used to store soft dirty bit, which helps us > * remember soft dirty over page migration > * > + * F (2) in swp entry is used to record when a pagetable is > + * writeprotected by userfaultfd WP support. > + * > * Bit 7 in swp entry should be 0 because pmd_present checks not only P, > * but also L and G. 
> * > diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h > index d6ff0bbdb394..8cebcff91e57 100644 > --- a/arch/x86/include/asm/pgtable_types.h > +++ b/arch/x86/include/asm/pgtable_types.h > @@ -32,6 +32,7 @@ > > #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 > #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 > +#define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */ > #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ > #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 > > @@ -100,6 +101,14 @@ > #define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) > #endif > > +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP > +#define _PAGE_UFFD_WP (_AT(pteval_t, 1) << _PAGE_BIT_UFFD_WP) > +#define _PAGE_SWP_UFFD_WP _PAGE_USER > +#else > +#define _PAGE_UFFD_WP (_AT(pteval_t, 0)) > +#define _PAGE_SWP_UFFD_WP (_AT(pteval_t, 0)) > +#endif > + > #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) > #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) > #define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP) > diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h > index 05e61e6c843f..f49afe951711 100644 > --- a/include/asm-generic/pgtable.h > +++ b/include/asm-generic/pgtable.h > @@ -10,6 +10,7 @@ > #include <linux/mm_types.h> > #include <linux/bug.h> > #include <linux/errno.h> > +#include <asm-generic/pgtable_uffd.h> > > #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \ > defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS > diff --git a/include/asm-generic/pgtable_uffd.h b/include/asm-generic/pgtable_uffd.h > new file mode 100644 > index 000000000000..643d1bf559c2 > --- /dev/null > +++ b/include/asm-generic/pgtable_uffd.h > @@ -0,0 +1,51 @@ > +#ifndef _ASM_GENERIC_PGTABLE_UFFD_H > +#define _ASM_GENERIC_PGTABLE_UFFD_H > + > +#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP > +static __always_inline int pte_uffd_wp(pte_t pte) > +{ > + return 0; > +} > + > +static __always_inline int 
pmd_uffd_wp(pmd_t pmd) > +{ > + return 0; > +} > + > +static __always_inline pte_t pte_mkuffd_wp(pte_t pte) > +{ > + return pte; > +} > + > +static __always_inline pmd_t pmd_mkuffd_wp(pmd_t pmd) > +{ > + return pmd; > +} > + > +static __always_inline pte_t pte_clear_uffd_wp(pte_t pte) > +{ > + return pte; > +} > + > +static __always_inline pmd_t pmd_clear_uffd_wp(pmd_t pmd) > +{ > + return pmd; > +} > + > +static __always_inline pte_t pte_swp_mkuffd_wp(pte_t pte) > +{ > + return pte; > +} > + > +static __always_inline int pte_swp_uffd_wp(pte_t pte) > +{ > + return 0; > +} > + > +static __always_inline pte_t pte_swp_clear_uffd_wp(pte_t pte) > +{ > + return pte; > +} > +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ > + > +#endif /* _ASM_GENERIC_PGTABLE_UFFD_H */ > diff --git a/init/Kconfig b/init/Kconfig > index c9386a365eea..892d61ddf2eb 100644 > --- a/init/Kconfig > +++ b/init/Kconfig > @@ -1424,6 +1424,11 @@ config ADVISE_SYSCALLS > applications use these syscalls, you can disable this option to save > space. > > +config HAVE_ARCH_USERFAULTFD_WP > + bool > + help > + Arch has userfaultfd write protection support > + > config MEMBARRIER > bool "Enable membarrier() system call" if EXPERT > default y > -- > 2.17.1 > -- Sincerely yours, Mike.