On 2/10/25 16:34, Ryan Roberts wrote:
> On 10/02/2025 07:45, Anshuman Khandual wrote:
>>
>>
>> On 2/5/25 20:39, Ryan Roberts wrote:
>>> arch_sync_kernel_mappings() is an optional hook for arches to allow them
>>> to synchronize certain levels of the kernel pgtables after modification.
>>> But arm64 could benefit from a hook similar to this, paired with a call
>>> prior to starting the batch of modifications.
>>>
>>> So let's introduce arch_update_kernel_mappings_begin() and
>>> arch_update_kernel_mappings_end(). Both have a default implementation
>>> which can be overridden by the arch code. The default for the former is
>>> a nop, and the default for the latter is to call
>>> arch_sync_kernel_mappings(), so the latter replaces previous
>>> arch_sync_kernel_mappings() callsites. So by default, the resulting
>>> behaviour is unchanged.
>>>
>>> To avoid include hell, the pgtbl_mod_mask type and its associated
>>> macros are moved to their own header.
>>>
>>> In a future patch, arm64 will opt-in to overriding both functions.
>>>
>>> Signed-off-by: Ryan Roberts <ryan.roberts@xxxxxxx>
>>> ---
>>>  include/linux/pgtable.h         | 24 +----------------
>>>  include/linux/pgtable_modmask.h | 32 ++++++++++++++++++++++
>>>  include/linux/vmalloc.h         | 47 +++++++++++++++++++++++++++++++++
>>>  mm/memory.c                     |  5 ++--
>>>  mm/vmalloc.c                    | 15 ++++++-----
>>>  5 files changed, 92 insertions(+), 31 deletions(-)
>>>  create mode 100644 include/linux/pgtable_modmask.h
>>>
>>> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
>>> index 94d267d02372..7f70786a73b3 100644
>>> --- a/include/linux/pgtable.h
>>> +++ b/include/linux/pgtable.h
>>> @@ -4,6 +4,7 @@
>>>
>>>  #include <linux/pfn.h>
>>>  #include <asm/pgtable.h>
>>> +#include <linux/pgtable_modmask.h>
>>>
>>>  #define PMD_ORDER	(PMD_SHIFT - PAGE_SHIFT)
>>>  #define PUD_ORDER	(PUD_SHIFT - PAGE_SHIFT)
>>> @@ -1786,29 +1787,6 @@ static inline bool arch_has_pfn_modify_check(void)
>>>  # define PAGE_KERNEL_EXEC PAGE_KERNEL
>>>  #endif
>>>
>>> -/*
>>> - * Page Table Modification bits for pgtbl_mod_mask.
>>> - *
>>> - * These are used by the p?d_alloc_track*() set of functions and in the generic
>>> - * vmalloc/ioremap code to track at which page-table levels entries have been
>>> - * modified. Based on that the code can better decide when vmalloc and ioremap
>>> - * mapping changes need to be synchronized to other page-tables in the system.
>>> - */
>>> -#define __PGTBL_PGD_MODIFIED	0
>>> -#define __PGTBL_P4D_MODIFIED	1
>>> -#define __PGTBL_PUD_MODIFIED	2
>>> -#define __PGTBL_PMD_MODIFIED	3
>>> -#define __PGTBL_PTE_MODIFIED	4
>>> -
>>> -#define PGTBL_PGD_MODIFIED	BIT(__PGTBL_PGD_MODIFIED)
>>> -#define PGTBL_P4D_MODIFIED	BIT(__PGTBL_P4D_MODIFIED)
>>> -#define PGTBL_PUD_MODIFIED	BIT(__PGTBL_PUD_MODIFIED)
>>> -#define PGTBL_PMD_MODIFIED	BIT(__PGTBL_PMD_MODIFIED)
>>> -#define PGTBL_PTE_MODIFIED	BIT(__PGTBL_PTE_MODIFIED)
>>> -
>>> -/* Page-Table Modification Mask */
>>> -typedef unsigned int pgtbl_mod_mask;
>>> -
>>>  #endif /* !__ASSEMBLY__ */
>>>
>>>  #if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT)
>>> diff --git a/include/linux/pgtable_modmask.h b/include/linux/pgtable_modmask.h
>>> new file mode 100644
>>> index 000000000000..5a21b1bb8df3
>>> --- /dev/null
>>> +++ b/include/linux/pgtable_modmask.h
>>> @@ -0,0 +1,32 @@
>>> +/* SPDX-License-Identifier: GPL-2.0 */
>>> +#ifndef _LINUX_PGTABLE_MODMASK_H
>>> +#define _LINUX_PGTABLE_MODMASK_H
>>> +
>>> +#ifndef __ASSEMBLY__
>>> +
>>> +/*
>>> + * Page Table Modification bits for pgtbl_mod_mask.
>>> + *
>>> + * These are used by the p?d_alloc_track*() set of functions and in the generic
>>> + * vmalloc/ioremap code to track at which page-table levels entries have been
>>> + * modified. Based on that the code can better decide when vmalloc and ioremap
>>> + * mapping changes need to be synchronized to other page-tables in the system.
>>> + */
>>> +#define __PGTBL_PGD_MODIFIED	0
>>> +#define __PGTBL_P4D_MODIFIED	1
>>> +#define __PGTBL_PUD_MODIFIED	2
>>> +#define __PGTBL_PMD_MODIFIED	3
>>> +#define __PGTBL_PTE_MODIFIED	4
>>> +
>>> +#define PGTBL_PGD_MODIFIED	BIT(__PGTBL_PGD_MODIFIED)
>>> +#define PGTBL_P4D_MODIFIED	BIT(__PGTBL_P4D_MODIFIED)
>>> +#define PGTBL_PUD_MODIFIED	BIT(__PGTBL_PUD_MODIFIED)
>>> +#define PGTBL_PMD_MODIFIED	BIT(__PGTBL_PMD_MODIFIED)
>>> +#define PGTBL_PTE_MODIFIED	BIT(__PGTBL_PTE_MODIFIED)
>>> +
>>> +/* Page-Table Modification Mask */
>>> +typedef unsigned int pgtbl_mod_mask;
>>> +
>>> +#endif /* !__ASSEMBLY__ */
>>> +
>>> +#endif /* _LINUX_PGTABLE_MODMASK_H */
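
Not a comment on this patch as such, but since the mask is moving to its own
header it may help other readers to restate how it gets populated. The generic
walkers accumulate it roughly like the below (simplified, illustrative sketch
loosely based on mm/vmalloc.c; not code from this patch):

	pgtbl_mod_mask mask = 0;
	pgd_t *pgd = pgd_offset_k(addr);
	p4d_t *p4d;

	/* Sets PGTBL_PGD_MODIFIED in mask if the pgd entry was populated. */
	p4d = p4d_alloc_track(&init_mm, pgd, addr, &mask);
	if (!p4d)
		return -ENOMEM;

	/* ... lower levels accumulate P4D/PUD/PMD/PTE_MODIFIED the same way ... */

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);
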
>>> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
>>> index 16dd4cba64f2..cb5d8f1965a1 100644
>>> --- a/include/linux/vmalloc.h
>>> +++ b/include/linux/vmalloc.h
>>> @@ -11,6 +11,7 @@
>>>  #include <asm/page.h>		/* pgprot_t */
>>>  #include <linux/rbtree.h>
>>>  #include <linux/overflow.h>
>>> +#include <linux/pgtable_modmask.h>
>>>
>>>  #include <asm/vmalloc.h>
>>>
>>> @@ -213,6 +214,26 @@ extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
>>>  int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot,
>>>  		struct page **pages, unsigned int page_shift);
>>>
>>> +#ifndef arch_update_kernel_mappings_begin
>>> +/**
>>> + * arch_update_kernel_mappings_begin - A batch of kernel pgtable mappings are
>>> + *                                     about to be updated.
>>> + * @start: Virtual address of start of range to be updated.
>>> + * @end: Virtual address of end of range to be updated.
>>> + *
>>> + * An optional hook to allow architecture code to prepare for a batch of kernel
>>> + * pgtable mapping updates. An architecture may use this to enter a lazy mode
>>> + * where some operations can be deferred until the end of the batch.
>>> + *
>>> + * Context: Called in task context and may be preemptible.
>>> + */
>>> +static inline void arch_update_kernel_mappings_begin(unsigned long start,
>>> +						     unsigned long end)
>>> +{
>>> +}
>>> +#endif
>>> +
>>> +#ifndef arch_update_kernel_mappings_end
>>>  /*
>>>   * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values
>>>   * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings()
>>> @@ -229,6 +250,32 @@ int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot,
>>>   */
>>>  void arch_sync_kernel_mappings(unsigned long start, unsigned long end);
>>>
>>> +/**
>>> + * arch_update_kernel_mappings_end - A batch of kernel pgtable mappings have
>>> + *                                   been updated.
>>> + * @start: Virtual address of start of range that was updated.
>>> + * @end: Virtual address of end of range that was updated.
>>> + *
>>> + * An optional hook to inform architecture code that a batch update is complete.
>>> + * This balances a previous call to arch_update_kernel_mappings_begin().
>>> + *
>>> + * An architecture may override this for any purpose, such as exiting a lazy
>>> + * mode previously entered with arch_update_kernel_mappings_begin() or syncing
>>> + * kernel mappings to a secondary pgtable. The default implementation calls an
>>> + * arch-provided arch_sync_kernel_mappings() if any arch-defined pgtable level
>>> + * was updated.
>>> + *
>>> + * Context: Called in task context and may be preemptible.
>>> + */
>>> +static inline void arch_update_kernel_mappings_end(unsigned long start,
>>> +						   unsigned long end,
>>> +						   pgtbl_mod_mask mask)
>>> +{
>>> +	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
>>> +		arch_sync_kernel_mappings(start, end);
>>> +}
>>
>> One arch callback calling yet another arch callback sounds a bit odd.
>
> It's no different from the default implementation of arch_make_huge_pte()
> calling pte_mkhuge() is it?

Agreed. arch_make_huge_pte() ---> pte_mkhuge(), where either helper can be
customized by the platform, is another such example, but unless necessary we
should probably avoid following that. Anyway, it's not a big deal I guess.

>
>> Also should not ARCH_PAGE_TABLE_SYNC_MASK be checked both for __begin and
>> __end callbacks in case a platform subscribes into this framework?
>
> I'm not sure how that would work? The mask is accumulated during the pgtable
> walk. So we don't have a mask until we get to the end.

A non-zero ARCH_PAGE_TABLE_SYNC_MASK indicates that a platform is subscribing
to this mechanism. So could ARCH_PAGE_TABLE_SYNC_MASK != 0 be used instead?
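
i.e. something like the below at the generic callsites, relying on the
compiler to elide both calls on platforms that leave the mask as 0 (untested
sketch, just to illustrate the idea):

	if (ARCH_PAGE_TABLE_SYNC_MASK != 0)
		arch_update_kernel_mappings_begin(start, end);

	/* ... pgtable walk accumulates mask ... */

	if (ARCH_PAGE_TABLE_SYNC_MASK != 0)
		arch_update_kernel_mappings_end(start, end, mask);
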
>
>> Instead the following changes sound more reasonable, but will also require
>> some more updates for the current platforms using arch_sync_kernel_mappings().
>>
>> if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
>> 	arch_update_kernel_mappings_begin()
>
> This makes no sense. mask is always 0 before doing the walk.

Got it.

>
>>
>> if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
>> 	arch_update_kernel_mappings_end()
>>
>> Basically when any platform defines ARCH_PAGE_TABLE_SYNC_MASK and subscribes
>> to this framework, it will also provide arch_update_kernel_mappings_begin/end()
>> callbacks as required.
>
> Personally I think it's cleaner to just pass mask to
> arch_update_kernel_mappings_end() and let the function decide what it wants
> to do.
>
> But it's a good question as to whether we should refactor x86 and arm to
> directly implement arch_update_kernel_mappings_end() instead of
> arch_sync_kernel_mappings(). Personally I thought it was better to avoid the
> churn. But interested in others' opinions.
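
If we did go that way, each conversion looks fairly mechanical to me, roughly
like the below in the arch header (hypothetical sketch only;
sync_kernel_mappings() is standing in for the body of that arch's current
arch_sync_kernel_mappings()):

	static inline void arch_update_kernel_mappings_end(unsigned long start,
							   unsigned long end,
							   pgtbl_mod_mask mask)
	{
		if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
			sync_kernel_mappings(start, end);
	}
	#define arch_update_kernel_mappings_end arch_update_kernel_mappings_end
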
>
> Thanks,
> Ryan
>
>>
>>> +#endif
>>> +
>>>  /*
>>>   * Lowlevel-APIs (not for driver use!)
>>>   */
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index a15f7dd500ea..f80930bc19f6 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -3035,6 +3035,8 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
>>>  	if (WARN_ON(addr >= end))
>>>  		return -EINVAL;
>>>
>>> +	arch_update_kernel_mappings_begin(start, end);
>>> +
>>>  	pgd = pgd_offset(mm, addr);
>>>  	do {
>>>  		next = pgd_addr_end(addr, end);
>>> @@ -3055,8 +3057,7 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
>>>  			break;
>>>  	} while (pgd++, addr = next, addr != end);
>>>
>>> -	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
>>> -		arch_sync_kernel_mappings(start, start + size);
>>> +	arch_update_kernel_mappings_end(start, end, mask);
>>>
>>>  	return err;
>>>  }
>>> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
>>> index 50fd44439875..c5c51d86ef78 100644
>>> --- a/mm/vmalloc.c
>>> +++ b/mm/vmalloc.c
>>> @@ -312,10 +312,10 @@ int vmap_page_range(unsigned long addr, unsigned long end,
>>>  	pgtbl_mod_mask mask = 0;
>>>  	int err;
>>>
>>> +	arch_update_kernel_mappings_begin(addr, end);
>>>  	err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
>>>  				 ioremap_max_page_shift, &mask);
>>> -	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
>>> -		arch_sync_kernel_mappings(addr, end);
>>> +	arch_update_kernel_mappings_end(addr, end, mask);
>>>
>>>  	flush_cache_vmap(addr, end);
>>>  	if (!err)
>>> @@ -463,6 +463,9 @@ void __vunmap_range_noflush(unsigned long start, unsigned long end)
>>>  	pgtbl_mod_mask mask = 0;
>>>
>>>  	BUG_ON(addr >= end);
>>> +
>>> +	arch_update_kernel_mappings_begin(start, end);
>>> +
>>>  	pgd = pgd_offset_k(addr);
>>>  	do {
>>>  		next = pgd_addr_end(addr, end);
>>> @@ -473,8 +476,7 @@ void __vunmap_range_noflush(unsigned long start, unsigned long end)
>>>  		vunmap_p4d_range(pgd, addr, next, &mask);
>>>  	} while (pgd++, addr = next, addr != end);
>>>
>>> -	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
>>> -		arch_sync_kernel_mappings(start, end);
>>> +	arch_update_kernel_mappings_end(start, end, mask);
>>>  }
>>>
>>>  void vunmap_range_noflush(unsigned long start, unsigned long end)
>>> @@ -625,6 +627,8 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
>>>
>>>  	WARN_ON(page_shift < PAGE_SHIFT);
>>>
>>> +	arch_update_kernel_mappings_begin(start, end);
>>> +
>>>  	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
>>>  			page_shift == PAGE_SHIFT) {
>>>  		err = vmap_small_pages_range_noflush(addr, end, prot, pages,
>>> @@ -642,8 +646,7 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
>>>  		}
>>>  	}
>>>
>>> -	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
>>> -		arch_sync_kernel_mappings(start, end);
>>> +	arch_update_kernel_mappings_end(start, end, mask);
>>>
>>>  	return err;
>>>  }
>
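
As an aside, for the future arm64 opt-in mentioned in the commit message, I am
guessing the overrides will be shaped something like the below (my speculation
only, not taken from this series; TIF_KMAP_UPDATE_ACTIVE is a made-up flag
name):

	static inline void arch_update_kernel_mappings_begin(unsigned long start,
							     unsigned long end)
	{
		/* Enter lazy mode: pgtable setters may defer their barriers. */
		set_thread_flag(TIF_KMAP_UPDATE_ACTIVE);
	}
	#define arch_update_kernel_mappings_begin arch_update_kernel_mappings_begin

	static inline void arch_update_kernel_mappings_end(unsigned long start,
							   unsigned long end,
							   pgtbl_mod_mask mask)
	{
		clear_thread_flag(TIF_KMAP_UPDATE_ACTIVE);
		/* One barrier for the whole batch instead of one per update. */
		dsb(ishst);
		isb();
	}
	#define arch_update_kernel_mappings_end arch_update_kernel_mappings_end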