Thomas Gleixner <tglx@xxxxxxxxxxxxx> writes:

> iounmap() on x86 occasionally fails to unmap because the provided valid
> ioremap address is not below high_memory. It turned out that this
> happens due to KASLR.
>
> KASLR uses the full address space between PAGE_OFFSET and vaddr_end to
> randomize the starting points of the direct map, vmalloc and vmemmap
> regions. It thereby limits the size of the direct map by using the
> installed memory size plus an extra configurable margin for hot-plug
> memory. This limitation is done to gain more randomization space
> because otherwise only the holes between the direct map, vmalloc,
> vmemmap and vaddr_end would be usable for randomizing.
>
> The limited direct map size is not exposed to the rest of the kernel, so
> the memory hot-plug and resource management related code paths still
> operate under the assumption that the available address space can be
> determined with MAX_PHYSMEM_BITS.
>
> request_free_mem_region() allocates from (1 << MAX_PHYSMEM_BITS) - 1
> downwards. That means the first allocation happens past the end of the
> direct map and if unlucky this address is in the vmalloc space, which
> causes high_memory to become greater than VMALLOC_START and consequently
> causes iounmap() to fail for valid ioremap addresses.
>
> MAX_PHYSMEM_BITS cannot be changed for that because the randomization
> does not align with address bit boundaries and there are other places
> which actually require to know the maximum number of address bits. All
> remaining usage sites of MAX_PHYSMEM_BITS have been analyzed and found
> to be correct.
>
> Cure this by exposing the end of the direct map via PHYSMEM_END and use
> that for the memory hot-plug and resource management related places
> instead of relying on MAX_PHYSMEM_BITS. In the KASLR case PHYSMEM_END
> maps to a variable which is initialized by the KASLR initialization and
> otherwise it is based on MAX_PHYSMEM_BITS as before.
>
> To prevent future hickups add a check into add_pages() to catch callers
> trying to add memory above PHYSMEM_END.
>
> Fixes: 0483e1fa6e09 ("x86/mm: Implement ASLR for kernel memory regions")
> Reported-by: Max Ramanouski <max8rr8@xxxxxxxxx>
> Reported-by: Alistair Popple <apopple@xxxxxxxxxx>
> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>

Thanks Thomas. Looks good and it fixes the issue on a system which always
ran into the iounmap problem.
So feel free to add:

Reviewed-by: Alistair Popple <apopple@xxxxxxxxxx>
Tested-by: Alistair Popple <apopple@xxxxxxxxxx>

> ---
>  arch/x86/include/asm/page_64.h          |    1 +
>  arch/x86/include/asm/pgtable_64_types.h |    4 ++++
>  arch/x86/mm/init_64.c                   |    4 ++++
>  arch/x86/mm/kaslr.c                     |   21 ++++++++++++++++++---
>  include/linux/mm.h                      |    4 ++++
>  kernel/resource.c                       |    6 ++----
>  mm/memory_hotplug.c                     |    2 +-
>  mm/sparse.c                             |    2 +-
>  8 files changed, 35 insertions(+), 9 deletions(-)
>
> --- a/arch/x86/include/asm/page_64.h
> +++ b/arch/x86/include/asm/page_64.h
> @@ -17,6 +17,7 @@ extern unsigned long phys_base;
>  extern unsigned long page_offset_base;
>  extern unsigned long vmalloc_base;
>  extern unsigned long vmemmap_base;
> +extern unsigned long physmem_end;
>
>  static __always_inline unsigned long __phys_addr_nodebug(unsigned long x)
>  {
> --- a/arch/x86/include/asm/pgtable_64_types.h
> +++ b/arch/x86/include/asm/pgtable_64_types.h
> @@ -140,6 +140,10 @@ extern unsigned int ptrs_per_p4d;
>  # define VMEMMAP_START		__VMEMMAP_BASE_L4
>  #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
>
> +#ifdef CONFIG_RANDOMIZE_MEMORY
> +# define PHYSMEM_END		physmem_end
> +#endif
> +
>  /*
>   * End of the region for which vmalloc page tables are pre-allocated.
>   * For non-KMSAN builds, this is the same as VMALLOC_END.
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -958,8 +958,12 @@ static void update_end_of_memory_vars(u6
>  int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
>  	      struct mhp_params *params)
>  {
> +	unsigned long end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1;
>  	int ret;
>
> +	if (WARN_ON_ONCE(end > PHYSMEM_END))
> +		return -ERANGE;
> +
>  	ret = __add_pages(nid, start_pfn, nr_pages, params);
>  	WARN_ON_ONCE(ret);
>
> --- a/arch/x86/mm/kaslr.c
> +++ b/arch/x86/mm/kaslr.c
> @@ -47,13 +47,24 @@ static const unsigned long vaddr_end = C
>   */
>  static __initdata struct kaslr_memory_region {
>  	unsigned long *base;
> +	unsigned long *end;
>  	unsigned long size_tb;
>  } kaslr_regions[] = {
> -	{ &page_offset_base, 0 },
> -	{ &vmalloc_base, 0 },
> -	{ &vmemmap_base, 0 },
> +	{
> +		.base = &page_offset_base,
> +		.end  = &physmem_end,
> +	},
> +	{
> +		.base = &vmalloc_base,
> +	},
> +	{
> +		.base = &vmemmap_base,
> +	},
>  };
>
> +/* The end of the possible address space for physical memory */
> +unsigned long physmem_end __ro_after_init;
> +
>  /* Get size in bytes used by the memory region */
>  static inline unsigned long get_padding(struct kaslr_memory_region *region)
>  {
> @@ -82,6 +93,8 @@ void __init kernel_randomize_memory(void
>  	BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE);
>  	BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
>
> +	/* Preset the end of the possible address space for physical memory */
> +	physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1);
>  	if (!kaslr_memory_enabled())
>  		return;
>
> @@ -134,6 +147,8 @@ void __init kernel_randomize_memory(void
>  		 */
>  		vaddr += get_padding(&kaslr_regions[i]);
>  		vaddr = round_up(vaddr + 1, PUD_SIZE);
> +		if (kaslr_regions[i].end)
> +			*kaslr_regions[i].end = __pa(vaddr) - 1;
>  		remain_entropy -= entropy;
>  	}
>  }
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -97,6 +97,10 @@ extern const int mmap_rnd_compat_bits_ma
>  extern int mmap_rnd_compat_bits __read_mostly;
>  #endif
>
> +#ifndef PHYSMEM_END
> +# define PHYSMEM_END	((1ULL << MAX_PHYSMEM_BITS) - 1)
> +#endif
> +
>  #include <asm/page.h>
>  #include <asm/processor.h>
>
> --- a/kernel/resource.c
> +++ b/kernel/resource.c
> @@ -1826,8 +1826,7 @@ static resource_size_t gfr_start(struct
>  	if (flags & GFR_DESCENDING) {
>  		resource_size_t end;
>
> -		end = min_t(resource_size_t, base->end,
> -			    (1ULL << MAX_PHYSMEM_BITS) - 1);
> +		end = min_t(resource_size_t, base->end, PHYSMEM_END);
>  		return end - size + 1;
>  	}
>
> @@ -1844,8 +1843,7 @@ static bool gfr_continue(struct resource
>  	 * @size did not wrap 0.
>  	 */
>  	return addr > addr - size &&
> -		addr <= min_t(resource_size_t, base->end,
> -			      (1ULL << MAX_PHYSMEM_BITS) - 1);
> +		addr <= min_t(resource_size_t, base->end, PHYSMEM_END);
>  }
>
>  static resource_size_t gfr_next(resource_size_t addr, resource_size_t size,
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1681,7 +1681,7 @@ struct range __weak arch_get_mappable_ra
>
>  struct range mhp_get_pluggable_range(bool need_mapping)
>  {
> -	const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1;
> +	const u64 max_phys = PHYSMEM_END;
>  	struct range mhp_range;
>
>  	if (need_mapping) {
> --- a/mm/sparse.c
> +++ b/mm/sparse.c
> @@ -129,7 +129,7 @@ static inline int sparse_early_nid(struc
>  static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
>  						unsigned long *end_pfn)
>  {
> -	unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
> +	unsigned long max_sparsemem_pfn = (PHYSMEM_END + 1) >> PAGE_SHIFT;
>
>  	/*
>  	 * Sanity checks - do not allow an architecture to pass
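
For anyone else tracing the kernel/resource.c change through: below is a
quick userspace-only sketch of the clamping those hunks introduce. It is
an illustration, not kernel code: the helper name gfr_start_like() and the
pretend PHYSMEM_END value are made up here; the real logic uses min_t()
on base->end in gfr_start()/gfr_continue().

/*
 * Illustration only: a descending search start, clamped to a pretend
 * KASLR-limited end of the direct map instead of the full
 * (1 << MAX_PHYSMEM_BITS) - 1 address space.
 */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define MAX_PHYSMEM_BITS	46ULL			/* x86-64 without 5-level paging */
#define PHYSMEM_END		0x1ffffffffffULL	/* pretend KASLR-limited direct map end */

static uint64_t gfr_start_like(uint64_t base_end, uint64_t size)
{
	/* Clamp the top of the descending search to PHYSMEM_END. */
	uint64_t end = base_end < PHYSMEM_END ? base_end : PHYSMEM_END;

	return end - size + 1;
}

int main(void)
{
	uint64_t iomem_end = (1ULL << MAX_PHYSMEM_BITS) - 1;
	uint64_t size = 1ULL << 30;	/* 1 GiB request */

	/* Before the fix the first candidate range started here ... */
	printf("unclamped start: %#" PRIx64 "\n", iomem_end - size + 1);
	/* ... now it cannot start above the direct map end. */
	printf("clamped start:   %#" PRIx64 "\n", gfr_start_like(iomem_end, size));
	return 0;
}

With the clamp in place the first candidate range can no longer sit past
the end of the direct map, which is what previously let high_memory grow
beyond VMALLOC_START and broke iounmap().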