* Nam Cao <namcao@xxxxxxxxxxxxx> [240825 11:29]:
> When a virtual memory area (VMA) gets split, memtype_rbroot's entries
> are not updated. This causes confusion later on when the VMAs get
> unmapped, because the address ranges of the split VMAs do not match
> the address range of the initial VMA.
>
> For example, if the user does:
>
> fd = open("/some/pci/bar", O_RDWR);
> addr = mmap(0, 8192, PROT_READ, MAP_SHARED, fd, 0);
> mprotect(addr, 4096, PROT_READ | PROT_WRITE);
> munmap(addr, 8192);
>
> with the physical address starting from 0xfd000000, the range
> (0xfd000000-0xfd002000) would be tracked by the mmap() call.
>
> After mprotect(), the initial range gets split into
> (0xfd000000-0xfd001000) and (0xfd001000-0xfd002000).
>
> Then, at munmap(), the first range does not match any entry in
> memtype_rbroot, and a message is seen in dmesg:
>
> x86/PAT: test:177 freeing invalid memtype [mem 0xfd000000-0xfd000fff]
>
> The second range still matches by accident, because matching only the
> end address is acceptable (to handle the shrinking-VMA case, added by
> commit 2039e6acaf94 ("x86/mm/pat: Change free_memtype() to support
> shrinking case")).

Does this need a Fixes: tag?

>
> Make sure VMA splitting is handled properly by splitting the entries
> in memtype_rbroot.
>
> Signed-off-by: Nam Cao <namcao@xxxxxxxxxxxxx>
> ---
>  arch/x86/mm/pat/memtype.c          | 59 ++++++++++++++++++++++++++++++
>  arch/x86/mm/pat/memtype.h          |  3 ++
>  arch/x86/mm/pat/memtype_interval.c | 22 +++++++++++
>  include/linux/pgtable.h            |  6 +++
>  mm/mmap.c                          |  8 ++++
>  5 files changed, 98 insertions(+)
>
> diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
> index bdc2a240c2aa..b60019478a76 100644
> --- a/arch/x86/mm/pat/memtype.c
> +++ b/arch/x86/mm/pat/memtype.c
> @@ -935,6 +935,46 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
>          return 0;
>  }
>
> +static int split_pfn_range(u64 start, u64 end, u64 addr)
> +{
> +        struct memtype *entry_new;
> +        int is_range_ram, ret;
> +
> +        if (!pat_enabled())
> +                return 0;
> +
> +        start = sanitize_phys(start);
> +        end = sanitize_phys(end - 1) + 1;
> +
> +        /* Low ISA region is not tracked, it is always mapped WB */
> +        if (x86_platform.is_untracked_pat_range(start, end))
> +                return 0;
> +
> +        is_range_ram = pat_pagerange_is_ram(start, end);
> +        if (is_range_ram == 1)
> +                return 0;
> +
> +        if (is_range_ram < 0)
> +                return -EINVAL;
> +
> +        entry_new = kmalloc(sizeof(*entry_new), GFP_KERNEL);
> +        if (!entry_new)
> +                return -ENOMEM;
> +
> +        spin_lock(&memtype_lock);
> +        ret = memtype_split(start, end, addr, entry_new);
> +        spin_unlock(&memtype_lock);
> +
> +        if (ret) {
> +                pr_err("x86/PAT: %s:%d splitting invalid memtype [mem %#010Lx-%#010Lx]\n",
> +                       current->comm, current->pid, start, end - 1);
> +                kfree(entry_new);
> +                return ret;
> +        }
> +
> +        return 0;
> +}
> +
>  /*
>   * Internal interface to free a range of physical memory.
>   * Frees non RAM regions only.
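
Side note on the changelog example rather than this hunk: it expands to
roughly the following standalone reproducer. This is an untested
sketch, and the sysfs path is only a made-up example of a PAT-tracked
PCI BAR; adjust it to a device that exists on the test box.

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        /* Any mmap()able PCI BAR works; this path is illustrative. */
        int fd = open("/sys/bus/pci/devices/0000:00:02.0/resource0", O_RDWR);
        if (fd < 0)
                return 1;

        void *addr = mmap(NULL, 8192, PROT_READ, MAP_SHARED, fd, 0);
        if (addr == MAP_FAILED)
                return 1;

        /* Split the VMA; with this patch the memtype entry splits too. */
        mprotect(addr, 4096, PROT_READ | PROT_WRITE);

        /* Without the patch, dmesg logs "freeing invalid memtype" here. */
        munmap(addr, 8192);
        close(fd);
        return 0;
}
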
> @@ -1072,6 +1112,25 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
>          return 0;
>  }
>
> +int track_pfn_split(struct vm_area_struct *vma, unsigned long addr)
> +{
> +        unsigned long vma_size = vma->vm_end - vma->vm_start;
> +        resource_size_t start_paddr, split_paddr;
> +        int ret;
> +
> +        if (vma->vm_flags & VM_PAT) {
> +                ret = get_pat_info(vma, &start_paddr, NULL);
> +                if (ret)
> +                        return ret;
> +
> +                split_paddr = start_paddr + addr - vma->vm_start;
> +
> +                return split_pfn_range(start_paddr, start_paddr + vma_size, split_paddr);
> +        }
> +
> +        return 0;
> +}
> +
>  void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
>  {
>          enum page_cache_mode pcm;
> diff --git a/arch/x86/mm/pat/memtype.h b/arch/x86/mm/pat/memtype.h
> index cacecdbceb55..e01dc2018ab6 100644
> --- a/arch/x86/mm/pat/memtype.h
> +++ b/arch/x86/mm/pat/memtype.h
> @@ -31,6 +31,7 @@ static inline char *cattr_name(enum page_cache_mode pcm)
>  #ifdef CONFIG_X86_PAT
>  extern int memtype_check_insert(struct memtype *entry_new,
>                                 enum page_cache_mode *new_type);
> +extern int memtype_split(u64 start, u64 end, u64 addr, struct memtype *entry_new);

I think we are dropping unnecessary externs now.

>  extern struct memtype *memtype_erase(u64 start, u64 end);
>  extern struct memtype *memtype_lookup(u64 addr);
>  extern int memtype_copy_nth_element(struct memtype *entry_out, loff_t pos);
> @@ -38,6 +39,8 @@ extern int memtype_copy_nth_element(struct memtype *entry_out, loff_t pos);
>  static inline int memtype_check_insert(struct memtype *entry_new,
>                                        enum page_cache_mode *new_type)
>  { return 0; }
> +static inline int memtype_split(u64 start, u64 end, u64 addr, struct memtype *entry_new)
> +{ return 0; }
>  static inline struct memtype *memtype_erase(u64 start, u64 end)
>  { return NULL; }
>  static inline struct memtype *memtype_lookup(u64 addr)
> diff --git a/arch/x86/mm/pat/memtype_interval.c b/arch/x86/mm/pat/memtype_interval.c
> index 645613d59942..c75d9ee6b72f 100644
> --- a/arch/x86/mm/pat/memtype_interval.c
> +++ b/arch/x86/mm/pat/memtype_interval.c
> @@ -128,6 +128,28 @@ int memtype_check_insert(struct memtype *entry_new, enum page_cache_mode *ret_ty
>          return 0;
>  }
>
> +int memtype_split(u64 start, u64 end, u64 addr, struct memtype *entry_new)
> +{
> +        struct memtype *entry_old;
> +
> +        entry_old = memtype_match(start, end, MEMTYPE_EXACT_MATCH);
> +        if (!entry_old)
> +                return -EINVAL;
> +
> +        interval_remove(entry_old, &memtype_rbroot);
> +
> +        entry_new->start = addr;
> +        entry_new->end = entry_old->end;
> +        entry_new->type = entry_old->type;
> +
> +        entry_old->end = addr;
> +
> +        interval_insert(entry_old, &memtype_rbroot);
> +        interval_insert(entry_new, &memtype_rbroot);
> +
> +        return 0;
> +}
> +
>  struct memtype *memtype_erase(u64 start, u64 end)
>  {
>          struct memtype *entry_old;
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index 2a6a3cccfc36..8bfc8d0f5dd2 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -1502,6 +1502,11 @@ static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
>          return 0;
>  }
>
> +static inline int track_pfn_split(struct vm_area_struct *vma, unsigned long addr)
> +{
> +        return 0;
> +}
> +
>  /*
>   * track_pfn_insert is called when a _new_ single pfn is established
>   * by vmf_insert_pfn().
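
Just to confirm my reading of memtype_split() with the changelog
addresses, the tree goes from one entry to two half-open ranges that
share the split address (a sketch; the cache type is only an example):

        before: [0xfd000000, 0xfd002000) type=uncached-minus

        memtype_split(0xfd000000, 0xfd002000, 0xfd001000, entry_new);

        after:  entry_old = [0xfd000000, 0xfd001000) type=uncached-minus
                entry_new = [0xfd001000, 0xfd002000) type=uncached-minus

so the later munmap() of either half finds an exact match again.
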
> @@ -1542,6 +1547,7 @@ static inline void untrack_pfn_clear(struct vm_area_struct *vma)
>  extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
>                             unsigned long pfn, unsigned long addr,
>                             unsigned long size);
> +extern int track_pfn_split(struct vm_area_struct *vma, unsigned long addr);

Same extern comment here.

>  extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
>                               pfn_t pfn);
>  extern int track_pfn_copy(struct vm_area_struct *vma);
> diff --git a/mm/mmap.c b/mm/mmap.c
> index d0dfc85b209b..64067ddb8382 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -2486,6 +2486,12 @@ static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
>          if (err)
>                  goto out_free_mpol;
>
> +        if (unlikely(vma->vm_flags & VM_PFNMAP)) {

It is also a bit odd that you check VM_PFNMAP here, then call a
function that checks another flag?

> +                err = track_pfn_split(vma, addr);
> +                if (err)
> +                        goto out_vma_unlink;
> +        }
> +

I don't think __split_vma() is the best place to put this. Can this be
done through the vm_ops->may_split() hook that is called above? This is
arch-independent code that now carries an x86-specific check, and I'd
like to keep __split_vma() out of the flag checking.

The only error after the vm_ops check is ENOMEM (without any extra GFP
restrictions on the allocations), you don't need the new vma, and you
can use the same arguments that are passed to vm_ops->may_split().
There is a rough sketch at the end of this mail.

>          if (new->vm_file)
>                  get_file(new->vm_file);
>
> @@ -2515,6 +2521,8 @@ static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
>                  vma_next(vmi);
>          return 0;
>
> +out_vma_unlink:
> +        unlink_anon_vmas(vma);
>  out_free_mpol:
>          mpol_put(vma_policy(new));
>  out_free_vmi:
> --
> 2.39.2
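
Something like the below is what I have in mind for the may_split
route. This is an untested sketch, only to show the placement: since
track_pfn_split() already returns 0 without VM_PAT (and the
!CONFIG_X86_PAT stub always returns 0), __split_vma() would not need
any flag check of its own:

        /* In __split_vma(), before the new VMA is allocated. */
        if (vma->vm_ops && vma->vm_ops->may_split) {
                err = vma->vm_ops->may_split(vma, addr);
                if (err)
                        return err;
        }

        /* Same arguments as may_split(); a no-op unless VM_PAT is set. */
        err = track_pfn_split(vma, addr);
        if (err)
                return err;

Failing here is fine because nothing has been allocated yet, so the
out_vma_unlink unwind (and the extra label) would go away.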