On Wed 23-08-17 16:48:51, Dan Williams wrote: > The mmap(2) syscall suffers from the ABI anti-pattern of not validating > unknown flags. However, proposals like MAP_SYNC and MAP_DIRECT need a > mechanism to define new behavior that is known to fail on older kernels > without the support. Define a new mmap3 syscall that checks for > unsupported flags at syscall entry and add a 'mmap_supported_mask' to > 'struct file_operations' so generic code can validate the ->mmap() > handler knows about the specified flags. This also arranges for the > flags to be passed to the handler so it can do further local validation > if the requested behavior can be fulfilled. > > Cc: Jan Kara <jack@xxxxxxx> > Cc: Arnd Bergmann <arnd@xxxxxxxx> > Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> > Suggested-by: Andy Lutomirski <luto@xxxxxxxxxx> > Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx> OK, are we sold on this approach to introducing new mmap flags? I'm asking because a working API for mmap flags is basically the only thing that's missing from my MAP_SYNC patches, so I'd like to rebase my patches onto something that is working... 
Honza > --- > arch/x86/entry/syscalls/syscall_32.tbl | 1 + > arch/x86/entry/syscalls/syscall_64.tbl | 1 + > include/linux/fs.h | 1 + > include/linux/mm.h | 2 +- > include/linux/mman.h | 42 ++++++++++++++++++++++++++++++++ > include/linux/syscalls.h | 3 ++ > mm/mmap.c | 32 ++++++++++++++++++++++-- > 7 files changed, 78 insertions(+), 4 deletions(-) > > diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl > index 448ac2161112..0618b5b38b45 100644 > --- a/arch/x86/entry/syscalls/syscall_32.tbl > +++ b/arch/x86/entry/syscalls/syscall_32.tbl > @@ -391,3 +391,4 @@ > 382 i386 pkey_free sys_pkey_free > 383 i386 statx sys_statx > 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl > +385 i386 mmap3 sys_mmap_pgoff_strict > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl > index 5aef183e2f85..e204c736d7e9 100644 > --- a/arch/x86/entry/syscalls/syscall_64.tbl > +++ b/arch/x86/entry/syscalls/syscall_64.tbl > @@ -339,6 +339,7 @@ > 330 common pkey_alloc sys_pkey_alloc > 331 common pkey_free sys_pkey_free > 332 common statx sys_statx > +333 common mmap3 sys_mmap_pgoff_strict > > # > # x32-specific system call numbers start at 512 to avoid cache impact > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 33d1ee8f51be..db42da9f98c4 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -1674,6 +1674,7 @@ struct file_operations { > long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); > long (*compat_ioctl) (struct file *, unsigned int, unsigned long); > int (*mmap) (struct file *, struct vm_area_struct *, unsigned long); > + unsigned long mmap_supported_mask; > int (*open) (struct inode *, struct file *); > int (*flush) (struct file *, fl_owner_t id); > int (*release) (struct inode *, struct file *); > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 46b9ac5e8569..49eef48da4b7 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ 
-2090,7 +2090,7 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo > > extern unsigned long mmap_region(struct file *file, unsigned long addr, > unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, > - struct list_head *uf); > + struct list_head *uf, unsigned long flags); > extern unsigned long do_mmap(struct file *file, unsigned long addr, > unsigned long len, unsigned long prot, unsigned long flags, > vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, > diff --git a/include/linux/mman.h b/include/linux/mman.h > index c8367041fafd..64b6cb3dec70 100644 > --- a/include/linux/mman.h > +++ b/include/linux/mman.h > @@ -7,6 +7,48 @@ > #include <linux/atomic.h> > #include <uapi/linux/mman.h> > > +/* > + * Arrange for undefined architecture specific flags to be rejected by > + * default. > + */ > +#ifndef MAP_32BIT > +#define MAP_32BIT 0 > +#endif > +#ifndef MAP_HUGE_2MB > +#define MAP_HUGE_2MB 0 > +#endif > +#ifndef MAP_HUGE_1GB > +#define MAP_HUGE_1GB 0 > +#endif > +#ifndef MAP_UNINITIALIZED > +#define MAP_UNINITIALIZED 0 > +#endif > + > +/* > + * The historical set of flags that all mmap implementations implicitly > + * support when file_operations.mmap_supported_mask is zero. With the > + * mmap3 syscall the deprecated MAP_DENYWRITE and MAP_EXECUTABLE bit > + * values are explicitly rejected with EOPNOTSUPP rather than being > + * silently accepted. 
> + */ > +#define LEGACY_MAP_SUPPORTED_MASK (MAP_SHARED \ > + | MAP_PRIVATE \ > + | MAP_FIXED \ > + | MAP_ANONYMOUS \ > + | MAP_UNINITIALIZED \ > + | MAP_GROWSDOWN \ > + | MAP_LOCKED \ > + | MAP_NORESERVE \ > + | MAP_POPULATE \ > + | MAP_NONBLOCK \ > + | MAP_STACK \ > + | MAP_HUGETLB \ > + | MAP_32BIT \ > + | MAP_HUGE_2MB \ > + | MAP_HUGE_1GB) > + > +#define MAP_SUPPORTED_MASK (LEGACY_MAP_SUPPORTED_MASK) > + > extern int sysctl_overcommit_memory; > extern int sysctl_overcommit_ratio; > extern unsigned long sysctl_overcommit_kbytes; > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h > index 3cb15ea48aee..c0e0c99cf4ad 100644 > --- a/include/linux/syscalls.h > +++ b/include/linux/syscalls.h > @@ -858,6 +858,9 @@ asmlinkage long sys_perf_event_open( > asmlinkage long sys_mmap_pgoff(unsigned long addr, unsigned long len, > unsigned long prot, unsigned long flags, > unsigned long fd, unsigned long pgoff); > +asmlinkage long sys_mmap_pgoff_strict(unsigned long addr, unsigned long len, > + unsigned long prot, unsigned long flags, > + unsigned long fd, unsigned long pgoff); > asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg); > asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name, > struct file_handle __user *handle, > diff --git a/mm/mmap.c b/mm/mmap.c > index 744faae86781..386706831d67 100644 > --- a/mm/mmap.c > +++ b/mm/mmap.c > @@ -1464,7 +1464,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, > vm_flags |= VM_NORESERVE; > } > > - addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); > + addr = mmap_region(file, addr, len, vm_flags, pgoff, uf, flags); > if (!IS_ERR_VALUE(addr) && > ((vm_flags & VM_LOCKED) || > (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) > @@ -1521,6 +1521,32 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, > return retval; > } > > +SYSCALL_DEFINE6(mmap_pgoff_strict, unsigned long, addr, unsigned long, len, > + unsigned long, prot, unsigned long, 
flags, > + unsigned long, fd, unsigned long, pgoff) > +{ > + if (flags & ~(MAP_SUPPORTED_MASK)) > + return -EOPNOTSUPP; > + > + if (!(flags & MAP_ANONYMOUS)) { > + unsigned long f_supported; > + struct file *file; > + > + audit_mmap_fd(fd, flags); > + file = fget(fd); > + if (!file) > + return -EBADF; > + f_supported = file->f_op->mmap_supported_mask; > + fput(file); > + if (!f_supported) > + f_supported = LEGACY_MAP_SUPPORTED_MASK; > + if (flags & ~f_supported) > + return -EOPNOTSUPP; > + } > + > + return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); > +} > + > #ifdef __ARCH_WANT_SYS_OLD_MMAP > struct mmap_arg_struct { > unsigned long addr; > @@ -1601,7 +1627,7 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) > > unsigned long mmap_region(struct file *file, unsigned long addr, > unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, > - struct list_head *uf) > + struct list_head *uf, unsigned long flags) > { > struct mm_struct *mm = current->mm; > struct vm_area_struct *vma, *prev; > @@ -1686,7 +1712,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > * new file must not have been exposed to user-space, yet. > */ > vma->vm_file = get_file(file); > - error = call_mmap(file, vma, 0); > + error = call_mmap(file, vma, flags); > if (error) > goto unmap_and_free_vma; > > -- Jan Kara <jack@xxxxxxxx> SUSE Labs, CR