Re: [PATCHv4 RESEND 1/3] syscalls,x86: implement execveat() system call

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, Jun 5, 2014 at 6:40 AM, David Drysdale <drysdale@xxxxxxxxxx> wrote:
> This patch set adds execveat(2) for x86, and is derived from Meredydd
> Luff's patch from Sept 2012 (https://lkml.org/lkml/2012/9/11/528).
>
> The primary aim of adding an execveat syscall is to allow an
> implementation of fexecve(3) that does not rely on the /proc
> filesystem.  The current glibc version of fexecve(3) is implemented
> via /proc, which causes problems in sandboxed or otherwise restricted
> environments.
>
> Given the desire for a /proc-free fexecve() implementation, HPA
> suggested (https://lkml.org/lkml/2006/7/11/556) that an execveat(2)
> syscall would be an appropriate generalization.
>
> Also, having a new syscall means that it can take a flags argument
> without back-compatibility concerns.  The current implementation just
> defines the AT_SYMLINK_NOFOLLOW flag, but other flags could be added
> in future -- for example, flags for new namespaces (as suggested at
> https://lkml.org/lkml/2006/7/11/474).
>
> Related history:
>  - https://lkml.org/lkml/2006/12/27/123 is an example of someone
>    realizing that fexecve() is likely to fail in a chroot environment.
>  - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=514043 covered
>    documenting the /proc requirement of fexecve(3) in its manpage, to
>    "prevent other people from wasting their time".
>  - https://bugzilla.kernel.org/show_bug.cgi?id=74481 documented that
>    it's not possible to fexecve() a file descriptor for a script with
>    close-on-exec set (which is possible with the implementation here).
>  - https://bugzilla.redhat.com/show_bug.cgi?id=241609 described a
>    problem where a process that did setuid() could not fexecve()
>    because it no longer had access to /proc/self/fd; this has since
>    been fixed.
>
> This patch in particular (1 of 3: 2 kernel patches + 1 man-pages patch):
>
> Adds the new execveat(2) system call. execveat() is to execve() as
> openat() is to open(): it takes a file descriptor that refers to a
> directory, and resolves the filename relative to that.
>
> In addition, if the filename is NULL, execveat() executes the file
> to which the file descriptor refers. This replicates the functionality
> of fexecve(), which is a system call in other UNIXen, but whose glibc
> implementation on Linux depends on /proc being mounted.
>
> Only x86-64, i386 and x32 ABIs are supported in this patch.
>
> Based on patches by Meredydd Luff <meredydd@xxxxxxxxxxxxxxx>
>
> Signed-off-by: David Drysdale <drysdale@xxxxxxxxxx>

Hi Al,

Any thoughts on this? I think it would be quite handy.

-Kees

> ---
>  arch/x86/ia32/audit.c             |   1 +
>  arch/x86/ia32/ia32entry.S         |   1 +
>  arch/x86/kernel/audit_64.c        |   1 +
>  arch/x86/kernel/entry_64.S        |  28 +++++++
>  arch/x86/syscalls/syscall_32.tbl  |   1 +
>  arch/x86/syscalls/syscall_64.tbl  |   2 +
>  arch/x86/um/sys_call_table_64.c   |   1 +
>  fs/exec.c                         | 153 ++++++++++++++++++++++++++++++++------
>  include/linux/compat.h            |   3 +
>  include/linux/sched.h             |   4 +
>  include/linux/syscalls.h          |   4 +
>  include/uapi/asm-generic/unistd.h |   4 +-
>  kernel/sys_ni.c                   |   3 +
>  lib/audit.c                       |   3 +
>  14 files changed, 186 insertions(+), 23 deletions(-)
>
> diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c
> index 5d7b381da692..2eccc8932ae6 100644
> --- a/arch/x86/ia32/audit.c
> +++ b/arch/x86/ia32/audit.c
> @@ -35,6 +35,7 @@ int ia32_classify_syscall(unsigned syscall)
>         case __NR_socketcall:
>                 return 4;
>         case __NR_execve:
> +       case __NR_execveat:
>                 return 5;
>         default:
>                 return 1;
> diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
> index 4299eb05023c..2516c09743e0 100644
> --- a/arch/x86/ia32/ia32entry.S
> +++ b/arch/x86/ia32/ia32entry.S
> @@ -464,6 +464,7 @@ GLOBAL(\label)
>         PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
>         PTREGSCALL stub32_sigreturn, sys32_sigreturn
>         PTREGSCALL stub32_execve, compat_sys_execve
> +       PTREGSCALL stub32_execveat, compat_sys_execveat
>         PTREGSCALL stub32_fork, sys_fork
>         PTREGSCALL stub32_vfork, sys_vfork
>
> diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
> index 06d3e5a14d9d..f3672508b249 100644
> --- a/arch/x86/kernel/audit_64.c
> +++ b/arch/x86/kernel/audit_64.c
> @@ -50,6 +50,7 @@ int audit_classify_syscall(int abi, unsigned syscall)
>         case __NR_openat:
>                 return 3;
>         case __NR_execve:
> +       case __NR_execveat:
>                 return 5;
>         default:
>                 return 0;
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index 1e96c3628bf2..f9a6c2fdda15 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -872,6 +872,20 @@ ENTRY(stub_execve)
>         CFI_ENDPROC
>  END(stub_execve)
>
> +ENTRY(stub_execveat)
> +       CFI_STARTPROC
> +       addq $8, %rsp
> +       PARTIAL_FRAME 0
> +       SAVE_REST
> +       FIXUP_TOP_OF_STACK %r11
> +       call sys_execveat
> +       RESTORE_TOP_OF_STACK %r11
> +       movq %rax,RAX(%rsp)
> +       RESTORE_REST
> +       jmp int_ret_from_sys_call
> +       CFI_ENDPROC
> +END(stub_execveat)
> +
>  /*
>   * sigreturn is special because it needs to restore all registers on return.
>   * This cannot be done with SYSRET, so use the IRET return path instead.
> @@ -917,6 +931,20 @@ ENTRY(stub_x32_execve)
>         CFI_ENDPROC
>  END(stub_x32_execve)
>
> +ENTRY(stub_x32_execveat)
> +       CFI_STARTPROC
> +       addq $8, %rsp
> +       PARTIAL_FRAME 0
> +       SAVE_REST
> +       FIXUP_TOP_OF_STACK %r11
> +       call compat_sys_execveat
> +       RESTORE_TOP_OF_STACK %r11
> +       movq %rax,RAX(%rsp)
> +       RESTORE_REST
> +       jmp int_ret_from_sys_call
> +       CFI_ENDPROC
> +END(stub_x32_execveat)
> +
>  #endif
>
>  /*
> diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
> index 96bc506ac6de..2ab0712a0e7c 100644
> --- a/arch/x86/syscalls/syscall_32.tbl
> +++ b/arch/x86/syscalls/syscall_32.tbl
> @@ -359,3 +359,4 @@
>  350    i386    finit_module            sys_finit_module
>  351    i386    sched_setattr           sys_sched_setattr
>  352    i386    sched_getattr           sys_sched_getattr
> +353    i386    execveat                sys_execveat                    stub32_execveat
> diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
> index a12bddc7ccea..2e4058c14b4f 100644
> --- a/arch/x86/syscalls/syscall_64.tbl
> +++ b/arch/x86/syscalls/syscall_64.tbl
> @@ -322,6 +322,7 @@
>  313    common  finit_module            sys_finit_module
>  314    common  sched_setattr           sys_sched_setattr
>  315    common  sched_getattr           sys_sched_getattr
> +316    64      execveat                stub_execveat
>
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> @@ -358,3 +359,4 @@
>  540    x32     process_vm_writev       compat_sys_process_vm_writev
>  541    x32     setsockopt              compat_sys_setsockopt
>  542    x32     getsockopt              compat_sys_getsockopt
> +543    x32     execveat                stub_x32_execveat
> diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
> index f2f0723070ca..20c3649d0691 100644
> --- a/arch/x86/um/sys_call_table_64.c
> +++ b/arch/x86/um/sys_call_table_64.c
> @@ -31,6 +31,7 @@
>  #define stub_fork sys_fork
>  #define stub_vfork sys_vfork
>  #define stub_execve sys_execve
> +#define stub_execveat sys_execveat
>  #define stub_rt_sigreturn sys_rt_sigreturn
>
>  #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
> diff --git a/fs/exec.c b/fs/exec.c
> index 3d78fccdd723..a8676ce571ce 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -748,7 +748,22 @@ EXPORT_SYMBOL(setup_arg_pages);
>
>  #endif /* CONFIG_MMU */
>
> -static struct file *do_open_exec(struct filename *name)
> +/*
> + * Perform the extra checks that open_exec() needs over and above a normal
> + * open.
> + */
> +static int check_exec_and_deny_write(struct file *file)
> +{
> +       if (!S_ISREG(file_inode(file)->i_mode))
> +               return -EACCES;
> +
> +       if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
> +               return -EACCES;
> +
> +       return deny_write_access(file);
> +}
> +
> +static struct file *do_open_execat(int fd, struct filename *name, int flags)
>  {
>         struct file *file;
>         int err;
> @@ -758,24 +773,42 @@ static struct file *do_open_exec(struct filename *name)
>                 .intent = LOOKUP_OPEN,
>                 .lookup_flags = LOOKUP_FOLLOW,
>         };
> +       static const struct open_flags open_exec_nofollow_flags = {
> +               .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
> +               .acc_mode = MAY_EXEC | MAY_OPEN,
> +               .intent = LOOKUP_OPEN,
> +               .lookup_flags = 0,
> +       };
>
> -       file = do_filp_open(AT_FDCWD, name, &open_exec_flags);
> -       if (IS_ERR(file))
> -               goto out;
> -
> -       err = -EACCES;
> -       if (!S_ISREG(file_inode(file)->i_mode))
> -               goto exit;
> +       if (flags & ~AT_SYMLINK_NOFOLLOW)
> +               return ERR_PTR(-EINVAL);
>
> -       if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
> -               goto exit;
> +       if (name) {
> +               const struct open_flags *oflags = ((flags & AT_SYMLINK_NOFOLLOW)
> +                                                  ? &open_exec_nofollow_flags
> +                                                  : &open_exec_flags);
>
> -       fsnotify_open(file);
> +               file = do_filp_open(fd, name, oflags);
> +               if (IS_ERR(file))
> +                       goto out;
> +       } else {
> +               file = fget(fd);
> +               if (!file)
> +                       return ERR_PTR(-EBADF);
> +
> +               err = inode_permission(file->f_path.dentry->d_inode,
> +                               open_exec_flags.acc_mode);
> +               if (err)
> +                       goto exit;
> +       }
>
> -       err = deny_write_access(file);
> +       err = check_exec_and_deny_write(file);
>         if (err)
>                 goto exit;
>
> +       if (name)
> +               fsnotify_open(file);
> +
>  out:
>         return file;
>
> @@ -787,7 +820,7 @@ exit:
>  struct file *open_exec(const char *name)
>  {
>         struct filename tmp = { .name = name };
> -       return do_open_exec(&tmp);
> +       return do_open_execat(AT_FDCWD, &tmp, 0);
>  }
>  EXPORT_SYMBOL(open_exec);
>
> @@ -1437,10 +1470,12 @@ static int exec_binprm(struct linux_binprm *bprm)
>  /*
>   * sys_execve() executes a new program.
>   */
> -static int do_execve_common(struct filename *filename,
> -                               struct user_arg_ptr argv,
> -                               struct user_arg_ptr envp)
> +static int do_execveat_common(int fd, struct filename *filename,
> +                             struct user_arg_ptr argv,
> +                             struct user_arg_ptr envp,
> +                             int flags)
>  {
> +       char *pathbuf = NULL;
>         struct linux_binprm *bprm;
>         struct file *file;
>         struct files_struct *displaced;
> @@ -1481,7 +1516,7 @@ static int do_execve_common(struct filename *filename,
>         check_unsafe_exec(bprm);
>         current->in_execve = 1;
>
> -       file = do_open_exec(filename);
> +       file = do_open_execat(fd, filename, flags);
>         retval = PTR_ERR(file);
>         if (IS_ERR(file))
>                 goto out_unmark;
> @@ -1489,7 +1524,21 @@ static int do_execve_common(struct filename *filename,
>         sched_exec();
>
>         bprm->file = file;
> -       bprm->filename = bprm->interp = filename->name;
> +       if (filename && fd == AT_FDCWD) {
> +               bprm->filename = filename->name;
> +       } else {
> +               pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
> +               if (!pathbuf) {
> +                       retval = -ENOMEM;
> +                       goto out_unmark;
> +               }
> +               bprm->filename = d_path(&file->f_path, pathbuf, PATH_MAX);
> +               if (IS_ERR(bprm->filename)) {
> +                       retval = PTR_ERR(bprm->filename);
> +                       goto out_unmark;
> +               }
> +       }
> +       bprm->interp = bprm->filename;
>
>         retval = bprm_mm_init(bprm);
>         if (retval)
> @@ -1530,7 +1579,8 @@ static int do_execve_common(struct filename *filename,
>         acct_update_integrals(current);
>         task_numa_free(current);
>         free_bprm(bprm);
> -       putname(filename);
> +       if (filename)
> +               putname(filename);
>         if (displaced)
>                 put_files_struct(displaced);
>         return retval;
> @@ -1547,12 +1597,14 @@ out_unmark:
>
>  out_free:
>         free_bprm(bprm);
> +       kfree(pathbuf);
>
>  out_files:
>         if (displaced)
>                 reset_files_struct(displaced);
>  out_ret:
> -       putname(filename);
> +       if (filename)
> +               putname(filename);
>         return retval;
>  }
>
> @@ -1562,7 +1614,17 @@ int do_execve(struct filename *filename,
>  {
>         struct user_arg_ptr argv = { .ptr.native = __argv };
>         struct user_arg_ptr envp = { .ptr.native = __envp };
> -       return do_execve_common(filename, argv, envp);
> +       return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
> +}
> +
> +int do_execveat(int fd, struct filename *filename,
> +               const char __user *const __user *__argv,
> +               const char __user *const __user *__envp,
> +               int flags)
> +{
> +       struct user_arg_ptr argv = { .ptr.native = __argv };
> +       struct user_arg_ptr envp = { .ptr.native = __envp };
> +       return do_execveat_common(fd, filename, argv, envp, flags);
>  }
>
>  #ifdef CONFIG_COMPAT
> @@ -1578,7 +1640,23 @@ static int compat_do_execve(struct filename *filename,
>                 .is_compat = true,
>                 .ptr.compat = __envp,
>         };
> -       return do_execve_common(filename, argv, envp);
> +       return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
> +}
> +
> +static int compat_do_execveat(int fd, struct filename *filename,
> +                             const compat_uptr_t __user *__argv,
> +                             const compat_uptr_t __user *__envp,
> +                             int flags)
> +{
> +       struct user_arg_ptr argv = {
> +               .is_compat = true,
> +               .ptr.compat = __argv,
> +       };
> +       struct user_arg_ptr envp = {
> +               .is_compat = true,
> +               .ptr.compat = __envp,
> +       };
> +       return do_execveat_common(fd, filename, argv, envp, flags);
>  }
>  #endif
>
> @@ -1618,6 +1696,22 @@ SYSCALL_DEFINE3(execve,
>  {
>         return do_execve(getname(filename), argv, envp);
>  }
> +
> +SYSCALL_DEFINE5(execveat,
> +               int, fd, const char __user *, filename,
> +               const char __user *const __user *, argv,
> +               const char __user *const __user *, envp,
> +               int, flags)
> +{
> +       struct filename *path = NULL;
> +       if (filename) {
> +               path = getname(filename);
> +               if (IS_ERR(path))
> +                       return PTR_ERR(path);
> +       }
> +       return do_execveat(fd, path, argv, envp, flags);
> +}
> +
>  #ifdef CONFIG_COMPAT
>  asmlinkage long compat_sys_execve(const char __user * filename,
>         const compat_uptr_t __user * argv,
> @@ -1625,4 +1719,19 @@ asmlinkage long compat_sys_execve(const char __user * filename,
>  {
>         return compat_do_execve(getname(filename), argv, envp);
>  }
> +
> +asmlinkage long compat_sys_execveat(int fd,
> +       const char __user *filename,
> +       const compat_uptr_t __user *argv,
> +       const compat_uptr_t __user *envp,
> +       int flags)
> +{
> +       struct filename *path = NULL;
> +       if (filename) {
> +               path = getname(filename);
> +               if (IS_ERR(path))
> +                       return PTR_ERR(path);
> +       }
> +       return compat_do_execveat(fd, path, argv, envp, flags);
> +}
>  #endif
> diff --git a/include/linux/compat.h b/include/linux/compat.h
> index 3f448c65511b..e875a5d97e08 100644
> --- a/include/linux/compat.h
> +++ b/include/linux/compat.h
> @@ -341,6 +341,9 @@ asmlinkage long compat_sys_lseek(unsigned int, compat_off_t, unsigned int);
>
>  asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv,
>                      const compat_uptr_t __user *envp);
> +asmlinkage long compat_sys_execveat(int dfd, const char __user *filename,
> +                    const compat_uptr_t __user *argv,
> +                    const compat_uptr_t __user *envp, int flags);
>
>  asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
>                 compat_ulong_t __user *outp, compat_ulong_t __user *exp,
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index a781dec1cd0b..92601818b4fb 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2315,6 +2315,10 @@ extern int disallow_signal(int);
>  extern int do_execve(struct filename *,
>                      const char __user * const __user *,
>                      const char __user * const __user *);
> +extern int do_execveat(int, struct filename *,
> +                      const char __user * const __user *,
> +                      const char __user * const __user *,
> +                      int);
>  extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
>  struct task_struct *fork_idle(int);
>  extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index a747a77ea584..309a810cf39d 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -855,4 +855,8 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>                          unsigned long idx1, unsigned long idx2);
>  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
> +asmlinkage long sys_execveat(int dfd, const char __user *filename,
> +                       const char __user *const __user *argv,
> +                       const char __user *const __user *envp, int flags);
> +
>  #endif
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index dde8041f40d2..4231bca3f95e 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -696,9 +696,11 @@ __SYSCALL(__NR_finit_module, sys_finit_module)
>  __SYSCALL(__NR_sched_setattr, sys_sched_setattr)
>  #define __NR_sched_getattr 275
>  __SYSCALL(__NR_sched_getattr, sys_sched_getattr)
> +#define __NR_execveat 276
> +__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
>
>  #undef __NR_syscalls
> -#define __NR_syscalls 276
> +#define __NR_syscalls 277
>
>  /*
>   * All syscalls below here should go away really,
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 7078052284fd..5ea7b8ab9e63 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -209,3 +209,6 @@ cond_syscall(compat_sys_open_by_handle_at);
>
>  /* compare kernel pointers */
>  cond_syscall(sys_kcmp);
> +
> +/* execveat */
> +cond_syscall(sys_execveat);
> diff --git a/lib/audit.c b/lib/audit.c
> index 76bbed4a20e5..712456ed5960 100644
> --- a/lib/audit.c
> +++ b/lib/audit.c
> @@ -48,6 +48,9 @@ int audit_classify_syscall(int abi, unsigned syscall)
>         case __NR_socketcall:
>                 return 4;
>  #endif
> +#ifdef __NR_execveat
> +       case __NR_execveat:
> +#endif
>         case __NR_execve:
>                 return 5;
>         default:
> --
> 1.9.1.423.g4596e3a
>



-- 
Kees Cook
Chrome OS Security
--
To unsubscribe from this list: send the line "unsubscribe linux-api" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux