This patch set adds execveat(2) for x86, and is derived from Meredydd Luff's patch from Sept 2012 (https://lkml.org/lkml/2012/9/11/528). The primary aim of adding an execveat syscall is to allow an implementation of fexecve(3) that does not rely on the /proc filesystem. The current glibc version of fexecve(3) is implemented via /proc, which causes problems in sandboxed or otherwise restricted environments. Given the desire for a /proc-free fexecve() implementation, HPA suggested (https://lkml.org/lkml/2006/7/11/556) that an execveat(2) syscall would be an appropriate generalization. Also, having a new syscall means that it can take a flags argument without back-compatibility concerns. The current implementation just defines the AT_SYMLINK_NOFOLLOW flag, but other flags could be added in future -- for example, flags for new namespaces (as suggested at https://lkml.org/lkml/2006/7/11/474). Related history: - https://lkml.org/lkml/2006/12/27/123 is an example of someone realizing that fexecve() is likely to fail in a chroot environment. - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=514043 covered documenting the /proc requirement of fexecve(3) in its manpage, to "prevent other people from wasting their time". - https://bugzilla.kernel.org/show_bug.cgi?id=74481 documented that it's not possible to fexecve() a file descriptor for a script with close-on-exec set (which is possible with the implementation here). - https://bugzilla.redhat.com/show_bug.cgi?id=241609 described a problem where a process that did setuid() could not fexecve() because it no longer had access to /proc/self/fd; this has since been fixed. This patch in particular (of 3 = 2 kernel + 1 man-pages): Adds the new system execveat(2) syscall. execveat() is to execve() as openat() is to open(): it takes a file descriptor that refers to a directory, and resolves the filename relative to that. In addition, if the filename is NULL, execveat() executes the file to which the file descriptor refers. This replicates the functionality of fexecve(), which is a system call in other UNIXen, but in Linux glibc it depends on /proc being mounted. Only x86-64, i386 and x32 ABIs are supported in this patch. Based on patches by Meredydd Luff <meredydd@xxxxxxxxxxxxxxx> Signed-off-by: David Drysdale <drysdale@xxxxxxxxxx> --- arch/x86/ia32/audit.c | 1 + arch/x86/ia32/ia32entry.S | 1 + arch/x86/kernel/audit_64.c | 1 + arch/x86/kernel/entry_64.S | 28 +++++++ arch/x86/syscalls/syscall_32.tbl | 1 + arch/x86/syscalls/syscall_64.tbl | 2 + arch/x86/um/sys_call_table_64.c | 1 + fs/exec.c | 153 ++++++++++++++++++++++++++++++++------ include/linux/compat.h | 3 + include/linux/sched.h | 4 + include/linux/syscalls.h | 4 + include/uapi/asm-generic/unistd.h | 4 +- kernel/sys_ni.c | 3 + lib/audit.c | 3 + 14 files changed, 186 insertions(+), 23 deletions(-) diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c index 5d7b381da692..2eccc8932ae6 100644 --- a/arch/x86/ia32/audit.c +++ b/arch/x86/ia32/audit.c @@ -35,6 +35,7 @@ int ia32_classify_syscall(unsigned syscall) case __NR_socketcall: return 4; case __NR_execve: + case __NR_execveat: return 5; default: return 1; diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 4299eb05023c..2516c09743e0 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -464,6 +464,7 @@ GLOBAL(\label) PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn PTREGSCALL stub32_sigreturn, sys32_sigreturn PTREGSCALL stub32_execve, compat_sys_execve + PTREGSCALL stub32_execveat, compat_sys_execveat PTREGSCALL stub32_fork, sys_fork PTREGSCALL stub32_vfork, sys_vfork diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c index 06d3e5a14d9d..f3672508b249 100644 --- a/arch/x86/kernel/audit_64.c +++ b/arch/x86/kernel/audit_64.c @@ -50,6 +50,7 @@ int audit_classify_syscall(int abi, unsigned syscall) case __NR_openat: return 3; case __NR_execve: + case __NR_execveat: return 5; default: return 0; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1e96c3628bf2..f9a6c2fdda15 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -872,6 +872,20 @@ ENTRY(stub_execve) CFI_ENDPROC END(stub_execve) +ENTRY(stub_execveat) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call sys_execveat + RESTORE_TOP_OF_STACK %r11 + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_execveat) + /* * sigreturn is special because it needs to restore all registers on return. * This cannot be done with SYSRET, so use the IRET return path instead. @@ -917,6 +931,20 @@ ENTRY(stub_x32_execve) CFI_ENDPROC END(stub_x32_execve) +ENTRY(stub_x32_execveat) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call compat_sys_execveat + RESTORE_TOP_OF_STACK %r11 + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_x32_execveat) + #endif /* diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index 96bc506ac6de..2ab0712a0e7c 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -359,3 +359,4 @@ 350 i386 finit_module sys_finit_module 351 i386 sched_setattr sys_sched_setattr 352 i386 sched_getattr sys_sched_getattr +353 i386 execveat sys_execveat stub32_execveat diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index a12bddc7ccea..2e4058c14b4f 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -322,6 +322,7 @@ 313 common finit_module sys_finit_module 314 common sched_setattr sys_sched_setattr 315 common sched_getattr sys_sched_getattr +316 64 execveat stub_execveat # # x32-specific system call numbers start at 512 to avoid cache impact @@ -358,3 +359,4 @@ 540 x32 process_vm_writev compat_sys_process_vm_writev 541 x32 setsockopt compat_sys_setsockopt 542 x32 getsockopt compat_sys_getsockopt +543 x32 execveat stub_x32_execveat diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index f2f0723070ca..20c3649d0691 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -31,6 +31,7 @@ #define stub_fork sys_fork #define stub_vfork sys_vfork #define stub_execve sys_execve +#define stub_execveat sys_execveat #define stub_rt_sigreturn sys_rt_sigreturn #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) diff --git a/fs/exec.c b/fs/exec.c index 3d78fccdd723..a8676ce571ce 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -748,7 +748,22 @@ EXPORT_SYMBOL(setup_arg_pages); #endif /* CONFIG_MMU */ -static struct file *do_open_exec(struct filename *name) +/* + * Perform the extra checks that open_exec() needs over and above a normal + * open. + */ +static int check_exec_and_deny_write(struct file *file) +{ + if (!S_ISREG(file_inode(file)->i_mode)) + return -EACCES; + + if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) + return -EACCES; + + return deny_write_access(file); +} + +static struct file *do_open_execat(int fd, struct filename *name, int flags) { struct file *file; int err; @@ -758,24 +773,42 @@ static struct file *do_open_exec(struct filename *name) .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; + static const struct open_flags open_exec_nofollow_flags = { + .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .acc_mode = MAY_EXEC | MAY_OPEN, + .intent = LOOKUP_OPEN, + .lookup_flags = 0, + }; - file = do_filp_open(AT_FDCWD, name, &open_exec_flags); - if (IS_ERR(file)) - goto out; - - err = -EACCES; - if (!S_ISREG(file_inode(file)->i_mode)) - goto exit; + if (flags & ~AT_SYMLINK_NOFOLLOW) + return ERR_PTR(-EINVAL); - if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) - goto exit; + if (name) { + const struct open_flags *oflags = ((flags & AT_SYMLINK_NOFOLLOW) + ? &open_exec_nofollow_flags + : &open_exec_flags); - fsnotify_open(file); + file = do_filp_open(fd, name, oflags); + if (IS_ERR(file)) + goto out; + } else { + file = fget(fd); + if (!file) + return ERR_PTR(-EBADF); + + err = inode_permission(file->f_path.dentry->d_inode, + open_exec_flags.acc_mode); + if (err) + goto exit; + } - err = deny_write_access(file); + err = check_exec_and_deny_write(file); if (err) goto exit; + if (name) + fsnotify_open(file); + out: return file; @@ -787,7 +820,7 @@ exit: struct file *open_exec(const char *name) { struct filename tmp = { .name = name }; - return do_open_exec(&tmp); + return do_open_execat(AT_FDCWD, &tmp, 0); } EXPORT_SYMBOL(open_exec); @@ -1437,10 +1470,12 @@ static int exec_binprm(struct linux_binprm *bprm) /* * sys_execve() executes a new program. */ -static int do_execve_common(struct filename *filename, - struct user_arg_ptr argv, - struct user_arg_ptr envp) +static int do_execveat_common(int fd, struct filename *filename, + struct user_arg_ptr argv, + struct user_arg_ptr envp, + int flags) { + char *pathbuf = NULL; struct linux_binprm *bprm; struct file *file; struct files_struct *displaced; @@ -1481,7 +1516,7 @@ static int do_execve_common(struct filename *filename, check_unsafe_exec(bprm); current->in_execve = 1; - file = do_open_exec(filename); + file = do_open_execat(fd, filename, flags); retval = PTR_ERR(file); if (IS_ERR(file)) goto out_unmark; @@ -1489,7 +1524,21 @@ static int do_execve_common(struct filename *filename, sched_exec(); bprm->file = file; - bprm->filename = bprm->interp = filename->name; + if (filename && fd == AT_FDCWD) { + bprm->filename = filename->name; + } else { + pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); + if (!pathbuf) { + retval = -ENOMEM; + goto out_unmark; + } + bprm->filename = d_path(&file->f_path, pathbuf, PATH_MAX); + if (IS_ERR(bprm->filename)) { + retval = PTR_ERR(bprm->filename); + goto out_unmark; + } + } + bprm->interp = bprm->filename; retval = bprm_mm_init(bprm); if (retval) @@ -1530,7 +1579,8 @@ static int do_execve_common(struct filename *filename, acct_update_integrals(current); task_numa_free(current); free_bprm(bprm); - putname(filename); + if (filename) + putname(filename); if (displaced) put_files_struct(displaced); return retval; @@ -1547,12 +1597,14 @@ out_unmark: out_free: free_bprm(bprm); + kfree(pathbuf); out_files: if (displaced) reset_files_struct(displaced); out_ret: - putname(filename); + if (filename) + putname(filename); return retval; } @@ -1562,7 +1614,17 @@ int do_execve(struct filename *filename, { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; - return do_execve_common(filename, argv, envp); + return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); +} + +int do_execveat(int fd, struct filename *filename, + const char __user *const __user *__argv, + const char __user *const __user *__envp, + int flags) +{ + struct user_arg_ptr argv = { .ptr.native = __argv }; + struct user_arg_ptr envp = { .ptr.native = __envp }; + return do_execveat_common(fd, filename, argv, envp, flags); } #ifdef CONFIG_COMPAT @@ -1578,7 +1640,23 @@ static int compat_do_execve(struct filename *filename, .is_compat = true, .ptr.compat = __envp, }; - return do_execve_common(filename, argv, envp); + return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); +} + +static int compat_do_execveat(int fd, struct filename *filename, + const compat_uptr_t __user *__argv, + const compat_uptr_t __user *__envp, + int flags) +{ + struct user_arg_ptr argv = { + .is_compat = true, + .ptr.compat = __argv, + }; + struct user_arg_ptr envp = { + .is_compat = true, + .ptr.compat = __envp, + }; + return do_execveat_common(fd, filename, argv, envp, flags); } #endif @@ -1618,6 +1696,22 @@ SYSCALL_DEFINE3(execve, { return do_execve(getname(filename), argv, envp); } + +SYSCALL_DEFINE5(execveat, + int, fd, const char __user *, filename, + const char __user *const __user *, argv, + const char __user *const __user *, envp, + int, flags) +{ + struct filename *path = NULL; + if (filename) { + path = getname(filename); + if (IS_ERR(path)) + return PTR_ERR(path); + } + return do_execveat(fd, path, argv, envp, flags); +} + #ifdef CONFIG_COMPAT asmlinkage long compat_sys_execve(const char __user * filename, const compat_uptr_t __user * argv, @@ -1625,4 +1719,19 @@ asmlinkage long compat_sys_execve(const char __user * filename, { return compat_do_execve(getname(filename), argv, envp); } + +asmlinkage long compat_sys_execveat(int fd, + const char __user *filename, + const compat_uptr_t __user *argv, + const compat_uptr_t __user *envp, + int flags) +{ + struct filename *path = NULL; + if (filename) { + path = getname(filename); + if (IS_ERR(path)) + return PTR_ERR(path); + } + return compat_do_execveat(fd, path, argv, envp, flags); +} #endif diff --git a/include/linux/compat.h b/include/linux/compat.h index 3f448c65511b..e875a5d97e08 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -341,6 +341,9 @@ asmlinkage long compat_sys_lseek(unsigned int, compat_off_t, unsigned int); asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv, const compat_uptr_t __user *envp); +asmlinkage long compat_sys_execveat(int dfd, const char __user *filename, + const compat_uptr_t __user *argv, + const compat_uptr_t __user *envp, int flags); asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, diff --git a/include/linux/sched.h b/include/linux/sched.h index a781dec1cd0b..92601818b4fb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2315,6 +2315,10 @@ extern int disallow_signal(int); extern int do_execve(struct filename *, const char __user * const __user *, const char __user * const __user *); +extern int do_execveat(int, struct filename *, + const char __user * const __user *, + const char __user * const __user *, + int); extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a747a77ea584..309a810cf39d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -855,4 +855,8 @@ asmlinkage long sys_process_vm_writev(pid_t pid, asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2); asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags); +asmlinkage long sys_execveat(int dfd, const char __user *filename, + const char __user *const __user *argv, + const char __user *const __user *envp, int flags); + #endif diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index dde8041f40d2..4231bca3f95e 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -696,9 +696,11 @@ __SYSCALL(__NR_finit_module, sys_finit_module) __SYSCALL(__NR_sched_setattr, sys_sched_setattr) #define __NR_sched_getattr 275 __SYSCALL(__NR_sched_getattr, sys_sched_getattr) +#define __NR_execveat 276 +__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat) #undef __NR_syscalls -#define __NR_syscalls 276 +#define __NR_syscalls 277 /* * All syscalls below here should go away really, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 7078052284fd..5ea7b8ab9e63 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -209,3 +209,6 @@ cond_syscall(compat_sys_open_by_handle_at); /* compare kernel pointers */ cond_syscall(sys_kcmp); + +/* execveat */ +cond_syscall(sys_execveat); diff --git a/lib/audit.c b/lib/audit.c index 76bbed4a20e5..712456ed5960 100644 --- a/lib/audit.c +++ b/lib/audit.c @@ -48,6 +48,9 @@ int audit_classify_syscall(int abi, unsigned syscall) case __NR_socketcall: return 4; #endif +#ifdef __NR_execveat + case __NR_execveat: +#endif case __NR_execve: return 5; default: -- 1.9.1.423.g4596e3a -- To unsubscribe from this list: send the line "unsubscribe linux-api" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html