Re: [PATCHv5 1/3] syscalls,x86: implement execveat() system call

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



David Drysdale <drysdale@xxxxxxxxxx> writes:

> Add a new execveat(2) system call. execveat() is to execve() as
> openat() is to open(): it takes a file descriptor that refers to a
> directory, and resolves the filename relative to that.
>
> In addition, if the filename is empty and AT_EMPTY_PATH is specified,
> execveat() executes the file to which the file descriptor refers. This
> replicates the functionality of fexecve(), which is a system call in
> other UNIXen, but in Linux glibc it depends on opening
> "/proc/self/fd/<fd>" (and so relies on /proc being mounted).
>
> The filename fed to the executed program as argv[0] (or the name of the
> script fed to a script interpreter) will be of the form "/dev/fd/<fd>"
> (for an empty filename) or "/dev/fd/<fd>/<filename>", effectively
> reflecting how the executable was found.  This does however mean that
> execution of a script in a /proc-less environment won't work.
>
> Only x86-64, i386 and x32 ABIs are supported in this patch.
>
> Based on patches by Meredydd Luff <meredydd@xxxxxxxxxxxxxxx>
>
> Signed-off-by: David Drysdale <drysdale@xxxxxxxxxx>
> ---
>  arch/x86/ia32/audit.c             |   1 +
>  arch/x86/ia32/ia32entry.S         |   1 +
>  arch/x86/kernel/audit_64.c        |   1 +
>  arch/x86/kernel/entry_64.S        |  28 ++++++++
>  arch/x86/syscalls/syscall_32.tbl  |   1 +
>  arch/x86/syscalls/syscall_64.tbl  |   2 +
>  arch/x86/um/sys_call_table_64.c   |   1 +
>  fs/exec.c                         | 130 ++++++++++++++++++++++++++++++++++----
>  fs/namei.c                        |   2 +-
>  include/linux/compat.h            |   3 +
>  include/linux/fs.h                |   1 +
>  include/linux/sched.h             |   4 ++
>  include/linux/syscalls.h          |   4 ++
>  include/uapi/asm-generic/unistd.h |   4 +-
>  kernel/sys_ni.c                   |   3 +
>  lib/audit.c                       |   3 +
>  16 files changed, 173 insertions(+), 16 deletions(-)
>
> diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c
> index 5d7b381da692..2eccc8932ae6 100644
> --- a/arch/x86/ia32/audit.c
> +++ b/arch/x86/ia32/audit.c
> @@ -35,6 +35,7 @@ int ia32_classify_syscall(unsigned syscall)
>  	case __NR_socketcall:
>  		return 4;
>  	case __NR_execve:
> +	case __NR_execveat:
>  		return 5;
>  	default:
>  		return 1;
> diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
> index 4299eb05023c..2516c09743e0 100644
> --- a/arch/x86/ia32/ia32entry.S
> +++ b/arch/x86/ia32/ia32entry.S
> @@ -464,6 +464,7 @@ GLOBAL(\label)
>  	PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
>  	PTREGSCALL stub32_sigreturn, sys32_sigreturn
>  	PTREGSCALL stub32_execve, compat_sys_execve
> +	PTREGSCALL stub32_execveat, compat_sys_execveat
>  	PTREGSCALL stub32_fork, sys_fork
>  	PTREGSCALL stub32_vfork, sys_vfork
>  
> diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
> index 06d3e5a14d9d..f3672508b249 100644
> --- a/arch/x86/kernel/audit_64.c
> +++ b/arch/x86/kernel/audit_64.c
> @@ -50,6 +50,7 @@ int audit_classify_syscall(int abi, unsigned syscall)
>  	case __NR_openat:
>  		return 3;
>  	case __NR_execve:
> +	case __NR_execveat:
>  		return 5;
>  	default:
>  		return 0;
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index 2fac1343a90b..00c4526e6ffe 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -665,6 +665,20 @@ ENTRY(stub_execve)
>  	CFI_ENDPROC
>  END(stub_execve)
>  
> +ENTRY(stub_execveat)
> +	CFI_STARTPROC
> +	addq $8, %rsp
> +	PARTIAL_FRAME 0
> +	SAVE_REST
> +	FIXUP_TOP_OF_STACK %r11
> +	call sys_execveat
> +	RESTORE_TOP_OF_STACK %r11
> +	movq %rax,RAX(%rsp)
> +	RESTORE_REST
> +	jmp int_ret_from_sys_call
> +	CFI_ENDPROC
> +END(stub_execveat)
> +
>  /*
>   * sigreturn is special because it needs to restore all registers on return.
>   * This cannot be done with SYSRET, so use the IRET return path instead.
> @@ -710,6 +724,20 @@ ENTRY(stub_x32_execve)
>  	CFI_ENDPROC
>  END(stub_x32_execve)
>  
> +ENTRY(stub_x32_execveat)
> +	CFI_STARTPROC
> +	addq $8, %rsp
> +	PARTIAL_FRAME 0
> +	SAVE_REST
> +	FIXUP_TOP_OF_STACK %r11
> +	call compat_sys_execveat
> +	RESTORE_TOP_OF_STACK %r11
> +	movq %rax,RAX(%rsp)
> +	RESTORE_REST
> +	jmp int_ret_from_sys_call
> +	CFI_ENDPROC
> +END(stub_x32_execveat)
> +
>  #endif
>  
>  /*
> diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
> index 028b78168d85..2633e3195455 100644
> --- a/arch/x86/syscalls/syscall_32.tbl
> +++ b/arch/x86/syscalls/syscall_32.tbl
> @@ -363,3 +363,4 @@
>  354	i386	seccomp			sys_seccomp
>  355	i386	getrandom		sys_getrandom
>  356	i386	memfd_create		sys_memfd_create
> +357	i386	execveat		sys_execveat			stub32_execveat
> diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
> index 35dd922727b9..1af5badd159c 100644
> --- a/arch/x86/syscalls/syscall_64.tbl
> +++ b/arch/x86/syscalls/syscall_64.tbl
> @@ -327,6 +327,7 @@
>  318	common	getrandom		sys_getrandom
>  319	common	memfd_create		sys_memfd_create
>  320	common	kexec_file_load		sys_kexec_file_load
> +321	64	execveat		stub_execveat
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> @@ -365,3 +366,4 @@
>  542	x32	getsockopt		compat_sys_getsockopt
>  543	x32	io_setup		compat_sys_io_setup
>  544	x32	io_submit		compat_sys_io_submit
> +545	x32	execveat		stub_x32_execveat
> diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
> index f2f0723070ca..20c3649d0691 100644
> --- a/arch/x86/um/sys_call_table_64.c
> +++ b/arch/x86/um/sys_call_table_64.c
> @@ -31,6 +31,7 @@
>  #define stub_fork sys_fork
>  #define stub_vfork sys_vfork
>  #define stub_execve sys_execve
> +#define stub_execveat sys_execveat
>  #define stub_rt_sigreturn sys_rt_sigreturn
>  
>  #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
> diff --git a/fs/exec.c b/fs/exec.c
> index a2b42a98c743..92a6e14f096a 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -747,7 +747,7 @@ EXPORT_SYMBOL(setup_arg_pages);
>  
>  #endif /* CONFIG_MMU */
>  
> -static struct file *do_open_exec(struct filename *name)
> +static struct file *do_open_execat(int fd, struct filename *name, int flags)
>  {
>  	struct file *file;
>  	int err;
> @@ -757,10 +757,34 @@ static struct file *do_open_exec(struct filename *name)
>  		.intent = LOOKUP_OPEN,
>  		.lookup_flags = LOOKUP_FOLLOW,
>  	};
> +	static const struct open_flags open_exec_nofollow_flags = {
> +		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
> +		.acc_mode = MAY_EXEC | MAY_OPEN,
> +		.intent = LOOKUP_OPEN,
> +		.lookup_flags = 0,
> +	};
>  
> -	file = do_filp_open(AT_FDCWD, name, &open_exec_flags);
> -	if (IS_ERR(file))
> -		goto out;
> +	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
> +		return ERR_PTR(-EINVAL);
> +
> +	if (name->name[0] != '\0') {

Is it really necessary to special-case AT_EMPTY_PATH here?  I would
have thought the existing logic in namei.c would have been fine,
assuming we passed LOOKUP_EMPTY.

> +		const struct open_flags *oflags = ((flags & AT_SYMLINK_NOFOLLOW)
> +						   ? &open_exec_nofollow_flags
> +						   : &open_exec_flags);
> +
> +		file = do_filp_open(fd, name, oflags);
> +		if (IS_ERR(file))
> +			goto out;
> +	} else {
> +		file = fget(fd);
> +		if (!file)
> +			return ERR_PTR(-EBADF);
> +
> +		err = inode_permission(file->f_path.dentry->d_inode,
> +				open_exec_flags.acc_mode);
> +		if (err)
> +			goto exit;
> +	}
>
>  	err = -EACCES;
>  	if (!S_ISREG(file_inode(file)->i_mode))
> @@ -769,12 +793,13 @@ static struct file *do_open_exec(struct filename *name)
>  	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
>  		goto exit;
>  
> -	fsnotify_open(file);
> -
>  	err = deny_write_access(file);
>  	if (err)
>  		goto exit;
>  
> +	if (name->name[0] != '\0')
> +		fsnotify_open(file);
> +
>  out:
>  	return file;
>  
> @@ -786,7 +811,7 @@ exit:
>  struct file *open_exec(const char *name)
>  {
>  	struct filename tmp = { .name = name };
> -	return do_open_exec(&tmp);
> +	return do_open_execat(AT_FDCWD, &tmp, 0);
>  }
>  EXPORT_SYMBOL(open_exec);
>  
> @@ -1422,10 +1447,12 @@ static int exec_binprm(struct linux_binprm *bprm)
>  /*
>   * sys_execve() executes a new program.
>   */
> -static int do_execve_common(struct filename *filename,
> -				struct user_arg_ptr argv,
> -				struct user_arg_ptr envp)
> +static int do_execveat_common(int fd, struct filename *filename,
> +			      struct user_arg_ptr argv,
> +			      struct user_arg_ptr envp,
> +			      int flags)
>  {
> +	char *pathbuf = NULL;
>  	struct linux_binprm *bprm;
>  	struct file *file;
>  	struct files_struct *displaced;
> @@ -1466,7 +1493,7 @@ static int do_execve_common(struct filename *filename,
>  	check_unsafe_exec(bprm);
>  	current->in_execve = 1;
>  
> -	file = do_open_exec(filename);
> +	file = do_open_execat(fd, filename, flags);
>  	retval = PTR_ERR(file);
>  	if (IS_ERR(file))
>  		goto out_unmark;
> @@ -1474,7 +1501,27 @@ static int do_execve_common(struct filename *filename,
>  	sched_exec();
>  
>  	bprm->file = file;
> -	bprm->filename = bprm->interp = filename->name;
> +	if (fd == AT_FDCWD || filename->name[0] == '/') {
> +		bprm->filename = filename->name;
> +	} else {
> +		/*
> +		 * Build a pathname that reflects how we got to the file,
> +		 * either "/dev/fd/<fd>" (for an empty filename) or
> +		 * "/dev/fd/<fd>/<filename>".
> +		 */
> +		pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
> +		if (!pathbuf) {
> +			retval = -ENOMEM;
> +			goto out_unmark;
> +		}
> +		bprm->filename = pathbuf;
> +		if (filename->name[0] == '\0')
> +			sprintf(pathbuf, "/dev/fd/%d", fd);
> +		else
> +			snprintf(pathbuf, PATH_MAX,
> +				 "/dev/fd/%d/%s", fd, filename->name);
> +	}
> +	bprm->interp = bprm->filename;
>  
>  	retval = bprm_mm_init(bprm);
>  	if (retval)
> @@ -1532,6 +1579,7 @@ out_unmark:
>  
>  out_free:
>  	free_bprm(bprm);
> +	kfree(pathbuf);
>  
>  out_files:
>  	if (displaced)
> @@ -1547,7 +1595,18 @@ int do_execve(struct filename *filename,
>  {
>  	struct user_arg_ptr argv = { .ptr.native = __argv };
>  	struct user_arg_ptr envp = { .ptr.native = __envp };
> -	return do_execve_common(filename, argv, envp);
> +	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
> +}
> +
> +int do_execveat(int fd, struct filename *filename,
> +		const char __user *const __user *__argv,
> +		const char __user *const __user *__envp,
> +		int flags)
> +{
> +	struct user_arg_ptr argv = { .ptr.native = __argv };
> +	struct user_arg_ptr envp = { .ptr.native = __envp };
> +
> +	return do_execveat_common(fd, filename, argv, envp, flags);
>  }
>  
>  #ifdef CONFIG_COMPAT
> @@ -1563,7 +1622,23 @@ static int compat_do_execve(struct filename *filename,
>  		.is_compat = true,
>  		.ptr.compat = __envp,
>  	};
> -	return do_execve_common(filename, argv, envp);
> +	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
> +}
> +
> +static int compat_do_execveat(int fd, struct filename *filename,
> +			      const compat_uptr_t __user *__argv,
> +			      const compat_uptr_t __user *__envp,
> +			      int flags)
> +{
> +	struct user_arg_ptr argv = {
> +		.is_compat = true,
> +		.ptr.compat = __argv,
> +	};
> +	struct user_arg_ptr envp = {
> +		.is_compat = true,
> +		.ptr.compat = __envp,
> +	};
> +	return do_execveat_common(fd, filename, argv, envp, flags);
>  }
>  #endif
>  
> @@ -1603,6 +1678,20 @@ SYSCALL_DEFINE3(execve,
>  {
>  	return do_execve(getname(filename), argv, envp);
>  }
> +
> +SYSCALL_DEFINE5(execveat,
> +		int, fd, const char __user *, filename,
> +		const char __user *const __user *, argv,
> +		const char __user *const __user *, envp,
> +		int, flags)
> +{
> +	int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
> +
> +	return do_execveat(fd,
> +			   getname_flags(filename, lookup_flags, NULL),
> +			   argv, envp, flags);
> +}
> +
>  #ifdef CONFIG_COMPAT
>  COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
>  	const compat_uptr_t __user *, argv,
> @@ -1610,4 +1699,17 @@ COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
>  {
>  	return compat_do_execve(getname(filename), argv, envp);
>  }
> +
> +COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
> +		       const char __user *, filename,
> +		       const compat_uptr_t __user *, argv,
> +		       const compat_uptr_t __user *, envp,
> +		       int,  flags)
> +{
> +	int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
> +
> +	return compat_do_execveat(fd,
> +				  getname_flags(filename, lookup_flags, NULL),
> +				  argv, envp, flags);
> +}
>  #endif
> diff --git a/fs/namei.c b/fs/namei.c
> index a7b05bf82d31..553c84d3e0cc 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -130,7 +130,7 @@ void final_putname(struct filename *name)
>  
>  #define EMBEDDED_NAME_MAX	(PATH_MAX - sizeof(struct filename))
>  
> -static struct filename *
> +struct filename *
>  getname_flags(const char __user *filename, int flags, int *empty)
>  {
>  	struct filename *result, *err;
> diff --git a/include/linux/compat.h b/include/linux/compat.h
> index e6494261eaff..7450ca2ac1fc 100644
> --- a/include/linux/compat.h
> +++ b/include/linux/compat.h
> @@ -357,6 +357,9 @@ asmlinkage long compat_sys_lseek(unsigned int, compat_off_t, unsigned int);
>  
>  asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv,
>  		     const compat_uptr_t __user *envp);
> +asmlinkage long compat_sys_execveat(int dfd, const char __user *filename,
> +		     const compat_uptr_t __user *argv,
> +		     const compat_uptr_t __user *envp, int flags);
>  
>  asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
>  		compat_ulong_t __user *outp, compat_ulong_t __user *exp,
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 94187721ad41..e9818574d738 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2060,6 +2060,7 @@ extern struct file *file_open_root(struct dentry *, struct vfsmount *,
>  extern struct file * dentry_open(const struct path *, int, const struct cred *);
>  extern int filp_close(struct file *, fl_owner_t id);
>  
> +extern struct filename *getname_flags(const char __user *, int, int *);
>  extern struct filename *getname(const char __user *);
>  extern struct filename *getname_kernel(const char *);
>  
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index b867a4dab38a..33e056da7d33 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2430,6 +2430,10 @@ extern void do_group_exit(int);
>  extern int do_execve(struct filename *,
>  		     const char __user * const __user *,
>  		     const char __user * const __user *);
> +extern int do_execveat(int, struct filename *,
> +		       const char __user * const __user *,
> +		       const char __user * const __user *,
> +		       int);
>  extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
>  struct task_struct *fork_idle(int);
>  extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 0f86d85a9ce4..df5422294deb 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -876,4 +876,8 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
>  asmlinkage long sys_getrandom(char __user *buf, size_t count,
>  			      unsigned int flags);
>  
> +asmlinkage long sys_execveat(int dfd, const char __user *filename,
> +			const char __user *const __user *argv,
> +			const char __user *const __user *envp, int flags);
> +
>  #endif
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 11d11bc5c78f..feef07d29663 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -705,9 +705,11 @@ __SYSCALL(__NR_seccomp, sys_seccomp)
>  __SYSCALL(__NR_getrandom, sys_getrandom)
>  #define __NR_memfd_create 279
>  __SYSCALL(__NR_memfd_create, sys_memfd_create)
> +#define __NR_execveat 280
> +__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
>  
>  #undef __NR_syscalls
> -#define __NR_syscalls 280
> +#define __NR_syscalls 281
>  
>  /*
>   * All syscalls below here should go away really,
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 391d4ddb6f4b..efb06058ad3e 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -218,3 +218,6 @@ cond_syscall(sys_kcmp);
>  
>  /* operate on Secure Computing state */
>  cond_syscall(sys_seccomp);
> +
> +/* execveat */
> +cond_syscall(sys_execveat);
> diff --git a/lib/audit.c b/lib/audit.c
> index 1d726a22565b..b8fb5ee81e26 100644
> --- a/lib/audit.c
> +++ b/lib/audit.c
> @@ -54,6 +54,9 @@ int audit_classify_syscall(int abi, unsigned syscall)
>  	case __NR_socketcall:
>  		return 4;
>  #endif
> +#ifdef __NR_execveat
> +	case __NR_execveat:
> +#endif
>  	case __NR_execve:
>  		return 5;
>  	default:
--
To unsubscribe from this list: send the line "unsubscribe linux-api" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux