[PATCHv4 RESEND 1/3] syscalls,x86: implement execveat() system call

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This patch set adds execveat(2) for x86, and is derived from Meredydd
Luff's patch from Sept 2012 (https://lkml.org/lkml/2012/9/11/528).

The primary aim of adding an execveat syscall is to allow an
implementation of fexecve(3) that does not rely on the /proc
filesystem.  The current glibc version of fexecve(3) is implemented
via /proc, which causes problems in sandboxed or otherwise restricted
environments.

Given the desire for a /proc-free fexecve() implementation, HPA
suggested (https://lkml.org/lkml/2006/7/11/556) that an execveat(2)
syscall would be an appropriate generalization.

Also, having a new syscall means that it can take a flags argument
without back-compatibility concerns.  The current implementation just
defines the AT_SYMLINK_NOFOLLOW flag, but other flags could be added
in future -- for example, flags for new namespaces (as suggested at
https://lkml.org/lkml/2006/7/11/474).

Related history:
 - https://lkml.org/lkml/2006/12/27/123 is an example of someone
   realizing that fexecve() is likely to fail in a chroot environment.
 - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=514043 covered
   documenting the /proc requirement of fexecve(3) in its manpage, to
   "prevent other people from wasting their time".
 - https://bugzilla.kernel.org/show_bug.cgi?id=74481 documented that
   it's not possible to fexecve() a file descriptor for a script with
   close-on-exec set (which is possible with the implementation here).
 - https://bugzilla.redhat.com/show_bug.cgi?id=241609 described a
   problem where a process that did setuid() could not fexecve()
   because it no longer had access to /proc/self/fd; this has since
   been fixed.

This patch in particular (of 3 = 2 kernel + 1 man-pages):

Adds the new system execveat(2) syscall. execveat() is to execve() as
openat() is to open(): it takes a file descriptor that refers to a
directory, and resolves the filename relative to that.

In addition, if the filename is NULL, execveat() executes the file
to which the file descriptor refers. This replicates the functionality
of fexecve(), which is a system call in other UNIXen, but in Linux
glibc it depends on /proc being mounted.

Only x86-64, i386 and x32 ABIs are supported in this patch.

Based on patches by Meredydd Luff <meredydd@xxxxxxxxxxxxxxx>

Signed-off-by: David Drysdale <drysdale@xxxxxxxxxx>
---
 arch/x86/ia32/audit.c             |   1 +
 arch/x86/ia32/ia32entry.S         |   1 +
 arch/x86/kernel/audit_64.c        |   1 +
 arch/x86/kernel/entry_64.S        |  28 +++++++
 arch/x86/syscalls/syscall_32.tbl  |   1 +
 arch/x86/syscalls/syscall_64.tbl  |   2 +
 arch/x86/um/sys_call_table_64.c   |   1 +
 fs/exec.c                         | 153 ++++++++++++++++++++++++++++++++------
 include/linux/compat.h            |   3 +
 include/linux/sched.h             |   4 +
 include/linux/syscalls.h          |   4 +
 include/uapi/asm-generic/unistd.h |   4 +-
 kernel/sys_ni.c                   |   3 +
 lib/audit.c                       |   3 +
 14 files changed, 186 insertions(+), 23 deletions(-)

diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c
index 5d7b381da692..2eccc8932ae6 100644
--- a/arch/x86/ia32/audit.c
+++ b/arch/x86/ia32/audit.c
@@ -35,6 +35,7 @@ int ia32_classify_syscall(unsigned syscall)
 	case __NR_socketcall:
 		return 4;
 	case __NR_execve:
+	case __NR_execveat:
 		return 5;
 	default:
 		return 1;
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 4299eb05023c..2516c09743e0 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -464,6 +464,7 @@ GLOBAL(\label)
 	PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
 	PTREGSCALL stub32_sigreturn, sys32_sigreturn
 	PTREGSCALL stub32_execve, compat_sys_execve
+	PTREGSCALL stub32_execveat, compat_sys_execveat
 	PTREGSCALL stub32_fork, sys_fork
 	PTREGSCALL stub32_vfork, sys_vfork

diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
index 06d3e5a14d9d..f3672508b249 100644
--- a/arch/x86/kernel/audit_64.c
+++ b/arch/x86/kernel/audit_64.c
@@ -50,6 +50,7 @@ int audit_classify_syscall(int abi, unsigned syscall)
 	case __NR_openat:
 		return 3;
 	case __NR_execve:
+	case __NR_execveat:
 		return 5;
 	default:
 		return 0;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1e96c3628bf2..f9a6c2fdda15 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -872,6 +872,20 @@ ENTRY(stub_execve)
 	CFI_ENDPROC
 END(stub_execve)

+ENTRY(stub_execveat)
+	CFI_STARTPROC
+	addq $8, %rsp
+	PARTIAL_FRAME 0
+	SAVE_REST
+	FIXUP_TOP_OF_STACK %r11
+	call sys_execveat
+	RESTORE_TOP_OF_STACK %r11
+	movq %rax,RAX(%rsp)
+	RESTORE_REST
+	jmp int_ret_from_sys_call
+	CFI_ENDPROC
+END(stub_execveat)
+
 /*
  * sigreturn is special because it needs to restore all registers on return.
  * This cannot be done with SYSRET, so use the IRET return path instead.
@@ -917,6 +931,20 @@ ENTRY(stub_x32_execve)
 	CFI_ENDPROC
 END(stub_x32_execve)

+ENTRY(stub_x32_execveat)
+	CFI_STARTPROC
+	addq $8, %rsp
+	PARTIAL_FRAME 0
+	SAVE_REST
+	FIXUP_TOP_OF_STACK %r11
+	call compat_sys_execveat
+	RESTORE_TOP_OF_STACK %r11
+	movq %rax,RAX(%rsp)
+	RESTORE_REST
+	jmp int_ret_from_sys_call
+	CFI_ENDPROC
+END(stub_x32_execveat)
+
 #endif

 /*
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 96bc506ac6de..2ab0712a0e7c 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -359,3 +359,4 @@
 350	i386	finit_module		sys_finit_module
 351	i386	sched_setattr		sys_sched_setattr
 352	i386	sched_getattr		sys_sched_getattr
+353	i386	execveat		sys_execveat			stub32_execveat
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index a12bddc7ccea..2e4058c14b4f 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -322,6 +322,7 @@
 313	common	finit_module		sys_finit_module
 314	common	sched_setattr		sys_sched_setattr
 315	common	sched_getattr		sys_sched_getattr
+316	64	execveat		stub_execveat

 #
 # x32-specific system call numbers start at 512 to avoid cache impact
@@ -358,3 +359,4 @@
 540	x32	process_vm_writev	compat_sys_process_vm_writev
 541	x32	setsockopt		compat_sys_setsockopt
 542	x32	getsockopt		compat_sys_getsockopt
+543	x32	execveat		stub_x32_execveat
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index f2f0723070ca..20c3649d0691 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -31,6 +31,7 @@
 #define stub_fork sys_fork
 #define stub_vfork sys_vfork
 #define stub_execve sys_execve
+#define stub_execveat sys_execveat
 #define stub_rt_sigreturn sys_rt_sigreturn

 #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
diff --git a/fs/exec.c b/fs/exec.c
index 3d78fccdd723..a8676ce571ce 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -748,7 +748,22 @@ EXPORT_SYMBOL(setup_arg_pages);

 #endif /* CONFIG_MMU */

-static struct file *do_open_exec(struct filename *name)
+/*
+ * Perform the extra checks that open_exec() needs over and above a normal
+ * open.
+ */
+static int check_exec_and_deny_write(struct file *file)
+{
+	if (!S_ISREG(file_inode(file)->i_mode))
+		return -EACCES;
+
+	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
+		return -EACCES;
+
+	return deny_write_access(file);
+}
+
+static struct file *do_open_execat(int fd, struct filename *name, int flags)
 {
 	struct file *file;
 	int err;
@@ -758,24 +773,42 @@ static struct file *do_open_exec(struct filename *name)
 		.intent = LOOKUP_OPEN,
 		.lookup_flags = LOOKUP_FOLLOW,
 	};
+	static const struct open_flags open_exec_nofollow_flags = {
+		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
+		.acc_mode = MAY_EXEC | MAY_OPEN,
+		.intent = LOOKUP_OPEN,
+		.lookup_flags = 0,
+	};

-	file = do_filp_open(AT_FDCWD, name, &open_exec_flags);
-	if (IS_ERR(file))
-		goto out;
-
-	err = -EACCES;
-	if (!S_ISREG(file_inode(file)->i_mode))
-		goto exit;
+	if (flags & ~AT_SYMLINK_NOFOLLOW)
+		return ERR_PTR(-EINVAL);

-	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
-		goto exit;
+	if (name) {
+		const struct open_flags *oflags = ((flags & AT_SYMLINK_NOFOLLOW)
+						   ? &open_exec_nofollow_flags
+						   : &open_exec_flags);

-	fsnotify_open(file);
+		file = do_filp_open(fd, name, oflags);
+		if (IS_ERR(file))
+			goto out;
+	} else {
+		file = fget(fd);
+		if (!file)
+			return ERR_PTR(-EBADF);
+
+		err = inode_permission(file->f_path.dentry->d_inode,
+				open_exec_flags.acc_mode);
+		if (err)
+			goto exit;
+	}

-	err = deny_write_access(file);
+	err = check_exec_and_deny_write(file);
 	if (err)
 		goto exit;

+	if (name)
+		fsnotify_open(file);
+
 out:
 	return file;

@@ -787,7 +820,7 @@ exit:
 struct file *open_exec(const char *name)
 {
 	struct filename tmp = { .name = name };
-	return do_open_exec(&tmp);
+	return do_open_execat(AT_FDCWD, &tmp, 0);
 }
 EXPORT_SYMBOL(open_exec);

@@ -1437,10 +1470,12 @@ static int exec_binprm(struct linux_binprm *bprm)
 /*
  * sys_execve() executes a new program.
  */
-static int do_execve_common(struct filename *filename,
-				struct user_arg_ptr argv,
-				struct user_arg_ptr envp)
+static int do_execveat_common(int fd, struct filename *filename,
+			      struct user_arg_ptr argv,
+			      struct user_arg_ptr envp,
+			      int flags)
 {
+	char *pathbuf = NULL;
 	struct linux_binprm *bprm;
 	struct file *file;
 	struct files_struct *displaced;
@@ -1481,7 +1516,7 @@ static int do_execve_common(struct filename *filename,
 	check_unsafe_exec(bprm);
 	current->in_execve = 1;

-	file = do_open_exec(filename);
+	file = do_open_execat(fd, filename, flags);
 	retval = PTR_ERR(file);
 	if (IS_ERR(file))
 		goto out_unmark;
@@ -1489,7 +1524,21 @@ static int do_execve_common(struct filename *filename,
 	sched_exec();

 	bprm->file = file;
-	bprm->filename = bprm->interp = filename->name;
+	if (filename && fd == AT_FDCWD) {
+		bprm->filename = filename->name;
+	} else {
+		pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
+		if (!pathbuf) {
+			retval = -ENOMEM;
+			goto out_unmark;
+		}
+		bprm->filename = d_path(&file->f_path, pathbuf, PATH_MAX);
+		if (IS_ERR(bprm->filename)) {
+			retval = PTR_ERR(bprm->filename);
+			goto out_unmark;
+		}
+	}
+	bprm->interp = bprm->filename;

 	retval = bprm_mm_init(bprm);
 	if (retval)
@@ -1530,7 +1579,8 @@ static int do_execve_common(struct filename *filename,
 	acct_update_integrals(current);
 	task_numa_free(current);
 	free_bprm(bprm);
-	putname(filename);
+	if (filename)
+		putname(filename);
 	if (displaced)
 		put_files_struct(displaced);
 	return retval;
@@ -1547,12 +1597,14 @@ out_unmark:

 out_free:
 	free_bprm(bprm);
+	kfree(pathbuf);

 out_files:
 	if (displaced)
 		reset_files_struct(displaced);
 out_ret:
-	putname(filename);
+	if (filename)
+		putname(filename);
 	return retval;
 }

@@ -1562,7 +1614,17 @@ int do_execve(struct filename *filename,
 {
 	struct user_arg_ptr argv = { .ptr.native = __argv };
 	struct user_arg_ptr envp = { .ptr.native = __envp };
-	return do_execve_common(filename, argv, envp);
+	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
+}
+
+int do_execveat(int fd, struct filename *filename,
+		const char __user *const __user *__argv,
+		const char __user *const __user *__envp,
+		int flags)
+{
+	struct user_arg_ptr argv = { .ptr.native = __argv };
+	struct user_arg_ptr envp = { .ptr.native = __envp };
+	return do_execveat_common(fd, filename, argv, envp, flags);
 }

 #ifdef CONFIG_COMPAT
@@ -1578,7 +1640,23 @@ static int compat_do_execve(struct filename *filename,
 		.is_compat = true,
 		.ptr.compat = __envp,
 	};
-	return do_execve_common(filename, argv, envp);
+	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
+}
+
+static int compat_do_execveat(int fd, struct filename *filename,
+			      const compat_uptr_t __user *__argv,
+			      const compat_uptr_t __user *__envp,
+			      int flags)
+{
+	struct user_arg_ptr argv = {
+		.is_compat = true,
+		.ptr.compat = __argv,
+	};
+	struct user_arg_ptr envp = {
+		.is_compat = true,
+		.ptr.compat = __envp,
+	};
+	return do_execveat_common(fd, filename, argv, envp, flags);
 }
 #endif

@@ -1618,6 +1696,22 @@ SYSCALL_DEFINE3(execve,
 {
 	return do_execve(getname(filename), argv, envp);
 }
+
+SYSCALL_DEFINE5(execveat,
+		int, fd, const char __user *, filename,
+		const char __user *const __user *, argv,
+		const char __user *const __user *, envp,
+		int, flags)
+{
+	struct filename *path = NULL;
+	if (filename) {
+		path = getname(filename);
+		if (IS_ERR(path))
+			return PTR_ERR(path);
+	}
+	return do_execveat(fd, path, argv, envp, flags);
+}
+
 #ifdef CONFIG_COMPAT
 asmlinkage long compat_sys_execve(const char __user * filename,
 	const compat_uptr_t __user * argv,
@@ -1625,4 +1719,19 @@ asmlinkage long compat_sys_execve(const char __user * filename,
 {
 	return compat_do_execve(getname(filename), argv, envp);
 }
+
+asmlinkage long compat_sys_execveat(int fd,
+	const char __user *filename,
+	const compat_uptr_t __user *argv,
+	const compat_uptr_t __user *envp,
+	int flags)
+{
+	struct filename *path = NULL;
+	if (filename) {
+		path = getname(filename);
+		if (IS_ERR(path))
+			return PTR_ERR(path);
+	}
+	return compat_do_execveat(fd, path, argv, envp, flags);
+}
 #endif
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 3f448c65511b..e875a5d97e08 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -341,6 +341,9 @@ asmlinkage long compat_sys_lseek(unsigned int, compat_off_t, unsigned int);

 asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv,
 		     const compat_uptr_t __user *envp);
+asmlinkage long compat_sys_execveat(int dfd, const char __user *filename,
+		     const compat_uptr_t __user *argv,
+		     const compat_uptr_t __user *envp, int flags);

 asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 		compat_ulong_t __user *outp, compat_ulong_t __user *exp,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a781dec1cd0b..92601818b4fb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2315,6 +2315,10 @@ extern int disallow_signal(int);
 extern int do_execve(struct filename *,
 		     const char __user * const __user *,
 		     const char __user * const __user *);
+extern int do_execveat(int, struct filename *,
+		       const char __user * const __user *,
+		       const char __user * const __user *,
+		       int);
 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a747a77ea584..309a810cf39d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -855,4 +855,8 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
 asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
 			 unsigned long idx1, unsigned long idx2);
 asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
+asmlinkage long sys_execveat(int dfd, const char __user *filename,
+			const char __user *const __user *argv,
+			const char __user *const __user *envp, int flags);
+
 #endif
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index dde8041f40d2..4231bca3f95e 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -696,9 +696,11 @@ __SYSCALL(__NR_finit_module, sys_finit_module)
 __SYSCALL(__NR_sched_setattr, sys_sched_setattr)
 #define __NR_sched_getattr 275
 __SYSCALL(__NR_sched_getattr, sys_sched_getattr)
+#define __NR_execveat 276
+__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)

 #undef __NR_syscalls
-#define __NR_syscalls 276
+#define __NR_syscalls 277

 /*
  * All syscalls below here should go away really,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7078052284fd..5ea7b8ab9e63 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -209,3 +209,6 @@ cond_syscall(compat_sys_open_by_handle_at);

 /* compare kernel pointers */
 cond_syscall(sys_kcmp);
+
+/* execveat */
+cond_syscall(sys_execveat);
diff --git a/lib/audit.c b/lib/audit.c
index 76bbed4a20e5..712456ed5960 100644
--- a/lib/audit.c
+++ b/lib/audit.c
@@ -48,6 +48,9 @@ int audit_classify_syscall(int abi, unsigned syscall)
 	case __NR_socketcall:
 		return 4;
 #endif
+#ifdef __NR_execveat
+	case __NR_execveat:
+#endif
 	case __NR_execve:
 		return 5;
 	default:
--
1.9.1.423.g4596e3a

--
To unsubscribe from this list: send the line "unsubscribe linux-arch" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Kernel]     [Kernel Newbies]     [x86 Platform Driver]     [Netdev]     [Linux Wireless]     [Netfilter]     [Bugtraq]     [Linux Filesystems]     [Yosemite Discussion]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Device Mapper]

  Powered by Linux