clone() has no more usable flags available. It has three now-unused flags (CLONE_PID, CLONE_DETACHED, and CLONE_STOPPED), but current kernels just ignore those flags without returning an error such as EINVAL, so reusing those flags would not allow userspace to detect the availability of the new functionality. Introduce a new system call, clone4, which accepts a second 32-bit flags field. clone4 also returns EINVAL for the flags that clone currently leaves unused, allowing their reuse. To process these new flags, change the flags argument of _do_fork to a u64. sys_clone and do_fork both still use "unsigned long" for flags as they did before, truncating the value to 32 bits and masking out the obsolete flags so that they behave as clone currently does. clone4 accepts its remaining arguments as a structure, and userspace passes in the size of that structure. clone4 has well-defined semantics that allow extending that structure in the future. New userspace passing in a larger structure than the kernel expects will receive EINVAL, and can use a smaller structure to work with old kernels. New kernels accept smaller argument structures passed by userspace, and any un-passed arguments default to 0. clone4 handles arguments in the same order on all architectures, with no backwards variations; to do so, it depends on the new HAVE_COPY_THREAD_TLS. The new system call currently accepts exactly the same flags as clone; future commits will introduce new flags for additional functionality. 
Signed-off-by: Josh Triplett <josh@xxxxxxxxxxxxxxxx> Signed-off-by: Thiago Macieira <thiago.macieira@xxxxxxxxx> --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/kernel/entry_64.S | 1 + arch/x86/syscalls/syscall_32.tbl | 1 + arch/x86/syscalls/syscall_64.tbl | 2 ++ include/linux/compat.h | 12 +++++++++ include/uapi/asm-generic/unistd.h | 4 ++- include/uapi/linux/sched.h | 36 ++++++++++++++++++++++--- init/Kconfig | 10 +++++++ kernel/fork.c | 56 ++++++++++++++++++++++++++++++++++++--- kernel/sys_ni.c | 1 + 10 files changed, 116 insertions(+), 8 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 0286735..ba28306 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -483,6 +483,7 @@ GLOBAL(\label) PTREGSCALL stub32_execveat, compat_sys_execveat PTREGSCALL stub32_fork, sys_fork PTREGSCALL stub32_vfork, sys_vfork + PTREGSCALL stub32_clone4, compat_sys_clone4 ALIGN GLOBAL(stub32_clone) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1d74d16..ead143f 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -520,6 +520,7 @@ END(\label) FORK_LIKE clone FORK_LIKE fork FORK_LIKE vfork + FORK_LIKE clone4 FIXED_FRAME stub_iopl, sys_iopl ENTRY(stub_execve) diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index b3560ec..56fcc90 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -365,3 +365,4 @@ 356 i386 memfd_create sys_memfd_create 357 i386 bpf sys_bpf 358 i386 execveat sys_execveat stub32_execveat +359 i386 clone4 sys_clone4 stub32_clone4 diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 8d656fb..af15b0f 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -329,6 +329,7 @@ 320 common kexec_file_load sys_kexec_file_load 321 common bpf sys_bpf 322 64 execveat stub_execveat +323 64 clone4 stub_clone4 # # x32-specific system call numbers start at 512 
to avoid cache impact @@ -368,3 +369,4 @@ 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit 545 x32 execveat stub_x32_execveat +546 x32 clone4 stub32_clone4 diff --git a/include/linux/compat.h b/include/linux/compat.h index ab25814..6c4a68d 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -293,6 +293,14 @@ struct compat_old_sigaction { }; #endif +struct compat_clone4_args { + compat_uptr_t ptid; + compat_uptr_t ctid; + compat_ulong_t stack_start; + compat_ulong_t stack_size; + compat_ulong_t tls; +}; + struct compat_statfs; struct compat_statfs64; struct compat_old_linux_dirent; @@ -713,6 +721,10 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, asmlinkage long compat_sys_fanotify_mark(int, unsigned int, __u32, __u32, int, const char __user *); + +asmlinkage long compat_sys_clone4(unsigned, unsigned, compat_ulong_t, + struct compat_clone4_args __user *); + #else #define is_compat_task() (0) diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index e016bd9..3740166 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -709,9 +709,11 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create) __SYSCALL(__NR_bpf, sys_bpf) #define __NR_execveat 281 __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat) +#define __NR_clone4 282 +__SC_COMP(__NR_clone4, sys_clone4, compat_sys_clone4) #undef __NR_syscalls -#define __NR_syscalls 282 +#define __NR_syscalls 283 /* * All syscalls below here should go away really, diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index cc89dde..7656152 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -1,6 +1,8 @@ #ifndef _UAPI_LINUX_SCHED_H #define _UAPI_LINUX_SCHED_H +#include <linux/types.h> + /* * cloning flags: */ @@ -18,11 +20,8 @@ #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ 
#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ -#define CLONE_DETACHED 0x00400000 /* Unused, ignored */ #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ -/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state) - and is now available for re-use. */ #define CLONE_NEWUTS 0x04000000 /* New utsname namespace */ #define CLONE_NEWIPC 0x08000000 /* New ipc namespace */ #define CLONE_NEWUSER 0x10000000 /* New user namespace */ @@ -31,6 +30,37 @@ #define CLONE_IO 0x80000000 /* Clone io context */ /* + * Old flags, unused by current clone. clone does not return EINVAL for these + * flags, so they can't easily be reused. clone4 can use them. + */ +#define CLONE_PID 0x00001000 +#define CLONE_DETACHED 0x00400000 +#define CLONE_STOPPED 0x02000000 + +#ifdef __KERNEL__ +/* + * Valid flags for clone and for clone4. Kept in this file next to the flag + * list above, but not exposed to userspace. + */ +#define CLONE_VALID_FLAGS (0xffffffffULL & ~(CLONE_PID | CLONE_DETACHED | CLONE_STOPPED)) +#define CLONE4_VALID_FLAGS CLONE_VALID_FLAGS +#endif /* __KERNEL__ */ + +/* + * Structure passed to clone4 for additional arguments. Initialized to 0, + * then overwritten with arguments from userspace, so arguments not supplied by + * userspace will remain 0. New versions of the kernel may safely append new + * arguments to the end. + */ +struct clone4_args { + __kernel_pid_t __user *ptid; + __kernel_pid_t __user *ctid; + __kernel_ulong_t stack_start; + __kernel_ulong_t stack_size; + __kernel_ulong_t tls; +}; + +/* * Scheduling policies */ #define SCHED_NORMAL 0 diff --git a/init/Kconfig b/init/Kconfig index f5dbc6d..3ab6649 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1511,6 +1511,16 @@ config EVENTFD If unsure, say Y. 
+config CLONE4 + bool "Enable clone4() system call" if EXPERT + depends on HAVE_COPY_THREAD_TLS + default y + help + Enable the clone4() system call, which supports passing additional + flags. + + If unsure, say Y. + # syscall, maps, verifier config BPF_SYSCALL bool "Enable bpf() system call" if EXPERT diff --git a/kernel/fork.c b/kernel/fork.c index b3dadf4..8a21f9e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1187,7 +1187,7 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. */ -static struct task_struct *copy_process(unsigned long clone_flags, +static struct task_struct *copy_process(u64 clone_flags, unsigned long stack_start, unsigned long stack_size, int __user *child_tidptr, @@ -1198,6 +1198,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, int retval; struct task_struct *p; + if (clone_flags & ~CLONE4_VALID_FLAGS) + return ERR_PTR(-EINVAL); + if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -1630,7 +1633,7 @@ struct task_struct *fork_idle(int cpu) * it and waits for it to finish using the VM if required. */ static long _do_fork( - unsigned long clone_flags, + u64 clone_flags, unsigned long stack_start, unsigned long stack_size, int __user *parent_tidptr, @@ -1701,6 +1704,15 @@ static long _do_fork( return nr; } +/* + * Convenience function for callers passing unsigned long flags, to prevent old + * syscall entry points from unexpectedly returning EINVAL. + */ +static inline u64 squelch_clone_flags(unsigned long clone_flags) +{ + return clone_flags & CLONE_VALID_FLAGS; +} + #ifndef CONFIG_HAVE_COPY_THREAD_TLS /* For compatibility with architectures that call do_fork directly rather than * using the syscall entry points below. 
*/ @@ -1710,7 +1722,8 @@ long do_fork(unsigned long clone_flags, int __user *parent_tidptr, int __user *child_tidptr) { - return _do_fork(clone_flags, stack_start, stack_size, + return _do_fork(squelch_clone_flags(clone_flags), + stack_start, stack_size, parent_tidptr, child_tidptr, 0); } #endif @@ -1768,10 +1781,45 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, unsigned long, tls) #endif { - return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls); + return _do_fork(squelch_clone_flags(clone_flags), newsp, 0, + parent_tidptr, child_tidptr, tls); } #endif +#ifdef CONFIG_CLONE4 +SYSCALL_DEFINE4(clone4, unsigned, flags_high, unsigned, flags_low, + unsigned long, args_size, struct clone4_args __user *, args) +{ + u64 flags = (u64)flags_high << 32 | flags_low; + struct clone4_args kargs = {}; + if (args_size > sizeof(kargs)) + return -EINVAL; + if (args_size && copy_from_user(&kargs, args, args_size)) + return -EFAULT; + return _do_fork(flags, kargs.stack_start, kargs.stack_size, + kargs.ptid, kargs.ctid, kargs.tls); +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(clone4, unsigned, flags_high, unsigned, flags_low, + compat_ulong_t, args_size, + struct compat_clone4_args __user *, args) +{ + u64 flags = (u64)flags_high << 32 | flags_low; + struct compat_clone4_args compat_kargs = {}; + if (args_size > sizeof(compat_kargs)) + return -EINVAL; + if (args_size && copy_from_user(&compat_kargs, args, args_size)) + return -EFAULT; + return _do_fork(flags, compat_kargs.stack_start, + compat_kargs.stack_size, + compat_ptr(compat_kargs.ptid), + compat_ptr(compat_kargs.ctid), + compat_kargs.tls); +} +#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_CLONE4 */ + #ifndef ARCH_MIN_MMSTRUCT_ALIGN #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 5adcb0a..5b5d2b9 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -159,6 +159,7 @@ cond_syscall(sys_uselib); cond_syscall(sys_fadvise64); 
cond_syscall(sys_fadvise64_64); cond_syscall(sys_madvise); +cond_syscall(sys_clone4); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); -- 2.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html