From: Sukadev Bhattiprolu <sukadev@xxxxxxxxxxxxxxxxxx> clone_with_pids() is the same as clone(), except that it takes a 'target_pid_set' parameter which lets the caller choose a specific pid number for the child process in each of the child process's pid namespaces. This system call would be needed to implement Checkpoint/Restart (i.e., after a checkpoint, restart a process with its original pids). Call clone_with_pids as follows: pid_t pids[] = { 0, 77, 99 }; struct target_pid_set pid_set; pid_set.num_pids = sizeof(pids) / sizeof(int); pid_set.target_pids = pids; syscall(__NR_clone_with_pids, flags, stack, NULL, NULL, NULL, &pid_set); If a target-pid is 0, the kernel continues to assign a pid for the process in that namespace. In the above example, pids[0] is 0, meaning the kernel will assign the next available pid to the process in init_pid_ns. But the kernel will assign pid 77 in the child pid namespace 1 and pid 99 in pid namespace 2. If either 77 or 99 is taken, the system call fails with -EBUSY. If 'pid_set.num_pids' exceeds the current nesting level of pid namespaces, the system call fails with -EINVAL. It's mostly an exploratory patch seeking feedback on the interface. NOTE: Compared to clone(), clone_with_pids() needs to pass in two more pieces of information: - number of pids in the set - user buffer containing the list of pids. But since clone() already takes 5 parameters, use a 'struct target_pid_set'. TODO: - Gently tested. - May need additional sanity checks in check_target_pids() - Allow CLONE_NEWPID with clone_with_pids() (ensure target-pid in the namespace is either 1 or 0). 
Signed-off-by: Sukadev Bhattiprolu <sukadev@xxxxxxxxxxxxxxxxxx> --- arch/x86/include/asm/syscalls.h | 1 + arch/x86/include/asm/unistd_32.h | 1 + arch/x86/kernel/entry_32.S | 1 + arch/x86/kernel/process_32.c | 91 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/syscall_table_32.S | 1 + include/linux/types.h | 5 ++ 6 files changed, 100 insertions(+), 0 deletions(-) diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 7043408..1fdc149 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -31,6 +31,7 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *); /* kernel/process_32.c */ int sys_fork(struct pt_regs *); int sys_clone(struct pt_regs *); +int sys_clone_with_pids(struct pt_regs *); int sys_vfork(struct pt_regs *); int sys_execve(struct pt_regs *); diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 6e72d74..90f906f 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -340,6 +340,7 @@ #define __NR_inotify_init1 332 #define __NR_preadv 333 #define __NR_pwritev 334 +#define __NR_clone_with_pids 335 #ifdef __KERNEL__ diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c929add..ee92b0d 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -707,6 +707,7 @@ ptregs_##name: \ PTREGSCALL(iopl) PTREGSCALL(fork) PTREGSCALL(clone) +PTREGSCALL(clone_with_pids) PTREGSCALL(vfork) PTREGSCALL(execve) PTREGSCALL(sigaltstack) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 76f8f84..66ac6f7 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -445,6 +445,97 @@ int sys_clone(struct pt_regs *regs) return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr); } +static int check_target_pids(unsigned long clone_flags, + struct target_pid_set *pid_setp) +{ + /* + * CLONE_NEWPID implies pid == 1 + * + * TODO: Maybe this should be 
more fine-grained (i.e would we want + * to have a container-init have a specific pid in ancestor + * namespaces ?) + */ + if (clone_flags & CLONE_NEWPID) + return -EINVAL; + + /* number of pids must match current nesting level of pid ns */ + if (pid_setp->num_pids > task_pid(current)->level + 1) + return -EINVAL; + + /* TODO: More sanity checks ? */ + + return 0; +} + +static pid_t *copy_target_pids(unsigned long clone_flags, void __user *upid_setp) +{ + int rc; + int size; + unsigned long clone_flags; + pid_t __user *utarget_pids; + pid_t *target_pids; + struct target_pid_set pid_set; + + if (copy_from_user(pid_setp, upid_setp, sizeof(*pid_setp))) + return ERR_PTR(-EFAULT); + + size = pid_setp->num_pids * sizeof(pid_t); + utarget_pids = pid_setp->target_pids; + + target_pids = kzalloc(size, GFP_KERNEL); + if (!target_pids) + return ERR_PTR(-ENOMEM); + + rc = -EFAULT; + if (copy_from_user(target_pids, utarget_pids, size)) + goto out_free; + + rc = check_target_pids(clone_flags, &pid_set); + if (rc) + goto out_free; + + printk(KERN_ERR "clone_with_pids() num_pids %d, [ %d, %d ]\n", + pid_set.num_pids, target_pids[0], target_pids[1]); + + return target_pids; + +out_free: + kfree(target_pids); + return ERR_PTR(rc); +} + +int sys_clone_with_pids(struct pt_regs *regs) +{ + unsigned long clone_flags; + unsigned long newsp; + int __user *parent_tidptr; + int __user *child_tidptr; + void __user *upid_setp; + int rc; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + clone_flags = regs->bx; + newsp = regs->cx; + parent_tidptr = (int __user *)regs->dx; + child_tidptr = (int __user *)regs->di; + upid_setp = (void __user *)regs->bp; + + if (!newsp) + newsp = regs->sp; + + target_pids = copy_target_pids(clone_flags, upid_setp) + if (IS_ERR(target_pids)) + return PTR_ERR(target_pids); + + rc = do_fork_with_pids(clone_flags, newsp, regs, 0, parent_tidptr, + child_tidptr, target_pids); +out_free: + kfree(target_pids); + return rc; +} + /* * sys_execve() executes a new program. 
*/ diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index ff5c873..94c1a58 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -334,3 +334,4 @@ ENTRY(sys_call_table) .long sys_inotify_init1 .long sys_preadv .long sys_pwritev + .long ptregs_clone_with_pids /* 335 */ diff --git a/include/linux/types.h b/include/linux/types.h index 5abe354..17ec186 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -204,6 +204,11 @@ struct ustat { char f_fpack[6]; }; +struct target_pid_set { + int num_pids; + pid_t *target_pids; +}; + #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ #endif /* _LINUX_TYPES_H */ -- 1.5.2.5 _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers