Quoting Daniel Hokka Zakrisson (daniel@xxxxxxxxx): > Pavel Emelyanov wrote: > > We have one bit in the clone_flags left, so we won't be > > able to create more namespaces after we make it busy. > > Besides, for checkpoint/restart jobs we might want to > > create tasks with pre-defined pids (virtual of course). > > What else might be required from clone() - nobody knows. > > > > This is an attempt to create a extendable API for clone. > > > > I use the last bit in the clone_flags for CLONE_NEWCLONE. > > When set it will denote that the child_tidptr is not a > > pointer on the tid storage, but the pointer on the struct > > long_clone_struct which currently looks like this: > > > > struct long_clone_arg { > > int size; > > }; > > > > When we want to add a new argument for clone we just put > > it *at the end of this structure* and adjust the size. > > The binary compatibility with older long_clone_arg-s is > > facilitated with the clone_arg_has() macro. > > > > Sure, we lose the ability to clone tasks with extended > > argument and the CLONE_CHILD_SETTID/CLEARTID, but do we > > really need this? > > > > The same thing is about to be done for unshare - we can > > add the second argument for it and iff the CLONE_NEWCLONE > > is specified - try to use it. Binary compatibility with > > the old ushare will be kept. > > > > The new argument is pulled up to the create_new_namespaces > > so that later we can easily use it w/o sending additional > > patches. > > > > This is a final, but a pre-review patch for sys_clone() > > that I plan to send to Andrew before we go on developing > > new namespaces. > > > > Made against 2.6.24-rc5-mm1. > > > > Signed-off-by: Pavel Emelyanov <xemul@xxxxxxxxxx> > > > > --- > > > > diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h > > index 0e66b57..e01de56 100644 > > --- a/include/linux/nsproxy.h > > +++ b/include/linux/nsproxy.h > > @@ -62,7 +62,8 @@ static inline struct nsproxy *task_nsproxy(struct > > task_struct *tsk) > > return rcu_dereference(tsk->nsproxy); > > } > > > > -int copy_namespaces(unsigned long flags, struct task_struct *tsk); > > +int copy_namespaces(unsigned long flags, struct task_struct *tsk, > > + struct long_clone_arg *carg); > > void exit_task_namespaces(struct task_struct *tsk); > > void switch_task_namespaces(struct task_struct *tsk, struct nsproxy > > *new); > > void free_nsproxy(struct nsproxy *ns); > > diff --git a/include/linux/sched.h b/include/linux/sched.h > > index e4d2a82..585a2b4 100644 > > --- a/include/linux/sched.h > > +++ b/include/linux/sched.h > > @@ -27,6 +27,23 @@ > > #define CLONE_NEWUSER 0x10000000 /* New user namespace */ > > #define CLONE_NEWPID 0x20000000 /* New pid namespace */ > > #define CLONE_NEWNET 0x40000000 /* New network namespace */ > > +#define CLONE_NEWCLONE 0x80000000 /* Has an long_clone_arg */ > > + > > +/* > > + * If the CLONE_NEWCLONE argument is specified, then the > > + * child_tidptr is expected to point to the struct long_clone_arg. > > + * > > + * This structure has a variable size, which is to be recorded > > + * in the size member. All other members a to be put after this. > > + * Never ... No. Always add new members only at its tail. > > + * > > + * The clone_arg_has() macro is responsible for checking whether > > + * the given arg has the desired member. > > + */ > > + > > +struct long_clone_arg { > > + int size; > > +}; > > > > /* > > * Scheduling policies > > @@ -40,6 +57,11 @@ > > > > #ifdef __KERNEL__ > > > > +#define clone_arg_has(arg, member) ({ \ > > + struct long_clone_arg *__carg = arg; \ > > + (__carg->size >= offsetof(struct long_clone_arg, member) + \ > > + sizeof(__carg->member)); }) > > + > > struct sched_param { > > int sched_priority; > > }; > > diff --git a/kernel/fork.c b/kernel/fork.c > > index 8b558b7..f5895fc 100644 > > --- a/kernel/fork.c > > +++ b/kernel/fork.c > > @@ -967,6 +967,29 @@ static void rt_mutex_init_task(struct task_struct *p) > > #endif > > } > > > > +static struct long_clone_arg *get_long_clone_arg(int __user > > *child_tidptr) > > +{ > > + int size; > > + struct long_clone_arg *carg; > > + > > + if (get_user(size, child_tidptr)) > > + return ERR_PTR(-EFAULT); > > + > > + if (size > sizeof(struct long_clone_arg)) > > + return ERR_PTR(-EINVAL); > > This means that software built against a newer kernel won't work on an > older one. Surely that's not intended? Well the idea would be that if userspace knows about 64 additional bits but is only asking for bit 3, it would pass { size=sizeof(int)+sizeof(__u32), u32_1 = 000...000100, } rather than { size=sizeof(int)+2*sizeof(__u32_), u32_1 = 000...000100, u32_2 = 000...000000, } And, of course, if if it should set a bit in u32_2, then the kernel which only knows about 32 additional bits doesn't support the requested feature and should in fact return -EINVAL. > > + carg = kzalloc(sizeof(struct long_clone_arg), GFP_KERNEL); > > + if (carg == NULL) > > + return ERR_PTR(-ENOMEM); > > + > > + if (copy_from_user(carg, child_tidptr, size)) { > > + kfree(carg); > > + return ERR_PTR(-EFAULT); > > + } > > + > > + return carg; > > +} > > + > > /* > > * This creates a new process as a copy of the old one, > > * but does not actually start it yet. > > @@ -985,6 +1008,7 @@ static struct task_struct *copy_process(unsigned long > > clone_flags, > > int retval; > > struct task_struct *p; > > int cgroup_callbacks_done = 0; > > + struct long_clone_arg *carg = NULL; > > > > if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) > > return ERR_PTR(-EINVAL); > > @@ -1004,6 +1028,11 @@ static struct task_struct *copy_process(unsigned > > long clone_flags, > > if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) > > return ERR_PTR(-EINVAL); > > > > + if ((clone_flags & CLONE_NEWCLONE) && > > + (clone_flags & (CLONE_CHILD_SETTID | > > + CLONE_CHILD_CLEARTID))) > > + return ERR_PTR(-EINVAL); > > + > > retval = security_task_create(clone_flags); > > if (retval) > > goto fork_out; > > @@ -1132,8 +1161,15 @@ static struct task_struct *copy_process(unsigned > > long clone_flags, > > /* Perform scheduler related setup. Assign this task to a CPU. */ > > sched_fork(p, clone_flags); > > > > + if (clone_flags & CLONE_NEWCLONE) { > > + carg = get_long_clone_arg(child_tidptr); > > + retval = PTR_ERR(carg); > > + if (IS_ERR(carg)) > > + goto bad_fork_cleanup_policy; > > + } > > + > > if ((retval = security_task_alloc(p))) > > - goto bad_fork_cleanup_policy; > > + goto bad_fork_cleanup_carg; > > if ((retval = audit_alloc(p))) > > goto bad_fork_cleanup_security; > > /* copy all the process information */ > > @@ -1151,7 +1187,7 @@ static struct task_struct *copy_process(unsigned > > long clone_flags, > > goto bad_fork_cleanup_signal; > > if ((retval = copy_keys(clone_flags, p))) > > goto bad_fork_cleanup_mm; > > - if ((retval = copy_namespaces(clone_flags, p))) > > + if ((retval = copy_namespaces(clone_flags, p, carg))) > > goto bad_fork_cleanup_keys; > > retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); > > if (retval) > > @@ -1345,6 +1381,8 @@ bad_fork_cleanup_audit: > > audit_free(p); > > bad_fork_cleanup_security: > > security_task_free(p); > > +bad_fork_cleanup_carg: > > + kfree(carg); > > bad_fork_cleanup_policy: > > #ifdef CONFIG_NUMA > > mpol_free(p->mempolicy); > > diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c > > index f5d332c..b955196 100644 > > --- a/kernel/nsproxy.c > > +++ b/kernel/nsproxy.c > > @@ -48,7 +48,8 @@ static inline struct nsproxy *clone_nsproxy(struct > > nsproxy *orig) > > * leave it to the caller to do proper locking and attach it to task. > > */ > > static struct nsproxy *create_new_namespaces(unsigned long flags, > > - struct task_struct *tsk, struct fs_struct *new_fs) > > + struct task_struct *tsk, struct fs_struct *new_fs, > > + struct long_clone_arg *carg) > > { > > struct nsproxy *new_nsp; > > int err; > > @@ -119,7 +120,8 @@ out_ns: > > * called from clone. This now handles copy for nsproxy and all > > * namespaces therein. > > */ > > -int copy_namespaces(unsigned long flags, struct task_struct *tsk) > > +int copy_namespaces(unsigned long flags, struct task_struct *tsk, > > + struct long_clone_arg *carg) > > { > > struct nsproxy *old_ns = tsk->nsproxy; > > struct nsproxy *new_ns; > > @@ -131,7 +133,8 @@ int copy_namespaces(unsigned long flags, struct > > task_struct *tsk) > > get_nsproxy(old_ns); > > > > if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | > > - CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET))) > > + CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET | > > + CLONE_NEWCLONE))) > > return 0; > > > > if (!capable(CAP_SYS_ADMIN)) { > > @@ -139,7 +142,7 @@ int copy_namespaces(unsigned long flags, struct > > task_struct *tsk) > > goto out; > > } > > > > - new_ns = create_new_namespaces(flags, tsk, tsk->fs); > > + new_ns = create_new_namespaces(flags, tsk, tsk->fs, carg); > > if (IS_ERR(new_ns)) { > > err = PTR_ERR(new_ns); > > goto out; > > @@ -191,7 +194,7 @@ int unshare_nsproxy_namespaces(unsigned long > > unshare_flags, > > return -EPERM; > > > > *new_nsp = create_new_namespaces(unshare_flags, current, > > - new_fs ? new_fs : current->fs); > > + new_fs ? new_fs : current->fs, NULL); > > if (IS_ERR(*new_nsp)) { > > err = PTR_ERR(*new_nsp); > > goto out; > > -- > Daniel Hokka Zakrisson _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers