Quoting Nadia.Derbey@xxxxxxxx (Nadia.Derbey@xxxxxxxx): > [PATCH 03/05] > > This patch uses the value written into the next_syscall_data proc file > as a target upid nr for the next process to be created. > The following syscalls have a new behavior if next_syscall_data is set: > . fork() > . vfork() > . clone() > > In the current version, if the process belongs to nested namespaces, only > the upper namespace level upid nr is allowed to be predefined, since there > is not yet a way to take a snapshot of upid nrs at all namespaces levels. > > But this can easily be extended in the future. Good point, we will want to discuss the right way to dump that data. Do we add a new file under /proc/<pid>, use /proc/pid/status, or find some other way? > Signed-off-by: Nadia Derbey <Nadia.Derbey@xxxxxxxx> > > --- > include/linux/next_syscall_data.h | 2 > include/linux/pid.h | 2 > kernel/fork.c | 3 - > kernel/pid.c | 111 ++++++++++++++++++++++++++++++++------ > 4 files changed, 98 insertions(+), 20 deletions(-) > > Index: linux-2.6.26-rc5-mm3/kernel/pid.c > =================================================================== > --- linux-2.6.26-rc5-mm3.orig/kernel/pid.c 2008-07-01 10:25:46.000000000 +0200 > +++ linux-2.6.26-rc5-mm3/kernel/pid.c 2008-07-01 11:25:38.000000000 +0200 > @@ -122,6 +122,26 @@ static void free_pidmap(struct upid *upi > atomic_inc(&map->nr_free); > } > > +static inline int alloc_pidmap_page(struct pidmap *map) > +{ > + if (unlikely(!map->page)) { > + void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); > + /* > + * Free the page if someone raced with us > + * installing it: > + */ > + spin_lock_irq(&pidmap_lock); > + if (map->page) > + kfree(page); > + else > + map->page = page; > + spin_unlock_irq(&pidmap_lock); > + if (unlikely(!map->page)) > + return -1; > + } > + return 0; > +} > + > static int alloc_pidmap(struct pid_namespace *pid_ns) > { > int i, offset, max_scan, pid, last = pid_ns->last_pid; > @@ -134,21 +154,8 @@ static int alloc_pidmap(struct pid_names > map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; > max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; > for (i = 0; i <= max_scan; ++i) { > - if (unlikely(!map->page)) { > - void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); > - /* > - * Free the page if someone raced with us > - * installing it: > - */ > - spin_lock_irq(&pidmap_lock); > - if (map->page) > - kfree(page); > - else > - map->page = page; > - spin_unlock_irq(&pidmap_lock); > - if (unlikely(!map->page)) > - break; > - } > + if (unlikely(alloc_pidmap_page(map))) > + break; > if (likely(atomic_read(&map->nr_free))) { > do { > if (!test_and_set_bit(offset, map->page)) { > @@ -182,6 +189,33 @@ static int alloc_pidmap(struct pid_names > return -1; > } > > +/* > + * Return 0 if successful (i.e. next_nr could be assigned as a upid nr). > + * -errno else > + */ > +static int alloc_fixed_pidmap(struct pid_namespace *pid_ns, int next_nr) > +{ > + int offset; > + struct pidmap *map; > + > + if (next_nr < RESERVED_PIDS || next_nr >= pid_max) > + return -EINVAL; > + > + map = &pid_ns->pidmap[next_nr / BITS_PER_PAGE]; > + > + if (unlikely(alloc_pidmap_page(map))) > + return -ENOMEM; > + > + offset = next_nr & BITS_PER_PAGE_MASK; > + if (test_and_set_bit(offset, map->page)) > + return -EBUSY; > + > + atomic_dec(&map->nr_free); > + pid_ns->last_pid = max(pid_ns->last_pid, next_nr); > + > + return 0; > +} > + > int next_pidmap(struct pid_namespace *pid_ns, int last) > { > int offset; > @@ -239,7 +273,25 @@ void free_pid(struct pid *pid) > call_rcu(&pid->rcu, delayed_put_pid); > } > > -struct pid *alloc_pid(struct pid_namespace *ns) > +/* > + * Sets a predefined upid nr for the process' upper namespace level > + */ > +static int set_predefined_pid(struct pid_namespace *ns, struct pid *pid, > + int next_nr) > +{ > + int i = ns->level; > + int rc; > + > + rc = alloc_fixed_pidmap(ns, next_nr); > + if (rc < 0) > + return rc; > + > + pid->numbers[i].nr = next_nr; > + pid->numbers[i].ns = ns; > + return 0; > +} > + > +struct pid *alloc_pid(struct pid_namespace *ns, int *retval) Is there a reason why you can't return retval using return ERR_PTR(retval); instead of using an additional argument? Then at copy_process, after the call, do if (IS_ERR(pid)) retval = PTR_ERR(pid); ? > { > struct pid *pid; > enum pid_type type; > @@ -247,12 +299,37 @@ struct pid *alloc_pid(struct pid_namespa > struct pid_namespace *tmp; > struct upid *upid; > > + *retval = -ENOMEM; > pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); > if (!pid) > goto out; > > tmp = ns; > - for (i = ns->level; i >= 0; i--) { > + i = ns->level; > + if (next_data_set(current)) { > + /* > + * There is a upid nr specified, use it instead of letting > + * the kernel chose it for us. > + */ > + int next_nr = get_next_data(current); > + int rc; > + > + rc = set_predefined_pid(tmp, pid, next_nr); > + if (rc < 0) { > + *retval = rc; > + goto out_free; Again, I'd argue for resetting the syscall data on failure. > + } > + /* Go up one level */ > + tmp = tmp->parent; > + i--; > + reset_next_syscall_data(current); > + } > + > + /* > + * Let the lower levels upid nrs be automatically allocated > + */ > + *retval = -ENOMEM; > + for ( ; i >= 0; i--) { > nr = alloc_pidmap(tmp); > if (nr < 0) > goto out_free; > Index: linux-2.6.26-rc5-mm3/include/linux/pid.h > =================================================================== > --- linux-2.6.26-rc5-mm3.orig/include/linux/pid.h 2008-07-01 10:25:46.000000000 +0200 > +++ linux-2.6.26-rc5-mm3/include/linux/pid.h 2008-07-01 10:49:07.000000000 +0200 > @@ -121,7 +121,7 @@ extern struct pid *find_get_pid(int nr); > extern struct pid *find_ge_pid(int nr, struct pid_namespace *); > int next_pidmap(struct pid_namespace *pid_ns, int last); > > -extern struct pid *alloc_pid(struct pid_namespace *ns); > +extern struct pid *alloc_pid(struct pid_namespace *, int *); > extern void free_pid(struct pid *pid); > > /* > Index: linux-2.6.26-rc5-mm3/kernel/fork.c > =================================================================== > --- linux-2.6.26-rc5-mm3.orig/kernel/fork.c 2008-07-01 10:25:46.000000000 +0200 > +++ linux-2.6.26-rc5-mm3/kernel/fork.c 2008-07-01 10:49:07.000000000 +0200 > @@ -1110,8 +1110,7 @@ static struct task_struct *copy_process( > goto bad_fork_cleanup_io; > > if (pid != &init_struct_pid) { > - retval = -ENOMEM; > - pid = alloc_pid(task_active_pid_ns(p)); > + pid = alloc_pid(task_active_pid_ns(p), &retval); > if (!pid) > goto bad_fork_cleanup_io; > > Index: linux-2.6.26-rc5-mm3/include/linux/next_syscall_data.h > =================================================================== > --- linux-2.6.26-rc5-mm3.orig/include/linux/next_syscall_data.h 2008-07-01 10:41:36.000000000 +0200 > +++ linux-2.6.26-rc5-mm3/include/linux/next_syscall_data.h 2008-07-01 11:09:35.000000000 +0200 > @@ -5,6 +5,7 @@ > * following is supported today: > * . object creation with a predefined id > * . for a sysv ipc object > + * . for a process > * > */ > > @@ -19,6 +20,7 @@ > * For example, it can be used to pre-set the id of the object to be created > * by next syscall. The following syscalls support this feature: > * . msgget(), semget(), shmget() > + * . fork(), vfork(), clone() > */ > struct next_syscall_data { > int ndata; > > -- _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers