[PATCH 03/05] This patch uses the value written into the next_syscall_data proc file as a target upid nr for the next process to be created. The following syscalls have a new behavior if next_syscall_data is set: . fork() . vfork() . clone() In the current version, if the process belongs to nested namespaces, only the upper namespace level upid nr is allowed to be predefined, since there is not yet a way to take a snapshot of upid nrs at all namespaces levels. But this can easily be extended in the future. Signed-off-by: Nadia Derbey <Nadia.Derbey@xxxxxxxx> --- include/linux/next_syscall_data.h | 2 kernel/fork.c | 5 - kernel/pid.c | 116 +++++++++++++++++++++++++++++++------- 3 files changed, 102 insertions(+), 21 deletions(-) Index: linux-2.6.26-rc8-mm1/kernel/pid.c =================================================================== --- linux-2.6.26-rc8-mm1.orig/kernel/pid.c 2008-07-08 12:12:39.000000000 +0200 +++ linux-2.6.26-rc8-mm1/kernel/pid.c 2008-07-08 12:24:04.000000000 +0200 @@ -122,6 +122,26 @@ static void free_pidmap(struct upid *upi atomic_inc(&map->nr_free); } +static inline int alloc_pidmap_page(struct pidmap *map) +{ + if (unlikely(!map->page)) { + void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); + /* + * Free the page if someone raced with us + * installing it: + */ + spin_lock_irq(&pidmap_lock); + if (map->page) + kfree(page); + else + map->page = page; + spin_unlock_irq(&pidmap_lock); + if (unlikely(!map->page)) + return -1; + } + return 0; +} + static int alloc_pidmap(struct pid_namespace *pid_ns) { int i, offset, max_scan, pid, last = pid_ns->last_pid; @@ -134,21 +154,8 @@ static int alloc_pidmap(struct pid_names map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; for (i = 0; i <= max_scan; ++i) { - if (unlikely(!map->page)) { - void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); - /* - * Free the page if someone raced with us - * installing it: - */ - spin_lock_irq(&pidmap_lock); - if (map->page) - 
kfree(page); - else - map->page = page; - spin_unlock_irq(&pidmap_lock); - if (unlikely(!map->page)) - break; - } + if (unlikely(alloc_pidmap_page(map))) + break; if (likely(atomic_read(&map->nr_free))) { do { if (!test_and_set_bit(offset, map->page)) { @@ -182,6 +189,33 @@ static int alloc_pidmap(struct pid_names return -1; } +/* + * Return 0 if successful (i.e. next_nr could be assigned as a upid nr). + * -errno else + */ +static int alloc_fixed_pidmap(struct pid_namespace *pid_ns, int next_nr) +{ + int offset; + struct pidmap *map; + + if (next_nr < RESERVED_PIDS || next_nr >= pid_max) + return -EINVAL; + + map = &pid_ns->pidmap[next_nr / BITS_PER_PAGE]; + + if (unlikely(alloc_pidmap_page(map))) + return -ENOMEM; + + offset = next_nr & BITS_PER_PAGE_MASK; + if (test_and_set_bit(offset, map->page)) + return -EBUSY; + + atomic_dec(&map->nr_free); + pid_ns->last_pid = max(pid_ns->last_pid, next_nr); + + return 0; +} + int next_pidmap(struct pid_namespace *pid_ns, int last) { int offset; @@ -239,6 +273,24 @@ void free_pid(struct pid *pid) call_rcu(&pid->rcu, delayed_put_pid); } +/* + * Sets a predefined upid nr for the process' upper namespace level + */ +static int set_predefined_pid(struct pid_namespace *ns, struct pid *pid, + int next_nr) +{ + int i = ns->level; + int rc; + + rc = alloc_fixed_pidmap(ns, next_nr); + if (rc < 0) + return rc; + + pid->numbers[i].nr = next_nr; + pid->numbers[i].ns = ns; + return 0; +} + struct pid *alloc_pid(struct pid_namespace *ns) { struct pid *pid; @@ -248,14 +300,41 @@ struct pid *alloc_pid(struct pid_namespa struct upid *upid; pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); - if (!pid) + if (!pid) { + pid = ERR_PTR(-ENOMEM); goto out; + } tmp = ns; - for (i = ns->level; i >= 0; i--) { + i = ns->level; + if (unlikely(next_data_set(current))) { + /* + * There is a upid nr specified, use it instead of letting + * the kernel choose it for us. 
+ */ + int next_nr = get_next_data(current); + int rc; + + reset_next_syscall_data(current); + rc = set_predefined_pid(tmp, pid, next_nr); + if (rc < 0) { + pid = ERR_PTR(rc); + goto out_free; + } + /* Go up one level */ + tmp = tmp->parent; + i--; + } + + /* + * Let the lower levels upid nrs be automatically allocated + */ + for ( ; i >= 0; i--) { nr = alloc_pidmap(tmp); - if (nr < 0) + if (nr < 0) { + pid = ERR_PTR(-ENOMEM); goto out_free; + } pid->numbers[i].nr = nr; pid->numbers[i].ns = tmp; @@ -284,7 +363,6 @@ out_free: free_pidmap(pid->numbers + i); kmem_cache_free(ns->pid_cachep, pid); - pid = NULL; goto out; } Index: linux-2.6.26-rc8-mm1/kernel/fork.c =================================================================== --- linux-2.6.26-rc8-mm1.orig/kernel/fork.c 2008-07-08 12:12:39.000000000 +0200 +++ linux-2.6.26-rc8-mm1/kernel/fork.c 2008-07-08 12:22:47.000000000 +0200 @@ -1118,10 +1118,11 @@ static struct task_struct *copy_process( goto bad_fork_cleanup_io; if (pid != &init_struct_pid) { - retval = -ENOMEM; pid = alloc_pid(task_active_pid_ns(p)); - if (!pid) + if (IS_ERR(pid)) { + retval = PTR_ERR(pid); goto bad_fork_cleanup_io; + } if (clone_flags & CLONE_NEWPID) { retval = pid_ns_prepare_proc(task_active_pid_ns(p)); Index: linux-2.6.26-rc8-mm1/include/linux/next_syscall_data.h =================================================================== --- linux-2.6.26-rc8-mm1.orig/include/linux/next_syscall_data.h 2008-07-08 12:12:39.000000000 +0200 +++ linux-2.6.26-rc8-mm1/include/linux/next_syscall_data.h 2008-07-08 12:22:47.000000000 +0200 @@ -5,6 +5,7 @@ * following is supported today: * . object creation with a predefined id * . for a sysv ipc object + * . for a process */ #ifndef _LINUX_NEXT_SYSCALL_DATA_H @@ -18,6 +19,7 @@ * For example, it can be used to pre-set the id of the object to be created * by next syscall. The following syscalls support this feature: * . msgget(), semget(), shmget() + * . 
fork(), vfork(), clone() */ struct next_syscall_data { int ndata; -- _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers