Deal with struct pid in general and task pids in particular. Guess what: references to outside pids are banned, which means that if a child is created with a simple CLONE_NEWPID, its PIDTYPE_PGID and PIDTYPE_SID pids will be outside of the newborn pidns. On restore we don't know where to glue them, and they weren't saved at all. So abort checkpointing in this case. Newborn container inits should use setpgrp(2) and setsid(2)! Signed-off-by: Alexey Dobriyan <adobriyan@xxxxxxxxx> --- include/linux/kstate-image.h | 13 +++ include/linux/kstate.h | 5 + include/linux/pid.h | 2 +- kernel/fork.c | 2 +- kernel/kstate/cpt-sys.c | 6 + kernel/kstate/kstate-context.c | 5 + kernel/kstate/kstate-object.c | 3 + kernel/kstate/kstate-task.c | 80 ++++++++++++++++ kernel/pid.c | 199 +++++++++++++++++++++++++++++++++++++++- 9 files changed, 308 insertions(+), 7 deletions(-) diff --git a/include/linux/kstate-image.h b/include/linux/kstate-image.h index a573833..108bb2d 100644 --- a/include/linux/kstate-image.h +++ b/include/linux/kstate-image.h @@ -53,6 +53,7 @@ struct kstate_image_header { #define KSTATE_OBJ_GROUP_INFO 13 #define KSTATE_OBJ_USER_STRUCT 14 #define KSTATE_OBJ_USER_NS 15 +#define KSTATE_OBJ_PID 16 struct kstate_object_header { __u32 obj_type; @@ -80,6 +81,10 @@ struct kstate_image_task_struct { kstate_ref_t ref_real_cred; kstate_ref_t ref_cred; + kstate_ref_t ref_pid; + kstate_ref_t ref_pgid; + kstate_ref_t ref_sid; + __u8 comm[16]; /* Native arch of task, one of KSTATE_ARCH_*. 
*/ @@ -305,4 +310,12 @@ struct kstate_image_user_ns { */ kstate_ref_t ref_creator; } __packed; + +struct kstate_image_pid { + struct kstate_object_header hdr; + + kstate_ref_t ref_pid_ns; /* last-level pid_ns */ + __u32 level; + __u32 nr[1]; +} __packed; #endif diff --git a/include/linux/kstate.h b/include/linux/kstate.h index f0c8e09..99a4345 100644 --- a/include/linux/kstate.h +++ b/include/linux/kstate.h @@ -33,6 +33,7 @@ enum kstate_context_obj_type { KSTATE_CTX_NET_NS, #endif KSTATE_CTX_NSPROXY, + KSTATE_CTX_PID, KSTATE_CTX_PID_NS, KSTATE_CTX_TASK_STRUCT, KSTATE_CTX_USER_NS, @@ -144,6 +145,10 @@ int kstate_collect_all_user_ns(struct kstate_context *ctx); int kstate_dump_all_user_ns(struct kstate_context *ctx); int kstate_restore_user_ns(struct kstate_context *ctx, kstate_ref_t *ref); +int kstate_collect_all_pid(struct kstate_context *ctx); +int kstate_dump_all_pid(struct kstate_context *ctx); +int kstate_restore_pid(struct kstate_context *ctx, kstate_ref_t *ref); + #if defined(CONFIG_X86_32) || defined(CONFIG_X86_64) extern const __u32 kstate_kernel_arch; int kstate_arch_check_image_header(struct kstate_image_header *i); diff --git a/include/linux/pid.h b/include/linux/pid.h index 49f1c2f..f775a85 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -119,7 +119,7 @@ extern struct pid *find_get_pid(int nr); extern struct pid *find_ge_pid(int nr, struct pid_namespace *); int next_pidmap(struct pid_namespace *pid_ns, int last); -extern struct pid *alloc_pid(struct pid_namespace *ns); +extern struct pid *alloc_pid(struct pid_namespace *ns, int *nr, unsigned int level); extern void free_pid(struct pid *pid); /* diff --git a/kernel/fork.c b/kernel/fork.c index ed377ad..97521ab 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1117,7 +1117,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (pid != &init_struct_pid) { retval = -ENOMEM; - pid = alloc_pid(p->nsproxy->pid_ns); + pid = alloc_pid(p->nsproxy->pid_ns, NULL, 0); if (!pid) 
goto bad_fork_cleanup_io; diff --git a/kernel/kstate/cpt-sys.c b/kernel/kstate/cpt-sys.c index 3df776e..119940d 100644 --- a/kernel/kstate/cpt-sys.c +++ b/kernel/kstate/cpt-sys.c @@ -101,6 +101,9 @@ static int kstate_collect(struct kstate_context *ctx) rv = kstate_collect_all_user_ns(ctx); if (rv < 0) return rv; + rv = kstate_collect_all_pid(ctx); + if (rv < 0) + return rv; return 0; } @@ -154,6 +157,9 @@ static int kstate_dump(struct kstate_context *ctx) rv = kstate_dump_all_pid_ns(ctx); if (rv < 0) return rv; + rv = kstate_dump_all_pid(ctx); + if (rv < 0) + return rv; rv = kstate_dump_all_user_ns(ctx); if (rv < 0) return rv; diff --git a/kernel/kstate/kstate-context.c b/kernel/kstate/kstate-context.c index f8168cc..9acb441 100644 --- a/kernel/kstate/kstate-context.c +++ b/kernel/kstate/kstate-context.c @@ -81,6 +81,11 @@ void kstate_context_destroy(struct kstate_context *ctx) list_del(&obj->o_list); kfree(obj); } + for_each_kstate_object_safe(ctx, obj, tmp, KSTATE_CTX_PID) { + put_pid((struct pid *)obj->o_obj); + list_del(&obj->o_list); + kfree(obj); + } for_each_kstate_object_safe(ctx, obj, tmp, KSTATE_CTX_PID_NS) { put_pid_ns((struct pid_namespace *)obj->o_obj); list_del(&obj->o_list); diff --git a/kernel/kstate/kstate-object.c b/kernel/kstate/kstate-object.c index eb77027..ab026f0 100644 --- a/kernel/kstate/kstate-object.c +++ b/kernel/kstate/kstate-object.c @@ -64,6 +64,9 @@ int kstate_collect_object(struct kstate_context *ctx, void *p, enum kstate_conte case KSTATE_CTX_NSPROXY: get_nsproxy((struct nsproxy *)obj->o_obj); break; + case KSTATE_CTX_PID: + get_pid((struct pid *)obj->o_obj); + break; case KSTATE_CTX_PID_NS: get_pid_ns((struct pid_namespace *)obj->o_obj); break; diff --git a/kernel/kstate/kstate-task.c b/kernel/kstate/kstate-task.c index dc2387b..4a3524e 100644 --- a/kernel/kstate/kstate-task.c +++ b/kernel/kstate/kstate-task.c @@ -128,6 +128,13 @@ static int dump_task_struct(struct kstate_context *ctx, struct kstate_object *ob tmp = 
find_kstate_obj_by_ptr(ctx, tsk->cred, KSTATE_CTX_CRED); i->ref_cred = tmp->o_ref; + tmp = find_kstate_obj_by_ptr(ctx, tsk->pids[PIDTYPE_PID].pid, KSTATE_CTX_PID); + i->ref_pid = tmp->o_ref; + tmp = find_kstate_obj_by_ptr(ctx, tsk->pids[PIDTYPE_PGID].pid, KSTATE_CTX_PID); + i->ref_pgid = tmp->o_ref; + tmp = find_kstate_obj_by_ptr(ctx, tsk->pids[PIDTYPE_SID].pid, KSTATE_CTX_PID); + i->ref_sid = tmp->o_ref; + BUILD_BUG_ON(sizeof(i->comm) != sizeof(tsk->comm)); strlcpy((char *)i->comm, (const char *)tsk->comm, sizeof(i->comm)); @@ -280,6 +287,70 @@ static int restore_nsproxy(struct kstate_context *ctx, kstate_ref_t *ref) return 0; } +static int restore_pid(struct kstate_context *ctx, kstate_ref_t *ref) +{ + struct pid *pid; + struct kstate_object *tmp; + int rv; + + tmp = find_kstate_obj_by_ref(ctx, ref, KSTATE_CTX_PID); + if (!tmp) { + rv = kstate_restore_pid(ctx, ref); + if (rv < 0) + return rv; + tmp = find_kstate_obj_by_ref(ctx, ref, KSTATE_CTX_PID); + } + pid = tmp->o_obj; + + write_lock_irq(&tasklist_lock); + change_pid(current, PIDTYPE_PID, get_pid(pid)); + current->pid = current->tgid = pid_nr(pid); + write_unlock_irq(&tasklist_lock); + return 0; +} + +static int restore_pgid(struct kstate_context *ctx, kstate_ref_t *ref) +{ + struct pid *pid; + struct kstate_object *tmp; + int rv; + + tmp = find_kstate_obj_by_ref(ctx, ref, KSTATE_CTX_PID); + if (!tmp) { + rv = kstate_restore_pid(ctx, ref); + if (rv < 0) + return rv; + tmp = find_kstate_obj_by_ref(ctx, ref, KSTATE_CTX_PID); + } + pid = tmp->o_obj; + + write_lock_irq(&tasklist_lock); + change_pid(current, PIDTYPE_PGID, pid); + write_unlock_irq(&tasklist_lock); + return 0; +} + +static int restore_sid(struct kstate_context *ctx, kstate_ref_t *ref) +{ + struct pid *pid; + struct kstate_object *tmp; + int rv; + + tmp = find_kstate_obj_by_ref(ctx, ref, KSTATE_CTX_PID); + if (!tmp) { + rv = kstate_restore_pid(ctx, ref); + if (rv < 0) + return rv; + tmp = find_kstate_obj_by_ref(ctx, ref, KSTATE_CTX_PID); + } + pid = 
tmp->o_obj; + + write_lock_irq(&tasklist_lock); + change_pid(current, PIDTYPE_SID, pid); + write_unlock_irq(&tasklist_lock); + return 0; +} + struct task_struct_restore_context { struct kstate_context *ctx; struct kstate_image_task_struct *i; @@ -334,6 +405,15 @@ static int task_struct_restorer(void *_tsk_ctx) rv = restore_cred(ctx, &i->ref_cred); if (rv < 0) goto out; + rv = restore_pid(ctx, &i->ref_pid); + if (rv < 0) + goto out; + rv = restore_pgid(ctx, &i->ref_pgid); + if (rv < 0) + goto out; + rv = restore_sid(ctx, &i->ref_sid); + if (rv < 0) + goto out; out: tsk_ctx->rv = rv; diff --git a/kernel/pid.c b/kernel/pid.c index b2e5f78..bacf279 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -23,6 +23,7 @@ * (C) 2007 Pavel Emelyanov <xemul@xxxxxxxxxx>, OpenVZ, SWsoft Inc. * (C) 2007 Sukadev Bhattiprolu <sukadev@xxxxxxxxxx>, IBM * Many thanks to Oleg Nesterov for comments and help + * Copyright (C) 2000-2009 Parallels Holdings, Ltd. * */ @@ -182,6 +183,36 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) return -1; } +#ifdef CONFIG_CHECKPOINT +static int set_pidmap(struct pid_namespace *pid_ns, pid_t pid) +{ + int offset; + struct pidmap *map; + + offset = pid & BITS_PER_PAGE_MASK; + map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; + if (!map->page) { + void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); + /* + * Free the page if someone raced with us + * installing it. + */ + spin_lock_irq(&pidmap_lock); + if (map->page) + kfree(page); + else + map->page = page; + spin_unlock_irq(&pidmap_lock); + if (unlikely(!map->page)) + return -ENOMEM; + } + if (test_and_set_bit(offset, map->page)) + return -EBUSY; + atomic_dec(&map->nr_free); + return pid; +} +#endif + int next_pidmap(struct pid_namespace *pid_ns, int last) { int offset; @@ -239,11 +270,12 @@ void free_pid(struct pid *pid) call_rcu(&pid->rcu, delayed_put_pid); } -struct pid *alloc_pid(struct pid_namespace *ns) +/* Last level + 1 pid numbers are predefined. 
*/ +struct pid *alloc_pid(struct pid_namespace *ns, int *nr, unsigned int level) { struct pid *pid; enum pid_type type; - int i, nr; + int i, pid_nr; struct pid_namespace *tmp; struct upid *upid; @@ -253,11 +285,16 @@ struct pid *alloc_pid(struct pid_namespace *ns) tmp = ns; for (i = ns->level; i >= 0; i--) { - nr = alloc_pidmap(tmp); - if (nr < 0) +#ifdef CONFIG_CHECKPOINT + if (nr && ns->level - i <= level) + pid_nr = set_pidmap(tmp, nr[ns->level - i]); + else +#endif + pid_nr = alloc_pidmap(tmp); + if (pid_nr < 0) goto out_free; - pid->numbers[i].nr = nr; + pid->numbers[i].nr = pid_nr; pid->numbers[i].ns = tmp; tmp = tmp->parent; } @@ -537,3 +574,155 @@ void __init pidmap_init(void) init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC); } + +#ifdef CONFIG_CHECKPOINT +#include <linux/kstate.h> +#include <linux/kstate-image.h> + +static int collect_pid(struct kstate_context *ctx, struct pid *pid) +{ + int rv; + + rv = kstate_collect_object(ctx, pid, KSTATE_CTX_PID); + pr_debug("collect pid %p: rv %d\n", pid, rv); + return rv; +} + +static int collect_task_pid(struct kstate_context *ctx, struct pid *pid) +{ + unsigned int level0, level; + + level0 = ctx->init_tsk->nsproxy->pid_ns->level; + if (pid->level < level0) { + WARN_ON(1); + return -EINVAL; + } + for (level = level0; level <= pid->level; level++) { + struct pid_namespace *pid_ns; + struct kstate_object *tmp; + + pid_ns = pid->numbers[level].ns; + tmp = find_kstate_obj_by_ptr(ctx, pid_ns, KSTATE_CTX_PID_NS); + if (!tmp) { + WARN_ON(1); + return -EINVAL; + } + } + return collect_pid(ctx, pid); +} + +int kstate_collect_all_pid(struct kstate_context *ctx) +{ + struct kstate_object *obj; + int rv; + + for_each_kstate_object(ctx, obj, KSTATE_CTX_TASK_STRUCT) { + struct task_struct *tsk = obj->o_obj; + + rv = collect_task_pid(ctx, tsk->pids[PIDTYPE_PID].pid); + if (rv < 0) + return rv; + rv = collect_task_pid(ctx, tsk->pids[PIDTYPE_PGID].pid); + if (rv < 0) + return rv; + rv = 
collect_task_pid(ctx, tsk->pids[PIDTYPE_SID].pid); + if (rv < 0) + return rv; + } + return 0; +} + +static int dump_pid(struct kstate_context *ctx, struct kstate_object *obj) +{ + struct pid *pid = obj->o_obj; + struct kstate_image_pid *i; + struct kstate_object *tmp; + unsigned int level0, level; + unsigned int image_len; + int rv; + + level0 = ctx->init_tsk->nsproxy->pid_ns->level; + image_len = sizeof(*i) + (pid->level - level0 + 1) * sizeof(__u32); + i = kstate_prepare_image(KSTATE_OBJ_PID, image_len); + if (!i) + return -ENOMEM; + + tmp = find_kstate_obj_by_ptr(ctx, pid->numbers[pid->level].ns, KSTATE_CTX_PID_NS); + i->ref_pid_ns = tmp->o_ref; + + i->level = pid->level - level0; + for (level = level0; level <= pid->level; level++) + i->nr[pid->level - level] = pid->numbers[level].nr; /* nr[0] is the last-level number: the order alloc_pid() consumes nr[] in */ + + rv = kstate_write_image(ctx, i, image_len, obj); + kfree(i); + pr_debug("dump pid %p: ref {%llu, %u}, rv %d\n", pid, (unsigned long long)obj->o_ref.pos, obj->o_ref.id, rv); + return rv; +} + +int kstate_dump_all_pid(struct kstate_context *ctx) +{ + struct kstate_object *obj; + int rv; + + for_each_kstate_object(ctx, obj, KSTATE_CTX_PID) { + rv = dump_pid(ctx, obj); + if (rv < 0) + return rv; + } + return 0; +} + +int kstate_restore_pid(struct kstate_context *ctx, kstate_ref_t *ref) +{ + struct kstate_image_pid *i; + struct pid *pid; + struct pid_namespace *pid_ns; + struct kstate_object *tmp; + unsigned int level0; + int rv; + + i = kstate_read_image(ctx, ref, KSTATE_OBJ_PID, sizeof(*i)); + if (IS_ERR(i)) + return PTR_ERR(i); + if (i->level > ((__u32)-1 - sizeof(*i)) / sizeof(__u32) - 1) { + rv = -EINVAL; + goto out_free_image; + } + if (i->hdr.obj_len != sizeof(*i) + (i->level + 1) * sizeof(__u32)) { + rv = -EINVAL; + goto out_free_image; + } + + tmp = find_kstate_obj_by_ref(ctx, &i->ref_pid_ns, KSTATE_CTX_PID_NS); + if (!tmp) { + rv = kstate_restore_pid_ns(ctx, &i->ref_pid_ns); + if (rv < 0) + goto out_free_image; + tmp = find_kstate_obj_by_ref(ctx, &i->ref_pid_ns, 
KSTATE_CTX_PID_NS); + } + pid_ns = tmp->o_obj; + + level0 = ctx->init_tsk->nsproxy->pid_ns->level; + if (i->level >= pid_ns->level - level0) { + rv = -EINVAL; + goto out_free_image; + } + + pid = alloc_pid(pid_ns, i->nr, i->level); + kfree(i); + if (!pid) + return -ENOMEM; + + rv = kstate_restore_object(ctx, pid, KSTATE_CTX_PID, ref); + if (rv < 0) + free_pid(pid); /* pid came from alloc_pid() and was never attached: release it the way copy_process() does */ + pr_debug("restore pid %p: ref {%llu, %u}, rv %d\n", pid, (unsigned long long)ref->pos, ref->id, rv); + return rv; + +out_free_image: + kfree(i); + pr_debug("%s: return %d, ref {%llu, %u}\n", __func__, rv, (unsigned long long)ref->pos, ref->id); + return rv; +} +#endif -- 1.5.6.5 _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers