[ Patch against https://www.linux-cr.org/redmine/tab/show/kernel-cr ] In place of one big pids array, checkpoint one struct ckpt_hdr_pids per task. It contains pid/ppid/etc in the root nsproxy's pidns, and is followed by a list of all virtual pids in child pid namespaces, if any. When an nsproxy is created during do_restore_ns(), we don't yet set its pid_ns, waiting instead until a task attaches that new nsproxy to itself. I *think* the nsproxy will generally get recreated by the task which will use it, but we may as well be sure by having the pid_ns set when the nsproxy is first assigned. This patch applies on top of ckpt-v20. With this patch applied (and the corresponding user-cr patch), all cr_tests pass, including a new pidns test (which is in branch pidns.1 until this patch goes into ckpt-v20-dev). Please apply. Changelog: Mar 18: bump checkpoing image format version Signed-off-by: Serge E. Hallyn <serue@xxxxxxxxxx> --- checkpoint/checkpoint.c | 91 +++++++++++++++++++++++-------------- checkpoint/process.c | 27 +++++++----- checkpoint/restart.c | 59 ++++++++++++++++-------- checkpoint/sys.c | 8 +++- include/linux/checkpoint.h | 2 +- include/linux/checkpoint_hdr.h | 19 +++++--- include/linux/checkpoint_types.h | 3 + kernel/nsproxy.c | 9 +++- 8 files changed, 142 insertions(+), 76 deletions(-) diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c index f27af41..55e14c3 100644 --- a/checkpoint/checkpoint.c +++ b/checkpoint/checkpoint.c @@ -27,6 +27,7 @@ #include <linux/deferqueue.h> #include <linux/checkpoint.h> #include <linux/checkpoint_hdr.h> +#include <linux/pid_namespace.h> /* unique checkpoint identifier (FIXME: should be per-container ?) */ static atomic_t ctx_count = ATOMIC_INIT(0); @@ -241,6 +242,7 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t) { struct task_struct *root = ctx->root_task; struct nsproxy *nsproxy; + struct pid_namespace *pidns; int ret = 0; ckpt_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns)); @@ -293,66 +295,85 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t) _ckpt_err(ctx, -EPERM, "%(T)Nested net_ns unsupported\n"); ret = -EPERM; } - /* no support for >1 private pidns */ - if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns) { - _ckpt_err(ctx, -EPERM, "%(T)Nested pid_ns unsupported\n"); - ret = -EPERM; + /* pidns must be descendent of root_nsproxy */ + pidns = nsproxy->pid_ns; + while (pidns != ctx->root_nsproxy->pid_ns) { + if (pidns == &init_pid_ns) { + ret = -EPERM; + _ckpt_err(ctx, ret, "%(T)stranger pid_ns\n"); + break; + } + pidns = pidns->parent; } rcu_read_unlock(); return ret; } -#define CKPT_HDR_PIDS_CHUNK 256 +/* called under rcu_read_lock */ +static void copy_task(struct ckpt_hdr_pids *h, struct task_struct *t, + struct pid_namespace *root_pid_ns, + struct pid_namespace *task_pid_ns) +{ + int i = 0; + __s32 *pids; + + h->pid = task_pid_nr_ns(t, root_pid_ns); + h->tgid = task_tgid_nr_ns(t, root_pid_ns); + h->pgid = task_pgrp_nr_ns(t, root_pid_ns); + h->sid = task_session_nr_ns(t, root_pid_ns); + h->ppid = task_tgid_nr_ns(t->real_parent, root_pid_ns); + h->rpid = task_pid_vnr(t); + pids = h->vpids; + + while (task_pid_ns != root_pid_ns) { + pids[i++] = task_pid_nr_ns(t, task_pid_ns); + task_pid_ns = task_pid_ns->parent; + } +} static int checkpoint_pids(struct ckpt_ctx *ctx) { - struct ckpt_pids *h; - struct pid_namespace *ns; + struct ckpt_hdr_pids *h; + struct pid_namespace *root_pidns; struct task_struct *task; struct task_struct **tasks_arr; - int nr_tasks, n, pos = 0, ret = 0; + int nr_tasks, i, ret = 0; - ns = ctx->root_nsproxy->pid_ns; + root_pidns = ctx->root_nsproxy->pid_ns; tasks_arr = ctx->tasks_arr; nr_tasks = ctx->nr_tasks; BUG_ON(nr_tasks <= 0); - ret = ckpt_write_obj_type(ctx, NULL, - sizeof(*h) * nr_tasks, - CKPT_HDR_BUFFER); - if (ret < 0) - return ret; + for (i = 0; i < nr_tasks; i++) { + int nsdelta, size; + struct pid_namespace *task_pidns; - h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK); - if (!h) - return -ENOMEM; + task = tasks_arr[i]; + rcu_read_lock(); + task_pidns = task_nsproxy(task)->pid_ns; + rcu_read_unlock(); + + nsdelta = task_pidns->level - root_pidns->level; + size = sizeof(*h) + nsdelta * sizeof(__s32); + + h = ckpt_hdr_get_type(ctx, size, CKPT_HDR_PID); + if (!h) + return -ENOMEM; - do { rcu_read_lock(); - for (n = 0; n < min(nr_tasks, CKPT_HDR_PIDS_CHUNK); n++) { - task = tasks_arr[pos]; - - h[n].vpid = task_pid_nr_ns(task, ns); - h[n].vtgid = task_tgid_nr_ns(task, ns); - h[n].vpgid = task_pgrp_nr_ns(task, ns); - h[n].vsid = task_session_nr_ns(task, ns); - h[n].vppid = task_tgid_nr_ns(task->real_parent, ns); - ckpt_debug("task[%d]: vpid %d vtgid %d parent %d\n", - pos, h[n].vpid, h[n].vtgid, h[n].vppid); - pos++; - } + copy_task(h, task, root_pidns, task_pidns); rcu_read_unlock(); + ckpt_debug("task[%d]: pid %d tgid %d parent %d\n", + i, h->pid, h->tgid, h->ppid); - n = min(nr_tasks, CKPT_HDR_PIDS_CHUNK); - ret = ckpt_kwrite(ctx, h, n * sizeof(*h)); + ret = ckpt_write_obj(ctx, &h->h); + ckpt_hdr_put(ctx, h); if (ret < 0) break; - nr_tasks -= n; - } while (nr_tasks > 0); + } - _ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK); return ret; } diff --git a/checkpoint/process.c b/checkpoint/process.c index f917112..bb44960 100644 --- a/checkpoint/process.c +++ b/checkpoint/process.c @@ -22,7 +22,7 @@ #include <linux/checkpoint.h> #include <linux/checkpoint_hdr.h> #include <linux/syscalls.h> - +#include <linux/pid_namespace.h> pid_t ckpt_pid_nr(struct ckpt_ctx *ctx, struct pid *pid) { @@ -36,12 +36,6 @@ struct pid *_ckpt_find_pgrp(struct ckpt_ctx *ctx, pid_t pgid) struct pid *pgrp; if (pgid == 0) { - /* - * At checkpoint the pgid owner lived in an ancestor - * pid-ns. The best we can do (sanely and safely) is - * to examine the parent of this restart's root: if in - * a distinct pid-ns, use its pgrp; otherwise fail. - */ p = ctx->root_task->real_parent; if (p->nsproxy->pid_ns == current->nsproxy->pid_ns) return NULL; @@ -51,7 +45,7 @@ struct pid *_ckpt_find_pgrp(struct ckpt_ctx *ctx, pid_t pgid) * Find the owner process of this pgid (it must exist * if pgrp exists). It must be a thread group leader. */ - pgrp = find_vpid(pgid); + pgrp = find_pid_ns(pgid, ctx->root_nsproxy->pid_ns); p = pid_task(pgrp, PIDTYPE_PID); if (!p || !thread_group_leader(p)) return NULL; @@ -578,6 +572,14 @@ static int restore_task_ns(struct ckpt_ctx *ctx) } if (nsproxy != task_nsproxy(current)) { + /* + * This is *kinda* shady to do without any locking. However + * it is safe because each task is restarted separately in + * serial. If that ever changes, we'll need a spinlock? + */ + if (!nsproxy->pid_ns) + nsproxy->pid_ns = get_pid_ns(current->nsproxy->pid_ns); + get_nsproxy(nsproxy); switch_task_namespaces(current, nsproxy); } @@ -827,10 +829,10 @@ static int restore_task_pgid(struct ckpt_ctx *ctx) if (!thread_group_leader(task)) /* (1) */ return 0; - pgid = ctx->pids_arr[ctx->active_pid].vpgid; + pgid = ctx->vpgids_arr[ctx->active_pid]; - if (pgid == task_pgrp_vnr(task)) /* nothing to do */ - return 0; + if (pgid == task_pgrp_nr_ns(task, ctx->root_nsproxy->pid_ns)) + return 0; /* nothing to do */ if (task->signal->leader) /* (2) */ return -EINVAL; @@ -850,6 +852,9 @@ static int restore_task_pgid(struct ckpt_ctx *ctx) if (ctx->uflags & RESTART_TASKSELF) ret = 0; + if (ret < 0) + ckpt_err(ctx, ret, "setting pgid\n"); + return ret; } diff --git a/checkpoint/restart.c b/checkpoint/restart.c index 6a9644d..84713c7 100644 --- a/checkpoint/restart.c +++ b/checkpoint/restart.c @@ -145,7 +145,7 @@ void restore_debug_free(struct ckpt_ctx *ctx) ckpt_debug("kflags %lu uflags %lu oflags %lu", ctx->kflags, ctx->uflags, ctx->oflags); for (i = 0; i < ctx->nr_pids; i++) - ckpt_debug("task[%d] to run %d\n", i, ctx->pids_arr[i].vpid); + ckpt_debug("task[%d] to run %d\n", i, ctx->vpids_arr[i]); list_for_each_entry_safe(s, p, &ctx->task_status, list) { if (s->flags & RESTART_DBG_COORD) @@ -420,7 +420,8 @@ void *ckpt_read_obj_type(struct ckpt_ctx *ctx, int len, int type) h = ckpt_read_obj(ctx, len, len); if (IS_ERR(h)) { - ckpt_err(ctx, PTR_ERR(h), "Expecting to read type %d\n", type); + ckpt_err(ctx, PTR_ERR(h), "Expecting to read type %d len %d\n", + type, len); return h; } @@ -730,34 +731,51 @@ static int restore_read_tail(struct ckpt_ctx *ctx) return ret; } +#define CKPT_MAX_PIDS_SZ 99999 /* restore_read_tree - read the tasks tree into the checkpoint context */ static int restore_read_tree(struct ckpt_ctx *ctx) { struct ckpt_hdr_tree *h; - int size, ret; + int i, size; h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TREE); if (IS_ERR(h)) return PTR_ERR(h); - ret = -EINVAL; + ctx->nr_pids = h->nr_tasks; + ckpt_hdr_put(ctx, h); + if (h->nr_tasks <= 0) - goto out; + return -EINVAL; - ctx->nr_pids = h->nr_tasks; - size = sizeof(*ctx->pids_arr) * ctx->nr_pids; + size = sizeof(pid_t) * ctx->nr_pids; if (size <= 0) /* overflow ? */ - goto out; + return -EINVAL; - ctx->pids_arr = kmalloc(size, GFP_KERNEL); - if (!ctx->pids_arr) { - ret = -ENOMEM; - goto out; + ctx->vpids_arr = kmalloc(size, GFP_KERNEL); + ctx->vpgids_arr = kmalloc(size, GFP_KERNEL); + if (!ctx->vpids_arr || !ctx->vpgids_arr) + return -ENOMEM; + + for (i = 0; i < ctx->nr_pids; i++) { + struct ckpt_hdr_pids *p; + + p = ckpt_read_obj(ctx, 0, CKPT_MAX_PIDS_SZ); + if (!p) + return -EINVAL; + if (p->h.type != CKPT_HDR_PID) { + ckpt_hdr_put(ctx, p); + return -EINVAL; + } + if (p->h.len < sizeof(*p)) { + ckpt_hdr_put(ctx, p); + return -EINVAL; + } + ctx->vpids_arr[i] = p->pid; + ctx->vpgids_arr[i] = p->pgid; + ckpt_hdr_put(ctx, p); } - ret = _ckpt_read_buffer(ctx, ctx->pids_arr, size); - out: - ckpt_hdr_put(ctx, h); - return ret; + return 0; } static inline int all_tasks_activated(struct ckpt_ctx *ctx) @@ -768,7 +786,7 @@ static inline int all_tasks_activated(struct ckpt_ctx *ctx) static inline pid_t get_active_pid(struct ckpt_ctx *ctx) { int active = ctx->active_pid; - return active >= 0 ? ctx->pids_arr[active].vpid : 0; + return active >= 0 ? ctx->vpids_arr[active] : 0; } static inline int is_task_active(struct ckpt_ctx *ctx, pid_t pid) @@ -870,7 +888,7 @@ static int restore_activate_next(struct ckpt_ctx *ctx) static int wait_task_active(struct ckpt_ctx *ctx) { - pid_t pid = task_pid_vnr(current); + pid_t pid = task_pid_nr_ns(current, ctx->root_nsproxy->pid_ns); int ret; ckpt_debug("pid %d waiting\n", pid); @@ -886,7 +904,8 @@ static int wait_task_active(struct ckpt_ctx *ctx) static int wait_task_sync(struct ckpt_ctx *ctx) { - ckpt_debug("pid %d syncing\n", task_pid_vnr(current)); + ckpt_debug("pid %d syncing\n", + task_pid_nr_ns(current, ctx->root_nsproxy->pid_ns)); wait_event_interruptible(ctx->waitq, ckpt_test_complete(ctx)); ckpt_debug("task sync done (errno %d)\n", ctx->errno); if (ckpt_test_error(ctx)) @@ -1160,7 +1179,7 @@ static struct task_struct *choose_root_task(struct ckpt_ctx *ctx, pid_t pid) read_lock(&tasklist_lock); list_for_each_entry(task, ¤t->children, sibling) { - if (task_pid_vnr(task) == pid) { + if (task_pid_nr_ns(task, ctx->coord_pidns) == pid) { get_task_struct(task); ctx->root_task = task; ctx->root_pid = pid; diff --git a/checkpoint/sys.c b/checkpoint/sys.c index 9e9df9b..5df72b0 100644 --- a/checkpoint/sys.c +++ b/checkpoint/sys.c @@ -22,6 +22,7 @@ #include <linux/capability.h> #include <linux/checkpoint.h> #include <linux/deferqueue.h> +#include <linux/pid_namespace.h> /* * ckpt_unpriv_allowed - sysctl controlled. @@ -247,6 +248,8 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx) if (ctx->tasks_arr) task_arr_free(ctx); + if (ctx->coord_pidns) + put_pid_ns(ctx->coord_pidns); if (ctx->root_nsproxy) put_nsproxy(ctx->root_nsproxy); if (ctx->root_task) @@ -256,7 +259,8 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx) free_page((unsigned long) ctx->scratch_page); - kfree(ctx->pids_arr); + kfree(ctx->vpids_arr); + kfree(ctx->vpgids_arr); sock_listening_list_free(&ctx->listen_sockets); @@ -277,6 +281,8 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags, ctx->kflags = kflags; ctx->ktime_begin = ktime_get(); + ctx->coord_pidns = get_pid_ns(current->nsproxy->pid_ns); + atomic_set(&ctx->refcount, 0); INIT_LIST_HEAD(&ctx->pgarr_list); INIT_LIST_HEAD(&ctx->pgarr_pool); diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h index 792b523..e860bf5 100644 --- a/include/linux/checkpoint.h +++ b/include/linux/checkpoint.h @@ -10,7 +10,7 @@ * distribution for more details. */ -#define CHECKPOINT_VERSION 5 +#define CHECKPOINT_VERSION 6 /* checkpoint user flags */ #define CHECKPOINT_SUBTREE 0x1 diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h index 41412d1..7957b3b 100644 --- a/include/linux/checkpoint_hdr.h +++ b/include/linux/checkpoint_hdr.h @@ -117,6 +117,8 @@ enum { #define CKPT_HDR_GROUPINFO CKPT_HDR_GROUPINFO CKPT_HDR_TASK_CREDS, #define CKPT_HDR_TASK_CREDS CKPT_HDR_TASK_CREDS + CKPT_HDR_PID, +#define CKPT_HDR_PID CKPT_HDR_PID /* 201-299: reserved for arch-dependent */ @@ -326,12 +328,17 @@ struct ckpt_hdr_tree { __s32 nr_tasks; } __attribute__((aligned(8))); -struct ckpt_pids { - __s32 vpid; - __s32 vppid; - __s32 vtgid; - __s32 vpgid; - __s32 vsid; +struct ckpt_hdr_pids { + struct ckpt_hdr h; + __s32 rpid; /* pid in checkpointer's pid_ns */ + /* The rest of these are in container init's pid_ns */ + __s32 pid; + __s32 ppid; + __s32 tgid; + __s32 pgid; + __s32 sid; + /* followed by pids in pid_ns up to root->nsproxy->pid_ns */ + __s32 vpids[0]; } __attribute__((aligned(8))); /* pids */ diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h index ecd3e91..0ae78a7 100644 --- a/include/linux/checkpoint_types.h +++ b/include/linux/checkpoint_types.h @@ -37,6 +37,7 @@ struct ckpt_ctx { int root_init; /* [container] root init ? */ pid_t root_pid; /* [container] root pid */ struct task_struct *root_task; /* [container] root task */ + struct pid_namespace *coord_pidns; /* coordinator pid_ns */ struct nsproxy *root_nsproxy; /* [container] root nsproxy */ struct task_struct *root_freezer; /* [container] root task */ char lsm_name[SECURITY_NAME_MAX + 1]; /* security module at ckpt */ @@ -74,6 +75,8 @@ struct ckpt_ctx { /* [multi-process restart] */ struct ckpt_pids *pids_arr; /* array of all pids [restart] */ + pid_t *vpids_arr; /* pids array in container pidns */ + pid_t *vpgids_arr; /* vpgids array in container pidns */ int nr_pids; /* size of pids array */ atomic_t nr_total; /* total tasks count (with ghosts) */ int active_pid; /* (next) position in pids array */ diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 0da0d83..6d86240 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -364,8 +364,13 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx) get_net(net_ns); nsproxy->net_ns = net_ns; - get_pid_ns(current->nsproxy->pid_ns); - nsproxy->pid_ns = current->nsproxy->pid_ns; + /* + * The pid_ns will get assigned the first time that we + * assign the nsproxy to a task. The task had unshared + * its pid_ns in userspace before calling restart, and + * we want to keep using that pid_ns. + */ + nsproxy->pid_ns = NULL; } out: if (ret < 0) -- 1.6.1 _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers