On Fri, Mar 19, 2010 at 04:39:55PM -0500, Serge E. Hallyn wrote: > Support checkpoint and restart of tasks in nested pid namespaces. At > Oren's request here is an alternative to my previous implementation. In > this one, we keep the original single pids_array to minimize memory > allocations. The pids array entries are augmented with a pidns depth > (relative to the container init's pidns, and an "rpid" which is the pid > in the checkpointer's pidns (or 0 if no valid pid exists). The rpid > will be used by userspace to gather more information (like > /proc/$$/mountinfo) after the kernel sys_checkpoint. If any tasks are > in nested pid namespace, another single array holds all of the vpids. > At restart those are used by userspace to determine how to call > eclone(). Kernel ignores them. > > All cr_tests including the new pid_ns testcase pass. > IMHO this approach looks ok too. I just feel that checkpoint_vpids() could be re-worked a bit in order to not impose an artificial limit of CKPT_HDR_PIDS_CHUNK to the depth of pid namespaces, even if it is 256 (see suggested changes below). It would probably be safer too to use task_active_pid_ns() instead of task->nsproxy->pid_ns, just in case some PID namespace unsharing like proposed by Eric makes it to mainline. Thanks, Louis > Signed-off-by: Serge E. Hallyn <serue@xxxxxxxxxx> > --- > checkpoint/checkpoint.c | 113 ++++++++++++++++++++++++++++++++++---- > checkpoint/process.c | 18 +++++- > checkpoint/restart.c | 45 ++++++++++++++- > checkpoint/sys.c | 2 + > include/linux/checkpoint.h | 2 +- > include/linux/checkpoint_hdr.h | 16 +++++ > include/linux/checkpoint_types.h | 3 + > kernel/nsproxy.c | 9 ++- > 8 files changed, 186 insertions(+), 22 deletions(-) > > diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c > index f27af41..fe3546a 100644 > --- a/checkpoint/checkpoint.c > +++ b/checkpoint/checkpoint.c > @@ -27,6 +27,7 @@ > #include <linux/deferqueue.h> > #include <linux/checkpoint.h> > #include <linux/checkpoint_hdr.h> > +#include <linux/pid_namespace.h> > > /* unique checkpoint identifier (FIXME: should be per-container ?) */ > static atomic_t ctx_count = ATOMIC_INIT(0); > @@ -242,6 +243,7 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t) > struct task_struct *root = ctx->root_task; > struct nsproxy *nsproxy; > int ret = 0; > + struct pid_namespace *pidns; > > ckpt_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns)); > > @@ -293,10 +295,15 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t) > _ckpt_err(ctx, -EPERM, "%(T)Nested net_ns unsupported\n"); > ret = -EPERM; > } > - /* no support for >1 private pidns */ > - if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns) { > - _ckpt_err(ctx, -EPERM, "%(T)Nested pid_ns unsupported\n"); > - ret = -EPERM; > + /* pidns must be descendent of root_nsproxy */ > + pidns = nsproxy->pid_ns; > + while (pidns != ctx->root_nsproxy->pid_ns) { > + if (pidns == &init_pid_ns) { > + ret = -EPERM; > + _ckpt_err(ctx, ret, "%(T)stranger pid_ns\n"); > + break; > + } > + pidns = pidns->parent; > } > rcu_read_unlock(); > > @@ -305,15 +312,19 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t) > > #define CKPT_HDR_PIDS_CHUNK 256 > > +/* > + * Write the pids in ctx->root_nsproxy->pidns. This info is > + * needed at restart to unambiguously dereference tasks. > + */ > static int checkpoint_pids(struct ckpt_ctx *ctx) > { > struct ckpt_pids *h; > - struct pid_namespace *ns; > + struct pid_namespace *root_pidns; > struct task_struct *task; > struct task_struct **tasks_arr; > int nr_tasks, n, pos = 0, ret = 0; > > - ns = ctx->root_nsproxy->pid_ns; > + root_pidns = ctx->root_nsproxy->pid_ns; > tasks_arr = ctx->tasks_arr; > nr_tasks = ctx->nr_tasks; > BUG_ON(nr_tasks <= 0); > @@ -331,15 +342,21 @@ static int checkpoint_pids(struct ckpt_ctx *ctx) > do { > rcu_read_lock(); > for (n = 0; n < min(nr_tasks, CKPT_HDR_PIDS_CHUNK); n++) { > + struct pid_namespace *task_pidns; > task = tasks_arr[pos]; > > - h[n].vpid = task_pid_nr_ns(task, ns); > - h[n].vtgid = task_tgid_nr_ns(task, ns); > - h[n].vpgid = task_pgrp_nr_ns(task, ns); > - h[n].vsid = task_session_nr_ns(task, ns); > - h[n].vppid = task_tgid_nr_ns(task->real_parent, ns); > + h[n].vpid = task_pid_nr_ns(task, root_pidns); > + h[n].vtgid = task_tgid_nr_ns(task, root_pidns); > + h[n].vpgid = task_pgrp_nr_ns(task, root_pidns); > + h[n].vsid = task_session_nr_ns(task, root_pidns); > + h[n].vppid = task_tgid_nr_ns(task->real_parent, > + root_pidns); > + task_pidns = task_nsproxy(task)->pid_ns; > + h[n].rpid = task_pid_vnr(task); > + h[n].depth = task_pidns->level - root_pidns->level; > ckpt_debug("task[%d]: vpid %d vtgid %d parent %d\n", > pos, h[n].vpid, h[n].vtgid, h[n].vppid); > + ctx->nr_vpids += h[n].depth; > pos++; > } > rcu_read_unlock(); > @@ -356,6 +373,61 @@ static int checkpoint_pids(struct ckpt_ctx *ctx) > return ret; > } > > +static int checkpoint_vpids(struct ckpt_ctx *ctx) > +{ > + struct ckpt_vpid *h; > + struct pid_namespace *root_pidns, *task_pidns; > + struct task_struct *task; > + int ret, nr_tasks = ctx->nr_tasks; > + int tidx = 0, /* index into task array */ > + hidx = 0; /* pids written into current ckpt_vpids chunk */ > + > + root_pidns = ctx->root_nsproxy->pid_ns; > + nr_tasks = ctx->nr_tasks; > + > + ret = ckpt_write_obj_type(ctx, NULL, > + sizeof(*h) * ctx->nr_vpids, > + CKPT_HDR_BUFFER); > + if (ret < 0) > + return ret; > + > + h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK); > + if (!h) > + return -ENOMEM; > + > + do { > + rcu_read_lock(); > + while (tidx < nr_tasks) { > + int vidx; /* vpid index */ > + int nsdelta; > + > + task = ctx->tasks_arr[tidx]; > + task_pidns = task_nsproxy(task)->pid_ns; > + nsdelta = task_pidns->level - root_pidns->level; > + if (hidx + nsdelta >= CKPT_HDR_PIDS_CHUNK) I think that (hidx + nsdelta > CKPT_HDR_PIDS_CHUNK) checks more accurately the limit. > + break; > + > + for (vidx = 0; vidx < nsdelta; vidx++) { > + h[vidx].pid = task_pid_nr_ns(task, task_pidns); Here: h[hidx + vidx] > + task_pidns = task_pidns->parent; > + } > + > + hidx += nsdelta; > + tidx++; > + } > + rcu_read_unlock(); > + > + ret = ckpt_kwrite(ctx, h, hidx * sizeof(*h)); > + if (ret < 0) > + break; > + > + hidx = 0; > + } while (tidx < nr_tasks); > + > + _ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK); > + return ret; > +} Maybe re-work it this way: static int checkpoint_vpids(struct ckpt_ctx *ctx) { struct ckpt_vpid *h; struct pid_namespace *root_pidns, *task_pidns, *active_pidns; struct task_struct *task; int ret, nr_tasks = ctx->nr_tasks; int tidx = 0, /* index into task array */ hidx = 0; /* pids written into current ckpt_vpids chunk */ vidx = 0; /* vpid index for current task */ root_pidns = ctx->root_nsproxy->pid_ns; nr_tasks = ctx->nr_tasks; ret = ckpt_write_obj_type(ctx, NULL, sizeof(*h) * ctx->nr_vpids, CKPT_HDR_BUFFER); if (ret < 0) return ret; h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK); if (!h) return -ENOMEM; do { rcu_read_lock(); while (tidx < nr_tasks && hidx < CKPT_HDR_PIDS_CHUNK) { int nsdelta; task = ctx->tasks_arr[tidx]; active_pidns = task_active_pid_ns(task); nsdelta = active_pidns->level - root_pidns->level; if (hidx + nsdelta - vidx > CKPT_HDR_PIDS_CHUNK) /* * We will release rcu before recording the * remaining vpids, but neither task nor its * pid can disappear. */ nsdelta = CKPT_HDR_PIDS_CHUNK - hidx + vidx; if (vidx == 0) task_pidns = active_pidns; for (; vidx < nsdelta; vidx++) { h[hidx].pid = task_pid_nr_ns(task, task_pidns); hidx++; task_pidns = task_pidns->parent; } if (task_pidns == root_pidns) { tidx++; vidx = 0; } } rcu_read_unlock(); ret = ckpt_kwrite(ctx, h, hidx * sizeof(*h)); if (ret < 0) break; hidx = 0; } while (tidx < nr_tasks); _ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK); return ret; } [...] -- Dr Louis Rilling Kerlabs Skype: louis.rilling Batiment Germanium Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes http://www.kerlabs.com/ 35700 Rennes
Attachment:
signature.asc
Description: Digital signature
_______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers