[ this patch is against the userspace checkpoint/restart tools at
  http://www.linux-cr.org/git/?p=user-cr.git;a=summary ]

Support restart of nested pid namespaces.  Parse the ckpt_vpid array
to decide the vpids to specify for each task's eclone().

The feeder now writes, right after the process tree, a CKPT_HDR_VPIDS
record (struct ckpt_hdr_vpids) followed, when nr_vpids is non-zero, by
a buffer of nr_vpids struct ckpt_vpid entries; restart reads them back
with ckpt_read_vpids().  Each struct ckpt_pids entry gains a 'depth'
field.  assign_vpids() walks the tasks in order and gives every task
with a non-zero depth a pointer to its 'depth' consecutive entries in
the vpids array.

When --pids is in effect, ckpt_fork_child() builds a pid array holding
the task's pid followed by its vpids and passes it to eclone().  If the
child is deeper than its creator (or already carries TASK_NEWPID),
CLONE_NEWPID is requested and nr_pids is decremented by one.

CHECKPOINT_VERSION is bumped from 5 to 6.

Signed-off-by: Serge Hallyn <serue@xxxxxxxxxx>
---
Review notes (hunk sizes and offsets are unchanged from the original
posting; the index lines are those of the original posting):

 - app_restart(): report "read vpids" rather than the copy-pasted
   "read c/r tree" when ckpt_read_vpids() fails.
 - ckpt_fork_child(): set errno to EINVAL, not -EINVAL.
 - ckpt_read_vpids(): check the vpids_arr allocation, not pids_arr.

Still open: the early returns in the #ifndef CLONE_NEWPID branch of
ckpt_fork_child() do not free 'pids', and assign_vpids() does not
reject a negative depth.

 include/linux/checkpoint.h     |    2 +-
 include/linux/checkpoint_hdr.h |   16 ++++
 restart.c                      |  158 ++++++++++++++++++++++++++++++++++++----
 3 files changed, 160 insertions(+), 16 deletions(-)

diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 53b8b2c..8d021b9 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -14,7 +14,7 @@
  * distribution for more details.
  */
 
-#define CHECKPOINT_VERSION 5
+#define CHECKPOINT_VERSION 6
 
 /* checkpoint user flags */
 #define CHECKPOINT_SUBTREE	0x1
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index e8eaf23..caf16a6 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -111,6 +111,8 @@ enum {
 #define CKPT_HDR_GROUPINFO CKPT_HDR_GROUPINFO
 	CKPT_HDR_TASK_CREDS,
 #define CKPT_HDR_TASK_CREDS CKPT_HDR_TASK_CREDS
+	CKPT_HDR_VPIDS,
+#define CKPT_HDR_VPIDS CKPT_HDR_VPIDS
 
 	/* 201-299: reserved for arch-dependent */
 
@@ -321,11 +323,25 @@ struct ckpt_hdr_tree {
 } __attribute__((aligned(8)));
 
 struct ckpt_pids {
+	/* these pids are in root_nsproxy's pid ns */
 	__s32 vpid;
 	__s32 vppid;
 	__s32 vtgid;
 	__s32 vpgid;
 	__s32 vsid;
+	__s32 rsid; /* real pid - in checkpointer's pid_ns */
+	__s32 depth; /* pidns depth */
+} __attribute__((aligned(8)));
+
+/* number of vpids */
+struct ckpt_hdr_vpids {
+	struct ckpt_hdr h;
+	__s32 nr_vpids;
+} __attribute__((aligned(8)));
+
+struct ckpt_vpid {
+	__s32 pid;
+	__s32 padding;
 } __attribute__((aligned(8)));
 
 /* pids */
diff --git a/restart.c b/restart.c
index 0c74bb6..32f36f8 100644
--- a/restart.c
+++ b/restart.c
@@ -244,6 +244,9 @@ struct task {
 
 	struct task *phantom;	/* pointer to place-holdler task (if any) */
 
+	int piddepth;
+	struct ckpt_vpid *vpids;
+
 	pid_t pid;		/* process IDs, our bread-&-butter */
 	pid_t ppid;
 	pid_t tgid;
@@ -272,6 +275,7 @@ struct ckpt_ctx {
 	int pipe_in;
 	int pipe_out;
 	int pids_nr;
+	int vpids_nr;
 
 	int pipe_child[2];	/* for children to report status */
 	int pipe_feed[2];	/* for feeder to provide input */
@@ -279,6 +283,7 @@ struct ckpt_ctx {
 
 	struct ckpt_pids *pids_arr;
 	struct ckpt_pids *copy_arr;
+	struct ckpt_vpid *vpids_arr;
 
 	struct task *tasks_arr;
 	int tasks_nr;
@@ -291,6 +296,7 @@ struct ckpt_ctx {
 	char header_arch[BUFSIZE];
 	char container[BUFSIZE];
 	char tree[BUFSIZE];
+	char vpids[BUFSIZE];
 	char buf[BUFSIZE];
 
 	struct app_restart_args *args;
@@ -316,6 +322,7 @@ static int ckpt_remount_devpts(struct ckpt_ctx *ctx);
 
 static int ckpt_build_tree(struct ckpt_ctx *ctx);
 static int ckpt_init_tree(struct ckpt_ctx *ctx);
+static int assign_vpids(struct ckpt_ctx *ctx);
 static int ckpt_set_creator(struct ckpt_ctx *ctx, struct task *task);
 static int ckpt_placeholder_task(struct ckpt_ctx *ctx, struct task *task);
 static int ckpt_propagate_session(struct ckpt_ctx *ctx, struct task *session);
@@ -339,6 +346,7 @@ static int ckpt_write_header(struct ckpt_ctx *ctx);
 static int ckpt_write_header_arch(struct ckpt_ctx *ctx);
 static int ckpt_write_container(struct ckpt_ctx *ctx);
 static int ckpt_write_tree(struct ckpt_ctx *ctx);
+static int ckpt_write_vpids(struct ckpt_ctx *ctx);
 
 static int _ckpt_read(int fd, void *buf, int count);
 static int ckpt_read(int fd, void *buf, int count);
@@ -350,6 +358,7 @@ static int ckpt_read_header(struct ckpt_ctx *ctx);
 static int ckpt_read_header_arch(struct ckpt_ctx *ctx);
 static int ckpt_read_container(struct ckpt_ctx *ctx);
 static int ckpt_read_tree(struct ckpt_ctx *ctx);
+static int ckpt_read_vpids(struct ckpt_ctx *ctx);
 
 static int hash_init(struct ckpt_ctx *ctx);
 static void hash_exit(struct ckpt_ctx *ctx);
@@ -883,6 +892,12 @@ int app_restart(struct app_restart_args *args)
 		exit(1);
 	}
 
+	ret = ckpt_read_vpids(&ctx);
+	if (ret < 0) {
+		ckpt_perror("read vpids");
+		exit(1);
+	}
+
 	/* build creator-child-relationship tree */
 	if (hash_init(&ctx) < 0)
 		exit(1);
@@ -891,6 +906,10 @@ int app_restart(struct app_restart_args *args)
 	if (ret < 0)
 		exit(1);
 
+	ret = assign_vpids(&ctx);
+	if (ret < 0)
+		exit(1);
+
 	ret = ckpt_fork_feeder(&ctx);
 	if (ret < 0)
 		exit(1);
@@ -1218,13 +1237,13 @@ static int ckpt_coordinator_pidns(struct ckpt_ctx *ctx)
 	return ret;
 }
 
-#else
+#else /* CLONE_NEWPID */
 static int ckpt_coordinator_pidns(struct ckpt_ctx *ctx)
 {
 	ckpt_err("logical error: ckpt_coordinator_pidns unexpected\n");
 	exit(1);
 }
-#endif
+#endif /* CLONE_NEWPID */
 
 static int ckpt_coordinator(struct ckpt_ctx *ctx)
 {
@@ -2050,8 +2069,8 @@ static pid_t ckpt_fork_child(struct ckpt_ctx *ctx, struct task *child)
 	struct clone_args clone_args;
 	genstack stk;
 	unsigned long flags = SIGCHLD;
-	size_t nr_pids = 1;
 	pid_t pid = 0;
+	pid_t *pids = &pid;
 
 	ckpt_dbg("forking child vpid %d flags %#x\n", child->pid, child->flags);
 
@@ -2067,29 +2086,58 @@ static pid_t ckpt_fork_child(struct ckpt_ctx *ctx, struct task *child)
 		flags |= CLONE_PARENT;
 	}
 
+	memset(&clone_args, 0, sizeof(clone_args));
+	clone_args.nr_pids = 1;
 	/* select pid if --pids, otherwise it's 0 */
-	if (ctx->args->pids)
-		pid = child->pid;
+	if (ctx->args->pids) {
+		int i, depth = child->piddepth + 1;
 
-#ifdef CLONE_NEWPID
-	/* but for new pidns, don't specify a pid */
-	if (child->flags & TASK_NEWPID) {
-		flags |= CLONE_NEWPID;
-		pid = 0;
+		clone_args.nr_pids = depth;
+		pids = malloc(sizeof(pid_t) * depth);
+		if (!pids) {
+			perror("ckpt_fork_child pids malloc");
+			return -1;
+		}
+
+		pids[0] = child->pid;
+		for (i = 1; i <= child->piddepth; i++)
+			pids[i] = child->vpids[i-1].pid;
+
+#ifndef CLONE_NEWPID
+		if (child->piddepth > child->creator->piddepth) {
+			ckpt_err("nested pidns but CLONE_NEWPID undefined");
+			errno = EINVAL;
+			return -1;
+		} else if (child->flags & TASK_NEWPID) {
+			ckpt_err("TASK_NEWPID set but CLONE_NEWPID undefined");
+			errno = EINVAL;
+			return -1;
+		}
+#else /* CLONE_NEWPID */
+		if (child->piddepth > child->creator->piddepth) {
+			child->flags |= TASK_NEWPID;
+			flags |= CLONE_NEWPID;
+		} else if (child->flags & TASK_NEWPID) {
+			/* The TASK_NEWPID could have been set for root task */
+			pids[0] = 0;
+			flags |= CLONE_NEWPID;
+		}
+		if (flags & CLONE_NEWPID)
+			clone_args.nr_pids--;
+#endif /* CLONE_NEWPID */
 	}
-#endif
 
 	if (child->flags & (TASK_SIBLING | TASK_THREAD))
 		child->real_parent = getppid();
 	else
 		child->real_parent = _getpid();
 
-	memset(&clone_args, 0, sizeof(clone_args));
 	clone_args.child_stack = (unsigned long)genstack_base(stk);
 	clone_args.child_stack_size = genstack_size(stk);
-	clone_args.nr_pids = nr_pids;
 
-	pid = eclone(ckpt_fork_stub, child, flags, &clone_args, &pid);
+	pid = eclone(ckpt_fork_stub, child, flags, &clone_args, pids);
+	if (pids != &pid)
+		free(pids);
 	if (pid < 0) {
 		ckpt_perror("eclone");
 		genstack_release(stk);
@@ -2269,6 +2317,9 @@ static int ckpt_do_feeder(void *data)
 	if (ckpt_write_tree(ctx) < 0)
 		ckpt_abort(ctx, "write c/r tree");
 
+	if (ckpt_write_vpids(ctx) < 0)
+		ckpt_abort(ctx, "write vpids");
+
 	/* read rest -> write rest */
 	if (ctx->args->inspect)
 		ckpt_read_write_inspect(ctx);
@@ -2461,6 +2512,8 @@ static int ckpt_read_obj(struct ckpt_ctx *ctx,
 		errno = EINVAL;
 		return -1;
 	}
+	if (h->len == sizeof(*h))
+		return 0;
 	return ckpt_read(STDIN_FILENO, buf, h->len - sizeof(*h));
 }
 
@@ -2609,8 +2662,64 @@ static int ckpt_read_tree(struct ckpt_ctx *ctx)
 	}
 
 	ret = ckpt_read_obj_ptr(ctx, ctx->pids_arr, len, CKPT_HDR_BUFFER);
-	if (ret < 0)
+	if (ret < 0) {
 		free(ctx->pids_arr);
+		return ret;
+	}
+
+	return ret;
+}
+
+/* set the vpids pointers in all the tasks */
+static int assign_vpids(struct ckpt_ctx *ctx)
+{
+	int d, hidx, tidx;
+
+	for (hidx = 0, tidx = 0; tidx < ctx->pids_nr; tidx++) {
+		d = ctx->tasks_arr[tidx].piddepth = ctx->pids_arr[tidx].depth;
+		if (!d) {
+			ctx->tasks_arr[tidx].vpids = NULL;
+			continue;
+		}
+		ctx->tasks_arr[tidx].vpids = &ctx->vpids_arr[hidx];
+		hidx += ctx->pids_arr[tidx].depth;
+		if (hidx > ctx->vpids_nr)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int ckpt_read_vpids(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_vpids *h;
+	int len, ret;
+
+	h = (struct ckpt_hdr_vpids *) ctx->vpids;
+	ret = ckpt_read_obj_type(ctx, h, sizeof(*h), CKPT_HDR_VPIDS);
+	if (ret < 0)
+		return ret;
+
+	ckpt_dbg("number of vpids: %d\n", h->nr_vpids);
+
+	if (h->nr_vpids < 0) {
+		ckpt_err("invalid number of vpids %d", h->nr_vpids);
+		errno = EINVAL;
+		return -1;
+	}
+	ctx->vpids_nr = h->nr_vpids;
+	if (!ctx->vpids_nr)
+		return 0;
+
+	len = sizeof(struct ckpt_vpid) * ctx->vpids_nr;
+
+	ctx->vpids_arr = malloc(len);
+	if (!ctx->vpids_arr)
+		return -1;
+
+	ret = ckpt_read_obj_ptr(ctx, ctx->vpids_arr, len, CKPT_HDR_BUFFER);
+	if (ret < 0)
+		free(ctx->vpids_arr);
 
 	return ret;
 }
@@ -2685,6 +2794,25 @@ static int ckpt_write_tree(struct ckpt_ctx *ctx)
 	return 0;
 }
 
+static int ckpt_write_vpids(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_vpids *h;
+	int len;
+
+	h = (struct ckpt_hdr_vpids *) ctx->vpids;
+	if (ckpt_write_obj(ctx, (struct ckpt_hdr *) h) < 0)
+		ckpt_abort(ctx, "write vpids hdr");
+
+	if (!ctx->vpids_nr)
+		return 0;
+	len = sizeof(struct ckpt_vpid) * ctx->vpids_nr;
+	if (ckpt_write_obj_ptr(ctx, ctx->vpids_arr, len, CKPT_HDR_BUFFER) < 0)
+		ckpt_abort(ctx, "write vpids");
+	ckpt_dbg("wrote %d bytes for %d vpids\n", len, ctx->vpids_nr);
+
+	return 0;
+}
+
 /*
  * a simple hash implementation
  */
-- 
1.7.0

_______________________________________________
Containers mailing list
Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx
https://lists.linux-foundation.org/mailman/listinfo/containers