Quoting Oren Laadan (orenl@xxxxxxxxxxx): > Instead of playing tricks with xchg() of task->checkpoint_ctx, use the > task_{lock,unlock} to protect changes to that pointer. This simplifies > the logic since we no longer need to check for races (and old_ctx). > > The remaining changes include cleanup, and unification of common code > to handle errors during restart, and some debug statements. > > Signed-off-by: Oren Laadan <orenl@xxxxxxxxxxxxxxx> > --- > checkpoint/restart.c | 170 ++++++++++++++++++++++---------------------- > checkpoint/sys.c | 6 +- > include/linux/checkpoint.h | 2 +- > kernel/fork.c | 6 +- > 4 files changed, 92 insertions(+), 92 deletions(-) > > diff --git a/checkpoint/restart.c b/checkpoint/restart.c > index 543b380..5d936cf 100644 > --- a/checkpoint/restart.c > +++ b/checkpoint/restart.c > @@ -529,17 +529,54 @@ static inline int is_task_active(struct ckpt_ctx *ctx, pid_t pid) > > static inline void _restore_notify_error(struct ckpt_ctx *ctx, int errno) > { > - ckpt_set_ctx_error(ctx, errno); > - complete(&ctx->complete); > + /* first to fail: notify everyone (racy but harmless) */ > + if (!ckpt_test_ctx_error(ctx)) { > + ckpt_debug("setting restart error %d\n", errno); \ > + ckpt_set_ctx_error(ctx, errno); > + complete(&ctx->complete); > + wake_up_all(&ctx->waitq); > + wake_up_all(&ctx->ghostq); > + } > } > > /* Need to call ckpt_debug such that it will get the correct source location */ > #define restore_notify_error(ctx, errno) \ > do { \ > - ckpt_debug("ctx root pid %d err %d", ctx->root_pid, errno); \ > + ckpt_debug("restart error %d, root pid %d\n", errno, ctx->root_pid); \ > _restore_notify_error(ctx, errno); \ > } while(0) > Maybe make a note that these can't be called under write_lock_irq(&tasklist_lock)? Also, update the comment above task_lock() to add checkpoint_ctx to the list of things protected by it. 
> +static inline struct ckpt_ctx *get_task_ctx(struct task_struct *task) > +{ > + struct ckpt_ctx *ctx; > + > + task_lock(task); > + ctx = ckpt_ctx_get(task->checkpoint_ctx); > + task_unlock(task); > + return ctx; > +} > + > +/* returns 1 on success, 0 otherwise */ This works, but it's more confusing than it needs to be. I think using two helpers, 'set_task_ctx(tsk, ctx)' and 'clear_task_ctx(tsk)', where set_task_ctx always bails if task->checkpoint_ctx is set, would be much easier to read. But Acked-by: Serge Hallyn <serue@xxxxxxxxxx> > +static int set_task_ctx(struct task_struct *task, struct ckpt_ctx *ctx) > +{ > + struct ckpt_ctx *old; > + int ret = 1; > + > + task_lock(task); > + if (!ctx || !task->checkpoint_ctx) { > + old = task->checkpoint_ctx; > + task->checkpoint_ctx = ckpt_ctx_get(ctx); > + } else { > + ckpt_debug("task %d has prior checkpoint_ctx\n", > + task_pid_vnr(task)); > + old = NULL; > + ret = 0; > + } > + task_unlock(task); > + ckpt_ctx_put(old); > + return ret; > +} > + > static void restore_task_done(struct ckpt_ctx *ctx) > { > if (atomic_dec_and_test(&ctx->nr_total)) > @@ -570,6 +607,7 @@ static int restore_activate_next(struct ckpt_ctx *ctx) > rcu_read_unlock(); > > if (!task) { > + ckpt_debug("could not find task %d\n", pid); > restore_notify_error(ctx, -ESRCH); > return -ESRCH; > } > @@ -590,12 +628,10 @@ static int wait_task_active(struct ckpt_ctx *ctx) > ret = wait_event_interruptible(ctx->waitq, > is_task_active(ctx, pid) || > ckpt_test_ctx_error(ctx)); > - ckpt_debug("active %d < %d (ret %d)\n", > - ctx->active_pid, ctx->nr_pids, ret); > - if (!ret && ckpt_test_ctx_error(ctx)) { > - force_sig(SIGKILL, current); > + ckpt_debug("active %d < %d (ret %d, errno %d)\n", > + ctx->active_pid, ctx->nr_pids, ret, ctx->errno); > + if (!ret && ckpt_test_ctx_error(ctx)) > ret = -EBUSY; > - } > return ret; > } > > @@ -603,17 +639,17 @@ static int wait_task_sync(struct ckpt_ctx *ctx) > { > ckpt_debug("pid %d syncing\n", task_pid_vnr(current)); > 
wait_event_interruptible(ctx->waitq, ckpt_test_ctx_complete(ctx)); > - if (ckpt_test_ctx_error(ctx)) { > - force_sig(SIGKILL, current); > + ckpt_debug("task sync done (errno %d)\n", ctx->errno); > + if (ckpt_test_ctx_error(ctx)) > return -EBUSY; > - } > return 0; > } > > +/* grabs a reference to the @ctx on success; caller should free */ > static struct ckpt_ctx *wait_checkpoint_ctx(void) > { > DECLARE_WAIT_QUEUE_HEAD(waitq); > - struct ckpt_ctx *ctx, *old_ctx; > + struct ckpt_ctx *ctx; > int ret; > > /* > @@ -621,32 +657,15 @@ static struct ckpt_ctx *wait_checkpoint_ctx(void) > * reference to its restart context. > */ > ret = wait_event_interruptible(waitq, current->checkpoint_ctx); > - if (ret < 0) > + if (ret < 0) { > + ckpt_debug("wait_checkpoint_ctx: failed (%d)\n", ret); > return ERR_PTR(ret); > + } > > - ctx = xchg(¤t->checkpoint_ctx, NULL); > - if (!ctx) > + ctx = get_task_ctx(current); > + if (!ctx) { > + ckpt_debug("wait_checkpoint_ctx: checkpoint_ctx missing\n"); > return ERR_PTR(-EAGAIN); > - ckpt_ctx_get(ctx); > - > - /* > - * Put the @ctx back on our task_struct. If an ancestor tried > - * to prepare_descendants() on us (although extremly unlikely) > - * we will encounter the ctx that he xchg()ed there and bail. 
> - */ > - old_ctx = xchg(¤t->checkpoint_ctx, ctx); > - if (old_ctx) { > - ckpt_debug("self-set of checkpoint_ctx failed\n"); > - > - /* alert coordinator of unexpected ctx */ > - restore_notify_error(old_ctx, -EAGAIN); > - ckpt_ctx_put(old_ctx); > - > - /* alert our coordinator that we bail */ > - restore_notify_error(ctx, -EAGAIN); > - ckpt_ctx_put(ctx); > - > - ctx = ERR_PTR(-EAGAIN); > } > > return ctx; > @@ -655,6 +674,7 @@ static struct ckpt_ctx *wait_checkpoint_ctx(void) > static int do_ghost_task(void) > { > struct ckpt_ctx *ctx; > + int ret; > > ctx = wait_checkpoint_ctx(); > if (IS_ERR(ctx)) > @@ -662,9 +682,11 @@ static int do_ghost_task(void) > > current->flags |= PF_RESTARTING; > > - wait_event_interruptible(ctx->ghostq, > - all_tasks_activated(ctx) || > - ckpt_test_ctx_error(ctx)); > + ret = wait_event_interruptible(ctx->ghostq, > + all_tasks_activated(ctx) || > + ckpt_test_ctx_error(ctx)); > + if (ret < 0) > + restore_notify_error(ctx, ret); > > current->exit_signal = -1; > ckpt_ctx_put(ctx); > @@ -675,7 +697,7 @@ static int do_ghost_task(void) > > static int do_restore_task(void) > { > - struct ckpt_ctx *ctx, *old_ctx; > + struct ckpt_ctx *ctx; > int zombie, ret; > > ctx = wait_checkpoint_ctx(); > @@ -713,18 +735,11 @@ static int do_restore_task(void) > restore_task_done(ctx); > ret = wait_task_sync(ctx); > out: > - old_ctx = xchg(¤t->checkpoint_ctx, NULL); > - if (old_ctx) > - ckpt_ctx_put(old_ctx); > - > - /* if we're first to fail - notify others */ > - if (ret < 0 && !ckpt_test_ctx_error(ctx)) { > + if (ret < 0) > restore_notify_error(ctx, ret); > - wake_up_all(&ctx->waitq); > - wake_up_all(&ctx->ghostq); > - } > > current->flags &= ~PF_RESTARTING; > + set_task_ctx(current, NULL); > ckpt_ctx_put(ctx); > return ret; > } > @@ -742,17 +757,14 @@ static int prepare_descendants(struct ckpt_ctx *ctx, struct task_struct *root) > struct task_struct *leader = root; > struct task_struct *parent = NULL; > struct task_struct *task = root; > - struct 
ckpt_ctx *old_ctx; > int nr_pids = 0; > - int ret = 0; > + int ret = -EBUSY; > > read_lock(&tasklist_lock); > while (1) { > ckpt_debug("consider task %d\n", task_pid_vnr(task)); > - if (task_ptrace(task) & PT_PTRACED) { > - ret = -EBUSY; > + if (task_ptrace(task) & PT_PTRACED) > break; > - } > /* > * Set task->checkpoint_ctx of all non-zombie descendants. > * If a descendant already has a ->checkpoint_ctx, it > @@ -764,14 +776,8 @@ static int prepare_descendants(struct ckpt_ctx *ctx, struct task_struct *root) > * already be set. > */ > if (!task->exit_state) { > - ckpt_ctx_get(ctx); > - old_ctx = xchg(&task->checkpoint_ctx, ctx); > - if (old_ctx) { > - ckpt_debug("bad task %d\n",task_pid_vnr(task)); > - ckpt_ctx_put(old_ctx); > - ret = -EAGAIN; > + if (!set_task_ctx(task, ctx)) > break; > - } > ckpt_debug("prepare task %d\n", task_pid_vnr(task)); > wake_up_process(task); > nr_pids++; > @@ -799,8 +805,10 @@ static int prepare_descendants(struct ckpt_ctx *ctx, struct task_struct *root) > if (task == root) { > /* in case root task is multi-threaded */ > root = task = next_thread(task); > - if (root == leader) > + if (root == leader) { > + ret = 0; > break; > + } > } > } > read_unlock(&tasklist_lock); > @@ -829,8 +837,7 @@ static int wait_all_tasks_finish(struct ckpt_ctx *ctx) > return ret; > > ret = wait_for_completion_interruptible(&ctx->complete); > - > - ckpt_debug("final sync kflags %#lx\n", ctx->kflags); > + ckpt_debug("final sync kflags %#lx (ret %d)\n", ctx->kflags, ret); > /* > * Usually when restart fails, the restarting task will first > * set @ctx->errno before waking us up. 
In the rare event that > @@ -899,13 +906,14 @@ static int init_restart_ctx(struct ckpt_ctx *ctx, pid_t pid) > > static int do_restore_coord(struct ckpt_ctx *ctx, pid_t pid) > { > - struct ckpt_ctx *old_ctx; > int ret; > > ret = restore_read_header(ctx); > + ckpt_debug("restore header: %d\n", ret); > if (ret < 0) > return ret; > ret = restore_read_tree(ctx); > + ckpt_debug("restore tree: %d\n", ret); > if (ret < 0) > return ret; > > @@ -920,45 +928,42 @@ static int do_restore_coord(struct ckpt_ctx *ctx, pid_t pid) > * Populate own ->checkpoint_ctx: if an ancestor attempts to > * prepare_descendants() on us, it will fail. Furthermore, > * that ancestor won't proceed deeper to interfere with our > - * descendants that are restarting (e.g. by xchg()ing their > - * ->checkpoint_ctx pointer temporarily). > + * descendants that are restarting. > */ > - ckpt_ctx_get(ctx); > - old_ctx = xchg(¤t->checkpoint_ctx, ctx); > - if (old_ctx) { > + if (!set_task_ctx(current, ctx)) { > /* > * We are a bad-behaving descendant: an ancestor must > - * have done prepare_descendants() on us as part of a > - * restart. Oh, well ... alert ancestor (coordinator) > - * with an error on @old_ctx. > + * have prepare_descendants() us as part of a restart. 
> */ > - ckpt_debug("bad behaving checkpoint_ctx\n"); > - restore_notify_error(old_ctx, -EBUSY); > - ckpt_ctx_put(old_ctx); > - ret = -EBUSY; > - goto out; > + ckpt_debug("coord already has checkpoint_ctx\n"); > + return -EBUSY; > } > > if (ctx->uflags & RESTART_TASKSELF) { > ret = restore_task(ctx); > + ckpt_debug("restore task: %d\n", ret); > if (ret < 0) > goto out; > } else { > /* prepare descendants' t->checkpoint_ctx point to coord */ > ret = prepare_descendants(ctx, ctx->root_task); > + ckpt_debug("restore prepare: %d\n", ret); > if (ret < 0) > goto out; > /* wait for all other tasks to complete do_restore_task() */ > ret = wait_all_tasks_finish(ctx); > + ckpt_debug("restore finish: %d\n", ret); > if (ret < 0) > goto out; > } > > ret = deferqueue_run(ctx->deferqueue); /* run deferred work */ > + ckpt_debug("restore deferqueue: %d\n", ret); > if (ret < 0) > goto out; > > ret = restore_read_tail(ctx); > + ckpt_debug("restore tail: %d\n", ret); > if (ret < 0) > goto out; > > @@ -974,14 +979,8 @@ static int do_restore_coord(struct ckpt_ctx *ctx, pid_t pid) > > if (!(ctx->uflags & RESTART_TASKSELF)) > wake_up_all(&ctx->waitq); > - /* > - * If an ancestor attempts to prepare_descendants() on us, it > - * xchg()s our ->checkpoint_ctx, and free it. Our @ctx will, > - * instead, point to the ctx that said ancestor placed. 
> - */ > - ctx = xchg(¤t->checkpoint_ctx, NULL); > - ckpt_ctx_put(ctx); > > + set_task_ctx(current, NULL); > return ret; > } > > @@ -1070,6 +1069,7 @@ long do_restart(struct ckpt_ctx *ctx, pid_t pid, unsigned long flags) > } > } > > + ckpt_debug("sys_restart returns %ld\n", ret); > return ret; > } > > @@ -1082,7 +1082,7 @@ void exit_checkpoint(struct task_struct *tsk) > struct ckpt_ctx *ctx; > > /* no one else will touch this, because @tsk is dead already */ > - ctx = xchg(&tsk->checkpoint_ctx, NULL); > + ctx = tsk->checkpoint_ctx; > > /* restarting zombies will activate next task in restart */ > if (tsk->flags & PF_RESTARTING) { > diff --git a/checkpoint/sys.c b/checkpoint/sys.c > index 76a3fa9..77613d7 100644 > --- a/checkpoint/sys.c > +++ b/checkpoint/sys.c > @@ -263,9 +263,11 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags, > return ERR_PTR(err); > } > > -void ckpt_ctx_get(struct ckpt_ctx *ctx) > +struct ckpt_ctx *ckpt_ctx_get(struct ckpt_ctx *ctx) > { > - atomic_inc(&ctx->refcount); > + if (ctx) > + atomic_inc(&ctx->refcount); > + return ctx; > } > > void ckpt_ctx_put(struct ckpt_ctx *ctx) > diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h > index 5294a96..b7f1796 100644 > --- a/include/linux/checkpoint.h > +++ b/include/linux/checkpoint.h > @@ -134,7 +134,7 @@ extern int ckpt_obj_insert(struct ckpt_ctx *ctx, void *ptr, int objref, > enum obj_type type); > extern int ckpt_obj_reserve(struct ckpt_ctx *ctx); > > -extern void ckpt_ctx_get(struct ckpt_ctx *ctx); > +extern struct ckpt_ctx *ckpt_ctx_get(struct ckpt_ctx *ctx); > extern void ckpt_ctx_put(struct ckpt_ctx *ctx); > > extern long do_checkpoint(struct ckpt_ctx *ctx, pid_t pid); > diff --git a/kernel/fork.c b/kernel/fork.c > index 57118e4..0e226f5 100644 > --- a/kernel/fork.c > +++ b/kernel/fork.c > @@ -1188,10 +1188,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, > > #ifdef CONFIG_CHECKPOINT > /* If parent is restarting, child should be too 
*/ > - if (unlikely(current->checkpoint_ctx)) { > - p->checkpoint_ctx = current->checkpoint_ctx; > - ckpt_ctx_get(p->checkpoint_ctx); > - } > + if (unlikely(current->checkpoint_ctx)) > + p->checkpoint_ctx = ckpt_ctx_get(current->checkpoint_ctx); > #endif > /* > * The task hasn't been attached yet, so its cpus_allowed mask will > -- > 1.6.0.4 > > _______________________________________________ > Containers mailing list > Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx > https://lists.linux-foundation.org/mailman/listinfo/containers _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers