Introduce a helper to iterate over the descendants of a given task. The prototype is: int walk_task_subtree(struct task_struct *root, int (*func)(struct task_struct *, void *), void *data) The function will start with @root, and iterate through all the descendants, including threads, in a DFS manner. Children of a task are traversed before proceeding to the next thread of that task. For each task, the callback @func will be called providing the task pointer and the @data. The callback is invoked while holding the tasklist_lock for reading. If the callback fails it should return a negative error, and the traversal ends. If the callback succeeds, it returns a non-negative number, and these values are summed. On success, walk_task_subtree() returns the total summed. On failure, it returns a negative value. The code in checkpoint/checkpoint.c and checkpoint/restart.c has been converted to use this helper. Signed-off-by: Oren Laadan <orenl@xxxxxxxxxxxxxxx> --- checkpoint/checkpoint.c | 97 ++++++++++++++-------------------------- checkpoint/restart.c | 105 +++++++++++++++++++------------------------- checkpoint/sys.c | 55 +++++++++++++++++++++++ include/linux/checkpoint.h | 3 + 4 files changed, 137 insertions(+), 123 deletions(-) diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c index dbe9e10..1eeb557 100644 --- a/checkpoint/checkpoint.c +++ b/checkpoint/checkpoint.c @@ -520,80 +520,51 @@ static int collect_objects(struct ckpt_ctx *ctx) return ret; } +struct ckpt_cnt_tasks { + struct ckpt_ctx *ctx; + int nr; +}; + /* count number of tasks in tree (and optionally fill pid's in array) */ -static int tree_count_tasks(struct ckpt_ctx *ctx) +static int __tree_count_tasks(struct task_struct *task, void *data) { - struct task_struct *root; - struct task_struct *task; - struct task_struct *parent; - struct task_struct **tasks_arr = ctx->tasks_arr; - int nr_tasks = ctx->nr_tasks; - int nr = 0; + struct ckpt_cnt_tasks *d = (struct ckpt_cnt_tasks *) data; + struct ckpt_ctx *ctx = d->ctx; int ret; - read_lock(&tasklist_lock); - - /* we hold the lock, so root_task->real_parent can't change */ - task = ctx->root_task; - if (ctx->root_init) { - /* container-init: start from container parent */ - parent = task->real_parent; - root = parent; - } else { - /* non-container-init: start from root task and down */ - parent = NULL; - root = task; - } - - /* count tasks via DFS scan of the tree */ - while (1) { - ctx->tsk = task; /* (for ckpt_write_err) */ + ctx->tsk = task; /* (for ckpt_write_err) */ - /* is this task cool ? */ - ret = may_checkpoint_task(ctx, task); - if (ret < 0) { - nr = ret; - break; - } - if (tasks_arr) { - /* unlikely... but if so then try again later */ - if (nr == nr_tasks) { - nr = -EBUSY; /* cleanup in ckpt_ctx_free() */ - break; - } - tasks_arr[nr] = task; - get_task_struct(task); - } - nr++; - /* if has children - proceed with child */ - if (!list_empty(&task->children)) { - parent = task; - task = list_entry(task->children.next, - struct task_struct, sibling); - continue; - } - while (task != root) { - /* if has sibling - proceed with sibling */ - if (!list_is_last(&task->sibling, &parent->children)) { - task = list_entry(task->sibling.next, - struct task_struct, sibling); - break; - } + /* is this task cool ? */ + ret = may_checkpoint_task(ctx, task); + if (ret < 0) + goto out; - /* else, trace back to parent and proceed */ - task = parent; - parent = parent->real_parent; + if (ctx->tasks_arr) { + if (d->nr == ctx->nr_tasks) { /* unlikely... try again later */ + __ckpt_write_err(ctx, "TE", "bad task count\n", -EBUSY); + ret = -EBUSY; + goto out; } - if (task == root) - break; + ctx->tasks_arr[d->nr++] = task; + get_task_struct(task); } + + ret = 1; + out: + if (ret < 0) + ckpt_write_err(ctx, "", NULL); ctx->tsk = NULL; + return ret; +} - read_unlock(&tasklist_lock); +static int tree_count_tasks(struct ckpt_ctx *ctx) +{ + struct ckpt_cnt_tasks data; - if (nr < 0) - ckpt_write_err(ctx, "", NULL); - return nr; + data.ctx = ctx; + data.nr = 0; + + return walk_task_subtree(ctx->root_task, __tree_count_tasks, &data); } /* diff --git a/checkpoint/restart.c b/checkpoint/restart.c index 37454c5..c021eaf 100644 --- a/checkpoint/restart.c +++ b/checkpoint/restart.c @@ -804,77 +804,56 @@ static int do_restore_task(void) * Called by the coodinator to set the ->checkpoint_ctx pointer of the * root task and all its descendants. */ -static int prepare_descendants(struct ckpt_ctx *ctx, struct task_struct *root) +static int __prepare_descendants(struct task_struct *task, void *data) { - struct task_struct *leader = root; - struct task_struct *parent = NULL; - struct task_struct *task = root; - int nr_pids = 0; - int ret = -EBUSY; + struct ckpt_ctx *ctx = (struct ckpt_ctx *) data; - read_lock(&tasklist_lock); - while (1) { - ckpt_debug("consider task %d\n", task_pid_vnr(task)); - if (task_ptrace(task) & PT_PTRACED) - break; - /* - * Set task->checkpoint_ctx of all non-zombie descendants. - * If a descendant already has a ->checkpoint_ctx, it - * must be a coordinator (for a different restart ?) so - * we fail. - * - * Note that own ancestors cannot interfere since they - * won't descend past us, as own ->checkpoint_ctx must - * already be set. - */ - if (!task->exit_state) { - if (!set_task_ctx(task, ctx)) - break; - ckpt_debug("prepare task %d\n", task_pid_vnr(task)); - wake_up_process(task); - nr_pids++; - } + ckpt_debug("consider task %d\n", task_pid_vnr(task)); - /* if has children - proceed with child */ - if (!list_empty(&task->children)) { - parent = task; - task = list_entry(task->children.next, - struct task_struct, sibling); - continue; - } - while (task != root) { - /* if has sibling - proceed with sibling */ - if (!list_is_last(&task->sibling, &parent->children)) { - task = list_entry(task->sibling.next, - struct task_struct, sibling); - break; - } - - /* else, trace back to parent and proceed */ - task = parent; - parent = parent->real_parent; - } - if (task == root) { - /* in case root task is multi-threaded */ - root = task = next_thread(task); - if (root == leader) { - ret = 0; - break; - } - } + if (task_ptrace(task) & PT_PTRACED) { + ckpt_debug("ptraced task %d\n", task_pid_vnr(task)); + return -EBUSY; } - read_unlock(&tasklist_lock); - ckpt_debug("nr %d/%d ret %d\n", ctx->nr_pids, nr_pids, ret); + + /* + * Set task->checkpoint_ctx of all non-zombie descendants. + * If a descendant already has a ->checkpoint_ctx, it + * must be a coordinator (for a different restart ?) so + * we fail. + * + * Note that own ancestors cannot interfere since they + * won't descend past us, as own ->checkpoint_ctx must + * already be set. + */ + if (!task->exit_state) { + if (!set_task_ctx(task, ctx)) + return -EBUSY; + ckpt_debug("prepare task %d\n", task_pid_vnr(task)); + wake_up_process(task); + return 1; + } + + return 0; +} + +static int prepare_descendants(struct ckpt_ctx *ctx, struct task_struct *root) +{ + int nr_pids; + + nr_pids = walk_task_subtree(root, __prepare_descendants, ctx); + ckpt_debug("nr %d/%d\n", ctx->nr_pids, nr_pids); + if (nr_pids < 0) + return nr_pids; /* * Actual tasks count may exceed ctx->nr_pids due of 'dead' * tasks used as place-holders for PGIDs, but not fall short. */ - if (!ret && (nr_pids < ctx->nr_pids)) - ret = -ESRCH; + if (nr_pids < ctx->nr_pids) + return -ESRCH; atomic_set(&ctx->nr_total, nr_pids); - return ret; + return nr_pids; } static int wait_all_tasks_finish(struct ckpt_ctx *ctx) @@ -991,6 +970,12 @@ static int do_restore_coord(struct ckpt_ctx *ctx, pid_t pid) return -EBUSY; } + /* + * From now on we are committed to the restart. If anything + * fails, we'll wipe out the entire subtree below us, to + * ensure proper cleanup. + */ + if (ctx->uflags & RESTART_TASKSELF) { ret = restore_task(ctx); ckpt_debug("restore task: %d\n", ret); diff --git a/checkpoint/sys.c b/checkpoint/sys.c index 77613d7..7604089 100644 --- a/checkpoint/sys.c +++ b/checkpoint/sys.c @@ -276,6 +276,61 @@ void ckpt_ctx_put(struct ckpt_ctx *ctx) ckpt_ctx_free(ctx); } +int walk_task_subtree(struct task_struct *root, + int (*func)(struct task_struct *, void *), + void *data) +{ + + struct task_struct *leader = root; + struct task_struct *parent = NULL; + struct task_struct *task = root; + int total = 0; + int ret; + + read_lock(&tasklist_lock); + while (1) { + /* invoke callback on this task */ + ret = func(task, data); + if (ret < 0) + break; + + total += ret; + + /* if has children - proceed with child */ + if (!list_empty(&task->children)) { + parent = task; + task = list_entry(task->children.next, + struct task_struct, sibling); + continue; + } + + while (task != root) { + /* if has sibling - proceed with sibling */ + if (!list_is_last(&task->sibling, &parent->children)) { + task = list_entry(task->sibling.next, + struct task_struct, sibling); + break; + } + + /* else, trace back to parent and proceed */ + task = parent; + parent = parent->real_parent; + } + + if (task == root) { + /* in case root task is multi-threaded */ + root = task = next_thread(task); + if (root == leader) + break; + } + } + read_unlock(&tasklist_lock); + + ckpt_debug("total %d ret %d\n", total, ret); + return (ret < 0 ? ret : total); +} + + /** * sys_checkpoint - checkpoint a container * @pid: pid of the container init(1) process diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h index b7f1796..12210e4 100644 --- a/include/linux/checkpoint.h +++ b/include/linux/checkpoint.h @@ -50,6 +50,9 @@ RESTART_FROZEN | \ RESTART_GHOST) +extern int walk_task_subtree(struct task_struct *task, + int (*func)(struct task_struct *, void *), + void *data); extern void exit_checkpoint(struct task_struct *tsk); extern int ckpt_kwrite(struct ckpt_ctx *ctx, void *buf, int count); -- 1.6.0.4 _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers