On Sun, Oct 25, 2009 at 06:23:29PM -0400, Oren Laadan wrote:
> This patch modifies the memory checkpoint code to _not_ hold the
> mmap_sem while dumping out the vma's.
>
> The problem with holding the mmap_sem is that the checkpoint code first
> takes the mmap_sem and then takes the file's inode semaphore. This
> violates the normal locking order, e.g. when taking a page fault during
> a copyout, which is inode sem and then the mmap_sem.
>
> Normally this reverse locking order won't cause a lockup because the
> output file for the checkpoint image isn't used by the checkpointee.
> However, there are a couple of cases where it may be a problem, e.g.
> when some async-IO happens to complete and triggers a page fault at the
> wrong time.
>
> This fixes complaints from lockdep about this reverse ordering.
>
> Signed-off-by: Oren Laadan <orenl@xxxxxxxxxxxxxxx>
> ---
>  checkpoint/memory.c |  133 ++++++++++++++++++++++++++++++++++++---------------
>  1 files changed, 94 insertions(+), 39 deletions(-)
>
> diff --git a/checkpoint/memory.c b/checkpoint/memory.c
> index 0da948f..656614c 100644
> --- a/checkpoint/memory.c
> +++ b/checkpoint/memory.c
> @@ -644,11 +644,80 @@ static int anonymous_checkpoint(struct ckpt_ctx *ctx,
>          return private_vma_checkpoint(ctx, vma, CKPT_VMA_ANON, 0);
>  }
>
> +static int checkpoint_vmas(struct ckpt_ctx *ctx, struct mm_struct *mm)
> +{
> +        struct vm_area_struct *vma, *next;
> +        int map_count = 0;
> +        int ret = 0;
> +
> +        vma = kzalloc(sizeof(*vma), GFP_KERNEL);
> +        if (!vma)
> +                return -ENOMEM;
> +
> +        /*
> +         * Must not hold mm->mmap_sem when writing to image file, so
> +         * can't simply traverse the vma list. Instead, use find_vma()
> +         * to get the @next and make a local "copy" of it.
> +         */
> +        while (1) {
> +                down_read(&mm->mmap_sem);
> +                next = find_vma(mm, vma->vm_end);
> +                if (!next) {
> +                        up_read(&mm->mmap_sem);
> +                        break;
> +                }
> +                if (vma->vm_file)
> +                        fput(vma->vm_file);
> +                *vma = *next;
> +                if (vma->vm_file)
> +                        get_file(vma->vm_file);
> +                up_read(&mm->mmap_sem);
> +
> +                map_count++;
> +
> +                ckpt_debug("vma %#lx-%#lx flags %#lx\n",
> +                           vma->vm_start, vma->vm_end, vma->vm_flags);
> +
> +                if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
> +                        ckpt_write_err(ctx, "TE", "vma: bad flags (%#lx)\n",
> +                                       -ENOSYS, vma->vm_flags);
> +                        ret = -ENOSYS;
> +                        break;
> +                }
> +
> +                if (!vma->vm_ops)
> +                        ret = anonymous_checkpoint(ctx, vma);
> +                else if (vma->vm_ops->checkpoint)
> +                        ret = (*vma->vm_ops->checkpoint)(ctx, vma);
> +                else
> +                        ret = -ENOSYS;
> +                if (ret < 0) {
> +                        ckpt_write_err(ctx, "TE", "vma: failed", ret);
> +                        break;
> +                }
> +                /*
> +                 * The file was collected, but not always checkpointed;
> +                 * be safe and mark as visited to appease leak detection
> +                 */
> +                if (vma->vm_file && !(ctx->uflags & CHECKPOINT_SUBTREE)) {
> +                        ret = ckpt_obj_visit(ctx, vma->vm_file, CKPT_OBJ_FILE);
> +                        if (ret < 0)
> +                                break;
> +                }
> +        }
> +
> +        if (vma->vm_file)
> +                fput(vma->vm_file);
> +
> +        kfree(vma);
> +
> +        return ret < 0 ? ret : map_count;
> +}
> +
>  static int do_checkpoint_mm(struct ckpt_ctx *ctx, struct mm_struct *mm)
>  {
>          struct ckpt_hdr_mm *h;
> -        struct vm_area_struct *vma;
> -        int exe_objref = 0;
> +        struct file *exe_file = NULL;
>          int ret;
>
>          h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_MM);
> @@ -674,14 +743,23 @@ static int do_checkpoint_mm(struct ckpt_ctx *ctx, struct mm_struct *mm)
>
>          h->map_count = mm->map_count;
>
> -        /* checkpoint the ->exe_file */
> -        if (mm->exe_file) {
> -                exe_objref = checkpoint_obj(ctx, mm->exe_file, CKPT_OBJ_FILE);
> -                if (exe_objref < 0) {
> -                        ret = exe_objref;
> +        if (mm->exe_file) {        /* checkpoint the ->exe_file */
> +                exe_file = mm->exe_file;
> +                get_file(exe_file);
> +        }
> +
> +        /*
> +         * Drop mm->mmap_sem before writing data to checkpoint image
> +         * to avoid reverse locking order (inode must come before mm).
> +         */
> +        up_read(&mm->mmap_sem);
> +
> +        if (exe_file) {
> +                h->exe_objref = checkpoint_obj(ctx, exe_file, CKPT_OBJ_FILE);
> +                if (h->exe_objref < 0) {
> +                        ret = h->exe_objref;
>                          goto out;
>                  }
> -                h->exe_objref = exe_objref;
>          }
>
>          ret = ckpt_write_obj(ctx, &h->h);
> @@ -692,40 +770,17 @@ static int do_checkpoint_mm(struct ckpt_ctx *ctx, struct mm_struct *mm)
>          if (ret < 0)
>                  return ret;
>
> -        /* write the vma's */
> -        for (vma = mm->mmap; vma; vma = vma->vm_next) {
> -                ckpt_debug("vma %#lx-%#lx flags %#lx\n",
> -                           vma->vm_start, vma->vm_end, vma->vm_flags);
> -                if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
> -                        ckpt_write_err(ctx, "TE", "vma: bad flags (%#lx)\n",
> -                                       -ENOSYS, vma->vm_flags);
> -                        return -ENOSYS;
> -                }
> -                if (!vma->vm_ops)
> -                        ret = anonymous_checkpoint(ctx, vma);
> -                else if (vma->vm_ops->checkpoint)
> -                        ret = (*vma->vm_ops->checkpoint)(ctx, vma);
> -                else
> -                        ret = -ENOSYS;
> -                if (ret < 0) {
> -                        ckpt_write_err(ctx, "TE", "vma: failed", ret);
> -                        goto out;
> -                }
> -                /*
> -                 * The file was collected, but not always checkpointed;
> -                 * be safe and mark as visited to appease leak detection
> -                 */
> -                if (vma->vm_file && !(ctx->uflags & CHECKPOINT_SUBTREE)) {
> -                        ret = ckpt_obj_visit(ctx, vma->vm_file, CKPT_OBJ_FILE);
> -                        if (ret < 0)
> -                                goto out;
> -                }
> -        }
> +        ret = checkpoint_vmas(ctx, mm);
> +        if (ret != h->map_count && ret >= 0)
> +                ret = -EBUSY;        /* checkpoint mm leak */
> +        if (ret < 0)
> +                goto out;
>
>          ret = checkpoint_mm_context(ctx, mm);
>  out:
> +        if (exe_file)
> +                fput(exe_file);
>          ckpt_hdr_put(ctx, h);
> -        up_read(&mm->mmap_sem);
>          return ret;
>  }
>
> @@ -1288,9 +1343,9 @@ static struct mm_struct *do_restore_mm(struct ckpt_ctx *ctx)
>                  }
>                  set_mm_exe_file(mm, file);
>          }
> +        up_write(&mm->mmap_sem);
>
>          ret = _ckpt_read_buffer(ctx, mm->saved_auxv, sizeof(mm->saved_auxv));
> -        up_write(&mm->mmap_sem);
>          if (ret < 0)
>                  goto out;
>
> --

At least in the restart path, it's interesting to see how Alexey did it
without mmap_sem, at least for part of it:

http://patchwork.kernel.org/patch/25337/
(search for kstate_restore_mm_struct())

Is that a feasible and more suitable approach for the initial portions
of mm restore?

Cheers,
        -Matt Helsley
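
For readers following the locking change: the core trick in checkpoint_vmas()
is to hold mm->mmap_sem only long enough to copy the next vma (and pin its
vm_file), then drop the semaphore before writing to the image file, so the
write path never nests the inode semaphore under mmap_sem. Below is a minimal
userspace sketch of that "snapshot under the reader lock, write outside it"
pattern. It is not the kernel code from the patch; every name in it (struct
region, region_list, find_region(), dump_regions()) is invented for
illustration, and the pthread rwlock merely stands in for mmap_sem.

/*
 * Illustration only: userspace analogy of the pattern used by
 * checkpoint_vmas().  The list lock is held just long enough to take a
 * private copy of the next entry; the (potentially blocking) write is
 * done with the lock dropped.  All names here are made up.
 */
#include <pthread.h>
#include <stdio.h>

struct region {
        unsigned long start;
        unsigned long end;
        struct region *next;
};

static pthread_rwlock_t list_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct region *region_list;      /* sorted by address, under list_lock */

/* Return the first region ending above @addr; caller holds list_lock. */
static struct region *find_region(unsigned long addr)
{
        struct region *r;

        for (r = region_list; r; r = r->next)
                if (r->end > addr)
                        return r;
        return NULL;
}

/* Dump all regions without holding list_lock across the writes. */
static int dump_regions(FILE *out)
{
        struct region copy = { 0, 0, NULL };
        struct region *next;
        int count = 0;

        for (;;) {
                /* Hold the lock only while snapshotting the next entry. */
                pthread_rwlock_rdlock(&list_lock);
                next = find_region(copy.end);
                if (!next) {
                        pthread_rwlock_unlock(&list_lock);
                        break;
                }
                copy = *next;
                pthread_rwlock_unlock(&list_lock);

                /* The write may block; the list lock is not held here. */
                if (fprintf(out, "%#lx-%#lx\n", copy.start, copy.end) < 0)
                        return -1;
                count++;
        }
        return count;
}

int main(void)
{
        struct region b = { 0x2000, 0x3000, NULL };
        struct region a = { 0x1000, 0x2000, &b };

        region_list = &a;
        return dump_regions(stdout) < 0;
}

As in the patch, the walk resumes each iteration from the end address of the
last copied entry rather than following a cached next pointer, and the caller
(do_checkpoint_mm() in the patch) can cross-check the returned count against
the expected map_count, failing with -EBUSY on a mismatch.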