On Thu, Mar 27, 2014 at 11:16 AM, Linus Torvalds
<torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:
>
> It would all be cleaner if all the setup was done with the
> ctx->ring_lock held (you can even *initialize* it to the locked state,
> since this is the function that allocates it!) and then it would just
> be unlocked when done.

Attached is a TOTALLY UNTESTED patch based on Ben's one that does at
least this minimal cleanup, in addition to dropping the completion_lock
in aio_free_ring() in favor of instead just doing the
"put_aio_ring_file()" first.

I do want to stress that "untested" part. I'm not going to commit this,
because I don't have any real test-cases even if it were to boot and
work for me otherwise.

I can't say that I like the locking. It really seems totally
mis-designed to me.

For example, the first completion_lock in aio_migratepage() seems to be
total BS - it's locking against other migrations, but that's what
"mapping->private_lock" (and now "ctx->ring_lock") protect against.

The *second* completion_lock use in aio_migratepage() is actually
valid: we can't copy the page contents to a new one when a completion
might change the ring tail data, because then the change might be done
to the old page but not the new page. But there the "check if things
haven't changed" is bogus, since we've held the ring_lock.

I did *not* clean up that part. But it's an example of how the locking
here seems to be more "voodoo programming" than actually thought about
and designed.

Please, somebody who has test-cases look at this, ok?

             Linus
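To make the "hold the lock across setup" idea concrete outside of the aio
code, here is a minimal userspace sketch along those lines. The names
(struct ring_ctx, ctx_alloc, try_touch_ring) and the use of pthreads are
stand-ins for illustration only, not anything in fs/aio.c:

/* Hypothetical userspace analogue; not fs/aio.c code. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct ring_ctx {
        pthread_mutex_t ring_lock;
        int nr_pages;                   /* filled in during setup */
};

/* Allocate and set up with ring_lock held, so a concurrent path that
 * trylocks the mutex cannot observe a half-initialized ring. */
static struct ring_ctx *ctx_alloc(void)
{
        struct ring_ctx *ctx = calloc(1, sizeof(*ctx));

        if (!ctx)
                return NULL;
        pthread_mutex_init(&ctx->ring_lock, NULL);
        pthread_mutex_lock(&ctx->ring_lock);    /* held across setup */
        ctx->nr_pages = 8;                      /* stand-in for aio_setup_ring() */
        pthread_mutex_unlock(&ctx->ring_lock);  /* setup complete */
        return ctx;
}

/* A migration-like path backs off instead of sleeping on the lock. */
static int try_touch_ring(struct ring_ctx *ctx)
{
        if (pthread_mutex_trylock(&ctx->ring_lock) != 0)
                return -1;                      /* analogue of -EAGAIN */
        printf("ring has %d pages\n", ctx->nr_pages);
        pthread_mutex_unlock(&ctx->ring_lock);
        return 0;
}

int main(void)
{
        struct ring_ctx *ctx = ctx_alloc();

        if (!ctx)
                return 1;
        try_touch_ring(ctx);
        free(ctx);
        return 0;
}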
 fs/aio.c | 53 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 43 insertions(+), 10 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 062a5f6a1448..758430665b3a 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -243,6 +243,9 @@ static void aio_free_ring(struct kioctx *ctx)
 {
 	int i;
 
+	/* This makes the ctx unreachable */
+	put_aio_ring_file(ctx);
+
 	for (i = 0; i < ctx->nr_pages; i++) {
 		struct page *page;
 		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
@@ -254,8 +257,6 @@ static void aio_free_ring(struct kioctx *ctx)
 		put_page(page);
 	}
 
-	put_aio_ring_file(ctx);
-
 	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
 		kfree(ctx->ring_pages);
 		ctx->ring_pages = NULL;
@@ -287,9 +288,29 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 
 	rc = 0;
 
-	/* Make sure the old page hasn't already been changed */
+	/* Get a reference on the ioctx so we can take the ring_lock mutex. */
 	spin_lock(&mapping->private_lock);
 	ctx = mapping->private_data;
+	if (ctx)
+		percpu_ref_get(&ctx->users);
+	spin_unlock(&mapping->private_lock);
+
+	if (!ctx)
+		return -EINVAL;
+
+	/* We use mutex_trylock() here as the callers of migratepage may
+	 * already be holding current->mm->mmap_sem, and ->ring_lock must be
+	 * outside of mmap_sem due to its usage in aio_read_events_ring().
+	 * Since page migration is not an absolutely critical operation, the
+	 * occasional failure here is acceptable.
+	 */
+	if (!mutex_trylock(&ctx->ring_lock)) {
+		percpu_ref_put(&ctx->users);
+		return -EAGAIN;
+	}
+
+	/* Make sure the old page hasn't already been changed */
+	spin_lock(&mapping->private_lock);
 	if (ctx) {
 		pgoff_t idx;
 		spin_lock_irqsave(&ctx->completion_lock, flags);
@@ -305,7 +326,7 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 	spin_unlock(&mapping->private_lock);
 
 	if (rc != 0)
-		return rc;
+		goto out_unlock;
 
 	/* Writeback must be complete */
 	BUG_ON(PageWriteback(old));
@@ -314,7 +335,7 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
 	if (rc != MIGRATEPAGE_SUCCESS) {
 		put_page(new);
-		return rc;
+		goto out_unlock;
 	}
 
 	/* We can potentially race against kioctx teardown here.  Use the
@@ -346,6 +367,9 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 	else
 		put_page(new);
 
+out_unlock:
+	mutex_unlock(&ctx->ring_lock);
+	percpu_ref_put(&ctx->users);
 	return rc;
 }
 #endif
@@ -380,7 +404,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 	file = aio_private_file(ctx, nr_pages);
 	if (IS_ERR(file)) {
 		ctx->aio_ring_file = NULL;
-		return -EAGAIN;
+		return -ENOMEM;
 	}
 
 	ctx->aio_ring_file = file;
@@ -415,7 +439,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 
 	if (unlikely(i != nr_pages)) {
 		aio_free_ring(ctx);
-		return -EAGAIN;
+		return -ENOMEM;
 	}
 
 	ctx->mmap_size = nr_pages * PAGE_SIZE;
@@ -429,7 +453,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 	if (IS_ERR((void *)ctx->mmap_base)) {
 		ctx->mmap_size = 0;
 		aio_free_ring(ctx);
-		return -EAGAIN;
+		return -ENOMEM;
 	}
 
 	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
@@ -657,8 +681,13 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	if (!ctx->cpu)
 		goto err;
 
-	if (aio_setup_ring(ctx) < 0)
-		goto err;
+	/* Prevent races with page migration during setup by holding
+	 * the ring_lock mutex.
+	 */
+	mutex_lock(&ctx->ring_lock);
+	err = aio_setup_ring(ctx);
+	if (err < 0)
+		goto err_unlock;
 
 	atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
 	ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
@@ -683,6 +712,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	if (err)
 		goto err_cleanup;
 
+	mutex_unlock(&ctx->ring_lock);
 	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
 		 ctx, ctx->user_id, mm, ctx->nr_events);
 	return ctx;
@@ -691,6 +721,8 @@ err_cleanup:
 	aio_nr_sub(ctx->max_reqs);
 err_ctx:
 	aio_free_ring(ctx);
+err_unlock:
+	mutex_unlock(&ctx->ring_lock);
 err:
 	free_percpu(ctx->cpu);
 	free_percpu(ctx->reqs.pcpu_count);
@@ -1024,6 +1056,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
 
 	mutex_lock(&ctx->ring_lock);
 
+	/* Access to ->ring_pages here is protected by ctx->ring_lock. */
 	ring = kmap_atomic(ctx->ring_pages[0]);
 	head = ring->head;
 	tail = ring->tail;
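
The aio_migratepage() change above follows a "pin the object under the
lookup lock, then take the sleeping lock with trylock" pattern. A minimal
userspace sketch of that pattern, with made-up names and pthreads/atomics
standing in for percpu_ref and the kernel locks:

/* Hypothetical userspace analogue of the aio_migratepage() locking order;
 * names and primitives are stand-ins, not kernel APIs. */
#include <errno.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct ctx {
        atomic_int users;               /* stand-in for percpu_ref */
        pthread_mutex_t ring_lock;
};

static pthread_mutex_t lookup_lock = PTHREAD_MUTEX_INITIALIZER;
static struct ctx *mapping_private;     /* plays the role of mapping->private_data */

static int migrate_one_page(void)
{
        struct ctx *ctx;

        /* Pin the ctx under the lookup lock so it cannot be torn down
         * before we manage to take ring_lock. */
        pthread_mutex_lock(&lookup_lock);
        ctx = mapping_private;
        if (ctx)
                atomic_fetch_add(&ctx->users, 1);
        pthread_mutex_unlock(&lookup_lock);

        if (!ctx)
                return -EINVAL;

        /* Never block here: the caller may already hold locks that rank
         * below ring_lock, so back off instead of risking an inversion. */
        if (pthread_mutex_trylock(&ctx->ring_lock) != 0) {
                atomic_fetch_sub(&ctx->users, 1);
                return -EAGAIN;
        }

        /* ... ring pages are stable here; safe to copy old -> new ... */

        pthread_mutex_unlock(&ctx->ring_lock);
        atomic_fetch_sub(&ctx->users, 1);
        return 0;
}

int main(void)
{
        return migrate_one_page() == -EINVAL ? 0 : 1;   /* no ctx installed */
}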