On Thu, Mar 27, 2014 at 11:16 AM, Linus Torvalds
<torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:
>
> It would all be cleaner if all the setup was done with the
> ctx->ring_lock held (you can even *initialize* it to the locked state,
> since this is the function that allocates it!) and then it would just
> be unlocked when done.

Attached is a TOTALLY UNTESTED patch based on Ben's one that does at
least this minimal cleanup, in addition to dropping the completion_lock
in aio_free_ring() in favor of instead just doing the
"put_aio_ring_file()" first.

I do want to stress that "untested" part. I'm not going to commit this,
because I don't have any real test-cases even if it were to boot and
work for me otherwise.

I can't say that I like the locking. It really seems totally
mis-designed to me.

For example, the first completion_lock in aio_migratepage() seems to be
total BS - it's locking against other migrations, but that's what
"mapping->private_lock" (and now "ctx->ring_lock") protect against.

The *second* completion_lock use in aio_migratepage() is actually
valid: we can't copy the page contents to a new one when a completion
might change the ring tail data, because then the change might be done
to the old page but not the new page. But there the "check if things
haven't changed" is bogus, since we've held the ring_lock.

I did *not* clean up that part. But it's an example of how the locking
here seems to be more "voodoo programming" than actually thought about
and designed.

Please, somebody who has test-cases look at this, ok?

             Linus
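To make the "hold the lock across setup" idea concrete outside of the aio
code, here is a minimal userspace sketch along those lines. The names
(struct ring_ctx, ctx_alloc, try_touch_ring) and the use of pthreads are
stand-ins for illustration only, not anything in fs/aio.c:

/* Hypothetical userspace analogue; not fs/aio.c code. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct ring_ctx {
        pthread_mutex_t ring_lock;
        int nr_pages;                   /* filled in during setup */
};

/* Allocate and set up with ring_lock held, so a concurrent path that
 * trylocks the mutex cannot observe a half-initialized ring. */
static struct ring_ctx *ctx_alloc(void)
{
        struct ring_ctx *ctx = calloc(1, sizeof(*ctx));

        if (!ctx)
                return NULL;
        pthread_mutex_init(&ctx->ring_lock, NULL);
        pthread_mutex_lock(&ctx->ring_lock);    /* held across setup */
        ctx->nr_pages = 8;                      /* stand-in for aio_setup_ring() */
        pthread_mutex_unlock(&ctx->ring_lock);  /* setup complete */
        return ctx;
}

/* A migration-like path backs off instead of sleeping on the lock. */
static int try_touch_ring(struct ring_ctx *ctx)
{
        if (pthread_mutex_trylock(&ctx->ring_lock) != 0)
                return -1;                      /* analogue of -EAGAIN */
        printf("ring has %d pages\n", ctx->nr_pages);
        pthread_mutex_unlock(&ctx->ring_lock);
        return 0;
}

int main(void)
{
        struct ring_ctx *ctx = ctx_alloc();

        if (!ctx)
                return 1;
        try_touch_ring(ctx);
        free(ctx);
        return 0;
}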
 fs/aio.c | 53 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 43 insertions(+), 10 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 062a5f6a1448..758430665b3a 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -243,6 +243,9 @@ static void aio_free_ring(struct kioctx *ctx)
 {
 	int i;
 
+	/* This makes the ctx unreachable */
+	put_aio_ring_file(ctx);
+
 	for (i = 0; i < ctx->nr_pages; i++) {
 		struct page *page;
 		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
@@ -254,8 +257,6 @@ static void aio_free_ring(struct kioctx *ctx)
 		put_page(page);
 	}
 
-	put_aio_ring_file(ctx);
-
 	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
 		kfree(ctx->ring_pages);
 		ctx->ring_pages = NULL;
@@ -287,9 +288,29 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 
 	rc = 0;
 
-	/* Make sure the old page hasn't already been changed */
+	/* Get a reference on the ioctx so we can take the ring_lock mutex. */
 	spin_lock(&mapping->private_lock);
 	ctx = mapping->private_data;
+	if (ctx)
+		percpu_ref_get(&ctx->users);
+	spin_unlock(&mapping->private_lock);
+
+	if (!ctx)
+		return -EINVAL;
+
+	/* We use mutex_trylock() here as the callers of migratepage may
+	 * already be holding current->mm->mmap_sem, and ->ring_lock must be
+	 * outside of mmap_sem due to its usage in aio_read_events_ring().
+	 * Since page migration is not an absolutely critical operation, the
+	 * occasional failure here is acceptable.
+	 */
+	if (!mutex_trylock(&ctx->ring_lock)) {
+		percpu_ref_put(&ctx->users);
+		return -EAGAIN;
+	}
+
+	/* Make sure the old page hasn't already been changed */
+	spin_lock(&mapping->private_lock);
 	if (ctx) {
 		pgoff_t idx;
 		spin_lock_irqsave(&ctx->completion_lock, flags);
@@ -305,7 +326,7 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 	spin_unlock(&mapping->private_lock);
 
 	if (rc != 0)
-		return rc;
+		goto out_unlock;
 
 	/* Writeback must be complete */
 	BUG_ON(PageWriteback(old));
@@ -314,7 +335,7 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
 	if (rc != MIGRATEPAGE_SUCCESS) {
 		put_page(new);
-		return rc;
+		goto out_unlock;
 	}
 
 	/* We can potentially race against kioctx teardown here.  Use the
@@ -346,6 +367,9 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 	else
 		put_page(new);
 
+out_unlock:
+	mutex_unlock(&ctx->ring_lock);
+	percpu_ref_put(&ctx->users);
 	return rc;
 }
 #endif
@@ -380,7 +404,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 	file = aio_private_file(ctx, nr_pages);
 	if (IS_ERR(file)) {
 		ctx->aio_ring_file = NULL;
-		return -EAGAIN;
+		return -ENOMEM;
 	}
 
 	ctx->aio_ring_file = file;
@@ -415,7 +439,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 
 	if (unlikely(i != nr_pages)) {
 		aio_free_ring(ctx);
-		return -EAGAIN;
+		return -ENOMEM;
 	}
 
 	ctx->mmap_size = nr_pages * PAGE_SIZE;
@@ -429,7 +453,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 	if (IS_ERR((void *)ctx->mmap_base)) {
 		ctx->mmap_size = 0;
 		aio_free_ring(ctx);
-		return -EAGAIN;
+		return -ENOMEM;
 	}
 
 	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
@@ -657,8 +681,13 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	if (!ctx->cpu)
 		goto err;
 
-	if (aio_setup_ring(ctx) < 0)
-		goto err;
+	/* Prevent races with page migration during setup by holding
+	 * the ring_lock mutex.
+	 */
+	mutex_lock(&ctx->ring_lock);
+	err = aio_setup_ring(ctx);
+	if (err < 0)
+		goto err_unlock;
 
 	atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
 	ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
@@ -683,6 +712,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	if (err)
 		goto err_cleanup;
 
+	mutex_unlock(&ctx->ring_lock);
 	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
 		 ctx, ctx->user_id, mm, ctx->nr_events);
 	return ctx;
@@ -691,6 +721,8 @@ err_cleanup:
 	aio_nr_sub(ctx->max_reqs);
 err_ctx:
 	aio_free_ring(ctx);
+err_unlock:
+	mutex_unlock(&ctx->ring_lock);
 err:
 	free_percpu(ctx->cpu);
 	free_percpu(ctx->reqs.pcpu_count);
@@ -1024,6 +1056,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
 
 	mutex_lock(&ctx->ring_lock);
 
+	/* Access to ->ring_pages here is protected by ctx->ring_lock. */
 	ring = kmap_atomic(ctx->ring_pages[0]);
 	head = ring->head;
 	tail = ring->tail;
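
The aio_migratepage() change above follows a "pin the object under the
lookup lock, then take the sleeping lock with trylock" pattern. A minimal
userspace sketch of that pattern, with made-up names and pthreads/atomics
standing in for percpu_ref and the kernel locks:

/* Hypothetical userspace analogue of the aio_migratepage() locking order;
 * names and primitives are stand-ins, not kernel APIs. */
#include <errno.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct ctx {
        atomic_int users;               /* stand-in for percpu_ref */
        pthread_mutex_t ring_lock;
};

static pthread_mutex_t lookup_lock = PTHREAD_MUTEX_INITIALIZER;
static struct ctx *mapping_private;     /* plays the role of mapping->private_data */

static int migrate_one_page(void)
{
        struct ctx *ctx;

        /* Pin the ctx under the lookup lock so it cannot be torn down
         * before we manage to take ring_lock. */
        pthread_mutex_lock(&lookup_lock);
        ctx = mapping_private;
        if (ctx)
                atomic_fetch_add(&ctx->users, 1);
        pthread_mutex_unlock(&lookup_lock);

        if (!ctx)
                return -EINVAL;

        /* Never block here: the caller may already hold locks that rank
         * below ring_lock, so back off instead of risking an inversion. */
        if (pthread_mutex_trylock(&ctx->ring_lock) != 0) {
                atomic_fetch_sub(&ctx->users, 1);
                return -EAGAIN;
        }

        /* ... ring pages are stable here; safe to copy old -> new ... */

        pthread_mutex_unlock(&ctx->ring_lock);
        atomic_fetch_sub(&ctx->users, 1);
        return 0;
}

int main(void)
{
        return migrate_one_page() == -EINVAL ? 0 : 1;   /* no ctx installed */
}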