Hi all, Based on the issues reported by Tang and Gu, I've come up with the an alternative fix that avoids adding additional locking in the event read code path. The fix is to take the ring_lock mutex during page migration, which is already used to syncronize event readers and thus does not add any new locking requirements in aio_read_events_ring(). I've dropped the patches from Tang and Gu as a result. This patch is now in my git://git.kvack.org/~bcrl/aio-next.git tree and will be sent to Linus once a few other people chime in with their reviews of this change. Please review Tang, Gu. Thanks! -ben -- "Thought is the essence of where you are now." >From e52ddd946ab1def55c8282c8b3d0e80403abae12 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise <bcrl@xxxxxxxxx> Date: Fri, 21 Mar 2014 14:26:43 -0400 Subject: [PATCH] aio: ensure access to ctx->ring_pages is correctly serialised As reported by Tang Chen and Gu Zheng, the following issues exist in the aio ring page migration support. Tang Chen reported: As a result, for example, we have the following problem: thread 1 | thread 2 | aio_migratepage() | |-> take ctx->completion_lock | |-> migrate_page_copy(new, old) | | *NOW*, ctx->ring_pages[idx] == old | | | *NOW*, ctx->ring_pages[idx] == old | aio_read_events_ring() | |-> ring = kmap_atomic(ctx->ring_pages[0]) | |-> ring->head = head; *HERE, write to the old ring page* | |-> kunmap_atomic(ring); | |-> ctx->ring_pages[idx] = new | | *BUT NOW*, the content of | | ring_pages[idx] is old. | |-> release ctx->completion_lock | As above, the new ring page will not be updated. Fix this issue, as well as prevent races in aio_ring_setup() by taking the ring_lock mutex during page migration and where otherwise applicable. This avoids the overhead of taking another spinlock in aio_read_events_ring() as Tang's and Gu's original fix did, pushing the overhead into the migration code. Signed-off-by: Benjamin LaHaise <bcrl@xxxxxxxxx> Cc: Tang Chen <tangchen@xxxxxxxxxxxxxx> Cc: Gu Zheng <guz.fnst@xxxxxxxxxxxxxx> --- fs/aio.c | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 062a5f6..c97cee8 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -241,8 +241,10 @@ static void put_aio_ring_file(struct kioctx *ctx) static void aio_free_ring(struct kioctx *ctx) { + unsigned long flags; int i; + spin_lock_irqsave(&ctx->completion_lock, flags); for (i = 0; i < ctx->nr_pages; i++) { struct page *page; pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i, @@ -253,6 +255,7 @@ static void aio_free_ring(struct kioctx *ctx) ctx->ring_pages[i] = NULL; put_page(page); } + spin_unlock_irqrestore(&ctx->completion_lock, flags); put_aio_ring_file(ctx); @@ -287,9 +290,20 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, rc = 0; - /* Make sure the old page hasn't already been changed */ + /* Get a reference on the ioctx so we can take the ring_lock mutex. */ spin_lock(&mapping->private_lock); ctx = mapping->private_data; + if (ctx) + percpu_ref_get(&ctx->users); + spin_unlock(&mapping->private_lock); + + if (!ctx) + return -EINVAL; + + mutex_lock(&ctx->ring_lock); + + /* Make sure the old page hasn't already been changed */ + spin_lock(&mapping->private_lock); if (ctx) { pgoff_t idx; spin_lock_irqsave(&ctx->completion_lock, flags); @@ -305,7 +319,7 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, spin_unlock(&mapping->private_lock); if (rc != 0) - return rc; + goto out_unlock; /* Writeback must be complete */ BUG_ON(PageWriteback(old)); @@ -314,7 +328,7 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1); if (rc != MIGRATEPAGE_SUCCESS) { put_page(new); - return rc; + goto out_unlock; } /* We can potentially race against kioctx teardown here. Use the @@ -346,6 +360,9 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, else put_page(new); +out_unlock: + mutex_unlock(&ctx->ring_lock); + percpu_ref_put(&ctx->users); return rc; } #endif @@ -556,9 +573,17 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); + /* + * Accessing ring pages must be done + * holding ctx->completion_lock to + * prevent aio ring page migration + * procedure from migrating ring pages. + */ + spin_lock_irq(&ctx->completion_lock); ring = kmap_atomic(ctx->ring_pages[0]); ring->id = ctx->id; kunmap_atomic(ring); + spin_unlock_irq(&ctx->completion_lock); return 0; } @@ -657,7 +682,13 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) if (!ctx->cpu) goto err; - if (aio_setup_ring(ctx) < 0) + /* Prevent races with page migration in aio_setup_ring() by holding + * the ring_lock mutex. + */ + mutex_lock(&ctx->ring_lock); + err = aio_setup_ring(ctx); + mutex_unlock(&ctx->ring_lock); + if (err < 0) goto err; atomic_set(&ctx->reqs_available, ctx->nr_events - 1); @@ -1024,6 +1055,7 @@ static long aio_read_events_ring(struct kioctx *ctx, mutex_lock(&ctx->ring_lock); + /* Access to ->ring_pages here is protected by ctx->ring_lock. */ ring = kmap_atomic(ctx->ring_pages[0]); head = ring->head; tail = ring->tail; -- 1.8.2.1 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html