The aio code's custom resizing array (struct kioctx_table) was
vulnerable to a Spectre attack: lookup_ioctx() could be induced to
speculate off the end of the array at a user-controlled offset.  The
XArray is not vulnerable to Spectre, as it always masks its lookups to
lie within the bounds of the array.

Signed-off-by: Matthew Wilcox <willy@xxxxxxxxxxxxx>
---
 fs/aio.c                 | 182 ++++++++++++++-------------------------
 include/linux/mm_types.h |   5 +-
 kernel/fork.c            |   3 +-
 mm/debug.c               |   6 --
 4 files changed, 67 insertions(+), 129 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 301e6314183b..51ba7446a24f 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -71,12 +71,6 @@ struct aio_ring {
 
 #define AIO_RING_PAGES	8
 
-struct kioctx_table {
-	struct rcu_head		rcu;
-	unsigned		nr;
-	struct kioctx __rcu	*table[];
-};
-
 struct kioctx_cpu {
 	unsigned		reqs_available;
 };
@@ -313,27 +307,22 @@ static int aio_ring_mremap(struct vm_area_struct *vma)
 {
 	struct file *file = vma->vm_file;
 	struct mm_struct *mm = vma->vm_mm;
-	struct kioctx_table *table;
-	int i, res = -EINVAL;
+	struct kioctx *ctx;
+	unsigned long index;
+	int res = -EINVAL;
 
-	spin_lock(&mm->ioctx_lock);
-	rcu_read_lock();
-	table = rcu_dereference(mm->ioctx_table);
-	for (i = 0; i < table->nr; i++) {
-		struct kioctx *ctx;
-
-		ctx = rcu_dereference(table->table[i]);
-		if (ctx && ctx->aio_ring_file == file) {
-			if (!atomic_read(&ctx->dead)) {
-				ctx->user_id = ctx->mmap_base = vma->vm_start;
-				res = 0;
-			}
-			break;
+	xa_lock(&mm->ioctx);
+	xa_for_each(&mm->ioctx, ctx, index, ULONG_MAX, XA_PRESENT) {
+		if (ctx->aio_ring_file != file)
+			continue;
+		if (!atomic_read(&ctx->dead)) {
+			ctx->user_id = ctx->mmap_base = vma->vm_start;
+			res = 0;
 		}
+		break;
 	}
+	xa_unlock(&mm->ioctx);
 
-	rcu_read_unlock();
-	spin_unlock(&mm->ioctx_lock);
 	return res;
 }
 
@@ -617,57 +606,21 @@ static void free_ioctx_users(struct percpu_ref *ref)
 
 static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 {
-	unsigned i, new_nr;
-	struct kioctx_table *table, *old;
 	struct aio_ring *ring;
+	int err;
 
-	spin_lock(&mm->ioctx_lock);
-	table = rcu_dereference_raw(mm->ioctx_table);
-
-	while (1) {
-		if (table)
-			for (i = 0; i < table->nr; i++)
-				if (!rcu_access_pointer(table->table[i])) {
-					ctx->id = i;
-					rcu_assign_pointer(table->table[i], ctx);
-					spin_unlock(&mm->ioctx_lock);
-
-					/* While kioctx setup is in progress,
-					 * we are protected from page migration
-					 * changes ring_pages by ->ring_lock.
-					 */
-					ring = kmap_atomic(ctx->ring_pages[0]);
-					ring->id = ctx->id;
-					kunmap_atomic(ring);
-					return 0;
-				}
-
-		new_nr = (table ? table->nr : 1) * 4;
-		spin_unlock(&mm->ioctx_lock);
-
-		table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
-				new_nr, GFP_KERNEL);
-		if (!table)
-			return -ENOMEM;
-
-		table->nr = new_nr;
-
-		spin_lock(&mm->ioctx_lock);
-		old = rcu_dereference_raw(mm->ioctx_table);
-
-		if (!old) {
-			rcu_assign_pointer(mm->ioctx_table, table);
-		} else if (table->nr > old->nr) {
-			memcpy(table->table, old->table,
-			       old->nr * sizeof(struct kioctx *));
+	err = xa_alloc(&mm->ioctx, &ctx->id, UINT_MAX, ctx, GFP_KERNEL);
+	if (err)
+		return err;
 
-			rcu_assign_pointer(mm->ioctx_table, table);
-			kfree_rcu(old, rcu);
-		} else {
-			kfree(table);
-			table = old;
-		}
-	}
+	/*
+	 * While kioctx setup is in progress, we are protected from
+	 * page migration changing ring_pages by ->ring_lock.
+	 */
+	ring = kmap_atomic(ctx->ring_pages[0]);
+	ring->id = ctx->id;
+	kunmap_atomic(ring);
+	return 0;
 }
 
 static void aio_nr_sub(unsigned nr)
@@ -793,27 +746,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	return ERR_PTR(err);
 }
 
-/* kill_ioctx
- *	Cancels all outstanding aio requests on an aio context.  Used
- *	when the processes owning a context have all exited to encourage
- *	the rapid destruction of the kioctx.
- */
-static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
-		      struct ctx_rq_wait *wait)
+static void __kill_ioctx(struct kioctx *ctx, struct ctx_rq_wait *wait)
 {
-	struct kioctx_table *table;
-
-	spin_lock(&mm->ioctx_lock);
-	if (atomic_xchg(&ctx->dead, 1)) {
-		spin_unlock(&mm->ioctx_lock);
-		return -EINVAL;
-	}
-
-	table = rcu_dereference_raw(mm->ioctx_table);
-	WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id]));
-	RCU_INIT_POINTER(table->table[ctx->id], NULL);
-	spin_unlock(&mm->ioctx_lock);
-
 	/* free_ioctx_reqs() will do the necessary RCU synchronization */
 	wake_up_all(&ctx->wait);
 
@@ -831,6 +765,30 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 
 	ctx->rq_wait = wait;
 	percpu_ref_kill(&ctx->users);
+}
+
+/* kill_ioctx
+ *	Cancels all outstanding aio requests on an aio context.  Used
+ *	when the processes owning a context have all exited to encourage
+ *	the rapid destruction of the kioctx.
+ */
+static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
+		      struct ctx_rq_wait *wait)
+{
+	struct kioctx *old;
+
+	/* I don't understand what this lock is protecting against */
+	xa_lock(&mm->ioctx);
+	if (atomic_xchg(&ctx->dead, 1)) {
+		xa_unlock(&mm->ioctx);
+		return -EINVAL;
+	}
+
+	old = __xa_erase(&mm->ioctx, ctx->id);
+	WARN_ON(old != ctx);
+	xa_unlock(&mm->ioctx);
+
+	__kill_ioctx(ctx, wait);
 	return 0;
 }
 
@@ -844,26 +802,21 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
  */
 void exit_aio(struct mm_struct *mm)
 {
-	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
+	struct kioctx *ctx;
 	struct ctx_rq_wait wait;
-	int i, skipped;
+	unsigned long index;
 
-	if (!table)
+	if (xa_empty(&mm->ioctx))
 		return;
 
-	atomic_set(&wait.count, table->nr);
+	/*
+	 * Prevent count from getting to 0 until we're ready for it
+	 */
+	atomic_set(&wait.count, 1);
 	init_completion(&wait.comp);
 
-	skipped = 0;
-	for (i = 0; i < table->nr; ++i) {
-		struct kioctx *ctx =
-			rcu_dereference_protected(table->table[i], true);
-
-		if (!ctx) {
-			skipped++;
-			continue;
-		}
-
+	xa_lock(&mm->ioctx);
+	xa_for_each(&mm->ioctx, ctx, index, ULONG_MAX, XA_PRESENT) {
 		/*
 		 * We don't need to bother with munmap() here - exit_mmap(mm)
 		 * is coming and it'll unmap everything.  And we simply can't,
@@ -872,16 +825,16 @@ void exit_aio(struct mm_struct *mm)
 		 * that it needs to unmap the area, just set it to 0.
 		 */
 		ctx->mmap_size = 0;
-		kill_ioctx(mm, ctx, &wait);
+		atomic_inc(&wait.count);
+		__xa_erase(&mm->ioctx, ctx->id);
+		__kill_ioctx(ctx, &wait);
 	}
+	xa_unlock(&mm->ioctx);
 
-	if (!atomic_sub_and_test(skipped, &wait.count)) {
+	if (!atomic_sub_and_test(1, &wait.count)) {
 		/* Wait until all IO for the context are done. */
 		wait_for_completion(&wait.comp);
 	}
-
-	RCU_INIT_POINTER(mm->ioctx_table, NULL);
-	kfree(table);
 }
 
 static void put_reqs_available(struct kioctx *ctx, unsigned nr)
@@ -1026,24 +979,17 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 	struct aio_ring __user *ring  = (void __user *)ctx_id;
 	struct mm_struct *mm = current->mm;
 	struct kioctx *ctx, *ret = NULL;
-	struct kioctx_table *table;
 	unsigned id;
 
 	if (get_user(id, &ring->id))
 		return NULL;
 
 	rcu_read_lock();
-	table = rcu_dereference(mm->ioctx_table);
-
-	if (!table || id >= table->nr)
-		goto out;
-
-	ctx = rcu_dereference(table->table[id]);
+	ctx = xa_load(&mm->ioctx, id);
 	if (ctx && ctx->user_id == ctx_id) {
 		if (percpu_ref_tryget_live(&ctx->users))
 			ret = ctx;
 	}
-out:
 	rcu_read_unlock();
 	return ret;
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5ed8f6292a53..1a95bb27f5a7 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -14,6 +14,7 @@
 #include <linux/uprobes.h>
 #include <linux/page-flags-layout.h>
 #include <linux/workqueue.h>
+#include <linux/xarray.h>
 
 #include <asm/mmu.h>
 
@@ -336,7 +337,6 @@ struct core_state {
 	struct completion startup;
 };
 
-struct kioctx_table;
 struct mm_struct {
 	struct {
 		struct vm_area_struct *mmap;		/* list of VMAs */
@@ -431,8 +431,7 @@ struct mm_struct {
 		atomic_t membarrier_state;
 #endif
 #ifdef CONFIG_AIO
-		spinlock_t			ioctx_lock;
-		struct kioctx_table __rcu	*ioctx_table;
+		struct xarray ioctx;
 #endif
 #ifdef CONFIG_MEMCG
 		/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 07cddff89c7b..acb775f9592d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -946,8 +946,7 @@ __setup("coredump_filter=", coredump_filter_setup);
 static void mm_init_aio(struct mm_struct *mm)
 {
 #ifdef CONFIG_AIO
-	spin_lock_init(&mm->ioctx_lock);
-	mm->ioctx_table = NULL;
+	xa_init_flags(&mm->ioctx, XA_FLAGS_ALLOC);
 #endif
 }
 
diff --git a/mm/debug.c b/mm/debug.c
index cdacba12e09a..d45ec63aed8c 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -127,9 +127,6 @@ void dump_mm(const struct mm_struct *mm)
 		"start_brk %lx brk %lx start_stack %lx\n"
 		"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
 		"binfmt %px flags %lx core_state %px\n"
-#ifdef CONFIG_AIO
-		"ioctx_table %px\n"
-#endif
 #ifdef CONFIG_MEMCG
 		"owner %px "
 #endif
@@ -158,9 +155,6 @@ void dump_mm(const struct mm_struct *mm)
 		mm->start_brk, mm->brk, mm->start_stack,
 		mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
 		mm->binfmt, mm->flags, mm->core_state,
-#ifdef CONFIG_AIO
-		mm->ioctx_table,
-#endif
#ifdef CONFIG_MEMCG
 		mm->owner,
 #endif
-- 
2.19.1
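
P.S. A sketch of the attack pattern for anyone who hasn't seen it; the
snippets below are illustrative only and not part of this patch.  In
the old lookup_ioctx(), the CPU can mispredict the bounds check and
speculatively execute the load with the attacker-chosen 'id', leaving a
cache footprint that leaks data:

	if (!table || id >= table->nr)	/* predicted not taken */
		goto out;
	/* speculative load at a user-controlled index */
	ctx = rcu_dereference(table->table[id]);

Had we kept the open-coded array, the usual mitigation would be
array_index_nospec() from <linux/nospec.h>, which clamps an
out-of-bounds index to 0 under speculation, without a branch:

	id = array_index_nospec(id, table->nr);
	ctx = rcu_dereference(table->table[id]);

The XArray avoids the problem structurally: a lookup walks down the
tree consuming the index a few bits at a time, and the offset into each
node is computed as (index >> node->shift) & XA_CHUNK_MASK, so no slot
access can be steered past the end of a node's slot array.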