ping

On Wed, Nov 28, 2018 at 10:35:31AM -0800, Matthew Wilcox wrote:
> This custom resizing array was vulnerable to a Spectre attack (speculating
> off the end of an array to a user-controlled offset). The XArray is
> not vulnerable to Spectre as it always masks its lookups to be within
> the bounds of the array.
> 
> Signed-off-by: Matthew Wilcox <willy@xxxxxxxxxxxxx>
> ---
>  fs/aio.c                 | 182 ++++++++++++++-------------------------
>  include/linux/mm_types.h |   5 +-
>  kernel/fork.c            |   3 +-
>  mm/debug.c               |   6 --
>  4 files changed, 67 insertions(+), 129 deletions(-)
> 
> diff --git a/fs/aio.c b/fs/aio.c
> index 301e6314183b..51ba7446a24f 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -71,12 +71,6 @@ struct aio_ring {
>  
>  #define AIO_RING_PAGES	8
>  
> -struct kioctx_table {
> -	struct rcu_head		rcu;
> -	unsigned		nr;
> -	struct kioctx __rcu	*table[];
> -};
> -
>  struct kioctx_cpu {
>  	unsigned		reqs_available;
>  };
> @@ -313,27 +307,22 @@ static int aio_ring_mremap(struct vm_area_struct *vma)
>  {
>  	struct file *file = vma->vm_file;
>  	struct mm_struct *mm = vma->vm_mm;
> -	struct kioctx_table *table;
> -	int i, res = -EINVAL;
> +	struct kioctx *ctx;
> +	unsigned long index;
> +	int res = -EINVAL;
>  
> -	spin_lock(&mm->ioctx_lock);
> -	rcu_read_lock();
> -	table = rcu_dereference(mm->ioctx_table);
> -	for (i = 0; i < table->nr; i++) {
> -		struct kioctx *ctx;
> -
> -		ctx = rcu_dereference(table->table[i]);
> -		if (ctx && ctx->aio_ring_file == file) {
> -			if (!atomic_read(&ctx->dead)) {
> -				ctx->user_id = ctx->mmap_base = vma->vm_start;
> -				res = 0;
> -			}
> -			break;
> +	xa_lock(&mm->ioctx);
> +	xa_for_each(&mm->ioctx, ctx, index, ULONG_MAX, XA_PRESENT) {
> +		if (ctx->aio_ring_file != file)
> +			continue;
> +		if (!atomic_read(&ctx->dead)) {
> +			ctx->user_id = ctx->mmap_base = vma->vm_start;
> +			res = 0;
>  		}
> +		break;
>  	}
> +	xa_unlock(&mm->ioctx);
>  
> -	rcu_read_unlock();
> -	spin_unlock(&mm->ioctx_lock);
>  	return res;
>  }
>  
> @@ -617,57 +606,21 @@ static void free_ioctx_users(struct percpu_ref *ref)
>  
>  static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
>  {
> -	unsigned i, new_nr;
> -	struct kioctx_table *table, *old;
>  	struct aio_ring *ring;
> +	int err;
>  
> -	spin_lock(&mm->ioctx_lock);
> -	table = rcu_dereference_raw(mm->ioctx_table);
> -
> -	while (1) {
> -		if (table)
> -			for (i = 0; i < table->nr; i++)
> -				if (!rcu_access_pointer(table->table[i])) {
> -					ctx->id = i;
> -					rcu_assign_pointer(table->table[i], ctx);
> -					spin_unlock(&mm->ioctx_lock);
> -
> -					/* While kioctx setup is in progress,
> -					 * we are protected from page migration
> -					 * changes ring_pages by ->ring_lock.
> -					 */
> -					ring = kmap_atomic(ctx->ring_pages[0]);
> -					ring->id = ctx->id;
> -					kunmap_atomic(ring);
> -					return 0;
> -				}
> -
> -		new_nr = (table ? table->nr : 1) * 4;
> -		spin_unlock(&mm->ioctx_lock);
> -
> -		table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
> -				new_nr, GFP_KERNEL);
> -		if (!table)
> -			return -ENOMEM;
> -
> -		table->nr = new_nr;
> -
> -		spin_lock(&mm->ioctx_lock);
> -		old = rcu_dereference_raw(mm->ioctx_table);
> -
> -		if (!old) {
> -			rcu_assign_pointer(mm->ioctx_table, table);
> -		} else if (table->nr > old->nr) {
> -			memcpy(table->table, old->table,
> -			       old->nr * sizeof(struct kioctx *));
> +	err = xa_alloc(&mm->ioctx, &ctx->id, UINT_MAX, ctx, GFP_KERNEL);
> +	if (err)
> +		return err;
>  
> -			rcu_assign_pointer(mm->ioctx_table, table);
> -			kfree_rcu(old, rcu);
> -		} else {
> -			kfree(table);
> -			table = old;
> -		}
> -	}
> +	/*
> +	 * While kioctx setup is in progress, we are protected from
> +	 * page migration changing ring_pages by ->ring_lock.
> +	 */
> +	ring = kmap_atomic(ctx->ring_pages[0]);
> +	ring->id = ctx->id;
> +	kunmap_atomic(ring);
> +	return 0;
>  }
>  
>  static void aio_nr_sub(unsigned nr)
> @@ -793,27 +746,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
>  	return ERR_PTR(err);
>  }
>  
> -/* kill_ioctx
> - *	Cancels all outstanding aio requests on an aio context. Used
> - *	when the processes owning a context have all exited to encourage
> - *	the rapid destruction of the kioctx.
> - */
> -static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
> -		      struct ctx_rq_wait *wait)
> +static void __kill_ioctx(struct kioctx *ctx, struct ctx_rq_wait *wait)
>  {
> -	struct kioctx_table *table;
> -
> -	spin_lock(&mm->ioctx_lock);
> -	if (atomic_xchg(&ctx->dead, 1)) {
> -		spin_unlock(&mm->ioctx_lock);
> -		return -EINVAL;
> -	}
> -
> -	table = rcu_dereference_raw(mm->ioctx_table);
> -	WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id]));
> -	RCU_INIT_POINTER(table->table[ctx->id], NULL);
> -	spin_unlock(&mm->ioctx_lock);
> -
>  	/* free_ioctx_reqs() will do the necessary RCU synchronization */
>  	wake_up_all(&ctx->wait);
>  
> @@ -831,6 +765,30 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
>  
>  	ctx->rq_wait = wait;
>  	percpu_ref_kill(&ctx->users);
> +}
> +
> +/* kill_ioctx
> + *	Cancels all outstanding aio requests on an aio context. Used
> + *	when the processes owning a context have all exited to encourage
> + *	the rapid destruction of the kioctx.
> + */
> +static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
> +		      struct ctx_rq_wait *wait)
> +{
> +	struct kioctx *old;
> +
> +	/* I don't understand what this lock is protecting against */
> +	xa_lock(&mm->ioctx);
> +	if (atomic_xchg(&ctx->dead, 1)) {
> +		xa_unlock(&mm->ioctx);
> +		return -EINVAL;
> +	}
> +
> +	old = __xa_erase(&mm->ioctx, ctx->id);
> +	WARN_ON(old != ctx);
> +	xa_unlock(&mm->ioctx);
> +
> +	__kill_ioctx(ctx, wait);
>  	return 0;
>  }
>  
> @@ -844,26 +802,21 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
>   */
>  void exit_aio(struct mm_struct *mm)
>  {
> -	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
> +	struct kioctx *ctx;
>  	struct ctx_rq_wait wait;
> -	int i, skipped;
> +	unsigned long index;
>  
> -	if (!table)
> +	if (xa_empty(&mm->ioctx))
>  		return;
>  
> -	atomic_set(&wait.count, table->nr);
> +	/*
> +	 * Prevent count from getting to 0 until we're ready for it
> +	 */
> +	atomic_set(&wait.count, 1);
>  	init_completion(&wait.comp);
>  
> -	skipped = 0;
> -	for (i = 0; i < table->nr; ++i) {
> -		struct kioctx *ctx =
> -			rcu_dereference_protected(table->table[i], true);
> -
> -		if (!ctx) {
> -			skipped++;
> -			continue;
> -		}
> -
> +	xa_lock(&mm->ioctx);
> +	xa_for_each(&mm->ioctx, ctx, index, ULONG_MAX, XA_PRESENT) {
>  		/*
>  		 * We don't need to bother with munmap() here - exit_mmap(mm)
>  		 * is coming and it'll unmap everything. And we simply can't,
> @@ -872,16 +825,16 @@ void exit_aio(struct mm_struct *mm)
>  		 * that it needs to unmap the area, just set it to 0.
>  		 */
>  		ctx->mmap_size = 0;
> -		kill_ioctx(mm, ctx, &wait);
> +		atomic_inc(&wait.count);
> +		__xa_erase(&mm->ioctx, ctx->id);
> +		__kill_ioctx(ctx, &wait);
>  	}
> +	xa_unlock(&mm->ioctx);
>  
> -	if (!atomic_sub_and_test(skipped, &wait.count)) {
> +	if (!atomic_sub_and_test(1, &wait.count)) {
>  		/* Wait until all IO for the context are done. */
>  		wait_for_completion(&wait.comp);
>  	}
> -
> -	RCU_INIT_POINTER(mm->ioctx_table, NULL);
> -	kfree(table);
>  }
>  
>  static void put_reqs_available(struct kioctx *ctx, unsigned nr)
> @@ -1026,24 +979,17 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
>  	struct aio_ring __user *ring = (void __user *)ctx_id;
>  	struct mm_struct *mm = current->mm;
>  	struct kioctx *ctx, *ret = NULL;
> -	struct kioctx_table *table;
>  	unsigned id;
>  
>  	if (get_user(id, &ring->id))
>  		return NULL;
>  
>  	rcu_read_lock();
> -	table = rcu_dereference(mm->ioctx_table);
> -
> -	if (!table || id >= table->nr)
> -		goto out;
> -
> -	ctx = rcu_dereference(table->table[id]);
> +	ctx = xa_load(&mm->ioctx, id);
>  	if (ctx && ctx->user_id == ctx_id) {
>  		if (percpu_ref_tryget_live(&ctx->users))
>  			ret = ctx;
>  	}
> -out:
>  	rcu_read_unlock();
>  	return ret;
>  }
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 5ed8f6292a53..1a95bb27f5a7 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -14,6 +14,7 @@
>  #include <linux/uprobes.h>
>  #include <linux/page-flags-layout.h>
>  #include <linux/workqueue.h>
> +#include <linux/xarray.h>
>  
>  #include <asm/mmu.h>
>  
> @@ -336,7 +337,6 @@ struct core_state {
>  	struct completion startup;
>  };
>  
> -struct kioctx_table;
>  struct mm_struct {
>  	struct {
>  		struct vm_area_struct *mmap;		/* list of VMAs */
> @@ -431,8 +431,7 @@ struct mm_struct {
>  		atomic_t membarrier_state;
>  #endif
>  #ifdef CONFIG_AIO
> -		spinlock_t			ioctx_lock;
> -		struct kioctx_table __rcu	*ioctx_table;
> +		struct xarray			ioctx;
>  #endif
>  #ifdef CONFIG_MEMCG
>  		/*
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 07cddff89c7b..acb775f9592d 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -946,8 +946,7 @@ __setup("coredump_filter=", coredump_filter_setup);
>  static void mm_init_aio(struct mm_struct *mm)
>  {
>  #ifdef CONFIG_AIO
> -	spin_lock_init(&mm->ioctx_lock);
> -	mm->ioctx_table = NULL;
> +	xa_init_flags(&mm->ioctx, XA_FLAGS_ALLOC);
>  #endif
>  }
>  
> diff --git a/mm/debug.c b/mm/debug.c
> index cdacba12e09a..d45ec63aed8c 100644
> --- a/mm/debug.c
> +++ b/mm/debug.c
> @@ -127,9 +127,6 @@ void dump_mm(const struct mm_struct *mm)
>  		"start_brk %lx brk %lx start_stack %lx\n"
>  		"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
>  		"binfmt %px flags %lx core_state %px\n"
> -#ifdef CONFIG_AIO
> -		"ioctx_table %px\n"
> -#endif
>  #ifdef CONFIG_MEMCG
>  		"owner %px "
>  #endif
> @@ -158,9 +155,6 @@ void dump_mm(const struct mm_struct *mm)
>  		mm->start_brk, mm->brk, mm->start_stack,
>  		mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
>  		mm->binfmt, mm->flags, mm->core_state,
> -#ifdef CONFIG_AIO
> -		mm->ioctx_table,
> -#endif
> #ifdef CONFIG_MEMCG
>  		mm->owner,
>  #endif
> -- 
> 2.19.1
> 
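
For anyone less familiar with the allocating XArray, the pattern the patch leans on is roughly the sketch below.  This is only an illustration, not part of the patch: the struct foo / foo_registry names are made up, and it uses the xa_alloc() and xa_for_each() call forms exactly as they appear in the patch (the XArray API was still settling around this time, so later kernels spell these slightly differently).

#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/xarray.h>

struct foo {
	u32		id;		/* index chosen by xa_alloc() */
	unsigned long	cookie;
};

static struct xarray foo_registry;

static void foo_registry_init(void)
{
	/* XA_FLAGS_ALLOC makes this an allocating XArray, as in mm_init_aio() */
	xa_init_flags(&foo_registry, XA_FLAGS_ALLOC);
}

/* Insert: the XArray picks a free index and writes it to foo->id. */
static int foo_register(struct foo *foo)
{
	return xa_alloc(&foo_registry, &foo->id, UINT_MAX, foo, GFP_KERNEL);
}

/*
 * Lookup by a possibly user-controlled index.  The walk is bounded by
 * the structure itself, so there is no caller-side range check for
 * speculation to skip.
 */
static struct foo *foo_find(u32 id)
{
	return xa_load(&foo_registry, id);
}

/* Iterate and remove while holding the XArray's internal spinlock. */
static void foo_remove_by_cookie(unsigned long cookie)
{
	struct foo *foo;
	unsigned long index;

	xa_lock(&foo_registry);
	xa_for_each(&foo_registry, foo, index, ULONG_MAX, XA_PRESENT) {
		if (foo->cookie != cookie)
			continue;
		__xa_erase(&foo_registry, index);
		break;
	}
	xa_unlock(&foo_registry);
}

The property the changelog is after shows up in foo_find(): the lookup is bounded inside the XArray rather than by a caller-side "id >= table->nr" comparison, which is exactly the check the old kioctx_table code relied on and that Spectre could speculate past.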