Hi Ben, On 07/16/2013 09:34 PM, Benjamin LaHaise wrote: > On Tue, Jul 16, 2013 at 05:56:16PM +0800, Gu Zheng wrote: >> As the aio job will pin the ring pages, that will lead to mem migrated >> failed. In order to fix this problem we use an anon inode to manage the aio ring >> pages, and setup the migratepage callback in the anon inode's address space, so >> that when mem migrating the aio ring pages will be moved to other mem node safely. > > There are a few minor issues that needed to be fixed -- see below. I've > made these changes and added them to git://git.kvack.org/~bcrl/aio-next.git , > and will ask for that tree to be included in linux-next. Thanks very much, and your review. Stephen sent out a build failed msg when merger this patch into next-tree from your aio_next. This is because we use migrate_page_move_mapping() which is protected by CONFIG_MIGRATION, I'll fix this issue in the next version. Best regards, Gu > > mm folks: can someone familiar with page migration / hot plug memory please > review the migration changes? > >> >> Signed-off-by: Gu Zheng <guz.fnst@xxxxxxxxxxxxxx> >> Signed-off-by: Benjamin LaHaise <bcrl@xxxxxxxxx> > > Again, I had not provided my Signed-off-by on this patch previously, so > don't add it for me. Sorry again.:) > >> --- >> fs/aio.c | 120 ++++++++++++++++++++++++++++++++++++++++++---- >> include/linux/migrate.h | 3 + >> mm/migrate.c | 2 +- >> 3 files changed, 113 insertions(+), 12 deletions(-) >> >> diff --git a/fs/aio.c b/fs/aio.c >> index 9b5ca11..d10f956 100644 >> --- a/fs/aio.c >> +++ b/fs/aio.c >> @@ -35,6 +35,9 @@ >> #include <linux/eventfd.h> >> #include <linux/blkdev.h> >> #include <linux/compat.h> >> +#include <linux/anon_inodes.h> >> +#include <linux/migrate.h> >> +#include <linux/ramfs.h> >> >> #include <asm/kmap_types.h> >> #include <asm/uaccess.h> >> @@ -110,6 +113,7 @@ struct kioctx { >> } ____cacheline_aligned_in_smp; >> >> struct page *internal_pages[AIO_RING_PAGES]; >> + struct file *aio_ring_file; >> }; >> >> /*------ sysctl variables----*/ >> @@ -138,15 +142,78 @@ __initcall(aio_setup); >> >> static void aio_free_ring(struct kioctx *ctx) >> { >> - long i; >> + int i; >> + struct file *aio_ring_file = ctx->aio_ring_file; >> >> - for (i = 0; i < ctx->nr_pages; i++) >> + for (i = 0; i < ctx->nr_pages; i++) { >> + pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i, >> + page_count(ctx->ring_pages[i])); >> put_page(ctx->ring_pages[i]); >> + } >> >> if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) >> kfree(ctx->ring_pages); >> + >> + if (aio_ring_file) { >> + truncate_setsize(aio_ring_file->f_inode, 0); >> + pr_debug("pid(%d) i_nlink=%u d_count=%d d_unhashed=%d i_count=%d\n", >> + current->pid, aio_ring_file->f_inode->i_nlink, >> + aio_ring_file->f_path.dentry->d_count, >> + d_unhashed(aio_ring_file->f_path.dentry), >> + atomic_read(&aio_ring_file->f_inode->i_count)); >> + fput(aio_ring_file); >> + ctx->aio_ring_file = NULL; >> + } >> +} >> + >> +static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) >> +{ >> + vma->vm_ops = &generic_file_vm_ops; >> + return 0; >> +} >> + >> +static const struct file_operations aio_ring_fops = { >> + .mmap = aio_ring_mmap, >> +}; >> + >> +static int aio_set_page_dirty(struct page *page) >> +{ >> + return 0; >> } >> >> +static int aio_migratepage(struct address_space *mapping, struct page *new, >> + struct page *old, enum migrate_mode mode) >> +{ >> + struct kioctx *ctx = mapping->private_data; >> + unsigned long flags; >> + unsigned idx = old->index; >> + int rc; >> + >> + /*Writeback must be complete*/ > > Missing spaces before/after beginning and end of comment. > >> + BUG_ON(PageWriteback(old)); >> + put_page(old); >> + >> + rc = migrate_page_move_mapping(mapping, new, old, NULL, mode); >> + if (rc != MIGRATEPAGE_SUCCESS) { >> + get_page(old); >> + return rc; >> + } >> + >> + get_page(new); >> + >> + spin_lock_irqsave(&ctx->completion_lock, flags); >> + migrate_page_copy(new, old); >> + ctx->ring_pages[idx] = new; >> + spin_unlock_irqrestore(&ctx->completion_lock, flags); >> + >> + return rc; >> +} >> + >> +static const struct address_space_operations aio_ctx_aops = { >> + .set_page_dirty = aio_set_page_dirty, >> + .migratepage = aio_migratepage, >> +}; >> + >> static int aio_setup_ring(struct kioctx *ctx) >> { >> struct aio_ring *ring; >> @@ -154,20 +221,45 @@ static int aio_setup_ring(struct kioctx *ctx) >> struct mm_struct *mm = current->mm; >> unsigned long size, populate; >> int nr_pages; >> + int i; >> + struct file *file; >> >> /* Compensate for the ring buffer's head/tail overlap entry */ >> nr_events += 2; /* 1 is required, 2 for good luck */ >> >> size = sizeof(struct aio_ring); >> size += sizeof(struct io_event) * nr_events; >> - nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; >> >> + nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; > > Hrm, this should probably be replaced by PFN_UP(size) rather than the old > open coding of same. > >> if (nr_pages < 0) >> return -EINVAL; >> >> - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); >> + file = anon_inode_getfile_private("[aio]", &aio_ring_fops, ctx, O_RDWR); >> + if (IS_ERR(file)) { >> + ctx->aio_ring_file = NULL; >> + return -EAGAIN; >> + } >> + >> + file->f_inode->i_mapping->a_ops = &aio_ctx_aops; >> + file->f_inode->i_mapping->private_data = ctx; >> + file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages; >> + >> + for (i = 0; i < nr_pages; i++) { >> + struct page *page; >> + page = find_or_create_page(file->f_inode->i_mapping, >> + i, GFP_HIGHUSER | __GFP_ZERO); >> + if (!page) >> + break; >> + pr_debug("pid(%d) page[%d]->count=%d\n", >> + current->pid, i, page_count(page)); >> + SetPageUptodate(page); >> + SetPageDirty(page); >> + unlock_page(page); >> + } >> + ctx->aio_ring_file = file; >> + nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) >> + / sizeof(struct io_event); >> >> - ctx->nr_events = 0; >> ctx->ring_pages = ctx->internal_pages; >> if (nr_pages > AIO_RING_PAGES) { >> ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), >> @@ -178,28 +270,31 @@ static int aio_setup_ring(struct kioctx *ctx) >> >> ctx->mmap_size = nr_pages * PAGE_SIZE; >> pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); >> + >> down_write(&mm->mmap_sem); >> - ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size, >> - PROT_READ|PROT_WRITE, >> - MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate); >> + ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, >> + PROT_READ | PROT_WRITE, >> + MAP_SHARED | MAP_POPULATE, 0, &populate); >> if (IS_ERR((void *)ctx->mmap_base)) { >> up_write(&mm->mmap_sem); >> ctx->mmap_size = 0; >> aio_free_ring(ctx); >> return -EAGAIN; >> } >> + up_write(&mm->mmap_sem); >> + >> + mm_populate(ctx->mmap_base, populate); >> >> pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); >> ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, >> 1, 0, ctx->ring_pages, NULL); >> - up_write(&mm->mmap_sem); >> + for (i = 0; i < ctx->nr_pages; i++) >> + put_page(ctx->ring_pages[i]); >> >> if (unlikely(ctx->nr_pages != nr_pages)) { >> aio_free_ring(ctx); >> return -EAGAIN; >> } >> - if (populate) >> - mm_populate(ctx->mmap_base, populate); >> >> ctx->user_id = ctx->mmap_base; >> ctx->nr_events = nr_events; /* trusted copy */ >> @@ -399,6 +494,8 @@ out_cleanup: >> err = -EAGAIN; >> aio_free_ring(ctx); >> out_freectx: >> + if (ctx->aio_ring_file) >> + fput(ctx->aio_ring_file); >> kmem_cache_free(kioctx_cachep, ctx); >> pr_debug("error allocating ioctx %d\n", err); >> return ERR_PTR(err); >> @@ -852,6 +949,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) >> ioctx = ioctx_alloc(nr_events); >> ret = PTR_ERR(ioctx); >> if (!IS_ERR(ioctx)) { >> + ctx = ioctx->user_id; >> ret = put_user(ioctx->user_id, ctxp); >> if (ret) >> kill_ioctx(ioctx); > > This hunk doesn't do anything and needs to be removed. That should cover > things. > > -ben > >> diff --git a/include/linux/migrate.h b/include/linux/migrate.h >> index a405d3dc..c407d88 100644 >> --- a/include/linux/migrate.h >> +++ b/include/linux/migrate.h >> @@ -55,6 +55,9 @@ extern int migrate_vmas(struct mm_struct *mm, >> extern void migrate_page_copy(struct page *newpage, struct page *page); >> extern int migrate_huge_page_move_mapping(struct address_space *mapping, >> struct page *newpage, struct page *page); >> +extern int migrate_page_move_mapping(struct address_space *mapping, >> + struct page *newpage, struct page *page, >> + struct buffer_head *head, enum migrate_mode mode); >> #else >> >> static inline void putback_lru_pages(struct list_head *l) {} >> diff --git a/mm/migrate.c b/mm/migrate.c >> index 6f0c244..1da0092 100644 >> --- a/mm/migrate.c >> +++ b/mm/migrate.c >> @@ -307,7 +307,7 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, >> * 2 for pages with a mapping >> * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. >> */ >> -static int migrate_page_move_mapping(struct address_space *mapping, >> +int migrate_page_move_mapping(struct address_space *mapping, >> struct page *newpage, struct page *page, >> struct buffer_head *head, enum migrate_mode mode) >> { >> -- >> 1.7.7 >> > -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html