From: Zi Yan <ziy@xxxxxxxxxx>

Users can use the syscall to exchange two lists of pages, similar to
move_pages() syscall.

Signed-off-by: Zi Yan <ziy@xxxxxxxxxx>
---
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 include/linux/syscalls.h               |   5 +
 mm/exchange.c                          | 359 +++++++++++++++++++++++++++++++++
 3 files changed, 365 insertions(+)

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 92ee0b4..863a21e 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -343,6 +343,7 @@
 332	common	statx			__x64_sys_statx
 333	common	io_pgetevents		__x64_sys_io_pgetevents
 334	common	rseq			__x64_sys_rseq
+335	common	exchange_pages		__x64_sys_exchange_pages
 # don't use numbers 387 through 423, add new calls after the last
 # 'common' entry
 424	common	pidfd_send_signal	__x64_sys_pidfd_send_signal
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e446806..2c1eb49 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1203,6 +1203,11 @@ asmlinkage long sys_mmap_pgoff(unsigned long addr, unsigned long len,
 			unsigned long fd, unsigned long pgoff);
 asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg);
 
+asmlinkage long sys_exchange_pages(pid_t pid, unsigned long nr_pages,
+				const void __user * __user *from_pages,
+				const void __user * __user *to_pages,
+				int __user *status,
+				int flags);
 
 /*
  * Not a real system call, but a placeholder for syscalls which are
diff --git a/mm/exchange.c b/mm/exchange.c
index 45c7013..48e344e 100644
--- a/mm/exchange.c
+++ b/mm/exchange.c
@@ -22,6 +22,7 @@
 #include <linux/buffer_head.h>
 #include <linux/fs.h> /* buffer_migrate_page */
 #include <linux/backing-dev.h>
+#include <linux/sched/mm.h>
 
 #include "internal.h"
 
@@ -1212,3 +1213,361 @@ int exchange_pages_concur(struct list_head *exchange_list,
 
 	return nr_failed?-EFAULT:0;
 }
+
+/*
+ * Write @value into @nr consecutive entries of the user-space @status array,
+ * starting at index @start.
+ *
+ * Returns 0 on success or -EFAULT if put_user() fails.
+ */
+static int store_status(int __user *status, int start, int value, int nr)
+{
+	while (nr-- > 0) {
+		if (put_user(value, status + start))
+			return -EFAULT;
+		start++;
+	}
+
+	return 0;
+}
+
+/*
+ * Pair the isolated pages on @from_pagelist and @to_pagelist in list order
+ * and exchange every pair, through exchange_pages_concur() when
+ * @migrate_concur is set and through exchange_pages() otherwise.
+ *
+ * Pages left unpaired (pair allocation failure, or one list running out
+ * first) are given to putback_movable_pages().
+ *
+ * Returns -ENOMEM if a pair descriptor could not be allocated, otherwise the
+ * return value of the exchange routine.
+ */
+static int do_exchange_page_list(struct mm_struct *mm,
+		struct list_head *from_pagelist, struct list_head *to_pagelist,
+		bool migrate_mt, bool migrate_concur)
+{
+	int err = 0, ret;
+	struct exchange_page_info *one_pair;
+	LIST_HEAD(exchange_page_list);
+
+	while (!list_empty(from_pagelist)) {
+		struct page *from_page, *to_page;
+
+		from_page = list_first_entry_or_null(from_pagelist, struct page, lru);
+		to_page = list_first_entry_or_null(to_pagelist, struct page, lru);
+
+		if (!from_page || !to_page)
+			break;
+
+		one_pair = kzalloc(sizeof(struct exchange_page_info), GFP_ATOMIC);
+		if (!one_pair) {
+			err = -ENOMEM;
+			break;
+		}
+
+		list_del(&from_page->lru);
+		list_del(&to_page->lru);
+
+		one_pair->from_page = from_page;
+		one_pair->to_page = to_page;
+
+		list_add_tail(&one_pair->list, &exchange_page_list);
+	}
+
+	if (migrate_concur)
+		ret = exchange_pages_concur(&exchange_page_list,
+			MIGRATE_SYNC | (migrate_mt ? MIGRATE_MT : MIGRATE_SINGLETHREAD),
+			MR_SYSCALL);
+	else
+		ret = exchange_pages(&exchange_page_list,
+			MIGRATE_SYNC | (migrate_mt ? MIGRATE_MT : MIGRATE_SINGLETHREAD),
+			MR_SYSCALL);
+	/* A successful exchange must not hide an earlier -ENOMEM. */
+	if (!err)
+		err = ret;
+
+	while (!list_empty(&exchange_page_list)) {
+		one_pair = list_first_entry(&exchange_page_list,
+				struct exchange_page_info, list);
+
+		list_del(&one_pair->list);
+		kfree(one_pair);
+	}
+
+	if (!list_empty(from_pagelist))
+		putback_movable_pages(from_pagelist);
+
+	if (!list_empty(to_pagelist))
+		putback_movable_pages(to_pagelist);
+
+	return err;
+}
+
+/*
+ * Look up the page mapped at @addr in @mm and isolate it for an exchange.
+ *
+ * On success the page is returned through @pagep with the follow_page()
+ * reference already dropped and with page->lru not linked on any list, so
+ * the caller may list_add() it wherever it wants.
+ *
+ * Returns 0 on success, -EFAULT if @addr is not inside a migratable vma,
+ * -ENOENT if nothing is mapped there, -EACCES if the page is shared while
+ * @migrate_all is false or is a huge page that cannot be isolated, or the
+ * error from follow_page() or isolate_lru_page().
+ */
+static int isolate_page_for_exchange(struct mm_struct *mm, unsigned long addr,
+		bool migrate_all, struct page **pagep)
+{
+	struct vm_area_struct *vma;
+	struct page *page;
+	LIST_HEAD(huge_list);
+	int err;
+
+	vma = find_vma(mm, addr);
+	if (!vma || addr < vma->vm_start || !vma_migratable(vma))
+		return -EFAULT;
+
+	/* FOLL_DUMP to ignore special (like zero) pages */
+	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
+	if (IS_ERR(page))
+		return PTR_ERR(page);
+	if (!page)
+		return -ENOENT;
+
+	err = -EACCES;
+	if (page_mapcount(page) > 1 && !migrate_all)
+		goto drop_ref;
+
+	if (PageHuge(page)) {
+		if (PageHead(page) && isolate_huge_page(page, &huge_list)) {
+			/*
+			 * isolate_huge_page() links the page on the list it is
+			 * given; unlink it again so that the caller can queue
+			 * it like any other isolated page.
+			 */
+			list_del(&page->lru);
+			err = 0;
+		}
+		goto drop_ref;
+	}
+
+	/* Only the head page of a transparent huge page can be isolated. */
+	if (PageTransCompound(page) && PageTail(page))
+		goto drop_ref;
+
+	err = isolate_lru_page(page);
+	if (!err)
+		mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+			page_is_file_cache(page), hpage_nr_pages(page));
+drop_ref:
+	/*
+	 * Either remove the duplicate refcount from isolation or drop the
+	 * page ref if it was not isolated: FOLL_GET calls get_page(), and
+	 * isolate_lru_page() also calls get_page().
+	 */
+	put_page(page);
+	if (!err)
+		*pagep = page;
+	return err;
+}
+
+/*
+ * Isolate the pages mapped at @from_addr and @to_addr and queue them at the
+ * tail of @from_pagelist and @to_pagelist respectively.
+ *
+ * If only one page can be isolated, or the two pages are not of the same
+ * kind (hugetlb vs. not, transparent huge vs. not), whatever was isolated
+ * is put back and nothing is queued.
+ *
+ * Returns 0 if the pair was queued, -EINVAL if the pages are of different
+ * kinds, or the error from isolating either page.
+ */
+static int add_page_for_exchange(struct mm_struct *mm,
+		unsigned long from_addr, unsigned long to_addr,
+		struct list_head *from_pagelist, struct list_head *to_pagelist,
+		bool migrate_all)
+{
+	struct page *from_page, *to_page;
+	LIST_HEAD(err_page_list);
+	int err;
+
+	err = isolate_page_for_exchange(mm, from_addr, migrate_all, &from_page);
+	if (err)
+		return err;
+
+	err = isolate_page_for_exchange(mm, to_addr, migrate_all, &to_page);
+	if (err) {
+		/* No partner page: give the "from" page back. */
+		list_add(&from_page->lru, &err_page_list);
+		goto out;
+	}
+
+	if ((PageHuge(from_page) != PageHuge(to_page)) ||
+	    (PageTransHuge(from_page) != PageTransHuge(to_page))) {
+		/* Report the mismatch instead of claiming success. */
+		list_add(&from_page->lru, &err_page_list);
+		list_add(&to_page->lru, &err_page_list);
+		err = -EINVAL;
+	} else {
+		list_add_tail(&from_page->lru, from_pagelist);
+		list_add_tail(&to_page->lru, to_pagelist);
+	}
+out:
+	if (!list_empty(&err_page_list))
+		putback_movable_pages(&err_page_list);
+	return err;
+}
+
+/*
+ * Exchange the pages behind two arrays of user addresses pairwise and fill
+ * the corresponding array of status.
+ *
+ * Pair i exchanges the page at from_pages[i] with the page at to_pages[i].
+ * status[i] is set to 0 for pairs in a batch that was exchanged without error
+ * and to a negative errno for pairs that could not be looked up or isolated.
+ */
+static int do_pages_exchange(struct mm_struct *mm, nodemask_t task_nodes,
+			 unsigned long nr_pages,
+			 const void __user * __user *from_pages,
+			 const void __user * __user *to_pages,
+			 int __user *status, int flags)
+{
+	LIST_HEAD(from_pagelist);
+	LIST_HEAD(to_pagelist);
+	int start, i;
+	int err = 0, err1;
+
+	migrate_prep();
+
+	down_read(&mm->mmap_sem);
+	for (i = start = 0; i < nr_pages; i++) {
+		const void __user *from_p, *to_p;
+		unsigned long from_addr, to_addr;
+
+		err = -EFAULT;
+		if (get_user(from_p, from_pages + i))
+			goto out_flush;
+		if (get_user(to_p, to_pages + i))
+			goto out_flush;
+
+		from_addr = (unsigned long)from_p;
+		to_addr = (unsigned long)to_p;
+
+		/*
+		 * Errors in the page lookup or isolation are not fatal and we simply
+		 * report them via status
+		 */
+		err = add_page_for_exchange(mm, from_addr, to_addr,
+				&from_pagelist, &to_pagelist,
+				flags & MPOL_MF_MOVE_ALL);
+
+		if (!err)
+			continue;
+
+		err = store_status(status, i, err, 1);
+		if (err)
+			goto out_flush;
+
+		err = do_exchange_page_list(mm, &from_pagelist, &to_pagelist,
+				flags & MPOL_MF_MOVE_MT,
+				flags & MPOL_MF_MOVE_CONCUR);
+		if (err)
+			goto out;
+		if (i > start) {
+			err = store_status(status, start, 0, i - start);
+			if (err)
+				goto out;
+		}
+		/* status[i] already holds its error; the next batch starts after it */
+		start = i + 1;
+	}
+out_flush:
+	/* Make sure we do not overwrite the existing error */
+	err1 = do_exchange_page_list(mm, &from_pagelist, &to_pagelist,
+				flags & MPOL_MF_MOVE_MT,
+				flags & MPOL_MF_MOVE_CONCUR);
+	if (!err1)
+		err1 = store_status(status, start, 0, i - start);
+	if (!err)
+		err = err1;
+out:
+	up_read(&mm->mmap_sem);
+	return err;
+}
+
+/*
+ * exchange_pages() - exchange the pages behind two lists of addresses in the
+ * address space of process @pid (0 means the calling process).
+ *
+ * NOTE(review): move_pages() is believed to do this permission check with
+ * ptrace_may_access() in current kernels; consider doing the same here.
+ */
+SYSCALL_DEFINE6(exchange_pages, pid_t, pid, unsigned long, nr_pages,
+		const void __user * __user *, from_pages,
+		const void __user * __user *, to_pages,
+		int __user *, status, int, flags)
+{
+	const struct cred *cred = current_cred(), *tcred;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	int err;
+	nodemask_t task_nodes;
+
+	/* Check flags */
+	if (flags & ~(MPOL_MF_MOVE|
+		      MPOL_MF_MOVE_ALL|
+		      MPOL_MF_MOVE_MT|
+		      MPOL_MF_MOVE_CONCUR))
+		return -EINVAL;
+
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+		return -EPERM;
+
+	/* Find the mm_struct */
+	rcu_read_lock();
+	task = pid ? find_task_by_vpid(pid) : current;
+	if (!task) {
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+	get_task_struct(task);
+
+	/*
+	 * Check if this process has the right to modify the specified
+	 * process. The right exists if the process has administrative
+	 * capabilities, superuser privileges or the same
+	 * userid as the target process.
+	 */
+	tcred = __task_cred(task);
+	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
+	    !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
+	    !capable(CAP_SYS_NICE)) {
+		rcu_read_unlock();
+		err = -EPERM;
+		goto out;
+	}
+	rcu_read_unlock();
+
+	err = security_task_movememory(task);
+	if (err)
+		goto out;
+
+	task_nodes = cpuset_mems_allowed(task);
+	mm = get_task_mm(task);
+	put_task_struct(task);
+
+	if (!mm)
+		return -EINVAL;
+
+	err = do_pages_exchange(mm, task_nodes, nr_pages, from_pages,
+				    to_pages, status, flags);
+
+	mmput(mm);
+
+	return err;
+
+out:
+	put_task_struct(task);
+
+	return err;
+}
-- 
2.7.4