On Tue, 19 Sep 2023 19:09:07 -0400
Gregory Price <gourry.memverge@xxxxxxxxx> wrote:

> Similar to the move_pages system call, instead of taking a pid and
> list of virtual addresses, this system call takes a list of physical
> addresses.
>
> Because there is no task to validate the memory policy against, each
> page needs to be interrogated to determine whether the migration is
> valid, and all tasks that map it need to be interrogated.
>
> This is accomplished via an rmap_walk on the folio containing
> the page, and an interrogation of all tasks that map the page (by
> way of each task's vma).
>
> Each page must be interrogated individually, which should be
> considered when using this to migrate shared regions.
>
> The remaining logic is the same as the move_pages syscall. Some
> minor changes to do_pages_move are made (to check whether an
> mm_struct is passed) in order to re-use the existing migration code.
>
> Signed-off-by: Gregory Price <gregory.price@xxxxxxxxxxxx>

Hi Gregory,

A few comments inline.

Thanks,

Jonathan

> diff --git a/mm/migrate.c b/mm/migrate.c
> index 1123d841a7f1..2d06557c0b80 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -2165,9 +2165,118 @@ static int move_pages_and_store_status(int node,
>  	return store_status(status, start, node, i - start);
>  }
>
> +struct rmap_page_ctxt {
> +	bool found;
> +	bool migratable;
> +	bool node_allowed;
> +	int node;
> +};
> +
> +/*
> + * Walks each vma mapping a given page and determines if those
> + * vma's are both migratable, and that the target node is within
> + * the allowed cpuset of the owning task.
> + */
> +static bool phys_page_migratable(struct folio *folio,
> +				 struct vm_area_struct *vma,
> +				 unsigned long address,
> +				 void *arg)
> +{
> +	struct rmap_page_ctxt *ctxt = (struct rmap_page_ctxt *)arg;

arg is a void * so no need for the explicit cast.

> +	struct task_struct *owner = vma->vm_mm->owner;
> +	/* On non-memcg systems, the allowed set is the possible set */
> +#ifdef CONFIG_MEMCG
> +	nodemask_t task_nodes = cpuset_mems_allowed(owner);
> +#else
> +	nodemask_t task_nodes = node_possible_map;
> +#endif
> +
> +	ctxt->found |= true;

Given you OR it with true, you might as well just set it:

	ctxt->found = true;

ends up the same.

> +	ctxt->migratable &= vma_migratable(vma);
> +	ctxt->node_allowed &= node_isset(ctxt->node, task_nodes);
> +
> +	return ctxt->migratable && ctxt->node_allowed;
> +}
> +
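
To fold those two nits in, the callback could end up roughly as the
sketch below (not even compile tested, just to show the shape):

static bool phys_page_migratable(struct folio *folio,
				 struct vm_area_struct *vma,
				 unsigned long address,
				 void *arg)
{
	struct rmap_page_ctxt *ctxt = arg;
	struct task_struct *owner = vma->vm_mm->owner;
	/* On non-memcg systems, the allowed set is the possible set */
#ifdef CONFIG_MEMCG
	nodemask_t task_nodes = cpuset_mems_allowed(owner);
#else
	nodemask_t task_nodes = node_possible_map;
#endif

	/* Being called at all means some vma maps the folio */
	ctxt->found = true;
	ctxt->migratable &= vma_migratable(vma);
	ctxt->node_allowed &= node_isset(ctxt->node, task_nodes);

	/* Returning false stops the rmap walk early */
	return ctxt->migratable && ctxt->node_allowed;
}
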
> +/*
> + * Validates the physical address is online and migratable. Walks the folio
> + * containing the page to validate the vma is migratable and the cpuset node
> + * restrictions. Then calls add_page_for_migration to isolate it from the
> + * LRU and place it into the given pagelist.
> + * Returns:
> + *	errno - if the page is not online, migratable, or can't be isolated
> + *	0 - when it doesn't have to be migrated because it is already on the
> + *	    target node
> + *	1 - when it has been queued
> + */
> +static int add_phys_page_for_migration(const void __user *p, int node,
> +				       struct list_head *pagelist,
> +				       bool migrate_all)
> +{
> +	unsigned long pfn;
> +	struct page *page;
> +	struct folio *folio;
> +	int err;
> +	struct rmap_page_ctxt rmctxt = {
> +		.found = false,
> +		.migratable = true,
> +		.node_allowed = true,
> +		.node = node
> +	};
> +	struct rmap_walk_control rwc = {
> +		.rmap_one = phys_page_migratable,
> +		.arg = &rmctxt
> +	};
> +
> +	pfn = ((unsigned long)p) >> PAGE_SHIFT;
> +	page = pfn_to_online_page(pfn);
> +	if (!page || PageTail(page))
> +		return -ENOENT;
> +
> +	folio = phys_migrate_get_folio(page);
> +	if (folio)
> +		rmap_walk(folio, &rwc);

Why not just tidy up the !folio case more directly? If it is NULL we
only get to the first condition below with rmctxt.found == false
anyway, so this is the same as:

	if (!folio)
		return -ENOENT;

	rmap_walk(folio, &rwc);

> +
> +	if (!rmctxt.found)
> +		err = -ENOENT;
> +	else if (!rmctxt.migratable)
> +		err = -EFAULT;
> +	else if (!rmctxt.node_allowed)
> +		err = -EACCES;
> +	else
> +		err = add_page_for_migration(page, node, pagelist, migrate_all);
> +
> +	if (folio)

With the above changes this condition can go.

> +		folio_put(folio);
> +
> +	return err;
> +}
>
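
With both of those folded in, the tail of the function would collapse
to something like this (again just a sketch, the semantics should be
unchanged as far as I can see):

	folio = phys_migrate_get_folio(page);
	if (!folio)
		return -ENOENT;

	rmap_walk(folio, &rwc);

	/* found can still be false if nothing currently maps the folio */
	if (!rmctxt.found)
		err = -ENOENT;
	else if (!rmctxt.migratable)
		err = -EFAULT;
	else if (!rmctxt.node_allowed)
		err = -EACCES;
	else
		err = add_page_for_migration(page, node, pagelist,
					     migrate_all);

	folio_put(folio);

	return err;
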
>  /*
>   * Migrate an array of page address onto an array of nodes and fill
>   * the corresponding array of status.
> + *
> + * When the mm argument is not NULL, task_nodes is expected to be the
> + * cpuset nodemask for the task which owns the mm_struct, and the
> + * values located in (*pages) are expected to be virtual addresses.
> + *
> + * When the mm argument is NULL, the values located at (*pages) are
> + * expected to be physical addresses, and task_nodes is expected to
> + * be empty.
> + */
>  static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
>  			 unsigned long nr_pages,
> @@ -2181,6 +2290,10 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
>  	int start, i;
>  	int err = 0, err1;
>
> +	/* This should never occur in regular operation */
> +	if (!mm && nodes_weight(task_nodes) > 0)

You document below that task_nodes isn't used if !mm, but here it is
being used to indicate this condition...

> +		return -EINVAL;
> +
>  	lru_cache_disable();
>
>  	for (i = start = 0; i < nr_pages; i++) {
> @@ -2209,7 +2322,14 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
>  			goto out_flush;
>
>  		err = -EACCES;
> -		if (!node_isset(node, task_nodes))
> +		/*
> +		 * if mm is NULL, then the pages are addressed via physical
> +		 * address and the task_nodes structure is empty. Validation
> +		 * of migratability is deferred to add_phys_page_for_migration
> +		 * where vma's that map the address will have their node_mask
> +		 * checked to ensure the requested node bit is set.
> +		 */
> +		if (mm && !node_isset(node, task_nodes))
>  			goto out_flush;
>
> @@ -2317,6 +2444,37 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
>  	mmap_read_unlock(mm);
>  }
>
> +/*
> + * Determine the nodes of the pages pointed to by the physical addresses
> + * in the pages array, and store those node values in the status array
> + */
> +static void do_phys_pages_stat_array(unsigned long nr_pages,
> +				     const void __user **pages, int *status)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < nr_pages; i++) {
> +		unsigned long pfn = (unsigned long)(*pages) >> PAGE_SHIFT;
> +		struct page *page = pfn_to_online_page(pfn);
> +		int err = -ENOENT;
> +
> +		if (!page)
> +			goto set_status;
> +
> +		get_page(page);
> +
> +		if (!is_zone_device_page(page))
> +			err = page_to_nid(page);
> +
> +		put_page(page);
> +set_status:
> +		*status = err;

Given you update it unconditionally, this seems to wipe out earlier
potential errors with the nid.

> +
> +		pages++;
> +		status++;
> +	}
> +}
>
> +/*
> + * Move a list of physically-addressed pages to the list of target nodes
> + */
> +static int kernel_move_phys_pages(unsigned long nr_pages,
> +				  const void __user * __user *pages,
> +				  const int __user *nodes,
> +				  int __user *status, int flags)
> +{
> +	int err;
> +	nodemask_t dummy_nodes;
> +
> +	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))

Spaces around the |

> +		return -EINVAL;
> +
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EPERM;
> +
> +	/*
> +	 * When the mm argument to do_pages_move is null, the task_nodes
> +	 * argument is ignored, so pass in an empty nodemask as a dummy.

If ignored, why bother clearing it? Looks like you are using it as a
signal rather than ignoring it, so I am not sure this comment is
correct.

> +	 */
> +	nodes_clear(dummy_nodes);
> +	if (nodes)
> +		err = do_pages_move(NULL, dummy_nodes, nr_pages, pages,
> +				    nodes, status, flags);
> +	else
> +		err = do_pages_stat(NULL, nr_pages, pages, status);
> +
> +	return err;

If there is nothing else to do, just return in the two branches and
lose the local variable err.

> +}
> +
> +SYSCALL_DEFINE5(move_phys_pages, unsigned long, nr_pages,
> +		const void __user * __user *, pages,
> +		const int __user *, nodes,
> +		int __user *, status, int, flags)
> +{
> +	return kernel_move_phys_pages(nr_pages, pages, nodes, status, flags);
> +}
> +

Looks like 1 blank line is enough to me...

> +
>  #ifdef CONFIG_NUMA_BALANCING
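
FWIW, pulling the suggestions above together, the function might end
up looking something like the following (untested sketch; the comment
wording is just my guess at the actual intent):

static int kernel_move_phys_pages(unsigned long nr_pages,
				  const void __user * __user *pages,
				  const int __user *nodes,
				  int __user *status, int flags)
{
	nodemask_t dummy_nodes;

	if (flags & ~(MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/*
	 * do_pages_move() rejects a !mm caller with a non-empty
	 * task_nodes, so an empty nodemask must be passed here.
	 */
	nodes_clear(dummy_nodes);
	if (nodes)
		return do_pages_move(NULL, dummy_nodes, nr_pages, pages,
				     nodes, status, flags);

	return do_pages_stat(NULL, nr_pages, pages, status);
}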