Zi Yan <ziy@xxxxxxxxxx> writes:

> On 6 Feb 2023, at 1:33, Huang Ying wrote:
>
>> In this patch the _unmap and _move stages of the folio migration are
>> batched.  That is, previously it was:
>>
>>	for each folio
>>		_unmap()
>>		_move()
>>
>> Now it is:
>>
>>	for each folio
>>		_unmap()
>>	for each folio
>>		_move()
>>
>> Based on this, we can batch the TLB flushing and use some hardware
>> accelerator to copy folios between the batched _unmap and batched
>> _move stages.
>>
>> Signed-off-by: "Huang, Ying" <ying.huang@xxxxxxxxx>
>> Tested-by: Hyeonggon Yoo <42.hyeyoo@xxxxxxxxx>
>> Cc: Zi Yan <ziy@xxxxxxxxxx>
>> Cc: Yang Shi <shy828301@xxxxxxxxx>
>> Cc: Baolin Wang <baolin.wang@xxxxxxxxxxxxxxxxx>
>> Cc: Oscar Salvador <osalvador@xxxxxxx>
>> Cc: Matthew Wilcox <willy@xxxxxxxxxxxxx>
>> Cc: Bharata B Rao <bharata@xxxxxxx>
>> Cc: Alistair Popple <apopple@xxxxxxxxxx>
>> Cc: haoxin <xhao@xxxxxxxxxxxxxxxxx>
>> Cc: Minchan Kim <minchan@xxxxxxxxxx>
>> Cc: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
>> ---
>>  mm/migrate.c | 208 +++++++++++++++++++++++++++++++++++++++++++++------
>>  1 file changed, 184 insertions(+), 24 deletions(-)
>>
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index 0428449149f4..fa7212330cb6 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -1033,6 +1033,33 @@ static void __migrate_folio_extract(struct folio *dst,
>>  	dst->private = NULL;
>>  }
>>
>> +/* Restore the source folio to the original state upon failure */
>> +static void migrate_folio_undo_src(struct folio *src,
>> +				   int page_was_mapped,
>> +				   struct anon_vma *anon_vma,
>> +				   struct list_head *ret)
>> +{
>> +	if (page_was_mapped)
>> +		remove_migration_ptes(src, src, false);
>> +	/* Drop an anon_vma reference if we took one */
>> +	if (anon_vma)
>> +		put_anon_vma(anon_vma);
>> +	folio_unlock(src);
>> +	list_move_tail(&src->lru, ret);
>> +}
>> +
>> +/* Restore the destination folio to the original state upon failure */
>> +static void migrate_folio_undo_dst(struct folio *dst,
>> +				   free_page_t put_new_page,
>> +				   unsigned long private)
>> +{
>> +	folio_unlock(dst);
>> +	if (put_new_page)
>> +		put_new_page(&dst->page, private);
>> +	else
>> +		folio_put(dst);
>> +}
>> +
>>  /* Cleanup src folio upon migration success */
>>  static void migrate_folio_done(struct folio *src,
>>  			       enum migrate_reason reason)
>> @@ -1052,7 +1079,7 @@ static void migrate_folio_done(struct folio *src,
>>  }
>>
>>  static int __migrate_folio_unmap(struct folio *src, struct folio *dst,
>> -				 int force, enum migrate_mode mode)
>> +				 int force, bool force_lock, enum migrate_mode mode)
>>  {
>>  	int rc = -EAGAIN;
>>  	int page_was_mapped = 0;
>> @@ -1079,6 +1106,17 @@ static int __migrate_folio_unmap(struct folio *src, struct folio *dst,
>>  		if (current->flags & PF_MEMALLOC)
>>  			goto out;
>>
>> +		/*
>> +		 * We have locked some folios, to avoid deadlock, we cannot
>> +		 * lock the folio synchronously.  Go out to process (and
>> +		 * unlock) all the locked folios.  Then we can lock the folio
>> +		 * synchronously.
>> +		 */
>
> The comment alone is quite confusing, and the variable might be
> better renamed to avoid_force_lock, since there is already a force
> variable to force-lock the folio.  The new variable intends to
> discourage forced locking of a folio, to avoid a potential deadlock.
>
> Since "lock synchronously" might not be as straightforward as "wait
> to lock", how about:
>
> /*
>  * We have locked some folios and are going to wait to lock this
>  * folio.  To avoid a potential deadlock, let's bail out and not do
>  * that.
>  * The locked folios will be moved and unlocked, then we can wait to
>  * lock this folio.
>  */
>
>> +		if (!force_lock) {
>> +			rc = -EDEADLOCK;
>> +			goto out;
>> +		}
>> +
>>  		folio_lock(src);
>>  	}
>>
>> @@ -1187,10 +1225,20 @@ static int __migrate_folio_move(struct folio *src, struct folio *dst,
>>  	int page_was_mapped = 0;
>>  	struct anon_vma *anon_vma = NULL;
>>  	bool is_lru = !__PageMovable(&src->page);
>> +	struct list_head *prev;
>>
>>  	__migrate_folio_extract(dst, &page_was_mapped, &anon_vma);
>> +	prev = dst->lru.prev;
>> +	list_del(&dst->lru);
>>
>>  	rc = move_to_new_folio(dst, src, mode);
>> +
>> +	if (rc == -EAGAIN) {
>> +		list_add(&dst->lru, prev);
>> +		__migrate_folio_record(dst, page_was_mapped, anon_vma);
>> +		return rc;
>> +	}
>> +
>>  	if (unlikely(!is_lru))
>>  		goto out_unlock_both;
>>
>> @@ -1233,7 +1281,7 @@ static int __migrate_folio_move(struct folio *src, struct folio *dst,
>>  /* Obtain the lock on page, remove all ptes. */
>>  static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page,
>>  			       unsigned long private, struct folio *src,
>> -			       struct folio **dstp, int force,
>> +			       struct folio **dstp, int force, bool force_lock,
>>  			       enum migrate_mode mode, enum migrate_reason reason,
>>  			       struct list_head *ret)
>>  {
>> @@ -1261,7 +1309,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
>>  	*dstp = dst;
>>
>>  	dst->private = NULL;
>> -	rc = __migrate_folio_unmap(src, dst, force, mode);
>> +	rc = __migrate_folio_unmap(src, dst, force, force_lock, mode);
>>  	if (rc == MIGRATEPAGE_UNMAP)
>>  		return rc;
>>
>> @@ -1270,7 +1318,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
>>  	 * references and be restored.
>>  	 */
>>  	/* restore the folio to right list. */
>> -	if (rc != -EAGAIN)
>> +	if (rc != -EAGAIN && rc != -EDEADLOCK)
>>  		list_move_tail(&src->lru, ret);
>>
>>  	if (put_new_page)
>> @@ -1309,9 +1357,8 @@ static int migrate_folio_move(free_page_t put_new_page, unsigned long private,
>>  	 */
>>  	if (rc == MIGRATEPAGE_SUCCESS) {
>>  		migrate_folio_done(src, reason);
>> -	} else {
>> -		if (rc != -EAGAIN)
>> -			list_add_tail(&src->lru, ret);
>> +	} else if (rc != -EAGAIN) {
>> +		list_add_tail(&src->lru, ret);
>>
>>  		if (put_new_page)
>>  			put_new_page(&dst->page, private);
>> @@ -1591,7 +1638,7 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  		enum migrate_mode mode, int reason, struct list_head *ret_folios,
>>  		struct migrate_pages_stats *stats)
>
> As I said in my last comment on this patch, the migrate_pages_batch()
> function deserves a detailed comment about its working flow,
> including the error handling.  Right now you only put some of it in
> the git log, which is hard to access after several later code
> changes.
>
> How about?
>
> /*
>  * migrate_pages_batch() first unmaps pages in "from" as many as
>  * possible, then migrates the unmapped pages.  During the unmap
>  * process, different situations are handled differently:
>  * 1. ENOSYS, unsupported large folio migration: move to the
>  *    ret_folios list
>  * 2. ENOMEM, low memory at the destination: migrate the existing
>  *    unmapped folios and stop, since the existing unmapped folios
>  *    have new pages allocated and can be migrated
>  * 3. EDEADLOCK, the page to be unmapped is locked by someone else:
>  *    to avoid deadlock, migrate the existing unmapped pages and try
>  *    to lock it again
>  * 4. MIGRATEPAGE_SUCCESS, the folio was freed under us: no action
>  * 5. MIGRATEPAGE_UNMAP, unmap succeeded: set avoid_force_lock to
>  *    true, so that we do not wait to lock a folio later, which could
>  *    deadlock.
>  *
>  * Folios that are unmapped but cannot be migrated have their
>  * original states restored during the cleanup stage at the end.
>  */

Sorry, I didn't notice the above comments in the previous reply.

The comments appear too detailed to me.  I think that it's better for
people to get the details from the code itself.  So I want to use a
much simplified version, as below.

/*
 * migrate_pages_batch() first unmaps folios in the from list as many
 * as possible, then moves the unmapped folios.
 */

Best Regards,
Huang, Ying

>>  {
>> -	int retry = 1;
>> +	int retry;
>>  	int large_retry = 1;
>>  	int thp_retry = 1;
>>  	int nr_failed = 0;
>> @@ -1600,13 +1647,19 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  	int pass = 0;
>>  	bool is_large = false;
>>  	bool is_thp = false;
>> -	struct folio *folio, *folio2, *dst = NULL;
>> -	int rc, nr_pages;
>> +	struct folio *folio, *folio2, *dst = NULL, *dst2;
>> +	int rc, rc_saved, nr_pages;
>>  	LIST_HEAD(split_folios);
>> +	LIST_HEAD(unmap_folios);
>> +	LIST_HEAD(dst_folios);
>>  	bool nosplit = (reason == MR_NUMA_MISPLACED);
>>  	bool no_split_folio_counting = false;
>> +	bool force_lock;
>>
>> -split_folio_migration:
>> +retry:
>> +	rc_saved = 0;
>> +	force_lock = true;
>> +	retry = 1;
>>  	for (pass = 0;
>>  	     pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry);
>>  	     pass++) {
>> @@ -1628,16 +1681,15 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  			cond_resched();
>>
>>  			rc = migrate_folio_unmap(get_new_page, put_new_page, private,
>> -						 folio, &dst, pass > 2, mode,
>> -						 reason, ret_folios);
>> -			if (rc == MIGRATEPAGE_UNMAP)
>> -				rc = migrate_folio_move(put_new_page, private,
>> -							folio, dst, mode,
>> -							reason, ret_folios);
>> +						 folio, &dst, pass > 2, force_lock,
>> +						 mode, reason, ret_folios);
>>  			/*
>>  			 * The rules are:
>>  			 *	Success: folio will be freed
>> +			 *	Unmap: folio will be put on unmap_folios list,
>> +			 *	       dst folio put on dst_folios list
>>  			 *	-EAGAIN: stay on the from list
>> +			 *	-EDEADLOCK: stay on the from list
>>  			 *	-ENOMEM: stay on the from list
>>  			 *	-ENOSYS: stay on the from list
>>  			 *	Other errno: put on ret_folios list
>> @@ -1672,7 +1724,7 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  		case -ENOMEM:
>>  			/*
>>  			 * When memory is low, don't bother to try to migrate
>> -			 * other folios, just exit.
>> +			 * other folios, move unmapped folios, then exit.
>>  			 */
>>  			if (is_large) {
>>  				nr_large_failed++;
>> @@ -1711,7 +1763,19 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  			/* nr_failed isn't updated for not used */
>>  			nr_large_failed += large_retry;
>>  			stats->nr_thp_failed += thp_retry;
>> -			goto out;
>> +			rc_saved = rc;
>> +			if (list_empty(&unmap_folios))
>> +				goto out;
>> +			else
>> +				goto move;
>> +		case -EDEADLOCK:
>> +			/*
>> +			 * The folio cannot be locked for potential deadlock.
>> +			 * Go move (and unlock) all locked folios.  Then we can
>> +			 * try again.
>> +			 */
>> +			rc_saved = rc;
>> +			goto move;
>>  		case -EAGAIN:
>>  			if (is_large) {
>>  				large_retry++;
>> @@ -1725,6 +1789,15 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  			stats->nr_succeeded += nr_pages;
>>  			stats->nr_thp_succeeded += is_thp;
>>  			break;
>> +		case MIGRATEPAGE_UNMAP:
>> +			/*
>> +			 * We have locked some folios, don't force lock
>> +			 * to avoid deadlock.
>> +			 */
>> +			force_lock = false;
>> +			list_move_tail(&folio->lru, &unmap_folios);
>> +			list_add_tail(&dst->lru, &dst_folios);
>> +			break;
>>  		default:
>>  			/*
>>  			 * Permanent failure (-EBUSY, etc.):
>> @@ -1748,12 +1821,95 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  	nr_large_failed += large_retry;
>>  	stats->nr_thp_failed += thp_retry;
>>  	stats->nr_failed_pages += nr_retry_pages;
>> +move:
>> +	retry = 1;
>> +	for (pass = 0;
>> +	     pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry);
>> +	     pass++) {
>> +		retry = 0;
>> +		large_retry = 0;
>> +		thp_retry = 0;
>> +		nr_retry_pages = 0;
>> +
>> +		dst = list_first_entry(&dst_folios, struct folio, lru);
>> +		dst2 = list_next_entry(dst, lru);
>> +		list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
>> +			is_large = folio_test_large(folio);
>> +			is_thp = is_large && folio_test_pmd_mappable(folio);
>> +			nr_pages = folio_nr_pages(folio);
>> +
>> +			cond_resched();
>> +
>> +			rc = migrate_folio_move(put_new_page, private,
>> +						folio, dst, mode,
>> +						reason, ret_folios);
>> +			/*
>> +			 * The rules are:
>> +			 *	Success: folio will be freed
>> +			 *	-EAGAIN: stay on the unmap_folios list
>> +			 *	Other errno: put on ret_folios list
>> +			 */
>> +			switch(rc) {
>> +			case -EAGAIN:
>> +				if (is_large) {
>> +					large_retry++;
>> +					thp_retry += is_thp;
>> +				} else if (!no_split_folio_counting) {
>> +					retry++;
>> +				}
>> +				nr_retry_pages += nr_pages;
>> +				break;
>> +			case MIGRATEPAGE_SUCCESS:
>> +				stats->nr_succeeded += nr_pages;
>> +				stats->nr_thp_succeeded += is_thp;
>> +				break;
>> +			default:
>> +				if (is_large) {
>> +					nr_large_failed++;
>> +					stats->nr_thp_failed += is_thp;
>> +				} else if (!no_split_folio_counting) {
>> +					nr_failed++;
>> +				}
>> +
>> +				stats->nr_failed_pages += nr_pages;
>> +				break;
>> +			}
>> +			dst = dst2;
>> +			dst2 = list_next_entry(dst, lru);
>> +		}
>> +	}
>> +	nr_failed += retry;
>> +	nr_large_failed += large_retry;
>> +	stats->nr_thp_failed += thp_retry;
>> +	stats->nr_failed_pages += nr_retry_pages;
>> +
>> +	if (rc_saved)
>> +		rc = rc_saved;
>> +	else
>> +		rc = nr_failed + nr_large_failed;
>> +out:
>> +	/* Cleanup remaining folios */
>> +	dst = list_first_entry(&dst_folios, struct folio, lru);
>> +	dst2 = list_next_entry(dst, lru);
>> +	list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
>> +		int page_was_mapped = 0;
>> +		struct anon_vma *anon_vma = NULL;
>> +
>> +		__migrate_folio_extract(dst, &page_was_mapped, &anon_vma);
>> +		migrate_folio_undo_src(folio, page_was_mapped, anon_vma,
>> +				       ret_folios);
>> +		list_del(&dst->lru);
>> +		migrate_folio_undo_dst(dst, put_new_page, private);
>> +		dst = dst2;
>> +		dst2 = list_next_entry(dst, lru);
>> +	}
>> +
>>  	/*
>>  	 * Try to migrate split folios of fail-to-migrate large folios, no
>>  	 * nr_failed counting in this round, since all split folios of a
>>  	 * large folio is counted as 1 failure in the first round.
>>  	 */
>> -	if (!list_empty(&split_folios)) {
>> +	if (rc >= 0 && !list_empty(&split_folios)) {
>>  		/*
>>  		 * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY
>>  		 * retries) to ret_folios to avoid migrating them again.
>> @@ -1761,12 +1917,16 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
>>  		 */
>>  		list_splice_init(from, ret_folios);
>>  		list_splice_init(&split_folios, from);
>>  		no_split_folio_counting = true;
>> -		retry = 1;
>> -		goto split_folio_migration;
>> +		goto retry;
>>  	}
>>
>> -	rc = nr_failed + nr_large_failed;
>> -out:
>> +	/*
>> +	 * We have unlocked all locked folios, so we can force lock now, let's
>> +	 * try again.
>> +	 */
>> +	if (rc == -EDEADLOCK)
>> +		goto retry;
>> +
>>  	return rc;
>>  }
>>
>> --
>> 2.35.1
>
> After renaming the variable (or giving it a better name) and adding
> the comments, you can add Reviewed-by: Zi Yan <ziy@xxxxxxxxxx>
>
> Thanks.
>
> --
> Best Regards,
> Yan, Zi
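
[For readers following the control flow discussed above: the structure
the patch introduces reduces to the two-pass sketch below.  This is a
stand-alone, user-space C model only, not kernel code; every type and
function in it (folio_stub, unmap_one, move_one, migrate_batch) is a
stand-in invented for illustration and does not exist in mm/migrate.c.]

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for a folio; "locked_elsewhere" simulates lock contention. */
struct folio_stub {
	int id;
	bool locked_elsewhere;
	bool unmapped;
	bool moved;
};

/* Unmap one folio; refuse to wait for a contended lock unless allowed. */
static int unmap_one(struct folio_stub *f, bool force_lock)
{
	if (f->locked_elsewhere && !force_lock)
		return -EDEADLOCK;
	f->locked_elsewhere = false;	/* we "waited" and got the lock */
	f->unmapped = true;		/* folio is now locked and unmapped */
	return 0;
}

/* Move one previously unmapped folio and drop its lock. */
static void move_one(struct folio_stub *f)
{
	f->moved = true;
}

static void migrate_batch(struct folio_stub *folios, int n)
{
	bool force_lock = true;	/* may wait for locks while holding none */
	int i;

retry:
	/* Pass 1: unmap as many folios as possible. */
	for (i = 0; i < n; i++) {
		if (folios[i].moved || folios[i].unmapped)
			continue;
		if (unmap_one(&folios[i], force_lock) == -EDEADLOCK)
			break;		/* bail out: move what we hold first */
		force_lock = false;	/* now holding locks: don't wait */
	}

	/* Pass 2: move (and unlock) the whole unmapped batch. */
	for (i = 0; i < n; i++)
		if (folios[i].unmapped && !folios[i].moved)
			move_one(&folios[i]);

	/* All locks dropped: safe to wait for locks again, so retry. */
	for (i = 0; i < n; i++)
		if (!folios[i].moved) {
			force_lock = true;
			goto retry;
		}
}

int main(void)
{
	struct folio_stub folios[] = {
		{ .id = 0 },
		{ .id = 1, .locked_elsewhere = true },
		{ .id = 2 },
	};
	int i;

	migrate_batch(folios, 3);
	for (i = 0; i < 3; i++)
		printf("folio %d moved: %s\n", folios[i].id,
		       folios[i].moved ? "yes" : "no");
	return 0;
}

The one property this model tries to mirror from the patch is that the
loop never waits for a folio lock while it already holds other folio
locks: on contention it bails out with -EDEADLOCK, moves and unlocks
the whole batch it holds, and only then retries with waiting allowed.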