On Wed, Aug 21, 2024 at 7:35 AM Usama Arif <usamaarif642@xxxxxxxxx> wrote: > > > > On 19/08/2024 22:55, Barry Song wrote: > > On Tue, Aug 20, 2024 at 9:34 AM Barry Song <baohua@xxxxxxxxxx> wrote: > >> > >> On Tue, Aug 20, 2024 at 8:16 AM Usama Arif <usamaarif642@xxxxxxxxx> wrote: > >>> > >>> > >>> > >>> On 19/08/2024 20:00, Barry Song wrote: > >>>> On Tue, Aug 20, 2024 at 2:17 AM Usama Arif <usamaarif642@xxxxxxxxx> wrote: > >>>>> > >>>>> > >>>>> > >>>>> On 19/08/2024 09:29, Barry Song wrote: > >>>>>> Hi Usama, > >>>>>> > >>>>>> I feel it is much better now! thanks! > >>>>>> > >>>>>> On Mon, Aug 19, 2024 at 2:31 PM Usama Arif <usamaarif642@xxxxxxxxx> wrote: > >>>>>>> > >>>>>>> Currently folio->_deferred_list is used to keep track of > >>>>>>> partially_mapped folios that are going to be split under memory > >>>>>>> pressure. In the next patch, all THPs that are faulted in and collapsed > >>>>>>> by khugepaged are also going to be tracked using _deferred_list. > >>>>>>> > >>>>>>> This patch introduces a pageflag to be able to distinguish between > >>>>>>> partially mapped folios and others in the deferred_list at split time in > >>>>>>> deferred_split_scan. Its needed as __folio_remove_rmap decrements > >>>>>>> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be > >>>>>>> possible to distinguish between partially mapped folios and others in > >>>>>>> deferred_split_scan. > >>>>>>> > >>>>>>> Eventhough it introduces an extra flag to track if the folio is > >>>>>>> partially mapped, there is no functional change intended with this > >>>>>>> patch and the flag is not useful in this patch itself, it will > >>>>>>> become useful in the next patch when _deferred_list has non partially > >>>>>>> mapped folios. > >>>>>>> > >>>>>>> Signed-off-by: Usama Arif <usamaarif642@xxxxxxxxx> > >>>>>>> --- > >>>>>>> include/linux/huge_mm.h | 4 ++-- > >>>>>>> include/linux/page-flags.h | 11 +++++++++++ > >>>>>>> mm/huge_memory.c | 23 ++++++++++++++++------- > >>>>>>> mm/internal.h | 4 +++- > >>>>>>> mm/memcontrol.c | 3 ++- > >>>>>>> mm/migrate.c | 3 ++- > >>>>>>> mm/page_alloc.c | 5 +++-- > >>>>>>> mm/rmap.c | 5 +++-- > >>>>>>> mm/vmscan.c | 3 ++- > >>>>>>> 9 files changed, 44 insertions(+), 17 deletions(-) > >>>>>>> > >>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h > >>>>>>> index 4c32058cacfe..969f11f360d2 100644 > >>>>>>> --- a/include/linux/huge_mm.h > >>>>>>> +++ b/include/linux/huge_mm.h > >>>>>>> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page) > >>>>>>> { > >>>>>>> return split_huge_page_to_list_to_order(page, NULL, 0); > >>>>>>> } > >>>>>>> -void deferred_split_folio(struct folio *folio); > >>>>>>> +void deferred_split_folio(struct folio *folio, bool partially_mapped); > >>>>>>> > >>>>>>> void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, > >>>>>>> unsigned long address, bool freeze, struct folio *folio); > >>>>>>> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page) > >>>>>>> { > >>>>>>> return 0; > >>>>>>> } > >>>>>>> -static inline void deferred_split_folio(struct folio *folio) {} > >>>>>>> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {} > >>>>>>> #define split_huge_pmd(__vma, __pmd, __address) \ > >>>>>>> do { } while (0) > >>>>>>> > >>>>>>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h > >>>>>>> index a0a29bd092f8..c3bb0e0da581 100644 > >>>>>>> --- a/include/linux/page-flags.h > >>>>>>> +++ b/include/linux/page-flags.h > >>>>>>> @@ -182,6 +182,7 @@ enum pageflags { > >>>>>>> /* At least one page in this folio has the hwpoison flag set */ > >>>>>>> PG_has_hwpoisoned = PG_active, > >>>>>>> PG_large_rmappable = PG_workingset, /* anon or file-backed */ > >>>>>>> + PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */ > >>>>>>> }; > >>>>>>> > >>>>>>> #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) > >>>>>>> @@ -861,8 +862,18 @@ static inline void ClearPageCompound(struct page *page) > >>>>>>> ClearPageHead(page); > >>>>>>> } > >>>>>>> FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE) > >>>>>>> +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE) > >>>>>>> +/* > >>>>>>> + * PG_partially_mapped is protected by deferred_split split_queue_lock, > >>>>>>> + * so its safe to use non-atomic set/clear. > >>>>>>> + */ > >>>>>>> +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE) > >>>>>>> +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE) > >>>>>>> #else > >>>>>>> FOLIO_FLAG_FALSE(large_rmappable) > >>>>>>> +FOLIO_TEST_FLAG_FALSE(partially_mapped) > >>>>>>> +__FOLIO_SET_FLAG_NOOP(partially_mapped) > >>>>>>> +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped) > >>>>>>> #endif > >>>>>>> > >>>>>>> #define PG_head_mask ((1UL << PG_head)) > >>>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c > >>>>>>> index 2d77b5d2291e..70ee49dfeaad 100644 > >>>>>>> --- a/mm/huge_memory.c > >>>>>>> +++ b/mm/huge_memory.c > >>>>>>> @@ -3398,6 +3398,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, > >>>>>>> * page_deferred_list. > >>>>>>> */ > >>>>>>> list_del_init(&folio->_deferred_list); > >>>>>>> + __folio_clear_partially_mapped(folio); > >>>>>>> } > >>>>>>> spin_unlock(&ds_queue->split_queue_lock); > >>>>>>> if (mapping) { > >>>>>>> @@ -3454,11 +3455,13 @@ void __folio_undo_large_rmappable(struct folio *folio) > >>>>>>> if (!list_empty(&folio->_deferred_list)) { > >>>>>>> ds_queue->split_queue_len--; > >>>>>>> list_del_init(&folio->_deferred_list); > >>>>>>> + __folio_clear_partially_mapped(folio); > >>>>>> > >>>>>> is it possible to make things clearer by > >>>>>> > >>>>>> if (folio_clear_partially_mapped) > >>>>>> __folio_clear_partially_mapped(folio); > >>>>>> > >>>>>> While writing without conditions isn't necessarily wrong, adding a condition > >>>>>> will improve the readability of the code and enhance the clarity of my mTHP > >>>>>> counters series. also help decrease smp cache sync if we can avoid > >>>>>> unnecessary writing? > >>>>>> > >>>>> > >>>>> Do you mean if(folio_test_partially_mapped(folio))? > >>>>> > >>>>> I don't like this idea. I think it makes the readability worse? If I was looking at if (test) -> clear for the first time, I would become confused why its being tested if its going to be clear at the end anyways? > >>>> > >>>> In the pmd-order case, the majority of folios are not partially mapped. > >>>> Unconditional writes will trigger cache synchronization across all > >>>> CPUs (related to the MESI protocol), making them more costly. By > >>>> using conditional writes, such as "if(test) write," we can avoid > >>>> most unnecessary writes, which is much more efficient. Additionally, > >>>> we only need to manage nr_split_deferred when the condition > >>>> is met. We are carefully evaluating all scenarios to determine > >>>> if modifications to the partially_mapped flag are necessary. > >>>> > >>> > >>> > >>> Hmm okay, as you said its needed for nr_split_deferred anyways. Something like below is ok to fold in? > >>> > >>> commit 4ae9e2067346effd902b342296987b97dee29018 (HEAD) > >>> Author: Usama Arif <usamaarif642@xxxxxxxxx> > >>> Date: Mon Aug 19 21:07:16 2024 +0100 > >>> > >>> mm: Introduce a pageflag for partially mapped folios fix > >>> > >>> Test partially_mapped flag before clearing it. This should > >>> avoid unnecessary writes and will be needed in the nr_split_deferred > >>> series. > >>> > >>> Signed-off-by: Usama Arif <usamaarif642@xxxxxxxxx> > >>> > >>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c > >>> index 5d67d3b3c1b2..ccde60aaaa0f 100644 > >>> --- a/mm/huge_memory.c > >>> +++ b/mm/huge_memory.c > >>> @@ -3479,7 +3479,8 @@ void __folio_undo_large_rmappable(struct folio *folio) > >>> if (!list_empty(&folio->_deferred_list)) { > >>> ds_queue->split_queue_len--; > >>> list_del_init(&folio->_deferred_list); > >>> - __folio_clear_partially_mapped(folio); > >>> + if (folio_test_partially_mapped(folio)) > >>> + __folio_clear_partially_mapped(folio); > >>> } > >>> spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); > >>> } > >>> @@ -3610,7 +3611,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, > >>> } else { > >>> /* We lost race with folio_put() */ > >>> list_del_init(&folio->_deferred_list); > >>> - __folio_clear_partially_mapped(folio); > >>> + if (folio_test_partially_mapped(folio)) > >>> + __folio_clear_partially_mapped(folio); > >>> ds_queue->split_queue_len--; > >>> } > >>> if (!--sc->nr_to_scan) > >>> > >> > >> Do we also need if (folio_test_partially_mapped(folio)) in > >> split_huge_page_to_list_to_order()? > >> > >> I recall that in Yu Zhao's TAO, there’s a chance of splitting (shattering) > >> non-partially-mapped folios. To be future-proof, we might want to handle > >> both cases equally. > > > > we recall we also have a real case which can split entirely_mapped > > folio: > > > > mm: huge_memory: enable debugfs to split huge pages to any order > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fc4d182316bd5309b4066fd9ef21529ea397a7d4 > > > >> > >> By the way, we might not need to clear the flag for a new folio. This differs > >> from the init_list, which is necessary. If a new folio has the partially_mapped > >> flag, it indicates that we failed to clear it when freeing the folio to > >> the buddy system, which is a bug we need to fix in the free path. > >> > >> Thanks > >> Barry > > I believe the below fixlet should address all concerns: Hi Usama, thanks! I can't judge if we need this partially_mapped flag. but if we need, the code looks correct to me. I'd like to leave this to David and other experts to ack. an alternative approach might be two lists? one for entirely_mapped, the other one for split_deferred. also seems ugly ? On the other hand, when we want to extend your patchset to mTHP other than PMD- order, will the only deferred_list create huge lock contention while adding or removing folios from it? > > > From 95492a51b1929ea274b4e5b78fc74e7736645d58 Mon Sep 17 00:00:00 2001 > From: Usama Arif <usamaarif642@xxxxxxxxx> > Date: Mon, 19 Aug 2024 21:07:16 +0100 > Subject: [PATCH] mm: Introduce a pageflag for partially mapped folios fix > > Test partially_mapped flag before clearing it. This should > avoid unnecessary writes and will be needed in the nr_split_deferred > series. > Also no need to clear partially_mapped prepping compound head, as it > should start with already being cleared. > > Signed-off-by: Usama Arif <usamaarif642@xxxxxxxxx> > --- > include/linux/page-flags.h | 2 +- > mm/huge_memory.c | 9 ++++++--- > mm/internal.h | 4 +--- > 3 files changed, 8 insertions(+), 7 deletions(-) > > diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h > index c3bb0e0da581..f1602695daf2 100644 > --- a/include/linux/page-flags.h > +++ b/include/linux/page-flags.h > @@ -1182,7 +1182,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) > */ > #define PAGE_FLAGS_SECOND \ > (0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \ > - 1UL << PG_large_rmappable) > + 1UL << PG_large_rmappable | 1UL << PG_partially_mapped) > > #define PAGE_FLAGS_PRIVATE \ > (1UL << PG_private | 1UL << PG_private_2) > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index 5d67d3b3c1b2..402b9d933de0 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -3422,7 +3422,8 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, > * page_deferred_list. > */ > list_del_init(&folio->_deferred_list); > - __folio_clear_partially_mapped(folio); > + if (folio_test_partially_mapped(folio)) > + __folio_clear_partially_mapped(folio); > } > spin_unlock(&ds_queue->split_queue_lock); > if (mapping) { > @@ -3479,7 +3480,8 @@ void __folio_undo_large_rmappable(struct folio *folio) > if (!list_empty(&folio->_deferred_list)) { > ds_queue->split_queue_len--; > list_del_init(&folio->_deferred_list); > - __folio_clear_partially_mapped(folio); > + if (folio_test_partially_mapped(folio)) > + __folio_clear_partially_mapped(folio); > } > spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); > } > @@ -3610,7 +3612,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, > } else { > /* We lost race with folio_put() */ > list_del_init(&folio->_deferred_list); > - __folio_clear_partially_mapped(folio); > + if (folio_test_partially_mapped(folio)) > + __folio_clear_partially_mapped(folio); > ds_queue->split_queue_len--; > } > if (!--sc->nr_to_scan) > diff --git a/mm/internal.h b/mm/internal.h > index 27cbb5365841..52f7fc4e8ac3 100644 > --- a/mm/internal.h > +++ b/mm/internal.h > @@ -662,10 +662,8 @@ static inline void prep_compound_head(struct page *page, unsigned int order) > atomic_set(&folio->_entire_mapcount, -1); > atomic_set(&folio->_nr_pages_mapped, 0); > atomic_set(&folio->_pincount, 0); > - if (order > 1) { > + if (order > 1) > INIT_LIST_HEAD(&folio->_deferred_list); > - __folio_clear_partially_mapped(folio); > - } > } > > static inline void prep_compound_tail(struct page *head, int tail_idx) > -- > 2.43.5 > > Thanks Barry