On 15/08/2024 00:05, Barry Song wrote:
>
> On Thu, Aug 15, 2024 at 12:37 AM Usama Arif <usamaarif642@xxxxxxxxx> wrote:
> [snip]
>>>>>>
>>>>>> -void deferred_split_folio(struct folio *folio)
>>>>>> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
>>>>>>  {
>>>>>>          struct deferred_split *ds_queue = get_deferred_split_queue(folio);
>>>>>>  #ifdef CONFIG_MEMCG
>>>>>> @@ -3485,14 +3487,17 @@ void deferred_split_folio(struct folio *folio)
>>>>>>          if (folio_test_swapcache(folio))
>>>>>>                  return;
>>>>>>
>>>>>> -        if (!list_empty(&folio->_deferred_list))
>>>>>> -                return;
>>>>>> -
>>>>>>          spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>>>>>> +        if (partially_mapped)
>>>>>> +                folio_set_partially_mapped(folio);
>>>>>> +        else
>>>>>> +                folio_clear_partially_mapped(folio);
>>>>>>          if (list_empty(&folio->_deferred_list)) {
>>>>>> -                if (folio_test_pmd_mappable(folio))
>>>>>> -                        count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>>>>>> -                count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>>>>>> +                if (partially_mapped) {
>>>>>> +                        if (folio_test_pmd_mappable(folio))
>>>>>> +                                count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>>>>>> +                        count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>>>>>
>>>>> This code completely broke MTHP_STAT_SPLIT_DEFERRED for PMD_ORDER. It
>>>>> added the folio to the deferred_list as entirely_mapped
>>>>> (partially_mapped == false).
>>>>> However, when partially_mapped becomes true, there's no opportunity to
>>>>> add it again
>>>>> as it has been there on the list. Are you consistently seeing the counter for
>>>>> PMD_ORDER as 0?
>>>>>
>>>>
>>>> Ah I see it, this should fix it?
>>>>
>>>> -void deferred_split_folio(struct folio *folio)
>>>> +/* partially_mapped=false won't clear PG_partially_mapped folio flag */
>>>> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
>>>>  {
>>>>          struct deferred_split *ds_queue = get_deferred_split_queue(folio);
>>>>  #ifdef CONFIG_MEMCG
>>>> @@ -3485,14 +3488,14 @@ void deferred_split_folio(struct folio *folio)
>>>>          if (folio_test_swapcache(folio))
>>>>                  return;
>>>>
>>>> -        if (!list_empty(&folio->_deferred_list))
>>>> -                return;
>>>> -
>>>>          spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>>>> -        if (list_empty(&folio->_deferred_list)) {
>>>> +        if (partially_mapped) {
>>>> +                folio_set_partially_mapped(folio);
>>>>                  if (folio_test_pmd_mappable(folio))
>>>>                          count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>>>>                  count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>>>> +        }
>>>> +        if (list_empty(&folio->_deferred_list)) {
>>>>                  list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>>>>                  ds_queue->split_queue_len++;
>>>>  #ifdef CONFIG_MEMCG
>>>>
>>>
>>> not enough. as deferred_split_folio(true) won't be called if folio has been
>>> deferred_list in __folio_remove_rmap():
>>>
>>>         if (partially_mapped && folio_test_anon(folio) &&
>>>             list_empty(&folio->_deferred_list))
>>>                 deferred_split_folio(folio, true);
>>>
>>> so you will still see 0.
>>>
>>
>> ah yes, Thanks.
>>
>> So below diff over the current v3 series should work for all cases:
>>
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index b4d72479330d..482e3ab60911 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -3483,6 +3483,7 @@ void __folio_undo_large_rmappable(struct folio *folio)
>>          spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>>  }
>>
>> +/* partially_mapped=false won't clear PG_partially_mapped folio flag */
>>  void deferred_split_folio(struct folio *folio, bool partially_mapped)
>>  {
>>          struct deferred_split *ds_queue = get_deferred_split_queue(folio);
>> @@ -3515,16 +3516,16 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
>>                  return;
>>
>>          spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>> -        if (partially_mapped)
>> +        if (partially_mapped) {
>>                  folio_set_partially_mapped(folio);
>> -        else
>> -                folio_clear_partially_mapped(folio);
>> +                if (folio_test_pmd_mappable(folio))
>> +                        count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>> +                count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>> +        } else {
>> +                /* partially mapped folios cannont become partially unmapped */
>> +                VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
>> +        }
>>          if (list_empty(&folio->_deferred_list)) {
>> -                if (partially_mapped) {
>> -                        if (folio_test_pmd_mappable(folio))
>> -                                count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>> -                        count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>> -                }
>>                  list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>>                  ds_queue->split_queue_len++;
>>  #ifdef CONFIG_MEMCG
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index 9ad558c2bad0..4c330635aa4e 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -1578,7 +1578,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
>>           * Check partially_mapped first to ensure it is a large folio.
>>           */
>>          if (partially_mapped && folio_test_anon(folio) &&
>> -            list_empty(&folio->_deferred_list))
>> +            !folio_test_partially_mapped(folio))
>>                  deferred_split_folio(folio, true);
>>
>>          __folio_mod_stat(folio, -nr, -nr_pmdmapped);
>>
>
> This is an improvement, but there's still an issue. Two or more threads can
> execute deferred_split_folio() simultaneously, which could lead to
> DEFERRED_SPLIT being added multiple times. We should double-check
> the status after acquiring the spinlock.
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index f401ceded697..3d247826fb95 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3607,10 +3607,12 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
>
>          spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>          if (partially_mapped) {
> -                folio_set_partially_mapped(folio);
> -                if (folio_test_pmd_mappable(folio))
> -                        count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> -                count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> +                if (!folio_test_partially_mapped(folio)) {
> +                        folio_set_partially_mapped(folio);
> +                        if (folio_test_pmd_mappable(folio))
> +                                count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> +                        count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> +                }
>          } else {
>                  /* partially mapped folios cannont become partially unmapped */
>                  VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);

Actually, the above is still not thread-safe: multiple threads can test partially_mapped, all see it as false at the same time, and then all of them would increment the stats. I believe !folio_test_set_partially_mapped() would be best.
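To make the double count concrete, here is a purely illustrative userspace sketch (C11 atomics standing in for the folio flag helpers; none of these names are real kernel interfaces): atomic_exchange() returns the previous value, like a test-and-set flag op, so only the thread that actually flips the flag from false to true bumps the counter, no matter how the threads interleave.

/* Illustrative only: userspace stand-ins, not kernel code. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_bool partially_mapped;	/* stands in for the folio flag */
static atomic_int split_deferred_stat;	/* stands in for MTHP_STAT_SPLIT_DEFERRED */

static void *unmap_racer(void *arg)
{
	/* Only the caller that flips false->true counts the event. */
	if (!atomic_exchange(&partially_mapped, true))
		atomic_fetch_add(&split_deferred_stat, 1);
	return NULL;
}

int main(void)
{
	pthread_t t[4];
	int i;

	for (i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, unmap_racer, NULL);
	for (i = 0; i < 4; i++)
		pthread_join(t[i], NULL);

	/* Always prints 1, never more. */
	printf("deferred split counted %d time(s)\n",
	       atomic_load(&split_deferred_stat));
	return 0;
}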
Hopefully the below diff over v3 covers all the fixes that have come up until now.

Independent of this series, I also think it's a good idea to add a selftest for the deferred_split counter. I will send a separate patch for it that simply maps a THP, unmaps a small part of it and checks the counter; split_huge_page_test.c is probably the right place for it.

If everyone is happy with it, Andrew could replace the original fix patch in [1] with the below.

[1] https://lore.kernel.org/all/20240814200220.F1964C116B1@xxxxxxxxxxxxxxx/

commit c627655548fa09b59849e942da4decc84fa0b0f2
Author: Usama Arif <usamaarif642@xxxxxxxxx>
Date:   Thu Aug 15 16:07:20 2024 +0100

    mm: Introduce a pageflag for partially mapped folios fix

    Fixes the original commit by not clearing the partially_mapped flag on
    hugeTLB folios and by fixing the deferred split THP stats.

    Signed-off-by: Usama Arif <usamaarif642@xxxxxxxxx>

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index cecc1bad7910..7bee743ede40 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -863,6 +863,7 @@ static inline void ClearPageCompound(struct page *page)
 }
 FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
 FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
+FOLIO_TEST_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
 #else
 FOLIO_FLAG_FALSE(large_rmappable)
 FOLIO_FLAG_FALSE(partially_mapped)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c024ab0f745c..24371e4ef19b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3459,6 +3459,7 @@ void __folio_undo_large_rmappable(struct folio *folio)
         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 }

+/* partially_mapped=false won't clear PG_partially_mapped folio flag */
 void deferred_split_folio(struct folio *folio, bool partially_mapped)
 {
         struct deferred_split *ds_queue = get_deferred_split_queue(folio);
@@ -3488,16 +3489,17 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
                 return;

         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
-        if (partially_mapped)
-                folio_set_partially_mapped(folio);
-        else
-                folio_clear_partially_mapped(folio);
-        if (list_empty(&folio->_deferred_list)) {
-                if (partially_mapped) {
+        if (partially_mapped) {
+                if (!folio_test_set_partially_mapped(folio)) {
                         if (folio_test_pmd_mappable(folio))
                                 count_vm_event(THP_DEFERRED_SPLIT_PAGE);
                         count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
                 }
+        } else {
+                /* partially mapped folios cannot become non-partially mapped */
+                VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
+        }
+        if (list_empty(&folio->_deferred_list)) {
                 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
                 ds_queue->split_queue_len++;
 #ifdef CONFIG_MEMCG
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2ae2d9a18e40..1fdd9eab240c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1758,7 +1758,6 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
                 free_gigantic_folio(folio, huge_page_order(h));
         } else {
                 INIT_LIST_HEAD(&folio->_deferred_list);
-                folio_clear_partially_mapped(folio);
                 folio_put(folio);
         }
 }
diff --git a/mm/rmap.c b/mm/rmap.c
index 9ad558c2bad0..4c330635aa4e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1578,7 +1578,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
          * Check partially_mapped first to ensure it is a large folio.
          */
         if (partially_mapped && folio_test_anon(folio) &&
-            list_empty(&folio->_deferred_list))
+            !folio_test_partially_mapped(folio))
                 deferred_split_folio(folio, true);

         __folio_mod_stat(folio, -nr, -nr_pmdmapped);
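For reference, below is a rough standalone sketch of the selftest idea; it is only an illustration of the flow (the real test would live in split_huge_page_test.c, use the kselftest helpers, and detect the THP size instead of assuming 2M PMD-sized THPs with THP enabled in at least "madvise" mode):

/*
 * Rough sketch only: map a PMD-sized THP, unmap one base page from it,
 * and check that thp_deferred_split_page in /proc/vmstat went up.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define PMD_SIZE (2UL * 1024 * 1024)	/* assumption: 2M PMD-sized THPs */

static long read_deferred_split(void)
{
	char line[256];
	long val = -1;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f))
		if (sscanf(line, "thp_deferred_split_page %ld", &val) == 1)
			break;
	fclose(f);
	return val;
}

int main(void)
{
	long before, after;
	char *buf;

	/* Over-allocate so a PMD-aligned region can be carved out. */
	buf = mmap(NULL, 2 * PMD_SIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;
	buf = (char *)(((unsigned long)buf + PMD_SIZE - 1) & ~(PMD_SIZE - 1));

	madvise(buf, PMD_SIZE, MADV_HUGEPAGE);
	memset(buf, 1, PMD_SIZE);	/* fault in, hopefully as one THP */

	before = read_deferred_split();
	/* Unmap a single base page: the THP becomes partially mapped. */
	munmap(buf + getpagesize(), getpagesize());
	after = read_deferred_split();

	printf("thp_deferred_split_page: %ld -> %ld (%s)\n", before, after,
	       after > before ? "ok" : "not incremented");
	return after > before ? 0 : 1;
}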