In the rare case where an mlocked order-0 folio is mapped 2^20 or more
times, the high mapcount can be mistakenly interpreted by munlock() as
an mlock_count, causing PG_mlocked to not be cleared and possibly
leaving the folio stranded as unevictable indefinitely.

To fix this, add a hook during unmapping to check whether the bits used
for the mlock_count are all zeros yet PG_mlocked is set. In this case,
perform the missed munlock operation.

Signed-off-by: Yosry Ahmed <yosryahmed@xxxxxxxxxx>
---
 include/linux/mm.h |  4 ++++
 mm/mlock.c         | 18 +++++++++++++++++-
 mm/rmap.c          |  1 +
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3994580772b3..b341477a83e8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1050,6 +1050,7 @@ unsigned long vmalloc_to_pfn(const void *addr);
 extern bool is_vmalloc_addr(const void *x);
 extern int is_vmalloc_or_module_addr(const void *x);
 extern int folio_mlocked_mapcount(struct folio *folio);
+extern void folio_mlock_unmap_check(struct folio *folio);
 #else
 static inline bool is_vmalloc_addr(const void *x)
 {
@@ -1063,6 +1064,9 @@ static inline int folio_mlocked_mapcount(struct folio *folio)
 {
 	return 0;
 }
+static inline void folio_mlock_unmap_check(struct folio *folio)
+{
+}
 #endif
 
 /*
diff --git a/mm/mlock.c b/mm/mlock.c
index 5c5462627391..8261df11d6a6 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -66,7 +66,8 @@ EXPORT_SYMBOL(can_do_mlock);
  * (1) The mapcount will be incorrect (underestimated). It will be correct again
  *     once the number of mappings falls below MLOCK_COUNT_BIAS.
  * (2) munlock() can misinterpret the large number of mappings as an mlock_count
- *     and leave PG_mlocked set.
+ *     and leave PG_mlocked set. This will be fixed by folio_mlock_unmap_check()
+ *     once the number of mappings falls below MLOCK_COUNT_BIAS.
  */
 #define MLOCK_COUNT_SHIFT 20
 #define MLOCK_COUNT_BIAS (1U << MLOCK_COUNT_SHIFT)
@@ -139,6 +140,21 @@ static int folio_mlock_count_dec(struct folio *folio)
 	return mlock_count - 1;
 }
 
+/*
+ * Call after decrementing the mapcount. If the mapcount previously overflowed
+ * beyond the lower 20 bits for an order-0 mlocked folio, munlock() may have
+ * mistakenly left the folio mlocked. Fix it here.
+ */
+void folio_mlock_unmap_check(struct folio *folio)
+{
+	int mapcount = atomic_read(&folio->_mapcount) + 1;
+	int mlock_count = mapcount >> MLOCK_COUNT_SHIFT;
+
+	if (unlikely(!folio_test_large(folio) && folio_test_mlocked(folio) &&
+		     mlock_count == 0))
+		munlock_folio(folio);
+}
+
 /*
  * Mlocked folios are marked with the PG_mlocked flag for efficient testing
  * in vmscan and, possibly, the fault path; and to support semi-accurate
diff --git a/mm/rmap.c b/mm/rmap.c
index 19392e090bec..02e558551f15 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1392,6 +1392,7 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
 			nr = atomic_dec_return_relaxed(mapped);
 			nr = (nr < COMPOUND_MAPPED);
 		}
+		folio_mlock_unmap_check(folio);
 	} else if (folio_test_pmd_mappable(folio)) {
 		/* That test is redundant: it's for safety or to optimize out */
-- 
2.41.0.162.gfafddb0af9-goog
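
For anyone checking the arithmetic behind the fix: below is a minimal
userspace sketch (plain C, not kernel code; it only mirrors the
MLOCK_COUNT_SHIFT split from mm/mlock.c above) of how a raw order-0
mapcount of 2^20 decodes as a nonzero mlock_count, which is exactly the
misinterpretation the new hook corrects:

#include <stdio.h>

/* Mirror of the field split in mm/mlock.c: for an order-0 mlocked
 * folio, the lower 20 bits of the mapcount hold the real number of
 * mappings and the upper bits hold the mlock_count.
 */
#define MLOCK_COUNT_SHIFT 20
#define MLOCK_COUNT_BIAS  (1U << MLOCK_COUNT_SHIFT)

int main(void)
{
	/* A folio mapped 2^20 times: the mapping count has overflowed
	 * into the bits that munlock() reads as the mlock_count.
	 */
	unsigned int mapcount = 1U << 20;

	/* munlock() decodes the upper bits as an mlock_count... */
	unsigned int mlock_count = mapcount >> MLOCK_COUNT_SHIFT;
	/* ...and the lower bits as the number of mappings. */
	unsigned int mappings = mapcount & (MLOCK_COUNT_BIAS - 1);

	/* Prints mlock_count=1 mappings=0: munlock() would believe an
	 * mlock_count remains and leave PG_mlocked set.
	 */
	printf("mlock_count=%u mappings=%u\n", mlock_count, mappings);
	return 0;
}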