[..] > >> I think its better to handle this in Barrys patch. I feel this series is > >> close to its final state, i.e. the only diff I have for the next > >> revision is below to remove start/end_writeback for zer_filled case. I > >> will comment on Barrys patch once the I send out the next revision of this. > > Sorry I did not make myself clearer. I did not mean that you should > > handle the large folio swapin here. This needs to be handled at a > > higher level because as you mentioned, a large folio may be partially > > in the zeromap, zswap, swapcache, disk, etc. > > > > What I meant is that we should probably have a debug check to make > > sure this doesn't go unhandled. For zswap, I am trying to add a > > warning and fail the swapin operation if a large folio slips through > > to zswap. We can do something similar here if folks agree this is the > > right way in the interim: > > https://lore.kernel.org/lkml/20240611024516.1375191-3-yosryahmed@xxxxxxxxxx/. > > > > Maybe I am too paranoid, but I think it's easy to mess up these things > > when working on large folio swapin imo. > > So there is a difference between zswap and this optimization. In this > optimization, if the zeromap is set for all the folio bits, then we > should do large folio swapin. There still needs to be a change in Barrys > patch in alloc_swap_folio, but apart from that does the below diff over > v3 make it better? I will send a v4 with this if it sounds good. > > > diff --git a/mm/page_io.c b/mm/page_io.c > index 6400be6e4291..bf01364748a9 100644 > --- a/mm/page_io.c > +++ b/mm/page_io.c > @@ -234,18 +234,24 @@ static void swap_zeromap_folio_clear(struct folio > *folio) > } > } > > -static bool swap_zeromap_folio_test(struct folio *folio) > +/* > + * Return the index of the first subpage which is not zero-filled > + * according to swap_info_struct->zeromap. > + * If all pages are zero-filled according to zeromap, it will return > + * folio_nr_pages(folio). > + */ > +static long swap_zeromap_folio_test(struct folio *folio) > { > struct swap_info_struct *sis = swp_swap_info(folio->swap); > swp_entry_t entry; > - unsigned int i; > + long i; Why long? > > for (i = 0; i < folio_nr_pages(folio); i++) { > entry = page_swap_entry(folio_page(folio, i)); > if (!test_bit(swp_offset(entry), sis->zeromap)) > - return false; > + return i; > } > - return true; > + return i; > } > > /* > @@ -581,6 +587,7 @@ void swap_read_folio(struct folio *folio, bool > synchronous, > { > struct swap_info_struct *sis = swp_swap_info(folio->swap); > bool workingset = folio_test_workingset(folio); > + long first_non_zero_page_idx; > unsigned long pflags; > bool in_thrashing; > > @@ -598,10 +605,19 @@ void swap_read_folio(struct folio *folio, bool > synchronous, > psi_memstall_enter(&pflags); > } > delayacct_swapin_start(); > - if (swap_zeromap_folio_test(folio)) { > + first_non_zero_page_idx = swap_zeromap_folio_test(folio); > + if (first_non_zero_page_idx == folio_nr_pages(folio)) { > folio_zero_fill(folio); > folio_mark_uptodate(folio); > folio_unlock(folio); > + } else if (first_non_zero_page_idx != 0) { > + /* > + * The case for when only *some* of subpages being > swapped-in were recorded > + * in sis->zeromap, while the rest are in zswap/disk is > currently not handled. > + * WARN in this case and return without marking the > folio uptodate so that > + * an IO error is emitted (e.g. do_swap_page() will sigbus). > + */ > + WARN_ON_ONCE(1); > } else if (zswap_load(folio)) { > folio_mark_uptodate(folio); > folio_unlock(folio); > > This is too much noise for swap_read_folio(). How about adding swap_read_folio_zeromap() that takes care of this and decides whether or not to call folio_mark_uptodate()? -static bool swap_zeromap_folio_test(struct folio *folio) +/* + * Return the index of the first subpage which is not zero-filled according to + * swap_info_struct->zeromap. If all pages are zero-filled according to + * zeromap, it will return folio_nr_pages(folio). + */ +static unsigned int swap_zeromap_folio_test(struct folio *folio) { struct swap_info_struct *sis = swp_swap_info(folio->swap); swp_entry_t entry; @@ -243,9 +248,9 @@ static bool swap_zeromap_folio_test(struct folio *folio) for (i = 0; i < folio_nr_pages(folio); i++) { entry = page_swap_entry(folio_page(folio, i)); if (!test_bit(swp_offset(entry), sis->zeromap)) - return false; + return i; } - return true; + return i; } /* @@ -511,6 +516,25 @@ static void sio_read_complete(struct kiocb *iocb, long ret) mempool_free(sio, sio_pool); } +static bool swap_read_folio_zeromap(struct folio *folio) +{ + unsigned int idx = swap_zeromap_folio_test(folio); + + if (idx == 0) + return false; + + /* + * Swapping in a large folio that is partially in the zeromap is not + * currently handled. Return true without marking the folio uptodate so + * that an IO error is emitted (e.g. do_swap_page() will sigbus). + */ + if (WARN_ON_ONCE(idx < folio_nr_pages(folio))) + return true; + + folio_zero_fill(folio); + folio_mark_uptodate(folio); + return true +} + static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug) { struct swap_info_struct *sis = swp_swap_info(folio->swap); @@ -600,9 +624,7 @@ void swap_read_folio(struct folio *folio, bool synchronous, psi_memstall_enter(&pflags); } delayacct_swapin_start(); - if (swap_zeromap_folio_test(folio)) { - folio_zero_fill(folio); - folio_mark_uptodate(folio); + if (swap_read_folio_zeromap(folio)) { folio_unlock(folio); } else if (zswap_load(folio)) { folio_mark_uptodate(folio);