Currently ext4_writepages() submits all pages with a transaction handle
started. When no page needs block allocation or extent conversion, we can
end up submitting all dirty pages in the inode while holding a single
transaction handle, and when the device is congested this can take a
significant amount of time. Thus ext4_writepages() can block transaction
commits for extended periods of time.

Take for example a simple benchmark simulating a PostgreSQL database
(pgioperf in mmtest). The benchmark runs 16 processes doing random reads
from a huge file, one process doing random writes to the huge file, and
one process doing sequential writes to a small file and frequently
running fsync. With an unpatched kernel, transaction commits take ~18s on
average with a standard deviation of ~41s; the top 5 commit times are:
274.466639s, 126.467347s, 86.992429s, 34.351563s, and 31.517653s. With
this patch applied, transaction commits take 0.1s on average with a
standard deviation of 0.15s; the top 5 commit times are: 0.563792s,
0.519980s, 0.509841s, 0.471700s, and 0.469899s.

Signed-off-by: Jan Kara <jack@xxxxxxx>
---
 fs/ext4/inode.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index baa87e7d1426..ff55d430938b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2171,6 +2171,9 @@ static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
 
 	/* First block in the extent? */
 	if (map->m_len == 0) {
+		/* We cannot map unless handle is started... */
+		if (!mpd->io_submit.io_end)
+			return false;
 		map->m_lblk = lblk;
 		map->m_len = 1;
 		map->m_flags = bh->b_state & BH_FLAGS;
@@ -2223,6 +2226,9 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
 			/* Found extent to map? */
 			if (mpd->map.m_len)
 				return 0;
+			/* Buffer needs mapping and handle is not started? */
+			if (!mpd->io_submit.io_end)
+				return 0;
 			/* Everything mapped so far and we hit EOF */
 			break;
 		}
@@ -2739,6 +2745,21 @@ static int ext4_writepages(struct address_space *mapping,
 		tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
 	done = false;
 	blk_start_plug(&plug);
+
+	/*
+	 * First writeback pages that don't need mapping - we can avoid
+	 * starting a transaction unnecessarily and also avoid being blocked
+	 * in the block layer on device congestion while having transaction
+	 * started.
+	 */
+	ret = mpage_prepare_extent_to_map(&mpd);
+	/* Submit prepared bio */
+	ext4_io_submit(&mpd.io_submit);
+	/* Unlock pages we didn't use */
+	mpage_release_unused_pages(&mpd, false);
+	if (ret < 0)
+		goto unplug;
+
 	while (!done && mpd.first_page <= mpd.last_page) {
 		/* For each extent of pages we use new io_end */
 		mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
@@ -2767,6 +2788,7 @@ static int ext4_writepages(struct address_space *mapping,
 				wbc->nr_to_write, inode->i_ino, ret);
 			/* Release allocated io_end */
 			ext4_put_io_end(mpd.io_submit.io_end);
+			mpd.io_submit.io_end = NULL;
 			break;
 		}
 
@@ -2816,6 +2838,7 @@ static int ext4_writepages(struct address_space *mapping,
 			ext4_journal_stop(handle);
 		} else
 			ext4_put_io_end(mpd.io_submit.io_end);
+		mpd.io_submit.io_end = NULL;
 
 		if (ret == -ENOSPC && sbi->s_journal) {
 			/*
@@ -2831,6 +2854,7 @@ static int ext4_writepages(struct address_space *mapping,
 		if (ret)
 			break;
 	}
+unplug:
 	blk_finish_plug(&plug);
 	if (!ret && !cycled && wbc->nr_to_write > 0) {
 		cycled = 1;
-- 
2.12.0
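
[editor's note] For readers following along, here is a minimal,
self-contained user-space sketch of the idea behind the patch: submit the
I/O that needs no metadata changes before taking the shared "handle", so
transaction commits are blocked only while mapping work is actually in
progress. This is not ext4 code; the mutex standing in for the journal
handle and all names (journal, struct pageish, submit_io,
writepages_sketch) are hypothetical illustrations.

/*
 * Hypothetical user-space analogue of the two-phase writeback this patch
 * introduces; none of these names exist in ext4.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct pageish {
	bool mapped;	/* already has blocks allocated? */
	int  id;
};

/* Stands in for the running journal transaction that a commit must wait on. */
static pthread_mutex_t journal = PTHREAD_MUTEX_INITIALIZER;

static void submit_io(const struct pageish *p)
{
	/* Models a bio submission that may stall on a congested device. */
	printf("submitting page %d\n", p->id);
}

static void writepages_sketch(struct pageish *pages, int n)
{
	int i;

	/*
	 * Phase 1: flush already-mapped pages without holding the "handle",
	 * mirroring the new mpage_prepare_extent_to_map() pre-pass where
	 * io_end == NULL means "refuse to map anything".
	 */
	for (i = 0; i < n; i++)
		if (pages[i].mapped)
			submit_io(&pages[i]);

	/*
	 * Phase 2: take the "handle" only for pages that still need block
	 * allocation, so commits wait only on real mapping work.
	 */
	pthread_mutex_lock(&journal);
	for (i = 0; i < n; i++) {
		if (!pages[i].mapped) {
			pages[i].mapped = true;	/* "allocate blocks" */
			submit_io(&pages[i]);
		}
	}
	pthread_mutex_unlock(&journal);
}

int main(void)
{
	struct pageish pages[] = {
		{ true, 0 }, { false, 1 }, { true, 2 }, { false, 3 },
	};

	writepages_sketch(pages, 4);
	return 0;
}

One difference from the sketch: in the real patch, phase 2 additionally
splits the work per extent, starting and stopping a handle around each
mapped extent rather than holding a single handle across the whole loop.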