From: Pankaj Raghav <p.raghav@xxxxxxxxxxx>

This crosses off one of the TODOs that was part of the writeback_store()
function: A single page IO would be inefficient for write...

This reduces the writeback time of 4G of data to an NVMe backing device
from 68 secs to 15 secs (more than a 4x improvement).

The idea is to batch the IOs up to a certain limit before the data is
flushed to the backing device. The batch limit is initially chosen based
on the bdi->io_pages value, with an upper limit of 32 pages (128k on x86).
The limit is further adjusted based on writeback_limit, if set.

Signed-off-by: Pankaj Raghav <p.raghav@xxxxxxxxxxx>
---
 drivers/block/zram/zram_drv.c | 186 +++++++++++++++++++++-------------
 1 file changed, 113 insertions(+), 73 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 0b8f814e11dd..27313c2d781d 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -551,22 +551,6 @@ static ssize_t backing_dev_store(struct device *dev,
 	return err;
 }
 
-static unsigned long alloc_block_bdev(struct zram *zram)
-{
-	unsigned long blk_idx = 1;
-retry:
-	/* skip 0 bit to confuse zram.handle = 0 */
-	blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, blk_idx);
-	if (blk_idx == zram->nr_pages)
-		return 0;
-
-	if (test_and_set_bit(blk_idx, zram->bitmap))
-		goto retry;
-
-	atomic64_inc(&zram->stats.bd_count);
-	return blk_idx;
-}
-
 static void free_block_bdev(struct zram *zram, unsigned long blk_idx)
 {
 	int was_set;
@@ -628,6 +612,15 @@ static void read_from_bdev_async(struct zram *zram, struct page *page,
 #define IDLE_WRITEBACK			(1<<1)
 #define INCOMPRESSIBLE_WRITEBACK	(1<<2)
 
+#define MAX_INDEX_ENTRIES_ORDER 5
+#define MAX_INDEX_ENTRIES (1U << MAX_INDEX_ENTRIES_ORDER)
+struct index_mapping {
+	/* Cap the maximum indices to 32 before we flush */
+	unsigned long arr[MAX_INDEX_ENTRIES];
+	unsigned int nr_of_entries;
+};
+
+
 /*
  * Returns: true if the index was prepared for further processing
  *          false if the index can be skipped
@@ -668,39 +661,36 @@ static bool writeback_prep_or_skip_index(struct zram *zram, int mode,
 	return ret;
 }
 
-static int writeback_flush_to_bdev(struct zram *zram, unsigned long index,
-				   struct page *page, unsigned long *blk_idx)
+static int writeback_flush_to_bdev(struct zram *zram, struct folio *folio,
+				   struct index_mapping *map,
+				   unsigned long blk_idx, unsigned int io_pages)
 {
 	struct bio bio;
 	struct bio_vec bio_vec;
-	int ret;
+	int ret = 0;
+
+	if (!map->nr_of_entries)
+		return ret;
 
 	bio_init(&bio, zram->bdev, &bio_vec, 1,
 		 REQ_OP_WRITE | REQ_SYNC);
-	bio.bi_iter.bi_sector = *blk_idx * (PAGE_SIZE >> 9);
-	__bio_add_page(&bio, page, PAGE_SIZE, 0);
+	bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
+
+	if (!bio_add_folio(&bio, folio, io_pages * PAGE_SIZE, 0))
+		goto cleanup;
 
-	/*
-	 * XXX: A single page IO would be inefficient for write
-	 * but it would be not bad as starter.
-	 */
 	ret = submit_bio_wait(&bio);
-	if (ret) {
-		zram_slot_lock(zram, index);
-		zram_clear_flag(zram, index, ZRAM_UNDER_WB);
-		zram_clear_flag(zram, index, ZRAM_IDLE);
-		zram_slot_unlock(zram, index);
-		/*
-		 * BIO errors are not fatal, we continue and simply
-		 * attempt to writeback the remaining objects (pages).
-		 * At the same time we need to signal user-space that
-		 * some writes (at least one, but also could be all of
-		 * them) were not successful and we do so by returning
-		 * the most recent BIO error.
-		 */
-		return ret;
-	}
+	/*
+	 * BIO errors are not fatal, we continue and simply
+	 * attempt to writeback the remaining objects (pages).
+	 * At the same time we need to signal user-space that
+	 * some writes (at least one, but also could be all of
+	 * them) were not successful and we do so by returning
+	 * the most recent BIO error.
+	 */
+	if (ret)
+		goto cleanup;
 
-	atomic64_inc(&zram->stats.bd_writes);
+	atomic64_add(map->nr_of_entries, &zram->stats.bd_writes);
 	/*
 	 * We released zram_slot_lock so need to check if the slot was
 	 * changed. If there is freeing for the slot, we can catch it
@@ -710,28 +700,40 @@ static int writeback_flush_to_bdev(struct zram *zram, unsigned long index,
 	 * mark ZRAM_IDLE once it found the slot was ZRAM_UNDER_WB.
 	 * Thus, we could close the race by checking ZRAM_IDLE bit.
 	 */
-	zram_slot_lock(zram, index);
-	if (!zram_allocated(zram, index) ||
-	    !zram_test_flag(zram, index, ZRAM_IDLE)) {
-		zram_clear_flag(zram, index, ZRAM_UNDER_WB);
-		zram_clear_flag(zram, index, ZRAM_IDLE);
-		goto skip;
+	for (int iter = 0; iter < map->nr_of_entries; iter++) {
+		zram_slot_lock(zram, map->arr[iter]);
+		if (!zram_allocated(zram, map->arr[iter]) ||
+		    !zram_test_flag(zram, map->arr[iter], ZRAM_IDLE)) {
+			zram_clear_flag(zram, map->arr[iter], ZRAM_UNDER_WB);
+			zram_clear_flag(zram, map->arr[iter], ZRAM_IDLE);
+			zram_slot_unlock(zram, map->arr[iter]);
+			free_block_bdev(zram, blk_idx + iter);
+			continue;
+		}
+
+		zram_free_page(zram, map->arr[iter]);
+		zram_clear_flag(zram, map->arr[iter], ZRAM_UNDER_WB);
+		zram_set_flag(zram, map->arr[iter], ZRAM_WB);
+		zram_set_element(zram, map->arr[iter], blk_idx + iter);
+		zram_slot_unlock(zram, map->arr[iter]);
+		atomic64_inc(&zram->stats.pages_stored);
+
+		spin_lock(&zram->wb_limit_lock);
+		if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
+			zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
+		spin_unlock(&zram->wb_limit_lock);
 	}
+	return ret;
 
-	zram_free_page(zram, index);
-	zram_clear_flag(zram, index, ZRAM_UNDER_WB);
-	zram_set_flag(zram, index, ZRAM_WB);
-	zram_set_element(zram, index, *blk_idx);
-	atomic64_inc(&zram->stats.pages_stored);
-	*blk_idx = 0;
-
-	spin_lock(&zram->wb_limit_lock);
-	if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
-		zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
-	spin_unlock(&zram->wb_limit_lock);
-skip:
-	zram_slot_unlock(zram, index);
-	return 0;
+cleanup:
+	for (int iter = 0; iter < map->nr_of_entries; iter++) {
+		zram_slot_lock(zram, map->arr[iter]);
+		zram_clear_flag(zram, map->arr[iter], ZRAM_UNDER_WB);
+		zram_clear_flag(zram, map->arr[iter], ZRAM_IDLE);
+		zram_slot_unlock(zram, map->arr[iter]);
+	}
+	free_block_bdev_range(zram, blk_idx, map->nr_of_entries);
+	return ret;
 }
 
 static ssize_t writeback_store(struct device *dev,
@@ -741,9 +743,15 @@
 	unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
 	unsigned long index = 0;
 	struct page *page;
+	struct folio *folio;
 	ssize_t ret = len;
 	int mode, err;
 	unsigned long blk_idx = 0;
+	unsigned int io_pages;
+	u64 bd_wb_limit_pages = ULONG_MAX;
+	struct index_mapping map = {};
+	unsigned int order = min(MAX_INDEX_ENTRIES_ORDER,
+				 ilog2(zram->bdev->bd_disk->bdi->io_pages));
 
 	if (sysfs_streq(buf, "idle"))
 		mode = IDLE_WRITEBACK;
@@ -776,32 +784,48 @@
 		goto release_init_lock;
 	}
 
-	page = alloc_page(GFP_KERNEL);
-	if (!page) {
+	folio = folio_alloc(GFP_KERNEL, order);
+	if (!folio) {
 		ret = -ENOMEM;
 		goto release_init_lock;
 	}
 
 	for (; nr_pages != 0; index++, nr_pages--) {
 		spin_lock(&zram->wb_limit_lock);
-		if (zram->wb_limit_enable && !zram->bd_wb_limit) {
-			spin_unlock(&zram->wb_limit_lock);
-			ret = -EIO;
-			break;
+		if (zram->wb_limit_enable) {
+			if (!zram->bd_wb_limit) {
+				spin_unlock(&zram->wb_limit_lock);
+				ret = -EIO;
+				break;
+			}
+			bd_wb_limit_pages = zram->bd_wb_limit >>
+					    (PAGE_SHIFT - 12);
 		}
 		spin_unlock(&zram->wb_limit_lock);
 
 		if (!blk_idx) {
-			blk_idx = alloc_block_bdev(zram);
+			io_pages = min(1UL << order, nr_pages);
+			io_pages = min_t(u64, bd_wb_limit_pages, io_pages);
+
+			blk_idx = alloc_block_bdev_range(zram, &io_pages);
 			if (!blk_idx) {
 				ret = -ENOSPC;
 				break;
 			}
 		}
 
-		if (!writeback_prep_or_skip_index(zram, mode, index))
-			continue;
+		if (!writeback_prep_or_skip_index(zram, mode, index)) {
+			if (nr_pages > 1 || map.nr_of_entries == 0)
+				continue;
+			/* There are still some unfinished IOs that
+			 * needs to be flushed
+			 */
+			err = writeback_flush_to_bdev(zram, folio, &map,
+						      blk_idx, io_pages);
+			goto next;
+		}
 
+		page = folio_page(folio, map.nr_of_entries);
 		if (zram_read_page(zram, page, index, NULL)) {
 			zram_slot_lock(zram, index);
 			zram_clear_flag(zram, index, ZRAM_UNDER_WB);
@@ -810,15 +834,31 @@ static ssize_t writeback_store(struct device *dev,
 			continue;
 		}
 
-		err = writeback_flush_to_bdev(zram, index, page, &blk_idx);
+		map.arr[map.nr_of_entries++] = index;
+		if (map.nr_of_entries < io_pages)
+			continue;
+		err = writeback_flush_to_bdev(zram, folio, &map, blk_idx,
+					      io_pages);
+next:
 		if (err)
 			ret = err;
+
+		/*
+		 * Check if all the blocks have been utilized before
+		 * allocating the next batch. This is necessary to free
+		 * the unused blocks after looping through all indices.
+		 */
+		if (map.nr_of_entries == io_pages) {
+			blk_idx = 0;
+			map.nr_of_entries = 0;
+		}
 	}
 
 	if (blk_idx)
-		free_block_bdev(zram, blk_idx);
-	__free_page(page);
+		free_block_bdev_range(zram, blk_idx + map.nr_of_entries,
+				      io_pages - map.nr_of_entries);
+	folio_put(folio);
 
 release_init_lock:
 	up_read(&zram->init_lock);
-- 
2.40.1
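
[Reviewer aid, not part of the patch] Below is a minimal standalone
userspace sketch of the batch sizing described in the commit message and
applied in writeback_store() above: the batch order is capped at
MAX_INDEX_ENTRIES_ORDER and by the bdi's io_pages hint, and the resulting
page count is clamped by the remaining pages and by the writeback limit.
The sample inputs (bdi_io_pages, nr_pages, bd_wb_limit) are hypothetical,
and ilog2_u()/min_ul() stand in for the kernel's ilog2()/min()/min_t().

/*
 * Standalone sketch of the batch-size clamping used by this patch.
 * Sample values are hypothetical; helpers mimic kernel macros.
 */
#include <stdio.h>

#define MAX_INDEX_ENTRIES_ORDER	5	/* cap: 32 pages (128k on x86) */
#define PAGE_SHIFT		12	/* assume 4K pages */

/* integer log2, stand-in for the kernel's ilog2() */
static unsigned int ilog2_u(unsigned long v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

/* stand-in for the kernel's min()/min_t() */
static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long bdi_io_pages = 256;	/* hypothetical bdi->io_pages */
	unsigned long nr_pages = 7;		/* pages left to scan */
	unsigned long bd_wb_limit = 16;		/* writeback_limit, 4K units */

	/* order is bounded by both the 32-page cap and the bdi hint */
	unsigned int order = min_ul(MAX_INDEX_ENTRIES_ORDER,
				    ilog2_u(bdi_io_pages));
	/* convert the limit from 4K units to pages, as the patch does */
	unsigned long bd_wb_limit_pages = bd_wb_limit >> (PAGE_SHIFT - 12);
	/* clamp the batch by the remaining work and the writeback limit */
	unsigned long io_pages = min_ul(1UL << order, nr_pages);

	io_pages = min_ul(bd_wb_limit_pages, io_pages);
	printf("order=%u batch=%lu pages\n", order, io_pages);
	return 0;
}

With these sample values the batch comes out to 7 pages (order 5 allows
32, but only 7 pages remain and the limit still permits 16), which is the
same clamping order the patch performs before alloc_block_bdev_range().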