On 5/16/23 13:43, Jens Axboe wrote:
> On 5/16/23 2:41 PM, Chaitanya Kulkarni wrote:
>> On 5/16/23 06:08, Sergey Senozhatsky wrote:
>>> On (23/05/16 05:51), Chaitanya Kulkarni wrote:
>>>> V2 with the modparam removed is ready to send, but I have a few
>>>> concerns about enabling nowait unconditionally for zram :-
>>>>
>>>> From the brd data [1] and zram data [2] from my setup :-
>>>>
>>>>        IOPs (old->new)       | sys cpu% (old->new)
>>>> --------------------------------------------------
>>>> brd  | 1.5x  (3919 -> 5874) | 3x (29 -> 87)
>>>> zram | 1.09x (  29 ->   87) | 9x (11 -> 97)
>>>>
>>>> brd :-
>>>> IOPs increased by ~1.5 times              ( 50% up)
>>>> sys CPU percentage increased by ~3.0 times (200% up)
>>>>
>>>> zram :-
>>>> IOPs increased by ~1.09 times             (  9% up)
>>>> sys CPU percentage increased by ~8.81 times (781% up)
>>>>
>>>> This comparison clearly demonstrates that zram experiences a much more
>>>> substantial CPU load relative to the increase in IOPs compared to brd.
>>>> Such a significant difference might suggest a potential CPU regression
>>>> in zram ?
>>>>
>>>> Especially for zram, if applications are not expecting this high CPU
>>>> usage then we'll get regression reports with the default nowait
>>>> approach. How about we avoid something like this with one of the
>>>> following options ?
>>> Well, zram performs decompression/compression on the CPU (per-CPU
>>> crypto streams) for each IO operation, so zram IO is CPU intensive.
>> and that is exactly why I've raised this issue, are you okay with that ?
>> I'll send V2 with nowait enabled by default ..
> Did you check that it's actually nowait sane to begin with? I spent
> literally 30 seconds on that when you sent the first patch, and the
> partial/sync path does not look kosher.
>

I did check for it, and it needs nowait-sane preparation in V2 with
something like this [1], plus returning an error with
bio_wouldblock_error() when we end up in read_from_bdev_sync() and it is
not called from writeback_store() (rough changes in [1]; a sketch of the
bio_wouldblock_error() part follows the patch below).

But after taking the performance numbers repeatedly, the main concern I
have failed to resolve is the numbers above versus enabling nowait by
default for zram ...

-ck

[1]

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 08d198431a88..c2f154911cc0 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -55,7 +55,7 @@ static const struct block_device_operations zram_devops;
 
 static void zram_free_page(struct zram *zram, size_t index);
 static int zram_read_page(struct zram *zram, struct page *page, u32 index,
-			  struct bio *parent, blk_opf_t opf);
+			  struct bio *parent, bool nowait);
 
 static int zram_slot_trylock(struct zram *zram, u32 index)
 {
@@ -690,7 +690,7 @@ static ssize_t writeback_store(struct device *dev,
 		/* Need for hugepage writeback racing */
 		zram_set_flag(zram, index, ZRAM_IDLE);
 		zram_slot_unlock(zram, index);
-		if (zram_read_page(zram, page, index, NULL, 0)) {
+		if (zram_read_page(zram, page, index, NULL, false)) {
 			zram_slot_lock(zram, index);
 			zram_clear_flag(zram, index, ZRAM_UNDER_WB);
 			zram_clear_flag(zram, index, ZRAM_IDLE);
@@ -772,6 +772,7 @@ struct zram_work {
 	unsigned long entry;
 	struct page *page;
 	int error;
+	bool nowait;
 };
 
 static void zram_sync_read(struct work_struct *work)
@@ -779,8 +780,9 @@ static void zram_sync_read(struct work_struct *work)
 {
 	struct zram_work *zw = container_of(work, struct zram_work, work);
 	struct bio_vec bv;
 	struct bio bio;
+	blk_opf_t nowait = zw->nowait ? REQ_NOWAIT : 0;
-	bio_init(&bio, zw->zram->bdev, &bv, 1, REQ_OP_READ);
+	bio_init(&bio, zw->zram->bdev, &bv, 1, REQ_OP_READ | nowait);
 	bio.bi_iter.bi_sector = zw->entry * (PAGE_SIZE >> 9);
 	__bio_add_page(&bio, zw->page, PAGE_SIZE, 0);
 	zw->error = submit_bio_wait(&bio);
@@ -792,18 +794,14 @@
  * use a worker thread context.
  */
 static int read_from_bdev_sync(struct zram *zram, struct page *page,
-				unsigned long entry, blk_opf_t opf)
+				unsigned long entry, bool nowait)
 {
 	struct zram_work work;
 
 	work.page = page;
 	work.zram = zram;
 	work.entry = entry;
-
-	if (opf & REQ_NOWAIT) {
-		zram_sync_read(&work);
-		return work.error;
-	}
+	work.nowait = nowait;
 
 	INIT_WORK_ONSTACK(&work.work, zram_sync_read);
 	queue_work(system_unbound_wq, &work.work);
@@ -814,21 +812,21 @@ static int read_from_bdev_sync(struct zram *zram, struct page *page,
 }
 
 static int read_from_bdev(struct zram *zram, struct page *page,
-			unsigned long entry, struct bio *parent, blk_opf_t opf)
+			unsigned long entry, struct bio *parent, bool nowait)
 {
 	atomic64_inc(&zram->stats.bd_reads);
 	if (!parent) {
 		if (WARN_ON_ONCE(!IS_ENABLED(ZRAM_PARTIAL_IO)))
 			return -EIO;
-		return read_from_bdev_sync(zram, page, entry, opf);
+		return read_from_bdev_sync(zram, page, entry, nowait);
 	}
-	read_from_bdev_async(zram, page, entry, parent, opf);
+	read_from_bdev_async(zram, page, entry, parent, nowait);
 	return 0;
 }
 #else
 static inline void reset_bdev(struct zram *zram) {};
 static int read_from_bdev(struct zram *zram, struct page *page,
-			unsigned long entry, struct bio *parent, blk_opf_t opf)
+			unsigned long entry, struct bio *parent, bool nowait)
 {
 	return -EIO;
 }
@@ -1361,7 +1359,7 @@ static int zram_read_from_zspool(struct zram *zram, struct page *page,
 }
 
 static int zram_read_page(struct zram *zram, struct page *page, u32 index,
-			  struct bio *parent, blk_opf_t bi_opf)
+			  struct bio *parent, bool nowait)
 {
 	int ret;
 
@@ -1378,7 +1376,7 @@ static int zram_read_page(struct zram *zram, struct page *page, u32 index,
 		zram_slot_unlock(zram, index);
 
 		ret = read_from_bdev(zram, page, zram_get_element(zram, index),
-				     parent, bi_opf);
+				     parent, nowait);
 	}
 
 	/* Should NEVER happen. Return bio error if it does. */
@@ -1395,13 +1393,14 @@ static int zram_read_page(struct zram *zram, struct page *page, u32 index,
 static int zram_bvec_read_partial(struct zram *zram, struct bio_vec *bvec,
 				  u32 index, int offset, struct bio *bio)
 {
+	bool nowait = bio->bi_opf & REQ_NOWAIT;
 	struct page *page;
 	int ret;
 
-	page = alloc_page(bio->bi_opf & REQ_NOWAIT ? GFP_NOWAIT : GFP_NOIO);
+	page = alloc_page(nowait ? GFP_NOWAIT : GFP_NOIO);
 	if (!page)
 		return -ENOMEM;
-	ret = zram_read_page(zram, page, index, NULL, bio->bi_opf);
+	ret = zram_read_page(zram, page, index, NULL, nowait);
 	if (likely(!ret))
 		memcpy_to_bvec(bvec, page_address(page) + offset);
 	__free_page(page);
@@ -1413,7 +1412,8 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
 {
 	if (is_partial_io(bvec))
 		return zram_bvec_read_partial(zram, bvec, index, offset, bio);
-	return zram_read_page(zram, bvec->bv_page, index, bio, bio->bi_opf);
+	return zram_read_page(zram, bvec->bv_page, index, bio,
+			      bio->bi_opf & REQ_NOWAIT);
 }
 
 static int zram_write_page(struct zram *zram, struct page *page, u32 index)
@@ -1547,14 +1547,15 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index)
 static int zram_bvec_write_partial(struct zram *zram, struct bio_vec *bvec,
 				   u32 index, int offset, struct bio *bio)
 {
+	bool nowait = bio->bi_opf & REQ_NOWAIT;
 	struct page *page;
 	int ret;
 
-	page = alloc_page(bio->bi_opf & REQ_NOWAIT ? GFP_NOWAIT : GFP_NOIO);
+	page = alloc_page(nowait ? GFP_NOWAIT : GFP_NOIO);
 	if (!page)
 		return -ENOMEM;
 
-	ret = zram_read_page(zram, page, index, bio, bio->bi_opf);
+	ret = zram_read_page(zram, page, index, bio, nowait);
 	if (!ret) {
 		memcpy_from_bvec(page_address(page) + offset, bvec);
 		ret = zram_write_page(zram, page, index);
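For reference, here is a rough, untested sketch of the bio_wouldblock_error()
fallback mentioned above, layered on top of the changes in [1]. The placement
of the check and the completion fragment are assumptions for illustration
only, not part of the patch: read_from_bdev() refuses the blocking sync path
for nowait I/O, and the submitter completes the bio with BLK_STS_AGAIN
instead of waiting.

static int read_from_bdev(struct zram *zram, struct page *page,
			  unsigned long entry, struct bio *parent, bool nowait)
{
	atomic64_inc(&zram->stats.bd_reads);
	if (!parent) {
		if (WARN_ON_ONCE(!IS_ENABLED(ZRAM_PARTIAL_IO)))
			return -EIO;
		/*
		 * read_from_bdev_sync() bounces the read to a worker and
		 * waits for it, so it cannot honour REQ_NOWAIT; bail out
		 * and let the submitter fail the bio with BLK_STS_AGAIN.
		 * writeback_store() passes nowait == false, so it keeps
		 * using the sync path.
		 */
		if (nowait)
			return -EAGAIN;
		return read_from_bdev_sync(zram, page, entry, nowait);
	}
	read_from_bdev_async(zram, page, entry, parent, nowait);
	return 0;
}

and then, wherever the read/write result is propagated back to the original
bio (the exact spot is an assumption here):

	if (ret == -EAGAIN && (bio->bi_opf & REQ_NOWAIT)) {
		/* sets bi_status to BLK_STS_AGAIN and calls bio_endio() */
		bio_wouldblock_error(bio);
		return;
	}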