On (23/02/02 22:38), Andrew Morton wrote:
> Date: Thu, 02 Feb 2023 22:38:48 -0800
> From: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> To: mm-commits@xxxxxxxxxxxxxxx, vishal.l.verma@xxxxxxxxx,
>  senozhatsky@xxxxxxxxxxxx, minchan@xxxxxxxxxx, kbusch@xxxxxxxxxx,
>  ira.weiny@xxxxxxxxx, dave.jiang@xxxxxxxxx, dan.j.williams@xxxxxxxxx,
>  axboe@xxxxxxxxx, hch@xxxxxx, akpm@xxxxxxxxxxxxxxxxxxxx
> Subject: [merged mm-stable] block-remove-rw_page.patch removed from -mm tree
> Message-Id: <20230203063848.D066CC4339B@xxxxxxxxxxxxxxx>
>
>
> The quilt patch titled
>      Subject: block: remove ->rw_page
> has been removed from the -mm tree.  Its filename was
>      block-remove-rw_page.patch
>
> This patch was dropped because it was merged into the mm-stable branch
> of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
>
> ------------------------------------------------------
> From: Christoph Hellwig <hch@xxxxxx>
> Subject: block: remove ->rw_page
> Date: Wed, 25 Jan 2023 14:34:36 +0100
>
> The ->rw_page method is a special-purpose bypass of the usual bio
> handling path that is limited to synchronous single-page reads and
> writes, which causes a lot of extra code in the drivers, callers and
> the block layer.
>
> The only remaining user is the MM swap code.  Switch that swap code to
> simply submit a single-vec on-stack bio and synchronously wait on it,
> based on a newly added QUEUE_FLAG_SYNCHRONOUS flag set by the drivers
> that currently implement ->rw_page instead.  While this touches one
> extra cache line and executes extra code, it simplifies the block layer
> and drivers and ensures that all features are properly supported by all
> drivers, e.g. right now ->rw_page bypassed cgroup writeback entirely.
>
> [akpm@xxxxxxxxxxxxxxxxxxxx: fix comment typo, per Dan]
> Link: https://lkml.kernel.org/r/20230125133436.447864-8-hch@xxxxxx
> Signed-off-by: Christoph Hellwig <hch@xxxxxx>
> Reviewed-by: Dan Williams <dan.j.williams@xxxxxxxxx>
> Cc: Dave Jiang <dave.jiang@xxxxxxxxx>
> Cc: Ira Weiny <ira.weiny@xxxxxxxxx>
> Cc: Jens Axboe <axboe@xxxxxxxxx>
> Cc: Keith Busch <kbusch@xxxxxxxxxx>
> Cc: Minchan Kim <minchan@xxxxxxxxxx>
> Cc: Sergey Senozhatsky <senozhatsky@xxxxxxxxxxxx>
> Cc: Vishal Verma <vishal.l.verma@xxxxxxxxx>
> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> ---
>
>
> --- a/block/bdev.c~block-remove-rw_page
> +++ a/block/bdev.c
> @@ -304,84 +304,6 @@ out:
>  }
>  EXPORT_SYMBOL(thaw_bdev);
>
> -/**
> - * bdev_read_page() - Start reading a page from a block device
> - * @bdev: The device to read the page from
> - * @sector: The offset on the device to read the page to (need not be aligned)
> - * @page: The page to read
> - *
> - * On entry, the page should be locked.  It will be unlocked when the page
> - * has been read.  If the block driver implements rw_page synchronously,
> - * that will be true on exit from this function, but it need not be.
> - *
> - * Errors returned by this function are usually "soft", eg out of memory, or
> - * queue full; callers should try a different route to read this page rather
> - * than propagate an error back up the stack.
> - *
> - * Return: negative errno if an error occurs, 0 if submission was successful.
> - */
> -int bdev_read_page(struct block_device *bdev, sector_t sector,
> -		struct page *page)
> -{
> -	const struct block_device_operations *ops = bdev->bd_disk->fops;
> -	int result = -EOPNOTSUPP;
> -
> -	if (!ops->rw_page || bdev_get_integrity(bdev))
> -		return result;
> -
> -	result = blk_queue_enter(bdev_get_queue(bdev), 0);
> -	if (result)
> -		return result;
> -	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
> -			      REQ_OP_READ);
> -	blk_queue_exit(bdev_get_queue(bdev));
> -	return result;
> -}
> -
> -/**
> - * bdev_write_page() - Start writing a page to a block device
> - * @bdev: The device to write the page to
> - * @sector: The offset on the device to write the page to (need not be aligned)
> - * @page: The page to write
> - * @wbc: The writeback_control for the write
> - *
> - * On entry, the page should be locked and not currently under writeback.
> - * On exit, if the write started successfully, the page will be unlocked and
> - * under writeback.  If the write failed already (eg the driver failed to
> - * queue the page to the device), the page will still be locked.  If the
> - * caller is a ->writepage implementation, it will need to unlock the page.
> - *
> - * Errors returned by this function are usually "soft", eg out of memory, or
> - * queue full; callers should try a different route to write this page rather
> - * than propagate an error back up the stack.
> - *
> - * Return: negative errno if an error occurs, 0 if submission was successful.
> - */
> -int bdev_write_page(struct block_device *bdev, sector_t sector,
> -			struct page *page, struct writeback_control *wbc)
> -{
> -	int result;
> -	const struct block_device_operations *ops = bdev->bd_disk->fops;
> -
> -	if (!ops->rw_page || bdev_get_integrity(bdev))
> -		return -EOPNOTSUPP;
> -	result = blk_queue_enter(bdev_get_queue(bdev), 0);
> -	if (result)
> -		return result;
> -
> -	set_page_writeback(page);
> -	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
> -			      REQ_OP_WRITE);
> -	if (result) {
> -		end_page_writeback(page);
> -	} else {
> -		clean_page_buffers(page);
> -		unlock_page(page);
> -	}
> -	blk_queue_exit(bdev_get_queue(bdev));
> -	return result;
> -}
> -
>  /*
>   * pseudo-fs
>   */
> --- a/drivers/block/brd.c~block-remove-rw_page
> +++ a/drivers/block/brd.c
> @@ -309,23 +309,9 @@ static void brd_submit_bio(struct bio *b
>  	bio_endio(bio);
>  }
>
> -static int brd_rw_page(struct block_device *bdev, sector_t sector,
> -		       struct page *page, enum req_op op)
> -{
> -	struct brd_device *brd = bdev->bd_disk->private_data;
> -	int err;
> -
> -	if (PageTransHuge(page))
> -		return -ENOTSUPP;
> -	err = brd_do_bvec(brd, page, PAGE_SIZE, 0, op, sector);
> -	page_endio(page, op_is_write(op), err);
> -	return err;
> -}
> -
>  static const struct block_device_operations brd_fops = {
>  	.owner =		THIS_MODULE,
>  	.submit_bio =		brd_submit_bio,
> -	.rw_page =		brd_rw_page,
>  };
>
>  /*
> @@ -411,6 +397,7 @@ static int brd_alloc(int i)
>
>  	/* Tell the block layer that this is not a rotational device */
>  	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
> +	blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue);
>  	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
>  	err = add_disk(disk);
>  	if (err)
> --- a/drivers/block/zram/zram_drv.c~block-remove-rw_page
> +++ a/drivers/block/zram/zram_drv.c
> @@ -1453,10 +1453,6 @@ static int __zram_bvec_read(struct zram
>  		/* Slot should be unlocked before the function call */
>  		zram_slot_unlock(zram, index);
>
> -		/* A null bio means rw_page was used, we must fallback to bio */
> -		if (!bio)
> -			return -EOPNOTSUPP;
> -
>  		ret = zram_bvec_read_from_bdev(zram, page, index, bio,
>  					       partial_io);
>  	}
> @@ -2081,61 +2077,6 @@ static void zram_slot_free_notify(struct
>  	zram_slot_unlock(zram, index);
>  }
>
> -static int zram_rw_page(struct block_device *bdev, sector_t sector,
> -		       struct page *page, enum req_op op)
> -{
> -	int offset, ret;
> -	u32 index;
> -	struct zram *zram;
> -	struct bio_vec bv;
> -	unsigned long start_time;
> -
> -	if (PageTransHuge(page))
> -		return -ENOTSUPP;
> -	zram = bdev->bd_disk->private_data;
> -
> -	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
> -		atomic64_inc(&zram->stats.invalid_io);
> -		ret = -EINVAL;
> -		goto out;
> -	}
> -
> -	index = sector >> SECTORS_PER_PAGE_SHIFT;
> -	offset = (sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
> -
> -	bv.bv_page = page;
> -	bv.bv_len = PAGE_SIZE;
> -	bv.bv_offset = 0;
> -
> -	start_time = bdev_start_io_acct(bdev->bd_disk->part0,
> -			SECTORS_PER_PAGE, op, jiffies);
> -	ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL);
> -	bdev_end_io_acct(bdev->bd_disk->part0, op, start_time);
> -out:
> -	/*
> -	 * If I/O fails, just return error(ie, non-zero) without
> -	 * calling page_endio.
> -	 * It causes resubmit the I/O with bio request by upper functions
> -	 * of rw_page(e.g., swap_readpage, __swap_writepage) and
> -	 * bio->bi_end_io does things to handle the error
> -	 * (e.g., SetPageError, set_page_dirty and extra works).
> -	 */
> -	if (unlikely(ret < 0))
> -		return ret;
> -
> -	switch (ret) {
> -	case 0:
> -		page_endio(page, op_is_write(op), 0);
> -		break;
> -	case 1:
> -		ret = 0;
> -		break;
> -	default:
> -		WARN_ON(1);
> -	}
> -	return ret;
> -}
> -
>  static void zram_destroy_comps(struct zram *zram)
>  {
>  	u32 prio;
> @@ -2290,7 +2231,6 @@ static const struct block_device_operati
>  	.open = zram_open,
>  	.submit_bio = zram_submit_bio,
>  	.swap_slot_free_notify = zram_slot_free_notify,
> -	.rw_page = zram_rw_page,
>  	.owner = THIS_MODULE
>  };
>
> @@ -2389,6 +2329,7 @@ static int zram_add(void)
>  	set_capacity(zram->disk, 0);
>  	/* zram devices sort of resembles non-rotational disks */
>  	blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue);
> +	blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue);
>  	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue);
>
>  	/*
> --- a/drivers/nvdimm/btt.c~block-remove-rw_page
> +++ a/drivers/nvdimm/btt.c
> @@ -1482,20 +1482,6 @@ static void btt_submit_bio(struct bio *b
>  	bio_endio(bio);
>  }
>
> -static int btt_rw_page(struct block_device *bdev, sector_t sector,
> -		struct page *page, enum req_op op)
> -{
> -	struct btt *btt = bdev->bd_disk->private_data;
> -	int rc;
> -
> -	rc = btt_do_bvec(btt, NULL, page, thp_size(page), 0, op, sector);
> -	if (rc == 0)
> -		page_endio(page, op_is_write(op), 0);
> -
> -	return rc;
> -}
> -
> -
>  static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo)
>  {
>  	/* some standard values */
> @@ -1508,7 +1494,6 @@ static int btt_getgeo(struct block_devic
>  static const struct block_device_operations btt_fops = {
>  	.owner =		THIS_MODULE,
>  	.submit_bio =		btt_submit_bio,
> -	.rw_page =		btt_rw_page,
>  	.getgeo =		btt_getgeo,
>  };
>
> @@ -1530,6 +1515,7 @@ static int btt_blk_init(struct btt *btt)
>  	blk_queue_logical_block_size(btt->btt_disk->queue, btt->sector_size);
>  	blk_queue_max_hw_sectors(btt->btt_disk->queue, UINT_MAX);
>  	blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_disk->queue);
> +	blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, btt->btt_disk->queue);
>
>  	if (btt_meta_size(btt)) {
>  		rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt));
> --- a/drivers/nvdimm/pmem.c~block-remove-rw_page
> +++ a/drivers/nvdimm/pmem.c
> @@ -238,28 +238,6 @@ static void pmem_submit_bio(struct bio *
>  	bio_endio(bio);
>  }
>
> -static int pmem_rw_page(struct block_device *bdev, sector_t sector,
> -		       struct page *page, enum req_op op)
> -{
> -	struct pmem_device *pmem = bdev->bd_disk->private_data;
> -	blk_status_t rc;
> -
> -	if (op_is_write(op))
> -		rc = pmem_do_write(pmem, page, 0, sector, thp_size(page));
> -	else
> -		rc = pmem_do_read(pmem, page, 0, sector, thp_size(page));
> -	/*
> -	 * The ->rw_page interface is subtle and tricky.  The core
> -	 * retries on any error, so we can only invoke page_endio() in
> -	 * the successful completion case.  Otherwise, we'll see crashes
> -	 * caused by double completion.
> -	 */
> -	if (rc == 0)
> -		page_endio(page, op_is_write(op), 0);
> -
> -	return blk_status_to_errno(rc);
> -}
> -
>  /* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */
>  __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
>  		long nr_pages, enum dax_access_mode mode, void **kaddr,
> @@ -310,7 +288,6 @@ __weak long __pmem_direct_access(struct
>  static const struct block_device_operations pmem_fops = {
>  	.owner =		THIS_MODULE,
>  	.submit_bio =		pmem_submit_bio,
> -	.rw_page =		pmem_rw_page,
>  };
>
>  static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
> @@ -565,6 +542,7 @@ static int pmem_attach_disk(struct devic
>  	blk_queue_logical_block_size(q, pmem_sector_size(ndns));
>  	blk_queue_max_hw_sectors(q, UINT_MAX);
>  	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
> +	blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, q);
>  	if (pmem->pfn_flags & PFN_MAP)
>  		blk_queue_flag_set(QUEUE_FLAG_DAX, q);
>
> --- a/include/linux/blkdev.h~block-remove-rw_page
> +++ a/include/linux/blkdev.h
> @@ -554,6 +554,7 @@ struct request_queue {
>  #define QUEUE_FLAG_IO_STAT	7	/* do disk/partitions IO accounting */
>  #define QUEUE_FLAG_NOXMERGES	9	/* No extended merges */
>  #define QUEUE_FLAG_ADD_RANDOM	10	/* Contributes to random pool */
> +#define QUEUE_FLAG_SYNCHRONOUS	11	/* always completes in submit context */
>  #define QUEUE_FLAG_SAME_FORCE	12	/* force complete on same CPU */
>  #define QUEUE_FLAG_INIT_DONE	14	/* queue is initialized */
>  #define QUEUE_FLAG_STABLE_WRITES 15	/* don't modify blks until WB is done */
> @@ -1250,6 +1251,12 @@ static inline bool bdev_nonrot(struct bl
>  	return blk_queue_nonrot(bdev_get_queue(bdev));
>  }
>
> +static inline bool bdev_synchronous(struct block_device *bdev)
> +{
> +	return test_bit(QUEUE_FLAG_SYNCHRONOUS,
> +			&bdev_get_queue(bdev)->queue_flags);
> +}
> +
>  static inline bool bdev_stable_writes(struct block_device *bdev)
>  {
>  	return test_bit(QUEUE_FLAG_STABLE_WRITES,
> @@ -1382,7 +1389,6 @@ struct block_device_operations {
>  			unsigned int flags);
>  	int (*open) (struct block_device *, fmode_t);
>  	void (*release) (struct gendisk *, fmode_t);
> -	int (*rw_page)(struct block_device *, sector_t, struct page *, enum req_op);
>  	int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
>  	int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
>  	unsigned int (*check_events) (struct gendisk *disk,
> @@ -1417,10 +1423,6 @@ extern int blkdev_compat_ptr_ioctl(struc
>  #define blkdev_compat_ptr_ioctl NULL
>  #endif
>
> -extern int bdev_read_page(struct block_device *, sector_t, struct page *);
> -extern int bdev_write_page(struct block_device *, sector_t, struct page *,
> -						struct writeback_control *);
> -
>  static inline void blk_wake_io_task(struct task_struct *waiter)
>  {
>  	/*
> --- a/mm/page_io.c~block-remove-rw_page
> +++ a/mm/page_io.c
> @@ -27,7 +27,7 @@
>  #include <linux/delayacct.h>
>  #include "swap.h"
>
> -static void end_swap_bio_write(struct bio *bio)
> +static void __end_swap_bio_write(struct bio *bio)
>  {
>  	struct page *page = bio_first_page_all(bio);
>
> @@ -48,6 +48,11 @@ static void end_swap_bio_write(struct bi
>  		ClearPageReclaim(page);
>  	}
>  	end_page_writeback(page);
> +}
> +
> +static void end_swap_bio_write(struct bio *bio)
> +{
> +	__end_swap_bio_write(bio);
>  	bio_put(bio);
>  }
>
> @@ -326,15 +331,31 @@ static void swap_writepage_fs(struct pag
>  		*wbc->swap_plug = sio;
>  }
>
> -static void swap_writepage_bdev(struct page *page,
> +static void swap_writepage_bdev_sync(struct page *page,
>  		struct writeback_control *wbc, struct swap_info_struct *sis)
>  {
> -	struct bio *bio;
> +	struct bio_vec bv;
> +	struct bio bio;
>
> -	if (!bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc)) {
> -		count_swpout_vm_event(page);
> -		return;
> -	}
> +	bio_init(&bio, sis->bdev, &bv, 1,
> +		 REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc));
> +	bio.bi_iter.bi_sector = swap_page_sector(page);
> +	bio_add_page(&bio, page, thp_size(page), 0);
> +
> +	bio_associate_blkg_from_page(&bio, page);
> +	count_swpout_vm_event(page);
> +
> +	set_page_writeback(page);
> +	unlock_page(page);
> +
> +	submit_bio_wait(&bio);
> +	__end_swap_bio_write(&bio);
> +}
> +
> +static void swap_writepage_bdev_async(struct page *page,
> +		struct writeback_control *wbc, struct swap_info_struct *sis)
> +{
> +	struct bio *bio;
>
>  	bio = bio_alloc(sis->bdev, 1,
>  			REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc),
> @@ -362,8 +383,10 @@ void __swap_writepage(struct page *page,
>  	 */
>  	if (data_race(sis->flags & SWP_FS_OPS))
>  		swap_writepage_fs(page, wbc);
> +	else if (sis->flags & SWP_SYNCHRONOUS_IO)
> +		swap_writepage_bdev_sync(page, wbc, sis);
>  	else
> -		swap_writepage_bdev(page, wbc, sis);
> +		swap_writepage_bdev_async(page, wbc, sis);
>  }
>
>  void swap_write_unplug(struct swap_iocb *sio)
> @@ -447,12 +470,6 @@ static void swap_readpage_bdev_sync(stru
>  	struct bio_vec bv;
>  	struct bio bio;
>
> -	if ((sis->flags & SWP_SYNCHRONOUS_IO) &&
> -	    !bdev_read_page(sis->bdev, swap_page_sector(page), page)) {
> -		count_vm_event(PSWPIN);
> -		return;
> -	}
> -
>  	bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ);
>  	bio.bi_iter.bi_sector = swap_page_sector(page);
>  	bio_add_page(&bio, page, thp_size(page), 0);
> @@ -472,12 +489,6 @@ static void swap_readpage_bdev_async(str
>  {
>  	struct bio *bio;
>
> -	if ((sis->flags & SWP_SYNCHRONOUS_IO) &&
> -	    !bdev_read_page(sis->bdev, swap_page_sector(page), page)) {
> -		count_vm_event(PSWPIN);
> -		return;
> -	}
> -
>  	bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL);
>  	bio->bi_iter.bi_sector = swap_page_sector(page);
>  	bio->bi_end_io = end_swap_bio_read;
> @@ -513,7 +524,7 @@ void swap_readpage(struct page *page, bo
>  		unlock_page(page);
>  	} else if (data_race(sis->flags & SWP_FS_OPS)) {
>  		swap_readpage_fs(page, plug);
> -	} else if (synchronous) {
> +	} else if (synchronous || (sis->flags & SWP_SYNCHRONOUS_IO)) {
>  		swap_readpage_bdev_sync(page, sis);
>  	} else {
>  		swap_readpage_bdev_async(page, sis);
> --- a/mm/swapfile.c~block-remove-rw_page
> +++ a/mm/swapfile.c
> @@ -3071,7 +3071,7 @@ SYSCALL_DEFINE2(swapon, const char __use
>  	if (p->bdev && bdev_stable_writes(p->bdev))
>  		p->flags |= SWP_STABLE_WRITES;
>
> -	if (p->bdev && p->bdev->bd_disk->fops->rw_page)
> +	if (p->bdev && bdev_synchronous(p->bdev))
>  		p->flags |= SWP_SYNCHRONOUS_IO;
>
>  	if (p->bdev && bdev_nonrot(p->bdev)) {
> _
>
> Patches currently in -mm which might be from hch@xxxxxx are
>
>
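
A note for anyone following along: with ->rw_page gone, the pattern above
reduces to two pieces.  A driver that completes I/O in the submit context
advertises that once at init time via
blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, ...), and a caller that wants a
synchronous single-page read builds a one-segment bio on the stack and
waits on it, as swap_readpage_bdev_sync() in the patch now does.  A minimal
sketch of the read side (read_page_sync() is a made-up wrapper name, not
something the series adds):

	#include <linux/bio.h>
	#include <linux/blkdev.h>

	/*
	 * On-stack bio replacement for the old bdev_read_page():
	 * the bio and its single bio_vec live on the stack, so there
	 * is no allocation to fail, and submit_bio_wait() blocks
	 * until the driver completes the I/O.
	 */
	static int read_page_sync(struct block_device *bdev, sector_t sector,
				  struct page *page)
	{
		struct bio_vec bv;
		struct bio bio;

		bio_init(&bio, bdev, &bv, 1, REQ_OP_READ);
		bio.bi_iter.bi_sector = sector;
		bio_add_page(&bio, page, PAGE_SIZE, 0);

		/* 0 on success, negative errno from the driver otherwise */
		return submit_bio_wait(&bio);
	}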
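
The write side is the subtle half: the page must be marked under writeback
and unlocked before submission, and the completion work that an
asynchronous ->bi_end_io would normally do runs inline after
submit_bio_wait() returns, which is exactly what the __end_swap_bio_write()
split above enables.  A sketch under the same assumptions
(write_page_sync() is again hypothetical, with error handling cut down to
the minimum):

	/*
	 * Synchronous write mirroring swap_writepage_bdev_sync():
	 * writeback state is set up front, and the "completion"
	 * handling runs in the submitter once the wait returns.
	 */
	static void write_page_sync(struct block_device *bdev, sector_t sector,
				    struct page *page)
	{
		struct bio_vec bv;
		struct bio bio;

		bio_init(&bio, bdev, &bv, 1, REQ_OP_WRITE | REQ_SWAP);
		bio.bi_iter.bi_sector = sector;
		bio_add_page(&bio, page, PAGE_SIZE, 0);

		set_page_writeback(page);
		unlock_page(page);

		submit_bio_wait(&bio);

		/* inline what an async ->bi_end_io would have done */
		if (bio.bi_status)
			SetPageError(page);
		end_page_writeback(page);
	}

Since swapon() now keys SWP_SYNCHRONOUS_IO off bdev_synchronous() rather
than poking at fops->rw_page, any driver that sets the queue flag gets the
synchronous swap paths for free, cgroup writeback included.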