Apply the fsio controller to the appropriate kernel functions to
account for and throttle filesystem I/O.

Signed-off-by: Andrea Righi <righi.andrea@xxxxxxxxx>
---
 block/blk-core.c          | 10 ++++++++++
 include/linux/writeback.h |  7 ++++++-
 mm/filemap.c              | 20 +++++++++++++++++++-
 mm/page-writeback.c       | 14 ++++++++++++--
 4 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 3c5f61ceeb67..4b4717f64ac1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -16,6 +16,7 @@
 #include <linux/backing-dev.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/fsio-throttle.h>
 #include <linux/blk-mq.h>
 #include <linux/highmem.h>
 #include <linux/mm.h>
@@ -956,6 +957,15 @@ generic_make_request_checks(struct bio *bio)
 	 */
 	create_io_context(GFP_ATOMIC, q->node);
 
+	/*
+	 * Account only READs at this layer (WRITEs are accounted and throttled
+	 * in balance_dirty_pages()) and don't enforce sleeps (state=0): this
+	 * way we can prevent potential lock contention and priority inversion
+	 * problems at the filesystem layer.
+	 */
+	if (bio_op(bio) == REQ_OP_READ)
+		fsio_throttle(bio_dev(bio), bio->bi_iter.bi_size, 0);
+
 	if (!blkcg_bio_issue_check(q, bio))
 		return false;
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 738a0c24874f..1e161c7969e5 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -356,7 +356,12 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
 unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
 
 void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time);
-void balance_dirty_pages_ratelimited(struct address_space *mapping);
+
+#define balance_dirty_pages_ratelimited(__mapping) \
+	__balance_dirty_pages_ratelimited(__mapping, false)
+void __balance_dirty_pages_ratelimited(struct address_space *mapping,
+				       bool redirty);
+
 bool wb_over_bg_thresh(struct bdi_writeback *wb);
 
 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
diff --git a/mm/filemap.c b/mm/filemap.c
index 9f5e323e883e..5cc0959274d6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -29,6 +29,7 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 #include <linux/blkdev.h>
+#include <linux/fsio-throttle.h>
 #include <linux/security.h>
 #include <linux/cpuset.h>
 #include <linux/hugetlb.h>
@@ -2040,6 +2041,7 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb,
 {
 	struct file *filp = iocb->ki_filp;
 	struct address_space *mapping = filp->f_mapping;
+	struct block_device *bdev = as_to_bdev(mapping);
 	struct inode *inode = mapping->host;
 	struct file_ra_state *ra = &filp->f_ra;
 	loff_t *ppos = &iocb->ki_pos;
@@ -2068,6 +2070,7 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb,
 		cond_resched();
 find_page:
+		fsio_throttle(bdev_to_dev(bdev), 0, TASK_INTERRUPTIBLE);
 		if (fatal_signal_pending(current)) {
 			error = -EINTR;
 			goto out;
 		}
@@ -2308,11 +2311,17 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 	if (iocb->ki_flags & IOCB_DIRECT) {
 		struct file *file = iocb->ki_filp;
 		struct address_space *mapping = file->f_mapping;
+		struct block_device *bdev = as_to_bdev(mapping);
 		struct inode *inode = mapping->host;
 		loff_t size;
 
 		size = i_size_read(inode);
 		if (iocb->ki_flags & IOCB_NOWAIT) {
+			unsigned long long sleep;
+
+			sleep = fsio_throttle(bdev_to_dev(bdev), 0, 0);
+			if (sleep)
+				return -EAGAIN;
 			if (filemap_range_has_page(mapping, iocb->ki_pos,
 						   iocb->ki_pos + count - 1))
 				return -EAGAIN;
@@ -2322,6 +2331,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 						iocb->ki_pos + count - 1);
 			if (retval < 0)
 				goto out;
+			fsio_throttle(bdev_to_dev(bdev), 0, TASK_INTERRUPTIBLE);
 		}
 
 		file_accessed(file);
@@ -2366,9 +2376,11 @@ EXPORT_SYMBOL(generic_file_read_iter);
 static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
 {
 	struct address_space *mapping = file->f_mapping;
+	struct block_device *bdev = as_to_bdev(mapping);
 	struct page *page;
 	int ret;
 
+	fsio_throttle(bdev_to_dev(bdev), 0, TASK_INTERRUPTIBLE);
 	do {
 		page = __page_cache_alloc(gfp_mask);
 		if (!page)
@@ -2498,11 +2510,15 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
 	 */
 	page = find_get_page(mapping, offset);
 	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
+		struct block_device *bdev = as_to_bdev(mapping);
 		/*
 		 * We found the page, so try async readahead before
 		 * waiting for the lock.
 		 */
 		do_async_mmap_readahead(vmf->vma, ra, file, page, offset);
+		if (unlikely(!PageUptodate(page)))
+			fsio_throttle(bdev_to_dev(bdev), 0,
+				      TASK_INTERRUPTIBLE);
 	} else if (!page) {
 		/* No page in the page cache at all */
 		do_sync_mmap_readahead(vmf->vma, ra, file, offset);
@@ -3172,6 +3188,7 @@ ssize_t generic_perform_write(struct file *file,
 	long status = 0;
 	ssize_t written = 0;
 	unsigned int flags = 0;
+	unsigned int dirty;
 
 	do {
 		struct page *page;
@@ -3216,6 +3233,7 @@ ssize_t generic_perform_write(struct file *file,
 		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
 		flush_dcache_page(page);
 
+		dirty = PageDirty(page);
 		status = a_ops->write_end(file, mapping, pos, bytes, copied,
 						page, fsdata);
 		if (unlikely(status < 0))
@@ -3241,7 +3259,7 @@ ssize_t generic_perform_write(struct file *file,
 		pos += copied;
 		written += copied;
 
-		balance_dirty_pages_ratelimited(mapping);
+		__balance_dirty_pages_ratelimited(mapping, dirty);
 	} while (iov_iter_count(i));
 
 	return written ? written : status;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7d1010453fb9..694ede8783f3 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -20,6 +20,7 @@
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
+#include <linux/fsio-throttle.h>
 #include <linux/init.h>
 #include <linux/backing-dev.h>
 #include <linux/task_io_accounting_ops.h>
@@ -1858,10 +1859,12 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
  * limit we decrease the ratelimiting by a lot, to prevent individual processes
  * from overshooting the limit by (ratelimit_pages) each.
  */
-void balance_dirty_pages_ratelimited(struct address_space *mapping)
+void __balance_dirty_pages_ratelimited(struct address_space *mapping,
+				       bool redirty)
 {
 	struct inode *inode = mapping->host;
 	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct block_device *bdev = as_to_bdev(mapping);
 	struct bdi_writeback *wb = NULL;
 	int ratelimit;
 	int *p;
@@ -1878,6 +1881,13 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
 	if (wb->dirty_exceeded)
 		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
 
+	/*
+	 * Throttle filesystem I/O only if the page was initially clean:
+	 * re-writing a dirty page doesn't generate additional I/O.
+	 */
+	if (!redirty)
+		fsio_throttle(bdev_to_dev(bdev), PAGE_SIZE, TASK_KILLABLE);
+
 	preempt_disable();
 	/*
 	 * This prevents one CPU to accumulate too many dirtied pages without
@@ -1911,7 +1921,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
 
 	wb_put(wb);
 }
-EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
+EXPORT_SYMBOL(__balance_dirty_pages_ratelimited);
 
 /**
  * wb_over_bg_thresh - does @wb need to be written back?
-- 
2.17.1
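
For reviewers' convenience, a sketch of the interface these hooks rely
on (fsio_throttle(), as_to_bdev() and bdev_to_dev() come from the
fsio-throttle patches earlier in this series; the prototype and
semantics below are inferred from the call sites in this patch, not
quoted from include/linux/fsio-throttle.h):

	/*
	 * Sketch only, inferred from the call sites above -- not part of
	 * this patch.
	 *
	 * Charge @bytes of I/O on device @dev to the current task's
	 * cgroup and, if the cgroup exceeds its configured limit, sleep
	 * in @state (TASK_INTERRUPTIBLE, TASK_KILLABLE, ...) for the
	 * computed time.  With @state == 0 the I/O is only accounted and
	 * no sleep is enforced; the return value is the sleep time that
	 * would have been imposed, which is what lets the IOCB_NOWAIT
	 * path above fail fast instead of blocking:
	 *
	 *	sleep = fsio_throttle(bdev_to_dev(bdev), 0, 0);
	 *	if (sleep)
	 *		return -EAGAIN;
	 */
	unsigned long long fsio_throttle(dev_t dev, ssize_t bytes, int state);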