On Tue, 2012-12-25 at 02:22 +0000, Eric Wong wrote:

Please add a changelog.

> Using fadvise with POSIX_FADV_WILLNEED can be very slow and cause
> user-visible latency.  This hurts interactivity and encourages
> userspace to resort to background threads for readahead (or to avoid
> POSIX_FADV_WILLNEED entirely).
>
> "strace -T" timing on an uncached, one gigabyte file:
>
> Before: fadvise64(3, 0, 0, POSIX_FADV_WILLNEED) = 0 <2.484832>
>  After: fadvise64(3, 0, 0, POSIX_FADV_WILLNEED) = 0 <0.000061>
>
> For a smaller 9.8M request, there is still a significant improvement:
>
> Before: fadvise64(3, 0, 10223108, POSIX_FADV_WILLNEED) = 0 <0.005399>
>  After: fadvise64(3, 0, 10223108, POSIX_FADV_WILLNEED) = 0 <0.000059>
>
> Even with a small 1M request, there is an improvement:
>
> Before: fadvise64(3, 0, 1048576, POSIX_FADV_WILLNEED) = 0 <0.000474>
>  After: fadvise64(3, 0, 1048576, POSIX_FADV_WILLNEED) = 0 <0.000063>
>
> While userspace can mimic the effect of this commit by using a
> background thread to perform readahead(), this allows for simpler
> userspace code.
>
> To mitigate denial-of-service attacks, inflight (but incomplete)
> readahead requests are accounted for when new readahead requests
> arrive.  New readahead requests may be reduced or ignored if there
> are too many inflight readahead pages in the workqueue.
>
> IO priority is also taken into account for workqueue readahead.
> Normal and idle priority tasks share a concurrency-limited workqueue
> to prevent excessive readahead requests from taking place
> simultaneously.  This normal workqueue is concurrency-limited to one
> task per CPU (like AIO).
>
> Real-time I/O tasks get their own high-priority workqueue independent
> of the normal workqueue.
>
> The impact of idle tasks is also reduced; they are more likely to
> have advisory readahead requests ignored/dropped when read congestion
> occurs.
>
> Cc: Alan Cox <alan@xxxxxxxxxxxxxxxxxxx>
> Cc: Dave Chinner <david@xxxxxxxxxxxxx>
> Cc: Zheng Liu <gnehzuil.liu@xxxxxxxxx>
> Signed-off-by: Eric Wong <normalperson@xxxxxxxx>
> ---
> I have not tested on NUMA (since I've no access to NUMA hardware)
> and do not know how the use of the workqueue affects RA performance.
> I'm only using WQ_UNBOUND on non-NUMA, though.
>
> I'm halfway tempted to make DONTNEED use a workqueue, too.  Having
> perceptible latency on advisory syscalls is unpleasant, and keeping
> the latency makes little sense if we can hide it.
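
As an aside for anyone wanting to reproduce the numbers above outside of
strace: the request being timed boils down to a single posix_fadvise()
call, e.g. the toy program below.  This is illustrative only and not part
of the patch; the file path is a placeholder, and older glibc needs -lrt
for clock_gettime().

/* willneed.c: hypothetical test program, not part of the patch */
#define _POSIX_C_SOURCE 200112L
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

int main(void)
{
        struct timespec a, b;
        int err;
        int fd = open("/path/to/uncached-file", O_RDONLY); /* placeholder */

        if (fd < 0) {
                perror("open");
                return 1;
        }

        clock_gettime(CLOCK_MONOTONIC, &a);
        err = posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);
        clock_gettime(CLOCK_MONOTONIC, &b);

        /* posix_fadvise() returns the error number directly, not -1/errno */
        if (err)
                fprintf(stderr, "posix_fadvise: %s\n", strerror(err));

        printf("POSIX_FADV_WILLNEED took %.6f seconds\n",
               (b.tv_sec - a.tv_sec) + (b.tv_nsec - a.tv_nsec) / 1e9);
        return 0;
}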
>
>  include/linux/mm.h |   3 +
>  mm/fadvise.c       |  10 +--
>  mm/readahead.c     | 217 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 224 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 6320407..90b361c 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1536,6 +1536,9 @@ void task_dirty_inc(struct task_struct *tsk);
>  #define VM_MAX_READAHEAD        128     /* kbytes */
>  #define VM_MIN_READAHEAD        16      /* kbytes (includes current page) */
>
> +void wq_page_cache_readahead(struct address_space *mapping, struct file *filp,
> +                        pgoff_t offset, unsigned long nr_to_read);
> +
>  int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
>                          pgoff_t offset, unsigned long nr_to_read);
>
> diff --git a/mm/fadvise.c b/mm/fadvise.c
> index a47f0f5..cf3bd4c 100644
> --- a/mm/fadvise.c
> +++ b/mm/fadvise.c
> @@ -102,12 +102,10 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
>                  if (!nrpages)
>                          nrpages = ~0UL;
>
> -                /*
> -                 * Ignore return value because fadvise() shall return
> -                 * success even if filesystem can't retrieve a hint,
> -                 */
> -                force_page_cache_readahead(mapping, f.file, start_index,
> -                                           nrpages);
> +                get_file(f.file); /* fput() is called by workqueue */
> +
> +                /* queue up the request, don't care if it fails */
> +                wq_page_cache_readahead(mapping, f.file, start_index, nrpages);
>                  break;
>          case POSIX_FADV_NOREUSE:
>                  break;
> diff --git a/mm/readahead.c b/mm/readahead.c
> index 7963f23..f9e0705 100644
> --- a/mm/readahead.c
> +++ b/mm/readahead.c
> @@ -19,6 +19,45 @@
>  #include <linux/pagemap.h>
>  #include <linux/syscalls.h>
>  #include <linux/file.h>
> +#include <linux/workqueue.h>
> +#include <linux/ioprio.h>
> +
> +static struct workqueue_struct *ra_be __read_mostly;
> +static struct workqueue_struct *ra_rt __read_mostly;
> +static unsigned long ra_nr_queued;
> +static DEFINE_SPINLOCK(ra_nr_queued_lock);
> +
> +struct wq_ra_req {
> +        struct work_struct work;
> +        struct address_space *mapping;
> +        struct file *file;
> +        pgoff_t offset;
> +        unsigned long nr_to_read;
> +        int ioprio;
> +};
> +
> +static void wq_ra_enqueue(struct wq_ra_req *);
> +
> +/* keep NUMA readahead on the same CPU for now... */
> +#ifdef CONFIG_NUMA
> +# define RA_WQ_FLAGS 0
> +#else
> +# define RA_WQ_FLAGS WQ_UNBOUND
> +#endif
> +
> +static int __init init_readahead(void)
> +{
> +        /* let tasks with real-time priorities run freely */
> +        ra_rt = alloc_workqueue("readahead_rt", RA_WQ_FLAGS|WQ_HIGHPRI, 0);
> +
> +        /* limit async concurrency of normal and idle readahead */
> +        ra_be = alloc_workqueue("readahead_be", RA_WQ_FLAGS, 1);
> +
> +        BUG_ON(!ra_be || !ra_rt);
> +        return 0;
> +}
> +
> +early_initcall(init_readahead);
>
>  /*
>   * Initialise a struct file's readahead state.  Assumes that the caller has
> @@ -205,6 +244,183 @@ out:
>  }
>
>  /*
> + * if nr_to_read is too large, adjusts nr_to_read to the maximum sane value.
> + * atomically increments ra_nr_queued by nr_to_read if possible
> + * returns the number of pages queued (zero is possible)
> + */
> +static unsigned long ra_queue_begin(struct address_space *mapping,
> +                                unsigned long nr_to_read)
> +{
> +        unsigned long flags;
> +        unsigned long nr_isize, max;
> +        loff_t isize;
> +
> +        /* do not attempt readahead pages beyond current inode size */
> +        isize = i_size_read(mapping->host);
> +        if (isize == 0)
> +                return 0;
> +        nr_isize = (isize >> PAGE_CACHE_SHIFT) + 1;
> +        nr_to_read = min(nr_to_read, nr_isize);
> +
> +        /* check if we can do readahead at all */
> +        max = max_sane_readahead(~0UL);
> +        nr_to_read = min(nr_to_read, max);
> +        if (nr_to_read == 0)
> +                return 0;
> +
> +        /* check if we queued up too much readahead */
> +        spin_lock_irqsave(&ra_nr_queued_lock, flags);
> +
> +        if (ra_nr_queued >= max) {
> +                /* too much queued, do not queue more */
> +                nr_to_read = 0;
> +        } else {
> +                /* trim to reflect maximum amount possible */
> +                if ((nr_to_read + ra_nr_queued) > max)
> +                        nr_to_read = max - ra_nr_queued;
> +
> +                ra_nr_queued += nr_to_read;
> +        }
> +
> +        spin_unlock_irqrestore(&ra_nr_queued_lock, flags);
> +
> +        return nr_to_read;
> +}
> +
> +/*
> + * atomically decrements ra_nr_queued by nr_pages when a part of the
> + * readahead request is done (or aborted)
> + */
> +static void ra_queue_complete(unsigned long nr_pages)
> +{
> +        unsigned long flags;
> +
> +        spin_lock_irqsave(&ra_nr_queued_lock, flags);
> +        ra_nr_queued -= nr_pages;
> +        spin_unlock_irqrestore(&ra_nr_queued_lock, flags);
> +}
> +
> +/*
> + * Read a chunk of the read-ahead request, this will re-enqueue work.
> + * Use 2 megabyte units per chunk to avoid pinning too much memory at once.
> + */
> +static void wq_ra_req_fn(struct work_struct *work)
> +{
> +        unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;
> +        struct wq_ra_req *req = container_of(work, struct wq_ra_req, work);
> +        int ret;
> +        int old_prio, tmp_prio;
> +        struct task_struct *p = current;
> +
> +        /* limit the impact of idle tasks */
> +        if (IOPRIO_PRIO_CLASS(req->ioprio) == IOPRIO_CLASS_IDLE) {
> +                /* drop requests for idle tasks if there is congestion */
> +                if (bdi_read_congested(req->mapping->backing_dev_info))
> +                        goto done;
> +
> +                /* smaller chunk size gives priority to others */
> +                this_chunk /= 8;
> +
> +                /*
> +                 * setting IOPRIO_CLASS_IDLE may stall everything else,
> +                 * use best-effort instead
> +                 */
> +                tmp_prio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 7);
> +        } else {
> +                tmp_prio = req->ioprio;
> +        }
> +
> +        if (this_chunk > req->nr_to_read)
> +                this_chunk = req->nr_to_read;
> +
> +        /* stop the async readahead if we cannot proceed */
> +        this_chunk = max_sane_readahead(this_chunk);
> +        if (this_chunk == 0)
> +                goto done;
> +
> +        /* temporarily change our IO prio to that of the originating task */
> +        old_prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p), task_nice_ioprio(p));
> +        set_task_ioprio(p, tmp_prio);
> +        ret = __do_page_cache_readahead(req->mapping, req->file,
> +                                        req->offset, this_chunk, 0);
> +        set_task_ioprio(p, old_prio);
> +
> +        /* requeue if readahead was successful and there is more to queue */
> +        if (ret >= 0 && req->nr_to_read > this_chunk) {
> +                req->offset += this_chunk;
> +                req->nr_to_read -= this_chunk;
> +                ra_queue_complete(this_chunk);
> +
> +                /* keep going, but yield to other requests */
> +                wq_ra_enqueue(req);
> +        } else {
> +done:
> +                ra_queue_complete(req->nr_to_read);
> +                fput(req->file);
> +                kfree(req);
> +        }
> +}
> +
> +static void wq_ra_enqueue(struct wq_ra_req *req)
> +{
> +        INIT_WORK(&req->work, wq_ra_req_fn);
> +
> +        if (IOPRIO_PRIO_CLASS(req->ioprio) == IOPRIO_CLASS_RT)
> +                queue_work(ra_rt, &req->work);
> +        else
> +                queue_work(ra_be, &req->work);
> +}
> +
> +/*
> + * Fire-and-forget readahead using a workqueue, this allocates pages
> + * inside a workqueue and returns as soon as possible.
> + */
> +void wq_page_cache_readahead(struct address_space *mapping, struct file *filp,
> +                        pgoff_t offset, unsigned long nr_to_read)
> +{
> +        struct wq_ra_req *req;
> +        int ioprio;
> +        struct task_struct *p;
> +
> +        if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
> +                goto skip_ra;
> +
> +        nr_to_read = ra_queue_begin(mapping, nr_to_read);
> +        if (!nr_to_read)
> +                goto skip_ra;
> +
> +        p = current;
> +        if (p->io_context)
> +                ioprio = p->io_context->ioprio;
> +        else
> +                ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
> +                                        task_nice_ioprio(p));
> +
> +        /* drop requests for idle tasks if there is congestion */
> +        if (IOPRIO_PRIO_CLASS(ioprio) == IOPRIO_CLASS_IDLE
> +            && bdi_read_congested(mapping->backing_dev_info))
> +                goto skip_ra_done;
> +
> +        req = kzalloc(sizeof(*req), GFP_KERNEL);
> +        if (!req)
> +                goto skip_ra_done;
> +
> +        /* offload to a workqueue and return to caller ASAP */
> +        req->mapping = mapping;
> +        req->file = filp;
> +        req->offset = offset;
> +        req->nr_to_read = nr_to_read;
> +        req->ioprio = ioprio;
> +        wq_ra_enqueue(req);
> +
> +        return;
> +skip_ra_done:
> +        ra_queue_complete(nr_to_read);
> +skip_ra:
> +        fput(filp);
> +}
> +
> +/*
>   * Chunk the readahead into 2 megabyte units, so that we don't pin too much
>   * memory at once.
>   */
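
For comparison, the background-thread workaround the changelog alludes to
looks roughly like the sketch below.  This is only my illustration of the
userspace boilerplate the patch makes unnecessary; the helper name
readahead_in_background() and the error handling are invented here, not
taken from the patch.  Build with -pthread.

/* Hypothetical userspace helper, not from the patch */
#define _GNU_SOURCE             /* readahead(2) is Linux-specific */
#include <fcntl.h>
#include <pthread.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

struct ra_args {
        int fd;                 /* private dup of the caller's descriptor */
        off64_t offset;
        size_t count;
};

static void *ra_thread(void *p)
{
        struct ra_args *a = p;

        readahead(a->fd, a->offset, a->count);  /* advisory; ignore errors */
        close(a->fd);
        free(a);
        return NULL;
}

/* queue readahead in a detached thread so the caller never blocks */
int readahead_in_background(int fd, off64_t offset, size_t count)
{
        pthread_t tid;
        struct ra_args *a = malloc(sizeof(*a));

        if (!a)
                return -1;
        a->fd = dup(fd);        /* caller remains free to close its fd */
        a->offset = offset;
        a->count = count;
        if (a->fd < 0 || pthread_create(&tid, NULL, ra_thread, a)) {
                if (a->fd >= 0)
                        close(a->fd);
                free(a);
                return -1;
        }
        pthread_detach(tid);
        return 0;
}

Even this minimal version needs a dup()ed descriptor, a heap allocation
and a thread per request, which is exactly the kind of complexity the
"simpler userspace code" argument in the changelog is about.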