After looking at v2 yesterday I noticed a few things in the structure that I really didn't like: - using a struct page * return value just to communicate status codes - the extremely long function names - a generally somewhat non-intuitive split of the helpers I then hacked on top of it for a bit while sitting in a telephone conference. Below is my result, which passes a quick xfstests run. Note that this includes the refactoring and the batch lookup changes as I did it on top of your series: diff --git a/mm/filemap.c b/mm/filemap.c index f0ae9a6308cb4d..9e0aecd99950f4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1972,273 +1972,360 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra) ra->ra_pages /= 4; } -/** - * generic_file_buffered_read - generic file read routine - * @iocb: the iocb to read - * @iter: data destination - * @written: already copied - * - * This is a generic file read routine, and uses the - * mapping->a_ops->readpage() function for the actual low-level stuff. - * - * This is really ugly. But the goto's actually try to clarify some - * of the logic when it comes to error handling etc. 
- * - * Return: - * * total number of bytes copied, including those the were already @written - * * negative error code if nothing was copied - */ -ssize_t generic_file_buffered_read(struct kiocb *iocb, - struct iov_iter *iter, ssize_t written) +static inline pgoff_t filemap_last_index(struct kiocb *iocb, + struct iov_iter *iter) { - struct file *filp = iocb->ki_filp; - struct address_space *mapping = filp->f_mapping; - struct inode *inode = mapping->host; - struct file_ra_state *ra = &filp->f_ra; - loff_t *ppos = &iocb->ki_pos; - pgoff_t index; - pgoff_t last_index; - pgoff_t prev_index; - unsigned long offset; /* offset into pagecache page */ - unsigned int prev_offset; - int error = 0; - - if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) - return 0; - iov_iter_truncate(iter, inode->i_sb->s_maxbytes); - - index = *ppos >> PAGE_SHIFT; - prev_index = ra->prev_pos >> PAGE_SHIFT; - prev_offset = ra->prev_pos & (PAGE_SIZE-1); - last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; - offset = *ppos & ~PAGE_MASK; + return (iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT; +} - for (;;) { - struct page *page; - pgoff_t end_index; - loff_t isize; - unsigned long nr, ret; +static inline unsigned long filemap_nr_pages(struct kiocb *iocb, + struct iov_iter *iter) +{ + return filemap_last_index(iocb, iter) - (iocb->ki_pos >> PAGE_SHIFT); +} - cond_resched(); -find_page: - if (fatal_signal_pending(current)) { - error = -EINTR; - goto out; - } +static int __filemap_read_not_uptodate(struct file *file, struct page *page) +{ + int error; - page = find_get_page(mapping, index); - if (!page) { - if (iocb->ki_flags & IOCB_NOWAIT) - goto would_block; - page_cache_sync_readahead(mapping, - ra, filp, - index, last_index - index); - page = find_get_page(mapping, index); - if (unlikely(page == NULL)) - goto no_cached_page; - } - if (PageReadahead(page)) { - page_cache_async_readahead(mapping, - ra, filp, page, - index, last_index - index); - } - if (!PageUptodate(page)) 
{ - if (iocb->ki_flags & IOCB_NOWAIT) { - put_page(page); - goto would_block; - } + error = lock_page_killable(page); + if (error) + return error; + if (!PageUptodate(page)) { + if (!page->mapping) { /* - * See comment in do_read_cache_page on why - * wait_on_page_locked is used to avoid unnecessarily - * serialisations and why it's safe. + * invalidate_mapping_pages got it */ - error = wait_on_page_locked_killable(page); - if (unlikely(error)) - goto readpage_error; - if (PageUptodate(page)) - goto page_ok; - - if (inode->i_blkbits == PAGE_SHIFT || - !mapping->a_ops->is_partially_uptodate) - goto page_not_up_to_date; - /* pipes can't handle partially uptodate pages */ - if (unlikely(iov_iter_is_pipe(iter))) - goto page_not_up_to_date; - if (!trylock_page(page)) - goto page_not_up_to_date; - /* Did it get truncated before we got the lock? */ - if (!page->mapping) - goto page_not_up_to_date_locked; - if (!mapping->a_ops->is_partially_uptodate(page, - offset, iter->count)) - goto page_not_up_to_date_locked; - unlock_page(page); + error = AOP_TRUNCATED_PAGE; + } else { + error = -EIO; } -page_ok: - /* - * i_size must be checked after we know the page is Uptodate. - * - * Checking i_size after the check allows us to calculate - * the correct value for "nr", which means the zero-filled - * part of the page is not copied back to userspace (unless - * another truncate extends the file - this is desired though). 
- */ + } - isize = i_size_read(inode); - end_index = (isize - 1) >> PAGE_SHIFT; - if (unlikely(!isize || index > end_index)) { - put_page(page); - goto out; - } + unlock_page(page); + if (error == -EIO) + shrink_readahead_size_eio(&file->f_ra); + return error; +} - /* nr is the maximum number of bytes to copy from this page */ - nr = PAGE_SIZE; - if (index == end_index) { - nr = ((isize - 1) & ~PAGE_MASK) + 1; - if (nr <= offset) { - put_page(page); - goto out; - } - } - nr = nr - offset; +static int __filemap_read_one_page(struct file *file, struct page *page) +{ + int error; - /* If users can be writing to this page using arbitrary - * virtual addresses, take care about potential aliasing - * before reading the page on the kernel side. - */ - if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + /* + * A previous I/O error may have been due to temporary failures, e.g. + * multipath errors. PG_error will be set again if readpage fails. + */ + ClearPageError(page); - /* - * When a sequential read accesses a page several times, - * only mark it as accessed the first time. - */ - if (prev_index != index || offset != prev_offset) - mark_page_accessed(page); - prev_index = index; + /* + * Start the actual read. The read will unlock the page. + */ + error = file->f_mapping->a_ops->readpage(file, page); + if (!error && !PageUptodate(page)) + return __filemap_read_not_uptodate(file, page); + return error; +} - /* - * Ok, we have the page, and it's up-to-date, so - * now we can copy it to user space... 
- */ +static int filemap_read_one_page(struct kiocb *iocb, struct iov_iter *iter, + struct page *page, pgoff_t index) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + loff_t pos = max(iocb->ki_pos, (loff_t)index << PAGE_SHIFT); + loff_t count = iocb->ki_pos + iter->count - pos; + int error; - ret = copy_page_to_iter(page, offset, nr, iter); - offset += ret; - index += offset >> PAGE_SHIFT; - offset &= ~PAGE_MASK; - prev_offset = offset; + /* + * See comment in do_read_cache_page on why wait_on_page_locked is used + * to avoid unnecessarily serialisations and why it's safe. + */ + error = wait_on_page_locked_killable(page); + if (unlikely(error)) + goto out_put_page; - put_page(page); - written += ret; - if (!iov_iter_count(iter)) - goto out; - if (ret < nr) { - error = -EFAULT; - goto out; - } - continue; + if (PageUptodate(page)) + return 0; + + if (mapping->host->i_blkbits == PAGE_SHIFT || + !mapping->a_ops->is_partially_uptodate) + goto page_not_up_to_date; + /* pipes can't handle partially uptodate pages */ + if (unlikely(iov_iter_is_pipe(iter))) + goto page_not_up_to_date; + if (!trylock_page(page)) + goto page_not_up_to_date; + /* Did it get truncated before we got the lock? */ + if (!page->mapping) + goto page_not_up_to_date_locked; + if (!mapping->a_ops->is_partially_uptodate(page, pos & ~PAGE_MASK, + count)) + goto page_not_up_to_date_locked; +done: + unlock_page(page); + return 0; page_not_up_to_date: - /* Get exclusive access to the page ... */ - error = lock_page_killable(page); - if (unlikely(error)) - goto readpage_error; + /* Get exclusive access to the page ... */ + error = lock_page_killable(page); + if (unlikely(error)) + goto out_put_page; page_not_up_to_date_locked: - /* Did it get truncated before we got the lock? */ - if (!page->mapping) { - unlock_page(page); - put_page(page); + /* Did it get truncated before we got the lock? 
*/ + if (!page->mapping) { + error = AOP_TRUNCATED_PAGE; + unlock_page(page); + goto out_put_page; + } + + /* Did somebody else fill it already? */ + if (PageUptodate(page)) + goto done; + + error = __filemap_read_one_page(file, page); + if (error) + goto out_put_page; + return 0; + +out_put_page: + put_page(page); + return error; +} + +static int filemap_read_get_pages(struct kiocb *iocb, struct iov_iter *iter, + struct page **pages, unsigned long max_pages) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + unsigned long nr = min(filemap_nr_pages(iocb, iter), max_pages); + pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; + int ret; + + if (fatal_signal_pending(current)) + return -EINTR; + + ret = find_get_pages_contig(mapping, index, nr, pages); + if (ret) + return ret; + + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + + page_cache_sync_readahead(mapping, &file->f_ra, file, index, + filemap_nr_pages(iocb, iter)); + + ret = find_get_pages_contig(mapping, index, nr, pages); + if (ret) + return ret; + + /* + * Ok, it wasn't cached, so we need to create a new page.. 
+ */ + pages[0] = page_cache_alloc(mapping); + if (!pages[0]) + return -ENOMEM; + + ret = add_to_page_cache_lru(pages[0], mapping, index, + mapping_gfp_constraint(mapping, GFP_KERNEL)); + if (ret) { + if (ret == -EEXIST) + ret = 0; + goto out_put_page; + } + + ret = __filemap_read_one_page(file, pages[0]); + if (ret) { + if (ret == AOP_TRUNCATED_PAGE) + ret = 0; + goto out_put_page; + } + + return 1; + +out_put_page: + put_page(pages[0]); + return ret; +} + +static int filemap_read_pages(struct kiocb *iocb, struct iov_iter *iter, + struct page **pages, unsigned int nr_pages) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + pgoff_t last_index = filemap_last_index(iocb, iter); + pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; + unsigned int cur_page, j; + int err = 0; + + for (cur_page = 0; cur_page < nr_pages; cur_page++, index++) { + struct page *page = pages[cur_page]; + + if (PageReadahead(page)) + page_cache_async_readahead(mapping, &file->f_ra, file, + page, index, last_index - index); + + if (PageUptodate(page)) continue; - } - /* Did somebody else fill it already? */ - if (PageUptodate(page)) { - unlock_page(page); - goto page_ok; + if (iocb->ki_flags & IOCB_NOWAIT) { + put_page(page); + err = -EAGAIN; + goto out_put_pages; } -readpage: - /* - * A previous I/O error may have been due to temporary - * failures, eg. multipath errors. - * PG_error will be set again if readpage fails. - */ - ClearPageError(page); - /* Start the actual read. The read will unlock the page. 
*/ - error = mapping->a_ops->readpage(filp, page); + err = filemap_read_one_page(iocb, iter, page, index); + if (err) + goto out_put_pages; + } - if (unlikely(error)) { - if (error == AOP_TRUNCATED_PAGE) { - put_page(page); - error = 0; - goto find_page; - } - goto readpage_error; - } + return cur_page; - if (!PageUptodate(page)) { - error = lock_page_killable(page); - if (unlikely(error)) - goto readpage_error; - if (!PageUptodate(page)) { - if (page->mapping == NULL) { - /* - * invalidate_mapping_pages got it - */ - unlock_page(page); - put_page(page); - goto find_page; - } - unlock_page(page); - shrink_readahead_size_eio(ra); - error = -EIO; - goto readpage_error; - } - unlock_page(page); - } +out_put_pages: + for (j = cur_page + 1; j < nr_pages; j++) + put_page(pages[j]); + if (cur_page == 0) { + if (err == AOP_TRUNCATED_PAGE) + err = 0; + return err; + } + return cur_page; +} - goto page_ok; +static int filemap_do_read(struct kiocb *iocb, struct iov_iter *iter, + struct page **pages, unsigned long max_pages) +{ + struct file *filp = iocb->ki_filp; + struct address_space *mapping = filp->f_mapping; + struct file_ra_state *ra = &filp->f_ra; + loff_t isize, end_offset; + int nr_pages, i; + bool writably_mapped; -readpage_error: - /* UHHUH! A synchronous read error occurred. Report it */ - put_page(page); - goto out; + cond_resched(); + +was_truncated: + nr_pages = filemap_read_get_pages(iocb, iter, pages, max_pages); + if (nr_pages > 0) + nr_pages = filemap_read_pages(iocb, iter, pages, nr_pages); + if (nr_pages == 0) + goto was_truncated; + if (nr_pages < 0) + return nr_pages; + + /* + * i_size must be checked after we know the pages are Uptodate. + * + * Checking i_size after the check allows us to calculate the correct + * value for "nr_pages", which means the zero-filled part of the page is + * not copied back to userspace (unless another truncate extends the + * file - this is desired though). 
+ */ + isize = i_size_read(mapping->host); + if (unlikely(iocb->ki_pos >= isize)) + goto put_pages; + + end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); + + while ((iocb->ki_pos >> PAGE_SHIFT) + nr_pages > + (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT) + put_page(pages[--nr_pages]); + + /* + * Once we start copying data, we don't want to be touching any + * cachelines that might be contended: + */ + writably_mapped = mapping_writably_mapped(mapping); + + /* + * When a sequential read accesses a page several times, only mark it as + * accessed the first time. + */ + if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT) + mark_page_accessed(pages[0]); + for (i = 1; i < nr_pages; i++) + mark_page_accessed(pages[i]); + + for (i = 0; i < nr_pages; i++) { + unsigned offset = iocb->ki_pos & ~PAGE_MASK; + unsigned bytes = min_t(loff_t, end_offset - iocb->ki_pos, + PAGE_SIZE - offset); + unsigned copied; -no_cached_page: /* - * Ok, it wasn't cached, so we need to create a new - * page.. + * If users can be writing to this page using arbitrary virtual + * addresses, take care about potential aliasing before reading + * the page on the kernel side. 
 */ - page = page_cache_alloc(mapping); - if (!page) { - error = -ENOMEM; - goto out; - } - error = add_to_page_cache_lru(page, mapping, index, - mapping_gfp_constraint(mapping, GFP_KERNEL)); - if (error) { - put_page(page); - if (error == -EEXIST) { - error = 0; - goto find_page; - } - goto out; - } - goto readpage; + if (writably_mapped) + flush_dcache_page(pages[i]); + + copied = copy_page_to_iter(pages[i], offset, bytes, iter); + + iocb->ki_pos += copied; + ra->prev_pos = iocb->ki_pos; + + if (copied < bytes) + return -EFAULT; } -would_block: - error = -EAGAIN; -out: - ra->prev_pos = prev_index; - ra->prev_pos <<= PAGE_SHIFT; - ra->prev_pos |= prev_offset; +put_pages: + for (i = 0; i < nr_pages; i++) + put_page(pages[i]); + return 0; +} + +/** + * generic_file_buffered_read - generic file read routine + * @iocb: the iocb to read + * @iter: data destination + * @written: already copied + * + * This is a generic file read routine, and uses the + * mapping->a_ops->readpage() function for the actual low-level stuff. + * + * Return: + * * total number of bytes copied, including those that were already @written + * * negative error code if nothing was copied + */ +ssize_t generic_file_buffered_read(struct kiocb *iocb, + struct iov_iter *iter, ssize_t written) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + size_t orig_count = iov_iter_count(iter); + struct inode *inode = mapping->host; + struct page *page_array[8], **pages = NULL; + unsigned max_pages = filemap_nr_pages(iocb, iter); + int error; - *ppos = ((loff_t)index << PAGE_SHIFT) + offset; - file_accessed(filp); - return written ? 
written : error; + if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) + return 0; + iov_iter_truncate(iter, inode->i_sb->s_maxbytes); + + if (max_pages > ARRAY_SIZE(page_array)) + pages = kmalloc_array(max_pages, sizeof(void *), GFP_KERNEL); + + if (!pages) { + pages = page_array; + max_pages = ARRAY_SIZE(page_array); + } + + do { + error = filemap_do_read(iocb, iter, pages, max_pages); + if (error) + break; + } while (iov_iter_count(iter) && iocb->ki_pos < i_size_read(inode)); + + file_accessed(iocb->ki_filp); + written += orig_count - iov_iter_count(iter); + + if (pages != page_array) + kfree(pages); + + if (unlikely(!written)) + return error; + return written; } EXPORT_SYMBOL_GPL(generic_file_buffered_read);