Hello everyone, We're still hammering out O_DIRECT code for btrfs that doesn't go through the page cache. Until things are done, it helps to have O_DIRECT that goes through the page cache and just immediately invalidates. It doesn't give us AIO, but it is still useful for some workloads. Btrfs has had O_DIRECT writes for a while by invalidating the cache in its own file_write. Reads are harder because filemap.c doesn't quite export enough. Rather than export things and make a btrfs read, I've made a generic cache-only read function that includes O_DIRECT invalidation. What does everyone think of adding something like this: -chris diff --git a/include/linux/fs.h b/include/linux/fs.h index 2adaa25..9ee97a9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2203,6 +2203,9 @@ extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); +extern ssize_t generic_file_cached_read(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos); extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t *); extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); diff --git a/mm/filemap.c b/mm/filemap.c index ef169f3..506d769 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1269,6 +1269,85 @@ int generic_segment_checks(const struct iovec *iov, EXPORT_SYMBOL(generic_segment_checks); /** + * generic_file_cached_read - generic filesystem read routine, no O_DIRECT + * is done + * @iocb: kernel I/O control block + * @iov: io vector request + * @nr_segs: number of segments in the iovec + * @pos: current file position + * + * This is the "read()" routine for 
all filesystems + * that can use the page cache directly, but do not support + * O_DIRECT. The O_DIRECT part is emulated with page cache invalidation + */ +ssize_t +generic_file_cached_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *filp = iocb->ki_filp; + ssize_t retval; + unsigned long seg; + size_t count; + loff_t *ppos = &iocb->ki_pos; + struct address_space *mapping; + struct inode *inode; + loff_t size; + size_t iov_len = 0; + + count = 0; + retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); + if (retval) + return retval; + + mapping = filp->f_mapping; + inode = mapping->host; + size = i_size_read(inode); + + /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ + if (filp->f_flags & O_DIRECT) { + if (!count) + goto out; /* skip atime */ + + iov_len = iov_length(iov, nr_segs); + + if (pos < size) { + retval = filemap_write_and_wait_range(mapping, pos, + pos + iov_len - 1); + if (retval) + goto out; + } + } + + for (seg = 0; seg < nr_segs; seg++) { + read_descriptor_t desc; + + desc.written = 0; + desc.arg.buf = iov[seg].iov_base; + desc.count = iov[seg].iov_len; + if (desc.count == 0) + continue; + desc.error = 0; + do_generic_file_read(filp, ppos, &desc, file_read_actor); + retval += desc.written; + if (desc.error) { + retval = retval ?: desc.error; + break; + } + if (desc.count > 0) + break; + } + if ((filp->f_flags & O_DIRECT) && pos < size) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, + (pos + iov_len - 1) >> PAGE_CACHE_SHIFT); + } + file_accessed(filp); +out: + return retval; +} +EXPORT_SYMBOL(generic_file_cached_read); + +/** * generic_file_aio_read - generic filesystem read routine * @iocb: kernel I/O control block * @iov: io vector request -- 1.6.4.1 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html