This adds a structure and interface to represent the segments of memory
which are acting as the source or destination for a read or write
operation.  Callers would fill this structure and then pass it down the
rw path.

The intent is to let stages in the rw path make specific calls against
this API and structure instead of working with, say, struct iovec
natively.

The main intent of this is to enable kernel calls into the rw path which
specify memory with page/offset/len tuples.

Another potential benefit of this is the reduction in iterations over
iovecs at various points in the kernel.  Each iov_length(iov) call, for
example, could be translated into rwm->total_bytes.  O_DIRECT's check of
memory alignment is changed into a single test against
rwm->boundary_bits.

I imagine this might integrate well with the iov_iter interface, though
I haven't examined that in any depth.
---
 fs/Makefile           |    2 +-
 fs/rwmem.c            |  113 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/rwmem.h |   38 ++++++++++++++++
 3 files changed, 152 insertions(+), 1 deletions(-)
 create mode 100644 fs/rwmem.c
 create mode 100644 include/linux/rwmem.h

diff --git a/fs/Makefile b/fs/Makefile
index 500cf15..c342365 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o drop_caches.o splice.o sync.o utimes.o \
-		stack.o
+		stack.o rwmem.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/rwmem.c b/fs/rwmem.c
new file mode 100644
index 0000000..0433ba4
--- /dev/null
+++ b/fs/rwmem.c
@@ -0,0 +1,113 @@
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/uio.h>
+#include <linux/rwmem.h>
+
+/*
+ * Returns the number of pages touched by the byte range that starts at
+ * 'addr' and is 'bytes' long.
+ */
+static inline unsigned long pages_spanned(unsigned long addr,
+					  unsigned long bytes)
+{
+	return ((addr + bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) -
+		(addr >> PAGE_SHIFT);
+}
+
+/*
+ * Walks the iovec array once and caches the totals in the rwmem: the byte
+ * count, the number of pages spanned, and the OR of every base address and
+ * length so that alignment can be checked with a single mask test.
+ */
+void rwmem_iovec_init(struct rwmem *rwm)
+{
+	struct rwmem_iovec *rwi = container_of(rwm, struct rwmem_iovec, rwmem);
+	const struct iovec *iov;
+	unsigned long i;
+
+	rwm->total_bytes = 0;
+	rwm->nr_pages = 0;
+	rwm->boundary_bits = 0;
+
+	for (i = 0; i < rwm->nr_segs; i++) {
+		iov = &rwi->iov[i];
+
+		rwm->total_bytes += iov->iov_len;
+		rwm->nr_pages += pages_spanned((unsigned long)iov->iov_base,
+						iov->iov_len);
+		rwm->boundary_bits |= (unsigned long)iov->iov_base |
+				      (unsigned long)iov->iov_len;
+	}
+}
+
+/*
+ * Returns the offset of the start of a segment within its first page.
+ */
+unsigned long rwmem_iovec_seg_page_offset(struct rwmem *rwm, unsigned long i)
+{
+	struct rwmem_iovec *rwi = container_of(rwm, struct rwmem_iovec, rwmem);
+	BUG_ON(i >= rwm->nr_segs);
+	return (unsigned long)rwi->iov[i].iov_base & ~PAGE_MASK;
+}
+
+/*
+ * Returns the total bytes in the given segment.
+ */
+unsigned long rwmem_iovec_seg_bytes(struct rwmem *rwm, unsigned long i)
+{
+	struct rwmem_iovec *rwi = container_of(rwm, struct rwmem_iovec, rwmem);
+	BUG_ON(i >= rwm->nr_segs);
+	return rwi->iov[i].iov_len;
+}
+
+/*
+ * Pins up to 'max_pages' of the user pages backing segment 'i' and stores
+ * them in 'pages'.  '*cursor' carries the position between calls: a value
+ * of 0 starts at the beginning of the segment, and it is set to ~0 once the
+ * last page of the segment has been pinned.  Returns what get_user_pages()
+ * returned.
+ */
+int rwmem_iovec_get_seg_pages(struct rwmem *rwm, unsigned long i,
+			      unsigned long *cursor, struct page **pages,
+			      unsigned long max_pages, int write)
+{
+	struct rwmem_iovec *rwi = container_of(rwm, struct rwmem_iovec, rwmem);
+	const struct iovec *iov;
+	int ret;
+
+	BUG_ON(i >= rwm->nr_segs);
+	iov = &rwi->iov[i];
+
+	if (*cursor == 0)
+		*cursor = (unsigned long)iov->iov_base;
+
+	max_pages = min(pages_spanned(*cursor, iov->iov_len -
+			(*cursor - (unsigned long)iov->iov_base)),
+			max_pages);
+
+	down_read(&current->mm->mmap_sem);
+	ret = get_user_pages(current, current->mm, *cursor, max_pages, write,
+			     0, pages, NULL);
+	up_read(&current->mm->mmap_sem);
+
+	if (ret > 0) {
+		/*
+		 * Advance to the start of the first page that wasn't pinned.
+		 * Adding ret * PAGE_SIZE to an unaligned cursor could reach
+		 * the end of the segment while its tail still sits in an
+		 * unpinned page.
+		 */
+		*cursor = (*cursor & PAGE_MASK) + ret * PAGE_SIZE;
+		if (*cursor >= (unsigned long)iov->iov_base + iov->iov_len)
+			*cursor = ~0;
+	}
+
+	return ret;
+}
+
+struct rwmem_ops rwmem_iovec_ops = {
+	.init = rwmem_iovec_init,
+	.seg_page_offset = rwmem_iovec_seg_page_offset,
+	.seg_bytes = rwmem_iovec_seg_bytes,
+	.get_seg_pages = rwmem_iovec_get_seg_pages,
+};
diff --git a/include/linux/rwmem.h b/include/linux/rwmem.h
new file mode 100644
index 0000000..666f9f4
--- /dev/null
+++ b/include/linux/rwmem.h
@@ -0,0 +1,38 @@
+#ifndef _LINUX_RWMEM_H
+#define _LINUX_RWMEM_H
+
+#include <linux/types.h>
+
+struct iovec;
+struct page;
+struct rwmem_ops;
+
+/*
+ * Describes the memory behind a read or write.  rwmem_iovec_init() reads
+ * nr_segs and fills in total_bytes, boundary_bits and nr_pages.
+ */
+struct rwmem {
+	struct rwmem_ops *ops;
+	size_t total_bytes;
+	unsigned long boundary_bits;
+	unsigned long nr_pages;
+	unsigned short nr_segs;
+};
+
+struct rwmem_ops {
+	void (*init)(struct rwmem *rwm);
+	unsigned long (*seg_page_offset)(struct rwmem *rwm, unsigned long i);
+	unsigned long (*seg_bytes)(struct rwmem *rwm, unsigned long i);
+	int (*get_seg_pages)(struct rwmem *rwm, unsigned long i,
+			     unsigned long *cursor, struct page **pages,
+			     unsigned long max_pages, int write);
+};
+
+/* rwmem backed by an array of nr_segs user iovecs. */
+struct rwmem_iovec {
+	struct rwmem rwmem;
+	const struct iovec *iov;
+};
+extern struct rwmem_ops rwmem_iovec_ops;
+
+#endif
-- 
1.5.2.2

-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html