From: Martin Brandenburg <martin@xxxxxxxxxxxx> Attach the actual range of bytes written, plus the responsible uid/gid, to each dirty page. This information must be sent to the server when the page is written out. Now write_begin, page_mkwrite, and invalidatepage keep track of this information. There are several conditions under which they must write out the page immediately before the new range can be stored. Two non-contiguous ranges cannot be stored on a single page. Signed-off-by: Martin Brandenburg <martin@xxxxxxxxxxxx> Signed-off-by: Mike Marshall <hubcap@xxxxxxxxxxxx> --- fs/orangefs/file.c | 10 +- fs/orangefs/inode.c | 289 ++++++++++++++++++++++++++++++---- fs/orangefs/orangefs-kernel.h | 10 +- 3 files changed, 274 insertions(+), 35 deletions(-) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 26d8ff410b0a..f409ac5d3410 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -46,8 +46,8 @@ static int flush_racache(struct inode *inode) * Post and wait for the I/O upcall to finish */ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, - loff_t *offset, struct iov_iter *iter, - size_t total_size, loff_t readahead_size) + loff_t *offset, struct iov_iter *iter, size_t total_size, + loff_t readahead_size, struct orangefs_write_range *wr) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; @@ -85,6 +85,10 @@ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, new_op->upcall.req.io.buf_index = buffer_index; new_op->upcall.req.io.count = total_size; new_op->upcall.req.io.offset = *offset; + if (type == ORANGEFS_IO_WRITE && wr) { + new_op->upcall.uid = from_kuid(&init_user_ns, wr->uid); + new_op->upcall.gid = from_kgid(&init_user_ns, wr->gid); + } gossip_debug(GOSSIP_FILE_DEBUG, "%s(%pU): offset: %llu total_size: %zd\n", @@ -329,7 +333,7 @@ static vm_fault_t orangefs_fault(struct vm_fault *vmf) static const struct vm_operations_struct 
orangefs_file_vm_ops = { .fault = orangefs_fault, .map_pages = filemap_map_pages, - .page_mkwrite = filemap_page_mkwrite, + .page_mkwrite = orangefs_page_mkwrite, }; /* diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 1c72aa38317d..add9c569a7dc 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -15,9 +15,11 @@ #include "orangefs-kernel.h" #include "orangefs-bufmap.h" -static int orangefs_writepage(struct page *page, struct writeback_control *wbc) +static int orangefs_writepage_locked(struct page *page, + struct writeback_control *wbc) { struct inode *inode = page->mapping->host; + struct orangefs_write_range *wr = NULL; struct iov_iter iter; struct bio_vec bv; size_t len, wlen; @@ -26,34 +28,52 @@ static int orangefs_writepage(struct page *page, struct writeback_control *wbc) set_page_writeback(page); - off = page_offset(page); len = i_size_read(inode); - if (off > len) { - /* The file was truncated; there is nothing to write. */ - unlock_page(page); - end_page_writeback(page); - return 0; + if (PagePrivate(page)) { + wr = (struct orangefs_write_range *)page_private(page); + off = wr->pos; + if (off + wr->len > len) + wlen = len - off; + else + wlen = wr->len; + } else { + WARN_ON(1); + off = page_offset(page); + if (off + PAGE_SIZE > len) + wlen = len - off; + else + wlen = PAGE_SIZE; } - if (off + PAGE_SIZE > len) - wlen = len - off; - else - wlen = PAGE_SIZE; + /* Should've been handled in orangefs_invalidatepage. 
*/ + WARN_ON(off == len || off + wlen > len); bv.bv_page = page; bv.bv_len = wlen; bv.bv_offset = off % PAGE_SIZE; - if (wlen == 0) - dump_stack(); + WARN_ON(wlen == 0); iov_iter_bvec(&iter, WRITE, &bv, 1, wlen); ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen, - len); + len, wr); if (ret < 0) { SetPageError(page); mapping_set_error(page->mapping, ret); } else { ret = 0; } + if (wr) { + kfree(wr); + set_page_private(page, 0); + ClearPagePrivate(page); + put_page(page); + } + return ret; +} + +static int orangefs_writepage(struct page *page, struct writeback_control *wbc) +{ + int ret; + ret = orangefs_writepage_locked(page, wbc); unlock_page(page); end_page_writeback(page); return ret; @@ -74,7 +94,7 @@ static int orangefs_readpage(struct file *file, struct page *page) iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE); ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter, - PAGE_SIZE, inode->i_size); + PAGE_SIZE, inode->i_size, NULL); /* this will only zero remaining unread portions of the page data */ iov_iter_zero(~0U, &iter); /* takes care of potential aliasing */ @@ -92,6 +112,73 @@ static int orangefs_readpage(struct file *file, struct page *page) return ret; } +static int orangefs_launder_page(struct page *); + +static int orangefs_write_begin(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, struct page **pagep, + void **fsdata) +{ + struct orangefs_write_range *wr; + struct page *page; + pgoff_t index; + int ret; + + index = pos >> PAGE_SHIFT; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + + *pagep = page; + + if (PageDirty(page) && !PagePrivate(page)) { + /* + * Should be impossible. If it happens, launder the page + * since we don't know what's dirty. This will WARN in + * orangefs_writepage_locked. 
+ */ + ret = orangefs_launder_page(page); + if (ret) + return ret; + } + if (PagePrivate(page)) { + struct orangefs_write_range *wr; + wr = (struct orangefs_write_range *)page_private(page); + if (wr->pos + wr->len == pos && + uid_eq(wr->uid, current_fsuid()) && + gid_eq(wr->gid, current_fsgid())) { + wr->len += len; + goto okay; + } else { + ret = orangefs_launder_page(page); + if (ret) + return ret; + } + + } + + wr = kmalloc(sizeof *wr, GFP_KERNEL); + if (!wr) + return -ENOMEM; + + wr->pos = pos; + wr->len = len; + wr->uid = current_fsuid(); + wr->gid = current_fsgid(); + SetPagePrivate(page); + set_page_private(page, (unsigned long)wr); + get_page(page); +okay: + + if (!PageUptodate(page) && (len != PAGE_SIZE)) { + unsigned from = pos & (PAGE_SIZE - 1); + + zero_user_segments(page, 0, from, from + len, PAGE_SIZE); + } + return 0; +} + static int orangefs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { @@ -105,24 +192,96 @@ static void orangefs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { - gossip_debug(GOSSIP_INODE_DEBUG, - "orangefs_invalidatepage called on page %p " - "(offset is %u)\n", - page, - offset); + struct orangefs_write_range *wr; + wr = (struct orangefs_write_range *)page_private(page); + + if (offset == 0 && length == PAGE_SIZE) { + kfree((struct orangefs_write_range *)page_private(page)); + set_page_private(page, 0); + ClearPagePrivate(page); + put_page(page); + /* write range entirely within invalidate range (or equal) */ + } else if (page_offset(page) + offset <= wr->pos && + wr->pos + wr->len <= page_offset(page) + offset + length) { + kfree((struct orangefs_write_range *)page_private(page)); + set_page_private(page, 0); + ClearPagePrivate(page); + put_page(page); + /* XXX is this right? 
only caller in fs */ + cancel_dirty_page(page); + /* invalidate range chops off end of write range */ + } else if (wr->pos < page_offset(page) + offset && + wr->pos + wr->len <= page_offset(page) + offset + length && + page_offset(page) + offset < wr->pos + wr->len) { + size_t x; + x = wr->pos + wr->len - (page_offset(page) + offset); + WARN_ON(x > wr->len); + wr->len -= x; + wr->uid = current_fsuid(); + wr->gid = current_fsgid(); + /* invalidate range chops off beginning of write range */ + } else if (page_offset(page) + offset <= wr->pos && + page_offset(page) + offset + length < wr->pos + wr->len && + wr->pos < page_offset(page) + offset + length) { + size_t x; + x = page_offset(page) + offset + length - wr->pos; + WARN_ON(x > wr->len); + wr->pos += x; + wr->len -= x; + wr->uid = current_fsuid(); + wr->gid = current_fsgid(); + /* invalidate range entirely within write range (punch hole) */ + } else if (wr->pos < page_offset(page) + offset && + page_offset(page) + offset + length < wr->pos + wr->len) { + /* XXX what do we do here... should not WARN_ON */ + WARN_ON(1); + /* punch hole */ + /* + * should we just ignore this and write it out anyway? 
+ * it hardly makes sense + */ + /* non-overlapping ranges */ + } else { + /* WARN if they do overlap */ + if (!((page_offset(page) + offset + length <= wr->pos) ^ + (wr->pos + wr->len <= page_offset(page) + offset))) { + WARN_ON(1); + printk("invalidate range offset %llu length %u\n", + page_offset(page) + offset, length); + printk("write range offset %llu length %zu\n", + wr->pos, wr->len); + } + } +} - ClearPageUptodate(page); - ClearPageMappedToDisk(page); - return; +static int orangefs_releasepage(struct page *page, gfp_t foo) +{ + return !PagePrivate(page); +} +static void orangefs_freepage(struct page *page) +{ + if (PagePrivate(page)) { + kfree((struct orangefs_write_range *)page_private(page)); + set_page_private(page, 0); + ClearPagePrivate(page); + put_page(page); + } } -static int orangefs_releasepage(struct page *page, gfp_t foo) +static int orangefs_launder_page(struct page *page) { - gossip_debug(GOSSIP_INODE_DEBUG, - "orangefs_releasepage called on page %p\n", - page); - return 0; + int r = 0; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, + }; + wait_on_page_writeback(page); + if (clear_page_dirty_for_io(page)) { + r = orangefs_writepage_locked(page, &wbc); + end_page_writeback(page); + } + return r; } static ssize_t orangefs_direct_IO(struct kiocb *iocb, @@ -145,7 +304,6 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb, struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; size_t count = iov_iter_count(iter); - size_t ORIGINALcount = iov_iter_count(iter); ssize_t total_count = 0; ssize_t ret = -EINVAL; int i = 0; @@ -192,7 +350,7 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb, (int)*offset); ret = wait_for_direct_io(type, inode, offset, iter, - each_count, 0); + each_count, 0, NULL); gossip_debug(GOSSIP_FILE_DEBUG, "%s(%pU): return from wait_for_io:%d\n", __func__, @@ -247,13 +405,82 @@ static const struct 
address_space_operations orangefs_address_operations = { .writepage = orangefs_writepage, .readpage = orangefs_readpage, .set_page_dirty = __set_page_dirty_nobuffers, - .write_begin = simple_write_begin, + .write_begin = orangefs_write_begin, .write_end = orangefs_write_end, .invalidatepage = orangefs_invalidatepage, .releasepage = orangefs_releasepage, + .freepage = orangefs_freepage, + .launder_page = orangefs_launder_page, .direct_IO = orangefs_direct_IO, }; +vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vmf->vma->vm_file); + vm_fault_t ret = VM_FAULT_LOCKED; + struct orangefs_write_range *wr; + + lock_page(page); + if (PageDirty(page) && !PagePrivate(page)) { + /* + * Should be impossible. If it happens, launder the page + * since we don't know what's dirty. This will WARN in + * orangefs_writepage_locked. + */ + if (orangefs_launder_page(page)) { + ret = VM_FAULT_RETRY; + goto out; + } + } + if (PagePrivate(page)) { + wr = (struct orangefs_write_range *)page_private(page); + if (uid_eq(wr->uid, current_fsuid()) && + gid_eq(wr->gid, current_fsgid())) { + wr->pos = page_offset(page); + wr->len = PAGE_SIZE; + goto okay; + } else { + if (orangefs_launder_page(page)) { + ret = VM_FAULT_RETRY; + goto out; + } + } + } + wr = kmalloc(sizeof *wr, GFP_KERNEL); + if (!wr) { + ret = VM_FAULT_RETRY; + goto out; + } + wr->pos = page_offset(page); + wr->len = PAGE_SIZE; + wr->uid = current_fsuid(); + wr->gid = current_fsgid(); + SetPagePrivate(page); + set_page_private(page, (unsigned long)wr); + get_page(page); +okay: + + sb_start_pagefault(inode->i_sb); + file_update_time(vmf->vma->vm_file); + if (page->mapping != inode->i_mapping) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + goto out; + } + + /* + * We mark the page dirty already here so that when freeze is in + * progress, we are guaranteed that writeback during freezing will + * see the dirty page and writeprotect it again. 
+ */ + set_page_dirty(page); + wait_for_stable_page(page); +out: + sb_end_pagefault(inode->i_sb); + return ret; +} + static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 307bbb61819a..336a3ec0b83e 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -230,6 +230,13 @@ struct orangefs_cached_xattr { unsigned long timeout; }; +struct orangefs_write_range { + loff_t pos; + size_t len; + kuid_t uid; + kgid_t gid; +}; + extern struct orangefs_stats orangefs_stats; /* @@ -342,6 +349,7 @@ void fsid_key_table_finalize(void); /* * defined in inode.c */ +vm_fault_t orangefs_page_mkwrite(struct vm_fault *); struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, int mode, @@ -383,7 +391,7 @@ bool __is_daemon_in_service(void); * defined in file.c */ ssize_t wait_for_direct_io(enum ORANGEFS_io_type, struct inode *, loff_t *, - struct iov_iter *, size_t, loff_t); + struct iov_iter *, size_t, loff_t, struct orangefs_write_range *); ssize_t do_readv_writev(enum ORANGEFS_io_type, struct file *, loff_t *, struct iov_iter *); -- 2.20.1