From: Martin Brandenburg <martin@xxxxxxxxxxxx> This is modeled after NFS, except our method is different. We use a simple timer to determine whether to invalidate the page cache. This is bound to perform. This adds a sysfs parameter cache_timeout_msecs which controls the time between page cache invalidations. Signed-off-by: Martin Brandenburg <martin@xxxxxxxxxxxx> Signed-off-by: Mike Marshall <hubcap@xxxxxxxxxxxx> --- fs/orangefs/file.c | 70 +++++++++- fs/orangefs/inode.c | 250 +++++++++++++++++++++++++++++++--- fs/orangefs/orangefs-kernel.h | 4 + fs/orangefs/orangefs-mod.c | 1 + fs/orangefs/orangefs-sysfs.c | 22 +++ 5 files changed, 328 insertions(+), 19 deletions(-) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 405449ce4b02..faa5b61cdfd6 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -241,18 +241,78 @@ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, return ret; } +int orangefs_revalidate_mapping(struct inode *inode) +{ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct address_space *mapping = inode->i_mapping; + unsigned long *bitlock = &orangefs_inode->bitlock; + int ret; + + while (1) { + ret = wait_on_bit(bitlock, 1, TASK_KILLABLE); + if (ret) + return ret; + spin_lock(&inode->i_lock); + if (test_bit(1, bitlock)) { + spin_unlock(&inode->i_lock); + continue; + } + if (!time_before(jiffies, orangefs_inode->mapping_time)) + break; + spin_unlock(&inode->i_lock); + return 0; + } + + set_bit(1, bitlock); + smp_wmb(); + spin_unlock(&inode->i_lock); + + unmap_mapping_range(mapping, 0, 0, 0); + ret = filemap_write_and_wait(mapping); + if (!ret) + ret = invalidate_inode_pages2(mapping); + + orangefs_inode->mapping_time = jiffies + + orangefs_cache_timeout_msecs*HZ/1000; + + clear_bit(1, bitlock); + smp_mb__after_atomic(); + wake_up_bit(bitlock, 1); + + return ret; +} + static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { + int ret; orangefs_stats.reads++; - return
generic_file_read_iter(iocb, iter); + + down_read(&file_inode(iocb->ki_filp)->i_rwsem); + ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp)); + if (ret) + goto out; + + ret = generic_file_read_iter(iocb, iter); +out: + up_read(&file_inode(iocb->ki_filp)->i_rwsem); + return ret; } static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) { + int ret; orangefs_stats.writes++; - return generic_file_write_iter(iocb, iter); + + if (iocb->ki_pos > i_size_read(file_inode(iocb->ki_filp))) { + ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp)); + if (ret) + return ret; + } + + ret = generic_file_write_iter(iocb, iter); + return ret; } /* @@ -341,6 +401,12 @@ static const struct vm_operations_struct orangefs_file_vm_ops = { */ static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) { + int ret; + + ret = orangefs_revalidate_mapping(file_inode(file)); + if (ret) + return ret; + gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_mmap: called on %s\n", (file ? 
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index add9c569a7dc..7ed2ea093c4e 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -31,6 +31,7 @@ static int orangefs_writepage_locked(struct page *page, len = i_size_read(inode); if (PagePrivate(page)) { wr = (struct orangefs_write_range *)page_private(page); + WARN_ON(wr->pos >= len); off = wr->pos; if (off + wr->len > len) wlen = len - off; @@ -79,6 +80,173 @@ static int orangefs_writepage(struct page *page, struct writeback_control *wbc) return ret; } +struct orangefs_writepages { + loff_t off; + size_t len; + kuid_t uid; + kgid_t gid; + int maxpages; + int npages; + struct page **pages; + struct bio_vec *bv; +}; + +static int orangefs_writepages_work(struct orangefs_writepages *ow, + struct writeback_control *wbc) +{ + struct inode *inode = ow->pages[0]->mapping->host; + struct orangefs_write_range *wrp, wr; + struct iov_iter iter; + ssize_t ret; + size_t len; + loff_t off; + int i; + + len = i_size_read(inode); + + for (i = 0; i < ow->npages; i++) { + set_page_writeback(ow->pages[i]); + ow->bv[i].bv_page = ow->pages[i]; + ow->bv[i].bv_len = min(page_offset(ow->pages[i]) + PAGE_SIZE, + ow->off + ow->len) - + max(ow->off, page_offset(ow->pages[i])); + if (i == 0) + ow->bv[i].bv_offset = ow->off - + page_offset(ow->pages[i]); + else + ow->bv[i].bv_offset = 0; + } + iov_iter_bvec(&iter, WRITE, ow->bv, ow->npages, ow->len); + + WARN_ON(ow->off >= len); + if (ow->off + ow->len > len) + ow->len = len - ow->off; + + off = ow->off; + wr.uid = ow->uid; + wr.gid = ow->gid; + ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, ow->len, + 0, &wr); + if (ret < 0) { + for (i = 0; i < ow->npages; i++) { + SetPageError(ow->pages[i]); + mapping_set_error(ow->pages[i]->mapping, ret); + if (PagePrivate(ow->pages[i])) { + wrp = (struct orangefs_write_range *) + page_private(ow->pages[i]); + ClearPagePrivate(ow->pages[i]); + put_page(ow->pages[i]); + kfree(wrp); + } + end_page_writeback(ow->pages[i]); + 
unlock_page(ow->pages[i]); + } + } else { + ret = 0; + for (i = 0; i < ow->npages; i++) { + if (PagePrivate(ow->pages[i])) { + wrp = (struct orangefs_write_range *) + page_private(ow->pages[i]); + ClearPagePrivate(ow->pages[i]); + put_page(ow->pages[i]); + kfree(wrp); + } + end_page_writeback(ow->pages[i]); + unlock_page(ow->pages[i]); + } + } + return ret; +} + +static int orangefs_writepages_callback(struct page *page, + struct writeback_control *wbc, void *data) +{ + struct orangefs_writepages *ow = data; + struct orangefs_write_range *wr; + int ret; + + if (!PagePrivate(page)) { + unlock_page(page); + /* It's not private so there's nothing to write, right? */ + printk("writepages_callback not private!\n"); + BUG(); + return 0; + } + wr = (struct orangefs_write_range *)page_private(page); + + ret = -1; + if (ow->npages == 0) { + ow->off = wr->pos; + ow->len = wr->len; + ow->uid = wr->uid; + ow->gid = wr->gid; + ow->pages[ow->npages++] = page; + ret = 0; + goto done; + } + if (!uid_eq(ow->uid, wr->uid) || !gid_eq(ow->gid, wr->gid)) { + orangefs_writepages_work(ow, wbc); + ow->npages = 0; + ret = -1; + goto done; + } + if (ow->off + ow->len == wr->pos) { + ow->len += wr->len; + ow->pages[ow->npages++] = page; + ret = 0; + goto done; + } +done: + if (ret == -1) { + if (ow->npages) { + orangefs_writepages_work(ow, wbc); + ow->npages = 0; + } + ret = orangefs_writepage_locked(page, wbc); + mapping_set_error(page->mapping, ret); + unlock_page(page); + end_page_writeback(page); + } else { + if (ow->npages == ow->maxpages) { + orangefs_writepages_work(ow, wbc); + ow->npages = 0; + } + } + return ret; +} + +static int orangefs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct orangefs_writepages *ow; + struct blk_plug plug; + int ret; + ow = kzalloc(sizeof(struct orangefs_writepages), GFP_KERNEL); + if (!ow) + return -ENOMEM; + ow->maxpages = orangefs_bufmap_size_query()/PAGE_SIZE; + ow->pages = kcalloc(ow->maxpages, sizeof(struct 
page *), GFP_KERNEL); + if (!ow->pages) { + kfree(ow); + return -ENOMEM; + } + ow->bv = kcalloc(ow->maxpages, sizeof(struct bio_vec), GFP_KERNEL); + if (!ow->bv) { + kfree(ow->pages); + kfree(ow); + return -ENOMEM; + } + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, orangefs_writepages_callback, ow); + if (ow->npages) + ret = orangefs_writepages_work(ow, wbc); + blk_finish_plug(&plug); + kfree(ow->pages); + kfree(ow->bv); + kfree(ow); + return ret; +} + static int orangefs_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; @@ -93,6 +261,9 @@ static int orangefs_readpage(struct file *file, struct page *page) bv.bv_offset = 0; iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE); + if (PageDirty(page)) + orangefs_launder_page(page); + ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter, PAGE_SIZE, inode->i_size, NULL); /* this will only zero remaining unread portions of the page data */ @@ -170,22 +341,42 @@ static int orangefs_write_begin(struct file *file, set_page_private(page, (unsigned long)wr); get_page(page); okay: - - if (!PageUptodate(page) && (len != PAGE_SIZE)) { - unsigned from = pos & (PAGE_SIZE - 1); - - zero_user_segments(page, 0, from, from + len, PAGE_SIZE); - } return 0; } static int orangefs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - int r; - r = simple_write_end(file, mapping, pos, len, copied, page, fsdata); + struct inode *inode = page->mapping->host; + loff_t last_pos = pos + copied; + + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold the i_mutex. 
+ */ + if (last_pos > inode->i_size) + i_size_write(inode, last_pos); + + /* zero the stale part of the page if we did a short copy */ + if (!PageUptodate(page)) { + unsigned from = pos & (PAGE_SIZE - 1); + if (copied < len) { + zero_user(page, from + copied, len - copied); + } + /* Set fully written pages uptodate. */ + if (pos == page_offset(page) && + (len == PAGE_SIZE || pos + len == inode->i_size)) { + zero_user_segment(page, from + copied, PAGE_SIZE); + SetPageUptodate(page); + } + } + + set_page_dirty(page); + unlock_page(page); + put_page(page); + mark_inode_dirty_sync(file_inode(file)); - return r; + return copied; } static void orangefs_invalidatepage(struct page *page, @@ -200,6 +391,7 @@ static void orangefs_invalidatepage(struct page *page, set_page_private(page, 0); ClearPagePrivate(page); put_page(page); + return; /* write range entirely within invalidate range (or equal) */ } else if (page_offset(page) + offset <= wr->pos && wr->pos + wr->len <= page_offset(page) + offset + length) { @@ -209,6 +401,7 @@ static void orangefs_invalidatepage(struct page *page, put_page(page); /* XXX is this right? only caller in fs */ cancel_dirty_page(page); + return; /* invalidate range chops off end of write range */ } else if (wr->pos < page_offset(page) + offset && wr->pos + wr->len <= page_offset(page) + offset + length && @@ -240,6 +433,7 @@ static void orangefs_invalidatepage(struct page *page, * should we just ignore this and write it out anyway? * it hardly makes sense */ + return; /* non-overlapping ranges */ } else { /* WARN if they do overlap */ @@ -251,7 +445,15 @@ static void orangefs_invalidatepage(struct page *page, printk("write range offset %llu length %zu\n", wr->pos, wr->len); } + return; } + + /* + * Above there are returns where wr is freed or where we WARN. + * Thus the following runs if wr was modified above. 
+ */ + + orangefs_launder_page(page); } static int orangefs_releasepage(struct page *page, gfp_t foo) @@ -404,6 +606,7 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb, static const struct address_space_operations orangefs_address_operations = { .writepage = orangefs_writepage, .readpage = orangefs_readpage, + .writepages = orangefs_writepages, .set_page_dirty = __set_page_dirty_nobuffers, .write_begin = orangefs_write_begin, .write_end = orangefs_write_end, @@ -418,9 +621,18 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); - vm_fault_t ret = VM_FAULT_LOCKED; + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + unsigned long *bitlock = &orangefs_inode->bitlock; + vm_fault_t ret; struct orangefs_write_range *wr; + sb_start_pagefault(inode->i_sb); + + if (wait_on_bit(bitlock, 1, TASK_KILLABLE)) { + ret = VM_FAULT_RETRY; + goto out; + } + lock_page(page); if (PageDirty(page) && !PagePrivate(page)) { /* @@ -429,7 +641,7 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf) * orangefs_writepage_locked. 
*/ if (orangefs_launder_page(page)) { - ret = VM_FAULT_RETRY; + ret = VM_FAULT_LOCKED|VM_FAULT_RETRY; goto out; } } @@ -442,14 +654,14 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf) goto okay; } else { if (orangefs_launder_page(page)) { - ret = VM_FAULT_RETRY; + ret = VM_FAULT_LOCKED|VM_FAULT_RETRY; goto out; } } } wr = kmalloc(sizeof *wr, GFP_KERNEL); if (!wr) { - ret = VM_FAULT_RETRY; + ret = VM_FAULT_LOCKED|VM_FAULT_RETRY; goto out; } wr->pos = page_offset(page); @@ -461,11 +673,10 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf) get_page(page); okay: - sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); if (page->mapping != inode->i_mapping) { unlock_page(page); - ret = VM_FAULT_NOPAGE; + ret = VM_FAULT_LOCKED|VM_FAULT_NOPAGE; goto out; } @@ -476,6 +687,7 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf) */ set_page_dirty(page); wait_for_stable_page(page); + ret = VM_FAULT_LOCKED; out: sb_end_pagefault(inode->i_sb); return ret; @@ -553,13 +765,15 @@ int __orangefs_setattr(struct inode *inode, struct iattr *iattr) } else { gossip_debug(GOSSIP_UTILS_DEBUG, "User attempted to set sticky bit on non-root directory; returning EINVAL.\n"); - return -EINVAL; + ret = -EINVAL; + goto out; } } if (iattr->ia_mode & (S_ISUID)) { gossip_debug(GOSSIP_UTILS_DEBUG, "Attempting to set setuid bit (not supported); returning EINVAL.\n"); - return -EINVAL; + ret = -EINVAL; + goto out; } } @@ -741,6 +955,8 @@ static int orangefs_set_inode(struct inode *inode, void *data) ORANGEFS_I(inode)->refn.khandle = ref->khandle; ORANGEFS_I(inode)->attr_valid = 0; hash_init(ORANGEFS_I(inode)->xattr_cache); + ORANGEFS_I(inode)->mapping_time = jiffies - 1; + ORANGEFS_I(inode)->bitlock = 0; return 0; } diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 336a3ec0b83e..87beab10326a 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -193,9 +193,11 @@ struct orangefs_inode_s { sector_t 
last_failed_block_index_read; unsigned long getattr_time; + unsigned long mapping_time; int attr_valid; kuid_t attr_uid; kgid_t attr_gid; + unsigned long bitlock; DECLARE_HASHTABLE(xattr_cache, 4); }; @@ -390,6 +392,7 @@ bool __is_daemon_in_service(void); /* * defined in file.c */ +int orangefs_revalidate_mapping(struct inode *); ssize_t wait_for_direct_io(enum ORANGEFS_io_type, struct inode *, loff_t *, struct iov_iter *, size_t, loff_t, struct orangefs_write_range *); ssize_t do_readv_writev(enum ORANGEFS_io_type, struct file *, loff_t *, @@ -427,6 +430,7 @@ int orangefs_normalize_to_errno(__s32 error_code); extern struct mutex orangefs_request_mutex; extern int op_timeout_secs; extern int slot_timeout_secs; +extern int orangefs_cache_timeout_msecs; extern int orangefs_dcache_timeout_msecs; extern int orangefs_getattr_timeout_msecs; extern struct list_head orangefs_superblocks; diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c index 85ef87245a87..82cf8b3e568b 100644 --- a/fs/orangefs/orangefs-mod.c +++ b/fs/orangefs/orangefs-mod.c @@ -30,6 +30,7 @@ static ulong module_parm_debug_mask; __u64 orangefs_gossip_debug_mask; int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS; int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS; +int orangefs_cache_timeout_msecs = 50; int orangefs_dcache_timeout_msecs = 50; int orangefs_getattr_timeout_msecs = 50; diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c index 19739aaee675..3627ea946402 100644 --- a/fs/orangefs/orangefs-sysfs.c +++ b/fs/orangefs/orangefs-sysfs.c @@ -62,6 +62,14 @@ * Slots are requested and waited for, * the wait times out after slot_timeout_secs. * + * What: /sys/fs/orangefs/cache_timeout_msecs + * Date: Mar 2018 + * Contact: Martin Brandenburg <martin@xxxxxxxxxxxx> + * Description: + * Time in milliseconds between which + * orangefs_revalidate_mapping will invalidate the page + * cache. 
+ * * What: /sys/fs/orangefs/dcache_timeout_msecs * Date: Jul 2016 * Contact: Martin Brandenburg <martin@xxxxxxxxxxxx> @@ -221,6 +229,13 @@ static ssize_t sysfs_int_show(struct kobject *kobj, "%d\n", slot_timeout_secs); goto out; + } else if (!strcmp(attr->attr.name, + "cache_timeout_msecs")) { + rc = scnprintf(buf, + PAGE_SIZE, + "%d\n", + orangefs_cache_timeout_msecs); + goto out; } else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) { rc = scnprintf(buf, @@ -277,6 +292,9 @@ static ssize_t sysfs_int_store(struct kobject *kobj, } else if (!strcmp(attr->attr.name, "slot_timeout_secs")) { rc = kstrtoint(buf, 0, &slot_timeout_secs); goto out; + } else if (!strcmp(attr->attr.name, "cache_timeout_msecs")) { + rc = kstrtoint(buf, 0, &orangefs_cache_timeout_msecs); + goto out; } else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) { rc = kstrtoint(buf, 0, &orangefs_dcache_timeout_msecs); goto out; @@ -818,6 +836,9 @@ static struct orangefs_attribute op_timeout_secs_attribute = static struct orangefs_attribute slot_timeout_secs_attribute = __ATTR(slot_timeout_secs, 0664, sysfs_int_show, sysfs_int_store); +static struct orangefs_attribute cache_timeout_msecs_attribute = + __ATTR(cache_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store); + static struct orangefs_attribute dcache_timeout_msecs_attribute = __ATTR(dcache_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store); @@ -861,6 +882,7 @@ static struct orangefs_attribute perf_time_interval_secs_attribute = static struct attribute *orangefs_default_attrs[] = { &op_timeout_secs_attribute.attr, &slot_timeout_secs_attribute.attr, + &cache_timeout_msecs_attribute.attr, &dcache_timeout_msecs_attribute.attr, &getattr_timeout_msecs_attribute.attr, &readahead_count_attribute.attr, -- 2.20.1