Hi, I tested your patches on an Ubuntu Lucid system with the Ubuntu Raring kernel (3.8), but with the for-linus branch from ceph-client and your fscache changes. There were no problems under heavy load. But I don't see any difference with/without fscache on our "test" case (mp4 video streaming, ~5500 connections): with fscache: http://imageshack.us/photo/my-images/109/xg5a.png/ without fscache: http://imageshack.us/photo/my-images/5/xak.png/ Elbandi 2013/5/29 Milosz Tanski <milosz@xxxxxxxxx>: > Sage, > > Thanks for taking a look at this. No worries about the timing. > > I added two extra changes into my branch located here: > https://bitbucket.org/adfin/linux-fs/commits/branch/forceph. The first > one is a fix for a kernel deadlock. The second one makes fsc cache a > non-default mount option (akin to NFS). > > Finally, I observed an occasional oops in the fscache that's fixed in > David's branch that's waiting to get into mainline. The fix for the > issue is here: http://git.kernel.org/cgit/linux/kernel/git/dhowells/linux-fs.git/commit/?h=fscache&id=82958c45e35963c93fc6cbe6a27752e2d97e9f9a. > I can only cause that issue by forcing the kernel to drop its caches > in some cases. > > Let me know if you have any other feedback, or if I can help in any way. > > Thanks, > - Milosz > > On Tue, May 28, 2013 at 1:11 PM, Sage Weil <sage@xxxxxxxxxxx> wrote: >> Hi Milosz, >> >> Just a heads up that I hope to take a closer look at the patch this >> afternoon or tomorrow. Just catching up after the long weekend. >> >> Thanks! >> sage >> >> >> On Thu, 23 May 2013, Milosz Tanski wrote: >> >>> Enable fscache as an optional feature of ceph. >>> >>> Adding support for fscache to the Ceph filesystem. This would bring it on >>> par with some of the other network filesystems in Linux (like NFS, AFS, etc...) >>> >>> This exploits the existing Ceph cache & lazyio capabilities. 
>>> >>> Signed-off-by: Milosz Tanski <milosz@xxxxxxxxx> >>> --- >>> fs/ceph/Kconfig | 9 ++++++ >>> fs/ceph/Makefile | 2 ++ >>> fs/ceph/addr.c | 85 ++++++++++++++++++++++++++++++++++++++++-------------- >>> fs/ceph/caps.c | 21 +++++++++++++- >>> fs/ceph/file.c | 9 ++++++ >>> fs/ceph/inode.c | 25 ++++++++++++++-- >>> fs/ceph/super.c | 25 ++++++++++++++-- >>> fs/ceph/super.h | 12 ++++++++ >>> 8 files changed, 162 insertions(+), 26 deletions(-) >>> >>> diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig >>> index 49bc782..ac9a2ef 100644 >>> --- a/fs/ceph/Kconfig >>> +++ b/fs/ceph/Kconfig >>> @@ -16,3 +16,12 @@ config CEPH_FS >>> >>> If unsure, say N. >>> >>> +if CEPH_FS >>> +config CEPH_FSCACHE >>> + bool "Enable Ceph client caching support" >>> + depends on CEPH_FS=m && FSCACHE || CEPH_FS=y && FSCACHE=y >>> + help >>> + Choose Y here to enable persistent, read-only local >>> + caching support for Ceph clients using FS-Cache >>> + >>> +endif >>> diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile >>> index bd35212..0af0678 100644 >>> --- a/fs/ceph/Makefile >>> +++ b/fs/ceph/Makefile >>> @@ -9,3 +9,5 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ >>> mds_client.o mdsmap.o strings.o ceph_frag.o \ >>> debugfs.o >>> >>> +ceph-$(CONFIG_CEPH_FSCACHE) += cache.o >>> + >>> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c >>> index 3e68ac1..fd3a1cc 100644 >>> --- a/fs/ceph/addr.c >>> +++ b/fs/ceph/addr.c >>> @@ -11,6 +11,7 @@ >>> >>> #include "super.h" >>> #include "mds_client.h" >>> +#include "cache.h" >>> #include <linux/ceph/osd_client.h> >>> >>> /* >>> @@ -149,11 +150,26 @@ static void ceph_invalidatepage(struct page >>> *page, unsigned long offset) >>> struct ceph_inode_info *ci; >>> struct ceph_snap_context *snapc = page_snap_context(page); >>> >>> - BUG_ON(!PageLocked(page)); >>> - BUG_ON(!PagePrivate(page)); >>> BUG_ON(!page->mapping); >>> >>> inode = page->mapping->host; >>> + ci = ceph_inode(inode); >>> + >>> + if (offset != 0) { >>> + dout("%p invalidatepage 
%p idx %lu partial dirty page\n", >>> + inode, page, page->index); >>> + return; >>> + } >>> + >>> +#ifdef CONFIG_CEPH_FSCACHE >>> + if (PageFsCache(page)) >>> + ceph_invalidate_fscache_page(inode, page); >>> +#endif >>> + >>> + if (!PagePrivate(page)) >>> + return; >>> + >>> + BUG_ON(!PageLocked(page)); >>> >>> /* >>> * We can get non-dirty pages here due to races between >>> @@ -163,31 +179,32 @@ static void ceph_invalidatepage(struct page >>> *page, unsigned long offset) >>> if (!PageDirty(page)) >>> pr_err("%p invalidatepage %p page not dirty\n", inode, page); >>> >>> - if (offset == 0) >>> - ClearPageChecked(page); >>> + ClearPageChecked(page); >>> >>> - ci = ceph_inode(inode); >>> - if (offset == 0) { >>> - dout("%p invalidatepage %p idx %lu full dirty page %lu\n", >>> - inode, page, page->index, offset); >>> - ceph_put_wrbuffer_cap_refs(ci, 1, snapc); >>> - ceph_put_snap_context(snapc); >>> - page->private = 0; >>> - ClearPagePrivate(page); >>> - } else { >>> - dout("%p invalidatepage %p idx %lu partial dirty page\n", >>> - inode, page, page->index); >>> - } >>> + dout("%p invalidatepage %p idx %lu full dirty page %lu\n", >>> + inode, page, page->index, offset); >>> + >>> + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); >>> + ceph_put_snap_context(snapc); >>> + page->private = 0; >>> + ClearPagePrivate(page); >>> } >>> >>> -/* just a sanity check */ >>> static int ceph_releasepage(struct page *page, gfp_t g) >>> { >>> struct inode *inode = page->mapping ? page->mapping->host : NULL; >>> dout("%p releasepage %p idx %lu\n", inode, page, page->index); >>> WARN_ON(PageDirty(page)); >>> - WARN_ON(PagePrivate(page)); >>> - return 0; >>> + >>> +#ifdef CONFIG_CEPH_FSCACHE >>> + /* Can we release the page from the cache? 
*/ >>> + if (PageFsCache(page) && ceph_release_fscache_page(page, g) == 0) >>> + return 0; >>> +#endif >>> + if (PagePrivate(page)) >>> + return 0; >>> + >>> + return 1; >>> } >>> >>> /* >>> @@ -197,11 +214,18 @@ static int readpage_nounlock(struct file *filp, >>> struct page *page) >>> { >>> struct inode *inode = file_inode(filp); >>> struct ceph_inode_info *ci = ceph_inode(inode); >>> - struct ceph_osd_client *osdc = >>> + struct ceph_osd_client *osdc = >>> &ceph_inode_to_client(inode)->client->osdc; >>> int err = 0; >>> u64 len = PAGE_CACHE_SIZE; >>> >>> +#ifdef CONFIG_CEPH_FSCACHE >>> + err = ceph_readpage_from_fscache(inode, page); >>> + >>> + if (err == 0) >>> + goto out; >>> +#endif >>> + >>> dout("readpage inode %p file %p page %p index %lu\n", >>> inode, filp, page, page->index); >>> err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, >>> @@ -219,6 +243,10 @@ static int readpage_nounlock(struct file *filp, >>> struct page *page) >>> } >>> SetPageUptodate(page); >>> >>> +#ifdef CONFIG_CEPH_FSCACHE >>> + ceph_readpage_to_fscache(inode, page); >>> +#endif >>> + >>> out: >>> return err < 0 ? 
err : 0; >>> } >>> @@ -262,6 +290,9 @@ static void finish_read(struct ceph_osd_request >>> *req, struct ceph_msg *msg) >>> flush_dcache_page(page); >>> SetPageUptodate(page); >>> unlock_page(page); >>> +#ifdef CONFIG_CEPH_FSCACHE >>> + ceph_readpage_to_fscache(inode, page); >>> +#endif >>> page_cache_release(page); >>> bytes -= PAGE_CACHE_SIZE; >>> } >>> @@ -330,7 +361,7 @@ static int start_read(struct inode *inode, struct >>> list_head *page_list, int max) >>> page = list_entry(page_list->prev, struct page, lru); >>> BUG_ON(PageLocked(page)); >>> list_del(&page->lru); >>> - >>> + >>> dout("start_read %p adding %p idx %lu\n", inode, page, >>> page->index); >>> if (add_to_page_cache_lru(page, &inode->i_data, page->index, >>> @@ -377,6 +408,14 @@ static int ceph_readpages(struct file *file, >>> struct address_space *mapping, >>> int rc = 0; >>> int max = 0; >>> >>> +#ifdef CONFIG_CEPH_FSCACHE >>> + rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list, >>> + &nr_pages); >>> + >>> + if (rc == 0) >>> + goto out; >>> +#endif >>> + >>> if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) >>> max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) >>> >> PAGE_SHIFT; >>> @@ -490,6 +529,10 @@ static int writepage_nounlock(struct page *page, >>> struct writeback_control *wbc) >>> CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) >>> set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); >>> >>> +#ifdef CONFIG_CEPH_FSCACHE >>> + ceph_readpage_to_fscache(inode, page); >>> +#endif >>> + >>> set_page_writeback(page); >>> err = ceph_osdc_writepages(osdc, ceph_vino(inode), >>> &ci->i_layout, snapc, >>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c >>> index da0f9b8..7e8d8d3 100644 >>> --- a/fs/ceph/caps.c >>> +++ b/fs/ceph/caps.c >>> @@ -10,6 +10,7 @@ >>> >>> #include "super.h" >>> #include "mds_client.h" >>> +#include "cache.h" >>> #include <linux/ceph/decode.h> >>> #include <linux/ceph/messenger.h> >>> >>> @@ -486,8 +487,14 @@ static void 
__check_cap_issue(struct >>> ceph_inode_info *ci, struct ceph_cap *cap, >>> * i_rdcache_gen. >>> */ >>> if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && >>> - (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) >>> + (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) { >>> ci->i_rdcache_gen++; >>> +#ifdef CONFIG_CEPH_FSCACHE >>> + /* Invalidate the cache for the whole file. */ >>> + dout("Invalidating inode data cache: %p", &ci->vfs_inode); >>> + fscache_invalidate(ci->fscache); >>> +#endif >>> + } >>> >>> /* >>> * if we are newly issued FILE_SHARED, mark dir not complete; we >>> @@ -2356,6 +2363,12 @@ static void handle_cap_grant(struct inode >>> *inode, struct ceph_mds_caps *grant, >>> if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && >>> (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && >>> !ci->i_wrbuffer_ref) { >>> + >>> +#ifdef CONFIG_CEPH_FSCACHE >>> + /* Close the fscache on inode */ >>> + ceph_fscache_unregister_inode_cookie(ci); >>> +#endif >>> + >>> if (try_nonblocking_invalidate(inode) == 0) { >>> revoked_rdcache = 1; >>> } else { >>> @@ -2425,6 +2438,12 @@ static void handle_cap_grant(struct inode >>> *inode, struct ceph_mds_caps *grant, >>> wake = 1; >>> } >>> >>> +#ifdef CONFIG_CEPH_FSCACHE >>> + /* Register cache (if needed); perform this after amny size change. 
*/ >>> + if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO))) >>> + ceph_fscache_register_inode_cookie(session->s_mdsc->fsc, ci); >>> +#endif >>> + >>> /* check cap bits */ >>> wanted = __ceph_caps_wanted(ci); >>> used = __ceph_caps_used(ci); >>> diff --git a/fs/ceph/file.c b/fs/ceph/file.c >>> index 656e169..e7ecc04 100644 >>> --- a/fs/ceph/file.c >>> +++ b/fs/ceph/file.c >>> @@ -11,6 +11,7 @@ >>> >>> #include "super.h" >>> #include "mds_client.h" >>> +#include "cache.h" >>> >>> /* >>> * Ceph file operations >>> @@ -67,10 +68,17 @@ out: >>> static int ceph_init_file(struct inode *inode, struct file *file, int fmode) >>> { >>> struct ceph_file_info *cf; >>> + struct ceph_inode_info *ci = ceph_inode(inode); >>> + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); >>> int ret = 0; >>> >>> switch (inode->i_mode & S_IFMT) { >>> case S_IFREG: >>> +#ifdef CONFIG_CEPH_FSCACHE >>> + spin_lock(&ci->i_ceph_lock); >>> + ceph_fscache_register_inode_cookie(fsc, ci); >>> + spin_lock(&ci->i_ceph_lock); >>> +#endif >>> case S_IFDIR: >>> dout("init_file %p %p 0%o (regular)\n", inode, file, >>> inode->i_mode); >>> @@ -181,6 +189,7 @@ int ceph_open(struct inode *inode, struct file *file) >>> spin_unlock(&ci->i_ceph_lock); >>> return ceph_init_file(inode, file, fmode); >>> } >>> + >>> spin_unlock(&ci->i_ceph_lock); >>> >>> dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); >>> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c >>> index be0f7e2..620b84c 100644 >>> --- a/fs/ceph/inode.c >>> +++ b/fs/ceph/inode.c >>> @@ -12,6 +12,7 @@ >>> >>> #include "super.h" >>> #include "mds_client.h" >>> +#include "cache.h" >>> #include <linux/ceph/decode.h> >>> >>> /* >>> @@ -377,6 +378,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) >>> >>> INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work); >>> >>> +#ifdef CONFIG_CEPH_FSCACHE >>> + ci->fscache = NULL; >>> +#endif >>> + >>> return &ci->vfs_inode; >>> } >>> >>> @@ -396,6 +401,10 @@ void 
ceph_destroy_inode(struct inode *inode) >>> >>> dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode)); >>> >>> +#ifdef CONFIG_CEPH_FSCACHE >>> + ceph_fscache_unregister_inode_cookie(ci); >>> +#endif >>> + >>> ceph_queue_caps_release(inode); >>> >>> /* >>> @@ -430,7 +439,6 @@ void ceph_destroy_inode(struct inode *inode) >>> call_rcu(&inode->i_rcu, ceph_i_callback); >>> } >>> >>> - >>> /* >>> * Helpers to fill in size, ctime, mtime, and atime. We have to be >>> * careful because either the client or MDS may have more up to date >>> @@ -633,6 +641,14 @@ static int fill_inode(struct inode *inode, >>> le32_to_cpu(info->time_warp_seq), >>> &ctime, &mtime, &atime); >>> >>> +#ifdef CONFIG_CEPH_FSCACHE >>> + /* Notify the cache that size has changed */ >>> + if (queue_trunc && ci->fscache) { >>> + pr_info("size changed inode: %p cap flags\n", &ci->vfs_inode); >>> + fscache_attr_changed(ci->fscache); >>> + } >>> +#endif >>> + >>> /* only update max_size on auth cap */ >>> if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && >>> ci->i_max_size != le64_to_cpu(info->max_size)) { >>> @@ -1066,7 +1082,7 @@ int ceph_fill_trace(struct super_block *sb, >>> struct ceph_mds_request *req, >>> * complete. 
>>> */ >>> ceph_set_dentry_offset(req->r_old_dentry); >>> - dout("dn %p gets new offset %lld\n", req->r_old_dentry, >>> + dout("dn %p gets new offset %lld\n", req->r_old_dentry, >>> ceph_dentry(req->r_old_dentry)->offset); >>> >>> dn = req->r_old_dentry; /* use old_dentry */ >>> @@ -1430,6 +1446,11 @@ static void ceph_invalidate_work(struct >>> work_struct *work) >>> orig_gen = ci->i_rdcache_gen; >>> spin_unlock(&ci->i_ceph_lock); >>> >>> +#ifdef CONFIG_CEPH_FSCACHE >>> + pr_info("cache invalidating inode: %p cap flags\n", &ci->vfs_inode); >>> + fscache_invalidate(ci->fscache); >>> +#endif >>> + >>> truncate_inode_pages(&inode->i_data, 0); >>> >>> spin_lock(&ci->i_ceph_lock); >>> diff --git a/fs/ceph/super.c b/fs/ceph/super.c >>> index 7d377c9..7847ef7 100644 >>> --- a/fs/ceph/super.c >>> +++ b/fs/ceph/super.c >>> @@ -17,6 +17,7 @@ >>> >>> #include "super.h" >>> #include "mds_client.h" >>> +#include "cache.h" >>> >>> #include <linux/ceph/ceph_features.h> >>> #include <linux/ceph/decode.h> >>> @@ -530,6 +531,11 @@ static struct ceph_fs_client >>> *create_fs_client(struct ceph_mount_options *fsopt, >>> if (!fsc->wb_pagevec_pool) >>> goto fail_trunc_wq; >>> >>> +#ifdef CONFIG_CEPH_FSCACHE >>> + /* fscache */ >>> + ceph_fscache_register_fsid_cookie(fsc); >>> +#endif >>> + >>> /* caps */ >>> fsc->min_caps = fsopt->max_readdir; >>> >>> @@ -554,6 +560,10 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) >>> { >>> dout("destroy_fs_client %p\n", fsc); >>> >>> +#ifdef CONFIG_CEPH_FSCACHE >>> + ceph_fscache_unregister_fsid_cookie(fsc); >>> +#endif >>> + >>> destroy_workqueue(fsc->wb_wq); >>> destroy_workqueue(fsc->pg_inv_wq); >>> destroy_workqueue(fsc->trunc_wq); >>> @@ -588,6 +598,8 @@ static void ceph_inode_init_once(void *foo) >>> >>> static int __init init_caches(void) >>> { >>> + int error = -ENOMEM; >>> + >>> ceph_inode_cachep = kmem_cache_create("ceph_inode_info", >>> sizeof(struct ceph_inode_info), >>> __alignof__(struct ceph_inode_info), >>> @@ -611,15 
+623,19 @@ static int __init init_caches(void) >>> if (ceph_file_cachep == NULL) >>> goto bad_file; >>> >>> - return 0; >>> +#ifdef CONFIG_CEPH_FSCACHE >>> + if ((error = fscache_register_netfs(&ceph_cache_netfs))) >>> + goto bad_file; >>> +#endif >>> >>> + return 0; >>> bad_file: >>> kmem_cache_destroy(ceph_dentry_cachep); >>> bad_dentry: >>> kmem_cache_destroy(ceph_cap_cachep); >>> bad_cap: >>> kmem_cache_destroy(ceph_inode_cachep); >>> - return -ENOMEM; >>> + return error; >>> } >>> >>> static void destroy_caches(void) >>> @@ -629,10 +645,15 @@ static void destroy_caches(void) >>> * destroy cache. >>> */ >>> rcu_barrier(); >>> + >>> kmem_cache_destroy(ceph_inode_cachep); >>> kmem_cache_destroy(ceph_cap_cachep); >>> kmem_cache_destroy(ceph_dentry_cachep); >>> kmem_cache_destroy(ceph_file_cachep); >>> + >>> +#ifdef CONFIG_CEPH_FSCACHE >>> + fscache_unregister_netfs(&ceph_cache_netfs); >>> +#endif >>> } >>> >>> >>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h >>> index 8696be2..2980337 100644 >>> --- a/fs/ceph/super.h >>> +++ b/fs/ceph/super.h >>> @@ -16,6 +16,10 @@ >>> >>> #include <linux/ceph/libceph.h> >>> >>> +#ifdef CONFIG_CEPH_FSCACHE >>> +#include <linux/fscache.h> >>> +#endif >>> + >>> /* f_type in struct statfs */ >>> #define CEPH_SUPER_MAGIC 0x00c36400 >>> >>> @@ -90,6 +94,10 @@ struct ceph_fs_client { >>> struct dentry *debugfs_bdi; >>> struct dentry *debugfs_mdsc, *debugfs_mdsmap; >>> #endif >>> + >>> +#ifdef CONFIG_CEPH_FSCACHE >>> + struct fscache_cookie *fscache; >>> +#endif >>> }; >>> >>> >>> @@ -319,6 +327,10 @@ struct ceph_inode_info { >>> >>> struct work_struct i_vmtruncate_work; >>> >>> +#ifdef CONFIG_CEPH_FSCACHE >>> + struct fscache_cookie *fscache; >>> +#endif >>> + >>> struct inode vfs_inode; /* at end */ >>> }; >>> >>> -- >>> 1.7.9.5 >>> -- >>> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in >>> the body of a message to majordomo@xxxxxxxxxxxxxxx >>> More majordomo info at http://vger.kernel.org/majordomo-info.html 
>>> >>> > -- > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html