On Mon, Mar 5, 2018 at 8:17 PM, Chengguang Xu <cgxu519@xxxxxxx> wrote: > In current code, regular file and directory use same struct > ceph_file_info to store fs specific data so the struct has to > include some fields which are only used for directory > (e.g., readdir related info), when having plenty of regular files, > it will lead to memory waste. > > This patch introduces dedicated readdir_info cache to separate readdir > related things from struct ceph_file_info, so that regular file could > not include those unused fields anymore. Also, chagned to manipulate > fscache only when reuglar file successfully acquired ceph_file_info. > > Signed-off-by: Chengguang Xu <cgxu519@xxxxxxx> > --- > fs/ceph/dir.c | 158 +++++++++++++++++++++++-------------------- > fs/ceph/file.c | 88 ++++++++++++++++-------- > fs/ceph/super.c | 14 +++- > fs/ceph/super.h | 19 ++++-- > include/linux/ceph/libceph.h | 1 + > 5 files changed, 170 insertions(+), 110 deletions(-) > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c > index 0c43468..84bc75a 100644 > --- a/fs/ceph/dir.c > +++ b/fs/ceph/dir.c > @@ -105,15 +105,17 @@ static int fpos_cmp(loff_t l, loff_t r) > static int note_last_dentry(struct ceph_file_info *fi, const char *name, > int len, unsigned next_offset) > { > + struct ceph_readdir_info *ri = fi->readdir_info; > char *buf = kmalloc(len+1, GFP_KERNEL); > if (!buf) > return -ENOMEM; > - kfree(fi->last_name); > - fi->last_name = buf; > - memcpy(fi->last_name, name, len); > - fi->last_name[len] = 0; > - fi->next_offset = next_offset; > - dout("note_last_dentry '%s'\n", fi->last_name); > + > + kfree(ri->last_name); > + ri->last_name = buf; > + memcpy(ri->last_name, name, len); > + ri->last_name[len] = 0; > + ri->next_offset = next_offset; > + dout("note_last_dentry '%s'\n", ri->last_name); > return 0; > } > > @@ -176,6 +178,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, > int shared_gen) > { > struct ceph_file_info *fi = file->private_data; > + struct ceph_readdir_info *ri = fi->readdir_info; > struct dentry *parent = file->f_path.dentry; > struct inode *dir = d_inode(parent); > struct dentry *dentry, *last = NULL; > @@ -279,9 +282,9 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, > err = ret; > dput(last); > /* last_name no longer match cache index */ > - if (fi->readdir_cache_idx >= 0) { > - fi->readdir_cache_idx = -1; > - fi->dir_release_count = 0; > + if (ri->readdir_cache_idx >= 0) { > + ri->readdir_cache_idx = -1; > + ri->dir_release_count = 0; > } > } > return err; > @@ -289,17 +292,20 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, > > static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos) > { > - if (!fi->last_readdir) > + struct ceph_readdir_info *ri = fi->readdir_info; > + > + if (!ri->last_readdir) > return true; > if (is_hash_order(pos)) > - return !ceph_frag_contains_value(fi->frag, fpos_hash(pos)); > + return !ceph_frag_contains_value(ri->frag, fpos_hash(pos)); > else > - return fi->frag != fpos_frag(pos); > + return ri->frag != fpos_frag(pos); > } > > static int ceph_readdir(struct file *file, struct dir_context *ctx) > { > struct ceph_file_info *fi = file->private_data; > + struct ceph_readdir_info *ri = fi->readdir_info; > struct inode *inode = file_inode(file); > struct ceph_inode_info *ci = ceph_inode(inode); > struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > @@ -357,9 +363,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) > CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; > > /* discard old result, if any */ > - if (fi->last_readdir) { > - ceph_mdsc_put_request(fi->last_readdir); > - fi->last_readdir = NULL; > + if (ri->last_readdir) { > + ceph_mdsc_put_request(ri->last_readdir); > + ri->last_readdir = NULL; > } > > if (is_hash_order(ctx->pos)) { > @@ -373,7 +379,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) > } > > dout("readdir fetching %llx.%llx frag %x offset '%s'\n", > - ceph_vinop(inode), frag, fi->last_name); > + ceph_vinop(inode), frag, ri->last_name); > req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); > if (IS_ERR(req)) > return PTR_ERR(req); > @@ -389,8 +395,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) > __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); > req->r_inode_drop = CEPH_CAP_FILE_EXCL; > } > - if (fi->last_name) { > - req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); > + if (ri->last_name) { > + req->r_path2 = kstrdup(ri->last_name, GFP_KERNEL); > if (!req->r_path2) { > ceph_mdsc_put_request(req); > return -ENOMEM; > @@ -400,10 +406,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) > cpu_to_le32(fpos_hash(ctx->pos)); > } > > - req->r_dir_release_cnt = fi->dir_release_count; > - req->r_dir_ordered_cnt = fi->dir_ordered_count; > - req->r_readdir_cache_idx = fi->readdir_cache_idx; > - req->r_readdir_offset = fi->next_offset; > + req->r_dir_release_cnt = ri->dir_release_count; > + req->r_dir_ordered_cnt = ri->dir_ordered_count; > + req->r_readdir_cache_idx = ri->readdir_cache_idx; > + req->r_readdir_offset = ri->next_offset; > req->r_args.readdir.frag = cpu_to_le32(frag); > req->r_args.readdir.flags = > cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS); > @@ -427,35 +433,35 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) > if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { > frag = le32_to_cpu(rinfo->dir_dir->frag); > if (!rinfo->hash_order) { > - fi->next_offset = req->r_readdir_offset; > + ri->next_offset = req->r_readdir_offset; > /* adjust ctx->pos to beginning of frag */ > ctx->pos = ceph_make_fpos(frag, > - fi->next_offset, > + ri->next_offset, > false); > } > } > > - fi->frag = frag; > - fi->last_readdir = req; > + ri->frag = frag; > + ri->last_readdir = req; > > if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) { > - fi->readdir_cache_idx = req->r_readdir_cache_idx; > - if (fi->readdir_cache_idx < 0) { > + ri->readdir_cache_idx = req->r_readdir_cache_idx; > + if (ri->readdir_cache_idx < 0) { > /* preclude from marking dir ordered */ > - fi->dir_ordered_count = 0; > + ri->dir_ordered_count = 0; > } else if (ceph_frag_is_leftmost(frag) && > - fi->next_offset == 2) { > + ri->next_offset == 2) { > /* note dir version at start of readdir so > * we can tell if any dentries get dropped */ > - fi->dir_release_count = req->r_dir_release_cnt; > - fi->dir_ordered_count = req->r_dir_ordered_cnt; > + ri->dir_release_count = req->r_dir_release_cnt; > + ri->dir_ordered_count = req->r_dir_ordered_cnt; > } > } else { > dout("readdir !did_prepopulate"); > /* disable readdir cache */ > - fi->readdir_cache_idx = -1; > + ri->readdir_cache_idx = -1; > /* preclude from marking dir complete */ > - fi->dir_release_count = 0; > + ri->dir_release_count = 0; > } > > /* note next offset and last dentry name */ > @@ -469,14 +475,14 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) > if (err) > return err; > } else if (req->r_reply_info.dir_end) { > - fi->next_offset = 2; > + ri->next_offset = 2; > /* keep last name */ > } > } > > - rinfo = &fi->last_readdir->r_reply_info; > + rinfo = &ri->last_readdir->r_reply_info; > dout("readdir frag %x num %d pos %llx chunk first %llx\n", > - fi->frag, rinfo->dir_nr, ctx->pos, > + ri->frag, rinfo->dir_nr, ctx->pos, > rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL); > > i = 0; > @@ -520,27 +526,27 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) > ctx->pos++; > } > > - ceph_mdsc_put_request(fi->last_readdir); > - fi->last_readdir = NULL; > + ceph_mdsc_put_request(ri->last_readdir); > + ri->last_readdir = NULL; > > - if (fi->next_offset > 2) { > - frag = fi->frag; > + if (ri->next_offset > 2) { > + frag = ri->frag; > goto more; > } > > /* more frags? */ > - if (!ceph_frag_is_rightmost(fi->frag)) { > - frag = ceph_frag_next(fi->frag); > + if (!ceph_frag_is_rightmost(ri->frag)) { > + frag = ceph_frag_next(ri->frag); > if (is_hash_order(ctx->pos)) { > loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag), > - fi->next_offset, true); > + ri->next_offset, true); > if (new_pos > ctx->pos) > ctx->pos = new_pos; > /* keep last_name */ > } else { > - ctx->pos = ceph_make_fpos(frag, fi->next_offset, false); > - kfree(fi->last_name); > - fi->last_name = NULL; > + ctx->pos = ceph_make_fpos(frag, ri->next_offset, false); > + kfree(ri->last_name); > + ri->last_name = NULL; > } > dout("readdir next frag is %x\n", frag); > goto more; > @@ -552,20 +558,20 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) > * were released during the whole readdir, and we should have > * the complete dir contents in our cache. > */ > - if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) { > + if (atomic64_read(&ci->i_release_count) == ri->dir_release_count) { > spin_lock(&ci->i_ceph_lock); > - if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) { > + if (ri->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) { > dout(" marking %p complete and ordered\n", inode); > /* use i_size to track number of entries in > * readdir cache */ > - BUG_ON(fi->readdir_cache_idx < 0); > - i_size_write(inode, fi->readdir_cache_idx * > + BUG_ON(ri->readdir_cache_idx < 0); > + i_size_write(inode, ri->readdir_cache_idx * > sizeof(struct dentry*)); > } else { > dout(" marking %p complete\n", inode); > } > - __ceph_dir_set_complete(ci, fi->dir_release_count, > - fi->dir_ordered_count); > + __ceph_dir_set_complete(ci, ri->dir_release_count, > + ri->dir_ordered_count); > spin_unlock(&ci->i_ceph_lock); > } > > @@ -575,15 +581,17 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) > > static void reset_readdir(struct ceph_file_info *fi) > { > - if (fi->last_readdir) { > - ceph_mdsc_put_request(fi->last_readdir); > - fi->last_readdir = NULL; > + struct ceph_readdir_info *ri = fi->readdir_info; > + > + if (ri->last_readdir) { > + ceph_mdsc_put_request(ri->last_readdir); > + ri->last_readdir = NULL; > } > - kfree(fi->last_name); > - fi->last_name = NULL; > - fi->dir_release_count = 0; > - fi->readdir_cache_idx = -1; > - fi->next_offset = 2; /* compensate for . and .. */ > + kfree(ri->last_name); > + ri->last_name = NULL; > + ri->dir_release_count = 0; > + ri->readdir_cache_idx = -1; > + ri->next_offset = 2; /* compensate for . and .. */ > fi->flags &= ~CEPH_F_ATEND; > } > > @@ -594,16 +602,18 @@ static void reset_readdir(struct ceph_file_info *fi) > static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) > { > struct ceph_mds_reply_info_parsed *rinfo; > + struct ceph_readdir_info *ri = fi->readdir_info; > loff_t chunk_offset; > + > if (new_pos == 0) > return true; > if (is_hash_order(new_pos)) { > /* no need to reset last_name for a forward seek when > * dentries are sotred in hash order */ > - } else if (fi->frag != fpos_frag(new_pos)) { > + } else if (ri->frag != fpos_frag(new_pos)) { > return true; > } > - rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL; > + rinfo = ri->last_readdir ? &ri->last_readdir->r_reply_info : NULL; > if (!rinfo || !rinfo->dir_nr) > return true; > chunk_offset = rinfo->dir_entries[0].offset; > @@ -614,6 +624,7 @@ static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) > static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) > { > struct ceph_file_info *fi = file->private_data; > + struct ceph_readdir_info *ri = fi->readdir_info; > struct inode *inode = file->f_mapping->host; > loff_t retval; > > @@ -637,8 +648,8 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) > } else if (is_hash_order(offset) && offset > file->f_pos) { > /* for hash offset, we don't know if a forward seek > * is within same frag */ > - fi->dir_release_count = 0; > - fi->readdir_cache_idx = -1; > + ri->dir_release_count = 0; > + ri->readdir_cache_idx = -1; > } > > if (offset != file->f_pos) { > @@ -1371,6 +1382,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, > loff_t *ppos) > { > struct ceph_file_info *cf = file->private_data; > + struct ceph_readdir_info *ri = cf->readdir_info; > struct inode *inode = file_inode(file); > struct ceph_inode_info *ci = ceph_inode(inode); > int left; > @@ -1379,12 +1391,12 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, > if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) > return -EISDIR; > > - if (!cf->dir_info) { > - cf->dir_info = kmalloc(bufsize, GFP_KERNEL); > - if (!cf->dir_info) > + if (!ri->dir_info) { > + ri->dir_info = kmalloc(bufsize, GFP_KERNEL); > + if (!ri->dir_info) > return -ENOMEM; > - cf->dir_info_len = > - snprintf(cf->dir_info, bufsize, > + ri->dir_info_len = > + snprintf(ri->dir_info, bufsize, > "entries: %20lld\n" > " files: %20lld\n" > " subdirs: %20lld\n" > @@ -1404,10 +1416,10 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, > (long)ci->i_rctime.tv_nsec); > } > > - if (*ppos >= cf->dir_info_len) > + if (*ppos >= ri->dir_info_len) > return 0; > - size = min_t(unsigned, size, cf->dir_info_len-*ppos); > - left = copy_to_user(buf, cf->dir_info + *ppos, size); > + size = min_t(unsigned int, size, ri->dir_info_len-*ppos); > + left = copy_to_user(buf, ri->dir_info + *ppos, size); > if (left == size) > return -EFAULT; > *ppos += (size - left); > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > index 6639926..611aaef 100644 > --- a/fs/ceph/file.c > +++ b/fs/ceph/file.c > @@ -159,36 +159,60 @@ static size_t dio_get_pagev_size(const struct iov_iter *it) > return req; > } > > + > +static int ceph_init_file_info(struct inode *inode, > + struct file *file, int fmode) > +{ > + struct ceph_file_info *fi; > + > + dout("init_file inode %p file %p mode 0%o (%s)\n", inode, file, > + inode->i_mode, S_ISREG(inode->i_mode) ? "regular" : "dir"); > + > + BUG_ON(inode->i_fop->release != ceph_release); > + > + fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); > + if (!fi) { > + ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ > + return -ENOMEM; > + } > + > + if (S_ISDIR(inode->i_mode)) { > + fi->readdir_info = kmem_cache_zalloc(ceph_readdir_cachep, > + GFP_KERNEL); > + if (!fi->readdir_info) { > + kmem_cache_free(ceph_file_cachep, fi); > + return -ENOMEM; > + } > + > + fi->readdir_info->next_offset = 2; > + fi->readdir_info->readdir_cache_idx = -1; > + } else { > + ceph_fscache_register_inode_cookie(inode); > + ceph_fscache_file_set_cookie(inode, file); > + } > + > + fi->fmode = fmode; > + spin_lock_init(&fi->rw_contexts_lock); > + INIT_LIST_HEAD(&fi->rw_contexts); > + file->private_data = fi; > + > + return 0; > +} > + > /* > * initialize private struct file data. > * if we fail, clean up by dropping fmode reference on the ceph_inode > */ > static int ceph_init_file(struct inode *inode, struct file *file, int fmode) > { > - struct ceph_file_info *cf; > int ret = 0; > > switch (inode->i_mode & S_IFMT) { > case S_IFREG: > - ceph_fscache_register_inode_cookie(inode); > - ceph_fscache_file_set_cookie(inode, file); > case S_IFDIR: > - dout("init_file %p %p 0%o (regular)\n", inode, file, > - inode->i_mode); > - cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); > - if (!cf) { > - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ > - return -ENOMEM; > - } > - cf->fmode = fmode; > - > - spin_lock_init(&cf->rw_contexts_lock); > - INIT_LIST_HEAD(&cf->rw_contexts); > - > - cf->next_offset = 2; > - cf->readdir_cache_idx = -1; > - file->private_data = cf; > - BUG_ON(inode->i_fop->release != ceph_release); > + ret = ceph_init_file_info(inode, file, fmode); > + if (ret) > + return ret; > break; > > case S_IFLNK: > @@ -460,16 +484,24 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > int ceph_release(struct inode *inode, struct file *file) > { > struct ceph_inode_info *ci = ceph_inode(inode); > - struct ceph_file_info *cf = file->private_data; > + struct ceph_file_info *fi = file->private_data; > + struct ceph_readdir_info *ri = fi->readdir_info; > + > + dout("release inode %p file %p mode 0%o (%s)\n", inode, file, > + inode->i_mode, S_ISREG(inode->i_mode) ? "regular" : "dir"); > > - dout("release inode %p file %p\n", inode, file); > - ceph_put_fmode(ci, cf->fmode); > - if (cf->last_readdir) > - ceph_mdsc_put_request(cf->last_readdir); > - kfree(cf->last_name); > - kfree(cf->dir_info); > - WARN_ON(!list_empty(&cf->rw_contexts)); > - kmem_cache_free(ceph_file_cachep, cf); > + WARN_ON(!list_empty(&fi->rw_contexts)); > + > + ceph_put_fmode(ci, fi->fmode); > + if (ri) { > + if (ri->last_readdir) > + ceph_mdsc_put_request(ri->last_readdir); > + kfree(ri->last_name); > + kfree(ri->dir_info); > + kmem_cache_free(ceph_readdir_cachep, ri); > + ri = NULL; > + } > + kmem_cache_free(ceph_file_cachep, fi); > > /* wake up anyone waiting for caps on this inode */ > wake_up_all(&ci->i_cap_wq); > diff --git a/fs/ceph/super.c b/fs/ceph/super.c > index a62d2a9..c1732ff 100644 > --- a/fs/ceph/super.c > +++ b/fs/ceph/super.c > @@ -677,6 +677,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) > struct kmem_cache *ceph_cap_flush_cachep; > struct kmem_cache *ceph_dentry_cachep; > struct kmem_cache *ceph_file_cachep; > +struct kmem_cache *ceph_readdir_cachep; > > static void ceph_inode_init_once(void *foo) > { > @@ -711,14 +712,22 @@ static int __init init_caches(void) > goto bad_dentry; > > ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD); > - > if (!ceph_file_cachep) > goto bad_file; > > + ceph_readdir_cachep = KMEM_CACHE(ceph_readdir_info, SLAB_MEM_SPREAD); > + if (!ceph_readdir_cachep) > + goto bad_readdir; > + > if ((error = ceph_fscache_register())) > - goto bad_file; > + goto bad_fscache; > > return 0; > + > +bad_fscache: > + kmem_cache_destroy(ceph_readdir_cachep); > +bad_readdir: > + kmem_cache_destroy(ceph_file_cachep); > bad_file: > kmem_cache_destroy(ceph_dentry_cachep); > bad_dentry: > @@ -743,6 +752,7 @@ static void destroy_caches(void) > kmem_cache_destroy(ceph_cap_flush_cachep); > kmem_cache_destroy(ceph_dentry_cachep); > kmem_cache_destroy(ceph_file_cachep); > + kmem_cache_destroy(ceph_readdir_cachep); > > ceph_fscache_unregister(); > } > diff --git a/fs/ceph/super.h b/fs/ceph/super.h > index 21b2e5b..3a943a0 100644 > --- a/fs/ceph/super.h > +++ b/fs/ceph/super.h > @@ -665,13 +665,7 @@ extern void ceph_reservation_status(struct ceph_fs_client *client, > #define CEPH_F_SYNC 1 > #define CEPH_F_ATEND 2 > > -struct ceph_file_info { > - short fmode; /* initialized on open */ > - short flags; /* CEPH_F_* */ > - > - spinlock_t rw_contexts_lock; > - struct list_head rw_contexts; > - > +struct ceph_readdir_info { > /* readdir: position within the dir */ > u32 frag; > struct ceph_mds_request *last_readdir; > @@ -688,6 +682,17 @@ struct ceph_file_info { > int dir_info_len; > }; > > +struct ceph_file_info { > + short fmode; /* initialized on open */ > + short flags; /* CEPH_F_* */ > + > + spinlock_t rw_contexts_lock; > + struct list_head rw_contexts; > + > + /* readdir related things */ > + struct ceph_readdir_info *readdir_info; > +}; For dir file. I prefer something like: struct ceph_dir_file_info { struct ceph_file_info file; ceph_readdir_info readdir. }; This avoids double memory allocation for dir file. Regards Yan, Zheng > + > struct ceph_rw_context { > struct list_head list; > struct task_struct *thread; > diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h > index c2ec44c..4ca23fa 100644 > --- a/include/linux/ceph/libceph.h > +++ b/include/linux/ceph/libceph.h > @@ -262,6 +262,7 @@ static inline int calc_pages_for(u64 off, u64 len) > extern struct kmem_cache *ceph_cap_flush_cachep; > extern struct kmem_cache *ceph_dentry_cachep; > extern struct kmem_cache *ceph_file_cachep; > +extern struct kmem_cache *ceph_readdir_cachep; > > /* ceph_common.c */ > extern bool libceph_compatible(void *data); > -- > 1.8.3.1 > > -- > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html