On Thu, 2020-02-20 at 22:53 +0800, Yan, Zheng wrote: > On Thu, Feb 20, 2020 at 10:18 PM Jeff Layton <jlayton@xxxxxxxxxx> wrote: > > On Thu, 2020-02-20 at 20:26 +0800, Yan, Zheng wrote: > > > When getting caps for read/write, update corresponding file's last > > > read/write. If a file hasn't been read/write for 'caps_wanted_delay_max' > > > seconds, ignore the file when calculating wanted caps. > > > > > > > Please explain in the changelog how the new info is to be stored, given > > that it is quite complex. > > > > > Signed-off-by: "Yan, Zheng" <zyan@xxxxxxxxxx> > > > --- > > > fs/ceph/caps.c | 149 ++++++++++++++++++++++++----------- > > > fs/ceph/file.c | 23 +++--- > > > fs/ceph/inode.c | 15 +++- > > > fs/ceph/ioctl.c | 4 +- > > > fs/ceph/super.h | 16 +++- > > > include/linux/ceph/ceph_fs.h | 1 + > > > 6 files changed, 145 insertions(+), 63 deletions(-) > > > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c > > > index 293920d013ff..ccdc47bd7cf0 100644 > > > --- a/fs/ceph/caps.c > > > +++ b/fs/ceph/caps.c > > > @@ -971,18 +971,44 @@ int __ceph_caps_used(struct ceph_inode_info *ci) > > > return used; > > > } > > > > > > +#define FMODE_WAIT_BIAS 1000 > > > + > > > /* > > > * wanted, by virtue of open file modes > > > */ > > > int __ceph_caps_file_wanted(struct ceph_inode_info *ci) > > > { > > > + struct ceph_mount_options *opt = > > > + ceph_inode_to_client(&ci->vfs_inode)->mount_options; > > > + unsigned long used_cutoff = > > > + round_jiffies(jiffies - opt->caps_wanted_delay_max * HZ); > > > + unsigned long idle_cutoff = > > > + round_jiffies(jiffies - opt->caps_wanted_delay_min * HZ); > > > int i, bits = 0; > > > + > > > for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { > > > - if (ci->i_nr_by_mode[i]) > > > + if (ci->i_file_by_mode[i].nr >= FMODE_WAIT_BIAS) { > > > + /* there are cap waiters or lots of open files */ > > > bits |= 1 << i; > > > + } else if (ci->i_file_by_mode[i].nr > 0) { > > > + if (i == CEPH_FILE_MODE_PIN || > > > + time_after(ci->i_file_by_mode[i].last_used, > > > + used_cutoff)) > > > + bits |= 1 << i; > > > + } else if ((ci->i_file_by_mode[i].last_used & 1)) { > > > + if (time_after(ci->i_file_by_mode[i].last_used, > > > + idle_cutoff)) { > > > + bits |= 1 << i; > > > + } else { > > > + ci->i_file_by_mode[i].last_used &= ~1UL; > > > + } > > > + } > > > } > > > if (bits == 0) > > > return 0; > > > + if (bits == 1 && !S_ISDIR(ci->vfs_inode.i_mode)) > > > + return 0; > > > + > > > return ceph_caps_for_mode(bits >> 1); > > > } > > > > > > @@ -1021,14 +1047,6 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check) > > > return mds_wanted; > > > } > > > > > > -/* > > > - * called under i_ceph_lock > > > - */ > > > -static int __ceph_is_single_caps(struct ceph_inode_info *ci) > > > -{ > > > - return rb_first(&ci->i_caps) == rb_last(&ci->i_caps); > > > -} > > > - > > > int ceph_is_any_caps(struct inode *inode) > > > { > > > struct ceph_inode_info *ci = ceph_inode(inode); > > > @@ -1856,10 +1874,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, > > > if (ci->i_ceph_flags & CEPH_I_FLUSH) > > > flags |= CHECK_CAPS_FLUSH; > > > > > > - if (!(flags & CHECK_CAPS_AUTHONLY) || > > > - (ci->i_auth_cap && __ceph_is_single_caps(ci))) > > > - __cap_delay_cancel(mdsc, ci); > > > - > > > goto retry_locked; > > > retry: > > > spin_lock(&ci->i_ceph_lock); > > > @@ -2081,9 +2095,16 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, > > > goto retry; /* retake i_ceph_lock and restart our cap scan. */ > > > } > > > > > > - /* Reschedule delayed caps release if we delayed anything */ > > > - if (delayed) > > > - __cap_delay_requeue(mdsc, ci, false); > > > + if (list_empty(&ci->i_cap_delay_list)) { > > > + if (delayed) { > > > + /* Reschedule delayed caps release if we delayed anything */ > > > + __cap_delay_requeue(mdsc, ci, false); > > > + } else if ((file_wanted & ~CEPH_CAP_PIN) && > > > + !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { > > > + /* periodically re-calculate caps wanted by open files */ > > > + __cap_delay_requeue(mdsc, ci, true); > > > + } > > > + } > > > > > > spin_unlock(&ci->i_ceph_lock); > > > > > > @@ -2549,8 +2570,9 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got, > > > * FIXME: how does a 0 return differ from -EAGAIN? > > > */ > > > enum { > > > - NON_BLOCKING = 1, > > > - CHECK_FILELOCK = 2, > > > + /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */ > > > + NON_BLOCKING = (1 << 8), > > > + CHECK_FILELOCK = (1 << 9), > > > }; > > > > > > static int try_get_cap_refs(struct inode *inode, int need, int want, > > > @@ -2560,7 +2582,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want, > > > struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; > > > int ret = 0; > > > int have, implemented; > > > - int file_wanted; > > > bool snap_rwsem_locked = false; > > > > > > dout("get_cap_refs %p need %s want %s\n", inode, > > > @@ -2576,15 +2597,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want, > > > goto out_unlock; > > > } > > > > > > - /* make sure file is actually open */ > > > - file_wanted = __ceph_caps_file_wanted(ci); > > > - if ((file_wanted & need) != need) { > > > - dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", > > > - ceph_cap_string(need), ceph_cap_string(file_wanted)); > > > - ret = -EBADF; > > > - goto out_unlock; > > > - } > > > - > > > /* finish pending truncate */ > > > while (ci->i_truncate_pending) { > > > spin_unlock(&ci->i_ceph_lock); > > > @@ -2692,6 +2704,9 @@ static int try_get_cap_refs(struct inode *inode, int need, int want, > > > ceph_cap_string(have), ceph_cap_string(need)); > > > } > > > out_unlock: > > > + > > > + __ceph_touch_fmode(ci, mdsc, flags); > > > + > > > spin_unlock(&ci->i_ceph_lock); > > > if (snap_rwsem_locked) > > > up_read(&mdsc->snap_rwsem); > > > @@ -2729,10 +2744,22 @@ static void check_max_size(struct inode *inode, loff_t endoff) > > > ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); > > > } > > > > > > +static inline int get_used_file_mode(int need, int want) > > > +{ > > > + int fmode = 0; > > > + if (need & CEPH_CAP_FILE_RD) > > > + fmode |= CEPH_FILE_MODE_RD; > > > + if (need & CEPH_CAP_FILE_WR) > > > + fmode |= CEPH_FILE_MODE_WR; > > > + if (want & CEPH_CAP_FILE_LAZYIO) > > > + fmode |= CEPH_FILE_MODE_LAZY; > > > + return fmode; > > > +} > > > + > > > int ceph_try_get_caps(struct inode *inode, int need, int want, > > > bool nonblock, int *got) > > > { > > > - int ret; > > > + int ret, flags; > > > > > > BUG_ON(need & ~CEPH_CAP_FILE_RD); > > > BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED)); > > > @@ -2740,8 +2767,11 @@ int ceph_try_get_caps(struct inode *inode, int need, int want, > > > if (ret < 0) > > > return ret; > > > > > > - ret = try_get_cap_refs(inode, need, want, 0, > > > - (nonblock ? NON_BLOCKING : 0), got); > > > + flags = get_used_file_mode(need, want); > > > + if (nonblock) > > > + flags |= NON_BLOCKING; > > > + > > > + ret = try_get_cap_refs(inode, need, want, 0, flags, got); > > > return ret == -EAGAIN ? 0 : ret; > > > } > > > > > > @@ -2767,11 +2797,15 @@ int ceph_get_caps(struct file *filp, int need, int want, > > > fi->filp_gen != READ_ONCE(fsc->filp_gen)) > > > return -EBADF; > > > > > > + flags = get_used_file_mode(need, want); > > > + > > > while (true) { > > > if (endoff > 0) > > > check_max_size(inode, endoff); > > > > > > - flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0; > > > + flags &= CEPH_FILE_MODE_MASK; > > > + if (atomic_read(&fi->num_locks)) > > > + flags |= CHECK_FILELOCK; > > > _got = 0; > > > ret = try_get_cap_refs(inode, need, want, endoff, > > > flags, &_got); > > > @@ -2791,6 +2825,8 @@ int ceph_get_caps(struct file *filp, int need, int want, > > > list_add(&cw.list, &mdsc->cap_wait_list); > > > spin_unlock(&mdsc->caps_list_lock); > > > > > > + /* make sure used fmode not timeout */ > > > + ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS); > > > add_wait_queue(&ci->i_cap_wq, &wait); > > > > > > flags |= NON_BLOCKING; > > > @@ -2804,6 +2840,7 @@ int ceph_get_caps(struct file *filp, int need, int want, > > > } > > > > > > remove_wait_queue(&ci->i_cap_wq, &wait); > > > + ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS); > > > > > > spin_lock(&mdsc->caps_list_lock); > > > list_del(&cw.list); > > > @@ -2823,7 +2860,7 @@ int ceph_get_caps(struct file *filp, int need, int want, > > > if (ret < 0) { > > > if (ret == -ESTALE) { > > > /* session was killed, try renew caps */ > > > - ret = ceph_renew_caps(inode); > > > + ret = ceph_renew_caps(inode, flags); > > > if (ret == 0) > > > continue; > > > } > > > @@ -4121,13 +4158,41 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) > > > dout("flush_dirty_caps done\n"); > > > } > > > > > > +void __ceph_touch_fmode(struct ceph_inode_info *ci, > > > + struct ceph_mds_client *mdsc, int fmode) > > > +{ > > > + int i; > > > + int bits = (fmode << 1); > > > + unsigned long now = jiffies | 1; > > > + for (i = 1; i < CEPH_FILE_MODE_BITS; i++) { > > > + if (bits & (1 << i)) > > > + ci->i_file_by_mode[i].last_used = now; > > > + } > > > + > > > + /* queue periodic check */ > > > + if (bits && list_empty(&ci->i_cap_delay_list)) > > > + __cap_delay_requeue(mdsc, ci, true); > > > +} > > > + > > > +void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) > > > +{ > > > + int i; > > > + int bits = (fmode << 1) | 1; > > > + spin_lock(&ci->i_ceph_lock); > > > + for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { > > > + if (bits & (1 << i)) > > > + ci->i_file_by_mode[i].nr += count; > > > + } > > > + spin_unlock(&ci->i_ceph_lock); > > > +} > > > + > > > void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode) > > > { > > > int i; > > > int bits = (fmode << 1) | 1; > > > for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { > > > if (bits & (1 << i)) > > > - ci->i_nr_by_mode[i]++; > > > + ci->i_file_by_mode[i].nr++; > > > } > > > } > > > > > > @@ -4136,26 +4201,18 @@ void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode) > > > * we may need to release capabilities to the MDS (or schedule > > > * their delayed release). > > > */ > > > -void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) > > > +void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count) > > > { > > > - int i, last = 0; > > > + int i; > > > int bits = (fmode << 1) | 1; > > > spin_lock(&ci->i_ceph_lock); > > > for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { > > > if (bits & (1 << i)) { > > > - BUG_ON(ci->i_nr_by_mode[i] == 0); > > > - if (--ci->i_nr_by_mode[i] == 0) > > > - last++; > > > + BUG_ON(ci->i_file_by_mode[i].nr < count); > > > + ci->i_file_by_mode[i].nr -= count; > > > } > > > } > > > - dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n", > > > - &ci->vfs_inode, fmode, > > > - ci->i_nr_by_mode[0], ci->i_nr_by_mode[1], > > > - ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]); > > > spin_unlock(&ci->i_ceph_lock); > > > - > > > - if (last && ci->i_vino.snap == CEPH_NOSNAP) > > > - ceph_check_caps(ci, 0, NULL); > > > } > > > > > > /* > > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > > > index 7e0190b1f821..f28f420bad23 100644 > > > --- a/fs/ceph/file.c > > > +++ b/fs/ceph/file.c > > > @@ -213,7 +213,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, > > > struct ceph_dir_file_info *dfi = > > > kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); > > > if (!dfi) { > > > - ceph_put_fmode(ci, fmode); /* clean up */ > > > + ceph_put_fmode(ci, fmode, 1); /* clean up */ > > > return -ENOMEM; > > > } > > > > > > @@ -224,7 +224,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, > > > } else { > > > fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); > > > if (!fi) { > > > - ceph_put_fmode(ci, fmode); /* clean up */ > > > + ceph_put_fmode(ci, fmode, 1); /* clean up */ > > > return -ENOMEM; > > > } > > > > > > @@ -263,7 +263,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) > > > case S_IFLNK: > > > dout("init_file %p %p 0%o (symlink)\n", inode, file, > > > inode->i_mode); > > > - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ > > > + ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */ > > > break; > > > > > > default: > > > @@ -273,7 +273,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) > > > * we need to drop the open ref now, since we don't > > > * have .release set to ceph_release. > > > */ > > > - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ > > > + ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */ > > > BUG_ON(inode->i_fop->release == ceph_release); > > > > > > /* call the proper open fop */ > > > @@ -285,14 +285,15 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) > > > /* > > > * try renew caps after session gets killed. > > > */ > > > -int ceph_renew_caps(struct inode *inode) > > > +int ceph_renew_caps(struct inode *inode, int fmode) > > > { > > > - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; > > > + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; > > > struct ceph_inode_info *ci = ceph_inode(inode); > > > struct ceph_mds_request *req; > > > int err, flags, wanted; > > > > > > spin_lock(&ci->i_ceph_lock); > > > + __ceph_touch_fmode(ci, mdsc, fmode); > > > wanted = __ceph_caps_file_wanted(ci); > > > if (__ceph_is_any_real_caps(ci) && > > > (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) { > > > @@ -405,6 +406,7 @@ int ceph_open(struct inode *inode, struct file *file) > > > } else if (ceph_snap(inode) != CEPH_NOSNAP && > > > (ci->i_snap_caps & wanted) == wanted) { > > > __ceph_get_fmode(ci, fmode); > > > + __ceph_touch_fmode(ci, mdsc, fmode); > > > spin_unlock(&ci->i_ceph_lock); > > > return ceph_init_file(inode, file, fmode); > > > } > > > @@ -525,7 +527,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > > > } > > > out_req: > > > if (!req->r_err && req->r_target_inode) > > > - ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode); > > > + ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode, 1); > > > ceph_mdsc_put_request(req); > > > out_ctx: > > > ceph_release_acl_sec_ctx(&as_ctx); > > > @@ -542,7 +544,7 @@ int ceph_release(struct inode *inode, struct file *file) > > > dout("release inode %p dir file %p\n", inode, file); > > > WARN_ON(!list_empty(&dfi->file_info.rw_contexts)); > > > > > > - ceph_put_fmode(ci, dfi->file_info.fmode); > > > + ceph_put_fmode(ci, dfi->file_info.fmode, 1); > > > > > > if (dfi->last_readdir) > > > ceph_mdsc_put_request(dfi->last_readdir); > > > @@ -554,7 +556,8 @@ int ceph_release(struct inode *inode, struct file *file) > > > dout("release inode %p regular file %p\n", inode, file); > > > WARN_ON(!list_empty(&fi->rw_contexts)); > > > > > > - ceph_put_fmode(ci, fi->fmode); > > > + ceph_put_fmode(ci, fi->fmode, 1); > > > + > > > kmem_cache_free(ceph_file_cachep, fi); > > > } > > > > > > @@ -1560,7 +1563,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) > > > if (dirty) > > > __mark_inode_dirty(inode, dirty); > > > if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos)) > > > - ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL); > > > + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); > > > } > > > > > > dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", > > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c > > > index 094b8fc37787..b279bd8e168e 100644 > > > --- a/fs/ceph/inode.c > > > +++ b/fs/ceph/inode.c > > > @@ -478,8 +478,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) > > > ci->i_head_snapc = NULL; > > > ci->i_snap_caps = 0; > > > > > > - for (i = 0; i < CEPH_FILE_MODE_BITS; i++) > > > - ci->i_nr_by_mode[i] = 0; > > > + for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { > > > + ci->i_file_by_mode[i].nr = 0; > > > + ci->i_file_by_mode[i].last_used = 0; > > > + } > > > > > > mutex_init(&ci->i_truncate_mutex); > > > ci->i_truncate_seq = 0; > > > @@ -637,7 +639,7 @@ int ceph_fill_file_size(struct inode *inode, int issued, > > > if ((issued & (CEPH_CAP_FILE_CACHE| > > > CEPH_CAP_FILE_BUFFER)) || > > > mapping_mapped(inode->i_mapping) || > > > - __ceph_caps_file_wanted(ci)) { > > > + __ceph_is_file_opened(ci)) { > > > ci->i_truncate_pending++; > > > queue_trunc = 1; > > > } > > > @@ -1010,6 +1012,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page, > > > fill_inline = true; > > > } > > > > > > + if (cap_fmode >= 0) { > > > + if (!info_caps) > > > + pr_warn("mds issued no caps on %llx.%llx\n", > > > + ceph_vinop(inode)); > > > + __ceph_touch_fmode(ci, mdsc, cap_fmode); > > > + } > > > + > > > spin_unlock(&ci->i_ceph_lock); > > > > > > if (fill_inline) > > > diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c > > > index c90f03beb15d..da0ee54ae5bc 100644 > > > --- a/fs/ceph/ioctl.c > > > +++ b/fs/ceph/ioctl.c > > > @@ -243,11 +243,13 @@ static long ceph_ioctl_lazyio(struct file *file) > > > struct ceph_file_info *fi = file->private_data; > > > struct inode *inode = file_inode(file); > > > struct ceph_inode_info *ci = ceph_inode(inode); > > > + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; > > > > > > if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { > > > spin_lock(&ci->i_ceph_lock); > > > fi->fmode |= CEPH_FILE_MODE_LAZY; > > > - ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++; > > > + ci->i_file_by_mode[ffs(CEPH_FILE_MODE_LAZY)].nr++; > > > + __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_LAZY); > > > spin_unlock(&ci->i_ceph_lock); > > > dout("ioctl_layzio: file %p marked lazy\n", file); > > > > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h > > > index d370f89df358..029823643b8b 100644 > > > --- a/fs/ceph/super.h > > > +++ b/fs/ceph/super.h > > > @@ -361,7 +361,10 @@ struct ceph_inode_info { > > > dirty|flushing caps */ > > > unsigned i_snap_caps; /* cap bits for snapped files */ > > > > > > - int i_nr_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */ > > > + struct { > > > + int nr; > > > + unsigned long last_used; > > > + } i_file_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */ > > > > > > > Ok, so we're growing ceph_inode_info by 40 bytes here (on 64-bit arch). > > > > That's quite a bit, actually, but it turns out that there are 32 bytes > > worth of holes in ceph_inode_info now. It'd be good to reorganize the > > struct for better packing before you do this, so that this at least > > won't make memory consumption too much worse. > > > > There other ways we could approach this too that would be more space > > efficient. We don't really need to keep a timestamp for each mode bit. > > All we're really interested in is what modes were used in the last time > > interval. > > > > We could keep an active and inactive set of CEPH_FILE_MODE bits (which > > are just a single byte each), and a timestamp representing the switch > > between the two. > > > > As we use the file, we'd set bits in the active mask if the timestamp is > > less than half the time interval old. If it's more than half the > > interval, copy the active mask to the inactive one and zero out the > > active mask first. > > > > When you go to check what modes have been used you can do the switch > > again first if the timestamp is too old. To see what bits were actually > > used, you just logically or the activ You could couple this with > > something that requeues e and inactive sets together. > > > > That would take a lot less space per inode. > > The problem is there is no reliable tick for interval. I handwaved over that part, but you could ensure that it was revisited in a timely fashion the same way as here. Just call __cap_delay_requeue at each active/inactive switch event. You might not end up with a hard timeout for each individual bit, but I'm not sure that's really required to fix this problem. There may also be benefits in keeping the timeouts "looser" -- more opportunities to batch up cap releases when there are spiky, mixed read/write workloads. > besides, we can > not have patch 4 if use active bit. patch 4 simplifies code a lot. I > really like to keep it. we can just track last use of read/write, > which uses 16 bytes. this patch removes i_hold_caps_min. So only need > 8 more bytes. Ok, I do like the last two patches in the series a lot. Don't you also need to timestamp LAZY too if you do it this way? If you have a mix of lazy and nonlazy clients and the lazy one is holding the cap w/o using it, that seems like it could be the same problem all over again. > > > struct mutex i_truncate_mutex; > > > u32 i_truncate_seq; /* last truncate to smaller size */ > > > @@ -673,6 +676,10 @@ extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, > > > extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); > > > extern int __ceph_caps_used(struct ceph_inode_info *ci); > > > > > > +static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci) > > > +{ > > > + return ci->i_file_by_mode[0].nr; > > > +} > > > extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci); > > > extern int __ceph_caps_wanted(struct ceph_inode_info *ci); > > > > > > @@ -1074,7 +1081,10 @@ extern int ceph_try_get_caps(struct inode *inode, > > > > > > /* for counting open files by mode */ > > > extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode); > > > -extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); > > > +extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count); > > > +extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count); > > > +extern void __ceph_touch_fmode(struct ceph_inode_info *ci, > > > + struct ceph_mds_client *mdsc, int fmode); > > > > > > /* addr.c */ > > > extern const struct address_space_operations ceph_aops; > > > @@ -1086,7 +1096,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); > > > /* file.c */ > > > extern const struct file_operations ceph_file_fops; > > > > > > -extern int ceph_renew_caps(struct inode *inode); > > > +extern int ceph_renew_caps(struct inode *inode, int fmode); > > > extern int ceph_open(struct inode *inode, struct file *file); > > > extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > > > struct file *file, unsigned flags, umode_t mode); > > > diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h > > > index cb21c5cf12c3..8017130a08a1 100644 > > > --- a/include/linux/ceph/ceph_fs.h > > > +++ b/include/linux/ceph/ceph_fs.h > > > @@ -564,6 +564,7 @@ struct ceph_filelock { > > > #define CEPH_FILE_MODE_RDWR 3 /* RD | WR */ > > > #define CEPH_FILE_MODE_LAZY 4 /* lazy io */ > > > #define CEPH_FILE_MODE_BITS 4 > > > +#define CEPH_FILE_MODE_MASK ((1 << CEPH_FILE_MODE_BITS) - 1) > > > > > > int ceph_flags_to_mode(int flags); > > > > > > > -- > > Jeff Layton <jlayton@xxxxxxxxxx> > > -- Jeff Layton <jlayton@xxxxxxxxxx>