On Mon, 2024-09-16 at 14:34 +1000, NeilBrown wrote: > Various operations such as rename and unlink must break any delegations > before they proceed. > do_dentry_open() and vfs_truncate(), which use break_lease(), increment > i_writecount and/or i_readcount which blocks delegations until the > counter is decremented, but the various callers of try_break_deleg() do > not impose any such barrier. They hold the inode lock while performing > the operation which blocks delegations, but must drop it while waiting > for a delegation to be broken, which leaves an opportunity for a new > delegation to be added. > > nfsd - the only current user of delegations - records any files on which > it is called to break a delegation in a manner which blocks further > delegations for 30-60 seconds. This is normally sufficient. However > there is talk of reducing the timeout and it would be best if operations > that needed delegations to be blocked used something more definitive > than a timer. > > This patch adds that definitive blocking by adding a counter to struct > file_lock_context of the number of concurrent operations which require > delegations to be blocked. check_conflicting_open() checks that counter > when a delegation is requested and denies the delegation if the counter > is elevated. > > try_break_deleg() now increments that counter when it records the inode > as a 'delegated_inode'. > > break_deleg_wait() now leaves the inode pointer in *delegated_inode when > it signals that the operation should be retried, and then clears it - > decrementing the new counter - when the operation has completed, whether > successfully or not. To achieve this we now pass the current error > status in to break_deleg_wait(). > > vfs_rename() now uses two delegated_inode pointers, one for the > source and one for the destination in the case of replacement. This is > needed as it may be necessary to block further delegations to both > inodes while the rename completes. > > The net result is that we no longer depend on the delay that nfsd > imposes on new delegation in order for various functions that break > delegations to be sure that new delegations won't be added while they wait > with the inode unlocked. This gives more freedom to nfsd to make more > subtle choices about when and for how long to block delegations when > there is no active contention. > > try_break_deleg() is possibly now large enough that it shouldn't be > inline. > I like this approach. Moving the blocking of new delegations into the VFS layer makes sense, and we can do better, more targeted blocking of new leases this way. I wonder -- do we still need the bloom filter if we do this? We could keep track of the time of the last recall in i_flctx as well, and then avoid handing them out until some time has elapsed. try_break_deleg() (and break_deleg_wait(), for that matter) were already a bit large for inlines, so moving them to regular functions sounds like a good idea. Maybe we can even have inline fastpath wrappers around them that check for i_flctx == NULL and avoid the jmp in that case? > Signed-off-by: NeilBrown <neilb@xxxxxxx> > --- > fs/locks.c | 12 ++++++++++-- > fs/namei.c | 32 ++++++++++++++++++++------------ > fs/open.c | 8 ++++---- > fs/posix_acl.c | 8 ++++---- > fs/utimes.c | 4 ++-- > fs/xattr.c | 8 ++++---- > include/linux/filelock.h | 31 ++++++++++++++++++++++++------- > include/linux/fs.h | 3 ++- > 8 files changed, 70 insertions(+), 36 deletions(-) > > diff --git a/fs/locks.c b/fs/locks.c > index e45cad40f8b6..171628094daa 100644 > --- a/fs/locks.c > +++ b/fs/locks.c > @@ -191,6 +191,7 @@ locks_get_lock_context(struct inode *inode, int type) > INIT_LIST_HEAD(&ctx->flc_flock); > INIT_LIST_HEAD(&ctx->flc_posix); > INIT_LIST_HEAD(&ctx->flc_lease); > + atomic_set(&ctx->flc_deleg_blockers, 0); > > /* > * Assign the pointer if it's not already assigned. If it is, then > @@ -255,6 +256,7 @@ locks_free_lock_context(struct inode *inode) > struct file_lock_context *ctx = locks_inode_context(inode); > > if (unlikely(ctx)) { > + WARN_ON(atomic_read(&ctx->flc_deleg_blockers) != 0); > locks_check_ctx_lists(inode); > kmem_cache_free(flctx_cache, ctx); > } > @@ -1743,9 +1745,15 @@ check_conflicting_open(struct file *filp, const int arg, int flags) > > if (flags & FL_LAYOUT) > return 0; > - if (flags & FL_DELEG) > - /* We leave these checks to the caller */ > + if (flags & FL_DELEG) { > + struct file_lock_context *ctx = locks_inode_context(inode); > + > + if (ctx && atomic_read(&ctx->flc_deleg_blockers) > 0) > + return -EAGAIN; > + > + /* We leave the remaining checks to the caller */ > return 0; > + } > > if (arg == F_RDLCK) > return inode_is_open_for_write(inode) ? -EAGAIN : 0; > diff --git a/fs/namei.c b/fs/namei.c > index 5512cb10fa89..3054da90276b 100644 > --- a/fs/namei.c > +++ b/fs/namei.c > @@ -4493,8 +4493,8 @@ int do_unlinkat(int dfd, struct filename *name) > iput(inode); /* truncate the inode here */ > inode = NULL; > if (delegated_inode) { > - error = break_deleg_wait(&delegated_inode); > - if (!error) > + error = break_deleg_wait(&delegated_inode, error); > + if (error == -EWOULDBLOCK) > goto retry_deleg; > } > mnt_drop_write(path.mnt); > @@ -4764,8 +4764,8 @@ int do_linkat(int olddfd, struct filename *old, int newdfd, > out_dput: > done_path_create(&new_path, new_dentry); > if (delegated_inode) { > - error = break_deleg_wait(&delegated_inode); > - if (!error) { > + error = break_deleg_wait(&delegated_inode, error); > + if (error == -EWOULDBLOCK) { > path_put(&old_path); > goto retry; > } > @@ -4848,7 +4848,8 @@ int vfs_rename(struct renamedata *rd) > struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir; > struct dentry *old_dentry = rd->old_dentry; > struct dentry *new_dentry = rd->new_dentry; > - struct inode **delegated_inode = rd->delegated_inode; > + struct inode **delegated_inode_old = rd->delegated_inode_old; > + struct inode **delegated_inode_new = rd->delegated_inode_new; > unsigned int flags = rd->flags; > bool is_dir = d_is_dir(old_dentry); > struct inode *source = old_dentry->d_inode; > @@ -4954,12 +4955,12 @@ int vfs_rename(struct renamedata *rd) > goto out; > } > if (!is_dir) { > - error = try_break_deleg(source, delegated_inode); > + error = try_break_deleg(source, delegated_inode_old); > if (error) > goto out; > } > if (target && !new_is_dir) { > - error = try_break_deleg(target, delegated_inode); > + error = try_break_deleg(target, delegated_inode_new); > if (error) > goto out; > } > @@ -5011,7 +5012,8 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, > struct path old_path, new_path; > struct qstr old_last, new_last; > int old_type, new_type; > - struct inode *delegated_inode = NULL; > + struct inode *delegated_inode_old = NULL; > + struct inode *delegated_inode_new = NULL; > unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET; > bool should_retry = false; > int error = -EINVAL; > @@ -5118,7 +5120,8 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, > rd.new_dir = new_path.dentry->d_inode; > rd.new_dentry = new_dentry; > rd.new_mnt_idmap = mnt_idmap(new_path.mnt); > - rd.delegated_inode = &delegated_inode; > + rd.delegated_inode_old = &delegated_inode_old; > + rd.delegated_inode_new = &delegated_inode_new; > rd.flags = flags; > error = vfs_rename(&rd); > exit5: > @@ -5128,9 +5131,14 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, > exit3: > unlock_rename(new_path.dentry, old_path.dentry); > exit_lock_rename: > - if (delegated_inode) { > - error = break_deleg_wait(&delegated_inode); > - if (!error) > + if (delegated_inode_old) { > + error = break_deleg_wait(&delegated_inode_old, error); > + if (error == -EWOULDBLOCK) > + goto retry_deleg; > + } > + if (delegated_inode_new) { > + error = break_deleg_wait(&delegated_inode_new, error); > + if (error == -EWOULDBLOCK) > goto retry_deleg; > } > mnt_drop_write(old_path.mnt); > diff --git a/fs/open.c b/fs/open.c > index 22adbef7ecc2..6b6d20a68dd8 100644 > --- a/fs/open.c > +++ b/fs/open.c > @@ -656,8 +656,8 @@ int chmod_common(const struct path *path, umode_t mode) > out_unlock: > inode_unlock(inode); > if (delegated_inode) { > - error = break_deleg_wait(&delegated_inode); > - if (!error) > + error = break_deleg_wait(&delegated_inode, error); > + if (error == -EWOULDBLOCK) > goto retry_deleg; > } > mnt_drop_write(path->mnt); > @@ -795,8 +795,8 @@ int chown_common(const struct path *path, uid_t user, gid_t group) > &delegated_inode); > inode_unlock(inode); > if (delegated_inode) { > - error = break_deleg_wait(&delegated_inode); > - if (!error) > + error = break_deleg_wait(&delegated_inode, error); > + if (error == -EWOULDBLOCK) > goto retry_deleg; > } > return error; > diff --git a/fs/posix_acl.c b/fs/posix_acl.c > index 3f87297dbfdb..5eb3635d1067 100644 > --- a/fs/posix_acl.c > +++ b/fs/posix_acl.c > @@ -1143,8 +1143,8 @@ int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, > inode_unlock(inode); > > if (delegated_inode) { > - error = break_deleg_wait(&delegated_inode); > - if (!error) > + error = break_deleg_wait(&delegated_inode, error); > + if (error == -EWOULDBLOCK) > goto retry_deleg; > } > > @@ -1251,8 +1251,8 @@ int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, > inode_unlock(inode); > > if (delegated_inode) { > - error = break_deleg_wait(&delegated_inode); > - if (!error) > + error = break_deleg_wait(&delegated_inode, error); > + if (error == -EWOULDBLOCK) > goto retry_deleg; > } > > diff --git a/fs/utimes.c b/fs/utimes.c > index 3701b3946f88..21b7605551dc 100644 > --- a/fs/utimes.c > +++ b/fs/utimes.c > @@ -67,8 +67,8 @@ int vfs_utimes(const struct path *path, struct timespec64 *times) > &delegated_inode); > inode_unlock(inode); > if (delegated_inode) { > - error = break_deleg_wait(&delegated_inode); > - if (!error) > + error = break_deleg_wait(&delegated_inode, error); > + if (error == -EWOULDBLOCK) > goto retry_deleg; > } > > diff --git a/fs/xattr.c b/fs/xattr.c > index 7672ce5486c5..63e0b067dab9 100644 > --- a/fs/xattr.c > +++ b/fs/xattr.c > @@ -323,8 +323,8 @@ vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, > inode_unlock(inode); > > if (delegated_inode) { > - error = break_deleg_wait(&delegated_inode); > - if (!error) > + error = break_deleg_wait(&delegated_inode, error); > + if (error == -EWOULDBLOCK) > goto retry_deleg; > } > if (value != orig_value) > @@ -577,8 +577,8 @@ vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, > inode_unlock(inode); > > if (delegated_inode) { > - error = break_deleg_wait(&delegated_inode); > - if (!error) > + error = break_deleg_wait(&delegated_inode, error); > + if (error == -EWOULDBLOCK) > goto retry_deleg; > } > > diff --git a/include/linux/filelock.h b/include/linux/filelock.h > index daee999d05f3..66470ba9658c 100644 > --- a/include/linux/filelock.h > +++ b/include/linux/filelock.h > @@ -144,6 +144,7 @@ struct file_lock_context { > struct list_head flc_flock; > struct list_head flc_posix; > struct list_head flc_lease; > + atomic_t flc_deleg_blockers; > }; > > #ifdef CONFIG_FILE_LOCKING > @@ -450,21 +451,37 @@ static inline int try_break_deleg(struct inode *inode, struct inode **delegated_ > { > int ret; > > + if (delegated_inode && *delegated_inode) { > + if (*delegated_inode == inode) > + /* Don't need to count this */ > + return break_deleg(inode, O_WRONLY|O_NONBLOCK); > + > + /* inode changed, forget the old one */ > + atomic_dec(&(*delegated_inode)->i_flctx->flc_deleg_blockers); > + iput(*delegated_inode); > + *delegated_inode = NULL; > + } > ret = break_deleg(inode, O_WRONLY|O_NONBLOCK); > if (ret == -EWOULDBLOCK && delegated_inode) { > *delegated_inode = inode; > + atomic_inc(&(*delegated_inode)->i_flctx->flc_deleg_blockers); > ihold(inode); > } > return ret; > } > > -static inline int break_deleg_wait(struct inode **delegated_inode) > +static inline int break_deleg_wait(struct inode **delegated_inode, int ret) > { > - int ret; > - > - ret = break_deleg(*delegated_inode, O_WRONLY); > - iput(*delegated_inode); > - *delegated_inode = NULL; > + if (ret == -EWOULDBLOCK) { > + ret = break_deleg(*delegated_inode, O_WRONLY); > + if (ret == 0) > + ret = -EWOULDBLOCK; > + } > + if (ret != -EWOULDBLOCK) { > + atomic_dec(&(*delegated_inode)->i_flctx->flc_deleg_blockers); > + iput(*delegated_inode); > + *delegated_inode = NULL; > + } > return ret; > } > > @@ -494,7 +511,7 @@ static inline int try_break_deleg(struct inode *inode, struct inode **delegated_ > return 0; > } > > -static inline int break_deleg_wait(struct inode **delegated_inode) > +static inline int break_deleg_wait(struct inode **delegated_inode, int ret) > { > BUG(); > return 0; > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 6ca11e241a24..50957d9e1c2b 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -1902,7 +1902,8 @@ struct renamedata { > struct mnt_idmap *new_mnt_idmap; > struct inode *new_dir; > struct dentry *new_dentry; > - struct inode **delegated_inode; > + struct inode **delegated_inode_old; > + struct inode **delegated_inode_new; > unsigned int flags; > } __randomize_layout; > > > base-commit: 98f7e32f20d28ec452afb208f9cffc08448a2652 -- Jeff Layton <jlayton@xxxxxxxxxx>