Hi, On Tue, Aug 15, 2023 at 1:49 PM Jeff Layton <jlayton@xxxxxxxxxx> wrote: > > On Mon, 2023-08-14 at 17:11 -0400, Alexander Aring wrote: > > This patch fixes a race in async lock request handling between adding > > the relevant struct nlm_block to the nlm_blocked list after the request was > > sent by vfs_lock_file() and the lookup of the nlm_block in the nlm_blocked > > list done by nlmsvc_grant_deferred(). It could be that the async request is > > completed before the nlm_block was added to the list. This would end > > in a -ENOENT and a kernel log message of "lockd: grant for unknown > > block". > > > > To solve this issue we add the nlm_block before the vfs_lock_file() call > > to be sure it has been added when a possible nlmsvc_grant_deferred() is > > called. If the vfs_lock_file() results in a case where it wouldn't be > > added to the nlm_blocked list, the nlm_block struct will be removed from > > this list again. > > > > The introduction of the new B_PENDING_CALLBACK nlm_block flag will handle > > async lock requests on a pending lock request as a retry on the caller > > level to hit the -EAGAIN case. 
> > > > Signed-off-by: Alexander Aring <aahringo@xxxxxxxxxx> > > --- > > fs/lockd/svclock.c | 100 ++++++++++++++++++++++++++---------- > > include/linux/lockd/lockd.h | 2 + > > 2 files changed, 74 insertions(+), 28 deletions(-) > > > > diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c > > index c43ccdf28ed9..7d63524bdb81 100644 > > --- a/fs/lockd/svclock.c > > +++ b/fs/lockd/svclock.c > > @@ -133,6 +133,7 @@ nlmsvc_remove_block(struct nlm_block *block) > > { > > if (!list_empty(&block->b_list)) { > > spin_lock(&nlm_blocked_lock); > > + block->b_flags &= ~B_PENDING_CALLBACK; > > list_del_init(&block->b_list); > > spin_unlock(&nlm_blocked_lock); > > nlmsvc_release_block(block); > > @@ -232,6 +233,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host, > > kref_init(&block->b_count); > > INIT_LIST_HEAD(&block->b_list); > > INIT_LIST_HEAD(&block->b_flist); > > + mutex_init(&block->b_cb_mutex); > > > > if (!nlmsvc_setgrantargs(call, lock)) > > goto failed_free; > > @@ -289,6 +291,8 @@ static void nlmsvc_free_block(struct kref *kref) > > > > dprintk("lockd: freeing block %p...\n", block); > > > > + WARN_ON_ONCE(block->b_flags & B_PENDING_CALLBACK); > > + > > /* Remove block from file's list of blocks */ > > list_del_init(&block->b_flist); > > mutex_unlock(&file->f_mutex); > > @@ -532,6 +536,13 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, > > goto out; > > } > > > > + mutex_lock(&block->b_cb_mutex); > > + if (block->b_flags & B_PENDING_CALLBACK) > > + goto pending_request; > > + > > + /* Append to list of blocked */ > > + nlmsvc_insert_block(block, NLM_NEVER); > > + > > if (!wait) > > lock->fl.fl_flags &= ~FL_SLEEP; > > mode = lock_to_openmode(&lock->fl); > > @@ -541,9 +552,13 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, > > dprintk("lockd: vfs_lock_file returned %d\n", error); > > switch (error) { > > case 0: > > + nlmsvc_remove_block(block); > > ret = nlm_granted; > > - goto out; > > + goto out_cb_mutex; > > case 
-EAGAIN: > > + if (!wait) > > + nlmsvc_remove_block(block); > > +pending_request: > > /* > > * If this is a blocking request for an > > * already pending lock request then we need > > @@ -552,26 +567,29 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, > > if (wait) > > break; > > ret = async_block ? nlm_lck_blocked : nlm_lck_denied; > > - goto out; > > + goto out_cb_mutex; > > case FILE_LOCK_DEFERRED: > > + block->b_flags |= B_PENDING_CALLBACK; > > + > > if (wait) > > break; > > /* Filesystem lock operation is in progress > > Add it to the queue waiting for callback */ > > ret = nlmsvc_defer_lock_rqst(rqstp, block); > > - goto out; > > + goto out_cb_mutex; > > case -EDEADLK: > > + nlmsvc_remove_block(block); > > ret = nlm_deadlock; > > - goto out; > > + goto out_cb_mutex; > > default: /* includes ENOLCK */ > > + nlmsvc_remove_block(block); > > ret = nlm_lck_denied_nolocks; > > - goto out; > > + goto out_cb_mutex; > > } > > > > ret = nlm_lck_blocked; > > - > > - /* Append to list of blocked */ > > - nlmsvc_insert_block(block, NLM_NEVER); > > +out_cb_mutex: > > + mutex_unlock(&block->b_cb_mutex); > > out: > > mutex_unlock(&file->f_mutex); > > nlmsvc_release_block(block); > > @@ -728,34 +746,60 @@ nlmsvc_update_deferred_block(struct nlm_block *block, int result) > > block->b_flags |= B_TIMED_OUT; > > } > > > > +static int __nlmsvc_grant_deferred(struct nlm_block *block, > > + struct file_lock *fl, > > + int result) > > +{ > > + int rc = 0; > > + > > + dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n", > > + block, block->b_flags); > > + if (block->b_flags & B_QUEUED) { > > + if (block->b_flags & B_TIMED_OUT) { > > + rc = -ENOLCK; > > + goto out; > > + } > > + nlmsvc_update_deferred_block(block, result); > > + } else if (result == 0) > > + block->b_granted = 1; > > + > > + nlmsvc_insert_block_locked(block, 0); > > + svc_wake_up(block->b_daemon); > > +out: > > + return rc; > > +} > > + > > static int nlmsvc_grant_deferred(struct file_lock *fl, int 
result) > > { > > - struct nlm_block *block; > > - int rc = -ENOENT; > > + struct nlm_block *iter, *block = NULL; > > + int rc; > > > > spin_lock(&nlm_blocked_lock); > > - list_for_each_entry(block, &nlm_blocked, b_list) { > > - if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { > > - dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n", > > - block, block->b_flags); > > - if (block->b_flags & B_QUEUED) { > > - if (block->b_flags & B_TIMED_OUT) { > > - rc = -ENOLCK; > > - break; > > - } > > - nlmsvc_update_deferred_block(block, result); > > - } else if (result == 0) > > - block->b_granted = 1; > > - > > - nlmsvc_insert_block_locked(block, 0); > > - svc_wake_up(block->b_daemon); > > - rc = 0; > > + list_for_each_entry(iter, &nlm_blocked, b_list) { > > + if (nlm_compare_locks(&iter->b_call->a_args.lock.fl, fl)) { > > + kref_get(&iter->b_count); > > + block = iter; > > break; > > } > > } > > spin_unlock(&nlm_blocked_lock); > > - if (rc == -ENOENT) > > - printk(KERN_WARNING "lockd: grant for unknown block\n"); > > + > > + if (!block) { > > + pr_warn("lockd: grant for unknown pending block\n"); > > + return -ENOENT; > > + } > > + > > + /* don't interfere with nlmsvc_lock() */ > > + mutex_lock(&block->b_cb_mutex); > > + block->b_flags &= ~B_PENDING_CALLBACK; > > + > > + spin_lock(&nlm_blocked_lock); > > + WARN_ON_ONCE(list_empty(&block->b_list)); > > + rc = __nlmsvc_grant_deferred(block, fl, result); > > + spin_unlock(&nlm_blocked_lock); > > + mutex_unlock(&block->b_cb_mutex); > > + > > + nlmsvc_release_block(block); > > return rc; > > } > > > > diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h > > index f42594a9efe0..91f55458f5fc 100644 > > --- a/include/linux/lockd/lockd.h > > +++ b/include/linux/lockd/lockd.h > > @@ -185,10 +185,12 @@ struct nlm_block { > > struct nlm_file * b_file; /* file in question */ > > struct cache_req * b_cache_req; /* deferred request handling */ > > struct cache_deferred_req * b_deferred_req > > + struct 
mutex b_cb_mutex; /* callback mutex */ > > There is no mention at all of this new mutex in the changelog or > comments. It's not at all clear to me what this is intended to protect. > In general, with lockd being a single-threaded service, we want to avoid > sleeping locks. This will need some clear justification. > The idea was to protect the PENDING flag and to wait until nlmsvc_lock() is "mostly" done and then allow the lm_grant() callback to proceed. Previously the f_mutex was doing that, but since I made !wait requests (non-blocking requests) be handled synchronously, there was a deadlock because the callback can be run in an unknown order. It was fixed by having a new mutex on a per nlm_block basis. However, I will remove those changes and try to find another way. > At a glance, it looks like you're trying to use this to hold > B_PENDING_CALLBACK steady while a lock request is being handled. That > suggests that you're using this mutex to serialize access to a section > of code and not one or more specific data structures. We usually like to > avoid that sort of thing, since locks that protect arbitrary sections of > code become difficult to work with over time. > > I'm going to go out on a limb here though and suggest that there is > probably a way to solve this problem that doesn't involve adding new > locks. > ok. > > unsigned int b_flags; /* block flags */ > > #define B_QUEUED 1 /* lock queued */ > > #define B_GOT_CALLBACK 2 /* got lock or conflicting lock */ > > #define B_TIMED_OUT 4 /* filesystem too slow to respond */ > > +#define B_PENDING_CALLBACK 8 /* pending callback for lock request */ > > }; > > > > /* > > Do we need this new flag at all? It seems redundant. If we have a block > on the list, then it is sort of by definition "pending callback". If > it's not on the list anymore, then it's not. No? > I will try to find another way. A pending callback is more of a check on whether it's a wait request (blocking request) and whether it is on the list or not. 
This flag was also there to avoid multiple wait requests while a wait request is pending. I ran into this case and the DLM implementation was never able to handle it. I can return -EAGAIN, but DLM lock() needs to keep track of those things; I prefer to handle this situation in the upper layer. I am not sure why lockd does multiple wait requests for the same nlm_block when it's pending and waiting for the lm_grant() callback. I will split this handling out of this patch as it is something that doesn't belong to the race fix between the request and the lm_grant() callback. - Alex