On Wed, Oct 18, 2023 at 3:34 AM Krister Johansen <kjlx@xxxxxxxxxxxxxxxxxx> wrote: > > Fuse submounts do not perform a lookup for the nodeid that they inherit > from their parent. Instead, the code decrements the nlookup on the > submount's fuse_inode when it is instantiated, and no forget is > performed when a submount root is evicted. > > Trouble arises when the submount's parent is evicted despite the > submount itself being in use. In this author's case, the submount was > in a container and detached from the initial mount namespace via a > MNT_DETACH operation. When memory pressure triggered the shrinker, the > inode from the parent was evicted, which triggered enough forgets to > render the submount's nodeid invalid. > > Since submounts should still function, even if their parent goes away, > solve this problem by sharing refcounted state between the parent and > its submount. When all of the references on this shared state reach > zero, it's safe to forget the final lookup of the fuse nodeid. > > Signed-off-by: Krister Johansen <kjlx@xxxxxxxxxxxxxxxxxx> > Cc: stable@xxxxxxxxxxxxxxx > Fixes: 1866d779d5d2 ("fuse: Allow fuse_fill_super_common() for submounts") > --- > fs/fuse/fuse_i.h | 20 +++++++++++ > fs/fuse/inode.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++-- > 2 files changed, 105 insertions(+), 3 deletions(-) > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > index 405252bb51f2..0d1659c5016b 100644 > --- a/fs/fuse/fuse_i.h > +++ b/fs/fuse/fuse_i.h > @@ -63,6 +63,24 @@ struct fuse_forget_link { > struct fuse_forget_link *next; > }; > > +/* Submount lookup tracking */ > +struct fuse_submount_lookup { > + /** Refcount */ > + refcount_t count; > + > + /** Unique ID, which identifies the inode between userspace > + * and kernel */ > + u64 nodeid; > + > + /** Number of lookups on this inode */ > + u64 nlookup; sl->nlookup will always be one. So that can just be implicit and this field can just go away. 
> + > + /** The request used for sending the FORGET message */ > + struct fuse_forget_link *forget; > + > + struct rcu_head rcu; RCU would be needed if any fields could be accessed from RCU protected code. But AFAICS there's no such access, so this shouldn't be needed. Am I missing something? > +}; > + > /** FUSE inode */ > struct fuse_inode { > /** Inode data */ > @@ -158,6 +176,8 @@ struct fuse_inode { > */ > struct fuse_inode_dax *dax; > #endif > + /** Submount specific lookup tracking */ > + struct fuse_submount_lookup *submount_lookup; > }; > > /** FUSE inode state bits */ > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > index 444418e240c8..dc1499e2074f 100644 > --- a/fs/fuse/inode.c > +++ b/fs/fuse/inode.c > @@ -68,6 +68,24 @@ struct fuse_forget_link *fuse_alloc_forget(void) > return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL_ACCOUNT); > } > > +static struct fuse_submount_lookup *fuse_alloc_submount_lookup(void) > +{ > + struct fuse_submount_lookup *sl; > + > + sl = kzalloc(sizeof(struct fuse_submount_lookup), GFP_KERNEL_ACCOUNT); > + if (!sl) > + return NULL; > + sl->forget = fuse_alloc_forget(); > + if (!sl->forget) > + goto out_free; > + > + return sl; > + > +out_free: > + kfree(sl); > + return NULL; > +} > + > static struct inode *fuse_alloc_inode(struct super_block *sb) > { > struct fuse_inode *fi; > @@ -113,9 +131,24 @@ static void fuse_free_inode(struct inode *inode) > kmem_cache_free(fuse_inode_cachep, fi); > } > > +static void fuse_cleanup_submount_lookup(struct fuse_conn *fc, > + struct fuse_submount_lookup *sl) > +{ > + if (!refcount_dec_and_test(&sl->count)) > + return; > + > + if (sl->nlookup) { > + fuse_queue_forget(fc, sl->forget, sl->nodeid, sl->nlookup); > + sl->forget = NULL; > + } > + kfree(sl->forget); > + kfree_rcu(sl, rcu); > +} > + > static void fuse_evict_inode(struct inode *inode) > { > struct fuse_inode *fi = get_fuse_inode(inode); > + struct fuse_submount_lookup *sl = NULL; > > /* Will write inode on close/munmap and in 
all other dirtiers */ > WARN_ON(inode->i_state & I_DIRTY_INODE); > @@ -132,6 +165,15 @@ static void fuse_evict_inode(struct inode *inode) > fi->nlookup); > fi->forget = NULL; > } > + > + spin_lock(&fi->lock); > + if (fi->submount_lookup) { > + sl = fi->submount_lookup; > + fi->submount_lookup = NULL; > + } > + spin_unlock(&fi->lock); I don't think locking is needed. Eviction happens only once and at that point nobody else should be touching the inode. > + if (sl) > + fuse_cleanup_submount_lookup(fc, sl); > } > if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) { > WARN_ON(!list_empty(&fi->write_files)); > @@ -332,6 +374,14 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, > fuse_dax_dontcache(inode, attr->flags); > } > > +static void fuse_init_submount_lookup(struct fuse_submount_lookup *sl, > + u64 nodeid) > +{ > + sl->nodeid = nodeid; > + sl->nlookup = 1; > + refcount_set(&sl->count, 1); > +} > + > static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr, > struct fuse_conn *fc) > { > @@ -395,12 +445,22 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, > */ > if (fc->auto_submounts && (attr->flags & FUSE_ATTR_SUBMOUNT) && > S_ISDIR(attr->mode)) { > + struct fuse_inode *fi; > + > inode = new_inode(sb); > if (!inode) > return NULL; > > fuse_init_inode(inode, attr, fc); > - get_fuse_inode(inode)->nodeid = nodeid; > + fi = get_fuse_inode(inode); > + fi->nodeid = nodeid; > + fi->submount_lookup = fuse_alloc_submount_lookup(); > + if (!fi->submount_lookup) { > + iput(inode); > + return NULL; > + } > + /* Sets nlookup = 1 on fi->submount_lookup->nlookup */ > + fuse_init_submount_lookup(fi->submount_lookup, nodeid); > inode->i_flags |= S_AUTOMOUNT; > goto done; > } > @@ -423,11 +483,11 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, > iput(inode); > goto retry; > } > -done: > fi = get_fuse_inode(inode); > spin_lock(&fi->lock); > fi->nlookup++; > spin_unlock(&fi->lock); > +done: > 
fuse_change_attributes(inode, attr, NULL, attr_valid, attr_version); > > return inode; > @@ -1465,6 +1525,8 @@ static int fuse_fill_super_submount(struct super_block *sb, > struct super_block *parent_sb = parent_fi->inode.i_sb; > struct fuse_attr root_attr; > struct inode *root; > + struct fuse_submount_lookup *sl; > + struct fuse_inode *fi; > > fuse_sb_defaults(sb); > fm->sb = sb; > @@ -1487,12 +1549,32 @@ static int fuse_fill_super_submount(struct super_block *sb, > * its nlookup should not be incremented. fuse_iget() does > * that, though, so undo it here. > */ > - get_fuse_inode(root)->nlookup--; > + fi = get_fuse_inode(root); > + fi->nlookup--; > + > sb->s_d_op = &fuse_dentry_operations; > sb->s_root = d_make_root(root); > if (!sb->s_root) > return -ENOMEM; > > + /* > + * Grab the parent's submount_lookup pointer and take a > + * reference on the shared nlookup from the parent. This is to > + * prevent the last forget for this nodeid from getting > + * triggered until all users have finished with it. > + */ > + spin_lock(&parent_fi->lock); Root has just been allocated, no locking needed. > + sl = parent_fi->submount_lookup; > + if (sl) { WARN_ON(!sl); Thanks, Miklos