> On Apr 20, 2023, at 2:52 PM, Jeff Layton <jlayton@xxxxxxxxxx> wrote: > > On Mon, 2023-04-17 at 15:23 -0400, Chuck Lever wrote: >> From: Chuck Lever <chuck.lever@xxxxxxxxxx> >> >> The current cursor-based directory cookie mechanism doesn't work >> when a tmpfs filesystem is exported via NFS. This is because NFS >> clients do not open directories: each READDIR operation has to open >> the directory on the server, read it, then close it. The cursor >> state for that directory, being associated strictly with the opened >> struct file, is then discarded. >> >> Directory cookies are cached not only by NFS clients, but also by >> user space libraries on those clients. Essentially there is no way >> to invalidate those caches when directory offsets have changed on >> an NFS server after the offset-to-dentry mapping changes. >> >> The solution we've come up with is to make the directory cookie for >> each file in a tmpfs filesystem stable for the life of the directory >> entry it represents. >> >> Add a per-directory xarray. shmem_readdir() uses this to map each >> directory offset (an loff_t integer) to the memory address of a >> struct dentry. >> >> Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx> >> --- >> include/linux/shmem_fs.h | 2 >> mm/shmem.c | 213 +++++++++++++++++++++++++++++++++++++++++++--- >> 2 files changed, 201 insertions(+), 14 deletions(-) >> >> Changes since RFC: >> - Destroy xarray in shmem_destroy_inode() instead of free_in_core_inode() >> - A few cosmetic updates >> >> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h >> index 103d1000a5a2..682ef885aa89 100644 >> --- a/include/linux/shmem_fs.h >> +++ b/include/linux/shmem_fs.h >> @@ -26,6 +26,8 @@ struct shmem_inode_info { >> atomic_t stop_eviction; /* hold when working on inode */ >> struct timespec64 i_crtime; /* file creation time */ >> unsigned int fsflags; /* flags for FS_IOC_[SG]ETFLAGS */ >> + struct xarray doff_map; /* dir offset to entry mapping */ >> + u32 next_doff; >> struct inode vfs_inode; >> }; >> >> diff --git a/mm/shmem.c b/mm/shmem.c >> index 448f393d8ab2..ba4176499e5c 100644 >> --- a/mm/shmem.c >> +++ b/mm/shmem.c >> @@ -40,6 +40,8 @@ >> #include <linux/fs_parser.h> >> #include <linux/swapfile.h> >> #include <linux/iversion.h> >> +#include <linux/xarray.h> >> + >> #include "swap.h" >> >> static struct vfsmount *shm_mnt; >> @@ -234,6 +236,7 @@ static const struct super_operations shmem_ops; >> const struct address_space_operations shmem_aops; >> static const struct file_operations shmem_file_operations; >> static const struct inode_operations shmem_inode_operations; >> +static const struct file_operations shmem_dir_operations; >> static const struct inode_operations shmem_dir_inode_operations; >> static const struct inode_operations shmem_special_inode_operations; >> static const struct vm_operations_struct shmem_vm_ops; >> @@ -2397,7 +2400,9 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block >> /* Some things misbehave if size == 0 on a directory */ >> inode->i_size = 2 * BOGO_DIRENT_SIZE; >> inode->i_op = &shmem_dir_inode_operations; >> - inode->i_fop = &simple_dir_operations; >> + inode->i_fop = &shmem_dir_operations; >> + xa_init_flags(&info->doff_map, XA_FLAGS_ALLOC1); >> + info->next_doff = 0; >> break; >> case S_IFLNK: >> /* >> @@ -2917,6 +2922,71 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) >> return 0; >> } >> >> +static struct xarray *shmem_doff_map(struct inode *dir) >> +{ >> + return &SHMEM_I(dir)->doff_map; >> +} >> + >> +static int shmem_doff_add(struct inode *dir, struct dentry *dentry) >> +{ >> + struct shmem_inode_info *info = SHMEM_I(dir); >> + struct xa_limit limit = XA_LIMIT(2, U32_MAX); >> + u32 offset; >> + int ret; >> + >> + if (dentry->d_fsdata) >> + return -EBUSY; >> + >> + offset = 0; >> + ret = xa_alloc_cyclic(shmem_doff_map(dir), &offset, dentry, limit, >> + &info->next_doff, GFP_KERNEL); >> + if (ret < 0) >> + return ret; >> + >> + dentry->d_fsdata = (void *)(unsigned long)offset; >> + return 0; >> +} >> + >> +static struct dentry *shmem_doff_find_after(struct dentry *dir, >> + unsigned long *offset) >> +{ >> + struct xarray *xa = shmem_doff_map(d_inode(dir)); >> + struct dentry *d, *found = NULL; >> + >> + spin_lock(&dir->d_lock); >> + d = xa_find_after(xa, offset, ULONG_MAX, XA_PRESENT); >> + if (d) { >> + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); >> + if (simple_positive(d)) >> + found = dget_dlock(d); >> + spin_unlock(&d->d_lock); >> + } >> + spin_unlock(&dir->d_lock); > > This part is kind of gross, but I think I get it now... > > You have to take dir->d_lock to ensure that "d" doesn't go away when you > don't hold a ref on it, and you need the child's d_lock to ensure that > simple_positive result is stable while you take a reference (because > doing a dput there could be problematic). If that's right, then that's a > bit subtle, and might deserve a nice comment. > > I do wonder if there is some way to do this with RCU instead, but this > seems to work well enough. I lifted this from fs/libfs.c, fwiw. >> + return found; >> +} >> + >> +static void shmem_doff_remove(struct inode *dir, struct dentry *dentry) >> +{ >> + u32 offset = (u32)(unsigned long)dentry->d_fsdata; >> + >> + if (!offset) >> + return; >> + >> + xa_erase(shmem_doff_map(dir), offset); >> + dentry->d_fsdata = NULL; >> +} >> + >> +/* >> + * During fs teardown (eg. umount), a directory's doff_map might still >> + * contain entries. xa_destroy() cleans out anything that remains. >> + */ >> +static void shmem_doff_map_destroy(struct inode *inode) >> +{ >> + struct xarray *xa = shmem_doff_map(inode); >> + >> + xa_destroy(xa); >> +} >> + >> /* >> * File creation. Allocate an inode, and we're done.. >> */ >> @@ -2938,6 +3008,10 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, >> if (error && error != -EOPNOTSUPP) >> goto out_iput; >> >> + error = shmem_doff_add(dir, dentry); >> + if (error) >> + goto out_iput; >> + >> error = 0; >> dir->i_size += BOGO_DIRENT_SIZE; >> dir->i_ctime = dir->i_mtime = current_time(dir); >> @@ -3015,6 +3089,10 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr >> goto out; >> } >> >> + ret = shmem_doff_add(dir, dentry); >> + if (ret) >> + goto out; >> + >> dir->i_size += BOGO_DIRENT_SIZE; >> inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); >> inode_inc_iversion(dir); >> @@ -3033,6 +3111,8 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) >> if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) >> shmem_free_inode(inode->i_sb); >> >> + shmem_doff_remove(dir, dentry); >> + >> dir->i_size -= BOGO_DIRENT_SIZE; >> inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); >> inode_inc_iversion(dir); >> @@ -3091,24 +3171,37 @@ static int shmem_rename2(struct mnt_idmap *idmap, >> { >> struct inode *inode = d_inode(old_dentry); >> int they_are_dirs = S_ISDIR(inode->i_mode); >> + int error; >> >> if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) >> return -EINVAL; >> >> - if (flags & RENAME_EXCHANGE) >> + if (flags & RENAME_EXCHANGE) { >> + shmem_doff_remove(old_dir, old_dentry); >> + shmem_doff_remove(new_dir, new_dentry); >> + error = shmem_doff_add(new_dir, old_dentry); >> + if (error) >> + return error; >> + error = shmem_doff_add(old_dir, new_dentry); >> + if (error) >> + return error; >> return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); >> + } >> >> if (!simple_empty(new_dentry)) >> return -ENOTEMPTY; >> >> if (flags & RENAME_WHITEOUT) { >> - int error; >> - >> error = shmem_whiteout(idmap, old_dir, old_dentry); >> if (error) >> return error; >> } >> >> + shmem_doff_remove(old_dir, old_dentry); >> + error = shmem_doff_add(new_dir, old_dentry); >> + if (error) >> + return error; >> + >> if (d_really_is_positive(new_dentry)) { >> (void) shmem_unlink(new_dir, new_dentry); >> if (they_are_dirs) { >> @@ -3149,26 +3242,22 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, >> >> error = security_inode_init_security(inode, dir, &dentry->d_name, >> shmem_initxattrs, NULL); >> - if (error && error != -EOPNOTSUPP) { >> - iput(inode); >> - return error; >> - } >> + if (error && error != -EOPNOTSUPP) >> + goto out_iput; >> >> inode->i_size = len-1; >> if (len <= SHORT_SYMLINK_LEN) { >> inode->i_link = kmemdup(symname, len, GFP_KERNEL); >> if (!inode->i_link) { >> - iput(inode); >> - return -ENOMEM; >> + error = -ENOMEM; >> + goto out_iput; >> } >> inode->i_op = &shmem_short_symlink_operations; >> } else { >> inode_nohighmem(inode); >> error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); >> - if (error) { >> - iput(inode); >> - return error; >> - } >> + if (error) >> + goto out_iput; >> inode->i_mapping->a_ops = &shmem_aops; >> inode->i_op = &shmem_symlink_inode_operations; >> memcpy(folio_address(folio), symname, len); >> @@ -3177,12 +3266,20 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, >> folio_unlock(folio); >> folio_put(folio); >> } >> + >> + error = shmem_doff_add(dir, dentry); >> + if (error) >> + goto out_iput; >> + >> dir->i_size += BOGO_DIRENT_SIZE; >> dir->i_ctime = dir->i_mtime = current_time(dir); >> inode_inc_iversion(dir); >> d_instantiate(dentry, inode); >> dget(dentry); >> return 0; >> +out_iput: >> + iput(inode); >> + return error; >> } >> >> static void shmem_put_link(void *arg) >> @@ -3224,6 +3321,77 @@ static const char *shmem_get_link(struct dentry *dentry, >> return folio_address(folio); >> } >> >> +static loff_t shmem_dir_llseek(struct file *file, loff_t offset, int whence) >> +{ >> + switch (whence) { >> + case SEEK_CUR: >> + offset += file->f_pos; >> + fallthrough; >> + case SEEK_SET: >> + if (offset >= 0) >> + break; >> + fallthrough; >> + default: >> + return -EINVAL; >> + } >> + return vfs_setpos(file, offset, U32_MAX); >> +} >> + >> +static bool shmem_dir_emit(struct dir_context *ctx, struct dentry *dentry) >> +{ >> + struct inode *inode = d_inode(dentry); >> + >> + return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, >> + (loff_t)dentry->d_fsdata, inode->i_ino, >> + fs_umode_to_dtype(inode->i_mode)); >> +} >> + >> +/** >> + * shmem_readdir - Emit entries starting at offset @ctx->pos >> + * @file: an open directory to iterate over >> + * @ctx: directory iteration context >> + * >> + * Caller must hold @file's i_rwsem to prevent insertion or removal of >> + * entries during this call. >> + * >> + * On entry, @ctx->pos contains an offset that represents the first entry >> + * to be read from the directory. >> + * >> + * The operation continues until there are no more entries to read, or >> + * until the ctx->actor indicates there is no more space in the caller's >> + * output buffer. >> + * >> + * On return, @ctx->pos contains an offset that will read the next entry >> + * in this directory when shmem_readdir() is called again with @ctx. >> + * >> + * Return values: >> + * %0 - Complete >> + */ >> +static int shmem_readdir(struct file *file, struct dir_context *ctx) >> +{ >> + struct dentry *dentry, *dir = file->f_path.dentry; >> + unsigned long offset; >> + >> + lockdep_assert_held(&d_inode(dir)->i_rwsem); > > You probably don't need the above. This is called via ->iterate_shared > so the lock had _better_ be held. True, it's not 100% necessary. I was trying to document the API contract, part of which is "caller needs to hold dir->i_rwsem". This seemed like the most crisp way to do that. >> + >> + if (!dir_emit_dots(file, ctx)) >> + goto out; >> + for (offset = ctx->pos - 1; offset < ULONG_MAX - 1;) { >> + dentry = shmem_doff_find_after(dir, &offset); >> + if (!dentry) >> + break; >> + if (!shmem_dir_emit(ctx, dentry)) { >> + dput(dentry); >> + break; >> + } >> + ctx->pos = offset + 1; >> + dput(dentry); >> + } >> + >> +out: >> + return 0; >> +} >> + >> #ifdef CONFIG_TMPFS_XATTR >> >> static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) >> @@ -3742,6 +3910,12 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) >> return 0; >> } >> >> +#else /* CONFIG_TMPFS */ >> + >> +static inline void shmem_doff_map_destroy(struct inode *dir) >> +{ >> +} >> + >> #endif /* CONFIG_TMPFS */ >> >> static void shmem_put_super(struct super_block *sb) >> @@ -3888,6 +4062,8 @@ static void shmem_destroy_inode(struct inode *inode) >> { >> if (S_ISREG(inode->i_mode)) >> mpol_free_shared_policy(&SHMEM_I(inode)->policy); >> + if (S_ISDIR(inode->i_mode)) >> + shmem_doff_map_destroy(inode); >> } >> >> static void shmem_init_inode(void *foo) >> @@ -3955,6 +4131,15 @@ static const struct inode_operations shmem_inode_operations = { >> #endif >> }; >> >> +static const struct file_operations shmem_dir_operations = { >> +#ifdef CONFIG_TMPFS >> + .llseek = shmem_dir_llseek, >> + .iterate_shared = shmem_readdir, >> +#endif >> + .read = generic_read_dir, >> + .fsync = noop_fsync, >> +}; >> + >> static const struct inode_operations shmem_dir_inode_operations = { >> #ifdef CONFIG_TMPFS >> .getattr = shmem_getattr, >> >> > > Other than the nits above, this all looks fine to me. I've done some > testing with this series too and it all seems to work as expected, and > fixes some nasty problems when trying to recursively remove directories > via nfsd. Thanks for your review, testing, and suggestions. > Have you done any performance testing? My expectation would be that > you'd have roughly similar (or even faster) performance with this set, > but at the expense of a bit of memory (for the xarrays). I don't have any directory microbenchmarks. I suppose I could do something like timing large software builds. > One thing we could consider is lifting the bulk of this code into libfs, > so other shmem-like filesystems can take advantage of it, but that work > could be done later too when we have another proposed consumer. Eg. autofs. -- Chuck Lever