Re: [PATCH v1] shmem: stable directory cookies

Chuck Lever III <chuck.lever@xxxxxxxxxx> · Thu, 20 Apr 2023 20:12:21 +0000

> On Apr 20, 2023, at 2:52 PM, Jeff Layton <jlayton@xxxxxxxxxx> wrote:
> 
> On Mon, 2023-04-17 at 15:23 -0400, Chuck Lever wrote:
>> From: Chuck Lever <chuck.lever@xxxxxxxxxx>
>> 
>> The current cursor-based directory cookie mechanism doesn't work
>> when a tmpfs filesystem is exported via NFS. This is because NFS
>> clients do not open directories: each READDIR operation has to open
>> the directory on the server, read it, then close it. The cursor
>> state for that directory, being associated strictly with the opened
>> struct file, is then discarded.
>> 
>> Directory cookies are cached not only by NFS clients, but also by
>> user space libraries on those clients. Essentially there is no way
>> to invalidate those caches when directory offsets have changed on
>> an NFS server after the offset-to-dentry mapping changes.
>> 
>> The solution we've come up with is to make the directory cookie for
>> each file in a tmpfs filesystem stable for the life of the directory
>> entry it represents.
>> 
>> Add a per-directory xarray. shmem_readdir() uses this to map each
>> directory offset (an loff_t integer) to the memory address of a
>> struct dentry.
>> 
>> Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
>> ---
>> include/linux/shmem_fs.h |    2 
>> mm/shmem.c               |  213 +++++++++++++++++++++++++++++++++++++++++++---
>> 2 files changed, 201 insertions(+), 14 deletions(-)
>> 
>> Changes since RFC:
>> - Destroy xarray in shmem_destroy_inode() instead of free_in_core_inode()
>> - A few cosmetic updates
>> 
>> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
>> index 103d1000a5a2..682ef885aa89 100644
>> --- a/include/linux/shmem_fs.h
>> +++ b/include/linux/shmem_fs.h
>> @@ -26,6 +26,8 @@ struct shmem_inode_info {
>> atomic_t stop_eviction; /* hold when working on inode */
>> struct timespec64 i_crtime; /* file creation time */
>> unsigned int fsflags; /* flags for FS_IOC_[SG]ETFLAGS */
>> + struct xarray doff_map; /* dir offset to entry mapping */
>> + u32 next_doff;
>> struct inode vfs_inode;
>> };
>> 
>> diff --git a/mm/shmem.c b/mm/shmem.c
>> index 448f393d8ab2..ba4176499e5c 100644
>> --- a/mm/shmem.c
>> +++ b/mm/shmem.c
>> @@ -40,6 +40,8 @@
>> #include <linux/fs_parser.h>
>> #include <linux/swapfile.h>
>> #include <linux/iversion.h>
>> +#include <linux/xarray.h>
>> +
>> #include "swap.h"
>> 
>> static struct vfsmount *shm_mnt;
>> @@ -234,6 +236,7 @@ static const struct super_operations shmem_ops;
>> const struct address_space_operations shmem_aops;
>> static const struct file_operations shmem_file_operations;
>> static const struct inode_operations shmem_inode_operations;
>> +static const struct file_operations shmem_dir_operations;
>> static const struct inode_operations shmem_dir_inode_operations;
>> static const struct inode_operations shmem_special_inode_operations;
>> static const struct vm_operations_struct shmem_vm_ops;
>> @@ -2397,7 +2400,9 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block
>> /* Some things misbehave if size == 0 on a directory */
>> inode->i_size = 2 * BOGO_DIRENT_SIZE;
>> inode->i_op = &shmem_dir_inode_operations;
>> - inode->i_fop = &simple_dir_operations;
>> + inode->i_fop = &shmem_dir_operations;
>> + xa_init_flags(&info->doff_map, XA_FLAGS_ALLOC1);
>> + info->next_doff = 0;
>> break;
>> case S_IFLNK:
>> /*
>> @@ -2917,6 +2922,71 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
>> return 0;
>> }
>> 
>> +static struct xarray *shmem_doff_map(struct inode *dir)
>> +{
>> + return &SHMEM_I(dir)->doff_map;
>> +}
>> +
>> +static int shmem_doff_add(struct inode *dir, struct dentry *dentry)
>> +{
>> + struct shmem_inode_info *info = SHMEM_I(dir);
>> + struct xa_limit limit = XA_LIMIT(2, U32_MAX);
>> + u32 offset;
>> + int ret;
>> +
>> + if (dentry->d_fsdata)
>> + return -EBUSY;
>> +
>> + offset = 0;
>> + ret = xa_alloc_cyclic(shmem_doff_map(dir), &offset, dentry, limit,
>> +       &info->next_doff, GFP_KERNEL);
>> + if (ret < 0)
>> + return ret;
>> +
>> + dentry->d_fsdata = (void *)(unsigned long)offset;
>> + return 0;
>> +}
>> +
>> +static struct dentry *shmem_doff_find_after(struct dentry *dir,
>> +     unsigned long *offset)
>> +{
>> + struct xarray *xa = shmem_doff_map(d_inode(dir));
>> + struct dentry *d, *found = NULL;
>> +
>> + spin_lock(&dir->d_lock);
>> + d = xa_find_after(xa, offset, ULONG_MAX, XA_PRESENT);
>> + if (d) {
>> + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
>> + if (simple_positive(d))
>> + found = dget_dlock(d);
>> + spin_unlock(&d->d_lock);
>> + }
>> + spin_unlock(&dir->d_lock);
> 
> This part is kind of gross, but I think I get it now...
> 
> You have to take dir->d_lock to ensure that "d" doesn't go away when you
> don't hold a ref on it, and you need the child's d_lock to ensure that
> simple_positive result is stable while you take a reference (because
> doing a dput there could be problematic). If that's right, then that's a
> bit subtle, and might deserve a nice comment.
> 
> I do wonder if there is some way to do this with RCU instead, but this
> seems to work well enough.

I lifted this from fs/libfs.c, fwiw.

>> + return found;
>> +}
>> +
>> +static void shmem_doff_remove(struct inode *dir, struct dentry *dentry)
>> +{
>> + u32 offset = (u32)(unsigned long)dentry->d_fsdata;
>> +
>> + if (!offset)
>> + return;
>> +
>> + xa_erase(shmem_doff_map(dir), offset);
>> + dentry->d_fsdata = NULL;
>> +}
>> +
>> +/*
>> + * During fs teardown (eg. umount), a directory's doff_map might still
>> + * contain entries. xa_destroy() cleans out anything that remains.
>> + */
>> +static void shmem_doff_map_destroy(struct inode *inode)
>> +{
>> + struct xarray *xa = shmem_doff_map(inode);
>> +
>> + xa_destroy(xa);
>> +}
>> +
>> /*
>>  * File creation. Allocate an inode, and we're done..
>>  */
>> @@ -2938,6 +3008,10 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
>> if (error && error != -EOPNOTSUPP)
>> goto out_iput;
>> 
>> + error = shmem_doff_add(dir, dentry);
>> + if (error)
>> + goto out_iput;
>> +
>> error = 0;
>> dir->i_size += BOGO_DIRENT_SIZE;
>> dir->i_ctime = dir->i_mtime = current_time(dir);
>> @@ -3015,6 +3089,10 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
>> goto out;
>> }
>> 
>> + ret = shmem_doff_add(dir, dentry);
>> + if (ret)
>> + goto out;
>> +
>> dir->i_size += BOGO_DIRENT_SIZE;
>> inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
>> inode_inc_iversion(dir);
>> @@ -3033,6 +3111,8 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
>> if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
>> shmem_free_inode(inode->i_sb);
>> 
>> + shmem_doff_remove(dir, dentry);
>> +
>> dir->i_size -= BOGO_DIRENT_SIZE;
>> inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
>> inode_inc_iversion(dir);
>> @@ -3091,24 +3171,37 @@ static int shmem_rename2(struct mnt_idmap *idmap,
>> {
>> struct inode *inode = d_inode(old_dentry);
>> int they_are_dirs = S_ISDIR(inode->i_mode);
>> + int error;
>> 
>> if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
>> return -EINVAL;
>> 
>> - if (flags & RENAME_EXCHANGE)
>> + if (flags & RENAME_EXCHANGE) {
>> + shmem_doff_remove(old_dir, old_dentry);
>> + shmem_doff_remove(new_dir, new_dentry);
>> + error = shmem_doff_add(new_dir, old_dentry);
>> + if (error)
>> + return error;
>> + error = shmem_doff_add(old_dir, new_dentry);
>> + if (error)
>> + return error;
>> return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
>> + }
>> 
>> if (!simple_empty(new_dentry))
>> return -ENOTEMPTY;
>> 
>> if (flags & RENAME_WHITEOUT) {
>> - int error;
>> -
>> error = shmem_whiteout(idmap, old_dir, old_dentry);
>> if (error)
>> return error;
>> }
>> 
>> + shmem_doff_remove(old_dir, old_dentry);
>> + error = shmem_doff_add(new_dir, old_dentry);
>> + if (error)
>> + return error;
>> +
>> if (d_really_is_positive(new_dentry)) {
>> (void) shmem_unlink(new_dir, new_dentry);
>> if (they_are_dirs) {
>> @@ -3149,26 +3242,22 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
>> 
>> error = security_inode_init_security(inode, dir, &dentry->d_name,
>>      shmem_initxattrs, NULL);
>> - if (error && error != -EOPNOTSUPP) {
>> - iput(inode);
>> - return error;
>> - }
>> + if (error && error != -EOPNOTSUPP)
>> + goto out_iput;
>> 
>> inode->i_size = len-1;
>> if (len <= SHORT_SYMLINK_LEN) {
>> inode->i_link = kmemdup(symname, len, GFP_KERNEL);
>> if (!inode->i_link) {
>> - iput(inode);
>> - return -ENOMEM;
>> + error = -ENOMEM;
>> + goto out_iput;
>> }
>> inode->i_op = &shmem_short_symlink_operations;
>> } else {
>> inode_nohighmem(inode);
>> error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
>> - if (error) {
>> - iput(inode);
>> - return error;
>> - }
>> + if (error)
>> + goto out_iput;
>> inode->i_mapping->a_ops = &shmem_aops;
>> inode->i_op = &shmem_symlink_inode_operations;
>> memcpy(folio_address(folio), symname, len);
>> @@ -3177,12 +3266,20 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
>> folio_unlock(folio);
>> folio_put(folio);
>> }
>> +
>> + error = shmem_doff_add(dir, dentry);
>> + if (error)
>> + goto out_iput;
>> +
>> dir->i_size += BOGO_DIRENT_SIZE;
>> dir->i_ctime = dir->i_mtime = current_time(dir);
>> inode_inc_iversion(dir);
>> d_instantiate(dentry, inode);
>> dget(dentry);
>> return 0;
>> +out_iput:
>> + iput(inode);
>> + return error;
>> }
>> 
>> static void shmem_put_link(void *arg)
>> @@ -3224,6 +3321,77 @@ static const char *shmem_get_link(struct dentry *dentry,
>> return folio_address(folio);
>> }
>> 
>> +static loff_t shmem_dir_llseek(struct file *file, loff_t offset, int whence)
>> +{
>> + switch (whence) {
>> + case SEEK_CUR:
>> + offset += file->f_pos;
>> + fallthrough;
>> + case SEEK_SET:
>> + if (offset >= 0)
>> + break;
>> + fallthrough;
>> + default:
>> + return -EINVAL;
>> + }
>> + return vfs_setpos(file, offset, U32_MAX);
>> +}
>> +
>> +static bool shmem_dir_emit(struct dir_context *ctx, struct dentry *dentry)
>> +{
>> + struct inode *inode = d_inode(dentry);
>> +
>> + return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len,
>> +   (loff_t)dentry->d_fsdata, inode->i_ino,
>> +   fs_umode_to_dtype(inode->i_mode));
>> +}
>> +
>> +/**
>> + * shmem_readdir - Emit entries starting at offset @ctx->pos
>> + * @file: an open directory to iterate over
>> + * @ctx: directory iteration context
>> + *
>> + * Caller must hold @file's i_rwsem to prevent insertion or removal of
>> + * entries during this call.
>> + *
>> + * On entry, @ctx->pos contains an offset that represents the first entry
>> + * to be read from the directory.
>> + *
>> + * The operation continues until there are no more entries to read, or
>> + * until the ctx->actor indicates there is no more space in the caller's
>> + * output buffer.
>> + *
>> + * On return, @ctx->pos contains an offset that will read the next entry
>> + * in this directory when shmem_readdir() is called again with @ctx.
>> + *
>> + * Return values:
>> + *   %0 - Complete
>> + */
>> +static int shmem_readdir(struct file *file, struct dir_context *ctx)
>> +{
>> + struct dentry *dentry, *dir = file->f_path.dentry;
>> + unsigned long offset;
>> +
>> + lockdep_assert_held(&d_inode(dir)->i_rwsem);
> 
> You probably don't need the above. This is called via ->iterate_shared
> so the lock had _better_ be held.

True, it's not 100% necessary.

I was trying to document the API contract, part of which is
"caller needs to hold dir->i_rwsem". This seemed like the most
crisp way to do that.

>> +
>> + if (!dir_emit_dots(file, ctx))
>> + goto out;
>> + for (offset = ctx->pos - 1; offset < ULONG_MAX - 1;) {
>> + dentry = shmem_doff_find_after(dir, &offset);
>> + if (!dentry)
>> + break;
>> + if (!shmem_dir_emit(ctx, dentry)) {
>> + dput(dentry);
>> + break;
>> + }
>> + ctx->pos = offset + 1;
>> + dput(dentry);
>> + }
>> +
>> +out:
>> + return 0;
>> +}
>> +
>> #ifdef CONFIG_TMPFS_XATTR
>> 
>> static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
>> @@ -3742,6 +3910,12 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
>> return 0;
>> }
>> 
>> +#else /* CONFIG_TMPFS */
>> +
>> +static inline void shmem_doff_map_destroy(struct inode *dir)
>> +{
>> +}
>> +
>> #endif /* CONFIG_TMPFS */
>> 
>> static void shmem_put_super(struct super_block *sb)
>> @@ -3888,6 +4062,8 @@ static void shmem_destroy_inode(struct inode *inode)
>> {
>> if (S_ISREG(inode->i_mode))
>> mpol_free_shared_policy(&SHMEM_I(inode)->policy);
>> + if (S_ISDIR(inode->i_mode))
>> + shmem_doff_map_destroy(inode);
>> }
>> 
>> static void shmem_init_inode(void *foo)
>> @@ -3955,6 +4131,15 @@ static const struct inode_operations shmem_inode_operations = {
>> #endif
>> };
>> 
>> +static const struct file_operations shmem_dir_operations = {
>> +#ifdef CONFIG_TMPFS
>> + .llseek = shmem_dir_llseek,
>> + .iterate_shared = shmem_readdir,
>> +#endif
>> + .read = generic_read_dir,
>> + .fsync = noop_fsync,
>> +};
>> +
>> static const struct inode_operations shmem_dir_inode_operations = {
>> #ifdef CONFIG_TMPFS
>> .getattr = shmem_getattr,
>> 
>> 
> 
> Other than the nits above, this all looks fine to me. I've done some
> testing with this series too and it all seems to work as expected, and
> fixes some nasty problems when trying to recursively remove directories
> via nfsd.

Thanks for your review, testing, and suggestions.

> Have you done any performance testing? My expectation would be that
> you'd have roughly similar (or even faster) performance with this set,
> but at the expense of a bit of memory (for the xarrays).

I don't have any directory microbenchmarks. I suppose I could
do something like timing large software builds.

> One thing we could consider is lifting the bulk of this code into libfs,
> so other shmem-like filesystems can take advantage of it, but that work
> could be done later too when we have another proposed consumer.

Eg. autofs.

--
Chuck Lever