Re: [PATCH v3 1/3] libfs: Add directory operations for stable offsets

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 





On 6/24/23 00:21, Bernd Schubert wrote:


On 6/6/23 15:10, Chuck Lever wrote:
From: Chuck Lever <chuck.lever@xxxxxxxxxx>

Create a vector of directory operations in fs/libfs.c that handles
directory seeks and readdir via stable offsets instead of the
current cursor-based mechanism.

For the moment these are unused.

Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
---
  fs/dcache.c            |    1
  fs/libfs.c             |  185 ++++++++++++++++++++++++++++++++++++++++++++++++
  include/linux/dcache.h |    1
  include/linux/fs.h     |    9 ++
  4 files changed, 196 insertions(+)

diff --git a/fs/dcache.c b/fs/dcache.c
index 52e6d5fdab6b..9c9a801f3b33 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1813,6 +1813,7 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
      dentry->d_sb = sb;
      dentry->d_op = NULL;
      dentry->d_fsdata = NULL;
+    dentry->d_offset = 0;
      INIT_HLIST_BL_NODE(&dentry->d_hash);
      INIT_LIST_HEAD(&dentry->d_lru);
      INIT_LIST_HEAD(&dentry->d_subdirs);
diff --git a/fs/libfs.c b/fs/libfs.c
index 89cf614a3271..07317bbe1668 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -239,6 +239,191 @@ const struct inode_operations simple_dir_inode_operations = {
  };
  EXPORT_SYMBOL(simple_dir_inode_operations);
+/**
+ * stable_offset_init - initialize a parent directory
+ * @dir: parent directory to be initialized
+ *
+ */
+void stable_offset_init(struct inode *dir)
+{
+    xa_init_flags(&dir->i_doff_map, XA_FLAGS_ALLOC1);
+    dir->i_next_offset = 0;
+}
+EXPORT_SYMBOL(stable_offset_init);
+
+/**
+ * stable_offset_add - Add an entry to a directory's stable offset map
+ * @dir: parent directory being modified
+ * @dentry: new dentry being added
+ *
+ * Returns zero on success. Otherwise, a negative errno value is returned.
+ */
+int stable_offset_add(struct inode *dir, struct dentry *dentry)
+{
+    struct xa_limit limit = XA_LIMIT(2, U32_MAX);
+    u32 offset = 0;
+    int ret;
+
+    if (dentry->d_offset)
+        return -EBUSY;
+
+    ret = xa_alloc_cyclic(&dir->i_doff_map, &offset, dentry, limit,
+                  &dir->i_next_offset, GFP_KERNEL);

Please see below at struct inode my question about i_next_offset.

+    if (ret < 0)
+        return ret;
+
+    dentry->d_offset = offset;
+    return 0;
+}
+EXPORT_SYMBOL(stable_offset_add);
+
+/**
+ * stable_offset_remove - Remove an entry to a directory's stable offset map
+ * @dir: parent directory being modified
+ * @dentry: dentry being removed
+ *
+ */
+void stable_offset_remove(struct inode *dir, struct dentry *dentry)
+{
+    if (!dentry->d_offset)
+        return;
+
+    xa_erase(&dir->i_doff_map, dentry->d_offset);
+    dentry->d_offset = 0;
+}
+EXPORT_SYMBOL(stable_offset_remove);
+
+/**
+ * stable_offset_destroy - Release offset map
+ * @dir: parent directory that is about to be destroyed
+ *
+ * During fs teardown (eg. umount), a directory's offset map might still
+ * contain entries. xa_destroy() cleans out anything that remains.
+ */
+void stable_offset_destroy(struct inode *dir)
+{
+    xa_destroy(&dir->i_doff_map);
+}
+EXPORT_SYMBOL(stable_offset_destroy);
+
+/**
+ * stable_dir_llseek - Advance the read position of a directory descriptor
+ * @file: an open directory whose position is to be updated
+ * @offset: a byte offset
+ * @whence: enumerator describing the starting position for this update
+ *
+ * SEEK_END, SEEK_DATA, and SEEK_HOLE are not supported for directories.
+ *
+ * Returns the updated read position if successful; otherwise a
+ * negative errno is returned and the read position remains unchanged.
+ */
+static loff_t stable_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+    switch (whence) {
+    case SEEK_CUR:
+        offset += file->f_pos;
+        fallthrough;
+    case SEEK_SET:
+        if (offset >= 0)
+            break;
+        fallthrough;
+    default:
+        return -EINVAL;
+    }
+
+    return vfs_setpos(file, offset, U32_MAX);
+}
+
+static struct dentry *stable_find_next(struct xa_state *xas)
+{
+    struct dentry *child, *found = NULL;
+
+    rcu_read_lock();
+    child = xas_next_entry(xas, U32_MAX);
+    if (!child)
+        goto out;
+    spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
+    if (simple_positive(child))
+        found = dget_dlock(child);
+    spin_unlock(&child->d_lock);
+out:
+    rcu_read_unlock();
+    return found;
+}

I wonder if this should try the next dentry when simple_positive() returns false, what is if there is a readdir/unlink race? readdir now abort in the middle instead of continuing with the next dentry?

+
+static bool stable_dir_emit(struct dir_context *ctx, struct dentry *dentry)
+{
+    struct inode *inode = d_inode(dentry);
+
+    return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len,
+              dentry->d_offset, inode->i_ino,
+              fs_umode_to_dtype(inode->i_mode));
+}
+
+static void stable_iterate_dir(struct dentry *dir, struct dir_context *ctx)
+{
+    XA_STATE(xas, &((d_inode(dir))->i_doff_map), ctx->pos);
+    struct dentry *dentry;
+
+    while (true) {
+        spin_lock(&dir->d_lock);
+        dentry = stable_find_next(&xas);
+        spin_unlock(&dir->d_lock);
+        if (!dentry)
+            break;
+
+        if (!stable_dir_emit(ctx, dentry)) {
+            dput(dentry);
+            break;
+        }
+
+        dput(dentry);
+        ctx->pos = xas.xa_index + 1;
+    }
+}
+
+/**
+ * stable_readdir - Emit entries starting at offset @ctx->pos
+ * @file: an open directory to iterate over
+ * @ctx: directory iteration context
+ *
+ * Caller must hold @file's i_rwsem to prevent insertion or removal of
+ * entries during this call.
+ *
+ * On entry, @ctx->pos contains an offset that represents the first entry
+ * to be read from the directory.
+ *
+ * The operation continues until there are no more entries to read, or
+ * until the ctx->actor indicates there is no more space in the caller's
+ * output buffer.
+ *
+ * On return, @ctx->pos contains an offset that will read the next entry
+ * in this directory when shmem_readdir() is called again with @ctx.
+ *
+ * Return values:
+ *   %0 - Complete
+ */
+static int stable_readdir(struct file *file, struct dir_context *ctx)
+{
+    struct dentry *dir = file->f_path.dentry;
+
+    lockdep_assert_held(&d_inode(dir)->i_rwsem);
+
+    if (!dir_emit_dots(file, ctx))
+        return 0;
+
+    stable_iterate_dir(dir, ctx);
+    return 0;
+}
+
+const struct file_operations stable_dir_operations = {
+    .llseek        = stable_dir_llseek,
+    .iterate_shared    = stable_readdir,
+    .read        = generic_read_dir,
+    .fsync        = noop_fsync,
+};
+EXPORT_SYMBOL(stable_dir_operations);
+
  static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
  {
      struct dentry *child = NULL;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 6b351e009f59..579ce1800efe 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -96,6 +96,7 @@ struct dentry {
      struct super_block *d_sb;    /* The root of the dentry tree */
      unsigned long d_time;        /* used by d_revalidate */
      void *d_fsdata;            /* fs-specific data */
+    u32 d_offset;            /* directory offset in parent */
      union {
          struct list_head d_lru;        /* LRU list */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 133f0640fb24..3fc2c04ed8ff 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -719,6 +719,10 @@ struct inode {
  #endif
      void            *i_private; /* fs or device private pointer */
+
+    /* simplefs stable directory offset tracking */
+    struct xarray        i_doff_map;
+    u32            i_next_offset;

Hmm, I was grepping through the patches and only find that "i_next_offset" is initialized to 0 and then passed to xa_alloc_cyclic - does this really need to part of struct inode or could it be a local variable in stable_offset_add()?

Hmm, at best that is used for rename, but then is this offset right when old_dir and new_dir differ?


I only managed to look a bit through the patches right now, personally I like v2 better as it doesn't extend struct inode with changes that can be used by in-memory file system only. What do others think? An alternative would be to have these fields in struct shmem_inode_info and pass it as extra argument to the stable_ functions?


Thanks,
Bernd



[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux