Re: [PATCH v3 1/3] libfs: Add directory operations for stable offsets

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 





On 6/26/23 16:50, Chuck Lever III wrote:


On Jun 26, 2023, at 9:36 AM, Bernd Schubert <bernd.schubert@xxxxxxxxxxx> wrote:



On 6/24/23 00:21, Bernd Schubert wrote:
On 6/6/23 15:10, Chuck Lever wrote:
From: Chuck Lever <chuck.lever@xxxxxxxxxx>

Create a vector of directory operations in fs/libfs.c that handles
directory seeks and readdir via stable offsets instead of the
current cursor-based mechanism.

For the moment these are unused.

Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
---
   fs/dcache.c            |    1
   fs/libfs.c             |  185 ++++++++++++++++++++++++++++++++++++++++++++++++
   include/linux/dcache.h |    1
   include/linux/fs.h     |    9 ++
   4 files changed, 196 insertions(+)

diff --git a/fs/dcache.c b/fs/dcache.c
index 52e6d5fdab6b..9c9a801f3b33 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1813,6 +1813,7 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
       dentry->d_sb = sb;
       dentry->d_op = NULL;
       dentry->d_fsdata = NULL;
+    dentry->d_offset = 0;
       INIT_HLIST_BL_NODE(&dentry->d_hash);
       INIT_LIST_HEAD(&dentry->d_lru);
       INIT_LIST_HEAD(&dentry->d_subdirs);
diff --git a/fs/libfs.c b/fs/libfs.c
index 89cf614a3271..07317bbe1668 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -239,6 +239,191 @@ const struct inode_operations simple_dir_inode_operations = {
   };
   EXPORT_SYMBOL(simple_dir_inode_operations);
+/**
+ * stable_offset_init - initialize a parent directory
+ * @dir: parent directory to be initialized
+ *
+ */
+void stable_offset_init(struct inode *dir)
+{
+    xa_init_flags(&dir->i_doff_map, XA_FLAGS_ALLOC1);
+    dir->i_next_offset = 0;
+}
+EXPORT_SYMBOL(stable_offset_init);
+
+/**
+ * stable_offset_add - Add an entry to a directory's stable offset map
+ * @dir: parent directory being modified
+ * @dentry: new dentry being added
+ *
+ * Returns zero on success. Otherwise, a negative errno value is returned.
+ */
+int stable_offset_add(struct inode *dir, struct dentry *dentry)
+{
+    struct xa_limit limit = XA_LIMIT(2, U32_MAX);
+    u32 offset = 0;
+    int ret;
+
+    if (dentry->d_offset)
+        return -EBUSY;
+
+    ret = xa_alloc_cyclic(&dir->i_doff_map, &offset, dentry, limit,
+                  &dir->i_next_offset, GFP_KERNEL);
Please see below at struct inode my question about i_next_offset.
+    if (ret < 0)
+        return ret;
+
+    dentry->d_offset = offset;
+    return 0;
+}
+EXPORT_SYMBOL(stable_offset_add);
+
+/**
+ * stable_offset_remove - Remove an entry to a directory's stable offset map
+ * @dir: parent directory being modified
+ * @dentry: dentry being removed
+ *
+ */
+void stable_offset_remove(struct inode *dir, struct dentry *dentry)
+{
+    if (!dentry->d_offset)
+        return;
+
+    xa_erase(&dir->i_doff_map, dentry->d_offset);
+    dentry->d_offset = 0;
+}
+EXPORT_SYMBOL(stable_offset_remove);
+
+/**
+ * stable_offset_destroy - Release offset map
+ * @dir: parent directory that is about to be destroyed
+ *
+ * During fs teardown (eg. umount), a directory's offset map might still
+ * contain entries. xa_destroy() cleans out anything that remains.
+ */
+void stable_offset_destroy(struct inode *dir)
+{
+    xa_destroy(&dir->i_doff_map);
+}
+EXPORT_SYMBOL(stable_offset_destroy);
+
+/**
+ * stable_dir_llseek - Advance the read position of a directory descriptor
+ * @file: an open directory whose position is to be updated
+ * @offset: a byte offset
+ * @whence: enumerator describing the starting position for this update
+ *
+ * SEEK_END, SEEK_DATA, and SEEK_HOLE are not supported for directories.
+ *
+ * Returns the updated read position if successful; otherwise a
+ * negative errno is returned and the read position remains unchanged.
+ */
+static loff_t stable_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+    switch (whence) {
+    case SEEK_CUR:
+        offset += file->f_pos;
+        fallthrough;
+    case SEEK_SET:
+        if (offset >= 0)
+            break;
+        fallthrough;
+    default:
+        return -EINVAL;
+    }
+
+    return vfs_setpos(file, offset, U32_MAX);
+}
+
+static struct dentry *stable_find_next(struct xa_state *xas)
+{
+    struct dentry *child, *found = NULL;
+
+    rcu_read_lock();
+    child = xas_next_entry(xas, U32_MAX);
+    if (!child)
+        goto out;
+    spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
+    if (simple_positive(child))
+        found = dget_dlock(child);
+    spin_unlock(&child->d_lock);
+out:
+    rcu_read_unlock();
+    return found;
+}
+
+static bool stable_dir_emit(struct dir_context *ctx, struct dentry *dentry)
+{
+    struct inode *inode = d_inode(dentry);
+
+    return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len,
+              dentry->d_offset, inode->i_ino,
+              fs_umode_to_dtype(inode->i_mode));
+}
+
+static void stable_iterate_dir(struct dentry *dir, struct dir_context *ctx)
+{
+    XA_STATE(xas, &((d_inode(dir))->i_doff_map), ctx->pos);
+    struct dentry *dentry;
+
+    while (true) {
+        spin_lock(&dir->d_lock);
+        dentry = stable_find_next(&xas);
+        spin_unlock(&dir->d_lock);
+        if (!dentry)
+            break;
+
+        if (!stable_dir_emit(ctx, dentry)) {
+            dput(dentry);
+            break;
+        }
+
+        dput(dentry);
+        ctx->pos = xas.xa_index + 1;
+    }
+}
+
+/**
+ * stable_readdir - Emit entries starting at offset @ctx->pos
+ * @file: an open directory to iterate over
+ * @ctx: directory iteration context
+ *
+ * Caller must hold @file's i_rwsem to prevent insertion or removal of
+ * entries during this call.
+ *
+ * On entry, @ctx->pos contains an offset that represents the first entry
+ * to be read from the directory.
+ *
+ * The operation continues until there are no more entries to read, or
+ * until the ctx->actor indicates there is no more space in the caller's
+ * output buffer.
+ *
+ * On return, @ctx->pos contains an offset that will read the next entry
+ * in this directory when shmem_readdir() is called again with @ctx.
+ *
+ * Return values:
+ *   %0 - Complete
+ */
+static int stable_readdir(struct file *file, struct dir_context *ctx)
+{
+    struct dentry *dir = file->f_path.dentry;
+
+    lockdep_assert_held(&d_inode(dir)->i_rwsem);
+
+    if (!dir_emit_dots(file, ctx))
+        return 0;
+
+    stable_iterate_dir(dir, ctx);
+    return 0;
+}
+
+const struct file_operations stable_dir_operations = {
+    .llseek        = stable_dir_llseek,
+    .iterate_shared    = stable_readdir,
+    .read        = generic_read_dir,
+    .fsync        = noop_fsync,
+};
+EXPORT_SYMBOL(stable_dir_operations);
+
   static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
   {
       struct dentry *child = NULL;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 6b351e009f59..579ce1800efe 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -96,6 +96,7 @@ struct dentry {
       struct super_block *d_sb;    /* The root of the dentry tree */
       unsigned long d_time;        /* used by d_revalidate */
       void *d_fsdata;            /* fs-specific data */
+    u32 d_offset;            /* directory offset in parent */
       union {
           struct list_head d_lru;        /* LRU list */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 133f0640fb24..3fc2c04ed8ff 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -719,6 +719,10 @@ struct inode {
   #endif
       void            *i_private; /* fs or device private pointer */
+
+    /* simplefs stable directory offset tracking */
+    struct xarray        i_doff_map;
+    u32            i_next_offset;
Hmm, I was grepping through the patches and only find that "i_next_offset" is initialized to 0 and then passed to xa_alloc_cyclic - does this really need to part of struct inode or could it be a local variable in stable_offset_add()?

This is a per-directory value so that each directory can use
the full range of offsets (from zero to UINT_MAX). If there were
only a single next_offset field, then all tmpfs directories
would share the offset range, which is not scalable (not to
mention, it would also be unlike the behavior of other
filesystems).

Yes, we could move this to the shmem-private part of the inode,
but then the API gets a little uglier.

Oh right, sorry, I was reading this wrong as per dentry value.



I only managed to look a bit through the patches right now, personally I like v2 better as it doesn't extend struct inode with changes that can be used by in-memory file system only. What do others think? An alternative would be to have these fields in struct shmem_inode_info and pass it as extra argument to the stable_ functions?



[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux