* cel@xxxxxxxxxx <cel@xxxxxxxxxx> [241220 10:33]: > From: Chuck Lever <chuck.lever@xxxxxxxxxx> > > According to getdents(3), the d_off field in each returned directory > entry points to the next entry in the directory. The d_off field in > the last returned entry in the readdir buffer must contain a valid > offset value, but if it points to an actual directory entry, then > readdir/getdents can loop. > > This patch introduces a specific fixed offset value that is placed > in the d_off field of the last entry in a directory. Some user space > applications assume that the EOD offset value is larger than the > offsets of real directory entries, so the largest possible offset > value is reserved for this purpose. This new value is never > allocated by simple_offset_add(). > > When ->iterate_dir() returns, getdents{64} inserts the ctx->pos > value into the d_off field of the last valid entry in the readdir > buffer. When it hits EOD, offset_readdir() sets ctx->pos to the EOD > offset value so the last entry is updated to point to the EOD marker. > > When trying to read the entry at the EOD offset, offset_readdir() > terminates immediately. > > It is worth noting that using a Maple tree for directory offset > value allocation does not guarantee a 63-bit range of values -- > on platforms where "long" is a 32-bit type, the directory offset > value range is still 0..(2^31 - 1). I have a standing request to have 32-bit archs return 64-bit values. Is this another 'nice to have' 64 bit values on 32 bit archs? > > Fixes: 796432efab1e ("libfs: getdents() should return 0 after reaching EOD") > Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx> > --- > fs/libfs.c | 38 ++++++++++++++++++++++---------------- > 1 file changed, 22 insertions(+), 16 deletions(-) > > diff --git a/fs/libfs.c b/fs/libfs.c > index 8c9364a0174c..5c56783c03a5 100644 > --- a/fs/libfs.c > +++ b/fs/libfs.c > @@ -245,9 +245,16 @@ const struct inode_operations simple_dir_inode_operations = { > }; > EXPORT_SYMBOL(simple_dir_inode_operations); > > -/* 0 is '.', 1 is '..', so always start with offset 2 or more */ > +/* simple_offset_add() allocation range */ > enum { > - DIR_OFFSET_MIN = 2, > + DIR_OFFSET_MIN = 2, > + DIR_OFFSET_MAX = LONG_MAX - 1, > +}; > + > +/* simple_offset_add() never assigns these to a dentry */ > +enum { > + DIR_OFFSET_EOD = LONG_MAX, /* Marks EOD */ > + > }; > > static void offset_set(struct dentry *dentry, long offset) > @@ -291,7 +298,8 @@ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) > return -EBUSY; > > ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN, > - LONG_MAX, &octx->next_offset, GFP_KERNEL); > + DIR_OFFSET_MAX, &octx->next_offset, > + GFP_KERNEL); > if (unlikely(ret < 0)) > return ret == -EBUSY ? -ENOSPC : ret; > > @@ -447,8 +455,6 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) > return -EINVAL; > } > > - /* In this case, ->private_data is protected by f_pos_lock */ > - file->private_data = NULL; > return vfs_setpos(file, offset, LONG_MAX); > } > > @@ -458,7 +464,7 @@ static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset) > struct dentry *child, *found = NULL; > > rcu_read_lock(); > - child = mas_find(&mas, LONG_MAX); > + child = mas_find(&mas, DIR_OFFSET_MAX); > if (!child) > goto out; > spin_lock(&child->d_lock); > @@ -479,7 +485,7 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) > inode->i_ino, fs_umode_to_dtype(inode->i_mode)); > } > > -static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) > +static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx) > { > struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); > struct dentry *dentry; > @@ -487,7 +493,7 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) > while (true) { > dentry = offset_find_next(octx, ctx->pos); > if (!dentry) > - return ERR_PTR(-ENOENT); > + goto out_eod; > > if (!offset_dir_emit(ctx, dentry)) { > dput(dentry); > @@ -497,7 +503,10 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) > ctx->pos = dentry2offset(dentry) + 1; > dput(dentry); > } > - return NULL; > + return; > + > +out_eod: > + ctx->pos = DIR_OFFSET_EOD; > } > > /** > @@ -517,6 +526,8 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) > * > * On return, @ctx->pos contains an offset that will read the next entry > * in this directory when offset_readdir() is called again with @ctx. > + * Caller places this value in the d_off field of the last entry in the > + * user's buffer. > * > * Return values: > * %0 - Complete > @@ -529,13 +540,8 @@ static int offset_readdir(struct file *file, struct dir_context *ctx) > > if (!dir_emit_dots(file, ctx)) > return 0; > - > - /* In this case, ->private_data is protected by f_pos_lock */ > - if (ctx->pos == DIR_OFFSET_MIN) > - file->private_data = NULL; > - else if (file->private_data == ERR_PTR(-ENOENT)) > - return 0; > - file->private_data = offset_iterate_dir(d_inode(dir), ctx); > + if (ctx->pos != DIR_OFFSET_EOD) > + offset_iterate_dir(d_inode(dir), ctx); > return 0; > } > > -- > 2.47.0 > >