On Fri, May 31, 2024 at 11:30:07AM +0200, Carlos Maiolino wrote: > On Tue, May 21, 2024 at 08:12:14PM GMT, Darrick J. Wong wrote: > > From: Darrick J. Wong <djwong@xxxxxxxxxx> > > > > In a few patchsets from now, we'll transition xfs_repair to use > > memfd-backed rmap and rcbag btrees for storing repair data instead of > > heap allocations. This allows repair to use libxfs code shared from the > > online repair code, which reduces the size of the codebase. It also > > reduces heap fragmentation, which might be critical on 32-bit systems. > > > > However, there's one hitch -- userspace xfiles naively allocate one > > memfd per data structure, but there's only so many file descriptors that > > a process can open. If a filesystem has a lot of allocation groups, we > > can run out of fds and fail. xfs_repair already tries to increase > > RLIMIT_NOFILE to the maximum (~1M) but this can fail due to system or > > memory constraints. > > > > Fortunately, it is possible to compute the upper bound of a memfd btree, > > which implies that we can store multiple btrees per memfd. Make it so > > that we can partition a memfd file to avoid running out of file > > descriptors. > > > > Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx> > > I usually see bigger problems on those filesystems like mount timing > out, before users reach system's max fd limit, but I'm ok with it if > you've seen it in the wild. Giving that - in theory - cloud instances > resources are pretty limited, we could get to pretty small max fd > limits, so it seems fair to me. Yes indeed you can hit the practical limits on a VM with not a lot of memory and a severely overgrowfs'd filesystem. > Reviewed-by: Carlos Maiolino <cmaiolino@xxxxxxxxxx> Thanks! --D > > --- > > libxfs/xfile.c | 197 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- > > libxfs/xfile.h | 17 ++++- > > 2 files changed, 205 insertions(+), 9 deletions(-) > > > > > > diff --git a/libxfs/xfile.c b/libxfs/xfile.c > > index cba173cc1..fdb76f406 100644 > > --- a/libxfs/xfile.c > > +++ b/libxfs/xfile.c > > @@ -97,6 +97,149 @@ xfile_create_fd( > > return fd; > > } > > > > +static LIST_HEAD(fcb_list); > > +static pthread_mutex_t fcb_mutex = PTHREAD_MUTEX_INITIALIZER; > > + > > +/* Create a new memfd. */ > > +static inline int > > +xfile_fcb_create( > > + const char *description, > > + struct xfile_fcb **fcbp) > > +{ > > + struct xfile_fcb *fcb; > > + int fd; > > + > > + fd = xfile_create_fd(description); > > + if (fd < 0) > > + return -errno; > > + > > + fcb = malloc(sizeof(struct xfile_fcb)); > > + if (!fcb) { > > + close(fd); > > + return -ENOMEM; > > + } > > + > > + list_head_init(&fcb->fcb_list); > > + fcb->fd = fd; > > + fcb->refcount = 1; > > + > > + *fcbp = fcb; > > + return 0; > > +} > > + > > +/* Release an xfile control block */ > > +static void > > +xfile_fcb_irele( > > + struct xfile_fcb *fcb, > > + loff_t pos, > > + uint64_t len) > > +{ > > + /* > > + * If this memfd is linked only to itself, it's private, so we can > > + * close it without taking any locks. > > + */ > > + if (list_empty(&fcb->fcb_list)) { > > + close(fcb->fd); > > + free(fcb); > > + return; > > + } > > + > > + pthread_mutex_lock(&fcb_mutex); > > + if (--fcb->refcount == 0) { > > + /* If we're the last user of this memfd file, kill it fast. */ > > + list_del(&fcb->fcb_list); > > + close(fcb->fd); > > + free(fcb); > > + } else if (len > 0) { > > + struct stat statbuf; > > + int ret; > > + > > + /* > > + * If we were using the end of a partitioned file, free the > > + * address space. IOWs, bonus points if you delete these in > > + * reverse-order of creation. > > + */ > > + ret = fstat(fcb->fd, &statbuf); > > + if (!ret && statbuf.st_size == pos + len) { > > + ret = ftruncate(fcb->fd, pos); > > + } > > + } > > + pthread_mutex_unlock(&fcb_mutex); > > +} > > + > > +/* > > + * Find an memfd that can accomodate the given amount of address space. > > + */ > > +static int > > +xfile_fcb_find( > > + const char *description, > > + uint64_t maxbytes, > > + loff_t *posp, > > + struct xfile_fcb **fcbp) > > +{ > > + struct xfile_fcb *fcb; > > + int ret; > > + int error; > > + > > + /* No maximum range means that the caller gets a private memfd. */ > > + if (maxbytes == 0) { > > + *posp = 0; > > + return xfile_fcb_create(description, fcbp); > > + } > > + > > + /* round up to page granularity so we can do mmap */ > > + maxbytes = roundup_64(maxbytes, PAGE_SIZE); > > + > > + pthread_mutex_lock(&fcb_mutex); > > + > > + /* > > + * If we only need a certain number of byte range, look for one with > > + * available file range. > > + */ > > + list_for_each_entry(fcb, &fcb_list, fcb_list) { > > + struct stat statbuf; > > + loff_t pos; > > + > > + ret = fstat(fcb->fd, &statbuf); > > + if (ret) > > + continue; > > + pos = roundup_64(statbuf.st_size, PAGE_SIZE); > > + > > + /* > > + * Truncate up to ensure that the memfd can actually handle > > + * writes to the end of the range. > > + */ > > + ret = ftruncate(fcb->fd, pos + maxbytes); > > + if (ret) > > + continue; > > + > > + fcb->refcount++; > > + *posp = pos; > > + *fcbp = fcb; > > + goto out_unlock; > > + } > > + > > + /* Otherwise, open a new memfd and add it to our list. */ > > + error = xfile_fcb_create(description, &fcb); > > + if (error) > > + return error; > > + > > + ret = ftruncate(fcb->fd, maxbytes); > > + if (ret) { > > + error = -errno; > > + xfile_fcb_irele(fcb, 0, maxbytes); > > + return error; > > + } > > + > > + list_add_tail(&fcb->fcb_list, &fcb_list); > > + *posp = 0; > > + *fcbp = fcb; > > + > > +out_unlock: > > + pthread_mutex_unlock(&fcb_mutex); > > + return error; > > +} > > + > > /* > > * Create an xfile of the given size. The description will be used in the > > * trace output. > > @@ -104,6 +247,7 @@ xfile_create_fd( > > int > > xfile_create( > > const char *description, > > + unsigned long long maxbytes, > > struct xfile **xfilep) > > { > > struct xfile *xf; > > @@ -113,13 +257,14 @@ xfile_create( > > if (!xf) > > return -ENOMEM; > > > > - xf->fd = xfile_create_fd(description); > > - if (xf->fd < 0) { > > - error = -errno; > > + error = xfile_fcb_find(description, maxbytes, &xf->partition_pos, > > + &xf->fcb); > > + if (error) { > > kfree(xf); > > return error; > > } > > > > + xf->maxbytes = maxbytes; > > *xfilep = xf; > > return 0; > > } > > @@ -129,7 +274,7 @@ void > > xfile_destroy( > > struct xfile *xf) > > { > > - close(xf->fd); > > + xfile_fcb_irele(xf->fcb, xf->partition_pos, xf->maxbytes); > > kfree(xf); > > } > > > > @@ -137,6 +282,9 @@ static inline loff_t > > xfile_maxbytes( > > struct xfile *xf) > > { > > + if (xf->maxbytes > 0) > > + return xf->maxbytes; > > + > > if (sizeof(loff_t) == 8) > > return LLONG_MAX; > > return LONG_MAX; > > @@ -160,7 +308,7 @@ xfile_load( > > if (xfile_maxbytes(xf) - pos < count) > > return -ENOMEM; > > > > - ret = pread(xf->fd, buf, count, pos); > > + ret = pread(xf->fcb->fd, buf, count, pos + xf->partition_pos); > > if (ret < 0) > > return -errno; > > if (ret != count) > > @@ -186,7 +334,7 @@ xfile_store( > > if (xfile_maxbytes(xf) - pos < count) > > return -EFBIG; > > > > - ret = pwrite(xf->fd, buf, count, pos); > > + ret = pwrite(xf->fcb->fd, buf, count, pos + xf->partition_pos); > > if (ret < 0) > > return -errno; > > if (ret != count) > > @@ -194,6 +342,38 @@ xfile_store( > > return 0; > > } > > > > +/* Compute the number of bytes used by a partitioned xfile. */ > > +static unsigned long long > > +xfile_partition_bytes( > > + struct xfile *xf) > > +{ > > + loff_t data_pos = xf->partition_pos; > > + loff_t stop_pos = data_pos + xf->maxbytes; > > + loff_t hole_pos; > > + unsigned long long bytes = 0; > > + > > + data_pos = lseek(xf->fcb->fd, data_pos, SEEK_DATA); > > + while (data_pos >= 0 && data_pos < stop_pos) { > > + hole_pos = lseek(xf->fcb->fd, data_pos, SEEK_HOLE); > > + if (hole_pos < 0) { > > + /* save error, break */ > > + data_pos = hole_pos; > > + break; > > + } > > + if (hole_pos >= stop_pos) { > > + bytes += stop_pos - data_pos; > > + return bytes; > > + } > > + bytes += hole_pos - data_pos; > > + > > + data_pos = lseek(xf->fcb->fd, hole_pos, SEEK_DATA); > > + } > > + if (data_pos < 0 && errno != ENXIO) > > + return xf->maxbytes; > > + > > + return bytes; > > +} > > + > > /* Compute the number of bytes used by a xfile. */ > > unsigned long long > > xfile_bytes( > > @@ -202,7 +382,10 @@ xfile_bytes( > > struct stat statbuf; > > int error; > > > > - error = fstat(xf->fd, &statbuf); > > + if (xf->maxbytes > 0) > > + return xfile_partition_bytes(xf); > > + > > + error = fstat(xf->fcb->fd, &statbuf); > > if (error) > > return -errno; > > > > diff --git a/libxfs/xfile.h b/libxfs/xfile.h > > index d60084011..180a42bbb 100644 > > --- a/libxfs/xfile.h > > +++ b/libxfs/xfile.h > > @@ -6,11 +6,24 @@ > > #ifndef __LIBXFS_XFILE_H__ > > #define __LIBXFS_XFILE_H__ > > > > -struct xfile { > > +struct xfile_fcb { > > + struct list_head fcb_list; > > int fd; > > + unsigned int refcount; > > }; > > > > -int xfile_create(const char *description, struct xfile **xfilep); > > +struct xfile { > > + struct xfile_fcb *fcb; > > + > > + /* File position within fcb->fd where this partition starts */ > > + loff_t partition_pos; > > + > > + /* Maximum number of bytes that can be written to the partition. */ > > + uint64_t maxbytes; > > +}; > > + > > +int xfile_create(const char *description, unsigned long long maxbytes, > > + struct xfile **xfilep); > > void xfile_destroy(struct xfile *xf); > > > > ssize_t xfile_load(struct xfile *xf, void *buf, size_t count, loff_t pos); > > >