From: Darrick J. Wong <djwong@xxxxxxxxxx> In a few patchsets from now, we'll transition xfs_repair to use memfd-backed rmap and rcbag btrees for storing repair data instead of heap allocations. This allows repair to use libxfs code shared from the online repair code, which reduces the size of the codebase. It also reduces heap fragmentation, which might be critical on 32-bit systems. However, there's one hitch -- userspace xfiles naively allocate one memfd per data structure, but there's only so many file descriptors that a process can open. If a filesystem has a lot of allocation groups, we can run out of fds and fail. xfs_repair already tries to increase RLIMIT_NOFILE to the maximum (~1M) but this can fail due to system or memory constraints. Fortunately, it is possible to compute the upper bound of a memfd btree, which implies that we can store multiple btrees per memfd. Make it so that we can partition a memfd file to avoid running out of file descriptors. Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx> --- v3.1: improve commit message to explain why we need this --- libxfs/xfile.c | 197 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- libxfs/xfile.h | 17 ++++- 2 files changed, 205 insertions(+), 9 deletions(-) diff --git a/libxfs/xfile.c b/libxfs/xfile.c index cba173cc17f1..fdb76f406647 100644 --- a/libxfs/xfile.c +++ b/libxfs/xfile.c @@ -97,6 +97,149 @@ xfile_create_fd( return fd; } +static LIST_HEAD(fcb_list); +static pthread_mutex_t fcb_mutex = PTHREAD_MUTEX_INITIALIZER; + +/* Create a new memfd. */ +static inline int +xfile_fcb_create( + const char *description, + struct xfile_fcb **fcbp) +{ + struct xfile_fcb *fcb; + int fd; + + fd = xfile_create_fd(description); + if (fd < 0) + return -errno; + + fcb = malloc(sizeof(struct xfile_fcb)); + if (!fcb) { + close(fd); + return -ENOMEM; + } + + list_head_init(&fcb->fcb_list); + fcb->fd = fd; + fcb->refcount = 1; + + *fcbp = fcb; + return 0; +} + +/* Release an xfile control block */ +static void +xfile_fcb_irele( + struct xfile_fcb *fcb, + loff_t pos, + uint64_t len) +{ + /* + * If this memfd is linked only to itself, it's private, so we can + * close it without taking any locks. + */ + if (list_empty(&fcb->fcb_list)) { + close(fcb->fd); + free(fcb); + return; + } + + pthread_mutex_lock(&fcb_mutex); + if (--fcb->refcount == 0) { + /* If we're the last user of this memfd file, kill it fast. */ + list_del(&fcb->fcb_list); + close(fcb->fd); + free(fcb); + } else if (len > 0) { + struct stat statbuf; + int ret; + + /* + * If we were using the end of a partitioned file, free the + * address space. IOWs, bonus points if you delete these in + * reverse-order of creation. + */ + ret = fstat(fcb->fd, &statbuf); + if (!ret && statbuf.st_size == pos + len) { + ret = ftruncate(fcb->fd, pos); + } + } + pthread_mutex_unlock(&fcb_mutex); +} + +/* + * Find an memfd that can accomodate the given amount of address space. + */ +static int +xfile_fcb_find( + const char *description, + uint64_t maxbytes, + loff_t *posp, + struct xfile_fcb **fcbp) +{ + struct xfile_fcb *fcb; + int ret; + int error; + + /* No maximum range means that the caller gets a private memfd. */ + if (maxbytes == 0) { + *posp = 0; + return xfile_fcb_create(description, fcbp); + } + + /* round up to page granularity so we can do mmap */ + maxbytes = roundup_64(maxbytes, PAGE_SIZE); + + pthread_mutex_lock(&fcb_mutex); + + /* + * If we only need a certain number of byte range, look for one with + * available file range. + */ + list_for_each_entry(fcb, &fcb_list, fcb_list) { + struct stat statbuf; + loff_t pos; + + ret = fstat(fcb->fd, &statbuf); + if (ret) + continue; + pos = roundup_64(statbuf.st_size, PAGE_SIZE); + + /* + * Truncate up to ensure that the memfd can actually handle + * writes to the end of the range. + */ + ret = ftruncate(fcb->fd, pos + maxbytes); + if (ret) + continue; + + fcb->refcount++; + *posp = pos; + *fcbp = fcb; + goto out_unlock; + } + + /* Otherwise, open a new memfd and add it to our list. */ + error = xfile_fcb_create(description, &fcb); + if (error) + return error; + + ret = ftruncate(fcb->fd, maxbytes); + if (ret) { + error = -errno; + xfile_fcb_irele(fcb, 0, maxbytes); + return error; + } + + list_add_tail(&fcb->fcb_list, &fcb_list); + *posp = 0; + *fcbp = fcb; + +out_unlock: + pthread_mutex_unlock(&fcb_mutex); + return error; +} + /* * Create an xfile of the given size. The description will be used in the * trace output. @@ -104,6 +247,7 @@ xfile_create_fd( int xfile_create( const char *description, + unsigned long long maxbytes, struct xfile **xfilep) { struct xfile *xf; @@ -113,13 +257,14 @@ xfile_create( if (!xf) return -ENOMEM; - xf->fd = xfile_create_fd(description); - if (xf->fd < 0) { - error = -errno; + error = xfile_fcb_find(description, maxbytes, &xf->partition_pos, + &xf->fcb); + if (error) { kfree(xf); return error; } + xf->maxbytes = maxbytes; *xfilep = xf; return 0; } @@ -129,7 +274,7 @@ void xfile_destroy( struct xfile *xf) { - close(xf->fd); + xfile_fcb_irele(xf->fcb, xf->partition_pos, xf->maxbytes); kfree(xf); } @@ -137,6 +282,9 @@ static inline loff_t xfile_maxbytes( struct xfile *xf) { + if (xf->maxbytes > 0) + return xf->maxbytes; + if (sizeof(loff_t) == 8) return LLONG_MAX; return LONG_MAX; @@ -160,7 +308,7 @@ xfile_load( if (xfile_maxbytes(xf) - pos < count) return -ENOMEM; - ret = pread(xf->fd, buf, count, pos); + ret = pread(xf->fcb->fd, buf, count, pos + xf->partition_pos); if (ret < 0) return -errno; if (ret != count) @@ -186,7 +334,7 @@ xfile_store( if (xfile_maxbytes(xf) - pos < count) return -EFBIG; - ret = pwrite(xf->fd, buf, count, pos); + ret = pwrite(xf->fcb->fd, buf, count, pos + xf->partition_pos); if (ret < 0) return -errno; if (ret != count) @@ -194,6 +342,38 @@ xfile_store( return 0; } +/* Compute the number of bytes used by a partitioned xfile. */ +static unsigned long long +xfile_partition_bytes( + struct xfile *xf) +{ + loff_t data_pos = xf->partition_pos; + loff_t stop_pos = data_pos + xf->maxbytes; + loff_t hole_pos; + unsigned long long bytes = 0; + + data_pos = lseek(xf->fcb->fd, data_pos, SEEK_DATA); + while (data_pos >= 0 && data_pos < stop_pos) { + hole_pos = lseek(xf->fcb->fd, data_pos, SEEK_HOLE); + if (hole_pos < 0) { + /* save error, break */ + data_pos = hole_pos; + break; + } + if (hole_pos >= stop_pos) { + bytes += stop_pos - data_pos; + return bytes; + } + bytes += hole_pos - data_pos; + + data_pos = lseek(xf->fcb->fd, hole_pos, SEEK_DATA); + } + if (data_pos < 0 && errno != ENXIO) + return xf->maxbytes; + + return bytes; +} + /* Compute the number of bytes used by a xfile. */ unsigned long long xfile_bytes( @@ -202,7 +382,10 @@ xfile_bytes( struct stat statbuf; int error; - error = fstat(xf->fd, &statbuf); + if (xf->maxbytes > 0) + return xfile_partition_bytes(xf); + + error = fstat(xf->fcb->fd, &statbuf); if (error) return -errno; diff --git a/libxfs/xfile.h b/libxfs/xfile.h index d60084011357..180a42bbbaa2 100644 --- a/libxfs/xfile.h +++ b/libxfs/xfile.h @@ -6,11 +6,24 @@ #ifndef __LIBXFS_XFILE_H__ #define __LIBXFS_XFILE_H__ -struct xfile { +struct xfile_fcb { + struct list_head fcb_list; int fd; + unsigned int refcount; }; -int xfile_create(const char *description, struct xfile **xfilep); +struct xfile { + struct xfile_fcb *fcb; + + /* File position within fcb->fd where this partition starts */ + loff_t partition_pos; + + /* Maximum number of bytes that can be written to the partition. */ + uint64_t maxbytes; +}; + +int xfile_create(const char *description, unsigned long long maxbytes, + struct xfile **xfilep); void xfile_destroy(struct xfile *xf); ssize_t xfile_load(struct xfile *xf, void *buf, size_t count, loff_t pos);