From: Dave Chinner <dchinner@xxxxxxxxxx> Before we can move inodes for a shrink operation, we have to find all the inodes that own space in the AG(s) we want to empty. This implementation uses FS_IOC_GETFSMAP on the assumption that filesystems to be shrunk have reverse mapping enabled as it is the only way to identify inode related metadata that userspace is unable to see or influence (e.g. BMBT blocks) that may be located in the specific AG. We can use GETFSMAP to identify both inodes to be moved (via XFS_FMR_OWN_INODES records) and inodes with just data and/or metadata to be moved. Once we have identified all the inodes to be moved, we have to map them to paths so that we can use renameat2() to exchange the directory entries pointing at the moved inode atomically. We also need to record inodes with hard links and all of the paths to the inode so that hard links can be recreated appropriately. This requires a directory tree walk to discover the paths (until parent pointers are a thing). Hence for filesystems that aren't reverse mapping enabled, we can eventually use this pass to discover inodes with visible data and metadata that need to be moved. As we resolve the paths to the inodes to be moved, output the information to stdout so that it can be acted upon by other utilities. This results in a command that acts similar to find but with a physical location filter rather than an inode metadata filter. Again, this is not meant to be an optimal implementation. It shouldn't suck, but there is plenty of scope for performance optimisation, especially with a multithreaded and/or async directory traversal/parent pointer path resolution process to hide access latencies. Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> Reviewed-by: "Darrick J. Wong" <djwong@xxxxxxxxxx> Signed-off-by: "Darrick J. Wong" <djwong@xxxxxxxxxx> --- libfrog/fsgeom.h | 19 ++ libfrog/radix-tree.c | 2 libfrog/radix-tree.h | 2 man/man8/xfs_spaceman.8 | 11 + spaceman/Makefile | 1 spaceman/find_owner.c | 481 +++++++++++++++++++++++++++++++++++++++++++++++ spaceman/init.c | 4 spaceman/space.h | 2 8 files changed, 521 insertions(+), 1 deletion(-) create mode 100644 spaceman/find_owner.c diff --git a/libfrog/fsgeom.h b/libfrog/fsgeom.h index b851b9bbf36a58..679046077cba84 100644 --- a/libfrog/fsgeom.h +++ b/libfrog/fsgeom.h @@ -97,6 +97,25 @@ cvt_ino_to_agino( return ino & ((1ULL << xfd->aginolog) - 1); } +/* Convert an AG block to an AG inode number. */ +static inline uint32_t +cvt_agbno_to_agino( + const struct xfs_fd *xfd, + xfs_agblock_t agbno) +{ + return agbno << xfd->inopblog; +} + +/* Calculate the number of inodes in a byte range */ +static inline uint32_t +cvt_b_to_inode_count( + const struct xfs_fd *xfd, + uint64_t bytes) +{ + return (bytes >> xfd->blocklog) << xfd->inopblog; +} + + /* * Convert a linear fs block offset number into bytes. This is the runtime * equivalent of XFS_FSB_TO_B, which means that it is /not/ for segmented fsbno diff --git a/libfrog/radix-tree.c b/libfrog/radix-tree.c index 261fc2487de97f..788d11612e290f 100644 --- a/libfrog/radix-tree.c +++ b/libfrog/radix-tree.c @@ -377,6 +377,8 @@ void *radix_tree_tag_set(struct radix_tree_root *root, unsigned int height, shift; struct radix_tree_node *slot; + ASSERT(tag < RADIX_TREE_MAX_TAGS); + height = root->height; if (index > radix_tree_maxindex(height)) return NULL; diff --git a/libfrog/radix-tree.h b/libfrog/radix-tree.h index 0a4e3bb4f9defc..73f41a9d902a26 100644 --- a/libfrog/radix-tree.h +++ b/libfrog/radix-tree.h @@ -28,7 +28,7 @@ do { \ } while (0) #ifdef RADIX_TREE_TAGS -#define RADIX_TREE_MAX_TAGS 2 +#define RADIX_TREE_MAX_TAGS 3 #endif int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); diff --git a/man/man8/xfs_spaceman.8 b/man/man8/xfs_spaceman.8 index f898a8bbe840ea..6fef6949aa6c8b 100644 --- a/man/man8/xfs_spaceman.8 +++ b/man/man8/xfs_spaceman.8 @@ -41,6 +41,14 @@ .SH COMMANDS If the .B -v option is given, print what's happening every step of the way. +.TP +.BI "find_owner \-a agno" +Create an internal structure to map physical space in the given allocation +group to file paths. +This enables space reorganization on a mounted filesystem by enabling +us to find files. +Unclear why we can't just use FSMAP and BULKSTAT to open by handle. + .TP .BI "freesp [ \-dgrs ] [-a agno]... [ \-b | \-e bsize | \-h bsize | \-m factor ]" With no arguments, @@ -195,6 +203,9 @@ .SH COMMANDS .B print Display a list of all open files. .TP +.B resolve_owner +Resolves space in the filesystem to file paths, maybe? +.TP .B quit Exit .BR xfs_spaceman . diff --git a/spaceman/Makefile b/spaceman/Makefile index 9d080b67de9a22..b35ab1dbd2f440 100644 --- a/spaceman/Makefile +++ b/spaceman/Makefile @@ -11,6 +11,7 @@ HFILES = \ space.h CFILES = \ file.c \ + find_owner.c \ health.c \ info.c \ init.c \ diff --git a/spaceman/find_owner.c b/spaceman/find_owner.c new file mode 100644 index 00000000000000..7a656d80d21217 --- /dev/null +++ b/spaceman/find_owner.c @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2017 Oracle. + * Copyright (c) 2020 Red Hat, Inc. + * All Rights Reserved. + */ + +#include "libxfs.h" +#include <linux/fiemap.h> +#include "libfrog/fsgeom.h" +#include "libfrog/radix-tree.h" +#include "command.h" +#include "init.h" +#include "libfrog/paths.h" +#include <linux/fsmap.h> +#include "space.h" +#include "input.h" + +static cmdinfo_t find_owner_cmd; +static cmdinfo_t resolve_owner_cmd; + +#define NR_EXTENTS 128 + +static RADIX_TREE(inode_tree, 0); +#define MOVE_INODE 0 +#define MOVE_BLOCKS 1 +#define INODE_PATH 2 +int inode_count; +int inode_paths; + +static void +track_inode_chunks( + struct xfs_fd *xfd, + xfs_agnumber_t agno, + uint64_t physaddr, + uint64_t length) +{ + xfs_agblock_t agbno = cvt_b_to_agbno(xfd, physaddr); + uint64_t first_ino = cvt_agino_to_ino(xfd, agno, + cvt_agbno_to_agino(xfd, agbno)); + uint64_t num_inodes = cvt_b_to_inode_count(xfd, length); + int i; + + printf(_("AG %d\tInode Range to move: 0x%llx - 0x%llx (length 0x%llx)\n"), + agno, + (unsigned long long)first_ino, + (unsigned long long)first_ino + num_inodes - 1, + (unsigned long long)length); + + for (i = 0; i < num_inodes; i++) { + if (!radix_tree_lookup(&inode_tree, first_ino + i)) { + radix_tree_insert(&inode_tree, first_ino + i, + (void *)first_ino + i); + inode_count++; + } + radix_tree_tag_set(&inode_tree, first_ino + i, MOVE_INODE); + } +} + +static void +track_inode( + struct xfs_fd *xfd, + xfs_agnumber_t agno, + uint64_t owner, + uint64_t physaddr, + uint64_t length) +{ + if (radix_tree_tag_get(&inode_tree, owner, MOVE_BLOCKS)) + return; + + printf(_("AG %d\tInode 0x%llx: blocks to move to move: 0x%llx - 0x%llx\n"), + agno, + (unsigned long long)owner, + (unsigned long long)physaddr, + (unsigned long long)physaddr + length - 1); + if (!radix_tree_lookup(&inode_tree, owner)) { + radix_tree_insert(&inode_tree, owner, (void *)owner); + inode_count++; + } + radix_tree_tag_set(&inode_tree, owner, MOVE_BLOCKS); +} + +static void +scan_ag( + xfs_agnumber_t agno) +{ + struct fsmap_head *fsmap; + struct fsmap *extent; + struct fsmap *l, *h; + struct fsmap *p; + struct xfs_fd *xfd = &file->xfd; + int ret; + int i; + + fsmap = malloc(fsmap_sizeof(NR_EXTENTS)); + if (!fsmap) { + fprintf(stderr, _("%s: fsmap malloc failed.\n"), progname); + exitcode = 1; + return; + } + + memset(fsmap, 0, sizeof(*fsmap)); + fsmap->fmh_count = NR_EXTENTS; + l = fsmap->fmh_keys; + h = fsmap->fmh_keys + 1; + l->fmr_physical = cvt_agbno_to_b(xfd, agno, 0); + h->fmr_physical = cvt_agbno_to_b(xfd, agno + 1, 0); + l->fmr_device = h->fmr_device = file->fs_path.fs_datadev; + h->fmr_owner = ULLONG_MAX; + h->fmr_flags = UINT_MAX; + h->fmr_offset = ULLONG_MAX; + + while (true) { + printf("Inode count %d\n", inode_count); + ret = ioctl(xfd->fd, FS_IOC_GETFSMAP, fsmap); + if (ret < 0) { + fprintf(stderr, _("%s: FS_IOC_GETFSMAP [\"%s\"]: %s\n"), + progname, file->name, strerror(errno)); + free(fsmap); + exitcode = 1; + return; + } + + /* No more extents to map, exit */ + if (!fsmap->fmh_entries) + break; + + /* + * Walk the extents, ignore everything except inode chunks + * and inode owned blocks. + */ + for (i = 0, extent = fsmap->fmh_recs; + i < fsmap->fmh_entries; + i++, extent++) { + if (extent->fmr_flags & FMR_OF_SPECIAL_OWNER) { + if (extent->fmr_owner != XFS_FMR_OWN_INODES) + continue; + /* + * This extent contains inodes that need to be + * moved into another AG. Convert the extent to + * a range of inode numbers and track them all. + */ + track_inode_chunks(xfd, agno, + extent->fmr_physical, + extent->fmr_length); + + continue; + } + + /* + * Extent is owned by an inode that may be located + * anywhere in the filesystem, not just this AG. + */ + track_inode(xfd, agno, extent->fmr_owner, + extent->fmr_physical, + extent->fmr_length); + } + + p = &fsmap->fmh_recs[fsmap->fmh_entries - 1]; + if (p->fmr_flags & FMR_OF_LAST) + break; + fsmap_advance(fsmap); + } + + free(fsmap); +} + +/* + * find inodes that own physical space in a given AG. + */ +static int +find_owner_f( + int argc, + char **argv) +{ + xfs_agnumber_t agno = -1; + int c; + + while ((c = getopt(argc, argv, "a:")) != EOF) { + switch (c) { + case 'a': + agno = cvt_u32(optarg, 10); + if (errno) { + fprintf(stderr, _("bad agno value %s\n"), + optarg); + return command_usage(&find_owner_cmd); + } + break; + default: + return command_usage(&find_owner_cmd); + } + } + + if (optind != argc) + return command_usage(&find_owner_cmd); + + if (agno == -1 || agno >= file->xfd.fsgeom.agcount) { + fprintf(stderr, +_("Destination AG %d does not exist. Filesystem only has %d AGs\n"), + agno, file->xfd.fsgeom.agcount); + exitcode = 1; + return 0; + } + + /* + * Check that rmap is enabled so that GETFSMAP is actually useful. + */ + if (!(file->xfd.fsgeom.flags & XFS_FSOP_GEOM_FLAGS_RMAPBT)) { + fprintf(stderr, +_("Filesystem at %s does not have reverse mapping enabled. Aborting.\n"), + file->fs_path.fs_dir); + exitcode = 1; + return 0; + } + + scan_ag(agno); + return 0; +} + +static void +find_owner_help(void) +{ + printf(_( +"\n" +"Find inodes owning physical blocks in a given AG.\n" +"\n" +" -a agno -- Scan the given AG agno.\n" +"\n")); + +} + +void +find_owner_init(void) +{ + find_owner_cmd.name = "find_owner"; + find_owner_cmd.altname = "fown"; + find_owner_cmd.cfunc = find_owner_f; + find_owner_cmd.argmin = 2; + find_owner_cmd.argmax = 2; + find_owner_cmd.args = "-a agno"; + find_owner_cmd.flags = CMD_FLAG_ONESHOT; + find_owner_cmd.oneline = _("Find inodes owning physical blocks in a given AG"); + find_owner_cmd.help = find_owner_help; + + add_command(&find_owner_cmd); +} + +/* + * for each dirent we get returned, look up the inode tree to see if it is an + * inode we need to process. If it is, then replace the entry in the tree with + * a structure containing the current path and mark the entry as resolved. + */ +struct inode_path { + uint64_t ino; + struct list_head path_list; + uint32_t link_count; + char path[1]; +}; + +static int +resolve_owner_cb( + const char *path, + const struct stat *stat, + int status, + struct FTW *data) +{ + struct inode_path *ipath, *slot_ipath; + int pathlen; + void **slot; + + /* + * Lookup the slot rather than the entry so we can replace the contents + * without another lookup later on. + */ + slot = radix_tree_lookup_slot(&inode_tree, stat->st_ino); + if (!slot || *slot == NULL) + return 0; + + /* Could not get stat data? Fail! */ + if (status == FTW_NS) { + fprintf(stderr, +_("Failed to obtain stat(2) information from path %s. Aborting\n"), + path); + return -EPERM; + } + + /* Allocate a new inode path and record the path in it. */ + pathlen = strlen(path); + ipath = calloc(1, sizeof(*ipath) + pathlen + 1); + if (!ipath) { + fprintf(stderr, +_("Aborting: Storing path %s for inode 0x%lx failed: %s\n"), + path, stat->st_ino, strerror(ENOMEM)); + return -ENOMEM; + } + INIT_LIST_HEAD(&ipath->path_list); + memcpy(&ipath->path[0], path, pathlen); + ipath->ino = stat->st_ino; + + /* + * If the slot contains the inode number we just looked up, then we + * haven't recorded a path for it yet. If that is the case, we just + * set the link count of the path to 1 and replace the slot contents + * with our new_ipath. + */ + if (stat->st_ino == (uint64_t)*slot) { + ipath->link_count = 1; + *slot = ipath; + radix_tree_tag_set(&inode_tree, stat->st_ino, INODE_PATH); + inode_paths++; + return 0; + } + + /* + * Multiple hard links to this inode. The slot already contains an + * ipath pointer, so we add the new ipath to the tail of the list held + * by the slot's ipath and bump the link count of the slot's ipath to + * keep track of how many hard links the inode has. + */ + slot_ipath = *slot; + slot_ipath->link_count++; + list_add_tail(&ipath->path_list, &slot_ipath->path_list); + return 0; +} + +/* + * This should be parallelised - pass subdirs off to a work queue, have the + * work queue processes subdirs, queueing more subdirs to work on. + */ +static int +walk_mount( + const char *mntpt) +{ + int ret; + + ret = nftw(mntpt, resolve_owner_cb, + 100, FTW_PHYS | FTW_MOUNT | FTW_DEPTH); + if (ret) + return -errno; + return 0; +} + +static int +list_inode_paths(void) +{ + struct inode_path *ipath; + uint64_t idx = 0; + int ret; + + do { + bool move_blocks; + bool move_inode; + + ret = radix_tree_gang_lookup_tag(&inode_tree, (void **)&ipath, + idx, 1, INODE_PATH); + if (!ret) + break; + idx = ipath->ino + 1; + + /* Grab status tags and remove from tree. */ + move_blocks = radix_tree_tag_get(&inode_tree, ipath->ino, + MOVE_BLOCKS); + move_inode = radix_tree_tag_get(&inode_tree, ipath->ino, + MOVE_INODE); + radix_tree_delete(&inode_tree, ipath->ino); + + /* Print the initial path with inode number and state. */ + printf("0x%.16llx\t%s\t%s\t%8d\t%s\n", + (unsigned long long)ipath->ino, + move_blocks ? "BLOCK" : "---", + move_inode ? "INODE" : "---", + ipath->link_count, ipath->path); + ipath->link_count--; + + /* Walk all the hard link paths and emit them. */ + while (!list_empty(&ipath->path_list)) { + struct inode_path *hpath; + + hpath = list_first_entry(&ipath->path_list, + struct inode_path, path_list); + list_del(&hpath->path_list); + ipath->link_count--; + + printf("\t\t\t\t\t%s\n", hpath->path); + } + if (ipath->link_count) { + printf(_("Link count anomaly: %d paths left over\n"), + ipath->link_count); + } + free(ipath); + } while (true); + + /* + * Any inodes remaining in the tree at this point indicate inodes whose + * paths were not found. This will be unlinked but still open inodes or + * lost inodes due to corruptions. Either way, a shrink will not succeed + * until these inodes are removed from the filesystem. + */ + idx = 0; + do { + uint64_t ino; + + + ret = radix_tree_gang_lookup(&inode_tree, (void **)&ino, idx, 1); + if (!ret) { + if (idx != 0) + ret = -EBUSY; + break; + } + idx = ino + 1; + printf(_("No path found for inode 0x%llx!\n"), + (unsigned long long)ino); + radix_tree_delete(&inode_tree, ino); + } while (true); + + return ret; +} + +/* + * Resolve inode numbers to paths via a directory tree walk. + */ +static int +resolve_owner_f( + int argc, + char **argv) +{ + int ret; + + if (!inode_tree.rnode) { + fprintf(stderr, +_("Inode list has not been populated. No inodes to resolve.\n")); + return 0; + } + + ret = walk_mount(file->fs_path.fs_dir); + if (ret) { + fprintf(stderr, +_("Failed to resolve all paths from mount point %s: %s\n"), + file->fs_path.fs_dir, strerror(-ret)); + exitcode = 1; + return 0; + } + + ret = list_inode_paths(); + if (ret) { + fprintf(stderr, +_("Failed to list all resolved paths from mount point %s: %s\n"), + file->fs_path.fs_dir, strerror(-ret)); + exitcode = 1; + return 0; + } + return 0; +} + +static void +resolve_owner_help(void) +{ + printf(_( +"\n" +"Resolve inodes owning physical blocks in a given AG.\n" +"This requires the find_owner command to be run first to populate the table\n" +"of inodes that need to have their paths resolved.\n" +"\n")); + +} + +void +resolve_owner_init(void) +{ + resolve_owner_cmd.name = "resolve_owner"; + resolve_owner_cmd.altname = "rown"; + resolve_owner_cmd.cfunc = resolve_owner_f; + resolve_owner_cmd.argmin = 0; + resolve_owner_cmd.argmax = 0; + resolve_owner_cmd.args = ""; + resolve_owner_cmd.flags = CMD_FLAG_ONESHOT; + resolve_owner_cmd.oneline = _("Resolve patches to inodes owning physical blocks in a given AG"); + resolve_owner_cmd.help = resolve_owner_help; + + add_command(&resolve_owner_cmd); +} diff --git a/spaceman/init.c b/spaceman/init.c index dbeebcf97b9fb2..8b0af14e566dc8 100644 --- a/spaceman/init.c +++ b/spaceman/init.c @@ -10,6 +10,7 @@ #include "input.h" #include "init.h" #include "libfrog/paths.h" +#include "libfrog/radix-tree.h" #include "space.h" char *progname; @@ -37,6 +38,8 @@ init_commands(void) health_init(); clearfree_init(); move_inode_init(); + find_owner_init(); + resolve_owner_init(); } static int @@ -71,6 +74,7 @@ init( setlocale(LC_ALL, ""); bindtextdomain(PACKAGE, LOCALEDIR); textdomain(PACKAGE); + radix_tree_init(); fs_table_initialise(0, NULL, 0, NULL); while ((c = getopt(argc, argv, "c:p:V")) != EOF) { diff --git a/spaceman/space.h b/spaceman/space.h index 96c3c356f13fec..cffb1882153a18 100644 --- a/spaceman/space.h +++ b/spaceman/space.h @@ -39,5 +39,7 @@ extern void clearfree_init(void); extern void info_init(void); extern void health_init(void); void move_inode_init(void); +void find_owner_init(void); +void resolve_owner_init(void); #endif /* XFS_SPACEMAN_SPACE_H_ */