From: Darrick J. Wong <djwong@xxxxxxxxxx> Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx> --- libfrog/clearspace.c | 377 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 297 insertions(+), 80 deletions(-) diff --git a/libfrog/clearspace.c b/libfrog/clearspace.c index 601257022b8..452cd13db45 100644 --- a/libfrog/clearspace.c +++ b/libfrog/clearspace.c @@ -1362,6 +1362,17 @@ csp_evac_dedupe_loop( fdr->src_length = old_reqlen; continue; } + if (ret == EINVAL) { + /* + * If we can't dedupe the block, it's possible that + * src_fd was punched or truncated out from under us. + * Treat this the same way we would if the contents + * didn't match. + */ + trace_dedupe(req, "cannot evac space, moving on", 0); + same = false; + ret = 0; + } if (ret) { fprintf(stderr, _("evacuating inode 0x%llx: %s\n"), ino, strerror(ret)); @@ -1939,8 +1950,14 @@ csp_evac_fs_metadata( * the space capture file, or 1 if there's nothing to transfer to the space * capture file. */ -static int -csp_freeze_check_attempt( +enum freeze_outcome { + FREEZE_FAILED = -1, + FREEZE_DONE, + FREEZE_SKIP, +}; + +static enum freeze_outcome +csp_freeze_check_outcome( struct clearspace_req *req, const struct fsmap *mrec, unsigned long long *len) @@ -1950,13 +1967,12 @@ csp_freeze_check_attempt( *len = 0; - ret = bmapx_one(req, req->work_fd, mrec->fmr_physical, - mrec->fmr_length, &brec); + ret = bmapx_one(req, req->work_fd, 0, mrec->fmr_length, &brec); if (ret) - return ret; + return FREEZE_FAILED; trace_freeze(req, - "does workfd pos 0x%llx len 0x%llx map to phys 0x%llx len 0x%llx?", + "check if workfd pos 0x0 phys 0x%llx len 0x%llx maps to phys 0x%llx len 0x%llx", (unsigned long long)mrec->fmr_physical, (unsigned long long)mrec->fmr_length, (unsigned long long)BBTOB(brec.bmv_block), @@ -1964,8 +1980,8 @@ csp_freeze_check_attempt( /* freeze of an unwritten extent punches a hole in the work file. 
*/ if ((mrec->fmr_flags & FMR_OF_PREALLOC) && brec.bmv_block == -1) { - *len = BBTOB(brec.bmv_length); - return 1; + *len = min(mrec->fmr_length, BBTOB(brec.bmv_length)); + return FREEZE_SKIP; } /* @@ -1974,8 +1990,8 @@ csp_freeze_check_attempt( */ if (!(mrec->fmr_flags & FMR_OF_PREALLOC) && BBTOB(brec.bmv_block) == mrec->fmr_physical) { - *len = BBTOB(brec.bmv_length); - return 0; + *len = min(mrec->fmr_length, BBTOB(brec.bmv_length)); + return FREEZE_DONE; } /* @@ -1984,20 +2000,15 @@ csp_freeze_check_attempt( * have been mapped into the work file. Set @len to zero and return so * that we try again with the next mapping. */ + trace_falloc(req, "reset workfd isize 0x0", 0); - trace_falloc(req, "fpunch workfd pos 0x%llx bytecount 0x%llx", - (unsigned long long)mrec->fmr_physical, - (unsigned long long)mrec->fmr_length); - - ret = fallocate(req->work_fd, - FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - mrec->fmr_physical, mrec->fmr_length); + ret = ftruncate(req->work_fd, 0); if (ret) { perror(_("resetting work file after failed freeze")); - return ret; + return FREEZE_FAILED; } - return 1; + return FREEZE_SKIP; } /* @@ -2014,6 +2025,7 @@ csp_freeze_open( int *fd) { struct xfs_bulkstat bulkstat; + int oflags = O_RDWR; int target_fd; int ret; @@ -2041,7 +2053,10 @@ csp_freeze_open( if (!S_ISREG(bulkstat.bs_mode) && !S_ISDIR(bulkstat.bs_mode)) return 0; - target_fd = csp_open_by_handle(req, O_RDONLY, mrec->fmr_owner, + if (S_ISDIR(bulkstat.bs_mode)) + oflags = O_RDONLY; + + target_fd = csp_open_by_handle(req, oflags, mrec->fmr_owner, bulkstat.bs_gen); if (target_fd == -2) return 0; @@ -2061,6 +2076,122 @@ csp_freeze_open( return 0; } +static inline uint64_t rounddown_64(uint64_t x, uint64_t y) +{ + return (x / y) * y; +} + +/* + * Deal with a frozen extent containing a partially written EOF block. Either + * we use funshare to get src_fd to release the block, or we reduce the length + * of the frozen extent by one block. 
+ */ +static int +csp_freeze_unaligned_eofblock( + struct clearspace_req *req, + int src_fd, + const struct fsmap *mrec, + unsigned long long *frozen_len) +{ + struct getbmapx brec; + struct stat statbuf; + loff_t work_offset, length; + int ret; + + ret = fstat(req->work_fd, &statbuf); + if (ret) { + perror(_("statting work file")); + return ret; + } + + /* + * The frozen extent is less than the size of the work file, which + * means that we're already block aligned. + */ + if (*frozen_len <= statbuf.st_size) + return 0; + + /* The frozen extent does not contain a partially written EOF block. */ + if (statbuf.st_size % statbuf.st_blksize == 0) + return 0; + + /* + * Unshare what we think is a partially written EOF block of the + * original file, to try to force it to release that block. + */ + work_offset = rounddown_64(statbuf.st_size, statbuf.st_blksize); + length = statbuf.st_size - work_offset; + + trace_freeze(req, + "unaligned eofblock 0x%llx work_size 0x%llx blksize 0x%x work_offset 0x%llx work_length 0x%llx", + *frozen_len, statbuf.st_size, statbuf.st_blksize, + work_offset, length); + + ret = fallocate(src_fd, FALLOC_FL_UNSHARE_RANGE, + mrec->fmr_offset + work_offset, length); + if (ret) { + perror(_("unsharing original file")); + return ret; + } + + ret = fsync(src_fd); + if (ret) { + perror(_("flushing original file")); + return ret; + } + + ret = bmapx_one(req, req->work_fd, work_offset, length, &brec); + if (ret) + return ret; + + if (BBTOB(brec.bmv_block) != mrec->fmr_physical + work_offset) { + fprintf(stderr, + _("work file offset 0x%llx maps to phys 0x%llx, expected 0x%llx"), + (unsigned long long)work_offset, + (unsigned long long)BBTOB(brec.bmv_block), + (unsigned long long)mrec->fmr_physical); + return -1; + } + + /* + * If the block is still shared, there must be other owners of this + * block. Round down the frozen length and we'll come back to it + * eventually. 
+ */ + if (brec.bmv_oflags & BMV_OF_SHARED) { + *frozen_len = work_offset; + return 0; + } + + /* + * Not shared anymore, so increase the size of the file to the next + * block boundary so that we can reflink it into the space capture + * file. + */ + ret = ftruncate(req->work_fd, + BBTOB(brec.bmv_length) + BBTOB(brec.bmv_offset)); + if (ret) { + perror(_("expanding work file")); + return ret; + } + + /* Double-check that we didn't lose the block. */ + ret = bmapx_one(req, req->work_fd, work_offset, length, &brec); + if (ret) + return ret; + + if (BBTOB(brec.bmv_block) != mrec->fmr_physical + work_offset) { + fprintf(stderr, + _("work file offset 0x%llx maps to phys 0x%llx, should be 0x%llx"), + (unsigned long long)work_offset, + (unsigned long long)BBTOB(brec.bmv_block), + (unsigned long long)mrec->fmr_physical); + return -1; + } + + return 0; +} + /* * Given a fsmap, try to reflink the physical space into the space capture * file. @@ -2074,6 +2205,7 @@ csp_freeze_req_fsmap( struct fsmap short_mrec; struct file_clone_range fcr = { }; unsigned long long frozen_len; + enum freeze_outcome outcome; int src_fd; int ret, ret2; @@ -2126,33 +2258,86 @@ csp_freeze_req_fsmap( } /* - * Reflink the mapping from the source file into the work file. If we + * Reflink the mapping from the source file into the empty work file so + * that a write will be written elsewhere. The only way to reflink a + * partially written EOF block is if the kernel can reset the work file + * size so that the post-EOF part of the block remains post-EOF. If we * can't do that, we're sunk. If the mapping is unwritten, we'll leave * a hole in the work file. 
*/ + ret = ftruncate(req->work_fd, 0); + if (ret) { + perror(_("truncating work file for freeze")); + goto out_fd; + } + fcr.src_fd = src_fd; fcr.src_offset = mrec->fmr_offset; fcr.src_length = mrec->fmr_length; - fcr.dest_offset = mrec->fmr_physical; + fcr.dest_offset = 0; - trace_freeze(req, "freeze to workfd pos 0x%llx", - (unsigned long long)fcr.dest_offset); + trace_freeze(req, + "reflink ino 0x%llx offset 0x%llx bytecount 0x%llx into workfd", + (unsigned long long)mrec->fmr_owner, + (unsigned long long)fcr.src_offset, + (unsigned long long)fcr.src_length); ret = clonerange(req->work_fd, &fcr); - if (ret) { - fprintf(stderr, _("freezing space to work file: %s\n"), - strerror(ret)); - goto out_fd; + if (ret == EINVAL) { + /* + * If that didn't work, try reflinking to EOF and picking out + * whatever pieces we want. + */ + fcr.src_length = 0; + + trace_freeze(req, + "reflink ino 0x%llx offset 0x%llx to EOF into workfd", + (unsigned long long)mrec->fmr_owner, + (unsigned long long)fcr.src_offset); + + ret = clonerange(req->work_fd, &fcr); } - - req->trace_indent++; - ret = csp_freeze_check_attempt(req, mrec, &frozen_len); - req->trace_indent--; - if (ret < 0) - goto out_fd; - if (ret == 1) { + if (ret == EINVAL) { + /* + * If we still can't get the block, it's possible that src_fd + * was punched or truncated out from under us, so we just move + * on to the next fsmap. 
+ */ + trace_freeze(req, "cannot freeze space, moving on", 0); ret = 0; - goto advance; + goto out_fd; + } + if (ret) { + fprintf(stderr, _("freezing space to work file: %s\n"), + strerror(ret)); + goto out_fd; + } + + req->trace_indent++; + outcome = csp_freeze_check_outcome(req, mrec, &frozen_len); + req->trace_indent--; + switch (outcome) { + case FREEZE_FAILED: + ret = -1; + goto out_fd; + case FREEZE_SKIP: + *cursor += frozen_len; + goto out_fd; + case FREEZE_DONE: + break; + } + + /* + * If we tried reflinking to EOF to capture a partially written EOF + * block in the work file, we need to unshare the end of the source + * file before we try to reflink the frozen space into the space + * capture file. + */ + if (fcr.src_length == 0) { + ret = csp_freeze_unaligned_eofblock(req, src_fd, mrec, + &frozen_len); + if (ret) + goto out_fd; } /* @@ -2164,11 +2349,11 @@ csp_freeze_req_fsmap( * the contents of the work file. */ fcr.src_fd = req->work_fd; - fcr.src_offset = mrec->fmr_physical; + fcr.src_offset = 0; fcr.dest_offset = mrec->fmr_physical; fcr.src_length = frozen_len; - trace_freeze(req, "link phys 0x%llx len 0x%llx to spacefd", + trace_freeze(req, "reflink phys 0x%llx len 0x%llx to spacefd", (unsigned long long)mrec->fmr_physical, (unsigned long long)mrec->fmr_length); @@ -2187,7 +2372,6 @@ csp_freeze_req_fsmap( goto out_fd; } -advance: *cursor += frozen_len; out_fd: ret2 = close(src_fd); @@ -2278,6 +2462,79 @@ csp_collect_garbage( return 0; } +static int +csp_prepare( + struct clearspace_req *req) +{ + blkcnt_t old_blocks = 0; + int ret; + + /* + * Empty out CoW forks and speculative post-EOF preallocations before + * starting the clearing process. This may be somewhat overkill. 
+ */ + ret = syncfs(req->xfd->fd); + if (ret) { + perror(_("syncing filesystem")); + return ret; + } + + ret = csp_collect_garbage(req); + if (ret) + return ret; + + /* + * Set up the space capture file as a large sparse file mirroring the + * physical space that we want to defragment. + */ + ret = ftruncate(req->space_fd, req->start + req->length); + if (ret) { + perror(_("setting up space capture file")); + return ret; + } + + /* + * If we don't have reflink, just grab the free space and move on to + * copying and exchanging file contents. + */ + if (!req->use_reflink) + return csp_grab_free_space(req); + + /* + * Try to freeze as much of the requested range as we can, grab the + * free space in that range, and run freeze again to pick up anything + * that may have been allocated while all that was going on. + */ + do { + struct stat statbuf; + + ret = csp_freeze_req_range(req); + if (ret) + return ret; + + ret = csp_grab_free_space(req); + if (ret) + return ret; + + ret = fstat(req->space_fd, &statbuf); + if (ret) + return ret; + + if (old_blocks == statbuf.st_blocks) + break; + old_blocks = statbuf.st_blocks; + } while (1); + + /* + * If reflink is enabled, our strategy is to dedupe to free blocks in + * the area that we're clearing without making any user-visible changes + * to the file contents. For all the written file data blocks in the area + * we're clearing, make an identical copy in the work file that is + * backed by blocks that are not in the clearing area. + */ + return csp_prepare_for_dedupe(req); +} + /* Set up the target to clear all metadata from the given range. */ static inline void csp_target_metadata( @@ -2330,50 +2587,10 @@ clearspace_run( return ret; } - /* - * Empty out CoW forks and speculative post-EOF preallocations before - * starting the clearing process. This may be somewhat overkill. 
- */ - ret = syncfs(req->xfd->fd); - if (ret) { - perror(_("syncing filesystem")); - goto out_bitmap; - } - - ret = csp_collect_garbage(req); - if (ret) - goto out_bitmap; - - /* - * Try to freeze as much of the requested range as we can, grab the - * free space in that range, and run freeze again to pick up anything - * that may have been allocated while all that was going on. - */ - ret = csp_freeze_req_range(req); - if (ret) - goto out_bitmap; - - ret = csp_grab_free_space(req); - if (ret) - goto out_bitmap; - - ret = csp_freeze_req_range(req); + ret = csp_prepare(req); if (ret) goto out_bitmap; - /* - * If reflink is enabled, our strategy is to dedupe to free blocks in - * the area that we're clearing without making any user-visible changes - * to the file contents. For all the written file data blocks in area - * we're clearing, make an identical copy in the work file that is - * backed by blocks that are not in the clearing area. - */ - if (req->use_reflink) { - ret = csp_prepare_for_dedupe(req); - if (ret) - goto out_bitmap; - } - /* Evacuate as many file blocks as we can. */ do { ret = csp_find_target(req, &target);