Re: [PATCH 6/9] spaceman/defrag: workaround kernel xfs_reflink_try_clear_inode_flag()

Wengang Wang <wen.gang.wang@xxxxxxxxxx> · Thu, 11 Jul 2024 23:11:14 +0000

> On Jul 9, 2024, at 1:51 PM, Darrick J. Wong <djwong@xxxxxxxxxx> wrote:
> 
> On Tue, Jul 09, 2024 at 12:10:25PM -0700, Wengang Wang wrote:
>> xfs_reflink_try_clear_inode_flag() takes a very long time when the file has a
>> huge number of extents and none of the extents are shared.
>> 
>> Workaround:
>> Share the first real extent so that xfs_reflink_try_clear_inode_flag() returns
>> quickly, which saves CPU time and speeds up defrag significantly.
> 
> I wonder if a better solution would be to change xfs_reflink_unshare
> to try to clear the reflink iflag only if offset/len cover the entire
> file?  It's a pity we can't set time budgets on fallocate requests.

Yep.
Anyway, that change, if it happens, would be in the kernel.
In the meantime, the -n option can be used to disable this workaround in defrag.

Thanks,
Wengang

> 
> --D
> 
>> Signed-off-by: Wengang Wang <wen.gang.wang@xxxxxxxxxx>
>> ---
>> spaceman/defrag.c | 174 +++++++++++++++++++++++++++++++++++++++++++++-
>> 1 file changed, 172 insertions(+), 2 deletions(-)
>> 
>> diff --git a/spaceman/defrag.c b/spaceman/defrag.c
>> index f8e6713c..b5c5b187 100644
>> --- a/spaceman/defrag.c
>> +++ b/spaceman/defrag.c
>> @@ -327,6 +327,155 @@ defrag_fs_limit_hit(int fd)
>> return statfs_s.f_bsize * statfs_s.f_bavail < g_limit_free_bytes;
>> }
>> 
>> +static bool g_enable_first_ext_share = true;
>> +
>> +static int
>> +defrag_get_first_real_ext(int fd, struct getbmapx *mapx)
>> +{
>> + int err;
>> +
>> + while (1) {
>> + err = defrag_get_next_extent(fd, mapx);
>> + if (err)
>> + break;
>> +
>> + defrag_move_next_extent();
>> + if (!(mapx->bmv_oflags & BMV_OF_PREALLOC))
>> + break;
>> + }
>> + return err;
>> +}
>> +
>> +static __u64 g_share_offset = -1ULL;
>> +static __u64 g_share_len = 0ULL;
>> +#define SHARE_MAX_SIZE 32768  /* 32KiB */
>> +
>> +/* share the first real extent with the scratch file */
>> +static void
>> +defrag_share_first_extent(int defrag_fd, int scratch_fd)
>> +{
>> +#define OFFSET_1PB 0x4000000000000LL
>> + struct file_clone_range clone;
>> + struct getbmapx mapx;
>> + int err;
>> +
>> + if (g_enable_first_ext_share == false)
>> + return;
>> +
>> + err = defrag_get_first_real_ext(defrag_fd, &mapx);
>> + if (err)
>> + return;
>> +
>> + clone.src_fd = defrag_fd;
>> + clone.src_offset = mapx.bmv_offset * 512;
>> + clone.src_length = mapx.bmv_length * 512;
>> + /* shares at most SHARE_MAX_SIZE length */
>> + if (clone.src_length > SHARE_MAX_SIZE)
>> + clone.src_length = SHARE_MAX_SIZE;
>> + clone.dest_offset = OFFSET_1PB + clone.src_offset;
>> + /* if the first extent reaches EoF, no need to share */
>> + if (clone.src_offset + clone.src_length >= g_defrag_file_size)
>> + return;
>> + err = ioctl(scratch_fd, FICLONERANGE, &clone);
>> + if (err != 0) {
>> + fprintf(stderr, "cloning first extent failed: %s\n",
>> + strerror(errno));
>> + return;
>> + }
>> +
>> + /* save the offset and length for re-share */
>> + g_share_offset = clone.src_offset;
>> + g_share_len = clone.src_length;
>> +}
>> +
>> +/* re-share the blocks we shared previously if they are no longer shared */
>> +static void
>> +defrag_reshare_blocks_in_front(int defrag_fd, int scratch_fd)
>> +{
>> +#define NR_GET_EXT 9
>> + struct getbmapx mapx[NR_GET_EXT];
>> + struct file_clone_range clone;
>> + __u64 new_share_len;
>> + int idx, err;
>> +
>> + if (g_enable_first_ext_share == false)
>> + return;
>> +
>> + if (g_share_len == 0ULL)
>> + return;
>> +
>> + /*
>> + * check if the previous sharing still exists;
>> + * we are done if it does, even partially.
>> + */
>> + mapx[0].bmv_offset = g_share_offset;
>> + mapx[0].bmv_length = g_share_len;
>> + mapx[0].bmv_count = NR_GET_EXT;
>> + mapx[0].bmv_iflags = BMV_IF_NO_HOLES | BMV_IF_PREALLOC;
>> + err = ioctl(defrag_fd, XFS_IOC_GETBMAPX, mapx);
>> + if (err) {
>> + fprintf(stderr, "XFS_IOC_GETBMAPX failed %s\n",
>> + strerror(errno));
>> + /* won't try share again */
>> + g_share_len = 0ULL;
>> + return;
>> + }
>> +
>> + if (mapx[0].bmv_entries == 0) {
>> + /* shared blocks all became hole, won't try share again */
>> + g_share_len = 0ULL;
>> + return;
>> + }
>> +
>> + if (g_share_offset != 512 * mapx[1].bmv_offset) {
>> + /* first shared block became hole, won't try share again */
>> + g_share_len = 0ULL;
>> + return;
>> + }
>> +
>> + /* we check up to only the first NR_GET_EXT - 1 extents */
>> + for (idx = 1; idx <= mapx[0].bmv_entries; idx++) {
>> + if (mapx[idx].bmv_oflags & BMV_OF_SHARED) {
>> + /* some blocks still shared, done */
>> + return;
>> + }
>> + }
>> +
>> + /*
>> + * The previously shared blocks are no longer shared, re-share.
>> + * deallocate the blocks in the scratch file first
>> + */
>> + err = fallocate(scratch_fd,
>> + FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE,
>> + OFFSET_1PB + g_share_offset, g_share_len);
>> + if (err != 0) {
>> + fprintf(stderr, "punch hole failed %s\n",
>> + strerror(errno));
>> + g_share_len = 0;
>> + return;
>> + }
>> +
>> + new_share_len = 512 * mapx[1].bmv_length;
>> + if (new_share_len > SHARE_MAX_SIZE)
>> + new_share_len = SHARE_MAX_SIZE;
>> +
>> + clone.src_fd = defrag_fd;
>> + /* keep starting offset unchanged */
>> + clone.src_offset = g_share_offset;
>> + clone.src_length = new_share_len;
>> + clone.dest_offset = OFFSET_1PB + clone.src_offset;
>> +
>> + err = ioctl(scratch_fd, FICLONERANGE, &clone);
>> + if (err) {
>> + fprintf(stderr, "FICLONERANGE failed %s\n",
>> + strerror(errno));
>> + g_share_len = 0;
>> + return;
>> + }
>> +
>> + g_share_len = new_share_len;
>> + }
>> +
>> /*
>>  * defragment a file
>>  * return 0 if successfully done, 1 otherwise
>> @@ -377,6 +526,12 @@ defrag_xfs_defrag(char *file_path) {
>> 
>> signal(SIGINT, defrag_sigint_handler);
>> 
>> + /*
>> + * share the first extent to work around kernel consuming time
>> + * in xfs_reflink_try_clear_inode_flag()
>> + */
>> + defrag_share_first_extent(defrag_fd, scratch_fd);
>> +
>> do {
>> struct timeval t_clone, t_unshare, t_punch_hole;
>> struct defrag_segment segment;
>> @@ -454,6 +609,15 @@ defrag_xfs_defrag(char *file_path) {
>> if (time_delta > max_unshare_us)
>> max_unshare_us = time_delta;
>> 
>> + /*
>> + * if unshare took more than 1 second, the time was very likely
>> + * spent checking whether the file still shares extents.
>> + * to avoid that happening again, we re-share the blocks in front
>> + * as a workaround.
>> + */
>> + if (time_delta > 1000000)
>> + defrag_reshare_blocks_in_front(defrag_fd, scratch_fd);
>> +
>> /*
>> * Punch out the original extents we shared to the
>> * scratch file so they are returned to free space.
>> @@ -514,6 +678,8 @@ static void defrag_help(void)
>> " -f free_space      -- specify shrethod of the XFS free space in MiB, when\n"
>> "                       XFS free space is lower than that, shared segments \n"
>> "                       are excluded from defragmentation, 1024 by default\n"
>> +" -n                 -- disable the \"share first extent\" featue, it's\n"
>> +"                       enabled by default to speed up\n"
>> ));
>> }
>> 
>> @@ -525,7 +691,7 @@ defrag_f(int argc, char **argv)
>> int i;
>> int c;
>> 
>> - while ((c = getopt(argc, argv, "s:f:")) != EOF) {
>> + while ((c = getopt(argc, argv, "s:f:n")) != EOF) {
>> switch(c) {
>> case 's':
>> g_segment_size_lmt = atoi(optarg) * 1024 * 1024 / 512;
>> @@ -539,6 +705,10 @@ defrag_f(int argc, char **argv)
>> g_limit_free_bytes = atol(optarg) * 1024 * 1024;
>> break;
>> 
>> + case 'n':
>> + g_enable_first_ext_share = false;
>> + break;
>> +
>> default:
>> command_usage(&defrag_cmd);
>> return 1;
>> @@ -556,7 +726,7 @@ void defrag_init(void)
>> defrag_cmd.cfunc = defrag_f;
>> defrag_cmd.argmin = 0;
>> defrag_cmd.argmax = 4;
>> - defrag_cmd.args = "[-s segment_size] [-f free_space]";
>> + defrag_cmd.args = "[-s segment_size] [-f free_space] [-n]";
>> defrag_cmd.flags = CMD_FLAG_ONESHOT;
>> defrag_cmd.oneline = _("Defragment XFS files");
>> defrag_cmd.help = defrag_help;
>> -- 
>> 2.39.3 (Apple Git-146)
>> 
>>