From: Wang Shilong <wshilong@xxxxxxx> In our benchmarking of a PiB-size filesystem, pass5 takes 10446s to finish, and 99.5% of that time is spent reading bitmaps. It makes sense to read the bitmaps using multiple threads; a quick benchmark shows the time dropping from 10446s to 626s with 64 threads. [ This has all of the many bug fixes for rw_bitmaps.c from the original Lustre patch set collapsed into a single commit. In addition it has the new ext2fs_rw_bitmaps() API proposed by Ted. ] Signed-off-by: Wang Shilong <wshilong@xxxxxxx> Signed-off-by: Saranya Muruganandam <saranyamohan@xxxxxxxxxx> Signed-off-by: Theodore Ts'o <tytso@xxxxxxx> --- lib/ext2fs/ext2fs.h | 8 + lib/ext2fs/rw_bitmaps.c | 332 +++++++++++++++++++++++++++++++++------- 2 files changed, 288 insertions(+), 52 deletions(-) diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h index 5955c3ae..82ce9126 100644 --- a/lib/ext2fs/ext2fs.h +++ b/lib/ext2fs/ext2fs.h @@ -688,6 +688,14 @@ struct ext2_xattr_handle; #define XATTR_ABORT 1 #define XATTR_CHANGED 2 +/* + * flags for ext2fs_rw_bitmaps() + */ +#define EXT2FS_BITMAPS_WRITE 0x0001 +#define EXT2FS_BITMAPS_BLOCK 0x0002 +#define EXT2FS_BITMAPS_INODE 0x0004 +#define EXT2FS_BITMAPS_VALID_FLAGS 0x0007 + /* * function prototypes */ diff --git a/lib/ext2fs/rw_bitmaps.c b/lib/ext2fs/rw_bitmaps.c index d80c9eb8..7e4f7c6a 100644 --- a/lib/ext2fs/rw_bitmaps.c +++ b/lib/ext2fs/rw_bitmaps.c @@ -23,11 +23,33 @@ #ifdef HAVE_SYS_TYPES_H #include <sys/types.h> #endif +#ifdef HAVE_PTHREAD_H +#include <pthread.h> +#endif #include "ext2_fs.h" #include "ext2fs.h" #include "e2image.h" +#ifdef HAVE_PTHREAD +typedef pthread_mutex_t mutex_t; + +static void unix_pthread_mutex_lock(mutex_t *mutex) +{ + if (mutex) + pthread_mutex_lock(mutex); +} +static void unix_pthread_mutex_unlock(mutex_t *mutex) +{ + if (mutex) + pthread_mutex_unlock(mutex); +} +#else +typedef int mutex_t; +#define unix_pthread_mutex_lock(mutex_t) do {} while (0) +#define unix_pthread_mutex_unlock(mutex_t) do {} while (0) +#endif + static
errcode_t write_bitmaps(ext2_filsys fs, int do_inode, int do_block) { dgrp_t i; @@ -205,22 +227,12 @@ static int bitmap_tail_verify(unsigned char *bitmap, int first, int last) return 1; } -static errcode_t read_bitmaps(ext2_filsys fs, int do_inode, int do_block) +static errcode_t read_bitmaps_range_prepare(ext2_filsys fs, int flags) { - dgrp_t i; - char *block_bitmap = 0, *inode_bitmap = 0; - char *buf; errcode_t retval; int block_nbytes = EXT2_CLUSTERS_PER_GROUP(fs->super) / 8; int inode_nbytes = EXT2_INODES_PER_GROUP(fs->super) / 8; - int tail_flags = 0; - int csum_flag; - unsigned int cnt; - blk64_t blk; - blk64_t blk_itr = EXT2FS_B2C(fs, fs->super->s_first_data_block); - blk64_t blk_cnt; - ext2_ino_t ino_itr = 1; - ext2_ino_t ino_cnt; + char *buf; EXT2_CHECK_MAGIC(fs, EXT2_ET_MAGIC_EXT2FS_FILSYS); @@ -230,12 +242,11 @@ static errcode_t read_bitmaps(ext2_filsys fs, int do_inode, int do_block) fs->write_bitmaps = ext2fs_write_bitmaps; - csum_flag = ext2fs_has_group_desc_csum(fs); - retval = ext2fs_get_mem(strlen(fs->device_name) + 80, &buf); if (retval) return retval; - if (do_block) { + + if (flags & EXT2FS_BITMAPS_BLOCK) { if (fs->block_map) ext2fs_free_block_bitmap(fs->block_map); strcpy(buf, "block bitmap for "); @@ -243,12 +254,9 @@ static errcode_t read_bitmaps(ext2_filsys fs, int do_inode, int do_block) retval = ext2fs_allocate_block_bitmap(fs, buf, &fs->block_map); if (retval) goto cleanup; - retval = io_channel_alloc_buf(fs->io, 0, &block_bitmap); - if (retval) - goto cleanup; - } else - block_nbytes = 0; - if (do_inode) { + } + + if (flags & EXT2FS_BITMAPS_INODE) { if (fs->inode_map) ext2fs_free_inode_bitmap(fs->inode_map); strcpy(buf, "inode bitmap for "); @@ -256,13 +264,62 @@ static errcode_t read_bitmaps(ext2_filsys fs, int do_inode, int do_block) retval = ext2fs_allocate_inode_bitmap(fs, buf, &fs->inode_map); if (retval) goto cleanup; + } + ext2fs_free_mem(&buf); + + return retval; + +cleanup: + if (flags & EXT2FS_BITMAPS_BLOCK) { + 
ext2fs_free_block_bitmap(fs->block_map); + fs->block_map = 0; + } + if (flags & EXT2FS_BITMAPS_INODE) { + ext2fs_free_inode_bitmap(fs->inode_map); + fs->inode_map = 0; + } + if (buf) + ext2fs_free_mem(&buf); + return retval; +} + +static errcode_t read_bitmaps_range_start(ext2_filsys fs, int flags, + dgrp_t start, dgrp_t end, + mutex_t *mutex, + int *tail_flags) +{ + dgrp_t i; + char *block_bitmap = 0, *inode_bitmap = 0; + errcode_t retval = 0; + int block_nbytes = EXT2_CLUSTERS_PER_GROUP(fs->super) / 8; + int inode_nbytes = EXT2_INODES_PER_GROUP(fs->super) / 8; + int csum_flag; + unsigned int cnt; + blk64_t blk; + blk64_t blk_itr = EXT2FS_B2C(fs, fs->super->s_first_data_block); + blk64_t blk_cnt; + ext2_ino_t ino_itr = 1; + ext2_ino_t ino_cnt; + + csum_flag = ext2fs_has_group_desc_csum(fs); + + if (flags & EXT2FS_BITMAPS_BLOCK) { + retval = io_channel_alloc_buf(fs->io, 0, &block_bitmap); + if (retval) + goto cleanup; + } else { + block_nbytes = 0; + } + + if (flags & EXT2FS_BITMAPS_INODE) { retval = io_channel_alloc_buf(fs->io, 0, &inode_bitmap); if (retval) goto cleanup; - } else + } else { inode_nbytes = 0; - ext2fs_free_mem(&buf); + } + /* io should be null */ if (fs->flags & EXT2_FLAG_IMAGE_FILE) { blk = (ext2fs_le32_to_cpu(fs->image_header->offset_inodemap) / fs->blocksize); ino_cnt = fs->super->s_inodes_count; @@ -300,10 +357,12 @@ static errcode_t read_bitmaps(ext2_filsys fs, int do_inode, int do_block) blk_itr += cnt; blk_cnt -= cnt; } - goto success_cleanup; + goto cleanup; } - for (i = 0; i < fs->group_desc_count; i++) { + blk_itr += ((blk64_t)start * (block_nbytes << 3)); + ino_itr += ((blk64_t)start * (inode_nbytes << 3)); + for (i = start; i <= end; i++) { if (block_bitmap) { blk = ext2fs_block_bitmap_loc(fs, i); if ((csum_flag && @@ -329,12 +388,14 @@ static errcode_t read_bitmaps(ext2_filsys fs, int do_inode, int do_block) } if (!bitmap_tail_verify((unsigned char *) block_bitmap, block_nbytes, fs->blocksize - 1)) - tail_flags |= 
EXT2_FLAG_BBITMAP_TAIL_PROBLEM; + *tail_flags |= EXT2_FLAG_BBITMAP_TAIL_PROBLEM; } else memset(block_bitmap, 0, block_nbytes); cnt = block_nbytes << 3; + unix_pthread_mutex_lock(mutex); retval = ext2fs_set_block_bitmap_range2(fs->block_map, blk_itr, cnt, block_bitmap); + unix_pthread_mutex_unlock(mutex); if (retval) goto cleanup; blk_itr += block_nbytes << 3; @@ -365,63 +426,225 @@ static errcode_t read_bitmaps(ext2_filsys fs, int do_inode, int do_block) } if (!bitmap_tail_verify((unsigned char *) inode_bitmap, inode_nbytes, fs->blocksize - 1)) - tail_flags |= EXT2_FLAG_IBITMAP_TAIL_PROBLEM; + *tail_flags |= EXT2_FLAG_IBITMAP_TAIL_PROBLEM; } else memset(inode_bitmap, 0, inode_nbytes); cnt = inode_nbytes << 3; + unix_pthread_mutex_lock(mutex); retval = ext2fs_set_inode_bitmap_range2(fs->inode_map, ino_itr, cnt, inode_bitmap); + unix_pthread_mutex_unlock(mutex); if (retval) goto cleanup; ino_itr += inode_nbytes << 3; } } +cleanup: + if (inode_bitmap) + ext2fs_free_mem(&inode_bitmap); + if (block_bitmap) + ext2fs_free_mem(&block_bitmap); + return retval; +} + +static errcode_t read_bitmaps_range_end(ext2_filsys fs, int flags, + int tail_flags) +{ + errcode_t retval; + /* Mark group blocks for any BLOCK_UNINIT groups */ - if (do_block) { + if (flags & EXT2FS_BITMAPS_BLOCK) { retval = mark_uninit_bg_group_blocks(fs); if (retval) - goto cleanup; - } - -success_cleanup: - if (inode_bitmap) { - ext2fs_free_mem(&inode_bitmap); - fs->flags &= ~EXT2_FLAG_IBITMAP_TAIL_PROBLEM; - } - if (block_bitmap) { - ext2fs_free_mem(&block_bitmap); + return retval; fs->flags &= ~EXT2_FLAG_BBITMAP_TAIL_PROBLEM; } + if (flags & EXT2FS_BITMAPS_INODE) + fs->flags &= ~EXT2_FLAG_IBITMAP_TAIL_PROBLEM; fs->flags |= tail_flags; + return 0; +} -cleanup: - if (do_block) { +static void read_bitmaps_cleanup_on_error(ext2_filsys fs, int flags) +{ + if (flags & EXT2FS_BITMAPS_BLOCK) { ext2fs_free_block_bitmap(fs->block_map); fs->block_map = 0; } - if (do_inode) { + if (flags & EXT2FS_BITMAPS_INODE) { 
ext2fs_free_inode_bitmap(fs->inode_map); fs->inode_map = 0; } - if (inode_bitmap) - ext2fs_free_mem(&inode_bitmap); - if (block_bitmap) - ext2fs_free_mem(&block_bitmap); - if (buf) - ext2fs_free_mem(&buf); +} + +static errcode_t read_bitmaps_range(ext2_filsys fs, int flags, + dgrp_t start, dgrp_t end) +{ + errcode_t retval; + int tail_flags = 0; + + retval = read_bitmaps_range_prepare(fs, flags); + if (retval) + return retval; + + retval = read_bitmaps_range_start(fs, flags, start, end, + NULL, &tail_flags); + if (retval == 0) + retval = read_bitmaps_range_end(fs, flags, tail_flags); + if (retval) + read_bitmaps_cleanup_on_error(fs, flags); + return retval; +} + +#ifdef HAVE_PTHREAD +struct read_bitmaps_thread_info { + ext2_filsys rbt_fs; + int rbt_flags; + dgrp_t rbt_grp_start; + dgrp_t rbt_grp_end; + errcode_t rbt_retval; + pthread_mutex_t *rbt_mutex; + int rbt_tail_flags; +}; + +static void *read_bitmaps_thread(void *data) +{ + struct read_bitmaps_thread_info *rbt = data; + + rbt->rbt_retval = read_bitmaps_range_start(rbt->rbt_fs, rbt->rbt_flags, + rbt->rbt_grp_start, rbt->rbt_grp_end, + rbt->rbt_mutex, &rbt->rbt_tail_flags); + return NULL; +} +#endif + +errcode_t ext2fs_rw_bitmaps(ext2_filsys fs, int flags, int num_threads) +{ +#ifdef HAVE_PTHREAD + pthread_attr_t attr; + pthread_t *thread_ids = NULL; + struct read_bitmaps_thread_info *thread_infos = NULL; + pthread_mutex_t rbt_mutex = PTHREAD_MUTEX_INITIALIZER; + errcode_t retval; + errcode_t rc; + unsigned flexbg_size = 1 << fs->super->s_log_groups_per_flex; + dgrp_t average_group; + int i, tail_flags = 0; + io_manager manager = unix_io_manager; +#endif + + if (flags & ~EXT2FS_BITMAPS_VALID_FLAGS) + return EXT2_ET_INVALID_ARGUMENT; + + if (flags & EXT2FS_BITMAPS_WRITE) + return write_bitmaps(fs, flags & EXT2FS_BITMAPS_INODE, + flags & EXT2FS_BITMAPS_BLOCK); + +#ifdef HAVE_PTHREAD + if (((fs->io->flags & CHANNEL_FLAGS_THREADS) == 0) || + (num_threads == 1) || (fs->flags & EXT2_FLAG_IMAGE_FILE)) + goto 
fallback; + + if (num_threads < 0) { +#if defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_CONF) + num_threads = sysconf(_SC_NPROCESSORS_CONF); +#else + /* + * Guess for now; eventually we should probably define + * ext2fs_get_num_cpus() and teach it how to get this info on + * MacOS, FreeBSD, etc. + * ref: https://stackoverflow.com/questions/150355 + */ + num_threads = 4; +#endif /* HAVE_SYSCONF */ + } + if (num_threads > fs->group_desc_count) + num_threads = fs->group_desc_count; + average_group = fs->group_desc_count / num_threads; + if (ext2fs_has_feature_flex_bg(fs->super)) { + average_group = (average_group / flexbg_size) * flexbg_size; + } + if (average_group == 0) + goto fallback; + + io_channel_set_options(fs->io, "cache=off"); + retval = pthread_attr_init(&attr); + if (retval) + return retval; + + thread_ids = calloc(sizeof(pthread_t), num_threads); + if (!thread_ids) + return -ENOMEM; + + thread_infos = calloc(sizeof(struct read_bitmaps_thread_info), + num_threads); + if (!thread_infos) + goto out; + + retval = read_bitmaps_range_prepare(fs, flags); + if (retval) + goto out; + +// fprintf(stdout, "Multiple threads triggered to read bitmaps\n"); + for (i = 0; i < num_threads; i++) { + thread_infos[i].rbt_fs = fs; + thread_infos[i].rbt_flags = flags; + thread_infos[i].rbt_mutex = &rbt_mutex; + thread_infos[i].rbt_tail_flags = 0; + if (i == 0) + thread_infos[i].rbt_grp_start = 0; + else + thread_infos[i].rbt_grp_start = average_group * i + 1; + + if (i == num_threads - 1) + thread_infos[i].rbt_grp_end = fs->group_desc_count - 1; + else + thread_infos[i].rbt_grp_end = average_group * (i + 1); + retval = pthread_create(&thread_ids[i], &attr, + &read_bitmaps_thread, &thread_infos[i]); + if (retval) + break; + } + for (i = 0; i < num_threads; i++) { + if (!thread_ids[i]) + break; + rc = pthread_join(thread_ids[i], NULL); + if (rc && !retval) + retval = rc; + rc = thread_infos[i].rbt_retval; + if (rc && !retval) + retval = rc; + tail_flags |= 
thread_infos[i].rbt_tail_flags; + } +out: + rc = pthread_attr_destroy(&attr); + if (rc && !retval) + retval = rc; + free(thread_infos); + free(thread_ids); + + if (retval == 0) + retval = read_bitmaps_range_end(fs, flags, tail_flags); + if (retval) + read_bitmaps_cleanup_on_error(fs, flags); + /* XXX should save and restore cache setting */ + io_channel_set_options(fs->io, "cache=on"); return retval; +fallback: +#endif /* HAVE_PTHREAD */ + return read_bitmaps_range(fs, flags, 0, fs->group_desc_count - 1); } errcode_t ext2fs_read_inode_bitmap(ext2_filsys fs) { - return read_bitmaps(fs, 1, 0); + return ext2fs_rw_bitmaps(fs, EXT2FS_BITMAPS_INODE, -1); } errcode_t ext2fs_read_block_bitmap(ext2_filsys fs) { - return read_bitmaps(fs, 0, 1); + return ext2fs_rw_bitmaps(fs, EXT2FS_BITMAPS_BLOCK, -1); } errcode_t ext2fs_write_inode_bitmap(ext2_filsys fs) @@ -436,10 +659,15 @@ errcode_t ext2fs_write_block_bitmap (ext2_filsys fs) errcode_t ext2fs_read_bitmaps(ext2_filsys fs) { - if (fs->inode_map && fs->block_map) - return 0; + int flags = 0; - return read_bitmaps(fs, !fs->inode_map, !fs->block_map); + if (!fs->inode_map) + flags |= EXT2FS_BITMAPS_INODE; + if (!fs->block_map) + flags |= EXT2FS_BITMAPS_BLOCK; + if (flags == 0) + return 0; + return ext2fs_rw_bitmaps(fs, flags, -1); } errcode_t ext2fs_write_bitmaps(ext2_filsys fs) -- 2.30.0.284.gd98b1dd5eaa7-goog