On Jul 31, 2023, at 10:53 PM, Li Dongyang <dongyangli@xxxxxxx> wrote: > > Using the sorted array to track the EA blocks and their refs is > not scalable. When the file system has a huge number of EA blocks > reporting wrong ref, pass1 scanning could not be finished within > a reasonable time, as 95%+ of CPU time is spent in memmove() when > trying to enlarge the the sorted array. > > On a file system with 20 million EA blocks on an NVMe device, pass1 > time taken: > without patch: > time: 2014.78/1838.70/19.91 > with patch: > time: 45.17/20.17/20.19 > > Signed-off-by: Li Dongyang <dongyangli@xxxxxxx> Reviewed-by: Andreas Dilger <adilger@xxxxxxxxx> > --- > e2fsck/e2fsck.h | 3 +- > e2fsck/ea_refcount.c | 286 +++++++++++++++---------------------------- > e2fsck/pass1.c | 14 +-- > 3 files changed, 102 insertions(+), 201 deletions(-) > > diff --git a/e2fsck/e2fsck.h b/e2fsck/e2fsck.h > index 3f2dc3084..bc2490c9d 100644 > --- a/e2fsck/e2fsck.h > +++ b/e2fsck/e2fsck.h > @@ -533,7 +533,7 @@ extern struct dx_dir_info *e2fsck_dx_dir_info_iter(e2fsck_t ctx, > typedef __u64 ea_key_t; > typedef __u64 ea_value_t; > > -extern errcode_t ea_refcount_create(size_t size, ext2_refcount_t *ret); > +extern errcode_t ea_refcount_create(ext2_refcount_t *ret); > extern void ea_refcount_free(ext2_refcount_t refcount); > extern errcode_t ea_refcount_fetch(ext2_refcount_t refcount, ea_key_t ea_key, > ea_value_t *ret); > @@ -543,7 +543,6 @@ extern errcode_t ea_refcount_decrement(ext2_refcount_t refcount, > ea_key_t ea_key, ea_value_t *ret); > extern errcode_t ea_refcount_store(ext2_refcount_t refcount, ea_key_t ea_key, > ea_value_t count); > -extern size_t ext2fs_get_refcount_size(ext2_refcount_t refcount); > extern void ea_refcount_intr_begin(ext2_refcount_t refcount); > extern ea_key_t ea_refcount_intr_next(ext2_refcount_t refcount, > ea_value_t *ret); > diff --git a/e2fsck/ea_refcount.c b/e2fsck/ea_refcount.c > index 7154b47c3..fb8b009f1 100644 > --- a/e2fsck/ea_refcount.c > +++ b/e2fsck/ea_refcount.c > @@ -16,193 +16,100 @@ > #undef ENABLE_NLS > #endif > #include "e2fsck.h" > +#include "ext2fs/rbtree.h" > > /* > * The strategy we use for keeping track of EA refcounts is as > - * follows. We keep a sorted array of first EA blocks and its > - * reference counts. Once the refcount has dropped to zero, it is > - * removed from the array to save memory space. Once the EA block is > + * follows. We keep the first EA blocks and its reference counts > + * in the rb-tree. Once the refcount has dropped to zero, it is > + * removed from the rb-tree to save memory space. Once the EA block is > * checked, its bit is set in the block_ea_map bitmap. > */ > struct ea_refcount_el { > + struct rb_node node; > /* ea_key could either be an inode number or block number. */ > ea_key_t ea_key; > ea_value_t ea_value; > }; > > struct ea_refcount { > - size_t count; > - size_t size; > - size_t cursor; > - struct ea_refcount_el *list; > + struct rb_root root; > + struct rb_node *cursor; > }; > > void ea_refcount_free(ext2_refcount_t refcount) > { > + struct ea_refcount_el *el; > + struct rb_node *node, *next; > + > if (!refcount) > return; > > - if (refcount->list) > - ext2fs_free_mem(&refcount->list); > + for (node = ext2fs_rb_first(&refcount->root); node; node = next) { > + next = ext2fs_rb_next(node); > + el = ext2fs_rb_entry(node, struct ea_refcount_el, node); > + ext2fs_rb_erase(node, &refcount->root); > + ext2fs_free_mem(&el); > + } > ext2fs_free_mem(&refcount); > } > > -errcode_t ea_refcount_create(size_t size, ext2_refcount_t *ret) > +errcode_t ea_refcount_create(ext2_refcount_t *ret) > { > ext2_refcount_t refcount; > errcode_t retval; > - size_t bytes; > > retval = ext2fs_get_memzero(sizeof(struct ea_refcount), &refcount); > if (retval) > return retval; > - > - if (!size) > - size = 500; > - refcount->size = size; > - bytes = size * sizeof(struct ea_refcount_el); > -#ifdef DEBUG > - printf("Refcount allocated %zu entries, %zu bytes.\n", > - refcount->size, bytes); > -#endif > - retval = ext2fs_get_memzero(bytes, &refcount->list); > - if (retval) > - goto errout; > - > - refcount->count = 0; > - refcount->cursor = 0; > + refcount->root = RB_ROOT; > > *ret = refcount; > return 0; > - > -errout: > - ea_refcount_free(refcount); > - return(retval); > -} > - > -/* > - * collapse_refcount() --- go through the refcount array, and get rid > - * of any count == zero entries > - */ > -static void refcount_collapse(ext2_refcount_t refcount) > -{ > - unsigned int i, j; > - struct ea_refcount_el *list; > - > - list = refcount->list; > - for (i = 0, j = 0; i < refcount->count; i++) { > - if (list[i].ea_value) { > - if (i != j) > - list[j] = list[i]; > - j++; > - } > - } > -#if defined(DEBUG) || defined(TEST_PROGRAM) > - printf("Refcount_collapse: size was %zu, now %d\n", > - refcount->count, j); > -#endif > - refcount->count = j; > -} > - > - > -/* > - * insert_refcount_el() --- Insert a new entry into the sorted list at a > - * specified position. > - */ > -static struct ea_refcount_el *insert_refcount_el(ext2_refcount_t refcount, > - ea_key_t ea_key, int pos) > -{ > - struct ea_refcount_el *el; > - errcode_t retval; > - size_t new_size = 0; > - int num; > - > - if (refcount->count >= refcount->size) { > - new_size = refcount->size + 100; > -#ifdef DEBUG > - printf("Reallocating refcount %d entries...\n", new_size); > -#endif > - retval = ext2fs_resize_mem((size_t) refcount->size * > - sizeof(struct ea_refcount_el), > - (size_t) new_size * > - sizeof(struct ea_refcount_el), > - &refcount->list); > - if (retval) > - return 0; > - refcount->size = new_size; > - } > - num = (int) refcount->count - pos; > - if (num < 0) > - return 0; /* should never happen */ > - if (num) { > - memmove(&refcount->list[pos+1], &refcount->list[pos], > - sizeof(struct ea_refcount_el) * num); > - } > - refcount->count++; > - el = &refcount->list[pos]; > - el->ea_key = ea_key; > - el->ea_value = 0; > - return el; > } > > - > /* > * get_refcount_el() --- given an block number, try to find refcount > - * information in the sorted list. If the create flag is set, > - * and we can't find an entry, create one in the sorted list. > + * information in the rb-tree. If the create flag is set, > + * and we can't find an entry, create and add it to rb-tree. > */ > static struct ea_refcount_el *get_refcount_el(ext2_refcount_t refcount, > ea_key_t ea_key, int create) > { > - int low, high, mid; > + struct rb_node **node; > + struct rb_node *parent = NULL; > + struct ea_refcount_el *el; > + errcode_t retval; > > - if (!refcount || !refcount->list) > - return 0; > -retry: > - low = 0; > - high = (int) refcount->count-1; > - if (create && ((refcount->count == 0) || > - (ea_key > refcount->list[high].ea_key))) { > - if (refcount->count >= refcount->size) > - refcount_collapse(refcount); > - > - return insert_refcount_el(refcount, ea_key, > - (unsigned) refcount->count); > - } > - if (refcount->count == 0) > + if (!refcount) > return 0; > > - if (refcount->cursor >= refcount->count) > - refcount->cursor = 0; > - if (ea_key == refcount->list[refcount->cursor].ea_key) > - return &refcount->list[refcount->cursor++]; > -#ifdef DEBUG > - printf("Non-cursor get_refcount_el: %u\n", ea_key); > -#endif > - while (low <= high) { > - mid = (low+high)/2; > - if (ea_key == refcount->list[mid].ea_key) { > - refcount->cursor = mid+1; > - return &refcount->list[mid]; > - } > - if (ea_key < refcount->list[mid].ea_key) > - high = mid-1; > + node = &refcount->root.rb_node; > + while (*node) { > + el = ext2fs_rb_entry(*node, struct ea_refcount_el, node); > + > + parent = *node; > + if (ea_key < el->ea_key) > + node = &(*node)->rb_left; > + else if (ea_key > el->ea_key) > + node = &(*node)->rb_right; > else > - low = mid+1; > - } > - /* > - * If we need to create a new entry, it should be right at > - * low (where high will be left at low-1). > - */ > - if (create) { > - if (refcount->count >= refcount->size) { > - refcount_collapse(refcount); > - if (refcount->count < refcount->size) > - goto retry; > - } > - return insert_refcount_el(refcount, ea_key, low); > + return el; > } > - return 0; > + > + if (!create) > + return 0; > + > + retval = ext2fs_get_memzero(sizeof(struct ea_refcount_el), &el); > + if (retval) > + return 0; > + > + el->ea_key = ea_key; > + el->ea_value = 0; > + ext2fs_rb_link_node(&el->node, parent, node); > + ext2fs_rb_insert_color(&el->node, &refcount->root); > + > + return el; > } > > errcode_t ea_refcount_fetch(ext2_refcount_t refcount, ea_key_t ea_key, > @@ -240,13 +147,18 @@ errcode_t ea_refcount_decrement(ext2_refcount_t refcount, ea_key_t ea_key, > struct ea_refcount_el *el; > > el = get_refcount_el(refcount, ea_key, 0); > - if (!el || el->ea_value == 0) > + if (!el) > return EXT2_ET_INVALID_ARGUMENT; > > el->ea_value--; > > if (ret) > *ret = el->ea_value; > + > + if (el->ea_value == 0) { > + ext2fs_rb_erase(&el->node, &refcount->root); > + ext2fs_free_mem(&el); > + } > return 0; > } > > @@ -262,17 +174,13 @@ errcode_t ea_refcount_store(ext2_refcount_t refcount, ea_key_t ea_key, > if (!el) > return ea_value ? EXT2_ET_NO_MEMORY : 0; > el->ea_value = ea_value; > + if (el->ea_value == 0) { > + ext2fs_rb_erase(&el->node, &refcount->root); > + ext2fs_free_mem(&el); > + } > return 0; > } > > -size_t ext2fs_get_refcount_size(ext2_refcount_t refcount) > -{ > - if (!refcount) > - return 0; > - > - return refcount->size; > -} > - > void ea_refcount_intr_begin(ext2_refcount_t refcount) > { > refcount->cursor = 0; > @@ -281,19 +189,23 @@ void ea_refcount_intr_begin(ext2_refcount_t refcount) > ea_key_t ea_refcount_intr_next(ext2_refcount_t refcount, > ea_value_t *ret) > { > - struct ea_refcount_el *list; > - > - while (1) { > - if (refcount->cursor >= refcount->count) > - return 0; > - list = refcount->list; > - if (list[refcount->cursor].ea_value) { > - if (ret) > - *ret = list[refcount->cursor].ea_value; > - return list[refcount->cursor++].ea_key; > - } > - refcount->cursor++; > + struct ea_refcount_el *el; > + struct rb_node *node = refcount->cursor; > + > + if (node == NULL) > + node = ext2fs_rb_first(&refcount->root); > + else > + node = ext2fs_rb_next(node); > + > + if (node) { > + refcount->cursor = node; > + el = ext2fs_rb_entry(node, struct ea_refcount_el, node); > + if (ret) > + *ret = el->ea_value; > + return el->ea_key; > } > + > + return 0; > } > > > @@ -301,26 +213,28 @@ ea_key_t ea_refcount_intr_next(ext2_refcount_t refcount, > > errcode_t ea_refcount_validate(ext2_refcount_t refcount, FILE *out) > { > - errcode_t ret = 0; > - int i; > + struct ea_refcount_el *el; > + struct rb_node *node; > + ea_key_t prev; > + int prev_valid = 0; > const char *bad = "bad refcount"; > > - if (refcount->count > refcount->size) { > - fprintf(out, "%s: count > size\n", bad); > - return EXT2_ET_INVALID_ARGUMENT; > - } > - for (i=1; i < refcount->count; i++) { > - if (refcount->list[i-1].ea_key >= refcount->list[i].ea_key) { > + for (node = ext2fs_rb_first(&refcount->root); node != NULL; > + node = ext2fs_rb_next(node)) { > + el = ext2fs_rb_entry(node, struct ea_refcount_el, node); > + if (prev_valid && prev >= el->ea_key) { > fprintf(out, > - "%s: list[%d].ea_key=%llu, list[%d].ea_key=%llu\n", > - bad, i-1, > - (unsigned long long) refcount->list[i-1].ea_key, > - i, > - (unsigned long long) refcount->list[i].ea_key); > - ret = EXT2_ET_INVALID_ARGUMENT; > + "%s: prev.ea_key=%llu, curr.ea_key=%llu\n", > + bad, > + (unsigned long long) prev, > + (unsigned long long) el->ea_key); > + return EXT2_ET_INVALID_ARGUMENT; > } > + prev = el->ea_key; > + prev_valid = 1; > } > - return ret; > + > + return 0; > } > > #define BCODE_END 0 > @@ -332,10 +246,9 @@ errcode_t ea_refcount_validate(ext2_refcount_t refcount, FILE *out) > #define BCODE_FETCH 6 > #define BCODE_VALIDATE 7 > #define BCODE_LIST 8 > -#define BCODE_COLLAPSE 9 > > int bcode_program[] = { > - BCODE_CREATE, 5, > + BCODE_CREATE, > BCODE_STORE, 3, 3, > BCODE_STORE, 4, 4, > BCODE_STORE, 1, 1, > @@ -362,7 +275,6 @@ int bcode_program[] = { > BCODE_FETCH, 30, > BCODE_DECR, 2, > BCODE_DECR, 2, > - BCODE_COLLAPSE, > BCODE_LIST, > BCODE_VALIDATE, > BCODE_FREE, > @@ -373,7 +285,6 @@ int main(int argc, char **argv) > { > int i = 0; > ext2_refcount_t refcount; > - size_t size; > ea_key_t ea_key; > ea_value_t arg; > errcode_t retval; > @@ -383,15 +294,13 @@ int main(int argc, char **argv) > case BCODE_END: > exit(0); > case BCODE_CREATE: > - size = bcode_program[i++]; > - retval = ea_refcount_create(size, &refcount); > + retval = ea_refcount_create(&refcount); > if (retval) { > com_err("ea_refcount_create", retval, > - "while creating size %zu", size); > + "while creating refcount"); > exit(1); > } else > - printf("Creating refcount with size %zu\n", > - size); > + printf("Creating refcount\n"); > break; > case BCODE_FREE: > ea_refcount_free(refcount); > @@ -465,9 +374,6 @@ int main(int argc, char **argv) > (unsigned long long) arg); > } > break; > - case BCODE_COLLAPSE: > - refcount_collapse(refcount); > - break; > } > > } > diff --git a/e2fsck/pass1.c b/e2fsck/pass1.c > index a341c72ac..27364cd73 100644 > --- a/e2fsck/pass1.c > +++ b/e2fsck/pass1.c > @@ -398,8 +398,7 @@ static void inc_ea_inode_refs(e2fsck_t ctx, struct problem_context *pctx, > if (!entry->e_value_inum) > goto next; > if (!ctx->ea_inode_refs) { > - pctx->errcode = ea_refcount_create(0, > - &ctx->ea_inode_refs); > + pctx->errcode = ea_refcount_create(&ctx->ea_inode_refs); > if (pctx->errcode) { > pctx->num = 4; > fix_problem(ctx, PR_1_ALLOCATE_REFCOUNT, pctx); > @@ -2475,7 +2474,7 @@ static int check_ext_attr(e2fsck_t ctx, struct problem_context *pctx, > > /* Create the EA refcount structure if necessary */ > if (!ctx->refcount) { > - pctx->errcode = ea_refcount_create(0, &ctx->refcount); > + pctx->errcode = ea_refcount_create(&ctx->refcount); > if (pctx->errcode) { > pctx->num = 1; > fix_problem(ctx, PR_1_ALLOCATE_REFCOUNT, pctx); > @@ -2509,8 +2508,7 @@ static int check_ext_attr(e2fsck_t ctx, struct problem_context *pctx, > return 1; > /* Ooops, this EA was referenced more than it stated */ > if (!ctx->refcount_extra) { > - pctx->errcode = ea_refcount_create(0, > - &ctx->refcount_extra); > + pctx->errcode = ea_refcount_create(&ctx->refcount_extra); > if (pctx->errcode) { > pctx->num = 2; > fix_problem(ctx, PR_1_ALLOCATE_REFCOUNT, pctx); > @@ -2651,8 +2649,7 @@ static int check_ext_attr(e2fsck_t ctx, struct problem_context *pctx, > > if (quota_blocks != EXT2FS_C2B(fs, 1U)) { > if (!ctx->ea_block_quota_blocks) { > - pctx->errcode = ea_refcount_create(0, > - &ctx->ea_block_quota_blocks); > + pctx->errcode = ea_refcount_create(&ctx->ea_block_quota_blocks); > if (pctx->errcode) { > pctx->num = 3; > goto refcount_fail; > @@ -2664,8 +2661,7 @@ static int check_ext_attr(e2fsck_t ctx, struct problem_context *pctx, > > if (quota_inodes) { > if (!ctx->ea_block_quota_inodes) { > - pctx->errcode = ea_refcount_create(0, > - &ctx->ea_block_quota_inodes); > + pctx->errcode = ea_refcount_create(&ctx->ea_block_quota_inodes); > if (pctx->errcode) { > pctx->num = 4; > refcount_fail: > -- > 2.41.0 > Cheers, Andreas
Attachment:
signature.asc
Description: Message signed with OpenPGP