On Mon, 16 Oct 2023, Steven Rostedt wrote:

> From: "Steven Rostedt (Google)" <rostedt@xxxxxxxxxxx>
>
> I was loading a very large trace.dat file into kernelshark when it just
> stopped near the end of the load and hung there for a very long time. I
> ran it under gdb and hit Ctrl^C when it hit the hang to see what it was
> doing and found that it was spinning in:
>
> libtracecmd trace-input.c: free_zpage
>
> static void free_zpage(struct cpu_data *cpu_data, void *map)
> {
>	struct zchunk_cache *cache;
>
>	list_for_each_entry(cache, &cpu_data->compress.cache, list) {
>		if (map <= cache->map && map > (cache->map + cache->chunk->size))
>			goto found;
>	}
>	return;
>
> found:
>
> It seems that there can be over 10 thousand cache pages in that linked
> list and that kernelshark is constantly hitting that. So it's doing a
> linear search for the mapped page to free it.
>
> I ran: time kernelshark trace.dat
>
> And exited out when it finished loading and the result was:
>
>  real 6m14.772s
>  user 6m0.649s
>  sys 0m12.718s
>
> That's over 6 minutes to load the trace.dat file!!!
>
> I ran perf record on it and it showed 77% of the time was in free_zpage().
>
> I pulled out my old algorithms book and wrote up an rbtree for internal use
> of libtracecmd. Then I switched the cache into a binary rbtree to do the
> lookups. As the lookups used both where the memory of the compressed page
> is mapped as well as the offset depending on how the search was done, I
> found that it only used the memory allocation address in one location.
> Luckily, the memory allocation mapping lookup also had access to the
> offset of the file the memory represented. That allowed me to make all
> lookups use the file offset (Thanks to Masami Hiramatsu for helping me
> realize that).
>
> After converting the cache to an rbtree lookup, I ran kernelshark again on
> opening that file and exited out as soon as it finished loading and the
> timings were:
>
>  real 1m22.356s
>  user 1m10.532s
>  sys 0m10.901s
>
> Still a bit long, but it dropped from over 6 minutes to under 1 1/2
> minutes. Also, free_zpage() was no longer in the perf record output.

Does it impact trace-cmd report?
julia

>
> Signed-off-by: Steven Rostedt (Google) <rostedt@xxxxxxxxxxx>
> ---
>  lib/trace-cmd/Makefile                       |   1 +
>  lib/trace-cmd/include/private/trace-rbtree.h |  34 ++
>  lib/trace-cmd/trace-input.c                  |  81 +++--
>  lib/trace-cmd/trace-rbtree.c                 | 345 +++++++++++++++++++
>  4 files changed, 438 insertions(+), 23 deletions(-)
>  create mode 100644 lib/trace-cmd/include/private/trace-rbtree.h
>  create mode 100644 lib/trace-cmd/trace-rbtree.c
>
> diff --git a/lib/trace-cmd/Makefile b/lib/trace-cmd/Makefile
> index e9d26b2bb367..aba6fda5b0f6 100644
> --- a/lib/trace-cmd/Makefile
> +++ b/lib/trace-cmd/Makefile
> @@ -9,6 +9,7 @@ DEFAULT_TARGET = $(LIBTRACECMD_STATIC)
>
>  OBJS =
>  OBJS += trace-hash.o
> +OBJS += trace-rbtree.o
>  OBJS += trace-hooks.o
>  OBJS += trace-input.o
>  OBJS += trace-output.o
> diff --git a/lib/trace-cmd/include/private/trace-rbtree.h b/lib/trace-cmd/include/private/trace-rbtree.h
> new file mode 100644
> index 000000000000..822111998e7a
> --- /dev/null
> +++ b/lib/trace-cmd/include/private/trace-rbtree.h
> @@ -0,0 +1,34 @@
> +/* SPDX-License-Identifier: LGPL-2.1 */
> +/*
> + * Copyright (C) 2023 Google, Steven Rostedt <rostedt@xxxxxxxxxxx>
> + *
> + */
> +#ifndef _TRACE_RBTREE_H
> +#define _TRACE_RBTREE_H
> +
> +struct trace_rbtree_node {
> +	struct trace_rbtree_node *parent;
> +	struct trace_rbtree_node *left;
> +	struct trace_rbtree_node *right;
> +	int color;
> +};
> +
> +typedef int (*trace_rbtree_cmp_fn)(const struct trace_rbtree_node *A, const struct trace_rbtree_node *B);
> +
> +typedef int (*trace_rbtree_search_fn)(const struct trace_rbtree_node *n, const void *data);
> +
> +struct trace_rbtree {
> +	struct trace_rbtree_node *node;
> +	trace_rbtree_search_fn search;
> +	trace_rbtree_cmp_fn cmp;
> +	size_t nr_nodes;
> +};
> +
> +void trace_rbtree_init(struct trace_rbtree *tree, trace_rbtree_cmp_fn cmp_fn,
> +		       trace_rbtree_search_fn search_fn);
> +struct trace_rbtree_node *trace_rbtree_find(struct trace_rbtree *tree, const void *data);
> +void trace_rbtree_delete(struct trace_rbtree *tree, struct trace_rbtree_node *node);
> +int trace_rbtree_insert(struct trace_rbtree *tree, struct trace_rbtree_node *node);
> +struct trace_rbtree_node *trace_rbtree_pop_nobalance(struct trace_rbtree *tree);
> +
> +#endif /* _TRACE_RBTREE_H */
> diff --git a/lib/trace-cmd/trace-input.c b/lib/trace-cmd/trace-input.c
> index af51b3d71e1a..42c8312f8395 100644
> --- a/lib/trace-cmd/trace-input.c
> +++ b/lib/trace-cmd/trace-input.c
> @@ -16,6 +16,7 @@
>
>  #include "trace-write-local.h"
>  #include "trace-cmd-local.h"
> +#include "trace-rbtree.h"
>  #include "trace-local.h"
>  #include "kbuffer.h"
>  #include "list.h"
> @@ -66,7 +67,7 @@ struct page {
>  };
>
>  struct zchunk_cache {
> -	struct list_head list;
> +	struct trace_rbtree_node node;
>  	struct tracecmd_compress_chunk *chunk;
>  	void *map;
>  	int ref;
> @@ -78,7 +79,7 @@ struct cpu_zdata {
>  	char file[26]; /* strlen(COMPR_TEMP_FILE) */
>  	unsigned int count;
>  	unsigned int last_chunk;
> -	struct list_head cache;
> +	struct trace_rbtree cache;
>  	struct tracecmd_compress_chunk *chunks;
>  };
>
> @@ -1378,21 +1379,24 @@ static struct tracecmd_compress_chunk *get_zchunk(struct cpu_data *cpu, off_t of
>  	return chunk;
>  }
>
> -static void free_zpage(struct cpu_data *cpu_data, void *map)
> +static void free_zpage(struct cpu_data *cpu_data, off_t offset)
>  {
> +	struct trace_rbtree_node *node;
>  	struct zchunk_cache *cache;
>
> -	list_for_each_entry(cache, &cpu_data->compress.cache, list) {
> -		if (map <= cache->map && map > (cache->map + cache->chunk->size))
> -			goto found;
> -	}
> -	return;
> +	node = trace_rbtree_find(&cpu_data->compress.cache, (void *)&offset);
> +
> +	if (!node)
> +		return;
> +
> +	cache = container_of(node, struct zchunk_cache, node);
>
> -found:
>  	cache->ref--;
>  	if (cache->ref)
>  		return;
> -	list_del(&cache->list);
> +
> +	trace_rbtree_delete(&cpu_data->compress.cache, node);
> +
>  	free(cache->map);
>  	free(cache);
>  }
> @@ -1401,6 +1405,7 @@ static void *read_zpage(struct tracecmd_input *handle, int cpu, off_t offset)
>  {
>  	struct cpu_data *cpu_data = &handle->cpu_data[cpu];
>  	struct tracecmd_compress_chunk *chunk;
> +	struct trace_rbtree_node *node;
>  	struct zchunk_cache *cache;
>  	void *map = NULL;
>  	int pindex;
> @@ -1409,11 +1414,11 @@ static void *read_zpage(struct tracecmd_input *handle, int cpu, off_t offset)
>  	offset -= cpu_data->file_offset;
>
>  	/* Look in the cache of already loaded chunks */
> -	list_for_each_entry(cache, &cpu_data->compress.cache, list) {
> -		if (CHUNK_CHECK_OFFSET(cache->chunk, offset)) {
> -			cache->ref++;
> -			goto out;
> -		}
> +	node = trace_rbtree_find(&cpu_data->compress.cache, (void *)&offset);
> +	if (node) {
> +		cache = container_of(node, struct zchunk_cache, node);
> +		cache->ref++;
> +		goto out;
>  	}
>
>  	chunk = get_zchunk(cpu_data, offset);
> @@ -1435,7 +1440,7 @@ static void *read_zpage(struct tracecmd_input *handle, int cpu, off_t offset)
>  	cache->ref = 1;
>  	cache->chunk = chunk;
>  	cache->map = map;
> -	list_add(&cache->list, &cpu_data->compress.cache);
> +	trace_rbtree_insert(&cpu_data->compress.cache, &cache->node);
>
>  	/* a chunk can hold multiple pages, get the requested one */
>  out:
> @@ -1606,7 +1611,7 @@ static void __free_page(struct tracecmd_input *handle, struct page *page)
>  	if (handle->read_page)
>  		free(page->map);
>  	else if (handle->read_zpage)
> -		free_zpage(cpu_data, page->map);
> +		free_zpage(cpu_data, page->offset);
>  	else
>  		free_page_map(page->page_map);
>
> @@ -2102,7 +2107,7 @@ tracecmd_read_cpu_first(struct tracecmd_input *handle, int cpu)
>  	/* If the page was already mapped, we need to reset it */
>  	if (ret)
>  		update_page_info(handle, cpu);
> -
> +
>  	free_next(handle, cpu);
>
>  	return tracecmd_read_data(handle, cpu);
> @@ -3348,6 +3353,35 @@ static int init_cpu_zpage(struct tracecmd_input *handle, int cpu)
>  	return 0;
>  }
>
> +static int compress_cmp(const struct trace_rbtree_node *A,
> +			const struct trace_rbtree_node *B)
> +{
> +	const struct zchunk_cache *cacheA;
> +	const struct zchunk_cache *cacheB;
> +
> +	cacheA = container_of(A, struct zchunk_cache, node);
> +	cacheB = container_of(B, struct zchunk_cache, node);
> +
> +	return chunk_cmp(cacheA->chunk, cacheB->chunk);
> +}
> +
> +static int compress_search(const struct trace_rbtree_node *A,
> +			   const void *data)
> +{
> +	const struct zchunk_cache *cache;
> +	off_t offset = *(off_t *)data;
> +
> +	cache = container_of(A, struct zchunk_cache, node);
> +
> +	if (CHUNK_CHECK_OFFSET(cache->chunk, offset))
> +		return 0;
> +
> +	if (cache->chunk->offset < offset)
> +		return -1;
> +
> +	return 1;
> +}
> +
>  static int init_cpu(struct tracecmd_input *handle, int cpu)
>  {
>  	struct cpu_data *cpu_data = &handle->cpu_data[cpu];
> @@ -3368,7 +3402,8 @@ static int init_cpu(struct tracecmd_input *handle, int cpu)
>  	cpu_data->timestamp = 0;
>
>  	list_head_init(&cpu_data->page_maps);
> -	list_head_init(&cpu_data->compress.cache);
> +
> +	trace_rbtree_init(&cpu_data->compress.cache, compress_cmp, compress_search);
>
>  	if (!cpu_data->size) {
>  		tracecmd_info("CPU %d is empty", cpu);
> @@ -5131,10 +5166,10 @@ void tracecmd_close(struct tracecmd_input *handle)
>  			close(cpu_data->compress.fd);
>  			unlink(cpu_data->compress.file);
>  		}
> -		while (!list_empty(&cpu_data->compress.cache)) {
> -			cache = container_of(cpu_data->compress.cache.next,
> -					     struct zchunk_cache, list);
> -			list_del(&cache->list);
> +		while (cpu_data->compress.cache.node) {
> +			struct trace_rbtree_node *node;
> +			node = trace_rbtree_pop_nobalance(&cpu_data->compress.cache);
> +			cache = container_of(node, struct zchunk_cache, node);
>  			free(cache->map);
>  			free(cache);
>  		}
> diff --git a/lib/trace-cmd/trace-rbtree.c b/lib/trace-cmd/trace-rbtree.c
> new file mode 100644
> index 000000000000..b015773ed2d6
> --- /dev/null
> +++ b/lib/trace-cmd/trace-rbtree.c
> @@ -0,0 +1,345 @@
> +// SPDX-License-Identifier: LGPL-2.1
> +/*
> + * Copyright (C) 2023 Google, Steven Rostedt <rostedt@xxxxxxxxxxx>
> + *
> + */
> +#include <stdlib.h>
> +#include <stdbool.h>
> +#include "trace-local.h"
> +#include "trace-rbtree.h"
> +
> +enum {
> +	RED,
> +	BLACK,
> +};
> +
> +void __hidden trace_rbtree_init(struct trace_rbtree *tree, trace_rbtree_cmp_fn cmp_fn,
> +				trace_rbtree_search_fn search_fn)
> +{
> +	memset(tree, 0, sizeof(*tree));
> +	tree->search = search_fn;
> +	tree->cmp = cmp_fn;
> +}
> +
> +static bool is_left(struct trace_rbtree_node *node)
> +{
> +	return node == node->parent->left;
> +}
> +
> +static void check_tree(struct trace_rbtree_node *node)
> +{
> +	if (!node)
> +		return;
> +
> +	if (node->color == RED &&
> +	    ((node->left && node->left->color == RED) ||
> +	     ((node->right && node->right->color == RED))))
> +		breakpoint();
> +
> +	check_tree(node->left);
> +	check_tree(node->right);
> +}
> +
> +static struct trace_rbtree_node **get_parent_ptr(struct trace_rbtree *tree,
> +						 struct trace_rbtree_node *node)
> +{
> +	if (!node->parent)
> +		return &tree->node;
> +	else if (is_left(node))
> +		return &node->parent->left;
> +	else
> +		return &node->parent->right;
> +}
> +
> +static void rotate_left(struct trace_rbtree *tree,
> +			struct trace_rbtree_node *node)
> +{
> +	struct trace_rbtree_node **parent_ptr = get_parent_ptr(tree, node);
> +	struct trace_rbtree_node *parent = node->parent;
> +	struct trace_rbtree_node *old_right = node->right;
> +
> +	*parent_ptr = old_right;
> +	node->right = old_right->left;
> +	old_right->left = node;
> +
> +	node->parent = old_right;
> +	old_right->parent = parent;
> +}
> +
> +static void rotate_right(struct trace_rbtree *tree,
> +			 struct trace_rbtree_node *node)
> +{
> +	struct trace_rbtree_node **parent_ptr = get_parent_ptr(tree, node);
> +	struct trace_rbtree_node *parent = node->parent;
> +	struct trace_rbtree_node *old_left = node->left;
> +
> +	*parent_ptr = old_left;
> +	node->left = old_left->right;
> +	old_left->right = node;
> +
> +	node->parent = old_left;
> +	old_left->parent = parent;
> +}
> +
> +static void insert_tree(struct trace_rbtree *tree,
> +			struct trace_rbtree_node *node)
> +{
> +	struct trace_rbtree_node *next = tree->node;
> +	struct trace_rbtree_node *last_next = NULL;
> +	bool went_left = false;
> +
> +	while (next) {
> +		last_next = next;
> +		if (tree->cmp(next, node) > 0) {
> +			next = next->right;
> +			went_left = false;
> +		} else {
> +			next = next->left;
> +			went_left = true;
> +		}
> +	}
> +
> +	if (!last_next) {
> +		tree->node = node;
> +		return;
> +	}
> +
> +	if (went_left)
> +		last_next->left = node;
> +	else
> +		last_next->right = node;
> +
> +	node->parent = last_next;
> +}
> +
> +int __hidden trace_rbtree_insert(struct trace_rbtree *tree,
> +				 struct trace_rbtree_node *node)
> +{
> +	memset(node, 0, sizeof(*node));
> +
> +	insert_tree(tree, node);
> +	node->color = RED;
> +	while (node && node->parent && node->parent->color == RED) {
> +		if (is_left(node->parent)) {
> +			struct trace_rbtree_node *old_right;
> +
> +			old_right = node->parent->parent->right;
> +			if (old_right && old_right->color == RED) {
> +				node->parent->color = BLACK;
> +				old_right->color = BLACK;
> +				node->parent->parent->color = RED;
> +				node = node->parent->parent;
> +			} else {
> +				if (!is_left(node)) {
> +					node = node->parent;
> +					rotate_left(tree, node);
> +				}
> +				node->parent->color = BLACK;
> +				node->parent->parent->color = RED;
> +				rotate_right(tree, node->parent->parent);
> +			}
> +		} else {
> +			struct trace_rbtree_node *old_left;
> +
> +			old_left = node->parent->parent->left;
> +			if (old_left && old_left->color == RED) {
> +				node->parent->color = BLACK;
> +				old_left->color = BLACK;
> +				node->parent->parent->color = RED;
> +				node = node->parent->parent;
> +			} else {
> +				if (is_left(node)) {
> +					node = node->parent;
> +					rotate_right(tree, node);
> +				}
> +				node->parent->color = BLACK;
> +				node->parent->parent->color = RED;
> +				rotate_left(tree, node->parent->parent);
> +			}
> +		}
> +	}
> +	tree->node->color = BLACK;
> +	tree->nr_nodes++;
> +	check_tree(tree->node);
> +	return 0;
> +}
> +
> +struct trace_rbtree_node *trace_rbtree_find(struct trace_rbtree *tree, const void *data)
> +{
> +	struct trace_rbtree_node *node = tree->node;
> +	int ret;
> +
> +	while (node) {
> +		ret = tree->search(node, data);
> +		if (!ret)
> +			return node;
> +		if (ret > 0)
> +			node = node->right;
> +		else
> +			node = node->left;
> +	}
> +	return NULL;
> +}
> +
> +static struct trace_rbtree_node *next_node(struct trace_rbtree_node *node)
> +{
> +	if (node->right) {
> +		node = node->right;
> +		while (node->left)
> +			node = node->left;
> +		return node;
> +	}
> +
> +	while (node->parent && !is_left(node))
> +		node = node->parent;
> +
> +	return node->parent;
> +}
> +
> +static void tree_fixup(struct trace_rbtree *tree, struct trace_rbtree_node *node)
> +{
> +	while (node->parent && node->color == BLACK) {
> +		if (is_left(node)) {
> +			struct trace_rbtree_node *old_right = node->parent->right;
> +
> +			if (old_right->color == RED) {
> +				old_right->color = BLACK;
> +				node->parent->color = RED;
> +				rotate_left(tree, node->parent);
> +				old_right = node->parent->right;
> +			}
> +			if (old_right->left->color == BLACK &&
> +			    old_right->right->color == BLACK) {
> +				old_right->color = RED;
> +				node = node->parent;
> +			} else {
> +				if (old_right->right->color == BLACK) {
> +					old_right->left->color = BLACK;
> +					old_right->color = RED;
> +					rotate_right(tree, old_right);
> +					old_right = node->parent->right;
> +				}
> +				old_right->color = node->parent->color;
> +				node->parent->color = BLACK;
> +				old_right->right->color = BLACK;
> +				rotate_left(tree, node->parent);
> +				node = tree->node;
> +			}
> +		} else {
> +			struct trace_rbtree_node *old_left = node->parent->left;
> +
> +			if (old_left->color == RED) {
> +				old_left->color = BLACK;
> +				node->parent->color = RED;
> +				rotate_right(tree, node->parent);
> +				old_left = node->parent->left;
> +			}
> +			if (old_left->right->color == BLACK &&
> +			    old_left->left->color == BLACK) {
> +				old_left->color = RED;
> +				node = node->parent;
> +			} else {
> +				if (old_left->left->color == BLACK) {
> +					old_left->right->color = BLACK;
> +					old_left->color = RED;
> +					rotate_left(tree, old_left);
> +					old_left = node->parent->left;
> +				}
> +				old_left->color = node->parent->color;
> +				node->parent->color = BLACK;
> +				old_left->left->color = BLACK;
> +				rotate_right(tree, node->parent);
> +				node = tree->node;
> +			}
> +		}
> +	}
> +	node->color = BLACK;
> +}
> +
> +void trace_rbtree_delete(struct trace_rbtree *tree, struct trace_rbtree_node *node)
> +{
> +	struct trace_rbtree_node *x, *y;
> +
> +	if (!node->left && !node->right) {
> +		tree->node = NULL;
> +		goto out;
> +	}
> +
> +	if (!node->left || !node->right)
> +		y = node;
> +	else
> +		y = next_node(node);
> +
> +	if (y->left)
> +		x = y->left;
> +	else
> +		x = y->right;
> +
> +	x->parent = y->parent;
> +
> +	if (!x->parent) {
> +		tree->node = x;
> +	} else {
> +		if (is_left(y))
> +			y->parent->left = x;
> +		else
> +			y->parent->right = x;
> +	}
> +
> +	if (y != node) {
> +		y->parent = node->parent;
> +		y->left = node->left;
> +		y->right = node->right;
> +		if (y->left)
> +			y->left->parent = y;
> +		if (y->right)
> +			y->right->parent = y;
> +		if (!y->parent) {
> +			tree->node = y;
> +		} else {
> +			if (is_left(node))
> +				y->parent->left = y;
> +			else
> +				y->parent->right = y;
> +		}
> +	}
> +
> +	if (y->color == BLACK)
> +		tree_fixup(tree, x);
> +
> + out:
> +	node->parent = node->left = node->right = NULL;
> +	tree->nr_nodes--;
> +	check_tree(tree->node);
> +}
> +
> +/*
> + * Used for freeing a tree, just quickly pop off the children in
> + * no particular order. This will corrupt the tree! That is,
> + * do not do any inserting or deleting of this tree after calling
> + * this function.
> + */
> +struct trace_rbtree_node *trace_rbtree_pop_nobalance(struct trace_rbtree *tree)
> +{
> +	struct trace_rbtree_node *node = tree->node;
> +
> +	if (!node)
> +		return NULL;
> +
> +	while (node->left)
> +		node = node->left;
> +
> +	while (node->right)
> +		node = node->right;
> +
> +	if (node->parent) {
> +		if (is_left(node))
> +			node->parent->left = NULL;
> +		else
> +			node->parent->right = NULL;
> +	} else {
> +		tree->node = NULL;
> +	}
> +
> +	return node;
> +}
> -- 
> 2.42.0
>
>
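As a minimal usage sketch of the proposed API (not part of the patch), the
example below shows the pattern the zchunk_cache conversion relies on: a
caller embeds a struct trace_rbtree_node in its own structure, keys it by a
file offset, supplies a cmp callback for inserts and a search callback for
lookups, and tears the tree down with trace_rbtree_pop_nobalance(). The
struct my_item type, my_cmp(), my_search() and the main() driver are
hypothetical, invented for illustration; trace-rbtree.h is a private header
and trace_rbtree_init()/trace_rbtree_insert() are marked __hidden, so code
like this would only build inside libtracecmd itself. The comparator return
conventions mirror compress_cmp() and compress_search() above.

/* Illustrative sketch only, assuming the trace_rbtree API from this patch */
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <sys/types.h>

#include "trace-rbtree.h"

#ifndef container_of
/* Normally provided by the library's internal headers */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#endif

struct my_item {
	struct trace_rbtree_node node;	/* intrusive node, no separate allocation */
	off_t offset;			/* hypothetical key, like the chunk offset */
};

/* Order two tree members against each other (used on insert) */
static int my_cmp(const struct trace_rbtree_node *A,
		  const struct trace_rbtree_node *B)
{
	const struct my_item *a = container_of(A, struct my_item, node);
	const struct my_item *b = container_of(B, struct my_item, node);

	if (a->offset < b->offset)
		return -1;
	return a->offset > b->offset;
}

/* Compare a tree member against a search key (used on find) */
static int my_search(const struct trace_rbtree_node *n, const void *data)
{
	const struct my_item *item = container_of(n, struct my_item, node);
	off_t offset = *(const off_t *)data;

	if (item->offset == offset)
		return 0;
	return item->offset < offset ? -1 : 1;
}

int main(void)
{
	struct trace_rbtree tree;
	struct trace_rbtree_node *node;
	off_t key = 2;
	off_t i;

	trace_rbtree_init(&tree, my_cmp, my_search);

	for (i = 0; i < 5; i++) {
		struct my_item *item = calloc(1, sizeof(*item));

		if (!item)
			return 1;
		item->offset = i;
		trace_rbtree_insert(&tree, &item->node);
	}

	/* O(log n) lookup by key, replacing the old linear list walk */
	node = trace_rbtree_find(&tree, &key);
	if (node)
		printf("found offset %lld\n",
		       (long long)container_of(node, struct my_item, node)->offset);

	/* Tear down the way tracecmd_close() does: pop without rebalancing */
	while ((node = trace_rbtree_pop_nobalance(&tree)))
		free(container_of(node, struct my_item, node));

	return 0;
}

Note the design choice this sketch leans on: the tree never allocates
anything itself. Callers embed the node and recover the containing object
with container_of(), which is why teardown can simply pop nodes and free the
surrounding structures.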