There are 2 ways to iterate vm_area_struct: 1) by rbtree, aka vma.vm_rb; 2) by linked list, aka vma.vm_prev/next. However for linux maple tree patch[1][2], vm_rb and vm_prev/next are removed from vm_area_struct. For memory.c:vm_area_dump of crash, it mainly uses linked list as a way of vma iteration, which will not work for this case. So maple tree iteration need to be ported to crash. For crash, currently it only iteratively read the maple tree, no more rcu safe or maple tree modification features needed. So we only port a subset of kernel maple tree features. In addition, we need to modify the ported kernel source code, making it compatible with crash. The formal crash way of vmcore struct member resolving is: readmem(node, KVADDR, buf, SIZE(buf), "", flag); return buf + OFFSET(member); which is the reimplementation of kernel way of member resolving: return node->member; The 1st one is arch independent, it uses gdb to resolve the OFFSET of members, so crash don't need to know what the inside of the struct is, even if the struct changes for new kernel version. The 2nd one is arch dependent, the struct need to be ported to crash, and the OFFSET of members may differ between crash and kernel due to padding/ alignment or optimization reasons. This patch deals with the 2 issues: 1) Poring mt_dump() function, and all its dependencies from kernel source[3] to crash, to enable crash maple tree iteration, 2) adapting the ported code with crash. [1]: https://github.com/oracle/linux-uek/commit/d19703645b80abe35dff1a88449d074b0b5b1bb1 [2]: https://github.com/oracle/linux-uek/commit/91dee01f1ebb6b6587463b6ee6f7bbc4965f91d5 [3]: https://github.com/oracle/linux-uek, maple/mainline branch Signed-off-by: Tao Liu <ltao@xxxxxxxxxx> --- Makefile | 10 +- defs.h | 19 +++ maple_tree.c | 409 +++++++++++++++++++++++++++++++++++++++++++++++++++ maple_tree.h | 82 +++++++++++ 4 files changed, 517 insertions(+), 3 deletions(-) create mode 100644 maple_tree.c create mode 100644 maple_tree.h diff --git a/Makefile b/Makefile index 1506dd4..102597f 100644 --- a/Makefile +++ b/Makefile @@ -59,6 +59,7 @@ IBM_HFILES=ibm_common.h SADUMP_HFILES=sadump.h UNWIND_HFILES=unwind.h unwind_i.h rse.h unwind_x86.h unwind_x86_64.h VMWARE_HFILES=vmware_vmss.h +MAPLE_TREE_HFILES=maple_tree.h CFILES=main.c tools.c global_data.c memory.c filesys.c help.c task.c \ kernel.c test.c gdb_interface.c configure.c net.c dev.c bpf.c \ @@ -73,12 +74,12 @@ CFILES=main.c tools.c global_data.c memory.c filesys.c help.c task.c \ xen_hyper.c xen_hyper_command.c xen_hyper_global_data.c \ xen_hyper_dump_tables.c kvmdump.c qemu.c qemu-load.c sadump.c ipcs.c \ ramdump.c vmware_vmss.c vmware_guestdump.c \ - xen_dom0.c kaslr_helper.c sbitmap.c + xen_dom0.c kaslr_helper.c sbitmap.c maple_tree.c SOURCE_FILES=${CFILES} ${GENERIC_HFILES} ${MCORE_HFILES} \ ${REDHAT_CFILES} ${REDHAT_HFILES} ${UNWIND_HFILES} \ ${LKCD_DUMP_HFILES} ${LKCD_TRACE_HFILES} ${LKCD_OBSOLETE_HFILES}\ - ${IBM_HFILES} ${SADUMP_HFILES} ${VMWARE_HFILES} + ${IBM_HFILES} ${SADUMP_HFILES} ${VMWARE_HFILES} ${MAPLE_TREE_HFILES} OBJECT_FILES=main.o tools.o global_data.o memory.o filesys.o help.o task.o \ build_data.o kernel.o test.o gdb_interface.o net.o dev.o bpf.o \ @@ -93,7 +94,7 @@ OBJECT_FILES=main.o tools.o global_data.o memory.o filesys.o help.o task.o \ xen_hyper.o xen_hyper_command.o xen_hyper_global_data.o \ xen_hyper_dump_tables.o kvmdump.o qemu.o qemu-load.o sadump.o ipcs.o \ ramdump.o vmware_vmss.o vmware_guestdump.o \ - xen_dom0.o kaslr_helper.o sbitmap.o + xen_dom0.o kaslr_helper.o sbitmap.o maple_tree.o MEMORY_DRIVER_FILES=memory_driver/Makefile memory_driver/crash.c memory_driver/README @@ -539,6 +540,9 @@ kaslr_helper.o: ${GENERIC_HFILES} kaslr_helper.c bpf.o: ${GENERIC_HFILES} bpf.c ${CC} -c ${CRASH_CFLAGS} bpf.c ${WARNING_OPTIONS} ${WARNING_ERROR} +maple_tree.o: ${GENERIC_HFILES} ${MAPLE_TREE_HFILES} maple_tree.c + ${CC} -c ${CRASH_CFLAGS} maple_tree.c ${WARNING_OPTIONS} ${WARNING_ERROR} + ${PROGRAM}: force @$(MAKE) all diff --git a/defs.h b/defs.h index 08ac4dc..46bfd4a 100644 --- a/defs.h +++ b/defs.h @@ -2189,6 +2189,21 @@ struct offset_table { /* stash of commonly-used offsets */ long request_queue_hctx_table; long percpu_counter_counters; long slab_slab_list; + long mm_struct_mm_mt; + long maple_tree_ma_root; + long maple_tree_ma_flags; + long maple_node_parent; + long maple_node_ma64; + long maple_node_mr64; + long maple_node_slot; + long maple_arange_64_pivot; + long maple_arange_64_slot; + long maple_arange_64_gap; + long maple_arange_64_meta; + long maple_range_64_pivot; + long maple_range_64_slot; + long maple_metadata_end; + long maple_metadata_gap; }; struct size_table { /* stash of commonly-used sizes */ @@ -2360,6 +2375,8 @@ struct size_table { /* stash of commonly-used sizes */ long sbq_wait_state; long blk_mq_tags; long percpu_counter; + long maple_tree; + long maple_node; }; struct array_table { @@ -5742,6 +5759,8 @@ int file_dump(ulong, ulong, ulong, int, int); int same_file(char *, char *); int cleanup_memory_driver(void); +void maple_init(void); +int do_mptree(struct tree_data *); /* * help.c diff --git a/maple_tree.c b/maple_tree.c new file mode 100644 index 0000000..0575db8 --- /dev/null +++ b/maple_tree.c @@ -0,0 +1,409 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Maple Tree implementation + * Copyright (c) 2018-2022 Oracle Corporation + * Authors: Liam R. Howlett <Liam.Howlett@xxxxxxxxxx> + * Matthew Wilcox <willy@xxxxxxxxxxxxx> + * + * The following are copied and modified from lib/maple_tree.c + */ + +#include "maple_tree.h" +#include "defs.h" + +unsigned char *mt_slots = NULL; +unsigned char *mt_pivots = NULL; +ulong mt_max[4] = {0}; + +#define MAPLE_BUFSIZE 512 + +static inline ulong mte_to_node(ulong maple_enode_entry) +{ + return maple_enode_entry & ~MAPLE_NODE_MASK; +} + +static inline enum maple_type mte_node_type(ulong maple_enode_entry) +{ + return (maple_enode_entry >> MAPLE_NODE_TYPE_SHIFT) & + MAPLE_NODE_TYPE_MASK; +} + +static inline ulong mt_slot(void **slots, unsigned char offset) +{ + return (ulong)slots[offset]; +} + +static inline bool ma_is_leaf(const enum maple_type type) +{ + return type < maple_range_64; +} + +/***************For cmd_tree********************/ + +struct maple_tree_ops { + void (*entry)(ulong node, ulong slot, const char *path, + ulong index, void *private); + void *private; + bool is_td; +}; + +static const char spaces[] = " "; + +static void do_mt_range64(ulong, ulong, ulong, uint, char *, ulong *, + struct maple_tree_ops *); +static void do_mt_arange64(ulong, ulong, ulong, uint, char *, ulong *, + struct maple_tree_ops *); +static void do_mt_entry(ulong, ulong, ulong, uint, uint, char *, ulong *, + struct maple_tree_ops *); +static void do_mt_node(ulong, ulong, ulong, uint, char *, ulong *, + struct maple_tree_ops *); +struct req_entry *fill_member_offsets(char *); +void dump_struct_members_fast(struct req_entry *, int, ulong); +void dump_struct_members_for_tree(struct tree_data *, int, ulong); + +static void mt_dump_range(ulong min, ulong max, uint depth) +{ + if (min == max) + fprintf(fp, "%.*s%lu: ", depth * 2, spaces, min); + else + fprintf(fp, "%.*s%lu-%lu: ", depth * 2, spaces, min, max); +} + +static inline bool mt_is_reserved(ulong entry) +{ + return (entry < MAPLE_RESERVED_RANGE) && xa_is_internal(entry); +} + +static inline bool mte_is_leaf(ulong maple_enode_entry) +{ + return ma_is_leaf(mte_node_type(maple_enode_entry)); +} + +static uint mt_height(void *maple_tree_mt) +{ + return (*(uint *)(maple_tree_mt + OFFSET(maple_tree_ma_flags)) & + MT_FLAGS_HEIGHT_MASK) + >> MT_FLAGS_HEIGHT_OFFSET; +} + +static void dump_mt_range64(void *maple_range_64_node) +{ + int i; + + fprintf(fp, " contents: "); + for (i = 0; i < mt_slots[maple_range_64] - 1; i++) + fprintf(fp, "%p %lu ", + *((void **)(maple_range_64_node + + OFFSET(maple_range_64_slot)) + i), + *((ulong *)(maple_range_64_node + + OFFSET(maple_range_64_pivot)) + i)); + fprintf(fp, "%p\n", *((void **)(maple_range_64_node + + OFFSET(maple_range_64_slot)) + i)); +} + +static void dump_mt_arange64(void *maple_arange_64_node) +{ + int i; + + fprintf(fp, " contents: "); + for (i = 0; i < mt_slots[maple_arange_64]; i++) + fprintf(fp, "%lu ", + *((ulong *)(maple_arange_64_node + + OFFSET(maple_arange_64_gap)) + i)); + + fprintf(fp, "| %02X %02X| ", + *((unsigned char *)maple_arange_64_node + + OFFSET(maple_arange_64_meta) + + OFFSET(maple_metadata_end)), + *((unsigned char *)maple_arange_64_node + + OFFSET(maple_arange_64_meta) + + OFFSET(maple_metadata_gap))); + + for (i = 0; i < mt_slots[maple_arange_64] - 1; i++) + fprintf(fp, "%p %lu ", + *((void **)(maple_arange_64_node + + OFFSET(maple_arange_64_slot)) + i), + *((ulong *)(maple_arange_64_node + + OFFSET(maple_arange_64_pivot)) + i)); + fprintf(fp, "%p\n", + *((void **)(maple_arange_64_node + + OFFSET(maple_arange_64_slot)) + i)); +} + +static void dump_mt_entry(ulong entry, ulong min, ulong max, uint depth) +{ + mt_dump_range(min, max, depth); + + if (xa_is_value(entry)) + fprintf(fp, "value %ld (0x%lx) [0x%lx]\n", xa_to_value(entry), + xa_to_value(entry), entry); + else if (xa_is_zero(entry)) + fprintf(fp, "zero (%ld)\n", xa_to_internal(entry)); + else if (mt_is_reserved(entry)) + fprintf(fp, "UNKNOWN ENTRY (0x%lx)\n", entry); + else + fprintf(fp, "0x%lx\n", entry); +} + +static void dump_mt_node(ulong maple_node, char *node_data, uint type, + ulong min, ulong max, uint depth) +{ + mt_dump_range(min, max, depth); + + fprintf(fp, "node 0x%lx depth %d type %d parent %p", maple_node, depth, type, + maple_node ? VOID_PTR(node_data + OFFSET(maple_node_parent)) + : NULL); +} + +static void do_mt_range64(ulong entry, ulong min, ulong max, + uint depth, char *path, ulong *global_index, + struct maple_tree_ops *ops) +{ + ulong maple_node_m_node = mte_to_node(entry); + unsigned char tmp_node[MAPLE_BUFSIZE]; + bool leaf = mte_is_leaf(entry); + ulong first = min, last; + int i; + int len = strlen(path); + struct tree_data *td = ops->is_td ? (struct tree_data *)ops->private : NULL; + void *maple_range_64_node; + + if (SIZE(maple_node) > MAPLE_BUFSIZE) + error(FATAL, "MAPLE_BUFSIZE should be larger than maple_node struct"); + + readmem(maple_node_m_node, KVADDR, tmp_node, SIZE(maple_node), + "mt_dump_range64 read maple_node", FAULT_ON_ERROR); + + maple_range_64_node = tmp_node + OFFSET(maple_node_mr64); + + for (i = 0; i < mt_slots[maple_range_64]; i++) { + last = max; + + if (i < (mt_slots[maple_range_64] - 1)) + last = *((ulong *)(maple_range_64_node + + OFFSET(maple_range_64_pivot)) + i); + else if (!*((void **)(maple_range_64_node + + OFFSET(maple_range_64_slot)) + i) && + max != mt_max[mte_node_type(entry)]) + break; + if (last == 0 && i > 0) + break; + if (leaf) + do_mt_entry(mt_slot((void **)(maple_range_64_node + + OFFSET(maple_range_64_slot)), i), + first, last, depth + 1, i, path, global_index, ops); + else if (*((void **)(maple_range_64_node + + OFFSET(maple_range_64_slot)) + i)) { + sprintf(path + len, "/%d", i); + do_mt_node(mt_slot((void **)(maple_range_64_node + + OFFSET(maple_range_64_slot)), i), + first, last, depth + 1, path, global_index, ops); + } + + if (last == max) + break; + if (last > max) { + fprintf(fp, "node %p last (%lu) > max (%lu) at pivot %d!\n", + maple_range_64_node, last, max, i); + break; + } + first = last + 1; + } +} + +static void do_mt_arange64(ulong entry, ulong min, ulong max, + uint depth, char *path, ulong *global_index, + struct maple_tree_ops *ops) +{ + ulong maple_node_m_node = mte_to_node(entry); + unsigned char tmp_node[MAPLE_BUFSIZE]; + bool leaf = mte_is_leaf(entry); + ulong first = min, last; + int i; + int len = strlen(path); + struct tree_data *td = ops->is_td ? (struct tree_data *)ops->private : NULL; + void *maple_arange_64_node; + + if (SIZE(maple_node) > MAPLE_BUFSIZE) + error(FATAL, "MAPLE_BUFSIZE should be larger than maple_node struct"); + + readmem(maple_node_m_node, KVADDR, tmp_node, SIZE(maple_node), + "mt_dump_arange64 read maple_node", FAULT_ON_ERROR); + + maple_arange_64_node = tmp_node + OFFSET(maple_node_ma64); + + for (i = 0; i < mt_slots[maple_arange_64]; i++) { + last = max; + + if (i < (mt_slots[maple_arange_64] - 1)) + last = *((ulong *)(maple_arange_64_node + + OFFSET(maple_arange_64_pivot)) + i); + else if (! *((void **)(maple_arange_64_node + + OFFSET(maple_arange_64_slot)) + i)) + break; + if (last == 0 && i > 0) + break; + + if (leaf) + do_mt_entry(mt_slot((void **)(maple_arange_64_node + + OFFSET(maple_arange_64_slot)), i), + first, last, depth + 1, i, path, global_index, ops); + else if (*((void **)(maple_arange_64_node + + OFFSET(maple_arange_64_slot)) + i)) { + sprintf(path + len, "/%d", i); + do_mt_node(mt_slot((void **)(maple_arange_64_node + + OFFSET(maple_arange_64_slot)), i), + first, last, depth + 1, path, global_index, ops); + } + + if (last == max) + break; + if (last > max) { + fprintf(fp, "node %p last (%lu) > max (%lu) at pivot %d!\n", + maple_arange_64_node, last, max, i); + break; + } + first = last + 1; + } +} + +static void do_mt_entry(ulong entry, ulong min, ulong max, uint depth, + uint index, char *path, ulong *global_index, + struct maple_tree_ops *ops) +{ + int print_radix = 0, i; + static struct req_entry **e = NULL; + struct tree_data *td = ops->is_td ? (struct tree_data *)ops->private : NULL; + + if (!td) + return; +} + +static void do_mt_node(ulong entry, ulong min, ulong max, + uint depth, char *path, ulong *global_index, + struct maple_tree_ops *ops) +{ + ulong maple_node = mte_to_node(entry); + uint type = mte_node_type(entry); + uint i; + char tmp_node[MAPLE_BUFSIZE]; + struct tree_data *td = ops->is_td ? (struct tree_data *)ops->private : NULL; + + if (SIZE(maple_node) > MAPLE_BUFSIZE) + error(FATAL, "MAPLE_BUFSIZE should be larger than maple_node struct"); + + readmem(maple_node, KVADDR, tmp_node, SIZE(maple_node), + "mt_dump_node read maple_node", FAULT_ON_ERROR); + + switch (type) { + case maple_dense: + for (i = 0; i < mt_slots[maple_dense]; i++) { + if (min + i > max) + fprintf(fp, "OUT OF RANGE: "); + do_mt_entry(mt_slot((void **)(tmp_node + OFFSET(maple_node_slot)), i), + min + i, min + i, depth, i, path, global_index, ops); + } + break; + case maple_leaf_64: + case maple_range_64: + do_mt_range64(entry, min, max, depth, path, global_index, ops); + break; + case maple_arange_64: + do_mt_arange64(entry, min, max, depth, path, global_index, ops); + break; + default: + fprintf(fp, " UNKNOWN TYPE\n"); + } +} + +static int do_maple_tree_traverse(ulong ptr, int is_root, + struct maple_tree_ops *ops) +{ + char path[BUFSIZE] = {0}; + unsigned char tmp_tree[MAPLE_BUFSIZE]; + ulong entry; + struct tree_data *td = ops->is_td ? (struct tree_data *)ops->private : NULL; + ulong global_index = 0; + + if (SIZE(maple_tree) > MAPLE_BUFSIZE) + error(FATAL, "MAPLE_BUFSIZE should be larger than maple_tree struct"); + + if (!is_root) { + strcpy(path, "direct"); + do_mt_node(ptr, 0, mt_max[mte_node_type(ptr)], + 0, path, &global_index, ops); + } else { + readmem(ptr, KVADDR, tmp_tree, SIZE(maple_tree), + "mt_dump read maple_tree", FAULT_ON_ERROR); + entry = ULONG(tmp_tree + OFFSET(maple_tree_ma_root)); + + if (!xa_is_node(entry)) + do_mt_entry(entry, 0, 0, 0, 0, path, &global_index, ops); + else if (entry) { + strcpy(path, "root"); + do_mt_node(entry, 0, mt_max[mte_node_type(entry)], 0, + path, &global_index, ops); + } + } + return 0; +} + +int do_mptree(struct tree_data *td) +{ + struct maple_tree_ops ops = { + .entry = NULL, + .private = td, + .is_td = true, + }; + + int is_root = !(td->flags & TREE_NODE_POINTER); + + do_maple_tree_traverse(td->start, is_root, &ops); + + return 0; +} + +/***********************************************/ +void maple_init(void) +{ + int array_len; + + STRUCT_SIZE_INIT(maple_tree, "maple_tree"); + STRUCT_SIZE_INIT(maple_node, "maple_node"); + + MEMBER_OFFSET_INIT(maple_tree_ma_root, "maple_tree", "ma_root"); + MEMBER_OFFSET_INIT(maple_tree_ma_flags, "maple_tree", "ma_flags"); + + MEMBER_OFFSET_INIT(maple_node_parent, "maple_node", "parent"); + MEMBER_OFFSET_INIT(maple_node_ma64, "maple_node", "ma64"); + MEMBER_OFFSET_INIT(maple_node_mr64, "maple_node", "mr64"); + MEMBER_OFFSET_INIT(maple_node_slot, "maple_node", "slot"); + + MEMBER_OFFSET_INIT(maple_arange_64_pivot, "maple_arange_64", "pivot"); + MEMBER_OFFSET_INIT(maple_arange_64_slot, "maple_arange_64", "slot"); + MEMBER_OFFSET_INIT(maple_arange_64_gap, "maple_arange_64", "gap"); + MEMBER_OFFSET_INIT(maple_arange_64_meta, "maple_arange_64", "meta"); + + MEMBER_OFFSET_INIT(maple_range_64_pivot, "maple_range_64", "pivot"); + MEMBER_OFFSET_INIT(maple_range_64_slot, "maple_range_64", "slot"); + + MEMBER_OFFSET_INIT(maple_metadata_end, "maple_metadata", "end"); + MEMBER_OFFSET_INIT(maple_metadata_gap, "maple_metadata", "gap"); + + array_len = get_array_length("mt_slots", NULL, sizeof(char)); + mt_slots = calloc(array_len, sizeof(char)); + readmem(symbol_value("mt_slots"), KVADDR, mt_slots, + array_len * sizeof(char), "maple_init read mt_slots", + RETURN_ON_ERROR); + + array_len = get_array_length("mt_pivots", NULL, sizeof(char)); + mt_pivots = calloc(array_len, sizeof(char)); + readmem(symbol_value("mt_pivots"), KVADDR, mt_pivots, + array_len * sizeof(char), "maple_init read mt_pivots", + RETURN_ON_ERROR); + + mt_max[maple_dense] = mt_slots[maple_dense]; + mt_max[maple_leaf_64] = ULONG_MAX; + mt_max[maple_range_64] = ULONG_MAX; + mt_max[maple_arange_64] = ULONG_MAX; +} \ No newline at end of file diff --git a/maple_tree.h b/maple_tree.h new file mode 100644 index 0000000..369fa9f --- /dev/null +++ b/maple_tree.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef _MAPLE_TREE_H +#define _MAPLE_TREE_H +/* + * Maple Tree - An RCU-safe adaptive tree for storing ranges + * Copyright (c) 2018-2022 Oracle + * Authors: Liam R. Howlett <Liam.Howlett@xxxxxxxxxx> + * Matthew Wilcox <willy@xxxxxxxxxxxxx> + * + * eXtensible Arrays + * Copyright (c) 2017 Microsoft Corporation + * Author: Matthew Wilcox <willy@xxxxxxxxxxxxx> + * + * See Documentation/core-api/xarray.rst for how to use the XArray. + */ +#include <stdbool.h> +#include <limits.h> +#include <sys/types.h> + +/* + * The following are copied and modified from include/linux/maple_tree.h + */ + +enum maple_type { + maple_dense, + maple_leaf_64, + maple_range_64, + maple_arange_64, +}; + +#define MAPLE_NODE_MASK 255UL + +#define MT_FLAGS_HEIGHT_OFFSET 0x02 +#define MT_FLAGS_HEIGHT_MASK 0x7C + +#define MAPLE_NODE_TYPE_MASK 0x0F +#define MAPLE_NODE_TYPE_SHIFT 0x03 + +#define MAPLE_RESERVED_RANGE 4096 + +/* + * The following are copied and modified from include/linux/xarray.h + */ + +#define XA_ZERO_ENTRY xa_mk_internal(257) + +static inline ulong xa_mk_internal(ulong v) +{ + return (v << 2) | 2; +} + +static inline bool xa_is_internal(ulong entry) +{ + return (entry & 3) == 2; +} + +static inline bool xa_is_node(ulong entry) +{ + return xa_is_internal(entry) && entry > 4096; +} + +static inline bool xa_is_value(ulong entry) +{ + return entry & 1; +} + +static inline bool xa_is_zero(ulong entry) +{ + return entry == XA_ZERO_ENTRY; +} + +static inline unsigned long xa_to_internal(ulong entry) +{ + return entry >> 2; +} + +static inline unsigned long xa_to_value(ulong entry) +{ + return entry >> 1; +} + +#endif /* _MAPLE_TREE_H */ \ No newline at end of file -- 2.33.1 -- Crash-utility mailing list Crash-utility@xxxxxxxxxx https://listman.redhat.com/mailman/listinfo/crash-utility Contribution Guidelines: https://github.com/crash-utility/crash/wiki