Hi Lianbo, Thank you for working on this. We know the stats are very useful, but they are tough work to support :-( I have a few questions about the basic design: - Is it possible to re-use sbitmap_for_each_set() etc. in sbitmap.c? They were implemented like the sbitmap functions in the kernel, so maybe we can imitate bt_for_each() in the kernel. Otherwise, we will have to update both sbitmap.c and dev.c whenever the sbitmap structure changes. Or did you find any reason that they cannot be used? - Is it possible to use the new logic whenever blk-mq has sbitmap, even if a kernel has rq_dispatched and rq_completed? They are unreliable in the first place, as you said [1], and will not be worth displaying if we can parse sbitmap, I think. [1] https://listman.redhat.com/archives/crash-utility/2022-February/009531.html Thanks, Kazu -----Original Message----- > Currently, crash doesn't support to display disk I/O statistics > for blk-mq devices. For more details, please refer to the following > commit: <98b417fc6346> ("Handle blk_mq_ctx member changes for kernels > 5.16-rc1 and later"). > > Lets parse the bitmap in blk-mq layer to achieve it. 
> > Signed-off-by: Lianbo Jiang <lijiang@xxxxxxxxxx> > --- > defs.h | 13 +++ > dev.c | 302 +++++++++++++++++++++++++++++++++++++++++++++++------- > symbols.c | 26 +++++ > 3 files changed, 302 insertions(+), 39 deletions(-) > > diff --git a/defs.h b/defs.h > index 81ac0498dac7..f3c05fb44e62 100644 > --- a/defs.h > +++ b/defs.h > @@ -2168,6 +2168,18 @@ struct offset_table { /* stash of commonly-used offsets */ > long sbitmap_queue_min_shallow_depth; > long sbq_wait_state_wait_cnt; > long sbq_wait_state_wait; > + long request_q; > + long request_cmd_flags; > + long request_queue_queue_hw_ctx; > + long request_queue_nr_hw_queues; > + long blk_mq_hw_ctx_tags; > + long blk_mq_hw_ctx_sched_tags; > + long blk_mq_tags_bitmap_tags; > + long blk_mq_tags_breserved_tags; > + long blk_mq_tags_nr_reserved_tags; > + long blk_mq_tags_nr_tags; > + long blk_mq_tags_rqs; > + long blk_mq_tags_static_rqs; > }; > > struct size_table { /* stash of commonly-used sizes */ > @@ -2337,6 +2349,7 @@ struct size_table { /* stash of commonly-used sizes */ > long sbitmap; > long sbitmap_queue; > long sbq_wait_state; > + long blk_mq_tags; > }; > > struct array_table { > diff --git a/dev.c b/dev.c > index a493e51ac95c..4d574f06494f 100644 > --- a/dev.c > +++ b/dev.c > @@ -4238,19 +4238,224 @@ get_one_mctx_diskio(unsigned long mctx, struct diskio *io) > io->write = (dispatch[1] - comp[1]); > } > > +struct blk_mq_tags_context { > + uint nr_tags; > + uint nr_reserved_tags; > + ulong bitmap_tags; > + ulong breserved_tags; > + ulong rqs; > + ulong static_rqs; > +}; > + > +static void load_blk_mq_tags_context(ulong addr, struct blk_mq_tags_context *bmtc) > +{ > + char *tag_buf = NULL; > + > + tag_buf = GETBUF(SIZE(blk_mq_tags)); > + if (!tag_buf) > + error(FATAL, "fail to get memory for blk_mq_tags\n"); > + > + if (!readmem(addr, KVADDR, tag_buf, SIZE(blk_mq_tags), "blk_mq_tags", RETURN_ON_ERROR)) { > + FREEBUF(tag_buf); > + error(FATAL, "cannot read blk_mq_tags\n"); > + } > + > + bmtc->nr_tags = 
UINT(tag_buf + OFFSET(blk_mq_tags_nr_tags)); > + bmtc->nr_reserved_tags = UINT(tag_buf + OFFSET(blk_mq_tags_nr_reserved_tags)); > + bmtc->bitmap_tags = addr + OFFSET(blk_mq_tags_bitmap_tags); > + bmtc->breserved_tags = addr + OFFSET(blk_mq_tags_breserved_tags); > + bmtc->rqs = ULONG(tag_buf + OFFSET(blk_mq_tags_rqs)); > + bmtc->static_rqs = ULONG(tag_buf + OFFSET(blk_mq_tags_static_rqs)); > + > + FREEBUF(tag_buf); > +} > + > +static void load_blk_mq_rqs(ulong rqs_addr, uint counts, ulong **rqs) > +{ > + char *rqs_buf = NULL; > + uint rqs_buf_size = sizeof(void *) * counts; > + > + if (!IS_KVADDR(rqs_addr)) { > + *rqs = NULL; > + return; > + } > + > + rqs_buf = GETBUF(rqs_buf_size); > + if (!rqs_buf) > + error(FATAL, "fail to get memory for the rqs buf\n"); > + > + if (!readmem(rqs_addr, KVADDR, rqs_buf, rqs_buf_size, > + "blk_mq_tags.[static_]rqs", RETURN_ON_ERROR)) { > + FREEBUF(rqs_buf); > + error(FATAL, "fail to load blk_mq_tags.[static_]rqs\n"); > + } > + > + *rqs = (ulong*)rqs_buf; > +} > + > +static uint op_is_write(uint op) > +{ > +#define REQ_OP_BITS 8 > +#define REQ_OP_MASK ((1 << REQ_OP_BITS) - 1) > + > + return (op & REQ_OP_MASK) & 1; > +} > + > +static void find_mq_diskio_by_parsing_bitmap(ulong q, ulong tags, uint offset, ulong *rqs, struct diskio > **io_counts) > +{ > + int i,j; > + char *sbitmap_word_buf = NULL; > + ulong depth = 0, word = 0, cleared = 0; > + struct sbitmap_context sc = {0}; > + uint idx = offset; > + ulong addr = tags + OFFSET(sbitmap_queue_sb); > + ulong sbitmap_word_size = SIZE(sbitmap_word); > + > + sbitmap_word_buf = GETBUF(sbitmap_word_size); > + if (!sbitmap_word_buf) > + error(FATAL, "fail to get memory for the sbitmap word\n"); > + > + sbitmap_context_load(addr, &sc); > + if (sc.map_nr == 0 || !IS_KVADDR(sc.map_addr)) > + goto sbitmap_word_buf_free; > + > + addr = sc.map_addr; > + for (i = 0; i < sc.map_nr; i++, addr += sbitmap_word_size) { > + if (!readmem(addr, KVADDR, sbitmap_word_buf, sbitmap_word_size, > + 
"sbitmap_word", RETURN_ON_ERROR)) { > + error(INFO, "cannot read sbitmap_word\n"); > + goto sbitmap_word_buf_free; > + } > + > + word = ULONG(sbitmap_word_buf + OFFSET(sbitmap_word_word)); > + cleared = ULONG(sbitmap_word_buf + OFFSET(sbitmap_word_cleared)); > + word = word & ~cleared; > + if (!word) > + continue; > + > + if (i == sc.map_nr - 1) > + depth = sc.depth - (i << sc.shift); > + else > + depth = 1U << sc.shift; > + > + for (j = 0; j < depth; j++) { > + if ((word & 1) && (rqs[idx])) { > + ulong queue = 0; > + uint cmd_flags = 0; > + > + if (!readmem(rqs[idx] + OFFSET(request_q), KVADDR, &queue, > + sizeof(ulong), "request.q", RETURN_ON_ERROR)) > + goto next; > + > + if (!readmem(rqs[idx] + OFFSET(request_cmd_flags), KVADDR, &cmd_flags, > + sizeof(uint), "request.cmd_flags", RETURN_ON_ERROR)) > + goto next; > + > + if (q == queue) { > + if (op_is_write(cmd_flags)) > + (*io_counts)->write++; > + else > + (*io_counts)->read++; > + } > + > + } > + next: > + idx++; > + word >>= 1; > + } > + } > + > +sbitmap_word_buf_free: > + FREEBUF(sbitmap_word_buf); > +} > + > +static void get_mq_diskio_from_hw_queues(unsigned long q, struct diskio *io_counts) > +{ > + unsigned long *queue_hw_ctx = NULL; > + unsigned long addr = 0; > + unsigned int i, nr_hw_queues; > + > + addr = q + OFFSET(request_queue_nr_hw_queues); > + readmem(addr, KVADDR, &nr_hw_queues, sizeof(uint), > + "request_queue.nr_hw_queues", FAULT_ON_ERROR); > + > + queue_hw_ctx = (ulong *)GETBUF(sizeof(void *) * nr_hw_queues); > + if (!queue_hw_ctx) > + error(FATAL, "fail to get memory for the queue_hw_ctx\n"); > + > + addr = q + OFFSET(request_queue_queue_hw_ctx); > + if (!readmem(addr, KVADDR, queue_hw_ctx, sizeof(void *) * nr_hw_queues, > + "request_queue.queue_hw_ctx", RETURN_ON_ERROR)) > + goto queue_hw_ctx_free; > + > + for (i = 0; i < nr_hw_queues; i++) { > + ulong tags = 0, sched_tags = 0, queue_hw_ctx_addr = 0; > + unsigned long *rqs = NULL, *static_rqs = NULL; > + struct blk_mq_tags_context tags_ctx 
= {0}; > + struct blk_mq_tags_context sched_tags_ctx = {0}; > + > + if(!IS_KVADDR(queue_hw_ctx[i])) > + continue; > + > + if (!readmem(queue_hw_ctx[i], KVADDR, &queue_hw_ctx_addr, > + sizeof(ulong), "blk_mq_hw_ctx", RETURN_ON_ERROR)) > + goto queue_hw_ctx_free; > + > + if(!IS_KVADDR(queue_hw_ctx_addr)) > + continue; > + > + addr = queue_hw_ctx_addr + OFFSET(blk_mq_hw_ctx_tags); > + if (!readmem(addr, KVADDR, &tags, sizeof(ulong), > + "blk_mq_hw_ctx.tags", RETURN_ON_ERROR)) > + goto queue_hw_ctx_free; > + > + addr = queue_hw_ctx_addr + OFFSET(blk_mq_hw_ctx_sched_tags); > + if (!readmem(addr, KVADDR, &sched_tags, sizeof(ulong), > + "blk_mq_hw_ctx.sched_tags", RETURN_ON_ERROR)) > + goto queue_hw_ctx_free; > + > + if (IS_KVADDR(tags)) { > + load_blk_mq_tags_context(tags, &tags_ctx); > + load_blk_mq_rqs(tags_ctx.rqs, tags_ctx.nr_tags, &rqs); > + if (!rqs) > + goto next; > + find_mq_diskio_by_parsing_bitmap(q, tags_ctx.breserved_tags, 0, rqs, &io_counts); > + find_mq_diskio_by_parsing_bitmap(q, tags_ctx.bitmap_tags, > tags_ctx.nr_reserved_tags, rqs, &io_counts); > + FREEBUF(rqs); > + } > + > + next: > + if (IS_KVADDR(sched_tags)) { > + load_blk_mq_tags_context(sched_tags, &sched_tags_ctx); > + load_blk_mq_rqs(sched_tags_ctx.static_rqs, sched_tags_ctx.nr_tags, &static_rqs); > + if (!static_rqs) > + continue; > + find_mq_diskio_by_parsing_bitmap(q, sched_tags_ctx.breserved_tags, 0, static_rqs, > &io_counts); > + find_mq_diskio_by_parsing_bitmap(q, sched_tags_ctx.bitmap_tags, > sched_tags_ctx.nr_reserved_tags, > + static_rqs, &io_counts); > + FREEBUF(static_rqs); > + } > + } > + > +queue_hw_ctx_free: > + FREEBUF(queue_hw_ctx); > +} > + > static void > get_mq_diskio(unsigned long q, unsigned long *mq_count) > { > int cpu; > unsigned long queue_ctx; > unsigned long mctx_addr; > - struct diskio tmp; > + struct diskio tmp = {0}; > > if (INVALID_MEMBER(blk_mq_ctx_rq_dispatched) || > - INVALID_MEMBER(blk_mq_ctx_rq_completed)) > + INVALID_MEMBER(blk_mq_ctx_rq_completed)) { > + 
get_mq_diskio_from_hw_queues(q, &tmp); > + mq_count[0] = tmp.read; > + mq_count[1] = tmp.write; > return; > - > - memset(&tmp, 0x00, sizeof(struct diskio)); > + } > > readmem(q + OFFSET(request_queue_queue_ctx), KVADDR, &queue_ctx, > sizeof(ulong), "request_queue.queue_ctx", > @@ -4479,41 +4684,24 @@ display_one_diskio(struct iter *i, unsigned long gendisk, ulong flags) > && (io.read + io.write == 0)) > return; > > - if (use_mq_interface(queue_addr) && > - (INVALID_MEMBER(blk_mq_ctx_rq_dispatched) || > - INVALID_MEMBER(blk_mq_ctx_rq_completed))) > - fprintf(fp, "%s%s%s %s%s%s%s %s%s%s", > - mkstring(buf0, 5, RJUST|INT_DEC, (char *)(unsigned long)major), > - space(MINSPACE), > - mkstring(buf1, VADDR_PRLEN, LJUST|LONG_HEX, (char *)gendisk), > - space(MINSPACE), > - mkstring(buf2, 10, LJUST, disk_name), > - space(MINSPACE), > - mkstring(buf3, VADDR_PRLEN <= 11 ? 11 : VADDR_PRLEN, > - LJUST|LONG_HEX, (char *)queue_addr), > - space(MINSPACE), > - mkstring(buf4, 17, RJUST, "(not supported)"), > - space(MINSPACE)); > - > - else > - fprintf(fp, "%s%s%s %s%s%s%s %s%5d%s%s%s%s%s", > - mkstring(buf0, 5, RJUST|INT_DEC, (char *)(unsigned long)major), > - space(MINSPACE), > - mkstring(buf1, VADDR_PRLEN, LJUST|LONG_HEX, (char *)gendisk), > - space(MINSPACE), > - mkstring(buf2, 10, LJUST, disk_name), > - space(MINSPACE), > - mkstring(buf3, VADDR_PRLEN <= 11 ? 
11 : VADDR_PRLEN, > - LJUST|LONG_HEX, (char *)queue_addr), > - space(MINSPACE), > - io.read + io.write, > - space(MINSPACE), > - mkstring(buf4, 5, RJUST|INT_DEC, > - (char *)(unsigned long)io.read), > - space(MINSPACE), > - mkstring(buf5, 5, RJUST|INT_DEC, > - (char *)(unsigned long)io.write), > - space(MINSPACE)); > + fprintf(fp, "%s%s%s %s%s%s%s %s%5d%s%s%s%s%s", > + mkstring(buf0, 5, RJUST|INT_DEC, (char *)(unsigned long)major), > + space(MINSPACE), > + mkstring(buf1, VADDR_PRLEN, LJUST|LONG_HEX, (char *)gendisk), > + space(MINSPACE), > + mkstring(buf2, 10, LJUST, disk_name), > + space(MINSPACE), > + mkstring(buf3, VADDR_PRLEN <= 11 ? 11 : VADDR_PRLEN, > + LJUST|LONG_HEX, (char *)queue_addr), > + space(MINSPACE), > + io.read + io.write, > + space(MINSPACE), > + mkstring(buf4, 5, RJUST|INT_DEC, > + (char *)(unsigned long)io.read), > + space(MINSPACE), > + mkstring(buf5, 5, RJUST|INT_DEC, > + (char *)(unsigned long)io.write), > + space(MINSPACE)); > > if (VALID_MEMBER(request_queue_in_flight)) { > if (!use_mq_interface(queue_addr)) { > @@ -4603,15 +4791,51 @@ void diskio_init(void) > MEMBER_OFFSET_INIT(request_queue_rq, "request_queue", "rq"); > else > MEMBER_OFFSET_INIT(request_queue_rq, "request_queue", "root_rl"); > + if (MEMBER_EXISTS("request", "q")) > + MEMBER_OFFSET_INIT(request_q, "request", "q"); > + if (MEMBER_EXISTS("request", "cmd_flags")) > + MEMBER_OFFSET_INIT(request_cmd_flags, "request", "cmd_flags"); > if (MEMBER_EXISTS("request_queue", "mq_ops")) { > MEMBER_OFFSET_INIT(request_queue_mq_ops, "request_queue", > "mq_ops"); > ANON_MEMBER_OFFSET_INIT(request_queue_queue_ctx, > "request_queue", "queue_ctx"); > + MEMBER_OFFSET_INIT(request_queue_queue_hw_ctx, > + "request_queue", "queue_hw_ctx"); > + MEMBER_OFFSET_INIT(request_queue_nr_hw_queues, > + "request_queue", "nr_hw_queues"); > MEMBER_OFFSET_INIT(blk_mq_ctx_rq_dispatched, "blk_mq_ctx", > "rq_dispatched"); > MEMBER_OFFSET_INIT(blk_mq_ctx_rq_completed, "blk_mq_ctx", > "rq_completed"); > + 
MEMBER_OFFSET_INIT(blk_mq_hw_ctx_tags, "blk_mq_hw_ctx", > + "tags"); > + MEMBER_OFFSET_INIT(blk_mq_hw_ctx_sched_tags, "blk_mq_hw_ctx", > + "sched_tags"); > + MEMBER_OFFSET_INIT(blk_mq_tags_bitmap_tags, "blk_mq_tags", > + "bitmap_tags"); > + MEMBER_OFFSET_INIT(blk_mq_tags_breserved_tags, "blk_mq_tags", > + "breserved_tags"); > + MEMBER_OFFSET_INIT(blk_mq_tags_nr_reserved_tags, "blk_mq_tags", > + "nr_reserved_tags"); > + MEMBER_OFFSET_INIT(blk_mq_tags_nr_tags, "blk_mq_tags", > + "nr_tags"); > + MEMBER_OFFSET_INIT(blk_mq_tags_rqs, "blk_mq_tags", > + "rqs"); > + MEMBER_OFFSET_INIT(blk_mq_tags_static_rqs, "blk_mq_tags", > + "static_rqs"); > + STRUCT_SIZE_INIT(blk_mq_tags, "blk_mq_tags"); > + STRUCT_SIZE_INIT(sbitmap, "sbitmap"); > + STRUCT_SIZE_INIT(sbitmap_word, "sbitmap_word"); > + MEMBER_OFFSET_INIT(sbitmap_word_depth, "sbitmap_word", "depth"); > + MEMBER_OFFSET_INIT(sbitmap_word_word, "sbitmap_word", "word"); > + MEMBER_OFFSET_INIT(sbitmap_word_cleared, "sbitmap_word", "cleared"); > + MEMBER_OFFSET_INIT(sbitmap_depth, "sbitmap", "depth"); > + MEMBER_OFFSET_INIT(sbitmap_shift, "sbitmap", "shift"); > + MEMBER_OFFSET_INIT(sbitmap_map_nr, "sbitmap", "map_nr"); > + MEMBER_OFFSET_INIT(sbitmap_map, "sbitmap", "map"); > + MEMBER_OFFSET_INIT(sbitmap_queue_sb, "sbitmap_queue", "sb"); > + > } > MEMBER_OFFSET_INIT(subsys_private_klist_devices, "subsys_private", > "klist_devices"); > diff --git a/symbols.c b/symbols.c > index ba5e2741347d..0612255b6e34 100644 > --- a/symbols.c > +++ b/symbols.c > @@ -10385,6 +10385,10 @@ dump_offset_table(char *spec, ulong makestruct) > OFFSET(kset_list)); > fprintf(fp, " request_list_count: %ld\n", > OFFSET(request_list_count)); > + fprintf(fp, " request_q: %ld\n", > + OFFSET(request_q)); > + fprintf(fp, " request_cmd_flags: %ld\n", > + OFFSET(request_cmd_flags)); > fprintf(fp, " request_queue_in_flight: %ld\n", > OFFSET(request_queue_in_flight)); > fprintf(fp, " request_queue_rq: %ld\n", > @@ -10393,10 +10397,31 @@ dump_offset_table(char 
*spec, ulong makestruct) > OFFSET(request_queue_mq_ops)); > fprintf(fp, " request_queue_queue_ctx: %ld\n", > OFFSET(request_queue_queue_ctx)); > + fprintf(fp, " request_queue_queue_hw_ctx: %ld\n", > + OFFSET(request_queue_queue_hw_ctx)); > + fprintf(fp, " request_queue_nr_hw_queues: %ld\n", > + OFFSET(request_queue_nr_hw_queues)); > fprintf(fp, " blk_mq_ctx_rq_dispatched: %ld\n", > OFFSET(blk_mq_ctx_rq_dispatched)); > fprintf(fp, " blk_mq_ctx_rq_completed: %ld\n", > OFFSET(blk_mq_ctx_rq_completed)); > + fprintf(fp, " blk_mq_hw_ctx_tags: %ld\n", > + OFFSET(blk_mq_hw_ctx_tags)); > + fprintf(fp, " blk_mq_hw_ctx_sched_tags: %ld\n", > + OFFSET(blk_mq_hw_ctx_sched_tags)); > + fprintf(fp, " blk_mq_tags_bitmap_tags: %ld\n", > + OFFSET(blk_mq_tags_bitmap_tags)); > + fprintf(fp, " blk_mq_tags_breserved_tags: %ld\n", > + OFFSET(blk_mq_tags_breserved_tags)); > + fprintf(fp, " blk_mq_tags_nr_reserved_tags: %ld\n", > + OFFSET(blk_mq_tags_nr_reserved_tags)); > + fprintf(fp, " blk_mq_tags_nr_tags: %ld\n", > + OFFSET(blk_mq_tags_nr_tags)); > + fprintf(fp, " blk_mq_tags_rqs: %ld\n", > + OFFSET(blk_mq_tags_rqs)); > + fprintf(fp, " blk_mq_tags_static_rqs: %ld\n", > + OFFSET(blk_mq_tags_static_rqs)); > + > fprintf(fp, " subsys_private_klist_devices: %ld\n", > OFFSET(subsys_private_klist_devices)); > fprintf(fp, " subsystem_kset: %ld\n", > @@ -10999,6 +11024,7 @@ dump_offset_table(char *spec, ulong makestruct) > fprintf(fp, " sbitmap: %ld\n", SIZE(sbitmap)); > fprintf(fp, " sbitmap_queue: %ld\n", SIZE(sbitmap_queue)); > fprintf(fp, " sbq_wait_state: %ld\n", SIZE(sbq_wait_state)); > + fprintf(fp, " blk_mq_tags: %ld\n", SIZE(blk_mq_tags)); > > fprintf(fp, "\n array_table:\n"); > /* > -- > 2.20.1 -- Crash-utility mailing list Crash-utility@xxxxxxxxxx https://listman.redhat.com/mailman/listinfo/crash-utility Contribution Guidelines: https://github.com/crash-utility/crash/wiki