On 6/2/19 6:58 PM, Chaitanya Kulkarni wrote:
Update blktrace tool to support the trace extensions. The new option
'-E' will allow blktrace to configure trace with newly introduced
IOCTLs for trace extensions. '-Y' and '-y' options are similar to
'-a' and '-A' but can now use the mask for additional operations
which are supported for trace-extension. For tracking the priority we
add '-P' along with the '-X' and '-x' to specify priority value and
mask.
Documentation in addition to the updated usage text?
doc/blkparse.1
doc/blktrace.tex
optimally including new output formats such as for the extended RWBS and
prio-class so users know how to read it
[my background is https://www.spinics.net/lists/linux-btrace/msg00840.html
https://www.spinics.net/lists/linux-btrace/msg00836.html
https://www.spinics.net/lists/linux-btrace/msg00825.html]
Update blkparse format with the trace extension support so blkparse
now can now identify and dump the trace extension. The new function
fill_rwbs_ext() formats the trace string with newly added trace
categories. We encapsulate existing blk_trace and newly added
blk_trace_ext structure under struct trace and use the t_bit_xxx()
helpers to read the appropriate value based on whether extensions
are enabled or not. We also add the support for priority tracking
when trace extension is enabled.
Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@xxxxxxx>
---
Makefile | 2 +-
blkparse.c | 1072 +++++++++++++++++++++++++++++++++++-------------
blkparse.h | 78 ++++
blkparse_fmt.c | 204 ++++++---
blktrace.h | 6 +-
5 files changed, 1017 insertions(+), 345 deletions(-)
create mode 100644 blkparse.h
diff --git a/Makefile b/Makefile
index 5917814..af90a45 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
CC = gcc
CFLAGS = -Wall -O2 -g -W
ALL_CFLAGS = $(CFLAGS) -D_GNU_SOURCE -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64
-PROGS = blkparse blktrace verify_blkparse blkrawverify blkiomon
+PROGS = blkparse blktrace #verify_blkparse blkrawverify blkiomon
not building verify_blkparse any more looks odd, is it a left over from
development / debugging?
LIBS = -lpthread
SCRIPTS = btrace
SUBDIRS = btreplay btt iowatcher
diff --git a/blkparse.c b/blkparse.c
index 227cc44..36fe568 100644
--- a/blkparse.c
+++ b/blkparse.c
@@ -201,6 +201,42 @@ static struct option l_opts[] = {
.flag = NULL,
.val = 'w'
},
+ {
+ .name = "act-mask-ext",
+ .has_arg = required_argument,
+ .flag = NULL,
+ .val = 'y'
+ },
+ {
+ .name = "set-mask-ext",
+ .has_arg = required_argument,
+ .flag = NULL,
+ .val = 'Y'
+ },
+ {
+ .name = "use-extensions",
+ .has_arg = no_argument,
+ .flag = NULL,
+ .val = 'E'
+ },
+ {
+ .name = "track-priority",
+ .has_arg = no_argument,
+ .flag = NULL,
+ .val = 'P'
+ },
+ {
+ .name = "prio-mask",
+ .has_arg = required_argument,
+ .flag = NULL,
+ .val = 'x'
+ },
+ {
+ .name = "prio-set-mask",
+ .has_arg = required_argument,
+ .flag = NULL,
+ .val = 'X'
+ },
{
.name = "verbose",
.has_arg = no_argument,
@@ -572,53 +611,82 @@ static struct process_pid_map *add_ppm_hash(pid_t pid, const char *name)
return ppm;
}
-static void handle_notify(struct blk_io_trace *bit)
+static void handle_notify(struct trace *t)
{
- void *payload = (caddr_t) bit + sizeof(*bit);
+ void *payload;
__u32 two32[2];
- switch (bit->action) {
- case BLK_TN_PROCESS:
- add_ppm_hash(bit->pid, payload);
- break;
-
- case BLK_TN_TIMESTAMP:
- if (bit->pdu_len != sizeof(two32))
- return;
- memcpy(two32, payload, sizeof(two32));
- if (!data_is_native) {
- two32[0] = be32_to_cpu(two32[0]);
- two32[1] = be32_to_cpu(two32[1]);
- }
- start_timestamp = bit->time;
- abs_start_time.tv_sec = two32[0];
- abs_start_time.tv_nsec = two32[1];
- if (abs_start_time.tv_nsec < 0) {
- abs_start_time.tv_sec--;
- abs_start_time.tv_nsec += 1000000000;
+ if (t->use_ext)
+ payload = (caddr_t) t->bit_ext + sizeof(*(t->bit_ext));
+ else
+ payload = (caddr_t) t->bit + sizeof(*(t->bit));
+
+ if (use_ext) {
-
- break;
-
- case BLK_TN_MESSAGE:
- if (bit->pdu_len > 0) {
- char msg[bit->pdu_len+1];
-
- memcpy(msg, (char *)payload, bit->pdu_len);
- msg[bit->pdu_len] = '\0';
-
- fprintf(ofp,
- "%3d,%-3d %2d %8s %5d.%09lu %5u %2s %3s %s\n",
- MAJOR(bit->device), MINOR(bit->device),
- bit->cpu, "0", (int) SECONDS(bit->time),
- (unsigned long) NANO_SECONDS(bit->time),
- 0, "m", "N", msg);
I find it a bit hard to review that everything under the else case remains the
same code with just an additional indentation. Would it be possible to just add
the if(use_ext) with an early return and leave the old non-extended code as is?
+ } else {
+ if (t_bit_act(t) == BLK_TN_PROCESS) {
+ add_ppm_hash(t_bit_pid(t), payload);
+ } else if (t_bit_act(t) == BLK_TN_TIMESTAMP) {
+ if (t_bit_pdu_len(t) != sizeof(two32))
+ return;
+ memcpy(two32, payload, sizeof(two32));
+ if (!data_is_native) {
+ two32[0] = be32_to_cpu(two32[0]);
+ two32[1] = be32_to_cpu(two32[1]);
+ }
+ start_timestamp = t_bit_time(t);
+ abs_start_time.tv_sec = two32[0];
+ abs_start_time.tv_nsec = two32[1];
+ if (abs_start_time.tv_nsec < 0) {
+ abs_start_time.tv_sec--;
+ abs_start_time.tv_nsec += 1000000000;
+ }
+ } else if (t_bit_act(t) == BLK_TN_MESSAGE) {
+ if (t_bit_pdu_len(t) > 0) {
+ char msg[t_bit_pdu_len(t)+1];
+
+ memcpy(msg, (char *)payload, t_bit_pdu_len(t));
+ msg[t_bit_pdu_len(t)] = '\0';
+
+ fprintf(ofp,
+ "%3d,%-3d %2d %8s %5d.%09lu %5u %2s %3s %s\n",
+ MAJOR(t_bit_dev(t)), MINOR(t_bit_dev(t)),
+ t_bit_cpu(t), "0", (int) SECONDS(t_bit_time(t)),
+ (unsigned long) NANO_SECONDS(t_bit_time(t)),
+ 0, "m", "N", msg);
+ }
}
- break;
-
- default:
- /* Ignore unknown notify events */
- ;
}
}
-static void dump_trace(struct blk_io_trace *t, struct per_cpu_info *pci,
+static void dump_trace(struct trace *t, struct per_cpu_info *pci,
struct per_dev_info *pdi)
{
if (text_output) {
- if (t->action == BLK_TN_MESSAGE)
- handle_notify(t);
- else if (t->action & BLK_TC_ACT(BLK_TC_PC))
- dump_trace_pc(t, pdi, pci);
- else
- dump_trace_fs(t, pdi, pci);
+ if (use_ext)
+ if (t->bit_ext->action == BLK_TN_MESSAGE_EXT)
+ handle_notify(t);
+ else if (t->bit_ext->action & BLK_TC_ACT_EXT(BLK_TC_PC))
+ dump_trace_pc(t, pdi, pci);
+ else
+ dump_trace_fs(t, pdi, pci);
+ else {
+ if (t->bit->action == BLK_TN_MESSAGE)
+ handle_notify(t);
+ else if (t->bit->action & BLK_TC_ACT(BLK_TC_PC))
+ dump_trace_pc(t, pdi, pci);
+ else
+ dump_trace_fs(t, pdi, pci);
+ }
dito
}
if (!pdi->events)
- pdi->first_reported_time = t->time;
+ pdi->first_reported_time = t_bit_time(t);
pdi->events++;
- if (bin_output_msgs ||
- !(t->action & BLK_TC_ACT(BLK_TC_NOTIFY) &&
- t->action == BLK_TN_MESSAGE))
- output_binary(t, sizeof(*t) + t->pdu_len);
+ if (use_ext) {
+ struct blk_io_trace_ext *bit_ext = t->bit_ext;
+
+ if (bin_output_msgs ||
+ !(bit_ext->action & BLK_TC_ACT_EXT(BLK_TC_NOTIFY) &&
+ bit_ext->action == BLK_TN_MESSAGE_EXT))
+ output_binary(bit_ext, sizeof(*bit_ext) + bit_ext->pdu_len);
+ } else {
+ struct blk_io_trace *bit = t->bit;
+
+ if (bin_output_msgs ||
+ !(bit->action & BLK_TC_ACT(BLK_TC_NOTIFY) &&
+ bit->action == BLK_TN_MESSAGE))
+ output_binary(bit, sizeof(*bit) + bit->pdu_len);
+ }
}
@@ -2111,41 +2237,57 @@ static void show_entries_rb(int force)
- if (!(bit->action == BLK_TN_MESSAGE) &&
- check_sequence(pdi, t, force))
- break;
+ if (use_ext) {
+ if (!(t->bit_ext->action == BLK_TN_MESSAGE_EXT) &&
+ check_sequence(pdi, t, force))
+ break;
+ } else {
+ if (!(t->bit->action == BLK_TN_MESSAGE) &&
+ check_sequence(pdi, t, force))
+ break;
+ }
- if (!(bit->action == BLK_TN_MESSAGE))
- pci->last_sequence = bit->sequence;
+ if (use_ext) {
+ if (!(t->bit_ext->action == BLK_TN_MESSAGE_EXT))
+ pci->last_sequence = t->bit_ext->sequence;
+ } else {
+ if (!(t->bit->action == BLK_TN_MESSAGE))
+ pci->last_sequence = t->bit->sequence;
+ }
- if (bit->action & (act_mask << BLK_TC_SHIFT))
- dump_trace(bit, pci, pdi);
+ if (use_ext) {
+ if (t->bit_ext->action &
+ (act_mask_ext << BLK_TC_SHIFT_EXT))
+ dump_trace(t, pci, pdi);
+ } else {
+ if (t->bit->action & (act_mask << BLK_TC_SHIFT))
+ dump_trace(t, pci, pdi);
+ }
put_trace(pdi, t);
}
@@ -2342,10 +2629,18 @@ static inline void ms_deq(struct ms_stream *msp)
msp->first = msp->first->next;
if (!msp->first) {
msp->last = NULL;
- if (!ms_prime(msp)) {
- ms_head = msp->next;
- msp->next = NULL;
- return;
+ if (use_ext) {
+ if (!ms_prime_ext(msp)) {
+ ms_head = msp->next;
+ msp->next = NULL;
+ return;
+ }
+ } else {
+ if (!ms_prime(msp)) {
+ ms_head = msp->next;
+ msp->next = NULL;
+ return;
+ }
}
}
@@ -2469,8 +2848,13 @@ static struct ms_stream *ms_alloc(struct per_dev_info *pdi, int cpu)
- if (ms_prime(msp))
- ms_sort(msp);
+ if (use_ext) {
+ if (ms_prime_ext(msp))
+ ms_sort(msp);
+ } else {
+ if (ms_prime(msp))
+ ms_sort(msp);
+ }
@@ -2605,8 +3020,13 @@ static int do_file(void)
- while (!is_done() && ms_head && handle(ms_head))
- ;
+ if (use_ext)
+ while (!is_done() && ms_head && handle_ext(ms_head))
+ ;
+ else {
+ while (!is_done() && ms_head && handle(ms_head))
+ ;
+ }
@@ -2618,19 +3038,23 @@ static void do_pipe(int fd)
read_sequence++;
-
#if 0
smallest_seq_read = -1U;
#endif
-
if (sort_entries(&youngest))
break;
-
if (youngest > stopwatch_end)
break;
-
show_entries_rb(0);
}
Why such seemingly unrelated whitespace changes?
-#define S_OPTS "a:A:b:D:d:f:F:hi:o:Oqstw:vVM"
+#define S_OPTS "a:A:b:D:d:f:F:hi:o:Oqstw:vVMPx:X:y:Y:E"
static char usage_str[] = "\n\n" \
- "-i <file> | --input=<file>\n" \
- "[ -a <action field> | --act-mask=<action field> ]\n" \
- "[ -A <action mask> | --set-mask=<action mask> ]\n" \
- "[ -b <traces> | --batch=<traces> ]\n" \
- "[ -d <file> | --dump-binary=<file> ]\n" \
- "[ -D <dir> | --input-directory=<dir> ]\n" \
- "[ -f <format> | --format=<format> ]\n" \
- "[ -F <spec> | --format-spec=<spec> ]\n" \
- "[ -h | --hash-by-name ]\n" \
- "[ -o <file> | --output=<file> ]\n" \
- "[ -O | --no-text-output ]\n" \
- "[ -q | --quiet ]\n" \
- "[ -s | --per-program-stats ]\n" \
- "[ -t | --track-ios ]\n" \
- "[ -w <time> | --stopwatch=<time> ]\n" \
- "[ -M | --no-msgs\n" \
- "[ -v | --verbose ]\n" \
- "[ -V | --version ]\n\n" \
+ "-i <file> | --input=<file>\n" \
+ "[ -a <action field> | --act-mask=<action field> ]\n" \
+ "[ -A <action mask> | --set-mask=<action mask> ]\n" \
+ "[ -b <traces> | --batch=<traces> ]\n" \
+ "[ -d <file> | --dump-binary=<file> ]\n" \
+ "[ -D <dir> | --input-directory=<dir> ]\n" \
+ "[ -f <format> | --format=<format> ]\n" \
+ "[ -F <spec> | --format-spec=<spec> ]\n" \
+ "[ -h | --hash-by-name ]\n" \
+ "[ -o <file> | --output=<file> ]\n" \
+ "[ -O | --no-text-output ]\n" \
+ "[ -q | --quiet ]\n" \
+ "[ -s | --per-program-stats ]\n" \
+ "[ -t | --track-ios ]\n" \
+ "[ -w <time> | --stopwatch=<time> ]\n" \
+ "[ -M | --no-msgs\n" \
+ "[ -E | --use-extensions ]\n" \
+ "[ -y <action field ext> | --act-mask-ext=<action field ext>]\n" \
+ "[ -Y <action mask ext> | --set-mask=<action mask>]\n" \
+ "[ -P | --track-priority ]\n" \
+ "[ -x <ioprio field> | --prio-mask=<ioprio field> ]\n" \
+ "[ -X <ioprio mask> | --set-mask=<ioprio mask> ]\n" \
+ "[ -v | --verbose ]\n" \
+ "[ -V | --version ]\n\n" \
"\t-a Only trace specified actions. See documentation\n" \
"\t-A Give trace mask as a single value. See documentation\n" \
"\t-b stdin read batching\n" \
@@ -2769,6 +3199,12 @@ static char usage_str[] = "\n\n" \
"\t-w Only parse data between the given time interval in seconds.\n" \
"\t If 'start' isn't given, blkparse defaults the start time to 0\n" \
"\t-M Do not output messages to binary file\n" \
+ "\t-E Use Blocktrace Extensions\n" \
+ "\t-P Enable tracking priorites.\n" \
+ "\t-y Only trace specified actions ext.\n" \
+ "\t-Y Give trace mask as a single value ext.\n" \
+ "\t-x Only priority specified actions.\n" \
+ "\t-X Give priority mask as a single value.\n" \
"\t-v More verbose for marginal errors\n" \
"\t-V Print program version info\n\n";
@@ -2882,8 +3365,27 @@ int main(int argc, char *argv[])
- if (act_mask_tmp != 0)
- act_mask = act_mask_tmp;
+ if (use_ext) {
+ if (act_mask_tmp) {
+ fprintf(stderr, "please use y or Y with -E\n");
+ return 1;
+ }
+ if (act_mask_tmp_ext != 0)
+ act_mask_ext = act_mask_tmp_ext;
+
+ /*
+ * When track-priority is on and user didn't specify prio_mask then
+ * trace all the classes.
+ */
+ if (blkparse_track_prio && !blkparse_prio_mask)
+ blkparse_prio_mask = TRACE_ALL_IOPRIO;
+ } else {
+ if (act_mask_tmp != 0)
+ act_mask = act_mask_tmp;
+ }
diff --git a/blkparse_fmt.c b/blkparse_fmt.c
index c42e6d7..15c5f44 100644
--- a/blkparse_fmt.c
+++ b/blkparse_fmt.c
+static inline void fill_rwbs_ext(char *rwbs, struct blk_io_trace_ext *t)
+{
+ uint64_t w = t->action & BLK_TC_ACT_EXT(BLK_TC_WRITE);
+ uint64_t a = t->action & BLK_TC_ACT_EXT(BLK_TC_AHEAD);
+ uint64_t s = t->action & BLK_TC_ACT_EXT(BLK_TC_SYNC);
+ uint64_t m = t->action & BLK_TC_ACT_EXT(BLK_TC_META);
+ uint64_t d = t->action & BLK_TC_ACT_EXT(BLK_TC_DISCARD);
+ uint64_t f = t->action & BLK_TC_ACT_EXT(BLK_TC_FLUSH);
+ uint64_t u = t->action & BLK_TC_ACT_EXT(BLK_TC_FUA);
+ uint64_t z = t->action & BLK_TC_ACT_EXT(BLK_TC_WRITE_ZEROES);
+ uint64_t r = t->action & BLK_TC_ACT_EXT(BLK_TC_ZONE_RESET);
+ int i = 0;
+
+ if (f)
+ rwbs[i++] = 'F'; /* flush */
+
+ if (d)
+ rwbs[i++] = 'D';
+ else if (z) {
+ rwbs[i++] = 'W'; /* write-zeroes */
+ rwbs[i++] = 'Z';
+ } else if (r) {
+ rwbs[i++] = 'Z'; /* zone-reset */
+ rwbs[i++] = 'R';
+ } else if (w)
+ rwbs[i++] = 'W';
+ else if (t->bytes)
+ rwbs[i++] = 'R';
+ else
+ rwbs[i++] = 'N';
+
+ if (u)
+ rwbs[i++] = 'F'; /* fua */
+ if (a)
+ rwbs[i++] = 'A';
+ if (s)
+ rwbs[i++] = 'S';
+ if (m)
+ rwbs[i++] = 'M';
+
+ rwbs[i] = '\0';
+}
+
+static inline bool t_zone_reset(struct trace *t)
+{
+ if (!t->use_ext)
+ return false;
+
+ return t_bit_act(t) & BLK_TC_ACT_EXT(BLK_TC_ZONE_RESET) ? true : false;
+}
nice
static void process_default(char *act, struct per_cpu_info *pci,
- struct blk_io_trace *t, unsigned long long elapsed,
+ struct trace *t, unsigned long long elapsed,
int pdu_len, unsigned char *pdu_buf)
{
+ if (t->use_ext && blkparse_track_prio) {
+ switch (IOPRIO_PRIO_CLASS(t->bit_ext->ioprio)) {
+ case IOPRIO_CLASS_NONE:
+ if (blkparse_prio_mask & 0x1)
+ fprintf(ofp, "N ");
+ break;
+ case IOPRIO_CLASS_RT:
+ if (blkparse_prio_mask & 0x2)
+ fprintf(ofp, "R ");
+ break;
+ case IOPRIO_CLASS_BE:
+ if (blkparse_prio_mask & 0x4)
+ fprintf(ofp, "B ");
+ break;
+ case IOPRIO_CLASS_IDLE:
+ if (blkparse_prio_mask & 0x8)
+ fprintf(ofp, "I ");
+ break;
+ default:
+ fprintf(ofp, "E ");
+ }
+ }
--
Mit freundlichen Gruessen / Kind regards
Steffen Maier
Linux on IBM Z Development
https://www.ibm.com/privacy/us/en/
IBM Deutschland Research & Development GmbH
Vorsitzender des Aufsichtsrats: Matthias Hartmann
Geschaeftsfuehrung: Dirk Wittkopp
Sitz der Gesellschaft: Boeblingen
Registergericht: Amtsgericht Stuttgart, HRB 243294