Re: [PATCH bpf-next v3] selftests/bpf: emit top frequent code lines in veristat

Andrii Nakryiko <andrii.nakryiko@xxxxxxxxx> · Tue, 1 Oct 2024 10:38:53 -0700

On Mon, Sep 30, 2024 at 4:15 PM Mykyta Yatsenko
<mykyta.yatsenko5@xxxxxxxxx> wrote:
>
> From: Mykyta Yatsenko <yatsenko@xxxxxxxx>
>
> Production BPF programs are increasing in number of instructions and states
> to the point, where optimising verification process for them is necessary
> to avoid running into instruction limit. Authors of those BPF programs
> need to analyze verifier output, for example, collecting the most
> frequent source code lines to understand which part of the program has
> the biggest verification cost.
>
> This patch introduces `--top-src-lines` flag in veristat.
> `--top-src-lines=N` makes veristat output N the most popular sorce code
> lines, parsed from verification log.
>
> An example of output:
> ```
> sudo ./veristat  --top-src-lines=2   bpf_flow.bpf.o
> Processing 'bpf_flow.bpf.o'...
> Top source lines (_dissect):
>     4: (bpf_helpers.h:161)      asm volatile("r1 = %[ctx]\n\t"
>     4: (bpf_flow.c:155) if (iph && iph->ihl == 5 &&
> ...
> ```
>
> Signed-off-by: Mykyta Yatsenko <yatsenko@xxxxxxxx>
> ---
>  tools/testing/selftests/bpf/veristat.c | 127 ++++++++++++++++++++++++-
>  1 file changed, 125 insertions(+), 2 deletions(-)
>

LGTM, I played with it locally, works nicely, thanks!

I did a few tweaks before applying, see below for details.

> diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c
> index 1ec5c4c47235..977c5eca56f7 100644
> --- a/tools/testing/selftests/bpf/veristat.c
> +++ b/tools/testing/selftests/bpf/veristat.c
> @@ -179,6 +179,7 @@ static struct env {
>         int files_skipped;
>         int progs_processed;
>         int progs_skipped;
> +       int top_src_lines;
>  } env;
>
>  static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
> @@ -206,6 +207,7 @@ const char argp_program_doc[] =
>  enum {
>         OPT_LOG_FIXED = 1000,
>         OPT_LOG_SIZE = 1001,
> +       OPT_TOP_SRC_LINES = 1002,
>  };
>
>  static const struct argp_option opts[] = {
> @@ -228,6 +230,7 @@ static const struct argp_option opts[] = {
>           "Force frequent BPF verifier state checkpointing (set BPF_F_TEST_STATE_FREQ program flag)" },
>         { "test-reg-invariants", 'r', NULL, 0,
>           "Force BPF verifier failure on register invariant violation (BPF_F_TEST_REG_INVARIANTS program flag)" },
> +       { "top-src-lines", OPT_TOP_SRC_LINES, "N", 0, "Emit N most frequent source code lines" },

I added 'S' as a short name, I found --top-src-lines too much pain to type :)

>         {},
>  };
>
> @@ -337,6 +340,14 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
>                         return -ENOMEM;
>                 env.filename_cnt++;
>                 break;
> +       case OPT_TOP_SRC_LINES:
> +               errno = 0;
> +               env.top_src_lines = strtol(arg, NULL, 10);
> +               if (errno) {
> +                       fprintf(stderr, "invalid top lines N specifier: %s\n", arg);
> +                       argp_usage(state);
> +               }
> +               break;
>         default:
>                 return ARGP_ERR_UNKNOWN;
>         }
> @@ -854,6 +865,115 @@ static int parse_verif_log(char * const buf, size_t buf_sz, struct verif_stats *
>         return 0;
>  }
>
> +struct line_cnt {
> +       char *line;
> +       int cnt;
> +};
> +
> +static int str_cmp(const void *a, const void *b)
> +{
> +       const char **str1 = (const char **)a;
> +       const char **str2 = (const char **)b;
> +
> +       return strcmp(*str1, *str2);
> +}
> +
> +static int line_cnt_cmp(const void *a, const void *b)
> +{
> +       const struct line_cnt *a_cnt = (const struct line_cnt *)a;
> +       const struct line_cnt *b_cnt = (const struct line_cnt *)b;
> +
> +       return b_cnt->cnt - a_cnt->cnt;

for ordering stability, I added a fallback to line_cnt->line string
comparison, if counts are equal

> +}
> +

[...]

> +       printf("Top source lines (%s):\n", prog_name);
> +       for (i = 0; i < min(unique_lines, env.top_src_lines); ++i) {
> +               const char *src_code = freq[i].line;
> +               const char *src_line = NULL;
> +               char *split = strrchr(freq[i].line, '@');
> +
> +               if (split) {
> +                       src_line = split + 1;
> +
> +                       while (*src_line && isspace(*src_line))
> +                               src_line++;
> +
> +                       while (split > src_code && isspace(*split))
> +                               split--;
> +                       *split = '\0';
> +               }
> +
> +               if (src_line)
> +                       printf("%5d: (%s)\t%s\n", freq[i].cnt, src_line, src_code);
> +               else
> +                       printf("%5d: %s\n", freq[i].cnt, src_code);
> +       }
> +

added printf("\n") to visually separate the output a bit

> +cleanup:
> +       free(freq);
> +       free(lines);
> +       return err;
> +}
> +
>  static int guess_prog_type_by_ctx_name(const char *ctx_name,
>                                        enum bpf_prog_type *prog_type,
>                                        enum bpf_attach_type *attach_type)
> @@ -1009,13 +1129,14 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf
>         stats = &env.prog_stats[env.prog_stat_cnt++];
>         memset(stats, 0, sizeof(*stats));
>
> -       if (env.verbose) {
> +       if (env.verbose || env.top_src_lines > 0) {
>                 buf_sz = env.log_size ? env.log_size : 16 * 1024 * 1024;
>                 buf = malloc(buf_sz);
>                 if (!buf)
>                         return -ENOMEM;
>                 /* ensure we always request stats */
> -               log_level = env.log_level | 4 | (env.log_fixed ? 8 : 0);
> +               log_level = (env.top_src_lines > 0 && env.log_level == 0 ? 2 : env.log_level)
> +                       | 4 | (env.log_fixed ? 8 : 0);

this felt a bit too crowded, so I switched this to one extra if, and
left original log_level assignment untouched

>         } else {
>                 buf = verif_log_buf;
>                 buf_sz = sizeof(verif_log_buf);
> @@ -1048,6 +1169,8 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf
>                        filename, prog_name, stats->stats[DURATION],
>                        err ? "failure" : "success", buf);
>         }
> +       if (env.top_src_lines > 0)
> +               print_top_src_lines(buf, buf_sz, stats->prog_name);
>
>         if (verif_log_buf != buf)
>                 free(buf);
> --
> 2.46.2
>