Re: [PATCH bpf-next v2 05/20] bpf: implement bpf_seq_read() for bpf iterator

Andrii Nakryiko <andrii.nakryiko@xxxxxxxxx> · Tue, 5 May 2020 12:56:02 -0700

On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@xxxxxx> wrote:
>
> bpf iterator uses seq_file to provide a lossless
> way to transfer data to user space. But we want to call
> bpf program after all objects have been traversed, and
> bpf program may write additional data to the
> seq_file buffer. The current seq_read() does not work
> for this use case.
>
> Besides allowing stop() function to write to the buffer,
> the bpf_seq_read() also fixed the buffer size to one page.
> If any single call of show() or stop() will emit data
> more than one page to cause overflow, -E2BIG error code
> will be returned to user space.
>
> Signed-off-by: Yonghong Song <yhs@xxxxxx>
> ---
>  kernel/bpf/bpf_iter.c | 128 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 128 insertions(+)
>
> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
> index 05ae04ac1eca..2674c9cbc3dc 100644
> --- a/kernel/bpf/bpf_iter.c
> +++ b/kernel/bpf/bpf_iter.c
> @@ -26,6 +26,134 @@ static DEFINE_MUTEX(targets_mutex);
>  /* protect bpf_iter_link changes */
>  static DEFINE_MUTEX(link_mutex);
>
> +/* bpf_seq_read, a customized and simpler version for bpf iterator.
> + * no_llseek is assumed for this file.
> + * The following are differences from seq_read():
> + *  . fixed buffer size (PAGE_SIZE)
> + *  . assuming no_llseek
> + *  . stop() may call bpf program, handling potential overflow there
> + */
> +static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
> +                           loff_t *ppos)
> +{
> +       struct seq_file *seq = file->private_data;
> +       size_t n, offs, copied = 0;
> +       int err = 0;
> +       void *p;
> +
> +       mutex_lock(&seq->lock);
> +
> +       if (!seq->buf) {
> +               seq->size = PAGE_SIZE;
> +               seq->buf = kmalloc(seq->size, GFP_KERNEL);
> +               if (!seq->buf)
> +                       goto Enomem;

Why not just mutex_unlock and exit with -ENOMEM? Less goto'ing, more
straightforward.

> +       }
> +
> +       if (seq->count) {
> +               n = min(seq->count, size);
> +               err = copy_to_user(buf, seq->buf + seq->from, n);
> +               if (err)
> +                       goto Efault;
> +               seq->count -= n;
> +               seq->from += n;
> +               copied = n;
> +               goto Done;
> +       }
> +
> +       seq->from = 0;
> +       p = seq->op->start(seq, &seq->index);
> +       if (!p || IS_ERR(p))

IS_ERR_OR_NULL?

> +               goto Stop;
> +
> +       err = seq->op->show(seq, p);
> +       if (seq_has_overflowed(seq)) {
> +               err = -E2BIG;
> +               goto Error_show;
> +       } else if (err) {
> +               /* < 0: go out, > 0: skip */
> +               if (likely(err < 0))
> +                       goto Error_show;
> +               seq->count = 0;
> +       }

This seems a bit more straightforward:

if (seq_has_overflowed(seq))
    err = -E2BIG;
if (err < 0)
    goto Error_show;
else if (err > 0)
    seq->count = 0;

Also, I wonder if err > 0 (so skip was requested), should we ignore
overflow? So something like:

if (err > 0) {
    seq->count = 0;
} else {
    if (seq_has_overflowed(seq))
        err = -E2BIG;
    if (err)
        goto Error_show;
}

> +
> +       while (1) {
> +               loff_t pos = seq->index;
> +
> +               offs = seq->count;
> +               p = seq->op->next(seq, p, &seq->index);
> +               if (pos == seq->index) {
> +                       pr_info_ratelimited("buggy seq_file .next function %ps "
> +                               "did not updated position index\n",
> +                               seq->op->next);
> +                       seq->index++;
> +               }
> +
> +               if (!p || IS_ERR(p)) {

Same, IS_ERR_OR_NULL.

> +                       err = PTR_ERR(p);
> +                       break;
> +               }
> +               if (seq->count >= size)
> +                       break;
> +
> +               err = seq->op->show(seq, p);
> +               if (seq_has_overflowed(seq)) {
> +                       if (offs == 0) {
> +                               err = -E2BIG;
> +                               goto Error_show;
> +                       }
> +                       seq->count = offs;
> +                       break;
> +               } else if (err) {
> +                       /* < 0: go out, > 0: skip */
> +                       seq->count = offs;
> +                       if (likely(err < 0)) {
> +                               if (offs == 0)
> +                                       goto Error_show;
> +                               break;
> +                       }
> +               }

Same question here about ignoring overflow if skip was requested.

> +       }
> +Stop:
> +       offs = seq->count;
> +       /* may call bpf program */
> +       seq->op->stop(seq, p);
> +       if (seq_has_overflowed(seq)) {
> +               if (offs == 0)
> +                       goto Error_stop;
> +               seq->count = offs;

just want to double-check, because it's not clear from the code. If
all the start()/show()/next() succeeded, but stop() overflown. Would
stop() be called again on subsequent read? Would start/show/next
handle this correctly as well?

> +       }
> +
> +       n = min(seq->count, size);
> +       err = copy_to_user(buf, seq->buf, n);
> +       if (err)
> +               goto Efault;
> +       copied = n;
> +       seq->count -= n;
> +       seq->from = n;
> +Done:
> +       if (!copied)
> +               copied = err;
> +       else
> +               *ppos += copied;
> +       mutex_unlock(&seq->lock);
> +       return copied;
> +
> +Error_show:
> +       seq->op->stop(seq, p);
> +Error_stop:
> +       seq->count = 0;
> +       goto Done;
> +
> +Enomem:
> +       err = -ENOMEM;
> +       goto Done;
> +
> +Efault:
> +       err = -EFAULT;
> +       goto Done;

Enomem and Efault seem completely redundant and just add goto
complexity to this algorithm. Let's just inline `err =
-E(NOMEM|FAULT); goto Done;` instead?

> +}
> +
>  int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
>  {
>         struct bpf_iter_target_info *tinfo;
> --
> 2.24.1
>