Re: [PATCH v2] drm/print: Introduce drm_line_printer

Michal Wajdeczko <michal.wajdeczko@xxxxxxxxx> · Thu, 30 May 2024 23:27:24 +0200

On 30.05.2024 20:47, John Harrison wrote:
> On 5/30/2024 00:49, Jani Nikula wrote:
>> On Wed, 29 May 2024, John Harrison <john.c.harrison@xxxxxxxxx> wrote:
>>> On 5/28/2024 06:06, Michal Wajdeczko wrote:
>>>> This drm printer wrapper can be used to increase the robustness of
>>>> the captured output generated by any other drm_printer to make sure
>>>> we didn't lost any intermediate lines of the output by adding line
>>>> numbers to each output line. Helpful for capturing some crash data.
>>>>
>>>> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@xxxxxxxxx>
>>>> Cc: Jani Nikula <jani.nikula@xxxxxxxxx>
>>>> Cc: John Harrison <John.C.Harrison@xxxxxxxxx>
>>>> ---
>>>> v2: don't abuse prefix, use union instead (Jani)
>>>>       don't use 'dp' as name, prefer 'p' (Jani)
>>>>       add support for unique series identifier (John)
>>>> ---
>>>>    drivers/gpu/drm/drm_print.c | 14 ++++++++
>>>>    include/drm/drm_print.h     | 68
>>>> ++++++++++++++++++++++++++++++++++++-
>>>>    2 files changed, 81 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/drm_print.c b/drivers/gpu/drm/drm_print.c
>>>> index cf2efb44722c..be9cbebff5b3 100644
>>>> --- a/drivers/gpu/drm/drm_print.c
>>>> +++ b/drivers/gpu/drm/drm_print.c
>>>> @@ -214,6 +214,20 @@ void __drm_printfn_err(struct drm_printer *p,
>>>> struct va_format *vaf)
>>>>    }
>>>>    EXPORT_SYMBOL(__drm_printfn_err);
>>>>    +void __drm_printfn_line(struct drm_printer *p, struct va_format
>>>> *vaf)
>>>> +{
>>>> +    unsigned int counter = ++p->line.counter;
>>> Wrong units, but see below anyway...
>>>
>>>> +    const char *prefix = p->prefix ?: "";
>>>> +    const char *pad = p->prefix ? " " : "";
>>>> +
>>>> +    if (p->line.series)
>>>> +        drm_printf(p->arg, "%s%s%u.%u: %pV",
>>>> +               prefix, pad, p->line.series, counter, vaf);
>>>> +    else
>>>> +        drm_printf(p->arg, "%s%s%u: %pV", prefix, pad, counter, vaf);
>>>> +}
>>>> +EXPORT_SYMBOL(__drm_printfn_line);
>>>> +
>>>>    /**
>>>>     * drm_puts - print a const string to a &drm_printer stream
>>>>     * @p: the &drm printer
>>>> diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
>>>> index 089950ad8681..f4d9b98d7909 100644
>>>> --- a/include/drm/drm_print.h
>>>> +++ b/include/drm/drm_print.h
>>>> @@ -176,7 +176,13 @@ struct drm_printer {
>>>>        void (*puts)(struct drm_printer *p, const char *str);
>>>>        void *arg;
>>>>        const char *prefix;
>>>> -    enum drm_debug_category category;
>>>> +    union {
>>>> +        enum drm_debug_category category;
>>>> +        struct {
>>>> +            unsigned short series;
>>>> +            unsigned short counter;
>>> Any particular reason for using 'short' rather than 'int'? Short is only
>>> 16bits right? That might seem huge but a GuC log buffer with 16MB debug
>>> log (and minimum sizes for the other sections) when dumped via the
>>> original debugfs hexdump mechanism is 1,245,444 lines. That line count
>>> goes down a lot when you start using longer lines for the dump, but it
>>> is still in the tens of thousands of lines.  So limiting to 16 bits is
>>> an overflow just waiting to happen.
>>>
>>>> +        } line;
>>>> +    };
>>>>    };
>>>>       void __drm_printfn_coredump(struct drm_printer *p, struct
>>>> va_format *vaf);
>>>> @@ -186,6 +192,7 @@ void __drm_puts_seq_file(struct drm_printer *p,
>>>> const char *str);
>>>>    void __drm_printfn_info(struct drm_printer *p, struct va_format
>>>> *vaf);
>>>>    void __drm_printfn_dbg(struct drm_printer *p, struct va_format
>>>> *vaf);
>>>>    void __drm_printfn_err(struct drm_printer *p, struct va_format
>>>> *vaf);
>>>> +void __drm_printfn_line(struct drm_printer *p, struct va_format *vaf);
>>>>       __printf(2, 3)
>>>>    void drm_printf(struct drm_printer *p, const char *f, ...);
>>>> @@ -357,6 +364,65 @@ static inline struct drm_printer
>>>> drm_err_printer(struct drm_device *drm,
>>>>        return p;
>>>>    }
>>>>    +/**
>>>> + * drm_line_printer - construct a &drm_printer that prefixes
>>>> outputs with line numbers
>>>> + * @p: the &struct drm_printer which actually generates the output
>>>> + * @prefix: optional output prefix, or NULL for no prefix
>>>> + * @series: optional unique series identifier, or 0 to omit
>>>> identifier in the output
>>>> + *
>>>> + * This printer can be used to increase the robustness of the
>>>> captured output
>>>> + * to make sure we didn't lost any intermediate lines of the
>>>> output. Helpful
>>>> + * while capturing some crash data.
>>>> + *
>>>> + * Example 1::
>>>> + *
>>>> + *    void crash_dump(struct drm_device *drm)
>>>> + *    {
>>>> + *        static unsigned short id;
>>>> + *        struct drm_printer p = drm_err_printer(drm, "crash");
>>>> + *        struct drm_printer lp = drm_line_printer(&p, "dump", ++id);
>>> Is there any benefit or other reason to split the prefix across two
>>> separate printers? It seems like a source of confusion. As in, the code
>>> will allow a double prefix, there is not much you can do about that
>>> because losing the prefix from drm_line_printer would mean no prefix at
>>> all when not using drm_err_printer underneath. But why explicitly split
>>> the message across both printers in the usage example? This is saying
>>> that this is the recommended way to use the interface, but with no
>>> explanation of why the split is required or how the split should be
>>> done.
>> You could have a printer, and then add two separate line counted blocks.
>>
>>     struct drm_printer p = drm_err_printer(drm, "parent");
>>     struct drm_printer lp1 = drm_line_printer(&p, "child 1", 0);
>>
>>     ...
>>
>>     struct drm_printer lp2 = drm_line_printer(&p, "child 2", 0);
>>
>>     ...
>>
>> p could be defined elsewhere and passed into separate functions which
>> each have the line printing. The two prefixes can be useful.
> Except you can't have a multi-level prefix if using the info level
> printer as that does not take a prefix. And I'm really not seeing a

but it's up to you which printer you choose as 'primary' printer that
will render final output

> reason why you would want the line count to restart in the middle of a
> single atomic dump operation.
> 
>>
>>> Also, there is really no specific connection to crashes. The purpose of
>>> this is for allowing the dumping of multi-line blocks of data. One use
>>> is for debugging crashes. But there are many debug operations that
>>> require the same dump and do not involve a crash. And this support is
>>> certainly not intended to be used on general end-user GPU hangs. For
>>> those, everything should be part of the devcoredump core file that is
>>> produced and accessible via sysfs. We absolutely should not be dumping
>>> huge data blocks in customer release drivers except in very extreme
>>> circumstances.
>> A devcoredump implementation could use a drm_printer too?
> You mean for reading the devcoredump file from sysfs? Except that the
> whole reason this was started was because Michal absolutely refuses to
> allow line counted output in a sysfs/debugfs file because "it is
> unnecessary and breaks the decoding tools".
> 
>>
>> Is this only about bikeshedding the example? I'm not sure I get the
>> point here.
> I would call it getting accurate and understandable documentation.
> 
> The existing example is splitting what should be an atomic name "crash
> dump" across two separate line printer objects. That is something that
> is so unrealistic that it implies there is a required reason to break
> the prefix up. Documentation that is ambiguous and confusing is
> potentially worse than no documentation at all.
> 

all below setups will render the same output.
which one, in your opinion, shows all potential of the drm_line_printer?

	struct drm_printer p = drm_err_printer(drm, "AAA BBB");
	struct drm_printer lp = drm_line_printer(&p, NULL, id);

	struct drm_printer p = drm_err_printer(drm, NULL);
	struct drm_printer lp = drm_line_printer(&p, "AAA BBB", id);

	struct drm_printer p = drm_err_printer(drm, "AAA");
	struct drm_printer lp = drm_line_printer(&p, "BBB", ++id);

> 
>>
>>>> + *
>>>> + *        drm_printf(&lp, "foo");
>>>> + *        drm_printf(&lp, "bar");
>>>> + *    }
>>>> + *
>>>> + * Above code will print into the dmesg something like::
>>>> + *
>>>> + *    [ ] 0000:00:00.0: [drm] *ERROR* crash dump 1.1: foo
>>>> + *    [ ] 0000:00:00.0: [drm] *ERROR* crash dump 1.2: bar
>>>> + *
>>>> + * Example 2::
>>>> + *
>>>> + *    void line_dump(struct device *dev)
>>>> + *    {
>>>> + *        struct drm_printer p = drm_info_printer(dev);
>>>> + *        struct drm_printer lp = drm_line_printer(&p, NULL, 0);
>>>> + *
>>>> + *        drm_printf(&lp, "foo");
>>>> + *        drm_printf(&lp, "bar");
>>>> + *    }
>>>> + *
>>>> + * Above code will print::
>>>> + *
>>>> + *    [ ] 0000:00:00.0: [drm] 1: foo
>>>> + *    [ ] 0000:00:00.0: [drm] 2: bar
>>> Not really seeing the point of having two examples listed. The first
>>> includes all the optional extras, the second is just repeating with no
>>> new information.
>> You see how the "series" param behaves?
> The second example doesn't have a series parameter. If the only purpose
> is to say "the print of the series value is suppressed if zero" then why
> not just have that one line? Documentation should also be concise.

it was already documented in that way:

@series: optional unique series identifier, or 0 to omit identifier in
the output

> 
> John.
> 
>>
>> BR,
>> Jani.
>>
>>> John.
>>>
>>>> + *
>>>> + * RETURNS:
>>>> + * The &drm_printer object
>>>> + */
>>>> +static inline struct drm_printer drm_line_printer(struct
>>>> drm_printer *p,
>>>> +                          const char *prefix,
>>>> +                          unsigned short series)
>>>> +{
>>>> +    struct drm_printer lp = {
>>>> +        .printfn = __drm_printfn_line,
>>>> +        .arg = p,
>>>> +        .prefix = prefix,
>>>> +        .line = { .series = series, },
>>>> +    };
>>>> +    return lp;
>>>> +}
>>>> +
>>>>    /*
>>>>     * struct device based logging
>>>>     *
>