Add a capture output size check function to provide a reasonable minimum
size for the error capture region before allocating the shared buffer.

Signed-off-by: Zhanjun Dong <zhanjun.dong@xxxxxxxxx>
---
 drivers/gpu/drm/xe/xe_guc_capture.c | 76 +++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_guc_capture.c b/drivers/gpu/drm/xe/xe_guc_capture.c
index dde3a269d114..f4153dc4ab86 100644
--- a/drivers/gpu/drm/xe/xe_guc_capture.c
+++ b/drivers/gpu/drm/xe/xe_guc_capture.c
@@ -559,6 +559,81 @@ xe_guc_capture_getnullheader(struct xe_guc *guc, void **outptr, size_t *size)
 	return 0;
 }
 
+static int
+guc_capture_output_min_size_est(struct xe_guc *guc)
+{
+	struct xe_gt *gt = guc_to_gt(guc);
+	struct xe_hw_engine *hwe;
+	enum xe_hw_engine_id id;
+
+	int worst_min_size = 0;
+	size_t tmp = 0;
+
+	if (!guc->capture)
+		return -ENODEV;
+
+	/*
+	 * If every single engine instance suffered a failure in quick succession, but
+	 * the failures were all unrelated, a burst of multiple error-capture events
+	 * would dump registers for every engine instance, one at a time. In this case,
+	 * GuC would even dump the global registers repeatedly.
+	 *
+	 * For each engine instance, there would be 1 x guc_state_capture_group_t output
+	 * followed by 3 x guc_state_capture_t lists. The latter is how the register
+	 * dumps are split across different register types (where the '3' are global vs
+	 * class vs instance).
+	 */
+	for_each_hw_engine(hwe, gt, id) {
+		worst_min_size += sizeof(struct guc_state_capture_group_header_t) +
+				  (3 * sizeof(struct guc_state_capture_header_t));
+
+		if (!guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_GLOBAL, 0, &tmp, true))
+			worst_min_size += tmp;
+
+		if (!guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_ENGINE_CLASS,
+					     hwe->class, &tmp, true)) {
+			worst_min_size += tmp;
+		}
+		if (!guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE,
+					     hwe->class, &tmp, true)) {
+			worst_min_size += tmp;
+		}
+	}
+
+	return worst_min_size;
+}
+
+/*
+ * Add on a 3x multiplier to allow for multiple back-to-back captures occurring
+ * before the driver can read the data out and process it
+ */
+#define GUC_CAPTURE_OVERBUFFER_MULTIPLIER 3
+
+static void check_guc_capture_size(struct xe_guc *guc)
+{
+	int min_size = guc_capture_output_min_size_est(guc);
+	int spare_size = min_size * GUC_CAPTURE_OVERBUFFER_MULTIPLIER;
+	u32 buffer_size = xe_guc_log_section_size_capture(&guc->log);
+
+	/*
+	 * NOTE: min_size is much smaller than the capture region allocation (DG2: <80K vs 1MB).
+	 * Additionally, it's based on the space needed to fit all engines getting reset at once
+	 * within the same G2H handler task slot. This is very unlikely. However, if GuC really
+	 * does run out of space for whatever reason, we will see a separate warning message
+	 * when processing the G2H event capture-notification, search for:
+	 * xe_guc_STATE_CAPTURE_EVENT_STATUS_NOSPACE.
+	 */
+	if (min_size < 0)
+		xe_gt_warn(guc_to_gt(guc), "Failed to calculate error state capture buffer minimum size: %d!\n",
+			   min_size);
+	else if (min_size > buffer_size)
+		xe_gt_warn(guc_to_gt(guc), "Error state capture buffer may be too small: %d < %d\n",
+			   buffer_size, min_size);
+	else if (spare_size > buffer_size)
+		xe_gt_dbg(guc_to_gt(guc), "Error state capture buffer lacks spare size: %d < %d (min = %d)\n",
+			   buffer_size, spare_size, min_size);
+}
+
 int xe_guc_capture_init(struct xe_guc *guc)
 {
 	guc->capture = kzalloc(sizeof(*guc->capture), GFP_KERNEL);
@@ -570,6 +645,7 @@ int xe_guc_capture_init(struct xe_guc *guc)
 
 	INIT_LIST_HEAD(&guc->capture->outlist);
 	INIT_LIST_HEAD(&guc->capture->cachelist);
 
+	check_guc_capture_size(guc);
 	return 0;
 }
-- 
2.34.1
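
Not part of the patch, just an illustration for review: a minimal,
self-contained userspace sketch of the same sizing arithmetic, handy for
sanity-checking the worst-case estimate by hand. The engine count, header
sizes, register-list sizes and the 1 MiB region below are made-up
placeholders rather than real GuC ABI values; only the shape of the
calculation mirrors guc_capture_output_min_size_est() and
check_guc_capture_size().

#include <stdio.h>

/* Hypothetical stand-ins for the GuC capture header sizes, in bytes. */
#define GROUP_HEADER_SIZE	16
#define CAPTURE_HEADER_SIZE	16

/* Same idea as GUC_CAPTURE_OVERBUFFER_MULTIPLIER in the patch. */
#define OVERBUFFER_MULTIPLIER	3

int main(void)
{
	/* Assumed per-type register-list sizes in bytes (placeholders). */
	unsigned int global_list = 2048, class_list = 1024, instance_list = 512;
	unsigned int num_engines = 8;			/* assumed engine count */
	unsigned int buffer_size = 1024 * 1024;		/* assumed 1 MiB capture region */
	unsigned int min_size = 0, spare_size;
	unsigned int i;

	/*
	 * Worst case: every engine resets back-to-back, so each engine
	 * contributes one group header, three capture headers and all
	 * three register lists (the global list is repeated per engine).
	 */
	for (i = 0; i < num_engines; i++)
		min_size += GROUP_HEADER_SIZE + 3 * CAPTURE_HEADER_SIZE +
			    global_list + class_list + instance_list;

	spare_size = min_size * OVERBUFFER_MULTIPLIER;

	if (min_size > buffer_size)
		printf("buffer may be too small: %u < %u\n", buffer_size, min_size);
	else if (spare_size > buffer_size)
		printf("buffer lacks spare size: %u < %u (min = %u)\n",
		       buffer_size, spare_size, min_size);
	else
		printf("buffer ok: %u >= %u (min = %u)\n",
		       buffer_size, spare_size, min_size);

	return 0;
}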