Re: [RFC PATCH 37/97] drm/i915/guc: Add stall timer to non blocking CTB send function

Michal Wajdeczko <michal.wajdeczko@xxxxxxxxx> · Tue, 25 May 2021 16:15:15 +0200

On 24.05.2021 20:35, Matthew Brost wrote:
> On Mon, May 24, 2021 at 02:58:12PM +0200, Michal Wajdeczko wrote:
>>
>>
>> On 06.05.2021 21:13, Matthew Brost wrote:
>>> Implement a stall timer which fails H2G CTBs once a period of time
>>> with no forward progress is reached to prevent deadlock.
>>>
>>> Also update to ct_write to return -EDEADLK rather than -EPIPE on a
>>> corrupted descriptor.
>>
>> broken descriptor is really separate issue compared to no progress from
>> GuC side, I would really like to keep old error code
>>
> 
> I know you do as you have brought it up several times. Again to the rest
> of the stack these two things mean the exact same thing.

but I guess 'the rest of the stack' is only interested if returned error
is EBUSY or not, as all other errors are treated in the same way, thus
no need change existing error codes

>  
>> note that broken CTB descriptor is unrecoverable error, while on other
>> hand, in theory, we could recover from temporary non-moving CTB
>>
> 
> Yea but we don't, in both cases we disable submission which in turn
> triggers a full GPU reset.

is this current limitation or long term design?

I would rather expect that decision to trigger full GPU is done on solid
foundations, like definite lost communication with the GuC or missed
heartbeats, not that we temporarily push CTB to the limit

or if do we want to treat CTB processing as kind of hw health checkout
too, what if heartbeat timeout and CTB stall time do not match ?

> 
>>>
>>> Signed-off-by: John Harrison <John.C.Harrison@xxxxxxxxx>
>>> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@xxxxxxxxx>
>>> Signed-off-by: Matthew Brost <matthew.brost@xxxxxxxxx>
>>> ---
>>>  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 48 +++++++++++++++++++++--
>>>  1 file changed, 45 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>>> index af7314d45a78..4eab319d61be 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
>>> @@ -69,6 +69,8 @@ static inline struct drm_device *ct_to_drm(struct intel_guc_ct *ct)
>>>  #define CTB_H2G_BUFFER_SIZE	(SZ_4K)
>>>  #define CTB_G2H_BUFFER_SIZE	(SZ_4K)
>>>  
>>> +#define MAX_US_STALL_CTB	1000000
>>
>> nit: maybe we should make it a CONFIG value ?
>>
> 
> Sure.
> 
>>> +
>>>  struct ct_request {
>>>  	struct list_head link;
>>>  	u32 fence;
>>> @@ -315,6 +317,7 @@ int intel_guc_ct_enable(struct intel_guc_ct *ct)
>>>  
>>>  	ct->requests.last_fence = 1;
>>>  	ct->enabled = true;
>>> +	ct->stall_time = KTIME_MAX;
>>>  
>>>  	return 0;
>>>  
>>> @@ -378,7 +381,7 @@ static int ct_write(struct intel_guc_ct *ct,
>>>  	unsigned int i;
>>>  
>>>  	if (unlikely(ctb->broken))
>>> -		return -EPIPE;
>>> +		return -EDEADLK;
>>>  
>>>  	if (unlikely(desc->status))
>>>  		goto corrupted;
>>> @@ -449,7 +452,7 @@ static int ct_write(struct intel_guc_ct *ct,
>>>  	CT_ERROR(ct, "Corrupted descriptor head=%u tail=%u status=%#x\n",
>>>  		 desc->head, desc->tail, desc->status);
>>>  	ctb->broken = true;
>>> -	return -EPIPE;
>>> +	return -EDEADLK;
>>>  }
>>>  
>>>  /**
>>> @@ -494,6 +497,17 @@ static int wait_for_ct_request_update(struct ct_request *req, u32 *status)
>>>  	return err;
>>>  }
>>>  
>>> +static inline bool ct_deadlocked(struct intel_guc_ct *ct)
>>> +{
>>> +	bool ret = ktime_us_delta(ktime_get(), ct->stall_time) >
>>> +		MAX_US_STALL_CTB;
>>> +
>>> +	if (unlikely(ret))
>>> +		CT_ERROR(ct, "CT deadlocked\n");
>>> +
>>> +	return ret;
>>> +}
>>> +
>>>  static inline bool ctb_has_room(struct intel_guc_ct_buffer *ctb, u32 len_dw)
>>>  {
>>>  	struct guc_ct_buffer_desc *desc = ctb->desc;
>>> @@ -505,6 +519,26 @@ static inline bool ctb_has_room(struct intel_guc_ct_buffer *ctb, u32 len_dw)
>>>  	return space >= len_dw;
>>>  }
>>>  
>>> +static int has_room_nb(struct intel_guc_ct *ct, u32 len_dw)
>>> +{
>>> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
>>> +
>>> +	lockdep_assert_held(&ct->ctbs.send.lock);
>>> +
>>> +	if (unlikely(!ctb_has_room(ctb, len_dw))) {
>>> +		if (ct->stall_time == KTIME_MAX)
>>> +			ct->stall_time = ktime_get();
>>> +
>>> +		if (unlikely(ct_deadlocked(ct)))
>>> +			return -EDEADLK;
>>> +		else
>>> +			return -EBUSY;
>>> +	}
>>> +
>>> +	ct->stall_time = KTIME_MAX;
>>> +	return 0;
>>> +}
>>> +
>>>  static int ct_send_nb(struct intel_guc_ct *ct,
>>>  		      const u32 *action,
>>>  		      u32 len,
>>> @@ -517,7 +551,7 @@ static int ct_send_nb(struct intel_guc_ct *ct,
>>>  
>>>  	spin_lock_irqsave(&ctb->lock, spin_flags);
>>>  
>>> -	ret = ctb_has_room(ctb, len + 1);
>>> +	ret = has_room_nb(ct, len + 1);
>>>  	if (unlikely(ret))
>>>  		goto out;
>>>  
>>> @@ -561,11 +595,19 @@ static int ct_send(struct intel_guc_ct *ct,
>>>  retry:
>>>  	spin_lock_irqsave(&ct->ctbs.send.lock, flags);
>>>  	if (unlikely(!ctb_has_room(ctb, len + 1))) {
>>> +		if (ct->stall_time == KTIME_MAX)
>>> +			ct->stall_time = ktime_get();
>>>  		spin_unlock_irqrestore(&ct->ctbs.send.lock, flags);
>>> +
>>> +		if (unlikely(ct_deadlocked(ct)))
>>> +			return -EDEADLK;
>>> +
>>
>> likely, instead of duplicating code, you can reuse has_room_nb here
>>
> 
> In this patch yes, in the following patch no as this check changes
> between non-blockig and blocking once we introduce G2H credits. I'd
> rather just leave it as is than churning on the patches.
> 
> Matt 
>  
>>>  		cond_resched();
>>>  		goto retry;
>>>  	}
>>>  
>>> +	ct->stall_time = KTIME_MAX;
>>> +
>>>  	fence = ct_get_next_fence(ct);
>>>  	request.fence = fence;
>>>  	request.status = 0;
>>>