Re: [PATCH v8 3/7] drm/i915: Define and use GuC and CTB TLB invalidation routines

Hi Tvrtko,

On 10/9/2023 2:54 PM, Tvrtko Ursulin wrote:
<snip>

+static long must_wait_woken(struct wait_queue_entry *wq_entry, long timeout)
+{
+    /*
+     * This is equivalent to wait_woken() with the exception that
+     * we do not wake up early if the kthread task has been completed.
+     * As we are called from page reclaim in any task context,
+     * we may be invoked from stopped kthreads, but we *must*
+     * complete the wait for the HW response.
+     *
+     * A second problem is that since we are called under reclaim
+     * and wait_woken() inspected the thread state, it makes an invalid
+     * assumption that all PF_KTHREAD tasks have set_kthread_struct()
+     * called upon them, and will trigger a GPF

As discussed internally, the GPF issue is resolved with https://lore.kernel.org/all/20230602212350.535358-1-jstultz@xxxxxxxxxx/

If it means not open-coding a slightly different wait_woken(), that would be a lot better indeed.

Although the question of why a single wait queue is not good enough still remains. As I wrote before: give each invalidation a seqno; upon receiving the done G2H, store the latest seqno in the driver and wake up all sleepers. They check the seqno, and if theirs has passed they exit, otherwise they go back to sleep. No xarray needed. put_pages() invalidations are already serialized, so no ill effect there; and are GGTT invalidations performance sensitive for some reason, or not?
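
A minimal sketch of that scheme, for reference (all names below are hypothetical, not taken from the patch):

static DECLARE_WAIT_QUEUE_HEAD(tlb_inval_wq);
static u32 tlb_inval_done_seqno; /* latest seqno reported done by GuC */

static bool tlb_seqno_passed(u32 seqno)
{
    /* Signed delta handles wrap of the 32b seqno space */
    return (s32)(READ_ONCE(tlb_inval_done_seqno) - seqno) >= 0;
}

/* Done G2H handler: publish the seqno and wake every sleeper */
static void tlb_invalidation_done(u32 seqno)
{
    WRITE_ONCE(tlb_inval_done_seqno, seqno);
    wake_up_all(&tlb_inval_wq); /* orders the store vs. the waiters' check */
}

/* Waiter: sleep until its own seqno has passed or the timeout expires */
static int wait_tlb_invalidation(u32 seqno, long timeout)
{
    if (!wait_event_timeout(tlb_inval_wq, tlb_seqno_passed(seqno), timeout))
        return -ETIME;

    return 0;
}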


I think your proposed solution should work, as far as I understand, because we are doing TLB invalidation of all engines and it is serialized with gt->tlb.invalidate_lock.

We might need the per-seqno tracking again when we want to make this more fine-grained and do VMA-based ranged invalidation for better performance. So I think we should try with a single wait queue for now.
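
To illustrate the serialization point, the full-invalidation path funnels through that lock, roughly like this (hypothetical wrapper, but gt->tlb.invalidate_lock is from the existing code):

static int gt_invalidate_tlb_full(struct intel_gt *gt)
{
    int err;

    /* Only one invalidation in flight at a time, so at most one waiter */
    mutex_lock(&gt->tlb.invalidate_lock);
    err = intel_guc_invalidate_tlb_engines(&gt->uc.guc);
    mutex_unlock(&gt->tlb.invalidate_lock);

    return err;
}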


I don't have a concrete answer yet, maybe someone else does?


Regards,

Nirmoy


Regards,

Tvrtko



+     */
+    do {
+        set_current_state(TASK_UNINTERRUPTIBLE);
+        if (wq_entry->flags & WQ_FLAG_WOKEN)
+            break;
+
+        timeout = schedule_timeout(timeout);
+    } while (timeout);
+    __set_current_state(TASK_RUNNING);
+
+    /* See wait_woken() and woken_wake_function() */
+    smp_store_mb(wq_entry->flags, wq_entry->flags & ~WQ_FLAG_WOKEN);
+
+    return timeout;
+}
+
+static int guc_send_invalidate_tlb(struct intel_guc *guc, enum intel_guc_tlb_inval_mode type)


The 2nd param should be enum intel_guc_tlb_invalidation_type, not enum intel_guc_tlb_inval_mode.

Not sure why CI didn't complain, presumably because C implicitly converts between enum types, so this still compiles.


Regards,

Nirmoy
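
i.e. the prototype should presumably read:

static int guc_send_invalidate_tlb(struct intel_guc *guc,
                   enum intel_guc_tlb_invalidation_type type)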

+{
+    struct intel_guc_tlb_wait _wq, *wq = &_wq;
+    DEFINE_WAIT_FUNC(wait, woken_wake_function);
+    int err;
+    u32 seqno;
+    u32 action[] = {
+        INTEL_GUC_ACTION_TLB_INVALIDATION,
+        0,
+        REG_FIELD_PREP(INTEL_GUC_TLB_INVAL_TYPE_MASK, type) |
+            REG_FIELD_PREP(INTEL_GUC_TLB_INVAL_MODE_MASK,
+                       INTEL_GUC_TLB_INVAL_MODE_HEAVY) |
+            INTEL_GUC_TLB_INVAL_FLUSH_CACHE,
+    };
+    u32 size = ARRAY_SIZE(action);
+
+    init_waitqueue_head(&_wq.wq);
+
+    if (xa_alloc_cyclic_irq(&guc->tlb_lookup, &seqno, wq,
+                xa_limit_32b, &guc->next_seqno,
+                GFP_ATOMIC | __GFP_NOWARN) < 0) {
+        /* Under severe memory pressure? Serialise TLB allocations */
+        xa_lock_irq(&guc->tlb_lookup);
+        wq = xa_load(&guc->tlb_lookup, guc->serial_slot);
+        wait_event_lock_irq(wq->wq,
+                    !READ_ONCE(wq->busy),
+                    guc->tlb_lookup.xa_lock);
+        /*
+         * Update wq->busy under lock to ensure only one waiter can
+         * issue the TLB invalidation command using the serial slot at a
+         * time. The condition is set to true before releasing the lock
+         * so that other callers continue to wait until woken up again.
+         */
+        wq->busy = true;
+        xa_unlock_irq(&guc->tlb_lookup);
+
+        seqno = guc->serial_slot;
+    }
+
+    action[1] = seqno;
+
+    add_wait_queue(&wq->wq, &wait);
+
+    /*
+     * This is a critical reclaim path and thus we must loop here:
+     * We cannot block for anything that is on the GPU.
+     */
+    err = intel_guc_send_busy_loop(guc, action, size, G2H_LEN_DW_INVALIDATE_TLB, true);
+    if (err)
+        goto out;
+
+    if (!must_wait_woken(&wait, intel_guc_ct_expected_delay(&guc->ct))) {
+        guc_err(guc,
+            "TLB invalidation response timed out for seqno %u\n", seqno);
+        err = -ETIME;
+    }
+out:
+    remove_wait_queue(&wq->wq, &wait);
+    if (seqno != guc->serial_slot)
+        xa_erase_irq(&guc->tlb_lookup, seqno);
+
+    return err;
+}
+
+/* Full TLB invalidation */
+int intel_guc_invalidate_tlb_engines(struct intel_guc *guc)
+{
+    return guc_send_invalidate_tlb(guc, INTEL_GUC_TLB_INVAL_ENGINES);
+}
+
+/* GuC TLB Invalidation: Invalidate the TLB's of GuC itself. */
+int intel_guc_invalidate_tlb_guc(struct intel_guc *guc)
+{
+    return guc_send_invalidate_tlb(guc, INTEL_GUC_TLB_INVAL_GUC);
+}
+
  int intel_guc_deregister_done_process_msg(struct intel_guc *guc,
                        const u32 *msg,
                        u32 len)
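
For context, the G2H completion side that pairs with must_wait_woken() would look roughly like this (a sketch based on the quoted code; the handler name is an assumption, not quoted from the patch):

static void guc_tlb_invalidation_done(struct intel_guc *guc, u32 seqno)
{
    struct intel_guc_tlb_wait *wait;
    unsigned long flags;

    xa_lock_irqsave(&guc->tlb_lookup, flags);
    wait = xa_load(&guc->tlb_lookup, seqno);
    if (wait)
        /* woken_wake_function() sets WQ_FLAG_WOKEN for must_wait_woken() */
        wake_up(&wait->wq);
    xa_unlock_irqrestore(&guc->tlb_lookup, flags);
}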
