Re: [PATCH 1/2] drm/ttm: fix busy memory to fail other user v6

On 5/7/19 1:42 PM, Koenig, Christian wrote:
On 07.05.19 13:37, Thomas Hellstrom wrote:

On 5/7/19 1:24 PM, Christian König wrote:
On 07.05.19 13:22, zhoucm1 wrote:

On 2019-05-07 19:13, Koenig, Christian wrote:
On 07.05.19 13:08, zhoucm1 wrote:
On 2019-05-07 18:53, Koenig, Christian wrote:
On 07.05.19 11:36, Chunming Zhou wrote:
A heavy GPU job can occupy memory for a long time, which leads other users to fail to get memory.

This basically picks up Christian's idea:

1. Reserve the BO in DC using a ww_mutex ticket (trivial).
2. If we then run into this EBUSY condition in TTM, check if the BO we need memory for (or rather the ww_mutex of its reservation object) has a ticket assigned.
3. If we have a ticket, we grab a reference to the first BO on the LRU, drop the LRU lock and try to grab the reservation lock with the ticket.
4. If getting the reservation lock with the ticket succeeded, we check if the BO is still the first one on the LRU in question (the BO could have moved).
5. If the BO is still the first one on the LRU in question, we try to evict it as we would evict any other BO.
6. If any of the "if"s above fail, we just back off and return -EBUSY.
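In rough pseudo-C, the flow above is something like the following (a condensed sketch with hypothetical helper names; the actual implementation is in the diff below and differs in detail):

/* Condensed sketch of steps 1-6 above; helper names are hypothetical
 * and the refcounting/unlock paths are omitted (see the real diff below). */
static int evict_first_with_ticket(struct ttm_bo_device *bdev,
				   struct ttm_mem_type_manager *man,
				   const struct ttm_place *place,
				   struct ttm_operation_ctx *ctx)
{
	struct ww_acquire_ctx *ticket;
	struct ttm_buffer_object *busy_bo = NULL;

	/* Normal path: trylock-based scan of the LRU lists. */
	if (scan_lru_and_evict(bdev, man, place, ctx, &busy_bo) == 0)
		return 0;

	/* Step 2: no ticket (or nothing was merely busy) -> old behaviour. */
	ticket = (ctx && ctx->resv) ? ctx->resv->lock.ctx : NULL;
	if (!busy_bo || !ticket)
		return -EBUSY;

	/* Step 3: sleep on the busy BO's reservation using our ticket. */
	if (ww_mutex_lock(&busy_bo->resv->lock, ticket))
		return -EBUSY;		/* step 6: back off */

	/* Steps 4-5: re-check that the BO is still the first candidate on
	 * the LRU and, if so, evict it like any other BO. */
	return evict_if_still_first(bdev, man, place, busy_bo, ctx);
}

The key point is that only allocations that already hold a ww_mutex ticket are allowed to block on somebody else's reservation; everyone else keeps the old trylock/-EBUSY behaviour.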

v2: fix some minor checks
v3: address Christian's v2 comments.
v4: fix some missing
v5: handle first_bo unlock and bo_get/put
v6: abstract a unified iterate function, and handle all possible use cases, not only pinned BOs.

Change-Id: I21423fb922f885465f13833c41df1e134364a8e7
Signed-off-by: Chunming Zhou <david1.zhou@xxxxxxx>
---
 drivers/gpu/drm/ttm/ttm_bo.c | 113 ++++++++++++++++++++++++++++++-----
 1 file changed, 97 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 8502b3ed2d88..bbf1d14d00a7 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -766,11 +766,13 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
  * b. Otherwise, trylock it.
  */
 static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
-					   struct ttm_operation_ctx *ctx, bool *locked)
+					   struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
 {
 	bool ret = false;
 
 	*locked = false;
+	if (busy)
+		*busy = false;
 	if (bo->resv == ctx->resv) {
 		reservation_object_assert_held(bo->resv);
 		if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
@@ -779,35 +781,45 @@ static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
 	} else {
 		*locked = reservation_object_trylock(bo->resv);
 		ret = *locked;
+		if (!ret && busy)
+			*busy = true;
 	}
 
 	return ret;
 }
 
-static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
-			       uint32_t mem_type,
-			       const struct ttm_place *place,
-			       struct ttm_operation_ctx *ctx)
+static struct ttm_buffer_object*
+ttm_mem_find_evitable_bo(struct ttm_bo_device *bdev,
+			 struct ttm_mem_type_manager *man,
+			 const struct ttm_place *place,
+			 struct ttm_operation_ctx *ctx,
+			 struct ttm_buffer_object **first_bo,
+			 bool *locked)
 {
-	struct ttm_bo_global *glob = bdev->glob;
-	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
 	struct ttm_buffer_object *bo = NULL;
-	bool locked = false;
-	unsigned i;
-	int ret;
+	int i;
 
-	spin_lock(&glob->lru_lock);
+	if (first_bo)
+		*first_bo = NULL;
 	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
 		list_for_each_entry(bo, &man->lru[i], lru) {
-			if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
+			bool busy = false;
+			if (!ttm_bo_evict_swapout_allowable(bo, ctx, locked,
+							    &busy)) {
A newline between declaration and code please.

+				if (first_bo && !(*first_bo) && busy) {
+					ttm_bo_get(bo);
+					*first_bo = bo;
+				}
 				continue;
+			}
 
 			if (place && !bdev->driver->eviction_valuable(bo,
								      place)) {
-				if (locked)
+				if (*locked)
 					reservation_object_unlock(bo->resv);
 				continue;
 			}
+
 			break;
 		}
 
@@ -818,9 +830,66 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 		bo = NULL;
 	}
 
+	return bo;
+}
+
+static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
+			       uint32_t mem_type,
+			       const struct ttm_place *place,
+			       struct ttm_operation_ctx *ctx)
+{
+	struct ttm_bo_global *glob = bdev->glob;
+	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
+	struct ttm_buffer_object *bo = NULL, *first_bo = NULL;
+	bool locked = false;
+	int ret;
+
+	spin_lock(&glob->lru_lock);
+	bo = ttm_mem_find_evitable_bo(bdev, man, place, ctx, &first_bo,
+				      &locked);
 	if (!bo) {
+		struct ttm_operation_ctx busy_ctx;
+
 		spin_unlock(&glob->lru_lock);
-		return -EBUSY;
+		/* check if other user occupy memory too long time */
+		if (!first_bo || !ctx || !ctx->resv || !ctx->resv->lock.ctx) {
+			if (first_bo)
+				ttm_bo_put(first_bo);
+			return -EBUSY;
+		}
+		if (first_bo->resv == ctx->resv) {
+			ttm_bo_put(first_bo);
+			return -EBUSY;
+		}
+		if (ctx->interruptible)
+			ret = ww_mutex_lock_interruptible(&first_bo->resv->lock,
+							  ctx->resv->lock.ctx);
+		else
+			ret = ww_mutex_lock(&first_bo->resv->lock, ctx->resv->lock.ctx);
+		if (ret) {
+			ttm_bo_put(first_bo);
+			return ret;
+		}
+		spin_lock(&glob->lru_lock);
+		/* previous busy resv lock is held by above, idle now,
+		 * so let them evictable.
+		 */
+		busy_ctx.interruptible = ctx->interruptible;
+		busy_ctx.no_wait_gpu   = ctx->no_wait_gpu;
+		busy_ctx.resv          = first_bo->resv;
+		busy_ctx.flags         = TTM_OPT_FLAG_ALLOW_RES_EVICT;
+
+		bo = ttm_mem_find_evitable_bo(bdev, man, place, &busy_ctx, NULL,
+					      &locked);
+		if (bo && (bo->resv == first_bo->resv))
+			locked = true;
+		else if (bo)
+			ww_mutex_unlock(&first_bo->resv->lock);
+		if (!bo) {
+			spin_unlock(&glob->lru_lock);
+			ttm_bo_put(first_bo);
+			return -EBUSY;
+		}
 	}
 
 	kref_get(&bo->list_kref);
@@ -829,11 +898,15 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 		ret = ttm_bo_cleanup_refs(bo, ctx->interruptible,
 					  ctx->no_wait_gpu, locked);
 		kref_put(&bo->list_kref, ttm_bo_release_list);
+		if (first_bo)
+			ttm_bo_put(first_bo);
 		return ret;
 	}
 
 	ttm_bo_del_from_lru(bo);
 	spin_unlock(&glob->lru_lock);
+	if (first_bo)
+		ttm_bo_put(first_bo);
 
 	ret = ttm_bo_evict(bo, ctx);
 	if (locked) {
@@ -899,6 +972,13 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
 {
 	struct ttm_bo_device *bdev = bo->bdev;
 	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
+	struct ttm_operation_ctx native_ctx = {
+		.interruptible = false,
+		.no_wait_gpu = false,
+		.resv = bo->resv,
+		.flags = 0
+	};
+	struct ttm_operation_ctx *evict_ctx = ctx ? ctx : &native_ctx;
I thought we made the ctx parameter mandatory, didn't we? Could be
that
I remember that incorrectly.
Prike said he sees ctx->resv is NULL; in that case the code doesn't run into the busy path.
Oh, as you mentioned here, we would need to add .resv = bo->resv to every ttm_operation_ctx. That's a huge change which would cross all vendor drivers.

Can we just forcibly set evict_ctx->resv = bo->resv? That means we just add one extra line: evict_ctx->resv = bo->resv. How about that?
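For clarity, a sketch of what that one extra line could look like on top of the v6 hunk in ttm_bo_mem_force_space() (with the NULL check the reply below asks for, so a driver-provided reservation context is not overwritten):

	struct ttm_operation_ctx *evict_ctx = ctx ? ctx : &native_ctx;

	/* Proposed extra line: let the busy path find the caller's
	 * reservation object (and thus its ww_mutex ticket). Only do
	 * this when the driver did not already supply ctx->resv. */
	if (!evict_ctx->resv)
		evict_ctx->resv = bo->resv;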
Well, only if ctx->resv is NULL; otherwise we would overwrite a reservation context given by the driver.

Probably better to give the acquire_ctx as a separate parameter to ttm_mem_evict_first().
Still put the acquire_ctx into ttm_operation_ctx? Then that's the same as ctx->resv.
The current problem is that we don't pass resv anywhere except in the ALLOW_EVICT case.
If you are concerned about overwriting, we would have to do ".resv = bo->resv" in every ttm_operation_ctx definition.
No, what I mean is to add the acquire_ctx as a separate parameter to ttm_mem_evict_first().

E.g. we only need it in this function, and it is actually not related to the ttm operation context filled in by the driver.
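If I read that right, the suggestion is roughly the following (only a sketch; the parameter name and the call-site detail are guesses):

/*
 * Sketch of the suggestion: hand the acquire context to
 * ttm_mem_evict_first() directly instead of digging it out of
 * ctx->resv->lock.ctx.
 */
static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
			       uint32_t mem_type,
			       const struct ttm_place *place,
			       struct ttm_operation_ctx *ctx,
			       struct ww_acquire_ctx *ticket);

/* The caller that needs it, ttm_bo_mem_force_space(), would then pass
 * the BO's current acquire context (or NULL) explicitly: */
	ret = ttm_mem_evict_first(bdev, mem_type, place, ctx,
				  bo->resv->lock.ctx);

That keeps ttm_operation_ctx untouched and confines the ticket handling to the eviction path.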

FWIW, I think it would be nice at some point to have a reservation context be part of the ttm operation context, so that validate and evict could do sleeping reservations, and have BOs remain on the LRU even when reserved...
Yeah, well that's exactly what the ctx->resv parameter is good for :)

Hmm. I don't quite follow? It looks to me like ctx->resv is there to work around recursive reservations?

What I'm after is being able to do sleeping reservations within validate and evict, and to open up for returning -EDEADLK. One benefit would be to scan over the LRU lists, reserving exactly those BOs we want to evict, and when all are reserved, evict them. If we hit an -EDEADLK while evicting, we need to restart. For that we need an acquire_ctx in the ttm_operation_ctx.
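A rough sketch of that scan-reserve-evict idea, assuming a ww_acquire_ctx carried in (or alongside) the operation context; the helpers and the victim-list bookkeeping are hypothetical:

/* Hypothetical sketch only: reserve every BO we intend to evict with a
 * sleeping, ticketed lock, then evict them all; on -EDEADLK drop what
 * we hold and start over in ww_mutex back-off fashion. */
static int reserve_and_evict_sketch(struct ttm_mem_type_manager *man,
				    struct ww_acquire_ctx *ticket,
				    struct list_head *victims)
{
	struct ttm_buffer_object *bo;
	int ret;

restart:
	list_for_each_entry(bo, &man->lru[0], lru) {
		/* Sleeping reservation instead of a trylock. */
		ret = ww_mutex_lock_interruptible(&bo->resv->lock, ticket);
		if (ret == -EDEADLK) {
			/* Back off: unreserve everything gathered so far,
			 * then retry (a real version would take the
			 * contended lock with ww_mutex_lock_slow() first). */
			unreserve_victims(victims);
			goto restart;
		}
		if (ret)
			goto out_unreserve;

		add_victim(victims, bo);
		if (enough_space_reserved(man, victims))
			break;
	}

	/* All victims reserved and still on the LRU: evict them now. */
	return evict_victims(victims);

out_unreserve:
	unreserve_victims(victims);
	return ret;
}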


And yes, we do keep the BOs on the LRU even when they are reserved.

static inline int ttm_bo_reserve(struct ttm_buffer_object *bo,
                 bool interruptible, bool no_wait,
                 struct ww_acquire_ctx *ticket)
{
    int ret;

    WARN_ON(!kref_read(&bo->kref));

    ret = __ttm_bo_reserve(bo, interruptible, no_wait, ticket);
    if (likely(ret == 0))
        ttm_bo_del_sub_from_lru(bo);

    return ret;
}

/Thomas



Christian.

/Thomas


Christian.

-David
Christian.

-David
Christian.

 	int ret;
 
 	do {
@@ -907,7 +987,7 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
 			return ret;
 		if (mem->mm_node)
 			break;
-		ret = ttm_mem_evict_first(bdev, mem_type, place, ctx);
+		ret = ttm_mem_evict_first(bdev, mem_type, place, evict_ctx);
 		if (unlikely(ret != 0))
 			return ret;
 	} while (1);
@@ -1784,7 +1864,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct ttm_operation_ctx *ctx)
 	spin_lock(&glob->lru_lock);
 	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
 		list_for_each_entry(bo, &glob->swap_lru[i], swap) {
-			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
+			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
+							   NULL)) {
 				ret = 0;
 				break;
 			}
_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
dri-devel mailing list
dri-devel@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/dri-devel

