Re: [PATCH 1/4] drm/ttm: add a pointer to the allocating BO into ttm_resource

Christian König <ckoenig.leichtzumerken@xxxxxxxxx> · Fri, 11 Jun 2021 13:30:38 +0200

Am 11.06.21 um 07:34 schrieb Thomas Hellström (Intel):
Hi, Christian,

I know you have a lot on your plate, and that the drm community is a 
bit lax about following the kernel patch submitting guidelines, but 
now that we're also spinning up a number of Intel developers on TTM 
could we please make a better effort with cover letters and commit 
messages so that they understand what the purpose and end goal of the 
series is. A reviewer shouldn't have to look at the last patch to try 
to get an understanding what the series is doing and why.

Sorry, that was send out this early unintentionally. See it more like an 
RFC.


On 6/10/21 1:05 PM, Christian König wrote:
We are going to need this for the next patch


and it allows us to clean
up amdgpu as well.

The amdgpu changes are not reflected in the commit title.



Signed-off-by: Christian König <christian.koenig@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c | 47 ++++++++-------------
  drivers/gpu/drm/ttm/ttm_resource.c          |  1 +
  include/drm/ttm/ttm_resource.h              |  1 +
  3 files changed, 19 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
index 194f9eecf89c..8e3f5da44e4f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
@@ -26,23 +26,12 @@
    #include "amdgpu.h"
  -struct amdgpu_gtt_node {
-    struct ttm_buffer_object *tbo;
-    struct ttm_range_mgr_node base;
-};
-
  static inline struct amdgpu_gtt_mgr *
  to_gtt_mgr(struct ttm_resource_manager *man)
  {
      return container_of(man, struct amdgpu_gtt_mgr, manager);
  }
  -static inline struct amdgpu_gtt_node *
-to_amdgpu_gtt_node(struct ttm_resource *res)
-{
-    return container_of(res, struct amdgpu_gtt_node, base.base);
-}
-
  /**
   * DOC: mem_info_gtt_total
   *
@@ -107,9 +96,9 @@ const struct attribute_group 
amdgpu_gtt_mgr_attr_group = {
   */
  bool amdgpu_gtt_mgr_has_gart_addr(struct ttm_resource *res)
  {
-    struct amdgpu_gtt_node *node = to_amdgpu_gtt_node(res);
+    struct ttm_range_mgr_node *node = to_ttm_range_mgr_node(res);
  -    return drm_mm_node_allocated(&node->base.mm_nodes[0]);
+    return drm_mm_node_allocated(&node->mm_nodes[0]);
  }
    /**
@@ -129,7 +118,7 @@ static int amdgpu_gtt_mgr_new(struct 
ttm_resource_manager *man,
  {
      struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
      uint32_t num_pages = PFN_UP(tbo->base.size);
-    struct amdgpu_gtt_node *node;
+    struct ttm_range_mgr_node *node;
      int r;
        spin_lock(&mgr->lock);
@@ -141,19 +130,17 @@ static int amdgpu_gtt_mgr_new(struct 
ttm_resource_manager *man,
      atomic64_sub(num_pages, &mgr->available);
      spin_unlock(&mgr->lock);
  -    node = kzalloc(struct_size(node, base.mm_nodes, 1), GFP_KERNEL);
+    node = kzalloc(struct_size(node, mm_nodes, 1), GFP_KERNEL);
      if (!node) {
          r = -ENOMEM;
          goto err_out;
      }
  -    node->tbo = tbo;
-    ttm_resource_init(tbo, place, &node->base.base);
-
+    ttm_resource_init(tbo, place, &node->base);
      if (place->lpfn) {
          spin_lock(&mgr->lock);
          r = drm_mm_insert_node_in_range(&mgr->mm,
-                        &node->base.mm_nodes[0],
+                        &node->mm_nodes[0],
                          num_pages, tbo->page_alignment,
                          0, place->fpfn, place->lpfn,
                          DRM_MM_INSERT_BEST);
@@ -161,14 +148,14 @@ static int amdgpu_gtt_mgr_new(struct 
ttm_resource_manager *man,
          if (unlikely(r))
              goto err_free;
  -        node->base.base.start = node->base.mm_nodes[0].start;
+        node->base.start = node->mm_nodes[0].start;
      } else {
-        node->base.mm_nodes[0].start = 0;
-        node->base.mm_nodes[0].size = node->base.base.num_pages;
-        node->base.base.start = AMDGPU_BO_INVALID_OFFSET;
+        node->mm_nodes[0].start = 0;
+        node->mm_nodes[0].size = node->base.num_pages;
+        node->base.start = AMDGPU_BO_INVALID_OFFSET;
      }
  -    *res = &node->base.base;
+    *res = &node->base;
      return 0;
    err_free:
@@ -191,12 +178,12 @@ static int amdgpu_gtt_mgr_new(struct 
ttm_resource_manager *man,
  static void amdgpu_gtt_mgr_del(struct ttm_resource_manager *man,
                     struct ttm_resource *res)
  {
-    struct amdgpu_gtt_node *node = to_amdgpu_gtt_node(res);
+    struct ttm_range_mgr_node *node = to_ttm_range_mgr_node(res);
      struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
        spin_lock(&mgr->lock);
-    if (drm_mm_node_allocated(&node->base.mm_nodes[0]))
-        drm_mm_remove_node(&node->base.mm_nodes[0]);
+    if (drm_mm_node_allocated(&node->mm_nodes[0]))
+        drm_mm_remove_node(&node->mm_nodes[0]);
      spin_unlock(&mgr->lock);
      atomic64_add(res->num_pages, &mgr->available);
  @@ -228,14 +215,14 @@ uint64_t amdgpu_gtt_mgr_usage(struct 
ttm_resource_manager *man)
  int amdgpu_gtt_mgr_recover(struct ttm_resource_manager *man)
  {
      struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
-    struct amdgpu_gtt_node *node;
+    struct ttm_range_mgr_node *node;
      struct drm_mm_node *mm_node;
      int r = 0;
        spin_lock(&mgr->lock);
      drm_mm_for_each_node(mm_node, &mgr->mm) {
-        node = container_of(mm_node, typeof(*node), base.mm_nodes[0]);
-        r = amdgpu_ttm_recover_gart(node->tbo);
+        node = container_of(mm_node, typeof(*node), mm_nodes[0]);
+        r = amdgpu_ttm_recover_gart(node->base.bo);
          if (r)
              break;
      }
diff --git a/drivers/gpu/drm/ttm/ttm_resource.c 
b/drivers/gpu/drm/ttm/ttm_resource.c
index 2431717376e7..7ff6194154fe 100644
--- a/drivers/gpu/drm/ttm/ttm_resource.c
+++ b/drivers/gpu/drm/ttm/ttm_resource.c
@@ -41,6 +41,7 @@ void ttm_resource_init(struct ttm_buffer_object *bo,
      res->bus.offset = 0;
      res->bus.is_iomem = false;
      res->bus.caching = ttm_cached;
+    res->bo = bo;
  }
  EXPORT_SYMBOL(ttm_resource_init);
  diff --git a/include/drm/ttm/ttm_resource.h 
b/include/drm/ttm/ttm_resource.h
index 140b6b9a8bbe..6d0b7a6d2169 100644
--- a/include/drm/ttm/ttm_resource.h
+++ b/include/drm/ttm/ttm_resource.h
@@ -171,6 +171,7 @@ struct ttm_resource {
      uint32_t mem_type;
      uint32_t placement;
      struct ttm_bus_placement bus;
+    struct ttm_buffer_object *bo;

Not that I'm against this change by itself, but this bo pointer is not 
refcounted, and therefore needs a description when it's needed and 
why. What happens, for example when the resource is moved to a ghost 
object, or the bo is killed while the resource is remaining on a lru 
list (which I understand was one of the main purposes with 
free-standing resources). Weak references need a guarantee that the 
object they pointed to is alive. What is that guarantee?

The basic idea is that we want to get rid of ghost objects and all the 
related workarounds in the mid term. But yes for the interim we probably 
need more logic here to make sure the BO is not destroyed.


Also could we introduce new TTM structure members where they are first 
used /referenced by TTM and not where they are used by amdgpu? Without 
finding out in patch 3 that this member is needed to look up the bo 
from a lru list the correct response to this patch would have been: 
That bo is amdgpu-specific and needs to be in a driver private struct...

That was indeed not supposed like this.

The question I rather wanted to raise if it's ok to make the resource 
object fully depend on the BO for allocation/destruction?

I've seen some code in the DG1 branch where a mock BO is used to 
allocate some resource, but that approach won't work with this here.

On the other hand this fixes a long outstanding and very annoying 
problem we had.

Christian.



Thanks,

/Thomas


  };
    /**