As kmem_caches share the same properties (size, allocation/free behaviour)
for all potential devices, we can use global caches. While this
potentially has worse fragmentation behaviour (one can argue that
different devices would have different activity lifetimes, but you can
also argue that activity is temporal across the system), it is the
default behaviour of the system at large to amalgamate matching caches.
The benefit for us is much reduced pointer dancing along the frequent
allocation paths.
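
For illustration, a minimal sketch of the one-global-cache-per-subsystem
pattern, with hypothetical names (example_node, example_global_*); the
real definitions live in i915_active.c / i915_globals.c and may differ
in detail:

#include <linux/slab.h>

/* Hypothetical node type; stands in for e.g. struct active_node. */
struct example_node {
	unsigned long payload;
};

static struct example_global {
	struct kmem_cache *slab_cache;
} global;

int __init example_global_init(void)
{
	global.slab_cache = KMEM_CACHE(example_node, SLAB_HWCACHE_ALIGN);
	if (!global.slab_cache)
		return -ENOMEM;

	return 0;
}

/*
 * The hot allocation path no longer dereferences drm_i915_private to
 * find the cache; the file-scope global is addressed directly.
 */
static struct example_node *example_node_alloc(gfp_t gfp)
{
	return kmem_cache_alloc(global.slab_cache, gfp);
}

void example_global_shrink(void)
{
	kmem_cache_shrink(global.slab_cache);
}

void example_global_exit(void)
{
	kmem_cache_destroy(global.slab_cache);
}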
v2: Defer shrinking until after a global grace period, futureproofing
for multiple consumers of the slab caches, similar to the current
strategy for avoiding shrinking too early.
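
A minimal sketch of how such deferred shrinking can be wired up with an
rcu_work; the helper names (example_globals_park/unpark, struct
park_work) and the epoch check are illustrative assumptions, not
necessarily what the new i915_globals.c does:

#include <linux/atomic.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

static atomic_t active;	/* number of unparked users */
static atomic_t epoch;	/* bumped on every unpark to cancel stale parks */

struct park_work {
	struct rcu_work work;
	int epoch;
};

static void __example_globals_park(struct work_struct *work)
{
	struct park_work *wrk = container_of(work, typeof(*wrk), work.work);

	/* Shrink only if nothing woke up during the grace period. */
	if (wrk->epoch == atomic_read(&epoch))
		i915_global_active_shrink();

	kfree(wrk);
}

void example_globals_park(void)
{
	struct park_work *wrk;

	if (!atomic_dec_and_test(&active))
		return;

	wrk = kmalloc(sizeof(*wrk), GFP_KERNEL);
	if (!wrk)
		return;

	wrk->epoch = atomic_inc_return(&epoch);
	INIT_RCU_WORK(&wrk->work, __example_globals_park);

	/* The work runs only after a full RCU grace period has elapsed. */
	queue_rcu_work(system_wq, &wrk->work);
}

void example_globals_unpark(void)
{
	atomic_inc(&epoch);	/* invalidate any park already queued */
	atomic_inc(&active);
}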
Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
---
drivers/gpu/drm/i915/Makefile | 1 +
drivers/gpu/drm/i915/i915_active.c | 7 +-
drivers/gpu/drm/i915/i915_active.h | 1 +
drivers/gpu/drm/i915/i915_drv.h | 3 -
drivers/gpu/drm/i915/i915_gem.c | 34 +-----
drivers/gpu/drm/i915/i915_globals.c | 105 ++++++++++++++++++
drivers/gpu/drm/i915/i915_globals.h | 15 +++
drivers/gpu/drm/i915/i915_pci.c | 8 +-
drivers/gpu/drm/i915/i915_request.c | 53 +++++++--
drivers/gpu/drm/i915/i915_request.h | 10 ++
drivers/gpu/drm/i915/i915_scheduler.c | 66 ++++++++---
drivers/gpu/drm/i915/i915_scheduler.h | 34 +++++-
drivers/gpu/drm/i915/intel_guc_submission.c | 3 +-
drivers/gpu/drm/i915/intel_lrc.c | 6 +-
drivers/gpu/drm/i915/intel_ringbuffer.h | 17 ---
drivers/gpu/drm/i915/selftests/intel_lrc.c | 2 +-
drivers/gpu/drm/i915/selftests/mock_engine.c | 48 ++++----
.../gpu/drm/i915/selftests/mock_gem_device.c | 26 -----
drivers/gpu/drm/i915/selftests/mock_request.c | 12 +-
drivers/gpu/drm/i915/selftests/mock_request.h | 7 --
20 files changed, 306 insertions(+), 152 deletions(-)
create mode 100644 drivers/gpu/drm/i915/i915_globals.c
create mode 100644 drivers/gpu/drm/i915/i915_globals.h
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 1787e1299b1b..a1d834068765 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -77,6 +77,7 @@ i915-y += \
i915_gem_tiling.o \
i915_gem_userptr.o \
i915_gemfs.o \
+ i915_globals.o \
i915_query.o \
i915_request.o \
i915_scheduler.o \
diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c
index 215b6ff8aa73..9026787ebdf8 100644
--- a/drivers/gpu/drm/i915/i915_active.c
+++ b/drivers/gpu/drm/i915/i915_active.c
@@ -280,7 +280,12 @@ int __init i915_global_active_init(void)
return 0;
}
-void __exit i915_global_active_exit(void)
+void i915_global_active_shrink(void)
+{
+ kmem_cache_shrink(global.slab_cache);
+}
+
+void i915_global_active_exit(void)
{
kmem_cache_destroy(global.slab_cache);
}
diff --git a/drivers/gpu/drm/i915/i915_active.h b/drivers/gpu/drm/i915/i915_active.h
index 12b5c1d287d1..5fbd9102384b 100644
--- a/drivers/gpu/drm/i915/i915_active.h
+++ b/drivers/gpu/drm/i915/i915_active.h
@@ -420,6 +420,7 @@ static inline void i915_active_fini(struct i915_active *ref) { }
#endif
int i915_global_active_init(void);
+void i915_global_active_shrink(void);
void i915_global_active_exit(void);
#endif /* _I915_ACTIVE_H_ */
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 37230ae7fbe6..a365b1a2ea9a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1459,9 +1459,6 @@ struct drm_i915_private {
struct kmem_cache *objects;
struct kmem_cache *vmas;
struct kmem_cache *luts;
- struct kmem_cache *requests;
- struct kmem_cache *dependencies;
- struct kmem_cache *priorities;
const struct intel_device_info __info; /* Use INTEL_INFO() to access. */
struct intel_runtime_info __runtime; /* Use RUNTIME_INFO() to access. */
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 1eb3a5f8654c..d18c4ccff370 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -42,6 +42,7 @@
#include "i915_drv.h"
#include "i915_gem_clflush.h"
#include "i915_gemfs.h"
+#include "i915_globals.h"
#include "i915_reset.h"
#include "i915_trace.h"
#include "i915_vgpu.h"
@@ -187,6 +188,8 @@ void i915_gem_unpark(struct drm_i915_private *i915)
if (unlikely(++i915->gt.epoch == 0)) /* keep 0 as invalid */
i915->gt.epoch = 1;
+ i915_globals_unpark();
+
intel_enable_gt_powersave(i915);
i915_update_gfx_val(i915);
if (INTEL_GEN(i915) >= 6)
@@ -2916,12 +2919,11 @@ static void shrink_caches(struct drm_i915_private *i915)
* filled slabs to prioritise allocating from the mostly full slabs,
* with the aim of reducing fragmentation.
*/
- kmem_cache_shrink(i915->priorities);
- kmem_cache_shrink(i915->dependencies);
- kmem_cache_shrink(i915->requests);
kmem_cache_shrink(i915->luts);
kmem_cache_shrink(i915->vmas);
kmem_cache_shrink(i915->objects);
+
+ i915_globals_park();