On Wed, Jul 27, 2016 at 12:14:59PM +0100, Chris Wilson wrote:
> If we enable RCU for the requests (providing a grace period where we can
> inspect a "dead" request before it is freed), we can allow callers to
> carefully perform lockless lookup of an active request.
>
> However, by enabling deferred freeing of requests, we can potentially
> hog a lot of memory when dealing with tens of thousands of requests per
> second - with a quick insertion of a synchronize_rcu() inside our
> shrinker callback, that issue disappears.
>
> v2: Currently, it is our responsibility to handle reclaim i.e. to avoid
> hogging memory with the delayed slab frees. At the moment, we wait for a
> grace period in the shrinker, and block for all RCU callbacks on oom.
> Suggested alternatives focus on flushing our RCU callback when we have a
> certain number of outstanding request frees, and blocking on that flush
> after a second high watermark. (So rather than wait for the system to
> run out of memory, we stop issuing requests - both are nondeterministic.)
>
> Paul E. McKenney wrote:
>
> Another approach is synchronize_rcu() after some largish number of
> requests. The advantage of this approach is that it throttles the
> production of callbacks at the source. The corresponding disadvantage
> is that it slows things up.
>
> Another approach is to use call_rcu(), but if the previous call_rcu()
> is still in flight, block waiting for it. Yet another approach is
> the get_state_synchronize_rcu() / cond_synchronize_rcu() pair. The
> idea is to do something like this:
>
>         cond_synchronize_rcu(cookie);
>         cookie = get_state_synchronize_rcu();
>
> You would of course do an initial get_state_synchronize_rcu() to
> get things going. This would not block unless there was less than
> one grace period's worth of time between invocations. But this
> assumes a busy system, where there is almost always a grace period
> in flight. But you can make that happen as follows:
>
>         cond_synchronize_rcu(cookie);
>         cookie = get_state_synchronize_rcu();
>         call_rcu(&my_rcu_head, noop_function);
>
> Note that you need additional code to make sure that the old callback
> has completed before doing a new one. Setting and clearing a flag
> with appropriate memory ordering control suffices (e.g., smp_load_acquire()
> and smp_store_release()).
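As an aside, Paul's cookie-plus-noop-callback scheme could look roughly like
the sketch below if we ever go that way. This is only an illustration of the
idea, not code from the patch; the free_throttle naming is made up here.

#include <linux/kernel.h>
#include <linux/rcupdate.h>

/* Sketch: throttle RCU-deferred frees with the get_state_synchronize_rcu()/
 * cond_synchronize_rcu() pair, keeping a grace period in flight via a no-op
 * callback as described above.
 */
struct free_throttle {
        unsigned long cookie;           /* from get_state_synchronize_rcu() */
        struct rcu_head head;
        int callback_busy;              /* no-op callback still in flight? */
};

static void free_throttle_noop(struct rcu_head *head)
{
        struct free_throttle *t = container_of(head, struct free_throttle, head);

        /* Pairs with the smp_load_acquire() in free_throttle_flush(). */
        smp_store_release(&t->callback_busy, 0);
}

/* Call after each batch of deferred request frees. This blocks only if less
 * than a full grace period has elapsed since the previous call, i.e. only
 * when we produce callbacks faster than RCU retires them. Initialise
 * t->cookie with get_state_synchronize_rcu() (and callback_busy to 0)
 * before first use.
 */
static void free_throttle_flush(struct free_throttle *t)
{
        cond_synchronize_rcu(t->cookie);
        t->cookie = get_state_synchronize_rcu();

        /* Keep a grace period in flight so the next cond_synchronize_rcu()
         * usually finds one already expired; reuse the rcu_head only after
         * the previous no-op callback has completed.
         */
        if (!smp_load_acquire(&t->callback_busy)) {
                t->callback_busy = 1;
                call_rcu(&t->head, free_throttle_noop);
        }
}

As the v2 note above says, this just moves the nondeterminism from the
shrinker/oom path to the request producer.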
>
> Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> Cc: Maarten Lankhorst <maarten.lankhorst@xxxxxxxxxxxxxxx>
> Cc: "Goel, Akash" <akash.goel@xxxxxxxxx>
> Cc: Josh Triplett <josh@xxxxxxxxxxxxxxxx>
> ---
> drivers/gpu/drm/i915/i915_gem.c | 7 +-
> drivers/gpu/drm/i915/i915_gem_request.c | 2 +-
> drivers/gpu/drm/i915/i915_gem_request.h | 114 +++++++++++++++++++++++++++++--
> drivers/gpu/drm/i915/i915_gem_shrinker.c | 15 ++--
> 4 files changed, 126 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 54d8a3863d11..0c546f8099d9 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -4421,7 +4421,9 @@ i915_gem_load_init(struct drm_device *dev)
> dev_priv->requests =
> kmem_cache_create("i915_gem_request",
> sizeof(struct drm_i915_gem_request), 0,
> - SLAB_HWCACHE_ALIGN,
> + SLAB_HWCACHE_ALIGN |
> + SLAB_RECLAIM_ACCOUNT |
> + SLAB_DESTROY_BY_RCU,
> NULL);
>
> INIT_LIST_HEAD(&dev_priv->context_list);
> @@ -4457,6 +4459,9 @@ void i915_gem_load_cleanup(struct drm_device *dev)
> kmem_cache_destroy(dev_priv->requests);
> kmem_cache_destroy(dev_priv->vmas);
> kmem_cache_destroy(dev_priv->objects);
> +
> + /* And ensure that our DESTROY_BY_RCU slabs are truly destroyed */
> + rcu_barrier();
> }
>
> int i915_gem_freeze_late(struct drm_i915_private *dev_priv)
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 3395c955a532..bcc1369c0693 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -190,7 +190,7 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
> prefetchw(next);
>
> INIT_LIST_HEAD(&active->link);
> - active->__request = NULL;
> + RCU_INIT_POINTER(active->__request, NULL);
>
> active->retire(active, request);
> }
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index 2eec0cac1e9f..bb03f4440b0f 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -183,6 +183,12 @@ i915_gem_request_get(struct drm_i915_gem_request *req)
> return to_request(fence_get(&req->fence));
> }
>
> +static inline struct drm_i915_gem_request *
> +i915_gem_request_get_rcu(struct drm_i915_gem_request *req)
> +{
> + return to_request(fence_get_rcu(&req->fence));
> +}
> +
> static inline void
> i915_gem_request_put(struct drm_i915_gem_request *req)
> {
> @@ -286,7 +292,7 @@ typedef void (*i915_gem_retire_fn)(struct i915_gem_active *,
> struct drm_i915_gem_request *);
>
> struct i915_gem_active {
> - struct drm_i915_gem_request *__request;
> + struct drm_i915_gem_request __rcu *__request;
> struct list_head link;
> i915_gem_retire_fn retire;
> };
> @@ -323,13 +329,19 @@ i915_gem_active_set(struct i915_gem_active *active,
> struct drm_i915_gem_request *request)
> {
> list_move(&active->link, &request->active_list);
> - active->__request = request;
> + rcu_assign_pointer(active->__request, request);
> }
>
> static inline struct drm_i915_gem_request *
> __i915_gem_active_peek(const struct i915_gem_active *active)
> {
> - return active->__request;
> + /* Inside the error capture (running with the driver in an unknown
> + * state), we want to bend the rules slightly (a lot).
> + *
> + * Work is in progress to make it safer, in the meantime this keeps
> + * the known issue from spamming the logs.
> + */
> + return rcu_dereference_protected(active->__request, 1);
> }
>
> /**
> @@ -345,7 +357,29 @@ i915_gem_active_peek(const struct i915_gem_active *active, struct mutex *mutex)
> {
> struct drm_i915_gem_request *request;
>
> - request = active->__request;
> + request = rcu_dereference_protected(active->__request,
> + lockdep_is_held(mutex));
> + if (!request || i915_gem_request_completed(request))
> + return NULL;
> +
> + return request;
> +}
> +
> +/**
> + * i915_gem_active_peek_rcu - report the active request being monitored
> + * @active - the active tracker
> + *
> + * i915_gem_active_peek_rcu() returns the current request being tracked if
> + * still active, or NULL. It does not obtain a reference on the request
> + * for the caller, and inspection of the request is only valid under
> + * the RCU lock.
> + */
> +static inline struct drm_i915_gem_request *
> +i915_gem_active_peek_rcu(const struct i915_gem_active *active)
> +{
> + struct drm_i915_gem_request *request;
> +
> + request = rcu_dereference(active->__request);
> if (!request || i915_gem_request_completed(request))
> return NULL;
>
> @@ -366,6 +400,72 @@ i915_gem_active_get(const struct i915_gem_active *active, struct mutex *mutex)
> }
>
> /**
> + * i915_gem_active_get_rcu - return a reference to the active request
> + * @active - the active tracker
> + *
> + * i915_gem_active_get() returns a reference to the active request, or NULL
> + * if the active tracker is idle. The caller must hold the RCU read lock.
> + */
> +static inline struct drm_i915_gem_request *
> +i915_gem_active_get_rcu(const struct i915_gem_active *active)
> +{
> + /* Performing a lockless retrieval of the active request is super
> + * tricky. SLAB_DESTROY_BY_RCU merely guarantees that the backing
> + * slab of request objects will not be freed whilst we hold the
> + * RCU read lock. It does not guarantee that the request itself
> + * will not be freed and then *reused*. Viz,
> + *
> + * Thread A                          Thread B
> + *
> + * req = active.request
> + *                                   retire(req) -> free(req);
> + *                                   (req is now first on the slab freelist)
> + *                                   active.request = NULL
> + *
> + *                                   req = new submission on a new object
> + *                                   ref(req)
> + *
> + * To prevent the request from being reused whilst the caller
> + * uses it, we take a reference like normal. Whilst acquiring
> + * the reference we check that it is not in a destroyed state
> + * (refcnt == 0). That prevents the request being reallocated
> + * whilst the caller holds on to it. To check that the request
> + * was not reallocated as we acquired the reference we have to
> + * check that our request remains the active request across
> + * the lookup, in the same manner as a seqlock. The visibility
> + * of the pointer versus the reference counting is controlled
> + * by using RCU barriers (rcu_dereference and rcu_assign_pointer).
> + *
> + * In the middle of all that, we inspect whether the request is
> + * complete. Retiring is lazy so the request may be completed long
> + * before the active tracker is updated. Querying whether the
> + * request is complete is far cheaper (as it involves no locked
> + * instructions setting cachelines to exclusive) than acquiring
> + * the reference, so we do it first. The RCU read lock ensures the
> + * pointer dereference is valid, but does not ensure that the
> + * seqno nor HWS is the right one! However, if the request was
> + * reallocated, that means the active tracker's request was complete.
> + * If the new request is also complete, then both are and we can
> + * just report the active tracker is idle. If the new request is
> + * incomplete, then we acquire a reference on it and check that
> + * it remained the active request.
> + */
> + do {
> + struct drm_i915_gem_request *request;
> +
> + request = rcu_dereference(active->__request);
> + if (!request || i915_gem_request_completed(request))
> + return NULL;
> +
> + request = i915_gem_request_get_rcu(request);

I think we have a race here still: The issue is that kref_get_unless_zero
is an unordered atomic, and the rcu_dereference is only an
smp_read_barrier_depends, which doesn't prevent the fetch from happening
before the atomic_add_unless. Or so I thought, until I opened
memory-barriers.txt and learned that atomic_add_unless is a full smp_mb()
on both sides on success.

That's a bit too tricky for my taste; what about the following comment:

	/* When request_get_rcu succeeds the underlying
	 * atomic_add_unless has a full smp_mb() on both sides.
	 * This ensures that the rcu_dereference() below can't be
	 * reordered before the refcounting increase has
	 * happened, which prevents the request from being reused.
	 */

I couldn't poke any other holes into this, and we're reusing the fence rcu
functions where appropriate. With the comment:

Reviewed-by: Daniel Vetter <daniel.vetter@xxxxxxxx>

> + if (!request || request == rcu_dereference(active->__request))
> + return request;
> +
> + i915_gem_request_put(request);
> + } while (1);
> +}
> +
> +/**
> * __i915_gem_active_is_busy - report whether the active tracker is assigned
> * @active - the active tracker
> *
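For anyone following along, stripped of the i915 specifics the lookup loop
above reduces to the generic SLAB_DESTROY_BY_RCU shape sketched below;
struct obj, obj_cache and obj_release() are stand-ins for illustration, not
code in this patch.

#include <linux/kref.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct obj {
        struct kref ref;
        /* payload ... */
};

static struct kmem_cache *obj_cache;    /* created with SLAB_DESTROY_BY_RCU */

static void obj_release(struct kref *ref)
{
        /* The slab pages outlive the RCU grace period, but the object
         * itself may be recycled immediately - hence the recheck below.
         */
        kmem_cache_free(obj_cache, container_of(ref, struct obj, ref));
}

static struct obj *obj_lookup_rcu(struct obj __rcu **slot)
{
        struct obj *o;

        rcu_read_lock();
        do {
                o = rcu_dereference(*slot);
                if (!o)
                        break;

                /* kref_get_unless_zero() fails if the object is already on
                 * its way to being freed; on success the underlying
                 * atomic_add_unless() implies full barriers, so the re-read
                 * of *slot below cannot be hoisted above the refcount bump.
                 */
                if (!kref_get_unless_zero(&o->ref)) {
                        o = NULL;
                        break;
                }

                /* The slab may have handed the object to a new owner between
                 * the first dereference and taking the reference; recheck the
                 * slot, seqlock-style, and retry if it changed under us.
                 */
                if (o == rcu_dereference(*slot))
                        break;

                kref_put(&o->ref, obj_release);
        } while (1);
        rcu_read_unlock();

        return o;
}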
> @@ -433,7 +533,8 @@ i915_gem_active_retire(struct i915_gem_active *active,
> struct drm_i915_gem_request *request;
> int ret;
>
> - request = active->__request;
> + request = rcu_dereference_protected(active->__request,
> + lockdep_is_held(mutex));
> if (!request)
> return 0;
>
> @@ -442,7 +543,8 @@ i915_gem_active_retire(struct i915_gem_active *active,
> return ret;
>
> list_del_init(&active->link);
> - active->__request = NULL;
> + RCU_INIT_POINTER(active->__request, NULL);
> +
> active->retire(active, request);
>
> return 0;
> diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> index 33f8dcb9b8c4..a1a805fcdffa 100644
> --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
> +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> @@ -191,6 +191,8 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
> intel_runtime_pm_put(dev_priv);
>
> i915_gem_retire_requests(dev_priv);
> + /* expedite the RCU grace period to free some request slabs */
> + synchronize_rcu_expedited();
>
> return count;
> }
> @@ -211,10 +213,15 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
> */
> unsigned long i915_gem_shrink_all(struct drm_i915_private *dev_priv)
> {
> - return i915_gem_shrink(dev_priv, -1UL,
> - I915_SHRINK_BOUND |
> - I915_SHRINK_UNBOUND |
> - I915_SHRINK_ACTIVE);
> + unsigned long freed;
> +
> + freed = i915_gem_shrink(dev_priv, -1UL,
> + I915_SHRINK_BOUND |
> + I915_SHRINK_UNBOUND |
> + I915_SHRINK_ACTIVE);
> + rcu_barrier(); /* wait until our RCU delayed slab frees are completed */
> +
> + return freed;
> }
>
> static bool i915_gem_shrinker_lock(struct drm_device *dev, bool *unlock)
> --
> 2.8.1

--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx