Some error conditions just stop a channel and fences get stuck, so they either need to be kicked ready by overwriting hw seq numbers (as nvgpu does) or faked with a sw flag like this. This is just a hack as an example of what would be needed. Here, a channel id whose fences should be force-updated is passed upwards with the uevent response. Normally, this is -1 to match no channel id, but some error paths fake an update event with an explicit channel id. Note: if userspace has some meaningful timeouts on the fences, then they do finish but without any notification that the channel is now broken (how do you distinguish a too-long gpu job from a stuck one?). In many cases, a channel needs to be shut down completely when it breaks (e.g., mmu fault). Signed-off-by: Konsta Hölttä <kholtta@xxxxxxxxxx> --- drm/nouveau/include/nvif/event.h | 1 + drm/nouveau/include/nvkm/engine/fifo.h | 2 +- drm/nouveau/nouveau_fence.c | 13 ++++++++----- drm/nouveau/nvkm/engine/fifo/base.c | 3 ++- drm/nouveau/nvkm/engine/fifo/gf100.c | 2 +- drm/nouveau/nvkm/engine/fifo/gk104.c | 7 ++++++- drm/nouveau/nvkm/engine/fifo/nv04.c | 2 +- 7 files changed, 20 insertions(+), 10 deletions(-) diff --git a/drm/nouveau/include/nvif/event.h b/drm/nouveau/include/nvif/event.h index d148b85..a9ff4ee 100644 --- a/drm/nouveau/include/nvif/event.h +++ b/drm/nouveau/include/nvif/event.h @@ -52,16 +52,17 @@ struct nvif_notify_conn_rep_v0 { }; struct nvif_notify_uevent_req { /* nvif_notify_req ... */ }; struct nvif_notify_uevent_rep { /* nvif_notify_rep ... */ + __u32 force_chid; }; struct nvif_notify_eevent_req { /* nvif_notify_req ... 
*/ u32 chid; }; struct nvif_notify_eevent_rep { diff --git a/drm/nouveau/include/nvkm/engine/fifo.h b/drm/nouveau/include/nvkm/engine/fifo.h index cbca477..946eb68 100644 --- a/drm/nouveau/include/nvkm/engine/fifo.h +++ b/drm/nouveau/include/nvkm/engine/fifo.h @@ -117,15 +117,15 @@ extern struct nvkm_oclass *gf100_fifo_oclass; extern struct nvkm_oclass *gk104_fifo_oclass; extern struct nvkm_oclass *gk20a_fifo_oclass; extern struct nvkm_oclass *gk208_fifo_oclass; extern struct nvkm_oclass *gm204_fifo_oclass; extern struct nvkm_oclass *gm20b_fifo_oclass; int nvkm_fifo_uevent_ctor(struct nvkm_object *, void *, u32, struct nvkm_notify *); -void nvkm_fifo_uevent(struct nvkm_fifo *); +void nvkm_fifo_uevent(struct nvkm_fifo *, u32 force_chid); void nvkm_fifo_eevent(struct nvkm_fifo *, u32 chid, u32 error); void nv04_fifo_intr(struct nvkm_subdev *); int nv04_fifo_context_attach(struct nvkm_object *, struct nvkm_object *); #endif diff --git a/drm/nouveau/nouveau_fence.c b/drm/nouveau/nouveau_fence.c index 38bccb0..b7d9987 100644 --- a/drm/nouveau/nouveau_fence.c +++ b/drm/nouveau/nouveau_fence.c @@ -123,50 +123,53 @@ nouveau_fence_context_put(struct kref *fence_ref) void nouveau_fence_context_free(struct nouveau_fence_chan *fctx) { kref_put(&fctx->fence_ref, nouveau_fence_context_put); } static int -nouveau_fence_update(struct nouveau_channel *chan, struct nouveau_fence_chan *fctx) +nouveau_fence_update(struct nouveau_channel *chan, + struct nouveau_fence_chan *fctx, u32 force_chid) { struct nouveau_fence *fence; int drop = 0; u32 seq = fctx->read(chan); + bool force = force_chid == chan->chid; while (!list_empty(&fctx->pending)) { fence = list_entry(fctx->pending.next, typeof(*fence), head); - if ((int)(seq - fence->base.seqno) < 0) + if ((int)(seq - fence->base.seqno) < 0 && !force) break; drop |= nouveau_fence_signal(fence); } return drop; } static int nouveau_fence_wait_uevent_handler(struct nvif_notify *notify) { struct nouveau_fence_chan *fctx = container_of(notify, 
typeof(*fctx), notify); + const struct nvif_notify_uevent_rep *rep = notify->data; unsigned long flags; int ret = NVIF_NOTIFY_KEEP; spin_lock_irqsave(&fctx->lock, flags); if (!list_empty(&fctx->pending)) { struct nouveau_fence *fence; struct nouveau_channel *chan; fence = list_entry(fctx->pending.next, typeof(*fence), head); chan = rcu_dereference_protected(fence->channel, lockdep_is_held(&fctx->lock)); - if (nouveau_fence_update(fence->channel, fctx)) + if (nouveau_fence_update(fence->channel, fctx, rep->force_chid)) ret = NVIF_NOTIFY_DROP; } spin_unlock_irqrestore(&fctx->lock, flags); return ret; } void @@ -278,17 +281,17 @@ nouveau_fence_emit(struct nouveau_fence *fence, struct nouveau_channel *chan) kref_get(&fctx->fence_ref); trace_fence_emit(&fence->base); ret = fctx->emit(fence); if (!ret) { fence_get(&fence->base); spin_lock_irq(&fctx->lock); - if (nouveau_fence_update(chan, fctx)) + if (nouveau_fence_update(chan, fctx, -1)) nvif_notify_put(&fctx->notify); list_add_tail(&fence->head, &fctx->pending); spin_unlock_irq(&fctx->lock); } return ret; } @@ -302,17 +305,17 @@ nouveau_fence_done(struct nouveau_fence *fence) struct nouveau_channel *chan; unsigned long flags; if (test_bit(FENCE_FLAG_SIGNALED_BIT, &fence->base.flags)) return true; spin_lock_irqsave(&fctx->lock, flags); chan = rcu_dereference_protected(fence->channel, lockdep_is_held(&fctx->lock)); - if (chan && nouveau_fence_update(chan, fctx)) + if (chan && nouveau_fence_update(chan, fctx, -1)) nvif_notify_put(&fctx->notify); spin_unlock_irqrestore(&fctx->lock, flags); } return fence_is_signaled(&fence->base); } static long nouveau_fence_wait_legacy(struct fence *f, bool intr, long wait) diff --git a/drm/nouveau/nvkm/engine/fifo/base.c b/drm/nouveau/nvkm/engine/fifo/base.c index a5dc6c9..e35d711 100644 --- a/drm/nouveau/nvkm/engine/fifo/base.c +++ b/drm/nouveau/nvkm/engine/fifo/base.c @@ -184,19 +184,20 @@ nvkm_fifo_uevent_ctor(struct nvkm_object *object, void *data, u32 size, notify->types = 1; 
notify->index = 0; } return ret; } void -nvkm_fifo_uevent(struct nvkm_fifo *fifo) +nvkm_fifo_uevent(struct nvkm_fifo *fifo, u32 force_chid) { struct nvif_notify_uevent_rep rep = { + .force_chid = force_chid }; nvkm_event_send(&fifo->uevent, 1, 0, &rep, sizeof(rep)); } static int nvkm_fifo_eevent_ctor(struct nvkm_object *object, void *data, u32 size, struct nvkm_notify *notify) { diff --git a/drm/nouveau/nvkm/engine/fifo/gf100.c b/drm/nouveau/nvkm/engine/fifo/gf100.c index b745252..ca86dfe 100644 --- a/drm/nouveau/nvkm/engine/fifo/gf100.c +++ b/drm/nouveau/nvkm/engine/fifo/gf100.c @@ -732,17 +732,17 @@ gf100_fifo_intr_engine_unit(struct gf100_fifo_priv *priv, int engn) u32 inte = nv_rd32(priv, 0x002628); u32 unkn; nv_wr32(priv, 0x0025a8 + (engn * 0x04), intr); for (unkn = 0; unkn < 8; unkn++) { u32 ints = (intr >> (unkn * 0x04)) & inte; if (ints & 0x1) { - nvkm_fifo_uevent(&priv->base); + nvkm_fifo_uevent(&priv->base, -1); ints &= ~1; } if (ints) { nv_error(priv, "ENGINE %d %d %01x", engn, unkn, ints); nv_mask(priv, 0x002628, ints, 0); } } } diff --git a/drm/nouveau/nvkm/engine/fifo/gk104.c b/drm/nouveau/nvkm/engine/fifo/gk104.c index 53a464d..caecef1 100644 --- a/drm/nouveau/nvkm/engine/fifo/gk104.c +++ b/drm/nouveau/nvkm/engine/fifo/gk104.c @@ -908,16 +908,18 @@ gk104_fifo_intr_fault(struct gk104_fifo_priv *priv, int unit) object = engctx; while (object) { switch (nv_mclass(object)) { case KEPLER_CHANNEL_GPFIFO_A: case MAXWELL_CHANNEL_GPFIFO_A: nvkm_fifo_eevent(&priv->base, ((struct nvkm_fifo_chan*)object)->chid, NOUVEAU_GEM_CHANNEL_FIFO_ERROR_MMU_ERR_FLT); + nvkm_fifo_uevent(&priv->base, + ((struct nvkm_fifo_chan*)object)->chid); gk104_fifo_recover(priv, engine, (void *)object); break; } object = object->parent; } nvkm_engctx_put(engctx); } @@ -978,18 +980,21 @@ gk104_fifo_intr_pbdma_0(struct gk104_fifo_priv *priv, int unit) nv_error(priv, "PBDMA%d:", unit); nvkm_bitfield_print(gk104_fifo_pbdma_intr_0, show); pr_cont("\n"); nv_error(priv, "PBDMA%d: ch %d [%s] 
subc %d mthd 0x%04x data 0x%08x\n", unit, chid, nvkm_client_name_for_fifo_chid(&priv->base, chid), subc, mthd, data); + nvkm_fifo_eevent(&priv->base, chid, NOUVEAU_GEM_CHANNEL_PBDMA_ERROR); + + nvkm_fifo_uevent(&priv->base, chid); } nv_wr32(priv, 0x040108 + (unit * 0x2000), stat); } static const struct nvkm_bitfield gk104_fifo_pbdma_intr_1[] = { { 0x00000001, "HCE_RE_ILLEGAL_OP" }, { 0x00000002, "HCE_RE_ALIGNB" }, @@ -1030,17 +1035,17 @@ gk104_fifo_intr_runlist(struct gk104_fifo_priv *priv) nv_wr32(priv, 0x002a00, 1 << engn); mask &= ~(1 << engn); } } static void gk104_fifo_intr_engine(struct gk104_fifo_priv *priv) { - nvkm_fifo_uevent(&priv->base); + nvkm_fifo_uevent(&priv->base, -1); } static void gk104_fifo_intr(struct nvkm_subdev *subdev) { struct gk104_fifo_priv *priv = (void *)subdev; u32 mask = nv_rd32(priv, 0x002140); u32 stat = nv_rd32(priv, 0x002100) & mask; diff --git a/drm/nouveau/nvkm/engine/fifo/nv04.c b/drm/nouveau/nvkm/engine/fifo/nv04.c index 043e429..1749614 100644 --- a/drm/nouveau/nvkm/engine/fifo/nv04.c +++ b/drm/nouveau/nvkm/engine/fifo/nv04.c @@ -536,17 +536,17 @@ nv04_fifo_intr(struct nvkm_subdev *subdev) if (device->card_type == NV_50) { if (stat & 0x00000010) { stat &= ~0x00000010; nv_wr32(priv, 0x002100, 0x00000010); } if (stat & 0x40000000) { nv_wr32(priv, 0x002100, 0x40000000); - nvkm_fifo_uevent(&priv->base); + nvkm_fifo_uevent(&priv->base, -1); stat &= ~0x40000000; } } if (stat) { nv_warn(priv, "unknown intr 0x%08x\n", stat); nv_mask(priv, NV03_PFIFO_INTR_EN_0, stat, 0x00000000); nv_wr32(priv, NV03_PFIFO_INTR_0, stat); -- 2.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-tegra" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html