Hi Laurent, On 06/04/18 23:55, Laurent Pinchart wrote: > Hi Kieran, > > Thank you for the patch. > > On Thursday, 8 March 2018 02:05:27 EEST Kieran Bingham wrote: >> Adapt the dl->body0 object to use an object from the body pool. This >> greatly reduces the pressure on the TLB for IPMMU use cases, as all of >> the lists use a single allocation for the main body. >> >> The CLU and LUT objects pre-allocate a pool containing three bodies, >> allowing a userspace update before the hardware has committed a previous >> set of tables. >> >> Bodies are no longer 'freed' in interrupt context, but instead released >> back to their respective pools. This allows us to remove the garbage >> collector in the DLM. >> >> Signed-off-by: Kieran Bingham <kieran.bingham+renesas@xxxxxxxxxxxxxxxx> >> >> --- >> v3: >> - 's/fragment/body', 's/fragments/bodies/' >> - CLU/LUT now allocate 3 bodies >> - vsp1_dl_list_fragments_free -> vsp1_dl_list_bodies_put >> >> v2: >> - Use dl->body0->max_entries to determine header offset, instead of the >> global constant VSP1_DL_NUM_ENTRIES which is incorrect. >> - squash updates for LUT, CLU, and fragment cleanup into single patch. >> (Not fully bisectable when separated) >> >> drivers/media/platform/vsp1/vsp1_clu.c | 27 ++- >> drivers/media/platform/vsp1/vsp1_clu.h | 1 +- >> drivers/media/platform/vsp1/vsp1_dl.c | 223 ++++++-------------------- >> drivers/media/platform/vsp1/vsp1_dl.h | 3 +- >> drivers/media/platform/vsp1/vsp1_lut.c | 27 ++- >> drivers/media/platform/vsp1/vsp1_lut.h | 1 +- >> 6 files changed, 101 insertions(+), 181 deletions(-) > > Still a nice diffstart :-) > > [snip] > >> diff --git a/drivers/media/platform/vsp1/vsp1_dl.c >> b/drivers/media/platform/vsp1/vsp1_dl.c index 0208e72cb356..74476726451c >> 100644 >> --- a/drivers/media/platform/vsp1/vsp1_dl.c >> +++ b/drivers/media/platform/vsp1/vsp1_dl.c > > [snip] > >> @@ -399,11 +311,10 @@ void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 >> reg, u32 data) * Display List Transaction Management >> */ >> >> -static struct vsp1_dl_list *vsp1_dl_list_alloc(struct vsp1_dl_manager *dlm) >> +static struct vsp1_dl_list *vsp1_dl_list_alloc(struct vsp1_dl_manager >> *dlm, >> + struct vsp1_dl_body_pool *pool) > > Given that the only caller of this function passes dlm->pool as the second > argument, can't you remove the second argument ? Hrm ... I thought there was going to be a use case where the pool will be separated. But perhaps not. So yes - Removing. > >> { >> struct vsp1_dl_list *dl; >> - size_t header_size; >> - int ret; >> >> dl = kzalloc(sizeof(*dl), GFP_KERNEL); >> if (!dl) >> @@ -412,41 +323,39 @@ static struct vsp1_dl_list *vsp1_dl_list_alloc(struct >> vsp1_dl_manager *dlm) INIT_LIST_HEAD(&dl->bodies); >> dl->dlm = dlm; >> >> - /* >> - * Initialize the display list body and allocate DMA memory for the body >> - * and the optional header. Both are allocated together to avoid memory >> - * fragmentation, with the header located right after the body in >> - * memory. >> - */ >> - header_size = dlm->mode == VSP1_DL_MODE_HEADER >> - ? ALIGN(sizeof(struct vsp1_dl_header), 8) >> - : 0; >> - >> - ret = vsp1_dl_body_init(dlm->vsp1, &dl->body0, VSP1_DL_NUM_ENTRIES, >> - header_size); >> - if (ret < 0) { >> - kfree(dl); >> + /* Retrieve a body from our DLM body pool */ > > s/body pool/body pool./ > > (And I would have said "Get a body" but that's up to you) I think that's evident by the function name "vsp1_dl_body_get()", thus I've adapted this comment to be a bit more meaningful: /* Get a default body for our list. */ But I'm not opposed to dropping the comment. Also at somepoint, I think there's scope to remove the dl->body0 so it may not matter. > >> + dl->body0 = vsp1_dl_body_get(pool); >> + if (!dl->body0) >> return NULL; >> - } >> - >> if (dlm->mode == VSP1_DL_MODE_HEADER) { >> - size_t header_offset = VSP1_DL_NUM_ENTRIES >> - * sizeof(*dl->body0.entries); >> + size_t header_offset = dl->body0->max_entries >> + * sizeof(*dl->body0->entries); >> >> - dl->header = ((void *)dl->body0.entries) + header_offset; >> - dl->dma = dl->body0.dma + header_offset; >> + dl->header = ((void *)dl->body0->entries) + header_offset; >> + dl->dma = dl->body0->dma + header_offset; >> >> memset(dl->header, 0, sizeof(*dl->header)); >> - dl->header->lists[0].addr = dl->body0.dma; >> + dl->header->lists[0].addr = dl->body0->dma; >> } >> >> return dl; >> } >> >> +static void vsp1_dl_list_bodies_put(struct vsp1_dl_list *dl) >> +{ >> + struct vsp1_dl_body *dlb, *tmp; >> + >> + list_for_each_entry_safe(dlb, tmp, &dl->bodies, list) { >> + list_del(&dlb->list); >> + vsp1_dl_body_put(dlb); >> + } >> +} >> + >> static void vsp1_dl_list_free(struct vsp1_dl_list *dl) >> { >> - vsp1_dl_body_cleanup(&dl->body0); >> - list_splice_init(&dl->bodies, &dl->dlm->gc_bodies); >> + vsp1_dl_body_put(dl->body0); >> + vsp1_dl_list_bodies_put(dl); > > Too bad we can't keep the list splice, it's more efficient than iterating over > the list, but I suppose it's unavoidable if we want to reset the number of > used entries to 0 for each body. Beside, we should have a small number of > bodies only, so hopefully it won't be a big deal. Yes, plus reference counting needs to be tracked too ... so I think we need to keep this. > >> + >> kfree(dl); >> } >> >> @@ -500,18 +409,13 @@ static void __vsp1_dl_list_put(struct vsp1_dl_list >> *dl) >> >> dl->has_chain = false; >> >> + vsp1_dl_list_bodies_put(dl); >> + >> /* >> - * We can't free bodies here as DMA memory can only be freed in >> - * interruptible context. Move all bodies to the display list manager's >> - * list of bodies to be freed, they will be garbage-collected by the >> - * work queue. >> + * body0 is reused as as an optimisation as presently every display list >> + * has at least one body, thus we reinitialise the entries list > > s/entries list/entries list./ Done > >> */ >> - if (!list_empty(&dl->bodies)) { >> - list_splice_init(&dl->bodies, &dl->dlm->gc_bodies); >> - schedule_work(&dl->dlm->gc_work); >> - } > > We can certainly do this synchronously now that we don't need to free memory > anymore. I wonder however about the potential performance impact, as there's a > kfree() in vsp1_dl_list_free(). Yes, but ... > Do you think it could have a noticeable impact on the time spent with interrupts disabled ? I doubt it ... vsp1_dl_list_free() is only called from vsp1_dlm_destroy(), which is only called from vsp1_wpf_destroy(). That's the whole benefit of being able to remove the garbage collector. >> - >> - dl->body0.num_entries = 0; >> + dl->body0->num_entries = 0; >> >> list_add_tail(&dl->list, &dl->dlm->free); >> } >> @@ -548,7 +452,7 @@ void vsp1_dl_list_put(struct vsp1_dl_list *dl) >> */ >> void vsp1_dl_list_write(struct vsp1_dl_list *dl, u32 reg, u32 data) >> { >> - vsp1_dl_body_write(&dl->body0, reg, data); >> + vsp1_dl_body_write(dl->body0, reg, data); >> } >> >> /** >> @@ -561,8 +465,7 @@ void vsp1_dl_list_write(struct vsp1_dl_list *dl, u32 >> reg, u32 data) >> * in the order in which bodies are added. >> * >> * Adding a body to a display list passes ownership of the body to the >> list. The >> - * caller must not touch the body after this call, and must not free it >> - * explicitly with vsp1_dl_body_free(). > > Shouldn't we keep the last part of the sentence and adapt it ? Maybe something > like > > and must not release it explicitly with vsp1_dl_body_put(). > > I know that you introduce a reference count in the next patches that would > make this comment invalid, up to this patch it should be correct. When > introducing reference-counting you can update the comment to state that the > reference must be released. > Sure ... It's a very temporary statement :-) >> + * caller must not touch the body after this call. >> * >> * Additional bodies are only usable for display lists in header mode. >> * Attempting to add a body to a header-less display list will return an >> error. @@ -620,7 +523,7 @@ static void vsp1_dl_list_fill_header(struct >> vsp1_dl_list *dl, bool is_last) >> * list was allocated. >> */ >> >> - hdr->num_bytes = dl->body0.num_entries >> + hdr->num_bytes = dl->body0->num_entries >> * sizeof(*dl->header->lists); >> >> list_for_each_entry(dlb, &dl->bodies, list) { >> @@ -694,9 +597,9 @@ static void vsp1_dl_list_hw_enqueue(struct vsp1_dl_list >> *dl) * bit will be cleared by the hardware when the display list >> * processing starts. >> */ >> - vsp1_write(vsp1, VI6_DL_HDR_ADDR(0), dl->body0.dma); >> + vsp1_write(vsp1, VI6_DL_HDR_ADDR(0), dl->body0->dma); >> vsp1_write(vsp1, VI6_DL_BODY_SIZE, VI6_DL_BODY_SIZE_UPD | >> - (dl->body0.num_entries * sizeof(*dl->header->lists))); >> + (dl->body0->num_entries * sizeof(*dl->header->lists))); >> } else { >> /* >> * In header mode, program the display list header address. If >> @@ -879,45 +782,12 @@ void vsp1_dlm_reset(struct vsp1_dl_manager *dlm) >> dlm->pending = NULL; >> } >> >> -/* >> - * Free all bodies awaiting to be garbage-collected. >> - * >> - * This function must be called without the display list manager lock held. >> - */ >> -static void vsp1_dlm_bodies_free(struct vsp1_dl_manager *dlm) >> -{ >> - unsigned long flags; >> - >> - spin_lock_irqsave(&dlm->lock, flags); >> - >> - while (!list_empty(&dlm->gc_bodies)) { >> - struct vsp1_dl_body *dlb; >> - >> - dlb = list_first_entry(&dlm->gc_bodies, struct vsp1_dl_body, >> - list); >> - list_del(&dlb->list); >> - >> - spin_unlock_irqrestore(&dlm->lock, flags); >> - vsp1_dl_body_free(dlb); >> - spin_lock_irqsave(&dlm->lock, flags); >> - } >> - >> - spin_unlock_irqrestore(&dlm->lock, flags); >> -} >> - >> -static void vsp1_dlm_garbage_collect(struct work_struct *work) >> -{ >> - struct vsp1_dl_manager *dlm = >> - container_of(work, struct vsp1_dl_manager, gc_work); >> - >> - vsp1_dlm_bodies_free(dlm); >> -} >> - >> struct vsp1_dl_manager *vsp1_dlm_create(struct vsp1_device *vsp1, >> unsigned int index, >> unsigned int prealloc) >> { >> struct vsp1_dl_manager *dlm; >> + size_t header_size; >> unsigned int i; >> >> dlm = devm_kzalloc(vsp1->dev, sizeof(*dlm), GFP_KERNEL); >> @@ -932,13 +802,26 @@ struct vsp1_dl_manager *vsp1_dlm_create(struct >> vsp1_device *vsp1, >> >> spin_lock_init(&dlm->lock); >> INIT_LIST_HEAD(&dlm->free); >> - INIT_LIST_HEAD(&dlm->gc_bodies); >> - INIT_WORK(&dlm->gc_work, vsp1_dlm_garbage_collect); >> + >> + /* >> + * Initialize the display list body and allocate DMA memory for the body >> + * and the optional header. Both are allocated together to avoid memory >> + * fragmentation, with the header located right after the body in >> + * memory. >> + */ >> + header_size = dlm->mode == VSP1_DL_MODE_HEADER >> + ? ALIGN(sizeof(struct vsp1_dl_header), 8) >> + : 0; >> + >> + dlm->pool = vsp1_dl_body_pool_create(vsp1, prealloc, >> + VSP1_DL_NUM_ENTRIES, header_size); >> + if (!dlm->pool) >> + return NULL; >> >> for (i = 0; i < prealloc; ++i) { >> struct vsp1_dl_list *dl; >> >> - dl = vsp1_dl_list_alloc(dlm); >> + dl = vsp1_dl_list_alloc(dlm, dlm->pool); >> if (!dl) >> return NULL; >> >> @@ -955,12 +838,10 @@ void vsp1_dlm_destroy(struct vsp1_dl_manager *dlm) >> if (!dlm) >> return; >> >> - cancel_work_sync(&dlm->gc_work); >> - >> list_for_each_entry_safe(dl, next, &dlm->free, list) { >> list_del(&dl->list); >> vsp1_dl_list_free(dl); >> } >> >> - vsp1_dlm_bodies_free(dlm); >> + vsp1_dl_body_pool_destroy(dlm->pool); >> } > > [snip] >