Hi Kieran, Thank you for the patch. On Thursday, 8 March 2018 02:05:27 EEST Kieran Bingham wrote: > Adapt the dl->body0 object to use an object from the body pool. This > greatly reduces the pressure on the TLB for IPMMU use cases, as all of > the lists use a single allocation for the main body. > > The CLU and LUT objects pre-allocate a pool containing three bodies, > allowing a userspace update before the hardware has committed a previous > set of tables. > > Bodies are no longer 'freed' in interrupt context, but instead released > back to their respective pools. This allows us to remove the garbage > collector in the DLM. > > Signed-off-by: Kieran Bingham <kieran.bingham+renesas@xxxxxxxxxxxxxxxx> > > --- > v3: > - 's/fragment/body', 's/fragments/bodies/' > - CLU/LUT now allocate 3 bodies > - vsp1_dl_list_fragments_free -> vsp1_dl_list_bodies_put > > v2: > - Use dl->body0->max_entries to determine header offset, instead of the > global constant VSP1_DL_NUM_ENTRIES which is incorrect. > - squash updates for LUT, CLU, and fragment cleanup into single patch. > (Not fully bisectable when separated) > > drivers/media/platform/vsp1/vsp1_clu.c | 27 ++- > drivers/media/platform/vsp1/vsp1_clu.h | 1 +- > drivers/media/platform/vsp1/vsp1_dl.c | 223 ++++++-------------------- > drivers/media/platform/vsp1/vsp1_dl.h | 3 +- > drivers/media/platform/vsp1/vsp1_lut.c | 27 ++- > drivers/media/platform/vsp1/vsp1_lut.h | 1 +- > 6 files changed, 101 insertions(+), 181 deletions(-) Still a nice diffstart :-) [snip] > diff --git a/drivers/media/platform/vsp1/vsp1_dl.c > b/drivers/media/platform/vsp1/vsp1_dl.c index 0208e72cb356..74476726451c > 100644 > --- a/drivers/media/platform/vsp1/vsp1_dl.c > +++ b/drivers/media/platform/vsp1/vsp1_dl.c [snip] > @@ -399,11 +311,10 @@ void vsp1_dl_body_write(struct vsp1_dl_body *dlb, u32 > reg, u32 data) * Display List Transaction Management > */ > > -static struct vsp1_dl_list *vsp1_dl_list_alloc(struct vsp1_dl_manager *dlm) > +static struct vsp1_dl_list *vsp1_dl_list_alloc(struct vsp1_dl_manager > *dlm, > + struct vsp1_dl_body_pool *pool) Given that the only caller of this function passes dlm->pool as the second argument, can't you remove the second argument ? > { > struct vsp1_dl_list *dl; > - size_t header_size; > - int ret; > > dl = kzalloc(sizeof(*dl), GFP_KERNEL); > if (!dl) > @@ -412,41 +323,39 @@ static struct vsp1_dl_list *vsp1_dl_list_alloc(struct > vsp1_dl_manager *dlm) INIT_LIST_HEAD(&dl->bodies); > dl->dlm = dlm; > > - /* > - * Initialize the display list body and allocate DMA memory for the body > - * and the optional header. Both are allocated together to avoid memory > - * fragmentation, with the header located right after the body in > - * memory. > - */ > - header_size = dlm->mode == VSP1_DL_MODE_HEADER > - ? ALIGN(sizeof(struct vsp1_dl_header), 8) > - : 0; > - > - ret = vsp1_dl_body_init(dlm->vsp1, &dl->body0, VSP1_DL_NUM_ENTRIES, > - header_size); > - if (ret < 0) { > - kfree(dl); > + /* Retrieve a body from our DLM body pool */ s/body pool/body pool./ (And I would have said "Get a body" but that's up to you) > + dl->body0 = vsp1_dl_body_get(pool); > + if (!dl->body0) > return NULL; > - } > - > if (dlm->mode == VSP1_DL_MODE_HEADER) { > - size_t header_offset = VSP1_DL_NUM_ENTRIES > - * sizeof(*dl->body0.entries); > + size_t header_offset = dl->body0->max_entries > + * sizeof(*dl->body0->entries); > > - dl->header = ((void *)dl->body0.entries) + header_offset; > - dl->dma = dl->body0.dma + header_offset; > + dl->header = ((void *)dl->body0->entries) + header_offset; > + dl->dma = dl->body0->dma + header_offset; > > memset(dl->header, 0, sizeof(*dl->header)); > - dl->header->lists[0].addr = dl->body0.dma; > + dl->header->lists[0].addr = dl->body0->dma; > } > > return dl; > } > > +static void vsp1_dl_list_bodies_put(struct vsp1_dl_list *dl) > +{ > + struct vsp1_dl_body *dlb, *tmp; > + > + list_for_each_entry_safe(dlb, tmp, &dl->bodies, list) { > + list_del(&dlb->list); > + vsp1_dl_body_put(dlb); > + } > +} > + > static void vsp1_dl_list_free(struct vsp1_dl_list *dl) > { > - vsp1_dl_body_cleanup(&dl->body0); > - list_splice_init(&dl->bodies, &dl->dlm->gc_bodies); > + vsp1_dl_body_put(dl->body0); > + vsp1_dl_list_bodies_put(dl); Too bad we can't keep the list splice, it's more efficient than iterating over the list, but I suppose it's unavoidable if we want to reset the number of used entries to 0 for each body. Beside, we should have a small number of bodies only, so hopefully it won't be a big deal. > + > kfree(dl); > } > > @@ -500,18 +409,13 @@ static void __vsp1_dl_list_put(struct vsp1_dl_list > *dl) > > dl->has_chain = false; > > + vsp1_dl_list_bodies_put(dl); > + > /* > - * We can't free bodies here as DMA memory can only be freed in > - * interruptible context. Move all bodies to the display list manager's > - * list of bodies to be freed, they will be garbage-collected by the > - * work queue. > + * body0 is reused as as an optimisation as presently every display list > + * has at least one body, thus we reinitialise the entries list s/entries list/entries list./ > */ > - if (!list_empty(&dl->bodies)) { > - list_splice_init(&dl->bodies, &dl->dlm->gc_bodies); > - schedule_work(&dl->dlm->gc_work); > - } We can certainly do this synchronously now that we don't need to free memory anymore. I wonder however about the potential performance impact, as there's a kfree() in vsp1_dl_list_free(). Do you think it could have a noticeable impact on the time spent with interrupts disabled ? > - > - dl->body0.num_entries = 0; > + dl->body0->num_entries = 0; > > list_add_tail(&dl->list, &dl->dlm->free); > } > @@ -548,7 +452,7 @@ void vsp1_dl_list_put(struct vsp1_dl_list *dl) > */ > void vsp1_dl_list_write(struct vsp1_dl_list *dl, u32 reg, u32 data) > { > - vsp1_dl_body_write(&dl->body0, reg, data); > + vsp1_dl_body_write(dl->body0, reg, data); > } > > /** > @@ -561,8 +465,7 @@ void vsp1_dl_list_write(struct vsp1_dl_list *dl, u32 > reg, u32 data) > * in the order in which bodies are added. > * > * Adding a body to a display list passes ownership of the body to the > list. The > - * caller must not touch the body after this call, and must not free it > - * explicitly with vsp1_dl_body_free(). Shouldn't we keep the last part of the sentence and adapt it ? Maybe something like and must not release it explicitly with vsp1_dl_body_put(). I know that you introduce a reference count in the next patches that would make this comment invalid, up to this patch it should be correct. When introducing reference-counting you can update the comment to state that the reference must be released. > + * caller must not touch the body after this call. > * > * Additional bodies are only usable for display lists in header mode. > * Attempting to add a body to a header-less display list will return an > error. @@ -620,7 +523,7 @@ static void vsp1_dl_list_fill_header(struct > vsp1_dl_list *dl, bool is_last) > * list was allocated. > */ > > - hdr->num_bytes = dl->body0.num_entries > + hdr->num_bytes = dl->body0->num_entries > * sizeof(*dl->header->lists); > > list_for_each_entry(dlb, &dl->bodies, list) { > @@ -694,9 +597,9 @@ static void vsp1_dl_list_hw_enqueue(struct vsp1_dl_list > *dl) * bit will be cleared by the hardware when the display list > * processing starts. > */ > - vsp1_write(vsp1, VI6_DL_HDR_ADDR(0), dl->body0.dma); > + vsp1_write(vsp1, VI6_DL_HDR_ADDR(0), dl->body0->dma); > vsp1_write(vsp1, VI6_DL_BODY_SIZE, VI6_DL_BODY_SIZE_UPD | > - (dl->body0.num_entries * sizeof(*dl->header->lists))); > + (dl->body0->num_entries * sizeof(*dl->header->lists))); > } else { > /* > * In header mode, program the display list header address. If > @@ -879,45 +782,12 @@ void vsp1_dlm_reset(struct vsp1_dl_manager *dlm) > dlm->pending = NULL; > } > > -/* > - * Free all bodies awaiting to be garbage-collected. > - * > - * This function must be called without the display list manager lock held. > - */ > -static void vsp1_dlm_bodies_free(struct vsp1_dl_manager *dlm) > -{ > - unsigned long flags; > - > - spin_lock_irqsave(&dlm->lock, flags); > - > - while (!list_empty(&dlm->gc_bodies)) { > - struct vsp1_dl_body *dlb; > - > - dlb = list_first_entry(&dlm->gc_bodies, struct vsp1_dl_body, > - list); > - list_del(&dlb->list); > - > - spin_unlock_irqrestore(&dlm->lock, flags); > - vsp1_dl_body_free(dlb); > - spin_lock_irqsave(&dlm->lock, flags); > - } > - > - spin_unlock_irqrestore(&dlm->lock, flags); > -} > - > -static void vsp1_dlm_garbage_collect(struct work_struct *work) > -{ > - struct vsp1_dl_manager *dlm = > - container_of(work, struct vsp1_dl_manager, gc_work); > - > - vsp1_dlm_bodies_free(dlm); > -} > - > struct vsp1_dl_manager *vsp1_dlm_create(struct vsp1_device *vsp1, > unsigned int index, > unsigned int prealloc) > { > struct vsp1_dl_manager *dlm; > + size_t header_size; > unsigned int i; > > dlm = devm_kzalloc(vsp1->dev, sizeof(*dlm), GFP_KERNEL); > @@ -932,13 +802,26 @@ struct vsp1_dl_manager *vsp1_dlm_create(struct > vsp1_device *vsp1, > > spin_lock_init(&dlm->lock); > INIT_LIST_HEAD(&dlm->free); > - INIT_LIST_HEAD(&dlm->gc_bodies); > - INIT_WORK(&dlm->gc_work, vsp1_dlm_garbage_collect); > + > + /* > + * Initialize the display list body and allocate DMA memory for the body > + * and the optional header. Both are allocated together to avoid memory > + * fragmentation, with the header located right after the body in > + * memory. > + */ > + header_size = dlm->mode == VSP1_DL_MODE_HEADER > + ? ALIGN(sizeof(struct vsp1_dl_header), 8) > + : 0; > + > + dlm->pool = vsp1_dl_body_pool_create(vsp1, prealloc, > + VSP1_DL_NUM_ENTRIES, header_size); > + if (!dlm->pool) > + return NULL; > > for (i = 0; i < prealloc; ++i) { > struct vsp1_dl_list *dl; > > - dl = vsp1_dl_list_alloc(dlm); > + dl = vsp1_dl_list_alloc(dlm, dlm->pool); > if (!dl) > return NULL; > > @@ -955,12 +838,10 @@ void vsp1_dlm_destroy(struct vsp1_dl_manager *dlm) > if (!dlm) > return; > > - cancel_work_sync(&dlm->gc_work); > - > list_for_each_entry_safe(dl, next, &dlm->free, list) { > list_del(&dl->list); > vsp1_dl_list_free(dl); > } > > - vsp1_dlm_bodies_free(dlm); > + vsp1_dl_body_pool_destroy(dlm->pool); > } [snip] -- Regards, Laurent Pinchart