mzoran@xxxxxxxxxxxx writes: > From: Michael Zoran <mzoran@xxxxxxxxxxxx> > > The original arm implementation uses dmac_map_area which is not > portable. Replace it with an architecture neutral version > which uses dma_map_sg. > > As you can see that for larger page sizes, the dma_map_sg > implementation is faster then the original unportable dma_map_area > implementation. > > Test dmac_map_area dma_map_page dma_map_sg > vchiq_test -b 4 10000 51us/iter 76us/iter 76us > vchiq_test -b 8 10000 70us/iter 82us/iter 91us > vchiq_test -b 16 10000 94us/iter 118us/iter 121us > vchiq_test -b 32 10000 146us/iter 173us/iter 187us > vchiq_test -b 64 10000 263us/iter 328us/iter 299us > vchiq_test -b 128 10000 529us/iter 631us/iter 595us > vchiq_test -b 256 10000 2285us/iter 2275us/iter 2001us > vchiq_test -b 512 10000 4372us/iter 4616us/iter 4123us > > For message sizes >= 64KB, dma_map_sg is faster then dma_map_page. > > For message size >= 256KB, the dma_map_sg is the fastest > implementation. > > Signed-off-by: Michael Zoran <mzoran@xxxxxxxxxxxx> > --- > .../interface/vchiq_arm/vchiq_2835_arm.c | 153 ++++++++++++--------- > 1 file changed, 91 insertions(+), 62 deletions(-) > > diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c > index 98c6819..5ca66ee 100644 > --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c > +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c > @@ -45,13 +45,8 @@ > #include <asm/pgtable.h> > #include <soc/bcm2835/raspberrypi-firmware.h> > > -#define dmac_map_area __glue(_CACHE,_dma_map_area) > -#define dmac_unmap_area __glue(_CACHE,_dma_unmap_area) > - > #define TOTAL_SLOTS (VCHIQ_SLOT_ZERO_SLOTS + 2 * 32) > > -#define VCHIQ_ARM_ADDRESS(x) ((void *)((char *)x + g_virt_to_bus_offset)) > - > #include "vchiq_arm.h" > #include "vchiq_2835.h" > #include "vchiq_connected.h" > @@ -73,7 +68,7 @@ static unsigned int g_fragments_size; > static char *g_fragments_base; > static char *g_free_fragments; > static struct semaphore g_free_fragments_sema; > -static unsigned long g_virt_to_bus_offset; > +static struct device *g_dev; > > extern int vchiq_arm_log_level; > > @@ -84,10 +79,11 @@ vchiq_doorbell_irq(int irq, void *dev_id); > > static int > create_pagelist(char __user *buf, size_t count, unsigned short type, > - struct task_struct *task, PAGELIST_T ** ppagelist); > + struct task_struct *task, PAGELIST_T **ppagelist, > + dma_addr_t *dma_addr); > > static void > -free_pagelist(PAGELIST_T *pagelist, int actual); > +free_pagelist(dma_addr_t dma_addr, PAGELIST_T *pagelist, int actual); > > int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state) > { > @@ -101,8 +97,6 @@ int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state) > int slot_mem_size, frag_mem_size; > int err, irq, i; > > - g_virt_to_bus_offset = virt_to_dma(dev, (void *)0); > - > (void)of_property_read_u32(dev->of_node, "cache-line-size", > &g_cache_line_size); > g_fragments_size = 2 * g_cache_line_size; > @@ -118,7 +112,7 @@ int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state) > return -ENOMEM; > } > > - WARN_ON(((int)slot_mem & (PAGE_SIZE - 1)) != 0); > + WARN_ON(((unsigned long)slot_mem & (PAGE_SIZE - 1)) != 0); > > vchiq_slot_zero = vchiq_init_slots(slot_mem, slot_mem_size); > if (!vchiq_slot_zero) > @@ -170,6 +164,7 @@ int vchiq_platform_init(struct platform_device *pdev, VCHIQ_STATE_T *state) > return err ? : -ENXIO; > } > > + g_dev = dev; > vchiq_log_info(vchiq_arm_log_level, > "vchiq_init - done (slots %pK, phys %pad)", > vchiq_slot_zero, &slot_phys); > @@ -233,6 +228,7 @@ vchiq_prepare_bulk_data(VCHIQ_BULK_T *bulk, VCHI_MEM_HANDLE_T memhandle, > { > PAGELIST_T *pagelist; > int ret; > + dma_addr_t dma_addr; > > WARN_ON(memhandle != VCHI_MEM_HANDLE_INVALID); > > @@ -241,12 +237,14 @@ vchiq_prepare_bulk_data(VCHIQ_BULK_T *bulk, VCHI_MEM_HANDLE_T memhandle, > ? PAGELIST_READ > : PAGELIST_WRITE, > current, > - &pagelist); > + &pagelist, > + &dma_addr); > + > if (ret != 0) > return VCHIQ_ERROR; > > bulk->handle = memhandle; > - bulk->data = VCHIQ_ARM_ADDRESS(pagelist); > + bulk->data = (void *)(unsigned long)dma_addr; > > /* Store the pagelist address in remote_data, which isn't used by the > slave. */ > @@ -259,7 +257,8 @@ void > vchiq_complete_bulk(VCHIQ_BULK_T *bulk) > { > if (bulk && bulk->remote_data && bulk->actual) > - free_pagelist((PAGELIST_T *)bulk->remote_data, bulk->actual); > + free_pagelist((dma_addr_t)(unsigned long)bulk->data, > + (PAGELIST_T *)bulk->remote_data, bulk->actual); > } > > void > @@ -353,38 +352,44 @@ vchiq_doorbell_irq(int irq, void *dev_id) > ** obscuring the new data underneath. We can solve this by transferring the > ** partial cache lines separately, and allowing the ARM to copy into the > ** cached area. > - > -** N.B. This implementation plays slightly fast and loose with the Linux > -** driver programming rules, e.g. its use of dmac_map_area instead of > -** dma_map_single, but it isn't a multi-platform driver and it benefits > -** from increased speed as a result. > */ > > static int > create_pagelist(char __user *buf, size_t count, unsigned short type, > - struct task_struct *task, PAGELIST_T ** ppagelist) > + struct task_struct *task, PAGELIST_T **ppagelist, > + dma_addr_t *dma_addr) > { > PAGELIST_T *pagelist; > struct page **pages; > - unsigned long *addrs; > - unsigned int num_pages, offset, i; > - char *addr, *base_addr, *next_addr; > - int run, addridx, actual_pages; > + u32 *addrs; > + unsigned int num_pages, offset, i, j, k; > + int actual_pages; > unsigned long *need_release; > + size_t pagelist_size; > + struct scatterlist *scatterlist; > + int dma_buffers; > + int dir; > > - offset = (unsigned int)buf & (PAGE_SIZE - 1); > + offset = ((unsigned int)(unsigned long)buf & (PAGE_SIZE - 1)); > num_pages = (count + offset + PAGE_SIZE - 1) / PAGE_SIZE; > > + pagelist_size = sizeof(PAGELIST_T) + > + (num_pages * sizeof(unsigned long)) + > + sizeof(unsigned long) + > + (num_pages * sizeof(pages[0]) + > + (num_pages * sizeof(struct scatterlist))); > + > *ppagelist = NULL; > > + dir = (type == PAGELIST_WRITE) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; > + > /* Allocate enough storage to hold the page pointers and the page > ** list > */ > - pagelist = kmalloc(sizeof(PAGELIST_T) + > - (num_pages * sizeof(unsigned long)) + > - sizeof(unsigned long) + > - (num_pages * sizeof(pages[0])), > - GFP_KERNEL); > + pagelist = dma_zalloc_coherent(g_dev, > + pagelist_size, > + dma_addr, > + GFP_KERNEL); > > vchiq_log_trace(vchiq_arm_log_level, "create_pagelist - %pK", > pagelist); > @@ -394,10 +399,9 @@ create_pagelist(char __user *buf, size_t count, unsigned short type, > addrs = pagelist->addrs; > need_release = (unsigned long *)(addrs + num_pages); > pages = (struct page **)(addrs + num_pages + 1); > + scatterlist = (struct scatterlist *)(pages + num_pages); > > if (is_vmalloc_addr(buf)) { > - int dir = (type == PAGELIST_WRITE) ? > - DMA_TO_DEVICE : DMA_FROM_DEVICE; > unsigned long length = count; > unsigned int off = offset; > > @@ -410,7 +414,6 @@ create_pagelist(char __user *buf, size_t count, unsigned short type, > if (bytes > length) > bytes = length; > pages[actual_pages] = pg; > - dmac_map_area(page_address(pg) + off, bytes, dir); > length -= bytes; > off = 0; > } > @@ -438,7 +441,8 @@ create_pagelist(char __user *buf, size_t count, unsigned short type, > actual_pages--; > put_page(pages[actual_pages]); > } > - kfree(pagelist); > + dma_free_coherent(g_dev, pagelist_size, > + pagelist, *dma_addr); > if (actual_pages == 0) > actual_pages = -ENOMEM; > return actual_pages; > @@ -450,29 +454,38 @@ create_pagelist(char __user *buf, size_t count, unsigned short type, > pagelist->type = type; > pagelist->offset = offset; > > - /* Group the pages into runs of contiguous pages */ > - > - base_addr = VCHIQ_ARM_ADDRESS(page_address(pages[0])); > - next_addr = base_addr + PAGE_SIZE; > - addridx = 0; > - run = 0; > - > - for (i = 1; i < num_pages; i++) { > - addr = VCHIQ_ARM_ADDRESS(page_address(pages[i])); > - if ((addr == next_addr) && (run < (PAGE_SIZE - 1))) { > - next_addr += PAGE_SIZE; > - run++; > - } else { > - addrs[addridx] = (unsigned long)base_addr + run; > - addridx++; > - base_addr = addr; > - next_addr = addr + PAGE_SIZE; > - run = 0; > - } > + for (i = 0; i < num_pages; i++) > + sg_set_page(scatterlist + i, pages[i], PAGE_SIZE, 0); > + > + dma_buffers = dma_map_sg(g_dev, > + scatterlist, > + num_pages, > + dir); > + > + if (dma_buffers == 0) { > + dma_free_coherent(g_dev, pagelist_size, > + pagelist, *dma_addr); > + > + return -EINTR; > } > > - addrs[addridx] = (unsigned long)base_addr + run; > - addridx++; > + k = 0; > + for (i = 0; i < dma_buffers;) { > + u32 section_length = 0; > + > + for (j = i + 1; j < dma_buffers; j++) { > + if (sg_dma_address(scatterlist + j) != > + sg_dma_address(scatterlist + j - 1) + PAGE_SIZE) { > + break; > + } > + section_length++; > + } > + > + addrs[k] = (u32)sg_dma_address(scatterlist + i) | > + section_length; It looks like scatterlist may not be just an array, so I think this whole loop wants to be something like: for_each_sg(scatterlist, sg, num_pages, i) { u32 len = sg_dma_len(sg); u32 addr = sg_dma_address(sg); if (k > 0 && addrs[k - 1] & PAGE_MASK + (addrs[k - 1] & ~PAGE_MASK) << PAGE_SHIFT) == addr) { addrs[k - 1] += len; } else { addrs[k++] = addr | len; } } Note the use of sg_dma_len(), since sg entries may be more than one page. I don't think any merging is actually happening on our platform currently, thus why I left the inner loop. Thanks for taking on doing this conversion! This code's going to be so much nicer when you're done.
Attachment:
signature.asc
Description: PGP signature
_______________________________________________ devel mailing list devel@xxxxxxxxxxxxxxxxxxxxxx http://driverdev.linuxdriverproject.org/mailman/listinfo/driverdev-devel