Hello Robin, Christoph, Any further comment? John suggested that "depends on DEBUG_FS" should be added in Kconfig. I am collecting more comments to send v4 together with fixing this minor issue :-) Thanks Barry > -----Original Message----- > From: Song Bao Hua (Barry Song) > Sent: Monday, November 2, 2020 9:07 PM > To: iommu@xxxxxxxxxxxxxxxxxxxxxxxxxx; hch@xxxxxx; robin.murphy@xxxxxxx; > m.szyprowski@xxxxxxxxxxx > Cc: Linuxarm <linuxarm@xxxxxxxxxx>; linux-kselftest@xxxxxxxxxxxxxxx; xuwei > (O) <xuwei5@xxxxxxxxxx>; Song Bao Hua (Barry Song) > <song.bao.hua@xxxxxxxxxxxxx>; Joerg Roedel <joro@xxxxxxxxxx>; Will Deacon > <will@xxxxxxxxxx>; Shuah Khan <shuah@xxxxxxxxxx> > Subject: [PATCH v3 1/2] dma-mapping: add benchmark support for streaming > DMA APIs > > Nowadays, there are increasing requirements to benchmark the performance > of dma_map and dma_unmap particularly while the device is attached to an > IOMMU. > > This patch enables the support. Users can run specified number of threads to > do dma_map_page and dma_unmap_page on a specific NUMA node with the > specified duration. Then dma_map_benchmark will calculate the average > latency for map and unmap. > > A difficulty for this benchmark is that dma_map/unmap APIs must run on a > particular device. Each device might have different backend of IOMMU or > non-IOMMU. 
> > So we use the driver_override to bind dma_map_benchmark to a particular > device by: > For platform devices: > echo dma_map_benchmark > /sys/bus/platform/devices/xxx/driver_override > echo xxx > /sys/bus/platform/drivers/xxx/unbind > echo xxx > /sys/bus/platform/drivers/dma_map_benchmark/bind > > For PCI devices: > echo dma_map_benchmark > > /sys/bus/pci/devices/0000:00:01.0/driver_override > echo 0000:00:01.0 > /sys/bus/pci/drivers/xxx/unbind echo 0000:00:01.0 > > /sys/bus/pci/drivers/dma_map_benchmark/bind > > Cc: Joerg Roedel <joro@xxxxxxxxxx> > Cc: Will Deacon <will@xxxxxxxxxx> > Cc: Shuah Khan <shuah@xxxxxxxxxx> > Cc: Christoph Hellwig <hch@xxxxxx> > Cc: Marek Szyprowski <m.szyprowski@xxxxxxxxxxx> > Cc: Robin Murphy <robin.murphy@xxxxxxx> > Signed-off-by: Barry Song <song.bao.hua@xxxxxxxxxxxxx> > --- > -v3: > * fix build issues reported by 0day kernel test robot > -v2: > * add PCI support; v1 supported platform devices only > * replace ssleep by msleep_interruptible() to permit users to exit > benchmark before it is completed > * many changes according to Robin's suggestions, thanks! Robin > - add standard deviation output to reflect the worst case > - check users' parameters strictly like the number of threads > - make cache dirty before dma_map > - fix unpaired dma_map_page and dma_unmap_single; > - remove redundant "long long" before ktime_to_ns(); > - use devm_add_action() > > kernel/dma/Kconfig | 8 + > kernel/dma/Makefile | 1 + > kernel/dma/map_benchmark.c | 296 > +++++++++++++++++++++++++++++++++++++ > 3 files changed, 305 insertions(+) > create mode 100644 kernel/dma/map_benchmark.c > > diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index > c99de4a21458..949c53da5991 100644 > --- a/kernel/dma/Kconfig > +++ b/kernel/dma/Kconfig > @@ -225,3 +225,11 @@ config DMA_API_DEBUG_SG > is technically out-of-spec. > > If unsure, say N. 
> + > +config DMA_MAP_BENCHMARK > + bool "Enable benchmarking of streaming DMA mapping" > + help > + Provides /sys/kernel/debug/dma_map_benchmark that helps with > testing > + performance of dma_(un)map_page. > + > + See tools/testing/selftests/dma/dma_map_benchmark.c > diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index > dc755ab68aab..7aa6b26b1348 100644 > --- a/kernel/dma/Makefile > +++ b/kernel/dma/Makefile > @@ -10,3 +10,4 @@ obj-$(CONFIG_DMA_API_DEBUG) += debug.o > obj-$(CONFIG_SWIOTLB) += swiotlb.o > obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o > obj-$(CONFIG_DMA_REMAP) += remap.o > +obj-$(CONFIG_DMA_MAP_BENCHMARK) += map_benchmark.o > diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c > new file mode 100644 index 000000000000..dc4e5ff48a2d > --- /dev/null > +++ b/kernel/dma/map_benchmark.c > @@ -0,0 +1,296 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * Copyright (C) 2020 Hisilicon Limited. > + */ > + > +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt > + > +#include <linux/debugfs.h> > +#include <linux/delay.h> > +#include <linux/device.h> > +#include <linux/dma-mapping.h> > +#include <linux/kernel.h> > +#include <linux/kthread.h> > +#include <linux/math64.h> > +#include <linux/module.h> > +#include <linux/pci.h> > +#include <linux/platform_device.h> > +#include <linux/slab.h> > +#include <linux/timekeeping.h> > + > +#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark) > +#define DMA_MAP_MAX_THREADS 1024 > +#define DMA_MAP_MAX_SECONDS 300 > + > +struct map_benchmark { > + __u64 avg_map_100ns; /* average map latency in 100ns */ > + __u64 map_stddev; /* standard deviation of map latency */ > + __u64 avg_unmap_100ns; /* as above */ > + __u64 unmap_stddev; > + __u32 threads; /* how many threads will do map/unmap in parallel */ > + __u32 seconds; /* how long the test will last */ > + int node; /* which numa node this benchmark will run on */ > + __u64 expansion[10]; /* For future use */ > +}; > + > +struct 
map_benchmark_data { > + struct map_benchmark bparam; > + struct device *dev; > + struct dentry *debugfs; > + atomic64_t sum_map_100ns; > + atomic64_t sum_unmap_100ns; > + atomic64_t sum_square_map; > + atomic64_t sum_square_unmap; > + atomic64_t loops; > +}; > + > +static int map_benchmark_thread(void *data) { > + void *buf; > + dma_addr_t dma_addr; > + struct map_benchmark_data *map = data; > + int ret = 0; > + > + buf = (void *)__get_free_page(GFP_KERNEL); > + if (!buf) > + return -ENOMEM; > + > + while (!kthread_should_stop()) { > + __u64 map_100ns, unmap_100ns, map_square, unmap_square; > + ktime_t map_stime, map_etime, unmap_stime, unmap_etime; > + > + /* > + * for a non-coherent device, if we don't stain them in the cache, > + * this will give an underestimate of the real-world overhead of > + * BIDIRECTIONAL or TO_DEVICE mappings > + * 66 means evertything goes well! 66 is lucky. > + */ > + memset(buf, 0x66, PAGE_SIZE); > + > + map_stime = ktime_get(); > + dma_addr = dma_map_single(map->dev, buf, PAGE_SIZE, > DMA_BIDIRECTIONAL); > + if (unlikely(dma_mapping_error(map->dev, dma_addr))) { > + pr_err("dma_map_single failed on %s\n", > dev_name(map->dev)); > + ret = -ENOMEM; > + goto out; > + } > + map_etime = ktime_get(); > + > + unmap_stime = ktime_get(); > + dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, > DMA_BIDIRECTIONAL); > + unmap_etime = ktime_get(); > + > + /* calculate sum and sum of squares */ > + map_100ns = div64_ul(ktime_to_ns(ktime_sub(map_etime, > map_stime)), 100); > + unmap_100ns = div64_ul(ktime_to_ns(ktime_sub(unmap_etime, > unmap_stime)), 100); > + map_square = map_100ns * map_100ns; > + unmap_square = unmap_100ns * unmap_100ns; > + > + atomic64_add(map_100ns, &map->sum_map_100ns); > + atomic64_add(unmap_100ns, &map->sum_unmap_100ns); > + atomic64_add(map_square, &map->sum_square_map); > + atomic64_add(unmap_square, &map->sum_square_unmap); > + atomic64_inc(&map->loops); > + } > + > +out: > + free_page((unsigned long)buf); > + return ret; 
> +} > + > +static int do_map_benchmark(struct map_benchmark_data *map) { > + struct task_struct **tsk; > + int threads = map->bparam.threads; > + int node = map->bparam.node; > + const cpumask_t *cpu_mask = cpumask_of_node(node); > + __u64 loops; > + int ret = 0; > + int i; > + > + tsk = kmalloc_array(threads, sizeof(tsk), GFP_KERNEL); > + if (!tsk) > + return -ENOMEM; > + > + get_device(map->dev); > + > + for (i = 0; i < threads; i++) { > + tsk[i] = kthread_create_on_node(map_benchmark_thread, map, > + map->bparam.node, "dma-map-benchmark/%d", i); > + if (IS_ERR(tsk[i])) { > + pr_err("create dma_map thread failed\n"); > + ret = PTR_ERR(tsk[i]); > + goto out; > + } > + > + if (node != NUMA_NO_NODE && node_online(node)) > + kthread_bind_mask(tsk[i], cpu_mask); > + } > + > + /* clear the old value in the previous benchmark */ > + atomic64_set(&map->sum_map_100ns, 0); > + atomic64_set(&map->sum_unmap_100ns, 0); > + atomic64_set(&map->sum_square_map, 0); > + atomic64_set(&map->sum_square_unmap, 0); > + atomic64_set(&map->loops, 0); > + > + for (i = 0; i < threads; i++) > + wake_up_process(tsk[i]); > + > + msleep_interruptible(map->bparam.seconds * 1000); > + > + /* wait for the completion of benchmark threads */ > + for (i = 0; i < threads; i++) { > + ret = kthread_stop(tsk[i]); > + if (ret) > + goto out; > + } > + > + loops = atomic64_read(&map->loops); > + if (likely(loops > 0)) { > + __u64 map_variance, unmap_variance; > + > + /* average latency */ > + map->bparam.avg_map_100ns = > div64_u64(atomic64_read(&map->sum_map_100ns), loops); > + map->bparam.avg_unmap_100ns = > +div64_u64(atomic64_read(&map->sum_unmap_100ns), loops); > + > + /* standard deviation of latency */ > + map_variance = > div64_u64(atomic64_read(&map->sum_square_map), loops) - > + map->bparam.avg_map_100ns * > map->bparam.avg_map_100ns; > + unmap_variance = > div64_u64(atomic64_read(&map->sum_square_unmap), loops) - > + map->bparam.avg_unmap_100ns * > map->bparam.avg_unmap_100ns; > + 
map->bparam.map_stddev = int_sqrt64(map_variance); > + map->bparam.unmap_stddev = int_sqrt64(unmap_variance); > + } > + > +out: > + put_device(map->dev); > + kfree(tsk); > + return ret; > +} > + > +static long map_benchmark_ioctl(struct file *filep, unsigned int cmd, > + unsigned long arg) > +{ > + struct map_benchmark_data *map = filep->private_data; > + int ret; > + > + if (copy_from_user(&map->bparam, (void __user *)arg, > sizeof(map->bparam))) > + return -EFAULT; > + > + switch (cmd) { > + case DMA_MAP_BENCHMARK: > + if (map->bparam.threads == 0 || map->bparam.threads > > DMA_MAP_MAX_THREADS) { > + pr_err("invalid thread number\n"); > + return -EINVAL; > + } > + if (map->bparam.seconds == 0 || map->bparam.seconds > > DMA_MAP_MAX_SECONDS) { > + pr_err("invalid duration seconds\n"); > + return -EINVAL; > + } > + > + ret = do_map_benchmark(map); > + break; > + default: > + return -EINVAL; > + } > + > + if (copy_to_user((void __user *)arg, &map->bparam, > sizeof(map->bparam))) > + return -EFAULT; > + > + return ret; > +} > + > +static const struct file_operations map_benchmark_fops = { > + .open = simple_open, > + .unlocked_ioctl = map_benchmark_ioctl, }; > + > +static void map_benchmark_remove_debugfs(void *data) { > + struct map_benchmark_data *map = (struct map_benchmark_data *)data; > + > + debugfs_remove(map->debugfs); > +} > + > +static int __map_benchmark_probe(struct device *dev) { > + struct dentry *entry; > + struct map_benchmark_data *map; > + int ret; > + > + map = devm_kzalloc(dev, sizeof(*map), GFP_KERNEL); > + if (!map) > + return -ENOMEM; > + map->dev = dev; > + > + ret = devm_add_action(dev, map_benchmark_remove_debugfs, map); > + if (ret) { > + pr_err("Can't add debugfs remove action\n"); > + return ret; > + } > + > + /* > + * we only permit a device bound with this driver, 2nd probe > + * will fail > + */ > + entry = debugfs_create_file("dma_map_benchmark", 0600, NULL, map, > + &map_benchmark_fops); > + if (IS_ERR(entry)) > + return 
PTR_ERR(entry); > + map->debugfs = entry; > + > + return 0; > +} > + > +static int map_benchmark_platform_probe(struct platform_device *pdev) { > + return __map_benchmark_probe(&pdev->dev); > +} > + > +static struct platform_driver map_benchmark_platform_driver = { > + .driver = { > + .name = "dma_map_benchmark", > + }, > + .probe = map_benchmark_platform_probe, }; > + > +static int map_benchmark_pci_probe(struct pci_dev *pdev, const struct > +pci_device_id *id) { > + return __map_benchmark_probe(&pdev->dev); > +} > + > +static struct pci_driver map_benchmark_pci_driver = { > + .name = "dma_map_benchmark", > + .probe = map_benchmark_pci_probe, > +}; > + > +static int __init map_benchmark_init(void) { > + int ret; > + > + ret = pci_register_driver(&map_benchmark_pci_driver); > + if (ret) > + return ret; > + > + ret = platform_driver_register(&map_benchmark_platform_driver); > + if (ret) { > + pci_unregister_driver(&map_benchmark_pci_driver); > + return ret; > + } > + > + return 0; > +} > + > +static void __exit map_benchmark_cleanup(void) { > + platform_driver_unregister(&map_benchmark_platform_driver); > + pci_unregister_driver(&map_benchmark_pci_driver); > +} > + > +module_init(map_benchmark_init); > +module_exit(map_benchmark_cleanup); > + > +MODULE_AUTHOR("Barry Song <song.bao.hua@xxxxxxxxxxxxx>"); > +MODULE_DESCRIPTION("dma_map benchmark driver"); > MODULE_LICENSE("GPL"); > -- > 2.25.1