On Fri, 17 Apr 2020, Thomas Gleixner wrote:

> Dan Williams <dan.j.williams@xxxxxxxxx> writes:
>
> > The goal of naming it _inatomic() was specifically for the observation
> > that your driver coordinates atomic access and does not benefit from
> > the cache friendliness that non-temporal stores afford. That said
> > _inatomic() is arguably not a good choice either because that refers
> > to whether the copy is prepared to take a fault or not. What about
> > _exclusive() or _single()? Anything but _clflushopt() that conveys no
> > contextual information.

OK. I renamed it to memcpy_flushcache_single.

> > Other than quibbling with the name, and one more comment below, this
> > looks ok to me.
> >
> >> Index: linux-2.6/drivers/md/dm-writecache.c
> >> ===================================================================
> >> --- linux-2.6.orig/drivers/md/dm-writecache.c	2020-04-17 14:06:35.139999000 +0200
> >> +++ linux-2.6/drivers/md/dm-writecache.c	2020-04-17 14:06:35.129999000 +0200
> >> @@ -1166,7 +1166,10 @@ static void bio_copy_block(struct dm_wri
> >> 		}
> >> 	} else {
> >> 		flush_dcache_page(bio_page(bio));
> >> -		memcpy_flushcache(data, buf, size);
> >> +		if (likely(size > 512))
> >
> > This needs some reference to how this magic number is chosen and how a
> > future developer might determine whether the value needs to be
> > adjusted.
>
> I don't think it's a good idea to make this decision in generic code as
> architectures or even CPU models might have different constraints on the
> size.
>
> So I'd rather let the architecture implementation decide and make this
>
>	flush_dcache_page(bio_page(bio));
> -	memcpy_flushcache(data, buf, size);
> +	memcpy_flushcache_bikesheddedname(data, buf, size);
>
> and have the default fallback memcpy_flushcache() and let the
> architecture sort the size limit and the underlying technology out.
>
> So x86 can use clflushopt or implement it with movdir64b and any other
> architecture can provide their own magic soup without changing the
> callsite.
>
> Thanks,
>
>	tglx

OK - so I moved the decision into memcpy_flushcache_single and added a
comment that explains the magic number.

Mikulas


From: Mikulas Patocka <mpatocka@xxxxxxxxxx>

Implement the function memcpy_flushcache_single, which flushes the cache
just like memcpy_flushcache does - except that it uses cached writes and
explicit cache flushing instead of non-temporal stores.

Explicit cache flushing performs better in single-threaded cases (e.g. the
dm-writecache target with a block size greater than 512 bytes), while
non-temporal stores perform better in other cases (mostly multithreaded
workloads) - so we provide both functions, and the user should select
whichever one is faster for their particular workload.

dm-writecache throughput (on real Optane-based persistent memory):

block size     512        1024       2048       4096
movnti         496 MB/s   642 MB/s   725 MB/s   744 MB/s
clflushopt     373 MB/s   688 MB/s   1.1 GB/s   1.2 GB/s

Signed-off-by: Mikulas Patocka <mpatocka@xxxxxxxxxx>

---
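As an aside (an illustration only, not part of the patch below): a
pmem-backed driver whose writes are already serialized - the "coordinates
atomic access" case Dan mentions above - can simply call
memcpy_flushcache_single() and let the architecture pick clflushopt vs.
movnti. A driver that instead wants to leave the choice to whoever
benchmarks the workload could wrap both variants behind a knob. The helper
and the module parameter names in this sketch are made up for the example:

#include <linux/module.h>
#include <linux/string.h>

/* hypothetical knob; default is non-temporal stores (memcpy_flushcache) */
static bool prefer_cached_flush;
module_param(prefer_cached_flush, bool, 0444);

/* hypothetical helper: copy to persistent memory and flush it from the cache */
static void example_copy_to_pmem(void *pmem_dst, const void *src, size_t size)
{
	if (prefer_cached_flush)
		/* cached writes + explicit cache flushing */
		memcpy_flushcache_single(pmem_dst, src, size);
	else
		/* non-temporal stores */
		memcpy_flushcache(pmem_dst, src, size);
}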
 arch/x86/include/asm/string_64.h |   10 ++++++++
 arch/x86/lib/usercopy_64.c       |   46 +++++++++++++++++++++++++++++++++++++++
 drivers/md/dm-writecache.c       |    2 -
 include/linux/string.h           |    6 +++++
 4 files changed, 63 insertions(+), 1 deletion(-)

Index: linux-2.6/arch/x86/include/asm/string_64.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/string_64.h	2020-04-20 15:31:46.939999000 +0200
+++ linux-2.6/arch/x86/include/asm/string_64.h	2020-04-20 15:31:46.929999000 +0200
@@ -114,6 +114,14 @@ memcpy_mcsafe(void *dst, const void *src
 	return 0;
 }
 
+/*
+ * In some cases (mostly single-threaded workloads), clflushopt is faster
+ * than non-temporal stores. In other situations, non-temporal stores are
+ * faster. So, we provide two functions:
+ *	memcpy_flushcache - using non-temporal stores
+ *	memcpy_flushcache_single - using clflushopt
+ * The caller should test which one is faster for the particular workload.
+ */
 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
 #define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
 void __memcpy_flushcache(void *dst, const void *src, size_t cnt);
@@ -135,6 +143,8 @@ static __always_inline void memcpy_flush
 	}
 	__memcpy_flushcache(dst, src, cnt);
 }
+#define __HAVE_ARCH_MEMCPY_FLUSHCACHE_CLFLUSHOPT 1
+void memcpy_flushcache_single(void *dst, const void *src, size_t cnt);
 #endif
 
 #endif /* __KERNEL__ */
Index: linux-2.6/include/linux/string.h
===================================================================
--- linux-2.6.orig/include/linux/string.h	2020-04-20 15:31:46.939999000 +0200
+++ linux-2.6/include/linux/string.h	2020-04-20 15:31:46.929999000 +0200
@@ -175,6 +175,12 @@ static inline void memcpy_flushcache(voi
 	memcpy(dst, src, cnt);
 }
 #endif
+#ifndef __HAVE_ARCH_MEMCPY_FLUSHCACHE_CLFLUSHOPT
+static inline void memcpy_flushcache_single(void *dst, const void *src, size_t cnt)
+{
+	memcpy_flushcache(dst, src, cnt);
+}
+#endif
 void *memchr_inv(const void *s, int c, size_t n);
 char *strreplace(char *s, char old, char new);
 
Index: linux-2.6/arch/x86/lib/usercopy_64.c
===================================================================
--- linux-2.6.orig/arch/x86/lib/usercopy_64.c	2020-04-20 15:31:46.939999000 +0200
+++ linux-2.6/arch/x86/lib/usercopy_64.c	2020-04-20 15:38:13.159999000 +0200
@@ -199,6 +199,52 @@ void __memcpy_flushcache(void *_dst, con
 }
 EXPORT_SYMBOL_GPL(__memcpy_flushcache);
 
+void memcpy_flushcache_single(void *_dst, const void *_src, size_t size)
+{
+	unsigned long dest = (unsigned long)_dst;
+	unsigned long source = (unsigned long)_src;
+
+	/*
+	 * dm-writecache throughput (on real Optane-based persistent memory),
+	 * measured with dd:
+	 *
+	 * block size   512        1024       2048       4096
+	 * movnti       496 MB/s   642 MB/s   725 MB/s   744 MB/s
+	 * clflushopt   373 MB/s   688 MB/s   1.1 GB/s   1.2 GB/s
+	 *
+	 * We see that movnti performs better for 512-byte blocks, and
+	 * clflushopt performs better for 1024-byte and larger blocks. So, we
+	 * prefer clflushopt for sizes >= 768.
+	 */
+
+	if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) && likely(boot_cpu_data.x86_clflush_size == 64) &&
+	    likely(size >= 768)) {
+		if (unlikely(!IS_ALIGNED(dest, 64))) {
+			size_t len = min_t(size_t, size, ALIGN(dest, 64) - dest);
+
+			memcpy((void *)dest, (void *)source, len);
+			clflushopt((void *)dest);
+			dest += len;
+			source += len;
+			size -= len;
+		}
+		do {
+			memcpy((void *)dest, (void *)source, 64);
+			clflushopt((void *)dest);
+			dest += 64;
+			source += 64;
+			size -= 64;
+		} while (size >= 64);
+		if (unlikely(size != 0)) {
+			memcpy((void *)dest, (void *)source, size);
+			clflushopt((void *)dest);
+		}
+		return;
+	}
+	memcpy_flushcache((void *)dest, (void *)source, size);
+}
+EXPORT_SYMBOL_GPL(memcpy_flushcache_single);
+
 void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
 			    size_t len)
 {
Index: linux-2.6/drivers/md/dm-writecache.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-writecache.c	2020-04-20 15:31:46.939999000 +0200
+++ linux-2.6/drivers/md/dm-writecache.c	2020-04-20 15:32:35.549999000 +0200
@@ -1166,7 +1166,7 @@ static void bio_copy_block(struct dm_wri
 		}
 	} else {
 		flush_dcache_page(bio_page(bio));
-		memcpy_flushcache(data, buf, size);
+		memcpy_flushcache_single(data, buf, size);
 	}
 	bvec_kunmap_irq(buf, &flags);

--
dm-devel mailing list
dm-devel@xxxxxxxxxx
https://www.redhat.com/mailman/listinfo/dm-devel