Re: [PATCH rdma-core 1/5] util: Add common mmio macros

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 4/14/2017 1:38 AM, Jason Gunthorpe wrote:
The macros are a mashup of the Mellanox driver macros:
 - An alternate implementation for S390 is provided
 - The ia32 XMM based 64 bit store emulation is switched
   in if supported
 - The macros are 'relaxed' ordering, so no implicit barrier
   overheads
 - If UDMA is not available then MMIO macros are also switched off,
   this is because the 32 bit emulation relies on the udma barriers.

Signed-off-by: Jason Gunthorpe <jgunthorpe@xxxxxxxxxxxxxxxxxxxx>
---
 CMakeLists.txt                |  16 ++++
 buildlib/config.h.in          |   2 +
 buildlib/rdma_functions.cmake |   4 +-
 util/CMakeLists.txt           |  20 +++-
 util/dummy.c                  |   0
 util/mmio.c                   |  83 ++++++++++++++++
 util/mmio.h                   | 214 ++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 336 insertions(+), 3 deletions(-)
 create mode 100644 util/dummy.c
 create mode 100644 util/mmio.c
 create mode 100644 util/mmio.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1ff3189c9d295e..5552872e8ca707 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -181,6 +181,19 @@ set(NO_STRICT_ALIASING_FLAGS "")
 RDMA_AddOptCFlag(NO_STRICT_ALIASING_FLAGS HAVE_NO_STRICT_ALIASING
   "-fno-strict-aliasing")

+CHECK_C_SOURCE_COMPILES("
+ #include <unistd.h>
+
+ void entry(void);
+
+ static void do_entry(void) {}
+ void entry(void) __attribute__((ifunc(\"resolve_entry\")));
+ static void *resolve_entry(void) {return &do_entry;}
+
+ int main(int argc,const char *argv[]) { entry(); }"
+  HAVE_FUNC_ATTRIBUTE_IFUNC
+  FAIL_REGEX "warning")
+
 # The code does not do the racy fcntl if the various CLOEXEC's are not
 # supported so it really doesn't work right if this isn't available. Thus hard
 # require it.
@@ -418,6 +431,9 @@ message(STATUS "Missing Optional Items:")
 if (NOT HAVE_FUNC_ATTRIBUTE_ALWAYS_INLINE)
   message(STATUS " Compiler attribute always_inline NOT supported")
 endif()
+if (NOT HAVE_FUNC_ATTRIBUTE_IFUNC)
+  message(STATUS " Compiler attribute ifunc NOT supported")
+endif()
 if (NOT HAVE_COHERENT_DMA)
   message(STATUS " Architecture NOT able to do coherent DMA (check util/udma_barrier.h) some providers disabled!")
 endif()
diff --git a/buildlib/config.h.in b/buildlib/config.h.in
index e4e7b9ee70c857..2eb16be764630e 100644
--- a/buildlib/config.h.in
+++ b/buildlib/config.h.in
@@ -28,6 +28,8 @@
 // FIXME This has been supported in compilers forever, we should just fail to build on such old systems.
 #cmakedefine HAVE_FUNC_ATTRIBUTE_ALWAYS_INLINE 1

+#cmakedefine HAVE_FUNC_ATTRIBUTE_IFUNC 1
+
 #cmakedefine HAVE_WORKING_IF_H 1

 @SIZEOF_LONG_CODE@
diff --git a/buildlib/rdma_functions.cmake b/buildlib/rdma_functions.cmake
index 01997b51468f55..936e1b256816ef 100644
--- a/buildlib/rdma_functions.cmake
+++ b/buildlib/rdma_functions.cmake
@@ -6,8 +6,8 @@
 # Global list of pairs of (SHARED STATIC) libary target names
 set(RDMA_STATIC_LIBS "" CACHE INTERNAL "Doc" FORCE)

-set(COMMON_LIBS_PIC ccan_pic)
-set(COMMON_LIBS ccan)
+set(COMMON_LIBS_PIC ccan_pic rdma_util_pic)
+set(COMMON_LIBS ccan rdma_util)

 # Create a symlink at filename DEST
 # If the directory containing DEST does not exist then it is created
diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt
index 71e33ac3baaa9b..581bf6822ec39f 100644
--- a/util/CMakeLists.txt
+++ b/util/CMakeLists.txt
@@ -1,5 +1,23 @@
 publish_internal_headers(util
   compiler.h
-  udma_barrier.h
   util.h
   )
+
+# The empty dummy.c is only needed so that cmake always has something to build
+# into the library.
+set(C_FILES dummy.c)
+
+if (HAVE_COHERENT_DMA)
+  publish_internal_headers(util
+    mmio.h
+    udma_barrier.h
+    )
+
+  set(C_FILES ${C_FILES}
+    mmio.c
+    )
+endif()
+
+add_library(rdma_util STATIC ${C_FILES})
+add_library(rdma_util_pic STATIC ${C_FILES})
+set_property(TARGET rdma_util_pic PROPERTY POSITION_INDEPENDENT_CODE TRUE)
diff --git a/util/dummy.c b/util/dummy.c
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/util/mmio.c b/util/mmio.c
new file mode 100644
index 00000000000000..757a46ce5418d8
--- /dev/null
+++ b/util/mmio.c
@@ -0,0 +1,83 @@
+/* GPLv2 or OpenIB.org BSD (MIT) See COPYING file */
+#include <util/mmio.h>
+#include <util/udma_barrier.h>
+#include <config.h>
+
+#include <pthread.h>
+#include <stdbool.h>
+
+#if SIZEOF_LONG != 8
+
+static pthread_spinlock_t mmio_spinlock;
+
+static __attribute__((constructor)) void lock_constructor(void)
+{
+	pthread_spin_init(&mmio_spinlock, PTHREAD_PROCESS_PRIVATE);
+}
+
+/* When the arch does not have a 32 bit store we provide an emulation that
You mean a 64 bit store, correct ?

+   does two stores in address ascending order while holding a global
+   spinlock. */
+static void pthread_mmio_write64_be(void *addr, __be64 val)
+{
+	__be32 first_dword = htobe32(be64toh(val) >> 32);
+	__be32 second_dword = htobe32(be64toh(val));
+
+	/* The WC spinlock, by definition, provides global ordering for all UC
+	   and WC stores within the critical region. */
+	mmio_wc_spinlock(&mmio_spinlock);
+
+	mmio_write32_be(addr, first_dword);
+	mmio_write32_be(addr + 4, second_dword);
+
+	mmio_wc_spinunlock(&mmio_spinlock);
+}
+
+#if defined(__i386__)
+#include <xmmintrin.h>
+#include <cpuid.h>
+
+/* For ia32 we have historically emitted movlps SSE instructions to do the 64
+   bit operations. */
+static void __attribute__((target("sse")))
+sse_mmio_write64_be(void *addr, __be64 val)
+{
+	__m128 tmp = {};
+	tmp = _mm_loadl_pi(tmp, (__force __m64 *)&val);
+	_mm_storel_pi((__m64 *)addr,tmp);
+}
+
+static bool have_sse(void)
+{
+	unsigned int ax,bx,cx,dx;
+
+	if (!__get_cpuid(1,&ax,&bx,&cx,&dx))
+		return false;
+	return dx & bit_SSE;
+}
+
+#endif /* defined(__i386__) */
+
+typedef void (*write64_fn_t)(void *, __be64);
+
+/* This uses the STT_GNU_IFUNC extension to have the dynamic linker select the
+   best above implementations at runtime. */
+#if HAVE_FUNC_ATTRIBUTE_IFUNC
+void mmio_write64_be(void *addr, __be64 val)
+    __attribute__((ifunc("resolve_mmio_write64_be")));
+static write64_fn_t resolve_mmio_write64_be(void);
+#else
+__asm__(".type mmio_write64_be, %gnu_indirect_function");
+write64_fn_t resolve_mmio_write64_be(void) __asm__("mmio_write64_be");
+#endif
+
+write64_fn_t resolve_mmio_write64_be(void)
+{
+#if defined(__i386__)
+	if (have_sse())
+		return &sse_mmio_write64_be;
+#endif
+	return &pthread_mmio_write64_be;

This will bring in 32 bit systems code that uses global spinlock comparing current usage where a specific spinlock is used, isn't it ?
see mlx4_write64() which is replaced by that code in downstream patches.

+}
+
+#endif /* SIZEOF_LONG != 8 */
diff --git a/util/mmio.h b/util/mmio.h
new file mode 100644
index 00000000000000..0b89f5fcbe000e
--- /dev/null
+++ b/util/mmio.h
@@ -0,0 +1,214 @@
+/* GPLv2 or OpenIB.org BSD (MIT) See COPYING file
+
+   These accessors always map to PCI-E TLPs in predictable ways. Translation
+   to other buses should follow similar definitions.
+
+   write32(mem, 1)
+      Produce a 4 byte MemWr TLP with bit 0 of DW byte offset 0 set
+   write32_be(mem, htobe32(1))
+      Produce a 4 byte MemWr TLP with bit 0 of DW byte offset 3 set
+   write32_le(mem, htole32(1))
+      Produce a 4 byte MemWr TLP with bit 0 of DW byte offset 0 set
+
+   For ordering these accessors are similar to the Kernel's concept of
+   writel_relaxed(). When working with UC memory the following hold:
+
+   1) Strong ordering is required when talking to the same device (eg BAR),
+      and combining is not permitted:
+
+       write32(mem, 1);
+       write32(mem + 4, 1);
+       write32(mem, 1);
+
+      Must produce three TLPs, in order.
+
+   2) Ordering ignores all pthread locking:
+
+       pthread_spin_lock(&lock);
+       write32(mem, global++);
+       pthread_spin_unlock(&lock);
+
+      When run concurrently on all CPUs the device must observe all stores,
+      but the data value will not be strictly increasing.
+
+   3) Interaction with DMA is not ordered. Explicit use of a barrier from
+      udma_barriers is required:
+
+	*dma_mem = 1;
+	udma_to_device_barrier();
+	write32(mem, GO_DMA);
+
+   4) Access out of program order (eg speculation), either by the CPU or
+      compiler is not permitted:
+
+	if (cond)
+	   read32();
+
+      Must not issue a read TLP if cond is false.
+
+   If these are used with WC memory then #1 and #4 do not apply, and all WC
+   accesses must be bracketed with mmio_wc_start() // mmio_flush_writes()
+*/
+
+#ifndef __UTIL_MMIO_H
+#define __UTIL_MMIO_H
+
+#include <linux/types.h>
+#include <stdatomic.h>
+#include <stdint.h>
+#include <endian.h>
+
+#include <config.h>
+#include <util/compiler.h>
+
+/* The first step is to define the 'raw' accessors. To make this very safe
+   with sparse we define two versions of each, a le and a be - however the
+   code is always identical.
+*/
+#ifdef __s390x__
+#include <unistd.h>
+#include <sys/syscall.h>
+
+/* s390 requires a privileged instruction to access IO memory, these syscalls
+   perform that instruction using a memory buffer copy semantic.
+*/
+static inline void s390_mmio_write(void *mmio_addr, const void *val,
+				   size_t length)
+{
+	// FIXME: Check for error and call abort?
+	syscall(__NR_s390_pci_mmio_write, mmio_addr, val, length);
+}
+
+static inline void s390_mmio_read(const void *mmio_addr, void *val,
+				  size_t length)
+{
+	// FIXME: Check for error and call abort?
+	syscall(__NR_s390_pci_mmio_read, mmio_addr, val, length);
+}
+
+#define MAKE_WRITE(_NAME_, _SZ_)                                               \
+	static inline void _NAME_##_be(void *addr, __be##_SZ_ value)           \
+	{                                                                      \
+		s390_mmio_write(addr, &value, sizeof(value));                  \
+	}                                                                      \
+	static inline void _NAME_##_le(void *addr, __le##_SZ_ value)           \
+	{                                                                      \
+		s390_mmio_write(addr, &value, sizeof(value));                  \
+	}

I believe that you introduced two different versions (i.e. le/be only for semantic reasons as code is really similar, is that correct ?

+#define MAKE_READ(_NAME_, _SZ_)                                                \
+	static inline __be##_SZ_ _NAME_##_be(const void *addr)                 \
+	{                                                                      \
+		__be##_SZ_ res;                                                \
+		s390_mmio_read(addr, &res, sizeof(res));                       \
+		return res;                                                    \
+	}                                                                      \
+	static inline __le##_SZ_ _NAME_##_le(const void *addr)                 \
+	{                                                                      \
+		__le##_SZ_ res;                                                \
+		s390_mmio_read(addr, &res, sizeof(res));                       \
+		return res;                                                    \
+	}
+
+static inline void mmio_read8(void *addr, uint8_t value)

Did you refer here to mmio_write8 ?

+{
+	s390_mmio_write(addr, &value, sizeof(value));
+}
+
+static inline uint8_t mmio_read8(const void *addr)

This function was already defined above.

+{
+	uint8_t res;
+	s390_mmio_read(addr, &res, sizeof(res));
+	return res;
+}
+
+#else /* __s390x__ */
+
+#define MAKE_WRITE(_NAME_, _SZ_)                                               \
+	static inline void _NAME_##_be(void *addr, __be##_SZ_ value)           \
+	{                                                                      \
+		atomic_store_explicit((_Atomic(uint##_SZ_##_t) *)addr,         \
+				      (__force uint##_SZ_##_t)value,           \
+				      memory_order_relaxed);                   \

Mightn't this code introduce some overhead comparing direct assignment which is used (i.e. mlx5_write64) and replaced by that in downstream patches ?

This need to be measured and verified as it's used in the data path flows.

+	}                                                                      \
+	static inline void _NAME_##_le(void *addr, __le##_SZ_ value)           \
+	{                                                                      \
+		atomic_store_explicit((_Atomic(uint##_SZ_##_t) *)addr,         \
+				      (__force uint##_SZ_##_t)value,           \
+				      memory_order_relaxed);                   \
+	}
+#define MAKE_READ(_NAME_, _SZ_)                                                \
+	static inline __be##_SZ_ _NAME_##_be(const void *addr)                 \
+	{                                                                      \
+		return (__force __be##_SZ_)atomic_load_explicit(               \
+		    (_Atomic(uint##_SZ_##_t) *)addr, memory_order_relaxed);    \
+	}                                                                      \
+	static inline __le##_SZ_ _NAME_##_le(const void *addr)                 \
+	{                                                                      \
+		return (__force __le##_SZ_)atomic_load_explicit(               \
+		    (_Atomic(uint##_SZ_##_t) *)addr, memory_order_relaxed);    \
+	}
+
+static inline void mmio_write8(void *addr, uint8_t value)
+{
+	atomic_store_explicit((_Atomic(uint8_t) *)addr, value,
+			      memory_order_relaxed);
+}
+static inline uint8_t mmio_read8(const void *addr)
+{
+	return atomic_load_explicit((_Atomic(uint32_t) *)addr,
+				    memory_order_relaxed);
+}
+
+#endif /* __s390x__ */
+
+MAKE_WRITE(mmio_write16, 16)
+MAKE_WRITE(mmio_write32, 32)
+
+MAKE_READ(mmio_read16, 16)
+MAKE_READ(mmio_read32, 32)
+
+#if SIZEOF_LONG == 8
+MAKE_WRITE(mmio_write64, 64)
+MAKE_READ(mmio_read64, 64)
+#else
+void mmio_write64_be(void *addr, __be64 val);
+static inline void mmio_write64_le(void *addr, __le64 val)
+{
+	mmio_write64_be(addr, (__be64 __force)val);
+}
+
+/* There is no way to do read64 atomically, rather than provide some sketchy
+   implementation we leave these functions undefined, users should not call
+   them if SIZEOF_LONG != 8, but instead implement an appropriate version. */
+__be64 mmio_read64_be(const void *addr);
+__le64 mmio_read64_le(const void *addr);
+#endif /* SIZEOF_LONG == 8 */
+
+#undef MAKE_WRITE
+#undef MAKE_READ
+
+/* Now we can define the host endian versions of the operator, this just includes
+   a call to htole. */
+#define MAKE_WRITE(_NAME_, _SZ_)                                               \
+	static inline void _NAME_(void *addr, uint##_SZ_##_t value)            \
+	{                                                                      \
+		_NAME_##_le(addr, htole##_SZ_(value));                         \
+	}
+#define MAKE_READ(_NAME_, _SZ_)                                                \
+	static inline uint##_SZ_##_t _NAME_(const void *addr)                  \
+	{                                                                      \
+		return le##_SZ_##toh(_NAME_##_le(addr));                       \
+	}
+
+MAKE_WRITE(mmio_write16, 16)
+MAKE_WRITE(mmio_write32, 32)
+MAKE_WRITE(mmio_write64, 64)
+
+MAKE_READ(mmio_read16, 16)
+MAKE_READ(mmio_read32, 32)
+MAKE_READ(mmio_read64, 64)
+
+#undef MAKE_WRITE
+#undef MAKE_READ
+
+#endif


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux