Re: [PATCH] Improve endian conversion in umac.c

[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]

 



Please forgive the overly formalized way I wrote this up. This is what I use for the notes I make as I need to report on a lot of this back to the NSF. So I just copied and pasted it here.

Test subject: OpenSSH 8.8p1 commit bf944e37

Test environment: 2 AMD EPYC 7502P 32-Core servers connected via 10Gb Intel X710 NICs in LAN, RTT < 0.5ms. ESNet TCP tuning applied.

Test method: 50 iterations of 'dd if=/dev/zero bs=1M count=20000 | ssh -caes256-ctr remote.host "cat > /dev/null"'. 5 second pause between each iteration.

Versions tested:
Stock openssh 8.8p1 (stock)
Migrating all endian functions from misc.c (full)
Migrating only [get|put]_u32_le from misc.c (little)

Test Results:

Flavor	mean,	median,	min,	max,	adev
stock   669.68,	670,	653,	688,	4.9728
full 	800.32,	804.5,	762,	823,	11.9216
little  803.1,	806,	772,	831,	10.484

Interpretation:
Inlining the endian functions, via compiler optimization, in umac.c leads to an average 16.6% improvement in throughput, with a maximized gain of 21.4%, in this test environment. The more pronounced deviation is likely a result of the higher throughput rather than intrinsic instabilities in the code. This performance gain comes entirely from inlining the little endian conversion function [get|put]_u32_le. These functions are defined in misc.c but only used in umac.c. As such, migrating them to umac.c would impose minimal disruption on the rest of the code base.

Patch:
diff --git a/misc.c b/misc.c
index 0134d694..20a2a186 100644
--- a/misc.c
+++ b/misc.c
@@ -1533,20 +1533,6 @@ get_u32(const void *vp)
 	return (v);
 }

-u_int32_t
-get_u32_le(const void *vp)
-{
-	const u_char *p = (const u_char *)vp;
-	u_int32_t v;
-
-	v  = (u_int32_t)p[0];
-	v |= (u_int32_t)p[1] << 8;
-	v |= (u_int32_t)p[2] << 16;
-	v |= (u_int32_t)p[3] << 24;
-
-	return (v);
-}
-
 u_int16_t
 get_u16(const void *vp)
 {
@@ -1585,17 +1571,6 @@ put_u32(void *vp, u_int32_t v)
 	p[3] = (u_char)v & 0xff;
 }

-void
-put_u32_le(void *vp, u_int32_t v)
-{
-	u_char *p = (u_char *)vp;
-
-	p[0] = (u_char)v & 0xff;
-	p[1] = (u_char)(v >> 8) & 0xff;
-	p[2] = (u_char)(v >> 16) & 0xff;
-	p[3] = (u_char)(v >> 24) & 0xff;
-}
-
 void
 put_u16(void *vp, u_int16_t v)
 {
diff --git a/misc.h b/misc.h
index 2e2dca54..b0c64270 100644
--- a/misc.h
+++ b/misc.h
@@ -154,12 +154,6 @@ void		put_u32(void *, u_int32_t)
 void		put_u16(void *, u_int16_t)
     __attribute__((__bounded__( __minbytes__, 1, 2)));

-/* Little-endian store/load, used by umac.c */
-u_int32_t	get_u32_le(const void *)
-    __attribute__((__bounded__(__minbytes__, 1, 4)));
-void		put_u32_le(void *, u_int32_t)
-    __attribute__((__bounded__(__minbytes__, 1, 4)));
-
 struct bwlimit {
 	size_t buflen;
 	u_int64_t rate;		/* desired rate in kbit/s */
diff --git a/umac.c b/umac.c
index e5ec19f0..64502365 100644
--- a/umac.c
+++ b/umac.c
@@ -134,15 +134,56 @@ typedef unsigned int	UWORD;  /* Register */
/* --- Endian Conversion --- Forcing assembly on some platforms */ /* ---------------------------------------------------------------------- */

+
+/* Using local statically defined versions of the get/put functions
+ * found in misc.c allows them to be inlined. This improves throughput
+ * performance by 10% to 15% on well connected (10Gb/s+) systems.
+ * Forward declaration of the functions in order to maintain
+ * the attributes.
+ * Chris Rapier <rapier@xxxxxxx> 2022-03-09 */
+
+static u_int32_t umac_get_u32_le(const void *)
+    __attribute__((__bounded__(__minbytes__, 1, 4)));
+
+static u_int32_t
+umac_get_u32_le(const void *vp)
+{
+        const u_char *p = (const u_char *)vp;
+        u_int32_t v;
+
+        v  = (u_int32_t)p[0];
+        v |= (u_int32_t)p[1] << 8;
+        v |= (u_int32_t)p[2] << 16;
+        v |= (u_int32_t)p[3] << 24;
+
+        return (v);
+}
+
+#if (! __LITTLE_ENDIAN__) /* compile time warning thown otherwise */
+static void umac_put_u32_le(void *, u_int32_t)
+    __attribute__((__bounded__(__minbytes__, 1, 4)));
+
+static void
+umac_put_u32_le(void *vp, u_int32_t v)
+{
+        u_char *p = (u_char *)vp;
+
+        p[0] = (u_char)v & 0xff;
+        p[1] = (u_char)(v >> 8) & 0xff;
+        p[2] = (u_char)(v >> 16) & 0xff;
+        p[3] = (u_char)(v >> 24) & 0xff;
+}
+#endif
+
 #if (__LITTLE_ENDIAN__)
 #define LOAD_UINT32_REVERSED(p)		get_u32(p)
 #define STORE_UINT32_REVERSED(p,v)	put_u32(p,v)
 #else
-#define LOAD_UINT32_REVERSED(p)		get_u32_le(p)
-#define STORE_UINT32_REVERSED(p,v)	put_u32_le(p,v)
+#define LOAD_UINT32_REVERSED(p)		umac_get_u32_le(p)
+#define STORE_UINT32_REVERSED(p,v)	umac_put_u32_le(p,v)
 #endif

-#define LOAD_UINT32_LITTLE(p)		(get_u32_le(p))
+#define LOAD_UINT32_LITTLE(p)		(umac_get_u32_le(p))
 #define STORE_UINT32_BIG(p,v)		put_u32(p, v)

/* ---------------------------------------------------------------------- */

_______________________________________________
openssh-unix-dev mailing list
openssh-unix-dev@xxxxxxxxxxx
https://lists.mindrot.org/mailman/listinfo/openssh-unix-dev



[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]

[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Security]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux