Please forgive the overly formalized way I wrote this up. This is what I
use for the notes I make as I need to report on a lot of this back to
the NSF. So I just copied and pasted it here.
Test subject: OpenSSH 8.8p1 commit bf944e37
Test environment: 2 AMD EPYC 7502P 32-Core servers connected via 10Gb
Intel X710 NICs in LAN, RTT < 0.5ms. ESNet TCP tuning applied.
Test method: 50 iterations of 'dd if=/dev/zero bs=1M count=20000 | ssh
-caes256-ctr remote.host "cat > /dev/null"'. 5 second pause between each
iteration.
Versions tested:
Stock openssh 8.8p1 (stock)
Migrating all endian functions from misc.c (full)
Migrating only [get|put]_u32_le from misc.c (little)
Test Results:
Flavor mean, median, min, max, adev
stock 669.68, 670, 653, 688, 4.9728
full 800.32, 804.5, 762, 823, 11.9216
little 803.1, 806, 772, 831, 10.484
Interpretation:
Inlining the endian functions, via compiler optimization, in umac.c
leads to an average 16.6% improvement in throughput, with a maximized
gain of 21.4%, in this test environment. The more pronounced deviation
is likely a result of the higher throughput rather than intrinsic
instabilities in the code. This performance gain comes entirely from
inlining the little endian conversion function [get|put]_u32_le. These
functions are defined in misc.c but only used in umac.c. As such,
migrating them to umac.c would impose minimal disruption on the rest of
the code base.
Patch:
diff --git a/misc.c b/misc.c
index 0134d694..20a2a186 100644
--- a/misc.c
+++ b/misc.c
@@ -1533,20 +1533,6 @@ get_u32(const void *vp)
return (v);
}
-u_int32_t
-get_u32_le(const void *vp)
-{
- const u_char *p = (const u_char *)vp;
- u_int32_t v;
-
- v = (u_int32_t)p[0];
- v |= (u_int32_t)p[1] << 8;
- v |= (u_int32_t)p[2] << 16;
- v |= (u_int32_t)p[3] << 24;
-
- return (v);
-}
-
u_int16_t
get_u16(const void *vp)
{
@@ -1585,17 +1571,6 @@ put_u32(void *vp, u_int32_t v)
p[3] = (u_char)v & 0xff;
}
-void
-put_u32_le(void *vp, u_int32_t v)
-{
- u_char *p = (u_char *)vp;
-
- p[0] = (u_char)v & 0xff;
- p[1] = (u_char)(v >> 8) & 0xff;
- p[2] = (u_char)(v >> 16) & 0xff;
- p[3] = (u_char)(v >> 24) & 0xff;
-}
-
void
put_u16(void *vp, u_int16_t v)
{
diff --git a/misc.h b/misc.h
index 2e2dca54..b0c64270 100644
--- a/misc.h
+++ b/misc.h
@@ -154,12 +154,6 @@ void put_u32(void *, u_int32_t)
void put_u16(void *, u_int16_t)
__attribute__((__bounded__( __minbytes__, 1, 2)));
-/* Little-endian store/load, used by umac.c */
-u_int32_t get_u32_le(const void *)
- __attribute__((__bounded__(__minbytes__, 1, 4)));
-void put_u32_le(void *, u_int32_t)
- __attribute__((__bounded__(__minbytes__, 1, 4)));
-
struct bwlimit {
size_t buflen;
u_int64_t rate; /* desired rate in kbit/s */
diff --git a/umac.c b/umac.c
index e5ec19f0..64502365 100644
--- a/umac.c
+++ b/umac.c
@@ -134,15 +134,56 @@ typedef unsigned int UWORD; /* Register */
/* --- Endian Conversion --- Forcing assembly on some platforms
*/
/*
---------------------------------------------------------------------- */
+
+/* Using local statically defined versions of the get/put functions
+ * found in misc.c allows them to be inlined. This improves throughput
+ * performance by 10% to 15% on well connected (10Gb/s+) systems.
+ * Forward declaration of the functions in order to maintain
+ * the attributes.
+ * Chris Rapier <rapier@xxxxxxx> 2022-03-09 */
+
+static u_int32_t umac_get_u32_le(const void *)
+ __attribute__((__bounded__(__minbytes__, 1, 4)));
+
+static u_int32_t
+umac_get_u32_le(const void *vp)
+{
+ const u_char *p = (const u_char *)vp;
+ u_int32_t v;
+
+ v = (u_int32_t)p[0];
+ v |= (u_int32_t)p[1] << 8;
+ v |= (u_int32_t)p[2] << 16;
+ v |= (u_int32_t)p[3] << 24;
+
+ return (v);
+}
+
+#if (! __LITTLE_ENDIAN__) /* compile time warning thown otherwise */
+static void umac_put_u32_le(void *, u_int32_t)
+ __attribute__((__bounded__(__minbytes__, 1, 4)));
+
+static void
+umac_put_u32_le(void *vp, u_int32_t v)
+{
+ u_char *p = (u_char *)vp;
+
+ p[0] = (u_char)v & 0xff;
+ p[1] = (u_char)(v >> 8) & 0xff;
+ p[2] = (u_char)(v >> 16) & 0xff;
+ p[3] = (u_char)(v >> 24) & 0xff;
+}
+#endif
+
#if (__LITTLE_ENDIAN__)
#define LOAD_UINT32_REVERSED(p) get_u32(p)
#define STORE_UINT32_REVERSED(p,v) put_u32(p,v)
#else
-#define LOAD_UINT32_REVERSED(p) get_u32_le(p)
-#define STORE_UINT32_REVERSED(p,v) put_u32_le(p,v)
+#define LOAD_UINT32_REVERSED(p) umac_get_u32_le(p)
+#define STORE_UINT32_REVERSED(p,v) umac_put_u32_le(p,v)
#endif
-#define LOAD_UINT32_LITTLE(p) (get_u32_le(p))
+#define LOAD_UINT32_LITTLE(p) (umac_get_u32_le(p))
#define STORE_UINT32_BIG(p,v) put_u32(p, v)
/*
---------------------------------------------------------------------- */
_______________________________________________
openssh-unix-dev mailing list
openssh-unix-dev@xxxxxxxxxxx
https://lists.mindrot.org/mailman/listinfo/openssh-unix-dev