Just to answer a few of the questions below... On Jul 16, 2009, at 3:54 PM, Trond Myklebust wrote:
On Wed, 2009-07-15 at 17:37 -0400, Chuck Lever wrote:Introduce a set of functions in the kernel's RPC implementation for converting between a socket address and either a standard presentation address string or an RPC universal address. The universal address functions will be used to encode and decode RPCB_FOO and NFSv4 SETCLIENTID arguments. The other functions are part of a previous promise to deliver shared functions that can be used by upper-layer protocols to display and manipulate IP addresses. The kernel's current address printf formatters were designed specifically for kernel to user-space APIs that require a particular string format for socket addresses, thus are somewhat limited for the purposes of sunrpc.ko. The formatter for IPv6 addresses, %pI6, doesnot support short-handing or scope IDs. Also, these printf formattersare unique per address family, so a separate formatter string is required for printing AF_INET and AF_INET6 addresses. Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx> --- fs/nfs/internal.h | 2 include/linux/sunrpc/clnt.h | 12 + net/sunrpc/Makefile | 2net/sunrpc/addr.c | 524 +++++++++++++++++++++++++++++++++ ++++++++++4 files changed, 537 insertions(+), 3 deletions(-) create mode 100644 net/sunrpc/addr.c diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index d3823d7..a19b7b9 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h@@ -370,8 +370,6 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)PAGE_SIZE - 1) >> PAGE_SHIFT; } -#define IPV6_SCOPE_DELIMITER '%' - /* * Set the port number in an address. Be agnostic about the address * family.diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/ clnt.hindex 37881f1..bf38c4a 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -151,5 +151,17 @@ void rpc_force_rebind(struct rpc_clnt *); size_t rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t);const char *rpc_peeraddr2str(struct rpc_clnt *, enum rpc_display_format_t);+unsigned short rpc_get_port(const struct sockaddr *sap); +void rpc_set_port(struct sockaddr *sap, const unsigned short port); +size_t rpc_ntop(const struct sockaddr *, char *, const size_t); +size_t rpc_pton(const char *, const size_t, + struct sockaddr *, const size_t); +char * rpc_sockaddr2uaddr(const struct sockaddr *); +size_t rpc_uaddr2sockaddr(const char *, const size_t, + struct sockaddr *, const size_t); + +#define IPV6_SCOPE_DELIMITER '%' +#define IPV6_SCOPE_ID_LEN sizeof("%nnnnnnnnnn") + #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_CLNT_H */ diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index 7a594cc..6020f5b 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -12,7 +12,7 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ auth.o auth_null.o auth_unix.o auth_generic.o \ svc.o svcsock.o svcauth.o svcauth_unix.o \ - rpcb_clnt.o timer.o xdr.o \ + addr.o rpcb_clnt.o timer.o xdr.o \ sunrpc_syms.o cache.o rpc_pipe.o \ svc_xprt.o sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o bc_svc.o diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c new file mode 100644 index 0000000..9ddcffd --- /dev/null +++ b/net/sunrpc/addr.c @@ -0,0 +1,524 @@ +/* + * Copyright 2009, Oracle. All rights reserved. + * + * Convert socket addresses to presentation addresses and universal + * addresses, and vice versa. + *+ * Universal addresses are introduced by RFC 1833 and further refined by + * recent RFCs describing NFSv4. The universal address format is part+ * of the external (network) interface provided by rpcbind version 3 + * and 4, and by NFSv4. Such an address is a string containing a + * presentation format IP address followed by a port number in + * "hibyte.lobyte" format. + * + * There are some challenges to constructing universal addresses.+ * In particular, constructing a presentation address string for IPv4 + * is straightforward, but it's not so easy for IPv6 addresses. Such+ * addresses can be "short-handed" to remove the longest string of+ * zeros, and mapped v4 presentation addresses can appear in a special+ * dotted-quad form. Refer to RFC 4291, Section 2.2 for details on + * IPv6 presentation formats. + * + * IPv6 addresses can also include a scope ID, typically denoted by + * a '%' followed by a device name or a non-negative integer. + */ + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/inet.h> + +#include <net/ipv6.h> + +#include <linux/sunrpc/clnt.h> + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + +struct rpc_zeros { + unsigned int count; + int index; +}; + +static const struct rpc_zeros zeros_init = { + .count = 0, + .index = -1, +}; + +static void rpc_find_longest_zero_run(const struct in6_addr *addr, + struct rpc_zeros *longest) +{ + struct rpc_zeros zeros; + unsigned int i; + + zeros = zeros_init; + *longest = zeros_init;^^^^^^^^^^ Why do this when you could just set longest->index to -1?
Documentation.
+ + for (i = 0; i < ARRAY_SIZE(addr->s6_addr16); i++) { + if (addr->s6_addr16[i] == 0) { + if (zeros.index == -1) + zeros.index = i; + zeros.count++; + } else { + if (zeros.count > longest->count || + longest->index == -1) + *longest = zeros; + zeros = zeros_init; + } + } + + /* + * Maybe the last run of zeros didn't terminate. + * Not likely for a host address. + */ + if (zeros.index != -1) { + if (zeros.count > longest->count || + longest->index == -1) + *longest = zeros; + } +} + +/* + * Append a set of halfwords from @addr onto the string in @buf. + * + * Returns the new length of the string in @buf, or zero if + * some error occurs. + */ +static size_t rpc_append_halfword6(const struct in6_addr *addr, + const char *fmt, + const unsigned int start, + const unsigned int end, + char *buf, const size_t buflen) +{ + char tmp[sizeof("ffff:") + 1];You are passing 'fmt' as an argument, yet you assume it will have aformat that makes sense of the above 'sizeof' calculation? Why not justdrop the ':', and always use fmt=="%x"? As far as I can see, that would simplify rpc_ntop6_shorthand too...
OK.
+ unsigned int i; + size_t len = 0; + + for (i = start; i < end; i++) { + len = snprintf(tmp, sizeof(tmp), fmt, + ntohs(addr->s6_addr16[i])); + if (unlikely(len > sizeof(tmp))) + return 0; + + len = strlcat(buf, tmp, buflen); + if (unlikely(len > buflen)) + return 0; + } + + return len; +} + +/* + * This implementation depends on the Linux kernel's definition of + * in6_addr, which is a union of arrays that facilities dissecting + * the address into 16-bit chunks. + */ +static size_t rpc_ntop6_shorthand(const struct in6_addr *addr, + struct rpc_zeros *longest, + char *buf, const size_t buflen) +{ + size_t len; + + /* + * Left half, before the longest string of zeros + */ + buf[0] = '\0'; + if (longest->index == 0) { + len = strlcat(buf, ":", buflen); + if (unlikely(len > buflen)) + return 0; + } else { + len = rpc_append_halfword6(addr, "%x:", + 0, longest->index, buf, buflen); + if (unlikely(len == 0)) + return 0; + } + + /* + * Right half, after the longest string of zeros + */+ if (longest->index + longest->count >= ARRAY_SIZE(addr- >s6_addr16)) {+ len = strlcat(buf, ":", buflen); + if (unlikely(len > buflen)) + return 0; + } else { + len = rpc_append_halfword6(addr, ":%x", + longest->index + longest->count, + ARRAY_SIZE(addr->s6_addr16), buf, buflen); + if (unlikely(len == 0)) + return 0; + } + + return len; +}The shorthand format is, AFAICS, not an obligatory notation. Why shouldwe implement it, if it involves all this complication?
One reason is a shorthanded presentation address can be stored in significantly less memory, or sent on the network in fewer bytes. Another is that these addresses are stored in rpcbind, and advertised, so they are often looked at by humans.
IMO the present code is less complex than our RDMA/RPC implementation or even the kernel's full vsnprintf(), and once we get it right, it will probably never change again. It's no uglier than what's in xdr.c, for example. So I don't buy the "too complex" argument.
If we replace all this code with "%pI6" however, we should explicitly check for ANYADDR and loopback addresses and use shorthands for them: "::" and "::1", as the code in rpcb_clnt.c now does for ANYADDR.
+ +static size_t rpc_ntop6_noscopeid(const struct sockaddr *sap, + char *buf, const int buflen) +{ + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + const struct in6_addr *addr = &sin6->sin6_addr; + struct rpc_zeros longest; + + /* + * RFC 4291, Section 2.2.3 + * + * Special presentation address format for mapped v4 + * addresses. + */ + if (ipv6_addr_v4mapped(addr)) + return snprintf(buf, buflen, "::ffff:%pI4", + &addr->s6_addr32[3]);Err... Won't that give you ::ffff:x.x.x.x ?
Yes, that's a standard way to express a mapped IPv4 presentation address.
+ /* + * RFC 4291, Section 2.2.2 + * + * See if address can be short-handed. + */ + rpc_find_longest_zero_run(addr, &longest); + if (longest.index != -1 && longest.count > 1) + return rpc_ntop6_shorthand(addr, &longest, buf, buflen); + + /* + * RFC 4291, Section 2.2.1 + * + * Short-handing not possible, so spell out full + * address. We don't want leading zeros in each + * halfword, so avoid %pI6. + */ + return snprintf(buf, buflen, "%x:%x:%x:%x:%x:%x:%x:%x", + ntohs(addr->s6_addr16[0]), ntohs(addr->s6_addr16[1]), + ntohs(addr->s6_addr16[2]), ntohs(addr->s6_addr16[3]), + ntohs(addr->s6_addr16[4]), ntohs(addr->s6_addr16[5]), + ntohs(addr->s6_addr16[6]), ntohs(addr->s6_addr16[7]));Why not use %pI6 ?
See the comment.For some reason %pI4 does not add leading zeroes (eg. 010.000.000.004) but %pI6 does (eg. fe80:0000:0000:cd59:0017:0054:bda4:000f). Probably some networking-specific user space parser depends on it.
Leaving out the leading zeroes can shorten IPv6 presentation addresses significantly.
+} + +static size_t rpc_ntop6(const struct sockaddr *sap, + char *buf, const size_t buflen) +{ + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + char scopebuf[IPV6_SCOPE_ID_LEN]; + size_t len; + int rc; + + len = rpc_ntop6_noscopeid(sap, buf, buflen); + if (unlikely(len == 0)) + return len; + + if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) && + !(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_SITELOCAL)) + return len; + + rc = snprintf(scopebuf, sizeof(scopebuf), "%c%u", + IPV6_SCOPE_DELIMITER, sin6->sin6_scope_id); + if (unlikely((size_t)rc > sizeof(scopebuf))) + return 0; + + len += rc; + if (unlikely(len > buflen)) + return 0; + + strcat(buf, scopebuf); + return len; +} + +#else /* !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) */ + +static size_t rpc_ntop6_noscopeid(const struct sockaddr *sap, + char *buf, const int buflen) +{ + return 0; +} + +static size_t rpc_ntop6(const struct sockaddr *sap, + char *buf, const size_t buflen) +{ + return 0; +} + +#endif /* !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) */ + +static int rpc_ntop4(const struct sockaddr *sap, + char *buf, const size_t buflen) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + + return snprintf(buf, buflen, "%pI4", &sin->sin_addr); +} + +/** + * rpc_ntop - construct a presentation address in @buf + * @sap: socket address + * @buf: construction area + * @buflen: size of @buf, in bytes + * + * Plants a %NUL-terminated string in @buf and returns the length + * of the string, excluding the %NUL. Otherwise zero is returned. + */+size_t rpc_ntop(const struct sockaddr *sap, char *buf, const size_t buflen)+{ + char tmp[INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 1]; + size_t len; + + switch (sap->sa_family) { + case AF_INET: + len = rpc_ntop4(sap, tmp, sizeof(tmp)); + break; + case AF_INET6: + len = rpc_ntop6(sap, tmp, sizeof(tmp)); + break; + default: + return 0; + } + + if (unlikely(len == 0 || len > sizeof(tmp) || len > buflen)) + return 0; + return strlcpy(buf, tmp, buflen);Why two copies?
It simplifies the assumptions that rpc_ntop6() has to make. One can go away, if you like.
+/** + * rpc_sockaddr2uaddr - Construct a universal address string from @sap. + * @sap: socket address + * + * Returns a %NUL-terminated string in dynamically allocated memory; + * otherwise NULL is returned if an error occurred. Caller must + * free the returned string. + */ +char *rpc_sockaddr2uaddr(const struct sockaddr *sap) +{ + char portbuf[RPCBIND_MAXUADDRPLEN + 1]; + char addrbuf[RPCBIND_MAXUADDRLEN + 1]; + unsigned short port; + + switch (sap->sa_family) { + case AF_INET: + if (rpc_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0) + return NULL; + port = ntohs(((struct sockaddr_in *)sap)->sin_port); + break; + case AF_INET6:+ if (rpc_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf))== 0) + return NULL;+ port = ntohs(((struct sockaddr_in6 *)sap)- >sin6_port);+ break; + default: + return NULL; + } + + if (snprintf(portbuf, sizeof(portbuf), + ".%u.%u", port >> 8, port & 0xff) > (int)sizeof(portbuf)) + return NULL; + + if (strlcat(addrbuf, portbuf, sizeof(addrbuf)) > sizeof(addrbuf)) + return NULL; + + return kstrdup(addrbuf, GFP_ATOMIC);GFP_ATOMIC? Why?
No reason. Will fix.
+} +EXPORT_SYMBOL_GPL(rpc_sockaddr2uaddr);+unsigned short rpc_get_port(const struct sockaddr *sap) +{ + switch (sap->sa_family) { + case AF_INET: + return ntohs(((struct sockaddr_in *)sap)->sin_port); + case AF_INET6: + return ntohs(((struct sockaddr_in6 *)sap)->sin6_port); + } + return 0; +} +EXPORT_SYMBOL_GPL(rpc_get_port);inline helper?
OK.
+ +void rpc_set_port(struct sockaddr *sap, const unsigned short port) +{ + switch (sap->sa_family) { + case AF_INET: + ((struct sockaddr_in *)sap)->sin_port = htons(port); + break; + case AF_INET6: + ((struct sockaddr_in6 *)sap)->sin6_port = htons(port); + break; + } +} +EXPORT_SYMBOL_GPL(rpc_set_port);ditto
-- Chuck Lever chuck[dot]lever[at]oracle[dot]com -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html