[PATCH] lfp: fix AF_INET checksum with csum offloading

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



When an AF_INET socket is used, linux would sometimes
return a packet without the checksum.  This happens when a packet
originates on the same machine, which is most common with virtual
machines but might be possible even without with a software-based dhcp
server such as dnsmasq.
This appears to be a performance optimization to avoid calculating
the checksum and there's no way to disable this. Users are required to
detect such packets by passing a msg_control parameter to the recvmsg
call. This was added in linux kernel 2.6.21.

dhclient from isc.org as of 4.2.0-P2 fails to do this and
concequently discards such packets, so such setups fail to get an IP.
This was reported in the past:
https://lists.isc.org/mailman/htdig/dhcp-hackers/2010-April/001825.html

The attached patch fixes this by passing msg_control and looking
at the tp_status field in the result. If the TP_STATUS_CSUMNOTREADY
bit is set, the packet checksum is missing because the packet
is local.

The patch below is currently successfully used at least on Fedora and
some other Red Hat based distributions. Applying this to the ISC sources
would help make the problem go away for everyone.
---

Notes:
- The patch is to be applied with -p1.
- The patch below applies to dhcp 4.2.0-P2 and 4.1.2 (the last with an offset).
- Please Cc me on comments directly, I am not subscribed to the dhcp-hackers list.

 common/bpf.c     |    2 +-
 common/dlpi.c    |    2 +-
 common/lpf.c     |   81 ++++++++++++++++++++++++++++++++++++++++++------------
 common/nit.c     |    2 +-
 common/packet.c  |    4 +-
 common/upf.c     |    2 +-
 includes/dhcpd.h |    2 +-
 7 files changed, 70 insertions(+), 25 deletions(-)

diff --git a/common/bpf.c b/common/bpf.c
index b0ef657..8bd5727 100644
--- a/common/bpf.c
+++ b/common/bpf.c
@@ -485,7 +485,7 @@ ssize_t receive_packet (interface, buf, len, from, hfrom)
 		offset = decode_udp_ip_header (interface,
 					       interface -> rbuf,
 					       interface -> rbuf_offset,
-  					       from, hdr.bh_caplen, &paylen);
+  					       from, hdr.bh_caplen, &paylen, 0);
 
 		/* If the IP or UDP checksum was bad, skip the packet... */
 		if (offset < 0) {
diff --git a/common/dlpi.c b/common/dlpi.c
index eb64342..d4a8bb9 100644
--- a/common/dlpi.c
+++ b/common/dlpi.c
@@ -694,7 +694,7 @@ ssize_t receive_packet (interface, buf, len, from, hfrom)
 	length -= offset;
 #endif
 	offset = decode_udp_ip_header (interface, dbuf, bufix,
-				       from, length, &paylen);
+				       from, length, &paylen, 0);
 
 	/*
 	 * If the IP or UDP checksum was bad, skip the packet...
diff --git a/common/lpf.c b/common/lpf.c
index f727b7c..4bdb0f1 100644
--- a/common/lpf.c
+++ b/common/lpf.c
@@ -29,18 +29,33 @@
 #include "dhcpd.h"
 #if defined (USE_LPF_SEND) || defined (USE_LPF_RECEIVE)
 #include <sys/ioctl.h>
+#include <sys/socket.h>
 #include <sys/uio.h>
 #include <errno.h>
 
 #include <asm/types.h>
 #include <linux/filter.h>
 #include <linux/if_ether.h>
+#include <linux/if_packet.h>
 #include <netinet/in_systm.h>
 #include "includes/netinet/ip.h"
 #include "includes/netinet/udp.h"
 #include "includes/netinet/if_ether.h"
 #include <net/if.h>
 
+#ifndef PACKET_AUXDATA
+#define PACKET_AUXDATA 8
+
+struct tpacket_auxdata
+{
+	__u32		tp_status;
+	__u32		tp_len;
+	__u32		tp_snaplen;
+	__u16		tp_mac;
+	__u16		tp_net;
+};
+#endif
+
 /* Reinitializes the specified interface after an address change.   This
    is not required for packet-filter APIs. */
 
@@ -66,10 +81,14 @@ int if_register_lpf (info)
 	struct interface_info *info;
 {
 	int sock;
-	struct sockaddr sa;
+	union {
+		struct sockaddr_ll ll;
+		struct sockaddr common;
+	} sa;
+	struct ifreq ifr;
 
 	/* Make an LPF socket. */
-	if ((sock = socket(PF_PACKET, SOCK_PACKET,
+	if ((sock = socket(PF_PACKET, SOCK_RAW,
 			   htons((short)ETH_P_ALL))) < 0) {
 		if (errno == ENOPROTOOPT || errno == EPROTONOSUPPORT ||
 		    errno == ESOCKTNOSUPPORT || errno == EPFNOSUPPORT ||
@@ -84,11 +103,16 @@ int if_register_lpf (info)
 		log_fatal ("Open a socket for LPF: %m");
 	}
 
+	memset (&ifr, 0, sizeof ifr);
+	strncpy (ifr.ifr_name, (const char *)info -> ifp, sizeof ifr.ifr_name);
+	if (ioctl (sock, SIOCGIFINDEX, &ifr))
+		log_fatal ("Failed to get interface index: %m");
+
 	/* Bind to the interface name */
 	memset (&sa, 0, sizeof sa);
-	sa.sa_family = AF_PACKET;
-	strncpy (sa.sa_data, (const char *)info -> ifp, sizeof sa.sa_data);
-	if (bind (sock, &sa, sizeof sa)) {
+	sa.ll.sll_family = AF_PACKET;
+	sa.ll.sll_ifindex = ifr.ifr_ifindex;
+	if (bind (sock, &sa.common, sizeof sa)) {
 		if (errno == ENOPROTOOPT || errno == EPROTONOSUPPORT ||
 		    errno == ESOCKTNOSUPPORT || errno == EPFNOSUPPORT ||
 		    errno == EAFNOSUPPORT || errno == EINVAL) {
@@ -170,9 +194,18 @@ static void lpf_gen_filter_setup (struct interface_info *);
 void if_register_receive (info)
 	struct interface_info *info;
 {
+	int val;
+
 	/* Open a LPF device and hang it on this interface... */
 	info -> rfdesc = if_register_lpf (info);
 
+	val = 1;
+	if (setsockopt (info -> rfdesc, SOL_PACKET, PACKET_AUXDATA, &val,
+			sizeof val) < 0) {
+		if (errno != ENOPROTOOPT)
+			log_fatal ("Failed to set auxiliary packet data: %m");
+	}
+
 #if defined (HAVE_TR_SUPPORT)
 	if (info -> hw_address.hbuf [0] == HTYPE_IEEE802)
 		lpf_tr_filter_setup (info);
@@ -294,7 +327,6 @@ ssize_t send_packet (interface, packet, raw, len, from, to, hto)
 	double hh [16];
 	double ih [1536 / sizeof (double)];
 	unsigned char *buf = (unsigned char *)ih;
-	struct sockaddr sa;
 	int result;
 	int fudge;
 
@@ -315,15 +347,7 @@ ssize_t send_packet (interface, packet, raw, len, from, to, hto)
 				(unsigned char *)raw, len);
 	memcpy (buf + ibufp, raw, len);
 
-	/* For some reason, SOCK_PACKET sockets can't be connected,
-	   so we have to do a sentdo every time. */
-	memset (&sa, 0, sizeof sa);
-	sa.sa_family = AF_PACKET;
-	strncpy (sa.sa_data,
-		 (const char *)interface -> ifp, sizeof sa.sa_data);
-
-	result = sendto (interface -> wfdesc,
-			 buf + fudge, ibufp + len - fudge, 0, &sa, sizeof sa);
+	result = write (interface -> wfdesc, buf + fudge, ibufp + len - fudge);
 	if (result < 0)
 		log_error ("send_packet: %m");
 	return result;
@@ -340,14 +364,35 @@ ssize_t receive_packet (interface, buf, len, from, hfrom)
 {
 	int length = 0;
 	int offset = 0;
+	int nocsum = 0;
 	unsigned char ibuf [1536];
 	unsigned bufix = 0;
 	unsigned paylen;
-
-	length = read (interface -> rfdesc, ibuf, sizeof ibuf);
+	unsigned char cmsgbuf[CMSG_LEN(sizeof(struct tpacket_auxdata))];
+	struct iovec iov = {
+		.iov_base = ibuf,
+		.iov_len = sizeof ibuf,
+	};
+	struct msghdr msg = {
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+		.msg_control = cmsgbuf,
+		.msg_controllen = sizeof(cmsgbuf),
+	};
+	struct cmsghdr *cmsg;
+
+	length = recvmsg (interface -> rfdesc, &msg, 0);
 	if (length <= 0)
 		return length;
 
+	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
+		if (cmsg->cmsg_level == SOL_PACKET &&
+		    cmsg->cmsg_type == PACKET_AUXDATA) {
+			struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
+			nocsum = aux->tp_status & TP_STATUS_CSUMNOTREADY;
+		}
+	}
+
 	bufix = 0;
 	/* Decode the physical header... */
 	offset = decode_hw_header (interface, ibuf, bufix, hfrom);
@@ -364,7 +409,7 @@ ssize_t receive_packet (interface, buf, len, from, hfrom)
 
 	/* Decode the IP and UDP headers... */
 	offset = decode_udp_ip_header (interface, ibuf, bufix, from,
-				       (unsigned)length, &paylen);
+				       (unsigned)length, &paylen, nocsum);
 
 	/* If the IP or UDP checksum was bad, skip the packet... */
 	if (offset < 0)
diff --git a/common/nit.c b/common/nit.c
index 3822206..0da9c36 100644
--- a/common/nit.c
+++ b/common/nit.c
@@ -369,7 +369,7 @@ ssize_t receive_packet (interface, buf, len, from, hfrom)
 
 	/* Decode the IP and UDP headers... */
 	offset = decode_udp_ip_header (interface, ibuf, bufix,
-				       from, length, &paylen);
+				       from, length, &paylen, 0);
 
 	/* If the IP or UDP checksum was bad, skip the packet... */
 	if (offset < 0)
diff --git a/common/packet.c b/common/packet.c
index 42bca69..fd2d975 100644
--- a/common/packet.c
+++ b/common/packet.c
@@ -211,7 +211,7 @@ ssize_t
 decode_udp_ip_header(struct interface_info *interface,
 		     unsigned char *buf, unsigned bufix,
 		     struct sockaddr_in *from, unsigned buflen,
-		     unsigned *rbuflen)
+		     unsigned *rbuflen, int nocsum)
 {
   unsigned char *data;
   struct ip ip;
@@ -322,7 +322,7 @@ decode_udp_ip_header(struct interface_info *interface,
 					   8, IPPROTO_UDP + ulen))));
 
   udp_packets_seen++;
-  if (usum && usum != sum) {
+  if (!nocsum && usum && usum != sum) {
 	  udp_packets_bad_checksum++;
 	  if (udp_packets_seen > 4 &&
 	      (udp_packets_seen / udp_packets_bad_checksum) < 2) {
diff --git a/common/upf.c b/common/upf.c
index feb82a2..fff3949 100644
--- a/common/upf.c
+++ b/common/upf.c
@@ -320,7 +320,7 @@ ssize_t receive_packet (interface, buf, len, from, hfrom)
 
 	/* Decode the IP and UDP headers... */
 	offset = decode_udp_ip_header (interface, ibuf, bufix,
-				       from, length, &paylen);
+				       from, length, &paylen, 0);
 
 	/* If the IP or UDP checksum was bad, skip the packet... */
 	if (offset < 0)
diff --git a/includes/dhcpd.h b/includes/dhcpd.h
index cd7d962..0835d98 100644
--- a/includes/dhcpd.h
+++ b/includes/dhcpd.h
@@ -2769,7 +2769,7 @@ ssize_t decode_hw_header PROTO ((struct interface_info *, unsigned char *,
 				 unsigned, struct hardware *));
 ssize_t decode_udp_ip_header PROTO ((struct interface_info *, unsigned char *,
 				     unsigned, struct sockaddr_in *,
-				     unsigned, unsigned *));
+				     unsigned, unsigned *, int));
 
 /* ethernet.c */
 void assemble_ethernet_header PROTO ((struct interface_info *, unsigned char *,
-- 
1.7.3.2.91.g446ac
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux