Hi, I've encountered a strange situation while trying to run my software under Linux. And I boiled it down to what I think is a bug in the nf_conntrack_ipv4 kernel module. Maybe somebody on this list can help me with this. My software implements the following behaviour: 1) a number of listener processes listen for udp packets on a well known port on the localhost via recvfrom() 2) at some point in time a broadcast process sends a request via udp broadcast to this well known port 3) all the listener processes receive this request and reply individually to the broadcast process via sendto() What happens to me is that 1) and 2) work as expected, but the replies in step 3) are randomly dropped and never reach the broadcast process. The process calling sendto() receives an EPERM return in this case but in a more complex (real world) scenario I think the packets are also silently dropped. This happens under the following circumstances: - the kernel module nf_conntrack_ipv4 must be loaded - no actual firewall/iptable rules are configured, so all packets should be accepted - the code needs to run on an SMP machine that allows for real parallelization. I couldn't reproduce this from within a qemu virtual machine, for example. So it looks like a race condition to me. I've written isolated test cases for the "listener" and the "broadcast" part of this scenario. You can find the program source code attached to this mail. To reproduce the behaviour the following needs to be done: - start two instances of the listener program - start / ctrl-c / restart the broadcast program until one or both of the listener instances receive an EPERM I've also managed to trace the location in the kernel code where the decision to drop the packet in this situation is actually made. But I don't understand the logic that's implemented there. 
It's in function __nf_conntrack_confirm() where the following if clause is matching: ------------------------------------------------------------------------ hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &h->tuple) && zone =3D=3D nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) ------------------------------------------------------------------------ This is the kernel stacktrace leading to it: Call Trace: [<ffffffff81565a3d>] dump_stack+0x45/0x57 [<ffffffff814bb7f5>] __nf_conntrack_confirm+0x21d/0x2c9 [<ffffffff81504265>] ipv4_confirm+0x6a/0xe8 [<ffffffff814b6912>] nf_iterate+0x52/0x8b [<ffffffff814b699e>] nf_hook_slow+0x53/0xde [<ffffffff814cec9b>] ip_output+0xaf/0xf5 [<ffffffff814ce26f>] ? ip_fragment+0x673/0x673 [<ffffffff814cd721>] ip_local_out_sk+0x49/0x6f [<ffffffff814cf631>] ip_send_skb+0x13/0x70 [<ffffffff814ee83b>] udp_send_skb+0x196/0x219 [<ffffffff814eee78>] udp_sendmsg+0x569/0x7b6 [<ffffffff814cece1>] ? ip_output+0xf5/0xf5 [<ffffffff814ed778>] ? udp_recvmsg+0x176/0x335 [<ffffffff814f758b>] inet_sendmsg+0x5e/0xb5 [<ffffffff810ee67e>] ? __fget_light+0x3f/0x51 [<ffffffff81482356>] sock_sendmsg+0x14/0x39 [<ffffffff81483437>] SyS_sendto+0x12b/0x184 [<ffffffff810e7fa2>] ? SyS_select+0x9f/0xb4 [<ffffffff8156abee>] system_call_fastpath+0x12/0x71 The stacktrace is from Linux kernel 4.1.12 but I've also managed to reproduce this using the current kernel version 4.3. Any help is appreciated. Regards Matthias -- Matthias Gerstner, Dipl.-Wirtsch.-Inf. (FH) Entwicklung NCP engineering GmbH Dombühler Straße 2, D-90449, Nürnberg Geschäftsführer Peter Söll, HRB-Nr: 77 86 Nürnberg Telefon: +49 911 9968-153, Fax: +49 911 9968-229 E-Mail: Matthias.Gerstner@xxxxxxxxx Internet: http://www.ncp-e.com
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/udp.h>
#include <arpa/inet.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <errno.h>

/*
 * Listener half of the reproduction case.
 *
 * Binds a UDP socket to 127.255.255.255:30451, then loops forever:
 * every datagram received is echoed back unchanged to its sender.
 * A failed reply is reported via perror() but does not end the loop,
 * so an EPERM from sendto() stays visible while the program keeps running.
 *
 * Returns 1 on any setup or receive failure; never returns otherwise.
 */
int main(int argc, const char **argv)
{
    /* Compile-time constant so that the buffer is a plain array, not a VLA. */
    enum { MAX_MSG = 1024 };

    const int sock_bool = 1;
    struct sockaddr_in udp_addr;
    uint8_t message[MAX_MSG];
    int udp_sock;

    (void)argc;
    (void)argv;

    udp_sock = socket(AF_INET, SOCK_DGRAM, 0);
    if (udp_sock == -1) {
        perror("Failed to create socket");
        return 1;
    }

    /*
     * bind to a fixed udp port on localhost
     */
    memset(&udp_addr, 0, sizeof(udp_addr));
    udp_addr.sin_family = AF_INET;
    /* some fixed port number for finding each other */
    udp_addr.sin_port = htons(30451);

    /* listen on the localhost for broadcasts */
    if (inet_aton("127.255.255.255", &udp_addr.sin_addr) == 0) {
        printf("Failed to set addr\n");
        return 1;
    }

    if (setsockopt(udp_sock, SOL_SOCKET, SO_BROADCAST,
                   &sock_bool, sizeof(sock_bool)) != 0) {
        printf("Failed to set broadcast option\n");
        return 1;
    }

    /* The reproduction runs several listeners on the same address/port. */
    if (setsockopt(udp_sock, SOL_SOCKET, SO_REUSEADDR,
                   &sock_bool, sizeof(sock_bool)) != 0) {
        printf("Failed to set reuse option\n");
        return 1;
    }

    if (bind(udp_sock, (struct sockaddr *)&udp_addr, sizeof(udp_addr)) != 0) {
        printf("Failed to bind to addr\n");
        return 1;
    }

    for (;;) {
        struct sockaddr_in peer_addr;
        /*
         * The length argument is value-result: recvfrom() overwrites it,
         * so it has to be reset to the buffer size before every call.
         */
        socklen_t ip_len = sizeof(peer_addr);
        ssize_t res;

        res = recvfrom(udp_sock, message, sizeof(message), 0,
                       (struct sockaddr *)&peer_addr, &ip_len);
        if (res == -1) {
            printf("Failed to receive message\n");
            return 1;
        }

        if (ip_len != sizeof(struct sockaddr_in)) {
            printf("Wrong addr len\n");
            return 1;
        }

        /* sin_port is in network byte order; convert it for display. */
        printf("Received message of %zd bytes from %s:%d\n",
               res, inet_ntoa(peer_addr.sin_addr), ntohs(peer_addr.sin_port));

        /*
         * ignore actual message content, we just want to reply.
         * use the same message for this purpose
         */
        res = sendto(udp_sock, message, (size_t)res, 0,
                     (struct sockaddr *)&peer_addr, ip_len);
        if (res == -1) {
            perror("Failed to reply");
        }
    }
}
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/udp.h>
#include <arpa/inet.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <errno.h>

/*
 * Broadcast half of the reproduction case.
 *
 * Binds a UDP socket to 127.0.0.1 on a kernel-chosen port, sends one
 * request datagram to 127.255.255.255:30451 and then prints every reply
 * that arrives on the socket, forever.
 *
 * Returns 1 on any setup, send or receive failure; never returns otherwise.
 */
int main(int argc, const char **argv)
{
    enum {
        MAX_MSG = 1024,   /* receive buffer size (constant, so no VLA) */
        REQUEST_LEN = 17  /* size of the arbitrary request payload */
    };

    const int sock_bool = 1;
    struct sockaddr_in local_addr, peer_addr;
    socklen_t ip_len = sizeof(local_addr);
    uint8_t message[MAX_MSG];
    int udp_sock;

    (void)argc;
    (void)argv;

    /* The payload is arbitrary, but it must not be uninitialized stack data. */
    memset(message, 0, sizeof(message));

    udp_sock = socket(AF_INET, SOCK_DGRAM, 0);
    if (udp_sock == -1) {
        perror("Failed to create socket");
        return 1;
    }

    /*
     * bind to some arbitrary UDP port on localhost
     */
    memset(&local_addr, 0, sizeof(local_addr));
    local_addr.sin_family = AF_INET;
    local_addr.sin_port = 0; /* let the kernel choose the port */

    /* local address the replies will be received on */
    if (inet_aton("127.0.0.1", &local_addr.sin_addr) == 0) {
        printf("Failed to set addr\n");
        return 1;
    }

    /*
     * setup the broadcast target address
     */
    memset(&peer_addr, 0, sizeof(peer_addr));
    peer_addr.sin_family = AF_INET;
    /* fixed port number for broadcasting */
    peer_addr.sin_port = htons(30451);

    if (inet_aton("127.255.255.255", &peer_addr.sin_addr) == 0) {
        printf("Failed to set broadcast addr\n");
        return 1;
    }

    /*
     * setup the socket
     */
    if (setsockopt(udp_sock, SOL_SOCKET, SO_BROADCAST,
                   &sock_bool, sizeof(sock_bool)) != 0) {
        printf("Failed to set broadcast option\n");
        return 1;
    }

    if (bind(udp_sock, (struct sockaddr *)&local_addr, sizeof(local_addr)) != 0) {
        printf("Failed to bind to addr\n");
        return 1;
    }

    /* Read back the address to learn which port was actually assigned. */
    if (getsockname(udp_sock, (struct sockaddr *)&local_addr, &ip_len) != 0) {
        printf("Failed to get sockname\n");
        return 1;
    }

    /* sin_port is in network byte order; convert it for display. */
    printf("Bound to %s:%d for replies\n",
           inet_ntoa(local_addr.sin_addr), ntohs(local_addr.sin_port));

    /* send some arbitrary data */
    if (sendto(udp_sock, message, REQUEST_LEN, 0,
               (struct sockaddr *)&peer_addr, sizeof(peer_addr)) == -1) {
        perror("Failed to send broadcast");
        return 1;
    }

    /* Report the destination port, not the local one. */
    printf("Sent broadcast to %s:%d\n",
           inet_ntoa(peer_addr.sin_addr), ntohs(peer_addr.sin_port));

    for (;;) {
        struct sockaddr_in reply_addr;
        ssize_t res;

        /*
         * The length argument is value-result: recvfrom() overwrites it,
         * so it has to be reset to the buffer size before every call.
         */
        ip_len = sizeof(reply_addr);

        res = recvfrom(udp_sock, message, sizeof(message), 0,
                       (struct sockaddr *)&reply_addr, &ip_len);
        if (res == -1) {
            printf("Failed to receive message\n");
            return 1;
        }

        if (ip_len != sizeof(struct sockaddr_in)) {
            printf("Wrong addr len\n");
            return 1;
        }

        printf("Received broadcast reply of %zd bytes from %s:%d\n",
               res, inet_ntoa(reply_addr.sin_addr), ntohs(reply_addr.sin_port));

        /*
         * ignore actual message content
         */
    }
}
Attachment:
signature.asc
Description: Digital signature