Hello All, When a conntrack is created by the kernel, it is initialized (nf_nat_setup_info sets the IPS_{DST,SRC}_NAT_DONE_BIT bits) and only then added to the hashes (__nf_conntrack_hash_insert), so one conntrack can't be initialized by several threads concurrently. ctnetlink, however, can add an uninitialized conntrack (w/o IPS_{DST,SRC}_NAT_DONE_BIT) to the hashes; several threads can then look up this conntrack and start initializing it concurrently. This is dangerous, because the BUG_ON in nf_nat_setup_info can be triggered. I added a busy loop in nf_nat_setup_info before the BUG_ON to widen the race window, and with it the bug is triggered without any problem. The kernel patch is attached to this message, together with a test script for sending packets and a user-space program that adds a conntrack via netlink. [ 1307.098595] ctnetlink_create_conntrack:1723: ffff8800a527bdf0 [ 1318.374303] nf_nat_setup_info:385 ffff8800a527bdf0 1 88 [ 1318.402729] nf_nat_setup_info:385 ffff8800a527bdf0 1 88 [ 1322.290041] nf_nat_setup_info:388 ffff8800a527bdf0 1 88 [ 1322.298476] nf_nat_setup_info:388 ffff8800a527bdf0 1 38a [ 1322.299851] ------------[ cut here ]------------ [ 1322.300800] kernel BUG at net/netfilter/nf_nat_core.c:390! 
[ 1322.300800] invalid opcode: 0000 [#1] SMP [ 1322.300800] Dumping ftrace buffer: [ 1322.300800] (ftrace buffer empty) [ 1322.300800] Modules linked in: binfmt_misc nf_conntrack_netlink nfnetlink xt_nat iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack nfsv3 nfs_acl nfs lockd sunrpc fscache veth ip6table_filter ip6_tables iptable_filter ip_tables pcspkr virtio_balloon i2c_piix4 virtio_net i2c_core virtio_blk floppy [last unloaded: nf_conntrack] [ 1322.300800] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 3.13.0-rc7+ #78 [ 1322.300800] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 1322.300800] task: ffff8800bb168290 ti: ffff8800bb16c000 task.ti: ffff8800bb16c000 [ 1322.300800] RIP: 0010:[<ffffffffa00692b2>] [<ffffffffa00692b2>] nf_nat_setup_info+0x392/0x400 [nf_nat] [ 1322.300800] RSP: 0018:ffff8800bfa83bb8 EFLAGS: 00010206 [ 1322.300800] RAX: 0000000000000100 RBX: ffff8800a527bdf0 RCX: 0000000000000000 [ 1322.300800] RDX: 0000000000000100 RSI: ffff8800bb1689b8 RDI: 0000000000000246 [ 1322.300800] RBP: ffff8800bfa83c58 R08: 0000000000000000 R09: 0000000000000000 [ 1322.300800] R10: 0000000000000001 R11: ffff8800bfa838e6 R12: 0000000000000001 [ 1322.300800] R13: ffff880037ae4040 R14: ffff8800bfa83c78 R15: 0000000000000000 [ 1322.300800] FS: 0000000000000000(0000) GS:ffff8800bfa80000(0000) knlGS:0000000000000000 [ 1322.300800] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 1322.300800] CR2: 00007ffb9b71e000 CR3: 00000000b92d5000 CR4: 00000000000006e0 [ 1322.300800] Stack: [ 1322.300800] ffffffffa00440b8 ffff8800b992fd80 ffff8800a249ab80 ffff8800b992fd80 [ 1322.300800] ffff8800b0d06400 00000000a0087c85 ffffe8ffffc80574 0000000100000001 [ 1322.300800] ffffffffa0046598 ffff8800bb304480 0000000000000002 ffff8800b9664000 [ 1322.300800] Call Trace: [ 1322.300800] <IRQ> [ 1322.300800] [<ffffffffa00440b8>] ? 
ipt_do_table+0x2e8/0x4a5 [ip_tables] [ 1322.300800] [<ffffffffa007d2d2>] nf_nat_ipv4_fn+0x2d2/0x330 [iptable_nat] [ 1322.300800] [<ffffffffa007d5de>] nf_nat_ipv4_in+0x2e/0x84 [iptable_nat] [ 1322.300800] [<ffffffff8159d6f0>] ? inet_del_offload+0x40/0x40 [ 1322.300800] [<ffffffff8159435a>] nf_iterate+0x9a/0xb0 [ 1322.300800] [<ffffffff8159d6f0>] ? inet_del_offload+0x40/0x40 [ 1322.300800] [<ffffffff8159440c>] nf_hook_slow+0x9c/0x160 [ 1322.300800] [<ffffffff8159d6f0>] ? inet_del_offload+0x40/0x40 [ 1322.300800] [<ffffffff8159e3a7>] ip_rcv+0x2f7/0x3d0 [ 1322.300800] [<ffffffff815635a2>] __netif_receive_skb_core+0x4e2/0x880 [ 1322.300800] [<ffffffff815631d2>] ? __netif_receive_skb_core+0x112/0x880 [ 1322.300800] [<ffffffff81563958>] __netif_receive_skb+0x18/0x60 [ 1322.300800] [<ffffffff8156460e>] process_backlog+0xbe/0x1a0 [ 1322.300800] [<ffffffff81563e42>] net_rx_action+0x162/0x270 [ 1322.300800] [<ffffffff8105dee4>] __do_softirq+0x104/0x2a0 [ 1322.300800] [<ffffffff8105e45d>] irq_exit+0xcd/0xe0 [ 1322.300800] [<ffffffff8167f586>] do_IRQ+0x56/0xc0 [ 1322.300800] [<ffffffff81674f32>] common_interrupt+0x72/0x72 [ 1322.300800] <EOI> [ 1322.300800] [<ffffffff8103d8e6>] ? native_safe_halt+0x6/0x10 [ 1322.300800] [<ffffffff810a743d>] ? 
trace_hardirqs_on+0xd/0x10 [ 1322.300800] [<ffffffff8100b284>] default_idle+0x24/0xc0 [ 1322.300800] [<ffffffff8100bb4e>] arch_cpu_idle+0x2e/0x40 [ 1322.300800] [<ffffffff810b9845>] cpu_startup_entry+0xc5/0x290 [ 1322.300800] [<ffffffff8102fe6d>] start_secondary+0x21d/0x2d0 [ 1322.300800] Code: 00 48 01 d0 e9 b8 fe ff ff 48 81 8b b0 00 00 00 00 01 00 00 e9 9e fd ff ff 48 83 8b b0 00 00 00 20 e9 5f fd ff ff e8 ee f5 fe e0 <0f> 0b 4c 8b 8b b0 00 00 00 45 89 e0 48 89 d9 ba 81 01 00 00 48 [ 1322.300800] RIP [<ffffffffa00692b2>] nf_nat_setup_info+0x392/0x400 [nf_nat] [ 1322.300800] RSP <ffff8800bfa83bb8> [ 1322.383568] ---[ end trace c3cef76423498e85 ]--- [ 1322.384778] Kernel panic - not syncing: Fatal exception in interrupt [ 1322.385766] Dumping ftrace buffer: [ 1322.385766] (ftrace buffer empty)
From f501d2e80c2811aa7d69110d72f24e6b7173d920 Mon Sep 17 00:00:00 2001 From: Andrey Vagin <avagin@xxxxxxxxxx> Date: Mon, 6 Jan 2014 18:59:37 +0400 Subject: [PATCH] debug Signed-off-by: Andrey Vagin <avagin@xxxxxxxxxx> --- net/core/dev.c | 2 ++ net/netfilter/nf_conntrack_netlink.c | 1 + net/netfilter/nf_nat_core.c | 8 ++++++++ 3 files changed, 11 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index 4fc1722..671e4b4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2981,7 +2981,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_sock_flow_table *sock_flow_table; int cpu = -1; u16 tcpu; + static int xxx = 0; + return (xxx++) % 2; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); if (unlikely(index >= dev->real_num_rx_queues)) { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 08870b8..2f6c724 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1720,6 +1720,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, if (tstamp) tstamp->start = ktime_to_ns(ktime_get_real()); + printk("%s:%d: %p\n", __func__, __LINE__, ct); err = nf_conntrack_hash_check_insert(ct); if (err < 0) goto err2; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 63a8154..5600bfd 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -379,6 +379,14 @@ nf_nat_setup_info(struct nf_conn *ct, NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC || maniptype == NF_NAT_MANIP_DST); + if (net != &init_net) + { + long long i; + printk("%s:%d %p %d %lx\n", __func__, __LINE__, ct, maniptype, ct->status); + for (i = 0; i < 10000000000; i++) + asm("nop"); + printk("%s:%d %p %d %lx\n", __func__, __LINE__, ct, maniptype, ct->status); + } BUG_ON(nf_nat_initialized(ct, maniptype)); /* What we've got will look like inverse of reply. Normally -- 1.8.4.2
#include <stdio.h> #include <stdlib.h> #include <string.h> #include <errno.h> #include <arpa/inet.h> #include <libnetfilter_conntrack/libnetfilter_conntrack.h> #include <libnetfilter_conntrack/libnetfilter_conntrack_tcp.h> int main(void) { int ret; struct nfct_handle *h; struct nf_conntrack *ct; ct = nfct_new(); if (!ct) { perror("nfct_new"); return 0; } nfct_set_attr_u8(ct, ATTR_L3PROTO, AF_INET); nfct_set_attr_u32(ct, ATTR_IPV4_SRC, inet_addr("192.168.122.1")); nfct_set_attr_u32(ct, ATTR_IPV4_DST, inet_addr("192.168.122.56")); nfct_set_attr_u8(ct, ATTR_L4PROTO, IPPROTO_TCP); nfct_set_attr_u16(ct, ATTR_PORT_SRC, htons(6588)); nfct_set_attr_u16(ct, ATTR_PORT_DST, htons(6588)); nfct_setobjopt(ct, NFCT_SOPT_SETUP_REPLY); nfct_setobjopt(ct, NFCT_SOPT_UNDO_SNAT); nfct_set_attr_u8(ct, ATTR_TCP_STATE, TCP_CONNTRACK_SYN_SENT); nfct_set_attr_u32(ct, ATTR_TIMEOUT, 30); h = nfct_open(CONNTRACK, 0); if (!h) { perror("nfct_open"); nfct_destroy(ct); return -1; } ret = nfct_query(h, NFCT_Q_CREATE, ct); printf("TEST: create conntrack "); if (ret == -1) printf("(%d)(%s)\n", ret, strerror(errno)); else printf("(OK)\n"); nfct_close(h); nfct_destroy(ct); ret == -1 ? exit(EXIT_FAILURE) : exit(EXIT_SUCCESS); }
from scapy.all import *

# Send four identical TCP SYN packets to 192.168.122.56, source and
# destination port 6588.
syn_packet = IP(dst="192.168.122.56") / TCP(flags="S", sport=6588, dport=6588)

# range() instead of xrange(): same behavior for four iterations, and it
# works on both Python 2 and Python 3.
for _ in range(4):
    send(syn_packet, verbose=0)