I have been getting beaten up by this bug for a few days now. I made a small test program for you netfilter experts to try, because I'm running out of ideas over here. Attached is a C program that triggers the BUG_ON. I have narrowed the possible causes down to the portion of my code that sends NFT_MSG_NEWRULE; if you comment that out, the bug does not happen. Let me know if you need more information or have a patch to try. The kernel config is nothing special: a minimal x86 qemu build with ipv{4,6} and full nftables options, no modules.

------------[ cut here ]------------
kernel BUG at net/netfilter/nf_tables_api.c:816!
invalid opcode: 0000 [#1]
CPU: 0 PID: 42 Comm: kworker/u2:2 Not tainted 4.9.40 #1
Workqueue: netns cleanup_net
task: c0225540 task.stack: c026e000
EIP: 0060:[<c1289440>] EFLAGS: 00000202 CPU: 0
EIP is at nf_tables_table_destroy.isra.23.part.24+0x0/0x10
EAX: f4e613d8 EBX: f4e613d8 ECX: 000000a8 EDX: 00000001
ESI: f4e613d8 EDI: f4e613d8 EBP: f4e613e8 ESP: c026fe90
DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068
CR0: 80050033 CR2: 08197148 CR3: 002a5000 CR4: 00000690
Stack:
c128caa6 f4e613e8 f4e4a000 f4e52464 f4e52450 f4e52464 f4e4a000 f4e52450
f4e613d8 f4e61664 00000000 00000000 00000000 00000000 f4e4a000 c026ff08
c1470348 c147034c c133f31e f4e4a000 c124e506 c147033c c026fee8 c026ff10
Call Trace:
[<c128caa6>] ? nft_unregister_afinfo+0x1f6/0x200
[<c133f31e>] ? nf_tables_ipv6_exit_net+0xe/0x20
[<c124e506>] ? ops_exit_list.isra.6+0x26/0x50
[<c124edc5>] ? cleanup_net+0x135/0x210
[<c1044c1a>] ? pick_next_task_fair+0xba/0x120
[<c1038c7e>] ? process_one_work+0x19e/0x350
[<c1038e77>] ? worker_thread+0x47/0x4a0
[<c1038e30>] ? process_one_work+0x350/0x350
[<c103d248>] ? kthread+0x98/0xb0
[<c103d1b0>] ? kthread_worker_fn+0xb0/0xb0
[<c134c377>] ? ret_from_fork+0x1b/0x28
Code: 8b 04 24 8d 53 b4 8b 08 89 f8 e8 7c f5 fe ff 8b 5b 08 83 eb 08 39 de 75 c2 83 c4 04 5b 5e 5f 5d c3 8d 76 00 8d bc 27 00 00 00 00 <0f> 0b 8d b4 26 00 00 00 00 8d bc 27 00 00 00 00 8b 50 1c 85 d2
EIP: [<c1289440>] nf_tables_table_destroy.isra.23.part.24+0x0/0x10 SS:ESP 0068:c026fe90
---[ end trace 20fa171526d8ba2a ]---
BUG: unable to handle kernel paging request at fffffff0
IP: [<c103d4a6>] kthread_data+0x6/0x10
*pde = 014b5067 *pte = 00000000
Oops: 0000 [#2]
CPU: 0 PID: 42 Comm: kworker/u2:2 Tainted: G D 4.9.40 #1
task: c0225540 task.stack: c026e000
EIP: 0060:[<c103d4a6>] EFLAGS: 00000002 CPU: 0
EIP is at kthread_data+0x6/0x10
EAX: 00000000 EBX: c0225540 ECX: 00000001 EDX: c0225570
ESI: 00000000 EDI: c026ff98 EBP: c026ff80 ESP: c026ff64
DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068
CR0: 80050033 CR2: 00000014 CR3: 002a5000 CR4: 00000690
Stack:
c10387f5 c1349a27 c0225750 c026fdf0 c0225540 c026fdf0 c026ff98 c026ff88
c104210d 00000000 c102aee9 c02256cc 0126ff90 c026ff98 c026ff98 0000000b
c0270000 c13f9a10 00000000 c134cdfc 00000000 00000000 00000000 00000000
Call Trace:
[<c10387f5>] ? wq_worker_sleeping+0x5/0x70
[<c1349a27>] ? __schedule+0x207/0x350
[<c104210d>] ? do_task_dead+0x1d/0x20
[<c102aee9>] ? do_exit+0x4c9/0x7f0
[<c134cdfc>] ? rewind_stack_do_exit+0x10/0x12
Code: 27 00 00 00 00 85 c0 74 03 c6 00 00 a1 a8 11 45 c1 8b 80 e4 01 00 00 8b 40 e8 d1 e8 83 e0 01 c3 90 8d 74 26 00 8b 80 e4 01 00 00 <8b> 40 f0 c3 8d b6 00 00 00 00 83 ec 04 8b 90 e4 01 00 00 b9 04
EIP: [<c103d4a6>] kthread_data+0x6/0x10 SS:ESP 0068:c026ff64
CR2: 00000000fffffff0
---[ end trace 20fa171526d8ba2b ]---

/* (c) 2017 Michael R. Tirado * GPLv3+, GNU General Public License, version 3 or later. * * build with: `gcc -o netph netph.c -lmnl -lnftnl` */ #define _GNU_SOURCE #include <stdio.h> #include <errno.h> #include <sched.h> #include <fcntl.h> #include <string.h> #include <stdlib.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <linux/netfilter/nfnetlink.h> #include <libmnl/libmnl.h> #include <libnftnl/common.h> #include <libnftnl/ruleset.h> #include <libnftnl/table.h> #include <libnftnl/chain.h> #include <libnftnl/set.h> #include <libnftnl/rule.h> #include <libnftnl/batch.h> #define ST_SOCKBUF_SIZE 8192 #define list_for_each(pos, head) \ for (pos = (head)->next; pos != (head); pos = pos->next) struct net_data { struct nftnl_ruleset *ruleset; }; const char fw[]=" \n\ flush ruleset \n\ \n\ table ip firewall { \n\ chain incoming { \n\ type filter hook input priority 0; policy drop; \n\ ct state invalid drop \n\ ct state established,related accept \n\ } \n\ } \n\ \n\ table ip6 firewall { \n\ chain incoming { \n\ type filter hook input priority 0; policy drop; \n\ ct state invalid drop \n\ ct state established,related accept \n\ icmpv6 type { \n\ nd-neighbor-solicit, \n\ nd-neighbor-advert, \n\ nd-router-advert, \n\ nd-redirect \n\ } accept \n\ } \n\ } \n\ "; static int mnl_talk(struct mnl_socket *nf_sock, const void *data, unsigned int len, int (*cb)(const struct nlmsghdr *nlh, void *data), void *cb_data, unsigned int seq) { char buf[ST_SOCKBUF_SIZE*2]; uint32_t portid = mnl_socket_get_portid(nf_sock); int ret; if (len >= ST_SOCKBUF_SIZE) { printf("bad len?\n"); return -1; } if (mnl_socket_sendto(nf_sock, data, len) < 0) { printf("sendto\n"); return -1; } ret = mnl_socket_recvfrom(nf_sock, buf, ST_SOCKBUF_SIZE); while (cb && ret > 0) { errno = 0; ret = mnl_cb_run(buf, ret, seq, portid, cb, cb_data); if (ret <= 0) { printf("errno=%s\n", strerror(errno)); printf("cb_run=%d\n",ret); goto out; } ret = mnl_socket_recvfrom(nf_sock, buf, 
sizeof(buf)); } out: if (ret < 0 && errno == EAGAIN) return 0; return ret; } static int setid; static int clone_cb(const struct nlmsghdr *nlmsg_hdr, void *data) { void *vtype; unsigned short msg_type = NFNL_MSG_TYPE(nlmsg_hdr->nlmsg_type); if (NFNL_SUBSYS_ID(nlmsg_hdr->nlmsg_type) != NFNL_SUBSYS_NFTABLES) { printf("unexpected subsys id\n"); return MNL_CB_ERROR; } switch (msg_type) { case NFT_MSG_NEWTABLE: vtype = nftnl_table_alloc(); if (!vtype) return MNL_CB_ERROR; if (nftnl_table_nlmsg_parse(nlmsg_hdr, vtype)) return MNL_CB_ERROR; nftnl_table_list_add(vtype, ((struct nftnl_table_list *)data)); break; case NFT_MSG_NEWCHAIN: vtype = nftnl_chain_alloc(); if (!vtype) return MNL_CB_ERROR; if (nftnl_chain_nlmsg_parse(nlmsg_hdr, vtype)) return MNL_CB_ERROR; nftnl_chain_list_add(vtype, ((struct nftnl_chain_list *)data)); break; case NFT_MSG_NEWSET: vtype = nftnl_set_alloc(); if (!vtype) return MNL_CB_ERROR; if (nftnl_set_nlmsg_parse(nlmsg_hdr, vtype)) return MNL_CB_ERROR; nftnl_set_set_u32(vtype, NFTNL_SET_ID, setid++); nftnl_set_list_add(vtype, ((struct nftnl_set_list *)data)); break; case NFT_MSG_NEWSETELEM: vtype = data; if (nftnl_set_elems_nlmsg_parse(nlmsg_hdr, vtype)) return MNL_CB_ERROR; break; case NFT_MSG_NEWRULE: vtype = nftnl_rule_alloc(); if (!vtype) return MNL_CB_ERROR; if (nftnl_rule_nlmsg_parse(nlmsg_hdr, vtype)) return MNL_CB_ERROR; nftnl_rule_unset(vtype, NFTNL_RULE_HANDLE); nftnl_rule_unset(vtype, NFTNL_RULE_POSITION); nftnl_rule_list_add(vtype, ((struct nftnl_rule_list *)data)); break; default: printf("unexpected msg type: %d\n", msg_type); return MNL_CB_ERROR; } return MNL_CB_OK; } static int clone_list(unsigned short msg_type, struct mnl_socket *nl_sock, void *list) { char buf[ST_SOCKBUF_SIZE]; struct nlmsghdr *hdr; unsigned int sq = rand()%20000; unsigned int flags; int r; struct nftnl_set *nfset; struct nftnl_rule *nfrule; setid = 1; memset(buf, 0, sizeof(buf)); switch (msg_type) { /* 0 == NFPROTO_UNSPEC */ case NFT_MSG_GETTABLE: flags = 
NLM_F_DUMP; hdr = nftnl_table_nlmsg_build_hdr(buf, msg_type, 0, flags, sq); break; case NFT_MSG_GETCHAIN: flags = NLM_F_DUMP; hdr = nftnl_chain_nlmsg_build_hdr(buf, msg_type, 0, flags, sq); break; case NFT_MSG_GETSET: flags = NLM_F_DUMP; hdr = nftnl_set_nlmsg_build_hdr(buf, msg_type, 0, flags, sq); nfset = nftnl_set_alloc(); if (nfset == NULL) return -1; /* was this needed? */ nftnl_set_nlmsg_build_payload(hdr, nfset); nftnl_set_free(nfset); break; case NFT_MSG_GETSETELEM: return 0; nfset = list; flags = NLM_F_DUMP; hdr = nftnl_set_nlmsg_build_hdr(buf, msg_type, nftnl_set_get_u32(nfset, NFTNL_SET_FAMILY), flags, sq); nftnl_set_elems_nlmsg_build_payload(hdr, nfset); break; case NFT_MSG_GETRULE: flags = NLM_F_DUMP; (void) nfrule; hdr = nftnl_rule_nlmsg_build_hdr(buf, msg_type, 0, flags, sq); break; default: return -1; } r = mnl_talk(nl_sock, hdr, hdr->nlmsg_len, clone_cb, list, sq); if (r < 0 || r >= ST_SOCKBUF_SIZE) return -1; return 0; } /* static int get_setelems(struct nftnl_set *cur, void *data) { printf("get_setelems\n"); printf("set name: %s\n", nftnl_set_get_str(cur, NFTNL_SET_NAME)); printf("set table: %s\n", nftnl_set_get_str(cur, NFTNL_SET_TABLE)); printf("set family: "); switch (nftnl_set_get_u32(cur, NFTNL_SET_FAMILY) ) { case NFPROTO_IPV4: printf("ip4\n"); break; case NFPROTO_IPV6: printf("ip6\n"); break; case NFPROTO_BRIDGE: printf("bridge\n"); break; case NFPROTO_ARP: printf("arp\n"); break; default: printf("?\n"); } return clone_list(NFT_MSG_GETSETELEM, data, cur); } */ static int create_ruleset(struct net_data *nnp, struct mnl_socket *nl_sock) { struct nftnl_ruleset *ruleset = NULL; struct nftnl_table_list *table_list = NULL; struct nftnl_chain_list *chain_list = NULL; struct nftnl_set_list *set_list = NULL; struct nftnl_rule_list *rule_list = NULL; table_list = nftnl_table_list_alloc(); if (table_list == NULL) return -1; if (clone_list(NFT_MSG_GETTABLE, nl_sock, table_list)) goto free_fail; chain_list = nftnl_chain_list_alloc(); if (chain_list == 
NULL) goto free_fail; if (clone_list(NFT_MSG_GETCHAIN, nl_sock, chain_list)) goto free_fail; set_list = nftnl_set_list_alloc(); if (set_list == NULL) goto free_fail; if (clone_list(NFT_MSG_GETSET, nl_sock, set_list)) goto free_fail; /* get set elements */ /* if (nftnl_set_list_foreach(set_list, get_setelems, nl_sock)) { printf("problem getting set elements\n"); goto free_fail; } */ rule_list = nftnl_rule_list_alloc(); if (rule_list == NULL) goto free_fail; if (clone_list(NFT_MSG_GETRULE, nl_sock, rule_list)) goto free_fail; ruleset = nftnl_ruleset_alloc(); if (ruleset == NULL) goto free_fail; nftnl_ruleset_set(ruleset, NFTNL_RULESET_TABLELIST, table_list); nftnl_ruleset_set(ruleset, NFTNL_RULESET_CHAINLIST, chain_list); nftnl_ruleset_set(ruleset, NFTNL_RULESET_SETLIST, set_list); nftnl_ruleset_set(ruleset, NFTNL_RULESET_RULELIST, rule_list); nnp->ruleset = ruleset; if (nftnl_ruleset_fprintf(stdout, ruleset, NFTNL_OUTPUT_JSON, 0) < 0) { printf("ruleset print failure\n"); return -1; } return 0; free_fail: printf("create_ruleset failed\n"); if (table_list) nftnl_table_list_free(table_list); if (chain_list) nftnl_chain_list_free(chain_list); if (set_list) nftnl_set_list_free(set_list); if (rule_list) nftnl_rule_list_free(rule_list); return -1; } static int netns_get_nftables(struct net_data *nnp) { struct mnl_socket *nl_sock; nl_sock = mnl_socket_open(NETLINK_NETFILTER); if (nl_sock == NULL) { printf("mnl_socket_open: %s\n", strerror(errno)); return -1; } if (mnl_socket_bind(nl_sock, 0, MNL_SOCKET_AUTOPID)) { printf("mnl_socket_bind: %s\n", strerror(errno)); mnl_socket_close(nl_sock); return -1; } if (create_ruleset(nnp, nl_sock)) { mnl_socket_close(nl_sock); return -1; } mnl_socket_close(nl_sock); return 0; } struct cb_data { struct mnl_socket *sock; unsigned int msg_type; unsigned int portid; }; static int noop_cb(const struct nlmsghdr *nlmsg_hdr, void *data) { (void)nlmsg_hdr; (void)data; return MNL_CB_OK; } int nftnl_send_item(void *cur, void *data) { char 
buf[ST_SOCKBUF_SIZE*2]; struct mnl_nlmsg_batch *batch = NULL; struct nlmsghdr *hdr; struct cb_data *dat = data; unsigned int sq, item_sq; int r; uint32_t family, flags; int batching; /* */ memset(buf, 0, sizeof(buf)); sq = (rand()%20000); batching = nftnl_batch_is_supported(); batch = mnl_nlmsg_batch_start(buf, ST_SOCKBUF_SIZE); if (batch == NULL) return MNL_CB_ERROR; if (batching) { nftnl_batch_begin(mnl_nlmsg_batch_current(batch), sq); mnl_nlmsg_batch_next(batch); } switch (dat->msg_type) { case NFT_MSG_NEWTABLE: flags = NLM_F_ACK | NLM_F_CREATE; family = nftnl_table_get_u32(cur, NFTNL_TABLE_FAMILY); hdr = nftnl_table_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), dat->msg_type, family, flags, ++sq); nftnl_table_nlmsg_build_payload(hdr, (struct nftnl_table *)cur); break; case NFT_MSG_NEWCHAIN: flags = NLM_F_ACK; nftnl_chain_unset(cur, NFTNL_CHAIN_HANDLE); family = nftnl_chain_get_u32(cur, NFTNL_CHAIN_FAMILY); hdr = nftnl_chain_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), dat->msg_type, family, flags, ++sq); nftnl_chain_nlmsg_build_payload(hdr, (struct nftnl_chain *)cur); break; case NFT_MSG_NEWSET: flags = NLM_F_ACK | NLM_F_CREATE; family = nftnl_set_get_u32(cur, NFTNL_SET_FAMILY); hdr = nftnl_set_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), dat->msg_type, family, flags, ++sq); nftnl_set_nlmsg_build_payload(hdr, (struct nftnl_set *)cur); break; case NFT_MSG_NEWRULE: flags = NLM_F_ACK | NLM_F_CREATE | NLM_F_APPEND; family = nftnl_rule_get_u32(cur, NFTNL_RULE_FAMILY); hdr = nftnl_rule_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), dat->msg_type, family, flags, ++sq); nftnl_rule_nlmsg_build_payload(hdr, (struct nftnl_rule *)cur); break; default: mnl_nlmsg_batch_stop(batch); return MNL_CB_ERROR; } item_sq = sq; mnl_nlmsg_batch_next(batch); if (batching) { nftnl_batch_end(mnl_nlmsg_batch_current(batch), ++sq); mnl_nlmsg_batch_next(batch); } r = mnl_socket_sendto(dat->sock, mnl_nlmsg_batch_head(batch), mnl_nlmsg_batch_size(batch)); if (r < 0 || r >= 
ST_SOCKBUF_SIZE) { printf("r == %d -- %s\n", r, strerror(errno)); mnl_nlmsg_batch_stop(batch); return MNL_CB_ERROR; } mnl_nlmsg_batch_stop(batch); r = mnl_socket_recvfrom(dat->sock, buf, ST_SOCKBUF_SIZE); while (r > 0) { r = mnl_cb_run(buf, r, item_sq, dat->portid, noop_cb, &r); if (r <= 0) break; r = mnl_socket_recvfrom(dat->sock, buf, ST_SOCKBUF_SIZE); } if (r < 0) { return MNL_CB_ERROR; } return MNL_CB_OK; } int send_table(struct nftnl_table *table, void *data) { return nftnl_send_item(table, data); } int send_chain(struct nftnl_chain *chain, void *data) { return nftnl_send_item(chain, data); } int send_set(struct nftnl_set *set, void *data) { return nftnl_send_item(set, data); } int send_rule(struct nftnl_rule *rule, void *data) { return nftnl_send_item(rule, data); } static int netns_replace_nftables(struct net_data *nnp) { struct mnl_socket *nl_sock; struct nftnl_table_list *table_list; struct nftnl_chain_list *chain_list; struct nftnl_set_list *set_list; struct nftnl_rule_list *rule_list; struct cb_data dat; nl_sock = mnl_socket_open(NETLINK_NETFILTER); if (nl_sock == NULL) { printf("mnl_socket_open: %s\n", strerror(errno)); return -1; } if (mnl_socket_bind(nl_sock, 0, MNL_SOCKET_AUTOPID)) { printf("mnl_socket_bind: %s\n", strerror(errno)); goto failure; } dat.sock = nl_sock; dat.portid = mnl_socket_get_portid(nl_sock); /* table list */ dat.msg_type = NFT_MSG_NEWTABLE; table_list = nftnl_ruleset_get(nnp->ruleset, NFTNL_RULESET_TABLELIST); if (!table_list) goto failure; if (nftnl_table_list_foreach(table_list, send_table, &dat)) { printf("problem adding table_list\n"); goto failure; } /* chain list */ dat.msg_type = NFT_MSG_NEWCHAIN; chain_list = nftnl_ruleset_get(nnp->ruleset, NFTNL_RULESET_CHAINLIST); if (!chain_list) goto failure; if (nftnl_chain_list_foreach(chain_list, send_chain, &dat)) { printf("problem adding chain_list\n"); goto failure; } /* set list */ dat.msg_type = NFT_MSG_NEWSET; set_list = nftnl_ruleset_get(nnp->ruleset, NFTNL_RULESET_SETLIST); 
if (!set_list) goto failure; if (nftnl_set_list_foreach(set_list, send_set, &dat)) { printf("problem adding set_list\n"); goto failure; } /* rule list */ dat.msg_type = NFT_MSG_NEWRULE; rule_list = nftnl_ruleset_get(nnp->ruleset, NFTNL_RULESET_RULELIST); if (!rule_list) goto failure; if (nftnl_rule_list_foreach(rule_list, send_rule, &dat)) { printf("problem adding rule_list\n"); goto failure; } mnl_socket_close(nl_sock); return 0; failure: mnl_socket_close(nl_sock); return -1; } int main() { int fd; struct net_data data; memset(&data, 0, sizeof(data)); /* write nftables file */ fd = open("testtables.nft", O_RDWR|O_TRUNC|O_CREAT, 0755); if (fd == -1) { printf("open(testtables.nft): %s\n", strerror(errno)); return -1; } if (write(fd, fw, sizeof(fw)) != sizeof(fw)) { printf("write error\n"); return -1; } close(fd); /* install tables */ system("nft -f testtables.nft"); if (netns_get_nftables(&data)) { printf("failed to get tables\n"); return -1; } if (unshare(CLONE_NEWNET)) { printf("unshare(CLONE_NEWNET): %s\n", strerror(errno)); return -1; } if (netns_replace_nftables(&data)) { printf("failed to replace tables\n"); return -1; } return 0; }