Hi there!
We experienced a major network outage today when upgrading kernels.
The affected servers run the VRF + conntrack + nftables combo. They are edge
firewalls/NAT boxes, meaning most interesting traffic is forwarded rather
than locally generated.
What we observed is that NATed traffic in the reply direction is never
forwarded back to the original client.
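For illustration, this is roughly how the failure looks on the wire
(addresses are from the configuration further down; the commands are a
sketch, not verbatim captures):
=== 8< ===
user@cloudgw2002-dev:~ $ sudo tcpdump -ni eno2.2120 host 185.15.57.1
[SNATed packets leave, and the replies do arrive on this interface]
user@cloudgw2002-dev:~ $ sudo tcpdump -ni eno2.2107 net 172.16.128.0/24
[the replies are never forwarded back out here to the internal client]
=== 8< ===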
Good kernel: 5.10.40 (Debian 5.10.0-0.bpo.7-amd64)
Bad kernel: 5.10.70 (Debian 5.10.0-0.bpo.9-amd64)
I suspect the problem may be related to this patch:
https://x-lore.kernel.org/stable/20210824165908.709932-58-sashal@xxxxxxxxxx/
Would it be possible to confirm the offending change, and to get some advice
on how to work around the problem? I can run more tests and provide
additional information on request.
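If useful, we could also bisect between the two stable tags, roughly like
this (a sketch; we would test NATed forwarding through the VRF at each step):
=== 8< ===
user@host:~/linux-stable $ git bisect start v5.10.70 v5.10.40
[build and boot the candidate kernel, try to reproduce, then]
user@host:~/linux-stable $ git bisect good    # or: git bisect bad
=== 8< ===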
Some bits of our configuration follow. The setup is rather simple: two
interfaces, one pointing to the internet (eno2.2120) and the other to the
internal network (eno2.2107). Both interfaces are attached to a VRF device
'vrf-cloudgw'. The VRF is used to isolate forwarded traffic from the host
network (eno1). The nftables firewall is split as well: a table 'basefirewall'
holds the input/output chains, and a table 'cloudgw' handles forwarded
traffic and performs the NAT.
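For reference, the VRF attachment amounts to something like the following
(the routing table number 10 is illustrative, not necessarily what our
automation uses):
=== 8< ===
ip link add vrf-cloudgw type vrf table 10
ip link set eno2.2107 master vrf-cloudgw
ip link set eno2.2120 master vrf-cloudgw
=== 8< ===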
Interface setup:
=== 8< ===
user@cloudgw2002-dev:~ $ ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group
default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
2: eno1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group
default qlen 1000
link/ether 2c:ea:7f:7b:e1:04 brd ff:ff:ff:ff:ff:ff
inet 10.192.20.18/24 brd 10.192.20.255 scope global eno1
valid_lft forever preferred_lft forever
inet6 2620:0:860:118:10:192:20:18/64 scope global
valid_lft 2591995sec preferred_lft 604795sec
inet6 fe80::2eea:7fff:fe7b:e104/64 scope link
valid_lft forever preferred_lft forever
3: eno2: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group
default qlen 1000
link/ether 2c:ea:7f:7b:e1:05 brd ff:ff:ff:ff:ff:ff
inet6 fe80::2eea:7fff:fe7b:e105/64 scope link
valid_lft forever preferred_lft forever
4: vrf-cloudgw: <NOARP,MASTER,UP,LOWER_UP> mtu 65575 qdisc noqueue state UP
group default qlen 1000
link/ether 1e:04:99:69:3e:56 brd ff:ff:ff:ff:ff:ff
5: eno2.2107@eno2: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue
master vrf-cloudgw state UP group default qlen 1000
link/ether 2c:ea:7f:7b:e1:05 brd ff:ff:ff:ff:ff:ff
inet 185.15.57.9/30 scope global eno2.2107
valid_lft forever preferred_lft forever
inet6 fe80::2eea:7fff:fe7b:e105/64 scope link
valid_lft forever preferred_lft forever
6: eno2.2120@eno2: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue
master vrf-cloudgw state UP group default qlen 1000
link/ether 2c:ea:7f:7b:e1:05 brd ff:ff:ff:ff:ff:ff
inet 208.80.153.189/29 brd 208.80.153.191 scope global eno2.2120
valid_lft forever preferred_lft forever
inet 208.80.153.190/29 scope global secondary eno2.2120
valid_lft forever preferred_lft forever
inet6 fe80::2eea:7fff:fe7b:e105/64 scope link
valid_lft forever preferred_lft forever
=== 8< ===
VRF routing table:
=== 8< ===
user@cloudgw2002-dev:~ $ ip route list vrf vrf-cloudgw
default via 208.80.153.185 dev eno2.2120 onlink
172.16.128.0/24 via 185.15.57.10 dev eno2.2107 proto 112 onlink
185.15.57.0/29 via 185.15.57.10 dev eno2.2107 proto 112 onlink
185.15.57.8/30 dev eno2.2107 proto kernel scope link src 185.15.57.9
208.80.153.184/29 dev eno2.2120 proto kernel scope link src 208.80.153.189
=== 8< ===
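For completeness, this is roughly how we run diagnostics in the VRF context
(the targets are the upstream gateway and the internal next hop from the
table above):
=== 8< ===
user@cloudgw2002-dev:~ $ sudo ip vrf exec vrf-cloudgw ping -c1 208.80.153.185
user@cloudgw2002-dev:~ $ sudo ip vrf exec vrf-cloudgw ping -c1 185.15.57.10
=== 8< ===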
The nftables ruleset follows:
table inet basefirewall {
set monitoring_ipv4 {
type ipv4_addr
elements = { 208.80.153.84, 208.80.154.88 }
}
set monitoring_ipv6 {
type ipv6_addr
elements = { 2620:0:860:3:208:80:153:84,
2620:0:861:3:208:80:154:88 }
}
set ssh_allowed_ipv4 {
type ipv4_addr
elements = { 10.64.32.25, 10.192.32.49,
10.192.48.16, 91.198.174.6,
103.102.166.6, 198.35.26.13,
208.80.153.54, 208.80.155.110 }
}
set ssh_allowed_ipv6 {
type ipv6_addr
elements = { 2001:df2:e500:1:103:102:166:6,
2620:0:860:2:208:80:153:54,
2620:0:860:103:10:192:32:49,
2620:0:860:104:10:192:48:16,
2620:0:861:4:208:80:155:110,
2620:0:861:103:10:64:32:25,
2620:0:862:1:91:198:174:6,
2620:0:863:1:198:35:26:13 }
}
set prometheus_nodes_ipv4 {
type ipv4_addr
elements = { 10.192.0.145, 10.192.16.189 }
}
set prometheus_nodes_ipv6 {
type ipv6_addr
elements = { 2620:0:860:101:10:192:0:145,
2620:0:860:102:10:192:16:189 }
}
set prometheus_ports {
type inet_service
elements = { 9100, 9105, 9710 }
}
chain input {
type filter hook input priority filter; policy drop;
ct state established,related accept
iifname "lo" accept
meta pkttype multicast accept
meta l4proto ipv6-icmp accept
ip protocol icmp accept
ip saddr @monitoring_ipv4 ct state new accept
ip6 saddr @monitoring_ipv6 ct state new accept
ip saddr @ssh_allowed_ipv4 tcp dport 22 ct state new counter packets 1 bytes 60 accept
ip6 saddr @ssh_allowed_ipv6 tcp dport 22 ct state new counter packets 6 bytes 480 accept
ip saddr @prometheus_nodes_ipv4 tcp dport @prometheus_ports ct state new counter packets 421 bytes 25260 accept
ip6 saddr @prometheus_nodes_ipv6 tcp dport @prometheus_ports ct state new counter packets 422 bytes 33760 accept
ip saddr 10.192.20.18 tcp dport 3780 ct state new accept
counter packets 1213 bytes 68460 comment "counter dropped packets"
}
chain output {
type filter hook output priority filter; policy accept;
counter packets 67940 bytes 38842995 comment "counter accepted packets"
}
}
table inet cloudgw {
set dmz_cidr_set {
type ipv4_addr
counter
elements = { 10.64.4.15 counter packets 0 bytes 0, 10.64.37.13 counter packets 0 bytes 0,
10.64.37.18 counter packets 0 bytes 0, 91.198.174.192 counter packets 0 bytes 0,
91.198.174.208 counter packets 0 bytes 0, 103.102.166.224 counter packets 0 bytes 0,
103.102.166.240 counter packets 0 bytes 0, 198.35.26.96 counter packets 0 bytes 0,
198.35.26.112 counter packets 0 bytes 0, 208.80.153.15 counter packets 0 bytes 0,
208.80.153.42 counter packets 0 bytes 0, 208.80.153.59 counter packets 0 bytes 0,
208.80.153.75 counter packets 0 bytes 0, 208.80.153.78 counter packets 2108 bytes 231555,
208.80.153.107 counter packets 0 bytes 0, 208.80.153.116 counter packets 0 bytes 0,
208.80.153.118 counter packets 0 bytes 0, 208.80.153.224 counter packets 0 bytes 0,
208.80.153.240 counter packets 0 bytes 0, 208.80.153.252 counter packets 0 bytes 0,
208.80.154.15 counter packets 0 bytes 0, 208.80.154.23 counter packets 0 bytes 0,
208.80.154.24 counter packets 0 bytes 0, 208.80.154.30 counter packets 0 bytes 0,
208.80.154.85 counter packets 0 bytes 0, 208.80.154.132 counter packets 0 bytes 0,
208.80.154.137 counter packets 0 bytes 0, 208.80.154.143 counter packets 0 bytes 0,
208.80.154.224 counter packets 0 bytes 0, 208.80.154.240 counter packets 0 bytes 0,
208.80.154.252 counter packets 0 bytes 0, 208.80.155.119 counter packets 0 bytes 0,
208.80.155.125 counter packets 0 bytes 0, 208.80.155.126 counter packets 0 bytes 0 }
}
chain prerouting {
type nat hook prerouting priority dstnat; policy accept;
}
chain postrouting {
type nat hook postrouting priority srcnat; policy accept;
oifname != "eno2.2120" counter packets 629 bytes 42929 accept
ip saddr != 172.16.128.0/24 counter packets 536 bytes 58248 accept
ip daddr @dmz_cidr_set counter packets 2108 bytes 231555 accept comment "dmz_cidr"
counter packets 12 bytes 720 snat ip to 185.15.57.1 comment "routing_source_ip"
}
chain forward {
type filter hook forward priority filter; policy drop;
iifname "vrf-cloudgw" oifname { "eno2.2120", "eno2.2107" } counter packets 6994 bytes 1171911 accept
counter packets 0 bytes 0 comment "counter dropped packets"
}
}
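If it helps with debugging, we could also trace the broken reply packets with
nftables; something like this is what we have in mind (the table/chain names
and the hook priority are just a sketch):
=== 8< ===
nft add table inet traceme
nft add chain inet traceme pre '{ type filter hook prerouting priority -350; }'
nft add rule inet traceme pre ip daddr 185.15.57.1 meta nftrace set 1
nft monitor trace
=== 8< ===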