Re: Most optimal method to dump UDP conntrack entries

Antonio Ojea <antonio.ojea.garcia@xxxxxxxxx> · Tue, 12 Nov 2024 07:41:34 -0700

On Tue, 12 Nov 2024 at 02:20, Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx> wrote:
>
> On Tue, Nov 12, 2024 at 10:16:45AM +0100, Pablo Neira Ayuso wrote:
> > I guess the concern is that assured flows cannot be expelled from the
> > conntrack table via early_drop, that is why an expedite cleanup is
> > important?
>
> Actually, the issue is that packets could end up in a backend which
> does not exist after re-configuration, therefore, removing the entry
> need to happen so ongoing flow have a chance to talk to another
> (different) backend.

Please take a look at the attached kselftest, which emulates the
problematic behavior in Kubernetes.

I think that for UDP the NAT rule should take precedence over the
conntrack entry, in contrast to TCP, where it is important to
preserve the session once it has been established.

I only did a quick test, and it seems to fail with Florian's patch as well.

diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile
index ffe161fac8b5..9ba5610f3ebc 100644
--- a/tools/testing/selftests/net/netfilter/Makefile
+++ b/tools/testing/selftests/net/netfilter/Makefile
@@ -12,6 +12,7 @@ TEST_PROGS += conntrack_dump_flush.sh
 TEST_PROGS += conntrack_icmp_related.sh
 TEST_PROGS += conntrack_ipip_mtu.sh
 TEST_PROGS += conntrack_tcp_unreplied.sh
+TEST_PROGS += conntrack_udp_expires.sh
 TEST_PROGS += conntrack_sctp_collision.sh
 TEST_PROGS += conntrack_vrf.sh
 TEST_PROGS += conntrack_reverse_clash.sh
diff --git a/tools/testing/selftests/net/netfilter/conntrack_udp_expires.sh b/tools/testing/selftests/net/netfilter/conntrack_udp_expires.sh
new file mode 100755
index 000000000000..928f221ae739
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/conntrack_udp_expires.sh
@@ -0,0 +1,249 @@
+#!/bin/bash
+#
+# This tests conntrack on the following scenario:
+#
+#                         +------------+
+# +-------+               |  nsrouter  |                  +-------+
+# |ns1    |.99          .1|            |.1             .99|    ns2|
+# |   eth0|---------------|veth0  veth1|------------------|eth0   |
+# |       |  10.0.1.0/24  |            |   10.0.2.0/24    |       |
+# +-------+  dead:1::/64  |    veth2   |   dead:2::/64    +-------+
+#                         +------------+
+#                                |.1
+#                                |
+#                                |
+#                                |                        +-------+
+#                                |                     .99|    ns3|
+#                                +------------------------|eth0   |
+#                                       10.0.3.0/24       |       |
+#                                       dead:3::/64       +-------+
+#
+# nsrouter implements load balancing using DNAT with a virtual IP
+# (10.0.4.10 for IPv4, dead:4::a for IPv6).
+# shellcheck disable=SC2162,SC2317
+
+source lib.sh
+ret=0
+# UDP is slow
+timeout=15
+
+cleanup()
+{
+	local pid_ns
+	# kill whatever is still running inside each test namespace
+	for pid_ns in "$ns1" "$ns2" "$ns3" "$nsrouter"; do
+		ip netns pids "$pid_ns" | xargs kill 2>/dev/null
+	done
+	cleanup_all_ns
+}
+
+checktool "nft --version" "test without nft tool"
+checktool "socat -h" "run test without socat"
+
+trap cleanup EXIT
+setup_ns ns1 ns2 ns3 nsrouter
+
+if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then
+    echo "SKIP: No virtual ethernet pair device support in kernel"
+    exit $ksft_skip
+fi
+ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2"
+ip link add veth2 netns "$nsrouter" type veth peer name eth0 netns "$ns3"
+
+ip -net "$nsrouter" link set veth0 up
+ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0
+ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad
+
+ip -net "$nsrouter" link set veth1 up
+ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1
+ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad
+
+ip -net "$nsrouter" link set veth2 up
+ip -net "$nsrouter" addr add 10.0.3.1/24 dev veth2
+ip -net "$nsrouter" addr add dead:3::1/64 dev veth2 nodad
+
+ip -net "$ns1" link set eth0 up
+ip -net "$ns2" link set eth0 up
+ip -net "$ns3" link set eth0 up
+
+ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
+ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad
+ip -net "$ns1" route add default via 10.0.1.1
+ip -net "$ns1" route add default via dead:1::1
+
+ip -net "$ns2" addr add 10.0.2.99/24 dev eth0
+ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad
+ip -net "$ns2" route add default via 10.0.2.1
+ip -net "$ns2" route add default via dead:2::1
+
+ip -net "$ns3" addr add 10.0.3.99/24 dev eth0
+ip -net "$ns3" addr add dead:3::99/64 dev eth0 nodad
+ip -net "$ns3" route add default via 10.0.3.1
+ip -net "$ns3" route add default via dead:3::1
+
+ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth2.forwarding=1 > /dev/null
+
+test_ping() {
+  if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then
+	return 1
+  fi
+
+  if ! ip netns exec "$ns1" ping -c 1 -q dead:2::99 > /dev/null; then
+	return 2
+  fi
+
+  if ! ip netns exec "$ns1" ping -c 1 -q 10.0.3.99 > /dev/null; then
+	return 1
+  fi
+
+  if ! ip netns exec "$ns1" ping -c 1 -q dead:3::99 > /dev/null; then
+	return 2
+  fi
+
+  return 0
+}
+
+test_ping_router() {
+  if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.1 > /dev/null; then
+	return 3
+  fi
+
+  if ! ip netns exec "$ns1" ping -c 1 -q dead:2::1 > /dev/null; then
+	return 4
+  fi
+
+  return 0
+}
+
+
+listener_ready()
+{
+	local ns="$1"
+	local port="$2"
+	local proto="$3"
+	ss -N "$ns" -ln "$proto" -o "sport = :$port" | grep -q "$port"
+}
+
+test_conntrack_udp_expires()
+{
+	local ip_proto="$1"
+	# derived variables
+	local testname="test_${ip_proto}_udp_forward"
+	local socat_ipproto
+	local vip
+	local ns2_ip
+	local ns3_ip
+	local ns2_ip_port
+	local ns3_ip_port
+
+	# socat 1.8.0 has a bug that requires to specify the IP family to bind (fixed in 1.8.0.1)
+	case $ip_proto in
+	"ip")
+		socat_ipproto="-4"
+		vip=10.0.4.10
+		ns2_ip=10.0.2.99
+		ns3_ip=10.0.3.99
+		vip_ip_port="$vip:8080"
+		ns2_ip_port="$ns2_ip:8080"
+		ns3_ip_port="$ns3_ip:8080"
+	;;
+	"ip6")
+		socat_ipproto="-6"
+		vip=dead:4::a
+		ns2_ip=dead:2::99
+		ns3_ip=dead:3::99
+		vip_ip_port="[$vip]:8080"
+		ns2_ip_port="[$ns2_ip]:8080"
+		ns3_ip_port="[$ns3_ip]:8080"
+	;;
+	*)
+	echo "FAIL: unsupported protocol"
+	exit 255
+	;;
+	esac
+
+	ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
+flush ruleset
+table inet nat {
+	chain divert {
+		type nat hook prerouting priority 0; policy accept;
+		$ip_proto daddr $vip udp dport 8080 dnat to $ns2_ip_port
+	}
+}
+EOF
+
+	timeout "$timeout" ip netns exec "$ns2" socat "$socat_ipproto" udp-listen:8080,fork SYSTEM:"echo PONG_NS2" 2>/dev/null &
+	local server2_pid=$!
+
+	timeout "$timeout" ip netns exec "$ns3" socat "$socat_ipproto" udp-listen:8080,fork SYSTEM:"echo PONG_NS3" 2>/dev/null &
+	local server3_pid=$!
+
+	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" 8080 "-u"
+	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns3" 8080 "-u"
+
+	local result
+	# request from ns1 to ns2 (direct traffic)
+	result=$(echo PING | ip netns exec "$ns1" socat -t 2 -T 2 STDIO udp:"$ns2_ip_port",sourceport=18888)
+	if [ "$result" == "PONG_NS2" ] ;then
+		echo "PASS: conntrack udp test $testname: ns1 got reply \"$result\" connecting to ns2"
+	else
+		echo "ERROR: conntrack udp test $testname: ns1 got reply \"$result\" connecting to ns2, not \"PONG_NS2\" as intended"
+		ret=1
+	fi
+
+	# request from ns1 to ns3 (direct traffic)
+	result=$(echo PING | ip netns exec "$ns1" socat -t 2 -T 2 STDIO udp:"$ns3_ip_port",sourceport=18888)
+	if [ "$result" = "PONG_NS3" ] ;then
+		echo "PASS: conntrack udp test $testname: ns1 got reply \"$result\" connecting to ns3"
+	else
+		echo "ERROR: conntrack udp test $testname: ns1 got reply \"$result\" connecting to ns3, not \"PONG_NS3\" as intended"
+		ret=1
+	fi
+
+	# request from ns1 to vip (DNAT to ns2)
+	result=$(echo PING | ip netns exec "$ns1" socat -t 2 -T 2 STDIO udp:"$vip_ip_port",sourceport=18888)
+	if [ "$result" = "PONG_NS2" ] ;then
+		echo "PASS: conntrack udp test $testname: ns1 got reply \"$result\" connecting to vip (ns2)"
+	else
+		echo "ERROR: conntrack udp test $testname: ns1 got reply \"$result\" connecting to vip, not \"PONG_NS2\" as intended"
+		ret=1
+	fi
+
+	# replace the DNAT rule to direct and replace ns2 destination with ns3
+		ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
+flush ruleset
+table inet nat {
+	chain divert {
+		type nat hook prerouting priority 0; policy accept;
+		$ip_proto daddr $vip udp dport 8080 dnat to $ns3_ip_port
+	}
+}
+EOF
+	# request from ns1 to vip (DNAT to ns3)
+	# reuse the same port to validate the existing conntrack entry does not
+	# shadow the actual nftables rule.
+	result=$(echo PING | ip netns exec "$ns1" socat -t 2 -T 2 STDIO udp:"$vip_ip_port",sourceport=18888)
+	if [ "$result" = "PONG_NS3" ] ;then
+		echo "PASS: conntrack udp test $testname: ns1 got reply \"$result\" connecting to vip (ns3)"
+	else
+		echo "ERROR: conntrack udp test $testname: ns1 got reply \"$result\" connecting to vip, not \"PONG_NS3\" as intended"
+		ret=1
+	fi
+}
+
+
+if test_ping; then
+	echo "PASS: ${ns1} can reach ${ns2}"
+else
+	ret=$?	# test_ping's status (ret itself is still 0 here)
+	echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2
+	exit $ret
+fi
+
+# run the scenario once per address family; failures accumulate in ret
+test_conntrack_udp_expires "ip"
+test_conntrack_udp_expires "ip6"
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/conntrack_vrf.sh b/tools/testing/selftests/net/netfilter/conntrack_vrf.sh
index e95ecb37c2b1..77413cc11124 100755
--- a/tools/testing/selftests/net/netfilter/conntrack_vrf.sh
+++ b/tools/testing/selftests/net/netfilter/conntrack_vrf.sh
@@ -1,253 +1,249 @@
 #!/bin/bash
-
-# This script demonstrates interaction of conntrack and vrf.
-# The vrf driver calls the netfilter hooks again, with oif/iif
-# pointing at the VRF device.
-#
-# For ingress, this means first iteration has iifname of lower/real
-# device.  In this script, thats veth0.
-# Second iteration is iifname set to vrf device, tvrf in this script.
-#
-# For egress, this is reversed: first iteration has the vrf device,
-# second iteration is done with the lower/real/veth0 device.
-#
-# test_ct_zone_in demonstrates unexpected change of nftables
-# behavior # caused by commit 09e856d54bda5f28 "vrf: Reset skb conntrack
-# connection on VRF rcv"
 #
-# It was possible to assign conntrack zone to a packet (or mark it for
-# `notracking`) in the prerouting chain before conntrack, based on real iif.
+# This tests conntrack on the following scenario:
 #
-# After the change, the zone assignment is lost and the zone is assigned based
-# on the VRF master interface (in case such a rule exists).
-# assignment is lost. Instead, assignment based on the `iif` matching
-# Thus it is impossible to distinguish packets based on the original
-# interface.
+#                         +------------+
+# +-------+               |  nsrouter  |                  +-------+
+# |ns1    |.99          .1|            |.1             .99|    ns2|
+# |   eth0|---------------|veth0  veth1|------------------|eth0   |
+# |       |  10.0.1.0/24  |            |   10.0.2.0/24    |       |
+# +-------+  dead:1::/64  |    veth2   |   dead:2::/64    +-------+
+#                         +------------+
+#                                |.1
+#                                |
+#                                |
+#                                |                        +-------+
+#                                |                     .99|    ns3|
+#                                +------------------------|eth0   |
+#                                       10.0.3.0/24       |       |
+#                                       dead:3::/64       +-------+
 #
-# test_masquerade_vrf and test_masquerade_veth0 demonstrate the problem
-# that was supposed to be fixed by the commit mentioned above to make sure
-# that any fix to test case 1 won't break masquerade again.
+# nsrouters implement loadbalancing using DNAT with a virtual IP
+# 10.0.4.10 - dead:4::a
+# shellcheck disable=SC2162,SC2317
 
 source lib.sh
-
-IP0=172.30.30.1
-IP1=172.30.30.2
-DUMMYNET=10.9.9
-PFXL=30
 ret=0
+# UDP is slow
+timeout=15
 
 cleanup()
 {
-	ip netns pids $ns0 | xargs kill 2>/dev/null
-	ip netns pids $ns1 | xargs kill 2>/dev/null
+	ip netns pids "$ns1" | xargs kill 2>/dev/null
+	ip netns pids "$ns2" | xargs kill 2>/dev/null
+	ip netns pids "$ns3" | xargs kill 2>/dev/null
+	ip netns pids "$nsrouter" | xargs kill 2>/dev/null
 
 	cleanup_all_ns
 }
 
-checktool "nft --version" "run test without nft"
-checktool "conntrack --version" "run test without conntrack"
+checktool "nft --version" "test without nft tool"
 checktool "socat -h" "run test without socat"
 
 trap cleanup EXIT
+setup_ns ns1 ns2 ns3 nsrouter
 
-setup_ns ns0 ns1
-
-ip netns exec "$ns0" sysctl -q -w net.ipv4.conf.default.rp_filter=0
-ip netns exec "$ns0" sysctl -q -w net.ipv4.conf.all.rp_filter=0
-ip netns exec "$ns0" sysctl -q -w net.ipv4.conf.all.rp_filter=0
-ip netns exec "$ns0" sysctl -q -w net.ipv4.conf.all.forwarding=1
-
-if ! ip link add veth0 netns "$ns0" type veth peer name veth0 netns "$ns1" > /dev/null 2>&1; then
-	echo "SKIP: Could not add veth device"
-	exit $ksft_skip
+if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then
+    echo "SKIP: No virtual ethernet pair device support in kernel"
+    exit $ksft_skip
 fi
+ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2"
+ip link add veth2 netns "$nsrouter" type veth peer name eth0 netns "$ns3"
+
+ip -net "$nsrouter" link set veth0 up
+ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0
+ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad
+
+ip -net "$nsrouter" link set veth1 up
+ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1
+ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad
+
+ip -net "$nsrouter" link set veth2 up
+ip -net "$nsrouter" addr add 10.0.3.1/24 dev veth2
+ip -net "$nsrouter" addr add dead:3::1/64 dev veth2 nodad
+
+ip -net "$ns1" link set eth0 up
+ip -net "$ns2" link set eth0 up
+ip -net "$ns3" link set eth0 up
+
+ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
+ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad
+ip -net "$ns1" route add default via 10.0.1.1
+ip -net "$ns1" route add default via dead:1::1
+
+ip -net "$ns2" addr add 10.0.2.99/24 dev eth0
+ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad
+ip -net "$ns2" route add default via 10.0.2.1
+ip -net "$ns2" route add default via dead:2::1
+
+ip -net "$ns3" addr add 10.0.3.99/24 dev eth0
+ip -net "$ns3" addr add dead:3::99/64 dev eth0 nodad
+ip -net "$ns3" route add default via 10.0.3.1
+ip -net "$ns3" route add default via dead:3::1
+
+ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth2.forwarding=1 > /dev/null
+
+test_ping() {
+  if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then
+	return 1
+  fi
+
+  if ! ip netns exec "$ns1" ping -c 1 -q dead:2::99 > /dev/null; then
+	return 2
+  fi
+
+  if ! ip netns exec "$ns1" ping -c 1 -q 10.0.3.99 > /dev/null; then
+	return 1
+  fi
+
+  if ! ip netns exec "$ns1" ping -c 1 -q dead:3::99 > /dev/null; then
+	return 2
+  fi
+
+  return 0
+}
 
-if ! ip -net "$ns0" li add tvrf type vrf table 9876; then
-	echo "SKIP: Could not add vrf device"
-	exit $ksft_skip
-fi
+test_ping_router() {
+  if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.1 > /dev/null; then
+	return 3
+  fi
 
-ip -net "$ns0" link add dummy0 type dummy
+  if ! ip netns exec "$ns1" ping -c 1 -q dead:2::1 > /dev/null; then
+	return 4
+  fi
 
-ip -net "$ns0" li set veth0 master tvrf
-ip -net "$ns0" li set dummy0 master tvrf
-ip -net "$ns0" li set tvrf up
-ip -net "$ns0" li set veth0 up
-ip -net "$ns0" li set dummy0 up
-ip -net "$ns1" li set veth0 up
+  return 0
+}
 
-ip -net "$ns0" addr add $IP0/$PFXL dev veth0
-ip -net "$ns1" addr add $IP1/$PFXL dev veth0
-ip -net "$ns0" addr add $DUMMYNET.1/$PFXL dev dummy0
 
 listener_ready()
 {
-        local ns="$1"
-
-        ss -N "$ns" -l -n -t -o "sport = :55555" | grep -q "55555"
+	local ns="$1"
+	local port="$2"
+	local proto="$3"
+	ss -N "$ns" -ln "$proto" -o "sport = :$port" | grep -q "$port"
 }
 
-ip netns exec "$ns1" socat -u -4 TCP-LISTEN:55555,reuseaddr,fork STDOUT > /dev/null &
-busywait $BUSYWAIT_TIMEOUT listener_ready "$ns1"
-
-# test vrf ingress handling.
-# The incoming connection should be placed in conntrack zone 1,
-# as decided by the first iteration of the ruleset.
-test_ct_zone_in()
+test_conntrack_udp_expires()
 {
-ip netns exec "$ns0" nft -f - <<EOF
-table testct {
-	chain rawpre {
-		type filter hook prerouting priority raw;
-
-		iif { veth0, tvrf } counter meta nftrace set 1
-		iif veth0 counter ct zone set 1 counter return
-		iif tvrf counter ct zone set 2 counter return
-		ip protocol icmp counter
-		notrack counter
-	}
-
-	chain rawout {
-		type filter hook output priority raw;
-
-		oif veth0 counter ct zone set 1 counter return
-		oif tvrf counter ct zone set 2 counter return
-		notrack counter
+	local ip_proto="$1"
+	# derived variables
+	local testname="test_${ip_proto}_udp_forward"
+	local socat_ipproto
+	local vip
+	local ns2_ip
+	local ns3_ip
+	local ns2_ip_port
+	local ns3_ip_port
+
+	# socat 1.8.0 has a bug that requires to specify the IP family to bind (fixed in 1.8.0.1)
+	case $ip_proto in
+	"ip")
+		socat_ipproto="-4"
+		vip=10.0.4.10
+		ns2_ip=10.0.2.99
+		ns3_ip=10.0.3.99
+		vip_ip_port="$vip:8080"
+		ns2_ip_port="$ns2_ip:8080"
+		ns3_ip_port="$ns3_ip:8080"
+	;;
+	"ip6")
+		socat_ipproto="-6"
+		vip=dead:4::a
+		ns2_ip=dead:2::99
+		ns3_ip=dead:3::99
+		vip_ip_port="[$vip]:8080"
+		ns2_ip_port="[$ns2_ip]:8080"
+		ns3_ip_port="[$ns3_ip]:8080"
+	;;
+	*)
+	echo "FAIL: unsupported protocol"
+	exit 255
+	;;
+	esac
+
+	ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
+flush ruleset
+table inet filter {
+	chain divert {
+		type filter hook prerouting priority 0; policy accept;
+		$ip_proto daddr $vip udp dport 8080 dnat to $ns2_ip
 	}
 }
 EOF
-	ip netns exec "$ns1" ping -W 1 -c 1 -I veth0 "$IP0" > /dev/null
-
-	# should be in zone 1, not zone 2
-	count=$(ip netns exec "$ns0" conntrack -L -s $IP1 -d $IP0 -p icmp --zone 1 2>/dev/null | wc -l)
-	if [ "$count" -eq 1 ]; then
-		echo "PASS: entry found in conntrack zone 1"
-	else
-		echo "FAIL: entry not found in conntrack zone 1"
-		count=$(ip netns exec "$ns0" conntrack -L -s $IP1 -d $IP0 -p icmp --zone 2 2> /dev/null | wc -l)
-		if [ "$count" -eq 1 ]; then
-			echo "FAIL: entry found in zone 2 instead"
-		else
-			echo "FAIL: entry not in zone 1 or 2, dumping table"
-			ip netns exec "$ns0" conntrack -L
-			ip netns exec "$ns0" nft list ruleset
-		fi
-	fi
-}
-
-# add masq rule that gets evaluated w. outif set to vrf device.
-# This tests the first iteration of the packet through conntrack,
-# oifname is the vrf device.
-test_masquerade_vrf()
-{
-	local qdisc=$1
 
-	if [ "$qdisc" != "default" ]; then
-		tc -net "$ns0" qdisc add dev tvrf root "$qdisc"
-	fi
+	timeout "$timeout" ip netns exec "$ns2" socat "$socat_ipproto" udp-listen:8080,fork SYSTEM:"echo PONG_NS2" 2>/dev/null &
+	local server2_pid=$!
 
-	ip netns exec "$ns0" conntrack -F 2>/dev/null
+	timeout "$timeout" ip netns exec "$ns3" socat "$socat_ipproto" udp-listen:8080,fork SYSTEM:"echo PONG_NS3" 2>/dev/null &
+	local server3_pid=$!
 
-ip netns exec "$ns0" nft -f - <<EOF
-flush ruleset
-table ip nat {
-	chain rawout {
-		type filter hook output priority raw;
+	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" 8080 "-u"
+	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns3" 8080 "-u"
 
-		oif tvrf ct state untracked counter
-	}
-	chain postrouting2 {
-		type filter hook postrouting priority mangle;
-
-		oif tvrf ct state untracked counter
-	}
-	chain postrouting {
-		type nat hook postrouting priority 0;
-		# NB: masquerade should always be combined with 'oif(name) bla',
-		# lack of this is intentional here, we want to exercise double-snat.
-		ip saddr 172.30.30.0/30 counter masquerade random
-	}
-}
-EOF
-	if ! ip netns exec "$ns0" ip vrf exec tvrf socat -u -4 STDIN TCP:"$IP1":55555 < /dev/null > /dev/null;then
-		echo "FAIL: connect failure with masquerade + sport rewrite on vrf device"
-		ret=1
-		return
-	fi
-
-	# must also check that nat table was evaluated on second (lower device) iteration.
-	if ip netns exec "$ns0" nft list table ip nat |grep -q 'counter packets 1' &&
-	   ip netns exec "$ns0" nft list table ip nat |grep -q 'untracked counter packets [1-9]'; then
-		echo "PASS: connect with masquerade + sport rewrite on vrf device ($qdisc qdisc)"
+	local result
+	# request from ns1 to ns2 (direct traffic)
+	result=$(echo PING | ip netns exec "$ns1" socat -t 2 -T 2 STDIO udp:"$ns2_ip_port",sourceport=18888)
+	if [ "$result" == "PONG_NS2" ] ;then
+		echo "PASS: tproxy test $testname: ns1 got reply \"$result\" connecting to ns2"
 	else
-		echo "FAIL: vrf rules have unexpected counter value"
+		echo "ERROR: tproxy test $testname: ns1 got reply \"$result\" connecting to ns2, not \"${expect_ns1_ns2}\" as intended"
 		ret=1
 	fi
 
-	if [ "$qdisc" != "default" ]; then
-		tc -net "$ns0" qdisc del dev tvrf root
-	fi
-}
-
-# add masq rule that gets evaluated w. outif set to veth device.
-# This tests the 2nd iteration of the packet through conntrack,
-# oifname is the lower device (veth0 in this case).
-test_masquerade_veth()
-{
-	ip netns exec "$ns0" conntrack -F 2>/dev/null
-ip netns exec "$ns0" nft -f - <<EOF
-flush ruleset
-table ip nat {
-	chain postrouting {
-		type nat hook postrouting priority 0;
-		meta oif veth0 ip saddr 172.30.30.0/30 counter masquerade random
-	}
-}
-EOF
-	if ! ip netns exec "$ns0" ip vrf exec tvrf socat -u -4 STDIN TCP:"$IP1":55555 < /dev/null > /dev/null;then
-		echo "FAIL: connect failure with masquerade + sport rewrite on veth device"
+	# request from ns1 to ns3 (direct traffic)
+	result=$(echo PING | ip netns exec "$ns1" socat -t 2 -T 2 STDIO udp:"$ns3_ip_port")
+	if [ "$result" = "PONG_NS3" ] ;then
+		echo "PASS: tproxy test $testname: ns1 got reply \"$result\" connecting to ns3"
+	else
+		echo "ERROR: tproxy test $testname: ns1 got reply \"$result\" connecting to ns3, not \"$expect_ns1_ns3\" as intended"
 		ret=1
-		return
 	fi
 
-	# must also check that nat table was evaluated on second (lower device) iteration.
-	if ip netns exec "$ns0" nft list table ip nat |grep -q 'counter packets 1'; then
-		echo "PASS: connect with masquerade + sport rewrite on veth device"
+	# request from ns1 to vip (DNAT to ns2)
+	result=$(echo PING | ip netns exec "$ns1" socat -t 2 -T 2 STDIO udp:"$vip_ip_port")
+	if [ "$result" = "PONG_NS2" ] ;then
+		echo "PASS: tproxy test $testname: ns1 got reply \"$result\" connecting to ns3"
 	else
-		echo "FAIL: vrf masq rule has unexpected counter value"
+		echo "ERROR: tproxy test $testname: ns1 got reply \"$result\" connecting to ns3, not \"$expect_ns1_ns3\" as intended"
 		ret=1
 	fi
-}
 
-test_fib()
-{
-ip netns exec "$ns0" nft -f - <<EOF
+	# replace the DNAT rule to direct and replace ns2 destination with ns3
+		ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
 flush ruleset
-table ip t {
-	counter fibcount { }
-
-	chain prerouting {
-		type filter hook prerouting priority 0;
-		meta iifname veth0 ip daddr $DUMMYNET.2 fib daddr oif dummy0 counter name fibcount notrack
+table inet filter {
+	chain divert {
+		type filter hook prerouting priority 0; policy accept;
+		$ip_proto daddr $vip udp dport 8080 dnat to $ns3_ip
 	}
 }
 EOF
-	ip -net "$ns1" route add 10.9.9.0/24 via "$IP0" dev veth0
-	ip netns exec "$ns1" ping -q -w 1 -c 1 "$DUMMYNET".2 > /dev/null
-
-	if ip netns exec "$ns0" nft list counter t fibcount | grep -q "packets 1"; then
-		echo "PASS: fib lookup returned exepected output interface"
+	# request from ns1 to vip (DNAT to ns3)
+	# reuse the same port to validate the existing conntrack entry does not
+	# shadow the actual nftables rule.
+	result=$(echo PING | ip netns exec "$ns1" socat -t 2 -T 2 STDIO udp:"$vip_ip_port")
+	if [ "$result" = "PONG_NS3" ] ;then
+		echo "PASS: tproxy test $testname: ns1 got reply \"$result\" connecting to ns3"
 	else
-		echo "FAIL: fib lookup did not return exepected output interface"
+		echo "ERROR: tproxy test $testname: ns1 got reply \"$result\" connecting to ns3, not \"$expect_ns1_ns3\" as intended"
 		ret=1
-		return
 	fi
 }
 
-test_ct_zone_in
-test_masquerade_vrf "default"
-test_masquerade_vrf "pfifo"
-test_masquerade_veth
-test_fib
+
+if test_ping; then
+	# queue bypass works (rules were skipped, no listener)
+	echo "PASS: ${ns1} can reach ${ns2}"
+else
+	echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2
+	exit $ret
+fi
+
+test_conntrack_udp_expires "ip"
+test_conntrack_udp_expires "ip6"
 
 exit $ret