Hello! I am going via https://github.com/xdp-project/xdp-tutorial and after 3rd lesson trying to create a simple program for searching TCP timestamp option and incrementing it by one. However, after two dozen tries eBPF verifier still doesn't accept my code. I was digging into verifier sources and found that access ("r=123") propagates to all registers with the same "id=" after comparison with packet_end (usually data_end variable in xdp-tutorial). So I learned to place such checks as late as possible to actual access, but this still did not help. But carefully and boringly going via disassembler and verifier output, I've found that clang optimizer ignores just checked register, and calculates from another, raising verifier's complain. Moreover, it generates not optimal code sometimes! In my (currently) last try, this looks like: ; if (optlen != TCPOPT_TS_LEN || changed) { 240: (79) r0 = *(u64 *)(r10 -16) ; R0_w=P0 R10=fp0 fp-16=00000000 241: (4f) r9 |= r0 ; R0_w=P0 R9_w=0 242: (57) r9 &= 1 ; R9_w=0 243: (b7) r0 = 309 ; R0_w=309 244: (55) if r9 != 0x0 goto pc+150 ; R9_w=0 245: (b7) r0 = 300 ; R0_w=300 ; ACCESS_OPT_PTR_SHORT(chgptr, old_ofs); 246: (25) if r7 > 0x22 goto pc+148 ; R7=scalar(id=163,umin=2,umax=34,var_off=(0x0; 0x3f)) ; ACCESS_OPT_PTR_SHORT(chgptr, old_ofs); 247: (bf) r6 = r1 ; R1=pkt(id=11,off=18,r=40,umax=60,var_off=(0x0; 0x3c)) R6_w=pkt(id=11,off=18,r=40,umax=60,var_off=(0x0; 0x3c)) 248: (0f) r6 += r7 ; R6_w=pkt(id=167,off=18,r=0,umin=2,umax=94,var_off=(0x0; 0x7f),s32_min=0,s32_max=127,u32_max=127) R7=scalar(id=163,umin=2,umax=34,var_off=(0x0; 0x3f)) 249: (bf) r9 = r6 ; R6_w=pkt(id=167,off=18,r=0,umin=2,umax=94,var_off=(0x0; 0x7f),s32_min=0,s32_max=127,u32_max=127) R9_w=pkt(id=167,off=18,r=0,umin=2,umax=94,var_off=(0x0; 0x7f),s32_min=0,s32_max=127,u32_max=127) 250: (07) r9 += 26 ; R9_w=pkt(id=167,off=44,r=0,umin=2,umax=94,var_off=(0x0; 0x7f),s32_min=0,s32_max=127,u32_max=127) 251: (b7) r0 = 300 ; R0=300 ; ACCESS_OPT_PTR_SHORT(chgptr, old_ofs); 252: (2d) if r9 > r2 goto pc+142 ; R2=pkt_end(off=0,imm=0) R9=pkt(id=167,off=44,r=44,umin=2,umax=94,var_off=(0x0; 0x7f),s32_min=0,s32_max=127,u32_max=127) 253: (b7) r0 = 300 ; R0_w=300 ; ACCESS_OPT_PTR(chgptr, cur+5); 254: (2d) if r9 > r2 goto pc+140 ; R2=pkt_end(off=0,imm=0) R9=pkt(id=167,off=44,r=44,umin=2,umax=94,var_off=(0x0; 0x7f),s32_min=0,s32_max=127,u32_max=127) ; old = bpf_ntohs(*(unsigned short *)chgptr); 255: (bf) r0 = r1 ; R0_w=pkt(id=11,off=18,r=40,umax=60,var_off=(0x0; 0x3c)) R1=pkt(id=11,off=18,r=40,umax=60,var_off=(0x0; 0x3c)) 256: (0f) r0 += r7 ; R0_w=pkt(id=168,off=18,r=0,umin=2,umax=94,var_off=(0x0; 0x7f),s32_min=0,s32_max=127,u32_max=127) R7=scalar(id=163,umin=2,umax=34,var_off=(0x0; 0x3f)) 257: (69) r9 = *(u16 *)(r0 +24) invalid access to packet, off=42 size=2, R0(id=168,off=42,r=0) R0 offset is outside of the packet processed 2565 insns (limit 1000000) max_states_per_insn 9 total_states 92 peak_states 73 mark_read 12 -- END PROG LOAD LOG -- and in disassembler (`llvm-objdump -lS nuc_ts_prog_kern.o`), with my comments added: 0000000000000780 <LBB0_64>: ; LBB0_64(): //r1=tcph r2=end r4=size r5=40 r6=size-cur r7=cur r8=10 r9=8 ; /home/vadimgon/xdp-tutorial/nuc_ts_prog_kern.c:181 ; if (optlen != TCPOPT_TS_LEN || changed) { 240: 79 a0 f0 ff 00 00 00 00 r0 = *(u64 *)(r10 - 16) //changed 241: 4f 09 00 00 00 00 00 00 r9 |= r0 242: 57 09 00 00 01 00 00 00 r9 &= 1 243: b7 00 00 00 35 01 00 00 r0 = 309 244: 55 09 96 00 00 00 00 00 if r9 != 0 goto +150 <LBB0_103> 245: b7 00 00 00 2c 01 00 00 r0 = 300 ; /home/vadimgon/xdp-tutorial/nuc_ts_prog_kern.c:188 ; ACCESS_OPT_PTR_SHORT(chgptr, old_ofs); 246: 25 07 94 00 22 00 00 00 if r7 > 34 goto +148 <LBB0_103> 247: bf 16 00 00 00 00 00 00 r6 = r1 248: 0f 76 00 00 00 00 00 00 r6 += r7 249: bf 69 00 00 00 00 00 00 r9 = r6 250: 07 09 00 00 1a 00 00 00 r9 += 26 251: b7 00 00 00 2c 01 00 00 r0 = 300 252: 2d 29 8e 00 00 00 00 00 if r9 > r2 goto +142 <LBB0_103> 253: b7 00 00 00 2c 01 00 00 r0 = 300 ; /home/vadimgon/xdp-tutorial/nuc_ts_prog_kern.c:190 ; ACCESS_OPT_PTR(chgptr, cur+5); 254: 2d 29 8c 00 00 00 00 00 if r9 > r2 goto +140 <LBB0_103> ; /home/vadimgon/xdp-tutorial/nuc_ts_prog_kern.c:189 ; old = bpf_ntohs(*(unsigned short *)chgptr); 255: bf 10 00 00 00 00 00 00 r0 = r1 256: 0f 70 00 00 00 00 00 00 r0 += r7 // <== HERE 257: 69 09 18 00 00 00 00 00 r9 = *(u16 *)(r0 + 24) ; /home/vadimgon/xdp-tutorial/nuc_ts_prog_kern.c:191 ; *((__u8*)chgptr) += 1; 258: bf 90 00 00 00 00 00 00 r0 = r9 259: 77 00 00 00 08 00 00 00 r0 >>= 8 260: 07 00 00 00 01 00 00 00 r0 += 1 261: 73 06 19 00 00 00 00 00 *(u8 *)(r6 + 25) = r0 ; /home/vadimgon/xdp-tutorial/nuc_ts_prog_kern.c:154 ; while (cur < size && cur < 40) { 262: 0f 78 00 00 00 00 00 00 r8 += r7 As you can see, it checks `r9` even *twice*, and `r9` gets desired "r=44", but then discards r9 and uses r0 with no such checks from optimiser. I tried to patch this manually: * first, doing llc -march=bpf -filetype=asm -o nuc_ts_prog_kern.s nuc_ts_prog_kern.ll * then, I patch eBPF assemby in just one byte: --- nuc_ts_prog_kern.s~ 2024-07-05 21:59:36.754767691 +0300 +++ nuc_ts_prog_kern.s 2024-07-05 21:59:44.138807845 +0300 @@ -1395,7 +1395,7 @@ .Ltmp412: r0 = r1 r0 += r7 - r9 = *(u16 *)(r0 + 24) + r9 = *(u16 *)(r6 + 24) .loc 0 191 21 # nuc_ts_prog_kern.c:191:21 .Ltmp413: r0 = r9 * and getting actual object file: llvm-mc -triple bpf -filetype=obj -o nuc_ts_prog_kern.o nuc_ts_prog_kern.s then after this eBPF verifier accepts and loads program into kernel! So, the questions are: 1) could this be considered a clang bug? 2) even if "yes" to first question, what could be done to trick clang into emitting more "correct" (from verifier's POV) code? I am new to this list, so don't know the proper way to post multiple files (versions of code) here, are attaches allowed? So for the first post let me try just copypaste the last version of code into message (hope Thunderbird will not smash it). === How to reproduce: This is Ubuntu 22.04, kernel 6.5.0-35, clang 14. Do git clone --recurse-submodules https://github.com/xdp-project/xdp-tutorial.git cd xdp-tutorial and initial lib setup by their README, then put .h and .c file directly into this directory. Compile with: clang -S -target bpf -D __BPF_TRACING__ -Wno-visibility -DBPF_DIR_MNT="/sys/fs/bpf" -DBPF_OBJECT_PATH="/usr/local/lib/bpf" -DDEBUG -I/usr/include/x86_64-linux-gnu -I./lib/install/include -Wall -Wno-unused-value -Wno-pointer-sign -Wno-compare-distinct-pointer-types -Werror -O2 -emit-llvm -c -g -o nuc_ts_prog_kern.ll nuc_ts_prog_kern.c && llc -march=bpf -filetype=obj -o nuc_ts_prog_kern.o nuc_ts_prog_kern.ll The sources: $ cat nuc_ts_common_kern_user.h /* Common for timestamp incrementing BPF kernel prog and it's userland prog */ #ifndef __NUC_TS_COMMON_KERN_USER_H #define __NUC_TS_COMMON_KERN_USER_H enum stat_types { STAT_VERIFIERHAPPY = 0, STAT_ETHSHORT, STAT_VLANSHORT, STAT_NONIP, STAT_IPTOTSHORT, STAT_IPHDRSHORT, STAT_IP6EXTHDRSHORT, STAT_NONTCP, STAT_TCPSHORT, STAT_TCPINVALID, STAT_TCPOPTOVERFLOW, STAT_NONTS, STAT_TS_INCREMENTED, STAT_MAX, //keep last }; /* keep in sync with above*/ static const char *stat_entry_names[STAT_MAX] = { [STAT_VERIFIERHAPPY] = "BPFVERIFIER", [STAT_ETHSHORT] = "ETHSHORT", [STAT_VLANSHORT] = "VLANSHORT", [STAT_NONIP] = "NONIP", [STAT_IPTOTSHORT] = "IPTOTSHORT", [STAT_IPHDRSHORT] = "IPHDRSHORT", [STAT_IP6EXTHDRSHORT] = "IP6EXTHDRSHORT", [STAT_NONTCP] = "NONTCP", [STAT_TCPSHORT] = "TCPSHORT", [STAT_TCPINVALID] = "TCPINVALID", [STAT_TCPOPTOVERFLOW] = "TCPOPTOVERFLOW", [STAT_NONTS] = "NONTS", [STAT_TS_INCREMENTED] = "TS_INCREMENTED", }; #endif $ cat nuc_ts_prog_kern.c #include <linux/bpf.h> #include <linux/in.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> #include "common/parsing_helpers.h" #include "nuc_ts_common_kern_user.h" #define TCPOPT_TS_KIND 8 /* Timestamp option has Kind=8 */ #define TCPOPT_TS_LEN 10 /* and length 10, per RFC 1323 */ /* map to count errors */ struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); __type(key, __u32); __type(value, __u64); __uint(max_entries, STAT_MAX); } xdp_errstats_map SEC(".maps"); char _license[] SEC("license") = "GPL"; static void rec2stat(__u32 error) { long *value; value = bpf_map_lookup_elem(&xdp_errstats_map, &error); if (value) *value += 1; } static int parse_ipv4(void **nh_pos, void *data_end, __u64 *l4_off) { struct iphdr *iph = *nh_pos; if (iph + 1 > data_end) { rec2stat(STAT_IPHDRSHORT); return 0; } if (*nh_pos + (0x0fff & 1 + bpf_ntohs(iph->tot_len)) > data_end) { rec2stat(STAT_IPTOTSHORT); return 0; } if (iph->ihl < 5) { rec2stat(STAT_IPHDRSHORT); return 0; } *l4_off = 20 + 4*(iph->ihl - 5); if (*nh_pos + *l4_off > data_end) { rec2stat(STAT_IPHDRSHORT); return 0; } return iph->protocol; } static int parse_ipv6(void **nh_pos, void *data_end, __u64 *l4_off) { struct ipv6hdr *ip6h = *nh_pos; if (ip6h + 1 > data_end) { rec2stat(STAT_IPHDRSHORT); return 0; } if (*nh_pos + (0xefff & 1 + bpf_ntohs(ip6h->payload_len)) > data_end) { rec2stat(STAT_IPTOTSHORT); return 0; } *l4_off = sizeof(struct ipv6hdr); __u8 next_hdr_type = ip6h->nexthdr; struct ipv6_opt_hdr *hdr = *nh_pos + *l4_off; for (int i = 0; i < 6; ++i) { if (hdr + 1 > data_end) { rec2stat(STAT_IP6EXTHDRSHORT); return 0; } switch (next_hdr_type) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_MH: *l4_off += (hdr->hdrlen + 1) * 8; next_hdr_type = hdr->nexthdr; break; case IPPROTO_AH: *l4_off += (hdr->hdrlen + 2) * 4; next_hdr_type = hdr->nexthdr; break; case IPPROTO_FRAGMENT: *l4_off += 8; next_hdr_type = hdr->nexthdr; break; default: /* Found a header that is not an IPv6 extension header */ return next_hdr_type; } } return next_hdr_type; } /* -1 format err, 0 untouched, 1 modified */ static int process_tcp(void **nh_pos, __u64 l4_off, void *data_end) { __u64 size, old_ofs, changed = 0, open, cur = 0; __u64 optkind, optlen, err = STAT_VERIFIERHAPPY; *nh_pos += l4_off & 0xff; struct tcphdr *tcph = *nh_pos; if (tcph + 1 > data_end) { err = STAT_TCPSHORT; goto err_past_end; } size = 4*tcph->doff; *nh_pos += 20; if ((*nh_pos + size > data_end)) { err = STAT_TCPSHORT; goto err_past_end; } if (size < 20) { err = STAT_TCPINVALID; goto err_past_end; } size -= 20;; if (size == 0) /* no options - pass as-is */ return 0; __u32 sum; unsigned short old; void *chgptr; /* make verifier happy */ #define ACCESS_OPT_PTR(setptr, offs) do { \ if ((offs) > 39) goto err_past_end; \ (setptr) = (void*)(*nh_pos + (offs)); \ if ((setptr) + 1 > data_end) \ goto err_past_end; \ } while(0) #define ACCESS_OPT_PTR_SHORT(setptr, offs) do { \ if ((offs) > 38) goto err_past_end; \ (setptr) = (void*)(*nh_pos + (offs)); \ if ((setptr) + 2 > data_end) \ goto err_past_end; \ } while(0) #define ACCESS_OPT_BYTE(setvar, offs) do { \ void *tmp; \ ACCESS_OPT_PTR(tmp, offs); \ (setvar) = *( (__u8*)tmp ); \ } while(0) while (cur < size && cur < 40) { open = 261; /* just marker in disasm */ ACCESS_OPT_BYTE(optkind, cur); /* Single-byte options */ if (optkind == 0) /* End of Options */ break; if (optkind == 1) { /* NOP */ cur++; continue; } /* Kind and length bytes options */ if (cur+1 >= size) { err = STAT_TCPOPTOVERFLOW; goto err_past_end; } ACCESS_OPT_BYTE(optlen, cur+1); if (optlen < 2) { err = STAT_TCPINVALID; goto err_past_end; } if (optlen > size - cur) goto unclosed; if (optkind == TCPOPT_TS_KIND) { if (optlen != TCPOPT_TS_LEN || changed) { err = STAT_TCPINVALID; goto err_past_end; } /* increment TSval */ old_ofs = cur+4; // FIXME word align from tcp start ACCESS_OPT_PTR_SHORT(chgptr, old_ofs); old = bpf_ntohs(*(unsigned short *)chgptr); ACCESS_OPT_PTR(chgptr, cur+5); *((__u8*)chgptr) += 1; changed = 1; } /* Skip to next option */ cur += optlen; open = 0; } if (open) goto unclosed; /* Update checksum incrementally per RFC 1141 */ if (changed) { ACCESS_OPT_PTR_SHORT(chgptr, old_ofs); sum = old + (~bpf_ntohs(*(unsigned short *)chgptr) & 0xffff); sum += bpf_ntohs(tcph->check); sum = (sum & 0xffff) + (sum>>16); tcph->check = bpf_htons(sum + (sum>>16)); return 1; } return 0; unclosed: err = STAT_TCPOPTOVERFLOW; err_past_end: rec2stat(err); return -1; } SEC("xdp") int xdp_tcp_timestamp_incr(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; void *data = (void *)(long)ctx->data; struct ethhdr *eth = data; int rc = XDP_DROP; __u16 h_proto; __u64 nh_off; __u32 ipproto; __u64 l4_off; void *nh_pos; nh_off = sizeof(*eth); if (data + nh_off > data_end) { rec2stat(STAT_ETHSHORT); return rc; } h_proto = eth->h_proto; /* check VLAN tag; could be repeated to support double-tagged VLAN */ if (h_proto == bpf_htons(ETH_P_8021Q) || h_proto == bpf_htons(ETH_P_8021AD)) { struct vlan_hdr *vhdr; vhdr = data + nh_off; nh_off += sizeof(struct vlan_hdr); if (data + nh_off > data_end) { rec2stat(STAT_VLANSHORT); return rc; } h_proto = vhdr->h_vlan_encapsulated_proto; } nh_pos = data + nh_off; if (h_proto == bpf_htons(ETH_P_IP)) ipproto = parse_ipv4(&nh_pos, data_end, &l4_off); else if (h_proto == bpf_htons(ETH_P_IPV6)) ipproto = parse_ipv6(&nh_pos, data_end, &l4_off); else { rec2stat(STAT_NONIP); return rc; } if (ipproto != IPPROTO_TCP) { /* pass non-TCP untouched */ rec2stat(STAT_NONTCP); rc = XDP_PASS; } else { int done = process_tcp(&nh_pos, l4_off, data_end); if (done == -1) { rc = XDP_DROP; } else if (done == 0) { rec2stat(STAT_NONTS); rc = XDP_PASS; } else { rec2stat(STAT_TS_INCREMENTED); rc = XDP_PASS; } } return rc; } -- WBR, tg://nuclight