Hi Pablo, On Sun, Apr 24, 2022 at 11:56:13PM +0200, Pablo Neira Ayuso wrote: > Starting Linux kernel 5.18-rc, operations on registers that already > contain the expected data are turned into noop. > > Track operation on registers to use the same register through > payload_get_register() and meta_get_register(). This patch introduces an > LRU eviction strategy when all the registers are in used. > > get_register() is used to allocate a register as scratchpad area: no > tracking is performed in this case. This is used for concatenations, > eg. ebt_among. > > Using samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh > > Benchmark #1: match on IPv6 address list > > *raw > :PREROUTING DROP [9:2781] > :OUTPUT ACCEPT [0:0] > -A PREROUTING -d aaaa::bbbb -j DROP > [... 98 times same rule above to trigger mismatch ...] > -A PREROUTING -d fd00::1 -j DROP # matching rule > > iptables-legacy 798Mb > iptables-nft 801Mb (+0.37) > > Benchmark #2: match on layer 4 protocol + port list > > *raw > :PREROUTING DROP [9:2781] > :OUTPUT ACCEPT [0:0] > -A PREROUTING -p tcp --dport 23 -j DROP > [... 98 times same rule above to trigger mismatch ...] > -A PREROUTING -p udp --dport 9 -j DROP # matching rule > > iptables-legacy 648Mb > iptables-nft 892Mb (+37.65%) > > Benchmark #3: match on mark > > *raw > :PREROUTING DROP [9:2781] > :OUTPUT ACCEPT [0:0] > -A PREROUTING -m mark --mark 100 -j DROP > [... 98 times same rule above to trigger mismatch ...] > -A PREROUTING -d 198.18.0.42/32 -j DROP # matching rule > > iptables-legacy 255Mb > iptables-nft 865Mb (+239.21%) Great results, but obviously biased test cases. Did you measure a more "realistic" ruleset? > In these cases, iptables-nft generates netlink bytecode which uses the > native expressions, ie. payload + cmp and meta + cmp. Sounds like a real point for further conversion into native nftables expressions where possible. [...] > diff --git a/iptables/nft-regs.c b/iptables/nft-regs.c > new file mode 100644 > index 000000000000..bfc762e4186f > --- /dev/null > +++ b/iptables/nft-regs.c > @@ -0,0 +1,191 @@ > +/* > + * (C) 2012-2022 by Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + */ > + > +/* Funded through the NGI0 PET Fund established by NLnet (https://nlnet.nl) > + * with support from the European Commission's Next Generation Internet > + * programme. > + */ > + > +#include <string.h> > +#include <stdio.h> > +#include <stdlib.h> > +#include <stdbool.h> > +#include <errno.h> > +#include <assert.h> > + > +#include "nft.h" > +#include "nft-regs.h" > + > +static uint64_t reg_genid(struct nft_handle *h) > +{ > + return ++h->reg_genid; > +} > + > +struct nft_reg_ctx { > + uint64_t genid; > + int reg; > + int evict; > +}; Move this struct declaration above the first function? > + > +static int reg_space(int i) > +{ > + return sizeof(uint32_t) * 16 - sizeof(uint32_t) * i; > +} > + > +static void register_track(const struct nft_handle *h, > + struct nft_reg_ctx *ctx, int i, int len) > +{ > + if (ctx->reg >= 0 || h->regs[i].word || reg_space(i) < len) > + return; Since ctx->reg is not reset in callers' loops and reg_space(i) is monotonic, maybe make those loop exit conditions by returning false from register_track() in those cases? > + if (h->regs[i].type == NFT_REG_UNSPEC) { > + ctx->genid = h->reg_genid; Is ctx->genid used in this case? > + ctx->reg = i; > + } else if (h->regs[i].genid < ctx->genid) { > + ctx->genid = h->regs[i].genid; > + ctx->evict = i; What if the oldest reg is too small? > + } else if (h->regs[i].len == len) { > + ctx->evict = i; > + ctx->genid = 0; Why prefer regs of same size over older ones? > + } > +} > + > +static void register_evict(struct nft_reg_ctx *ctx, int i) Unused parameter i. > +{ > + if (ctx->reg < 0) { > + assert(ctx->evict >= 0); > + ctx->reg = ctx->evict; > + } > +} > + > +static void __register_update(struct nft_handle *h, uint8_t reg, > + int type, uint32_t len, uint8_t word, > + uint64_t genid) > +{ > + h->regs[reg].type = type; > + h->regs[reg].genid = genid; > + h->regs[reg].len = len; > + h->regs[reg].word = word; > +} > + > +static void register_cancel(struct nft_handle *h, struct nft_reg_ctx *ctx) > +{ > + int plen = h->regs[ctx->reg].len, i; > + > + for (i = ctx->reg; plen > 0; i++, plen -= sizeof(uint32_t)) { > + h->regs[i].type = NFT_REG_UNSPEC; > + h->regs[i].word = 0; > + } > + > + while (h->regs[i].word != 0) { > + h->regs[i].type = NFT_REG_UNSPEC; > + h->regs[i].word = 0; > + i++; > + } > +} > + > +static void register_update(struct nft_handle *h, struct nft_reg_ctx *ctx, > + int type, uint32_t len, uint64_t genid) > +{ > + register_cancel(h, ctx); > + __register_update(h, ctx->reg, type, len, 0, genid); > +} > + > +uint8_t meta_get_register(struct nft_handle *h, enum nft_meta_keys key) Accept a uint32_t len parameter and replace all the sizeof(uint32_t) below with it? Not needed but consistent with payload_get_register() and less hard-coded values. > +{ > + struct nft_reg_ctx ctx = { > + .reg = -1, > + .evict = -1, > + .genid = UINT64_MAX, > + }; > + uint64_t genid; > + int i; > + > + for (i = 0; i < 16; i++) { > + register_track(h, &ctx, i, sizeof(uint32_t)); > + > + if (h->regs[i].type != NFT_REG_META) > + continue; > + > + if (h->regs[i].meta.key == key && > + h->regs[i].len == sizeof(uint32_t)) { > + h->regs[i].genid = reg_genid(h); > + return i + NFT_REG32_00; > + } > + } > + > + register_evict(&ctx, i); > + > + genid = reg_genid(h); > + register_update(h, &ctx, NFT_REG_META, sizeof(uint32_t), genid); > + h->regs[ctx.reg].meta.key = key; > + > + return ctx.reg + NFT_REG32_00; > +} > + > +uint8_t payload_get_register(struct nft_handle *h, enum nft_payload_bases base, > + uint32_t offset, uint32_t len) > +{ > + struct nft_reg_ctx ctx = { > + .reg = -1, > + .evict = -1, > + .genid = UINT64_MAX, > + }; > + int i, j, plen = len; > + uint64_t genid; > + > + for (i = 0; i < 16; i++) { > + register_track(h, &ctx, i, len); > + > + if (h->regs[i].type != NFT_REG_PAYLOAD) > + continue; > + > + if (h->regs[i].payload.base == base && > + h->regs[i].payload.offset == offset && > + h->regs[i].len >= plen) { > + h->regs[i].genid = reg_genid(h); > + return i + NFT_REG32_00; > + } > + } > + > + register_evict(&ctx, i); > + > + genid = reg_genid(h); > + register_update(h, &ctx, NFT_REG_PAYLOAD, len, genid); > + h->regs[ctx.reg].payload.base = base; > + h->regs[ctx.reg].payload.offset = offset; > + > + plen -= sizeof(uint32_t); > + j = 1; > + for (i = ctx.reg + 1; plen > 0; i++, plen -= sizeof(uint32_t)) { > + __register_update(h, i, NFT_REG_PAYLOAD, len, j++, genid); > + h->regs[i].payload.base = base; > + h->regs[i].payload.offset = offset; > + } > + > + return ctx.reg + NFT_REG32_00; > +} > + > +uint8_t get_register(struct nft_handle *h, uint8_t len) > +{ > + struct nft_reg_ctx ctx = { > + .reg = -1, > + .evict = -1, > + .genid = UINT64_MAX, > + }; > + int i; > + > + for (i = 0; i < 16; i++) > + register_track(h, &ctx, i, len); > + > + register_evict(&ctx, i); > + register_cancel(h, &ctx); > + > + return ctx.reg + NFT_REG32_00; > +} > diff --git a/iptables/nft-regs.h b/iptables/nft-regs.h > new file mode 100644 > index 000000000000..3953eae9f217 > --- /dev/null > +++ b/iptables/nft-regs.h > @@ -0,0 +1,9 @@ > +#ifndef _NFT_REGS_H_ > +#define _NFT_REGS_H_ > + > +uint8_t payload_get_register(struct nft_handle *h, enum nft_payload_bases base, > + uint32_t offset, uint32_t len); > +uint8_t meta_get_register(struct nft_handle *h, enum nft_meta_keys key); > +uint8_t get_register(struct nft_handle *h, uint8_t len); > + > +#endif /* _NFT_REGS_H_ */ > diff --git a/iptables/nft-shared.c b/iptables/nft-shared.c > index 27e95c1ae4f3..ad5361942093 100644 > --- a/iptables/nft-shared.c > +++ b/iptables/nft-shared.c > @@ -32,6 +32,7 @@ > > #include "nft-shared.h" > #include "nft-bridge.h" > +#include "nft-regs.h" > #include "xshared.h" > #include "nft.h" > > @@ -50,7 +51,7 @@ void add_meta(struct nft_handle *h, struct nftnl_rule *r, uint32_t key, > if (expr == NULL) > return; > > - reg = NFT_REG_1; > + reg = meta_get_register(h, key); > nftnl_expr_set_u32(expr, NFTNL_EXPR_META_KEY, key); > nftnl_expr_set_u32(expr, NFTNL_EXPR_META_DREG, reg); > nftnl_rule_add_expr(r, expr); > @@ -68,7 +69,7 @@ void add_payload(struct nft_handle *h, struct nftnl_rule *r, > if (expr == NULL) > return; > > - reg = NFT_REG_1; > + reg = payload_get_register(h, base, offset, len); > nftnl_expr_set_u32(expr, NFTNL_EXPR_PAYLOAD_BASE, base); > nftnl_expr_set_u32(expr, NFTNL_EXPR_PAYLOAD_DREG, reg); > nftnl_expr_set_u32(expr, NFTNL_EXPR_PAYLOAD_OFFSET, offset); > @@ -89,7 +90,7 @@ void add_bitwise_u16(struct nft_handle *h, struct nftnl_rule *r, > if (expr == NULL) > return; > > - reg = NFT_REG_1; > + reg = get_register(h, sizeof(uint32_t)); > nftnl_expr_set_u32(expr, NFTNL_EXPR_BITWISE_SREG, sreg); > nftnl_expr_set_u32(expr, NFTNL_EXPR_BITWISE_DREG, reg); > nftnl_expr_set_u32(expr, NFTNL_EXPR_BITWISE_LEN, sizeof(uint16_t)); > @@ -105,12 +106,13 @@ void add_bitwise(struct nft_handle *h, struct nftnl_rule *r, > { > struct nftnl_expr *expr; > uint32_t xor[4] = { 0 }; > - uint8_t reg = *dreg; > + uint8_t reg; > > expr = nftnl_expr_alloc("bitwise"); > if (expr == NULL) > return; > > + reg = get_register(h, len); > nftnl_expr_set_u32(expr, NFTNL_EXPR_BITWISE_SREG, sreg); > nftnl_expr_set_u32(expr, NFTNL_EXPR_BITWISE_DREG, reg); > nftnl_expr_set_u32(expr, NFTNL_EXPR_BITWISE_LEN, len); > diff --git a/iptables/nft.c b/iptables/nft.c > index 07653ee1a3d6..48330f285703 100644 > --- a/iptables/nft.c > +++ b/iptables/nft.c > @@ -56,6 +56,7 @@ > #include <arpa/inet.h> > > #include "nft.h" > +#include "nft-regs.h" > #include "xshared.h" /* proto_to_name */ > #include "nft-cache.h" > #include "nft-shared.h" > @@ -1112,7 +1113,7 @@ gen_payload(struct nft_handle *h, uint32_t base, uint32_t offset, uint32_t len, > struct nftnl_expr *e; > uint8_t reg; > > - reg = NFT_REG_1; > + reg = payload_get_register(h, base, offset, len); > e = __gen_payload(base, offset, len, reg); > *dreg = reg; > > @@ -1157,10 +1158,10 @@ static int __add_nft_among(struct nft_handle *h, const char *table, > offsetof(struct iphdr, saddr), > offsetof(struct iphdr, daddr) > }; > + uint8_t reg, concat_len; > struct nftnl_expr *e; > struct nftnl_set *s; > uint32_t flags = 0; > - uint8_t reg; > int idx = 0; > > if (ip) { > @@ -1201,21 +1202,26 @@ static int __add_nft_among(struct nft_handle *h, const char *table, > nftnl_set_elem_add(s, elem); > } > > - e = gen_payload(h, NFT_PAYLOAD_LL_HEADER, > - eth_addr_off[dst], ETH_ALEN, ®); > + concat_len = ETH_ALEN; > + if (ip) > + concat_len += sizeof(struct in_addr); > + > + reg = get_register(h, concat_len); > + e = __gen_payload(NFT_PAYLOAD_LL_HEADER, > + eth_addr_off[dst], ETH_ALEN, reg); > if (!e) > return -ENOMEM; > nftnl_rule_add_expr(r, e); > > if (ip) { > - e = gen_payload(h, NFT_PAYLOAD_NETWORK_HEADER, ip_addr_off[dst], > - sizeof(struct in_addr), ®); > + e = __gen_payload(NFT_PAYLOAD_NETWORK_HEADER, ip_addr_off[dst], > + sizeof(struct in_addr), reg + 2); With a respective macro, this could be 'reg + REG_ALIGN(ETH_ALEN)'. > if (!e) > return -ENOMEM; > nftnl_rule_add_expr(r, e); > } > > - reg = NFT_REG_1; > + reg = get_register(h, sizeof(uint32_t)); > e = gen_lookup(reg, "__set%d", set_id, inv); > if (!e) > return -ENOMEM; > diff --git a/iptables/nft.h b/iptables/nft.h > index 68b0910c8e18..3dc907b188ce 100644 > --- a/iptables/nft.h > +++ b/iptables/nft.h > @@ -85,6 +85,28 @@ struct nft_cache_req { > struct list_head chain_list; > }; > > +enum nft_reg_type { > + NFT_REG_UNSPEC = 0, > + NFT_REG_PAYLOAD, > + NFT_REG_META, > +}; > + > +struct nft_regs { > + enum nft_reg_type type; > + uint32_t len; > + uint64_t genid; > + uint8_t word; > + union { > + struct { > + enum nft_meta_keys key; > + } meta; > + struct { > + enum nft_payload_bases base; > + uint32_t offset; > + } payload; > + }; > +}; > + > struct nft_handle { > int family; > struct mnl_socket *nl; > @@ -111,6 +133,9 @@ struct nft_handle { > bool cache_init; > int verbose; > > + struct nft_regs regs[20]; Why 20? Ain't 16 enough? > + uint64_t reg_genid; > + > /* meta data, for error reporting */ > struct { > unsigned int lineno; Thanks, Phil