On Mon, May 16, 2022 at 10:10:31PM +0300, Vlad Buslov wrote: > To improve hardware offload debuggability and scalability introduce > 'nf_flowtable_count_hw' and 'nf_flowtable_max_hw' sysctl entries in new > dedicated 'net/netfilter/ft' namespace. Add new pernet struct nf_ft_net in > order to store the counter and sysctl header of new sysctl table. > > Count the offloaded flows in workqueue add task handler. Verify that > offloaded flow total is lower than allowed maximum before calling the > driver callbacks. To prevent spamming the 'add' workqueue with tasks when > flows can't be offloaded anymore also check that count is below limit > before queuing offload work. This doesn't prevent all redundant workqueue > task since counter can be taken by concurrent work handler after the check > had been performed but before the offload job is executed but it still > greatly reduces such occurrences. Note that flows that were not offloaded > due to counter being larger than the cap can still be offloaded via refresh > function. > > Ensure that flows are accounted correctly by verifying IPS_HW_OFFLOAD_BIT > value before counting them. This ensures that add/refresh code path > increments the counter exactly once per flow when setting the bit and > decrements it only for accounted flows when deleting the flow with the bit > set. > > Signed-off-by: Vlad Buslov <vladbu@xxxxxxxxxx> > Signed-off-by: Oz Shlomo <ozsh@xxxxxxxxxx> > --- > > Notes: > Changes V1 -> V2: > > - Combine patches that expose offloaded flow count and limit total > offloaded flows. > > - Rework the counting logic to count in workqueue context. > > - Create dedicated namespace for flow table sysctls. > > .../networking/nf_flowtable-sysctl.rst | 17 ++++ > include/net/netfilter/nf_flow_table.h | 25 ++++++ > net/netfilter/Makefile | 3 +- > net/netfilter/nf_flow_table_core.c | 55 +++++++++++- > net/netfilter/nf_flow_table_offload.c | 36 ++++++-- > net/netfilter/nf_flow_table_sysctl.c | 83 +++++++++++++++++++ > 6 files changed, 210 insertions(+), 9 deletions(-) > create mode 100644 Documentation/networking/nf_flowtable-sysctl.rst > create mode 100644 net/netfilter/nf_flow_table_sysctl.c > > diff --git a/Documentation/networking/nf_flowtable-sysctl.rst b/Documentation/networking/nf_flowtable-sysctl.rst > new file mode 100644 > index 000000000000..932b4abeca6c > --- /dev/null > +++ b/Documentation/networking/nf_flowtable-sysctl.rst > @@ -0,0 +1,17 @@ > +.. SPDX-License-Identifier: GPL-2.0 > + > +=================================== > +Netfilter Flowtable Sysfs variables > +=================================== Better document the sysctl entries in the existing nf_conntrack.txt file rather than this new file? > + > +/proc/sys/net/netfilter/ft/nf_flowtable_* Variables: > +================================================= > + > +nf_flowtable_count_hw - INTEGER (read-only) > + Number of flowtable entries that are currently offloaded to hardware. > + > +nf_flowtable_max_hw - INTEGER > + - 0 - disabled (default) > + - not 0 - enabled > + > + Cap on maximum total amount of flowtable entries offloaded to hardware. > diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h > index 64daafd1fc41..e09c29fa034e 100644 > --- a/include/net/netfilter/nf_flow_table.h > +++ b/include/net/netfilter/nf_flow_table.h > @@ -335,4 +335,29 @@ static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) > return 0; > } > > +struct nf_ft_net { > + atomic_t count_hw; > +#ifdef CONFIG_SYSCTL > + struct ctl_table_header *sysctl_header; > +#endif > +}; > + > +extern unsigned int nf_ft_hw_max; > +extern unsigned int nf_ft_net_id; > + > +#include <net/netns/generic.h> > + > +static inline struct nf_ft_net *nf_ft_pernet(const struct net *net) > +{ > + return net_generic(net, nf_ft_net_id); > +} > + > +static inline struct nf_ft_net *nf_ft_pernet_get(struct nf_flowtable *flow_table) > +{ > + return nf_ft_pernet(read_pnet(&flow_table->net)); > +} > + > +int nf_flow_table_init_sysctl(struct net *net); > +void nf_flow_table_fini_sysctl(struct net *net); > + > #endif /* _NF_FLOW_TABLE_H */ > diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile > index 238b6a620e88..67e281842b61 100644 > --- a/net/netfilter/Makefile > +++ b/net/netfilter/Makefile > @@ -127,7 +127,8 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o > # flow table infrastructure > obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o > nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \ > - nf_flow_table_offload.o > + nf_flow_table_offload.o \ > + nf_flow_table_sysctl.o > > obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o > > diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c > index 3db256da919b..e2598f98017c 100644 > --- a/net/netfilter/nf_flow_table_core.c > +++ b/net/netfilter/nf_flow_table_core.c > @@ -277,6 +277,13 @@ static const struct rhashtable_params nf_flow_offload_rhash_params = { > .automatic_shrinking = true, > }; > > +static bool flow_max_hw_entries(struct nf_flowtable *flow_table) > +{ > + struct nf_ft_net *fnet = nf_ft_pernet_get(flow_table); > + > + return nf_ft_hw_max && atomic_read(&fnet->count_hw) >= nf_ft_hw_max; > +} > + > unsigned long flow_offload_get_timeout(struct flow_offload *flow) > { > unsigned long timeout = NF_FLOW_TIMEOUT; > @@ -320,7 +327,8 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) > > nf_ct_offload_timeout(flow->ct); > > - if (nf_flowtable_hw_offload(flow_table)) { > + if (nf_flowtable_hw_offload(flow_table) && > + !flow_max_hw_entries(flow_table)) { > __set_bit(NF_FLOW_HW, &flow->flags); > nf_flow_offload_add(flow_table, flow); > } > @@ -338,9 +346,11 @@ void flow_offload_refresh(struct nf_flowtable *flow_table, > if (READ_ONCE(flow->timeout) != timeout) > WRITE_ONCE(flow->timeout, timeout); > > - if (likely(!nf_flowtable_hw_offload(flow_table))) > + if (likely(!nf_flowtable_hw_offload(flow_table) || > + flow_max_hw_entries(flow_table))) > return; > > + set_bit(NF_FLOW_HW, &flow->flags); > nf_flow_offload_add(flow_table, flow); > } > EXPORT_SYMBOL_GPL(flow_offload_refresh); > @@ -652,14 +662,53 @@ void nf_flow_table_free(struct nf_flowtable *flow_table) > } > EXPORT_SYMBOL_GPL(nf_flow_table_free); > > +static int nf_flow_table_pernet_init(struct net *net) > +{ > + return nf_flow_table_init_sysctl(net); > +} > + > +static void nf_flow_table_pernet_exit(struct list_head *net_exit_list) > +{ > + struct net *net; > + > + list_for_each_entry(net, net_exit_list, exit_list) > + nf_flow_table_fini_sysctl(net); > +} > + > +unsigned int nf_ft_net_id __read_mostly; > + > +static struct pernet_operations nf_flow_table_net_ops = { > + .init = nf_flow_table_pernet_init, > + .exit_batch = nf_flow_table_pernet_exit, > + .id = &nf_ft_net_id, > + .size = sizeof(struct nf_ft_net), > +}; > + > static int __init nf_flow_table_module_init(void) > { > - return nf_flow_table_offload_init(); > + int ret; > + > + nf_ft_hw_max = 0; > + > + ret = register_pernet_subsys(&nf_flow_table_net_ops); > + if (ret < 0) > + return ret; > + > + ret = nf_flow_table_offload_init(); > + if (ret) > + goto out_offload; > + > + return 0; > + > +out_offload: > + unregister_pernet_subsys(&nf_flow_table_net_ops); > + return ret; > } > > static void __exit nf_flow_table_module_exit(void) > { > nf_flow_table_offload_exit(); > + unregister_pernet_subsys(&nf_flow_table_net_ops); > } > > module_init(nf_flow_table_module_init); > diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c > index 11b6e1942092..a6e763901eb9 100644 > --- a/net/netfilter/nf_flow_table_offload.c > +++ b/net/netfilter/nf_flow_table_offload.c > @@ -903,30 +903,56 @@ static int flow_offload_rule_add(struct flow_offload_work *offload, > return 0; > } > > +static bool flow_get_max_hw_entries(struct nf_flowtable *flow_table) > +{ > + struct nf_ft_net *fnet = nf_ft_pernet_get(flow_table); > + int count_hw = atomic_inc_return(&fnet->count_hw); > + > + if (nf_ft_hw_max && count_hw > nf_ft_hw_max) { > + atomic_dec(&fnet->count_hw); > + return false; > + } > + return true; > +} > + > static void flow_offload_work_add(struct flow_offload_work *offload) > { > + struct nf_ft_net *fnet = nf_ft_pernet_get(offload->flowtable); > struct nf_flow_rule *flow_rule[FLOW_OFFLOAD_DIR_MAX]; > int err; > > + if (!flow_get_max_hw_entries(offload->flowtable)) > + return; > + > err = nf_flow_offload_alloc(offload, flow_rule); > if (err < 0) > - return; > + goto out_alloc; > > err = flow_offload_rule_add(offload, flow_rule); > if (err < 0) > - goto out; > + goto out_add; > > - set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); > + if (test_and_set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status)) > + atomic_dec(&fnet->count_hw); > + nf_flow_offload_destroy(flow_rule); > + return; > > -out: > +out_add: > nf_flow_offload_destroy(flow_rule); > +out_alloc: > + atomic_dec(&fnet->count_hw); > } > > static void flow_offload_work_del(struct flow_offload_work *offload) > { > - clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); > + bool offloaded = test_and_clear_bit(IPS_HW_OFFLOAD_BIT, > + &offload->flow->ct->status); > + struct nf_ft_net *fnet = nf_ft_pernet_get(offload->flowtable); > + > flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL); > flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); > + if (offloaded) > + atomic_dec(&fnet->count_hw); > set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags); > } > > diff --git a/net/netfilter/nf_flow_table_sysctl.c b/net/netfilter/nf_flow_table_sysctl.c > new file mode 100644 > index 000000000000..ce8c0a2c4bdb > --- /dev/null > +++ b/net/netfilter/nf_flow_table_sysctl.c > @@ -0,0 +1,83 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +#include <linux/kernel.h> > +#include <net/netfilter/nf_flow_table.h> > + > +unsigned int nf_ft_hw_max __read_mostly; You can move this to nf_flow_table_core_offload.c for the Makefile idea. > +#ifdef CONFIG_SYSCTL If you follow the Kconfig+Makefile approach, then this #ifdef can likely go away as the entire file would be on/off thing. > +enum nf_ct_sysctl_index { > + NF_SYSCTL_FLOW_TABLE_MAX_HW, > + NF_SYSCTL_FLOW_TABLE_COUNT_HW, > + > + __NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL, > +}; > + > +#define NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL \ > + (__NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL + 1) > + > +static struct ctl_table nf_flow_table_sysctl_table[] = { > + [NF_SYSCTL_FLOW_TABLE_MAX_HW] = { > + .procname = "nf_flowtable_max_hw", > + .data = &nf_ft_hw_max, > + .maxlen = sizeof(int), > + .mode = 0644, > + .proc_handler = proc_dointvec, > + }, > + [NF_SYSCTL_FLOW_TABLE_COUNT_HW] = { > + .procname = "nf_flowtable_count_hw", > + .maxlen = sizeof(int), > + .mode = 0444, > + .proc_handler = proc_dointvec, > + }, > + {} > +}; > + > +int nf_flow_table_init_sysctl(struct net *net) > +{ > + struct nf_ft_net *fnet = nf_ft_pernet(net); > + struct ctl_table *table; > + > + BUILD_BUG_ON(ARRAY_SIZE(nf_flow_table_sysctl_table) != NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL); > + > + table = kmemdup(nf_flow_table_sysctl_table, sizeof(nf_flow_table_sysctl_table), > + GFP_KERNEL); > + if (!table) > + return -ENOMEM; > + > + table[NF_SYSCTL_FLOW_TABLE_COUNT_HW].data = &fnet->count_hw; > + > + /* Don't allow non-init_net ns to alter global sysctls */ > + if (!net_eq(&init_net, net)) > + table[NF_SYSCTL_FLOW_TABLE_MAX_HW].mode = 0444; > + > + fnet->sysctl_header = register_net_sysctl(net, "net/netfilter/ft", table); > + if (!fnet->sysctl_header) > + goto out_unregister_netfilter; > + > + return 0; > + > +out_unregister_netfilter: > + kfree(table); > + return -ENOMEM; > +} > + > +void nf_flow_table_fini_sysctl(struct net *net) > +{ > + struct nf_ft_net *fnet = nf_ft_pernet(net); > + struct ctl_table *table; > + > + table = fnet->sysctl_header->ctl_table_arg; > + unregister_net_sysctl_table(fnet->sysctl_header); > + kfree(table); > +} > + > +#else > +int nf_flow_table_init_sysctl(struct net *net) > +{ > + return 0; > +} > + > +void nf_flow_table_fini_sysctl(struct net *net) > +{ > +} > +#endif /* CONFIG_SYSCTL */ > -- > 2.31.1 >