On Tue 17 May 2022 at 13:28, Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx> wrote: > On Mon, May 16, 2022 at 10:10:31PM +0300, Vlad Buslov wrote: >> To improve hardware offload debuggability and scalability introduce >> 'nf_flowtable_count_hw' and 'nf_flowtable_max_hw' sysctl entries in new >> dedicated 'net/netfilter/ft' namespace. Add new pernet struct nf_ft_net in >> order to store the counter and sysctl header of new sysctl table. >> >> Count the offloaded flows in workqueue add task handler. Verify that >> offloaded flow total is lower than allowed maximum before calling the >> driver callbacks. To prevent spamming the 'add' workqueue with tasks when >> flows can't be offloaded anymore also check that count is below limit >> before queuing offload work. This doesn't prevent all redundant workqueue >> task since counter can be taken by concurrent work handler after the check >> had been performed but before the offload job is executed but it still >> greatly reduces such occurrences. Note that flows that were not offloaded >> due to counter being larger than the cap can still be offloaded via refresh >> function. >> >> Ensure that flows are accounted correctly by verifying IPS_HW_OFFLOAD_BIT >> value before counting them. This ensures that add/refresh code path >> increments the counter exactly once per flow when setting the bit and >> decrements it only for accounted flows when deleting the flow with the bit >> set. >> >> Signed-off-by: Vlad Buslov <vladbu@xxxxxxxxxx> >> Signed-off-by: Oz Shlomo <ozsh@xxxxxxxxxx> >> --- >> >> Notes: >> Changes V1 -> V2: >> >> - Combine patches that expose offloaded flow count and limit total >> offloaded flows. >> >> - Rework the counting logic to count in workqueue context. >> >> - Create dedicated namespace for flow table sysctls. >> >> .../networking/nf_flowtable-sysctl.rst | 17 ++++ >> include/net/netfilter/nf_flow_table.h | 25 ++++++ >> net/netfilter/Makefile | 3 +- >> net/netfilter/nf_flow_table_core.c | 55 +++++++++++- >> net/netfilter/nf_flow_table_offload.c | 36 ++++++-- >> net/netfilter/nf_flow_table_sysctl.c | 83 +++++++++++++++++++ >> 6 files changed, 210 insertions(+), 9 deletions(-) >> create mode 100644 Documentation/networking/nf_flowtable-sysctl.rst >> create mode 100644 net/netfilter/nf_flow_table_sysctl.c >> >> diff --git a/Documentation/networking/nf_flowtable-sysctl.rst b/Documentation/networking/nf_flowtable-sysctl.rst >> new file mode 100644 >> index 000000000000..932b4abeca6c >> --- /dev/null >> +++ b/Documentation/networking/nf_flowtable-sysctl.rst >> @@ -0,0 +1,17 @@ >> +.. SPDX-License-Identifier: GPL-2.0 >> + >> +=================================== >> +Netfilter Flowtable Sysfs variables >> +=================================== > > Better document the sysctl entries in the existing nf_conntrack.txt > file rather than this new file? Sure, no problem. > >> + >> +/proc/sys/net/netfilter/ft/nf_flowtable_* Variables: >> +================================================= >> + >> +nf_flowtable_count_hw - INTEGER (read-only) >> + Number of flowtable entries that are currently offloaded to hardware. >> + >> +nf_flowtable_max_hw - INTEGER >> + - 0 - disabled (default) >> + - not 0 - enabled >> + >> + Cap on maximum total amount of flowtable entries offloaded to hardware. >> diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h >> index 64daafd1fc41..e09c29fa034e 100644 >> --- a/include/net/netfilter/nf_flow_table.h >> +++ b/include/net/netfilter/nf_flow_table.h >> @@ -335,4 +335,29 @@ static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) >> return 0; >> } >> >> +struct nf_ft_net { >> + atomic_t count_hw; >> +#ifdef CONFIG_SYSCTL >> + struct ctl_table_header *sysctl_header; >> +#endif >> +}; >> + >> +extern unsigned int nf_ft_hw_max; >> +extern unsigned int nf_ft_net_id; >> + >> +#include <net/netns/generic.h> >> + >> +static inline struct nf_ft_net *nf_ft_pernet(const struct net *net) >> +{ >> + return net_generic(net, nf_ft_net_id); >> +} >> + >> +static inline struct nf_ft_net *nf_ft_pernet_get(struct nf_flowtable *flow_table) >> +{ >> + return nf_ft_pernet(read_pnet(&flow_table->net)); >> +} >> + >> +int nf_flow_table_init_sysctl(struct net *net); >> +void nf_flow_table_fini_sysctl(struct net *net); >> + >> #endif /* _NF_FLOW_TABLE_H */ >> diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile >> index 238b6a620e88..67e281842b61 100644 >> --- a/net/netfilter/Makefile >> +++ b/net/netfilter/Makefile >> @@ -127,7 +127,8 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o >> # flow table infrastructure >> obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o >> nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \ >> - nf_flow_table_offload.o >> + nf_flow_table_offload.o \ >> + nf_flow_table_sysctl.o >> >> obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o >> >> diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c >> index 3db256da919b..e2598f98017c 100644 >> --- a/net/netfilter/nf_flow_table_core.c >> +++ b/net/netfilter/nf_flow_table_core.c >> @@ -277,6 +277,13 @@ static const struct rhashtable_params nf_flow_offload_rhash_params = { >> .automatic_shrinking = true, >> }; >> >> +static bool flow_max_hw_entries(struct nf_flowtable *flow_table) >> +{ >> + struct nf_ft_net *fnet = nf_ft_pernet_get(flow_table); >> + >> + return nf_ft_hw_max && atomic_read(&fnet->count_hw) >= nf_ft_hw_max; >> +} >> + >> unsigned long flow_offload_get_timeout(struct flow_offload *flow) >> { >> unsigned long timeout = NF_FLOW_TIMEOUT; >> @@ -320,7 +327,8 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) >> >> nf_ct_offload_timeout(flow->ct); >> >> - if (nf_flowtable_hw_offload(flow_table)) { >> + if (nf_flowtable_hw_offload(flow_table) && >> + !flow_max_hw_entries(flow_table)) { >> __set_bit(NF_FLOW_HW, &flow->flags); >> nf_flow_offload_add(flow_table, flow); >> } >> @@ -338,9 +346,11 @@ void flow_offload_refresh(struct nf_flowtable *flow_table, >> if (READ_ONCE(flow->timeout) != timeout) >> WRITE_ONCE(flow->timeout, timeout); >> >> - if (likely(!nf_flowtable_hw_offload(flow_table))) >> + if (likely(!nf_flowtable_hw_offload(flow_table) || >> + flow_max_hw_entries(flow_table))) >> return; >> >> + set_bit(NF_FLOW_HW, &flow->flags); >> nf_flow_offload_add(flow_table, flow); >> } >> EXPORT_SYMBOL_GPL(flow_offload_refresh); >> @@ -652,14 +662,53 @@ void nf_flow_table_free(struct nf_flowtable *flow_table) >> } >> EXPORT_SYMBOL_GPL(nf_flow_table_free); >> >> +static int nf_flow_table_pernet_init(struct net *net) >> +{ >> + return nf_flow_table_init_sysctl(net); >> +} >> + >> +static void nf_flow_table_pernet_exit(struct list_head *net_exit_list) >> +{ >> + struct net *net; >> + >> + list_for_each_entry(net, net_exit_list, exit_list) >> + nf_flow_table_fini_sysctl(net); >> +} >> + >> +unsigned int nf_ft_net_id __read_mostly; >> + >> +static struct pernet_operations nf_flow_table_net_ops = { >> + .init = nf_flow_table_pernet_init, >> + .exit_batch = nf_flow_table_pernet_exit, >> + .id = &nf_ft_net_id, >> + .size = sizeof(struct nf_ft_net), >> +}; >> + >> static int __init nf_flow_table_module_init(void) >> { >> - return nf_flow_table_offload_init(); >> + int ret; >> + >> + nf_ft_hw_max = 0; >> + >> + ret = register_pernet_subsys(&nf_flow_table_net_ops); >> + if (ret < 0) >> + return ret; >> + >> + ret = nf_flow_table_offload_init(); >> + if (ret) >> + goto out_offload; >> + >> + return 0; >> + >> +out_offload: >> + unregister_pernet_subsys(&nf_flow_table_net_ops); >> + return ret; >> } >> >> static void __exit nf_flow_table_module_exit(void) >> { >> nf_flow_table_offload_exit(); >> + unregister_pernet_subsys(&nf_flow_table_net_ops); >> } >> >> module_init(nf_flow_table_module_init); >> diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c >> index 11b6e1942092..a6e763901eb9 100644 >> --- a/net/netfilter/nf_flow_table_offload.c >> +++ b/net/netfilter/nf_flow_table_offload.c >> @@ -903,30 +903,56 @@ static int flow_offload_rule_add(struct flow_offload_work *offload, >> return 0; >> } >> >> +static bool flow_get_max_hw_entries(struct nf_flowtable *flow_table) >> +{ >> + struct nf_ft_net *fnet = nf_ft_pernet_get(flow_table); >> + int count_hw = atomic_inc_return(&fnet->count_hw); >> + >> + if (nf_ft_hw_max && count_hw > nf_ft_hw_max) { >> + atomic_dec(&fnet->count_hw); >> + return false; >> + } >> + return true; >> +} >> + >> static void flow_offload_work_add(struct flow_offload_work *offload) >> { >> + struct nf_ft_net *fnet = nf_ft_pernet_get(offload->flowtable); >> struct nf_flow_rule *flow_rule[FLOW_OFFLOAD_DIR_MAX]; >> int err; >> >> + if (!flow_get_max_hw_entries(offload->flowtable)) >> + return; >> + >> err = nf_flow_offload_alloc(offload, flow_rule); >> if (err < 0) >> - return; >> + goto out_alloc; >> >> err = flow_offload_rule_add(offload, flow_rule); >> if (err < 0) >> - goto out; >> + goto out_add; >> >> - set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); >> + if (test_and_set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status)) >> + atomic_dec(&fnet->count_hw); >> + nf_flow_offload_destroy(flow_rule); >> + return; >> >> -out: >> +out_add: >> nf_flow_offload_destroy(flow_rule); >> +out_alloc: >> + atomic_dec(&fnet->count_hw); >> } >> >> static void flow_offload_work_del(struct flow_offload_work *offload) >> { >> - clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); >> + bool offloaded = test_and_clear_bit(IPS_HW_OFFLOAD_BIT, >> + &offload->flow->ct->status); >> + struct nf_ft_net *fnet = nf_ft_pernet_get(offload->flowtable); >> + >> flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL); >> flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); >> + if (offloaded) >> + atomic_dec(&fnet->count_hw); >> set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags); >> } >> >> diff --git a/net/netfilter/nf_flow_table_sysctl.c b/net/netfilter/nf_flow_table_sysctl.c >> new file mode 100644 >> index 000000000000..ce8c0a2c4bdb >> --- /dev/null >> +++ b/net/netfilter/nf_flow_table_sysctl.c >> @@ -0,0 +1,83 @@ >> +// SPDX-License-Identifier: GPL-2.0-only >> +#include <linux/kernel.h> >> +#include <net/netfilter/nf_flow_table.h> >> + >> +unsigned int nf_ft_hw_max __read_mostly; > > You can move this to nf_flow_table_core_offload.c for the Makefile > idea. > >> +#ifdef CONFIG_SYSCTL > > If you follow the Kconfig+Makefile approach, then this #ifdef can > likely go away as the entire file would be on/off thing. In following patch I also implement procfs stats in this file, which depends on another config variable (CONFIG_NF_FLOW_TABLE_PROCFS). Should I put procfs code in another new file that will also be conditionally compiled? > >> +enum nf_ct_sysctl_index { >> + NF_SYSCTL_FLOW_TABLE_MAX_HW, >> + NF_SYSCTL_FLOW_TABLE_COUNT_HW, >> + >> + __NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL, >> +}; >> + >> +#define NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL \ >> + (__NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL + 1) >> + >> +static struct ctl_table nf_flow_table_sysctl_table[] = { >> + [NF_SYSCTL_FLOW_TABLE_MAX_HW] = { >> + .procname = "nf_flowtable_max_hw", >> + .data = &nf_ft_hw_max, >> + .maxlen = sizeof(int), >> + .mode = 0644, >> + .proc_handler = proc_dointvec, >> + }, >> + [NF_SYSCTL_FLOW_TABLE_COUNT_HW] = { >> + .procname = "nf_flowtable_count_hw", >> + .maxlen = sizeof(int), >> + .mode = 0444, >> + .proc_handler = proc_dointvec, >> + }, >> + {} >> +}; >> + >> +int nf_flow_table_init_sysctl(struct net *net) >> +{ >> + struct nf_ft_net *fnet = nf_ft_pernet(net); >> + struct ctl_table *table; >> + >> + BUILD_BUG_ON(ARRAY_SIZE(nf_flow_table_sysctl_table) != NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL); >> + >> + table = kmemdup(nf_flow_table_sysctl_table, sizeof(nf_flow_table_sysctl_table), >> + GFP_KERNEL); >> + if (!table) >> + return -ENOMEM; >> + >> + table[NF_SYSCTL_FLOW_TABLE_COUNT_HW].data = &fnet->count_hw; >> + >> + /* Don't allow non-init_net ns to alter global sysctls */ >> + if (!net_eq(&init_net, net)) >> + table[NF_SYSCTL_FLOW_TABLE_MAX_HW].mode = 0444; >> + >> + fnet->sysctl_header = register_net_sysctl(net, "net/netfilter/ft", table); >> + if (!fnet->sysctl_header) >> + goto out_unregister_netfilter; >> + >> + return 0; >> + >> +out_unregister_netfilter: >> + kfree(table); >> + return -ENOMEM; >> +} >> + >> +void nf_flow_table_fini_sysctl(struct net *net) >> +{ >> + struct nf_ft_net *fnet = nf_ft_pernet(net); >> + struct ctl_table *table; >> + >> + table = fnet->sysctl_header->ctl_table_arg; >> + unregister_net_sysctl_table(fnet->sysctl_header); >> + kfree(table); >> +} >> + >> +#else >> +int nf_flow_table_init_sysctl(struct net *net) >> +{ >> + return 0; >> +} >> + >> +void nf_flow_table_fini_sysctl(struct net *net) >> +{ >> +} >> +#endif /* CONFIG_SYSCTL */ >> -- >> 2.31.1 >>