Re: [PATCH net-next v2 2/3] netfilter: nf_flow_table: count and limit hw offloaded entries

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, May 16, 2022 at 10:10:31PM +0300, Vlad Buslov wrote:
> To improve hardware offload debuggability and scalability introduce
> 'nf_flowtable_count_hw' and 'nf_flowtable_max_hw' sysctl entries in new
> dedicated 'net/netfilter/ft' namespace. Add new pernet struct nf_ft_net in
> order to store the counter and sysctl header of new sysctl table.
> 
> Count the offloaded flows in workqueue add task handler. Verify that
> offloaded flow total is lower than allowed maximum before calling the
> driver callbacks. To prevent spamming the 'add' workqueue with tasks when
> flows can't be offloaded anymore also check that count is below limit
> before queuing offload work. This doesn't prevent all redundant workqueue
> task since counter can be taken by concurrent work handler after the check
> had been performed but before the offload job is executed but it still
> greatly reduces such occurrences. Note that flows that were not offloaded
> due to counter being larger than the cap can still be offloaded via refresh
> function.
> 
> Ensure that flows are accounted correctly by verifying IPS_HW_OFFLOAD_BIT
> value before counting them. This ensures that add/refresh code path
> increments the counter exactly once per flow when setting the bit and
> decrements it only for accounted flows when deleting the flow with the bit
> set.
> 
> Signed-off-by: Vlad Buslov <vladbu@xxxxxxxxxx>
> Signed-off-by: Oz Shlomo <ozsh@xxxxxxxxxx>
> ---
> 
> Notes:
>     Changes V1 -> V2:
>     
>     - Combine patches that expose offloaded flow count and limit total
>     offloaded flows.
>     
>     - Rework the counting logic to count in workqueue context.
>     
>     - Create dedicated namespace for flow table sysctls.
> 
>  .../networking/nf_flowtable-sysctl.rst        | 17 ++++
>  include/net/netfilter/nf_flow_table.h         | 25 ++++++
>  net/netfilter/Makefile                        |  3 +-
>  net/netfilter/nf_flow_table_core.c            | 55 +++++++++++-
>  net/netfilter/nf_flow_table_offload.c         | 36 ++++++--
>  net/netfilter/nf_flow_table_sysctl.c          | 83 +++++++++++++++++++
>  6 files changed, 210 insertions(+), 9 deletions(-)
>  create mode 100644 Documentation/networking/nf_flowtable-sysctl.rst
>  create mode 100644 net/netfilter/nf_flow_table_sysctl.c
> 
> diff --git a/Documentation/networking/nf_flowtable-sysctl.rst b/Documentation/networking/nf_flowtable-sysctl.rst
> new file mode 100644
> index 000000000000..932b4abeca6c
> --- /dev/null
> +++ b/Documentation/networking/nf_flowtable-sysctl.rst
> @@ -0,0 +1,17 @@
> +.. SPDX-License-Identifier: GPL-2.0
> +
> +===================================
> +Netfilter Flowtable Sysfs variables
> +===================================

Better document the sysctl entries in the existing nf_conntrack.txt
file rather than this new file?

> +
> +/proc/sys/net/netfilter/ft/nf_flowtable_* Variables:
> +=================================================
> +
> +nf_flowtable_count_hw - INTEGER (read-only)
> +	Number of flowtable entries that are currently offloaded to hardware.
> +
> +nf_flowtable_max_hw - INTEGER
> +	- 0 - disabled (default)
> +	- not 0 - enabled
> +
> +	Cap on maximum total amount of flowtable entries offloaded to hardware.
> diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
> index 64daafd1fc41..e09c29fa034e 100644
> --- a/include/net/netfilter/nf_flow_table.h
> +++ b/include/net/netfilter/nf_flow_table.h
> @@ -335,4 +335,29 @@ static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb)
>  	return 0;
>  }
>  
> +struct nf_ft_net {
> +	atomic_t count_hw;
> +#ifdef CONFIG_SYSCTL
> +	struct ctl_table_header *sysctl_header;
> +#endif
> +};
> +
> +extern unsigned int nf_ft_hw_max;
> +extern unsigned int nf_ft_net_id;
> +
> +#include <net/netns/generic.h>
> +
> +static inline struct nf_ft_net *nf_ft_pernet(const struct net *net)
> +{
> +	return net_generic(net, nf_ft_net_id);
> +}
> +
> +static inline struct nf_ft_net *nf_ft_pernet_get(struct nf_flowtable *flow_table)
> +{
> +	return nf_ft_pernet(read_pnet(&flow_table->net));
> +}
> +
> +int nf_flow_table_init_sysctl(struct net *net);
> +void nf_flow_table_fini_sysctl(struct net *net);
> +
>  #endif /* _NF_FLOW_TABLE_H */
> diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
> index 238b6a620e88..67e281842b61 100644
> --- a/net/netfilter/Makefile
> +++ b/net/netfilter/Makefile
> @@ -127,7 +127,8 @@ obj-$(CONFIG_NFT_FWD_NETDEV)	+= nft_fwd_netdev.o
>  # flow table infrastructure
>  obj-$(CONFIG_NF_FLOW_TABLE)	+= nf_flow_table.o
>  nf_flow_table-objs		:= nf_flow_table_core.o nf_flow_table_ip.o \
> -				   nf_flow_table_offload.o
> +				   nf_flow_table_offload.o \
> +				   nf_flow_table_sysctl.o
>  
>  obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
>  
> diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
> index 3db256da919b..e2598f98017c 100644
> --- a/net/netfilter/nf_flow_table_core.c
> +++ b/net/netfilter/nf_flow_table_core.c
> @@ -277,6 +277,13 @@ static const struct rhashtable_params nf_flow_offload_rhash_params = {
>  	.automatic_shrinking	= true,
>  };
>  
> +static bool flow_max_hw_entries(struct nf_flowtable *flow_table)
> +{
> +	struct nf_ft_net *fnet = nf_ft_pernet_get(flow_table);
> +
> +	return nf_ft_hw_max && atomic_read(&fnet->count_hw) >= nf_ft_hw_max;
> +}
> +
>  unsigned long flow_offload_get_timeout(struct flow_offload *flow)
>  {
>  	unsigned long timeout = NF_FLOW_TIMEOUT;
> @@ -320,7 +327,8 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
>  
>  	nf_ct_offload_timeout(flow->ct);
>  
> -	if (nf_flowtable_hw_offload(flow_table)) {
> +	if (nf_flowtable_hw_offload(flow_table) &&
> +	    !flow_max_hw_entries(flow_table)) {
>  		__set_bit(NF_FLOW_HW, &flow->flags);
>  		nf_flow_offload_add(flow_table, flow);
>  	}
> @@ -338,9 +346,11 @@ void flow_offload_refresh(struct nf_flowtable *flow_table,
>  	if (READ_ONCE(flow->timeout) != timeout)
>  		WRITE_ONCE(flow->timeout, timeout);
>  
> -	if (likely(!nf_flowtable_hw_offload(flow_table)))
> +	if (likely(!nf_flowtable_hw_offload(flow_table) ||
> +		   flow_max_hw_entries(flow_table)))
>  		return;
>  
> +	set_bit(NF_FLOW_HW, &flow->flags);
>  	nf_flow_offload_add(flow_table, flow);
>  }
>  EXPORT_SYMBOL_GPL(flow_offload_refresh);
> @@ -652,14 +662,53 @@ void nf_flow_table_free(struct nf_flowtable *flow_table)
>  }
>  EXPORT_SYMBOL_GPL(nf_flow_table_free);
>  
> +static int nf_flow_table_pernet_init(struct net *net)
> +{
> +	return nf_flow_table_init_sysctl(net);
> +}
> +
> +static void nf_flow_table_pernet_exit(struct list_head *net_exit_list)
> +{
> +	struct net *net;
> +
> +	list_for_each_entry(net, net_exit_list, exit_list)
> +		nf_flow_table_fini_sysctl(net);
> +}
> +
> +unsigned int nf_ft_net_id __read_mostly;
> +
> +static struct pernet_operations nf_flow_table_net_ops = {
> +	.init = nf_flow_table_pernet_init,
> +	.exit_batch = nf_flow_table_pernet_exit,
> +	.id = &nf_ft_net_id,
> +	.size = sizeof(struct nf_ft_net),
> +};
> +
>  static int __init nf_flow_table_module_init(void)
>  {
> -	return nf_flow_table_offload_init();
> +	int ret;
> +
> +	nf_ft_hw_max = 0;
> +
> +	ret = register_pernet_subsys(&nf_flow_table_net_ops);
> +	if (ret < 0)
> +		return ret;
> +
> +	ret = nf_flow_table_offload_init();
> +	if (ret)
> +		goto out_offload;
> +
> +	return 0;
> +
> +out_offload:
> +	unregister_pernet_subsys(&nf_flow_table_net_ops);
> +	return ret;
>  }
>  
>  static void __exit nf_flow_table_module_exit(void)
>  {
>  	nf_flow_table_offload_exit();
> +	unregister_pernet_subsys(&nf_flow_table_net_ops);
>  }
>  
>  module_init(nf_flow_table_module_init);
> diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
> index 11b6e1942092..a6e763901eb9 100644
> --- a/net/netfilter/nf_flow_table_offload.c
> +++ b/net/netfilter/nf_flow_table_offload.c
> @@ -903,30 +903,56 @@ static int flow_offload_rule_add(struct flow_offload_work *offload,
>  	return 0;
>  }
>  
> +static bool flow_get_max_hw_entries(struct nf_flowtable *flow_table)
> +{
> +	struct nf_ft_net *fnet = nf_ft_pernet_get(flow_table);
> +	int count_hw = atomic_inc_return(&fnet->count_hw);
> +
> +	if (nf_ft_hw_max && count_hw > nf_ft_hw_max) {
> +		atomic_dec(&fnet->count_hw);
> +		return false;
> +	}
> +	return true;
> +}
> +
>  static void flow_offload_work_add(struct flow_offload_work *offload)
>  {
> +	struct nf_ft_net *fnet = nf_ft_pernet_get(offload->flowtable);
>  	struct nf_flow_rule *flow_rule[FLOW_OFFLOAD_DIR_MAX];
>  	int err;
>  
> +	if (!flow_get_max_hw_entries(offload->flowtable))
> +		return;
> +
>  	err = nf_flow_offload_alloc(offload, flow_rule);
>  	if (err < 0)
> -		return;
> +		goto out_alloc;
>  
>  	err = flow_offload_rule_add(offload, flow_rule);
>  	if (err < 0)
> -		goto out;
> +		goto out_add;
>  
> -	set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status);
> +	if (test_and_set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status))
> +		atomic_dec(&fnet->count_hw);
> +	nf_flow_offload_destroy(flow_rule);
> +	return;
>  
> -out:
> +out_add:
>  	nf_flow_offload_destroy(flow_rule);
> +out_alloc:
> +	atomic_dec(&fnet->count_hw);
>  }
>  
>  static void flow_offload_work_del(struct flow_offload_work *offload)
>  {
> -	clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status);
> +	bool offloaded = test_and_clear_bit(IPS_HW_OFFLOAD_BIT,
> +					    &offload->flow->ct->status);
> +	struct nf_ft_net *fnet = nf_ft_pernet_get(offload->flowtable);
> +
>  	flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL);
>  	flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY);
> +	if (offloaded)
> +		atomic_dec(&fnet->count_hw);
>  	set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags);
>  }
>  
> diff --git a/net/netfilter/nf_flow_table_sysctl.c b/net/netfilter/nf_flow_table_sysctl.c
> new file mode 100644
> index 000000000000..ce8c0a2c4bdb
> --- /dev/null
> +++ b/net/netfilter/nf_flow_table_sysctl.c
> @@ -0,0 +1,83 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +#include <linux/kernel.h>
> +#include <net/netfilter/nf_flow_table.h>
> +
> +unsigned int nf_ft_hw_max __read_mostly;

You can move this to nf_flow_table_core_offload.c for the Makefile
idea.

> +#ifdef CONFIG_SYSCTL

If you follow the Kconfig+Makefile approach, then this #ifdef can
likely go away as the entire file would be on/off thing.

> +enum nf_ct_sysctl_index {
> +	NF_SYSCTL_FLOW_TABLE_MAX_HW,
> +	NF_SYSCTL_FLOW_TABLE_COUNT_HW,
> +
> +	__NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL,
> +};
> +
> +#define NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL                                       \
> +	(__NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL + 1)
> +
> +static struct ctl_table nf_flow_table_sysctl_table[] = {
> +	[NF_SYSCTL_FLOW_TABLE_MAX_HW] = {
> +		.procname	= "nf_flowtable_max_hw",
> +		.data		= &nf_ft_hw_max,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec,
> +	},
> +	[NF_SYSCTL_FLOW_TABLE_COUNT_HW] = {
> +		.procname	= "nf_flowtable_count_hw",
> +		.maxlen		= sizeof(int),
> +		.mode		= 0444,
> +		.proc_handler	= proc_dointvec,
> +	},
> +	{}
> +};
> +
> +int nf_flow_table_init_sysctl(struct net *net)
> +{
> +	struct nf_ft_net *fnet = nf_ft_pernet(net);
> +	struct ctl_table *table;
> +
> +	BUILD_BUG_ON(ARRAY_SIZE(nf_flow_table_sysctl_table) != NF_SYSCTL_FLOW_TABLE_LAST_SYSCTL);
> +
> +	table = kmemdup(nf_flow_table_sysctl_table, sizeof(nf_flow_table_sysctl_table),
> +			GFP_KERNEL);
> +	if (!table)
> +		return -ENOMEM;
> +
> +	table[NF_SYSCTL_FLOW_TABLE_COUNT_HW].data = &fnet->count_hw;
> +
> +	/* Don't allow non-init_net ns to alter global sysctls */
> +	if (!net_eq(&init_net, net))
> +		table[NF_SYSCTL_FLOW_TABLE_MAX_HW].mode = 0444;
> +
> +	fnet->sysctl_header = register_net_sysctl(net, "net/netfilter/ft", table);
> +	if (!fnet->sysctl_header)
> +		goto out_unregister_netfilter;
> +
> +	return 0;
> +
> +out_unregister_netfilter:
> +	kfree(table);
> +	return -ENOMEM;
> +}
> +
> +void nf_flow_table_fini_sysctl(struct net *net)
> +{
> +	struct nf_ft_net *fnet = nf_ft_pernet(net);
> +	struct ctl_table *table;
> +
> +	table = fnet->sysctl_header->ctl_table_arg;
> +	unregister_net_sysctl_table(fnet->sysctl_header);
> +	kfree(table);
> +}
> +
> +#else
> +int nf_flow_table_init_sysctl(struct net *net)
> +{
> +	return 0;
> +}
> +
> +void nf_flow_table_fini_sysctl(struct net *net)
> +{
> +}
> +#endif /* CONFIG_SYSCTL */
> -- 
> 2.31.1
> 



[Index of Archives]     [Netfitler Users]     [Berkeley Packet Filter]     [LARTC]     [Bugtraq]     [Yosemite Forum]

  Powered by Linux