Unfortunately, MAX_WORK at this moment is broken: during trie_rebalance() climbing upwards resize() is being called for each tnode. Which makes the limit useless the bigger trie gets: at each level there are 10 attempts to balance tnode and childs, resulting in O(10^n + 10^{n-1} + ... 10^{n-k}) complexity to balance trie, where k - is the level where we changed the trie originally (adding/removing alias/route). Which results in the following price of removing one route under big trie (similar for some other single routes that results in reallocating tnodes by hitting the threshold limits for halve/inflate resize): Before: Basic info: size of leaf: 40 bytes, size of tnode: 40 bytes. Main: Aver depth: 1.99 Max depth: 2 Leaves: 77825 Prefixes: 77825 Internal nodes: 77 9: 1 10: 76 Pointers: 78336 Null ptrs: 435 Total size: 7912 kB Local: [omitted] After: Basic info: size of leaf: 40 bytes, size of tnode: 40 bytes. Main: Aver depth: 3.00 Max depth: 3 Leaves: 77824 Prefixes: 77824 Internal nodes: 20491 1: 2048 2: 18432 6: 1 11: 10 Pointers: 98368 Null ptrs: 54 Total size: 8865 kB Local: [omitted] Provide a sysctl to control amount of pending balancing work. (by default unlimited as it was) Cc: linux-doc@xxxxxxxxxxxxxxx Cc: Jonathan Corbet <corbet@xxxxxxx> Fixes: ff181ed8768f ("fib_trie: Push assignment of child to parent down into inflate/halve") Signed-off-by: Dmitry Safonov <dima@xxxxxxxxxx> --- Documentation/networking/ip-sysctl.txt | 6 +++ include/net/ip.h | 1 + net/ipv4/fib_trie.c | 60 +++++++++++++++----------- net/ipv4/sysctl_net_ipv4.c | 7 +++ 4 files changed, 49 insertions(+), 25 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index acdfb5d2bcaa..fb71dacff4dd 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -81,6 +81,12 @@ fib_multipath_hash_policy - INTEGER 0 - Layer 3 1 - Layer 4 +fib_balance_budget - UNSIGNED INTEGER + Limits the number of resize attempts during balancing fib trie + on adding/removing new routes. + Possible values: + Default: UINT_MAX (0xFFFFFFFF) + ip_forward_update_priority - INTEGER Whether to update SKB priority from "TOS" field in IPv4 header after it is forwarded. The new SKB priority is mapped from TOS field value diff --git a/include/net/ip.h b/include/net/ip.h index be3cad9c2e4c..305d0e43088b 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -421,6 +421,7 @@ static inline unsigned int ip_skb_dst_mtu(struct sock *sk, return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); } +extern unsigned int fib_balance_budget; struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx, int fc_mx_len, struct netlink_ext_ack *extack); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index ad7d56c421cb..d90cf9dfd443 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -182,8 +182,10 @@ struct trie { #endif }; -static struct key_vector *resize(struct trie *t, struct key_vector *tn); +static struct key_vector *resize(struct trie *t, struct key_vector *tn, + unsigned int *budget); static size_t tnode_free_size; +unsigned int fib_balance_budget = UINT_MAX; /* * synchronize_rcu after call_rcu for that many pages; it should be especially @@ -506,7 +508,8 @@ static void tnode_free(struct key_vector *tn) static struct key_vector *replace(struct trie *t, struct key_vector *oldtnode, - struct key_vector *tn) + struct key_vector *tn, + unsigned int *budget) { struct key_vector *tp = node_parent(oldtnode); unsigned long i; @@ -522,19 +525,19 @@ static struct key_vector *replace(struct trie *t, tnode_free(oldtnode); /* resize children now that oldtnode is freed */ - for (i = child_length(tn); i;) { + for (i = child_length(tn); i && *budget;) { struct key_vector *inode = get_child(tn, --i); /* resize child node */ if (tnode_full(tn, inode)) - tn = resize(t, inode); + tn = resize(t, inode, budget); } return tp; } -static struct key_vector *inflate(struct trie *t, - struct key_vector *oldtnode) +static struct key_vector *inflate(struct trie *t, struct key_vector *oldtnode, + unsigned int *budget) { struct key_vector *tn; unsigned long i; @@ -621,7 +624,7 @@ static struct key_vector *inflate(struct trie *t, } /* setup the parent pointers into and out of this node */ - return replace(t, oldtnode, tn); + return replace(t, oldtnode, tn, budget); nomem: /* all pointers should be clean so we are done */ tnode_free(tn); @@ -629,8 +632,8 @@ static struct key_vector *inflate(struct trie *t, return NULL; } -static struct key_vector *halve(struct trie *t, - struct key_vector *oldtnode) +static struct key_vector *halve(struct trie *t, struct key_vector *oldtnode, + unsigned int *budget) { struct key_vector *tn; unsigned long i; @@ -676,7 +679,7 @@ static struct key_vector *halve(struct trie *t, } /* setup the parent pointers into and out of this node */ - return replace(t, oldtnode, tn); + return replace(t, oldtnode, tn, budget); nomem: /* all pointers should be clean so we are done */ tnode_free(tn); @@ -843,15 +846,15 @@ static inline bool should_collapse(struct key_vector *tn) return used < 2; } -#define MAX_WORK 10 -static struct key_vector *resize(struct trie *t, struct key_vector *tn) +static struct key_vector *resize(struct trie *t, struct key_vector *tn, + unsigned int *budget) { #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats = t->stats; #endif struct key_vector *tp = node_parent(tn); unsigned long cindex = get_index(tn->key, tp); - int max_work = MAX_WORK; + bool inflated = false; pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", tn, inflate_threshold, halve_threshold); @@ -865,8 +868,8 @@ static struct key_vector *resize(struct trie *t, struct key_vector *tn) /* Double as long as the resulting node has a number of * nonempty nodes that are above the threshold. */ - while (should_inflate(tp, tn) && max_work) { - tp = inflate(t, tn); + while (should_inflate(tp, tn) && *budget) { + tp = inflate(t, tn, budget); if (!tp) { #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->resize_node_skipped); @@ -874,22 +877,25 @@ static struct key_vector *resize(struct trie *t, struct key_vector *tn) break; } - max_work--; + (*budget)--; + inflated = true; tn = get_child(tp, cindex); } /* update parent in case inflate failed */ tp = node_parent(tn); - /* Return if at least one inflate is run */ - if (max_work != MAX_WORK) + /* Return if at least one inflate is run: + * microoptimization to not recalculate thresholds + */ + if (inflated) return tp; /* Halve as long as the number of empty children in this * node is above threshold. */ - while (should_halve(tp, tn) && max_work) { - tp = halve(t, tn); + while (should_halve(tp, tn) && *budget) { + tp = halve(t, tn, budget); if (!tp) { #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->resize_node_skipped); @@ -897,7 +903,7 @@ static struct key_vector *resize(struct trie *t, struct key_vector *tn) break; } - max_work--; + (*budget)--; tn = get_child(tp, cindex); } @@ -1005,8 +1011,10 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, static void trie_rebalance(struct trie *t, struct key_vector *tn) { - while (!IS_TRIE(tn)) - tn = resize(t, tn); + unsigned int budget = fib_balance_budget; + + while (budget && !IS_TRIE(tn)) + tn = resize(t, tn, &budget); } static int fib_insert_node(struct trie *t, struct key_vector *tp, @@ -1784,6 +1792,7 @@ struct fib_table *fib_trie_unmerge(struct fib_table *oldtb) void fib_table_flush_external(struct fib_table *tb) { struct trie *t = (struct trie *)tb->tb_data; + unsigned int budget = fib_balance_budget; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct hlist_node *tmp; @@ -1806,7 +1815,7 @@ void fib_table_flush_external(struct fib_table *tb) update_suffix(pn); /* resize completed node */ - pn = resize(t, pn); + pn = resize(t, pn, &budget); cindex = get_index(pkey, pn); continue; @@ -1853,6 +1862,7 @@ void fib_table_flush_external(struct fib_table *tb) int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) { struct trie *t = (struct trie *)tb->tb_data; + unsigned int budget = fib_balance_budget; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct hlist_node *tmp; @@ -1876,7 +1886,7 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) update_suffix(pn); /* resize completed node */ - pn = resize(t, pn); + pn = resize(t, pn, &budget); cindex = get_index(pkey, pn); continue; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ba0fc4b18465..d7274cc442af 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -443,6 +443,13 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "fib_balance_budget", + .data = &fib_balance_budget, + .maxlen = sizeof(fib_balance_budget), + .mode = 0644, + .proc_handler = proc_douintvec, + }, { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, -- 2.21.0