We introduce an optional per-netns hash table for UDP. With a smaller hash table, we can look up sockets faster and isolate noisy neighbours. Also, we can reduce lock contention. We can control the hash table size by a new sysctl knob. However, depending on workloads, it will require very sensitive tuning, so we disable the feature by default (net.ipv4.udp_child_hash_entries == 0). Moreover, we can fall back to using the global hash table in case we fail to allocate enough memory for a new hash table. We can check the current hash table size by another read-only sysctl knob, net.ipv4.udp_hash_entries. A negative value means the netns shares the global hash table (per-netns hash table is disabled or failed to allocate memory). We could optimise the hash table lookup/iteration further by removing netns comparison for the per-netns one in the future. Also, we could optimise the sparse udp_hslot layout by putting it in udp_table. Signed-off-by: Kuniyuki Iwashima <kuniyu@xxxxxxxxxx> --- Documentation/networking/ip-sysctl.rst | 20 ++++++++ include/net/netns/ipv4.h | 2 + net/ipv4/sysctl_net_ipv4.c | 56 +++++++++++++++++++++ net/ipv4/udp.c | 69 ++++++++++++++++++++++++++ 4 files changed, 147 insertions(+) diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 97a0952b11e3..6dc4e2853e39 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1090,6 +1090,26 @@ udp_rmem_min - INTEGER udp_wmem_min - INTEGER UDP does not have tx memory accounting and this tunable has no effect. +udp_hash_entries - INTEGER + Read-only number of hash buckets for UDP sockets in the current + networking namespace. + + A negative value means the networking namespace does not own its + hash buckets and shares the initial networking namespace's one. 
+ +udp_child_hash_entries - INTEGER + Control the number of hash buckets for UDP sockets in the child + networking namespace, which must be set before clone() or unshare(). + + The written value except for 0 is rounded up to 2^n. 0 is a special + value, meaning the child networking namespace will share the initial + networking namespace's hash buckets. + + Note that the child will use the global one in case the kernel + fails to allocate enough memory. + + Default: 0 + RAW variables ============= diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index c367da5d61e2..a1be7ebb7338 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -200,6 +200,8 @@ struct netns_ipv4 { atomic_t dev_addr_genid; + unsigned int sysctl_udp_child_hash_entries; + #ifdef CONFIG_SYSCTL unsigned long *sysctl_local_reserved_ports; int sysctl_ip_prot_sock; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 03a3187c4705..b3cea3f36463 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -424,6 +424,47 @@ static int proc_tcp_child_ehash_entries(struct ctl_table *table, int write, return 0; } +static int proc_udp_hash_entries(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_udp_child_hash_entries); + int udp_hash_entries; + struct ctl_table tbl; + + udp_hash_entries = net->ipv4.udp_table->mask + 1; + + /* A negative number indicates that the child netns + * shares the global udp_table. 
+ */ + if (!net_eq(net, &init_net) && net->ipv4.udp_table == &udp_table) + udp_hash_entries *= -1; + + tbl.data = &udp_hash_entries; + tbl.maxlen = sizeof(int); + + return proc_dointvec(&tbl, write, buffer, lenp, ppos); +} + +static int proc_udp_child_hash_entries(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + unsigned int udp_child_hash_entries; + int ret; + + ret = proc_douintvec(table, write, buffer, lenp, ppos); + if (!write || ret) + return ret; + + udp_child_hash_entries = READ_ONCE(*(unsigned int *)table->data); + if (udp_child_hash_entries) + udp_child_hash_entries = roundup_pow_of_two(udp_child_hash_entries); + + WRITE_ONCE(*(unsigned int *)table->data, udp_child_hash_entries); + + return 0; +} + #ifdef CONFIG_IP_ROUTE_MULTIPATH static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, void *buffer, size_t *lenp, @@ -1378,6 +1419,21 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, + { + .procname = "udp_hash_entries", + .data = &init_net.ipv4.sysctl_udp_child_hash_entries, + .mode = 0444, + .proc_handler = proc_udp_hash_entries, + }, + { + .procname = "udp_child_hash_entries", + .data = &init_net.ipv4.sysctl_udp_child_hash_entries, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_udp_child_hash_entries, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, + }, { .procname = "udp_rmem_min", .data = &init_net.ipv4.sysctl_udp_rmem_min, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f4825e38762a..c41306225305 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -3309,8 +3309,77 @@ static int __net_init udp_sysctl_init(struct net *net) return 0; } +static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries) +{ + struct udp_table *udptable; + int i; + + udptable = kmalloc(sizeof(*udptable), GFP_KERNEL); + if (!udptable) + goto out; + + udptable->hash = kvmalloc_array(hash_entries * 2, + sizeof(struct 
udp_hslot), GFP_KERNEL); + if (!udptable->hash) + goto free_table; + + udptable->hash2 = udptable->hash + hash_entries; + udptable->mask = hash_entries - 1; + udptable->log = ilog2(hash_entries); + + for (i = 0; i < hash_entries; i++) { + INIT_HLIST_HEAD(&udptable->hash[i].head); + udptable->hash[i].count = 0; + spin_lock_init(&udptable->hash[i].lock); + + INIT_HLIST_HEAD(&udptable->hash2[i].head); + udptable->hash2[i].count = 0; + spin_lock_init(&udptable->hash2[i].lock); + } + + return udptable; + +free_table: + kfree(udptable); +out: + return NULL; +} + +static int __net_init udp_pernet_table_init(struct net *net, struct net *old_net) +{ + struct udp_table *udptable; + unsigned int hash_entries; + + hash_entries = READ_ONCE(old_net->ipv4.sysctl_udp_child_hash_entries); + if (!hash_entries) + goto out; + + udptable = udp_pernet_table_alloc(hash_entries); + if (udptable) + net->ipv4.udp_table = udptable; + else + pr_warn("Failed to allocate UDP hash table (entries: %u) " + "for a netns, fallback to use the global one\n", + hash_entries); +out: + return 0; +} + +static void __net_exit udp_pernet_table_free(struct net *net) +{ + struct udp_table *udptable = net->ipv4.udp_table; + + if (udptable == &udp_table) + return; + + kvfree(udptable->hash); + kfree(udptable); +} + static struct pernet_operations __net_initdata udp_sysctl_ops = { .init = udp_sysctl_init, + .init2 = udp_pernet_table_init, + .exit = udp_pernet_table_free, }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) -- 2.30.2