As of commit 4608fdfc07e1 ("netfilter: conntrack: collect all entries in one cycle") conntrack gc was changed to run periodically every 2 minutes. On systems where the conntrack hash table is set to a large value, almost all evictions happen from the gc worker rather than the packet path due to hash table distribution. This causes netlink event overflows when the events are collected. This change exposes two sysctls: 1. gc interval (milliseconds, default: 2 minutes) 2. buckets per cycle (default: UINT_MAX / all) This allows increasing the scan interval, and also reducing burstiness by switching to partial scans of the table for each cycle. If the scan is changed to partial mode, the next cycle resumes with the next bucket. The defaults keep the current behaviour. Reported-by: Karel Rericha <karel@xxxxxxxxx> Cc: Shmulik Ladkani <shmulik.ladkani@xxxxxxxxx> Cc: Eyal Birger <eyal.birger@xxxxxxxxx> Signed-off-by: Florian Westphal <fw@xxxxxxxxx> --- This is an alternative to Eyal's patch. gc_interval is in milliseconds rather than seconds and the new gc_buckets can be used to switch the gc behaviour to a partial scan. For example, you could configure it to scan at most 100 buckets every 10ms, which would scan about 10k entries/s. If you think the extra complexity of gc_buckets is unwanted I would suggest that Eyal submits a v3 with gc_interval in ms units. .../networking/nf_conntrack-sysctl.rst | 13 ++++++++++ include/net/netfilter/nf_conntrack.h | 2 ++ net/netfilter/nf_conntrack_core.c | 25 ++++++++++++++----- net/netfilter/nf_conntrack_standalone.c | 24 ++++++++++++++++++ 4 files changed, 58 insertions(+), 6 deletions(-) diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst index 311128abb768..26767a495406 100644 --- a/Documentation/networking/nf_conntrack-sysctl.rst +++ b/Documentation/networking/nf_conntrack-sysctl.rst @@ -61,6 +61,19 @@ nf_conntrack_frag6_timeout - INTEGER (seconds) Time to keep an IPv6 fragment in memory. 
+nf_conntrack_gc_buckets - INTEGER + default 4294967295 + + Number of buckets to scan during one gc cycle. + If the value is less than nf_conntrack_buckets, gc will return + early and next cycle resumes at the next unscanned bucket. + Default is to scan entire table per cycle. + +nf_conntrack_gc_interval - INTEGER (milliseconds) + default 120000 (2 minutes) + + Garbage collector Interval (in milliseconds). + nf_conntrack_generic_timeout - INTEGER (seconds) default 600 diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index cc663c68ddc4..ebaf36917c36 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -313,6 +313,8 @@ int nf_conntrack_hash_resize(unsigned int hashsize); extern struct hlist_nulls_head *nf_conntrack_hash; extern unsigned int nf_conntrack_htable_size; extern seqcount_spinlock_t nf_conntrack_generation; +extern unsigned long nf_conntrack_gc_interval; +extern unsigned int nf_conntrack_gc_buckets; extern unsigned int nf_conntrack_max; /* must be called with rcu read lock held */ diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 054ee9d25efe..0c789ee65e71 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -66,6 +66,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash); struct conntrack_gc_work { struct delayed_work dwork; u32 next_bucket; + u32 buckets; bool exiting; bool early_drop; }; @@ -83,6 +84,9 @@ static DEFINE_MUTEX(nf_conntrack_mutex); #define MIN_CHAINLEN 8u #define MAX_CHAINLEN (32u - MIN_CHAINLEN) +unsigned long __read_mostly nf_conntrack_gc_interval = GC_SCAN_INTERVAL; +unsigned int __read_mostly nf_conntrack_gc_buckets = UINT_MAX; + static struct conntrack_gc_work conntrack_gc_work; void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) @@ -1421,12 +1425,17 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct) static void gc_worker(struct work_struct *work) { unsigned long end_time 
= jiffies + GC_SCAN_MAX_DURATION; + unsigned long next_run = nf_conntrack_gc_interval; unsigned int i, hashsz, nf_conntrack_max95 = 0; - unsigned long next_run = GC_SCAN_INTERVAL; struct conntrack_gc_work *gc_work; + unsigned int buckets; gc_work = container_of(work, struct conntrack_gc_work, dwork.work); + buckets = gc_work->buckets; + gc_work->buckets = 0; + i = gc_work->next_bucket; + gc_work->next_bucket = 0; if (gc_work->early_drop) nf_conntrack_max95 = nf_conntrack_max / 100u * 95u; @@ -1491,7 +1500,12 @@ static void gc_worker(struct work_struct *work) cond_resched(); i++; + if (++buckets >= nf_conntrack_gc_buckets) { + gc_work->next_bucket = i; + break; + } if (time_after(jiffies, end_time) && i < hashsz) { + gc_work->buckets = buckets; gc_work->next_bucket = i; next_run = 0; break; @@ -1508,16 +1522,15 @@ static void gc_worker(struct work_struct *work) * This worker is only here to reap expired entries when system went * idle after a busy period. */ - if (next_run) { + if (next_run) gc_work->early_drop = false; - gc_work->next_bucket = 0; - } + queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run); } static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) { - INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker); + INIT_DELAYED_WORK(&gc_work->dwork, gc_worker); gc_work->exiting = false; } @@ -2743,7 +2756,7 @@ int nf_conntrack_init_start(void) goto err_proto; conntrack_gc_work_init(&conntrack_gc_work); - queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ); + queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, 10 * HZ); return 0; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 80f675d884b2..38c9d0a3c898 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -554,6 +554,8 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_MAX, NF_SYSCTL_CT_COUNT, NF_SYSCTL_CT_BUCKETS, + NF_SYSCTL_CT_GC_BUCKETS, + 
NF_SYSCTL_CT_GC_INTERVAL, NF_SYSCTL_CT_CHECKSUM, NF_SYSCTL_CT_LOG_INVALID, NF_SYSCTL_CT_EXPECT_MAX, @@ -624,6 +626,9 @@ enum nf_ct_sysctl_index { #define NF_SYSCTL_CT_LAST_SYSCTL (__NF_SYSCTL_CT_LAST_SYSCTL + 1) +static const unsigned long max_scan_interval = 1 * 24 * 60 * 60 * HZ; +static const unsigned long min_scan_interval = 1; + static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_MAX] = { .procname = "nf_conntrack_max", @@ -645,6 +650,23 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = nf_conntrack_hash_sysctl, }, + [NF_SYSCTL_CT_GC_BUCKETS] = { + .procname = "nf_conntrack_gc_buckets", + .data = &nf_conntrack_gc_buckets, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ONE, + }, + [NF_SYSCTL_CT_GC_INTERVAL] = { + .procname = "nf_conntrack_gc_interval", + .data = &nf_conntrack_gc_interval, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, + .extra1 = (void *)&min_scan_interval, + .extra2 = (void *)&max_scan_interval, + }, [NF_SYSCTL_CT_CHECKSUM] = { .procname = "nf_conntrack_checksum", .data = &init_net.ct.sysctl_checksum, @@ -1123,6 +1145,8 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) table[NF_SYSCTL_CT_MAX].mode = 0444; table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444; table[NF_SYSCTL_CT_BUCKETS].mode = 0444; + table[NF_SYSCTL_CT_GC_BUCKETS].mode = 0444; + table[NF_SYSCTL_CT_GC_INTERVAL].mode = 0444; } cnet->sysctl_header = register_net_sysctl(net, "net/netfilter", table); -- 2.32.0