On Wed, Apr 28, 2021 at 10:54:37AM +0200, Christian Borntraeger wrote: > > > On 28.04.21 10:46, Peter Zijlstra wrote: > > On Tue, Apr 27, 2021 at 04:59:25PM +0200, Christian Borntraeger wrote: > > > Peter, > > > > > > I just realized that we moved away sysctl tunabled to debugfs in next. > > > We have seen several cases where it was benefitial to set > > > sched_migration_cost_ns to a lower value. For example with KVM I can > > > easily get 50% more transactions with 50000 instead of 500000. > > > Until now it was possible to use tuned or /etc/sysctl.conf to set > > > these things permanently. > > > > > > Given that some people do not want to have debugfs mounted all the time > > > I would consider this a regression. The sysctl tunable was always > > > available. > > > > > > I am ok with the "informational" things being in debugfs, but not > > > the tunables. So how do we proceed here? > > > > It's all SCHED_DEBUG; IOW you're relying on DEBUG infrastructure for > > production performance, and that's your fail. > > No its not. sched_migration_cost_ns was NEVER protected by CONFIG_SCHED_DEBUG. > It was available on all kernels with CONFIG_SMP. The relevant section from origin/master:kernel/sysctl.c: #ifdef CONFIG_SCHED_DEBUG { .procname = "sched_min_granularity_ns", .data = &sysctl_sched_min_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_proc_update_handler, .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, { .procname = "sched_latency_ns", .data = &sysctl_sched_latency, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_proc_update_handler, .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, { .procname = "sched_wakeup_granularity_ns", .data = &sysctl_sched_wakeup_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_proc_update_handler, .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, }, #ifdef CONFIG_SMP { .procname = "sched_tunable_scaling", .data = &sysctl_sched_tunable_scaling, .maxlen = sizeof(enum sched_tunable_scaling), .mode = 0644, .proc_handler = sched_proc_update_handler, .extra1 = &min_sched_tunable_scaling, .extra2 = &max_sched_tunable_scaling, }, { .procname = "sched_migration_cost_ns", .data = &sysctl_sched_migration_cost, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "sched_nr_migrate", .data = &sysctl_sched_nr_migrate, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, }, #ifdef CONFIG_SCHEDSTATS { .procname = "sched_schedstats", .data = NULL, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sysctl_schedstats, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_SCHEDSTATS */ #endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING { .procname = "numa_balancing_scan_delay_ms", .data = &sysctl_numa_balancing_scan_delay, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "numa_balancing_scan_period_min_ms", .data = &sysctl_numa_balancing_scan_period_min, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "numa_balancing_scan_period_max_ms", .data = &sysctl_numa_balancing_scan_period_max, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "numa_balancing_scan_size_mb", .data = &sysctl_numa_balancing_scan_size, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, { .procname = "numa_balancing", .data = NULL, /* filled in by handler */ .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sysctl_numa_balancing, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ How is migration_cost not under SCHED_DEBUG? The bigger problem is that world+dog has SCHED_DEBUG=y in their .config.