This is the core of the fsio-throttle controller: it defines the
interface to the cgroup subsystem and implements the I/O measurement
and throttling logic.

Signed-off-by: Andrea Righi <righi.andrea@xxxxxxxxx>
---
 include/linux/cgroup_subsys.h |   4 +
 include/linux/fsio-throttle.h |  43 +++
 init/Kconfig                  |  11 +
 kernel/cgroup/Makefile        |   1 +
 kernel/cgroup/fsio-throttle.c | 501 ++++++++++++++++++++++++++++++++++
 5 files changed, 560 insertions(+)
 create mode 100644 include/linux/fsio-throttle.h
 create mode 100644 kernel/cgroup/fsio-throttle.c

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index acb77dcff3b4..33beb70c0eca 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -61,6 +61,10 @@ SUBSYS(pids)
 SUBSYS(rdma)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_FSIO_THROTTLE)
+SUBSYS(fsio)
+#endif
+
 /*
  * The following subsystems are not supported on the default hierarchy.
  */
diff --git a/include/linux/fsio-throttle.h b/include/linux/fsio-throttle.h
new file mode 100644
index 000000000000..3a46df712475
--- /dev/null
+++ b/include/linux/fsio-throttle.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __FSIO_THROTTLE_H__
+#define __FSIO_THROTTLE_H__
+
+#include <linux/fs.h>
+#include <linux/genhd.h>
+
+#ifdef CONFIG_BLOCK
+static inline dev_t bdev_to_dev(struct block_device *bdev)
+{
+	return bdev ? MKDEV(MAJOR(bdev->bd_inode->i_rdev),
+			    bdev->bd_disk->first_minor) : 0;
+}
+
+static inline struct block_device *as_to_bdev(struct address_space *mapping)
+{
+	return (mapping->host && mapping->host->i_sb->s_bdev) ?
+		mapping->host->i_sb->s_bdev : NULL;
+}
+#else /* CONFIG_BLOCK */
+static inline dev_t bdev_to_dev(struct block_device *bdev)
+{
+	return 0;
+}
+
+static inline struct block_device *as_to_bdev(struct address_space *mapping)
+{
+	return NULL;
+}
+#endif /* CONFIG_BLOCK */
+
+#ifdef CONFIG_CGROUP_FSIO_THROTTLE
+int fsio_throttle(dev_t dev, ssize_t bytes, int state);
+#else /* CONFIG_CGROUP_FSIO_THROTTLE */
+static inline int
+fsio_throttle(dev_t dev, ssize_t bytes, int state)
+{
+	return 0;
+}
+#endif /* CONFIG_CGROUP_FSIO_THROTTLE */
+
+#endif /* __FSIO_THROTTLE_H__ */
diff --git a/init/Kconfig b/init/Kconfig
index d47cb77a220e..95d7342801eb 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -775,6 +775,17 @@ config CGROUP_WRITEBACK
 	depends on MEMCG && BLK_CGROUP
 	default y
 
+config CGROUP_FSIO_THROTTLE
+	bool "Filesystem I/O throttling controller"
+	default n
+	depends on BLOCK
+	help
+	  This option enables the filesystem I/O throttling infrastructure.
+
+	  This allows reads and writes to be throttled at the filesystem
+	  level, without introducing I/O locking contention or priority
+	  inversion problems.
+
 menuconfig CGROUP_SCHED
 	bool "CPU controller"
 	default n
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index bfcdae896122..12de828b36cd 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -2,6 +2,7 @@
 obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o
 
 obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
+obj-$(CONFIG_CGROUP_FSIO_THROTTLE) += fsio-throttle.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
 obj-$(CONFIG_CGROUP_RDMA) += rdma.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
diff --git a/kernel/cgroup/fsio-throttle.c b/kernel/cgroup/fsio-throttle.c
new file mode 100644
index 000000000000..46f3ffd4015b
--- /dev/null
+++ b/kernel/cgroup/fsio-throttle.c
@@ -0,0 +1,501 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * fsio-throttle.c - I/O cgroup controller
+ *
+ * Copyright (C) 2019 Andrea Righi <righi.andrea@xxxxxxxxx>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/jiffies.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/moduleparam.h>
+#include <linux/genhd.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/sched/signal.h>
+#include <linux/cgroup.h>
+#include <linux/fsio-throttle.h>
+
+#define KB(x) ((x) * 1024)
+#define MB(x) (KB(KB(x)))
+#define GB(x) (MB(KB(x)))
+
+static int throttle_kernel_threads __read_mostly;
+module_param(throttle_kernel_threads, int, 0644);
+MODULE_PARM_DESC(throttle_kernel_threads,
+		 "enable/disable I/O throttling for kernel threads");
+
+static int throttle_timeslice_ms __read_mostly = 250;
+module_param(throttle_timeslice_ms, int, 0644);
+MODULE_PARM_DESC(throttle_timeslice_ms,
+		 "throttling time slice (default 250ms)");
+
+static int throttle_timeframe_ms __read_mostly = 2000;
+module_param(throttle_timeframe_ms, int, 0644);
+MODULE_PARM_DESC(throttle_timeframe_ms,
+		 "maximum sleep time enforced (default 2000ms)");
+
+struct iothrottle {
+	struct cgroup_subsys_state css;
+	struct list_head list;
+	/* protect the list of iothrottle_node elements (list) */
+	struct mutex lock;
+	wait_queue_head_t wait;
+	struct timer_list timer;
+	bool timer_cancel;
+	/* protect the wait queue elements */
+	spinlock_t wait_lock;
+};
+
+struct iothrottle_limit {
+	unsigned long long usage;
+	unsigned long long bucket_size;
+	unsigned long long limit;
+	unsigned long long timestamp;
+	/* protect all of the above */
+	spinlock_t lock;
+};
+
+struct iothrottle_node {
+	struct list_head node;
+	struct rcu_head rcu;
+	struct iothrottle_limit bw;
+	dev_t dev;
+};
+
+static inline bool iothrottle_disabled(void)
+{
+	return !cgroup_subsys_enabled(fsio_cgrp_subsys);
+}
+
+static struct iothrottle *css_to_iothrottle(struct cgroup_subsys_state *css)
+{
+	return css ? container_of(css, struct iothrottle, css) : NULL;
+}
+
+struct iothrottle *task_to_iothrottle(struct task_struct *p)
+{
+	if (unlikely(!p))
+		return NULL;
+	return css_to_iothrottle(task_css(p, fsio_cgrp_id));
+}
+
+static inline unsigned long long
+iothrottle_limit_delta_t(struct iothrottle_limit *res)
+{
+	return (long long)get_jiffies_64() - (long long)res->timestamp;
+}
+
+static void iothrottle_limit_init(struct iothrottle_limit *res,
+				  unsigned long long limit,
+				  unsigned long long bucket_size)
+{
+	spin_lock_init(&res->lock);
+	res->limit = limit;
+	res->usage = 0;
+	res->bucket_size = bucket_size;
+	res->timestamp = get_jiffies_64();
+}
+
+static unsigned long long
+iothrottle_limit_sleep(struct iothrottle_limit *res, unsigned long long size)
+{
+	unsigned long long delta;
+	long long tok;
+	unsigned long flags;
+
+	spin_lock_irqsave(&res->lock, flags);
+	res->usage -= size;
+	delta = jiffies_to_msecs(iothrottle_limit_delta_t(res));
+	res->timestamp = get_jiffies_64();
+	tok = (long long)res->usage * MSEC_PER_SEC;
+	if (delta) {
+		long long max = (long long)res->bucket_size * MSEC_PER_SEC;
+
+		tok += delta * res->limit;
+		tok = min_t(long long, tok, max);
+		res->usage = (unsigned long long)div_s64(tok, MSEC_PER_SEC);
+	}
+	spin_unlock_irqrestore(&res->lock, flags);
+
+	return (tok < 0) ? msecs_to_jiffies(div_u64(-tok, res->limit)) : 0;
+}
+
+static void iothrottle_limit_reset(struct iothrottle_limit *res)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&res->lock, flags);
+	res->usage = 0;
+	spin_unlock_irqrestore(&res->lock, flags);
+}
+
+static inline int iothrottle_node_size(void)
+{
+	return sizeof(struct iothrottle_node);
+}
+
+static struct iothrottle_node *iothrottle_node_alloc(gfp_t flags)
+{
+	struct iothrottle_node *n;
+	int size = iothrottle_node_size();
+
+	if (size < PAGE_SIZE)
+		n = kmalloc(size, flags);
+	else
+		n = vmalloc(size);
+	if (n)
+		memset(n, 0, size);
+	return n;
+}
+
+static void iothrottle_node_free(struct iothrottle_node *n)
+{
+	if (iothrottle_node_size() < PAGE_SIZE)
+		kfree(n);
+	else
+		vfree(n);
+}
+
+static struct iothrottle_node *
+iothrottle_node_search(const struct iothrottle *iot, dev_t dev)
+{
+	struct iothrottle_node *n;
+
+	list_for_each_entry_rcu(n, &iot->list, node)
+		if (n->dev == dev)
+			return n;
+	return NULL;
+}
+
+static void iothrottle_node_reclaim(struct rcu_head *rp)
+{
+	struct iothrottle_node *n;
+
+	n = container_of(rp, struct iothrottle_node, rcu);
+	iothrottle_node_free(n);
+}
+
+static int iothrottle_parse_args(char *buf, size_t nbytes,
+				 dev_t *dev,
+				 unsigned long long *io_limit,
+				 unsigned long long *bucket_size)
+{
+	struct gendisk *disk;
+	unsigned int major, minor;
+	unsigned long long limit, size;
+	int part, ret = 0;
+
+	if (sscanf(buf, "%u:%u %llu %llu", &major, &minor, &limit, &size) != 4)
+		return -EINVAL;
+	disk = get_gendisk(MKDEV(major, minor), &part);
+	if (!disk)
+		return -ENODEV;
+	if (part) {
+		ret = -ENODEV;
+		goto out;
+	}
+	*dev = MKDEV(major, minor);
+	*io_limit = MB(limit);
+	*bucket_size = MB(size);
+out:
+	put_disk_and_module(disk);
+
+	return ret;
+}
+
+static ssize_t iothrottle_write(struct kernfs_open_file *of,
+				char *buffer, size_t nbytes, loff_t off)
+{
+	struct iothrottle *iot;
+	struct iothrottle_node *n, *newn = NULL;
+	unsigned long long io_limit, bucket_size;
+	dev_t dev;
+	char *buf;
+	int ret;
+
+	/*
+	 * We need to allocate a new buffer here, because
+	 * iothrottle_parse_args() can modify it and the buffer provided by
+	 * write_string is supposed to be const.
+	 */
+	buf = kmalloc(nbytes + 1, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	memcpy(buf, buffer, nbytes + 1);
+
+	ret = iothrottle_parse_args(buf, nbytes, &dev, &io_limit, &bucket_size);
+	if (ret)
+		goto out_free;
+
+	newn = iothrottle_node_alloc(GFP_KERNEL);
+	if (!newn) {
+		ret = -ENOMEM;
+		goto out_free;
+	}
+	newn->dev = dev;
+	iothrottle_limit_init(&newn->bw, io_limit, bucket_size);
+
+	iot = css_to_iothrottle(of_css(of));
+	if (unlikely(!iot)) {
+		WARN_ON_ONCE(1);
+		ret = -EINVAL;
+		goto out_free;
+	}
+	mutex_lock(&iot->lock);
+	n = iothrottle_node_search(iot, dev);
+	if (!n) {
+		/* Insert new node */
+		if (io_limit) {
+			list_add_rcu(&newn->node, &iot->list);
+			newn = NULL;
+		}
+	} else if (!io_limit) {
+		/* Delete existing node */
+		list_del_rcu(&n->node);
+	} else {
+		/* Update existing node */
+		list_replace_rcu(&n->node, &newn->node);
+		newn = NULL;
+	}
+	mutex_unlock(&iot->lock);
+	if (n)
+		call_rcu(&n->rcu, iothrottle_node_reclaim);
+	ret = nbytes;
+out_free:
+	if (newn)
+		iothrottle_node_free(newn);
+	kfree(buf);
+	return ret;
+}
+
+static void iothrottle_show_limit(struct seq_file *m,
+				  dev_t dev, struct iothrottle_limit *res)
+{
+	seq_put_decimal_ull(m, "", MAJOR(dev));
+	seq_put_decimal_ull(m, ":", MINOR(dev));
+	seq_put_decimal_ull(m, " ", res->limit);
+	seq_put_decimal_ull(m, " ", res->usage);
+	seq_put_decimal_ull(m, " ", res->bucket_size);
+	seq_put_decimal_ull(m, " ",
+			    jiffies_to_clock_t(iothrottle_limit_delta_t(res)));
+	seq_putc(m, '\n');
+}
+
+static int iothrottle_read(struct seq_file *m, void *v)
+{
+	struct iothrottle *iot = css_to_iothrottle(seq_css(m));
+	struct iothrottle_node *n;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(n, &iot->list, node)
+		iothrottle_show_limit(m, n->dev, &n->bw);
+	rcu_read_unlock();
+
+	return 0;
+}
+
+static struct cftype iothrottle_files[] = {
+	{
+		.name = "max_mbs",
+		.seq_show = iothrottle_read,
+		.write = iothrottle_write,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{ }	/* terminate */
+};
+
+static void iothrottle_wakeup(struct iothrottle *iot, bool timer_cancel)
+{
+	spin_lock_bh(&iot->wait_lock);
+	if (timer_cancel)
+		iot->timer_cancel = true;
+	wake_up_all(&iot->wait);
+	spin_unlock_bh(&iot->wait_lock);
+}
+
+static void iothrottle_timer_wakeup(struct timer_list *t)
+{
+	struct iothrottle *iot = from_timer(iot, t, timer);
+
+	iothrottle_wakeup(iot, false);
+}
+
+static struct cgroup_subsys_state *
+iothrottle_css_alloc(struct cgroup_subsys_state *parent)
+{
+	struct iothrottle *iot;
+
+	iot = kzalloc(sizeof(*iot), GFP_KERNEL);
+	if (!iot)
+		return ERR_PTR(-ENOMEM);
+	INIT_LIST_HEAD(&iot->list);
+	mutex_init(&iot->lock);
+	init_waitqueue_head(&iot->wait);
+	spin_lock_init(&iot->wait_lock);
+	iot->timer_cancel = false;
+	timer_setup(&iot->timer, iothrottle_timer_wakeup, 0);
+
+	return &iot->css;
+}
+
+static void iothrottle_css_offline(struct cgroup_subsys_state *css)
+{
+	struct iothrottle *iot = css_to_iothrottle(css);
+
+	spin_lock_bh(&iot->wait_lock);
+	iot->timer_cancel = true;
+	spin_unlock_bh(&iot->wait_lock);
+
+	iothrottle_wakeup(iot, true);
+}
+
+static void iothrottle_css_free(struct cgroup_subsys_state *css)
+{
+	struct iothrottle_node *n, *p;
+	struct iothrottle *iot = css_to_iothrottle(css);
+
+	del_timer_sync(&iot->timer);
+	/*
+	 * don't worry about locking here, at this point there's no reference
+	 * to the list.
+	 */
+	list_for_each_entry_safe(n, p, &iot->list, node)
+		iothrottle_node_free(n);
+	kfree(iot);
+}
+
+static inline bool is_kernel_thread(void)
+{
+	return !!(current->flags & (PF_KTHREAD | PF_KSWAPD));
+}
+
+static inline bool is_urgent_task(void)
+{
+	/* Never throttle tasks that are going to exit */
+	if (current->flags & PF_EXITING)
+		return true;
+	/* Throttle kernel threads only if throttle_kernel_threads is set */
+	return is_kernel_thread() && !throttle_kernel_threads;
+}
+
+static struct iothrottle *try_get_iothrottle_from_task(struct task_struct *p)
+{
+	struct iothrottle *iot = NULL;
+
+	rcu_read_lock();
+	if (!task_css_is_root(p, fsio_cgrp_id)) {
+		do {
+			iot = task_to_iothrottle(p);
+			if (unlikely(!iot))
+				break;
+		} while (!css_tryget_online(&iot->css));
+	}
+	rcu_read_unlock();
+
+	return iot;
+}
+
+static int iothrottle_evaluate_sleep(struct iothrottle *iot, dev_t dev,
+				     ssize_t bytes, int state)
+{
+	struct iothrottle_node *n;
+	unsigned long long sleep = 0;
+
+	rcu_read_lock();
+	n = iothrottle_node_search(iot, dev);
+	if (n) {
+		sleep = iothrottle_limit_sleep(&n->bw, bytes);
+		/*
+		 * state == 0 is used to do only I/O accounting without
+		 * enforcing sleeps.
+		 */
+		if (!state || sleep < msecs_to_jiffies(throttle_timeslice_ms))
+			sleep = 0;
+		if (sleep)
+			iothrottle_limit_reset(&n->bw);
+	}
+	rcu_read_unlock();
+
+	return sleep;
+}
+
+static noinline void iothrottle_force_sleep(struct iothrottle *iot,
+					    unsigned long long sleep,
+					    int state)
+{
+	unsigned long expire, now;
+
+	/*
+	 * Allow small I/O bursts, by waking up the throttled task after a
+	 * maximum sleep of throttle_timeframe_ms milliseconds.
+	 */
+	if (sleep > msecs_to_jiffies(throttle_timeframe_ms))
+		sleep = msecs_to_jiffies(throttle_timeframe_ms);
+
+	now = READ_ONCE(jiffies);
+	expire = now + sleep;
+
+	/*
+	 * Round up the time to sleep to a multiple of the sleep timeslice.
+	 *
+	 * In this way we can strongly reduce timer softirqs and
+	 * context switches in the system even when there are a lot of
+	 * different cgroups.
+	 */
+	expire = roundup(expire, msecs_to_jiffies(throttle_timeslice_ms));
+
+	/* Force sleep */
+	do {
+		DEFINE_WAIT(wait);
+
+		spin_lock_bh(&iot->wait_lock);
+		if (unlikely(iot->timer_cancel)) {
+			spin_unlock_bh(&iot->wait_lock);
+			break;
+		}
+		mod_timer(&iot->timer, expire);
+		spin_unlock_bh(&iot->wait_lock);
+
+		/*
+		 * Do not enforce interruptible sleep if there are pending
+		 * signals, otherwise we'd end up in a busy loop.
+		 */
+		if (signal_pending(current))
+			state = TASK_KILLABLE;
+
+		/* Send to sleep */
+		prepare_to_wait(&iot->wait, &wait, state);
+		schedule();
+		finish_wait(&iot->wait, &wait);
+	} while (!fatal_signal_pending(current) &&
+		 time_is_after_jiffies(expire));
+}
+
+int fsio_throttle(dev_t dev, ssize_t bytes, int state)
+{
+	struct iothrottle *iot;
+	unsigned long long sleep = 0;
+
+	if (iothrottle_disabled() || is_urgent_task())
+		return 0;
+	if (!dev)
+		return 0;
+	iot = try_get_iothrottle_from_task(current);
+	if (!iot)
+		return 0;
+	sleep = iothrottle_evaluate_sleep(iot, dev, bytes, state);
+	if (unlikely(sleep))
+		iothrottle_force_sleep(iot, sleep, state);
+	css_put(&iot->css);
+
+	return sleep;
+}
+
+struct cgroup_subsys fsio_cgrp_subsys = {
+	.css_alloc = iothrottle_css_alloc,
+	.css_free = iothrottle_css_free,
+	.css_offline = iothrottle_css_offline,
+	.dfl_cftypes = iothrottle_files,
+};
-- 
2.17.1
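
For context, a minimal sketch of how a filesystem read path could consume
the interface introduced by this patch. It is illustrative only and not part
of the patch: the function name myfs_file_read_iter() and the fallback to
generic_file_read_iter() are assumptions made for the example, and any real
call sites are outside the scope of this change.

/*
 * Illustrative sketch only (not part of this patch): charge a buffered
 * read to the fsio controller before performing it. myfs_file_read_iter()
 * and the use of generic_file_read_iter() are assumptions for the example.
 */
#include <linux/fs.h>
#include <linux/uio.h>
#include <linux/sched.h>
#include <linux/fsio-throttle.h>

static ssize_t myfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	dev_t dev = bdev_to_dev(as_to_bdev(mapping));

	/* account the I/O and sleep if the cgroup exceeded its limit */
	fsio_throttle(dev, iov_iter_count(to), TASK_INTERRUPTIBLE);

	return generic_file_read_iter(iocb, to);
}

With a limit configured through the fsio.max_mbs file (e.g. "8:0 10 20" for
a 10 MB/s limit and a 20 MB bucket on device 8:0), a task reading through
such a path would be put to sleep inside fsio_throttle() once its token
bucket is exhausted.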