Re: [PATCH 1/2] cgroup: add a new group controller for cephfs

Ilya Dryomov <idryomov@xxxxxxxxx> · Fri, 24 May 2019 12:31:34 +0200

On Tue, Apr 30, 2019 at 2:07 PM <xxhdx1985126@xxxxxxxxx> wrote:
>
> From: Xuehan Xu <xuxuehan@xxxxxx>
>
> this controller is supposed to limit the metadata
> ops or data ops issued to the underlying cluster.
>
> Signed-off-by: Xuehan Xu <xuxuehan@xxxxxx>
> ---
>  include/linux/cgroup_cephfs.h |  57 +++++
>  include/linux/cgroup_subsys.h |   4 +
>  init/Kconfig                  |   5 +
>  kernel/cgroup/Makefile        |   1 +
>  kernel/cgroup/cephfs.c        | 398 ++++++++++++++++++++++++++++++++++
>  5 files changed, 465 insertions(+)
>  create mode 100644 include/linux/cgroup_cephfs.h
>  create mode 100644 kernel/cgroup/cephfs.c
>
> diff --git a/include/linux/cgroup_cephfs.h b/include/linux/cgroup_cephfs.h
> new file mode 100644
> index 000000000000..91809862b8f8
> --- /dev/null
> +++ b/include/linux/cgroup_cephfs.h
> @@ -0,0 +1,57 @@
> +#ifndef _CEPHFS_CGROUP_H
> +#define _CEPHFS_CGROUP_H
> +
> +#include <linux/cgroup.h>
> +
> +#define META_OPS_IOPS_IDX 0
> +#define DATA_OPS_IOPS_IDX 0
> +#define DATA_OPS_BAND_IDX 1
> +#define META_OPS_TB_NUM 1
> +#define DATA_OPS_TB_NUM 2
> +
> +/*
> + * token bucket throttle
> + */
> +struct token_bucket {
> +    u64 remain;
> +    u64 max;
> +    u64 target_throughput;
> +};
> +
> +struct token_bucket_throttle {
> +    struct token_bucket* tb;
> +    u64 tick_interval;
> +    int tb_num;
> +    struct list_head reqs_blocked;
> +    struct mutex bucket_lock;
> +    struct delayed_work tick_work;
> +    unsigned long tbt_timeout;
> +};
> +
> +struct queue_item {
> +    struct list_head token_bucket_throttle_item;
> +    u64* tokens_requested;
> +    int tb_item_num;
> +    struct completion throttled;
> +    unsigned long tbt_timeout;
> +};
> +
> +struct cephfscg {
> +    struct cgroup_subsys_state  css;
> +    spinlock_t          lock;
> +
> +    struct token_bucket_throttle meta_ops_throttle;
> +    struct token_bucket_throttle data_ops_throttle;
> +};
> +
> +extern void schedule_token_bucket_throttle_tick(struct token_bucket_throttle* ptbt, u64 tick_interval);
> +
> +extern void token_bucket_throttle_tick(struct work_struct* work);
> +
> +extern int get_token_bucket_throttle(struct token_bucket_throttle* ptbt, struct queue_item* req);
> +
> +extern int queue_item_init(struct queue_item* qitem, struct token_bucket_throttle* ptbt, int tb_item_num);
> +
> +extern int token_bucket_throttle_init(struct token_bucket_throttle* ptbt, int token_bucket_num);
> +
> +#endif /*_CEPHFS_CGROUP_H*/
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index acb77dcff3b4..577a276570a5 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -61,6 +61,10 @@ SUBSYS(pids)
>  SUBSYS(rdma)
>  #endif
>
> +#if IS_ENABLED(CONFIG_CGROUP_CEPH_FS)
> +SUBSYS(cephfs)
> +#endif
> +
>  /*
>   * The following subsystems are not supported on the default hierarchy.
>   */
> diff --git a/init/Kconfig b/init/Kconfig
> index 4592bf7997c0..e22f3aea9e23 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -867,6 +867,11 @@ config CGROUP_RDMA
>           Attaching processes with active RDMA resources to the cgroup
>           hierarchy is allowed even if can cross the hierarchy's limit.
>
> +config CGROUP_CEPH_FS
> +    bool "cephfs controller"
> +    help
> +        cephfs cgroup controller
> +
>  config CGROUP_FREEZER
>         bool "Freezer controller"
>         help
> diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
> index bfcdae896122..aaf836181f1a 100644
> --- a/kernel/cgroup/Makefile
> +++ b/kernel/cgroup/Makefile
> @@ -6,3 +6,4 @@ obj-$(CONFIG_CGROUP_PIDS) += pids.o
>  obj-$(CONFIG_CGROUP_RDMA) += rdma.o
>  obj-$(CONFIG_CPUSETS) += cpuset.o
>  obj-$(CONFIG_CGROUP_DEBUG) += debug.o
> +obj-$(CONFIG_CGROUP_CEPH_FS) += cephfs.o
> diff --git a/kernel/cgroup/cephfs.c b/kernel/cgroup/cephfs.c
> new file mode 100644
> index 000000000000..65b9e9618a5d
> --- /dev/null
> +++ b/kernel/cgroup/cephfs.c
> @@ -0,0 +1,398 @@
> +#include <linux/cgroup_cephfs.h>
> +#include <linux/slab.h>
> +
> +struct cephfscg cephfscg_root;
> +
> +static void put_token(struct token_bucket_throttle* ptbt, u64 tick_interval)
> +{
> +    struct token_bucket* ptb = NULL;
> +    u64 tokens_to_put = 0;
> +    int i = 0;
> +
> +    for (i = 0; i < ptbt->tb_num; i++) {
> +        ptb = &ptbt->tb[i];
> +
> +        if (!ptb->max)
> +            continue;
> +
> +        tokens_to_put = ptb->target_throughput * tick_interval / HZ;
> +
> +        if (ptb->remain + tokens_to_put >= ptb->max)
> +            ptb->remain = ptb->max;
> +        else
> +            ptb->remain += tokens_to_put;
> +        pr_debug("%s: put_token: token bucket remain: %lld\n", __func__, ptb->remain);
> +    }
> +}
> +
> +static bool should_wait(struct token_bucket_throttle* ptbt, struct queue_item* qitem)
> +{
> +    struct token_bucket* ptb = NULL;
> +    int i = 0;
> +
> +    BUG_ON(ptbt->tb_num != qitem->tb_item_num);
> +    for (i = 0; i < ptbt->tb_num; i++) {
> +        ptb = &ptbt->tb[i];
> +
> +        if (!ptb->max)
> +            continue;
> +
> +        if (ptb->remain < qitem->tokens_requested[i])
> +            return true;
> +    }
> +    return false;
> +}
> +
> +static void get_token(struct token_bucket_throttle* ptbt, struct queue_item* qitem)
> +{
> +    struct token_bucket* ptb = NULL;
> +    int i = 0;
> +    BUG_ON(should_wait(ptbt, qitem));
> +
> +    for (i = 0; i < ptbt->tb_num; i++) {
> +        ptb = &ptbt->tb[i];
> +        if (!ptb->max)
> +            continue;
> +        ptb->remain -= qitem->tokens_requested[i];
> +    }
> +}
> +
> +void schedule_token_bucket_throttle_tick(struct token_bucket_throttle* ptbt, u64 tick_interval)
> +{
> +    if (tick_interval)
> +        schedule_delayed_work(&ptbt->tick_work, tick_interval);
> +}
> +EXPORT_SYMBOL(schedule_token_bucket_throttle_tick);
> +
> +void token_bucket_throttle_tick(struct work_struct* work)
> +{
> +    struct token_bucket_throttle* ptbt =
> +        container_of(work, struct token_bucket_throttle, tick_work.work);
> +    struct queue_item* req = NULL, *tmp = NULL;
> +    LIST_HEAD(reqs_to_go);
> +    u64 tick_interval = ptbt->tick_interval;
> +
> +    mutex_lock(&ptbt->bucket_lock);
> +    put_token(ptbt, tick_interval);
> +    if (!tick_interval)
> +        pr_debug("%s: tick_interval set to 0, turning off the throttle, item: %p\n", __func__, req);
> +
> +    list_for_each_entry_safe(req, tmp, &ptbt->reqs_blocked, token_bucket_throttle_item) {
> +        pr_debug("%s: waiting item: %p\n", __func__, req);
> +        if (tick_interval) {
> +            if (should_wait(ptbt, req))
> +                break;
> +            get_token(ptbt, req);
> +        }
> +        list_del(&req->token_bucket_throttle_item);
> +        list_add_tail(&req->token_bucket_throttle_item, &reqs_to_go);
> +        pr_debug("%s: tokens got for req: %p\n", __func__, req);
> +    }
> +    mutex_unlock(&ptbt->bucket_lock);
> +
> +    list_for_each_entry_safe(req, tmp, &reqs_to_go, token_bucket_throttle_item) {
> +        pr_debug("%s: notifying req: %p, list head: %p\n", __func__, req, &reqs_to_go);
> +        complete_all(&req->throttled);
> +        list_del(&req->token_bucket_throttle_item);
> +    }
> +
> +    if (tick_interval)
> +        schedule_token_bucket_throttle_tick(ptbt, tick_interval);
> +}
> +EXPORT_SYMBOL(token_bucket_throttle_tick);
> +
> +int get_token_bucket_throttle(struct token_bucket_throttle* ptbt, struct queue_item* req)
> +{
> +    int ret = 0;
> +    long timeleft = 0;
> +
> +    mutex_lock(&ptbt->bucket_lock);
> +    if (should_wait(ptbt, req)) {
> +        pr_debug("%s: wait for tokens, req: %p\n", __func__, req);
> +        list_add_tail(&req->token_bucket_throttle_item, &ptbt->reqs_blocked);
> +        mutex_unlock(&ptbt->bucket_lock);
> +        timeleft = wait_for_completion_killable_timeout(&req->throttled, req->tbt_timeout ?: MAX_SCHEDULE_TIMEOUT);
> +        if (timeleft > 0)
> +            ret = 0;
> +        else if (!timeleft)
> +            ret = -EIO; /* timed out */
> +        else {
> +            /* killed */
> +            pr_debug("%s: killed, req: %p\n", __func__, req);
> +            mutex_lock(&ptbt->bucket_lock);
> +            list_del(&req->token_bucket_throttle_item);
> +            mutex_unlock(&ptbt->bucket_lock);
> +            ret = timeleft;
> +        }
> +    } else {
> +        pr_debug("%s: no need to wait for tokens, going ahead, req: %p\n", __func__, req);
> +        get_token(ptbt, req);
> +        mutex_unlock(&ptbt->bucket_lock);
> +    }
> +    return ret;
> +}
> +EXPORT_SYMBOL(get_token_bucket_throttle);
> +
> +int queue_item_init(struct queue_item* qitem, struct token_bucket_throttle* ptbt, int tb_item_num)
> +{
> +    qitem->tokens_requested = kzalloc(sizeof(*qitem->tokens_requested) * tb_item_num, GFP_KERNEL);
> +    if (!qitem->tokens_requested)
> +        return -ENOMEM;
> +
> +    qitem->tb_item_num = tb_item_num;
> +    INIT_LIST_HEAD(&qitem->token_bucket_throttle_item);
> +    init_completion(&qitem->throttled);
> +    qitem->tbt_timeout = ptbt->tbt_timeout;
> +
> +    return 0;
> +}
> +EXPORT_SYMBOL(queue_item_init);
> +
> +int token_bucket_throttle_init(struct token_bucket_throttle* ptbt,
> +        int token_bucket_num)
> +{
> +    int i = 0;
> +
> +    INIT_LIST_HEAD(&ptbt->reqs_blocked);
> +    mutex_init(&ptbt->bucket_lock);
> +    ptbt->tb_num = token_bucket_num;
> +    ptbt->tb = kzalloc(sizeof(*ptbt->tb) * ptbt->tb_num, GFP_KERNEL);
> +    if (!ptbt->tb) {
> +        return -ENOMEM;
> +    }
> +
> +    for (i = 0; i < ptbt->tb_num; i++) {
> +        ptbt->tb[i].target_throughput = 0;
> +        ptbt->tb[i].max = 0;
> +    }
> +    ptbt->tick_interval = 0;
> +    ptbt->tbt_timeout = 0;
> +    INIT_DELAYED_WORK(&ptbt->tick_work, token_bucket_throttle_tick);
> +
> +    return 0;
> +}
> +EXPORT_SYMBOL(token_bucket_throttle_init);
> +
> +static int set_throttle_params(struct token_bucket_throttle* ptbt, char* param_list)
> +{
> +    char* options = strstrip(param_list);
> +    char* val = NULL;
> +    int res = 0;
> +    unsigned long interval = 0, timeout = 0, last_interval = ptbt->tick_interval;
> +
> +    val = strsep(&options, ",");
> +    if (!val)
> +        return -EINVAL;
> +
> +    res = kstrtol(val, 0, &interval);
> +    if (res)
> +        return res;
> +
> +    val = strsep(&options, ",");
> +    if (!val)
> +        return -EINVAL;
> +
> +    res = kstrtol(val, 0, &timeout);
> +    if (res)
> +        return res;
> +
> +    if (last_interval && !interval) {
> +        int i = 0;
> +
> +        for (i = 0; i<ptbt->tb_num; i++) {
> +            if (ptbt->tb[i].max) {
> +                /* all token bucket must be unset
> +                 * before turning off the throttle */
> +                return -EINVAL;
> +            }
> +        }
> +    }
> +    ptbt->tick_interval = msecs_to_jiffies(interval);
> +    ptbt->tbt_timeout = timeout;
> +
> +    if (ptbt->tick_interval && !last_interval) {
> +        schedule_token_bucket_throttle_tick(ptbt, ptbt->tick_interval);
> +    }
> +
> +    return 0;
> +}
> +
> +static int set_tb_params(struct token_bucket_throttle* ptbt, int tb_idx, char* param_list)
> +{
> +    char* options = strstrip(param_list);
> +    char* val = NULL;
> +    int res = 0;
> +    unsigned long throughput = 0, burst = 0;
> +
> +    val = strsep(&options, ",");
> +    if (!val)
> +        return -EINVAL;
> +
> +    res = kstrtol(val, 0, &throughput);
> +    if (res)
> +        return res;
> +
> +    val = strsep(&options, ",");
> +    if (!val)
> +        return -EINVAL;
> +
> +    res = kstrtol(val, 0, &burst);
> +    if (res)
> +        return res;
> +
> +    if (!(throughput && burst) && (throughput || burst)) {
> +        /* either both or none of throughput and burst are set*/
> +        return -EINVAL;
> +    }
> +    if (throughput && !ptbt->tick_interval) {
> +        /* all token bucket must be unset
> +         * before turning off the throttle */
> +        return -EINVAL;
> +    }
> +    ptbt->tb[tb_idx].target_throughput = throughput;
> +    ptbt->tb[tb_idx].max = burst;
> +
> +    return 0;
> +}
> +
> +static ssize_t cephfscg_set_throttle_params(struct kernfs_open_file *of,
> +        char *buf, size_t nbytes, loff_t off)
> +{
> +    const char *throttle_name;
> +    int ret = 0;
> +    struct cephfscg* cephfscg_p =
> +        container_of(seq_css(of->seq_file), struct cephfscg, css);
> +
> +    throttle_name = of->kn->name;
> +    if (!strcmp(throttle_name, "cephfs.meta_ops")) {
> +        ret = set_throttle_params(&cephfscg_p->meta_ops_throttle, buf);
> +    } else if (!strcmp(throttle_name, "cephfs.data_ops")) {
> +        ret = set_throttle_params(&cephfscg_p->data_ops_throttle, buf);
> +    } else if (!strcmp(throttle_name, "cephfs.meta_ops.iops")) {
> +        ret = set_tb_params(&cephfscg_p->meta_ops_throttle, META_OPS_IOPS_IDX, buf);
> +    } else if (!strcmp(throttle_name, "cephfs.data_ops.iops")) {
> +        ret = set_tb_params(&cephfscg_p->data_ops_throttle, DATA_OPS_IOPS_IDX, buf);
> +    } else if (!strcmp(throttle_name, "cephfs.data_ops.band")) {
> +        ret = set_tb_params(&cephfscg_p->data_ops_throttle, DATA_OPS_BAND_IDX, buf);
> +    }
> +
> +    return ret ?: nbytes;
> +}
> +
> +static int cephfscg_throttle_params_read(struct seq_file *sf, void *v)
> +{
> +    const char *throttle_name;
> +    struct cephfscg* cephfscg_p =
> +        container_of(seq_css(sf), struct cephfscg, css);
> +
> +    throttle_name = ((struct kernfs_open_file*)sf->private)->kn->name;
> +    if (!strcmp(throttle_name, "cephfs.meta_ops")) {
> +        seq_printf(sf, "%llu,%lu\n",
> +                cephfscg_p->meta_ops_throttle.tick_interval,
> +                cephfscg_p->meta_ops_throttle.tbt_timeout);
> +    } else if (!strcmp(throttle_name, "cephfs.data_ops")) {
> +        seq_printf(sf, "%llu,%lu\n",
> +                cephfscg_p->data_ops_throttle.tick_interval,
> +                cephfscg_p->data_ops_throttle.tbt_timeout);
> +    } else if (!strcmp(throttle_name, "cephfs.data_ops.iops")) {
> +        seq_printf(sf, "%llu,%llu\n",
> +                cephfscg_p->data_ops_throttle.tb[DATA_OPS_IOPS_IDX].target_throughput,
> +                cephfscg_p->data_ops_throttle.tb[DATA_OPS_IOPS_IDX].max);
> +    } else if (!strcmp(throttle_name, "cephfs.data_ops.band")) {
> +        seq_printf(sf, "%llu,%llu\n",
> +                cephfscg_p->data_ops_throttle.tb[DATA_OPS_BAND_IDX].target_throughput,
> +                cephfscg_p->data_ops_throttle.tb[DATA_OPS_BAND_IDX].max);
> +    } else if (!strcmp(throttle_name, "cephfs.meta_ops.iops")) {
> +        seq_printf(sf, "%llu,%llu\n",
> +                cephfscg_p->meta_ops_throttle.tb[META_OPS_IOPS_IDX].target_throughput,
> +                cephfscg_p->meta_ops_throttle.tb[META_OPS_IOPS_IDX].max);
> +    }
> +
> +    return 0;
> +}
> +
> +static struct cftype cephfscg_files[] = {
> +    {
> +        .name = "meta_ops.iops",
> +        .write = cephfscg_set_throttle_params,
> +        .seq_show = cephfscg_throttle_params_read,
> +    },
> +    {
> +        .name = "meta_ops",
> +        .write = cephfscg_set_throttle_params,
> +        .seq_show = cephfscg_throttle_params_read,
> +    },
> +    {
> +        .name = "data_ops.iops",
> +        .write = cephfscg_set_throttle_params,
> +        .seq_show = cephfscg_throttle_params_read,
> +    },
> +    {
> +        .name = "data_ops.band",
> +        .write = cephfscg_set_throttle_params,
> +        .seq_show = cephfscg_throttle_params_read,
> +    },
> +    {
> +        .name = "data_ops",
> +        .write = cephfscg_set_throttle_params,
> +        .seq_show = cephfscg_throttle_params_read,
> +    },
> +    { }
> +};
> +
> +static struct cgroup_subsys_state *
> +cephfscg_css_alloc(struct cgroup_subsys_state *parent_css) {
> +
> +    struct cephfscg* cephfscg_p = NULL;
> +    struct cgroup_subsys_state *ret = NULL;
> +    int r = 0;
> +
> +    if (!parent_css) {
> +        cephfscg_p = &cephfscg_root;
> +    } else {
> +        cephfscg_p = kzalloc(sizeof(*cephfscg_p), GFP_KERNEL);
> +        if (!cephfscg_p) {
> +            ret = ERR_PTR(-ENOMEM);
> +            goto err;
> +        }
> +    }
> +
> +    spin_lock_init(&cephfscg_p->lock);
> +
> +    r = token_bucket_throttle_init(&cephfscg_p->meta_ops_throttle, 1);
> +    if (r) {
> +        ret = ERR_PTR(r);
> +        goto err;
> +    }
> +
> +    r = token_bucket_throttle_init(&cephfscg_p->data_ops_throttle, 2);
> +    if (r) {
> +        ret = ERR_PTR(r);
> +        goto err;
> +    }
> +
> +    return &cephfscg_p->css;
> +err:
> +    return ret;
> +}
> +
> +static void cephfscg_css_free(struct cgroup_subsys_state *css) {
> +    struct cephfscg* cephfscg_p =
> +        css ? container_of(css, struct cephfscg, css) : NULL;
> +
> +    cancel_delayed_work_sync(&cephfscg_p->meta_ops_throttle.tick_work);
> +    cancel_delayed_work_sync(&cephfscg_p->data_ops_throttle.tick_work);
> +
> +    kfree(cephfscg_p->meta_ops_throttle.tb);
> +    kfree(cephfscg_p->data_ops_throttle.tb);
> +
> +    kfree(cephfscg_p);
> +}
> +
> +struct cgroup_subsys cephfs_cgrp_subsys = {
> +    .css_alloc = cephfscg_css_alloc,
> +    .css_free = cephfscg_css_free,
> +    .dfl_cftypes = cephfscg_files,
> +    .legacy_cftypes = cephfscg_files,
> +};
> +EXPORT_SYMBOL_GPL(cephfs_cgrp_subsys);

Hi Xuehan,

While I understand the desire to use the cgroup interface to allow for
easy adjustment and process granularity, I think this is unlikely to be
accepted in the form of a new controller.  Each controller is supposed
to distribute a specific resource and meta iops, data iops and data
band(width?) mostly fall under the realm of the existing I/O
controller.  Have you run this by the cgroup folks?

Regardless, take a look at Documentation/process/coding-style.rst for
rules on indentation, line length, etc.  Also, the data throttle should
apply to rbd too, so I would change the name to "ceph".

Thanks,

                Ilya