Add cgroup to associate tasks with L3 networking domains. AF_INET{6} sockets opened by tasks associated with an l3mdev cgroup are bound to the associated master device when the socket is created. This allows a user to run a command (and its children) within an L3 networking context. The master-device for an l3mdev cgroup must be an L3 master device (e.g., VRF), and it must be set before attaching tasks to the cgroup. Once set, the master-device cannot change. Nested l3mdev cgroups are not supported. The root (aka default) l3mdev cgroup cannot be bound to a master device. Example: ip link add vrf-red type vrf table vrf-red ip link set dev vrf-red up ip link set dev eth1 master vrf-red cgcreate -g l3mdev:vrf-red cgset -r l3mdev.master-device=vrf-red vrf-red cgexec -g l3mdev:vrf-red bash At this point the current shell and its child processes are attached to the vrf-red L3 domain. Any AF_INET and AF_INET6 sockets opened by the tasks are bound to the vrf-red device. TO-DO: - how to auto-create the cgroup when a VRF device is created and auto-delete it when the VRF device is destroyed Signed-off-by: David Ahern <dsa@xxxxxxxxxxxxxxxxxxx> --- include/linux/cgroup_subsys.h | 3 + include/net/l3mdev_cgroup.h | 27 ++++++ net/core/sock.c | 2 + net/l3mdev/Kconfig | 12 +++ net/l3mdev/Makefile | 1 + net/l3mdev/l3mdev_cgroup.c | 195 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 240 insertions(+) create mode 100644 include/net/l3mdev_cgroup.h create mode 100644 net/l3mdev/l3mdev_cgroup.c diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 1a96fdaa33d5..507df40f11de 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -58,6 +58,9 @@ SUBSYS(net_prio) SUBSYS(hugetlb) #endif +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) +SUBSYS(l3mdev) +#endif /* * Subsystems that implement the can_fork() family of callbacks. 
*/ diff --git a/include/net/l3mdev_cgroup.h b/include/net/l3mdev_cgroup.h new file mode 100644 index 000000000000..c20fbb0a7f46 --- /dev/null +++ b/include/net/l3mdev_cgroup.h @@ -0,0 +1,27 @@ +/* + * l3mdev_cgroup.h Control Group for L3 Master Device + * + * Copyright (c) 2015 Cumulus Networks. All rights reserved. + * Copyright (c) 2015 David Ahern <dsa@xxxxxxxxxxxxxxxxxxx> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef _L3MDEV_CGROUP_H +#define _L3MDEV_CGROUP_H + +#if IS_ENABLED(CONFIG_CGROUP_L3MDEV) + +void sock_update_l3mdev(struct sock *sk); + +#else /* !CONFIG_CGROUP_L3MDEV */ + +static inline void sock_update_l3mdev(struct sock *sk) +{ +} + +#endif /* CONFIG_CGROUP_L3MDEV */ +#endif /* _L3MDEV_CGROUP_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 565bab7baca9..19ce06674dd9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -131,6 +131,7 @@ #include <linux/ipsec.h> #include <net/cls_cgroup.h> #include <net/netprio_cgroup.h> +#include <net/l3mdev_cgroup.h> #include <linux/sock_diag.h> #include <linux/filter.h> @@ -1424,6 +1425,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_update_classid(&sk->sk_cgrp_data); sock_update_netprioidx(&sk->sk_cgrp_data); + sock_update_l3mdev(sk); } return sk; diff --git a/net/l3mdev/Kconfig b/net/l3mdev/Kconfig index 5d47325037bc..3142d810e222 100644 --- a/net/l3mdev/Kconfig +++ b/net/l3mdev/Kconfig @@ -8,3 +8,15 @@ config NET_L3_MASTER_DEV ---help--- This module provides glue between core networking code and device drivers to support L3 master devices like VRF. + +config CGROUP_L3MDEV + bool "L3 Master Device cgroup" + depends on CGROUPS + depends on NET_L3_MASTER_DEV + ---help--- + Cgroup subsystem for assigning processes to an L3 domain. 
+ When a process is assigned to an l3mdev domain all AF_INET and + AF_INET6 sockets opened by the process are bound to the L3 master + device. + + diff --git a/net/l3mdev/Makefile b/net/l3mdev/Makefile index 84a53a6f609a..ae74ebad8db7 100644 --- a/net/l3mdev/Makefile +++ b/net/l3mdev/Makefile @@ -3,3 +3,4 @@ # obj-$(CONFIG_NET_L3_MASTER_DEV) += l3mdev.o +obj-$(CONFIG_CGROUP_L3MDEV) += l3mdev_cgroup.o diff --git a/net/l3mdev/l3mdev_cgroup.c b/net/l3mdev/l3mdev_cgroup.c new file mode 100644 index 000000000000..0326c06bfe02 --- /dev/null +++ b/net/l3mdev/l3mdev_cgroup.c @@ -0,0 +1,195 @@ +/* + * net/l3mdev/l3mdev_cgroup.c Control Group for L3 Master Devices + * + * Copyright (c) 2015 Cumulus Networks. All rights reserved. + * Copyright (c) 2015 David Ahern <dsa@xxxxxxxxxxxxxxxxxxx> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/cgroup.h> +#include <net/sock.h> +#include <net/l3mdev_cgroup.h> + +struct l3mdev_cgroup { + struct cgroup_subsys_state css; + struct net *net; + int dev_idx; +}; + +static inline struct l3mdev_cgroup *css_l3mdev(struct cgroup_subsys_state *css) +{ + return css ? 
container_of(css, struct l3mdev_cgroup, css) : NULL; +} + +static void l3mdev_set_bound_dev(struct sock *sk) +{ + struct task_struct *tsk = current; + struct l3mdev_cgroup *l3mdev_cgrp; + + rcu_read_lock(); + + l3mdev_cgrp = css_l3mdev(task_css(tsk, l3mdev_cgrp_id)); + if (l3mdev_cgrp && l3mdev_cgrp->dev_idx) + sk->sk_bound_dev_if = l3mdev_cgrp->dev_idx; + + rcu_read_unlock(); +} + +void sock_update_l3mdev(struct sock *sk) +{ + switch (sk->sk_family) { + case AF_INET: + case AF_INET6: + l3mdev_set_bound_dev(sk); + break; + } +} + +static bool is_root_cgroup(struct cgroup_subsys_state *css) +{ + return !css || !css->parent; +} + +static struct cgroup_subsys_state * +l3mdev_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct l3mdev_cgroup *l3mdev_cgrp; + + /* nested l3mdev domains are not supportd */ + if (!is_root_cgroup(parent_css)) + return ERR_PTR(-EINVAL); + + l3mdev_cgrp = kzalloc(sizeof(*l3mdev_cgrp), GFP_KERNEL); + if (!l3mdev_cgrp) + return ERR_PTR(-ENOMEM); + + return &l3mdev_cgrp->css; +} + +static int l3mdev_css_online(struct cgroup_subsys_state *css) +{ + return 0; +} + +static void l3mdev_css_free(struct cgroup_subsys_state *css) +{ + kfree(css_l3mdev(css)); +} + +static int l3mdev_read(struct seq_file *sf, void *v) +{ + struct cgroup_subsys_state *css = seq_css(sf); + struct l3mdev_cgroup *l3mdev_cgrp = css_l3mdev(css); + + if (!l3mdev_cgrp) + return -EINVAL; + + if (l3mdev_cgrp->net) { + struct net_device *dev; + + dev = dev_get_by_index(l3mdev_cgrp->net, l3mdev_cgrp->dev_idx); + + seq_printf(sf, "net[%u]: device index %d ==> %s\n", + l3mdev_cgrp->net->ns.inum, l3mdev_cgrp->dev_idx, + dev ? 
dev->name : "<none>"); + + if (dev) + dev_put(dev); + } + return 0; +} + +static ssize_t l3mdev_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup_subsys_state *css = of_css(of); + struct l3mdev_cgroup *l3mdev_cgrp = css_l3mdev(css); + struct net *net = current->nsproxy->net_ns; + struct net_device *dev; + char name[IFNAMSIZ]; + int rc = -EINVAL; + + /* once master device is set can not undo. Must delete + * cgroup and reset + */ + if (l3mdev_cgrp->dev_idx) + goto out; + + /* root cgroup does not bind to an L3 domain */ + if (is_root_cgroup(css)) + goto out; + + if (sscanf(buf, "%" __stringify(IFNAMSIZ) "s", name) != 1) + goto out; + + dev = dev_get_by_name(net, name); + if (!dev) { + rc = -ENODEV; + goto out; + } + + if (netif_is_l3_master(dev)) { + l3mdev_cgrp->net = net; + l3mdev_cgrp->dev_idx = dev->ifindex; + rc = 0; + } + + dev_put(dev); +out: + return rc ? : nbytes; +} + +/* make master device is set for non-root cgroups before tasks can be added */ +static int l3mdev_can_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *dst_css; + struct task_struct *tsk; + int rc = 0; + + cgroup_taskset_for_each(tsk, dst_css, tset) { + struct l3mdev_cgroup *l3mdev_cgrp; + + l3mdev_cgrp = css_l3mdev(dst_css); + if (!is_root_cgroup(dst_css) && !l3mdev_cgrp->dev_idx) { + rc = -ENODEV; + break; + } + } + + return rc; +} + +static struct cftype ss_files[] = { + { + .name = "master-device", + .seq_show = l3mdev_read, + .write = l3mdev_write, + }, + { } /* terminate */ +}; + +struct cgroup_subsys l3mdev_cgrp_subsys = { + .css_alloc = l3mdev_css_alloc, + .css_online = l3mdev_css_online, + .css_free = l3mdev_css_free, + .can_attach = l3mdev_can_attach, + .legacy_cftypes = ss_files, +}; + +static int __init init_cgroup_l3mdev(void) +{ + return 0; +} + +subsys_initcall(init_cgroup_l3mdev); +MODULE_AUTHOR("David Ahern"); +MODULE_DESCRIPTION("Control Group for L3 Networking Domains"); +MODULE_LICENSE("GPL"); -- 1.9.1 -- To 
unsubscribe from this list: send the line "unsubscribe cgroups" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html