Hi Dan, I finally got to look at it - see comments inline: Dan Smith wrote: > When checkpointing a task tree with network namespaces, we hook into > do_checkpoint_ns() along with the others. Any devices in a given namespace > are checkpointed (including their peer, in the case of veth) sequentially. > Each network device stores a list of protocol addresses, as well as other > information, such as hardware address. > > This patch supports veth pairs, as well as the loopback adapter. The > loopback support is there to make sure that any additional addresses and > state (such as up/down) is copied to the loopback adapter that we are > given in the new network namespace. > > On restart, we instantiate new network namespaces and veth pairs as > necessary. Any device we encounter that isn't in a network namespace > that was checkpointed as part of a task is left in the namespace of the > restarting process. This will be the case for a veth half that exists > in the init netns to provide network access to a container. I think that this documentation (and other pieces from q&a on your patches, e.g. your reply to Serge on 2/22) deserve an honorable mention either in the source files, or in Documentation/checkpoint/. As it turns out, the network related c/r logic - sockets, netns and netdev - all of these are nontrivial and we need to explain them for reviewers, future coders and ... ourselves :) [...] > > Signed-off-by: Dan Smith <danms@xxxxxxxxxx> > Cc: netdev@xxxxxxxxxxxxxxx > --- > checkpoint/checkpoint.c | 6 +- > checkpoint/objhash.c | 48 +++ > include/linux/checkpoint.h | 23 ++ > include/linux/checkpoint_hdr.h | 58 +++ > include/linux/checkpoint_types.h | 1 + > kernel/nsproxy.c | 20 +- > net/Kconfig | 4 + > net/Makefile | 1 + > net/checkpoint_dev.c | 815 ++++++++++++++++++++++++++++++++++++++ > 9 files changed, 970 insertions(+), 6 deletions(-) > create mode 100644 net/checkpoint_dev.c > > diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c > index b3c1c4f..466f594 100644 > --- a/checkpoint/checkpoint.c > +++ b/checkpoint/checkpoint.c > @@ -184,6 +184,7 @@ static int checkpoint_container(struct ckpt_ctx *ctx) > h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_CONTAINER); > if (!h) > return -ENOMEM; > + [nit] noise ? > ret = ckpt_write_obj(ctx, &h->h); > ckpt_hdr_put(ctx, h); > > @@ -284,11 +285,6 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t) > _ckpt_err(ctx, -EPERM, "%(T)Nested mnt_ns unsupported\n"); > ret = -EPERM; > } > - /* no support for >1 private netns */ > - if (nsproxy->net_ns != ctx->root_nsproxy->net_ns) { > - _ckpt_err(ctx, -EPERM, "%(T)Nested net_ns unsupported\n"); > - ret = -EPERM; > - } > /* no support for >1 private pidns */ > if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns) { > _ckpt_err(ctx, -EPERM, "%(T)Nested pid_ns unsupported\n"); > diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c > index fbc58ea..16f2c43 100644 > --- a/checkpoint/objhash.c > +++ b/checkpoint/objhash.c > @@ -348,6 +348,36 @@ static void lsm_string_drop(void *ptr, int lastref) > kref_put(&s->kref, lsm_string_free); > } > > +static int netns_grab(void *ptr) > +{ > + struct net *net = ptr; > + > + get_net(net); > + return 0; > +} > + > +static void netns_drop(void *ptr, int lastref) > +{ > + struct net *net = ptr; > + > + put_net(net); > +} > + > +static int netdev_grab(void *ptr) > +{ > + struct net_device *dev = ptr; > + > + dev_hold(dev); > + return 0; > +} > + > +static void netdev_drop(void *ptr, int lastref) > +{ > + struct net_device *dev = ptr; > + > + dev_put(dev); > +} > + > /* security context strings */ > static int checkpoint_lsm_string(struct ckpt_ctx *ctx, void *ptr); > static struct ckpt_lsm_string *restore_lsm_string(struct ckpt_ctx *ctx); > @@ -550,6 +580,24 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = { > .checkpoint = checkpoint_lsm_string, > .restore = restore_lsm_string_wrap, > }, > + /* Network Namespace Object */ > + { > + .obj_name = "NET_NS", > + .obj_type = CKPT_OBJ_NET_NS, > + .ref_grab = netns_grab, > + .ref_drop = netns_drop, > + .checkpoint = checkpoint_netns, > + .restore = restore_netns, > + }, > + /* Network Device Object */ > + { > + .obj_name = "NET_DEV", > + .obj_type = CKPT_OBJ_NETDEV, > + .ref_grab = netdev_grab, > + .ref_drop = netdev_drop, > + .checkpoint = checkpoint_netdev, > + .restore = restore_netdev, > + }, > }; What about leak detection ? Aren't we missing {netns,netdev}_users()? > > > diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h > index 7101d6f..a25bac1 100644 > --- a/include/linux/checkpoint.h > +++ b/include/linux/checkpoint.h > @@ -35,6 +35,7 @@ > #include <linux/checkpoint_types.h> > #include <linux/checkpoint_hdr.h> > #include <linux/err.h> > +#include <linux/inetdevice.h> > #include <net/sock.h> > > /* sycall helpers */ > @@ -119,6 +120,28 @@ extern int ckpt_sock_getnames(struct ckpt_ctx *ctx, > extern struct sk_buff *sock_restore_skb(struct ckpt_ctx *ctx, struct sock *sk); > extern void sock_listening_list_free(struct list_head *head); > > +#ifdef CONFIG_CHECKPOINT_NETNS > +extern int checkpoint_netns(struct ckpt_ctx *ctx, void *ptr); > +extern void *restore_netns(struct ckpt_ctx *ctx); > +extern int checkpoint_netdev(struct ckpt_ctx *ctx, void *ptr); > +extern void *restore_netdev(struct ckpt_ctx *ctx); > + > +extern int ckpt_netdev_in_init_netns(struct ckpt_ctx *ctx, > + struct net_device *dev); > +extern int ckpt_netdev_inet_addrs(struct in_device *indev, > + struct ckpt_netdev_addr *list[]); > +extern int ckpt_netdev_hwaddr(struct net_device *dev, > + struct ckpt_hdr_netdev *h); > +extern struct ckpt_hdr_netdev *ckpt_netdev_base(struct ckpt_ctx *ctx, > + struct net_device *dev, > + struct ckpt_netdev_addr *addrs[]); > +#else > +# define checkpoint_netns NULL > +# define restore_netns NULL > +# define checkpoint_netdev NULL > +# define restore_netdev NULL > +#endif > + > /* ckpt kflags */ > #define ckpt_set_ctx_kflag(__ctx, __kflag) \ > set_bit(__kflag##_BIT, &(__ctx)->kflags) > diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h > index 41412d1..c065739 100644 > --- a/include/linux/checkpoint_hdr.h > +++ b/include/linux/checkpoint_hdr.h > @@ -181,6 +181,12 @@ enum { > #define CKPT_HDR_SOCKET_UNIX CKPT_HDR_SOCKET_UNIX > CKPT_HDR_SOCKET_INET, > #define CKPT_HDR_SOCKET_INET CKPT_HDR_SOCKET_INET > + CKPT_HDR_NET_NS, > +#define CKPT_HDR_NET_NS CKPT_HDR_NET_NS > + CKPT_HDR_NETDEV, > +#define CKPT_HDR_NETDEV CKPT_HDR_NETDEV > + CKPT_HDR_NETDEV_ADDR, > +#define CKPT_HDR_NETDEV_ADDR CKPT_HDR_NETDEV_ADDR > > CKPT_HDR_TAIL = 9001, > #define CKPT_HDR_TAIL CKPT_HDR_TAIL > @@ -253,6 +259,10 @@ enum obj_type { > #define CKPT_OBJ_SECURITY_PTR CKPT_OBJ_SECURITY_PTR > CKPT_OBJ_SECURITY, > #define CKPT_OBJ_SECURITY CKPT_OBJ_SECURITY > + CKPT_OBJ_NET_NS, > +#define CKPT_OBJ_NET_NS CKPT_OBJ_NET_NS > + CKPT_OBJ_NETDEV, > +#define CKPT_OBJ_NETDEV CKPT_OBJ_NETDEV > CKPT_OBJ_MAX > #define CKPT_OBJ_MAX CKPT_OBJ_MAX > }; > @@ -313,6 +323,7 @@ struct ckpt_hdr_tail { > /* container configuration section header */ > struct ckpt_hdr_container { > struct ckpt_hdr h; > + __s32 init_netns_ref; > /* > * the header is followed by the string: > * char lsm_name[SECURITY_NAME_MAX + 1] > @@ -434,6 +445,7 @@ struct ckpt_hdr_ns { > struct ckpt_hdr h; > __s32 uts_objref; > __s32 ipc_objref; > + __s32 net_objref; > } __attribute__((aligned(8))); > > /* cannot include <linux/tty.h> from userspace, so define: */ > @@ -758,6 +770,52 @@ struct ckpt_hdr_file_socket { > __s32 sock_objref; > } __attribute__((aligned(8))); > > +struct ckpt_hdr_netns { > + struct ckpt_hdr h; > + __s32 this_ref; > +} __attribute__((aligned(8))); > + > +enum ckpt_netdev_types { > + CKPT_NETDEV_LO, > + CKPT_NETDEV_VETH, > + CKPT_NETDEV_SIT, > + CKPT_NETDEV_MACVLAN, > +}; > + > +struct ckpt_hdr_netdev { > + struct ckpt_hdr h; > + __s32 netns_ref; > + union { > + struct { > + __s32 this_ref; > + __s32 peer_ref; > + } veth; > + struct { > + __u32 mode; > + } macvlan; > + }; > + __u32 inet_addrs; > + __u16 type; > + __u16 flags; > + __u8 hwaddr[6]; > +} __attribute__((aligned(8))); > + > +enum ckpt_netdev_addr_types { > + CKPT_NETDEV_ADDR_IPV4, > +}; > + > +struct ckpt_netdev_addr { > + __u16 type; Bad alignments ... > + union { > + struct { > + __u32 inet4_local; > + __u32 inet4_address; > + __u32 inet4_mask; > + __u32 inet4_broadcast; > + }; > + } __attribute__((aligned(8))); > +} __attribute__((aligned(8))); > + > struct ckpt_hdr_eventpoll_items { > struct ckpt_hdr h; > __s32 epfile_objref; > diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h > index 5d5e00d..efc9357 100644 > --- a/include/linux/checkpoint_types.h > +++ b/include/linux/checkpoint_types.h > @@ -82,6 +82,7 @@ struct ckpt_ctx { > wait_queue_head_t ghostq; /* waitqueue for ghost tasks */ > struct cred *realcred, *ecred; /* tmp storage for cred at restart */ > struct list_head listen_sockets;/* listening parent sockets */ > + int init_netns_ref; /* Objref of root net namespace */ > > struct ckpt_stats stats; /* statistics */ > > diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c > index 0da0d83..b0e67ff 100644 > --- a/kernel/nsproxy.c > +++ b/kernel/nsproxy.c > @@ -248,6 +248,11 @@ int ckpt_collect_ns(struct ckpt_ctx *ctx, struct task_struct *t) > ret = ckpt_obj_collect(ctx, nsproxy->uts_ns, CKPT_OBJ_UTS_NS); > if (ret < 0) > goto out; I'm unsure why you need a separate configuration option for this ? If you just want a shortcut to "#if defined(CHECKPOINT) && defined(NETNS)" then that's ok, but then in Kconfig (below) it should be tunable by the user. > +#ifdef CONFIG_CHECKPOINT_NETNS > + ret = ckpt_obj_collect(ctx, nsproxy->net_ns, CKPT_OBJ_NET_NS); > + if (ret < 0) > + goto out; > +#endif > ret = ckpt_obj_collect(ctx, nsproxy->ipc_ns, CKPT_OBJ_IPC_NS); > if (ret < 0) > goto out; > @@ -288,6 +293,12 @@ static int do_checkpoint_ns(struct ckpt_ctx *ctx, struct nsproxy *nsproxy) > if (ret < 0) > goto out; > h->ipc_objref = ret; > +#ifdef CONFIG_CHECKPOINT_NETNS > + ret = checkpoint_obj(ctx, nsproxy->net_ns, CKPT_OBJ_NET_NS); > + if (ret < 0) > + goto out; > + h->net_objref = ret; > +#endif > > /* FIXME: for now, only marked visited to pacify leaks */ > ret = ckpt_obj_visit(ctx, nsproxy->mnt_ns, CKPT_OBJ_MNT_NS); > @@ -328,6 +339,14 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx) > ret = PTR_ERR(uts_ns); > goto out; > } > + if (h->net_objref == 0) > + net_ns = current->nsproxy->net_ns; > + else > + net_ns = ckpt_obj_fetch(ctx, h->net_objref, CKPT_OBJ_NET_NS); > + if (IS_ERR(net_ns)) { > + ret = PTR_ERR(net_ns); > + goto out; > + } > > if (h->ipc_objref == 0) > ipc_ns = ctx->root_nsproxy->ipc_ns; > @@ -339,7 +358,6 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx) > } > > mnt_ns = ctx->root_nsproxy->mnt_ns; > - net_ns = ctx->root_nsproxy->net_ns; > > if (uts_ns == current->nsproxy->uts_ns && > ipc_ns == current->nsproxy->ipc_ns && > diff --git a/net/Kconfig b/net/Kconfig > index 041c35e..64dd3cd 100644 > --- a/net/Kconfig > +++ b/net/Kconfig > @@ -276,4 +276,8 @@ source "net/wimax/Kconfig" > source "net/rfkill/Kconfig" > source "net/9p/Kconfig" > > +config CHECKPOINT_NETNS > + bool > + default y if NET && NET_NS && CHECKPOINT > + Did you mean this to be visible (settable) by the user ? > endif # if NET > diff --git a/net/Makefile b/net/Makefile > index 74b038f..570ee98 100644 > --- a/net/Makefile > +++ b/net/Makefile > @@ -67,3 +67,4 @@ endif > obj-$(CONFIG_WIMAX) += wimax/ > > obj-$(CONFIG_CHECKPOINT) += checkpoint.o > +obj-$(CONFIG_CHECKPOINT_NETNS) += checkpoint_dev.o > diff --git a/net/checkpoint_dev.c b/net/checkpoint_dev.c > new file mode 100644 > index 0000000..5479419 > --- /dev/null > +++ b/net/checkpoint_dev.c > @@ -0,0 +1,815 @@ > +/* > + * Copyright 2010 IBM Corporation > + * > + * Author(s): Dan Smith <danms@xxxxxxxxxx> > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public License as > + * published by the Free Software Foundation, version 2 of the > + * License. > + */ > + > +#include <linux/sched.h> > +#include <linux/if.h> > +#include <linux/if_arp.h> > +#include <linux/inetdevice.h> > +#include <linux/veth.h> > +#include <linux/checkpoint.h> > +#include <linux/checkpoint_hdr.h> > +#include <linux/deferqueue.h> > + > +#include <net/net_namespace.h> > +#include <net/sch_generic.h> > + > +struct dq_netdev { > + struct net_device *dev; > + struct ckpt_ctx *ctx; > +}; > + > +struct veth_newlink { > + char *peer; > +}; > + > +struct mvl_newlink { > + char this[IFNAMSIZ+1]; > + char base[IFNAMSIZ+1]; > + int mode; > + __u8 *hwaddr; > +}; > + > +typedef int (*new_link_fn)(struct sk_buff *, void *); > + > +static int __kern_devinet_ioctl(struct net *net, unsigned int cmd, void *arg) > +{ > + mm_segment_t fs; > + int ret; > + > + fs = get_fs(); > + set_fs(KERNEL_DS); > + ret = devinet_ioctl(net, cmd, arg); > + set_fs(fs); > + > + return ret; > +} > + > +static int __kern_dev_ioctl(struct net *net, unsigned int cmd, void *arg) > +{ > + mm_segment_t fs; > + int ret; > + > + fs = get_fs(); > + set_fs(KERNEL_DS); > + ret = dev_ioctl(net, cmd, arg); > + set_fs(fs); > + > + return ret; > +} > + > +static struct socket *rtnl_open(void) > +{ > + struct socket *sock; > + int ret; > + > + ret = sock_create(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock); > + if (ret < 0) > + return ERR_PTR(ret); > + > + return sock; > +} > + > +static int rtnl_close(struct socket *rtnl) > +{ > + if (rtnl) > + return kernel_sock_shutdown(rtnl, SHUT_RDWR); > + else > + return 0; > +} > + > +static struct nlmsghdr *rtnl_get_response(struct socket *rtnl, > + struct sk_buff **skb) > +{ > + int ret; > + long timeo = MAX_SCHEDULE_TIMEOUT; > + struct nlmsghdr *nlh; > + > + ret = sk_wait_data(rtnl->sk, &timeo); > + if (!ret) > + return ERR_PTR(-EPIPE); > + > + *skb = skb_dequeue(&rtnl->sk->sk_receive_queue); > + if (!*skb) > + return ERR_PTR(-EPIPE); > + > + ret = -EINVAL; > + nlh = nlmsg_hdr(*skb); > + if (!nlh) > + goto err; > + > + if (nlh->nlmsg_type == NLMSG_ERROR) { > + struct nlmsgerr *errmsg = nlmsg_data(nlh); > + ret = errmsg->error; > + goto err; > + } > + > + return nlh; > + err: > + kfree_skb(*skb); > + *skb = NULL; > + > + return ERR_PTR(ret); > +} > + > +int ckpt_netdev_in_init_netns(struct ckpt_ctx *ctx, struct net_device *dev) > +{ > + return dev->nd_net == current->nsproxy->net_ns; > +} This has raised questions before - probably worth a comment that explain what's the rationale here. > + > +int ckpt_netdev_hwaddr(struct net_device *dev, struct ckpt_hdr_netdev *h) > +{ > + struct net *net = dev->nd_net; > + struct ifreq req; > + int ret; > + > + memcpy(req.ifr_name, dev->name, IFNAMSIZ); > + ret = __kern_dev_ioctl(net, SIOCGIFFLAGS, &req); > + h->flags = req.ifr_flags; > + if (ret < 0) > + return ret; [nit] is it intentional that you assign h->flags before testing ret ? > + > + ret = __kern_dev_ioctl(net, SIOCGIFHWADDR, &req); > + if (ret < 0) > + return ret; > + > + memcpy(h->hwaddr, req.ifr_hwaddr.sa_data, sizeof(h->hwaddr)); > + > + return 0; > +} > + > +int ckpt_netdev_inet_addrs(struct in_device *indev, > + struct ckpt_netdev_addr *_abuf[]) > +{ > + struct ckpt_netdev_addr *abuf = NULL; > + struct in_ifaddr *addr = indev->ifa_list; > + int pages = 0; > + int addrs = 0; > + int max; > + > + retry: > + if (++pages > 4) { > + addrs = -E2BIG; > + goto out; > + } Why 4 ? This makes me wonder if it's worth to allocate a temp buf over the checkpoint ctx (e.g. ctx->tmpbuf) to be used for scratch ? > + > + *_abuf = krealloc(abuf, PAGE_SIZE * pages, GFP_KERNEL); > + if (*_abuf == NULL) { > + addrs = -ENOMEM; > + goto out; > + } > + abuf = *_abuf; > + > + read_lock(&dev_base_lock); > + > + max = (pages * PAGE_SIZE) / sizeof(*abuf); > + while (addr) { > + abuf[addrs].type = CKPT_NETDEV_ADDR_IPV4; /* Only IPv4 now */ > + abuf[addrs].inet4_local = addr->ifa_local; > + abuf[addrs].inet4_address = addr->ifa_address; > + abuf[addrs].inet4_mask = addr->ifa_mask; > + abuf[addrs].inet4_broadcast = addr->ifa_broadcast; > + > + addr = addr->ifa_next; > + if (++addrs >= max) { > + read_unlock(&dev_base_lock); > + goto retry; > + } > + } > + > + read_unlock(&dev_base_lock); > + out: > + if (addrs < 0) { > + kfree(abuf); > + *_abuf = NULL; > + } > + > + return addrs; > +} > + > +struct ckpt_hdr_netdev *ckpt_netdev_base(struct ckpt_ctx *ctx, > + struct net_device *dev, > + struct ckpt_netdev_addr *addrs[]) > +{ > + struct ckpt_hdr_netdev *h; > + int ret; > + > + h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_NETDEV); > + if (!h) > + return ERR_PTR(-ENOMEM); > + > + ret = ckpt_netdev_hwaddr(dev, h); > + if (ret < 0) > + goto out; > + > + *addrs = NULL; > + ret = h->inet_addrs = ckpt_netdev_inet_addrs(dev->ip_ptr, addrs); > + if (ret < 0) { > + if (ret == -E2BIG) > + ckpt_err(ctx, ret, > + "Too many inet addresses on interface %s\n", > + dev->name); Do we really need this special case ? I'd be happy with a ckpt_err() for any error - and the actual error number would be useful to tell which case it was. > + goto out; > + } > + > + if (ckpt_netdev_in_init_netns(ctx, dev)) > + ret = h->netns_ref = 0; > + else > + ret = h->netns_ref = checkpoint_obj(ctx, dev->nd_net, > + CKPT_OBJ_NET_NS); > + out: > + if (ret < 0) { > + ckpt_hdr_put(ctx, h); > + h = ERR_PTR(ret); > + if (*addrs) > + kfree(*addrs); > + } > + > + return h; > +} > + > +int checkpoint_netdev(struct ckpt_ctx *ctx, void *ptr) > +{ > + struct net_device *dev = (struct net_device *)ptr; > + > + if (!dev->netdev_ops->ndo_checkpoint) { > + ckpt_err(ctx, -ENOSYS, > + "Device %s does not support checkpoint\n", dev->name); > + return -ENOSYS; > + } > + > + ckpt_debug("checkpointing netdev %s\n", dev->name); > + > + return dev->netdev_ops->ndo_checkpoint(ctx, dev); > +} > + > +int checkpoint_netns(struct ckpt_ctx *ctx, void *ptr) > +{ > + struct net *net = ptr; > + struct net_device *dev; > + struct ckpt_hdr_netns *h; > + int ret; > + > + h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_NET_NS); > + if (!h) > + return -ENOMEM; > + > + h->this_ref = ckpt_obj_lookup(ctx, net, CKPT_OBJ_NET_NS); > + BUG_ON(h->this_ref == 0); How about s/==/<=/ ? > + > + ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h); > + if (ret < 0) > + goto out; > + > + for_each_netdev(net, dev) { > + if (!dev->netdev_ops->ndo_checkpoint) { Isn't this check redundant ? I expect it to fail promptly in checkpoint_netdev() above. > + ret = -ENOSYS; > + ckpt_err(ctx, ret, > + "Device %s does not support checkpoint\n", > + dev->name); > + break; > + } > + > + ret = checkpoint_obj(ctx, dev, CKPT_OBJ_NETDEV); > + if (ret < 0) > + break; > + } > + out: > + ckpt_hdr_put(ctx, h); > + > + return ret; > +} > + > +static int restore_in_addrs(struct ckpt_ctx *ctx, > + __u32 naddrs, > + struct net *net, > + struct net_device *dev) > +{ > + __u32 i; > + int ret = 0; > + int len = naddrs * sizeof(struct ckpt_netdev_addr); > + struct ckpt_netdev_addr *addrs = NULL; > + > + addrs = kmalloc(len, GFP_KERNEL); > + if (!addrs) > + return -ENOMEM; > + > + ret = _ckpt_read_buffer(ctx, addrs, len); > + if (ret < 0) > + goto out; Could you use ckpt_read_payload() instead of the above ? > + > + for (i = 0; i < naddrs; i++) { > + struct ckpt_netdev_addr *addr = &addrs[i]; > + struct ifreq req; > + struct sockaddr_in *inaddr; > + > + if (addr->type != CKPT_NETDEV_ADDR_IPV4) { > + ret = -EINVAL; > + ckpt_err(ctx, ret, "Unsupported netdev addr type %i\n", > + addr->type); > + break; > + } > + > + ckpt_debug("restoring %s: %x/%x/%x\n", dev->name, > + addr->inet4_address, > + addr->inet4_mask, > + addr->inet4_broadcast); > + > + memcpy(req.ifr_name, dev->name, IFNAMSIZ); > + > + inaddr = (struct sockaddr_in *)&req.ifr_addr; > + inaddr->sin_addr.s_addr = addr->inet4_address; > + inaddr->sin_family = AF_INET; > + ret = __kern_devinet_ioctl(net, SIOCSIFADDR, &req); > + if (ret < 0) { > + ckpt_err(ctx, ret, "Failed to set address\n"); > + break; > + } > + > + inaddr = (struct sockaddr_in *)&req.ifr_addr; > + inaddr->sin_addr.s_addr = addr->inet4_mask; > + inaddr->sin_family = AF_INET; > + ret = __kern_devinet_ioctl(net, SIOCSIFNETMASK, &req); > + if (ret < 0) { > + ckpt_err(ctx, ret, "Failed to set netmask\n"); > + break; > + } > + > + inaddr = (struct sockaddr_in *)&req.ifr_addr; > + inaddr->sin_addr.s_addr = addr->inet4_broadcast; > + inaddr->sin_family = AF_INET; > + ret = __kern_devinet_ioctl(net, SIOCSIFBRDADDR, &req); > + if (ret < 0) { > + ckpt_err(ctx, ret, "Failed to set broadcast\n"); > + break; > + } > + } > + > + out: > + kfree(addrs); > + > + return ret; > +} > + > +static int veth_new_link_msg(struct sk_buff *skb, void *data) > +{ > + struct nlattr *linkinfo; > + struct nlattr *linkdata; > + struct ifinfomsg ifm; > + int ret = -ENOMEM; > + struct veth_newlink *d = data; > + > + linkinfo = nla_nest_start(skb, IFLA_LINKINFO); > + if (!linkinfo) > + goto out; > + > + ret = nla_put_string(skb, IFLA_INFO_KIND, "veth"); > + if (ret) > + goto out; > + > + linkdata = nla_nest_start(skb, IFLA_INFO_DATA); > + if (!linkdata) { > + ret = -ENOMEM; > + goto out; > + } > + > + ret = nla_put(skb, VETH_INFO_PEER, sizeof(ifm), &ifm); > + if (!ret) > + ret = nla_put_string(skb, IFLA_IFNAME, d->peer); > + > + nla_nest_end(skb, linkdata); > + out: > + nla_nest_end(skb, linkinfo); > + > + return ret; > +} > + > +static int mvl_new_link_msg(struct sk_buff *skb, void *data) > +{ > + struct mvl_newlink *d = data; > + struct nlattr *linkinfo; > + struct nlattr *linkdata; > + struct net_device *lowerdev; > + int ret; > + > + lowerdev = dev_get_by_name(current->nsproxy->net_ns, d->base); > + if (!lowerdev) > + return -ENOENT; > + > + ret = nla_put(skb, IFLA_ADDRESS, ETH_ALEN, d->hwaddr); > + if (ret) > + goto out_put; > + > + ret = nla_put_u32(skb, IFLA_LINK, lowerdev->ifindex); > + if (ret) > + goto out_put; > + > + linkinfo = nla_nest_start(skb, IFLA_LINKINFO); > + if (!linkinfo) { > + ret = -ENOMEM; > + goto out; s/out/out_put/ ?? > + } > + > + ret = nla_put_string(skb, IFLA_INFO_KIND, "macvlan"); > + if (ret) > + goto out; > + > + linkdata = nla_nest_start(skb, IFLA_INFO_DATA); > + if (!linkdata) { > + ret = -ENOMEM; > + goto out; > + } > + > + ret = nla_put_u32(skb, IFLA_MACVLAN_MODE, d->mode); > + nla_nest_end(skb, linkdata); > + out: > + nla_nest_end(skb, linkinfo); > + out_put: > + dev_put(lowerdev); > + > + return ret; > +} > + > +static struct sk_buff *new_link_msg(new_link_fn fn, void *data, char *name) > +{ > + int ret = -ENOMEM; > + int flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK; > + struct nlmsghdr *nlh; > + struct sk_buff *skb; > + struct ifinfomsg *ifm; > + > + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); > + if (!skb) > + goto out; > + > + nlh = nlmsg_put(skb, 0, 0, RTM_NEWLINK, sizeof(*ifm), flags); > + if (!nlh) > + goto out; > + > + ifm = nlmsg_data(nlh); > + memset(ifm, 0, sizeof(*ifm)); > + > + ret = nla_put_string(skb, IFLA_IFNAME, name); > + if (ret) > + goto out; > + > + ret = fn(skb, data); > + > + nlmsg_end(skb, nlh); > + > + out: > + if (ret < 0) { > + kfree_skb(skb); > + skb = ERR_PTR(ret); > + } > + > + return skb; > +} > + > +static struct net_device *rtnl_newlink(new_link_fn fn, void *data, char *name) > +{ > + int ret = -ENOMEM; > + struct socket *rtnl = NULL; > + struct sk_buff *skb = NULL; > + struct nlmsghdr *nlh; > + struct msghdr msg; > + struct kvec kvec; > + > + skb = new_link_msg(fn, data, name); > + if (IS_ERR(skb)) { > + ret = PTR_ERR(skb); > + ckpt_debug("failed to create new link message: %i\n", ret); > + skb = NULL; > + goto out; Is this right - you didn't open the rtnl yet ? Perhaps only ckpt_debug() and then return PTR_ERR(skb). > + } > + > + memset(&msg, 0, sizeof(msg)); > + kvec.iov_len = skb->len; > + kvec.iov_base = skb->head; > + > + rtnl = rtnl_open(); > + if (IS_ERR(rtnl)) { > + ret = PTR_ERR(rtnl); > + ckpt_debug("Unable to open rtnetlink socket: %i\n", ret); > + goto out_noclose; > + } > + > + ret = kernel_sendmsg(rtnl, &msg, &kvec, 1, kvec.iov_len); > + if (ret < 0) > + goto out; > + else if (ret != skb->len) { > + ret = -EIO; > + goto out; > + } > + > + /* Free the send skb to make room for the receive skb */ > + kfree_skb(skb); > + > + nlh = rtnl_get_response(rtnl, &skb); > + if (IS_ERR(nlh)) { If this happens, would rtnl_get_response() place a NULL in skb ? If not, then the kfree_skb() below will free the previous skb again. > + ret = PTR_ERR(nlh); > + ckpt_debug("RTNETLINK said: %i\n", ret); > + } > + out: > + rtnl_close(rtnl); > + out_noclose: > + kfree_skb(skb); > + > + if (ret < 0) > + return ERR_PTR(ret); > + else > + return dev_get_by_name(current->nsproxy->net_ns, name); > +} > + > +static int netdev_noop(void *data) > +{ > + return 0; > +} > + > +static int netdev_cleanup(void *data) > +{ > + struct dq_netdev *dq = data; > + > + dev_put(dq->dev); > + > + if (dq->ctx->errno) { > + ckpt_debug("Unregistering netdev %s\n", dq->dev->name); > + unregister_netdev(dq->dev); > + } > + > + return 0; > +} > + > +static struct net_device *restore_veth(struct ckpt_ctx *ctx, > + struct ckpt_hdr_netdev *h, > + struct net *net) > +{ > + int ret; > + char this_name[IFNAMSIZ]; > + char peer_name[IFNAMSIZ]; > + struct net_device *dev; > + struct net_device *peer; > + int didreg = 0; > + struct ifreq req; > + struct dq_netdev dq; > + > + dq.ctx = ctx; > + > + ret = _ckpt_read_buffer(ctx, this_name, IFNAMSIZ); > + if (ret < 0) > + return ERR_PTR(ret); > + > + ret = _ckpt_read_buffer(ctx, peer_name, IFNAMSIZ); > + if (ret < 0) > + return ERR_PTR(ret); > + > + ckpt_debug("restored veth netdev %s:%s\n", this_name, peer_name); > + > + peer = ckpt_obj_try_fetch(ctx, h->veth.peer_ref, CKPT_OBJ_NETDEV); > + if (IS_ERR(peer)) { > + struct veth_newlink veth = { > + .peer = peer_name, > + }; > + > + /* We're first: allocate the veth pair */ > + didreg = 1; > + > + dev = rtnl_newlink(veth_new_link_msg, &veth, this_name); > + if (IS_ERR(dev)) > + return dev; > + > + peer = dev_get_by_name(current->nsproxy->net_ns, peer_name); > + if (!peer) { > + ret = -EINVAL; > + goto err_dev; > + } > + > + dq.dev = peer; > + ret = deferqueue_add(ctx->deferqueue, &dq, sizeof(dq), > + netdev_noop, netdev_cleanup); > + if (ret) > + goto err_peer; > + > + ret = ckpt_obj_insert(ctx, peer, h->veth.peer_ref, > + CKPT_OBJ_NETDEV); > + if (ret < 0) > + /* Can't recall peer dq, so let it cleanup peer */ > + goto err_dev; This may be a bit simpler if you move the first deferqueue_add() forward to just before the other one. Or better: change dq_netdev to have two pointers, dev and peer (if any is null, the cleanup function will skip). > + dev_put(peer); > + > + dq.dev = dev; > + ret = deferqueue_add(ctx->deferqueue, &dq, sizeof(dq), > + netdev_noop, netdev_cleanup); > + if (ret) > + /* Can't recall peer dq, so let it cleanup peer */ > + goto err_dev; > + > + } else { > + /* We're second: get our dev from the hash */ > + dev = ckpt_obj_fetch(ctx, h->veth.this_ref, CKPT_OBJ_NETDEV); > + if (IS_ERR(dev)) > + return dev; > + } > + > + /* Move to our new netns */ > + rtnl_lock(); > + ret = dev_change_net_namespace(dev, net, dev->name); > + rtnl_unlock(); > + if (ret < 0) > + goto out; > + > + /* Restore MAC address */ > + memcpy(req.ifr_name, dev->name, IFNAMSIZ); > + memcpy(req.ifr_hwaddr.sa_data, h->hwaddr, sizeof(h->hwaddr)); > + req.ifr_hwaddr.sa_family = ARPHRD_ETHER; > + ret = __kern_dev_ioctl(net, SIOCSIFHWADDR, &req); > + out: > + if (ret) > + dev = ERR_PTR(ret); > + > + return dev; > + > + err_peer: > + dev_put(peer); > + unregister_netdev(peer); > + err_dev: > + dev_put(dev); > + unregister_netdev(dev); > + > + return ERR_PTR(ret); > +} > + > +static struct net_device *restore_lo(struct ckpt_ctx *ctx, > + struct ckpt_hdr_netdev *h, > + struct net *net) > +{ > + struct net_device *dev; > + char name[IFNAMSIZ+1]; > + int ret; > + > + dev = dev_get_by_name(net, "lo"); > + if (!dev) > + return ERR_PTR(-EINVAL); > + > + ret = _ckpt_read_buffer(ctx, name, IFNAMSIZ); > + if (ret < 0) > + goto err; > + > + if (strncmp(dev->name, name, IFNAMSIZ) != 0) { > + ret = dev_change_name(dev, name); > + if (ret < 0) > + goto err; > + } > + > + return dev; > + err: > + dev_put(dev); > + > + return ERR_PTR(ret); > +} > + > +static struct net_device *restore_sit(struct ckpt_ctx *ctx, > + struct ckpt_hdr_netdev *h, > + struct net *net) > +{ > + /* Don't actually do anything for SIT devices yet */ > + return dev_get_by_name(net, "sit0"); > +} > + > +static struct net_device *restore_macvlan(struct ckpt_ctx *ctx, > + struct ckpt_hdr_netdev *h, > + struct net *net) > +{ > + struct net_device *dev; > + struct mvl_newlink mvl = { > + .mode = h->macvlan.mode, > + .hwaddr = h->hwaddr, > + }; > + int ret; > + > + ret = _ckpt_read_buffer(ctx, mvl.this, IFNAMSIZ); > + if (ret < 0) > + return ERR_PTR(ret); > + > + ret = _ckpt_read_buffer(ctx, mvl.base, IFNAMSIZ); > + if (ret < 0) > + return ERR_PTR(ret); > + > + dev = rtnl_newlink(mvl_new_link_msg, &mvl, mvl.this); > + if (IS_ERR(dev)) { > + ckpt_err(ctx, PTR_ERR(dev), > + "Failed to create macvlan device %s:%s", > + mvl.this, mvl.base); > + goto out; > + } > + > + rtnl_lock(); > + ret = dev_change_net_namespace(dev, net, dev->name); > + rtnl_unlock(); > + > + if (ret) { > + ckpt_err(ctx, ret, "Failed to change netns of %s:%s\n", > + mvl.this, mvl.base); > + dev_put(dev); > + unregister_netdev(dev); > + dev = ERR_PTR(ret); > + } > + out: > + return dev; > +} > + > +void *restore_netdev(struct ckpt_ctx *ctx) > +{ > + struct ckpt_hdr_netdev *h; > + struct net_device *dev = NULL; > + struct ifreq req; > + struct net *net; > + int ret; > + > + h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_NETDEV); > + if (IS_ERR(h)) { > + ckpt_err(ctx, PTR_ERR(h), "failed to read netdev\n"); This ckpt_err() is redundant. ckpt_read_obj_type() already calls ckpt_err() if it fails. > + return h; > + } > + How about some comment to explain this following snippet ? > + if (h->netns_ref != 0) { > + net = ckpt_obj_try_fetch(ctx, h->netns_ref, CKPT_OBJ_NET_NS); > + if (IS_ERR(net)) { > + ckpt_debug("failed to get net for %i\n", h->netns_ref); > + ret = PTR_ERR(net); > + net = current->nsproxy->net_ns; ^^^^^^^^^^^^^^^^^ Why this ? > + goto out; > + } > + } else > + net = current->nsproxy->net_ns; > + > + if (h->type == CKPT_NETDEV_VETH) > + dev = restore_veth(ctx, h, net); > + else if (h->type == CKPT_NETDEV_LO) > + dev = restore_lo(ctx, h, net); > + else if (h->type == CKPT_NETDEV_SIT) > + dev = restore_sit(ctx, h, net); > + else if (h->type == CKPT_NETDEV_MACVLAN) > + dev = restore_macvlan(ctx, h, net); > + else > + dev = ERR_PTR(-EINVAL); This is ugly. How about a dispatch table intead ? It will also allow in the future for kernel modules to register their restore functions. > + > + if (IS_ERR(dev)) { > + ret = PTR_ERR(dev); > + ckpt_err(ctx, ret, "Netdev type %i not supported\n", h->type); > + goto out; > + } > + > + /* Restore flags (which will likely bring the interface up) */ > + memcpy(req.ifr_name, dev->name, IFNAMSIZ); > + req.ifr_flags = h->flags; > + ret = __kern_dev_ioctl(net, SIOCSIFFLAGS, &req); > + if (ret < 0) > + goto out; > + > + if (h->inet_addrs > 0) > + ret = restore_in_addrs(ctx, h->inet_addrs, net, dev); > + out: > + if (ret) { > + ckpt_err(ctx, ret, "Failed to restore netdevice\n"); > + if ((h->type == CKPT_NETDEV_VETH) && !IS_ERR(dev)) { > + dev_put(dev); > + } > + dev = ERR_PTR(ret); > + } else > + ckpt_debug("restored netdev %s\n", dev->name); > + > + ckpt_hdr_put(ctx, h); > + > + return dev; > +} > + > +void *restore_netns(struct ckpt_ctx *ctx) > +{ > + struct ckpt_hdr_netns *h; > + struct net *net; > + > + h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_NET_NS); > + if (IS_ERR(h)) { > + ckpt_err(ctx, PTR_ERR(h), "failed to read netns\n"); Here, too, no need for ckpt_err(). > + return h; > + } > + Here, too, a comment to explain the snippet will be helpful. > + if (h->this_ref != 0) { > + net = copy_net_ns(CLONE_NEWNET, current->nsproxy->net_ns); > + if (IS_ERR(net)) > + goto out; [nit] this test isn't necessary. > + } else > + net = current->nsproxy->net_ns; > + out: > + ckpt_hdr_put(ctx, h); > + > + return net; > +} Oren. _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers