On 10/20/15 at 03:20pm, Jarno Rajahalme wrote: > Extend OVS conntrack interface to cover NAT. New nested > OVS_CT_ATTR_NAT may be used to include NAT with a CT action. A bare > OVS_CT_ATTR_NAT only mangles existing connections. If > OVS_NAT_ATTR_SRC or OVS_NAT_ATTR_DST is included within the nested > attributes, new (non-committed/non-confirmed) connections are mangled > according to the rest of the nested attributes. > > This work extends on a branch by Thomas Graf at > https://github.com/tgraf/ovs/tree/nat. > > Signed-off-by: Jarno Rajahalme <jrajahalme@xxxxxxxxxx> Awesome work. This is a great start. There are some, probably unintended, unrelated style changes. More comments below. > +enum ovs_nat_attr { > + OVS_NAT_ATTR_UNSPEC, > + OVS_NAT_ATTR_SRC, > + OVS_NAT_ATTR_DST, > + OVS_NAT_ATTR_IP_MIN, > + OVS_NAT_ATTR_IP_MAX, > + OVS_NAT_ATTR_PROTO_MIN, > + OVS_NAT_ATTR_PROTO_MAX, > + OVS_NAT_ATTR_PERSISTENT, > + OVS_NAT_ATTR_PROTO_HASH, > + OVS_NAT_ATTR_PROTO_RANDOM, Simplify this with an OVS_NAT_ATTR_FLAGS? > @@ -137,11 +159,17 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, > ovs_ct_get_label(ct, &key->ct.label); > } > > -/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has > - * previously sent the packet to conntrack via the ct action. > +/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has > + * previously sent the packet to conntrack via the ct action. If > + * 'keep_nat_flags' is true, the existing NAT flags retained, else they are > + * initialized from the connection status. > */ > static void ovs_ct_update_key(const struct sk_buff *skb, > - struct sw_flow_key *key, bool post_ct) > + struct sw_flow_key *key, bool post_ct > +#ifdef CONFIG_NF_NAT_NEEDED > + , bool keep_nat_flags > +#endif > + ) I suggest keeping the argument even for !CONFIG_NF_NAT_NEEDED. This unclutters the call sites of this function. An ifdef inside the keep_nat_flags branch should be enough. The compiler will optimize the code away and it's much prettier to read. > { > const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; > enum ip_conntrack_info ctinfo; > @@ -151,8 +179,20 @@ static void ovs_ct_update_key(const struct sk_buff *skb, > ct = nf_ct_get(skb, &ctinfo); > if (ct) { > state = ovs_ct_get_state(ctinfo); > + /* OVS persists the related flag for the duration of the > + * connection. */ > if (ct->master) > state |= OVS_CS_F_RELATED; > +#ifdef CONFIG_NF_NAT_NEEDED > + if (keep_nat_flags) > + state |= key->ct.state & OVS_CS_F_NAT_MASK; > + else { > + if (ct->status & IPS_SRC_NAT) > + state |= OVS_CS_F_SRC_NAT; > + if (ct->status & IPS_DST_NAT) > + state |= OVS_CS_F_DST_NAT; > + } > +#endif > zone = nf_ct_zone(ct); > } else if (post_ct) { > state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID; > @@ -291,7 +337,16 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto) > return NF_DROP; > } > > - return helper->help(skb, protoff, ct, ctinfo); > + if (helper->help(skb, protoff, ct, ctinfo) != NF_ACCEPT) > + return NF_DROP; Return the returned value here instead of hardcoding NF_DROP? > +#ifdef CONFIG_NF_NAT_NEEDED > + /* Adjust seqs after helper. */ A comment on why this is needed would be helpful. > + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) > + && !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) > + return NF_DROP; > +#endif > + return NF_ACCEPT; > @@ -377,7 +432,211 @@ static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb, > return true; > } > > -static int __ovs_ct_lookup(struct net *net, const struct sw_flow_key *key, > +#ifdef CONFIG_NF_NAT_NEEDED > +/* Modeled after nf_nat_ipv[46]_fn(). > + * range is only used for new, uninitialized NAT state. > + * Returns either NF_ACCEPT or NF_DROP. */ > +static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, > + enum ip_conntrack_info ctinfo, > + const struct nf_nat_range *range, > + enum nf_nat_manip_type maniptype) > +{ > + int hooknum, nh_off, err = NF_ACCEPT; > + > + nh_off = skb_network_offset(skb); > + skb_pull(skb, nh_off); > + > + /* See HOOK2MANIP(). */ > + if (maniptype == NF_NAT_MANIP_SRC) > + hooknum = NF_INET_LOCAL_IN; /* Source NAT */ > + else > + hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */ > + > + switch (ctinfo) { > + case IP_CT_RELATED: > + case IP_CT_RELATED_REPLY: > + if (skb->protocol == htons(ETH_P_IP) > + && ip_hdr(skb)->protocol == IPPROTO_ICMP) { > + if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, > + hooknum)) > + err = NF_DROP; > + goto push; > + } else if (skb->protocol == htons(ETH_P_IPV6)) { > + __be16 frag_off; > + u8 nexthdr = ipv6_hdr(skb)->nexthdr; > + int hdrlen = ipv6_skip_exthdr(skb, > + sizeof(struct ipv6hdr), > + &nexthdr, &frag_off); > + > + if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { > + if (!nf_nat_icmpv6_reply_translation(skb, ct, > + ctinfo, > + hooknum, > + hdrlen)) > + err = NF_DROP; > + goto push; > + } > + } > + /* Non-ICMP, fall thru to initialize if needed. */ > + case IP_CT_NEW: > + /* Seen it before? This can happen for loopback, retrans, > + * or local packets. > + */ > + if (!nf_nat_initialized(ct, maniptype)) { Explicit unlikely()? > + /* Initialize according to the NAT action. */ > + err = (range && range->flags & NF_NAT_RANGE_MAP_IPS) > + /* Action is set up to establish a new > + * mapping */ > + ? nf_nat_setup_info(ct, range, maniptype) > + : nf_nat_alloc_null_binding(ct, hooknum); > + } > + break; > + > + case IP_CT_ESTABLISHED: > + case IP_CT_ESTABLISHED_REPLY: > + break; > + > + default: > + err = NF_DROP; > + goto push; > + } > + > + if (err == NF_ACCEPT) > + err = nf_nat_packet(ct, ctinfo, hooknum, skb); If you goto push on init failure (IP_CT_NEW branch), then this conditional is no longer needed and a more straight forward exception handling is seen. > +push: > + skb_push(skb, nh_off); > + > + return err; > +} > +/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. > + * This action can be used to both NAT and reverse NAT, however, reverse NAT > + * can also be done with the conntrack action. */ > +static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, > + const struct ovs_conntrack_info *info, > + struct sk_buff *skb) > +{ > + enum nf_nat_manip_type maniptype; > + enum ip_conntrack_info ctinfo; > + struct nf_conn *ct; > + int err; > + > + /* No NAT action or already NATed? */ > + if (!(info->flags & OVS_CT_F_NAT_MASK) > + || key->ct.state & OVS_CS_F_NAT_MASK) > + return NF_ACCEPT; > + > + ct = nf_ct_get(skb, &ctinfo); > + /* Check if an existing conntrack entry may be found for this skb. > + * This happens when we lose the ct entry pointer due to an upcall. > + * Don't lookup invalid connections. */ > + if (!ct && key->ct.state & OVS_CS_F_TRACKED > + && !(key->ct.state & OVS_CS_F_INVALID)) > + ct = ovs_ct_find_existing(net, &info->zone, info->family, skb, > + &ctinfo); > + if (!ct || nf_ct_is_untracked(ct)) > + /* A NAT action may only be performed on tracked packets. */ > + return NF_ACCEPT; Braces > + /* Add NAT extension if not commited yet. */ > + if (!nf_ct_is_confirmed(ct)) { > + if (!nf_ct_nat_ext_add(ct)) > + return NF_ACCEPT; /* Can't NAT. */ > + } && > + /* Determine NAT type. > + * Check if the NAT type can be deduced from the tracked connection. > + * Make sure expected traffic is NATted only when commiting. */ > + if (info->flags & OVS_CT_F_NAT && ctinfo != IP_CT_NEW > + && ct->status & IPS_NAT_MASK > + && (!(ct->status & IPS_EXPECTED_BIT) > + || info->flags & OVS_CT_F_COMMIT)) { > + /* NAT an established or related connection like before. */ > + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) > + /* This is the REPLY direction for a connection > + * for which NAT was applied in the forward > + * direction. Do the reverse NAT. */ > + maniptype = ct->status & IPS_SRC_NAT > + ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC; > + else > + maniptype = ct->status & IPS_SRC_NAT > + ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST; > + } > + else if (info->flags & OVS_CT_F_SRC_NAT) > + maniptype = NF_NAT_MANIP_SRC; > + else if (info->flags & OVS_CT_F_DST_NAT) > + maniptype = NF_NAT_MANIP_DST; > + else > + return NF_ACCEPT; /* Connection is not NATed. */ > + > + err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype); > + > + /* Mark NAT done if successful. */ > + if (err == NF_ACCEPT) > + key->ct.state |= (maniptype == NF_NAT_MANIP_SRC) > + ? OVS_CS_F_SRC_NAT : OVS_CS_F_DST_NAT; > + return err; > +} > +#endif > + > +static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, > const struct ovs_conntrack_info *info, > struct sk_buff *skb) > { > @@ -538,6 +819,131 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, > return 0; > } > > +#ifdef CONFIG_NF_NAT_NEEDED > +static int parse_nat(const struct nlattr *attr, > + struct ovs_conntrack_info *info, bool log) > +{ > + struct nlattr *a; > + int rem; > + bool have_ip_max = false; > + bool have_proto_max = false; > + bool ip_vers = (info->family == NFPROTO_IPV6); > + > + nla_for_each_nested(a, attr, rem) { > + static const u16 ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = { > + [OVS_NAT_ATTR_SRC] = {0, 0}, > + [OVS_NAT_ATTR_DST] = {0, 0}, > + [OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr), > + sizeof(struct in6_addr)}, > + [OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr), > + sizeof(struct in6_addr)}, > + [OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16),sizeof(u16)}, > + [OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16),sizeof(u16)}, > + [OVS_NAT_ATTR_PERSISTENT] = {0, 0}, > + [OVS_NAT_ATTR_PROTO_HASH] = {0, 0}, > + [OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0}, > + }; > + int type = nla_type(a); > + > + if (type > OVS_NAT_ATTR_MAX) { > + OVS_NLERR(log, "Unknown nat attribute (type=%d, max=%d).\n", > + type, OVS_NAT_ATTR_MAX); Formatting > + return -EINVAL; > + } > + > + case OVS_NAT_ATTR_IP_MIN: > + nla_memcpy(&info->range.min_addr, a, nla_len(a)); The length attribute should be sizeof of min_addr like for max_addr below. > + info->range.flags |= NF_NAT_RANGE_MAP_IPS; > + break; > + > + case OVS_NAT_ATTR_IP_MAX: > + have_ip_max = true; > + nla_memcpy(&info->range.max_addr, a, > + sizeof(info->range.max_addr)); > + info->range.flags |= NF_NAT_RANGE_MAP_IPS; > + break; > + > + } > static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { > [OVS_CT_ATTR_COMMIT] = { .minlen = 0, > .maxlen = 0 }, > @@ -548,7 +954,11 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { > [OVS_CT_ATTR_LABEL] = { .minlen = sizeof(struct md_label), > .maxlen = sizeof(struct md_label) }, > [OVS_CT_ATTR_HELPER] = { .minlen = 1, > - .maxlen = NF_CT_HELPER_NAME_LEN } > + .maxlen = NF_CT_HELPER_NAME_LEN }, > +#ifdef CONFIG_NF_NAT_NEEDED > + [OVS_CT_ATTR_NAT] = { .minlen = 0, > + .maxlen = 96 } > +#endif Is the 96 a temporary hack here? > @@ -607,6 +1017,14 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, > return -EINVAL; > } > break; > +#ifdef CONFIG_NF_NAT_NEEDED > + case OVS_CT_ATTR_NAT: { > + int err = parse_nat(a, info, log); > + if (err) > + return err; > + break; > + } > +#endif We should probably bark if user space provides a OVS_CT_ATTR_NAT but the kernel is compiled without support for it. > +#ifdef CONFIG_NF_NAT_NEEDED > +static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info, > + struct sk_buff *skb) > +{ > + struct nlattr *start; > + > + start = nla_nest_start(skb, OVS_CT_ATTR_NAT); > + if (!start) > + return false; > + > + if (info->flags & OVS_CT_F_SRC_NAT) { > + if (nla_put_flag(skb, OVS_NAT_ATTR_SRC)) > + return false; > + } else if (info->flags & OVS_CT_F_DST_NAT) { > + if (nla_put_flag(skb, OVS_NAT_ATTR_DST)) > + return false; > + } else { > + goto out; Is the empty nested attribute intended here? -- To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html