Hi Hans, On Mon, Oct 03, 2011 at 07:46:42PM +0200, Hans Schillstrom wrote: > diff --git a/include/linux/netfilter/xt_hmark.h b/include/linux/netfilter/xt_hmark.h > new file mode 100644 > index 0000000..6c1436a > --- /dev/null > +++ b/include/linux/netfilter/xt_hmark.h > @@ -0,0 +1,48 @@ > +#ifndef XT_HMARK_H_ > +#define XT_HMARK_H_ > + > +#include <linux/types.h> > + > +/* > + * Flags must not start at 0, since it's used as none. > + */ > +enum { > + XT_HMARK_SADR_AND = 1, /* SNAT & DNAT are used by the kernel module */ > + XT_HMARK_DADR_AND, > + XT_HMARK_SPI_AND, > + XT_HMARK_SPI_OR, > + XT_HMARK_SPORT_AND, > + XT_HMARK_DPORT_AND, > + XT_HMARK_SPORT_OR, > + XT_HMARK_DPORT_OR, > + XT_HMARK_PROTO_AND, > + XT_HMARK_RND, > + XT_HMARK_MODULUS, > + XT_HMARK_OFFSET, > + XT_HMARK_USE_SNAT, > + XT_HMARK_USE_DNAT, > +}; > + > +union ports { > + struct { > + __u16 src; > + __u16 dst; > + } p16; > + __u32 v32; > +}; > + > +struct xt_hmark_info { > + __u32 smask; /* Source address mask */ > + __u32 dmask; /* Dest address mask */ > + union ports pmask; > + union ports pset; > + __u32 spimask; > + __u32 spiset; > + __u16 flags; /* Print out only */ > + __u16 prmask; /* L4 Proto mask */ > + __u32 hashrnd; > + __u32 hmod; /* Modulus */ > + __u32 hoffs; /* Offset */ > +}; > + > +#endif /* XT_HMARK_H_ */ > diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig > index 32bff6d..3abd3a4 100644 > --- a/net/netfilter/Kconfig > +++ b/net/netfilter/Kconfig > @@ -483,6 +483,23 @@ config NETFILTER_XT_TARGET_IDLETIMER > > To compile it as a module, choose M here. If unsure, say N. > > +config NETFILTER_XT_TARGET_HMARK New config option has to go in alphabetic order (this one should go after NETFILTER_XT_TARGET_HL). > + tristate '"HMARK" target support' > + depends on NETFILTER_ADVANCED > + ---help--- > + This option adds the "HMARK" target. > + > + The target allows you to create rules in the "raw" and "mangle" tables > + which alter the netfilter mark (nfmark) field within a given range. > + First a 32 bit hash value is generated then modulus by <limit> and > + finally an offset is added before it's written to nfmark. > + > + Prior to routing, the nfmark can influence the routing method (see > + "Use netfilter MARK value as routing key") and can also be used by > + other subsystems to change their behavior. > + > + The mark match can also be used to match nfmark produced by this module. > + > config NETFILTER_XT_TARGET_LED > tristate '"LED" target support' > depends on LEDS_CLASS && LEDS_TRIGGERS > diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile > index 1a02853..359eeb6 100644 > --- a/net/netfilter/Makefile > +++ b/net/netfilter/Makefile > @@ -56,6 +56,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o > obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o > obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o > obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o > +obj-$(CONFIG_NETFILTER_XT_TARGET_HMARK) += xt_hmark.o > obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o > obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o > obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o > diff --git a/net/netfilter/xt_hmark.c b/net/netfilter/xt_hmark.c > new file mode 100644 > index 0000000..2f0aa93 > --- /dev/null > +++ b/net/netfilter/xt_hmark.c > @@ -0,0 +1,320 @@ > +/* > + * xt_hmark - Netfilter module to set mark as hash value > + * > + * (C) 2010 Hans Schillstrom <hans.schillstrom@xxxxxxxxxxxx> > + * > + * Description: > + * This module calculates a hash value that can be modified by modulus > + * and an offset. The hash value is based on a direction independent > + * five tuple: src & dst addr src & dst ports and protocol. > + * However src & dst port can be masked and are not used for fragmented > + * packets, ESP and AH don't have ports so SPI will be used instead. > + * For ICMP error messages the hash mark values will be calculated on > + * the source packet i.e. the packet caused the error (If sufficient > + * amount of data exists). > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#include <linux/module.h> > +#include <linux/skbuff.h> > +#include <net/ip.h> > +#include <linux/icmp.h> > + > +#include <linux/netfilter/xt_hmark.h> > +#include <linux/netfilter/x_tables.h> > +#include <net/netfilter/nf_nat.h> > + > +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) > +# define WITH_IPV6 1 > +#include <net/ipv6.h> > +#include <linux/netfilter_ipv6/ip6_tables.h> > +#endif > + > + extra space not required. > +MODULE_LICENSE("GPL"); > +MODULE_AUTHOR("Hans Schillstrom <hans.schillstrom@xxxxxxxxxxxx>"); > +MODULE_DESCRIPTION("Xtables: packet range mark operations by hash value"); > +MODULE_ALIAS("ipt_HMARK"); > +MODULE_ALIAS("ip6t_HMARK"); > + > +/* > + * ICMP, get inner header so calc can be made on the source message > + * not the icmp header, i.e. same hash mark must be produced > + * on an icmp error message. > + */ > +static int get_inner_hdr(struct sk_buff *skb, int iphsz, int nhoff) This looks very similar to icmp_error in nf_conntrack_proto_icmp.c. Yours lacks of checksumming validation btw. I'm trying to find some place where we can put this function to make it available for both nf_conntrack_ipv4 and your module (to avoid code redundancy), but I didn't find any so far. It would be nice to find some way to avoid duplicating code with similar functionality. > +{ > + const struct icmphdr *icmph; > + struct icmphdr _ih; > + struct iphdr *iph = NULL; > + > + /* Not enough header? */ > + icmph = skb_header_pointer(skb, nhoff + iphsz, sizeof(_ih), &_ih); > + if (icmph == NULL) > + goto out; > + > + if (icmph->type > NR_ICMP_TYPES) > + goto out; > + > + extra space not required. > + /* Error message? */ > + if (icmph->type != ICMP_DEST_UNREACH && > + icmph->type != ICMP_SOURCE_QUENCH && > + icmph->type != ICMP_TIME_EXCEEDED && > + icmph->type != ICMP_PARAMETERPROB && > + icmph->type != ICMP_REDIRECT) > + goto out; > + /* Checkin full IP header plus 8 bytes of protocol to > + * avoid additional coding at protocol handlers. > + */ > + if (!pskb_may_pull(skb, nhoff + iphsz + sizeof(_ih) + 8)) > + goto out; We prefer skb_header_pointer instead. If conntrack is enabled, we can benefit from defragmention. Please, replace all pskb_may_pull by skb_header_pointer in this code. We can assume that the IP header is linear (not fragmented). > + iph = (struct iphdr *)(skb->data + nhoff + iphsz + sizeof(_ih)); > + return nhoff + iphsz + sizeof(_ih); > +out: > + return nhoff; > +} > +/* > + * ICMPv6 > + * Input nhoff Offset into network header > + * offset where ICMPv6 header starts > + * Returns true if it's a icmp error and updates nhoff > + */ > +#ifdef WITH_IPV6 > +static int get_inner6_hdr(struct sk_buff *skb, int *offset, int hdrlen) > +{ > + struct icmp6hdr *icmp6h; > + struct icmp6hdr _ih6; > + > + icmp6h = skb_header_pointer(skb, *offset + hdrlen, sizeof(_ih6), &_ih6); > + if (icmp6h == NULL) > + goto out; > + > + if (icmp6h->icmp6_type && icmp6h->icmp6_type < 128) { > + *offset += hdrlen + sizeof(_ih6); > + return 1; > + } > +out: > + return 0; > +} > +#endif > + > +/* > + * Calc hash value, special casre is taken on icmp and fragmented messages > + * i.e. fragmented messages don't use ports. > + */ > +static __u32 get_hash(struct sk_buff *skb, struct xt_hmark_info *info) This function seems to big to me, please, split it into smaller chunks, like get_hash_ipv4, get_hash_ipv6 and get_hash_ports. > +{ > + int nhoff, hash = 0, poff, proto, frag = 0; > + struct iphdr *ip; > + u8 ip_proto; > + u32 addr1, addr2, ihl; > + u16 snatport = 0, dnatport = 0; > + union { > + u32 v32; > + u16 v16[2]; > + } ports; > + > + nhoff = skb_network_offset(skb); > + proto = skb->protocol; > + > + if (!proto && skb->sk) { > + if (skb->sk->sk_family == AF_INET) > + proto = __constant_htons(ETH_P_IP); > + else if (skb->sk->sk_family == AF_INET6) > + proto = __constant_htons(ETH_P_IPV6); You already have the layer3 protocol number in xt_action_param. No need to use the socket information then. > + } > + > + switch (proto) { > + case __constant_htons(ETH_P_IP): > + { > + enum ip_conntrack_info ctinfo; > + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); > + struct nf_conntrack_tuple *otuple, *rtuple; > + > + if (!pskb_may_pull(skb, sizeof(*ip) + nhoff)) > + goto done; > + > + ip = (struct iphdr *) (skb->data + nhoff); > + if (ip->protocol == IPPROTO_ICMP) { > + /* Switch hash calc to inner header ? */ > + nhoff = get_inner_hdr(skb, ip->ihl * 4, nhoff); > + ip = (struct iphdr *) (skb->data + nhoff); > + } > + > + if (ip->frag_off & htons(IP_MF | IP_OFFSET)) > + frag = 1; > + > + ip_proto = ip->protocol; > + ihl = ip->ihl; > + addr1 = (__force u32) ip->saddr & info->smask; > + addr2 = (__force u32) ip->daddr & info->dmask; > + > + if (!ct || !nf_ct_is_confirmed(ct)) You seem to (ab)use nf_ct_is_confirmed to make sure you're not in the original direction. Better use the direction that you get by means of nf_ct_get. > + break; > + otuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; > + /* On the "return flow", to get the original address > + * i,e, replace the source address. > + */ > + if (ct->status & IPS_DST_NAT && > + info->flags & XT_HMARK_USE_DNAT) { > + rtuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; > + addr1 = (__force u32) otuple->dst.u3.in.s_addr; > + dnatport = otuple->dst.u.udp.port; > + } > + /* On the "return flow", to get the original address > + * i,e, replace the destination address. > + */ > + if (ct->status & IPS_SRC_NAT && > + info->flags & XT_HMARK_USE_SNAT) { > + rtuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; > + addr2 = (__force u32) otuple->src.u3.in.s_addr; > + snatport = otuple->src.u.udp.port; > + } > + break; > + } > +#ifdef WITH_IPV6 > + case __constant_htons(ETH_P_IPV6): > + { > + struct ipv6hdr *ip6; /* ip hdr */ > + int hdrlen = 0; /* In ip header */ > + u8 nexthdr; > + int ip6hdrlvl = 0; /* Header level */ > + struct ipv6_opt_hdr _hdr, *hp; > + > +hdr_new: > + if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff)) > + goto done; > + > + /* ip header */ > + ip6 = (struct ipv6hdr *) (skb->data + nhoff); > + nexthdr = ip6->nexthdr; > + /* nhoff += sizeof(struct ipv6hdr); Where hdr starts */ > + hdrlen = sizeof(struct ipv6hdr); > + hp = skb_header_pointer(skb, nhoff + hdrlen, sizeof(_hdr), > + &_hdr); > + while (nexthdr) { > + switch (nexthdr) { > + case IPPROTO_ICMPV6: > + /* ICMP Error then move ptr to inner header */ > + if (get_inner6_hdr(skb, &nhoff, hdrlen)) { > + ip6hdrlvl++; > + goto hdr_new; > + } > + nhoff += hdrlen; > + goto hdr_rdy; > + > + case NEXTHDR_FRAGMENT: > + if (!ip6hdrlvl) > + frag = 1; > + break; > + /* End of hdr traversing */ > + case NEXTHDR_IPV6: /* Do not process tunnels */ > + case NEXTHDR_TCP: > + case NEXTHDR_UDP: > + case NEXTHDR_ESP: > + case NEXTHDR_AUTH: > + case NEXTHDR_NONE: > + nhoff += hdrlen; > + goto hdr_rdy; > + default: > + goto done; This goto doesn't make too much sense to me, better return 0. > + } > + if (!hp) > + goto done; > + nhoff += hdrlen; /* eat current header */ > + nexthdr = hp->nexthdr; /* Next header */ > + hdrlen = ipv6_optlen(hp); > + hp = skb_header_pointer(skb, nhoff + hdrlen, > + sizeof(_hdr), &_hdr); > + > + if (!pskb_may_pull(skb, nhoff)) > + goto done; > + } > +hdr_rdy: > + ip_proto = nexthdr; > + > + addr1 = (__force u32) ip6->saddr.s6_addr32[3]; > + addr2 = (__force u32) ip6->daddr.s6_addr32[3]; > + ihl = 0; /* (40 >> 2); */ > + break; > + } > +#endif > + default: > + goto done; > + } > + > + ports.v32 = 0; > + poff = proto_ports_offset(ip_proto); > + nhoff += ihl * 4 + poff; > + if (!frag && poff >= 0 && pskb_may_pull(skb, nhoff + 4)) { > + ports.v32 = * (__force u32 *) (skb->data + nhoff); > + if (ip_proto == IPPROTO_ESP || ip_proto == IPPROTO_AH) { > + ports.v32 = (ports.v32 & info->spimask) | info->spiset; > + } else { /* Handle endian */ > + if (snatport) /* Replace snated dst port (ret flow) */ > + ports.v16[1] = snatport; > + if (dnatport) > + ports.v16[0] = dnatport; > + ports.v32 = (ports.v32 & info->pmask.v32) | > + info->pset.v32; > + if (ports.v16[1] < ports.v16[0]) > + swap(ports.v16[0], ports.v16[1]); > + } > + } > + ip_proto &= info->prmask; > + /* get a consistent hash (same value on both flow directions) */ > + if (addr2 < addr1) > + swap(addr1, addr2); > + > + hash = jhash_3words(addr1, addr2, ports.v32, info->hashrnd) ^ ip_proto; > + if (!hash) > + hash = 1; > + > + return hash; > + > +done: > + return 0; > +} I'll try to find more time to look into this. Specifically, I want to review the IPv6 bits more carefully. -- To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html