The netfilter hook list never uses the prev pointer, and so can be trimmed to be a smaller singly-linked list. In addition to having a more light weight structure for hook traversal, struct net becomes 5568 bytes (down from 6400) and struct net_device becomes 2176 bytes (down from 2240). Signed-off-by: Aaron Conole <aconole@xxxxxxxxxx> Signed-off-by: Florian Westphal <fw@xxxxxxxxx> --- NOTE: The unregister list code is ugly right now - I will be fixing it to a cleaner version in the next round of submission. include/linux/netdevice.h | 2 +- include/linux/netfilter.h | 18 +++--- include/linux/netfilter_ingress.h | 14 +++-- include/net/netfilter/nf_queue.h | 9 ++- include/net/netns/netfilter.h | 2 +- net/bridge/br_netfilter_hooks.c | 21 +++---- net/netfilter/core.c | 115 ++++++++++++++++++++++++-------------- net/netfilter/nf_internals.h | 10 ++-- net/netfilter/nf_queue.c | 15 +++-- net/netfilter/nfnetlink_queue.c | 5 +- 10 files changed, 121 insertions(+), 90 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e84d9d2..8235f67 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1747,7 +1747,7 @@ struct net_device { #endif struct netdev_queue __rcu *ingress_queue; #ifdef CONFIG_NETFILTER_INGRESS - struct list_head nf_hooks_ingress; + struct nf_hook_entry __rcu *nf_hooks_ingress; #endif unsigned char broadcast[MAX_ADDR_LEN]; diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index ad444f0..3390a84 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -55,12 +55,12 @@ struct nf_hook_state { struct net_device *out; struct sock *sk; struct net *net; - struct list_head *hook_list; + struct nf_hook_entry *hook_list; int (*okfn)(struct net *, struct sock *, struct sk_buff *); }; static inline void nf_hook_state_init(struct nf_hook_state *p, - struct list_head *hook_list, + struct nf_hook_entry *hook_list, unsigned int hook, int thresh, u_int8_t pf, struct net_device *indev, @@ -97,6 +97,12 @@ struct nf_hook_ops { int priority; }; +struct nf_hook_entry { + struct nf_hook_entry __rcu *next; + struct nf_hook_ops ops; + const struct nf_hook_ops *orig_ops; +}; + struct nf_sockopt_ops { struct list_head list; @@ -161,8 +167,6 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, int (*okfn)(struct net *, struct sock *, struct sk_buff *), int thresh) { - struct list_head *hook_list; - #ifdef HAVE_JUMP_LABEL if (__builtin_constant_p(pf) && __builtin_constant_p(hook) && @@ -170,14 +174,14 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, return 1; #endif - hook_list = &net->nf.hooks[pf][hook]; - - if (!list_empty(hook_list)) { + if (rcu_access_pointer(net->nf.hooks[pf][hook])) { + struct nf_hook_entry *hook_list; struct nf_hook_state state; int ret; /* We may already have this, but read-locks nest anyway */ rcu_read_lock(); + hook_list = rcu_dereference(net->nf.hooks[pf][hook]); nf_hook_state_init(&state, hook_list, hook, thresh, pf, indev, outdev, sk, net, okfn); diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h index 6965ba0..e3e3f6d 100644 --- a/include/linux/netfilter_ingress.h +++ b/include/linux/netfilter_ingress.h @@ -11,23 +11,27 @@ static inline bool nf_hook_ingress_active(const struct sk_buff *skb) if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS])) return false; #endif - return !list_empty(&skb->dev->nf_hooks_ingress); + return rcu_access_pointer(skb->dev->nf_hooks_ingress) != NULL; } /* caller must hold rcu_read_lock */ static inline int nf_hook_ingress(struct sk_buff *skb) { + struct nf_hook_entry *e = rcu_dereference(skb->dev->nf_hooks_ingress); struct nf_hook_state state; - nf_hook_state_init(&state, &skb->dev->nf_hooks_ingress, - NF_NETDEV_INGRESS, INT_MIN, NFPROTO_NETDEV, - skb->dev, NULL, NULL, dev_net(skb->dev), NULL); + if (unlikely(!e)) + return 0; + + nf_hook_state_init(&state, e, NF_NETDEV_INGRESS, INT_MIN, + NFPROTO_NETDEV, skb->dev, NULL, NULL, + dev_net(skb->dev), NULL); return nf_hook_slow(skb, &state); } static inline void nf_hook_ingress_init(struct net_device *dev) { - INIT_LIST_HEAD(&dev->nf_hooks_ingress); + RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); } #else /* CONFIG_NETFILTER_INGRESS */ static inline int nf_hook_ingress_active(struct sk_buff *skb) diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 0dbce55..1da5bc0 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -11,7 +11,6 @@ struct nf_queue_entry { struct sk_buff *skb; unsigned int id; - struct nf_hook_ops *elem; struct nf_hook_state state; u16 size; /* sizeof(entry) + saved route keys */ @@ -22,10 +21,10 @@ struct nf_queue_entry { /* Packet queuing */ struct nf_queue_handler { - int (*outfn)(struct nf_queue_entry *entry, - unsigned int queuenum); - void (*nf_hook_drop)(struct net *net, - struct nf_hook_ops *ops); + int (*outfn)(struct nf_queue_entry *entry, + unsigned int queuenum); + void (*nf_hook_drop)(struct net *net, + const struct nf_hook_entry *ops); }; void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh); diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index 36d7235..58487b1 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -16,6 +16,6 @@ struct netns_nf { #ifdef CONFIG_SYSCTL struct ctl_table_header *nf_log_dir_header; #endif - struct list_head hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; + struct nf_hook_entry __rcu *hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; }; #endif diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 19f230c..88bb8ed 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -1000,28 +1000,21 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct net *, struct sock *, - struct sk_buf *)) + struct sk_buff *)) { - struct nf_hook_ops *elem; + struct nf_hook_entry *elem; struct nf_hook_state state; - struct list_head *head; int ret; - head = &net->nf.hooks[NFPROTO_BRIDGE][hook]; + elem = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]); - list_for_each_entry_rcu(elem, head, list) { - struct nf_hook_ops *next; + while (elem && (elem->ops.priority <= NF_BR_PRI_BRNF)) + elem = rcu_dereference(elem->next); - next = list_entry_rcu(list_next_rcu(&elem->list), - struct nf_hook_ops, list); - if (next->priority <= NF_BR_PRI_BRNF) - continue; - } - - if (&elem->list == head) + if (!elem) return okfn(net, sk, skb); - nf_hook_state_init(&state, head, hook, NF_BR_PRI_BRNF + 1, + nf_hook_state_init(&state, elem, hook, NF_BR_PRI_BRNF + 1, NFPROTO_BRIDGE, indev, outdev, sk, net, okfn); ret = nf_hook_slow(skb, &state); diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 6561d5c..f8df4e4 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -22,6 +22,7 @@ #include <linux/proc_fs.h> #include <linux/mutex.h> #include <linux/slab.h> +#include <linux/rcupdate.h> #include <net/net_namespace.h> #include <net/sock.h> @@ -61,33 +62,52 @@ EXPORT_SYMBOL(nf_hooks_needed); #endif static DEFINE_MUTEX(nf_hook_mutex); +#define nf_entry_dereference(e) \ + rcu_dereference_protected(e, lockdep_is_held(&nf_hook_mutex)) -static struct list_head *nf_find_hook_list(struct net *net, - const struct nf_hook_ops *reg) +static struct nf_hook_entry *nf_find_hook_list(struct net *net, + const struct nf_hook_ops *reg) { - struct list_head *hook_list = NULL; + struct nf_hook_entry *hook_list = NULL; if (reg->pf != NFPROTO_NETDEV) - hook_list = &net->nf.hooks[reg->pf][reg->hooknum]; + hook_list = rcu_dereference(net->nf.hooks[reg->pf] + [reg->hooknum]); else if (reg->hooknum == NF_NETDEV_INGRESS) { #ifdef CONFIG_NETFILTER_INGRESS if (reg->dev && dev_net(reg->dev) == net) - hook_list = ®->dev->nf_hooks_ingress; + hook_list = + rcu_dereference(reg->dev->nf_hooks_ingress); #endif } return hook_list; } -struct nf_hook_entry { - const struct nf_hook_ops *orig_ops; - struct nf_hook_ops ops; -}; +/* must hold nf_hook_mutex */ +static void nf_set_hook_list(struct net *net, const struct nf_hook_ops *reg, + struct nf_hook_entry *e) +{ + if (reg->pf != NFPROTO_NETDEV) { + rcu_assign_pointer(net->nf.hooks[reg->pf][reg->hooknum], e); +#ifdef CONFIG_NETFILTER_INGRESS + } else if (reg->hooknum == NF_NETDEV_INGRESS) { + rcu_assign_pointer(reg->dev->nf_hooks_ingress, e); +#endif + } else { + net_warn_ratelimited("pf %d, hooknum %d: not set\n", + reg->pf, reg->hooknum); + } +} int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) { - struct list_head *hook_list; + struct nf_hook_entry *hook_list; struct nf_hook_entry *entry; - struct nf_hook_ops *elem; + + if (reg->pf == NFPROTO_NETDEV && + (reg->hooknum != NF_NETDEV_INGRESS || + !reg->dev || dev_net(reg->dev) != net)) + return -EINVAL; entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) @@ -96,18 +116,20 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) entry->orig_ops = reg; entry->ops = *reg; + mutex_lock(&nf_hook_mutex); hook_list = nf_find_hook_list(net, reg); - if (!hook_list) { - kfree(entry); - return -ENOENT; + entry->next = hook_list; + while (hook_list && reg->priority >= hook_list->orig_ops->priority && + hook_list->next) { + hook_list = nf_entry_dereference(hook_list->next); } - mutex_lock(&nf_hook_mutex); - list_for_each_entry(elem, hook_list, list) { - if (reg->priority < elem->priority) - break; + if (hook_list) { + entry->next = hook_list->next; + rcu_assign_pointer(hook_list->next, entry); + } else { + nf_set_hook_list(net, reg, entry); } - list_add_rcu(&entry->ops.list, elem->list.prev); mutex_unlock(&nf_hook_mutex); #ifdef CONFIG_NETFILTER_INGRESS if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) @@ -122,24 +144,29 @@ EXPORT_SYMBOL(nf_register_net_hook); void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { - struct list_head *hook_list; - struct nf_hook_entry *entry; - struct nf_hook_ops *elem; + struct nf_hook_entry *entry, *prev_prio; - hook_list = nf_find_hook_list(net, reg); - if (!hook_list) + entry = nf_find_hook_list(net, reg); + if (!entry) return; + prev_prio = NULL; mutex_lock(&nf_hook_mutex); - list_for_each_entry(elem, hook_list, list) { - entry = container_of(elem, struct nf_hook_entry, ops); + while (entry) { + struct nf_hook_entry *next = + nf_entry_dereference(entry->next); if (entry->orig_ops == reg) { - list_del_rcu(&entry->ops.list); + if (prev_prio) + rcu_assign_pointer(prev_prio->next, next); + else + nf_set_hook_list(net, reg, next); break; } + prev_prio = entry; + entry = next; } mutex_unlock(&nf_hook_mutex); - if (&elem->list == hook_list) { + if (!entry) { WARN(1, "nf_unregister_net_hook: hook not found!\n"); return; } @@ -151,7 +178,7 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); - nf_queue_nf_hook_drop(net, &entry->ops); + nf_queue_nf_hook_drop(net, entry); /* other cpu might still process nfqueue verdict that used reg */ synchronize_net(); kfree(entry); @@ -193,6 +220,8 @@ int nf_register_hook(struct nf_hook_ops *reg) struct net *net, *last; int ret; + WARN_ON(reg->priv); + rtnl_lock(); for_each_net(net) { ret = nf_register_net_hook(net, reg); @@ -253,10 +282,9 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) } EXPORT_SYMBOL(nf_unregister_hooks); -unsigned int nf_iterate(struct list_head *head, - struct sk_buff *skb, +unsigned int nf_iterate(struct sk_buff *skb, struct nf_hook_state *state, - struct nf_hook_ops **elemp) + struct nf_hook_entry **elemp) { unsigned int verdict; @@ -264,20 +292,20 @@ unsigned int nf_iterate(struct list_head *head, * The caller must not block between calls to this * function because of risk of continuing from deleted element. */ - list_for_each_entry_continue_rcu((*elemp), head, list) { - if (state->thresh > (*elemp)->priority) + while (*elemp) { + if (state->thresh > (*elemp)->ops.priority) continue; /* Optimization: we don't need to hold module reference here, since function can't sleep. --RR */ repeat: - verdict = (*elemp)->hook((*elemp)->priv, skb, state); + verdict = (*elemp)->ops.hook((*elemp)->ops.priv, skb, state); if (verdict != NF_ACCEPT) { #ifdef CONFIG_NETFILTER_DEBUG if (unlikely((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT)) { NFDEBUG("Evil return from %p(%u).\n", - (*elemp)->hook, state->hook); + (*elemp)->ops.hook, state->hook); continue; } #endif @@ -285,6 +313,7 @@ repeat: return verdict; goto repeat; } + *elemp = (*elemp)->next; } return NF_ACCEPT; } @@ -295,13 +324,13 @@ repeat: * -EPERM for NF_DROP, 0 otherwise. */ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state) { - struct nf_hook_ops *elem; + struct nf_hook_entry *elem; unsigned int verdict; int ret = 0; - elem = list_entry_rcu(state->hook_list, struct nf_hook_ops, list); + elem = state->hook_list; next_hook: - verdict = nf_iterate(state->hook_list, skb, state, &elem); + verdict = nf_iterate(skb, state, &elem); if (verdict == NF_ACCEPT || verdict == NF_STOP) { ret = 1; } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) { @@ -310,8 +339,10 @@ next_hook: if (ret == 0) ret = -EPERM; } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { - int err = nf_queue(skb, elem, state, - verdict >> NF_VERDICT_QBITS); + int err; + + state->hook_list = elem; + err = nf_queue(skb, state, verdict >> NF_VERDICT_QBITS); if (err < 0) { if (err == -ESRCH && (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) @@ -438,7 +469,7 @@ static int __net_init netfilter_net_init(struct net *net) for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) { for (h = 0; h < NF_MAX_HOOKS; h++) - INIT_LIST_HEAD(&net->nf.hooks[i][h]); + RCU_INIT_POINTER(net->nf.hooks[i][h], NULL); } #ifdef CONFIG_PROC_FS diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index 0655225..6c0846d 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -13,13 +13,13 @@ /* core.c */ -unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, - struct nf_hook_state *state, struct nf_hook_ops **elemp); +unsigned int nf_iterate(struct sk_buff *skb, struct nf_hook_state *state, + struct nf_hook_entry **elemp); /* nf_queue.c */ -int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem, - struct nf_hook_state *state, unsigned int queuenum); -void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops); +int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, + unsigned int queuenum); +void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *e); int __init netfilter_queue_init(void); /* nf_log.c */ diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index b19ad20..0cb131b 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -96,14 +96,14 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry) } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); -void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops) +void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *e) { const struct nf_queue_handler *qh; rcu_read_lock(); qh = rcu_dereference(net->nf.queue_handler); if (qh) - qh->nf_hook_drop(net, ops); + qh->nf_hook_drop(net, e); rcu_read_unlock(); } @@ -112,7 +112,6 @@ void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops) * through nf_reinject(). */ int nf_queue(struct sk_buff *skb, - struct nf_hook_ops *elem, struct nf_hook_state *state, unsigned int queuenum) { @@ -141,7 +140,6 @@ int nf_queue(struct sk_buff *skb, *entry = (struct nf_queue_entry) { .skb = skb, - .elem = elem, .state = *state, .size = sizeof(*entry) + afinfo->route_key_size, }; @@ -166,7 +164,8 @@ err: void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) { struct sk_buff *skb = entry->skb; - struct nf_hook_ops *elem = entry->elem; + struct nf_hook_entry *e = entry->state.hook_list; + struct nf_hook_ops *elem = &e->ops; const struct nf_afinfo *afinfo; int err; @@ -186,8 +185,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) if (verdict == NF_ACCEPT) { next_hook: - verdict = nf_iterate(entry->state.hook_list, - skb, &entry->state, &elem); + verdict = nf_iterate(skb, &entry->state, &e); } switch (verdict & NF_VERDICT_MASK) { @@ -198,7 +196,8 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) local_bh_enable(); break; case NF_QUEUE: - err = nf_queue(skb, elem, &entry->state, + entry->state.hook_list = e; + err = nf_queue(skb, &entry->state, verdict >> NF_VERDICT_QBITS); if (err < 0) { if (err == -ESRCH && diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 216bf69..60d3e51 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -919,10 +919,11 @@ static struct notifier_block nfqnl_dev_notifier = { static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long ops_ptr) { - return entry->elem == (struct nf_hook_ops *)ops_ptr; + return entry->state.hook_list == (struct nf_hook_entry *)ops_ptr; } -static void nfqnl_nf_hook_drop(struct net *net, struct nf_hook_ops *hook) +static void nfqnl_nf_hook_drop(struct net *net, + const struct nf_hook_entry *hook) { struct nfnl_queue_net *q = nfnl_queue_pernet(net); int i; -- 2.5.5 -- To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html