Functionality improvements * flags changed from 16 to 32 bits * fwmark added (32 bits) * timeout in sec. added (32 bits) * pe data added (Variable length) * IPv6 capabilities (3x16 bytes for addr.) * Version and type in every conn msg. ip_vs_process_message() now handles Version 1 messages and will call ip_vs_process_message_v0() for version 0 messages. ip_vs_proc_conn() is common to both versions and handles the update of the connection hash. ip_vs_conn_fill_param_sync() - Version 1 messages only ip_vs_conn_fill_param_sync_v0() - Version 0 messages only *v2 A new option format (opt, opt-len, data) was added as a general way to add options to a conn entry. timeout is now in seconds. fwmark is not in ip_vs_conn_param any more. Mask for flags received by backup. Basically, all changes implement Julian's comments. Signed-off-by: Hans Schillstrom <hans.schillstrom@xxxxxxxxxxxx> --- include/linux/ip_vs.h | 8 + net/netfilter/ipvs/ip_vs_sync.c | 468 +++++++++++++++++++++++++++++---------- 2 files changed, 358 insertions(+), 118 deletions(-) diff --git a/include/linux/ip_vs.h b/include/linux/ip_vs.h index 5f43a3b..4deb383 100644 --- a/include/linux/ip_vs.h +++ b/include/linux/ip_vs.h @@ -89,6 +89,14 @@ #define IP_VS_CONN_F_TEMPLATE 0x1000 /* template, not connection */ #define IP_VS_CONN_F_ONE_PACKET 0x2000 /* forward only one packet */ +#define IP_VS_CONN_F_BACKUP_MASK (IP_VS_CONN_F_FWD_MASK | \ + IP_VS_CONN_F_NOOUTPUT | \ + IP_VS_CONN_F_INACTIVE | \ + IP_VS_CONN_F_SEQ_MASK | \ + IP_VS_CONN_F_NO_CPORT | \ + IP_VS_CONN_F_TEMPLATE \ + ) + /* Flags that are not sent to backup server start from bit 16 */ #define IP_VS_CONN_F_NFCT (1 << 16) /* use netfilter conntrack */ diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index f7f115d..5aa8e04 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -15,6 +15,7 @@ * Alexandre Cassen : Added SyncID support for incoming sync * messages filtering. 
* Justin Ossevoort : Fix endian problem on sync message size. + * Hans Schillstrom : Added IPv6 and Persistence support */ #define KMSG_COMPONENT "IPVS" @@ -412,53 +413,173 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp) ip_vs_sync_conn(cp->control); } +/* + * fill_param used for proto version 0 + */ static inline int -ip_vs_conn_fill_param_sync(int af, int protocol, - const union nf_inet_addr *caddr, __be16 cport, - const union nf_inet_addr *vaddr, __be16 vport, +ip_vs_conn_fill_param_sync_v0(int af, struct ip_vs_sync_conn_v0 *sc, struct ip_vs_conn_param *p) { - /* XXX: Need to take into account persistence engine */ - ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p); + ip_vs_conn_fill_param(af, sc->protocol, + (const union nf_inet_addr *)&sc->caddr, + sc->cport, + (const union nf_inet_addr *)&sc->vaddr, + sc->vport, p); return 0; } /* - * Process received multicast message and create the corresponding - * ip_vs_conn entries. + * fill_param used by version 1 */ -static void ip_vs_process_message(const char *buffer, const size_t buflen) +static inline int +ip_vs_conn_fill_param_sync(int af, union ip_vs_sync_conn *sc, + struct ip_vs_conn_param *p, char *pe_data, + int pe_data_len, char *pe_name, int pe_name_len ) +{ +#ifdef CONFIG_IP_VS_IPV6 + if ( af == AF_INET6 ) + ip_vs_conn_fill_param(af, sc->v6.protocol, + (const union nf_inet_addr *)&sc->v6.caddr, + sc->v6.cport, + (const union nf_inet_addr *)&sc->v6.vaddr, + sc->v6.vport, p); + else +#endif + ip_vs_conn_fill_param(af, sc->v4.protocol, + (const union nf_inet_addr *)&sc->v4.caddr, + sc->v4.cport, + (const union nf_inet_addr *)&sc->v4.vaddr, + sc->v4.vport, p); + /* Handle pe data */ + if (pe_data_len && pe_data ) { + IP_VS_DBG(10, "%s() pe_data=%s\n", __func__, pe_data); + p->pe_data = kmalloc(pe_data_len, GFP_ATOMIC); + if (!p->pe_data) + return -ENOMEM; + memcpy(p->pe_data, pe_data, pe_data_len); + p->pe_data_len = pe_data_len; + if (pe_name && pe_name_len) { + p->pe = ip_vs_pe_get(pe_name); 
+ IP_VS_DBG(10, "%s() pe_name=%s\n", __func__, pe_name); + } + } + return 0; +} + +/* + * Connection Add / Update. + * Common for version 0 and 1 reception of backup messages. + * Param: ... + * timeout is in sec. + */ +static void ip_vs_proc_conn(struct ip_vs_conn_param *param, unsigned flags, + unsigned state, unsigned protocol, unsigned type, + const union nf_inet_addr *daddr, __be16 dport, + unsigned long timeout, __u32 fwmark, + struct ip_vs_sync_conn_options *opt, + struct ip_vs_protocol *pp ) +{ + struct ip_vs_dest *dest; + struct ip_vs_conn *cp; + + + if (!(flags & IP_VS_CONN_F_TEMPLATE)) + cp = ip_vs_conn_in_get(param); + else + cp = ip_vs_ct_in_get(param); + if (!cp) { + /* + * Find the appropriate destination for the connection. + * If it is not found the connection will remain unbound + * but still handled. + */ + dest = ip_vs_find_dest(type, daddr, dport, param->vaddr, + param->vport, protocol, fwmark); + /* Set the approprite ativity flag */ + if (protocol == IPPROTO_TCP) { + if (state != IP_VS_TCP_S_ESTABLISHED) + flags |= IP_VS_CONN_F_INACTIVE; + else + flags &= ~IP_VS_CONN_F_INACTIVE; + } else if (protocol == IPPROTO_SCTP) { + if (state != IP_VS_SCTP_S_ESTABLISHED) + flags |= IP_VS_CONN_F_INACTIVE; + else + flags &= ~IP_VS_CONN_F_INACTIVE; + } + cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark); + if (dest) + atomic_dec(&dest->refcnt); + if (!cp) { + pr_err("ip_vs_conn_new failed\n"); + return; + } + } else if (!cp->dest) { + dest = ip_vs_try_bind_dest(cp); + if (dest) + atomic_dec(&dest->refcnt); + } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && + (cp->state != state)) { + /* update active/inactive flag for the connection */ + dest = cp->dest; + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (state != IP_VS_TCP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (state == IP_VS_TCP_S_ESTABLISHED)) { + 
atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) && + (cp->state != state)) { + dest = cp->dest; + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (state != IP_VS_SCTP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } + + if (opt) + memcpy(&cp->in_seq, opt, sizeof(*opt)); + atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); + cp->state = state; + cp->old_state = cp->state; + /* + * For old messages style + * - Not possible to recover the right timeout for templates + * - can not find the right fwmark + * virtual service. If needed, we can do it for + * non-fwmark persistent services. + * New messages style + * - No problem. + */ + if (timeout) + cp->timeout = timeout*HZ; + else if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table) + cp->timeout = pp->timeout_table[state]; + else + cp->timeout = (3*60*HZ); + ip_vs_conn_put(cp); +} + +/* + * Process received multicast message for Version 0 + */ +static void ip_vs_process_message_v0(const char *buffer, const size_t buflen) { struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; struct ip_vs_sync_conn_v0 *s; struct ip_vs_sync_conn_options *opt; - struct ip_vs_conn *cp; struct ip_vs_protocol *pp; - struct ip_vs_dest *dest; struct ip_vs_conn_param param; char *p; int i; - if (buflen < sizeof(struct ip_vs_sync_mesg)) { - IP_VS_ERR_RL("sync message header too short\n"); - return; - } - - /* Convert size back to host byte order */ - m->size = ntohs(m->size); - - if (buflen != m->size) { - IP_VS_ERR_RL("bogus sync message size\n"); - return; - } - - /* SyncID sanity check */ - if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) { - IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", - m->syncid); - return; - } - p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); for (i=0; i<m->nr_conns; i++) 
{ unsigned flags, state; @@ -505,103 +626,214 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) } } - { - if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol, - (union nf_inet_addr *)&s->caddr, - s->cport, - (union nf_inet_addr *)&s->vaddr, - s->vport, &param)) { - pr_err("ip_vs_conn_fill_param_sync failed"); - return; - } - if (!(flags & IP_VS_CONN_F_TEMPLATE)) - cp = ip_vs_conn_in_get(&param); - else - cp = ip_vs_ct_in_get(&param); + if (ip_vs_conn_fill_param_sync_v0(AF_INET, s, &param)) { + pr_err("ip_vs_conn_fill_param_sync failed"); + return; } - if (!cp) { - /* - * Find the appropriate destination for the connection. - * If it is not found the connection will remain unbound - * but still handled. - */ - dest = ip_vs_find_dest(AF_INET, - (union nf_inet_addr *)&s->daddr, - s->dport, - (union nf_inet_addr *)&s->vaddr, - s->vport, - s->protocol, 0); - /* Set the approprite ativity flag */ - if (s->protocol == IPPROTO_TCP) { - if (state != IP_VS_TCP_S_ESTABLISHED) - flags |= IP_VS_CONN_F_INACTIVE; - else - flags &= ~IP_VS_CONN_F_INACTIVE; - } else if (s->protocol == IPPROTO_SCTP) { - if (state != IP_VS_SCTP_S_ESTABLISHED) - flags |= IP_VS_CONN_F_INACTIVE; - else - flags &= ~IP_VS_CONN_F_INACTIVE; + /* Send timeout as Zero */ + ip_vs_proc_conn(&param, flags, state, s-> protocol, AF_INET, + (union nf_inet_addr *)&s->daddr, s->dport, + 0, 0, opt, pp ); + + } +} + +/* + * Process received multicast message and create the corresponding + * ip_vs_conn entries. 
+ * Handles Version 0 & 1 + */ +static void ip_vs_process_message(const char *buffer, const size_t buflen) +{ + struct ip_vs_sync_mesg_v2 *m2 = (struct ip_vs_sync_mesg_v2 *)buffer; + union ip_vs_sync_conn *s; + struct ip_vs_protocol *pp; + struct ip_vs_conn_param param; + char *p; + int i, af, nr_conns; + + if (buflen < sizeof(struct ip_vs_sync_mesg)) { + IP_VS_ERR_RL("sync message header too short\n"); + return; + } + + /* Convert size back to host byte order */ + m2->size = ntohs(m2->size); + + if (buflen != m2->size) { + IP_VS_ERR_RL("bogus sync message size\n"); + return; + } + + /* SyncID sanity check */ + if (ip_vs_backup_syncid != 0 && m2->syncid != ip_vs_backup_syncid) { + IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", + m2->syncid); + return; + } + /* Prepare ptrs for version 1 or 2 message */ + if ( m2->version==SYNC_PROTO_VER && m2->reserverd==0 && m2->spare==0) { + p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v2); + nr_conns = m2->nr_conns; + IP_VS_DBG(7, "%s Message v 1, %d bytes with %d conns\n", + __func__, m2->size & SVER_MASK, m2->nr_conns); + } else { + /* Old type of message */ + ip_vs_process_message_v0(buffer, buflen); + return; + } + + for (i=0; i<nr_conns; i++) { + char *msgEnd; + __u32 flags; + unsigned state, size, opt_flags=0; + struct ip_vs_sync_conn_options opt; + char *pe_data=NULL, *pe_name=NULL; + unsigned pe_data_len=0, pe_name_len=0; + unsigned unknownOpt=0; + + s = (union ip_vs_sync_conn *) p; + size = ntohs(s->v4.ver_size) & SVER_MASK; + msgEnd = p + size; + + if (msgEnd > buffer+buflen) { + IP_VS_ERR_RL("bogus conn/size in sync message\n"); + return; + } + if (ntohs( s->v4.ver_size) >> SVER_SHIFT) { + IP_VS_ERR_RL("Unknown version %d in sync message\n", + ntohs( s->v4.ver_size) >> SVER_SHIFT); + return; + } + + if (s->v6.type & STYPE_INET6 ) { +#ifdef CONFIG_IP_VS_IPV6 + af = AF_INET6; + p += sizeof(struct ip_vs_sync_v6); +#else + IP_VS_DBG(2,"IPv6 sync message received, and IPVS is not compiled for IPv6\n"); + p 
= (char *)s + size; + continue; +#endif + } else { + af = AF_INET; + p += sizeof(struct ip_vs_sync_v4); + } + /* Mask off unsupported flags */ + flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK; + flags |= IP_VS_CONN_F_SYNC; + state = ntohs(s->v4.state); + + if (p > buffer+buflen) { + IP_VS_ERR_RL("bogus conn in sync message\n"); + return; + } + /* Process options check optType & optLen + * *p = opt type + * *(p+1)= opt len + */ + while ( p< msgEnd && *p && *(p+1) && p + *(p+1) < msgEnd ) { + switch (*(p++) & 0x7f) { + /* Handle seq option *p = opt len */ + case IPVS_OPT_SEQ_DATA: + if(*p != sizeof(struct ip_vs_sync_conn_options)) { + IP_VS_ERR_RL("bogus conn options in sync message\n"); + return; + } + memcpy((void*)&opt, p+1, *p); + opt_flags |= IPVS_OPT_F_SEQ_DATA; + break; + + case IPVS_OPT_PE_DATA: + if (*p > IP_VS_PEDATA_MAXLEN) { + IP_VS_ERR_RL("bogus pe_data len in sync message\n"); + return; + } + pe_data_len = *p; + pe_data = p+1; + opt_flags |= IPVS_OPT_F_PE_DATA; + break; + case IPVS_OPT_PE_NAME: + if (*p > (IP_VS_PENAME_MAXLEN+1)) { + IP_VS_ERR_RL("bogus pe_name len in sync message\n"); + return; + } + pe_name_len = *p; + pe_name = p+1; + if (*(p+pe_name_len)) { + IP_VS_ERR_RL("bogus pe_name no trailing null in sync message\n"); + return; + } + opt_flags |= IPVS_OPT_F_PE_NAME; + break; + default: + /* Optional data ? 
*/ + if( !(*(p-1) & STYPE_OPT_DATA)) { + IP_VS_ERR_RL("Unknown mandatory option found %d in sync message\n", *(p-1)); + return; + } + IP_VS_DBG(2,"Unknown option found %d in sync message\n", + *(p-1)); + unknownOpt = 1; } - cp = ip_vs_conn_new(&param, - (union nf_inet_addr *)&s->daddr, - s->dport, flags, dest, 0); - if (dest) - atomic_dec(&dest->refcnt); - if (!cp) { - pr_err("ip_vs_conn_new failed\n"); - return; + p += (*p) + 1; /* Next option */ + } + if (unknownOpt) + continue; /* Skip this take next */ + + + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + pp = ip_vs_proto_get(s->v4.protocol); + if (!pp) { + IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n", + s->v4.protocol); + continue; } - } else if (!cp->dest) { - dest = ip_vs_try_bind_dest(cp); - if (dest) - atomic_dec(&dest->refcnt); - } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && - (cp->state != state)) { - /* update active/inactive flag for the connection */ - dest = cp->dest; - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (state != IP_VS_TCP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags |= IP_VS_CONN_F_INACTIVE; - } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && - (state == IP_VS_TCP_S_ESTABLISHED)) { - atomic_inc(&dest->activeconns); - atomic_dec(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; + if (state >= pp->num_states) { + IP_VS_DBG(2, "Invalid %s state %u in sync msg\n", + pp->name, state); + continue; } - } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) && - (cp->state != state)) { - dest = cp->dest; - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (state != IP_VS_SCTP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } else { + /* protocol in templates is not used for state/timeout */ + pp = NULL; + if (state > 0) { + IP_VS_DBG(2, "Invalid template state %u in sync msg\n", + state); + state = 0; } } - - if (opt) - memcpy(&cp->in_seq, opt, 
sizeof(*opt)); - atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); - cp->state = state; - cp->old_state = cp->state; - /* - * We can not recover the right timeout for templates - * in all cases, we can not find the right fwmark - * virtual service. If needed, we can do it for - * non-fwmark persistent services. - */ - if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table) - cp->timeout = pp->timeout_table[state]; + if (ip_vs_conn_fill_param_sync(af, s, &param, + pe_data, pe_data_len, + pe_name, pe_name_len)) { + pr_err("ip_vs_conn_fill_param_sync failed"); + return; + } + /* If only IPv4, just silent skip IPv6 */ + if ( af == AF_INET ) + ip_vs_proc_conn(&param, flags, state, s->v4.protocol, + af, + (union nf_inet_addr *)&s->v4.daddr, + s->v4.dport, ntohl(s->v4.timeout), + ntohl(s->v4.fwmark), + (opt_flags & IPVS_OPT_F_SEQ_DATA + ? &opt : NULL), + pp); +#ifdef CONFIG_IP_VS_IPV6 else - cp->timeout = (3*60*HZ); - ip_vs_conn_put(cp); - } + ip_vs_proc_conn(&param, flags, state, s->v6.protocol, + af, + (union nf_inet_addr *)&s->v6.daddr, + s->v6.dport, ntohl(s->v6.timeout), + ntohl(s->v6.fwmark), + (opt_flags & IPVS_OPT_F_SEQ_DATA + ? &opt : NULL), + pp); +#endif + p = (char *)s + size; + } /* End of for(...) */ } - /* * Setup loopback of outgoing multicasts on a sending socket */ -- 1.7.0.1 -- To unsubscribe from this list: send the line "unsubscribe lvs-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html