Re: [PATCH] ipvs: Fix race conditions in lblcr scheduler

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, Aug 18, 2008 at 12:53:05AM +0200, Sven Wegener wrote:
> We can't access the cache entry outside of our critical read-locked region,
> because someone may free that entry. Also getting an entry under read lock,
> then locking for write and trying to delete that entry looks fishy, but should
> be no problem here, because we're only comparing a pointer. Also there is no
> need for our own rwlock, there is already one in the service structure for use
> in the schedulers.

Hi Sven,

this looks good to me. Just a few minor comments inline.

> Signed-off-by: Sven Wegener <sven.wegener@xxxxxxxxxxx>
> ---
>  net/ipv4/ipvs/ip_vs_lblcr.c |  229 +++++++++++++++++++++----------------------
>  1 files changed, 114 insertions(+), 115 deletions(-)
> 
> diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
> index f1c8450..96bfdc2 100644
> --- a/net/ipv4/ipvs/ip_vs_lblcr.c
> +++ b/net/ipv4/ipvs/ip_vs_lblcr.c
> @@ -106,7 +106,7 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
>  			return NULL;
>  	}
>  
> -	e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
> +	e = kmalloc(sizeof(*e), GFP_ATOMIC);

I think that I prefer using struct ip_vs_dest_list rather than *e.
Ditto for *tbl below.

>  	if (e == NULL) {
>  		IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
>  		return NULL;
> @@ -116,11 +116,9 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
>  	e->dest = dest;
>  
>  	/* link it to the list */
> -	write_lock(&set->lock);
>  	e->next = set->list;
>  	set->list = e;
>  	atomic_inc(&set->size);
> -	write_unlock(&set->lock);
>  
>  	set->lastmod = jiffies;
>  	return e;
> @@ -131,7 +129,6 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
>  {
>  	struct ip_vs_dest_list *e, **ep;
>  
> -	write_lock(&set->lock);
>  	for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
>  		if (e->dest == dest) {
>  			/* HIT */
> @@ -144,7 +141,6 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
>  		}
>  		ep = &e->next;
>  	}
> -	write_unlock(&set->lock);
>  }
>  
>  static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
> @@ -174,7 +170,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
>  	if (set == NULL)
>  		return NULL;
>  
> -	read_lock(&set->lock);
>  	/* select the first destination server, whose weight > 0 */
>  	for (e=set->list; e!=NULL; e=e->next) {
>  		least = e->dest;
> @@ -188,7 +183,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
>  			goto nextstage;
>  		}
>  	}
> -	read_unlock(&set->lock);
>  	return NULL;
>  
>  	/* find the destination with the weighted least load */
> @@ -207,7 +201,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
>  			loh = doh;
>  		}
>  	}
> -	read_unlock(&set->lock);
>  
>  	IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
>  		  "activeconns %d refcnt %d weight %d overhead %d\n",
> @@ -229,7 +222,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
>  	if (set == NULL)
>  		return NULL;
>  
> -	read_lock(&set->lock);
>  	/* select the first destination server, whose weight > 0 */
>  	for (e=set->list; e!=NULL; e=e->next) {
>  		most = e->dest;
> @@ -239,7 +231,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
>  			goto nextstage;
>  		}
>  	}
> -	read_unlock(&set->lock);
>  	return NULL;
>  
>  	/* find the destination with the weighted most load */
> @@ -256,7 +247,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
>  			moh = doh;
>  		}
>  	}
> -	read_unlock(&set->lock);
>  
>  	IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
>  		  "activeconns %d refcnt %d weight %d overhead %d\n",
> @@ -284,7 +274,6 @@ struct ip_vs_lblcr_entry {
>   *      IPVS lblcr hash table
>   */
>  struct ip_vs_lblcr_table {
> -	rwlock_t	        lock;           /* lock for this table */
>  	struct list_head        bucket[IP_VS_LBLCR_TAB_SIZE];  /* hash bucket */
>  	atomic_t                entries;        /* number of entries */
>  	int                     max_size;       /* maximum size of entries */
> @@ -311,32 +300,6 @@ static ctl_table vs_vars_table[] = {
>  
>  static struct ctl_table_header * sysctl_header;
>  
> -/*
> - *      new/free a ip_vs_lblcr_entry, which is a mapping of a destination
> - *      IP address to a server.
> - */
> -static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__be32 daddr)
> -{
> -	struct ip_vs_lblcr_entry *en;
> -
> -	en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
> -	if (en == NULL) {
> -		IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
> -		return NULL;
> -	}
> -
> -	INIT_LIST_HEAD(&en->list);
> -	en->addr = daddr;
> -
> -	/* initilize its dest set */
> -	atomic_set(&(en->set.size), 0);
> -	en->set.list = NULL;
> -	rwlock_init(&en->set.lock);
> -
> -	return en;
> -}
> -
> -
>  static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
>  {
>  	list_del(&en->list);
> @@ -358,55 +321,68 @@ static inline unsigned ip_vs_lblcr_hashkey(__be32 addr)
>   *	Hash an entry in the ip_vs_lblcr_table.
>   *	returns bool success.
>   */
> -static int
> +static void
>  ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
>  {
> -	unsigned hash;
> -
> -	if (!list_empty(&en->list)) {
> -		IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, "
> -			  "called from %p\n", __builtin_return_address(0));
> -		return 0;
> -	}
> +	unsigned hash = ip_vs_lblcr_hashkey(en->addr);
>  
> -	/*
> -	 *	Hash by destination IP address
> -	 */
> -	hash = ip_vs_lblcr_hashkey(en->addr);
> -
> -	write_lock(&tbl->lock);
>  	list_add(&en->list, &tbl->bucket[hash]);
>  	atomic_inc(&tbl->entries);
> -	write_unlock(&tbl->lock);
> -
> -	return 1;
>  }
>  
>  
>  /*
> - *  Get ip_vs_lblcr_entry associated with supplied parameters.
> + *  Get ip_vs_lblcr_entry associated with supplied parameters. Called under
> + *  read lock.
>   */
>  static inline struct ip_vs_lblcr_entry *
>  ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr)
>  {
> -	unsigned hash;
> +	unsigned hash = ip_vs_lblcr_hashkey(addr);
>  	struct ip_vs_lblcr_entry *en;
>  
> -	hash = ip_vs_lblcr_hashkey(addr);
> +	list_for_each_entry(en, &tbl->bucket[hash], list)
> +		if (en->addr == addr)
> +			return en;
> +
> +	return NULL;
> +}
>  
> -	read_lock(&tbl->lock);
>  
> -	list_for_each_entry(en, &tbl->bucket[hash], list) {
> -		if (en->addr == addr) {
> -			/* HIT */
> -			read_unlock(&tbl->lock);
> -			return en;
> +/*
> + * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
> + * IP address to a server. Called under write lock.
> + */
> +static inline struct ip_vs_lblcr_entry *
> +ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl,  __be32 daddr,
> +		struct ip_vs_dest *dest)
> +{
> +	struct ip_vs_lblcr_entry *en;
> +
> +	en = ip_vs_lblcr_get(tbl, daddr);
> +	if (!en) {
> +		en = kmalloc(sizeof(*en), GFP_ATOMIC);
> +		if (!en) {
> +			IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
> +			return NULL;
>  		}
> +
> +		en->addr = daddr;
> +		en->lastuse = jiffies;
> +
> +		/* initilize its dest set */
> +		atomic_set(&(en->set.size), 0);
> +		en->set.list = NULL;
> +		rwlock_init(&en->set.lock);
> +
> +		ip_vs_lblcr_hash(tbl, en);
>  	}
>  
> -	read_unlock(&tbl->lock);
> +	write_lock(&en->set.lock);
> +	ip_vs_dest_set_insert(&en->set, dest);
> +	write_unlock(&en->set.lock);
>  
> -	return NULL;
> +	return en;
>  }
>  
>  
> @@ -418,19 +394,18 @@ static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
>  	int i;
>  	struct ip_vs_lblcr_entry *en, *nxt;
>  
> +	/* No locking required, only called during cleanup. */
>  	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
> -		write_lock(&tbl->lock);
>  		list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
>  			ip_vs_lblcr_free(en);
> -			atomic_dec(&tbl->entries);
>  		}
> -		write_unlock(&tbl->lock);
>  	}
>  }
>  
>  
> -static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
> +static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
>  {
> +	struct ip_vs_lblcr_table *tbl = svc->sched_data;
>  	unsigned long now = jiffies;
>  	int i, j;
>  	struct ip_vs_lblcr_entry *en, *nxt;
> @@ -438,7 +413,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
>  	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
>  		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
>  
> -		write_lock(&tbl->lock);
> +		write_lock(&svc->sched_lock);
>  		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
>  			if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
>  				       now))
> @@ -447,7 +422,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
>  			ip_vs_lblcr_free(en);
>  			atomic_dec(&tbl->entries);
>  		}
> -		write_unlock(&tbl->lock);
> +		write_unlock(&svc->sched_lock);
>  	}
>  	tbl->rover = j;
>  }
> @@ -466,17 +441,16 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
>   */
>  static void ip_vs_lblcr_check_expire(unsigned long data)
>  {
> -	struct ip_vs_lblcr_table *tbl;
> +	struct ip_vs_service *svc = (struct ip_vs_service *) data;
> +	struct ip_vs_lblcr_table *tbl = svc->sched_data;
>  	unsigned long now = jiffies;
>  	int goal;
>  	int i, j;
>  	struct ip_vs_lblcr_entry *en, *nxt;
>  
> -	tbl = (struct ip_vs_lblcr_table *)data;
> -
>  	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
>  		/* do full expiration check */
> -		ip_vs_lblcr_full_check(tbl);
> +		ip_vs_lblcr_full_check(svc);
>  		tbl->counter = 1;
>  		goto out;
>  	}
> @@ -493,7 +467,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
>  	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
>  		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
>  
> -		write_lock(&tbl->lock);
> +		write_lock(&svc->sched_lock);
>  		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
>  			if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
>  				continue;
> @@ -502,7 +476,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
>  			atomic_dec(&tbl->entries);
>  			goal--;
>  		}
> -		write_unlock(&tbl->lock);
> +		write_unlock(&svc->sched_lock);
>  		if (goal <= 0)
>  			break;
>  	}
> @@ -520,15 +494,14 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
>  	/*
>  	 *    Allocate the ip_vs_lblcr_table for this service
>  	 */
> -	tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
> +	tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
>  	if (tbl == NULL) {
>  		IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
>  		return -ENOMEM;
>  	}
>  	svc->sched_data = tbl;
>  	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
> -		  "current service\n",
> -		  sizeof(struct ip_vs_lblcr_table));
> +		  "current service\n", sizeof(*tbl));
>  
>  	/*
>  	 *    Initialize the hash buckets
> @@ -536,7 +509,6 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
>  	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
>  		INIT_LIST_HEAD(&tbl->bucket[i]);
>  	}
> -	rwlock_init(&tbl->lock);
>  	tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
>  	tbl->rover = 0;
>  	tbl->counter = 1;
> @@ -545,9 +517,8 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
>  	 *    Hook periodic timer for garbage collection
>  	 */
>  	setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
> -			(unsigned long)tbl);
> -	tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
> -	add_timer(&tbl->periodic_timer);
> +			(unsigned long)svc);
> +	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
>  
>  	return 0;
>  }
> @@ -564,9 +535,9 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
>  	ip_vs_lblcr_flush(tbl);
>  
>  	/* release the table itself */
> -	kfree(svc->sched_data);
> +	kfree(tbl);
>  	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
> -		  sizeof(struct ip_vs_lblcr_table));
> +		  sizeof(*tbl));
>  
>  	return 0;
>  }
> @@ -663,50 +634,78 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
>  static struct ip_vs_dest *
>  ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
>  {
> -	struct ip_vs_dest *dest;
> -	struct ip_vs_lblcr_table *tbl;
> -	struct ip_vs_lblcr_entry *en;
> +	struct ip_vs_lblcr_table *tbl = svc->sched_data;
>  	struct iphdr *iph = ip_hdr(skb);
> +	struct ip_vs_dest *dest = NULL;
> +	struct ip_vs_lblcr_entry *en;
>  
>  	IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
>  
> -	tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
> +	/* First look in our cache */
> +	read_lock(&svc->sched_lock);
>  	en = ip_vs_lblcr_get(tbl, iph->daddr);
> -	if (en == NULL) {
> -		dest = __ip_vs_lblcr_schedule(svc, iph);
> -		if (dest == NULL) {
> -			IP_VS_DBG(1, "no destination available\n");
> -			return NULL;
> -		}
> -		en = ip_vs_lblcr_new(iph->daddr);
> -		if (en == NULL) {
> -			return NULL;
> -		}
> -		ip_vs_dest_set_insert(&en->set, dest);
> -		ip_vs_lblcr_hash(tbl, en);
> -	} else {
> +	if (en) {
> +		/* We only hold a read lock, but this is atomic */
> +		en->lastuse = jiffies;
> +
> +		/* Get the least loaded destination */
> +		read_lock(&en->set.lock);
>  		dest = ip_vs_dest_set_min(&en->set);
> -		if (!dest || is_overloaded(dest, svc)) {
> -			dest = __ip_vs_lblcr_schedule(svc, iph);
> -			if (dest == NULL) {
> -				IP_VS_DBG(1, "no destination available\n");
> -				return NULL;
> -			}
> -			ip_vs_dest_set_insert(&en->set, dest);
> -		}
> +		read_unlock(&en->set.lock);
> +
> +		/* More than one destination + enough time passed by, cleanup */
>  		if (atomic_read(&en->set.size) > 1 &&
> -		    jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) {
> +				time_after(jiffies, en->set.lastmod +
> +				sysctl_ip_vs_lblcr_expiration)) {
>  			struct ip_vs_dest *m;
> +
> +			write_lock(&en->set.lock);
>  			m = ip_vs_dest_set_max(&en->set);
>  			if (m)
>  				ip_vs_dest_set_erase(&en->set, m);
> +			write_unlock(&en->set.lock);
> +		}
> +
> +		/* If the destination is not overloaded, use it */
> +		if (dest && !is_overloaded(dest, svc)) {
> +			read_unlock(&svc->sched_lock);
> +			goto out;
> +		}
> +
> +		/* The cache entry is invalid, time to schedule */
> +		dest = __ip_vs_lblcr_schedule(svc, iph);
> +		if (!dest) {
> +			IP_VS_DBG(1, "no destination available\n");
> +			read_unlock(&svc->sched_lock);
> +			return NULL;
>  		}
> +
> +		/* Update our cache entry */
> +		write_lock(&en->set.lock);
> +		ip_vs_dest_set_insert(&en->set, dest);
> +		write_unlock(&en->set.lock);
> +	}
> +	read_unlock(&svc->sched_lock);
> +
> +	if (dest)
> +		goto out;
> +
> +	/* No cache entry, time to schedule */
> +	dest = __ip_vs_lblcr_schedule(svc, iph);
> +	if (!dest) {
> +		IP_VS_DBG(1, "no destination available\n");
> +		return NULL;
>  	}
> -	en->lastuse = jiffies;
>  
> +	/* If we fail to create a cache entry, we'll just use the valid dest */
> +	write_lock(&svc->sched_lock);
> +	ip_vs_lblcr_new(tbl, iph->daddr, dest);
> +	write_unlock(&svc->sched_lock);
> +
> +out:
>  	IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
>  		  "--> server %u.%u.%u.%u:%d\n",
> -		  NIPQUAD(en->addr),
> +		  NIPQUAD(iph->addr),

Minor problem, this should be iph->daddr

>  		  NIPQUAD(dest->addr),
>  		  ntohs(dest->port));
>  
> --
> To unsubscribe from this list: send the line "unsubscribe lvs-devel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe lvs-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Filesystem Devel]     [Linux NFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux SCSI]     [X.Org]

  Powered by Linux