RE: [PATCH rdma-next 05/12] IB/core: Introduce GID entry reference counts for RoCE

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




> -----Original Message-----
> From: Jason Gunthorpe [mailto:jgg@xxxxxxxx]
> Sent: Wednesday, May 16, 2018 10:31 PM
> To: Leon Romanovsky <leon@xxxxxxxxxx>
> Cc: Doug Ledford <dledford@xxxxxxxxxx>; Leon Romanovsky
> <leonro@xxxxxxxxxxxx>; RDMA mailing list <linux-rdma@xxxxxxxxxxxxxxx>;
> Parav Pandit <parav@xxxxxxxxxxxx>
> Subject: Re: [PATCH rdma-next 05/12] IB/core: Introduce GID entry reference
> counts for RoCE
> 
> On Mon, May 14, 2018 at 11:11:11AM +0300, Leon Romanovsky wrote:
> > From: Parav Pandit <parav@xxxxxxxxxxxx>
> >
> > Currently a GID entry can be deleted from the GID table while it is in
> > use by ULPs, CM modules or user space applications.
> 
> [..]
> 
> So, I still don't like how tricky this is, particularly as we go down the series.
> 
> I prefer a version that kref's actual memory and tracks it that way.
> 
> Here is a compile tested version of this patch revised as I would think.. It
> probably needs some splitting because it is way too big, but overall I think it is
> much more understandable and auditable this way.
> 
> I particularly like how the locking becomes simpler.
> 
> But what do you think?
> 
> From 03b9e61c0e9af031cf1ad73cb45f0f46c06f3be6 Mon Sep 17 00:00:00 2001
> From: Jason Gunthorpe <jgg@xxxxxxxxxxxx>
> Date: Mon, 14 May 2018 11:11:11 +0300
> Subject: [RFC PATCH] IB/core: Make the gid table entries immutable
> 
> This is the first step to expose the gid_attr struct stored in the gid table to the
> users without having to copy it.
> 
> The gid table entries are re-organized so they are written once before being
> added to the table, and then never changed during the lifetime of the entry.
> 
> When an entry is deleted/added (roce) or replaced (IB) new memory is
> allocated to replace the old entry and the old entry lives on via a kref until all
> users finally drop it.
> 
> For roce this means we can block re-use of gid indexes by dropping an ERR_PTR
> tombstone into the table while the entry is still in use, making the index blocked
> but the content inaccessible.
> 
> Signed-off-by: Parav Pandit <parav@xxxxxxxxxxxx>
> Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx>
> Signed-off-by: Jason Gunthorpe <jgg@xxxxxxxxxxxx>
> ---
>  drivers/infiniband/core/cache.c | 626 +++++++++++++++++---------------
>  1 file changed, 340 insertions(+), 286 deletions(-)
> 
> diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
> index 47fa72b434be00..bc09382b57d280 100644
> --- a/drivers/infiniband/core/cache.c
> +++ b/drivers/infiniband/core/cache.c
> @@ -66,45 +66,149 @@ enum gid_attr_find_mask {
>  	GID_ATTR_FIND_MASK_GID_TYPE	= 1UL << 3,
>  };
> 
> -enum gid_table_entry_props {
> -	GID_TABLE_ENTRY_DEFAULT		= 1 << 1,
> +enum gid_table_entry_flags {
> +	GID_TABLE_ENTRY_IN_TABLE = 1 << 0,
> +	GID_TABLE_ENTRY_ADD_GID = 1 << 1,
> +	GID_TABLE_ENTRY_DEFAULT = 1 << 2,
>  };
> 
> -enum gid_table_entry_state {
> -	GID_TABLE_ENTRY_FREE		= 1,
> -	GID_TABLE_ENTRY_ALLOCATED	= 2,
> -};
> +struct ib_gid_entry {
> +	struct kref kref;
> +	struct work_struct del_work;
> +	union ib_gid gid;
> +	unsigned int flags;
> +	struct ib_gid_attr attr;
> 
> -struct ib_gid_table_entry {
> -	enum gid_table_entry_props	props;
> -	enum gid_table_entry_state	state;
> -	union ib_gid			gid;
> -	struct ib_gid_attr		attr;
> -	void				*context;
> +	/* driver specific data */
> +	void *context;
>  };
> 
>  struct ib_gid_table {
> -	int                  sz;
> -	/* In RoCE, adding a GID to the table requires:
> -	 * (a) Find if this GID is already exists.
> -	 * (b) Find a free space.
> -	 * (c) Write the new GID
> -	 *
> -	 * Delete requires different set of operations:
> -	 * (a) Find the GID
> -	 * (b) Delete it.
> -	 *
> -	 **/
> +	unsigned int sz;
> +	/* The first default_entries in data_vec are reserved for default entries
> */
> +	unsigned int default_entries;
> +
>  	/* Any writer to data_vec must hold this lock and the write side of
>  	 * rwlock. Readers must hold only rwlock. All writers must be in a
>  	 * sleepable context.
>  	 */
>  	struct mutex         lock;
> -	/* rwlock protects data_vec[ix]->props and data)vec[ix->state. */
> +	/* rwlock protects data_vec */
>  	rwlock_t	     rwlock;
> -	struct ib_gid_table_entry *data_vec;
> +	const struct ib_gid_entry *data_vec[];
>  };
> 
> +static struct ib_gid_table *get_table(struct ib_device *device, u8
> +port)
> +
> +{
> +	return device->cache.ports[port - rdma_start_port(device)].gid; }
> +
> +static struct ib_gid_table *get_entry_table(struct ib_gid_entry *entry)
> +{
> +	return get_table(entry->attr.device, entry->attr.port_num); }
> +
> +static void store_entry(struct ib_gid_table *table,
> +			struct ib_gid_entry *entry)
> +{
> +	int ix = entry->attr.index;
> +
> +	lockdep_assert_held(&table->lock);
> +	write_lock_irq(&table->rwlock);
> +
> +	WARN_ON(table->data_vec[ix]);
> +
> +	entry->flags |= GID_TABLE_ENTRY_IN_TABLE;
> +	table->data_vec[entry->attr.index] = entry;
> +
> +	write_unlock_irq(&table->rwlock);
> +}
> +
> +/**
> + * free_roce_gid_work - Release reference to the GID entry
> + * @work: Work structure to refer to GID entry which needs to be
> + * deleted.
> + *
> + * free_roce_gid_work() frees the entry from the HCA's hardware table
> + * if the provider supports it.
> + */
> +static void free_roce_gid_work(struct work_struct *work) {
> +	struct ib_gid_entry *entry =
> +		container_of(work, struct ib_gid_entry, del_work);
> +	struct ib_gid_table *table = get_entry_table(entry);
> +	struct ib_device *device = entry->attr.device;
> +
> +	pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
> +		 device->name, entry->attr.port_num, entry->attr.index,
> +		 entry->gid.raw);
> +
> +	mutex_lock(&table->lock);
> +	if (entry->flags & GID_TABLE_ENTRY_ADD_GID)
> +		device->del_gid(&entry->attr, &entry->context);
> +
> +	if (entry->flags & GID_TABLE_ENTRY_IN_TABLE) {
> +		write_lock_irq(&table->rwlock);
> +		WARN_ON(!IS_ERR(table->data_vec[entry->attr.index]));
> +		table->data_vec[entry->attr.index] = NULL;
> +		write_unlock_irq(&table->rwlock);
> +	}
> +
> +	mutex_unlock(&table->lock);
> +
> +	dev_put(entry->attr.ndev);
> +
> +	kfree(entry);
> +}
> +
> +/* Must be undone with put_entry */
> +static struct ib_gid_entry *alloc_entry(struct ib_device *ib_dev, u8
> +port) {
> +	struct ib_gid_entry *entry;
> +
> +	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
> +	if (!entry)
> +		return NULL;
> +
> +	kref_init(&entry->kref);
> +	INIT_WORK(&entry->del_work, free_roce_gid_work);
> +
> +	entry->attr.device = ib_dev;
> +	entry->attr.port_num = port;
> +
> +	return entry;
> +}
> +
> +static struct ib_gid_entry *alloc_entry_roce(struct ib_device *ib_dev, u8 port,
> +					     struct net_device *ndev,
> +					     enum ib_gid_type gid_type)
> +{
> +	struct ib_gid_entry *entry =
> +		alloc_entry(ib_dev, port);
> +
> +	if (!entry)
> +		return NULL;
> +
> +	entry->attr.gid_type = gid_type;
> +	entry->attr.ndev = ndev;
> +	dev_hold(entry->attr.ndev);
> +	return entry;
> +}
> +
> +static void free_entry(struct kref *kref) {
> +	struct ib_gid_entry *entry =
> +		container_of(kref, struct ib_gid_entry, kref);
> +
> +	queue_work(ib_wq, &entry->del_work);
> +}
> +
> +static void put_entry(const struct ib_gid_entry *entry) {
> +	kref_put((struct kref *)&entry->kref, free_entry); }
> +
>  static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port)  {
>  	struct ib_event event;
> @@ -154,37 +258,10 @@ int ib_cache_gid_parse_type_str(const char *buf)  }
> EXPORT_SYMBOL(ib_cache_gid_parse_type_str);
> 
> -static inline bool
> -is_gid_table_entry_free(const struct ib_gid_table_entry *entry)
> +static int setup_roce_gid_entry(struct ib_gid_table *table,
> +				struct ib_gid_entry *entry)
>  {
> -	return entry->state == GID_TABLE_ENTRY_FREE;
> -}
> -
> -static inline bool
> -is_gid_table_entry_valid(const struct ib_gid_table_entry *entry) -{
> -	return entry->state == GID_TABLE_ENTRY_ALLOCATED;
> -}
> -
> -static void del_roce_gid(struct ib_device *device, u8 port_num,
> -			 struct ib_gid_table *table, int ix)
> -{
> -	pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
> -		 device->name, port_num, ix,
> -		 table->data_vec[ix].gid.raw);
> -
> -	if (rdma_cap_roce_gid_table(device, port_num))
> -		device->del_gid(&table->data_vec[ix].attr,
> -				&table->data_vec[ix].context);
> -	dev_put(table->data_vec[ix].attr.ndev);
> -}
> -
> -static int add_roce_gid(struct ib_gid_table *table,
> -			const union ib_gid *gid,
> -			const struct ib_gid_attr *attr)
> -{
> -	struct ib_gid_table_entry *entry;
> -	int ix = attr->index;
> +	const struct ib_gid_attr *attr = &entry->attr;
>  	int ret = 0;
> 
>  	if (!attr->ndev) {
> @@ -194,30 +271,23 @@ static int add_roce_gid(struct ib_gid_table *table,
>  		return -EINVAL;
>  	}
> 
> -	entry = &table->data_vec[ix];
> -	if (!is_gid_table_entry_free(entry)) {
> -		WARN(1, "GID table corruption device=%s port=%d
> index=%d\n",
> -		     attr->device->name, attr->port_num,
> -		     attr->index);
> -		return -EINVAL;
> -	}
> 
>  	if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) {
> -		ret = attr->device->add_gid(gid, attr, &entry->context);
> +		ret = attr->device->add_gid(&entry->gid, &entry->attr,
> +					    &entry->context);
>  		if (ret) {
>  			pr_err("%s GID add failed device=%s port=%d
> index=%d\n",
>  			       __func__, attr->device->name, attr->port_num,
>  			       attr->index);
> -			goto add_err;
> +			return ret;
>  		}
> +		entry->flags |= GID_TABLE_ENTRY_ADD_GID;
>  	}
> -	dev_hold(attr->ndev);
> 
> -add_err:
> -	if (!ret)
> -		pr_debug("%s device=%s port=%d index=%d gid %pI6\n",
> __func__,
> -			 attr->device->name, attr->port_num, ix, gid->raw);
> -	return ret;
> +	pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
> +		 attr->device->name, attr->port_num, entry->attr.index,
> +		 entry->gid.raw);
> +	return 0;
>  }
> 
>  /**
> @@ -232,15 +302,15 @@ static int add_roce_gid(struct ib_gid_table *table,
>   * GID. However such zero GIDs are not added to the cache.
>   */
>  static int add_modify_gid(struct ib_gid_table *table,
> -			  const union ib_gid *gid,
> -			  const struct ib_gid_attr *attr)
> +			  struct ib_gid_entry *entry)
>  {
>  	int ret;
> 
> -	if (rdma_protocol_roce(attr->device, attr->port_num)) {
> -		ret = add_roce_gid(table, gid, attr);
> -		if (ret)
> +	if (rdma_protocol_roce(entry->attr.device, entry->attr.port_num)) {
> +		ret = setup_roce_gid_entry(table, entry);
> +		if (ret) {
>  			return ret;
> +		}
>  	} else {
>  		/*
>  		 * Some HCA's report multiple GID entries with only one @@ -
> 248,99 +318,95 @@ static int add_modify_gid(struct ib_gid_table *table,
>  		 * So ignore such behavior for IB link layer and don't
>  		 * fail the call, but don't add such entry to GID cache.
>  		 */
> -		if (!memcmp(gid, &zgid, sizeof(*gid)))
> +		if (!memcmp(&entry->gid, &zgid, sizeof(entry->gid))) {
> +			put_entry(entry);
>  			return 0;
> +		}
>  	}
> 
> -	lockdep_assert_held(&table->lock);
> -	memcpy(&table->data_vec[attr->index].gid, gid, sizeof(*gid));
> -	memcpy(&table->data_vec[attr->index].attr, attr, sizeof(*attr));
> +	store_entry(table, entry);
> 
> -	write_lock_irq(&table->rwlock);
> -	table->data_vec[attr->index].state = GID_TABLE_ENTRY_ALLOCATED;
> -	write_unlock_irq(&table->rwlock);
>  	return 0;
>  }
> 
> -/**
> - * del_gid - Delete GID table entry
> - *
> - * @ib_dev:	IB device whose GID entry to be deleted
> - * @port:	Port number of the IB device
> - * @table:	GID table of the IB device for a port
> - * @ix:		GID entry index to delete
> - *
> - */
> -static void del_gid(struct ib_device *ib_dev, u8 port,
> -		    struct ib_gid_table *table, int ix)
> +static void del_gid(struct ib_gid_table *table,
> +		    const struct ib_gid_entry *entry)
>  {
> +	unsigned int ix = entry->attr.index;
> +
>  	lockdep_assert_held(&table->lock);
> +
>  	write_lock_irq(&table->rwlock);
> -	table->data_vec[ix].state = GID_TABLE_ENTRY_FREE;
> +
> +	WARN_ON(entry != table->data_vec[ix]);
> +
> +	/*
> +	 * The data_vec holds a marker for roce preventing re-use of the HW
> +	 * slow until the entry kref goes to zero. Other protocols can re-use
> +	 * immediately.
> +	 */
> +	if (rdma_protocol_roce(entry->attr.device, entry->attr.port_num))
> +		table->data_vec[ix] = ERR_PTR(-EINVAL);
> +	else {
> +		table->data_vec[ix] = NULL;
> +		((struct ib_gid_entry *)entry)->flags &=
> +			~GID_TABLE_ENTRY_IN_TABLE;
> +	}
> +
>  	write_unlock_irq(&table->rwlock);
> 
> -	if (rdma_protocol_roce(ib_dev, port))
> -		del_roce_gid(ib_dev, port, table, ix);
> -	memcpy(&table->data_vec[ix].gid, &zgid, sizeof(zgid));
> -	memset(&table->data_vec[ix].attr, 0, sizeof(table->data_vec[ix].attr));
> -	table->data_vec[ix].context = NULL;
> +	put_entry(entry);
>  }
> 
>  /* rwlock should be read locked, or lock should be held */ -static int
> find_gid(struct ib_gid_table *table, const union ib_gid *gid,
> -		    const struct ib_gid_attr *val, bool default_gid,
> -		    unsigned long mask, int *pempty)
> +static const struct ib_gid_entry *find_gid(struct ib_gid_table *table,
> +					   const union ib_gid *gid,
> +					   const struct ib_gid_attr *val,
> +					   bool default_gid, unsigned long
> mask,
> +					   int *pempty)
>  {
>  	int i = 0;
> -	int found = -1;
> -	int empty = pempty ? -1 : 0;
> +	int empty = -1;
> 
> -	while (i < table->sz && (found < 0 || empty < 0)) {
> -		struct ib_gid_table_entry *data = &table->data_vec[i];
> -		struct ib_gid_attr *attr = &data->attr;
> +	for (i = 0; i != table->sz; i++) {
> +		const struct ib_gid_entry *entry = table->data_vec[i];
> +		const struct ib_gid_attr *attr;
>  		int curr_index = i;
> 
> -		i++;
> +		/*
> +		 * Entry is still being freed, it cannot be returned from
> +		 * find_gid
> +		 */
> +		if (IS_ERR(entry))
> +			continue;
> 
>  		/* find_gid() is used during GID addition where it is expected
> -		 * to return a free entry slot which is not duplicate.
> -		 * Free entry slot is requested and returned if pempty is set,
> -		 * so lookup free slot only if requested.
> +		 * to return a free entry slot if there is not duplicate
> +		 * entry.
>  		 */
> -		if (pempty && empty < 0) {
> -			if (is_gid_table_entry_free(data) &&
> -			    (default_gid ==
> -			     !!(data->props & GID_TABLE_ENTRY_DEFAULT))) {
> -				/*
> -				 * Found an invalid (free) entry; allocate it.
> -				 * If default GID is requested, then our
> -				 * found slot must be one of the DEFAULT
> -				 * reserved slots or we fail.
> -				 * This ensures that only DEFAULT reserved
> -				 * slots are used for default property GIDs.
> -				 */
> +		if (entry == NULL) {
> +			/*
> +			 * Found an free entry; allocate it. If a default GID
> +			 * is requested, then our found slot must be one of
> +			 * the DEFAULT reserved slots or we fail.  This
> +			 * ensures that only DEFAULT reserved slots are used
> +			 * for default property GIDs.
> +			 */
> +			if (empty < 0 &&
> +			    (curr_index < table->default_entries) ==
> +				    default_gid)
>  				empty = curr_index;
> -			}
> -		}
> -
> -		/*
> -		 * Additionally find_gid() is used to find valid entry during
> -		 * lookup operation, where validity needs to be checked. So
> -		 * find the empty entry first to continue to search for a free
> -		 * slot and ignore its INVALID flag.
> -		 */
> -		if (!is_gid_table_entry_valid(data))
>  			continue;
> +		}
> 
> -		if (found >= 0)
> -			continue;
> +		attr = &entry->attr;
> 
>  		if (mask & GID_ATTR_FIND_MASK_GID_TYPE &&
>  		    attr->gid_type != val->gid_type)
>  			continue;
> 
>  		if (mask & GID_ATTR_FIND_MASK_GID &&
> -		    memcmp(gid, &data->gid, sizeof(*gid)))
> +		    memcmp(gid, &entry->gid, sizeof(*gid)))
>  			continue;
> 
>  		if (mask & GID_ATTR_FIND_MASK_NETDEV && @@ -348,71
> +414,75 @@ static int find_gid(struct ib_gid_table *table, const union ib_gid
> *gid,
>  			continue;
> 
>  		if (mask & GID_ATTR_FIND_MASK_DEFAULT &&
> -		    !!(data->props & GID_TABLE_ENTRY_DEFAULT) !=
> -		    default_gid)
> +		    (curr_index < table->default_entries) != default_gid)
>  			continue;
> 
> -		found = curr_index;
> +		/*
> +		 * if we find a match then empty is not valid as we didn't
> +		 * scan the whole list.
> +		 */
> +		return entry;
>  	}
> 
>  	if (pempty)
>  		*pempty = empty;
> 
> -	return found;
> +	return NULL;
>  }
> 
> -static void make_default_gid(struct  net_device *dev, union ib_gid *gid)
> +static void make_default_gid(struct net_device *dev, union ib_gid *gid)
>  {
>  	gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
>  	addrconf_ifid_eui48(&gid->raw[8], dev);  }
> 
> -static int __ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
> -			      union ib_gid *gid, struct ib_gid_attr *attr,
> -			      unsigned long mask, bool default_gid)
> +static int __ib_cache_gid_add(struct ib_gid_entry *entry, unsigned long
> +mask)
>  {
> -	struct ib_gid_table *table;
> +	struct ib_gid_table *table = get_entry_table(entry);
>  	int ret = 0;
>  	int empty;
> -	int ix;
> 
>  	/* Do not allow adding zero GID in support of
>  	 * IB spec version 1.3 section 4.1.1 point (6) and
>  	 * section 12.7.10 and section 12.7.20
>  	 */
> -	if (!memcmp(gid, &zgid, sizeof(*gid)))
> +	if (!memcmp(&entry->gid, &zgid, sizeof(zgid)))
>  		return -EINVAL;
> 
> -	table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid;
> -
>  	mutex_lock(&table->lock);
> 
> -	ix = find_gid(table, gid, attr, default_gid, mask, &empty);
> -	if (ix >= 0)
> +	if (find_gid(table, &entry->gid, &entry->attr,
> +		     entry->flags & GID_TABLE_ENTRY_DEFAULT, mask, &empty))
>  		goto out_unlock;
> 
>  	if (empty < 0) {
>  		ret = -ENOSPC;
>  		goto out_unlock;
>  	}
> -	attr->device = ib_dev;
> -	attr->index = empty;
> -	attr->port_num = port;
> -	ret = add_modify_gid(table, gid, attr);
> -	if (!ret)
> -		dispatch_gid_change_event(ib_dev, port);
> 
> -out_unlock:
> +	entry->attr.index = empty;
> +	ret = add_modify_gid(table, entry);
>  	mutex_unlock(&table->lock);
> +
>  	if (ret)
> -		pr_warn("%s: unable to add gid %pI6 error=%d\n",
> -			__func__, gid->raw, ret);
> +		goto out;
> +
> +	dispatch_gid_change_event(entry->attr.device,
> +				  entry->attr.port_num);
> +	return 0;
> +
> +out_unlock:
> +	mutex_unlock(&table->lock);
> +out:
> +	pr_warn("%s: unable to add gid %pI6 error=%d\n", __func__,
> +		entry->gid.raw, ret);
>  	return ret;
>  }
> 
>  int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
>  		     union ib_gid *gid, struct ib_gid_attr *attr)  {
> +	struct ib_gid_entry *entry;
>  	struct net_device *idev;
>  	unsigned long mask;
>  	int ret;
> @@ -437,7 +507,11 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8
> port,
>  	       GID_ATTR_FIND_MASK_GID_TYPE |
>  	       GID_ATTR_FIND_MASK_NETDEV;
> 
> -	ret = __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false);
> +	entry = alloc_entry_roce(ib_dev, port, attr->ndev, attr->gid_type);
> +	ret = __ib_cache_gid_add(entry, mask);
> +	if (ret)
> +		put_entry(entry);
> +
>  	return ret;
>  }
> 
> @@ -446,28 +520,30 @@ _ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
>  		  union ib_gid *gid, struct ib_gid_attr *attr,
>  		  unsigned long mask, bool default_gid)  {
> -	struct ib_gid_table *table;
> +	struct ib_gid_table *table = get_table(ib_dev, port);
> +	const struct ib_gid_entry *entry;
>  	int ret = 0;
> -	int ix;
> -
> -	table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid;
> 
>  	mutex_lock(&table->lock);
> 
> -	ix = find_gid(table, gid, attr, default_gid, mask, NULL);
> -	if (ix < 0) {
> +	entry = find_gid(table, gid, attr, default_gid, mask, NULL);
> +	if (!entry) {
>  		ret = -EINVAL;
>  		goto out_unlock;
>  	}
> 
> -	del_gid(ib_dev, port, table, ix);
> +	del_gid(table, entry);
> +
> +	mutex_unlock(&table->lock);
> +
>  	dispatch_gid_change_event(ib_dev, port);
> 
> +	return 0;
> +
>  out_unlock:
>  	mutex_unlock(&table->lock);
> -	if (ret)
> -		pr_debug("%s: can't delete gid %pI6 error=%d\n",
> -			 __func__, gid->raw, ret);
> +	pr_debug("%s: can't delete gid %pI6 error=%d\n",
> +		 __func__, gid->raw, ret);
>  	return ret;
>  }
> 
> @@ -485,17 +561,18 @@ int ib_cache_gid_del(struct ib_device *ib_dev, u8
> port,  int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
>  				     struct net_device *ndev)
>  {
> -	struct ib_gid_table *table;
> +	struct ib_gid_table *table = get_table(ib_dev, port);
>  	int ix;
>  	bool deleted = false;
> 
> -	table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid;
> -
>  	mutex_lock(&table->lock);
> 
>  	for (ix = 0; ix < table->sz; ix++) {
> -		if (table->data_vec[ix].attr.ndev == ndev) {
> -			del_gid(ib_dev, port, table, ix);
> +		if (IS_ERR_OR_NULL(table->data_vec[ix]))
> +			continue;
> +
> +		if (table->data_vec[ix]->attr.ndev == ndev) {
> +			del_gid(table, table->data_vec[ix]);
>  			deleted = true;
>  		}
>  	}
> @@ -511,19 +588,19 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device
> *ib_dev, u8 port,  static int __ib_cache_gid_get(struct ib_device *ib_dev, u8
> port, int index,
>  			      union ib_gid *gid, struct ib_gid_attr *attr)  {
> -	struct ib_gid_table *table;
> -
> -	table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid;
> +	struct ib_gid_table *table = get_table(ib_dev, port);
> +	const struct ib_gid_entry *entry;
> 
>  	if (index < 0 || index >= table->sz)
>  		return -EINVAL;
> 
> -	if (!is_gid_table_entry_valid(&table->data_vec[index]))
> +	if (IS_ERR_OR_NULL(table->data_vec[index]))
>  		return -EAGAIN;
> 
> -	memcpy(gid, &table->data_vec[index].gid, sizeof(*gid));
> +	entry = table->data_vec[index];
> +	memcpy(gid, &entry->gid, sizeof(*gid));
>  	if (attr) {
> -		memcpy(attr, &table->data_vec[index].attr, sizeof(*attr));
> +		memcpy(attr, &entry->attr, sizeof(*attr));
>  		if (attr->ndev)
>  			dev_hold(attr->ndev);
>  	}
> @@ -531,33 +608,28 @@ static int __ib_cache_gid_get(struct ib_device
> *ib_dev, u8 port, int index,
>  	return 0;
>  }
> 
> -static int _ib_cache_gid_table_find(struct ib_device *ib_dev,
> -				    const union ib_gid *gid,
> -				    const struct ib_gid_attr *val,
> -				    unsigned long mask,
> -				    u8 *port, u16 *index)
> +static const struct ib_gid_entry *_ib_cache_gid_table_find(
> +	struct ib_device *ib_dev, const union ib_gid *gid,
> +	const struct ib_gid_attr *val, unsigned long mask)
>  {
> +	const struct ib_gid_entry *entry = NULL;
>  	struct ib_gid_table *table;
>  	u8 p;
> -	int local_index;
>  	unsigned long flags;
> 
>  	for (p = 0; p < ib_dev->phys_port_cnt; p++) {
>  		table = ib_dev->cache.ports[p].gid;
> +
>  		read_lock_irqsave(&table->rwlock, flags);
> -		local_index = find_gid(table, gid, val, false, mask, NULL);
> -		if (local_index >= 0) {
> -			if (index)
> -				*index = local_index;
> -			if (port)
> -				*port = p + rdma_start_port(ib_dev);
> -			read_unlock_irqrestore(&table->rwlock, flags);
> -			return 0;
> -		}
> +		entry = find_gid(table, gid, val, false, mask, NULL);
> +		kref_get((struct kref *)&entry->kref);
>  		read_unlock_irqrestore(&table->rwlock, flags);
> +
> +		if (entry)
> +			break;
>  	}
> 
> -	return -ENOENT;
> +	return entry;
>  }
> 
>  static int ib_cache_gid_find(struct ib_device *ib_dev, @@ -566,6 +638,8 @@
> static int ib_cache_gid_find(struct ib_device *ib_dev,
>  			     struct net_device *ndev, u8 *port,
>  			     u16 *index)
>  {
> +	const struct ib_gid_entry *entry;
> +
>  	unsigned long mask = GID_ATTR_FIND_MASK_GID |
>  			     GID_ATTR_FIND_MASK_GID_TYPE;
>  	struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type};
> @@ -573,8 +647,16 @@ static int ib_cache_gid_find(struct ib_device *ib_dev,
>  	if (ndev)
>  		mask |= GID_ATTR_FIND_MASK_NETDEV;
> 
> -	return _ib_cache_gid_table_find(ib_dev, gid, &gid_attr_val,
> -					mask, port, index);
> +	entry = _ib_cache_gid_table_find(ib_dev, gid, &gid_attr_val, mask);
> +	if (!entry)
> +		return -ENOENT;
> +
> +	*port = entry->attr.port_num;
> +	*index = entry->attr.index;
> +
> +	put_entry(entry);
> +
> +	return 0;
>  }
> 
>  /**
> @@ -596,7 +678,7 @@ int ib_find_cached_gid_by_port(struct ib_device
> *ib_dev,
>  			       u8 port, struct net_device *ndev,
>  			       u16 *index)
>  {
> -	int local_index;
> +	const struct ib_gid_entry *entry;
>  	struct ib_gid_table *table;
>  	unsigned long mask = GID_ATTR_FIND_MASK_GID |
>  			     GID_ATTR_FIND_MASK_GID_TYPE;
> @@ -606,16 +688,16 @@ int ib_find_cached_gid_by_port(struct ib_device
> *ib_dev,
>  	if (!rdma_is_port_valid(ib_dev, port))
>  		return -ENOENT;
> 
> -	table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid;
> +	table = get_table(ib_dev, port);
> 
>  	if (ndev)
>  		mask |= GID_ATTR_FIND_MASK_NETDEV;
> 
>  	read_lock_irqsave(&table->rwlock, flags);
> -	local_index = find_gid(table, gid, &val, false, mask, NULL);
> -	if (local_index >= 0) {
> +	entry = find_gid(table, gid, &val, false, mask, NULL);
> +	if (entry) {
>  		if (index)
> -			*index = local_index;
> +			*index = entry->attr.index;
>  		read_unlock_irqrestore(&table->rwlock, flags);
>  		return 0;
>  	}
> @@ -659,26 +741,21 @@ static int ib_cache_gid_find_by_filter(struct ib_device
> *ib_dev,
>  	unsigned long flags;
>  	bool found = false;
> 
> -
>  	if (!rdma_is_port_valid(ib_dev, port) ||
>  	    !rdma_protocol_roce(ib_dev, port))
>  		return -EPROTONOSUPPORT;
> 
> -	table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid;
> +	table = get_table(ib_dev, port);
> 
>  	read_lock_irqsave(&table->rwlock, flags);
>  	for (i = 0; i < table->sz; i++) {
> -		struct ib_gid_attr attr;
> -
> -		if (!is_gid_table_entry_valid(&table->data_vec[i]))
> +		if (IS_ERR_OR_NULL(table->data_vec[i]))
>  			continue;
> 
> -		if (memcmp(gid, &table->data_vec[i].gid, sizeof(*gid)))
> +		if (memcmp(gid, &table->data_vec[i]->gid, sizeof(*gid)))
>  			continue;
> 
> -		memcpy(&attr, &table->data_vec[i].attr, sizeof(attr));
> -
> -		if (filter(gid, &attr, context)) {
> +		if (filter(gid, &table->data_vec[i]->attr, context)) {
>  			found = true;
>  			if (index)
>  				*index = i;
> @@ -692,42 +769,28 @@ static int ib_cache_gid_find_by_filter(struct ib_device
> *ib_dev,
>  	return 0;
>  }
> 
> -static struct ib_gid_table *alloc_gid_table(int sz)
> +static struct ib_gid_table *alloc_gid_table(unsigned int sz)
>  {
> -	struct ib_gid_table *table =
> -		kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL);
> -	int i;
> +	struct ib_gid_table *table;
> 
> +	table = kzalloc(sizeof(struct ib_gid_table) +
> +				sizeof(table->data_vec[0]) * sz,
> +			GFP_KERNEL);
>  	if (!table)
>  		return NULL;
> 
> -	table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL);
> -	if (!table->data_vec)
> -		goto err_free_table;
> -
>  	mutex_init(&table->lock);
> 
>  	table->sz = sz;
>  	rwlock_init(&table->rwlock);
> 
> -	/* Mark all entries as invalid so that allocator can allocate
> -	 * one of the invalid (free) entry.
> -	 */
> -	for (i = 0; i < sz; i++)
> -		table->data_vec[i].state = GID_TABLE_ENTRY_FREE;
>  	return table;
> -
> -err_free_table:
> -	kfree(table);
> -	return NULL;
>  }
> 
>  static void release_gid_table(struct ib_gid_table *table)  {
> -	if (table) {
> -		kfree(table->data_vec);
> +	if (table)
>  		kfree(table);
> -	}
>  }
> 
>  static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, @@ -
> 741,11 +804,11 @@ static void cleanup_gid_table_port(struct ib_device
> *ib_dev, u8 port,
> 
>  	mutex_lock(&table->lock);
>  	for (i = 0; i < table->sz; ++i) {
> -		if (memcmp(&table->data_vec[i].gid, &zgid,
> -			   sizeof(table->data_vec[i].gid))) {
> -			del_gid(ib_dev, port, table, i);
> -			deleted = true;
> -		}
> +		if (IS_ERR_OR_NULL(table->data_vec[i]))
> +			continue;
> +
> +		del_gid(table, table->data_vec[i]);
> +		deleted = true;
>  	}
>  	mutex_unlock(&table->lock);
> 
> @@ -758,59 +821,38 @@ void ib_cache_gid_set_default_gid(struct ib_device
> *ib_dev, u8 port,
>  				  unsigned long gid_type_mask,
>  				  enum ib_cache_gid_default_mode mode)  {
> -	union ib_gid gid = { };
> -	struct ib_gid_attr gid_attr;
> -	struct ib_gid_table *table;
>  	unsigned int gid_type;
>  	unsigned long mask;
> 
> -	table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid;
> -
>  	mask = GID_ATTR_FIND_MASK_GID_TYPE |
>  	       GID_ATTR_FIND_MASK_DEFAULT |
>  	       GID_ATTR_FIND_MASK_NETDEV;
> -	memset(&gid_attr, 0, sizeof(gid_attr));
> -	gid_attr.ndev = ndev;
> 
>  	for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) {
>  		if (1UL << gid_type & ~gid_type_mask)
>  			continue;
> 
> -		gid_attr.gid_type = gid_type;
> 
>  		if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) {
> -			make_default_gid(ndev, &gid);
> -			__ib_cache_gid_add(ib_dev, port, &gid,
> -					   &gid_attr, mask, true);
> +			struct ib_gid_entry *entry =
> +				alloc_entry_roce(ib_dev, port, ndev, gid_type);
> +
> +			entry->flags |= GID_TABLE_ENTRY_DEFAULT;
> +			make_default_gid(ndev, &entry->gid);
> +			if (__ib_cache_gid_add(entry, mask))
> +				put_entry(entry);
>  		} else if (mode == IB_CACHE_GID_DEFAULT_MODE_DELETE) {
> -			_ib_cache_gid_del(ib_dev, port, &gid,
> +			struct ib_gid_attr gid_attr = {
> +				.gid_type = gid_type,
> +				.ndev = ndev,
> +			};
> +
> +			_ib_cache_gid_del(ib_dev, port, NULL,
>  					  &gid_attr, mask, true);
>  		}
>  	}
>  }
> 
> -static void gid_table_reserve_default(struct ib_device *ib_dev, u8 port,
> -				      struct ib_gid_table *table)
> -{
> -	unsigned int i;
> -	unsigned long roce_gid_type_mask;
> -	unsigned int num_default_gids;
> -	unsigned int current_gid = 0;
> -
> -	roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
> -	num_default_gids = hweight_long(roce_gid_type_mask);
> -	for (i = 0; i < num_default_gids && i < table->sz; i++) {
> -		struct ib_gid_table_entry *entry = &table->data_vec[i];
> -
> -		entry->props |= GID_TABLE_ENTRY_DEFAULT;
> -		current_gid = find_next_bit(&roce_gid_type_mask,
> -					    BITS_PER_LONG,
> -					    current_gid);
> -		entry->attr.gid_type = current_gid++;
> -	}
> -}
> -
> -
>  static void gid_table_release_one(struct ib_device *ib_dev)  {
>  	struct ib_gid_table *table;
> @@ -836,7 +878,8 @@ static int _gid_table_setup_one(struct ib_device
> *ib_dev)
>  		if (!table)
>  			goto rollback_table_setup;
> 
> -		gid_table_reserve_default(ib_dev, rdma_port, table);
> +		table->default_entries =
> +			hweight_long(roce_gid_type_mask_support(ib_dev,
> rdma_port));
>  		ib_dev->cache.ports[port].gid = table;
>  	}
>  	return 0;
> @@ -885,7 +928,8 @@ int ib_get_cached_gid(struct ib_device *device,
>  	if (!rdma_is_port_valid(device, port_num))
>  		return -EINVAL;
> 
> -	table = device->cache.ports[port_num - rdma_start_port(device)].gid;
> +	table = get_table(device, port_num);
> +
>  	read_lock_irqsave(&table->rwlock, flags);
>  	res = __ib_cache_gid_get(device, port_num, index, gid, gid_attr);
>  	read_unlock_irqrestore(&table->rwlock, flags); @@ -1095,31 +1139,35
> @@ EXPORT_SYMBOL(ib_get_cached_port_state);
>  static int config_non_roce_gid_cache(struct ib_device *device,
>  				     u8 port, int gid_tbl_len)
>  {
> -	struct ib_gid_attr gid_attr = {};
> -	struct ib_gid_table *table;
> -	union ib_gid gid;
> +	struct ib_gid_table *table = get_table(device, port);
> +	struct ib_gid_entry *entry = NULL;
>  	int ret = 0;
>  	int i;
> 
> -	gid_attr.device = device;
> -	gid_attr.port_num = port;
> -	table = device->cache.ports[port - rdma_start_port(device)].gid;
> -
>  	mutex_lock(&table->lock);
>  	for (i = 0; i < gid_tbl_len; ++i) {
>  		if (!device->query_gid)
>  			continue;
> -		ret = device->query_gid(device, port, i, &gid);
> +
> +		entry = alloc_entry(device, port);
> +		if (!entry)
> +			return -ENOMEM;
> +
> +		ret = device->query_gid(device, port, i, &entry->gid);
>  		if (ret) {
>  			pr_warn("query_gid failed (%d) for %s (index %d)\n",
>  				ret, device->name, i);
>  			goto err;
>  		}
> -		gid_attr.index = i;
> -		add_modify_gid(table, &gid, &gid_attr);
> +
> +		entry->attr.index = i;
> +		ret = add_modify_gid(table, entry);
> +		if (ret)
> +			goto err;
>  	}
>  err:
>  	mutex_unlock(&table->lock);
> +	put_entry(entry);
>  	return ret;
>  }
> 
> @@ -1136,7 +1184,7 @@ static void ib_cache_update(struct ib_device *device,
>  	if (!rdma_is_port_valid(device, port))
>  		return;
> 
> -	table = device->cache.ports[port - rdma_start_port(device)].gid;
> +	table = get_table(device, port);
> 
>  	tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
>  	if (!tprops)
> @@ -1297,6 +1345,12 @@ void ib_cache_cleanup_one(struct ib_device
> *device)
>  	ib_unregister_event_handler(&device->cache.event_handler);
>  	flush_workqueue(ib_wq);
>  	gid_table_cleanup_one(device);
> +
> +	/*
> +	 * Flush the wq second time for any pending GID delete
> +	 * work for RoCE device.
> +	 */
> +	flush_workqueue(ib_wq);
>  }
> 
>  void __init ib_cache_setup(void)
> --
> 2.17.0
I have yet to review this variation, but I have a kref memory based version in progress and have rebased the whole series, which simplifies the rdma_query_gid() calls.
I am testing it.
I will take a look at this patch as well.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux