Re: [PATCH rdma-next 05/12] IB/core: Introduce GID entry reference counts for RoCE

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, May 14, 2018 at 11:11:11AM +0300, Leon Romanovsky wrote:
> From: Parav Pandit <parav@xxxxxxxxxxxx>
> 
> Currently a GID entry can be deleted from the GID table while it is
> in use by ULPs, CM modules or user space applications.

[..]

So, I still don't like how tricky this is, particularly as we go down
the series.

I prefer a version that kref's actual memory and tracks it that way.

Here is a compile tested version of this patch revised as I would
think.. It probably needs some splitting because it is way too big,
but overall I think it is much more understandable and auditable this
way.

I particularly like how the locking becomes simpler.

But what do you think?

>From 03b9e61c0e9af031cf1ad73cb45f0f46c06f3be6 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@xxxxxxxxxxxx>
Date: Mon, 14 May 2018 11:11:11 +0300
Subject: [RFC PATCH] IB/core: Make the gid table entries immutable

This is the first step to expose the gid_attr struct stored in the gid
table to the users without having to copy it.

The gid table entries are re-organized so they are written once before
being added to the table, and then never changed during the lifetime of
the entry.

When an entry is deleted/added (roce) or replaced (IB) new memory is
allocated to replace the old entry and the old entry lives on via a
kref until all users finally drop it.

For roce this means we can block re-use of gid indexes by dropping an
ERR_PTR tombstone into the table while the entry is still in use, making
the index blocked but the content inaccessible.

Signed-off-by: Parav Pandit <parav@xxxxxxxxxxxx>
Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx>
Signed-off-by: Jason Gunthorpe <jgg@xxxxxxxxxxxx>
---
 drivers/infiniband/core/cache.c | 626 +++++++++++++++++---------------
 1 file changed, 340 insertions(+), 286 deletions(-)

diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 47fa72b434be00..bc09382b57d280 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -66,45 +66,149 @@ enum gid_attr_find_mask {
 	GID_ATTR_FIND_MASK_GID_TYPE	= 1UL << 3,
 };
 
-enum gid_table_entry_props {
-	GID_TABLE_ENTRY_DEFAULT		= 1 << 1,
+enum gid_table_entry_flags {
+	GID_TABLE_ENTRY_IN_TABLE = 1 << 0,
+	GID_TABLE_ENTRY_ADD_GID = 1 << 1,
+	GID_TABLE_ENTRY_DEFAULT = 1 << 2,
 };
 
-enum gid_table_entry_state {
-	GID_TABLE_ENTRY_FREE		= 1,
-	GID_TABLE_ENTRY_ALLOCATED	= 2,
-};
+struct ib_gid_entry {
+	struct kref kref;
+	struct work_struct del_work;
+	union ib_gid gid;
+	unsigned int flags;
+	struct ib_gid_attr attr;
 
-struct ib_gid_table_entry {
-	enum gid_table_entry_props	props;
-	enum gid_table_entry_state	state;
-	union ib_gid			gid;
-	struct ib_gid_attr		attr;
-	void				*context;
+	/* driver specific data */
+	void *context;
 };
 
 struct ib_gid_table {
-	int                  sz;
-	/* In RoCE, adding a GID to the table requires:
-	 * (a) Find if this GID is already exists.
-	 * (b) Find a free space.
-	 * (c) Write the new GID
-	 *
-	 * Delete requires different set of operations:
-	 * (a) Find the GID
-	 * (b) Delete it.
-	 *
-	 **/
+	unsigned int sz;
+	/* The first default_entries in data_vec are reserved for default entries */
+	unsigned int default_entries;
+
 	/* Any writer to data_vec must hold this lock and the write side of
 	 * rwlock. Readers must hold only rwlock. All writers must be in a
 	 * sleepable context.
 	 */
 	struct mutex         lock;
-	/* rwlock protects data_vec[ix]->props and data)vec[ix->state. */
+	/* rwlock protects data_vec */
 	rwlock_t	     rwlock;
-	struct ib_gid_table_entry *data_vec;
+	const struct ib_gid_entry *data_vec[];
 };
 
+static struct ib_gid_table *get_table(struct ib_device *device, u8 port)
+
+{
+	return device->cache.ports[port - rdma_start_port(device)].gid;
+}
+
+static struct ib_gid_table *get_entry_table(struct ib_gid_entry *entry)
+{
+	return get_table(entry->attr.device, entry->attr.port_num);
+}
+
+static void store_entry(struct ib_gid_table *table,
+			struct ib_gid_entry *entry)
+{
+	int ix = entry->attr.index;
+
+	lockdep_assert_held(&table->lock);
+	write_lock_irq(&table->rwlock);
+
+	WARN_ON(table->data_vec[ix]);
+
+	entry->flags |= GID_TABLE_ENTRY_IN_TABLE;
+	table->data_vec[entry->attr.index] = entry;
+
+	write_unlock_irq(&table->rwlock);
+}
+
+/**
+ * free_roce_gid_work - Release reference to the GID entry
+ * @work: Work structure to refer to GID entry which needs to be
+ * deleted.
+ *
+ * free_roce_gid_work() frees the entry from the HCA's hardware table
+ * if the provider supports it.
+ */
+static void free_roce_gid_work(struct work_struct *work)
+{
+	struct ib_gid_entry *entry =
+		container_of(work, struct ib_gid_entry, del_work);
+	struct ib_gid_table *table = get_entry_table(entry);
+	struct ib_device *device = entry->attr.device;
+
+	pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
+		 device->name, entry->attr.port_num, entry->attr.index,
+		 entry->gid.raw);
+
+	mutex_lock(&table->lock);
+	if (entry->flags & GID_TABLE_ENTRY_ADD_GID)
+		device->del_gid(&entry->attr, &entry->context);
+
+	if (entry->flags & GID_TABLE_ENTRY_IN_TABLE) {
+		write_lock_irq(&table->rwlock);
+		WARN_ON(!IS_ERR(table->data_vec[entry->attr.index]));
+		table->data_vec[entry->attr.index] = NULL;
+		write_unlock_irq(&table->rwlock);
+	}
+
+	mutex_unlock(&table->lock);
+
+	dev_put(entry->attr.ndev);
+
+	kfree(entry);
+}
+
+/* Must be undone with put_entry */
+static struct ib_gid_entry *alloc_entry(struct ib_device *ib_dev, u8 port)
+{
+	struct ib_gid_entry *entry;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return NULL;
+
+	kref_init(&entry->kref);
+	INIT_WORK(&entry->del_work, free_roce_gid_work);
+
+	entry->attr.device = ib_dev;
+	entry->attr.port_num = port;
+
+	return entry;
+}
+
+static struct ib_gid_entry *alloc_entry_roce(struct ib_device *ib_dev, u8 port,
+					     struct net_device *ndev,
+					     enum ib_gid_type gid_type)
+{
+	struct ib_gid_entry *entry =
+		alloc_entry(ib_dev, port);
+
+	if (!entry)
+		return NULL;
+
+	entry->attr.gid_type = gid_type;
+	entry->attr.ndev = ndev;
+	dev_hold(entry->attr.ndev);
+	return entry;
+}
+
+static void free_entry(struct kref *kref)
+{
+	struct ib_gid_entry *entry =
+		container_of(kref, struct ib_gid_entry, kref);
+
+	queue_work(ib_wq, &entry->del_work);
+}
+
+static void put_entry(const struct ib_gid_entry *entry)
+{
+	kref_put((struct kref *)&entry->kref, free_entry);
+}
+
 static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port)
 {
 	struct ib_event event;
@@ -154,37 +258,10 @@ int ib_cache_gid_parse_type_str(const char *buf)
 }
 EXPORT_SYMBOL(ib_cache_gid_parse_type_str);
 
-static inline bool
-is_gid_table_entry_free(const struct ib_gid_table_entry *entry)
+static int setup_roce_gid_entry(struct ib_gid_table *table,
+				struct ib_gid_entry *entry)
 {
-	return entry->state == GID_TABLE_ENTRY_FREE;
-}
-
-static inline bool
-is_gid_table_entry_valid(const struct ib_gid_table_entry *entry)
-{
-	return entry->state == GID_TABLE_ENTRY_ALLOCATED;
-}
-
-static void del_roce_gid(struct ib_device *device, u8 port_num,
-			 struct ib_gid_table *table, int ix)
-{
-	pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
-		 device->name, port_num, ix,
-		 table->data_vec[ix].gid.raw);
-
-	if (rdma_cap_roce_gid_table(device, port_num))
-		device->del_gid(&table->data_vec[ix].attr,
-				&table->data_vec[ix].context);
-	dev_put(table->data_vec[ix].attr.ndev);
-}
-
-static int add_roce_gid(struct ib_gid_table *table,
-			const union ib_gid *gid,
-			const struct ib_gid_attr *attr)
-{
-	struct ib_gid_table_entry *entry;
-	int ix = attr->index;
+	const struct ib_gid_attr *attr = &entry->attr;
 	int ret = 0;
 
 	if (!attr->ndev) {
@@ -194,30 +271,23 @@ static int add_roce_gid(struct ib_gid_table *table,
 		return -EINVAL;
 	}
 
-	entry = &table->data_vec[ix];
-	if (!is_gid_table_entry_free(entry)) {
-		WARN(1, "GID table corruption device=%s port=%d index=%d\n",
-		     attr->device->name, attr->port_num,
-		     attr->index);
-		return -EINVAL;
-	}
 
 	if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) {
-		ret = attr->device->add_gid(gid, attr, &entry->context);
+		ret = attr->device->add_gid(&entry->gid, &entry->attr,
+					    &entry->context);
 		if (ret) {
 			pr_err("%s GID add failed device=%s port=%d index=%d\n",
 			       __func__, attr->device->name, attr->port_num,
 			       attr->index);
-			goto add_err;
+			return ret;
 		}
+		entry->flags |= GID_TABLE_ENTRY_ADD_GID;
 	}
-	dev_hold(attr->ndev);
 
-add_err:
-	if (!ret)
-		pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
-			 attr->device->name, attr->port_num, ix, gid->raw);
-	return ret;
+	pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
+		 attr->device->name, attr->port_num, entry->attr.index,
+		 entry->gid.raw);
+	return 0;
 }
 
 /**
@@ -232,15 +302,15 @@ static int add_roce_gid(struct ib_gid_table *table,
  * GID. However such zero GIDs are not added to the cache.
  */
 static int add_modify_gid(struct ib_gid_table *table,
-			  const union ib_gid *gid,
-			  const struct ib_gid_attr *attr)
+			  struct ib_gid_entry *entry)
 {
 	int ret;
 
-	if (rdma_protocol_roce(attr->device, attr->port_num)) {
-		ret = add_roce_gid(table, gid, attr);
-		if (ret)
+	if (rdma_protocol_roce(entry->attr.device, entry->attr.port_num)) {
+		ret = setup_roce_gid_entry(table, entry);
+		if (ret) {
 			return ret;
+		}
 	} else {
 		/*
 		 * Some HCA's report multiple GID entries with only one
@@ -248,99 +318,95 @@ static int add_modify_gid(struct ib_gid_table *table,
 		 * So ignore such behavior for IB link layer and don't
 		 * fail the call, but don't add such entry to GID cache.
 		 */
-		if (!memcmp(gid, &zgid, sizeof(*gid)))
+		if (!memcmp(&entry->gid, &zgid, sizeof(entry->gid))) {
+			put_entry(entry);
 			return 0;
+		}
 	}
 
-	lockdep_assert_held(&table->lock);
-	memcpy(&table->data_vec[attr->index].gid, gid, sizeof(*gid));
-	memcpy(&table->data_vec[attr->index].attr, attr, sizeof(*attr));
+	store_entry(table, entry);
 
-	write_lock_irq(&table->rwlock);
-	table->data_vec[attr->index].state = GID_TABLE_ENTRY_ALLOCATED;
-	write_unlock_irq(&table->rwlock);
 	return 0;
 }
 
-/**
- * del_gid - Delete GID table entry
- *
- * @ib_dev:	IB device whose GID entry to be deleted
- * @port:	Port number of the IB device
- * @table:	GID table of the IB device for a port
- * @ix:		GID entry index to delete
- *
- */
-static void del_gid(struct ib_device *ib_dev, u8 port,
-		    struct ib_gid_table *table, int ix)
+static void del_gid(struct ib_gid_table *table,
+		    const struct ib_gid_entry *entry)
 {
+	unsigned int ix = entry->attr.index;
+
 	lockdep_assert_held(&table->lock);
+
 	write_lock_irq(&table->rwlock);
-	table->data_vec[ix].state = GID_TABLE_ENTRY_FREE;
+
+	WARN_ON(entry != table->data_vec[ix]);
+
+	/*
+	 * The data_vec holds a marker for roce preventing re-use of the HW
+	 * slot until the entry kref goes to zero. Other protocols can re-use
+	 * immediately.
+	 */
+	if (rdma_protocol_roce(entry->attr.device, entry->attr.port_num))
+		table->data_vec[ix] = ERR_PTR(-EINVAL);
+	else {
+		table->data_vec[ix] = NULL;
+		((struct ib_gid_entry *)entry)->flags &=
+			~GID_TABLE_ENTRY_IN_TABLE;
+	}
+
 	write_unlock_irq(&table->rwlock);
 
-	if (rdma_protocol_roce(ib_dev, port))
-		del_roce_gid(ib_dev, port, table, ix);
-	memcpy(&table->data_vec[ix].gid, &zgid, sizeof(zgid));
-	memset(&table->data_vec[ix].attr, 0, sizeof(table->data_vec[ix].attr));
-	table->data_vec[ix].context = NULL;
+	put_entry(entry);
 }
 
 /* rwlock should be read locked, or lock should be held */
-static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
-		    const struct ib_gid_attr *val, bool default_gid,
-		    unsigned long mask, int *pempty)
+static const struct ib_gid_entry *find_gid(struct ib_gid_table *table,
+					   const union ib_gid *gid,
+					   const struct ib_gid_attr *val,
+					   bool default_gid, unsigned long mask,
+					   int *pempty)
 {
 	int i = 0;
-	int found = -1;
-	int empty = pempty ? -1 : 0;
+	int empty = -1;
 
-	while (i < table->sz && (found < 0 || empty < 0)) {
-		struct ib_gid_table_entry *data = &table->data_vec[i];
-		struct ib_gid_attr *attr = &data->attr;
+	for (i = 0; i != table->sz; i++) {
+		const struct ib_gid_entry *entry = table->data_vec[i];
+		const struct ib_gid_attr *attr;
 		int curr_index = i;
 
-		i++;
+		/*
+		 * Entry is still being freed, it cannot be returned from
+		 * find_gid
+		 */
+		if (IS_ERR(entry))
+			continue;
 
 		/* find_gid() is used during GID addition where it is expected
-		 * to return a free entry slot which is not duplicate.
-		 * Free entry slot is requested and returned if pempty is set,
-		 * so lookup free slot only if requested.
+		 * to return a free entry slot if there is no duplicate
+		 * entry.
 		 */
-		if (pempty && empty < 0) {
-			if (is_gid_table_entry_free(data) &&
-			    (default_gid ==
-			     !!(data->props & GID_TABLE_ENTRY_DEFAULT))) {
-				/*
-				 * Found an invalid (free) entry; allocate it.
-				 * If default GID is requested, then our
-				 * found slot must be one of the DEFAULT
-				 * reserved slots or we fail.
-				 * This ensures that only DEFAULT reserved
-				 * slots are used for default property GIDs.
-				 */
+		if (entry == NULL) {
+			/*
+			 * Found a free entry; allocate it. If a default GID
+			 * is requested, then our found slot must be one of
+			 * the DEFAULT reserved slots or we fail.  This
+			 * ensures that only DEFAULT reserved slots are used
+			 * for default property GIDs.
+			 */
+			if (empty < 0 &&
+			    (curr_index < table->default_entries) ==
+				    default_gid)
 				empty = curr_index;
-			}
-		}
-
-		/*
-		 * Additionally find_gid() is used to find valid entry during
-		 * lookup operation, where validity needs to be checked. So
-		 * find the empty entry first to continue to search for a free
-		 * slot and ignore its INVALID flag.
-		 */
-		if (!is_gid_table_entry_valid(data))
 			continue;
+		}
 
-		if (found >= 0)
-			continue;
+		attr = &entry->attr;
 
 		if (mask & GID_ATTR_FIND_MASK_GID_TYPE &&
 		    attr->gid_type != val->gid_type)
 			continue;
 
 		if (mask & GID_ATTR_FIND_MASK_GID &&
-		    memcmp(gid, &data->gid, sizeof(*gid)))
+		    memcmp(gid, &entry->gid, sizeof(*gid)))
 			continue;
 
 		if (mask & GID_ATTR_FIND_MASK_NETDEV &&
@@ -348,71 +414,75 @@ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
 			continue;
 
 		if (mask & GID_ATTR_FIND_MASK_DEFAULT &&
-		    !!(data->props & GID_TABLE_ENTRY_DEFAULT) !=
-		    default_gid)
+		    (curr_index < table->default_entries) != default_gid)
 			continue;
 
-		found = curr_index;
+		/*
+		 * if we find a match then empty is not valid as we didn't
+		 * scan the whole list.
+		 */
+		return entry;
 	}
 
 	if (pempty)
 		*pempty = empty;
 
-	return found;
+	return NULL;
 }
 
-static void make_default_gid(struct  net_device *dev, union ib_gid *gid)
+static void make_default_gid(struct net_device *dev, union ib_gid *gid)
 {
 	gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
 	addrconf_ifid_eui48(&gid->raw[8], dev);
 }
 
-static int __ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
-			      union ib_gid *gid, struct ib_gid_attr *attr,
-			      unsigned long mask, bool default_gid)
+static int __ib_cache_gid_add(struct ib_gid_entry *entry, unsigned long mask)
 {
-	struct ib_gid_table *table;
+	struct ib_gid_table *table = get_entry_table(entry);
 	int ret = 0;
 	int empty;
-	int ix;
 
 	/* Do not allow adding zero GID in support of
 	 * IB spec version 1.3 section 4.1.1 point (6) and
 	 * section 12.7.10 and section 12.7.20
 	 */
-	if (!memcmp(gid, &zgid, sizeof(*gid)))
+	if (!memcmp(&entry->gid, &zgid, sizeof(zgid)))
 		return -EINVAL;
 
-	table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid;
-
 	mutex_lock(&table->lock);
 
-	ix = find_gid(table, gid, attr, default_gid, mask, &empty);
-	if (ix >= 0)
+	if (find_gid(table, &entry->gid, &entry->attr,
+		     entry->flags & GID_TABLE_ENTRY_DEFAULT, mask, &empty))
 		goto out_unlock;
 
 	if (empty < 0) {
 		ret = -ENOSPC;
 		goto out_unlock;
 	}
-	attr->device = ib_dev;
-	attr->index = empty;
-	attr->port_num = port;
-	ret = add_modify_gid(table, gid, attr);
-	if (!ret)
-		dispatch_gid_change_event(ib_dev, port);
 
-out_unlock:
+	entry->attr.index = empty;
+	ret = add_modify_gid(table, entry);
 	mutex_unlock(&table->lock);
+
 	if (ret)
-		pr_warn("%s: unable to add gid %pI6 error=%d\n",
-			__func__, gid->raw, ret);
+		goto out;
+
+	dispatch_gid_change_event(entry->attr.device,
+				  entry->attr.port_num);
+	return 0;
+
+out_unlock:
+	mutex_unlock(&table->lock);
+out:
+	pr_warn("%s: unable to add gid %pI6 error=%d\n", __func__,
+		entry->gid.raw, ret);
 	return ret;
 }
 
 int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
 		     union ib_gid *gid, struct ib_gid_attr *attr)
 {
+	struct ib_gid_entry *entry;
 	struct net_device *idev;
 	unsigned long mask;
 	int ret;
@@ -437,7 +507,11 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
 	       GID_ATTR_FIND_MASK_GID_TYPE |
 	       GID_ATTR_FIND_MASK_NETDEV;
 
-	ret = __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false);
+	entry = alloc_entry_roce(ib_dev, port, attr->ndev, attr->gid_type);
+	ret = __ib_cache_gid_add(entry, mask);
+	if (ret)
+		put_entry(entry);
+
 	return ret;
 }
 
@@ -446,28 +520,30 @@ _ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
 		  union ib_gid *gid, struct ib_gid_attr *attr,
 		  unsigned long mask, bool default_gid)
 {
-	struct ib_gid_table *table;
+	struct ib_gid_table *table = get_table(ib_dev, port);
+	const struct ib_gid_entry *entry;
 	int ret = 0;
-	int ix;
-
-	table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid;
 
 	mutex_lock(&table->lock);
 
-	ix = find_gid(table, gid, attr, default_gid, mask, NULL);
-	if (ix < 0) {
+	entry = find_gid(table, gid, attr, default_gid, mask, NULL);
+	if (!entry) {
 		ret = -EINVAL;
 		goto out_unlock;
 	}
 
-	del_gid(ib_dev, port, table, ix);
+	del_gid(table, entry);
+
+	mutex_unlock(&table->lock);
+
 	dispatch_gid_change_event(ib_dev, port);
 
+	return 0;
+
 out_unlock:
 	mutex_unlock(&table->lock);
-	if (ret)
-		pr_debug("%s: can't delete gid %pI6 error=%d\n",
-			 __func__, gid->raw, ret);
+	pr_debug("%s: can't delete gid %pI6 error=%d\n",
+		 __func__, gid->raw, ret);
 	return ret;
 }
 
@@ -485,17 +561,18 @@ int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
 int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
 				     struct net_device *ndev)
 {
-	struct ib_gid_table *table;
+	struct ib_gid_table *table = get_table(ib_dev, port);
 	int ix;
 	bool deleted = false;
 
-	table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid;
-
 	mutex_lock(&table->lock);
 
 	for (ix = 0; ix < table->sz; ix++) {
-		if (table->data_vec[ix].attr.ndev == ndev) {
-			del_gid(ib_dev, port, table, ix);
+		if (IS_ERR_OR_NULL(table->data_vec[ix]))
+			continue;
+
+		if (table->data_vec[ix]->attr.ndev == ndev) {
+			del_gid(table, table->data_vec[ix]);
 			deleted = true;
 		}
 	}
@@ -511,19 +588,19 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
 static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index,
 			      union ib_gid *gid, struct ib_gid_attr *attr)
 {
-	struct ib_gid_table *table;
-
-	table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid;
+	struct ib_gid_table *table = get_table(ib_dev, port);
+	const struct ib_gid_entry *entry;
 
 	if (index < 0 || index >= table->sz)
 		return -EINVAL;
 
-	if (!is_gid_table_entry_valid(&table->data_vec[index]))
+	if (IS_ERR_OR_NULL(table->data_vec[index]))
 		return -EAGAIN;
 
-	memcpy(gid, &table->data_vec[index].gid, sizeof(*gid));
+	entry = table->data_vec[index];
+	memcpy(gid, &entry->gid, sizeof(*gid));
 	if (attr) {
-		memcpy(attr, &table->data_vec[index].attr, sizeof(*attr));
+		memcpy(attr, &entry->attr, sizeof(*attr));
 		if (attr->ndev)
 			dev_hold(attr->ndev);
 	}
@@ -531,33 +608,28 @@ static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index,
 	return 0;
 }
 
-static int _ib_cache_gid_table_find(struct ib_device *ib_dev,
-				    const union ib_gid *gid,
-				    const struct ib_gid_attr *val,
-				    unsigned long mask,
-				    u8 *port, u16 *index)
+static const struct ib_gid_entry *_ib_cache_gid_table_find(
+	struct ib_device *ib_dev, const union ib_gid *gid,
+	const struct ib_gid_attr *val, unsigned long mask)
 {
+	const struct ib_gid_entry *entry = NULL;
 	struct ib_gid_table *table;
 	u8 p;
-	int local_index;
 	unsigned long flags;
 
 	for (p = 0; p < ib_dev->phys_port_cnt; p++) {
 		table = ib_dev->cache.ports[p].gid;
+
 		read_lock_irqsave(&table->rwlock, flags);
-		local_index = find_gid(table, gid, val, false, mask, NULL);
-		if (local_index >= 0) {
-			if (index)
-				*index = local_index;
-			if (port)
-				*port = p + rdma_start_port(ib_dev);
-			read_unlock_irqrestore(&table->rwlock, flags);
-			return 0;
-		}
+		entry = find_gid(table, gid, val, false, mask, NULL);
+		kref_get((struct kref *)&entry->kref);
 		read_unlock_irqrestore(&table->rwlock, flags);
+
+		if (entry)
+			break;
 	}
 
-	return -ENOENT;
+	return entry;
 }
 
 static int ib_cache_gid_find(struct ib_device *ib_dev,
@@ -566,6 +638,8 @@ static int ib_cache_gid_find(struct ib_device *ib_dev,
 			     struct net_device *ndev, u8 *port,
 			     u16 *index)
 {
+	const struct ib_gid_entry *entry;
+
 	unsigned long mask = GID_ATTR_FIND_MASK_GID |
 			     GID_ATTR_FIND_MASK_GID_TYPE;
 	struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type};
@@ -573,8 +647,16 @@ static int ib_cache_gid_find(struct ib_device *ib_dev,
 	if (ndev)
 		mask |= GID_ATTR_FIND_MASK_NETDEV;
 
-	return _ib_cache_gid_table_find(ib_dev, gid, &gid_attr_val,
-					mask, port, index);
+	entry = _ib_cache_gid_table_find(ib_dev, gid, &gid_attr_val, mask);
+	if (!entry)
+		return -ENOENT;
+
+	*port = entry->attr.port_num;
+	*index = entry->attr.index;
+
+	put_entry(entry);
+
+	return 0;
 }
 
 /**
@@ -596,7 +678,7 @@ int ib_find_cached_gid_by_port(struct ib_device *ib_dev,
 			       u8 port, struct net_device *ndev,
 			       u16 *index)
 {
-	int local_index;
+	const struct ib_gid_entry *entry;
 	struct ib_gid_table *table;
 	unsigned long mask = GID_ATTR_FIND_MASK_GID |
 			     GID_ATTR_FIND_MASK_GID_TYPE;
@@ -606,16 +688,16 @@ int ib_find_cached_gid_by_port(struct ib_device *ib_dev,
 	if (!rdma_is_port_valid(ib_dev, port))
 		return -ENOENT;
 
-	table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid;
+	table = get_table(ib_dev, port);
 
 	if (ndev)
 		mask |= GID_ATTR_FIND_MASK_NETDEV;
 
 	read_lock_irqsave(&table->rwlock, flags);
-	local_index = find_gid(table, gid, &val, false, mask, NULL);
-	if (local_index >= 0) {
+	entry = find_gid(table, gid, &val, false, mask, NULL);
+	if (entry) {
 		if (index)
-			*index = local_index;
+			*index = entry->attr.index;
 		read_unlock_irqrestore(&table->rwlock, flags);
 		return 0;
 	}
@@ -659,26 +741,21 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev,
 	unsigned long flags;
 	bool found = false;
 
-
 	if (!rdma_is_port_valid(ib_dev, port) ||
 	    !rdma_protocol_roce(ib_dev, port))
 		return -EPROTONOSUPPORT;
 
-	table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid;
+	table = get_table(ib_dev, port);
 
 	read_lock_irqsave(&table->rwlock, flags);
 	for (i = 0; i < table->sz; i++) {
-		struct ib_gid_attr attr;
-
-		if (!is_gid_table_entry_valid(&table->data_vec[i]))
+		if (IS_ERR_OR_NULL(table->data_vec[i]))
 			continue;
 
-		if (memcmp(gid, &table->data_vec[i].gid, sizeof(*gid)))
+		if (memcmp(gid, &table->data_vec[i]->gid, sizeof(*gid)))
 			continue;
 
-		memcpy(&attr, &table->data_vec[i].attr, sizeof(attr));
-
-		if (filter(gid, &attr, context)) {
+		if (filter(gid, &table->data_vec[i]->attr, context)) {
 			found = true;
 			if (index)
 				*index = i;
@@ -692,42 +769,28 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev,
 	return 0;
 }
 
-static struct ib_gid_table *alloc_gid_table(int sz)
+static struct ib_gid_table *alloc_gid_table(unsigned int sz)
 {
-	struct ib_gid_table *table =
-		kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL);
-	int i;
+	struct ib_gid_table *table;
 
+	table = kzalloc(sizeof(struct ib_gid_table) +
+				sizeof(table->data_vec[0]) * sz,
+			GFP_KERNEL);
 	if (!table)
 		return NULL;
 
-	table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL);
-	if (!table->data_vec)
-		goto err_free_table;
-
 	mutex_init(&table->lock);
 
 	table->sz = sz;
 	rwlock_init(&table->rwlock);
 
-	/* Mark all entries as invalid so that allocator can allocate
-	 * one of the invalid (free) entry.
-	 */
-	for (i = 0; i < sz; i++)
-		table->data_vec[i].state = GID_TABLE_ENTRY_FREE;
 	return table;
-
-err_free_table:
-	kfree(table);
-	return NULL;
 }
 
 static void release_gid_table(struct ib_gid_table *table)
 {
-	if (table) {
-		kfree(table->data_vec);
+	if (table)
 		kfree(table);
-	}
 }
 
 static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port,
@@ -741,11 +804,11 @@ static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port,
 
 	mutex_lock(&table->lock);
 	for (i = 0; i < table->sz; ++i) {
-		if (memcmp(&table->data_vec[i].gid, &zgid,
-			   sizeof(table->data_vec[i].gid))) {
-			del_gid(ib_dev, port, table, i);
-			deleted = true;
-		}
+		if (IS_ERR_OR_NULL(table->data_vec[i]))
+			continue;
+
+		del_gid(table, table->data_vec[i]);
+		deleted = true;
 	}
 	mutex_unlock(&table->lock);
 
@@ -758,59 +821,38 @@ void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
 				  unsigned long gid_type_mask,
 				  enum ib_cache_gid_default_mode mode)
 {
-	union ib_gid gid = { };
-	struct ib_gid_attr gid_attr;
-	struct ib_gid_table *table;
 	unsigned int gid_type;
 	unsigned long mask;
 
-	table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid;
-
 	mask = GID_ATTR_FIND_MASK_GID_TYPE |
 	       GID_ATTR_FIND_MASK_DEFAULT |
 	       GID_ATTR_FIND_MASK_NETDEV;
-	memset(&gid_attr, 0, sizeof(gid_attr));
-	gid_attr.ndev = ndev;
 
 	for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) {
 		if (1UL << gid_type & ~gid_type_mask)
 			continue;
 
-		gid_attr.gid_type = gid_type;
 
 		if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) {
-			make_default_gid(ndev, &gid);
-			__ib_cache_gid_add(ib_dev, port, &gid,
-					   &gid_attr, mask, true);
+			struct ib_gid_entry *entry =
+				alloc_entry_roce(ib_dev, port, ndev, gid_type);
+
+			entry->flags |= GID_TABLE_ENTRY_DEFAULT;
+			make_default_gid(ndev, &entry->gid);
+			if (__ib_cache_gid_add(entry, mask))
+				put_entry(entry);
 		} else if (mode == IB_CACHE_GID_DEFAULT_MODE_DELETE) {
-			_ib_cache_gid_del(ib_dev, port, &gid,
+			struct ib_gid_attr gid_attr = {
+				.gid_type = gid_type,
+				.ndev = ndev,
+			};
+
+			_ib_cache_gid_del(ib_dev, port, NULL,
 					  &gid_attr, mask, true);
 		}
 	}
 }
 
-static void gid_table_reserve_default(struct ib_device *ib_dev, u8 port,
-				      struct ib_gid_table *table)
-{
-	unsigned int i;
-	unsigned long roce_gid_type_mask;
-	unsigned int num_default_gids;
-	unsigned int current_gid = 0;
-
-	roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
-	num_default_gids = hweight_long(roce_gid_type_mask);
-	for (i = 0; i < num_default_gids && i < table->sz; i++) {
-		struct ib_gid_table_entry *entry = &table->data_vec[i];
-
-		entry->props |= GID_TABLE_ENTRY_DEFAULT;
-		current_gid = find_next_bit(&roce_gid_type_mask,
-					    BITS_PER_LONG,
-					    current_gid);
-		entry->attr.gid_type = current_gid++;
-	}
-}
-
-
 static void gid_table_release_one(struct ib_device *ib_dev)
 {
 	struct ib_gid_table *table;
@@ -836,7 +878,8 @@ static int _gid_table_setup_one(struct ib_device *ib_dev)
 		if (!table)
 			goto rollback_table_setup;
 
-		gid_table_reserve_default(ib_dev, rdma_port, table);
+		table->default_entries =
+			hweight_long(roce_gid_type_mask_support(ib_dev, rdma_port));
 		ib_dev->cache.ports[port].gid = table;
 	}
 	return 0;
@@ -885,7 +928,8 @@ int ib_get_cached_gid(struct ib_device *device,
 	if (!rdma_is_port_valid(device, port_num))
 		return -EINVAL;
 
-	table = device->cache.ports[port_num - rdma_start_port(device)].gid;
+	table = get_table(device, port_num);
+
 	read_lock_irqsave(&table->rwlock, flags);
 	res = __ib_cache_gid_get(device, port_num, index, gid, gid_attr);
 	read_unlock_irqrestore(&table->rwlock, flags);
@@ -1095,31 +1139,35 @@ EXPORT_SYMBOL(ib_get_cached_port_state);
 static int config_non_roce_gid_cache(struct ib_device *device,
 				     u8 port, int gid_tbl_len)
 {
-	struct ib_gid_attr gid_attr = {};
-	struct ib_gid_table *table;
-	union ib_gid gid;
+	struct ib_gid_table *table = get_table(device, port);
+	struct ib_gid_entry *entry = NULL;
 	int ret = 0;
 	int i;
 
-	gid_attr.device = device;
-	gid_attr.port_num = port;
-	table = device->cache.ports[port - rdma_start_port(device)].gid;
-
 	mutex_lock(&table->lock);
 	for (i = 0; i < gid_tbl_len; ++i) {
 		if (!device->query_gid)
 			continue;
-		ret = device->query_gid(device, port, i, &gid);
+
+		entry = alloc_entry(device, port);
+		if (!entry)
+			return -ENOMEM;
+
+		ret = device->query_gid(device, port, i, &entry->gid);
 		if (ret) {
 			pr_warn("query_gid failed (%d) for %s (index %d)\n",
 				ret, device->name, i);
 			goto err;
 		}
-		gid_attr.index = i;
-		add_modify_gid(table, &gid, &gid_attr);
+
+		entry->attr.index = i;
+		ret = add_modify_gid(table, entry);
+		if (ret)
+			goto err;
 	}
 err:
 	mutex_unlock(&table->lock);
+	put_entry(entry);
 	return ret;
 }
 
@@ -1136,7 +1184,7 @@ static void ib_cache_update(struct ib_device *device,
 	if (!rdma_is_port_valid(device, port))
 		return;
 
-	table = device->cache.ports[port - rdma_start_port(device)].gid;
+	table = get_table(device, port);
 
 	tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
 	if (!tprops)
@@ -1297,6 +1345,12 @@ void ib_cache_cleanup_one(struct ib_device *device)
 	ib_unregister_event_handler(&device->cache.event_handler);
 	flush_workqueue(ib_wq);
 	gid_table_cleanup_one(device);
+
+	/*
+	 * Flush the wq second time for any pending GID delete
+	 * work for RoCE device.
+	 */
+	flush_workqueue(ib_wq);
 }
 
 void __init ib_cache_setup(void)
-- 
2.17.0

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux