[PATCH rdma-next 3/4] IB/core: Introduce GID entry reference counts

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Parav Pandit <parav@xxxxxxxxxxxx>

Currently a GID entry can be deleted from the GID table while it is
in use by ULPs, CM modules or user space applications.
It may even get replaced with new attributes and/or GID value which
may lead to undesired traffic in the network.
Previous and new GID entry might be in two different net namespace; this
may lead to send traffic to undesired namespace.
Provider drivers might be working on GID entry while modifying QP or
creating address handle, and GID entry can get modified.

So this patch introduces GID reference count infrastructure without
actually exposing GID get/put/hold APIs to overcome above issues.

GID entry reference count is based on standard kref structure.
The GID table entries are re-organized so they are written once before
being added to the table, and then never changed during the lifetime
of the entry.

This patch introduces GID entry state as INVALID, VALID and
PENDING_DEL. GID_TABLE_ENTRY_PENDING_DEL indicates that a given
GID entry is marked as pending for deletion. While it is in pending
state, allocator must not reuse such entry. When GID entry is in
PENDING_DEL state, no more new references can be taken using find
operation. This ensures that no new processing starts for such
expiring GID entry.

For IB port GID table entry location is decided by the HCA provider
driver. Which means that when GID table entries are in use by the
users of it, ib_cache_update() can overwrite the GID entry and
attributes for IB link layer.
Therefore, when an entry is deleted/added (RoCE) or replaced (IB)
new memory is allocated to replace the old entry and the old entry
lives on via a kref until all users finally drop it.

It also adds and uses readable helper functions at multiple places
to check GID entry state such as is_free() and is_valid().

Signed-off-by: Parav Pandit <parav@xxxxxxxxxxxx>
Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx>
---
 drivers/infiniband/core/cache.c | 285 +++++++++++++++++++++++++++++-----------
 include/rdma/ib_verbs.h         |   1 +
 2 files changed, 207 insertions(+), 79 deletions(-)

diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index f4279bcff777..77d190d91f1a 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -66,15 +66,24 @@ enum gid_attr_find_mask {
 	GID_ATTR_FIND_MASK_GID_TYPE	= 1UL << 3,
 };
 
-enum gid_table_entry_props {
-	GID_TABLE_ENTRY_INVALID		= 1UL << 0,
+enum gid_table_entry_state {
+	GID_TABLE_ENTRY_INVALID		= 1,
+	GID_TABLE_ENTRY_VALID		= 2,
+	/*
+	 * Indicates that entry is pending to be removed, there may
+	 * be active users of this GID entry.
+	 * When last user of the GID entry releases reference to it,
+	 * GID entry is detached from the table.
+	 */
+	GID_TABLE_ENTRY_PENDING_DEL	= 3,
 };
 
 struct ib_gid_table_entry {
-	unsigned long	    props;
-	union ib_gid        gid;
-	struct ib_gid_attr  attr;
-	void		   *context;
+	struct kref			kref;
+	struct work_struct		del_work;
+	struct ib_gid_attr		attr;
+	void				*context;
+	enum gid_table_entry_state	state;
 };
 
 struct ib_gid_table {
@@ -90,15 +99,16 @@ struct ib_gid_table {
 	 *
 	 **/
 	/* Any writer to data_vec must hold this lock and the write side of
-	 * rwlock. readers must hold only rwlock. All writers must be in a
+	 * rwlock. Readers must hold only rwlock. All writers must be in a
 	 * sleepable context.
 	 */
 	struct mutex			lock;
-	/* rwlock protects data_vec[ix]->props. */
+	/* rwlock protects data_vec[ix]->state and entry pointer.
+	 */
 	rwlock_t			rwlock;
+	struct ib_gid_table_entry	**data_vec;
 	/* bit field, each bit indicates the index of default GID */
 	u32				default_gid_indices;
-	struct ib_gid_table_entry	*data_vec;
 };
 
 static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port)
@@ -179,21 +189,106 @@ static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u8 port)
 	return device->cache.ports[port - rdma_start_port(device)].gid;
 }
 
-static void del_roce_gid(struct ib_device *device, u8 port_num,
-			 struct ib_gid_table *table, int ix)
+static bool is_gid_entry_free(const struct ib_gid_table_entry *entry)
+{
+	return !entry;
+}
+
+static bool is_gid_entry_valid(const struct ib_gid_table_entry *entry)
 {
+	return entry && entry->state == GID_TABLE_ENTRY_VALID;
+}
+
+static void schedule_free_gid(struct kref *kref)
+{
+	struct ib_gid_table_entry *entry =
+			container_of(kref, struct ib_gid_table_entry, kref);
+
+	queue_work(ib_wq, &entry->del_work);
+}
+
+static void free_gid_entry(struct ib_gid_table_entry *entry)
+{
+	struct ib_device *device = entry->attr.device;
+	u8 port_num = entry->attr.port_num;
+	struct ib_gid_table *table = rdma_gid_table(device, port_num);
+
 	pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
-		 device->name, port_num, ix,
-		 table->data_vec[ix].gid.raw);
+		 device->name, port_num, entry->attr.index,
+		 entry->attr.gid.raw);
+
+	mutex_lock(&table->lock);
+	if (rdma_cap_roce_gid_table(device, port_num) &&
+	    entry->state != GID_TABLE_ENTRY_INVALID)
+		device->del_gid(&entry->attr, &entry->context);
+	write_lock_irq(&table->rwlock);
+
+	/*
+	 * The only way to avoid overwriting NULL in table is
+	 * by comparing if it is same entry in table or not!
+	 * If new entry in table is added by the time we free here,
+	 * don't overwrite the table entry.
+	 */
+	if (entry == table->data_vec[entry->attr.index])
+		table->data_vec[entry->attr.index] = NULL;
+	/* Now this index is ready to be allocated */
+	write_unlock_irq(&table->rwlock);
+	mutex_unlock(&table->lock);
+
+	if (entry->attr.ndev)
+		dev_put(entry->attr.ndev);
+	kfree(entry);
+}
+
+/**
+ * free_gid_work - Release reference to the GID entry
+ * @work: Work structure to refer to GID entry which needs to be
+ * deleted.
+ *
+ * free_gid_work() frees the entry from the HCA's hardware table
+ * if provider supports it. It releases reference to netdevice.
+ */
+static void free_gid_work(struct work_struct *work)
+{
+	struct ib_gid_table_entry *entry =
+			container_of(work, struct ib_gid_table_entry, del_work);
+	free_gid_entry(entry);
+}
+
+static struct ib_gid_table_entry *
+alloc_gid_entry(const struct ib_gid_attr *attr)
+{
+	struct ib_gid_table_entry *entry;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return NULL;
+	kref_init(&entry->kref);
+	memcpy(&entry->attr, attr, sizeof(*attr));
+	if (entry->attr.ndev)
+		dev_hold(entry->attr.ndev);
+	INIT_WORK(&entry->del_work, free_gid_work);
+	entry->state = GID_TABLE_ENTRY_INVALID;
+	return entry;
+}
+
+static void
+store_gid_entry(struct ib_gid_table *table, struct ib_gid_table_entry *entry)
+{
+	entry->state = GID_TABLE_ENTRY_VALID;
+
+	lockdep_assert_held(&table->lock);
+	write_lock_irq(&table->rwlock);
+	table->data_vec[entry->attr.index] = entry;
+	write_unlock_irq(&table->rwlock);
+}
 
-	if (rdma_cap_roce_gid_table(device, port_num))
-		device->del_gid(&table->data_vec[ix].attr,
-				&table->data_vec[ix].context);
-	dev_put(table->data_vec[ix].attr.ndev);
+static void put_gid_entry(struct ib_gid_table_entry *entry)
+{
+	kref_put(&entry->kref, schedule_free_gid);
 }
 
 static int add_roce_gid(struct ib_gid_table *table,
-			const union ib_gid *gid,
 			const struct ib_gid_attr *attr)
 {
 	struct ib_gid_table_entry *entry;
@@ -207,8 +302,8 @@ static int add_roce_gid(struct ib_gid_table *table,
 		return -EINVAL;
 	}
 
-	entry = &table->data_vec[ix];
-	if ((entry->props & GID_TABLE_ENTRY_INVALID) == 0) {
+	entry = table->data_vec[ix];
+	if (!is_gid_entry_free(entry)) {
 		WARN(1, "GID table corruption device=%s port=%d index=%d\n",
 		     attr->device->name, attr->port_num,
 		     attr->index);
@@ -216,7 +311,7 @@ static int add_roce_gid(struct ib_gid_table *table,
 	}
 
 	if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) {
-		ret = attr->device->add_gid(gid, attr, &entry->context);
+		ret = attr->device->add_gid(&attr->gid, attr, &entry->context);
 		if (ret) {
 			pr_err("%s GID add failed device=%s port=%d index=%d\n",
 			       __func__, attr->device->name, attr->port_num,
@@ -224,12 +319,11 @@ static int add_roce_gid(struct ib_gid_table *table,
 			goto add_err;
 		}
 	}
-	dev_hold(attr->ndev);
 
 add_err:
 	if (!ret)
 		pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
-			 attr->device->name, attr->port_num, ix, gid->raw);
+			 attr->device->name, attr->port_num, ix, attr->gid.raw);
 	return ret;
 }
 
@@ -237,7 +331,6 @@ static int add_roce_gid(struct ib_gid_table *table,
  * add_modify_gid - Add or modify GID table entry
  *
  * @table:	GID table in which GID to be added or modified
- * @gid:	GID content
  * @attr:	Attributes of the GID
  *
  * Returns 0 on success or appropriate error code. It accepts zero
@@ -245,15 +338,19 @@ static int add_roce_gid(struct ib_gid_table *table,
  * GID. However such zero GIDs are not added to the cache.
  */
 static int add_modify_gid(struct ib_gid_table *table,
-			  const union ib_gid *gid,
 			  const struct ib_gid_attr *attr)
 {
-	int ret;
+	struct ib_gid_table_entry *entry;
+	int ret = 0;
+
+	entry = alloc_gid_entry(attr);
+	if (!entry)
+		return -ENOMEM;
 
 	if (rdma_protocol_roce(attr->device, attr->port_num)) {
-		ret = add_roce_gid(table, gid, attr);
+		ret = add_roce_gid(table, attr);
 		if (ret)
-			return ret;
+			goto done;
 	} else {
 		/*
 		 * Some HCA's report multiple GID entries with only one
@@ -261,18 +358,23 @@ static int add_modify_gid(struct ib_gid_table *table,
 		 * So ignore such behavior for IB link layer and don't
 		 * fail the call, but don't add such entry to GID cache.
 		 */
-		if (rdma_is_zero_gid(gid))
-			return 0;
+		if (rdma_is_zero_gid(&attr->gid)) {
+			ret = 0;
+			goto done;
+		}
+		/*
+		 * If any past GID entry exists, schedule freeing it.
+		 */
+		if (is_gid_entry_valid(table->data_vec[attr->index]))
+			put_gid_entry(table->data_vec[attr->index]);
 	}
 
-	lockdep_assert_held(&table->lock);
-	memcpy(&table->data_vec[attr->index].gid, gid, sizeof(*gid));
-	memcpy(&table->data_vec[attr->index].attr, attr, sizeof(*attr));
-
-	write_lock_irq(&table->rwlock);
-	table->data_vec[attr->index].props &= ~GID_TABLE_ENTRY_INVALID;
-	write_unlock_irq(&table->rwlock);
+	store_gid_entry(table, entry);
 	return 0;
+
+done:
+	put_gid_entry(entry);
+	return ret;
 }
 
 /**
@@ -287,16 +389,25 @@ static int add_modify_gid(struct ib_gid_table *table,
 static void del_gid(struct ib_device *ib_dev, u8 port,
 		    struct ib_gid_table *table, int ix)
 {
+	struct ib_gid_table_entry *entry;
+
 	lockdep_assert_held(&table->lock);
+
+	pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
+		 ib_dev->name, port, ix,
+		 table->data_vec[ix]->attr.gid.raw);
+
 	write_lock_irq(&table->rwlock);
-	table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID;
+	entry = table->data_vec[ix];
+	entry->state = GID_TABLE_ENTRY_PENDING_DEL;
+	/*
+	 * For non RoCE protocol, GID entry slot is ready to use.
+	 */
+	if (!rdma_protocol_roce(ib_dev, port))
+		table->data_vec[ix] = NULL;
 	write_unlock_irq(&table->rwlock);
 
-	if (rdma_protocol_roce(ib_dev, port))
-		del_roce_gid(ib_dev, port, table, ix);
-	memset(&table->data_vec[ix].gid, 0, sizeof(table->data_vec[ix].gid));
-	memset(&table->data_vec[ix].attr, 0, sizeof(table->data_vec[ix].attr));
-	table->data_vec[ix].context = NULL;
+	put_gid_entry(entry);
 }
 
 /* rwlock should be read locked, or lock should be held */
@@ -309,8 +420,8 @@ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
 	int empty = pempty ? -1 : 0;
 
 	while (i < table->sz && (found < 0 || empty < 0)) {
-		struct ib_gid_table_entry *data = &table->data_vec[i];
-		struct ib_gid_attr *attr = &data->attr;
+		struct ib_gid_table_entry *data = table->data_vec[i];
+		struct ib_gid_attr *attr;
 		int curr_index = i;
 
 		i++;
@@ -321,9 +432,9 @@ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
 		 * so lookup free slot only if requested.
 		 */
 		if (pempty && empty < 0) {
-			if (data->props & GID_TABLE_ENTRY_INVALID &&
-			    (default_gid ==
-				is_gid_index_default(table, curr_index))) {
+			if (is_gid_entry_free(data) &&
+			    default_gid ==
+				is_gid_index_default(table, curr_index)) {
 				/*
 				 * Found an invalid (free) entry; allocate it.
 				 * If default GID is requested, then our
@@ -338,22 +449,23 @@ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
 
 		/*
 		 * Additionally find_gid() is used to find valid entry during
-		 * lookup operation, where validity needs to be checked. So
-		 * find the empty entry first to continue to search for a free
-		 * slot and ignore its INVALID flag.
+		 * lookup operation; so ignore the entries which are marked as
+		 * pending for removal and the entries which are marked as
+		 * invalid.
 		 */
-		if (data->props & GID_TABLE_ENTRY_INVALID)
+		if (!is_gid_entry_valid(data))
 			continue;
 
 		if (found >= 0)
 			continue;
 
+		attr = &data->attr;
 		if (mask & GID_ATTR_FIND_MASK_GID_TYPE &&
 		    attr->gid_type != val->gid_type)
 			continue;
 
 		if (mask & GID_ATTR_FIND_MASK_GID &&
-		    memcmp(gid, &data->gid, sizeof(*gid)))
+		    memcmp(gid, &data->attr.gid, sizeof(*gid)))
 			continue;
 
 		if (mask & GID_ATTR_FIND_MASK_NETDEV &&
@@ -410,7 +522,8 @@ static int __ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
 	attr->device = ib_dev;
 	attr->index = empty;
 	attr->port_num = port;
-	ret = add_modify_gid(table, gid, attr);
+	attr->gid = *gid;
+	ret = add_modify_gid(table, attr);
 	if (!ret)
 		dispatch_gid_change_event(ib_dev, port);
 
@@ -506,7 +619,8 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
 	mutex_lock(&table->lock);
 
 	for (ix = 0; ix < table->sz; ix++) {
-		if (table->data_vec[ix].attr.ndev == ndev) {
+		if (is_gid_entry_valid(table->data_vec[ix]) &&
+		    table->data_vec[ix]->attr.ndev == ndev) {
 			del_gid(ib_dev, port, table, ix);
 			deleted = true;
 		}
@@ -530,12 +644,13 @@ static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index,
 	if (index < 0 || index >= table->sz)
 		return -EINVAL;
 
-	if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID)
+	if (!is_gid_entry_valid(table->data_vec[index]))
 		return -EAGAIN;
 
-	memcpy(gid, &table->data_vec[index].gid, sizeof(*gid));
+	memcpy(gid, &table->data_vec[index]->attr.gid, sizeof(*gid));
 	if (attr) {
-		memcpy(attr, &table->data_vec[index].attr, sizeof(*attr));
+		memcpy(attr, &table->data_vec[index]->attr,
+		       sizeof(*attr));
 		if (attr->ndev)
 			dev_hold(attr->ndev);
 	}
@@ -682,13 +797,14 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev,
 	for (i = 0; i < table->sz; i++) {
 		struct ib_gid_attr attr;
 
-		if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID)
+		if (!is_gid_entry_valid(table->data_vec[i]))
 			continue;
 
-		if (memcmp(gid, &table->data_vec[i].gid, sizeof(*gid)))
+		if (memcmp(gid, &table->data_vec[i]->attr.gid,
+			   sizeof(*gid)))
 			continue;
 
-		memcpy(&attr, &table->data_vec[i].attr, sizeof(attr));
+		memcpy(&attr, &table->data_vec[i]->attr, sizeof(attr));
 
 		if (filter(gid, &attr, context)) {
 			found = true;
@@ -706,9 +822,7 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev,
 
 static struct ib_gid_table *alloc_gid_table(int sz)
 {
-	struct ib_gid_table *table =
-		kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL);
-	int i;
+	struct ib_gid_table *table = kzalloc(sizeof(*table), GFP_KERNEL);
 
 	if (!table)
 		return NULL;
@@ -721,12 +835,6 @@ static struct ib_gid_table *alloc_gid_table(int sz)
 
 	table->sz = sz;
 	rwlock_init(&table->rwlock);
-
-	/* Mark all entries as invalid so that allocator can allocate
-	 * one of the invalid (free) entry.
-	 */
-	for (i = 0; i < sz; i++)
-		table->data_vec[i].props |= GID_TABLE_ENTRY_INVALID;
 	return table;
 
 err_free_table:
@@ -734,12 +842,26 @@ static struct ib_gid_table *alloc_gid_table(int sz)
 	return NULL;
 }
 
-static void release_gid_table(struct ib_gid_table *table)
+static void release_gid_table(struct ib_device *device,
+			      u8 port, struct ib_gid_table *table)
 {
-	if (table) {
-		kfree(table->data_vec);
-		kfree(table);
+	int i;
+
+	if (!table)
+		return;
+
+	for (i = 0; i < table->sz; i++) {
+		if (is_gid_entry_free(table->data_vec[i]))
+			continue;
+		if (kref_read(&table->data_vec[i]->kref) > 1)
+			pr_warn("GID entry ref leak for %s (index %d) ref=%d\n",
+				device->name, i,
+				kref_read(&table->data_vec[i]->kref));
+		put_gid_entry(table->data_vec[i]);
 	}
+
+	kfree(table->data_vec);
+	kfree(table);
 }
 
 static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port,
@@ -753,7 +875,7 @@ static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port,
 
 	mutex_lock(&table->lock);
 	for (i = 0; i < table->sz; ++i) {
-		if (!rdma_is_zero_gid(&table->data_vec[i].gid)) {
+		if (is_gid_entry_valid(table->data_vec[i])) {
 			del_gid(ib_dev, port, table, i);
 			deleted = true;
 		}
@@ -822,7 +944,7 @@ static void gid_table_release_one(struct ib_device *ib_dev)
 
 	for (port = 0; port < ib_dev->phys_port_cnt; port++) {
 		table = ib_dev->cache.ports[port].gid;
-		release_gid_table(table);
+		release_gid_table(ib_dev, port, table);
 		ib_dev->cache.ports[port].gid = NULL;
 	}
 }
@@ -1101,7 +1223,6 @@ static int config_non_roce_gid_cache(struct ib_device *device,
 {
 	struct ib_gid_attr gid_attr = {};
 	struct ib_gid_table *table;
-	union ib_gid gid;
 	int ret = 0;
 	int i;
 
@@ -1113,14 +1234,14 @@ static int config_non_roce_gid_cache(struct ib_device *device,
 	for (i = 0; i < gid_tbl_len; ++i) {
 		if (!device->query_gid)
 			continue;
-		ret = device->query_gid(device, port, i, &gid);
+		ret = device->query_gid(device, port, i, &gid_attr.gid);
 		if (ret) {
 			pr_warn("query_gid failed (%d) for %s (index %d)\n",
 				ret, device->name, i);
 			goto err;
 		}
 		gid_attr.index = i;
-		add_modify_gid(table, &gid, &gid_attr);
+		add_modify_gid(table, &gid_attr);
 	}
 err:
 	mutex_unlock(&table->lock);
@@ -1301,6 +1422,12 @@ void ib_cache_cleanup_one(struct ib_device *device)
 	ib_unregister_event_handler(&device->cache.event_handler);
 	flush_workqueue(ib_wq);
 	gid_table_cleanup_one(device);
+
+	/*
+	 * Flush the wq second time for any pending GID delete
+	 * work for RoCE device.
+	 */
+	flush_workqueue(ib_wq);
 }
 
 void __init ib_cache_setup(void)
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 735ae07d03fa..1b13b4bbfbfb 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -94,6 +94,7 @@ enum ib_gid_type {
 struct ib_gid_attr {
 	struct net_device	*ndev;
 	struct ib_device	*device;
+	union ib_gid		gid;
 	enum ib_gid_type	gid_type;
 	u16			index;
 	u8			port_num;
-- 
2.14.3

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux