From: Parav Pandit <parav@xxxxxxxxxxxx> In order to be able to expose pointers to the ib_gid_attrs in the GID table we need to make it so the value of the pointer cannot be changed. Thus each GID table entry gets a unique piece of kref'd memory that is written only during initialization and remains constant for its lifetime. This eventually will allow the struct ib_gid_attrs to be returned without copy from many of query the APIs, but it also provides a way to track when all users of a HW table index go away. For roce we no longer allow an in-use HW table index to be re-used for a new an different entry. When a GID table entry needs to be removed it is hidden from the find API, but remains as a valid HW index and all ib_gid_attr points remain valid. The HW index is not relased until all users put the kref. Later patches will broadly replace the use of the sgid_index integer with the kref'd structure. Ultimately this will prevent security problems where the OS changes the properties of a HW GID table entry while an active user object is still using the entry. Signed-off-by: Parav Pandit <parav@xxxxxxxxxxxx> Signed-off-by: Jason Gunthorpe <jgg@xxxxxxxxxxxx> Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx> --- drivers/infiniband/core/cache.c | 319 +++++++++++++++++++++++++++------------- include/rdma/ib_verbs.h | 1 + 2 files changed, 217 insertions(+), 103 deletions(-) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index d51fe92ecf50..eca74ae9dd5b 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -66,15 +66,24 @@ enum gid_attr_find_mask { GID_ATTR_FIND_MASK_GID_TYPE = 1UL << 3, }; -enum gid_table_entry_props { - GID_TABLE_ENTRY_INVALID = 1UL << 0, +enum gid_table_entry_state { + GID_TABLE_ENTRY_INVALID = 1, + GID_TABLE_ENTRY_VALID = 2, + /* + * Indicates that entry is pending to be removed, there may + * be active users of this GID entry. + * When last user of the GID entry releases reference to it, + * GID entry is detached from the table. + */ + GID_TABLE_ENTRY_PENDING_DEL = 3, }; struct ib_gid_table_entry { - unsigned long props; - union ib_gid gid; - struct ib_gid_attr attr; - void *context; + struct kref kref; + struct work_struct del_work; + struct ib_gid_attr attr; + void *context; + enum gid_table_entry_state state; }; struct ib_gid_table { @@ -90,15 +99,16 @@ struct ib_gid_table { * **/ /* Any writer to data_vec must hold this lock and the write side of - * rwlock. readers must hold only rwlock. All writers must be in a + * rwlock. Readers must hold only rwlock. All writers must be in a * sleepable context. */ struct mutex lock; - /* rwlock protects data_vec[ix]->props. */ + /* rwlock protects data_vec[ix]->state and entry pointer. + */ rwlock_t rwlock; + struct ib_gid_table_entry **data_vec; /* bit field, each bit indicates the index of default GID */ u32 default_gid_indices; - struct ib_gid_table_entry *data_vec; }; static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port) @@ -178,26 +188,113 @@ static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u8 port) return device->cache.ports[port - rdma_start_port(device)].gid; } -static void del_roce_gid(struct ib_device *device, u8 port_num, - struct ib_gid_table *table, int ix) +static bool is_gid_entry_free(const struct ib_gid_table_entry *entry) +{ + return !entry; +} + +static bool is_gid_entry_valid(const struct ib_gid_table_entry *entry) +{ + return entry && entry->state == GID_TABLE_ENTRY_VALID; +} + +static void schedule_free_gid(struct kref *kref) +{ + struct ib_gid_table_entry *entry = + container_of(kref, struct ib_gid_table_entry, kref); + + queue_work(ib_wq, &entry->del_work); +} + +static void free_gid_entry(struct ib_gid_table_entry *entry) { + struct ib_device *device = entry->attr.device; + u8 port_num = entry->attr.port_num; + struct ib_gid_table *table = rdma_gid_table(device, port_num); + pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, - device->name, port_num, ix, - table->data_vec[ix].gid.raw); + device->name, port_num, entry->attr.index, + entry->attr.gid.raw); + + mutex_lock(&table->lock); + if (rdma_cap_roce_gid_table(device, port_num) && + entry->state != GID_TABLE_ENTRY_INVALID) + device->del_gid(&entry->attr, &entry->context); + write_lock_irq(&table->rwlock); - if (rdma_cap_roce_gid_table(device, port_num)) - device->del_gid(&table->data_vec[ix].attr, - &table->data_vec[ix].context); - dev_put(table->data_vec[ix].attr.ndev); + /* + * The only way to avoid overwriting NULL in table is + * by comparing if it is same entry in table or not! + * If new entry in table is added by the time we free here, + * don't overwrite the table entry. + */ + if (entry == table->data_vec[entry->attr.index]) + table->data_vec[entry->attr.index] = NULL; + /* Now this index is ready to be allocated */ + write_unlock_irq(&table->rwlock); + mutex_unlock(&table->lock); + + if (entry->attr.ndev) + dev_put(entry->attr.ndev); + kfree(entry); } -static int add_roce_gid(struct ib_gid_table *table, - const union ib_gid *gid, - const struct ib_gid_attr *attr) +/** + * free_gid_work - Release reference to the GID entry + * @work: Work structure to refer to GID entry which needs to be + * deleted. + * + * free_gid_work() frees the entry from the HCA's hardware table + * if provider supports it. It releases reference to netdevice. + */ +static void free_gid_work(struct work_struct *work) +{ + struct ib_gid_table_entry *entry = + container_of(work, struct ib_gid_table_entry, del_work); + free_gid_entry(entry); +} + +static struct ib_gid_table_entry * +alloc_gid_entry(const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry; - int ix = attr->index; - int ret = 0; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return NULL; + kref_init(&entry->kref); + memcpy(&entry->attr, attr, sizeof(*attr)); + if (entry->attr.ndev) + dev_hold(entry->attr.ndev); + INIT_WORK(&entry->del_work, free_gid_work); + entry->state = GID_TABLE_ENTRY_INVALID; + return entry; +} + +static void store_gid_entry(struct ib_gid_table *table, + struct ib_gid_table_entry *entry) +{ + entry->state = GID_TABLE_ENTRY_VALID; + + pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, + entry->attr.device->name, entry->attr.port_num, + entry->attr.index, entry->attr.gid.raw); + + lockdep_assert_held(&table->lock); + write_lock_irq(&table->rwlock); + table->data_vec[entry->attr.index] = entry; + write_unlock_irq(&table->rwlock); +} + +static void put_gid_entry(struct ib_gid_table_entry *entry) +{ + kref_put(&entry->kref, schedule_free_gid); +} + +static int add_roce_gid(struct ib_gid_table_entry *entry) +{ + const struct ib_gid_attr *attr = &entry->attr; + int ret; if (!attr->ndev) { pr_err("%s NULL netdev device=%s port=%d index=%d\n", @@ -205,38 +302,22 @@ static int add_roce_gid(struct ib_gid_table *table, attr->index); return -EINVAL; } - - entry = &table->data_vec[ix]; - if ((entry->props & GID_TABLE_ENTRY_INVALID) == 0) { - WARN(1, "GID table corruption device=%s port=%d index=%d\n", - attr->device->name, attr->port_num, - attr->index); - return -EINVAL; - } - if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) { - ret = attr->device->add_gid(gid, attr, &entry->context); + ret = attr->device->add_gid(&attr->gid, attr, &entry->context); if (ret) { pr_err("%s GID add failed device=%s port=%d index=%d\n", __func__, attr->device->name, attr->port_num, attr->index); - goto add_err; + return ret; } } - dev_hold(attr->ndev); - -add_err: - if (!ret) - pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, - attr->device->name, attr->port_num, ix, gid->raw); - return ret; + return 0; } /** * add_modify_gid - Add or modify GID table entry * * @table: GID table in which GID to be added or modified - * @gid: GID content * @attr: Attributes of the GID * * Returns 0 on success or appropriate error code. It accepts zero @@ -244,34 +325,42 @@ static int add_roce_gid(struct ib_gid_table *table, * GID. However such zero GIDs are not added to the cache. */ static int add_modify_gid(struct ib_gid_table *table, - const union ib_gid *gid, const struct ib_gid_attr *attr) { - int ret; + struct ib_gid_table_entry *entry; + int ret = 0; + + /* + * Invalidate any old entry in the table to make it safe to write to + * this index. + */ + if (is_gid_entry_valid(table->data_vec[attr->index])) + put_gid_entry(table->data_vec[attr->index]); + + /* + * Some HCA's report multiple GID entries with only one valid GID, and + * leave other unused entries as the zero GID. Convert zero GIDs to + * empty table entries instead of storing them. + */ + if (rdma_is_zero_gid(&attr->gid)) + return 0; + + entry = alloc_gid_entry(attr); + if (!entry) + return -ENOMEM; if (rdma_protocol_roce(attr->device, attr->port_num)) { - ret = add_roce_gid(table, gid, attr); + ret = add_roce_gid(entry); if (ret) - return ret; - } else { - /* - * Some HCA's report multiple GID entries with only one - * valid GID, but remaining as zero GID. - * So ignore such behavior for IB link layer and don't - * fail the call, but don't add such entry to GID cache. - */ - if (rdma_is_zero_gid(gid)) - return 0; + goto done; } - lockdep_assert_held(&table->lock); - memcpy(&table->data_vec[attr->index].gid, gid, sizeof(*gid)); - memcpy(&table->data_vec[attr->index].attr, attr, sizeof(*attr)); - - write_lock_irq(&table->rwlock); - table->data_vec[attr->index].props &= ~GID_TABLE_ENTRY_INVALID; - write_unlock_irq(&table->rwlock); + store_gid_entry(table, entry); return 0; + +done: + put_gid_entry(entry); + return ret; } /** @@ -286,16 +375,25 @@ static int add_modify_gid(struct ib_gid_table *table, static void del_gid(struct ib_device *ib_dev, u8 port, struct ib_gid_table *table, int ix) { + struct ib_gid_table_entry *entry; + lockdep_assert_held(&table->lock); + + pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, + ib_dev->name, port, ix, + table->data_vec[ix]->attr.gid.raw); + write_lock_irq(&table->rwlock); - table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID; + entry = table->data_vec[ix]; + entry->state = GID_TABLE_ENTRY_PENDING_DEL; + /* + * For non RoCE protocol, GID entry slot is ready to use. + */ + if (!rdma_protocol_roce(ib_dev, port)) + table->data_vec[ix] = NULL; write_unlock_irq(&table->rwlock); - if (rdma_protocol_roce(ib_dev, port)) - del_roce_gid(ib_dev, port, table, ix); - memset(&table->data_vec[ix].gid, 0, sizeof(table->data_vec[ix].gid)); - memset(&table->data_vec[ix].attr, 0, sizeof(table->data_vec[ix].attr)); - table->data_vec[ix].context = NULL; + put_gid_entry(entry); } /* rwlock should be read locked, or lock should be held */ @@ -308,8 +406,8 @@ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, int empty = pempty ? -1 : 0; while (i < table->sz && (found < 0 || empty < 0)) { - struct ib_gid_table_entry *data = &table->data_vec[i]; - struct ib_gid_attr *attr = &data->attr; + struct ib_gid_table_entry *data = table->data_vec[i]; + struct ib_gid_attr *attr; int curr_index = i; i++; @@ -320,9 +418,9 @@ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, * so lookup free slot only if requested. */ if (pempty && empty < 0) { - if (data->props & GID_TABLE_ENTRY_INVALID && - (default_gid == - is_gid_index_default(table, curr_index))) { + if (is_gid_entry_free(data) && + default_gid == + is_gid_index_default(table, curr_index)) { /* * Found an invalid (free) entry; allocate it. * If default GID is requested, then our @@ -337,22 +435,23 @@ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, /* * Additionally find_gid() is used to find valid entry during - * lookup operation, where validity needs to be checked. So - * find the empty entry first to continue to search for a free - * slot and ignore its INVALID flag. + * lookup operation; so ignore the entries which are marked as + * pending for removal and the entries which are marked as + * invalid. */ - if (data->props & GID_TABLE_ENTRY_INVALID) + if (!is_gid_entry_valid(data)) continue; if (found >= 0) continue; + attr = &data->attr; if (mask & GID_ATTR_FIND_MASK_GID_TYPE && attr->gid_type != val->gid_type) continue; if (mask & GID_ATTR_FIND_MASK_GID && - memcmp(gid, &data->gid, sizeof(*gid))) + memcmp(gid, &data->attr.gid, sizeof(*gid))) continue; if (mask & GID_ATTR_FIND_MASK_NETDEV && @@ -409,7 +508,8 @@ static int __ib_cache_gid_add(struct ib_device *ib_dev, u8 port, attr->device = ib_dev; attr->index = empty; attr->port_num = port; - ret = add_modify_gid(table, gid, attr); + attr->gid = *gid; + ret = add_modify_gid(table, attr); if (!ret) dispatch_gid_change_event(ib_dev, port); @@ -505,7 +605,8 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, mutex_lock(&table->lock); for (ix = 0; ix < table->sz; ix++) { - if (table->data_vec[ix].attr.ndev == ndev) { + if (is_gid_entry_valid(table->data_vec[ix]) && + table->data_vec[ix]->attr.ndev == ndev) { del_gid(ib_dev, port, table, ix); deleted = true; } @@ -529,12 +630,13 @@ static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index, if (index < 0 || index >= table->sz) return -EINVAL; - if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) + if (!is_gid_entry_valid(table->data_vec[index])) return -EINVAL; - memcpy(gid, &table->data_vec[index].gid, sizeof(*gid)); + memcpy(gid, &table->data_vec[index]->attr.gid, sizeof(*gid)); if (attr) { - memcpy(attr, &table->data_vec[index].attr, sizeof(*attr)); + memcpy(attr, &table->data_vec[index]->attr, + sizeof(*attr)); if (attr->ndev) dev_hold(attr->ndev); } @@ -681,13 +783,14 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, for (i = 0; i < table->sz; i++) { struct ib_gid_attr attr; - if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID) + if (!is_gid_entry_valid(table->data_vec[i])) continue; - if (memcmp(gid, &table->data_vec[i].gid, sizeof(*gid))) + if (memcmp(gid, &table->data_vec[i]->attr.gid, + sizeof(*gid))) continue; - memcpy(&attr, &table->data_vec[i].attr, sizeof(attr)); + memcpy(&attr, &table->data_vec[i]->attr, sizeof(attr)); if (filter(gid, &attr, context)) { found = true; @@ -705,9 +808,7 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, static struct ib_gid_table *alloc_gid_table(int sz) { - struct ib_gid_table *table = - kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL); - int i; + struct ib_gid_table *table = kzalloc(sizeof(*table), GFP_KERNEL); if (!table) return NULL; @@ -720,12 +821,6 @@ static struct ib_gid_table *alloc_gid_table(int sz) table->sz = sz; rwlock_init(&table->rwlock); - - /* Mark all entries as invalid so that allocator can allocate - * one of the invalid (free) entry. - */ - for (i = 0; i < sz; i++) - table->data_vec[i].props |= GID_TABLE_ENTRY_INVALID; return table; err_free_table: @@ -733,12 +828,26 @@ static struct ib_gid_table *alloc_gid_table(int sz) return NULL; } -static void release_gid_table(struct ib_gid_table *table) +static void release_gid_table(struct ib_device *device, u8 port, + struct ib_gid_table *table) { - if (table) { - kfree(table->data_vec); - kfree(table); + int i; + + if (!table) + return; + + for (i = 0; i < table->sz; i++) { + if (is_gid_entry_free(table->data_vec[i])) + continue; + if (kref_read(&table->data_vec[i]->kref) > 1) + pr_warn("GID entry ref leak for %s (index %d) ref=%d\n", + device->name, i, + kref_read(&table->data_vec[i]->kref)); + put_gid_entry(table->data_vec[i]); } + + kfree(table->data_vec); + kfree(table); } static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, @@ -752,7 +861,7 @@ static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, mutex_lock(&table->lock); for (i = 0; i < table->sz; ++i) { - if (!rdma_is_zero_gid(&table->data_vec[i].gid)) { + if (is_gid_entry_valid(table->data_vec[i])) { del_gid(ib_dev, port, table, i); deleted = true; } @@ -821,7 +930,7 @@ static void gid_table_release_one(struct ib_device *ib_dev) for (port = 0; port < ib_dev->phys_port_cnt; port++) { table = ib_dev->cache.ports[port].gid; - release_gid_table(table); + release_gid_table(ib_dev, port, table); ib_dev->cache.ports[port].gid = NULL; } } @@ -1100,7 +1209,6 @@ static int config_non_roce_gid_cache(struct ib_device *device, { struct ib_gid_attr gid_attr = {}; struct ib_gid_table *table; - union ib_gid gid; int ret = 0; int i; @@ -1112,14 +1220,14 @@ static int config_non_roce_gid_cache(struct ib_device *device, for (i = 0; i < gid_tbl_len; ++i) { if (!device->query_gid) continue; - ret = device->query_gid(device, port, i, &gid); + ret = device->query_gid(device, port, i, &gid_attr.gid); if (ret) { pr_warn("query_gid failed (%d) for %s (index %d)\n", ret, device->name, i); goto err; } gid_attr.index = i; - add_modify_gid(table, &gid, &gid_attr); + add_modify_gid(table, &gid_attr); } err: mutex_unlock(&table->lock); @@ -1300,4 +1408,9 @@ void ib_cache_cleanup_one(struct ib_device *device) ib_unregister_event_handler(&device->cache.event_handler); flush_workqueue(ib_wq); gid_table_cleanup_one(device); + + /* + * Flush the wq second time for any pending GID delete work. + */ + flush_workqueue(ib_wq); } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index ad30f87115a1..937f16a2f414 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -94,6 +94,7 @@ enum ib_gid_type { struct ib_gid_attr { struct net_device *ndev; struct ib_device *device; + union ib_gid gid; enum ib_gid_type gid_type; u16 index; u8 port_num; -- 2.14.3 -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html