> -----Original Message----- > From: Jason Gunthorpe [mailto:jgg@xxxxxxxx] > Sent: Wednesday, May 16, 2018 10:31 PM > To: Leon Romanovsky <leon@xxxxxxxxxx> > Cc: Doug Ledford <dledford@xxxxxxxxxx>; Leon Romanovsky > <leonro@xxxxxxxxxxxx>; RDMA mailing list <linux-rdma@xxxxxxxxxxxxxxx>; > Parav Pandit <parav@xxxxxxxxxxxx> > Subject: Re: [PATCH rdma-next 05/12] IB/core: Introduce GID entry reference > counts for RoCE > > On Mon, May 14, 2018 at 11:11:11AM +0300, Leon Romanovsky wrote: > > From: Parav Pandit <parav@xxxxxxxxxxxx> > > > > Currently a GID entry can be deleted from the GID table while it is in > > use by ULPs, CM modules or user space applications. > > [..] > > So, I still don't like how tricky this is, particularly as we go down the series. > > I prefer a version that kref's actual memory and tracks it that way. > > Here is a compile tested version of this patch revised as I would think.. It > probably needs some splitting because it is waay to big, but overall I think it is > much more understandable and auditable this way. > > I particularly like how the locking becomes simpler. > > But what do you think? > > From 03b9e61c0e9af031cf1ad73cb45f0f46c06f3be6 Mon Sep 17 00:00:00 2001 > From: Jason Gunthorpe <jgg@xxxxxxxxxxxx> > Date: Mon, 14 May 2018 11:11:11 +0300 > Subject: [RFC PATCH] IB/core: Make the gid table entries immutable > > This is the first step to expose the gid_attr struct stored in the gid table to the > users without having to copy it. > > The gid table entries are re-organized so they are written once before being > added to the table, and then never changed during the lifetime of the entry. > > When and entry is deleted/added (roce) or replaced (IB) new memory is > allocated to replace the old entry and the old entry lives on via a kref until all > users finally drop it. > > For roce this means we can block re-use of gid indexes by dropping an ERR_PTR > tombstone into the table while the entry is still in use, making the index blocked > but the content in-accessible. > > Signed-off-by: Parav Pandit <parav@xxxxxxxxxxxx> > Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx> > Signed-off-by: Jason Gunthorpe <jgg@xxxxxxxxxxxx> > --- > drivers/infiniband/core/cache.c | 626 +++++++++++++++++--------------- > 1 file changed, 340 insertions(+), 286 deletions(-) > > diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c > index 47fa72b434be00..bc09382b57d280 100644 > --- a/drivers/infiniband/core/cache.c > +++ b/drivers/infiniband/core/cache.c > @@ -66,45 +66,149 @@ enum gid_attr_find_mask { > GID_ATTR_FIND_MASK_GID_TYPE = 1UL << 3, > }; > > -enum gid_table_entry_props { > - GID_TABLE_ENTRY_DEFAULT = 1 << 1, > +enum gid_table_entry_flags { > + GID_TABLE_ENTRY_IN_TABLE = 1 << 0, > + GID_TABLE_ENTRY_ADD_GID = 1 << 1, > + GID_TABLE_ENTRY_DEFAULT = 1 << 2, > }; > > -enum gid_table_entry_state { > - GID_TABLE_ENTRY_FREE = 1, > - GID_TABLE_ENTRY_ALLOCATED = 2, > -}; > +struct ib_gid_entry { > + struct kref kref; > + struct work_struct del_work; > + union ib_gid gid; > + unsigned int flags; > + struct ib_gid_attr attr; > > -struct ib_gid_table_entry { > - enum gid_table_entry_props props; > - enum gid_table_entry_state state; > - union ib_gid gid; > - struct ib_gid_attr attr; > - void *context; > + /* driver specific data */ > + void *context; > }; > > struct ib_gid_table { > - int sz; > - /* In RoCE, adding a GID to the table requires: > - * (a) Find if this GID is already exists. > - * (b) Find a free space. > - * (c) Write the new GID > - * > - * Delete requires different set of operations: > - * (a) Find the GID > - * (b) Delete it. > - * > - **/ > + unsigned int sz; > + /* The first default_entries in data_vec are reserved for default entries > */ > + unsigned int default_entries; > + > /* Any writer to data_vec must hold this lock and the write side of > * rwlock. Readers must hold only rwlock. All writers must be in a > * sleepable context. > */ > struct mutex lock; > - /* rwlock protects data_vec[ix]->props and data)vec[ix->state. */ > + /* rwlock protects data_vec */ > rwlock_t rwlock; > - struct ib_gid_table_entry *data_vec; > + const struct ib_gid_entry *data_vec[]; > }; > > +static struct ib_gid_table *get_table(struct ib_device *device, u8 > +port) > + > +{ > + return device->cache.ports[port - rdma_start_port(device)].gid; } > + > +static struct ib_gid_table *get_entry_table(struct ib_gid_entry *entry) > +{ > + return get_table(entry->attr.device, entry->attr.port_num); } > + > +static void store_entry(struct ib_gid_table *table, > + struct ib_gid_entry *entry) > +{ > + int ix = entry->attr.index; > + > + lockdep_assert_held(&table->lock); > + write_lock_irq(&table->rwlock); > + > + WARN_ON(table->data_vec[ix]); > + > + entry->flags |= GID_TABLE_ENTRY_IN_TABLE; > + table->data_vec[entry->attr.index] = entry; > + > + write_unlock_irq(&table->rwlock); > +} > + > +/** > + * free_roce_gid_work - Release reference to the GID entry > + * @work: Work structure to refer to GID entry which needs to be > + * deleted. > + * > + * free_roce_gid_work() frees the entry from the HCA's hardware table > + * if the provider supports it. > + */ > +static void free_roce_gid_work(struct work_struct *work) { > + struct ib_gid_entry *entry = > + container_of(work, struct ib_gid_entry, del_work); > + struct ib_gid_table *table = get_entry_table(entry); > + struct ib_device *device = entry->attr.device; > + > + pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, > + device->name, entry->attr.port_num, entry->attr.index, > + entry->gid.raw); > + > + mutex_lock(&table->lock); > + if (entry->flags & GID_TABLE_ENTRY_ADD_GID) > + device->del_gid(&entry->attr, &entry->context); > + > + if (entry->flags & GID_TABLE_ENTRY_IN_TABLE) { > + write_lock_irq(&table->rwlock); > + WARN_ON(!IS_ERR(table->data_vec[entry->attr.index])); > + table->data_vec[entry->attr.index] = NULL; > + write_unlock_irq(&table->rwlock); > + } > + > + mutex_unlock(&table->lock); > + > + dev_put(entry->attr.ndev); > + > + kfree(entry); > +} > + > +/* Must be undone with put_entry */ > +static struct ib_gid_entry *alloc_entry(struct ib_device *ib_dev, u8 > +port) { > + struct ib_gid_entry *entry; > + > + entry = kzalloc(sizeof(*entry), GFP_KERNEL); > + if (!entry) > + return NULL; > + > + kref_init(&entry->kref); > + INIT_WORK(&entry->del_work, free_roce_gid_work); > + > + entry->attr.device = ib_dev; > + entry->attr.port_num = port; > + > + return entry; > +} > + > +static struct ib_gid_entry *alloc_entry_roce(struct ib_device *ib_dev, u8 port, > + struct net_device *ndev, > + enum ib_gid_type gid_type) > +{ > + struct ib_gid_entry *entry = > + alloc_entry(ib_dev, port); > + > + if (!entry) > + return NULL; > + > + entry->attr.gid_type = gid_type; > + entry->attr.ndev = ndev; > + dev_hold(entry->attr.ndev); > + return entry; > +} > + > +static void free_entry(struct kref *kref) { > + struct ib_gid_entry *entry = > + container_of(kref, struct ib_gid_entry, kref); > + > + queue_work(ib_wq, &entry->del_work); > +} > + > +static void put_entry(const struct ib_gid_entry *entry) { > + kref_put((struct kref *)&entry->kref, free_entry); } > + > static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port) { > struct ib_event event; > @@ -154,37 +258,10 @@ int ib_cache_gid_parse_type_str(const char *buf) } > EXPORT_SYMBOL(ib_cache_gid_parse_type_str); > > -static inline bool > -is_gid_table_entry_free(const struct ib_gid_table_entry *entry) > +static int setup_roce_gid_entry(struct ib_gid_table *table, > + struct ib_gid_entry *entry) > { > - return entry->state == GID_TABLE_ENTRY_FREE; > -} > - > -static inline bool > -is_gid_table_entry_valid(const struct ib_gid_table_entry *entry) -{ > - return entry->state == GID_TABLE_ENTRY_ALLOCATED; > -} > - > -static void del_roce_gid(struct ib_device *device, u8 port_num, > - struct ib_gid_table *table, int ix) > -{ > - pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, > - device->name, port_num, ix, > - table->data_vec[ix].gid.raw); > - > - if (rdma_cap_roce_gid_table(device, port_num)) > - device->del_gid(&table->data_vec[ix].attr, > - &table->data_vec[ix].context); > - dev_put(table->data_vec[ix].attr.ndev); > -} > - > -static int add_roce_gid(struct ib_gid_table *table, > - const union ib_gid *gid, > - const struct ib_gid_attr *attr) > -{ > - struct ib_gid_table_entry *entry; > - int ix = attr->index; > + const struct ib_gid_attr *attr = &entry->attr; > int ret = 0; > > if (!attr->ndev) { > @@ -194,30 +271,23 @@ static int add_roce_gid(struct ib_gid_table *table, > return -EINVAL; > } > > - entry = &table->data_vec[ix]; > - if (!is_gid_table_entry_free(entry)) { > - WARN(1, "GID table corruption device=%s port=%d > index=%d\n", > - attr->device->name, attr->port_num, > - attr->index); > - return -EINVAL; > - } > > if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) { > - ret = attr->device->add_gid(gid, attr, &entry->context); > + ret = attr->device->add_gid(&entry->gid, &entry->attr, > + &entry->context); > if (ret) { > pr_err("%s GID add failed device=%s port=%d > index=%d\n", > __func__, attr->device->name, attr->port_num, > attr->index); > - goto add_err; > + return ret; > } > + entry->flags |= GID_TABLE_ENTRY_ADD_GID; > } > - dev_hold(attr->ndev); > > -add_err: > - if (!ret) > - pr_debug("%s device=%s port=%d index=%d gid %pI6\n", > __func__, > - attr->device->name, attr->port_num, ix, gid->raw); > - return ret; > + pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, > + attr->device->name, attr->port_num, entry->attr.index, > + entry->gid.raw); > + return 0; > } > > /** > @@ -232,15 +302,15 @@ static int add_roce_gid(struct ib_gid_table *table, > * GID. However such zero GIDs are not added to the cache. > */ > static int add_modify_gid(struct ib_gid_table *table, > - const union ib_gid *gid, > - const struct ib_gid_attr *attr) > + struct ib_gid_entry *entry) > { > int ret; > > - if (rdma_protocol_roce(attr->device, attr->port_num)) { > - ret = add_roce_gid(table, gid, attr); > - if (ret) > + if (rdma_protocol_roce(entry->attr.device, entry->attr.port_num)) { > + ret = setup_roce_gid_entry(table, entry); > + if (ret) { > return ret; > + } > } else { > /* > * Some HCA's report multiple GID entries with only one @@ - > 248,99 +318,95 @@ static int add_modify_gid(struct ib_gid_table *table, > * So ignore such behavior for IB link layer and don't > * fail the call, but don't add such entry to GID cache. > */ > - if (!memcmp(gid, &zgid, sizeof(*gid))) > + if (!memcmp(&entry->gid, &zgid, sizeof(entry->gid))) { > + put_entry(entry); > return 0; > + } > } > > - lockdep_assert_held(&table->lock); > - memcpy(&table->data_vec[attr->index].gid, gid, sizeof(*gid)); > - memcpy(&table->data_vec[attr->index].attr, attr, sizeof(*attr)); > + store_entry(table, entry); > > - write_lock_irq(&table->rwlock); > - table->data_vec[attr->index].state = GID_TABLE_ENTRY_ALLOCATED; > - write_unlock_irq(&table->rwlock); > return 0; > } > > -/** > - * del_gid - Delete GID table entry > - * > - * @ib_dev: IB device whose GID entry to be deleted > - * @port: Port number of the IB device > - * @table: GID table of the IB device for a port > - * @ix: GID entry index to delete > - * > - */ > -static void del_gid(struct ib_device *ib_dev, u8 port, > - struct ib_gid_table *table, int ix) > +static void del_gid(struct ib_gid_table *table, > + const struct ib_gid_entry *entry) > { > + unsigned int ix = entry->attr.index; > + > lockdep_assert_held(&table->lock); > + > write_lock_irq(&table->rwlock); > - table->data_vec[ix].state = GID_TABLE_ENTRY_FREE; > + > + WARN_ON(entry != table->data_vec[ix]); > + > + /* > + * The data_vec holds a marker for roce preventing re-use of the HW > + * slow until the entry kref goes to zero. Other protocols can re-use > + * immediately. > + */ > + if (rdma_protocol_roce(entry->attr.device, entry->attr.port_num)) > + table->data_vec[ix] = ERR_PTR(-EINVAL); > + else { > + table->data_vec[ix] = NULL; > + ((struct ib_gid_entry *)entry)->flags &= > + ~GID_TABLE_ENTRY_IN_TABLE; > + } > + > write_unlock_irq(&table->rwlock); > > - if (rdma_protocol_roce(ib_dev, port)) > - del_roce_gid(ib_dev, port, table, ix); > - memcpy(&table->data_vec[ix].gid, &zgid, sizeof(zgid)); > - memset(&table->data_vec[ix].attr, 0, sizeof(table->data_vec[ix].attr)); > - table->data_vec[ix].context = NULL; > + put_entry(entry); > } > > /* rwlock should be read locked, or lock should be held */ -static int > find_gid(struct ib_gid_table *table, const union ib_gid *gid, > - const struct ib_gid_attr *val, bool default_gid, > - unsigned long mask, int *pempty) > +static const struct ib_gid_entry *find_gid(struct ib_gid_table *table, > + const union ib_gid *gid, > + const struct ib_gid_attr *val, > + bool default_gid, unsigned long > mask, > + int *pempty) > { > int i = 0; > - int found = -1; > - int empty = pempty ? -1 : 0; > + int empty = -1; > > - while (i < table->sz && (found < 0 || empty < 0)) { > - struct ib_gid_table_entry *data = &table->data_vec[i]; > - struct ib_gid_attr *attr = &data->attr; > + for (i = 0; i != table->sz; i++) { > + const struct ib_gid_entry *entry = table->data_vec[i]; > + const struct ib_gid_attr *attr; > int curr_index = i; > > - i++; > + /* > + * Entry is still being freed, it cannot be returned from > + * find_gid > + */ > + if (IS_ERR(entry)) > + continue; > > /* find_gid() is used during GID addition where it is expected > - * to return a free entry slot which is not duplicate. > - * Free entry slot is requested and returned if pempty is set, > - * so lookup free slot only if requested. > + * to return a free entry slot if there is not duplicate > + * entry. > */ > - if (pempty && empty < 0) { > - if (is_gid_table_entry_free(data) && > - (default_gid == > - !!(data->props & GID_TABLE_ENTRY_DEFAULT))) { > - /* > - * Found an invalid (free) entry; allocate it. > - * If default GID is requested, then our > - * found slot must be one of the DEFAULT > - * reserved slots or we fail. > - * This ensures that only DEFAULT reserved > - * slots are used for default property GIDs. > - */ > + if (entry == NULL) { > + /* > + * Found an free entry; allocate it. If a default GID > + * is requested, then our found slot must be one of > + * the DEFAULT reserved slots or we fail. This > + * ensures that only DEFAULT reserved slots are used > + * for default property GIDs. > + */ > + if (empty < 0 && > + (curr_index < table->default_entries) == > + default_gid) > empty = curr_index; > - } > - } > - > - /* > - * Additionally find_gid() is used to find valid entry during > - * lookup operation, where validity needs to be checked. So > - * find the empty entry first to continue to search for a free > - * slot and ignore its INVALID flag. > - */ > - if (!is_gid_table_entry_valid(data)) > continue; > + } > > - if (found >= 0) > - continue; > + attr = &entry->attr; > > if (mask & GID_ATTR_FIND_MASK_GID_TYPE && > attr->gid_type != val->gid_type) > continue; > > if (mask & GID_ATTR_FIND_MASK_GID && > - memcmp(gid, &data->gid, sizeof(*gid))) > + memcmp(gid, &entry->gid, sizeof(*gid))) > continue; > > if (mask & GID_ATTR_FIND_MASK_NETDEV && @@ -348,71 > +414,75 @@ static int find_gid(struct ib_gid_table *table, const union ib_gid > *gid, > continue; > > if (mask & GID_ATTR_FIND_MASK_DEFAULT && > - !!(data->props & GID_TABLE_ENTRY_DEFAULT) != > - default_gid) > + (curr_index < table->default_entries) != default_gid) > continue; > > - found = curr_index; > + /* > + * if we find a match then empty is not valid as we didn't > + * scan the whole list. > + */ > + return entry; > } > > if (pempty) > *pempty = empty; > > - return found; > + return NULL; > } > > -static void make_default_gid(struct net_device *dev, union ib_gid *gid) > +static void make_default_gid(struct net_device *dev, union ib_gid *gid) > { > gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); > addrconf_ifid_eui48(&gid->raw[8], dev); } > > -static int __ib_cache_gid_add(struct ib_device *ib_dev, u8 port, > - union ib_gid *gid, struct ib_gid_attr *attr, > - unsigned long mask, bool default_gid) > +static int __ib_cache_gid_add(struct ib_gid_entry *entry, unsigned long > +mask) > { > - struct ib_gid_table *table; > + struct ib_gid_table *table = get_entry_table(entry); > int ret = 0; > int empty; > - int ix; > > /* Do not allow adding zero GID in support of > * IB spec version 1.3 section 4.1.1 point (6) and > * section 12.7.10 and section 12.7.20 > */ > - if (!memcmp(gid, &zgid, sizeof(*gid))) > + if (!memcmp(&entry->gid, &zgid, sizeof(zgid))) > return -EINVAL; > > - table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; > - > mutex_lock(&table->lock); > > - ix = find_gid(table, gid, attr, default_gid, mask, &empty); > - if (ix >= 0) > + if (find_gid(table, &entry->gid, &entry->attr, > + entry->flags & GID_TABLE_ENTRY_DEFAULT, mask, &empty)) > goto out_unlock; > > if (empty < 0) { > ret = -ENOSPC; > goto out_unlock; > } > - attr->device = ib_dev; > - attr->index = empty; > - attr->port_num = port; > - ret = add_modify_gid(table, gid, attr); > - if (!ret) > - dispatch_gid_change_event(ib_dev, port); > > -out_unlock: > + entry->attr.index = empty; > + ret = add_modify_gid(table, entry); > mutex_unlock(&table->lock); > + > if (ret) > - pr_warn("%s: unable to add gid %pI6 error=%d\n", > - __func__, gid->raw, ret); > + goto out; > + > + dispatch_gid_change_event(entry->attr.device, > + entry->attr.port_num); > + return 0; > + > +out_unlock: > + mutex_unlock(&table->lock); > +out: > + pr_warn("%s: unable to add gid %pI6 error=%d\n", __func__, > + entry->gid.raw, ret); > return ret; > } > > int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, > union ib_gid *gid, struct ib_gid_attr *attr) { > + struct ib_gid_entry *entry; > struct net_device *idev; > unsigned long mask; > int ret; > @@ -437,7 +507,11 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 > port, > GID_ATTR_FIND_MASK_GID_TYPE | > GID_ATTR_FIND_MASK_NETDEV; > > - ret = __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false); > + entry = alloc_entry_roce(ib_dev, port, attr->ndev, attr->gid_type); > + ret = __ib_cache_gid_add(entry, mask); > + if (ret) > + put_entry(entry); > + > return ret; > } > > @@ -446,28 +520,30 @@ _ib_cache_gid_del(struct ib_device *ib_dev, u8 port, > union ib_gid *gid, struct ib_gid_attr *attr, > unsigned long mask, bool default_gid) { > - struct ib_gid_table *table; > + struct ib_gid_table *table = get_table(ib_dev, port); > + const struct ib_gid_entry *entry; > int ret = 0; > - int ix; > - > - table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; > > mutex_lock(&table->lock); > > - ix = find_gid(table, gid, attr, default_gid, mask, NULL); > - if (ix < 0) { > + entry = find_gid(table, gid, attr, default_gid, mask, NULL); > + if (!entry) { > ret = -EINVAL; > goto out_unlock; > } > > - del_gid(ib_dev, port, table, ix); > + del_gid(table, entry); > + > + mutex_unlock(&table->lock); > + > dispatch_gid_change_event(ib_dev, port); > > + return 0; > + > out_unlock: > mutex_unlock(&table->lock); > - if (ret) > - pr_debug("%s: can't delete gid %pI6 error=%d\n", > - __func__, gid->raw, ret); > + pr_debug("%s: can't delete gid %pI6 error=%d\n", > + __func__, gid->raw, ret); > return ret; > } > > @@ -485,17 +561,18 @@ int ib_cache_gid_del(struct ib_device *ib_dev, u8 > port, int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, > struct net_device *ndev) > { > - struct ib_gid_table *table; > + struct ib_gid_table *table = get_table(ib_dev, port); > int ix; > bool deleted = false; > > - table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; > - > mutex_lock(&table->lock); > > for (ix = 0; ix < table->sz; ix++) { > - if (table->data_vec[ix].attr.ndev == ndev) { > - del_gid(ib_dev, port, table, ix); > + if (IS_ERR_OR_NULL(table->data_vec[ix])) > + continue; > + > + if (table->data_vec[ix]->attr.ndev == ndev) { > + del_gid(table, table->data_vec[ix]); > deleted = true; > } > } > @@ -511,19 +588,19 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device > *ib_dev, u8 port, static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 > port, int index, > union ib_gid *gid, struct ib_gid_attr *attr) { > - struct ib_gid_table *table; > - > - table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; > + struct ib_gid_table *table = get_table(ib_dev, port); > + const struct ib_gid_entry *entry; > > if (index < 0 || index >= table->sz) > return -EINVAL; > > - if (!is_gid_table_entry_valid(&table->data_vec[index])) > + if (IS_ERR_OR_NULL(table->data_vec[index])) > return -EAGAIN; > > - memcpy(gid, &table->data_vec[index].gid, sizeof(*gid)); > + entry = table->data_vec[index]; > + memcpy(gid, &entry->gid, sizeof(*gid)); > if (attr) { > - memcpy(attr, &table->data_vec[index].attr, sizeof(*attr)); > + memcpy(attr, &entry->attr, sizeof(*attr)); > if (attr->ndev) > dev_hold(attr->ndev); > } > @@ -531,33 +608,28 @@ static int __ib_cache_gid_get(struct ib_device > *ib_dev, u8 port, int index, > return 0; > } > > -static int _ib_cache_gid_table_find(struct ib_device *ib_dev, > - const union ib_gid *gid, > - const struct ib_gid_attr *val, > - unsigned long mask, > - u8 *port, u16 *index) > +static const struct ib_gid_entry *_ib_cache_gid_table_find( > + struct ib_device *ib_dev, const union ib_gid *gid, > + const struct ib_gid_attr *val, unsigned long mask) > { > + const struct ib_gid_entry *entry = NULL; > struct ib_gid_table *table; > u8 p; > - int local_index; > unsigned long flags; > > for (p = 0; p < ib_dev->phys_port_cnt; p++) { > table = ib_dev->cache.ports[p].gid; > + > read_lock_irqsave(&table->rwlock, flags); > - local_index = find_gid(table, gid, val, false, mask, NULL); > - if (local_index >= 0) { > - if (index) > - *index = local_index; > - if (port) > - *port = p + rdma_start_port(ib_dev); > - read_unlock_irqrestore(&table->rwlock, flags); > - return 0; > - } > + entry = find_gid(table, gid, val, false, mask, NULL); > + kref_get((struct kref *)&entry->kref); > read_unlock_irqrestore(&table->rwlock, flags); > + > + if (entry) > + break; > } > > - return -ENOENT; > + return entry; > } > > static int ib_cache_gid_find(struct ib_device *ib_dev, @@ -566,6 +638,8 @@ > static int ib_cache_gid_find(struct ib_device *ib_dev, > struct net_device *ndev, u8 *port, > u16 *index) > { > + const struct ib_gid_entry *entry; > + > unsigned long mask = GID_ATTR_FIND_MASK_GID | > GID_ATTR_FIND_MASK_GID_TYPE; > struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type}; > @@ -573,8 +647,16 @@ static int ib_cache_gid_find(struct ib_device *ib_dev, > if (ndev) > mask |= GID_ATTR_FIND_MASK_NETDEV; > > - return _ib_cache_gid_table_find(ib_dev, gid, &gid_attr_val, > - mask, port, index); > + entry = _ib_cache_gid_table_find(ib_dev, gid, &gid_attr_val, mask); > + if (!entry) > + return -ENOENT; > + > + *port = entry->attr.port_num; > + *index = entry->attr.index; > + > + put_entry(entry); > + > + return 0; > } > > /** > @@ -596,7 +678,7 @@ int ib_find_cached_gid_by_port(struct ib_device > *ib_dev, > u8 port, struct net_device *ndev, > u16 *index) > { > - int local_index; > + const struct ib_gid_entry *entry; > struct ib_gid_table *table; > unsigned long mask = GID_ATTR_FIND_MASK_GID | > GID_ATTR_FIND_MASK_GID_TYPE; > @@ -606,16 +688,16 @@ int ib_find_cached_gid_by_port(struct ib_device > *ib_dev, > if (!rdma_is_port_valid(ib_dev, port)) > return -ENOENT; > > - table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; > + table = get_table(ib_dev, port); > > if (ndev) > mask |= GID_ATTR_FIND_MASK_NETDEV; > > read_lock_irqsave(&table->rwlock, flags); > - local_index = find_gid(table, gid, &val, false, mask, NULL); > - if (local_index >= 0) { > + entry = find_gid(table, gid, &val, false, mask, NULL); > + if (entry) { > if (index) > - *index = local_index; > + *index = entry->attr.index; > read_unlock_irqrestore(&table->rwlock, flags); > return 0; > } > @@ -659,26 +741,21 @@ static int ib_cache_gid_find_by_filter(struct ib_device > *ib_dev, > unsigned long flags; > bool found = false; > > - > if (!rdma_is_port_valid(ib_dev, port) || > !rdma_protocol_roce(ib_dev, port)) > return -EPROTONOSUPPORT; > > - table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; > + table = get_table(ib_dev, port); > > read_lock_irqsave(&table->rwlock, flags); > for (i = 0; i < table->sz; i++) { > - struct ib_gid_attr attr; > - > - if (!is_gid_table_entry_valid(&table->data_vec[i])) > + if (IS_ERR_OR_NULL(table->data_vec[i])) > continue; > > - if (memcmp(gid, &table->data_vec[i].gid, sizeof(*gid))) > + if (memcmp(gid, &table->data_vec[i]->gid, sizeof(*gid))) > continue; > > - memcpy(&attr, &table->data_vec[i].attr, sizeof(attr)); > - > - if (filter(gid, &attr, context)) { > + if (filter(gid, &table->data_vec[i]->attr, context)) { > found = true; > if (index) > *index = i; > @@ -692,42 +769,28 @@ static int ib_cache_gid_find_by_filter(struct ib_device > *ib_dev, > return 0; > } > > -static struct ib_gid_table *alloc_gid_table(int sz) > +static struct ib_gid_table *alloc_gid_table(unsigned int sz) > { > - struct ib_gid_table *table = > - kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL); > - int i; > + struct ib_gid_table *table; > > + table = kzalloc(sizeof(struct ib_gid_table) + > + sizeof(table->data_vec[0]) * sz, > + GFP_KERNEL); > if (!table) > return NULL; > > - table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL); > - if (!table->data_vec) > - goto err_free_table; > - > mutex_init(&table->lock); > > table->sz = sz; > rwlock_init(&table->rwlock); > > - /* Mark all entries as invalid so that allocator can allocate > - * one of the invalid (free) entry. > - */ > - for (i = 0; i < sz; i++) > - table->data_vec[i].state = GID_TABLE_ENTRY_FREE; > return table; > - > -err_free_table: > - kfree(table); > - return NULL; > } > > static void release_gid_table(struct ib_gid_table *table) { > - if (table) { > - kfree(table->data_vec); > + if (table) > kfree(table); > - } > } > > static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, @@ - > 741,11 +804,11 @@ static void cleanup_gid_table_port(struct ib_device > *ib_dev, u8 port, > > mutex_lock(&table->lock); > for (i = 0; i < table->sz; ++i) { > - if (memcmp(&table->data_vec[i].gid, &zgid, > - sizeof(table->data_vec[i].gid))) { > - del_gid(ib_dev, port, table, i); > - deleted = true; > - } > + if (IS_ERR_OR_NULL(table->data_vec[i])) > + continue; > + > + del_gid(table, table->data_vec[i]); > + deleted = true; > } > mutex_unlock(&table->lock); > > @@ -758,59 +821,38 @@ void ib_cache_gid_set_default_gid(struct ib_device > *ib_dev, u8 port, > unsigned long gid_type_mask, > enum ib_cache_gid_default_mode mode) { > - union ib_gid gid = { }; > - struct ib_gid_attr gid_attr; > - struct ib_gid_table *table; > unsigned int gid_type; > unsigned long mask; > > - table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; > - > mask = GID_ATTR_FIND_MASK_GID_TYPE | > GID_ATTR_FIND_MASK_DEFAULT | > GID_ATTR_FIND_MASK_NETDEV; > - memset(&gid_attr, 0, sizeof(gid_attr)); > - gid_attr.ndev = ndev; > > for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) { > if (1UL << gid_type & ~gid_type_mask) > continue; > > - gid_attr.gid_type = gid_type; > > if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) { > - make_default_gid(ndev, &gid); > - __ib_cache_gid_add(ib_dev, port, &gid, > - &gid_attr, mask, true); > + struct ib_gid_entry *entry = > + alloc_entry_roce(ib_dev, port, ndev, gid_type); > + > + entry->flags |= GID_TABLE_ENTRY_DEFAULT; > + make_default_gid(ndev, &entry->gid); > + if (__ib_cache_gid_add(entry, mask)) > + put_entry(entry); > } else if (mode == IB_CACHE_GID_DEFAULT_MODE_DELETE) { > - _ib_cache_gid_del(ib_dev, port, &gid, > + struct ib_gid_attr gid_attr = { > + .gid_type = gid_type, > + .ndev = ndev, > + }; > + > + _ib_cache_gid_del(ib_dev, port, NULL, > &gid_attr, mask, true); > } > } > } > > -static void gid_table_reserve_default(struct ib_device *ib_dev, u8 port, > - struct ib_gid_table *table) > -{ > - unsigned int i; > - unsigned long roce_gid_type_mask; > - unsigned int num_default_gids; > - unsigned int current_gid = 0; > - > - roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port); > - num_default_gids = hweight_long(roce_gid_type_mask); > - for (i = 0; i < num_default_gids && i < table->sz; i++) { > - struct ib_gid_table_entry *entry = &table->data_vec[i]; > - > - entry->props |= GID_TABLE_ENTRY_DEFAULT; > - current_gid = find_next_bit(&roce_gid_type_mask, > - BITS_PER_LONG, > - current_gid); > - entry->attr.gid_type = current_gid++; > - } > -} > - > - > static void gid_table_release_one(struct ib_device *ib_dev) { > struct ib_gid_table *table; > @@ -836,7 +878,8 @@ static int _gid_table_setup_one(struct ib_device > *ib_dev) > if (!table) > goto rollback_table_setup; > > - gid_table_reserve_default(ib_dev, rdma_port, table); > + table->default_entries = > + hweight_long(roce_gid_type_mask_support(ib_dev, > rdma_port)); > ib_dev->cache.ports[port].gid = table; > } > return 0; > @@ -885,7 +928,8 @@ int ib_get_cached_gid(struct ib_device *device, > if (!rdma_is_port_valid(device, port_num)) > return -EINVAL; > > - table = device->cache.ports[port_num - rdma_start_port(device)].gid; > + table = get_table(device, port_num); > + > read_lock_irqsave(&table->rwlock, flags); > res = __ib_cache_gid_get(device, port_num, index, gid, gid_attr); > read_unlock_irqrestore(&table->rwlock, flags); @@ -1095,31 +1139,35 > @@ EXPORT_SYMBOL(ib_get_cached_port_state); > static int config_non_roce_gid_cache(struct ib_device *device, > u8 port, int gid_tbl_len) > { > - struct ib_gid_attr gid_attr = {}; > - struct ib_gid_table *table; > - union ib_gid gid; > + struct ib_gid_table *table = get_table(device, port); > + struct ib_gid_entry *entry = NULL; > int ret = 0; > int i; > > - gid_attr.device = device; > - gid_attr.port_num = port; > - table = device->cache.ports[port - rdma_start_port(device)].gid; > - > mutex_lock(&table->lock); > for (i = 0; i < gid_tbl_len; ++i) { > if (!device->query_gid) > continue; > - ret = device->query_gid(device, port, i, &gid); > + > + entry = alloc_entry(device, port); > + if (!entry) > + return -ENOMEM; > + > + ret = device->query_gid(device, port, i, &entry->gid); > if (ret) { > pr_warn("query_gid failed (%d) for %s (index %d)\n", > ret, device->name, i); > goto err; > } > - gid_attr.index = i; > - add_modify_gid(table, &gid, &gid_attr); > + > + entry->attr.index = i; > + ret = add_modify_gid(table, entry); > + if (ret) > + goto err; > } > err: > mutex_unlock(&table->lock); > + put_entry(entry); > return ret; > } > > @@ -1136,7 +1184,7 @@ static void ib_cache_update(struct ib_device *device, > if (!rdma_is_port_valid(device, port)) > return; > > - table = device->cache.ports[port - rdma_start_port(device)].gid; > + table = get_table(device, port); > > tprops = kmalloc(sizeof *tprops, GFP_KERNEL); > if (!tprops) > @@ -1297,6 +1345,12 @@ void ib_cache_cleanup_one(struct ib_device > *device) > ib_unregister_event_handler(&device->cache.event_handler); > flush_workqueue(ib_wq); > gid_table_cleanup_one(device); > + > + /* > + * Flush the wq second time for any pending GID delete > + * work for RoCE device. > + */ > + flush_workqueue(ib_wq); > } > > void __init ib_cache_setup(void) > -- > 2.17.0 I am yet to review this variation, but I have a kref memory based version in progress and have rebased the whole series which simplifies rdma_query_gid() calls. I am testing it. I will take a look at this patch as well. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html