From: Jason Gunthorpe <jgg@xxxxxxxxxxxx> Several drivers need to find the ib_device from a given netdev. rxe needs this at speed in an unsleepable context, so choose to implement the translation using a RCU safe hash table. The hash table can have a many to one mapping. This is intended to support some future case where multiple IB drivers (ie iWarp and RoCE) connect to the same netdevs. driver_ids will need to be different to support this. In the process this makes the struct ib_device and ib_port_data RCU safe by deferring their kfrees. Signed-off-by: Jason Gunthorpe <jgg@xxxxxxxxxxxx> --- drivers/infiniband/core/device.c | 119 +++++++++++++++++++++++++++---- include/rdma/ib_verbs.h | 10 ++- 2 files changed, 116 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 14c91f9af6ccc9..ae70091c20c19f 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -40,6 +40,7 @@ #include <linux/netdevice.h> #include <linux/security.h> #include <linux/notifier.h> +#include <linux/hashtable.h> #include <rdma/rdma_netlink.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> @@ -133,6 +134,10 @@ static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, !xa_is_err(entry); \ (index)++, entry = xan_find_marked(xa, &(index), filter)) +/* RCU hash table mapping netdevice pointers to struct ib_port_data */ +static DEFINE_SPINLOCK(ndev_hash_lock); +static DECLARE_HASHTABLE(ndev_hash, 5); + static void free_netdevs(struct ib_device *ib_dev); static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data); @@ -143,6 +148,12 @@ static struct notifier_block ibdev_lsm_nb = { .notifier_call = ib_security_change, }; +/* Pointer to the RCU head at the start of the ib_port_data array */ +struct ib_port_data_rcu { + struct rcu_head rcu_head; + struct ib_port_data pdata[]; +}; + static int ib_device_check_mandatory(struct ib_device *device) { #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } @@ -294,9 +305,12 @@ static void ib_device_release(struct device *device) WARN_ON(refcount_read(&dev->refcount)); ib_cache_release_one(dev); ib_security_release_port_pkey_list(dev); - kfree(dev->port_data); xa_destroy(&dev->client_data); - kfree(dev); + if (dev->port_data) + kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, + pdata[0]), + rcu_head); + kfree_rcu(dev, rcu_head); } static int ib_device_uevent(struct device *device, @@ -461,6 +475,7 @@ static void remove_client_context(struct ib_device *device, static int alloc_port_data(struct ib_device *device) { + struct ib_port_data_rcu *pdata_rcu; unsigned int port; if (device->port_data) @@ -477,17 +492,26 @@ static int alloc_port_data(struct ib_device *device) * Therefore port_data is declared as a 1 based array with potential * empty slots at the beginning. */ - device->port_data = kcalloc(rdma_end_port(device) + 1, - sizeof(*device->port_data), GFP_KERNEL); - if (!device->port_data) + pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata, + rdma_end_port(device) + 1), + GFP_KERNEL); + if (!pdata_rcu) return -ENOMEM; + /* + * The rcu_head is put in front of the port data array and the stored + * pointer is adjusted since we never need to see that member until + * kfree_rcu. + */ + device->port_data = pdata_rcu->pdata; rdma_for_each_port (device, port) { struct ib_port_data *pdata = &device->port_data[port]; + pdata->ib_dev = device; spin_lock_init(&pdata->pkey_list_lock); INIT_LIST_HEAD(&pdata->pkey_list); spin_lock_init(&pdata->netdev_lock); + INIT_HLIST_NODE(&pdata->ndev_hash_link); } return 0; } @@ -1028,6 +1052,29 @@ int ib_query_port(struct ib_device *device, } EXPORT_SYMBOL(ib_query_port); +static void add_ndev_hash(struct ib_port_data *pdata) +{ + unsigned long flags; + + might_sleep(); + + spin_lock_irqsave(&ndev_hash_lock, flags); + if (hash_hashed(&pdata->ndev_hash_link)) { + hash_del_rcu(&pdata->ndev_hash_link); + spin_unlock_irqrestore(&ndev_hash_lock, flags); + /* + * We cannot do hash_add_rcu after a hash_del_rcu until the + * grace period + */ + synchronize_rcu(); + spin_lock_irqsave(&ndev_hash_lock, flags); + } + if (pdata->netdev) + hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, + (uintptr_t)pdata->netdev); + spin_unlock_irqrestore(&ndev_hash_lock, flags); +} + /** * ib_device_set_netdev - Associate the ib_dev with an underlying net_device * @ib_dev: Device to modify @@ -1064,17 +1111,19 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, pdata = &ib_dev->port_data[port]; spin_lock_irqsave(&pdata->netdev_lock, flags); - if (pdata->netdev == ndev) { + old_ndev = rcu_dereference_protected( + pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); + if (old_ndev == ndev) { spin_unlock_irqrestore(&pdata->netdev_lock, flags); return 0; } - old_ndev = pdata->netdev; if (ndev) dev_hold(ndev); - pdata->netdev = ndev; + rcu_assign_pointer(pdata->netdev, ndev); spin_unlock_irqrestore(&pdata->netdev_lock, flags); + add_ndev_hash(pdata); if (old_ndev) dev_put(old_ndev); @@ -1089,11 +1138,24 @@ static void free_netdevs(struct ib_device *ib_dev) rdma_for_each_port (ib_dev, port) { struct ib_port_data *pdata = &ib_dev->port_data[port]; + struct net_device *ndev; spin_lock_irqsave(&pdata->netdev_lock, flags); - if (pdata->netdev) { - dev_put(pdata->netdev); - pdata->netdev = NULL; + ndev = rcu_dereference_protected( + pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); + if (ndev) { + spin_lock(&ndev_hash_lock); + hash_del_rcu(&pdata->ndev_hash_link); + spin_unlock(&ndev_hash_lock); + + /* + * If this is the last dev_put there is still a + * synchronize_rcu before the netdev is kfreed, so we + * can continue to rely on unlocked pointer + * comparisons after the put + */ + rcu_assign_pointer(pdata->netdev, NULL); + dev_put(ndev); } spin_unlock_irqrestore(&pdata->netdev_lock, flags); } @@ -1118,7 +1180,8 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, res = ib_dev->ops.get_netdev(ib_dev, port); else { spin_lock(&pdata->netdev_lock); - res = pdata->netdev; + res = rcu_dereference_protected( + pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); if (res) dev_hold(res); spin_unlock(&pdata->netdev_lock); @@ -1136,6 +1199,38 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, return res; } +/** + * ib_device_get_by_netdev - Find an IB device associated with a netdev + * @ndev: netdev to locate + * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) + * + * Find and hold an ib_device that is associated with a netdev via + * ib_device_set_netdev(). The caller must call ib_device_put() on the + * returned pointer. + */ +struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, + enum rdma_driver_id driver_id) +{ + struct ib_device *res = NULL; + struct ib_port_data *cur; + + rcu_read_lock(); + hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, + (uintptr_t)ndev) { + if (rcu_access_pointer(cur->netdev) == ndev && + (driver_id == RDMA_DRIVER_UNKNOWN || + cur->ib_dev->driver_id == driver_id) && + ib_device_try_get(cur->ib_dev)) { + res = cur->ib_dev; + break; + } + } + rcu_read_unlock(); + + return res; +} +EXPORT_SYMBOL(ib_device_get_by_netdev); + /** * ib_enum_roce_netdev - enumerate all RoCE ports * @ib_dev : IB device we want to query diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 358bda4a76ff42..585512daef3cb2 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2198,6 +2198,8 @@ struct ib_port_immutable { }; struct ib_port_data { + struct ib_device *ib_dev; + struct ib_port_immutable immutable; spinlock_t pkey_list_lock; @@ -2206,7 +2208,8 @@ struct ib_port_data { struct ib_port_cache cache; spinlock_t netdev_lock; - struct net_device *netdev; + struct net_device __rcu *netdev; + struct hlist_node ndev_hash_link; }; /* rdma netdev type - specifies protocol type */ @@ -2543,6 +2546,7 @@ struct ib_device { struct device *dma_device; struct ib_device_ops ops; char name[IB_DEVICE_NAME_MAX]; + struct rcu_head rcu_head; struct list_head event_handler_list; spinlock_t event_handler_lock; @@ -3997,6 +4001,10 @@ static inline bool ib_device_try_get(struct ib_device *dev) } void ib_device_put(struct ib_device *device); +struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, + enum rdma_driver_id driver_id); +struct ib_device *ib_device_get_by_name(const char *name, + enum rdma_driver_id driver_id); struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr); -- 2.20.1