> * Add an interval tree implementation for ODP umems. Create an interval tree
>   for each ucontext (including a count of the number of ODP MRs in this
>   context, mutex, etc.), and register ODP umems in the interval tree.
> * Add MMU notifiers handling functions, using the interval tree to notify only
>   the relevant umems and underlying MRs.
> * Register to receive MMU notifier events from the MM subsystem upon ODP MR
>   registration (and unregister accordingly).
> * Add a completion object to synchronize the destruction of ODP umems.
> * Add mechanism to abort page faults when there's a concurrent invalidation.
>
> The way we synchronize between concurrent invalidations and page faults is by
> keeping a counter of currently running invalidations, and a sequence number
> that is incremented whenever an invalidation is caught. The page fault code
> checks the counter and also verifies that the sequence number hasn't
> progressed before it updates the umem's page tables. This is similar to what
> the kvm module does.
>
> There's currently a rare race in the code when registering a umem in the
> middle of an ongoing notifier. The proper fix is to either serialize the
> insertion to our umem tree with mm_lock_all or use a ucontext wide running
> notifiers count for retries decision. Either is ugly and can lead to some sort
> of starvation. The current workaround is ugly as well - now the user can end
> up with mapped addresses that are not in the user's address space (although it
> is highly unlikely).

I have been trying to wrap my head around this comment. I am totally
unfamiliar with RDMA code, but from a quick look at it, when registering a
umem you take the mmap_sem in read mode, so any munmap from userspace would
be serialized. Really, the worst that can happen is a umem pointing to an
mmapped file that is concurrently truncated, but even then the address is
still valid; it should result in a SIGBUS, which here is obviously harder to
report (again, I don't know how RDMA works). So am I missing something?

>
> Signed-off-by: Sagi Grimberg <sagig@xxxxxxxxxxxx>
> Signed-off-by: Shachar Raindel <raindel@xxxxxxxxxxxx>
> Signed-off-by: Haggai Eran <haggaie@xxxxxxxxxxxx>
> Signed-off-by: Yuval Dagan <yuvalda@xxxxxxxxxxxx>
> ---
>  drivers/infiniband/Kconfig            |   1 +
>  drivers/infiniband/core/Makefile      |   2 +-
>  drivers/infiniband/core/umem.c        |   2 +-
>  drivers/infiniband/core/umem_odp.c    | 337 +++++++++++++++++++++++++++++++++-
>  drivers/infiniband/core/umem_rbtree.c |  94 ++++++++++
>  drivers/infiniband/core/uverbs_cmd.c  |  16 ++
>  include/rdma/ib_umem_odp.h            |  56 ++++++
>  include/rdma/ib_verbs.h               |  16 ++
>  8 files changed, 512 insertions(+), 12 deletions(-)
>  create mode 100644 drivers/infiniband/core/umem_rbtree.c
>
> diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
> index 089a2c2..b899531 100644
> --- a/drivers/infiniband/Kconfig
> +++ b/drivers/infiniband/Kconfig
> @@ -41,6 +41,7 @@ config INFINIBAND_USER_MEM
>  config INFINIBAND_ON_DEMAND_PAGING
>  	bool "InfiniBand on-demand paging support"
>  	depends on INFINIBAND_USER_MEM
> +	select MMU_NOTIFIER
>  	default y
>  	---help---
>  	  On demand paging support for the InfiniBand subsystem.
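To make sure I read the invalidation counter / sequence number scheme
correctly, here is roughly how I picture the fault side using the helpers
this patch adds (a hypothetical driver-side sketch, not code from this
series; handle_odp_fault() and the device page table update are made up for
illustration):

static int handle_odp_fault(struct ib_umem *umem, u64 addr, u64 len,
			    u64 access_mask)
{
	unsigned long current_seq;
	int npages;

	/* Snapshot the sequence number before faulting pages in. */
	current_seq = atomic_read(&umem->odp_data->notifiers_seq);
	/* Pairs with the smp_wmb() in ib_umem_notifier_end_account(). */
	smp_rmb();

	/* Bails out with -EAGAIN if an invalidation is racing with us. */
	npages = ib_umem_odp_map_dma_pages(umem, addr, len, access_mask,
					   current_seq);
	if (npages < 0)
		return npages;

	mutex_lock(&umem->odp_data->umem_mutex);
	if (ib_umem_mmu_notifier_retry(umem, current_seq)) {
		/* An invalidation ran meanwhile; caller must retry. */
		mutex_unlock(&umem->odp_data->umem_mutex);
		return -EAGAIN;
	}
	/* ... update the device page tables for the mapped range ... */
	mutex_unlock(&umem->odp_data->umem_mutex);

	return npages;
}

If that is the intended pattern, then my question above is really only about
the window between the umem being inserted into the interval tree and the
sequence snapshot, which, as far as I can tell, mmap_sem already covers for
the munmap case.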
> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
> index c58f791..acf7367 100644
> --- a/drivers/infiniband/core/Makefile
> +++ b/drivers/infiniband/core/Makefile
> @@ -11,7 +11,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
>  ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
>  				device.o fmr_pool.o cache.o netlink.o
>  ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
> -ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
> +ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
>
>  ib_mad-y :=			mad.o smi.o agent.o mad_rmpp.o
>
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index e9798e0..014977f 100644
> --- a/drivers/infiniband/core/umem.c
> +++ b/drivers/infiniband/core/umem.c
> @@ -72,7 +72,7 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
>   * ib_umem_get - Pin and DMA map userspace memory.
>   *
>   * If access flags indicate ODP memory, avoid pinning. Instead, stores
> - * the mm for future page fault handling.
> + * the mm for future page fault handling in conjuction with MMU notifiers.
>   *
>   * @context: userspace context to pin memory for
>   * @addr: userspace virtual address to start at
> diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
> index 0c90ce50..c048269 100644
> --- a/drivers/infiniband/core/umem_odp.c
> +++ b/drivers/infiniband/core/umem_odp.c
> @@ -41,26 +41,204 @@
>  #include <rdma/ib_umem.h>
>  #include <rdma/ib_umem_odp.h>
>
> +void ib_umem_notifier_start_account(struct ib_umem *item)
> +{
> +	int notifiers_count;
> +	mutex_lock(&item->odp_data->umem_mutex);
> +	/*
> +	 * Avoid performing another locked operation, as we are
> +	 * already protected by the wrapping mutex.
> +	 */
> +	notifiers_count = atomic_read(&item->odp_data->notifiers_count) + 1;
> +	if (notifiers_count == 1)
> +		reinit_completion(&item->odp_data->notifier_completion);
> +	atomic_set(&item->odp_data->notifiers_count,
> +		   notifiers_count);
> +	mutex_unlock(&item->odp_data->umem_mutex);
> +}
> +EXPORT_SYMBOL(ib_umem_notifier_start_account);
> +
> +void ib_umem_notifier_end_account(struct ib_umem *item)
> +{
> +	int notifiers_count, notifiers_seq;
> +	mutex_lock(&item->odp_data->umem_mutex);
> +	/*
> +	 * This sequence increase will notify the QP page fault that
> +	 * the page that is going to be mapped in the spte could have
> +	 * been freed.
> +	 */
> +	notifiers_seq = atomic_read(&item->odp_data->notifiers_seq) + 1;
> +	atomic_set(&item->odp_data->notifiers_seq,
> +		   notifiers_seq);
> +	/*
> +	 * The above sequence increase must be visible before the
> +	 * below count decrease, which is ensured by the smp_wmb below
> +	 * in conjunction with the smp_rmb in mmu_notifier_retry().
> +	 */
> +	smp_wmb();
> +
> +	notifiers_count = atomic_read(&item->odp_data->notifiers_count);
> +	/*
> +	 * This is a workaround for the unlikely case where we register a umem
> +	 * in the middle of an ongoing notifier.
> +	 */
> +	if (notifiers_count > 0)
> +		notifiers_count -= 1;
> +	else
> +		pr_warn("Got notifier end call without a previous start call");
> +	atomic_set(&item->odp_data->notifiers_count,
> +		   notifiers_count);
> +	if (notifiers_count == 0)
> +		complete_all(&item->odp_data->notifier_completion);
> +	mutex_unlock(&item->odp_data->umem_mutex);
> +}
> +
> +
> +static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start,
> +					       u64 end, void *cookie) {
> +	/*
> +	 * Increase the number of notifiers running, to
> +	 * prevent any further fault handling on this MR.
> +	 */
> +	ib_umem_notifier_start_account(item);
> +	item->odp_data->dying = 1;
> +	/* Make sure that the fact the umem is dying is out before we release
> +	 * all pending page faults. */
> +	smp_wmb();
> +	complete_all(&item->odp_data->notifier_completion);
> +	item->context->invalidate_range(item, ib_umem_start(item),
> +					ib_umem_end(item));
> +	return 0;
> +}
> +
> +static void ib_umem_notifier_release(struct mmu_notifier *mn,
> +				     struct mm_struct *mm)
> +{
> +	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
> +
> +	if (!context->invalidate_range)
> +		return;
> +
> +	down_read(&context->umem_mutex);
> +
> +	rbt_ib_umem_for_each_in_range(&context->umem_tree, 0,
> +				      ULLONG_MAX,
> +				      ib_umem_notifier_release_trampoline,
> +				      NULL);
> +
> +	up_read(&context->umem_mutex);
> +}
> +
> +static int invalidate_page_trampoline(struct ib_umem *item, u64 start,
> +				      u64 end, void *cookie)
> +{
> +	ib_umem_notifier_start_account(item);
> +	item->context->invalidate_range(item, start, start + PAGE_SIZE);
> +	ib_umem_notifier_end_account(item);
> +	return 0;
> +}
> +
> +static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn,
> +					     struct mm_struct *mm,
> +					     unsigned long address)
> +{
> +	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
> +
> +	if (!context->invalidate_range)
> +		return;
> +
> +	down_read(&context->umem_mutex);
> +	rbt_ib_umem_for_each_in_range(&context->umem_tree, address,
> +				      address + PAGE_SIZE,
> +				      invalidate_page_trampoline, NULL);
> +	up_read(&context->umem_mutex);
> +}
> +
> +static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
> +					     u64 end, void *cookie)
> +{
> +	ib_umem_notifier_start_account(item);
> +	item->context->invalidate_range(item, start, end);
> +	return 0;
> +}
> +
> +static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
> +						    struct mm_struct *mm,
> +						    unsigned long start,
> +						    unsigned long end)
> +{
> +	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
> +
> +	if (!context->invalidate_range)
> +		return;
> +
> +	down_read(&context->umem_mutex);
> +	rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
> +				      end,
> +				      invalidate_range_start_trampoline, NULL);
> +	up_read(&context->umem_mutex);
> +}
> +
> +static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start,
> +					   u64 end, void *cookie)
> +{
> +	ib_umem_notifier_end_account(item);
> +	return 0;
> +}
> +
> +static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
> +						  struct mm_struct *mm,
> +						  unsigned long start,
> +						  unsigned long end)
> +{
> +	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
> +
> +	if (!context->invalidate_range)
> +		return;
> +
> +	down_read(&context->umem_mutex);
> +	rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
> +				      end,
> +				      invalidate_range_end_trampoline, NULL);
> +	up_read(&context->umem_mutex);
> +}
> +
> +static struct mmu_notifier_ops ib_umem_notifiers = {
> +	.release                    = ib_umem_notifier_release,
> +	.invalidate_page            = ib_umem_notifier_invalidate_page,
> +	.invalidate_range_start     = ib_umem_notifier_invalidate_range_start,
> +	.invalidate_range_end       = ib_umem_notifier_invalidate_range_end,
> +};
> +
>  int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
>  {
>  	int ret_val;
>  	struct pid *our_pid;
> +	struct mm_struct *mm = get_task_mm(current);
> +	BUG_ON(!mm);
>
>  	/* Prevent creating ODP MRs in child processes */
>  	rcu_read_lock();
>  	our_pid = get_task_pid(current->group_leader, PIDTYPE_PID);
>  	rcu_read_unlock();
>  	put_pid(our_pid);
> -	if (context->tgid != our_pid)
> -		return -EINVAL;
> +	if (context->tgid != our_pid) {
> +		ret_val = -EINVAL;
> +		goto out_mm;
> +	}
>
>  	umem->hugetlb = 0;
>  	umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL);
> -	if (!umem->odp_data)
> -		return -ENOMEM;
> +	if (!umem->odp_data) {
> +		ret_val = -ENOMEM;
> +		goto out_mm;
> +	}
> +	umem->odp_data->umem = umem;
>
>  	mutex_init(&umem->odp_data->umem_mutex);
>
> +	init_completion(&umem->odp_data->notifier_completion);
> +
>  	umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) *
>  					    sizeof(*umem->odp_data->page_list));
>  	if (!umem->odp_data->page_list) {
> @@ -75,17 +253,66 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
>  		goto out_page_list;
>  	}
>
> +	/*
> +	 * When using MMU notifiers, we will get a
> +	 * notification before the "current" task (and MM) is
> +	 * destroyed. We use the umem_mutex lock to synchronize.
> +	 */
> +	down_write(&context->umem_mutex);
> +	context->odp_mrs_count++;
> +	if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
> +		rbt_ib_umem_insert(&umem->odp_data->interval_tree,
> +				   &context->umem_tree);
> +	downgrade_write(&context->umem_mutex);
> +
> +	if (context->odp_mrs_count == 1) {
> +		/*
> +		 * Note that at this point, no MMU notifier is running
> +		 * for this context!
> +		 */
> +		INIT_HLIST_NODE(&context->mn.hlist);
> +		context->mn.ops = &ib_umem_notifiers;
> +		/*
> +		 * Lock-dep detects a false positive for mmap_sem vs.
> +		 * umem_mutex, due to not grasping downgrade_write correctly.
> +		 */
> +		lockdep_off();
> +		ret_val = mmu_notifier_register(&context->mn, mm);
> +		lockdep_on();
> +		if (ret_val) {
> +			pr_err("Failed to register mmu_notifier %d\n", ret_val);
> +			ret_val = -EBUSY;
> +			goto out_mutex;
> +		}
> +	}
> +
> +	up_read(&context->umem_mutex);
> +
> +	/*
> +	 * Note that doing an mmput can cause a notifier for the relevant mm.
> +	 * If the notifier is called while we hold the umem_mutex, this will
> +	 * cause a deadlock. Therefore, we release the reference only after we
> +	 * released the mutex.
> +	 */
> +	mmput(mm);
>  	return 0;
>
> +out_mutex:
> +	up_read(&context->umem_mutex);
> +	vfree(umem->odp_data->dma_list);
>  out_page_list:
>  	vfree(umem->odp_data->page_list);
>  out_odp_data:
>  	kfree(umem->odp_data);
> +out_mm:
> +	mmput(mm);
>  	return ret_val;
>  }
>
>  void ib_umem_odp_release(struct ib_umem *umem)
>  {
> +	struct ib_ucontext *context = umem->context;
> +
>  	/*
>  	 * Ensure that no more pages are mapped in the umem.
>  	 *
> @@ -95,6 +322,49 @@ void ib_umem_odp_release(struct ib_umem *umem)
>  	ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem),
>  				    ib_umem_end(umem));
>
> +	down_write(&context->umem_mutex);
> +	if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
> +		rbt_ib_umem_remove(&umem->odp_data->interval_tree,
> +				   &context->umem_tree);
> +	context->odp_mrs_count--;
> +
> +	/*
> +	 * Downgrade the lock to a read lock. This ensures that the notifiers
> +	 * (who lock the mutex for reading) will be able to finish, and we
> +	 * will be able to enventually obtain the mmu notifiers SRCU. Note
> +	 * that since we are doing it atomically, no other user could register
> +	 * and unregister while we do the check.
> +	 */
> +	downgrade_write(&context->umem_mutex);
> +	if (!context->odp_mrs_count) {
> +		struct task_struct *owning_process = NULL;
> +		struct mm_struct *owning_mm        = NULL;
> +		owning_process = get_pid_task(context->tgid,
> +					      PIDTYPE_PID);
> +		if (owning_process == NULL)
> +			/*
> +			 * The process is already dead, notifier were removed
> +			 * already.
> +			 */
> +			goto out;
> +
> +		owning_mm = get_task_mm(owning_process);
> +		if (owning_mm == NULL)
> +			/*
> +			 * The process' mm is already dead, notifier were
> +			 * removed already.
> +			 */
> +			goto out_put_task;
> +		mmu_notifier_unregister(&context->mn, owning_mm);
> +
> +		mmput(owning_mm);
> +
> +out_put_task:
> +		put_task_struct(owning_process);
> +	}
> +out:
> +	up_read(&context->umem_mutex);
> +
>  	vfree(umem->odp_data->dma_list);
>  	vfree(umem->odp_data->page_list);
>  	kfree(umem);
> @@ -111,7 +381,8 @@ void ib_umem_odp_release(struct ib_umem *umem)
>   * the sequence number is taken from
>   * umem->odp_data->notifiers_seq.
>   *
> - * The function returns -EFAULT if the DMA mapping operation fails.
> + * The function returns -EFAULT if the DMA mapping operation fails. It returns
> + * -EAGAIN if a concurrent invalidation prevents us from updating the page. It
>   *
>   * The page is released via put_page even if the operation failed. For
>   * on-demand pinning, the page is released whenever it isn't stored in the
> @@ -120,6 +391,7 @@ void ib_umem_odp_release(struct ib_umem *umem)
>  static int ib_umem_odp_map_dma_single_page(
>  		struct ib_umem *umem,
>  		int page_index,
> +		u64 base_virt_addr,
>  		struct page *page,
>  		u64 access_mask,
>  		unsigned long current_seq)
> @@ -127,8 +399,18 @@ static int ib_umem_odp_map_dma_single_page(
>  	struct ib_device *dev = umem->context->device;
>  	dma_addr_t dma_addr;
>  	int stored_page = 0;
> +	int remove_existing_mapping = 0;
>  	int ret = 0;
>  	mutex_lock(&umem->odp_data->umem_mutex);
> +	/*
> +	 * Note: we avoid writing if seq is different from the initial seq, to
> +	 * handle case of a racing notifier. This check also allows us to bail
> +	 * early if we have a notifier running in parallel with us.
> +	 */
> +	if (ib_umem_mmu_notifier_retry(umem, current_seq)) {
> +		ret = -EAGAIN;
> +		goto out;
> +	}
>  	if (!(umem->odp_data->dma_list[page_index])) {
>  		dma_addr = ib_dma_map_page(dev,
>  					   page,
> @@ -146,14 +428,27 @@ static int ib_umem_odp_map_dma_single_page(
>  	} else {
>  		pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
>  		       umem->odp_data->page_list[page_index], page);
> +		/* Better remove the mapping now, to prevent any further
> +		 * damage. */
> +		remove_existing_mapping = 1;
>  	}
>
>  out:
>  	mutex_unlock(&umem->odp_data->umem_mutex);
>
> -	if (!stored_page)
> +	/* On Demand Paging - avoid pinning the page */
> +	if (umem->context->invalidate_range || !stored_page)
>  		put_page(page);
>
> +	if (remove_existing_mapping && umem->context->invalidate_range) {
> +		invalidate_page_trampoline(
> +			umem,
> +			base_virt_addr + (page_index * PAGE_SIZE),
> +			base_virt_addr + ((page_index+1)*PAGE_SIZE),
> +			NULL);
> +		ret = -EAGAIN;
> +	}
> +
>  	return ret;
>  }
>
> @@ -166,6 +461,8 @@ out:
>   *
>   * Returns the number of pages mapped in success, negative error code
>   * for failure.
> + * An -EAGAIN error code is returned when a concurrent mmu notifier prevents
> + * the function from completing its task.
>   *
>   * @umem: the umem to map and pin
>   * @user_virt: the address from which we need to map.
> @@ -187,6 +484,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
>  	struct page       **local_page_list = NULL;
>  	u64 off;
>  	int j, k, ret = 0, start_idx, npages = 0;
> +	u64 base_virt_addr;
>
>  	if (access_mask == 0)
>  		return -EINVAL;
> @@ -201,6 +499,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
>
>  	off = user_virt & (~PAGE_MASK);
>  	user_virt = user_virt & PAGE_MASK;
> +	base_virt_addr = user_virt;
>  	bcnt += off; /* Charge for the first page offset as well. */
>
>  	start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT;
> @@ -242,8 +541,8 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
>  		user_virt += npages << PAGE_SHIFT;
>  		for (j = 0; j < npages; ++j) {
>  			ret = ib_umem_odp_map_dma_single_page(
> -				umem, k, local_page_list[j], access_mask,
> -				current_seq);
> +				umem, k, base_virt_addr, local_page_list[j],
> +				access_mask, current_seq);
>  			if (ret < 0)
>  				break;
>  			k++;
> @@ -280,6 +579,11 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt,
>  	struct ib_device *dev = umem->context->device;
>  	virt = max_t(u64, virt, ib_umem_start(umem));
>  	bound = min_t(u64, bound, ib_umem_end(umem));
> +	/* Note that during the run of this function, the
> +	 * notifiers_count of the MR is > 0, preventing any racing
> +	 * faults from completion. We might be racing with other
> +	 * invalidations, so we must make sure we free each page only
> +	 * once. */
>  	for (addr = virt; addr < bound; addr += (u64)umem->page_size) {
>  		idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
>  		mutex_lock(&umem->odp_data->umem_mutex);
> @@ -294,8 +598,21 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt,
>  			ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE,
>  					  DMA_BIDIRECTIONAL);
>  			if (umem->writable)
> -				set_page_dirty_lock(head_page);
> -			put_page(page);
> +				/*
> +				 * set_page_dirty prefers being called with
> +				 * the page lock. However, MMU notifiers are
> +				 * called sometimes with and sometimes without
> +				 * the lock. We rely on the umem_mutex instead
> +				 * to prevent other mmu notifiers from
> +				 * continuing and allowing the page mapping to
> +				 * be removed.
> +				 */
> +				set_page_dirty(head_page);
> +			/* on demand pinning support */
> +			if (!umem->context->invalidate_range)
> +				put_page(page);
> +			umem->odp_data->page_list[idx] = NULL;
> +			umem->odp_data->dma_list[idx] = 0;
>  		}
>  		mutex_unlock(&umem->odp_data->umem_mutex);
>  	}
> diff --git a/drivers/infiniband/core/umem_rbtree.c b/drivers/infiniband/core/umem_rbtree.c
> new file mode 100644
> index 0000000..727d788
> --- /dev/null
> +++ b/drivers/infiniband/core/umem_rbtree.c
> @@ -0,0 +1,94 @@
> +/*
> + * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
> + *
> + * This software is available to you under a choice of one of two
> + * licenses. You may choose to be licensed under the terms of the GNU
> + * General Public License (GPL) Version 2, available from the file
> + * COPYING in the main directory of this source tree, or the
> + * OpenIB.org BSD license below:
> + *
> + *     Redistribution and use in source and binary forms, with or
> + *     without modification, are permitted provided that the following
> + *     conditions are met:
> + *
> + *      - Redistributions of source code must retain the above
> + *        copyright notice, this list of conditions and the following
> + *        disclaimer.
> + *
> + *      - Redistributions in binary form must reproduce the above
> + *        copyright notice, this list of conditions and the following
> + *        disclaimer in the documentation and/or other materials
> + *        provided with the distribution.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/interval_tree_generic.h>
> +#include <linux/sched.h>
> +#include <linux/gfp.h>
> +#include <rdma/ib_umem_odp.h>
> +
> +/*
> + * The ib_umem list keeps track of memory regions for which the HW
> + * device request to receive notification when the related memory
> + * mapping is changed.
> + *
> + * ib_umem_lock protects the list.
> + */
> +
> +static inline u64 node_start(struct umem_odp_node *n)
> +{
> +	struct ib_umem_odp *umem_odp =
> +			container_of(n, struct ib_umem_odp, interval_tree);
> +
> +	return ib_umem_start(umem_odp->umem);
> +}
> +
> +/* Note that the representation of the intervals in the interval tree
> + * considers the ending point as contained in the interval, while the
> + * function ib_umem_end returns the first address which is not contained
> + * in the umem.
> + */
> +static inline u64 node_last(struct umem_odp_node *n)
> +{
> +	struct ib_umem_odp *umem_odp =
> +			container_of(n, struct ib_umem_odp, interval_tree);
> +
> +	return ib_umem_end(umem_odp->umem) - 1;
> +}
> +
> +INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
> +		     node_start, node_last, , rbt_ib_umem)
> +
> +/* @last is not a part of the interval. See comment for function
> + * node_last.
> + */
> +int rbt_ib_umem_for_each_in_range(struct rb_root *root,
> +				  u64 start, u64 last,
> +				  umem_call_back cb,
> +				  void *cookie)
> +{
> +	int ret_val = 0;
> +	struct umem_odp_node *node;
> +	struct ib_umem_odp *umem;
> +
> +	if (unlikely(start == last))
> +		return ret_val;
> +
> +	for (node = rbt_ib_umem_iter_first(root, start, last - 1); node;
> +			node = rbt_ib_umem_iter_next(node, start, last - 1)) {
> +		umem = container_of(node, struct ib_umem_odp, interval_tree);
> +		ret_val = cb(umem->umem, start, last, cookie) || ret_val;
> +	}
> +
> +	return ret_val;
> +}
> diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
> index fe709ca..a81d0c7 100644
> --- a/drivers/infiniband/core/uverbs_cmd.c
> +++ b/drivers/infiniband/core/uverbs_cmd.c
> @@ -289,6 +289,9 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
>  	struct ib_uverbs_get_context_resp resp;
>  	struct ib_udata                   udata;
>  	struct ib_device                 *ibdev = file->device->ib_dev;
> +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
> +	struct ib_device_attr		  dev_attr;
> +#endif
>  	struct ib_ucontext		 *ucontext;
>  	struct file			 *filp;
>  	int ret;
> @@ -331,6 +334,19 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
>  	rcu_read_unlock();
>  	ucontext->closing = 0;
>
> +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
> +	ucontext->umem_tree = RB_ROOT;
> +	init_rwsem(&ucontext->umem_mutex);
> +	ucontext->odp_mrs_count = 0;
> +
> +	ret = ib_query_device(ibdev, &dev_attr);
> +	if (ret)
> +		goto err_free;
> +	if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
> +		ucontext->invalidate_range = NULL;
> +
> +#endif
> +
>  	resp.num_comp_vectors = file->device->num_comp_vectors;
>
>  	ret = get_unused_fd_flags(O_CLOEXEC);
> diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
> index 375ce28..9b93206 100644
> --- a/include/rdma/ib_umem_odp.h
> +++ b/include/rdma/ib_umem_odp.h
> @@ -34,6 +34,12 @@
>  #define IB_UMEM_ODP_H
>
>  #include <rdma/ib_umem.h>
> +#include <linux/interval_tree.h>
> +
> +struct umem_odp_node {
> +	u64 __subtree_last;
> +	struct rb_node rb;
> +};
>
>  struct ib_umem_odp {
>  	/*
> @@ -58,6 +64,14 @@ struct ib_umem_odp {
>
>  	atomic_t		notifiers_seq;
>  	atomic_t		notifiers_count;
> +
> +	struct ib_umem		*umem;
> +
> +	/* Tree tracking */
> +	struct umem_odp_node	interval_tree;
> +
> +	struct completion	notifier_completion;
> +	int			dying;
>  };
>
>  #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
> @@ -85,6 +99,48 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bcnt,
>  void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset,
>  				 u64 bound);
>
> +void rbt_ib_umem_insert(struct umem_odp_node *node, struct rb_root *root);
> +void rbt_ib_umem_remove(struct umem_odp_node *node, struct rb_root *root);
> +typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end,
> +			      void *cookie);
> +/*
> + * Call the callback on each ib_umem in the range. Returns the logical or of
> + * the return values of the functions called.
> + */
> +int rbt_ib_umem_for_each_in_range(struct rb_root *root, u64 start, u64 end,
> +				  umem_call_back cb, void *cookie);
> +
> +struct umem_odp_node *rbt_ib_umem_iter_first(struct rb_root *root,
> +					     u64 start, u64 last);
> +struct umem_odp_node *rbt_ib_umem_iter_next(struct umem_odp_node *node,
> +					    u64 start, u64 last);
> +
> +static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item,
> +					     unsigned long mmu_seq)
> +{
> +	/*
> +	 * This code is strongly based on the KVM code from
> +	 * mmu_notifier_retry. Should be called with
> +	 * item->odp_data->umem_mutex locked.
> +	 */
> +	if (unlikely(atomic_read(&item->odp_data->notifiers_count)))
> +		return 1;
> +	/*
> +	 * Ensure the read of mmu_notifier_count happens before the read
> +	 * of mmu_notifier_seq. This interacts with the smp_wmb() in
> +	 * mmu_notifier_invalidate_range_end to make sure that the caller
> +	 * either sees the old (non-zero) value of mmu_notifier_count or
> +	 * the new (incremented) value of mmu_notifier_seq.
> +	 */
> +	smp_rmb();
> +	if (atomic_read(&item->odp_data->notifiers_seq) != mmu_seq)
> +		return 1;
> +	return 0;
> +}
> +
> +void ib_umem_notifier_start_account(struct ib_umem *item);
> +void ib_umem_notifier_end_account(struct ib_umem *item);
> +
>  #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>
>  static inline int ib_umem_odp_get(struct ib_ucontext *context,
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index 6d7f75e..54df9a8 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -51,6 +51,7 @@
>  #include <uapi/linux/if_ether.h>
>
>  #include <linux/atomic.h>
> +#include <linux/mmu_notifier.h>
>  #include <asm/uaccess.h>
>
>  extern struct workqueue_struct *ib_wq;
> @@ -1142,6 +1143,8 @@ struct ib_fmr_attr {
>  	u8	page_shift;
>  };
>
> +struct ib_umem;
> +
>  struct ib_ucontext {
>  	struct ib_device       *device;
>  	struct list_head	pd_list;
> @@ -1157,6 +1160,19 @@ struct ib_ucontext {
>
>  	/* For ODP support: */
>  	struct pid		*tgid;
> +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
> +	struct rb_root		umem_tree;
> +	/*
> +	 * Protects .umem_rbroot and tree, as well as odp_mrs_count and
> +	 * mmu notifiers registration.
> +	 */
> +	struct rw_semaphore	umem_mutex;
> +	void (*invalidate_range)(struct ib_umem *umem,
> +				 unsigned long start, unsigned long end);
> +
> +	struct mmu_notifier	mn;
> +	int			odp_mrs_count;
> +#endif
>  };
>
>  struct ib_uobject {
> --
> 1.7.11.2
>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html