[PATCH RFC 2/2] skiplists for the IOMMU

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This is only for experimental use, and it is only lightly tested.

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 0099667..32d2920 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -1403,7 +1403,6 @@ static void iommu_detach_domain(struct dmar_domain *domain,
 }
 
 static struct iova_domain reserved_iova_list;
-static struct lock_class_key reserved_rbtree_key;
 
 static int dmar_init_reserved_ranges(void)
 {
@@ -1413,9 +1412,6 @@ static int dmar_init_reserved_ranges(void)
 
 	init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
 
-	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
-		&reserved_rbtree_key);
-
 	/* IOAPIC ranges shouldn't be accessed by DMA */
 	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
 		IOVA_PFN(IOAPIC_RANGE_END));
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 67da6cff..efd5518 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -18,182 +18,113 @@
  */
 
 #include <linux/iova.h>
+#include <linux/random.h>
 
+#define IOVA_PENDING_INIT ((unsigned long) -1)
 void
 init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit)
 {
-	spin_lock_init(&iovad->iova_rbtree_lock);
-	iovad->rbroot = RB_ROOT;
-	iovad->cached32_node = NULL;
+	spin_lock_init(&iovad->del_skiplist_lock);
+	spin_lock_init(&iovad->skiplist_lock);
+	sl_init_list(&iovad->skiplist, GFP_ATOMIC);
 	iovad->dma_32bit_pfn = pfn_32bit;
 }
 
-static struct rb_node *
-__get_cached_rbnode(struct iova_domain *iovad, unsigned long *limit_pfn)
+static unsigned long
+__get_cached_addr(struct iova_domain *iovad, unsigned long limit_pfn)
 {
-	if ((*limit_pfn != iovad->dma_32bit_pfn) ||
-		(iovad->cached32_node == NULL))
-		return rb_last(&iovad->rbroot);
-	else {
-		struct rb_node *prev_node = rb_prev(iovad->cached32_node);
-		struct iova *curr_iova =
-			container_of(iovad->cached32_node, struct iova, node);
-		*limit_pfn = curr_iova->pfn_lo - 1;
-		return prev_node;
-	}
-}
-
-static void
-__cached_rbnode_insert_update(struct iova_domain *iovad,
-	unsigned long limit_pfn, struct iova *new)
-{
-	if (limit_pfn != iovad->dma_32bit_pfn)
-		return;
-	iovad->cached32_node = &new->node;
-}
+	unsigned long guess;
 
-static void
-__cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
-{
-	struct iova *cached_iova;
-	struct rb_node *curr;
+	prandom_bytes(&guess, sizeof(guess));
 
-	if (!iovad->cached32_node)
-		return;
-	curr = iovad->cached32_node;
-	cached_iova = container_of(curr, struct iova, node);
-
-	if (free->pfn_lo >= cached_iova->pfn_lo) {
-		struct rb_node *node = rb_next(&free->node);
-		struct iova *iova = container_of(node, struct iova, node);
-
-		/* only cache if it's below 32bit pfn */
-		if (node && iova->pfn_lo < iovad->dma_32bit_pfn)
-			iovad->cached32_node = node;
-		else
-			iovad->cached32_node = NULL;
+	/* the skiplist code is fastest when we spread out the
+	 * key range as much as possible.  Instead of caching the
+	 * last freed or allocated number, return random guesses
+	 * in hopes of fanning out our locking attempts.
+	 */
+	if (limit_pfn == iovad->dma_32bit_pfn) {
+		guess = guess % iovad->dma_32bit_pfn;
+		guess = max_t(unsigned long, guess, IOVA_START_PFN);
+		return guess;
+	} else {
+		guess = max_t(unsigned long, guess, IOVA_START_PFN);
+		guess = min_t(unsigned long, guess, (~0UL) >> 1);
+		return guess;
 	}
 }
 
-/* Computes the padding size required, to make the
- * the start address naturally aligned on its size
- */
-static int
-iova_get_pad_size(int size, unsigned int limit_pfn)
-{
-	unsigned int pad_size = 0;
-	unsigned int order = ilog2(size);
-
-	if (order)
-		pad_size = (limit_pfn + 1) % (1 << order);
-
-	return pad_size;
-}
-
 static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
 		unsigned long size, unsigned long limit_pfn,
 			struct iova *new, bool size_aligned)
 {
-	struct rb_node *prev, *curr = NULL;
 	unsigned long flags;
-	unsigned long saved_pfn;
-	unsigned int pad_size = 0;
-
-	/* Walk the tree backwards */
-	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
-	saved_pfn = limit_pfn;
-	curr = __get_cached_rbnode(iovad, &limit_pfn);
-	prev = curr;
-	while (curr) {
-		struct iova *curr_iova = container_of(curr, struct iova, node);
-
-		if (limit_pfn < curr_iova->pfn_lo)
-			goto move_left;
-		else if (limit_pfn < curr_iova->pfn_hi)
-			goto adjust_limit_pfn;
-		else {
-			if (size_aligned)
-				pad_size = iova_get_pad_size(size, limit_pfn);
-			if ((curr_iova->pfn_hi + size + pad_size) <= limit_pfn)
-				break;	/* found a free slot */
-		}
-adjust_limit_pfn:
-		limit_pfn = curr_iova->pfn_lo - 1;
-move_left:
-		prev = curr;
-		curr = rb_prev(curr);
-	}
+	unsigned long align = 1;
+	unsigned long hint;
 
-	if (!curr) {
-		if (size_aligned)
-			pad_size = iova_get_pad_size(size, limit_pfn);
-		if ((IOVA_START_PFN + size + pad_size) > limit_pfn) {
-			spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-			return -ENOMEM;
-		}
-	}
+	int ret;
 
-	/* pfn_lo will point to size aligned address if size_aligned is set */
-	new->pfn_lo = limit_pfn - (size + pad_size) + 1;
-	new->pfn_hi = new->pfn_lo + size - 1;
-
-	/* Insert the new_iova into domain rbtree by holding writer lock */
-	/* Add new node and rebalance tree. */
-	{
-		struct rb_node **entry, *parent = NULL;
-
-		/* If we have 'prev', it's a valid place to start the
-		   insertion. Otherwise, start from the root. */
-		if (prev)
-			entry = &prev;
-		else
-			entry = &iovad->rbroot.rb_node;
-
-		/* Figure out where to put new node */
-		while (*entry) {
-			struct iova *this = container_of(*entry,
-							struct iova, node);
-			parent = *entry;
-
-			if (new->pfn_lo < this->pfn_lo)
-				entry = &((*entry)->rb_left);
-			else if (new->pfn_lo > this->pfn_lo)
-				entry = &((*entry)->rb_right);
-			else
-				BUG(); /* this should not happen */
-		}
+	if (size_aligned)
+		align = size;
 
-		/* Add new node and rebalance tree. */
-		rb_link_node(&new->node, parent, entry);
-		rb_insert_color(&new->node, &iovad->rbroot);
+	/*
+	 * make sure that a lockless search into the tree
+	 * understands we're still doing setup on this iova
+	 */
+	new->pfn_lo = IOVA_PENDING_INIT;
+	new->pfn_hi = IOVA_PENDING_INIT;
+	smp_wmb();
+again:
+	local_irq_save(flags);
+	hint = __get_cached_addr(iovad, limit_pfn);
+	ret = skiplist_insert_hole(&iovad->skiplist,
+				   hint,
+				   limit_pfn, size, align,
+				   &new->slot, GFP_ATOMIC);
+	/*
+	 * insert hole returns -eagain when it found a good
+	 * spot but someone raced in and stole it.  If
+	 * that happens pick a new hint and try again
+	 */
+	if (ret == -EAGAIN) {
+		local_irq_restore(flags);
+		goto again;
 	}
-	__cached_rbnode_insert_update(iovad, saved_pfn, new);
 
-	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+	/* we're fully inserted, set our lo/hi */
+	new->pfn_lo = new->slot.key;
+	new->pfn_hi = new->slot.key + new->slot.size - 1;
+	smp_wmb();
+	local_irq_restore(flags);
 
+	if (ret)
+		return -ENOMEM;
 
 	return 0;
 }
 
-static void
-iova_insert_rbtree(struct rb_root *root, struct iova *iova)
+static int
+iova_insert_skiplist(struct sl_list *skiplist, struct iova *iova)
 {
-	struct rb_node **new = &(root->rb_node), *parent = NULL;
-	/* Figure out where to put new node */
-	while (*new) {
-		struct iova *this = container_of(*new, struct iova, node);
-		parent = *new;
-
-		if (iova->pfn_lo < this->pfn_lo)
-			new = &((*new)->rb_left);
-		else if (iova->pfn_lo > this->pfn_lo)
-			new = &((*new)->rb_right);
-		else
-			BUG(); /* this should not happen */
+	int ret;
+	int preload_token;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	preload_token = skiplist_preload(skiplist, GFP_ATOMIC);
+	if (preload_token < 0) {
+		ret = preload_token;
+		local_irq_restore(flags);
+		goto out;
 	}
-	/* Add new node and rebalance tree. */
-	rb_link_node(&iova->node, parent, new);
-	rb_insert_color(&iova->node, root);
+
+	iova->slot.key = iova->pfn_lo;
+	iova->slot.size = iova->pfn_hi - iova->pfn_lo + 1;
+
+	ret = skiplist_insert(skiplist, &iova->slot, preload_token);
+	local_irq_restore(flags);
+	preempt_enable();
+out:
+	return ret;
 }
 
 /**
@@ -245,53 +176,24 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,
  */
 struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
 {
+	struct sl_slot *slot;
 	unsigned long flags;
-	struct rb_node *node;
-
-	/* Take the lock so that no other thread is manipulating the rbtree */
-	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
-	node = iovad->rbroot.rb_node;
-	while (node) {
-		struct iova *iova = container_of(node, struct iova, node);
-
-		/* If pfn falls within iova's range, return iova */
-		if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
-			spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-			/* We are not holding the lock while this iova
-			 * is referenced by the caller as the same thread
-			 * which called this function also calls __free_iova()
-			 * and it is by design that only one thread can possibly
-			 * reference a particular iova and hence no conflict.
-			 */
-			return iova;
-		}
+	struct iova *iova;
 
-		if (pfn < iova->pfn_lo)
-			node = node->rb_left;
-		else if (pfn > iova->pfn_lo)
-			node = node->rb_right;
+	local_irq_save(flags);
+	slot = skiplist_lookup(&iovad->skiplist, pfn, 1);
+	if (!slot) {
+		local_irq_restore(flags);
+		return NULL;
 	}
-
-	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-	return NULL;
-}
-
-/**
- * __free_iova - frees the given iova
- * @iovad: iova domain in question.
- * @iova: iova in question.
- * Frees the given iova belonging to the giving domain
- */
-void
-__free_iova(struct iova_domain *iovad, struct iova *iova)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
-	__cached_rbnode_delete_update(iovad, iova);
-	rb_erase(&iova->node, &iovad->rbroot);
-	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-	free_iova_mem(iova);
+	iova = sl_slot_entry(slot, struct iova, slot);
+	while (iova->pfn_lo == IOVA_PENDING_INIT ||
+	       iova->pfn_hi == IOVA_PENDING_INIT) {
+		cpu_relax();
+		smp_rmb();
+	}
+	local_irq_restore(flags);
+	return iova;
 }
 
 /**
@@ -304,12 +206,29 @@ __free_iova(struct iova_domain *iovad, struct iova *iova)
 void
 free_iova(struct iova_domain *iovad, unsigned long pfn)
 {
-	struct iova *iova = find_iova(iovad, pfn);
-	if (iova)
-		__free_iova(iovad, iova);
+	struct iova *iova;
+	struct sl_slot *slot;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	slot = skiplist_delete(&iovad->skiplist, pfn, 1);
+	local_irq_restore(flags);
+
+	if (!slot)
+		return;
 
+	iova = sl_slot_entry(slot, struct iova, slot);
+	free_iova_mem(iova);
 }
 
+void
+__free_iova(struct iova_domain *iovad, struct iova *iova)
+{
+	unsigned long pfn = iova->pfn_lo;
+	free_iova(iovad, pfn);
+}
+
+
 /**
  * put_iova_domain - destroys the iova doamin
  * @iovad: - iova domain in question.
@@ -317,29 +236,37 @@ free_iova(struct iova_domain *iovad, unsigned long pfn)
  */
 void put_iova_domain(struct iova_domain *iovad)
 {
-	struct rb_node *node;
+	struct sl_list *skiplist = &iovad->skiplist;
+	struct sl_node *p;
+	struct sl_leaf *leaf;
 	unsigned long flags;
+	struct iova *iova;
+	struct sl_slot *slot;
+	int i;
 
-	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
-	node = rb_first(&iovad->rbroot);
-	while (node) {
-		struct iova *iova = container_of(node, struct iova, node);
-		rb_erase(node, &iovad->rbroot);
-		free_iova_mem(iova);
-		node = rb_first(&iovad->rbroot);
+	/*
+	 * the skiplist code needs some helpers for iteration.  For now
+	 * roll our own
+	 */
+	local_irq_save(flags);
+	spin_lock(&iovad->skiplist_lock);
+	sl_lock_node(skiplist->head);
+	p = skiplist->head->ptrs[0].next;
+	while (p) {
+		leaf = sl_entry(p);
+		for (i = 0; i < leaf->nr; i++) {
+			slot = leaf->ptrs[i];
+			iova = sl_slot_entry(slot, struct iova, slot);
+			free_iova_mem(iova);
+		}
+		p = leaf->node.ptrs[0].next;
+		sl_free_leaf(leaf);
 	}
-	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-}
-
-static int
-__is_range_overlap(struct rb_node *node,
-	unsigned long pfn_lo, unsigned long pfn_hi)
-{
-	struct iova *iova = container_of(node, struct iova, node);
-
-	if ((pfn_lo <= iova->pfn_hi) && (pfn_hi >= iova->pfn_lo))
-		return 1;
-	return 0;
+	/* FIXME call a helper here */
+	memset(skiplist->head->ptrs, 0, sl_node_size(SKIP_MAXLEVEL));
+	sl_unlock_node(skiplist->head);
+	spin_unlock(&iovad->skiplist_lock);
+	local_irq_restore(flags);
 }
 
 static struct iova *
@@ -347,6 +274,7 @@ __insert_new_range(struct iova_domain *iovad,
 	unsigned long pfn_lo, unsigned long pfn_hi)
 {
 	struct iova *iova;
+	int ret;
 
 	iova = alloc_iova_mem();
 	if (!iova)
@@ -354,18 +282,16 @@ __insert_new_range(struct iova_domain *iovad,
 
 	iova->pfn_hi = pfn_hi;
 	iova->pfn_lo = pfn_lo;
-	iova_insert_rbtree(&iovad->rbroot, iova);
-	return iova;
-}
+	ret = iova_insert_skiplist(&iovad->skiplist, iova);
 
-static void
-__adjust_overlap_range(struct iova *iova,
-	unsigned long *pfn_lo, unsigned long *pfn_hi)
-{
-	if (*pfn_lo < iova->pfn_lo)
-		iova->pfn_lo = *pfn_lo;
-	if (*pfn_hi > iova->pfn_hi)
-		*pfn_lo = iova->pfn_hi + 1;
+	if (ret == -ENOMEM) {
+		free_iova_mem(iova);
+		return NULL;
+	} else {
+		BUG_ON(ret);
+	}
+
+	return iova;
 }
 
 /**
@@ -380,32 +306,42 @@ struct iova *
 reserve_iova(struct iova_domain *iovad,
 	unsigned long pfn_lo, unsigned long pfn_hi)
 {
-	struct rb_node *node;
+	struct sl_slot *slot;
 	unsigned long flags;
-	struct iova *iova;
-	unsigned int overlap = 0;
-
-	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
-	for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) {
-		if (__is_range_overlap(node, pfn_lo, pfn_hi)) {
-			iova = container_of(node, struct iova, node);
-			__adjust_overlap_range(iova, &pfn_lo, &pfn_hi);
-			if ((pfn_lo >= iova->pfn_lo) &&
-				(pfn_hi <= iova->pfn_hi))
-				goto finish;
-			overlap = 1;
-
-		} else if (overlap)
-				break;
-	}
-
-	/* We are here either because this is the first reserver node
-	 * or need to insert remaining non overlap addr range
+	struct iova *iova = NULL;
+	struct iova *found = NULL;
+	unsigned long size = pfn_hi - pfn_lo + 1;
+	unsigned long min_pfn = pfn_lo;
+	unsigned long max_pfn = pfn_hi;
+
+	/*
+	 * this is not locking safe.  It only happens while there are no
+	 * concurrent IO requrests (I hope!)
 	 */
-	iova = __insert_new_range(iovad, pfn_lo, pfn_hi);
-finish:
-
-	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+	local_irq_save(flags);
+	spin_lock(&iovad->skiplist_lock);
+	while(1) {
+		/*
+		 * really ugly, just delete anything overlapping and
+		 * reinsert the new full range
+		 */
+		slot = skiplist_delete(&iovad->skiplist, pfn_lo, size);
+		if (!slot)
+			break;
+
+		found = sl_slot_entry(slot, struct iova, slot);
+		while (found->pfn_lo == IOVA_PENDING_INIT ||
+		       found->pfn_hi == IOVA_PENDING_INIT) {
+			cpu_relax();
+			smp_rmb();
+		}
+		min_pfn = min(found->pfn_lo, min_pfn);
+		max_pfn = max(found->pfn_hi, max_pfn);
+		free_iova_mem(found);
+	}
+	iova = __insert_new_range(iovad, min_pfn, max_pfn);
+	spin_unlock(&iovad->skiplist_lock);
+	local_irq_restore(flags);
 	return iova;
 }
 
@@ -419,17 +355,33 @@ finish:
 void
 copy_reserved_iova(struct iova_domain *from, struct iova_domain *to)
 {
+	struct sl_node *p;
+	struct sl_leaf *leaf;
 	unsigned long flags;
-	struct rb_node *node;
-
-	spin_lock_irqsave(&from->iova_rbtree_lock, flags);
-	for (node = rb_first(&from->rbroot); node; node = rb_next(node)) {
-		struct iova *iova = container_of(node, struct iova, node);
-		struct iova *new_iova;
-		new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi);
-		if (!new_iova)
-			printk(KERN_ERR "Reserve iova range %lx@%lx failed\n",
-				iova->pfn_lo, iova->pfn_lo);
+	struct iova *iova;
+	struct iova *new_iova;
+	struct sl_slot *slot;
+	int i;
+
+	/*
+	 * this is not locking safe.  It only happens while there are no
+	 * concurrent IO requrests (I hope!)
+	 */
+	local_irq_save(flags);
+	sl_lock_node(from->skiplist.head);
+	p = from->skiplist.head->ptrs[0].next;
+	while (p) {
+		leaf = sl_entry(p);
+		for (i = 0; i < leaf->nr; i++) {
+			slot = leaf->ptrs[i];
+			iova = sl_slot_entry(slot, struct iova, slot);
+			new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi);
+			if (!new_iova)
+				printk(KERN_ERR "Reserve iova range %lx@%lx failed\n",
+					iova->pfn_lo, iova->pfn_lo);
+		}
+		p = leaf->node.ptrs[0].next;
 	}
-	spin_unlock_irqrestore(&from->iova_rbtree_lock, flags);
+	sl_unlock_node(from->skiplist.head);
+	local_irq_restore(flags);
 }
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 76a0759..b514c18 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -13,7 +13,7 @@
 
 #include <linux/types.h>
 #include <linux/kernel.h>
-#include <linux/rbtree.h>
+#include <linux/skiplist.h>
 #include <linux/dma-mapping.h>
 
 /* IO virtual address start page frame number */
@@ -21,16 +21,16 @@
 
 /* iova structure */
 struct iova {
-	struct rb_node	node;
+	struct sl_slot slot;
 	unsigned long	pfn_hi; /* IOMMU dish out addr hi */
 	unsigned long	pfn_lo; /* IOMMU dish out addr lo */
 };
 
 /* holds all the iova translations for a domain */
 struct iova_domain {
-	spinlock_t	iova_rbtree_lock; /* Lock to protect update of rbtree */
-	struct rb_root	rbroot;		/* iova domain rbtree root */
-	struct rb_node	*cached32_node; /* Save last alloced node */
+	spinlock_t	skiplist_lock;
+	spinlock_t	del_skiplist_lock;
+	struct sl_list	skiplist;		/* iova domain skiplist */
 	unsigned long	dma_32bit_pfn;
 };
 
-- 
1.8.2

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux