prospective 13/12 s390 pgtable_list patch

Hi Gerald,

It's that moment you've been dreading: I'm hoping that you can, please,
take a look at the patch below, and try building and running with it,
on top of the v2 series of 12 I sent out on Tuesday.

If this seems okay to you, I'll send it properly as 13/12 of that series,
to the full Cc list; but of course you may find I've missed typos and worse.
Please don't waste your time on it if it's rubbish; but so far as I can
tell, it is complete and ready now.

Thanks!
Hugh

s390: SLAB_TYPESAFE_BY_RCU mm->context.pgtable_list

Move the s390 pgtable_list (with dedicated spinlock) out of mm->context
into a separate kmem_cache object specified with SLAB_TYPESAFE_BY_RCU,
which then allows __tlb_remove_table() to add a page back to the list
safely while the other half is still in use.

Update the commentary on pgtable_list above page_table_alloc().

Stop using pt_mm in struct page on s390: add a pt_list pointer to that
union instead.

Signed-off-by: Hugh Dickins <hughd@xxxxxxxxxx>
---
 arch/s390/include/asm/mmu.h         |  10 +-
 arch/s390/include/asm/mmu_context.h |   8 +-
 arch/s390/mm/pgalloc.c              | 151 ++++++++++++++++++----------
 include/linux/mm_types.h            |   4 +-
 4 files changed, 117 insertions(+), 56 deletions(-)

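In case it helps while reviewing, here is a minimal standalone sketch -
not part of the patch, all names invented - of the SLAB_TYPESAFE_BY_RCU
pattern the patch depends on.  The constructor runs once per slab object
rather than once per allocation, and a freed object's memory is not
returned to the page allocator until an RCU grace period has elapsed, so
the embedded spinlock and list head stay usable even if the object has
been freed and handed out again:

#include <linux/errno.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct example_list {			/* stand-in for struct pgtable_list */
	struct list_head list;
	spinlock_t lock;
};

static struct kmem_cache *example_cachep;

static void example_ctor(void *object)
{
	struct example_list *el = object;

	/*
	 * Runs when the slab object is first set up, not on every
	 * kmem_cache_alloc(): the lock and list head survive free/realloc.
	 */
	INIT_LIST_HEAD(&el->list);
	spin_lock_init(&el->lock);
}

static int example_cache_init(void)
{
	example_cachep = kmem_cache_create("example_list",
			sizeof(struct example_list), 0,
			SLAB_TYPESAFE_BY_RCU, example_ctor);
	return example_cachep ? 0 : -ENOMEM;
}

The point, for this patch, is that pgtable_list_ctor() never needs to run
again on reallocation, so even a recycled object still has a lock which
__tlb_remove_table() can safely take.
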
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index 829d68e2c685..069cdd612d86 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -6,12 +6,18 @@
 #include <linux/errno.h>
 #include <asm/asm-extable.h>
 
+struct pgtable_list {
+	struct list_head list;
+	spinlock_t lock;
+};
+extern struct pgtable_list init_pgtable_list;
+
 typedef struct {
 	spinlock_t lock;
 	cpumask_t cpu_attach_mask;
 	atomic_t flush_count;
 	unsigned int flush_mm;
-	struct list_head pgtable_list;
+	struct pgtable_list *pgtable_list;
 	struct list_head gmap_list;
 	unsigned long gmap_asce;
 	unsigned long asce;
@@ -39,7 +45,7 @@ typedef struct {
 
 #define INIT_MM_CONTEXT(name)						   \
 	.context.lock =	__SPIN_LOCK_UNLOCKED(name.context.lock),	   \
-	.context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \
+	.context.pgtable_list = &init_pgtable_list,			   \
 	.context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list),
 
 #endif
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 2a38af5a00c2..725f82c55036 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -15,14 +15,20 @@
 #include <asm/ctl_reg.h>
 #include <asm-generic/mm_hooks.h>
 
+bool alloc_pgtable_list(struct mm_struct *mm);
+void destroy_context(struct mm_struct *mm);
+#define destroy_context destroy_context
+
 #define init_new_context init_new_context
 static inline int init_new_context(struct task_struct *tsk,
 				   struct mm_struct *mm)
 {
 	unsigned long asce_type, init_entry;
 
+	if (!alloc_pgtable_list(mm))
+		return -ENOMEM;
+
 	spin_lock_init(&mm->context.lock);
-	INIT_LIST_HEAD(&mm->context.pgtable_list);
 	INIT_LIST_HEAD(&mm->context.gmap_list);
 	cpumask_clear(&mm->context.cpu_attach_mask);
 	atomic_set(&mm->context.flush_count, 0);
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 11983a3ff95a..7f851b2dad49 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -159,15 +159,41 @@ void page_table_free_pgste(struct page *page)
 
 #endif /* CONFIG_PGSTE */
 
-/*
- * Temporarily use a global spinlock instead of mm->context.lock.
- * This will be replaced by a per-mm spinlock in a followup commit.
- */
-static DEFINE_SPINLOCK(mm_pgtable_list_lock);
+struct pgtable_list init_pgtable_list = {
+	.list	= LIST_HEAD_INIT(init_pgtable_list.list),
+	.lock	= __SPIN_LOCK_UNLOCKED(init_pgtable_list.lock),
+};
+
+static struct kmem_cache *pgtable_list_cachep;
+
+static void pgtable_list_ctor(void *object)
+{
+	struct pgtable_list *pgtable_list = object;
+
+	INIT_LIST_HEAD(&pgtable_list->list);
+	spin_lock_init(&pgtable_list->lock);
+}
+
+bool alloc_pgtable_list(struct mm_struct *mm)
+{
+	if (unlikely(!pgtable_list_cachep)) {	/* first time */
+		pgtable_list_cachep = kmem_cache_create("pgtable_list",
+			sizeof(struct pgtable_list), 0,
+			SLAB_TYPESAFE_BY_RCU | SLAB_PANIC, pgtable_list_ctor);
+	}
+	mm->context.pgtable_list =
+		kmem_cache_alloc(pgtable_list_cachep, GFP_KERNEL);
+	return !!mm->context.pgtable_list;
+}
+
+void destroy_context(struct mm_struct *mm)
+{
+	kmem_cache_free(pgtable_list_cachep, mm->context.pgtable_list);
+}
+
 /*
  * A 2KB-pgtable is either upper or lower half of a normal page.
- * The second half of the page may be unused or used as another
- * 2KB-pgtable.
+ * The other half of the page may be unused or used as another 2KB-pgtable.
  *
  * Whenever possible the parent page for a new 2KB-pgtable is picked
  * from the list of partially allocated pages mm_context_t::pgtable_list.
@@ -187,11 +213,15 @@ static DEFINE_SPINLOCK(mm_pgtable_list_lock);
  * As follows from the above, no unallocated or fully allocated parent
  * pages are contained in mm_context_t::pgtable_list.
  *
- * NOTE NOTE NOTE: The commentary above and below has not yet been updated:
- * the new rule is that a page is not linked to mm_context_t::pgtable_list
- * while either half is pending free by any method; but afterwards is
- * either relinked to it, or freed, by __tlb_remove_table().  This allows
- * pte_free_defer() to use the page->rcu_head (which overlays page->lru).
+ * However, there is a further twist. If a 2KB-pgtable is allocated in
+ * advance, in case it might be needed, but is then freed without being
+ * used, then it is freed immediately as described above.
+ *
+ * But when a 2KB-pgtable is freed after use, it usually passes through
+ * the quarantine of an RCU grace period, pending the actual free. While
+ * in this pending state, it cannot be reallocated, and its parent page
+ * is not permitted on mm_context_t::pgtable_list - no matter whether
+ * the other half is allocated, or pending free, or free.
  *
  * The upper byte (bits 24-31) of the parent page _refcount is used
  * for tracking contained 2KB-pgtables and has the following format:
@@ -211,32 +241,37 @@ static DEFINE_SPINLOCK(mm_pgtable_list_lock);
  *
  * When 2KB-pgtable is allocated the corresponding AA bit is set to 1.
  * The parent page is either:
- *   - added to mm_context_t::pgtable_list in case the second half of the
+ *   - added to mm_context_t::pgtable_list in case the other half of the
  *     parent page is still unallocated;
  *   - removed from mm_context_t::pgtable_list in case both halves of the
  *     parent page are allocated;
- * These operations are protected with mm_context_t::lock.
+ * These operations are protected with mm_context_t::pgtable_list::lock.
  *
  * When 2KB-pgtable is deallocated the corresponding AA bit is set to 0
  * and the corresponding PP bit is set to 1 in a single atomic operation.
  * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually
  * exclusive and may never be both set to 1!
  * The parent page is either:
- *   - added to mm_context_t::pgtable_list in case the second half of the
- *     parent page is still allocated;
- *   - removed from mm_context_t::pgtable_list in case the second half of
- *     the parent page is unallocated;
- * These operations are protected with mm_context_t::lock.
+ *   - added to mm_context_t::pgtable_list in case the other half of the
+ *     parent page is still allocated - but never when setting the PP bit;
+ *   - removed from mm_context_t::pgtable_list in case the other half of
+ *     the parent page is unallocated (and not pending free, when the
+ *     parent page would already have been removed from the list);
+ * These operations are protected with mm_context_t::pgtable_list::lock.
  *
- * It is important to understand that mm_context_t::lock only protects
- * mm_context_t::pgtable_list and AA bits, but not the parent page itself
- * and PP bits.
+ * It is important to understand that mm_context_t::pgtable_list::lock
+ * only protects mm_context_t::pgtable_list::list and AA bits, but not
+ * the parent page itself and PP bits.
  *
- * Releasing the parent page happens whenever the PP bit turns from 1 to 0,
- * while both AA bits and the second PP bit are already unset. Then the
- * parent page does not contain any 2KB-pgtable fragment anymore, and it has
- * also been removed from mm_context_t::pgtable_list. It is safe to release
- * the page therefore.
+ * Releasing the parent page happens whenever both PP bits and both AA bits
+ * are unset. Then the parent page does not contain a 2KB-pgtable fragment
+ * anymore, and it has also been removed from mm_context_t::pgtable_list.
+ * It is safe to release the page therefore.
+ *
+ * The HH bits play no part in this logic, but indicate whether either
+ * 2KB-pgtable is using, or is waiting to use, the parent page's rcu_head.
+ * When required, the HH bit is set at the same time as setting the PP bit,
+ * then unset just before unsetting the PP bit.
  *
  * PGSTE memory spaces use full 4KB-pgtables and do not need most of the
  * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
@@ -245,6 +280,7 @@ static DEFINE_SPINLOCK(mm_pgtable_list_lock);
  */
 unsigned long *page_table_alloc(struct mm_struct *mm)
 {
+	struct pgtable_list *pgtable_list = mm->context.pgtable_list;
 	unsigned long *table;
 	struct page *page;
 	unsigned int mask, bit;
@@ -252,9 +288,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 	/* Try to get a fragment of a 4K page as a 2K page table */
 	if (!mm_alloc_pgste(mm)) {
 		table = NULL;
-		spin_lock_bh(&mm_pgtable_list_lock);
-		if (!list_empty(&mm->context.pgtable_list)) {
-			page = list_first_entry(&mm->context.pgtable_list,
+		spin_lock_bh(&pgtable_list->lock);
+		if (!list_empty(&pgtable_list->list)) {
+			page = list_first_entry(&pgtable_list->list,
 						struct page, lru);
 			mask = atomic_read(&page->_refcount) >> 24;
 			/* Cannot be on this list if either half pending free */
@@ -267,7 +303,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 			atomic_xor_bits(&page->_refcount, 0x01U << (bit + 24));
 			list_del(&page->lru);
 		}
-		spin_unlock_bh(&mm_pgtable_list_lock);
+		spin_unlock_bh(&pgtable_list->lock);
 		if (table)
 			return table;
 	}
@@ -281,7 +317,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 	}
 	arch_set_page_dat(page, 0);
 	/* Initialize page table */
-	page->pt_mm = mm;
+	page->pt_list = pgtable_list;
 	table = (unsigned long *) page_to_virt(page);
 	if (mm_alloc_pgste(mm)) {
 		/* Return 4K page table with PGSTEs */
@@ -292,9 +328,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 		/* Return the first 2K fragment of the page */
 		atomic_xor_bits(&page->_refcount, 0x01U << 24);
 		memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
-		spin_lock_bh(&mm_pgtable_list_lock);
-		list_add(&page->lru, &mm->context.pgtable_list);
-		spin_unlock_bh(&mm_pgtable_list_lock);
+		spin_lock_bh(&pgtable_list->lock);
+		list_add(&page->lru, &pgtable_list->list);
+		spin_unlock_bh(&pgtable_list->lock);
 	}
 	return table;
 }
@@ -314,6 +350,7 @@ static void page_table_release_check(struct page *page, unsigned long *table,
 
 void page_table_free(struct mm_struct *mm, unsigned long *table)
 {
+	struct pgtable_list *pgtable_list;
 	unsigned int mask, bit, half;
 	struct page *page;
 
@@ -321,7 +358,8 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
 	if (!mm_alloc_pgste(mm)) {
 		/* Free 2K page table fragment of a 4K page */
 		bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
-		spin_lock_bh(&mm_pgtable_list_lock);
+		pgtable_list = mm->context.pgtable_list;
+		spin_lock_bh(&pgtable_list->lock);
 		/*
 		 * Mark the page for release. The actual release will happen
 		 * below from this function, or later from __tlb_remove_table().
@@ -329,10 +367,10 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
 		mask = atomic_xor_bits(&page->_refcount, 0x01U << (bit + 24));
 		mask >>= 24;
 		if (mask & 0x03U)		/* other half is allocated */
-			list_add(&page->lru, &mm->context.pgtable_list);
+			list_add(&page->lru, &pgtable_list->list);
 		else if (!(mask & 0x30U))	/* other half not pending */
 			list_del(&page->lru);
-		spin_unlock_bh(&mm_pgtable_list_lock);
+		spin_unlock_bh(&pgtable_list->lock);
 		if (mask != 0x00U)
 			return;
 		half = 0x01U << bit;
@@ -350,6 +388,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
 void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
 			 unsigned long vmaddr)
 {
+	struct pgtable_list *pgtable_list;
 	struct mm_struct *mm;
 	struct page *page;
 	unsigned int bit, mask;
@@ -363,7 +402,8 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
 		return;
 	}
 	bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
-	spin_lock_bh(&mm_pgtable_list_lock);
+	pgtable_list = mm->context.pgtable_list;
+	spin_lock_bh(&pgtable_list->lock);
 	/*
 	 * Mark the page for delayed release.
 	 * The actual release will happen later, from __tlb_remove_table().
@@ -373,7 +413,7 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
 	/* Other half not allocated? Other half not already pending free? */
 	if ((mask & 0x03U) == 0x00U && (mask & 0x30U) != 0x30U)
 		list_del(&page->lru);
-	spin_unlock_bh(&mm_pgtable_list_lock);
+	spin_unlock_bh(&pgtable_list->lock);
 	table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
 	tlb_remove_table(tlb, table);
 }
@@ -391,30 +431,35 @@ void __tlb_remove_table(void *_table)
 	case 0x01U:	/* lower 2K of a 4K page table */
 	case 0x02U:	/* upper 2K of a 4K page table */
 		/*
-		 * If the other half is marked as allocated, page->pt_mm must
+		 * If other half is marked allocated, mm and page->pt_list must
 		 * still be valid, page->rcu_head no longer in use so page->lru
 		 * good for use, so now make the freed half available for reuse.
 		 * But be wary of races with that other half being freed.
 		 */
+		rcu_read_lock();
 		if (atomic_read(&page->_refcount) & (0x03U << 24)) {
-			struct mm_struct *mm = page->pt_mm;
+			struct pgtable_list *pgtable_list = page->pt_list;
 			/*
-			 * It is safe to use page->pt_mm when the other half
-			 * is seen allocated while holding pgtable_list lock;
-			 * but how will it be safe to acquire that spinlock?
-			 * Global mm_pgtable_list_lock is safe and easy for
-			 * now, then a followup commit will split it per-mm.
+			 * Here we are either directly freeing from a valid mm,
+			 * or in RCU callback context.  In the latter case, the
+			 * mm might already have been freed since the _refcount
+			 * check above; but SLAB_TYPESAFE_BY_RCU ensures that
+			 * pgtable_list still points to a valid pgtable_list,
+			 * with a spinlock_t in the right place, even if it is
+			 * no longer "ours".  Take that lock and recheck the
+			 * _refcount before adding to pgtable_list->list.
 			 */
-			spin_lock_bh(&mm_pgtable_list_lock);
+			spin_lock_bh(&pgtable_list->lock);
 			mask = atomic_xor_bits(&page->_refcount, mask << 28);
 			mask >>= 24;
 			if (mask & 0x03U)
-				list_add(&page->lru, &mm->context.pgtable_list);
-			spin_unlock_bh(&mm_pgtable_list_lock);
+				list_add(&page->lru, &pgtable_list->list);
+			spin_unlock_bh(&pgtable_list->lock);
 		} else {
 			mask = atomic_xor_bits(&page->_refcount, mask << 28);
 			mask >>= 24;
 		}
+		rcu_read_unlock();
 		if (mask != 0x00U)
 			return;
 		break;
@@ -475,6 +520,7 @@ static void pte_free_now1(struct rcu_head *head)
 
 void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
 {
+	struct pgtable_list *pgtable_list;
 	unsigned int bit, mask;
 	struct page *page;
 
@@ -486,13 +532,14 @@ void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
 	bit = ((unsigned long)pgtable & ~PAGE_MASK) /
 			(PTRS_PER_PTE * sizeof(pte_t));
 
-	spin_lock_bh(&mm_pgtable_list_lock);
+	pgtable_list = mm->context.pgtable_list;
+	spin_lock_bh(&pgtable_list->lock);
 	mask = atomic_xor_bits(&page->_refcount, 0x15U << (bit + 24));
 	mask >>= 24;
 	/* Other half not allocated? Other half not already pending free? */
 	if ((mask & 0x03U) == 0x00U && (mask & 0x30U) != 0x30U)
 		list_del(&page->lru);
-	spin_unlock_bh(&mm_pgtable_list_lock);
+	spin_unlock_bh(&pgtable_list->lock);
 
 	/* Do not relink on rcu_head if other half already linked on rcu_head */
 	if ((mask & 0x0CU) != 0x0CU)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 09335fa28c41..a5b131abd82f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -31,6 +31,7 @@
 
 struct address_space;
 struct mem_cgroup;
+struct pgtable_list;
 
 /*
  * Each physical page in the system has a struct page associated with
@@ -150,7 +151,8 @@ struct page {
 			 */
 			unsigned long _pt_pad_2;	/* mapping */
 			union {
-				struct mm_struct *pt_mm; /* x86 pgd, s390 */
+				struct mm_struct *pt_mm; /* x86 pgds only */
+				struct pgtable_list *pt_list; /* s390 */
 				atomic_t pt_frag_refcount; /* powerpc */
 			};
 #if ALLOC_SPLIT_PTLOCKS
-- 
2.35.3

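And the reader side of that pattern, in isolation: again a sketch only
(try_link() and live_mask are invented here, page->pt_list assumes the
mm_types.h hunk above, and the real __tlb_remove_table() works with
atomic_xor_bits() and the mask values rather than this simplified
recheck).  It shows why a pgtable_list that might already have been
freed can still be locked, provided the state is rechecked under the
lock before the page goes back on the list:

#include <linux/list.h>
#include <linux/mm_types.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

static bool try_link(struct page *page, unsigned int live_mask)
{
	bool linked = false;

	rcu_read_lock();
	if (atomic_read(&page->_refcount) & live_mask) {
		struct pgtable_list *pl = page->pt_list;

		/*
		 * pl may have been freed by now, but SLAB_TYPESAFE_BY_RCU
		 * guarantees it is still a pgtable_list with a working
		 * spinlock.  Recheck under the lock: if the other half is
		 * still allocated, the mm - and so this pgtable_list - is
		 * still live, and the page may go back on its list.
		 */
		spin_lock_bh(&pl->lock);
		if (atomic_read(&page->_refcount) & live_mask) {
			list_add(&page->lru, &pl->list);
			linked = true;
		}
		spin_unlock_bh(&pl->lock);
	}
	rcu_read_unlock();
	return linked;
}

That is all the rcu_read_lock()/rcu_read_unlock() pair added to
__tlb_remove_table() is for: keeping the pgtable_list memory type-stable
across the window between the _refcount check and taking its lock.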