On Wed, Aug 20, 2008 at 12:05:36PM -0500, Christoph Lameter wrote:
> Nick Piggin wrote:
> > On Wed, Aug 20, 2008 at 11:50:09AM -0500, Christoph Lameter wrote:
> >> Nick Piggin wrote:
> >>
> >>> Indeed that would be a good use for it if this general fallback mechanism
> >>> were to be merged.
> >> Want me to rebase my virtualizable compound patchset on top of your vmap changes?
> >
> > Is there much clash between them? Or just the fact that you'll have to
> > use vm_map_ram/vm_unmap_ram?
>
> There is not much of a clash. If you would make vmap/unmap atomic then there
> is barely any overlap at all and the patchset becomes much smaller and even
> the initial version of it can support in interrupt alloc / free.

Well the following (untested) incremental patch is about all that would be
required for the higher level vmap layer. We then still need to make kernel
page table allocations take a gfp mask and make the init_mm ptl interrupt
safe. Hopefully I didn't miss anything else... it should be possible, but as
you can see not something we want to add unless there is a good reason.
Making only vunmap interrupt safe would be less work.

> > I probably wouldn't be able to find time to look at that patchset again
> > for a while... but anyway, I've been running the vmap rewrite for quite
> > a while on several different systems and workloads without problems, so
> > it should be stable enough to test out. And the APIs should not change.
>
> Yes I think this is good stuff. Hopefully I will get enough time to check it
> out in detail.

Thanks, more reviews would always be helpful.

---
Index: linux-2.6/mm/vmalloc.c
===================================================================
--- linux-2.6.orig/mm/vmalloc.c
+++ linux-2.6/mm/vmalloc.c
@@ -296,6 +296,7 @@ static struct vmap_area *alloc_vmap_area
                 unsigned long vstart, unsigned long vend,
                 int node, gfp_t gfp_mask)
 {
+        unsigned long flags;
         struct vmap_area *va;
         struct rb_node *n;
         unsigned long addr;
@@ -311,7 +312,7 @@ static struct vmap_area *alloc_vmap_area
                 return ERR_PTR(-ENOMEM);
 
 retry:
-        spin_lock(&vmap_area_lock);
+        spin_lock_irqsave(&vmap_area_lock, flags);
         /* XXX: could have a last_hole cache */
         n = vmap_area_root.rb_node;
         if (n) {
@@ -353,7 +354,7 @@ retry:
         }
 found:
         if (addr + size > vend) {
-                spin_unlock(&vmap_area_lock);
+                spin_unlock_irqrestore(&vmap_area_lock, flags);
                 if (!purged) {
                         purge_vmap_area_lazy();
                         purged = 1;
@@ -371,7 +372,7 @@ found:
         va->va_end = addr + size;
         va->flags = 0;
         __insert_vmap_area(va);
-        spin_unlock(&vmap_area_lock);
+        spin_unlock_irqrestore(&vmap_area_lock, flags);
 
         return va;
 }
@@ -398,9 +399,11 @@ static void __free_vmap_area(struct vmap
  */
 static void free_vmap_area(struct vmap_area *va)
 {
-        spin_lock(&vmap_area_lock);
+        unsigned long flags;
+
+        spin_lock_irqsave(&vmap_area_lock, flags);
         __free_vmap_area(va);
-        spin_unlock(&vmap_area_lock);
+        spin_unlock_irqrestore(&vmap_area_lock, flags);
 }
 
 /*
@@ -456,6 +459,8 @@ static void __purge_vmap_area_lazy(unsig
         struct vmap_area *va;
         int nr = 0;
 
+        BUG_ON(in_interrupt());
+
         /*
          * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
          * should not expect such behaviour. This just simplifies locking for
@@ -492,10 +497,10 @@ static void __purge_vmap_area_lazy(unsig
                 flush_tlb_kernel_range(*start, *end);
 
         if (nr) {
-                spin_lock(&vmap_area_lock);
+                spin_lock_irq(&vmap_area_lock);
                 list_for_each_entry(va, &valist, purge_list)
                         __free_vmap_area(va);
-                spin_unlock(&vmap_area_lock);
+                spin_unlock_irq(&vmap_area_lock);
         }
         spin_unlock(&purge_lock);
 }
@@ -510,6 +515,13 @@ static void purge_vmap_area_lazy(void)
         __purge_vmap_area_lazy(&start, &end, 0, 0);
 }
 
+static void purge_work_fn(struct work_struct *w)
+{
+        purge_vmap_area_lazy();
+}
+
+static DECLARE_WORK(purge_work, purge_work_fn);
+
 /*
  * Free and unmap a vmap area
  */
@@ -517,17 +529,22 @@ static void free_unmap_vmap_area(struct
 {
         va->flags |= VM_LAZY_FREE;
         atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
-        if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
-                purge_vmap_area_lazy();
+        if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) {
+                if (!in_interrupt())
+                        purge_vmap_area_lazy();
+                else
+                        schedule_work(&purge_work);
+        }
 }
 
 static struct vmap_area *find_vmap_area(unsigned long addr)
 {
+        unsigned long flags;
         struct vmap_area *va;
 
-        spin_lock(&vmap_area_lock);
+        spin_lock_irqsave(&vmap_area_lock, flags);
         va = __find_vmap_area(addr);
-        spin_unlock(&vmap_area_lock);
+        spin_unlock_irqrestore(&vmap_area_lock, flags);
 
         return va;
 }
@@ -621,6 +638,7 @@ static unsigned long addr_to_vb_idx(unsi
 
 static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
 {
+        unsigned long flags;
         struct vmap_block_queue *vbq;
         struct vmap_block *vb;
         struct vmap_area *va;
@@ -659,6 +677,7 @@ static struct vmap_block *new_vmap_block
         INIT_LIST_HEAD(&vb->dirty_list);
 
         vb_idx = addr_to_vb_idx(va->va_start);
+        local_irq_save(flags);
         spin_lock(&vmap_block_tree_lock);
         err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
         spin_unlock(&vmap_block_tree_lock);
@@ -671,6 +690,7 @@ static struct vmap_block *new_vmap_block
         list_add(&vb->free_list, &vbq->free);
         spin_unlock(&vbq->lock);
         put_cpu_var(vmap_cpu_blocks);
+        local_irq_restore(flags);
 
         return vb;
 }
@@ -684,9 +704,11 @@ static void rcu_free_vb(struct rcu_head
 
 static void free_vmap_block(struct vmap_block *vb)
 {
+        unsigned long flags;
         struct vmap_block *tmp;
         unsigned long vb_idx;
 
+        local_irq_save(flags);
         spin_lock(&vb->vbq->lock);
         if (!list_empty(&vb->free_list))
                 list_del(&vb->free_list);
@@ -698,6 +720,7 @@ static void free_vmap_block(struct vmap_
         spin_lock(&vmap_block_tree_lock);
         tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
         spin_unlock(&vmap_block_tree_lock);
+        local_irq_restore(flags);
         BUG_ON(tmp != vb);
 
         free_unmap_vmap_area(vb->va);
@@ -719,9 +742,10 @@ again:
         rcu_read_lock();
         vbq = &get_cpu_var(vmap_block_queue);
         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+                unsigned long flags;
                 int i;
 
-                spin_lock(&vb->lock);
+                spin_lock_irqsave(&vb->lock, flags);
                 i = bitmap_find_free_region(vb->alloc_map,
                                                 VMAP_BBMAP_BITS, order);
 
@@ -738,7 +762,7 @@ again:
                         spin_unlock(&vb->lock);
                         break;
                 }
-                spin_unlock(&vb->lock);
+                spin_unlock_irqrestore(&vb->lock, flags);
         }
         put_cpu_var(vmap_cpu_blocks);
         rcu_read_unlock();
@@ -755,6 +779,7 @@ again:
 
 static void vb_free(const void *addr, unsigned long size)
 {
+        unsigned long flags;
         unsigned long offset;
         unsigned long vb_idx;
         unsigned int order;
@@ -772,7 +797,7 @@ static void vb_free(const void *addr, un
         rcu_read_unlock();
         BUG_ON(!vb);
 
-        spin_lock(&vb->lock);
+        spin_lock_irqsave(&vb->lock, flags);
         bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order);
         if (!vb->dirty) {
                 spin_lock(&vb->vbq->lock);
@@ -782,10 +807,10 @@ static void vb_free(const void *addr, un
         vb->dirty += 1UL << order;
         if (vb->dirty == VMAP_BBMAP_BITS) {
                 BUG_ON(vb->free || !list_empty(&vb->free_list));
-                spin_unlock(&vb->lock);
+                spin_unlock_irqrestore(&vb->lock, flags);
                 free_vmap_block(vb);
         } else
-                spin_unlock(&vb->lock);
+                spin_unlock_irqrestore(&vb->lock, flags);
 }
 
 /**
@@ -807,6 +832,8 @@ void vm_unmap_aliases(void)
         int cpu;
         int flush = 0;
 
+        BUG_ON(in_interrupt());
+
         for_each_possible_cpu(cpu) {
                 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
                 struct vmap_block *vb;
@@ -815,7 +842,7 @@ void vm_unmap_aliases(void)
                 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
                         int i;
 
-                        spin_lock(&vb->lock);
+                        spin_lock_irq(&vb->lock);
                         i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
                         while (i < VMAP_BBMAP_BITS) {
                                 unsigned long s, e;
@@ -837,7 +864,7 @@ void vm_unmap_aliases(void)
                                 i = find_next_bit(vb->dirty_map,
                                                         VMAP_BBMAP_BITS, i);
                         }
-                        spin_unlock(&vb->lock);
+                        spin_unlock_irq(&vb->lock);
                 }
                 rcu_read_unlock();
         }
@@ -878,21 +905,21 @@ EXPORT_SYMBOL(vm_unmap_ram);
  * @prot: memory protection to use. PAGE_KERNEL for regular RAM
  * @returns: a pointer to the address that has been mapped, or NULL on failure
  */
-void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
+void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot, gfp_t gfp_mask)
 {
         unsigned long size = count << PAGE_SHIFT;
         unsigned long addr;
         void *mem;
 
         if (likely(count <= VMAP_MAX_ALLOC)) {
-                mem = vb_alloc(size, GFP_KERNEL);
+                mem = vb_alloc(size, gfp_mask);
                 if (IS_ERR(mem))
                         return NULL;
                 addr = (unsigned long)mem;
         } else {
                 struct vmap_area *va;
                 va = alloc_vmap_area(size, PAGE_SIZE,
-                                VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
+                                VMALLOC_START, VMALLOC_END, node, gfp_mask);
                 if (IS_ERR(va))
                         return NULL;
 
Index: linux-2.6/include/linux/vmalloc.h
===================================================================
--- linux-2.6.orig/include/linux/vmalloc.h
+++ linux-2.6/include/linux/vmalloc.h
@@ -38,7 +38,7 @@ struct vm_struct {
  */
 extern void vm_unmap_ram(const void *mem, unsigned int count);
 extern void *vm_map_ram(struct page **pages, unsigned int count,
-                                int node, pgprot_t prot);
+                                int node, pgprot_t prot, gfp_t gfp_mask);
 extern void vm_unmap_aliases(void);
 
 extern void *vmalloc(unsigned long size);
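
For illustration only (this is not part of the patch, and the wrapper names
below are made up): a caller such as the compound page code discussed above
could then map and unmap pages from interrupt context roughly as follows,
assuming the rest of the work (gfp-aware kernel page table allocation and an
irq-safe init_mm ptl) is also in place:

/*
 * Hypothetical caller-side sketch, not part of the patch above.
 * Shows how the changed vm_map_ram()/vm_unmap_ram() interface would
 * be used from interrupt context once the whole series is applied.
 */
#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *map_pages_atomic(struct page **pages, unsigned int count)
{
        /*
         * GFP_ATOMIC because this may run with interrupts disabled;
         * the mask is passed down to vb_alloc()/alloc_vmap_area()
         * by the vm_map_ram() change above. node = -1 means any node.
         */
        return vm_map_ram(pages, count, -1, PAGE_KERNEL, GFP_ATOMIC);
}

static void unmap_pages_atomic(void *addr, unsigned int count)
{
        /*
         * If this pushes vmap_lazy_nr over the limit while
         * in_interrupt(), the patch defers the lazy purge to
         * purge_work instead of running it inline.
         */
        vm_unmap_ram(addr, count);
}

Existing vm_map_ram() callers would only need to grow a GFP_KERNEL argument;
the deferral to purge_work is what keeps the lazy purge itself out of
interrupt context.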