Note: Don't bother reviewing this. It is a lot of rolled up patches. This includes most of the core code changes required for fsblock. Basically a rollup of the recent patches I sent, plus a rollup of vmap changes (importantly change vmap API so vunmap is callable from interrupt context. The vmap change is irrelevant unless running minix with superpage size blocks, and fsblock's VMAP_CACHE feature). --- Index: linux-2.6/mm/vmalloc.c =================================================================== --- linux-2.6.orig/mm/vmalloc.c +++ linux-2.6/mm/vmalloc.c @@ -14,6 +14,7 @@ #include <linux/highmem.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <linux/mutex.h> #include <linux/interrupt.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -29,6 +30,16 @@ #include <asm/uaccess.h> #include <asm/tlbflush.h> +/* + * Add a guard page between each kernel virtual address allocation if + * DEBUG_PAGEALLOC is turned on (could be a separate config option, but + * no big deal). + */ +#ifdef CONFIG_DEBUG_PAGEALLOC +#define GUARD_SIZE PAGE_SIZE +#else +#define GUARD_SIZE 0 +#endif /*** Page table manipulation functions ***/ @@ -323,6 +334,7 @@ static struct vmap_area *alloc_vmap_area unsigned long addr; int purged = 0; + BUG_ON(in_interrupt()); BUG_ON(!size); BUG_ON(size & ~PAGE_MASK); @@ -334,7 +346,7 @@ static struct vmap_area *alloc_vmap_area retry: addr = ALIGN(vstart, align); - spin_lock(&vmap_area_lock); + spin_lock_irq(&vmap_area_lock); if (addr + size - 1 < addr) goto overflow; @@ -368,7 +380,7 @@ retry: } while (addr + size > first->va_start && addr + size <= vend) { - addr = ALIGN(first->va_end + PAGE_SIZE, align); + addr = ALIGN(first->va_end + GUARD_SIZE, align); if (addr + size - 1 < addr) goto overflow; @@ -382,7 +394,7 @@ retry: found: if (addr + size > vend) { overflow: - spin_unlock(&vmap_area_lock); + spin_unlock_irq(&vmap_area_lock); if (!purged) { purge_vmap_area_lazy(); purged = 1; @@ -401,7 +413,7 @@ overflow: va->va_end = addr + size; va->flags = 0; __insert_vmap_area(va); - spin_unlock(&vmap_area_lock); + spin_unlock_irq(&vmap_area_lock); return va; } @@ -428,9 +440,9 @@ static void __free_vmap_area(struct vmap */ static void free_vmap_area(struct vmap_area *va) { - spin_lock(&vmap_area_lock); + spin_lock_irq(&vmap_area_lock); __free_vmap_area(va); - spin_unlock(&vmap_area_lock); + spin_unlock_irq(&vmap_area_lock); } /* @@ -457,8 +469,10 @@ static void vmap_debug_free_range(unsign * faster). */ #ifdef CONFIG_DEBUG_PAGEALLOC - vunmap_page_range(start, end); - flush_tlb_kernel_range(start, end); + if (!irqs_disabled()) { + vunmap_page_range(start, end); + flush_tlb_kernel_range(start, end); + } #endif } @@ -502,10 +516,9 @@ static atomic_t vmap_lazy_nr = ATOMIC_IN static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, int sync, int force_flush) { - static DEFINE_SPINLOCK(purge_lock); + static DEFINE_MUTEX(purge_lock); LIST_HEAD(valist); struct vmap_area *va; - struct vmap_area *n_va; int nr = 0; /* @@ -514,10 +527,10 @@ static void __purge_vmap_area_lazy(unsig * the case that isn't actually used at the moment anyway. 
*/ if (!sync && !force_flush) { - if (!spin_trylock(&purge_lock)) + if (!mutex_trylock(&purge_lock)) return; } else - spin_lock(&purge_lock); + mutex_lock(&purge_lock); rcu_read_lock(); list_for_each_entry_rcu(va, &vmap_area_list, list) { @@ -544,12 +557,12 @@ static void __purge_vmap_area_lazy(unsig flush_tlb_kernel_range(*start, *end); if (nr) { - spin_lock(&vmap_area_lock); - list_for_each_entry_safe(va, n_va, &valist, purge_list) + spin_lock_irq(&vmap_area_lock); + list_for_each_entry(va, &valist, purge_list) __free_vmap_area(va); - spin_unlock(&vmap_area_lock); + spin_unlock_irq(&vmap_area_lock); } - spin_unlock(&purge_lock); + mutex_unlock(&purge_lock); } /* @@ -573,6 +586,17 @@ static void purge_vmap_area_lazy(void) __purge_vmap_area_lazy(&start, &end, 1, 0); } +static void deferred_purge(struct work_struct *work) +{ + try_purge_vmap_area_lazy(); +} + +static struct work_struct purge_work; +static void kick_purge_vmap_area_lazy(void) +{ + schedule_work(&purge_work); +} + /* * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been * called for the correct range previously. @@ -582,7 +606,7 @@ static void free_unmap_vmap_area_noflush va->flags |= VM_LAZY_FREE; atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) - try_purge_vmap_area_lazy(); + kick_purge_vmap_area_lazy(); } /* @@ -597,10 +621,11 @@ static void free_unmap_vmap_area(struct static struct vmap_area *find_vmap_area(unsigned long addr) { struct vmap_area *va; + unsigned long flags; - spin_lock(&vmap_area_lock); + spin_lock_irqsave(&vmap_area_lock, flags); va = __find_vmap_area(addr); - spin_unlock(&vmap_area_lock); + spin_unlock_irqrestore(&vmap_area_lock, flags); return va; } @@ -734,17 +759,17 @@ static struct vmap_block *new_vmap_block INIT_LIST_HEAD(&vb->dirty_list); vb_idx = addr_to_vb_idx(va->va_start); - spin_lock(&vmap_block_tree_lock); + spin_lock_irq(&vmap_block_tree_lock); err = radix_tree_insert(&vmap_block_tree, vb_idx, vb); - spin_unlock(&vmap_block_tree_lock); + spin_unlock_irq(&vmap_block_tree_lock); BUG_ON(err); radix_tree_preload_end(); vbq = &get_cpu_var(vmap_block_queue); vb->vbq = vbq; - spin_lock(&vbq->lock); + spin_lock_irq(&vbq->lock); list_add(&vb->free_list, &vbq->free); - spin_unlock(&vbq->lock); + spin_unlock_irq(&vbq->lock); put_cpu_var(vmap_cpu_blocks); return vb; @@ -762,17 +787,17 @@ static void free_vmap_block(struct vmap_ struct vmap_block *tmp; unsigned long vb_idx; - spin_lock(&vb->vbq->lock); + spin_lock_irq(&vb->vbq->lock); if (!list_empty(&vb->free_list)) list_del(&vb->free_list); if (!list_empty(&vb->dirty_list)) list_del(&vb->dirty_list); - spin_unlock(&vb->vbq->lock); + spin_unlock_irq(&vb->vbq->lock); vb_idx = addr_to_vb_idx(vb->va->va_start); - spin_lock(&vmap_block_tree_lock); + spin_lock_irq(&vmap_block_tree_lock); tmp = radix_tree_delete(&vmap_block_tree, vb_idx); - spin_unlock(&vmap_block_tree_lock); + spin_unlock_irq(&vmap_block_tree_lock); BUG_ON(tmp != vb); free_unmap_vmap_area_noflush(vb->va); @@ -786,6 +811,7 @@ static void *vb_alloc(unsigned long size unsigned long addr = 0; unsigned int order; + BUG_ON(in_interrupt()); BUG_ON(size & ~PAGE_MASK); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); order = get_order(size); @@ -796,7 +822,7 @@ again: list_for_each_entry_rcu(vb, &vbq->free, free_list) { int i; - spin_lock(&vb->lock); + spin_lock_irq(&vb->lock); i = bitmap_find_free_region(vb->alloc_map, VMAP_BBMAP_BITS, order); @@ -806,14 +832,14 @@ again: addr_to_vb_idx(vb->va->va_start)); 
vb->free -= 1UL << order; if (vb->free == 0) { - spin_lock(&vbq->lock); + spin_lock_irq(&vbq->lock); list_del_init(&vb->free_list); - spin_unlock(&vbq->lock); + spin_unlock_irq(&vbq->lock); } - spin_unlock(&vb->lock); + spin_unlock_irq(&vb->lock); break; } - spin_unlock(&vb->lock); + spin_unlock_irq(&vb->lock); } put_cpu_var(vmap_cpu_blocks); rcu_read_unlock(); @@ -830,11 +856,13 @@ again: static void vb_free(const void *addr, unsigned long size) { + unsigned long flags; unsigned long offset; unsigned long vb_idx; unsigned int order; struct vmap_block *vb; + BUG_ON(in_interrupt()); BUG_ON(size & ~PAGE_MASK); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); @@ -850,7 +878,7 @@ static void vb_free(const void *addr, un rcu_read_unlock(); BUG_ON(!vb); - spin_lock(&vb->lock); + spin_lock_irqsave(&vb->lock, flags); bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); if (!vb->dirty) { spin_lock(&vb->vbq->lock); @@ -860,10 +888,10 @@ static void vb_free(const void *addr, un vb->dirty += 1UL << order; if (vb->dirty == VMAP_BBMAP_BITS) { BUG_ON(vb->free || !list_empty(&vb->free_list)); - spin_unlock(&vb->lock); + spin_unlock_irqrestore(&vb->lock, flags); free_vmap_block(vb); } else - spin_unlock(&vb->lock); + spin_unlock_irqrestore(&vb->lock, flags); } /** @@ -888,6 +916,8 @@ void vm_unmap_aliases(void) if (unlikely(!vmap_initialized)) return; + BUG_ON(in_interrupt()); + for_each_possible_cpu(cpu) { struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); struct vmap_block *vb; @@ -896,7 +926,7 @@ void vm_unmap_aliases(void) list_for_each_entry_rcu(vb, &vbq->free, free_list) { int i; - spin_lock(&vb->lock); + spin_lock_irq(&vb->lock); i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); while (i < VMAP_BBMAP_BITS) { unsigned long s, e; @@ -918,7 +948,7 @@ void vm_unmap_aliases(void) i = find_next_bit(vb->dirty_map, VMAP_BBMAP_BITS, i); } - spin_unlock(&vb->lock); + spin_unlock_irq(&vb->lock); } rcu_read_unlock(); } @@ -942,6 +972,8 @@ void vm_unmap_ram(const void *mem, unsig BUG_ON(addr > VMALLOC_END); BUG_ON(addr & (PAGE_SIZE-1)); + BUG_ON(in_interrupt()); + debug_check_no_locks_freed(mem, size); vmap_debug_free_range(addr, addr+size); @@ -967,6 +999,8 @@ void *vm_map_ram(struct page **pages, un unsigned long addr; void *mem; + BUG_ON(in_interrupt()); + if (likely(count <= VMAP_MAX_ALLOC)) { mem = vb_alloc(size, GFP_KERNEL); if (IS_ERR(mem)) @@ -996,6 +1030,7 @@ void __init vmalloc_init(void) struct vm_struct *tmp; int i; + INIT_WORK(&purge_work, deferred_purge); for_each_possible_cpu(i) { struct vmap_block_queue *vbq; @@ -1029,7 +1064,7 @@ void unmap_kernel_range(unsigned long ad int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) { unsigned long addr = (unsigned long)area->addr; - unsigned long end = addr + area->size - PAGE_SIZE; + unsigned long end = addr + area->size; int err; err = vmap_page_range(addr, end, prot, *pages); @@ -1055,7 +1090,6 @@ static struct vm_struct *__get_vm_area_n struct vm_struct *tmp, **p; unsigned long align = 1; - BUG_ON(in_interrupt()); if (flags & VM_IOREMAP) { int bit = fls(size); @@ -1075,11 +1109,6 @@ static struct vm_struct *__get_vm_area_n if (unlikely(!area)) return NULL; - /* - * We always allocate a guard page. 
- */ - size += PAGE_SIZE; - va = alloc_vmap_area(size, align, start, end, node, gfp_mask); if (IS_ERR(va)) { kfree(area); @@ -1096,14 +1125,14 @@ static struct vm_struct *__get_vm_area_n va->private = area; va->flags |= VM_VM_AREA; - write_lock(&vmlist_lock); + write_lock_irq(&vmlist_lock); for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { if (tmp->addr >= area->addr) break; } area->next = *p; *p = area; - write_unlock(&vmlist_lock); + write_unlock_irq(&vmlist_lock); return area; } @@ -1180,16 +1209,16 @@ struct vm_struct *remove_vm_area(const v if (va && va->flags & VM_VM_AREA) { struct vm_struct *vm = va->private; struct vm_struct *tmp, **p; + unsigned long flags; vmap_debug_free_range(va->va_start, va->va_end); free_unmap_vmap_area(va); - vm->size -= PAGE_SIZE; - write_lock(&vmlist_lock); + write_lock_irqsave(&vmlist_lock, flags); for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) ; *p = tmp->next; - write_unlock(&vmlist_lock); + write_unlock_irqrestore(&vmlist_lock, flags); return vm; } @@ -1250,7 +1279,6 @@ static void __vunmap(const void *addr, i */ void vfree(const void *addr) { - BUG_ON(in_interrupt()); __vunmap(addr, 1); } EXPORT_SYMBOL(vfree); @@ -1266,7 +1294,6 @@ EXPORT_SYMBOL(vfree); */ void vunmap(const void *addr) { - BUG_ON(in_interrupt()); __vunmap(addr, 0); } EXPORT_SYMBOL(vunmap); @@ -1311,7 +1338,7 @@ static void *__vmalloc_area_node(struct struct page **pages; unsigned int nr_pages, array_size, i; - nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; + nr_pages = area->size >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); area->nr_pages = nr_pages; @@ -1533,10 +1560,10 @@ long vread(char *buf, char *addr, unsign if ((unsigned long) addr + count < count) count = -(unsigned long) addr; - read_lock(&vmlist_lock); + read_lock_irq(&vmlist_lock); for (tmp = vmlist; tmp; tmp = tmp->next) { vaddr = (char *) tmp->addr; - if (addr >= vaddr + tmp->size - PAGE_SIZE) + if (addr >= vaddr + tmp->size) continue; while (addr < vaddr) { if (count == 0) @@ -1546,7 +1573,7 @@ long vread(char *buf, char *addr, unsign addr++; count--; } - n = vaddr + tmp->size - PAGE_SIZE - addr; + n = vaddr + tmp->size - addr; do { if (count == 0) goto finished; @@ -1557,7 +1584,7 @@ long vread(char *buf, char *addr, unsign } while (--n > 0); } finished: - read_unlock(&vmlist_lock); + read_unlock_irq(&vmlist_lock); return buf - buf_start; } @@ -1571,10 +1598,10 @@ long vwrite(char *buf, char *addr, unsig if ((unsigned long) addr + count < count) count = -(unsigned long) addr; - read_lock(&vmlist_lock); + read_lock_irq(&vmlist_lock); for (tmp = vmlist; tmp; tmp = tmp->next) { vaddr = (char *) tmp->addr; - if (addr >= vaddr + tmp->size - PAGE_SIZE) + if (addr >= vaddr + tmp->size) continue; while (addr < vaddr) { if (count == 0) @@ -1583,7 +1610,7 @@ long vwrite(char *buf, char *addr, unsig addr++; count--; } - n = vaddr + tmp->size - PAGE_SIZE - addr; + n = vaddr + tmp->size - addr; do { if (count == 0) goto finished; @@ -1594,7 +1621,7 @@ long vwrite(char *buf, char *addr, unsig } while (--n > 0); } finished: - read_unlock(&vmlist_lock); + read_unlock_irq(&vmlist_lock); return buf - buf_start; } @@ -1629,7 +1656,7 @@ int remap_vmalloc_range(struct vm_area_s if (!(area->flags & VM_USERMAP)) return -EINVAL; - if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) + if (usize + (pgoff << PAGE_SHIFT) > area->size) return -EINVAL; addr += pgoff << PAGE_SHIFT; @@ -1723,7 +1750,7 @@ static void *s_start(struct seq_file *m, loff_t n = *pos; struct vm_struct *v; - read_lock(&vmlist_lock); 
+ read_lock_irq(&vmlist_lock); v = vmlist; while (n > 0 && v) { n--; @@ -1746,7 +1773,7 @@ static void *s_next(struct seq_file *m, static void s_stop(struct seq_file *m, void *p) { - read_unlock(&vmlist_lock); + read_unlock_irq(&vmlist_lock); } static void show_numa_info(struct seq_file *m, struct vm_struct *v) Index: linux-2.6/arch/arm/mm/ioremap.c =================================================================== --- linux-2.6.orig/arch/arm/mm/ioremap.c +++ linux-2.6/arch/arm/mm/ioremap.c @@ -345,7 +345,7 @@ void __iounmap(volatile void __iomem *io * all the mappings before the area can be reclaimed * by someone else. */ - write_lock(&vmlist_lock); + write_lock_irq(&vmlist_lock); for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) { if ((tmp->flags & VM_IOREMAP) && (tmp->addr == addr)) { if (tmp->flags & VM_ARM_SECTION_MAPPING) { @@ -355,7 +355,7 @@ void __iounmap(volatile void __iomem *io break; } } - write_unlock(&vmlist_lock); + write_unlock_irq(&vmlist_lock); #endif vunmap(addr); Index: linux-2.6/drivers/xen/xenbus/xenbus_client.c =================================================================== --- linux-2.6.orig/drivers/xen/xenbus/xenbus_client.c +++ linux-2.6/drivers/xen/xenbus/xenbus_client.c @@ -488,12 +488,12 @@ int xenbus_unmap_ring_vfree(struct xenbu * xenbus_map_ring_valloc, but these 6 lines considerably simplify * this API. */ - read_lock(&vmlist_lock); + read_lock_irq(&vmlist_lock); for (area = vmlist; area != NULL; area = area->next) { if (area->addr == vaddr) break; } - read_unlock(&vmlist_lock); + read_unlock_irq(&vmlist_lock); if (!area) { xenbus_dev_error(dev, -ENOENT, Index: linux-2.6/fs/proc/kcore.c =================================================================== --- linux-2.6.orig/fs/proc/kcore.c +++ linux-2.6/fs/proc/kcore.c @@ -336,7 +336,7 @@ read_kcore(struct file *file, char __use if (!elf_buf) return -ENOMEM; - read_lock(&vmlist_lock); + read_lock_irq(&vmlist_lock); for (m=vmlist; m && cursize; m=m->next) { unsigned long vmstart; unsigned long vmsize; @@ -364,7 +364,7 @@ read_kcore(struct file *file, char __use memcpy(elf_buf + (vmstart - start), (char *)vmstart, vmsize); } - read_unlock(&vmlist_lock); + read_unlock_irq(&vmlist_lock); if (copy_to_user(buffer, elf_buf, tsz)) { kfree(elf_buf); return -EFAULT; Index: linux-2.6/fs/proc/mmu.c =================================================================== --- linux-2.6.orig/fs/proc/mmu.c +++ linux-2.6/fs/proc/mmu.c @@ -30,7 +30,7 @@ void get_vmalloc_info(struct vmalloc_inf prev_end = VMALLOC_START; - read_lock(&vmlist_lock); + read_lock_irq(&vmlist_lock); for (vma = vmlist; vma; vma = vma->next) { unsigned long addr = (unsigned long) vma->addr; @@ -55,6 +55,6 @@ void get_vmalloc_info(struct vmalloc_inf if (VMALLOC_END - prev_end > vmi->largest_chunk) vmi->largest_chunk = VMALLOC_END - prev_end; - read_unlock(&vmlist_lock); + read_unlock_irq(&vmlist_lock); } } Index: linux-2.6/arch/x86/mm/ioremap.c =================================================================== --- linux-2.6.orig/arch/x86/mm/ioremap.c +++ linux-2.6/arch/x86/mm/ioremap.c @@ -428,12 +428,12 @@ void iounmap(volatile void __iomem *addr in parallel. Reuse of the virtual address is prevented by leaving it in the global lists until we're done with it. cpa takes care of the direct mappings. 
*/ - read_lock(&vmlist_lock); + read_lock_irq(&vmlist_lock); for (p = vmlist; p; p = p->next) { if (p->addr == (void __force *)addr) break; } - read_unlock(&vmlist_lock); + read_unlock_irq(&vmlist_lock); if (!p) { printk(KERN_ERR "iounmap: bad address %p\n", addr); Index: linux-2.6/fs/file.c =================================================================== --- linux-2.6.orig/fs/file.c +++ linux-2.6/fs/file.c @@ -20,24 +20,10 @@ #include <linux/rcupdate.h> #include <linux/workqueue.h> -struct fdtable_defer { - spinlock_t lock; - struct work_struct wq; - struct fdtable *next; -}; - int sysctl_nr_open __read_mostly = 1024*1024; int sysctl_nr_open_min = BITS_PER_LONG; int sysctl_nr_open_max = 1024 * 1024; /* raised later */ -/* - * We use this list to defer free fdtables that have vmalloced - * sets/arrays. By keeping a per-cpu list, we avoid having to embed - * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in - * this per-task structure. - */ -static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); - static inline void * alloc_fdmem(unsigned int size) { if (size <= PAGE_SIZE) @@ -62,29 +48,9 @@ static inline void free_fdset(struct fdt vfree(fdt->open_fds); } -static void free_fdtable_work(struct work_struct *work) -{ - struct fdtable_defer *f = - container_of(work, struct fdtable_defer, wq); - struct fdtable *fdt; - - spin_lock_bh(&f->lock); - fdt = f->next; - f->next = NULL; - spin_unlock_bh(&f->lock); - while(fdt) { - struct fdtable *next = fdt->next; - vfree(fdt->fd); - free_fdset(fdt); - kfree(fdt); - fdt = next; - } -} - void free_fdtable_rcu(struct rcu_head *rcu) { struct fdtable *fdt = container_of(rcu, struct fdtable, rcu); - struct fdtable_defer *fddef; BUG_ON(!fdt); @@ -97,20 +63,9 @@ void free_fdtable_rcu(struct rcu_head *r container_of(fdt, struct files_struct, fdtab)); return; } - if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) { - kfree(fdt->fd); - kfree(fdt->open_fds); - kfree(fdt); - } else { - fddef = &get_cpu_var(fdtable_defer_list); - spin_lock(&fddef->lock); - fdt->next = fddef->next; - fddef->next = fdt; - /* vmallocs are handled from the workqueue context */ - schedule_work(&fddef->wq); - spin_unlock(&fddef->lock); - put_cpu_var(fdtable_defer_list); - } + free_fdarr(fdt); + free_fdset(fdt); + kfree(fdt); } /* @@ -404,19 +359,8 @@ out: return NULL; } -static void __devinit fdtable_defer_list_init(int cpu) -{ - struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu); - spin_lock_init(&fddef->lock); - INIT_WORK(&fddef->wq, free_fdtable_work); - fddef->next = NULL; -} - void __init files_defer_init(void) { - int i; - for_each_possible_cpu(i) - fdtable_defer_list_init(i); sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; } Index: linux-2.6/ipc/util.c =================================================================== --- linux-2.6.orig/ipc/util.c +++ linux-2.6/ipc/util.c @@ -477,10 +477,9 @@ void ipc_free(void* ptr, int size) /* * rcu allocations: - * There are three headers that are prepended to the actual allocation: + * There are two headers that are prepended to the actual allocation: * - during use: ipc_rcu_hdr. * - during the rcu grace period: ipc_rcu_grace. - * - [only if vmalloc]: ipc_rcu_sched. * Their lifetime doesn't overlap, thus the headers share the same memory. 
* Unlike a normal union, they are right-aligned, thus some container_of * forward/backward casting is necessary: @@ -489,33 +488,16 @@ struct ipc_rcu_hdr { int refcount; int is_vmalloc; - void *data[0]; -}; - - -struct ipc_rcu_grace -{ struct rcu_head rcu; - /* "void *" makes sure alignment of following data is sane. */ - void *data[0]; -}; -struct ipc_rcu_sched -{ - struct work_struct work; - /* "void *" makes sure alignment of following data is sane. */ void *data[0]; }; -#define HDRLEN_KMALLOC (sizeof(struct ipc_rcu_grace) > sizeof(struct ipc_rcu_hdr) ? \ - sizeof(struct ipc_rcu_grace) : sizeof(struct ipc_rcu_hdr)) -#define HDRLEN_VMALLOC (sizeof(struct ipc_rcu_sched) > HDRLEN_KMALLOC ? \ - sizeof(struct ipc_rcu_sched) : HDRLEN_KMALLOC) static inline int rcu_use_vmalloc(int size) { /* Too big for a single page? */ - if (HDRLEN_KMALLOC + size > PAGE_SIZE) + if (sizeof(struct ipc_rcu_hdr) + size > PAGE_SIZE) return 1; return 0; } @@ -532,23 +514,26 @@ static inline int rcu_use_vmalloc(int si void* ipc_rcu_alloc(int size) { void* out; - /* - * We prepend the allocation with the rcu struct, and - * workqueue if necessary (for vmalloc). - */ + if (rcu_use_vmalloc(size)) { - out = vmalloc(HDRLEN_VMALLOC + size); + out = vmalloc(sizeof(struct ipc_rcu_hdr) + size); if (out) { - out += HDRLEN_VMALLOC; - container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 1; - container_of(out, struct ipc_rcu_hdr, data)->refcount = 1; + struct ipc_rcu_hdr *hdr; + + out += sizeof(struct ipc_rcu_hdr); + hdr = container_of(out, struct ipc_rcu_hdr, data); + hdr->is_vmalloc = 1; + hdr->refcount = 1; } } else { - out = kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL); + out = kmalloc(sizeof(struct ipc_rcu_hdr) + size, GFP_KERNEL); if (out) { - out += HDRLEN_KMALLOC; - container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 0; - container_of(out, struct ipc_rcu_hdr, data)->refcount = 1; + struct ipc_rcu_hdr *hdr; + + out += sizeof(struct ipc_rcu_hdr); + hdr = container_of(out, struct ipc_rcu_hdr, data); + hdr->is_vmalloc = 0; + hdr->refcount = 1; } } @@ -560,56 +545,30 @@ void ipc_rcu_getref(void *ptr) container_of(ptr, struct ipc_rcu_hdr, data)->refcount++; } -static void ipc_do_vfree(struct work_struct *work) -{ - vfree(container_of(work, struct ipc_rcu_sched, work)); -} - -/** - * ipc_schedule_free - free ipc + rcu space - * @head: RCU callback structure for queued work - * - * Since RCU callback function is called in bh, - * we need to defer the vfree to schedule_work(). - */ -static void ipc_schedule_free(struct rcu_head *head) -{ - struct ipc_rcu_grace *grace; - struct ipc_rcu_sched *sched; - - grace = container_of(head, struct ipc_rcu_grace, rcu); - sched = container_of(&(grace->data[0]), struct ipc_rcu_sched, - data[0]); - - INIT_WORK(&sched->work, ipc_do_vfree); - schedule_work(&sched->work); -} - /** * ipc_immediate_free - free ipc + rcu space * @head: RCU callback structure that contains pointer to be freed * * Free from the RCU callback context. 
*/ -static void ipc_immediate_free(struct rcu_head *head) +static void ipc_rcu_free(struct rcu_head *head) { - struct ipc_rcu_grace *free = - container_of(head, struct ipc_rcu_grace, rcu); - kfree(free); + struct ipc_rcu_hdr *hdr = container_of(head, struct ipc_rcu_hdr, rcu); + + if (hdr->is_vmalloc) + vfree(hdr); + else + kfree(hdr); } void ipc_rcu_putref(void *ptr) { - if (--container_of(ptr, struct ipc_rcu_hdr, data)->refcount > 0) + struct ipc_rcu_hdr *hdr = container_of(ptr, struct ipc_rcu_hdr, data); + + if (--hdr->refcount > 0) return; - if (container_of(ptr, struct ipc_rcu_hdr, data)->is_vmalloc) { - call_rcu(&container_of(ptr, struct ipc_rcu_grace, data)->rcu, - ipc_schedule_free); - } else { - call_rcu(&container_of(ptr, struct ipc_rcu_grace, data)->rcu, - ipc_immediate_free); - } + call_rcu(&hdr->rcu, ipc_rcu_free); } /** Index: linux-2.6/fs/buffer.c =================================================================== --- linux-2.6.orig/fs/buffer.c +++ linux-2.6/fs/buffer.c @@ -166,151 +166,6 @@ void end_buffer_write_sync(struct buffer } /* - * Write out and wait upon all the dirty data associated with a block - * device via its mapping. Does not take the superblock lock. - */ -int sync_blockdev(struct block_device *bdev) -{ - int ret = 0; - - if (bdev) - ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); - return ret; -} -EXPORT_SYMBOL(sync_blockdev); - -/* - * Write out and wait upon all dirty data associated with this - * device. Filesystem data as well as the underlying block - * device. Takes the superblock lock. - */ -int fsync_bdev(struct block_device *bdev) -{ - struct super_block *sb = get_super(bdev); - if (sb) { - int res = fsync_super(sb); - drop_super(sb); - return res; - } - return sync_blockdev(bdev); -} - -/** - * freeze_bdev -- lock a filesystem and force it into a consistent state - * @bdev: blockdevice to lock - * - * This takes the block device bd_mount_sem to make sure no new mounts - * happen on bdev until thaw_bdev() is called. - * If a superblock is found on this device, we take the s_umount semaphore - * on it to make sure nobody unmounts until the snapshot creation is done. - * The reference counter (bd_fsfreeze_count) guarantees that only the last - * unfreeze process can unfreeze the frozen filesystem actually when multiple - * freeze requests arrive simultaneously. It counts up in freeze_bdev() and - * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze - * actually. 
- */ -struct super_block *freeze_bdev(struct block_device *bdev) -{ - struct super_block *sb; - int error = 0; - - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (bdev->bd_fsfreeze_count > 0) { - bdev->bd_fsfreeze_count++; - sb = get_super(bdev); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return sb; - } - bdev->bd_fsfreeze_count++; - - down(&bdev->bd_mount_sem); - sb = get_super(bdev); - if (sb && !(sb->s_flags & MS_RDONLY)) { - sb->s_frozen = SB_FREEZE_WRITE; - smp_wmb(); - - __fsync_super(sb); - - sb->s_frozen = SB_FREEZE_TRANS; - smp_wmb(); - - sync_blockdev(sb->s_bdev); - - if (sb->s_op->freeze_fs) { - error = sb->s_op->freeze_fs(sb); - if (error) { - printk(KERN_ERR - "VFS:Filesystem freeze failed\n"); - sb->s_frozen = SB_UNFROZEN; - drop_super(sb); - up(&bdev->bd_mount_sem); - bdev->bd_fsfreeze_count--; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return ERR_PTR(error); - } - } - } - - sync_blockdev(bdev); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - - return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ -} -EXPORT_SYMBOL(freeze_bdev); - -/** - * thaw_bdev -- unlock filesystem - * @bdev: blockdevice to unlock - * @sb: associated superblock - * - * Unlocks the filesystem and marks it writeable again after freeze_bdev(). - */ -int thaw_bdev(struct block_device *bdev, struct super_block *sb) -{ - int error = 0; - - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (!bdev->bd_fsfreeze_count) { - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return -EINVAL; - } - - bdev->bd_fsfreeze_count--; - if (bdev->bd_fsfreeze_count > 0) { - if (sb) - drop_super(sb); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return 0; - } - - if (sb) { - BUG_ON(sb->s_bdev != bdev); - if (!(sb->s_flags & MS_RDONLY)) { - if (sb->s_op->unfreeze_fs) { - error = sb->s_op->unfreeze_fs(sb); - if (error) { - printk(KERN_ERR - "VFS:Filesystem thaw failed\n"); - sb->s_frozen = SB_FREEZE_TRANS; - bdev->bd_fsfreeze_count++; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return error; - } - } - sb->s_frozen = SB_UNFROZEN; - smp_wmb(); - wake_up(&sb->s_wait_unfrozen); - } - drop_super(sb); - } - - up(&bdev->bd_mount_sem); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return 0; -} -EXPORT_SYMBOL(thaw_bdev); - -/* * Various filesystems appear to want __find_get_block to be non-blocking. * But it's the page lock which protects the buffers. To get around this, * we get exclusion from try_to_free_buffers with the blockdev mapping's @@ -599,7 +454,7 @@ EXPORT_SYMBOL(mark_buffer_async_write); * written back and waited upon before fsync() returns. * * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), - * inode_has_buffers() and invalidate_inode_buffers() are provided for the + * mapping_has_private() and invalidate_inode_buffers() are provided for the * management of a list of dependent buffers at ->i_mapping->private_list. * * Locking is a little subtle: try_to_free_buffers() will remove buffers @@ -652,11 +507,6 @@ static void __remove_assoc_queue(struct bh->b_assoc_map = NULL; } -int inode_has_buffers(struct inode *inode) -{ - return !list_empty(&inode->i_data.private_list); -} - /* * osync is designed to support O_SYNC io. 
It waits synchronously for * all already-submitted IO to complete, but does not queue any new @@ -932,8 +782,9 @@ static int fsync_buffers_list(spinlock_t */ void invalidate_inode_buffers(struct inode *inode) { - if (inode_has_buffers(inode)) { - struct address_space *mapping = &inode->i_data; + struct address_space *mapping = &inode->i_data; + + if (mapping_has_private(mapping)) { struct list_head *list = &mapping->private_list; struct address_space *buffer_mapping = mapping->assoc_mapping; @@ -953,10 +804,10 @@ EXPORT_SYMBOL(invalidate_inode_buffers); */ int remove_inode_buffers(struct inode *inode) { + struct address_space *mapping = &inode->i_data; int ret = 1; - if (inode_has_buffers(inode)) { - struct address_space *mapping = &inode->i_data; + if (mapping_has_private(mapping)) { struct list_head *list = &mapping->private_list; struct address_space *buffer_mapping = mapping->assoc_mapping; @@ -1715,6 +1566,7 @@ static int __block_write_full_page(struc struct buffer_head *bh, *head; const unsigned blocksize = 1 << inode->i_blkbits; int nr_underway = 0; + int clean_page = 1; BUG_ON(!PageLocked(page)); @@ -1725,6 +1577,8 @@ static int __block_write_full_page(struc (1 << BH_Dirty)|(1 << BH_Uptodate)); } + clean_page_prepare(page); + /* * Be very careful. We have no exclusion from __set_page_dirty_buffers * here, and the (potentially unmapped) buffers may become dirty at @@ -1786,7 +1640,7 @@ static int __block_write_full_page(struc if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { lock_buffer(bh); } else if (!trylock_buffer(bh)) { - redirty_page_for_writepage(wbc, page); + clean_page = 0; continue; } if (test_clear_buffer_dirty(bh)) { @@ -1800,6 +1654,8 @@ static int __block_write_full_page(struc * The page and its buffers are protected by PageWriteback(), so we can * drop the bh refcounts early. 
*/ + if (clean_page) + clear_page_dirty(page); BUG_ON(PageWriteback(page)); set_page_writeback(page); @@ -2475,11 +2331,17 @@ block_page_mkwrite(struct vm_area_struct int ret = -EINVAL; lock_page(page); + if (!page->mapping) { + ret = 0; + goto out; + } + + BUG_ON(page->mapping != inode->i_mapping); + size = i_size_read(inode); - if ((page->mapping != inode->i_mapping) || - (page_offset(page) > size)) { + if (page_offset(page) > size) { /* page got truncated out from underneath us */ - goto out_unlock; + goto out; } /* page is wholly or partially inside EOF */ @@ -2492,8 +2354,7 @@ block_page_mkwrite(struct vm_area_struct if (!ret) ret = block_commit_write(page, 0, end); -out_unlock: - unlock_page(page); +out: return ret; } Index: linux-2.6/include/linux/buffer_head.h =================================================================== --- linux-2.6.orig/include/linux/buffer_head.h +++ linux-2.6/include/linux/buffer_head.h @@ -158,22 +158,14 @@ void end_buffer_write_sync(struct buffer /* Things to do with buffers at mapping->private_list */ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); -int inode_has_buffers(struct inode *); void invalidate_inode_buffers(struct inode *); int remove_inode_buffers(struct inode *inode); int sync_mapping_buffers(struct address_space *mapping); void unmap_underlying_metadata(struct block_device *bdev, sector_t block); void mark_buffer_async_write(struct buffer_head *bh); -void invalidate_bdev(struct block_device *); -int sync_blockdev(struct block_device *bdev); void __wait_on_buffer(struct buffer_head *); wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); -int fsync_bdev(struct block_device *); -struct super_block *freeze_bdev(struct block_device *); -int thaw_bdev(struct block_device *, struct super_block *); -int fsync_super(struct super_block *); -int fsync_no_super(struct block_device *); struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, unsigned size); struct buffer_head *__getblk(struct block_device *bdev, sector_t block, @@ -340,7 +332,6 @@ extern int __set_page_dirty_buffers(stru static inline void buffer_init(void) {} static inline int try_to_free_buffers(struct page *page) { return 1; } static inline int sync_blockdev(struct block_device *bdev) { return 0; } -static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } Index: linux-2.6/include/linux/fs.h =================================================================== --- linux-2.6.orig/include/linux/fs.h +++ linux-2.6/include/linux/fs.h @@ -531,6 +531,20 @@ struct address_space_operations { int (*launder_page) (struct page *); int (*is_partially_uptodate) (struct page *, read_descriptor_t *, unsigned long); + + /* + * release_mapping releases any private data on the mapping so that + * it may be reclaimed. Returns 1 on success or 0 on failure. Second + * parameter 'force' causes dirty data to be invalidated. (XXX: could + * have other flags like sync/async, etc). + */ + int (*release)(struct address_space *, int); + + /* + * sync writes back and waits for any private data on the mapping, + * as a data consistency operation. 
+ */ + int (*sync)(struct address_space *); }; /* @@ -616,6 +630,14 @@ struct block_device { int mapping_tagged(struct address_space *mapping, int tag); /* + * Does this mapping have anything on its private list? + */ +static inline int mapping_has_private(struct address_space *mapping) +{ + return !list_empty(&mapping->private_list); +} + +/* * Might pages of this file be mapped into userspace? */ static inline int mapping_mapped(struct address_space *mapping) @@ -1730,6 +1752,13 @@ extern void bd_set_size(struct block_dev extern void bd_forget(struct inode *inode); extern void bdput(struct block_device *); extern struct block_device *open_by_devnum(dev_t, fmode_t); +extern void invalidate_bdev(struct block_device *); +extern int sync_blockdev(struct block_device *bdev); +extern struct super_block *freeze_bdev(struct block_device *); +extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); +extern int fsync_bdev(struct block_device *); +extern int fsync_super(struct super_block *); +extern int fsync_no_super(struct block_device *); #else static inline void bd_forget(struct inode *inode) {} #endif Index: linux-2.6/fs/block_dev.c =================================================================== --- linux-2.6.orig/fs/block_dev.c +++ linux-2.6/fs/block_dev.c @@ -18,6 +18,7 @@ #include <linux/module.h> #include <linux/blkpg.h> #include <linux/buffer_head.h> +#include <linux/pagevec.h> #include <linux/writeback.h> #include <linux/mpage.h> #include <linux/mount.h> @@ -71,7 +72,7 @@ static void kill_bdev(struct block_devic int set_blocksize(struct block_device *bdev, int size) { /* Size must be a power of two, and between 512 and PAGE_SIZE */ - if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) + if (size < 512 || !is_power_of_2(size)) return -EINVAL; /* Size cannot be smaller than the size supported by the device */ @@ -87,7 +88,6 @@ int set_blocksize(struct block_device *b } return 0; } - EXPORT_SYMBOL(set_blocksize); int sb_set_blocksize(struct super_block *sb, int size) @@ -174,6 +174,151 @@ blkdev_direct_IO(int rw, struct kiocb *i iov, offset, nr_segs, blkdev_get_blocks, NULL); } +/* + * Write out and wait upon all the dirty data associated with a block + * device via its mapping. Does not take the superblock lock. + */ +int sync_blockdev(struct block_device *bdev) +{ + int ret = 0; + + if (bdev) + ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); + return ret; +} +EXPORT_SYMBOL(sync_blockdev); + +/* + * Write out and wait upon all dirty data associated with this + * device. Filesystem data as well as the underlying block + * device. Takes the superblock lock. + */ +int fsync_bdev(struct block_device *bdev) +{ + struct super_block *sb = get_super(bdev); + if (sb) { + int res = fsync_super(sb); + drop_super(sb); + return res; + } + return sync_blockdev(bdev); +} + +/** + * freeze_bdev -- lock a filesystem and force it into a consistent state + * @bdev: blockdevice to lock + * + * This takes the block device bd_mount_sem to make sure no new mounts + * happen on bdev until thaw_bdev() is called. + * If a superblock is found on this device, we take the s_umount semaphore + * on it to make sure nobody unmounts until the snapshot creation is done. + * The reference counter (bd_fsfreeze_count) guarantees that only the last + * unfreeze process can unfreeze the frozen filesystem actually when multiple + * freeze requests arrive simultaneously. It counts up in freeze_bdev() and + * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze + * actually. 
+ */ +struct super_block *freeze_bdev(struct block_device *bdev) +{ + struct super_block *sb; + int error = 0; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + bdev->bd_fsfreeze_count++; + sb = get_super(bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return sb; + } + bdev->bd_fsfreeze_count++; + + down(&bdev->bd_mount_sem); + sb = get_super(bdev); + if (sb && !(sb->s_flags & MS_RDONLY)) { + sb->s_frozen = SB_FREEZE_WRITE; + smp_wmb(); + + __fsync_super(sb); + + sb->s_frozen = SB_FREEZE_TRANS; + smp_wmb(); + + sync_blockdev(sb->s_bdev); + + if (sb->s_op->freeze_fs) { + error = sb->s_op->freeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem freeze failed\n"); + sb->s_frozen = SB_UNFROZEN; + drop_super(sb); + up(&bdev->bd_mount_sem); + bdev->bd_fsfreeze_count--; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return ERR_PTR(error); + } + } + } + + sync_blockdev(bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + + return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ +} +EXPORT_SYMBOL(freeze_bdev); + +/** + * thaw_bdev -- unlock filesystem + * @bdev: blockdevice to unlock + * @sb: associated superblock + * + * Unlocks the filesystem and marks it writeable again after freeze_bdev(). + */ +int thaw_bdev(struct block_device *bdev, struct super_block *sb) +{ + int error = 0; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (!bdev->bd_fsfreeze_count) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return -EINVAL; + } + + bdev->bd_fsfreeze_count--; + if (bdev->bd_fsfreeze_count > 0) { + if (sb) + drop_super(sb); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; + } + + if (sb) { + BUG_ON(sb->s_bdev != bdev); + if (!(sb->s_flags & MS_RDONLY)) { + if (sb->s_op->unfreeze_fs) { + error = sb->s_op->unfreeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem thaw failed\n"); + sb->s_frozen = SB_FREEZE_TRANS; + bdev->bd_fsfreeze_count++; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return error; + } + } + sb->s_frozen = SB_UNFROZEN; + smp_wmb(); + wake_up(&sb->s_wait_unfrozen); + } + drop_super(sb); + } + + up(&bdev->bd_mount_sem); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; +} +EXPORT_SYMBOL(thaw_bdev); + static int blkdev_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, blkdev_get_block, wbc); @@ -206,6 +351,11 @@ static int blkdev_write_end(struct file return ret; } +static void blkdev_invalidate_page(struct page *page, unsigned long offset) +{ + block_invalidatepage(page, offset); +} + /* * private llseek: * for a block special file file->f_path.dentry->d_inode->i_size is zero @@ -1259,6 +1409,8 @@ static const struct address_space_operat .writepages = generic_writepages, .releasepage = blkdev_releasepage, .direct_IO = blkdev_direct_IO, + .set_page_dirty = __set_page_dirty_buffers, + .invalidatepage = blkdev_invalidate_page, }; const struct file_operations def_blk_fops = { Index: linux-2.6/mm/memory.c =================================================================== --- linux-2.6.orig/mm/memory.c +++ linux-2.6/mm/memory.c @@ -1960,9 +1960,11 @@ static int do_wp_page(struct mm_struct * */ page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - page_cache_release(old_page); - if (!pte_same(*page_table, orig_pte)) + if (!pte_same(*page_table, orig_pte)) { + page_cache_release(old_page); + unlock_page(old_page); goto unlock; + } page_mkwrite = 1; } @@ -2085,16 +2087,30 @@ unlock: * * do_no_page is protected similarly. 
*/ - wait_on_page_locked(dirty_page); - set_page_dirty_balance(dirty_page, page_mkwrite); + if (!page_mkwrite) { + wait_on_page_locked(dirty_page); + set_page_dirty_balance(dirty_page, page_mkwrite); + } put_page(dirty_page); + if (page_mkwrite) { + struct address_space *mapping = old_page->mapping; + + unlock_page(old_page); + page_cache_release(old_page); + balance_dirty_pages_ratelimited(mapping); + } } return ret; oom_free_new: page_cache_release(new_page); oom: - if (old_page) + if (old_page) { + if (page_mkwrite) { + unlock_page(old_page); + page_cache_release(old_page); + } page_cache_release(old_page); + } return VM_FAULT_OOM; unwritable_page: @@ -2647,19 +2663,6 @@ static int __do_fault(struct mm_struct * if (vma->vm_ops->page_mkwrite(vma, page) < 0) { ret = VM_FAULT_SIGBUS; anon = 1; /* no anon but release vmf.page */ - goto out_unlocked; - } - lock_page(page); - /* - * XXX: this is not quite right (racy vs - * invalidate) to unlock and relock the page - * like this, however a better fix requires - * reworking page_mkwrite locking API, which - * is better done later. - */ - if (!page->mapping) { - ret = 0; - anon = 1; /* no anon but release vmf.page */ goto out; } page_mkwrite = 1; @@ -2713,16 +2716,23 @@ static int __do_fault(struct mm_struct * pte_unmap_unlock(page_table, ptl); out: - unlock_page(vmf.page); -out_unlocked: - if (anon) - page_cache_release(vmf.page); - else if (dirty_page) { + if (dirty_page) { + struct address_space *mapping = page->mapping; + if (vma->vm_file) file_update_time(vma->vm_file); + if (set_page_dirty(dirty_page)) + page_mkwrite = 1; set_page_dirty_balance(dirty_page, page_mkwrite); + unlock_page(dirty_page); put_page(dirty_page); + if (page_mkwrite) + balance_dirty_pages_ratelimited(mapping); + } else { + unlock_page(vmf.page); + if (anon) + page_cache_release(vmf.page); } return ret; Index: linux-2.6/mm/migrate.c =================================================================== --- linux-2.6.orig/mm/migrate.c +++ linux-2.6/mm/migrate.c @@ -486,7 +486,7 @@ static int writeout(struct address_space /* No write method for the address space */ return -EINVAL; - if (!clear_page_dirty_for_io(page)) + if (!PageDirty(page)) /* Someone else already triggered a write */ return -EAGAIN; Index: linux-2.6/mm/page-writeback.c =================================================================== --- linux-2.6.orig/mm/page-writeback.c +++ linux-2.6/mm/page-writeback.c @@ -1028,8 +1028,6 @@ continue_unlock: } BUG_ON(PageWriteback(page)); - if (!clear_page_dirty_for_io(page)) - goto continue_unlock; ret = (*writepage)(page, wbc, data); if (unlikely(ret)) { @@ -1171,7 +1169,7 @@ int write_one_page(struct page *page, in if (wait) wait_on_page_writeback(page); - if (clear_page_dirty_for_io(page)) { + if (PageDirty(page)) { page_cache_get(page); ret = mapping->a_ops->writepage(page, &wbc); if (ret == 0 && wait) { @@ -1254,6 +1252,8 @@ EXPORT_SYMBOL(__set_page_dirty_nobuffers */ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) { + printk("redirty!\n"); + dump_stack(); wbc->pages_skipped++; return __set_page_dirty_nobuffers(page); } @@ -1304,6 +1304,35 @@ int set_page_dirty_lock(struct page *pag } EXPORT_SYMBOL(set_page_dirty_lock); +void clean_page_prepare(struct page *page) +{ + struct address_space *mapping = page->mapping; + + BUG_ON(!mapping); + BUG_ON(!PageLocked(page)); + BUG_ON(!PageDirty(page)); + + if (mapping_cap_account_dirty(page->mapping)) { + if (page_mkclean(page)) + set_page_dirty(page); + } +} + +void 
clear_page_dirty(struct page *page) +{ + struct address_space *mapping = page->mapping; + + BUG_ON(!mapping); + BUG_ON(!PageLocked(page)); + BUG_ON(!PageDirty(page)); + + ClearPageDirty(page); + if (mapping_cap_account_dirty(page->mapping)) { + dec_zone_page_state(page, NR_FILE_DIRTY); + dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + } +} + /* * Clear a page's dirty flag, while caring for dirty memory accounting. * Returns true if the page was previously dirty. Index: linux-2.6/mm/vmscan.c =================================================================== --- linux-2.6.orig/mm/vmscan.c +++ linux-2.6/mm/vmscan.c @@ -374,7 +374,7 @@ static pageout_t pageout(struct page *pa if (!may_write_to_queue(mapping->backing_dev_info)) return PAGE_KEEP; - if (clear_page_dirty_for_io(page)) { + if (PageDirty(page)) { int res; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, Index: linux-2.6/fs/xfs/linux-2.6/xfs_aops.c =================================================================== --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_aops.c +++ linux-2.6/fs/xfs/linux-2.6/xfs_aops.c @@ -452,7 +452,7 @@ xfs_start_page_writeback( ASSERT(PageLocked(page)); ASSERT(!PageWriteback(page)); if (clear_dirty) - clear_page_dirty_for_io(page); + clear_page_dirty(page); set_page_writeback(page); unlock_page(page); /* If no buffers on the page are to be written, finish it here */ @@ -1230,6 +1230,7 @@ xfs_vm_writepage( xfs_page_trace(XFS_WRITEPAGE_ENTER, inode, page, 0); + clean_page_prepare(page); /* * We need a transaction if: * 1. There are delalloc buffers on the page @@ -1277,9 +1278,7 @@ xfs_vm_writepage( return 0; out_fail: - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; + error = 0; out_unlock: unlock_page(page); return error; Index: linux-2.6/include/linux/mm.h =================================================================== --- linux-2.6.orig/include/linux/mm.h +++ linux-2.6/include/linux/mm.h @@ -836,6 +836,8 @@ int redirty_page_for_writepage(struct wr int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); +void clean_page_prepare(struct page *page); +void clear_page_dirty(struct page *page); extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, Index: linux-2.6/fs/mpage.c =================================================================== --- linux-2.6.orig/fs/mpage.c +++ linux-2.6/fs/mpage.c @@ -463,6 +463,8 @@ int __mpage_writepage(struct page *page, loff_t i_size = i_size_read(inode); int ret = 0; + clean_page_prepare(page); + if (page_has_buffers(page)) { struct buffer_head *head = page_buffers(page); struct buffer_head *bh = head; @@ -616,6 +618,7 @@ alloc_new: try_to_free_buffers(page); } + clear_page_dirty(page); BUG_ON(PageWriteback(page)); set_page_writeback(page); unlock_page(page); Index: linux-2.6/fs/fs-writeback.c =================================================================== --- linux-2.6.orig/fs/fs-writeback.c +++ linux-2.6/fs/fs-writeback.c @@ -782,9 +782,15 @@ int generic_osync_inode(struct inode *in if (what & OSYNC_DATA) err = filemap_fdatawrite(mapping); if (what & (OSYNC_METADATA|OSYNC_DATA)) { - err2 = sync_mapping_buffers(mapping); - if (!err) - err = err2; + if (!mapping->a_ops->sync) { + err2 = sync_mapping_buffers(mapping); + if (!err) + err = err2; + } else { + err2 = mapping->a_ops->sync(mapping); + if (!err) + err = err2; + } } if (what & OSYNC_DATA) { err2 = filemap_fdatawait(mapping); 
Index: linux-2.6/fs/inode.c =================================================================== --- linux-2.6.orig/fs/inode.c +++ linux-2.6/fs/inode.c @@ -208,7 +208,8 @@ static struct inode *alloc_inode(struct void destroy_inode(struct inode *inode) { - BUG_ON(inode_has_buffers(inode)); + BUG_ON(mapping_has_private(&inode->i_data)); + BUG_ON(inode->i_data.nrpages); security_inode_free(inode); if (inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); @@ -277,10 +278,14 @@ void __iget(struct inode * inode) */ void clear_inode(struct inode *inode) { + struct address_space *mapping = &inode->i_data; + might_sleep(); - invalidate_inode_buffers(inode); + if (!mapping->a_ops->release) + invalidate_inode_buffers(inode); - BUG_ON(inode->i_data.nrpages); + BUG_ON(mapping_has_private(mapping)); + BUG_ON(mapping->nrpages); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); inode_sync_wait(inode); @@ -343,6 +348,7 @@ static int invalidate_list(struct list_h for (;;) { struct list_head * tmp = next; struct inode * inode; + struct address_space * mapping; /* * We can reschedule here without worrying about the list's @@ -356,7 +362,12 @@ static int invalidate_list(struct list_h if (tmp == head) break; inode = list_entry(tmp, struct inode, i_sb_list); - invalidate_inode_buffers(inode); + mapping = &inode->i_data; + if (!mapping->a_ops->release) + invalidate_inode_buffers(inode); + else + mapping->a_ops->release(mapping, 1); /* XXX: should be done in fs? */ + BUG_ON(mapping_has_private(mapping)); if (!atomic_read(&inode->i_count)) { list_move(&inode->i_list, dispose); inode->i_state |= I_FREEING; @@ -399,13 +410,15 @@ EXPORT_SYMBOL(invalidate_inodes); static int can_unuse(struct inode *inode) { + struct address_space *mapping = &inode->i_data; + if (inode->i_state) return 0; - if (inode_has_buffers(inode)) + if (mapping_has_private(mapping)) return 0; if (atomic_read(&inode->i_count)) return 0; - if (inode->i_data.nrpages) + if (mapping->nrpages) return 0; return 1; } @@ -434,6 +447,7 @@ static void prune_icache(int nr_to_scan) spin_lock(&inode_lock); for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { struct inode *inode; + struct address_space *mapping; if (list_empty(&inode_unused)) break; @@ -444,10 +458,17 @@ static void prune_icache(int nr_to_scan) list_move(&inode->i_list, &inode_unused); continue; } - if (inode_has_buffers(inode) || inode->i_data.nrpages) { + mapping = &inode->i_data; + if (mapping_has_private(mapping) || mapping->nrpages) { + int ret; + __iget(inode); spin_unlock(&inode_lock); - if (remove_inode_buffers(inode)) + if (mapping->a_ops->release) + ret = mapping->a_ops->release(mapping, 0); + else + ret = remove_inode_buffers(inode); + if (ret) reap += invalidate_mapping_pages(&inode->i_data, 0, -1); iput(inode); Index: linux-2.6/fs/super.c =================================================================== --- linux-2.6.orig/fs/super.c +++ linux-2.6/fs/super.c @@ -28,7 +28,7 @@ #include <linux/blkdev.h> #include <linux/quotaops.h> #include <linux/namei.h> -#include <linux/buffer_head.h> /* for fsync_super() */ +#include <linux/fs.h> /* for fsync_super() */ #include <linux/mount.h> #include <linux/security.h> #include <linux/syscalls.h> @@ -38,6 +38,7 @@ #include <linux/kobject.h> #include <linux/mutex.h> #include <linux/file.h> +#include <linux/buffer_head.h> /* sync_blockdev */ #include <linux/async.h> #include <asm/uaccess.h> #include "internal.h"
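
As a rough illustration of what the irq-safe vmap change enables (this is a sketch, not something in the patch itself): with the BUG_ON(in_interrupt()) checks dropped from vunmap()/vfree() and the lazy purge kicked off via schedule_work(), a mapping can be torn down directly in a completion handler instead of being punted to a private workqueue first. The fsb_block/fsb_endio names below are made up for the example; fsblock's real VMAP_CACHE code is not shown.

/*
 * Hypothetical usage, assumes this rollup is applied so vunmap() may be
 * called from interrupt context (the TLB flush stays lazy/deferred).
 */
#include <linux/bio.h>
#include <linux/vmalloc.h>

struct fsb_block {
	void *vaddr;		/* from vmap() of the block's pages */
	struct page **pages;
	unsigned int nr_pages;
};

static void fsb_endio(struct bio *bio, int err)
{
	struct fsb_block *blk = bio->bi_private;

	/*
	 * Runs in interrupt context.  Previously vunmap() would trip
	 * BUG_ON(in_interrupt()), so this had to go through a deferred
	 * work item; with irq-safe vmap locking and the workqueue-based
	 * purge it can be done right here.
	 */
	vunmap(blk->vaddr);
	blk->vaddr = NULL;
	bio_put(bio);
}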
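
And a minimal sketch of how a filesystem might wire up the new ->release/->sync address_space operations added above. The myfs_* names are invented and the bodies are deliberately trivial (they just fall back to the existing private_list helpers); a real user such as fsblock would manage its own per-mapping private data here instead.

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* Return 1 when the mapping's private data is gone, 0 if it must stay. */
static int myfs_release_mapping(struct address_space *mapping, int force)
{
	if (force)
		invalidate_inode_buffers(mapping->host);
	else if (!remove_inode_buffers(mapping->host))
		return 0;
	return !mapping_has_private(mapping);
}

/* Write back and wait on the mapping's private data (data integrity path). */
static int myfs_sync_mapping(struct address_space *mapping)
{
	return sync_mapping_buffers(mapping);
}

static const struct address_space_operations myfs_aops = {
	/* ... usual readpage/writepage/etc. omitted ... */
	.release	= myfs_release_mapping,
	.sync		= myfs_sync_mapping,
};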