[Linux-cachefs] [PATCH 11/12] FS-Cache: CacheFS: Add cache on blockdevice cache backend

The attached patch adds CacheFS, a caching facility that uses a block device as
the cache by mounting a quasi-filesystem on it that provides caching facilities
rather than the usual file interface.

CacheFS stores its metadata in a wandering tree. Anything in the current tree
is (more or less) immutable. Changes to the block holding the root of the tree
and to the block allocation lists are tracked by a journal. The journal does not
store any specific information about changes to the content or the structure of
the tree.
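
As an illustration of that discipline, here is a minimal sketch (hypothetical
names throughout - this is not the CacheFS code): the changed node and all of
its ancestors are copied to freshly allocated blocks, the journal is then
pointed at the new root, and only after that are the superseded blocks handed
back for recycling.

/* Illustrative sketch only; alloc_block(), write_node(), journal_new_root()
 * and reclaim_block() are assumed, hypothetical services.
 */
#include <linux/types.h>

struct wnode {
	struct wnode	*parent;	/* NULL for the root node */
	u32		bix;		/* block currently backing this node */
};

extern u32 alloc_block(void);
extern void write_node(u32 bix, struct wnode *node);
extern void journal_new_root(u32 root_bix);
extern void reclaim_block(u32 bix);

static void wander_update(struct wnode *node)
{
	u32 old[16];			/* tree depth assumed <= 16 here */
	unsigned int n = 0, i;
	struct wnode *p, *root = node;

	for (p = node; p; p = p->parent) {
		old[n++] = p->bix;
		p->bix = alloc_block();
		/* the real tree patches p's pointer to its child's new block
		 * before p itself is written out */
		write_node(p->bix, p);
		root = p;
	}

	/* the current tree only changes when the journal takes the new root */
	journal_new_root(root->bix);

	/* the superseded blocks may be recycled once that journal entry is
	 * safely on disc */
	for (i = 0; i < n; i++)
		reclaim_block(old[i]);
}

The point of the exercise is that the journal never has to describe the tree
change itself; replay only needs to know which root and which free-list heads
were current.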

CacheFS creates an internal inode to cover the entire block device. Metadata
blocks on disk are attached to this as pages. Pages are migrated from one
backing block to another as the blocks become part of the metadata tree and
thus immutable, hence the need for the per-task radix-tree pre-allocation for
several pages in one go: insertion may have to fan out part of the tree. What
we want to do is keep the same page for as long as possible and just change
the page->index as the block migrates on disk.
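
Very roughly, that amounts to the following (a hypothetical helper, not the
CacheFS code; the real thing pre-allocates radix-tree nodes for several pages
at once and must also exclude concurrent lookups):

/* Illustrative only: the struct page is kept; just its index in the backing
 * device's mapping is moved to the new backing block.  radix_tree_preload()
 * ensures the re-insertion cannot fail with -ENOMEM.
 */
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/radix-tree.h>

static int migrate_meta_page(struct address_space *mapping,
			     struct page *page, pgoff_t new_bix)
{
	int ret;

	ret = radix_tree_preload(GFP_KERNEL);
	if (ret < 0)
		return ret;

	write_lock_irq(&mapping->tree_lock);
	radix_tree_delete(&mapping->page_tree, page->index);
	page->index = new_bix;
	ret = radix_tree_insert(&mapping->page_tree, new_bix, page);
	write_unlock_irq(&mapping->tree_lock);

	radix_tree_preload_end();
	return ret;
}

Keeping the same struct page means anyone holding a reference sees the data
follow the block to its new home on disk without any copying.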

CacheFS is not quite complete. Certain things need to be added or completed:

 (*) Metadata readahead needs to be added.

 (*) Tree scanning needs to be speeded up, probably with readahead.

 (*) Tree scanning needs to build a culling list (currently disabled).

 (*) Tree culling needs to be reworked (currently disabled).

 (*) There needs to be a culling policy created.

 (*) Some bugs need squashing still.

Signed-Off-By: David Howells <dhowells@xxxxxxxxxx>
---
warthog>diffstat -p1 fscache-cachefs-2614mm2.diff
 fs/Kconfig                      |   21 
 fs/Makefile                     |    1 
 fs/cachefs/Makefile             |   37 
 fs/cachefs/allocator.c          | 1382 +++++++++++++++++++++++++++++++++
 fs/cachefs/cachefs-debug.h      |  132 +++
 fs/cachefs/cachefs-inode.h      |   64 +
 fs/cachefs/cachefs-int.h        |  739 +++++++++++++++++
 fs/cachefs/cachefs-layout.h     |  312 +++++++
 fs/cachefs/inode.c              |  224 +++++
 fs/cachefs/interface.c          |  724 +++++++++++++++++
 fs/cachefs/journal-replay.c     |  485 +++++++++++
 fs/cachefs/journal.c            |  483 +++++++++++
 fs/cachefs/kcachefsd.c          |  278 ++++++
 fs/cachefs/main.c               |  224 +++++
 fs/cachefs/meta-aops.c          |  794 +++++++++++++++++++
 fs/cachefs/meta-misc.c          |  348 ++++++++
 fs/cachefs/operation.c          |  571 +++++++++++++
 fs/cachefs/reaper.c             |  120 ++
 fs/cachefs/recycling.c          |  966 +++++++++++++++++++++++
 fs/cachefs/rootdir.c            |  146 +++
 fs/cachefs/status.c             |  234 +++++
 fs/cachefs/super.c              | 1345 ++++++++++++++++++++++++++++++++
 fs/cachefs/tree-cull.c          |   19 
 fs/cachefs/tree-data.c          | 1669 ++++++++++++++++++++++++++++++++++++++++
 fs/cachefs/tree-delete.c        |  597 ++++++++++++++
 fs/cachefs/tree-insert-fanout.c | 1126 ++++++++++++++++++++++++++
 fs/cachefs/tree-insert.c        |  675 ++++++++++++++++
 fs/cachefs/tree-keys.c          |  587 ++++++++++++++
 fs/cachefs/tree-list.c          |  544 +++++++++++++
 fs/cachefs/tree-lookup.c        |  598 ++++++++++++++
 fs/cachefs/tree-misc.c          |  346 ++++++++
 fs/cachefs/tree-move.c          |  299 +++++++
 fs/cachefs/tree-node.c          |  284 ++++++
 fs/cachefs/tree-scan.c          |  972 +++++++++++++++++++++++
 fs/cachefs/tree-update.c        |  175 ++++
 35 files changed, 17521 insertions(+)

diff -uNrp linux-2.6.14-mm2/fs/Kconfig linux-2.6.14-mm2-cachefs/fs/Kconfig
--- linux-2.6.14-mm2/fs/Kconfig	2005-11-14 16:17:54.000000000 +0000
+++ linux-2.6.14-mm2-cachefs/fs/Kconfig	2005-11-14 16:23:38.000000000 +0000
@@ -524,6 +524,27 @@ config FUSE_FS
 
 	  See Documentation/filesystems/caching/fscache.txt for more information.
 
+config CACHEFS
+	tristate "Filesystem caching filesystem"
+	depends on FSCACHE
+	help
+	  This filesystem acts as a cache for other filesystems - primarily
+	  networking filesystems - thus allowing fast local disc to enhance
+	  the speed of slower devices.
+
+	  It is a filesystem so that raw block devices can be made use of more
+	  efficiently, without suffering any overhead from intermediary
+	  filesystems. This does not, however, preclude files being used as
+	  cache devices; this is possible by making use of the loopback block
+	  device driver.
+
+	  The cache can be journalled so that the cache contents aren't
+	  destroyed in the event of a power failure.
+
+	  See Documentation/filesystems/caching/cachefs.txt for more information.
+
+endmenu
+
 menu "CD-ROM/DVD Filesystems"
 
 config ISO9660_FS
diff -uNrp linux-2.6.14-mm2/fs/Makefile linux-2.6.14-mm2-cachefs/fs/Makefile
--- linux-2.6.14-mm2/fs/Makefile	2005-11-14 16:17:54.000000000 +0000
+++ linux-2.6.14-mm2-cachefs/fs/Makefile	2005-11-14 16:23:38.000000000 +0000
@@ -103,6 +103,7 @@ obj-$(CONFIG_AFS_FS)		+= afs/
 obj-$(CONFIG_BEFS_FS)		+= befs/
 obj-$(CONFIG_HOSTFS)		+= hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
+obj-$(CONFIG_CACHEFS)		+= cachefs/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
diff -uNrp linux-2.6.14-mm2/fs/cachefs/allocator.c linux-2.6.14-mm2-cachefs/fs/cachefs/allocator.c
--- linux-2.6.14-mm2/fs/cachefs/allocator.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/allocator.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,1382 @@
+/* allocator.c: CacheFS disk block allocator
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ *
+ * We allocate blocks from one of three sources in order:
+ * - unready block space
+ * - on-disc allocation list
+ * - on-disc recycling list (need to wander)
+ *
+ * The functions in this file are set up to jump from one to another where
+ * possible rather than truly recursing.
+ *
+ * These functions must not return -ENOMEM; -EIO is permitted as we can't then
+ * modify the disk, but -ENOMEM during deletion would be really hard to deal
+ * with.
+ *
+ * This means that we can't add pages to the page cache as the radix tree
+ * allocations might fail. We sleep waiting for BIOs to become available -
+ * which they should being transitory objects.
+ */
+
+//#define __KALLOC
+//#define __KENTER
+//#define __KLEAVE
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/buffer_head.h>
+#include <linux/mm_inline.h>
+#include "cachefs-int.h"
+
+/*****************************************************************************/
+/*
+ * validate the pointers in the node and bubble sort it
+ */
+static int cachefs_allocator_sort_alloc_TOS(struct cachefs_super *super,
+					    struct page *page)
+{
+	struct cachefs_ondisc_free_node *alstk;
+	cachefs_block_t *ptr, *end, *defer, last, this;
+	int changed;
+
+	alstk = kmap_atomic(page, KM_USER0);
+
+	end = &alstk->ptrs[CACHEFS_ONDISC_FREELIST_PTRSPERNODE];
+
+	/* check the nodes are all in range */
+	for (ptr = &alstk->ptrs[0]; ptr < end; ptr++)
+		if (unlikely(*ptr < super->layout->bix_cache ||
+			     *ptr >= super->j.alloc_unready ||
+			     (CACHEFS_NULL_PTR != 0 && *ptr == CACHEFS_NULL_PTR)))
+			goto invalid_node;
+
+	/* sort the nodes to get better read-back performance */
+	do {
+		changed = 0;
+		defer = NULL;
+		ptr = &alstk->ptrs[0];
+
+		for (last = *ptr++; ptr < end; ptr++) {
+			this = *ptr;
+
+			if (last > this) {
+				ptr[-1] = this;
+				changed = 1;
+				defer = ptr;
+			}
+			else {
+				if (defer) {
+					*defer = last;
+					defer = NULL;
+				}
+				last = this;
+			}
+		}
+
+		if (defer)
+			*defer = last;
+
+	} while (changed);
+
+	kunmap_atomic(alstk, KM_USER0);
+
+	SetPageFsMisc(page);
+	return 0;
+
+invalid_node:
+	kunmap_atomic(alstk, KM_USER0);
+
+	printk(KERN_ERR "CacheFS: Invalid ptr %x in alloc stk node %lx[%x]\n",
+	       *ptr, page->index, ptr - alstk->ptrs);
+	kleave(" = -EIO");
+	return -EIO;
+
+} /* end cachefs_allocator_sort_alloc_TOS() */
+
+/*****************************************************************************/
+/*
+ * handle the completion of a BIO that read a page for the allocator
+ */
+int cachefs_allocator_end_io_read(struct bio *bio,
+				  unsigned int bytes_done, int err)
+{
+	struct cachefs_super *super = bio->bi_private;
+	struct page *page = bio->bi_io_vec->bv_page;
+
+	_enter("{sz=%u rw=%lu},%u,%d",
+	       bio->bi_size, bio->bi_rw, bytes_done, err);
+
+	if (bio->bi_size)
+		return 1;
+
+	/* mark the page with the appropriate state */
+	if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+		SetPageUptodate(page);
+	} else {
+		set_bit(CACHEFS_SUPER_ERROR_STOP, &super->flags);
+		ClearPageUptodate(page);
+		SetPageError(page);
+	}
+
+	_debug("DONE PAGE %p{%d,%lx,%lx}",
+	       page, page_count(page), page->index, page->flags);
+
+	unlock_page(page);
+	bio_put(bio);
+	return 0;
+
+} /* end cachefs_allocator_end_io_read() */
+
+/*****************************************************************************/
+/*
+ * read the next page in the allocator's chain from disk
+ */
+void cachefs_allocator_read_next(struct cachefs_super *super, int noblock)
+{
+	struct inode *inode;
+	struct page *page = super->page_pfree_nx;
+	struct bio *bio;
+
+	_enter("{%x}", super->alloc_pfree_nx);
+
+	ASSERTCMP(super->alloc_pfree_nx, <, super->j.alloc_unready);
+
+	if (PageMappedToDisk(super->page_pfree_nx))
+		return;
+
+	/* see if the page we want is lurking in the page cache */
+	page = find_get_page(page->mapping, super->alloc_pfree_nx);
+	if (page) {
+		/* it is - remove it from the page cache and substitute for the
+		 * next page we have */
+		struct page *xpage;
+
+		kdebug("EXTRACT %p{%lx} FROM CACHE", page, page->index);
+
+		ASSERT(!PageWriteback(page));
+		ASSERT(!PageFsMisc(page));
+		ASSERT(!PageDirty(page));
+		ASSERTCMP(page_private(page), ==, 0);
+
+		if (TestSetPageLocked(page))
+			BUG();
+
+		remove_from_page_cache(page);
+		xpage = super->page_pfree_nx;
+		super->page_pfree_nx = page;
+		xpage->mapping = NULL;
+		cachefs_page_put(xpage);
+		_leave(" [cached]");
+		return;
+	}
+
+	/* re-use the page we already have */
+	page = super->page_pfree_nx;
+	if (TestSetPageLocked(page)) {
+		if (noblock) {
+			kleave(" [noblock locked]");
+			return;
+		}
+
+		lock_page(page);
+	}
+
+	if (PageUptodate(page) || PageError(page)) {
+		ASSERT(PageMappedToDisk(page));
+		ASSERTCMP(page->index, ==, super->alloc_pfree_nx);
+		unlock_page(page);
+		_leave(" [present]");
+		return;
+	}
+
+	/* dispatch a call to perform the read */
+	if (noblock) {
+		bio = bio_alloc(GFP_ATOMIC, 1);
+		if (!bio) {
+			unlock_page(page);
+			kleave(" [noblock nomem]");
+			return;
+		}
+	}
+	else {
+		bio = bio_alloc(GFP_KERNEL | __GFP_WAIT | __GFP_NOFAIL, 1);
+	}
+
+	ASSERT(!PageFsMisc(page));
+
+	page->index = super->alloc_pfree_nx;
+	SetPageMappedToDisk(page);
+
+	inode = page->mapping->host;
+	bio->bi_bdev	= inode->i_sb->s_bdev;
+	bio->bi_sector	= page->index;
+	bio->bi_sector	<<= PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
+	bio->bi_end_io	= cachefs_allocator_end_io_read;
+	bio->bi_private	= super;
+
+	if (!bio_add_page(bio, page, PAGE_SIZE, 0))
+		BUG();
+
+	submit_bio(READ, bio);
+
+	_leave("");
+
+} /* end cachefs_allocator_read_next() */
+
+/*****************************************************************************/
+/*
+ * abort an active operation and any queued ops due to an old I/O error
+ */
+static int cachefs_allocator_error_stop(struct cachefs_operation *op)
+{
+	struct cachefs_operation *xop;
+	struct cachefs_super *super = op->super;
+	struct task_struct *task;
+
+	_enter("");
+
+	spin_lock(&super->alloc_lock);
+
+	list_del_init(&op->alloc_link);
+	op->state = CACHEFS_OP_IO_ERROR;
+
+	/* kill all the other ops in the queue */
+	while (!list_empty(&super->alloc_waitq)) {
+		xop = list_entry(super->alloc_waitq.next,
+				 struct cachefs_operation, alloc_link);
+
+		list_del_init(&xop->alloc_link);
+		task = xop->task;
+		xop->task = NULL;
+		smp_mb();
+		xop->state = CACHEFS_OP_IO_ERROR;
+		wake_up_process(task);
+		put_task_struct(task);
+	}
+
+	spin_unlock(&super->alloc_lock);
+	_leave(" = -EIO [old]");
+	return -EIO;
+
+} /* end cachefs_allocator_error_stop() */
+
+/*****************************************************************************/
+/*
+ * flag an I/O error and abort ops
+ */
+static int cachefs_allocator_io_error(struct cachefs_operation *op)
+{
+	struct cachefs_super *super = op->super;
+
+	_enter("");
+
+	super->j.error = 1;
+
+	/* kill the allocation stack */
+	spin_lock(&super->alloc_lock);
+	SetPageError(super->page_pfree);
+	super->j.alloc_pfree = 0;
+	spin_unlock(&super->alloc_lock);
+
+	/* show the error on the console */
+	if (atomic_read(&super->error_count) < 5) {
+		atomic_inc(&super->error_count);
+
+		printk(KERN_ERR
+		       "CacheFS: I/O Error in allocation stack:"
+		       " stopping cache\n");
+		set_bit(CACHEFS_SUPER_ERROR_STOP, &super->flags);
+	}
+
+	return cachefs_allocator_error_stop(op);
+
+} /* end cachefs_allocator_io_error() */
+
+/*****************************************************************************/
+/*
+ * flag a filesystem error and abort ops
+ */
+static int cachefs_allocator_fs_error(struct cachefs_operation *op)
+{
+	struct cachefs_super *super = op->super;
+
+	_enter("");
+
+	super->j.error = 1;
+
+	/* kill the allocation stack */
+	spin_lock(&super->alloc_lock);
+	SetPageError(super->page_pfree);
+	super->j.alloc_pfree = 0;
+	spin_unlock(&super->alloc_lock);
+
+	/* show the error on the console */
+	if (atomic_read(&super->error_count) < 5) {
+		atomic_inc(&super->error_count);
+
+		printk(KERN_ERR
+		       "CacheFS: Filesystem Error in allocation stack:"
+		       " stopping cache\n");
+		set_bit(CACHEFS_SUPER_ERROR_STOP, &super->flags);
+	}
+
+	return cachefs_allocator_error_stop(op);
+
+} /* end cachefs_allocator_fs_error() */
+
+/*****************************************************************************/
+/*
+ * attempt to pass on the responsibility for allocator management to the next
+ * task waiting in the queue after us
+ */
+static int cachefs_allocator_pass_responsibility(struct cachefs_operation *op)
+{
+	struct cachefs_operation *xop;
+	struct cachefs_super *super = op->super;
+	struct task_struct *task;
+
+	kenter("");
+
+	ASSERTCMP(super->alloc_waitq.next, ==, &op->alloc_link);
+
+	list_del_init(&op->alloc_link);
+
+	if (!list_empty(&super->alloc_waitq)) {
+		xop = list_entry(op->alloc_link.next,
+				 struct cachefs_operation, alloc_link);
+		task = xop->task;
+		xop->task = NULL;
+
+		kdebug("pass allocator to %s", task->comm);
+
+		smp_mb();
+		xop->state = CACHEFS_OP_ALLOCATING;
+		wake_up_process(task);
+		put_task_struct(task);
+	}
+
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_allocator_pass_responsibility() */
+
+/*****************************************************************************/
+/*
+ * actually do the allocation and reclamation in exclusive mode for one
+ * particular operation
+ * - the caller must hold the alloc lock
+ */
+static int cachefs_allocator_exclusive_do_alloc_op(struct cachefs_operation *op,
+						   struct cachefs_ondisc_free_node *alstk,
+						   struct cachefs_ondisc_free_node *rcstk)
+{
+	struct cachefs_super *super = op->super;
+	unsigned apt, rpt, na, nr, acount, rcount;
+	int ret = -EWOULDBLOCK;
+
+	_enter("");
+
+	apt = super->j.alloc_pfree_pt;
+	rpt = super->j.rcm_coll_pt;
+	na = op->n_alloc;
+	nr = op->n_rcm;
+
+	ASSERTCMP(apt, <=, CACHEFS_ONDISC_FREELIST_PTRSPERNODE);
+	ASSERTCMP(rpt, <=, CACHEFS_ONDISC_FREELIST_PTRSPERNODE);
+	ASSERTIF(apt < CACHEFS_ONDISC_FREELIST_PTRSPERNODE, alstk);
+	ASSERT(rcstk);
+	ASSERTIFCMP(alstk, alstk->magic, ==, CACHEFS_ONDISC_FREELIST_READY);
+	ASSERTCMP(rcstk->magic, ==, CACHEFS_ONDISC_FREELIST_PARTIAL);
+
+	/* now service this operation's allocation requests */
+	if (super->j.alloc_unready < super->layout->bix_end &&
+	    na < op->m_alloc
+	    ) {
+		acount = 0;
+		while (super->j.alloc_unready < super->layout->bix_end &&
+		       na < op->m_alloc
+		       ) {
+			_alloc(super, "xo alloc unready %x",
+			       super->j.alloc_unready);
+
+			op->bix_alloc[na++] = super->j.alloc_unready++;
+			acount++;
+		}
+
+		super->space_inprogress -= acount;
+		op->reservation -= acount;
+	}
+
+	acount = rcount = 0;
+	while (na < op->m_alloc) {
+		if (apt >= CACHEFS_ONDISC_FREELIST_PTRSPERNODE)
+			goto progress_halted;
+
+		_alloc(super, "xo alloc %x[%x] to %x",
+		       super->j.alloc_pfree, apt, alstk->ptrs[apt]);
+
+		ASSERTCMP(alstk->ptrs[apt], >=, super->layout->bix_cache);
+		ASSERTCMP(alstk->ptrs[apt], <, super->j.alloc_unready);
+
+		op->bix_alloc[na++] = alstk->ptrs[apt++];
+		acount++;
+	}
+
+	/* and its reclamation requests when we've finished with the
+	 * allocations */
+	while (nr < op->m_rcm) {
+		cachefs_block_t bix;
+
+		if (rpt >= CACHEFS_ONDISC_FREELIST_PTRSPERNODE)
+			goto progress_halted;
+
+		bix = op->bix_rcm[nr];
+		op->bix_rcm[nr] = 0;
+		nr++;
+
+		_alloc(super, "xo rcm %x to %x[%x]",
+		       bix, super->j.rcm_collector, rpt);
+
+		ASSERTCMP(bix, !=, CACHEFS_NULL_PTR);
+		ASSERTCMP(bix, >=, super->layout->bix_cache);
+		ASSERTCMP(bix, <, super->j.alloc_unready);
+
+		rcstk->ptrs[rpt++] = bix;
+		rcount++;
+	}
+
+	/* we completed this operation's requests */
+	ret = 0;
+
+progress_halted:
+	op->n_alloc = na;
+
+	if (nr > op->n_rcm)
+		SetPageDirty(super->page_rcm);
+	op->n_rcm = nr;
+
+	wmb();
+
+	super->j.alloc_pfree_pt = apt;
+	super->j.alloc_pfree_n -= acount;
+	super->j.space_alloc -= acount;
+	super->j.rcm_coll_pt = rpt;
+	super->j.space_rcm += rcount;
+	super->space_inprogress -= acount;
+	op->reservation -= acount;
+
+	_leave(" = %d", ret);
+	return ret;
+
+} /* end cachefs_allocator_exclusive_do_alloc_op() */
+
+/*****************************************************************************/
+/*
+ * actually do the allocation and reclamation in exclusive mode
+ */
+static int cachefs_allocator_exclusive_do_alloc(struct cachefs_operation *op)
+{
+	struct cachefs_ondisc_free_node *alstk, *rcstk;
+	struct cachefs_operation *xop;
+	struct cachefs_super *super = op->super;
+	struct task_struct *task;
+
+	_enter("");
+
+	/* we should have sufficient space to allocate at least one block and
+	 * to reclaim at least one block */
+	ASSERTIFCMP(super->j.alloc_unready >= super->layout->bix_end,
+		    super->j.alloc_pfree, >=, super->layout->bix_cache);
+	ASSERTCMP(super->j.rcm_collector, >=, super->layout->bix_cache);
+
+	ASSERT(super->page_pfree);
+	ASSERT(pfn_valid(page_to_pfn(super->page_pfree)));
+
+	ASSERT(super->page_rcm);
+	ASSERT(pfn_valid(page_to_pfn(super->page_rcm)));
+	ASSERT(PageUptodate(super->page_rcm));
+
+	alstk = NULL;
+	if (PageUptodate(super->page_pfree)) {
+		alstk = kmap_atomic(super->page_pfree, KM_USER0);
+		super->alloc_pfree_nx = alstk->next;
+
+		ASSERT(PageFsMisc(super->page_pfree));
+		ASSERTCMP(alstk->magic, ==, CACHEFS_ONDISC_FREELIST_READY);
+	}
+
+	rcstk = kmap_atomic(super->page_rcm, KM_USER1);
+
+	/* make sure the collector has the right magic number on it
+	 * - it may not if there was a crash and the allocation got journalled
+	 *   before the reclamation node was initialised
+	 */
+	if (rcstk->magic != CACHEFS_ONDISC_FREELIST_PARTIAL) {
+		kdebug("RESET RECLAIM COLLECTOR %x", super->j.rcm_collector);
+
+		ASSERTIFCMP(super->j.rcm_coll_pt > 0,
+			    rcstk->magic, ==, CACHEFS_ONDISC_FREELIST_PARTIAL);
+		memset(rcstk, 0, PAGE_SIZE);
+		rcstk->magic = CACHEFS_ONDISC_FREELIST_PARTIAL;
+	}
+
+	spin_lock(&super->alloc_lock);
+
+	/* make sure we've got a spare reclaim stack collector */
+	if (!super->j.rcm_spare) {
+		/* if there's unready space available, then this should've been
+		 * allocated by the caller */
+		ASSERTCMP(super->j.alloc_unready, ==, super->layout->bix_end);
+
+		_alloc(super, "alloc %x[%x] as rcm spare %x",
+		       super->j.alloc_pfree,
+		       super->j.alloc_pfree_pt,
+		       alstk->ptrs[super->j.alloc_pfree_pt]);
+
+		super->j.rcm_spare = alstk->ptrs[super->j.alloc_pfree_pt++];
+
+		ASSERTCMP(super->j.rcm_spare, >=, super->layout->bix_cache);
+		ASSERTCMP(super->j.rcm_spare, <, super->j.alloc_unready);
+
+		super->j.space_alrcm_nodes++;
+		super->j.alloc_pfree_n--;
+		super->j.space_alloc--;
+		super->space_rcmstk_resv--;
+	}
+
+	/* now service this operation's allocation requests */
+	if (cachefs_allocator_exclusive_do_alloc_op(op, alstk, rcstk) < 0)
+		goto progress_halted;
+
+	/* we completed this operation's request */
+	op->state = CACHEFS_OP_RUNNING;
+
+	/* now service some of the other requests in the queue */
+	while (op->alloc_link.next != &super->alloc_waitq) {
+		xop = list_entry(op->alloc_link.next,
+				 struct cachefs_operation, alloc_link);
+
+		if (cachefs_allocator_exclusive_do_alloc_op(xop, alstk, rcstk
+							    ) < 0)
+			goto progress_halted;
+
+		/* we completed that operation too */
+		list_del_init(&xop->alloc_link);
+		task = xop->task;
+		xop->task = NULL;
+
+		kdebug("allocated for %s", task->comm);
+
+		smp_mb();
+		xop->state = CACHEFS_OP_RUNNING;
+		wake_up_process(task);
+		put_task_struct(task);
+	}
+
+	/* we've cleared the queue */
+	list_del_init(&op->alloc_link);
+
+	spin_unlock(&super->alloc_lock);
+
+	kunmap_atomic(rcstk, KM_USER1);
+	if (alstk)
+		kunmap_atomic(alstk, KM_USER0);
+
+	_leave(" = 0");
+	return 0;
+
+	/* we either allocated everything in the alloc TOS or filled the
+	 * reclaim collector
+	 */
+progress_halted:
+	spin_unlock(&super->alloc_lock);
+
+	kunmap_atomic(rcstk, KM_USER1);
+	if (alstk)
+		kunmap_atomic(alstk, KM_USER0);
+
+	_leave(" = -EWOULDBLOCK");
+	return -EWOULDBLOCK;
+
+} /* end cachefs_allocator_exclusive_do_alloc() */
+
+/*****************************************************************************/
+/*
+ * allocate a new reclamation collector page
+ * - must not return out of memory
+ */
+static void cachefs_allocator_new_rcm_collector(struct cachefs_operation *op)
+{
+	struct cachefs_ondisc_free_node *fnode;
+	struct cachefs_super *super = op->super;
+	struct page *page;
+
+	_enter("");
+
+	if (super->j.rcm_coll_pt != 0) {
+		kdebug("RCM_COLL %x[%x]",
+		       super->j.rcm_collector, super->j.rcm_coll_pt);
+	}
+
+	ASSERTCMP(super->j.rcm_coll_pt, ==, 0);
+	ASSERTCMP(super->page_rcm, ==, NULL);
+
+	/* reuse an old page if there's an old one that's completely written
+	 * out */
+	spin_lock(&super->alloc_lock);
+
+	if (!list_empty(&super->rcm_old_pages)) {
+		page = list_entry(super->rcm_old_pages.prev, struct page, lru);
+
+		ASSERT(pfn_valid(page_to_pfn(page)));
+
+		if (!PageWriteback(page)) {
+			kdebug("reuse %p", page);
+			list_del_init(&page->lru);
+			spin_unlock(&super->alloc_lock);
+			goto obtained_page;
+		}
+	}
+
+	spin_unlock(&super->alloc_lock);
+
+	/* try to allocate a new page */
+	page = alloc_page(GFP_HIGHUSER);
+	if (page) {
+		kdebug("alloc %p", page);
+		atomic_inc(&super->cnt_rcmpages);
+		INIT_LIST_HEAD(&page->lru);
+		page->mapping = super->imeta->i_mapping;
+	}
+	else {
+		/* OOM - take the oldest old page and reuse */
+		ASSERT(!list_empty(&super->rcm_old_pages));
+
+		spin_lock(&super->alloc_lock);
+
+		kdebug("oom, reuse %p", page);
+
+		page = list_entry(super->rcm_old_pages.prev, struct page, lru);
+		list_del_init(&page->lru);
+
+		spin_unlock(&super->alloc_lock);
+	}
+
+	ASSERT(pfn_valid(page_to_pfn(page)));
+
+obtained_page:
+	kdebug("pfn %lx addr %p", page_to_pfn(page), page_address(page));
+
+	wait_on_page_writeback(page);
+	ASSERT(!PageDirty(page));
+
+	page->index = super->j.rcm_collector;
+	SetPageMappedToDisk(page);
+
+	fnode = kmap_atomic(page, KM_USER0);
+
+	ASSERT(fnode != (void *) 0x40000000UL);
+
+	memset(fnode, 0xde, PAGE_SIZE);
+	fnode->magic = CACHEFS_ONDISC_FREELIST_PARTIAL;
+	kunmap_atomic(fnode, KM_USER0);
+
+	SetPageUptodate(page);
+
+	super->page_rcm = page;
+	_leave("");
+
+} /* end cachefs_allocator_new_rcm_collector() */
+
+/*****************************************************************************/
+/*
+ * perform the allocation with total exclusion
+ */
+static int cachefs_allocator_exclusive(struct cachefs_operation *op)
+{
+	struct cachefs_ondisc_free_node *fnode;
+	struct cachefs_super *super = op->super;
+	struct page *page;
+
+	kenter("");
+
+	ASSERTIF(super->j.alloc_pfree_pt < CACHEFS_ONDISC_FREELIST_PTRSPERNODE,
+		 super->j.alloc_pfree != 0);
+	ASSERTIF(super->j.alloc_pfree != 0, super->page_pfree);
+	ASSERTIF(super->page_rcm, pfn_valid(page_to_pfn(super->page_rcm)));
+
+go_again:
+	/* make sure the next allocator stack node has begun loading if we know
+	 * what it is */
+	if (super->alloc_pfree_nx && !PageMappedToDisk(super->page_pfree_nx)) {
+		down(&super->alloc_load_sem);
+		cachefs_allocator_read_next(super, 0);
+		up(&super->alloc_load_sem);
+	}
+
+	/* make sure that there's a reclamation collector page available */
+	ASSERT(super->j.rcm_collector);
+
+	if (!super->page_rcm)
+		cachefs_allocator_new_rcm_collector(op);
+
+	if (!PageUptodate(super->page_rcm)) {
+		wait_on_page_locked(super->page_rcm);
+		if (PageError(super->page_rcm))
+			goto io_error_stop;
+
+		ASSERT(PageUptodate(super->page_rcm));
+
+		fnode = kmap_atomic(super->page_rcm, KM_USER0);
+		ASSERTCMP(fnode->magic, ==, CACHEFS_ONDISC_FREELIST_PARTIAL);
+		kunmap_atomic(fnode, KM_USER0);
+	}
+
+	/* if the reclamation collector is full, then we need to:
+	 * - rotate the collector onto the ready stack
+	 * - rotate the spare to the collector
+	 * - start the collector page writing
+	 * - set up the collector page
+	 */
+	if (super->j.rcm_coll_pt >= CACHEFS_ONDISC_FREELIST_PTRSPERNODE) {
+		kalter(super, "push %x onto reclaim ready %x",
+		       super->j.rcm_collector, super->j.rcm_ready);
+
+		ASSERTCMP(super->j.rcm_coll_pt, ==,
+			  CACHEFS_ONDISC_FREELIST_PTRSPERNODE);
+
+		ASSERT(super->j.rcm_spare != 0);
+
+		down(&super->alloc_load_sem);
+
+		/* link to current TOS */
+		ASSERT(pfn_valid(page_to_pfn(super->page_rcm)));
+
+		fnode = kmap_atomic(super->page_rcm, KM_USER0);
+
+		ASSERTCMP(fnode->magic, ==, CACHEFS_ONDISC_FREELIST_PARTIAL);
+		fnode->magic = CACHEFS_ONDISC_FREELIST_READY;
+
+		fnode->next = super->j.rcm_ready;
+		kunmap_atomic(fnode, KM_USER0);
+		SetPageDirty(super->page_rcm);
+
+		/* throw the collector page at the disk */
+		cachefs_allocator_write_page(super->page_rcm);
+		super->j.rcm_ready_n += CACHEFS_ONDISC_FREELIST_PTRSPERNODE;
+
+		/* move the page onto the old page list */
+		ASSERT(list_empty(&super->page_rcm->lru));
+
+		spin_lock(&super->alloc_lock);
+		list_add(&super->page_rcm->lru, &super->rcm_old_pages);
+
+		if (super->rcm_old_pages.prev != &super->page_rcm->lru)
+			set_bit(CACHEFS_SUPER_REDUCE_OLDRCM, &super->flags);
+
+		super->page_rcm = NULL;
+		spin_unlock(&super->alloc_lock);
+
+		/* rotate in the spare */
+		kalter(super, "rotate rcm spare %x to collector",
+		       super->j.rcm_spare);
+
+		super->j.rcm_ready = super->j.rcm_collector;
+		super->j.rcm_collector = super->j.rcm_spare;
+		super->j.rcm_spare = 0;
+		super->j.rcm_coll_pt = 0;
+
+		up(&super->alloc_load_sem);
+		goto go_again;
+	}
+
+	/* if we're only reclaiming stuff, then there may still be unready
+	 * space from which we can allocate reclamation stack nodes */
+	if (super->j.alloc_unready < super->layout->bix_end) {
+		ASSERTCMP(op->n_alloc, ==, op->m_alloc);
+		ASSERTCMP(op->n_rcm, <, op->m_rcm);
+
+		if (!super->j.rcm_collector) {
+			_alloc(super, "alloc unready %x as rcm coll",
+			       super->j.alloc_unready);
+
+			super->j.rcm_collector = super->j.alloc_unready++;
+			super->j.space_alrcm_nodes++;
+			super->space_rcmstk_resv--;
+			goto go_again;
+		}
+
+		if (!super->j.rcm_spare) {
+			_alloc(super, "alloc unready %x as rcm spare",
+			       super->j.alloc_unready);
+
+			super->j.rcm_spare = super->j.alloc_unready++;
+			super->j.space_alrcm_nodes++;
+			super->space_rcmstk_resv--;
+			goto go_again;
+		}
+	}
+
+	/* make sure that the alloc stack TOS page is ready if we have one */
+	if (super->j.alloc_pfree != 0 &&
+	    !PageUptodate(super->page_pfree)
+	    ) {
+		/* we would need to sleep, so if we've already done our stuff,
+		 * then we pass the responsibility onto the next task desiring
+		 * allocation */
+		if (op->state == CACHEFS_OP_RUNNING)
+			goto pass_on_responsibility;
+
+		ASSERT(PageMappedToDisk(super->page_pfree));
+
+		wait_on_page_locked(super->page_pfree);
+
+		if (PageError(super->page_pfree))
+			goto io_error_stop;
+
+		ASSERT(PageUptodate(super->page_pfree));
+	}
+
+	if (unlikely(test_bit(CACHEFS_SUPER_ERROR_STOP, &super->flags)))
+		goto error_stopped;
+
+	/* validate the stack TOS and sort it */
+	if (super->j.alloc_pfree != 0 && !PageFsMisc(super->page_pfree)) {
+		fnode = kmap_atomic(super->page_pfree, KM_USER0);
+		ASSERTCMP(fnode->magic, ==, CACHEFS_ONDISC_FREELIST_PARTIAL);
+		kunmap_atomic(fnode, KM_USER0);
+
+		if (cachefs_allocator_sort_alloc_TOS(super, super->page_pfree
+						     ) < 0)
+			goto fs_error_stop;
+
+		ASSERT(PageFsMisc(super->page_pfree));
+	}
+
+	/* perform the actual space management if there's allocation space
+	 * available */
+	_debug("do second stage alloc [%x]", super->j.alloc_pfree_pt);
+
+	if (super->j.alloc_pfree_pt < CACHEFS_ONDISC_FREELIST_PTRSPERNODE ||
+	    super->j.alloc_unready < super->layout->bix_end
+	    ) {
+		if (cachefs_allocator_exclusive_do_alloc(op) == 0)
+			goto allocation_complete;
+	}
+
+	/* we may need to reclaim the allocation TOS node, but we can't if
+	 * there's no space in the reclamation collector */
+	if (super->j.rcm_coll_pt >= CACHEFS_ONDISC_FREELIST_PTRSPERNODE)
+		goto go_again;
+
+	/* we can only get here by having run out of all immediately available
+	 * allocatable blocks */
+	ASSERTCMP(super->j.alloc_unready, ==, super->layout->bix_end);
+	ASSERTCMP(super->j.alloc_pfree_pt, ==,
+		  CACHEFS_ONDISC_FREELIST_PTRSPERNODE);
+
+	/* reset the page holding the allocation stack TOS node */
+	ClearPageError(super->page_pfree);
+	ClearPageFsMisc(super->page_pfree);
+	ClearPageUptodate(super->page_pfree);
+	ClearPageMappedToDisk(super->page_pfree);
+	super->page_pfree->index = 0;
+
+	/* rotate the allocation stacks to make sure there's space available */
+	if (super->j.alloc_pfree) {
+		/* reclaim the old allocator stack TOS node */
+		struct cachefs_ondisc_free_node *rcstk;
+		cachefs_block_t bix;
+
+		bix = super->j.alloc_pfree;
+
+		_alloc(super, "rcm alloc TOS %x to %x[%x]",
+		       bix,
+		       super->j.rcm_collector,
+		       super->j.rcm_coll_pt);
+
+		ASSERTCMP(bix, >=, super->layout->bix_cache);
+		ASSERTCMP(bix, <, super->j.alloc_unready);
+
+		rcstk = kmap_atomic(super->page_rcm, KM_USER1);
+		rcstk->ptrs[super->j.rcm_coll_pt] = bix;
+		kunmap_atomic(rcstk, KM_USER1);
+		SetPageDirty(super->page_rcm);
+
+		wmb();
+		super->j.rcm_coll_pt++;
+
+		super->j.space_alrcm_nodes--;
+		super->j.space_rcm++;
+		super->j.alloc_pfree = 0;
+	}
+
+	/* rotate the primary allocation stack 2OS to TOS if we can */
+	if (super->alloc_pfree_nx) {
+		_alter(super, "rotate alloc 2OS %x to TOS",
+		       super->alloc_pfree_nx);
+
+		ASSERTCMP(super->alloc_pfree_nx, >=, super->layout->bix_cache);
+		ASSERT(PageMappedToDisk(super->page_pfree_nx));
+
+		super->j.alloc_pfree_pt = 0;
+		super->j.alloc_pfree = super->alloc_pfree_nx;
+		super->alloc_pfree_nx = 0;
+
+		page = super->page_pfree;
+		super->page_pfree = super->page_pfree_nx;
+		super->page_pfree_nx = page;
+		goto go_again;
+	}
+
+	ASSERTCMP(super->j.alloc_pfree_n, ==, 0);
+
+	/* rotate the secondary allocation stack to the primary */
+	if (super->j.alloc_sfree) {
+		_alter(super, "transfer 2nd alloc stack %x (#%u) to primary",
+		       super->j.alloc_sfree, super->j.alloc_sfree_n);
+
+		ASSERTCMP(super->j.alloc_sfree, >=, super->layout->bix_cache);
+		ASSERT(super->page_sfree);
+		ASSERT(PageMappedToDisk(super->page_sfree));
+
+		super->j.alloc_pfree_n = super->j.alloc_sfree_n;
+		super->j.alloc_sfree_n = 0;
+		super->j.alloc_pfree_pt = 0;
+		super->j.alloc_pfree = super->j.alloc_sfree;
+		smp_wmb();
+		super->j.alloc_sfree = 0;
+
+		page = super->page_pfree;
+		super->page_pfree = super->page_sfree;
+		super->page_sfree = page;
+		goto go_again;
+	}
+
+	/* we should never reach here as we should always have made sure there
+	 * are enough unreserved laundered blocks available (space_alloc less
+	 * space_inprogress) before letting an operation run */
+	BUG();
+	return UINT_MAX;
+
+	/* attempt to pass on the responsibility for allocator management to
+	 * the next task waiting in the queue after us */
+pass_on_responsibility:
+	return cachefs_allocator_pass_responsibility(op);
+
+	/* we detected an I/O error in the allocation stack, so we need to stop
+	 * all further caching operations */
+io_error_stop:
+	return cachefs_allocator_io_error(op);
+
+	/* we detected a filesystem error in the allocation stack, so we need
+	 * to stop all further caching operations */
+fs_error_stop:
+	return cachefs_allocator_fs_error(op);
+
+	/* caching has been stopped due to an I/O error */
+error_stopped:
+	return cachefs_allocator_error_stop(op);
+
+	/* all done, including everything in the queue */
+allocation_complete:
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_allocator_exclusive() */
+
+/*****************************************************************************/
+/*
+ * do slow allocation of a block
+ */
+static int cachefs_allocator_slow(struct cachefs_operation *op)
+{
+	struct cachefs_super *super = op->super;
+
+	kenter("");
+
+	/* register our interest in allocating a block */
+	list_add_tail(&op->alloc_link, &super->alloc_waitq);
+
+	if (super->alloc_waitq.next == &op->alloc_link) {
+		/* we'll be in control as we're at the front of the queue */
+		op->task = NULL;
+
+		/* we can now drop the spinlock; since we're in the queue, no
+		 * one else can use the allocator without our say-so */
+		spin_unlock(&super->alloc_lock);
+	}
+	else {
+		kdebug("wait");
+
+		/* we're going to be waiting for someone else to do our work
+		 * for us */
+		op->state = CACHEFS_OP_WAITING_FOR_BLOCK;
+		op->task = current;
+		get_task_struct(op->task);
+
+		/* we can now drop the spinlock; our presence in the contention
+		 * queue prevents anyone after us getting blocks first */
+		spin_unlock(&super->alloc_lock);
+
+		for (;;) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (op->state == CACHEFS_OP_RUNNING)
+				break;
+			schedule();
+		}
+
+		__set_current_state(TASK_RUNNING);
+
+		/* our allocation and releasing may have been done by whoever
+		 * was in control of the allocator */
+		if (op->state == CACHEFS_OP_RUNNING) {
+			kleave(" = 0 [given]");
+			return 0;
+		}
+
+		if (op->state == CACHEFS_OP_IO_ERROR) {
+			kleave(" = -EIO");
+			return -EIO;
+		}
+	}
+
+	/* we've got total control of the allocator - it's up to us to set up
+	 * the allocation system again */
+	ASSERTCMP(op->task, ==, NULL);
+	ASSERTCMP(op->state, ==, CACHEFS_OP_ALLOCATING);
+	ASSERTCMP(super->alloc_waitq.next, ==, &op->alloc_link);
+
+	return cachefs_allocator_exclusive(op);
+
+} /* end cachefs_allocator_slow() */
+
+/*****************************************************************************/
+/*
+ * allocate blocks from the primary alloc stack
+ * - the caller must hold the alloc lock
+ */
+static int cachefs_do_fast_allocate(struct cachefs_operation *op)
+{
+	struct cachefs_ondisc_free_node *fnode;
+	struct cachefs_super *super = op->super;
+	unsigned pt, na, n;
+
+	_enter("%d/%d", op->n_alloc, op->m_alloc);
+
+	if (op->n_alloc < op->m_alloc) {
+		/* can't do anything if the alloc stack TOS is not loaded or
+		 * sorted */
+		if (!PageUptodate(super->page_pfree) ||
+		    !PageFsMisc(super->page_pfree))
+			goto allocation_not_available;
+
+		/* defer if the primary allocation stack is empty */
+		pt = super->j.alloc_pfree_pt;
+		if (pt >= CACHEFS_ONDISC_FREELIST_PTRSPERNODE)
+			goto allocation_not_available;
+
+		/* allocate as many of the requested blocks as we can */
+		fnode = kmap_atomic(super->page_pfree, KM_USER0);
+
+		ASSERTCMP(fnode->magic, ==, CACHEFS_ONDISC_FREELIST_READY);
+
+		super->alloc_pfree_nx = fnode->next;
+
+		na = op->n_alloc;
+		n = 0;
+
+		while (na < op->m_alloc) {
+			_alloc(super, "alloc %x[%x] to %x",
+			       super->j.alloc_pfree, pt, fnode->ptrs[pt]);
+
+			ASSERTCMP(fnode->ptrs[pt], >=, super->layout->bix_cache);
+			ASSERTCMP(fnode->ptrs[pt], <, super->j.alloc_unready);
+
+			op->bix_alloc[na++] = fnode->ptrs[pt++];
+			n++;
+
+			if (pt >= CACHEFS_ONDISC_FREELIST_PTRSPERNODE)
+				break;
+		}
+
+		op->n_alloc = na;
+
+		kunmap_atomic(fnode, KM_USER0);
+
+		super->j.alloc_pfree_pt = pt;
+		super->j.alloc_pfree_n -= n;
+		super->j.space_alloc -= n;
+		super->space_inprogress -= n;
+		op->reservation -= n;
+
+		if (op->n_alloc < op->m_alloc)
+			goto allocation_not_available;
+	}
+
+	return 0;
+
+allocation_not_available:
+	_leave(" = -EWOULDBLOCK");
+	return -EWOULDBLOCK;
+
+} /* end cachefs_do_fast_allocate() */
+
+/*****************************************************************************/
+/*
+ * reclaim blocks
+ * - the caller must hold the alloc lock
+ */
+static int cachefs_do_fast_reclaim(struct cachefs_operation *op)
+{
+	struct cachefs_ondisc_free_node *fnode;
+	struct cachefs_super *super = op->super;
+	cachefs_block_t bix;
+	unsigned pt, n;
+
+	_enter("%d/%d", op->n_rcm, op->m_rcm);
+
+	ASSERTCMP(super->j.rcm_coll_pt, <=,
+		  CACHEFS_ONDISC_FREELIST_PTRSPERNODE);
+	ASSERTCMP(super->j.rcm_collector, >=, super->layout->bix_cache);
+
+	if (op->n_rcm < op->m_rcm) {
+		/* can't do anything if the reclaim collector system is not
+		 * available */
+		if (!super->j.rcm_spare ||
+		    !super->page_rcm ||
+		    !PageUptodate(super->page_rcm))
+			goto reclamation_not_available;
+
+		/* defer if the reclaim collector is full */
+		pt = super->j.rcm_coll_pt;
+		if (pt >= CACHEFS_ONDISC_FREELIST_PTRSPERNODE)
+			goto reclamation_not_available;
+
+		/* reclaim as many of the requested blocks as we can */
+		fnode = kmap_atomic(super->page_rcm, KM_USER0);
+
+		ASSERTCMP(fnode->magic, ==, CACHEFS_ONDISC_FREELIST_PARTIAL);
+
+		n = op->n_rcm;
+		while (n < op->m_rcm) {
+			bix = op->bix_rcm[n];
+			op->bix_rcm[n] = 0;
+			n++;
+
+			_alloc(super, "rcm %x to %x[%x]",
+			       bix, super->j.rcm_collector, pt);
+
+			ASSERTCMP(bix, !=, CACHEFS_NULL_PTR);
+			ASSERTCMP(bix, >=, super->layout->bix_cache);
+			ASSERTCMP(bix, <, super->j.alloc_unready);
+
+			fnode->ptrs[pt++] = bix;
+			if (pt >= CACHEFS_ONDISC_FREELIST_PTRSPERNODE)
+				break;
+		}
+
+		kunmap_atomic(fnode, KM_USER0);
+
+		if (n > 0)
+			SetPageDirty(super->page_rcm);
+
+		wmb();
+		op->n_rcm = n;
+		super->j.rcm_coll_pt = pt;
+		super->j.space_rcm += n;
+
+		if (n < op->m_rcm)
+			goto reclamation_not_available;
+	}
+
+	return 0;
+
+reclamation_not_available:
+	_leave(" = -EWOULDBLOCK");
+	return -EWOULDBLOCK;
+
+} /* end cachefs_do_fast_reclaim() */
+
+/*****************************************************************************/
+/*
+ * finish off the fast allocation by writing out the reclaim list and starting
+ * the next page in the allocation stack loading
+ * - called with the alloc spinlock held
+ */
+static int cachefs_allocator_finish(struct cachefs_operation *op)
+{
+	struct cachefs_ondisc_free_node *fnode;
+	struct cachefs_super *super = op->super;
+
+	_enter("");
+
+	if (down_trylock(&super->alloc_load_sem) != 0) {
+		spin_unlock(&super->alloc_lock);
+		op->state = CACHEFS_OP_RUNNING;
+		_leave(" = 0 [nolock]");
+		return 0;
+	}
+
+	/* start the next alloc stack node loading */
+	if (super->alloc_pfree_nx && !PageMappedToDisk(super->page_pfree_nx))
+		cachefs_allocator_read_next(super, 1);
+
+	/* advance the reclamation process
+	 * - rotate the collector onto the ready stack
+	 * - rotate the spare to the collector
+	 * - start the collector page writing
+	 * - set up the collector page
+	 */
+	if (super->j.rcm_coll_pt >= CACHEFS_ONDISC_FREELIST_PTRSPERNODE &&
+	    super->j.rcm_spare &&
+	    super->page_rcm
+	    ) {
+		kalter(super, "push %x onto reclaim ready %x",
+		       super->j.rcm_collector, super->j.rcm_ready);
+
+		ASSERT(pfn_valid(page_to_pfn(super->page_rcm)));
+
+		fnode = kmap_atomic(super->page_rcm, KM_USER0);
+
+		ASSERTCMP(fnode->magic, ==, CACHEFS_ONDISC_FREELIST_PARTIAL);
+		fnode->magic = CACHEFS_ONDISC_FREELIST_READY;
+
+		fnode->next = super->j.rcm_ready;
+		kunmap_atomic(fnode, KM_USER0);
+		SetPageDirty(super->page_rcm);
+
+		/* attempt to throw the page at the disk */
+		if (cachefs_allocator_write_page_nowait(super->page_rcm) == 0
+		    ) {
+			/* rotate the pointers in journal */
+			super->j.rcm_ready = super->j.rcm_collector;
+			super->j.rcm_collector = super->j.rcm_spare;
+			super->j.rcm_spare = 0;
+
+			super->j.rcm_ready_n += CACHEFS_ONDISC_FREELIST_PTRSPERNODE;
+			super->j.rcm_coll_pt = 0;
+
+			ASSERT(super->j.rcm_collector != 0);
+
+			/* move the page onto the old page list */
+			ASSERT(list_empty(&super->page_rcm->lru));
+
+			list_add(&super->page_rcm->lru, &super->rcm_old_pages);
+
+			if (super->rcm_old_pages.prev != &super->page_rcm->lru)
+				set_bit(CACHEFS_SUPER_REDUCE_OLDRCM, &super->flags);
+
+			super->page_rcm = NULL;
+		}
+	}
+
+	spin_unlock(&super->alloc_lock);
+	up(&super->alloc_load_sem);
+	op->state = CACHEFS_OP_RUNNING;
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_allocator_finish() */
+
+/*****************************************************************************/
+/*
+ * allocate and reclaim blocks
+ */
+int cachefs_allocator(struct cachefs_operation *op)
+{
+	struct cachefs_super *super = op->super;
+	int ret1, ret2;
+
+	_enter("{%d,%d}", op->m_alloc, op->m_rcm);
+
+	ASSERTCMP(op->state, ==, CACHEFS_OP_RUNNING);
+	ASSERT(op->m_alloc || op->m_rcm);
+	ASSERT(super->page_pfree);
+	ASSERT(super->page_pfree_nx);
+
+	if (unlikely(test_bit(CACHEFS_SUPER_ERROR_STOP, &super->flags))) {
+		op->state = CACHEFS_OP_IO_ERROR;
+		_leave(" = -EIO");
+		return -EIO;
+	}
+
+	op->state = CACHEFS_OP_ALLOCATING;
+	op->n_alloc = 0;
+	op->n_rcm = 0;
+
+	/* check the operation hasn't run out of blocks yet */
+	ASSERTCMP(op->reservation, >=, op->n_alloc);
+
+	spin_lock(&super->alloc_lock);
+
+	ASSERTIF(super->page_rcm, pfn_valid(page_to_pfn(super->page_rcm)));
+
+	/* we can allocate and reclaim immediately if there isn't a queue */
+	if (list_empty(&super->alloc_waitq)) {
+		ASSERTIFCMP(super->j.alloc_pfree,
+			    super->page_pfree->index, ==,
+			    super->j.alloc_pfree);
+
+		/* use unready blocks if possible */
+		while (op->n_alloc < op->m_alloc &&
+		       super->layout->bix_end - super->j.alloc_unready > 0
+		       ) {
+			_alloc(super, "alloc unready %x",
+			       super->j.alloc_unready);
+
+			super->space_inprogress--;
+			op->reservation--;
+			op->bix_alloc[op->n_alloc++] =
+				super->j.alloc_unready++;
+		}
+
+		if (op->n_alloc == op->m_alloc && op->m_rcm == 0)
+			goto fast_allocation_complete;
+
+		/* access the allocation and reclamation stacks */
+		ret1 = cachefs_do_fast_allocate(op);
+		ret2 = cachefs_do_fast_reclaim(op);
+		if (ret1 == 0 && ret2 == 0)
+			goto fast_allocation_complete;
+	}
+
+	/* fall back to the slow path */
+	return cachefs_allocator_slow(op);
+
+fast_allocation_complete:
+	ASSERTCMP(op->n_alloc, ==, op->m_alloc);
+	ASSERTCMP(op->n_rcm, ==, op->m_rcm);
+
+	/* start the second block in the alloc list loading and write out the
+	 * reclaim list collector if necessary */
+	if ((super->alloc_pfree_nx && !PageUptodate(super->page_pfree_nx)) ||
+	    super->j.rcm_coll_pt >= CACHEFS_ONDISC_FREELIST_PTRSPERNODE)
+		return cachefs_allocator_finish(op);
+
+	spin_unlock(&super->alloc_lock);
+
+	op->state = CACHEFS_OP_RUNNING;
+	_leave(" = 0 [s %d,%d]", op->n_alloc, op->n_rcm);
+	return 0;
+
+} /* end cachefs_allocator() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/cachefs-debug.h linux-2.6.14-mm2-cachefs/fs/cachefs/cachefs-debug.h
--- linux-2.6.14-mm2/fs/cachefs/cachefs-debug.h	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/cachefs-debug.h	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,132 @@
+/* cachefs-debug.h: CacheFS debugging
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _FS_CACHEFS_DEBUG_H
+#define _FS_CACHEFS_DEBUG_H
+
+/*****************************************************************************/
+/*
+ * debug tracing
+ */
+extern int cachefs_debug;
+
+#define dbgprintk(FMT,...) \
+	printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__)
+#define _dbprintk(FMT,...) do { } while(0)
+
+#define kenter(FMT,...)	dbgprintk("==> %s("FMT")",__FUNCTION__ ,##__VA_ARGS__)
+#define kleave(FMT,...)	dbgprintk("<== %s()"FMT"",__FUNCTION__ ,##__VA_ARGS__)
+#define kdebug(FMT,...)	dbgprintk(FMT ,##__VA_ARGS__)
+
+#define kalloc(SUPER, FMT,...) \
+	dbgprintk("+++ %s: "FMT"", (SUPER)->cache.tag->name ,##__VA_ARGS__)
+
+#define kalter(SUPER, FMT,...) \
+	dbgprintk("[>] %s: "FMT"", (SUPER)->cache.tag->name ,##__VA_ARGS__)
+
+#define kjournal(FMT,...) _dbprintk(FMT ,##__VA_ARGS__)
+
+#if defined(__KENTER) || defined(__KDEBUGALL)
+#define _enter(FMT,...)	kenter(FMT,##__VA_ARGS__)
+#else
+#define _enter(FMT,...)	do { } while(0)
+#endif
+
+#if defined(__KLEAVE) || defined(__KDEBUGALL)
+#define _leave(FMT,...)	kleave(FMT,##__VA_ARGS__)
+#else
+#define _leave(FMT,...)	do { } while(0)
+#endif
+
+#if defined(__KALLOC) || defined(__KDEBUG) || defined(__KDEBUGALL)
+#define _alloc(FMT,...)	kalloc(FMT,##__VA_ARGS__)
+#else
+#define _alloc(FMT,...)	do { } while(0)
+#endif
+
+#if defined(__KALTER) || defined(__KDEBUG) || defined(__KDEBUGALL)
+#define _alter(FMT,...)	kalter(FMT,##__VA_ARGS__)
+#else
+#define _alter(FMT,...)	do { } while(0)
+#endif
+
+#if defined(__KDEBUG) || defined(__KDEBUGALL)
+#define _debug(FMT,...)	kdebug(FMT,##__VA_ARGS__)
+#else
+#define _debug(FMT,...)	do { } while(0)
+#endif
+
+#if 1 // defined(__KDEBUGALL)
+
+#define ASSERT(X)						\
+do {								\
+	if (unlikely(!(X))) {					\
+		printk(KERN_ERR "\n");				\
+		printk(KERN_ERR "CacheFS: Assertion failed\n");	\
+		BUG();						\
+	}							\
+} while(0)
+
+#define ASSERTCMP(X, OP, Y)					\
+do {								\
+	if (unlikely(!((X) OP (Y)))) {				\
+		printk(KERN_ERR "\n");				\
+		printk(KERN_ERR "CacheFS: Assertion failed\n");	\
+		printk(KERN_ERR "%lx " #OP " %lx is false\n",	\
+		       (unsigned long)(X), (unsigned long)(Y));	\
+		BUG();						\
+	}							\
+} while(0)
+
+#define ASSERTIF(C, X)						\
+do {								\
+	if (unlikely((C) && !(X))) {				\
+		printk(KERN_ERR "\n");				\
+		printk(KERN_ERR "CacheFS: Assertion failed\n");	\
+		BUG();						\
+	}							\
+} while(0)
+
+#define ASSERTIFCMP(C, X, OP, Y)				\
+do {								\
+	if (unlikely((C) && !((X) OP (Y)))) {			\
+		printk(KERN_ERR "\n");				\
+		printk(KERN_ERR "CacheFS: Assertion failed\n");	\
+		printk(KERN_ERR "%lx " #OP " %lx is false\n",	\
+		       (unsigned long)(X), (unsigned long)(Y));	\
+		BUG();						\
+	}							\
+} while(0)
+
+#else
+
+#define ASSERT(X)				\
+do {						\
+} while(0)
+
+#define ASSERTCMP(X, OP, Y)			\
+do {						\
+} while(0)
+
+#define ASSERTIF(C, X)				\
+do {						\
+} while(0)
+
+#define ASSERTIFCMP(C, X, OP, Y)		\
+do {						\
+} while(0)
+
+#endif
+
+extern void dump_bio(struct bio *bio, int n);
+
+
+#endif /* _FS_CACHEFS_DEBUG_H */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/cachefs-inode.h linux-2.6.14-mm2-cachefs/fs/cachefs/cachefs-inode.h
--- linux-2.6.14-mm2/fs/cachefs/cachefs-inode.h	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/cachefs-inode.h	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,64 @@
+/* cachefs-inode.h: CacheFS inode handling
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _FS_CACHEFS_INODE_H
+#define _FS_CACHEFS_INODE_H
+
+enum cachefs_inode_numbers {
+	CACHEFS_INO_NULL		= 0x00000000,
+	CACHEFS_INO_IMETA		= 0x00000001,
+	CACHEFS_INO_STATUS		= 0x00000002,
+	CACHEFS_INO_ROOTDIR		= 0x00000003,
+	CACHEFS_INO__FIRST_FILE
+};
+
+/*****************************************************************************/
+/*
+ * on-disc per-cache inode record
+ */
+struct cachefs_inode
+{
+	struct inode		vfs_inode;	/* VFS inode record for this file */
+};
+
+extern struct inode_operations cachefs_status_inode_operations;
+extern struct file_operations cachefs_status_file_operations;
+
+#define CACHEFS_FS_I(inode) \
+	container_of((inode), struct cachefs_inode, vfs_inode)
+
+extern struct cachefs_inode *cachefs_iget(struct cachefs_super *super,
+					  ino_t ino);
+extern int cachefs_write_inode(struct inode *_inode, int sync);
+extern void cachefs_clear_inode(struct inode *vfs_inode);
+
+static inline struct cachefs_inode *cachefs_igrab(struct cachefs_inode *iinode)
+{
+	struct inode *inode = igrab(&iinode->vfs_inode);
+	return inode ? CACHEFS_FS_I(inode) : NULL;
+}
+
+static inline void cachefs_iput(struct cachefs_inode *inode)
+{
+	if (inode)
+		iput(&inode->vfs_inode);
+}
+
+
+extern struct fscache_cache_ops cachefs_cache_ops;
+extern struct address_space_operations cachefs_meta_addrspace_operations;
+extern struct file_operations cachefs_root_file_operations;
+extern struct inode_operations cachefs_root_inode_operations;
+
+extern int cachefs_fs_init(void);
+extern void cachefs_fs_exit(void);
+
+#endif /* _FS_CACHEFS_INODE_H */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/cachefs-int.h linux-2.6.14-mm2-cachefs/fs/cachefs/cachefs-int.h
--- linux-2.6.14-mm2/fs/cachefs/cachefs-int.h	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/cachefs-int.h	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,739 @@
+/* cachefs-int.h: general filesystem caching internal defs
+ *
+ * Copyright (C) 2004-5 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _FS_CACHEFS_INT_H
+#define _FS_CACHEFS_INT_H
+
+#include <linux/fscache-cache.h>
+#include <linux/timer.h>
+#include <linux/pagevec.h>
+#include <linux/bio.h>
+#include "cachefs-layout.h"
+#include "cachefs-debug.h"
+
+struct cachefs_operation;
+
+#define CACHEFS_DEFAULT_AUTOWANDER_TIMER 3 /* seconds */
+
+#define CACHEFS_DEBUG_TREE_ACCOUNTING	0
+#define CACHEFS_DEBUG_OBJECT_ACCOUNTING	0
+
+#ifdef CONFIG_DEBUG_SLAB
+#define CACHEFS_DEBUG_SLAB 1
+#endif
+
+struct cachefs_super;
+struct cachefs_object;
+struct cachefs_cursor;
+
+extern int kcachefsd(void *_super);
+extern int kreaperd(void *_super);
+
+struct cachefs_cull {
+	uint64_t	ino;
+	time_t		atime;
+};
+
+extern int cachefs_digest_key(struct cachefs_object *object,
+			      const struct cachefs_ondisc_leaf *leaf);
+
+enum cachefs_rcy_state {
+	CACHEFS_RCY_INACTIVE,
+	CACHEFS_RCY_LOADING_NODE,
+	CACHEFS_RCY_PROCESSING_NODE,
+	CACHEFS_RCY_LOADING_PTRBLK,
+	CACHEFS_RCY_PROCESSING_PTRBLK,
+	CACHEFS_RCY_CONSUME_POINTERS_I,
+	CACHEFS_RCY_CONSUME_POINTERS,
+	CACHEFS_RCY_CONSUME_PTRBLK_I,
+	CACHEFS_RCY_CONSUME_PTRBLK,
+	CACHEFS_RCY_CONSUME_NODE_I,
+	CACHEFS_RCY_CONSUME_NODE,
+	CACHEFS_RCY__NSTATES
+} __attribute__((packed));
+
+typedef void (*cachefs_recycle_operation_t)(struct cachefs_super *super,
+					    struct cachefs_operation *op);
+extern const cachefs_recycle_operation_t cachefs_recycle_operations[CACHEFS_RCY__NSTATES];
+
+extern int cachefs_recycle_validate_node(struct cachefs_super *super,
+					 struct page *page,
+					 int rcysp);
+
+enum cachefs_scan_state {
+	CACHEFS_SCAN_INACTIVE,
+	CACHEFS_SCAN_LOADING_REAP_LIST,
+	CACHEFS_SCAN_DESCENDING,
+	CACHEFS_SCAN_VALIDATING_NODE,
+	CACHEFS_SCAN_SCANNING_NODE,
+	CACHEFS_SCAN_ASCENDING,
+	CACHEFS_SCAN_COMPLETING_SCAN,
+	CACHEFS_SCAN_COMPLETING_REAP,
+	CACHEFS_SCAN_ADVANCING_REAP_LIST,
+	CACHEFS_SCAN_SYNCING_CACHE,
+	CACHEFS_SCAN_FINISHED,
+	CACHEFS_SCAN_REAPING_OBJECT,
+	CACHEFS_SCAN_WAITING_FOR_REAPER,
+	CACHEFS_SCAN__NSTATES
+} __attribute__((packed));
+
+typedef void (*cachefs_scan_operation_t)(struct cachefs_super *super);
+extern const cachefs_scan_operation_t cachefs_scan_operations[CACHEFS_SCAN__NSTATES];
+
+/*****************************************************************************/
+/*
+ * record of a journal transaction
+ */
+struct cachefs_journal
+{
+	struct list_head	link;		/* link in writeback list */
+	struct cachefs_super	*super;		/* superblock being transacted */
+	struct cachefs_journal	*dependent;	/* transaction dependent on this one */
+	struct list_head	syncwq;		/* wake up queue for sync calls */
+	atomic_t		remaining;	/* count of remaining items */
+	int			journalled;	/* T if done */
+	cachefs_block_t		alloc_sfree;	/* update to the secondary alloc stack */
+	uint32_t		alloc_sfree_n;	/* update to the 2nd alloc stack count */
+	int32_t			serial;		/* journal serial number */
+};
+
+extern kmem_cache_t *cachefs_journal_jar;
+
+extern int cachefs_journal_process(struct cachefs_super *super);
+extern void cachefs_journal_release(struct cachefs_journal *jnl);
+
+/*****************************************************************************/
+/*
+ * record of an operation
+ */
+enum cachefs_op_type {
+	CACHEFS_OP_INSERT_LEAF,		/* object insertion operation */
+	CACHEFS_OP_DELETE_LEAF,		/* object deletion operation */
+	CACHEFS_OP_UPDATE_LEAF,		/* object update operation */
+	CACHEFS_OP_INSERT_DATA,		/* data blocks insertion operation */
+	CACHEFS_OP_RECYCLE_DATA,	/* data tree tear down and reclaim operation */
+} __attribute__((packed));
+
+enum cachefs_op_state {
+	CACHEFS_OP_INACTIVE,		/* op not doing anything */
+	CACHEFS_OP_RESERVING,		/* op waiting for sufficient space to start */
+	CACHEFS_OP_RUNNING,		/* op is running with space reserved */
+	CACHEFS_OP_ALLOCATING,		/* op is allocating a block */
+	CACHEFS_OP_WAITING_FOR_BLOCK,	/* op is waiting to be given a block (or alloc control) */
+	CACHEFS_OP_IO_ERROR,		/* op aborted due to I/O error */
+} __attribute__((packed));
+
+struct cachefs_operation {
+	struct list_head	op_link;	/* link in operations queues */
+	struct list_head	alloc_link;	/* link in allocation contention queue */
+	struct cachefs_super	*super;
+	struct cachefs_object	*object;
+	struct task_struct	*task;		/* task sleeping on op (NULL if running) */
+
+	union {
+		struct cachefs_tree *nodes[4];
+		struct {
+			struct cachefs_tree *inode;
+			struct cachefs_tree *old_root;
+			struct cachefs_tree *new_root;
+			struct cachefs_tree *new_alloc;
+		} isize;
+		struct {
+			struct cachefs_tree *inode;
+			struct cachefs_tree *point;
+			struct cachefs_tree *next;
+		} data;
+		struct {
+			struct cachefs_tree *point;
+			struct cachefs_tree *next;
+			struct cachefs_tree *pruneto;
+		} del;
+	} p;
+
+	enum cachefs_op_type	reason;		/* reason for allocation */
+	enum cachefs_op_state	state;		/* operation state */
+	unsigned		data_space;	/* amount of data space requested */
+	unsigned		reservation;	/* number of blocks to be reserved */
+	unsigned		alrs_resv;	/* alloc/reclaim stack management reservation */
+	unsigned		excess;		/* excess that must be available over reservation */
+
+	/* allocation/reclamation request parameters & returns */
+#define CACHEFS_OP_MAX_BLOCKS 4
+	cachefs_block_t		bix_alloc[CACHEFS_OP_MAX_BLOCKS]; /* blocks allocated */
+	cachefs_block_t		bix_rcm[CACHEFS_OP_MAX_BLOCKS]; /* blocks to be reclaimed */
+	uint8_t			n_alloc;	/* number allocated */
+	uint8_t			m_alloc;	/* number of allocations requested */
+	uint8_t			n_rcm;		/* number reclaimed */
+	uint8_t			m_rcm;		/* number of reclamations requested */
+};
+
+extern int cachefs_operation_begin(struct cachefs_operation *op);
+extern int cachefs_operation_begin_kcachefsd(struct cachefs_operation *op);
+extern void cachefs_operation_end(struct cachefs_operation *op);
+extern void cachefs_operation_end_kcachefsd(struct cachefs_operation *op);
+extern void cachefs_operation_run(struct cachefs_super *super);
+extern int cachefs_allocator(struct cachefs_operation *op);
+extern void cachefs_allocator_write_page(struct page *page);
+extern int cachefs_allocator_write_page_nowait(struct page *page);
+extern void cachefs_allocator_read_next(struct cachefs_super *super, int noblock);
+extern int cachefs_allocator_end_io_read(struct bio *bio, unsigned int bytes_done, int err);
+
+extern int cachefs_replace_node(struct cachefs_operation *op,
+				struct cachefs_tree *node);
+
+extern void cachefs_replace_add_to_page_cache(struct cachefs_operation *op,
+					      struct cachefs_tree *node);
+
+extern int cachefs_trans_reclaim_block(struct cachefs_super *super, cachefs_block_t bix);
+
+
+/*****************************************************************************/
+/*
+ * netfs page I/O completion callback
+ */
+struct cachefs_io_callback
+{
+	struct cachefs_super	*super;
+	struct cachefs_journal	*jnl;
+	fscache_rw_complete_t	callback_func;
+	void			*callback_data;
+	atomic_t		usage;
+};
+
+extern int cachefs_netfs_io_completion(struct bio *bio,
+				       unsigned int bytes_done,
+				       int error);
+
+static inline void cachefs_io_callback_put(struct cachefs_io_callback *callback)
+{
+	if (atomic_dec_and_test(&callback->usage)) {
+		_debug("free callback %p", callback);
+		if (callback->jnl)
+			cachefs_journal_release(callback->jnl);
+		kfree(callback);
+	}
+}
+
+/*****************************************************************************/
+/*
+ * metadata tree cached branch node
+ * - a branch node contains a set of heterogeneous leaves, where each leaf may
+ *   be one of:
+ *   - an index node object
+ *   - a data file node object
+ *   - an array of pointers to further branches
+ *   - an array of pointers to data blocks
+ *   - a shortcut for a piece of keyspace not immediately distinguishable by
+ *     pointers on this node
+ */
+struct cachefs_tree
+{
+	struct cachefs_tree		*parent;	/* parent node */
+	struct cachefs_object		*object;	/* owner of level 1 dataptrblk */
+	struct rb_root			nodes;		/* children of this node by position */
+	struct rb_root			shortcuts;	/* shortcuts from this node by key */
+	struct rb_root			objects;	/* object leaves in this tree node */
+	struct rb_node			node_rb;	/* link in parent tree node */
+	struct rb_node			aux_rb;		/* link in object's tree or shortcut list */
+	struct rw_semaphore		sem;		/* rearrangement vs walk semaphore */
+	struct page			*page;		/* representation of this node */
+	unsigned long			index;		/* first page of dataptrblk */
+	unsigned long			flags;
+#define CACHEFS_TREE_EXTANT		0		/* T if node is currently extant on disk */
+#define CACHEFS_TREE_DETACHED		1		/* T if block was detached */
+#define CACHEFS_TREE_INSTALLED		2		/* T if node_rb is parent->nodes */
+#define CACHEFS_TREE_S_INSTALLED	3		/* T if aux_rb is in parent->shortcuts */
+#define CACHEFS_TREE_NODE_VALIDATED	4		/* T if page has been validated */
+#define CACHEFS_TREE_NODE_VALID		5		/* T if page contents are valid */
+#define CACHEFS_TREE_NODE_INVALID	6		/* T if page contents are not valid */
+	cachefs_block_t			bix;		/* block holding this node */
+	int32_t				immutable;	/* serial no. of journalled immutability */
+	rwlock_t			lock;		/* list maintenance lock */
+	atomic_t			usage;
+	atomic_t			netfs_usage;	/* num netfs refs on a dataptr block */
+	int				scan_state;	/* scan state for this node */
+	uint16_t			offset;		/* offset of referring leaf in parent */
+	uint16_t			s_offset;	/* ptr offset of shortcut next step */
+	uint16_t			level;		/* level in tree */
+	uint8_t				type;		/* type of node */
+#define CACHEFS_TREE_TYPE_UNSET		0
+#define CACHEFS_TREE_TYPE_NODE		1
+#define CACHEFS_TREE_TYPE_DATAPTRBLK	2
+#define CACHEFS_TREE_TYPE_SHORTCUT	3
+	int				occupancy;	/* number of ptrs + objects + shortcuts */
+};
+
+extern kmem_cache_t *cachefs_node_jar;
+
+extern void cachefs_tree_init_once(void *_node, kmem_cache_t *cachep,
+				   unsigned long flags);
+
+extern int cachefs_node_read(struct cachefs_super *super, struct cachefs_tree *node, int sync);
+extern int cachefs_node_validate(struct cachefs_super *super, struct cachefs_tree *node);
+
+extern struct cachefs_tree *cachefs_tree_slide_readlock(struct cachefs_super *super,
+							struct cachefs_object *object);
+
+#ifdef CACHEFS_DEBUG_SLAB
+#define cachefs_tree_debugcheck(node)					\
+do {									\
+	ASSERT(((unsigned long) (node) & 0xffff0000) != 0x6b6b0000);	\
+	ASSERT(((unsigned long) (node->node_rb.rb_parent) & 0xffff0000) != 0x6b6b0000); \
+	ASSERT((atomic_read(&(node)->usage) & 0xffff0000) != 0x6b6b0000); \
+} while(0)
+#else
+#define cachefs_tree_debugcheck(node)	do {} while(0)
+#endif
+
+static inline struct cachefs_tree *cachefs_tree_get(struct cachefs_tree *node)
+{
+#if CACHEFS_DEBUG_TREE_ACCOUNTING
+	kdebug(" - GET %p{%x,%u} USAGE -> %d",
+	       node, node->bix, node->level,
+	       atomic_read(&node->usage) + 1);
+#endif
+	cachefs_tree_debugcheck(node);
+	atomic_inc(&node->usage);
+	return node;
+}
+
+static inline struct cachefs_tree *cachefs_tree_get2(struct cachefs_tree *node, int n)
+{
+#if CACHEFS_DEBUG_TREE_ACCOUNTING
+	kdebug(" - GET %p{%x,%u} USAGE -> %d",
+	       node, node->bix, node->level,
+	       atomic_read(&node->usage) + n);
+#endif
+	cachefs_tree_debugcheck(node);
+	atomic_add(n, &node->usage);
+	return node;
+}
+
+extern struct cachefs_tree *cachefs_tree_alloc(unsigned long gfp);
+
+extern struct cachefs_tree *cachefs_tree_lookup(unsigned long gfp,
+						struct cachefs_cursor *parent,
+						cachefs_block_t bix,
+						int type,
+						int resident);
+
+extern void __cachefs_tree_link_to_node(struct cachefs_tree *node,
+					struct cachefs_tree *parent);
+
+static inline void cachefs_tree_link_to_node(struct cachefs_tree *node,
+					     struct cachefs_tree *parent)
+{
+	write_lock(&parent->lock);
+	__cachefs_tree_link_to_node(node, parent);
+	write_unlock(&parent->lock);
+}
+
+extern void cachefs_tree_link_to_object(struct cachefs_tree *node,
+					struct cachefs_object *obj);
+
+static inline void __cachefs_tree_unlink_from_node(struct cachefs_tree *node,
+						   struct cachefs_tree *parent)
+{
+	//kenter("%p,%p", node, parent);
+	if (test_and_clear_bit(CACHEFS_TREE_INSTALLED, &node->flags))
+		rb_erase(&node->node_rb, &parent->nodes);
+	if (test_and_clear_bit(CACHEFS_TREE_S_INSTALLED, &node->flags))
+		rb_erase(&node->aux_rb, &parent->shortcuts);
+}
+
+static inline void cachefs_tree_unlink_from_node(struct cachefs_tree *node)
+{
+	//kenter("%p", node);
+	__cachefs_tree_unlink_from_node(node, node->parent);
+	node->parent = NULL;
+}
+
+extern struct cachefs_tree *cachefs_tree_find_node(struct cachefs_tree *node,
+						   uint8_t type,
+						   uint16_t offset);
+
+extern struct cachefs_tree *cachefs_tree_find_level1_dataptr(struct cachefs_object *obj,
+							     unsigned long index);
+
+extern void cachefs_tree_move_leaf(struct cachefs_super *super,
+				   struct cachefs_tree *from_node,
+				   struct cachefs_tree *to_node,
+				   struct cachefs_ondisc_leaf *from,
+				   struct cachefs_ondisc_leaf *to,
+				   uint16_t from_offset,
+				   uint16_t to_offset);
+
+extern void cachefs_tree_slide_leaf(struct cachefs_super *super,
+				    struct cachefs_tree *node,
+				    void *data,
+				    uint16_t from_offset,
+				    uint16_t to_offset);
+
+extern void cachefs_tree_install_leaf(struct cachefs_operation *op,
+				      struct cachefs_tree *node,
+				      struct cachefs_ondisc_leaf *key,
+				      void *data,
+				      uint16_t offset);
+
+/*****************************************************************************/
+/*
+ * object record
+ */
+struct cachefs_object
+{
+	struct fscache_object		fscache;
+	struct cachefs_tree		*node;		/* tree node in which this inode resides */
+	struct rb_root			dataptrblks;	/* level 1 data ptr blocks */
+	struct rb_node			node_rb;	/* link in node's object list */
+	struct list_head		cull_link;	/* link in cull list */
+	struct rw_semaphore		sem;		/* index search/modify mutex */
+	uint8_t				*key;		/* copy of key */
+	uint64_t			objid;		/* ID of this object */
+	uint64_t			pobjid;		/* ID of this object's parent */
+	loff_t				i_size;		/* current file size */
+	unsigned long			page_limit;	/* limit on page index */
+	rwlock_t			lock;
+	atomic_t			usage;
+	atomic_t			fscache_usage;
+	atomic_t			page_usage;	/* number of pages held for netfs */
+	uint32_t			atime;		/* last access time for cull queue */
+	uint16_t			offset;		/* offset of leaf in page */
+	uint16_t			keylen;		/* length of key in bytes */
+	uint8_t				type;		/* object type */
+	uint8_t				flags;		/* on-disk object flags */
+	uint8_t				data_levels;	/* number of levels in data tree */
+	char				has_data;	/* T if has data on disk */
+	char				being_reaped;	/* T if object is being reaped */
+};
+
+extern kmem_cache_t *cachefs_object_jar;
+
+static inline struct cachefs_object *cachefs_object_get(struct cachefs_object *object)
+{
+#if CACHEFS_DEBUG_OBJECT_ACCOUNTING
+	kdebug(" - GET %p{%llx} USAGE -> %d",
+	       object, object->objid, atomic_read(&object->usage) + 1);
+#endif
+	atomic_inc(&object->usage);
+	return object;
+}
+
+extern void cachefs_object_put(struct cachefs_object *object);
+
+extern void cachefs_tree_update_object(struct cachefs_super *super,
+				       struct cachefs_object *object);
+
+extern int cachefs_compare_keys(const struct cachefs_ondisc_leaf *a,
+				const struct cachefs_ondisc_leaf *b);
+
+extern int cachefs_compare_keys_obj(const struct cachefs_object *a,
+				    const struct cachefs_ondisc_leaf *b);
+
+extern void cachefs_extract_key(uint8_t *buffer,
+				const struct cachefs_ondisc_leaf *leaf,
+				int level);
+
+extern unsigned cachefs_extract_subkey(const struct cachefs_ondisc_leaf *leaf,
+				       int level);
+extern unsigned cachefs_extract_subkey_obj(const struct cachefs_object *obj,
+					   int level);
+
+extern int cachefs_keycmp(const struct cachefs_ondisc_leaf *a,
+			  const struct cachefs_ondisc_leaf *b);
+
+extern int cachefs_keycmp_obj(const struct cachefs_object *a,
+			      const struct cachefs_ondisc_leaf *b);
+
+extern int cachefs_tree_lookup_object(struct cachefs_super *super,
+				      struct cachefs_object *object,
+				      struct cachefs_ondisc_leaf *key,
+				      int create);
+
+extern int __cachefs_tree_link_object(struct cachefs_super *super,
+				      struct cachefs_object *object,
+				      struct cachefs_tree *node,
+				      int dupctl);
+
+static inline int cachefs_tree_link_object(struct cachefs_super *super,
+					   struct cachefs_object *object,
+					   struct cachefs_tree *node)
+{
+	int ret;
+
+	object->node = cachefs_tree_get(node);
+	write_lock(&node->lock);
+	ret = __cachefs_tree_link_object(super, object, node, 0);
+	write_unlock(&node->lock);
+	return ret;
+}
+
+static inline void cachefs_tree_unlink_object_from_node(struct cachefs_object *obj)
+{
+	rb_erase(&obj->node_rb, &obj->node->objects);
+	obj->node = NULL;
+	obj->offset = 0xffffU;
+}
+
+extern struct cachefs_object *cachefs_tree_find_object(struct cachefs_tree *node,
+						       uint16_t offset);
+
+extern struct cachefs_tree *cachefs_tree_find_shortcut(struct cachefs_tree *node,
+						       struct cachefs_ondisc_leaf *key);
+
+extern struct cachefs_tree *cachefs_tree_find_shortcut_obj(struct cachefs_tree *node,
+							   struct cachefs_object *obj);
+
+extern void __cachefs_tree_move_object_to_node(struct cachefs_object *object,
+					       struct cachefs_tree *from,
+					       struct cachefs_tree *to,
+					       uint16_t to_offset);
+
+extern int cachefs_data_set_i_size(struct cachefs_object *object, loff_t i_size);
+
+extern int cachefs_data_read_page(struct cachefs_super *super,
+				  struct cachefs_object *object,
+				  struct page *page,
+				  struct cachefs_io_callback *callback,
+				  unsigned long gfp);
+
+extern int cachefs_data_read_pages(struct cachefs_super *super,
+				   struct cachefs_object *object,
+				   struct address_space *mapping,
+				   struct list_head *pages,
+				   int *nr_pages,
+				   struct cachefs_io_callback *callback,
+				   unsigned long gfp);
+
+extern int cachefs_data_alloc_page(struct cachefs_super *super,
+				   struct cachefs_object *object,
+				   struct page *page,
+				   unsigned long gfp);
+
+extern int cachefs_data_write(struct cachefs_super *super,
+			      struct cachefs_object *object,
+			      struct pagevec *pagevec,
+			      struct cachefs_io_callback *callback,
+			      unsigned long gfp);
+
+extern unsigned long cachefs_data_uncache(struct cachefs_super *super,
+					  struct cachefs_object *object,
+					  struct pagevec *pagevec,
+					  unsigned long ix);
+
+/*****************************************************************************/
+/*
+ * cursor for walking the metadata tree
+ */
+struct cachefs_cursor
+{
+	struct cachefs_tree		*point;		/* node in tree to which pointing */
+	uint16_t			level;		/* level in tree of node */
+	uint16_t			offset;		/* which slot held the leaf */
+	uint16_t			s_offset;	/* equivalent ptr offset for shortcut */
+};
+
+extern void cachefs_tree_put(struct cachefs_tree *branch);
+
+static inline void cachefs_cursor_put(struct cachefs_cursor *cursor)
+{
+	if (cursor->point)
+		cachefs_tree_put(cursor->point);
+}
+
+extern int cachefs_tree_insert(struct cachefs_super *super,
+			       struct cachefs_object *object,
+			       struct cachefs_ondisc_leaf *key);
+
+extern int cachefs_tree_insert_fanout(struct cachefs_operation *op,
+				      struct cachefs_ondisc_leaf *key);
+
+extern int cachefs_tree_delete(struct cachefs_super *super,
+			       struct cachefs_object *object);
+
+/*****************************************************************************/
+/*
+ * cachefs superblock private information
+ */
+struct cachefs_super
+{
+	struct fscache_cache	cache;		/* cache handle */
+	struct super_block	*sb;
+	struct inode		*imeta;		/* metadata inode covering the whole blockdev */
+	struct cachefs_tree	*metadata_tree;	/* root of metadata tree cached in memory */
+	struct rw_semaphore	tree_wander_sem; /* tree modify vs wander serialisation */
+	struct list_head	op_waitq;	/* operation wait-for-space queue */
+	struct list_head	op_runq;	/* operation running queue */
+	struct list_head	alloc_waitq;	/* allocator contention queue */
+
+	unsigned long		options;
+#define CACHEFS_SUPER_AUTO_DELETE	0	/* T to automatically delete released nodes */
+#define CACHEFS_SUPER_NOSCAN		1	/* T to suppress the tree scanner */
+
+	unsigned long		flags;
+#define CACHEFS_SUPER_INIT_BLKDEV	0	/* T if initialising blockdev */
+#define CACHEFS_SUPER_NEED_WANDER	1	/* T if autowander timer expired */
+#define CACHEFS_SUPER_WANDER_TIMEOUT	2	/* T if needs to wander */
+#define CACHEFS_SUPER_DO_JOURNAL	3	/* T if should attend to journal */
+#define CACHEFS_SUPER_DO_RECYCLE	4	/* T if should do recycling */
+#define CACHEFS_SUPER_DO_SCAN		5	/* T if should attend to the tree scanner */
+#define CACHEFS_SUPER_DO_CULL		6	/* T if should cull old inodes */
+#define CACHEFS_SUPER_BEGIN_SCAN	7	/* T if should begin new tree scan */
+#define CACHEFS_SUPER_CULL_DISABLED	8	/* T if inode cull disabled */
+#define CACHEFS_SUPER_REPLAYING_JNL	9	/* T if replaying journal */
+#define CACHEFS_SUPER_ERROR_STOP	10	/* T if cache stopped due to error */
+#define CACHEFS_SUPER_REDUCE_OLDRCM	11	/* T if should reduce rcm_old_pages */
+
+	struct semaphore	alloc_load_sem;	/* allocation load control */
+	struct semaphore	deletion_sem;	/* delete objid & data control */
+	spinlock_t		operation_lock;	/* operation control lock */
+	spinlock_t		alloc_lock;	/* allocation lock */
+	spinlock_t		objects_lock;	/* all-objects lock */
+
+	atomic_t		error_count;	/* displayed error count */
+	int			bio_wr_barrier;	/* command to submit a write barrier BIO */
+	uint32_t		sector_size;
+
+	struct cachefs_ondisc_journal j;	/* journalled tracking */
+
+	/* space tracking */
+	uint32_t		space_transit;		/* space in transit rcm -> alloc */
+	uint32_t		space_inprogress;	/* space reserved by in-progress ops */
+	uint32_t		space_rcmstk_resv;	/* space reserved for reclaim nodes */
+	uint32_t		space_rcmstk_resv_max;	/* max space for reclaim nodes */
+
+	/* allocation tracking */
+	cachefs_block_t		alloc_pfree_nx;	/* primary free block list */
+
+	struct page		*page_pfree;	/* current primary free list in-mem page */
+	struct page		*page_pfree_nx;	/* next primary free list in-mem page */
+	struct page		*page_sfree;	/* front secondary free list in-mem page */
+
+	/* reclamation tracking */
+	struct page		*page_rcm;	/* reclaim collection node */
+	struct list_head	rcm_old_pages;	/* old reclaim collection nodes */
+
+	/* data tree recycling tracking */
+	short			rcy_slots_rem;	/* number of recycling slots remaining in this op */
+	short			rcy_p_nlevels;	/* levels to be processed in current tree */
+	short			rcy_p_level;	/* level being processed */
+	enum cachefs_rcy_state	rcy_state;	/* recycling state */
+
+	struct page		*page_rcy;	/* recycling collection node */
+	struct page		*page_rcy_proc;	/* recycling node being processed */
+	struct page		*page_rcy_blk[8]; /* pointer block levels being processed */
+
+	/* orphaned object reaping */
+	struct page		*page_reap;	/* reap collection node */
+	struct page		*page_reap_proc; /* reap node being processed */
+
+	/* tree scan tracking */
+	unsigned short		scan_maxculls;	/* maximum number of objects retained for culling */
+	unsigned short		scan_nculls;	/* number of cullable objects */
+	enum cachefs_scan_state	scan_state;	/* current state-machine state */
+	cachefs_block_t		scan_bix;	/* block currently being operated upon */
+	struct cachefs_tree	*scan_node;	/* node currently being operated upon */
+	struct cachefs_tree	*scan_tmpnode;	/* spare node for bookmarking */
+	struct cachefs_object	*scan_tmpobj;	/* spare object for bookmarking */
+	struct cachefs_object	*scan_reap;	/* object being reaped */
+	struct list_head	scan_culls;	/* list of potentially cullable objects */
+	struct list_head	scan_xculls;	/* list of displaced cullable objects */
+	struct page		*scan_loading;	/* page currently being loaded */
+
+	/* inode culling - finding the inodes with the oldest atime and culling them */
+	unsigned		cull_hiwater;	/* cull enable limit */
+	unsigned		cull_lowater;	/* cull disable limit */
+
+	/* journal tracking */
+	int32_t			jnl_serial;	/* next serial number */
+	unsigned		jnl_timeout;	/* autowander timeout */
+	spinlock_t		jnl_qlock;
+	struct cachefs_journal	*jnl_current;	/* current journal transaction */
+	struct list_head	jnl_transq;	/* list of outstanding transactions */
+	struct page		*jnl_page;	/* page used for writing to the update journal */
+	struct semaphore	jnl_page_sem;	/* semaphore counting out available slots */
+	struct timer_list	jnl_timer;	/* autowander timer */
+
+	/* cache management daemon for this fs */
+	task_t			*dmn_task;	/* cache daemon task */
+	struct completion	dmn_alive;	/* completion of initialisation */
+	struct completion	dmn_dead;	/* completion of death */
+	wait_queue_head_t	dmn_sleepq;	/* general sleep queue */
+	int			dmn_die;	/* request to die */
+#define CACHEFS_DMN_RUNNING	0		/* normal running */
+#define CACHEFS_DMN_RETIRING	1		/* don't do work that'll dirty pages */
+#define CACHEFS_DMN_DIE		2		/* die */
+
+	/* object reaping daemon */
+	struct cachefs_object	*reaper_target;	/* the object to be reaped */
+	task_t			*reaper_task;	/* reaping daemon task */
+	struct completion	reaper_alive;	/* completion of initialisation */
+	struct completion	reaper_dead;	/* completion of death */
+	wait_queue_head_t	reaper_sleepq;	/* general sleep queue */
+	wait_queue_head_t	reaper_waitq;	/* reap completion queue */
+	int			reaper_die;	/* request to die */
+#define CACHEFS_REAPER_RUNNING	0		/* normal running */
+#define CACHEFS_REAPER_DIE	1		/* die */
+
+	/* thing counting */
+	atomic_t		cnt_objects;	/* number of objects allocated by fscache */
+	atomic_t		cnt_rcmpages;	/* number of rcm pages currently allocated */
+
+	/* superblock copy */
+	struct cachefs_ondisc_superblock *layout;
+};
+
+extern int cachefs_journal_replay(struct cachefs_super *super);
+extern void cachefs_journal_wander_timeout(unsigned long data);
+
+extern int cachefs_page_read(struct cachefs_super *super,
+			     cachefs_block_t bix,
+			     int wipe,
+			     struct page **_page);
+
+static inline void cachefs_page_put(struct page *page)
+{
+	if (page) {
+#if 0
+		kdebug("Put page %p; fl=%lx cnt=%d",
+		       page, page->flags, page_count(page));
+#endif
+		page_cache_release(page);
+	}
+}
+
+extern int cachefs_sync(struct cachefs_super *super, int sync, int intr);
+
+static inline
+struct cachefs_journal *__cachefs_journal_get(struct cachefs_journal *jnl)
+{
+	atomic_inc(&jnl->remaining);
+	return jnl;
+}
+
+static inline
+struct cachefs_journal *cachefs_journal_get(struct cachefs_super *super)
+{
+	struct cachefs_journal *jnl;
+	unsigned long flags;
+
+	spin_lock_irqsave(&super->jnl_qlock, flags);
+	jnl = super->jnl_current;
+	atomic_inc(&jnl->remaining);
+	spin_unlock_irqrestore(&super->jnl_qlock, flags);
+	return jnl;
+}
+
+static inline uint64_t cachefs_alloc_objid(struct cachefs_operation *op)
+{
+	return op->super->j.alloc_objid++;
+}
+
+#endif /* _FS_CACHEFS_INT_H */
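The cachefs_io_callback and cachefs_journal_get()/cachefs_journal_release() declarations above rely on a simple reference-counting idiom: whoever submits I/O against the current journal transaction pins it first, and the pin is only dropped when the I/O completes.  The following is a minimal, self-contained userspace sketch of that idiom only; the struct, the free-on-last-put behaviour and the elided locking are illustrative stand-ins, not the CacheFS code (the real cachefs_journal_release() is more involved).

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct jnl {
	atomic_int	remaining;	/* outstanding pins/items on the transaction */
	int		serial;		/* transaction serial number */
};

static struct jnl *jnl_current;		/* sampled under jnl_qlock in the real code */

/* pin the current transaction (cf. cachefs_journal_get) */
static struct jnl *journal_get(void)
{
	struct jnl *j = jnl_current;

	atomic_fetch_add(&j->remaining, 1);
	return j;
}

/* drop a pin; in this toy the last put frees the record */
static void journal_put(struct jnl *j)
{
	if (atomic_fetch_sub(&j->remaining, 1) == 1) {
		printf("transaction %d complete\n", j->serial);
		free(j);
	}
}

int main(void)
{
	jnl_current = calloc(1, sizeof(*jnl_current));
	atomic_init(&jnl_current->remaining, 1);	/* the journal's own reference */
	jnl_current->serial = 1;

	struct jnl *pin = journal_get();	/* e.g. pinned by an I/O callback */
	journal_put(pin);			/* the I/O completed */
	journal_put(jnl_current);		/* the journal's reference is dropped */
	return 0;
}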
diff -uNrp linux-2.6.14-mm2/fs/cachefs/cachefs-layout.h linux-2.6.14-mm2-cachefs/fs/cachefs/cachefs-layout.h
--- linux-2.6.14-mm2/fs/cachefs/cachefs-layout.h	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/cachefs-layout.h	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,312 @@
+/* cachefs-layout.h: general filesystem caching on-disc layout
+ *
+ * Copyright (C) 2004-5 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _FS_CACHEFS_LAYOUT_H
+#define _FS_CACHEFS_LAYOUT_H
+
+#include <linux/types.h>
+
+typedef uint32_t cachefs_block_t;
+#define CACHEFS_BLOCK_SHIFT	2
+#define CACHEFS_NULL_FILL	0x0U		// TODO: set to 0
+#define CACHEFS_NULL_PTR	0x00000000U	// TODO: set to 0
+#define CACHEFS_EMPTY_FILL	0xecU		// TODO: set to 0
+#define CACHEFS_EMPTY_PTR	0xececececU	// TODO: set to 0
+
+#define CACHEFS_ONDISC_LEVEL_BITS	(PAGE_SHIFT - CACHEFS_BLOCK_SHIFT)
+#define CACHEFS_ONDISC_LEVEL_SIZE	(1 << CACHEFS_ONDISC_LEVEL_BITS)
+#define CACHEFS_ONDISC_LEVEL_MASK	(~(CACHEFS_ONDISC_LEVEL_SIZE - 1))
+
+#define CACHEFS_ONDISC_LEAF_SHIFT	9
+#define CACHEFS_ONDISC_LEAF_SIZE	(1 << CACHEFS_ONDISC_LEAF_SHIFT) /* 512 */
+#define CACHEFS_ONDISC_LEAF_MASK	(~(CACHEFS_ONDISC_LEAF_SIZE - 1))
+#define CACHEFS_ONDISC_PTRPERLEAF_SHIFT	(CACHEFS_ONDISC_LEAF_SHIFT - CACHEFS_BLOCK_SHIFT)
+#define CACHEFS_ONDISC_PTRPERLEAF	(1 << CACHEFS_ONDISC_PTRPERLEAF_SHIFT)
+#define CACHEFS_ONDISC_PTRPERLEAF_MASK	(~(CACHEFS_ONDISC_PTRPERLEAF - 1))
+#define CACHEFS_ONDISC_LEAF_PER_BLOCK	((long) (PAGE_SIZE / CACHEFS_ONDISC_LEAF_SIZE))
+#define CACHEFS_ONDISC_PTR_PER_BLOCK	(1 << CACHEFS_ONDISC_LEVEL_BITS)
+
+typedef struct { uint32_t csum[1]; } cachefs_digest_t;
+
+/*****************************************************************************/
+/*
+ * cache superblock block layout
+ * - the blockdev is prepared for initialisation by
+ *   'echo "cachefs___" >/dev/hdaXX' before mounting
+ * - when initialised, the magic number is changed to "cachefsrdy"
+ */
+struct cachefs_ondisc_superblock
+{
+	uint8_t				magic[10];	/* magic number */
+#define CACHEFS_SUPER_MAGIC "cachefsrdy"
+#define CACHEFS_SUPER_MAGIC_NEEDS_INIT "cachefs___"
+#define CACHEFS_SUPER_MAGIC_SIZE 10
+
+	uint16_t			endian;		/* 0x1234 stored CPU-normal order */
+#define CACHEFS_SUPER_ENDIAN 0x1234
+
+	uint32_t			version;	/* format version */
+#define CACHEFS_SUPER_VERSION 3
+
+	/* layout */
+	uint32_t			bsize;		/* cache block size (bytes) */
+	uint16_t			bshift;		/* block size (log2 bytes) */
+	uint16_t			asize;		/* alloc chunk size (blocks) */
+	uint16_t			ashift;		/* alloc chunk size (log2 blocks) */
+	uint16_t			pshift;		/* log2 data pointers per block */
+	uint32_t			leaf_size;	/* cache metadata record size */
+	uint32_t			leaf_shift;	/* log2 cache metadata record size */
+	uint32_t			jnl_rsize;	/* journal record size */
+	uint32_t			jnl_recperblk;	/* journal records per block */
+	cachefs_block_t			bix_journal;	/* start of update journal */
+	cachefs_block_t			bix_cache;	/* start of data cache */
+	cachefs_block_t			bix_end;	/* end of cache */
+	cachefs_block_t			bix_null;	/* null block pointer value */
+	cachefs_block_t			bix_empty;	/* empty block pointer value */
+};
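A device is prepared by writing the needs-init magic string and then mounting; the mount-time checks implied by the fields above can be sketched as below.  This is a hedged illustration compiled against the definitions in this header, not the super.c code (which appears in a later hunk).

#include <string.h>

enum sb_state { SB_READY, SB_NEEDS_INIT, SB_INVALID };

static enum sb_state check_superblock(const struct cachefs_ondisc_superblock *sb)
{
	if (memcmp(sb->magic, CACHEFS_SUPER_MAGIC_NEEDS_INIT,
		   CACHEFS_SUPER_MAGIC_SIZE) == 0)
		return SB_NEEDS_INIT;		/* freshly prepared blockdev */
	if (memcmp(sb->magic, CACHEFS_SUPER_MAGIC,
		   CACHEFS_SUPER_MAGIC_SIZE) != 0)
		return SB_INVALID;		/* not a CacheFS device */
	if (sb->endian != CACHEFS_SUPER_ENDIAN)
		return SB_INVALID;		/* written on a foreign-endian host */
	if (sb->version != CACHEFS_SUPER_VERSION)
		return SB_INVALID;		/* wrong on-disc format version */
	return SB_READY;
}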
+
+/*****************************************************************************/
+/*
+ * indexing tree leaf definition
+ */
+struct cachefs_ondisc_leaf
+{
+	/* subtree pointer
+	 * - shortcut leaves use this to point to the subtree they cut to
+	 * - object leaves use this to point to the data tree
+	 * - this must be the first thing in the leaf
+	 */
+	cachefs_block_t	ptr;
+
+	/* object type
+	 * - a block of leaves is differentiated from a branch block of
+	 *   pointers by this value pointing to a block in the update journal,
+	 *   thus representing a leaf type rather than a branch pointer
+	 */
+	cachefs_block_t type;
+#define CACHEFS_ONDISC_OBJTYPE_NULL_POINTER	CACHEFS_NULL_PTR /* unused part of fanout block */
+#define CACHEFS_ONDISC_OBJTYPE_EMPTY_SLOT	1	/* empty node */
+#define CACHEFS_ONDISC_OBJTYPE_SHORTCUT		2	/* shortcut to remote keyspace */
+#define CACHEFS_ONDISC_OBJTYPE_INDEX_OBJECT	3	/* leaf representing an index */
+#define CACHEFS_ONDISC_OBJTYPE_DATA_OBJECT	4	/* leaf representing a data cache */
+#define CACHEFS_ONDISC_OBJTYPE_OTHER_OBJECT	5	/* leaf representing some other object */
+#define CACHEFS_ONDISC_OBJTYPE__LAST		6
+#define	CACHEFS_ONDISC_OBJTYPE_FIRST_POINTER	7
+#define CACHEFS_ONDISC_OBJTYPE_LAST_POINTER	UINT_MAX
+
+	union {
+		/* shortcut over singly-branched tree chain
+		 * - index key: { digest[0..klen] }
+		 */
+		struct { /* (- 512 4 (* 4 4) 2) = 490 */
+			uint16_t	klen;		/* amount of relevant key data (bits) */
+			uint16_t	level;		/* level at which target node lives */
+			uint16_t	s_offset;	/* equivalent ptr offset in parent node */
+			uint8_t		__pad[2];
+
+			/* begin key */
+			uint8_t		key[0];		/* partial key data */
+			/* end key */
+		} shortcut;
+
+		/* representation of an index, data file or other object
+		 * - index key: { digest, parent, netfs_data[0..netfs_klen-1] }
+		 */
+		struct { /* (- 512 8 52 16) = 436 */
+			uint64_t	objid;		/* this object ID */
+#define CACHEFS_ONDISC_FSDEF_OBJID 1
+			uint64_t	size;		/* size of file */
+			uint32_t	nblocks;	/* number of allocated blocks */
+			uint32_t	reservation;	/* number of blocks reserved */
+			uint32_t	atime;		/* last access time */
+			uint16_t	netfs_dlen;	/* netfs index aux data length */
+			uint8_t		data_levels;	/* number of levels in data tree */
+			uint8_t		object_type;	/* type of object */
+			uint8_t		flags;
+#define CACHEFS_ONDISC_OBJECT_HAS_CHILDREN	0x01	/* T if object has child objects */
+#define CACHEFS_ONDISC_OBJECT_IS_PINNED		0x02	/* T if object is pinned */
+#define CACHEFS_ONDISC_DONT_CULL_DIRECTLY	0x04	/* T if should be pinned by parent */
+			uint8_t		__pad[3];
+			char		object_name[16]; /* netfs's name for this type */
+
+			/* begin key */
+			cachefs_digest_t key;		/* digest sum of index chain keys */
+			uint32_t	parent[2];	/* parent object ID (1 for FSDEF) */
+			uint16_t	netfs_klen;	/* netfs index key length */
+			uint8_t		netfs_data[0];	/* netfs index key + auxdata */
+			/* end key */
+		} object;
+	} u;
+};
+
+/*****************************************************************************/
+/*
+ * Free blocks are kept in three very one-sided trees (more like horsetail
+ * plants than trees)
+ *
+ *        +---------+    +---------+    +---------+    +---------+
+ * stk--->|         |--->|         |--->|         |--->|         |---> NULL
+ *        |  NODE   |	 |  NODE   |	|  NODE   |    |  NODE   |
+ *        |         |	 |         |	|         |    |         |
+ *        +---------+	 +---------+	+---------+    +---------+
+ *           / | \	    / | \	   / | \          / | \
+ *        free blocks    free blocks    free blocks    free blocks
+ *
+ * - each free block is on one of three trees, all pointed to by the ujournal:
+ *   - the "recycling stack" - all newly freed blocks end up on here
+ *   - the "alloc stack" - all allocations are popped off here
+ *   - the "excise stack" - allocations removed by data-write replay
+ *   - when the alloc stack is empty, the recycling stack is transferred into
+ *     it
+ * - the front node on the alloc stack is the current source of block
+ *   allocations
+ *   - when all a node's leaves have been allocated, then the node itself will
+ *     be allocated
+ * - the front node on the recycling stack is the current sink of recycled
+ *   blocks
+ * - the excise stack is emptied onto the recycling stack
+ */
+struct cachefs_ondisc_free_node
+{
+	uint32_t	magic;		/* magic number */
+#define CACHEFS_ONDISC_FREELIST_PARTIAL	0xbeef1355
+#define CACHEFS_ONDISC_FREELIST_READY	0xbeef1355
+	cachefs_block_t	next;		/* next node in free tree */
+	cachefs_block_t	ptrs[0];	/* free blocks depending from this block */
+};
+
+#define CACHEFS_ONDISC_FREELIST_PTRSPERNODE \
+	((PAGE_SIZE - sizeof(struct cachefs_ondisc_free_node)) / sizeof(cachefs_block_t))
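To make the pop/consume/transfer behaviour described above concrete, here is a toy userspace model of the allocation side of these stacks.  The node layout, counts and block numbers are simplified stand-ins chosen for the example; the real logic lives in allocator.c and recycling.c.

#include <stdio.h>
#include <stdlib.h>

#define PTRS_PER_NODE 4		/* illustrative; really FREELIST_PTRSPERNODE */

struct free_node {
	unsigned		bix;			/* block holding this node */
	struct free_node	*next;			/* next node down the stack */
	unsigned		nptrs;			/* leaf pointers still unallocated */
	unsigned		ptrs[PTRS_PER_NODE];	/* free blocks hanging off this node */
};

static struct free_node *alloc_stack, *recycle_stack;

static unsigned alloc_block(void)
{
	struct free_node *node;

	if (!alloc_stack) {			/* alloc stack empty: take over */
		alloc_stack = recycle_stack;	/*   the recycling stack */
		recycle_stack = NULL;
	}
	node = alloc_stack;
	if (!node)
		return 0;			/* nothing left to allocate */
	if (node->nptrs > 0)
		return node->ptrs[--node->nptrs]; /* hand out a leaf pointer */

	alloc_stack = node->next;		/* all leaves gone: the node */
	unsigned bix = node->bix;		/*   block itself is allocated */
	free(node);
	return bix;
}

int main(void)
{
	struct free_node *n = calloc(1, sizeof(*n));

	n->bix = 100;				/* made-up block numbers */
	n->nptrs = 2;
	n->ptrs[0] = 101;
	n->ptrs[1] = 102;
	alloc_stack = n;

	for (int i = 0; i < 4; i++)
		printf("allocated block %u\n", alloc_block());
	return 0;
}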
+
+/*****************************************************************************/
+/*
+ * Data block trees to be recycled are kept in a stack until we can deal with
+ * them
+ */
+struct cachefs_ondisc_recycle_data
+{
+	cachefs_block_t dataptr;	/* data tree root */
+	unsigned	depth;		/* data tree depth */
+};
+
+struct cachefs_ondisc_recycle_node
+{
+	cachefs_block_t	next;		/* next block in stack */
+	struct cachefs_ondisc_recycle_data trees[0];
+};
+
+#define CACHEFS_ONDISC_RCYSTK_TREESPERNODE				\
+	((long) ((PAGE_SIZE - sizeof(struct cachefs_ondisc_recycle_node)) / \
+		 sizeof(struct cachefs_ondisc_recycle_data)))
+
+/*****************************************************************************/
+/*
+ * Object IDs to be reaped are kept in a stack until we can deal with them
+ */
+struct cachefs_ondisc_reap_node
+{
+	cachefs_block_t	next;		/* next block in stack */
+	uint64_t objids[0];
+};
+
+#define CACHEFS_ONDISC_REAP_OBJIDSPERNODE				\
+	((long) ((PAGE_SIZE - sizeof(uint64_t)) / sizeof(uint64_t)))
+
+/*****************************************************************************/
+/*
+ * on-disk journal
+ * - records changes being made to disk content, particularly the metadata
+ * - the serial number cycles through in ascending order
+ *   - ACKs specify everything between "index" & "block" as being complete
+ *   - serial numbers can wrap, but can't go into window of un-ACK'd marks
+ * - journal slots are the size of a sector (blockdev block size)
+ *   - this means that two adjacent marks are made on separate sectors, and so
+ *     the second doesn't have to wait for the first to be written to disk
+ * - the current slot allocation point is not permitted to lap the currently
+ *   un-ACK'd slots - the requestor must wait
+ */
+enum cachefs_ondisc_jnl_mark {
+	CACHEFS_ONDISC_JNL_EMPTY,		/* empty slot */
+	CACHEFS_ONDISC_JNL_ACK,			/* batch completion mark */
+	CACHEFS_ONDISC_JNL_WANDER,		/* wander the root of the tree */
+	CACHEFS_ONDISC_JNL_UNMOUNT,		/* filesystem unmount */
+
+	CACHEFS_ONDISC_JNL__LAST
+} __attribute__((packed));
+
+struct cachefs_ondisc_journal
+{
+	/* journal control */
+	enum cachefs_ondisc_jnl_mark	mark;	/* type of journal entry */
+	uint8_t				error;	/* T if error encountered */
+	uint32_t			serial;	/* serial number of entry in batch */
+	uint32_t			jtime;	/* journal entry timestamp */
+
+	/* state tracking */
+	cachefs_block_t	tree_root;		/* root of the indexing tree */
+	uint64_t	alloc_objid;		/* next index/file ID to allocate */
+
+	/* block allocation tracking */
+	cachefs_block_t	alloc_unready;		/* start of unready space */
+	cachefs_block_t	alloc_pfree;		/* primary free block list */
+	cachefs_block_t	alloc_sfree;		/* secondary free block list */
+	uint32_t	alloc_pfree_n;		/* occupancy of primary free block list */
+	uint32_t	alloc_sfree_n;		/* occupancy of secondary free block list */
+	uint16_t	alloc_pfree_pt;		/* ptr to next block to allocate */
+
+	/* block reclamation tracking */
+	uint16_t	rcm_coll_pt;		/* insertion point in rcm_collector */
+	uint32_t	rcm_ready_n;		/* number of entries in ready list */
+	cachefs_block_t	rcm_ready;		/* ready recycling list */
+	cachefs_block_t	rcm_collector;		/* current reclamation collector node */
+	cachefs_block_t	rcm_spare;		/* spare block for emergency rcm node allocation */
+
+	/* data tree recycling tracking */
+	cachefs_block_t	rcy_processor;		/* list we're processing */
+	cachefs_block_t	rcy_stack;		/* stack of full lists */
+	cachefs_block_t	rcy_collector;		/* list we're collecting in */
+	int16_t		rcy_offsets[8];		/* offset of pointer to recycle in tree */
+	int16_t		rcy_procsp;		/* processor pop point */
+	int16_t		rcy_collsp;		/* collector push point */
+
+	/* object ID reaping (used to clobber child objects) */
+	cachefs_block_t	reap_processor;		/* stack of objids to reap */
+	cachefs_block_t	reap_stack;		/* objids being reaped */
+	cachefs_block_t	reap_collector;		/* list we're collecting in */
+	int16_t		reap_proccnt;		/* count in processor block */
+	int16_t		reap_collsp;		/* collector push point */
+
+	/* space balancing */
+	uint32_t	space_alloc;		/* space on alloc stacks */
+	uint32_t	space_rcm;		/* space on reclaim stack */
+	uint32_t	space_rcy;		/* space awaiting recycling */
+	uint32_t	space_reap;		/* space awaiting reaping */
+	uint32_t	space_alrcm_nodes;	/* alloc/reclaim stack node space used */
+	uint32_t	space_meta;		/* tree metadata space used */
+	uint32_t	space_data_used;	/* data space used */
+	uint32_t	space_data_pinned;	/* data space pinned and used */
+	uint32_t	space_data_rsv_data;	/* data space reserved and used */
+	uint32_t	space_data_rsv_pin;	/* data space reserved, pinned and used */
+	uint32_t	space_data_reserved;	/* data space reserved but unused */
+};
+
+#define CACHEFS_ONDISC_JNL_MIN_REC_BITS	9	/* log2 minimum journal record size */
+#define CACHEFS_ONDISC_JNL_NUMBLOCK_BITS 4	/* log2 number of blocks in the journal */
+#define CACHEFS_ONDISC_JNL_MIN_REC_SIZE (1 << CACHEFS_ONDISC_JNL_MIN_REC_BITS)
+#define CACHEFS_ONDISC_JNL_NUMBLOCKS (1 << CACHEFS_ONDISC_JNL_NUMBLOCK_BITS)
+
+#define CACHEFS_ONDISC_JNL_NUMENTS \
+	(1 << (CACHEFS_ONDISC_JNL_NUMBLOCK_BITS + PAGE_SHIFT - CACHEFS_ONDISC_JNL_MIN_REC_BITS))
+
+#define CACHEFS_ONDISC_JNL_SLOT_MASK \
+	(CACHEFS_ONDISC_JNL_NUMENTS - 1)
+
+#endif /* _FS_CACHEFS_LAYOUT_H */
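Assuming 4KB pages (PAGE_SHIFT = 12, an assumption of this example rather than part of the format), the geometry defined by these macros works out to 512-byte records, 16 journal blocks and 128 slots.  The following standalone snippet just evaluates the macros to confirm that.

#include <stdio.h>

#define PAGE_SHIFT		12	/* assumption: 4KB pages */
#define JNL_MIN_REC_BITS	9
#define JNL_NUMBLOCK_BITS	4
#define JNL_MIN_REC_SIZE	(1 << JNL_MIN_REC_BITS)
#define JNL_NUMBLOCKS		(1 << JNL_NUMBLOCK_BITS)
#define JNL_NUMENTS	(1 << (JNL_NUMBLOCK_BITS + PAGE_SHIFT - JNL_MIN_REC_BITS))
#define JNL_SLOT_MASK		(JNL_NUMENTS - 1)

int main(void)
{
	/* cross-check: 16 blocks * 4096 bytes / 512-byte records = 128 slots */
	printf("record size %d bytes, %d blocks, %d slots, slot mask %#x\n",
	       JNL_MIN_REC_SIZE, JNL_NUMBLOCKS, JNL_NUMENTS, JNL_SLOT_MASK);
	/* prints: record size 512 bytes, 16 blocks, 128 slots, slot mask 0x7f */
	return 0;
}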
diff -uNrp linux-2.6.14-mm2/fs/cachefs/inode.c linux-2.6.14-mm2-cachefs/fs/cachefs/inode.c
--- linux-2.6.14-mm2/fs/cachefs/inode.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/inode.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,224 @@
+/* inode.c: general cache filesystem inode handling code
+ *
+ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <asm/uaccess.h>
+#include <asm/div64.h>
+#include "cachefs-int.h"
+#include "cachefs-inode.h"
+
+static int cachefs_inode_getattr(struct vfsmount *mnt, struct dentry *dentry,
+				 struct kstat *stat);
+
+static struct inode_operations cachefs_inode_operations = {
+	.getattr	= cachefs_inode_getattr,
+};
+
+static struct file_operations cachefs_file_operations = {
+	.read		= generic_file_read,
+	.write		= generic_file_write,
+};
+
+/*****************************************************************************/
+/*
+ * set up a status file virtual inode
+ */
+static void cachefs_iget_status_file(struct cachefs_inode *inode)
+{
+	inode->vfs_inode.i_mode		= S_IFREG | S_IRUGO;
+	inode->vfs_inode.i_uid		= 0;
+	inode->vfs_inode.i_gid		= 0;
+	inode->vfs_inode.i_nlink	= 1;
+	inode->vfs_inode.i_size		= 0;
+	inode->vfs_inode.i_atime	= CURRENT_TIME;
+	inode->vfs_inode.i_mtime	= CURRENT_TIME;
+	inode->vfs_inode.i_ctime	= CURRENT_TIME;
+	inode->vfs_inode.i_blksize	= PAGE_SIZE;
+	inode->vfs_inode.i_blkbits	= PAGE_SHIFT;
+	inode->vfs_inode.i_blocks	= 0;
+	inode->vfs_inode.i_version	= 1;
+	inode->vfs_inode.i_flags	|= S_NOATIME;
+	inode->vfs_inode.i_op		= &cachefs_status_inode_operations;
+	inode->vfs_inode.i_fop		= &cachefs_status_file_operations;
+
+} /* end cachefs_iget_status_file() */
+
+/*****************************************************************************/
+/*
+ * set up the metadata file inode (such as the inode we use to represent the
+ * entire block device)
+ */
+static void cachefs_iget_meta_file(struct cachefs_inode *inode,
+				   unsigned blocks)
+{
+	inode->vfs_inode.i_mode		= S_IFREG | S_IRUGO;
+	inode->vfs_inode.i_uid		= 0;
+	inode->vfs_inode.i_gid		= 0;
+	inode->vfs_inode.i_nlink	= 1;
+	inode->vfs_inode.i_size		= (unsigned long) blocks << PAGE_SHIFT;
+	inode->vfs_inode.i_atime	= CURRENT_TIME;
+	inode->vfs_inode.i_mtime	= CURRENT_TIME;
+	inode->vfs_inode.i_ctime	= CURRENT_TIME;
+	inode->vfs_inode.i_blksize	= PAGE_SIZE;
+	inode->vfs_inode.i_blkbits	= PAGE_SHIFT;
+	inode->vfs_inode.i_blocks	= blocks;
+	inode->vfs_inode.i_version	= 1;
+	inode->vfs_inode.i_flags	|= S_NOATIME;
+	inode->vfs_inode.i_op		= &cachefs_inode_operations;
+	inode->vfs_inode.i_fop		= &cachefs_file_operations;
+	inode->vfs_inode.i_mapping->a_ops = &cachefs_meta_addrspace_operations;
+
+} /* end cachefs_iget_meta_file() */
+
+/*****************************************************************************/
+/*
+ * set up the inode attributes for the root directory
+ */
+static void cachefs_iget_root_dir(struct cachefs_inode *inode)
+{
+	inode->vfs_inode.i_mode		= S_IFDIR | S_IRUGO | S_IXUGO;
+	inode->vfs_inode.i_uid		= 0;
+	inode->vfs_inode.i_gid		= 0;
+	inode->vfs_inode.i_nlink	= 2;
+	inode->vfs_inode.i_size		= 0;
+	inode->vfs_inode.i_atime	= CURRENT_TIME;
+	inode->vfs_inode.i_mtime	= CURRENT_TIME;
+	inode->vfs_inode.i_ctime	= CURRENT_TIME;
+	inode->vfs_inode.i_blksize	= PAGE_SIZE;
+	inode->vfs_inode.i_blkbits	= PAGE_SHIFT;
+	inode->vfs_inode.i_blocks	= 0;
+	inode->vfs_inode.i_version	= 1;
+	inode->vfs_inode.i_flags	|= S_NOATIME;
+	inode->vfs_inode.i_op		= &cachefs_root_inode_operations;
+	inode->vfs_inode.i_fop		= &cachefs_root_file_operations;
+
+} /* end cachefs_iget_root_dir() */
+
+/*****************************************************************************/
+/*
+ * attempt to retrieve the inode for a cached file
+ */
+struct cachefs_inode *cachefs_iget(struct cachefs_super *super, ino_t ino)
+{
+	struct cachefs_inode *inode;
+	struct inode *vfs_inode;
+	loff_t nblocks;
+	int ret;
+
+	_enter(",%lu,", ino);
+
+	ASSERT(ino != 0);
+
+	/* it does reside in this cache - create an inode for it */
+	vfs_inode = iget_locked(super->sb, ino);
+	if (!vfs_inode) {
+		_leave(" = -ENOMEM");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	inode = CACHEFS_FS_I(vfs_inode);
+
+	/* deal with an existing inode */
+	if (!(inode->vfs_inode.i_state & I_NEW)) {
+		_leave(" = %p [exist]", inode);
+		return inode;
+	}
+
+	/* new inode */
+	switch (ino) {
+		/* the virtual inode that mirrors the block device */
+	case CACHEFS_INO_IMETA:
+		nblocks = i_size_read(super->sb->s_bdev->bd_inode);
+		do_div(nblocks, PAGE_SIZE);
+		if (nblocks > UINT_MAX)
+			nblocks = UINT_MAX;
+
+		cachefs_iget_meta_file(inode, nblocks);
+		break;
+
+		/* they've asked for the status file virtual inode */
+	case CACHEFS_INO_STATUS:
+		cachefs_iget_status_file(inode);
+		break;
+
+		/* they've asked for an index or a data file cache inode */
+	case CACHEFS_INO_ROOTDIR:
+		cachefs_iget_root_dir(inode);
+		break;
+
+	default:
+		ret = -ENOENT;
+		goto bad_inode;
+	}
+
+	/* success */
+	unlock_new_inode(&inode->vfs_inode);
+
+	_leave(" = %p [new]", inode);
+	return inode;
+
+	/* failure */
+ bad_inode:
+	make_bad_inode(&inode->vfs_inode);
+	unlock_new_inode(&inode->vfs_inode);
+	iput(&inode->vfs_inode);
+
+	_leave(" = %d [bad]", ret);
+	return ERR_PTR(ret);
+
+} /* end cachefs_iget() */
+
+/*****************************************************************************/
+/*
+ * write a cache inode back to disc
+ * - don't use generic_file_write() to write out the meta-data file's meta-data
+ *   as it updates the mtime & ctime and marks the inode dirty again
+ */
+int cachefs_write_inode(struct inode *vfs_inode, int sync)
+{
+	_enter("{sb=%p ino=%lu},%d", vfs_inode->i_sb, vfs_inode->i_ino, sync);
+	return 0;
+
+} /* end cachefs_write_inode() */
+
+/*****************************************************************************/
+/*
+ * clear an inode
+ */
+void cachefs_clear_inode(struct inode *vfs_inode)
+{
+	_enter("{ino=%lu nl=%u}", vfs_inode->i_ino, vfs_inode->i_nlink);
+
+} /* end cachefs_clear_inode() */
+
+/*****************************************************************************/
+/*
+ * read the attributes of an inode
+ */
+static int cachefs_inode_getattr(struct vfsmount *mnt, struct dentry *dentry,
+				 struct kstat *stat)
+{
+	_enter("{ ino=%lu }", dentry->d_inode->i_ino);
+
+	generic_fillattr(dentry->d_inode, stat);
+
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_inode_getattr() */
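As a hedged usage sketch of the API above (the real callers live in super.c and rootdir.c, which are in later hunks, so this is illustrative only), fetching the metadata inode that covers the block device follows the usual ERR_PTR convention:

/* illustrative caller, not part of the patch */
static int cachefs_example_get_imeta(struct cachefs_super *super)
{
	struct cachefs_inode *imeta;

	imeta = cachefs_iget(super, CACHEFS_INO_IMETA);
	if (IS_ERR(imeta))
		return PTR_ERR(imeta);

	super->imeta = &imeta->vfs_inode;	/* kept for later metadata I/O */
	return 0;
}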
diff -uNrp linux-2.6.14-mm2/fs/cachefs/interface.c linux-2.6.14-mm2-cachefs/fs/cachefs/interface.c
--- linux-2.6.14-mm2/fs/cachefs/interface.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/interface.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,724 @@
+/* interface.c: filesystem cache interface
+ *
+ * Copyright (C) 2004-5 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/bio.h>
+#include "cachefs-int.h"
+
+#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
+
+/*****************************************************************************/
+/*
+ * look up the nominated node in this cache, creating it if necessary
+ */
+static struct fscache_object *cachefs_lookup_object(struct fscache_cache *_cache,
+						    struct fscache_object *_parent,
+						    struct fscache_cookie *cookie)
+{
+	struct cachefs_object *parent, *object;
+	struct cachefs_ondisc_leaf *leaf;
+	struct cachefs_super *super;
+	uint16_t maxklen, maxdlen, dlen;
+	loff_t bigtmp;
+	void *dbuf;
+	int ret;
+
+	ASSERT(_parent);
+
+	super = container_of(_cache, struct cachefs_super, cache);
+	parent = container_of(_parent, struct cachefs_object, fscache);
+
+	_enter("{%s},%p{%llx,%llx},%p",
+	       super->cache.identifier,
+	       parent, parent->pobjid, parent->objid,
+	       cookie);
+
+	ASSERT(parent->objid != 0);
+
+	/* mark the parent as having children */
+	if (!(parent->flags & CACHEFS_ONDISC_OBJECT_HAS_CHILDREN)) {
+		ASSERT(_parent != super->cache.fsdef);
+		parent->flags |= CACHEFS_ONDISC_OBJECT_HAS_CHILDREN;
+		cachefs_tree_update_object(super, parent);
+	}
+
+	/* create a new object record and a temporary leaf image */
+	object = kmem_cache_alloc(cachefs_object_jar, SLAB_KERNEL);
+	if (!object)
+		goto nomem;
+
+#if CACHEFS_DEBUG_OBJECT_ACCOUNTING
+	kdebug("- ALLOC OBJ %p", object);
+#endif
+
+	leaf = kmalloc(CACHEFS_ONDISC_LEAF_SIZE, GFP_KERNEL);
+	if (!leaf)
+		goto nomem_o;
+
+	/* initialise the object from the parent index */
+	atomic_set(&object->usage, 1);
+	atomic_set(&object->fscache_usage, 1);
+
+	fscache_object_init(&object->fscache);
+	object->fscache.cookie = cookie;
+	object->fscache.cache = &super->cache;
+
+	object->pobjid	= parent->objid;
+	object->objid	= 0;
+	object->node	= NULL;
+	object->offset	= 0xffffU;
+	object->keylen	= 0;
+	object->type	= cookie->def->type;
+	object->data_levels = 0;
+
+	/* initialise the leaf image */
+	memset(leaf, CACHEFS_EMPTY_FILL, CACHEFS_ONDISC_LEAF_SIZE);
+
+	leaf->ptr			= CACHEFS_NULL_PTR;
+	leaf->u.object.nblocks		= 0;
+	leaf->u.object.data_levels	= 0;
+	leaf->u.object.netfs_dlen	= 0;
+	leaf->u.object.flags		= 0;
+	leaf->u.object.object_type	= object->type;
+
+	switch (object->type) {
+	case FSCACHE_COOKIE_TYPE_INDEX:
+		leaf->type = CACHEFS_ONDISC_OBJTYPE_INDEX_OBJECT;
+		break;
+	case FSCACHE_COOKIE_TYPE_DATAFILE:
+		leaf->type = CACHEFS_ONDISC_OBJTYPE_DATA_OBJECT;
+		break;
+	default:
+		leaf->type = CACHEFS_ONDISC_OBJTYPE_OTHER_OBJECT;
+		break;
+	}
+
+	memcpy(leaf->u.object.object_name,
+	       cookie->def->name,
+	       sizeof(leaf->u.object.object_name));
+
+	memcpy(&leaf->u.object.parent, &object->pobjid,
+	       sizeof(leaf->u.object.parent));
+
+	/* record the key on the leaf image */
+	maxklen = CACHEFS_ONDISC_LEAF_SIZE;
+	maxklen -= offsetof(struct cachefs_ondisc_leaf, u.object.netfs_data);
+
+	leaf->u.object.netfs_klen =
+		cookie->def->get_key(cookie->netfs_data,
+				     leaf->u.object.netfs_data,
+				     maxklen);
+
+	BUG_ON(leaf->u.object.netfs_klen > maxklen);
+
+	ret = cachefs_digest_key(object, leaf);
+	if (ret < 0)
+		goto error;
+
+	/* and grab the auxiliary data too */
+	memset(&leaf->u.object.size, 0, sizeof(leaf->u.object.size));
+
+#if 0
+	if (cookie->def->get_attr) {
+		uint64_t fsize;
+		cookie->def->get_attr(cookie->netfs_data, &fsize);
+		memcpy(&leaf->u.object.size, &fsize,
+		       sizeof(leaf->u.object.size));
+	}
+#endif
+
+	if (cookie->def->get_aux) {
+		maxdlen = CACHEFS_ONDISC_LEAF_SIZE;
+		maxdlen -= offsetof(struct cachefs_ondisc_leaf,
+				    u.object.netfs_data);
+		maxdlen -= leaf->u.object.netfs_klen;
+		dbuf = leaf->u.object.netfs_data;
+		dbuf += leaf->u.object.netfs_klen;
+
+		dlen = cookie->def->get_aux(cookie->netfs_data, dbuf, maxdlen);
+		BUG_ON(dlen > maxdlen);
+		leaf->u.object.netfs_dlen = dlen;
+	}
+
+	/* attempt the lookup */
+	ret = cachefs_tree_lookup_object(super, object, leaf, 1);
+	if (ret < 0)
+		goto error;
+
+	kfree(leaf);
+
+	/* work out the page limit */
+	bigtmp = object->i_size;
+	bigtmp += PAGE_SIZE - 1;
+	bigtmp &= ~((loff_t) PAGE_SIZE - 1);
+	if (bigtmp < object->i_size)
+		bigtmp = ((loff_t) 0) - 1;
+	bigtmp >>= PAGE_SHIFT;
+
+	object->page_limit = (bigtmp > ULONG_MAX) ? ULONG_MAX : bigtmp;
+
+	atomic_inc(&super->cnt_objects);
+
+	ASSERT(object->node);
+
+	_leave(" = %p{%p,%llx}", &object->fscache, object->node, object->objid);
+	return &object->fscache;
+
+error:
+	kfree(leaf);
+	atomic_set(&object->fscache_usage, 0);
+	cachefs_object_put(object);
+	_leave(" = %d [error]", ret);
+	return ERR_PTR(ret);
+
+nomem_o:
+	kmem_cache_free(cachefs_object_jar, object);
+nomem:
+	_leave(" = -ENOMEM");
+	return ERR_PTR(-ENOMEM);
+
+} /* end cachefs_lookup_object() */
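The page-limit computation at the end of the lookup rounds i_size up to whole pages, saturates if the rounding overflows, and clamps the result to ULONG_MAX.  A standalone arithmetic check of that behaviour, assuming 4KB pages (the kernel code does the same thing with loff_t; unsigned arithmetic is used here only to keep the example short):

#include <stdio.h>
#include <stdint.h>
#include <limits.h>

#define EX_PAGE_SHIFT	12			/* assumption: 4KB pages */
#define EX_PAGE_SIZE	(1ULL << EX_PAGE_SHIFT)

static unsigned long page_limit(uint64_t i_size)
{
	uint64_t pages = i_size + EX_PAGE_SIZE - 1;	/* round up to a whole page */

	if (pages < i_size)				/* the rounding overflowed */
		return ULONG_MAX;
	pages >>= EX_PAGE_SHIFT;
	return pages > ULONG_MAX ? ULONG_MAX : (unsigned long) pages;
}

int main(void)
{
	printf("%lu\n", page_limit(5000));	/* 2 - two pages cover 5000 bytes */
	printf("%lu\n", page_limit(0));		/* 0 - an empty file caches nothing */
	return 0;
}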
+
+/*****************************************************************************/
+/*
+ * increment the usage count on an inode object (may fail if unmounting)
+ */
+static struct fscache_object *cachefs_grab_object(struct fscache_object *_object)
+{
+	struct cachefs_object *object;
+
+	_enter("%p", _object);
+
+	object = container_of(_object, struct cachefs_object, fscache);
+
+#ifdef CACHEFS_DEBUG_SLAB
+	ASSERT((atomic_read(&object->fscache_usage) & 0xffff0000) != 0x6b6b0000);
+#endif
+
+	atomic_inc(&object->fscache_usage);
+	return &object->fscache;
+
+} /* end cachefs_grab_object() */
+
+/*****************************************************************************/
+/*
+ * lock the semaphore on an object
+ */
+static void cachefs_lock_object(struct fscache_object *_object)
+{
+	struct cachefs_object *object;
+
+	_enter("%p", _object);
+
+	object = container_of(_object, struct cachefs_object, fscache);
+
+#ifdef CACHEFS_DEBUG_SLAB
+	ASSERT((atomic_read(&object->fscache_usage) & 0xffff0000) != 0x6b6b0000);
+#endif
+
+	down_write(&object->sem);
+
+} /* end cachefs_lock_object() */
+
+/*****************************************************************************/
+/*
+ * unlock the semaphore on an object
+ */
+static void cachefs_unlock_object(struct fscache_object *_object)
+{
+	struct cachefs_object *object;
+
+	_enter("%p", _object);
+
+	object = container_of(_object, struct cachefs_object, fscache);
+	up_write(&object->sem);
+
+} /* end cachefs_unlock_object() */
+
+/*****************************************************************************/
+/*
+ * update the auxiliary data for an object on disk
+ */
+static void cachefs_update_object(struct fscache_object *_object)
+{
+	struct cachefs_object *object;
+	struct cachefs_super *super;
+
+	_enter("%p", _object);
+
+	object = container_of(_object, struct cachefs_object, fscache);
+	super = container_of(object->fscache.cache, struct cachefs_super, cache);
+
+	cachefs_tree_update_object(super, object);
+
+} /* end cachefs_update_object() */
+
+/*****************************************************************************/
+/*
+ * dispose of a reference to an object
+ */
+static void cachefs_put_object(struct fscache_object *_object)
+{
+	struct cachefs_object *object;
+	struct cachefs_super *super;
+
+	ASSERT(_object);
+
+	object = container_of(_object, struct cachefs_object, fscache);
+	_enter("%p{%d,%llx}",
+	       object, atomic_read(&object->usage), object->objid);
+
+	ASSERT(object);
+
+	super = container_of(object->fscache.cache,
+			     struct cachefs_super, cache);
+
+	ASSERTIF(!object->node, _object == super->cache.fsdef);
+
+#ifdef CACHEFS_DEBUG_SLAB
+	ASSERT((atomic_read(&object->fscache_usage) & 0xffff0000) != 0x6b6b0000);
+#endif
+
+	if (!atomic_dec_and_test(&object->fscache_usage))
+		return;
+
+	_debug("- kill object %p", object);
+
+	atomic_dec(&super->cnt_objects);
+
+	/* dispose of the retained level 0 data page */
+	if (object->data_levels == 0) {
+		_debug("- kill level 0 block");
+
+		ASSERT(atomic_read(&object->page_usage) <= 1);
+		atomic_set(&object->page_usage, 0);
+	}
+	/* dispose of all the retained level 1 data ptr blocks */
+	else if (object->dataptrblks.rb_node) {
+		struct cachefs_tree *dataptrblk;
+		struct rb_node *p, *next;
+		int count;
+
+		_debug("- kill level 1 blocks");
+
+		next = object->dataptrblks.rb_node;
+		while (p = next, p) {
+			while (p->rb_left || p->rb_right)
+				p = p->rb_left ?: p->rb_right;
+
+			next = p->rb_parent;
+			if (next) {
+				if (next->rb_left == p)
+					next->rb_left = NULL;
+				else if (next->rb_right == p)
+					next->rb_right = NULL;
+				else
+					BUG();
+			}
+			else if (object->dataptrblks.rb_node == p) {
+				object->dataptrblks.rb_node = NULL;
+			}
+			else {
+				BUG();
+			}
+			dataptrblk = rb_entry(p, struct cachefs_tree, aux_rb);
+			count = atomic_read(&dataptrblk->netfs_usage);
+
+			_debug("- - blk %lx{%d}", dataptrblk->index, count);
+
+			ASSERT(count > 0);
+			atomic_sub(count, &dataptrblk->netfs_usage);
+			atomic_sub(count, &object->page_usage);
+			dataptrblk->object = NULL;
+			cachefs_tree_put(dataptrblk);
+		}
+
+		ASSERT(!object->dataptrblks.rb_node);
+	}
+
+	if (atomic_read(&object->page_usage) != 0) {
+		printk(KERN_ERR "CacheFS: "
+		       "%d netfs pages remain held on object",
+		       atomic_read(&object->page_usage));
+		BUG();
+	}
+
+	/* delete retired objects */
+	if (test_bit(CACHEFS_SUPER_AUTO_DELETE, &super->options) &&
+//	    object->type == FSCACHE_COOKIE_TYPE_INDEX &&
+	    _object != super->cache.fsdef
+	    ) {
+		set_bit(FSCACHE_OBJECT_RECYCLING, &object->fscache.flags);
+	}
+
+	if (test_bit(FSCACHE_OBJECT_RECYCLING, &object->fscache.flags) &&
+	    _object != super->cache.fsdef)
+		cachefs_tree_delete(super, object);
+
+	cachefs_object_put(object);
+	_leave("");
+
+} /* end cachefs_put_object() */
+
+/*****************************************************************************/
+/*
+ * sync a cache
+ */
+static void cachefs_sync_cache(struct fscache_cache *cache)
+{
+	_enter("%p", cache);
+
+	/* make sure all pages pinned by operations on behalf of the netfs are
+	 * written to disc */
+	cachefs_sync(container_of(cache, struct cachefs_super, cache), 1, 0);
+
+} /* end cachefs_sync_cache() */
+
+/*****************************************************************************/
+/*
+ * set the data size on an object
+ */
+static int cachefs_set_i_size(struct fscache_object *_object, loff_t i_size)
+{
+	struct cachefs_object *object;
+
+	_enter("%p,%llu", _object, i_size);
+
+	object = container_of(_object, struct cachefs_object, fscache);
+
+	if (i_size == object->i_size)
+		return 0;
+
+	return cachefs_data_set_i_size(object, i_size);
+
+} /* end cachefs_set_i_size() */
+
+/*****************************************************************************/
+/*
+ * handle notifications about write operations on a netfs page
+ */
+int cachefs_netfs_io_completion(struct bio *bio,
+				unsigned int bytes_done,
+				int error)
+{
+	struct cachefs_io_callback *callback;
+	struct bio_vec *bvec;
+
+	_enter("%p{%u},%u,%d", bio, bio->bi_size, bytes_done, error);
+
+	/* the operation may not yet be complete */
+	if (bio->bi_size > 0) {
+		_leave(" = 1");
+		return 1;
+	}
+
+	callback = bio->bi_private;
+	bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+
+	do {
+		struct page *page = bvec->bv_page;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		_debug("DONE PAGE %p{%d,%lx,%lx}",
+		       page, page_count(page), page->index, page->flags);
+
+		/* let the netfs know that the data is now safely written or
+		 * that we've failed utterly */
+		callback->callback_func(page, callback->callback_data, error);
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+	cachefs_io_callback_put(callback);
+
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_netfs_io_completion() */
+
+/*****************************************************************************/
+/*
+ * read a page from the cache or allocate a block in which to store it
+ * - cache withdrawal is prevented by the caller
+ * - returns -EINTR if interrupted
+ * - returns -ENOMEM if we ran out of memory
+ * - returns -ENOBUFS if no buffers can be made available
+ * - returns -ENOBUFS if page is beyond EOF
+ * - if the page is backed by a block in the cache:
+ *   - the page record will be left attached to the object
+ *   - a read will be started which will call the callback on completion
+ *   - 0 will be returned
+ * - else if the page is unbacked:
+ *   - the metadata will be retained
+ *   - -ENODATA will be returned
+ */
+static int cachefs_read_or_alloc_page(struct fscache_object *_object,
+				      struct page *page,
+				      fscache_rw_complete_t callback_func,
+				      void *callback_data,
+				      unsigned long gfp)
+{
+	struct cachefs_io_callback *callback;
+	struct cachefs_object *object;
+	struct cachefs_super *super;
+	int ret;
+
+	object = container_of(_object, struct cachefs_object, fscache);
+	super = container_of(object->fscache.cache, struct cachefs_super, cache);
+
+	//printk("\n");
+	//printk("========================================\n");
+	//printk("\n");
+
+	_enter("{%llx},{%lx},,,", object->objid, page->index);
+
+	if (object->page_limit != ULONG_MAX &&
+	    page->index >= object->page_limit
+	    ) {
+		_leave(" = -ENOBUFS [limit]");
+		return -ENOBUFS;
+	}
+
+	callback = kmalloc(sizeof(*callback), gfp & GFP_LEVEL_MASK);
+	if (!callback) {
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	memset(callback, 0, sizeof(*callback));
+
+	callback->super = super;
+	callback->callback_func = callback_func;
+	callback->callback_data = callback_data;
+	atomic_set(&callback->usage, 1);
+
+	/* prevent the page from being recycled before we've read it */
+	callback->jnl = cachefs_journal_get(super);
+
+	ret = cachefs_data_read_page(super, object, page, callback, gfp);
+
+	cachefs_io_callback_put(callback);
+	_leave(" = %d", ret);
+	return ret;
+
+} /* end cachefs_read_or_alloc_page() */
+
+/*****************************************************************************/
+/*
+ * read a list of pages from the cache or allocate blocks in which to store
+ * them
+ */
+static int cachefs_read_or_alloc_pages(struct fscache_object *_object,
+				       struct address_space *mapping,
+				       struct list_head *pages,
+				       unsigned *nr_pages,
+				       fscache_rw_complete_t callback_func,
+				       void *callback_data,
+				       unsigned long gfp)
+{
+	struct cachefs_io_callback *callback;
+	struct cachefs_object *object;
+	struct cachefs_super *super;
+	int ret;
+
+	object = container_of(_object, struct cachefs_object, fscache);
+	super = container_of(object->fscache.cache, struct cachefs_super, cache);
+
+	_enter("{%llx},,%d,,", object->objid, *nr_pages);
+
+	callback = kmalloc(sizeof(*callback), gfp & GFP_LEVEL_MASK);
+	if (!callback) {
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	memset(callback, 0, sizeof(*callback));
+
+	callback->super = super;
+	callback->callback_func = callback_func;
+	callback->callback_data = callback_data;
+	atomic_set(&callback->usage, 1);
+
+	/* prevent the pages from being recycled before we've read them */
+	callback->jnl = cachefs_journal_get(super);
+
+	ret = cachefs_data_read_pages(super, object, mapping, pages, nr_pages,
+				      callback, gfp);
+
+	cachefs_io_callback_put(callback);
+
+	_leave(" = %d", ret);
+	return ret;
+
+} /* end cachefs_read_or_alloc_pages() */
+
+/*****************************************************************************/
+/*
+ * allocate a block in the cache in which to store a page
+ * - cache withdrawal is prevented by the caller
+ * - returns -EINTR if interrupted
+ * - returns -ENOMEM if we ran out of memory
+ * - returns -ENOBUFS if no buffers can be made available
+ * - returns -ENOBUFS if the page is beyond EOF
+ * - otherwise:
+ *   - the metadata will be retained
+ *   - 0 will be returned
+ */
+static int cachefs_allocate_page(struct fscache_object *_object,
+				 struct page *page,
+				 unsigned long gfp)
+{
+	struct cachefs_object *object;
+	struct cachefs_super *super;
+
+	object = container_of(_object, struct cachefs_object, fscache);
+	super = container_of(object->fscache.cache, struct cachefs_super, cache);
+
+	_enter("{%llx},{%lx},,,", object->objid, page->index);
+
+	if (object->page_limit != ULONG_MAX &&
+	    page->index >= object->page_limit
+	    ) {
+		_leave(" = -ENOBUFS [limit]");
+		return -ENOBUFS;
+	}
+
+	return cachefs_data_alloc_page(super, object, page, gfp);
+
+} /* end cachefs_allocate_page() */
+
+/*****************************************************************************/
+/*
+ * request a page be stored in the cache
+ * - cache withdrawal is prevented by the caller
+ * - this request may be ignored if there's no cache block available, in which
+ *   case -ENOBUFS will be returned
+ * - if a cache block was already allocated:
+ *   - the page cookie will be updated to reflect the block selected
+ *   - a BIO will have been dispatched to write the page - the BIO's completion
+ *     routine will call callback_func
+ *   - returns 0
+ */
+static int cachefs_write_page(struct fscache_object *_object,
+			      struct page *page,
+			      fscache_rw_complete_t callback_func,
+			      void *callback_data,
+			      unsigned long gfp)
+{
+	struct cachefs_object *object;
+	struct cachefs_io_callback *callback;
+	struct cachefs_super *super;
+	struct pagevec pagevec;
+	int ret;
+
+	object = container_of(_object, struct cachefs_object, fscache);
+	super = container_of(object->fscache.cache, struct cachefs_super, cache);
+
+	_enter("{%llx},%p{%lx},,,", object->objid, page, page->index);
+
+	if (object->page_limit != ULONG_MAX &&
+	    page->index >= object->page_limit
+	    ) {
+		_leave(" = -ENOBUFS [limit]");
+		return -ENOBUFS;
+	}
+
+	callback = kmalloc(sizeof(*callback), gfp & GFP_LEVEL_MASK);
+	if (!callback) {
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	memset(callback, 0, sizeof(*callback));
+
+	callback->super = super;
+	callback->callback_func = callback_func;
+	callback->callback_data = callback_data;
+	atomic_set(&callback->usage, 1);
+
+	/* build a list of the page to be written */
+	pagevec_init(&pagevec, 0);
+	pagevec_add(&pagevec, page);
+
+	/* install a block in the tree and write to it */
+	ret = cachefs_data_write(super, object, &pagevec, callback, gfp);
+
+	cachefs_io_callback_put(callback);
+	_leave(" = %d", ret);
+	return ret;
+
+} /* end cachefs_write_page() */
+
+/*****************************************************************************/
+/*
+ * detach a backing block from a page
+ * - cache withdrawal is prevented by the caller
+ */
+static void cachefs_uncache_pages(struct fscache_object *_object,
+				  struct pagevec *pagevec)
+{
+	struct cachefs_object *object;
+	struct cachefs_super *super;
+	unsigned long loop;
+
+	object = container_of(_object, struct cachefs_object, fscache);
+	super = container_of(object->fscache.cache, struct cachefs_super, cache);
+
+	_enter("{%llx},{%lu,%lx},,,",
+	       object->objid, pagevec->nr, pagevec->pages[0]->index);
+
+	loop = 0;
+	do {
+		loop = cachefs_data_uncache(super, object, pagevec, loop);
+	} while (loop < pagevec->nr);
+
+	_leave("");
+
+} /* end cachefs_uncache_pages() */
+
+/*****************************************************************************/
+/*
+ * dissociate a cache from all the pages it was backing
+ */
+static void cachefs_dissociate_pages(struct fscache_cache *cache)
+{
+	_enter("");
+
+} /* end cachefs_dissociate_pages() */
+
+struct fscache_cache_ops cachefs_cache_ops = {
+	.name			= "cachefs",
+	.lookup_object		= cachefs_lookup_object,
+	.grab_object		= cachefs_grab_object,
+	.lock_object		= cachefs_lock_object,
+	.unlock_object		= cachefs_unlock_object,
+	.update_object		= cachefs_update_object,
+	.put_object		= cachefs_put_object,
+	.sync_cache		= cachefs_sync_cache,
+	.set_i_size		= cachefs_set_i_size,
+	.read_or_alloc_page	= cachefs_read_or_alloc_page,
+	.read_or_alloc_pages	= cachefs_read_or_alloc_pages,
+	.allocate_page		= cachefs_allocate_page,
+	.write_page		= cachefs_write_page,
+	.uncache_pages		= cachefs_uncache_pages,
+	.dissociate_pages	= cachefs_dissociate_pages,
+};
diff -uNrp linux-2.6.14-mm2/fs/cachefs/journal.c linux-2.6.14-mm2-cachefs/fs/cachefs/journal.c
--- linux-2.6.14-mm2/fs/cachefs/journal.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/journal.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,483 @@
+/* journal.c: CacheFS metadata tree journalling
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/writeback.h>
+#include "cachefs-int.h"
+
+struct cachefs_sync_request {
+	struct list_head	link;
+	struct task_struct	*task;
+	int			done;
+};
+
+kmem_cache_t *cachefs_journal_jar;
+
+/*****************************************************************************/
+/*
+ * autowander timeout handler
+ */
+void cachefs_journal_wander_timeout(unsigned long data)
+{
+	struct cachefs_super *super = (struct cachefs_super *) data;
+
+	_enter("");
+
+	set_bit(CACHEFS_SUPER_WANDER_TIMEOUT, &super->flags);
+	wake_up(&super->dmn_sleepq);
+
+} /* end cachefs_journal_wander_timeout() */
+
+/*****************************************************************************/
+/*
+ * handle completion of journal page write
+ */
+static int cachefs_journal_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
+{
+	struct cachefs_journal *jnl;
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+
+	if (bio->bi_size)
+		return 1;
+
+	_enter("");
+
+	do {
+		struct page *page = bvec->bv_page;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (!uptodate)
+			SetPageError(page);
+
+	} while (bvec >= bio->bi_io_vec);
+
+	jnl = bio->bi_private;
+	bio_put(bio);
+	cachefs_journal_release(jnl);
+
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_journal_end_io_write() */
+
+/*****************************************************************************/
+/*
+ * cause the metadata tree to wander
+ * - this forces the new root to be committed to the journal
+ * - the various allocation and recycling stacks are also committed
+ */
+static int cachefs_journal_wander(struct cachefs_super *super, int flush)
+{
+	struct cachefs_ondisc_journal *jentry;
+	struct cachefs_journal *newjnl, *prev;
+	unsigned long flags, offset;
+	void *data;
+
+	_enter("{%s}", super->cache.identifier);
+
+	/* allocate a new transaction record */
+	newjnl = kmem_cache_alloc(cachefs_journal_jar, GFP_KERNEL);
+	if (!newjnl) {
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&newjnl->link);
+	INIT_LIST_HEAD(&newjnl->syncwq);
+	newjnl->super = super;
+	newjnl->dependent = NULL;
+	atomic_set(&newjnl->remaining, 2);
+	newjnl->journalled = 0;
+	newjnl->alloc_sfree = 0;
+	newjnl->alloc_sfree_n = 0;
+
+	/* reserve a slot */
+	down(&super->jnl_page_sem);
+
+	/* prevent other people from writing to the tree whilst we're busy
+	 * doing things to it */
+	down_write(&super->tree_wander_sem);
+
+	del_timer_sync(&super->jnl_timer);
+	clear_bit(CACHEFS_SUPER_WANDER_TIMEOUT, &super->flags);
+
+	if (!test_and_clear_bit(CACHEFS_SUPER_NEED_WANDER, &super->flags)) {
+		kmem_cache_free(cachefs_journal_jar, newjnl);
+		up_write(&super->tree_wander_sem);
+		_leave(" = 0 [no need]");
+		return 0;
+	}
+
+	/* decide where we're going to store the journal entry and what serial
+	 * number it's going to get */
+	_debug("calc");
+	newjnl->serial = ++super->jnl_serial;
+
+	/* shift along to the new root */
+	_debug("shift");
+
+	ASSERT(!list_empty(&super->jnl_transq));
+
+	spin_lock_irqsave(&super->jnl_qlock, flags);
+
+	prev = super->jnl_current;
+	ASSERT(super->jnl_transq.prev == &prev->link);
+	prev->dependent = newjnl;
+	list_add_tail(&newjnl->link, &super->jnl_transq);
+	super->jnl_current = newjnl;
+
+	spin_unlock_irqrestore(&super->jnl_qlock, flags);
+
+	/* build a journal entry for the mark we're about to make - we do this
+	 * now to store the allocation tracking
+	 */
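+	/* the record's offset within the journal page is its serial number
+	 * modulo the number of records per block */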
+	offset = prev->serial & (super->layout->jnl_recperblk - 1);
+	offset *= super->layout->jnl_rsize;
+	ASSERT(offset < super->layout->bsize);
+
+	_debug("MARK %x [%lx]", prev->serial, offset);
+
+	data = kmap_atomic(super->jnl_page, KM_USER0);
+	jentry = data + offset;
+
+	memset(jentry, 0, super->layout->jnl_rsize);
+
+	spin_lock(&super->alloc_lock);
+	*jentry = super->j;
+
+	/* rotate the reclaimed-and-ready list onto the secondary alloc stack
+	 * if that's empty
+	 * - the appearance in the superblock is deferred until the journal
+	 *   write completes
+	 */
+
+	if (super->j.alloc_sfree == 0 &&
+	    super->j.rcm_ready &&
+	    super->space_transit == 0
+	    ) {
+		super->space_transit = super->j.rcm_ready_n;
+		smp_wmb();
+
+		super->j.space_rcm -= super->space_transit;
+		jentry->space_rcm -= super->space_transit;
+
+		jentry->alloc_sfree   = prev->alloc_sfree   = super->j.rcm_ready;
+		jentry->alloc_sfree_n = prev->alloc_sfree_n = super->j.rcm_ready_n;
+		jentry->rcm_ready   = super->j.rcm_ready   = 0;
+		jentry->rcm_ready_n = super->j.rcm_ready_n = 0;
+	}
+
+	jentry->space_alloc += super->space_transit;
+
+	spin_unlock(&super->alloc_lock);
+
+	jentry->mark	= CACHEFS_ONDISC_JNL_WANDER;
+	jentry->serial	= prev->serial;
+	jentry->jtime	= CURRENT_TIME.tv_sec;
+
+	kunmap_atomic(data, KM_USER0);
+
+	/* we can now let further changes take place as they'll affect
+	 * different blocks and pages */
+	up_write(&super->tree_wander_sem);
+
+	/* have to force reclamation nodes out manually as they're not kept in
+	 * the page cache */
+	_debug("push rcm");
+
+	if (super->page_rcm) {
+		down(&super->alloc_load_sem);
+		if (super->page_rcm && PageDirty(super->page_rcm))
+			cachefs_allocator_write_page(super->page_rcm);
+		up(&super->alloc_load_sem);
+	}
+
+	/* force the dirty pages out to disk if a synchronous flush was
+	 * requested */
+	if (flush) {
+		_debug("flush");
+
+		if (filemap_fdatawrite(super->imeta->i_mapping) < 0)
+			BUG();
+	}
+
+	/* release the superblock's ref on this transaction */
+	cachefs_journal_release(prev);
+
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_journal_wander() */
+
+/*****************************************************************************/
+/*
+ * make the in-transit secondary alloc stack live
+ */
+static void cachefs_journal_enliven_alloc_stack(struct cachefs_super *super,
+						struct cachefs_journal *jnl)
+{
+	struct bio *bio;
+
+	_enter(",{%x,%x}", jnl->serial, jnl->alloc_sfree);
+
+	ASSERT(super->page_sfree);
+	ASSERT(!PageMappedToDisk(super->page_sfree));
+	ASSERT(!PageUptodate(super->page_sfree));
+
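+	/* point the spare secondary free-list page at the newly promoted TOS
+	 * block and start reading it in */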
+	super->page_sfree->index = jnl->alloc_sfree;
+	SetPageMappedToDisk(super->page_sfree);
+
+	lock_page(super->page_sfree);
+
+	/* dispatch a call to perform the read */
+	bio = bio_alloc(GFP_KERNEL | __GFP_WAIT | __GFP_NOFAIL, 1);
+
+	bio->bi_bdev	= super->imeta->i_sb->s_bdev;
+	bio->bi_sector	= super->page_sfree->index;
+	bio->bi_sector	<<= PAGE_SHIFT - super->imeta->i_sb->s_blocksize_bits;
+	bio->bi_end_io	= cachefs_allocator_end_io_read;
+	bio->bi_private	= super;
+
+	if (!bio_add_page(bio, super->page_sfree, PAGE_SIZE, 0))
+		BUG();
+
+	submit_bio(READ, bio);
+
+	_leave("");
+
+} /* end cachefs_journal_enliven_alloc_stack() */
+
+/*****************************************************************************/
+/*
+ * send the next journal entry to disk once all the pages holding a reference
+ * on it have been written out
+ * - called from kcachefsd
+ * - we insert an I/O barrier if we can
+ * - return true if we made progress
+ */
+int cachefs_journal_process(struct cachefs_super *super)
+{
+	struct cachefs_journal *jnl;
+	unsigned long slot, offset;
+	struct bio *bio;
+
+	jnl = list_entry(super->jnl_transq.next, struct cachefs_journal, link);
+
+	_enter("%p{%%%x,%d}", jnl, jnl->serial, jnl->journalled);
+
+	ASSERT(atomic_read(&jnl->remaining) == 0);
+	ASSERT(jnl->journalled <= 1);
+
+	/* deal with a transaction having completed */
+	if (jnl->journalled) {
+		struct cachefs_sync_request *req;
+		struct cachefs_journal *dep;
+		unsigned long flags;
+
+		_debug("complete");
+
+		/* complete the rotation of the reclamation stack to the
+		 * secondary allocation stack */
+		if (jnl->alloc_sfree)
+			cachefs_journal_enliven_alloc_stack(super, jnl);
+
+		spin_lock_irqsave(&super->jnl_qlock, flags);
+		list_del(&jnl->link);
+
+		if (jnl->alloc_sfree) {
+			ASSERT(super->j.alloc_sfree == 0);
+			ASSERT(super->j.alloc_sfree_n == 0);
+			ASSERT(jnl->alloc_sfree_n == super->space_transit);
+
+			spin_lock(&super->alloc_lock);
+			super->j.space_alloc += jnl->alloc_sfree_n;
+			super->j.alloc_sfree_n = jnl->alloc_sfree_n;
+			smp_wmb();
+			super->j.alloc_sfree = jnl->alloc_sfree;
+			super->space_transit = 0;
+			spin_unlock(&super->alloc_lock);
+
+			cachefs_operation_run(super);
+		}
+
+		/* wake up all processes trying to sync up to this point */
+		list_for_each_entry(req, &jnl->dependent->syncwq, link) {
+			struct task_struct *tsk = req->task;
+			mb();
+			req->done = 1;
+			wake_up_process(tsk);
+			put_task_struct(tsk);
+		}
+
+		spin_unlock_irqrestore(&super->jnl_qlock, flags);
+
+		dep = jnl->dependent;
+		kmem_cache_free(cachefs_journal_jar, jnl);
+		up(&super->jnl_page_sem);
+		cachefs_journal_release(dep);
+
+		_leave(" = 1");
+		return 1;
+	}
+
+	/* send a journal entry, with barrier if possible */
+	_debug("write");
+
+	bio = bio_alloc(GFP_KERNEL, 1);
+	if (!bio) {
+		_leave(" = 0 [ENOMEM]");
+		return 0;
+	}
+
+	__cachefs_journal_get(jnl);
+	jnl->journalled++;
+
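+	/* work out which journal slot (disk sector) and which offset within
+	 * the journal page this serial number maps to */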
+	slot = super->layout->bix_journal;
+	slot *= super->layout->jnl_recperblk;
+	slot += jnl->serial & CACHEFS_ONDISC_JNL_SLOT_MASK;
+
+	offset = jnl->serial & (super->layout->jnl_recperblk - 1);
+	offset *= super->layout->jnl_rsize;
+
+	_debug("WRITE %x [%lx, %lx]", jnl->serial, slot, offset);
+
+	bio->bi_bdev	= super->imeta->i_sb->s_bdev;
+	bio->bi_sector	= slot;
+	bio->bi_end_io	= cachefs_journal_end_io_write;
+	bio->bi_private	= jnl;
+
+	if (!bio_add_page(bio, super->jnl_page,
+			  super->layout->jnl_rsize, offset))
+		BUG();
+
+	//dump_bio(bio, 0);
+	submit_bio(super->bio_wr_barrier, bio);
+
+	_leave(" = 1");
+	return 1;
+
+} /* end cachefs_journal_process() */
+
+/*****************************************************************************/
+/*
+ * release a reference to a transaction
+ */
+void cachefs_journal_release(struct cachefs_journal *jnl)
+{
+	struct cachefs_super *super;
+
+#if 0
+	printk("\n<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n");
+	_debug("RELEASE %p{%%%x,} -> %d",
+	       jnl, jnl->serial, atomic_read(&jnl->remaining) - 1);
+	printk("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n");
+#endif
+
+	super = jnl->super;
+	if (atomic_dec_and_test(&jnl->remaining)) {
+		_debug("do journal");
+		set_bit(CACHEFS_SUPER_DO_JOURNAL, &super->flags);
+		wake_up(&super->dmn_sleepq);
+	}
+
+	_leave("");
+
+} /* end cachefs_journal_release() */
+
+/*****************************************************************************/
+/*
+ * synchronise the cache to a particular degree
+ * - force the tree to wander if there are any outstanding changes
+ */
+int cachefs_sync(struct cachefs_super *super, int sync, int intr)
+{
+	struct cachefs_sync_request req;
+	unsigned long flags;
+	int ret;
+
+	_enter("");
+
+	ret = cachefs_journal_wander(super, 1);
+	if (ret < 0) {
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	if (!sync) {
+		_leave(" = 0 [nosync]");
+		return 0;
+	}
+
+	/* queue up the sync completion monitor if there's anything left in the
+	 * transaction queue
+	 */
+	_debug("sync");
+
+	req.done = 0;
+
+	spin_lock_irqsave(&super->jnl_qlock, flags);
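+	/* if the only queued transaction is the current (uncommitted) one then
+	 * everything before it has already been written and completed */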
+	if (super->jnl_transq.next == &super->jnl_current->link) {
+		req.done = 1;
+	}
+	else {
+		req.task = current;
+		get_task_struct(current);
+		list_add_tail(&req.link, &super->jnl_current->syncwq);
+	}
+	spin_unlock_irqrestore(&super->jnl_qlock, flags);
+
+	/* wait for the old transaction queue to empty */
+	if (!req.done) {
+		_debug("wait for %p == %p",
+		       super->jnl_transq.next, &super->jnl_current->link);
+
+		for (;;) {
+			if (intr)
+				set_current_state(TASK_INTERRUPTIBLE);
+			else
+				set_current_state(TASK_UNINTERRUPTIBLE);
+
+			if (req.done ||
+			    (intr && signal_pending(current)))
+				break;
+			schedule();
+		}
+
+		__set_current_state(TASK_RUNNING);
+
+		/* deal with interruption */
+		if (!req.done) {
+			ASSERT(intr);
+
+			spin_lock_irqsave(&super->jnl_qlock, flags);
+			list_del_init(&req.link);
+			spin_unlock_irqrestore(&super->jnl_qlock, flags);
+
+			if (!req.done) {
+				put_task_struct(current);
+				_leave(" = -EINTR");
+				return -EINTR;
+			}
+		}
+	}
+
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_sync() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/journal-replay.c linux-2.6.14-mm2-cachefs/fs/cachefs/journal-replay.c
--- linux-2.6.14-mm2/fs/cachefs/journal-replay.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/journal-replay.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,485 @@
+/* journal-replay.c: replay the journal
+ *
+ * Copyright (C) 2003-5 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/bio.h>
+#include "cachefs-int.h"
+
+enum cachefs_journal_replay_empty {
+	CACHEFS_JOURNAL_REPLAY_NOTHING_YET,
+	CACHEFS_JOURNAL_REPLAY_FOUND_MARKS,
+	CACHEFS_JOURNAL_REPLAY_GAP_TO_END,
+} __attribute__((packed));
+
+struct cachefs_journal_replay_data {
+	read_descriptor_t	desc;
+	struct cachefs_super	*super;
+	uint32_t		serial_next;
+	uint32_t		serial_hi;
+	uint32_t		serial_wrap;
+	unsigned		slot_hi;
+	unsigned		slot_wrap;
+	char			wrapped;
+	enum cachefs_journal_replay_empty empty;
+};
+
+/*****************************************************************************/
+/*
+ * load the values from the journal into the superblock
+ */
+static inline void cachefs_journal_load_values(struct cachefs_super *super,
+					       struct cachefs_ondisc_journal *jentry)
+{
+	super->j = *jentry;
+
+} /* end cachefs_journal_load_values() */
+
+/*****************************************************************************/
+/*
+ * journal replay actor to find the last recorded entry
+ */
+static int cachefs_journal_find_last_actor(read_descriptor_t *_desc,
+					   struct page *page,
+					   unsigned long offset,
+					   unsigned long size)
+{
+	struct cachefs_journal_replay_data *desc;
+	struct cachefs_ondisc_journal *jentry;
+	struct cachefs_super *super;
+	unsigned slot;
+	void *data;
+	int32_t tmp;
+
+	desc = container_of(_desc, struct cachefs_journal_replay_data, desc);
+	super = desc->super;
+
+	_enter("{%zx},{%lu},%lu,%lu",
+	       desc->desc.count, page->index, offset, size);
+
+	if (size > desc->desc.count)
+		size = desc->desc.count;
+
+	ASSERT(size == PAGE_SIZE);
+	ASSERT(offset == 0);
+
+	data = kmap_atomic(page, KM_USER0);
+
+	slot = (page->index - super->layout->bix_journal);
+	slot *= super->layout->jnl_recperblk;
+
+	/* look for the highest serial number in the journal */
+	for (;
+	     offset < size;
+	     offset += super->layout->jnl_rsize, slot++
+	     ) {
+		jentry = data + offset;
+
+		/* validate the journal marks */
+		if (jentry->mark >= CACHEFS_ONDISC_JNL__LAST) {
+			printk(KERN_ERR "CacheFS:"
+			       " Unknown mark in journal (%x)\n",
+			       jentry->mark);
+			desc->desc.error = -EIO;
+			break;
+		}
+
+		if (jentry->mark == CACHEFS_ONDISC_JNL_EMPTY) {
+			switch (desc->empty) {
+			case CACHEFS_JOURNAL_REPLAY_FOUND_MARKS:
+				if (desc->wrapped) {
+					printk(KERN_ERR "CacheFS:"
+					       " End of journal missing"
+					       " (slot %x, %x)\n",
+					       desc->slot_wrap, slot);
+					desc->desc.error = -EIO;
+					break;
+				}
+
+				/* found an empty slot after a valid slot */
+				desc->empty = CACHEFS_JOURNAL_REPLAY_GAP_TO_END;
+				continue;
+
+			case CACHEFS_JOURNAL_REPLAY_NOTHING_YET:
+				printk(KERN_ERR "CacheFS:"
+				       " Initial journal record missing"
+				       " (slot %d)\n",
+				       slot);
+				desc->desc.error = -EIO;
+				break;
+
+			case CACHEFS_JOURNAL_REPLAY_GAP_TO_END:
+				continue;
+
+			default:
+				BUG();
+			}
+		}
+
+		if (desc->empty == CACHEFS_JOURNAL_REPLAY_GAP_TO_END) {
+			/* found empty slots between two valid marks */
+			printk(KERN_ERR "CacheFS:"
+			       " Unexpected holes in journal (slot %x)\n",
+			       slot);
+			desc->desc.error = -EIO;
+			break;
+		}
+
+		desc->empty = CACHEFS_JOURNAL_REPLAY_FOUND_MARKS;
+
+		/* the serial number must relate directly to the slot number */
+		if ((jentry->serial & CACHEFS_ONDISC_JNL_SLOT_MASK) != slot) {
+			printk(KERN_ERR "CacheFS:"
+			       " Journal serial %x should not be in slot %x\n",
+			       jentry->serial, slot);
+			desc->desc.error = -EIO;
+			break;
+		}
+
+		/* abort if the filesystem/IO error flag is set on disk */
+		if (jentry->error) {
+			desc->desc.error = -EIO;
+			break;
+		}
+
+		/* we just paste the values of the first slot in directly */
+		if (slot == 0) {
+			_debug("JNL[0000] s=%x", jentry->serial);
+
+			cachefs_journal_load_values(super, jentry);
+
+			super->jnl_serial	= jentry->serial + 1;
+			desc->serial_next	= jentry->serial + 1;
+			desc->serial_hi		= jentry->serial;
+			desc->slot_hi		= 0;
+			continue;
+		}
+
+		/* attempt to relate subsequent entries to the previous ones */
+		_debug("JNL[%04x] js=%x { nx=%x }",
+		       slot, jentry->serial, desc->serial_next);
+
+		/* validate the next serial number */
+		tmp = (int32_t) desc->serial_next - (int32_t) jentry->serial;
+		if (tmp != 0) {
+			/* non-contiguous serial numbers - check that the
+			 * journal wrapped */
+			if (desc->wrapped) {
+				printk(KERN_ERR "CacheFS:"
+				       " Journal has multiple wrap points"
+				       " (slot %u{%x} ... %u{%x})\n",
+				       desc->slot_wrap, desc->serial_wrap,
+				       slot, jentry->serial);
+				desc->desc.error = -EIO;
+				break;
+			}
+
+			if (tmp != CACHEFS_ONDISC_JNL_NUMENTS) {
+				printk(KERN_ERR "CacheFS:"
+				       " Journal wrap displacement %u not %u"
+				       " (slot %u{%x}, %u{%x})\n",
+				       tmp, CACHEFS_ONDISC_JNL_NUMENTS,
+				       slot - 1, desc->serial_next - 1,
+				       slot, jentry->serial);
+				desc->desc.error = -EIO;
+				break;
+			}
+
+			desc->serial_next	= jentry->serial + 1;
+			desc->serial_wrap	= jentry->serial;
+			desc->slot_wrap		= slot;
+			desc->wrapped		= 1;
+		}
+		else {
+			/* this entry followed directly on from the last; if it
+			 * has the new current highest serial then copy the
+			 * data back */
+			if (!desc->wrapped) {
+				cachefs_journal_load_values(super, jentry);
+
+				super->jnl_serial	= jentry->serial + 1;
+				desc->serial_hi		= jentry->serial;
+				desc->slot_hi		= slot;
+			}
+
+			desc->serial_next = jentry->serial + 1;
+		}
+	}
+
+	kunmap_atomic(data, KM_USER0);
+
+	if (desc->desc.error) {
+		_leave(" = 0 [error %d]", desc->desc.error);
+		return 0;
+	}
+
+	desc->desc.count	-= size;
+	desc->desc.written	+= size;
+	_leave(" = %lx", size);
+	return size;
+
+} /* end cachefs_journal_find_last_actor() */
+
+/*****************************************************************************/
+/*
+ * replay the journal upon mounting to determine various parameters and to fix
+ * up changes that failed to be made
+ */
+int cachefs_journal_replay(struct cachefs_super *super)
+{
+	struct cachefs_journal_replay_data replay_data;
+	struct address_space *mapping;
+	struct file_ra_state ra;
+	loff_t ppos;
+	int ret;
+
+	_enter("");
+
+	printk(KERN_NOTICE "CacheFS: Replaying the journal...\n");
+
+	set_bit(CACHEFS_SUPER_REPLAYING_JNL, &super->flags);
+
+	/* we use the page cache to do readahead directly on the inode */
+	memset(&ra, 0, sizeof(ra));
+	file_ra_state_init(&ra, super->imeta->i_mapping);
+
+	/* scan the journal to determine the last mark and to extract the state
+	 * recorded therein */
+	memset(&replay_data, 0, sizeof(replay_data));
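+	/* the journal occupies the blocks from bix_journal up to bix_cache */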
+	replay_data.desc.count = super->layout->bix_cache;
+	replay_data.desc.count -= super->layout->bix_journal;
+	replay_data.desc.count <<= super->layout->bshift;
+	replay_data.super = super;
+	replay_data.empty = CACHEFS_JOURNAL_REPLAY_NOTHING_YET;
+
+	ppos = super->layout->bix_journal;
+	ppos *= super->layout->bsize;
+
+	do_generic_mapping_read(super->imeta->i_mapping, &ra, NULL, &ppos,
+				&replay_data.desc,
+				cachefs_journal_find_last_actor);
+
+	if (replay_data.desc.error < 0) {
+		clear_bit(CACHEFS_SUPER_REPLAYING_JNL, &super->flags);
+		printk(KERN_ERR "CacheFS:"
+		       " failed to replay journal: %d\n",
+		       replay_data.desc.error);
+		return replay_data.desc.error;
+	}
+
+	clear_bit(CACHEFS_SUPER_REPLAYING_JNL, &super->flags);
+
+	kdebug("JNL: hi=%u{%x} wrap=%u{%x}%s%s",
+	       (super->jnl_serial - 1) & CACHEFS_ONDISC_JNL_SLOT_MASK,
+	       (super->jnl_serial - 1),
+	       replay_data.slot_wrap,
+	       replay_data.serial_wrap,
+	       replay_data.empty ? " empty" : "",
+	       replay_data.wrapped ? " wrapped" : "");
+
+	kdebug("JNL next mark : %u{%x}",
+	       super->jnl_serial & CACHEFS_ONDISC_JNL_SLOT_MASK,
+	       super->jnl_serial);
+
+	super->jnl_current->serial = super->jnl_serial;
+
+	kdebug("JNL next objid: %llx",
+	       super->j.alloc_objid);
+
+	kdebug("JNL Alloc URdy: %08x",
+	       super->j.alloc_unready);
+
+	kdebug("JNL Alloc Prim: %08x[%x] #%d",
+	       super->j.alloc_pfree,
+	       super->j.alloc_pfree_pt,
+	       super->j.alloc_pfree_n);
+
+	kdebug("JNL Alloc Sec : %08x #%x",
+	       super->j.alloc_sfree,
+	       super->j.alloc_sfree_n);
+
+	kdebug("JNL Rcm Rdy   : %08x #%x",
+	       super->j.rcm_ready,
+	       super->j.rcm_ready_n);
+
+	kdebug("JNL Rcm Coll  : %08x[%hu]; spare %08x",
+	       super->j.rcm_collector,
+	       super->j.rcm_coll_pt,
+	       super->j.rcm_spare);
+
+	kdebug("JNL Recyc Proc: %08x[%hd]",
+	       super->j.rcy_processor,
+	       super->j.rcy_procsp);
+
+	kdebug("JNL Recyc Stk : %08x",
+	       super->j.rcy_stack);
+
+	kdebug("JNL Recyc Coll: %08x[%hd]",
+	       super->j.rcy_collector,
+	       super->j.rcy_collsp);
+
+	kdebug("JNL Reap Proc : %08x[%hd]",
+	       super->j.reap_processor,
+	       super->j.reap_proccnt);
+
+	kdebug("JNL Reap Stk  : %08x",
+	       super->j.reap_stack);
+
+	kdebug("JNL Reap Coll : %08x[%hd]",
+	       super->j.reap_collector,
+	       super->j.reap_collsp);
+
+	kdebug("JNL Tree Root : %08x",
+	       super->j.tree_root);
+
+	/* allocate three pages for the allocator list tracking */
+	if (!(super->page_pfree = alloc_page(GFP_HIGHUSER)) ||
+	    !(super->page_pfree_nx = alloc_page(GFP_HIGHUSER)) ||
+	    !(super->page_sfree = alloc_page(GFP_HIGHUSER))
+	    ) {
+		printk(KERN_ERR "CacheFS: Out of memory\n");
+		return -ENOMEM;
+	}
+
+	mapping = super->imeta->i_mapping;
+	super->page_pfree->index	= super->j.alloc_pfree;
+	super->page_pfree->mapping	= mapping;
+	super->page_pfree_nx->index	= 0;
+	super->page_pfree_nx->mapping	= mapping;
+	super->page_sfree->index	= super->j.alloc_sfree;
+	super->page_sfree->mapping	= mapping;
+
+	/* reload the TOS nodes for the allocation and recycling list */
+	if (super->j.alloc_pfree) {
+		/* read the alloc stack TOS */
+		_debug("reading alloc_pfree %x", super->j.alloc_pfree);
+
+		SetPageMappedToDisk(super->page_pfree);
+
+		lock_page(super->page_pfree);
+		ret = mapping->a_ops->readpage(NULL, super->page_pfree);
+		if (ret < 0) {
+			printk(KERN_ERR "CacheFS:"
+			       " Failed to load alloc front: %d\n", ret);
+			return ret;
+		}
+	}
+
+	if (super->j.alloc_sfree) {
+		/* read the second alloc stack TOS */
+		_debug("reading alloc_sfree %x", super->j.alloc_sfree);
+
+		SetPageMappedToDisk(super->page_sfree);
+
+		lock_page(super->page_sfree);
+		ret = mapping->a_ops->readpage(NULL, super->page_sfree);
+		if (ret < 0) {
+			printk(KERN_ERR "CacheFS:"
+			       " Failed to load alloc2 front: %d\n", ret);
+			return ret;
+		}
+	}
+
+	if (super->j.rcm_collector) {
+		/* read the current collector page
+		 * - we don't keep this in the page cache as we don't want to
+		 *   deal with ENOMEM from radix tree allocations during
+		 *   deletion
+		 */
+		_debug("reading rcm_collector %x", super->j.rcm_collector);
+
+		super->page_rcm = alloc_page(GFP_HIGHUSER);
+		if (!super->page_rcm) {
+			printk(KERN_ERR "CacheFS:"
+			       " Failed to allocate reclaim collector\n");
+			return -ENOMEM;
+		}
+
+		atomic_inc(&super->cnt_rcmpages);
+
+		super->page_rcm->index = super->j.rcm_collector;
+		super->page_rcm->mapping = super->imeta->i_mapping;
+		INIT_LIST_HEAD(&super->page_rcm->lru);
+
+		lock_page(super->page_rcm);
+		ret = super->page_rcm->mapping->a_ops->readpage(
+			NULL, super->page_rcm);
+		if (ret < 0) {
+			printk(KERN_ERR "CacheFS:"
+			       " Failed to load reclaim collector: %d\n", ret);
+			return ret;
+		}
+	}
+
+	if (super->j.rcy_collector) {
+		/* read the current recycling collector page */
+		super->page_rcy = NULL;
+		ret = cachefs_page_read(super, super->j.rcy_collector, 0,
+					&super->page_rcy);
+		if (ret < 0) {
+			printk(KERN_ERR "CacheFS:"
+			       " Failed to load recycling collector: %d\n",
+			       ret);
+			return ret;
+		}
+	}
+
+	if (super->j.reap_collector) {
+		/* read the current reap collector page */
+		super->page_reap = NULL;
+		ret = cachefs_page_read(super, super->j.reap_collector, 0,
+					&super->page_reap);
+		if (ret < 0) {
+			printk(KERN_ERR "CacheFS:"
+			       " Failed to load reap collector: %d\n",
+			       ret);
+			return ret;
+		}
+	}
+
+	/* set the second-of-stack from the allocation stack loading */
+	if (PageMappedToDisk(super->page_pfree)) {
+		struct cachefs_ondisc_free_node *fnode;
+
+		wait_on_page_locked(super->page_pfree);
+		if (PageError(super->page_pfree)) {
+			printk(KERN_ERR "CacheFS:"
+			       " I/O Error loading alloc stack\n");
+			return -EIO;
+		}
+
+		fnode = kmap_atomic(super->page_pfree, KM_USER0);
+		super->alloc_pfree_nx = fnode->next;
+		kunmap_atomic(fnode, KM_USER0);
+
+		_debug("alloc next is at: %x", super->alloc_pfree_nx);
+
+		if (super->alloc_pfree_nx)
+			cachefs_allocator_read_next(super, 0);
+	}
+
+	/* tell kcachefsd if there's recycling to be done */
+	if (super->j.rcy_processor ||
+	    super->j.rcy_stack ||
+	    super->j.rcy_collector)
+		set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+
+	/* and start a cull/reap scan immediately */
+	set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+
+	wake_up(&super->dmn_sleepq);
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_journal_replay() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/kcachefsd.c linux-2.6.14-mm2-cachefs/fs/cachefs/kcachefsd.c
--- linux-2.6.14-mm2/fs/cachefs/kcachefsd.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/kcachefsd.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,278 @@
+/* kcachefsd.c: CacheFS management daemon
+ *
+ * Copyright (C) 2003-5 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "cachefs-int.h"
+
+static inline void discard_my_signals(void)
+{
+	while (signal_pending(current)) {
+		siginfo_t sinfo;
+
+		spin_lock_irq(&current->sighand->siglock);
+		dequeue_signal(current, &current->blocked, &sinfo);
+		spin_unlock_irq(&current->sighand->siglock);
+	}
+}
+
+/*****************************************************************************/
+/*
+ * sleep whilst waiting for work
+ */
+static void kcachefsd_sleep(struct cachefs_super *super)
+{
+	DECLARE_WAITQUEUE(myself, current);
+
+	_enter("{%d,%lx}", super->dmn_die, super->flags);
+
+try_again:
+	/* don't consider sleeping if there's work to be done */
+	switch (super->dmn_die) {
+	case CACHEFS_DMN_RUNNING:
+		if (test_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags) ||
+		    test_bit(CACHEFS_SUPER_DO_SCAN, &super->flags) ||
+		    test_bit(CACHEFS_SUPER_REDUCE_OLDRCM, &super->flags))
+			return;
+
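+		/* fall through - the wander/journal flags apply when running
+		 * too */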
+	case CACHEFS_DMN_RETIRING:
+		if (test_bit(CACHEFS_SUPER_WANDER_TIMEOUT, &super->flags) ||
+		    test_bit(CACHEFS_SUPER_DO_JOURNAL, &super->flags))
+			return;
+		break;
+
+	case CACHEFS_DMN_DIE:
+		_leave(" [dead]");
+		complete_and_exit(&super->dmn_dead, 0);
+
+	default:
+		BUG();
+	}
+
+	/* attempt to sleep */
+	set_current_state(TASK_INTERRUPTIBLE);
+	add_wait_queue(&super->dmn_sleepq, &myself);
+
+	for (;;) {
+		discard_my_signals();
+
+		/* see if there's work to be done */
+		switch (super->dmn_die) {
+		case CACHEFS_DMN_RUNNING:
+			if (test_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags) ||
+			    test_bit(CACHEFS_SUPER_DO_SCAN, &super->flags) ||
+			    test_bit(CACHEFS_SUPER_REDUCE_OLDRCM, &super->flags))
+				goto work_to_do;
+
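+			/* fall through */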
+		case CACHEFS_DMN_RETIRING:
+			if (test_bit(CACHEFS_SUPER_WANDER_TIMEOUT, &super->flags) ||
+			    test_bit(CACHEFS_SUPER_DO_JOURNAL, &super->flags))
+				goto work_to_do;
+			break;
+
+		case CACHEFS_DMN_DIE:
+			goto work_to_do;
+
+		default:
+			BUG();
+		}
+
+		schedule();
+
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+
+work_to_do:
+	remove_wait_queue(&super->dmn_sleepq, &myself);
+	set_current_state(TASK_RUNNING);
+
+	if (super->dmn_die >= CACHEFS_DMN_DIE)
+		goto try_again;
+
+} /* end kcachefsd_sleep() */
+
+/*****************************************************************************/
+/*
+ * actually do the work this daemon is intended to do
+ */
+static void kcachefsd_work(struct cachefs_super *super,
+			   struct cachefs_operation *op)
+{
+	_debug("@@@ Begin Cache Management");
+
+	if (super->dmn_die == CACHEFS_DMN_RUNNING) {
+		if (test_and_clear_bit(CACHEFS_SUPER_DO_SCAN, &super->flags)) {
+			if (!cachefs_scan_operations[super->scan_state]) {
+				printk(KERN_ERR "\n");
+				printk(KERN_ERR
+				       "CacheFS: Tree scan state %d not implemented\n",
+				       super->scan_state);
+				BUG();
+			}
+
+			(*cachefs_scan_operations[super->scan_state])(super);
+		}
+
+		if (test_and_clear_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags)) {
+			if (!cachefs_recycle_operations[super->rcy_state]) {
+				printk(KERN_ERR "\n");
+				printk(KERN_ERR
+				       "CacheFS: Recycle state %d not implemented\n",
+				       super->rcy_state);
+				BUG();
+			}
+
+			(*cachefs_recycle_operations[super->rcy_state])(super, op);
+		}
+	}
+
+	/* reduce the membership of the old reclaim page list
+	 * - attempt to end up with one clean page at the end
+	 * - leave pages undergoing writeback in list
+	 */
+	if (test_and_clear_bit(CACHEFS_SUPER_REDUCE_OLDRCM, &super->flags)) {
+		struct page *page, *_n;
+		LIST_HEAD(deadq);
+		int got_spare = 0;
+
+		_debug("reduce rcm_old_pages");
+
+		spin_lock(&super->alloc_lock);
+
+		list_for_each_entry_safe_reverse(page, _n,
+						 &super->rcm_old_pages, lru
+						 ) {
+			ASSERT(pfn_valid(page_to_pfn(page)));
+			ASSERT(page_count(page) > 0);
+
+			if (PageWriteback(page))
+				continue;
+
+			if (super->rcm_old_pages.prev == &super->rcm_old_pages
+			    ) {
+				got_spare = 1;
+				continue;
+			}
+
+			if (got_spare) {
+				list_move_tail(&page->lru, &deadq);
+				continue;
+			}
+
+			got_spare = 1;
+			list_move_tail(&page->lru, &super->rcm_old_pages);
+		}
+
+		spin_unlock(&super->alloc_lock);
+
+		while (!list_empty(&deadq)) {
+			page = list_entry(deadq.next, struct page, lru);
+			list_del_init(&page->lru);
+			_debug("discarding %p{%lx,%d}",
+			       page, page->index, page_count(page));
+			page->mapping = NULL;
+			cachefs_page_put(page);
+			atomic_dec(&super->cnt_rcmpages);
+		}
+	}
+
+	if (test_and_clear_bit(CACHEFS_SUPER_WANDER_TIMEOUT, &super->flags))
+		cachefs_sync(super, 0, 0);
+
+	if (test_and_clear_bit(CACHEFS_SUPER_DO_JOURNAL, &super->flags))
+		cachefs_journal_process(super);
+
+#if 0 // TODO remove
+	/* cull the old inodes if we don't have enough spare blocks available for
+	 * allocation, but only if we wouldn't immediately thrash the cache
+	 */
+	if (test_bit(CACHEFS_SUPER_CULL_DISABLED, &super->flags)) {
+		/* if the cull is disabled, check to see if there're enough
+		 * unpinned blocks to reenable it */
+		if (super->space_unpinned > super->cull_hiwater) {
+			clear_bit(CACHEFS_SUPER_CULL_DISABLED, &super->flags);
+			printk(KERN_INFO "CacheFS: enabling culling");
+		}
+	}
+	else {
+		if (super->space_unpinned > super->cull_lowater) {
+			set_bit(CACHEFS_SUPER_CULL_DISABLED, &super->flags);
+			printk(KERN_INFO "CacheFS: disabling culling");
+		}
+		else {
+			want = super->space_slack;
+			want += atomic_read(&super->space_reserve);
+			if (want < super->alloc_cur_n + super->recycle_cur_n)
+				set_bit(CACHEFS_SUPER_DO_CULL, &super->flags);
+		}
+	}
+#endif
+
+	/* clear the dead objects list */
+	if (!list_empty(&super->scan_xculls)) {
+		spin_lock(&super->objects_lock);
+
+		while (!list_empty(&super->scan_xculls)) {
+			struct cachefs_object *object;
+
+			object = list_entry(super->scan_xculls.next,
+					    struct cachefs_object, cull_link);
+
+			list_del_init(&object->cull_link);
+			cachefs_object_put(object);
+		}
+
+		spin_unlock(&super->objects_lock);
+	}
+
+	_debug("@@@ End Cache Management");
+
+} /* end kcachefsd_work() */
+
+/*****************************************************************************/
+/*
+ * cache recycling daemon
+ */
+int kcachefsd(void *_super)
+{
+	struct cachefs_operation recycler_op;
+	struct cachefs_super *super = _super;
+
+	super->dmn_task = current;
+
+	/* set up an operation record for recycling */
+	recycler_op.super	= super;
+	recycler_op.object	= NULL;
+	recycler_op.reason	= CACHEFS_OP_RECYCLE_DATA;
+	recycler_op.state	= CACHEFS_OP_INACTIVE;
+	recycler_op.data_space	= 0;
+
+	daemonize("kcachefsd %02x%02x",
+		  MAJOR(super->sb->s_bdev->bd_inode->i_rdev),
+		  MINOR(super->sb->s_bdev->bd_inode->i_rdev));
+
+	complete(&super->dmn_alive);
+
+	printk(KERN_INFO "CacheFS: Started kcachefsd %d for cache %s\n",
+	       current->pid, super->cache.identifier);
+
+	/* loop around looking for things to attend to */
+	for (;;) {
+		kcachefsd_sleep(super);
+		kcachefsd_work(super, &recycler_op);
+		cond_resched();
+	}
+
+} /* end kcachefsd() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/main.c linux-2.6.14-mm2-cachefs/fs/cachefs/main.c
--- linux-2.6.14-mm2/fs/cachefs/main.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/main.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,224 @@
+/* main.c: general filesystem caching manager
+ *
+ * Copyright (C) 2004-5 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include "cachefs-int.h"
+#include "cachefs-inode.h"
+
+int cachefs_debug = 0;
+
+static int cachefs_init(void);
+static void cachefs_exit(void);
+
+fs_initcall(cachefs_init);
+module_exit(cachefs_exit);
+
+MODULE_DESCRIPTION("Cache File System");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+kmem_cache_t *cachefs_object_jar;
+
+static void cachefs_object_init_once(void *_object, kmem_cache_t *cachep,
+				     unsigned long flags)
+{
+	struct cachefs_object *object = _object;
+
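+	/* only initialise when the slab allocator actually constructs the
+	 * object; do nothing on debug verification passes */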
+	switch (flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) {
+	case SLAB_CTOR_CONSTRUCTOR:
+		memset(object, 0, sizeof(*object));
+		fscache_object_init(&object->fscache);
+		init_rwsem(&object->sem);
+		INIT_LIST_HEAD(&object->cull_link);
+		rwlock_init(&object->lock);
+		object->dataptrblks = RB_ROOT;
+		break;
+	default:
+		break;
+	}
+}
+
+/*****************************************************************************/
+/*
+ * initialise the fs caching module
+ */
+static int cachefs_init(void)
+{
+	int ret = -ENOMEM;
+
+	/* create a journal transaction jar */
+	cachefs_journal_jar =
+		kmem_cache_create("cachefs_journal_jar",
+				  sizeof(struct cachefs_journal),
+				  0,
+				  0,
+				  NULL,
+				  NULL);
+	if (!cachefs_journal_jar) {
+		printk(KERN_NOTICE
+		       "CacheFS:"
+		       " Failed to allocate a journal transaction jar\n");
+		goto error_jnl_jar;
+	}
+
+	/* create an object jar */
+	cachefs_object_jar =
+		kmem_cache_create("cachefs_object_jar",
+				  sizeof(struct cachefs_object),
+				  0,
+				  SLAB_HWCACHE_ALIGN,
+				  cachefs_object_init_once,
+				  NULL);
+	if (!cachefs_object_jar) {
+		printk(KERN_NOTICE
+		       "CacheFS: Failed to allocate an object jar\n");
+		goto error_object_jar;
+	}
+
+	/* create a node jar */
+	cachefs_node_jar =
+		kmem_cache_create("cachefs_node_jar",
+				  sizeof(struct cachefs_tree),
+				  0,
+				  SLAB_HWCACHE_ALIGN,
+				  cachefs_tree_init_once,
+				  NULL);
+	if (!cachefs_node_jar) {
+		printk(KERN_NOTICE
+		       "CacheFS: Failed to allocate a node jar\n");
+		goto error_node_jar;
+	}
+
+	/* initialise the filesystem */
+	ret = cachefs_fs_init();
+	if (ret < 0)
+		goto error;
+
+	printk(KERN_INFO "CacheFS: registered\n");
+	return 0;
+
+error:
+	kmem_cache_destroy(cachefs_node_jar);
+error_node_jar:
+	kmem_cache_destroy(cachefs_object_jar);
+error_object_jar:
+	kmem_cache_destroy(cachefs_journal_jar);
+error_jnl_jar:
+	printk(KERN_ERR "CacheFS: failed to register: %d\n", ret);
+	return ret;
+
+} /* end cachefs_init() */
+
+/*****************************************************************************/
+/*
+ * clean up on module removal
+ */
+static void __exit cachefs_exit(void)
+{
+	printk(KERN_INFO "CacheFS: unregistering\n");
+
+	cachefs_fs_exit();
+	kmem_cache_destroy(cachefs_node_jar);
+	kmem_cache_destroy(cachefs_object_jar);
+	kmem_cache_destroy(cachefs_journal_jar);
+
+} /* end cachefs_exit() */
+
+/*****************************************************************************/
+/*
+ * dump a BIO's attributes for debugging purposes
+ */
+void dump_bio(struct bio *bio, int n)
+{
+	unsigned char *stuff;
+	int loop, loop2, bits;
+
+	bits = bio->bi_bdev->bd_inode->i_blkbits;
+
+	printk("BIO %d\n", n);
+	printk("\t- sector=%llx (bix=%llx) size=%x\n",
+	       (unsigned long long) bio->bi_sector,
+	       (unsigned long long) bio->bi_sector >> (PAGE_SHIFT - bits),
+	       bio->bi_size);
+	printk("\t- rw=%lx flags=%lx vcnt=%u/%u\n",
+	       bio->bi_rw,
+	       bio->bi_flags,
+	       bio->bi_vcnt,
+	       bio->bi_max_vecs);
+
+	for (loop = 0; loop < bio->bi_vcnt; loop++) {
+		printk("\t- { pg %p{%2lx} %03hx-%03hx ",
+		       bio->bi_io_vec[loop].bv_page,
+		       bio->bi_io_vec[loop].bv_page->index,
+		       bio->bi_io_vec[loop].bv_offset,
+		       bio->bi_io_vec[loop].bv_offset +
+		       bio->bi_io_vec[loop].bv_len - 1
+		       );
+
+		stuff = page_address(bio->bi_io_vec[loop].bv_page);
+		stuff += bio->bi_io_vec[loop].bv_offset;
+
+		for (loop2 = 0; loop2 < 20; loop2++)
+			printk("%02x", stuff[loop2]);
+
+		printk(" }\n");
+	}
+
+} /* end dump_bio() */
+
+/*****************************************************************************/
+/*
+ * clear the dead space between the task_struct and the kernel stack
+ * - these hooks are called when the code is built with -finstrument-functions
+ */
+#if 0
+void __cyg_profile_func_enter (void *this_fn, void *call_site)
+__attribute__((no_instrument_function));
+
+void __cyg_profile_func_enter (void *this_fn, void *call_site)
+{
+       asm volatile("  movl    %%esp,%%edi     \n"
+                    "  andl    %0,%%edi        \n"
+                    "  addl    %1,%%edi        \n"
+                    "  movl    %%esp,%%ecx     \n"
+                    "  subl    %%edi,%%ecx     \n"
+                    "  shrl    $2,%%ecx        \n"
+                    "  movl    $0xedededed,%%eax     \n"
+                    "  rep stosl               \n"
+                    :
+                    : "i"(~(THREAD_SIZE-1)), "i"(sizeof(struct thread_info))
+                    : "eax", "ecx", "edi", "memory", "cc"
+                    );
+}
+
+void __cyg_profile_func_exit(void *this_fn, void *call_site)
+__attribute__((no_instrument_function));
+
+void __cyg_profile_func_exit(void *this_fn, void *call_site)
+{
+       asm volatile("  movl    %%esp,%%edi     \n"
+                    "  andl    %0,%%edi        \n"
+                    "  addl    %1,%%edi        \n"
+                    "  movl    %%esp,%%ecx     \n"
+                    "  subl    %%edi,%%ecx     \n"
+                    "  shrl    $2,%%ecx        \n"
+                    "  movl    $0xdadadada,%%eax     \n"
+                    "  rep stosl               \n"
+                    :
+                    : "i"(~(THREAD_SIZE-1)), "i"(sizeof(struct thread_info))
+                    : "eax", "ecx", "edi", "memory", "cc"
+                    );
+}
+#endif
diff -uNrp linux-2.6.14-mm2/fs/cachefs/Makefile linux-2.6.14-mm2-cachefs/fs/cachefs/Makefile
--- linux-2.6.14-mm2/fs/cachefs/Makefile	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/Makefile	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,37 @@
+#
+# Makefile for general caching filesystem
+#
+
+#CFLAGS += -finstrument-functions
+CFLAGS += -Wsign-compare
+
+obj-$(CONFIG_CACHEFS) := cachefs.o
+
+cachefs-y := \
+	allocator.o \
+	inode.o \
+	interface.o \
+	journal.o \
+	journal-replay.o \
+	kcachefsd.o \
+	main.o \
+	meta-aops.o \
+	meta-misc.o \
+	operation.o \
+	reaper.o \
+	recycling.o \
+	rootdir.o \
+	status.o \
+	super.o \
+	tree-data.o \
+	tree-delete.o \
+	tree-insert.o \
+	tree-insert-fanout.o \
+	tree-keys.o \
+	tree-lookup.o \
+	tree-list.o \
+	tree-misc.o \
+	tree-move.o \
+	tree-node.o \
+	tree-scan.o \
+	tree-update.o
diff -uNrp linux-2.6.14-mm2/fs/cachefs/meta-aops.c linux-2.6.14-mm2-cachefs/fs/cachefs/meta-aops.c
--- linux-2.6.14-mm2/fs/cachefs/meta-aops.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/meta-aops.c	2005-11-14 16:38:51.000000000 +0000
@@ -0,0 +1,794 @@
+/* meta-aops.c: address space operations for the metadata inode
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ * - derived from mpage.c, Copyright (C) 2002, Linus Torvalds.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/buffer_head.h>
+#include "cachefs-int.h"
+#include "cachefs-inode.h"
+
+static int cachefs_meta_readpage(struct file *file, struct page *page);
+static int cachefs_meta_readpages(struct file *file,
+				  struct address_space *mapping,
+				  struct list_head *pages,
+				  unsigned nr_pages);
+static int cachefs_meta_prepare_write(struct file *file, struct page *page,
+				      unsigned from, unsigned to);
+static int cachefs_meta_commit_write(struct file *file, struct page *page,
+				     unsigned from, unsigned to);
+static int cachefs_meta_writepage(struct page *page,
+				  struct writeback_control *wbc);
+static int cachefs_meta_writepages(struct address_space *mapping,
+				   struct writeback_control *wbc);
+static int cachefs_meta_set_page_dirty(struct page *page);
+static int cachefs_meta_sync_page(struct page *page);
+static int cachefs_meta_invalidatepage(struct page *page,
+				       unsigned long offset);
+static int cachefs_meta_releasepage(struct page *page, gfp_t gfp_flags);
+
+struct address_space_operations cachefs_meta_addrspace_operations = {
+	.readpage		= cachefs_meta_readpage,
+	.readpages		= cachefs_meta_readpages,
+	.prepare_write		= cachefs_meta_prepare_write,
+	.commit_write		= cachefs_meta_commit_write,
+	.writepage		= cachefs_meta_writepage,
+	.writepages		= cachefs_meta_writepages,
+	.set_page_dirty		= cachefs_meta_set_page_dirty,
+	.sync_page		= cachefs_meta_sync_page,
+	.invalidatepage		= cachefs_meta_invalidatepage,
+	.releasepage		= cachefs_meta_releasepage,
+};
+
+/*****************************************************************************/
+/*
+ * handle the completion of a BIO that read a bundle of pages
+ */
+static int cachefs_meta_end_io_read(struct bio *bio, unsigned int bytes_done,
+				    int err)
+{
+	struct cachefs_super *super;
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+
+	_enter("{sz=%u rw=%lu},%u,%d",
+	       bio->bi_size, bio->bi_rw, bytes_done, err);
+
+	if (bio->bi_size)
+		return 1;
+
+	/* mark all the pages with the appropriate state */
+	do {
+		struct page *page = bvec->bv_page;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (uptodate) {
+			SetPageUptodate(page);
+		} else {
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+
+		_debug("DONE PAGE %p{%d,%lx,%lx}",
+		       page, page_count(page), page->index, page->flags);
+
+		super = page->mapping->host->i_sb->s_fs_info;
+		if (super->scan_loading == page) {
+			_debug("read page for scanner");
+			set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+			wake_up(&super->dmn_sleepq);
+		}
+
+		unlock_page(page);
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+	return 0;
+
+} /* end cachefs_meta_end_io_read() */
+
+/*****************************************************************************/
+/*
+ * allocate a BIO for reading pages from disc
+ */
+static int cachefs_meta_io_alloc(struct super_block *sb, pgoff_t index,
+				 int nr_vecs, int gfp_flags, struct bio **_bio)
+{
+	struct bio *bio;
+	sector_t first_sector = index;
+
+	_enter("{bits=%u},%llu,%d,%x,",
+	       sb->s_blocksize_bits, (unsigned long long) first_sector,
+	       nr_vecs, gfp_flags);
+
+	*_bio = NULL;
+
+	/* try to allocate a BIO that can hold as many of the requested pages
+	 * as possible */
+	bio = bio_alloc(gfp_flags, nr_vecs);
+
+	if (!bio && (current->flags & PF_MEMALLOC)) {
+		while (!bio && (nr_vecs /= 2))
+			bio = bio_alloc(gfp_flags, nr_vecs);
+	}
+
+	if (!bio)
+		return -ENOMEM;
+
+	bio->bi_bdev	= sb->s_bdev;
+	bio->bi_sector	= first_sector << (PAGE_SHIFT - sb->s_blocksize_bits);
+	bio->bi_end_io	= cachefs_meta_end_io_read;
+
+	*_bio = bio;
+	return 0;
+
+} /* end cachefs_meta_io_alloc() */
+
+/*****************************************************************************/
+/*
+ * set up the actual reading of a page from disc for readpages
+ * - we attempt to share BIOs
+ */
+static int cachefs_meta_do_readpage(struct bio **_bio,
+				    struct page *page,
+				    unsigned nr_pages,
+				    cachefs_block_t *last_block_in_bio)
+{
+	struct cachefs_super *super;
+	struct inode *inode = page->mapping->host;
+	int ret;
+
+	_enter("");
+
+	super = inode->i_sb->s_fs_info;
+	ASSERT(page->index < super->j.alloc_unready);
+
+	SetPageMappedToDisk(page);
+
+	/* dispatch the outstanding BIO if the pages are not adjacent */
+	if (*_bio && *last_block_in_bio != page->index - 1) {
+		submit_bio(READ, *_bio);
+		*_bio = NULL;
+	}
+
+allocate_new_bio:
+	if (!*_bio) {
+		ret = cachefs_meta_io_alloc(inode->i_sb, page->index, nr_pages,
+					    GFP_KERNEL, _bio);
+		if (ret < 0)
+			goto error;
+	}
+
+	if (!bio_add_page(*_bio, page, PAGE_SIZE, 0)) {
+		submit_bio(READ, *_bio);
+		*_bio = NULL;
+		goto allocate_new_bio;
+	}
+
+	*last_block_in_bio = page->index;
+	_leave(" = 0");
+	return 0;
+
+error:
+	if (*_bio) {
+		submit_bio(READ, *_bio);
+		*_bio = NULL;
+	}
+	_leave("= %d", ret);
+	return ret;
+
+} /* end cachefs_meta_do_readpage() */
+
+/*****************************************************************************/
+/*
+ * read a bunch of pages from disc
+ */
+static int cachefs_meta_readpages(struct file *file,
+				  struct address_space *mapping,
+				  struct list_head *pages,
+				  unsigned nr_pages)
+{
+	cachefs_block_t last_block_in_bio = 0;
+	struct pagevec lru_pvec;
+	struct bio *bio = NULL;
+	unsigned page_idx;
+	int ret;
+
+	_enter(",,%u", nr_pages);
+
+	ret = 0;
+	pagevec_init(&lru_pvec, 0);
+
+	/* read all the pages, merging requests where possible */
+	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+		struct page *page = list_entry(pages->prev, struct page, lru);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+
+		if (!add_to_page_cache(page, mapping, page->index,
+				       GFP_KERNEL)) {
+			ret = cachefs_meta_do_readpage(
+				&bio, page, nr_pages - page_idx,
+				&last_block_in_bio);
+			if (ret < 0)
+				break;
+
+			if (!pagevec_add(&lru_pvec, page))
+				__pagevec_lru_add(&lru_pvec);
+
+		} else {
+			page_cache_release(page);
+		}
+	}
+
+	/* dispatch any left over BIO */
+	if (bio)
+		submit_bio(READ, bio);
+
+	/* add the pages to the LRU queue */
+	pagevec_lru_add(&lru_pvec);
+
+	_leave(" = %d", ret);
+	return ret;
+
+} /* end cachefs_meta_readpages() */
+
+/*****************************************************************************/
+/*
+ * read a page from disk
+ */
+static int cachefs_meta_readpage(struct file *file, struct page *page)
+{
+	struct cachefs_super *super;
+	struct inode *inode = page->mapping->host;
+	struct bio *bio;
+	int ret;
+
+	_enter(",{%d,%lx}", page_count(page), page->index);
+
+	super = inode->i_sb->s_fs_info;
+	ASSERT(page->index < super->j.alloc_unready);
+
+	SetPageMappedToDisk(page);
+
+	/* dispatch a call to perform the read */
+	ret = -ENOMEM;
+
+	bio = bio_alloc(GFP_KERNEL, 1);
+	if (bio) {
+		bio->bi_bdev	= inode->i_sb->s_bdev;
+		bio->bi_sector	= page->index;
+		bio->bi_sector	<<= PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
+		bio->bi_end_io	= cachefs_meta_end_io_read;
+
+		if (!bio_add_page(bio, page, PAGE_SIZE, 0))
+			BUG();
+
+		submit_bio(READ, bio);
+		ret = 0;
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+
+} /* end cachefs_meta_readpage() */
+
+/*****************************************************************************/
+/*
+ * we don't do prepare_write on metadata
+ */
+static int cachefs_meta_prepare_write(struct file *file, struct page *page,
+				      unsigned from, unsigned to)
+{
+	printk(KERN_ERR "CacheFS: meta-data prepare_write not supported\n");
+	BUG();
+	return -EIO;
+
+} /* end cachefs_meta_prepare_write() */
+
+/*****************************************************************************/
+/*
+ * we don't do commit_write on metadata
+ */
+static int cachefs_meta_commit_write(struct file *file, struct page *page,
+				     unsigned from, unsigned to)
+{
+	printk(KERN_ERR "CacheFS: meta-data commit_write not supported\n");
+	BUG();
+	return -EIO;
+
+} /* end cachefs_meta_commit_write() */
+
+/*****************************************************************************/
+/*
+ * handle completion of page write
+ */
+static int cachefs_meta_end_io_write(struct bio *bio, unsigned int bytes_done,
+				     int err)
+{
+	struct cachefs_super *super;
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+
+	if (bio->bi_size)
+		return 1;
+
+	_enter("");
+
+	super = bvec->bv_page->mapping->host->i_sb->s_fs_info;
+
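+	/* walk the BIO's page vector backwards, ending writeback on each
+	 * page and dropping the reference taken when it was submitted */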
+	do {
+		struct page *page = bvec->bv_page;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (!uptodate)
+			SetPageError(page);
+
+		if (page_private(page)) {
+			struct cachefs_journal *jnl =
+				(struct cachefs_journal *) page_private(page);
+			set_page_private(page, 0);
+			cachefs_journal_release(jnl);
+		}
+
+#if 0
+		kdebug("Wrote page %p; fl=%lx cnt=%d",
+		       page, page->flags, page_count(page));
+#endif
+
+		end_page_writeback(page);
+		put_page(page);
+
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+	return 0;
+
+} /* end cachefs_meta_end_io_write() */
+
+/*****************************************************************************/
+/*
+ * write out a page
+ */
+static int cachefs_meta_writepage(struct page *page,
+				  struct writeback_control *wbc)
+{
+	struct cachefs_super *super;
+	struct bio *bio;
+
+	_enter("{%lx},", page->index);
+
+	ASSERT(PageMappedToDisk(page));
+
+	bio = bio_alloc(__GFP_WAIT | __GFP_HIGH | __GFP_NOFAIL, 1);
+	if (!bio)
+		BUG();
+
+	super = page->mapping->host->i_sb->s_fs_info;
+	ASSERT(page->index < super->j.alloc_unready);
+
+	bio->bi_bdev	= super->sb->s_bdev;
+	bio->bi_sector	= page->index;
+	bio->bi_sector	<<= PAGE_SHIFT - super->sb->s_blocksize_bits;
+	bio->bi_end_io	= cachefs_meta_end_io_write;
+
+	if (!bio_add_page(bio, page, PAGE_SIZE, 0))
+		BUG();
+
+#if 0
+	kdebug("Write page %p; fl=%lx cnt=%d",
+	       page, page->flags, page_count(page));
+#endif
+
+	get_page(page);
+	if (test_set_page_writeback(page))
+		BUG();
+	clear_page_dirty(page);
+	submit_bio(WRITE, bio);
+
+	unlock_page(page);
+
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_meta_writepage() */
+
+/*****************************************************************************/
+/*
+ * send a page to the disk
+ */
+void cachefs_allocator_write_page(struct page *page)
+{
+	struct cachefs_super *super;
+	struct bio *bio;
+
+	kenter("{%lx},", page->index);
+
+	ASSERT(PageMappedToDisk(page));
+
+	bio = bio_alloc(__GFP_WAIT | __GFP_HIGH | __GFP_NOFAIL, 1);
+	if (!bio)
+		BUG();
+
+	super = page->mapping->host->i_sb->s_fs_info;
+	ASSERT(page->index < super->j.alloc_unready);
+
+	bio->bi_bdev	= super->sb->s_bdev;
+	bio->bi_sector	= page->index;
+	bio->bi_sector	<<= PAGE_SHIFT - super->sb->s_blocksize_bits;
+	bio->bi_end_io	= cachefs_meta_end_io_write;
+
+	if (!bio_add_page(bio, page, PAGE_SIZE, 0))
+		BUG();
+
+	get_page(page);
+
+	/* the journaller and the allocator might trample on one another to
+	 * write out a page, so we have to wait if the page is being written
+	 * out */
+	while (TestSetPageWriteback(page))
+		wait_on_page_writeback(page);
+
+	ClearPageDirty(page);
+
+	//dump_bio(bio, 1234);
+	submit_bio(WRITE, bio);
+
+	_leave("");
+
+} /* end cachefs_allocator_write_page() */
+
+/*****************************************************************************/
+/*
+ * send a page to the disk if it's possible to do so without sleeping
+ */
+int cachefs_allocator_write_page_nowait(struct page *page)
+{
+	struct cachefs_super *super;
+	struct bio *bio;
+
+	kenter("{%lx},", page->index);
+
+	ASSERT(PageMappedToDisk(page));
+
+	/* the journaller and the allocator might trample on one another to
+	 * write out a page, so we have to wait if the page is being written
+	 * out */
+	if (PageWriteback(page)) {
+		kleave(" = -EBUSY [busy wb]");
+		return -EBUSY;
+	}
+
+	bio = bio_alloc(GFP_ATOMIC, 1);
+	if (!bio) {
+		kleave(" = -ENOMEM [busy nomem]");
+		return -ENOMEM;
+	}
+
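+	/* now we have a BIO, recheck for writeback in case someone else
+	 * started writing the page whilst we were allocating */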
+	if (TestSetPageWriteback(page)) {
+		bio_put(bio);
+		kleave(" = -EBUSY [busy wb2]");
+		return -EBUSY;
+	}
+
+	super = page->mapping->host->i_sb->s_fs_info;
+	ASSERT(page->index < super->j.alloc_unready);
+
+	bio->bi_bdev	= super->sb->s_bdev;
+	bio->bi_sector	= page->index;
+	bio->bi_sector	<<= PAGE_SHIFT - super->sb->s_blocksize_bits;
+	bio->bi_end_io	= cachefs_meta_end_io_write;
+
+	if (!bio_add_page(bio, page, PAGE_SIZE, 0))
+		BUG();
+
+	get_page(page);
+
+	ClearPageDirty(page);
+
+	//dump_bio(bio, 1234);
+	submit_bio(WRITE, bio);
+
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_allocator_write_page_nowait() */
+
+/*****************************************************************************/
+/*
+ * write metadata pages to disk
+ */
+static int cachefs_meta_writepages(struct address_space *mapping,
+				   struct writeback_control *wbc)
+{
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	struct cachefs_super *super = mapping->host->i_sb->s_fs_info;
+	pgoff_t index, end = -1;
+	int is_range = 0;
+	int ret, nr_pages, first, stop, send, done, wrap;
+
+	_enter("{%lu},{%u,%lu}",
+	       mapping->host->i_ino, wbc->sync_mode, wbc->nr_to_write);
+
+	if (wbc->nonblocking && bdi_write_congested(bdi)) {
+		wbc->encountered_congestion = 1;
+		_leave(" = 0 [congested]");
+		return 0;
+	}
+
+	/* determine the starting page */
+	if (wbc->start || wbc->end) {
+		/* we were given a range to play with */
+		index = wbc->start >> PAGE_CACHE_SHIFT;
+		end = wbc->end >> PAGE_CACHE_SHIFT;
+		is_range = 1;
+		wrap = 0;
+	}
+	else if (wbc->sync_mode == WB_SYNC_NONE) {
+		/* start from place we left off last time */
+		index = mapping->writeback_index;
+		wrap = (index == 0) ? 0 : 1;
+	}
+	else {
+		/* do the whole file, front to back */
+		index = 0;
+		wrap = 0;
+	}
+
+	do {
+		struct pagevec pvec;
+
+		/* grab a bunch of dirty pages to write out */
+		pagevec_init(&pvec, 0);
+	wrap:
+		nr_pages = min(end - index, (pgoff_t) PAGEVEC_SIZE - 1) + 1;
+		if (nr_pages > wbc->nr_to_write)
+			nr_pages = wbc->nr_to_write;
+
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+					      PAGECACHE_TAG_DIRTY, nr_pages);
+
+		if (!nr_pages) {
+			if (wrap) {
+				index = 0;
+				wrap = 0;
+				goto wrap;
+			}
+
+			pagevec_release(&pvec);
+			break;
+		}
+
+		_debug("gang %d/%ld from %lx - %lx",
+		       nr_pages, wbc->nr_to_write, index, end);
+
+		first = 0;
+		do {
+			struct page *page;
+			struct bio *bio;
+
+			page = pvec.pages[first];
+			index = page->index;
+			_debug("  - pg[%d] %lx", first, index);
+
+			if (index > end)
+				break;
+
+			ASSERT(PageMappedToDisk(page));
+			ASSERT(index < super->j.alloc_unready);
+
+			/* find out how many on-disk adjacent pages we can fire
+			 * off at once */
+			stop = first + 1;
+			while (stop < nr_pages) {
+				_debug("  - apg[%d] %lx",
+				       stop, pvec.pages[stop]->index);
+				if (pvec.pages[stop]->index != index + 1)
+					break;
+				index++;
+				stop++;
+			}
+
+			_debug("stop %d (%lx)", stop, index);
+
+			ASSERT(index < super->j.alloc_unready);
+
+			/* allocate a BIO to take as many of the run as
+			 * possible */
+			ret = cachefs_meta_io_alloc(mapping->host->i_sb,
+						    page->index,
+						    stop - first,
+						    GFP_NOFS | __GFP_HIGH,
+						    &bio);
+			ASSERT(ret >= 0);
+
+			_debug("got bio");
+
+			bio->bi_end_io = cachefs_meta_end_io_write;
+
+			for (send = first; send < stop; send++) {
+				page = pvec.pages[send];
+
+				_debug("add pg %lx%s%s",
+				       page->index,
+				       PageLocked(page) ? " locked" : "",
+				       PageWriteback(page) ? " wb" : "");
+
+				lock_page(page);
+
+				if (wbc->sync_mode != WB_SYNC_NONE)
+					wait_on_page_writeback(page);
+
+				if (PageWriteback(page) ||
+				    !clear_page_dirty_for_io(page)
+				    )
+					break;
+
+				if (!bio_add_page(bio, page, PAGE_SIZE, 0))
+					break;
+
+#if 0
+				kdebug("Write pages %p; fl=%lx cnt=%d",
+				       page, page->flags, page_count(page));
+#endif
+
+				get_page(page);
+				if (test_set_page_writeback(page))
+					BUG();
+				wbc->nr_to_write--;
+			}
+
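+			/* if we stopped partway through the run, unlock the
+			 * page we locked but couldn't add to the BIO */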
+			if (send < stop)
+				unlock_page(pvec.pages[send]);
+			if (send == first) {
+				_debug("first page already in flight");
+				bio_put(bio);
+				first++;
+				continue;
+			}
+
+			/* throw the pages at the disk */
+			_debug("submit");
+			//dump_bio(bio, 0);
+			submit_bio(WRITE, bio);
+
+			_debug("unlock");
+			for (; first < send; first++)
+				unlock_page(pvec.pages[first]);
+
+		} while (first < nr_pages);
+
+		_debug("rel");
+		pagevec_release(&pvec);
+
+		/* determine whether we should continue shovelling blocks to
+		 * disk */
+		done = 0;
+		if (wbc->nr_to_write <= 0)
+			done = 1;
+
+		if (wbc->nonblocking && bdi_write_congested(bdi)) {
+			wbc->encountered_congestion = 1;
+			done = 1;
+		}
+
+		if (index > end) {
+			if (wrap) {
+				wrap = 0;
+				index = 0;
+			}
+			else {
+				done = 1;
+			}
+		}
+
+		cond_resched();
+	} while (!done);
+
+	if (!is_range)
+		mapping->writeback_index = index;
+
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_meta_writepages() */
+
+/*****************************************************************************/
+/*
+ * mark metadata pages dirty, but without trying to muck around with buffer
+ * heads
+ */
+static int cachefs_meta_set_page_dirty(struct page *page)
+{
+	_enter("{%lx}", page->index);
+
+	ASSERT(PageMappedToDisk(page));
+
+	/* make it drive the current transaction */
+	if (!page_private(page)) {
+		struct cachefs_super *super;
+
+		super = page->mapping->host->i_sb->s_fs_info;
+		set_page_private(page,
+				 (unsigned long) cachefs_journal_get(super));
+	}
+
+	return __set_page_dirty_nobuffers(page);
+
+} /* end cachefs_meta_set_page_dirty() */
+
+/*****************************************************************************/
+/*
+ * synchronise a page
+ */
+static int cachefs_meta_sync_page(struct page *page)
+{
+	_enter("{in=%lx pg=%lx %lx}",
+	       page->mapping->host->i_ino, page->index, page->flags);
+
+	/* kick the blockdev into action */
+	return block_sync_page(page);
+
+} /* end cachefs_meta_sync_page() */
+
+/*****************************************************************************/
+/*
+ * invalidate part or all of a page
+ */
+static int cachefs_meta_invalidatepage(struct page *page, unsigned long offset)
+{
+	int ret = 1;
+
+	_enter("{%d,%lx,%lx},", page_count(page), page->index, page->flags);
+
+	ASSERT(PageLocked(page));
+	ASSERT(offset == 0);
+
+	if (PagePrivate(page)) {
+		/* we release page attachments only if the entire page is being
+		 * invalidated - in that case, the block mapping has been
+		 * unconditionally invalidated, so real IO is not possible
+		 * anymore.
+		 */
+		ret = page->mapping->a_ops->releasepage(page, 0);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+
+} /* end cachefs_meta_invalidatepage() */
+
+/*****************************************************************************/
+/*
+ * release a page and clean up its private data
+ */
+static int cachefs_meta_releasepage(struct page *page, gfp_t gfp_flags)
+{
+	_enter("%p{%lx},%x", page, page->index, gfp_flags);
+
+	ASSERTCMP(page_private(page), ==, 0);
+	ClearPagePrivate(page);
+
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_meta_releasepage() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/meta-misc.c linux-2.6.14-mm2-cachefs/fs/cachefs/meta-misc.c
--- linux-2.6.14-mm2/fs/cachefs/meta-misc.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/meta-misc.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,348 @@
+/* meta-misc.c: miscellaneous metadata routines
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/bio.h>
+#include <linux/swap.h>
+#include <linux/mm_inline.h>
+#include "cachefs-int.h"
+
+/*****************************************************************************/
+/*
+ * dummy readpage to initialise a block with zeros
+ */
+static int cachefs_null_filler(void *data, struct page *page)
+{
+	_enter(",{%lx}", page->index);
+
+	ASSERTCMP(page_private(page), ==, 0);
+	SetPageMappedToDisk(page);
+	SetPageUptodate(page);
+	memclear_highpage_flush(page, 0, PAGE_SIZE);
+	set_page_dirty(page);
+	unlock_page(page);
+	return 0;
+
+} /* end cachefs_null_filler() */
+
+/*****************************************************************************/
+/*
+ * read a metadata block from disk or initialise it
+ * - reads pages through the metadata inode
+ * - caller must wait for page to finish reading
+ */
+int cachefs_page_read(struct cachefs_super *super,
+		      cachefs_block_t bix,
+		      int wipe,
+		      struct page **_page)
+{
+	struct address_space *mapping;
+	struct page *page;
+	filler_t *filler;
+
+	_enter(",,%u,%d", bix, wipe);
+
+	ASSERTIF(wipe, bix != 0);
+	ASSERTIF(bix != 0, bix >= super->layout->bix_cache);
+	ASSERTIF(bix != 0, bix < super->j.alloc_unready);
+
+	/* load the page into the page cache */
+	mapping = super->imeta->i_mapping;
+
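+	/* use the ordinary readpage filler to pull the block off the disc,
+	 * or the null filler to start with a zeroed, dirty, uptodate page */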
+	filler = (filler_t *) mapping->a_ops->readpage;
+	if (wipe)
+		filler = cachefs_null_filler;
+
+	page = read_cache_page(mapping, bix, filler, NULL);
+
+	if (IS_ERR(page)) {
+		*_page = NULL;
+		_leave(" = %ld [rcp]", PTR_ERR(page));
+		return PTR_ERR(page);
+	}
+
+	ASSERT(!PageFsMisc(page));
+
+	*_page = page;
+	_leave(" = 0 [%p]", page);
+	return 0;
+
+} /* end cachefs_page_read() */
+
+/*****************************************************************************/
+/*
+ * read a metadata block from disk or initialise it
+ * - reads pages through the metadata inode
+ */
+int cachefs_node_read(struct cachefs_super *super, struct cachefs_tree *node,
+		      int sync)
+{
+	struct address_space *mapping;
+	struct page *page, *xpage;
+	int ret;
+
+	_enter(",{%x}", node->bix);
+
+	ASSERT(node->bix != CACHEFS_NULL_PTR);
+	ASSERT(node->bix >= super->layout->bix_cache);
+	ASSERT(node->bix < super->j.alloc_unready);
+
+	if (node->page)
+		goto page_already_present;
+
+	/* load the page into the page cache */
+	_debug("reading node %p{%x}", node, node->bix);
+
+	mapping = super->imeta->i_mapping;
+	xpage = NULL;
+
+search_again:
+	page = find_get_page(mapping, node->bix);
+	if (!page) {
+		if (!xpage) {
+			xpage = page_cache_alloc_cold(mapping);
+			if (!xpage) {
+				_leave(" = -ENOMEM");
+				return -ENOMEM;
+			}
+		}
+
+		ret = add_to_page_cache_lru(xpage, mapping,
+					    node->bix, GFP_KERNEL);
+		if (ret < 0) {
+			if (ret == -EEXIST)
+				goto search_again;
+			page_cache_release(xpage);
+			_leave(" = %d", ret);
+			return ret;
+		}
+
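+		/* mark the page as being in use by a tree node before
+		 * dispatching the read */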
+		SetPageFsMisc(xpage);
+		SetPagePrivate(xpage);
+
+		ret = mapping->a_ops->readpage(NULL, xpage);
+		if (ret < 0) {
+			ClearPageFsMisc(xpage);
+			ClearPagePrivate(xpage);
+			page_cache_release(xpage);
+			_leave(" = %d", ret);
+			return ret;
+		}
+
+		page = xpage;
+		xpage = NULL;
+	}
+	else {
+		/* we found a page with the index we were looking for; check
+		 * it's not in use */
+		cachefs_page_put(xpage);
+
+		if (TestSetPageFsMisc(page)) {
+			printk(KERN_ERR
+			       "CacheFS: node %p reusing page still in use:"
+			       " %p { %lx, %lx }\n",
+			       node, page, page->index, page->flags);
+			BUG();
+		}
+	}
+
+	/* wait for the page to finish being read */
+	mark_page_accessed(page);
+
+	/* install the new page if no-one else beat us to it */
+	write_lock(&node->lock);
+	if (!node->page) {
+		node->page = page;
+		page = NULL;
+	}
+	else {
+		ASSERT(node->page == page);
+	}
+	write_unlock(&node->lock);
+	cachefs_page_put(page);
+
+page_already_present:
+	if (!sync && !PageError(node->page) && !PageUptodate(node->page))
+		return -EAGAIN;
+
+	wait_on_page_locked(node->page);
+
+	/* validate the node */
+	return cachefs_node_validate(super, node);
+
+} /* end cachefs_node_read() */
+
+/*****************************************************************************/
+/*
+ * check a metadata node's validity
+ */
+int cachefs_node_validate(struct cachefs_super *super,
+			  struct cachefs_tree *node)
+{
+	struct cachefs_ondisc_leaf *leaf;
+	cachefs_block_t *ptr, bix;
+	void *data;
+	int occupancy, loop, loop2;
+
+	_enter("");
+
+	if (PageError(node->page))
+		goto io_error;
+
+	ASSERT(PageUptodate(node->page));
+	ASSERT(PageFsMisc(node->page));
+	ASSERT(PageMappedToDisk(node->page));
+
+	/* determine whether we need to validate it or not */
+try_again:
+	if (test_bit(CACHEFS_TREE_NODE_VALIDATED, &node->flags)) {
+		if (test_bit(CACHEFS_TREE_NODE_VALID, &node->flags))
+			return 0;
+
+		_leave(" = -EIO [invalid]");
+		return -EIO;
+	}
+
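+	/* take the node lock and recheck in case another task validated the
+	 * node whilst we were unlocked */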
+	write_lock(&node->lock);
+	if (test_bit(CACHEFS_TREE_NODE_VALIDATED, &node->flags)) {
+		write_unlock(&node->lock);
+		goto try_again;
+	}
+
+	/* we do */
+	_debug("validating %x", node->bix);
+
+	data = kmap_atomic(node->page, KM_USER0);
+
+	occupancy = 0;
+
+	for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK; loop++) {
+		leaf = data + (loop << super->layout->leaf_shift);
+
+		/* typed node leaves have the second pointer slot pointing into
+		 * the journal; real pointer leaves always point elsewhere */
+		switch (leaf->type) {
+		case CACHEFS_ONDISC_OBJTYPE_EMPTY_SLOT:
+			break;
+
+		case CACHEFS_ONDISC_OBJTYPE_INDEX_OBJECT:
+		case CACHEFS_ONDISC_OBJTYPE_DATA_OBJECT:
+		case CACHEFS_ONDISC_OBJTYPE_OTHER_OBJECT:
+		case CACHEFS_ONDISC_OBJTYPE_SHORTCUT:
+			if (node->type == CACHEFS_TREE_TYPE_DATAPTRBLK)
+				goto object_in_dataptrblk;
+
+			occupancy++;
+
+			bix = leaf->ptr;
+			if (bix == CACHEFS_NULL_PTR)
+				break;
+
+			if (bix == CACHEFS_EMPTY_PTR ||
+			    bix < super->layout->bix_cache ||
+			    bix >= super->j.alloc_unready)
+				goto invalid_pointer;
+			break;
+
+		default:
+			goto type_error;
+
+			/* pointer block leaf */
+#if CACHEFS_ONDISC_OBJTYPE_NULL_POINTER < CACHEFS_ONDISC_OBJTYPE_FIRST_POINTER
+		case CACHEFS_ONDISC_OBJTYPE_NULL_POINTER:
+#endif
+		case CACHEFS_ONDISC_OBJTYPE_FIRST_POINTER ...
+			CACHEFS_ONDISC_OBJTYPE_LAST_POINTER:
+			ptr = (cachefs_block_t *) leaf;
+
+			for (loop2 = CACHEFS_ONDISC_PTRPERLEAF - 1;
+			     loop2 >= 0;
+			     loop2--
+			     ) {
+				bix = ptr[loop2];
+				if (bix == CACHEFS_NULL_PTR)
+					continue;
+
+				if (bix == CACHEFS_EMPTY_PTR ||
+				    bix < super->layout->bix_cache ||
+				    bix >= super->j.alloc_unready)
+					goto invalid_pointer;
+
+				occupancy++;
+			}
+			break;
+		}
+	}
+
+	kunmap_atomic(data, KM_USER0);
+
+	node->occupancy = occupancy;
+	_debug("occupancy: %d", occupancy);
+
+	set_bit(CACHEFS_TREE_NODE_VALID, &node->flags);
+	set_bit(CACHEFS_TREE_NODE_VALIDATED, &node->flags);
+	write_unlock(&node->lock);
+	_leave(" = 0");
+	return 0;
+
+io_error:
+	set_bit(CACHEFS_TREE_NODE_INVALID, &node->flags);
+	set_bit(CACHEFS_TREE_NODE_VALIDATED, &node->flags);
+	if (atomic_read(&super->error_count) < 5) {
+		atomic_inc(&super->error_count);
+		printk(KERN_ERR
+		       "CacheFS: I/O Error reading block %x\n",
+		       node->bix);
+	}
+	goto error;
+
+object_in_dataptrblk:
+	if (atomic_read(&super->error_count) < 5) {
+		atomic_inc(&super->error_count);
+		printk(KERN_ERR
+		       "CacheFS: Object type %x in data pointer block %x\n",
+		       leaf->type, node->bix);
+	}
+	goto error_u;
+
+type_error:
+	if (atomic_read(&super->error_count) < 5) {
+		atomic_inc(&super->error_count);
+		printk(KERN_ERR
+		       "CacheFS: Unrecognised object type %x in block %x\n",
+		       leaf->type, node->bix);
+	}
+	goto error_u;
+
+invalid_pointer:
+	if (atomic_read(&super->error_count) < 5) {
+		atomic_inc(&super->error_count);
+		printk(KERN_ERR
+		       "CacheFS: Invalid pointer %x in block %x\n",
+		       bix, node->bix);
+	}
+
+error_u:
+	kunmap_atomic(data, KM_USER0);
+	set_bit(CACHEFS_TREE_NODE_INVALID, &node->flags);
+	set_bit(CACHEFS_TREE_NODE_VALIDATED, &node->flags);
+	write_unlock(&node->lock);
+
+error:
+	set_bit(CACHEFS_SUPER_ERROR_STOP, &super->flags);
+	kleave(" = -EIO [data error]");
+	return -EIO;
+
+} /* end cachefs_node_validate() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/operation.c linux-2.6.14-mm2-cachefs/fs/cachefs/operation.c
--- linux-2.6.14-mm2/fs/cachefs/operation.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/operation.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,571 @@
+/* operation.c: CacheFS operations manager
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+//#define __KENTER
+//#define __KDEBUG
+//#define __KLEAVE
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "cachefs-int.h"
+
+/*****************************************************************************/
+/*
+ * make sure there are sufficient blocks available before letting the
+ * operation proceed
+ */
+int cachefs_operation_begin(struct cachefs_operation *op)
+{
+	unsigned reservation = 0, excess = 0;
+	unsigned maxkeysize, maxlevels, maxdatalevels, available;
+	unsigned delete_min = 0;
+	int ret;
+
+	_enter("%p", op);
+
+	INIT_LIST_HEAD(&op->op_link);
+	INIT_LIST_HEAD(&op->alloc_link);
+
+	op->state = CACHEFS_OP_RESERVING;
+	op->task = NULL;
+	op->p.nodes[0] = NULL;
+	op->p.nodes[1] = NULL;
+	op->p.nodes[2] = NULL;
+	op->p.nodes[3] = NULL;
+	op->n_alloc = 0;
+	op->m_alloc = 0;
+	op->n_rcm = 0;
+	op->m_rcm = 0;
+
+	/* work out the maximum depth of the tree */
+	maxkeysize = op->super->layout->leaf_size;
+	maxkeysize -= offsetof(struct cachefs_ondisc_leaf, u.object.key);
+	maxkeysize *= 8;
+
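+	/* the key is consumed CACHEFS_ONDISC_LEVEL_BITS bits at a time, so
+	 * the maximum key size in bits bounds the depth of the tree */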
+	maxlevels = maxkeysize + CACHEFS_ONDISC_LEVEL_BITS - 1;
+	maxlevels /= CACHEFS_ONDISC_LEVEL_BITS;
+
+	maxdatalevels = sizeof(pgoff_t) * 8;
+	maxdatalevels += CACHEFS_ONDISC_LEVEL_BITS - 1;
+	maxdatalevels /= CACHEFS_ONDISC_LEVEL_BITS;
+
+	/* work out the overheads */
+	switch (op->reason) {
+	case CACHEFS_OP_INSERT_LEAF:
+		/* we might have to allocate a side branch if we displace some
+		 * blocks */
+		reservation++;
+		excess = maxlevels;
+		break;
+
+	case CACHEFS_OP_DELETE_LEAF:
+		/* we might have to recycle the object's ID (to kill its
+		 * children) */
+		if (op->object->flags & CACHEFS_ONDISC_OBJECT_HAS_CHILDREN)
+			delete_min++;
+
+		/* and its data tree if not trivial */
+		if (op->object->data_levels > 0)
+			delete_min++;
+
+		reservation += delete_min;
+		break;
+
+	case CACHEFS_OP_UPDATE_LEAF:
+		break;
+
+	case CACHEFS_OP_INSERT_DATA:
+		/* we need to allocate a number of data blocks */
+		reservation += op->data_space;
+		excess = maxdatalevels;
+		break;
+
+	case CACHEFS_OP_RECYCLE_DATA:
+	default:
+		BUG();
+		break;
+	}
+
+	/* worst case is having to replace everything on the path to the
+	 * object */
+	reservation += maxlevels;
+
+	op->reservation = reservation;
+	op->excess = excess;
+
+	_debug("resv %u, ex %u", reservation, excess);
+
+	/* and, of course, we may need to allocate a node in the reclaim stack
+	 * to hold the released blocks, plus the allocator node and a spare
+	 */
+	op->alrs_resv = 2;
+
+	/* we must not get stuck if the primary alloc stack is reduced to bare
+	 * minimum (1 free block) and the secondary alloc stack has but a
+	 * single node in it (1023 blocks) */
+	if (op->reservation + op->alrs_resv + op->excess >
+	    CACHEFS_ONDISC_FREELIST_PTRSPERNODE + 1
+	    ) {
+		printk(KERN_ERR "CacheFS:"
+		       " Stuck due to requested allocation being too large:"
+		       " %x > %lx\n",
+		       op->reservation + op->alrs_resv + op->excess,
+		       CACHEFS_ONDISC_FREELIST_PTRSPERNODE + 1);
+
+		BUG();
+	}
+
+	/* we have to prevent the journal from wandering whilst we determine
+	 * whether this operation can proceed
+	 */
+	down_read(&op->super->tree_wander_sem);
+
+	/* lock for allocation */
+	_debug("space: U:%u A:%u I:%u R:%u",
+	       op->super->layout->bix_end - op->super->j.alloc_unready,
+	       op->super->j.space_alloc,
+	       op->super->space_inprogress,
+	       op->super->space_rcmstk_resv);
+
+	spin_lock(&op->super->operation_lock);
+
+	spin_lock(&op->super->alloc_lock);
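+	/* available space is the unready region plus the allocation list,
+	 * less what's already reserved by running ops and the reclaim stack */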
+	available = op->super->layout->bix_end - op->super->j.alloc_unready;
+	available += op->super->j.space_alloc;
+	available -= op->super->space_inprogress;
+	available -= op->super->space_rcmstk_resv;
+	spin_unlock(&op->super->alloc_lock);
+
+	_debug("op %d: avail=%u reqd=%u%s",
+	       op->reason,
+	       available,
+	       op->reservation + op->alrs_resv + op->excess,
+	       list_empty(&op->super->op_waitq) ? "" : " [sleepers]");
+
+	/* see if we can let the operation proceed immediately (deletes can
+	 * jump the queue) */
+	if (list_empty(&op->super->op_waitq) ||
+	    op->reason == CACHEFS_OP_DELETE_LEAF
+	    ) {
+		if (op->reservation + op->alrs_resv + op->excess < available)
+			goto run_operation_immediately;
+	}
+
+	/* allocation not immediately possible
+	 * - must permit deletion to run with lower credits even if nothing
+	 *   else can run
+	 */
+	if (op->reason == CACHEFS_OP_DELETE_LEAF &&
+	    list_empty(&op->super->op_runq)
+	    ) {
+		/* nothing else is running, therefore the tree cannot be
+		 * rearranged, so we can trust the object's node level as an
+		 * accurate guide */
+		delete_min += op->object->node->level;
+
+		if (op->super->metadata_tree->immutable >=
+		    op->super->jnl_serial)
+			delete_min--; /* root node already replaced */
+
+		if (delete_min + op->alrs_resv + op->excess < available) {
+			op->reservation = delete_min;
+			goto run_operation_immediately;
+		}
+	}
+
+	/* must queue the allocation request
+	 * - deletions go at the front of the queue
+	 */
+	if (op->reason == CACHEFS_OP_DELETE_LEAF)
+		list_add(&op->op_link, &op->super->op_waitq);
+	else
+		list_add_tail(&op->op_link, &op->super->op_waitq);
+
+	op->task = current;
+	get_task_struct(op->task);
+
+	spin_unlock(&op->super->operation_lock);
+
+	/* cause the tree to wander to get at the space reclaimed if there's
+	 * enough of it */
+	up_read(&op->super->tree_wander_sem);
+
+	if (op->super->j.rcm_ready) {
+		_debug("sync to retrieve reclaim stack");
+		ret = cachefs_sync(op->super, 1, 0);
+		if (ret < 0)
+			goto sync_failed;
+	}
+	if (op->super->space_transit) {
+		/* there is space... it's just transiting from the reclamation
+		 * list to the allocation list */
+		kdebug("await transit");
+	}
+	else {
+		/* request an emergency cull */
+		kdebug("need cull");
+		BUG(); // TODO - request an emergency cull
+	}
+
+	/* go to sleep until there's stuff available */
+	_debug("wait for allocation");
+
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (op->state != CACHEFS_OP_RESERVING ||
+		    signal_pending(current))
+			break;
+		schedule();
+	}
+
+	__set_current_state(TASK_RUNNING);
+
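+	/* if we were signalled, withdraw from the wait queue unless the
+	 * allocator got there first and made the operation runnable */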
+	if (signal_pending(current)) {
+		spin_lock(&op->super->operation_lock);
+
+		if (op->state == CACHEFS_OP_RESERVING)
+			list_del_init(&op->op_link);
+
+		spin_unlock(&op->super->operation_lock);
+
+		if (op->state == CACHEFS_OP_RESERVING) {
+			put_task_struct(op->task);
+			_leave(" = -EINTR");
+			return -EINTR;
+		}
+	}
+
+	/* the allocator came through whilst we were busy */
+operation_made_runnable:
+	ASSERT(!op->task);
+	down_read(&op->super->tree_wander_sem);
+	_leave(" = 0");
+	return 0;
+
+	/* the operation was immediately runnable, so we set it up and let it
+	 * go */
+run_operation_immediately:
+	list_add_tail(&op->op_link, &op->super->op_runq);
+
+	spin_lock(&op->super->alloc_lock);
+	op->super->space_inprogress += op->reservation;
+	op->super->space_rcmstk_resv += op->alrs_resv;
+	op->super->space_rcmstk_resv_max += op->alrs_resv;
+	spin_unlock(&op->super->alloc_lock);
+
+	op->state = CACHEFS_OP_RUNNING;
+	spin_unlock(&op->super->operation_lock);
+
+	_debug("reserved %u/%u", op->reservation, op->super->space_inprogress);
+
+	_debug("reserved alrs %u/%u/%u",
+	       op->alrs_resv,
+	       op->super->space_rcmstk_resv,
+	       op->super->space_rcmstk_resv_max);
+
+	_leave(" = 0");
+	return 0;
+
+sync_failed:
+	spin_lock(&op->super->operation_lock);
+	if (op->state == CACHEFS_OP_RESERVING)
+		list_del(&op->op_link);
+	spin_unlock(&op->super->operation_lock);
+
+	if (op->state == CACHEFS_OP_RUNNING) {
+		_debug("ignoring sync error %d", ret);
+		goto operation_made_runnable;
+	}
+
+	if (op->task)
+		put_task_struct(op->task);
+	_leave(" = %d", ret);
+	return ret;
+
+} /* end cachefs_operation_begin() */
+
+/*****************************************************************************/
+/*
+ * make sure there are sufficient blocks available before letting the
+ * operation proceed
+ * - special for kcachefsd operations which may not sleep
+ * - returns -EINPROGRESS if the op is queued pending block availability
+ */
+int cachefs_operation_begin_kcachefsd(struct cachefs_operation *op)
+{
+	unsigned available;
+
+	_enter("%p", op);
+
+	INIT_LIST_HEAD(&op->op_link);
+	INIT_LIST_HEAD(&op->alloc_link);
+
+	op->state = CACHEFS_OP_RESERVING;
+	op->task = NULL;
+	op->p.nodes[0] = NULL;
+	op->p.nodes[1] = NULL;
+	op->p.nodes[2] = NULL;
+	op->p.nodes[3] = NULL;
+	op->n_alloc = 0;
+	op->m_alloc = 0;
+	op->n_rcm = 0;
+	op->m_rcm = 0;
+	op->reservation = 0;
+	op->excess = 0;
+
+	/* work out the overheads */
+	switch (op->reason) {
+	case CACHEFS_OP_RECYCLE_DATA:
+		break;
+
+	default:
+		BUG();
+		break;
+	}
+
+	/* we may need to allocate a node in the reclaim stack to hold the
+	 * released blocks, plus the allocator node and a spare
+	 */
+	op->alrs_resv = 2;
+
+	/* we have to prevent the journal from wandering whilst we determine
+	 * whether this operation can proceed
+	 */
+	down_read(&op->super->tree_wander_sem);
+
+	/* lock for allocation */
+	_debug("space: U:%u A:%u I:%u R:%u",
+	       op->super->layout->bix_end - op->super->j.alloc_unready,
+	       op->super->j.space_alloc,
+	       op->super->space_inprogress,
+	       op->super->space_rcmstk_resv);
+
+	spin_lock(&op->super->operation_lock);
+
+	spin_lock(&op->super->alloc_lock);
+	available = op->super->layout->bix_end - op->super->j.alloc_unready;
+	available += op->super->j.space_alloc;
+	available -= op->super->space_inprogress;
+	available -= op->super->space_rcmstk_resv;
+	spin_unlock(&op->super->alloc_lock);
+
+	_debug("op %d: avail=%u reqd=%u%s",
+	       op->reason,
+	       available,
+	       op->reservation + op->alrs_resv + op->excess,
+	       list_empty(&op->super->op_waitq) ? "" : " [sleepers]");
+
+	/* see if we can let the operation proceed immediately (data recycling
+	 * is permitted to jump the queue) */
+	if (list_empty(&op->super->op_waitq) ||
+	    op->reason == CACHEFS_OP_RECYCLE_DATA
+	    ) {
+		if (op->reservation + op->alrs_resv + op->excess < available)
+			goto run_operation_immediately;
+	}
+
+	/* must queue the allocation request
+	 * - reclamations go at the front of the queue
+	 */
+	if (op->reason == CACHEFS_OP_RECYCLE_DATA)
+		list_add(&op->op_link, &op->super->op_waitq);
+	else
+		list_add_tail(&op->op_link, &op->super->op_waitq);
+
+	op->task = current;
+	get_task_struct(op->task);
+
+	spin_unlock(&op->super->operation_lock);
+
+	/* cause the tree to wander to get at the space reclaimed if there's
+	 * enough of it */
+	up_read(&op->super->tree_wander_sem);
+
+	if (op->super->j.rcm_ready) {
+		_debug("sync to retrieve reclaim stack");
+		cachefs_sync(op->super, 0, 0);
+	}
+
+	/* wait asynchronously for the allocation to become possible */
+	_debug("wait for allocator");
+
+	if (op->state != CACHEFS_OP_RESERVING) {
+		_leave(" = -EINPROGRESS");
+		return -EINPROGRESS;
+	}
+
+	/* the allocator came through whilst we were busy */
+	ASSERT(!op->task);
+	_leave(" = 0");
+	return 0;
+
+run_operation_immediately:
+	list_add_tail(&op->op_link, &op->super->op_runq);
+
+	spin_lock(&op->super->alloc_lock);
+	op->super->space_inprogress += op->reservation;
+	op->super->space_rcmstk_resv += op->alrs_resv;
+	op->super->space_rcmstk_resv_max += op->alrs_resv;
+	spin_unlock(&op->super->alloc_lock);
+
+	op->state = CACHEFS_OP_RUNNING;
+	spin_unlock(&op->super->operation_lock);
+
+	/* kcachefsd should only take the wander sem when it needs it */
+	up_read(&op->super->tree_wander_sem);
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_operation_begin_kcachefsd() */
+
+/*****************************************************************************/
+/*
+ * contemplate passing reservations over to sleeping processes
+ * - must be called with the operation lock held
+ */
+static void __cachefs_operation_run(struct cachefs_super *super)
+{
+	struct cachefs_operation *xop;
+	struct task_struct *task;
+	unsigned available;
+
+	spin_lock(&super->alloc_lock);
+	available = super->layout->bix_end - super->j.alloc_unready;
+	available += super->j.space_alloc;
+	available -= super->space_inprogress;
+	available -= super->space_rcmstk_resv;
+	spin_unlock(&super->alloc_lock);
+
+	/* see if anyone waiting for sufficient allocation reservation can now
+	 * be allowed to run */
+	while (!list_empty(&super->op_waitq)) {
+		xop = list_entry(super->op_waitq.next,
+				 struct cachefs_operation, op_link);
+
+		if (available <
+		    xop->reservation + xop->alrs_resv + xop->excess)
+			break; /* no space */
+
+		/* set this op going */
+		kdebug("run op %p '%s'", xop, xop->task->comm);
+
+		list_move_tail(&xop->op_link, &super->op_runq);
+
+		available -= xop->reservation;
+		available -= xop->alrs_resv;
+
+		spin_lock(&super->alloc_lock);
+		super->space_inprogress += xop->reservation;
+		super->space_rcmstk_resv += xop->alrs_resv;
+		super->space_rcmstk_resv_max += xop->alrs_resv;
+		spin_unlock(&super->alloc_lock);
+
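+		/* clear the task pointer before flipping the state so that
+		 * the woken task sees op->task already NULLed */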
+		task = xop->task;
+		xop->task = NULL;
+		smp_mb();
+		xop->state = CACHEFS_OP_RUNNING;
+
+		if (task == super->dmn_task)
+			/* tell kcachefsd what it's supposed to be doing */
+			set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+
+		wake_up_process(task);
+		put_task_struct(task);
+	}
+
+} /* end __cachefs_operation_run() */
+
+/*****************************************************************************/
+/*
+ * run the allocator to let blocked operations allocate if possible
+ */
+void cachefs_operation_run(struct cachefs_super *super)
+{
+	_enter("");
+
+	spin_lock(&super->operation_lock);
+	__cachefs_operation_run(super);
+	spin_unlock(&super->operation_lock);
+
+	_leave("");
+
+} /* end cachefs_operation_run() */
+
+/*****************************************************************************/
+/*
+ * end an operation
+ */
+void cachefs_operation_end(struct cachefs_operation *op)
+{
+	/* give the tree wanderer a chance */
+	up_read(&op->super->tree_wander_sem);
+
+	cachefs_operation_end_kcachefsd(op);
+
+} /* end cachefs_operation_end() */
+
+/*****************************************************************************/
+/*
+ * end an operation, assuming the wander sem doesn't need releasing
+ */
+void cachefs_operation_end_kcachefsd(struct cachefs_operation *op)
+{
+	struct cachefs_super *super = op->super;
+
+	_enter("%p", op);
+
+	set_bit(CACHEFS_SUPER_NEED_WANDER, &super->flags);
+
+	op->state = CACHEFS_OP_INACTIVE;
+
+	/* kick the journal rollover off in a few seconds */
+	if (super->jnl_timeout > 0 && !timer_pending(&super->jnl_timer))
+		mod_timer(&super->jnl_timer, jiffies + super->jnl_timeout);
+
+	/* release this operation and any reserved blocks that weren't used */
+	spin_lock(&super->operation_lock);
+
+	list_del_init(&op->op_link);
+
+	spin_lock(&super->alloc_lock);
+
+	_debug("dec resv %u by %u", super->space_inprogress, op->reservation);
+
+	_debug("dec alrs %u/%u by %u",
+	       super->space_rcmstk_resv,
+	       super->space_rcmstk_resv_max,
+	       op->alrs_resv);
+
+	super->space_inprogress -= op->reservation;
+	super->space_rcmstk_resv_max -= op->alrs_resv;
+	if (super->space_rcmstk_resv > super->space_rcmstk_resv_max)
+		super->space_rcmstk_resv = super->space_rcmstk_resv_max;
+
+	spin_unlock(&super->alloc_lock);
+
+	/* see if there are any other ops that want a go */
+	__cachefs_operation_run(super);
+
+	spin_unlock(&super->operation_lock);
+
+	cachefs_tree_put(op->p.nodes[0]);
+	cachefs_tree_put(op->p.nodes[1]);
+	cachefs_tree_put(op->p.nodes[2]);
+	cachefs_tree_put(op->p.nodes[3]);
+
+	radix_tree_preload_drain_task();
+
+	ASSERT(!op->task);
+
+	_leave("");
+
+} /* end cachefs_operation_end_kcachefsd() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/reaper.c linux-2.6.14-mm2-cachefs/fs/cachefs/reaper.c
--- linux-2.6.14-mm2/fs/cachefs/reaper.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/reaper.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,120 @@
+/* reaper.c: object reaper thread
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "cachefs-int.h"
+
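+/*
+ * discard any signals delivered to this daemon so that a pending signal
+ * doesn't prevent it from sleeping interruptibly
+ */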
+static inline void discard_my_signals(void)
+{
+	while (signal_pending(current)) {
+		siginfo_t sinfo;
+
+		spin_lock_irq(&current->sighand->siglock);
+		dequeue_signal(current, &current->blocked, &sinfo);
+		spin_unlock_irq(&current->sighand->siglock);
+	}
+}
+
+/*****************************************************************************/
+/*
+ * cache recycling daemon
+ */
+int kreaperd(void *_super)
+{
+	struct cachefs_super *super = _super;
+
+	DECLARE_WAITQUEUE(myself, current);
+
+	super->reaper_task = current;
+
+	daemonize("kreaperd %02x%02x",
+		  MAJOR(super->sb->s_bdev->bd_inode->i_rdev),
+		  MINOR(super->sb->s_bdev->bd_inode->i_rdev));
+
+	complete(&super->reaper_alive);
+
+	printk(KERN_INFO "CacheFS: Started kreaperd %d for cache %s\n",
+	       current->pid, super->cache.identifier);
+
+	/* loop around looking for things to attend to */
+	while (super->reaper_die == CACHEFS_REAPER_RUNNING &&
+	       !test_bit(CACHEFS_SUPER_ERROR_STOP, &super->flags)
+	       ) {
+		/* sleep until there's a target to reap */
+		if (!super->reaper_target) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			add_wait_queue(&super->reaper_sleepq, &myself);
+
+			for (;;) {
+				discard_my_signals();
+
+				if (test_bit(CACHEFS_SUPER_ERROR_STOP, &super->flags))
+					super->reaper_die = CACHEFS_REAPER_DIE;
+
+				if (super->reaper_die != CACHEFS_REAPER_RUNNING ||
+				    super->reaper_target)
+					break;
+
+				schedule();
+				set_current_state(TASK_INTERRUPTIBLE);
+			}
+
+			remove_wait_queue(&super->reaper_sleepq, &myself);
+			set_current_state(TASK_RUNNING);
+
+			if (super->reaper_die != CACHEFS_REAPER_RUNNING)
+				break;
+		}
+
+		/* delete that object */
+		_debug("reap %llx", super->reaper_target->objid);
+
+		if (cachefs_tree_delete(super, super->reaper_target) == 0) {
+			/* success */
+			struct cachefs_object *xobject;
+
+			_debug("reaped");
+
+			xobject = super->reaper_target;
+
+			cachefs_object_put(xobject);
+
+			/* clear the pointer only after destroying the object
+			 * so that anyone waiting for the reaper to delete an
+			 * object won't find the object in place again */
+			spin_lock(&super->objects_lock);
+			super->reaper_target = NULL;
+			spin_unlock(&super->objects_lock);
+
+			/* wake up anyone waiting for us */
+			wake_up(&super->reaper_waitq);
+
+			/* wake up anyone waiting on the reaper to become
+			 * idle */
+			if (super->scan_state == CACHEFS_SCAN_WAITING_FOR_REAPER ||
+			    super->scan_state == CACHEFS_SCAN_COMPLETING_REAP
+			    ) {
+				set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+				wake_up(&super->dmn_sleepq);
+			}
+		}
+
+		cond_resched();
+	}
+
+	_leave(" [dead]");
+	complete_and_exit(&super->reaper_dead, 0);
+
+} /* end kreaperd() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/recycling.c linux-2.6.14-mm2-cachefs/fs/cachefs/recycling.c
--- linux-2.6.14-mm2/fs/cachefs/recycling.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/recycling.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,966 @@
+/* recycling.c: data tree recycling
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+//#define __KDEBUGALL
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/buffer_head.h>
+#include <linux/completion.h>
+#include "cachefs-int.h"
+
+static void cachefs_recycle_inactive(struct cachefs_super *super,
+				     struct cachefs_operation *op);
+static void cachefs_recycle_loading_node(struct cachefs_super *super,
+					 struct cachefs_operation *op);
+static void cachefs_recycle_processing_node(struct cachefs_super *super,
+					    struct cachefs_operation *op);
+static void cachefs_recycle_loading_ptrblk(struct cachefs_super *super,
+					   struct cachefs_operation *op);
+static void cachefs_recycle_processing_ptrblk(struct cachefs_super *super,
+					      struct cachefs_operation *op);
+static void cachefs_recycle_init_operation(struct cachefs_super *super,
+					   struct cachefs_operation *op);
+static void cachefs_recycle_consume_pointers(struct cachefs_super *super,
+					     struct cachefs_operation *op);
+static void cachefs_recycle_consume_ptrblk(struct cachefs_super *super,
+					   struct cachefs_operation *op);
+static void cachefs_recycle_consume_node(struct cachefs_super *super,
+					 struct cachefs_operation *op);
+
+const cachefs_recycle_operation_t cachefs_recycle_operations[CACHEFS_RCY__NSTATES] = {
+	[CACHEFS_RCY_INACTIVE]		= cachefs_recycle_inactive,
+	[CACHEFS_RCY_LOADING_NODE]	= cachefs_recycle_loading_node,
+	[CACHEFS_RCY_PROCESSING_NODE]	= cachefs_recycle_processing_node,
+	[CACHEFS_RCY_LOADING_PTRBLK]	= cachefs_recycle_loading_ptrblk,
+	[CACHEFS_RCY_PROCESSING_PTRBLK]	= cachefs_recycle_processing_ptrblk,
+	[CACHEFS_RCY_CONSUME_POINTERS_I]= cachefs_recycle_init_operation,
+	[CACHEFS_RCY_CONSUME_POINTERS]	= cachefs_recycle_consume_pointers,
+	[CACHEFS_RCY_CONSUME_PTRBLK_I]	= cachefs_recycle_init_operation,
+	[CACHEFS_RCY_CONSUME_PTRBLK]	= cachefs_recycle_consume_ptrblk,
+	[CACHEFS_RCY_CONSUME_NODE_I]	= cachefs_recycle_init_operation,
+	[CACHEFS_RCY_CONSUME_NODE]	= cachefs_recycle_consume_node,
+};
+
+/*****************************************************************************/
+/*
+ * handle the completion of a BIO that read a page for the recycler
+ */
+static int cachefs_recycle_io_complete(struct bio *bio,
+				       unsigned int bytes_done, int err)
+{
+	struct cachefs_super *super;
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct page *page;
+
+	_enter("{sz=%u rw=%lu},%u,%d",
+	       bio->bi_size, bio->bi_rw, bytes_done, err);
+
+	if (bio->bi_size)
+		return 1;
+
+	/* mark the pages with the appropriate state */
+	page = bvec->bv_page;
+	bio_put(bio);
+
+	if (uptodate) {
+		SetPageUptodate(page);
+	} else {
+		ClearPageUptodate(page);
+		SetPageError(page);
+	}
+
+	super = page->mapping->host->i_sb->s_fs_info;
+	unlock_page(page);
+
+	set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+	wake_up(&super->dmn_sleepq);
+	return 0;
+
+} /* end cachefs_recycle_io_complete() */
+
+/*****************************************************************************/
+/*
+ * read a block from disk for the recycler
+ */
+static int cachefs_recycle_readpage(void *data, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct bio *bio;
+	int ret;
+
+	_enter(",{%d,%lx}", page_count(page), page->index);
+
+	SetPageMappedToDisk(page);
+
+	/* dispatch a call to perform the read */
+	ret = -ENOMEM;
+
+	bio = bio_alloc(GFP_KERNEL, 1);
+	if (bio) {
+		bio->bi_bdev	= inode->i_sb->s_bdev;
+		bio->bi_sector	= page->index;
+		bio->bi_sector	<<= PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
+		bio->bi_end_io	= cachefs_recycle_io_complete;
+
+		if (!bio_add_page(bio, page, PAGE_SIZE, 0))
+			BUG();
+
+		submit_bio(READ, bio);
+		ret = 0;
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+
+} /* end cachefs_recycle_readpage() */
+
+/*****************************************************************************/
+/*
+ * read a recycling node metadata block from disk
+ * - reads pages through the metadata inode
+ * - caller must wait for page to finish reading
+ */
+static int cachefs_recycle_node_read(struct cachefs_super *super,
+				     cachefs_block_t bix,
+				     struct page **_page)
+{
+	struct page *page;
+
+	_enter(",%x", bix);
+
+	ASSERT(bix >= super->layout->bix_cache);
+	ASSERT(bix < super->j.alloc_unready);
+
+	/* load the page into the page cache */
+	page = read_cache_page(super->imeta->i_mapping, bix,
+			       cachefs_recycle_readpage, NULL);
+
+	if (IS_ERR(page)) {
+		_leave(" = %ld [rcp]", PTR_ERR(page));
+		return PTR_ERR(page);
+	}
+
+	if (PageUptodate(page) || PageError(page)) {
+		_debug("page already present");
+		set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+	}
+
+	*_page = page;
+	_leave(" = 0 (%p)", page);
+	return 0;
+
+} /* end cachefs_recycle_node_read() */
+
+/*****************************************************************************/
+/*
+ * initiate recycling
+ */
+static void cachefs_recycle_inactive(struct cachefs_super *super,
+				     struct cachefs_operation *op)
+{
+	_enter("");
+
+	ASSERT(!super->page_rcy_proc);
+	ASSERT(super->rcy_p_nlevels == (short) INT_MIN);
+	ASSERT(super->rcy_p_level == -1);
+
+	/* see if we can find something to recycle if we don't have anything
+	 * yet */
+	if (!super->j.rcy_processor &&
+	    (super->j.rcy_stack ||
+	     (super->j.rcy_collector && super->j.rcy_collsp > 0))
+	    ) {
+		ASSERT(!super->page_rcy_proc);
+
+		down_write(&super->tree_wander_sem);
+
+		/* move the pending stack to the active processing stack */
+		if (super->j.rcy_stack) {
+			_debug("steal rcy stack");
+			super->j.rcy_processor = super->j.rcy_stack;
+			super->j.rcy_procsp =
+				CACHEFS_ONDISC_RCYSTK_TREESPERNODE - 1;
+			super->j.rcy_stack = 0;
+		}
+		/* or steal the collector if there's anything in it */
+		else if (super->j.rcy_collector && super->j.rcy_collsp > 0) {
+			_debug("steal rcy collector");
+			super->j.rcy_processor = super->j.rcy_collector;
+			super->j.rcy_procsp = super->j.rcy_collsp - 1;
+			super->j.rcy_collector = 0;
+			super->j.rcy_collsp = -1;
+			super->page_rcy_proc = super->page_rcy;
+			super->page_rcy = NULL;
+		}
+
+		up_write(&super->tree_wander_sem);
+	}
+
+	/* start the active processing stack TOS node loading if there is
+	 * one
+	 * - note that we always have to load the node, even if we're
+	 *   immediately going to scrag it as we need the next pointer from it
+	 */
+	if (super->j.rcy_processor) {
+		if (!super->page_rcy_proc) {
+			/* set to load */
+			if (cachefs_recycle_node_read(super,
+						      super->j.rcy_processor,
+						      &super->page_rcy_proc
+						      ) < 0
+			    ) {
+				set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+				_leave(" [defer]");
+				return;
+			}
+		}
+		else {
+			set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+		}
+
+		super->rcy_state = CACHEFS_RCY_LOADING_NODE;
+	}
+
+	_leave("");
+
+} /* end cachefs_recycle_inactive() */
+
+/*****************************************************************************/
+/*
+ * validate a node in the recycling stack
+ * - the node may not be full, and so may contain "random" data left from a
+ *   crash, but only beyond the limit set by the stack pointer
+ */
+int cachefs_recycle_validate_node(struct cachefs_super *super,
+				  struct page *page,
+				  int rcysp)
+{
+	struct cachefs_ondisc_recycle_node *node;
+	int loop;
+
+	_enter("");
+
+	/* validate the node */
+	node = kmap_atomic(page, KM_USER0);
+
+	if (node->next != 0 &&
+	    (node->next < super->layout->bix_cache ||
+	     node->next >= super->j.alloc_unready)
+	    ) {
+		printk(KERN_ERR "CacheFS:"
+		       " Filesystem Error:"
+		       " Recycling stack node %lx contains"
+		       " bad next ptr %x\n",
+		       page->index, node->next);
+		goto content_error;
+	}
+
+	for (loop = 0; loop < rcysp; loop++) {
+		if (node->trees[loop].dataptr == 0 &&
+		    node->trees[loop].depth == 0)
+			break;
+
+		if (node->trees[loop].dataptr < super->layout->bix_cache ||
+		    node->trees[loop].dataptr >= super->j.alloc_unready
+		    ) {
+			printk(KERN_ERR "CacheFS:"
+			       " Filesystem Error:"
+			       " Recycling stack node %lx contains"
+			       " bad data ptr %x [%x]\n",
+			       page->index, node->trees[loop].dataptr, loop);
+			goto content_error;
+		}
+
+		if (node->trees[loop].depth == 0 ||
+		    node->trees[loop].depth > (64 / CACHEFS_ONDISC_LEVEL_BITS) + 1
+		    ) {
+			printk(KERN_ERR "CacheFS:"
+			       " Filesystem Error:"
+			       " Recycling stack node %lx contains"
+			       " bad data depth #%d for ptr %x [%x]\n",
+			       page->index,
+			       node->trees[loop].depth,
+			       node->trees[loop].dataptr,
+			       loop);
+			goto content_error;
+		}
+	}
+
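+	/* everything beyond the first empty slot should also be empty */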
+	for (loop++; loop < rcysp; loop++) {
+		if (node->trees[loop].dataptr != 0 ||
+		    node->trees[loop].depth != 0
+		    ) {
+			printk(KERN_ERR "CacheFS:"
+			       " Filesystem Error:"
+			       " Recycling stack node %lx contains"
+			       " null entry %x #%d [%x]\n",
+			       " unexpected non-null entry %x #%d [%x]\n",
+			       node->trees[loop].depth,
+			       node->trees[loop].dataptr,
+			       loop);
+			goto content_error;
+		}
+	}
+
+	kunmap_atomic(node, KM_USER0);
+	_leave(" = 0");
+	return 0;
+
+content_error:
+	kunmap_atomic(node, KM_USER0);
+	SetPageError(page);
+	ClearPageUptodate(page);
+	set_bit(CACHEFS_SUPER_ERROR_STOP, &super->flags);
+
+	_leave(" = -EIO");
+	return -EIO;
+
+} /* end cachefs_recycle_validate_node() */
+
+/*****************************************************************************/
+/*
+ * wait for the recycling processing stack TOS node to be read in
+ */
+static void cachefs_recycle_loading_node(struct cachefs_super *super,
+					 struct cachefs_operation *op)
+{
+	_enter("{%x[%d]},", super->j.rcy_processor, super->j.rcy_procsp);
+
+	ASSERT(super->j.rcy_processor != 0);
+	ASSERT(super->j.rcy_procsp >= -1);
+
+	/* start processing of the pointer block */
+	if (PageUptodate(super->page_rcy_proc) &&
+	    cachefs_recycle_validate_node(super,
+					  super->page_rcy_proc,
+					  super->j.rcy_procsp
+					  ) == 0
+	    ) {
+		if (super->j.rcy_procsp < 0)
+			super->rcy_state = CACHEFS_RCY_CONSUME_NODE;
+		else
+			super->rcy_state = CACHEFS_RCY_PROCESSING_NODE;
+
+		set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+		_leave("");
+		return;
+	}
+
+	/* deal with I/O errors */
+	if (PageError(super->page_rcy_proc)) {
+		printk(KERN_ERR
+		       "CacheFS: Recycling node %lx had error; discarding\n",
+		       super->page_rcy_proc->index);
+
+		cachefs_page_put(super->page_rcy_proc);
+		super->page_rcy_proc = NULL;
+
+		down_read(&super->tree_wander_sem);
+		super->j.rcy_processor = 0;
+		super->j.rcy_procsp = -1;
+		up_read(&super->tree_wander_sem);
+
+		super->rcy_state = CACHEFS_RCY_INACTIVE;
+		set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+	}
+
+	_leave(" [error]");
+
+} /* end cachefs_recycle_loading_node() */
+
+/*****************************************************************************/
+/*
+ * process the current ref in the TOS node
+ */
+static void cachefs_recycle_processing_node(struct cachefs_super *super,
+					    struct cachefs_operation *op)
+{
+	struct cachefs_ondisc_recycle_node *node;
+	cachefs_block_t bix;
+
+	_enter("%lx{%d}", super->page_rcy_proc->index, super->j.rcy_procsp);
+
+	down_read(&super->tree_wander_sem);
+
+	if (super->j.rcy_procsp < 0) {
+		/* this node is now empty */
+		super->rcy_p_nlevels = (short) INT_MIN;
+		super->rcy_state = CACHEFS_RCY_CONSUME_NODE;
+		set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+		bix = 0;
+	}
+	else {
+		/* select the tree to consume */
+		node = kmap_atomic(super->page_rcy_proc, KM_USER0);
+		super->rcy_p_nlevels = node->trees[super->j.rcy_procsp].depth;
+		bix = node->trees[super->j.rcy_procsp].dataptr;
+		kunmap_atomic(node, KM_USER0);
+
+		ASSERT(super->rcy_p_nlevels > 0);
+		ASSERT(super->rcy_p_nlevels <= 8);
+		ASSERT(bix != CACHEFS_NULL_PTR);
+		ASSERT(bix >= super->layout->bix_cache);
+		ASSERT(bix < super->j.alloc_unready);
+	}
+
+	up_read(&super->tree_wander_sem);
+
+	/* set the top level pointer block loading if there is one */
+	if (bix) {
+		_debug("block %x", bix);
+
+		super->rcy_p_level = 0;
+
+		if (super->j.rcy_offsets[0] >= CACHEFS_ONDISC_PTR_PER_BLOCK) {
+			/* we've consumed this tree already, barring the root
+			 * pointer block
+			 */
+			super->rcy_state = CACHEFS_RCY_CONSUME_PTRBLK;
+			set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+		}
+		else {
+			if (cachefs_recycle_node_read(
+				    super, bix,
+				    &super->page_rcy_blk[super->rcy_p_level]
+				    ) < 0
+			    ) {
+				set_bit(CACHEFS_SUPER_DO_RECYCLE,
+					&super->flags);
+				_leave(" [defer]");
+				return;
+			}
+
+			ASSERT(super->page_rcy_blk[super->rcy_p_level]);
+			super->rcy_state = CACHEFS_RCY_LOADING_PTRBLK;
+		}
+	}
+
+	_leave("");
+
+} /* end cachefs_recycle_processing_node() */
+
+/*****************************************************************************/
+/*
+ * wait for a pointer block to be read in
+ */
+static void cachefs_recycle_loading_ptrblk(struct cachefs_super *super,
+					   struct cachefs_operation *op)
+{
+	struct page *page;
+	int offset;
+
+	ASSERT(super->rcy_p_level >= 0);
+
+	page = super->page_rcy_blk[super->rcy_p_level];
+	offset = super->j.rcy_offsets[super->rcy_p_level];
+
+	_enter("%d,%p[%d]",
+	       super->rcy_p_level, page, offset);
+
+	ASSERT(page);
+	ASSERT(offset >= 0);
+
+	/* start processing the pointer block */
+	if (PageUptodate(page)) {
+		/* consume the pointer block if we'd previously eaten all its
+		 * contents */
+		if (offset == CACHEFS_ONDISC_RCYSTK_TREESPERNODE)
+			super->rcy_state = CACHEFS_RCY_CONSUME_PTRBLK;
+		/* deal specially with pointer blocks that point directly to
+		 * data blocks */
+		else if (super->rcy_p_level >= super->rcy_p_nlevels - 1)
+			super->rcy_state = CACHEFS_RCY_CONSUME_POINTERS;
+		else
+			super->rcy_state = CACHEFS_RCY_PROCESSING_PTRBLK;
+
+		set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+		_leave("");
+		return;
+	}
+
+	/* deal with I/O errors */
+	if (PageError(page)) {
+		printk(KERN_ERR
+		       "CacheFS: Pointer block %lx had error; discarding\n",
+		       page->index);
+
+		cachefs_page_put(page);
+		super->page_rcy_blk[super->rcy_p_level] = NULL;
+		super->rcy_p_level--;
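+		/* move on to the parent's next pointer, or to the next tree
+		 * in the stack node if this was the top-level pointer block */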
+		if (super->rcy_p_level >= 0) {
+			super->j.rcy_offsets[super->rcy_p_level]++;
+			super->rcy_state = CACHEFS_RCY_PROCESSING_PTRBLK;
+		}
+		else {
+			super->j.rcy_procsp--;
+			super->rcy_state = CACHEFS_RCY_PROCESSING_NODE;
+		}
+		set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+	}
+
+	_leave("");
+
+} /* end cachefs_recycle_loading_ptrblk() */
+
+/*****************************************************************************/
+/*
+ * process the contents of a pointer block that points to other pointer blocks
+ */
+static void cachefs_recycle_processing_ptrblk(struct cachefs_super *super,
+					      struct cachefs_operation *op)
+{
+	cachefs_block_t *ptr, bix;
+	struct page *page;
+	int offset;
+
+	ASSERT(super->rcy_p_level >= 0);
+	ASSERT(super->rcy_p_level < 8);
+
+	page = super->page_rcy_blk[super->rcy_p_level];
+	offset = super->j.rcy_offsets[super->rcy_p_level];
+
+	ASSERT(page);
+
+	_enter("%d/%d,%lx[%d]",
+	       super->rcy_p_level, super->rcy_p_nlevels, page->index, offset);
+
+	ASSERT(offset >= 0);
+	ASSERT(super->rcy_p_level < super->rcy_p_nlevels - 1);
+
+	/* find the next valid pointer to a pointer block if there is one */
+	bix = CACHEFS_NULL_PTR;
+	if (offset < CACHEFS_ONDISC_PTR_PER_BLOCK) {
+		ptr = kmap_atomic(page, KM_USER0);
+
+		while (offset < CACHEFS_ONDISC_PTR_PER_BLOCK) {
+			bix = ptr[offset];
+			if (bix != CACHEFS_NULL_PTR)
+				break;
+			offset++;
+		}
+
+		kunmap_atomic(ptr, KM_USER0);
+	}
+
+	super->j.rcy_offsets[super->rcy_p_level] = offset;
+
+	if (offset >= CACHEFS_ONDISC_PTR_PER_BLOCK) {
+		/* all this block's children have been eaten */
+		ASSERT(bix == CACHEFS_NULL_PTR);
+
+		cachefs_page_put(page);
+		super->page_rcy_blk[super->rcy_p_level] = NULL;
+		super->rcy_state = CACHEFS_RCY_CONSUME_PTRBLK;
+		set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+	}
+	else {
+		/* we've found a child pointer block on which to chew */
+		_debug("load block %d: %x", offset, bix);
+
+		ASSERT(bix != CACHEFS_NULL_PTR);
+		ASSERT(bix >= super->layout->bix_cache);
+		ASSERT(bix < super->j.alloc_unready);
+
+		super->rcy_p_level++;
+		if (super->j.rcy_offsets[super->rcy_p_level] ==
+		    CACHEFS_ONDISC_PTR_PER_BLOCK
+		    ) {
+			/* we've consumed this subtree already, barring the basal
+			 * pointer block
+			 */
+			super->rcy_state = CACHEFS_RCY_CONSUME_PTRBLK;
+			set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+		}
+		else {
+			if (cachefs_recycle_node_read(
+				    super, bix,
+				    &super->page_rcy_blk[super->rcy_p_level]
+				    ) < 0
+			    ) {
+				super->rcy_p_level--;
+				set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+				_leave(" [defer]");
+				return;
+			}
+
+			ASSERT(super->page_rcy_blk[super->rcy_p_level]);
+			super->rcy_state = CACHEFS_RCY_LOADING_PTRBLK;
+		}
+	}
+
+	_leave("");
+
+} /* end cachefs_recycle_processing_ptrblk() */
+
+/*****************************************************************************/
+/*
+ * initialise the allocator operation for pointer reclamation
+ */
+static void cachefs_recycle_init_operation(struct cachefs_super *super,
+					   struct cachefs_operation *op)
+{
+	int ret;
+
+	_enter("st=%u rem=%hd lev=%d/%d",
+	       super->rcy_state, super->rcy_slots_rem,
+	       super->rcy_p_level, super->rcy_p_nlevels);
+
+	ASSERT(super->rcy_slots_rem >= 0);
+
+	switch (op->state) {
+		/* if we're still waiting for the operation to become
+		 * available, then just go back to sleep */
+	case CACHEFS_OP_RESERVING:
+		_leave(" [still waiting]");
+		return;
+
+		/* if the recycling operation has a dearth of recycling slots
+		 * available then we must end it and begin afresh */
+	case CACHEFS_OP_RUNNING:
+		if (super->rcy_slots_rem > 0)
+			break;
+
+		cachefs_operation_end_kcachefsd(op);
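+		/* fall through and restart the operation afresh */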
+
+		/* if the recycling op isn't running then start it */
+	case CACHEFS_OP_INACTIVE:
+		ret = cachefs_operation_begin_kcachefsd(op);
+		switch (ret) {
+		case -EINPROGRESS:
+			_leave(" [waiting on op]");
+			return;
+
+		case -EIO:
+			_leave(" [IO error]");
+			return;
+
+		default:
+			_leave(" [error %d]", ret);
+			return;
+
+		case 0:
+			super->rcy_slots_rem =
+				CACHEFS_ONDISC_FREELIST_PTRSPERNODE - 2;
+			break;
+		}
+		break;
+
+		/* deal with the operation having been aborted due to a disk
+		 * error */
+	case CACHEFS_OP_IO_ERROR:
+		cachefs_operation_end_kcachefsd(op);
+		_leave("I/O error");
+		return;
+
+	default:
+		BUG();
+	}
+
+	/* move on to the actual operation once we've set everything up */
+	switch (super->rcy_state) {
+	case CACHEFS_RCY_CONSUME_POINTERS_I:
+		super->rcy_state = CACHEFS_RCY_CONSUME_POINTERS;
+		set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+		break;
+
+	case CACHEFS_RCY_CONSUME_PTRBLK_I:
+		super->rcy_state = CACHEFS_RCY_CONSUME_PTRBLK;
+		set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+		break;
+
+	case CACHEFS_RCY_CONSUME_NODE_I:
+		super->rcy_state = CACHEFS_RCY_CONSUME_NODE;
+		set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+		break;
+
+	default:
+		BUG();
+		break;
+	}
+
+	_leave("");
+
+} /* end cachefs_recycle_init_operation() */
+
+/*****************************************************************************/
+/*
+ * process the contents of a pointer block that points directly to data blocks
+ */
+static void cachefs_recycle_consume_pointers(struct cachefs_super *super,
+					     struct cachefs_operation *op)
+{
+	cachefs_block_t *ptr, bix;
+	struct page *page;
+	int offset, nrcm;
+
+	page = super->page_rcy_blk[super->rcy_p_level];
+	offset = super->j.rcy_offsets[super->rcy_p_level];
+
+	_enter("%d/%d,%lx[%d]",
+	       super->rcy_p_level, super->rcy_p_nlevels, page->index, offset);
+
+	ASSERT(page);
+	ASSERT(offset >= 0);
+	ASSERT(super->rcy_slots_rem >= 0);
+	ASSERT(super->rcy_p_level == super->rcy_p_nlevels - 1);
+
+	/* make sure there's some reclamation space available in the
+	 * operation */
+	if (super->rcy_slots_rem <= 0) {
+		super->rcy_state = CACHEFS_RCY_CONSUME_POINTERS_I;
+		set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+		_leave(" [reinit op]");
+		return;
+	}
+
+	/* find the next valid pointer to a data block if there is one */
+	if (offset < CACHEFS_ONDISC_PTR_PER_BLOCK) {
+		ptr = kmap_atomic(page, KM_USER0);
+
+		while (offset < CACHEFS_ONDISC_PTR_PER_BLOCK) {
+			if (ptr[offset] != CACHEFS_NULL_PTR)
+				break;
+			offset++;
+		}
+
+		kunmap_atomic(ptr, KM_USER0);
+	}
+
+	super->j.rcy_offsets[super->rcy_p_level] = offset;
+
+	/* if there's anything to consume, pin the journal whilst we eat as
+	 * many valid pointers as we can */
+	if (offset < CACHEFS_ONDISC_PTR_PER_BLOCK) {
+		down_write(&super->tree_wander_sem);
+
+		nrcm = 0;
+		op->m_rcm = 0;
+
+		while (offset < CACHEFS_ONDISC_PTR_PER_BLOCK) {
+			ptr = kmap_atomic(page, KM_USER0);
+
+			do {
+				bix = ptr[offset];
+				if (bix != CACHEFS_NULL_PTR)
+					break;
+			} while (offset++,
+				 offset < CACHEFS_ONDISC_PTR_PER_BLOCK);
+
+			kunmap_atomic(ptr, KM_USER0);
+
+			if (offset >= CACHEFS_ONDISC_PTR_PER_BLOCK)
+				break;
+
+			/* transfer blocks until there's an error or we fill a
+			 * reclamation node */
+			_alter(super, "rcy %x", bix);
+
+			op->bix_rcm[op->m_rcm++] = bix;
+			if (op->m_rcm >= ARRAY_SIZE(op->bix_rcm)) {
+				nrcm += op->m_rcm;
+				cachefs_allocator(op);
+				op->m_rcm = 0;
+			}
+
+			offset++;
+			if (super->rcy_slots_rem <= 0)
+				break; /* we filled this op's recycling
+					* quota */
+		}
+
+		if (op->m_rcm > 0) {
+			nrcm += op->m_rcm;
+			cachefs_allocator(op);
+			op->m_rcm = 0;
+		}
+
+		/* update the journalled tracking */
+		spin_lock(&super->alloc_lock);
+		super->j.space_rcy -= nrcm;
+		spin_unlock(&super->alloc_lock);
+
+		super->j.rcy_offsets[super->rcy_p_level] = offset;
+		up_write(&super->tree_wander_sem);
+	}
+
+	/* eat the pointer block itself once we've eaten all its children */
+	if (offset >= CACHEFS_ONDISC_PTR_PER_BLOCK) {
+		cachefs_page_put(page);
+		super->page_rcy_blk[super->rcy_p_level] = NULL;
+		if (super->rcy_slots_rem <= 0)
+			super->rcy_state = CACHEFS_RCY_CONSUME_PTRBLK_I;
+		else
+			super->rcy_state = CACHEFS_RCY_CONSUME_PTRBLK;
+	}
+	/* re-init the op if necessary prior to consuming more pointers */
+	else if (super->rcy_slots_rem <= 0) {
+		super->rcy_state = CACHEFS_RCY_CONSUME_POINTERS_I;
+	}
+
+	set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+	_leave("");
+
+} /* end cachefs_recycle_consume_pointers() */
+
+/*****************************************************************************/
+/*
+ * dispose of a pointer block that we've finished consuming
+ */
+static void cachefs_recycle_consume_ptrblk(struct cachefs_super *super,
+					   struct cachefs_operation *op)
+{
+	int plevel;
+
+	_enter("%d/%d", super->rcy_p_level, super->rcy_p_nlevels);
+
+	ASSERT(!super->page_rcy_blk[super->rcy_p_level]);
+	ASSERT(super->j.rcy_offsets[super->rcy_p_level] ==
+	       CACHEFS_ONDISC_PTR_PER_BLOCK);
+	ASSERT(super->rcy_slots_rem >= 0);
+
+	/* make sure there's some reclamation space available in the
+	 * operation */
+	if (super->rcy_slots_rem <= 0) {
+		super->rcy_state = CACHEFS_RCY_CONSUME_PTRBLK_I;
+		set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+		_leave(" [reinit op]");
+		return;
+	}
+
+	down_read(&super->tree_wander_sem);
+
+	plevel = super->rcy_p_level - 1;
+
+	if (plevel >= 0) {
+		/* the parent of this block is a pointer block */
+		cachefs_block_t *ptr;
+
+		ptr = kmap_atomic(super->page_rcy_blk[plevel], KM_USER0);
+		op->bix_rcm[0] = ptr[super->j.rcy_offsets[plevel]];
+		kunmap_atomic(ptr, KM_USER0);
+
+		_alter(super, "rcy ptrblk %x", op->bix_rcm[0]);
+
+		super->j.rcy_offsets[plevel + 1] = 0;
+		super->j.rcy_offsets[plevel]++;
+		super->rcy_p_level = plevel;
+		super->rcy_state = CACHEFS_RCY_PROCESSING_PTRBLK;
+		set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+	}
+	else {
+		/* the parent of this block is the current recycling node */
+		struct cachefs_ondisc_recycle_node *node;
+
+		node = kmap_atomic(super->page_rcy_proc, KM_USER0);
+		op->bix_rcm[0] = node->trees[super->j.rcy_procsp].dataptr;
+		kunmap_atomic(node, KM_USER0);
+
+		_alter(super, "rcy ptr root %x", op->bix_rcm[0]);
+
+		super->j.rcy_offsets[0] = 0;
+		super->j.rcy_procsp--;
+		super->rcy_p_level = plevel;
+		super->rcy_state = CACHEFS_RCY_PROCESSING_NODE;
+		set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+	}
+
+	op->m_rcm = 1;
+	cachefs_allocator(op);
+
+	spin_lock(&super->alloc_lock);
+	super->j.space_rcy--;
+	spin_unlock(&super->alloc_lock);
+
+	up_read(&super->tree_wander_sem);
+	_leave("");
+
+} /* end cachefs_recycle_consume_ptrblk() */
+
+/*****************************************************************************/
+/*
+ * dispose of a recycling node that we've finished consuming
+ */
+static void cachefs_recycle_consume_node(struct cachefs_super *super,
+					 struct cachefs_operation *op)
+{
+	struct cachefs_ondisc_recycle_node *node;
+	cachefs_block_t nextbix;
+
+	_enter("%x", super->j.rcy_processor);
+
+	ASSERT(super->j.rcy_procsp == -1);
+	ASSERT(super->rcy_p_level == -1);
+	ASSERT(super->rcy_p_nlevels == (short) INT_MIN);
+	ASSERT(super->page_rcy_proc);
+	ASSERT(super->rcy_slots_rem >= 0);
+
+	/* make sure there's some reclamation space available in the
+	 * operation */
+	if (super->rcy_slots_rem <= 0) {
+		super->rcy_state = CACHEFS_RCY_CONSUME_NODE_I;
+		set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+		_leave(" [reinit op]");
+		return;
+	}
+
+	/* read the pointer to the next recycling node */
+	node = kmap_atomic(super->page_rcy_proc, KM_USER0);
+	nextbix = node->next;
+	kunmap_atomic(node, KM_USER0);
+
+	/* attempt to release this node */
+	down_read(&super->tree_wander_sem);
+
+	op->bix_rcm[0] = super->j.rcy_processor;
+
+	_alter(super, "rcy node %x", op->bix_rcm[0]);
+
+	op->m_rcm = 1;
+	cachefs_allocator(op);
+
+	spin_lock(&super->alloc_lock);
+	super->j.space_rcy--;
+	spin_unlock(&super->alloc_lock);
+
+	cachefs_page_put(super->page_rcy_proc);
+	super->page_rcy_proc = NULL;
+
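+	/* advance to the next node on the recycling stack, if there is one;
+	 * otherwise the recycler goes quiescent until more work turns up */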
+	super->j.rcy_processor = nextbix;
+	if (nextbix) {
+		super->j.rcy_procsp = CACHEFS_ONDISC_RCYSTK_TREESPERNODE;
+		set_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags);
+	}
+
+	super->rcy_state = CACHEFS_RCY_INACTIVE;
+
+	up_read(&super->tree_wander_sem);
+
+	/* end the operation if it's not needed just at the moment */
+	if (!test_bit(CACHEFS_SUPER_DO_RECYCLE, &super->flags)) {
+		super->rcy_slots_rem = 0;
+		cachefs_operation_end_kcachefsd(op);
+	}
+
+	_leave("");
+
+} /* end cachefs_recycle_consume_node() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/rootdir.c linux-2.6.14-mm2-cachefs/fs/cachefs/rootdir.c
--- linux-2.6.14-mm2/fs/cachefs/rootdir.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/rootdir.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,146 @@
+/* rootdir.c: general cache filesystem root directory handling code
+ *
+ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/ctype.h>
+#include <linux/circ_buf.h>
+#include <asm/uaccess.h>
+#include "cachefs-int.h"
+#include "cachefs-inode.h"
+
+static int cachefs_root_readdir(struct file *file, void *dirent,
+				filldir_t filldir);
+
+static struct dentry *cachefs_root_lookup(struct inode *dir,
+					  struct dentry *dentry,
+					  struct nameidata *nd);
+
+struct file_operations cachefs_root_file_operations = {
+	.readdir	= cachefs_root_readdir
+};
+
+struct inode_operations cachefs_root_inode_operations = {
+	.lookup		= cachefs_root_lookup
+};
+
+/*****************************************************************************/
+/*
+ * read the cache's root directory
+ */
+static int cachefs_root_readdir(struct file *file,
+				void *cookie,
+				filldir_t filldir)
+{
+	struct cachefs_inode *inode;
+	int ret;
+
+	inode = CACHEFS_FS_I(file->f_dentry->d_inode);
+
+	_enter("{%Ld,{%lu}}", file->f_pos, inode->vfs_inode.i_ino);
+
+	/* do the usual . and .. (allowing for gcc-2.96 not supporting switch
+	 * on long long well) */
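+	/* note: each case deliberately falls through to emit the next entry */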
+	switch (file->f_pos < INT_MAX ? (int) file->f_pos : INT_MAX) {
+	case 0:
+		ret = filldir(cookie, ".", 1, file->f_pos,
+			      inode->vfs_inode.i_ino, DT_DIR);
+		if (ret < 0)
+			goto done;
+		file->f_pos++;
+	case 1:
+		ret = filldir(cookie, "..", 2, file->f_pos,
+			      parent_ino(file->f_dentry), DT_DIR);
+		if (ret < 0)
+			goto done;
+		file->f_pos++;
+	case 2:
+		ret = filldir(cookie, "status", 6,
+			      file->f_pos, CACHEFS_INO_STATUS, DT_REG);
+		if (ret < 0)
+			goto done;
+		file->f_pos++;
+
+	default:
+		break;
+	}
+
+ done:
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_root_readdir() */
+
+/*****************************************************************************/
+/*
+ * look up an entry in the cache's root directory
+ */
+static struct dentry *cachefs_root_lookup(struct inode *_dir,
+					  struct dentry *dentry,
+					  struct nameidata *nd)
+{
+	struct cachefs_inode *dir, *target;
+	const char *name;
+	ino_t ino;
+
+	dir = CACHEFS_FS_I(_dir);
+	name = dentry->d_name.name;
+
+	_enter("{%lu},{%s}", dir->vfs_inode.i_ino, name);
+
+	/* expose certain virtual files */
+	switch (dentry->d_name.len) {
+	case 1:
+		if (memcmp(name, ".", 1) == 0) {
+			target = cachefs_igrab(dir);
+			goto instantiate;
+		}
+		break;
+	case 2:
+		if (memcmp(name, "..", 2)==0) {
+			target = cachefs_igrab(dir);
+			goto instantiate;
+		}
+		break;
+	case 6:
+		if (memcmp(name, "status", 6) == 0) {
+			ino = CACHEFS_INO_STATUS;
+			goto get;
+		}
+		break;
+	default:
+		break;
+	}
+
+	_leave(" = -ENOENT");
+	return ERR_PTR(-ENOENT);
+
+	/* get the inode */
+ get:
+	target = cachefs_iget(dir->vfs_inode.i_sb->s_fs_info, ino);
+	if (IS_ERR(target)) {
+		_leave(" = %ld", PTR_ERR(target));
+		return ERR_PTR(PTR_ERR(target));
+	}
+
+	/* instantiate the dentry */
+ instantiate:
+	d_add(dentry, &target->vfs_inode);
+	_leave(" = NULL");
+	return NULL;
+
+} /* end cachefs_root_lookup() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/status.c linux-2.6.14-mm2-cachefs/fs/cachefs/status.c
--- linux-2.6.14-mm2/fs/cachefs/status.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/status.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,234 @@
+/* status.c: status virtual file implementation
+ *
+ * Copyright (C) 2003 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/seq_file.h>
+#include <asm/uaccess.h>
+#include "cachefs-int.h"
+
+static int cachefs_status_open(struct inode *inode, struct file *file);
+static void *cachefs_status_start(struct seq_file *p, loff_t *pos);
+static void *cachefs_status_next(struct seq_file *p, void *v, loff_t *pos);
+static void cachefs_status_stop(struct seq_file *p, void *v);
+static int cachefs_status_show(struct seq_file *m, void *v);
+
+static struct seq_operations cachefs_status_ops = {
+	.start		= cachefs_status_start,
+	.next		= cachefs_status_next,
+	.stop		= cachefs_status_stop,
+	.show		= cachefs_status_show,
+};
+
+struct inode_operations cachefs_status_inode_operations = {
+};
+
+struct file_operations cachefs_status_file_operations = {
+	.open		= cachefs_status_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static const char *cachefs_recycle_operation_names[CACHEFS_RCY__NSTATES] = {
+	[CACHEFS_RCY_INACTIVE]		= "Inactive",
+	[CACHEFS_RCY_LOADING_NODE]	= "LoadingNode",
+	[CACHEFS_RCY_PROCESSING_NODE]	= "ProcessingNode",
+	[CACHEFS_RCY_LOADING_PTRBLK]	= "LoadingPtrBlk",
+	[CACHEFS_RCY_PROCESSING_PTRBLK]	= "ProcessingPtrBlk",
+	[CACHEFS_RCY_CONSUME_POINTERS]	= "ConsumePointers",
+	[CACHEFS_RCY_CONSUME_PTRBLK]	= "ConsumePtrBlk",
+	[CACHEFS_RCY_CONSUME_NODE]	= "ConsumeNode",
+};
+
+static const char *cachefs_scan_operation_names[CACHEFS_SCAN__NSTATES] = {
+	[CACHEFS_SCAN_INACTIVE]			= "Inactive",
+	[CACHEFS_SCAN_LOADING_REAP_LIST]	= "LoadingReapList",
+	[CACHEFS_SCAN_DESCENDING]		= "Descending",
+	[CACHEFS_SCAN_VALIDATING_NODE]		= "ValidatingNode",
+	[CACHEFS_SCAN_SCANNING_NODE]		= "ScanningNode",
+	[CACHEFS_SCAN_ASCENDING]		= "Ascending",
+	[CACHEFS_SCAN_COMPLETING_SCAN]		= "CompletingScan",
+	[CACHEFS_SCAN_COMPLETING_REAP]		= "CompletingReap",
+	[CACHEFS_SCAN_ADVANCING_REAP_LIST]	= "AdvancingReapList",
+	[CACHEFS_SCAN_FINISHED]			= "Finished",
+	[CACHEFS_SCAN_REAPING_OBJECT]		= "ReapingObject",
+	[CACHEFS_SCAN_WAITING_FOR_REAPER]	= "WaitingForReaper",
+};
+
+/*****************************************************************************/
+/*
+ * open a status file
+ */
+static int cachefs_status_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *m;
+	int ret;
+
+	ret = seq_open(file, &cachefs_status_ops);
+	if (ret<0)
+		return ret;
+
+	m = file->private_data;
+	m->private = inode->i_sb->s_fs_info;
+
+	return 0;
+
+} /* end cachefs_status_open() */
+
+/*****************************************************************************/
+/*
+ * set up the iterator to start with the first status item
+ */
+static void *cachefs_status_start(struct seq_file *p, loff_t *pos)
+{
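+	/* the whole status report is produced by a single show() call, so
+	 * there is only ever one iterator position */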
+	return *pos ? NULL : (void *) 1;
+
+} /* end cachefs_status_start() */
+
+/*****************************************************************************/
+/*
+ * next status block in list
+ */
+static void *cachefs_status_next(struct seq_file *p, void *v, loff_t *pos)
+{
+	(*pos)++;
+
+	return NULL;
+
+} /* end cachefs_status_next() */
+
+/*****************************************************************************/
+/*
+ * stop reading
+ */
+static void cachefs_status_stop(struct seq_file *p, void *v)
+{
+} /* end cachefs_status_stop() */
+
+/*****************************************************************************/
+/*
+ * show the status
+ */
+static int cachefs_status_show(struct seq_file *m, void *v)
+{
+	struct cachefs_super *super = m->private;
+
+	seq_puts(m, "CacheFS (c) Red Hat, Inc. 2005\n");
+	seq_puts(m, "\n");
+
+	seq_printf(m, "journal     : %x-%x (%u byte slots, %u per page)\n",
+		   super->layout->bix_journal,
+		   super->layout->bix_cache - 1,
+		   super->layout->jnl_rsize,
+		   super->layout->jnl_recperblk);
+
+	seq_printf(m, "cache       : %x-%x [%x-%x unready]\n",
+		   super->layout->bix_cache,
+		   super->layout->bix_end - 1,
+		   super->j.alloc_unready,
+		   super->layout->bix_end);
+
+	seq_puts(m, "\n");
+
+	seq_printf(m, "trans       : %x [%u]\n",
+		   super->jnl_serial,
+		   super->jnl_serial & CACHEFS_ONDISC_JNL_SLOT_MASK);
+
+	seq_printf(m, "root        : %x\n",
+		   super->j.tree_root);
+
+	seq_puts(m, "\n");
+
+	seq_printf(m, "Op SpcResv  : %x\n", super->space_inprogress);
+	seq_printf(m, "Op RcmResv  : %x/%x",
+		   super->space_rcmstk_resv, super->space_rcmstk_resv_max);
+
+	seq_puts(m, "\n");
+
+	seq_printf(m, "Spc Alloc   : %x\n", super->j.space_alloc);
+	seq_printf(m, "Spc Rcm     : %x\n", super->j.space_rcm);
+	seq_printf(m, "Spc Rcy     : %x\n", super->j.space_rcy);
+	seq_printf(m, "Spc Nodes   : %x\n", super->j.space_alrcm_nodes);
+	seq_printf(m, "Spc Meta    : %x\n", super->j.space_meta);
+	seq_printf(m, "Spc D Used  : %x\n", super->j.space_data_used);
+	seq_printf(m, "Spc D Pin   : %x\n", super->j.space_data_pinned);
+	seq_printf(m, "Spc D Resv  : %x\n", super->j.space_data_reserved);
+	seq_printf(m, "Spc D RUsed : %x\n", super->j.space_data_rsv_data);
+	seq_printf(m, "Spc D RPin  : %x\n", super->j.space_data_rsv_pin);
+
+	seq_puts(m, "\n");
+
+	seq_printf(m, "Alloc Stk   : TOS={%x}+%u NUM=%u\n",
+		   super->j.alloc_pfree,
+		   super->j.alloc_pfree_pt,
+		   super->j.alloc_pfree_n);
+
+	seq_printf(m, "Alloc 2nd   : TOS={%x} NUM=%u\n",
+		   super->j.alloc_sfree,
+		   super->j.alloc_sfree_n);
+
+	seq_printf(m, "Reclm Rdy   : TOS={%x} NUM=%u\n",
+		   super->j.rcm_ready,
+		   super->j.rcm_ready_n);
+
+	seq_printf(m, "Reclm Coll  : TOS={%x}+%d\n",
+		   super->j.rcm_collector,
+		   super->j.rcm_coll_pt);
+
+	seq_printf(m, "Recycler    : %s; levels=%d/%d\n",
+		   cachefs_recycle_operation_names[super->rcy_state],
+		   super->rcy_p_level,
+		   super->rcy_p_nlevels);
+
+	seq_printf(m, "Rcy Proc    : TOS={%x}+%hd\n",
+		   super->j.rcy_processor,
+		   super->j.rcy_procsp);
+
+	seq_printf(m, "Rcy Stk     : TOS={%x}\n",
+		   super->j.rcy_stack);
+
+	seq_printf(m, "Rcy Coll    : TOS={%x}+%hd\n",
+		   super->j.rcy_collector,
+		   super->j.rcy_collsp);
+
+	seq_printf(m, "Reap Proc   : TOS={%x} #%hd\n",
+		   super->j.reap_processor,
+		   super->j.reap_proccnt);
+
+	seq_printf(m, "Reap Stk    : TOS={%x}\n",
+		   super->j.reap_stack);
+
+	seq_printf(m, "Reap Coll   : TOS={%x}+%hd\n",
+		   super->j.reap_collector,
+		   super->j.reap_collsp);
+
+	seq_printf(m, "Scan State  : %s %x\n",
+		   cachefs_scan_operation_names[super->scan_state],
+		   super->scan_bix);
+
+	seq_printf(m, "Cull List   : %d/%d\n",
+		   super->scan_nculls,
+		   super->scan_maxculls);
+
+	seq_printf(m, "nObjects    : %u\n",
+		   atomic_read(&super->cnt_objects));
+
+	seq_puts(m, "\n");
+
+	return 0;
+
+} /* end cachefs_status_show() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/super.c linux-2.6.14-mm2-cachefs/fs/cachefs/super.c
--- linux-2.6.14-mm2/fs/cachefs/super.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/super.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,1345 @@
+/* super.c: general cache filesystem superblock code
+ *
+ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+//#define __KENTER
+//#define __KDEBUG
+//#define __KLEAVE
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/vfs.h>
+#include <linux/parser.h>
+#include <linux/buffer_head.h>
+#include <asm/div64.h>
+#include "cachefs-int.h"
+#include "cachefs-inode.h"
+
+#define CACHEFS_FS_MAGIC 0x43414653 /* 'CAFS' */
+
+static void cachefs_i_init_once(void *_inode, kmem_cache_t *cachep,
+				unsigned long flags);
+
+static struct super_block *cachefs_get_sb(struct file_system_type *fs_type,
+					  int flags, const char *dev_name,
+					  void *data);
+
+static struct inode *cachefs_alloc_inode(struct super_block *sb);
+static void cachefs_destroy_inode(struct inode *inode);
+
+static int cachefs_fill_super(struct super_block *sb, void *_data, int silent);
+static int cachefs_initialise_blockdev(struct cachefs_super *super);
+static int cachefs_statfs(struct super_block *sb, struct kstatfs *buf);
+static int cachefs_sync_fs(struct super_block *sb, int wait);
+static void cachefs_write_super(struct super_block *sb);
+static void cachefs_put_super(struct super_block *sb);
+static int cachefs_check_barrier_cap(struct cachefs_super *super);
+static void cachefs_put_inode(struct inode *inode);
+static void cachefs_delete_inode(struct inode *inode);
+
+static struct file_system_type cachefs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "cachefs",
+	.get_sb		= cachefs_get_sb,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+
+static struct super_operations cachefs_super_ops = {
+	.statfs		= cachefs_statfs,
+	.alloc_inode	= cachefs_alloc_inode,
+	.write_inode	= cachefs_write_inode,
+	.put_inode	= cachefs_put_inode,
+	.sync_fs	= cachefs_sync_fs,
+	.drop_inode	= generic_delete_inode,
+	.delete_inode	= cachefs_delete_inode,
+	.destroy_inode	= cachefs_destroy_inode,
+	.clear_inode	= cachefs_clear_inode,
+	.write_super	= cachefs_write_super,
+	.put_super	= cachefs_put_super,
+};
+
+static kmem_cache_t *cachefs_inode_cachep;
+
+enum {
+	CACHEFS_OPT_AUTODEL,
+	CACHEFS_OPT_TAG,
+	CACHEFS_OPT_WANDER,
+	CACHEFS_OPT_MAXCULLS,
+	CACHEFS_OPT_NOSCAN,
+	CACHEFS_OPT__END
+};
+
+static match_table_t cachefs_opt_tokens = {
+	{ CACHEFS_OPT_AUTODEL,		"autodel"	},
+	{ CACHEFS_OPT_MAXCULLS,		"maxculls=%u"	},
+	{ CACHEFS_OPT_NOSCAN,		"noscan"	},
+	{ CACHEFS_OPT_TAG,		"tag=%s"	},
+	{ CACHEFS_OPT_WANDER,		"wander=%u"	},
+	{ CACHEFS_OPT__END,		NULL		},
+};
+
+/*****************************************************************************/
+/*
+ * initialise the cache filesystem
+ */
+int __init cachefs_fs_init(void)
+{
+	int ret;
+
+	_enter("");
+
+	/* create ourselves an inode cache */
+	ret = -ENOMEM;
+	cachefs_inode_cachep = kmem_cache_create("cachefs_inode_cache",
+						 sizeof(struct cachefs_inode),
+						 0,
+						 SLAB_HWCACHE_ALIGN,
+						 cachefs_i_init_once,
+						 NULL);
+	if (!cachefs_inode_cachep) {
+		printk(KERN_NOTICE
+		       "CacheFS: Failed to allocate inode cache\n");
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	/* now export our filesystem to lesser mortals */
+	ret = register_filesystem(&cachefs_fs_type);
+	if (ret<0) {
+		kmem_cache_destroy(cachefs_inode_cachep);
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_fs_init() */
+
+/*****************************************************************************/
+/*
+ * clean up the filesystem
+ */
+void __exit cachefs_fs_exit(void)
+{
+	_enter("");
+
+	unregister_filesystem(&cachefs_fs_type);
+
+	/* destroy our private inode cache */
+	kmem_cache_destroy(cachefs_inode_cachep);
+
+	_leave("");
+
+} /* end cachefs_fs_exit() */
+
+/*****************************************************************************/
+/*
+ * get a cachefs superblock
+ */
+static struct super_block *cachefs_get_sb(struct file_system_type *fs_type,
+					  int flags,
+					  const char *dev_name,
+					  void *options)
+{
+	struct super_block *sb;
+
+	_enter(",,%s,%p", dev_name, options);
+
+	/* allocate a device superblock */
+	sb = get_sb_bdev(fs_type, flags, dev_name, options,
+			 cachefs_fill_super);
+
+	_leave(" = %p", sb);
+	return sb;
+
+} /* end cachefs_get_sb() */
+
+/*****************************************************************************/
+/*
+ * BIO operation completed
+ */
+static int cachefs_bio_completion(struct bio *bio, unsigned int bytes_done,
+				  int error)
+{
+	unsigned short loop;
+
+	_enter("%p{%u},%u,%d", bio, bio->bi_size, bytes_done, error);
+
+	/* we're only interested in completion */
+	if (bio->bi_size > 0) {
+		_leave(" = 1");
+		return 1;
+	}
+
+	/* mark all the pages appropriately and unlock */
+	for (loop = 0; loop < bio->bi_vcnt; loop++) {
+		struct page *page = bio->bi_io_vec[loop].bv_page;
+
+		if (PageLocked(page)) {
+			if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+				SetPageUptodate(page);
+			else
+				SetPageError(page);
+			unlock_page(page);
+		}
+	}
+
+	bio_put(bio);
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_bio_completion() */
+
+/*****************************************************************************/
+/*
+ * submit a read or a write for the page count times starting at the specified
+ * block offset
+ */
+static int cachefs_bio_submit(struct super_block *sb, struct page *page,
+			      unsigned bix, size_t *count, int rw)
+{
+	struct bio *bio;
+	size_t loop;
+
+	if (*count > BIO_MAX_PAGES)
+		*count = BIO_MAX_PAGES;
+
+	/* allocate and initialise a BIO */
+	bio = bio_alloc(GFP_NOFS, *count);
+	if (!bio)
+		return -ENOMEM;
+
+	kenter("{bdev=%p},%p,%u,%u,%d", sb->s_bdev, page, bix, *count, rw);
+
+	SetPageLocked(page);
+
+	bio->bi_sector	= bix * (PAGE_SIZE >> 9);
+	bio->bi_bdev	= sb->s_bdev;
+	bio->bi_end_io	= cachefs_bio_completion;
+	bio->bi_private	= NULL;
+
+	/* we may send the page to several blocks */
+	for (loop = 0; loop < *count; loop++)
+		if (!bio_add_page(bio, page, PAGE_SIZE, 0))
+			break;
+	*count = loop;
+
+	/* send to disc */
+	submit_bio(rw, bio);
+
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_bio_submit() */
+
+/*****************************************************************************/
+/*
+ * parse the mount options provided
+ */
+static int cachefs_parse_options(struct cachefs_super *super, char *options,
+				 char **_tagname)
+{
+	char *p;
+
+	if (!options)
+		return 0;
+
+	while ((p = strsep(&options, ","))) {
+		substring_t args[MAX_OPT_ARGS];
+		unsigned tmp;
+		int token;
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, cachefs_opt_tokens, args);
+		switch (token) {
+		case CACHEFS_OPT_AUTODEL:
+			__set_bit(CACHEFS_SUPER_AUTO_DELETE, &super->options);
+			break;
+
+		case CACHEFS_OPT_NOSCAN:
+			__set_bit(CACHEFS_SUPER_NOSCAN, &super->options);
+			break;
+
+		case CACHEFS_OPT_TAG:
+			*_tagname = args[0].from;
+			break;
+
+		case CACHEFS_OPT_WANDER:
+			tmp = simple_strtoul(args[0].from, NULL, 0);
+			if (tmp > 60 * 60) {
+				printk(KERN_ERR "CacheFS:"
+				       " Autowander timer limit is 1 hour\n");
+				return -EINVAL;
+			}
+
+			super->jnl_timeout = tmp * HZ;
+			break;
+
+		case CACHEFS_OPT_MAXCULLS:
+			tmp = simple_strtoul(args[0].from, NULL, 0);
+			if (tmp < 3 || tmp > 100) {
+				printk(KERN_ERR "CacheFS:"
+				       " Max cull retention count not between 3 and 100\n");
+				return -EINVAL;
+			}
+
+			super->scan_maxculls = tmp;
+			break;
+
+		default:
+			printk(KERN_ERR "CacheFS:"
+			       " Invalid option: \"%s\" or missing value\n",
+			       p);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+
+} /* end cachefs_parse_options() */
+
+/*****************************************************************************/
+/*
+ * fill in the superblock
+ */
+static int cachefs_fill_super(struct super_block *sb, void *_data, int silent)
+{
+	struct cachefs_object *fsdef = NULL;
+	struct cachefs_super *super = NULL;
+	struct cachefs_inode *iroot = NULL, *imeta;
+	struct dentry *root = NULL;
+	unsigned long asflags;
+	struct page *lpage = NULL;
+	char *tagname = NULL;
+	int ret, prepare;
+
+	_enter("");
+
+	/* a read-only cache isn't a lot of use */
+	if (bdev_read_only(sb->s_bdev)) {
+		printk(KERN_ERR "CacheFS: blockdev read-only\n");
+		return -EROFS;
+	}
+
+	if (sb->s_flags & MS_RDONLY) {
+		printk(KERN_ERR "CacheFS: filesystem mounted read-only\n");
+		return -EROFS;
+	}
+
+	/* we want the block size to be at least as big as the size of a
+	 * journal entry */
+	if (!sb_min_blocksize(sb, sizeof(struct cachefs_ondisc_journal))) {
+		printk(KERN_ERR "CacheFS: unable to set blocksize\n");
+		return -EIO;
+	}
+
+	kdebug("CacheFS: blockdev %u,%u", MAJOR(sb->s_dev), MINOR(sb->s_dev));
+	kdebug("CacheFS: blockdev size %LuMb",
+	       i_size_read(sb->s_bdev->bd_inode) / 1024 / 1024);
+
+	/* allocate a superblock info record and extra bits of memory */
+	ret = -ENOMEM;
+	super = kmalloc(sizeof(*super), GFP_KERNEL);
+	if (!super)
+		goto error;
+
+	memset(super, 0, sizeof(*super));
+
+	init_rwsem(&super->tree_wander_sem);
+	init_MUTEX(&super->alloc_load_sem);
+	init_MUTEX(&super->deletion_sem);
+	spin_lock_init(&super->operation_lock);
+	spin_lock_init(&super->alloc_lock);
+	spin_lock_init(&super->objects_lock);
+	init_timer(&super->jnl_timer);
+	super->jnl_timer.function = cachefs_journal_wander_timeout;
+	super->jnl_timer.data = (unsigned long) super;
+	super->jnl_timeout = CACHEFS_DEFAULT_AUTOWANDER_TIMER * HZ;
+	super->scan_maxculls = 20;
+
+	INIT_LIST_HEAD(&super->op_waitq);
+	INIT_LIST_HEAD(&super->op_runq);
+	INIT_LIST_HEAD(&super->alloc_waitq);
+	INIT_LIST_HEAD(&super->rcm_old_pages);
+	atomic_set(&super->cnt_rcmpages, 0);
+
+	INIT_LIST_HEAD(&super->scan_culls);
+	INIT_LIST_HEAD(&super->scan_xculls);
+
+	spin_lock_init(&super->jnl_qlock);
+	INIT_LIST_HEAD(&super->jnl_transq);
+	sema_init(&super->jnl_page_sem, 7);
+
+	init_completion(&super->dmn_alive);
+	init_completion(&super->dmn_dead);
+	init_waitqueue_head(&super->dmn_sleepq);
+
+	init_completion(&super->reaper_alive);
+	init_completion(&super->reaper_dead);
+	init_waitqueue_head(&super->reaper_sleepq);
+	init_waitqueue_head(&super->reaper_waitq);
+
+	ret = cachefs_parse_options(super, _data, &tagname);
+	if (ret < 0)
+		goto error;
+
+	ret = -ENOMEM;
+	fsdef = kmem_cache_alloc(cachefs_object_jar, SLAB_KERNEL);
+	if (!fsdef)
+		goto error;
+
+#if CACHEFS_DEBUG_OBJECT_ACCOUNTING
+	kdebug("- ALLOC ROOT OBJ %p", fsdef);
+#endif
+
+	atomic_set(&fsdef->usage, 1);
+	atomic_set(&fsdef->fscache_usage, 1);
+	fsdef->pobjid = CACHEFS_ONDISC_FSDEF_OBJID;
+	fsdef->objid = CACHEFS_ONDISC_FSDEF_OBJID;
+	atomic_set(&super->cnt_objects, 1);
+	fsdef->flags |= CACHEFS_ONDISC_OBJECT_HAS_CHILDREN;
+	fsdef->flags |= CACHEFS_ONDISC_OBJECT_IS_PINNED;
+	fsdef->type = FSCACHE_COOKIE_TYPE_INDEX;
+
+	_debug("- fsdef %p", fsdef);
+
+	super->jnl_page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0);
+	if (!super->jnl_page)
+		goto error;
+
+	super->metadata_tree = cachefs_tree_alloc(GFP_KERNEL);
+	if (!super->metadata_tree)
+		goto error;
+
+	super->metadata_tree->flags |= (1 << CACHEFS_TREE_EXTANT);
+	super->metadata_tree->type = CACHEFS_TREE_TYPE_NODE;
+	super->metadata_tree->offset = 0;
+
+	super->jnl_current = kmem_cache_alloc(cachefs_journal_jar,
+					      SLAB_KERNEL);
+	if (!super->jnl_current)
+		goto error;
+
+	memset(super->jnl_current, 0, sizeof(*super->jnl_current));
+	super->jnl_current->super = super;
+	INIT_LIST_HEAD(&super->jnl_current->syncwq);
+	atomic_set(&super->jnl_current->remaining, 1);
+
+	list_add_tail(&super->jnl_current->link, &super->jnl_transq);
+
+	/* initialise the superblock */
+	sb->s_magic		= CACHEFS_FS_MAGIC;
+	sb->s_op		= &cachefs_super_ops;
+	sb->s_fs_info		= super;
+	super->sb		= sb;
+	super->sector_size	= bdev_hardsect_size(super->sb->s_bdev);
+
+	fscache_init_cache(&super->cache,
+			   &cachefs_cache_ops,
+			   "%02x:%02x",
+			   MAJOR(sb->s_dev),
+			   MINOR(sb->s_dev));
+
+	/* create the linear-mapping inode */
+	imeta = cachefs_iget(super, CACHEFS_INO_IMETA);
+	if (IS_ERR(imeta)) {
+		ret = PTR_ERR(imeta);
+		goto error;
+	}
+
+	super->imeta = &imeta->vfs_inode;
+
+	/* read the superblock from disc, making sure the page we allocate is
+	 * directly accessible by the kernel so that we don't have to keep
+	 * kmapping it */
+	asflags = super->imeta->i_mapping->flags;
+	super->j.alloc_unready = UINT_MAX;
+	super->imeta->i_mapping->flags = asflags & ~__GFP_HIGHMEM;
+	lpage = NULL;
+	ret = cachefs_page_read(super, 0, 0, &lpage);
+	super->imeta->i_mapping->flags = asflags;
+	if (ret < 0)
+		goto error;
+
+	super->layout = page_address(lpage);
+
+	/* examine the on-disc superblock record */
+	wait_on_page_locked(lpage);
+
+	if (PageError(lpage)) {
+		printk(KERN_ERR "CacheFS: unable to read cache superblock from disc\n");
+		ret = -EIO;
+		goto error;
+	}
+
+	/* consult the magic number to see whether the device is prepared yet */
+	_debug("blockdev magic %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x %04hx",
+	       super->layout->magic[0],
+	       super->layout->magic[1],
+	       super->layout->magic[2],
+	       super->layout->magic[3],
+	       super->layout->magic[4],
+	       super->layout->magic[5],
+	       super->layout->magic[6],
+	       super->layout->magic[7],
+	       super->layout->magic[8],
+	       super->layout->magic[9],
+	       super->layout->endian
+	       );
+
+	if (memcmp(super->layout->magic,
+		   CACHEFS_SUPER_MAGIC,
+		   CACHEFS_SUPER_MAGIC_SIZE) == 0
+	    ) {
+		printk(KERN_NOTICE "CacheFS: Found initialised cache\n");
+		prepare = 0;
+	}
+	else if (memcmp(super->layout->magic,
+			CACHEFS_SUPER_MAGIC_NEEDS_INIT,
+			CACHEFS_SUPER_MAGIC_SIZE) == 0
+		 ) {
+		printk(KERN_NOTICE "CacheFS: Found uninitialised cache\n");
+		prepare = 1;
+	}
+	else {
+		printk(KERN_ERR "CacheFS: Wrong magic number on cache\n");
+		ret = -EINVAL;
+		goto error;
+	}
+
+	/* check for barrier capability on the blockdev */
+	ret = cachefs_check_barrier_cap(super);
+	if (ret < 0)
+		goto error;
+
+	/* set up the recycling state */
+	super->rcy_p_nlevels = (short) INT_MIN;
+	super->rcy_p_level = -1;
+
+	/* start the manager daemon */
+	ret = kernel_thread(kcachefsd, super, 0);
+	if (ret < 0)
+		goto error;
+	wait_for_completion(&super->dmn_alive);
+
+	/* start the reaper daemon */
+	ret = kernel_thread(kreaperd, super, 0);
+	if (ret < 0)
+		goto error;
+	wait_for_completion(&super->reaper_alive);
+
+	/* initialise the cache if necessary */
+	if (prepare) {
+		ret = cachefs_initialise_blockdev(super);
+		if (ret < 0)
+			goto error;
+	}
+
+	/* replay the journal, even if we've just initialised it */
+	ret = cachefs_journal_replay(super);
+	if (ret < 0)
+		goto error;
+
+	super->metadata_tree->bix = super->j.tree_root;
+
+	/* allocate the root inode and dentry */
+	_debug("get root");
+	iroot = cachefs_iget(super, CACHEFS_INO_ROOTDIR);
+	if (IS_ERR(iroot)) {
+		ret = PTR_ERR(iroot);
+		iroot = NULL;
+		goto error;
+	}
+
+	ret = -ENOMEM;
+	root = d_alloc_root(&iroot->vfs_inode);
+	if (!root)
+		goto error;
+
+	iroot = NULL;
+	super->sb->s_root = root;
+
+	ret = fscache_add_cache(&super->cache, &fsdef->fscache, tagname);
+	if (ret < 0)
+		goto error;
+
+	_leave(" = 0 [super=%p]", super);
+	return 0;
+
+ error:
+	kdebug("ERROR %d", ret);
+
+	if (fsdef)
+		cachefs_object_put(fsdef);
+
+	if (super) {
+		unsigned loop;
+
+		if (super->reaper_task) {
+			super->reaper_die = CACHEFS_REAPER_DIE;
+			wake_up(&super->reaper_sleepq);
+			wait_for_completion(&super->reaper_dead);
+		}
+
+		if (super->dmn_task) {
+			super->dmn_die = CACHEFS_DMN_DIE;
+			wake_up(&super->dmn_sleepq);
+			wait_for_completion(&super->dmn_dead);
+		}
+
+		if (super->page_rcm) {
+			wait_on_page_writeback(super->page_rcm);
+			super->page_rcm->mapping = NULL;
+			cachefs_page_put(super->page_rcm);
+			atomic_dec(&super->cnt_rcmpages);
+		}
+
+		spin_lock(&super->alloc_lock);
+
+		while (!list_empty(&super->rcm_old_pages)) {
+			struct page *page =
+				list_entry(super->rcm_old_pages.next,
+					   struct page, lru);
+
+			list_del_init(&page->lru);
+			spin_unlock(&super->alloc_lock);
+
+			wait_on_page_writeback(page);
+			page->mapping = NULL;
+			cachefs_page_put(page);
+			atomic_dec(&super->cnt_rcmpages);
+
+			spin_lock(&super->alloc_lock);
+		}
+
+		spin_unlock(&super->alloc_lock);
+
+		ASSERT(atomic_read(&super->cnt_rcmpages) == 0);
+
+		if (super->page_pfree) {
+			wait_on_page_locked(super->page_pfree);
+			ASSERT(!PageWriteback(super->page_pfree));
+			super->page_pfree->mapping = NULL;
+			cachefs_page_put(super->page_pfree);
+		}
+
+		if (super->page_pfree_nx) {
+			wait_on_page_locked(super->page_pfree_nx);
+			ASSERT(!PageWriteback(super->page_pfree_nx));
+			super->page_pfree_nx->mapping = NULL;
+			cachefs_page_put(super->page_pfree_nx);
+		}
+
+		if (super->page_sfree) {
+			wait_on_page_locked(super->page_sfree);
+			ASSERT(!PageWriteback(super->page_sfree));
+			super->page_sfree->mapping = NULL;
+			cachefs_page_put(super->page_sfree);
+		}
+
+		cachefs_page_put(super->page_rcy);
+		cachefs_page_put(super->page_rcy_proc);
+		cachefs_page_put(super->page_reap);
+		cachefs_page_put(super->page_reap_proc);
+
+		for (loop = 0;
+		     loop < sizeof(super->page_rcy_blk) / sizeof(void *);
+		     loop++)
+			cachefs_page_put(super->page_rcy_blk[loop]);
+
+		list_splice_init(&super->scan_xculls, &super->scan_culls);
+
+		while (!list_empty(&super->scan_culls)) {
+			struct cachefs_object *object;
+
+			object = list_entry(super->scan_culls.next,
+					    struct cachefs_object, cull_link);
+
+			list_del_init(&object->cull_link);
+			cachefs_object_put(object);
+		}
+
+		if (super->scan_reap)
+			cachefs_object_put(super->scan_reap);
+		if (super->scan_tmpobj)
+			cachefs_object_put(super->scan_tmpobj);
+		cachefs_tree_put(super->scan_tmpnode);
+		cachefs_tree_put(super->scan_node);
+
+		if (super->reaper_target)
+			cachefs_object_put(super->reaper_target);
+
+		if (super->jnl_page)
+			__free_page(super->jnl_page);
+	}
+
+	if (lpage) {
+		wait_on_page_locked(lpage);
+		cachefs_page_put(lpage);
+	}
+
+	dput(root);
+	cachefs_iput(iroot);
+
+	if (super) {
+		if (super->jnl_current)
+			kmem_cache_free(cachefs_journal_jar,
+					super->jnl_current);
+		if (super->metadata_tree)
+			kmem_cache_free(cachefs_node_jar,
+					super->metadata_tree);
+
+		if (super->imeta) {
+			_debug("imeta %p{%d}",
+			       super->imeta,
+			       atomic_read(&super->imeta->i_count));
+			iput(super->imeta);
+		}
+		kfree(super);
+	}
+
+	sb->s_root = NULL;
+	sb->s_fs_info = NULL;
+
+	_leave(" = %d", ret);
+	return ret;
+
+} /* end cachefs_fill_super() */
+
+/*****************************************************************************/
+/*
+ * initialise the block device for use as a cache
+ */
+static int cachefs_initialise_blockdev(struct cachefs_super *super)
+{
+	struct cachefs_ondisc_free_node *fnode;
+	struct cachefs_ondisc_journal *jentry;
+	struct cachefs_ondisc_leaf *leaf;
+	cachefs_block_t bix;
+	struct page *page;
+	size_t tmp, qty;
+	loff_t nblocks;
+	void *data;
+	int ret, loop;
+
+	_enter("");
+	set_bit(CACHEFS_SUPER_INIT_BLKDEV, &super->flags);
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+	data = page_address(page);
+
+	/* work out how big the cache is (we use 32-bit block index numbers) */
+	nblocks = i_size_read(super->sb->s_bdev->bd_inode);
+	do_div(nblocks, PAGE_SIZE);
+	if (nblocks > UINT_MAX)
+		nblocks = UINT_MAX;
+	if (nblocks < 256) {
+		printk("CacheFS: cache must be at least 256 pages in size\n");
+		__free_page(page);
+		return -ENOSPC;
+	}
+
+	/* determine the layout */
+	memset(super->layout, 0, PAGE_SIZE);
+	memcpy(super->layout->magic,
+	       CACHEFS_SUPER_MAGIC,
+	       sizeof(super->layout->magic));
+
+	super->layout->endian		= CACHEFS_SUPER_ENDIAN;
+	super->layout->version		= CACHEFS_SUPER_VERSION;
+	super->layout->bsize		= PAGE_SIZE;
+	super->layout->bshift		= PAGE_SHIFT;
+	super->layout->ashift		= 0;
+	super->layout->asize		= (1 << super->layout->ashift);
+	super->layout->leaf_size	= super->sb->s_blocksize;
+	super->layout->leaf_shift	= super->sb->s_blocksize_bits;
+	super->layout->jnl_rsize	= super->sb->s_blocksize;
+	super->layout->jnl_recperblk	=
+		super->layout->bsize / super->layout->jnl_rsize;
+
+	super->layout->pshift		= PAGE_SHIFT - CACHEFS_BLOCK_SHIFT;
+
+	bix = 1;
+
+	/* allocate the update journal */
+	qty = CACHEFS_ONDISC_JNL_NUMBLOCKS;
+
+	super->layout->bix_journal	= bix;
+	bix += qty;
+
+	/* record the start of the cache */
+	super->layout->bix_cache	= bix;
+
+	/* allocate the initial root of the tree */
+	super->j.tree_root		= bix++;
+
+	/* set up a reclamation collector node and a spare */
+	super->j.rcm_collector		= bix++;
+	super->j.rcm_spare		= bix++;
+
+	super->j.alloc_unready		= bix;
+	super->layout->bix_end		= nblocks;
+	super->layout->bix_null		= CACHEFS_NULL_PTR;
+	super->layout->bix_empty	= CACHEFS_EMPTY_PTR;
+
+	printk("CacheFS: block size %u (shift %u)\n",
+	       super->layout->bsize, super->layout->bshift);
+	printk("CacheFS: %u dataptrs/leaf\n", 1 << super->layout->pshift);
+	printk("CacheFS: 00000000 super block\n");
+
+	printk("CacheFS: %08x journal (recsize %zu+%zub)\n",
+	       super->layout->bix_journal,
+	       sizeof(struct cachefs_ondisc_journal),
+	       (size_t) super->layout->jnl_rsize -
+	       sizeof(struct cachefs_ondisc_journal));
+
+	printk("CacheFS: %08x data cache\n", super->layout->bix_cache);
+	printk("CacheFS: %08x indexing tree initial root\n", super->j.tree_root);
+	printk("CacheFS: %08x unready point\n", super->j.alloc_unready);
+	printk("CacheFS: %08x end\n", super->layout->bix_end);
+
+	/* set up the allocation tracking */
+	super->j.alloc_objid	= CACHEFS_ONDISC_FSDEF_OBJID + 1;
+
+	/* initialise the reclaim stack node and its spare with magicked
+	 * blocks */
+	memset(data, 0, PAGE_SIZE);
+	fnode = data;
+	fnode->magic = CACHEFS_ONDISC_FREELIST_PARTIAL;
+
+	tmp = 1;
+	ret = cachefs_bio_submit(super->sb, page, super->j.rcm_collector, &tmp,
+				 WRITE);
+	if (ret < 0) {
+		__free_page(page);
+		return ret;
+	}
+
+	wait_on_page_locked(page);
+	if (PageError(page)) {
+		printk("CacheFS: failed to write initial reclaim collector\n");
+		__free_page(page);
+		return -EIO;
+	}
+
+	tmp = 1;
+	ret = cachefs_bio_submit(super->sb, page, super->j.rcm_spare, &tmp,
+				 WRITE);
+	if (ret < 0) {
+		__free_page(page);
+		return ret;
+	}
+
+	wait_on_page_locked(page);
+	if (PageError(page)) {
+		printk("CacheFS: failed to write initial reclaim spare\n");
+		__free_page(page);
+		return -EIO;
+	}
+
+	/* initialise the tree with one block containing only empty leaves */
+	memset(data, CACHEFS_EMPTY_FILL, PAGE_SIZE);
+	for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK; loop++) {
+		leaf = data + (loop << super->layout->leaf_shift);
+		leaf->type = CACHEFS_ONDISC_OBJTYPE_EMPTY_SLOT;
+	}
+
+	tmp = 1;
+	ret = cachefs_bio_submit(super->sb, page, super->j.tree_root, &tmp,
+				 WRITE);
+	if (ret < 0) {
+		__free_page(page);
+		return ret;
+	}
+
+	wait_on_page_locked(page);
+	if (PageError(page)) {
+		printk("CacheFS: failed to write initial tree root\n");
+		__free_page(page);
+		return -EIO;
+	}
+
+	/* preset the first record in the journal */
+	super->j.rcy_collsp		= 0;
+	super->j.rcy_procsp		= 0;
+	super->j.reap_collsp		= -1;
+	super->j.reap_proccnt		= -1;
+	super->j.space_meta		= 1;
+	super->j.space_alrcm_nodes	= 2;
+	super->j.alloc_pfree_pt		= CACHEFS_ONDISC_FREELIST_PTRSPERNODE;
+
+	memset(data, 0, PAGE_SIZE);
+
+	jentry = data;
+	*jentry = super->j;
+
+	jentry->mark		= CACHEFS_ONDISC_JNL_WANDER;
+	jentry->serial		= 0;
+	jentry->jtime		= CURRENT_TIME.tv_sec;
+
+	tmp = 1;
+	ret = cachefs_bio_submit(super->sb, page, super->layout->bix_journal,
+				 &tmp, WRITE);
+	if (ret < 0) {
+		__free_page(page);
+		return ret;
+	}
+
+	wait_on_page_locked(page);
+	if (PageError(page)) {
+		printk("CacheFS: failed to write initial journal entry\n");
+		__free_page(page);
+		return -EIO;
+	}
+
+	/* clear the rest of the journal */
+	memset(data, 0, PAGE_SIZE);
+
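+	/* the same zeroed page is submitted to runs of consecutive blocks;
+	 * cachefs_bio_submit() caps each pass at BIO_MAX_PAGES, so loop until
+	 * the whole journal area has been written */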
+	bix = super->layout->bix_journal + 1;
+	while (bix < super->layout->bix_cache) {
+		qty = super->layout->bix_cache - bix;
+
+		_debug("clearing blocks %u-%u", bix, bix + qty - 1);
+
+		tmp = qty;
+		ret = cachefs_bio_submit(super->sb, page, bix, &tmp, WRITE);
+		if (ret < 0) {
+			__free_page(page);
+			return ret;
+		}
+
+		wait_on_page_locked(page);
+		if (PageError(page)) {
+			printk("CacheFS: failed to write blocks %zu-%zu\n",
+			       bix, bix + qty - 1);
+			__free_page(page);
+			return -EIO;
+		}
+
+		bix += tmp;
+	}
+
+	__free_page(page);
+
+	/* write the superblock last */
+	_debug("writing superblock");
+	tmp = 1;
+	ret = cachefs_bio_submit(super->sb, virt_to_page(super->layout), 0,
+				 &tmp, super->bio_wr_barrier);
+	if (ret < 0)
+		return ret;
+	wait_on_page_locked(virt_to_page(super->layout));
+
+	clear_bit(CACHEFS_SUPER_INIT_BLKDEV, &super->flags);
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_initialise_blockdev() */
+
+/*****************************************************************************/
+/*
+ * return some stats on the filesystem
+ */
+static int cachefs_statfs(struct super_block *sb, struct kstatfs *buf)
+{
+	struct cachefs_super *super = sb->s_fs_info;
+	unsigned long long tmp;
+
+	buf->f_type	= sb->s_magic;
+	buf->f_bsize	= super->layout->bsize;
+	buf->f_blocks	= super->layout->bix_end;
+	buf->f_bavail	= super->j.space_alloc;
+	buf->f_bavail	+= buf->f_blocks - super->j.alloc_unready;
+	buf->f_bfree	= buf->f_bavail + super->j.space_rcm;
+
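+	/* approximate the file count by the number of leaf slots in the cache */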
+	tmp = super->layout->bix_end - super->layout->bix_cache;
+	tmp <<= PAGE_SIZE - super->sb->s_blocksize_bits;
+	if (tmp > LONG_MAX)
+		tmp = LONG_MAX;
+
+	buf->f_files	= tmp;
+
+	tmp = buf->f_bavail;
+	tmp <<= PAGE_SIZE - super->sb->s_blocksize_bits;
+	if (tmp > LONG_MAX)
+		tmp = LONG_MAX;
+
+	buf->f_ffree	= tmp;
+	buf->f_namelen	= NAME_MAX;
+
+	return 0;
+
+} /* end cachefs_statfs() */
+
+/*****************************************************************************/
+/*
+ * synchronise the filesystem to disc
+ */
+static int cachefs_sync_fs(struct super_block *sb, int wait)
+{
+	struct cachefs_super *super = sb->s_fs_info;
+	int ret;
+
+	_enter(",%d", wait);
+
+	/* wait for the current transaction batch to complete */
+	ret = cachefs_sync(super, wait, 1);
+
+	_leave(" = %d", ret);
+	return ret;
+
+} /* end cachefs_sync_fs() */
+
+/*****************************************************************************/
+/*
+ * write the superblock back to disc
+ */
+static void cachefs_write_super(struct super_block *sb)
+{
+	struct cachefs_super *super = sb->s_fs_info;
+	struct page *page;
+	size_t tmp;
+	void *data;
+	int ret;
+
+	_enter("");
+
+	/* grab a page to write from */
+	page = alloc_page(GFP_KERNEL);
+	if (!page) {
+		printk("CacheFS:"
+		       " unable to write superblock to disc (ENOMEM)\n");
+		return;
+	}
+
+	/* copy the superblock info into it */
+	data = kmap_atomic(page, KM_USER0);
+	memset(data, 0, PAGE_SIZE);
+	memcpy(data, &super->layout, sizeof(super->layout));
+	kunmap_atomic(data, KM_USER0);
+
+	/* write it to disc */
+	tmp = 1;
+	ret = cachefs_bio_submit(super->sb,page, 0, &tmp, WRITE);
+	if (ret < 0) {
+		printk("CacheFS: unable to write superblock to disc (%d)\n",
+		       ret);
+		return;
+	}
+
+	/* and wait for it to complete */
+	wait_on_page_locked(page);
+
+	sb->s_dirt = 0;
+	_leave("");
+
+} /* end cachefs_write_super() */
+
+/*****************************************************************************/
+/*
+ * finish the unmounting process on the superblock
+ */
+static void cachefs_put_super(struct super_block *sb)
+{
+	struct cachefs_super *super = sb->s_fs_info;
+	unsigned loop;
+
+	_enter("{%p}", super);
+
+	ASSERT(super);
+
+	/* detach the cache from all cookies that reference it */
+	fscache_withdraw_cache(&super->cache);
+	super->cache.ops->put_object(super->cache.fsdef);
+
+	ASSERT(atomic_read(&super->cnt_objects) == 0);
+
+	/* kill the reaper */
+	super->reaper_die = CACHEFS_REAPER_DIE;
+	wake_up(&super->reaper_sleepq);
+	wait_for_completion(&super->reaper_dead);
+
+	/* synchronise the update journal */
+	super->dmn_die = CACHEFS_DMN_RETIRING;
+	cachefs_sync(super, 1, 0);
+
+	/* kill the daemon */
+	super->dmn_die = CACHEFS_DMN_DIE;
+	wake_up(&super->dmn_sleepq);
+	wait_for_completion(&super->dmn_dead);
+
+	/* the autowander timer can go */
+	del_timer_sync(&super->jnl_timer);
+
+	/* release all the pages and blocks we have pinned */
+	if (super->page_rcm) {
+		wait_on_page_writeback(super->page_rcm);
+		super->page_rcm->mapping = NULL;
+		cachefs_page_put(super->page_rcm);
+		atomic_dec(&super->cnt_rcmpages);
+	}
+
+	spin_lock(&super->jnl_qlock);
+
+	while (!list_empty(&super->rcm_old_pages)) {
+		struct page *page = list_entry(super->rcm_old_pages.next,
+					       struct page, lru);
+
+		list_del_init(&page->lru);
+		spin_unlock(&super->jnl_qlock);
+
+		wait_on_page_writeback(page);
+		page->mapping = NULL;
+		cachefs_page_put(page);
+		atomic_dec(&super->cnt_rcmpages);
+
+		spin_lock(&super->jnl_qlock);
+	}
+
+	spin_unlock(&super->jnl_qlock);
+
+	ASSERT(atomic_read(&super->cnt_rcmpages) == 0);
+
+	if (super->page_pfree) {
+		wait_on_page_locked(super->page_pfree);
+		ASSERT(!PageWriteback(super->page_pfree));
+		super->page_pfree->mapping = NULL;
+		cachefs_page_put(super->page_pfree);
+	}
+
+	if (super->page_pfree_nx) {
+		wait_on_page_locked(super->page_pfree_nx);
+		ASSERT(!PageWriteback(super->page_pfree_nx));
+		super->page_pfree_nx->mapping = NULL;
+		cachefs_page_put(super->page_pfree_nx);
+	}
+
+	if (super->page_sfree) {
+		wait_on_page_locked(super->page_sfree);
+		ASSERT(!PageWriteback(super->page_sfree));
+		super->page_sfree->mapping = NULL;
+		cachefs_page_put(super->page_sfree);
+	}
+
+	cachefs_page_put(super->page_rcy);
+	cachefs_page_put(super->page_rcy_proc);
+	cachefs_page_put(super->page_reap);
+	cachefs_page_put(super->page_reap_proc);
+
+	for (loop = 0;
+	     loop < sizeof(super->page_rcy_blk) / sizeof(void *);
+	     loop++)
+		cachefs_page_put(super->page_rcy_blk[loop]);
+
+	/* release the objects in the cull list */
+	list_splice_init(&super->scan_xculls, &super->scan_culls);
+
+	while (!list_empty(&super->scan_culls)) {
+		struct cachefs_object *object;
+
+		object = list_entry(super->scan_culls.next,
+				    struct cachefs_object, cull_link);
+
+		list_del_init(&object->cull_link);
+
+		_debug("free cull %p{%llx,%d}",
+		       object, object->objid, atomic_read(&object->usage));
+		cachefs_object_put(object);
+	}
+
+	if (super->scan_reap)
+		cachefs_object_put(super->scan_reap);
+	if (super->scan_tmpobj)
+		cachefs_object_put(super->scan_tmpobj);
+	cachefs_tree_put(super->scan_tmpnode);
+	cachefs_tree_put(super->scan_node);
+
+	if (super->reaper_target)
+		cachefs_object_put(super->reaper_target);
+
+	__free_page(super->jnl_page);
+
+	kmem_cache_free(cachefs_journal_jar, super->jnl_current);
+
+	cachefs_tree_put(super->metadata_tree);
+	cachefs_page_put(virt_to_page(super->layout));
+
+	iput(super->imeta);
+
+	/* done */
+	kfree(super);
+	_leave("");
+
+} /* end cachefs_put_super() */
+
+/*****************************************************************************/
+/*
+ * initialise an inode cache slab element prior to any use
+ */
+static void cachefs_i_init_once(void *_inode, kmem_cache_t *cachep,
+				unsigned long flags)
+{
+	struct cachefs_inode *inode = _inode;
+
+	_enter("%p,,1", _inode);
+
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR
+	    ) {
+		memset(inode, 0, sizeof(*inode));
+		inode_init_once(&inode->vfs_inode);
+	}
+
+} /* end cachefs_i_init_once() */
+
+/*****************************************************************************/
+/*
+ * allocate an inode struct from our slab cache
+ */
+static struct inode *cachefs_alloc_inode(struct super_block *sb)
+{
+	struct cachefs_inode *inode;
+
+	inode = (struct cachefs_inode *)
+		kmem_cache_alloc(cachefs_inode_cachep, SLAB_KERNEL);
+	if (!inode)
+		return NULL;
+
+	_leave(" = %p", &inode->vfs_inode);
+	return &inode->vfs_inode;
+
+} /* end cachefs_alloc_inode() */
+
+/*****************************************************************************/
+/*
+ * catch an inode ref being released
+ */
+static void cachefs_put_inode(struct inode *inode)
+{
+	_enter("%p{%d}", inode, atomic_read(&inode->i_count));
+
+} /* end cachefs_put_inode() */
+
+/*****************************************************************************/
+/*
+ * catch an inode being torn down
+ */
+static void cachefs_delete_inode(struct inode *inode)
+{
+	_enter("{%lx,%lu}", inode->i_ino, inode->i_mapping->nrpages);
+
+	truncate_inode_pages(&inode->i_data, 0);
+
+	_debug("clearing");
+
+	clear_inode(inode);
+
+	_leave("");
+
+} /* end cachefs_delete_inode() */
+
+/*****************************************************************************/
+/*
+ * destroy a cachefs inode struct
+ */
+static void cachefs_destroy_inode(struct inode *inode)
+{
+	_enter("{%lu}", inode->i_ino);
+	kmem_cache_free(cachefs_inode_cachep, CACHEFS_FS_I(inode));
+
+} /* end cachefs_destroy_inode() */
+
+/*****************************************************************************/
+/*
+ * barrier capability check completion
+ */
+static int cachefs_barrier_cap_checked(struct bio *bio,
+				       unsigned int bytes_done,
+				       int error)
+{
+	_enter("%p{%lx},%u,%d", bio, bio->bi_flags, bytes_done, error);
+
+	/* we're only interested in completion */
+	if (bio->bi_size > 0) {
+		_leave(" = 1");
+		return 1;
+	}
+
+	*(int *) bio->bi_private = error;
+	end_page_writeback(bio->bi_io_vec[0].bv_page);
+
+	bio_put(bio);
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_barrier_cap_checked() */
+
+/*****************************************************************************/
+/*
+ * determine whether the block device supports barriers
+ * - need barriers to be able to journal properly
+ */
+static int cachefs_check_barrier_cap(struct cachefs_super *super)
+{
+	struct page *superpage;
+	struct bio *bio;
+	int ret;
+
+	super->bio_wr_barrier = WRITE_BARRIER;
+
+	/* attempt a barriered write on the superblock */
+	superpage = virt_to_page(super->layout);
+	wait_on_page_writeback(superpage);
+	SetPageWriteback(superpage);
+
+	ret = -ENOMEM;
+	bio = bio_alloc(GFP_KERNEL, 1);
+	if (!bio)
+		goto error;
+
+	bio->bi_bdev	= super->sb->s_bdev;
+	bio->bi_private	= &ret;
+	bio->bi_end_io	= cachefs_barrier_cap_checked;
+	bio->bi_sector	= 0;
+
+	if (!bio_add_page(bio, superpage, PAGE_SIZE, 0))
+		BUG();
+
+	/* and send to disc */
+	//dump_bio(bio,1);
+	ret = 0;
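+	/* the completion handler writes any I/O error back through bi_private,
+	 * so ret below reflects the outcome of the barriered write */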
+	submit_bio(super->bio_wr_barrier, bio);
+	wait_on_page_writeback(superpage);
+
+	if (ret < 0) {
+		if (ret == -EOPNOTSUPP) {
+			/* it appears barriers are not supported */
+			printk(KERN_WARNING "CacheFS:"
+			       " The blockdev does not support barriers,"
+			       " so the journal may not be reliable\n");
+			super->bio_wr_barrier = WRITE;
+			ret = 0;
+		}
+	}
+
+ error:
+	return ret;
+
+} /* end cachefs_check_barrier_cap() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/tree-cull.c linux-2.6.14-mm2-cachefs/fs/cachefs/tree-cull.c
--- linux-2.6.14-mm2/fs/cachefs/tree-cull.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/tree-cull.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,19 @@
+/* tree-cull.c: tree culling algorithm
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define __KENTER
+#define __KDEBUG
+#define __KLEAVE
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "cachefs-int.h"
+
diff -uNrp linux-2.6.14-mm2/fs/cachefs/tree-data.c linux-2.6.14-mm2-cachefs/fs/cachefs/tree-data.c
--- linux-2.6.14-mm2/fs/cachefs/tree-data.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/tree-data.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,1669 @@
+/* tree-data.c: data handling routines
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+//#define __KENTER
+//#define __KDEBUG
+//#define __KLEAVE
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "cachefs-int.h"
+
+#define log2(n) ffz(~(n))
+
+/*****************************************************************************/
+/*
+ * walk from the root of the tree to the object sliding a write lock down the
+ * tree to the parent of the specified object
+ * - we replace every node that we pass
+ * - the object must be resident in the tree and must be pinning the nodes on
+ *   the path through the tree
+ * - we return the end node write-locked and with its page locked
+ */
+static int cachefs_tree_slide_writelock_replace(struct cachefs_operation *op)
+{
+	struct cachefs_object *xobject;
+	struct cachefs_tree *point, *next;
+	uint16_t level, offset;
+	int ret;
+
+	_enter("");
+
+	/* walk the tree from the root looking for the object and sliding the
+	 * lock down appropriately */
+	point = cachefs_tree_get(op->super->metadata_tree);
+	level = 0;
+
+	down_write(&point->sem);
+
+	ret = cachefs_replace_node(op, point);
+	if (ret < 0)
+		goto error;
+
+begin_step:
+	/* extract the bits of key in which we're immediately interested */
+	offset = cachefs_extract_subkey_obj(op->object, level);
+
+	_debug("step %d subkey=%04x", level, offset);
+
+	/* start by checking the cached next nodes and shortcuts leading off of
+	 * this one
+	 */
+	read_lock(&point->lock);
+
+	xobject = cachefs_tree_find_object(point, op->object->offset);
+	if (xobject == op->object)
+		goto found_object;
+
+	next = cachefs_tree_find_node(point, CACHEFS_TREE_TYPE_NODE, offset);
+	if (next)
+		goto move_to_cached_next;
+
+	next = cachefs_tree_find_shortcut_obj(point, op->object);
+	if (next)
+		goto move_to_cached_shortcut;
+
+	read_unlock(&point->lock);
+
+	/* uh oh... the object should be in the tree somewhere */
+	printk(KERN_ERR "Object not connected to in-mem tree\n");
+	printk(KERN_ERR "- obj %llx node %p{%x} level %d offset %04x\n",
+	       op->object->objid, point, point->bix, level, offset);
+	BUG();
+
+	/* we found a suitable next node to move to in the topology cache */
+move_to_cached_shortcut:
+	_debug(">>>> skip to %p [lev %d]", next, next->level);
+	goto move_to_cached_next2;
+
+move_to_cached_next:
+	_debug(">>>> move to %p [lev %d]", next, next->level);
+
+move_to_cached_next2:
+	cachefs_tree_get(next);
+	read_unlock(&point->lock);
+
+	down_write(&next->sem);
+	downgrade_write(&point->sem);
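+	/* lock-couple: the next node is write-locked before the current one is
+	 * downgraded and released, so the path can't change under us */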
+
+	ret = cachefs_replace_node(op, next);
+	if (ret < 0)
+		goto error_2;
+
+	unlock_page(point->page);
+	up_read(&point->sem);
+	cachefs_tree_put(point);
+
+	ASSERTCMP(next->level, >, level);
+	level = next->level;
+	point = next;
+	goto begin_step;
+
+	/* we found the object we were looking for
+	 * - return with the point node's semaphore still write-locked and a ref
+	 *   held on its usage count
+	 */
+found_object:
+	read_unlock(&point->lock);
+	op->p.data.inode = point;
+
+	_leave(" = 0 [found %x]", op->p.data.inode->bix);
+	return 0;
+
+error_2:
+	unlock_page(point->page);
+	up_write(&next->sem);
+	up_read(&point->sem);
+	cachefs_tree_put(next);
+	cachefs_tree_put(point);
+	_leave(" = %d", ret);
+	return ret;
+
+error:
+	up_write(&point->sem);
+	cachefs_tree_put(point);
+	_leave(" = %d", ret);
+	return ret;
+
+} /* end cachefs_tree_slide_writelock_replace() */
+
+/*****************************************************************************/
+/*
+ * allocate a data pointer block to be one order higher than the current root,
+ * and point its 0th ptr at the current root
+ */
+static int cachefs_data_alloc_higher_ptrblk(struct cachefs_operation *op,
+					    cachefs_block_t olddataptr)
+{
+	struct cachefs_tree *node;
+	void *data;
+	int ret;
+
+	_enter("{%d}", op->object->data_levels);
+
+	/* get a representation of the next level up */
+	node = cachefs_tree_alloc(GFP_HIGHUSER);
+	if (!node)
+		goto nomem;
+
+	node->type = CACHEFS_TREE_TYPE_DATAPTRBLK;
+	node->level = op->object->data_levels + 1;
+
+	/* allocate a page on which to create the new node */
+	node->page = alloc_page(GFP_HIGHUSER);
+	if (!node->page)
+		goto nomem_t;
+
+	/* initialise the block complete with preset data ptr */
+	data = kmap_atomic(node->page, KM_USER0);
+	memset(data, CACHEFS_NULL_FILL, PAGE_SIZE);
+	*(cachefs_block_t *) data = olddataptr;
+	kunmap_atomic(data, KM_USER0);
+
+	node->page->mapping = op->super->imeta->i_mapping;
+	SetPageUptodate(node->page);
+	SetPageFsMisc(node->page);
+	SetPagePrivate(node->page);
+
+	set_bit(CACHEFS_TREE_NODE_VALID, &node->flags);
+	set_bit(CACHEFS_TREE_NODE_VALIDATED, &node->flags);
+
+	/* allocate a block for this node */
+	down_write(&node->sem);
+
+	ret = cachefs_replace_node(op, node);
+	if (ret < 0)
+		goto error;
+
+	_alter(op->super, "set obj %llx level %d dataptr %x[0000] to %x",
+	       op->object->objid, node->level, node->bix, olddataptr);
+
+	set_page_dirty(node->page);
+	unlock_page(node->page);
+	up_write(&node->sem);
+
+	op->p.isize.new_alloc = node;
+	_leave(" = 0 [node %p{%x}, pg %p]", node, node->bix, node->page);
+	return 0;
+
+error:
+	up_write(&node->sem);
+	cachefs_tree_put(node);
+	op->p.isize.new_alloc = NULL;
+	_leave(" = %d", ret);
+	return ret;
+
+nomem_t:
+	cachefs_tree_put(node);
+nomem:
+	op->p.isize.new_alloc = NULL;
+	_leave(" = -ENOMEM");
+	return -ENOMEM;
+
+} /* end cachefs_data_alloc_higher_ptrblk() */
+
+/*****************************************************************************/
+/*
+ * change the size of an object
+ * - may need to expand the data tree vertically
+ */
+int cachefs_data_set_i_size(struct cachefs_object *object, loff_t i_size)
+{
+	struct cachefs_ondisc_leaf *leaf;
+	struct cachefs_operation op;
+	struct fscache_cookie *cookie;
+	cachefs_block_t olddataptr;
+	unsigned long limit, tmp;
+	unsigned delta_nblocks;
+	loff_t bigtmp;
+	int ret, height;
+
+	op.object = object;
+	op.super = container_of(object->fscache.cache,
+				struct cachefs_super, cache);
+
+	_enter("{%s},{i:%llx},%llx",
+	       op.super->cache.identifier, object->objid, i_size);
+
+	/* work out the new height and page limit
+	 * - limit is one more than the maximum page index
+	 * - limit of ULONG_MAX means any page index
+	 */
+	bigtmp = i_size;
+	bigtmp += PAGE_SIZE - 1;
+	bigtmp &= ~((loff_t) PAGE_SIZE - 1);
+	if (bigtmp < i_size)
+		bigtmp = ((loff_t) 0) - 1;
+	bigtmp >>= PAGE_SHIFT;
+
+	limit = (bigtmp > ULONG_MAX) ? ULONG_MAX : bigtmp;
+
+	height = 0;
+	tmp = limit;
+	if (tmp != ULONG_MAX)
+		tmp--;
+	for (; tmp > 0; tmp >>= op.super->layout->pshift)
+		height++;
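+	/* so a single page fits in a 0-level tree, up to 2^pshift pages need
+	 * one level of pointer blocks, and each further factor of 2^pshift
+	 * pages adds another level */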
+
+	_debug("new height %d [old %d%s] limit %lx",
+	       height,
+	       object->data_levels,
+	       object->has_data ? " data" : "",
+	       limit);
+
+	if (height < object->data_levels)
+		height = object->data_levels;
+
+	/* set up an operation record and request allocation reservation */
+	op.object	= object;
+	op.reason	= CACHEFS_OP_INSERT_DATA;
+	op.data_space	= height - object->data_levels;
+
+	ret = cachefs_operation_begin(&op);
+	if (ret < 0) {
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	/* replace and lock down to the node holding the object */
+	_debug("lock to obj");
+
+	ret = cachefs_tree_slide_writelock_replace(&op);
+	if (ret < 0) {
+		cachefs_operation_end(&op);
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	/* extract the current data root ptr */
+	leaf = kmap_atomic(op.p.isize.inode->page, KM_USER0) +
+		op.object->offset;
+	olddataptr = leaf->ptr;
+	kunmap_atomic(leaf, KM_USER0);
+
+	/* grab the record of the old data root if currently resident
+	 * - we don't get a root if the data tree is a 0 level tree
+	 */
+	write_lock(&op.p.isize.inode->lock);
+
+	op.p.isize.old_root =
+		cachefs_tree_find_node(op.p.isize.inode,
+				       CACHEFS_TREE_TYPE_DATAPTRBLK,
+				       op.object->offset);
+
+	if (op.p.isize.old_root) {
+		ASSERT(op.object->has_data);
+		ASSERTCMP(op.object->data_levels, >, 0);
+		ASSERTCMP(op.p.isize.old_root->bix, ==, olddataptr);
+
+		cachefs_tree_unlink_from_node(op.p.isize.old_root);
+
+		op.p.isize.new_root = cachefs_tree_get(op.p.isize.old_root);
+	}
+
+	write_unlock(&op.p.isize.inode->lock);
+
+	/* if there's any data, insert data ptr blocks at the top of the tree
+	 * one at a time to build up the height */
+	delta_nblocks = 0;
+
+	if (object->has_data) {
+		_debug("expand %d", height);
+
+		while (op.object->data_levels < height) {
+			_debug("expand %d upwards one [%x]",
+			       op.object->data_levels, olddataptr);
+
+			ret = cachefs_data_alloc_higher_ptrblk(&op,
+							       olddataptr);
+			if (ret < 0)
+				break;
+
+			olddataptr = op.p.isize.new_alloc->bix;
+			op.object->data_levels++;
+			delta_nblocks++;
+
+			/* link level 1 dataptr blocks into the object's
+			 * list */
+			if (op.p.isize.new_alloc->level == 1) {
+				op.p.isize.new_alloc->index = 0;
+				cachefs_tree_link_to_object(
+					op.p.isize.new_alloc,
+					op.object);
+			}
+
+			/* link the last expansion block (lower level) into
+			 * this one (higher level) */
+			if (op.p.isize.new_root) {
+				op.p.isize.new_root->offset = 0;
+				cachefs_tree_link_to_node(
+					op.p.isize.new_root,
+					op.p.isize.new_alloc);
+			}
+
+			/* keep track of both ends of what we've added */
+			op.p.isize.new_root = op.p.isize.new_alloc;
+			if (!op.p.isize.old_root)
+				op.p.isize.old_root =
+					cachefs_tree_get(op.p.isize.new_alloc);
+			op.p.isize.new_alloc = NULL;
+		}
+	}
+	else {
+		/* no data - just proclaim new height */
+		op.object->data_levels = height;
+		ret = 0;
+	}
+
+	/* change the object itself */
+	_debug("change obj [ret %d]", ret);
+
+	ASSERTIFCMP(!op.object->has_data, olddataptr, ==, CACHEFS_NULL_PTR);
+	ASSERTIF(op.object->has_data, olddataptr != CACHEFS_NULL_PTR);
+
+	_alter(op.super, "set obj %llx dataptr %x[%04x] to %x [lev %d]",
+	       op.object->objid,
+	       op.p.isize.inode->bix, op.object->offset,
+	       olddataptr, op.p.isize.inode->level);
+
+	leaf = kmap_atomic(op.p.isize.inode->page, KM_USER0) +
+		op.object->offset;
+
+	leaf->ptr = olddataptr;
+
+	_alter(op.super, "set obj %llx nb=%x+%x dl=%x",
+	       op.object->objid,
+	       leaf->u.object.nblocks, delta_nblocks,
+	       op.object->data_levels);
+
+	leaf->u.object.data_levels = op.object->data_levels;
+	leaf->u.object.nblocks += delta_nblocks;
+
+	spin_lock(&op.super->alloc_lock);
+	op.super->j.space_data_used += delta_nblocks;
+	spin_unlock(&op.super->alloc_lock);
+
+	leaf->u.object.atime = CURRENT_TIME.tv_sec;
+
+	/* update the max size only if we actually managed to bring the tree to
+	 * the requisite height */
+	if (ret == 0) {
+		leaf->u.object.size = i_size;
+		_alter(op.super, "set obj %llx isize to %llx",
+		       op.object->objid, i_size);
+	}
+
+	/* update the auxiliary data from the netfs */
+	cookie = op.object->fscache.cookie;
+	if (cookie->def->get_aux) {
+		uint16_t dlen, maxdlen = CACHEFS_ONDISC_LEAF_SIZE;
+		void *dbuf;
+
+		maxdlen -= offsetof(struct cachefs_ondisc_leaf,
+				    u.object.netfs_data);
+		maxdlen -= leaf->u.object.netfs_klen;
+		dbuf = leaf->u.object.netfs_data;
+		dbuf += leaf->u.object.netfs_klen;
+
+		dlen = cookie->def->get_aux(cookie->netfs_data, dbuf, maxdlen);
+		BUG_ON(dlen > maxdlen);
+		leaf->u.object.netfs_dlen = dlen;
+	}
+
+	kunmap_atomic(leaf, KM_USER0);
+
+	set_page_dirty(op.p.isize.inode->page);
+	unlock_page(op.p.isize.inode->page);
+
+	/* rearrange the in-memory topology cache */
+	_debug("rearrange");
+	if (op.p.isize.new_root) {
+		op.p.isize.new_root->offset = op.object->offset;
+		cachefs_tree_link_to_node(op.p.isize.new_root,
+					  op.p.isize.inode);
+		op.p.isize.new_root = NULL;
+	}
+
+	/* move the ref a netfs has on page 0 in a level-0 tree to the 0th data
+	 * ptr block */
+	if (op.p.isize.old_root &&
+	    op.p.isize.old_root->level == 1 &&
+	    atomic_read(&op.object->page_usage) > 0 &&
+	    atomic_read(&op.p.isize.old_root->netfs_usage) == 0
+	    ) {
+		ASSERTCMP(atomic_read(&op.object->page_usage), ==, 1);
+		atomic_inc(&op.p.isize.old_root->netfs_usage);
+	}
+
+	/* update the in-memory object unless there's an error pending */
+	if (ret == 0) {
+		op.object->i_size = i_size;
+		op.object->page_limit = limit;
+	}
+
+	up_write(&op.p.isize.inode->sem);
+
+	cachefs_operation_end(&op);
+
+	_leave(" = %d", ret);
+	return ret;
+
+} /* end cachefs_data_set_i_size() */
+
+/*****************************************************************************/
+/*
+ * walk the data tree looking for the appropriate level-1 data pointer block
+ */
+static int cachefs_data_walk(struct cachefs_super *super,
+			     struct cachefs_object *object,
+			     unsigned long index,
+			     struct cachefs_tree **_dataptr1,
+			     unsigned long gfp)
+{
+	struct cachefs_cursor cursor;
+	struct cachefs_tree *next;
+	cachefs_block_t bix, *ptr;
+	unsigned long tmp;
+	uint8_t height;
+	int ret;
+
+	index &= CACHEFS_ONDISC_LEVEL_MASK;
+
+	_enter(",{%llx},%lx,,", object->objid, index);
+
+	/* slide the lock down the tree until we get a read lock and a ref on
+	 * the node containing the object
+	 */
+	cursor.point = cachefs_tree_slide_readlock(super, object);
+
+	/* determine whether the block requires a higher level tree than what's
+	 * currently available if there's anything actually there yet
+	 */
+	height = 0;
+	for (tmp = index; tmp > 0; tmp >>= super->layout->pshift)
+		height++;
+
+	_debug("height %d {dl=%d}", height, object->data_levels);
+
+	ASSERTCMP(height, <=, object->data_levels);
+
+	/* the top level block is directly pointed to by the object
+	 * - this will be the data block if level 0 is top
+	 */
+	cursor.offset = object->offset;
+	cursor.level = object->data_levels;
+
+	_debug("- dataptr obj %d: %x[%04x]",
+	       cursor.level, cursor.point->bix, cursor.offset);
+
+	ASSERTCMP(cursor.level, >, 0);
+
+	/* now we walk down the in-memory cached data pointer block nodes
+	 * - these may not actually exist on disk yet if some other netfs page
+	 *   is holding a tentative reservation
+	 */
+	while (cursor.level > 0) {
+		/* see if the next level is in the cache */
+		_debug("- dataptr mem %d: %x[%x]",
+		       cursor.level, cursor.point->bix, cursor.offset);
+
+		read_lock(&cursor.point->lock);
+		next = cachefs_tree_find_node(cursor.point,
+					      CACHEFS_TREE_TYPE_DATAPTRBLK,
+					      cursor.offset);
+		if (!next) {
+			read_unlock(&cursor.point->lock);
+			break;
+		}
+
+		cachefs_tree_get(next);
+		read_unlock(&cursor.point->lock);
+
+		/* move to the next mem cached pointer block down */
+		_debug("interm %x", next->bix);
+
+		down_read(&next->sem);
+		up_read(&cursor.point->sem);
+		cachefs_tree_put(cursor.point);
+		cursor.point = next;
+
+		cursor.level--;
+		cursor.offset = index >> (super->layout->pshift * cursor.level);
+		cursor.offset &= (1 << super->layout->pshift) - 1;
+		cursor.offset <<= CACHEFS_BLOCK_SHIFT;
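+		/* cursor.offset is now the byte offset of the pointer slot for
+		 * this page within the next pointer block down */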
+	}
+
+	/* and now we walk down the on-disk blocks to fill in what we're missing */
+	while (cursor.level > 0) {
+		/* see if this level is on disk */
+		_debug("- dataptr disk %d: %x[%x]",
+		       cursor.level, cursor.point->bix, cursor.offset);
+
+		if (!test_bit(CACHEFS_TREE_EXTANT, &cursor.point->flags))
+			break;
+
+		_debug("got page %p{%lx}",
+		       cursor.point->page, cursor.point->page->index);
+
+		bix = CACHEFS_NULL_PTR;
+		if (!PageMappedToDisk(cursor.point->page))
+			break;
+
+		/* extract the pointer */
+		ptr = kmap_atomic(cursor.point->page, KM_USER0) + cursor.offset;
+		bix = *ptr;
+		kunmap_atomic(ptr, KM_USER0);
+
+		ASSERTIFCMP(CACHEFS_NULL_PTR != 0, bix, !=, 0);
+
+		/* exit the loop if we reached the tip */
+		if (bix == CACHEFS_NULL_PTR)
+			break;
+
+		ASSERTCMP(bix, >=, super->layout->bix_cache);
+		ASSERTCMP(bix, <, super->j.alloc_unready);
+
+		/* extend the path for the next step */
+		next = cachefs_tree_lookup(gfp, &cursor, bix,
+					   CACHEFS_TREE_TYPE_DATAPTRBLK, 0);
+		if (!next)
+			goto nomem;
+
+		/* link level 1 nodes to the object directly */
+		if (next->level == 1) {
+			next->index = index & CACHEFS_ONDISC_LEVEL_MASK;
+			_debug("- - link level 1 to object (%lx)",
+			       next->index);
+			cachefs_tree_link_to_object(next, object);
+		}
+
+		down_read(&next->sem);
+		up_read(&cursor.point->sem);
+		cachefs_tree_put(cursor.point);
+		cursor.point = next;
+
+		cursor.level--;
+		cursor.offset = index >>
+			(super->layout->pshift * cursor.level);
+		cursor.offset &= (1 << super->layout->pshift) - 1;
+		cursor.offset <<= CACHEFS_BLOCK_SHIFT;
+
+		/* load the block into memory */
+		ret = cachefs_node_read(super, next, 1);
+		if (ret < 0) {
+			up_read(&cursor.point->sem);
+			cachefs_cursor_put(&cursor);
+			*_dataptr1 = NULL;
+			_leave(" = %d [read err]", ret);
+			return ret;
+		}
+	}
+
+	/* if we're missing some pointer blocks on disk, then we need to insert
+	 * them for potential later instantiation
+	 */
+	_debug("potential");
+
+	while (cursor.level > 0) {
+		_debug("- dataptr pot %d: %x[%x]",
+		       cursor.level, cursor.point->bix, cursor.offset);
+
+		next = cachefs_tree_lookup(gfp, &cursor, CACHEFS_NULL_PTR,
+					   CACHEFS_TREE_TYPE_DATAPTRBLK, 0);
+		if (!next)
+			goto nomem;
+
+		ASSERTCMP(next, !=, cursor.point);
+
+		/* link level 1 nodes to the object directly */
+		if (next->level == 1) {
+			next->index = index & CACHEFS_ONDISC_LEVEL_MASK;
+			_debug("- - link level 1 to object (%lx)",
+			       next->index);
+			cachefs_tree_link_to_object(next, object);
+		}
+
+		down_write(&next->sem);
+		up_read(&cursor.point->sem);
+		cachefs_tree_put(cursor.point);
+		cursor.point = next;
+
+		/* allocate a page in which to build this block */
+		if (!cursor.point->page) {
+			void *data;
+
+			cursor.point->page = alloc_page(GFP_HIGHUSER);
+			if (!cursor.point->page) {
+				up_write(&cursor.point->sem);
+				goto nomem_noup;
+			}
+
+			data = kmap_atomic(cursor.point->page, KM_USER0);
+			memset(data, CACHEFS_NULL_FILL, PAGE_SIZE);
+			kunmap_atomic(data, KM_USER0);
+
+			cursor.point->page->mapping = super->imeta->i_mapping;
+			SetPageUptodate(cursor.point->page);
+			SetPageFsMisc(cursor.point->page);
+			SetPagePrivate(cursor.point->page);
+
+			set_bit(CACHEFS_TREE_NODE_VALID, &cursor.point->flags);
+			set_bit(CACHEFS_TREE_NODE_VALIDATED, &cursor.point->flags);
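+			/* there's no on-disk counterpart to read yet, so the
+			 * freshly initialised page is already up to date */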
+
+			_debug("alloc node %p{%p}",
+			       cursor.point, cursor.point->page);
+		}
+
+		downgrade_write(&cursor.point->sem);
+
+		cursor.level--;
+		cursor.offset =
+			index >> (super->layout->pshift * cursor.level);
+		cursor.offset &= (1 << super->layout->pshift) - 1;
+		cursor.offset <<= CACHEFS_BLOCK_SHIFT;
+	}
+
+	/* we've made it to level 0 and we may have a pointer to the data */
+	_debug("level 0 %x[%x]", cursor.point->bix, cursor.offset);
+
+	/* return ENODATA if the metadata chain doesn't entirely exist */
+	ret = 0;
+	if (cursor.point->bix == CACHEFS_NULL_PTR)
+		ret = -ENODATA;
+
+	ASSERT(cursor.point->page);
+	ASSERTCMP(cursor.point->type, ==, CACHEFS_TREE_TYPE_DATAPTRBLK);
+
+	*_dataptr1 = cursor.point;
+	_leave(" = %d [done]", ret);
+	return ret;
+
+nomem:
+	up_read(&cursor.point->sem);
+nomem_noup:
+	cachefs_cursor_put(&cursor);
+	*_dataptr1 = NULL;
+	_leave(" = -ENOMEM");
+	return -ENOMEM;
+
+} /* end cachefs_data_walk() */
+
+/*****************************************************************************/
+/*
+ * indicate to the netfs that a page should be marked as being cached
+ */
+static void cachefs_data_read_mark_page(struct cachefs_object *object,
+					struct address_space *mapping,
+					struct page *page)
+{
+	struct fscache_cookie *cookie;
+	struct pagevec cached_pvec;
+
+	pagevec_init(&cached_pvec, 0);
+	pagevec_add(&cached_pvec, page);
+
+	cookie = object->fscache.cookie;
+	cookie->def->mark_pages_cached(cookie->netfs_data, mapping,
+				       &cached_pvec);
+
+} /* end cachefs_data_read_mark_page() */
+
+/*****************************************************************************/
+/*
+ * deal with a read of the single page in a 0-level tree
+ */
+static int cachefs_data_read_page_0level(struct cachefs_super *super,
+					 struct cachefs_object *object,
+					 struct page *page,
+					 struct cachefs_io_callback *callback,
+					 unsigned long gfp)
+{
+	struct cachefs_ondisc_leaf *leaf;
+	struct cachefs_tree *point;
+	cachefs_block_t bix;
+	struct bio *bio;
+
+	_enter("");
+
+	ASSERTCMP(page->index, ==, 0);
+
+	/* slide the lock down the tree until we get a read lock and a ref on
+	 * the node containing the object
+	 */
+	point = cachefs_tree_slide_readlock(super, object);
+
+	_debug("- dataptr obj 0: %x[%04x]", point->bix, object->offset);
+
+	/* get a pointer to the block */
+	leaf = kmap_atomic(object->node->page, KM_USER0) + object->offset;
+	bix = leaf->ptr;
+	kunmap_atomic(leaf, KM_USER0);
+
+	/* deal with there being no backing tree yet */
+	if (bix == CACHEFS_NULL_PTR) {
+		/* retain metadata for page 0, inasmuch as there is any */
+		cachefs_data_read_mark_page(object, page->mapping, page);
+		atomic_inc(&object->page_usage);
+		up_read(&point->sem);
+		cachefs_tree_put(point);
+		_leave(" = -ENODATA");
+		return -ENODATA;
+	}
+
+	if (bix < super->layout->bix_cache || bix >= super->j.alloc_unready) {
+		printk(KERN_ERR
+		       "CacheFS: Block number out of range %x (%x-%x)\n",
+		       bix,
+		       super->layout->bix_cache,
+		       super->j.alloc_unready - 1);
+		BUG();
+	}
+
+	/* dispatch a bio to load the page */
+	bio = bio_alloc(gfp, 1);
+	if (!bio) {
+		up_read(&point->sem);
+		cachefs_tree_put(point);
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	bio->bi_bdev	= super->sb->s_bdev;
+	bio->bi_end_io	= cachefs_netfs_io_completion;
+	bio->bi_sector	= bix;
+	bio->bi_sector	<<= PAGE_SHIFT - super->sb->s_blocksize_bits;
+
+	if (!bio_add_page(bio, page, PAGE_SIZE, 0))
+		BUG();
+
+	/* prevent the page from being recycled before we've read it */
+	atomic_inc(&callback->usage);
+	bio->bi_private	= callback;
+
+	cachefs_data_read_mark_page(object, page->mapping, page);
+
+	//dump_bio(bio, 1);
+	submit_bio(READ, bio);
+
+	/* note metadata "retention" for page 0 */
+	atomic_inc(&object->page_usage);
+
+	up_read(&point->sem);
+	cachefs_tree_put(point);
+	_leave(" = 0 [%x]", bix);
+	return 0;
+
+} /* end cachefs_data_read_page_0level() */
+
+/*****************************************************************************/
+/*
+ * look up a data page belonging to the nominated object
+ * - the object must reside in the in-memory tree
+ */
+int cachefs_data_read_page(struct cachefs_super *super,
+			   struct cachefs_object *object,
+			   struct page *page,
+			   struct cachefs_io_callback *callback,
+			   unsigned long gfp)
+{
+	struct cachefs_tree *dataptr;
+	cachefs_block_t *ptr, bix;
+	struct bio *bio;
+	int ret;
+
+	_enter("{%s},{i:%llx},{p:%lx},,",
+	       super->cache.identifier, object->objid, page->index);
+
+	ASSERTIFCMP(object->page_limit < ULONG_MAX,
+		    page->index, <, object->page_limit);
+
+	/* special handling for 0-level trees */
+	if (object->data_levels == 0)
+		return cachefs_data_read_page_0level(super, object, page,
+						     callback, gfp);
+
+	/* find the parent data pointer block */
+	ret = cachefs_data_walk(super, object, page->index, &dataptr, gfp);
+	if (ret < 0) {
+		if (ret == -ENODATA)
+			goto nodata;
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	/* find the block holding the page */
+	ptr = kmap_atomic(dataptr->page, KM_USER0);
+	bix = ptr[page->index & ~CACHEFS_ONDISC_LEVEL_MASK];
+	kunmap_atomic(ptr, KM_USER0);
+
+	_debug("datapage: %x[%04lx]: %x",
+	       dataptr->bix, page->index & ~CACHEFS_ONDISC_LEVEL_MASK, bix);
+
+	if (bix == CACHEFS_NULL_PTR)
+		goto nodata;
+
+	if (bix < super->layout->bix_cache || bix >= super->j.alloc_unready) {
+		printk(KERN_ERR
+		       "CacheFS: Block number out of range %x (%x-%x)\n",
+		       bix,
+		       super->layout->bix_cache,
+		       super->j.alloc_unready - 1);
+		BUG();
+	}
+
+	/* dispatch a bio to load the page */
+	bio = bio_alloc(gfp, 1);
+	if (!bio) {
+		up_read(&dataptr->sem);
+		cachefs_tree_put(dataptr);
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	bio->bi_bdev	= super->sb->s_bdev;
+	bio->bi_end_io	= cachefs_netfs_io_completion;
+	bio->bi_sector	= bix;
+	bio->bi_sector	<<= PAGE_SHIFT - super->sb->s_blocksize_bits;
+
+	if (!bio_add_page(bio, page, PAGE_SIZE, 0))
+		BUG();
+
+	atomic_inc(&callback->usage);
+	bio->bi_private	= callback;
+
+	cachefs_data_read_mark_page(object, page->mapping, page);
+
+	//dump_bio(bio, 1);
+	submit_bio(READ, bio);
+
+	/* retain level 1 data ptr blocks whilst we're dealing with them */
+	if (atomic_add_return(1, &dataptr->netfs_usage) == 1)
+		cachefs_tree_get(dataptr);
+	atomic_inc(&object->page_usage);
+
+	up_read(&dataptr->sem);
+	cachefs_tree_put(dataptr);
+	_leave(" = 0 [%x]", bix);
+	return 0;
+
+	/* retain the level-1 data ptr block even if there's no data to be
+	 * read */
+nodata:
+	_debug("nodata");
+
+	cachefs_data_read_mark_page(object, page->mapping, page);
+
+	if (atomic_add_return(1, &dataptr->netfs_usage) == 1)
+		cachefs_tree_get(dataptr);
+	atomic_inc(&object->page_usage);
+
+	up_read(&dataptr->sem);
+	cachefs_tree_put(dataptr);
+	_leave(" = -ENODATA");
+	return -ENODATA;
+
+} /* end cachefs_data_read_page() */
+
+/*****************************************************************************/
+/*
+ * deal with a read of the single page in a 0-level tree
+ */
+static int cachefs_data_read_pages_0level(struct cachefs_super *super,
+					  struct cachefs_object *object,
+					  struct address_space *mapping,
+					  struct list_head *pages,
+					  int *nr_pages,
+					  struct cachefs_io_callback *callback,
+					  unsigned long gfp)
+{
+	struct cachefs_ondisc_leaf *leaf;
+	struct cachefs_tree *point;
+	cachefs_block_t bix;
+	struct page *page;
+	struct bio *bio;
+	int ret;
+
+	page = list_entry(pages->next, struct page, lru);
+
+	_enter("");
+
+	ASSERTCMP(page->index, ==, 0);
+	ASSERTCMP(*nr_pages, ==, 1);
+
+	/* slide the lock down the tree until we get a read lock and a ref on
+	 * the node containing the object
+	 */
+	point = cachefs_tree_slide_readlock(super, object);
+
+	_debug("- dataptr obj 0: %x[%04x]", point->bix, object->offset);
+
+	/* get a pointer to the block */
+	leaf = kmap_atomic(object->node->page, KM_USER0) + object->offset;
+	bix = leaf->ptr;
+	kunmap_atomic(leaf, KM_USER0);
+
+	/* deal with there being no backing tree yet */
+	if (bix == CACHEFS_NULL_PTR) {
+		/* retain metadata for page 0, inasmuch as there is any */
+		cachefs_data_read_mark_page(object, mapping, page);
+		atomic_inc(&object->page_usage);
+		up_read(&point->sem);
+		cachefs_tree_put(point);
+		_leave(" = -ENODATA");
+		return -ENODATA;
+	}
+
+	if (bix < super->layout->bix_cache || bix >= super->j.alloc_unready) {
+		printk(KERN_ERR
+		       "CacheFS: Block number out of range %x (%x-%x)\n",
+		       bix,
+		       super->layout->bix_cache,
+		       super->j.alloc_unready - 1);
+		BUG();
+	}
+
+	/* dispatch a bio to load the page */
+	bio = bio_alloc(gfp, 1);
+	if (!bio) {
+		up_read(&point->sem);
+		cachefs_tree_put(point);
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	bio->bi_bdev	= super->sb->s_bdev;
+	bio->bi_end_io	= cachefs_netfs_io_completion;
+	bio->bi_sector	= bix;
+	bio->bi_sector	<<= PAGE_SHIFT - super->sb->s_blocksize_bits;
+
+	/* the page needs removing from the caller's list and inserting into
+	 * the page cache and LRU system */
+	list_del(&page->lru);
+	ret = add_to_page_cache_lru(page, mapping, page->index, gfp);
+	if (ret < 0) {
+		list_add(&page->lru, pages);
+		bio_put(bio);
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	(*nr_pages)--;
+
+	if (!bio_add_page(bio, page, PAGE_SIZE, 0))
+		BUG();
+
+	/* prevent the page from being recycled before we've read it */
+	atomic_inc(&callback->usage);
+	bio->bi_private	= callback;
+
+	cachefs_data_read_mark_page(object, page->mapping, page);
+
+	//dump_bio(bio, 1);
+	submit_bio(READ, bio);
+
+	/* note metadata "retention" for page 0 */
+	atomic_inc(&object->page_usage);
+
+	up_read(&point->sem);
+	cachefs_tree_put(point);
+	_leave(" = 0 [%x]", bix);
+	return 0;
+
+} /* end cachefs_data_read_pages_0level() */
+
+/*****************************************************************************/
+/*
+ * look up a bunch of data pages belonging to the nominated object
+ * - the object must exist
+ */
+int cachefs_data_read_pages(struct cachefs_super *super,
+			    struct cachefs_object *object,
+			    struct address_space *mapping,
+			    struct list_head *pages,
+			    int *nr_pages,
+			    struct cachefs_io_callback *callback,
+			    unsigned long gfp)
+{
+	struct cachefs_tree *dataptr;
+	cachefs_block_t last_block_in_bio, *ptr, bix;
+	struct pagevec lru_pvec, cached_pvec;
+	unsigned long dpindex;
+	struct page *page, *_p;
+	struct bio *bio;
+	int retain, ret, ret2, state, contiguity, ix;
+
+	page = list_entry(pages->next, struct page, lru);
+
+	_enter("{%s},{i:%llx},{p:%lx},%d,,",
+	       super->cache.identifier, object->objid,
+	       page->index, *nr_pages);
+
+	/* special handling for 0-level trees */
+	if (object->data_levels == 0)
+		return cachefs_data_read_pages_0level(super, object, mapping,
+						      pages, nr_pages,
+						      callback, gfp);
+
+	/* find the block holding the page */
+	pagevec_init(&lru_pvec, 0);
+	pagevec_init(&cached_pvec, 0);
+	last_block_in_bio = 0;
+	dpindex = 0;
+	dataptr = NULL;
+	retain = 0;
+	state = 0;
+	bio = NULL;
+	ret = 0;
+
+	list_for_each_entry_safe_reverse(page, _p, pages, lru) {
+		ASSERTCMP(*nr_pages, >, 0);
+
+		_debug("read page %p{%lx}", page, page->index);
+
+		if (object->page_limit != ULONG_MAX &&
+		    page->index >= object->page_limit)
+			continue;
+
+		/* drop the level 1 pointer block we were using if this page
+		 * isn't in it */
+		if (state == 1 &&
+		    ((page->index ^ dpindex) & CACHEFS_ONDISC_LEVEL_MASK) != 0
+		    ) {
+			_debug("ditch %lx for %lx", dpindex, page->index);
+
+			/* retain level 1 data ptr blocks whilst they're
+			 * backing in-memory netfs pages */
+			atomic_add(retain, &object->page_usage);
+			if (dataptr) {
+				if (atomic_add_return(retain,
+						      &dataptr->netfs_usage
+						      ) == retain)
+					cachefs_tree_get(dataptr);
+				up_read(&dataptr->sem);
+				cachefs_tree_put(dataptr);
+				dataptr = NULL;
+			}
+			retain = 0;
+			state = 0;
+		}
+
+		/* look up the level 1 pointer block for this page if we don't
+		 * already have it */
+		if (state == 0) {
+			dpindex = page->index & CACHEFS_ONDISC_LEVEL_MASK;
+			ret2 = cachefs_data_walk(super, object, dpindex,
+						 &dataptr, gfp);
+			if (ret2 < 0 && ret2 != -ENODATA && ret2 != -ENOBUFS) {
+				ret = ret2;
+				goto out;
+			}
+			state = 1;
+		}
+
+		/* ignore pages without level 1 data allocated (ENOBUFS) */
+		if (!dataptr) {
+			ret = -ENOBUFS;
+			continue;
+		}
+
+		ptr = kmap_atomic(dataptr->page, KM_USER0);
+
+#if 1
+		ix = page->index & ~CACHEFS_ONDISC_LEVEL_MASK;
+		bix = ptr[ix];
+
+		contiguity = 1;
+		if (bix != CACHEFS_NULL_PTR) {
+			/* see how large a contiguous run we have */
+			for (ix++; ix < CACHEFS_ONDISC_LEVEL_SIZE; ix++) {
+				if (ptr[ix] == bix + contiguity)
+					contiguity++;
+				else
+					break;
+				if (contiguity >= BIO_MAX_PAGES)
+					break;
+			}
+		}
+
+		if (contiguity > *nr_pages)
+			contiguity = *nr_pages;
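+		/* contiguity now gives the length of the run of consecutive
+		 * disk blocks starting at bix that one BIO could cover */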
+
+#else
+		bix = ptr[page->index & ~CACHEFS_ONDISC_LEVEL_MASK];
+#endif
+		kunmap_atomic(ptr, KM_USER0);
+
+		_debug("datapage %lx: %x[%04lx]: %x-%x",
+		       page->index, dataptr->bix,
+		       page->index & ~CACHEFS_ONDISC_LEVEL_MASK,
+		       bix, bix + contiguity - 1);
+
+		/* pages for which there are no data are left in the list */
+		if (bix == CACHEFS_NULL_PTR) {
+			if (ret == 0)
+				ret = -ENODATA;
+			retain++;
+
+			/* tell the netfs about cached pages so that it can
+			 * mark them */
+			if (!pagevec_add(&cached_pvec, page)) {
+				object->fscache.cookie->def->mark_pages_cached(
+					object->fscache.cookie->netfs_data,
+					mapping,
+					&cached_pvec);
+				pagevec_init(&cached_pvec, 0);
+			}
+			continue;
+		}
+
+		if (bix < super->layout->bix_cache ||
+		    bix >= super->j.alloc_unready
+		    ) {
+			printk(KERN_ERR
+			       "CacheFS:"
+			       " Block number out of range %x (%x-%x)\n",
+			       bix,
+			       super->layout->bix_cache,
+			       super->j.alloc_unready - 1);
+			BUG();
+		}
+
+		/* the page needs removing from the caller's list and inserting
+		 * into the page cache and LRU system
+		 */
+		ret = add_to_page_cache(page, mapping, page->index, gfp);
+		if (ret == -ENOMEM)
+			break;
+
+		if (ret == -EEXIST) {
+			/* we're probably racing readpages() vs readpages() on
+			 * the same bit of the same file */
+			list_del_init(&page->lru);
+			page_cache_release(page);
+			ret = 0;
+			continue;
+		}
+
+		/* queue for addition to the LRU */
+		list_del(&page->lru);
+		if (!pagevec_add(&lru_pvec, page))
+			__pagevec_lru_add(&lru_pvec);
+
+		/* add it to the outstanding BIO if:
+		 * - there is one
+		 * - it's contiguous with the end of that BIO
+		 * - that BIO isn't full
+		 */
+		while (!bio ||
+		       (last_block_in_bio != 0 && last_block_in_bio != bix - 1) ||
+		       !bio_add_page(bio, page, PAGE_SIZE, 0)
+		       ) {
+			unsigned long bgfp;
+			int size;
+
+			/* flush the old bio */
+			if (bio) {
+				_debug("flush bio");
+
+				//dump_bio(bio, 1);
+				submit_bio(READ, bio);
+			}
+
+			/* attempt to allocate a BIO big enough to refer to all
+			 * the remaining pages, or at least as many as
+			 * possible */
+			_debug("alloc bio (nr %d)", *nr_pages);
+
+			bgfp = gfp & GFP_LEVEL_MASK;
+
+#if 1
+			size = contiguity;
+#else
+			size = *nr_pages;
+#endif
+			bio = bio_alloc(bgfp, size);
+
+			if (!bio) {
+				size = 1 << log2(size);
+				do {
+					bio = bio_alloc(bgfp, size);
+				} while (!bio && (size >>= 1) > 0);
+			}
+
+			if (!bio)
+				goto nomem_after_added_page;
+
+			_debug("bio %p size %d", bio, size);
+
+			bio->bi_bdev	= super->sb->s_bdev;
+			bio->bi_end_io	= cachefs_netfs_io_completion;
+			bio->bi_sector	= bix;
+			bio->bi_sector	<<= PAGE_SHIFT - super->sb->s_blocksize_bits;
+
+			atomic_inc(&callback->usage);
+			bio->bi_private	= callback;
+
+			last_block_in_bio = 0;
+		}
+
+		/* tell the netfs about cached pages so that it can mark
+		 * them */
+		if (!pagevec_add(&cached_pvec, page)) {
+			object->fscache.cookie->def->mark_pages_cached(
+				object->fscache.cookie->netfs_data,
+				mapping,
+				&cached_pvec);
+			pagevec_init(&cached_pvec, 0);
+		}
+
+		last_block_in_bio = bix;
+		*nr_pages -= 1;
+		retain++;
+	}
+
+	_debug("add to LRU");
+
+out:
+	/* add read pages to the LRU and tell the netfs about cached pages so
+	 * that it can mark them */
+	pagevec_lru_add(&lru_pvec);
+
+	if (pagevec_count(&cached_pvec))
+		object->fscache.cookie->def->mark_pages_cached(
+			object->fscache.cookie->netfs_data,
+			mapping,
+			&cached_pvec);
+
+	/* flush any outstanding read op */
+	if (bio) {
+		//dump_bio(bio, 2);
+		submit_bio(READ, bio);
+	}
+
+	/* retain level 1 data ptr blocks whilst they're backing in-memory
+	 * netfs pages */
+	atomic_add(retain, &object->page_usage);
+
+	if (dataptr) {
+		if (atomic_add_return(retain, &dataptr->netfs_usage) == retain)
+			cachefs_tree_get(dataptr);
+		up_read(&dataptr->sem);
+		cachefs_tree_put(dataptr);
+	}
+
+	_leave(" = %d [%d]", ret, *nr_pages);
+	return ret;
+
+	/* deal with the non-trivial case of getting an ENOMEM after moving a
+	 * page into the page cache */
+nomem_after_added_page:
+	_debug("ENOMEM with page %p moved to page cache", page);
+
+	/* the page is not yet up to date, and neither is it erroneous */
+	ASSERT(!PageUptodate(page));
+	ASSERT(!PageError(page));
+	unlock_page(page);
+
+	ret = -ENOMEM;
+	goto out;
+
+} /* end cachefs_data_read_pages() */
+
+/*****************************************************************************/
+/*
+ * allocate a data page belonging to the nominated object
+ * - the object must exist
+ */
+int cachefs_data_alloc_page(struct cachefs_super *super,
+			    struct cachefs_object *object,
+			    struct page *page,
+			    unsigned long gfp)
+{
+	struct cachefs_tree *dataptr;
+	int ret;
+
+	_enter("{%s},{i:%llx},{p:%lx},,",
+	       super->cache.identifier, object->objid, page->index);
+
+	ASSERTIFCMP(object->page_limit < ULONG_MAX,
+		    page->index, <, object->page_limit);
+
+	/* special handling for 0-level trees */
+	if (object->data_levels == 0) {
+		/* note metadata "retention" for page 0 */
+		cachefs_data_read_mark_page(object, page->mapping, page);
+		atomic_inc(&object->page_usage);
+		_leave(" = 0 [level 0]");
+		return 0;
+	}
+
+	/* find the parent data pointer block */
+	ret = cachefs_data_walk(super, object, page->index, &dataptr, gfp);
+	if (ret < 0 && ret != -ENODATA) {
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	/* retain the level-1 data ptr block even if there's no data to be
+	 * read */
+	cachefs_data_read_mark_page(object, page->mapping, page);
+
+	if (atomic_add_return(1, &dataptr->netfs_usage) == 1)
+		cachefs_tree_get(dataptr);
+	atomic_inc(&object->page_usage);
+
+	up_read(&dataptr->sem);
+	cachefs_tree_put(dataptr);
+	_leave(" = -ENODATA");
+	return -ENODATA;
+
+} /* end cachefs_data_alloc_page() */
+
+/*****************************************************************************/
+/*
+ * write a bunch of pages of netfs data to disk
+ * - the pagevec must be a set of pages all in the same block
+ */
+int cachefs_data_write(struct cachefs_super *super,
+		       struct cachefs_object *object,
+		       struct pagevec *pagevec,
+		       struct cachefs_io_callback *callback,
+		       unsigned long gfp)
+{
+	struct cachefs_ondisc_leaf *leaf;
+	struct cachefs_operation op;
+	struct fscache_cookie *cookie;
+	cachefs_block_t *ptr;
+	unsigned long index;
+	struct bio *bio;
+	uint16_t offset;
+	int level, ret, new;
+
+	_enter("{%s},{i:%llx},{p:%lx},,",
+	       super->cache.identifier,
+	       object->objid,
+	       pagevec->pages[0]->index);
+
+	ASSERTCMP(pagevec->nr, >, 0);
+	ASSERT(pagevec->pages[0]);
+	ASSERTIFCMP(object->data_levels == 0, pagevec->nr, ==, 1);
+	ASSERTIFCMP(object->data_levels == 0, pagevec->pages[0]->index, ==, 0);
+
+	/* set up an operation record and request allocation reservation */
+	op.super	= super;
+	op.object	= object;
+	op.reason	= CACHEFS_OP_INSERT_DATA;
+	op.data_space	= pagevec->nr;
+
+	ret = cachefs_operation_begin(&op);
+	if (ret < 0) {
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	/* replace and lock down to the node holding the object */
+	_debug("lock to obj");
+
+	ret = cachefs_tree_slide_writelock_replace(&op);
+	if (ret < 0) {
+		cachefs_operation_end(&op);
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	/* walk out to the appropriate level-1 data ptr block, replacing nodes
+	 * as we go
+	 */
+	op.p.data.point = cachefs_tree_get(op.p.data.inode);
+
+	offset = op.object->offset;
+	index = pagevec->pages[0]->index;
+	level = op.object->data_levels;
+
+	while (level > 0) {
+		_debug("step %x[%04x] level %d",
+		       op.p.data.point->bix, offset, level);
+
+		/* find the next level of pointer block */
+		read_lock(&op.p.data.point->lock);
+		op.p.data.next =
+			cachefs_tree_find_node(op.p.data.point,
+					       CACHEFS_TREE_TYPE_DATAPTRBLK,
+					       offset);
+
+		/* which must be there as we should have it pinned */
+		ASSERT(op.p.data.next);
+		cachefs_tree_get(op.p.data.next);
+		read_unlock(&op.p.data.point->lock);
+
+		/* replace the block */
+		down_write(&op.p.data.next->sem);
+
+		new = (op.p.data.next->bix == CACHEFS_NULL_PTR);
+		ret = cachefs_replace_node(&op, op.p.data.next);
+		if (ret < 0)
+			goto error;
+
+		if (new) {
+			spin_lock(&super->alloc_lock);
+			super->j.space_data_used++;
+			spin_unlock(&super->alloc_lock);
+
+			leaf = kmap_atomic(op.p.data.inode->page, KM_USER0) +
+				op.object->offset;
+			leaf->u.object.nblocks++;
+			kunmap_atomic(leaf, KM_USER0);
+		}
+
+		/* move to next dataptr block */
+		if (op.p.data.point != op.p.data.inode) {
+			unlock_page(op.p.data.point->page);
+			up_write(&op.p.data.point->sem);
+		}
+		cachefs_tree_put(op.p.data.point);
+
+		op.p.data.point = op.p.data.next;
+		op.p.data.next = NULL;
+
+		level--;
+		offset = index >> level * CACHEFS_ONDISC_LEVEL_BITS;
+		offset &= ~CACHEFS_ONDISC_LEVEL_MASK;
+		offset <<= CACHEFS_BLOCK_SHIFT;
+	}
+
+	_debug("here %x[%04x] level %d",
+	       op.p.data.point->bix, offset, op.p.data.point->level - 1);
+
+	ASSERTCMP(offset, <, PAGE_SIZE);
+	ASSERTCMP(offset & 3, ==, 0);
+
+	/* we don't yet support multi-page writes */
+	ASSERTCMP(pagevec->nr, ==, 1); /* TODO */
+
+	/* allocate or replace a netfs data block */
+	op.m_alloc = 1;
+
+	ptr = kmap_atomic(op.p.data.point->page, KM_USER0) + offset;
+	op.bix_rcm[0] = *ptr;
+	kunmap_atomic(ptr, KM_USER0);
+
+	op.m_rcm = 0;
+	if (op.bix_rcm[0] != CACHEFS_NULL_PTR) {
+		_debug("old netfs block %x", op.bix_rcm[0]);
+		ASSERTCMP(op.bix_rcm[0], >=, super->layout->bix_cache);
+		ASSERTCMP(op.bix_rcm[0], <, super->j.alloc_unready);
+		op.m_rcm = 1;
+	}
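+	/* the allocator hands back a fresh block in bix_alloc[0] and takes any
+	 * old block noted in bix_rcm[] back for reclamation */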
+
+	ret = cachefs_allocator(&op);
+	if (ret < 0)
+		goto error;
+
+	/* pin the journal against this page */
+	callback->jnl = cachefs_journal_get(super);
+	atomic_inc(&callback->usage);
+
+	/* write the netfs page to disk */
+	bio = bio_alloc(GFP_KERNEL | __GFP_WAIT | __GFP_NOFAIL, 1);
+	bio->bi_bdev	= super->sb->s_bdev;
+	bio->bi_private	= callback;
+	bio->bi_end_io	= cachefs_netfs_io_completion;
+	bio->bi_sector	= op.bix_alloc[0];
+	bio->bi_sector	<<= PAGE_SHIFT - super->sb->s_blocksize_bits;
+
+	if (!bio_add_page(bio, pagevec->pages[0], PAGE_SIZE, 0))
+		BUG();
+
+	//dump_bio(bio, 1);
+	submit_bio(WRITE, bio);
+
+	/* modify the metadata */
+	_alter(op.super, "set obj %llx dataptr %x[%04x] to %x",
+	       op.object->objid,
+	       op.p.data.point->bix,
+	       offset,
+	       op.bix_alloc[0]);
+
+	ptr = kmap_atomic(op.p.data.point->page, KM_USER0) + offset;
+	*ptr = op.bix_alloc[0];
+	kunmap_atomic(ptr, KM_USER0);
+
+	if (op.p.data.point != op.p.data.inode) {
+		set_page_dirty(op.p.data.point->page);
+		unlock_page(op.p.data.point->page);
+		up_write(&op.p.data.point->sem);
+	}
+
+	/* update the object whilst we're passing by */
+	_alter(op.super, "update obj %llx %x[%04x]",
+	       op.object->objid, op.p.data.inode->bix, op.object->offset);
+
+	leaf = kmap_atomic(op.p.data.inode->page, KM_USER0) +
+		op.object->offset;
+
+	leaf->u.object.nblocks += 1 - op.m_rcm;
+	leaf->u.object.atime = CURRENT_TIME.tv_sec;
+
+	cookie = object->fscache.cookie;
+	if (cookie->def->get_aux) {
+		/* update the netfs auxiliary data */
+		uint16_t dlen, maxdlen = CACHEFS_ONDISC_LEAF_SIZE;
+		void *dbuf;
+
+		maxdlen -= offsetof(struct cachefs_ondisc_leaf,
+				    u.object.netfs_data);
+		maxdlen -= leaf->u.object.netfs_klen;
+		dbuf = leaf->u.object.netfs_data;
+		dbuf += leaf->u.object.netfs_klen;
+
+		dlen = cookie->def->get_aux(cookie->netfs_data, dbuf, maxdlen);
+		BUG_ON(dlen > maxdlen);
+		leaf->u.object.netfs_dlen = dlen;
+	}
+
+	kunmap_atomic(leaf, KM_USER0);
+
+	/* done */
+	set_page_dirty(op.p.data.inode->page);
+	unlock_page(op.p.data.inode->page);
+	up_write(&op.p.data.inode->sem);
+
+	spin_lock(&op.super->alloc_lock);
+	op.super->j.space_data_used += 1 - op.m_rcm;
+	spin_unlock(&op.super->alloc_lock);
+
+	op.object->has_data = 1;
+
+	cachefs_operation_end(&op);
+	_leave(" = 0");
+	return 0;
+
+error:
+	kdebug("error");
+
+	if (op.p.data.next) {
+		kdebug("unlock next page %p", op.p.data.next->page);
+		unlock_page(op.p.data.next->page);
+		up_write(&op.p.data.next->sem);
+	}
+
+	if (op.p.data.point && op.p.data.point != op.p.data.inode) {
+		kdebug("unlock point page %p", op.p.data.point->page);
+		unlock_page(op.p.data.point->page);
+		up_write(&op.p.data.point->sem);
+	}
+
+	kdebug("unlock inode page %p", op.p.data.inode->page);
+	unlock_page(op.p.data.inode->page);
+	up_write(&op.p.data.inode->sem);
+
+	cachefs_operation_end(&op);
+	_leave(" = %d", ret);
+	return ret;
+
+} /* end cachefs_data_write() */
+
+/*****************************************************************************/
+/*
+ * release a number of pages
+ */
+unsigned long cachefs_data_uncache(struct cachefs_super *super,
+				   struct cachefs_object *object,
+				   struct pagevec *pagevec,
+				   unsigned long ix)
+{
+	struct cachefs_tree *node;
+	unsigned long index, count;
+
+	index = pagevec->pages[ix]->index;
+	_enter(",,{%lx},%lu", index, ix);
+
+	if (object->data_levels == 0) {
+		ASSERTCMP(index, ==, 0);
+		ASSERTCMP(ix, ==, 0);
+		ASSERTCMP(atomic_read(&object->page_usage), ==, 1);
+
+		atomic_dec(&object->page_usage);
+
+		_leave(" = %lu [pageroot]", ix + 1);
+		return ix + 1;
+	}
+
+	/* count the number of releases we can make in one go */
+	count = ix;
+	for (ix++; ix < pagevec->nr; ix++)
+		if (((pagevec->pages[ix]->index ^ index) &
+		     CACHEFS_ONDISC_PTRPERLEAF_MASK
+		     ) != 0)
+			break;
+
+	count = ix - count;
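+	/* all the pages just counted hang off the same level-1 pointer block,
+	 * so their refs can be dropped in one go */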
+
+	/* grab the first node off which we have dangling pages */
+	node = cachefs_tree_find_level1_dataptr(object, index);
+
+	/* check that we haven't run out of refs */
+	ASSERT(node);
+
+	_debug("release %lu of %d", count, atomic_read(&node->netfs_usage));
+
+	ASSERTCMP(atomic_read(&node->netfs_usage), >=, (int) count);
+	ASSERTCMP(atomic_read(&object->page_usage), >=, (int) count);
+
+	/* release the refs */
+	if (atomic_sub_and_test(count, &node->netfs_usage))
+		cachefs_tree_put(node);
+	atomic_sub(count, &object->page_usage);
+
+	cachefs_tree_put(node);
+	_leave(" = %lu", ix);
+	return ix;
+
+} /* end cachefs_data_uncache() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/tree-delete.c linux-2.6.14-mm2-cachefs/fs/cachefs/tree-delete.c
--- linux-2.6.14-mm2/fs/cachefs/tree-delete.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/tree-delete.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,597 @@
+/* tree-delete.c: indexing tree leaf deletion
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+//#define __KENTER
+//#define __KLEAVE
+//#define __KDEBUG
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "cachefs-int.h"
+
+static int cachefs_tree_delete_setup_recycling(struct cachefs_operation *op);
+static void cachefs_trans_reap_one(struct cachefs_operation *op);
+static void cachefs_trans_recycle_one(struct cachefs_operation *op);
+
+/*****************************************************************************/
+/*
+ * resolve the next step from op->p.del.point
+ * - caller must have op->p.del.point's semaphore locked
+ * - new node is placed in op->p.del.next
+ */
+static void cachefs_tree_step_one_inmem(struct cachefs_operation *op)
+{
+	struct cachefs_tree *next;
+	uint16_t offset;
+
+	_enter("");
+
+	ASSERT(op->p.del.point);
+	ASSERT(!op->p.del.next);
+
+	/* extract the bits of key in which we're immediately interested */
+	offset = cachefs_extract_subkey_obj(op->object,
+					    op->p.del.point->level);
+
+	_debug("step %d subkey=%04x", op->p.del.point->level, offset);
+
+	/* start by checking the cached branches and shortcuts leading off of
+	 * this one */
+	read_lock(&op->p.del.point->lock);
+
+	next = cachefs_tree_find_node(op->p.del.point, CACHEFS_TREE_TYPE_NODE,
+				      offset);
+	if (next)
+		goto move_to_cached_branch;
+
+	next = cachefs_tree_find_shortcut_obj(op->p.del.point, op->object);
+	if (next)
+		goto move_to_cached_shortcut;
+
+	read_unlock(&op->p.del.point->lock);
+
+	/* uh oh... the object should be in the tree somewhere */
+	printk(KERN_ERR "Object not connected to in-mem tree\n");
+	printk(KERN_ERR "- obj %llx node %p{%x} level %d offset %04x\n",
+	       op->object->objid,
+	       op->p.del.point,
+	       op->p.del.point->bix,
+	       op->p.del.point->level,
+	       offset);
+	BUG();
+
+	/* we found a suitable branch to move to in the topology cache */
+move_to_cached_shortcut:
+	_debug(">>>> skip to shortcut");
+
+move_to_cached_branch:
+	op->p.del.next = cachefs_tree_get(next);
+	_debug(">>>> move to %p [lev %d]", op->p.del.next, op->p.del.next->level);
+
+	read_unlock(&op->p.del.point->lock);
+	_leave(" [found %x]", op->p.del.next->bix);
+
+} /* end cachefs_tree_step_one_inmem() */
+
+/*****************************************************************************/
+/*
+ * delete an object from the tree
+ * - the object's data is recycled immediately if it's just a single page, or
+ *   queued for later recycling if there's more than that
+ * - the object's object ID is placed on the list for later child zapping
+ * - the object's leaf is erased and the tree compressed if possible
+ */
+int cachefs_tree_delete(struct cachefs_super *super,
+			struct cachefs_object *object)
+{
+	struct cachefs_ondisc_leaf *leaf;
+	struct cachefs_operation op;
+	struct cachefs_tree *x;
+	cachefs_block_t *ptr;
+	void *data;
+	int loop, ret, nitems, nrcm;
+
+	_enter(",%p{%llx}", object, object->objid);
+
+	ASSERT(object->node);
+
+	/* set up an operation record and request allocation reservation */
+	op.super	= super;
+	op.object	= object;
+	op.reason	= CACHEFS_OP_DELETE_LEAF;
+	op.data_space	= 0;
+
+	ret = cachefs_operation_begin(&op);
+	if (ret < 0) {
+		_leave(" = %d [begin]", ret);
+		return ret;
+	}
+
+	/* walk a write-lock down from the root until we find the point at
+	 * which we're going to delete back to */
+	op.p.del.point = cachefs_tree_get(op.super->metadata_tree);
+	down_write(&op.p.del.point->sem);
+
+	/* replace the root right at the start */
+	ret = cachefs_replace_node(&op, op.p.del.point);
+	if (ret < 0) {
+		up_write(&op.p.del.point->sem);
+		cachefs_operation_end(&op);
+		_leave(" = %d [repl root]", ret);
+		return ret;
+	}
+
+	op.p.del.pruneto = cachefs_tree_get(op.p.del.point);
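+	/* pruneto tracks the deepest node on the path that must be kept; every
+	 * node beyond it on the way to the object can be discarded */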
+
+	for (;;) {
+		/* see if the object is in the current node */
+		if (object->node == op.p.del.point)
+			break;
+
+		/* take a step down the tree and lock the new node */
+		cachefs_tree_step_one_inmem(&op);
+		down_write(&op.p.del.next->sem);
+
+		/* we need to keep this node and its ancestors if it has more
+		 * than one child */
+		if (op.p.del.next->occupancy > 1) {
+			/* move the prune-back-to point to here */
+			while (op.p.del.pruneto != op.p.del.next) {
+				x = op.p.del.next;
+				while (x->parent != op.p.del.pruneto)
+					x = x->parent;
+
+				downgrade_write(&op.p.del.pruneto->sem);
+
+				ret = cachefs_replace_node(&op, x);
+				if (ret < 0)
+					goto error_whilst_replacing_node;
+
+				unlock_page(op.p.del.pruneto->page);
+				up_read(&op.p.del.pruneto->sem);
+				cachefs_tree_put(op.p.del.pruneto);
+				op.p.del.pruneto = cachefs_tree_get(x);
+			}
+		}
+
+		cachefs_tree_put(op.p.del.point);
+		op.p.del.point = op.p.del.next;
+		op.p.del.next = NULL;
+	}
+
+	/* we've found the object containing the node
+	 * - mustn't fail for anything other than the disk shredding itself
+	 *   from hereon in
+	 */
+	_debug("splat %x{%d}, %x{%d}",
+	       op.p.del.pruneto->bix,
+	       op.p.del.pruneto->occupancy,
+	       op.p.del.point->bix,
+	       op.p.del.point->occupancy);
+
+	/* we should now be able to drop all the writelocks we're holding
+	 * beyond the pruneto point */
+	for (x = op.p.del.point; x != op.p.del.pruneto; x = x->parent)
+		up_write(&x->sem);
+
+	/* make sure we can attach two more pages to the page cache for the
+	 * objid reap and for the data recycler */
+	nitems = 0;
+	if (op.object->has_data && op.object->data_levels > 0)
+		nitems++;
+
+	if (op.object->flags & CACHEFS_ONDISC_OBJECT_HAS_CHILDREN)
+		nitems++;
+
+	if (nitems > 0) {
+		ret = radix_tree_preload_task(GFP_KERNEL, nitems);
+		if (ret < 0) {
+			up_write(&op.p.del.pruneto->sem);
+			cachefs_operation_end(&op);
+			_leave(" = %d [pre radix]", ret);
+			return ret;
+		}
+
+		down(&op.super->deletion_sem);
+
+		/* make sure the recycling stacks are available */
+		ret = cachefs_tree_delete_setup_recycling(&op);
+		if (ret < 0) {
+			up(&op.super->deletion_sem);
+			up_write(&op.p.del.pruneto->sem);
+			cachefs_operation_end(&op);
+			_leave(" = %d [setup recycling]", ret);
+			return ret;
+		}
+
+		/* queue a level >0 data tree for kcachefsd to consume */
+		if (op.object->has_data &&
+		    op.object->data_levels > 0
+		    ) {
+			/* locate the block at the root of the tree */
+			leaf = kmap_atomic(op.p.del.point->page, KM_USER0) +
+				op.object->offset;
+			op.bix_rcm[0] = leaf->ptr;
+
+			spin_lock(&super->alloc_lock);
+			super->j.space_data_used -= leaf->u.object.nblocks;
+			super->j.space_rcy += leaf->u.object.nblocks;
+			spin_unlock(&super->alloc_lock);
+
+			kunmap_atomic(leaf, KM_USER0);
+
+			cachefs_trans_recycle_one(&op);
+		}
+
+		/* reap the object's ID if it has any children */
+		if (op.object->flags & CACHEFS_ONDISC_OBJECT_HAS_CHILDREN)
+			cachefs_trans_reap_one(&op);
+
+		up(&op.super->deletion_sem);
+	}
+
+	/* recycle directly the object's data if it's merely a level-0 block */
+	op.m_alloc = 0;
+	op.m_rcm = 0;
+
+	if (op.object->has_data &&
+	    op.object->data_levels == 0
+	    ) {
+		/* locate the block */
+		leaf = kmap_atomic(op.p.del.point->page, KM_USER0) +
+			op.object->offset;
+		op.bix_rcm[0] = leaf->ptr;
+		kunmap_atomic(leaf, KM_USER0);
+
+		op.m_rcm = 1;
+
+		_debug("del level 0: %x", op.bix_rcm[0]);
+
+		spin_lock(&super->alloc_lock);
+		super->j.space_data_used--;
+		spin_unlock(&super->alloc_lock);
+	}
+
+	/* prune back the path
+	 * - note that we ignore I/O errors from the reclaimer - in such a case
+	 *   no further journal commits will be made to the device, and so any
+	 *   changes we make here will be discarded
+	 */
+	nrcm = 0;
+
+	for (x = op.p.del.point; x != op.p.del.pruneto; x = x->parent) {
+		if (op.m_rcm >= ARRAY_SIZE(op.bix_rcm)) {
+			cachefs_allocator(&op);
+			op.m_alloc = 0;
+			op.m_rcm = 0;
+		}
+
+		op.bix_rcm[op.m_rcm++] = x->bix;
+
+		/* this node is deleted */
+		write_lock(&x->lock);
+		set_bit(CACHEFS_TREE_DETACHED, &x->flags);
+		clear_bit(CACHEFS_TREE_EXTANT, &x->flags);
+		x->occupancy--;
+		write_unlock(&x->lock);
+		nrcm++;
+
+		ASSERT(x->occupancy == 0);
+	}
+
+	if (op.m_rcm > 0)
+		cachefs_allocator(&op);
+
+	if (nrcm > 0) {
+		spin_lock(&super->alloc_lock);
+		super->j.space_meta -= nrcm;
+		spin_unlock(&super->alloc_lock);
+	}
+
+	/* adjust the node we pruned back to */
+	_debug("change pruneto");
+
+	data = kmap_atomic(op.p.del.pruneto->page, KM_USER0);
+
+	if (op.p.del.pruneto == op.object->node) {
+		/* just reinitialise the object leaf */
+		_alter(op.super, "erase leaf %x[%04x]",
+		       op.p.del.pruneto->bix, op.object->offset);
+
+		leaf = data + op.object->offset;
+		memset(leaf, CACHEFS_EMPTY_FILL, op.super->layout->leaf_size);
+		leaf->type = CACHEFS_ONDISC_OBJTYPE_EMPTY_SLOT;
+	}
+	else {
+		/* need to find the most proximal node being discarded */
+		x = op.p.del.point;
+		while (x->parent != op.p.del.pruneto)
+			x = x->parent;
+
+		switch (x->type) {
+		case CACHEFS_TREE_TYPE_SHORTCUT:
+			/* detach subtree rooted on shortcut */
+			_alter(op.super, "erase shortcut %x[%04x]",
+			       op.p.del.pruneto->bix, x->offset);
+
+			leaf = data + x->offset;
+			memset(leaf, CACHEFS_EMPTY_FILL,
+			       super->layout->leaf_size);
+			leaf->type = CACHEFS_ONDISC_OBJTYPE_EMPTY_SLOT;
+			break;
+
+		case CACHEFS_TREE_TYPE_NODE:
+			/* detach subtree rooted on pointer */
+			_alter(op.super, "erase ptr %x[%04x]",
+			       op.p.del.pruneto->bix, x->offset);
+
+			ptr = data + x->offset;
+			*ptr = CACHEFS_NULL_PTR;
+			ptr = data + (x->offset & CACHEFS_ONDISC_LEAF_MASK);
+
+			for (loop = 0;
+			     loop < CACHEFS_ONDISC_PTRPERLEAF;
+			     loop++)
+				if (ptr[loop] != CACHEFS_NULL_PTR)
+					goto changed_pruneto_point;
+
+			/* turn an empty pointer leaf back into an empty slot */
+			_alter(op.super, "erase ptr leaf %x[%04x]",
+			       op.p.del.pruneto->bix,
+			       x->offset & CACHEFS_ONDISC_LEAF_MASK);
+
+			leaf = data + (x->offset & CACHEFS_ONDISC_LEAF_MASK);
+			memset(leaf, CACHEFS_EMPTY_FILL, super->layout->leaf_size);
+			leaf->type = CACHEFS_ONDISC_OBJTYPE_EMPTY_SLOT;
+			break;
+
+		default:
+			BUG();
+			break;
+		}
+	}
+
+changed_pruneto_point:
+	kunmap_atomic(data, KM_USER0);
+	set_page_dirty(op.p.del.pruneto->page);
+	unlock_page(op.p.del.pruneto->page);
+
+	/* decrease the occupancy of the most distal node that will remain */
+	op.p.del.pruneto->occupancy--;
+
+	/* seal the operation */
+	up_write(&op.p.del.pruneto->sem);
+	cachefs_operation_end(&op);
+	_leave(" = 0");
+	return 0;
+
+error_whilst_replacing_node:
+	for (x = op.p.del.next; x != op.p.del.pruneto; x = x->parent)
+		up_write(&x->sem);
+	up_read(&op.p.del.pruneto->sem);
+	cachefs_operation_end(&op);
+	_leave(" = %d [repl node]", ret);
+	return ret;
+
+} /* end cachefs_tree_delete() */
+
+/*****************************************************************************/
+/*
+ * set up the recycling and reap stacks
+ * - must be called with deletion_sem held
+ */
+static int cachefs_tree_delete_setup_recycling(struct cachefs_operation *op)
+{
+	int ret;
+
+	_enter("{%x[%d]},{%x[%d]}",
+	       op->super->j.reap_collector, op->super->j.reap_collsp,
+	       op->super->j.rcy_collector, op->super->j.rcy_collsp);
+
+	ASSERT(op->super->j.rcy_collsp < CACHEFS_ONDISC_RCYSTK_TREESPERNODE);
+	ASSERT(op->super->j.reap_collsp < CACHEFS_ONDISC_REAP_OBJIDSPERNODE);
+
+	/* allocate blocks for the two collectors if necessary */
+	op->m_alloc = 0;
+
+	if (!op->super->j.rcy_collector)
+		op->m_alloc++;
+
+	if (!op->super->j.reap_collector)
+		op->m_alloc++;
+
+	if (op->m_alloc > 0) {
+		op->m_rcm = 0;
+		ret = cachefs_allocator(op);
+		if (ret < 0) {
+			_leave(" = %d", ret);
+			return ret;
+		}
+
+		op->n_alloc = 0;
+
+		spin_lock(&op->super->alloc_lock);
+
+		if (!op->super->j.reap_collector) {
+			op->super->j.reap_collector =
+				op->bix_alloc[op->n_alloc++];
+			op->super->j.reap_collsp = 0;
+			op->super->j.space_reap++;
+
+			_alter(op->super, "new reap coll %x [spc %x]",
+			       op->super->j.reap_collector,
+			       op->super->j.space_reap);
+		}
+
+		if (!op->super->j.rcy_collector) {
+			op->super->j.rcy_collector =
+				op->bix_alloc[op->n_alloc];
+			op->super->j.rcy_collsp = 0;
+			op->super->j.space_rcy++;
+
+			_alter(op->super, "new rcy coll %x [spc %x]",
+			       op->super->j.rcy_collector,
+			       op->super->j.space_rcy);
+
+		}
+
+		spin_unlock(&op->super->alloc_lock);
+	}
+
+	/* bind the object ID reaping collector to a page if it isn't already
+	 * so bound */
+	if (!op->super->page_reap) {
+		ret = cachefs_page_read(op->super, op->super->j.reap_collector,
+					1, &op->super->page_reap);
+		if (ret < 0)
+			return ret;
+	}
+
+	/* bind the data tree recycling collector to a page if it isn't already
+	 * so bound */
+	if (!op->super->page_rcy) {
+		ret = cachefs_page_read(op->super, op->super->j.rcy_collector,
+					1, &op->super->page_rcy);
+		if (ret < 0)
+			return ret;
+	}
+
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_tree_delete_setup_recycling() */
+
+/*****************************************************************************/
+/*
+ * push a dead object ID onto the reaper's conveyor belt
+ * - must be called with deletion_sem held
+ */
+static void cachefs_trans_reap_one(struct cachefs_operation *op)
+{
+	struct cachefs_ondisc_reap_node *rnode;
+
+	_enter("{%x[%d]},{%llx}",
+	       op->super->j.reap_collector, op->super->j.reap_collsp,
+	       op->object->objid);
+
+	ASSERT(!op->super->j.reap_collector ||
+	       op->super->j.reap_collector >= op->super->layout->bix_cache);
+	ASSERT(op->super->j.reap_collector < op->super->j.alloc_unready);
+	ASSERT(op->super->j.reap_collsp < CACHEFS_ONDISC_REAP_OBJIDSPERNODE);
+
+	/* store the dead object ID and mark the reap node dirty */
+	lock_page(op->super->page_reap);
+	rnode = kmap_atomic(op->super->page_reap, KM_USER0);
+
+	_alter(op->super, "reap %llx to %x[%d]",
+	       op->object->objid,
+	       op->super->j.reap_collector,
+	       op->super->j.reap_collsp);
+
+	rnode->objids[op->super->j.reap_collsp] = op->object->objid;
+	op->super->j.reap_collsp++;
+
+	if (op->super->j.reap_collsp == CACHEFS_ONDISC_REAP_OBJIDSPERNODE) {
+		/* move full blocks to stack */
+		_alter(op->super, "push reap %x on %x",
+		       op->super->j.reap_collector,
+		       op->super->j.reap_stack);
+
+		rnode->next = op->super->j.reap_stack;
+		op->super->j.reap_stack = op->super->j.reap_collector;
+		op->super->j.reap_collector = 0;
+		op->super->j.reap_collsp = 0;
+	}
+
+	kunmap_atomic(rnode, KM_USER0);
+	set_page_dirty(op->super->page_reap);
+	unlock_page(op->super->page_reap);
+
+	if (op->super->j.reap_collsp == 0) {
+		cachefs_page_put(op->super->page_reap);
+		op->super->page_reap = NULL;
+	}
+
+	/* tell kcachefsd there's work for it */
+	set_bit(CACHEFS_SUPER_BEGIN_SCAN, &op->super->flags);
+	wake_up(&op->super->dmn_sleepq);
+
+} /* end cachefs_trans_reap_one() */
+
+/*****************************************************************************/
+/*
+ * propose pushing a data tree onto the recycler's conveyor belt
+ * - must be called with deletion_sem held
+ */
+static void cachefs_trans_recycle_one(struct cachefs_operation *op)
+{
+	struct cachefs_ondisc_recycle_node *rnode;
+
+	_enter("{%x[%d]},{%x,%d}",
+	       op->super->j.rcy_collector, op->super->j.rcy_collsp,
+	       op->bix_rcm[0], op->object->data_levels);
+
+	ASSERT(!op->super->j.rcy_collector ||
+	       op->super->j.rcy_collector >= op->super->layout->bix_cache);
+	ASSERT(op->super->j.rcy_collector < op->super->j.alloc_unready);
+	ASSERT(op->super->j.rcy_collsp >= 0);
+	ASSERT(op->super->j.rcy_collsp < CACHEFS_ONDISC_RCYSTK_TREESPERNODE);
+
+	lock_page(op->super->page_rcy);
+
+	/* validate the recycling node */
+	if (cachefs_recycle_validate_node(op->super,
+					  op->super->page_rcy,
+					  op->super->j.rcy_collsp
+					  ) < 0
+	    ) {
+		unlock_page(op->super->page_rcy);
+		return;
+	}
+
+	/* store the dead tree and mark the reclamation node dirty */
+	rnode = kmap_atomic(op->super->page_rcy, KM_USER0);
+
+	_alter(op->super, "recycle %x(%d) to %x[%d] [spc %x]",
+	       op->bix_rcm[0], op->object->data_levels,
+	       op->super->j.rcy_collector, op->super->j.rcy_collsp,
+	       op->super->j.space_rcy);
+
+	rnode->trees[op->super->j.rcy_collsp].dataptr = op->bix_rcm[0];
+	rnode->trees[op->super->j.rcy_collsp].depth = op->object->data_levels;
+	op->super->j.rcy_collsp++;
+
+	if (op->super->j.rcy_collsp == CACHEFS_ONDISC_RCYSTK_TREESPERNODE) {
+		/* add full blocks to stack */
+		_alter(op->super, "push rcy %x on %x",
+		       op->super->j.rcy_collector,
+		       op->super->j.rcy_stack);
+
+		rnode->next = op->super->j.rcy_stack;
+		op->super->j.rcy_stack = op->super->j.rcy_collector;
+		op->super->j.rcy_collector = 0;
+		op->super->j.rcy_collsp = 0;
+	}
+
+	kunmap_atomic(rnode, KM_USER0);
+	set_page_dirty(op->super->page_rcy);
+	unlock_page(op->super->page_rcy);
+
+	if (op->super->j.rcy_collsp == 0) {
+		cachefs_page_put(op->super->page_rcy);
+		op->super->page_rcy = NULL;
+	}
+
+	/* tell kcachefsd there's work for it */
+	set_bit(CACHEFS_SUPER_DO_RECYCLE, &op->super->flags);
+	wake_up(&op->super->dmn_sleepq);
+
+} /* end cachefs_trans_recycle_one() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/tree-insert.c linux-2.6.14-mm2-cachefs/fs/cachefs/tree-insert.c
--- linux-2.6.14-mm2/fs/cachefs/tree-insert.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/tree-insert.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,675 @@
+/* tree-insert.c: metadata tree leaf insertion
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ *
+ * Possible insertion pre-conditions:
+ *
+ * (0) Terminal node is detached and empty
+ *
+ *	- Step back to last attached node and consider as actual terminal node
+ *
+ * (1) Terminal node has empty slot
+ *
+ *	- No excess allocation required; merely replace extant blocks
+ *	- Write lock terminal node
+ *	- Insert leaf in empty slot
+ *
+ * (2) Terminal node is full; empty pointer points to where leaf could go
+ *
+ *	- Single excess allocation required
+ *	- Write lock terminal node
+ *	- Allocate new block, set pointer and install leaf in new block
+ *	- Need to move matching leaves from parent block into new
+ *	- Parent block needs update (set pointer, move leaves)
+ *
+ * (3) Terminal node is full; sufficient key disparity to permit partial fan
+ *     out of one slot
+ *
+ *	- Single excess allocation required
+ *	- Write lock terminal node
+ *	- Transfer matching leaves into new block
+ *	- Displace object leaf to insert pointer leaf; displaced leaf moves
+ *        into slot made available by transfer of matching leaves.
+ *	- Replace transferred shortcuts that are reduced to single-level with
+ *	  pointer blocks
+ *
+ * (4) Terminal node is full; key similarity crams all leaves into one pointer
+ *     leaf in terminal block
+ *
+ *	- Determine level at which disparity occurs
+ *	- Write lock terminal node
+ *	- Transfer all leaves from terminal node
+ *	- Place shortcut to new subtree into the terminal block
+ *	- Fan-out performed appropriate to disparity at new level:
+ *
+ *	  (a) Two or more leaves with same subkey:
+ *	    - Double excess allocation required
+ *	    - First new block is root of subtree
+ *	      - Holds dissimilar leaves
+ *	      - Holds pointer leaf, pointing to second new block
+ *          - Second new block contains similar leaves
+ *
+ *	  (b) Maximum disparity
+ *	    - Two leaves of the N leaves must occur in same pointer leaf
+ *	    - Triple excess allocation required
+ *	    - First new block is root of subtree
+ *	      - Holds dissimilar leaves
+ *	      - Holds pointer leaf, pointing to second and third new blocks
+ *	    - One similar leaf transferred to second new block
+ *	    - Other similar leaf transferred to third new block
+ *
+ *	  (c) Worst case disparity
+ *	    - All subkeys different
+ *	    - All subkeys map to one pointer leaf
+ *	    - Triple excess allocation required
+ *	    - Select any two leaves
+ *	    - First new block is root of subtree
+ *	      - Holds other leaves
+ *	      - Holds pointer leaf, pointing to second and third new blocks
+ *	    - One selected leaf transferred to second new block
+ *	    - Other selected leaf transferred to third new block
+ *
+ */
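+
+/* A minimal sketch of how the above cases are selected; the branch text is
+ * illustrative only - the real decisions are made inline in
+ * cachefs_tree_insert() and cachefs_tree_insert_fanout() below:
+ *
+ *	if (terminal node has an empty slot)
+ *		insert the leaf directly			[case (1)]
+ *	else if (an empty pointer covers the object's subkey)
+ *		allocate one new block and install there	[case (2)]
+ *	else
+ *		fan out the terminal node			[cases (3) and (4)]
+ */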
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include "cachefs-int.h"
+
+static int cachefs_tree_walk_writelock_replace(struct cachefs_operation *op);
+
+static void cachefs_tree_insert_directly(struct cachefs_operation *op,
+					 struct cachefs_ondisc_leaf *key,
+					 void *data,
+					 int slot);
+
+static int cachefs_tree_insert_new_block(struct cachefs_operation *op,
+					 uint16_t offset,
+					 struct cachefs_ondisc_leaf *keys);
+
+/*****************************************************************************/
+/*
+ * walk from the root of the tree to as close as possible to where the keyed
+ * object ought to be, locking the nodes as we go
+ * - we attempt to optimise the node packing
+ * - we deal with nodes that have evaporated whilst the lock wasn't held
+ */
+int cachefs_tree_insert(struct cachefs_super *super,
+			struct cachefs_object *object,
+			struct cachefs_ondisc_leaf *key)
+{
+	struct cachefs_ondisc_leaf *leaf;
+	struct cachefs_operation op;
+	uint16_t offset;
+	void *data;
+	int loop, ret;
+
+	//printk("\n");
+	//printk("------------------------------------------------------------\n");
+	//printk("\n");
+	_enter(",,");
+
+	/* set up an operation record and request allocation reservation */
+	op.super	= super;
+	op.object	= object;
+	op.reason	= CACHEFS_OP_INSERT_LEAF;
+	op.data_space	= 0;
+
+	ret = cachefs_operation_begin(&op);
+	if (ret < 0) {
+		_leave(" = %d [begin]", ret);
+		return ret;
+	}
+
+	/* start by walking to the closest node to where the object ought to
+	 * go, writelocking and replacing the superstructure as we go
+	 */
+	ret = cachefs_tree_walk_writelock_replace(&op);
+	if (ret < 0)
+		goto end_operation;
+
+	ASSERT(PageMappedToDisk(op.p.nodes[0]->page));
+	ASSERT(PageLocked(op.p.nodes[0]->page));
+
+	_debug("assess");
+
+	data = kmap_atomic(op.p.nodes[0]->page, KM_USER0);
+
+	/* if the terminal node has an empty slot then we can just insert the
+	 * object directly */
+	for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK; loop++) {
+		leaf = data + (loop << super->layout->leaf_shift);
+		if (leaf->type == CACHEFS_ONDISC_OBJTYPE_EMPTY_SLOT)
+			break;
+	}
+
+	if (loop < CACHEFS_ONDISC_LEAF_PER_BLOCK) {
+		cachefs_tree_insert_directly(&op, key, data, loop);
+		kunmap_atomic(data, KM_USER0);
+		ret = 0;
+		goto end_operation;
+	}
+
+	/* see if there's a null pointer that we can instantiate */
+	offset = cachefs_extract_subkey(key, op.p.nodes[0]->level);
+	_debug("subkey %04x @ lev %d", offset, op.p.nodes[0]->level);
+
+	leaf = data + (offset & CACHEFS_ONDISC_LEAF_MASK);
+
+	if (leaf->type == CACHEFS_NULL_PTR ||
+	    leaf->type > CACHEFS_ONDISC_OBJTYPE__LAST
+	    ) {
+		/* found a pointer block - there should be an empty pointer
+		 * - if there was a non-empty pointer then we should've
+		 *   followed it earlier
+		 */
+		cachefs_block_t *ptr, bix;
+
+		ptr = data + offset;
+		bix = *ptr;
+
+		_debug("ptr (%x @%hx)", bix, offset);
+
+		kunmap_atomic(data, KM_USER0);
+		ASSERT(bix == CACHEFS_NULL_PTR);
+
+		ret = cachefs_tree_insert_new_block(&op, offset, key);
+		goto end_operation;
+	}
+
+	/* fan out the terminal node */
+	_debug("need fanout");
+
+	kunmap_atomic(data, KM_USER0);
+
+	ret = cachefs_tree_insert_fanout(&op, key);
+
+	/* send the transaction diskwards */
+end_operation:
+	up_write(&op.p.nodes[0]->sem);
+	cachefs_operation_end(&op);
+	_leave(" = %d", ret);
+	return ret;
+
+} /* end cachefs_tree_insert() */
+
+/*****************************************************************************/
+/*
+ * walk from the root of the tree to as close as possible to where the keyed
+ * object ought to be
+ * - if successful, we return with the point node's semaphore still
+ *   write-locked
+ */
+static int cachefs_tree_walk_writelock_replace(struct cachefs_operation *op)
+{
+	struct cachefs_ondisc_leaf *leaf;
+	struct cachefs_cursor cursor, next;
+	struct cachefs_tree *branch;
+	cachefs_block_t *ptr, bix;
+	void *data;
+	int loop, ret;
+
+	_enter("");
+
+	/* walk the tree to see if this object is present or if a leaf can be
+	 * found on which to store the object */
+	cursor.point = cachefs_tree_get(op->super->metadata_tree);
+	cursor.level = 0;
+	cursor.offset = 0xffffU;
+	next.level = 1;
+
+	down_write(&cursor.point->sem);
+
+	/* replace the root right at the start */
+	ret = cachefs_replace_node(op, cursor.point);
+	if (ret < 0)
+		goto error;
+
+begin_step:
+	next.point = NULL;
+	next.level = cursor.level + 1;
+
+	ASSERT(PageLocked(cursor.point->page));
+
+	/* extract the bits of key in which we're immediately interested */
+	cursor.offset = cachefs_extract_subkey_obj(op->object, cursor.level);
+
+	_debug("step %d subkey=%04x", cursor.level, cursor.offset);
+
+	/* start by checking the cached branches and shortcuts leading off of
+	 * this one
+	 */
+	read_lock(&cursor.point->lock);
+
+	branch = cachefs_tree_find_node(cursor.point, CACHEFS_TREE_TYPE_NODE,
+					cursor.offset);
+	if (branch)
+		goto move_to_cached_branch;
+
+	branch = cachefs_tree_find_shortcut_obj(cursor.point, op->object);
+	if (branch)
+		goto move_to_cached_shortcut;
+
+	read_unlock(&cursor.point->lock);
+
+	ASSERT(test_bit(CACHEFS_TREE_EXTANT, &cursor.point->flags));
+
+	/* we need to examine the on-disk contents of this node */
+	if (!test_bit(CACHEFS_TREE_NODE_VALIDATED, &cursor.point->flags)) {
+		_debug("reading node %x", cursor.point->bix);
+
+		ret = cachefs_node_read(op->super, cursor.point, 1);
+		if (ret < 0)
+			goto error_unlock;
+
+		if (cursor.point->immutable - op->super->jnl_serial < 0)
+			_debug("- not immutable");
+	}
+
+	_debug("got page %p{%lx}",
+	       cursor.point->page, cursor.point->page->index);
+
+	data = kmap_atomic(cursor.point->page, KM_USER0);
+
+	/* see if there's a pointer at the correct position for us to walk on
+	 * immediately
+	 */
+	leaf = data + (cursor.offset & CACHEFS_ONDISC_LEAF_MASK);
+
+	ASSERTIF(CACHEFS_EMPTY_PTR != 0, leaf->type != CACHEFS_EMPTY_PTR);
+
+	_debug("ptrblk? %x type %x",
+	       cursor.offset & CACHEFS_ONDISC_LEAF_MASK,
+	       leaf->type);
+
+	if (leaf->type == CACHEFS_ONDISC_OBJTYPE_NULL_POINTER ||
+	    leaf->type > CACHEFS_ONDISC_OBJTYPE__LAST
+	    ) {
+		/* found a pointer block - see if there's a pointer */
+		_debug("ptrblk %x", cursor.offset);
+
+		ptr = data + cursor.offset;
+		bix = *ptr;
+		if (bix != CACHEFS_NULL_PTR)
+			goto follow_pointer;
+	}
+
+	/* there isn't a viable direct pointer; so we need to search for a
+	 * shortcut to follow
+	 */
+	_debug("walk shortcuts");
+
+	for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK; loop++) {
+		cursor.offset = loop << op->super->layout->leaf_shift;
+		leaf = data + cursor.offset;
+
+		_debug("leaf[%d] type %x", loop, leaf->type);
+
+		if (leaf->type == CACHEFS_ONDISC_OBJTYPE_SHORTCUT &&
+		    cachefs_compare_keys_obj(op->object, leaf) == 1)
+			goto take_shortcut;
+	}
+
+	kunmap_atomic(data, KM_USER0);
+	op->p.nodes[0] = cursor.point;
+	_leave(" = 0 [not found]");
+	return 0;
+
+	/* we found a suitable branch to move to in the topology cache */
+move_to_cached_shortcut:
+	_debug(">>>> skip to shortcut");
+
+move_to_cached_branch:
+	next.point = cachefs_tree_get(branch);
+	next.level = branch->level;
+	_debug(">>>> move to %p [lev %d]", next.point, next.level);
+	read_unlock(&cursor.point->lock);
+
+next_step:
+	down_write(&next.point->sem);
+	downgrade_write(&cursor.point->sem);
+
+	ret = cachefs_replace_node(op, next.point);
+	if (ret < 0)
+		goto error_2;
+
+	unlock_page(cursor.point->page);
+	up_read(&cursor.point->sem);
+	cachefs_cursor_put(&cursor);
+	cursor.point = next.point;
+	cursor.level = next.level;
+	goto begin_step;
+
+	/* found a pointer to a dependent block on disk */
+follow_pointer:
+	kunmap_atomic(data, KM_USER0);
+
+	_debug(">>>> walk to %x", bix);
+
+	ASSERT(CACHEFS_EMPTY_PTR != 0 || bix != CACHEFS_EMPTY_PTR);
+
+	if (bix < op->super->layout->bix_cache ||
+	    bix >= op->super->j.alloc_unready
+	    ) {
+		printk(KERN_ERR "can't walk to block %x\n", bix);
+		BUG();
+	}
+
+	ASSERT(bix >= op->super->layout->bix_cache);
+	ASSERT(bix < op->super->j.alloc_unready);
+
+	/* extend the topology cache */
+	cursor.level++;
+	next.point = cachefs_tree_lookup(GFP_KERNEL, &cursor, bix,
+					 CACHEFS_TREE_TYPE_NODE, 0);
+	if (!next.point)
+		goto error_unlock;
+	goto next_step;
+
+	/* if there's a shortcut we should take then we need to follow it */
+take_shortcut:
+	bix = leaf->ptr;
+	next.level = leaf->u.shortcut.level;
+	next.s_offset = leaf->u.shortcut.s_offset;
+	kunmap_atomic(data, KM_USER0);
+
+	ASSERT(bix >= op->super->layout->bix_cache);
+	ASSERT(bix < op->super->j.alloc_unready);
+
+	next.offset = cursor.offset;
+	next.point = cursor.point;
+
+	_debug(">>>> shortcut to %x [lev %d]", bix, next.level);
+
+	/* add to the tree cache */
+	next.point = cachefs_tree_lookup(GFP_KERNEL, &next, bix,
+					 CACHEFS_TREE_TYPE_SHORTCUT, 0);
+	if (!next.point)
+		goto error_unlock;
+	goto next_step;
+
+error_2:
+	unlock_page(cursor.point->page);
+	up_write(&next.point->sem);
+	up_read(&cursor.point->sem);
+	cachefs_cursor_put(&next);
+	cachefs_cursor_put(&cursor);
+	_leave(" = %d", ret);
+	return ret;
+
+error_unlock:
+	unlock_page(cursor.point->page);
+error:
+	up_write(&cursor.point->sem);
+	cachefs_cursor_put(&cursor);
+	_leave(" = -ENOMEM");
+	return -ENOMEM;
+
+} /* end cachefs_tree_walk_writelock_replace() */
+
+/*****************************************************************************/
+/*
+ * insert an object directly into a tree node on a spare leaf slot
+ */
+static void cachefs_tree_insert_directly(struct cachefs_operation *op,
+					 struct cachefs_ondisc_leaf *key,
+					 void *data,
+					 int slot)
+{
+	unsigned offset;
+
+	_enter("{%x},{%llx},,", op->p.nodes[0]->bix, op->object->objid);
+
+	offset = slot << op->super->layout->leaf_shift;
+
+	cachefs_tree_install_leaf(op, op->p.nodes[0], key, data, offset);
+
+	/* note that this block has changed */
+	set_page_dirty(op->p.nodes[0]->page);
+	unlock_page(op->p.nodes[0]->page);
+
+	_leave("");
+
+} /* end cachefs_tree_insert_directly() */
+
+/*****************************************************************************/
+/*
+ * insert an object into a newly allocated node
+ * - need to transfer old leaves with matching subkey as well
+ */
+static int cachefs_tree_insert_new_block(struct cachefs_operation *op,
+					 uint16_t offset,
+					 struct cachefs_ondisc_leaf *key)
+{
+	struct cachefs_ondisc_leaf *leaf, *oldleaf;
+	cachefs_block_t *ptr;
+	void *data, *olddata;
+	int loop, loop2, ret;
+
+	struct cachefs_cursor xcursor = {
+		.point	= op->p.nodes[0],
+		.level	= op->p.nodes[0]->level + 1,
+		.offset	= offset,
+	};
+
+	_enter("{%x},{%04x},", op->p.nodes[0]->bix, offset);
+
+	/* and propose a new node */
+	op->p.nodes[1] = cachefs_tree_lookup(GFP_KERNEL, &xcursor,
+					     CACHEFS_NULL_PTR,
+					     CACHEFS_TREE_TYPE_NODE, 0);
+	if (!op->p.nodes[1]) {
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	/* and allocate a page on which to create the new node */
+	op->p.nodes[1]->page = alloc_page(GFP_HIGHUSER);
+	if (!op->p.nodes[1]->page) {
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	op->p.nodes[1]->page->mapping = op->super->imeta->i_mapping;
+	SetPageFsMisc(op->p.nodes[1]->page);
+	SetPagePrivate(op->p.nodes[1]->page);
+
+	/* and then allocate and attach a block */
+	ret = cachefs_replace_node(op, op->p.nodes[1]);
+	if (ret < 0) {
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	spin_lock(&op->super->alloc_lock);
+	op->super->j.space_meta++;
+	spin_unlock(&op->super->alloc_lock);
+
+	ASSERT(PageMappedToDisk(op->p.nodes[1]->page));
+	ASSERT(PageLocked(op->p.nodes[1]->page));
+
+	olddata = kmap_atomic(op->p.nodes[0]->page, KM_USER0);
+	data = kmap_atomic(op->p.nodes[1]->page, KM_USER1);
+
+	/* initialise the new node to empty leaf slots */
+	_alter(op->super, "init node %x", op->p.nodes[1]->bix);
+
+	memset(data, CACHEFS_EMPTY_FILL, PAGE_SIZE);
+	for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK; loop++) {
+		leaf = data + (loop << op->super->layout->leaf_shift);
+		leaf->type = CACHEFS_ONDISC_OBJTYPE_EMPTY_SLOT;
+	}
+
+	SetPageUptodate(op->p.nodes[1]->page);
+
+	set_bit(CACHEFS_TREE_NODE_VALID, &op->p.nodes[1]->flags);
+	set_bit(CACHEFS_TREE_NODE_VALIDATED, &op->p.nodes[1]->flags);
+
+	/* any shortcuts that match this pointer but have run out of levels
+	 * must be turned into pointer blocks first as these need to be placed
+	 * precisely
+	 */
+	for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK; loop++) {
+		struct cachefs_tree *shortcut;
+
+		oldleaf = olddata + (loop << op->super->layout->leaf_shift);
+
+		if (oldleaf->type != CACHEFS_ONDISC_OBJTYPE_SHORTCUT)
+			continue;
+
+		_debug("short[%d] subkey %04x level %d",
+		       loop,
+		       oldleaf->u.shortcut.s_offset,
+		       oldleaf->u.shortcut.level);
+
+		if (oldleaf->u.shortcut.s_offset != op->p.nodes[1]->offset ||
+		    oldleaf->u.shortcut.level != op->p.nodes[1]->level + 1)
+			continue;
+
+		/* install a pointer block if we haven't already done so for an
+		 * earlier shortcut and set the appropriate pointer */
+		_alter(op->super,
+		       "convert shortcut to %x to ptr leaf %x[%04x]",
+		       oldleaf->ptr,
+		       op->p.nodes[1]->bix,
+		       oldleaf->u.shortcut.s_offset &
+		       CACHEFS_ONDISC_LEAF_MASK);
+
+		leaf = data + (oldleaf->u.shortcut.s_offset &
+			       CACHEFS_ONDISC_LEAF_MASK);
+
+		ASSERT(leaf->type == CACHEFS_ONDISC_OBJTYPE_EMPTY_SLOT);
+
+		memset(leaf, CACHEFS_NULL_FILL, CACHEFS_ONDISC_LEAF_SIZE);
+
+		ptr = data + op->p.nodes[1]->offset;
+		*ptr = oldleaf->ptr;
+
+		/* erase the shortcut from the old node */
+		_alter(op->super, "erase slot %x[%04x]",
+		       op->p.nodes[0]->bix,
+		       loop << op->super->layout->leaf_shift);
+
+		memset(oldleaf, CACHEFS_EMPTY_FILL, op->super->layout->leaf_size);
+		oldleaf->type = CACHEFS_ONDISC_OBJTYPE_EMPTY_SLOT;
+
+		/* if there's a record for this shortcut in memory then deal
+		 * with it */
+		_debug("- memory");
+		write_lock(&op->p.nodes[0]->lock);
+
+		shortcut = cachefs_tree_find_node(
+			op->p.nodes[0],
+			CACHEFS_TREE_TYPE_SHORTCUT,
+			loop << op->super->layout->leaf_shift);
+
+		if (!shortcut) {
+			write_unlock(&op->p.nodes[0]->lock);
+			continue;
+		}
+
+		cachefs_tree_unlink_from_node(shortcut);
+		write_unlock(&op->p.nodes[0]->lock);
+
+		shortcut->type = CACHEFS_TREE_TYPE_NODE;
+		cachefs_tree_link_to_node(shortcut, op->p.nodes[1]);
+	}
+
+	/* install the new object into the new node */
+	for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK; loop++) {
+		offset = loop << op->super->layout->leaf_shift;
+		leaf = data + offset;
+
+		if (leaf->type == CACHEFS_ONDISC_OBJTYPE_EMPTY_SLOT)
+			goto install_new;
+	}
+
+	BUG(); /* there's a strange lack of empty slots */
+
+install_new:
+	cachefs_tree_install_leaf(op, op->p.nodes[1], key, data, offset);
+
+	op->p.nodes[0]->occupancy++;
+
+	/* scan the old node to see if there's anything else we must transfer
+	 * into the new node
+	 */
+	for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK; loop++) {
+		uint16_t old_offset = loop << op->super->layout->leaf_shift;
+
+		oldleaf = olddata + old_offset;
+
+		_debug("old[%d] type %x", loop, oldleaf->type);
+
+		switch (oldleaf->type) {
+		case CACHEFS_ONDISC_OBJTYPE_INDEX_OBJECT:
+		case CACHEFS_ONDISC_OBJTYPE_DATA_OBJECT:
+		case CACHEFS_ONDISC_OBJTYPE_OTHER_OBJECT:
+		case CACHEFS_ONDISC_OBJTYPE_SHORTCUT:
+			offset = cachefs_extract_subkey(oldleaf,
+							op->p.nodes[0]->level);
+			if (offset != op->p.nodes[1]->offset)
+				break;
+
+			_debug("old[%d] type %x; subkey %04x == %04x",
+			       loop, oldleaf->type, offset,
+			       op->p.nodes[1]->offset);
+
+			for (loop2 = 0;
+			     loop2 < CACHEFS_ONDISC_LEAF_PER_BLOCK;
+			     loop2++
+			     ) {
+				uint16_t offset;
+
+				offset = loop2 << op->super->layout->leaf_shift;
+				leaf = data + offset;
+
+				if (leaf->type !=
+				    CACHEFS_ONDISC_OBJTYPE_EMPTY_SLOT)
+					continue;
+
+				_debug("- move to new slot %d", loop2);
+
+				cachefs_tree_move_leaf(op->super,
+						       op->p.nodes[0],
+						       op->p.nodes[1],
+						       oldleaf, leaf,
+						       old_offset, offset);
+				goto moved_leaf;
+			}
+
+			/* we seem to have run out of space... that shouldn't
+			 * happen as we're instantiating a pointer in a
+			 * pre-existing pointer block, so we should move at most
+			 * N-1 leaves and install one new one */
+			BUG();
+
+		moved_leaf:
+			break;
+
+		default:
+			break;
+		}
+	}
+
+	kunmap_atomic(data, KM_USER1);
+	kunmap_atomic(olddata, KM_USER0);
+
+	/* note that the blocks have changed */
+	set_page_dirty(op->p.nodes[0]->page);
+	set_page_dirty(op->p.nodes[1]->page);
+	unlock_page(op->p.nodes[1]->page);
+	unlock_page(op->p.nodes[0]->page);
+
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_tree_insert_new_block() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/tree-insert-fanout.c linux-2.6.14-mm2-cachefs/fs/cachefs/tree-insert-fanout.c
--- linux-2.6.14-mm2/fs/cachefs/tree-insert-fanout.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/tree-insert-fanout.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,1126 @@
+/* tree-insert-fanout.c: metadata tree leaf insertion
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include "cachefs-int.h"
+
+static const char *cachefs_assessment_types[] = {
+	"---", "new", "obj", "ptr", NULL
+};
+
+static const char *cachefs_assessment_alters[] = {
+	"---", "mov", "sld", "ins", NULL
+};
+
+/*
+ * individual slot assessment in fanout script
+ */
+struct cachefs_assessment {
+	uint64_t	objid;
+	uint16_t	offset;
+	uint16_t	common;		/* number of bits in common */
+	uint8_t		type;
+#define CACHEFS_ASSESS_EMPTY		0
+#define CACHEFS_ASSESS_NEW_OBJECT	1
+#define CACHEFS_ASSESS_OBJECT		2
+#define CACHEFS_ASSESS_PTRBLK		3
+	uint8_t		pslot;		/* slot subkey refers to */
+	uint8_t		nleaf;		/* number of leaves with subkeys to this slot */
+	uint8_t		dest;		/* destination block */
+	uint8_t		dslot;		/* slot in dest block */
+	uint8_t		alter;		/* modification to be made */
+#define CACHEFS_ASSESS_NOCHANGE		0
+#define CACHEFS_ASSESS_MOVE		1
+#define CACHEFS_ASSESS_SLIDE		2
+#define CACHEFS_ASSESS_INSTALL		3
+	uint8_t		occupied;	/* resulting occupation */
+};
+
+/*
+ * fanout script
+ */
+struct cachefs_fanout {
+	int				nalloc;
+	int16_t				shortcut_levels;
+	uint16_t			shortcut_offset;
+	uint16_t			common_offset;
+	uint8_t				nptrs;		/* number of pointer leaves */
+	uint8_t				dump;		/* dump completed fanout */
+	uint8_t				connection[4];
+#define CACHEFS_FANOUT_UNCONNECTED	0
+#define CACHEFS_FANOUT_CONNECTED	1
+#define CACHEFS_FANOUT_CONNECT_TO_0	2
+#define CACHEFS_FANOUT_CONNECT_TO_0AUX	3
+#define CACHEFS_FANOUT_CONNECT_TO_1	4
+#define CACHEFS_FANOUT_CONNECT_TO_1AUX	5
+#define CACHEFS_FANOUT_SHORTCUT_FROM_0	6
+	int8_t				ptrleaf_dest[2]; /* where to create new ptr leaves */
+	uint8_t				ptrleaf_dslot[2];
+	uint8_t				levels[2];	/* connectable levels */
+	struct cachefs_assessment	assess[0];
+};
+
+static void cachefs_tree_init_fanout(struct cachefs_operation *op,
+				     struct cachefs_fanout *fan,
+				     struct cachefs_ondisc_leaf *key,
+				     void *data);
+
+static void cachefs_tree_assess_fanout(struct cachefs_operation *op,
+				       struct cachefs_fanout *fan,
+				       struct cachefs_ondisc_leaf *key,
+				       void *data);
+
+static void cachefs_tree_insert_effect_fanout(struct cachefs_operation *op,
+					      struct cachefs_fanout *fan,
+					      struct cachefs_ondisc_leaf *key);
+
+static void cachefs_dump_fanout(struct cachefs_operation *op,
+				struct cachefs_fanout *fan);
+
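+/* In outline, the fanout below is a plan-then-apply sequence; this summary
+ * merely restates the calls made by cachefs_tree_insert_fanout() and implies
+ * no additional behaviour:
+ *
+ *	cachefs_tree_init_fanout()	- capture the node's slots in a script
+ *	cachefs_tree_assess_fanout()	- choose a destination for every leaf
+ *	cachefs_allocator()		- allocate fan->nalloc new metadata blocks
+ *	cachefs_tree_insert_effect_fanout() - carry out the scripted moves
+ */
+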
+/*****************************************************************************/
+/*
+ * perform a fanout insertion
+ */
+int cachefs_tree_insert_fanout(struct cachefs_operation *op,
+			       struct cachefs_ondisc_leaf *key)
+{
+	struct cachefs_fanout *fan;
+	struct cachefs_tree *node;
+	struct pagevec pvec_lru;
+	size_t size;
+	void *data;
+	int loop, loop2, ret;
+
+	_enter("");
+
+	size = (CACHEFS_ONDISC_LEAF_PER_BLOCK + 1);
+	size *= sizeof(struct cachefs_assessment);
+	size += sizeof(struct cachefs_fanout);
+
+	fan = kzalloc(size, GFP_KERNEL);
+	if (!fan) {
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	data = kmap_atomic(op->p.nodes[0]->page, KM_USER0);
+
+	cachefs_tree_init_fanout(op, fan, key, data);
+	cachefs_tree_assess_fanout(op, fan, key, data);
+	kunmap_atomic(data, KM_USER0);
+
+	fan->levels[0] = op->p.nodes[0]->level;
+	fan->levels[1] = op->p.nodes[0]->level + 1;
+
+	/* allocate the extra nodes and pages needed */
+	for (loop = 1; loop <= fan->nalloc; loop++) {
+		struct cachefs_cursor xcursor;
+		int type;
+
+		_debug("allocate extra page %d", loop);
+
+		memset(&xcursor, 0, sizeof(xcursor));
+
+		switch (fan->connection[loop]) {
+		case CACHEFS_FANOUT_CONNECT_TO_0:
+		case CACHEFS_FANOUT_CONNECT_TO_0AUX:
+			xcursor.point = op->p.nodes[0];
+			xcursor.level = fan->levels[0] + 1;
+			xcursor.offset = fan->ptrleaf_dslot[0] <<
+				op->super->layout->leaf_shift;
+
+			if (fan->shortcut_offset != 0xffffU) {
+				/* we create a pointer leaf in lieu of a
+				 * shortcut between adjacent levels */
+				xcursor.offset = fan->shortcut_offset;
+				goto found_subkey;
+			}
+			break;
+
+		case CACHEFS_FANOUT_CONNECT_TO_1:
+		case CACHEFS_FANOUT_CONNECT_TO_1AUX:
+			xcursor.point = op->p.nodes[1];
+			xcursor.level = fan->levels[1] + 1;
+			xcursor.offset = fan->ptrleaf_dslot[1] <<
+				op->super->layout->leaf_shift;
+			break;
+
+		case CACHEFS_FANOUT_SHORTCUT_FROM_0:
+			ASSERT(loop == 1);
+			xcursor.point = op->p.nodes[0];
+			xcursor.level = fan->shortcut_levels;
+			fan->levels[1] = fan->shortcut_levels;
+			xcursor.offset = fan->shortcut_offset;
+			goto found_subkey;
+
+		default:
+			BUG();
+			break;
+		}
+
+		for (loop2 = 0;
+		     loop2 < CACHEFS_ONDISC_LEAF_PER_BLOCK + 1;
+		     loop2++
+		     ) {
+			if (fan->assess[loop2].dest == loop) {
+				xcursor.offset = fan->assess[loop2].offset;
+				goto found_subkey;
+			}
+		}
+
+		BUG();	/* should never create a shortcut to a block in which
+			 * we don't place any object leaves */
+	found_subkey:
+
+		_debug("set subkey %04hx on %d", xcursor.offset, loop);
+
+		if (fan->connection[loop] != CACHEFS_FANOUT_SHORTCUT_FROM_0)
+			type = CACHEFS_TREE_TYPE_NODE;
+		else
+			type = CACHEFS_TREE_TYPE_SHORTCUT;
+
+		/* and allocate a new node */
+		node = cachefs_tree_lookup(GFP_KERNEL, &xcursor,
+					   CACHEFS_NULL_PTR, type, 0);
+		if (!node)
+			goto nomem;
+		op->p.nodes[loop] = node;
+
+		/* and allocate a page on which to create the new node */
+		node->page = alloc_page(GFP_HIGHUSER);
+		if (!node->page)
+			goto nomem;
+		node->page->mapping = op->super->imeta->i_mapping;
+		SetPageFsMisc(node->page);
+		SetPagePrivate(node->page);
+	}
+
+	/* addition of the new pages to the page cache must not fail with
+	 * ENOMEM, so we have to bank sufficient radix tree nodes in advance
+	 */
+	ret = radix_tree_preload_task(GFP_KERNEL, fan->nalloc);
+	if (ret < 0)
+		goto error;
+
+	/* allocate blocks for the new nodes */
+	op->m_alloc = fan->nalloc;
+	op->m_rcm = 0;
+
+	ret = cachefs_allocator(op);
+	if (ret < 0)
+		goto error;
+
+	spin_lock(&op->super->alloc_lock);
+	op->super->j.space_meta += fan->nalloc;
+	spin_unlock(&op->super->alloc_lock);
+
+	/* bind the pages to the blocks in the page cache */
+	pagevec_init(&pvec_lru, 0);
+	for (loop = 1; loop <= fan->nalloc; loop++) {
+		op->p.nodes[loop]->bix = op->bix_alloc[loop - 1];
+
+		cachefs_replace_add_to_page_cache(op, op->p.nodes[loop]);
+
+		page_cache_get(op->p.nodes[loop]->page);
+		if (!pagevec_add(&pvec_lru, op->p.nodes[loop]->page))
+			BUG();
+
+		SetPageMappedToDisk(op->p.nodes[loop]->page);
+		mark_page_accessed(op->p.nodes[loop]->page);
+	}
+
+	pagevec_lru_add(&pvec_lru);
+
+	/* alter the contents of the disk */
+	cachefs_tree_insert_effect_fanout(op, fan, key);
+	ret = 0;
+
+error:
+	kfree(fan);
+	_leave(" = %d", ret);
+	return ret;
+
+nomem:
+	ret = -ENOMEM;
+	goto error;
+
+} /* end cachefs_tree_insert_fanout() */
+
+/*****************************************************************************/
+/*
+ * initialise a fanout script
+ * - pull data from the node into the script
+ */
+static void cachefs_tree_init_fanout(struct cachefs_operation *op,
+				     struct cachefs_fanout *fan,
+				     struct cachefs_ondisc_leaf *key,
+				     void *data)
+{
+	struct cachefs_ondisc_leaf *leaf;
+	struct cachefs_assessment *assess, *pass;
+	int loop, tmp;
+
+	_enter("");
+
+	fan->connection[0]	= CACHEFS_FANOUT_CONNECTED;
+	fan->ptrleaf_dest[0]	= -1;
+	fan->ptrleaf_dest[1]	= -1;
+	fan->shortcut_levels	= -1;
+	fan->shortcut_offset	= 0xffffU;
+	fan->common_offset	= 0xfffeU;
+
+	assess = fan->assess;
+
+	/* determine the immediate subkeys and types for all leaves */
+	pass = assess;
+	for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK; loop++) {
+		int isptr = 0;
+
+		leaf = data + (loop << op->super->layout->leaf_shift);
+
+		switch (leaf->type) {
+#if CACHEFS_ONDISC_OBJTYPE_NULL_POINTER < CACHEFS_ONDISC_OBJTYPE_FIRST_POINTER
+		case CACHEFS_ONDISC_OBJTYPE_NULL_POINTER:
+#endif
+		case CACHEFS_ONDISC_OBJTYPE_FIRST_POINTER ...
+			CACHEFS_ONDISC_OBJTYPE_LAST_POINTER:
+			pass->type = CACHEFS_ASSESS_PTRBLK;
+			pass->offset = 0xfe00U | loop;
+			pass->pslot = 0x80U | loop;
+			fan->nptrs++;
+			isptr = 1;
+			break;
+
+		case CACHEFS_ONDISC_OBJTYPE_SHORTCUT:
+			pass->type = CACHEFS_ASSESS_OBJECT;
+			pass->offset = leaf->u.shortcut.s_offset;
+			break;
+
+		case CACHEFS_ONDISC_OBJTYPE_INDEX_OBJECT:
+		case CACHEFS_ONDISC_OBJTYPE_DATA_OBJECT:
+		case CACHEFS_ONDISC_OBJTYPE_OTHER_OBJECT:
+			pass->type = CACHEFS_ASSESS_OBJECT;
+			pass->objid = leaf->u.object.objid;
+			pass->offset = cachefs_extract_subkey(
+				leaf, op->p.nodes[0]->level);
+			break;
+
+			/* anything else we can't yet or won't handle */
+		default:
+			BUG();
+		}
+
+		if (!isptr) {
+			if (fan->common_offset == 0xfffeU)
+				fan->common_offset = pass->offset;
+			else if (fan->common_offset != pass->offset)
+				fan->common_offset = 0xffffU;
+			pass->pslot = pass->offset >>
+				CACHEFS_ONDISC_LEAF_SHIFT;
+			assess[pass->pslot].nleaf++;
+		}
+
+		pass->alter = CACHEFS_ASSESS_NOCHANGE;
+		pass->dest = 0;
+		pass->dslot = loop;
+		pass->occupied = 1;
+		pass++;
+	}
+
+	ASSERT(fan->common_offset != 0xfffeU);
+	ASSERT(fan->nptrs < CACHEFS_ONDISC_LEAF_PER_BLOCK);
+
+	/* add a record for the object we're attempting to insert */
+	pass->type = CACHEFS_ASSESS_NEW_OBJECT;
+	pass->alter = CACHEFS_ASSESS_NOCHANGE;
+	pass->offset = cachefs_extract_subkey(key, op->p.nodes[0]->level);
+	pass->pslot = pass->offset >> CACHEFS_ONDISC_LEAF_SHIFT;
+	pass->occupied = 0;
+	assess[pass->pslot].nleaf++;
+
+	pass->objid = key->u.object.objid;
+
+	/* insanity checks */
+	ASSERT(assess[CACHEFS_ONDISC_LEAF_PER_BLOCK].nleaf == 0);
+
+	tmp = 0;
+	for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK + 1; loop++) {
+		pass = &fan->assess[loop];
+
+		if (pass->nleaf > CACHEFS_ONDISC_LEAF_PER_BLOCK + 1) {
+			printk(KERN_ERR "\nERROR:\n");
+			cachefs_dump_fanout(op, fan);
+			BUG();
+		}
+
+		if (pass->type == CACHEFS_ASSESS_EMPTY ||
+		    pass->type == CACHEFS_ASSESS_PTRBLK)
+			tmp++;
+
+		tmp += pass->nleaf;
+	}
+
+	if (tmp != CACHEFS_ONDISC_LEAF_PER_BLOCK + 1) {
+		printk(KERN_ERR "\nERROR: tmp=%d\n", tmp);
+		cachefs_dump_fanout(op, fan);
+		BUG();
+	}
+
+	//cachefs_dump_fanout(fan);
+	_leave("");
+
+} /* end cachefs_tree_init_fanout() */
+
+/*****************************************************************************/
+/*
+ * assess the disparity of the leaves in a node
+ * - the node will be full, it may have a mixture of pointer leaves and object
+ *   leaves, though the mixture may be 100% object leaves
+ */
+static void cachefs_tree_assess_fanout(struct cachefs_operation *op,
+				       struct cachefs_fanout *fan,
+				       struct cachefs_ondisc_leaf *key,
+				       void *data)
+{
+	struct cachefs_ondisc_leaf *leaf;
+	struct cachefs_assessment *assess, *pass;
+	struct cachefs_tree *node;
+	int loop, loop2, depth, level;
+	int dslot, xslot, pslot;
+
+	_enter("");
+
+	assess = fan->assess;
+	node = op->p.nodes[0];
+
+	_debug("assess");
+
+	/* if there are pointer blocks present and all the old object leaves
+	 * would dangle from the same pointer, then move them out to a new node
+	 * - there will be fewer objects than slots because some of the slots
+	 *   in the original node will be occupied by pointer leaves
+	 */
+	if (fan->nptrs && fan->common_offset != 0xffffU) {
+		_debug("ptrs and same subkeys");
+
+		pslot = fan->common_offset >> CACHEFS_ONDISC_LEAF_SHIFT;
+		dslot = 0;
+		xslot = -1;
+
+		for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK; loop++) {
+			if (assess[loop].type == CACHEFS_ASSESS_OBJECT) {
+				assess[loop].alter = CACHEFS_ASSESS_MOVE;
+				assess[loop].dest = 1;
+				assess[loop].dslot = dslot++;
+				assess[loop].occupied = 0;
+				if (xslot < 0 && loop != pslot)
+					xslot = loop;
+			}
+		}
+
+		ASSERT(fan->nptrs >= CACHEFS_ONDISC_LEAF_PER_BLOCK - 1 ||
+		       xslot >= 0);
+		if (unlikely(xslot >= CACHEFS_ONDISC_LEAF_PER_BLOCK)) {
+			printk("xslot: %d\n", xslot);
+			BUG();
+		}
+		ASSERT(xslot < CACHEFS_ONDISC_LEAF_PER_BLOCK);
+
+		assess[pslot].occupied = 2;
+		fan->ptrleaf_dest[0] = 0;
+		fan->ptrleaf_dslot[0] = pslot;
+		fan->connection[1] = CACHEFS_FANOUT_CONNECT_TO_0;
+
+		/* install the new object in the appropriate place also */
+		ASSERT(loop == CACHEFS_ONDISC_LEAF_PER_BLOCK);
+		if (assess[loop].offset == fan->common_offset) {
+			assess[loop].dest = 1;
+			assess[loop].dslot = dslot++;
+		}
+		else if (xslot < 0) {
+			ASSERT(fan->nptrs == CACHEFS_ONDISC_LEAF_PER_BLOCK - 1);
+			assess[loop].dest = 2;
+			assess[loop].dslot = 0;
+			fan->connection[2] = CACHEFS_FANOUT_CONNECT_TO_0AUX;
+			fan->nalloc++;
+		}
+		else {
+			assess[loop].dest = 0;
+			assess[loop].dslot = xslot;
+			assess[xslot].occupied = 1;
+		}
+
+		fan->nalloc++;
+		goto out;
+	}
+
+	/* if there are pointer blocks present, but not all the object leaves
+	 * colocate, see if there are leaves that can be promoted to a new node
+	 * such that the new node is pointed to by an existing pointer leaf
+	 */
+	if (fan->nptrs) {
+		_debug("ptrs");
+
+		for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK; loop++) {
+			if (assess[loop].type == CACHEFS_ASSESS_OBJECT &&
+			    assess[assess[loop].pslot].type ==
+			    CACHEFS_ASSESS_PTRBLK
+			    )
+				goto promote_old_leaf_directly;
+		}
+		goto cant_promote_directly;
+
+	promote_old_leaf_directly:
+		fan->common_offset = assess[loop].offset;
+		pslot = assess[loop].pslot;
+
+		_debug("promote direct obj %d -> ptr %u[%hx]",
+		       loop, pslot, fan->common_offset);
+
+		dslot = 0;
+		xslot = -1;
+
+		/* promote all the leaves that match this pointer */
+		for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK; loop++) {
+			if (assess[loop].offset == fan->common_offset) {
+				ASSERT(loop != pslot);
+				assess[loop].alter = CACHEFS_ASSESS_MOVE;
+				assess[loop].dest = 1;
+				assess[loop].dslot = dslot++;
+				assess[loop].occupied = 0;
+				if (xslot < 0)
+					xslot = loop;
+			}
+		}
+
+		assess[pslot].occupied = 2;
+		fan->ptrleaf_dest[0] = 0;
+		fan->ptrleaf_dslot[0] = pslot;
+		fan->connection[1] = CACHEFS_FANOUT_CONNECT_TO_0;
+
+		ASSERT(xslot >= 0);
+
+		/* install the new object in the appropriate place also */
+		if (assess[loop].offset == fan->common_offset) {
+			/* direct subkey can't match the object to be
+			 * installed, else we'd've handled it elsewhere */
+			BUG();
+		}
+		else {
+			assess[loop].dest = 0;
+			assess[loop].dslot = xslot;
+			assess[loop].occupied = 1;
+		}
+
+		fan->nalloc++;
+		goto out;
+	}
+
+cant_promote_directly:
+	/* if not all the object leaves colocate, and none of the leaves will
+	 * dangle from those pointer leaves that exist, then we might be able
+	 * to create a pointer leaf and dangle at least two of the objects (old
+	 * or new) from there
+	 */
+	if (fan->common_offset == 0xffffU ||
+	    fan->common_offset != assess[CACHEFS_ONDISC_LEAF_PER_BLOCK].offset)
+		goto non_colocated;
+
+	/* the only case left should be: no pointer leaves, and all object
+	 * leaves (including the new one) have the same subkey
+	 * - this case involves the production of a shortcut; eating up the
+	 *   keyspace in which they all match until a level is found at which
+	 *   they don't all match
+	 * - we don't create shortcuts of just one level, however; instead we
+	 *   interpolate a pointer leaf and an extra node
+	 */
+	_debug("plumb depth");
+
+	ASSERT(fan->nptrs <= 0);
+	ASSERT(fan->common_offset != 0xffffU);
+	ASSERT(fan->common_offset == assess[CACHEFS_ONDISC_LEAF_PER_BLOCK].offset);
+
+	/* work out the shortest common denominator */
+	for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK; loop++) {
+		int tmp;
+		leaf = data + (loop << op->super->layout->leaf_shift);
+		tmp = cachefs_compare_keys(leaf, key);
+		ASSERT(tmp < 0);
+		assess[loop].common = -tmp;
+	}
+
+	depth = INT_MAX;
+	for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK; loop++)
+		if (assess[loop].common < depth)
+			depth = assess[loop].common;
+	assess[loop].common = depth;
+	level = depth / CACHEFS_ONDISC_LEVEL_BITS;
+	fan->shortcut_levels = level;
+	fan->shortcut_offset = fan->common_offset;
+	_debug("shortcut to %d, level %d -> %d",
+	       assess[loop].common, node->level, level);
+	ASSERT(node->level + 1 <= level);
+
+	for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK; loop++)
+		assess[loop].nleaf = 0;
+
+	/* grab the subkeys for all on the new level */
+	for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK; loop++) {
+		leaf = data + (loop << op->super->layout->leaf_shift);
+		assess[loop].offset = cachefs_extract_subkey(leaf, level);
+		assess[loop].pslot = assess[loop].offset >>
+			CACHEFS_ONDISC_LEAF_SHIFT;
+		assess[assess[loop].pslot].nleaf++;
+	}
+
+	/* ... including the new object */
+	assess[loop].offset = cachefs_extract_subkey(key, level);
+	assess[loop].pslot = assess[loop].offset >> CACHEFS_ONDISC_LEAF_SHIFT;
+	assess[assess[loop].pslot].nleaf++;
+
+	//cachefs_dump_fanout(fan);
+	fan->nalloc = 1;
+
+	/* create a pointer leaf and dangle at least two of the objects (old or
+	 * new) from there
+	 */
+non_colocated:
+	_debug("noncoloc");
+
+	/* see if we can find two objects that share a subkey */
+	pslot = -1;
+	for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK; loop++) {
+		if (assess[loop].type == CACHEFS_ASSESS_PTRBLK)
+			continue;
+
+		if (assess[assess[loop].pslot].nleaf <= 1)
+			continue;
+
+		/* see if another object shares a subkey with this
+		 * one */
+		for (loop2 = loop + 1;
+		     loop2 < CACHEFS_ONDISC_LEAF_PER_BLOCK + 1;
+		     loop2++
+		     ) {
+			if (assess[loop2].offset == assess[loop].offset)
+				goto share_subkey;
+		}
+
+		/* failing that, keep track of sharers of a pointer
+		 * block, preferring those who're residing in the slot
+		 * in which their pointer leaf would reside */
+		xslot = assess[loop].pslot;
+		if (pslot < 0 || assess[xslot].pslot == xslot)
+			pslot = xslot;
+	}
+
+	/* it's not possible to have N+1 objects in N slots without
+	 * at least one overlap
+	 */
+	ASSERT(pslot >= 0);
+
+	/* need to create two new nodes pointed to by a new single
+	 * pointer leaf
+	 */
+	_debug("create two new nodes [%d]", pslot);
+
+	xslot = -1;
+	if (assess[pslot].pslot == pslot) {
+		/* the slot for the pointer leaf we're going to create
+		 * is occupied by an object leaf that can dangle from
+		 * that pointer block */
+		assess[pslot].alter = CACHEFS_ASSESS_MOVE;
+		assess[pslot].dest = 1;
+		assess[pslot].dslot = 0;
+		assess[pslot].occupied = 0;
+		fan->common_offset = assess[pslot].offset;
+		dslot = 1;
+		loop = 0;
+	}
+	else {
+		for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK; loop++)
+			if (assess[loop].pslot == pslot)
+				break;
+		ASSERT(loop < CACHEFS_ONDISC_LEAF_PER_BLOCK);
+
+		if (loop != pslot)
+			xslot = loop;
+		fan->common_offset = assess[loop].offset;
+		dslot = 0;
+	}
+
+	for (loop2 = loop;
+	     loop2 < CACHEFS_ONDISC_LEAF_PER_BLOCK + 1;
+	     loop2++
+	     ) {
+		if (assess[loop2].offset != fan->common_offset ||
+		    assess[loop2].alter != CACHEFS_ASSESS_NOCHANGE)
+			continue;
+		assess[loop2].alter = CACHEFS_ASSESS_MOVE;
+		assess[loop2].dest = 1;
+		assess[loop2].dslot = dslot++;
+		assess[loop2].occupied = 0;
+
+		if (xslot < 0)
+			xslot = loop2;
+	}
+
+	for (loop2 = CACHEFS_ONDISC_LEAF_PER_BLOCK; loop2 >= loop; loop2--)
+		if (assess[loop2].pslot == pslot &&
+		    assess[loop2].alter == CACHEFS_ASSESS_NOCHANGE)
+			break;
+	ASSERT(loop2 >= loop);
+
+	if (xslot < 0)
+		xslot = loop2;
+
+	fan->common_offset = assess[loop2].offset;
+	dslot = 0;
+	for (; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK + 1; loop++) {
+		if (assess[loop].offset != fan->common_offset)
+			continue;
+		assess[loop].alter = CACHEFS_ASSESS_MOVE;
+		assess[loop].dest = 2;
+		assess[loop].dslot = dslot++;
+		assess[loop].occupied = 0;
+	}
+
+	fan->connection[1] = CACHEFS_FANOUT_CONNECT_TO_0;
+	fan->connection[2] = CACHEFS_FANOUT_CONNECT_TO_0AUX;
+	fan->nalloc += 2;
+	goto share_shift;
+
+	/* propose creation of one new node pointed to by a single pointer
+	 * leaf */
+share_subkey:
+	pslot = assess[loop].offset >> CACHEFS_ONDISC_LEAF_SHIFT;
+
+	_debug("share subkey %04x [%d,%d] ptr %d",
+	       assess[loop].offset, loop, loop2, pslot);
+
+	assess[loop].alter = CACHEFS_ASSESS_MOVE;
+	assess[loop].dest = 1;
+	assess[loop].dslot = 0;
+	assess[loop].occupied = 0;
+
+	assess[loop2].alter = CACHEFS_ASSESS_MOVE;
+	assess[loop2].dest = 1;
+	assess[loop2].dslot = 1;
+	assess[loop2].occupied = 0;
+
+	xslot = (loop == pslot) ? loop2 : loop;
+
+	dslot = 2;
+	fan->common_offset = assess[loop2].offset;
+	for (loop2++;
+	     loop2 < CACHEFS_ONDISC_LEAF_PER_BLOCK + 1;
+	     loop2++
+	     ) {
+		if (assess[loop2].offset != fan->common_offset)
+			continue;
+		assess[loop2].alter = CACHEFS_ASSESS_MOVE;
+		assess[loop2].dest = 1;
+		assess[loop2].dslot = dslot++;
+		assess[loop2].occupied = 0;
+	}
+
+	fan->connection[1] = CACHEFS_FANOUT_CONNECT_TO_0;
+	fan->nalloc += 1;
+
+	/* consider proposing sliding an object leaf to make space for a
+	 * pointer leaf */
+share_shift:
+	_debug("shift %d -> %d?", pslot, xslot);
+	if (assess[pslot].alter == CACHEFS_ASSESS_NOCHANGE) {
+		assess[pslot].alter = CACHEFS_ASSESS_SLIDE;
+		assess[pslot].dest = 0;
+		assess[pslot].dslot = xslot;
+		assess[xslot].occupied = 1;
+		xslot = -1;
+	}
+
+	assess[pslot].occupied = 2;
+	fan->ptrleaf_dest[0] = 0;
+	fan->ptrleaf_dslot[0] = pslot;
+
+	/* make sure the new object gets installed */
+	pass = &assess[CACHEFS_ONDISC_LEAF_PER_BLOCK];
+	if (pass->alter != CACHEFS_ASSESS_NOCHANGE)
+		goto share_fixup_shortcut;
+
+	if (xslot >= 0)
+		goto share_install;
+
+	for (xslot = 0; xslot < CACHEFS_ONDISC_LEAF_PER_BLOCK; xslot++)
+		if (!assess[xslot].occupied)
+			goto share_install;
+
+	/* there should be a hole */
+	BUG();
+
+share_install:
+	pass->dest = 0;
+	pass->dslot = xslot;
+	assess[xslot].occupied = 1;
+
+	/* fixup a shortcut to this block */
+share_fixup_shortcut:
+	if (fan->shortcut_levels < 0)
+		goto out;
+
+	for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK + 1; loop++) {
+		assess[loop].dest++;
+		assess[loop].alter = CACHEFS_ASSESS_MOVE;
+		assess[loop].occupied = 0;
+	}
+
+	ASSERT(fan->ptrleaf_dest[1] < 0);
+
+	if (fan->ptrleaf_dest[0] >= 0) {
+		fan->ptrleaf_dest[1] = fan->ptrleaf_dest[0] + 1;
+		fan->ptrleaf_dslot[1] = fan->ptrleaf_dslot[0];
+	}
+	fan->connection[0] = CACHEFS_FANOUT_CONNECTED;
+	fan->connection[2] = CACHEFS_FANOUT_CONNECT_TO_1;
+
+	if (node->level + 1 < fan->shortcut_levels) {
+		/* we're proposing a shortcut */
+		_debug("@@@@ propose shortcut @@@@");
+
+		assess[0].occupied = 3;
+		fan->connection[1] = CACHEFS_FANOUT_SHORTCUT_FROM_0;
+		fan->ptrleaf_dest[0] = -1;
+	}
+	else {
+		/* we're actually proposing a pointer leaf */
+		uint16_t offset;
+
+		_debug("@@@@ propose extra pointer leaf @@@@");
+
+		offset = fan->shortcut_offset;
+		fan->shortcut_levels = -1;
+
+		pslot = offset >> CACHEFS_ONDISC_LEAF_SHIFT;
+		fan->ptrleaf_dest[0] = 0;
+		fan->ptrleaf_dslot[0] = pslot;
+
+		assess[pslot].occupied = 2;
+		fan->connection[1] = CACHEFS_FANOUT_CONNECT_TO_0;
+	}
+
+	if (fan->nalloc == 3)
+		fan->connection[3] = CACHEFS_FANOUT_CONNECT_TO_1AUX;
+	goto out;
+
+	/* return the assessment */
+out:
+	assess[CACHEFS_ONDISC_LEAF_PER_BLOCK].alter = CACHEFS_ASSESS_INSTALL;
+
+	//cachefs_dump_fanout(fan);
+	_leave(" [%d]", fan->nalloc);
+
+} /* end cachefs_tree_assess_fanout() */
+
+/*****************************************************************************/
+/*
+ * realise the assessment previously determined
+ */
+static void cachefs_tree_insert_effect_fanout(struct cachefs_operation *op,
+					      struct cachefs_fanout *fan,
+					      struct cachefs_ondisc_leaf *key)
+{
+	struct cachefs_ondisc_leaf *leaf;
+	struct cachefs_assessment *assess = fan->assess;
+	struct cachefs_tree *dnode;
+	uint16_t offset;
+	void *src, *dest;
+	int loop, loop2;
+
+
+	_enter("{%x},,", op->p.nodes[0]->bix);
+
+	/* initialise the new nodes */
+	_debug("init %p,%p,%p,%p",
+	       op->p.nodes[0], op->p.nodes[1], op->p.nodes[2], op->p.nodes[3]);
+
+	if (fan->dump)
+		cachefs_dump_fanout(op, fan);
+
+	for (loop = 1; loop < 4; loop++) {
+		dnode = op->p.nodes[loop];
+		if (dnode) {
+			_debug("prepare new block %d", loop);
+
+			if (!PageMappedToDisk(dnode->page)) {
+				printk("\n");
+				printk(KERN_ERR
+				       "CacheFS: NOT MAPPED: [%d] %p{%lx}\n",
+				       loop, dnode->page, dnode->page->index);
+				cachefs_dump_fanout(op, fan);
+				BUG();
+			}
+
+			_alter(op->super, "init node %x", dnode->bix);
+
+			dest = kmap_atomic(dnode->page, KM_USER1);
+
+			memset(dest, CACHEFS_EMPTY_FILL, PAGE_SIZE);
+			for (loop2 = 0;
+			     loop2 < CACHEFS_ONDISC_LEAF_PER_BLOCK;
+			     loop2++
+			     ) {
+				leaf = dest + (loop2 << op->super->layout->leaf_shift);
+				leaf->type = CACHEFS_ONDISC_OBJTYPE_EMPTY_SLOT;
+			}
+
+			kunmap_atomic(dest, KM_USER1);
+			SetPageUptodate(dnode->page);
+
+			set_bit(CACHEFS_TREE_NODE_VALID, &dnode->flags);
+			set_bit(CACHEFS_TREE_NODE_VALIDATED, &dnode->flags);
+		}
+	}
+
+	src = kmap_atomic(op->p.nodes[0]->page, KM_USER0);
+
+	/* first of all move leaves out to the new nodes */
+	_debug("move");
+	for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK; loop++) {
+		uint16_t soff, doff;
+
+		if (assess[loop].alter != CACHEFS_ASSESS_MOVE)
+			continue;
+
+		dnode = op->p.nodes[assess[loop].dest];
+		ASSERT(dnode);
+		ASSERT(dnode->page);
+
+		dest = kmap_atomic(dnode->page, KM_USER1);
+
+		soff = loop << op->super->layout->leaf_shift;
+		doff = assess[loop].dslot << op->super->layout->leaf_shift;
+
+		/* move the leaf and reset the slot */
+		cachefs_tree_move_leaf(op->super, op->p.nodes[0], dnode,
+				       src + soff, dest + doff, soff, doff);
+
+		kunmap_atomic(dest, KM_USER1);
+	}
+
+	/* slide any leaf that needs displacing within the source node */
+	_debug("slide");
+	for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK; loop++) {
+		uint16_t soff, doff;
+
+		if (assess[loop].alter != CACHEFS_ASSESS_SLIDE)
+			continue;
+
+		soff = loop << op->super->layout->leaf_shift;
+		doff = assess[loop].dslot << op->super->layout->leaf_shift;
+
+		/* slide the leaf and reset the slot */
+		cachefs_tree_slide_leaf(op->super, op->p.nodes[0],
+					src, soff, doff);
+	}
+
+	/* install the new object */
+	_debug("install");
+
+	dnode = op->p.nodes[assess[CACHEFS_ONDISC_LEAF_PER_BLOCK].dest];
+	offset = assess[CACHEFS_ONDISC_LEAF_PER_BLOCK].dslot;
+	offset <<= op->super->layout->leaf_shift;
+
+	dest = kmap_atomic(dnode->page, KM_USER1);
+	cachefs_tree_install_leaf(op, dnode, key, dest, offset);
+	kunmap_atomic(dest, KM_USER1);
+
+	/* clear any now unoccupied slots */
+	_debug("clear");
+	for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK; loop++) {
+		if (assess[loop].occupied ||
+		    assess[loop].alter == CACHEFS_ASSESS_MOVE ||
+		    assess[loop].alter == CACHEFS_ASSESS_SLIDE)
+			continue;
+
+		_alter(op->super, "erase slot %x[%04x]",
+		       op->p.nodes[0]->bix,
+		       loop << op->super->layout->leaf_shift);
+
+		leaf = src + (loop << op->super->layout->leaf_shift);
+		memset(leaf, CACHEFS_EMPTY_FILL, op->super->layout->leaf_size);
+		leaf->type = CACHEFS_ONDISC_OBJTYPE_EMPTY_SLOT;
+	}
+
+	/* create or alter pointer leaves */
+	for (loop = 0; loop < 2; loop++) {
+		cachefs_block_t *ptr;
+		uint8_t ptrleaf_dslot;
+		int8_t ptrleaf_dest;
+
+		ptrleaf_dest = fan->ptrleaf_dest[loop];
+		if (ptrleaf_dest < 0)
+			continue;
+
+		ptrleaf_dslot = fan->ptrleaf_dslot[loop];
+
+		_debug("crt ptrleaf %d[%u]", ptrleaf_dest, ptrleaf_dslot);
+
+		dnode = op->p.nodes[ptrleaf_dest];
+		dest = kmap_atomic(dnode->page, KM_USER1);
+		leaf = dest + (ptrleaf_dslot << op->super->layout->leaf_shift);
+
+		if (assess[ptrleaf_dslot].type != CACHEFS_ASSESS_PTRBLK) {
+			_alter(op->super, "init ptr leaf %x[%04x]",
+			       dnode->bix,
+			       ptrleaf_dslot << op->super->layout->leaf_shift);
+
+			memset(leaf, CACHEFS_NULL_FILL, op->super->layout->leaf_size);
+		}
+
+		for (loop2 = 1; loop2 < 4; loop2++) {
+			switch (fan->connection[loop2]) {
+			case CACHEFS_FANOUT_CONNECT_TO_0:
+			case CACHEFS_FANOUT_CONNECT_TO_0AUX:
+				if (loop != 0)
+					continue;
+				break;
+
+			case CACHEFS_FANOUT_CONNECT_TO_1:
+			case CACHEFS_FANOUT_CONNECT_TO_1AUX:
+				if (loop != 1)
+					continue;
+				break;
+
+			default:
+				continue;
+			}
+
+			dnode->occupancy++;
+
+			/* connect to this pointer leaf */
+			if (!op->p.nodes[loop2]) {
+				printk(KERN_ERR "CacheFS:"
+				       " Can't connect to absent leaf %d:\n",
+				       loop2);
+				cachefs_dump_fanout(op, fan);
+				BUG();
+			}
+
+			_alter(op->super, "set ptr %x[%04x] to %x",
+			       dnode->bix,
+			       op->p.nodes[loop2]->offset,
+			       op->p.nodes[loop2]->bix);
+
+			ptr = dest + op->p.nodes[loop2]->offset;
+			*ptr = op->p.nodes[loop2]->bix;
+		}
+
+		kunmap_atomic(dest, KM_USER1);
+	}
+
+	/* create a shortcut from node 0 to node 1 */
+	if (fan->shortcut_levels >= 0) {
+		_alter(op->super, "create shortcut %x[0000] to %x {%d,%04x}",
+		       op->p.nodes[0]->bix,
+		       op->p.nodes[1]->bix,
+		       fan->shortcut_levels,
+		       fan->shortcut_offset);
+
+		leaf = src;
+		memset(leaf, CACHEFS_EMPTY_FILL, op->super->layout->leaf_size);
+		leaf->ptr = op->p.nodes[1]->bix;
+		leaf->type = CACHEFS_ONDISC_OBJTYPE_SHORTCUT;
+		leaf->u.shortcut.level = fan->shortcut_levels;
+		leaf->u.shortcut.s_offset = fan->shortcut_offset;
+		leaf->u.shortcut.klen =
+			fan->shortcut_levels * CACHEFS_ONDISC_LEVEL_BITS;
+		cachefs_extract_key(leaf->u.shortcut.key, key,
+				    leaf->u.shortcut.level);
+
+		op->p.nodes[0]->occupancy++;
+	}
+
+	/* note that these blocks have changed */
+	_debug("done");
+	kunmap_atomic(src, KM_USER0);
+
+	for (loop = 0; loop < 4; loop++) {
+		dnode = op->p.nodes[loop];
+		if (!dnode)
+			continue;
+
+		set_page_dirty(dnode->page);
+		unlock_page(dnode->page);
+	}
+
+	_leave("");
+
+} /* end cachefs_tree_insert_effect_fanout() */
+
+/*****************************************************************************/
+/*
+ * dump a fanout
+ */
+static void cachefs_dump_fanout(struct cachefs_operation *op,
+				struct cachefs_fanout *fan)
+{
+	struct cachefs_assessment *pass;
+	int loop;
+
+	printk("$ alloc %d\n", fan->nalloc);
+
+	if (fan->shortcut_levels >= 0)
+		printk("$ create shortcut to %d [%04x]\n",
+		       fan->shortcut_levels, fan->shortcut_offset);
+
+	if (fan->ptrleaf_dest[0] >= 0)
+		printk("$ create ptrleaf %d[%02x]\n",
+		       fan->ptrleaf_dest[0], fan->ptrleaf_dslot[0]);
+
+	if (fan->ptrleaf_dest[1] >= 0)
+		printk("$ create ptrleaf %d[%02x]\n",
+		       fan->ptrleaf_dest[1], fan->ptrleaf_dslot[1]);
+
+	for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK + 1; loop++) {
+		pass = &fan->assess[loop];
+
+		printk("$ [%02x %c #%d] %s %04x %02x -> %s %d[%02x] c:%u i:%llx\n",
+		       loop,
+		       'A' + pass->occupied,
+		       pass->nleaf,
+		       cachefs_assessment_types[pass->type],
+		       pass->offset,
+		       pass->pslot,
+		       cachefs_assessment_alters[pass->alter],
+		       pass->dest,
+		       pass->dslot,
+		       pass->common,
+		       pass->objid);
+	}
+
+	printk("$ nodes: { %p:%2x, %p:%2x, %p:%2x, %p:%2x } %u,%u\n",
+	       op->p.nodes[0], fan->connection[0],
+	       op->p.nodes[1], fan->connection[1],
+	       op->p.nodes[2], fan->connection[2],
+	       op->p.nodes[3], fan->connection[3],
+	       fan->levels[0], fan->levels[1]
+	       );
+
+	printk("$ blocks: { %x, %x, %x, %x }\n",
+	       op->p.nodes[0] ? op->p.nodes[0]->bix : 0,
+	       op->p.nodes[1] ? op->p.nodes[1]->bix : 0,
+	       op->p.nodes[2] ? op->p.nodes[2]->bix : 0,
+	       op->p.nodes[3] ? op->p.nodes[3]->bix : 0);
+
+} /* end cachefs_dump_fanout() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/tree-keys.c linux-2.6.14-mm2-cachefs/fs/cachefs/tree-keys.c
--- linux-2.6.14-mm2/fs/cachefs/tree-keys.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/tree-keys.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,587 @@
+/* tree-keys.c: CacheFS key management
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/bio.h>
+#include "cachefs-int.h"
+
+struct cachefs_digest {
+	uint32_t	hash[1];
+};
+
+struct cachefs_key_extract {
+	uint16_t	koffset;	/* offset of key in leaf */
+	uint16_t	kfixlen;	/* length of fixed part of key (bits) */
+	uint16_t	kvarlen;	/* offset of length of variable part of key */
+	uint16_t	kvarlenshift;	/* amount to shift kvarlen by to get bit count */
+	uint16_t	digestoff;	/* offset of digest in leaf */
+};
+
+static const struct cachefs_key_extract cachefs_key_extract_tbl[] = {
+	[CACHEFS_ONDISC_OBJTYPE_SHORTCUT] = {
+		/* key { data[0..klen] } */
+		.koffset = offsetof(struct cachefs_ondisc_leaf, u.shortcut.key),
+		.kfixlen = 0,
+		.kvarlen = offsetof(struct cachefs_ondisc_leaf, u.shortcut.klen),
+		.kvarlenshift = 0,
+		.digestoff = 0,
+	},
+	[CACHEFS_ONDISC_OBJTYPE_INDEX_OBJECT ...
+	 CACHEFS_ONDISC_OBJTYPE_OTHER_OBJECT
+	 ] = {
+		/* key { digest, parent, netfs_data[0..netfs_klen] } */
+		.koffset = offsetof(struct cachefs_ondisc_leaf, u.object.key),
+		.kfixlen = (sizeof(cachefs_digest_t) + sizeof(uint64_t) + sizeof(uint16_t)) * 8,
+		.kvarlen = offsetof(struct cachefs_ondisc_leaf, u.object.netfs_klen),
+		.kvarlenshift = 3,
+		.digestoff = offsetof(struct cachefs_ondisc_leaf, u.object.key),
+	},
+};
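+
+/* A leaf's key length in bits is recovered from this table as
+ * (*(uint16_t *)(leaf + kvarlen) << kvarlenshift) + kfixlen; for object
+ * leaves the variable part (netfs_klen) is stored in bytes, hence the
+ * shift of 3
+ */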
+
+/*****************************************************************************/
+/*
+ * extract the level'th subkey of a key
+ * - we assume we extract from bit 0 upwards as LSB data
+ * - each level is CACHEFS_ONDISC_LEVEL_BITS of the key
+ * - we return the subkey shifted for use as an offset
+ */
+unsigned cachefs_extract_subkey(const struct cachefs_ondisc_leaf *leaf,
+				int level)
+{
+	const struct cachefs_key_extract *ex;
+	unsigned subkey, klen;
+	uint8_t *key;
+
+	_enter("%u,%d", leaf->type, level);
+
+	ex = &cachefs_key_extract_tbl[leaf->type];
+
+	/* find the key data */
+	key = (uint8_t *) leaf + ex->koffset;
+
+	/* work out the number of bits in the key */
+	klen = 0;
+	if (ex->kvarlen) {
+		klen = *(uint16_t *)((unsigned long) leaf + ex->kvarlen);
+		klen <<= ex->kvarlenshift;
+	}
+	klen += ex->kfixlen;
+
+	_debug("%02x%02x%02x%02x%02x%02x%02x%02x [%u] : %d",
+	       key[0], key[1], key[2], key[3], key[4], key[5], key[6], key[7],
+	       klen, level);
+
+	/* skip the initial whole bytes of the key */
+	level *= CACHEFS_ONDISC_LEVEL_BITS;
+
+	key += level >> 3;
+	klen -= level & ~7;
+	level &= 7;
+
+	/* the next three bytes contain all the bits we could possibly want;
+	 * but which bits are which may somewhat depend on byte order
+	 */
+	subkey = *key++;
+	subkey |= *key++ << 8;
+	subkey |= *key << 16;
+
+	/* limit the key to the correct number of bits
+	 *
+	 *	klen	mask	1<<k	-1
+	 *	1	000001	000002	000001
+	 *	2	000003	000004	000003
+	 *	3	000007	000008	000007
+	 *	22	3FFFFF	400000	3FFFFF
+	 *	23	7FFFFF	800000	7FFFFF
+	 */
+	if (klen < 24) {
+		ASSERT(klen != 0);
+		subkey &= (1 << klen) - 1;
+	}
+
+	/* find just the bits we actually want
+	 *
+	 *      <-3rd--><-2nd--><-1st-->
+	 *	   20        10        0
+	 *	321*987654321*987654321*
+	 *	              <--------> width=10, level=0
+	 *	             <-------->|
+	 *	            <--------> |
+	 *	           <-------->  |
+	 *	          <-------->   |
+	 *	         <-------->    |
+	 *	        <-------->     |
+	 *	       <-------->      | width=10, level=7
+	 *	          <------------> width=14, level=0
+	 *	         <------------>|
+	 *	        <------------> |
+	 *	       <------------>  |
+	 *	      <------------>   |
+	 *	     <------------>    |
+	 *	    <------------>     |
+	 *	   <------------>      | width=14, level=7
+	 */
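+	/* e.g. with the 10-bit width shown above and level 7 within this
+	 * window, we shift right by 7, mask off CACHEFS_ONDISC_LEVEL_BITS
+	 * bits and then shift left by CACHEFS_BLOCK_SHIFT so that the result
+	 * can be used directly as the byte offset of a pointer within the
+	 * node block
+	 */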
+	subkey >>= level;
+	subkey &= (1 << CACHEFS_ONDISC_LEVEL_BITS) - 1;
+	subkey <<= CACHEFS_BLOCK_SHIFT;
+
+	_leave(" = %x", subkey);
+	return subkey;
+
+} /* end cachefs_extract_subkey() */
+
+/*****************************************************************************/
+/*
+ * extract the level'th subkey of an object's key
+ * - we assume we extract from bit 0 upwards as LSB data
+ * - each level is CACHEFS_ONDISC_LEVEL_BITS of the key
+ * - we return the subkey shifted for use as an offset
+ */
+unsigned cachefs_extract_subkey_obj(const struct cachefs_object *object,
+				    int level)
+{
+	unsigned subkey, klen;
+	uint8_t *key;
+
+	_enter(",%d", level);
+
+	key = object->key;
+	klen = object->keylen << 3;
+
+	_debug("%02x%02x%02x%02x%02x%02x%02x%02x [%u] : %d",
+	       key[0], key[1], key[2], key[3], key[4], key[5], key[6], key[7],
+	       klen, level);
+
+	/* skip the initial whole bytes of the key */
+	level *= CACHEFS_ONDISC_LEVEL_BITS;
+
+	key += level >> 3;
+	klen -= level & ~7;
+	level &= 7;
+
+	/* the next three bytes contain all the bits we could possibly want;
+	 * but which bits are which may somewhat depend on byte order
+	 */
+	subkey = *key++;
+	subkey |= *key++ << 8;
+	subkey |= *key << 16;
+
+	/* limit the key to the correct number of bits
+	 *
+	 *	klen	mask	1<<k	-1
+	 *	1	000001	000002	000001
+	 *	2	000003	000004	000003
+	 *	3	000007	000008	000007
+	 *	22	3FFFFF	400000	3FFFFF
+	 *	23	7FFFFF	800000	7FFFFF
+	 */
+	if (klen < 24) {
+		ASSERT(klen != 0);
+		subkey &= (1 << klen) - 1;
+	}
+
+	/* find just the bits we actually want
+	 *
+	 *      <-3rd--><-2nd--><-1st-->
+	 *	   20        10        0
+	 *	321*987654321*987654321*
+	 *	              <--------> width=10, level=0
+	 *	             <-------->|
+	 *	            <--------> |
+	 *	           <-------->  |
+	 *	          <-------->   |
+	 *	         <-------->    |
+	 *	        <-------->     |
+	 *	       <-------->      | width=10, level=7
+	 *	          <------------> width=14, level=0
+	 *	         <------------>|
+	 *	        <------------> |
+	 *	       <------------>  |
+	 *	      <------------>   |
+	 *	     <------------>    |
+	 *	    <------------>     |
+	 *	   <------------>      | width=14, level=7
+	 */
+	subkey >>= level;
+	subkey &= (1 << CACHEFS_ONDISC_LEVEL_BITS) - 1;
+	subkey <<= CACHEFS_BLOCK_SHIFT;
+
+	_leave(" = %x", subkey);
+	return subkey;
+
+} /* end cachefs_extract_subkey_obj() */
+
+/*****************************************************************************/
+/*
+ * extract a number of bits from a key
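+ * - copies the first level * CACHEFS_ONDISC_LEVEL_BITS bits of the leaf's
+ *   key into buffer, whole bytes first and then a masked partial byte
+ * - used, for instance, to record the key prefix covered by a new shortcut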
+ */
+void cachefs_extract_key(uint8_t *buffer,
+			 const struct cachefs_ondisc_leaf *leaf,
+			 int level)
+{
+	const struct cachefs_key_extract *ex;
+	unsigned klen, n;
+	uint8_t *key, b;
+
+	_enter("%u,%d", leaf->type, level);
+
+	ex = &cachefs_key_extract_tbl[leaf->type];
+
+	/* find the key data */
+	key = (uint8_t *) leaf + ex->koffset;
+
+	/* work out the number of bits in the key */
+	klen = 0;
+	if (ex->kvarlen) {
+		klen = *(uint16_t *)((unsigned long) leaf + ex->kvarlen);
+		klen <<= ex->kvarlenshift;
+	}
+	klen += ex->kfixlen;
+
+	_debug("%02x%02x%02x%02x%02x%02x%02x%02x [%u]",
+	       key[0], key[1], key[2], key[3], key[4], key[5], key[6], key[7],
+	       klen);
+
+	level *= CACHEFS_ONDISC_LEVEL_BITS;
+
+	/* copy the whole bytes first */
+	n = min(klen, (unsigned) level) & ~7;
+	level -= n;
+
+	memcpy(buffer, key, n >> 3);
+	if (level == 0) {
+		_leave(" [exact]");
+		return;
+	}
+
+	klen -= n;
+	buffer += n >> 3;
+	key += n >> 3;
+
+	/* copy the partial byte, limiting the number of bits */
+	n = min(klen, (unsigned) level);
+	ASSERT(n > 0 && n <= 7);
+
+	b = *key;
+	b &= (1 << n) - 1;
+	*buffer = b;
+
+	_leave(" [part]");
+
+} /* end cachefs_extract_key() */
+
+/*****************************************************************************/
+/*
+ * compare the keys of two leaves
+ * - we assume we scan from bit 0 upwards as LSB data
+ * - return
+ *	0 if keys identical
+ *	1 if b is a subkey of a
+ *	2 if a is a subkey of b
+ *	-N if they don't match, where N is the number of bits they do match to
+ * - set *diff to:
+ *	0 if keys identical
+ *	+ve if a > b
+ *	-ve if a < b
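+ * - e.g. the tree walk follows a shortcut when comparing the search key
+ *   against the shortcut leaf returns 1 (the shortcut key is a subkey of
+ *   the search key)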
+ */
+static inline int cachefs_compare_keys_aux(uint8_t *akey,
+					   unsigned aklen,
+					   const struct cachefs_ondisc_leaf *b,
+					   int *diff)
+{
+	const struct cachefs_key_extract *bex;
+	unsigned bklen, minlen;
+	uint8_t *bkey, xor, mask;
+	int loop, stop;
+
+	_enter("");
+
+	bex = &cachefs_key_extract_tbl[b->type];
+
+	/* find the key data */
+	bkey = (uint8_t *) b + bex->koffset;
+
+	/* work out the number of bits in each key */
+	bklen = 0;
+
+	if (bex->kvarlen) {
+		bklen = *(uint16_t *)((unsigned long) b + bex->kvarlen);
+		bklen <<= bex->kvarlenshift;
+	}
+	bklen += bex->kfixlen;
+
+	_debug("a: %02x%02x%02x%02x%02x%02x%02x%02x [%u]",
+	       akey[0], akey[1], akey[2], akey[3], akey[4], akey[5], akey[6], akey[7],
+	       aklen);
+
+	_debug("b: %02x%02x%02x%02x%02x%02x%02x%02x [%u]",
+	       bkey[0], bkey[1], bkey[2], bkey[3], bkey[4], bkey[5], bkey[6], bkey[7],
+	       bklen);
+
+	minlen = min(aklen, bklen);
+
+	_debug("min %u", minlen);
+
+	/* work through the common number of bytes */
+	stop = minlen >> 3;
+	for (loop = 0; loop < stop; loop++) {
+		xor = *akey ^ *bkey;
+		if (xor != 0) {
+			*diff = (int) *akey - (int) *bkey;
+			goto partial_match;
+		}
+		akey++;
+		bkey++;
+	}
+
+	aklen -= minlen & ~7;
+	bklen -= minlen & ~7;
+	minlen &= 7;
+
+	/* deal with one or both keys now being out of bits */
+short_key:
+	_debug("short l=%d a=%u b=%u m=%u", loop, aklen, bklen, minlen);
+
+	if (bklen == 0) {
+		if (aklen == 0) {
+			*diff = 0;
+			_leave(" = 0 [same]");
+			return 0;
+		}
+		*diff = 1;
+		_leave(" = 1 [b short]");
+		return 1;
+	}
+	if (aklen == 0) {
+		*diff = -1;
+		_leave(" = 2 [a short]");
+		return 2;
+	}
+
+	/* less than a byte is left of one of the keys */
+	mask = (1 << minlen) - 1;
+	xor = (*akey ^ *bkey) & mask;
+	if (xor == 0) {
+		/* they're the same */
+		aklen -= minlen;
+		bklen -= minlen;
+		goto short_key;
+	}
+
+	*diff = (int) (*akey & mask) - (int) (*bkey & mask);
+
+partial_match:
+	_debug("partial: l=%u x=%x ffs=%d", loop, xor, ffs(xor));
+	loop = -((loop << 3) + ffs(xor));
+	_leave(" = %d [partial]", loop);
+	return loop;
+
+} /* end cachefs_compare_keys_aux() */
+
+/*****************************************************************************/
+/*
+ * compare the keys of two leaves
+ */
+int cachefs_compare_keys(const struct cachefs_ondisc_leaf *a,
+			 const struct cachefs_ondisc_leaf *b)
+{
+	const struct cachefs_key_extract *aex;
+	unsigned aklen;
+	uint8_t *akey;
+	int diff;
+
+	_enter("");
+
+	aex = &cachefs_key_extract_tbl[a->type];
+
+	/* find the key data */
+	akey = (uint8_t *) a + aex->koffset;
+
+	/* work out the number of bits in each key */
+	aklen = 0;
+
+	if (aex->kvarlen) {
+		aklen = *(uint16_t *)((unsigned long) a + aex->kvarlen);
+		aklen <<= aex->kvarlenshift;
+	}
+	aklen += aex->kfixlen;
+
+	return cachefs_compare_keys_aux(akey, aklen, b, &diff);
+
+} /* end cachefs_compare_keys() */
+
+/*****************************************************************************/
+/*
+ * compare the key copy attached to an object with that in a leaf
+ */
+int cachefs_compare_keys_obj(const struct cachefs_object *a,
+			     const struct cachefs_ondisc_leaf *b)
+{
+	int diff;
+	return cachefs_compare_keys_aux(a->key, a->keylen << 3, b, &diff);
+
+} /* end cachefs_compare_keys_obj() */
+
+/*****************************************************************************/
+/*
+ * compare the keys for ordering
+ */
+int cachefs_keycmp(const struct cachefs_ondisc_leaf *a,
+		   const struct cachefs_ondisc_leaf *b)
+{
+	const struct cachefs_key_extract *aex;
+	unsigned aklen;
+	uint8_t *akey;
+	int diff;
+
+	_enter("");
+
+	aex = &cachefs_key_extract_tbl[a->type];
+
+	/* find the key data */
+	akey = (uint8_t *) a + aex->koffset;
+
+	/* work out the number of bits in each key */
+	aklen = 0;
+
+	if (aex->kvarlen) {
+		aklen = *(uint16_t *)((unsigned long) a + aex->kvarlen);
+		aklen <<= aex->kvarlenshift;
+	}
+	aklen += aex->kfixlen;
+
+	cachefs_compare_keys_aux(akey, aklen, b, &diff);
+	return diff;
+
+} /* end cachefs_keycmp() */
+
+/*****************************************************************************/
+/*
+ * compare the keys
+ * - returns 0 if total match or if b is a proper subkey of a
+ * - returns +ve if a > b
+ * - returns -ve if a < b
+ */
+int cachefs_keycmp_obj(const struct cachefs_object *a,
+		       const struct cachefs_ondisc_leaf *b)
+{
+	int result, diff;
+
+	result = cachefs_compare_keys_aux(a->key, a->keylen << 3, b, &diff);
+
+	return result == 1 ? 0 : diff;
+
+} /* end cachefs_keycmp_obj() */
+
+/*****************************************************************************/
+/*
+ * initialise a digest workspace
+ */
+static inline void cachefs_key_init_digest(struct cachefs_digest *workspace)
+{
+	memset(workspace, 0, sizeof(*workspace));
+
+} /* end cachefs_key_init_digest() */
+
+/*****************************************************************************/
+/*
+ * add data into a digest
+ */
+static inline void cachefs_key_digest(struct cachefs_digest *workspace,
+				      const void *data, size_t len)
+{
+	const uint8_t *p = data;
+	uint32_t x;
+
+	/* simple hash for now */
+	x = workspace->hash[0];
+	for (; len > 0; len--) {
+		x = rol32(x, 5) ^ 0xa5;
+		x += *p++;
+		x = rol32(x, 2) ^ 0x63;
+	}
+	workspace->hash[0] = x;
+
+} /* end cachefs_key_digest() */
+
+/*****************************************************************************/
+/*
+ * generate a key from a partial digest
+ */
+static inline void cachefs_key_generate(cachefs_digest_t *digest,
+					struct cachefs_digest *workspace)
+{
+	digest->csum[0] = workspace->hash[0];
+
+	_leave(" [%08x]", digest->csum[0]);
+
+} /* end cachefs_key_generate() */
+
+/*****************************************************************************/
+/*
+ * produce a digest for a key and attach a copy of the key to the object
+ */
+int cachefs_digest_key(struct cachefs_object *object,
+		       const struct cachefs_ondisc_leaf *leaf)
+{
+	const struct cachefs_key_extract *ex;
+	struct cachefs_digest workspace;
+	cachefs_digest_t *digest;
+	unsigned klen;
+	uint8_t *key;
+
+	_enter("");
+
+	/* initialise the workspace */
+	cachefs_key_init_digest(&workspace);
+
+	ex = &cachefs_key_extract_tbl[leaf->type];
+
+	/* find the key data */
+	key = (void *) leaf + ex->koffset;
+	digest = (void *) leaf + ex->digestoff;
+
+	/* no digest yet */
+	memset(digest, 0, sizeof(*digest));
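+
+	/* note that for object leaves the digest occupies the leading bytes
+	 * of the key (digestoff == koffset in the table above), so once
+	 * written back it supplies the subkeys used for the upper levels of
+	 * the tree
+	 */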
+
+	/* work out the number of bits in the key */
+	klen = 0;
+	if (ex->kvarlen) {
+		klen = *(uint16_t *)((unsigned long) leaf + ex->kvarlen);
+		klen <<= ex->kvarlenshift;
+	}
+	klen += ex->kfixlen;
+	klen >>= 3;
+	object->keylen = klen;
+
+	_debug("%02x%02x%02x%02x%02x%02x%02x%02x [%u]",
+	       key[0], key[1], key[2], key[3], key[4], key[5], key[6], key[7],
+	       klen);
+
+	cachefs_key_digest(&workspace, key, klen);
+	cachefs_key_generate(digest, &workspace);
+
+	object->key = kmalloc(object->keylen + 2, GFP_KERNEL);
+	if (!object->key) {
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	memcpy(object->key, key, object->keylen);
+	object->key[object->keylen] = 0;
+	object->key[object->keylen + 1] = 0;
+
+	_leave(" = 0 [%08x]", digest->csum[0]);
+	return 0;
+
+} /* end cachefs_digest_key() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/tree-list.c linux-2.6.14-mm2-cachefs/fs/cachefs/tree-list.c
--- linux-2.6.14-mm2/fs/cachefs/tree-list.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/tree-list.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,544 @@
+/* tree-list.c: node + children management
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "cachefs-int.h"
+
+/*****************************************************************************/
+/*
+ * link a tree node to its parent
+ * - caller must have the parent's rwlock write locked
+ * - caller must increment the refcount of node
+ * - caller must change node's parent pointer
+ */
+void __cachefs_tree_link_to_node(struct cachefs_tree *node,
+				 struct cachefs_tree *parent)
+{
+	struct cachefs_tree *xnode;
+	struct rb_node **pp, *p;
+
+	_enter("{%x,%d,%04x},{%x}",
+	       node->bix, node->type, node->offset, parent->bix);
+
+	ASSERT(node->offset < PAGE_SIZE);
+
+	p = NULL;
+	pp = &parent->nodes.rb_node;
+
+	while (*pp) {
+		p = *pp;
+		xnode = rb_entry(p, struct cachefs_tree, node_rb);
+
+		if (xnode->type < node->type)
+			pp = &(*pp)->rb_left;
+		else if (xnode->type > node->type)
+			pp = &(*pp)->rb_right;
+		else if (xnode->offset < node->offset)
+			pp = &(*pp)->rb_left;
+		else if (xnode->offset > node->offset)
+			pp = &(*pp)->rb_right;
+		else
+			BUG(); // unexpectedly matched
+	}
+
+	rb_link_node(&node->node_rb, p, pp);
+	rb_insert_color(&node->node_rb, &parent->nodes);
+	set_bit(CACHEFS_TREE_INSTALLED, &node->flags);
+
+	/* add a shortcut to the key-based tree too */
+	if (node->type == CACHEFS_TREE_TYPE_SHORTCUT) {
+		void *node_data;
+		int diff;
+
+		node_data = kmap_atomic(parent->page, KM_USER1);
+
+		p = NULL;
+		pp = &parent->shortcuts.rb_node;
+
+		while (*pp) {
+			p = *pp;
+			xnode = rb_entry(p, struct cachefs_tree, aux_rb);
+
+			diff = cachefs_keycmp(node_data + xnode->offset,
+					      node_data + node->offset);
+
+			if (diff < 0)
+				pp = &(*pp)->rb_left;
+			else if (diff > 0)
+				pp = &(*pp)->rb_right;
+			else
+				BUG(); // unexpectedly matched
+		}
+
+		kunmap_atomic(node_data, KM_USER1);
+
+		rb_link_node(&node->aux_rb, p, pp);
+		rb_insert_color(&node->aux_rb, &parent->shortcuts);
+		set_bit(CACHEFS_TREE_S_INSTALLED, &node->flags);
+	}
+
+	_leave("");
+
+} /* end __cachefs_tree_link_to_node() */
+
+/*****************************************************************************/
+/*
+ * link a level 1 dataptr tree node to an object
+ */
+void cachefs_tree_link_to_object(struct cachefs_tree *node,
+				 struct cachefs_object *object)
+{
+	struct cachefs_tree *xnode;
+	struct rb_node **pp, *p;
+
+	_enter("{%x,%lx},{%llx}", node->bix, node->index, object->objid);
+
+	_debug("### Linking %x to obj %llx", node->bix, object->objid);
+
+	write_lock(&object->lock);
+
+	node->object = object;
+
+	p = NULL;
+	pp = &object->dataptrblks.rb_node;
+	while (*pp) {
+		p = *pp;
+		xnode = rb_entry(p, struct cachefs_tree, aux_rb);
+
+		if (xnode->index < node->index)
+			pp = &(*pp)->rb_left;
+		else if (xnode->index > node->index)
+			pp = &(*pp)->rb_right;
+		else if (xnode == node)
+			goto already_done;
+		else {
+			printk("\n");
+			printk(KERN_ERR
+			       "CacheFS: Multiple link"
+			       " %p{%x} (%lx) vs %p{%x} (%lx)\n",
+			       node, node->bix, node->index,
+			       xnode, xnode->bix, xnode->index);
+			printk("\n");
+			BUG();
+		}
+	}
+
+	rb_link_node(&node->aux_rb, p, pp);
+	rb_insert_color(&node->aux_rb, &object->dataptrblks);
+
+already_done:
+	write_unlock(&object->lock);
+	_leave("");
+
+} /* end cachefs_tree_link_to_object() */
+
+/*****************************************************************************/
+/*
+ * search a node's child nodes for the child attached to a particular slot
+ */
+struct cachefs_tree *cachefs_tree_find_node(struct cachefs_tree *node,
+					    uint8_t type,
+					    uint16_t offset)
+{
+	struct cachefs_tree *xnode;
+	struct rb_node *p = node->nodes.rb_node;
+
+	_enter("{%x},%u,%04hx", node->bix, type, offset);
+
+	while (p) {
+		xnode = rb_entry(p, struct cachefs_tree, node_rb);
+
+		if (xnode->type < type)
+			p = p->rb_left;
+		else if (xnode->type > type)
+			p = p->rb_right;
+		else if (xnode->offset < offset)
+			p = p->rb_left;
+		else if (xnode->offset > offset)
+			p = p->rb_right;
+		else {
+			cachefs_tree_debugcheck(xnode);
+			if (atomic_read(&xnode->usage) <= 0)
+				return NULL; /* prevent a race with put */
+			return xnode;
+		}
+	}
+
+	_leave(" = NULL");
+	return NULL;
+
+} /* end cachefs_tree_find_node() */
+
+/*****************************************************************************/
+/*
+ * search a node's shortcuts for one with a matching key
+ */
+struct cachefs_tree *cachefs_tree_find_shortcut(struct cachefs_tree *node,
+						struct cachefs_ondisc_leaf *key)
+{
+	struct cachefs_tree *xnode;
+	struct rb_node *p = node->shortcuts.rb_node;
+	void *node_data;
+	int diff;
+
+	_enter("{%x},", node->bix);
+
+	if (p) {
+		ASSERT(node->page != NULL);
+		node_data = kmap_atomic(node->page, KM_USER1);
+
+		while (p) {
+			xnode = rb_entry(p, struct cachefs_tree, aux_rb);
+
+			diff = cachefs_keycmp(node_data + xnode->offset, key);
+
+			if (diff < 0)
+				p = p->rb_left;
+			else if (diff > 0)
+				p = p->rb_right;
+			else {
+				cachefs_tree_debugcheck(xnode);
+				if (atomic_read(&xnode->usage) <= 0)
+					goto not_found; /* prevent a race with
+							 * put */
+
+				kunmap_atomic(node_data, KM_USER1);
+				return xnode;
+			}
+		}
+
+	not_found:
+		kunmap_atomic(node_data, KM_USER1);
+	}
+
+	_leave(" = NULL");
+	return NULL;
+
+} /* end cachefs_tree_find_shortcut() */
+
+/*****************************************************************************/
+/*
+ * search a node's shortcuts for one that matches an object's key
+ */
+struct cachefs_tree *cachefs_tree_find_shortcut_obj(struct cachefs_tree *node,
+						    struct cachefs_object *obj)
+{
+	struct cachefs_tree *xnode;
+	struct rb_node *p = node->shortcuts.rb_node;
+	void *node_data;
+	int diff;
+
+	_enter("{%x},{%llx}", node->bix, obj->objid);
+
+	node_data = kmap_atomic(node->page, KM_USER1);
+
+	while (p) {
+		xnode = rb_entry(p, struct cachefs_tree, aux_rb);
+
+		diff = -cachefs_keycmp_obj(obj, node_data + xnode->offset);
+
+		if (diff < 0)
+			p = p->rb_left;
+		else if (diff > 0)
+			p = p->rb_right;
+		else {
+			cachefs_tree_debugcheck(xnode);
+			if (atomic_read(&xnode->usage) <= 0)
+				goto not_found; /* prevent a race with put */
+
+			kunmap_atomic(node_data, KM_USER1);
+			return xnode;
+		}
+	}
+
+not_found:
+	kunmap_atomic(node_data, KM_USER1);
+	_leave(" = NULL");
+	return NULL;
+
+} /* end cachefs_tree_find_shortcut_obj() */
+
+/*****************************************************************************/
+/*
+ * search a node's child objects for the object in a particular slot
+ */
+struct cachefs_object *cachefs_tree_find_object(struct cachefs_tree *node,
+						uint16_t offset)
+{
+	struct cachefs_object *xobj;
+	struct rb_node *p = node->objects.rb_node;
+
+	_enter("{%x},%04hx", node->bix, offset);
+
+	while (p) {
+		xobj = rb_entry(p, struct cachefs_object, node_rb);
+
+		if (xobj->offset < offset)
+			p = p->rb_left;
+		else if (xobj->offset > offset)
+			p = p->rb_right;
+		else
+			return xobj;
+	}
+
+	_leave(" = NULL");
+	return NULL;
+
+} /* end cachefs_tree_find_object() */
+
+/*****************************************************************************/
+/*
+ * search an object's data pointer blocks for the level 1 block covering a
+ * particular page index
+ */
+struct cachefs_tree *cachefs_tree_find_level1_dataptr(struct cachefs_object *object,
+						      unsigned long index)
+{
+	struct cachefs_tree *xnode;
+	struct rb_node *p = object->dataptrblks.rb_node;
+
+	_enter("{%llx},%lx", object->objid, index);
+
+	index &= ~(CACHEFS_ONDISC_PTR_PER_BLOCK - 1);
+
+	while (p) {
+		xnode = rb_entry(p, struct cachefs_tree, aux_rb);
+
+		if (xnode->index < index)
+			p = p->rb_left;
+		else if (xnode->index > index)
+			p = p->rb_right;
+		else
+			return atomic_read(&xnode->usage) > 0 ?
+				cachefs_tree_get(xnode) : NULL;
+	}
+
+	_leave(" = NULL");
+	return NULL;
+
+} /* end cachefs_tree_find_level1_dataptr() */
+
+/*****************************************************************************/
+/*
+ * link an object to a node
+ * - caller must have node's rwlock write locked
+ * - caller must have set the object's node pointer
+ * - dupctl controls duplicate object record handling (normal, cull list, reap)
+ *   - 0: displace entry from culling list; -EEXIST if being reaped
+ *   - 1: add to culling list if not present; -EEXIST if present
+ *   - 2: displace entry from culling list; add new to reaper; -EEXIST if being
+ *        reaped already
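+ * - for dupctl 1 and 2 the caller's reference on the object is consumed
+ *   (see the branch comments below)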
+ */
+int __cachefs_tree_link_object(struct cachefs_super *super,
+			       struct cachefs_object *object,
+			       struct cachefs_tree *node,
+			       int dupctl)
+{
+	struct cachefs_object *xobject, *killobj;
+	struct rb_node **pp, *p;
+
+	_enter("{%llx,%04hx,%x},{%x}",
+	       object->objid, object->offset, atomic_read(&object->usage),
+	       node->bix);
+
+	ASSERT(object->offset < 0xffff);
+	ASSERT(object->objid >= CACHEFS_ONDISC_FSDEF_OBJID);
+
+	/* firstly attempt to attach the object to the containing node, indexed
+	 * by offset */
+	p = NULL;
+	pp = &node->objects.rb_node;
+	while (*pp) {
+		p = *pp;
+		xobject = rb_entry(p, struct cachefs_object, node_rb);
+
+		if (xobject->offset < object->offset)
+			pp = &(*pp)->rb_left;
+		else if (xobject->offset > object->offset)
+			pp = &(*pp)->rb_right;
+		else
+			goto found_duplicate;
+	}
+
+	/* normal addition can be dealt with immediately */
+	if (dupctl == 0) {
+		rb_link_node(&object->node_rb, p, pp);
+		rb_insert_color(&object->node_rb, &node->objects);
+		_leave(" = 0");
+		return 0;
+	}
+
+	/* add to the cull queue or the scanner's reap hook if required */
+	killobj = NULL;
+	spin_lock(&super->objects_lock);
+
+	if (dupctl == 1) {
+		/* try to add to the cull queue
+		 * - consumes caller's ref on this object
+		 */
+		struct list_head *_p;
+
+		if (super->scan_nculls == 0) {
+			ASSERT(list_empty(&super->scan_culls));
+			list_add(&object->cull_link, &super->scan_culls);
+			super->scan_nculls++;
+			goto actually_add_to_node;
+		}
+
+		ASSERTIF(super->scan_nculls > 0,
+			 !list_empty(&super->scan_culls));
+
+		if (super->scan_nculls == super->scan_maxculls) {
+			/* queue is full, see if we should displace one */
+			xobject = list_entry(super->scan_culls.prev,
+					     struct cachefs_object, cull_link);
+
+			if (object->atime > xobject->atime)
+				goto cant_add_to_cull_queue;
+
+			ASSERT(atomic_read(&xobject->usage) == 1);
+			list_move(&xobject->cull_link, &super->scan_xculls);
+			killobj = xobject;
+			_debug("- kill displaced cull %p{%llx,%d}",
+			       killobj, killobj->objid,
+			       atomic_read(&object->usage));
+			goto add_to_cull_queue;
+		}
+
+		ASSERT(super->scan_nculls <= super->scan_maxculls);
+		super->scan_nculls++;
+
+	add_to_cull_queue:
+		/* scan backwards through the queue looking for a point at
+		 * which to insert */
+		_p = super->scan_culls.prev;
+		do {
+			xobject = list_entry(_p,
+					     struct cachefs_object, cull_link);
+
+			if (object->atime <= xobject->atime)
+				break;
+
+			_p = _p->prev;
+		} while (_p != &super->scan_culls);
+
+		list_add_tail(&object->cull_link, _p);
+	}
+	else if (dupctl == 2) {
+		/*  dangle from the reap point
+		 * - consumes caller's ref on this object
+		 */
+		object->being_reaped = 1;
+		super->scan_reap = object;
+	}
+
+	/* we now know we're definitely going to add the object to the node */
+actually_add_to_node:
+	rb_link_node(&object->node_rb, p, pp);
+	rb_insert_color(&object->node_rb, &node->objects);
+
+	if (killobj) {
+		rb_erase(&killobj->node_rb, &killobj->node->objects);
+		memset(&killobj->node_rb, 0, sizeof(killobj->node_rb));
+		killobj->offset = 0xffffU;
+	}
+
+	spin_unlock(&super->objects_lock);
+	_leave(" = 0");
+	return 0;
+
+	/* if the object can't be put in the cull queue then we don't want
+	 * this placeholder */
+cant_add_to_cull_queue:
+	spin_unlock(&super->objects_lock);
+	_leave(" = -ENOSPC");
+	return -ENOSPC;
+
+	/* found a duplicate
+	 * - we can replace that with the new object if that's just a
+	 *   placeholder in the culling queue
+	 */
+found_duplicate:
+	if (dupctl != 1) {
+		spin_lock(&super->objects_lock);
+
+		if (!list_empty(&xobject->cull_link))
+			goto replace_object_in_node;
+
+		ASSERT(xobject->being_reaped);
+		ASSERT(super->scan_reap == xobject ||
+		       super->reaper_target == xobject);
+
+		spin_unlock(&super->objects_lock);
+	}
+
+	_leave(" = -EEXIST");
+	return -EEXIST;
+
+replace_object_in_node:
+	/* displace culling queue placeholder */
+	list_move(&xobject->cull_link, &super->scan_xculls);
+	super->scan_nculls--;
+
+	rb_replace_node(&xobject->node_rb, &object->node_rb, &node->objects);
+	memset(&xobject->node_rb, 0, sizeof(xobject->node_rb));
+	xobject->offset = 0xffffU;
+
+	_debug("- kill replaced cull %p{%llx,%d}",
+	       xobject, xobject->objid, atomic_read(&xobject->usage));
+
+	spin_unlock(&super->objects_lock);
+
+	_leave(" = 0 [replaced]");
+	return 0;
+
+} /* end __cachefs_tree_link_object() */
+
+/*****************************************************************************/
+/*
+ * move an object from one node to another
+ * - both nodes must have their rwlocks write-locked by the caller
+ * - the caller must adjust the refcounts on the nodes
+ */
+void __cachefs_tree_move_object_to_node(struct cachefs_object *object,
+					struct cachefs_tree *from,
+					struct cachefs_tree *to,
+					uint16_t to_offset)
+{
+	struct cachefs_object *xobject;
+	struct rb_node **pp, *p;
+
+	_enter("{%llx},{%x},{%x},%04x",
+	       object->objid, from->bix, to->bix, to_offset);
+
+	rb_erase(&object->node_rb, &from->objects);
+
+	object->offset = to_offset;
+
+	/* attach to the destination node */
+	p = NULL;
+	pp = &to->objects.rb_node;
+	while (*pp) {
+		p = *pp;
+		xobject = rb_entry(p, struct cachefs_object, node_rb);
+
+		if (xobject->offset < object->offset)
+			pp = &(*pp)->rb_left;
+		else if (xobject->offset > object->offset)
+			pp = &(*pp)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&object->node_rb, p, pp);
+	rb_insert_color(&object->node_rb, &to->objects);
+	_leave("");
+
+} /* end __cachefs_tree_move_object_to_node() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/tree-lookup.c linux-2.6.14-mm2-cachefs/fs/cachefs/tree-lookup.c
--- linux-2.6.14-mm2/fs/cachefs/tree-lookup.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/tree-lookup.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,598 @@
+/* tree-lookup.c: CacheFS indexing tree lookup
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+//#define __KENTER
+//#define __KDEBUG
+//#define __KLEAVE
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "cachefs-int.h"
+
+static int cachefs_tree_walk(struct cachefs_super *super,
+			     struct cachefs_ondisc_leaf *key,
+			     struct cachefs_cursor *result);
+static int cachefs_tree_check_node_validity(struct cachefs_object *object,
+					    struct cachefs_ondisc_leaf *key);
+static void cachefs_tree_update_info(struct cachefs_super *super,
+				     struct cachefs_object *object,
+				     struct cachefs_ondisc_leaf *key,
+				     int degree);
+
+/*****************************************************************************/
+/*
+ * wait for the object reaper to get out from underfoot
+ */
+static int cachefs_wait_on_reap(struct cachefs_super *super)
+{
+	struct cachefs_object *obj;
+
+	DECLARE_WAITQUEUE(myself, current);
+
+	cond_resched();
+
+	/* wait for the scanner and reaper to flush through */
+	obj = super->scan_reap ?: super->reaper_target;
+	if (obj) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue(&super->reaper_waitq, &myself);
+
+		while (super->scan_reap == obj ||
+		       super->reaper_target == obj
+		       ) {
+			if (signal_pending(current))
+				break;
+
+			schedule();
+			set_current_state(TASK_INTERRUPTIBLE);
+		}
+
+		remove_wait_queue(&super->reaper_waitq, &myself);
+		__set_current_state(TASK_RUNNING);
+	}
+
+	if (signal_pending(current))
+		return -EINTR;
+
+	return 0;
+
+} /* end cachefs_wait_on_reap() */
+
+/*****************************************************************************/
+/*
+ * lookup an object in the indexing tree
+ * - if so requested we'll create a new node if what we're looking for is
+ *   absent
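+ * - returns 0 on success with the object attached to its node, -ENODATA if
+ *   the object is absent and creation wasn't requested, or a negative error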
+ */
+int cachefs_tree_lookup_object(struct cachefs_super *super,
+			       struct cachefs_object *object,
+			       struct cachefs_ondisc_leaf *key,
+			       int create)
+{
+	struct cachefs_cursor cursor;
+	int ret;
+
+	//printk("\n");
+	//printk("----------------------------\n");
+	//printk("\n");
+	_enter("{%s},%p,%p", super->cache.identifier, object, key);
+
+walk_again:
+	/* walk the tree to see if this object is present or if a leaf can be
+	 * found on which to store the object */
+	ret = cachefs_tree_walk(super, key, &cursor);
+	if (ret < 0) {
+		_leave(" = %d [walk fail]", ret);
+		return ret;
+	}
+
+	/* if we've found the object of our desire
+	 * - we now hold a read-lock on the cursor point's semaphore
+	 */
+	if (ret == 1) {
+		_debug("found object");
+
+		/* if it's an inode then check that the netfs likes it */
+		if (key->type == CACHEFS_ONDISC_OBJTYPE_INDEX_OBJECT	||
+		    key->type == CACHEFS_ONDISC_OBJTYPE_DATA_OBJECT	||
+		    key->type == CACHEFS_ONDISC_OBJTYPE_OTHER_OBJECT
+		    ) {
+			/* we found the object we were looking for
+			 * - attach it to the parent node whether or not we're
+			 *   going to zap it immediately
+			 */
+			object->offset = cursor.offset;
+			object->objid = key->u.object.objid;
+
+			ASSERT(object->objid != 0);
+
+			/* bind the object to the node and discard the
+			 * placeholder from the culling queue if there is
+			 * one */
+			if (cachefs_tree_link_object(super, object,
+						     cursor.point) < 0
+			    ) {
+				/* the object is scheduled for demolition by
+				 * the reaper or the culler */
+				up_read(&cursor.point->sem);
+				cachefs_cursor_put(&cursor);
+
+				object->flags = 0;
+				object->has_data = 0;
+				object->data_levels = 0;
+				object->objid = 0;
+
+				ret = cachefs_wait_on_reap(super);
+				if (ret < 0) {
+					kleave(" = %d", ret);
+					return ret;
+				}
+
+				goto walk_again;
+			}
+
+			/* retrieve certain useful bits of information */
+			memcpy(&object->i_size,
+			       &key->u.object.size,
+			       sizeof(object->i_size));
+
+			object->flags = key->u.object.flags;
+			object->has_data = (key->ptr != CACHEFS_NULL_PTR);
+			object->data_levels = key->u.object.data_levels;
+
+			/* consult the netfs */
+			ret = cachefs_tree_check_node_validity(object, key);
+
+			/* update the atime and other info in place */
+			if (ret >= 0) {
+				_debug("update %d", ret);
+
+				cachefs_tree_update_info(super, object,
+							 key, ret);
+
+				up_read(&object->node->sem);
+				cachefs_cursor_put(&cursor);
+
+				_leave(" = 0 [found %p]", object->node);
+				return 0;
+			}
+
+			/* need to delete an obsolete inode */
+			_debug("delete %d", ret);
+
+			/* delete the old object */
+			ret = cachefs_tree_delete(super, object);
+			if (ret < 0) {
+				_leave(" = %d [obsoletion failed]", ret);
+				return ret;
+			}
+
+			/* detach and release the old node and then fall
+			 * through into the creation routines */
+			write_lock(&object->node->lock);
+			cachefs_tree_unlink_object_from_node(object);
+			write_unlock(&object->node->lock);
+
+			up_read(&object->node->sem);
+
+			object->flags = 0;
+			object->has_data = 0;
+			object->data_levels = 0;
+			object->objid = 0;
+		}
+		else {
+			printk(KERN_ERR
+			       "CacheFS: Unsupported object type: %u\n",
+			       key->type);
+			BUG();
+		}
+	}
+
+	/* we didn't find the object of our desire */
+	if (!create) {
+		cachefs_cursor_put(&cursor);
+		_leave(" = -ENODATA");
+		return -ENODATA;
+	}
+
+	/* need to insert a new object */
+	_debug("insert");
+
+	ret = cachefs_tree_insert(super, object, key);
+	ASSERT(ret < 0 || object->objid != 0);
+	cachefs_cursor_put(&cursor);
+	_leave(" = %d [ins %p]", ret, object->node);
+	return ret;
+
+} /* end cachefs_tree_lookup_object() */
+
+/*****************************************************************************/
+/*
+ * walk from the root of the tree to as close as possible to where the keyed
+ * object ought to be
+ * - if successful, we return with the point node's semaphore still read-locked
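+ * - at each step we consult the in-memory topology cache first (child
+ *   nodes, then shortcuts), otherwise read the node from disk and either
+ *   follow a direct pointer for this subkey or scan the leaves for a
+ *   matching object or an applicable shortcut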
+ */
+static int cachefs_tree_walk(struct cachefs_super *super,
+			     struct cachefs_ondisc_leaf *key,
+			     struct cachefs_cursor *result)
+{
+	struct cachefs_ondisc_leaf *leaf;
+	struct cachefs_cursor cursor, next;
+	struct cachefs_tree *branch;
+	cachefs_block_t *ptr, bix;
+	void *data;
+	int loop, ret;
+
+	_enter("");
+
+	/* walk the tree to see if this object is present or if a leaf can be
+	 * found on which to store the object */
+	cursor.point = cachefs_tree_get(super->metadata_tree);
+	cursor.level = 0;
+	cursor.offset = 0xffffU;
+	next.level = 1;
+
+	down_read(&cursor.point->sem);
+
+begin_step:
+	if (test_bit(CACHEFS_TREE_NODE_INVALID, &cursor.point->flags)) {
+		cachefs_cursor_put(&cursor);
+		kleave(" = -EIO");
+		return -EIO;
+	}
+
+	next.point = NULL;
+	next.level = cursor.level + 1;
+
+	/* extract the bits of key in which we're immediately interested */
+	cursor.offset = cachefs_extract_subkey(key, cursor.level);
+
+	_debug("step %d subkey=%04x", cursor.level, cursor.offset);
+
+	/* start by checking the cached branches and shortcuts leading off of
+	 * this one
+	 */
+	read_lock(&cursor.point->lock);
+
+	branch = cachefs_tree_find_node(cursor.point, CACHEFS_TREE_TYPE_NODE,
+					cursor.offset);
+	if (branch)
+		goto move_to_cached_branch;
+
+	branch = cachefs_tree_find_shortcut(cursor.point, key);
+	if (branch)
+		goto move_to_cached_shortcut;
+
+	read_unlock(&cursor.point->lock);
+
+	/* if the block doesn't reside on disk, then don't try reading it */
+	if (!test_bit(CACHEFS_TREE_EXTANT, &cursor.point->flags)) {
+		up_read(&cursor.point->sem);
+		*result = cursor;
+		_leave(" = 0 [no block]");
+		return 0;
+	}
+
+	/* we need to examine the on-disk contents of this node */
+	if (!test_bit(CACHEFS_TREE_NODE_VALIDATED, &cursor.point->flags)) {
+		_debug("reading node %x", cursor.point->bix);
+
+		ret = cachefs_node_read(super, cursor.point, 1);
+		if (ret < 0) {
+			up_read(&cursor.point->sem);
+			cachefs_cursor_put(&cursor);
+			_leave(" = %d [read err]", ret);
+			return ret;
+		}
+
+		if (cursor.point->immutable - super->jnl_serial < 0)
+			_debug("- not immutable");
+	}
+
+	_debug("got page %p{%lx}",
+	       cursor.point->page, cursor.point->page->index);
+
+	data = kmap_atomic(cursor.point->page, KM_USER0);
+
+	/* see if there's a pointer at the correct position for us to walk on
+	 * immediately
+	 */
+	leaf = data + (cursor.offset & CACHEFS_ONDISC_LEAF_MASK);
+
+	ASSERTIF(CACHEFS_EMPTY_PTR != 0, leaf->type != CACHEFS_EMPTY_PTR);
+
+	_debug("ptrblk? %x type %x",
+	       cursor.offset & CACHEFS_ONDISC_LEAF_MASK,
+	       leaf->type);
+
+	if (leaf->type == CACHEFS_ONDISC_OBJTYPE_NULL_POINTER ||
+	    leaf->type > CACHEFS_ONDISC_OBJTYPE__LAST
+	    ) {
+		/* found a pointer block - see if there's a pointer */
+		_debug("ptrblk %x", cursor.offset);
+
+		ptr = data + cursor.offset;
+		bix = *ptr;
+		if (bix != CACHEFS_NULL_PTR)
+			goto follow_pointer;
+	}
+
+	/* there isn't a viable direct pointer; so we need to search for a
+	 * match for the object in the leaves in this block
+	 */
+	_debug("walk leaves");
+
+	for (loop = 0; loop < CACHEFS_ONDISC_LEAF_PER_BLOCK; loop++) {
+		cursor.offset = loop << super->layout->leaf_shift;
+		leaf = data + cursor.offset;
+
+		_debug("leaf[%d] type %x", loop, leaf->type);
+
+		/* typed node leaves have "block pointer #0" pointing into the
+		 * journal; real pointer leaves point elsewhere */
+		switch (leaf->type) {
+			/* can insert into empty slots */
+		case CACHEFS_ONDISC_OBJTYPE_EMPTY_SLOT:
+			continue;
+
+			/* nodes we might be looking for */
+		case CACHEFS_ONDISC_OBJTYPE_INDEX_OBJECT:
+		case CACHEFS_ONDISC_OBJTYPE_DATA_OBJECT:
+		case CACHEFS_ONDISC_OBJTYPE_OTHER_OBJECT:
+			if (cachefs_compare_keys(key, leaf) == 0)
+				goto found_object;
+			break;
+
+			/* keyspace short cut */
+		case CACHEFS_ONDISC_OBJTYPE_SHORTCUT:
+			if (cachefs_compare_keys(key, leaf) == 1)
+				goto take_shortcut;
+			break;
+
+		default:
+			printk(KERN_ERR "CacheFS:"
+			       " Unrecognised object type %x\n",
+			       leaf->type);
+			kunmap_atomic(data, KM_USER0);
+			up_read(&cursor.point->sem);
+			_leave(" = -EIO [data error]");
+			return -EIO;
+
+			/* pointer block leaf */
+#if CACHEFS_ONDISC_OBJTYPE_NULL_POINTER < CACHEFS_ONDISC_OBJTYPE_FIRST_POINTER
+		case CACHEFS_ONDISC_OBJTYPE_NULL_POINTER:
+#endif
+		case CACHEFS_ONDISC_OBJTYPE_FIRST_POINTER ...
+			CACHEFS_ONDISC_OBJTYPE_LAST_POINTER:
+			break;
+		}
+	}
+
+	kunmap_atomic(data, KM_USER0);
+	up_read(&cursor.point->sem);
+	*result = cursor;
+	_leave(" = 0 [not found]");
+	return 0;
+
+	/* we found the object we were looking for
+	 * - return with the point node's semaphore still read-locked
+	 */
+found_object:
+	memcpy(key, leaf, CACHEFS_ONDISC_LEAF_SIZE);
+	kunmap_atomic(data, KM_USER0);
+	*result = cursor;
+	_leave(" = 1 [found]");
+	return 1;
+
+	/* we found a suitable branch to move to in the topology cache */
+move_to_cached_shortcut:
+	_debug(">>>> skip to shortcut");
+
+move_to_cached_branch:
+	next.point = cachefs_tree_get(branch);
+	next.level = branch->level;
+	_debug(">>>> move to %p [lev %d]", next.point, next.level);
+	read_unlock(&cursor.point->lock);
+
+next_step:
+	down_read(&next.point->sem);
+	up_read(&cursor.point->sem);
+	cachefs_cursor_put(&cursor);
+	cursor.point = next.point;
+	cursor.level = next.level;
+	goto begin_step;
+
+	/* found a pointer to a depending block on disk */
+follow_pointer:
+	kunmap_atomic(data, KM_USER0);
+
+	_debug(">>>> walk to %x", bix);
+
+	ASSERTIF(CACHEFS_EMPTY_PTR != 0, bix != CACHEFS_EMPTY_PTR);
+
+	if (bix < super->layout->bix_cache ||
+	    bix >= super->j.alloc_unready
+	    ) {
+		printk(KERN_ERR "CacheFS: can't walk to block %x\n", bix);
+		BUG();
+	}
+
+	ASSERT(bix >= super->layout->bix_cache);
+	ASSERT(bix < super->j.alloc_unready);
+
+	/* extend the topology cache */
+	cursor.level++;
+	next.point = cachefs_tree_lookup(GFP_KERNEL, &cursor, bix,
+					 CACHEFS_TREE_TYPE_NODE, 0);
+	if (!next.point) {
+		up_read(&cursor.point->sem);
+		cachefs_cursor_put(&cursor);
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	goto next_step;
+
+	/* if there's a shortcut we should take then we need to follow it */
+take_shortcut:
+	bix = leaf->ptr;
+	next.level = leaf->u.shortcut.level;
+	next.s_offset = leaf->u.shortcut.s_offset;
+	kunmap_atomic(data, KM_USER0);
+
+	ASSERT(bix >= super->layout->bix_cache);
+	ASSERT(bix < super->j.alloc_unready);
+
+	next.offset = cursor.offset;
+	next.point = cursor.point;
+
+	_debug(">>>> shortcut to %x [lev %d]", bix, next.level);
+
+	/* add to the tree cache */
+	next.point = cachefs_tree_lookup(GFP_KERNEL, &next, bix,
+					 CACHEFS_TREE_TYPE_SHORTCUT, 0);
+	if (!next.point) {
+		up_read(&cursor.point->sem);
+		cachefs_cursor_put(&cursor);
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	goto next_step;
+
+} /* end cachefs_tree_walk() */
+
+/*****************************************************************************/
+/*
+ * check with the netfs that a node located on disk is valid
+ */
+static int cachefs_tree_check_node_validity(struct cachefs_object *object,
+					    struct cachefs_ondisc_leaf *key)
+{
+	struct fscache_cookie *cookie;
+	uint16_t dlen;
+	void *auxdata;
+
+	_enter("");
+
+	cookie = object->fscache.cookie;
+	if (!cookie) {
+		_leave(" = 0 [no cookie]");
+		return 0;
+	}
+
+	if (!cookie->def->check_aux) {
+		_leave(" = 0 [no check]");
+		return 0;
+	}
+
+	dlen = key->u.object.netfs_dlen;
+	auxdata = key->u.object.netfs_data + key->u.object.netfs_klen;
+
+	switch (cookie->def->check_aux(cookie->netfs_data, auxdata, dlen)) {
+	case FSCACHE_CHECKAUX_OKAY:
+		break;
+
+	case FSCACHE_CHECKAUX_NEEDS_UPDATE:
+		_leave(" = 1");
+		return 1;
+
+	case FSCACHE_CHECKAUX_OBSOLETE:
+		_leave(" = -ESTALE");
+		return -ESTALE;
+	}
+
+	_leave(" = 0");
+	return 0;
+
+} /* end cachefs_tree_check_node_validity() */
+
+/*****************************************************************************/
+/*
+ * do an in-place update of the atime
+ * - the caller has the cursor point semaphore read-locked
+ */
+static void cachefs_tree_update_info(struct cachefs_super *super,
+				     struct cachefs_object *object,
+				     struct cachefs_ondisc_leaf *key,
+				     int degree)
+{
+	struct cachefs_ondisc_leaf *leaf;
+	uint32_t atime;
+	void *data;
+
+	_enter(",{%llx,%llx,%x},,",
+	       object->pobjid, object->objid, object->offset);
+
+	atime = CURRENT_TIME.tv_sec;
+	if (key->u.object.atime == atime && degree == 0) {
+		_leave(" [k same]");
+		return;
+	}
+
+	/* update the netfs auxiliary data */
+	if (degree == 1 && object->fscache.cookie) {
+		struct fscache_cookie *cookie = object->fscache.cookie;
+
+#if 0
+		if (cookie->def->get_attr) {
+			uint64_t fsize;
+			cookie->def->get_attr(cookie->netfs_data, &fsize);
+			memcpy(key->u.object.size,
+			       &fsize,
+			       sizeof(key->u.object.size));
+		}
+#endif
+
+		if (cookie->def->get_aux) {
+			uint16_t maxdlen, dlen;
+			void *dbuf;
+
+			maxdlen = CACHEFS_ONDISC_LEAF_SIZE;
+			maxdlen -= offsetof(struct cachefs_ondisc_leaf,
+					    u.object.netfs_data);
+			maxdlen -= key->u.object.netfs_klen;
+
+			dbuf = key->u.object.netfs_data;
+			dbuf += key->u.object.netfs_klen;
+
+			dlen = cookie->def->get_aux(cookie->netfs_data,
+						    dbuf, maxdlen);
+			BUG_ON(dlen > maxdlen);
+			key->u.object.netfs_dlen = dlen;
+		}
+	}
+
+	/* change the atime in the current page */
+	lock_page(object->node->page);
+
+	data = kmap_atomic(object->node->page, KM_USER0);
+	leaf = data + object->offset;
+
+	leaf->u.object.atime = atime;
+	leaf->u.object.flags = object->flags;
+
+	if (degree == 1) {
+		leaf->u.object.size = key->u.object.size;
+
+		memcpy(leaf->u.object.netfs_data + leaf->u.object.netfs_klen,
+		       key->u.object.netfs_data + key->u.object.netfs_klen,
+		       leaf->u.object.netfs_dlen);
+	}
+
+	kunmap_atomic(data, KM_USER0);
+
+	set_page_dirty(object->node->page);
+	unlock_page(object->node->page);
+
+	_leave(" [changed]");
+
+} /* end cachefs_tree_update_info() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/tree-misc.c linux-2.6.14-mm2-cachefs/fs/cachefs/tree-misc.c
--- linux-2.6.14-mm2/fs/cachefs/tree-misc.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/tree-misc.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,346 @@
+/* tree-misc.c: CacheFS indexing tree miscellaneous management functions
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "cachefs-int.h"
+
+kmem_cache_t *cachefs_node_jar;
+
+void cachefs_tree_init_once(void *_node, kmem_cache_t *cachep,
+			    unsigned long flags)
+{
+	struct cachefs_tree *node = _node;
+
+	switch (flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) {
+	case SLAB_CTOR_CONSTRUCTOR:
+		memset(node, 0, sizeof(*node));
+		node->nodes = RB_ROOT;
+		node->objects = RB_ROOT;
+		init_rwsem(&node->sem);
+		rwlock_init(&node->lock);
+		break;
+	default:
+		break;
+	}
+}
+
+/*****************************************************************************/
+/*
+ * allocate a new metadata tree node
+ */
+struct cachefs_tree *cachefs_tree_alloc(unsigned long gfp)
+{
+	struct cachefs_tree *new;
+
+	gfp &= GFP_LEVEL_MASK;
+	new = kmem_cache_alloc(cachefs_node_jar, gfp);
+	if (new) {
+		atomic_set(&new->usage, 1);
+
+		new->parent	= NULL;
+		new->bix	= 0;
+		new->flags	= 0;
+		new->type	= 0;
+		new->level	= 0;
+		new->offset	= 0xffffU;
+		new->s_offset	= 0xffffU;
+		new->occupancy	= 0;
+		new->immutable	= 0;
+		new->scan_state	= INT_MAX;
+	}
+
+	return new;
+
+} /* end cachefs_tree_alloc() */
+
+/*****************************************************************************/
+/*
+ * get a new metadata tree node
+ * - parent->node->sem must be read or write locked
+ * - resident indicates whether the node is likely to be resident or not
+ */
+struct cachefs_tree *cachefs_tree_lookup(unsigned long gfp,
+					 struct cachefs_cursor *cursor,
+					 cachefs_block_t bix,
+					 int type,
+					 int resident)
+{
+	struct cachefs_tree *new, *xnode;
+	struct rb_node **pp, *p;
+
+	_enter(",{%p{%x},[%04x],%04hx},%x,%d,%d",
+	       cursor->point,
+	       cursor->point->bix,
+	       cursor->offset,
+	       cursor->s_offset,
+	       bix, type, resident);
+
+	ASSERT(cursor->offset <= PAGE_SIZE - sizeof(cachefs_block_t));
+	ASSERTIF(CACHEFS_NULL_PTR != 0, bix != 0);
+	cachefs_tree_debugcheck(cursor->point);
+
+	switch (type) {
+	case CACHEFS_TREE_TYPE_NODE:
+	case CACHEFS_TREE_TYPE_DATAPTRBLK:
+	case CACHEFS_TREE_TYPE_SHORTCUT:
+		break;
+	default:
+		BUG();
+	}
+
+	/* allocate and initialise a topology record since we'll probably
+	 * need to add one */
+	new = NULL;
+
+really_allocate:
+	if (!resident) {
+		gfp &= GFP_LEVEL_MASK;
+		new = kmem_cache_alloc(cachefs_node_jar, gfp);
+		if (!new)
+			return NULL;
+
+		atomic_set(&new->usage, 1);
+
+		new->parent	= cursor->point;
+		new->bix	= bix;
+		new->level	= cursor->level;
+		new->offset	= cursor->offset;
+		new->s_offset	= cursor->s_offset;
+		new->type	= type;
+		new->occupancy	= 0;
+		new->immutable	= 0;
+		new->flags	= 0;
+		new->scan_state	= INT_MAX;
+
+		if (bix != CACHEFS_NULL_PTR)
+			new->flags |= 1 << CACHEFS_TREE_EXTANT;
+	}
+
+	/* check to see if we still need to add one to the parent, add the one
+	 * we obtained above if so, or return the extant one if not
+	 */
+	write_lock(&cursor->point->lock);
+
+redo_search:
+	/* attempt to install */
+	p = NULL;
+	pp = &cursor->point->nodes.rb_node;
+
+	while (*pp) {
+		p = *pp;
+		xnode = rb_entry(p, struct cachefs_tree, node_rb);
+
+		if (xnode->type < type)
+			pp = &(*pp)->rb_left;
+		else if (xnode->type > type)
+			pp = &(*pp)->rb_right;
+		else if (xnode->offset < cursor->offset)
+			pp = &(*pp)->rb_left;
+		else if (xnode->offset > cursor->offset)
+			pp = &(*pp)->rb_right;
+		else
+			goto present; /* it seems to be already present */
+	}
+
+	/* the node is not yet present; make sure we have one */
+	if (!new) {
+		write_unlock(&cursor->point->lock);
+		resident = 0;
+		goto really_allocate;
+	}
+
+	/* install the new node */
+	cachefs_tree_get(cursor->point);
+	rb_link_node(&new->node_rb, p, pp);
+	rb_insert_color(&new->node_rb, &cursor->point->nodes);
+	set_bit(CACHEFS_TREE_INSTALLED, &new->flags);
+
+#if CACHEFS_DEBUG_TREE_ACCOUNTING
+	kdebug(" - NEW %p{%x,%u} USAGE -> 1",
+	       new, new->bix, new->level);
+#endif
+
+	write_unlock(&cursor->point->lock);
+	_leave(" = %p", new);
+	return new;
+
+	/* check that the already extant node isn't going away
+	 * - do the increment no later than at the same time as the test lest
+	 *   we race with put
+	 * - we've got the writelock on the parent so no-one else can be
+	 *   getting a reference if they don't already have one
+	 */
+present:
+	if (atomic_add_return(1, &xnode->usage) == 1 &&
+	    test_and_clear_bit(CACHEFS_TREE_INSTALLED, &xnode->flags)
+	    ) {
+		/* it is going away, but hasn't yet been expunged
+		 * - we need to remove it before we can add our own
+		 * - we need to drop the ref we just got
+		 * - the put will grab the parent writelock before completing
+		 *   the release
+		 */
+		rb_erase(&xnode->node_rb, &cursor->point->nodes);
+		if (!atomic_dec_and_test(&xnode->usage))
+			BUG();
+		xnode = NULL;
+		goto redo_search;
+	}
+
+	/* dispose of any new one we allocated and return the extant one */
+	write_unlock(&cursor->point->lock);
+	if (new)
+		kmem_cache_free(cachefs_node_jar, new);
+	_leave(" = %p [old]", xnode);
+	return xnode;
+
+} /* end cachefs_tree_lookup() */
+
+/*****************************************************************************/
+/*
+ * release record of a branch of the metadata tree
+ */
+void cachefs_tree_put(struct cachefs_tree *node)
+{
+	struct cachefs_tree *parent;
+
+	if (!node)
+		return;
+
+	_enter("%p{%d}", node, atomic_read(&node->usage));
+
+	do {
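+		/* each node holds a ref on its parent (taken when it was
+		 * installed), so dropping the last ref here may cascade the
+		 * release up towards the root */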
+#if CACHEFS_DEBUG_TREE_ACCOUNTING
+		if (atomic_read(&node->usage) < 10)
+			kdebug(" - PUT %p{%x,%u} USAGE -> %d [%d]",
+			       node, node->bix, node->level,
+			       atomic_read(&node->usage) - 1,
+			       node->page ? page_count(node->page) : -777);
+#endif
+
+		cachefs_tree_debugcheck(node);
+
+		if (!atomic_dec_and_test(&node->usage))
+			return;
+
+		/* prevent release from racing with rearrangement by atomically
+		 * reading our ref to the parent node whilst clearing it, to
+		 * signal to any rearranger that it should ignore this node
+		 * record
+		 */
+		parent = xchg(&node->parent, NULL);
+
+		/* unlink from the parent if not at the root */
+		if (parent) {
+			cachefs_tree_debugcheck(parent);
+			write_lock(&parent->lock);
+			__cachefs_tree_unlink_from_node(node, parent);
+			write_unlock(&parent->lock);
+		}
+
+		ASSERT(!node->nodes.rb_node);
+		ASSERT(!node->shortcuts.rb_node);
+		ASSERT(!node->objects.rb_node);
+
+		/* detach from object if a level 1 data ptr block */
+		ASSERT(atomic_read(&node->netfs_usage) == 0);
+
+		if (node->object) {
+			ASSERT(node->type == CACHEFS_TREE_TYPE_DATAPTRBLK);
+			ASSERT(node->level == 1);
+			ASSERT(node->object->dataptrblks.rb_node);
+
+			_debug("Unlinking %x from obj %llx",
+			       node->bix, node->object->objid);
+
+			write_lock(&node->object->lock);
+			rb_erase(&node->aux_rb,
+				 &node->object->dataptrblks);
+			write_unlock(&node->object->lock);
+		}
+
+		if (node->page) {
+			if (!TestClearPageFsMisc(node->page))
+				BUG();
+			cachefs_page_put(node->page);
+		}
+
+#if CACHEFS_DEBUG_TREE_ACCOUNTING
+		kdebug("- FREE %p{%x,%u}", node, node->bix, node->level);
+#endif
+
+		kmem_cache_free(cachefs_node_jar, node);
+
+	} while ((node = parent));
+
+} /* end cachefs_tree_put() */
+
+/*****************************************************************************/
+/*
+ * dispose of an object record
+ */
+void cachefs_object_put(struct cachefs_object *object)
+{
+	struct cachefs_tree *parent;
+
+	_enter("%p{%p[%04x],%llx,%d}",
+	       object, object->node, object->offset, object->objid,
+	       atomic_read(&object->usage));
+
+#ifdef CACHEFS_DEBUG_SLAB
+	ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
+#endif
+
+#if CACHEFS_DEBUG_OBJECT_ACCOUNTING
+	if (atomic_read(&object->usage) < 10)
+		kdebug(" - PUT %p{%llx} USAGE -> %d",
+		       object, object->objid, atomic_read(&object->usage) - 1);
+#endif
+
+	if (!atomic_dec_and_test(&object->usage)) {
+		_leave(" [extant]");
+		return;
+	}
+
+	ASSERT(list_empty(&object->cull_link));
+
+	/* prevent release from racing with rearrangement by atomically reading
+	 * our ref to the parent node whilst clearing it, to signal to any
+	 * rearranger that it should ignore this node record
+	 */
+	parent = xchg(&object->node, NULL);
+
+	/* detach from the containing node */
+	if (parent) {
+		cachefs_tree_debugcheck(parent);
+
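+		/* an offset of 0xffff means the object record was never
+		 * installed in the node's object tree, so there's nothing to
+		 * erase */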
+		if (object->offset != 0xffffU) {
+			write_lock(&parent->lock);
+			rb_erase(&object->node_rb, &parent->objects);
+			write_unlock(&parent->lock);
+		}
+	}
+
+	/* free everything */
+#if CACHEFS_DEBUG_OBJECT_ACCOUNTING
+	kdebug("- FREE %p{%llx}", object, object->objid);
+#endif
+
+	kfree(object->key);
+	object->key = NULL;
+	kmem_cache_free(cachefs_object_jar, object);
+	cachefs_tree_put(parent);
+
+	_leave(" [killed]");
+
+} /* end cachefs_object_put() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/tree-move.c linux-2.6.14-mm2-cachefs/fs/cachefs/tree-move.c
--- linux-2.6.14-mm2/fs/cachefs/tree-move.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/tree-move.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,299 @@
+/* tree-move.c: leaf movement operations
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "cachefs-int.h"
+
+/*****************************************************************************/
+/*
+ * move a leaf from one node to another
+ * - the caller must have exclusive access to both nodes
+ */
+void cachefs_tree_move_leaf(struct cachefs_super *super,
+			    struct cachefs_tree *from_node,
+			    struct cachefs_tree *to_node,
+			    struct cachefs_ondisc_leaf *from,
+			    struct cachefs_ondisc_leaf *to,
+			    uint16_t from_offset,
+			    uint16_t to_offset)
+{
+	struct cachefs_object *object;
+	struct cachefs_tree *shortcut, *dataptr, *prev;
+	int move_refs;
+
+	_enter(",{%x},{%x},{%x,%x},,%hx,%hx",
+	       from_node->bix, to_node->bix,
+	       from->ptr, from->type,
+	       from_offset, to_offset);
+
+	write_lock(&from_node->lock);
+	write_lock(&to_node->lock);
+
+	/* move the leaf and reset the hole left behind */
+	_alter(super, "move leaf from %x[%04x] to %x[%04x]",
+	       from_node->bix, from_offset,
+	       to_node->bix, to_offset);
+
+	memcpy(to, from, super->layout->leaf_size);
+	memset(from, CACHEFS_EMPTY_FILL, super->layout->leaf_size);
+	from->type = CACHEFS_ONDISC_OBJTYPE_EMPTY_SLOT;
+
+	/* need to move the in-memory object record if there is one */
+	move_refs = 0;
+	if (to->type == CACHEFS_ONDISC_OBJTYPE_SHORTCUT) {
+		/* see if there's an in-memory record of the shortcut */
+		shortcut = cachefs_tree_find_node(from_node,
+						  CACHEFS_TREE_TYPE_SHORTCUT,
+						  from_offset);
+		if (shortcut)
+			goto found_shortcut;
+	}
+	else if (to->type == CACHEFS_ONDISC_OBJTYPE_INDEX_OBJECT ||
+		 to->type == CACHEFS_ONDISC_OBJTYPE_DATA_OBJECT ||
+		 to->type == CACHEFS_ONDISC_OBJTYPE_OTHER_OBJECT
+		 ) {
+		/* see if there's an in-memory record of the object */
+		object = cachefs_tree_find_object(from_node, from_offset);
+
+		if (object)
+			goto found_object;
+		goto move_dataptrs;
+	}
+
+done:
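+	/* take a ref on to_node for each in-memory child record that was
+	 * repointed at it; once unlocked, drop the refs those children
+	 * formerly held on from_node */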
+	if (move_refs)
+		cachefs_tree_get2(to_node, move_refs);
+	write_unlock(&to_node->lock);
+	write_unlock(&from_node->lock);
+
+	from_node->occupancy--;
+	to_node->occupancy++;
+
+	while (move_refs-- > 0)
+		cachefs_tree_put(from_node);
+
+	_leave("");
+	return;
+
+	/* found an object
+	 * - reattach to new node
+	 * - reattach matching data pointer block too
+	 */
+found_object:
+	/* attempt to redirect this object's node pointer to the new node; note
+	 * that this has the potential to race with release, so we ignore this
+	 * object if the releaser cleared the node pointer
+	 */
+	_debug("move obj %llx", object->objid);
+
+	write_lock(&object->lock);
+
+	ASSERT(to_node);
+	prev = cmpxchg(&object->node, from_node, to_node);
+	ASSERT(!prev || prev == from_node);
+
+	if (prev) {
+		smp_wmb();
+		cachefs_tree_debugcheck(prev);
+		__cachefs_tree_move_object_to_node(object, prev, to_node, to_offset);
+		move_refs++;
+	}
+
+	write_unlock(&object->lock);
+
+	/* see if there are data pointer blocks or data pages for this object
+	 * - there may be some future expansion nodes in addition to the real
+	 *   root
+	 */
+move_dataptrs:
+	dataptr = cachefs_tree_find_node(from_node,
+					 CACHEFS_TREE_TYPE_DATAPTRBLK,
+					 from_offset);
+	if (dataptr) {
+		_debug("move dataptr %p{%d,%04x,%x}",
+		       dataptr, dataptr->type, dataptr->offset, dataptr->bix);
+
+		cachefs_tree_debugcheck(dataptr);
+
+		/* attempt to redirect this data ptr block's parent pointer to
+		 * the new node; note that this has the potential to race with
+		 * release, so we ignore this item if the releaser cleared the
+		 * node pointer
+		 */
+		write_lock(&dataptr->lock);
+
+		prev = cmpxchg(&dataptr->parent, from_node, to_node);
+		if (prev) {
+			smp_wmb();
+			ASSERT(prev == from_node);
+			cachefs_tree_debugcheck(prev);
+			__cachefs_tree_unlink_from_node(dataptr, prev);
+
+			dataptr->offset = to_offset;
+			__cachefs_tree_link_to_node(dataptr, to_node);
+			move_refs++;
+		}
+
+		write_unlock(&dataptr->lock);
+	}
+	goto done;
+
+	/* found a shortcut
+	 * - reattach to new node
+	 */
+found_shortcut:
+	/* attempt to redirect this shortcut's parent pointer to the new node;
+	 * note that this has the potential to race with release, so we ignore
+	 * this item if the releaser cleared the node pointer
+	 */
+	_debug("move shortcut %x", shortcut->offset);
+
+	cachefs_tree_debugcheck(shortcut);
+	write_lock(&shortcut->lock);
+
+	prev = cmpxchg(&shortcut->parent, from_node, to_node);
+	ASSERT(!prev || prev == from_node);
+
+	if (prev) {
+		smp_wmb();
+		cachefs_tree_debugcheck(prev);
+		__cachefs_tree_unlink_from_node(shortcut, prev);
+
+		shortcut->offset = to_offset;
+		__cachefs_tree_link_to_node(shortcut, to_node);
+		move_refs++;
+	}
+
+	write_unlock(&shortcut->lock);
+	goto done;
+
+} /* end cachefs_tree_move_leaf() */
+
+/*****************************************************************************/
+/*
+ * slide a leaf between slots within a node
+ * - the caller must have exclusive access to the node
+ */
+void cachefs_tree_slide_leaf(struct cachefs_super *super,
+			     struct cachefs_tree *node,
+			     void *data,
+			     uint16_t from_offset,
+			     uint16_t to_offset)
+{
+	struct cachefs_ondisc_leaf *leaf;
+	struct cachefs_object *object;
+	struct cachefs_tree *shortcut, *dataptr;
+
+	_enter(",{%x},,%hx,%hx", node->bix, from_offset, to_offset);
+
+	write_lock(&node->lock);
+
+	/* move the leaf and reset the hole left behind */
+	_alter(super, "slide leaf from %x[%04x] to [%04x]",
+	       node->bix, from_offset, to_offset);
+
+	leaf = data + from_offset;
+	memcpy(data + to_offset, leaf, super->layout->leaf_size);
+	memset(leaf, CACHEFS_EMPTY_FILL, super->layout->leaf_size);
+	leaf->type = CACHEFS_ONDISC_OBJTYPE_EMPTY_SLOT;
+
+	/* need to move the in-memory object record if there is one */
+	leaf = data + to_offset;
+
+	if (leaf->type == CACHEFS_ONDISC_OBJTYPE_SHORTCUT) {
+		/* see if there's an in-memory record of the shortcut */
+		shortcut = cachefs_tree_find_node(node,
+						  CACHEFS_TREE_TYPE_SHORTCUT,
+						  from_offset);
+		if (shortcut) {
+			_debug("slide short %x", shortcut->offset);
+			__cachefs_tree_unlink_from_node(shortcut, node);
+			shortcut->offset = to_offset;
+			__cachefs_tree_link_to_node(shortcut, node);
+		}
+	}
+	else if (leaf->type == CACHEFS_ONDISC_OBJTYPE_INDEX_OBJECT ||
+		 leaf->type == CACHEFS_ONDISC_OBJTYPE_DATA_OBJECT ||
+		 leaf->type == CACHEFS_ONDISC_OBJTYPE_OTHER_OBJECT
+		 ) {
+		/* see if there's an in-memory record of the object */
+		object = cachefs_tree_find_object(node, from_offset);
+		if (object) {
+			_debug("slide obj %llx", object->objid);
+			__cachefs_tree_move_object_to_node(object, node, node,
+							   to_offset);
+		}
+
+		/* see if there's a data pointer block for this object */
+		dataptr = cachefs_tree_find_node(node,
+						 CACHEFS_TREE_TYPE_DATAPTRBLK,
+						 from_offset);
+		if (dataptr) {
+			__cachefs_tree_unlink_from_node(dataptr, node);
+			dataptr->offset = to_offset;
+			__cachefs_tree_link_to_node(dataptr, node);
+		}
+	}
+	else {
+		/* some type of leaf we don't yet support sliding */
+		BUG();
+	}
+
+	write_unlock(&node->lock);
+	_leave(" = 0");
+	return;
+
+} /* end cachefs_tree_slide_leaf() */
+
+/*****************************************************************************/
+/*
+ * install a new leaf into a node
+ */
+void cachefs_tree_install_leaf(struct cachefs_operation *op,
+			       struct cachefs_tree *node,
+			       struct cachefs_ondisc_leaf *key,
+			       void *data,
+			       uint16_t offset)
+{
+	struct cachefs_ondisc_leaf *leaf = data + offset;
+
+	_enter("");
+
+	ASSERT(leaf->type == CACHEFS_ONDISC_OBJTYPE_EMPTY_SLOT);
+
+	/* make the changes */
+	if (key->type == CACHEFS_ONDISC_OBJTYPE_INDEX_OBJECT	||
+	    key->type == CACHEFS_ONDISC_OBJTYPE_DATA_OBJECT	||
+	    key->type == CACHEFS_ONDISC_OBJTYPE_OTHER_OBJECT
+	    ) {
+		op->object->objid = cachefs_alloc_objid(op);
+
+		_alter(op->super, "insert obj %llx into %x[%04x]",
+		       op->object->objid, node->bix, offset);
+
+		key->u.object.objid = op->object->objid;
+		key->u.object.atime = CURRENT_TIME.tv_sec;
+	}
+
+	memcpy(leaf, key, op->super->layout->leaf_size);
+
+	/* insert the object into the topology cache */
+	op->object->offset = offset;
+	if (cachefs_tree_link_object(op->super, op->object, node) < 0)
+		BUG();
+
+	node->occupancy++;
+
+	_leave("");
+
+} /* end cachefs_tree_install_leaf() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/tree-node.c linux-2.6.14-mm2-cachefs/fs/cachefs/tree-node.c
--- linux-2.6.14-mm2/fs/cachefs/tree-node.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/tree-node.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,284 @@
+/* tree-node.c: tree node operations
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+//#define __KENTER
+//#define __KDEBUG
+//#define __KLEAVE
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include "cachefs-int.h"
+
+/*****************************************************************************/
+/*
+ * add a page to the page cache, displacing any stale page of the same backing
+ * block
+ */
+void cachefs_replace_add_to_page_cache(struct cachefs_operation *op,
+				       struct cachefs_tree *node)
+{
+	struct page *page;
+	int tmp;
+
+	/* we want to keep the relevant radix tree nodes hot in this CPU's data
+	 * cache as we have to walk them twice */
+	preempt_disable();
+
+	/* displace the old page of the same offset if there is one */
+	page = find_get_page(op->super->imeta->i_mapping, node->bix);
+	if (page) {
+		kdebug("displacing page %p { %lx, %lx }",
+		       page, page->index, page->flags);
+
+		if (replace_in_page_cache(node->page,
+					  op->super->imeta->i_mapping,
+					  node->bix
+					  ) != page)
+			BUG();
+
+		preempt_enable();
+
+		/* wait for the old page to be finished with */
+		wait_on_page_writeback(page);
+
+		ASSERT(!PageFsMisc(page));
+		ASSERT(!PageDirty(page));
+		ASSERTCMP(page_private(page), ==, 0);
+		ASSERTCMP(page->mapping, ==, NULL);
+
+		ClearPageMappedToDisk(page);
+		page_cache_release(page);
+	}
+	else {
+		/* need to add the new page */
+		tmp = add_to_page_cache(node->page,
+					op->super->imeta->i_mapping,
+					node->bix,
+					mapping_gfp_mask(op->super->imeta->i_mapping));
+
+		preempt_enable();
+
+		if (tmp < 0) {
+			/* shouldn't get ENOMEM due to auxiliary preloading and
+			 * EEXIST should have been anticipated above */
+			printk(KERN_ERR "CacheFS: Unexpected error %d\n", tmp);
+			BUG();
+		}
+	}
+
+#if 0
+	_debug("Added page %p; fl=%lx cnt=%d",
+	       node->page, node->page->flags, page_count(node->page));
+#endif
+
+} /* end cachefs_replace_add_to_page_cache() */
+
+/*****************************************************************************/
+/*
+ * add a page to the page cache and the LRU, displacing any stale page of the
+ * same backing block
+ */
+static inline void cachefs_replace_add_to_page_cache_lru(struct cachefs_operation *op,
+							 struct cachefs_tree *node)
+{
+	cachefs_replace_add_to_page_cache(op, node);
+	lru_cache_add(node->page);
+
+} /* end cachefs_replace_add_to_page_cache_lru() */
+
+/*****************************************************************************/
+/*
+ * allocate or replace a node's backing block
+ * - the caller must have this node's semaphore write-locked
+ * - the caller must have the parent node's semaphore locked (read or write)
+ * - the caller must have replaced the parent node beforehand
+ * - the parent node's page must be locked and will remain so
+ * - we return with the new page locked
+ */
+int cachefs_replace_node(struct cachefs_operation *op,
+			 struct cachefs_tree *node)
+{
+	struct cachefs_journal *jnl;
+	cachefs_block_t *ptr;
+	struct page *oldpage, *newpage;
+	int ret;
+
+	_enter(",%p{%x}", node, node->bix);
+
+	ASSERT(node->page);
+	ASSERT(PageFsMisc(node->page));
+	ASSERTIF(node->parent, node->parent->bix != CACHEFS_NULL_PTR);
+	ASSERTIF(node->parent, node->parent->page);
+	ASSERTIF(node->parent, PageMappedToDisk(node->parent->page));
+	ASSERTIF(node->parent, PageLocked(node->parent->page));
+
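+	/* bix_rcm[] collects old backing blocks to be recycled and bix_alloc[]
+	 * will receive the replacement block from the allocator */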
+	op->m_rcm = 0;
+
+	if (node->bix != CACHEFS_NULL_PTR) {
+		ASSERT(PageMappedToDisk(node->page));
+		ASSERTCMP(node->page->index, ==, node->bix);
+
+		/* don't replace blocks that have already been replaced since
+		 * the journal was last cranked */
+		if (node->immutable >= op->super->jnl_serial) {
+			lock_page(node->page);
+			_leave(" = 0 [already repl]");
+			return 0;
+		}
+
+		op->m_rcm = 1;
+		op->bix_rcm[0] = node->bix;
+	}
+
+	op->m_alloc = 1;
+
+	/* addition of a page to the page cache must not fail with ENOMEM, so
+	 * we have to bank sufficient radix tree nodes in advance
+	 */
+	ret = radix_tree_preload_task(GFP_KERNEL, 1);
+	if (ret < 0) {
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	/* get a new block and stick the old in the laundry basket */
+	ret = cachefs_allocator(op);
+	if (ret < 0) {
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	node->bix = op->bix_alloc[0];
+	ASSERT(node->bix >= op->super->layout->bix_cache);
+	ASSERT(node->bix < op->super->j.alloc_unready);
+
+	/* update the pointer in the parent node if there is one */
+	if (node->parent) {
+		_alter(op->super, "change ptr %x[%04x] to %x",
+		       node->parent->bix, node->offset, op->bix_alloc[0]);
+
+		write_lock(&node->parent->lock);
+		ptr = kmap_atomic(node->parent->page, KM_USER0) + node->offset;
+		*ptr = node->bix;
+		kunmap_atomic(ptr, KM_USER0);
+		write_unlock(&node->parent->lock);
+
+		set_page_dirty(node->parent->page);
+	}
+
+	/* if the page currently backing the node isn't being written out then
+	 * subsume any in-place changes and redirect to new block directly
+	 */
+	lock_page(node->page);
+
+	if (PageWriteback(node->page))
+		goto page_being_written;
+
+page_not_being_written:
+	/* rotate the page attachment */
+	if (PageMappedToDisk(node->page)) {
+		int dirty;
+
+		_debug("reattach page %p", node->page);
+
+		ASSERT(node->page->mapping != NULL);
+
+		dirty = test_clear_page_dirty(node->page);
+		remove_from_page_cache(node->page);
+		cachefs_replace_add_to_page_cache(op, node);
+
+		if (dirty)
+			set_page_dirty(node->page);
+	}
+	else {
+		_debug("install page %p", node->page);
+
+		cachefs_replace_add_to_page_cache_lru(op, node);
+	}
+
+	goto done;
+
+page_being_written:
+	/* the page is being written - so we try to get a replacement in-memory
+	 * copy of the old page, but we don't wait in the page allocator; we
+	 * can wait for the page we already have instead
+	 */
+	newpage = alloc_pages(__GFP_HIGHMEM | __GFP_COLD | __GFP_NORECLAIM, 0);
+	if (!newpage) {
+		wait_on_page_writeback(node->page);
+		goto page_not_being_written;
+	}
+
+	_debug("replace page %p with %p", node->page, newpage);
+
+	SetPageLocked(newpage);
+
+	/* set up the new page */
+	SetPageFsMisc(newpage);
+	SetPagePrivate(newpage);
+
+	copy_highpage(newpage, node->page);
+	SetPageUptodate(newpage);
+
+	/* transfer the dirty flag for update-in-place */
+	if (test_clear_page_dirty(node->page)) {
+		_debug("transfer dirty");
+
+		set_page_dirty(newpage);
+
+		/* this page may have been holding the journal whilst doing
+		 * UIP */
+		if (page_private(node->page)) {
+			jnl = (struct cachefs_journal *)
+				page_private(node->page);
+			set_page_private(node->page, 0);
+			//ClearPagePrivate(node->page);
+			_debug("UIP jnl release");
+			cachefs_journal_release(jnl);
+		}
+	}
+
+	/* drop the old page */
+	ClearPageFsMisc(node->page);
+	ClearPagePrivate(node->page);
+	unlock_page(node->page);
+
+	/* substitute in the new page */
+	write_lock(&node->lock);
+	oldpage = node->page;
+	node->page = newpage;
+	write_unlock(&node->lock);
+	page_cache_release(oldpage);
+
+	cachefs_replace_add_to_page_cache_lru(op, node);
+
+	/* finish off */
+done:
+	SetPageMappedToDisk(node->page);
+	mark_page_accessed(node->page);
+
+	ASSERTCMP(node->page->index, ==, node->bix);
+
+	set_bit(CACHEFS_TREE_EXTANT, &node->flags);
+
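+	/* note the journal serial under which this node was wandered so that
+	 * it won't be replaced again until the journal is next cranked */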
+	node->immutable = op->super->jnl_serial;
+	if (node == op->super->metadata_tree)
+		op->super->j.tree_root = node->bix;
+
+	_leave(" = 0 [node %x, new]", node->bix);
+	return 0;
+
+} /* end cachefs_replace_node() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/tree-scan.c linux-2.6.14-mm2-cachefs/fs/cachefs/tree-scan.c
--- linux-2.6.14-mm2/fs/cachefs/tree-scan.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/tree-scan.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,972 @@
+/* tree-scan.c: tree scanner - finds orphaned nodes and nodes to cull
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+//#define __KENTER
+//#define __KDEBUG
+//#define __KLEAVE
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "cachefs-int.h"
+
+static void cachefs_scan_inactive(struct cachefs_super *super);
+static void cachefs_scan_loading_reap_list(struct cachefs_super *super);
+static void cachefs_scan_descending(struct cachefs_super *super);
+static void cachefs_scan_validating_node(struct cachefs_super *super);
+static void cachefs_scan_scanning_node(struct cachefs_super *super);
+static void cachefs_scan_ascending(struct cachefs_super *super);
+static void cachefs_scan_completing_scan(struct cachefs_super *super);
+static void cachefs_scan_completing_reap(struct cachefs_super *super);
+static void cachefs_scan_advancing_reap_list(struct cachefs_super *super);
+static void cachefs_scan_syncing_cache(struct cachefs_super *super);
+static void cachefs_scan_finished(struct cachefs_super *super);
+static void cachefs_scan_reaping_object(struct cachefs_super *super);
+static void cachefs_scan_waiting_for_reaper(struct cachefs_super *super);
+
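+/*
+ * jump table for the tree scanner state machine, indexed by super->scan_state
+ * - each handler performs one bounded step of work and sets
+ *   CACHEFS_SUPER_DO_SCAN when there's more to do immediately; otherwise it
+ *   returns and waits to be reinvoked when the event it's waiting for (I/O
+ *   completion, the reaper, kcachefsd's next pass) occurs
+ */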
+const cachefs_scan_operation_t cachefs_scan_operations[CACHEFS_SCAN__NSTATES] = {
+	[CACHEFS_SCAN_INACTIVE]			= cachefs_scan_inactive,
+	[CACHEFS_SCAN_LOADING_REAP_LIST]	= cachefs_scan_loading_reap_list,
+	[CACHEFS_SCAN_DESCENDING]		= cachefs_scan_descending,
+	[CACHEFS_SCAN_VALIDATING_NODE]		= cachefs_scan_validating_node,
+	[CACHEFS_SCAN_SCANNING_NODE]		= cachefs_scan_scanning_node,
+	[CACHEFS_SCAN_ASCENDING]		= cachefs_scan_ascending,
+	[CACHEFS_SCAN_COMPLETING_SCAN]		= cachefs_scan_completing_scan,
+	[CACHEFS_SCAN_COMPLETING_REAP]		= cachefs_scan_completing_reap,
+	[CACHEFS_SCAN_ADVANCING_REAP_LIST]	= cachefs_scan_advancing_reap_list,
+	[CACHEFS_SCAN_SYNCING_CACHE]		= cachefs_scan_syncing_cache,
+	[CACHEFS_SCAN_FINISHED]			= cachefs_scan_finished,
+	[CACHEFS_SCAN_REAPING_OBJECT]		= cachefs_scan_reaping_object,
+	[CACHEFS_SCAN_WAITING_FOR_REAPER]	= cachefs_scan_waiting_for_reaper,
+};
+
+/*****************************************************************************/
+/*
+ * handle the completion of a BIO that read a page for the scanner
+ */
+static int cachefs_scan_io_complete(struct bio *bio,
+				    unsigned int bytes_done, int err)
+{
+	struct cachefs_super *super;
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct page *page;
+
+	_enter("{sz=%u rw=%lu},%u,%d",
+	       bio->bi_size, bio->bi_rw, bytes_done, err);
+
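+	/* the completion handler may be called for partial completions; a
+	 * non-zero bi_size means more I/O is still outstanding, so indicate
+	 * that we're not yet done */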
+	if (bio->bi_size)
+		return 1;
+
+	/* mark the pages with the appropriate state */
+	page = bvec->bv_page;
+	bio_put(bio);
+
+	if (uptodate) {
+		SetPageUptodate(page);
+	} else {
+		ClearPageUptodate(page);
+		SetPageError(page);
+	}
+
+	super = page->mapping->host->i_sb->s_fs_info;
+	unlock_page(page);
+
+	set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+	wake_up(&super->dmn_sleepq);
+	return 0;
+
+} /* end cachefs_scan_io_complete() */
+
+/*****************************************************************************/
+/*
+ * read a block from disk for the scanner
+ */
+static int cachefs_scan_readpage(void *data, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct bio *bio;
+	int ret;
+
+	_enter(",{%d,%lx}", page_count(page), page->index);
+
+	SetPageMappedToDisk(page);
+
+	/* dispatch a call to perform the read */
+	ret = -ENOMEM;
+
+	bio = bio_alloc(GFP_KERNEL, 1);
+	if (bio) {
+		bio->bi_bdev	= inode->i_sb->s_bdev;
+		bio->bi_sector	= page->index;
+		bio->bi_sector	<<= PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
+		bio->bi_end_io	= cachefs_scan_io_complete;
+
+		if (!bio_add_page(bio, page, PAGE_SIZE, 0))
+			BUG();
+
+		submit_bio(READ, bio);
+		ret = 0;
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+
+} /* end cachefs_scan_readpage() */
+
+/*****************************************************************************/
+/*
+ * read a reap list block from disk
+ * - reads pages through the metadata inode
+ * - caller must wait for page to finish reading
+ */
+static int cachefs_scan_node_read(struct cachefs_super *super,
+				  cachefs_block_t bix,
+				  struct page **_page)
+{
+	struct page *page;
+
+	_enter(",%x", bix);
+
+	/* load the page into the page cache */
+	page = read_cache_page(super->imeta->i_mapping, bix,
+			       cachefs_scan_readpage, NULL);
+
+	if (IS_ERR(page)) {
+		_leave(" = %ld [rcp]", PTR_ERR(page));
+		return PTR_ERR(page);
+	}
+
+	if (PageUptodate(page) || PageError(page)) {
+		_debug("page already present");
+		set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+	}
+
+	*_page = page;
+	_leave(" = 0 (%p)", page);
+	return 0;
+
+} /* end cachefs_scan_node_read() */
+
+/*****************************************************************************/
+/*
+ * tree scanning is inactive
+ * - need to load a part of the reap list if there is one
+ */
+static void cachefs_scan_inactive(struct cachefs_super *super)
+{
+	_enter("");
+
+	ASSERT(!super->page_reap_proc);
+
+	if (test_bit(CACHEFS_SUPER_NOSCAN, &super->options)) {
+		_leave(" [no scan]");
+		return;
+	}
+
+	if (test_bit(CACHEFS_SUPER_ERROR_STOP, &super->flags)) {
+		_leave(" [io error]");
+		return;
+	}
+
+	/* see if we can find something to reap if we don't have anything
+	 * yet */
+	if (!super->j.reap_processor &&
+	    (super->j.reap_stack ||
+	     (super->j.reap_collector && super->j.reap_collsp > 0))
+	    ) {
+		ASSERT(!super->page_reap_proc);
+
+		down_write(&super->tree_wander_sem);
+
+		/* move the pending stack to the active processing stack */
+		if (super->j.reap_stack) {
+			_debug("steal reap stack");
+			super->j.reap_processor = super->j.reap_stack;
+			super->j.reap_proccnt =
+				CACHEFS_ONDISC_REAP_OBJIDSPERNODE;
+			super->j.reap_stack = 0;
+		}
+		/* or steal the collector if there's anything in it */
+		else if (super->j.reap_collector && super->j.reap_collsp > 0) {
+			_debug("steal reap collector");
+			super->j.reap_processor = super->j.reap_collector;
+			super->j.reap_proccnt = super->j.reap_collsp;
+			super->j.reap_collector = 0;
+			super->j.reap_collsp = -1;
+			super->page_reap_proc = super->page_reap;
+			super->page_reap = NULL;
+		}
+
+		up_write(&super->tree_wander_sem);
+	}
+
+	/* start the active processing stack TOS node loading if there is one
+	 * - note that we always have to load the node, even if we're
+	 *   immediately going to scrag it as we need the next pointer from it
+	 */
+	if (super->j.reap_processor) {
+		if (!super->page_reap_proc) {
+			/* set to load */
+			if (cachefs_scan_node_read(super,
+						   super->j.reap_processor,
+						   &super->page_reap_proc
+						   ) < 0
+			    ) {
+				set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+				_leave(" [defer]");
+				return;
+			}
+		}
+		else {
+			set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+		}
+
+		super->scan_state = CACHEFS_SCAN_LOADING_REAP_LIST;
+	}
+	/* no object IDs to reap, go straight to the scanner */
+	else {
+		super->scan_state = CACHEFS_SCAN_DESCENDING;
+		set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+	}
+
+	/* start scanning with the root of the tree */
+	ASSERT(!super->scan_node);
+	super->scan_node = cachefs_tree_get(super->metadata_tree);
+	super->scan_bix = super->metadata_tree->bix;
+	ASSERT(super->scan_node->scan_state == INT_MAX);
+	super->scan_node->scan_state = -CACHEFS_ONDISC_LEAF_PER_BLOCK;
+
+	_leave("");
+
+} /* end cachefs_scan_inactive() */
+
+/*****************************************************************************/
+/*
+ * wait for the top of the reap processing list to be read in
+ */
+static void cachefs_scan_loading_reap_list(struct cachefs_super *super)
+{
+	_enter("{%x,%d}", super->j.reap_processor, super->j.reap_proccnt);
+
+	ASSERT(super->j.reap_proccnt > 0);
+
+	/* we can now start processing the tree */
+	if (PageUptodate(super->page_reap_proc)) {
+		super->scan_state = CACHEFS_SCAN_DESCENDING;
+		set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+		_leave("");
+		return;
+	}
+
+	/* deal with I/O errors */
+	if (PageError(super->page_reap_proc)) {
+		printk(KERN_ERR
+		       "CacheFS: Reap TOS node %lx had error; discarding\n",
+		       super->page_reap_proc->index);
+
+		cachefs_page_put(super->page_reap_proc);
+		super->page_reap_proc = NULL;
+
+		down_read(&super->tree_wander_sem);
+		super->j.reap_processor = 0;
+		super->j.reap_proccnt = -1;
+		up_read(&super->tree_wander_sem);
+
+		super->scan_state = CACHEFS_SCAN_INACTIVE;
+		set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+	}
+
+	_leave("");
+
+} /* end cachefs_scan_loading_reap_list() */
+
+/*****************************************************************************/
+/*
+ * handle descent to a new node
+ */
+static void cachefs_scan_descending(struct cachefs_super *super)
+{
+	int ret;
+
+	ASSERT(super->scan_node);
+
+	_enter("%x{%d}", super->scan_node->bix, super->scan_node->scan_state);
+
+	super->scan_bix = super->scan_node->bix;
+
+	/* need to load the node's page now */
+	ret = cachefs_node_read(super, super->scan_node, 0);
+	if (ret < 0 && ret != -EAGAIN) {
+		if (ret == -EIO) {
+			super->scan_state = CACHEFS_SCAN_INACTIVE;
+			_leave(" [io error]");
+			return;
+		}
+
+		_leave(" [defer io]");
+		return;
+	}
+
+	ASSERT(super->scan_node->page);
+
+	/* may have to wait for the disk */
+	if (ret == -EAGAIN) {
+		super->scan_loading = super->scan_node->page;
+		super->scan_state = CACHEFS_SCAN_VALIDATING_NODE;
+		if (PageUptodate(super->scan_loading) ||
+		    PageError(super->scan_loading))
+			set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+	}
+	else {
+		super->scan_state = CACHEFS_SCAN_SCANNING_NODE;
+		set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+	}
+
+	_leave("");
+
+} /* end cachefs_scan_descending() */
+
+/*****************************************************************************/
+/*
+ * handle validation of a node we've just loaded
+ */
+static void cachefs_scan_validating_node(struct cachefs_super *super)
+{
+	ASSERT(super->scan_node);
+	ASSERT(super->scan_loading);
+
+	_enter("%x{%d}", super->scan_node->bix, super->scan_node->scan_state);
+
+	super->scan_bix = super->scan_node->bix;
+
+	if (!PageUptodate(super->scan_loading) &&
+	    !PageError(super->scan_loading)
+	    ) {
+		_leave(" [defer]");
+		return;
+	}
+
+	super->scan_loading = NULL;
+
+	if (cachefs_node_validate(super, super->scan_node) < 0) {
+		super->scan_state = CACHEFS_SCAN_INACTIVE;
+		_leave(" [io error]");
+		return;
+	}
+
+	super->scan_state = CACHEFS_SCAN_SCANNING_NODE;
+	set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+	_leave("");
+
+} /* end cachefs_scan_validating_node() */
+
+/*****************************************************************************/
+/*
+ * check an object to see if it needs reaping
+ */
+static inline int cachefs_scan_check_reap(struct cachefs_super *super,
+					  struct cachefs_ondisc_leaf *leaf,
+					  struct cachefs_ondisc_reap_node *reap)
+{
+	if (reap) {
+		uint64_t objid;
+		int loop;
+
+		memcpy(&objid, leaf->u.object.parent, sizeof(objid));
+
+		for (loop = 0; loop < super->j.reap_proccnt; loop++)
+			if (reap->objids[loop] == objid)
+				return 1;
+	}
+
+	return 0;
+
+} /* end cachefs_scan_check_reap() */
+
+/*****************************************************************************/
+/*
+ * check an object to see if it should be added to the cull list
+ */
+static inline int cachefs_scan_maybe_cull(struct cachefs_super *super,
+					  struct cachefs_ondisc_leaf *leaf,
+					  uint16_t offset)
+{
+	struct cachefs_object *object;
+
+	/* attempt to insert the bookmark object we allocated earlier into the
+	 * cull queue */
+	object = super->scan_tmpobj;
+	super->scan_tmpobj = NULL;
+
+	object->offset	= offset;
+	object->node	= cachefs_tree_get(super->scan_node);
+	object->objid	= leaf->u.object.objid;
+	object->i_size	= leaf->u.object.size;
+	object->type	= leaf->u.object.object_type;
+	object->flags	= leaf->u.object.flags;
+	object->atime	= leaf->u.object.atime;
+	object->has_data = (leaf->ptr != CACHEFS_NULL_PTR);
+	object->data_levels = leaf->u.object.data_levels;
+
+	memcpy(&object->pobjid, leaf->u.object.parent, sizeof(object->pobjid));
+
+	/* attempt insertion into the cull queue */
+	_debug("add %p{%llx} [at %x]", object, object->objid, object->atime);
+
+	if (__cachefs_tree_link_object(super, object, super->scan_node, 1) < 0
+	    ) {
+		/* insertion failed */
+		_debug("- failed");
+
+		cachefs_tree_put(object->node);
+		object->node	= NULL;
+		object->objid	= 0;
+		super->scan_tmpobj = object;
+		return 0;
+	}
+
+	return 1;
+
+} /* end cachefs_scan_maybe_cull() */
+
+/*****************************************************************************/
+/*
+ * scan the current node from end to end
+ */
+static void cachefs_scan_scanning_node(struct cachefs_super *super)
+{
+	struct cachefs_ondisc_reap_node *reap;
+	struct cachefs_ondisc_leaf *leaf;
+	struct cachefs_object *object;
+	struct cachefs_tree *node, *next;
+	cachefs_block_t *ptr, bix;
+	void *data;
+	int offset;
+
+	node = super->scan_node;
+	ASSERT(node);
+
+	_enter("%x{%d}", node->bix, node->scan_state);
+
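+	/* scan_state encodes the cursor within this node: values in
+	 * [-CACHEFS_ONDISC_LEAF_PER_BLOCK, 0) index the object/shortcut leaf
+	 * slots still to be examined, values in [0,
+	 * CACHEFS_ONDISC_PTR_PER_BLOCK) index the pointer slots, and INT_MAX
+	 * means the node isn't currently being scanned */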
+	ASSERT(node->scan_state != INT_MAX);
+	ASSERT(node->scan_state >= -CACHEFS_ONDISC_LEAF_PER_BLOCK);
+	ASSERT(node->scan_state < CACHEFS_ONDISC_PTR_PER_BLOCK);
+	ASSERT(node->page);
+	ASSERT(PageUptodate(node->page));
+
+	/* make sure we have a selection of bookmarks available */
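+	/* (they're allocated up front as we mustn't sleep once the node's
+	 * spinlock is held below) */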
+	if (!super->scan_tmpnode) {
+		super->scan_tmpnode = cachefs_tree_alloc(GFP_NOFS);
+		if (!super->scan_tmpnode) {
+			_leave(" [defer]");
+			return;
+		}
+	}
+
+	if (!super->scan_tmpobj) {
+		struct cachefs_object *object;
+
+		object = kmem_cache_alloc(cachefs_object_jar, SLAB_KERNEL);
+		if (!object) {
+			_leave(" [defer]");
+			return;
+		}
+
+#if CACHEFS_DEBUG_OBJECT_ACCOUNTING
+		kdebug("- ALLOC TMPOBJ %p", object);
+#endif
+
+		/* initialise the object from the parent index */
+		atomic_set(&object->usage, 1);
+		atomic_set(&object->fscache_usage, 0);
+
+		fscache_object_init(&object->fscache);
+		object->fscache.cookie = NULL;
+		object->fscache.cache = &super->cache;
+
+		object->pobjid	= 0;
+		object->objid	= 0;
+		object->node	= NULL;
+		object->offset	= 0xffffU;
+		object->keylen	= 0;
+		object->type	= 0;
+		object->data_levels = 0;
+
+		super->scan_tmpobj = object;
+	}
+
+	/* we can only hold the node's rwlock whilst scanning, so we mustn't
+	 * sleep */
+	write_lock(&node->lock);
+
+	super->scan_bix = node->bix;
+
+	if (test_bit(CACHEFS_TREE_DETACHED, &node->flags)) {
+		/* node was deleted under us */
+		write_unlock(&node->lock);
+		super->scan_state = CACHEFS_SCAN_ASCENDING;
+		set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+		_leave(" [deleted]");
+		return;
+	}
+
+	data = kmap_atomic(node->page, KM_USER0);
+
+	/* consider object and shortcut leaves first */
+	if (node->scan_state < 0) {
+		reap = NULL;
+		if (super->page_reap_proc) {
+			ASSERT(super->j.reap_proccnt > 0);
+			reap = kmap_atomic(super->page_reap_proc, KM_USER1);
+		}
+
+		do {
+			offset = CACHEFS_ONDISC_LEAF_PER_BLOCK;
+			offset += node->scan_state;
+			offset <<= super->layout->leaf_shift;
+			leaf = data + offset;
+
+			_debug("scan leaf slot %04x [type %x]", offset, leaf->type);
+
+			switch (leaf->type) {
+				/* need to recurse into shortcuts */
+			case CACHEFS_ONDISC_OBJTYPE_SHORTCUT:
+				goto descend_through_shortcut;
+
+				/* need to consider objects for orphan reaping
+				 * and adding to the cull list */
+			case CACHEFS_ONDISC_OBJTYPE_INDEX_OBJECT:
+			case CACHEFS_ONDISC_OBJTYPE_DATA_OBJECT:
+			case CACHEFS_ONDISC_OBJTYPE_OTHER_OBJECT:
+				if (cachefs_scan_check_reap(super, leaf, reap))
+					goto reap_object;
+
+				if (cachefs_scan_maybe_cull(super, leaf,
+							    offset))
+					goto added_to_cull_list;
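+				/* fall through if neither reaped nor culled */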
+
+			case CACHEFS_ONDISC_OBJTYPE_EMPTY_SLOT:
+			default:
+				node->scan_state++;
+				continue;
+			}
+
+
+		} while (node->scan_state < 0);
+
+		if (reap)
+			kunmap_atomic(reap, KM_USER1);
+	}
+
+	/* then process pointer leaves */
+	while (node->scan_state < CACHEFS_ONDISC_PTR_PER_BLOCK) {
+		_debug("scan ptr leaf %04x",
+		       node->scan_state << CACHEFS_BLOCK_SHIFT);
+
+		offset = node->scan_state & CACHEFS_ONDISC_PTRPERLEAF_MASK;
+
+		/* skip object and shortcut leaves */
+		leaf = data + (offset << CACHEFS_BLOCK_SHIFT);
+		if (leaf->type != CACHEFS_NULL_PTR &&
+		    leaf->type < CACHEFS_ONDISC_OBJTYPE_FIRST_POINTER
+		    ) {
+			node->scan_state = offset + CACHEFS_ONDISC_PTRPERLEAF;
+			continue;
+		}
+
+		ptr = data;
+		do {
+			if (ptr[node->scan_state])
+				goto descend_through_pointer;
+			node->scan_state++;
+		} while (node->scan_state & ~CACHEFS_ONDISC_PTRPERLEAF_MASK);
+	}
+
+	/* finished this node */
+	kunmap_atomic(data, KM_USER0);
+	write_unlock(&node->lock);
+
+	super->scan_state = CACHEFS_SCAN_ASCENDING;
+	set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+	_leave("");
+	return;
+
+	/* if we added the temporary object to the cull list then we'll need a
+	 * new one, so go back to kcachefsd to see if it needs to do any work
+	 * - we'll pick up another temp object on reentry to this function
+	 */
+added_to_cull_list:
+	if (reap)
+		kunmap_atomic(reap, KM_USER1);
+	kunmap_atomic(data, KM_USER0);
+	node->scan_state++;
+	write_unlock(&node->lock);
+
+	set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+	_leave("");
+	return;
+
+	/* we use another state to reap an object */
+reap_object:
+	_debug("reap object %llx [%04x]", leaf->u.object.objid, offset);
+
+	if (reap)
+		kunmap_atomic(reap, KM_USER1);
+
+	/* need to insert the bookmark object we allocated earlier */
+	object = super->scan_tmpobj;
+	super->scan_tmpobj = NULL;
+
+	object->offset	= offset;
+	object->node	= cachefs_tree_get(node);
+	object->objid	= leaf->u.object.objid;
+	object->i_size	= leaf->u.object.size;
+	object->type	= leaf->u.object.object_type;
+	object->flags	= leaf->u.object.flags;
+	object->has_data = (leaf->ptr != CACHEFS_NULL_PTR);
+	object->data_levels = leaf->u.object.data_levels;
+
+	memcpy(&object->pobjid, leaf->u.object.parent, sizeof(object->pobjid));
+
+	kunmap_atomic(data, KM_USER0);
+
+	if (__cachefs_tree_link_object(super, object, node, 2) < 0)
+		goto already_reaping_this_object;
+
+	write_unlock(&node->lock);
+
+	super->scan_state = CACHEFS_SCAN_REAPING_OBJECT;
+	set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+	_leave(" [reap]");
+	return;
+
+	/* if the scan state on this node was reset by rearrangement, we might
+	 * see orphaned objects cropping up more than once */
+already_reaping_this_object:
+	_debug("already reaping");
+
+	cachefs_tree_put(node);
+	object->node	= NULL;
+	object->objid	= 0;
+	super->scan_tmpobj = object;
+
+	write_unlock(&node->lock);
+
+	node->scan_state++;
+	set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+	_leave(" [already reaping]");
+	return;
+
+	/* descend through a pointer */
+descend_through_pointer:
+	offset = node->scan_state << CACHEFS_BLOCK_SHIFT;
+	bix = ptr[node->scan_state];
+	_debug("descend ptr %04x to %x", offset, bix);
+	/* (data remains mapped; the descend label unmaps it) */
+
+	next = cachefs_tree_find_node(node, CACHEFS_TREE_TYPE_NODE, offset);
+	if (!next) {
+		/* need to insert the bookmark we allocated earlier */
+		next = super->scan_tmpnode;
+		super->scan_tmpnode = NULL;
+
+		__set_bit(CACHEFS_TREE_EXTANT, &next->flags);
+
+		next->parent	= cachefs_tree_get(node);
+		next->bix	= bix;
+		next->offset	= offset;
+		next->s_offset	= 0;
+		next->level	= node->level + 1;
+		next->type	= CACHEFS_TREE_TYPE_NODE;
+		next->occupancy	= -1;
+
+		__cachefs_tree_link_to_node(next, node);
+	}
+	else {
+		cachefs_tree_get(next);
+	}
+	goto descend;
+
+	/* descend through a shortcut */
+descend_through_shortcut:
+	_debug("descend shortcut %04x to %x", offset, leaf->ptr);
+
+	if (reap)
+		kunmap_atomic(reap, KM_USER1);
+
+	next = cachefs_tree_find_node(node, CACHEFS_TREE_TYPE_SHORTCUT, offset);
+	if (!next) {
+		/* need to insert the bookmark we allocated earlier */
+		next = super->scan_tmpnode;
+		super->scan_tmpnode = NULL;
+
+		__set_bit(CACHEFS_TREE_EXTANT, &next->flags);
+
+		next->parent	= cachefs_tree_get(node);
+		next->bix	= leaf->ptr;
+		next->offset	= offset;
+		next->s_offset	= leaf->u.shortcut.s_offset;
+		next->level	= leaf->u.shortcut.level;
+		next->type	= CACHEFS_TREE_TYPE_SHORTCUT;
+		next->occupancy	= -1;
+
+		__cachefs_tree_link_to_node(next, node);
+	}
+	else {
+		cachefs_tree_get(next);
+	}
+
+descend:
+	kunmap_atomic(data, KM_USER0);
+
+	/* make sure we come back somewhere different unless the scan is
+	 * restarted */
+	node->scan_state++;
+	write_unlock(&node->lock);
+	cachefs_tree_put(node);
+
+	/* set up the new node */
+	ASSERT(next->scan_state == INT_MAX);
+	next->scan_state = -CACHEFS_ONDISC_LEAF_PER_BLOCK;
+	super->scan_node = next;
+	super->scan_bix = next->bix;
+
+	super->scan_state = CACHEFS_SCAN_DESCENDING;
+	set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+	_leave(" [descend]");
+
+} /* end cachefs_scan_scanning_node() */
+
+/*****************************************************************************/
+/*
+ * ascend from the node we've just finished scanning to its parent
+ */
+static void cachefs_scan_ascending(struct cachefs_super *super)
+{
+	struct cachefs_tree *node, *parent;
+
+	node = super->scan_node;
+	ASSERT(node);
+
+	_enter("%x{%d}", node->bix, node->scan_state);
+
+	node->scan_state = INT_MAX;
+	super->scan_node = NULL;
+
+	if (!node->parent) {
+		/* if we're ascending from the root then we've done */
+		cachefs_tree_put(node);
+		super->scan_bix = 0;
+		super->scan_state = CACHEFS_SCAN_COMPLETING_SCAN;
+		set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+		_leave(" [complete]");
+		return;
+	}
+
+	/* ascend carefully to avoid racing with leaf rearrangement */
+	read_lock(&node->lock);
+	parent = cachefs_tree_get(node->parent);
+	read_unlock(&node->lock);
+	cachefs_tree_put(node);
+
+	_debug("ascent to %x [state %d]", parent->bix, parent->scan_state);
+
+	super->scan_node = parent;
+	super->scan_bix = parent->bix;
+	super->scan_state = CACHEFS_SCAN_SCANNING_NODE;
+	set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+	_leave("");
+
+} /* end cachefs_scan_ascending() */
+
+/*****************************************************************************/
+/*
+ * handle completion of scan
+ */
+static void cachefs_scan_completing_scan(struct cachefs_super *super)
+{
+	_enter("");
+
+	ASSERT(!super->scan_node);
+	ASSERT(!super->scan_reap);
+
+	if (super->page_reap_proc)
+		super->scan_state = CACHEFS_SCAN_COMPLETING_REAP;
+	else
+		super->scan_state = CACHEFS_SCAN_FINISHED;
+
+	set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+	_leave("");
+
+} /* end cachefs_scan_completing_scan() */
+
+/*****************************************************************************/
+/*
+ * wait for the final reap to complete
+ */
+static void cachefs_scan_completing_reap(struct cachefs_super *super)
+{
+	_enter("");
+
+	if (!super->reaper_target) {
+		super->scan_state = CACHEFS_SCAN_ADVANCING_REAP_LIST;
+		set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+	}
+
+	_leave("");
+
+} /* end cachefs_scan_completing_reap() */
+
+/*****************************************************************************/
+/*
+ * advance the reap processing stack to the next node
+ */
+static void cachefs_scan_advancing_reap_list(struct cachefs_super *super)
+{
+	_enter("{%x}", super->j.reap_processor);
+	BUG(); // TODO
+
+#if 0
+	struct cachefs_ondisc_reap_node *rnode;
+	cachefs_block_t bix;
+
+	_enter("{%x}", super->j.reap_processor);
+
+	ASSERT(super->page_reap_proc);
+
+	/* read the pointer to the next node in the reaping stack */
+	rnode = kmap_atomic(super->page_reap_proc, KM_USER0);
+	bix = rnode->next;
+	kunmap_atomic(rnode, KM_USER0);
+
+	/* attempt to release the current node */
+	down_write(&super->tree_wander_sem);
+
+	if (cachefs_trans_recycle_to_reclaim(super, super->j.reap_processor) == 0
+	    ) {
+		/* success */
+		cachefs_page_put(super->page_reap_proc);
+		super->page_reap_proc = NULL;
+		super->j.reap_processor = bix;
+		super->j.reap_proccnt = -1;
+		if (bix)
+			super->j.reap_proccnt = CACHEFS_ONDISC_REAP_OBJIDSPERNODE;
+
+		super->scan_state = CACHEFS_SCAN_SYNCING_CACHE;
+		set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+	}
+
+	up_write(&super->tree_wander_sem);
+	_leave("");
+#endif
+
+} /* end cachefs_scan_advancing_reap_list() */
+
+/*****************************************************************************/
+/*
+ * sync the cache to disk to dispose of the reap list node
+ */
+static void cachefs_scan_syncing_cache(struct cachefs_super *super)
+{
+	_enter("");
+
+	cachefs_sync(super, 0, 0);
+	super->scan_state = CACHEFS_SCAN_FINISHED;
+	set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+
+	_leave("");
+
+} /* end cachefs_scan_syncing_cache() */
+
+/*****************************************************************************/
+/*
+ * finished doing the scan
+ */
+static void cachefs_scan_finished(struct cachefs_super *super)
+{
+	_enter("");
+
+	ASSERT(!super->page_reap_proc);
+
+	super->scan_state = CACHEFS_SCAN_INACTIVE;
+
+	/* do another pass immediately if there are other reap stacks to
+	 * process */
+	if (super->j.reap_processor ||
+	    super->j.reap_stack ||
+	    super->j.reap_collector
+	    )
+		set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+
+	_leave("");
+
+} /* end cachefs_scan_finished() */
+
+/*****************************************************************************/
+/*
+ * handle reap of orphaned object
+ */
+static void cachefs_scan_reaping_object(struct cachefs_super *super)
+{
+	struct cachefs_object *object;
+	struct cachefs_tree *node;
+
+	object = super->scan_reap;
+	node = super->scan_node;
+
+	ASSERT(node);
+	ASSERT(object);
+
+	_enter("%x{%d},%llx", node->bix, node->scan_state, object->objid);
+
+	ASSERTIF(node->scan_state != INT_MAX, node->scan_state < 0);
+	ASSERT(node->page);
+	ASSERT(PageUptodate(node->page));
+
+	/* pass the object over to the reaper thread when it becomes ready */
+	super->scan_state = CACHEFS_SCAN_WAITING_FOR_REAPER;
+	if (cmpxchg(&super->reaper_target, NULL, super->scan_reap) != NULL) {
+		_leave(" [busy]");
+		return;
+	}
+
+	/* we can get back to scanning the tree whilst the reaper works in the
+	 * background */
+	smp_wmb();
+	super->scan_reap = NULL;
+	super->scan_state = CACHEFS_SCAN_SCANNING_NODE;
+	set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+
+	/* tell the reaper it's got business */
+	wake_up(&super->reaper_sleepq);
+
+	_leave("");
+
+} /* end cachefs_scan_reaping_object() */
+
+/*****************************************************************************/
+/*
+ * wait for the reaper to dispose of the object we gave it last time
+ */
+static void cachefs_scan_waiting_for_reaper(struct cachefs_super *super)
+{
+	_enter("%x{%d},%llx",
+	       super->scan_node->bix, super->scan_node->scan_state,
+	       super->scan_reap->objid);
+
+	if (!super->reaper_target) {
+		super->scan_state = CACHEFS_SCAN_REAPING_OBJECT;
+		set_bit(CACHEFS_SUPER_DO_SCAN, &super->flags);
+	}
+
+	_leave("");
+
+} /* end cachefs_scan_waiting_for_reaper() */
diff -uNrp linux-2.6.14-mm2/fs/cachefs/tree-update.c linux-2.6.14-mm2-cachefs/fs/cachefs/tree-update.c
--- linux-2.6.14-mm2/fs/cachefs/tree-update.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.14-mm2-cachefs/fs/cachefs/tree-update.c	2005-11-14 16:23:38.000000000 +0000
@@ -0,0 +1,175 @@
+/* tree-update.c: CacheFS indexing tree update
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+//#define __KENTER
+//#define __KLEAVE
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "cachefs-int.h"
+
+/*****************************************************************************/
+/*
+ * walk from the root of the tree to the object, sliding a read lock down the
+ * tree until we reach the node holding the specified object's leaf
+ * - the object must be resident in the tree and must be pinning the nodes on
+ *   the path through the tree
+ */
+struct cachefs_tree *cachefs_tree_slide_readlock(struct cachefs_super *super,
+						 struct cachefs_object *object)
+{
+	struct cachefs_object *xobject;
+	struct cachefs_tree *point, *branch;
+	uint16_t level, offset;
+
+	_enter("");
+
+	/* attempt to lock the node holding the object leaf directly
+	 * - lock the object to prevent the parent pointer changing whilst we
+	 *   do it
+	 */
+	read_lock(&object->lock);
+
+	if (down_read_trylock(&object->node->sem)) {
+		cachefs_tree_get(object->node);
+		read_unlock(&object->lock);
+		_leave(" = %p [fast]", object->node);
+		return object->node;
+	}
+
+	read_unlock(&object->lock);
+
+	/* walk the tree from the root looking for the object and
+	 * sliding the lock down appropriately */
+	point = cachefs_tree_get(super->metadata_tree);
+	level = 0;
+
+	down_read(&point->sem);
+
+begin_step:
+	/* extract the bits of key in which we're immediately interested */
+	offset = cachefs_extract_subkey_obj(object, level);
+
+	_debug("step %d subkey=%04x", level, offset);
+
+	/* start by checking the cached branches and shortcuts leading off of
+	 * this one
+	 */
+	read_lock(&point->lock);
+
+	xobject = cachefs_tree_find_object(point, object->offset);
+	if (xobject == object)
+		goto found_object;
+
+	branch = cachefs_tree_find_node(point, CACHEFS_TREE_TYPE_NODE, offset);
+	if (branch)
+		goto move_to_cached_branch;
+
+	branch = cachefs_tree_find_shortcut_obj(point, object);
+	if (branch)
+		goto move_to_cached_shortcut;
+
+	read_unlock(&point->lock);
+
+	/* uh oh... the object should be in the tree somewhere */
+	printk(KERN_ERR "Object missing from in-mem tree\n");
+	printk(KERN_ERR "- obj %llx node %p{%x} level %d offset %04x\n",
+	       object->objid, point, point->bix, level, offset);
+	BUG();
+
+	/* we found the object we were looking for
+	 * - return with the point node's semaphore still read-locked and a ref
+	 *   held on its usage count
+	 */
+found_object:
+	read_unlock(&point->lock);
+
+	_leave(" = %p [found]", point);
+	return point;
+
+	/* we found a suitable branch to move to in the topology cache */
+move_to_cached_shortcut:
+	_debug(">>>> skip to %p [lev %d]", branch, branch->level);
+	goto move_to_cached_branch2;
+
+move_to_cached_branch:
+	_debug(">>>> move to %p [lev %d]", branch, branch->level);
+
+move_to_cached_branch2:
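+	/* lock-couple downwards: take the child's semaphore before releasing
+	 * the parent's so that the path can't be rearranged beneath us */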
+	cachefs_tree_get(branch);
+	read_unlock(&point->lock);
+
+	down_read(&branch->sem);
+	up_read(&point->sem);
+	cachefs_tree_put(point);
+
+	ASSERT(branch->level > level);
+	level = branch->level;
+	point = branch;
+	goto begin_step;
+
+} /* end cachefs_tree_slide_readlock() */
+
+/*****************************************************************************/
+/*
+ * update an object in place
+ * - called by the netfs and also used to update the object flags on disk
+ */
+void cachefs_tree_update_object(struct cachefs_super *super,
+				struct cachefs_object *object)
+{
+	struct cachefs_ondisc_leaf *leaf;
+	struct fscache_cookie *cookie;
+	struct cachefs_tree *node;
+	uint16_t maxdlen, dlen;
+	void *data, *dbuf;
+
+	_enter(",{%llx,%x}", object->objid, object->offset);
+
+	ASSERT(object->key);
+
+	node = cachefs_tree_slide_readlock(super, object);
+
+	lock_page(node->page);
+	data = kmap_atomic(node->page, KM_USER0);
+
+	/* change the atime */
+	leaf = data + object->offset;
+	leaf->u.object.atime = CURRENT_TIME.tv_sec;
+	leaf->u.object.flags = object->flags;
+
+	ASSERT(leaf->type != CACHEFS_NULL_PTR);
+
+	/* update the netfs auxiliary data */
+	cookie = object->fscache.cookie;
+	if (cookie && cookie->def->get_aux) {
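+		/* the auxiliary data lives in the leaf immediately after the
+		 * netfs key and is limited to the space remaining in the
+		 * leaf */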
+		maxdlen = CACHEFS_ONDISC_LEAF_SIZE;
+
+		maxdlen -= offsetof(struct cachefs_ondisc_leaf,
+				    u.object.netfs_data);
+		maxdlen -= leaf->u.object.netfs_klen;
+		dbuf = leaf->u.object.netfs_data;
+		dbuf += leaf->u.object.netfs_klen;
+
+		dlen = cookie->def->get_aux(cookie->netfs_data, dbuf, maxdlen);
+		BUG_ON(dlen > maxdlen);
+		leaf->u.object.netfs_dlen = dlen;
+	}
+
+	/* schedule the page to be written back */
+	kunmap_atomic(data, KM_USER0);
+	set_page_dirty(node->page);
+	unlock_page(node->page);
+	up_read(&node->sem);
+	cachefs_tree_put(node);
+	_leave("");
+
+} /* end cachefs_tree_update_object() */

