On Thu, Sep 05, 2002 at 05:59:09PM +0200, Carles Xavier Munyoz Bald? wrote: > > Please, help me. I believe that the htree indexed directory is a very very > great improvement for the ext2/ext3 file system that must become a standard > as soon as possible. I'm not sure why you're not seeing the improvement based on the Daniel Phillip's ext2 patch (it worked for me) but let me give you an updated set of patches and a test script for ext3. I'm still working on cleaning it up, and the version I'm sending you still is missing some error checks and doesn't have the code cleanup that is still on-going, but it's been slightly better tested than the most bleeding edge code that I have. (Translation: I've tested it for two hours in my hotel room in Koeln, after I got back from the Linux Kongress Social. So **please** don't run this on a production system, at least not until you've done your own testing and validation.) (To Andreas: I've done the work to make the code support multiple hash algorithms, so we can now smoothly transition between filesystems that were running Daniel Phillip's original dx_hack_hash and the latest format as supported by this patch and e2fsprogs 1.28). Using this patch, it is very easy to demonstrate that if you don't turn on the indexed directory filesystem feature, the test scripts take a **lot** longer. With the indexed directory feature turned on, running the script only takes about 3 minutes of real time on my laptop, and the disk drive light was on most of the time. With the indexed directory feature turned off, running the script took over 15 minutes and had only completed a tiny fraction of the directory entries before I gave up and hit Control-C. So you should an orders-of-magnitude improvement with the indexed directory feature turned on. Caveats: This patch and the test script requires the use of e2fsprogs 1.28. Also, the mke2fs options in the test script are designed to stress the htree code using a minimal amount of disk space on my laptop. They should not be used for production filesystems. Finally, the debugfs commands to initialize the filesystem will be automatically done by mke2fs in the next release of e2fsprogs. So rest assured, we *are* working on the htree indexed directory code. I fully intend for this to get integrated into the 2.5 tree, even if we don't have the NFS readdir fixes done at first. - Ted
# This is a BitKeeper generated patch for the following project: # Project Name: Linux kernel tree # This patch format is intended for GNU patch command version 2.5 or higher. # This patch includes the following deltas: # ChangeSet 1.665 -> 1.668 # fs/ext3/namei.c 1.1 -> 1.4 # include/linux/ext3_fs.h 1.3 -> 1.6 # fs/ext3/Makefile 1.2 -> 1.3 # include/linux/ext3_jbd.h 1.2 -> 1.3 # fs/Config.in 1.15 -> 1.16 # fs/ext3/super.c 1.4 -> 1.6 # include/linux/ext3_fs_sb.h 1.2 -> 1.3 # (new) -> 1.2 fs/ext3/hash.c # # The following is the BitKeeper ChangeSet Log # -------------------------------------------- # 02/08/30 tytso@snap.thunk.org 1.666 # Apply http://www.gnuchina.org/~chrisl/htree-ext3-2.4.19-1.diff # -------------------------------------------- # 02/09/03 tytso@snap.thunk.org 1.667 # Use new TEA hash. # -------------------------------------------- # 02/09/05 tytso@think.thunk.org 1.668 # Add support for supporting multiple different types of hash versions # in the kernel, and use the default hash version in the superblock, # as well as the hash seed in the superblock. # -------------------------------------------- # diff -Nru a/fs/Config.in b/fs/Config.in --- a/fs/Config.in Thu Sep 5 20:42:11 2002 +++ b/fs/Config.in Thu Sep 5 20:42:11 2002 @@ -27,6 +27,7 @@ # dep_tristate ' Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS define_bool CONFIG_JBD $CONFIG_EXT3_FS dep_mbool ' JBD (ext3) debugging support' CONFIG_JBD_DEBUG $CONFIG_JBD +dep_mbool ' Ext3 hashed index (htree) support' CONFIG_EXT3_INDEX $CONFIG_JBD # msdos file systems tristate 'DOS FAT fs support' CONFIG_FAT_FS diff -Nru a/fs/ext3/Makefile b/fs/ext3/Makefile --- a/fs/ext3/Makefile Thu Sep 5 20:42:11 2002 +++ b/fs/ext3/Makefile Thu Sep 5 20:42:11 2002 @@ -10,7 +10,7 @@ O_TARGET := ext3.o obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ - ioctl.o namei.o super.o symlink.o + ioctl.o namei.o super.o symlink.o hash.o obj-m := $(O_TARGET) include $(TOPDIR)/Rules.make diff -Nru a/fs/ext3/hash.c b/fs/ext3/hash.c --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/fs/ext3/hash.c Thu Sep 5 20:42:11 2002 @@ -0,0 +1,219 @@ +/* + * linux/fs/ext3/hash.c + * + * Copyright (C) 2002 by Theodore Ts'o + * + * This file is released under the GPL v2. + * + * This file may be redistributed under the terms of the GNU Public + * License. + */ + +#include <linux/fs.h> +#include <linux/jbd.h> +#include <linux/sched.h> +#include <linux/ext3_fs.h> + +#define DELTA 0x9E3779B9 + +static void TEA_transform(__u32 buf[4], __u32 const in[]) +{ + __u32 sum = 0; + __u32 b0 = buf[0], b1 = buf[1]; + __u32 a = in[0], b = in[1], c = in[2], d = in[3]; + int n = 16; + + do { + sum += DELTA; + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); + } while(--n); + + buf[0] += b0; + buf[1] += b1; +} + +/* F, G and H are basic MD4 functions: selection, majority, parity */ +#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) + +/* + * The generic round function. The application is so specific that + * we don't bother protecting all the arguments with parens, as is generally + * good macro practice, in favor of extra legibility. + * Rotation is separate from addition to prevent recomputation + */ +#define ROUND(f, a, b, c, d, x, s) \ + (a += f(b, c, d) + x, a = (a << s) | (a >> (32-s))) +#define K1 0 +#define K2 013240474631UL +#define K3 015666365641UL + +/* + * Basic cut-down MD4 transform. Returns only 32 bits of result. + */ +static void halfMD4Transform (__u32 buf[4], __u32 const in[]) +{ + __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3]; + + /* Round 1 */ + ROUND(F, a, b, c, d, in[0] + K1, 3); + ROUND(F, d, a, b, c, in[1] + K1, 7); + ROUND(F, c, d, a, b, in[2] + K1, 11); + ROUND(F, b, c, d, a, in[3] + K1, 19); + ROUND(F, a, b, c, d, in[4] + K1, 3); + ROUND(F, d, a, b, c, in[5] + K1, 7); + ROUND(F, c, d, a, b, in[6] + K1, 11); + ROUND(F, b, c, d, a, in[7] + K1, 19); + + /* Round 2 */ + ROUND(G, a, b, c, d, in[1] + K2, 3); + ROUND(G, d, a, b, c, in[3] + K2, 5); + ROUND(G, c, d, a, b, in[5] + K2, 9); + ROUND(G, b, c, d, a, in[7] + K2, 13); + ROUND(G, a, b, c, d, in[0] + K2, 3); + ROUND(G, d, a, b, c, in[2] + K2, 5); + ROUND(G, c, d, a, b, in[4] + K2, 9); + ROUND(G, b, c, d, a, in[6] + K2, 13); + + /* Round 3 */ + ROUND(H, a, b, c, d, in[3] + K3, 3); + ROUND(H, d, a, b, c, in[7] + K3, 9); + ROUND(H, c, d, a, b, in[2] + K3, 11); + ROUND(H, b, c, d, a, in[6] + K3, 15); + ROUND(H, a, b, c, d, in[1] + K3, 3); + ROUND(H, d, a, b, c, in[5] + K3, 9); + ROUND(H, c, d, a, b, in[0] + K3, 11); + ROUND(H, b, c, d, a, in[4] + K3, 15); + + buf[0] += a; + buf[1] += b; + buf[2] += c; + buf[3] += d; +} + +#undef ROUND +#undef F +#undef G +#undef H +#undef K1 +#undef K2 +#undef K3 + +/* The old legacy hash */ +static __u32 dx_hack_hash (const char *name, int len) +{ + __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + while (len--) { + __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); + + if (hash & 0x80000000) hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return (hash0 << 1); +} + +static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i=0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = msg[i] + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +/* + * Returns the hash of a filename. If len is 0 and name is NULL, then + * this function can be used to test whether or not a hash version is + * supported. + * + * The seed is an 4 longword (32 bits) "secret" which can be used to + * uniquify a hash. If the seed is all zero's, then some default seed + * may be used. + * + * A particular hash version specifies whether or not the seed is + * represented, and whether or not the returned hash is 32 bits or 64 + * bits. 32 bit hashes will return 0 for the minor hash. + */ +int ext3fs_dirhash(int version, const char *name, int len, + const __u32 *seed, + __u32 *ret_hash, + __u32 *ret_minor_hash) +{ + __u32 hash; + __u32 minor_hash = 0; + const char *p; + int i; + __u32 in[8], buf[4]; + + /* Initialize the default seed for the hash checksum functions */ + buf[0] = 0x67452301; + buf[1] = 0xefcdab89; + buf[2] = 0x98badcfe; + buf[3] = 0x10325476; + + /* Check to see if the seed is all zero's */ + if (seed) { + for (i=0; i < 4; i++) { + if (seed[i]) + break; + } + if (i < 4) + memcpy(buf, seed, sizeof(buf)); + } + + switch (version) { + case DX_HASH_LEGACY: + hash = dx_hack_hash(name, len); + break; + case DX_HASH_HALF_MD4: + p = name; + while (len > 0) { + str2hashbuf(p, len, in, 8); + halfMD4Transform(buf, in); + len -= 32; + p += 32; + } + minor_hash = buf[2]; + hash = buf[1]; + break; + case DX_HASH_TEA: + p = name; + while (len > 0) { + str2hashbuf(p, len, in, 4); + TEA_transform(buf, in); + len -= 16; + p += 16; + } + hash = buf[0]; + minor_hash = buf[1]; + break; + default: + *ret_hash = 0; + return -1; + } + *ret_hash = hash & ~1; + if (ret_minor_hash) + *ret_minor_hash = minor_hash; + return 0; +} diff -Nru a/fs/ext3/namei.c b/fs/ext3/namei.c --- a/fs/ext3/namei.c Thu Sep 5 20:42:11 2002 +++ b/fs/ext3/namei.c Thu Sep 5 20:42:11 2002 @@ -16,6 +16,10 @@ * David S. Miller (davem@caip.rutgers.edu), 1995 * Directory entry file type support and forward compatibility hooks * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 + * Hash Tree Directory indexing (c) + * Daniel Phillips, 2001 + * Hash Tree Directory indexing porting + * Christopher Li, 2002 */ #include <linux/fs.h> @@ -38,6 +42,456 @@ #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) +static struct buffer_head *ext3_append(handle_t *handle, + struct inode *inode, + u32 *block, int *err) +{ + struct buffer_head *bh; + + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + + if ((bh = ext3_bread(handle, inode, *block, 1, err))) { + inode->i_size += inode->i_sb->s_blocksize; + EXT3_I(inode)->i_disksize = inode->i_size; + ext3_journal_get_write_access(handle,bh); + } + return bh; +} + +#ifndef assert +#define assert(test) J_ASSERT(test) +#endif + +#ifndef swap +#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) +#endif + +typedef struct { u32 v; } le_u32; +typedef struct { u16 v; } le_u16; + +#define dxtrace_on(command) command +#define dxtrace_off(command) +#define dxtrace dxtrace_off + +struct fake_dirent +{ + /*le*/u32 inode; + /*le*/u16 rec_len; + u8 name_len; + u8 file_type; +}; + +struct dx_countlimit +{ + le_u16 limit; + le_u16 count; +}; + +struct dx_entry +{ + le_u32 hash; + le_u32 block; +}; + +/* + * dx_root_info is laid out so that if it should somehow get overlaid by a + * dirent the two low bits of the hash version will be zero. Therefore, the + * hash version mod 4 should never be 0. Sincerely, the paranoia department. + */ + +struct dx_root +{ + struct fake_dirent dot; + char dot_name[4]; + struct fake_dirent dotdot; + char dotdot_name[4]; + struct dx_root_info + { + le_u32 reserved_zero; + u8 hash_version; + u8 info_length; /* 8 */ + u8 indirect_levels; + u8 unused_flags; + } + info; + struct dx_entry entries[0]; +}; + +struct dx_node +{ + struct fake_dirent fake; + struct dx_entry entries[0]; +}; + + +struct dx_frame +{ + struct buffer_head *bh; + struct dx_entry *entries; + struct dx_entry *at; +}; + +struct dx_map_entry +{ + u32 hash; + u32 offs; +}; + +struct dx_hash_info +{ + const char *name; + int namelen; + u32 hash; + u32 minor_hash; + int hash_version; + u32 *seed; +}; + +typedef struct ext3_dir_entry_2 ext3_dirent; +static inline unsigned dx_get_block (struct dx_entry *entry); +static void dx_set_block (struct dx_entry *entry, unsigned value); +static inline unsigned dx_get_hash (struct dx_entry *entry); +static void dx_set_hash (struct dx_entry *entry, unsigned value); +static unsigned dx_get_count (struct dx_entry *entries); +static unsigned dx_get_limit (struct dx_entry *entries); +static void dx_set_count (struct dx_entry *entries, unsigned value); +static void dx_set_limit (struct dx_entry *entries, unsigned value); +static unsigned dx_root_limit (struct inode *dir, unsigned infosize); +static unsigned dx_node_limit (struct inode *dir); +static struct dx_frame *dx_probe(struct inode *dir, + struct dx_hash_info *hinfo, + struct dx_frame *frame); +static void dx_release (struct dx_frame *frames); +static int dx_make_map (ext3_dirent *de, int size, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); +static void dx_sort_map(struct dx_map_entry *map, unsigned count); +static ext3_dirent *dx_copy_dirents (char *from, char *to, + struct dx_map_entry *map, int count); +static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); + + +#ifdef CONFIG_EXT3_INDEX +/* + * Future: use high four bits of block for coalesce-on-delete flags + * Mask them off for now. + */ + +static inline unsigned dx_get_block (struct dx_entry *entry) +{ + return le32_to_cpu(entry->block.v) & 0x00ffffff; +} + +static inline void dx_set_block (struct dx_entry *entry, unsigned value) +{ + entry->block.v = cpu_to_le32(value); +} + +static inline unsigned dx_get_hash (struct dx_entry *entry) +{ + return le32_to_cpu(entry->hash.v); +} + +static inline void dx_set_hash (struct dx_entry *entry, unsigned value) +{ + entry->hash.v = cpu_to_le32(value); +} + +static inline unsigned dx_get_count (struct dx_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->count.v); +} + +static inline unsigned dx_get_limit (struct dx_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->limit.v); +} + +static inline void dx_set_count (struct dx_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->count.v = cpu_to_le16(value); +} + +static inline void dx_set_limit (struct dx_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->limit.v = cpu_to_le16(value); +} + +static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) +{ + unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - + EXT3_DIR_REC_LEN(2) - infosize; + return 0? 20: entry_space / sizeof(struct dx_entry); +} + +static inline unsigned dx_node_limit (struct inode *dir) +{ + unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); + return 0? 22: entry_space / sizeof(struct dx_entry); +} + +/* + * Debug + */ +struct stats +{ + unsigned names; + unsigned space; + unsigned bcount; +}; + +static struct stats dx_show_leaf(struct dx_hash_info *hinfo, ext3_dirent *de, + int size, int show_names) +{ + unsigned names = 0, space = 0; + char *base = (char *) de; + u32 hash; + + printk("names: "); + while ((char *) de < base + size) + { + if (de->inode) + { + if (show_names) + { + int len = de->name_len; + char *name = de->name; + while (len--) printk("%c", *name++); + ext3fs_dirhash(hinfo->hash_version, de->name, + de->name_len, hinfo->seed, + &hash, 0); + printk(":%x.%u ", hash, ((char *) de - base)); + } + space += EXT3_DIR_REC_LEN(de->name_len); + names++; + } + de = (ext3_dirent *) ((char *) de + le16_to_cpu(de->rec_len)); + } + printk("(%i)\n", names); + return (struct stats) { names, space, 1 }; +} + +#if 0 +struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, + struct dx_entry *entries, int levels) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count = dx_get_count (entries), names = 0, space = 0, i; + unsigned bcount = 0; + struct buffer_head *bh; + int err; + printk("%i indexed blocks...\n", count); + for (i = 0; i < count; i++, entries++) + { + u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0; + u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; + struct stats stats; + printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); + if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue; + stats = levels? + dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): + dx_show_leaf(hinfo, (ext3_dirent *) bh->b_data, blocksize, 0); + names += stats.names; + space += stats.space; + bcount += stats.bcount; + brelse (bh); + } + if (bcount) + printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", + names, space/bcount,(space/bcount)*100/blocksize); + return (struct stats) { names, space, bcount}; +} +#endif + +/* + * Probe for a directory leaf block to search + */ + +static struct dx_frame * +dx_probe(struct inode *dir, struct dx_hash_info *hinfo, + struct dx_frame *frame_in) +{ + unsigned count, indirect; + struct dx_entry *at, *entries, *p, *q, *m; + struct dx_root *root; + struct buffer_head *bh; + struct dx_frame *frame = frame_in; + u32 hash; + int err; + + frame->bh = NULL; + if (!(bh = ext3_bread (NULL,dir, 0, 0,&err))) + goto fail; + root = (struct dx_root *) bh->b_data; + if (root->info.hash_version != DX_HASH_TEA && + root->info.hash_version != DX_HASH_HALF_MD4 && + root->info.hash_version != DX_HASH_LEGACY) { + ext3_warning(dir->i_sb, __FUNCTION__, + "Unrecognised inode hash code %d", + root->info.hash_version); + brelse(bh); + goto fail; + } + hinfo->hash_version = root->info.hash_version; + hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed; + ext3fs_dirhash(hinfo->hash_version, hinfo->name, + hinfo->namelen, hinfo->seed, &hinfo->hash, + &hinfo->minor_hash); + hash = hinfo->hash; + + if (root->info.unused_flags & 1) { + ext3_warning(dir->i_sb, __FUNCTION__, + "Unimplemented inode hash flags: %#06x", + root->info.unused_flags); + brelse(bh); + goto fail; + } + + if ((indirect = root->info.indirect_levels) > 1) { + ext3_warning(dir->i_sb, __FUNCTION__, + "Unimplemented inode hash depth: %#06x", + root->info.indirect_levels); + brelse(bh); + goto fail; + } + + entries = (struct dx_entry *)(((char *)&root->info) + + root->info.info_length); + assert(dx_get_limit(entries) == dx_root_limit(dir, + root->info.info_length)); + dxtrace (printk("Look up %x", hash)); + while (1) + { + count = dx_get_count(entries); + assert (count && count <= dx_get_limit(entries)); + p = entries + 1; + q = entries + count - 1; + while (p <= q) + { + m = p + (q - p)/2; + dxtrace(printk(".")); + if (dx_get_hash(m) > hash) + q = m - 1; + else + p = m + 1; + } + + if (0) // linear search cross check + { + unsigned n = count - 1; + at = entries; + while (n--) + { + dxtrace(printk(",")); + if (dx_get_hash(++at) > hash) + { + at--; + break; + } + } + assert (at == p - 1); + } + + at = p - 1; + dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); + frame->bh = bh; + frame->entries = entries; + frame->at = at; + if (!indirect--) return frame; + if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0,&err))) + goto fail2; + at = entries = ((struct dx_node *) bh->b_data)->entries; + assert (dx_get_limit(entries) == dx_node_limit (dir)); + frame++; + } +fail2: + while (frame >= frame_in) { + brelse(frame->bh); + frame--; + } +fail: + return NULL; +} + +static void dx_release (struct dx_frame *frames) +{ + if (frames[0].bh == NULL) + return; + + if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels) + brelse (frames[1].bh); + brelse(frames[0].bh); +} + +/* + * Directory block splitting, compacting + */ + +static int dx_make_map (ext3_dirent *de, int size, + struct dx_hash_info *hinfo, struct dx_map_entry map[]) +{ + int count = 0; + char *base = (char *) de; + while ((char *) de < base + size) + { + ext3fs_dirhash(hinfo->hash_version, de->name, + de->name_len, hinfo->seed, &map[count].hash, 0); + map[count].offs = (u32) ((char *) de - base); + de = (ext3_dirent *) ((char *) de + le16_to_cpu(de->rec_len)); + count++; + } + return count; +} + +static void dx_sort_map (struct dx_map_entry *map, unsigned count) +{ + struct dx_map_entry *p, *q, *top = map + count - 1; + int more; + /* Combsort until bubble sort doesn't suck */ + while (count > 2) + { + count = count*10/13; + if (count - 9 < 2) /* 9, 10 -> 11 */ + count = 11; + for (p = top, q = p - count; q >= map; p--, q--) + if (p->hash < q->hash) + swap(*p, *q); + } + /* Garden variety bubble sort */ + do { + more = 0; + q = top; + while (q-- > map) + { + if (q[1].hash >= q[0].hash) + continue; + swap(*(q+1), *q); + more = 1; + } + } while(more); +} + +static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) +{ + struct dx_entry *entries = frame->entries; + struct dx_entry *old = frame->at, *new = old + 1; + int count = dx_get_count(entries); + + assert(count < dx_get_limit(entries)); + assert(old < entries + count); + memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); + dx_set_hash(new, hash); + dx_set_block(new, block); + dx_set_count(entries, count + 1); +} +#endif + + +static void ext3_update_dx_flag(struct inode *inode) +{ + if (!test_opt(inode->i_sb, INDEX)) + EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; +} + /* * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure. * @@ -95,6 +549,15 @@ } /* + * p is at least 6 bytes before the end of page + */ +static inline ext3_dirent *ext3_next_entry(ext3_dirent *p) +{ + return (ext3_dirent *)((char*)p + le16_to_cpu(p->rec_len)); +} + + +/* * ext3_find_entry() * * finds an entry in the specified directory with the wanted name. It @@ -105,6 +568,8 @@ * The returned buffer_head has ->b_count elevated. The caller is expected * to brelse() it when appropriate. */ + + static struct buffer_head * ext3_find_entry (struct dentry *dentry, struct ext3_dir_entry_2 ** res_dir) { @@ -119,10 +584,80 @@ int num = 0; int nblocks, i, err; struct inode *dir = dentry->d_parent->d_inode; + int namelen; + const u8 *name; + unsigned blocksize; + ext3_dirent *de, *top; *res_dir = NULL; sb = dir->i_sb; + blocksize = sb->s_blocksize; + namelen = dentry->d_name.len; + name = dentry->d_name.name; + if (namelen > EXT3_NAME_LEN) + return NULL; + if (ext3_dx && is_dx(dir)) { + struct dx_hash_info hinfo; + u32 hash; + struct dx_frame frames[2], *frame; + hinfo.name = name; + hinfo.namelen = namelen; + if (!(frame = dx_probe (dir, &hinfo, frames))) + return NULL; + hash = hinfo.hash; +dxnext: + block = dx_get_block(frame->at); + if (!(bh = ext3_bread (NULL,dir, block, 0, &err))) + goto dxfail; + de = (ext3_dirent *) bh->b_data; + top = (ext3_dirent *) ((char *) de + blocksize - + EXT3_DIR_REC_LEN(0)); + for (; de < top; de = ext3_next_entry(de)) + if (ext3_match (namelen, name, de)) { + if (!ext3_check_dir_entry("ext3_find_entry", + dir, de, bh, + (block<<EXT3_BLOCK_SIZE_BITS(sb)) + +((char *)de - bh->b_data))) { + brelse (bh); + goto dxfail; + } + *res_dir = de; + goto dxfound; + } + brelse (bh); + /* Same hash continues in next block? Search on. */ + if (++(frame->at) == frame->entries + dx_get_count(frame->entries)) + { + struct buffer_head *bh2; + if (frame == frames) + goto dxfail; + if (++(frames->at) == frames->entries + dx_get_count(frames->entries)) + goto dxfail; + /* should omit read if not continued */ + if (!(bh2 = ext3_bread (NULL, dir, + dx_get_block(frames->at), + 0, &err))) + goto dxfail; + brelse (frame->bh); + frame->bh = bh2; + frame->at = frame->entries = ((struct dx_node *) bh2->b_data)->entries; + /* Subtle: the 0th entry has the count, find the hash in frame above */ + if ((dx_get_hash(frames->at) & -2) == hash) + goto dxnext; + goto dxfail; + } + if ((dx_get_hash(frame->at) & -2) == hash) + goto dxnext; +dxfail: + dxtrace(printk("%s not found\n", name)); + dx_release (frames); + return NULL; +dxfound: + dx_release (frames); + return bh; + } + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); start = dir->u.ext3_i.i_dir_start_lookup; if (start >= nblocks) @@ -237,6 +772,88 @@ de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; } +static ext3_dirent * +dx_copy_dirents (char *from, char *to, struct dx_map_entry *map, int count) +{ + unsigned rec_len = 0; + + while (count--) { + ext3_dirent *de = (ext3_dirent *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); + ((ext3_dirent *) to)->rec_len = rec_len; + to += rec_len; + map++; + } + return (ext3_dirent *) (to - rec_len); +} + +#ifdef CONFIG_EXT3_INDEX +static ext3_dirent *do_split(handle_t *handle, struct inode *dir, + struct buffer_head **bh,struct dx_frame *frame, + struct dx_hash_info *hinfo, int *error) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; + struct buffer_head *bh2; + u32 newblock; + unsigned MAX_DX_MAP = PAGE_CACHE_SIZE/EXT3_DIR_REC_LEN(1) + 1; + u32 hash2; + struct dx_map_entry map[MAX_DX_MAP]; + char *data1 = (*bh)->b_data, *data2, *data3; + unsigned split; + ext3_dirent *de, *de2; + + bh2 = ext3_append (handle, dir, &newblock, error); + if (!(bh2)) + { + brelse(*bh); + *bh = NULL; + return (ext3_dirent *)bh2; + } + + BUFFER_TRACE(*bh, "get_write_access"); + ext3_journal_get_write_access(handle, *bh); + BUFFER_TRACE(frame->bh, "get_write_access"); + ext3_journal_get_write_access(handle, frame->bh); + + data2 = bh2->b_data; + + count = dx_make_map ((ext3_dirent *) data1, blocksize, hinfo, map); + split = count/2; // need to adjust to actual middle + dx_sort_map (map, count); + hash2 = map[split].hash; + continued = hash2 == map[split - 1].hash; + dxtrace(printk("Split block %i at %x, %i/%i\n", + dx_get_block(frame->at), hash2, split, count-split)); + + /* Fancy dance to stay within two buffers */ + de2 = dx_copy_dirents (data1, data2, map + split, count - split); + data3 = (char *) de2 + de2->rec_len; + de = dx_copy_dirents (data1, data3, map, split); + memcpy(data1, data3, (char *) de + de->rec_len - data3); + de = (ext3_dirent *) ((char *) de - data3 + data1); // relocate de + de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); + de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); + dxtrace(dx_show_leaf (hinfo, (ext3_dirent *) data1, blocksize, 1)); + dxtrace(dx_show_leaf (hinfo, (ext3_dirent *) data2, blocksize, 1)); + + /* Which block gets the new entry? */ + if (hinfo->hash >= hash2) + { + swap(*bh, bh2); + de = de2; + } + dx_insert_block (frame, hash2 + continued, newblock); + ext3_journal_dirty_metadata (handle, bh2); + brelse (bh2); + ext3_journal_dirty_metadata (handle, frame->bh); + dxtrace(dx_show_index ("frame", frame->entries)); + return de; +} +#endif + + /* * ext3_add_entry() * @@ -251,6 +868,7 @@ /* * AKPM: the journalling code here looks wrong on the error paths */ + static int ext3_add_entry (handle_t *handle, struct dentry *dentry, struct inode *inode) { @@ -258,117 +876,290 @@ const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned long offset; - unsigned short rec_len; struct buffer_head * bh; - struct ext3_dir_entry_2 * de, * de1; - struct super_block * sb; + ext3_dirent *de; + struct super_block * sb = dir->i_sb; int retval; + unsigned short reclen = EXT3_DIR_REC_LEN(namelen); + struct dx_hash_info hinfo; - sb = dir->i_sb; + unsigned blocksize = sb->s_blocksize; + unsigned nlen, rlen; + u32 block, blocks; + char *top; if (!namelen) return -EINVAL; - bh = ext3_bread (handle, dir, 0, 0, &retval); - if (!bh) - return retval; - rec_len = EXT3_DIR_REC_LEN(namelen); - offset = 0; - de = (struct ext3_dir_entry_2 *) bh->b_data; - while (1) { - if ((char *)de >= sb->s_blocksize + bh->b_data) { - brelse (bh); - bh = NULL; - bh = ext3_bread (handle, dir, - offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval); - if (!bh) - return retval; - if (dir->i_size <= offset) { - if (dir->i_size == 0) { - brelse(bh); - return -ENOENT; + if (ext3_dx && is_dx(dir)) + { + struct dx_frame frames[2], *frame; + struct dx_entry *entries, *at; + char *data1; + + hinfo.name = name; + hinfo.namelen = namelen; + frame = dx_probe(dir, &hinfo, frames); + if (!frame) + return -EINVAL; /* XXX what's the right error? */ + entries = frame->entries; + at = frame->at; + + if (!(bh = ext3_bread (handle,dir, dx_get_block(frame->at), 0,&retval))) + goto dxfail1; + + BUFFER_TRACE(bh, "get_write_access"); + ext3_journal_get_write_access(handle, bh); + + data1 = bh->b_data; + de = (ext3_dirent *) data1; + top = data1 + (0? 200: blocksize); + while ((char *) de < top) + { + /* FIXME: check EEXIST and dir */ + nlen = EXT3_DIR_REC_LEN(de->name_len); + rlen = le16_to_cpu(de->rec_len); + if ((de->inode? rlen - nlen: rlen) >= reclen) + goto dx_add; + de = (ext3_dirent *) ((char *) de + rlen); + } + /* Block full, should compress but for now just split */ + dxtrace(printk("using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); + /* Need to split index? */ + if (dx_get_count(entries) == dx_get_limit(entries)) + { + u32 newblock; + unsigned icount = dx_get_count(entries); + int levels = frame - frames; + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; + if (levels && dx_get_count(frames->entries) == dx_get_limit(frames->entries)) + goto dxfull; + bh2 = ext3_append (handle, dir, &newblock, &retval); + if (!(bh2)) + goto dxfail2; + node2 = (struct dx_node *)(bh2->b_data); + entries2 = node2->entries; + node2->fake.rec_len = cpu_to_le16(blocksize); + node2->fake.inode = 0; + BUFFER_TRACE(frame->bh, "get_write_access"); + ext3_journal_get_write_access(handle, frame->bh); + if (levels) + { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); + dxtrace(printk("Split index %i/%i\n", icount1, icount2)); + + BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ + ext3_journal_get_write_access(handle, frames[0].bh); + + memcpy ((char *) entries2, (char *) (entries + icount1), + icount2 * sizeof(struct dx_entry)); + dx_set_count (entries, icount1); + dx_set_count (entries2, icount2); + dx_set_limit (entries2, dx_node_limit(dir)); + + /* Which index block gets the new entry? */ + if (at - entries >= icount1) { + frame->at = at = at - entries - icount1 + entries2; + frame->entries = entries = entries2; + swap(frame->bh, bh2); } - - ext3_debug ("creating next block\n"); - - BUFFER_TRACE(bh, "get_write_access"); - ext3_journal_get_write_access(handle, bh); - de = (struct ext3_dir_entry_2 *) bh->b_data; - de->inode = 0; - de->rec_len = le16_to_cpu(sb->s_blocksize); - dir->u.ext3_i.i_disksize = - dir->i_size = offset + sb->s_blocksize; - dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; - ext3_mark_inode_dirty(handle, dir); + dx_insert_block (frames + 0, hash2, newblock); + dxtrace(dx_show_index ("node", frames[1].entries)); + dxtrace(dx_show_index ("node", + ((struct dx_node *) bh2->b_data)->entries)); + ext3_journal_dirty_metadata(handle, bh2); + brelse (bh2); } else { - - ext3_debug ("skipping to next block\n"); - - de = (struct ext3_dir_entry_2 *) bh->b_data; + dxtrace(printk("Creating second level index...\n")); + memcpy((char *) entries2, (char *) entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); + + /* Set up root */ + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); + ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; + + /* Add new access path frame */ + frame = frames + 1; + frame->at = at = at - entries + entries2; + frame->entries = entries = entries2; + frame->bh = bh2; + ext3_journal_get_write_access(handle, frame->bh); } + ext3_journal_dirty_metadata(handle, frames[0].bh); } - if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh, - offset)) { - brelse (bh); - return -ENOENT; - } - if (ext3_match (namelen, name, de)) { + de = do_split(handle, dir, &bh, frame, &hinfo, &retval); + dx_release (frames); + if (!(de)) + goto fail; + nlen = EXT3_DIR_REC_LEN(de->name_len); + rlen = le16_to_cpu(de->rec_len); + goto add; + +dx_add: + dx_release (frames); + goto add; + +dxfull: + ext3_warning(sb, __FUNCTION__, "Directory index full!\n"); + retval = -ENOSPC; +dxfail2: + brelse(bh); +dxfail1: + dx_release (frames); + goto fail1; + } + + blocks = dir->i_size >> sb->s_blocksize_bits; + for (block = 0, offset = 0; block < blocks; block++) { + bh = ext3_bread(handle, dir, block, 0, &retval); + if(!bh) + return retval; + de = (ext3_dirent *)bh->b_data; + top = bh->b_data + blocksize - reclen; + while ((char *) de <= top) { + if (!ext3_check_dir_entry("ext3_add_entry", dir, de, + bh, offset)) { + brelse (bh); + return -EIO; + } + if (ext3_match (namelen, name, de)) { brelse (bh); return -EEXIST; - } - if ((le32_to_cpu(de->inode) == 0 && - le16_to_cpu(de->rec_len) >= rec_len) || - (le16_to_cpu(de->rec_len) >= - EXT3_DIR_REC_LEN(de->name_len) + rec_len)) { - BUFFER_TRACE(bh, "get_write_access"); - ext3_journal_get_write_access(handle, bh); - /* By now the buffer is marked for journaling */ - offset += le16_to_cpu(de->rec_len); - if (le32_to_cpu(de->inode)) { - de1 = (struct ext3_dir_entry_2 *) ((char *) de + - EXT3_DIR_REC_LEN(de->name_len)); - de1->rec_len = - cpu_to_le16(le16_to_cpu(de->rec_len) - - EXT3_DIR_REC_LEN(de->name_len)); - de->rec_len = cpu_to_le16( - EXT3_DIR_REC_LEN(de->name_len)); - de = de1; } - de->file_type = EXT3_FT_UNKNOWN; - if (inode) { - de->inode = cpu_to_le32(inode->i_ino); - ext3_set_de_type(dir->i_sb, de, inode->i_mode); - } else - de->inode = 0; - de->name_len = namelen; - memcpy (de->name, name, namelen); - /* - * XXX shouldn't update any times until successful - * completion of syscall, but too many callers depend - * on this. - * - * XXX similarly, too many callers depend on - * ext3_new_inode() setting the times, but error - * recovery deletes the inode, so the worst that can - * happen is that the times are slightly out of date - * and/or different from the directory change time. - */ - dir->i_mtime = dir->i_ctime = CURRENT_TIME; - dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; - ext3_mark_inode_dirty(handle, dir); - dir->i_version = ++event; - BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, bh); + nlen = EXT3_DIR_REC_LEN(de->name_len); + rlen = le16_to_cpu(de->rec_len); + if ((de->inode? rlen - nlen: rlen) >= reclen) + goto add; + de = (ext3_dirent *)((char *)de + rlen); + offset += rlen; + } + if (ext3_dx && blocks == 1 && test_opt(sb, INDEX)) + goto dx_make_index; + brelse(bh); + } + bh = ext3_append(handle, dir, &block, &retval); + if (!bh) + return retval; + de = (ext3_dirent *) bh->b_data; + de->inode = 0; + de->rec_len = cpu_to_le16(rlen = blocksize); + nlen = 0; + goto add; + +add: + BUFFER_TRACE(bh, "get_write_access"); + ext3_journal_get_write_access(handle, bh); + /* By now the buffer is marked for journaling */ + if (de->inode) { + ext3_dirent *de1 = (ext3_dirent *)((char *)de + nlen); + de1->rec_len = cpu_to_le16(rlen - nlen); + de->rec_len = cpu_to_le16(nlen); + de = de1; + } + de->file_type = EXT3_FT_UNKNOWN; + if (inode) { + de->inode = cpu_to_le32(inode->i_ino); + ext3_set_de_type(dir->i_sb, de, inode->i_mode); + } else + de->inode = 0; + de->name_len = namelen; + memcpy (de->name, name, namelen); + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend + * on this. + * + * XXX similarly, too many callers depend on + * ext3_new_inode() setting the times, but error + * recovery deletes the inode, so the worst that can + * happen is that the times are slightly out of date + * and/or different from the directory change time. + */ + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + ext3_update_dx_flag(dir); + dir->i_version = ++event; + ext3_mark_inode_dirty(handle, dir); + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, bh); + brelse(bh); + return 0; + +dx_make_index: + { + struct buffer_head *bh2; + struct dx_root *root; + struct dx_frame frames[2], *frame; + struct dx_entry *entries; + ext3_dirent *de2; + char *data1; + unsigned len; + + dxtrace(printk("Creating index\n")); + ext3_journal_get_write_access(handle, bh); + root = (struct dx_root *) bh->b_data; + + EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; + bh2 = ext3_append (handle, dir, &block, &retval); + if (!(bh2)) + { brelse(bh); - return 0; + return retval; } - offset += le16_to_cpu(de->rec_len); - de = (struct ext3_dir_entry_2 *) - ((char *) de + le16_to_cpu(de->rec_len)); + data1 = bh2->b_data; + + /* The 0th block becomes the root, move the dirents out */ + de = (ext3_dirent *) &root->info; + len = ((char *) root) + blocksize - (char *) de; + memcpy (data1, de, len); + de = (ext3_dirent *) data1; + top = data1 + len; + while (((char *) de2=(char*)de+le16_to_cpu(de->rec_len)) < top) + de = de2; + de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); + /* Initialize the root; the dot dirents already exist */ + de = (ext3_dirent *) (&root->dotdot); + de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2)); + memset (&root->info, 0, sizeof(root->info)); + root->info.info_length = sizeof(root->info); + root->info.hash_version = dir->i_sb->u.ext3_sb.s_def_hash_version; + entries = root->entries; + dx_set_block (entries, 1); + dx_set_count (entries, 1); + dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); + + /* Initialize as for dx_probe */ + hinfo.name = name; + hinfo.namelen = namelen; + hinfo.hash_version = root->info.hash_version; + hinfo.seed = dir->i_sb->u.ext3_sb.s_hash_seed; + ext3fs_dirhash(hinfo.hash_version, hinfo.name, + hinfo.namelen, hinfo.seed, &hinfo.hash, + &hinfo.minor_hash); + frame = frames; + frame->entries = entries; + frame->at = entries; + frame->bh = bh; + bh = bh2; + de = do_split(handle,dir, &bh, frame, &hinfo, &retval); + dx_release (frames); + if (!(de)) + return retval; + nlen = EXT3_DIR_REC_LEN(de->name_len); + rlen = le16_to_cpu(de->rec_len); + goto add; } - brelse (bh); - return -ENOSPC; +fail1: + return retval; +fail: + return -ENOENT; } + /* * ext3_delete_entry deletes a directory entry by merging it with the * previous entry @@ -451,7 +1242,8 @@ struct inode * inode; int err; - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -478,7 +1270,8 @@ struct inode *inode; int err; - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -507,7 +1300,8 @@ if (dir->i_nlink >= EXT3_LINK_MAX) return -EMLINK; - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -557,7 +1351,7 @@ if (err) goto out_no_entry; dir->i_nlink++; - dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + ext3_update_dx_flag(dir); ext3_mark_inode_dirty(handle, dir); d_instantiate(dentry, inode); out_stop: @@ -832,7 +1626,7 @@ ext3_mark_inode_dirty(handle, inode); dir->i_nlink--; inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; - dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + ext3_update_dx_flag(dir); ext3_mark_inode_dirty(handle, dir); end_rmdir: @@ -878,7 +1672,7 @@ if (retval) goto end_unlink; dir->i_ctime = dir->i_mtime = CURRENT_TIME; - dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + ext3_update_dx_flag(dir); ext3_mark_inode_dirty(handle, dir); inode->i_nlink--; if (!inode->i_nlink) @@ -904,7 +1698,8 @@ if (l > dir->i_sb->s_blocksize) return -ENAMETOOLONG; - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5); + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -959,7 +1754,8 @@ if (inode->i_nlink >= EXT3_LINK_MAX) return -EMLINK; - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS); + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + + EXT3_INDEX_EXTRA_TRANS_BLOCKS); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -995,7 +1791,8 @@ old_bh = new_bh = dir_bh = NULL; - handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2); + handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1077,7 +1874,7 @@ new_inode->i_ctime = CURRENT_TIME; } old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; - old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + ext3_update_dx_flag(old_dir); if (dir_bh) { BUFFER_TRACE(dir_bh, "get_write_access"); ext3_journal_get_write_access(handle, dir_bh); @@ -1089,7 +1886,7 @@ new_inode->i_nlink--; } else { new_dir->i_nlink++; - new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + ext3_update_dx_flag(new_dir); ext3_mark_inode_dirty(handle, new_dir); } } diff -Nru a/fs/ext3/super.c b/fs/ext3/super.c --- a/fs/ext3/super.c Thu Sep 5 20:42:11 2002 +++ b/fs/ext3/super.c Thu Sep 5 20:42:11 2002 @@ -529,6 +529,12 @@ "EXT3 Check option not supported\n"); #endif } + else if (!strcmp (this_char, "index")) +#ifdef CONFIG_EXT3_INDEX + set_opt (*mount_options, INDEX); +#else + printk("EXT3 index option not supported\n"); +#endif else if (!strcmp (this_char, "debug")) set_opt (*mount_options, DEBUG); else if (!strcmp (this_char, "errors")) { @@ -702,6 +708,12 @@ es->s_mtime = cpu_to_le32(CURRENT_TIME); ext3_update_dynamic_rev(sb); EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + + if (test_opt(sb, INDEX)) + EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX); + else if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) + set_opt (EXT3_SB(sb)->s_mount_opt, INDEX); + ext3_commit_super (sb, es, 1); if (test_opt (sb, DEBUG)) printk (KERN_INFO @@ -712,6 +724,7 @@ EXT3_BLOCKS_PER_GROUP(sb), EXT3_INODES_PER_GROUP(sb), sbi->s_mount_opt); + printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ", bdevname(sb->s_dev)); if (EXT3_SB(sb)->s_journal->j_inode == NULL) { @@ -886,6 +899,7 @@ return res; } + struct super_block * ext3_read_super (struct super_block * sb, void * data, int silent) { @@ -1062,6 +1076,9 @@ sbi->s_mount_state = le16_to_cpu(es->s_state); sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb)); + for (i=0; i < 4; i++) + sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); + sbi->s_def_hash_version = es->s_def_hash_version; if (sbi->s_blocks_per_group > blocksize * 8) { printk (KERN_ERR diff -Nru a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h --- a/include/linux/ext3_fs.h Thu Sep 5 20:42:11 2002 +++ b/include/linux/ext3_fs.h Thu Sep 5 20:42:11 2002 @@ -339,6 +339,7 @@ #define EXT3_MOUNT_WRITEBACK_DATA 0x0C00 /* No data ordering */ #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ +#define EXT3_MOUNT_INDEX 0x4000 /* Enable directory index */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H @@ -437,8 +438,11 @@ /*E0*/ __u32 s_journal_inum; /* inode number of journal file */ __u32 s_journal_dev; /* device number of journal file */ __u32 s_last_orphan; /* start of list of inodes to delete */ - -/*EC*/ __u32 s_reserved[197]; /* Padding to the end of the block */ + __u32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_reserved_char_pad; + __u16 s_reserved_word_pad; + __u32 s_reserved[192]; /* Padding to the end of the block */ }; #ifdef __KERNEL__ @@ -575,6 +579,28 @@ #define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1) #define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \ ~EXT3_DIR_ROUND) +/* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 + */ + +#ifdef CONFIG_EXT3_INDEX + enum {ext3_dx = 1}; + #define is_dx(dir) (EXT3_I(dir)->i_flags & EXT3_INDEX_FL) +#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) +#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) +#else + enum {ext3_dx = 0}; + #define is_dx(dir) 0 +#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) +#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) +#endif + +/* Legal values for the dx_root hash_version field: */ + +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 #ifdef __KERNEL__ /* @@ -618,6 +644,12 @@ unsigned long); /* fsync.c */ extern int ext3_sync_file (struct file *, struct dentry *, int); + +/* hash.c */ +extern int ext3fs_dirhash(int version, const char *name, int len, + const __u32 *seed, + __u32 *ret_hash, + __u32 *ret_minor_hash); /* ialloc.c */ extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int); diff -Nru a/include/linux/ext3_fs_sb.h b/include/linux/ext3_fs_sb.h --- a/include/linux/ext3_fs_sb.h Thu Sep 5 20:42:11 2002 +++ b/include/linux/ext3_fs_sb.h Thu Sep 5 20:42:11 2002 @@ -62,6 +62,8 @@ int s_inode_size; int s_first_ino; u32 s_next_generation; + u32 s_hash_seed[4]; + int s_def_hash_version; /* Journaling */ struct inode * s_journal_inode; diff -Nru a/include/linux/ext3_jbd.h b/include/linux/ext3_jbd.h --- a/include/linux/ext3_jbd.h Thu Sep 5 20:42:11 2002 +++ b/include/linux/ext3_jbd.h Thu Sep 5 20:42:11 2002 @@ -63,6 +63,8 @@ #define EXT3_RESERVE_TRANS_BLOCKS 12 +#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8 + int ext3_mark_iloc_dirty(handle_t *handle, struct inode *inode,
#!/bin/sh #dd if=/dev/zero of=test.img bs=8k count=30000 mke2fs -j -F -b 1024 -g 1024 -N 1200000 test.img debugfs -w -R "features dir_index" test.img debugfs -w -R "ssv def_hash_version tea" test.img debugfs -w -R "ssv hash_seed random" test.img mount -t ext3 -o loop test.img /mnt pushd /mnt mkdir test test2 test3 cd test seq -f "%06.0f" 1 100 | xargs touch cd .. cd test2 seq -f "%06.0f" 1 10000 | xargs touch cd .. cd test3 seq -f "%06.0f" 1 400000 | xargs touch cd .. popd umount /mnt