On 4/26/12 1:26 PM, J. Bruce Fields wrote: > How are you testing this? Basically as was suggested in the first ext4 patch series: I created a filesystem with 600,000 files in a dir: for x in $(seq 1 600000); do touch $x; done then exported and mounted that fs to localhost: mount -t nfs localhost:/mnt/export /mnt/nfs and then looked for dups: ls -l | dup -d If you have more .... substantial nfs testing I could do I'm all ears :) -Eric > --b. > > On Thu, Apr 26, 2012 at 01:10:39PM -0500, Eric Sandeen wrote: >> This is based on commit d1f5273e9adb40724a85272f248f210dc4ce919a >> ext4: return 32/64-bit dir name hash according to usage type >> by Fan Yong <yong.fan@xxxxxxxxxxxxx> >> >> Traditionally ext2/3/4 has returned a 32-bit hash value from llseek() >> to appease NFSv2, which can only handle a 32-bit cookie for seekdir() >> and telldir(). However, this causes problems if there are 32-bit hash >> collisions, since the NFSv2 server can get stuck resending the same >> entries from the directory repeatedly. >> >> Allow ext3 to return a full 64-bit hash (both major and minor) for >> telldir to decrease the chance of hash collisions. >> >> This patch does implement a new ext3_dir_llseek op, because with 64-bit >> hashes, nfs will attempt to seek to a hash "offset" which is much >> larger than ext3's s_maxbytes. So for dx dirs, we call >> generic_file_llseek_size() with the appropriate max hash value as the >> maximum seekable size. Otherwise we just pass through to >> generic_file_llseek(). 
>> >> Patch-updated-by: Bernd Schubert <bernd.schubert@xxxxxxxxxxxxxxxxxx> >> Patch-updated-by: Eric Sandeen <sandeen@xxxxxxxxxx> >> (blame us if something is not correct) >> >> Signed-off-by: Eric Sandeen <sandeen@xxxxxxxxxx> >> --- >> >> diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c >> index cc761ad..92490e9 100644 >> --- a/fs/ext3/dir.c >> +++ b/fs/ext3/dir.c >> @@ -21,30 +21,15 @@ >> * >> */ >> >> +#include <linux/compat.h> >> #include "ext3.h" >> >> static unsigned char ext3_filetype_table[] = { >> DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK >> }; >> >> -static int ext3_readdir(struct file *, void *, filldir_t); >> static int ext3_dx_readdir(struct file * filp, >> void * dirent, filldir_t filldir); >> -static int ext3_release_dir (struct inode * inode, >> - struct file * filp); >> - >> -const struct file_operations ext3_dir_operations = { >> - .llseek = generic_file_llseek, >> - .read = generic_read_dir, >> - .readdir = ext3_readdir, /* we take BKL. needed?*/ >> - .unlocked_ioctl = ext3_ioctl, >> -#ifdef CONFIG_COMPAT >> - .compat_ioctl = ext3_compat_ioctl, >> -#endif >> - .fsync = ext3_sync_file, /* BKL held */ >> - .release = ext3_release_dir, >> -}; >> - >> >> static unsigned char get_dtype(struct super_block *sb, int filetype) >> { >> @@ -55,6 +40,25 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) >> return (ext3_filetype_table[filetype]); >> } >> >> +/** >> + * Check if the given dir-inode refers to an htree-indexed directory >> + * (or a directory which could potentially get converted to use htree >> + * indexing). 
>> + * >> + * Return 1 if it is a dx dir, 0 if not >> + */ >> +static int is_dx_dir(struct inode *inode) >> +{ >> + struct super_block *sb = inode->i_sb; >> + >> + if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, >> + EXT3_FEATURE_COMPAT_DIR_INDEX) && >> + ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || >> + ((inode->i_size >> sb->s_blocksize_bits) == 1))) >> + return 1; >> + >> + return 0; >> +} >> >> int ext3_check_dir_entry (const char * function, struct inode * dir, >> struct ext3_dir_entry_2 * de, >> @@ -94,18 +98,13 @@ static int ext3_readdir(struct file * filp, >> unsigned long offset; >> int i, stored; >> struct ext3_dir_entry_2 *de; >> - struct super_block *sb; >> int err; >> struct inode *inode = filp->f_path.dentry->d_inode; >> + struct super_block *sb = inode->i_sb; >> int ret = 0; >> int dir_has_error = 0; >> >> - sb = inode->i_sb; >> - >> - if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, >> - EXT3_FEATURE_COMPAT_DIR_INDEX) && >> - ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || >> - ((inode->i_size >> sb->s_blocksize_bits) == 1))) { >> + if (is_dx_dir(inode)) { >> err = ext3_dx_readdir(filp, dirent, filldir); >> if (err != ERR_BAD_DX_DIR) { >> ret = err; >> @@ -227,22 +226,87 @@ out: >> return ret; >> } >> >> +static inline int is_32bit_api(void) >> +{ >> +#ifdef CONFIG_COMPAT >> + return is_compat_task(); >> +#else >> + return (BITS_PER_LONG == 32); >> +#endif >> +} >> + >> /* >> * These functions convert from the major/minor hash to an f_pos >> - * value. >> + * value for dx directories >> * >> - * Currently we only use major hash numer. This is unfortunate, but >> - * on 32-bit machines, the same VFS interface is used for lseek and >> - * llseek, so if we use the 64 bit offset, then the 32-bit versions of >> - * lseek/telldir/seekdir will blow out spectacularly, and from within >> - * the ext2 low-level routine, we don't know if we're being called by >> - * a 64-bit version of the system call or the 32-bit version of the >> - * system call. 
Worse yet, NFSv2 only allows for a 32-bit readdir >> - * cookie. Sigh. >> + * Upper layer (for example NFS) should specify FMODE_32BITHASH or >> + * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted >> + * directly on both 32-bit and 64-bit nodes, under such case, neither >> + * FMODE_32BITHASH nor FMODE_64BITHASH is specified. >> */ >> -#define hash2pos(major, minor) (major >> 1) >> -#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) >> -#define pos2min_hash(pos) (0) >> +static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) >> +{ >> + if ((filp->f_mode & FMODE_32BITHASH) || >> + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) >> + return major >> 1; >> + else >> + return ((__u64)(major >> 1) << 32) | (__u64)minor; >> +} >> + >> +static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) >> +{ >> + if ((filp->f_mode & FMODE_32BITHASH) || >> + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) >> + return (pos << 1) & 0xffffffff; >> + else >> + return ((pos >> 32) << 1) & 0xffffffff; >> +} >> + >> +static inline __u32 pos2min_hash(struct file *filp, loff_t pos) >> +{ >> + if ((filp->f_mode & FMODE_32BITHASH) || >> + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) >> + return 0; >> + else >> + return pos & 0xffffffff; >> +} >> + >> +/* >> + * Return 32- or 64-bit end-of-file for dx directories >> + */ >> +static inline loff_t ext3_get_htree_eof(struct file *filp) >> +{ >> + if ((filp->f_mode & FMODE_32BITHASH) || >> + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) >> + return EXT3_HTREE_EOF_32BIT; >> + else >> + return EXT3_HTREE_EOF_64BIT; >> +} >> + >> + >> +/* >> + * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both >> + * non-htree and htree directories, where the "offset" is in terms >> + * of the filename hash value instead of the byte offset. 
>> + * >> + * Because we may return a 64-bit hash that is well beyond s_maxbytes, >> + * we need to pass the max hash as the maximum allowable offset in >> + * the htree directory case. >> + * >> + * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX) >> + * will be invalid once the directory was converted into a dx directory >> + */ >> +loff_t ext3_dir_llseek(struct file *file, loff_t offset, int origin) >> +{ >> + struct inode *inode = file->f_mapping->host; >> + int dx_dir = is_dx_dir(inode); >> + >> + if (likely(dx_dir)) >> + return generic_file_llseek_size(file, offset, origin, >> + ext3_get_htree_eof(file)); >> + else >> + return generic_file_llseek(file, offset, origin); >> +} >> >> /* >> * This structure holds the nodes of the red-black tree used to store >> @@ -303,15 +367,16 @@ static void free_rb_tree_fname(struct rb_root *root) >> } >> >> >> -static struct dir_private_info *ext3_htree_create_dir_info(loff_t pos) >> +static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp, >> + loff_t pos) >> { >> struct dir_private_info *p; >> >> p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); >> if (!p) >> return NULL; >> - p->curr_hash = pos2maj_hash(pos); >> - p->curr_minor_hash = pos2min_hash(pos); >> + p->curr_hash = pos2maj_hash(filp, pos); >> + p->curr_minor_hash = pos2min_hash(filp, pos); >> return p; >> } >> >> @@ -401,7 +466,7 @@ static int call_filldir(struct file * filp, void * dirent, >> printk("call_filldir: called with null fname?!?\n"); >> return 0; >> } >> - curr_pos = hash2pos(fname->hash, fname->minor_hash); >> + curr_pos = hash2pos(filp, fname->hash, fname->minor_hash); >> while (fname) { >> error = filldir(dirent, fname->name, >> fname->name_len, curr_pos, >> @@ -426,13 +491,13 @@ static int ext3_dx_readdir(struct file * filp, >> int ret; >> >> if (!info) { >> - info = ext3_htree_create_dir_info(filp->f_pos); >> + info = ext3_htree_create_dir_info(filp, filp->f_pos); >> if (!info) >> return 
-ENOMEM; >> filp->private_data = info; >> } >> >> - if (filp->f_pos == EXT3_HTREE_EOF) >> + if (filp->f_pos == ext3_get_htree_eof(filp)) >> return 0; /* EOF */ >> >> /* Some one has messed with f_pos; reset the world */ >> @@ -440,8 +505,8 @@ static int ext3_dx_readdir(struct file * filp, >> free_rb_tree_fname(&info->root); >> info->curr_node = NULL; >> info->extra_fname = NULL; >> - info->curr_hash = pos2maj_hash(filp->f_pos); >> - info->curr_minor_hash = pos2min_hash(filp->f_pos); >> + info->curr_hash = pos2maj_hash(filp, filp->f_pos); >> + info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); >> } >> >> /* >> @@ -473,7 +538,7 @@ static int ext3_dx_readdir(struct file * filp, >> if (ret < 0) >> return ret; >> if (ret == 0) { >> - filp->f_pos = EXT3_HTREE_EOF; >> + filp->f_pos = ext3_get_htree_eof(filp); >> break; >> } >> info->curr_node = rb_first(&info->root); >> @@ -493,7 +558,7 @@ static int ext3_dx_readdir(struct file * filp, >> info->curr_minor_hash = fname->minor_hash; >> } else { >> if (info->next_hash == ~0) { >> - filp->f_pos = EXT3_HTREE_EOF; >> + filp->f_pos = ext3_get_htree_eof(filp); >> break; >> } >> info->curr_hash = info->next_hash; >> @@ -512,3 +577,15 @@ static int ext3_release_dir (struct inode * inode, struct file * filp) >> >> return 0; >> } >> + >> +const struct file_operations ext3_dir_operations = { >> + .llseek = ext3_dir_llseek, >> + .read = generic_read_dir, >> + .readdir = ext3_readdir, >> + .unlocked_ioctl = ext3_ioctl, >> +#ifdef CONFIG_COMPAT >> + .compat_ioctl = ext3_compat_ioctl, >> +#endif >> + .fsync = ext3_sync_file, >> + .release = ext3_release_dir, >> +}; >> diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h >> index b6515fd..fe5bef7 100644 >> --- a/fs/ext3/ext3.h >> +++ b/fs/ext3/ext3.h >> @@ -920,7 +920,11 @@ struct dx_hash_info >> u32 *seed; >> }; >> >> -#define EXT3_HTREE_EOF 0x7fffffff >> + >> +/* 32 and 64 bit signed EOF for dx directories */ >> +#define EXT3_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) >> +#define 
EXT3_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) >> + >> >> /* >> * Control parameters used by ext3_htree_next_block >> diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c >> index d10231d..ede315c 100644 >> --- a/fs/ext3/hash.c >> +++ b/fs/ext3/hash.c >> @@ -198,8 +198,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) >> return -1; >> } >> hash = hash & ~1; >> - if (hash == (EXT3_HTREE_EOF << 1)) >> - hash = (EXT3_HTREE_EOF-1) << 1; >> + if (hash == (EXT3_HTREE_EOF_32BIT << 1)) >> + hash = (EXT3_HTREE_EOF_32BIT - 1) << 1; >> hinfo->hash = hash; >> hinfo->minor_hash = minor_hash; >> return 0; >> -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html