This patch adds support in GFS2 for the xgetdents syscall by implementing the xreaddir file operation. GFS2 uses vbufs (buffer backed by a vector of pages) to store intermediate data like dirents, stat info and extended attribute keys/values to eventually bundle them into a container structure to return to the user. Signed-off-by: Abhi Das <adas@xxxxxxxxxx> --- fs/gfs2/Makefile | 3 +- fs/gfs2/dir.c | 80 ++-- fs/gfs2/dir.h | 13 +- fs/gfs2/export.c | 2 +- fs/gfs2/file.c | 17 +- fs/gfs2/incore.h | 6 + fs/gfs2/inode.c | 3 +- fs/gfs2/inode.h | 5 + fs/gfs2/ops_fstype.c | 4 + fs/gfs2/sys.c | 26 +- fs/gfs2/util.c | 9 + fs/gfs2/xattr.c | 27 +- fs/gfs2/xattr.h | 23 ++ fs/gfs2/xreaddir.c | 1024 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/gfs2/xreaddir.h | 84 +++++ 15 files changed, 1260 insertions(+), 66 deletions(-) create mode 100644 fs/gfs2/xreaddir.c create mode 100644 fs/gfs2/xreaddir.h diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index 8612820..da8253b 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -4,7 +4,8 @@ gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \ glops.o log.o lops.o main.o meta_io.o \ aops.o dentry.o export.o file.o \ ops_fstype.o inode.o quota.o \ - recovery.o rgrp.o super.o sys.o trans.o util.o + recovery.o rgrp.o super.o sys.o \ + trans.o util.o xreaddir.o gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 1a349f9..21f5926 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -74,15 +74,13 @@ #include "trans.h" #include "bmap.h" #include "util.h" +#include "xreaddir.h" #define IS_LEAF 1 /* Hashed (leaf) directory */ #define IS_DINODE 2 /* Linear (stuffed dinode block) directory */ #define MAX_RA_BLOCKS 32 /* max read-ahead blocks */ -#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) -#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) - struct qstr gfs2_qdot __read_mostly; struct qstr gfs2_qdotdot __read_mostly; @@ -1185,17 +1183,13 @@ out_kfree: * lt: returns -1 * eq: returns 0 */ - -static int compare_dents(const void *a, const void *b) +int compare_dents_i(const struct gfs2_dirent *dent_a, + const struct gfs2_dirent *dent_b) { - const struct gfs2_dirent *dent_a, *dent_b; u32 hash_a, hash_b; int ret = 0; - dent_a = *(const struct gfs2_dirent **)a; hash_a = be32_to_cpu(dent_a->de_hash); - - dent_b = *(const struct gfs2_dirent **)b; hash_b = be32_to_cpu(dent_b->de_hash); if (hash_a > hash_b) @@ -1217,6 +1211,12 @@ static int compare_dents(const void *a, const void *b) return ret; } +int compare_dents(const void *a, const void *b) +{ + return compare_dents_i(*(const struct gfs2_dirent **)a, + *(const struct gfs2_dirent **)b); +} + /** * do_filldir_main - read out directory entries * @dip: The GFS2 inode @@ -1234,13 +1234,14 @@ static int compare_dents(const void *a, const void *b) */ static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx, - const struct gfs2_dirent **darr, u32 entries, - int *copied) + struct gfs2_xrdir_ctx *xc, const struct gfs2_dirent **darr, + u32 entries, int *copied) { const struct gfs2_dirent *dent, *dent_next; u64 off, off_next; + u64 *dst_pos = xc ? 
&xc->xc_offset : &ctx->pos; unsigned int x, y; - int run = 0; + int run = 0, error = 0; sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL); @@ -1256,29 +1257,39 @@ static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx, dent_next = darr[y]; off_next = be32_to_cpu(dent_next->de_hash); off_next = gfs2_disk_hash2offset(off_next); - - if (off < ctx->pos) + if (off < *dst_pos) continue; - ctx->pos = off; + + *dst_pos = off; if (off_next == off) { - if (*copied && !run) + if (*copied && !run) { + if (xc) + gfs2_xrdir_partial_collect(xc); return 1; + } run = 1; } else run = 0; } else { - if (off < ctx->pos) + if (off < *dst_pos) continue; - ctx->pos = off; + *dst_pos = off; } - if (!dir_emit(ctx, (const char *)(dent + 1), - be16_to_cpu(dent->de_name_len), - be64_to_cpu(dent->de_inum.no_addr), - be16_to_cpu(dent->de_type))) - return 1; - + if (xc) { + error = gfs2_xrdir_collect_dents(dent, off, xc); + if (error) { + gfs2_xrdir_partial_collect(xc); + return 1; + } + } else { + if (!dir_emit(ctx, (const char *)(dent + 1), + be16_to_cpu(dent->de_name_len), + be64_to_cpu(dent->de_inum.no_addr), + be16_to_cpu(dent->de_type))) + return 1; + } *copied = 1; } @@ -1286,8 +1297,7 @@ static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx, do_filldir fxn, we get the next entry instead of the last one in the current leaf */ - ctx->pos++; - + (*dst_pos)++; return 0; } @@ -1311,8 +1321,8 @@ static void gfs2_free_sort_buffer(void *ptr) } static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, - int *copied, unsigned *depth, - u64 leaf_no) + struct gfs2_xrdir_ctx *xc, int *copied, + unsigned *depth, u64 leaf_no) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -1389,7 +1399,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, } while(lfn); BUG_ON(entries2 != entries); - error = do_filldir_main(ip, ctx, darr, entries, copied); + error = do_filldir_main(ip, ctx, xc, darr, entries, copied); out_free: for(i = 0; i < leaf; i++) brelse(larr[i]); @@ -1454,7 +1464,7 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index, */ static int dir_e_read(struct inode *inode, struct dir_context *ctx, - struct file_ra_state *f_ra) + struct gfs2_xrdir_ctx *xc, struct file_ra_state *f_ra) { struct gfs2_inode *dip = GFS2_I(inode); u32 hsize, len = 0; @@ -1465,7 +1475,7 @@ static int dir_e_read(struct inode *inode, struct dir_context *ctx, unsigned depth = 0; hsize = 1 << dip->i_depth; - hash = gfs2_dir_offset2hash(ctx->pos); + hash = gfs2_dir_offset2hash(xc ? 
xc->xc_offset : ctx->pos); index = hash >> (32 - dip->i_depth); if (dip->i_hash_cache == NULL) @@ -1477,7 +1487,7 @@ static int dir_e_read(struct inode *inode, struct dir_context *ctx, gfs2_dir_readahead(inode, hsize, index, f_ra); while (index < hsize) { - error = gfs2_dir_read_leaf(inode, ctx, + error = gfs2_dir_read_leaf(inode, ctx, xc, &copied, &depth, be64_to_cpu(lp[index])); if (error) @@ -1493,7 +1503,7 @@ static int dir_e_read(struct inode *inode, struct dir_context *ctx, } int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, - struct file_ra_state *f_ra) + struct gfs2_xrdir_ctx *xc, struct file_ra_state *f_ra) { struct gfs2_inode *dip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -1507,7 +1517,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, return 0; if (dip->i_diskflags & GFS2_DIF_EXHASH) - return dir_e_read(inode, ctx, f_ra); + return dir_e_read(inode, ctx, xc, f_ra); if (!gfs2_is_stuffed(dip)) { gfs2_consist_inode(dip); @@ -1539,7 +1549,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, error = -EIO; goto out; } - error = do_filldir_main(dip, ctx, darr, + error = do_filldir_main(dip, ctx, xc, darr, dip->i_entries, &copied); out: kfree(darr); diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index 126c65d..8d40590 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -12,6 +12,10 @@ #include <linux/dcache.h> #include <linux/crc32.h> +#include "util.h" + +#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) +#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) struct inode; struct gfs2_inode; @@ -25,6 +29,13 @@ struct gfs2_diradd { struct buffer_head *bh; }; +typedef int (*process_dent_t)(const struct gfs2_dirent *, loff_t, void *, filldir_t); +extern int compare_dents_i(const struct gfs2_dirent *dent_a, + const struct gfs2_dirent *dent_b); +extern int foreach_dent(u64 *offset, void *opaque, filldir_t filldir, + const struct gfs2_dirent **darr, u32 entries, + int *copied, process_dent_t pd_fn); + extern struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename, bool fail_on_exist); @@ -40,7 +51,7 @@ static inline void gfs2_dir_no_add(struct gfs2_diradd *da) } extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, - struct file_ra_state *f_ra); + struct gfs2_xrdir_ctx *xc, struct file_ra_state *f_ra); extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, const struct gfs2_inode *nip, unsigned int new_type); diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 8b9b377..1f5085d 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -114,7 +114,7 @@ static int gfs2_get_name(struct dentry *parent, char *name, if (error) return error; - error = gfs2_dir_read(dir, &gnfd.ctx, &f_ra); + error = gfs2_dir_read(dir, &gnfd.ctx, NULL, &f_ra); gfs2_glock_dq_uninit(&gh); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 26b3f95..d2d7561f 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -16,6 +16,8 @@ #include <linux/blkdev.h> #include <linux/mm.h> #include <linux/mount.h> +#include <linux/stat.h> +#include <linux/sort.h> #include <linux/fs.h> #include <linux/gfs2_ondisk.h> #include <linux/falloc.h> @@ -40,6 +42,7 @@ #include "rgrp.h" #include "trans.h" #include "util.h" +#include "xreaddir.h" /** * gfs2_llseek - seek to a location in a file @@ -100,7 +103,7 @@ static int gfs2_readdir(struct file *file, struct dir_context *ctx) if (error) return error; - error = gfs2_dir_read(dir, ctx, &file->f_ra); + 
error = gfs2_dir_read(dir, ctx, NULL, &file->f_ra); gfs2_glock_dq_uninit(&d_gh); @@ -562,8 +565,13 @@ int gfs2_open_common(struct inode *inode, struct file *file) return -ENOMEM; mutex_init(&fp->f_fl_mutex); - gfs2_assert_warn(GFS2_SB(inode), !file->private_data); + + if (S_ISDIR(inode->i_mode)) { + ret = gfs2_xrdir_ctx_init(fp, GFS2_SB(inode)); + if (ret) + return ret; + } file->private_data = fp; return 0; } @@ -617,6 +625,9 @@ static int gfs2_release(struct inode *inode, struct file *file) { struct gfs2_inode *ip = GFS2_I(inode); + if (S_ISDIR(ip->i_inode.i_mode)) + gfs2_xrdir_ctx_uninit((struct gfs2_file *)file->private_data); + kfree(file->private_data); file->private_data = NULL; @@ -1075,6 +1086,7 @@ const struct file_operations gfs2_file_fops = { const struct file_operations gfs2_dir_fops = { .iterate = gfs2_readdir, + .xreaddir = gfs2_xreaddir, .unlocked_ioctl = gfs2_ioctl, .open = gfs2_open, .release = gfs2_release, @@ -1105,6 +1117,7 @@ const struct file_operations gfs2_file_fops_nolock = { const struct file_operations gfs2_dir_fops_nolock = { .iterate = gfs2_readdir, + .xreaddir = gfs2_xreaddir, .unlocked_ioctl = gfs2_ioctl, .open = gfs2_open, .release = gfs2_release, diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 67d310c..f86b6d3 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -414,6 +414,7 @@ static inline struct gfs2_sbd *GFS2_SB(const struct inode *inode) struct gfs2_file { struct mutex f_fl_mutex; struct gfs2_holder f_fl_gh; + struct gfs2_xrdir_ctx *f_xrctx; }; struct gfs2_revoke_replay { @@ -570,6 +571,8 @@ struct gfs2_tune { unsigned int gt_complain_secs; unsigned int gt_statfs_quantum; unsigned int gt_statfs_slow; + unsigned int gt_max_vb_pages; /* Max pages to utilize for vector-page buffers */ + unsigned int gt_max_xrdir_dents; /* Maximum dents to process per collect cycle (conserves memory) */ }; enum { @@ -812,6 +815,9 @@ struct gfs2_sbd { struct dentry *debugfs_dentry_glocks; struct dentry *debugfs_dentry_glstats; struct dentry *debugfs_dentry_sbstats; + + /* Vector Pages accounting */ + atomic_t sd_vb_page_count; }; static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e62e594..46c3602 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1833,7 +1833,8 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, } } - generic_fillattr(inode, stat); + gfs2_getattr_i(ip, stat); + if (unlock) gfs2_glock_dq_uninit(&gh); else if (frozen_root && atomic_dec_and_test(&sdp->sd_frozen_root)) diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index ba4d949..665f508 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -93,6 +93,11 @@ err: return -EIO; } +static inline void gfs2_getattr_i(struct gfs2_inode *ip, struct kstat *stat) +{ + generic_fillattr(&ip->i_inode, stat); +} + extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, u64 no_addr, u64 no_formal_ino, int non_block); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index bc564c0..2d541ba 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -60,6 +60,8 @@ static void gfs2_tune_init(struct gfs2_tune *gt) gt->gt_new_files_jdata = 0; gt->gt_max_readahead = 1 << 18; gt->gt_complain_secs = 10; + gt->gt_max_vb_pages = 65536; + gt->gt_max_xrdir_dents = 25000; } static struct gfs2_sbd *init_sbd(struct super_block *sb) @@ -135,6 +137,8 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) atomic_set(&sdp->sd_frozen_root, 0); init_waitqueue_head(&sdp->sd_frozen_root_wait); + 
atomic_set(&sdp->sd_vb_page_count, 0); + return sdp; } diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 3ab566b..279aa86 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -548,8 +548,8 @@ static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf, return len; } -static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field, - int check_zero, const char *buf, size_t len) +static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field, int check_zero, + unsigned int min, unsigned int max, const char *buf, size_t len) { struct gfs2_tune *gt = &sdp->sd_tune; unsigned int x; @@ -562,6 +562,12 @@ static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field, if (check_zero && !x) return -EINVAL; + if (min && x < min) + return -EINVAL; + + if (max && x > max) + return -EINVAL; + spin_lock(>->gt_spin); *field = x; spin_unlock(>->gt_spin); @@ -578,13 +584,21 @@ static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ } \ TUNE_ATTR_3(name, name##_show, store) -#define TUNE_ATTR(name, check_zero) \ +#define TUNE_ATTR(name, check_zero) \ +static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len) \ +{ \ + return tune_set(sdp, &sdp->sd_tune.gt_##name, check_zero, 0, 0, buf, len); \ +} \ +TUNE_ATTR_2(name, name##_store) + +#define TUNE_ATTR_B(name, min, max) \ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ { \ - return tune_set(sdp, &sdp->sd_tune.gt_##name, check_zero, buf, len); \ + return tune_set(sdp, &sdp->sd_tune.gt_##name, 0, min, max, buf, len); \ } \ TUNE_ATTR_2(name, name##_store) + TUNE_ATTR(quota_warn_period, 0); TUNE_ATTR(quota_quantum, 0); TUNE_ATTR(max_readahead, 0); @@ -593,6 +607,8 @@ TUNE_ATTR(statfs_slow, 0); TUNE_ATTR(new_files_jdata, 0); TUNE_ATTR(statfs_quantum, 1); TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); +TUNE_ATTR_B(max_vb_pages, 32, 8388608); /* total capacity can be 128K to 32G bytes */ +TUNE_ATTR(max_xrdir_dents, 0); static struct attribute *tune_attrs[] = { &tune_attr_quota_warn_period.attr, @@ -603,6 +619,8 @@ static struct attribute *tune_attrs[] = { &tune_attr_statfs_quantum.attr, &tune_attr_quota_scale.attr, &tune_attr_new_files_jdata.attr, + &tune_attr_max_vb_pages.attr, + &tune_attr_max_xrdir_dents.attr, NULL, }; diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 2c1aee3..793f69e 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -301,6 +301,9 @@ static int vp_extend(struct vp_ctx *vpx, int size) { struct gfs2_sbd *sdp = vpx->vp_sdp; + if ((gfs2_tune_get(sdp, gt_max_vb_pages) + - atomic_read(&sdp->sd_vb_page_count)) < size) + goto out; /* first make room for more pointers */ if (size <= 0) return -EINVAL; @@ -317,6 +320,7 @@ static int vp_extend(struct vp_ctx *vpx, int size) goto out; vpx->vp_size += size; + atomic_add(size, &sdp->sd_vb_page_count); return 0; out: return -ENOMEM; @@ -328,6 +332,9 @@ int vp_init(struct gfs2_sbd *sdp, struct vbuf *vb, int init_cap) struct vp_ctx *vpx; cap = DIV_ROUND_UP(init_cap, PAGE_SIZE); + if ((gfs2_tune_get(sdp, gt_max_vb_pages) + - atomic_read(&sdp->sd_vb_page_count)) < cap) + goto out; vpx = kmalloc(sizeof(struct vp_ctx), GFP_KERNEL); if (vpx == NULL) @@ -344,6 +351,7 @@ int vp_init(struct gfs2_sbd *sdp, struct vbuf *vb, int init_cap) vpx->vp_baseptr = vpx->vp_top = page_address(vpx->vp_pages[0]); vpx->vp_sdp = sdp; + atomic_add(cap, &sdp->sd_vb_page_count); vb->v_ptr = vpx->vp_baseptr; vb->v_opaque = vpx; @@ -373,6 +381,7 @@ void vp_uninit(struct vbuf *vb) vp_free_pages(vpx); kfree(vpx->vp_pages); + atomic_sub(vpx->vp_size, 
&vpx->vp_sdp->sd_vb_page_count); kfree(vpx); vb->v_ptr = vb->v_opaque = NULL; } diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 0b81f78..f156b21 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -11,6 +11,7 @@ #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> +#include <linux/sort.h> #include <linux/xattr.h> #include <linux/gfs2_ondisk.h> #include <linux/posix_acl_xattr.h> @@ -19,6 +20,7 @@ #include "gfs2.h" #include "incore.h" #include "acl.h" +#include "dir.h" #include "xattr.h" #include "glock.h" #include "inode.h" @@ -27,6 +29,7 @@ #include "rgrp.h" #include "trans.h" #include "util.h" +#include "xreaddir.h" /** * ea_calc_size - returns the acutal number of bytes the request will take up @@ -72,10 +75,6 @@ static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize) return 0; } -typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh, - struct gfs2_ea_header *ea, - struct gfs2_ea_header *prev, void *private); - static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh, ea_call_t ea_call, void *data) { @@ -113,7 +112,7 @@ fail: return -EIO; } -static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) +int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) { struct buffer_head *bh, *eabh; __be64 *eablk, *end; @@ -374,28 +373,14 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, return 0; if (er->er_data_len) { - char *prefix = NULL; + char prefix[9]; unsigned int l = 0; char c = 0; if (ei->ei_size + ea_size > er->er_data_len) return -ERANGE; - switch (ea->ea_type) { - case GFS2_EATYPE_USR: - prefix = "user."; - l = 5; - break; - case GFS2_EATYPE_SYS: - prefix = "system."; - l = 7; - break; - case GFS2_EATYPE_SECURITY: - prefix = "security."; - l = 9; - break; - } - + l = ea_prefix(ea, prefix, 9); BUG_ON(l == 0); memcpy(er->er_data + ei->ei_size, prefix, l); diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h index d392f83..c09f090 100644 --- a/fs/gfs2/xattr.h +++ b/fs/gfs2/xattr.h @@ -10,6 +10,8 @@ #ifndef __EATTR_DOT_H__ #define __EATTR_DOT_H__ +#include "dir.h" + struct gfs2_inode; struct iattr; @@ -53,9 +55,30 @@ struct gfs2_ea_location { struct gfs2_ea_header *el_prev; }; +static __inline__ int ea_prefix(struct gfs2_ea_header *ea, char *buf, int size) +{ + BUG_ON(size < 9); + switch (ea->ea_type) { + case GFS2_EATYPE_USR: + strncpy(buf, "user.", 5); + return 5; + case GFS2_EATYPE_SYS: + strncpy(buf, "system.", 7); + return 7; + case GFS2_EATYPE_SECURITY: + strncpy(buf, "security.", 9); + return 9; + } + return 0; +} + extern int __gfs2_xattr_set(struct inode *inode, const char *name, const void *value, size_t size, int flags, int type); +typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, + struct gfs2_ea_header *prev, void *private); +extern int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data); extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size); extern int gfs2_ea_dealloc(struct gfs2_inode *ip); diff --git a/fs/gfs2/xreaddir.c b/fs/gfs2/xreaddir.c new file mode 100644 index 0000000..44e0232 --- /dev/null +++ b/fs/gfs2/xreaddir.c @@ -0,0 +1,1024 @@ +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/pagemap.h> +#include <linux/uio.h> +#include <linux/blkdev.h> +#include <linux/mm.h> +#include <linux/mount.h> +#include <linux/stat.h> +#include <linux/sort.h> +#include 
<linux/fs.h> +#include <linux/gfs2_ondisk.h> +#include <linux/falloc.h> +#include <linux/swap.h> +#include <linux/crc32.h> +#include <linux/writeback.h> +#include <asm/uaccess.h> +#include <linux/dlm.h> +#include <linux/dlm_plock.h> + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" +#include "xattr.h" +#include "xreaddir.h" + +static int gfs2_dirent_dot_or_dotdot(const struct gfs2_dirent *dent) +{ + const char *name = (char *)(dent + 1); + + if (be16_to_cpu(dent->de_type) == DT_DIR) { + if (be16_to_cpu(dent->de_name_len) == 1 && name[0] == '.') + return 1; + if (be16_to_cpu(dent->de_name_len) == 2 && + strncmp(name, "..", 2) == 0) + return 1; + } + return 0; +} + +/* + * Compare the inode blocks of two entries + */ +int ctx_compare_dent_iblks(void *opaque, const void *a, const void *b) +{ + struct gfs2_xrdir_ctx *xc = opaque; + const struct gfs2_xdirent *a_vb_p = *(struct gfs2_xdirent **)a; + const struct gfs2_xdirent *b_vb_p = *(struct gfs2_xdirent **)b; + u64 a_blkno, b_blkno; + + vp_read(&xc->xc_dirents, &a_blkno, &a_vb_p->x_ino, sizeof(u64)); + vp_read(&xc->xc_dirents, &b_blkno, &b_vb_p->x_ino, sizeof(u64)); + + if (a_blkno > b_blkno) + return 1; + else + return -1; +} + +/* + * Compare the xattr blocks of two entries + */ +int ctx_compare_dent_eablks(void *opaque, const void *a, const void *b) +{ + struct gfs2_xrdir_ctx *xc = opaque; + const struct gfs2_xdirent *a_vb_p = *(struct gfs2_xdirent **)a; + const struct gfs2_xdirent *b_vb_p = *(struct gfs2_xdirent **)b; + u64 a_blkno, b_blkno; + + vp_read(&xc->xc_dirents, &a_blkno, &a_vb_p->x_eablk, sizeof(u64)); + vp_read(&xc->xc_dirents, &b_blkno, &b_vb_p->x_eablk, sizeof(u64)); + + if (a_blkno > b_blkno) + return 1; + else + return -1; +} + +/* + * Compare two entries based on their hash value + */ +int ctx_compare_dents(void *opaque, const void *a, const void *b) +{ + struct gfs2_xrdir_ctx *xc = opaque; + const struct gfs2_xdirent *a_vb_p = *(struct gfs2_xdirent **)a; + const struct gfs2_xdirent *b_vb_p = *(struct gfs2_xdirent **)b; + u32 a_hash, b_hash; + int ret = 0; + + vp_read(&xc->xc_dirents, &a_hash, &a_vb_p->x_hash, sizeof(u32)); + vp_read(&xc->xc_dirents, &b_hash, &b_vb_p->x_hash, sizeof(u32)); + + if (a_hash > b_hash) + ret = 1; + else if (a_hash < b_hash) + ret = -1; + else { + unsigned int len_a, len_b; + vp_read(&xc->xc_dirents, &len_a, &a_vb_p->x_namelen, sizeof(unsigned int)); + vp_read(&xc->xc_dirents, &len_b, &b_vb_p->x_namelen, sizeof(unsigned int)); + + if (len_a > len_b) + ret = 1; + else if (len_a < len_b) + ret = -1; + else { + char *a, *b, *buf; + buf = kmalloc(len_a * 2, GFP_KERNEL); + if (buf == NULL) { + ret = 0; + goto out; + } + a = buf; + b = buf + len_a; + + vp_read(&xc->xc_dirents, a, a_vb_p->x_name, len_a); + vp_read(&xc->xc_dirents, b, b_vb_p->x_name, len_b); + + ret = memcmp(a, b, len_a); + + kfree(buf); + } + } +out: + return ret; +} + +void gfs2_xrdir_ctx_uninit(struct gfs2_file *fp) +{ + struct gfs2_xrdir_ctx *xc; + + if (!fp || !fp->f_xrctx) + return; + + xc = fp->f_xrctx; + if (xc->xc_vb_dptrs) + kfree(xc->xc_vb_dptrs); + vp_uninit(&xc->xc_xattr_values); + vp_uninit(&xc->xc_xattr_keys); + vp_uninit(&xc->xc_dirents); + kfree(xc); + fp->f_xrctx = NULL; +} + +int gfs2_xrdir_ctx_init(struct gfs2_file *fp, struct gfs2_sbd *sdp) +{ + struct gfs2_xrdir_ctx *xc; + if (!fp) + return -EINVAL; + + 
BUG_ON(fp->f_xrctx != NULL); + + xc = kzalloc(sizeof(struct gfs2_xrdir_ctx), GFP_KERNEL); + if (xc == NULL) + return -ENOMEM; + + if (vp_init(sdp, &xc->xc_dirents, 1) || + vp_init(sdp, &xc->xc_xattr_keys, 1) || + vp_init(sdp, &xc->xc_xattr_values, 1)) { + gfs2_xrdir_ctx_uninit(fp); + kfree(xc); + return -ENOMEM; + } + xc->xc_flags |= XC_FL_ALLOCATED; + fp->f_xrctx = xc; + + return 0; +} + +/* + * There was an error while collecting entries. + * Figure out what happened and twiddle flags + * appropriately. + */ +void gfs2_xrdir_partial_collect(struct gfs2_xrdir_ctx *xc) +{ + if (xc->xc_flags & XC_FL_GATHER_PART_INT || + xc->xc_flags & XC_FL_ERROR) + return; + + /* + * We encountered a hash collision situation. We've read + * entries in hash order up to the point (not including) + * the colliding hashes. Setting XC_FL_HASH_COLL denotes + * that. Also setting XC_FL_HASH_COLL_NXT so we know + * that the next time we collect entries, the hash + * colliding entries will be part of the collection + */ + xc->xc_flags |= (XC_FL_HASH_COLL | XC_FL_HASH_COLL_NXT); + xc->xc_flags |= (XC_FL_GATHER_PARTS | XC_FL_GATHER_PART_INT); + xc->xc_hash_coll_off = xc->xc_offset; + + return; +} + +/* + * We have run out of memory while collecting entries and + * don't have a single entry to return to the user. We deal + * with such a situation by halving the number of dents we + * tried to read last time and returning -EAGAIN to the user + * so we can have a go at it again + */ +static int gfs2_xrdir_handle_oom(struct gfs2_xrdir_ctx *xc) +{ + /* next time, only try half the number of dents */ + xc->xc_dent_cap = DIV_ROUND_UP(xc->xc_count, 2); + /* clear out some flags */ + xc->xc_flags &= ~(XC_FL_ERROR_OOM | XC_FL_ERROR); + xc->xc_flags &= ~XC_FL_GATHER_PART_INT; + /* In an oom situation, we're going to re-read fewer + * entries from the same collection. This may or may + * not hit the hash collision we recorded (if any). + * So, we reset the relevant flags */ + xc->xc_flags &= ~(XC_FL_HASH_COLL | XC_FL_HASH_COLL_NXT); + xc->xc_hash_coll_off = 0; + + return -EAGAIN; +} + +static int gfs2_xrdir_collect_errcheck(struct gfs2_xrdir_ctx *xc, int error) +{ + if (error < 0) { /* If we're out of memory */ + if (error == -ENOMEM) + xc->xc_flags |= XC_FL_ERROR_OOM; + xc->xc_flags |= XC_FL_ERROR; + return error; + } else { + if ((xc->xc_dent_cap && xc->xc_count >= xc->xc_dent_cap) || + (xc->xc_dent_memcap && vp_get_size(&xc->xc_dirents) + >= xc->xc_dent_memcap)) { + /* We hit one of our limits, flag and return */ + xc->xc_flags |= XC_FL_GATHER_PARTS; + xc->xc_flags |= XC_FL_GATHER_PART_INT; + return -EOVERFLOW; + } + return 0; + } +} + +/* + * To reduce disk-seeking, we collect all the info in stages. + * In each stage, we access relevant disk blocks in order + * by pre-sorting the entries correspondingly. + * + * 1. Collect entry info (name, ino, type, offset) etc for all the + * entries. Obtained by reading the directory inode + * 2. Collect stat info for all the entries. Obtained by reading + * the file inode blocks. + * 3. Collect xattr info for all the entries. Obtained by reading + * the eattr block of each inode. + * + * With this scheme of collecting data, we don't know what the final + * size of a dirent would be ahead of time. gfs2_xrdir_estimate_dent_memcap() + * attempts to guess the size. Right now it statically computes and + * reserves a fixed percentage of available space for entry+stat info + * and xattr info based on what data is requested by the user. + * + * TODO: Make this dynamic. 
Analyse the directory being processed + * and use observed ratios to improve throughput. + */ +static u64 gfs2_xrdir_estimate_dent_memcap(struct gfs2_sbd *sdp, + struct gfs2_xrdir_ctx *xc) +{ + u64 avail; + int perc = 80; + unsigned int mask = xc->xc_xattr_mask; + + avail = (gfs2_tune_get(sdp, gt_max_vb_pages) + + vp_get_page_count(&xc->xc_dirents) + + vp_get_page_count(&xc->xc_xattr_keys) + + vp_get_page_count(&xc->xc_xattr_values) - + atomic_read(&sdp->sd_vb_page_count)) * PAGE_SIZE; + if ((mask & XSTAT_XATTR_ALL) && (mask & XSTAT_XATTR_VALUES)) + perc = 50; + + return (avail * perc) / 100; +} + +/* + * We setup the xreaddir context before every collect run + */ +static int gfs2_xrdir_ctx_setup(struct file *file, struct gfs2_xrdir_ctx *xc, + unsigned int flags, unsigned int mask) +{ + struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); + + if (!(xc->xc_flags & XC_FL_GATHER_PARTS)) { + /* + * We only update flags and mask once per readdirplus + * initiation. If there are multiple parts, use the + * same values as initialized at the start + */ + xc->xc_xst_flags = flags; + xc->xc_xattr_mask = mask; + xc->xc_offset = file->f_pos; + } + + /* + * Set limits for this part based on how much memory is available + * or how many entries per cycle as defined by sysfs file. + * If dent_cap established in a previous run, leave it alone + */ + xc->xc_dent_cap = xc->xc_dent_cap ? xc->xc_dent_cap : + gfs2_tune_get(sdp, gt_max_xrdir_dents); + xc->xc_dent_memcap = gfs2_xrdir_estimate_dent_memcap(sdp, xc); + + xc->xc_dent_valid = 0; + xc->xc_count = 0; + xc->xc_next_dent = NULL; + kfree(xc->xc_vb_dptrs); + xc->xc_vb_dptrs = NULL; + vp_reset(&xc->xc_dirents); + vp_reset(&xc->xc_xattr_keys); + vp_reset(&xc->xc_xattr_values); + + return 0; +} + +/* + * Add a gfs2_dirent to the xreaddir context + */ +int gfs2_xrdir_collect_dents(const struct gfs2_dirent *dent, loff_t off, + struct gfs2_xrdir_ctx *xc) +{ + struct gfs2_xdirent *x; + u64 x_ino; + u32 x_hash; + u8 x_valid = 0; + char x_type; + unsigned int x_xattr_count, x_namelen; + const void *nullptr = NULL; + int error = 0; + + if (gfs2_dirent_dot_or_dotdot(dent)) + return 0; + + if (xc->xc_next_dent == NULL) + xc->xc_next_dent = xc->xc_dirents.v_ptr; + x = xc->xc_next_dent; + vp_memset(&xc->xc_dirents, x, 0, sizeof(struct gfs2_xdirent)); + + /* + * If we know that we're encountering hash-colliding + * entries this time around, we read only these in + * and nothing else + */ + if (xc->xc_flags & XC_FL_HASH_COLL_NXT && + off != xc->xc_hash_coll_off) { + /* + * setting dent_cap to how many we've read in + * so we don't read anymore + */ + xc->xc_dent_cap = xc->xc_count; + xc->xc_flags &= ~XC_FL_HASH_COLL_NXT; + /* + * xc_offset will get incremented to read + * at the next offset when everything + * is written out properly this cycle + */ + xc->xc_offset = xc->xc_hash_coll_off; + xc->xc_hash_coll_off = 0; + goto err_check; + } + + /* Copy the dirent contents */ + x_ino = be64_to_cpu(dent->de_inum.no_addr); + x_hash = be32_to_cpu(dent->de_hash); + x_type = be16_to_cpu(dent->de_type); + x_xattr_count = 0; + x_namelen = be16_to_cpu(dent->de_name_len); + + error = vp_write(&xc->xc_dirents, &x->x_ino, &x_ino, sizeof(x->x_ino)); + if (error != sizeof(x->x_ino)) goto err_check; + + error = vp_write(&xc->xc_dirents, &x->x_hash, &x_hash, sizeof(x->x_hash)); + if (error != sizeof(x->x_hash)) goto err_check; + + error = vp_write(&xc->xc_dirents, &x->x_valid, &x_valid, sizeof(x->x_valid)); + if (error != sizeof(x->x_valid)) goto err_check; + + error = 
vp_write(&xc->xc_dirents, &x->x_type, &x_type, sizeof(x->x_type)); + if (error != sizeof(x->x_type)) goto err_check; + + error = vp_write(&xc->xc_dirents, &x->x_xattr_count, &x_xattr_count, + sizeof(x->x_xattr_count)); + if (error != sizeof(x->x_xattr_count)) goto err_check; + + error = vp_write(&xc->xc_dirents, &x->x_vb_xattr_arr_ptr, &nullptr, + sizeof(x->x_vb_xattr_arr_ptr)); + if (error != sizeof(x->x_vb_xattr_arr_ptr)) goto err_check; + + error = vp_write(&xc->xc_dirents, &x->x_namelen, &x_namelen, + sizeof(x->x_namelen)); + if (error != sizeof(x->x_namelen)) goto err_check; + + error = vp_write(&xc->xc_dirents, &x->x_name, (char*)(dent + 1), x_namelen); + if (error != x_namelen) goto err_check; + + xc->xc_next_dent = x->x_name + x_namelen; + xc->xc_count++; + error = 0; +err_check: + return gfs2_xrdir_collect_errcheck(xc, error); +} + +/* + * Create the array of pointers that point to all the + * collected entries within the xc_dirents vbuf. + */ +static int gfs2_xrdir_create_dptrs(struct gfs2_xrdir_ctx *xc) +{ + int i; + unsigned int namelen; + struct gfs2_xdirent *x = NULL; + + BUG_ON(xc->xc_vb_dptrs || xc->xc_count == 0); + + /* allocate the dirent pointers */ + xc->xc_vb_dptrs = kmalloc(sizeof(struct gfs2_xdirent *) * xc->xc_count, + GFP_KERNEL); + if (xc->xc_vb_dptrs == NULL) + return -ENOMEM; + + for (i = 0; i < xc->xc_count; i++) { + if (!x) + x = xc->xc_dirents.v_ptr; + xc->xc_vb_dptrs[i] = x; + vp_read(&xc->xc_dirents, &namelen, &x->x_namelen, + sizeof(x->x_namelen)); + /* + * reclen is sizeof(struct gfs2_xdirent) + x_namelen. + * see struct gfs2_xdirent for more info + */ + x = (void *)x->x_name + namelen; + } + return 0; +} + +static int gfs2_xrdir_collect_xstat(struct gfs2_xrdir_ctx *xc) +{ + int i; + struct kstat st; + + for (i = 0; i < xc->xc_count; i++) { + struct gfs2_xdirent *x_vb_p = xc->xc_vb_dptrs[i]; + struct gfs2_inode *ip; + + vp_read(&xc->xc_dirents, &ip, &x_vb_p->x_ip, sizeof(struct gfs2_inode *)); + gfs2_getattr_i(ip, &st); + + vp_write(&xc->xc_dirents, &x_vb_p->x_kstat, &st, sizeof(struct kstat)); + vp_write(&xc->xc_dirents, &x_vb_p->x_eablk, &ip->i_eattr, + sizeof(x_vb_p->x_eablk)); + } + return 0; +} + +static inline int xattr_requested(char type, unsigned int mask) +{ + if ((type == GFS2_EATYPE_USR) && (mask & XSTAT_XATTR_USER)) + return 1; + if ((type == GFS2_EATYPE_SYS) && (mask & XSTAT_XATTR_SYSTEM)) + return 1; + if ((type == GFS2_EATYPE_SECURITY) && (mask & XSTAT_XATTR_SECURITY)) + return 1; + return 0; +} + +static int gfs2_xrdir_xattr_list_i(struct gfs2_inode *ip, + struct buffer_head *bh, + struct gfs2_ea_header *ea, + struct gfs2_ea_header *prev, void *private) +{ + struct gfs2_xdir_ctx_bndle *bundle = private; + struct gfs2_xrdir_ctx *xc = bundle->xcb_xc; + struct gfs2_xdirent *x = bundle->xcb_xd; + struct gfs2_xd_xattr *xtr; + char prefix[9]; + unsigned int l = 0, xtr_count, namlen, reclen; + void *p; + + if (!xattr_requested(ea->ea_type, xc->xc_xattr_mask)) + return 0; + + if (ea->ea_type == GFS2_EATYPE_UNUSED) + return 0; + + l = ea_prefix(ea, prefix, 9); + BUG_ON(l == 0); + + xtr = vp_get_top(&xc->xc_xattr_keys); + /* + * Only certain vp_XXX ops can trip -ENOMEM where we might be extending + * the vbuf. We ignore the error code of other ops. 
+ */ + if (vp_memset(&xc->xc_xattr_keys, xtr, 0, + sizeof(struct gfs2_xd_xattr)) == -ENOMEM) + goto set_oom; + + /* if mask says don't do values, skip the following lines */ + if (GFS2_EA_DATA_LEN(ea) > 0 && (xc->xc_xattr_mask & XSTAT_XATTR_VALUES)) { + void *valptr = vp_get_top(&xc->xc_xattr_values); + unsigned long len = GFS2_EA_DATA_LEN(ea); + + vp_write(&xc->xc_xattr_keys, &xtr->xa_value_len, + &len, sizeof(xtr->xa_value_len)); + vp_write(&xc->xc_xattr_keys, &xtr->xa_vb_value_ptr, &valptr, + sizeof(void*)); + vp_read(&xc->xc_xattr_keys, &p, &xtr->xa_vb_value_ptr, + sizeof(void*)); + if (vp_append(&xc->xc_xattr_values, GFS2_EA2DATA(ea), len) + == -ENOMEM) + goto set_oom; + } + + namlen = l + ea->ea_name_len; + vp_write(&xc->xc_xattr_keys, &xtr->xa_keylen, &namlen, + sizeof(xtr->xa_keylen)); + if (vp_write(&xc->xc_xattr_keys, xtr->xa_keyname, &prefix, l) == -ENOMEM) + goto set_oom; + if (vp_write(&xc->xc_xattr_keys, xtr->xa_keyname + l, + GFS2_EA2NAME(ea), namlen) == -ENOMEM) + goto set_oom; + + /* gfs2_xd_xattr.xa_keyname[1] has an extra byte */ + reclen = (xtr->xa_keyname + l + namlen) - (char *)xtr; + vp_write(&xc->xc_xattr_keys, &xtr->xa_reclen, &reclen, + sizeof(xtr->xa_reclen)); + + vp_read(&xc->xc_dirents, &xtr_count, &x->x_xattr_count, + sizeof(x->x_xattr_count)); + xtr_count++; + vp_write(&xc->xc_dirents, &x->x_xattr_count, &xtr_count, + sizeof(x->x_xattr_count)); + + return 0; +set_oom: + xc->xc_flags |= XC_FL_ERROR_OOM; + return -ENOMEM; +} + +int gfs2_xrdir_collect_xattrs(struct gfs2_xrdir_ctx *xc) +{ + int error = 0, i; + + for (i = 0; i < xc->xc_count; i++) { + struct gfs2_xdirent *xtop, *x_vb_p = xc->xc_vb_dptrs[i]; + struct gfs2_inode *ip; + struct gfs2_xdir_ctx_bndle bundle; + u8 valid = 1; + + vp_read(&xc->xc_dirents, &ip, &x_vb_p->x_ip, + sizeof(struct gfs2_inode *)); + + if (!ip->i_eattr || !(xc->xc_xattr_mask & XSTAT_XATTR_ALL)) + goto mark_valid; + + bundle.xcb_xc = xc; + bundle.xcb_xd = x_vb_p; + + xtop = vp_get_top(&xc->xc_xattr_keys); + vp_write(&xc->xc_dirents, &x_vb_p->x_vb_xattr_arr_ptr, &xtop, + sizeof(struct gfs2_xd_xattr*)); + + error = ea_foreach(ip, gfs2_xrdir_xattr_list_i, &bundle); + if (error) + break; + mark_valid: + /* Read the xattrs for this dent, so mark it as valid */ + vp_write(&xc->xc_dirents, &x_vb_p->x_valid, &valid, + sizeof(x_vb_p->x_valid)); + xc->xc_dent_valid++; + } + return error; +} + +static int gfs2_xrdir_collect_extra_info(struct gfs2_xrdir_ctx *xc, + struct gfs2_inode *dip) +{ + int error = -ENOMEM, i; + struct gfs2_holder *ghs; + + /* First sort the dents according to inode blk order for stat */ + ctx_sort(xc, xc->xc_vb_dptrs, xc->xc_count, sizeof(void *), + ctx_compare_dent_iblks, NULL); + + /* Lookup all the inodes for stat info */ + for (i = 0; i < xc->xc_count; i++) { + struct gfs2_xdirent *x_vb_p = xc->xc_vb_dptrs[i]; + u64 ino; + struct inode *inode; + struct gfs2_inode *ip, *nullptr = NULL; + + vp_read(&xc->xc_dirents, &ino, &x_vb_p->x_ino, + sizeof(x_vb_p->x_ino)); + + inode = gfs2_lookup_by_inum(GFS2_SB(&dip->i_inode), ino, NULL, + GFS2_BLKST_DINODE); + if (IS_ERR(inode)) { + vp_write(&xc->xc_dirents, &ip, &nullptr, + sizeof(struct gfs2_inode *)); + error = -1; + goto iput_iarr; + } + ip = GFS2_I(inode); + vp_write(&xc->xc_dirents, &x_vb_p->x_ip, &ip, + sizeof(struct gfs2_inode *)); + } + + /* lock all inodes */ + ghs = kcalloc(xc->xc_count, sizeof(struct gfs2_holder), GFP_NOFS); + if (!ghs) + goto iput_iarr; + for (i = 0; i < xc->xc_count; i++) { + struct gfs2_xdirent *x_vb_p = xc->xc_vb_dptrs[i]; + struct gfs2_inode 
*ip; + + vp_read(&xc->xc_dirents, &ip, &x_vb_p->x_ip, + sizeof(struct gfs2_inode *)); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, ghs + i); + } + + error = gfs2_glock_nq_m(xc->xc_count, ghs); + if (error) + goto free_ghs; + + if (gfs2_xrdir_collect_xstat(xc)) + goto free_ghs; + + /* Sort the dents according to eattr blk order */ + ctx_sort(xc, xc->xc_vb_dptrs, xc->xc_count, sizeof(void *), + ctx_compare_dent_eablks, NULL); + + error = gfs2_xrdir_collect_xattrs(xc); + + for (i = 0; i < xc->xc_count; i++) + gfs2_glock_dq_uninit(&ghs[i]); +free_ghs: + kfree(ghs); +iput_iarr: + for (i = 0; i < xc->xc_count; i++) { + struct gfs2_xdirent *x_vb_p = xc->xc_vb_dptrs[i]; + struct gfs2_inode *ip; + + vp_read(&xc->xc_dirents, &ip, &x_vb_p->x_ip, + sizeof(struct gfs2_inode *)); + if (ip) + iput(&ip->i_inode); + } + /* Sort the pointers back to dent order */ + ctx_sort(xc, xc->xc_vb_dptrs, xc->xc_count, sizeof(void *), + ctx_compare_dents, NULL); + + if (error == -ENOMEM) { + /* + * If at least one dent has been collected in full, + * void -ENOMEM + * We shuffled the order of dents multiple times while + * retrieving stat and xattrs, so we have to ensure that + * at least the first dent in the final ordering is valid + * in order to be able to return at least 1 entry. This + * is because we need to preserve the order (hash order) + * when we return the dents to the user. XXX: OR DO WE?? + */ + struct gfs2_xdirent *x_vb_p = xc->xc_vb_dptrs[0]; + u8 valid; + vp_read(&xc->xc_dirents, &valid, &x_vb_p->x_valid, + sizeof(x_vb_p->x_valid)); + + if (valid) + error = 0; + else { + u32 hash; + vp_read(&xc->xc_dirents, &hash, &x_vb_p->x_hash, + sizeof(hash)); + xc->xc_offset = gfs2_disk_hash2offset(hash); + } + } + if (!error) + xc->xc_flags |= XC_FL_DATA_AVAIL; + + return error; +} + +static int gfs2_xrdir_to_user_xattrs(struct gfs2_xrdir_ctx *xc, + struct gfs2_xdirent *x, + struct gfs2_xd_xattr *xdx_vb_p, + struct xdirent_xattr __user *xx, + size_t count, size_t *bytes, char *tempbuf) +{ + struct gfs2_xd_xattr xdx; + int attrcount = 0, error = -EINVAL; + + while (attrcount < x->x_xattr_count) { + vp_read(&xc->xc_xattr_keys, &xdx, xdx_vb_p, + sizeof(struct gfs2_xd_xattr)); + + if ((count - *bytes) < + (sizeof(struct xdirent_xattr) + + xdx.xa_keylen + xdx.xa_value_len)) { + error = -EOVERFLOW; + goto out; + } + + if (__put_user(xdx.xa_value_len, &xx->xa_value_len)) + goto out; + + vp_read(&xc->xc_xattr_keys, tempbuf, xdx_vb_p->xa_keyname, + xdx.xa_keylen); + + if (copy_to_user(xx->xa_name_val, tempbuf, xdx.xa_keylen)) + goto out; + if (__put_user(0, xx->xa_name_val + xdx.xa_keylen)) + goto out; + + if ((xc->xc_xattr_mask & XSTAT_XATTR_VALUES) && + xdx.xa_vb_value_ptr) { + vp_read(&xc->xc_xattr_values, tempbuf, xdx.xa_vb_value_ptr, + xdx.xa_value_len); + + if (copy_to_user(xx->xa_name_val + xdx.xa_keylen + 1, tempbuf, + xdx.xa_value_len)) + goto out; + } + + xx = (struct xdirent_xattr __user *) + ((char *)xx + sizeof(xx->xa_value_len) + + xdx.xa_keylen + 1 + xdx.xa_value_len); + xdx_vb_p = (void*) xdx_vb_p + xdx.xa_reclen; + + *bytes += sizeof(struct xdirent_xattr) + xdx.xa_keylen + + xdx.xa_value_len; + attrcount++; + } + error = 0; +out: + return error; +} + +static int gfs2_xrdir_to_user_vars(struct gfs2_xrdir_ctx *xc, + struct gfs2_xdirent *x, + struct gfs2_xdirent *x_vb_p, + struct linux_xdirent __user *lxd, + size_t count, size_t *bytes) +{ + int error = -EINVAL; + char *tempbuf = NULL; + struct xdirent_blob __user *xblob; + struct xdirent_xattr __user *xx; + struct gfs2_xd_xattr *xdx_vb_p; + + 
tempbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!tempbuf) { + error = -ENOMEM; + goto out; + } + + xblob = &lxd->xd_blob; + + /* copy all the variable length fields */ + if ((count - *bytes) < x->x_namelen) { + error = -EOVERFLOW; + goto free; + } + + vp_read(&xc->xc_dirents, tempbuf, x_vb_p->x_name, x->x_namelen); + + if (copy_to_user(xblob->xb_blob, tempbuf, x->x_namelen)) + goto free; + if (__put_user(0, xblob->xb_blob + x->x_namelen)) + goto free; + + *bytes += x->x_namelen; + error = 0; + + if ((xc->xc_xattr_mask & XSTAT_XATTR_ALL) && + lxd->xd_blob.xb_xattr_count) { + xx = (struct xdirent_xattr __user *) + (xblob->xb_blob + x->x_namelen + 1); + xdx_vb_p = x->x_vb_xattr_arr_ptr; + + error = gfs2_xrdir_to_user_xattrs(xc, x, xdx_vb_p, xx, + count, bytes, tempbuf); + } +free: + kfree(tempbuf); +out: + return error; +} + +static int gfs2_xrdir_to_user_fixed(struct gfs2_xrdir_ctx *xc, + struct gfs2_xdirent *x, + struct gfs2_xdirent *x_vb_p, + struct linux_xdirent __user *lxd, + size_t count, size_t *bytes) +{ + struct xdirent_blob __user *xblob; + int error = -EINVAL; + + vp_read(&xc->xc_dirents, x, x_vb_p, sizeof(struct gfs2_xdirent)); + + if ((count - *bytes) < sizeof(struct linux_xdirent)) { + error = -EOVERFLOW; + goto out; + } + + if (__put_user(x->x_ino, &lxd->xd_ino)) + goto out; + if (__put_user(x->x_type, &lxd->xd_type)) + goto out; + if (__put_user(0, &lxd->xd_off)) + goto out; + + error = xstat_set_result(&x->x_kstat, &lxd->xd_stat); + if (error) + goto out; + + xblob = &lxd->xd_blob; + + error = -EINVAL; + if (__put_user(x->x_xattr_count, &xblob->xb_xattr_count)) + goto out; + + /* copied all the fixed size fields */ + *bytes += sizeof(struct linux_xdirent); + error = 0; +out: + return error; +} + +static size_t gfs2_xrdir_to_user(struct gfs2_xrdir_ctx *xc, void __user *buf, + size_t count) +{ + size_t error = -EINVAL, bytes = 0, bytes_bef = 0; + int i, skip = 1, written = 0; + struct gfs2_xdirent x, *x_vb_p; + struct linux_xdirent __user *lxd = buf; + u8 valid; + + if (!(xc->xc_flags & XC_FL_DATA_AVAIL)) + goto out; + + for (i = 0; i < xc->xc_count; i++) { + u32 hash; + x_vb_p = xc->xc_vb_dptrs[i]; + vp_read(&xc->xc_dirents, &hash, &x_vb_p->x_hash, sizeof(hash)); + + if (skip && xc->xc_vb_dptrs[i] != xc->xc_next_dent) + continue; + skip = 0; + vp_read(&xc->xc_dirents, &valid, &x_vb_p->x_valid, + sizeof(x_vb_p->x_valid)); + if (!valid) + break; + + /* This will fill up x from x_vb_p and subsequently lxd from x */ + error = gfs2_xrdir_to_user_fixed(xc, &x, x_vb_p, lxd, count, + &bytes); + if (error) { + if (error == -EOVERFLOW) + goto overflow; + goto out; + } + + error = gfs2_xrdir_to_user_vars(xc, &x, x_vb_p, lxd, count, + &bytes); + if (error) { + u64 ino; + vp_read(&xc->xc_dirents, &ino, &x_vb_p->x_ino, sizeof(ino)); + if (error == -EOVERFLOW) + goto overflow; + goto out; + } + + if (__put_user(bytes - bytes_bef, &lxd->xd_reclen)) + goto out; + + lxd = (void *)lxd + (bytes - bytes_bef); + xc->xc_next_dent = xc->xc_vb_dptrs[i+1]; + written++; + bytes_bef = bytes; + } +overflow: + if (written) { + if (!valid) { + u32 hash; + x_vb_p = xc->xc_vb_dptrs[i]; + vp_read(&xc->xc_dirents, &hash, &x_vb_p->x_hash, + sizeof(hash)); + /* + * Some of the entries we collected were incomplete, + * so we only wrote the ones that were complete. For + * next time, we'll only try to collect half the + * number of entries. 
This will also invalidate the + * assumption that we'll encounter hash-colliding + * entries in the next pass + */ + xc->xc_offset = gfs2_disk_hash2offset(hash); + xc->xc_flags &= ~(XC_FL_GATHER_PART_INT | + XC_FL_DATA_AVAIL | + XC_FL_HASH_COLL | + XC_FL_HASH_COLL_NXT); + xc->xc_hash_coll_off = 0; + xc->xc_dent_cap = DIV_ROUND_UP(xc->xc_count, 2); + } else { + /* + * If we didn't overflow the user buffer, we + * have written out all the collected dents to + * the user buffer + */ + if (error != -EOVERFLOW) { + xc->xc_flags &= ~(XC_FL_GATHER_PART_INT | + XC_FL_DATA_AVAIL); + xc->xc_dent_cap = 0; + if (!(xc->xc_flags & XC_FL_HASH_COLL)) + xc->xc_offset++; + } + } + } + if (!written && !skip) { + error = -EOVERFLOW; + goto out; + } + error = bytes_bef; +out: + return error; +} + +/** + * gfs2_xreaddir - GFS2's implementation of xreaddir functionality + * @file : The directory to xreaddir + * @flags : flags used by xstat + * @mask : field mask for xstat and xattrs + * @buf : User buffer to fill data into + * @count : Size of the user buffer in bytes + * + * Collect extended information (xstat, xattrs) about the dents in the + * given directory and fill them into the user buf passed in. + * + * Returns: 0 if successful. + * -EAGAIN if the user should retry. + * -ve values for other errors + */ + +size_t gfs2_xreaddir(struct file *file, unsigned int flags, unsigned int mask, + void __user *buf, size_t count) +{ + struct gfs2_xrdir_ctx *xc = ((struct gfs2_file *) + file->private_data)->f_xrctx; + size_t error = 0; + struct inode *dir = file->f_mapping->host; + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_holder d_gh; + + if (xc->xc_flags & XC_FL_DATA_AVAIL) { + error = gfs2_xrdir_to_user(xc, buf, count); + file->f_pos = xc->xc_offset; + return error; + } + + error = gfs2_xrdir_ctx_setup(file, xc, flags, mask); + if (error) + goto out; + + gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + error = gfs2_glock_nq(&d_gh); + if (error) { + gfs2_holder_uninit(&d_gh); + goto out; + } + + xc->xc_flags &= ~XC_FL_HASH_COLL; + error = gfs2_dir_read(dir, NULL, xc, &file->f_ra); + if (error) { + if (xc->xc_flags & XC_FL_ERROR_OOM) + error = gfs2_xrdir_handle_oom(xc); + goto uninit; + } + + if (xc->xc_count == 0) + goto uninit; + + if (!(xc->xc_flags & XC_FL_GATHER_PARTS)) + xc->xc_flags |= XC_FL_GATHER_FULL; + else if (!(xc->xc_flags & XC_FL_GATHER_PART_INT)) + xc->xc_flags |= XC_FL_GATHER_PART_END; + + error = gfs2_xrdir_create_dptrs(xc); + if (error) { + if (error == -ENOMEM) + error = gfs2_xrdir_handle_oom(xc); + goto uninit; + } + + error = gfs2_xrdir_collect_extra_info(xc, dip); + if (error) { + if (error == -ENOMEM) + error = gfs2_xrdir_handle_oom(xc); + goto uninit; + } + + xc->xc_next_dent = xc->xc_vb_dptrs[0]; + error = gfs2_xrdir_to_user(xc, buf, count); + + file->f_pos = xc->xc_offset; +uninit: + if (xc->xc_flags & XC_FL_HASH_COLL && !(xc->xc_flags & XC_FL_DATA_AVAIL)) + xc->xc_flags &= ~XC_FL_HASH_COLL; + + gfs2_glock_dq_uninit(&d_gh); +out: + return error; +} diff --git a/fs/gfs2/xreaddir.h b/fs/gfs2/xreaddir.h new file mode 100644 index 0000000..ea6c82c --- /dev/null +++ b/fs/gfs2/xreaddir.h @@ -0,0 +1,84 @@ +#ifndef __XREADDIR_H__ +#define __XREADDIR_H__ + +struct gfs2_xd_xattr { + unsigned int xa_reclen; + void *xa_vb_value_ptr; + unsigned long xa_value_len; + unsigned int xa_keylen; + char __pad[7]; + char xa_keyname[1]; +}; + +struct gfs2_xdirent { + u32 x_hash; + u8 x_valid; + struct gfs2_inode *x_ip; + u64 x_ino; + u64 x_eablk; + char x_type; + struct kstat x_kstat; + unsigned 
int x_xattr_count;
+	void *x_vb_xattr_arr_ptr;
+	unsigned int x_namelen;
+	char x_name[1];
+};
+
+#define XC_FL_ALLOCATED		0x00000001
+#define XC_FL_GATHER_FULL	0x00000002
+#define XC_FL_GATHER_PARTS	0x00000004
+#define XC_FL_GATHER_PART_INT	0x00000008
+#define XC_FL_GATHER_PART_END	0x00000010
+#define XC_FL_HASH_COLL		0x00000020
+#define XC_FL_HASH_COLL_NXT	0x00000040
+#define XC_FL_ERROR_OOM		0x00000080
+#define XC_FL_ERROR		0x00000100
+#define XC_FL_DATA_AVAIL	0x00000200
+#define XC_FL_PRINTOK		0x10000000
+
+/*
+ * readdir ctx
+ */
+struct gfs2_xrdir_ctx {
+	u32 xc_flags;			/* XC_FL_XXXX */
+	u64 xc_dent_memcap;		/* mem limit per collect */
+	u32 xc_dent_cap;		/* # dent limit per collect */
+	u32 xc_dent_valid;		/* # valid dents collected */
+	u32 xc_xattr_mask;		/* XSTAT_XATTR_XXX see stat.h */
+	u32 xc_xst_flags;		/* XSTAT_XXX see stat.h */
+	loff_t xc_offset;		/* offset of next dent */
+	unsigned long xc_count;		/* # dents collected */
+	loff_t xc_hash_coll_off;	/* last hash collision offset */
+	void *xc_next_dent;		/* next dent to write out */
+	void **xc_vb_dptrs;		/* ptrs to dents in xc_dirents */
+	struct vbuf xc_dirents;		/* temp storage for dents */
+	struct vbuf xc_xattr_keys;	/* xattr keys for dents */
+	struct vbuf xc_xattr_values;	/* corresponding values */
+};
+
+/*
+ * Ugly struct to blob together these two
+ * structs. Only used in one place to
+ * retrieve extended attributes.
+ * This is so that we don't have to change
+ * the prototypes of all the existing
+ * xattr handling functions to accept an
+ * extra arg.
+ */
+struct gfs2_xdir_ctx_bndle {
+	struct gfs2_xrdir_ctx *xcb_xc;
+	struct gfs2_xdirent *xcb_xd;
+};
+
+extern size_t gfs2_xreaddir(struct file *file, unsigned int flags,
+			    unsigned int mask, void __user *buf,
+			    size_t count);
+extern int gfs2_xrdir_collect_dents(const struct gfs2_dirent *dent, loff_t off,
+				    struct gfs2_xrdir_ctx *xc);
+extern void gfs2_xrdir_partial_collect(struct gfs2_xrdir_ctx *xc);
+extern int gfs2_xrdir_collect_xattrs(struct gfs2_xrdir_ctx *xc);
+
+extern int gfs2_xrdir_ctx_init(struct gfs2_file *fp, struct gfs2_sbd *sdp);
+extern void gfs2_xrdir_ctx_uninit(struct gfs2_file *fp);
+
+#endif /* __XREADDIR_H__ */
-- 
1.8.1.4
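
For readers following along without the VFS half of the series, here is a minimal userspace sketch of how a caller might walk the buffer that gfs2_xrdir_to_user() fills. The linux_xdirent, xdirent_blob and xdirent_xattr definitions and the xgetdents() wrapper are not part of this patch, so the structures below are assumptions inferred from gfs2_xrdir_to_user_fixed() and gfs2_xrdir_to_user_vars(): a fixed-size header, then the NUL-terminated entry name, then xb_xattr_count xattr records, with xd_reclen giving the distance to the next entry. Treat it as an illustration of the container layout, not the authoritative ABI.

/*
 * Hypothetical userspace view of one xreaddir entry; the real definitions
 * live in the VFS/syscall patches of this series.
 */
#include <stdio.h>
#include <stdint.h>
#include <sys/stat.h>
#include <sys/types.h>

struct xdirent_xattr {
	unsigned long xa_value_len;
	char xa_name_val[1];	/* NUL-terminated key, then value bytes */
};

struct xdirent_blob {
	unsigned int xb_xattr_count;
	char xb_blob[1];	/* NUL-terminated name, then xattr records */
};

struct linux_xdirent {
	uint64_t xd_ino;
	char xd_type;
	uint64_t xd_off;
	uint64_t xd_reclen;	/* assumed: bytes from this entry to the next */
	struct stat xd_stat;	/* stand-in for the xstat result structure */
	struct xdirent_blob xd_blob;
};

/* Walk a buffer filled by a hypothetical xgetdents() wrapper (not shown). */
void walk_xdirents(const char *buf, ssize_t bytes)
{
	ssize_t pos = 0;

	while (pos < bytes) {
		const struct linux_xdirent *xd =
			(const struct linux_xdirent *)(buf + pos);

		printf("ino %llu  name %s  xattrs %u\n",
		       (unsigned long long)xd->xd_ino,
		       xd->xd_blob.xb_blob,
		       xd->xd_blob.xb_xattr_count);
		pos += xd->xd_reclen;
	}
}

Per the gfs2_xreaddir() comment above, a caller would retry on -EAGAIN and keep issuing calls until no further entries are returned.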