Introduce white-out handling in the VFS. Signed-off-by: Jan Blunck <jblunck@xxxxxxx> --- fs/inode.c | 22 ++ fs/namei.c | 417 +++++++++++++++++++++++++++++++++++++++++++++++++++-- fs/readdir.c | 6 include/linux/fs.h | 7 4 files changed, 441 insertions(+), 11 deletions(-) --- a/fs/inode.c +++ b/fs/inode.c @@ -1410,6 +1410,26 @@ void __init inode_init(unsigned long mem INIT_HLIST_HEAD(&inode_hashtable[loop]); } +/* + * Dummy default file-operations: + * Never open a whiteout. This is always a bug. + */ +static int whiteout_no_open(struct inode *irrelevant, struct file *dontcare) +{ + printk("WARNING: at %s:%d %s(): Attempted to open a whiteout!\n", + __FILE__, __LINE__, __FUNCTION__); + /* + * Nobody should ever be able to open a whiteout. On the other hand + * this isn't fatal so lets just print a warning message. + */ + WARN_ON(1); + return -ENXIO; +} + +static struct file_operations def_wht_fops = { + .open = whiteout_no_open, +}; + void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) { inode->i_mode = mode; @@ -1423,6 +1443,8 @@ void init_special_inode(struct inode *in inode->i_fop = &def_fifo_fops; else if (S_ISSOCK(mode)) inode->i_fop = &bad_sock_fops; + else if (S_ISWHT(mode)) + inode->i_fop = &def_wht_fops; else printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n", mode); --- a/fs/namei.c +++ b/fs/namei.c @@ -887,7 +887,7 @@ static fastcall int __link_path_walk(con err = -ENOENT; inode = next.dentry->d_inode; - if (!inode) + if (!inode || S_ISWHT(inode->i_mode)) goto out_dput; err = -ENOTDIR; if (!inode->i_op) @@ -951,6 +951,8 @@ last_component: err = -ENOENT; if (!inode) break; + if (S_ISWHT(inode->i_mode)) + break; if (lookup_flags & LOOKUP_DIRECTORY) { err = -ENOTDIR; if (!inode->i_op || !inode->i_op->lookup) @@ -1434,13 +1436,10 @@ static inline int check_sticky(struct in * 10. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). 
*/ -static int may_delete(struct inode *dir,struct dentry *victim,int isdir) +static int __may_delete(struct inode *dir, struct dentry *victim, int isdir) { int error; - if (!victim->d_inode) - return -ENOENT; - BUG_ON(victim->d_parent->d_inode != dir); audit_inode_child(victim->d_name.name, victim->d_inode, dir); @@ -1466,6 +1465,14 @@ static int may_delete(struct inode *dir, return 0; } +static int may_delete(struct inode *dir, struct dentry *victim, int isdir) +{ + if (!victim->d_inode || S_ISWHT(victim->d_inode->i_mode)) + return -ENOENT; + + return __may_delete(dir, victim, isdir); +} + /* Check whether we can create an object with dentry child in directory * dir. * 1. We can't do it if child already exists (open has special treatment for @@ -1477,7 +1484,7 @@ static int may_delete(struct inode *dir, static inline int may_create(struct inode *dir, struct dentry *child, struct nameidata *nd) { - if (child->d_inode) + if (child->d_inode && !S_ISWHT(child->d_inode->i_mode)) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; @@ -1559,6 +1566,13 @@ int vfs_create(struct inode *dir, struct error = security_inode_create(dir, dentry, mode); if (error) return error; + + if (dentry->d_inode && S_ISWHT(dentry->d_inode->i_mode)) { + error = vfs_unlink_whiteout(dir, dentry); + if (error) + return error; + } + DQUOT_INIT(dir); error = dir->i_op->create(dir, dentry, mode, nd); if (!error) @@ -1741,7 +1755,7 @@ do_last: } /* Negative dentry, just create the file */ - if (!path.dentry->d_inode) { + if (!path.dentry->d_inode || S_ISWHT(path.dentry->d_inode->i_mode)) { error = open_namei_create(nd, &path, flag, mode); if (error) goto exit; @@ -1903,6 +1917,12 @@ int vfs_mknod(struct inode *dir, struct if (error) return error; + if (dentry->d_inode && S_ISWHT(dentry->d_inode->i_mode)) { + error = vfs_unlink_whiteout(dir, dentry); + if (error) + return error; + } + DQUOT_INIT(dir); error = dir->i_op->mknod(dir, dentry, mode, dev); if (!error) @@ -1969,6 +1989,7 @@ asmlinkage 
long sys_mknod(const char __u int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) { int error = may_create(dir, dentry, NULL); + int opaque = 0; if (error) return error; @@ -1981,10 +2002,20 @@ int vfs_mkdir(struct inode *dir, struct if (error) return error; + if (dentry->d_inode && S_ISWHT(dentry->d_inode->i_mode)) { + error = vfs_unlink_whiteout(dir, dentry); + if (error) + return error; + opaque = 1; + } + DQUOT_INIT(dir); error = dir->i_op->mkdir(dir, dentry, mode); - if (!error) + if (!error) { fsnotify_mkdir(dir, dentry); + if (opaque) + dentry->d_inode->i_flags |= S_OPAQUE; + } return error; } @@ -2025,6 +2056,360 @@ asmlinkage long sys_mkdir(const char __u return sys_mkdirat(AT_FDCWD, pathname, mode); } +static int filldir_is_empty(void *__buf, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + int *is_empty = (int *)__buf; + + switch (namlen) { + case 2: + if (name[1] != '.') + break; + case 1: + if (name[0] != '.') + break; + return 0; + } + + if (d_type == DT_WHT) + return 0; + + (*is_empty) = 0; + return 0; +} + +static int directory_is_empty(struct dentry *dentry, struct vfsmount *mnt) +{ + struct file *file; + int err; + int is_empty = 1; + + BUG_ON(!S_ISDIR(dentry->d_inode->i_mode)); + + /* references for the file pointer */ + dget(dentry); + mntget(mnt); + + file = dentry_open(dentry, mnt, O_RDONLY); + if (IS_ERR(file)) + return 0; + + err = vfs_readdir(file, filldir_is_empty, &is_empty); + + fput(file); + return is_empty; +} + +/* + * We try to whiteout a dentry. dir is the parent of the whiteout. + * Whiteouts can be vfs_unlink'ed. 
+ */ +int vfs_whiteout(struct inode *dir, struct dentry *dentry) +{ + int err; + + BUG_ON(dentry->d_parent->d_inode != dir); + + /* from may_create() */ + if (dentry->d_inode) + return -EEXIST; + if (IS_DEADDIR(dir)) + return -ENOENT; + err = permission(dir, MAY_WRITE | MAY_EXEC, NULL); + if (err) + return err; + + /* from may_delete() */ + if (IS_APPEND(dir)) + return -EPERM; + /* We don't call check_sticky() here because d_inode == NULL */ + + if (!dir->i_op || !dir->i_op->whiteout) + return -EOPNOTSUPP; + + err = dir->i_op->whiteout(dir, dentry); + /* Ignore quota and fsnotify */ + return err; +} + +/* Checks on the victim for whiteout */ +static inline int may_whiteout(struct dentry *victim, int isdir) +{ + if (!victim->d_inode || S_ISWHT(victim->d_inode->i_mode)) + return -ENOENT; + if (IS_APPEND(victim->d_inode) || IS_IMMUTABLE(victim->d_inode)) + return -EPERM; + if (isdir) { + if (!S_ISDIR(victim->d_inode->i_mode)) + return -ENOTDIR; + if (IS_ROOT(victim)) + return -EBUSY; + } else if (S_ISDIR(victim->d_inode->i_mode)) + return -EISDIR; + if (victim->d_flags & DCACHE_NFSFS_RENAMED) + return -EBUSY; + return 0; +} + +/* + * do_whiteout - whiteout a dentry, either when removing or renaming + * @dentry: the dentry to whiteout + * + * This is called by the VFS when removing or renaming files on a union mount. + * Must be called with nd->dentry->d_inode->i_mutex locked.
+ */ +static int do_whiteout(struct nameidata *nd, struct path *path, int isdir) +{ + struct path safe = { .dentry = dget(nd->dentry), + .mnt = mntget(nd->mnt) }; + struct dentry *dentry = path->dentry; + struct qstr name; + int err; + + err = may_whiteout(dentry, isdir); + if (err) + goto out; + + err = -ENOTEMPTY; + if (isdir && !directory_is_empty(path->dentry, path->mnt)) + goto out; + + /* save the name for a later lookup */ + err = -ENOMEM; + name.name = kmalloc(dentry->d_name.len, GFP_KERNEL); + if (!name.name) + goto out; + strncpy((char *)name.name, dentry->d_name.name, dentry->d_name.len); + name.len = dentry->d_name.len; + name.hash = dentry->d_name.hash; + + /* + * If the dentry to whiteout is on the topmost layer of + * the union stack we must get rid of it first before + * creating the whiteout. + */ + if (dentry->d_parent == nd->dentry) { + struct inode *dir = nd->dentry->d_inode; + + if (isdir) + err = vfs_rmdir(dir, dentry); + else + err = vfs_unlink(dir, dentry); + if (err) + goto out_freename; + } + + /* + * Relookup the dentry to whiteout now. We should find a fresh negative + * dentry by this time. + */ + dentry = __lookup_hash_kern(&name, nd->dentry, nd); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_freename; + + dput(path->dentry); + if (path->mnt != safe.mnt) + mntput(path->mnt); + path->mnt = nd->mnt; + path->dentry = dentry; + + err = vfs_whiteout(nd->dentry->d_inode, dentry); +out_freename: + kfree(name.name); +out: + pathput(&safe); + return err; +} + +/* + * vfs_unlink_whiteout - Unlink a single whiteout from the system + * @dir: parent directory + * @dentry: the whiteout itself + * + * This is for unlinking a single whiteout. Don't use vfs_unlink() because we + * don't want any notification stuff etc. but basically it is the same stuff.
+ */ +int vfs_unlink_whiteout(struct inode *dir, struct dentry *dentry) +{ + int error; + + if (!dentry->d_inode) + return -ENOENT; + + error = __may_delete(dir, dentry, 0); + if (error) + return error; + + if (!dir->i_op || !dir->i_op->unlink) + return -EPERM; + + DQUOT_INIT(dir); + + mutex_lock(&dentry->d_inode->i_mutex); + if (d_mountpoint(dentry)) + error = -EBUSY; + else { + error = security_inode_unlink(dir, dentry); + if (!error) + error = dir->i_op->unlink(dir, dentry); + } + mutex_unlock(&dentry->d_inode->i_mutex); + + /* + * We can call dentry_iput() since nobody could actually do something + * useful with a whiteout. So dropping the reference to the inode + * doesn't make a difference, does it? + * + * It turns the whiteout dentry into a negative dentry ... hmm, couldn't + * this race against if(inode && S_ISWHT(inode->i_mode)) tests??? + */ + if (!error) { + spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); + if (atomic_read(&dentry->d_count) == 1) { + struct inode *inode = dentry->d_inode; + dentry->d_inode = NULL; + list_del_init(&dentry->d_alias); + spin_unlock(&dentry->d_lock); + spin_unlock(&dcache_lock); + if (dentry->d_op && dentry->d_op->d_iput) + dentry->d_op->d_iput(dentry, inode); + else + iput(inode); + } else { + if (!d_unhashed(dentry)) + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&dcache_lock); + printk("WARNING: at %s:%d %s(): couldn't unlink\n", + __FILE__, __LINE__, __FUNCTION__); + dump_stack(); + } + } + return error; +} + +static int __hash_one_len(const char *name, int len, struct qstr *this) +{ + unsigned long hash; + unsigned char c; + + hash = init_name_hash(); + while (len--) { + c = *(const unsigned char *)name++; + if (c == '/' || c == '\0') + return -EINVAL; + hash = partial_name_hash(c, hash); + } + this->hash = end_name_hash(hash); + return 0; +} + +struct unlink_whiteout_dirent { + struct dentry *parent; + struct list_head list; +}; + +static int filldir_unlink_whiteouts(void *buf, const char
*name, int namlen, + loff_t offset, u64 ino, + unsigned int d_type) +{ + struct unlink_whiteout_dirent *dirent = buf; + struct dentry *dentry; + struct qstr this; + int res; + + if (d_type != DT_WHT) + return 0; + + this.name = name; + this.len = namlen; + res = __hash_one_len(name, namlen, &this); + if (res) + return res; + + dentry = __lookup_hash_kern(&this, dirent->parent, NULL); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); + __d_drop(dentry); + if (!list_empty(&dentry->d_lru)) { + list_del(&dentry->d_lru); + dentry_stat.nr_unused--; + } + list_add(&dentry->d_lru, &dirent->list); + spin_unlock(&dentry->d_lock); + spin_unlock(&dcache_lock); + return res; +} + +/* + * do_unlink_whiteouts - remove all whiteouts of an "empty" directory + * @dentry: the directory's dentry + * + * Before removing a directory from the file system, we have to make sure + * that there are no stale whiteouts in it. Therefore we call readdir() with + * a special filldir helper to remove all the whiteouts. + * + * XXX: Don't call any security and permission checks here (If we aren't + * allowed to go here, we shouldn't be here at all). Same with i_mutex, don't + * touch it here.
+ */ +static int do_unlink_whiteouts(struct dentry *dentry) +{ + struct file *file; + struct inode *inode; + struct unlink_whiteout_dirent dirent = + { .list = LIST_HEAD_INIT(dirent.list), + .parent = dentry }; + struct dentry *n; + int res; + + dget(dentry); + + /* + * FIXME: This is bad, because we really don't want to open a new + * file in the kernel but readdir needs a file pointer + */ + file = dentry_open(dentry, NULL, O_RDWR); + if (IS_ERR(file)) { + printk(KERN_ERR "%s: dentry_open failed (%ld)\n", + __FUNCTION__, PTR_ERR(file)); + return PTR_ERR(file); + } + + inode = file->f_path.dentry->d_inode; + + res = -ENOTDIR; + if (!file->f_op || !file->f_op->readdir) + goto out_fput; + + res = -ENOENT; + if (!IS_DEADDIR(inode)) { + res = file->f_op->readdir(file, &dirent, + filldir_unlink_whiteouts); + file_accessed(file); + } + + list_for_each_entry_safe(dentry, n, &dirent.list, d_lru) { + list_del_init(&dentry->d_lru); + res = vfs_unlink_whiteout(inode, dentry); + WARN_ON(res); + dput(dentry); + } + +out_fput: + fput(file); + if (unlikely(res)) + printk(KERN_ERR "%s: readdir failed (%d)\n", + __FUNCTION__, res); + return res; +} + + /* * We try to drop the dentry early: we should have * a usage count of 2 if we're the only user of this @@ -2064,18 +2449,22 @@ int vfs_rmdir(struct inode *dir, struct DQUOT_INIT(dir); - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); dentry_unhash(dentry); if (d_mountpoint(dentry)) error = -EBUSY; else { error = security_inode_rmdir(dir, dentry); if (!error) { + error = do_unlink_whiteouts(dentry); + if (error) + goto out; error = dir->i_op->rmdir(dir, dentry); if (!error) dentry->d_inode->i_flags |= S_DEAD; } } +out: mutex_unlock(&dentry->d_inode->i_mutex); if (!error) { d_delete(dentry); @@ -2243,6 +2632,12 @@ int vfs_symlink(struct inode *dir, struc if (error) return error; + if (dentry->d_inode && S_ISWHT(dentry->d_inode->i_mode)) { + error = vfs_unlink_whiteout(dir, 
dentry); + if (error) + return error; + } + DQUOT_INIT(dir); error = dir->i_op->symlink(dir, dentry, oldname); if (!error) @@ -2296,7 +2691,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *inode = old_dentry->d_inode; int error; - if (!inode) + if (!inode || S_ISWHT(inode->i_mode)) return -ENOENT; error = may_create(dir, new_dentry, NULL); @@ -2570,7 +2965,7 @@ static int do_rename(int olddfd, const c goto exit3; /* source must exist */ error = -ENOENT; - if (!old.dentry->d_inode) + if (!old.dentry->d_inode || S_ISWHT(old.dentry->d_inode->i_mode)) goto exit4; /* unless the source is a directory trailing slashes give -ENOTDIR */ if (!S_ISDIR(old.dentry->d_inode->i_mode)) { --- a/fs/readdir.c +++ b/fs/readdir.c @@ -148,6 +148,9 @@ static int filldir(void * __buf, const c unsigned long d_ino; int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 2, sizeof(long)); + if (d_type == DT_WHT) + return 0; + buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) return -EINVAL; @@ -233,6 +236,9 @@ static int filldir64(void * __buf, const struct getdents_callback64 * buf = (struct getdents_callback64 *) __buf; int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 1, sizeof(u64)); + if (d_type == DT_WHT) + return 0; + buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) return -EINVAL; --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -97,6 +97,7 @@ extern int dir_notify_enable; #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 #define FS_SAFE 8 /* Safe to mount by unprivileged users */ +#define FS_WHT 8192 /* FS supports whiteout filetype */ #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() * during rename() internally. 
@@ -130,6 +131,7 @@ extern int dir_notify_enable; #define MS_NO_LEASES (1<<22) /* fs does not support leases */ #define MS_SETUSER (1<<23) /* set mnt_uid to current user */ #define MS_NOMNT (1<<24) /* don't allow unprivileged submounts */ +#define MS_WHITEOUT (1<<25) /* fs does support white-out filetype */ #define MS_ACTIVE (1<<30) #define MS_NOUSER (1<<31) @@ -156,6 +158,7 @@ extern int dir_notify_enable; #define S_NOCMTIME 128 /* Do not update file c/mtime */ #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */ #define S_PRIVATE 512 /* Inode is fs-internal */ +#define S_OPAQUE 1024 /* Directory is opaque */ /* * Note that nosuid etc flags are inode-specific: setting some file-system @@ -190,6 +193,7 @@ extern int dir_notify_enable; #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE) #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE) #define IS_NO_LEASES(inode) __IS_FLG(inode, MS_NO_LEASES) +#define IS_OPAQUE(inode) ((inode)->i_flags & S_OPAQUE) /* the read-only stuff doesn't really belong here, but any other place is probably as bad and I don't want to create yet another include file. */ @@ -1087,6 +1091,8 @@ extern int vfs_link(struct dentry *, str extern int vfs_rmdir(struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *); extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); +extern int vfs_whiteout(struct inode *, struct dentry *); +extern int vfs_unlink_whiteout(struct inode *, struct dentry *); /* * VFS dentry helper functions. 
@@ -1212,6 +1218,7 @@ struct inode_operations { int (*mkdir) (struct inode *,struct dentry *,int); int (*rmdir) (struct inode *,struct dentry *); int (*mknod) (struct inode *,struct dentry *,int,dev_t); + int (*whiteout) (struct inode *, struct dentry *); int (*rename) (struct inode *, struct dentry *, struct inode *, struct dentry *); int (*readlink) (struct dentry *, char __user *,int); -- - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html