From: Miklos Szeredi <mszeredi@xxxxxxx> This overlay filesystem is a hybrid of entirely filesystem based (unionfs, aufs) and entierly VFS based (union mounts) solutions. The dentry tree is duplicated from the underlying filesystems, this enables fast cached lookups without adding special support into the VFS. This uses slightly more memory than union mounts, but dentries are relatively small. Inode structures are only duplicated for directories. Regular files, symlinks and special files each share a single inode. This means that locking victim for unlink is a quasi-filesystem lock, which is suboptimal, but could be worked around in the VFS. Opening non directories results in the open forwarded to the underlying filesystem. This makes the behavior very similar to union mounts (with the same limitations vs. fchmod/fchown on O_RDONLY file descriptors). Usage: mount -t overlay -olowerdir=/lower,upperdir=/upper overlay /mnt Supported: - all operations Missing: - ensure that filesystems part of the overlay are not modified outside the overlay - optimize directory merging and caching [Neil Brown] - minimal remount support - use correct seek function for directories - initialise is_real before use - rename ovl_fill_cache to ovl_dir_read - add initial revalidate support Signed-off-by: Miklos Szeredi <mszeredi@xxxxxxx> --- fs/Kconfig | 1 fs/Makefile | 1 fs/overlayfs/Kconfig | 4 fs/overlayfs/Makefile | 5 fs/overlayfs/overlayfs.c | 2174 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 2185 insertions(+) Index: linux-2.6/fs/overlayfs/overlayfs.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6/fs/overlayfs/overlayfs.c 2010-10-26 11:29:29.000000000 +0200 @@ -0,0 +1,2174 @@ +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/sched.h> +#include <linux/fs_struct.h> +#include <linux/file.h> +#include <linux/xattr.h> +#include <linux/security.h> +#include <linux/mount.h> +#include <linux/splice.h> +#include <linux/slab.h> +#include <linux/parser.h> +#include <linux/module.h> +#include <linux/uaccess.h> + +MODULE_AUTHOR("Miklos Szeredi <miklos@xxxxxxxxxx>"); +MODULE_DESCRIPTION("Overlay filesystem"); +MODULE_LICENSE("GPL"); + +#define OVL_COPY_UP_CHUNK_SIZE (1 << 20) + +struct ovl_fs { + struct inode *symlink_inode; + struct inode *regular_inode; + struct inode *special_inode; + struct vfsmount *upper_mnt; + struct vfsmount *lower_mnt; +}; + +struct ovl_entry { + struct dentry *__upperdentry; + struct dentry *lowerdentry; + bool opaque; +}; + +static const char *ovl_whiteout_xattr = "trusted.overlay.whiteout"; +static const char *ovl_opaque_xattr = "trusted.overlay.opaque"; +static const char *ovl_whiteout_symlink = "(overlay-whiteout)"; + +enum ovl_path_type { + OVL_PATH_UPPER, + OVL_PATH_MERGE, + OVL_PATH_LOWER, +}; + +static enum ovl_path_type ovl_path_type(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe->__upperdentry) { + if (oe->lowerdentry && S_ISDIR(dentry->d_inode->i_mode)) + return OVL_PATH_MERGE; + else + return OVL_PATH_UPPER; + } else { + return OVL_PATH_LOWER; + } +} + +static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe) +{ + struct dentry *upperdentry = ACCESS_ONCE(oe->__upperdentry); + smp_read_barrier_depends(); + return upperdentry; +} + +static void ovl_path_upper(struct dentry *dentry, struct path *path) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_entry *oe = dentry->d_fsdata; + + path->mnt = ofs->upper_mnt; + path->dentry = ovl_upperdentry_dereference(oe); +} + +static void ovl_path_lower(struct dentry *dentry, struct path *path) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_entry *oe = dentry->d_fsdata; + + path->mnt = ofs->lower_mnt; + path->dentry = oe->lowerdentry; +} + +static enum ovl_path_type ovl_path_real(struct dentry *dentry, + struct path *path) +{ + + enum ovl_path_type type = ovl_path_type(dentry); + + if (type == OVL_PATH_LOWER) + ovl_path_lower(dentry, path); + else + ovl_path_upper(dentry, path); + + return type; +} + +static struct dentry *ovl_dentry_upper(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return ovl_upperdentry_dereference(oe); +} + +static struct dentry *ovl_dentry_lower(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->lowerdentry; +} + +static struct dentry *ovl_dentry_real(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + struct dentry *realdentry; + + realdentry = ovl_upperdentry_dereference(oe); + if (!realdentry) + realdentry = oe->lowerdentry; + + return realdentry; +} + +static struct file *path_open(struct path *path, int flags) +{ + const struct cred *cred = current_cred(); + + path_get(path); + return dentry_open(path->dentry, path->mnt, flags, cred); +} + +static bool ovl_dentry_is_opaque(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + return oe->opaque; +} + +static void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque) +{ + struct ovl_entry *oe = dentry->d_fsdata; + oe->opaque = opaque; +} + +static void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex)); + WARN_ON(oe->__upperdentry); + smp_wmb(); + oe->__upperdentry = upperdentry; +} + +static bool ovl_is_whiteout(struct dentry *dentry) +{ + int res; + char val; + + if (!dentry) + return false; + if (!dentry->d_inode) + return false; + if (!S_ISLNK(dentry->d_inode->i_mode)) + return false; + + res = vfs_getxattr(dentry, ovl_whiteout_xattr, &val, 1); + if (res == 1 && val == 'y') + return true; + + return false; +} + +static bool ovl_is_opaquedir(struct dentry *dentry) +{ + int res; + char val; + + if (!S_ISDIR(dentry->d_inode->i_mode)) + return false; + + res = vfs_getxattr(dentry, ovl_opaque_xattr, &val, 1); + if (res == 1 && val == 'y') + return true; + + return false; +} + +struct ovl_cache_entry { + struct ovl_cache_entry *next; + struct qstr name; + unsigned int type; + u64 ino; + bool is_whiteout; +}; + +struct ovl_readdir_data { + struct ovl_cache_entry *list; + struct ovl_cache_entry **endp; + struct dentry *dir; + int count; + int err; +}; + +struct ovl_dir_file { + bool is_real; + struct ovl_cache_entry *cache; + struct file *realfile; +}; + +static int ovl_cache_add_entry(struct ovl_readdir_data *rdd, + const char *name, int namelen, u64 ino, + unsigned int d_type, bool is_whiteout) +{ + struct ovl_cache_entry *p; + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + p->name.name = kstrndup(name, namelen, GFP_KERNEL); + if (!p->name.name) { + kfree(p); + return -ENOMEM; + } + p->name.len = namelen; + p->name.hash = 0; + p->type = d_type; + p->ino = ino; + p->is_whiteout = is_whiteout; + p->next = NULL; + *rdd->endp = p; + rdd->endp = &p->next; + + return 0; +} + +static void ovl_cache_free(struct ovl_cache_entry *p) +{ + while (p) { + struct ovl_cache_entry *next = p->next; + + kfree(p->name.name); + kfree(p); + p = next; + } +} + +static int ovl_cache_find_entry(struct ovl_cache_entry *start, + const char *name, int namelen) +{ + struct ovl_cache_entry *p; + int ret = 0; + + for (p = start; p; p = p->next) { + if (p->name.len != namelen) + continue; + if (strncmp(p->name.name, name, namelen) == 0) { + ret = 1; + break; + } + } + + return ret; +} + +static int ovl_fill_lower(void *buf, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct ovl_readdir_data *rdd = buf; + + rdd->count++; + if (!ovl_cache_find_entry(rdd->list, name, namlen)) + rdd->err = ovl_cache_add_entry(rdd, name, namlen, ino, d_type, false); + + return rdd->err; +} + +static int ovl_fill_upper(void *buf, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct ovl_readdir_data *rdd = buf; + bool is_whiteout = false; + + rdd->count++; + if (d_type == DT_LNK) { + struct dentry *dentry; + + dentry = lookup_one_len(name, rdd->dir, strlen(name)); + if (IS_ERR(dentry)) { + rdd->err = PTR_ERR(dentry); + goto out; + } + is_whiteout = ovl_is_whiteout(dentry); + dput(dentry); + } + + rdd->err = ovl_cache_add_entry(rdd, name, namlen, ino, d_type, is_whiteout); + +out: + return rdd->err; +} + +static int ovl_dir_read(struct path *realpath, struct ovl_readdir_data *rdd, + filldir_t filler) +{ + const struct cred *old_cred; + struct cred *override_cred; + struct file *realfile; + int err; + + realfile = path_open(realpath, O_RDONLY | O_DIRECTORY); + if (IS_ERR(realfile)) + return PTR_ERR(realfile); + + err = -ENOMEM; + override_cred = prepare_creds(); + if (override_cred) { + /* + * CAP_SYS_ADMIN for getxattr + * CAP_DAC_OVERRIDE for lookup and unlink + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + old_cred = override_creds(override_cred); + + do { + rdd->count = 0; + rdd->err = 0; + err = vfs_readdir(realfile, filler, rdd); + if (err >= 0) + err = rdd->err; + } while (!err && rdd->count); + + revert_creds(old_cred); + put_cred(override_cred); + } + fput(realfile); + + if (err) { + ovl_cache_free(rdd->list); + rdd->list = NULL; + return err; + } + + return 0; +} + +static void ovl_dir_reset(struct file *file) +{ + struct ovl_dir_file *od = file->private_data; + enum ovl_path_type type = ovl_path_type(file->f_path.dentry); + + ovl_cache_free(od->cache); + od->cache = NULL; + WARN_ON(!od->is_real && type != OVL_PATH_MERGE); + if (od->is_real && type == OVL_PATH_MERGE) { + fput(od->realfile); + od->realfile = NULL; + od->is_real = false; + } +} + +static int ovl_readdir(struct file *file, void *buf, filldir_t filler) +{ + struct ovl_dir_file *od = file->private_data; + struct ovl_cache_entry *p; + loff_t off; + int res = 0; + + if (!file->f_pos) + ovl_dir_reset(file); + + if (od->is_real) { + res = vfs_readdir(od->realfile, filler, buf); + file->f_pos = od->realfile->f_pos; + + return res; + } + + if (!od->cache) { + struct path lowerpath; + struct path upperpath; + struct ovl_readdir_data rdd = { + .list = NULL, + .endp = &rdd.list, + }; + + ovl_path_lower(file->f_path.dentry, &lowerpath); + ovl_path_upper(file->f_path.dentry, &upperpath); + + rdd.dir = upperpath.dentry; + res = ovl_dir_read(&upperpath, &rdd, ovl_fill_upper); + if (!res) + res = ovl_dir_read(&lowerpath, &rdd, ovl_fill_lower); + + if (res) + return res; + + od->cache = rdd.list; + } + + off = 0; + for (p = od->cache; p; p = p->next) { + int over; + + if (p->is_whiteout) + continue; + + off++; + if (off <= file->f_pos) + continue; + + over = filler(buf, p->name.name, p->name.len, off - 1, + p->ino, p->type); + if (over) + break; + + file->f_pos = off; + } + + return res; +} + +static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t res; + struct ovl_dir_file *od = file->private_data; + + mutex_lock(&file->f_dentry->d_inode->i_mutex); + if (!file->f_pos) + ovl_dir_reset(file); + + if (od->is_real) { + res = vfs_llseek(od->realfile, offset, origin); + file->f_pos = od->realfile->f_pos; + } else + res = generic_file_llseek_unlocked(file, offset, origin); + mutex_unlock(&file->f_dentry->d_inode->i_mutex); + + return res; +} + +static int ovl_dir_fsync(struct file *file, int datasync) +{ + struct ovl_dir_file *od = file->private_data; + + /* May need to reopen directory if it got copied up */ + if (!od->realfile) { + struct path upperpath; + + ovl_path_upper(file->f_path.dentry, &upperpath); + od->realfile = path_open(&upperpath, O_RDONLY); + if (IS_ERR(od->realfile)) + return PTR_ERR(od->realfile); + } + + return vfs_fsync(od->realfile, datasync); +} + +static int ovl_dir_release(struct inode *inode, struct file *file) +{ + struct ovl_dir_file *od = file->private_data; + + ovl_cache_free(od->cache); + if (od->realfile) + fput(od->realfile); + kfree(od); + + return 0; +} + +static int ovl_dir_open(struct inode *inode, struct file *file) +{ + struct path realpath; + struct file *realfile; + struct ovl_dir_file *od; + enum ovl_path_type type; + + od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL); + if (!od) + return -ENOMEM; + + type = ovl_path_real(file->f_path.dentry, &realpath); + realfile = path_open(&realpath, file->f_flags); + if (IS_ERR(realfile)) { + kfree(od); + return PTR_ERR(realfile); + } + od->realfile = realfile; + od->is_real = (type != OVL_PATH_MERGE); + file->private_data = od; + + return 0; +} + +static const struct file_operations ovl_dir_operations = { + .read = generic_read_dir, + .open = ovl_dir_open, + .readdir = ovl_readdir, + .llseek = ovl_dir_llseek, + .fsync = ovl_dir_fsync, + .release = ovl_dir_release, +}; + +static const struct inode_operations ovl_dir_inode_operations; + +static void ovl_dentry_release(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe) { + dput(oe->__upperdentry); + dput(oe->lowerdentry); + kfree(oe); + } +} + +static const struct dentry_operations ovl_dentry_operations = { + .d_release = ovl_dentry_release, +}; + +static struct inode *ovl_new_inode(struct super_block *sb, umode_t mode) +{ + struct ovl_fs *ufs = sb->s_fs_info; + struct inode *inode; + + switch (mode & S_IFMT) { + case S_IFDIR: + inode = new_inode(sb); + inode->i_flags |= S_NOATIME|S_NOCMTIME; + inode->i_op = &ovl_dir_inode_operations; + inode->i_fop = &ovl_dir_operations; + inode->i_mode = S_IFDIR; + break; + + case S_IFLNK: + inode = ufs->symlink_inode; + atomic_inc(&inode->i_count); + break; + + case S_IFREG: + inode = ufs->regular_inode; + atomic_inc(&inode->i_count); + break; + + case S_IFSOCK: + case S_IFBLK: + case S_IFCHR: + case S_IFIFO: + inode = ufs->special_inode; + atomic_inc(&inode->i_count); + break; + + default: + WARN(1, "illegal file type: %i\n", mode & S_IFMT); + inode = NULL; + } + + return inode; + +} + +static struct dentry *ovl_lookup_real(struct dentry *dir, struct qstr *name) +{ + struct dentry *dentry; + + mutex_lock(&dir->d_inode->i_mutex); + dentry = lookup_one_len(name->name, dir, name->len); + mutex_unlock(&dir->d_inode->i_mutex); + + if (IS_ERR(dentry)) { + if (PTR_ERR(dentry) == -ENOENT) + dentry = NULL; + } else if (!dentry->d_inode) { + dput(dentry); + dentry = NULL; + } + return dentry; +} + +static struct ovl_entry *ovl_alloc_entry(void) +{ + return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL); +} + +static struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct ovl_entry *oe; + struct dentry *upperdir; + struct dentry *lowerdir; + struct dentry *upperdentry = NULL; + struct dentry *lowerdentry = NULL; + struct inode *inode = NULL; + int err; + + err = -ENOMEM; + oe = ovl_alloc_entry(); + if (!oe) + goto out; + + upperdir = ovl_dentry_upper(dentry->d_parent); + lowerdir = ovl_dentry_lower(dentry->d_parent); + + if (upperdir) { + upperdentry = ovl_lookup_real(upperdir, &dentry->d_name); + err = PTR_ERR(upperdentry); + if (IS_ERR(upperdentry)) + goto out_put_dir; + + if (lowerdir && upperdentry && + (S_ISLNK(upperdentry->d_inode->i_mode) || + S_ISDIR(upperdentry->d_inode->i_mode))) { + const struct cred *old_cred; + struct cred *override_cred; + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_dput_upper; + + /* CAP_SYS_ADMIN needed for getxattr */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + old_cred = override_creds(override_cred); + + if (ovl_is_opaquedir(upperdentry)) { + oe->opaque = true; + } else if (ovl_is_whiteout(upperdentry)) { + dput(upperdentry); + upperdentry = NULL; + oe->opaque = true; + } + revert_creds(old_cred); + put_cred(override_cred); + } + } + if (lowerdir && !oe->opaque) { + lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name); + err = PTR_ERR(lowerdentry); + if (IS_ERR(lowerdentry)) + goto out_dput_upper; + } + + if (lowerdentry && upperdentry && + (!S_ISDIR(upperdentry->d_inode->i_mode) || + !S_ISDIR(lowerdentry->d_inode->i_mode))) { + dput(lowerdentry); + lowerdentry = NULL; + oe->opaque = true; + } + + if (lowerdentry || upperdentry) { + struct dentry *realdentry; + + realdentry = upperdentry ? upperdentry : lowerdentry; + err = -ENOMEM; + inode = ovl_new_inode(dir->i_sb, realdentry->d_inode->i_mode); + if (!inode) + goto out_dput; + } + + if (upperdentry) + oe->__upperdentry = upperdentry; + + if (lowerdentry) + oe->lowerdentry = lowerdentry; + + dentry->d_fsdata = oe; + dentry->d_op = &ovl_dentry_operations; + d_add(dentry, inode); + + return NULL; + +out_dput: + dput(lowerdentry); +out_dput_upper: + dput(upperdentry); +out_put_dir: + kfree(oe); +out: + return ERR_PTR(err); +} + +static int ovl_copy_up_xattr(struct dentry *old, struct dentry *new) +{ + ssize_t list_size, size; + char *buf, *name, *value; + int error; + + if (!old->d_inode->i_op->getxattr || + !new->d_inode->i_op->getxattr) + return 0; + + list_size = vfs_listxattr(old, NULL, 0); + if (list_size <= 0) + return list_size; + + buf = kzalloc(list_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + error = -ENOMEM; + value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL); + if (!value) + goto out; + + list_size = vfs_listxattr(old, buf, list_size); + if (list_size <= 0) { + error = list_size; + goto out_free_value; + } + + for (name = buf; name < (buf + list_size); name += strlen(name) + 1) { + size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX); + if (size <= 0) { + error = size; + goto out_free_value; + } + error = vfs_setxattr(new, name, value, size, 0); + if (error) + goto out_free_value; + } + +out_free_value: + kfree(value); +out: + kfree(buf); + return error; +} + +static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) +{ + struct file *old_file; + struct file *new_file; + int error = 0; + + if (len == 0) + return 0; + + old_file = path_open(old, O_RDONLY); + if (IS_ERR(old_file)) + return PTR_ERR(old_file); + + new_file = path_open(new, O_WRONLY); + if (IS_ERR(new_file)) { + error = PTR_ERR(new_file); + goto out_fput; + } + + /* FIXME: copy up sparse files efficiently */ + while (len) { + loff_t offset = new_file->f_pos; + size_t this_len = OVL_COPY_UP_CHUNK_SIZE; + long bytes; + + if (len < this_len) + this_len = len; + + if (signal_pending_state(TASK_KILLABLE, current)) + return -EINTR; + + bytes = do_splice_direct(old_file, &offset, new_file, this_len, + SPLICE_F_MOVE); + if (bytes <= 0) { + error = bytes; + break; + } + + len -= bytes; + } + + fput(new_file); +out_fput: + fput(old_file); + return error; +} + +static struct dentry *ovl_lookup_create(struct dentry *upperdir, + struct dentry *template) +{ + int err; + struct dentry *newdentry; + struct qstr *name = &template->d_name; + + newdentry = lookup_one_len(name->name, upperdir, name->len); + if (IS_ERR(newdentry)) + return newdentry; + + if (newdentry->d_inode) { + const struct cred *old_cred; + struct cred *override_cred; + + /* No need to check whiteout if lower parent is non-existent */ + err = -EEXIST; + if (!ovl_dentry_lower(template->d_parent)) + goto out_dput; + + if (!S_ISLNK(newdentry->d_inode->i_mode)) + goto out_dput; + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_dput; + + /* + * CAP_SYS_ADMIN for getxattr + * CAP_FOWNER for unlink in sticky directory + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + old_cred = override_creds(override_cred); + + err = -EEXIST; + if (ovl_is_whiteout(newdentry)) + err = vfs_unlink(upperdir->d_inode, newdentry); + + revert_creds(old_cred); + put_cred(override_cred); + if (err) + goto out_dput; + + dput(newdentry); + newdentry = lookup_one_len(name->name, upperdir, name->len); + if (IS_ERR(newdentry)) + return newdentry; + + /* + * Whiteout just been successfully removed, parent + * i_mutex is still held, there's no way the lookup + * could return positive. + */ + WARN_ON(newdentry->d_inode); + } + + return newdentry; + +out_dput: + dput(newdentry); + return ERR_PTR(err); +} + +static struct dentry *ovl_upper_create(struct dentry *upperdir, + struct dentry *dentry, + struct kstat *stat, const char *link) +{ + int err; + struct dentry *newdentry; + struct inode *dir = upperdir->d_inode; + + newdentry = ovl_lookup_create(upperdir, dentry); + if (IS_ERR(newdentry)) + goto out; + + switch (stat->mode & S_IFMT) { + case S_IFREG: + err = vfs_create(dir, newdentry, stat->mode, NULL); + break; + + case S_IFDIR: + err = vfs_mkdir(dir, newdentry, stat->mode); + break; + + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + err = vfs_mknod(dir, newdentry, stat->mode, stat->rdev); + break; + + case S_IFLNK: + err = vfs_symlink(dir, newdentry, link); + break; + + default: + err = -EPERM; + } + if (err) { + dput(newdentry); + newdentry = ERR_PTR(err); + } + +out: + return newdentry; + +} + +static char *ovl_read_symlink(struct dentry *realdentry) +{ + int res; + char *buf; + struct inode *inode = realdentry->d_inode; + mm_segment_t old_fs; + + res = -EINVAL; + if (!inode->i_op->readlink) + goto err; + + res = -ENOMEM; + buf = (char *) __get_free_page(GFP_KERNEL); + if (!buf) + goto err; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = inode->i_op->readlink(realdentry, + (char __user *)buf, PAGE_SIZE - 1); + set_fs(old_fs); + if (res < 0) { + free_page((unsigned long) buf); + goto err; + } + buf[res] = '\0'; + + return buf; + +err: + return ERR_PTR(res); +} + +static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat) +{ + struct iattr attr = { + .ia_valid = ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET, + .ia_atime = stat->atime, + .ia_mtime = stat->mtime, + }; + + return notify_change(upperdentry, &attr); +} + +static int ovl_set_mode(struct dentry *upperdentry, umode_t mode) +{ + struct iattr attr = { + .ia_valid = ATTR_MODE, + .ia_mode = mode, + }; + + return notify_change(upperdentry, &attr); +} + +static int ovl_set_opaque(struct dentry *upperdentry) +{ + int err; + const struct cred *old_cred; + struct cred *override_cred; + + override_cred = prepare_creds(); + if (!override_cred) + return -ENOMEM; + + /* CAP_SYS_ADMIN for setxattr of "trusted" namespace */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + old_cred = override_creds(override_cred); + err = vfs_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0); + revert_creds(old_cred); + put_cred(override_cred); + + return err; +} + +static int ovl_remove_opaque(struct dentry *upperdentry) +{ + int err; + const struct cred *old_cred; + struct cred *override_cred; + + override_cred = prepare_creds(); + if (!override_cred) + return -ENOMEM; + + /* CAP_SYS_ADMIN for removexattr of "trusted" namespace */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + old_cred = override_creds(override_cred); + err = vfs_removexattr(upperdentry, ovl_opaque_xattr); + revert_creds(old_cred); + put_cred(override_cred); + + return err; +} + +static int ovl_copy_up_locked(struct dentry *upperdir, struct dentry *dentry, + struct path *lowerpath, struct kstat *stat, + const char *link) +{ + int err; + struct path newpath; + umode_t mode = stat->mode; + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + + /* Can't properly set mode on creation because of the umask */ + stat->mode &= S_IFMT; + + newpath.mnt = ofs->upper_mnt; + newpath.dentry = ovl_upper_create(upperdir, dentry, stat, link); + if (IS_ERR(newpath.dentry)) { + err = PTR_ERR(newpath.dentry); + + /* Already copied up? */ + if (err == -EEXIST && ovl_path_type(dentry) != OVL_PATH_LOWER) + return 0; + + return err; + } + + /* FIXME: recovery from failure to copy up */ + + if (S_ISREG(stat->mode)) { + err = ovl_copy_up_data(lowerpath, &newpath, stat->size); + if (err) + return err; + } + + err = ovl_copy_up_xattr(lowerpath->dentry, newpath.dentry); + if (err) + return err; + + mutex_lock(&newpath.dentry->d_inode->i_mutex); + err = ovl_set_mode(newpath.dentry, mode); + if (!err) + err = ovl_set_timestamps(newpath.dentry, stat); + mutex_unlock(&newpath.dentry->d_inode->i_mutex); + if (err) + return err; + + ovl_dentry_update(dentry, newpath.dentry); + + /* + * Easiest way to get rid of the lower dentry reference is to + * drop this dentry. This is neither needed nor possible for + * directories. + */ + if (!S_ISDIR(stat->mode)) + d_drop(dentry); + + return 0; +} + +static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, + struct path *lowerpath, struct kstat *stat) +{ + int err; + struct kstat pstat; + struct path parentpath; + struct dentry *upperdir; + const struct cred *old_cred; + struct cred *override_cred; + char *link = NULL; + + ovl_path_upper(parent, &parentpath); + upperdir = parentpath.dentry; + + err = vfs_getattr(parentpath.mnt, parentpath.dentry, &pstat); + if (err) + return err; + + if (S_ISLNK(stat->mode)) { + link = ovl_read_symlink(lowerpath->dentry); + if (IS_ERR(link)) + return PTR_ERR(link); + } + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_free_link; + + override_cred->fsuid = stat->uid; + override_cred->fsgid = stat->gid; + /* + * CAP_SYS_ADMIN for copying up extended attributes + * CAP_DAC_OVERRIDE for create + * CAP_FOWNER for chmod, timestamp update + * CAP_FSETID for chmod + * CAP_MKNOD for mknod + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + cap_raise(override_cred->cap_effective, CAP_FSETID); + cap_raise(override_cred->cap_effective, CAP_MKNOD); + old_cred = override_creds(override_cred); + + mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT); + /* + * Using upper filesystem locking to protect against copy up + * racing with rename (rename means the copy up was already + * successful). + */ + if (dentry->d_parent != parent) { + WARN_ON((ovl_path_type(dentry) == OVL_PATH_LOWER)); + err = 0; + } else { + err = ovl_copy_up_locked(upperdir, dentry, lowerpath, + stat, link); + if (!err) { + /* Restore timestamps on parent (best effort) */ + ovl_set_timestamps(upperdir, &pstat); + } + } + + mutex_unlock(&upperdir->d_inode->i_mutex); + + revert_creds(old_cred); + put_cred(override_cred); + +out_free_link: + if (link) + free_page((unsigned long) link); + + return err; +} + +static int ovl_copy_up(struct dentry *dentry) +{ + int err; + + err = 0; + while (!err) { + struct dentry *next; + struct dentry *parent; + struct path lowerpath; + struct kstat stat; + enum ovl_path_type type = ovl_path_type(dentry); + + if (type != OVL_PATH_LOWER) + break; + + next = dget(dentry); + /* find the topmost dentry not yet copied up */ + for (;;) { + parent = dget_parent(next); + + type = ovl_path_type(parent); + if (type != OVL_PATH_LOWER) + break; + + dput(next); + next = parent; + } + + ovl_path_lower(next, &lowerpath); + err = vfs_getattr(lowerpath.mnt, lowerpath.dentry, &stat); + if (!err) + err = ovl_copy_up_one(parent, next, &lowerpath, &stat); + + dput(parent); + dput(next); + } + + return err; +} + +/* Optimize by not copying up the file first and truncating later */ +static int ovl_copy_up_truncate(struct dentry *dentry, loff_t size) +{ + int err; + struct kstat stat; + struct path lowerpath; + struct dentry *parent = dget_parent(dentry); + + err = ovl_copy_up(parent); + if (err) + goto out_dput_parent; + + ovl_path_lower(dentry, &lowerpath); + err = vfs_getattr(lowerpath.mnt, lowerpath.dentry, &stat); + if (err) + goto out_dput_parent; + + if (size < stat.size) + stat.size = size; + + err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat); + +out_dput_parent: + dput(parent); + return err; +} + +static int ovl_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct dentry *upperdentry; + int err; + + if ((attr->ia_valid & ATTR_SIZE) && !ovl_dentry_upper(dentry)) + err = ovl_copy_up_truncate(dentry, attr->ia_size); + else + err = ovl_copy_up(dentry); + if (err) + return err; + + upperdentry = ovl_dentry_upper(dentry); + + mutex_lock(&upperdentry->d_inode->i_mutex); + err = notify_change(upperdentry, attr); + mutex_unlock(&upperdentry->d_inode->i_mutex); + + return err; +} + +static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct path realpath; + + ovl_path_real(dentry, &realpath); + return vfs_getattr(realpath.mnt, realpath.dentry, stat); +} + +static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + int err; + enum ovl_path_type type; + struct path realpath; + + type = ovl_path_real(dentry, &realpath); + err = vfs_getattr(realpath.mnt, realpath.dentry, stat); + if (err) + return err; + + stat->dev = dentry->d_sb->s_dev; + stat->ino = dentry->d_inode->i_ino; + + /* + * It's probably not worth it to count subdirs to get the + * correct link count. nlink=1 seems to pacify 'find' and + * other utilities. + */ + if (type == OVL_PATH_MERGE) + stat->nlink = 1; + + return 0; +} + +static int ovl_permission(struct dentry *dentry, int mask) +{ + enum ovl_path_type type; + struct path realpath; + struct inode *inode; + int err; + + type = ovl_path_real(dentry, &realpath); + if (type != OVL_PATH_LOWER) + return dentry_permission(realpath.dentry, mask); + + inode = realpath.dentry->d_inode; + if (!(mask & MAY_WRITE) || special_file(inode->i_mode)) + return dentry_permission(realpath.dentry, mask); + + /* Don't check for read-only fs */ + if (mask & MAY_WRITE) { + if (IS_IMMUTABLE(inode)) + return -EACCES; + } + + if (inode->i_op->permission) + err = inode->i_op->permission(realpath.dentry, mask); + else + err = generic_permission(inode, mask, inode->i_op->check_acl); + + if (err) + return err; + + return security_inode_permission(inode, mask); +} + +static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, + const char *link) +{ + int err; + struct dentry *newdentry; + struct dentry *upperdir; + struct inode *inode; + struct kstat stat = { + .mode = mode, + .rdev = rdev, + }; + + err = -ENOMEM; + inode = ovl_new_inode(dentry->d_sb, mode); + if (!inode) + goto out; + + err = ovl_copy_up(dentry->d_parent); + if (err) + goto out_iput; + + upperdir = ovl_dentry_upper(dentry->d_parent); + mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT); + + newdentry = ovl_upper_create(upperdir, dentry, &stat, link); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + + if (ovl_dentry_is_opaque(dentry) && S_ISDIR(mode)) { + err = ovl_set_opaque(newdentry); + if (err) + goto out_dput; + } + ovl_dentry_update(dentry, newdentry); + d_instantiate(dentry, inode); + inode = NULL; + newdentry = NULL; + err = 0; + +out_dput: + dput(newdentry); +out_unlock: + mutex_unlock(&upperdir->d_inode->i_mutex); +out_iput: + iput(inode); +out: + return err; +} + +static int ovl_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL); +} + +static int ovl_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL); +} + +static int ovl_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) +{ + return ovl_create_object(dentry, mode, rdev, NULL); +} + +static int ovl_symlink(struct inode *dir, struct dentry *dentry, + const char *link) +{ + return ovl_create_object(dentry, S_IFLNK, 0, link); +} + +struct ovl_link_data { + struct dentry *realdentry; + void *cookie; +}; + +static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + void *ret; + struct dentry *realdentry; + struct inode *realinode; + + realdentry = ovl_dentry_real(dentry); + realinode = realdentry->d_inode; + + if (WARN_ON(!realinode->i_op->follow_link)) + return ERR_PTR(-EPERM); + + ret = realinode->i_op->follow_link(realdentry, nd); + if (IS_ERR(ret)) + return ret; + + if (realinode->i_op->put_link) { + struct ovl_link_data *data; + + data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL); + if (!data) { + realinode->i_op->put_link(realdentry, nd, ret); + return ERR_PTR(-ENOMEM); + } + data->realdentry = realdentry; + data->cookie = ret; + + return data; + } else { + return NULL; + } +} + +static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c) +{ + struct inode *realinode; + struct ovl_link_data *data = c; + + if (!data) + return; + + realinode = data->realdentry->d_inode; + realinode->i_op->put_link(data->realdentry, nd, data->cookie); + kfree(data); +} + +static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) +{ + struct path realpath; + struct inode *realinode; + + ovl_path_real(dentry, &realpath); + realinode = realpath.dentry->d_inode; + + if (!realinode->i_op->readlink) + return -EINVAL; + + touch_atime(realpath.mnt, realpath.dentry); + + return realinode->i_op->readlink(realpath.dentry, buf, bufsiz); +} + +static int ovl_whiteout(struct dentry *upperdir, struct dentry *dentry) +{ + int err; + struct dentry *newdentry; + const struct cred *old_cred; + struct cred *override_cred; + + /* FIXME: recheck lower dentry to see if whiteout is really needed */ + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out; + + /* + * CAP_SYS_ADMIN for setxattr + * CAP_DAC_OVERRIDE for symlink creation + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + override_cred->fsuid = 0; + override_cred->fsgid = 0; + old_cred = override_creds(override_cred); + + newdentry = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_put_cred; + + /* Just been removed within the same locked region */ + WARN_ON(newdentry->d_inode); + + err = vfs_symlink(upperdir->d_inode, newdentry, ovl_whiteout_symlink); + if (err) + goto out_dput; + + err = vfs_setxattr(newdentry, ovl_whiteout_xattr, "y", 1, 0); + +out_dput: + dput(newdentry); +out_put_cred: + revert_creds(old_cred); + put_cred(override_cred); +out: + return err; +} + +static int ovl_do_remove(struct dentry *dentry, bool is_dir) +{ + int err; + enum ovl_path_type type; + struct path realpath; + struct dentry *upperdir; + + err = ovl_copy_up(dentry->d_parent); + if (err) + return err; + + upperdir = ovl_dentry_upper(dentry->d_parent); + mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT); + type = ovl_path_real(dentry, &realpath); + if (type != OVL_PATH_LOWER) { + err = -ESTALE; + if (realpath.dentry->d_parent != upperdir) + goto out_d_drop; + + if (is_dir) + err = vfs_rmdir(upperdir->d_inode, realpath.dentry); + else + err = vfs_unlink(upperdir->d_inode, realpath.dentry); + if (err) + goto out_d_drop; + } + + if (type != OVL_PATH_UPPER || ovl_dentry_is_opaque(dentry)) + err = ovl_whiteout(upperdir, dentry); + + /* + * Keeping this dentry hashed would mean having to release + * upperpath/lowerpath, which could only be done if we are the + * sole user of this dentry. Too tricky... Just unhash for + * now. + */ +out_d_drop: + d_drop(dentry); + mutex_unlock(&upperdir->d_inode->i_mutex); + + return err; +} + +static int ovl_unlink(struct inode *dir, struct dentry *dentry) +{ + return ovl_do_remove(dentry, false); +} + +static int ovl_check_empty_dir(struct dentry *dentry) +{ + int err; + struct ovl_cache_entry *p; + struct path lowerpath; + struct path upperpath; + struct ovl_readdir_data rdd = { + .list = NULL, + .endp = &rdd.list, + }; + + ovl_path_upper(dentry, &upperpath); + ovl_path_lower(dentry, &lowerpath); + + if (upperpath.dentry) { + rdd.dir = upperpath.dentry; + err = ovl_dir_read(&upperpath, &rdd, ovl_fill_upper); + if (err) + return err; + } + err = ovl_dir_read(&lowerpath, &rdd, ovl_fill_lower); + if (err) + return err; + + err = 0; + for (p = rdd.list; p; p = p->next) { + if (p->is_whiteout) + continue; + + if (p->name.name[0] == '.') { + if (p->name.len == 1) + continue; + if (p->name.len == 2 && p->name.name[1] == '.') + continue; + } + err = -ENOTEMPTY; + break; + } + + ovl_cache_free(rdd.list); + + return err; +} + +static int ovl_unlink_whiteout(void *buf, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct ovl_readdir_data *rdd = buf; + + rdd->count++; + /* check d_type to filter out "." and ".." */ + if (d_type == DT_LNK) { + struct dentry *dentry; + + dentry = lookup_one_len(name, rdd->dir, strlen(name)); + if (IS_ERR(dentry)) { + rdd->err = PTR_ERR(dentry); + } else { + rdd->err = vfs_unlink(rdd->dir->d_inode, dentry); + dput(dentry); + } + } + + return rdd->err; +} + +static int ovl_remove_whiteouts(struct dentry *dentry) +{ + struct path upperpath; + struct ovl_readdir_data rdd = { .list = NULL }; + + ovl_path_upper(dentry, &upperpath); + rdd.dir = upperpath.dentry; + + return ovl_dir_read(&upperpath, &rdd, ovl_unlink_whiteout); +} + +static int ovl_rmdir(struct inode *dir, struct dentry *dentry) +{ + int err; + enum ovl_path_type type; + + type = ovl_path_type(dentry); + if (type != OVL_PATH_UPPER) { + err = ovl_check_empty_dir(dentry); + if (err) + return err; + + if (type == OVL_PATH_MERGE) { + err = ovl_remove_whiteouts(dentry); + if (err) + return err; + } + } + + return ovl_do_remove(dentry, true); +} + +static int ovl_link(struct dentry *old, struct inode *newdir, + struct dentry *new) +{ + int err; + struct dentry *olddentry; + struct dentry *newdentry; + struct dentry *upperdir; + + err = ovl_copy_up(old); + if (err) + goto out; + + err = ovl_copy_up(new->d_parent); + if (err) + goto out; + + upperdir = ovl_dentry_upper(new->d_parent); + mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT); + newdentry = ovl_lookup_create(upperdir, new); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + + olddentry = ovl_dentry_upper(old); + err = vfs_link(olddentry, upperdir->d_inode, newdentry); + if (!err) { + ovl_dentry_update(new, newdentry); + + atomic_inc(&old->d_inode->i_count); + d_instantiate(new, old->d_inode); + } else { + dput(newdentry); + } +out_unlock: + mutex_unlock(&upperdir->d_inode->i_mutex); +out: + return err; + +} + +static int ovl_rename(struct inode *olddir, struct dentry *old, + struct inode *newdir, struct dentry *new) +{ + int err; + enum ovl_path_type old_type; + struct dentry *old_upperdir; + struct dentry *new_upperdir; + struct dentry *olddentry; + struct dentry *newdentry; + struct dentry *trap; + bool is_dir = S_ISDIR(old->d_inode->i_mode); + + /* Don't copy up directory trees */ + old_type = ovl_path_type(old); + if (old_type != OVL_PATH_UPPER && is_dir) + return -EXDEV; + + if (new->d_inode) { + enum ovl_path_type new_type; + + new_type = ovl_path_type(new); + if (new_type != OVL_PATH_UPPER && + S_ISDIR(new->d_inode->i_mode)) { + err = ovl_check_empty_dir(new); + if (err) + return err; + + if (new_type == OVL_PATH_MERGE) { + err = ovl_remove_whiteouts(new); + if (err) + return err; + } + } + } + + err = ovl_copy_up(old); + if (err) + return err; + + err = ovl_copy_up(new->d_parent); + if (err) + return err; + + old_upperdir = ovl_dentry_upper(old->d_parent); + new_upperdir = ovl_dentry_upper(new->d_parent); + + trap = lock_rename(new_upperdir, old_upperdir); + + olddentry = ovl_dentry_upper(old); + newdentry = ovl_dentry_upper(new); + if (newdentry) { + dget(newdentry); + } else { + newdentry = ovl_lookup_create(new_upperdir, new); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + } + + err = -ESTALE; + if (olddentry->d_parent != old_upperdir) + goto out_dput; + if (newdentry->d_parent != new_upperdir) + goto out_dput; + if (olddentry == trap) + goto out_dput; + if (newdentry == trap) + goto out_dput; + + err = vfs_rename(old_upperdir->d_inode, olddentry, + new_upperdir->d_inode, newdentry); + + if (!err) { + bool old_opaque = ovl_dentry_is_opaque(old); + bool new_opaque = ovl_dentry_is_opaque(new); + + if (ovl_path_type(new) != OVL_PATH_UPPER) + new_opaque = true; + + if (old_type != OVL_PATH_UPPER || old_opaque) + err = ovl_whiteout(old_upperdir, old); + if (!err && is_dir) { + if (old_opaque && !new_opaque) { + ovl_remove_opaque(olddentry); + ovl_dentry_set_opaque(old, false); + } + if (!old_opaque && new_opaque) { + err = ovl_set_opaque(olddentry); + ovl_dentry_set_opaque(old, true); + } + } + } + +out_dput: + dput(newdentry); +out_unlock: + unlock_rename(new_upperdir, old_upperdir); + return err; +} + +static bool ovl_is_private_xattr(const char *name) +{ + return strncmp(name, "trusted.overlay.", 14) == 0; +} + +static int ovl_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + int err; + struct dentry *upperdentry; + + if (ovl_is_private_xattr(name)) + return -EPERM; + + err = ovl_copy_up(dentry); + if (err) + return err; + + upperdentry = ovl_dentry_upper(dentry); + return vfs_setxattr(upperdentry, name, value, size, flags); +} + +static ssize_t ovl_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size) +{ + if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE && + ovl_is_private_xattr(name)) + return -ENODATA; + + return vfs_getxattr(ovl_dentry_real(dentry), name, value, size); +} + +static ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) +{ + ssize_t res; + int off; + + res = vfs_listxattr(ovl_dentry_real(dentry), list, size); + if (res <= 0 || size == 0) + return res; + + if (ovl_path_type(dentry->d_parent) != OVL_PATH_MERGE) + return res; + + /* filter out private xattrs */ + for (off = 0; off < res;) { + char *s = list + off; + size_t slen = strlen(s) + 1; + + BUG_ON(off + slen > res); + + if (ovl_is_private_xattr(s)) { + res -= slen; + memmove(s, s + slen, res - off); + } else { + off += slen; + } + } + + return res; +} + +static int ovl_removexattr(struct dentry *dentry, const char *name) +{ + int err; + struct path realpath; + enum ovl_path_type type; + + if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE && + ovl_is_private_xattr(name)) + return -ENODATA; + + type = ovl_path_real(dentry, &realpath); + if (type == OVL_PATH_LOWER) { + err = vfs_getxattr(realpath.dentry, name, NULL, 0); + if (err < 0) + return err; + + err = ovl_copy_up(dentry); + if (err) + return err; + + ovl_path_upper(dentry, &realpath); + } + + return vfs_removexattr(realpath.dentry, name); +} + +static const struct inode_operations ovl_dir_inode_operations = { + .lookup = ovl_lookup, + .mkdir = ovl_mkdir, + .symlink = ovl_symlink, + .unlink = ovl_unlink, + .rmdir = ovl_rmdir, + .rename = ovl_rename, + .link = ovl_link, + .setattr = ovl_setattr, + .create = ovl_create, + .mknod = ovl_mknod, + .permission = ovl_permission, + .getattr = ovl_dir_getattr, + .setxattr = ovl_setxattr, + .getxattr = ovl_getxattr, + .listxattr = ovl_listxattr, + .removexattr = ovl_removexattr, +}; + +static const struct inode_operations ovl_file_inode_operations = { + .setattr = ovl_setattr, + .permission = ovl_permission, + .getattr = ovl_getattr, + .setxattr = ovl_setxattr, + .getxattr = ovl_getxattr, + .listxattr = ovl_listxattr, + .removexattr = ovl_removexattr, +}; + +static const struct inode_operations ovl_symlink_inode_operations = { + .setattr = ovl_setattr, + .follow_link = ovl_follow_link, + .put_link = ovl_put_link, + .readlink = ovl_readlink, + .getattr = ovl_getattr, + .setxattr = ovl_setxattr, + .getxattr = ovl_getxattr, + .listxattr = ovl_listxattr, + .removexattr = ovl_removexattr, +}; + +static bool ovl_open_need_copy_up(struct file *file, enum ovl_path_type type, + struct dentry *realdentry) +{ + if (type != OVL_PATH_LOWER) + return false; + + if (special_file(realdentry->d_inode->i_mode)) + return false; + + if (!(file->f_mode & FMODE_WRITE) && !(file->f_flags & O_TRUNC)) + return false; + + return true; +} + +static struct file *ovl_open(struct file *file) +{ + int err; + struct path realpath; + enum ovl_path_type type; + struct dentry *dentry = file->f_path.dentry; + + type = ovl_path_real(dentry, &realpath); + if (ovl_open_need_copy_up(file, type, realpath.dentry)) { + if (file->f_flags & O_TRUNC) + err = ovl_copy_up_truncate(dentry, 0); + else + err = ovl_copy_up(dentry); + if (err) + return ERR_PTR(err); + + ovl_path_upper(dentry, &realpath); + } + + return path_open(&realpath, file->f_flags); +} + +static const struct file_operations ovl_file_operations = { + .open_other = ovl_open, +}; + +static void ovl_put_super(struct super_block *sb) +{ + struct ovl_fs *ufs = sb->s_fs_info; + + if (!(sb->s_flags & MS_RDONLY)) + mnt_drop_write(ufs->upper_mnt); + + mntput(ufs->upper_mnt); + mntput(ufs->lower_mnt); + + iput(ufs->symlink_inode); + iput(ufs->regular_inode); + iput(ufs->special_inode); + kfree(ufs); +} + +static int ovl_remount_fs(struct super_block *sb, int *flagsp, char *data) +{ + int flags = *flagsp; + struct ovl_fs *ufs = sb->s_fs_info; + + /* When remounting rw or ro, we need to adjust the write access to the + * upper fs. + */ + if (((flags ^ sb->s_flags) & MS_RDONLY) == 0) + /* No change to readonly status */ + return 0; + + if (flags & MS_RDONLY) { + mnt_drop_write(ufs->upper_mnt); + return 0; + } else + return mnt_want_write(ufs->upper_mnt); +} + +static bool ovl_is_same_inode(struct dentry *d1, struct dentry *d2) +{ + struct dentry *upperd1; + struct dentry *upperd2; + + upperd1 = ovl_dentry_upper(d1); + upperd2 = ovl_dentry_upper(d2); + + if (upperd1 && upperd2) + return vfs_is_same_inode(upperd1, upperd2); + + if (upperd1 || upperd2) + return false; + + return vfs_is_same_inode(ovl_dentry_lower(d1), ovl_dentry_lower(d2)); +} + +static const struct super_operations ovl_super_operations = { + .put_super = ovl_put_super, + .remount_fs = ovl_remount_fs, + .is_same_inode = ovl_is_same_inode, +}; + +struct ovl_config { + char *lowerdir; + char *upperdir; +}; + +enum { + Opt_lowerdir, + Opt_upperdir, + Opt_err, +}; + +static const match_table_t ovl_tokens = { + {Opt_lowerdir, "lowerdir=%s"}, + {Opt_upperdir, "upperdir=%s"}, + {Opt_err, NULL} +}; + +static int ovl_parse_opt(char *opt, struct ovl_config *config) +{ + char *p; + + config->upperdir = NULL; + config->lowerdir = NULL; + + while ((p = strsep(&opt, ",")) != NULL) { + int token; + substring_t args[MAX_OPT_ARGS]; + + if (!*p) + continue; + + token = match_token(p, ovl_tokens, args); + switch (token) { + case Opt_upperdir: + kfree(config->upperdir); + config->upperdir = match_strdup(&args[0]); + if (!config->upperdir) + return -ENOMEM; + break; + + case Opt_lowerdir: + kfree(config->lowerdir); + config->lowerdir = match_strdup(&args[0]); + if (!config->lowerdir) + return -ENOMEM; + break; + + default: + return -EINVAL; + } + } + return 0; +} + +static int ovl_fill_super(struct super_block *sb, void *data, int silent) +{ + struct path lowerpath; + struct path upperpath; + struct inode *root_inode; + struct dentry *root_dentry; + struct ovl_entry *oe; + struct ovl_fs *ufs; + struct ovl_config config; + int err; + + err = ovl_parse_opt((char *) data, &config); + if (err) + goto out; + + err = -EINVAL; + if (!config.upperdir || !config.lowerdir) + goto out_free_config; + + err = -ENOMEM; + ufs = kmalloc(sizeof(struct ovl_fs), GFP_KERNEL); + if (!ufs) + goto out_free_config; + + ufs->symlink_inode = new_inode(sb); + if (!ufs->symlink_inode) + goto out_free_ufs; + + ufs->regular_inode = new_inode(sb); + if (!ufs->regular_inode) + goto out_put_symlink_inode; + + ufs->special_inode = new_inode(sb); + if (!ufs->special_inode) + goto out_put_regular_inode; + + ufs->symlink_inode->i_flags |= S_NOATIME|S_NOCMTIME; + ufs->symlink_inode->i_mode = S_IFLNK; + ufs->symlink_inode->i_op = &ovl_symlink_inode_operations; + + ufs->regular_inode->i_flags |= S_NOATIME|S_NOCMTIME; + ufs->regular_inode->i_mode = S_IFREG; + ufs->regular_inode->i_op = &ovl_file_inode_operations; + ufs->regular_inode->i_fop = &ovl_file_operations; + + ufs->special_inode->i_flags |= S_NOATIME|S_NOCMTIME; + ufs->special_inode->i_mode = S_IFSOCK; + ufs->special_inode->i_op = &ovl_file_inode_operations; + ufs->special_inode->i_fop = &ovl_file_operations; + + root_inode = ovl_new_inode(sb, S_IFDIR); + if (!root_inode) + goto out_put_special_inode; + + oe = ovl_alloc_entry(); + if (oe == NULL) + goto out_put_root; + + err = kern_path(config.upperdir, LOOKUP_FOLLOW, &upperpath); + if (err) + goto out_free_oe; + + err = kern_path(config.lowerdir, LOOKUP_FOLLOW, &lowerpath); + if (err) + goto out_put_upperpath; + + err = -ENOTDIR; + if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) || + !S_ISDIR(lowerpath.dentry->d_inode->i_mode)) + goto out_put_lowerpath; + + if (!(sb->s_flags & MS_RDONLY)) { + err = mnt_want_write(upperpath.mnt); + if (err) + goto out_put_lowerpath; + } + + err = -ENOMEM; + root_dentry = d_alloc_root(root_inode); + if (!root_dentry) + goto out_drop_write; + + ufs->upper_mnt = upperpath.mnt; + ufs->lower_mnt = lowerpath.mnt; + + oe->__upperdentry = upperpath.dentry; + oe->lowerdentry = lowerpath.dentry; + + root_dentry->d_fsdata = oe; + root_dentry->d_op = &ovl_dentry_operations; + + sb->s_op = &ovl_super_operations; + sb->s_root = root_dentry; + sb->s_fs_info = ufs; + + return 0; + +out_drop_write: + if (!(sb->s_flags & MS_RDONLY)) + mnt_drop_write(upperpath.mnt); +out_put_lowerpath: + path_put(&lowerpath); +out_put_upperpath: + path_put(&upperpath); +out_free_oe: + kfree(oe); +out_put_root: + iput(root_inode); +out_put_special_inode: + iput(ufs->special_inode); +out_put_regular_inode: + iput(ufs->regular_inode); +out_put_symlink_inode: + iput(ufs->symlink_inode); +out_free_ufs: + kfree(ufs); +out_free_config: + kfree(config.lowerdir); + kfree(config.upperdir); +out: + return err; +} + +static int ovl_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *raw_data, struct vfsmount *mnt) +{ + return get_sb_nodev(fs_type, flags, raw_data, ovl_fill_super, mnt); +} + +static struct file_system_type ovl_fs_type = { + .owner = THIS_MODULE, + .name = "overlayfs", + .get_sb = ovl_get_sb, + .kill_sb = kill_anon_super, +}; + +static int __init ovl_init(void) +{ + return register_filesystem(&ovl_fs_type); +} + +static void __exit ovl_exit(void) +{ + unregister_filesystem(&ovl_fs_type); +} + +module_init(ovl_init); +module_exit(ovl_exit); Index: linux-2.6/fs/Kconfig =================================================================== --- linux-2.6.orig/fs/Kconfig 2010-10-26 11:29:16.000000000 +0200 +++ linux-2.6/fs/Kconfig 2010-10-26 11:29:29.000000000 +0200 @@ -63,6 +63,7 @@ source "fs/quota/Kconfig" source "fs/autofs/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" +source "fs/overlayfs/Kconfig" config CUSE tristate "Character device in Userspace support" Index: linux-2.6/fs/Makefile =================================================================== --- linux-2.6.orig/fs/Makefile 2010-10-26 11:29:16.000000000 +0200 +++ linux-2.6/fs/Makefile 2010-10-26 11:29:29.000000000 +0200 @@ -108,6 +108,7 @@ obj-$(CONFIG_AUTOFS_FS) += autofs/ obj-$(CONFIG_AUTOFS4_FS) += autofs4/ obj-$(CONFIG_ADFS_FS) += adfs/ obj-$(CONFIG_FUSE_FS) += fuse/ +obj-$(CONFIG_OVERLAYFS_FS) += overlayfs/ obj-$(CONFIG_UDF_FS) += udf/ obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ obj-$(CONFIG_OMFS_FS) += omfs/ Index: linux-2.6/fs/overlayfs/Kconfig =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6/fs/overlayfs/Kconfig 2010-10-26 11:29:29.000000000 +0200 @@ -0,0 +1,4 @@ +config OVERLAYFS_FS + tristate "Overlay filesystem support" + help + Add support for overlay filesystem. Index: linux-2.6/fs/overlayfs/Makefile =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6/fs/overlayfs/Makefile 2010-10-26 11:29:29.000000000 +0200 @@ -0,0 +1,5 @@ +# +# Makefile for the overlay filesystem. +# + +obj-$(CONFIG_OVERLAYFS_FS) += overlayfs.o -- -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html