From: Andiry Xu <jix024@xxxxxxxxxxx>

This is the entry point for NOVA filesystem mount and umount.

NOVA works on DAX devices. During initialization it gets the device
information, such as physical/virtual addresses and device size.
It does not access the DAX device during runtime.

During initialization NOVA also initializes the root inode.
The root inode is a reserved inode and resides at a fixed location.

The way to mount and initialize a NOVA instance is:

mount -t NOVA -o init /dev/pmem0 /mnt/NOVA

This creates a NOVA instance on /dev/pmem0 and mounts it on /mnt/NOVA.
Currently it cannot do anything except mount and umount.

Signed-off-by: Andiry Xu <jix024@xxxxxxxxxxx>
---
 fs/nova/super.c | 630 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 630 insertions(+)
 create mode 100644 fs/nova/super.c

diff --git a/fs/nova/super.c b/fs/nova/super.c
new file mode 100644
index 0000000..552fe5d
--- /dev/null
+++ b/fs/nova/super.c
@@ -0,0 +1,630 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Super block operations.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix024@xxxxxxxxxxx>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli@xxxxxxxxx>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/parser.h> +#include <linux/vfs.h> +#include <linux/uaccess.h> +#include <linux/io.h> +#include <linux/seq_file.h> +#include <linux/mount.h> +#include <linux/mm.h> +#include <linux/ctype.h> +#include <linux/bitops.h> +#include <linux/magic.h> +#include <linux/exportfs.h> +#include <linux/random.h> +#include <linux/cred.h> +#include <linux/list.h> +#include <linux/dax.h> +#include "nova.h" +#include "super.h" + +int support_clwb; + +module_param(nova_dbgmask, int, 0444); +MODULE_PARM_DESC(nova_dbgmask, "Control debugging output"); + +static struct super_operations nova_sops; + +static struct kmem_cache *nova_inode_cachep; + + +/* FIXME: should the following variable be one per NOVA instance? */ +unsigned int nova_dbgmask; + +void nova_error_mng(struct super_block *sb, const char *fmt, ...) +{ + va_list args; + + printk(KERN_CRIT "nova error: "); + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + + if (test_opt(sb, ERRORS_PANIC)) + panic("nova: panic from previous error\n"); + if (test_opt(sb, ERRORS_RO)) { + printk(KERN_CRIT "nova err: remounting filesystem read-only"); + sb->s_flags |= MS_RDONLY; + } +} + +static void nova_set_blocksize(struct super_block *sb, unsigned long size) +{ + int bits; + + /* + * We've already validated the user input and the value here must be + * between NOVA_MAX_BLOCK_SIZE and NOVA_MIN_BLOCK_SIZE + * and it must be a power of 2. 
+ */ + bits = fls(size) - 1; + sb->s_blocksize_bits = bits; + sb->s_blocksize = (1 << bits); +} + +static int nova_get_nvmm_info(struct super_block *sb, + struct nova_sb_info *sbi) +{ + void *virt_addr = NULL; + pfn_t __pfn_t; + long size; + struct dax_device *dax_dev; + int ret; + + ret = bdev_dax_supported(sb, PAGE_SIZE); + nova_dbg_verbose("%s: dax_supported = %d; bdev->super=0x%p", + __func__, ret, sb->s_bdev->bd_super); + if (ret) { + nova_err(sb, "device does not support DAX\n"); + return ret; + } + + sbi->s_bdev = sb->s_bdev; + + dax_dev = fs_dax_get_by_host(sb->s_bdev->bd_disk->disk_name); + if (!dax_dev) { + nova_err(sb, "Couldn't retrieve DAX device.\n"); + return -EINVAL; + } + sbi->s_dax_dev = dax_dev; + + size = dax_direct_access(sbi->s_dax_dev, 0, LONG_MAX/PAGE_SIZE, + &virt_addr, &__pfn_t) * PAGE_SIZE; + if (size <= 0) { + nova_err(sb, "direct_access failed\n"); + return -EINVAL; + } + + sbi->virt_addr = virt_addr; + + if (!sbi->virt_addr) { + nova_err(sb, "ioremap of the nova image failed(1)\n"); + return -EINVAL; + } + + sbi->phys_addr = pfn_t_to_pfn(__pfn_t) << PAGE_SHIFT; + sbi->initsize = size; + sbi->replica_reserved_inodes_addr = virt_addr + size - + (sbi->tail_reserved_blocks << PAGE_SHIFT); + sbi->replica_sb_addr = virt_addr + size - PAGE_SIZE; + + nova_dbg("%s: dev %s, phys_addr 0x%llx, virt_addr %p, size %ld\n", + __func__, sbi->s_bdev->bd_disk->disk_name, + sbi->phys_addr, sbi->virt_addr, sbi->initsize); + + return 0; +} + +static loff_t nova_max_size(int bits) +{ + loff_t res; + + res = (1ULL << 63) - 1; + + if (res > MAX_LFS_FILESIZE) + res = MAX_LFS_FILESIZE; + + nova_dbg_verbose("max file size %llu bytes\n", res); + return res; +} + +enum { + Opt_bpi, Opt_init, Opt_mode, Opt_uid, + Opt_gid, Opt_dax, + Opt_err_cont, Opt_err_panic, Opt_err_ro, + Opt_dbgmask, Opt_err +}; + +static const match_table_t tokens = { + { Opt_bpi, "bpi=%u" }, + { Opt_init, "init" }, + { Opt_mode, "mode=%o" }, + { Opt_uid, "uid=%u" }, + { Opt_gid, "gid=%u" }, + 
{ Opt_dax, "dax" }, + { Opt_err_cont, "errors=continue" }, + { Opt_err_panic, "errors=panic" }, + { Opt_err_ro, "errors=remount-ro" }, + { Opt_dbgmask, "dbgmask=%u" }, + { Opt_err, NULL }, +}; + +static int nova_parse_options(char *options, struct nova_sb_info *sbi, + bool remount) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + kuid_t uid; + + if (!options) + return 0; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_bpi: + if (match_int(&args[0], &option)) + goto bad_val; + if (remount && sbi->bpi) + goto bad_opt; + sbi->bpi = option; + break; + case Opt_uid: + if (match_int(&args[0], &option)) + goto bad_val; + uid = make_kuid(current_user_ns(), option); + if (remount && !uid_eq(sbi->uid, uid)) + goto bad_opt; + sbi->uid = uid; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + goto bad_val; + sbi->gid = make_kgid(current_user_ns(), option); + break; + case Opt_mode: + if (match_octal(&args[0], &option)) + goto bad_val; + sbi->mode = option & 01777U; + break; + case Opt_init: + if (remount) + goto bad_opt; + set_opt(sbi->s_mount_opt, FORMAT); + break; + case Opt_err_panic: + clear_opt(sbi->s_mount_opt, ERRORS_CONT); + clear_opt(sbi->s_mount_opt, ERRORS_RO); + set_opt(sbi->s_mount_opt, ERRORS_PANIC); + break; + case Opt_err_ro: + clear_opt(sbi->s_mount_opt, ERRORS_CONT); + clear_opt(sbi->s_mount_opt, ERRORS_PANIC); + set_opt(sbi->s_mount_opt, ERRORS_RO); + break; + case Opt_err_cont: + clear_opt(sbi->s_mount_opt, ERRORS_RO); + clear_opt(sbi->s_mount_opt, ERRORS_PANIC); + set_opt(sbi->s_mount_opt, ERRORS_CONT); + break; + case Opt_dax: + set_opt(sbi->s_mount_opt, DAX); + break; + case Opt_dbgmask: + if (match_int(&args[0], &option)) + goto bad_val; + nova_dbgmask = option; + break; + default: { + goto bad_opt; + } + } + } + + return 0; + +bad_val: + nova_info("Bad value '%s' for mount option '%s'\n", args[0].from, + p); + 
return -EINVAL; +bad_opt: + nova_info("Bad mount option: \"%s\"\n", p); + return -EINVAL; +} + + +/* Make sure we have enough space */ +static bool nova_check_size(struct super_block *sb, unsigned long size) +{ + unsigned long minimum_size; + + /* space required for super block and root directory.*/ + minimum_size = (HEAD_RESERVED_BLOCKS + TAIL_RESERVED_BLOCKS + 1) + << sb->s_blocksize_bits; + + if (size < minimum_size) + return false; + + return true; +} + +static inline void nova_sync_super(struct super_block *sb) +{ + struct nova_sb_info *sbi = NOVA_SB(sb); + struct nova_super_block *super = nova_get_super(sb); + struct nova_super_block *super_redund; + + super_redund = nova_get_redund_super(sb); + + memcpy_to_pmem_nocache((void *)super, (void *)sbi->nova_sb, + sizeof(struct nova_super_block)); + PERSISTENT_BARRIER(); + + memcpy_to_pmem_nocache((void *)super_redund, (void *)sbi->nova_sb, + sizeof(struct nova_super_block)); + PERSISTENT_BARRIER(); +} + +static struct nova_inode *nova_init(struct super_block *sb, + unsigned long size) +{ + unsigned long blocksize; + struct nova_inode *root_i, *pi; + struct nova_super_block *super; + struct nova_sb_info *sbi = NOVA_SB(sb); + + nova_info("creating an empty nova of size %lu\n", size); + sbi->num_blocks = ((unsigned long)(size) >> PAGE_SHIFT); + + nova_dbgv("nova: Default block size set to 4K\n"); + sbi->blocksize = blocksize = NOVA_DEF_BLOCK_SIZE_4K; + nova_set_blocksize(sb, sbi->blocksize); + + if (!nova_check_size(sb, size)) { + nova_warn("Specified NOVA size too small 0x%lx.\n", size); + return ERR_PTR(-EINVAL); + } + + nova_dbgv("max file name len %d\n", (unsigned int)NOVA_NAME_LEN); + + super = nova_get_super(sb); + + /* clear out super-block and inode table */ + memset_nt(super, 0, sbi->head_reserved_blocks * sbi->blocksize); + + pi = nova_get_inode_by_ino(sb, NOVA_BLOCKNODE_INO); + pi->nova_ino = NOVA_BLOCKNODE_INO; + nova_flush_buffer(pi, CACHELINE_SIZE, 1); + + sbi->nova_sb->s_size = cpu_to_le64(size); + 
sbi->nova_sb->s_blocksize = cpu_to_le32(blocksize); + sbi->nova_sb->s_magic = cpu_to_le32(NOVA_SUPER_MAGIC); + sbi->nova_sb->s_epoch_id = 0; + + nova_sync_super(sb); + + root_i = nova_get_inode_by_ino(sb, NOVA_ROOT_INO); + nova_dbgv("%s: Allocate root inode @ 0x%p\n", __func__, root_i); + + root_i->i_mode = cpu_to_le16(sbi->mode | S_IFDIR); + root_i->i_uid = cpu_to_le32(from_kuid(&init_user_ns, sbi->uid)); + root_i->i_gid = cpu_to_le32(from_kgid(&init_user_ns, sbi->gid)); + root_i->i_links_count = cpu_to_le16(2); + root_i->i_blk_type = NOVA_BLOCK_TYPE_4K; + root_i->i_flags = 0; + root_i->i_size = cpu_to_le64(sb->s_blocksize); + root_i->i_atime = root_i->i_mtime = root_i->i_ctime = + cpu_to_le32(get_seconds()); + root_i->nova_ino = cpu_to_le64(NOVA_ROOT_INO); + root_i->valid = 1; + + nova_flush_buffer(root_i, sizeof(*root_i), false); + + PERSISTENT_MARK(); + PERSISTENT_BARRIER(); + nova_info("NOVA initialization finish\n"); + return root_i; +} + +static inline void set_default_opts(struct nova_sb_info *sbi) +{ + set_opt(sbi->s_mount_opt, HUGEIOREMAP); + set_opt(sbi->s_mount_opt, ERRORS_CONT); + sbi->head_reserved_blocks = HEAD_RESERVED_BLOCKS; + sbi->tail_reserved_blocks = TAIL_RESERVED_BLOCKS; + sbi->cpus = num_online_cpus(); +} + +static void nova_root_check(struct super_block *sb, struct nova_inode *root_pi) +{ + if (!S_ISDIR(le16_to_cpu(root_pi->i_mode))) + nova_warn("root is not a directory!\n"); +} + +static int nova_fill_super(struct super_block *sb, void *data, int silent) +{ + struct nova_sb_info *sbi = NULL; + struct nova_inode *root_pi; + struct inode *root_i = NULL; + unsigned long blocksize; + u32 random = 0; + int retval = -EINVAL; + + BUILD_BUG_ON(sizeof(struct nova_super_block) > NOVA_SB_SIZE); + + sbi = kzalloc(sizeof(struct nova_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + sbi->nova_sb = kzalloc(sizeof(struct nova_super_block), GFP_KERNEL); + if (!sbi->nova_sb) { + kfree(sbi); + return -ENOMEM; + } + + sb->s_fs_info = sbi; + sbi->sb = sb; 
+ + set_default_opts(sbi); + + /* Currently the log page supports 64 journal pointer pairs */ + if (sbi->cpus > MAX_CPUS) { + nova_err(sb, "NOVA needs more log pointer pages to support more than " + __stringify(MAX_CPUS) " cpus.\n"); + goto out; + } + + retval = nova_get_nvmm_info(sb, sbi); + if (retval) { + nova_err(sb, "%s: Failed to get nvmm info.", + __func__); + goto out; + } + + get_random_bytes(&random, sizeof(u32)); + atomic_set(&sbi->next_generation, random); + + /* Init with default values */ + sbi->mode = (0755); + sbi->uid = current_fsuid(); + sbi->gid = current_fsgid(); + set_opt(sbi->s_mount_opt, HUGEIOREMAP); + + mutex_init(&sbi->s_lock); + + sbi->zeroed_page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!sbi->zeroed_page) { + retval = -ENOMEM; + nova_dbg("%s: sbi->zeroed_page failed.", + __func__); + goto out; + } + + retval = nova_parse_options(data, sbi, 0); + if (retval) { + nova_err(sb, "%s: Failed to parse nova command line options.", + __func__); + goto out; + } + + /* Init a new nova instance */ + if (sbi->s_mount_opt & NOVA_MOUNT_FORMAT) { + root_pi = nova_init(sb, sbi->initsize); + if (IS_ERR(root_pi)) { + nova_err(sb, "%s: root_pi error.", + __func__); + + goto out; + } + goto setup_sb; + } + + blocksize = le32_to_cpu(sbi->nova_sb->s_blocksize); + nova_set_blocksize(sb, blocksize); + + nova_dbg_verbose("blocksize %lu\n", blocksize); + + /* Read the root inode */ + root_pi = nova_get_inode_by_ino(sb, NOVA_ROOT_INO); + + /* Check that the root inode is in a sane state */ + nova_root_check(sb, root_pi); + + /* Set it all up.. */ +setup_sb: + sb->s_magic = le32_to_cpu(sbi->nova_sb->s_magic); + sb->s_op = &nova_sops; + sb->s_maxbytes = nova_max_size(sb->s_blocksize_bits); + sb->s_time_gran = 1000000000; // 1 second. 
+ sb->s_xattr = NULL; + sb->s_flags |= MS_NOSEC; + + root_i = nova_iget(sb, NOVA_ROOT_INO); + if (IS_ERR(root_i)) { + retval = PTR_ERR(root_i); + nova_err(sb, "%s: failed to get root inode", + __func__); + + goto out; + } + + sb->s_root = d_make_root(root_i); + if (!sb->s_root) { + nova_err(sb, "get nova root inode failed\n"); + retval = -ENOMEM; + goto out; + } + + retval = 0; + return retval; + +out: + kfree(sbi->zeroed_page); + sbi->zeroed_page = NULL; + + kfree(sbi->nova_sb); + kfree(sbi); + nova_dbg("%s failed: return %d\n", __func__, retval); + return retval; +} + +static void nova_put_super(struct super_block *sb) +{ + struct nova_sb_info *sbi = NOVA_SB(sb); + + if (sbi->virt_addr) { + sbi->virt_addr = NULL; + } + + kfree(sbi->zeroed_page); + nova_dbgmask = 0; + + kfree(sbi->nova_sb); + kfree(sbi); + sb->s_fs_info = NULL; +} + +static struct inode *nova_alloc_inode(struct super_block *sb) +{ + struct nova_inode_info *vi; + + vi = kmem_cache_alloc(nova_inode_cachep, GFP_NOFS); + if (!vi) + return NULL; + + return &vi->vfs_inode; +} + +static void nova_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct nova_inode_info *vi = NOVA_I(inode); + + nova_dbg_verbose("%s: ino %lu\n", __func__, inode->i_ino); + kmem_cache_free(nova_inode_cachep, vi); +} + +static void nova_destroy_inode(struct inode *inode) +{ + nova_dbgv("%s: %lu\n", __func__, inode->i_ino); + call_rcu(&inode->i_rcu, nova_i_callback); +} + +static void init_once(void *foo) +{ + struct nova_inode_info *vi = foo; + + inode_init_once(&vi->vfs_inode); +} + +static int __init init_inodecache(void) +{ + nova_inode_cachep = kmem_cache_create("nova_inode_cache", + sizeof(struct nova_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD), init_once); + if (nova_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before + * we destroy cache. 
+ */ + rcu_barrier(); + kmem_cache_destroy(nova_inode_cachep); +} + + +/* + * the super block writes are all done "on the fly", so the + * super block is never in a "dirty" state, so there's no need + * for write_super. + */ +static struct super_operations nova_sops = { + .alloc_inode = nova_alloc_inode, + .destroy_inode = nova_destroy_inode, + .put_super = nova_put_super, +}; + +static struct dentry *nova_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, nova_fill_super); +} + +static struct file_system_type nova_fs_type = { + .owner = THIS_MODULE, + .name = "NOVA", + .mount = nova_mount, + .kill_sb = kill_block_super, +}; + +static int __init init_nova_fs(void) +{ + int rc = 0; + + nova_dbg("%s: %d cpus online\n", __func__, num_online_cpus()); + if (arch_has_clwb()) + support_clwb = 1; + + nova_info("Arch new instructions support: CLWB %s\n", + support_clwb ? "YES" : "NO"); + + rc = init_inodecache(); + if (rc) + return rc; + + rc = register_filesystem(&nova_fs_type); + if (rc) + goto out1; + + return rc; + +out1: + destroy_inodecache(); + return rc; +} + +static void __exit exit_nova_fs(void) +{ + unregister_filesystem(&nova_fs_type); + destroy_inodecache(); +} + +MODULE_AUTHOR("Andiry Xu <jix024@xxxxxxxxxxx>"); +MODULE_DESCRIPTION("NOVA: NOn-Volatile memory Accelerated File System"); +MODULE_LICENSE("GPL"); + +module_init(init_nova_fs) +module_exit(exit_nova_fs) -- 2.7.4