Linus, The following changes since commit 2241ab53cbb5cdb08a6b2d4688feb13971058f65: Linux 6.2-rc5 (2023-01-21 16:27:01 -0800) are available in the Git repository at: ssh://git@xxxxxxxxxxxxxxxxxxx/pub/scm/linux/kernel/git/dlemoal/zonefs tags/zonefs-6.3-rc1 for you to fetch changes up to 2b188a2cfc4d8f319ad23832ec1390bdae52daf6: zonefs: make kobj_type structure constant (2023-02-13 08:03:48 +0900) Please note that this pull request generates a conflict in fs/zonefs/super.c between commits: c1632a0f1120 ("fs: port ->setattr() to pass mnt_idmap") f2d40141d5d9 ("fs: port inode_init_owner() to mnt_idmap") f861646a6562 ("quota: port to mnt_idmap") from the vfs-idmapping tree and commits: 4008e2a0b01a ("zonefs: Reorganize code") d207794ababe ("zonefs: Dynamically create file inodes when needed") from the zonefs tree. The conflict resolution looks very messy but is in fact only due to a few lines. I am including the resolution diff below for your reference. ---------------------------------------------------------------- zonefs changes for 6.3-rc1 * Reorganize zonefs code to split file-related operations into a new fs/zonefs/file.c file. From me. * Modify zonefs to use dynamically allocated inodes and dentries (using the inode and dentry caches) instead of statically allocating everything on mount. This saves a significant amount of memory for very large zoned block devices with tens of thousands of zones. From me. * Make zonefs_sb_ktype a const struct kobj_type, from Thomas. 
---------------------------------------------------------------- Damien Le Moal (6): zonefs: Reorganize code zonefs: Simplify IO error handling zonefs: Reduce struct zonefs_inode_info size zonefs: Separate zone information from inode information zonefs: Dynamically create file inodes when needed zonefs: Cache zone group directory inodes Thomas Weißschuh (1): zonefs: make kobj_type structure constant fs/zonefs/Makefile | 2 +- fs/zonefs/file.c | 878 ++++++++++++++++++++++++ fs/zonefs/super.c | 1931 +++++++++++++++++++--------------------------------- fs/zonefs/sysfs.c | 2 +- fs/zonefs/trace.h | 20 +- fs/zonefs/zonefs.h | 110 ++- 6 files changed, 1678 insertions(+), 1265 deletions(-) create mode 100644 fs/zonefs/file.c Conflict resolution: -------------------- diff --cc fs/zonefs/super.c index 72ef97320b99,010b53545e5b..000000000000 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@@ -526,85 -402,145 +402,145 @@@ void __zonefs_io_error(struct inode *in memalloc_noio_restore(noio_flag); } - static void zonefs_io_error(struct inode *inode, bool write) + static struct kmem_cache *zonefs_inode_cachep; + + static struct inode *zonefs_alloc_inode(struct super_block *sb) { - struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_inode_info *zi; + + zi = alloc_inode_sb(sb, zonefs_inode_cachep, GFP_KERNEL); + if (!zi) + return NULL; + + inode_init_once(&zi->i_vnode); + mutex_init(&zi->i_truncate_mutex); + zi->i_wr_refcnt = 0; - mutex_lock(&zi->i_truncate_mutex); - __zonefs_io_error(inode, write); - mutex_unlock(&zi->i_truncate_mutex); + return &zi->i_vnode; } - static int zonefs_file_truncate(struct inode *inode, loff_t isize) + static void zonefs_free_inode(struct inode *inode) { - struct zonefs_inode_info *zi = ZONEFS_I(inode); - loff_t old_isize; - enum req_op op; - int ret = 0; + kmem_cache_free(zonefs_inode_cachep, ZONEFS_I(inode)); + } - /* - * Only sequential zone files can be truncated and truncation is allowed - * only down to a 0 size, which is equivalent to 
a zone reset, and to - * the maximum file size, which is equivalent to a zone finish. - */ - if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) - return -EPERM; + /* + * File system stat. + */ + static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) + { + struct super_block *sb = dentry->d_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + enum zonefs_ztype t; - if (!isize) - op = REQ_OP_ZONE_RESET; - else if (isize == zi->i_max_size) - op = REQ_OP_ZONE_FINISH; + buf->f_type = ZONEFS_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_namelen = ZONEFS_NAME_MAX; + + spin_lock(&sbi->s_lock); + + buf->f_blocks = sbi->s_blocks; + if (WARN_ON(sbi->s_used_blocks > sbi->s_blocks)) + buf->f_bfree = 0; else - return -EPERM; + buf->f_bfree = buf->f_blocks - sbi->s_used_blocks; + buf->f_bavail = buf->f_bfree; + + for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) { + if (sbi->s_zgroup[t].g_nr_zones) + buf->f_files += sbi->s_zgroup[t].g_nr_zones + 1; + } + buf->f_ffree = 0; - inode_dio_wait(inode); + spin_unlock(&sbi->s_lock); - /* Serialize against page faults */ - filemap_invalidate_lock(inode->i_mapping); + buf->f_fsid = uuid_to_fsid(sbi->s_uuid.b); - /* Serialize against zonefs_iomap_begin() */ - mutex_lock(&zi->i_truncate_mutex); + return 0; + } - old_isize = i_size_read(inode); - if (isize == old_isize) - goto unlock; + enum { + Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair, + Opt_explicit_open, Opt_err, + }; - ret = zonefs_zone_mgmt(inode, op); - if (ret) - goto unlock; + static const match_table_t tokens = { + { Opt_errors_ro, "errors=remount-ro"}, + { Opt_errors_zro, "errors=zone-ro"}, + { Opt_errors_zol, "errors=zone-offline"}, + { Opt_errors_repair, "errors=repair"}, + { Opt_explicit_open, "explicit-open" }, + { Opt_err, NULL} + }; - /* - * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, - * take care of open zones. - */ - if (zi->i_flags & ZONEFS_ZONE_OPEN) { - /* - * Truncating a zone to EMPTY or FULL is the equivalent of - * closing the zone. 
For a truncation to 0, we need to - * re-open the zone to ensure new writes can be processed. - * For a truncation to the maximum file size, the zone is - * closed and writes cannot be accepted anymore, so clear - * the open flag. - */ - if (!isize) - ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); - else - zi->i_flags &= ~ZONEFS_ZONE_OPEN; + static int zonefs_parse_options(struct super_block *sb, char *options) + { + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + substring_t args[MAX_OPT_ARGS]; + char *p; + + if (!options) + return 0; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_errors_ro: + sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; + sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_RO; + break; + case Opt_errors_zro: + sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; + sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZRO; + break; + case Opt_errors_zol: + sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; + sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZOL; + break; + case Opt_errors_repair: + sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; + sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR; + break; + case Opt_explicit_open: + sbi->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN; + break; + default: + return -EINVAL; + } } - zonefs_update_stats(inode, isize); - truncate_setsize(inode, isize); - zi->i_wpoffset = isize; - zonefs_account_active(inode); + return 0; + } + + static int zonefs_show_options(struct seq_file *seq, struct dentry *root) + { + struct zonefs_sb_info *sbi = ZONEFS_SB(root->d_sb); + + if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO) + seq_puts(seq, ",errors=remount-ro"); + if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO) + seq_puts(seq, ",errors=zone-ro"); + if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL) + seq_puts(seq, ",errors=zone-offline"); + if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_REPAIR) + seq_puts(seq, ",errors=repair"); - unlock: 
- mutex_unlock(&zi->i_truncate_mutex); - filemap_invalidate_unlock(inode->i_mapping); + return 0; + } - return ret; + static int zonefs_remount(struct super_block *sb, int *flags, char *data) + { + sync_filesystem(sb); + + return zonefs_parse_options(sb, data); } -static int zonefs_inode_setattr(struct user_namespace *mnt_userns, +static int zonefs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); @@@ -641,8 -577,16 +577,16 @@@ return ret; } - setattr_copy(&init_user_ns, inode, iattr); + setattr_copy(&nop_mnt_idmap, inode, iattr); + if (S_ISREG(inode->i_mode)) { + struct zonefs_zone *z = zonefs_inode_zone(inode); + + z->z_mode = inode->i_mode; + z->z_uid = inode->i_uid; + z->z_gid = inode->i_gid; + } + return 0; } @@@ -650,753 -594,194 +594,194 @@@ static const struct inode_operations zo .setattr = zonefs_inode_setattr, }; - static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, - int datasync) + static long zonefs_fname_to_fno(const struct qstr *fname) { - struct inode *inode = file_inode(file); - int ret = 0; - - if (unlikely(IS_IMMUTABLE(inode))) - return -EPERM; + const char *name = fname->name; + unsigned int len = fname->len; + long fno = 0, shift = 1; + const char *rname; + char c = *name; + unsigned int i; /* - * Since only direct writes are allowed in sequential files, page cache - * flush is needed only for conventional zone files. + * File names are always a base-10 number string without any + * leading 0s. 
*/ - if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV) - ret = file_write_and_wait_range(file, start, end); - if (!ret) - ret = blkdev_issue_flush(inode->i_sb->s_bdev); + if (!isdigit(c)) + return -ENOENT; - if (ret) - zonefs_io_error(inode, true); + if (len > 1 && c == '0') + return -ENOENT; - return ret; - } + if (len == 1) + return c - '0'; - static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) - { - struct inode *inode = file_inode(vmf->vma->vm_file); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - vm_fault_t ret; - - if (unlikely(IS_IMMUTABLE(inode))) - return VM_FAULT_SIGBUS; - - /* - * Sanity check: only conventional zone files can have shared - * writeable mappings. - */ - if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) - return VM_FAULT_NOPAGE; - - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - - /* Serialize against truncates */ - filemap_invalidate_lock_shared(inode->i_mapping); - ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); - filemap_invalidate_unlock_shared(inode->i_mapping); - - sb_end_pagefault(inode->i_sb); - return ret; - } - - static const struct vm_operations_struct zonefs_file_vm_ops = { - .fault = filemap_fault, - .map_pages = filemap_map_pages, - .page_mkwrite = zonefs_filemap_page_mkwrite, - }; - - static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma) - { - /* - * Conventional zones accept random writes, so their files can support - * shared writable mappings. For sequential zone files, only read - * mappings are possible since there are no guarantees for write - * ordering between msync() and page cache writeback. 
- */ - if (ZONEFS_I(file_inode(file))->i_ztype == ZONEFS_ZTYPE_SEQ && - (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) - return -EINVAL; - - file_accessed(file); - vma->vm_ops = &zonefs_file_vm_ops; - - return 0; - } - - static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence) - { - loff_t isize = i_size_read(file_inode(file)); - - /* - * Seeks are limited to below the zone size for conventional zones - * and below the zone write pointer for sequential zones. In both - * cases, this limit is the inode size. - */ - return generic_file_llseek_size(file, offset, whence, isize, isize); - } - - static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, - int error, unsigned int flags) - { - struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - - if (error) { - zonefs_io_error(inode, true); - return error; - } - - if (size && zi->i_ztype != ZONEFS_ZTYPE_CNV) { - /* - * Note that we may be seeing completions out of order, - * but that is not a problem since a write completed - * successfully necessarily means that all preceding writes - * were also successful. So we can safely increase the inode - * size to the write end location. 
- */ - mutex_lock(&zi->i_truncate_mutex); - if (i_size_read(inode) < iocb->ki_pos + size) { - zonefs_update_stats(inode, iocb->ki_pos + size); - zonefs_i_size_write(inode, iocb->ki_pos + size); - } - mutex_unlock(&zi->i_truncate_mutex); - } - - return 0; - } - - static const struct iomap_dio_ops zonefs_write_dio_ops = { - .end_io = zonefs_file_write_dio_end_io, - }; - - static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) - { - struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct block_device *bdev = inode->i_sb->s_bdev; - unsigned int max = bdev_max_zone_append_sectors(bdev); - struct bio *bio; - ssize_t size; - int nr_pages; - ssize_t ret; - - max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); - iov_iter_truncate(from, max); - - nr_pages = iov_iter_npages(from, BIO_MAX_VECS); - if (!nr_pages) - return 0; - - bio = bio_alloc(bdev, nr_pages, - REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); - bio->bi_iter.bi_sector = zi->i_zsector; - bio->bi_ioprio = iocb->ki_ioprio; - if (iocb_is_dsync(iocb)) - bio->bi_opf |= REQ_FUA; - - ret = bio_iov_iter_get_pages(bio, from); - if (unlikely(ret)) - goto out_release; - - size = bio->bi_iter.bi_size; - task_io_account_write(size); - - if (iocb->ki_flags & IOCB_HIPRI) - bio_set_polled(bio, iocb); - - ret = submit_bio_wait(bio); - - /* - * If the file zone was written underneath the file system, the zone - * write pointer may not be where we expect it to be, but the zone - * append write can still succeed. So check manually that we wrote where - * we intended to, that is, at zi->i_wpoffset. 
- */ - if (!ret) { - sector_t wpsector = - zi->i_zsector + (zi->i_wpoffset >> SECTOR_SHIFT); - - if (bio->bi_iter.bi_sector != wpsector) { - zonefs_warn(inode->i_sb, - "Corrupted write pointer %llu for zone at %llu\n", - wpsector, zi->i_zsector); - ret = -EIO; - } - } - - zonefs_file_write_dio_end_io(iocb, size, ret, 0); - trace_zonefs_file_dio_append(inode, size, ret); - - out_release: - bio_release_pages(bio, false); - bio_put(bio); - - if (ret >= 0) { - iocb->ki_pos += size; - return size; - } - - return ret; - } - - /* - * Do not exceed the LFS limits nor the file zone size. If pos is under the - * limit it becomes a short access. If it exceeds the limit, return -EFBIG. - */ - static loff_t zonefs_write_check_limits(struct file *file, loff_t pos, - loff_t count) - { - struct inode *inode = file_inode(file); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - loff_t limit = rlimit(RLIMIT_FSIZE); - loff_t max_size = zi->i_max_size; - - if (limit != RLIM_INFINITY) { - if (pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - count = min(count, limit - pos); - } - - if (!(file->f_flags & O_LARGEFILE)) - max_size = min_t(loff_t, MAX_NON_LFS, max_size); - - if (unlikely(pos >= max_size)) - return -EFBIG; - - return min(count, max_size - pos); - } - - static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) - { - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - loff_t count; - - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - - if (!iov_iter_count(from)) - return 0; - - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) - return -EINVAL; - - if (iocb->ki_flags & IOCB_APPEND) { - if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) - return -EINVAL; - mutex_lock(&zi->i_truncate_mutex); - iocb->ki_pos = zi->i_wpoffset; - mutex_unlock(&zi->i_truncate_mutex); + for (i = 0, rname = name + len - 1; i < len; i++, rname--) { + c = *rname; + if (!isdigit(c)) 
+ return -ENOENT; + fno += (c - '0') * shift; + shift *= 10; } - count = zonefs_write_check_limits(file, iocb->ki_pos, - iov_iter_count(from)); - if (count < 0) - return count; - - iov_iter_truncate(from, count); - return iov_iter_count(from); - } - - /* - * Handle direct writes. For sequential zone files, this is the only possible - * write path. For these files, check that the user is issuing writes - * sequentially from the end of the file. This code assumes that the block layer - * delivers write requests to the device in sequential order. This is always the - * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE - * elevator feature is being used (e.g. mq-deadline). The block layer always - * automatically select such an elevator for zoned block devices during the - * device initialization. - */ - static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) - { - struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct super_block *sb = inode->i_sb; - bool sync = is_sync_kiocb(iocb); - bool append = false; - ssize_t ret, count; - - /* - * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT - * as this can cause write reordering (e.g. the first aio gets EAGAIN - * on the inode lock but the second goes through but is now unaligned). 
- */ - if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync && - (iocb->ki_flags & IOCB_NOWAIT)) - return -EOPNOTSUPP; - - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!inode_trylock(inode)) - return -EAGAIN; - } else { - inode_lock(inode); - } - - count = zonefs_write_checks(iocb, from); - if (count <= 0) { - ret = count; - goto inode_unlock; - } - - if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { - ret = -EINVAL; - goto inode_unlock; - } - - /* Enforce sequential writes (append only) in sequential zones */ - if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) { - mutex_lock(&zi->i_truncate_mutex); - if (iocb->ki_pos != zi->i_wpoffset) { - mutex_unlock(&zi->i_truncate_mutex); - ret = -EINVAL; - goto inode_unlock; - } - mutex_unlock(&zi->i_truncate_mutex); - append = sync; - } - - if (append) - ret = zonefs_file_dio_append(iocb, from); - else - ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, - &zonefs_write_dio_ops, 0, NULL, 0); - if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && - (ret > 0 || ret == -EIOCBQUEUED)) { - if (ret > 0) - count = ret; - - /* - * Update the zone write pointer offset assuming the write - * operation succeeded. If it did not, the error recovery path - * will correct it. Also do active seq file accounting. - */ - mutex_lock(&zi->i_truncate_mutex); - zi->i_wpoffset += count; - zonefs_account_active(inode); - mutex_unlock(&zi->i_truncate_mutex); - } - - inode_unlock: - inode_unlock(inode); - - return ret; - } - - static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, - struct iov_iter *from) - { - struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - ssize_t ret; - - /* - * Direct IO writes are mandatory for sequential zone files so that the - * write IO issuing order is preserved. 
- */ - if (zi->i_ztype != ZONEFS_ZTYPE_CNV) - return -EIO; - - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!inode_trylock(inode)) - return -EAGAIN; - } else { - inode_lock(inode); - } - - ret = zonefs_write_checks(iocb, from); - if (ret <= 0) - goto inode_unlock; - - ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops); - if (ret > 0) - iocb->ki_pos += ret; - else if (ret == -EIO) - zonefs_io_error(inode, true); - - inode_unlock: - inode_unlock(inode); - if (ret > 0) - ret = generic_write_sync(iocb, ret); - - return ret; + return fno; } - static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) + static struct inode *zonefs_get_file_inode(struct inode *dir, + struct dentry *dentry) { - struct inode *inode = file_inode(iocb->ki_filp); - - if (unlikely(IS_IMMUTABLE(inode))) - return -EPERM; - - if (sb_rdonly(inode->i_sb)) - return -EROFS; - - /* Write operations beyond the zone size are not allowed */ - if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size) - return -EFBIG; - - if (iocb->ki_flags & IOCB_DIRECT) { - ssize_t ret = zonefs_file_dio_write(iocb, from); - if (ret != -ENOTBLK) - return ret; - } - - return zonefs_file_buffered_write(iocb, from); - } - - static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size, - int error, unsigned int flags) - { - if (error) { - zonefs_io_error(file_inode(iocb->ki_filp), false); - return error; - } - - return 0; - } - - static const struct iomap_dio_ops zonefs_read_dio_ops = { - .end_io = zonefs_file_read_dio_end_io, - }; - - static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) - { - struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct super_block *sb = inode->i_sb; - loff_t isize; - ssize_t ret; - - /* Offline zones cannot be read */ - if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) - return -EPERM; - - if (iocb->ki_pos >= zi->i_max_size) - return 0; - - if (iocb->ki_flags & 
IOCB_NOWAIT) { - if (!inode_trylock_shared(inode)) - return -EAGAIN; - } else { - inode_lock_shared(inode); - } + struct zonefs_zone_group *zgroup = dir->i_private; + struct super_block *sb = dir->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + struct zonefs_zone *z; + struct inode *inode; + ino_t ino; + long fno; - /* Limit read operations to written data */ - mutex_lock(&zi->i_truncate_mutex); - isize = i_size_read(inode); - if (iocb->ki_pos >= isize) { - mutex_unlock(&zi->i_truncate_mutex); - ret = 0; - goto inode_unlock; - } - iov_iter_truncate(to, isize - iocb->ki_pos); - mutex_unlock(&zi->i_truncate_mutex); + /* Get the file number from the file name */ + fno = zonefs_fname_to_fno(&dentry->d_name); + if (fno < 0) + return ERR_PTR(fno); - if (iocb->ki_flags & IOCB_DIRECT) { - size_t count = iov_iter_count(to); + if (!zgroup->g_nr_zones || fno >= zgroup->g_nr_zones) + return ERR_PTR(-ENOENT); - if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { - ret = -EINVAL; - goto inode_unlock; - } - file_accessed(iocb->ki_filp); - ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops, - &zonefs_read_dio_ops, 0, NULL, 0); - } else { - ret = generic_file_read_iter(iocb, to); - if (ret == -EIO) - zonefs_io_error(inode, false); + z = &zgroup->g_zones[fno]; + ino = z->z_sector >> sbi->s_zone_sectors_shift; + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) { + WARN_ON_ONCE(inode->i_private != z); + return inode; } - inode_unlock: - inode_unlock_shared(inode); - - return ret; - } + inode->i_ino = ino; + inode->i_mode = z->z_mode; + inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime; + inode->i_uid = z->z_uid; + inode->i_gid = z->z_gid; + inode->i_size = z->z_wpoffset; + inode->i_blocks = z->z_capacity >> SECTOR_SHIFT; + inode->i_private = z; - /* - * Write open accounting is done only for sequential files. 
- */ - static inline bool zonefs_seq_file_need_wro(struct inode *inode, - struct file *file) - { - struct zonefs_inode_info *zi = ZONEFS_I(inode); + inode->i_op = &zonefs_file_inode_operations; + inode->i_fop = &zonefs_file_operations; + inode->i_mapping->a_ops = &zonefs_file_aops; - if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) - return false; + /* Update the inode access rights depending on the zone condition */ + zonefs_inode_update_mode(inode); - if (!(file->f_mode & FMODE_WRITE)) - return false; + unlock_new_inode(inode); - return true; + return inode; } - static int zonefs_seq_file_write_open(struct inode *inode) + static struct inode *zonefs_get_zgroup_inode(struct super_block *sb, + enum zonefs_ztype ztype) { - struct zonefs_inode_info *zi = ZONEFS_I(inode); - int ret = 0; - - mutex_lock(&zi->i_truncate_mutex); - - if (!zi->i_wr_refcnt) { - struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); - unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files); - - if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { - - if (sbi->s_max_wro_seq_files - && wro > sbi->s_max_wro_seq_files) { - atomic_dec(&sbi->s_wro_seq_files); - ret = -EBUSY; - goto unlock; - } + struct inode *root = d_inode(sb->s_root); + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + struct inode *inode; + ino_t ino = bdev_nr_zones(sb->s_bdev) + ztype + 1; - if (i_size_read(inode) < zi->i_max_size) { - ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); - if (ret) { - atomic_dec(&sbi->s_wro_seq_files); - goto unlock; - } - zi->i_flags |= ZONEFS_ZONE_OPEN; - zonefs_account_active(inode); - } - } - } + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + inode->i_ino = ino; - inode_init_owner(&init_user_ns, inode, root, S_IFDIR | 0555); ++ inode_init_owner(&nop_mnt_idmap, inode, root, S_IFDIR | 0555); + inode->i_size = sbi->s_zgroup[ztype].g_nr_zones; + inode->i_ctime = inode->i_mtime = inode->i_atime = root->i_ctime; + inode->i_private = 
&sbi->s_zgroup[ztype]; + set_nlink(inode, 2); - zi->i_wr_refcnt++; + inode->i_op = &zonefs_dir_inode_operations; + inode->i_fop = &zonefs_dir_operations; - unlock: - mutex_unlock(&zi->i_truncate_mutex); + unlock_new_inode(inode); - return ret; + return inode; } - static int zonefs_file_open(struct inode *inode, struct file *file) - { - int ret; - - ret = generic_file_open(inode, file); - if (ret) - return ret; - - if (zonefs_seq_file_need_wro(inode, file)) - return zonefs_seq_file_write_open(inode); - - return 0; - } - static void zonefs_seq_file_write_close(struct inode *inode) + static struct inode *zonefs_get_dir_inode(struct inode *dir, + struct dentry *dentry) { - struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct super_block *sb = inode->i_sb; + struct super_block *sb = dir->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); - int ret = 0; - - mutex_lock(&zi->i_truncate_mutex); - - zi->i_wr_refcnt--; - if (zi->i_wr_refcnt) - goto unlock; + const char *name = dentry->d_name.name; + enum zonefs_ztype ztype; /* - * The file zone may not be open anymore (e.g. the file was truncated to - * its maximum size or it was fully written). For this case, we only - * need to decrement the write open count. + * We only need to check for the "seq" directory and + * the "cnv" directory if we have conventional zones. */ - if (zi->i_flags & ZONEFS_ZONE_OPEN) { - ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); - if (ret) { - __zonefs_io_error(inode, false); - /* - * Leaving zones explicitly open may lead to a state - * where most zones cannot be written (zone resources - * exhausted). So take preventive action by remounting - * read-only. 
- */ - if (zi->i_flags & ZONEFS_ZONE_OPEN && - !(sb->s_flags & SB_RDONLY)) { - zonefs_warn(sb, - "closing zone at %llu failed %d\n", - zi->i_zsector, ret); - zonefs_warn(sb, - "remounting filesystem read-only\n"); - sb->s_flags |= SB_RDONLY; - } - goto unlock; - } + if (dentry->d_name.len != 3) + return ERR_PTR(-ENOENT); - zi->i_flags &= ~ZONEFS_ZONE_OPEN; - zonefs_account_active(inode); + for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { + if (sbi->s_zgroup[ztype].g_nr_zones && + memcmp(name, zonefs_zgroup_name(ztype), 3) == 0) + break; } + if (ztype == ZONEFS_ZTYPE_MAX) + return ERR_PTR(-ENOENT); - atomic_dec(&sbi->s_wro_seq_files); - - unlock: - mutex_unlock(&zi->i_truncate_mutex); - } - - static int zonefs_file_release(struct inode *inode, struct file *file) - { - /* - * If we explicitly open a zone we must close it again as well, but the - * zone management operation can fail (either due to an IO error or as - * the zone has gone offline or read-only). Make sure we don't fail the - * close(2) for user-space. 
- */ - if (zonefs_seq_file_need_wro(inode, file)) - zonefs_seq_file_write_close(inode); - - return 0; + return zonefs_get_zgroup_inode(sb, ztype); } - static const struct file_operations zonefs_file_operations = { - .open = zonefs_file_open, - .release = zonefs_file_release, - .fsync = zonefs_file_fsync, - .mmap = zonefs_file_mmap, - .llseek = zonefs_file_llseek, - .read_iter = zonefs_file_read_iter, - .write_iter = zonefs_file_write_iter, - .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, - .iopoll = iocb_bio_iopoll, - }; - - static struct kmem_cache *zonefs_inode_cachep; - - static struct inode *zonefs_alloc_inode(struct super_block *sb) + static struct dentry *zonefs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) { - struct zonefs_inode_info *zi; - - zi = alloc_inode_sb(sb, zonefs_inode_cachep, GFP_KERNEL); - if (!zi) - return NULL; - - inode_init_once(&zi->i_vnode); - mutex_init(&zi->i_truncate_mutex); - zi->i_wr_refcnt = 0; - zi->i_flags = 0; - - return &zi->i_vnode; - } - - static void zonefs_free_inode(struct inode *inode) - { - kmem_cache_free(zonefs_inode_cachep, ZONEFS_I(inode)); - } - - /* - * File system stat. 
- */ - static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) - { - struct super_block *sb = dentry->d_sb; - struct zonefs_sb_info *sbi = ZONEFS_SB(sb); - enum zonefs_ztype t; + struct inode *inode; - buf->f_type = ZONEFS_MAGIC; - buf->f_bsize = sb->s_blocksize; - buf->f_namelen = ZONEFS_NAME_MAX; + if (dentry->d_name.len > ZONEFS_NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); - spin_lock(&sbi->s_lock); - - buf->f_blocks = sbi->s_blocks; - if (WARN_ON(sbi->s_used_blocks > sbi->s_blocks)) - buf->f_bfree = 0; + if (dir == d_inode(dir->i_sb->s_root)) + inode = zonefs_get_dir_inode(dir, dentry); else - buf->f_bfree = buf->f_blocks - sbi->s_used_blocks; - buf->f_bavail = buf->f_bfree; + inode = zonefs_get_file_inode(dir, dentry); + if (IS_ERR(inode)) + return ERR_CAST(inode); - for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) { - if (sbi->s_nr_files[t]) - buf->f_files += sbi->s_nr_files[t] + 1; - } - buf->f_ffree = 0; - - spin_unlock(&sbi->s_lock); - - buf->f_fsid = uuid_to_fsid(sbi->s_uuid.b); - - return 0; + return d_splice_alias(inode, dentry); } - enum { - Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair, - Opt_explicit_open, Opt_err, - }; - - static const match_table_t tokens = { - { Opt_errors_ro, "errors=remount-ro"}, - { Opt_errors_zro, "errors=zone-ro"}, - { Opt_errors_zol, "errors=zone-offline"}, - { Opt_errors_repair, "errors=repair"}, - { Opt_explicit_open, "explicit-open" }, - { Opt_err, NULL} - }; - - static int zonefs_parse_options(struct super_block *sb, char *options) + static int zonefs_readdir_root(struct file *file, struct dir_context *ctx) { + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); - substring_t args[MAX_OPT_ARGS]; - char *p; + enum zonefs_ztype ztype = ZONEFS_ZTYPE_CNV; + ino_t base_ino = bdev_nr_zones(sb->s_bdev) + 1; - if (!options) + if (ctx->pos >= inode->i_size) return 0; - while ((p = strsep(&options, ",")) != NULL) { - int token; + if 
(!dir_emit_dots(file, ctx)) + return 0; - if (!*p) - continue; + if (ctx->pos == 2) { + if (!sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones) + ztype = ZONEFS_ZTYPE_SEQ; - token = match_token(p, tokens, args); - switch (token) { - case Opt_errors_ro: - sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; - sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_RO; - break; - case Opt_errors_zro: - sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; - sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZRO; - break; - case Opt_errors_zol: - sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; - sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZOL; - break; - case Opt_errors_repair: - sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; - sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR; - break; - case Opt_explicit_open: - sbi->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN; - break; - default: - return -EINVAL; - } + if (!dir_emit(ctx, zonefs_zgroup_name(ztype), 3, + base_ino + ztype, DT_DIR)) + return 0; + ctx->pos++; } - return 0; - } - - static int zonefs_show_options(struct seq_file *seq, struct dentry *root) - { - struct zonefs_sb_info *sbi = ZONEFS_SB(root->d_sb); - - if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO) - seq_puts(seq, ",errors=remount-ro"); - if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO) - seq_puts(seq, ",errors=zone-ro"); - if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL) - seq_puts(seq, ",errors=zone-offline"); - if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_REPAIR) - seq_puts(seq, ",errors=repair"); + if (ctx->pos == 3 && ztype != ZONEFS_ZTYPE_SEQ) { + ztype = ZONEFS_ZTYPE_SEQ; + if (!dir_emit(ctx, zonefs_zgroup_name(ztype), 3, + base_ino + ztype, DT_DIR)) + return 0; + ctx->pos++; + } return 0; }