Add more file/inode operations: vector function operation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .llseek zuf_llseek ZUFS_OP_LLSEEK .fallocate zuf_fallocate ZUFS_OP_FALLOCATE .copy_file_range zuf_copy_file_range ZUFS_OP_COPY .remap_file_range zuf_clone_file_range ZUFS_OP_CLONE .fadvise zuf_fadvise (multiple see rw.c) .fiemap zuf_fiemap ZUFS_OP_FIEMAP See more comments in source code. [v2] SQUASHME zuf: fadvise fix up missing operations Mainly there was a bug found by Vlad, that POSIX_FADV_RANDOM was missing and therefore was returning an error and some tests were failing. But while at it actually implement all the missing advise. Just punch into file->f_ra the proper flags. FIXME: There is a pending patch by Jan to export generic_fadvise; for now duplicate what we need inline. [v3] zuf: lock two zii fix [v4] zuf: Reduce stack usage (fiemap) Same as for IO use the big_alloc to prevent compilation warning Signed-off-by: Boaz Harrosh <boazh@xxxxxxxxxx> --- fs/zuf/_extern.h | 3 + fs/zuf/file.c | 650 +++++++++++++++++++++++++++++++++++++++++++++- fs/zuf/rw.c | 92 +++++++ fs/zuf/zuf-core.c | 5 + fs/zuf/zus_api.h | 83 ++++++ 5 files changed, 832 insertions(+), 1 deletion(-) diff --git a/fs/zuf/_extern.h b/fs/zuf/_extern.h index cafda97c973c..2c7456724ef6 100644 --- a/fs/zuf/_extern.h +++ b/fs/zuf/_extern.h @@ -110,6 +110,9 @@ int _zufs_IO_get_multy(struct zuf_sb_info *sbi, struct inode *inode, void _zufs_IO_put_multy(struct zuf_sb_info *sbi, struct inode *inode, struct _io_gb_multy *io_gb); int zuf_rw_fallocate(struct inode *inode, uint mode, loff_t offset, loff_t len); +int zuf_rw_fadvise(struct super_block *sb, struct file *file, + loff_t offset, loff_t len, int advise, bool rand); + int zuf_iom_execute_sync(struct super_block *sb, struct inode *inode, __u64 *iom_e, uint iom_n); int zuf_iom_execute_async(struct super_block *sb, struct zus_iomap_build *iomb, diff --git a/fs/zuf/file.c b/fs/zuf/file.c index 7fcaf085bf8e..1c51529694e7 100644 --- 
a/fs/zuf/file.c +++ b/fs/zuf/file.c @@ -15,12 +15,158 @@ #include <linux/fs.h> #include <linux/uio.h> +#include <linux/falloc.h> +#include <linux/fadvise.h> +#include <linux/sched/signal.h> #include "zuf.h" long __zuf_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) { - return -ENOTSUPP; + struct zuf_inode_info *zii = ZUII(inode); + bool need_len_check, need_unmap; + loff_t unmap_len = 0; /* 0 means all file */ + loff_t new_size = len + offset; + loff_t i_size = i_size_read(inode); + int err = 0; + + zuf_dbg_vfs("[%ld] mode=0x%x offset=0x%llx len=0x%llx\n", + inode->i_ino, mode, offset, len); + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + + /* These are all the FL flags we know how to handle on the kernel side + * a zusFS that does not support one of these can just return + * EOPNOTSUPP. + */ + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_NO_HIDE_STALE | FALLOC_FL_COLLAPSE_RANGE | + FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE | + FALLOC_FL_UNSHARE_RANGE | ZUFS_FL_TRUNCATE)){ + zuf_dbg_err("Unsupported mode(0x%x)\n", mode); + return -EOPNOTSUPP; + } + + if (mode & FALLOC_FL_PUNCH_HOLE) { + need_len_check = false; + need_unmap = true; + unmap_len = len; + } else if (mode & ZUFS_FL_TRUNCATE) { + need_len_check = true; + new_size = offset; + need_unmap = true; + } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { + need_len_check = false; + need_unmap = true; + } else if (mode & FALLOC_FL_INSERT_RANGE) { + need_len_check = true; + new_size = i_size + len; + need_unmap = true; + } else if (mode & FALLOC_FL_ZERO_RANGE) { + need_len_check = !(mode & FALLOC_FL_KEEP_SIZE); + need_unmap = true; + } else { + /* FALLOC_FL_UNSHARE_RANGE same as regular */ + need_len_check = !(mode & FALLOC_FL_KEEP_SIZE); + need_unmap = false; + } + + if (need_len_check && (new_size > i_size)) { + err = inode_newsize_ok(inode, new_size); + if (unlikely(err)) { + zuf_dbg_err("inode_newsize_ok(0x%llx) => 
%d\n", + new_size, err); + goto out; + } + } + + if (need_unmap) { + zufc_goose_all_zts(ZUF_ROOT(SBI(inode->i_sb)), inode); + unmap_mapping_range(inode->i_mapping, offset, unmap_len, 1); + } + + zus_inode_cmtime_now(inode, zii->zi); + + err = zuf_rw_fallocate(inode, mode, offset, len); + + /* Even if we had an error these might have changed */ + i_size_write(inode, le64_to_cpu(zii->zi->i_size)); + inode->i_blocks = le64_to_cpu(zii->zi->i_blocks); + +out: + return err; +} + +static long zuf_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_inode; + struct zuf_inode_info *zii = ZUII(inode); + int err; + + zuf_w_lock(zii); + + err = __zuf_fallocate(inode, mode, offset, len); + + zuf_w_unlock(zii); + return err; +} + +static loff_t zuf_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct zuf_inode_info *zii = ZUII(inode); + struct zufs_ioc_seek ioc_seek = { + .hdr.in_len = sizeof(ioc_seek), + .hdr.out_len = sizeof(ioc_seek), + .hdr.operation = ZUFS_OP_LLSEEK, + .zus_ii = zii->zus_ii, + .offset_in = offset, + .whence = whence, + }; + int err = 0; + + zuf_dbg_vfs("[%ld] offset=0x%llx whence=%d\n", + inode->i_ino, offset, whence); + + if (whence != SEEK_DATA && whence != SEEK_HOLE) + return generic_file_llseek(file, offset, whence); + + zuf_r_lock(zii); + + if ((offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) || + offset > inode->i_sb->s_maxbytes) { + err = -EINVAL; + goto out; + } else if (inode->i_size <= offset) { + err = -ENXIO; + goto out; + } else if (!inode->i_blocks) { + if (whence == SEEK_HOLE) + ioc_seek.offset_out = i_size_read(inode); + else + err = -ENXIO; + goto out; + } + + err = zufc_dispatch(ZUF_ROOT(SBI(inode->i_sb)), &ioc_seek.hdr, NULL, 0); + if (unlikely(err)) { + zuf_dbg_err("zufc_dispatch failed => %d\n", err); + goto out; + } + + if (ioc_seek.offset_out != file->f_pos) { + file->f_pos = ioc_seek.offset_out; + file->f_version 
= 0; + } + +out: + zuf_r_unlock(zii); + + return err ?: ioc_seek.offset_out; } /* This function is called by both msync() and fsync(). */ @@ -87,6 +233,481 @@ static int zuf_fsync(struct file *file, loff_t start, loff_t end, int datasync) return zuf_isync(file_inode(file), start, end, datasync); } +/* This callback is called when a file is closed */ +static int zuf_flush(struct file *file, fl_owner_t id) +{ + zuf_dbg_vfs("[%ld]\n", file->f_inode->i_ino); + return 0; +} + +static int zuf_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 offset, u64 length) +{ + struct super_block *sb = inode->i_sb; + struct zuf_inode_info *zii = ZUII(inode); + struct zufs_ioc_fiemap ioc_fiemap = { + .hdr.operation = ZUFS_OP_FIEMAP, + .hdr.in_len = sizeof(ioc_fiemap), + .hdr.out_len = sizeof(ioc_fiemap), + .zus_ii = zii->zus_ii, + .start = offset, + .length = length, + .flags = fieinfo->fi_flags, + }; + long on_stack[ZUF_MAX_STACK(160) / sizeof(long)]; + struct page **pages = NULL; + enum big_alloc_type bat = 0; + uint nump = 0, extents_max = 0; + int i, err; + + zuf_dbg_vfs("[%ld] offset=0x%llx len=0x%llx extents_max=%u flags=0x%x\n", + inode->i_ino, offset, length, fieinfo->fi_extents_max, + fieinfo->fi_flags); + + /* TODO: Have support for FIEMAP_FLAG_XATTR */ + err = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + if (unlikely(err)) + return err; + + if (likely(fieinfo->fi_extents_max)) { + ulong start = (ulong)fieinfo->fi_extents_start; + ulong len = fieinfo->fi_extents_max * + sizeof(struct fiemap_extent); + ulong offset = start & (PAGE_SIZE - 1); + ulong end_offset = (offset + len) & (PAGE_SIZE - 1); + ulong __len; + int nump_r; /* signed: get_user_pages_fast may return -errno */ + + nump = md_o2p_up(offset + len); + if (ZUS_API_MAP_MAX_PAGES < nump) + nump = ZUS_API_MAP_MAX_PAGES; + + __len = nump * PAGE_SIZE - offset; + if (end_offset) + __len -= (PAGE_SIZE - end_offset); + + extents_max = __len / sizeof(struct fiemap_extent); + + ioc_fiemap.hdr.len = extents_max * sizeof(struct fiemap_extent); + 
ioc_fiemap.hdr.offset = offset; + + pages = big_alloc(nump * sizeof(*pages), sizeof(on_stack), + on_stack, GFP_KERNEL, &bat); + if (unlikely(!pages)) + return -ENOMEM; + nump_r = get_user_pages_fast(start, nump, WRITE, pages); + if (unlikely(nump_r != (int)nump)) { + nump = nump_r < 0 ? 0 : nump_r; /* unpin only what was pinned */ + err = -EFAULT; + goto free; + } + } + ioc_fiemap.extents_max = extents_max; + + zuf_r_lock(zii); + + err = zufc_dispatch(ZUF_ROOT(SBI(sb)), &ioc_fiemap.hdr, pages, nump); + if (unlikely(err)) { + zuf_dbg_err("zufs_dispatch failed => %d\n", err); + goto out; + } + + fieinfo->fi_extents_mapped = ioc_fiemap.extents_mapped; + if (unlikely(extents_max && + (extents_max < ioc_fiemap.extents_mapped))) { + zuf_err("extents_max=%d extents_mapped=%d\n", extents_max, + ioc_fiemap.extents_mapped); + err = -EINVAL; + } + +out: + zuf_r_unlock(zii); +free: /* reached with nump == number of pages actually pinned */ + for (i = 0; i < nump; ++i) + put_page(pages[i]); + + big_free(pages, bat); + + return err; +} + +/* ~~~~~ clone/copy range ~~~~~ */ + +/* + * Copy/paste from Kernel mm/filemap.c::generic_remap_checks + * FIXME: make it EXPORT_GPL + */ +static int _access_check_limits(struct file *file, loff_t pos, + loff_t *count) +{ + struct inode *inode = file->f_mapping->host; + loff_t max_size = inode->i_sb->s_maxbytes; + + if (!(file->f_flags & O_LARGEFILE)) + max_size = MAX_NON_LFS; + + if (unlikely(pos >= max_size)) + return -EFBIG; + *count = min(*count, max_size - pos); + return 0; +} + +static int _write_check_limits(struct file *file, loff_t pos, + loff_t *count) +{ + + loff_t limit = rlimit(RLIMIT_FSIZE); + + if (limit != RLIM_INFINITY) { + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + *count = min(*count, limit - pos); + } + + return _access_check_limits(file, pos, count); +} + +static int _remap_checks(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *req_count, unsigned int remap_flags) +{ + struct inode *inode_in = file_in->f_mapping->host; + struct inode *inode_out = file_out->f_mapping->host; + 
uint64_t count = *req_count; + uint64_t bcount; + loff_t size_in, size_out; + loff_t bs = inode_out->i_sb->s_blocksize; + int ret; + + /* The start of both ranges must be aligned to an fs block. */ + if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) + return -EINVAL; + + /* Ensure offsets don't wrap. */ + if (pos_in + count < pos_in || pos_out + count < pos_out) + return -EINVAL; + + size_in = i_size_read(inode_in); + size_out = i_size_read(inode_out); + + /* Dedupe requires both ranges to be within EOF. */ + if ((remap_flags & REMAP_FILE_DEDUP) && + (pos_in >= size_in || pos_in + count > size_in || + pos_out >= size_out || pos_out + count > size_out)) + return -EINVAL; + + /* Ensure the infile range is within the infile. */ + if (pos_in >= size_in) + return -EINVAL; + count = min(count, size_in - (uint64_t)pos_in); + + ret = _access_check_limits(file_in, pos_in, &count); + if (ret) + return ret; + + ret = _write_check_limits(file_out, pos_out, &count); + if (ret) + return ret; + + /* + * If the user wanted us to link to the infile's EOF, round up to the + * next block boundary for this check. + * + * Otherwise, make sure the count is also block-aligned, having + * already confirmed the starting offsets' block alignment. + */ + if (pos_in + count == size_in) { + bcount = ALIGN(size_in, bs) - pos_in; + } else { + if (!IS_ALIGNED(count, bs)) + count = ALIGN_DOWN(count, bs); + bcount = count; + } + + /* Don't allow overlapped cloning within the same file. */ + if (inode_in == inode_out && + pos_out + bcount > pos_in && + pos_out < pos_in + bcount) + return -EINVAL; + + /* + * We shortened the request but the caller can't deal with that, so + * bounce the request back to userspace. + */ + if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) + return -EINVAL; + + *req_count = count; + return 0; +} + +/* + * Copy/paste from generic_remap_file_range_prep(). 
We cannot call + * generic_remap_file_range_prep because it calles fsync twice and we do not + * want to go to the Server so many times. + * So below is just the checks. + * FIXME: Send a patch upstream to split the generic_remap_file_range_prep + * or receive a flag if to do the syncs + * + * Check that the two inodes are eligible for cloning, the ranges make + * sense. + * + * If there's an error, then the usual negative error code is returned. + * Otherwise returns 0 with *len set to the request length. + */ +static int _remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + int ret; + + /* Don't touch certain kinds of inodes */ + if (IS_IMMUTABLE(inode_out)) + return -EPERM; + + if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) + return -ETXTBSY; + + /* Don't reflink dirs, pipes, sockets... */ + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + return -EISDIR; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + return -EINVAL; + + /* Zero length dedupe exits immediately; reflink goes to EOF. */ + if (*len == 0) { + loff_t isize = i_size_read(inode_in); + + if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) + return 0; + if (pos_in > isize) + return -EINVAL; + *len = isize - pos_in; + if (*len == 0) + return 0; + } + + /* Check that we don't violate system file offset limits. */ + ret = _remap_checks(file_in, pos_in, file_out, pos_out, len, + remap_flags); + if (ret) + return ret; + + /* + * REMAP_FILE_DEDUP see if extents are the same. 
+ */ + if (remap_flags & REMAP_FILE_DEDUP) + ret = zuf_rw_file_range_compare(inode_in, pos_in, + inode_out, pos_out, *len); + + return ret; +} + +static void _lock_two_ziis(struct zuf_inode_info *zii1, + struct zuf_inode_info *zii2) +{ + if (zii1 > zii2) + swap(zii1, zii2); + + zuf_w_lock(zii1); + if (zii1 != zii2) + zuf_w_lock_nested(zii2); +} + +static void _unlock_two_ziis(struct zuf_inode_info *zii1, + struct zuf_inode_info *zii2) +{ + if (zii1 > zii2) + swap(zii1, zii2); + + if (zii1 != zii2) + zuf_w_unlock(zii2); + zuf_w_unlock(zii1); +} + +static int _clone_file_range(struct inode *src_inode, loff_t pos_in, + struct file *file_out, + struct inode *dst_inode, loff_t pos_out, + u64 len, u64 len_up, int operation) +{ + struct zuf_inode_info *src_zii = ZUII(src_inode); + struct zuf_inode_info *dst_zii = ZUII(dst_inode); + struct zus_inode *dst_zi = dst_zii->zi; + struct super_block *sb = src_inode->i_sb; + struct zufs_ioc_clone ioc_clone = { + .hdr.in_len = sizeof(ioc_clone), + .hdr.out_len = sizeof(ioc_clone), + .hdr.operation = operation, + .src_zus_ii = src_zii->zus_ii, + .dst_zus_ii = dst_zii->zus_ii, + .pos_in = pos_in, + .pos_out = pos_out, + .len = len, + .len_up = len_up, + }; + int err; + + /* NOTE: len==0 means to-end-of-file which is what we want */ + unmap_mapping_range(src_inode->i_mapping, pos_in, len, 0); + unmap_mapping_range(dst_inode->i_mapping, pos_out, len, 0); + + zufc_goose_all_zts(ZUF_ROOT(SBI(dst_inode->i_sb)), dst_inode); + + if ((len_up == 0) && (pos_in || pos_out)) { + zuf_err("Boaz Smoking 0x%llx 0x%llx 0x%llx\n", + pos_in, pos_out, len); + /* Bad caller */ + return -EINVAL; + } + + err = zufc_dispatch(ZUF_ROOT(SBI(sb)), &ioc_clone.hdr, NULL, 0); + if (unlikely(err && err != -EINTR)) { + zuf_dbg_err("failed to clone %ld -> %ld ; err=%d\n", + src_inode->i_ino, dst_inode->i_ino, err); + return err; + } + + dst_inode->i_blocks = le64_to_cpu(dst_zi->i_blocks); + i_size_write(dst_inode, le64_to_cpu(dst_zi->i_size)); /* same LE conversion as i_blocks above */ + + return err; +} + +/* FIXME: Old 
checks are not needed. I keep them to make sure they + * are not complaining. Will remove _zuf_old_checks SOON + */ +static int _zuf_old_checks(struct super_block *sb, + struct inode *src_inode, loff_t pos_in, + struct inode *dst_inode, loff_t pos_out, loff_t len) +{ + if (src_inode == dst_inode) { + if (pos_in == pos_out) { + zuf_warn("[%ld] Clone nothing!!\n", + src_inode->i_ino); + return 0; + } + if (pos_in < pos_out) { + if (pos_in + len > pos_out) { + zuf_warn("[%ld] overlapping pos_in < pos_out?? => EINVAL\n", + src_inode->i_ino); + return -EINVAL; + } + } else { + if (pos_out + len > pos_in) { + zuf_warn("[%ld] overlapping pos_out < pos_in?? => EINVAL\n", + src_inode->i_ino); + return -EINVAL; + } + } + } + + if ((pos_in & (sb->s_blocksize - 1)) || + (pos_out & (sb->s_blocksize - 1))) { + zuf_err("[%ld] Not aligned len=0x%llx pos_in=0x%llx " + "pos_out=0x%llx src-size=0x%llx dst-size=0x%llx\n", + src_inode->i_ino, len, pos_in, pos_out, + i_size_read(src_inode), i_size_read(dst_inode)); + return -EINVAL; + } + + return 0; +} + +static loff_t zuf_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, uint remap_flags) +{ + struct inode *src_inode = file_inode(file_in); + struct inode *dst_inode = file_inode(file_out); + struct zuf_inode_info *src_zii = ZUII(src_inode); + struct zuf_inode_info *dst_zii = ZUII(dst_inode); + ulong src_size = i_size_read(src_inode); + ulong dst_size = i_size_read(dst_inode); + struct super_block *sb = src_inode->i_sb; + ulong len_up; + int err; + + zuf_dbg_vfs("IN: [%ld]{0x%llx} => [%ld]{0x%llx} length=0x%llx flags=0x%x\n", + src_inode->i_ino, pos_in, dst_inode->i_ino, pos_out, len, + remap_flags); + + if (remap_flags & ~(REMAP_FILE_CAN_SHORTEN | REMAP_FILE_DEDUP)) { + /* New flags we do not know */ + zuf_dbg_err("[%ld] Unknown remap_flags(0x%x)\n", + src_inode->i_ino, remap_flags); + return -EINVAL; + } + + if ((pos_in + len > sb->s_maxbytes) || (pos_out + len > 
sb->s_maxbytes)) + return -EINVAL; + + _lock_two_ziis(src_zii, dst_zii); + + err = _remap_file_range_prep(file_in, pos_in, file_out, pos_out, &len, + remap_flags); + if (err < 0 || len == 0) + goto out; + err = _zuf_old_checks(sb, src_inode, pos_in, dst_inode, pos_out, len); + if (unlikely(err)) + goto out; + + err = file_remove_privs(file_out); + if (unlikely(err)) + goto out; + + if (!(remap_flags & REMAP_FILE_DEDUP)) + zus_inode_cmtime_now(dst_inode, dst_zii->zi); + + /* See about all-file-clone optimization */ + len_up = len; + if (!pos_in && !pos_out && (src_size <= pos_in + len) && + (dst_size <= src_size)) { + len_up = 0; + } else if (len & (sb->s_blocksize - 1)) { + /* un-aligned len, see if it is beyond EOF */ + if ((src_size > pos_in + len) || + (dst_size > pos_out + len)) { + zuf_err("[%ld][%ld] Not aligned len=0x%llx pos_in=0x%llx " + "pos_out=0x%llx src-size=0x%lx dst-size=0x%lx\n", + src_inode->i_ino, dst_inode->i_ino, len, + pos_in, pos_out, src_size, dst_size); + err = -EINVAL; + goto out; + } + len_up = md_p2o(md_o2p_up(len)); + } + + err = _clone_file_range(src_inode, pos_in, file_out, dst_inode, pos_out, + len, len_up, ZUFS_OP_CLONE); + if (unlikely(err)) + zuf_dbg_err("_clone_file_range failed => %d\n", err); + +out: + _unlock_two_ziis(src_zii, dst_zii); + return err ? 
err : len; +} + +static ssize_t zuf_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, uint flags) +{ + struct inode *src_inode = file_inode(file_in); + struct inode *dst_inode = file_inode(file_out); + ssize_t ret; + + zuf_dbg_vfs("ino-in=%ld ino-out=%ld pos_in=0x%llx pos_out=0x%llx length=0x%zx\n", + src_inode->i_ino, dst_inode->i_ino, pos_in, pos_out, len); + + ret = zuf_clone_file_range(file_in, pos_in, file_out, pos_out, len, + REMAP_FILE_ADVISORY); + + return ret; /* clone already returns bytes done (maybe shortened, maybe 0) or -errno */ +} + static ssize_t zuf_read_iter(struct kiocb *kiocb, struct iov_iter *ii) { struct inode *inode = file_inode(kiocb->ki_filp); @@ -155,16 +776,43 @@ static ssize_t zuf_write_iter(struct kiocb *kiocb, struct iov_iter *ii) return ret; } +static int zuf_fadvise(struct file *file, loff_t offset, loff_t len, + int advise) +{ + struct inode *inode = file_inode(file); + struct zuf_inode_info *zii = ZUII(inode); + int err; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + zuf_r_lock(zii); + + err = zuf_rw_fadvise(inode->i_sb, file, offset, len, advise, + file->f_mode & FMODE_RANDOM); + + zuf_r_unlock(zii); + + return err; +} + const struct file_operations zuf_file_operations = { .open = generic_file_open, .read_iter = zuf_read_iter, .write_iter = zuf_write_iter, .mmap = zuf_file_mmap, .fsync = zuf_fsync, + .llseek = zuf_llseek, + .flush = zuf_flush, + .fallocate = zuf_fallocate, + .copy_file_range = zuf_copy_file_range, + .remap_file_range = zuf_clone_file_range, + .fadvise = zuf_fadvise, }; const struct inode_operations zuf_file_inode_operations = { .setattr = zuf_setattr, .getattr = zuf_getattr, .update_time = zuf_update_time, + .fiemap = zuf_fiemap, }; diff --git a/fs/zuf/rw.c b/fs/zuf/rw.c index 48f584e71a03..60b7a3e07e17 100644 --- a/fs/zuf/rw.c +++ b/fs/zuf/rw.c @@ -664,6 +664,98 @@ ssize_t zuf_rw_write_iter(struct super_block *sb, struct inode *inode, ii, kiocb, kiocb_ra(kiocb), ZUFS_OP_WRITE, rw); } +static int 
_fadv_willneed(struct super_block *sb, struct inode *inode, + loff_t offset, loff_t len, bool rand) +{ + struct zufs_ioc_IO io = {}; + struct __zufs_ra ra = { + .start = md_o2p(offset), + .ra_pages = md_o2p_up(len), + .prev_pos = offset - 1, + }; + int err; + + io.ra.start = ra.start; + io.ra.ra_pages = ra.ra_pages; + io.ra.prev_pos = ra.prev_pos; + io.rw = rand ? ZUFS_RW_RAND : 0; + + err = _IO_dispatch(SBI(sb), &io, ZUII(inode), ZUFS_OP_PRE_READ, 0, + NULL, 0, offset, 0); + return err; +} + +static int _fadv_dontneed(struct super_block *sb, struct inode *inode, + loff_t offset, loff_t len) +{ + struct zufs_ioc_sync ioc_range = { + .hdr.in_len = sizeof(ioc_range), + .hdr.operation = ZUFS_OP_SYNC, + .zus_ii = ZUII(inode)->zus_ii, + .offset = offset, + .length = len, + .flags = ZUFS_SF_DONTNEED, + }; + + return zufc_dispatch(ZUF_ROOT(SBI(sb)), &ioc_range.hdr, NULL, 0); +} + +/* FIXME: There is a pending patch from Jan Karta to export generic_fadvise. + * until then duplicate here what we need + */ +#include <linux/backing-dev.h> + +static int _generic_fadvise(struct file *file, loff_t offset, loff_t len, + int advise) +{ + struct backing_dev_info *bdi = inode_to_bdi(file_inode(file)); + + switch (advise) { + case POSIX_FADV_NORMAL: + file->f_ra.ra_pages = bdi->ra_pages; + spin_lock(&file->f_lock); + file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&file->f_lock); + break; + case POSIX_FADV_RANDOM: + spin_lock(&file->f_lock); + file->f_mode |= FMODE_RANDOM; + spin_unlock(&file->f_lock); + break; + case POSIX_FADV_SEQUENTIAL: + file->f_ra.ra_pages = bdi->ra_pages * 2; + spin_lock(&file->f_lock); + file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&file->f_lock); + break; + case POSIX_FADV_NOREUSE: + break; + } + + return 0; +} + +int zuf_rw_fadvise(struct super_block *sb, struct file *file, + loff_t offset, loff_t len, int advise, bool rand) +{ + switch (advise) { + case POSIX_FADV_WILLNEED: + return _fadv_willneed(sb, file_inode(file), offset, len, rand); + case 
POSIX_FADV_DONTNEED: + return _fadv_dontneed(sb, file_inode(file), offset, len); + + case POSIX_FADV_SEQUENTIAL: + case POSIX_FADV_NORMAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_NOREUSE: + return _generic_fadvise(file, offset, len, advise); + default: + zuf_warn("Unknown advise %d\n", advise); + return -EINVAL; + } + return -EINVAL; +} + /* ~~~~ iom_dec.c ~~~ */ /* for now here (at rw.c) looks logical */ diff --git a/fs/zuf/zuf-core.c b/fs/zuf/zuf-core.c index cb4a4def646f..4284d2298906 100644 --- a/fs/zuf/zuf-core.c +++ b/fs/zuf/zuf-core.c @@ -95,6 +95,8 @@ const char *zuf_op_name(enum e_zufs_operation op) CASE_ENUM_NAME(ZUFS_OP_REMOVE_DENTRY); CASE_ENUM_NAME(ZUFS_OP_RENAME); CASE_ENUM_NAME(ZUFS_OP_READDIR); + CASE_ENUM_NAME(ZUFS_OP_CLONE); + CASE_ENUM_NAME(ZUFS_OP_COPY); CASE_ENUM_NAME(ZUFS_OP_READ); CASE_ENUM_NAME(ZUFS_OP_PRE_READ); @@ -102,6 +104,9 @@ const char *zuf_op_name(enum e_zufs_operation op) CASE_ENUM_NAME(ZUFS_OP_MMAP_CLOSE); CASE_ENUM_NAME(ZUFS_OP_SETATTR); CASE_ENUM_NAME(ZUFS_OP_SYNC); + CASE_ENUM_NAME(ZUFS_OP_FALLOCATE); + CASE_ENUM_NAME(ZUFS_OP_LLSEEK); + CASE_ENUM_NAME(ZUFS_OP_FIEMAP); CASE_ENUM_NAME(ZUFS_OP_GET_MULTY); CASE_ENUM_NAME(ZUFS_OP_PUT_MULTY); diff --git a/fs/zuf/zus_api.h b/fs/zuf/zus_api.h index e70bd8b7ff69..c8bcb6006fab 100644 --- a/fs/zuf/zus_api.h +++ b/fs/zuf/zus_api.h @@ -455,6 +455,8 @@ enum e_zufs_operation { ZUFS_OP_REMOVE_DENTRY = 9, ZUFS_OP_RENAME = 10, ZUFS_OP_READDIR = 11, + ZUFS_OP_CLONE = 12, + ZUFS_OP_COPY = 13, ZUFS_OP_READ = 14, ZUFS_OP_PRE_READ = 15, @@ -463,6 +465,8 @@ enum e_zufs_operation { ZUFS_OP_SETATTR = 19, ZUFS_OP_SYNC = 20, ZUFS_OP_FALLOCATE = 21, + ZUFS_OP_LLSEEK = 22, + ZUFS_OP_FIEMAP = 28, ZUFS_OP_GET_MULTY = 29, ZUFS_OP_PUT_MULTY = 30, @@ -680,6 +684,85 @@ struct zufs_ioc_sync { __u64 write_unmapped; }; +/* ZUFS_OP_CLONE */ +struct zufs_ioc_clone { + struct zufs_ioc_hdr hdr; + /* IN */ + struct zus_inode_info *src_zus_ii; + struct zus_inode_info *dst_zus_ii; + __u64 pos_in, pos_out; + __u64 len; 
+ __u64 len_up; +}; + +/* ZUFS_OP_LLSEEK */ +struct zufs_ioc_seek { + struct zufs_ioc_hdr hdr; + /* IN */ + struct zus_inode_info *zus_ii; + __u64 offset_in; + __u32 whence; + __u32 pad; + + /* OUT */ + __u64 offset_out; +}; + +/* ZUFS_OP_FIEMAP */ +struct zufs_ioc_fiemap { + struct zufs_ioc_hdr hdr; + + /* IN */ + struct zus_inode_info *zus_ii; + __u64 start; + __u64 length; + __u32 flags; + __u32 extents_max; + + /* OUT */ + __u32 extents_mapped; + __u32 pad; + +} __packed; + +struct zufs_fiemap_extent_info { + struct fiemap_extent *fi_extents_start; + __u32 fi_flags; + __u32 fi_extents_mapped; + __u32 fi_extents_max; + __u32 __pad; +}; + +static inline +int zufs_fiemap_fill_next_extent(struct zufs_fiemap_extent_info *fieinfo, + __u64 logical, __u64 phys, + __u64 len, __u32 flags) +{ + struct fiemap_extent *dest = fieinfo->fi_extents_start; + + if (fieinfo->fi_extents_max == 0) { + fieinfo->fi_extents_mapped++; + return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; + } + + if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max) + return 1; + + dest += fieinfo->fi_extents_mapped; + dest->fe_logical = logical; + dest->fe_physical = phys; + dest->fe_length = len; + dest->fe_flags = flags; + + fieinfo->fi_extents_mapped++; + if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max) + return 1; + + return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; +} + + + /* ~~~~ io_map structures && IOCTL(s) ~~~~ */ /* * These set of structures and helpers are used in return of zufs_ioc_IO and -- 2.21.0