To date, the full promise of byte-addressable access to persistent memory has only been half realized via the filesystem-dax interface. The current filesystem-dax mechanism allows an application to consume (read) data from persistent storage at byte granularity, bypassing the full-page reads required by traditional storage devices. For writes, however, applications still need to contend with page-granularity dirtying and flushing semantics, as well as filesystem coordination for metadata updates after any mmap write. This precludes use cases that rely on byte-granularity / in-place updates to persistent media.

To get around this limitation, some specialized applications use the device-dax interface to bypass the overhead and data-safety problems of the current filesystem-dax mmap-write path. QEMU-KVM is forced to use device-dax to safely pass through persistent memory to a guest [1], and some specialized databases use device-dax for byte-granularity writes. Outside of those cases, device-dax is difficult for general purpose persistent memory applications to consume, and there is demand for access to pmem without needing to contend with special device configuration and other device-dax limitations.

The 'daxfile' interface satisfies this demand and realizes one of Dave Chinner's ideas for allowing pmem applications to safely bypass fsync/msync requirements: make the file immutable with respect to the offset-to-block mappings for every extent in the file [2]. Filesystems already need to make this guarantee today for files marked as swapfiles.

The new daxctl() syscall manages setting a file into 'static-dax' mode, whereby the file is treated as a swapfile as far as the filesystem is concerned, but is not registered with the core-mm as swapfile space. A file in this mode is safe to be mapped and written without any requirement to fsync/msync the writes; the cpu cache management for flushing data to persistence can be handled completely in userspace.
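For illustration only (not part of this patch), a minimal userspace sketch of the intended usage might look like the following. It assumes the x86_64 syscall number (333) and the DAXCTL_F_STATIC flag value introduced by this patch, a pre-existing file of at least one page on a dax-capable filesystem, and a CPU/compiler with clwb support (build with -mclwb; clflushopt/clflush would be the fallback on older hardware):

/*
 * Example only: flag a file as static-dax, mmap it, and flush writes
 * to persistence from userspace (no fsync/msync).
 */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <immintrin.h>

#define __NR_daxctl	333		/* x86_64 slot added by this patch */
#define DAXCTL_F_STATIC	(1 << 2)	/* from include/uapi/linux/dax.h */

int main(int argc, char **argv)
{
	size_t len = 4096;
	const char *path;
	char *buf;
	int fd;

	if (argc < 2)
		return 1;
	path = argv[1];

	/* pin the file's block map so mmap writes need no fsync/msync */
	if (syscall(__NR_daxctl, path, DAXCTL_F_STATIC, 0) < 0) {
		perror("daxctl");
		return 1;
	}

	fd = open(path, O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* byte-granularity, in-place update */
	strcpy(buf, "hello, pmem");

	/* write back the dirtied cache line and order it -- all in userspace */
	_mm_clwb(buf);
	_mm_sfence();

	munmap(buf, len);
	close(fd);
	return 0;
}

With a static-dax file the only remaining persistence obligation on the application is cpu cache writeback, since the block map cannot change underneath the mapping.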
[1]: https://lists.gnu.org/archive/html/qemu-devel/2017-06/msg01207.html
[2]: https://lkml.org/lkml/2016/9/11/159

Cc: Jan Kara <jack@xxxxxxx>
Cc: Jeff Moyer <jmoyer@xxxxxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxx>
Cc: Dave Chinner <david@xxxxxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx>
Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
---
 arch/x86/entry/syscalls/syscall_64.tbl |    1 
 include/linux/dax.h                    |    9 ++
 include/linux/fs.h                     |    3 +
 include/linux/syscalls.h               |    1 
 include/uapi/linux/dax.h               |    8 +
 mm/Kconfig                             |    5 +
 mm/Makefile                            |    1 
 mm/daxfile.c                           |  186 ++++++++++++++++++++++++++++++++
 mm/page_io.c                           |   31 +++++
 9 files changed, 245 insertions(+)
 create mode 100644 include/uapi/linux/dax.h
 create mode 100644 mm/daxfile.c

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183e2f85..795eb93d6beb 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
 330	common	pkey_alloc		sys_pkey_alloc
 331	common	pkey_free		sys_pkey_free
 332	common	statx			sys_statx
+333	64	daxctl			sys_daxctl
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 5ec1f6c47716..5f1d0e0ed30f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -4,8 +4,17 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/radix-tree.h>
+#include <uapi/linux/dax.h>
 #include <asm/pgtable.h>
 
+/*
+ * TODO: make sys_daxctl() be the generic interface for toggling S_DAX
+ * across filesystems. For now, mark DAXCTL_F_DAX as an invalid flag
+ */
+#define DAXCTL_VALID_FLAGS (DAXCTL_F_GET | DAXCTL_F_STATIC)
+
+int daxfile_activate(struct file *daxfile, unsigned align);
+
 struct iomap_ops;
 struct dax_device;
 struct dax_operations {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3e68cabb8457..3af649fb669f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1824,8 +1824,10 @@ struct super_operations {
 #define S_NOSEC		4096	/* no suid or xattr security attributes */
 #ifdef CONFIG_FS_DAX
 #define S_DAX		8192	/* Direct Access, avoiding the page cache */
+#define S_DAXFILE	16384	/* no truncate (swapfile) semantics + dax */
 #else
 #define S_DAX		0	/* Make all the DAX code disappear */
+#define S_DAXFILE	0
 #endif
 
 /*
@@ -1865,6 +1867,7 @@ struct super_operations {
 #define IS_AUTOMOUNT(inode)	((inode)->i_flags & S_AUTOMOUNT)
 #define IS_NOSEC(inode)		((inode)->i_flags & S_NOSEC)
 #define IS_DAX(inode)		((inode)->i_flags & S_DAX)
+#define IS_DAXFILE(inode)	((inode)->i_flags & S_DAXFILE)
 #define IS_WHITEOUT(inode)	(S_ISCHR(inode->i_mode) && \
 				 (inode)->i_rdev == WHITEOUT_DEV)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 980c3c9b06f8..49e5cc4c192e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -701,6 +701,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 			unsigned long arg4, unsigned long arg5);
 asmlinkage long sys_swapon(const char __user *specialfile, int swap_flags);
 asmlinkage long sys_swapoff(const char __user *specialfile);
+asmlinkage long sys_daxctl(const char __user *path, int flags, int align);
 asmlinkage long sys_sysctl(struct __sysctl_args __user *args);
 asmlinkage long sys_sysinfo(struct sysinfo __user *info);
 asmlinkage long sys_sysfs(int option,
diff --git a/include/uapi/linux/dax.h b/include/uapi/linux/dax.h
new file mode 100644
index 000000000000..78a41bb392c0
--- /dev/null
+++ b/include/uapi/linux/dax.h
@@ -0,0 +1,8 @@
+#ifndef _UAPI_LINUX_DAX_H
+#define _UAPI_LINUX_DAX_H
+
+#define DAXCTL_F_GET	(1 << 0)
+#define DAXCTL_F_DAX	(1 << 1)
+#define DAXCTL_F_STATIC	(1 << 2)
+
+#endif /* _UAPI_LINUX_DAX_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index beb7a455915d..b874565c34eb 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -450,6 +450,11 @@ config TRANSPARENT_HUGE_PAGECACHE
 	def_bool y
 	depends on TRANSPARENT_HUGEPAGE
 
+config DAXFILE
+	def_bool y
+	depends on FS_DAX
+	depends on SWAP
+
 #
 # UP and nommu archs use km based percpu allocator
 #
diff --git a/mm/Makefile b/mm/Makefile
index 026f6a828a50..38d9025a3e37 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -56,6 +56,7 @@ endif
 
 obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
+obj-$(CONFIG_DAXFILE)	+= daxfile.o
 obj-$(CONFIG_FRONTSWAP)	+= frontswap.o
 obj-$(CONFIG_ZSWAP)	+= zswap.o
 obj-$(CONFIG_HAS_DMA)	+= dmapool.o
diff --git a/mm/daxfile.c b/mm/daxfile.c
new file mode 100644
index 000000000000..fe230199c855
--- /dev/null
+++ b/mm/daxfile.c
@@ -0,0 +1,186 @@
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/dax.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/syscalls.h>
+
+/*
+ * TODO: a list to lookup daxfiles assumes a low number of instances,
+ * revisit.
+ */
+static LIST_HEAD(daxfiles);
+static DEFINE_SPINLOCK(dax_lock);
+
+struct dax_info {
+	struct list_head list;
+	struct file *daxfile;
+};
+
+static int daxfile_disable(struct file *victim)
+{
+	int found = 0;
+	struct dax_info *d;
+	struct inode *inode;
+	struct file *daxfile;
+	struct address_space *mapping;
+
+	mapping = victim->f_mapping;
+	spin_lock(&dax_lock);
+	list_for_each_entry(d, &daxfiles, list)
+		if (d->daxfile->f_mapping == mapping) {
+			list_del(&d->list);
+			found = 1;
+			break;
+		}
+	spin_unlock(&dax_lock);
+
+	if (!found)
+		return -EINVAL;
+
+	daxfile = d->daxfile;
+
+	inode = mapping->host;
+	inode->i_flags &= ~(S_SWAPFILE | S_DAXFILE);
+	filp_close(daxfile, NULL);
+
+	return 0;
+}
+
+static int claim_daxfile_checks(struct inode *inode)
+{
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	if (!IS_DAX(inode))
+		return -EINVAL;
+
+	if (IS_SWAPFILE(inode) || IS_DAXFILE(inode))
+		return -EBUSY;
+
+	return 0;
+}
+
+int daxfile_enable(struct file *daxfile, int align)
+{
+	struct address_space *mapping;
+	struct inode *inode;
+	struct dax_info *d;
+	int rc;
+
+	if (align < 0)
+		return -EINVAL;
+
+	mapping = daxfile->f_mapping;
+	inode = mapping->host;
+
+	rc = claim_daxfile_checks(inode);
+	if (rc)
+		return rc;
+
+	rc = daxfile_activate(daxfile, align);
+	if (rc)
+		return rc;
+
+	d = kzalloc(sizeof(*d), GFP_KERNEL);
+	if (!d)
+		return -ENOMEM;
+	INIT_LIST_HEAD(&d->list);
+	d->daxfile = daxfile;
+
+	spin_lock(&dax_lock);
+	list_add(&d->list, &daxfiles);
+	spin_unlock(&dax_lock);
+
+	/*
+	 * We set S_SWAPFILE to gain "no truncate" / static block
+	 * allocation semantics, and S_DAXFILE so we can differentiate
+	 * traditional swapfiles and assume static block mappings in the
+	 * dax mmap path.
+	 */
+	inode->i_flags |= S_SWAPFILE | S_DAXFILE;
+	return 0;
+}
+
+SYSCALL_DEFINE3(daxctl, const char __user *, path, int, flags, int, align)
+{
+	int rc;
+	struct filename *name;
+	struct inode *inode = NULL;
+	struct file *daxfile = NULL;
+	struct address_space *mapping;
+
+	if (flags & ~DAXCTL_VALID_FLAGS)
+		return -EINVAL;
+
+	name = getname(path);
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+
+	daxfile = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
+	if (IS_ERR(daxfile)) {
+		rc = PTR_ERR(daxfile);
+		daxfile = NULL;
+		goto out;
+	}
+
+	mapping = daxfile->f_mapping;
+	inode = mapping->host;
+	if (flags & DAXCTL_F_GET) {
+		/*
+		 * We only report the state of DAXCTL_F_STATIC since
+		 * there are no actions for applications to take based
+		 * on the setting of S_DAX. However, if this interface
+		 * is used for toggling S_DAX presumably userspace
+		 * would want to know the state of the flag.
+		 *
+		 * TODO: revisit whether we want to report DAXCTL_F_DAX
+		 * in the IS_DAX() case.
+		 */
+		if (IS_DAXFILE(inode))
+			rc = DAXCTL_F_STATIC;
+		else
+			rc = 0;
+
+		goto out;
+	}
+
+	/*
+	 * TODO: Should unprivileged users be allowed to control daxfile
+	 * behavior? Perhaps a mount flag... is -o dax that flag?
+	 */
+	if (!capable(CAP_LINUX_IMMUTABLE)) {
+		rc = -EPERM;
+		goto out;
+	}
+
+	inode_lock(inode);
+	if (!IS_DAXFILE(inode) && (flags & DAXCTL_F_STATIC)) {
+		rc = daxfile_enable(daxfile, align);
+		/* if successfully enabled hold daxfile open */
+		if (rc == 0)
+			daxfile = NULL;
+	} else if (IS_DAXFILE(inode) && !(flags & DAXCTL_F_STATIC))
+		rc = daxfile_disable(daxfile);
+	else
+		rc = 0;
+	inode_unlock(inode);
+
+out:
+	if (daxfile)
+		filp_close(daxfile, NULL);
+	if (name)
+		putname(name);
+	return rc;
+}
diff --git a/mm/page_io.c b/mm/page_io.c
index 5cec9a3d49f2..35160ad9c51f 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -244,6 +244,37 @@ static int bmap_walk(struct file *file, const unsigned page_size,
 	goto out;
 }
 
+static int daxfile_check(sector_t block, unsigned long page_no,
+		enum bmap_check type, void *none)
+{
+	if (type == BMAP_WALK_DONE)
+		return 0;
+
+	/*
+	 * Unlike the swapfile case, fail daxfile_activate() if any file
+	 * extent is not page aligned.
+	 */
+	if (type != BMAP_WALK_FULLPAGE)
+		return -EINVAL;
+	return 0;
+}
+
+int daxfile_activate(struct file *daxfile, unsigned align)
+{
+	int rc;
+
+	if (!align)
+		align = PAGE_SIZE;
+
+	if (align < PAGE_SIZE || !is_power_of_2(align))
+		return -EINVAL;
+
+	rc = bmap_walk(daxfile, align, ULONG_MAX, NULL, daxfile_check, NULL);
+	if (rc)
+		pr_debug("daxctl: daxfile has holes\n");
+	return rc;
+}
+
 static int swapfile_check(sector_t block, unsigned long page_no,
 		enum bmap_check type, void *_sis)
 {
--