The Duet framework code: - bittree.c: red-black bitmap tree that keeps track of items of interest - debug.c: functions used to print information used to debug Duet - hash.c: implementation of the global hash table where page events are stored for all tasks - hook.c: the function invoked by the page cache hooks when Duet is online - init.c: routines used to bring Duet online or offline - path.c: routines performing resolution of UUIDs to paths using d_path - task.c: implementation of Duet task fd operations Signed-off-by: George Amvrosiadis <gamvrosi@xxxxxxxxx> --- init/Kconfig | 2 + mm/Makefile | 1 + mm/duet/Kconfig | 31 +++ mm/duet/Makefile | 7 + mm/duet/bittree.c | 537 +++++++++++++++++++++++++++++++++++++++++++++++++ mm/duet/common.h | 211 ++++++++++++++++++++ mm/duet/debug.c | 98 +++++++++ mm/duet/hash.c | 315 +++++++++++++++++++++++++++++ mm/duet/hook.c | 81 ++++++++ mm/duet/init.c | 172 ++++++++++++++++ mm/duet/path.c | 184 +++++++++++++++++ mm/duet/syscall.h | 61 ++++++ mm/duet/task.c | 584 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 13 files changed, 2284 insertions(+) create mode 100644 mm/duet/Kconfig create mode 100644 mm/duet/Makefile create mode 100644 mm/duet/bittree.c create mode 100644 mm/duet/common.h create mode 100644 mm/duet/debug.c create mode 100644 mm/duet/hash.c create mode 100644 mm/duet/hook.c create mode 100644 mm/duet/init.c create mode 100644 mm/duet/path.c create mode 100644 mm/duet/syscall.h create mode 100644 mm/duet/task.c diff --git a/init/Kconfig b/init/Kconfig index c02d897..6f94b5a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -294,6 +294,8 @@ config USELIB earlier, you may need to enable this syscall. Current systems running glibc can safely disable this. +source mm/duet/Kconfig + config AUDIT bool "Auditing support" depends on NET diff --git a/mm/Makefile b/mm/Makefile index 78c6f7d..074c15f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -99,3 +99,4 @@ obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o +obj-$(CONFIG_DUET) += duet/ diff --git a/mm/duet/Kconfig b/mm/duet/Kconfig new file mode 100644 index 0000000..2f3a0c5 --- /dev/null +++ b/mm/duet/Kconfig @@ -0,0 +1,31 @@ +config DUET + bool "Duet framework support" + + help + Duet is a framework aiming to reduce the IO footprint of analytics + and maintenance work. By exposing page cache events to these tasks, + it allows them to adapt their data processing order, in order to + benefit from data available in the page cache. Duet's operation is + based on hooks into the page cache. + + To compile support for Duet, say Y. + +config DUET_STATS + bool "Duet statistics collection" + depends on DUET + help + This option enables support for the collection of statistics on the + operation of Duet. It will print information about the data structures + used internally, and profiling information about the framework. + + If unsure, say N. + +config DUET_DEBUG + bool "Duet debugging support" + depends on DUET + help + Enable runtime debugging support for the Duet framework. This may + enable additional and expensive checks with negative impact on + performance. + + To compile debugging support for Duet, say Y. If unsure, say N. diff --git a/mm/duet/Makefile b/mm/duet/Makefile new file mode 100644 index 0000000..c0c9e11 --- /dev/null +++ b/mm/duet/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux Duet framework. 
+# + +obj-$(CONFIG_DUET) += duet.o + +duet-y := init.o hash.o hook.o task.o bittree.o path.o debug.o diff --git a/mm/duet/bittree.c b/mm/duet/bittree.c new file mode 100644 index 0000000..3b20c35 --- /dev/null +++ b/mm/duet/bittree.c @@ -0,0 +1,537 @@ +/* + * Copyright (C) 2016 George Amvrosiadis. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include "common.h" + +#define BMAP_READ 0x01 /* Read bmaps (overrides other flags) */ +#define BMAP_CHECK 0x02 /* Check given bmap value expression */ + /* Sets bmaps to match expression if not set */ + +/* Bmap expressions can be formed using the following flags: */ +#define BMAP_DONE_SET 0x04 /* Set done bmap values */ +#define BMAP_DONE_RST 0x08 /* Reset done bmap values */ +#define BMAP_RELV_SET 0x10 /* Set relevant bmap values */ +#define BMAP_RELV_RST 0x20 /* Reset relevant bmap values */ +#define BMAP_SEEN_SET 0x40 /* Set seen bmap values */ +#define BMAP_SEEN_RST 0x80 /* Reset seen bmap values */ + +/* Some macros to make our life easier */ +#define BMAP_ALL_SET (BMAP_SEEN_SET | BMAP_RELV_SET | BMAP_DONE_SET) +#define BMAP_ALL_RST (BMAP_SEEN_RST | BMAP_RELV_RST | BMAP_DONE_RST) + +#define BITTREE_RANGE PAGE_SIZE /* Bytes per bitmap bit */ +#define BITS_PER_NODE (32768 * 8) /* 32KB bitmaps */ + +#define UUID_IDX(uuid) (((unsigned long long) uuid.gen << 32) | \ + (unsigned long long) uuid.ino) +/* + * The following functions are wrappers for the basic bitmap functions. + * A bitmap is characterized by a starting offset (bstart). The wrappers + * translate an arbitrary idx to the appropriate bit. 
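+ *
+ * As a worked example (numbers illustrative only): with BITTREE_RANGE
+ * equal to a 4096-byte PAGE_SIZE and a node whose bstart is 0, an idx
+ * of 12288 maps to bit (12288 - 0) / 4096 = 3 in each of the node's
+ * bitmaps.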
+ */ + +/* Sets (or resets) a single bit */ +static int bmap_set(unsigned long *bmap, __u64 start, __u64 idx, __u8 do_set) +{ + __u64 bofft = idx - start; + + if (bofft + 1 >= start + (BITS_PER_NODE * BITTREE_RANGE)) + return -1; + + /* Convert range to bitmap granularity */ + do_div(bofft, BITTREE_RANGE); + + if (do_set) + bitmap_set(bmap, (unsigned int)bofft, 1); + else + bitmap_clear(bmap, (unsigned int)bofft, 1); + + return 0; +} + +/* Returns value of bit at idx */ +static int bmap_read(unsigned long *bmap, __u64 start, __u64 idx) +{ + __u64 bofft64 = idx - start; + unsigned long *p, mask; + unsigned int bofft; + + if (bofft64 + 1 >= start + (BITS_PER_NODE * BITTREE_RANGE)) + return -1; + + /* Convert offset to bitmap granularity */ + do_div(bofft64, BITTREE_RANGE); + bofft = (unsigned int)bofft64; + + /* Check the bits */ + p = bmap + BIT_WORD(bofft); + mask = BITMAP_FIRST_WORD_MASK(bofft) & BITMAP_LAST_WORD_MASK(bofft + 1); + + if ((*p) & mask) + return 1; + + return 0; +} + +/* Checks whether a bit is set */ +static int bmap_chk(unsigned long *bmap, __u64 start, __u64 idx, __u8 do_set) +{ + __u64 bofft64 = idx - start; + unsigned long *p, mask; + unsigned int bofft; + + if (bofft64 + 1 >= start + (BITS_PER_NODE * BITTREE_RANGE)) + return -1; + + /* Convert range to bitmap granularity */ + do_div(bofft64, BITTREE_RANGE); + + /* Now it is safe to cast these variables */ + bofft = (unsigned int)bofft64; + + /* Check the bit */ + p = bmap + BIT_WORD(bofft); + mask = BITMAP_FIRST_WORD_MASK(bofft) & BITMAP_LAST_WORD_MASK(bofft + 1); + + if (do_set && !((*p) & mask)) + return 0; + else if (!do_set && !(~(*p) & mask)) + return 0; + + return 1; +} + +/* Initializes a bitmap tree node */ +static struct bmap_rbnode *bnode_init(struct duet_bittree *bt, __u64 idx) +{ + struct bmap_rbnode *bnode = NULL; + +#ifdef CONFIG_DUET_STATS + if (bt) { + (bt->statcur)++; + if (bt->statcur > bt->statmax) { + bt->statmax = bt->statcur; + pr_info("duet: %llu BitTree nodes (%llub)\n", + bt->statmax, bt->statmax * BITS_PER_NODE / 8); + } + } +#endif /* CONFIG_DUET_STATS */ + + bnode = kmalloc(sizeof(*bnode), GFP_NOWAIT); + if (!bnode) + return NULL; + + bnode->done = kzalloc(sizeof(unsigned long) * + BITS_TO_LONGS(BITS_PER_NODE), GFP_NOWAIT); + if (!bnode->done) { + kfree(bnode); + return NULL; + } + + /* Allocate relevant bitmap, if needed */ + bnode->relv = kzalloc(sizeof(unsigned long) * + BITS_TO_LONGS(BITS_PER_NODE), GFP_NOWAIT); + if (!bnode->relv) { + kfree(bnode->done); + kfree(bnode); + return NULL; + } + + bnode->seen = kzalloc(sizeof(unsigned long) * + BITS_TO_LONGS(BITS_PER_NODE), GFP_NOWAIT); + if (!bnode->seen) { + kfree(bnode->relv); + kfree(bnode->done); + kfree(bnode); + return NULL; + } + + RB_CLEAR_NODE(&bnode->node); + bnode->idx = idx; + return bnode; +} + +static void bnode_dispose(struct bmap_rbnode *bnode, struct rb_node *rbnode, + struct duet_bittree *bt) +{ +#ifdef CONFIG_DUET_STATS + if (bt) + (bt->statcur)--; +#endif /* CONFIG_DUET_STATS */ + rb_erase(rbnode, &bt->root); + kfree(bnode->relv); + kfree(bnode->seen); + kfree(bnode->done); + kfree(bnode); +} + +/* + * Traverses bitmap nodes to read/set/unset/check a specific bit across bitmaps. + * May insert/remove bitmap nodes as needed. 
+ * + * If DUET_BMAP_READ is set: + * - the bitmap value for idx are read for one or all bitmaps + * Otherwise, if DUET_BMAP_CHECK flag is set: + * - return value 1 means the idx matches the given flags + * - return value 0 means the idx doesn't match the given flags + * Otherwise, if neither flag is set: + * - return value 0 means the idx was updated to match given flags + * + * In all cases, a return value -1 denotes an error. + */ +static int __update_tree(struct duet_bittree *bt, __u64 idx, __u8 flags) +{ + int found, ret, res; + __u64 node_offt, div_rem; + struct rb_node **link, *parent; + struct bmap_rbnode *bnode = NULL; + unsigned long iflags; + + local_irq_save(iflags); + spin_lock(&bt->lock); + + div64_u64_rem(idx, BITTREE_RANGE * BITS_PER_NODE, &div_rem); + node_offt = idx - div_rem; + + /* Look up BitTree node */ + found = 0; + link = &(bt->root).rb_node; + parent = NULL; + + while (*link) { + parent = *link; + bnode = rb_entry(parent, struct bmap_rbnode, node); + + if (bnode->idx > node_offt) { + link = &(*link)->rb_left; + } else if (bnode->idx < node_offt) { + link = &(*link)->rb_right; + } else { + found = 1; + break; + } + } + + /* If we're just reading bitmap values, return them now */ + if (flags & BMAP_READ) { + ret = 0; + + if (!found) + goto done; + + /* First read seen bit */ + res = bmap_read(bnode->seen, bnode->idx, idx); + if (res == -1) { + ret = -1; + goto done; + } + ret |= res << 2; + + /* Then read relevant bit */ + res = bmap_read(bnode->relv, bnode->idx, idx); + if (res == -1) { + ret = -1; + goto done; + } + ret |= res << 1; + + /* Read done bit */ + res = bmap_read(bnode->done, bnode->idx, idx); + if (res == -1) { + ret = -1; + goto done; + } + + ret |= res; + goto done; + } + + /* + * Take appropriate action based on whether we found the node + * and whether we plan to update (SET/RST), or only CHECK it. + * + * NULL | Found !Found | + * -------+------------------------------------+ + * SET | Set Bits | Init new node | + * |------------------+-----------------| + * RST | Clear (dispose?) | Nothing | + * -------+------------------------------------+ + * + * CHECK | Found !Found | + * -------+------------------------------------+ + * SET | Check Bits | Return false | + * |------------------+-----------------| + * RST | Check Bits | Continue | + * -------+------------------------------------+ + */ + + /* First handle setting (or checking set) bits */ + if (flags & BMAP_ALL_SET) { + if (!found && !(flags & BMAP_CHECK)) { + /* Insert the new node */ + bnode = bnode_init(bt, node_offt); + if (!bnode) { + ret = -1; + goto done; + } + + rb_link_node(&bnode->node, parent, link); + rb_insert_color(&bnode->node, &bt->root); + + } else if (!found && (flags & BMAP_CHECK)) { + /* Looking for set bits, node didn't exist */ + ret = 0; + goto done; + } + + /* Set the bits. Return -1 if something goes wrong. */ + if (!(flags & BMAP_CHECK)) { + if ((flags & BMAP_SEEN_SET) && + bmap_set(bnode->seen, bnode->idx, idx, 1)) { + ret = -1; + goto done; + } + + if ((flags & BMAP_RELV_SET) && + bmap_set(bnode->relv, bnode->idx, idx, 1)) { + ret = -1; + goto done; + } + + if ((flags & BMAP_DONE_SET) && + bmap_set(bnode->done, bnode->idx, idx, 1)) { + ret = -1; + goto done; + } + + /* Check the bits. 
Return if any bits are off */ + } else { + if (flags & BMAP_SEEN_SET) { + ret = bmap_chk(bnode->seen, bnode->idx, idx, 1); + if (ret != 1) + goto done; + } + + if (flags & BMAP_RELV_SET) { + ret = bmap_chk(bnode->relv, bnode->idx, idx, 1); + if (ret != 1) + goto done; + } + + ret = bmap_chk(bnode->done, bnode->idx, idx, 1); + if (ret != 1) + goto done; + } + } + + /* Now handle unsetting any bits */ + if (found && (flags & BMAP_ALL_RST)) { + /* Clear the bits. Return -1 if something goes wrong. */ + if (!(flags & BMAP_CHECK)) { + if ((flags & BMAP_SEEN_RST) && + bmap_set(bnode->seen, bnode->idx, idx, 0)) { + ret = -1; + goto done; + } + + if ((flags & BMAP_RELV_RST) && + bmap_set(bnode->relv, bnode->idx, idx, 0)) { + ret = -1; + goto done; + } + + if ((flags & BMAP_DONE_RST) && + bmap_set(bnode->done, bnode->idx, idx, 0)) { + ret = -1; + goto done; + } + + /* Check the bits. Return if any bits are off */ + } else { + if (flags & BMAP_SEEN_RST) { + ret = bmap_chk(bnode->seen, bnode->idx, idx, 0); + if (ret != 1) + goto done; + } + + if (flags & BMAP_RELV_RST) { + ret = bmap_chk(bnode->relv, bnode->idx, idx, 0); + if (ret != 1) + goto done; + } + + ret = bmap_chk(bnode->done, bnode->idx, idx, 0); + if (ret != 1) + goto done; + } + + /* Dispose of the node if empty */ + if (!(flags & BMAP_CHECK) && + bitmap_empty(bnode->done, BITS_PER_NODE) && + bitmap_empty(bnode->seen, BITS_PER_NODE) && + bitmap_empty(bnode->relv, BITS_PER_NODE)) + bnode_dispose(bnode, parent, bt); + } + + if (!(flags & BMAP_CHECK)) + ret = 0; + else + ret = 1; + +done: + if (ret == -1) + pr_err("duet: blocks were not %s\n", + (flags & BMAP_READ) ? "read" : + ((flags & BMAP_CHECK) ? "checked" : "modified")); + spin_unlock(&bt->lock); + local_irq_restore(iflags); + return ret; +} + +/* + * Check if we have seen this inode before. If not, check if it is relevant. + * Then, check whether it's done. + */ +static int do_bittree_check(struct duet_bittree *bt, struct duet_uuid uuid, + struct duet_task *task, struct inode *inode) +{ + int ret, bits; + unsigned long long idx = UUID_IDX(uuid); + + bits = __update_tree(bt, idx, BMAP_READ); + + if (!(bits & 0x4)) { + /* We have not seen this inode before */ + if (inode) { + ret = do_find_path(task, inode, 0, NULL, 0); + } else if (task) { + ret = duet_find_path(task, uuid, 0, NULL, 0); + } else { + pr_err("duet: check failed, no task/inode\n"); + return -1; + } + + if (!ret) { + /* Mark as relevant and return not done */ + ret = __update_tree(bt, idx, + BMAP_SEEN_SET | BMAP_RELV_SET); + if (ret != -1) + ret = 0; + + } else if (ret == -ENOENT) { + /* Mark as irrelevant and return done */ + ret = __update_tree(bt, idx, BMAP_SEEN_SET); + if (ret != -1) + ret = 1; + + } else { + pr_err("duet: inode relevance undetermined\n"); + return -1; + } + + } else { + /* We know this inode, return 1 if done, or irrelevant */ + ret = ((bits & 0x1) || !(bits & 0x2)) ? 1 : 0; + } + + return ret; +} + +/* Checks if a given inode is done. Skips inode lookup. 
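+ * Returns 1 if the inode is done or irrelevant to the task, 0 if it is
+ * relevant but not yet processed, and -1 on error.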
*/ +int bittree_check_inode(struct duet_bittree *bt, struct duet_task *task, + struct inode *inode) +{ + struct duet_uuid uuid; + + uuid.ino = inode->i_ino; + uuid.gen = inode->i_generation; + + return do_bittree_check(bt, uuid, task, inode); +} + +/* Checks if the given entries are done */ +int bittree_check(struct duet_bittree *bt, struct duet_uuid uuid, + struct duet_task *task) +{ + return do_bittree_check(bt, uuid, task, NULL); +} + +/* Mark done bit for given entries */ +int bittree_set(struct duet_bittree *bt, struct duet_uuid uuid) +{ + return __update_tree(bt, UUID_IDX(uuid), BMAP_DONE_SET); +} + +/* Unmark done bit for given entries */ +int bittree_reset(struct duet_bittree *bt, struct duet_uuid uuid) +{ + return __update_tree(bt, UUID_IDX(uuid), BMAP_DONE_RST); +} + +int bittree_print(struct duet_task *task) +{ + struct bmap_rbnode *bnode = NULL; + struct rb_node *node; + unsigned long iflags; + + local_irq_save(iflags); + spin_lock(&task->bittree.lock); + pr_info("duet: Printing task bittree\n"); + node = rb_first(&task->bittree.root); + while (node) { + bnode = rb_entry(node, struct bmap_rbnode, node); + + /* Print node information */ + pr_info("duet: Node key = %llu\n", bnode->idx); + pr_info("duet: Done bits set: %d out of %d\n", + bitmap_weight(bnode->done, BITS_PER_NODE), + BITS_PER_NODE); + pr_info("duet: Relv bits set: %d out of %d\n", + bitmap_weight(bnode->relv, BITS_PER_NODE), + BITS_PER_NODE); + pr_info("duet: Seen bits set: %d out of %d\n", + bitmap_weight(bnode->seen, BITS_PER_NODE), + BITS_PER_NODE); + + node = rb_next(node); + } + spin_unlock(&task->bittree.lock); + local_irq_restore(iflags); + + pr_info("duet: Task #%d bitmap has %d out of %lu bits set\n", + task->id, bitmap_weight(task->bucket_bmap, + duet_env.itm_hash_size), duet_env.itm_hash_size); + + return 0; +} + +void bittree_init(struct duet_bittree *bittree) +{ + spin_lock_init(&bittree->lock); + bittree->root = RB_ROOT; +#ifdef CONFIG_DUET_STATS + bittree->statcur = bittree->statmax = 0; +#endif /* CONFIG_DUET_STATS */ +} + +void bittree_destroy(struct duet_bittree *bittree) +{ + struct rb_node *rbnode; + struct bmap_rbnode *bnode; + + while (!RB_EMPTY_ROOT(&bittree->root)) { + rbnode = rb_first(&bittree->root); + bnode = rb_entry(rbnode, struct bmap_rbnode, node); + bnode_dispose(bnode, rbnode, bittree); + } +} diff --git a/mm/duet/common.h b/mm/duet/common.h new file mode 100644 index 0000000..1dac66b --- /dev/null +++ b/mm/duet/common.h @@ -0,0 +1,211 @@ +/* + * Copyright (C) 2016 George Amvrosiadis. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef _COMMON_H +#define _COMMON_H + +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/bitmap.h> +#include <linux/rculist.h> +#include <linux/syscalls.h> +#include <linux/duet.h> + +#ifdef DUET_DEBUG +#define duet_dbg(...) pr_info(__VA_ARGS__) +#else +#define duet_dbg(...) +#endif + +/* + * Duet can be state-based, and/or event-based. + * + * Event-based Duet monitors events that occurred on a page, during its + * time in the page cache: ADDED, REMOVED, DIRTY, and FLUSHED. 
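+ *
+ * For example, a task subscribed to DUET_PAGE_DIRTY is handed a
+ * duet_item, tagged with the page's inode and offset, for pages that
+ * are dirtied while the task is registered.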
+ * + * State-based Duet monitors changes in the page cache since the last time + * a notification was sent to the interested application. Registering for + * EXIST informs the application of page additions or removals from the cache + * (i.e. ADDED and REMOVED events cancel each other out if the application + * hasn't been told in the meantime). Registering for MODIFIED events is a + * similar model, where unreported DIRTY and FLUSHED events cancel each other. + */ +#define DUET_PAGE_ADDED 0x0001 +#define DUET_PAGE_REMOVED 0x0002 +#define DUET_PAGE_DIRTY 0x0004 +#define DUET_PAGE_FLUSHED 0x0008 +#define DUET_PAGE_MODIFIED 0x0010 +#define DUET_PAGE_EXISTS 0x0020 +#define DUET_FD_NONBLOCK 0x0040 + +/* Used only for page state */ +#define DUET_MASK_VALID 0x8000 + +/* Some useful flags for clearing bitmaps */ +#define BMAP_SEEN 0x1 +#define BMAP_RELV 0x2 +#define BMAP_DONE 0x4 + +#define DUET_DEF_NUMTASKS 8 +#define DUET_INODE_FREEING (I_WILL_FREE | I_FREEING | I_CLEAR) + +enum { + DUET_STATUS_OFF = 0, + DUET_STATUS_ON, + DUET_STATUS_INIT, + DUET_STATUS_CLEAN, +}; + +/* + * Item struct returned for processing. + * The UUID currently consists of the inode number, the inode generation + * (to help us identify cases of inode reuse), and the task id. + * For state-based duet, we mark a page if it EXISTS or is MODIFIED. + * For event-based duet, we mark a page added, removed, dirtied, and/or flushed. + * Acceptable event combinations will differ based on the task's subscription. + */ +struct duet_uuid { + unsigned long ino; + __u32 gen; + __u8 tid; +}; + +struct duet_item { + struct duet_uuid uuid; + unsigned long idx; + __u16 state; +}; + +/* + * Red-black bitmap tree node. + * Represents the range starting from idx. For block tasks, only the done + * bitmap is used. For file tasks, the seen and relv (relevant) bitmaps are + * also used. The semantics of different states are listed below, where an + * item can be in the unknown state due to a bitmap reset, or because it hasn't + * been encountered yet. 
+ * - !SEEN && !RELV && !DONE: Item in unknown state + * - !SEEN && !RELV && DONE: Item processed, but in unknown state + * - SEEN && !RELV && !DONE: Item not relevant to the task + * - SEEN && RELV && !DONE: Item is relevant, but not processed + * - SEEN && RELV && DONE: Item is relevant, and has already been processed + */ +struct bmap_rbnode { + __u64 idx; + struct rb_node node; + unsigned long *seen; + unsigned long *relv; + unsigned long *done; +}; + +struct item_hnode { + struct hlist_bl_node node; + struct duet_item item; + __u8 refcount; + __u16 *state; /* One entry per task */ +}; + +struct duet_bittree { + spinlock_t lock; + struct rb_root root; +#ifdef CONFIG_DUET_STATS + __u64 statcur; /* Cur # of BitTree nodes */ + __u64 statmax; /* Max # of BitTree nodes */ +#endif /* CONFIG_DUET_STATS */ +}; + +struct duet_task { + __u8 id; /* Internal task ID */ + + int fd; + struct filename *name; + __u32 evtmask; /* Mask of subscribed events */ + struct path *regpath; /* Registered path */ + char *regpathname; /* Registered path name */ + __u16 regpathlen; /* Length of path name */ + + /* Data structures linking task to framework */ + struct list_head task_list; + wait_queue_head_t cleaner_queue; + atomic_t refcount; + char *pathbuf; /* Buffer for getpath */ + struct duet_bittree bittree; /* Progress bitmap */ + wait_queue_head_t event_queue; /* for read() and poll() */ + + /* Hash table bucket bitmap */ + spinlock_t bbmap_lock; + unsigned long *bucket_bmap; + unsigned long bmap_cursor; +}; + +struct duet_info { + atomic_t status; + __u8 numtasks; /* Number of concurrent tasks */ + + /* + * Access to the task list is synchronized via a mutex. However, any + * operations that are on-going for a task (e.g. fetch) will increase + * its refcount. This refcount is consulted when disposing of the task. 
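+	 *
+	 * Roughly, the lifetime rule is: duet_find_task() takes a
+	 * reference, and every path that obtained a task through it ends
+	 * with atomic_dec_and_test(&task->refcount), waking cleaner_queue
+	 * when the count drops to zero.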
+ */ + struct mutex task_list_mutex; + struct list_head tasks; + + /* ItemTable -- Global page state hash table */ + struct hlist_bl_head *itm_hash_table; + unsigned long itm_hash_size; + unsigned long itm_hash_shift; + unsigned long itm_hash_mask; +}; + +extern struct duet_info duet_env; + +/* hook.c */ +void duet_hook(__u16 evtcode, void *data); + +/* hash.c */ +int hash_init(void); +int hash_add(struct duet_task *task, struct duet_uuid uuid, + unsigned long idx, __u16 evtmask, short in_scan); +int hash_fetch(struct duet_task *task, struct duet_item *itm); +void hash_print(struct duet_task *task); + +/* task.c -- not in linux/duet.h */ +struct duet_task *duet_find_task(__u8 id); +void duet_task_dispose(struct duet_task *task); + +/* path.c */ +int do_find_path(struct duet_task *task, struct inode *inode, + int getpath, char *buf, int bufsize); +int duet_find_path(struct duet_task *task, struct duet_uuid uuid, + int getpath, char *buf, int bufsize); + +/* bittree.c */ +int bittree_check_inode(struct duet_bittree *bt, struct duet_task *task, + struct inode *inode); +int bittree_check(struct duet_bittree *bt, struct duet_uuid uuid, + struct duet_task *task); +int bittree_set(struct duet_bittree *bt, struct duet_uuid uuid); +int bittree_reset(struct duet_bittree *bt, struct duet_uuid uuid); +int bittree_print(struct duet_task *task); +void bittree_init(struct duet_bittree *bittree); +void bittree_destroy(struct duet_bittree *bittree); + +/* init.c */ +int duet_online(void); + +/* debug.c */ +int duet_print_bmap(__u8 id); +int duet_print_item(__u8 id); +int duet_print_list(struct duet_status_args __user *arg); + +#endif /* _COMMON_H */ diff --git a/mm/duet/debug.c b/mm/duet/debug.c new file mode 100644 index 0000000..77a47b6 --- /dev/null +++ b/mm/duet/debug.c @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2016 George Amvrosiadis. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ + +#include "common.h" +#include "syscall.h" + +/* Do a preorder print of the BitTree */ +int duet_print_bmap(__u8 id) +{ + struct duet_task *task; + + task = duet_find_task(id); + if (!task) + return -ENOENT; + + if (bittree_print(task)) { + pr_err("duet: failed to print BitTree for task #%d\n", + task->id); + return -1; + } + + /* decref and wake up cleaner if needed */ + if (atomic_dec_and_test(&task->refcount)) + wake_up(&task->cleaner_queue); + + return 0; +} + +/* Do a preorder print of the global hash table */ +int duet_print_item(__u8 id) +{ + struct duet_task *task; + + task = duet_find_task(id); + if (!task) + return -ENOENT; + + hash_print(task); + + /* decref and wake up cleaner if needed */ + if (atomic_dec_and_test(&task->refcount)) + wake_up(&task->cleaner_queue); + + return 0; +} + +int duet_print_list(struct duet_status_args __user *arg) +{ + int i = 0; + struct duet_task *cur; + struct duet_status_args argh, *argp; + + /* Copy in task list header (again) */ + if (copy_from_user(&argh, arg, sizeof(argh))) + return -EFAULT; + + /* Copy in entire task list */ + argp = memdup_user(arg, sizeof(*argp) + (argh.numtasks * + sizeof(struct duet_task_attrs))); + if (IS_ERR(argp)) + return PTR_ERR(argp); + + /* Copy the info for the first numtasks */ + mutex_lock(&duet_env.task_list_mutex); + list_for_each_entry(cur, &duet_env.tasks, task_list) { + argp->tasks[i].id = cur->id; + argp->tasks[i].fd = cur->fd; + memcpy(argp->tasks[i].name, cur->name->name, NAME_MAX); + argp->tasks[i].regmask = cur->evtmask; + memcpy(argp->tasks[i].path, cur->regpathname, cur->regpathlen); + i++; + if (i == argp->numtasks) + break; + } + mutex_unlock(&duet_env.task_list_mutex); + + /* Copy out entire task list */ + if (copy_to_user(arg, argp, sizeof(*argp) + (argp->numtasks * + sizeof(struct duet_task_attrs)))) { + pr_err("duet_print_list: failed to copy out list\n"); + kfree(argp); + return -EINVAL; + } + + duet_dbg("duet_print_list: success sending task list\n"); + kfree(argp); + return 0; +} diff --git a/mm/duet/hash.c b/mm/duet/hash.c new file mode 100644 index 0000000..c2644d6 --- /dev/null +++ b/mm/duet/hash.c @@ -0,0 +1,315 @@ +/* + * Copyright (C) 2016 George Amvrosiadis. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/hash.h> +#include "common.h" + +#define DUET_NEGATE_EXISTS (DUET_PAGE_ADDED | DUET_PAGE_REMOVED) +#define DUET_NEGATE_MODIFIED (DUET_PAGE_DIRTY | DUET_PAGE_FLUSHED) + +/* + * Page state is retained in a global hash table shared by all tasks. + * Indexing is based on the page's inode number and offset. 
+ */ + +static unsigned long hash(unsigned long ino, unsigned long idx) +{ + unsigned long h; + + h = (idx * ino ^ (GOLDEN_RATIO_PRIME + idx)) / L1_CACHE_BYTES; + h = h ^ ((h ^ GOLDEN_RATIO_PRIME) >> duet_env.itm_hash_shift); + return h & duet_env.itm_hash_mask; +} + +int hash_init(void) +{ + /* Allocate power-of-2 number of buckets */ + duet_env.itm_hash_shift = ilog2(totalram_pages); + duet_env.itm_hash_size = 1 << duet_env.itm_hash_shift; + duet_env.itm_hash_mask = duet_env.itm_hash_size - 1; + + pr_debug("duet: allocated global hash table (%lu buckets)\n", + duet_env.itm_hash_size); + duet_env.itm_hash_table = vmalloc(sizeof(struct hlist_bl_head) * + duet_env.itm_hash_size); + if (!duet_env.itm_hash_table) + return 1; + + memset(duet_env.itm_hash_table, 0, sizeof(struct hlist_bl_head) * + duet_env.itm_hash_size); + return 0; +} + +/* Deallocate a hash table node */ +static void hnode_destroy(struct item_hnode *itnode) +{ + kfree(itnode->state); + kfree(itnode); +} + +/* Allocate and initialize a new hash table node */ +static struct item_hnode *hnode_init(struct duet_uuid uuid, unsigned long idx) +{ + struct item_hnode *itnode = NULL; + + itnode = kzalloc(sizeof(struct item_hnode), GFP_NOWAIT); + if (!itnode) + return NULL; + + itnode->state = kcalloc(duet_env.numtasks, sizeof(*(itnode->state)), + GFP_NOWAIT); + if (!(itnode->state)) { + pr_err("duet: failed to allocate hash node state\n"); + kfree(itnode); + return NULL; + } + + (itnode->item).uuid = uuid; + (itnode->item).idx = idx; + itnode->refcount++; + + return itnode; +} + +/* Add one event into the hash table */ +int hash_add(struct duet_task *task, struct duet_uuid uuid, unsigned long idx, + __u16 evtmask, short in_scan) +{ + __u16 curmask = 0; + short found = 0; + unsigned long bnum, flags; + struct hlist_bl_head *b; + struct hlist_bl_node *n; + struct item_hnode *itnode; + + evtmask &= task->evtmask; + + /* Get the bucket */ + bnum = hash(uuid.ino, idx); + b = duet_env.itm_hash_table + bnum; + local_irq_save(flags); + hlist_bl_lock(b); + + /* Lookup the item in the bucket */ + hlist_bl_for_each_entry(itnode, n, b, node) { + if ((itnode->item).uuid.ino == uuid.ino && + (itnode->item).uuid.gen == uuid.gen && + (itnode->item).idx == idx) { + found = 1; + break; + } + } + + duet_dbg("duet: %s hash node (tid %d, ino%lu, gen%lu, idx%lu)\n", + found ? (in_scan ? "replacing" : "updating") : "inserting", + uuid.tid, uuid.ino, uuid.gen, idx); + + if (found) { + curmask = itnode->state[task->id]; + + /* Only up the refcount if we are adding a new mask */ + if (!(curmask & DUET_MASK_VALID) || in_scan) { + if (!in_scan) + itnode->refcount++; + curmask = evtmask | DUET_MASK_VALID; + goto check_dispose; + } + + curmask |= evtmask | DUET_MASK_VALID; + + /* Negate previous events and remove if needed */ + if ((task->evtmask & DUET_PAGE_EXISTS) && + ((curmask & DUET_NEGATE_EXISTS) == DUET_NEGATE_EXISTS)) + curmask &= ~DUET_NEGATE_EXISTS; + + if ((task->evtmask & DUET_PAGE_MODIFIED) && + ((curmask & DUET_NEGATE_MODIFIED) == DUET_NEGATE_MODIFIED)) + curmask &= ~DUET_NEGATE_MODIFIED; + +check_dispose: + if ((curmask == DUET_MASK_VALID) && (itnode->refcount == 1)) { + if (itnode->refcount != 1) { + itnode->state[task->id] = 0; + } else { + hlist_bl_del(&itnode->node); + hnode_destroy(itnode); + } + + /* Are we still interested in this bucket? 
+			 */
+			found = 0;
+			hlist_bl_for_each_entry(itnode, n, b, node) {
+				if (itnode->state[task->id] & DUET_MASK_VALID) {
+					found = 1;
+					break;
+				}
+			}
+
+			if (!found)
+				clear_bit(bnum, task->bucket_bmap);
+		} else {
+			itnode->state[task->id] = curmask;
+
+			/* Update bitmap */
+			set_bit(bnum, task->bucket_bmap);
+		}
+	} else {
+		if (!evtmask)
+			goto done;
+
+		itnode = hnode_init(uuid, idx);
+		if (!itnode) {
+			hlist_bl_unlock(b);
+			local_irq_restore(flags);
+			return 1;
+		}
+
+		itnode->state[task->id] = evtmask | DUET_MASK_VALID;
+		hlist_bl_add_head(&itnode->node, b);
+
+		/* Update bitmap */
+		set_bit(bnum, task->bucket_bmap);
+	}
+
+done:
+	hlist_bl_unlock(b);
+	local_irq_restore(flags);
+	return 0;
+}
+
+/* Fetch one item for a given task. Returns 0 if an item was fetched,
+ * and 1 if no items were pending. */
+int hash_fetch(struct duet_task *task, struct duet_item *itm)
+{
+	int found;
+	unsigned long bnum, flags;
+	struct hlist_bl_head *b;
+	struct hlist_bl_node *n;
+	struct item_hnode *itnode;
+
+	local_irq_save(flags);
+again:
+	spin_lock(&task->bbmap_lock);
+	bnum = find_next_bit(task->bucket_bmap, duet_env.itm_hash_size,
+			task->bmap_cursor);
+
+	if (bnum == duet_env.itm_hash_size) {
+		/* Reached end of bitmap */
+		found = 0;
+
+		if (task->bmap_cursor != 0) {
+			/* Started part way, wrap around and try again */
+			bnum = find_next_bit(task->bucket_bmap,
+					task->bmap_cursor, 0);
+
+			if (bnum != task->bmap_cursor)
+				found = 1;
+		}
+
+		if (!found) {
+			spin_unlock(&task->bbmap_lock);
+			local_irq_restore(flags);
+			return 1;
+		}
+	}
+
+	task->bmap_cursor = bnum;
+	clear_bit(bnum, task->bucket_bmap);
+	spin_unlock(&task->bbmap_lock);
+	b = duet_env.itm_hash_table + bnum;
+
+	/* Grab the first matching item from the bucket */
+	hlist_bl_lock(b);
+	if (!b->first) {
+		pr_err("duet: empty hash bucket marked in bitmap\n");
+		hlist_bl_unlock(b);
+		goto again;
+	}
+
+	found = 0;
+	hlist_bl_for_each_entry(itnode, n, b, node) {
+		if (itnode->state[task->id] & DUET_MASK_VALID) {
+			*itm = itnode->item;
+			itm->uuid.tid = task->id;
+			itm->state = itnode->state[task->id] &
+				     (~DUET_MASK_VALID);
+
+			itnode->refcount--;
+			/* Free or update the node */
+			if (!itnode->refcount) {
+				hlist_bl_del(n);
+				hnode_destroy(itnode);
+			} else {
+				itnode->state[task->id] = 0;
+			}
+
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found) {
+		hlist_bl_unlock(b);
+		goto again;
+	}
+
+	/* Are we still interested in this bucket? */
+	found = 0;
+	hlist_bl_for_each_entry(itnode, n, b, node) {
+		if (itnode->state[task->id] & DUET_MASK_VALID) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (found)
+		set_bit(bnum, task->bucket_bmap);
+
+	hlist_bl_unlock(b);
+	local_irq_restore(flags);
+	return 0;
+}
+
+/* Warning: expensive printing function. Use with care.
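+ * It walks every bucket in the table, taking each bucket lock with
+ * interrupts disabled along the way.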
*/ +void hash_print(struct duet_task *task) +{ + unsigned long loop, count, start, end, buckets, flags; + unsigned long long nodes, tnodes; + struct hlist_bl_head *b; + struct hlist_bl_node *n; + struct item_hnode *itnode; + + count = duet_env.itm_hash_size / 100; + tnodes = nodes = buckets = start = end = 0; + pr_info("duet: Printing hash table\n"); + for (loop = 0; loop < duet_env.itm_hash_size; loop++) { + if (loop - start >= count) { + pr_info("duet: Buckets %lu - %lu: %llu nodes (task: %llu)\n", + start, end, nodes, tnodes); + start = end = loop; + nodes = tnodes = 0; + } + + /* Count bucket nodes */ + b = duet_env.itm_hash_table + loop; + local_irq_save(flags); + hlist_bl_lock(b); + hlist_bl_for_each_entry(itnode, n, b, node) { + nodes++; + if (itnode->state[task->id] & DUET_MASK_VALID) + tnodes++; + } + hlist_bl_unlock(b); + local_irq_restore(flags); + + end = loop; + } + + if (start != loop - 1) + pr_info("duet: Buckets %lu - %lu: %llu nodes (task: %llu)\n", + start, end, nodes, tnodes); +} diff --git a/mm/duet/hook.c b/mm/duet/hook.c new file mode 100644 index 0000000..3ac89f5 --- /dev/null +++ b/mm/duet/hook.c @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2016 George Amvrosiadis. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include "common.h" + +/* Handle an event. We're in RCU context so whatever happens, stay awake! */ +void duet_hook(__u16 evtcode, void *data) +{ + struct page *page = NULL; + struct inode *inode = NULL; + struct duet_task *cur; + unsigned long page_idx = 0; + struct duet_uuid uuid; + + /* Duet must be online */ + if (!duet_online()) + return; + + /* Handle page event */ + page = (struct page *)data; + + /* Duet must be online, and the page must belong to a valid mapping */ + if (!page || !page_mapping(page)) { + duet_dbg("duet: dropped event %x due to NULL mapping\n", + evtcode); + return; + } + + inode = page_mapping(page)->host; + page_idx = page->index; + + /* Check that we're referring to an actual inode and get its UUID */ + if (!inode) + return; + + uuid.ino = inode->i_ino; + uuid.gen = inode->i_generation; + + /* Verify that the inode does not belong to a special file */ + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + return; + + if (!inode->i_ino) { + pr_err("duet: inode not initialized\n"); + return; + } + + /* Look for tasks interested in this event type and invoke callbacks */ + rcu_read_lock(); + list_for_each_entry_rcu(cur, &duet_env.tasks, task_list) { + struct super_block *sb = cur->regpath->mnt->mnt_sb; + + /* Verify that the event refers to the fs we're interested in */ + if (sb && sb != inode->i_sb) + continue; + + duet_dbg("duet: rcvd event %x on (ino %lu, gen %lu, idx %lu)\n", + evtcode, uuid.ino, uuid.gen, page_idx); + + /* Use the inode bitmap to filter out event if applicable */ + if (bittree_check_inode(&cur->bittree, cur, inode) == 1) + continue; + + /* Update the hash table */ + if (hash_add(cur, uuid, page_idx, evtcode, 0)) + pr_err("duet: hash table add failed\n"); + + wake_up(&cur->event_queue); + } + rcu_read_unlock(); +} diff --git a/mm/duet/init.c b/mm/duet/init.c new file mode 100644 index 0000000..a9e5ea1 
--- /dev/null +++ b/mm/duet/init.c @@ -0,0 +1,172 @@ +/* + * Copyright (C) 2016 George Amvrosiadis. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include "common.h" +#include "syscall.h" + +struct duet_info duet_env; +duet_hook_t *duet_hook_fp; +EXPORT_SYMBOL(duet_hook_fp); + +int duet_online(void) +{ + return (atomic_read(&duet_env.status) == DUET_STATUS_ON); +} + +int duet_bootstrap(__u16 numtasks) +{ + if (atomic_cmpxchg(&duet_env.status, DUET_STATUS_OFF, DUET_STATUS_INIT) + != DUET_STATUS_OFF) { + pr_err("duet: framework on, bootstrap aborted\n"); + return 1; + } + + duet_env.numtasks = (numtasks ? numtasks : DUET_DEF_NUMTASKS); + + /* Initialize global hash table */ + if (hash_init()) { + pr_err("duet: failed to initialize hash table\n"); + return 1; + } + + /* Initialize task list */ + INIT_LIST_HEAD(&duet_env.tasks); + mutex_init(&duet_env.task_list_mutex); + atomic_set(&duet_env.status, DUET_STATUS_ON); + + rcu_assign_pointer(duet_hook_fp, duet_hook); + synchronize_rcu(); + return 0; +} + +int duet_shutdown(void) +{ + struct duet_task *task; + + if (atomic_cmpxchg(&duet_env.status, DUET_STATUS_ON, DUET_STATUS_CLEAN) + != DUET_STATUS_ON) { + pr_err("duet: framework off, shutdown aborted\n"); + return 1; + } + + rcu_assign_pointer(duet_hook_fp, NULL); + synchronize_rcu(); + + /* Remove all tasks */ + mutex_lock(&duet_env.task_list_mutex); + while (!list_empty(&duet_env.tasks)) { + task = list_entry_rcu(duet_env.tasks.next, struct duet_task, + task_list); + list_del_rcu(&task->task_list); + mutex_unlock(&duet_env.task_list_mutex); + + /* Make sure everyone's let go before we free it */ + synchronize_rcu(); + wait_event(task->cleaner_queue, + atomic_read(&task->refcount) == 0); + duet_task_dispose(task); + + mutex_lock(&duet_env.task_list_mutex); + } + mutex_unlock(&duet_env.task_list_mutex); + + /* Destroy global hash table */ + vfree((void *)duet_env.itm_hash_table); + + INIT_LIST_HEAD(&duet_env.tasks); + mutex_destroy(&duet_env.task_list_mutex); + atomic_set(&duet_env.status, DUET_STATUS_OFF); + return 0; +} + +SYSCALL_DEFINE2(duet_status, u16, flags, struct duet_status_args __user *, arg) +{ + int ret = 0; + struct duet_status_args *sa; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + /* For now, we only support one struct size */ + if (sa->size != sizeof(*sa)) { + pr_err("duet_status: invalid args struct size (%u)\n", + sa->size); + ret = -EINVAL; + goto done; + } + + /* If we're cleaning up, only allow ops that affect Duet status */ + if (atomic_read(&duet_env.status) != DUET_STATUS_ON && !(flags & + (DUET_STATUS_START | DUET_STATUS_STOP | DUET_STATUS_REPORT))) { + pr_err("duet_status: ops rejected during shutdown\n"); + ret = -EINVAL; + goto done; + } + + switch (flags) { + case DUET_STATUS_START: + ret = duet_bootstrap(sa->maxtasks); + + if (ret) + pr_err("duet: failed to enable framework\n"); + else + pr_info("duet: framework enabled\n"); + + break; + + case DUET_STATUS_STOP: + ret = duet_shutdown(); + + if (ret) + pr_err("duet: failed to disable framework\n"); + else + 
pr_info("duet: framework disabled\n"); + + break; + + case DUET_STATUS_REPORT: + ret = duet_online(); + break; + + case DUET_STATUS_PRINT_BMAP: + ret = duet_print_bmap(sa->id); + break; + + case DUET_STATUS_PRINT_ITEM: + ret = duet_print_item(sa->id); + break; + + case DUET_STATUS_PRINT_LIST: + ret = duet_print_list(arg); + goto done; + + default: + pr_info("duet_status: invalid flags\n"); + ret = -EINVAL; + goto done; + } + + if (copy_to_user(arg, sa, sizeof(*sa))) { + pr_err("duet_status: copy_to_user failed\n"); + ret = -EINVAL; + goto done; + } + +done: + kfree(sa); + return ret; +} diff --git a/mm/duet/path.c b/mm/duet/path.c new file mode 100644 index 0000000..103ab42 --- /dev/null +++ b/mm/duet/path.c @@ -0,0 +1,184 @@ +/* + * Copyright (C) 2016 George Amvrosiadis. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include "common.h" +#include "syscall.h" + +/* Scan through the page cache for a given inode */ +static int find_get_inode(struct super_block *sb, struct duet_uuid c_uuid, + struct inode **c_inode) +{ + struct inode *inode = NULL; + + *c_inode = NULL; + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + spin_lock(&inode->i_lock); + if (!*c_inode && inode->i_ino == c_uuid.ino && + inode->i_generation == c_uuid.gen && + !(inode->i_state & DUET_INODE_FREEING)) { + atomic_inc(&inode->i_count); + *c_inode = inode; + spin_unlock(&inode->i_lock); + spin_unlock(&sb->s_inode_list_lock); + return 0; + } + spin_unlock(&inode->i_lock); + } + spin_unlock(&sb->s_inode_list_lock); + + /* We shouldn't get here unless we failed */ + return 1; +} + +int do_find_path(struct duet_task *task, struct inode *inode, int getpath, + char *buf, int bufsize) +{ + int len; + char *p; + struct path path; + struct dentry *alias, *i_dentry = NULL; + + if (!task) { + pr_err("do_find_path: invalid task\n"); + return -EINVAL; + } + + if (getpath) + buf[0] = '\0'; + + /* Get the path for at least one alias of the inode */ + if (hlist_empty(&inode->i_dentry)) + return -ENOENT; + + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + if (IS_ROOT(alias) && (alias->d_flags & DCACHE_DISCONNECTED)) + continue; + + i_dentry = alias; + + /* Now get the path */ + len = PATH_MAX; + memset(task->pathbuf, 0, len); + path.mnt = task->regpath->mnt; + path.dentry = i_dentry; + + p = d_path(&path, task->pathbuf, len); + if (IS_ERR(p)) { + pr_err("do_find_path: d_path failed\n"); + continue; + } else if (!p) { + duet_dbg("do_find_path: dentry not found\n"); + continue; + } + + /* Is this path of interest? */ + if (memcmp(task->regpathname, p, task->regpathlen - 1)) { + duet_dbg("do_find_path: no common ancestor\n"); + continue; + } + + /* Got one. 
If it fits, return it */ + if (getpath && (bufsize < len - (p - task->pathbuf))) + return -ENOMEM; + + duet_dbg("do_find_path: got %s\n", p); + if (getpath) + memcpy(buf, p, len - (p - task->pathbuf)); + + return 0; + } + + /* We only get here if we got nothing */ + return -ENOENT; +} + +int duet_find_path(struct duet_task *task, struct duet_uuid uuid, int getpath, + char *buf, int bufsize) +{ + int ret = 0; + struct inode *ino; + + if (!task) { + pr_err("duet_find_path: invalid task\n"); + return -EINVAL; + } + + /* First, we need to find struct inode for child and parent */ + if (find_get_inode(task->regpath->mnt->mnt_sb, uuid, &ino)) { + duet_dbg("duet_find_path: child inode not found\n"); + return -ENOENT; + } + + ret = do_find_path(task, ino, getpath, buf, bufsize); + + iput(ino); + return ret; +} + +SYSCALL_DEFINE3(duet_get_path, struct duet_uuid_arg __user *, uuid, + char __user *, pathbuf, int, pathbufsize) +{ + int ret = 0; + struct duet_uuid_arg *ua; + struct duet_task *task; + char *buf; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!duet_online()) + return -ESRCH; + + /* Do some basic sanity checking */ + if (!uuid || pathbufsize <= 0) + return -EINVAL; + + buf = kcalloc(pathbufsize, sizeof(char), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + ua = memdup_user(uuid, sizeof(*uuid)); + if (IS_ERR(ua)) { + kfree(buf); + return PTR_ERR(ua); + } + + /* For now, we only support one struct size */ + if (ua->size != sizeof(*ua)) { + pr_err("duet_get_path: invalid args struct size (%u)\n", + ua->size); + ret = -EINVAL; + goto done; + } + + task = duet_find_task(ua->uuid.tid); + if (!task) { + ret = -ENOENT; + goto done; + } + + ret = duet_find_path(task, ua->uuid, 1, buf, pathbufsize); + + if (!ret && copy_to_user(pathbuf, buf, pathbufsize)) + ret = -EFAULT; + + /* decref and wake up cleaner if needed */ + if (atomic_dec_and_test(&task->refcount)) + wake_up(&task->cleaner_queue); + +done: + kfree(ua); + kfree(buf); + return ret; +} diff --git a/mm/duet/syscall.h b/mm/duet/syscall.h new file mode 100644 index 0000000..1bc6830 --- /dev/null +++ b/mm/duet/syscall.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2016 George Amvrosiadis. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ + +#include "common.h" + +/* Status syscall flags */ +#define DUET_STATUS_START 0x0001 +#define DUET_STATUS_STOP 0x0002 +#define DUET_STATUS_REPORT 0x0004 +#define DUET_STATUS_PRINT_BMAP 0x0008 +#define DUET_STATUS_PRINT_ITEM 0x0010 +#define DUET_STATUS_PRINT_LIST 0x0020 + +struct duet_task_attrs { + __u8 id; + int fd; + char name[NAME_MAX]; + __u32 regmask; + char path[PATH_MAX]; +}; + +struct duet_status_args { + __u32 size; /* in */ + union { + /* DUET_START args */ + struct { + __u16 maxtasks; /* in */ + }; + + /* DUET_PRINT_BIT, DUET_PRINT_ITEM args */ + struct { + __u8 id; /* in */ + }; + + /* DUET_PRINT_LIST args */ + struct { + __u16 numtasks; /* out */ + struct duet_task_attrs tasks[0]; /* out */ + }; + }; +}; + +/* Bmap syscall flags */ +#define DUET_BMAP_SET 0x0001 +#define DUET_BMAP_RESET 0x0002 +#define DUET_BMAP_CHECK 0x0004 + +struct duet_uuid_arg { + __u32 size; + struct duet_uuid uuid; +}; diff --git a/mm/duet/task.c b/mm/duet/task.c new file mode 100644 index 0000000..c91fad3 --- /dev/null +++ b/mm/duet/task.c @@ -0,0 +1,584 @@ +/* + * Copyright (C) 2016 George Amvrosiadis. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/namei.h> +#include <linux/anon_inodes.h> +#include "common.h" +#include "syscall.h" + +/* + * To synchronize access to the task list and structures without compromising + * scalability, a two-level approach is used. At the task list level, which is + * rarely updated, RCU is used. For the task structures, we use traditional + * reference counting. The two techniques are interweaved to achieve overall + * consistency. + */ + +static unsigned int duet_poll(struct file *file, poll_table *wait) +{ + __u8 *tid = file->private_data; + struct duet_task *task; + int ret = 0; + + task = duet_find_task(*tid); + if (!task) { + pr_err("duet_poll: task not found\n"); + return ret; + } + + poll_wait(file, &task->event_queue, wait); + if (bitmap_weight(task->bucket_bmap, duet_env.itm_hash_size)) + ret = POLLIN | POLLRDNORM; + + return ret; +} + +/* + * Copy an item to user space, returning how much we copied. + * + * We already checked that the event size is smaller than the + * buffer we had in duet_read() below. + */ +static ssize_t copy_item_to_user(struct duet_task *task, + struct duet_item *item, + char __user *buf) +{ + size_t item_size = sizeof(struct duet_item); + + /* Send the item */ + if (copy_to_user(buf, item, item_size)) + return -EFAULT; + + buf += item_size; + + duet_dbg("duet_read: sending (ino%lu, gen%u, idx%lu, %x)\n", + item->uuid.ino, item->uuid.gen, item->idx, item->state); + + return item_size; +} + +/* + * Sends out duet items. The number of bytes returned corresponds to the number + * of sizeof(struct duet_item) items fetched. Items are checked against the + * bitmap, and discarded if they have been marked; this can happen because an + * insertion can occur between the last read and the last bitmap set operation. 
+ */ +static ssize_t duet_read(struct file *file, char __user *buf, + size_t count, loff_t *pos) +{ + struct duet_task *task; + struct duet_item item; + char __user *start; + int ret, err; + __u8 *tid; + DEFINE_WAIT_FUNC(wait, woken_wake_function); + + start = buf; + tid = file->private_data; + + task = duet_find_task(*tid); + if (!task) + return -ENOENT; + + add_wait_queue(&task->event_queue, &wait); + while (1) { + /* Fetch an item only if there's space to store it */ + if (sizeof(struct duet_item) > count) + err = -EINVAL; + else + err = hash_fetch(task, &item); + + if (!err) { + ret = copy_item_to_user(task, &item, buf); + if (ret < 0) + break; + buf += ret; + count -= ret; + continue; + } + + ret = -EAGAIN; + if (file->f_flags & O_NONBLOCK) + break; + ret = -ERESTARTSYS; + if (signal_pending(current)) + break; + + if (start != buf) + break; + + wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); + } + remove_wait_queue(&task->event_queue, &wait); + + if (start != buf && ret != -EFAULT) + ret = buf - start; + + /* Decref and wake up cleaner if needed */ + if (atomic_dec_and_test(&task->refcount)) + wake_up(&task->cleaner_queue); + + return ret; +} + +/* + * Properly dismantle and dispose of a task struct. + * At this point we've guaranteed that noone else is accessing the + * task struct, so we don't need any locks + */ +void duet_task_dispose(struct duet_task *task) +{ + int ret = 0; + struct duet_item itm; + + /* Dispose of the bitmap tree */ + bittree_destroy(&task->bittree); + + /* Dispose of hash table entries, bucket bitmap */ + while (!ret) + ret = hash_fetch(task, &itm); + kfree(task->bucket_bmap); + + putname(task->name); + path_put(task->regpath); + kfree(task->regpath); + kfree(task->regpathname); + kfree(task->pathbuf); + kfree(task); +} + +static int duet_release(struct inode *ignored, struct file *file) +{ + __u8 *tid = file->private_data; + struct duet_task *cur; + + /* Find the task in the list, then dispose of it */ + mutex_lock(&duet_env.task_list_mutex); + list_for_each_entry_rcu(cur, &duet_env.tasks, task_list) { + if (cur->id == *tid) { +#ifdef CONFIG_DUET_STATS + hash_print(cur); + bittree_print(cur); +#endif /* CONFIG_DUET_STATS */ + list_del_rcu(&cur->task_list); + mutex_unlock(&duet_env.task_list_mutex); + + /* Wait until everyone's done with it */ + synchronize_rcu(); + wait_event(cur->cleaner_queue, + atomic_read(&cur->refcount) == 0); + + pr_info("duet: deregistered task %d\n", cur->id); + + duet_task_dispose(cur); + kfree(tid); + return 0; + } + } + mutex_unlock(&duet_env.task_list_mutex); + + return -ENOENT; +} + +static const struct file_operations duet_fops = { + .show_fdinfo = NULL, + .poll = duet_poll, + .read = duet_read, + .fasync = NULL, + .release = duet_release, + .unlocked_ioctl = NULL, + .compat_ioctl = NULL, + .llseek = noop_llseek, +}; + +static int process_inode(struct duet_task *task, struct inode *inode) +{ + struct radix_tree_iter iter; + struct duet_uuid uuid; + void **slot; + __u16 state; + + /* Use the inode bitmap to decide whether to skip inode */ + if (bittree_check_inode(&task->bittree, task, inode) == 1) + return 0; + + /* Go through all pages of this inode */ + rcu_read_lock(); + radix_tree_for_each_slot(slot, &inode->i_mapping->page_tree, &iter, 0) { + struct page *page; + + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + continue; + + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + /* + * Shadow entry of recently evicted page, 
or swap entry + * from shmem/tmpfs. Skip over it. + */ + continue; + } + + state = DUET_PAGE_ADDED; + if (PageDirty(page)) + state |= DUET_PAGE_DIRTY; + uuid.ino = inode->i_ino; + uuid.gen = inode->i_generation; + hash_add(task, uuid, page->index, state, 1); + } + rcu_read_unlock(); + + return 0; +} + +/* Scan through the page cache for events of interest to the task */ +static int scan_page_cache(struct duet_task *task) +{ + struct inode *inode, *prev = NULL; + struct super_block *sb = task->regpath->mnt->mnt_sb; + + pr_info("duet: page cache scan started\n"); + + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + struct address_space *mapping = inode->i_mapping; + + spin_lock(&inode->i_lock); + if (inode->i_state & DUET_INODE_FREEING || + mapping->nrpages == 0) { + spin_unlock(&inode->i_lock); + continue; + } + atomic_inc(&inode->i_count); + spin_unlock(&inode->i_lock); + spin_unlock(&sb->s_inode_list_lock); + + /* + * We are holding a reference to inode so it won't be removed + * from s_inodes list while we don't hold the s_inode_list_lock. + * We cannot iput the inode now, though, as we may be holding + * the last reference. We will iput it after the iteration is + * done. + */ + + iput(prev); + prev = inode; + + process_inode(task, inode); + + spin_lock(&sb->s_inode_list_lock); + } + spin_unlock(&sb->s_inode_list_lock); + iput(prev); + + pr_info("duet: page cache scan finished\n"); + + return 0; +} + +/* Find task and increment its refcount */ +struct duet_task *duet_find_task(__u8 id) +{ + struct duet_task *cur, *task = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(cur, &duet_env.tasks, task_list) { + if (cur->id == id) { + task = cur; + atomic_inc(&task->refcount); + break; + } + } + rcu_read_unlock(); + + return task; +} + +/* Allocate and initialize a task struct */ +static int duet_task_init(struct duet_task **task, struct filename *name, + __u32 regmask, struct path *path) +{ + int len; + char *p; + u32 evtmask = regmask; + + /* Do some sanity checking on event mask. 
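+	 * E.g., registering DUET_PAGE_EXISTS together with DUET_PAGE_ADDED
+	 * is rejected below, since EXISTS already subsumes the ADDED and
+	 * REMOVED events.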
+	 */
+	if (evtmask & DUET_PAGE_EXISTS) {
+		if (evtmask & (DUET_PAGE_ADDED | DUET_PAGE_REMOVED)) {
+			pr_debug("duet_task_init: invalid regmask\n");
+			return -EINVAL;
+		}
+		evtmask |= (DUET_PAGE_ADDED | DUET_PAGE_REMOVED);
+	}
+
+	if (evtmask & DUET_PAGE_MODIFIED) {
+		if (evtmask & (DUET_PAGE_DIRTY | DUET_PAGE_FLUSHED)) {
+			pr_debug("duet_task_init: invalid regmask\n");
+			return -EINVAL;
+		}
+		evtmask |= (DUET_PAGE_DIRTY | DUET_PAGE_FLUSHED);
+	}
+
+	/* Allocate task info struct */
+	*task = kzalloc(sizeof(**task), GFP_KERNEL);
+	if (!(*task))
+		return -ENOMEM;
+
+	/* Allocate temporary space for getpath file paths */
+	(*task)->pathbuf = kzalloc(PATH_MAX, GFP_KERNEL);
+	if (!(*task)->pathbuf) {
+		pr_err("duet_task_init: buffer allocation failed\n");
+		kfree(*task);
+		return -ENOMEM;
+	}
+
+	/* Find and store registered dir path */
+	(*task)->regpathname = kzalloc(PATH_MAX, GFP_KERNEL);
+	if (!(*task)->regpathname) {
+		pr_err("duet_task_init: path allocation failed\n");
+		kfree((*task)->pathbuf);
+		kfree(*task);
+		return -ENOMEM;
+	}
+
+	/* Populate registered dir path buffer */
+	len = PATH_MAX;
+	p = d_path(path, (*task)->pathbuf, len);
+	if (IS_ERR(p)) {
+		pr_err("duet_task_init: path registration failed\n");
+		goto err;
+	} else if (!p) {
+		pr_err("duet_task_init: (null) registered path\n");
+		goto err;
+	}
+
+	(*task)->regpathlen = len - (p - (*task)->pathbuf);
+	memcpy((*task)->regpathname, p, (*task)->regpathlen);
+
+	(*task)->id = 1;
+	(*task)->name = name;
+	(*task)->regpath = path;
+	(*task)->evtmask = (evtmask & 0xffff);	/* store the expanded mask */
+	atomic_set(&(*task)->refcount, 0);
+	INIT_LIST_HEAD(&(*task)->task_list);
+	init_waitqueue_head(&(*task)->cleaner_queue);
+	init_waitqueue_head(&(*task)->event_queue);
+	bittree_init(&(*task)->bittree);
+
+	/* Initialize hash table bitmap */
+	(*task)->bmap_cursor = 0;
+	spin_lock_init(&(*task)->bbmap_lock);
+	(*task)->bucket_bmap = kzalloc(sizeof(unsigned long) *
+			BITS_TO_LONGS(duet_env.itm_hash_size), GFP_KERNEL);
+	if (!(*task)->bucket_bmap) {
+		pr_err("duet_task_init: hash bitmap alloc failed\n");
+		kfree((*task)->regpathname);
+		kfree((*task)->pathbuf);
+		kfree(*task);
+		return -ENOMEM;
+	}
+
+	return 0;
+err:
+	pr_err("duet_task_init: error registering task\n");
+	kfree((*task)->regpathname);
+	kfree((*task)->pathbuf);
+	kfree(*task);
+	return -EINVAL;
+}
+
+/* Register the task with Duet */
+int duet_register_task(struct filename *name, __u32 regmask, struct path *path)
+{
+	int ret;
+	__u8 *tid;
+	struct duet_task *cur, *task = NULL;
+	struct list_head *last;
+
+	/* Do some sanity checking on the parameters passed */
+	if (!path || !regmask)
+		return -EINVAL;
+
+	if (!path->dentry || !path->dentry->d_inode) {
+		pr_err("duet_register_task: invalid path\n");
+		return -EINVAL;
+	}
+
+	if (!S_ISDIR(path->dentry->d_inode->i_mode)) {
+		pr_err("duet_register_task: path is not a dir\n");
+		return -EINVAL;
+	}
+
+	ret = duet_task_init(&task, name, regmask, path);
+	if (ret) {
+		pr_err("duet_register_task: initialization failed\n");
+		return ret;
+	}
+
+	/* Now get an anonymous inode to use for communication with Duet */
+	tid = kzalloc(sizeof(__u8), GFP_KERNEL);
+	if (!tid) {
+		duet_task_dispose(task);
+		return -ENOMEM;
+	}
+
+	ret = anon_inode_getfd("duet", &duet_fops, tid,
+		O_RDONLY | ((regmask & DUET_FD_NONBLOCK) ? O_NONBLOCK : 0));
+	if (ret < 0) {
+		duet_task_dispose(task);
+		kfree(tid);
+		return ret;
+	}
+
+	task->fd = ret;
+
+	/* Find a free task id for the new task. Tasks are sorted by id. */
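+	/*
+	 * Walking the sorted list and bumping task->id past each match
+	 * yields the smallest unused id; e.g., existing ids {1, 2, 4}
+	 * leave id 3 for the new task.
+	 */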
+	mutex_lock(&duet_env.task_list_mutex);
+	last = &duet_env.tasks;
+	list_for_each_entry_rcu(cur, &duet_env.tasks, task_list) {
+		if (cur->id == task->id)
+			(task->id)++;
+		else if (cur->id > task->id)
+			break;
+
+		last = &cur->task_list;
+	}
+	list_add_rcu(&task->task_list, last);
+	mutex_unlock(&duet_env.task_list_mutex);
+
+	*tid = task->id;
+
+	/* Before we return, scan the page cache for pages of interest */
+	scan_page_cache(task);
+
+	pr_info("duet: task %d (fd %d) registered %s(%d) with mask %x\n",
+		task->id, task->fd, task->regpathname, task->regpathlen,
+		task->evtmask);
+
+	return ret;
+}
+
+SYSCALL_DEFINE3(duet_init, const char __user *, taskname, u32, regmask,
+		const char __user *, pathname)
+{
+	int ret;
+	unsigned int lookup_flags = LOOKUP_DIRECTORY;
+	struct filename *name = NULL;
+	struct path *path = NULL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!duet_online())
+		return -ESRCH;
+
+	/* Do some basic sanity checking */
+	if (!pathname || !regmask)
+		return -EINVAL;
+
+	if (taskname) {
+		name = getname(taskname);
+		if (IS_ERR(name))
+			return PTR_ERR(name);
+	}
+
+	path = kzalloc(sizeof(struct path), GFP_KERNEL);
+	if (!path) {
+		putname(name);
+		return -ENOMEM;
+	}
+
+	ret = user_path_at(AT_FDCWD, pathname, lookup_flags, path);
+	if (ret) {
+		pr_err("duet_init: user_path_at failed\n");
+		goto err;
+	}
+
+	/* Register the task with the framework */
+	ret = duet_register_task(name, regmask, path);
+	if (ret < 0) {
+		pr_err("duet_init: task registration failed\n");
+		goto err;
+	}
+
+	return ret;
+
+err:
+	putname(name);
+	path_put(path);
+	kfree(path);
+	return ret;
+}
+
+SYSCALL_DEFINE2(duet_bmap, u16, flags, struct duet_uuid_arg __user *, arg)
+{
+	int ret = 0;
+	struct duet_uuid_arg *ua;
+	struct duet_task *task;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!duet_online())
+		return -ESRCH;
+
+	/* Do some basic sanity checking */
+	if (!arg)
+		return -EINVAL;
+
+	ua = memdup_user(arg, sizeof(*arg));
+	if (IS_ERR(ua))
+		return PTR_ERR(ua);
+
+	/* For now, we only support one struct size */
+	if (ua->size != sizeof(*ua)) {
+		pr_err("duet_bmap: invalid args struct size (%u)\n", ua->size);
+		ret = -EINVAL;
+		goto done;
+	}
+
+	task = duet_find_task(ua->uuid.tid);
+	if (!task) {
+		ret = -ENOENT;
+		goto done;
+	}
+
+	switch (flags) {
+	case DUET_BMAP_SET:
+		ret = bittree_set(&task->bittree, ua->uuid);
+		break;

+	case DUET_BMAP_RESET:
+		ret = bittree_reset(&task->bittree, ua->uuid);
+		break;
+
+	case DUET_BMAP_CHECK:
+		ret = bittree_check(&task->bittree, ua->uuid, task);
+		break;
+
+	default:
+		pr_err("duet_bmap: invalid flags\n");
+		ret = -EINVAL;
+		break;
+	}
+
+	/* decref and wake up cleaner if needed */
+	if (atomic_dec_and_test(&task->refcount))
+		wake_up(&task->cleaner_queue);
+
+done:
+	kfree(ua);
+	return ret;
+}
--
2.7.4